def writeout(self):
    '''Writes output buffer contents to file'''
    #Generate filename and write to file
    filename = make_filename.next()

    #Write article contents to file
    with open(filename + extensions['content'], 'w') as f:
        shared.dump(self.output_buffer, f)

    #Store wordlist as files
    with open(filename + extensions['words'], 'w') as f:
        shared.dump(self.words, f)

    #Store linkhash in files
    with open(filename + extensions['links'], 'w') as f:
        shared.dump(self.linkhash, f)

    if self.verbose:
        log("wrote " + filename)

    #Empty output buffer
    self.flush_output_buffer()
    return None
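#make_filename is used above as a generator of sequential output file name
#stems. A minimal sketch of such a generator is given below, assuming temp_dir
#and a zero-padded counter; the project's actual naming scheme may differ.
def _make_filename_sketch(directory):
    '''Yields an endless sequence of numbered file name stems (hypothetical).'''
    counter = 0
    while True:
        yield directory + 'out_' + str(counter).zfill(6)
        counter += 1

#Example (hypothetical): make_filename = _make_filename_sketch(temp_dir)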
def main():
    #Import shared parameters and verify output dir exists
    if not os.path.exists(temp_dir):
        raise IOError

    #==========================================================================
    # Read in link data and update content files accordingly
    #==========================================================================
    #Get list of files containing link info and chop it up
    linkfiles = glob.glob(temp_dir + '*' + extensions['links'])
    linkchunks = listchopper(linkfiles)
    linkfiles_read = 0

    for linkchunk in linkchunks:
        #Hash mapping each article to a set of articles linking to it
        linkhash = {}
        for filename in linkchunk:
            with open(filename, 'r') as f:
                newstuff = shared.load(f)

            #Add link info to linkhash
            for target, sources in newstuff.iteritems():
                try:
                    linkhash[target].update(set(sources))
                except KeyError:
                    linkhash[target] = set(sources)

            #Log status
            linkfiles_read += 1
            log("Read " + filename + " - " +
                str(100 * linkfiles_read / len(linkfiles))[:4] +
                " % of link data.")

        log("Chunk finished - updating content files")

        #Update concept with newly read link data
        contentfiles = glob.glob(temp_dir + '*' + extensions['content'])
        contentfiles_read = 0
        for filename in contentfiles:
            #Read file. Content is like {'article title' : {'text' : blah}}
            with open(filename, 'r') as f:
                content = shared.load(f)

            #Search linkhash for links going TO concept
            for concept in content.keys():
                try:
                    sources = linkhash[concept]
                except KeyError:
                    sources = set([])  #Missing key => zero incoming links

                #Update link info for concept
                try:
                    content[concept]['links_in'] = set(content[concept]['links_in'])
                    content[concept]['links_in'].update(sources)
                except KeyError:
                    content[concept]['links_in'] = sources

            #Save updated content
            with open(filename, 'w') as f:
                shared.dump(content, f)

            contentfiles_read += 1
            if contentfiles_read % 100 == 0:
                log("Fixed " +
                    str(100 * contentfiles_read / len(contentfiles))[:4] +
                    "% of content files")

        pass  #Proceed to next link chunk

    #==========================================================================
    # Finished link processing
    # Remove unworthy concepts and combine concept/word lists.
    #==========================================================================
    #What, you think memory grows on trees?
    del linkhash
    gc.collect()

    #Set of all approved concepts
    concept_list = set([])

    #Purge inferior concepts (with insufficient incoming links)
    for filename in contentfiles:
        #Read in content file
        with open(filename, 'r') as f:
            content = shared.load(f)

        for concept in content.keys():
            entry = content[concept]
            if 'links_in' in entry and len(entry['links_in']) >= min_links_in:
                concept_list.add(concept)
            else:
                del content[concept]

        with open(filename, 'w') as f:
            shared.dump(content, f)

    log("Links done - saving index files")

    #Make sure output dir exists
    if not os.path.exists(matrix_dir):
        os.makedirs(matrix_dir)

    #Generate and save a concept index map. Structure: {concept : index}
    concept_indices = {n: m for m, n in enumerate(concept_list)}
    with open(matrix_dir + 'concept2index.ind', 'w') as f:
        shared.dump(concept_indices, f)

    #Read in all wordlists and combine them.
    words = set([])
    for filename in glob.glob(temp_dir + '*' + extensions['words']):
        with open(filename, 'r') as f:
            words.update(shared.load(f))

    #Generate and save a word index map. Structure: {word : index}
    word_indices = {n: m for m, n in enumerate(words)}
    with open(matrix_dir + 'word2index.ind', 'w') as f:
        shared.dump(word_indices, f)

    log("Wrapping up.")

    #Attempt to notify that job is done
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except:
            log("Job's done. Push failed.")

    logfile.close()
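#listchopper is used above to split the list of link files into chunks, so
#that only part of the link data has to be held in memory at any one time.
#A minimal sketch of such a helper is given below, assuming a fixed chunk
#size; the real chunk size and implementation may differ.
def _listchopper_sketch(items, chunk_size=100):
    '''Yields successive slices of items, each at most chunk_size long (hypothetical).'''
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]

#Example (hypothetical): linkchunks = _listchopper_sketch(linkfiles)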