def main():
    #Import shared parameters and verify output dir exists
    if not os.path.exists(temp_dir):
        raise IOError("Temp directory does not exist: " + temp_dir)
#==============================================================================
# Read in link data and update content files accordingly
#==============================================================================
    #Get list of files containing link info and chop it up
    linkfiles = glob.glob(temp_dir + '*' + extensions['links'])
    linkchunks = listchopper(linkfiles)
    linkfiles_read = 0
    for linkchunk in linkchunks:
        #Hash mapping each article to the set of articles linking to it
        linkhash = {}
        for filename in linkchunk:
            with open(filename, 'r') as f:
                newstuff = shared.load(f)
            #Add link info to linkhash
            for target, sources in newstuff.iteritems():
                try:
                    linkhash[target].update(set(sources))
                except KeyError:
                    linkhash[target] = set(sources)
            #Log status
            linkfiles_read += 1
            log("Read " + filename + " - "
                + str(100 * linkfiles_read / len(linkfiles))[:4]
                + " % of link data.")
        log("Chunk finished - updating content files")
        #Update content files with the newly read link data
        contentfiles = glob.glob(temp_dir + '*' + extensions['content'])
        contentfiles_read = 0
        for filename in contentfiles:
            #Read file. Content is like {'article title' : {'text' : blah}}
            with open(filename, 'r') as f:
                content = shared.load(f)
            #Search linkhash for links going TO each concept
            for concept in content.keys():
                try:
                    sources = linkhash[concept]
                except KeyError:
                    sources = set([])  #Missing key => zero incoming links
                #Update link info for concept
                try:
                    content[concept]['links_in'] = set(
                        content[concept]['links_in'])
                    content[concept]['links_in'].update(sources)
                except KeyError:
                    content[concept]['links_in'] = sources
            #Save updated content
            with open(filename, 'w') as f:
                shared.dump(content, f)
            contentfiles_read += 1
            if contentfiles_read % 100 == 0:
                log("Fixed "
                    + str(100 * contentfiles_read / len(contentfiles))[:4]
                    + "% of content files")
        pass  #Proceed to next link chunk
#==============================================================================
# Finished link processing
# Remove unworthy concepts and combine concept/word lists.
#==============================================================================
    #What, you think memory grows on trees?
    del linkhash
    gc.collect()
    #Set of all approved concepts
    concept_list = set([])
    #Purge inferior concepts (with insufficient incoming links)
    for filename in contentfiles:
        #Read in content file
        with open(filename, 'r') as f:
            content = shared.load(f)
        for concept in content.keys():
            entry = content[concept]
            if 'links_in' in entry and len(entry['links_in']) >= min_links_in:
                concept_list.add(concept)
            else:
                del content[concept]
        with open(filename, 'w') as f:
            shared.dump(content, f)
    log("Links done - saving index files")
    #Make sure output dir exists
    if not os.path.exists(matrix_dir):
        os.makedirs(matrix_dir)
    #Generate and save a concept index map. Structure: {concept : index}
    concept_indices = {n: m for m, n in enumerate(concept_list)}
    with open(matrix_dir + 'concept2index.ind', 'w') as f:
        shared.dump(concept_indices, f)
    #Read in all wordlists and combine them.
    words = set([])
    for filename in glob.glob(temp_dir + '*' + extensions['words']):
        with open(filename, 'r') as f:
            words.update(shared.load(f))
    #Generate and save a word index map. Structure: {word : index}
    word_indices = {n: m for m, n in enumerate(words)}
    with open(matrix_dir + 'word2index.ind', 'w') as f:
        shared.dump(word_indices, f)
    log("Wrapping up.")
    #Attempt to notify that job is done
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except:
            log("Job's done. Push failed.")
    logfile.close()
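#listchopper is used above to split the list of link files into chunks so
#that only one chunk's linkhash has to sit in memory at a time. It is not
#defined in this file; the following is only a minimal sketch of what such a
#helper is assumed to do (the chunk size of 100 is a hypothetical default).
def listchopper(lst, chunk_size=100):
    '''Yield successive chunk_size-sized slices of lst.'''
    for start in xrange(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]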
def main():
    #Cleanup
    for f in glob.glob(matrix_dir + '/*' + extensions['matrix']):
        os.remove(f)
    #Set pruning parameters
    window_size = shared.window_size
    cutoff = shared.cutoff
    #Read in dicts mapping words and concepts to their respective indices
    log("Reading in word/index data")
    word2index = shared.load(open(matrix_dir + 'word2index.ind', 'r'))
    concept2index = shared.load(open(matrix_dir + 'concept2index.ind', 'r'))
    log("...Done!")
#==============================================================================
# Construct count matrix in small chunks
#==============================================================================
    #Count words and concepts
    n_words = len(word2index)
    n_concepts = len(concept2index)
    #Determine matrix dimensions
    matrix_shape = (n_words, n_concepts)
    #Allocate sparse matrix. Dict-of-keys should be faster for iterative
    #construction. Convert to csr for fast row operations later.
    mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    def matrix_chopper(matrix, dim):
        '''Generator that splits a huge matrix into small submatrices, which
        can then be stored in individual files. This is handy both when
        constructing the matrix (building the whole matrix without saving to
        files along the way takes about 50 GB of RAM), and when applying it,
        as it allows one to load only the submatrix relevant to a given
        word.'''
        ind = 0
        counter = 0
        rows = matrix.get_shape()[0]
        while ind < rows:
            end = min(ind + dim, rows)
            #Yield a pair of submatrix number and the submatrix itself
            yield counter, sps.vstack([matrix.getrow(i)
                                       for i in xrange(ind, end)],
                                      format='csr')
            counter += 1
            ind += dim

    def writeout():
        '''Saves the matrix as small submatrices in separate files.'''
        for n, submatrix in matrix_chopper(mtx, row_chunk_size):
            filename = matrix_dir + str(n) + extensions['matrix']
            #Update the submatrix if it has already been partially calculated
            log("Writing out chunk %s" % n)
            try:
                with open(filename, 'r') as f:
                    submatrix = submatrix + shared.mload(f)
            except IOError:
                pass  #File doesn't exist yet, so no need to change mtx
            #Dump the submatrix to file
            with open(filename, 'w') as f:
                shared.mdump(submatrix, f)
        return None

    log("Constructing matrix.")
    filelist = glob.glob(temp_dir + '*' + extensions['content'])
    files_read = 0
    for filename in filelist:
        with open(filename, 'r') as f:
            content = shared.load(f)
        #Loop over concepts (columns) so we don't waste time on rare words
        for concept, entry in content.iteritems():
            #This is the column index (concept w. index j)
            j = concept2index[concept]
            #Convert the concept's text to a 'countmap' like {word : n}
            wordmap = Counter(entry['text'].split()).iteritems()
            #Add them all to the matrix
            for word, count in wordmap:
                #Find row index of the current word
                i = word2index[word]
                #Add the number of times word i occurs in concept j
                mtx[i, j] = count
        #Update file count
        files_read += 1
        log("Processed content file no. %s of %s - %s"
            % (files_read, len(filelist),
               percentof(files_read, len(filelist))))
        if files_read % column_chunk_size == 0:
            mtx = mtx.tocsr()
            writeout()
            mtx = sps.dok_matrix(matrix_shape)
    #Convert matrix to CSR format and write to files.
    mtx = mtx.tocsr()
    writeout()
#==============================================================================
# Count matrix/matrices constructed - computing TF-IDF
#==============================================================================
    log("Done - computing TF-IDF")
    #Grab list of matrix files (containing the submatrices from before)
    matrixfiles = glob.glob(matrix_dir + "*" + extensions['matrix'])
    words_processed = 0  #for logging purposes
    for filename in matrixfiles:
        with open(filename, 'r') as f:
            mtx = shared.mload(f)
        #Number of words in this submatrix
        n_rows = mtx.get_shape()[0]
        for w in xrange(n_rows):
            #Grab the non-zero elements from the row corresponding to word w
            row = mtx.data[mtx.indptr[w]:mtx.indptr[w + 1]]
            if len(row) == 0:
                continue
            #Make a vectorized function to convert a full row to TF-IDF
            f = np.vectorize(lambda m_ij: (1 + np.log(m_ij))
                             * np.log(float(n_concepts) / len(row)))
            #Map all elements to TF-IDF and update matrix
            row = f(row)
            #Normalize the row
            assert row.dtype.kind == 'f'  #Non-floats round to zero w/o warning
            normfact = 1.0 / np.linalg.norm(row)
            row *= normfact
            #Start inverted index pruning
            if prune:
                #Number of documents containing w
                n_docs = len(row)
                #Don't prune if the window exceeds the array bounds
                if window_size < n_docs:
                    #Obtain list of indices such that row[index] is sorted
                    indices = np.argsort(row)[::-1]
                    #Generate a sorted (descending) row
                    sorted_row = [row[index] for index in indices]
                    #Walk the sorted row and truncate when the pruning
                    #condition is met
                    for i in xrange(n_docs - window_size):
                        if sorted_row[i + window_size] >= cutoff * sorted_row[i]:
                            #Truncate, i.e. set the remaining entries to zero
                            sorted_row[i:] = [0] * (n_docs - i)
                            break
                    #Unsort to original positions
                    for i in xrange(n_docs):
                        row[indices[i]] = sorted_row[i]
            #Update matrix
            mtx.data[mtx.indptr[w]:mtx.indptr[w + 1]] = row
            #Log it
            words_processed += 1
            if words_processed % 10**3 == 0:
                log("Processing word %s of %s - %s"
                    % (words_processed, n_words,
                       percentof(words_processed, n_words)))
        #Keep it sparse - no need to store zeroes
        mtx.eliminate_zeros()
        with open(filename, 'w') as f:
            shared.mdump(mtx, f)
    log("Done!")
    #Notify that the job is done
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except:
            log("Job's done. Push failed.")
    logfile.close()
    return None
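#The matrix_chopper docstring above notes that the chunked layout also lets a
#consumer load only the submatrix relevant to a given word. A minimal sketch
#of that lookup, assuming the same row_chunk_size, matrix_dir,
#extensions['matrix'], word2index and shared.mload as above (the function
#name row_for_word is hypothetical):
def row_for_word(word, word2index):
    '''Return the stored row for a word, loading only its submatrix file.'''
    i = word2index[word]
    chunk_no = i // row_chunk_size   #which submatrix file holds row i
    local_row = i % row_chunk_size   #row index within that submatrix
    with open(matrix_dir + str(chunk_no) + extensions['matrix'], 'r') as f:
        submatrix = shared.mload(f)
    return submatrix.getrow(local_row)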
        self.flush_output_buffer()
        return None


if __name__ == "__main__":
    if len(sys.argv) == 2:
        file_to_parse = sys.argv[1]
    else:
        file_to_parse = DEFAULT_FILENAME
    #Create and configure content handler
    test = WikiHandler()
    test.verbose = True
    #Create a parser and set the handler
    ATST = SAX.make_parser()
    ATST.setContentHandler(test)
    #Let the parser walk the file
    log("Parsing started...")
    ATST.parse(file_to_parse)
    log("...Parsing done!")
    #Attempt to send notification that job is done
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except:
            log("Job's done. Push failed.")
    logfile.close()