Exemple #1
0
def main():
    '''Merge incoming-link data into the content files and build index maps.

    Works in three stages:
      1. Read the link files chunk by chunk (chunks come from listchopper) and
         add every chunk's incoming links to the 'links_in' set of each
         concept in each content file.
      2. Remove every concept with fewer than min_links_in incoming links and
         rewrite the content files.
      3. Save a {concept : index} map and a {word : index} map in matrix_dir.

    Uses names defined outside this function: temp_dir, matrix_dir,
    extensions, min_links_in, listchopper, log, logfile and the shared module.

    Raises:
        IOError: if temp_dir does not exist or contains no link files.
    '''
    #Verify that the input dir exists
    if not os.path.exists(temp_dir):
        raise IOError("Input directory not found: %s" % (temp_dir,))

    #==========================================================================
    #     Read in link data and update content files accordingly
    #==========================================================================

    #Get list of files containing link info and chop it up
    linkfiles = glob.glob(temp_dir + '*' + extensions['links'])
    if not linkfiles:
        #Without link data every concept would fail the purge below, so stop
        #before any content file is rewritten.
        raise IOError("No link files found in: %s" % (temp_dir,))
    linkchunks = listchopper(linkfiles)

    #The content files are only rewritten in place, so list them once
    contentfiles = glob.glob(temp_dir + '*' + extensions['content'])

    #Bound up front so the cleanup after the loop never hits an unbound name
    linkhash = {}
    linkfiles_read = 0
    for linkchunk in linkchunks:
        #Hash mapping each article to a set of articles linking to it
        linkhash = {}

        for filename in linkchunk:
            with open(filename, 'r') as f:
                newstuff = shared.load(f)
            #Add link info to linkhash
            for target in newstuff:
                linkhash.setdefault(target, set()).update(newstuff[target])

            #Log status
            linkfiles_read += 1
            log("Read " + filename + " - " +
                str(100 * linkfiles_read / len(linkfiles))[:4] +
                " % of link data.")

        log("Chunk finished - updating content files")
        #Update concepts with newly read link data
        contentfiles_read = 0
        for filename in contentfiles:
            #Read file. Content is like {'article title' : {'text' : blah}}
            with open(filename, 'r') as f:
                content = shared.load(f)

            for concept in content:
                entry = content[concept]
                #Links stored by earlier chunks plus the links going TO
                #concept in this chunk. Missing key => zero incoming links
                links_in = set(entry.get('links_in', ()))
                links_in.update(linkhash.get(concept, ()))
                entry['links_in'] = links_in

            #Save updated content
            with open(filename, 'w') as f:
                shared.dump(content, f)

            contentfiles_read += 1
            if contentfiles_read % 100 == 0:
                log("Fixed " +
                    str(100 * contentfiles_read / len(contentfiles))[:4] +
                    "% of content files")

    #==========================================================================
    #     Finished link processing
    #     Remove unworthy concepts and combine concept/word lists.
    #==========================================================================

    #Release the last chunk's link data before the next pass
    del linkhash
    gc.collect()

    #Set of all approved concepts
    concept_list = set()

    #Purge inferior concepts (with insufficient incoming links)
    for filename in contentfiles:
        #Read in content file
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Iterate over a snapshot of the keys - entries are deleted on the fly
        for concept in list(content):
            entry = content[concept]
            if 'links_in' in entry and len(entry['links_in']) >= min_links_in:
                concept_list.add(concept)
            else:
                del content[concept]

        with open(filename, 'w') as f:
            shared.dump(content, f)

    log("Links done - saving index files")

    #Make sure output dir exists
    if not os.path.exists(matrix_dir):
        os.makedirs(matrix_dir)

    #Generate and save a concept index map. Structure: {concept : index}
    concept_indices = {n: m for m, n in enumerate(concept_list)}
    with open(matrix_dir + 'concept2index.ind', 'w') as f:
        shared.dump(concept_indices, f)

    #Read in all wordlists and combine them.
    words = set()
    for filename in glob.glob(temp_dir + '*' + extensions['words']):
        with open(filename, 'r') as f:
            words.update(shared.load(f))

    #Generate and save a word index map. Structure: {word : index}
    word_indices = {n: m for m, n in enumerate(words)}
    with open(matrix_dir + 'word2index.ind', 'w') as f:
        shared.dump(word_indices, f)

    log("Wrapping up.")
    #Attempt to notify that job is done. Best effort only - a failed push is
    #logged, but Ctrl-C and interpreter exit are no longer swallowed.
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except Exception:
            log("Job's done. Push failed.")

    logfile.close()
Exemple #2
0
def main():
    '''Build the word/concept count matrix in chunks and convert it to TF-IDF.

    Stage 1 reads every content file, counts the words of each concept's
    'text' and stores the counts in a sparse (n_words x n_concepts) matrix.
    Every column_chunk_size files the matrix is split into blocks of
    row_chunk_size rows, each block is added to its own file in matrix_dir
    and the in-memory matrix is reset.

    Stage 2 reloads each submatrix file and replaces every stored count m_ij
    by (1 + ln m_ij) * ln(n_concepts / k), where k is the number of stored
    entries in that row of the submatrix, then scales the row to unit
    Euclidean length. If the module-level flag prune is set, the row is
    additionally truncated using shared.window_size and shared.cutoff.

    Uses names defined outside this function: matrix_dir, temp_dir,
    extensions, datatype, row_chunk_size, column_chunk_size, prune, percentof,
    log, logfile and the shared module.

    Returns:
        None
    '''
    #Cleanup - remove submatrix files left by earlier runs
    for stale in glob.glob(matrix_dir + '/*' + extensions['matrix']):
        os.remove(stale)

    #Set pruning parameters
    window_size = shared.window_size
    cutoff = shared.cutoff

    #Read in dicts mapping words and concepts to their respective indices
    log("Reading in word/index data")
    with open(matrix_dir + 'word2index.ind', 'r') as f:
        word2index = shared.load(f)
    with open(matrix_dir + 'concept2index.ind', 'r') as f:
        concept2index = shared.load(f)
    log("...Done!")

    #==========================================================================
    #     Construct count matrix in small chunks
    #==========================================================================

    #Count words and concepts
    n_words = len(word2index)
    n_concepts = len(concept2index)

    #Determine matrix dimensions
    matrix_shape = (n_words, n_concepts)

    #Allocate sparse matrix. Dict-of-keys should be faster for iterative
    #construction. Convert to csr for fast row operations later.
    mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    def matrix_chopper(matrix, dim):
        '''Generator to split a huge matrix into small submatrices, which can
        then be stored in individual files.
        Yields (submatrix number, csr submatrix of at most dim rows).'''
        csr = matrix.tocsr()
        rows = csr.shape[0]
        for counter, ind in enumerate(range(0, rows, dim)):
            #Row slicing a csr matrix yields the same block as stacking its
            #rows one by one
            yield counter, csr[ind:min(ind + dim, rows)]

    def writeout(matrix):
        '''Adds matrix, block by block, to the submatrix files.'''
        for n, submatrix in matrix_chopper(matrix, row_chunk_size):
            filename = matrix_dir + str(n) + extensions['matrix']
            log("Writing out chunk %s" % n)
            #Update submatrix if it's already partially calculated. Only a
            #failure to open the file means "not written yet"; an error while
            #loading it must not be silently overwritten.
            try:
                f = open(filename, 'r')
            except IOError:
                pass  #File doesn't exist yet, so nothing to add
            else:
                with f:
                    submatrix = submatrix + shared.mload(f)

            #Dump the submatrix to file
            with open(filename, 'w') as f:
                shared.mdump(submatrix, f)

    log("Constructing matrix.")
    filelist = glob.glob(temp_dir + '*' + extensions['content'])
    files_read = 0
    for filename in filelist:
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Loop over concepts (columns) so we don't waste time with rare words
        for concept in content:
            #This is the column index (concept w. index j)
            j = concept2index[concept]

            #Convert concept text to a 'countmap' like so: {word : n}
            wordmap = Counter(content[concept]['text'].split())

            #Store the number of times word i occurs in concept j
            for word in wordmap:
                mtx[word2index[word], j] = wordmap[word]

        #Update file count
        files_read += 1
        log("Processed content file no. %s of %s - %s" %
            (files_read, len(filelist),
             percentof(files_read, len(filelist))))

        if files_read % column_chunk_size == 0:
            writeout(mtx.tocsr())
            #Start the next chunk with the same dtype as the first one
            mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    #Convert what is left to CSR format and write to files.
    writeout(mtx.tocsr())

    #==========================================================================
    # Count matrix/matrices constructed - computing TF-IDF
    #==========================================================================

    log("Done - computing TF-IDF")

    #Grab list of matrix files (containing the submatrices from before)
    matrixfiles = glob.glob(matrix_dir + "*" + extensions['matrix'])
    words_processed = 0  #for logging purposes

    for filename in matrixfiles:
        with open(filename, 'r') as f:
            mtx = shared.mload(f)

        #TF-IDF weights are fractional; written into a non-float matrix they
        #would round to zero w/o warning
        if mtx.dtype.kind != 'f':
            mtx = mtx.astype(np.float64)

        for w in range(mtx.shape[0]):
            #Grab non-zero elements from the row corresponding to word w
            start, stop = mtx.indptr[w], mtx.indptr[w + 1]
            counts = mtx.data[start:stop]
            if len(counts) == 0:
                continue

            #Convert the full row to TF-IDF. float() keeps the ratio from
            #being floored by integer division.
            idf = np.log(float(n_concepts) / len(counts))
            row = (1 + np.log(counts)) * idf

            #Normalize the row. A word found in every concept has idf 0 and
            #hence norm 0; leave that row at zero instead of dividing by it.
            norm = np.linalg.norm(row)
            if norm > 0:
                row *= 1.0 / norm

            #Start inverted index pruning
            if prune:
                #Number of documents containing w
                n_docs = len(row)

                #Don't prune if the window exceeds the array bounds
                if window_size < n_docs:
                    #Indices that sort the row in descending order
                    order = np.argsort(row)[::-1]
                    ranked = row[order]

                    #Go through sorted row and truncate when pruning
                    #condition is met.
                    #NOTE(review): truncation starts at i, the head of the
                    #window, so a hit at i == 0 zeroes the whole row -
                    #confirm this is intended.
                    for i in range(n_docs - window_size):
                        if ranked[i + window_size] >= cutoff * ranked[i]:
                            ranked[i:] = 0
                            break

                    #Unsort to original positions
                    row[order] = ranked

            #Update matrix
            mtx.data[start:stop] = row

            #Log it
            words_processed += 1
            if words_processed % 10**3 == 0:
                log("Processing word %s of %s - %s" %
                    (words_processed, n_words,
                     percentof(words_processed, n_words)))

        #Keep it sparse - no need to store zeroes
        mtx.eliminate_zeros()
        with open(filename, 'w') as f:
            shared.mdump(mtx, f)

    log("Done!")

    #Notify that the job is done. Best effort only - a failed push is logged,
    #but Ctrl-C and interpreter exit are no longer swallowed.
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except Exception:
            log("Job's done. Push failed.")

    logfile.close()
    return None
Exemple #3
0
        self.flush_output_buffer()
        return None
    
if __name__ == "__main__":
    #Parse the file given as the only command line argument, else the default
    if len(sys.argv) == 2:
        file_to_parse = sys.argv[1]
    else:
        file_to_parse = DEFAULT_FILENAME

    try:
        #Create and configure content handler
        test = WikiHandler()
        test.verbose = True

        #Create a parser and set handler
        ATST = SAX.make_parser()
        ATST.setContentHandler(test)

        #Let the parser walk the file
        log("Parsing started...")
        ATST.parse(file_to_parse)
        log("...Parsing done!")

        #Attempt to send notification that job is done. Best effort only - a
        #failed push is logged, but Ctrl-C and interpreter exit are no longer
        #swallowed.
        if shared.notify:
            try:
                shared.pushme(sys.argv[0] + ' completed.')
            except Exception:
                log("Job's done. Push failed.")
    finally:
        #Close the log even when parsing fails
        logfile.close()
def main():
    '''Merge incoming-link data into the content files and build index maps.

    Works in three stages:
      1. Read the link files chunk by chunk (chunks come from listchopper) and
         add every chunk's incoming links to the 'links_in' set of each
         concept in each content file.
      2. Remove every concept with fewer than min_links_in incoming links and
         rewrite the content files.
      3. Save a {concept : index} map and a {word : index} map in matrix_dir.

    Uses names defined outside this function: temp_dir, matrix_dir,
    extensions, min_links_in, listchopper, log, logfile and the shared module.

    Raises:
        IOError: if temp_dir does not exist or contains no link files.
    '''
    #Verify that the input dir exists
    if not os.path.exists(temp_dir):
        raise IOError("Input directory not found: %s" % (temp_dir,))

    #==========================================================================
    #     Read in link data and update content files accordingly
    #==========================================================================

    #Get list of files containing link info and chop it up
    linkfiles = glob.glob(temp_dir + '*' + extensions['links'])
    if not linkfiles:
        #Without link data every concept would fail the purge below, so stop
        #before any content file is rewritten.
        raise IOError("No link files found in: %s" % (temp_dir,))
    linkchunks = listchopper(linkfiles)

    #The content files are only rewritten in place, so list them once
    contentfiles = glob.glob(temp_dir + '*' + extensions['content'])

    #Bound up front so the cleanup after the loop never hits an unbound name
    linkhash = {}
    linkfiles_read = 0
    for linkchunk in linkchunks:
        #Hash mapping each article to a set of articles linking to it
        linkhash = {}

        for filename in linkchunk:
            with open(filename, 'r') as f:
                newstuff = shared.load(f)
            #Add link info to linkhash
            for target in newstuff:
                linkhash.setdefault(target, set()).update(newstuff[target])

            #Log status
            linkfiles_read += 1
            log("Read " + filename + " - " +
                str(100*linkfiles_read/len(linkfiles))[:4] + " % of link data.")

        log("Chunk finished - updating content files")
        #Update concepts with newly read link data
        contentfiles_read = 0
        for filename in contentfiles:
            #Read file. Content is like {'article title' : {'text' : blah}}
            with open(filename, 'r') as f:
                content = shared.load(f)

            for concept in content:
                entry = content[concept]
                #Links stored by earlier chunks plus the links going TO
                #concept in this chunk. Missing key => zero incoming links
                links_in = set(entry.get('links_in', ()))
                links_in.update(linkhash.get(concept, ()))
                entry['links_in'] = links_in

            #Save updated content
            with open(filename, 'w') as f:
                shared.dump(content, f)

            contentfiles_read += 1
            if contentfiles_read % 100 == 0:
                log("Fixed " + str(100*contentfiles_read/len(contentfiles))[:4]
                    + "% of content files")

    #==========================================================================
    #     Finished link processing
    #     Remove unworthy concepts and combine concept/word lists.
    #==========================================================================

    #Release the last chunk's link data before the next pass
    del linkhash
    gc.collect()

    #Set of all approved concepts
    concept_list = set()

    #Purge inferior concepts (with insufficient incoming links)
    for filename in contentfiles:
        #Read in content file
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Iterate over a snapshot of the keys - entries are deleted on the fly
        for concept in list(content):
            entry = content[concept]
            if 'links_in' in entry and len(entry['links_in']) >= min_links_in:
                concept_list.add(concept)
            else:
                del content[concept]

        with open(filename, 'w') as f:
            shared.dump(content, f)

    log("Links done - saving index files")

    #Make sure output dir exists
    if not os.path.exists(matrix_dir):
        os.makedirs(matrix_dir)

    #Generate and save a concept index map. Structure: {concept : index}
    concept_indices = {n: m for m, n in enumerate(concept_list)}
    with open(matrix_dir+'concept2index.ind', 'w') as f:
        shared.dump(concept_indices, f)

    #Read in all wordlists and combine them.
    words = set()
    for filename in glob.glob(temp_dir + '*'+extensions['words']):
        with open(filename, 'r') as f:
            words.update(shared.load(f))

    #Generate and save a word index map. Structure: {word : index}
    word_indices = {n: m for m, n in enumerate(words)}
    with open(matrix_dir+'word2index.ind', 'w') as f:
        shared.dump(word_indices, f)

    log("Wrapping up.")
    #Attempt to notify that job is done. Best effort only - a failed push is
    #logged, but Ctrl-C and interpreter exit are no longer swallowed.
    if shared.notify:
        try:
            shared.pushme(sys.argv[0]+' completed.')
        except Exception:
            log("Job's done. Push failed.")

    logfile.close()
def main():
    '''Build the word/concept count matrix in chunks and convert it to TF-IDF.

    Stage 1 reads every content file, counts the words of each concept's
    'text' and stores the counts in a sparse (n_words x n_concepts) matrix.
    Every column_chunk_size files the matrix is split into blocks of
    row_chunk_size rows, each block is added to its own file in matrix_dir
    and the in-memory matrix is reset.

    Stage 2 reloads each submatrix file and replaces every stored count m_ij
    by (1 + ln m_ij) * ln(n_concepts / k), where k is the number of stored
    entries in that row of the submatrix, then scales the row to unit
    Euclidean length. If the module-level flag prune is set, the row is
    additionally truncated using shared.window_size and shared.cutoff.

    Uses names defined outside this function: matrix_dir, temp_dir,
    extensions, datatype, row_chunk_size, column_chunk_size, prune, percentof,
    log, logfile and the shared module.

    Returns:
        None
    '''
    #Cleanup - remove submatrix files left by earlier runs
    for stale in glob.glob(matrix_dir + '/*'+extensions['matrix']):
        os.remove(stale)

    #Set pruning parameters
    window_size = shared.window_size
    cutoff = shared.cutoff

    #Read in dicts mapping words and concepts to their respective indices
    log("Reading in word/index data")
    with open(matrix_dir+'word2index.ind', 'r') as f:
        word2index = shared.load(f)
    with open(matrix_dir+'concept2index.ind', 'r') as f:
        concept2index = shared.load(f)
    log("...Done!")

    #==========================================================================
    #     Construct count matrix in small chunks
    #==========================================================================

    #Count words and concepts
    n_words = len(word2index)
    n_concepts = len(concept2index)

    #Determine matrix dimensions
    matrix_shape = (n_words, n_concepts)

    #Allocate sparse matrix. Dict-of-keys should be faster for iterative
    #construction. Convert to csr for fast row operations later.
    mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    def matrix_chopper(matrix, dim):
        '''Generator to split a huge matrix into small submatrices, which can
        then be stored in individual files.
        Yields (submatrix number, csr submatrix of at most dim rows).'''
        csr = matrix.tocsr()
        rows = csr.shape[0]
        for counter, ind in enumerate(range(0, rows, dim)):
            #Row slicing a csr matrix yields the same block as stacking its
            #rows one by one
            yield counter, csr[ind:min(ind + dim, rows)]

    def writeout(matrix):
        '''Adds matrix, block by block, to the submatrix files.'''
        for n, submatrix in matrix_chopper(matrix, row_chunk_size):
            filename = matrix_dir+str(n)+extensions['matrix']
            log("Writing out chunk %s" % n)
            #Update submatrix if it's already partially calculated. Only a
            #failure to open the file means "not written yet"; an error while
            #loading it must not be silently overwritten.
            try:
                f = open(filename, 'r')
            except IOError:
                pass  #File doesn't exist yet, so nothing to add
            else:
                with f:
                    submatrix = submatrix + shared.mload(f)

            #Dump the submatrix to file
            with open(filename, 'w') as f:
                shared.mdump(submatrix, f)

    log("Constructing matrix.")
    filelist = glob.glob(temp_dir + '*'+extensions['content'])
    files_read = 0
    for filename in filelist:
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Loop over concepts (columns) so we don't waste time with rare words
        for concept in content:
            #This is the column index (concept w. index j)
            j = concept2index[concept]

            #Convert concept text to a 'countmap' like so: {word : n}
            wordmap = Counter(content[concept]['text'].split())

            #Store the number of times word i occurs in concept j
            for word in wordmap:
                mtx[word2index[word], j] = wordmap[word]

        #Update file count
        files_read += 1
        log("Processed content file no. %s of %s - %s"
            % (files_read, len(filelist),
               percentof(files_read, len(filelist))))

        if files_read % column_chunk_size == 0:
            writeout(mtx.tocsr())
            #Start the next chunk with the same dtype as the first one
            mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    #Convert what is left to CSR format and write to files.
    writeout(mtx.tocsr())

    #==========================================================================
    # Count matrix/matrices constructed - computing TF-IDF
    #==========================================================================

    log("Done - computing TF-IDF")

    #Grab list of matrix files (containing the submatrices from before)
    matrixfiles = glob.glob(matrix_dir + "*" + extensions['matrix'])
    words_processed = 0  #for logging purposes

    for filename in matrixfiles:
        with open(filename, 'r') as f:
            mtx = shared.mload(f)

        #TF-IDF weights are fractional; written into a non-float matrix they
        #would round to zero w/o warning
        if mtx.dtype.kind != 'f':
            mtx = mtx.astype(np.float64)

        for w in range(mtx.shape[0]):
            #Grab non-zero elements from the row corresponding to word w
            start, stop = mtx.indptr[w], mtx.indptr[w+1]
            counts = mtx.data[start:stop]
            if len(counts) == 0:
                continue

            #Convert the full row to TF-IDF. float() keeps the ratio from
            #being floored by integer division.
            idf = np.log(float(n_concepts)/len(counts))
            row = (1 + np.log(counts))*idf

            #Normalize the row. A word found in every concept has idf 0 and
            #hence norm 0; leave that row at zero instead of dividing by it.
            norm = np.linalg.norm(row)
            if norm > 0:
                row *= 1.0/norm

            #Start inverted index pruning
            if prune:
                #Number of documents containing w
                n_docs = len(row)

                #Don't prune if the window exceeds the array bounds
                if window_size < n_docs:
                    #Indices that sort the row in descending order
                    order = np.argsort(row)[::-1]
                    ranked = row[order]

                    #Go through sorted row and truncate when pruning
                    #condition is met.
                    #NOTE(review): truncation starts at i, the head of the
                    #window, so a hit at i == 0 zeroes the whole row -
                    #confirm this is intended.
                    for i in range(n_docs-window_size):
                        if ranked[i+window_size] >= cutoff*ranked[i]:
                            ranked[i:] = 0
                            break

                    #Unsort to original positions
                    row[order] = ranked

            #Update matrix
            mtx.data[start:stop] = row

            #Log it
            words_processed += 1
            if words_processed % 10**3 == 0:
                log("Processing word %s of %s - %s" %
                    (words_processed, n_words,
                     percentof(words_processed, n_words)))

        #Keep it sparse - no need to store zeroes
        mtx.eliminate_zeros()
        with open(filename, 'w') as f:
            shared.mdump(mtx, f)

    log("Done!")

    #Notify that the job is done. Best effort only - a failed push is logged,
    #but Ctrl-C and interpreter exit are no longer swallowed.
    if shared.notify:
        try:
            shared.pushme(sys.argv[0]+' completed.')
        except Exception:
            log("Job's done. Push failed.")

    logfile.close()
    return None
Exemple #6
0
        return None


if __name__ == "__main__":
    #Parse the file given as the only command line argument, else the default
    if len(sys.argv) == 2:
        file_to_parse = sys.argv[1]
    else:
        file_to_parse = DEFAULT_FILENAME

    try:
        #Create and configure content handler
        test = WikiHandler()
        test.verbose = True

        #Create a parser and set handler
        ATST = SAX.make_parser()
        ATST.setContentHandler(test)

        #Let the parser walk the file
        log("Parsing started...")
        ATST.parse(file_to_parse)
        log("...Parsing done!")

        #Attempt to send notification that job is done. Best effort only - a
        #failed push is logged, but Ctrl-C and interpreter exit are no longer
        #swallowed.
        if shared.notify:
            try:
                shared.pushme(sys.argv[0] + ' completed.')
            except Exception:
                log("Job's done. Push failed.")
    finally:
        #Close the log even when parsing fails
        logfile.close()