Example #1
    def writeout(self):
        '''Writes output buffer contents to file'''
        #Generate filename and write to file
        filename = make_filename.next()
        #Write article contents to file
        with open(filename+extensions['content'], 'w') as f:
            shared.dump(self.output_buffer, f)

        #Store wordlist as files
        with open(filename+extensions['words'], 'w') as f:
            shared.dump(self.words, f)

        #Store linkhash in files
        with open(filename+extensions['links'], 'w') as f:
            shared.dump(self.linkhash, f)

        if self.verbose:
            log("wrote "+filename)

        #Empty output buffer
        self.flush_output_buffer()
        return None
Example #2
    def writeout(self):
        '''Writes output buffer contents to file'''
        #Generate filename and write to file
        filename = make_filename.next()
        #Write article contents to file
        with open(filename + extensions['content'], 'w') as f:
            shared.dump(self.output_buffer, f)

        #Store wordlist as files
        with open(filename + extensions['words'], 'w') as f:
            shared.dump(self.words, f)

        #Store linkhash in files
        with open(filename + extensions['links'], 'w') as f:
            shared.dump(self.linkhash, f)

        if self.verbose:
            log("wrote " + filename)

        #Empty output buffer
        self.flush_output_buffer()
        return None
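
Both versions of writeout() above rely on module-level names that the examples do not define: a make_filename generator (consumed with .next(), so this is Python 2 code), an extensions mapping of file suffixes, a shared module providing dump()/load(), and a log() helper. The definitions below are only a guessed sketch of what those objects might look like, not the project's actual code; the suffixes, the serializer, and the filename pattern are all assumptions.

#Hypothetical stand-ins for the globals assumed by writeout() -- illustration only
import itertools
import pickle as shared   #assumed: shared.dump/load behave like pickle.dump/load

#Assumed file suffixes; the real values are not shown in the examples
extensions = {'content': '.content', 'words': '.words', 'links': '.links'}

def _filenames(prefix='part_'):
    '''Yield part_0, part_1, ... as base output filenames (pattern is assumed)'''
    for i in itertools.count():
        yield prefix + str(i)

make_filename = _filenames()

def log(message):
    '''Stand-in logger; the real project presumably appends to a logfile'''
    print(message)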
Example #3
def main():
    #Import shared parameters and verify output dir exists
    if not os.path.exists(temp_dir):
        raise IOError("Temp directory not found: " + temp_dir)

#==============================================================================
#     Read in link data and update content files accordingly
#==============================================================================

    #Get list of files containing link info and chop it up
    linkfiles = glob.glob(temp_dir + '*' + extensions['links'])
    linkchunks = listchopper(linkfiles)

    linkfiles_read = 0
    for linkchunk in linkchunks:
        #Hash mapping each article to a set of articles linking to it
        linkhash = {}

        for filename in linkchunk:
            with open(filename, 'r') as f:
                newstuff = shared.load(f)
            #Add link info to linkhash
            for target, sources in newstuff.iteritems():
                try:
                    linkhash[target].update(set(sources))
                except KeyError:
                    linkhash[target] = set(sources)

            #Log status
            linkfiles_read += 1
            log("Read " + filename + " - " +
                str(100 * linkfiles_read / len(linkfiles))[:4] +
                " % of link data.")

        log("Chunk finished - updating content files")
        #Update concept with newly read link data
        contentfiles = glob.glob(temp_dir + '*' + extensions['content'])
        contentfiles_read = 0
        for filename in contentfiles:
            #Read file. Content is like {'article title' : {'text' : blah}}
            with open(filename, 'r') as f:
                content = shared.load(f)

            #Search linkhash for links going TO concept
            for concept in content.keys():
                try:
                    sources = linkhash[concept]
                except KeyError:
                    sources = set([])  #Missing key => zero incoming links

                #Update link info for concept
                try:
                    content[concept]['links_in'] = set(
                        content[concept]['links_in'])
                    content[concept]['links_in'].update(sources)
                except KeyError:
                    content[concept]['links_in'] = sources

            #Save updated content
            with open(filename, 'w') as f:
                shared.dump(content, f)

            contentfiles_read += 1
            if contentfiles_read % 100 == 0:
                log("Fixed " +
                    str(100 * contentfiles_read / len(contentfiles))[:4] +
                    "% of content files")
        pass  #Proceed to next link chunk


#==============================================================================
#     Finished link processing
#     Remove unworthy concepts and combine concept/word lists.
#==============================================================================

    #What, you think memory grows on trees?
    del linkhash
    gc.collect()

    #Set of all approved concepts
    concept_list = set([])

    #Purge inferior concepts (with insufficient incoming links)
    for filename in contentfiles:
        #Read in content file
        with open(filename, 'r') as f:
            content = shared.load(f)

        for concept in content.keys():
            entry = content[concept]
            if 'links_in' in entry and len(entry['links_in']) >= min_links_in:
                concept_list.add(concept)
            else:
                del content[concept]

        with open(filename, 'w') as f:
            shared.dump(content, f)

    log("Links done - saving index files")

    #Make sure output dir exists
    if not os.path.exists(matrix_dir):
        os.makedirs(matrix_dir)

    #Generate and save a concept index map. Structure: {concept : index}
    concept_indices = {n: m for m, n in enumerate(concept_list)}
    with open(matrix_dir + 'concept2index.ind', 'w') as f:
        shared.dump(concept_indices, f)

    #Read in all wordlists and combine them.
    words = set([])
    for filename in glob.glob(temp_dir + '*' + extensions['words']):
        with open(filename, 'r') as f:
            words.update(shared.load(f))

    #Generate and save a word index map. Structure: {word : index}
    word_indices = {n: m for m, n in enumerate(words)}
    with open(matrix_dir + 'word2index.ind', 'w') as f:
        shared.dump(word_indices, f)

    log("Wrapping up.")
    #Attempt to notify that job is done
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except Exception:
            log("Job's done. Push failed.")

    logfile.close()
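
main() also depends on a listchopper() helper that is not shown here; from its use above it apparently splits the list of link files into smaller chunks so that only part of the link data has to sit in memory at a time. A minimal sketch under that assumption (the chunk size is made up, not the project's actual value):

def listchopper(items, chunk_size=500):
    '''Yield successive chunk_size slices of items (the default size is assumed)'''
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]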