# Example 1
# 0
def _split_for_scatter(items, parts):
    '''
    Cut items into exactly `parts` contiguous slices for comm.scatter.

    The slice length is ceil(len(items) / parts); the last slice takes
    whatever is left, so trailing slices may be shorter or empty.
    '''
    # floor division keeps the chunk size an int on Python 2 and 3 alike
    chunk = len(items) // parts
    if len(items) % parts != 0:
        chunk += 1
    pieces = [items[i * chunk:(i + 1) * chunk] for i in range(parts - 1)]
    pieces.append(items[(parts - 1) * chunk:])
    return pieces


def _sum_partial_counts(partials):
    '''
    Merge per-rank count dicts into one dict by summing the counts per key.
    '''
    total = defaultdict(float)
    for part in partials:
        for key, count in part.items():
            # add the rank's own count; adding 1.0 here would only count
            # how many ranks saw the key, not how many abstracts did
            total[key] += count
    return total


def main_mpi(comm, filename):
    '''
    Load and process abstracts (parallel MPI scatter-gather implementation).

    Loading and topic modeling still run as master/slave exchanges with
    rank 0; text cleaning, bag-of-words/bigram counting and TF-IDF run on
    slices of the abstracts distributed with scatter and collected with
    gather.

    comm     -- MPI communicator; every rank in it must call this function
    filename -- passed to master_load on rank 0

    Returns the list of all processed abstracts on rank 0 and None on
    every other rank.
    '''
    # initialize variables
    rank = comm.Get_rank()
    size = comm.Get_size()
    status = MPI.Status()

    dictlist = defaultdict(int)
    stops = set()
    if rank == 0:
        print("scatter-gather parallel version")
        # load stop words
        print("Loading stop words ...")
        stop_file = 'stopwords.txt'
        with open(stop_file, 'rU') as stopFile:
            for row in stopFile:
                stops.add(row.replace('\n', ''))

    stops = comm.bcast(stops, root=0)

    abstracts = []
    if rank == 0:
        print("Loading abstracts using master-slave ...")
        abstracts, dictlist = master_load(comm, filename)
    else:
        # Load abstracts: serve rows sent by rank 0 until a falsy message
        while True:
            # get message
            row = comm.recv(source=0, status=status)

            # end if done
            if not row:
                break

            # create Abstract object
            wordcounts = defaultdict(int)
            loaded = load_abs(row, wordcounts, stops)

            # send abstract back to master
            comm.send((loaded, wordcounts), dest=0)

    dictionary = []
    if rank == 0:
        # Create dictionary
        print("Creating dictionary ...")
        create_dict(dictlist, dictionary)

    dictionary = comm.bcast(dictionary, root=0)

    abssend = []
    if rank == 0:
        print("Timing abstract send time ...")
        pabsstart = MPI.Wtime()
        # scatter needs exactly one slice per rank
        abssend = _split_for_scatter(abstracts, size)

    # from here on `abstracts` is this rank's slice only
    abstracts = comm.scatter(abssend, root=0)
    if rank == 0:
        pabsend = MPI.Wtime()
        print("Send abstract time: %f secs" % (pabsend - pabsstart))

        pcleanstart = MPI.Wtime()

    # clean text of words not in dictionary
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext') if word in dictionary]
        abstract.Set('cleantext', abstext)

    if rank == 0:
        pcleanend = MPI.Wtime()
        print("Clean text time: %f secs" % (pcleanend - pcleanstart))
        pfreqstart = MPI.Wtime()

    dictlength = len(dictionary)
    bigramdict = []
    # per-rank document frequencies: number of local abstracts per term/pair
    termbowpart = defaultdict(float)
    termbigrampart = defaultdict(float)
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow.keys():
            termbowpart[ind] += 1.0
        # create dict of bigram frequency
        bigram, bigramdict = create_bigram(abstract, dictionary, bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram.keys():
            termbigrampart[pair] += 1.0

    # combine the per-rank document frequencies on rank 0, then share them
    termbowgather = comm.gather(termbowpart, root=0)
    termbow = defaultdict(float)
    if rank == 0:
        termbow = _sum_partial_counts(termbowgather)
    termbow = comm.bcast(termbow, root=0)

    termbigramgather = comm.gather(termbigrampart, root=0)
    termbigram = defaultdict(float)
    if rank == 0:
        termbigram = _sum_partial_counts(termbigramgather)
    termbigram = comm.bcast(termbigram, root=0)

    if rank == 0:
        pfreqend = MPI.Wtime()
        print("Frequency + Send abs, terms time: %f secs" % (pfreqend - pfreqstart))
        ptfidfstart = MPI.Wtime()

    # create dict of tfidf
    # NOTE(review): the 'bow' call passes len(bigramdict) as a 4th argument
    # while the 'bigram' call passes none -- confirm against serial_tfidf.
    serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    serial_tfidf(abstracts, 'bigram', termbigram)

    if rank == 0:
        ptfidfend = MPI.Wtime()
        print("TF-IDF time: %f secs" % (ptfidfend - ptfidfstart))

    # gather abstracts (a list of per-rank slices on rank 0)
    abstracts = comm.gather(abstracts, root=0)

    # master-slave for topic modeling
    allabs = []
    if rank == 0:
        for i in abstracts:
            allabs.extend(i)
        # NOTE(review): numtopics is not defined in this function; it must
        # come from module scope -- confirm it exists there.
        master_topics(comm, allabs, numtopics)
    else:
        ##### TOPICS
        # topic modeling init
        dictionary = None

        # get message to begin working
        init = comm.recv(source=0)
        if init == 42:
            dictionary = corpora.Dictionary.load('abstracts.dict')
            comm.send(43, dest=0)
        Lsa.slave(comm, dictionary)
        Lda.slave(comm, dictionary)

    if rank == 0:
        return allabs
# Example 2
# 0
def slave(comm):
    '''
    Worker loop for the master/slave MPI implementation of abstract
    loading and processing.

    Every phase is driven by messages received from rank 0 and runs until
    an end-of-phase message arrives: load abstracts, clean text against the
    dictionary, build bag of words and bigrams, compute TF-IDF, and finally
    hand over to the LSA and LDA workers.
    '''
    status = MPI.Status()

    def phase_over(message):
        # index 0 is real work; any other falsy message closes the phase
        return message != 0 and not message

    stops = comm.recv(source=0)

    # Phase 1: turn each received row into an abstract until a falsy row
    row = comm.recv(source=0, status=status)
    while row:
        counts = defaultdict(int)
        loaded = load_abs(row, counts, stops)
        # return the abstract together with the counts filled by load_abs
        comm.send((loaded, counts), dest=0)
        row = comm.recv(source=0, status=status)

    dictionary = comm.recv(source=0)

    # Phase 2: keep only the words that appear in the dictionary
    words = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
    while words:
        kept = [token for token in words if token in dictionary]
        # reply under the tag of the request that was just received
        comm.send(kept, dest=0, tag=status.Get_tag())
        words = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)

    abstracts = comm.recv(source=0)

    # Phase 3: bag of words and bigram for each requested abstract index
    index = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
    while not phase_over(index):
        current = abstracts[index]
        bow = create_bagofwords(current, dictionary)
        # each abstract starts from a fresh, empty bigram dictionary
        bigram, bigramdict = create_bigram(current, dictionary, [])
        # this reply is tagged with the abstract index itself
        comm.send((bow, bigram, bigramdict), dest=0, tag=index)
        index = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)

    abstracts, termbow, termbigram = comm.recv(source=0)
    numabs = len(abstracts)

    # Phase 4: TF-IDF for each requested abstract index
    index = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
    while not phase_over(index):
        current = abstracts[index]
        tfidfbow = create_tfidf(current, termbow, numabs, 'bow')
        tfidfbigram = create_tfidf(current, termbigram, numabs, 'bigram')
        comm.send((tfidfbow, tfidfbigram), dest=0, tag=status.Get_tag())
        index = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)

    ##### TOPICS
    # the saved dictionary is loaded only when rank 0 sends the 42 signal,
    # which is acknowledged with 43; otherwise None is passed on
    topic_dictionary = None
    if comm.recv(source=0) == 42:
        topic_dictionary = corpora.Dictionary.load('abstracts.dict')
        comm.send(43, dest=0)
    Lsa.slave(comm, topic_dictionary)
    Lda.slave(comm, topic_dictionary)