def main_mpi(comm, filename):
    '''
    Load and process abstracts (parallel MPI scatter-gather implementation).

    Every rank in ``comm`` must call this function. Rank 0 reads the stop
    words, loads the abstracts through ``master_load`` (the other ranks
    serve its row requests), builds the dictionary, and scatters the
    abstracts. Each rank then cleans its share, computes bag-of-words,
    bigram and TF-IDF data, and the results are gathered on rank 0 for
    topic modeling.

    comm     -- MPI communicator shared by all ranks
    filename -- abstracts file, passed unchanged to ``master_load``

    Returns the list of all processed abstracts on rank 0 and None on
    every other rank.
    '''
    # initialize variables
    rank = comm.Get_rank()
    size = comm.Get_size()
    status = MPI.Status()
    dictlist = defaultdict(int)
    stops = set()

    if rank == 0:
        print("scatter-gather parallel version")
        # load stop words
        print("Loading stop words ...")
        stop_file = 'stopwords.txt'
        with open(stop_file, 'rU') as stopFile:
            for row in stopFile:
                stops.add(row.replace('\n', ''))
    stops = comm.bcast(stops, root=0)

    abstracts = []
    if rank == 0:
        print("Loading abstracts using master-slave ...")
        abstracts, dictlist = master_load(comm, filename)
    else:
        # Load abstracts: serve rows from rank 0 until a falsy message arrives
        while True:
            row = comm.recv(source=0, status=status)
            if not row:
                break
            # create Abstract object together with its word counts
            wordcounts = defaultdict(int)
            loaded = load_abs(row, wordcounts, stops)
            # send abstract back to master
            comm.send((loaded, wordcounts), dest=0)

    dictionary = []
    if rank == 0:
        # Create dictionary
        print("Creating dictionary ...")
        create_dict(dictlist, dictionary)
    dictionary = comm.bcast(dictionary, root=0)

    abssend = []
    if rank == 0:
        print("Timing abstract send time ...")
        pabsstart = MPI.Wtime()
        # Ceiling division so that `size` chunks cover every abstract; the
        # last chunk takes whatever remains (it may be shorter or empty).
        numabs = len(abstracts) // size
        if len(abstracts) % size != 0:
            numabs += 1
        abssend = [abstracts[i * numabs:(i + 1) * numabs]
                   for i in range(size - 1)]
        abssend.append(abstracts[(size - 1) * numabs:])
    abstracts = comm.scatter(abssend, root=0)

    if rank == 0:
        pabsend = MPI.Wtime()
        print("Send abstract time: %f secs" % (pabsend - pabsstart))
        pcleanstart = MPI.Wtime()

    # clean text of words not in dictionary; the set is built once so each
    # membership test does not rescan the whole dictionary list
    known_words = set(dictionary)
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext')
                   if word in known_words]
        abstract.Set('cleantext', abstext)

    if rank == 0:
        pcleanend = MPI.Wtime()
        print("Clean text time: %f secs" % (pcleanend - pcleanstart))
        pfreqstart = MPI.Wtime()

    dictlength = len(dictionary)
    bigramdict = []
    termbowpart = defaultdict(float)
    termbigrampart = defaultdict(float)
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        # one count per local abstract that contains the term
        for ind in bow.keys():
            termbowpart[ind] += 1.0
        # create dict of bigram frequency
        bigram, bigramdict = create_bigram(abstract, dictionary, bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram.keys():
            termbigrampart[pair] += 1.0

    # Combine the per-rank document counts on rank 0, then share the totals.
    # The counts are summed; adding 1.0 per rank would only record how many
    # ranks saw the term, not how many abstracts contain it.
    termbowgather = comm.gather(termbowpart, root=0)
    termbow = defaultdict(float)
    if rank == 0:
        termbow = _sum_partial_term_counts(termbowgather)
    termbow = comm.bcast(termbow, root=0)

    termbigramgather = comm.gather(termbigrampart, root=0)
    termbigram = defaultdict(float)
    if rank == 0:
        termbigram = _sum_partial_term_counts(termbigramgather)
    termbigram = comm.bcast(termbigram, root=0)

    if rank == 0:
        pfreqend = MPI.Wtime()
        print("Frequency + Send abs, terms time: %f secs" % (pfreqend - pfreqstart))
        ptfidfstart = MPI.Wtime()

    # create dict of tfidf
    # NOTE(review): the 'bow' call passes len(bigramdict) as a fourth argument
    # while the 'bigram' call passes none -- confirm against serial_tfidf.
    serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    serial_tfidf(abstracts, 'bigram', termbigram)

    if rank == 0:
        ptfidfend = MPI.Wtime()
        print("TF-IDF time: %f secs" % (ptfidfend - ptfidfstart))

    # gather abstracts
    abstracts = comm.gather(abstracts, root=0)

    # master-slave for topic modeling
    allabs = []
    if rank == 0:
        for chunk in abstracts:
            allabs.extend(chunk)
        # NOTE(review): numtopics is not defined in this function; presumably
        # a module-level name -- confirm.
        master_topics(comm, allabs, numtopics)
    else:
        ##### TOPICS
        # topic modeling init
        dictionary = None
        # get message to begin working
        init = comm.recv(source=0)
        if init == 42:
            dictionary = corpora.Dictionary.load('abstracts.dict')
            comm.send(43, dest=0)
        Lsa.slave(comm, dictionary)
        Lda.slave(comm, dictionary)

    if rank == 0:
        return allabs


def _sum_partial_term_counts(partials):
    ''' Merge per-rank term -> count mappings into one by summing the counts '''
    totals = defaultdict(float)
    for part in partials:
        for key, count in part.items():
            totals[key] += count
    return totals
def slave(comm):
    '''
    Slave function for MPI implementation for loading and processing abstracts.

    Serves rank 0 through a fixed sequence of phases, each ended by a
    sentinel message from the master: load abstracts, clean their text,
    compute bag-of-words and bigrams, compute TF-IDF, then hand over to
    the LSA and LDA topic-modeling slaves.

    comm -- MPI communicator; every message is exchanged with rank 0
    '''
    status = MPI.Status()
    stops = comm.recv(source=0)

    # Load abstracts
    while True:
        # get message
        row = comm.recv(source=0, status=status)
        # end if done
        if not row:
            break
        # create Abstract object
        dictlist = defaultdict(int)
        loaded = load_abs(row, dictlist, stops)
        # send abstract back to master
        comm.send((loaded, dictlist), dest=0)

    dictionary = comm.recv(source=0)
    # Built once so the clean loop does not rescan the dictionary per word;
    # the original container is still handed to the helpers below.
    known_words = set(dictionary)

    # clean abstracts
    while True:
        # get message
        abstext = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        # end if done
        # NOTE(review): any falsy message ends this phase, so an abstract with
        # empty text would too -- confirm the master never sends one.
        if not abstext:
            break
        # clean text
        abstext = [word for word in abstext if word in known_words]
        # send abstract back to master, echoing the tag of the request
        comm.send(abstext, dest=0, tag=status.Get_tag())

    abstracts = comm.recv(source=0)

    # Find bag of words and bigram
    while True:
        # get message
        absind = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        # end if done; index 0 is a valid job, so only other falsy values stop
        if absind != 0 and not absind:
            break
        # find bag of words
        bow = create_bagofwords(abstracts[absind], dictionary)
        # find bigram, starting from an empty bigram list for each abstract
        bigramdict = []
        bigram, bigramdict = create_bigram(abstracts[absind], dictionary, bigramdict)
        # send bow and bigram back to master
        comm.send((bow, bigram, bigramdict), dest=0, tag=absind)

    abstracts, termbow, termbigram = comm.recv(source=0)
    numabs = len(abstracts)

    # TF-IDF
    while True:
        # get message
        absind = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        # end if done; index 0 is a valid job, so only other falsy values stop
        if absind != 0 and not absind:
            break
        # find TF-IDF
        tfidfbow = create_tfidf(abstracts[absind], termbow, numabs, 'bow')
        tfidfbigram = create_tfidf(abstracts[absind], termbigram, numabs, 'bigram')
        # send TF-IDF results back to master, echoing the tag of the request
        comm.send((tfidfbow, tfidfbigram), dest=0, tag=status.Get_tag())

    ##### TOPICS
    # topic modeling init
    dictionary = None
    # get message to begin working
    init = comm.recv(source=0)
    if init == 42:
        dictionary = corpora.Dictionary.load('abstracts.dict')
        comm.send(43, dest=0)
    Lsa.slave(comm, dictionary)
    Lda.slave(comm, dictionary)
    return