def modified_process(filename):
    '''
    Serial processing of abstracts, no topic modeling.

    Loads the abstracts from `filename` (using the stop words listed in
    'stopwords.txt'), builds the dictionary, restricts every abstract's
    'cleantext' to dictionary words, and stores the bag-of-words, bigram
    and tf-idf representations on each abstract.

    Returns the list of processed abstracts.
    '''
    abstracts = []
    dictionary = []

    # load stop words, one per line
    stops = set()
    stop_file = 'stopwords.txt'
    with open(stop_file, 'rU') as stopFile:
        for row in stopFile:
            stops.add(row.replace('\n', ''))

    # Process.load fills `abstracts` in place
    dictlist = Process.load(filename, abstracts, stops)

    # create dictionary (fills `dictionary` in place)
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    # Membership is tested once per word of every abstract, so use a set
    # instead of scanning the dictionary list each time. The list itself
    # is kept untouched for the Process helpers below.
    # (assumes dictionary entries are hashable, i.e. plain words)
    vocabulary = set(dictionary)
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext')
                   if word in vocabulary]
        abstract.Set('cleantext', abstext)

    dictlength = len(dictionary)
    bigramdict = []
    # number of abstracts in which each term / bigram occurs
    termbow = defaultdict(float)
    termbigram = defaultdict(float)

    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow:
            termbow[ind] += 1.0

        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary,
                                                   bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram:
            termbigram[pair] += 1.0

    # create dict of tfidf
    # NOTE(review): the bigram count is handed to the 'bow' pass and omitted
    # from the 'bigram' pass -- kept as in the original; confirm against
    # Process.serial_tfidf's signature.
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    return abstracts
with open(stop_file, 'rU') as stopFile: for row in stopFile.readlines(): stops.add(row.replace('\n', '')) for r in range(1,size): comm.send(stops, dest=r) print "Timing load time ..." ploadstart = MPI.Wtime() abstracts, dictlist = Process.master_load(comm, filename) ploadend = MPI.Wtime() print "Timing dictionary creation time ..." pdictstart = MPI.Wtime() # Create dictionary #print "Creating dictionary ..." Process.create_dict(dictlist, dictionary) # send dictionary everywhere for r in range(1,size): comm.send(dictionary, dest=r) pdictend = MPI.Wtime() print "Timing text cleaning time ..." pcleanstart = MPI.Wtime() Process.master_cleantext(comm, abstracts) pcleanend = MPI.Wtime() print "Timing abstract send time ..." pabsstart = MPI.Wtime() # send abstracts to all slaves for r in range(1,size): comm.send(abstracts, dest=r)
def process(filename):
    '''
    Serial processing of abstracts, for evaluation purposes.

    Runs the same loading, dictionary, bag-of-words, bigram and tf-idf
    steps as modified_process, then fits LSA and LDA topic models on the
    cleaned text and stores each abstract's topic representation under
    the 'lsa', 'lda' and 'numtopics' keys.

    Side effects: writes 'abstracts.dict' and 'abstracts.mm' to the
    current directory.

    Returns the list of processed abstracts.
    '''
    # Everything up to and including tf-idf is exactly what
    # modified_process does, so reuse it rather than keeping a second copy.
    abstracts = modified_process(filename)

    ##### TOPICS
    # prepare dictionary and corpora for topic modeling
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    dictionary = corpora.Dictionary(docs)
    dictionary.save('abstracts.dict')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # load lsa and lda models
    numtopics = 10  # this can be adjusted
    lsaModel = Lsa.serial(corpus_tfidf, dictionary, numtopics)
    ldaModel = Lda.serial(corpus_tfidf, dictionary, numtopics,
                          alpha=50.0/numtopics, eta=2.0/numtopics)

    # store lda and lsa representation in all abstracts
    # corpus[i] was built from abstracts[i], so the two lists line up.
    for abstract, doc_bow in zip(abstracts, corpus):
        # transform the document once and feed it to both models
        doc_tfidf = tfidf[doc_bow]

        # topics missing from a model's output read back as 0.0
        lsaVector = defaultdict(float)
        for topic, weight in lsaModel[doc_tfidf]:
            lsaVector[topic] = weight

        ldaVector = defaultdict(float)
        for topic, weight in ldaModel[doc_tfidf]:
            ldaVector[topic] = weight

        abstract.Set('lsa', lsaVector)
        abstract.Set('lda', ldaVector)
        abstract.Set('numtopics', numtopics)

    return abstracts