def modified_process(filename):
    '''
    Serial processing of abstracts, no topic modeling.

    Reads the stop words from 'stopwords.txt', loads the abstracts through
    Process.load, builds the dictionary with Process.create_dict, removes
    from every abstract's 'cleantext' the words that are not in the
    dictionary, and then stores the bag-of-words, bigram and tf-idf data on
    each abstract.

    :param filename: handed unchanged to Process.load.
    :returns: the list of abstracts filled in by Process.load; each abstract
        has had 'cleantext', 'bow', 'bownum' and 'bigram' set on it, and has
        been passed through Process.serial_tfidf for 'bow' and 'bigram'.
    '''
    abstracts = []
    dictionary = []

    # load stop words
    stops = set()
    stop_file = 'stopwords.txt'
    # Plain 'r' rather than 'rU': the 'U' flag is deprecated and rejected
    # outright by Python 3.11+. '\r' is stripped as well so the result is the
    # same on interpreters where 'r' does not translate '\r\n' line endings.
    with open(stop_file, 'r') as stop_handle:
        for row in stop_handle:
            stops.add(row.replace('\r', '').replace('\n', ''))

    dictlist = Process.load(filename, abstracts, stops)

    # create dictionary
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    # The dictionary is a list; build the lookup set once so each membership
    # test is O(1) instead of a scan of the whole list.
    # (assumes dictionary entries are hashable words -- TODO confirm)
    known_words = set(dictionary)
    for abstract in abstracts:
        kept = [word for word in abstract.Get('cleantext')
                if word in known_words]
        abstract.Set('cleantext', kept)

    dictlength = len(dictionary)
    bigramdict = []
    # number of abstracts in which each term / bigram occurs
    termbow = defaultdict(float)
    termbigram = defaultdict(float)

    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow:
            termbow[ind] += 1.0

        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary,
                                                   bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram:
            termbigram[pair] += 1.0

    # create dict of tfidf
    # NOTE(review): len(bigramdict) accompanies the 'bow' call and not the
    # 'bigram' call, exactly as before -- confirm against
    # Process.serial_tfidf that this is intended.
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    return abstracts
from queue import Empty, Queue
from threading import Thread

import numpy as np

import report
import process

images = process.load("data")
# Debugging aid: restrict the run to a handful of known files.
#images = list(filter(lambda i: i[2] == "22276.png" or i[2] == "10635.png" or i[2] == "15055.png" or i[2] == "input_7_1.png" or i[2] == "input_2404_1.png" ,images))
# Drop every entry whose first field is "unknown".
images = [image for image in images if image[0] != "unknown"]

classified = []   # results of process.classify, appended by the workers
threads = []      # the started worker threads
q = Queue(maxsize=0)
num_theads = 20


def thread(_q):
    """Worker loop: classify items taken from ``_q`` until it is empty.

    Each result of ``process.classify`` is appended to the module-level
    ``classified`` list, so the order of results follows completion order,
    not queue order.

    :param _q: queue of items to pass to ``process.classify``.
    """
    while True:
        # A non-blocking get replaces the former ``empty()`` check followed
        # by a blocking ``get()``: another worker could take the last item
        # between those two calls and leave this worker blocked forever.
        try:
            image = _q.get_nowait()
        except Empty:
            return
        try:
            classified.append(process.classify(image))
        finally:
            # Always account for the item, even when classification raises,
            # so a ``join()`` on the queue cannot hang on a lost task.
            _q.task_done()


# Fill the queue completely before any worker starts, so a worker that finds
# the queue empty knows there is no more work coming.
for i in images:
    q.put(i)

for _ in range(num_theads):
    t = Thread(target=thread, args=(q, ))
    t.start()
    threads.append(t)
def process(filename):
    '''
    Serial processing of abstracts, for evaluation purposes.

    Reads the stop words from 'stopwords.txt', loads the abstracts through
    Process.load, builds the dictionary with Process.create_dict, removes
    from every abstract's 'cleantext' the words that are not in the
    dictionary, stores the bag-of-words, bigram and tf-idf data on each
    abstract, and finally attaches LSA and LDA topic vectors.

    Side effects: writes 'abstracts.dict' and 'abstracts.mm' to the current
    directory.

    :param filename: handed unchanged to Process.load.
    :returns: the list of abstracts filled in by Process.load; each abstract
        has had 'cleantext', 'bow', 'bownum', 'bigram', 'lsa', 'lda' and
        'numtopics' set on it.
    '''
    abstracts = []
    dictionary = []

    # load stop words
    stops = set()
    stop_file = 'stopwords.txt'
    # Plain 'r' rather than 'rU': the 'U' flag is deprecated and rejected
    # outright by Python 3.11+. '\r' is stripped as well so the result is the
    # same on interpreters where 'r' does not translate '\r\n' line endings.
    with open(stop_file, 'r') as stop_handle:
        for row in stop_handle:
            stops.add(row.replace('\r', '').replace('\n', ''))

    dictlist = Process.load(filename, abstracts, stops)

    # create dictionary
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    # The dictionary is a list; build the lookup set once so each membership
    # test is O(1) instead of a scan of the whole list.
    # (assumes dictionary entries are hashable words -- TODO confirm)
    known_words = set(dictionary)
    for abstract in abstracts:
        kept = [word for word in abstract.Get('cleantext')
                if word in known_words]
        abstract.Set('cleantext', kept)

    dictlength = len(dictionary)
    bigramdict = []
    # number of abstracts in which each term / bigram occurs
    termbow = defaultdict(float)
    termbigram = defaultdict(float)

    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow:
            termbow[ind] += 1.0

        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary,
                                                   bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram:
            termbigram[pair] += 1.0

    # create dict of tfidf
    # NOTE(review): len(bigramdict) accompanies the 'bow' call and not the
    # 'bigram' call, exactly as before -- confirm against
    # Process.serial_tfidf that this is intended.
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    ##### TOPICS
    # prepare dictionary and corpora for topic modeling
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    # From here on the gensim dictionary takes over from the word list above.
    dictionary = corpora.Dictionary(docs)
    dictionary.save('abstracts.dict')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # load lsa and lda models
    numtopics = 10  # this can be adjusted
    lsaModel = Lsa.serial(corpus_tfidf, dictionary, numtopics)
    ldaModel = Lda.serial(corpus_tfidf, dictionary, numtopics,
                          alpha=50.0/numtopics, eta=2.0/numtopics)

    # store lda and lsa representation in all abstracts
    # corpus holds one entry per abstract, in the same order, so the two are
    # walked in lockstep (this also avoids the Python-2-only xrange).
    for abstract, doc_bow in zip(abstracts, corpus):
        weighted = tfidf[doc_bow]  # transformed once, used by both models
        lsaVector = defaultdict(float)
        for v in lsaModel[weighted]:
            lsaVector[v[0]] = v[1]
        ldaVector = defaultdict(float)
        for v in ldaModel[weighted]:
            ldaVector[v[0]] = v[1]
        abstract.Set('lsa', lsaVector)
        abstract.Set('lda', ldaVector)
        abstract.Set('numtopics', numtopics)

    return abstracts
# Serial testing if rank == 0: print "Serial testing ..." abstracts = [] dictionary = [] # load stop words stops = set() stop_file = 'stopwords.txt' with open(stop_file, 'rU') as stopFile: for row in stopFile.readlines(): stops.add(row.replace('\n', '')) sloadstart = time.time() dictlist = Process.load(filename, abstracts, stops) sloadend = time.time() # create dictionary sdictstart = time.time() Process.create_dict(dictlist, dictionary) sdictend = time.time() # clean text of words not in dictionary scleanstart = time.time() for abstract in abstracts: abstext = [word for word in abstract.Get('cleantext') if word in dictionary] abstract.Set('cleantext', abstext) scleanend = time.time() sfreqstart = time.time()
num_clu[blg[i]] += 1 cent = np.zeros([k, dim], np.float32) for i in xrange(n): cent[blg[i]] = cent[blg[i]] + data[i] / num_clu[blg[i]] for _k in xrange(k): if num_clu[_k] == 0: cent[_k] = np.asarray([random_sample(dim)], np.float32) return blg if __name__ == '__main__': data, f = load('sample/') n = data.shape[0] print f k = 5 res = kmeans(data, k=k, itr=10) os.system('rm -r res/*') for i in xrange(k): os.system('mkdir res/' + str(i)) for i in xrange(n): os.system('cp sample/' + f[i] + ' res/' + str(res[i])) print res