def extract_features(qfile="question_train.csv",
                     qcatfile="question_category_train.csv",
                     catfile="category.csv",
                     subcats=False,
                     outfile="features.npz"):
    # loading the categories
    cats = categorie_class.categories()
    # initializing corpus
    corp = corpus.corpus(cats)
    # loading questions into corpus
    corp.load(qfile, qcatfile)
    # running filters on the raw questions
    sentence_filters = [filters.punctuation_filter]
    word_filters = [filters.small_word_filter,
                    filters.stopword_filter,
                    filters.stemming_filter]
    corp.process(sentence_filters, word_filters)
    # saving corpus into pickle
    # pickle.dump(corp, "corpus.pkl")

    # selecting the term-space
    term_space = ig_based_non_uniform(corp, M=2500, read_from_file=False)
    d = len(term_space)

    # create mapping from term names to feature ids and the inverse
    term_to_feature = {}
    feature_to_term = {}
    for i, term in enumerate(term_space):
        term_to_feature[term] = i
        feature_to_term[i] = term

    # creating feature and label arrays
    n = len(corp.tr_set)
    features = np.zeros((d, n))
    categoryids = np.zeros(n)

    # we define new ids for the parent categories, which will be coherent
    # with the ones assigned in categoryids
    number_of_cats = len(corp.cats.all_names())
    new_category_ids = {c: i for i, c in enumerate(corp.cats.all_names())}

    for j, q in enumerate(corp.tr_set):
        fe = simple_features(term_space, q["words"])
        for term, value in fe.items():
            i = term_to_feature[term]
            features[i, j] = value
        categoryids[j] = new_category_ids[q["category"]]

    categories = {i: c for c, i in new_category_ids.items()}
    featurenames = [feature_to_term[i] for i in range(d)]

    np.savez(outfile,
             features=features,
             featurenames=featurenames,
             categoryids=categoryids,
             categories=categories)
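# The sketch below is not part of the original script: it is a minimal,
# assumed example of reading back the archive written by extract_features()
# above. The key names come from the np.savez call; allow_pickle is only
# needed for the object-typed "categories" dictionary on recent numpy versions.
import numpy as np

def load_features(outfile="features.npz"):
    data = np.load(outfile, allow_pickle=True)
    features = data["features"]              # (d, n) term-by-question matrix
    featurenames = data["featurenames"]      # length-d array of terms
    categoryids = data["categoryids"]        # (n,) category id per question
    categories = data["categories"].item()   # id -> category-name mapping
    return features, featurenames, categoryids, categories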
def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    # D is set below
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    L = int(sys.argv[6])
    S = int(sys.argv[7])  # batchsize

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[8]).readlines()

    if L == 1:
        start = 0
        repeat = 30
        recset = set(map(int, n.logspace(0, 3.35, num=100, base=10.0).tolist()))
        modelset = list()
        for j in range(start, repeat):
            modelset.append(batchLDA(vocab, K, 2e3, alpha, eta, 0.01, kappa, L))
        for i in range(int(2e3)):
            for j in range(start, repeat):
                rand_ind = n.random.randint(int(2e3), size=S)
                wordids = [docs.docs[idx].words for idx in rand_ind]
                wordcts = [docs.docs[idx].counts for idx in rand_ind]
                modelset[j].update_lambda(wordids, wordcts)
            if i in recset:
                tmp = 0
                for j in range(start, repeat):
                    tmp += modelset[j]._grad
                grad = tmp / repeat
                n.savetxt('gradient-%d/gradient-%d-%d' % (L, 31, i), grad.T)  # for full sufficient statistics
    else:
        start = 0
        repeat = 1
        recset = set(map(int, n.logspace(0, 3.35, num=100, base=10.0).tolist()))
        for j in range(start, repeat):
            model = batchLDA(vocab, K, 2e3, alpha, eta, 0.01, kappa, L)
            for i in range(int(2e3)):
                # print(i)
                # rand_ind = n.random.randint(int(2e3), size=S)
                rand_ind = range(int(2e3))  # for full sufficient statistics
                wordids = [docs.docs[idx].words for idx in rand_ind]
                wordcts = [docs.docs[idx].counts for idx in rand_ind]
                # wordids = [d.words for d in docs.docs[(i*S):((i+1)*S)]]
                # wordcts = [d.counts for d in docs.docs[(i*S):((i+1)*S)]]
                model.update_lambda(wordids, wordcts)
                # n.savetxt('lambda-%d/lambda-%d-%d' % (L, L, i), model._lambda.T)
                if i in recset:
                    grad = model._grad  # the gradient-1 folder should pre-exist in the pwd
                    n.savetxt('gradient-%d/gradient-%d-%d' % (L, 31, i), grad.T)  # for full sufficient statistics
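# Illustrative sketch (not from the original script): the recording schedule
# used above keeps the integer parts of 100 log-spaced points between 10^0 and
# 10^3.35, so gradients are saved densely at early iterations and sparsely
# later. Assumes numpy is available (imported here as np).
import numpy as np

recset = set(map(int, np.logspace(0, 3.35, num=100, base=10.0).tolist()))
print(len(recset), min(recset), max(recset))  # number of distinct checkpoints and their range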
def train_apply_classifier(classifier='NaiveBayes',
                           qfile_train='question_train.csv',
                           qcatfile_train='question_category_train.csv',
                           catfile='category.csv',
                           qfile_test='question_test.csv',
                           subcats=False):
    """Performs parameter tuning via cross validation for the specified
    classifier. After the hyper-parameter(s) are selected, it returns the
    predicted labels for the given test set.

    The following 3 classifiers are known to the method:
    - "NaiveBayes" (default)
    - "LogisticRegression"
    - "RandomForest"
    """
    # initializing corpus
    corpus = corpus_class.corpus(categories.categories())
    corpus.load(qfile_train, qcatfile_train)
    filts = std_filters()
    corpus.process(corpus_size=-1, **filts)
    corpus.simple_split(0)
    # corpus = corpus_class.load_from_file()
    # corpus.simple_split(0)

    if classifier == 'NaiveBayes':
        clf_par = MultinomialNB_params(corpus)
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'LogisticRegression':
        clf_par = LogisticRegression_params()
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'RandomForest':
        clf_par = RandomForest_params(corpus)
        clf, feat_params = CV(corpus, *clf_par, n_folds=3,
                              skipping_rule=RF_skipping_rule)
    else:
        raise ValueError("The given classifier is not known to this method. "
                         "See the docstring for the supported classifiers.")

    # fitting on the entire training set
    corpus.simple_split(0)
    corpus.make_features(**feat_params)
    clf.fit(corpus.X_tr, corpus.y_tr)
    X_te = corpus.process_example(qfile_test)
    return clf.predict(X_te)
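# A minimal usage sketch (assumed, not part of the module): it relies on the
# default CSV files being present in the working directory and on one of the
# three classifier names listed in the docstring above.
if __name__ == "__main__":
    predictions = train_apply_classifier(classifier="LogisticRegression",
                                         qfile_train="question_train.csv",
                                         qcatfile_train="question_category_train.csv",
                                         qfile_test="question_test.csv")
    print(predictions[:10])  # predicted category ids for the first ten test questions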
def main():
    print("main started")
    with open('../data/movie-dialog-corpus/movie_lines.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        dialogues = []
        for row in reader:
            dialogues.append(row[-1])

    corp = corpus()
    corp.read_corpus(dialogues)
    lm = languageModel(corp)

    methods = ['greedy', 'sampling', 'beamSearch']
    for method in methods:
        print("Method: ", method)
        conversation = []
        sentence = []
        # no capital letters!
        fourgram = ['how', 'much', 'do', 'we']
        for i in fourgram:
            sentence.append(i)
        print(sentence)

        n = 1
        while n < 6:
            if fourgram[3] == "</s>":
                n = n + 1
                conversation.append(sentence)
                sentence = []
                nextword = lm.endofSentence
                sentence.append(nextword)
                print("New sentence: ", nextword)
                fourgram = [fourgram[1], fourgram[2], fourgram[3], nextword]
            else:
                if method == 'greedy':
                    nextword = lm.greedy(lm.score(fourgram))
                elif method == 'sampling':
                    nextword = lm.sampling(lm.score(fourgram))
                else:
                    nextword = lm.beamSearch(fourgram, lm.score(fourgram))
                sentence.append(nextword)
                print(nextword)
                fourgram = [fourgram[1], fourgram[2], fourgram[3], nextword]
        conversation.append(sentence)

        speaker = ["Speaker 1:", "Speaker 2:"]
        for q, sent in enumerate(conversation):
            speak(speaker[q % 2], sent)
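# Illustrative stand-in (an assumption, not the project's languageModel API):
# what the "sampling" decoding step could look like if lm.score(fourgram)
# returned a dict mapping candidate next words to (unnormalized) probabilities.
import numpy as np

def sample_next(score_dict):
    words = list(score_dict.keys())
    probs = np.array([score_dict[w] for w in words], dtype=float)
    probs = probs / probs.sum()  # normalize so the weights form a distribution
    return np.random.choice(words, p=probs)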
def main():
    # try:
    c = corpus()
    tfidf = TFIDF()
    tf_type = 'aug_freq'
    idf_type = 'inv_smooth_idf'
    for i, doc in enumerate(c.documents):
        cnt = 0
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf.tfidf(word, doc, c.documents, tf_type, idf_type)
                  for word in doc.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:10]:
            cnt += 1
            if score > 0:
                print("\tWord {}: {}, TF-IDF: {}".format(cnt, word, round(score, 5)))
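# The tf/idf variant names used above are not defined in this snippet. The
# functions below are a hedged sketch of the formulas those names usually
# denote (augmented term frequency and smoothed inverse document frequency);
# they are illustrative assumptions, not the actual TFIDF class API.
import math
from collections import Counter

def aug_freq(term, doc_words):
    # augmented frequency: 0.5 + 0.5 * f(t, d) / max_t' f(t', d)
    counts = Counter(doc_words)
    if not counts:
        return 0.0
    return 0.5 + 0.5 * counts[term] / max(counts.values())

def inv_smooth_idf(term, documents):
    # smoothed inverse document frequency: log(1 + N / df(t))
    df = sum(1 for d in documents if term in d)
    return math.log(1 + len(documents) / df) if df else 0.0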
def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    S = int(sys.argv[6])

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[7]).readlines()
    model = OnlineLDA(vocab, K, 100000, 0.1, 0.01, 1, 0.75)
    for i in range(1000):
        print(i)
        wordids = [d.words for d in docs.docs[(i * S):((i + 1) * S)]]
        wordcts = [d.counts for d in docs.docs[(i * S):((i + 1) * S)]]
        model.update_lambda(wordids, wordcts)
        n.savetxt('/tmp/lambda%d' % i, model._lambda.T)
print("\t-h, --help\t\t\t\t\t\t\tGet usage info") #print("\t-o file, --output=file\t\t\t\t\t\tOutput all information into file") print( "\t-s name1,name2,..nameN, --source=name1,name2,..nameN\t\tSpecifies the corpus source newspaper. Valid sources are RG, Novaya" ) print("\t\t\t\t\t\t\t\t\tWithout key all possible sources are used") print("") if __name__ == "__main__": input_period = [] output = [] newspaper = [] output_file = None get_options(a_period=input_period, a_output=output, a_source=newspaper) corp = corpus.corpus() corp.load('dumps/corp_multy-lemm.dump') data = corp.get_lemm( period=[int(input_period[0][0]), int(input_period[0][1])], sources=newspaper) help() while True: command = input('--> ') if command == "help": help() elif command[:len("collocation")] == "collocation": try: command.split(' ')[1] except:
print("\tcli.py -c first_year,last_year [options]") print("OPTIONS:") print("\t-c first_year,last_year, --corpus=first_year,last_year\t\tMandatory parameter. Certain time period shoud be specifyed") print("\t-h, --help\t\t\t\t\t\t\tGet usage info") #print("\t-o file, --output=file\t\t\t\t\t\tOutput all information into file") print("\t-s name1,name2,..nameN, --source=name1,name2,..nameN\t\tSpecifies the corpus source newspaper. Valid sources are RG, Novaya") print("\t\t\t\t\t\t\t\t\tWithout key all possible sources are used") print("") if __name__ == "__main__": input_period = [] output = [] newspaper = [] output_file = None get_options(a_period=input_period,a_output=output,a_source=newspaper) corp = corpus.corpus() corp.load('dumps/corp_multy-lemm.dump') data = corp.get_lemm(period=[int(input_period[0][0]), int(input_period[0][1])], sources=newspaper) help() while True: command = input('--> ') if command == "help": help() elif command[:len("collocation")] == "collocation": try: command.split(' ')[1] except: print ("ngram wasn't found") help() continue
def main():
    corp = corpus()

    # rg_98 = text_unit(dates=[1997, 1998, 1999], sources=['RG'])
    # rg_98.read(DATA_DIR, '98', 'rg' + DATA_EXT)
    # rg_98.get_lemm()
    # corp.add(rg_98)

    # rg_01 = text_unit(dates=[2000, 2001, 2002], sources=['RG'])
    # rg_01.read(DATA_DIR, '01', 'rg' + DATA_EXT)
    # rg_01.get_lemm()
    # corp.add(rg_01)

    # rg_04 = text_unit(dates=[2003, 2004, 2005], sources=['RG'])
    # rg_04.read(DATA_DIR, '04', 'rg' + DATA_EXT)
    # rg_04.get_lemm()
    # corp.add(rg_04)

    # rg_16 = text_unit(dates=[2016], sources=['RG'])
    # rg_16.read(DATA_DIR, '16', 'rg' + DATA_EXT)
    # rg_16.get_lemm()
    # corp.add(rg_16)

    # nv_98 = text_unit(dates=[1997, 1998, 1999], sources=['Novaya'])
    # nv_98.read(DATA_DIR, '98', 'nv' + DATA_EXT)
    # nv_98.get_lemm()
    # corp.add(nv_98)

    # nv_01 = text_unit(dates=[2000, 2001, 2002], sources=['Novaya'])
    # nv_01.read(DATA_DIR, '01', 'nv' + DATA_EXT)
    # nv_01.get_lemm()
    # corp.add(nv_01)

    # nv_04 = text_unit(dates=[2003, 2004, 2005], sources=['Novaya'])
    # nv_04.read(DATA_DIR, '04', 'nv' + DATA_EXT)
    # nv_04.get_lemm()
    # corp.add(nv_04)

    # nv_16 = text_unit(dates=[2016], sources=['Novaya'])
    # nv_16.read(DATA_DIR, '16', 'nv' + DATA_EXT)
    # nv_16.get_lemm()
    # corp.add(nv_16)

    # corp.dump(path='dumps/corp.dump')
    # corp.load('dumps/corp.dump')
    # corp.get_info()

    # lemmas = corp.get_lemm(sources='RG')
    # corp.add_stat(name='RG_lemmas', value=lemmas, descr='RG lemmas')
    # lemmas = corp.get_lemm(sources='Novaya')
    # corp.add_stat(name='Novaya_lemmas', value=lemmas, descr='Novaya lemmas')
    # lemmas = corp.get_lemm(period=[1997, 1999])
    # corp.add_stat(
    #     name='98_lemmas', value=lemmas, descr='Lemmas from 1997 till 1999')
    # lemmas = corp.get_lemm(period=[2000, 2002])
    # corp.add_stat(
    #     name='01_lemmas', value=lemmas, descr='Lemmas from 2000 till 2002')
    # lemmas = corp.get_lemm(period=[2003, 2005])
    # corp.add_stat(
    #     name='04_lemmas', value=lemmas, descr='Lemmas from 2003 till 2005')
    # lemmas = corp.get_lemm(period=[2016, 2016])
    # corp.add_stat(
    #     name='16_lemmas', value=lemmas, descr='Lemmas from 2016')

    # corp.get_lemm()
    # corp.dump('dumps/corp_multy-lemm.dump')

    corp.load('dumps/corp_multy-lemm.dump')
    corp.get_info()

    # lemm_rate = corp.get_lemm_freq()
    # for lemm, q in lemm_rate:
    #     print('{}: {} times'.format(lemm, q))

    lemm_rate = corp.get_lemm_freq(5)
    for lemm, q in lemm_rate:
        print('{}: {} times'.format(lemm, q))
def __init__(self, _fichier, param):
    self.corpus = corpus.corpus(_fichier).generer()
    self.model = w2v.word2vec(param)
            inp = Variable(torch.from_numpy(corpus.train_ids[idx:idx + 64])).cuda()
            tag = Variable(torch.from_numpy(corpus.train_tags[idx:idx + 64])).cuda()
            if inp.size(0) != batch_size:
                continue
            pred = model(inp, states)
            _, pred_idx = torch.max(pred, 1)
            loss = criterion(pred, tag)
            # clip parameter gradients
            dev_loss.append(loss.data[0])
            dev_acc.append(sum(pred_idx.cpu().data.numpy() == tag.cpu().data.numpy()) * 1. / tag.size(0))

        print("epoch :{},train mean loss:{},dev mean loss:{}".format(
            epoch, np.mean(train_loss), np.mean(dev_loss)))
        train_loss_p.append(np.mean(train_loss))
        dev_loss_p.append(np.mean(dev_loss))
        train_acc_p.append(np.mean(train_acc))
        dev_acc_p.append(np.mean(dev_acc))
        step_p.append(epoch)
        viz.line(
            X=np.column_stack((np.array(step_p), np.array(step_p),
                               np.array(step_p), np.array(step_p))),
            Y=np.column_stack((np.array(train_loss_p), np.array(train_acc_p),
                               np.array(dev_loss_p), np.array(dev_acc_p))),
            win=line,
            opts=dict(legend=["Train_mean_loss", "Train_acc",
                              "Eval_mean_loss", "Eval_acc"]))


if __name__ == '__main__':
    corpus = corpus("data/pos.csv", "data/neg.csv", "data/neutral.csv", "data/stop_words.csv")
    model = BILSTM(corpus.num_classes, corpus.vocab_size, hidden_size, num_layers)
    if cuda:
        model = model.cuda()
    train(corpus, model)
#         # Initialize the variational distribution q(theta|gamma) for
#         # the mini-batch
#         gamma = 1*n.random.gamma(100., 1./100., (batchD, self._K))
#         Elogtheta = dirichlet_expectation(gamma)
#         expElogtheta = n.exp(Elogtheta)

#         sstats = n.zeros(self._lambda.shape)
#         # Now, for each document d update that document's gamma and phi
#         it = 0
#         meanchange = 0
#         for d in range(0, batchD):
#             # These are mostly just shorthand (but might help cache locality)
#             ids = wordids[d]
#             cts = wordcts[d]
#             gammad = gamma[d, :]
#             Elogthetad = Elogtheta[d, :]
#             expElogthetad = expElogtheta[d, :]
#             expElogbetad = self._expElogbeta[:, ids]
#             # The optimal phi_{dwk} is proportional to
#             # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
#             phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100
#             # Iterate between gamma and phi until convergence
#             for it in range(0, 100):
#                 lastgamma = gammad
#                 # We represent phi implicitly to save memory and time.
#                 # Substituting the value of the optimal phi back into
#                 # the update for gamma gives this update. Cf. Lee&Seung 2001.
#                 gammad = self._alpha + expElogthetad * \
#                     n.dot(cts / phinorm, expElogbetad.T)
#                 Elogthetad = dirichlet_expectation(gammad)
#                 expElogthetad = n.exp(Elogthetad)
#                 phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100
#                 # If gamma hasn't changed much, we're done.
#                 meanchange = n.mean(abs(gammad - lastgamma))
#                 if (meanchange < meanchangethresh):
#                     break
#             gamma[d, :] = gammad
#             # Contribution of document d to the expected sufficient
#             # statistics for the M step.
#             sstats[:, ids] += n.outer(expElogthetad.T, cts/phinorm)

#         # This step finishes computing the sufficient statistics for the
#         # M step, so that
#         # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
#         #             = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
#         sstats = sstats * self._expElogbeta

#         return((gamma, sstats))

    def update_lambda_docs(self, docs):
        """
        First does an E step on the mini-batch given in wordids and
        wordcts, then uses the result of that E step to update the
        variational parameter matrix lambda.

        Arguments:
        docs:  List of D documents. Each document must be represented
               as a string. (Word order is unimportant.) Any
               words not in the vocabulary will be ignored.

        Returns gamma, the parameters to the variational distribution
        over the topic weights theta for the documents analyzed in this
        update.

        Also returns an estimate of the variational bound for the
        entire corpus for the OLD setting of lambda based on the
        documents passed in. This can be used as a (possibly very
        noisy) estimate of held-out likelihood.
        """

        # rhot will be between 0 and 1, and says how much to weight
        # the information we got from this mini-batch.
        rhot = pow(self._tau0 + self._updatect, -self._kappa)
        self._rhot = rhot
        # Do an E step to update gamma, phi | lambda for this
        # mini-batch. This also returns the information about phi that
        # we need to update lambda.
        (gamma, sstats) = self.do_e_step_docs(docs)
        # Estimate held-out likelihood for current values of lambda.
        bound = self.approx_bound_docs(docs, gamma)
        # Update lambda based on documents.
        self._lambda = self._lambda * (1 - rhot) + \
            rhot * (self._eta + self._D * sstats / len(docs))
        self._Elogbeta = dirichlet_expectation(self._lambda)
        self._expElogbeta = n.exp(self._Elogbeta)
        self._updatect += 1

        return (gamma, bound)

    def update_lambda(self, wordids, wordcts):
        """
        First does an E step on the mini-batch given in wordids and
        wordcts, then uses the result of that E step to update the
        variational parameter matrix lambda.

        Arguments:
        docs:  List of D documents. Each document must be represented
               as a string. (Word order is unimportant.) Any
               words not in the vocabulary will be ignored.

        Returns gamma, the parameters to the variational distribution
        over the topic weights theta for the documents analyzed in this
        update.

        Also returns an estimate of the variational bound for the
        entire corpus for the OLD setting of lambda based on the
        documents passed in. This can be used as a (possibly very
        noisy) estimate of held-out likelihood.
        """

        # rhot will be between 0 and 1, and says how much to weight
        # the information we got from this mini-batch.
        rhot = pow(self._tau0 + self._updatect, -self._kappa)
        self._rhot = rhot
        # Do an E step to update gamma, phi | lambda for this
        # mini-batch. This also returns the information about phi that
        # we need to update lambda.
        (gamma, sstats) = self.do_e_step(wordids, wordcts)
        # Estimate held-out likelihood for current values of lambda.
        bound = self.approx_bound(wordids, wordcts, gamma)
        # Update lambda based on documents.
        self._lambda = self._lambda * (1 - rhot) + \
            rhot * (self._eta + self._D * sstats / len(wordids))
        self._Elogbeta = dirichlet_expectation(self._lambda)
        self._expElogbeta = n.exp(self._Elogbeta)
        self._updatect += 1

        return (gamma, bound)

    def approx_bound(self, wordids, wordcts, gamma):
        """
        Estimates the variational bound over *all documents* using only
        the documents passed in as "docs." gamma is the set of parameters
        to the variational distribution q(theta) corresponding to the
        set of documents passed in.

        The output of this function is going to be noisy, but can be
        useful for assessing convergence.
        """

        # This is to handle the case where someone just hands us a single
        # document, not in a list.
        batchD = len(wordids)

        score = 0
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = n.exp(Elogtheta)

        # E[log p(docs | theta, beta)]
        for d in range(0, batchD):
            gammad = gamma[d, :]
            ids = wordids[d]
            cts = n.array(wordcts[d])
            phinorm = n.zeros(len(ids))
            for i in range(0, len(ids)):
                temp = Elogtheta[d, :] + self._Elogbeta[:, ids[i]]
                tmax = max(temp)
                phinorm[i] = n.log(sum(n.exp(temp - tmax))) + tmax
            score += n.sum(cts * phinorm)
#             oldphinorm = phinorm
#             phinorm = n.dot(expElogtheta[d, :], self._expElogbeta[:, ids])
#             print oldphinorm
#             print n.log(phinorm)
#             score += n.sum(cts * n.log(phinorm))

        # E[log p(theta | alpha) - log q(theta | gamma)]
        score += n.sum((self._alpha - gamma)*Elogtheta)
        score += n.sum(gammaln(gamma) - gammaln(self._alpha))
        score += sum(gammaln(self._alpha*self._K) - gammaln(n.sum(gamma, 1)))

        # Compensate for the subsampling of the population of documents
        score = score * self._D / len(wordids)

        # E[log p(beta | eta) - log q (beta | lambda)]
        score = score + n.sum((self._eta-self._lambda)*self._Elogbeta)
        score = score + n.sum(gammaln(self._lambda) - gammaln(self._eta))
        score = score + n.sum(gammaln(self._eta*self._W) -
                              gammaln(n.sum(self._lambda, 1)))

        return(score)

    def approx_bound_docs(self, docs, gamma):
        """
        Estimates the variational bound over *all documents* using only
        the documents passed in as "docs." gamma is the set of parameters
        to the variational distribution q(theta) corresponding to the
        set of documents passed in.

        The output of this function is going to be noisy, but can be
        useful for assessing convergence.
        """

        # This is to handle the case where someone just hands us a single
        # document, not in a list.
        if (type(docs).__name__ == 'str'):
            temp = list()
            temp.append(docs)
            docs = temp

        (wordids, wordcts) = parse_doc_list(docs, self._vocab)
        batchD = len(docs)

        score = 0
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = n.exp(Elogtheta)

        # E[log p(docs | theta, beta)]
        for d in range(0, batchD):
            gammad = gamma[d, :]
            ids = wordids[d]
            cts = n.array(wordcts[d])
            phinorm = n.zeros(len(ids))
            for i in range(0, len(ids)):
                temp = Elogtheta[d, :] + self._Elogbeta[:, ids[i]]
                tmax = max(temp)
                phinorm[i] = n.log(sum(n.exp(temp - tmax))) + tmax
            score += n.sum(cts * phinorm)
#             oldphinorm = phinorm
#             phinorm = n.dot(expElogtheta[d, :], self._expElogbeta[:, ids])
#             print oldphinorm
#             print n.log(phinorm)
#             score += n.sum(cts * n.log(phinorm))

        # E[log p(theta | alpha) - log q(theta | gamma)]
        score += n.sum((self._alpha - gamma)*Elogtheta)
        score += n.sum(gammaln(gamma) - gammaln(self._alpha))
        score += sum(gammaln(self._alpha*self._K) - gammaln(n.sum(gamma, 1)))

        # Compensate for the subsampling of the population of documents
        score = score * self._D / len(docs)

        # E[log p(beta | eta) - log q (beta | lambda)]
        score = score + n.sum((self._eta-self._lambda)*self._Elogbeta)
        score = score + n.sum(gammaln(self._lambda) - gammaln(self._eta))
        score = score + n.sum(gammaln(self._eta*self._W) -
                              gammaln(n.sum(self._lambda, 1)))

        return(score)


def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    S = int(sys.argv[6])

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[7]).readlines()
    model = OnlineLDA(vocab, K, 100000, 0.1, 0.01, 1, 0.75)
    for i in range(1000):
        print(i)
        wordids = [d.words for d in docs.docs[(i*S):((i+1)*S)]]
        wordcts = [d.counts for d in docs.docs[(i*S):((i+1)*S)]]
        model.update_lambda(wordids, wordcts)
        n.savetxt('/tmp/lambda%d' % i, model._lambda.T)
#     infile = open(infile)
#     corpus.read_stream_data(infile, 100000)


if __name__ == '__main__':
    main()
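# Illustrative sketch (not from the original file) of the step-size schedule
# that update_lambda() applies above: rho_t = (tau0 + t) ** (-kappa), and the
# new noisy estimate lambda_hat enters the parameters as
# lambda <- (1 - rho_t) * lambda + rho_t * lambda_hat. The values tau0=1 and
# kappa=0.75 mirror the OnlineLDA(...) call in main(), assuming its last two
# constructor arguments are tau0 and kappa.
tau0, kappa = 1.0, 0.75
for t in (0, 1, 10, 100, 999):
    rhot = pow(tau0 + t, -kappa)
    print('t=%4d  rho_t=%.4f' % (t, rhot))  # early mini-batches carry the most weight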
import sys

sys.path.append('./model')
sys.path.append('./process')
sys.path.append('./training')
sys.path.append('./service')

from corpus import corpus
from corpus import csvExtractor
from NLTKpipeline import NLTKpipeline

extractor = csvExtractor(label_col=0, text_col=5, delim_patt='"')
extractor.setLabelDictionary({"0": "neg", "4": "pos"})

# c = corpus(loc="/home/nitrous/data/sentiment140/subset.csv", extractor=extractor)
c = corpus(loc="/home/nitrous/data/sentiment140/rand_subset.csv", extractor=extractor)

pipeline = NLTKpipeline()
pipeline.process(c)

for a in c.docs:
    a.toString()
    print("\n")
        pred_ids.append(sent_ids)
    pred_ids = np.array(pred_ids)
    acc = []
    for idx in tqdm(range(0, len(pred_ids), batch_size)):
        if cuda:
            inp = Variable(torch.from_numpy(pred_ids[idx:idx + 64])).cuda()
            tag = Variable(torch.from_numpy(pred_ids[idx:idx + 64])).cuda()
        pred = cnn(inp)
        _, pred_idx = torch.max(pred, 1)
        acc.append(sum(pred_idx.cpu().data.numpy() == 1) * 1. / tag.size(0))
    print(np.mean(acc))


if __name__ == '__main__':
    corpus_ = corpus('bank_all_0.txt', 'bank_all_1.txt', 'stop_words.csv')
    print('Max length of sents :{}'.format(np.max(corpus_.lengths)))
    print('The vocab size is {}'.format(len(corpus_.token2idx)))
    print('Train ids shape {}'.format(corpus_.train_ids.shape))
    print('Eval ids shape {}'.format(corpus_.eval_ids.shape))

    num_classes = 2
    vocab_size = len(corpus_.token2idx)
    emb_dim = 128
    criterion = nn.CrossEntropyLoss()
    if not os.path.isfile('cnn.pt'):
        if cuda:
            cnn = CNN(num_classes, vocab_size, emb_dim, filter_sizes, num_filters).cuda()
        else:
            cnn = CNN(num_classes, vocab_size, emb_dim, filter_sizes,