import time

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer as tv


def tfidf(max_features=5000, prefix="extraction-", begin=1, end=26):
    # get stopwords
    sf = open('chi_stopwords.txt', 'r')
    stopwords = [x.strip() for x in sf.read().split(',')]
    sf.close()
    vectorizer = tv(max_features=max_features)  # tokenizer=tokenizer)
    # load data; getText() is defined elsewhere in the project
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st
    # segment each document with jieba so the vectorizer sees space-separated tokens
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    tfidf = vectorizer.fit_transform(corpus.values()).toarray()
    print tfidf.shape
    voc = vectorizer.get_feature_names()
    # rank the vocabulary by total tf-idf weight, dropping stopwords
    wordssum = tfidf.sum(axis=0)
    index = range(len(voc))
    index = [i for (weight, word, i) in sorted(zip(wordssum, voc, index), reverse=True)
             if word.encode('utf-8') not in stopwords]
    voc_sorted = [voc[i] for i in index]
    print time.time() - getdatatime
    return tfidf, voc, txt
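
# A minimal usage sketch for the function above, assuming getText(),
# chi_stopwords.txt, and the extraction-* data files exist as in the
# original project:
if __name__ == '__main__':
    mat, vocab, raw = tfidf(max_features=5000, prefix="extraction-", begin=1, end=2)
    print mat.shape   # one tf-idf row per document
    print len(vocab)  # feature names, at most max_features of them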
def cluster(self, processed_text):
    vector = tv(use_idf=True)
    self.matrix = vector.fit_transform(processed_text)
    # `clusters` is assumed to be defined in the enclosing scope
    self.model = km(n_clusters=clusters, n_init=1000)
    self.model.fit(self.matrix)
    self.model_pred = km(n_clusters=clusters, n_init=1000)
    self.model_pred.fit_predict(self.matrix)
    return vector
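
# A self-contained sketch of the same pipeline outside the class, assuming
# tv is sklearn's TfidfVectorizer and km is sklearn's KMeans (as the aliases
# above suggest); the documents here are toy stand-ins:
from sklearn.feature_extraction.text import TfidfVectorizer as tv
from sklearn.cluster import KMeans as km

docs = ["apples and oranges", "oranges and pears", "cars and trucks"]
matrix = tv(use_idf=True).fit_transform(docs)
model = km(n_clusters=2, n_init=10)
print(model.fit_predict(matrix))  # cluster id for each document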
import cPickle
import json
import time

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer as tv


def tfidf(max_features=10000, path="/home/tingyubi/20w/data/",
          prefix="extraction-", begin=1, end=26):
    ### get stopwords
    sf = open('chi_n.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split('\n')]
    sf.close()
    ### load data; getText() is defined elsewhere in the project
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print "Loading data cost " + str(getdatatime - st) + " seconds."
    ### cut text with jieba and cache the segmented corpus as json
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    jsonfile = "tfidf_cut_" + prefix + str(begin) + "_" + str(end) + ".json"
    f = open(jsonfile, 'w')
    json.dump(corpus, f)
    f.close()
    ### tfidf vectorizer; keep the matrix sparse (no .toarray()) to save memory
    vectorizer = tv(max_features=max_features, stop_words=stopwords)
    tfidf = vectorizer.fit_transform(corpus.values())
    print "Tfidf vectorizing cost " + str(time.time() - getdatatime) + " seconds."
    voc = vectorizer.get_feature_names()
    ### save the sparse matrix with cPickle and the vocabulary as utf-8 text
    pklfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".mat"
    f = open(pklfile, 'wb')
    cPickle.dump(tfidf, f, -1)
    f.close()
    vocfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".voc"
    f = open(vocfile, 'w')
    f.write("\n".join(voc).encode('utf-8'))
    f.close()
    return tfidf, voc, txt
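
# Sketch of reading back the artifacts the function above writes, assuming
# the default prefix/begin/end values and the same cPickle import:
f = open("tfidf_extraction-1_26.mat", 'rb')
tfidf = cPickle.load(f)
f.close()
f = open("tfidf_extraction-1_26.voc", 'r')
voc = f.read().decode('utf-8').split('\n')
f.close()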
def __init__(self, revisoes, ngram_range=(1, 1)):
    self.vetorizador = tv(ngram_range=ngram_range)
    self.revisoes = revisoes
    self.vetorizar()
    print("\nTFIDFVectorizer finished vectorizing %s." % str(ngram_range))
# load data; getText() is defined earlier in the script
st = time.time()
d, txt = getText(prefix=args.prefix, begin=args.begin, end=args.end)
getdatatime = time.time()
print "Loading data cost " + str(getdatatime - st) + " seconds."

# cut words with jieba so the vectorizer sees space-separated tokens
corpus = {}
for i in range(len(txt)):
    corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))

# tfidf
vectorizer = tv(max_features=args.max_features, stop_words=stopwords)
tfidf = vectorizer.fit_transform(corpus.values()).toarray()
print tfidf.shape
voc = vectorizer.get_feature_names()
print "Tfidf calculating cost " + str(time.time() - getdatatime) + " seconds."

# sorting according to tfidf: rank words by their summed weight, highest first
wordssum = tfidf.sum(axis=0)
index = range(len(voc))
index = [i for (weight, word, i) in sorted(zip(wordssum, voc, index), reverse=True)]
voc_sorted = [voc[i] for i in index]

f = open(args.outputfile, 'w')
for x in voc_sorted:
    f.write(x.encode("utf-8") + "\n")
f.close()
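
# The excerpt above references an `args` namespace and a `stopwords` list
# built earlier in the file; a sketch of the setup it implies (flag names
# and defaults are inferred from the attribute accesses, not confirmed,
# and chi_n.txt is borrowed from the stopword loading seen above):
import argparse
import time

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer as tv

parser = argparse.ArgumentParser()
parser.add_argument('--prefix', default='extraction-')
parser.add_argument('--begin', type=int, default=1)
parser.add_argument('--end', type=int, default=26)
parser.add_argument('--max_features', type=int, default=10000)
parser.add_argument('--outputfile', default='voc_sorted.txt')
args = parser.parse_args()
stopwords = [x.strip().decode('utf-8') for x in open('chi_n.txt').read().split('\n')]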
print("Accuracy: %0.2f (+/- %0.2f)" % (train_scores.mean(), train_scores.std() * 2)) test_scores = classifier.score(X_test, y_test) print('Test Scores') print(test_scores) print("Accuracy: %0.2f (+/- %0.2f)" % (test_scores.mean(), test_scores.std() * 2)) print('') ''' Train a SGD classifier using unigram representation with tf-idf, predict sentiments on imdb_te.csv, and write output to unigram.output.txt ''' vectorizer = tv(encoding='utf-8', strip_accents='unicode', ngram_range=(1, 1), decode_error='replace') vector_data = vectorizer.fit_transform(train_text) model_selector = model_selection X_train, X_test, y_train, y_test = model_selector.train_test_split( vector_data, labels, stratify=labels, test_size=0.2) classifier = sgd(loss='hinge', penalty='l1') classifier.fit(X_train, y_train) train_scores = classifier.score(X_train, y_train) print('TF-IDF Unigram Results') print('Train Scores') print(train_scores) print("Accuracy: %0.2f (+/- %0.2f)" %
def build_tfidf(corpus):
    vec = tv()
    tfidfs = vec.fit_transform(corpus)
    print(tfidfs.shape)
    return tfidfs, vec
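
# Example call, with toy documents standing in for a real corpus:
docs = ["the cat sat", "the dog sat", "the cat ran"]
tfidfs, vec = build_tfidf(docs)
print(vec.get_feature_names())  # vocabulary learned from the corpus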
# Import the necessary packages for computing similarity between texts.
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.feature_extraction.text import TfidfVectorizer as tv

# Load the texts - the original & the generated
gensis = open('../Genesis.txt', 'r').read().split('\r')
model_gensis = open('../Model_Genesis.txt', 'r').read().split('\n')

# Initialize the vectorizer on the original text
TfV = tv()
TfV.fit(gensis)
Y = TfV.transform(gensis)

# For every generated sentence, print the most similar original sentence
# and accumulate the best similarity score
similaritySum = 0
for sentence in model_gensis:
    X = TfV.transform([sentence])
    print(sentence)
    print(gensis[cs(X, Y).argmax()])
    print(' ')
    similaritySum += cs(X, Y).max()

# Average over the 7 generated sentences
similarity = similaritySum / 7
print('The similarity between the original text - Genesis - and the model is: ', similarity)
def build_tfidf(corpus):
    # fit a vectorizer on the corpus and return the sparse tf-idf matrix
    vectorizer = tv()
    tfidfs = vectorizer.fit_transform(corpus)
    return tfidfs