def printLSA(self):
    corpus = []
    for message in self.message_list:
        corpus += message.text

    # tfidf stuff
    vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_

    # lsa stuff
    lsa = TruncatedSVD(n_components=27, n_iter=100)
    lsa.fit(X)

    print dict(zip(vectorizer.get_feature_names(), idf))
    print ""

    # print related concepts
    terms = vectorizer.get_feature_names()
    for i, comp in enumerate(lsa.components_):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]
        print "Concept %d:" % i
        for term in sortedTerms:
            print term[0]
        print " "

    # print terms sorted by idf
    v = sorted(zip(vectorizer.get_feature_names(), idf), key=lambda x: x[1])
    print v
    print "\n\n"
def getFeatures(tweets, vocabularyWords):
    """
    Gets the features (word count, represented as a sparse matrix), where we
    can recover the particular feature labels. We then weight features via
    Tf-idf terms. (http://en.wikipedia.org/wiki/Tf%E2%80%93idf)

    See: http://scikit-learn.org/dev/modules/feature_extraction.html#text-feature-extraction
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(vocabulary=vocabularyWords, ngram_range=(1, 3))
    features = vectorizer.fit_transform(tweets)

    # print "features are: "
    # print features.toarray()
    print "features length is: "
    print len(features.toarray()[0])
    # print "feature names are: "
    # print vectorizer.get_feature_names()
    print "feature name lengths are: "
    print len(vectorizer.get_feature_names())

    return (features.toarray(), vectorizer.get_feature_names())
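# A minimal usage sketch for getFeatures above. The tweets and vocabulary
# below are hypothetical illustrations, not data from the original source.
tweets = ["the game was great", "great great win", "bad loss today"]
vocabularyWords = ["game", "great", "win", "loss", "great great"]
features, names = getFeatures(tweets, vocabularyWords)
# With a fixed vocabulary, `names` simply echoes vocabularyWords and
# `features` is a dense array of shape (3, 5).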
def test_text_vectorization():
    mongo_dataset = MongoHC("hc", "re0")
    data = [d for d in mongo_dataset.get_all(order_by="id_doc")]
    text = [d["text"] for d in data[1:2]]

    tfidf_vectorizer = TfidfVectorizer(max_df=1,
                                       max_features=200000,
                                       min_df=1,
                                       stop_words="english",
                                       strip_accents="unicode",
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm="l2")

    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    print tfidf_vectorizer.get_feature_names()
    print tfidf_matrix.data

    # features sorted by descending idf
    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print indices
    features = tfidf_vectorizer.get_feature_names()
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]
    print len(features)
    print tfidf_matrix.shape
    print top_features
def text_to_vectors(dirname_or_textdata, test_dirname_or_textdata=None,
                    ngram_range=(1, 1), verbose=False):
    if isinstance(dirname_or_textdata, str):
        textdata = load_files(dirname_or_textdata, verbose)
    else:
        textdata = dirname_or_textdata

    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    vectors = vectorizer.fit_transform(textdata.data)

    data = Struct()
    data.vectorizer = vectorizer
    data.vectors = vectors
    data.targets = textdata.targets
    data.target_names = textdata.target_names
    data.feature_names = vectorizer.get_feature_names()

    if test_dirname_or_textdata is not None:
        if isinstance(test_dirname_or_textdata, str):
            textdata = load_files(test_dirname_or_textdata, verbose)
        else:
            textdata = test_dirname_or_textdata

        # reuse the fitted vectorizer so train and test share a vocabulary
        test_vectors = vectorizer.transform(textdata.data)

        test_data = Struct()
        test_data.vectorizer = vectorizer
        test_data.vectors = test_vectors
        test_data.targets = textdata.targets
        test_data.target_names = textdata.target_names
        test_data.feature_names = vectorizer.get_feature_names()

        return data, test_data
    else:
        return data
def test2():
    with codecs.open('/home/zhangwj/Applications/Scrapy/baike/files/data_fenci.txt',
                     'rb', encoding='utf-8') as f:
        # fit_transform expects an iterable of documents (here, one per line),
        # not a single string, which would be iterated character by character
        data_samples = f.read().splitlines()
    n_features = 1000
    # CountVectorizer turns the words in the text into a term-frequency matrix
    # via fit_transform; TfidfTransformer then computes each word's tf-idf
    # weight. stop_words must be 'english' or a list of words, so wrap the
    # custom stop word in a list.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words=[u"应该"])
    # returns a sparse matrix of shape [n_samples, n_features]:
    # the tf-idf-weighted document-term matrix
    tfidf = tfidf_vectorizer.fit_transform(data_samples)
    # the above yields the tf-idf weight matrix (samples x features);
    # this call returns the feature names; each sample is one document
    tfidf_vectorizer.get_feature_names()
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    K_clusters = K
    stop_words = set(stopwords.words('spanish')).union(
        set(['http', 'www', 'san', '099', '098', '096', '097']))
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=numfeatures,
                                 min_df=2, stop_words=set(stop_words),
                                 use_idf=True)

    text = []
    with open(sample_file, 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            text.append(row[1])

    t0 = time()
    print("Extracting features from the training dataset using a sparse vectorizer")
    X = vectorizer.fit_transform(text)
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)

    idf = vectorizer.idf_
    words = dict(zip(vectorizer.get_feature_names(), idf))
    terms = sorted(words, key=words.__getitem__)[0:10]

    # mapping from feature id to actual word
    id2words = {}
    for i, word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word

    t0 = time()
    print("Applying topic modeling, using LDA")
    print(str(K_clusters) + " topics")
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus, num_topics=K_clusters, id2word=id2words)
    print("done in %fs" % (time() - t0))

    # write json version
    json_data = {"terms": terms, "topics": None}
    json_topics = []
    for i, item in enumerate(lda.show_topics(num_topics=K_clusters,
                                             num_words=num_display_words,
                                             formatted=False)):
        topic = {}
        topic['name'] = "topic" + str(i)
        topic['children'] = []
        for weight, term in item:
            child = {}
            child['name'] = term
            child['weight'] = weight
            topic['children'].append(child)
        json_topics.append(topic)
    json_data['topics'] = json_topics

    with open(outputfile + ".json", 'w') as outfile:
        json.dump(json_data, outfile)
def tfidf_vectorizer(codex, max_df=1, min_df=0, stop_words='english',
                     train_split=False):
    """
    Calculate term frequency for words in all comments
    Input: text string (nouns only from noun_tokenizer)
    Output: transformed input, term list from tfidf, model
    """
    # Select english stopwords
    cachedStopWords = set(stopwords.words("english"))
    # Add words to stopwords list
    cachedStopWords.update(('and', 'I', 'A', 'And', 'So', 'arnt', 'This', 'When',
                            'It', 'many', 'Many', 'so', 'cant', 'Yes', 'yes',
                            'No', 'no', 'These', 'these', '', ' ', 'ok', 'na',
                            'edit', 'idk', 'gon', 'wasnt', 'yt', 'sure', 'watch',
                            'whats', 'youre', 'theyll', 'anyone'))

    if train_split:
        # Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                     stop_words=cachedStopWords)
        x_train, x_test = train_test_split(codex)
        # Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(x_train)
        # Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
    else:
        # Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                     stop_words=cachedStopWords)
        # Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(codex)
        # Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
def LoadDocuments(fname, collect_links):
    crawl_data, urls, titles, relationships = pages_to_mem(fname, collect_links)

    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english',
                                ngram_range=(1, 2), sublinear_tf=True)
    term_tfidf = tfidfVect.fit_transform(crawl_data)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    term_b = dict(izip(i, xrange(len(dict_values))))  # dictionary of words and indices

    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english',
                                ngram_range=(1, 2))
    title_tfidf = tfidfVect.fit_transform(titles)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    title_b = dict(izip(i, xrange(len(dict_values))))  # dictionary of words and indices

    return title_tfidf, title_b, term_tfidf, term_b, urls, relationships
def test1():
    n_samples = 2000
    n_features = 1000

    print("Loading dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    data_samples = dataset.data[:n_samples]

    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    # sparse matrix of shape [n_samples, n_features]:
    # the tf-idf-weighted document-term matrix
    tfidf = tfidf_vectorizer.fit_transform(data_samples)
    # the above yields the tf-idf weight matrix (samples x features);
    # this call returns the feature names; each sample is one document
    tfidf_vectorizer.get_feature_names()
def rocchio(request):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.utils.extmath import randomized_svd
    from sklearn import feature_selection
    import pandas as pd

    document_index = []
    s = SessionStore()
    sessionData = db.sessionHistory.find_one({"session_key": s.session_key})
    urls_visited = sessionData['url_visited']
    urls = []
    for url in urls_visited:
        urls.append(url[0])

    bodyContentList = db.crawledCollection.find({'url': {"$in": urls}}, {'body': 1})
    body = []
    terms = []
    for x in bodyContentList:
        # strip punctuation and digits; the original character class was
        # malformed (unescaped brackets and a stray "0-9" outside the class)
        body.append(re.sub(r'[!@#$%^&*()\[\]./<>?\\|`~\-=_+0-9]', '', x['body']))

    # Turning the body content into a bag of words
    top_features = []
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(body)
    indices = np.argsort(vectorizer.idf_)[::-1]
    features = vectorizer.get_feature_names()
    top_n = 10
    top_features.append([features[i] for i in indices[:top_n]])
    print top_features

    vectorizer = CountVectorizer(min_df=1, stop_words='english')
    dtm = vectorizer.fit_transform(body)
    index = pd.DataFrame(dtm.toarray(), index=body,
                         columns=vectorizer.get_feature_names())
    indexterms = vectorizer.get_feature_names()
    transform = TfidfTransformer()
    tfidf = transform.fit_transform(dtm)
    U, Sigma, V = randomized_svd(tfidf, n_components=5, n_iter=5,
                                 transpose=True, random_state=None)

    # getting the highest count of words and adding it into the query
    return HttpResponse(top_features)
def tfidf(synopses):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2, stop_words='english',
                                       use_idf=True,
                                       tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
    terms = tfidf_vectorizer.get_feature_names()
    print("terms:", terms)
    print(tfidf_matrix.shape)
    return terms, tfidf_matrix  # return the tf-idf matrix
def get_top_terms(self, stops=STOPS):
    # vectorize using 1- to 3-grams
    vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1, 3))
    tfidf = vectorizer.fit_transform(self.docs)

    # enumerate feature names, ie. the actual words
    self.feature_names = vectorizer.get_feature_names()

    # convert to dense array
    dense = tfidf.todense()

    # container for top terms per doc
    self.features = []

    for doc in dense:
        doc = doc.tolist()[0]
        # creates a list of tuples, (term_id, score)
        phrase_scores = [pair for pair in zip(range(0, len(doc)), doc)
                         if pair[1] > 0]
        # feature_ids = sorted(phrase_scores, key=lambda t: t[1] * -1)

        doc_features = []
        for f_ in phrase_scores:
            fname = self.feature_names[f_[0]]
            fscore = f_[1]
            doc_features.append((fscore, fname))

        top_terms = sorted(doc_features, reverse=True)  # [:n_terms]
        # top_terms = ",".join([x[1] for x in top_terms])
        self.features.append(top_terms)
def preprocess(word_data, targets):
    print("\n### PREPROCESSING DATA ###")

    # vectorize
    print("-- Vectorization")
    vectorizer = TfidfVectorizer(sublinear_tf=True)  # , stop_words='english'
    data_transformed = vectorizer.fit_transform(word_data)

    # feature selection
    print("-- Feature Selection")
    selector = SelectPercentile(percentile=5)
    data_selected = selector.fit_transform(data_transformed, targets)
    if data_selected.shape[1] == 0:
        data_selected = data_transformed
    else:
        print("Top {} features were selected".format(data_selected.shape[1]))

    # print top features
    nr_features = 30
    i = selector.scores_.argsort()[::-1][:nr_features]
    top_features = np.column_stack((np.asarray(vectorizer.get_feature_names())[i],
                                    selector.scores_[i],
                                    selector.pvalues_[i]))
    print("\nTop %i Features:" % nr_features)
    print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n")

    features_train, features_test, labels_train, labels_test = \
        train_test_split(data_selected, targets, test_size=0.2, stratify=targets)

    return features_train, features_test, labels_train, labels_test
def get_peronalpreference_vectors(vocab, user_pref_values):
    vectorizer = TfidfVectorizer(vocabulary=vocab, lowercase=False)
    vectors = vectorizer.fit_transform(user_pref_values).toarray()
    words = vectorizer.get_feature_names()
    # idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    return words, vectors
def cluster(data, k):
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2,
                                 stop_words=['nfl', 'game', 'team'])
    td_matrix = vectorizer.fit_transform(data)

    km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_jobs=-1)
    km.fit(td_matrix)

    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()

    def count(acc, value):
        acc[value] += 1
        return acc

    cluster_counts = reduce(count, km.labels_, [0] * k)

    # for each cluster (largest first), emit its share of the data followed
    # by its ten strongest terms
    result = []
    for i in reversed(numpy.array(cluster_counts).argsort()):
        x = [float(cluster_counts[i]) / len(data)]
        for ind in order_centroids[i, :10]:
            x.append(terms[ind])
        result.append(x)
    return result
def classify(clf, chapter_contents_train, y_train, chapter_contents_test, k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names, chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
def tfidf_word_match_share(question1, question2):
    qs = question1 + question2
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
    tfidf_matrix = tfidf_vectorizer.fit_transform(qs)
    feature_names = tfidf_vectorizer.get_feature_names()
    # dense = tfidf_matrix.todense()
    # word_index_dict = dict((j, i) for i, j in enumerate(feature_names))

    tf_idf = []
    for q1, q2 in zip(question1, question2):
        q1words = {}
        q2words = {}
        for word in str(q1).lower().split():
            if word not in stops:
                q1words[word] = 1
        for word in str(q2).lower().split():
            if word not in stops:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            tf_idf.append([0])
        else:
            q1_tfidf = tfidf_vectorizer.transform([" ".join(q1words.keys())])
            q2_tfidf = tfidf_vectorizer.transform([" ".join(q2words.keys())])
            inter = np.intersect1d(q1_tfidf.indices, q2_tfidf.indices)
            shared_weights = 0
            for word_index in inter:
                shared_weights += q1_tfidf[0, word_index] + q2_tfidf[0, word_index]
            total_weights = q1_tfidf.sum() + q2_tfidf.sum()
            if np.sum(total_weights) == 0:
                tf_idf.append([0])
            else:
                score = np.sum(shared_weights) / np.sum(total_weights)
                tf_idf.append([round(score, 2)])

    print("Created tf_idf feature")
    return np.array(tf_idf)
def get_tfidf_model(self, dirname):
    data = Sentences(dirname)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    mat_array = tfidf_matrix.toarray()
    fn = tfidf_vectorizer.get_feature_names()
    return tfidf_vectorizer
class Train:
    """Using non-negative matrix factorization to learn the vector of a document"""

    def __init__(self, filename_in):
        self.text = []
        for line in open(filename_in, 'rb'):
            self.text.append(line.strip().decode('utf-8'))

    def train(self, n_topics=10):
        self.vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.6)
        tfidf = self.vectorizer.fit_transform(self.text)
        n_samples = len(self.text)
        print("Fitting the NMF model with n_samples=%d and n_features=%d..."
              % (n_samples, n_topics))
        self.nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)

    def show_result(self, n_top_words=10):
        feature_names = self.vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(self.nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    def __str__(self):
        # __str__ must return a string, not print; components_ has shape
        # (n_topics, n_features), so the topic count is shape[0]
        return str(np.shape(self.nmf.components_)[0]) + ' topics'
def get_salience_matrix(keys, salient_set):
    """ run test set on salient terms """
    salient_feats = []
    tfidf = TfidfVectorizer(stop_words="english")
    top_n = 100
    for key in keys:
        salience_test = []
        top_terms = []
        history = clean(tweets[str(key)]["audience"]["user"]["history"])[1:]
        try:
            teeeff = tfidf.fit_transform(history)
            indices = np.argsort(tfidf.idf_)[::-1]
            features = tfidf.get_feature_names()
            top_terms = [features[i] for i in indices[:top_n]]
        except:
            top_terms = []
        for term in salient_set:
            if term in top_terms:
                salience_test.append(1)
            else:
                salience_test.append(0)
        salient_feats.append(salience_test)
    return np.array(salient_feats)
def build_model(self):
    vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    vector = vectorizer.fit_transform(self.df['Comment'].values).toarray()
    self.model = NMF(n_components=self.n_topics).fit(vector)
    self.features = vectorizer.get_feature_names()
    # transform the document-term matrix, not the vectorizer itself
    self.matrix = self.model.transform(vector)

# From the matrix, retrieve top example, topic words, and number of comments per topic
def output_data(self):
    '''
    OUTPUT DataFrame
    '''
    self.examples = []
    self.comment = []
    self.topic_words = []

    # Retrieve the comment most relevant to each topic
    index = self.matrix.argmax(axis=0)
    self.df = self.df.reset_index()
    self.examples.append(self.df.ix[index]['Comment'].values)
    np.sort(self.matrix, axis=1)

    # Retrieve all comments that are relevant to each topic
    for i in range(self.n_topics):
        self.comment.append(len(self.matrix[:, i][self.matrix[:, i] > 0.05]))
    # 'comments' was undefined in the original; presumably the per-topic counts
    self.num_per_topics = len(self.comment)

    # Retrieve top 10 topic words
    for topic in self.model.components_:
        self.topic_words.append(" ".join(
            [self.features[i] for i in topic.argsort()[:-10 - 1:-1]]))

    return self.examples, self.topic_words, self.num_per_topics
def test_string_compare():
    # http://stackoverflow.com/questions/8897593/similarity-between-two-text-documents
    # http://stackoverflow.com/questions/2380394/simple-implementation-of-n-gram-tf-idf-and-cosine-similarity-in-python?noredirect=1&lq=1
    # "Equivalent to CountVectorizer followed by TfidfTransformer."
    #   http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    # http://stackoverflow.com/questions/32128802/how-to-use-sklearns-countvectorizerand-to-get-ngrams-that-include-any-punctua
    # http://stackoverflow.com/questions/23850256/how-can-i-pass-a-preprocessor-to-tfidfvectorizer-sklearn-python
    from sklearn.feature_extraction.text import TfidfVectorizer
    from nltk.tokenize import TreebankWordTokenizer

    test_text_1 = ["I'd like an apple",
                   "An apple a day keeps the doctor away",
                   "Never compare an apple to an orange",
                   "I prefer scikit-learn to Orange",
                   "I'd like an apple",
                   "I'd like an orange"]
    test_text_2 = ["I'd like an apple",
                   "I'd like an apple orange"]

    # <> add a stemming step? could be useful for typos
    vect = TfidfVectorizer(min_df=0, stop_words="english",
                           # tokenizer=TreebankWordTokenizer().tokenize,  # to get dashes, etc.
                           lowercase=True)
    # fit_transform is equivalent to fit followed by transform, but more efficient
    tfidf = vect.fit_transform(test_text_2)
    sim_M = (tfidf * tfidf.T).A

    print "tfidf\n", tfidf
    print
    print "vect.get_feature_names()\n", vect.get_feature_names()
    print
    print "similarity matrix\n", sim_M
    # plt.imshow(sim_M, cmap='rainbow', interpolation='nearest')
    print "similarity between 2 sentences:", sim_M[0, 1]
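# Side note (not from the original source): because TfidfVectorizer
# L2-normalizes rows by default, the dot product tfidf * tfidf.T above is
# exactly cosine similarity. A minimal sketch of the equivalent call via
# sklearn's helper:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["I'd like an apple", "I'd like an apple orange"]
tfidf = TfidfVectorizer(stop_words="english").fit_transform(docs)
print(cosine_similarity(tfidf[0], tfidf[1]))  # same value as sim_M[0, 1]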
def get_voc_tfidfdoc_from_synant(antPath, synPath, gre):
    voc = get_voc_from_synant(antPath, synPath, gre)
    doc = defaultdict(list)
    with open(antPath, 'r') as f:
        for row in f:
            row = row.strip().split()
            target = row[0]
            if target in voc:
                for w in row[1:]:
                    if w in voc:
                        doc[target].append(w)
    with open(synPath, 'r') as f:
        for row in f:
            row = row.strip().split()
            target = row[0]
            if target in voc:
                for w in row[1:]:
                    if w in voc:
                        doc[target].append(w)
    entries = doc.keys()
    doc = doc.values()
    doc = [' '.join(x) for x in doc]
    td = TfidfVectorizer(token_pattern=u'(?u)\\b[^ ]+\\b')
    doc = td.fit_transform(doc)
    items = td.get_feature_names()
    items = [str(x) for x in items]
    return voc, doc, entries, items
def create_d3_list(sym):
    tweets = []
    twitter_data = get_twitter_data(sym).items(100)
    for tweet in twitter_data:
        tweets.append(tweet.text)

    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(tweets).toarray()
    words = vectorizer.get_feature_names()
    # words = [word for word in words if word.isalpha()]

    avg = np.sum(vectors, axis=0)  # / np.sum(vectors > 0, axis=0)
    print "top 10 by average tf-idf"
    d = enchant.Dict("en_US")
    # top_vals = [str(word) for word in get_top_values(avg, 100, words) if d.check(word)]

    words_avg = zip(words, avg)
    words_avg.sort(key=lambda tup: tup[1], reverse=True)

    d3_list = []
    sizing_d3 = 110 / words_avg[0][1]
    for cell in words_avg:
        if d.check(cell[0]):
            if cell[0] != 'rt':
                d3_list.append({"text": str(cell[0]), "size": cell[1] * sizing_d3})
    return d3_list
def vectorize_words(self, clean_profiles, max_features=500):
    # Vectorize the words in the cleaned profiles using
    # term frequency/inverse document frequency (TF-IDF)
    print "Creating the bag of words...\n"

    # Initialize the TfidfVectorizer object, scikit-learn's tf-idf-weighted
    # bag of words tool.
    vectorizer = TfidfVectorizer(min_df=1, max_features=max_features)
    vectorizer._validate_vocabulary()

    # fit_transform() does two things: first, it fits the model and learns
    # the vocabulary; second, it transforms our training data into feature
    # vectors. The input to fit_transform should be a list of strings.
    data_features = vectorizer.fit_transform(clean_profiles)

    # Numpy arrays are easy to work with, so convert the result to an array
    data_features = data_features.toarray()
    print data_features.shape

    vocab = vectorizer.get_feature_names()

    # Sum up the counts of each vocabulary word
    dist = np.sum(data_features, axis=0)

    return vectorizer, data_features, vocab, dist
class Classifier(object):
    def __init__(self):
        self.classifier = LogisticRegression(intercept_scaling=100)
        self.vectorizer = TfidfVectorizer()

    def trainvectorizer(self, corpus):
        self.vectorizer.fit_transform(corpus)
        file1 = open("feature_names.txt", "w")
        names = self.vectorizer.get_feature_names()
        print len(names)
        for name in names:
            file1.write(name.encode('utf8') + "\n")
        file1.close()
        print "vectorizer training is over...."

    def trainclassifier(self, train_X, train_Y):
        self.classifier.fit(train_X, train_Y)
        print "classifier training is over ...."

    def getfeature(self, text):  # return a feature array
        matrx = self.vectorizer.transform([text]).toarray()
        array = matrx[0]
        return array

    def getresult(self, feature):  # return true or false
        return self.classifier.predict(feature)
def getFeatures(self, tweets, query=None):
    # tfidf matrix
    rewroteText = [t.processedText for t in tweets]
    tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=10000,
                                       min_df=0.05, stop_words=STOPWORDS,
                                       use_idf=True, tokenizer=process,
                                       ngram_range=(1, 2))
    tfidfMatrix = tfidf_vectorizer.fit_transform(rewroteText)
    print "Found {} meaningful words".format(tfidfMatrix.shape[1])
    self.tfidfDict = tfidf_vectorizer.get_feature_names()

    # context features: timezone, photo flag, sentiment score
    context = []
    for t in tweets:
        context.append([t.timezone, t.hasPhoto, t.sentimentScore])
    context = np.array(context)

    # stack the dense tf-idf matrix with the context features
    features = np.hstack((tfidfMatrix.A, context))
    return features
def tf_idf_features(train_data, test_data):
    # Bag-of-words representation
    tf_idf_vectorize = TfidfVectorizer()
    # tf-idf features for training data
    tf_idf_train = tf_idf_vectorize.fit_transform(train_data.data)
    # converts feature index to the word it represents
    feature_names = tf_idf_vectorize.get_feature_names()
    tf_idf_test = tf_idf_vectorize.transform(test_data.data)
    return tf_idf_train, tf_idf_test, feature_names
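# A minimal usage sketch for tf_idf_features above. The 20-newsgroups loader
# is an assumption, used here only because it returns objects with the .data
# attribute the function expects:
from sklearn.datasets import fetch_20newsgroups

train = fetch_20newsgroups(subset='train', categories=['sci.space'])
test = fetch_20newsgroups(subset='test', categories=['sci.space'])
X_train, X_test, names = tf_idf_features(train, test)
# X_test was built with transform(), not fit_transform(), so train and test
# share the same vocabulary and column order.
print(X_train.shape)
print(X_test.shape)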
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):
        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):
        # turn the 0/1 keyword flags into a space-joined "document" per row
        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)
        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)
        return self._vec.transform(data_arr).toarray()
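# A minimal usage sketch for the transformer above, on a toy frame of random
# 0/1 flags (hypothetical data; the real pipeline presumably passes the full
# source dataframe):
import numpy as np
import pandas as pd

cols = MedicalKeywordTfIdf.MEDICAL_KEYWORDS
toy_df = pd.DataFrame(np.random.randint(0, 2, size=(10, len(cols))), columns=cols)
mk = MedicalKeywordTfIdf().fit(toy_df)
print(mk.transform(toy_df).shape)   # (10, n_keywords_surviving_min_df)
print(mk.get_feature_names()[:3])   # e.g. ['medical_keyword_1_TFIDF', ...]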
def tfidf_scores(work_dir, output_file, ngram_range=(1, 1), num_results=5):
    tf = TfidfVectorizer(analyzer="word", ngram_range=ngram_range,
                         min_df=0, stop_words="english")
    if work_dir[-1] != "/":
        work_dir += "/"
    files = [work_dir + f for f in os.listdir(work_dir) if "@" in f]
    corpus = []
    for f in files:
        corpus.append(word_bag(f))

    print "-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-"
    print "fitting the corpus to generate TfIdf matrix"
    tfidf_matrix = tf.fit_transform(corpus)
    print "finished!"
    print "-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-"

    feature_names = tf.get_feature_names()
    dense = tfidf_matrix.todense()

    g = open(output_file, "w")
    for (ii, f) in enumerate(files):
        chat = dense[ii].tolist()[0]
        phrase_scores = [pair for pair in enumerate(chat) if pair[1] > 0]
        sorted_phrase_scores = sorted(phrase_scores, key=lambda x: x[1] * -1)
        g.write("words most used in %s\n" % (f.split("/")[-1]))
        for phrase, score in [(feature_names[word_id], score)
                              for (word_id, score) in sorted_phrase_scores][:num_results]:
            g.write("{0: <20} {1}\n".format(phrase, score))
    # the original never closed the output file
    g.close()
class LdaWithTfidf:
    def __init__(self, optimising=True, max_df=config.max_document_freq,
                 min_df=config.min_document_freq, max_feat=config.max_features):
        self.optimising = optimising
        self.lemmatizer = WordNetLemmatizer()
        # [a-zA-Z], not the common [a-zA-z] typo, which also matches the
        # punctuation characters between 'Z' and 'a'
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.vectorizer = TfidfVectorizer(max_df=max_df, max_features=max_feat,
                                          min_df=min_df, stop_words='english',
                                          use_idf=True)

    def split_twitter_file(self):
        print("Dividing file into 70% training and 30% testing")
        allTweets = []
        columnnames = []
        with open(config.twitter_orig_file, "rt", encoding="utf-8") as original:
            line_count = 0
            for line in original:
                if line_count == 0:
                    columnnames.append(line)
                    line_count += 1
                    continue
                allTweets.append(str(line))
        print(columnnames)
        random.shuffle(allTweets)
        with open(config.twitter_training_file, 'wt', encoding="utf-8") as training:
            training.write(columnnames[0])
            for line in allTweets[0:int(len(allTweets) * 0.70)]:
                training.write(line)
        with open(config.twitter_test_file, 'wt', encoding="utf-8") as test:
            test.write(columnnames[0])
            for line in allTweets[int(len(allTweets) * 0.70):]:
                test.write(line)

    def read_twitter_training_dataset(self, limit=0):
        print("File directory found at " + config.twitter_training_file)
        lines = []
        tokenizer = RegexpTokenizer(r'\@\w+')
        with open(config.twitter_training_file, "rt", encoding="utf-8") as training:
            reader = csv.reader(training, delimiter=',')
            line_count = 0
            for line in reader:
                if line_count == 0:
                    print(f'column names are {",".join(line)}')
                    line_count += 1
                    continue
                try:
                    timestamp = line[12]
                    user = line[7]
                    output = line[10]
                    sentence = output
                    mentions = []
                    tokenised = tokenizer.tokenize(output)
                    for token in tokenised:
                        if token.startswith('@'):
                            mentions.append(token[1:])
                except:
                    continue
                lines.append(self.apply_lemmatizer(sentence))
                line_count += 1
        if limit >= 1:
            return lines[0:limit]
        return lines

    def read_twitter_testing_dataset(self, limit=0):
        # presumably this should read the test split; the original re-read
        # the training file here
        print("File directory found at " + config.twitter_test_file)
        lines = []
        tokenizer = RegexpTokenizer(r'\@\w+')
        with open(config.twitter_test_file, "rt", encoding="utf-8") as testing:
            reader = csv.reader(testing, delimiter=',')
            line_count = 0
            for line in reader:
                if line_count == 0:
                    print(f'column names are {",".join(line)}')
                    line_count += 1
                    continue
                timestamp = line[12]
                user = line[7]
                output = line[10]
                sentence = output
                mentions = []
                tokenised = tokenizer.tokenize(output)
                for token in tokenised:
                    if token.startswith('@'):
                        mentions.append(token[1:])
                lines.append(self.apply_lemmatizer(sentence))
                line_count += 1
        if limit >= 1:
            return lines[0:limit]
        return lines

    def read_unity_dataset_training(self):
        print("File directory found at " + config.unity_training_file)
        timestamp_regex = "\[\d\d\:\d\d\]"
        timestamp_pattern = re.compile(timestamp_regex)
        user_regex = "<([a-zA-Z0-9_ ]+)>"
        user_pattern = re.compile(user_regex)
        bag_of_words_output = []
        with open(config.unity_training_file, "rt", encoding="utf-8") as training:
            for line in training:
                output = line
                timestamp_match = timestamp_pattern.search(line)
                if timestamp_match:
                    output = re.sub(timestamp_regex, "", output)
                user_match = user_pattern.search(line)
                if user_match:
                    output = re.sub(user_regex, "", output)
                bag_of_words_output.append(self.apply_lemmatizer(output))
        print("Training Data Entries: {}".format(len(bag_of_words_output)))
        return bag_of_words_output

    def read_unity_dataset_testing(self):
        print("File directory found at " + config.unity_test_file)
        timestamp_regex = "\[\d\d\:\d\d\]"
        timestamp_pattern = re.compile(timestamp_regex)
        user_regex = "<([a-zA-Z0-9_ ]+)>"
        user_pattern = re.compile(user_regex)
        bag_of_words_output = []
        with open(config.unity_test_file, "rt", encoding="utf-8") as testing:
            for line in testing:
                print("line is: ", line)
                output = line
                timestamp_match = timestamp_pattern.search(line)
                if timestamp_match:
                    output = re.sub(timestamp_regex, "", output)
                user_match = user_pattern.search(line)
                if user_match:
                    output = re.sub(user_regex, "", output)
                bag_of_words_output.append(self.apply_lemmatizer(output))
        print("Testing Data Entries: {}".format(len(bag_of_words_output)))
        return bag_of_words_output

    def apply_lemmatizer(self, sentence):
        temp = [self.lemmatizer.lemmatize(t)
                for t in self.tokenizer.tokenize(sentence)]
        # join tokens as vectorizer will split them again
        lemmas = " ".join(temp)
        return lemmas

    def build_topic_model(self, num_of_topics=config.num_of_topics,
                          max_iterations=config.max_iterations):
        corpus = self.read_twitter_training_dataset()
        test = self.read_twitter_testing_dataset()
        if self.optimising:
            corpus = self.read_twitter_testing_dataset()
        training_tfidf = self.vectorizer.fit_transform(corpus)
        print("number of tfidf features: %d" % training_tfidf.get_shape()[1])
        feat_names = self.vectorizer.get_feature_names()

        # Run LDA
        lda = LatentDirichletAllocation(n_components=num_of_topics,
                                        max_iter=max_iterations,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0).fit(training_tfidf)
        ldaModel = lda.fit_transform(training_tfidf)
        model = (feat_names, lda.components_, lda.exp_dirichlet_component_,
                 lda.doc_topic_prior_)
        if not self.optimising:
            # if used in production save the model
            print(f"Saving model to file: {config.lda_model_file_name}")
            with open(config.lda_model_file_name, 'wb') as fp:
                joblib.dump(model, fp)
        else:
            self.evaluate_model(model, training_tfidf)

    def load_model_for_eval(self):
        model = (features, components_, exp_dirichlet_component_,
                 doc_topic_prior_) = joblib.load(config.lda_model_file_name)
        # self.tf_vectorizer = CountVectorizer(vocabulary=self.features, stop_words='english')
        self.evaluate_model(model)

    def evaluate_model(self, model, data=None):
        (features, components_, exp_dirichlet_component_,
         doc_topic_prior_) = model
        print(f"First 10 words in the vocabulary: {features[0:10]}")

        # Print words associated with each topic
        for topic_idx, topic in enumerate(components_):
            print("Topic %d:" % (topic_idx))
            print(" ".join([features[i]
                            for i in topic.argsort()[:-config.num_of_topics - 1:-1]]))

        if data is not None:
            # extract from vectorised data
            termFreq = data.sum(axis=0).getA1()
            docLength = data.sum(axis=1).getA1()
            termDists = components_ / components_.sum(axis=1)[:, None]
            print("Data present, ", termFreq, docLength)

        # when optimising, output graphs
        for compNum in range(0, config.num_of_topics):
            comp = components_[compNum]
            indeces = np.argsort(comp).tolist()
            indeces.reverse()
            terms = [features[weightIndex] for weightIndex in indeces[0:10]]
            weights = [comp[weightIndex] for weightIndex in indeces[0:10]]
            terms.reverse()
            weights.reverse()
            positions = np.arange(10) + .5

            # plot strongest terms for each component
            plt.plot(compNum)
            plt.barh(positions, weights, align='center')
            plt.yticks(positions, terms)
            plt.xlabel('Weight')
            plt.title('Strongest terms for component %d' % compNum)
            plt.grid(True)
            plt.savefig(config.topics_results_dir + "topic%d" % compNum + ".png")
            plt.close()
        return
print ("PART 1\n") #Loading the data sample_docs = [] with open("top1000_movie_summaries.tsv") as fi: tsvReader = csv.reader(fi, delimiter='\t') for i, (title, plot) in enumerate(tsvReader): sample_docs.append(plot) #Tfidf Vectorizer vectorizer = TfidfVectorizer(min_df=1) tfidf_matrix = vectorizer.fit_transform(sample_docs) tfidf_array = tfidf_matrix.toarray() #Displaying Top 10 Terms and their Tfidf Scores terms_list = [(score, term) for score, term in zip(tfidf_array[0], vectorizer.get_feature_names()) if score > 0] terms_list = sorted(terms_list, key=lambda x: x[0],reverse = True) i = 0 print ("Displaying Top 10 Terms and their Tfidf Scores") for tuples in terms_list: i+=1 print (tuples) if i>=10: break #Using K-Means Algorithm to compute clusters kmeans = KMeans(n_clusters=20, random_state=0) kmeans.fit(tfidf_matrix) #Exploring Cluster Sizes cluster_labels = kmeans.labels_
with open('custom_stopwords.txt', 'r') as myfile:
    data = myfile.readlines()
cus_stop = [i.replace('\n', '') for i in data]
cus_stop.append(str(curr_year))

punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', "%"]
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS.union(punc)
stop_words = stop_words.union(cus_stop)  # Join English + Custom Stop Words
stopwords = nltk.corpus.stopwords.words('english')

vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df['content'].values.astype('U'))
word_features = vectorizer.get_feature_names()

stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

vectorizer2 = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize)
X2 = vectorizer2.fit_transform(df['content'].values.astype('U'))
word_features2 = vectorizer2.get_feature_names()

vectorizer3 = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize,
                              max_features=1000)
X3 = vectorizer3.fit_transform(df['content'].values.astype('U'))
words = vectorizer3.get_feature_names()
def give_top_utility_score_features(tweet, corpus, local_ner_corpus):
    final_segments = {}
    list_of_segments = ng.generate_ngrams(tweet)

    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    tfs = tfidf.fit_transform(corpus)

    # local ngrams
    response = tfidf.transform([tweet])
    local_list_of_ngrams = tfidf.get_feature_names()
    local_ngrams = {}
    for col in response.nonzero()[1]:
        local_ngrams[local_list_of_ngrams[col]] = response[0, col]

    # global NER
    global_list_of_ners = []
    tokenized_text = word_tokenize(tweet)
    classified_text = st.tag(tokenized_text)
    filteredList = filter(lambda x: x[1] != 'O', classified_text)
    filteredList = list(filteredList)
    if filteredList:
        for objects in filteredList:
            global_list_of_ners.append(objects[0])

    # global ngrams
    global_microsoft_list_of_ngrams = []
    try:
        global_microsoft_list_of_ngrams = microsost_ngram_service(tweet)
    except:
        pass

    for segment in list_of_segments:
        # reset the weights per segment; in the original they accumulated
        # across segments, inflating the scores of later segments
        local_weight = 0
        global_weight = 0
        final_weight_for_segment = 0
        if segment in global_list_of_ners:
            global_weight += 0.3
        if segment in global_microsoft_list_of_ngrams:
            global_weight += 0.3
        if segment in local_ner_corpus:
            local_weight += 0.3
        if segment in local_ngrams:
            local_weight += local_ngrams[segment]
        final_weight_for_segment = global_weight + local_weight
        if final_weight_for_segment >= 0.3:
            final_segments[segment] = final_weight_for_segment

    # keep the five highest-scoring segments
    new_final_segments = sorted(final_segments, key=final_segments.get,
                                reverse=True)[:5]
    new_final_segments = (new_final_segments + list(global_list_of_ners)
                          + list(global_microsoft_list_of_ngrams))

    tweet = tp.clean(tweet)
    tweet = ip.imply_preprocess(tweet)
    corpus.append(tweet)
    local_ner_corpus = set(local_ner_corpus) | set(global_list_of_ners)

    return (new_final_segments, corpus, local_ner_corpus)
    #text = [wn.lemmatize(word) for word in text]
    return text

# Removing original text column
del data['text']
list(data)
data.shape

# Vectorizing processed text column i.e. p_text
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['p_text'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())

X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.columns = tfidf_vect.get_feature_names()

# Taking independent variables together
X_features = pd.concat([
    data[data.columns[1:18]].reset_index(drop=True),
    X_tfidf_df.reset_index(drop=True)
], axis=1)
X_features.head()

# Divide data in train and test
X_train, X_test, y_train, y_test = train_test_split(X_features,
# In[26]:
x

# In[27]:
v

# In[28]:
df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())

# In[29]:
df.drop('text', axis=1, inplace=True)
df = pd.concat([df, df1], axis=1)

# In[30]:
df.columns  # these contain all columns previously selected
def represent():
    try:
        li = os.listdir('CodeExample')
    except FileNotFoundError:
        return
    for name in sorted(li):
        is_ok = True
        rep_cluster_arr = []
        try:
            sample = os.listdir('CodeExampleJson/{}'.format(name))
        except FileNotFoundError:
            continue
        tmp = os.listdir('CodeExample/{}'.format(name))
        sample = [s for s in sample if '{}.txt'.format(s[:-5]) in tmp]
        # sample.sort()
        tmp = []
        for s in sample:
            with open('CodeExampleJson/{}/{}'.format(name, s), 'r') as f:
                data = json.load(f)
            tmp.append([len(data['lines']), s])
        tmp.sort(key=lambda x: x[0])
        sample = [t[1] for t in tmp]
        try:
            sz = max([int(df.at[(name, '{}.txt'.format(s[:-5])), 'cluster'])
                      for s in sample])
        except KeyError:
            sz = 0
        tfidf_vectorizer = TfidfVectorizer(input='filename', max_df=0.5,
                                           min_df=1, max_features=3, norm='l2')
        for i in range(sz + 1):
            try:
                tmp = [s[:-5] for s in sample
                       if i == int(df.at[(name, '{}.txt'.format(s[:-5])), 'cluster'])]
                files = ['CodeExample/{}/{}.txt'.format(name, t) for t in tmp]
                if not files:
                    is_ok = False
                    break
                try:
                    tfidf = tfidf_vectorizer.fit_transform(files)
                except ValueError:
                    tfidf_vectorizer = TfidfVectorizer(input='filename',
                                                       max_df=1.0, min_df=1,
                                                       max_features=3, norm='l2')
                    tfidf = tfidf_vectorizer.fit_transform(files)
                feature = tfidf_vectorizer.get_feature_names()
                y = len(tmp)
                for j in range(len(tmp)):
                    x = tmp[j]
                    try:
                        with open('CodeAst/_{}_{}.txt'.format(name, x), 'r') as f:
                            rd = f.read()
                    except FileNotFoundError:
                        continue
                    p = 0
                    with open('ast_seqs.txt', 'r') as f:
                        for k, comm in enumerate(f):
                            if comm == rd:
                                p = k
                                break
                    with open('jd_comms.txt', 'r') as f:
                        z = f.read().split('\n')[p]
                    break
                else:
                    x = tmp[0]
            except KeyError:
                x = sample[0][:-5]
                files = ['CodeExample/{}/{}.txt'.format(name, x)]
                if not files:
                    is_ok = False
                    break
                try:
                    tfidf = tfidf_vectorizer.fit_transform(files)
                except ValueError:
                    tfidf_vectorizer = TfidfVectorizer(input='filename',
                                                       max_df=1.0, min_df=1,
                                                       max_features=3, norm='l2')
                    tfidf = tfidf_vectorizer.fit_transform(files)
                feature = tfidf_vectorizer.get_feature_names()
                y = 1
                try:
                    with open('CodeAst/_{}_{}.txt'.format(name, x), 'r') as f:
                        rd = f.read()
                except FileNotFoundError:
                    continue
                p = 0
                with open('ast_seqs.txt', 'r') as f:
                    for k, comm in enumerate(f):
                        if comm == rd:
                            p = k
                            break
                with open('jd_comms.txt', 'r') as f:
                    z = f.read().split('\n')[p]
            rep_cluster_arr.append({
                'id': x,
                'num': y,
                'comment': z,
                'feature': feature
            })
        if not is_ok:
            continue
        text = '# {}\n\n***\n\n'.format(name)
        for i, dic in enumerate(rep_cluster_arr):
            with open('CodeExampleJson/{}/{}.json'.format(name, dic['id']), 'r') as f:
                data = json.load(f)
            if not data['lines']:
                continue
            try:
                text += '## [Cluster {} ({}, {}, {})](./{})\n'.format(
                    i + 1, dic['feature'][0], dic['feature'][1],
                    dic['feature'][2], i + 1)
            except IndexError:
                text += '## [Cluster {}](./{})\n'.format(i + 1, i + 1)
            text += '{} results\n'.format(dic['num'])
            text += '> {}\n'.format(dic['comment'])
            text += '{% highlight java %}\n'
            for j, line in data['lines'].items():
                text += '{0}. {1}\n'.format(j, line.split('\n')[0])
            text += '{% endhighlight %}\n\n***\n\n'
            catalog(name, i, dic['feature'])
        os.makedirs('docs/{}'.format(name), exist_ok=True)
        with open('docs/{}/index.md'.format(name), 'w') as f:
            f.write(text)
        print(text)
def get_text_features(fnum, fname, df, nvalues, vectorize, ngrams_max):
    r"""Transform text features with count vectorization and TF-IDF,
    or alternatively factorization.

    Parameters
    ----------
    fnum : int
        Feature number, strictly for logging purposes.
    fname : str
        Name of the text column in the dataframe ``df``.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``.
    nvalues : int
        The number of unique values.
    vectorize : bool
        If ``True``, then attempt count vectorization.
    ngrams_max : int
        The maximum number of n-grams for count vectorization.

    Returns
    -------
    new_features : numpy array
        The vectorized or factorized text features.
    new_fnames : list
        The new feature name(s) for the numerical variable.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more information
    here [TFE]_.
    """
    feature = df[fname]
    min_length = int(feature.astype(str).str.len().min())
    max_length = int(feature.astype(str).str.len().max())

    if len(feature) == nvalues:
        logger.info("Feature %d: %s is a text feature [%d:%d] with maximum number of values %d",
                    fnum, fname, min_length, max_length, nvalues)
    else:
        logger.info("Feature %d: %s is a text feature [%d:%d] with %d unique values",
                    fnum, fname, min_length, max_length, nvalues)

    # need a null text placeholder for vectorization
    feature.fillna(value=NULLTEXT, inplace=True)

    # vectorization creates many columns, otherwise just factorize
    if vectorize:
        logger.info("Feature %d: %s => Attempting Vectorization", fnum, fname)
        vectorizer = TfidfVectorizer(ngram_range=[1, ngrams_max])
        try:
            new_features = vectorizer.fit_transform(feature)
            new_fnames = vectorizer.get_feature_names()
            logger.info("Feature %d: %s => Vectorization Succeeded", fnum, fname)
        except:
            logger.info("Feature %d: %s => Vectorization Failed", fnum, fname)
            new_features, _ = pd.factorize(feature)
            new_fnames = [USEP.join([fname, 'factor'])]
    else:
        logger.info("Feature %d: %s => Factorization", fnum, fname)
        new_features, _ = pd.factorize(feature)
        new_fnames = [USEP.join([fname, 'factor'])]

    return new_features, new_fnames
from sklearn.feature_extraction import stop_words

count_vect = CountVectorizer()
tfidf_vect = TfidfVectorizer()
# print(stop_words.ENGLISH_STOP_WORDS)  # the stop words that get removed

text1 = 'How are you, are you doing fine?'
text2 = "What's up?"
count_test_vect_text = count_vect.fit_transform([text1, text2])  # train count vect
tfidf_vect_text = tfidf_vect.fit_transform([text1, text2])  # train tfidf vect

## tfidf-vectorizer
print(tfidf_vect_text)
print(tfidf_vect_text.toarray())
print(tfidf_vect.get_feature_names())
print(tfidf_vect_text[0])
print("#" * 20)
print(tfidf_vect_text[1])
print(tfidf_vect.inverse_transform(tfidf_vect_text[1]))
print("-" * 20)

## count vectorizer
print(count_test_vect_text)
print(count_test_vect_text.toarray())
print(count_vect.get_feature_names())
print(count_test_vect_text[0])
print("#" * 20)
print(count_test_vect_text[1])
print(count_vect.inverse_transform(count_test_vect_text[0]))
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(word_data, authors, test_size=0.1,
                                      random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(features_train, labels_train)
print "score: ", dtree.score(features_test, labels_test)

fimp = dtree.feature_importances_
print "max: ", max(fimp)
print "len: ", len(fimp)
print "idx: ", numpy.where(fimp == max(fimp))
print vectorizer.get_feature_names()[numpy.where(fimp == max(fimp))[0][0]]
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer: tfidf
tfidf = TfidfVectorizer()

# Apply fit_transform to document: csr_mat
csr_mat = tfidf.fit_transform(documents)

# Print result of toarray() method
print(csr_mat.toarray())
# [[0.51785612 0.         0.         0.68091856 0.51785612 0.        ]
#  [0.         0.         0.51785612 0.         0.51785612 0.68091856]
#  [0.51785612 0.68091856 0.51785612 0.         0.         0.        ]]

# Get the words: words
words = tfidf.get_feature_names()

# Print words
print(words)
# ['cats', 'chase', 'dogs', 'meow', 'say', 'woof']

# Clustering Wikipedia part I
# TruncatedSVD is able to perform PCA on sparse arrays in csr_matrix format,
# such as word-frequency arrays. Combine your knowledge of TruncatedSVD and
# k-means to cluster some popular pages from Wikipedia. In this exercise,
# build the pipeline; in the next exercise, you'll apply it to the
# word-frequency array of some Wikipedia articles.
# Create a Pipeline object consisting of a TruncatedSVD followed by KMeans.
# (This time, the word-frequency matrix is precomputed, so there's no need
# for a TfidfVectorizer.)

# Perform the necessary imports
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
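# The exercise text above describes the pipeline but the snippet stops at the
# imports; a minimal sketch of the described construction (the component and
# cluster counts are illustrative assumptions, not from the original):
svd = TruncatedSVD(n_components=50)   # reduce the sparse word-frequency array
kmeans = KMeans(n_clusters=6)         # then cluster in the reduced space
pipeline = make_pipeline(svd, kmeans)
# usage in the follow-up exercise (the array name is an assumption):
#     pipeline.fit(articles); labels = pipeline.predict(articles)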
    # from_data.append(1)
    from_data.append(0 if name == "sara" else 1)
    email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

# The string that you get for word_data[152]
word_data[152]

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(word_data)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
vector = vectorizer.get_feature_names()

# How many unique words are there in your TfIdf?
print len(vector)
# What is word number 34597 in your TfIdf?
vector[34597]
title = data.Title
# abstract_title = pd.Series()
# for i in range(len(title)):
#     abstract_title[str(i)] = title[i] + abstract[i]
tf = abstract

###############################################################################
# remove dominant words
## tf-idf #####################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(tf)
dense_X = X.todense()
idf = vectorizer.idf_
featurename1 = vectorizer.get_feature_names()
# print(dict(zip(vectorizer.get_feature_names(), idf)))

# get dominant words
one = dense_X > 0
frequency1 = sum(one)
# plt.plot(np.transpose(frequency1))
# By looking at the frequency of each word, find the threshold;
# around 400, words with frequency > 500 are dominant words
do = pd.Series(frequency1.getA()[0], index=featurename1)
freq_sort1 = do.sort_values(ascending=False)
c1 = freq_sort1[:20].index
index = np.where(frequency1 > 1000)[1]
stopwords = [featurename1[x] for x in index]
def weighted_embeddings(esco_df, eperusteet_df, model):
    """
    Create TFIDF weighted embeddings for ESCO and ePerusteet.
    The input sentences should be separated with newlines.

    Args:
        esco_df (DataFrame) : Requires cols 'label' and 'text', where 'text'
            contains a textual representation of ESCO.
        eperusteet_df (DataFrame) : Requires cols 'label' and 'text', where
            'text' contains a textual representation of ePerusteet.
        model (fasttext.model) : Model for word-embeddings.

    Return:
        X_esco (xArray) : Embeddings for ESCO texts.
        X_eperusteet (xArray) : Embeddings for ePerusteet texts.
    """
    assert isinstance(esco_df, pd.DataFrame)
    assert isinstance(eperusteet_df, pd.DataFrame)

    text_esco = esco_df["text"]
    text_eperusteet = eperusteet_df["text"]

    # Do not sort, so we can resplit using the indices
    combined_texts = pd.concat([text_esco, text_eperusteet], sort=False)
    vectorizer = TfidfVectorizer()
    vectorizer.fit(combined_texts)
    tokenizer = vectorizer.build_tokenizer()
    feature_array = vectorizer.get_feature_names()

    identifiers = []
    embeddings = []
    for _, row in tqdm(esco_df.iterrows(), total=esco_df.shape[0],
                       desc="Computing embeddings for ESCOs"):
        identifiers.append(row["label"])
        texts = row["text"].split("\n")
        # Take average over the sentences
        competence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                            dims=["embedding"])
        for text in texts:
            sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                              dims=["embedding"])
            weights = vectorizer.transform([text])
            nonzero_indexes = weights.nonzero()
            weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1,))
            weights = [w / sum(weights) for w in weights]
            weight_dict = {feature_array[idx]: weights[i]
                           for i, idx in enumerate(nonzero_indexes[1])}
            for word in text.split(" "):
                try:
                    token = tokenizer(word)[0]
                except IndexError:
                    continue
                weight = weight_dict[token]
                sentence_embedding += model[word] * weight
            competence_embedding += sentence_embedding
        # If the texts were non-empty, divide to take the average; otherwise
        # keep the 0-vector (the original divided on the empty branch, which
        # would raise ZeroDivisionError)
        if texts:
            competence_embedding = competence_embedding / len(texts)
        embeddings.append(competence_embedding)

    embeddings = np.stack(embeddings, axis=0)
    esco_embeddings = xr.DataArray(embeddings,
                                   coords={"ESCO": identifiers},
                                   dims=["ESCO", "embedding"])

    identifiers = []
    embeddings = []
    for _, row in tqdm(eperusteet_df.iterrows(), total=eperusteet_df.shape[0],
                       desc="Computing embeddings for ePerusteet"):
        identifiers.append(row["label"])
        texts = row["text"].split("\n")
        # Take average over the sentences
        degree_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                        dims=["embedding"])
        for text in texts:
            sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                              dims=["embedding"])
            weights = vectorizer.transform([text])
            nonzero_indexes = weights.nonzero()
            weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1,))
            weights = [w / sum(weights) for w in weights]
            weights = {feature_array[idx]: weights[i]
                       for i, idx in enumerate(nonzero_indexes[1])}
            for word in text.split(" "):
                try:
                    token = tokenizer(word)[0]
                except IndexError:
                    continue
                weight = weights[token]
                sentence_embedding += model[word] * weight
            degree_embedding += sentence_embedding
        if texts:
            degree_embedding = degree_embedding / len(texts)
        embeddings.append(degree_embedding)

    embeddings = np.stack(embeddings, axis=0)
    eperusteet_embeddings = xr.DataArray(embeddings,
                                         coords={"ePerusteet": identifiers},
                                         dims=["ePerusteet", "embedding"])

    return esco_embeddings, eperusteet_embeddings
df["title"] = df['Name'].apply(lambda row: title_extractor(row)) print(df[["Name", "title"]].head(10)) df.title.value_counts() # Vectorizing text ## tf/idf - method for weighting of each word ### tf = term frequency (# of times word appears in doc) ### idf = inverse document frequency (# of docs / total # of docs containing term t) # Source: https://www.kaggle.com/edchen/tf-idf from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer() corpus = [ 'the brown fox jumped over the brown dog', 'the quick brown fox', 'the brown brown dog', 'the fox ate the dog' ] X = vectorizer.fit_transform(corpus) print(vectorizer.get_feature_names()) print(X.toarray()) print(X.shape) print(tfidf.vocabulary_) # dict w/ vocabulary and index value of each vocab = {v: k for k, v in tfidf.vocabulary_.items()}
# Build a vectorization model
ModeloVectorizacion = TfidfVectorizer(analyzer='word',
                                      ngram_range=(1, 2),
                                      min_df=0.003,
                                      max_df=0.5,
                                      max_features=5000,
                                      stop_words=stopwords_list)

# Turn the elements into a list
item_ids = articulos['contentId'].tolist()

# Fit the vectorization model on the title and article-text columns; the
# matrix holds the occurrence weights of the words. Join with a space so the
# last title token and the first text token do not merge (the original
# concatenated with an empty string).
tfidf_matrix = ModeloVectorizacion.fit_transform(articulos['title'] + " " + articulos['text'])
tfidf_feature_names = ModeloVectorizacion.get_feature_names()
print(tfidf_feature_names)

## Model evaluation
# Cross-validation with a 20% holdout
interactions_train_df, interactions_test_df = train_test_split(
    dataSetInteracciones,
    stratify=dataSetInteracciones['personId'],
    test_size=0.20,
    random_state=42)
# print(interactions_train_df.head())

## User profile design
# Get the profile of the items
def getPerfilObjeto(item_id):
from sklearn import tree
clf = tree.DecisionTreeClassifier()

print len(features_train[0])

t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time() - t0, 3), "s"

t0 = time()
pred = clf.predict(features_test)
print "predict time:", round(time() - t0, 3), "s"

t0 = time()
from sklearn.metrics import accuracy_score
score = accuracy_score(labels_test, pred)
print score
print "score time:", round(time() - t0, 3), "s"

# report every feature with importance above 0.2 and its word
for i, f in enumerate(clf.feature_importances_):
    if f > 0.2:
        print i
        print f
        print vectorizer.get_feature_names()[i]

print clf.score(features_test, labels_test)
import numpy as np
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC

newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])

transformer = TfidfVectorizer()
transformed = transformer.fit_transform(newsgroups.data)
feature_mapping = transformer.get_feature_names()
for i in feature_mapping:
    print(i)

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy',
                  cv=cv, return_train_score=True)
gs.fit(transformed, newsgroups.target)

clf._get_coef()

# for i in gs.cv_results_:
#     print(i.mean_validation_score)
#     print(i.parameters)
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
def get_vocabulary(self, linked_pages, categories_links):
    """Scrape wiki pages to build a vocabulary for each category"""
    total_vocabulary = {}
    unique_vocabulary = []
    unique_vocabulary_tfidf = []

    # For each category
    for parent, pages in linked_pages.items():
        children_pages = []
        downloaded_pages = []

        # For every page linked to this category on Wiki
        for page in pages:
            sys.stdout.write('\t{} / {} pages downloaded for [{}] category.\r'.format(len(children_pages), len(pages), parent))
            sys.stdout.flush()

            # Get data
            wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(page)
            data = requests.get(wiki_url)
            data_soup = BeautifulSoup(data.text, 'html.parser')
            paragraphs = [str(paragraph) for paragraph in data_soup.find_all('p')]
            paragraphs_joined = ' '.join(paragraphs)

            # Clean, tokenize, lemmatize and rebuild the document
            page_vocabulary = []
            cleaned_data = self.clean_xml(text=paragraphs_joined.strip())
            tokenized_data = self.tokenizer.tokenize(cleaned_data)
            for token in tokenized_data:
                if token.lower() not in self.stopwords:
                    word = self.lemmatizer.lemmatize(token.lower())
                    # Check that the word is a correct English word
                    if self.english_dict.check(word) is True:
                        page_vocabulary.append(word)
                        # Track total vocabulary
                        if word not in unique_vocabulary:
                            unique_vocabulary.append(word)
                    # Levenshtein distance could correct misspellings here, but it would be slow
            page_nlp_treated = ' '.join(page_vocabulary)
            if len(children_pages) >= self.configuration['options']['pages_per_category'] or len(children_pages) == len(pages):
                break
            else:
                children_pages.append(page_nlp_treated)
                downloaded_pages.append(page)
            # Wikipedia is cool, be cool with their servers.
            time.sleep(self.configuration['options']['waiting_time'])

        # StdOut summary
        print('\n\t\t- ' + '\n\t\t- '.join(downloaded_pages))

        # TF-IDF for the vocabulary of each category, keeping the top scores
        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0, stop_words=self.stopwords)
        try:
            tfidf_matrix = tf.fit_transform(children_pages)
        except ValueError:
            # In case of an old empty page
            continue
        feature_names = tf.get_feature_names()
        dense = tfidf_matrix.todense()
        episode = dense[0].tolist()[0]
        phrase_scores = [pair for pair in zip(range(0, len(episode)), episode) if pair[1] > 0]
        sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
        category_words = []
        for word, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:self.configuration['options']['word_per_page']]:
            category_words.append({word: score})
            if word not in unique_vocabulary_tfidf:
                unique_vocabulary_tfidf.append(word)

        # Get categories linked to this category
        linked_categories = []
        for relation in relations:
            if relation[0] == parent and relation[0] not in linked_categories:
                linked_categories.append(relation[1])
            if relation[1] == parent and relation[1] not in linked_categories:
                linked_categories.append(relation[0])

        # Get pages linked to this category
        for category, pages in linked_pages.items():
            if category == parent:
                linked_pages_to_category = pages

        category_details = {}
        category_details['terminology'] = category_words
        category_details['linked_pages_to_category'] = linked_pages_to_category
        category_details['linked_categories'] = linked_categories
        total_vocabulary[parent] = category_details

    # Statistics about our terminology
    print('\nA total of {} words have been scanned to extract {} important words covering {} categories.'.format(len(unique_vocabulary), len(unique_vocabulary_tfidf), len(linked_pages)))
    return total_vocabulary
from sklearn.feature_extraction.text import TfidfVectorizer

# http://stackoverflow.com/questions/23792781/
# tf-idf-feature-weights-using-sklearn-feature-extraction-text-tfidfvectorizer
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

corpus = ["This is very strange",
          "This is very nice",
          "This is a flower"]

vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_
d = dict(zip(vectorizer.get_feature_names(), idf))
for key in d:
    print("{} = {}".format(key, d[key]))
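# A small follow-up sketch (not in the original): sorting the same idf
# dictionary to see which n-grams are most document-specific (highest idf)
# versus shared across the corpus (lowest idf).
for term, score in sorted(d.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print("{} = {}".format(term, score))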
### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] ### append the text to word_data word_data.append(text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara": from_data.append(0) else: from_data.append(1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") ) pickle.dump( from_data, open("your_email_authors.pkl", "w") ) print word_data[152] ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer(stop_words="english") transformed_word_data = vectorizer.fit_transform(word_data) print "count of words: ", len(vectorizer.get_feature_names()) print "word 34597: ", vectorizer.get_feature_names()[34597]
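# A hedged follow-up sketch (not in the original script): the tf-idf matrix
# built above is very wide, and a common next step for an email corpus like
# this is univariate feature selection; percentile=10 is an illustrative choice.
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=10)
selected = selector.fit_transform(transformed_word_data, from_data)
print("selected shape: {}".format(selected.shape))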
movies.genres = movies.genres.str.split('|')
movies.head()

# Cast the genre lists back to plain strings so the vectorizer can consume them
movies.genres = movies.genres.fillna("").astype('str')
movies.head()

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape
tfidf_matrix
print(tf.get_feature_names())

"""## Cosine Similarity"""

from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(tfidf_matrix)
sim.shape
sim[:4, :4]

"""## Predictions"""

# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])
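# Hedged sketch of a recommendation lookup built on the sim matrix, titles and
# indices above; the function name and top-20 cut-off are illustrative, not
# from the original notebook.
def genre_recommendations(title, top_n=20):
    idx = indices[title]
    scores = list(enumerate(sim[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    movie_indices = [i for i, _ in scores[1:top_n + 1]]  # skip the movie itself
    return titles.iloc[movie_indices]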
def getFeature(foldername):
    filenamelist = []
    # foldername = 'ratings2020'
    for subdir, dirs, files in os.walk(foldername):
        for file in os.listdir(subdir):
            filepath = subdir + os.sep + file
            filepath = re.sub(r"\\", "/", filepath)  # re.sub returns, it does not modify in place
            if ".csv" in filepath:
                filenamelist.append(filepath)

    # ----------------> Merging all the data in one csv
    df_merged = (pd.read_csv(filepath_or_buffer=file,
                             sep=',',
                             encoding='utf-16',
                             error_bad_lines=False,
                             engine='python') for file in filenamelist)
    df_merged = pd.concat(df_merged, ignore_index=True)
    df_merged.to_csv("merged.csv")
    df_merged.columns = [
        column.replace(" ", "_") for column in df_merged.columns
    ]
    df = df_merged[[
        "Star_Rating", "Reviewer_Language", "Review_Text", "App_Version_Code"
    ]]
    pd.set_option('mode.chained_assignment',
                  None)  # to silence SettingWithCopyWarning
    df['Positively_Rated'] = np.where(df['Star_Rating'] >= 3, 1, 0)

    # @@@@@@@@@@@@@@@@@@@ UI FEATURE 1: @@@@@@@@@@@@@@@@@@@@@@@@@@
    total_rating = len(df['Star_Rating'])
    pd.set_option('mode.chained_assignment', None)
    df.dropna(inplace=True, how='any')
    total_reviews = len(df['Review_Text'])

    # In version 1.0, we'll be checking only English reviews...
    df = df[df.Reviewer_Language == 'en']

    # Positive/negative count and proportion for the latest version
    latest_version = max(df["App_Version_Code"])
    VrsnRating = df[df.App_Version_Code ==
                    latest_version].Positively_Rated.mean()
    VrsnRating = round(VrsnRating * 100, 2)

    ########## DATA CLEANING ##########
    df['Review'] = df['Review_Text'].apply(lambda x: x.lower())
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(r"\W", " ", x))  # non-word characters
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(r"\d", " ", x))  # removing digits
    df['Review'] = df['Review'].apply(
        lambda x: re.sub("([^\x00-\x7F])+", " ", x))  # removing emojis
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(' \w{1,4} ', ' ', x))  # removing short (1-4 char) words
    df['Review'] = df['Review'].apply(lambda x: re.sub(r"\s+", " ", x))
    df['Review'] = lemma(df['Review'])
    df['Review'] = df['Review'].apply(stp)
    nan_value = float("NaN")
    df.replace("", nan_value, inplace=True)
    df.dropna(inplace=True)
    df.isnull()
    df['Review'] = tagme(df['Review'])
    sid = SentimentIntensityAnalyzer()
    df["sentiments"] = df["Review_Text"].apply(lambda x: sid.polarity_scores(
        x))  # 'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': ...
    df = pd.concat(
        [df.drop(['sentiments'], axis=1), df['sentiments'].apply(pd.Series)],
        axis=1)

    # add number of characters column
    df["nb_chars"] = df["Review_Text"].apply(lambda x: len(x))

    # add number of words column
    df["nb_words"] = df["Review_Text"].apply(lambda x: len(x.split(" ")))

    documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(df["Review"].apply(
            lambda x: str(x).split(" ")))
    ]

    # train a Doc2Vec model with our text data
    model = Doc2Vec(documents, vector_size=30, window=2, min_count=1, workers=4)

    # transform each document into a vector data
    doc2vec_df = df["Review"].apply(
        lambda x: model.infer_vector(str(x).split(" "))).apply(pd.Series)
    doc2vec_df.columns = [
        "doc2vec_vector_" + str(x) for x in doc2vec_df.columns
    ]
    df = pd.concat([df, doc2vec_df], axis=1)

    # tagme returned (word, tag) pairs; keep only the words
    corpus = []
    for sentences in df["Review"]:
        corpus.append([word for word, tag in sentences])

    df['cln_Reviews'] = [" ".join(review) for review in corpus]

    # add tf-idf columns
    tfidf = TfidfVectorizer(
        min_df=5)  # ignore terms appearing in fewer than 5 documents
    tfidf_result = tfidf.fit_transform(df["cln_Reviews"]).toarray()
    tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
    tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
    tfidf_df.index = df.index
    reviews_df = pd.concat([df, tfidf_df], axis=1)

    wrdcldimg = show_wordcloud_fn(corpus)

    best_negsentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "neg", ascending=False)[["Review_Text"]].head()
    #best_negsentences = reviews_df.sort_values("neg", ascending=False)[["Review_Text"]].head()
    best_negsentences = best_negsentences.to_string(index=False)
    pos_best_sentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "pos", ascending=False)[["Review_Text"]].head()
    #pos_best_sentences = reviews_df.sort_values("pos", ascending=False)[["Review_Text"]].head()
    pos_best_sentences = pos_best_sentences.to_string(index=False)
    # apprtngimg = appvsrating(reviews_df)

    return (best_negsentences, pos_best_sentences, total_rating,
            total_reviews, VrsnRating, latest_version, wrdcldimg)
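# Illustrative call of the pipeline above; the folder name is taken from the
# commented-out default inside getFeature and is an assumption.
(best_neg, best_pos, total_rating, total_reviews,
 VrsnRating, latest_version, wrdcldimg) = getFeature('ratings2020')
print(best_neg)
print(best_pos)
print('Positively rated share of version {}: {}%'.format(latest_version, VrsnRating))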
data.columns = ['labels', 'texts']

# Explore the dataset
print('Out of {} rows, {} are spam, {} are ham'.format(
    len(data), len(data[data['labels'] == 'spam']), len(data[data['labels'] == 'ham'])))

# Check the number of missing data
print('Number of null in labels: {} and number of null in texts: {}'.format(
    data['labels'].isnull().sum(), data['texts'].isnull().sum()))

# stopwords removal
stopwords = nltk.corpus.stopwords.words('english')

# WordNetLemmatizer
wm = nltk.WordNetLemmatizer()

# pre-processing data
def data_clean(texts):
    text = "".join([char for char in texts if char not in string.punctuation])
    tokens = re.split(r'\W+', text)  # was 'W+', which splits on the literal letter W
    text = [wm.lemmatize(word) for word in tokens if word not in stopwords]
    return text

data['cleaned_text'] = data['texts'].apply(lambda x: data_clean(x.lower()))

# Vectorizing: with analyzer=data_clean the vectorizer does its own cleaning,
# so it is fit on the raw texts rather than on the pre-cleaned token lists
tfidf_vect = TfidfVectorizer(analyzer=data_clean)
X_tfidf = tfidf_vect.fit_transform(data['texts'])
print(X_tfidf.shape, tfidf_vect.get_feature_names())
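# A minimal follow-up sketch (the train/test split and Naive Bayes are
# assumptions, not part of the original script) showing how the vectorized
# matrix could feed a spam/ham classifier:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, data['labels'], test_size=0.2, random_state=42)
clf = MultinomialNB().fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))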
tv = TfidfVectorizer(min_df=0., max_df=1., norm="l2", use_idf=True, smooth_idf=True) tv_train_features = tv.fit_transform(train_corpus) tv_test_features = tv.transform(test_corpus) print('TF-IDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape) # - tv_matrix = tv_train_features.toarray() vocab = tv.get_feature_names() pd.DataFrame(np.round(tv_matrix, 2), columns=vocab) # ### ML algorithms on TF-IDF model import time import warnings warnings.filterwarnings('ignore') from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score from sklearn.model_selection import cross_val_predict from sklearn.model_selection import cross_val_score from sklearn.model_selection import cross_validate # + from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import LogisticRegression
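# Hedged sketch of the "ML algorithms on TF-IDF model" step the imports above
# prepare for; `train_label_nums` is an assumed name for the training targets,
# not defined in this excerpt.
for model in [MultinomialNB(), LogisticRegression(max_iter=1000)]:
    scores = cross_val_score(model, tv_train_features, train_label_nums, cv=5)
    print(type(model).__name__, 'mean CV accuracy: %.4f' % scores.mean())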
class KeywordsGenerator(BaseEstimator, TransformerMixin):
    """Class to extract a list of keywords from text.
    It is compatible with the scikit-learn API (i.e. contains fit, transform
    methods).

    Parameters
    ----------
    max_tfidf_features : int, optional
        Size of vocabulary for tfidf.
        Default value, 10000.

    keywords : list, optional
        Keywords to be extracted with priority.
        Default value, "keywords" list defined in conf file.

    stopwords : list, optional
        Stopwords not to be extracted.
        Default value, "names" and "stopwords" lists defined in conf file.

    resample : bool, optional
        True if dataset must be resampled according to class distribution,
        else False.
        Default value, False.

    n_jobs : int, optional
        Number of cores used for computation.
        Default value, 20.

    copy : bool, optional
        Make a copy of DataFrame.
        Default value, True.

    n_max_keywords : int, optional
        Maximum number of keywords to be returned.
        Default value, 6.

    n_min_keywords : int, optional
        Minimum number of keywords to be returned.
        Default value, 0.

    threshold_keywords : float, optional
        Minimum tf-idf score for a word to be selected as keyword.
        Default value, 0.0.

    n_docs_in_class : int, optional
        Number of documents in each class.
        Default value, 100.

    keywords_coef : int, optional
        Coefficient multiplied with the tf-idf scores of each keyword.
        Default value, 10.

    Attributes
    ----------
    max_tfidf_features, keywords, stopwords, resample, n_jobs, progress_bar,
    copy, n_max_keywords, n_min_keywords, threshold_keywords, n_docs_in_class,
    keywords_coef,

    tfidf_vectorizer : TfidfVectorizer instance from sklearn,

    dict_scores_ : dictionary,
        Tf-idf scores for each token.

    max_score_ : np.array,
        Maximum idf score over the fitted vocabulary.

    Examples
    --------
    >>> from melusine.summarizer.keywords_generator import KeywordsGenerator
    >>> keywords_generator = KeywordsGenerator()
    >>> keywords_generator.fit(X, y)
    >>> keywords_generator.transform(X)
    >>> print(X['keywords'])

    """

    def __init__(self,
                 max_tfidf_features=10000,
                 keywords=keywords,
                 stopwords=stopwords,
                 resample=False,
                 n_jobs=20,
                 progress_bar=True,
                 copy=True,
                 n_max_keywords=6,
                 n_min_keywords=0,
                 threshold_keywords=0.0,
                 n_docs_in_class=100,
                 keywords_coef=10):
        self.max_tfidf_features_ = max_tfidf_features
        self.tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features)
        self.keywords = keywords
        self.stopwords = stopwords
        self.resample = resample
        self.n_jobs = n_jobs
        self.progress_bar = progress_bar
        self.copy = copy
        self.n_max_keywords = n_max_keywords
        self.n_min_keywords = n_min_keywords
        self.threshold_keywords = threshold_keywords
        self.n_docs_in_class = n_docs_in_class
        self.keywords_coef = keywords_coef

    def fit(self, X, y=None):
        """Fit the weighted tf-idf model with input data.

        If the resample attribute is True the dataset will be resampled
        according to class distribution.

        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, n_features)
            X must contain ['tokens'] column.

        y : Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
""" if self.resample: X_resample = self.resample_docs(X, y) else: X_resample = X X_resample['tokens'] = X_resample['tokens'].apply(self._remove_stopwords) # fit tf-idf on resample data set tokens_joined = X_resample['tokens'].apply(lambda x: ' '.join(x)) self.tfidf_vectorizer.fit(tokens_joined) # modify the idf weights given frequency in the corpus idf_weights = self._add_tf_to_idf(X_resample) self.tfidf_vectorizer._tfidf._idf_diag = sp.spdiags(idf_weights, diags=0, m=len(idf_weights), n=len(idf_weights)) # return vetorizer with binary term frequency atribute self.dict_scores_ = dict(zip(self.tfidf_vectorizer.get_feature_names(), self.tfidf_vectorizer.idf_)) self.max_score_ = np.max(self.tfidf_vectorizer.idf_) return self def transform(self, X): """Returns list of keywords in apparition order for each document with the weighted tf-idf already fitted. Parameters ---------- X : pandas.DataFrame, shape (n_samples, n_features) X must contain ['tokens'] column. Returns ------- X_new : pandas.DataFrame, shape (n_samples, n_components) """ if self.copy: X_ = X.copy() else: X_ = X X_['keywords'] = apply_by_multiprocessing(df=X_[['tokens']], func=self.get_keywords, axis=1, workers=self.n_jobs, progress_bar=self.progress_bar) return X_ def get_keywords(self, row): """Returns list of keywords in apparition order with the weighted tf-idf already fitted. Parameters ---------- row : row of pd.Dataframe, columns ['tokens'] Returns ------- list of strings """ tokens = self._remove_stopwords(row['tokens']) tokens = [x for x in tokens if not x.isdigit()] scores = Counter({t: self.dict_scores_.get(t, 0) for t in tokens}) n = sum(i > self.threshold_keywords for i in list(scores.values())) n = min(n, self.n_max_keywords) n = max(n, self.n_min_keywords) keywords = [x[0] for x in scores.most_common(n)] index_sorted = [(k, tokens.index(k)) for k in keywords if k in tokens] index_sorted = sorted(index_sorted, key=lambda x: x[1]) keywords_sorted = [i[0] for i in index_sorted] return keywords_sorted def resample_docs(self, X, y=None): """Method for resampling documents according to class distribution.""" X_ = X.copy() if y is not None: X_['label'] = y X_['split'] = 0 for c in X_.label.unique(): N_c = X_[X_["label"] == c].shape[0] I_c = np.random.randint(0, self.n_docs_in_class+1, N_c) X_.loc[X_["label"] == c, 'split'] = I_c X_resample = pd.DataFrame( X_[['label', 'split', 'tokens']] .groupby(['label', 'split'], as_index=False)['tokens'] .sum() ) return X_resample def _remove_stopwords(self, tokens): """Method to filter stopwords from potential list of keywords.""" return [t for t in tokens if t not in self.stopwords] def _add_tf_to_idf(self, X): """Returns the tf-idf weights of each tokens""" tokens_joined = X['tokens'].apply(lambda x: ' '.join(x)) X_vec = self.tfidf_vectorizer.transform(tokens_joined) feature_names = self.tfidf_vectorizer.get_feature_names() idf_weights = self._get_weights(X_vec.toarray(), self.keywords, feature_names) return idf_weights def _get_weights(self, X_vec, keywords_list, feature_names): """Put max weights for each word of redistributed mails.""" max_ = np.max(X_vec, axis=0) mmax_ = np.max(max_) for k in keywords_list: if k in feature_names: max_[feature_names.index(k)] = mmax_ * self.keywords_coef return max_
files = os.listdir(gen_path + category) files = [file for file in files if file.find('txt') > 0] for name in files: path = gen_path + category + '/' + name with open(path, 'r') as f: data = f.read() text.append(clean_str(data)) cat_list.append(start) start += 1 result = np.zeros((1, len(cat_list)), dtype=np.int) result = result.tolist()[0] tfidf = TfidfVectorizer(strip_accents='ascii') sparse_matrix = tfidf.fit_transform(text) vocab = tfidf.get_feature_names() print type(sparse_matrix) print sparse_matrix.shape print 'Vocabulary Loaded:' pkl_file = open('/Users/HENGJIE/Desktop/text repo/bbcsport/w2v_bbc.pkl', 'rb') w2v = cPickle.load(pkl_file) pkl_file.close() pkl_file = open('/Users/HENGJIE/Desktop/text repo/bbcsport/glove_bbc.pkl', 'rb') glove = cPickle.load(pkl_file) pkl_file.close() # w2v = load_bin_vec('/Users/HENGJIE/Desktop/FYP Python/wv_google.bin',vocab)
# (reconstructed opening: the excerpt began mid-call; v is the tf-idf
# vectorizer used throughout below. norm=None disables normalisation, the
# documented replacement for the old norm='' hack.)
v = TfidfVectorizer(use_idf=True,
                    min_df=1,
                    smooth_idf=True,
                    norm=None)

base = pd.read_csv("films.csv")

# add new empty columns for the tfidf values
base['tfidf1'] = 0
base['tfidf2'] = 0
base['tfidf3'] = 0
base['tfidf4'] = 0

# compute the idf values
x = v.fit_transform(base.loc[:, 'storyline'].values.astype('U'))
idf = v.idf_

# build a dictionary of the form -> token : idf value
dictineri = dict(zip(v.get_feature_names(), idf))

for i, row in base.iterrows():
    accStoryline = list(map(lambda x: x.lower(), row['storyline'].split()))
    trol = dict()

    # accumulate the tfidf values per token into trol
    for accWord in accStoryline:
        foo = accWord.replace('.', '')
        if foo in dictineri:
            if foo in trol:
                trol[foo] += dictineri[foo]
            else:
                trol[foo] = dictineri[foo]

    # normalise trol with the natural log
    # (note: this loop variable v shadows the vectorizer defined above)
    for k, v in trol.items():
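        # Hedged completion sketch of the truncated loop (the original body is
        # missing from this excerpt): log-scale each accumulated score, per the
        # "natural log" comment; assumes `import math` at the top of the script.
        trol[k] = math.log(v) if v > 0 else 0.0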
    for word in bagOfWordsA:
        numOfWordsA[word] += 1
    return numOfWordsA

# array_negative=count_vectorizer.fit_transform(data_negative.splitlines())
# array_positive=count_vectorizer.fit_transform(data_positive.splitlines())
# tfidf_negative = tfidf_vector.fit_transform(data_negative.splitlines())
# tfidf_positive = tfidf_vector.fit_transform(data_positive.splitlines())
# print(tfidf_negative)

data = open("test.txt").read().splitlines()
tf_idf = tfidf_vector.fit_transform(data)
print(tfidf_vector.get_feature_names())

# build the label array: 0 = negative line, 1 = positive line
array_train = []
for i in data_negative.splitlines():
    array_train.append(0)
for i in data_positive.splitlines():
    array_train.append(1)

print(tf_idf.toarray())

# X = count_vectorizer.fit_transform(data)
# tfidf_vector
# print(count_vectorizer.vocabulary_)
# array=X.toarray()
# print(array)
# for x in array:
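# Hedged sketch (not part of the original script) tying the pieces together:
# stack the negative and positive lines in the same order as array_train,
# refit the vectorizer on them, and fit a simple classifier; Naive Bayes is an
# assumption.
from sklearn.naive_bayes import MultinomialNB

all_lines = data_negative.splitlines() + data_positive.splitlines()
X_train = tfidf_vector.fit_transform(all_lines)
clf = MultinomialNB().fit(X_train, array_train)
print(clf.predict(tfidf_vector.transform(["what a great movie"])))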