def textExtraction(df, series):
    vectorizer = CountVectorizer(analyzer=text_process, min_df=0.1)
    df[series] = df[series].replace(np.nan, '', regex=True)
    vectorizer.fit_transform(df[series])
    vocab = vectorizer.get_feature_names()
    return vocab
def tokenize(self, analyzer='word', ngram_range=(1, 1)):
    text = self.tweet['text']
    if text:
        vec = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range, lowercase=False)
        vec.fit_transform([text])
        self.tw_features = self.tw_features.union(set(vec.get_feature_names()))
def score(self, curr_example):
    # pdb.set_trace()
    processed_example = self.add_start_end_tokens(curr_example)
    example_length = len(processed_example.split())
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3),
                                         min_df=1,
                                         max_df=1.0,
                                         lowercase=True,
                                         analyzer="word",
                                         token_pattern=self.VECTORIZER_TOKEN_PATTERN)
    trigram_vectorizer.fit_transform([processed_example])
    trigram_count_matrix = trigram_vectorizer.transform([processed_example])
    score = 0
    for gram, count in zip(trigram_vectorizer.get_feature_names(),
                           np.asarray(trigram_count_matrix.sum(axis=0)).ravel()):
        score += count * math.log(self.ngrams_dict[gram] + 1)
        leading_bigram = self.get_leading_bigram(gram)
        score -= count * math.log(self.ngrams_dict[leading_bigram] + self.training_vocab_size)
    ## Calculate perplexity for scoring
    exponent = -float(1) / example_length
    pp_score = math.pow(math.exp(score), exponent) if math.exp(score) > 0.0 else float('+inf')
    return pp_score
def get_features_by_wordbag():
    global max_features
    x_train, x_test, y_train, y_test = load_all_files()
    vectorizer = CountVectorizer(decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print vectorizer
    x_train = vectorizer.fit_transform(x_train)
    x_train = x_train.toarray()
    vocabulary = vectorizer.vocabulary_
    vectorizer = CountVectorizer(decode_error='ignore',
                                 strip_accents='ascii',
                                 vocabulary=vocabulary,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print vectorizer
    x_test = vectorizer.fit_transform(x_test)
    x_test = x_test.toarray()
    return x_train, x_test, y_train, y_test
def TFIDF():
    global segcont
    global weight
    global we
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont))
    word = vectorizer.get_feature_names()  # keywords of all the documents
    weight = tfidf.toarray()               # the corresponding tf-idf matrix
    del segcont
    seg = []
    for i in range(len(weight)):
        enstr = ""
        for j in range(len(word)):
            if weight[i][j] >= 0.1:
                enstr = enstr + " " + word[j]
        seg.append(enstr)
    del weight
    vec = CountVectorizer()
    tra = TfidfTransformer()
    tidf = tra.fit_transform(vec.fit_transform(seg))
    wo = vec.get_feature_names()
    we = tidf.toarray()
def wordexist():
    corpus = [
        'UNC played Duke in basketball',
        'Duke lost the basketball game'
    ]
    vectorizer = CountVectorizer()
    print vectorizer.fit_transform(corpus).todense()
    print vectorizer.vocabulary_  # vocabulary table
def createCorpus(data, i, binaryX="False", stopWords=None, lemmatize="False", tfidf="False", useidf="True"):
    # will vectorize BOW using frequency as the parameter and will return the required arrays
    X_train = []
    X_test = []
    Y_train = []
    Y_test = []
    for key in data:
        if key in i:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port = WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k, "v") for k in text.split()])
                X_test.append(text)
                Y_test.append(data[key][filename][1])
        else:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port = WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k, "v") for k in text.split()])
                X_train.append(text)
                Y_train.append(data[key][filename][1])
    if tfidf == "False":
        vectorizer = CountVectorizer(min_df=1, binary=binaryX, stop_words=stopWords)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans, Y_test
    elif tfidf == "True":
        vectorizer = TfidfVectorizer(min_df=1, use_idf=useidf)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans, Y_test
def filter_corpora(corpora, num_top_wrds, skip=0):
    """Filter each inputted corpus down to only `num_top_wrds` words after `skip` words.

    Args:
    ----
        corpora: list of tuples
            (name (str), corpus (1d np.ndarray of strings)) pairs
        num_top_wrds: int
        skip (optional): int
            allows for looking at the second, third, fourth `num_top_wrds`

    Return:
    ------
        filtered_corpora: list of tuples
    """
    num_top_wrds += skip
    vectorizer = CountVectorizer(max_features=num_top_wrds, stop_words='english')

    filtered_corpora = []
    for name, corpus in corpora:
        vectorizer.fit_transform(corpus)
        most_common_wrds = vectorizer.get_feature_names()[skip:]
        filtered_corpora.append((name, most_common_wrds))

    return filtered_corpora
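# A minimal usage sketch for filter_corpora above, on hypothetical toy corpora (not part of the
# original project): each corpus is reduced to its `num_top_wrds` most frequent non-stop-words.
toy_corpora = [
    ('sports', ['the team won the game', 'the coach praised the team']),
    ('tech', ['the new phone has a fast chip', 'the chip powers the phone']),
]
print(filter_corpora(toy_corpora, num_top_wrds=3))
# -> [('sports', [... 3 words ...]), ('tech', [... 3 words ...])]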
class RandomForestRegressor(Regressor):

    def findImportantFeatures(self, numFeatures=500):
        self.features = []
        count = 0
        if self.isGroup:
            for key in sorted(self.trainSet.getVocabulary(),
                              key=lambda word: self.trainSet.getMI(word, self.trainSet),
                              reverse=True):
                self.features.append(key)
                count += 1
                if count == numFeatures:
                    self.minMI = self.trainSet.getMI(key, self.trainSet)
                    break
        else:
            for key in sorted(self.trainSet.getVocabulary(),
                              key=lambda word: math.fabs(self.trainSet.getUniqueWeightOf(word)),
                              reverse=True):
                self.features.append(key)
                count += 1
                if count == numFeatures:
                    self.minMI = self.trainSet.getUniqueWeightOf(key)
                    break

    def train(self, numFeatures=500):
        self.findImportantFeatures(numFeatures)
        self.regressor = RFR()
        self.vectorizer = CountVectorizer(vocabulary=self.features, min_df=1)
        strings = []
        Y = []
        for docKey in self.trainSet.getDocuments():
            document = self.trainSet.getDocument(docKey)
            strings.append(" ".join(document.getBagOfWords2("all")))
            Y.append(document.getSalary())
        X = self.vectorizer.fit_transform(strings).toarray()
        self.regressor.fit(X, Y)

    def predict(self, document):
        strings = []
        strings.append(" ".join(document.getBagOfWords2("all")))
        # the vocabulary is fixed, so transform() is enough at prediction time
        Z = self.vectorizer.transform(strings).toarray()
        return self.regressor.predict(Z)[0]
class SVM(Classifier):

    def findImportantFeatures(self, numFeatures=500):
        self.features = []
        count = 0
        for key in sorted(self.trainingSet.getVocabulary(),
                          key=lambda word: self.trainingSet.getUniqueWeightOf(word),
                          reverse=True):
            self.features.append(key)
            count += 1
            if count == numFeatures:
                break

    def train(self, numFeatures=500):
        self.findImportantFeatures(numFeatures)
        self.classifier = svm.LinearSVC(C=5.0, dual=True, verbose=0)
        self.vectorizer = CountVectorizer(vocabulary=self.features, min_df=1)
        strings = []
        Y = []
        for docKey in self.trainingSet.getDocuments():
            document = self.trainingSet.getDocument(docKey)
            strings.append(" ".join(document.getBagOfWords2("all")))
            Y.append(document.getGroup().getKey())
        X = self.vectorizer.fit_transform(strings)
        self.classifier.fit(X, Y)

    def classify(self, document):
        strings = []
        strings.append(" ".join(document.getBagOfWords2("all")))
        Z = self.vectorizer.transform(strings)
        return self.classifier.predict(Z)[0]

    def classifyAll(self, testSet):
        strings = []
        for docKey in testSet.getDocuments():
            document = testSet.getDocument(docKey)
            strings.append(" ".join(document.getBagOfWords2("all")))
        Z = self.vectorizer.transform(strings)
        return self.classifier.predict(Z)
def cal_product_description_tfidf():
    # PART II: compute the tf-idf for product description
    global AllSet  # AllSet is rebound by the merge below, so it must be declared global
    print "\nBegins,compute the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into database..."
    AllSet = pd.merge(AllSet, product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description = AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)  # use binary values to indicate the frequency
    search_vect_descrip.fit(product_description)  # learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term)  # (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words='english')
    description_vect.fit_transform(product_description)  # learn the vocabulary
    description_fq_matrix = description_vect.transform(product_description)  # (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix)  # get the idf for each vocabulary term
    tf_idf_descrip_matrix = tfidf_transformer.transform(description_fq_matrix)  # get the tf-idf matrix

    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result = []  # compute the result of tf-idf for product description
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append(
            (np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],
                  "product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]

    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    #     'This is the first document.',
    #     'This is the second second document.',
    #     'And the third one.',
    #     'Is this the first document?',
    # ]
    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())

    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")

    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary_
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}

    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()

    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_

    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
class NERKNNClassifier(NERClassifier):

    def __init__(self, recbysns):
        NERClassifier.__init__(self, recbysns)
        self.knn = None
        self.vectorizer = None

    def train(self, entities):
        self.knn = KNeighborsClassifier(n_neighbors=10)
        self.vectorizer = CountVectorizer(
            analyzer=NEREntityAnalyzer(self.recbysns), max_df=1.0, min_df=2)
        self.vectorizer.fit_transform(entities)
        X = [self.generate_features(entity) for entity in entities
             if entity.pos() == u'title']
        Y = [entity.ner_class() for entity in entities
             if entity.pos() == u'title']
        self.knn = self.knn.fit(X, Y)

    def predict(self, entity):
        if entity.pos() == u'url':
            return NER_VIDEO
        else:
            X = [self.generate_features(entity)]
            return self.knn.predict(X)[0]

    def generate_features(self, entity):
        text_features = self.vectorizer.transform([entity]).toarray()[0].tolist()
        features = entity.features().values()
        return text_features + features
class SVMRegressor(Regressor):

    def findImportantFeatures(self, numFeatures=1000):
        # Selecting the important features
        self.features = []
        count = 0
        for key in sorted(self.trainSet.getVocabulary(),
                          key=lambda word: self.trainSet.getUniqueWeightOf(word),
                          reverse=True):
            count += 1
            self.features.append(key)
            if count == numFeatures:
                break

    def train(self, numFeatures=1000):
        self.findImportantFeatures(numFeatures)
        self.vectorizer = CountVectorizer(vocabulary=self.features, min_df=1)
        self.regressor = SVR(kernel='linear', C=25, epsilon=10)
        strings = []
        Y = []
        for docKey in self.trainSet.getDocuments():
            document = self.trainSet.getDocument(docKey)
            strings.append(" ".join(document.getBagOfWords2("all")))
            Y.append(document.getSalary())
        X = self.vectorizer.fit_transform(strings)
        self.regressor.fit(X, Y)
        Coef = self.regressor.coef_
        coef_list = Coef.toarray()
        # for i in range(len(coef_list[0])):
        #     if math.fabs(coef_list[0][i] - 0.0) > 0.1:
        #         print self.features[i], coef_list[0][i]

    def predict(self, document):
        strings = []
        strings.append(" ".join(document.getBagOfWords2("all")))
        # the vocabulary is fixed, so transform() is enough at prediction time
        Z = self.vectorizer.transform(strings)
        return self.regressor.predict(Z)[0]
def tfidf_step_by_step():
    """ Example of calculating TF-IDF for OSM nodes.
    Document is a list of keys.
    """

    learn_data_set = documents_gen()
    test_data_set = documents_gen()

    # calculate term frequency
    vectorizer = CountVectorizer(stop_words=stop_words,
                                 token_pattern='[a-z0-9_\-:]+')
    vectorizer.fit_transform(learn_data_set)
    # pprint.pprint(vectorizer.vocabulary_)

    # freq_term_matrix is a sparse matrix (elements stored in Coordinate format
    # http://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_.28COO.29 )
    freq_term_matrix = vectorizer.transform(test_data_set)
    # freq_term_matrix.todense()

    # l2 - Euclidean normalization
    # http://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)

    tf_idf = tfidf.transform(freq_term_matrix)

    pprint.pprint(tf_idf.todense())
def action(self, tweets_list):
    corpus = []
    for tweet in tweets_list:
        # corpus += [t["text"]]
        tweet_str = tweet["text"].encode("utf-8")
        tweet_str = unicode(tweet_str, 'utf-8')
        corpus.append(tweet_str)
    print(corpus)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    M, P = X.shape
    dist_corpus = euclidean_distances(X)
    stwf = stopwords.words('french')
    stwf.append('les')
    vectorizer = CountVectorizer(stop_words=stwf)
    X = vectorizer.fit_transform(corpus)
    dico = vectorizer.vocabulary_
    # All the prints are grouped here
    print("Results of Birch algorithm")
    clusters = birch_algo(X.toarray(), None)
    quit()
def cal_product_title_tfidf():
    # PART I: compute the tf-idf for product title
    print "\nBegins,compute the tf-idf for product title ..."

    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x: stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_title = CountVectorizer(stop_words='english', binary=True)  # use binary values to indicate the frequency
    search_vect_title.fit(product_title)  # learn the vocabulary
    search_title_fq_matrix = search_vect_title.transform(search_term)  # (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)  # learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title)  # (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix)  # get the idf for each vocabulary term
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix)  # get the tf-idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = []  # compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append(
            (np.multiply(tf_idf_title_matrix[index], search_title_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],
                  "product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)
    return 0
def improveVocabulary(positiveDocuments, negativeDocuments):
    countVectPos = CountVectorizer(min_df=0.1, stop_words='english')
    countVectNeg = CountVectorizer(min_df=0.1, stop_words='english')
    positiveCandidates = []
    negativeCandidates = []
    if len(positiveDocuments) > 0:
        try:
            countVectPos.fit_transform(positiveDocuments)
            positiveCandidates = countVectPos.get_feature_names()
        except:
            a = 1
            # print "count vector failed"
    if len(negativeDocuments) > 0:
        try:
            countVectNeg.fit_transform(negativeDocuments)
            negativeCandidates = countVectNeg.get_feature_names()
        except:
            a = 1
            # print "countvector failed"
    global listPos, listNeg, countDictPos, countDictNeg
    # pdb.set_trace()
    for candidate in (positiveCandidates + negativeCandidates):
        score = (getMapOutput(countVectPos.vocabulary_, candidate) -
                 getMapOutput(countVectNeg.vocabulary_, candidate))
        if score > 0 and score / getMapOutput(countVectPos.vocabulary_, candidate) >= 0.1:
            insertMap(listPos, candidate)
        elif score < 0 and abs(score) / getMapOutput(countVectNeg.vocabulary_, candidate) >= 0.1:
            insertMap(listNeg, candidate)
def make_week1_plot(df):
    vectorizer = CountVectorizer(stop_words='english',
                                 ngram_range=(1, 1),
                                 token_pattern='[A-Za-z]+')
    features = vectorizer.fit_transform(df.ingredient_txt)
    ## features is a document x term matrix.
    wc = feature_counts(vectorizer, features)

    ## plot of most common words:
    p1 = wc.sort('count').tail(20).plot('word', 'count', kind='bar')

    v2 = CountVectorizer(stop_words=get_stop_words(),
                         ngram_range=(1, 1),
                         token_pattern='[A-Za-z]+')
    f2 = v2.fit_transform(df.ingredient_txt)
    ## f2 is a document x term matrix.
    wc2 = feature_counts(v2, f2)

    ## plot of most common words:
    n = 50
    plt.figure(1)
    plt.subplot(211)
    p1 = wc.sort('count').tail(n).plot('word', 'count', kind='bar')
    plt.subplot(212)
    p2 = wc2.sort('count').tail(n).plot('word', 'count', kind='bar')
    plt.tight_layout()
    plt.savefig('fig-word-count-histograms.png')
def prepare_data(test_train_split, train_features_file, test_features_file):
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=1000)
    song_id = 0
    clean_train_songs = []
    clean_test_songs = []
    clean_train_labels = []
    clean_test_labels = []
    for genre in os.listdir(dirname):
        print("\nProcessing Genre: " + genre)
        songs = os.listdir(os.path.join(dirname, genre))
        num_songs = len(songs)
        song_index = 0  # Index of song within this genre
        for song in songs:
            print(song)
            with open(os.path.join(dirname, genre, song), 'r') as song_lyrics:
                lyrics = song_lyrics.read()
                words = process_song(lyrics)
                print 'NumSongs: %d SongIndex %d SongId %d' % (num_songs, song_index, song_id)
                if (song_index + 1) <= test_train_split * num_songs:
                    clean_train_songs.append(words)
                    clean_train_labels.append(genre)
                else:
                    clean_test_songs.append(words)
                    clean_test_labels.append(genre)
            song_index = song_index + 1
            song_id = song_id + 1

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
    # The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_songs)
    # transform() the test data only, so it is mapped onto the training vocabulary
    test_data_features = vectorizer.transform(clean_test_songs)

    # Numpy arrays are easy to work with, so convert the results to arrays
    train_data_features = train_data_features.toarray()
    test_data_features = test_data_features.toarray()
    print train_data_features.shape
    print test_data_features.shape

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the genre labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit(train_data_features, clean_train_labels)

    # Use the random forest to make genre predictions
    result = forest.predict(test_data_features)
    iteration = 0
    acc = 0
    for x in result:
        if x == clean_test_labels[iteration]:
            acc = acc + 1
        iteration = iteration + 1
    print 'Accuracy=%f' % (acc * 1.0 / len(result))
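# A generic sketch of the fit/transform split assumed throughout these snippets: the vectorizer
# learns its vocabulary from the training texts only, and test texts are mapped onto that same
# vocabulary with transform(), so both matrices share one feature space.
from sklearn.feature_extraction.text import CountVectorizer

train_texts = ["rock song about love", "metal song about rage"]
test_texts = ["a new love song"]
cv = CountVectorizer()
train_X = cv.fit_transform(train_texts)  # learns the vocabulary and encodes the training texts
test_X = cv.transform(test_texts)        # reuses the vocabulary; unseen words are dropped
print(train_X.shape[1] == test_X.shape[1])  # True: identical number of columns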
def makeFeatures():
    connection = happybase.Connection(MACHINE + '.vampire', table_prefix=VUID)
    table = connection.table(DCOG_TABLE)
    f_table = connection.table(DCOG_F_TABLE)

    genre_data = []
    style_data = []
    keys = []
    for key, d in table.scan():
        data = json.loads(d.itervalues().next())
        genres = data['genres']
        styles = data['styles']
        if (genres):
            genre_data.append(genres)
        else:
            genre_data.append(' ')
        if (styles):
            style_data.append(styles)
        else:
            style_data.append(' ')
        keys.append(key)

    # Vectorize genre word counts
    g_vectorizer = CountVectorizer(analyzer="word",
                                   tokenizer=None,
                                   preprocessor=None,
                                   stop_words=None)
    g_features = g_vectorizer.fit_transform(genre_data)
    g_features = g_features.toarray()

    # Vectorize style word counts
    s_vectorizer = CountVectorizer(analyzer="word",
                                   tokenizer=None,
                                   preprocessor=None,
                                   stop_words=None)
    s_features = s_vectorizer.fit_transform(style_data)
    s_features = s_features.toarray()

    # Create Key Vector
    k_arr = np.array(keys)
    k_arr.shape = (-1, 1)

    features = np.concatenate((k_arr, g_features, s_features), axis=1)

    b = f_table.batch()
    for row in features:
        data = row[1:]
        data = list(data.astype(int))
        b.put(row[0], {DCOG_F_COLUMN_FAMILY + ':' + DCOG_F_COLUMN: json.dumps(data)})
    b.send()
def train_test(args):
    # unpack arguments and make train/test data/label dicts/lists
    train, test, features, classifier = args

    # create a sparse feature matrix from the training data
    if features == 'tfidf':
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=1290)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'dict':
        fe = CountVectorizer(tokenizer=tokenize, stop_words='english', binary=True)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'lsa':
        svd = TruncatedSVD(n_components=100, random_state=42)
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.115, max_features=11500)
        trainfe = svd.fit_transform(fe.fit_transform(train['data']))
    elif features == 'rule':
        hamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        spamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        hamfit = hamfe.fit_transform(train['data'].loc[train['labels'] == 0])
        spamfit = spamfe.fit_transform(train['data'].loc[train['labels'] == 1])

    # train the selected classifier on the training data
    if classifier == 'mnb':
        from sklearn.naive_bayes import MultinomialNB
        clf = MultinomialNB().fit(trainfe, train['labels'])
    elif classifier == 'gnb':
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB().fit(trainfe.toarray(), train['labels'])
    elif classifier == 'svm':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='squared_hinge', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'log':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='log', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'rule':
        hamfeats = hamfe.transform(test['data'])
        spamfeats = spamfe.transform(test['data'])
        hyp = np.array(hamfeats.sum(axis=1) < spamfeats.sum(axis=1)).reshape(-1).T

    # extract features from test data
    if features == 'lsa':
        feats = svd.transform(fe.transform(test['data']))
    else:
        feats = fe.transform(test['data'])

    # use trained classifier to generate class predictions from test features
    if classifier == 'gnb':
        hyp = clf.predict(feats.toarray())
    elif classifier == 'rule':
        pass
    else:
        hyp = clf.predict(feats)

    # compare predictions with test labels
    score = np.mean(hyp == test['labels'])
    return score
def test_transformation(self):
    # TODO: Remove this function. Useless now
    train_set = ("The sky is blue.", "The sun is bright.")
    # test_set = ("The sun in the sky is bright.",
    #             "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer('english')
    count_vectorizer.fit_transform(train_set)
def get_features(text):
    count_vect = CountVectorizer(analyzer='char_wb', ngram_range=(1, 5),
                                 min_df=1, vocabulary=unique)
    if isinstance(text, list):
        x_train_counts = count_vect.fit_transform([abstract.lower() for abstract in text])
    else:
        x_train_counts = count_vect.fit_transform([text.lower()])
    tfidf_transformer = TfidfTransformer()
    return tfidf_transformer.fit_transform(x_train_counts)
def vocabulary(text):
    count = CountVectorizer(analyzer='word', ngram_range=(1, 1), stop_words='english')
    countTotal = CountVectorizer(analyzer='word', ngram_range=(1, 1))
    counter = count.fit_transform([text]).toarray()
    countT = countTotal.fit_transform([text]).toarray()
    matrix = np.zeros((1, 1))
    # fraction of the counted tokens that are English stop words
    # (total token count minus the non-stop-word count, over the total)
    matrix[0, 0] = (countT.sum() - counter.sum()) / float(countT.sum())
    return matrix
def setup(train, test, binaryOpt=False):
    count_vectorizer = CountVectorizer(binary=binaryOpt)
    count_vectorizer.fit_transform(train)
    freq_term_matrix = count_vectorizer.transform(test)
    if binaryOpt:
        return freq_term_matrix
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    return tf_idf_matrix
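# A short usage sketch for setup() above, on hypothetical toy documents: the vectorizer is
# fitted on the train documents only, and the test documents are projected onto that vocabulary
# before the optional tf-idf weighting.
train_docs = ["the sky is blue", "the sun is bright"]
test_docs = ["the sun in the sky is bright"]
tfidf_matrix = setup(train_docs, test_docs)         # l2-normalised tf-idf features
binary_matrix = setup(train_docs, test_docs, True)  # binary term-presence counts
print(tfidf_matrix.todense())
print(binary_matrix.todense())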
def bagofword(self):
    count_vectorizer = CountVectorizer()
    if self.all_wordlist:
        feature_vectors = count_vectorizer.fit_transform(self.all_wordlist)
        print(feature_vectors.toarray())
        self.all_bagofwords = feature_vectors
    if self.wordlist:
        feature_vectors = count_vectorizer.fit_transform(self.wordlist)
        print(feature_vectors.toarray())
        self.bagofwords = feature_vectors
def to_features(tweet):
    stop_words = ['iphone', 'ipod', 'ipad', 'mac', 'imac', 'rt', 'apple', 'amp']
    stop_words = ENGLISH_STOP_WORDS.union(stop_words)
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2), stop_words=stop_words)
    tweet = rm_usernames(rm_links(tweet))
    try:
        vectorizer.fit_transform([tweet])
        return vectorizer.get_feature_names()
    except ValueError:
        return ['']
def tfidf_normalize(articles_with_id):
    global NON_STOPWORD_LIMIT
    stemmed_articles_with_id = [(aid, stem_article(article)) for (aid, article) in articles_with_id]
    stemmed_articles = [article for (aid, article) in stemmed_articles_with_id]
    # test_set = train_set

    # instantiate the vectorizer for English, using stop words, the min_df/max_df parameters and the tokenizer
    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1,
                                 token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")

    # applying the vectorizer to the train set creates a vocabulary from all the words that
    # appear in at least min_df and in no more than max_df documents of the train_set
    vectorizer.fit_transform(stemmed_articles)

    # vectorizer.transform applies the vocabulary from the train set to the test set. In my case,
    # they are the same set: the whole Wikipedia. This means that each article gets a representation
    # based on the words from the vocabulary and their TF-IDF values in the SciPy sparse output matrix.
    freq_term_matrix = vectorizer.transform(stemmed_articles)

    long_articles_with_id = []
    assert freq_term_matrix.shape[0] == len(articles_with_id)
    for (i, article_with_id) in zip(xrange(freq_term_matrix.shape[0]), stemmed_articles_with_id):
        row = freq_term_matrix.getrow(i)
        if row.getnnz() >= NON_STOPWORD_LIMIT:
            long_articles_with_id.append(article_with_id)

    long_articles = [article for (aid, article) in long_articles_with_id]

    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1,
                                 token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")
    vectorizer.fit_transform(long_articles)
    freq_term_matrix = vectorizer.transform(long_articles)

    # Gabrilovich says that they threshold TF on 3 (remove a word-article association if that word
    # does not appear at least 3 times in that single article)
    # freq_term_matrix.data *= freq_term_matrix.data >= 3
    # freq_term_matrix.eliminate_zeros()  # I think this is not necessary...

    # this is a log transformation as applied in (Gabrilovich, 2009), i.e., that is
    # how he defines TF values. In case of TF = 0, this shall not affect such value
    # freq_term_matrix.data = 1 + np.log(freq_term_matrix.data)

    # instantiate the tf-idf transformer
    tfidf = TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=True)

    # tfidf uses the freq_term_matrix to calculate IDF values for each word (element of the vocabulary)
    tfidf.fit(freq_term_matrix)

    # finally, tfidf will calculate the TF-IDF values with transform()
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    # tf_idf_matrix.data = np.log(np.log(tf_idf_matrix.data))
    tf_idf_matrix = normalize(tf_idf_matrix, norm="l2", axis=0, copy=False)

    # now we put our matrix into CSC format (as it helps with accessing columns when inverting the
    # vectors to words' concept vectors)
    tf_idf_matrix = tf_idf_matrix.tocsc()

    # we need vocabulary_ to be accessible by the index of the word, so we invert the keys and values
    # of the dictionary and put them into the new dictionary word_index
    word_index = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems())

    M, N = tf_idf_matrix.shape
    print "Articles: ", M
    print "Words: ", N

    return tf_idf_matrix, word_index, long_articles_with_id
class TextSimilarity(object):
    '''
    classdocs
    '''

    def __init__(self, max_ngram=2, needStem=False):
        '''
        Constructor
        '''
        self.stemmer = PorterStemmer()
        if not needStem:
            self.vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, max_ngram))
        else:
            self.vectorizer = CountVectorizer(analyzer=self.AnalyseText, ngram_range=(1, max_ngram))
        self.stop = stopwords.words('english')

    def get_cos_similarity(self, text1, text2):
        tfidf = self.vectorizer.fit_transform([text1, text2])
        cos_sim = cosine_similarity(tfidf[0], tfidf[1])[0][0]
        return cos_sim

    def calculate_TF(self, sents):
        all_finished = True
        for sent in sents:
            if not hasattr(sent, 'tf'):
                all_finished = False
                break
        if all_finished:
            return
        texts = [sent.content for sent in sents]
        vectors = self.get_count_vector(texts)
        for i in xrange(len(sents)):
            sents[i].tf = vectors[i]

    def calculate_sentence_similarity(self, sent1, sent2):
        return self.get_similarity_from_vectors(sent1.tf, sent2.tf)

    def get_count_vector(self, texts):
        vectors = self.vectorizer.fit_transform(texts)
        return vectors

    def get_similarity_from_vectors(self, vector1, vector2):
        sim = cosine_similarity(vector1, vector2)[0][0]
        return sim

    def AnalyseText(self, doc):
        doc = doc.lower()
        doc = re.sub(r'[^a-z\d\s]', ' ', doc)
        doc = re.sub(r'\d', '#', doc)
        tokens = doc.split()
        stems = []
        for t in tokens:
            if len(t) < 2 or t in self.stop:
                continue
            stems.append(self.stemmer.stem(t))
        return stems
class Lsa(BaseModel):
    results_folder = Hyper.lsa_result

    def __init__(self, n_components=300):
        super(Lsa, self).__init__()
        self.svd = TruncatedSVD(n_components, random_state=42)
        self.vectorizer = CountVectorizer()

    @staticmethod
    def load():
        with open(Hyper.lsa_pickle, 'rb') as fd:
            return pickle.load(fd)

    def save(self):
        with open(Hyper.lsa_pickle, 'wb') as fd:
            pickle.dump(self, fd, protocol=4)

    def fit(self):
        print('create counter')
        new_corpus = self.vectorizer.fit_transform(self.corpus)
        print('fit lsa')
        self.svd.fit(new_corpus)
        print('save lsa')
        self.save()

    def process(self):
        with open(Hyper.processed_queries) as fd:
            queries = fd.readlines()
        query_to_docs = collections.defaultdict(list)
        with open(Hyper.sample_submission) as fd:
            fd.readline()
            for line in fd:
                line = line.strip().split(',')
                query_to_docs[int(line[0]) - 1].append(int(line[1]) - 1)

        queries_vec = list(map(lambda x: x.strip().split('\t')[1], queries))
        queries_vec = self.vectorizer.transform(queries_vec)
        corpus_vec = self.vectorizer.transform(self.corpus)
        queries_vec = self.svd.transform(queries_vec)
        corpus_vec = self.svd.transform(corpus_vec)

        model_results = ModelResult([])
        for query_id, doc_ids in tqdm.tqdm(query_to_docs.items()):
            sim = cosine_similarity(queries_vec[query_id].reshape(1, -1), corpus_vec[doc_ids])
            query_result = QueryResult(int(query_id) + 1, [])
            for i, doc_id in enumerate(doc_ids):
                doc_result = DocScore(doc_id + 1, sim[0][i])
                query_result.results.append(doc_result)
            model_results.queries.append(query_result)

        print('save results to {}'.format(self.results_folder))
        with open(self.results_folder, 'w') as fd:
            for query in model_results.queries:
                for doc in query.results:
                    fd.write('{}\t{}\t{}\n'.format(query.id, doc.id, doc.score))
        return model_results
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from time import time
from collections import Counter

train, test = train_test_split(data, test_size=0.3, random_state=42)

train_clean_tweet = []
for tweets in train['text']:
    train_clean_tweet.append(tweets)
test_clean_tweet = []
for tweets in test['text']:
    test_clean_tweet.append(tweets)

v = CountVectorizer(analyzer="word")
train_features = v.fit_transform(train_clean_tweet)
test_features = v.transform(test_clean_tweet)

Classifiers = [
    LogisticRegression(C=0.000000001, solver='liblinear', max_iter=200),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=200),
    AdaBoostClassifier(),
    GaussianNB()
]
dense_features = train_features.toarray()
dense_test = test_features.toarray()
Accuracy = []
Model = []
end_time = time.time()
Tools.flushPrint("tokenized in {} seconds".format(end_time - start_time))
start_time = time.time()

vectorizer = CountVectorizer(
    # so we can pass it strings
    input='content',
    # turn off preprocessing of strings to avoid corrupting our keys
    lowercase=False,
    preprocessor=lambda x: x,
    # use our token dictionary
    tokenizer=lambda x: x,
    min_df=3,
    max_df=0.8,
    max_features=20000)
vectorized_data = vectorizer.fit_transform(tokenized_data)
# Tools.flushPrint(vectorized_data[:10])
del tokenized_data

end_time = time.time()
Tools.flushPrint("vectorized in {} seconds".format(end_time - start_time))
start_time = time.time()

n_topics = 50
lda = trainLDA(vectorized_data, n_topics, max_iterations)
Tools.flushPrint(lda)
try:
    Tools.dillDump(os.path.join(output_data_dir, "lda_tag.pkl"), lda)
except:
    pass

end_time = time.time()
Tools.flushPrint("Trained LDA in {} Seconds".format(end_time - start_time))
# Opening the train and test files from tsv with separator \t
train = pd.read_csv("train.tsv", sep="\t")
test = pd.read_csv("test.tsv", sep="\t")

# Converting the reviews into a count matrix.
# The features are a bag of words with tokenization, removal of the <br /> tag, n-grams and mark_negation
count_vector = CountVectorizer(
    analyzer="word",  # required so the preprocessor/tokenizer overrides below are applied
    tokenizer=lambda text: mark_negation(word_tokenize(text)),  # override tokenization to mark negated words
    preprocessor=lambda text: text.replace("<br />", " "),  # override preprocessing to replace the br tag with a space
    ngram_range=(1, 3),  # use unigrams, bigrams and trigrams
)

# Fit to the data and transform it
train_counts = count_vector.fit_transform(train['Phrase'])

# Setting tf (without idf) and learning the idf vector
tf_transformer = TfidfTransformer(use_idf=False).fit(train_counts)
# Transforming the count matrix into a tf representation
train_tf = tf_transformer.transform(train_counts)

tfidf_transformer = TfidfTransformer()
# Fit to the data and transform it
train_tfidf = tfidf_transformer.fit_transform(train_counts)

# Fitting a Linear Support Vector Classification classifier on the tf-idf features
classifier = svm.LinearSVC().fit(train_tfidf, train['Sentiment'])
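# Illustration of the custom tokenizer used above (an assumption added for clarity, not part of
# the original script): nltk's mark_negation() suffixes tokens that fall between a negation word
# and the next punctuation mark with _NEG, so negated phrases become distinct features.
from nltk.sentiment.util import mark_negation
from nltk.tokenize import word_tokenize

print(mark_negation(word_tokenize("I did not like this movie.")))
# roughly: ['I', 'did', 'not', 'like_NEG', 'this_NEG', 'movie_NEG', '.']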
def fichier_rec(myDirectory):
    for f in listdir(myDirectory):
        chemin = join(myDirectory, f)
        if isfile(chemin):
            with open(chemin, 'rb') as file:
                content = ''
                for line in file:
                    word = str(line).split(" ")
                    for m in word:
                        if not (pattern.match(str(m))):
                            content += str(m) + " "
                documents = [content]
                cv = CountVectorizer(stop_words="english")
                count_vector = cv.fit_transform(documents)
                # sort the counts of first book title by descending order of counts
                sorted_items = sort_coo(count_vector[0].tocoo())
                # Get feature names (words/n-grams). It is sorted by position in sparse matrix
                feature_names = cv.get_feature_names()
                n_grams = extract_topn_from_vector(feature_names, sorted_items, 20)
                listCat = {}
                with open(LaCateg) as json_file:
                    if not exists(monRep):
                        mkdir(monRep)
                    dataCateg = json.load(json_file)
                    for key in dataCateg.keys():
                        if not exists(monRep + "/" + key):
                            mkdir(monRep + "/" + key)
                        if not exists(monRep + "/Other"):
                            mkdir(monRep + "/Other")
                        listCat[key] = 0
                    for key in dataCateg.keys():
                        for i in n_grams:
                            if i[0] in dataCateg[key]:
                                listCat[key] += i[1]
                cpt = 0
                k = ""
                for j in listCat:
                    if cpt < listCat[j]:
                        cpt = listCat[j]
                        k = j
                if k == "":
                    k = "Other"
                """with open(allVariables.pathToProg + "/class.txt", "a") as clas:
                    clas.write("fichier: " + str(chemin.split("\\")[-1]) + "\n")
                    clas.write("listCat: " + str(listCat) + "\n")
                    clas.write("mot: " + str(n_grams) + "\n")
                    clas.write(k + "\n\n")"""
                copyfile(
                    chemin,
                    monRep + "/" + k + "/" + basename(chemin) + "-" + myDirectory.split("\\")[-1])
        else:
            print(chemin)
            fichier_rec(chemin)
wordcloud_pos_in_pos = WordCloud(background_color='black',
                                 width=1800,
                                 height=1400).generate(corpus_m_pos_in_pos)
plt.imshow(wordcloud_pos_in_pos)

# Unique words
unique_words = list(set(" ".join(corpus_m_words).split(" ")))

#################################################
#################################################

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 3))
X1 = vectorizer.fit_transform(words_tokens)
features = list(vectorizer.get_feature_names())

counter = Counter(features)
counter = Counter(counter)
counter.most_common(20)

# convert list of tuples into data frame
freq_df_b = pd.DataFrame.from_records(counter.most_common(20), columns=['bigram', 'Count'])

# Creating a bar plot
freq_df_b.plot(kind='bar', x='bigram', figsize=(15, 10), fontsize=15)

vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(words_tokens)
X1 = vect.transform(words_tokens)
print()
places_df['des'] = places_df[['des', 'ktop:category']].apply(lambda x: (' ').join(x), axis=1)
places_df['lab'] = places_df[['lab', 'rdf:type']].apply(lambda x: (' ').join(x), axis=1)
print(places_df['lab'])
print(places_df['des'])
print("== Finished re-converting to strings ==")

count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
print(count_vect)
print("== Vectorizing for similarity ==")
cat_mat1 = count_vect.fit_transform(places_df['des'])
cat_mat2 = count_vect.fit_transform(places_df['lab'])
print("cat_mat1", cat_mat1.shape, cat_mat1, "cat_mat2", cat_mat2.shape, cat_mat2, sep="\n")

cat_sim1 = cosine_similarity(cat_mat1, cat_mat1)
cat_sim2 = cosine_similarity(cat_mat2, cat_mat2)
cat_sim1 *= 0.9
cat_sim2 *= 0.1
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
                      max_words=100,
                      max_font_size=50,
                      random_state=42).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("social.png", dpi=900)

from sklearn.feature_extraction.text import CountVectorizer
import re

cv = CountVectorizer(stop_words=stop_words, max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)
list(cv.vocabulary_.keys())[:10]


# Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]


# Convert the most frequent words to a dataframe for plotting a bar plot
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv('final_data(movies 1950-2020).csv')
movies_name = list(df['movie_title'])
df['comb'] = df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df[
    'actor_3_name'] + ' ' + df['director_name'] + ' ' + df['genres']
df['comb'] = df['comb'].fillna('unknown')
df['genres'] = df['genres'].replace('None', ' ')
df.set_index('movie_title', inplace=True)

# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['comb'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(df.index)

cosine_sim = cosine_similarity(count_matrix)


# function that takes in movie_title as input and returns the top 10 recommended movies
def recommendations(title):
    recommended_movies = []
    ratings = []
    title = title.lower()
    # getting the index of the movie that matches the title
import data_scan
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
import numpy as np
import pandas
"""using sklearn for the Bernoulli naive Bayes to test my implementation's accuracy"""

# import the polished data
X_train, X_test, y_train, y_tests = data_scan.main_test()

features = 10000
vect = CountVectorizer(max_features=features, binary=True)
X_train_vectorized = vect.fit_transform(X_train)
X_train_vectorized_array = X_train_vectorized.toarray()
# transform() only, so the test data shares the training vocabulary
X_test_vectorized = vect.transform(X_test)
X_test_vectorized_array = X_test_vectorized.toarray()
# vectorizing the words - bag of words

# Bernoulli naive Bayes
bernoulli = BernoulliNB().fit(X_train_vectorized, y_train)

# prediction
prediction = bernoulli.predict(X_test_vectorized)  # return predicted y
print(metrics.accuracy_score(prediction, y_tests))

# writing out the data
# np.savetxt("submission.csv", np.column_stack((kaggle_files, kaggle_label)), delimiter=",")
})
# print data_clean[0]

data_words_only = []
for article in data_clean:
    # glue all words together into a list of strings
    data_words_only.append(" ".join(article["body"]))

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english')

_features = vectorizer.fit_transform(data_words_only)
_features_array = _features.toarray()

print "Got all features...", _features_array.shape

# ----------------------------------------

sim1 = min_hash(10, _features_array, data_clean)

buckets = {}
print len(sim1)
for id in sim1:
    similarities_for_one_id = sim1[id]
    sims = []
def print_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        # print(" ".join([feature_names[i]
        #                 for i in topic.argsort()[:-n_top_words - 1:-1]]))
        topicList.append(" ".join([feature_names[i]
                                   for i in topic.argsort()[:-n_top_words - 1:-1]]).encode('utf-8'))
    print("Deepak printing **************")
    print(topicList)
    print()


# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=20,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
corpus = featureContents
tf = tf_vectorizer.fit_transform(corpus)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tf)
# exit()
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
        data.append(temp)
        file.close()
        if path == pos_path:
            class_id.append(1)
        else:
            class_id.append(0)
    return data


# the training data consists of 5 positive reviews and 5 negative reviews
train_data = read_in_files(all_neg, neg_path, 0, 20, train_class)
train_data += read_in_files(all_pos, pos_path, 0, 20, train_class)

x_train, x_test, y_train, y_test = train_test_split(train_data, train_class, test_size=0.30)

vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform([r for r in x_train])
test_features = vectorizer.transform([r for r in x_test])

logreg = LogisticRegression()
logreg.fit(train_features, y_train)
y_pred = logreg.predict(test_features)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    logreg.score(test_features, y_test)))

confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)
print(classification_report(y_test, y_pred))
messages = []
categories = []

# train_files = ['train/data.json']
train_files = glob.glob('train/*.json')
for input_json in train_files:
    with open(input_json, 'r') as f:
        lines = json.loads(f.read())
        for line in lines:
            messages.append(line['test_message'])
            categories.append(line['reason_id'])

# vectorize
count_vector = CountVectorizer(tokenizer=Token.token)
X_train_counts = count_vector.fit_transform(messages)
# print(count_vector.vocabulary_)

# tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = Classifier.naive_bayes(X_train_tfidf, categories)

test_json = 't.json'
fail_messages = []
with open(test_json, 'r') as f:
    lines = json.loads(f.read())
    for line in lines:
        fail_messages.append(line['test_message'])
    review = re.sub(r'\W', ' ', str(X[i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+', ' ', review)
    review = re.sub(r'\s+[a-z]\s+', ' ', review)
    review = re.sub(r'^b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)

# Creating the BOW model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=2234, min_df=3, max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
B = X

# Creating the Tf-Idf Model
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()

# Creating the Tf-Idf model directly
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2234, min_df=3, max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
    for w in words:
        if (w not in stop_words):
            new_words.append(w)

    after_stem_words = []
    for w in new_words:
        after_stem_words.append(ps.stem(w))

    clean_msg = ' '.join(after_stem_words)
    return clean_msg


df['Review'] = df.Review.apply(clean_text)
print('data cleaned...')
# df.Liked.value_counts().plot(kind='bar')

X = cv.fit_transform(df.Review).toarray()
new_X = pca.fit_transform(X)
y = df.iloc[:, -1].values
print('going for training...')
log.fit(new_X, y)
print('model trained....')

# def graph():
#     a = df.Liked.value_counts().plot(kind='bar')
#     l4.configure(a)

root = Tk()
root.state('zoomed')
root.configure(background='gray85')
root.title("Restaurant Reviews Project")
############################################################
##                      Model setup                       ##
############################################################
with open("E:/AB104/AlgorithmTest/Jieba_Booking.json", 'r') as a:
    data = json.load(a)
data = DataFrame(data)
classifier = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(data['comments'].values,
                                                    data['mark'].values,
                                                    test_size=0)
targets = y_train
# print len(targets)  # 241221
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
# print len(X_train)  # 241221
classifier.fit(counts, targets)

############################################################
##           Store the results of the detection           ##
############################################################
commList_Jieba_marked = []
for i in commList_Jieba:
    commList_Jieba_marked_dict = {}
    examples = [i["comments"]]
    # print i["comments"]
    example_counts = count_vectorizer.transform(examples)
    predictions = classifier.predict(example_counts)
    commList_Jieba_marked_dict["mark"] = predictions.tolist()
    # print predictions
x = []
y = []
for i in range(len(tweets_data)):
    if tweets_data[i]['id'] == sent['id'][i]:
        x.append(tweets_data[i]['text'])
        y.append(sent['sentiment'][i])

# print(x[0].split(" "))
# print(y[0])

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform(x)
actual = y[:-500]
nb = MultinomialNB()
nb.fit(train_features, [int(r) for r in y])
test_features = vectorizer.transform(x[:-500])
test_try = vectorizer.transform([
    "Can we all stop treating anxiety like it's a choice and something cool to have thank you"
])
test_try2 = vectorizer.transform(["I feel like drinking alchohol"])
predict2 = nb.predict(test_try)
predict3 = nb.predict(test_try2)
def pipeline_train(train, test, lim_unigram):
    """
    Process train set, create relevant vectorizers

    Args:
        train: Data object, training set
        test: Data object, testing set
        lim_unigram: int, number of most frequent words to consider

    Returns:
        train_set: list, of numpy arrays
        train_stances: list, of ints
        bow_vectorizer: sklearn CountVectorizer
        tfreq_vectorizer: sklearn TfidfTransformer(use_idf=False)
        tfidf_vectorizer: sklearn TfidfVectorizer()
    """

    # Initialise
    heads = []
    heads_track = {}
    bodies = []
    bodies_track = {}
    body_ids = []
    id_ref = {}
    train_set = []
    train_stances = []
    cos_track = {}
    test_heads = []
    test_heads_track = {}
    test_bodies = []
    test_bodies_track = {}
    test_body_ids = []
    head_tfidf_track = {}
    body_tfidf_track = {}

    # Identify unique heads and bodies
    for instance in train.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in heads_track:
            heads.append(head)
            heads_track[head] = 1
        if body_id not in bodies_track:
            bodies.append(train.bodies[body_id])
            bodies_track[body_id] = 1
            body_ids.append(body_id)

    for instance in test.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in test_heads_track:
            test_heads.append(head)
            test_heads_track[head] = 1
        if body_id not in test_bodies_track:
            test_bodies.append(test.bodies[body_id])
            test_bodies_track[body_id] = 1
            test_body_ids.append(body_id)

    # Create reference dictionary
    for i, elem in enumerate(heads + body_ids):
        id_ref[elem] = i

    # Create vectorizers and BOW and TF arrays for train set
    bow_vectorizer = CountVectorizer(max_features=lim_unigram, stop_words=stop_words)
    bow = bow_vectorizer.fit_transform(heads + bodies)  # Train set only

    tfreq_vectorizer = TfidfTransformer(use_idf=False).fit(bow)
    tfreq = tfreq_vectorizer.transform(bow).toarray()  # Train set only

    tfidf_vectorizer = TfidfVectorizer(max_features=lim_unigram, stop_words=stop_words).\
        fit(heads + bodies + test_heads + test_bodies)  # Train and test sets

    # Process train set
    for instance in train.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        head_tf = tfreq[id_ref[head]].reshape(1, -1)
        body_tf = tfreq[id_ref[body_id]].reshape(1, -1)
        if head not in head_tfidf_track:
            head_tfidf = tfidf_vectorizer.transform([head]).toarray()
            head_tfidf_track[head] = head_tfidf
        else:
            head_tfidf = head_tfidf_track[head]
        if body_id not in body_tfidf_track:
            body_tfidf = tfidf_vectorizer.transform([train.bodies[body_id]]).toarray()
            body_tfidf_track[body_id] = body_tfidf
        else:
            body_tfidf = body_tfidf_track[body_id]
        if (head, body_id) not in cos_track:
            tfidf_cos = cosine_similarity(head_tfidf, body_tfidf)[0].reshape(1, 1)
            cos_track[(head, body_id)] = tfidf_cos
        else:
            tfidf_cos = cos_track[(head, body_id)]
        feat_vec = np.squeeze(np.c_[head_tf, body_tf, tfidf_cos])
        train_set.append(feat_vec)
        train_stances.append(label_ref[instance['Stance']])

    return train_set, train_stances, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer
values = pickle.load(open(conf.fileValues, 'rb'))
corpusKey = conf.corpusKey
# corpus = {d: {k: corpus[d][k][:100] for k in corpus[d]} for d in corpus}
print("--------")

y = [values.index(i) for i in corpus['train'][corpusKey]]
yV = [values.index(i) for i in corpus['valid'][corpusKey]]
yT = [values.index(i) for i in corpus['test'][corpusKey]]

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=3)
transformer = TfidfTransformer(smooth_idf=False)

# train
counts = vectorizer.fit_transform(corpus['train']['text'])
tfidf = transformer.fit_transform(counts)
# valid
countsValid = vectorizer.transform(corpus['valid']['text'])
tfidfValid = transformer.transform(countsValid)
# test
countsTest = vectorizer.transform(corpus['test']['text'])
tfidfTest = transformer.transform(countsTest)

bestAcc = 0.
for nn in range(1, 30):
    print("{}: ".format(nn), end="")
    clf = neighbors.KNeighborsClassifier(nn, weights='uniform')
    clf.fit(tfidf, y)
ps = PorterStemmer()
for w in filedata:
    filedata3.append(ps.stem(w))

unique = []
for fdata in filedata3:
    # each file's data was stored in one single string, therefore each string is split ...
    neww = fdata.split(' ')
    # ... to obtain all the words from that document, to be later used in the VSM
    neww = neww[:-1]  # each document's last index contains ' ' as a feature, which is removed everywhere
    unique.append(neww)
# IMPORTANT NOTE: unique does not contain the unique words of all files; it is just the variable name...

count_vec = CountVectorizer(stop_words='english', ngram_range=(1, 1),
                            max_df=0.2, min_df=0.1, max_features=None)
# count_train = count_vec.fit(filedata)
# bag_of_words = count_vec.transform(filedata)
bag_of_words2 = count_vec.fit_transform(filedata)
# print('dsdsdsds', count_vec.get_feature_names())
# tfidfmatrix = count_vec.fit_transform(filedata)

tfidf_vector = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=0.1, stop_words="english")
# words = word_tokenize(tfidf_vector)
# print(tfidf_vector)
tfidf_matrix = tfidf_vector.fit_transform(filedata)
print(tfidf_matrix)
print('tfidf_matrix shape: ', tfidf_matrix.shape)

num_clusters = 5
# Naive Bayes news classification
from sklearn.datasets import fetch_20newsgroups
# fetch_20newsgroups downloads the data from the internet on demand
news = fetch_20newsgroups(subset='all')
# inspect the data samples
print(len(news.data))

# split the data into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target,
                                                     test_size=0.25, random_state=33)

# convert the text into count vectors
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

# import the naive_bayes model MultinomialNB
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_predict = mnb.predict(x_test)

# performance evaluation
from sklearn.metrics import classification_report
print('The accuracy of Naive Bayes Classifier is ', mnb.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))
from sklearn import metrics

# In[495]:

# instantiate CountVectorizer
# using a unigram model - most frequent 3000 words
# (the baseline classifier will be based on this approach - notes from meeting with Kanchana)
stop = set(stopwords.words('english'))
vect = CountVectorizer(stop_words=stop, max_features=3000)

# In[496]:

# fit and transform X_train into X_train_fit
X_train_fit = vect.fit_transform(X_train)
X_train_fit.shape

# In[497]:

# transform X_test into X_test_fit
X_test_fit = vect.transform(X_test)
X_test_fit.shape

# In[498]:

# import and instantiate a Multinomial NB classifier
nb = MultinomialNB()
# -*- coding: UTF-8 -*-
import numpy

categories = [
    'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
]

from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
# reuse the already fitted tfidf_transformer; a fresh, unfitted one cannot transform()
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
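# A hedged continuation sketch: the fitted classifier can now label docs_new, and the numeric
# predictions map back to category names through twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))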
class FeatureExtractor(object): def __init__(self, embedded_transformer=None, external_features=None, FEATURE_SCALER='StandardScaler'): """ Initialize variables and check essay set type """ self.transformers = None self.feature_type = None self.selected_columns = None self.POS_ngram = ( 2, 3, ) # hard-coded, no unigrams self.text_types = None self.analysis_type = None self.featureselection = None self.embedded_transformer = embedded_transformer self.external_features = external_features self.word_embedding = None self.maxlen_words = None self.final_features = -1 self.maxlen = None self.sequence_model_type = None self.embedding_type = None self.feature_scale_multiplier = None self.ngramMaxLength = None self.FEATURE_SCALER = FEATURE_SCALER self.sequence_vocabulary = None self.final_data_scaler = None # final scaling transform of data matrix # apply transforms to data, used for test data, no learning takes place! No target data is used # order of operations is crucial! Otherwise results are nonsense def transform(self, data_x, x_meta=None, x_custom=None, post_transformer=None, text_IDs=None, stringkernels=None): #print('Transforming features (testing)') XX = [] if 'CUSTOM' in self.feature_type: x = np.array(x_custom, dtype=float) XX.append(x) if 'TAGS' in self.feature_type: x = np.array(x_meta, dtype=float) XX.append(x) if len(XX) > 0: XX = np.concatenate(tuple(XX), axis=1) if self.analysis_type == 'SEQUENCE': from keras.preprocessing import sequence # apply fiature scaling if len(XX) > 0: XX = self.final_data_scaler.transform(XX) if self.sequence_model_type == 'FASTTEXT': X = data_x[self.text_types[0]] # replace unknown words with RARE_WORD for k1 in range(0, len(X)): for k2 in range(0, len(X[k1])): token = X[k1][k2] if token not in self.sequence_vocabulary: X[k1][k2] = 'RARE_WORD' # convert words to counts X_mat = self.transformer.transform(X) X = [] for row in range(X_mat.shape[0]): # how many tokens of a kind tokens = [-1 for _ in range(np.sum(X_mat[row, :]))] # nonzero elements ind = np.argwhere(X_mat[row, :] > 0) k = 0 for _, col in ind: for _ in range(0, X_mat[row, col]): tokens[k] = col + 1 k += 1 assert tokens[-1] > -1, 'Negative indices found! BUG!' 
X.append(tokens) # print('Pad sequences (samples x time)') if len(XX) > 0: X_test = [ sequence.pad_sequences(X, maxlen=self.maxlen), XX ] else: X_test = [sequence.pad_sequences(X, maxlen=self.maxlen)] # x_test = sequence.pad_sequences(x_test, maxlen=maxlen) else: X = {'SENTENCES': [], 'FLAT': []} for k1, text in enumerate(data_x[self.text_types[0] + '_SENTENCES']): X['SENTENCES'].append([]) for k2, sent in enumerate(text): if k2 < self.maxlen_doc: X['SENTENCES'][-1].append([]) for k3, word in enumerate(sent): if k3 < self.maxlen_sent: # three cases: (1) in list and dictionary (2) in dictionary (3) nowhere if word not in self.sequence_vocabulary: word = 'RARE_WORD' word_index = self.sequence_vocabulary[word] X['SENTENCES'][-1][-1].append(word_index) X['FLAT'].append( list(itertools.chain.from_iterable( X['SENTENCES'][-1]))) X['FLAT'] = sequence.pad_sequences(X['FLAT'], maxlen=self.maxlen_words) if len(XX) > 0: X_test = [X, XX] else: X_test = [ X, ] elif self.analysis_type[0:3] == 'BOW': # apply all transformers sequentically (same as in training) X_test = [] for transformer in self.transformers: # apply raw data transform x = transformer[0].transform( data_x[transformer[1]]).todense() #.astype(np.float32) # apply scaling transform, identity for TFIDF X_test.append(x) # apply all selections sequentically (same as in training) is_selected = False if len( X_test ) > 0 and self.featureselection != None and self.featureselection[ 1] != 'global': is_selected = True if self.featureselection[1] == 'single': for k, x in enumerate(X_test): X_test[k] = np.take(x, indices=self.selected_columns[k], axis=1) elif self.featureselection[1] == 'all': X_test = np.concatenate(tuple(X_test), axis=1) X_test = [ np.take(X_test, indices=self.selected_columns, axis=1) ] else: raise (Exception( 'Unknown featureselection, must be single or all!')) # add embedding features if 'EMBEDDING' in self.feature_type: if self.embedding_type == 'LEMMA': x = self.embedded_transformer.transform( replace_hash( data_x[self.embedding_type])) #.astype(np.float32) else: x = self.embedded_transformer.transform( data_x[self.embedding_type]) # .astype(np.float32) X_test.append(x) # add external data, if any if len(XX) > 0: X_test.append(XX) X_test = np.concatenate(tuple(X_test), axis=1) if self.featureselection != None and self.featureselection[ 1] == 'global': assert is_selected == False, 'Trying selection twice!' X_test = np.take(X_test, indices=self.selected_columns, axis=1) # apply fiature scaling X_test = self.final_data_scaler.transform(X_test) if self.analysis_type == 'BOW_StringKernel': assert len(set(text_IDs[0]).intersection( text_IDs[1])) == 0, 'test and train data are overlapping!' X_stringkernel = get_stringkernel(stringkernels, self.text_types, text_IDs, self.ngramMaxLength) X_test = self.stringkernel_ratio * X_stringkernel + ( 1.0 - self.stringkernel_ratio) * self.kernelfunction( X=X_test, Y=self.kerneldata_Y) assert self.final_features == X_test.shape[ 1], 'Final feature size not equal!' if post_transformer is not None: X_test = post_transformer(X_test) return X_test # get best features def get_best(self, x, pass2_features): ind = np.argsort(x) ind = np.flipud(ind) assert x[ind[0]] == max(x), 'sort failed!' return ind[0:pass2_features] # method to choose columns def column_selector(self, X, Y, type, pass2_features): if type == 'regression': val = f_regression(X, Y) val = val[0] / np.max(val[0]) # these are f-values! 
return self.get_best(val, pass2_features) elif type == 'fisher': return self.fisher_selector(Y, X, pass2_features) elif type == 'chi2': return self.chi2_selector(Y, X, pass2_features) elif type == 'mutualinfo': val = mutual_info_regression_partial(X, Y) val = val / np.max(val) return self.get_best(val, pass2_features) else: raise (Exception('Unknown method')) def chi2_selector(self, set_score, dict_mat, max_feats_pass2): med_score = np.median(set_score) new_score = set_score new_score[set_score < med_score] = 0 new_score[set_score >= med_score] = 1 ch2 = SelectKBest(chi2, k=max_feats_pass2) ch2.fit(dict_mat, new_score) good_cols = ch2.get_support(indices=True) return good_cols def fisher_selector(self, set_score, dict_mat, max_feats_pass2): med_score = np.median(set_score) new_score = set_score new_score[set_score < med_score] = 0 new_score[set_score >= med_score] = 1 new_score_1 = new_score == 1 new_score_0 = new_score == 0 fish_vals = np.empty(dict_mat.shape[1]) fish_vals[:] = np.nan for col_num in range(0, dict_mat.shape[1]): # loop_vec = np.squeeze(np.asarray(dict_mat[:, col_num])) # good_loop_vec = loop_vec[new_score == 1] # bad_loop_vec = loop_vec[new_score == 0] # good_loop_present = len(good_loop_vec[good_loop_vec > 0]) # good_loop_missing = len(good_loop_vec[good_loop_vec == 0]) # bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0]) # bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0]) loop_vec = dict_mat[:, col_num] good_loop_vec = loop_vec[new_score_1] bad_loop_vec = loop_vec[new_score_0] good_loop_present = np.sum(good_loop_vec != 0) good_loop_missing = np.sum(good_loop_vec == 0) bad_loop_present = np.sum(bad_loop_vec != 0) bad_loop_missing = np.sum(bad_loop_vec == 0) fish_vals[col_num] = pvalue(good_loop_present, bad_loop_present, good_loop_missing, bad_loop_missing).two_tail cutoff = 1 if (len(fish_vals) > max_feats_pass2): cutoff = sorted(fish_vals)[max_feats_pass2] good_cols = np.asarray([ num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff ]) return good_cols # tf-idf weighted transformer for document embedding class TfidfEmbeddingVectorizer(object): def __init__(self, word2vec, dim): self.word2vec = word2vec self.word2weight = None self.dim = dim def fit(self, X, y=None): tfidf = TfidfVectorizer(analyzer=lambda x: x) tfidf.fit(X) # if a word was never seen - it must be at least as infrequent # as any of the known words - so the default idf is the max of # known idf's max_idf = max(tfidf.idf_) self.word2weight = defaultdict( lambda: max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()]) return self def transform(self, X, y=None): return np.array([ np.mean([ self.word2vec[w] * self.word2weight[w] for w in words if w in self.word2vec ] or [np.zeros(self.dim)], axis=0) for words in X ]) def main(self, data_x, data_y, Params, x_meta=None, x_custom=None, print_info=False, text_IDs=None, stringkernels=None): if Params['Algorithm'][0] == 'SEQUENCE': self.analysis_type = 'SEQUENCE' else: self.analysis_type = 'BOW' self.transformers = [] self.text_types = Params['TextTypes'] self.feature_type = Params['FeatureMethod'] if self.feature_type is None: self.feature_type = [] # custom text features if 'CUSTOM' in self.feature_type: assert x_meta != None, 'Customdata not set!' if print_info: start_time = time.time() print('... adding (custom) count measures', end='') x = np.array(x_custom[1], dtype=float) x_label = x_custom[0] X_custom = x X_custom_columns = x_label if print_info: end_time = time.time() print(' ... 
done (%1.1fs)' % (end_time - start_time)) # tag features if 'TAGS' in self.feature_type: assert x_meta != None, 'Metadata not set!' if print_info: start_time = time.time() print('... adding metainfo', end='') x = np.array(x_meta[1], dtype=float) x_label = x_meta[0] X_tags = x X_tags_columns = x_label if print_info: end_time = time.time() print(' ... done (%1.1fs)' % (end_time - start_time)) if self.analysis_type == 'SEQUENCE': from keras.preprocessing import sequence # convert text to index sequences, returns # data = text x sentence x word XX = [] XX_columns = [] if 'CUSTOM' in self.feature_type: XX.append(X_custom) XX_columns.append(X_custom_columns) if 'TAGS' in self.feature_type: XX.append(X_tags) XX_columns.append(X_tags_columns) if len(XX) > 0: XX = np.concatenate(tuple(XX), axis=1) self.final_data_scaler = get_scaler(self.FEATURE_SCALER) XX = self.final_data_scaler.fit_transform(XX) X = data_x[self.text_types[0]] max_sequence = max([len(x) for x in X]) if Params['Algorithm'][1]['algorithm'] == 'FASTTEXT': self.sequence_model_type = 'FASTTEXT' X = [x[0:np.minimum(max_sequence, len(x))] for x in X] # get all words that appeared at least in two articles transformer = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, max_df=1.0, min_df=2, max_features=50000, ngram_range=(1, 1)) transformer.fit(X) for k1 in range(0, len(X)): for k2 in range(0, len(X[k1])): token = X[k1][k2] if token not in transformer.vocabulary_: X[k1][k2] = 'RARE_WORD' self.transformer = CountVectorizer( tokenizer=lambda x: x, preprocessor=lambda x: x, max_df=1.0, min_df=2, max_features=100000, ngram_range=(1, Params['Algorithm'][1]['ngram'])) X_mat = self.transformer.fit_transform(X) self.sequence_vocabulary = { key: (val + 1) for key, val in self.transformer.vocabulary_.items() } # additional tokens for empty and unknown word assert 'PADDED_WORD' not in self.transformer.vocabulary_ self.sequence_vocabulary['PADDED_WORD'] = 0 ind2word = [ '' for x in range(0, len(self.sequence_vocabulary)) ] for word in self.sequence_vocabulary.keys(): ind2word[self.sequence_vocabulary[word]] = word maxlen_words = 0 X = [] for row in range(X_mat.shape[0]): tokens = [-1 for _ in range(np.sum(X_mat[row, :]))] ind = np.argwhere(X_mat[row, :] > 0) k = 0 for _, col in ind: for _ in range(0, X_mat[row, col]): tokens[k] = col + 1 k += 1 maxlen_words = np.maximum(maxlen_words, len(tokens)) X.append(tokens) #maxlen_words = np.minimum(Params['Algorithm'][1]['max_sequence'], maxlen_words) self.maxlen = maxlen_words # print('Pad sequences (samples x time)') #X = [sequence.pad_sequences(X, maxlen=self.maxlen)] # x_test = sequence.pad_sequences(x_test, maxlen=maxlen) # print('Pad sequences (samples x time)') if len(XX) > 0: X = [sequence.pad_sequences(X, maxlen=self.maxlen), XX] X_columns = [self.sequence_vocabulary, XX_columns] else: X = [sequence.pad_sequences(X, maxlen=self.maxlen)] X_columns = [self.sequence_vocabulary] else: max_sequence = np.minimum( max_sequence, Params['Algorithm'][1]['max_seq_length']) X = [x[0:np.minimum(max_sequence, len(x))] for x in X] # get all words that appeared at least in two articles transformer = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, max_df=1.0, min_df=2, max_features=20000, ngram_range=(1, 1)) transformer.fit(X) self.sequence_vocabulary = { key: (val + 2) for key, val in transformer.vocabulary_.items() } # additional tokens for empty and unknown word assert 'UNKNOWN_WORD' not in self.sequence_vocabulary assert 'PADDED_WORD' not in self.sequence_vocabulary 
self.sequence_vocabulary['PADDED_WORD'] = 0 self.sequence_vocabulary['RARE_WORD'] = 1 # compute mean and mean norm of all word vectors sumvec = 0.0 sumnorm = 0.0 for k, word in enumerate( self.external_features.word_embedding): vec = self.external_features.word_embedding[word] sumvec += vec sumnorm += np.linalg.norm(vec) if k > 10000: break vec_mean = 0 * sumvec / (k + 1) vec_norm = sumnorm / (k + 1) EMBEDDING_DIM = len(vec_mean) def get_random_vec(): # generate random vector with same mean and norm as embeddings on avarage a = 2 * np.random.rand(EMBEDDING_DIM) - 1 #a = a + vec_mean a = (a / np.linalg.norm(a)) * vec_norm return a word_embedding = {} word_embedding['RARE_WORD'] = get_random_vec() word_embedding['PADDED_WORD'] = 0 X = { 'FLAT': [], 'SENTENCES': [] } # index matrix with splitted sentences maxlen_doc = 0 maxlen_sent = 0 maxlen_words = 0 unknown_words = set() total_words = [0, 0] # convert tokens to indices, keep sentences for k1, text in enumerate(data_x[self.text_types[0] + '_SENTENCES']): X['SENTENCES'].append([]) words = 0 maxlen_doc = np.maximum(maxlen_doc, len(text)) for k2, sent in enumerate(text): X['SENTENCES'][-1].append([]) words += len(sent) maxlen_words = np.maximum(maxlen_words, words) maxlen_sent = np.maximum(maxlen_sent, len(sent)) if len(sent) == maxlen_sent: maxlen_sent_example = sent for k3, word in enumerate(sent): lemma_word = data_x['LEMMA_SENTENCES'][k1][k2][k3] lemma_word = lemma_word.replace('#', '') total_words[0] += 1 # three cases: (1) in list and dictionary (2) in dictionary (3) nowhere vec = None if word in self.external_features.word_embedding: # word has embeddings vec = self.external_features.word_embedding[ word] elif lemma_word in self.external_features.word_embedding: # lemma has embedding, use that instead vec = self.external_features.word_embedding[ lemma_word] word_embedding[word] = vec if word in self.sequence_vocabulary: # word must have embedding, even a random one if vec is None: vec = get_random_vec() # null vector word_embedding[word] = vec else: if vec is None: # word not in vocabulary and no embedding, mark as unknown word = 'RARE_WORD' total_words[1] += 1 unknown_words.add(word) else: # word not in vocabulary but has embedding, self.sequence_vocabulary[word] = len( self.sequence_vocabulary) word_index = Params['sequence_vocabulary'][word] X['SENTENCES'][-1][-1].append(word_index) X['FLAT'].append( list(itertools.chain.from_iterable( X['SENTENCES'][-1]))) X['FLAT'] = sequence.pad_sequences(X['FLAT'], maxlen=maxlen_words) assert (total_words[1] / total_words[0] ) < 0.10, 'over 10% of words (tokens) are unknown!' 
vals = sorted([ self.sequence_vocabulary[key] for key in self.sequence_vocabulary ]) assert np.max(vals) + 1 == len(vals) self.maxlen_words = maxlen_words self.maxlen_doc = maxlen_doc self.maxlen_sent = maxlen_sent self.max_unique_words = len(self.sequence_vocabulary) # vals = sorted([self.transformer.vocabulary_[key] for key in self.transformer.vocabulary_.keys()]) W = np.zeros((self.max_unique_words, EMBEDDING_DIM), dtype=np.float32) W.fill(np.nan) ind2word = [ '' for x in range(0, len(self.sequence_vocabulary)) ] for word in self.sequence_vocabulary.keys(): W[self.sequence_vocabulary[word]] = word_embedding[word] ind2word[self.sequence_vocabulary[word]] = word #for k,word in Params['sequence_vocabulary'] # if 0: data_x_check = [] for k1 in range(0, len(X['FLAT'])): data_x_check.append([]) for k2 in range(0, len(X['FLAT'][k1])): data_x_check[k1].append( ind2word[X['FLAT'][k1][k2]]) Params['W_embedding_matrix'] = W Params['max_document_sentences'] = maxlen_doc Params['max_sentence_words'] = maxlen_sent Params['max_words_in_doc'] = maxlen_words Params['max_unique_words'] = self.max_unique_words self.word_embedding = word_embedding if len(XX) > 0: X = [X, XX] X_columns = 'sequence data (up to %i words) + metadata (% items)' % ( maxlen_words, XX.shape[1]) else: X = [ X, ] X_columns = 'sequence data (up to %i words)' % maxlen_words elif self.analysis_type == 'BOW': # feature selection type, only for BOW algorithms (not including fasttext) self.featureselection = Params['FeatureSelection'] if not isinstance(self.text_types, list) and not isinstance( self.text_types, tuple): self.text_types = [self.text_types] if print_info: print('\nBuilding and transforming features (training phase)') X = [] X_columns = [] for feature in self.feature_type: for text_type in self.text_types: if feature == 'TFIDF': if text_type == 'POS': ngram_range = self.POS_ngram else: ngram_range = Params['TFIDF_ngram'] if print_info: start_time = time.time() print('... adding TF-IDF (%s, ngram=%s)' % (text_type, str(ngram_range)), end='') self.transformers.append((TfidfVectorizer( tokenizer=lambda x: x, preprocessor=lambda x: x, max_df=1.0, min_df=2, use_idf=True, max_features=Params['pass1_features'], ngram_range=ngram_range), text_type)) x = self.transformers[-1][0].fit_transform( data_x[text_type]).todense() X.append(x) x = self.transformers[-1][0].get_feature_names() x = [ 'term=' + y + ',type=%s+TFIDF' % text_type for y in x ] X_columns.append(x) if print_info: end_time = time.time() print(' ... done (%1.1fs)' % (end_time - start_time)) elif feature == 'BOW': if text_type == 'POS': ngram_range = self.POS_ngram else: ngram_range = Params['BOW_ngram'] if print_info: start_time = time.time() print('... adding BOW (%s, ngram=%s)' % (text_type, str(ngram_range)), end='') self.transformers.append((CountVectorizer( tokenizer=lambda x: x, preprocessor=lambda x: x, max_df=1.0, min_df=2, max_features=Params['pass1_features'], ngram_range=ngram_range, dtype=np.float32), text_type)) x = self.transformers[-1][0].fit_transform( data_x[text_type]).todense() X.append(x) x = self.transformers[-1][0].get_feature_names() x = [ 'term=' + y + ',type=%s+BOW' % text_type for y in x ] X_columns.append(x) if print_info: end_time = time.time() print(' ... 
done (%1.1fs)' % (end_time - start_time)) else: pass # do feature selection for individual BOW features or all of them is_selected = False if len( X ) > 0 and self.featureselection != None and self.featureselection[ 1] != 'global': is_selected = True if print_info: start_time = time.time() print('... doing feature selection (type=%s)' % str(self.featureselection), end='') self.selected_columns = [] if self.featureselection[1] == 'single': for k, x in enumerate(X): self.selected_columns.append( self.column_selector( x, data_y.copy(), Params['FeatureSelection'][0], Params['FeatureSelection'][2])) X[k] = np.take(x, indices=self.selected_columns[-1], axis=1) X_columns[k] = [ X_columns[k][kk] for kk in self.selected_columns[-1] ] elif self.featureselection[1] == 'all': X = np.concatenate(tuple(X), axis=1) self.selected_columns = self.column_selector( X, data_y.copy(), Params['FeatureSelection'][0], Params['FeatureSelection'][2]) X = [np.take(X, indices=self.selected_columns, axis=1)] X_columns = list(itertools.chain.from_iterable(X_columns)) X_columns = [ list([X_columns[kk] for kk in self.selected_columns]) ] else: raise (Exception( 'featureselection property must be single or all!')) if print_info: end_time = time.time() print(' ... done (%1.1fs)' % (end_time - start_time)) # tf-ifd weighted document embedding if 'EMBEDDING' in self.feature_type: if print_info: start_time = time.time() print( '... adding embedded document vectors (dim %i) with tf-idf scaling' % self.external_features.embedding_dim, end='') self.embedding_type = Params['EMBEDDING_type'] if self.embedded_transformer == None: self.embedded_transformer = self.TfidfEmbeddingVectorizer( self.external_features.word_embedding, self.external_features.embedding_dim) if self.embedding_type == 'LEMMA': self.embedded_transformer.fit( replace_hash(data_x[self.embedding_type])) else: self.embedded_transformer.fit( data_x[self.embedding_type]) if self.embedding_type == 'LEMMA': x = self.embedded_transformer.transform( replace_hash( data_x[self.embedding_type])) #.astype(np.float32) else: x = self.embedded_transformer.transform( data_x[self.embedding_type]) # .astype(np.float32) X.append(x) X_columns.append([ 'emb%i_%3.0f' % (self.external_features.embedding_dim, kk) for kk in range(1, self.external_features.embedding_dim + 1) ]) if print_info: end_time = time.time() print(' ... done (%1.1fs)' % (end_time - start_time)) if 'CUSTOM' in self.feature_type: X.append(X_custom) X_columns.append(X_custom_columns) if 'TAGS' in self.feature_type: X.append(X_tags) X_columns.append(X_tags_columns) X = np.concatenate(tuple(X), axis=1) X_columns = list(itertools.chain.from_iterable(X_columns)) # do global feature selection if self.featureselection != None and self.featureselection[ 1] == 'global': assert is_selected == False, 'Trying selection twice!' if print_info: start_time = time.time() print('... doing feature selection (type=%s)' % str(self.featureselection), end='') self.selected_columns = self.column_selector( X, data_y.copy(), Params['FeatureSelection'][0], Params['FeatureSelection'][2]) X = np.take(X, indices=self.selected_columns, axis=1) X_columns = list( [X_columns[kk] for kk in self.selected_columns]) if print_info: end_time = time.time() print(' ... done (%1.1fs)' % (end_time - start_time)) assert X.shape[1] == len( X_columns), 'X and X_labels have different size! BUG!' 
            self.final_data_scaler = get_scaler(self.FEATURE_SCALER)
            # compare by value, not identity ('is not' on string literals is unreliable)
            if self.FEATURE_SCALER != 'StandardScaler':
                temp_scaler = get_scaler('StandardScaler')
                temp_scaler.fit(X)
                self.feature_scale_multiplier = temp_scaler.scale_
                X = self.final_data_scaler.fit_transform(X)
                self.feature_scale_multiplier = self.feature_scale_multiplier / self.final_data_scaler.scale_
            else:
                self.feature_scale_multiplier = np.ones(X.shape[1])
                X = self.final_data_scaler.fit_transform(X)

            if Params['Algorithm'][0] == 'StringKernel':
                self.analysis_type = 'BOW_StringKernel'
                self.ngramMaxLength = Params['Algorithm'][1]['ngram']
                X_stringkernel = get_stringkernel(stringkernels, self.text_types,
                                                  text_IDs, self.ngramMaxLength)
                X_columns = ['String kernels for %s' % " ".join(self.text_types)]
                self.kerneldata_Y = X
                self.kernelfunction = get_kernel(Params['Algorithm'][1]['kerneltype'])
                self.stringkernel_ratio = Params['Algorithm'][1]['stringkernel_ratio']
                X = (self.stringkernel_ratio * X_stringkernel) + \
                    (1.0 - self.stringkernel_ratio) * self.kernelfunction(X=X, Y=None)

            self.final_features = X.shape[1]

        else:
            raise Exception('Unknown analysis type (should be sequence or classical)')

        return X, X_columns, Params
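# A self-contained sketch of the core idea behind the BOW branch of
# FeatureExtractor above: vectorize with CountVectorizer, then keep only the
# columns that a chi2 SelectKBest ranks highest. The toy corpus and labels are
# hypothetical and only illustrate the call pattern; the real class also handles
# scaling, embeddings, string kernels and sequence models.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

toy_docs = ['good essay about science',
            'bad essay about nothing',
            'great science writing',
            'poor writing overall']
toy_labels = np.array([1, 0, 1, 0])

toy_vec = CountVectorizer(ngram_range=(1, 1), min_df=1)
X_toy = toy_vec.fit_transform(toy_docs)

selector = SelectKBest(chi2, k=3)                  # keep the 3 highest-scoring terms
X_toy_selected = selector.fit_transform(X_toy, toy_labels)
kept_columns = selector.get_support(indices=True)
print(X_toy_selected.shape, kept_columns)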
class Reader_APNEWS: """ This class is responsible for preprocessing the newsgroup data as well as creating batches to train. the input is always a list with all documents: """ def __init__(self, datapath, n_features=100000, lm_minimum_freq=5, train_perc=0.6, valid_perc=0.2, language="english", length_batch=10, batch_size=5, sample_size=10000): #data preprocessing #todo: remove limiting number of samples random.seed(1) self.language = language self.lm_minimum_freq = lm_minimum_freq self.train_perc = train_perc self.valid_perc = valid_perc self.length_batch = length_batch self.batch_size = batch_size data = self.get_data(datapath)[:sample_size] print("len data:", len(data)) # print("len data", len(data)) # print(data[:2]) self.data_samples = self.preprocessing_general(self.shuffle(data)) # print(self.data_samples[:2]) self.data_tm = self.preprocessing_tm(self.data_samples) #use for ntm model self.data_prepped = [ self.process_doc(doc, i) for i, doc in enumerate(self.data_samples) ] self.tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10, max_features=n_features, stop_words=self.language) #first fit the matrix on the train set self.tf_vectorizer.fit_transform( self.data_tm[:int(len(self.data_tm) * train_perc)]) self.tf = self.reluDerivative( self.tf_vectorizer.transform(self.data_tm)) self.idx2word = self.tf_vectorizer.get_feature_names() self.vocab_size = np.shape(self.tf)[1] print("vocab size", self.vocab_size) #LM data self.train, self.valid, self.test, self.lm_id2word, self.lm_word2id, self.lm_vocab_size = self.preprocessing_lm( data=self.data_samples, minimum_tf=lm_minimum_freq) def get_data(self, datapath): with open(datapath) as f: content = f.readlines() # you may also want to remove whitespace characters like `\n` at the end of each line content = [x.strip() for x in content] return content def shuffle(self, x): x_new = [[doc] for doc in x] random.shuffle(x_new) return [x[0] for x in x_new] # takes data in the form of list of strings def preprocessing_lm(self, data, minimum_tf): # gets tf from corpus def get_tf(d): tf = defaultdict(int) for doc in d: for sen in doc: for word in sen: tf[word] += 1 return tf def create_vocab(data): idx2word = [] word2idx = dict() for doc in data: for sen in doc: for word in sen: if word not in word2idx: word2idx[word] = len(idx2word) idx2word.append(word) word2idx["<EOS>"] = len(idx2word) idx2word.append("<EOS>") word2idx["<BOS>"] = len(idx2word) idx2word.append("<BOS>") word2idx["<PAD>"] = len(idx2word) idx2word.append("<PAD>") return idx2word, word2idx def remove_numbers(data): return [[[ word if not word.isdigit() else "<NUMBER>" for word in sen ] for sen in doc] for doc in data] # removes rare words def remove_rare_words(data, tf, min_freq): return [[[ word if tf[word] >= min_freq else "<UNK>" for word in sen ] for sen in doc] for doc in data] def create_language_model_data(data, word2idx): lm_data = [] for doc in data: if doc == []: lm_data.append(None) continue doc_new = [copy.deepcopy(sen) for sen in doc] doc_new[0].insert(0, word2idx["<EOS>"]) for sen in doc_new: sen.append(word2idx["<EOS>"]) lm_data.append(doc_new) # print( lm_data) lm_data = [ list(itertools.chain.from_iterable(doc)) if doc != None else None for doc in lm_data ] return lm_data def get_batch_data(data): def create_batches(d, batch_size=1, lstm_length=20): batches = len(d) // (lstm_length * batch_size) if batches == 0: # print( "peep peep") return None cutoff = batches * lstm_length * batch_size d = np.array(d[:cutoff]) # for larger batch size d = d.reshape((batch_size, 
batches * lstm_length)) # horizontal split output = np.hsplit( d, [i * lstm_length for i in range(1, batches)]) # output = d.reshape(-1, 1, lstm_length) return output x = copy.deepcopy(data[:-1]) y = copy.deepcopy(data[1:]) x_batch = create_batches(x, batch_size=self.batch_size, lstm_length=self.length_batch) y_batch = create_batches(y, batch_size=self.batch_size, lstm_length=self.length_batch) if x_batch == None: return None return [(x_batch[i], y_batch[i]) for i in range(len(x_batch))] data_listform = [[ word_tokenize(y, language=self.language) for y in sent_tokenize(x, language=self.language) ] for x in data] #get tf for train set # with open('coherence_data/apnews/corpus.0', 'w') as f: # for doc in data_listform: # doc = " ".join([item for sublist in doc for item in sublist]) # f.write(doc + "\n") tf_train = get_tf( data_listform[:int(len(data_listform) * self.train_perc)]) data_listform = remove_numbers(data_listform) data_listform = remove_rare_words(data_listform, tf_train, min_freq=self.lm_minimum_freq) # statistic purposes sp = [len(x) for x in data_listform] print("min number of words in a document:", min(sp)) print("max number of words in a document:", max(sp)) print("average number of words:", sum(sp) / len(sp)) idx2word, word2idx = create_vocab(data_listform) tokenized_data = [[[word2idx[word] for word in sen] for sen in doc] for doc in data_listform] language_model_data = create_language_model_data( tokenized_data, word2idx) new_tf = copy.deepcopy(self.tf) new_data_set = [ { "doc_tm": x, "doc_tm_sparse": np.where(x > 0)[0], "doc_lm": get_batch_data(language_model_data[i]) } for i, x in enumerate(new_tf) if len(np.where(x > 0)[0]) > 0 and language_model_data[i] != None and get_batch_data(language_model_data[i]) != None ] total_length = len(new_data_set) train_idx = int(total_length * self.train_perc) valid_idx = int(total_length * (self.train_perc + self.valid_perc)) train = new_data_set[:train_idx] valid = new_data_set[train_idx:valid_idx] test = new_data_set[valid_idx:] return train, valid, test, idx2word, word2idx, len(idx2word) def get_sets(self, valid_perc=0.2): new_tf = copy.deepcopy(self.tf) # here we add the indices that are on and remove documents that contain no words that are in te vocab # the third variable is the text new_data_set = [{ "doc_tm": x, "doc_tm_1": np.where(x > 0)[0], "doc_lm": self.language_model_data[i] } for i, x in enumerate(new_tf) if len(np.where( x > 0)[0]) > 0 and self.language_model_data[i] != None] total_length = len(new_data_set) train_idx = int(total_length * self.train_perc) valid_idx = int(total_length * (self.train_perc + valid_perc)) train = new_data_set[:train_idx] valid = new_data_set[train_idx:valid_idx] test = new_data_set[valid_idx:] return train, valid, test # removes lowercase, lemmatize? , stem? 
    def preprocessing_general(self,
                              data,
                              remove_the_uppercase=True,
                              remove_the_numbers=False,
                              stem=False,
                              lemmatize=False):
        # note: the stem/lemmatize flags are currently unused
        def remove_uppercase(data):
            new_data = []
            for x in data:
                new_data.append(x.lower())
            return new_data

        def remove_numbers(d):
            new_data = [[
                word_tokenize(y, language=self.language)
                for y in sent_tokenize(x, language=self.language)
            ] for x in d]
            data_no_digits = [[[
                word if not word.isdigit() else "<NUMBER>" for word in sen
            ] for sen in doc] for doc in new_data]
            return [
                " ".join([" ".join([word for word in s]) for s in doc])
                for doc in data_no_digits
            ]

        new_data = data
        if remove_the_uppercase:
            print("replacing uppercase by lowercase")
            new_data = remove_uppercase(new_data)
        if remove_the_numbers:
            print("removing numbers from general data")
            new_data = remove_numbers(new_data)
        return new_data

    def preprocessing_tm(self, data):
        return data

    def process_doc(self, doc, i):
        """Tokenize a document into a list of tokenized sentences."""
        sentences = sent_tokenize(doc)
        output_data = [word_tokenize(s) for s in sentences]
        return output_data

    def reluDerivative(self, input):
        # binarize the document-term matrix: 1 where a term occurs, 0 otherwise
        x = input.toarray()
        x[x <= 0] = 0
        x[x > 0] = 1
        return x
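# A minimal illustration of what Reader_APNEWS.reluDerivative does to the
# CountVectorizer output: raw term counts are collapsed into binary
# presence/absence indicators before being used by the topic model. The two
# toy documents below are hypothetical.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ["the cat sat on the mat", "the dog chased the cat"]
toy_cv = CountVectorizer()
toy_counts = toy_cv.fit_transform(toy_docs).toarray()    # term counts
toy_binary = toy_counts.copy()
toy_binary[toy_binary > 0] = 1                           # same effect as reluDerivative
print(toy_counts)
print(toy_binary)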
    'Seed1-Napier', 'Seed2-Devon', 'Seed3-Richmond', 'Seed4-Bessborough'
]
seed['YEAR'] = [1884, 1845, 1882, 1881]
seed = seed[['BILL', 'YEAR', 'SPEECH_ACT']]
# append to end of text df
text = pd.concat([text, seed]).reset_index(drop=True)

# now that the raw data has been processed, we build up the dictionary
# prepare the corpus
corpus = list(text['SPEECH_ACT'])
nr_docs = 10e0 ** np.linspace(0, 7, num=8)
# a fractional max_df is interpreted as a proportion of documents,
# so it is expected to lie in [0, 1]
max_df = (nr_docs + 0.5) / len(corpus)

# get unique words, remove special chars, spellcheck, lemma/stem
for i in range(len(nr_docs)):
    vectorizer = CountVectorizer(max_df=max_df[i])
    vec = vectorizer.fit_transform(corpus)
    words = vectorizer.get_feature_names()
    # remove words with special characters and numbers in them
    words_nonr = [word for word in words if word.isalpha()]
    # correctly and incorrectly spelled English words
    words_en = [word for word in words_nonr if dictionary.check(word)]
    words_nonen = [word for word in words_nonr if not dictionary.check(word)]
    # lemmatize
    # orig_lemmas = [word for word in words_en if lemmatizer.lemmatize(word) is not None]
    # lemmas = [lemmatizer.lemmatize(word) for word in words_en]
    # stem
    orig_stems = [word for word in words_en if stemmer.stem(word) is not None]
    stems = [stemmer.stem(word) for word in words_en]
    # create dictionary from lists
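# A small, self-contained illustration (with a hypothetical three-document
# corpus) of how the fractional max_df swept in the loop above prunes terms:
# any term appearing in more than max_df * n_docs documents is dropped from
# the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

tiny_corpus = ["the house of commons", "the house divided", "a commons debate"]
cv_all = CountVectorizer()
cv_capped = CountVectorizer(max_df=0.5)   # keep terms in at most half the documents
cv_all.fit(tiny_corpus)
cv_capped.fit(tiny_corpus)
print(sorted(cv_all.vocabulary_))     # includes 'the', 'house', 'commons'
print(sorted(cv_capped.vocabulary_))  # terms occurring in 2 of 3 documents are pruned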
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

pd.options.mode.chained_assignment = None

df = pd.read_csv("../prediction_app/static/merged_data.csv")
essay_df = df[['_projectid', 'RESP', ' essay']]

# keep only rows whose essay is an actual string (drops NaN floats)
essay_df['new_essay'] = essay_df[' essay'].map(lambda x: type(x))
essay_df = essay_df[essay_df.new_essay == str]
print("done throwing out floats")
print("percent remaining", len(essay_df) / len(df))

# pandas already returns decoded unicode strings under Python 3,
# so the old .decode('utf-8') step is no longer needed
essay_df.new_essay = essay_df[' essay']
print("done decoding")

documents = essay_df.new_essay.tolist()
classes = essay_df.RESP.tolist()

vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))
doc_vectors = vectorizer.fit_transform(documents)
print("done vectorizing")

model = MultinomialNB().fit(doc_vectors, classes)
print("done fitting model")

precision = np.mean(
    cross_val_score(model, doc_vectors, classes, scoring='precision'))
cm = confusion_matrix(classes, model.predict(doc_vectors))
print("Precision", precision)
print("Percentage off", cm[0][1] / (cm[0][0] + cm[0][1]))
print(cm)
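# The confusion matrix above is computed on the training documents themselves,
# which overstates performance. A minimal sketch of a held-out evaluation using
# only the objects already built in this snippet:
X_tr, X_te, y_tr, y_te = train_test_split(doc_vectors, classes,
                                          test_size=0.2, random_state=0)
held_out_model = MultinomialNB().fit(X_tr, y_tr)
print("held-out confusion matrix")
print(confusion_matrix(y_te, held_out_model.predict(X_te)))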
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Cleaning the texts ('dataset' is assumed to be a DataFrame loaded earlier in the script)
corpus = []
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fitting a Random Forest classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
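# A short sketch of how the predictions above are typically evaluated; it only
# uses objects already defined in this snippet plus sklearn.metrics.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("accuracy:", accuracy_score(y_test, y_pred))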