class UnitClassifier(Trainer):
    def __init__(self, x, y, train_ratio):
        super(UnitClassifier, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))
        clf = RandomForestClassifier(verbose=0, n_jobs=-1, n_estimators=20)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'misclass': np.mean(y_pred != self._y_test),
            'report': classification_report(self._y_test, y_pred,
                                            target_names=self._model.classes_),
        }
def race_tfidf(data, can_be_noun_arg, stop_words):
    print
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    #print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    tfidf.fit(counts)
    #print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms, _ = zip(*sorted_voc)
    for i, k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i, :])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    #print tf_idf_matrix.todense().shape
    return freqs
def cal_weight(self, key_words):
    """
    Compute the weight matrix after feature-word extraction.
    :param key_words: [{'sentence': {}}, ...] or [{}, ...]
                      may be test-set data or training-set data
    :return:
    """
    print "Cal Weight: ", time.strftime('%Y-%m-%d %H:%M:%S')

    if not self.istrain:
        dir_ = os.path.join(TEXT_OUT, "key_words")
        filename = self.__class__.__name__ + ".txt" if self.subjective else self.__class__.__name__ + "_objective.txt"
        url = os.path.join(dir_, filename)
        train_key_words = FileUtil.read(url)
    else:
        train_key_words = key_words
    train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words]
    key_words = [d.get("sentence") if "sentence" in d else d for d in key_words]

    # compute tf
    key_words = [{k: v / sum(d.values()) for k, v in d.items()} for d in key_words]

    fit_train_key_words = Feature_Hasher.transform(train_key_words)
    fit_key_words = Feature_Hasher.transform(key_words)
    tfidf = TfidfTransformer()
    # fit idf on the training data
    tfidf.fit(fit_train_key_words)
    weight_matrix = tfidf.transform(fit_key_words)
    print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return weight_matrix
def tfidf_score(train_set, test_set):
    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))
    # Remove all the None types from the input datasets
    train_set = filter(None, train_set)
    test_set = filter(None, test_set)
    vectorizer.fit_transform(train_set)
    #print "Word Index is {0} \n".format(vectorizer.vocabulary_)
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    #print "IDF scores:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()
    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
class FeatureVectorTfIdf(object):
    def __init__(self):
        self.cvector_obj = CountVectorizer()
        self.tfidf_obj = TfidfTransformer(norm="l2")

    def extract_features(self, data):
        # fit_transform learns the vocabulary from the training data and extracts features
        count_vect = self.cvector_obj.fit_transform(data)
        return self.tf_idf_vector(count_vect)
        # You can get the list of features
        # print self.cvector_obj.get_feature_names()

    def word_vectors_tfidf(self, vect_data):
        '''convert documents into a count matrix'''
        vect_data = self.cvector_obj.transform(vect_data)
        return self.tf_idf_vector(vect_data)

    def tf_idf_vector(self, vect_data):
        '''fit the idf on the document matrix'''
        self.tfidf_obj.fit(vect_data)
        '''convert into tf-idf matrix'''
        tfidf = self.tfidf_obj.transform(vect_data)
        return tfidf
def tf_idf_features(train_ls):
    train_set = train_ls  # Documents
    vectorizer = CountVectorizer()  # stop_words = stopWords
    transformer = TfidfTransformer()
    trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
    transformer.fit(trainVectorizerArray)
    return transformer.transform(trainVectorizerArray).toarray()
def tf_idf(tag_matrix):
    # calculate TF-IDF
    tfidf = TfidfTransformer(None, use_idf=True)
    tfidf.fit(tag_matrix)
    tag_matrix = tfidf.transform(tag_matrix)
    dense_tag_matrix = tag_matrix.todense()
    return dense_tag_matrix
def tfidf_step_by_step():
    """ Example of calculating TF-IDF for OSM nodes.
    Document is a list of keys.
    """

    learn_data_set = documents_gen()
    test_data_set = documents_gen()

    # calculate term-frequency
    vectorizer = CountVectorizer(stop_words=stop_words,
                                 token_pattern='[a-z0-9_\-:]+')
    vectorizer.fit_transform(learn_data_set)
    #pprint.pprint(vectorizer.vocabulary_)

    # freq_term_matrix is a sparse matrix (elements stored in Coordinate format
    # http://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_.28COO.29 )
    freq_term_matrix = vectorizer.transform(test_data_set)
    # freq_term_matrix.todense()

    # l2 - Euclidean normalization
    # http://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)

    tf_idf = tfidf.transform(freq_term_matrix)

    pprint.pprint(tf_idf.todense())
def cal_product_title_tfidf():
    # PART I: compute the tf-idf for product title
    print "\nBegins,compute the tf-idf for product title ..."

    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x: stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_tittle = CountVectorizer(stop_words='english', binary=True)  # use binary value to indicate the frequency
    search_vect_tittle.fit(product_title)  # learn the vocabulary
    search_tittle_fq_matrix = search_vect_tittle.transform(search_term)  # get the (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)  # learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title)  # get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix)  # get idf for each vocabulary
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix)  # get the idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = []  # compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append(
            (np.multiply(tf_idf_title_matrix[index], search_tittle_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'], "product_title_tfidf": tf_idf_title_result}).to_csv(
        'product_title_tfidf.csv', index=False)
    return 0
def cal_product_description_tfidf():
    # PART II: compute the tf-idf for product description
    global AllSet  # AllSet is reassigned below, so it must be declared global
    print "\nBegins,compute the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into database..."
    AllSet = pd.merge(AllSet, product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description = AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)  # use binary value to indicate the frequency
    search_vect_descrip.fit(product_description)  # learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term)  # get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words='english')
    description_vect.fit_transform(product_description)  # learn the vocabulary
    description_fq_matrix = description_vect.transform(product_description)  # get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix)  # get idf for each vocabulary
    tf_idf_descrip_matrix = tfidf_transformer.transform(description_fq_matrix)  # get the idf matrix

    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result = []  # compute the result of tf-idf for product description
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append(
            (np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'], "product_description_tfidf": tf_idf_descrip_result}).to_csv(
        'product_description_tfidf.csv', index=False)
class CaloriesRegressor(Trainer):
    def __init__(self, x, y, train_ratio):
        super(CaloriesRegressor, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))
        clf = RandomForestRegressor(verbose=0, n_jobs=-1, n_estimators=100)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'median_absolute_error': median_absolute_error(self._y_test, y_pred),
            'mean_squared_error': mean_squared_error(self._y_test, y_pred),
            'explained_variance_score': explained_variance_score(self._y_test, y_pred),
        }
def return_idf(instances, labels):
    transformer = TfidfTransformer(smooth_idf=True)
    transformer.fit(instances)
    idf = dict.fromkeys(range(instances.shape[1]), 0)
    for feature, value in enumerate(list(transformer._idf_diag.data)):
        idf[feature] = value
    return idf
def TextTransform(X, Xtest=None):
    Write("Process Data with TFIDF...\n")
    tfidf = TfidfTransformer()
    if Xtest is None:
        X = tfidf.fit_transform(X).toarray()
        return X
    else:
        tfidf.fit(X)
        return tfidf.transform(X).toarray(), tfidf.transform(Xtest).toarray()
def setup(train, test, binaryOpt=False):
    count_vectorizer = CountVectorizer(binary=binaryOpt)
    count_vectorizer.fit_transform(train)
    freq_term_matrix = count_vectorizer.transform(test)
    if binaryOpt:
        return freq_term_matrix
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    return tf_idf_matrix
def do_idf_original(self):
    """Compute the idf directly with sklearn.

    Here the idf is computed directly with the sklearn classes, and we obtain
    the same result as do_idf_variante. Computing the idf ourselves gives us
    more control over what we do, in particular over which variant is used.
    """
    tfidf_transformer = TfidfTransformer()
    tfidf_transformer.fit(self.tfidf_matrix)
    self.idf = tfidf_transformer.idf_
def create_tf_idf(bow):
    """
    (SPARSE VERSION)
    Reads the bag of words representation from GridFS, then
    generates the TF-IDF representation
    """
    print "Creating TF-IDF Bag of Words"
    transformer = TfidfTransformer(norm=u'l2', use_idf=True)
    transformer.fit(bow)
    tf_idf = transformer.transform(bow)
    return tf_idf
def _collect(self, splited_words_list, sentence_size):
    print "Collection datas: ", time.strftime('%Y-%m-%d %H:%M:%S')
    data = [d.get("sentence") for d in splited_words_list[: sentence_size]]
    class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]]
    fit_data = Feature_Hasher.transform(data)
    tfidf = TfidfTransformer()
    tfidf.fit(fit_data)
    a = tfidf.transform(fit_data)
    print "Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    return a, class_label, []
def tfidf_normalize(articles_with_id): global NON_STOPWORD_LIMIT stemmed_articles_with_id = [(aid, stem_article(article)) for (aid, article) in articles_with_id] stemmed_articles = [article for (aid, article) in stemmed_articles_with_id] # test_set = train_set # instantiate vectorizer with English language, using stopwords and set min_df, max_df parameters and the tokenizer vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b") # by appling the vectorizer instance to the train set # it will create a vocabulary from all the words that appear in at least min_df and in no more than max_df # documents in the train_set vectorizer.fit_transform(stemmed_articles) # vectorizer transform will apply the vocabulary from the train set to the test set. In my case, # they are the same set: whole Wikipedia. # this means that each article will get representation based on the words from the vocabulary and # their TF-IDF values in the Scipy sparse output matricx freq_term_matrix = vectorizer.transform(stemmed_articles) long_articles_with_id = [] assert freq_term_matrix.shape[0] == len(articles_with_id) for (i, article_with_id) in zip(xrange(freq_term_matrix.shape[0]), stemmed_articles_with_id): row = freq_term_matrix.getrow(i) if row.getnnz() >= NON_STOPWORD_LIMIT: long_articles_with_id.append(article_with_id) long_articles = [article for (aid, article) in long_articles_with_id] vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b") vectorizer.fit_transform(long_articles) freq_term_matrix = vectorizer.transform(long_articles) # Gabrilovich says that they threshold TF on 3 (remove word-article association if that word # does not appear at least 3 times in that single article # freq_term_matrix.data *= freq_term_matrix.data>=3 # freq_term_matrix.eliminate_zeros() # I think this is not necessary... # this is a log transformation as applied in (Gabrilovich, 2009), i.e., that is # how he defines TF values. In case of TF = 0, this shall not affect such value # freq_term_matrix.data = 1 + np.log( freq_term_matrix.data ) # instantiate tfidf trnasformer tfidf = TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=True) # tfidf uses the freq_term_matrix to calculate IDF values for each word (element of the vocabulary) tfidf.fit(freq_term_matrix) # finally, tfidf will calculate TFIDF values with transform() tf_idf_matrix = tfidf.transform(freq_term_matrix) # tf_idf_matrix.data = np.log(np.log(tf_idf_matrix.data)) tf_idf_matrix = normalize(tf_idf_matrix, norm="l2", axis=0, copy=False) # now we put our matrix to CSC format (as it helps with accessing columns for inversing the vectors to # words' concept vectors) tf_idf_matrix = tf_idf_matrix.tocsc() # we need vocabulary_ to be accessible by the index of the word so we inverse the keys and values of the # dictionary and put them to new dictionary word_index word_index = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems()) M, N = tf_idf_matrix.shape print "Articles: ", M print "Words: ", N return tf_idf_matrix, word_index, long_articles_with_id
def test_same_idf_diag(self):
    X, X_rdd = self.generate_dataset(4, 1000, None)
    local = TfidfTransformer()
    dist = SparkTfidfTransformer()
    local.fit(X)
    dist.fit(X_rdd)
    assert_array_almost_equal(local._idf_diag.toarray(),
                              dist._idf_diag.toarray())
def data_pro():
    [[corpus_train, target_train], [corpus_test, target_test]] = load_data()

    count_v1 = CountVectorizer()  # converts the texts into a term-frequency matrix; element a[i][j] is the count of word j in document i
    counts_train = count_v1.fit_transform(corpus_train)  # fit_transform builds the term-frequency matrix for the training texts
    transformer = TfidfTransformer()  # computes the tf-idf weight of each word
    tfidf_train = transformer.fit(counts_train).transform(counts_train)  # fit then transform computes the tf-idf
    weight_train = tfidf_train.toarray()  # weight[i][j] is the tf-idf value of word j in document i

    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)  # let the two CountVectorizers share one vocabulary
    counts_test = count_v2.fit_transform(corpus_test)  # term-frequency matrix for the test texts
    transformer = TfidfTransformer()
    tfidf_test = transformer.fit(counts_train).transform(counts_test)  # idf fitted on the training counts, then applied to the test counts
    weight_test = tfidf_test.toarray()  # weight[i][j] is the tf-idf value of word j in document i

    return [[weight_train, target_train], [weight_test, target_test]]
def __init__(self, feature='tfidf', **kwargs): super(IMDB, self).__init__(**kwargs) if self.conf is not None: feature = self.conf.get('feature', 'tfidf') if feature.startswith('tfidf'): max_features = 5000 (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features) else: (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=None, skip_top=0, maxlen=None, seed=113, start_char=1, oov_char=2, index_from=3) X, y = self.get_data_by_imageset(X_train, y_train, X_test, y_test) print('data_set={}, Average sequence length: {}'.format(self.data_set, np.mean(list(map(len, X))))) #feature if feature == 'origin': maxlen = 400 X = sequence.pad_sequences(X, maxlen=maxlen) elif feature == 'tfidf': from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer(smooth_idf=False) #transformer = TfidfTransformer(smooth_idf=True) X_train_bin = np.zeros((len(X_train), max_features), dtype=np.int16) X_bin = np.zeros((len(X), max_features), dtype=np.int16) for i, X_i in enumerate(X_train): X_train_bin[i, :] = np.bincount(X_i, minlength=max_features) for i, X_i in enumerate(X): X_bin[i, :] = np.bincount(X_i, minlength=max_features) transformer.fit(X_train_bin) X = transformer.transform(X_bin) X = np.asarray(X.todense()) elif feature == 'tfidf_seq': from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer(smooth_idf=False) maxlen = 400 N = len(X) X_bin = np.zeros((N, max_features), dtype=np.int16) for i, X_i in enumerate(X): X_bin_i = np.bincount(X_i) X_bin[i, :len(X_bin_i)] = X_bin_i tfidf = transformer.fit_transform(X_bin) tfidf = np.asarray(tfidf.todense()) X_id = sequence.pad_sequences(X, maxlen=maxlen) X = np.zeros(X_id.shape, dtype=np.float32) for i in range(N): X[i, :] = tfidf[i][X_id[i]] else: raise ValueError('Unkown feature: ', feature) X = X[:,np.newaxis,:,np.newaxis] self.X = self.init_layout_X(X) self.y = self.init_layout_y(y)
def tf(train, test):
    """Transform feature vectors: TF"""
    trf = TfidfTransformer(use_idf=False)
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train, test
def tfidf(train, test):
    """Transform feature vectors: TFIDF"""
    trf = TfidfTransformer()
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train, test
def tfidf_raw(cnt_articles, article_ids, train_set_dict): # use the whole (Wiki) set as both the train and test set train_set = train_set_dict.values() test_set = train_set # train_set = ("The sky is blue.", "The sun is bright.") # test_set = ("The sun in the sky is bright.","We can see the shining sun, the bright sun.") vectorizer = CountVectorizer(stop_words="english") # instantiate vectorizer with English language, using stopwords and set min_df, max_df parameters and the tokenizer # vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.7, token_pattern=r'\b[a-zA-Z][a-zA-Z]+\b') # by appling the vectorizer instance to the train set # it will create a vocabulary from all the words that appear in at least min_df and in no more than max_df # documents in the train_set vectorizer.fit_transform(train_set) # print "Vocabulary:", vectorizer.vocabulary_ # vectorizer transform will apply the vocabulary from the train set to the test set. In my case, # they are the same set: whole Wikipedia. # this means that each article will get representation based on the words from the vocabulary and # their TF-IDF values in the Scipy sparse output matricx freq_term_matrix = vectorizer.transform(test_set) print freq_term_matrix.todense() # this is a log transformation as applied in (Gabrilovich, 2009), i.e., that is # how he defines TF values. In case of TF = 0, this shall not affect such value # freq_term_matrix.data = 1 + np.log( freq_term_matrix.data ) # instantiate tfidf trnasformer tfidf = TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=True) # print tfidf # tfidf uses the freq_term_matrix to calculate IDF values for each word (element of the vocabulary) tfidf.fit(freq_term_matrix) # print tfidf.idf_ # finally, tfidf will calculate TFIDF values with transform() tf_idf_matrix = tfidf.transform(freq_term_matrix) print # print tf_idf_matrix.todense() print # now we put our matrix to CSC format (as it helps with accessing columns for inversing the vectors to # words' concept vectors) CSC_matrix = tf_idf_matrix.tocsc() CSC_matrix = normalize(CSC_matrix, norm="l2", axis=0, copy=False) # we need vocabulary_ to be accessible by the index of the word so we inverse the keys and values of the # dictionary and put them to new dictionary word_index word_index = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems()) print word_index M, N = CSC_matrix.shape print "Articles: ", M print "Words: ", N return M, N, CSC_matrix, word_index
def preprocess(self):
    count_vect = CountVectorizer(stop_words='english')
    tokens = [self.tokenize(line) for line in self.data]
    count_vect.fit(tokens)
    self.count_vect = count_vect

    self.tfidf_vect = TfidfVectorizer(stop_words='english')
    self.tfidf_vect.fit(tokens)
    self.X_tfidfVect = self.tfidf_vect.transform(tokens)

    X_train_counts = count_vect.transform(tokens)
    self.X = X_train_counts

    tf_transformer = TfidfTransformer()
    tf_transformer.fit(X_train_counts)
    X_train_counts = tf_transformer.transform(X_train_counts)
    self.tf_transformer = tf_transformer
    self.X_tfidf = X_train_counts

    self.query_vector = self.tfidf_vect.transform([self.tokenize(self.query)])

    voc = count_vect.vocabulary_
    self.vocabulary = [(v, k) for k, v in voc.iteritems()]
    sorted_voc = sorted(self.vocabulary, key=lambda value: value[0], reverse=False)
    self.vocabulary = [k for v, k in sorted_voc]
def get_keyphrase_data(home, group, max_keyphrases, update=False): start = time.time() path = '%s/output/entity/%s/keyphrase-data.txt' % (home, group) fp = codecs.open(path, 'r', 'UTF-8') data = simplejson.load(fp) fp.close() nrow = data["next-mention-index"] ncol = data["next-kp-index"] path = '%s/output/entity/%s/kp-fof-sparse.npz' % (home, group) if update or not os.path.exists(path): print "Updating first-order features..." keyphrase_map = {} S = lil_matrix((nrow, ncol)) for mention_id, mention_index in data['mention-index-map'].iteritems(): keyphrases = sorted(data['mention-kps'][mention_id], key=lambda x: -x[1]) keyphrases = keyphrases[:max_keyphrases] keyphrase_map[mention_index] = keyphrases for kp_index, _, frequency in keyphrases: S[mention_index, kp_index] = 1.0 tfidf = TfidfTransformer(norm="l2") tfidf.fit(S) S = tfidf.transform(S) S = S.tocsc() sums = S.sum(axis=0) np.savez(path, S.data, S.indices, S.indptr, sums) fp = codecs.open('%s/output/entity/%s/keyphrase-map.json' % (home, group), 'w', 'UTF-8') simplejson.dump(keyphrase_map, fp, indent=4) fp.close() else: print "Loading keyphrase data..." npzfile = np.load(path) S = csc_matrix((npzfile['arr_0'], npzfile['arr_1'], npzfile['arr_2']), shape=(nrow, ncol)) sums = npzfile['arr_3'] fp = codecs.open('%s/output/entity/%s/keyphrase-map.json' % (home, group), 'r', 'UTF-8') keyphrase_map = simplejson.load(fp) fp.close() keyphrase_map = {int(key):value for key, value in keyphrase_map.iteritems()} finish = time.time() print '\ttook %0.3f s' % (finish-start) return data, keyphrase_map, S, sums
def get_matrices(good_deals, bad_deals, test_deals):
    """
    Return the training and testing matrices with labels
    """
    # Generate labels for good and bad deals
    labels = [0] * len(good_deals) + [1] * len(bad_deals)
    deals = good_deals + bad_deals
    # Instance of vectorizer that records counts of terms
    count_vectorizer = CountVectorizer()
    # Fit on the training data and transform training and testing data into count matrices
    train = count_vectorizer.fit_transform(deals)
    test = count_vectorizer.transform(test_deals)
    # Initialize TFIDF transformer, fit it on the training counts and
    # transform testing and training data to tf-idf matrices
    tfidf = TfidfTransformer()
    tfidf.fit(train)
    train_mat = tfidf.transform(train)
    test_mat = tfidf.transform(test)
    return train_mat.todense(), labels, test_mat.todense()
class TFIDF(object):
    def __init__(self, preprocessing=None, binary=False):
        super(TFIDF, self).__init__()
        if preprocessing is None:
            preprocessing = TextPreProcessing()
        self.preprocessing = preprocessing
        _ = lambda x: x
        self._cv = CountVectorizer(tokenizer=_, analyzer=_, preprocessor=_,
                                   binary=binary)
        self._tfidf = TfidfTransformer(norm=u'l2', use_idf=True,
                                       smooth_idf=True, sublinear_tf=False)

    def fit(self, documents):
        counts = self._cv.fit_transform(
            [self.preprocessing(d) for d in documents])
        self._tfidf.fit(counts)

    def transform(self, documents):
        counts = self._cv.transform([self.preprocessing(d) for d in documents])
        return self._tfidf.transform(counts)
def vectorize_data(train_data, test_data):
    global app_vocabulary
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vocabulary = [x.replace('.', '_') for x in list(app_vocabulary)]
    save_vocabulary(vocabulary, 'vocabulary.txt')
    # train_data=[x.replace('.','_') for x in train_data]
    # test_data=[x.replace('.','_') for x in test_data]
    count_v1 = CountVectorizer(vocabulary=vocabulary)
    # import pdb;pdb.set_trace()
    counts_train = count_v1.fit_transform(train_data)
    # print "the shape of train is "+repr(counts_train.shape)
    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)
    counts_test = count_v2.fit_transform(test_data)
    save_vocabulary(count_v2.get_feature_names(), 'vocabulary.txt')
    tfidftransformer = TfidfTransformer()
    tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train)
    tfidf_test = tfidftransformer.fit(counts_test).transform(counts_test)
    return tfidf_train, tfidf_test
def provide_idf_transformer_and_idf_dtm(vectorizer_params, count_dtm):
    """
    #TODO: update docu
    Takes count_dtm and transforms it to idf_dtm.
    :param count_dtm - scipy.sparse.csr.csr_matrix (n_samples, n_features)
    :return tfidf_transformer_fit - fitted TfidfTransformer
    :return dtm_idf - scipy.sparse.csr.csr_matrix (n_samples, n_features) with idf.
    """
    tfidf_transformer = TfidfTransformer(use_idf=True, sublinear_tf=False,
                                         norm=vectorizer_params[DEFAULT_NORM_KEY])
    tfidf_transformer_fit = tfidf_transformer.fit(count_dtm)
    # the document_term_matrix with idf*tf
    dtm_idf = tfidf_transformer_fit.transform(count_dtm)

    return tfidf_transformer_fit, dtm_idf
# "We can see the shining sun, the bright sun.") count_vectorizer = CountVectorizer(stop_words='english') count_vectorizer.fit_transform(docs) print "Vocabulary:", count_vectorizer.vocabulary_ print "size of terms", len(count_vectorizer.vocabulary_) freq_term_matrix = count_vectorizer.transform(docs) #print freq_term_matrix.todense() from sklearn.feature_extraction.text import TfidfTransformer tfidf = TfidfTransformer(norm="l2") tfidf.fit(freq_term_matrix) #print "IDF:", tfidf.idf_ tf_idf_matrix = tfidf.transform(freq_term_matrix) print len(tf_idf_matrix.todense()) #print tf_idf_matrix.todense()[1,2] #print type(count_vectorizer.vocabulary_.keys()[0]) #print count_vectorizer.vocabulary_.keys()[0].encode('ascii','replace') #+++++++++++++++++++numpy to json, too slow # results=[] # for index in range(0, len(keys)-550): # result = {} # result["business_id"]=keys[index] # for i in range(0,len(count_vectorizer.vocabulary_.keys())-35800):
def PredictionScoreLeaveOneOut(X, y, limit, columnName): from sklearn.metrics import f1_score from sklearn.metrics import precision_score from sklearn.metrics import recall_score from sklearn.svm import SVC, LinearSVC import matplotlib.pyplot as plt names = [ "Linear SVM", "Nearest Neighbors", "RBF SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes" ] # names = ["Linear SVM","Linear SVM","Linear SVM","Linear SVM"] classifiers = [ SVC(kernel="linear", C=0.025, probability=True), KNeighborsClassifier(3), SVC(gamma=2, C=1), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), AdaBoostClassifier(), GaussianNB() ] outFile = open('output.txt', 'a') vec = DictVectorizer() for name, clf in zip(names, classifiers): try: accuracy = 0.0 count = 0.0 total_accuracy = 0.0 total_f1 = 0.0 total_precision = 0.0 total_recall = 0.0 count = 1.0 from sklearn.model_selection import LeaveOneOut loo = LeaveOneOut() loo.get_n_splits(X) # print(loo) y_test_all = [] y_pred_all = [] accuracy_total = 0 count = 0 for train_index, test_index in loo.split(X): # print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] from sklearn.feature_extraction.text import CountVectorizer count_vect = CountVectorizer() X_train_fit = count_vect.fit(X_train) X_train_counts = X_train_fit.transform(X_train) X_test_counts = X_train_fit.transform(X_test) from sklearn.feature_extraction.text import TfidfTransformer tfidf_transformer = TfidfTransformer() fit = tfidf_transformer.fit(X_train_counts) X_train_tfidf = fit.transform(X_train_counts) X_test_tfidf = fit.transform(X_test_counts) X_train_counts = X_train_tfidf X_test_counts = X_test_tfidf try: clf.fit(X_train_counts.toarray(), y_train) accuracy_total += clf.score(X_test_counts.toarray(), y_test) count += 1 y_pred = clf.predict(X_test_counts.toarray()) # # binary_predictions = [x if x == 'good' else 0 for x in y_pred] # binary_predictions = [x if x == 0 else 1 for x in binary_predictions] # # binary_labels = [x if x == 'good' else 0 for x in y_test] # binary_labels = [x if x == 0 else 1 for x in binary_labels] y_pred_all.append(y_pred[0]) y_test_all.append(y_test[0]) except BaseException as b: print(b) f1 = f1_score(y_test_all, y_pred_all, average='weighted') precision = precision_score(y_test_all, y_pred_all, average='weighted') recall = recall_score(y_test_all, y_pred_all, average='weighted') print( str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" + str(accuracy_total / count) + "\t" + str(f1) + "\t" + str(precision) + "\t" + str(recall)) outFile.write( str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" + str(accuracy_total / count) + "\t" + str(f1) + "\t" + str(precision) + "\t" + str(recall) + "\n") # acc, f1,prc,rec = classify(clf,X_train,X_test,y_train,y_test) # # total_accuracy +=acc # total_f1 += f1 # total_precision += prc # total_recall += rec except BaseException as b: print(b) outFile.close()
    if labels[i] == 'ham':
        labels[i] = 0
        ham_index.append(i)
    elif labels[i] == 'spam':
        labels[i] = 1
        spam_index.append(i)
    else:
        print('UNIDENTIFIED LABEL AT INDEX: ' + str(i))

# count the occurrence of each word
CV = CountVectorizer()
TF = TfidfTransformer()

# fit and transform the features
features_count = CV.fit_transform(features)  # contains a count of each word in each sms
TF.fit(features_count)
features_tfidf = TF.transform(features_count)  # TfIdf representation

# convert to dense arrays in order to separate into test and training data and perform feature selection
features_count = features_count.toarray()
features_tfidf = features_tfidf.toarray()

# separate into training and testing data
num = round(rows / 20)
spam_num = len(spam_index)
ham_num = len(ham_index)

# select 10% of the rows for testing
random_ham_index = random.sample(range(0, ham_num), num)
random_spam_index = random.sample(range(0, spam_num), num)
hams = [ham_index[i] for i in random_ham_index]
with open(output, 'wb') as fd:
    pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
pass

os.makedirs(sys.argv[2], exist_ok=True)

# Generate train feature matrix
df_train = get_df(train_input)
train_words = np.array(df_train.text.str.lower().values.astype('U'))
bag_of_words = CountVectorizer(stop_words='english',
                               max_features=max_features,
                               ngram_range=(1, ngrams))
bag_of_words.fit(train_words)
train_words_binary_matrix = bag_of_words.transform(train_words)
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(train_words_binary_matrix)
train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
save_matrix(df_train, train_words_tfidf_matrix, train_output)

# Generate test feature matrix
df_test = get_df(test_input)
test_words = np.array(df_test.text.str.lower().values.astype('U'))
test_words_binary_matrix = bag_of_words.transform(test_words)
test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
save_matrix(df_test, test_words_tfidf_matrix, test_output)
    'OtherP'
]

X = data[features[0]]
Y = data['ope']
print "X", len(X)  # (48701, 55)
print "Y", len(Y)  # (48701, 1)

topic_train = X

''' Vectorizing X '''
vect = CountVectorizer()
vect.fit(topic_train)
topic_train_dtm = vect.transform(topic_train)
print "topic train dym shape", topic_train_dtm.shape

''' TFIDF Transform '''
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(topic_train_dtm)
topic_train_tfidf_dtm = tfidf.transform(topic_train_dtm)
print "topic_train_tfidf_dtm size", topic_train_tfidf_dtm.shape  # (48701, 96459)

''' Getting the LIWC features '''
lwic = data[features[1:]]
print "lwic.shape : ", lwic.shape
lwic_array = lwic.as_matrix()
lwic_array_numpy = np.array(lwic_array)  # (48701, 54)
print "lwic_array_numpy.shape : ", lwic_array_numpy.shape

''' Only TOPICS '''
X_matrix = topic_train_tfidf_dtm
print "X_matrix shape : ", X_matrix.shape  # (48701, 96513)
Y_matrix = np.array(Y).reshape(len(Y), 1)
print "Y_matrix shape : ", Y_matrix.shape  # (48701, 1)
def train(self, train_data_file, test_data_file, chain_tgt_fields, metadata_file=None): print('start train') raw_train = pickle.load(open(train_data_file, 'rb')) raw_test = pickle.load(open(test_data_file, 'rb')) print('get count matrix') train_x_count_matrix = get_count_matrix(raw_train['x'], raw_train['src_size']) test_x_count_matrix = get_count_matrix(raw_test['x'], raw_train['src_size']) classification_model_num = len(raw_train['y']) tfidf_transformer = TfidfTransformer() tfidf_transformer.fit(train_x_count_matrix) train_x = tfidf_transformer.transform(train_x_count_matrix) test_x = tfidf_transformer.transform(test_x_count_matrix) # train_y = binarize_label(raw_train['y'], raw_train['tgt_size']) # test_y = binarize_label(raw_test['y'], raw_train['tgt_size']) chains = [] choose_feature_list = [] train_chain_pred_history = sp.csr_matrix([]) feature_num = 20000 model_type = 'svm' for i in range(classification_model_num): chains.append(self.get_model(model_type)) chain_train_x = train_x print(chain_train_x.shape) for idx, tgt_field in enumerate(chain_tgt_fields): print(tgt_field) # chain.fit(chain_train_x, raw_train['y'][tgt_field]) # train_pred = chain.predict(chain_train_x) # train_pred_pro = sp.csr_matrix(chain.predict_proba(chain_train_x)) choose_feature = self.feature_slect(train_x, raw_train['y'][tgt_field], feature_num) if train_chain_pred_history.shape[1] != 0: chain_train_x = sp.hstack( [train_x[:, choose_feature], train_chain_pred_history]) else: chain_train_x = train_x[:, choose_feature] choose_feature_list.append(choose_feature) print(chain_train_x.shape) chains[idx], train_pred_pro = self.get_fit( chains[idx], model_type, chain_train_x, raw_train['y'][tgt_field], ) if train_chain_pred_history.shape[1] != 0: train_chain_pred_history = sp.hstack( [train_chain_pred_history, train_pred_pro]) else: train_chain_pred_history = train_pred_pro # chain_train_x = sp.hstack([chain_train_x, train_pred_pro]) # chain_train_x = sp.csr_matrix(chain_train_x) # print(train_pred) self.chains = chains self.chain_tgt_fields = chain_tgt_fields metadata = {'tfidf': tfidf_transformer, 'model': self} pickle.dump(metadata, open(metadata_file, 'wb')) chain_test_x = test_x test_chain_pred_history = sp.csr_matrix([]) for chain, tgt_field, choose_feature in zip(chains, chain_tgt_fields, choose_feature_list): # preds = chain.predict(chain_test_x) # test_pred_pro = sp.csr_matrix(chain.predict_proba(chain_test_x)) if test_chain_pred_history.shape[1] != 0: chain_test_x = sp.hstack( [test_x[:, choose_feature], test_chain_pred_history]) else: chain_test_x = test_x[:, choose_feature] test_pred_pro, test_pred = self.get_predict( chain, model_type, chain_test_x) if test_chain_pred_history.shape[1] != 0: test_chain_pred_history = sp.hstack( [test_chain_pred_history, test_pred_pro]) else: test_chain_pred_history = test_pred_pro # chain_test_x = sp.hstack([chain_test_x, test_pred_pro]) # chain_test_x = sp.csr_matrix(chain_test_x) print(tgt_field) print( metrics.classification_report(raw_test['y'][tgt_field], test_pred, digits=4))
def parse_group(group): group_id = '-' + group offset = 0 all_posts = [] r = requests.get( 'https://api.vk.com/method/wall.get', params={ 'owner_id': group_id, 'offset': offset, 'count': 10, 'access_token': 'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe', 'v': '5.95' }) posts = r.json()['response']['items'] all_posts.extend(posts) data_posts = [] likes_response = [] all_likes = [] for p in all_posts: data_posts.append(get_data(p)) r = requests.get( 'https://api.vk.com/method/likes.getList', params={ 'owner_id': group_id, 'offset': offset, 'type': 'post', 'item_id': p['id'], 'filter': 'likes', 'friends_only': 0, 'extended': 1, 'count': p['likes']['count'], 'access_token': 'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe', 'v': '5.95' }) likes_response.extend(r.json()['response']['items']) for like_response in likes_response: like = Like(group_id, like_response['id'], like_response['type'], like_response['first_name'], like_response['last_name']) all_likes.append(like) write_likes_json(all_likes, group_id) write_posts_json(data_posts, group_id) my_stop_words = get_stop_words('ru') vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=my_stop_words) X = vectorizer.fit_transform([data_post.text for data_post in data_posts]) idf = vectorizer.idf_ #*************** cv = CountVectorizer(max_df=0.85, stop_words=my_stop_words, max_features=10000) word_count_vector = cv.fit_transform( [data_post.text for data_post in data_posts]) tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True) tfidf_transformer.fit(word_count_vector) feature_names = cv.get_feature_names() #all keywords keywords = [] morph = pymorphy2.MorphAnalyzer() #generate tf-idf for the given document for data_post in data_posts: tf_idf_vector = tfidf_transformer.transform( cv.transform([data_post.text])) #sort the tf-idf vectors by descending order of scores sorted_items = sort_coo(tf_idf_vector.tocoo()) #extract only the top n; n here is 1 results = extract_topn_from_vector(feature_names, sorted_items, 1) result = '' if results: result = next(iter(results)) if result != '' and not result.isdigit(): result = morph.parse(result)[0].normal_form if len(result) > 2: keyword = KeyWord(data_post.id, result, 1) keywords.append(keyword) return data_posts, keywords
print("Using the quora data set...") print("Total data set size" , len(test.question1)) print("Training data set size" , len(test.question1)//18) for i in range(len(test.question1)//18): train_set.append(test.question1[i]) train_set.append(test.question2[i]) vectorizer = CountVectorizer(stop_words = stopWords) transformer = TfidfTransformer() print("Count vector is built") trainVectorizerArray = vectorizer.fit_transform(train_set).toarray() transformer.fit(trainVectorizerArray) print("Tf-Idf vector is learnt from count vector") output = [] test_set = [] for i in range(10000): test_set = [] test_set.append(test.question1[i]) test_set.append(test.question2[i]) testVectorizerArray = vectorizer.transform(test_set).toarray() tfidf = transformer.transform(testVectorizerArray) similarilty = 1 - spatial.distance.cosine((tfidf.todense())[0][0], (tfidf.todense())[1][0]) output.append([similarilty, test.is_duplicate[i]]) for i in range(1,51):
scaler = StandardScaler()
tfidf = TfidfTransformer(norm=None)
dense = Data_Utils.DenseTransformer()

for train, test in skf.split(CU_X, Y):
    # train split
    CU_train_data = CU_X[train]
    train_labels = Y[train]

    # test split
    CU_eval_data = CU_X[test]
    eval_labels = Y[test]

    # tf-idf
    tfidf.fit(CU_train_data)
    CU_train_data = dense.transform(tfidf.transform(CU_train_data))
    CU_eval_data = dense.transform(tfidf.transform(CU_eval_data))

    # standardization
    scaler.fit(CU_train_data)
    CU_train_data = scaler.transform(CU_train_data)
    CU_eval_data = scaler.transform(CU_eval_data)

    # normalization
    CU_train_data = normalize(CU_train_data)
    CU_eval_data = normalize(CU_eval_data)

    train_data = CU_train_data
    eval_data = CU_eval_data
def query(Category, category_obj): base_url = 'http://export.arxiv.org/api/query?' search_query = Category query = 'search_query=%s&max_results=30&sortBy=submittedDate&sortOrder=descending' % ( search_query) feedparser._FeedParserMixin.namespaces[ 'http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch' feedparser._FeedParserMixin.namespaces[ 'http://arxiv.org/schemas/atom'] = 'arxiv' with libreq.urlopen(base_url + query) as url: response = url.read() feed = feedparser.parse(response) #date = the_date corpus_entry = [] exists = False count = 0 all_dates = [] for entry in feed.entries: # print (entry.title + " " + entry.published + "\n") #if entry.published[0:10] == date: print('entery published ', entry.published, type(entry.published)) date_ = entry.published[:10] date = datetime.strptime(date_, '%Y-%m-%d').date() all_dates.append(date) corpus_entry.append(entry) #xists = True # If date does not exist just returns most recent articles # if exists == False: # if count == 0: # count = 1 # if entry.published[0:10] == date: # return False # else: # date = entry.published[0:10] # if entry.published[0:10] == date: # corpus_entry.append(entry) for paper in corpus_entry: paper.summary = prePro(paper.summary.lower()) stop_Words = stop_words.ENGLISH_STOP_WORDS # Dictionary here corpusSumm = [] for paper in corpus_entry: corpusSumm.append(paper.summary) cv = CountVectorizer(max_df=.85, stop_words=stop_Words) word_count_vector = cv.fit_transform(corpusSumm) tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True) tfidf_transformer.fit(word_count_vector) feature_names = cv.get_feature_names() i = 0 d_i = 0 for paper in corpus_entry: i += 1 tf_idf_vector = tfidf_transformer.transform( cv.transform([paper.summary])) sorted_items = sort_coo(tf_idf_vector.tocoo()) keywords = extract_topn_from_vector(feature_names, sorted_items) top3_sentences = [] top3_scores = [] top3_breakdown = [] for sentence in paper.summary.split("."): # how many scores are higher in top3 than this sentence, if all 3, delete and replace, otherwise delete lowest higherScores = 0 sentenceTotal = 0 theSentence = [] breakdown = [] index = 0 # keep track of words / sentence to get average score word_count = 0 for word in sentence.split(" "): # Add up all of the tf_idf scores if word.lower() in keywords: sentenceTotal = sentenceTotal + keywords[word.lower()] breakdown.append(keywords[word.lower()]) # Average by word theSentence.append(word.lower()) breakdown.append("0") word_count = word_count + 1 sentenceTotal = sentenceTotal / word_count min_score = 1000 # print (theSentence,sentenceTotal,word_count) # get index of min score and append if should if top3_sentences: # print (top3_scores) if len(top3_scores) == 3: for idx, score in enumerate(top3_scores): if score > sentenceTotal: higherScores = higherScores + 1 elif score < min_score: index = idx min_score = score if higherScores < 3: del top3_sentences[index] del top3_scores[index] top3_sentences.append(sentence) top3_scores.append(sentenceTotal) top3_breakdown.append(breakdown) else: top3_sentences.append(sentence) top3_scores.append(sentenceTotal) top3_breakdown.append(breakdown) else: top3_sentences.append(sentence) top3_scores.append(sentenceTotal) top3_breakdown.append(breakdown) three_sentences = {} k = 0 for sentence in top3_sentences: # if sentence == '': # print(sentence) # else: # if sentence[0] == ' ': # print(sentence[1:] + "\n") # else: # print(sentence + "\n") three_sentences[k] = {"sentence": sentence} k += 1 obj, created = 
    Articles.objects.get_or_create(link=paper.link,
                                   defaults={
                                       'title': paper.title,
                                       'sentence': three_sentences,
                                       'category': category_obj,
                                       'date': all_dates[d_i]
                                   })
    print('obj created ', created, ' date', all_dates[d_i])
    d_i += 1
top3_words = get_top_three_words(dt['text'], n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns = ["Tri-gram", "Freq"]
print(top3_df)

# Barplot of most freq Tri-grams
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# read test docs into a dataframe and concatenate title and body
df_test = dataframe[15000:]
df_test['text'] = df_test['Title'] + df_test['Body']
df_test['text'] = df_test['text'].apply(lambda x: cleanse_text(x))

# get test docs into a list
docs_test = df_test['text'].tolist()


def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def apply_tf_idf(X):
    transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
    transformer.fit(X)
    return transformer
def PredictionScoreLeaveOneOutSpecifyClassifier(X, y, limit, columnName, classifierNames, classifiers): from sklearn.metrics import f1_score from sklearn.metrics import precision_score from sklearn.metrics import recall_score from sklearn.svm import SVC, LinearSVC import matplotlib.pyplot as plt names = classifierNames outFile = open('output.txt', 'a') vec = DictVectorizer() for name, clf in zip(names, classifiers): try: accuracy = 0.0 count = 0.0 total_accuracy = 0.0 total_f1 = 0.0 total_precision = 0.0 total_recall = 0.0 count = 1.0 from sklearn.model_selection import LeaveOneOut loo = LeaveOneOut() loo.get_n_splits(X) # print(loo) y_test_all = [] y_pred_all = [] accuracy_total = 0 count = 0 for train_index, test_index in loo.split(X): # print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] from sklearn.feature_extraction.text import CountVectorizer count_vect = CountVectorizer() X_train_fit = count_vect.fit(X_train) X_train_counts = X_train_fit.transform(X_train) X_test_counts = X_train_fit.transform(X_test) # from sklearn.feature_extraction.text import TfidfTransformer tfidf_transformer = TfidfTransformer() fit = tfidf_transformer.fit(X_train_counts) X_train_tfidf = fit.transform(X_train_counts) X_test_tfidf = fit.transform(X_test_counts) X_train_counts = X_train_tfidf X_test_counts = X_test_tfidf try: clf.fit(X_train_counts.toarray(), y_train) accuracy_total += clf.score(X_test_counts.toarray(), y_test) count += 1 y_pred = clf.predict(X_test_counts.toarray()) # # binary_predictions = [x if x == 'good' else 0 for x in y_pred] # binary_predictions = [x if x == 0 else 1 for x in binary_predictions] # # binary_labels = [x if x == 'good' else 0 for x in y_test] # binary_labels = [x if x == 0 else 1 for x in binary_labels] y_pred_all.append(y_pred[0]) y_test_all.append(y_test[0]) except BaseException as b: print(b) f1 = f1_score(y_test_all, y_pred_all, average='weighted') precision = precision_score(y_test_all, y_pred_all, average='weighted') recall = recall_score(y_test_all, y_pred_all, average='weighted') print( str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" + str(accuracy_total / count) + "\t" + str(f1) + "\t" + str(precision) + "\t" + str(recall)) outFile.write( str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" + str(accuracy_total / count) + "\t" + str(f1) + "\t" + str(precision) + "\t" + str(recall) + "\n") # acc, f1,prc,rec = classify(clf,X_train,X_test,y_train,y_test) # # total_accuracy +=acc # total_f1 += f1 # total_precision += prc # total_recall += rec except BaseException as b: print(b) outFile.close()
print()
print('TF-IDF Embedding Using Scikit-learn')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer()
tf = cv.fit_transform(texts)

# Vocabulary
vocabulary = cv.get_feature_names()
print(vocabulary[:10])
print(vocabulary[-10:])

tt = TfidfTransformer()
tt.fit(tf)
tf_idf = tt.transform(tf)

print()
print('tf matrix')
print(tf_idf.shape)
print(tf_idf.toarray())

tv = TfidfVectorizer()
tf_idf_2 = tv.fit_transform(texts)

print()
print('tf-idf matrix')
print(tf_idf_2.toarray())

"""import random

len_query = 5
def tfidf_process(data):
    from sklearn.feature_extraction.text import TfidfTransformer
    transformer = TfidfTransformer()
    transformer = transformer.fit(data)
    return transformer
dataFrame.at[i, "AuthorName"] = dataFrame_final.at[i, "AuthorName"] dataFrame = dataFrame.reindex(np.random.permutation(dataFrame.index)) ######################### ######################## print("\n\n", "::::::>>>>>>> NAIVE <<<<<<<::::::") #Naive + Vectorizer X_train, X_test, y_train, y_test = train_test_split(dataFrame["Content"], dataFrame["AuthorName"], test_size=0.2, random_state=1) count_vect = CountVectorizer() fiter = count_vect.fit(X_train) X_train_counts = fiter.transform(X_train) tfidf_transformer = TfidfTransformer() fiter2 = tfidf_transformer.fit(X_train_counts) X_train_tfidf = fiter2.transform(X_train_counts) clf = MultinomialNB().fit(X_train_tfidf, y_train) print(">>> Naive Score:") print(clf.score(fiter2.transform(fiter.transform(X_test)), y_test)) #Cross Validation X = dataFrame['Content'] y = dataFrame['AuthorName'] scores1 = cross_val_score(clf, fiter2.transform(fiter.transform(X)), y, cv=10) print(">>> Cross Validation Score:") print(scores1) ########################
def generateLazyLoadForModel2(useIntegrated, category, platform, uniqueFileConvention, dataFileConvention, test_size=defaultTestSize): if os.path.isfile('data/' + category + '/' + dataFileConvention + '_dataset.csv'): print(dataFileConvention + '_dataset.csv' + ' already generated only shuffling right now') with open( 'data/' + category + '/' + dataFileConvention + '_dataset.csv', 'r') as f: l = list(csv.reader(f)) firstHeadersRow = l[0] l = l[1:] random.shuffle(l) with open( 'data/' + category + '/' + dataFileConvention + '_dataset.csv', 'w') as f: csv.writer(f).writerows([firstHeadersRow] + l) return print(dataFileConvention + '_dataset.csv' + ' not found') print(dataFileConvention + '_dataset.csv' + ' generating') probs = [] if len(generateLazyLoad.probs) == 0: generateLazyLoadForModel2.probs = get_all_probs_without_category_NA( useIntegrated, platform) probs = generateLazyLoadForModel2.probs random.shuffle(probs) train_set = tuple([prob.modified_description for prob in probs]) prob_class = [] for prob in probs: prob_class.append(1.0 if category in prob.category else 0.0) print 'Test Size: ' + str((test_size)) print 'Total: ' + str((len(probs))) print 'Train Set Length: ' + str(len(train_set)) timeStart = time.time() if os.path.isfile(PlatformType.platformString[platform] + '_tfidfMatrix_' + '.pickle'): print('Loading tfidf matrix from pickle') with open( PlatformType.platformString[platform] + '_tfidfMatrix_' + '.pickle', 'rb') as f: tf_idf_matrix = pickle.load(f) else: print('Building tfidf matrix and dumping in pickle') count_vectorizer = CountVectorizer(stop_words='english') count_vectorizer.fit_transform(train_set) freq_term_matrix = count_vectorizer.transform(train_set) tfidf = TfidfTransformer(norm="l2") tfidf.fit(freq_term_matrix) tf_idf_matrix = tfidf.transform(freq_term_matrix) with open( PlatformType.platformString[platform] + '_tfidfMatrix_' + '.pickle', 'wb') as f: pickle.dump(tf_idf_matrix, f) # print(str(tf_idf_matrix)) numpyAr = tf_idf_matrix.toarray() np.set_printoptions(threshold='nan') print 'Tfidf Feature Size: ' + str(len(numpyAr[0]) + 1) list_of_features_needed = [] keepPercentage = 0.05 if os.path.isfile(PlatformType.platformString[platform] + '_list_of_features_aftertfidf_' + str(keepPercentage) + '.pickle'): print('List of features after tfidf already found, reading from there') with open( PlatformType.platformString[platform] + '_list_of_features_aftertfidf_' + str(keepPercentage) + '.pickle', 'rb') as f: list_of_features_needed = pickle.load(f) else: print('Making List of features after tfidf and dumping as pickle') for cat in categories: list_of_features_needed = list_of_features_needed + get_categorywise_features( numpyAr, cat, probs, keepPercentage) list_of_features_needed = list(set(list_of_features_needed)) list_of_features_needed.sort() with open( PlatformType.platformString[platform] + '_list_of_features_aftertfidf_' + str(keepPercentage) + '.pickle', 'wb') as f: pickle.dump(list_of_features_needed, f) # print(str(list_of_features_needed)) numpyArrayList = [] for i in range(len(numpyAr)): newRowList = [] for j in list_of_features_needed: newRowList.append(numpyAr[i][j]) numpyArrayList = numpyArrayList + [np.array(newRowList)] numpyAr = np.array(numpyArrayList) print(numpyAr.shape) # Applying SVD to reduce features # svd = TruncatedSVD(int(0.1*len(numpyAr[0] + 1))) # lsa = make_pipeline(svd, Normalizer(copy = False)) # reduced_lsa_features = lsa.fit_transform(tf_idf_matrix) # numpyAr = reduced_lsa_features # print 'Reduced Feature Size: ' + 
str(len(numpyAr[0]) + 1) print('Time taken for generating data: ' + str(time.time() - timeStart)) if not os.path.exists('data/' + category): os.makedirs('data/' + category) print('Currently on cat: ' + str(category)) with open('data/' + category + '/' + dataFileConvention + '_dataset.csv', 'w') as f: print('Writing ' + 'data/' + category + '/' + dataFileConvention + '_dataset.csv') writer = csv.writer(f) i = 0 for row in numpyAr: writer.writerow(list(row) + [prob_class[i]]) i += 1
post = " ".join(post) return post my_data['processed_text'] = my_data.title + my_data.body my_data['processed_text'] = my_data.processed_text.apply(text_processing) doc = my_data['processed_text'].tolist() #lista coloana "processed_text cv = CountVectorizer(analyzer='word', stop_words='english', max_df=0.85, ngram_range=(1, 2)) word_count = cv.fit_transform(my_data['processed_text']) tfidf_transf = TfidfTransformer() tfidf_transf.fit(word_count) tf_idf_matrix = tfidf_transf.transform(word_count).toarray() dict_sent = {} for doc_nr in range(0, len(tf_idf_matrix)): df_tfidf = pd.DataFrame(tf_idf_matrix[doc_nr], index=cv.get_feature_names(), columns=['tf_idf_scores' ]).sort_values(by=['tf_idf_scores'], ascending=False).head(3) dict_sent[doc_nr] = df_tfidf.index.values.tolist() df_results = pd.DataFrame(columns=['Title', 'Top 3 topics']) df_results['Title'] = my_data['title'].values df_results['index'] = df_results.reset_index().index df_results['Top_3_topics'] = df_results['index'].map(dict_sent)
def tf_idf(X):
    tfidfTransformer = TfidfTransformer(use_idf=True)
    tfidfTransformer.fit(X)
    X = tfidfTransformer.transform(X, copy=True)
    return X
documents = [data[d]['plot'] for d in range(l)]
titles = [data[d]['title'] for d in range(l)]

LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
stemmer = nltk.stem.porter.PorterStemmer()
# print LemVectorizer.vocabulary_

tf_matrix = LemVectorizer.transform(documents).toarray()
# print tf_matrix
# print tf_matrix.shape

from sklearn.feature_extraction.text import TfidfTransformer

tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
# print tfidfTran.idf_

import math


def idf(n, df):
    result = math.log((n + 1.0) / (df + 1.0)) + 1
    return result

# print "The idf for terms that appear in one document: " + str(idf(4,1))
# print "The idf for terms that appear in two documents: " + str(idf(4,2))

tfidf_matrix = tfidfTran.transform(tf_matrix)
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 3),
    max_df=0.9261187281287935,
    min_df=4
)
corpus_data_fitted = vectorizer.fit(X_train_full)
pickle.dump(corpus_data_fitted.vocabulary_,  # save vocabulary for future
            open('models/vocabulary.p', 'wb'))  # predictions
corpus_data_features = corpus_data_fitted.transform(X_train_full)

# Transform count matrix to a normalized tf-idf representation
tfidf_transformer = TfidfTransformer()
corpus_data_tfidf_fitted = tfidf_transformer.fit(corpus_data_features)
pickle.dump(corpus_data_tfidf_fitted,  # save fitted Tfidf for future
            open('models/corpus_data_tfidf_fitted.p', 'wb'))  # predictions
corpus_data_features_tfidf = corpus_data_tfidf_fitted\
    .transform(corpus_data_features)

# TRAIN CLASSIFIER
print("Training Logistic Regression classifier... "
      "This takes around 10 minutes!")
clf_logreg = LogisticRegression(max_iter=100, n_jobs=-1, C=3.41)\
    .fit(corpus_data_features_tfidf, y_train_full)

# SAVE MODEL
print("Saving the trained classifier in models/ folder...")
class SklearnGridSearch: """ Run grid search with selected sklearn classifiers """ grid = { "RandomForestClassifier" : { "criterion": ["gini", "entropy"], "max_depth": [5, 10, 20, 40, 100, 200], "min_samples_leaf": [1, 2, 4, 8] }, "MLPClassifier" : { "activation": ["logistic", "relu"], "hidden_layer_sizes": [(10,), (20, ), (50, )] } } def __init__(self, classifier_type :str, parameters: Dict, verbose=False): """ Initialize the classifier :param classifier_type: The name of the classifiers :param grid: Parameter settings to check :param verbose: verbosity true or false """ self.verbose = verbose if classifier_type not in self.grid: raise Exception("Unsupported classifier type {0}. Use one of {1}".format(classifier_type, self.grid.keys())) if classifier_type == "RandomForestClassifier": self.sklearn_classifier = RandomForestClassifier() elif classifier_type == "MLPClassifier": self.sklearn_classifier = MLPClassifier() self.parameters = parameters self.classifier_type = classifier_type self.count_vectorizer = CountVectorizer(min_df=10, max_df=0.8, ngram_range=(1, 1)) self.tfidf_transformer = TfidfTransformer(use_idf=True) def grid_search(self, training_data: str, text_label: str, class_label: str) -> Dict: """ Train the classifier :param training_data: File name. Training data is one json per line :param text_label: Json field which contains the text :param class_label: Json field which contains the label for the classes to train :return: Nothing """ """ Train the algorithm with the data from the knowledge graph """ data_train = create_data_table_from_training_file(training_data, text_label, class_label, 10000) print("INFO: grid evaluation with {0} data points".format(len(data_train))) data_train = data_train.fillna(0) matrix_train_counts = self.count_vectorizer.fit_transform(data_train.text) self.tfidf_transformer = self.tfidf_transformer.fit(matrix_train_counts) matrix_train_tf = self.tfidf_transformer.transform(matrix_train_counts) matrix_train_tf = matrix_train_tf.toarray() grid_search = GridSearchCV(self.sklearn_classifier, self.parameters, n_jobs=10) grid_search.fit(matrix_train_tf, data_train.label) print(grid_search.best_params_) return grid_search.best_params_
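A hypothetical invocation of the class above, assuming a JSONL training file with "text" and "label" fields (the file name is illustrative; the class's built-in grid is reused as the parameter dictionary):

searcher = SklearnGridSearch("RandomForestClassifier",
                             SklearnGridSearch.grid["RandomForestClassifier"])
best_params = searcher.grid_search("training_data.jsonl", "text", "label")
print(best_params)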
qualified_descriptions = getDescriptionsFromSheet(qualified_sheet)
qualified_clean_descriptions = cleanUp(qualified_descriptions)

disqualified_sheet = getSheet('input/disqualified.xlsx')
disqualified_descriptions = getDescriptionsFromSheet(disqualified_sheet)
disqualified_clean_descriptions = cleanUp(disqualified_descriptions, True)

X = qualified_clean_descriptions + disqualified_clean_descriptions
Y = createLabelArray(qualified_clean_descriptions, disqualified_clean_descriptions)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

vectorizer.fit(X_train)
X_train, X_test = transformByVectorizer(X_train, X_test, vectorizer)
tfidf_transformer.fit(X_train)
X_train, X_test = transformByTfIdf(X_train, X_test, tfidf_transformer)

# gnb, gnb_score = runGaussianNB(X_train, X_test, Y_train, Y_test)
forest_score, forest = runForest(X_train, X_test, Y_train, Y_test)
print 'Random Forest score: ', forest_score
# print 'Random Gaussian Naive Bayes score: ', gnb_score

with open('forest', 'wb') as f:
    cPickle.dump(forest, f)
# with open('gnb', 'wb') as fi:
#     cPickle.dump(gnb, fi)
with open('vectorizer', 'wb') as file:
    cPickle.dump(vectorizer, file)  # assumed completion, mirroring the 'forest' dump above
# In[13]:

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# In[14]:

count_v1 = CountVectorizer(max_df=0.4, max_features=3000)  # only 3000 features, to keep memory usage down
counts_train = count_v1.fit_transform(train_data_list)
print("the shape of train is " + repr(counts_train.shape))

tfidftransformer = TfidfTransformer()
tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train)
tfidf_ndarray = tfidf_train.toarray()

# ### Cluster with K-Means

# In[15]:

from sklearn.cluster import KMeans

# In[16]:

kmeans = KMeans(n_clusters=10, random_state=0).fit(tfidf_ndarray)
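A short follow-up sketch for inspecting the clustering: cluster sizes from kmeans.labels_, and the highest-weighted terms of each centroid (Counter and the top-terms loop are assumptions, not part of the notebook):

import numpy as np
from collections import Counter

print(Counter(kmeans.labels_))  # documents per cluster

terms = count_v1.get_feature_names_out()  # use get_feature_names() on older sklearn
for i, center in enumerate(kmeans.cluster_centers_):
    top = np.argsort(center)[::-1][:5]
    print(i, [terms[j] for j in top])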
def test_vectorizer(): # raw documents as an iterator train_data = iter(ALL_FOOD_DOCS[:-1]) test_data = [ALL_FOOD_DOCS[-1]] n_train = len(ALL_FOOD_DOCS) - 1 # test without vocabulary v1 = CountVectorizer(max_df=0.5) counts_train = v1.fit_transform(train_data) if hasattr(counts_train, 'tocsr'): counts_train = counts_train.tocsr() assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2) # build a vectorizer v1 with the same vocabulary as the one fitted by v1 v2 = CountVectorizer(vocabulary=v1.vocabulary_) # compare that the two vectorizer give the same output on the test sample for v in (v1, v2): counts_test = v.transform(test_data) if hasattr(counts_test, 'tocsr'): counts_test = counts_test.tocsr() vocabulary = v.vocabulary_ assert_equal(counts_test[0, vocabulary["salad"]], 1) assert_equal(counts_test[0, vocabulary["tomato"]], 1) assert_equal(counts_test[0, vocabulary["water"]], 1) # stop word from the fixed list assert_false("the" in vocabulary) # stop word found automatically by the vectorizer DF thresholding # words that are high frequent across the complete corpus are likely # to be not informative (either real stop words of extraction # artifacts) assert_false("copyright" in vocabulary) # not present in the sample assert_equal(counts_test[0, vocabulary["coke"]], 0) assert_equal(counts_test[0, vocabulary["burger"]], 0) assert_equal(counts_test[0, vocabulary["beer"]], 0) assert_equal(counts_test[0, vocabulary["pizza"]], 0) # test tf-idf t1 = TfidfTransformer(norm='l1') tfidf = t1.fit(counts_train).transform(counts_train).toarray() assert_equal(len(t1.idf_), len(v1.vocabulary_)) assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_))) # test tf-idf with new data tfidf_test = t1.transform(counts_test).toarray() assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_))) # test tf alone t2 = TfidfTransformer(norm='l1', use_idf=False) tf = t2.fit(counts_train).transform(counts_train).toarray() assert_equal(t2.idf_, None) # test idf transform with unlearned idf vector t3 = TfidfTransformer(use_idf=True) assert_raises(ValueError, t3.transform, counts_train) # test idf transform with incompatible n_features X = [[1, 1, 5], [1, 1, 0]] t3.fit(X) X_incompt = [[1, 3], [1, 3]] assert_raises(ValueError, t3.transform, X_incompt) # L1-normalized term frequencies sum to one assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train) # test the direct tfidf vectorizer # (equivalent to term count vectorizer + tfidf transformer) train_data = iter(ALL_FOOD_DOCS[:-1]) tv = TfidfVectorizer(norm='l1') assert_false(tv.fixed_vocabulary) tv.max_df = v1.max_df tfidf2 = tv.fit_transform(train_data).toarray() assert_array_almost_equal(tfidf, tfidf2) # test the direct tfidf vectorizer with new data tfidf_test2 = tv.transform(test_data).toarray() assert_array_almost_equal(tfidf_test, tfidf_test2) # test transform on unfitted vectorizer with empty vocabulary v3 = CountVectorizer(vocabulary=None) assert_raises(ValueError, v3.transform, train_data) # ascii preprocessor? v3.set_params(strip_accents='ascii', lowercase=False) assert_equal(v3.build_preprocessor(), strip_accents_ascii) # error on bad strip_accents param v3.set_params(strip_accents='_gabbledegook_', preprocessor=None) assert_raises(ValueError, v3.build_preprocessor) # error with bad analyzer type v3.set_params = '_invalid_analyzer_type_' assert_raises(ValueError, v3.build_analyzer)
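One of the properties the test above checks is that CountVectorizer followed by TfidfTransformer matches the one-step TfidfVectorizer when their parameters agree. A minimal stand-alone version of that equivalence check:

import numpy as np
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
                                              TfidfVectorizer)

docs = ["pizza with extra cheese", "salad with tomato and water"]
counts = CountVectorizer().fit_transform(docs)
two_step = TfidfTransformer(norm='l1').fit_transform(counts).toarray()
one_step = TfidfVectorizer(norm='l1').fit_transform(docs).toarray()
np.testing.assert_array_almost_equal(two_step, one_step)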
def createFeatureVectorsLazy(self, input_dir, label, chunks_record, flat_list=False, chunks=True): print("in createFeatureVectorsLazy") function_words_vectorizer = CountVectorizer(token_pattern=r"[a-z]*'[a-z]*|(?u)\b\w\w+\b|...|.|!|\?|\"|\'", vocabulary=self.function_words_list) POS_vectorizer = CountVectorizer(ngram_range=(3, 3), vocabulary=self.POS_trigrams_list, stop_words=None, tokenizer=BinaryNLIClassifier.Tokenize, lowercase=False) function_word_feature_vector = [] POS_feature_vector = [] text = [] for f in os.listdir(input_dir): with open(os.path.join(input_dir, f), 'rb') as fh: # print("file = {}".format(f)) text.append(pickle.load(fh)) # text = pickle.load(fh) if flat_list: text = [item for sublist in text for item in sublist] text = [item for sublist in text for item in sublist] shuffle(text) try: text = [s.split(BEGIN_SENTENCE)[1] for s in text] except: print(text) # text = [s.split(END_SENTENCE)[0] for s in text] chunk = 0 token_counter = 0 lemmas = {} pos = {} lemmas[chunk] = [] pos[chunk] = [] pos_seq = "" lemmas_seq = "" for sentence in text: sentence = sentence.strip() if len(sentence) > 0 and chunks and token_counter >= CHUNK_SIZE: lemmas[chunk].append(lemmas_seq) pos[chunk].append(pos_seq) chunks_record.append("{}_{}".format(os.path.basename(os.path.normpath(input_dir)), chunk)) chunk += 1 pos[chunk] = [] lemmas[chunk] = [] token_counter = 0 pos_seq = "" lemmas_seq = "" token_counter += len(sentence.split(" ")) # print(token_counter) pos_seq += BEGIN_SENTENCE + " " for token in sentence.split(" "): try: lemma_pos = token.split(SEPERATOR) lemmas_seq += lemma_pos[0] + " " pos_seq += lemma_pos[1] + " " lemmas_seq += ' ' except: continue pos_seq += END_SENTENCE + ' ' if not chunks: lemmas[chunk].append(lemmas_seq) pos[chunk].append(pos_seq) chunks_record.append("{}_{}".format(f, chunk)) chunk += 1 for index in range(chunk): if len(lemmas[index]) == 0: continue # print(index) function_word_feature_vector.append(function_words_vectorizer.fit_transform(lemmas[index])) POS_feature_vector.append(POS_vectorizer.fit_transform(pos[index])) with open(os.path.join(TOEFL_DEBUG, "{}_{}".format(f, index)), 'w', encoding='utf-8') as chunk_out: for sent in lemmas[index]: # sent = (sent.split(END_SENTENCE)[0]).strip() chunk_out.write(sent + "\n") # print(pos[index]) # print(POS_feature_vector[index]) # print(lemmas[index]) # print(function_word_feature_vector[index]) # print(len(function_word_feature_vector)) # break print("chunk = {}".format(chunk)) result_func_words = vstack(function_word_feature_vector) result_POS_trigrams = vstack(POS_feature_vector) final_feature_vecotr_structure = hstack([result_func_words, result_POS_trigrams], format='csr') # final_feature_vecotr_structure = result_POS_trigrams # final_feature_vecotr_structure = result_func_words tf_idf_transformer = TfidfTransformer() tf_idf_transformer.fit(final_feature_vecotr_structure) tf_idf_transformer.transform(final_feature_vecotr_structure) labels = [label] * final_feature_vecotr_structure.shape[0] return final_feature_vecotr_structure, labels
                         encoding='utf-8').read().split('\n')
all_text = train_texts + test_texts

print('(2) doc to var...')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

count_v0 = CountVectorizer()
counts_all = count_v0.fit_transform(all_text)
count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_train = count_v1.fit_transform(train_texts)
print("the shape of train is " + repr(counts_train.shape))
count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_test = count_v2.fit_transform(test_texts)
print("the shape of test is " + repr(counts_test.shape))

tfidftransformer = TfidfTransformer()
train_data = tfidftransformer.fit(counts_train).transform(counts_train)
# reuse the transformer fitted on the training counts so train and test share the same idf weights
test_data = tfidftransformer.transform(counts_test)

x_train = train_data
y_train = train_labels
x_test = test_data
y_test = test_labels

print('(3) Naive Bayes...')
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

clf = MultinomialNB(alpha=0.01)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
num = 0
preds = preds.tolist()
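The snippet is cut off after converting the predictions to a list; a small sketch of the accuracy computation that loop appears to be heading towards (variable names follow the snippet, the rest is assumed):

correct = sum(1 for pred, true in zip(preds, y_test) if pred == true)
print('accuracy: %f' % (correct / float(len(y_test))))

# or, equivalently, with the metrics module already imported above:
print(metrics.accuracy_score(y_test, preds))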
class SequenceBagOfWordsSVMClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params]): """ BBN D3M Naive Sequence Classifier Arguments: hp_seed ... hp_splice ... """ __author__ = 'BBN' __metadata__ = { "common_name": "Discontinuity-based segmentation", "algorithm_type": ["Bayesian"], "original_name": "bbn_primitives.time_series.SequenceBagOfWordsSVMClassifier", "task_type": ["Modeling"], "learning_type": ["Supervised learning"], "compute_resources": { "sample_size": [], "sample_unit": [], "disk_per_node": [], "expected_running_time": [], "gpus_per_node": [], "cores_per_node": [], "mem_per_gpu": [], "mem_per_node": [], "num_nodes": [], }, "handles_regression": False, "handles_classification": False, "handles_multiclass": False, "handles_multilabel": False, } def __init__( self, hp_seed: int = 0, hp_splice: int = 0, ): super().__init__() np.random.seed(hp_seed) self.hp_splice = hp_splice self.training_inputs = None self.training_outputs = None self.tfidf = None self.vocab = None self.model = None self.fitted = False def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: self.training_inputs = inputs self.training_outputs = outputs self.fitted = False def fit(self, *, timeout: float = None, iterations: int = None) -> None: """ Arguments - inputs: List(d3m_ndarray) """ if self.fitted: return if self.training_inputs is None or self.training_outputs is None: raise Exception('Missing training data') with stopit.ThreadingTimeout(timeout) as timer: # if self.hp_splice > 1: # spliced_data = list() # for cinput in self.training_inputs: # framer = SignalFramer( # sampling_rate = 1, # frame_length_s = self.hp_splice, # frame_shift_s = 1, # flatten_output = True, # ) # cdata = frames.produce([cinput])[0] # spliced_data.append(cdata) # else: # spliced_data = self.training_inputs self.vocab = seq_vocab(self.training_inputs) train_x = seq_to_tokenfreq_csr(self.training_inputs, self.vocab) train_y = self.training_outputs self.tfidf = TfidfTransformer(norm='l1') train_x_tfid = self.tfidf.fit(train_x).transform(train_x) # Build a classification model svm = sklearn.svm.SVC(probability=True) svm.fit(train_x_tfid, train_y) self.model = svm self.fitted = True if timer.state == timer.EXECUTED: return else: raise TimeoutError( 'SequenceBagOfWordsSVMClassifier exceeded time limit') def produce(self, inputs: Inputs, timeout: float = None, iterations: int = None) -> Outputs: """ Arguments: - inputs: List(d3m_ndarray) Returns: - List(int) """ with stopit.ThreadingTimeout(timeout) as timer: x = seq_to_tokenfreq_csr(inputs, self.vocab) train_x_tfid = self.tfidf.transform(x) pred = self.model.predict(train_x_tfid) outputs = [cpred for cpred in pred] if timer.state == timer.EXECUTED: return outputs else: raise TimeoutError( 'SequenceBagOfWordsSVMClassifier exceeded time limit') def get_params(self) -> Params: return Params(coefficient=self.model.coef_) def set_params(self, *, params: Params) -> None: self.model.coef_ = params.coefficient
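The modelling core of the primitive above, stripped of the D3M plumbing, is a token-frequency matrix passed through an L1-normalised TfidfTransformer and into an SVM. A sketch on toy data; seq_vocab / seq_to_tokenfreq_csr are this project's own helpers, so a plain CountVectorizer stands in for them, and probability estimation is left at its default here:

import sklearn.svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

train_seqs = ["a b a c", "c c d", "a a b", "d d c"]   # toy token sequences
train_y = [0, 1, 0, 1]

vec = CountVectorizer(token_pattern=r"\S+")           # keep single-character tokens
train_x = vec.fit_transform(train_seqs)

tfidf = TfidfTransformer(norm='l1')
train_x_tfid = tfidf.fit(train_x).transform(train_x)

svm = sklearn.svm.SVC()
svm.fit(train_x_tfid, train_y)
print(svm.predict(tfidf.transform(vec.transform(["a a b"]))))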
def Train():
    CU_X, Y = create_Preturbed_Dataset(inputFile='CASIS-25_CU.txt')

    fold_accuracy = []
    for repeat in range(1):  # was 10 initially
        # ----------------------------- Classifiers ------------------------
        # Multilayer Perceptron
        mlp = MLPClassifier(hidden_layer_sizes=(95, 25), activation='relu',
                            max_iter=1000)

        # Data Manipulation, Preprocessing, Training and Testing
        # 10-fold cross-validation with shuffling
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
        scaler = StandardScaler()
        tfidf = TfidfTransformer(norm=None)
        dense = DenseTransformer()

        for train, test in skf.split(CU_X, Y):
            # train split
            CU_train_data = CU_X[train]
            train_labels = Y[train]
            # test split
            CU_eval_data = CU_X[test]
            eval_labels = Y[test]

            # tf-idf
            tfidf.fit(CU_train_data)
            CU_train_data = dense.transform(tfidf.transform(CU_train_data))
            CU_eval_data = dense.transform(tfidf.transform(CU_eval_data))

            # standardization
            scaler.fit(CU_train_data)
            CU_train_data = scaler.transform(CU_train_data)
            CU_eval_data = scaler.transform(CU_eval_data)

            # normalization
            CU_train_data = normalize(CU_train_data)
            CU_eval_data = normalize(CU_eval_data)

            train_data = CU_train_data
            eval_data = CU_eval_data

            # training
            mlp.fit(train_data, train_labels)

            # evaluation
            mlp_acc = mlp.score(eval_data, eval_labels)
            print('MLP Accuracy = ', mlp_acc)

    # ---------------------------------------------------------------------------
    # Save the Trained Models Now
    path = "Trained_Models/"
    dump(mlp, open(path + 'mlp.pkl', 'wb'))
    print('***************************')
    print('Trained and Saved the Model')
    print('***************************')
    return mlp
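The per-fold fit/transform bookkeeping above can also be packaged as a Pipeline so cross-validation handles it automatically. A sketch under the assumption that CU_X is a dense feature-count matrix; the FunctionTransformer step simply replaces the custom DenseTransformer:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer, FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

pipe = Pipeline([
    ('tfidf', TfidfTransformer(norm=None)),
    ('dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ('scale', StandardScaler()),
    ('norm', Normalizer()),
    ('mlp', MLPClassifier(hidden_layer_sizes=(95, 25), max_iter=1000)),
])
scores = cross_val_score(pipe, CU_X, Y,
                         cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0))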
def run_classifer(X_train, s_train, y_train, X_test, s_test, y_test): s_train = np.array(s_train) # samples x features s_test = np.array(s_test) num_labels = 15 batch_size = 100 stemmer = sb.SnowballStemmer('english') swlist = sw.words('english') swlist += [stemmer.stem(w) for w in swlist] swlist += [ "'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe', 'dure', 'might', 'must', "n't", 'need', 'onc', 'onli', 'ourselv', 'sha', 'themselv', 'veri', 'whi', 'wo', 'would', 'yourselv' ] #complained about not having these as stop words pubs = [ 'buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart', 'fox', 'guardian', 'review', 'theatlant' ] punct = [ ] #[':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?','—', '‘', '$'] #gonna leave these in for now swlist += pubs swlist += punct if sys.argv[4].lower() == 'true': tkzr = StemTokenizer() else: tkzr = None if sys.argv[5].lower() != 'true': swlist = [] #what features are we using? if sys.argv[7].lower() == 'word': count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr) count_vect.fit(X_train) X_train = count_vect.transform(X_train) X_test = count_vect.transform(X_test) tfidf_transformer = TfidfTransformer() tfidf_transformer.fit(X_train) X_train = tfidf_transformer.transform(X_train) X_test = tfidf_transformer.transform(X_test) elif sys.argv[7].lower() == 'topic': count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr) count_vect.fit(X_train) X_train = count_vect.transform(X_train) X_test = count_vect.transform(X_test) lda_model = LatentDirichletAllocation(n_components=10) lda_model.fit(X_train) X_train = lda_model.transform(X_train) X_test = lda_model.transform(X_test) elif sys.argv[7].lower() == 'style': X_train = csr_matrix(s_train) X_test = csr_matrix(s_test) elif sys.argv[7].lower() == 'all': count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr) count_vect.fit(X_train) X_train = count_vect.transform(X_train) X_test = count_vect.transform(X_test) tfidf_transformer = TfidfTransformer() tfidf_transformer.fit(X_train) X_train_tf = tfidf_transformer.transform(X_train) X_test_tf = tfidf_transformer.transform(X_test) print(type(X_train_tf)) lda_model = LatentDirichletAllocation(n_components=10) lda_model.fit(X_train) X_train_lda = lda_model.transform(X_train) X_test_lda = lda_model.transform(X_test) print(type(X_train_lda)) X_train = csr_matrix( sparse.hstack( [X_train_tf, csr_matrix(X_train_lda), csr_matrix(s_train)])) X_test = csr_matrix( sparse.hstack( [X_test_tf, csr_matrix(X_test_lda), csr_matrix(s_test)])) print(type(X_train)) # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train) # sparse.save_npz("X_test" + sys.argv[6] + ".npz", X_test) else: sys.exit('unknown features') encoder = LabelBinarizer() encoder.fit(y_train) y_train = encoder.transform(y_train) y_test = encoder.transform(y_test) # np.save('X_train.npy', X_train) # np.save('X_test.npy', X_test) # np.save('y_train.npy', y_train) # np.save('y_test.npy', y_test) # sparse.save_npz("y_train" + sys.argv[6] + ".npz", y_train) # sparse.save_npz("y_test" + sys.argv[6] + ".npz", y_test) # load everything back # X_train = sparse.load_npz("X_train.npz") input_dim = X_train.shape[1] model = Sequential() model.add(Dense(512, input_shape=(input_dim, ))) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(512)) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(num_labels)) model.add(Activation('softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='adam', 
metrics=['accuracy']) history = model.fit(X_train, y_train, batch_size=batch_size, epochs=5, verbose=1, validation_split=0.1) # model.model.save(sys.argv[6] + '.h5') # X_train = np.load('X_train.npy') # X_test = np.load('X_test.npy') # y_train = np.load('y_train.npy') # y_test = np.load('y_test.npy') # model = keras.models.load_model(sys.argv[6] + '.h5') score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1) print('Test accuracy:', score[1]) y_pred = model.predict(X_test, batch_size=batch_size, verbose=1) predicted = np.argmax(y_pred, axis=1) p, r, fs, s = precision_recall_fscore_support(np.argmax(y_test, axis=1), predicted) print(p, r, fs, s)
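The 'all'-features branch of run_classifer above stacks tf-idf weights, LDA topic proportions and hand-crafted style features into a single sparse matrix. A compact, self-contained sketch of that combination pattern on toy inputs (the real feature extraction is omitted):

import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["economy and markets", "football season opens", "markets rally again"]
style = np.array([[12.0, 0.3], [9.0, 0.1], [11.0, 0.4]])  # e.g. sentence length, punctuation rate

counts = CountVectorizer().fit_transform(docs)
X_tf = TfidfTransformer().fit_transform(counts)
X_lda = LatentDirichletAllocation(n_components=2, random_state=0).fit_transform(counts)

# one row per document: tf-idf block, topic block, style block
X_all = csr_matrix(sparse.hstack([X_tf, csr_matrix(X_lda), csr_matrix(style)]))
print(X_all.shape)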