def cal_product_title_tfidf():

    #PART I compute the tf-idf for product title
    print "\nBegin computing the tf-idf for product title ..."

    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x: stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_title = CountVectorizer(stop_words='english', binary=True)  # use binary values to indicate presence
    search_vect_title.fit(product_title)  # learn the vocabulary
    search_title_fq_matrix = search_vect_title.transform(search_term)  # get the (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit(product_title)  # learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title)  # get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix)  # get idf for each vocabulary term
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix)  # get the tf-idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = []  # compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_title_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'], "product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)

    return 0
def vectorize_data(quote_list, vectorizer=None, Tfidf=True, min_df=1, ngram_range=(1, 2), token_pattern=r'\b\w\w+\b'):
    '''
    Vectorizes the given data using the desired vectorizer object.

    Input:
            quote_list: list of data to vectorize
            vectorizer : CountVectorizer object (optional)
                    A CountVectorizer object to use. If None,
                    then create and fit a new CountVectorizer.
                    Otherwise, re-fit the provided CountVectorizer
                    using the provided data.
    Output:
            numpy array (dims: nreview, nwords)
                    Bag-of-words representation for each quote.
    '''
    # if no vectorizer was passed, declare a vectorizer object
    if vectorizer is None:
        if not Tfidf:
            vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range, token_pattern=token_pattern)
        else:
            vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range, token_pattern=token_pattern)

    # build the vectorizer vocabulary
    vectorizer.fit(quote_list)
    # transform into bag of words
    X = vectorizer.transform(quote_list)

    return X.tocsc()
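# A minimal usage sketch for vectorize_data above. The sample quotes are
# invented for illustration; the function itself assumes CountVectorizer and
# TfidfVectorizer are imported in this module.
sample_quotes = ["the cat sat", "the dog barked", "the cat barked"]
X = vectorize_data(sample_quotes, Tfidf=True, ngram_range=(1, 1))
print(X.shape)  # (3, n_features) sparse CSC matrix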
class ACMClassificator(BaseACMClassificator): def __init__(self): self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize) self.mlb = MultiLabelBinarizer() self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, class_weight=None), n_jobs=-1 ) def _prepare_problems(self, problems): return self.vectorizer.transform([p.statement for p in problems]) def fit(self, problems, tags): nltk.download('punkt', quiet=True) self.vectorizer.fit([p.statement for p in problems]) mat = self._prepare_problems(problems) self.mlb = self.mlb.fit(tags) self.classificator.fit(mat.toarray(), self.mlb.transform(tags)) def predict(self, problems): mat = self._prepare_problems(problems) predicted = self.classificator.predict(mat.toarray()) return self.mlb.inverse_transform(predicted)
def get_vectorizer(article_texts, max_features=50000): vectorizer = CountVectorizer(ngram_range=(1,2), stop_words="english", min_df=2, token_pattern=r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+", binary=False, max_features=max_features) vectorizer.fit(article_texts) return vectorizer
def trainModel(test_data):
    predictions = dict()
    outcome_list = ('DE', 'LT', 'HO', 'DS', 'CA', 'RI', 'OT')
    for o in outcome_list:
        info, outcome = loadData('Outcomes' + '/' + o + '.txt')
        # split the data into training and test sets
        train, test, labels_train, labels_test = train_test_split(info, outcome, test_size=0.33)
        counter = CountVectorizer()
        counter.fit(train)
        # count how often each term appears in a document and turn each doc into a count vector
        counts_train = counter.transform(train)      # transform the training data
        counts_test = counter.transform(test_data)   # transform the new data
        # build a classifier on the training data
        LR = LogisticRegression()
        LR.fit(counts_train, labels_train)
        # use the classifier to predict on new data
        predicted = LR.predict(counts_test)
        # determine prediction results
        if 1 in predicted:
            flag = 'yes'
        else:
            flag = 'no'
        predictions[o] = flag  # store the result for each outcome
    return predictions
def main():
    global tweetdata
    for d in tweetdata.find({}, {'_id': 1, 'id': 1, 'text': 1}):
        res = mecab_analysis(unicodedata.normalize('NFKC', d['text']))
        for k in res.keys():
            if k == '形容詞':  # adjectives
                adjective_list = []
                for w in res[k]:
                    adjective_list.append(w)
                    freq[w] += 1
                tweetdata.update({'_id': d['_id']}, {'$push': {'adjective': {'$each': adjective_list}}})
            elif k == '動詞':  # verbs
                verb_list = []
                for w in res[k]:
                    verb_list.append(w)
                    freq[w] += 1
                tweetdata.update({'_id': d['_id']}, {'$push': {'verb': {'$each': verb_list}}})
            elif k == '名詞':  # nouns
                noun_list = []
                for w in res[k]:
                    noun_list.append(w)
                    freq[w] += 1
                tweetdata.update({'_id': d['_id']}, {'$push': {'noun': {'$each': noun_list}}})
        tweetdata.update({'_id': d['_id']}, {'$set': {'mecabed': True}})

    ret_all = get_mecabed_strings()
    tw_list_all = ret_all['tweet_list']
    c_vec = CountVectorizer(stop_words=[u"寿司"])
    c_vec.fit(tw_list_all)
    c_terms = c_vec.get_feature_names()
    transformed = c_vec.transform(tw_list_all)
    arg_ind = np.argsort(transformed.toarray())[0][:-50:-1]

    # build a list (not a generator) so it can be both written to CSV and printed
    top_words = [(k, freq[k]) for k in sorted(freq, key=freq.get, reverse=True)[0:100]]
    write_to_csv(top_words)
    for k, v in top_words:
        print(k + '\t\t\t' + str(v))
def naive_bayes(x_value, y_value): X = x_value y = y_value #train/test split X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123) vect = CountVectorizer() vect.fit(X_train) X_train_dtm = vect.transform(X_train) X_test_dtm = vect.transform(X_test) from sklearn.naive_bayes import MultinomialNB nb = MultinomialNB() nb.fit(X_train_dtm, y_train) y_pred_class = nb.predict(X_test_dtm) print 'Accuracy: ' print metrics.accuracy_score(y_test, y_pred_class) print 'Null Accuracy: ' print y_test.value_counts().head(1) / len(y_test) print 'Confusion Matrix: ' print metrics.confusion_matrix(y_test, y_pred_class)
def train_vectorizer(corpus, max_features=10000): """ Train the vectorizer """ print "training the vectorizer..." vectorizer = CountVectorizer(decode_error='ignore', max_features=max_features) vectorizer.fit(corpus) print "ok" return vectorizer
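# Hypothetical call to train_vectorizer above; the corpus contents are
# invented placeholders.
corpus = ["first document here", "second document here"]
vec = train_vectorizer(corpus, max_features=100)
print vec.transform(corpus).shape  # (2, n_features)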
def vectorize_in_test(col_name):
    v = CountVectorizer(tokenizer=my_tokenizer, stop_words=None, strip_accents="unicode")
    vv = CountVectorizer(tokenizer=my_tokenizer, stop_words=None, strip_accents="unicode")
    v.fit(train_data[col_name])
    vv.fit(test_data[col_name])
    # words seen in the train column but never in the test column
    stop = [w for w in v.vocabulary_ if w not in vv.vocabulary_]
    return stop
def fit(x, y, estimator, dataframe, params): vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True) vectorizer.fit(dataframe[x].values) fresh_estimator = clone(estimator) x_np, y_np, feature_names, selector = \ select_features( df = dataframe, vectorizer=vectorizer, feature_col=x, label_col=y, select_method=None, continuous_col=None ) estimator = RandomizedSearchCV(estimator, params, n_iter=60, cv=3, n_jobs=3, refit=True) estimator.fit(x_np, y_np) best_params = estimator.best_params_ if method not in ['lr', 'svm']: print("Calibrating...") estimator = CalibratedClassifierCV(fresh_estimator.set_params(**best_params), 'isotonic', 3) estimator.fit(x_np, y_np) from sklearn.base import _pprint _pprint(estimator.get_params(deep=True), offset=2) return estimator, selector, vectorizer
def find_common_words(all_words, num_most_frequent_words): vectorizer = CountVectorizer( stop_words=None, # 'english', max_features=num_most_frequent_words, binary=True) vectorizer.fit(all_words) return (vectorizer.vocabulary_, vectorizer.get_feature_names())
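# Example call to find_common_words with a toy input (invented): keep the
# three most frequent tokens and inspect the learned vocabulary.
vocab, names = find_common_words(
    ["spam spam eggs", "spam ham", "eggs ham ham"],
    num_most_frequent_words=3)
print(names)  # e.g. ['eggs', 'ham', 'spam']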
def main():
    print 'Opening ZIP file'
    zin = zipfile.ZipFile(config.html_cleaned_zip, 'r')
    filenames = zin.namelist()
    filenames = filenames[0:10]
    filenames.sort()

    print 'Reading ZIP file'
    ordering = {n: i for i, n in enumerate(filenames)}
    cv = CountVectorizer(stop_words=config.common_words, input='file', dtype=np.float32)

    print 'Learning vocabulary'
    # input='file' expects file-like objects, so open each archive member
    cv.fit(zin.open(n, 'r') for n in filenames)
    vocabulary = cv.vocabulary_

    print 'Generating word vectors'
    # re-open the members: the handles above were consumed during fit
    docmat1 = cv.transform(zin.open(n, 'r') for n in filenames)

    print 'Generating TF-IDF word vectors'
    docmat2 = TfidfTransformer().fit_transform(docmat1)

    print 'Writing output'
    with open(config.html_config, 'w') as pf:
        pickle.dump((filenames, ordering, vocabulary), pf, pickle.HIGHEST_PROTOCOL)
    np.savez(config.doc_mat, plain=docmat1, tfidf=docmat2)
class punctuation_ngrams_fe(feature_extractor): def __init__(self, config_file): super(punctuation_ngrams_fe, self).__init__(config_file) self.token_pattern = u'[,;\.:?!¿¡]+' self.ngram_x = 2 self.ngram_y = 2 def train(self, authors): documents = [self.db.get_author(a)["corpus"] for a in authors] documents = utils.flatten(documents) self.ngram_vectorizer = \ CountVectorizer(ngram_range=(self.ngram_x, self.ngram_y),\ token_pattern=self.token_pattern,\ analyzer='word') self.ngram_vectorizer.fit(documents) # use only normalized term frequencies self.transformer = TfidfTransformer(use_idf=False) def compute_features(self, author): freq = self.ngram_vectorizer.transform(author["corpus"]) freq = freq.toarray().astype(int) # normalized ngram frequencies norm_freq = self.transformer.fit_transform(freq).toarray() # average normalized frequencies among all author documents norm_freq = np.divide(np.sum(norm_freq, axis=0), len(norm_freq)) ngrams = self.ngram_vectorizer.get_feature_names() for id_ngram, (ngram, value) in enumerate(zip(ngrams, norm_freq)): author = self.db.set_feature(author, "Ngram::punct::" + ngram, value) return author
def featTransform(sents_train, sents_test): cv = CountVectorizer() cv.fit(sents_train) print(cv.get_params()) features_train = cv.transform(sents_train) features_test = cv.transform(sents_test) return features_train, features_test, cv
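# Usage sketch for featTransform; the sentences are placeholders. Test
# documents are encoded over the training vocabulary only, so unseen words
# ("plot" here) are dropped.
sents_train = ["good movie", "bad movie"]
sents_test = ["good plot"]
features_train, features_test, cv = featTransform(sents_train, sents_test)
print(features_test.toarray())  # counts over the training vocabulary only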
def company_search(company):
    # OAuth credentials redacted; supply your own application keys here
    CONSUMER_KEY = 'YOUR_CONSUMER_KEY'
    CONSUMER_SECRET = 'YOUR_CONSUMER_SECRET'
    OAUTH_TOKEN = 'YOUR_OAUTH_TOKEN'
    OAUTH_TOKEN_SECRET = 'YOUR_OAUTH_TOKEN_SECRET'
    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
    twitter_api = twitter.Twitter(auth=auth)
    search_results = twitter_api.search.tweets(q=company, count=10000)
    date_status = [(datetime.datetime.strptime(re.sub('\+0000 ', '', status['created_at']), '%a %b %d %H:%M:%S %Y').date(), status['text'])
                   for status in search_results['statuses']]
    date_string_dict = {}
    for date, text in date_status:
        if date in date_string_dict:
            date_string_dict[date] = date_string_dict[date] + text
        else:
            date_string_dict[date] = text
    vectorizer = CountVectorizer(min_df=0)
    vectorizer.fit(date_string_dict.values())
    bag_matrix = vectorizer.transform(date_string_dict.values())
    bag_matrix = sparse.csc_matrix(bag_matrix)
    return date_string_dict, bag_matrix
def build_classifier(df_curated, df_all): vec = CountVectorizer(tokenizer=pre_process) vec.fit(df_all.tweet) bagofwords = vec.transform(df_curated.tweet) bagofwords = bagofwords.toarray() clf = MultinomialNB().fit(bagofwords, df_curated['class']) return vec, clf
def prep_train_evaluate(docs_train, docs_test, labs_train, labs_test, **kwargs): '''func to prep text, extract features, train model, predict, evaluate''' # instantiate vectorizer + classifier vectorizer = CountVectorizer(token_pattern=r'\b[a-zA-Z0-9_<>]{1,}\b', **kwargs) classifier = LogisticRegression(solver='liblinear') # construct feature matrices for train and test sets vectorizer.fit(docs_train) X_train = vectorizer.transform(docs_train) X_test = vectorizer.transform(docs_test) # fit/train classifier using train features and labels classifier.fit(X_train, labs_train) # generate test set model predictions from test matrix preds_test = classifier.predict(X_test) # measure performance using simple accuracy (proportion correct) accuracy = accuracy_score(labs_test, preds_test) # print lil message showing param settings + performance print(f' >> test set accuracy: {accuracy:.3f}\n({kwargs})\n') # return classifier, vectorizer, predictions, and score for inspection return {'clf': classifier, 'vect': vectorizer, 'preds': preds_test, 'acc': accuracy}
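# One way to drive prep_train_evaluate over a few vectorizer settings.
# docs_train/docs_test/labs_train/labs_test are assumed to exist in the
# caller's scope; the kwargs go straight to CountVectorizer.
for settings in [{'ngram_range': (1, 1)}, {'ngram_range': (1, 2), 'min_df': 2}]:
    result = prep_train_evaluate(docs_train, docs_test,
                                 labs_train, labs_test, **settings)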
def test_countvectorizer_custom_vocabulary(): what_we_like = ["pizza", "beer"] vect = CountVectorizer(vocabulary=what_we_like) vect.fit(JUNK_FOOD_DOCS) assert_equal(set(vect.vocabulary), set(what_we_like)) X = vect.transform(JUNK_FOOD_DOCS) assert_equal(X.shape[1], len(what_we_like))
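# A fixed vocabulary also means unseen tokens are silently dropped at
# transform time; a small standalone check (the document text is made up):
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(vocabulary=["pizza", "beer"])
X = vect.transform(["pizza beer pizza sushi"])
print(X.toarray())  # [[2 1]] -- "sushi" is ignored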
def preprocesar(labeled, unlabeled, dims, stop_words=None):
    """Preprocess labeled and unlabeled data into vectorized form."""
    instances = []
    labels = []

    for v_l in labeled.values():
        instances += v_l['X']
        labels += v_l['y']

    if unlabeled is not None:
        for v_ul in unlabeled.values():
            instances += v_ul['X']

    x_cv = CountVectorizer(max_features=dims, ngram_range=(1, 2), binary=True, stop_words=stop_words)
    x_cv.fit(instances)

    y_cv = CountVectorizer()
    y_cv.fit(labels)

    print "\nLabels:"
    for etiqueta, valor in y_cv.vocabulary_.items():
        print "\tLabel: %s - Value: %d" % (etiqueta, valor)
    print ""

    for d_l in labeled:
        labeled[d_l]['X'] = x_cv.transform(labeled[d_l]['X'])
        labeled[d_l]['y'] = y_cv.transform(labeled[d_l]['y'])

    if unlabeled is not None:
        for d_ul in unlabeled:
            unlabeled[d_ul]['X'] = x_cv.transform(unlabeled[d_ul]['X'])

    return labeled, unlabeled
def fit(self, X, y, min_df=0.005, max_df=0.8, *args, **kwargs):
    # Train the model using the training sets, honoring the min_df/max_df arguments
    vect = CountVectorizer(min_df=min_df, max_df=max_df, max_features=4500,
                           ngram_range=(2, 2))
    vect.fit(X)
    self.bivect = CountVectorizer(ngram_range=(2, 2), vocabulary=vect.vocabulary_)
    super(TlinReg, self).fit(vect.transform(X), y, *args, **kwargs)
    return self
def createSparsMatrix(featureDict, tupledTweets, flag): # print features tples = tupledTweets m = len(tples) tweets = [] yValues = np.empty((m,)) for i, line in enumerate(tples): yValues[i,] = int(line[0] == "true") tweets.append(line[1]) vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 3), max_features=10000) if flag == 1: features = read_words(featureDict) vectorizer.fit(features) else: vectorizer.fit(featureDict) # print vectorizer.get_feature_names() xValues = vectorizer.transform(tweets) # print vectorizer.vocabulary_.get('high') # print xValues.toarray() return xValues, yValues, vectorizer
class ACMClassificator(BaseACMClassificator): def __init__(self): self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize) self.classificator = RandomForestClassifier(n_estimators=256, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=-1, class_weight=None) def _prepare_problems(self, problems): return self.vectorizer.transform([p.statement for p in problems]) def fit(self, problems, tags): nltk.download('punkt', quiet=True) self.vectorizer.fit([p.statement for p in problems]) mat = self._prepare_problems(problems) self.classificator.fit(mat.toarray(), tags) def predict(self, problems): mat = self._prepare_problems(problems) return self.classificator.predict(mat.toarray())
class lang_detector():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000,
                                          preprocessor=self._remove_noise)

    # strip #hashtags, @mentions and links before vectorizing
    def _remove_noise(self, document):
        noise_pattern = re.compile("|".join([r"http\S+", r"\@\w+", r"\#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        return self.classifier.score(self.features(X), y)
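# Hypothetical end-to-end use of lang_detector; the training rows are invented.
docs = ["this is english text", "ceci est un texte français",
        "this one is english too", "encore du texte français"]
langs = ["en", "fr", "en", "fr"]
det = lang_detector()
det.fit(docs, langs)
print(det.predict("another english sentence"))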
def main(train_file, test_file): #print "loading data.." csv.field_size_limit(1310720) trainreader = csv.reader (open( '/home/kiran/kdd/train.csv' )) projectid, traindata_old = zip (*trainreader) testreader = csv.reader (open ('/home/kiran/kdd/test.csv')) projectid, testdata_old = zip (*testreader) # remove stopwords traindata = [] testdata = [] for observation in traindata_old: traindata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False)) for observation in testdata_old: testdata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False)) tfv = CountVectorizer (binary=1,ngram_range=(1, 1)) X_all = traindata + testdata lentrain = len(traindata) tfv.fit(X_all) X_all = tfv.transform(X_all) X = X_all[:lentrain] X_test = X_all[lentrain:] scipy.io.mmwrite ('x_train_bin_1gram.mtx', X, field = 'real') scipy.io.mmwrite ('x_test_bin_1gram.mtx', X_test, field = 'real') myCols = tfv.get_feature_names () myCols = DataFrame (myCols) myCols.to_csv ('bin_1gram.csv', index=False)
def fit(self, X, y, min_df=0.005, max_df=0.8, *args, **kwargs):
    # Train the model using the training sets, honoring the min_df/max_df arguments
    vect = CountVectorizer(stop_words='english', min_df=min_df, max_df=max_df,
                           max_features=4500, ngram_range=(2, 2))
    vect.fit([e['text'] for e in X])
    self.vocabulary_ = vect.vocabulary_
    super(TlinReg, self).fit(vect.transform(e['text'] for e in X), y, *args, **kwargs)
    return self
def cal_product_description_tfidf():

    #PART II compute the tf-idf for product description
    global AllSet  # reassigned below, so it must be declared global
    print "\nBegin computing the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into the database..."
    AllSet = pd.merge(AllSet, product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description = AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)  # use binary values to indicate presence
    search_vect_descrip.fit(product_description)  # learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term)  # get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words='english')
    description_vect.fit(product_description)  # learn the vocabulary
    description_fq_matrix = description_vect.transform(product_description)  # get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix)  # get idf for each vocabulary term
    tf_idf_descrip_matrix = tfidf_transformer.transform(description_fq_matrix)  # get the tf-idf matrix

    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result = []  # compute the result of tf-idf for product description
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'], "product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
class Featurizer(object):
    def __init__(self):
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        num_chars = len(comment.get("body"))
        num_links = count_links(comment.get("body"))
        simple_tokens = comment.get("body").split(' ')
        num_words = 0
        avg_word_length = 0
        for token in simple_tokens:
            num_words += 1
            avg_word_length += len(token)
        avg_word_length = float(avg_word_length) / float(num_words)
        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(comment.get("body")))
        return [num_chars, num_links, num_words, num_words,
                avg_word_length, sentiment]

    def transform_comment(self, comment):
        return numpy.hstack((
            numpy.array([self.text_features(comment)], dtype='float_'),
            self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, and the final column corresponds to the
        scores of each comment. """
        # if it's a single instance, return an array
        if isinstance(comments, dict):
            return self.transform_comment(comments)

        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            return numpy.hstack((
                self.transform_comment(comment),
                numpy.array([[self.score_comment(comment)]], dtype='float_')))

        return numpy.vstack([features_and_label(c) for c in comments])
class BiGramPreProcessor(PreProcessor): def __init__(self, url_list=None, vocab=None): self.stemmer = RSLPStemmer() self.vectorizer = CountVectorizer(preprocessor=self.stemmer.stem, tokenizer=tokenizer_with_numeric, ngram_range=(1,2)) if url_list is not None: self.fit_vocab(url_list) else: self.vectorizer.vocabulary_ = vocab self.vocab_size = len(self.vectorizer.vocabulary_) def fit_vocab(self, url_list): text_generator = url2text_generator(url_list) self.vectorizer.fit(text_generator) def url_to_bow(self, url): print url text_generator = url2text_generator([url]) sparse_matrix = self.vectorizer.transform(text_generator) return [(sparse_matrix.indices[i], value) for i, value in enumerate(sparse_matrix.data)] def idf(self, term_id): return None def dict_from_idf(self, idf_path): return None
def generatePredictingModel(data):
    """
        Build the prediction model (based on the data set we have) in order
        to be able to predict the category of a new video from the user input.
        Return a classifier able to predict the category of a video based on
        its title and description.
    """
    try:
        # Initialize a timer to compute the time to build the model
        start = time.time()

        # Split into train-test data set
        X = data[[x for x in data.columns if x in ('title', 'description')]]
        Y = data[[x for x in data.columns if x in ('video_category_id',)]]
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.80, random_state=10)

        # Build the 2 text corpus
        corpus_title = X_train['title'].values.tolist()
        corpus_description = X_train['description'].values.tolist()

        # initialize the 2 vectorizers
        count_vectorizer_title = CountVectorizer()
        count_vectorizer_description = CountVectorizer()

        # learn the 2 vocabulary dictionaries
        count_vectorizer_title.fit(corpus_title)
        count_vectorizer_description.fit(corpus_description)

        # Build the sparse matrices
        X_train_count_title = count_vectorizer_title.transform(X_train['title'])
        X_train_count_description = count_vectorizer_description.transform(X_train['description'])
        X_test_count_title = count_vectorizer_title.transform(X_test['title'])
        X_test_count_description = count_vectorizer_description.transform(X_test['description'])

        # Set and train the models (for title and description features)
        model_count_title = BernoulliNB()
        model_count_description = BernoulliNB()
        model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
        model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

        # Merge the title and description predictions and build a new
        # prediction based on these 2 predictions combined
        new_df_train = pd.DataFrame()
        new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
        new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
        new_df_test = pd.DataFrame()
        new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
        new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)
        tree = DecisionTreeClassifier()
        tree.fit(new_df_train, Y_train)

        end = time.time()
        execution_time = end - start
        print "Time to build this model: {} seconds".format(execution_time)
        time.sleep(3)

        return tree, model_count_title, model_count_description, count_vectorizer_title, count_vectorizer_description
    except:
        raise VideoAnalysisException(" Error while creation of predictive model ")
def __init__(self, subset, n_character_deleted=1):
    assert subset in ['train', 'valid', 'test']
    twenty_news_groups = fetch_20newsgroups(subset=subset)
    count_vect = CountVectorizer()
    count_vect.fit(twenty_news_groups.data)
    # materialize the keys as a list so random.shuffle can work in place
    self.words = list(count_vect.vocabulary_.keys())
    random.shuffle(self.words)
    self.idx = 0
"imdb": "../../datasets/opiniones/imdb_labelledes.csv" } df_list = [] for source, filepath in filepath_dict.items(): df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t') df['source'] = source df_list.append(df) df = pd.concat(df_list) print(df.head()) #1era estrategia vamos a crear un copntador de palabras frases = ["A Victor le gusta linux", "Victor no no le gusta Tortoise"] etiquetas = [1, 0] #countvectorizer para vectorizar operaciones print(frases) vectorizar = CountVectorizer(lowercase=False) vectorizar.fit(frases) print(vectorizar.vocabulary_) feature_vector = vectorizar.transform(frases).toarray() print(feature_vector) X = feature_vector y = etiquetas print(X) print(y)
def fit(self, corpus, max_df_frac=0.90, min_df_frac=0.000025, is_featurizer_for_test=False): logging.info('Usage at beginning of featurizer fit: %s', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6) if is_featurizer_for_test: paper_ids_for_training = corpus.train_ids + corpus.valid_ids else: paper_ids_for_training = corpus.train_ids # Fitting authors and venues logging.info('Fitting authors and venues') author_counts = collections.Counter() venue_counts = collections.Counter() keyphrase_counts = collections.Counter() for doc_id in tqdm.tqdm(paper_ids_for_training): doc = corpus[doc_id] author_counts.update(doc.authors) venue_counts.update([doc.venue]) keyphrase_counts.update(doc.key_phrases) c = 1 for author, count in author_counts.items(): if count >= self.min_author_papers: self.author_to_index[author] = c c += 1 c = 1 for venue, count in venue_counts.items(): if count >= self.min_venue_papers: self.venue_to_index[venue] = c c += 1 c = 1 for keyphrase, count in keyphrase_counts.items(): if count >= self.min_keyphrase_papers: self.keyphrase_to_index[keyphrase] = c c += 1 # Step 1: filter out some words and make a vocab if self.use_pretrained: vocab_file = dp.vocab_for_corpus('shared') with open(vocab_file, 'r') as f: vocab = f.read().split() else: logging.info('Cleaning text.') all_docs_text = [ ' '.join((_clean(corpus[doc_id].title), _clean(corpus[doc_id].abstract))) for doc_id in tqdm.tqdm(paper_ids_for_training) ] logging.info('Fitting vectorizer...') if self.max_features is not None: count_vectorizer = CountVectorizer( max_df=max_df_frac, max_features=self.max_features, stop_words=self.STOPWORDS) else: count_vectorizer = CountVectorizer(max_df=max_df_frac, min_df=min_df_frac, stop_words=self.STOPWORDS) count_vectorizer.fit(tqdm.tqdm(all_docs_text)) vocab = count_vectorizer.vocabulary_ logging.info('Usage after word count: %s', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6) # Step 4: Initialize mapper from word to index self.word_indexer = FeatureIndexer(vocab=vocab, use_pretrained=self.use_pretrained) self.n_features = 1 + len(self.word_indexer.word_to_index) logging.info('Usage after word_indexer: %s', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6) logging.info('Usage at end of fit: %s', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6) logging.info('Total words %d ' % len(self.word_indexer.word_to_index)) logging.info('Total authors %d ' % self.n_authors) logging.info('Total venues %d ' % self.n_venues) logging.info('Total keyphrases %d ' % self.n_keyphrases)
text_1=read_text('LOTR1.txt') text_2=read_text('LOTR2.txt') text_3=read_text('LOTR3.txt') text = [text_1,text_2,text_3] lotrDF = DataFrame() # basic text level features lotrDF['text'] = text lotrDF['char_count'] = lotrDF['text'].apply(len) lotrDF['word_count']=lotrDF['text'].apply(lambda x: len(x.split())) lotrDF['word_density']=lotrDF['char_count']/(lotrDF['word_count']+1) # count vectorizer count_vec = CountVectorizer(stop_words='english', analyzer='word') count_fit = count_vec.fit(lotrDF['text']) vector_count=count_fit.transform(lotrDF['text']) count_feat=count_vec.get_feature_names() count_set = set(count_feat) count_freqs=zip(count_feat,vector_count.sum(axis=0).tolist()[0]) fellowship_count_vec = CountVectorizer(stop_words='english', analyzer='word') fellowship_vector = fellowship_count_vec.fit_transform([text_1]) fellowship_feat = fellowship_count_vec.get_feature_names() fellowship_set = set(fellowship_feat) towers_count_vec = CountVectorizer(stop_words='english', analyzer='word') towers_vector = towers_count_vec.fit_transform([text_2]) towers_feat = towers_count_vec.get_feature_names() towers_set = set(towers_feat)
from sklearn import preprocessing from sklearn import metrics import pandas as pd import joblib df = pd.read_csv('Movie_Metadata_Sentiments.csv') # Subset only emotions required to get overall emotion detected from the text content sub_df = df[['anger', 'joy', 'fear', 'sadness']] # Label the movie with the highest count of emotions df['Max'] = sub_df.idxmax(axis=1) token = RegexpTokenizer(r'[a-zA-Z0-9]+') cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize) cv = cv.fit(df['Text_Content']) text_counts = cv.transform(df['Text_Content']) # Save the vectorizer joblib.dump(cv, "vectorizer.pkl") X_train, X_test, y_train, y_test = train_test_split(text_counts, df['Max'], test_size=0.2, random_state=1) print(X_train.shape) le = preprocessing.LabelEncoder() le.fit(y_train) print(le.classes_) y_train = le.transform(y_train) y_test = le.transform(y_test)
train_x = train[['creativeSize']]  # the continuous feature left untouched earlier
test_x = test[['creativeSize']]

for feature in one_hot_feature:
    enc.fit(data[feature].values.reshape(-1, 1))
    train_a = enc.transform(train[feature].values.reshape(-1, 1))
    test_a = enc.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))  # scipy.sparse: hstack concatenates train_x and train_a column-wise
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')

cv = CountVectorizer()
for feature in vector_feature:
    cv.fit(data[feature])
    train_a = cv.transform(train[feature])
    test_a = cv.transform(test[feature])
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('cv prepared !')


def LGB_test(train_x, train_y, test_x, test_y):
    print("LGB test")
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=31,
                             reg_alpha=0.0,
                             reg_lambda=1,
                             max_depth=-1,
                             n_estimators=1000,
def svm():
    train = load_model('model_rf/train_bow1_2.pkl')
    if train is None:
        train = load_data('datavn/train')
    vectorizer = load_model('model_rf/vectorizer_bow1_2.pkl')
    if vectorizer is None:
        # vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.7, min_df=2, max_features=1000)
        vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.7, min_df=2, max_features=1000)
    test = load_model('model/test1_2.pkl')
    if test is None:
        test = load_data('datavn/test')
    train_text = train["question"].values
    test_text = test["question"].values

    vectorizer.fit(train_text)
    X_train = vectorizer.transform(train_text)
    joblib.dump(vectorizer, 'model_rf/vectorizer_bow1_2.pkl')
    X_train = X_train.toarray()
    y_train = train["label1"]
    y_train2 = train["label2"]

    X_test = vectorizer.transform(test_text)
    X_test = X_test.toarray()
    y_test = test["label1"]
    y_test2 = test["label2"]
    # joblib.dump(vectorizer, 'model/vectorizer2.pkl')

    print "---------------------------"
    print "Training"
    print "---------------------------"
    # iterate over classifiers
    clf = load_model('model_rf/bow1_2.pkl')
    if clf is None:
        t0 = time.time()
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(X_train, y_train)
        joblib.dump(clf, 'model_rf/bow1_2.pkl')
        print " %s - Training completed %s" % (datetime.datetime.now(), time_diff_str(t0, time.time()))
    t1 = time.time()
    y_pred = clf.predict(X_test)
    print " %s - Converting completed %s" % (datetime.datetime.now(), time_diff_str(t1, time.time()))
    print " accuracy: %0.3f" % accuracy_score(y_test, y_pred)
    print " f1 accuracy: %0.3f" % f1_score(y_test, y_pred, average='weighted')
    print "confusion matrix: \n", confusion_matrix(y_test, y_pred, labels=["ABBR", "DESC", "ENTY", "HUM", "LOC", "NUM"])
    print "-----------------------"
    print "fine grained category"
    print "-----------------------"
    clf2 = load_model('model_rf/bow_fine1_2.pkl')
    if clf2 is None:
        t2 = time.time()
        clf2 = RandomForestClassifier(n_estimators=100)
        clf2.fit(X_train, y_train2)
        joblib.dump(clf2, 'model_rf/bow_fine1_2.pkl')
        print " %s - Training for fine grained category completed %s" % (datetime.datetime.now(), time_diff_str(t2, time.time()))
    t3 = time.time()
    y_pred2 = clf2.predict(X_test)
    print " %s - Converting completed %s" % (datetime.datetime.now(), time_diff_str(t3, time.time()))
    print " accuracy for fine grained category: %0.3f\n" % accuracy_score(y_test2, y_pred2)
    print " f1 accuracy: %0.3f" % f1_score(y_test2, y_pred2, average='weighted')
from sklearn.linear_model import ElasticNet # Create the token pattern: TOKENS_ALPHANUMERIC TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)' TokenBasic ='\\S+(?=\\s+)' # Fill missing values in df.Position_Extra vec_alphanumeric = CountVectorizer(token_pattern = TOKENS_ALPHANUMERIC,stop_words='english') # Fit to the data vec_alphanumeric.fit(Jc.trans) print(msg.format(len(vec_alphanumeric.get_feature_names()))) print(vec_alphanumeric.get_feature_names()[:70]) # Split out only the text data X_train, X_test, y_train, y_test = train_test_split(Jc.trans, Jc.readRatePercent, random_state=42) ('tfidf', TfidfVectorizer()) ('vec', CountVectorizer(token_pattern = TOKENS_ALPHANUMERIC,stop_words='english',
# In[10]: train['question1'] = train['question1'].fillna("dhainchu") train['question2'] = train['question2'].fillna("dhainchu") # In[11]: print("Creating the vocabulary of words occurred more than", MIN_WORD_OCCURRENCE) all_questions = pd.Series(train['question1'].tolist() + train['question2'].tolist()).unique() cv = CountVectorizer(lowercase=False, token_pattern="\S+", min_df=MIN_WORD_OCCURRENCE) cv.fit(all_questions) top_words = set(cv.vocabulary_.keys()) top_words.add(REPLACE_WORD) # In[12]: embeddings_index = get_embedding() # In[13]: print("Words are not found in the embedding:", top_words - embeddings_index.keys()) top_words = embeddings_index.keys() # In[14]:
print('-----finished corpus tokenization-----')

# Load the document you wish to summarize
title = 'American Missouri River Dakota Access Pipeline Fort Yates Standing Rock America Bakkan Sioux Youth Army Corps Engineer North Obama Trump Native DAPL Radio Energy Transfer Gonacon'
count = 0
for ele in raw:
    document = ele['Sentences']
    cleaned_document = clean_document(document)
    doc = remove_stop_words(cleaned_document)

    # Merge corpus data and new document data (join into a fresh list so the
    # original corpus is left intact for the next iteration)
    joined_corpus = [' '.join(d) for d in data]
    train_data = set(joined_corpus + [doc])

    # Fit and Transform the term frequencies into a vector
    count_vect = CountVectorizer()
    count_vect = count_vect.fit(train_data)
    freq_term_matrix = count_vect.transform(train_data)
    feature_names = count_vect.get_feature_names()

    # Fit and Transform the TfidfTransformer
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)

    # Get the dense tf-idf matrix for the document
    story_freq_term_matrix = count_vect.transform([doc])
    story_tfidf_matrix = tfidf.transform(story_freq_term_matrix)
    story_dense = story_tfidf_matrix.todense()
    doc_matrix = story_dense.tolist()[0]

    # Get Top Ranking Sentences and join them as a summary
    top_sents = rank_sentences(doc, doc_matrix, feature_names, top_n=1)
from sklearn.feature_extraction.text import CountVectorizer

# Note we're doing "CountVectorizer" here and not TfidfVectorizer. Hmm...
word_features = CountVectorizer(
    strip_accents="unicode",
    lowercase=True,
    ngram_range=(1, 1),
)

# How does it take a whole paragraph and turn it into words?
text_to_words = word_features.build_analyzer()
# text_to_words is a function (str) -> List[str]
assert text_to_words("Hello world!") == ["hello", "world"]

# Learn columns from training data (again)
word_features.fit(ex_train)
# Translate our list of texts -> matrices of counts
X_train = word_features.transform(ex_train)
X_vali = word_features.transform(ex_vali)
X_test = word_features.transform(ex_test)
print(X_train.shape, X_vali.shape, X_test.shape)

#%% Accumulate results here; to be box-plotted.
results: Dict[str, List[float]] = {}

#%% try sklearn MultinomialNB:

## SKLearn has its own Multinomial Naive Bayes,
# and it uses the alpha / additive smoothing to deal with zeros!
from sklearn.naive_bayes import MultinomialNB
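#%% A plausible continuation of the cell above (a sketch): fit MultinomialNB
# on the count matrices; alpha smoothing handles unseen-word zeros.
# y_train / y_vali are assumed to come from earlier in the notebook.
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train)
results["MNB"] = [nb.score(X_vali, y_vali)]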
    n = data[i].nunique()
    if n > 5:
        print(i)
        data = feature_count(data, [i])  # build count features for the crossed features
    else:
        print(i, ':', n)

#%%
# user_tags CountVectorizer
train_new = pd.DataFrame()
test_new = pd.DataFrame()

train = data[:train.shape[0]]
test = data[train.shape[0]:]
train_y = train['click']

cntv = CountVectorizer()
cntv.fit(train['user_tags'])
train_a = cntv.transform(train['user_tags'])
test_a = cntv.transform(test['user_tags'])
# hstack concatenates matrices column-wise; the row counts must match:
# hstack(blocks, format=None, dtype=None)
train_new = sparse.hstack((train_new, train_a), 'csr')
test_new = sparse.hstack((test_new, test_a), 'csr')

# difference: SelectKBest keeps the top-n ranked features, SelectPercentile keeps the top n%
SKB = SelectPercentile(chi2, percentile=95).fit(train_new, train_y)
train_new = SKB.transform(train_new)
test_new = SKB.transform(test_new)
'''
Among the sparse matrix storage formats:
# - COO is efficient for building a matrix
# - CSC and CSR are efficient for multiplication
train_data, test_data, train_labels, test_labels = train_test_split( all_tweets, labels, test_size=0.2, random_state=1) print(len(train_data)) print(len(test_data)) # ------------------------------------------------- # Transform tweets into count vectors from sklearn.feature_extraction.text import CountVectorizer counter = CountVectorizer() # Teach the counter the vocabulary counter.fit(train_data) # Transform into count vectors train_counts = counter.transform(train_data) test_counts = counter.transform(test_data) print(train_counts[3]) print(test_counts[3]) # ------------------------------------ # Train and test the Classifier from sklearn.naive_bayes import MultinomialNB classifier = MultinomialNB()
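# A natural continuation (sketch): fit the classifier on the count vectors
# and score it on the held-out tweets.
from sklearn.metrics import accuracy_score

classifier.fit(train_counts, train_labels)
predictions = classifier.predict(test_counts)
print(accuracy_score(test_labels, predictions))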
simple_train = [ 'call you tonight', 'Call me a cab', 'please call me... PLEASE!' ] # From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction): # # > Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect **numerical feature vectors with a fixed size** rather than the **raw text documents with variable length**. # # We will use [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to "convert text into a matrix of token counts": # import and instantiate CountVectorizer (with the default parameters) from sklearn.feature_extraction.text import CountVectorizer vect = CountVectorizer() # learn the 'vocabulary' of the training data (occurs in-place) vect.fit(simple_train) # examine the fitted vocabulary vect.get_feature_names() # transform training data into a 'document-term matrix' simple_train_dtm = vect.transform(simple_train) simple_train_dtm # convert sparse matrix to a dense matrix simple_train_dtm.toarray() # examine the vocabulary and document-term matrix together pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names()) # From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):
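# Extending the example: a fitted vocabulary is fixed, so transforming new
# text keeps only known tokens (this test sentence is illustrative).
simple_test = ["please don't call me"]
simple_test_dtm = vect.transform(simple_test)
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())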
def mlprice(request):
    nome = ''
    result = 'Here goes the price'
    productname = 'Example: Spiderman Figure Action Marvel #123'
    if request.method == 'POST':
        nome = request.POST['nome']  # get the url
        re = requests.get(nome)  # request the page
        soup = BeautifulSoup(re.text, 'html.parser')
        needed = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ',']  # characters used to pick digits and ',' out of the response
        test_texts = [
            value for element in soup.find_all(class_=True)
            for value in element["class"]
        ]  # collect every CSS class name from the site structure
        possible_texts = [
            "game_purchase_price", "price", "catalog-detail-price-value",
            "preco_desconto", "preco", "preco_desconto_avista-cm",
            'css-ovezyj', "currency-value", "best-price"
        ]  # likely price classes extracted from some websites
        negative_texts = [
            "container", "pop_up", "menu", "hr", "nav-menu-item"
        ]  # menus, popups, etc.
        training_texts = possible_texts + negative_texts
        # label the price classes "positive" and the rest "negative",
        # in the same order as training_texts
        training_labels = ["positive"] * len(possible_texts) + ["negative"] * len(negative_texts)

        vectorizer = CountVectorizer()
        vectorizer.fit(training_texts)
        training_vectors = vectorizer.transform(training_texts)
        testing_vectors = vectorizer.transform(test_texts)

        classifier = tree.DecisionTreeClassifier()
        classifier.fit(training_vectors, training_labels)
        predictions = classifier.predict(testing_vectors)

        c = 0  # counter
        valuesInsideFoundit = []
        for i in predictions:  # walk through the predictions
            if i == "positive":  # if this class could hold a price
                foundit = soup.find(class_=test_texts[c])  # grab the text inside the class at index 'c' of test_texts
                valuesInsideFoundit.append(foundit.text)  # and keep the value we just found
            c += 1  # counter increment

        firstValuesInsideIt = []
        # Here we filter the text values: some pages contain many prices or
        # symbols from our 'dictionary' (the 'needed' variable), so we may see
        # values like ["R", "R", "$", "1", "3", "R", "$"].
        # We need to filter them down to an expected value like "R$123,99".
        for k in filter(None, valuesInsideFoundit):  # iterate over the values, skipping empty entries
            cc = 0  # counter
            for y in list(k):  # iterate over every character of the filtered string
                if y in needed or y == "R" and list(k)[cc + 1] == "$" or y == "$" and list(k)[cc - 1] == "R":
                    # if y is in 'needed', or y is "R" followed by "$",
                    # or y is "$" preceded by "R", it is probably a value in Reais
                    firstValuesInsideIt.append(str(y).replace("\n", "").replace(' ', ''))  # strip spaces and line breaks
                else:  # otherwise, ignore it
                    pass
                cc += 1  # cc increment

        ccc = 0
        whatWeWant = ""
        indexOf = 0
        # more formatting
        for b in firstValuesInsideIt:
            if b == "R" and firstValuesInsideIt[ccc + 1] == "$":
                indexOf = firstValuesInsideIt.index(str(firstValuesInsideIt[ccc]))
                break
            ccc += 1
        lastFormatedValues = []
        cccc = 0
        for y in firstValuesInsideIt[indexOf:]:
            lastFormatedValues.append(y)
            try:
                if lastFormatedValues[cccc - 2] == ",":
                    break
            except:
                pass
            cccc += 1
        ccccc = 0
        indexOf2 = 0
        for bb in lastFormatedValues:
            if lastFormatedValues[ccccc] == "R" and lastFormatedValues[ccccc + 1] == "$" and lastFormatedValues[ccccc + 2] in needed:
                indexOf2 = lastFormatedValues.index(str(lastFormatedValues[ccccc]))
            ccccc += 1
        for z in lastFormatedValues[indexOf2 - 8:]:
            whatWeWant = whatWeWant + z  # our variable receives the value
al = re.text productname = al[al.find('<title>') + 7:al.find('</title>')] print(productname) #let's put it on page: return render(request, 'savepage/main.html', { 'nome': nome, 'result': result, 'productname': productname })
print("total examples %s" % len(labels)) # split the dataset into training and test datasets train_x, test_x, train_y, test_y = train_test_split(dataDF['text'], dataDF['label'], random_state=24, test_size=0.2) # label encode the target variable encoder = LabelEncoder() train_y = encoder.fit_transform(train_y) test_y = encoder.fit_transform(test_y) # create a count vectorizer object count_vect = CountVectorizer(analyzer=lambda x: x) count_vect.fit(dataDF['text']) # transform the training and test data using count vectorizer object xtrain_count = count_vect.transform(train_x) xtest_count = count_vect.transform(test_x) # word level tf-idf tfidf_vect = TfidfVectorizer(analyzer=lambda x: x) tfidf_vect.fit(dataDF['text']) xtrain_tfidf = tfidf_vect.transform(train_x) xtest_tfidf = tfidf_vect.transform(test_x) ''' MODELS ''' ## Naive Bayes
        y.append(path)
    return X, y

# Helper to write a file
def writeFile(txtArrAfter, outputName, path):
    f = open(path + "/" + outputName + ".txt", 'a', encoding="utf-8")
    f.write(str(txtArrAfter))
    f.close()

# Load the data into X_data, y_data
train_path = os.path.join(dir_path, 'C:/Users/vai22/OneDrive/Desktop/Data/Train_Full')
X_data, y_data = get_data(train_path)

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_data)

# transform the training and validation data using the count vectorizer object
X_train_count = count_vect.transform(X_data)

# word level - cap the vocabulary at 30000 words instead of using all words (100k+)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(X_data)  # learn vocabulary and idf from the training set
X_data_tfidf = tfidf_vect.transform(X_data)

# assume that we don't have a test set yet
# writeFile(X_data,'X_data', 'C:/Users/vai22/OneDrive/Desktop/final/Data/vnexpress.net')
# writeFile(y_data,'y_data', 'C:/Users/vai22/OneDrive/Desktop/final/Data/vnexpress.net')
q2_word_test = [x[1] for x in datas_test]
q1_char_train = [x[2] for x in datas_train]
q1_char_dev = [x[2] for x in datas_dev]
q1_char_test = [x[2] for x in datas_test]
q2_char_train = [x[3] for x in datas_train]
q2_char_dev = [x[3] for x in datas_dev]
q2_char_test = [x[3] for x in datas_test]
label_train = [x[4] for x in datas_train]
label_dev = [x[4] for x in datas_dev]
label_test = [x[4] for x in datas_test]

# sklearn extract feature
# feature1: count(csr_matrix)
# the default token_pattern (\w\w+) drops single-character tokens, so use \w+ to keep them
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 1))
vectorizer.fit(q1_word_train + q2_word_train)
q1_train_count = vectorizer.transform(q1_word_train)
q1_dev_count = vectorizer.transform(q1_word_dev)
q1_test_count = vectorizer.transform(q1_word_test)
q2_train_count = vectorizer.transform(q2_word_train)
q2_dev_count = vectorizer.transform(q2_word_dev)
q2_test_count = vectorizer.transform(q2_word_test)

# feature2: binary(csr_matrix)
q1_train_binary = q1_train_count.copy()
q1_dev_binary = q1_dev_count.copy()
q1_test_binary = q1_test_count.copy()
q1_train_binary[q1_train_binary > 0] = 1.0
q1_dev_binary[q1_dev_binary > 0] = 1.0
q1_test_binary[q1_test_binary > 0] = 1.0
q2_train_binary = q2_train_count.copy()
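# Note: the copy-and-threshold step above is equivalent (up to dtype: ints
# here vs. the 1.0 floats above) to fitting a second CountVectorizer with
# binary=True; a quick sketch of that alternative:
binary_vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b",
                                    ngram_range=(1, 1), binary=True)
binary_vectorizer.fit(q1_word_train + q2_word_train)
q1_train_binary_alt = binary_vectorizer.transform(q1_word_train)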
Messages['preprocessed message'] = Messages.apply(
    lambda row: stopWordRemoval(row['lemmantized words']), axis=1)
'''print('\nData Frame after stop word removal: \n', Messages[0:2])'''

TrainingData = pd.Series(list(Messages['preprocessed message']))
TrainingLabel = pd.Series(list(Messages['label']))

## Feature Extraction (convert the text content into vector form)
### Bag of Words (BOW) is the most widely used method for generating features in NLP;
### it computes word frequencies that can be used as features for training a classifier.
### A TDM (Term Document Matrix) holds the frequencies of occurrences of terms in a
### collection of documents: rows correspond to documents and columns correspond to terms.
tfVectorizer = CountVectorizer(ngram_range=(1, 2),
                               min_df=(1 / len(TrainingLabel)),
                               max_df=0.7)
totalDictionaryTDM = tfVectorizer.fit(TrainingData)
messageDataTDM = totalDictionaryTDM.transform(TrainingData)
print(messageDataTDM.shape)

### Term Frequency Inverse Document Frequency (TF-IDF): IDF diminishes the weight of
### the most commonly occurring words and increases the weight of rare words.
tfIdfVectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                  min_df=(1 / len(TrainingLabel)),
                                  max_df=0.7)
totalDictionaryTFIDF = tfIdfVectorizer.fit(TrainingData)
messageDataTFIDF = totalDictionaryTFIDF.transform(TrainingData)
print(messageDataTFIDF.shape)

## Splitting the training and test data
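# A typical split for the matrices built above (a sketch; the test_size and
# random_state values are illustrative, not from the original notebook):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    messageDataTDM, TrainingLabel, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)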
class SentimentAnalyzer: final_model_path = 'data/model_cache/final_model.sav' vocabulary_path = 'data/model_cache/vocabulary.pkl' data_train_path = 'data/movie_data/full_train.txt' data_test_path = 'data/movie_data/full_test.txt' def preprocess_reviews(self, reviews): REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]") REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)") reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews] reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews] return reviews def load_model(self, train_required): if train_required: self.train() else: self.load() def load(self): self.final_model = pickle.load( open(SentimentAnalyzer.final_model_path, 'rb')) self.cv = CountVectorizer(binary=True, vocabulary=pickle.load( open(SentimentAnalyzer.vocabulary_path, "rb"))) def train(self): print("Training...") # data preprocessing reviews_train = [] for line in open(SentimentAnalyzer.data_train_path, encoding='utf8', mode='r'): reviews_train.append(line.strip()) reviews_test = [] for line in open(SentimentAnalyzer.data_test_path, encoding='utf8', mode='r'): reviews_test.append(line.strip()) reviews_train_clean = self.preprocess_reviews(reviews_train) reviews_test_clean = self.preprocess_reviews(reviews_test) # vectorization self.cv = CountVectorizer(binary=True) self.cv.fit(reviews_train_clean) X = self.cv.transform(reviews_train_clean) X_test = self.cv.transform(reviews_test_clean) # the first 12.5k reviews are positive and the last 12.5k are negative. target = [1 if i < 12500 else 0 for i in range(25000)] X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75) # choosing the best hyperparameter C which adjust regularization accuracy = 0 c = 0 for current_c in [0.01, 0.05, 0.25, 0.5, 1]: lr = LogisticRegression(C=current_c, solver='liblinear') lr.fit(X_train, y_train) current_accuracy = accuracy_score(y_val, lr.predict(X_val)) if current_accuracy > accuracy: accuracy = current_accuracy c = current_c print(f"Accuracy for C = {current_c}: {current_accuracy}") print(f"The best accuracy is {accuracy}, C = {c}") # train final model self.final_model = LogisticRegression(C=c, solver='liblinear') self.final_model.fit(X, target) print( f"Final accuracy is {accuracy_score(target, self.final_model.predict(X_test))}" ) # save final model pickle.dump(self.final_model, open(SentimentAnalyzer.final_model_path, 'wb')) pickle.dump(self.cv.vocabulary_, open(SentimentAnalyzer.vocabulary_path, "wb")) def predict(self, new_review): # predict on new review new_review_clean = self.preprocess_reviews([new_review]) X_new_review = self.cv.transform(new_review_clean) y_new_review = self.final_model.predict(X_new_review) return y_new_review
def lasso(term, year_start=1990, year_end=2016, qa='A', reg_type='lasso'): ngram_range=(2, 5) vectorizer = CountVectorizer(max_features= 50000, ngram_range=ngram_range, stop_words='english', min_df=5) docs_all = document_iterator(type=qa, year_start=year_start, year_end=year_end, format='docs_only', search_term=term) vectorizer.fit(docs_all) vocabulary = vectorizer.get_feature_names() vectorizer_plaintiff = TfidfVectorizer(vocabulary=vocabulary, ngram_range=ngram_range, use_idf=True) docs_plaintiff = document_iterator(type=qa, year_start=year_start, side_answer='Plaintiff', format='docs_only', search_term=term) dtm_plaintiff = vectorizer_plaintiff.fit_transform(docs_plaintiff) vectorizer_defendant = TfidfVectorizer(vocabulary=vocabulary, ngram_range=ngram_range, use_idf=True) docs_defendant = document_iterator(type=qa, year_start=year_start, side_answer='Defendant', format='docs_only', search_term=term) dtm_defendant = vectorizer_defendant.fit_transform(docs_defendant) X = vstack([dtm_plaintiff, dtm_defendant]) y = np.ndarray(shape=(X.shape[0])) # Plaintiff docs = 1, defendant docs = 0 y[:dtm_plaintiff.shape[0]] = 1 y[dtm_plaintiff.shape[0]:] = 0 if reg_type == 'ridge': alpha = 0.00001 clf = Ridge(alpha=alpha) clf.fit(X, y) coeff = clf.coef_ elif reg_type == 'lasso': alpha = 0.0001 clf = Lasso(alpha=alpha, max_iter=1000) clf.fit(X, y) coeff = clf.coef_ elif reg_type == 'logistic': alpha=None clf = LogisticRegression() clf.fit(X, y) coeff = clf.coef_[0] mse = mean_squared_error(y, clf.predict(X)) mae = mean_absolute_error(y, clf.predict(X)) argsorted = np.argsort(coeff) min_coef = argsorted[:10] max_coef = argsorted[-10:][::-1] min_coefs = [(vocabulary[i], coeff[i]) for i in min_coef] max_coefs = [(vocabulary[i], coeff[i]) for i in max_coef] print "Using {} regression. Mean Squared Error: {}. Mean Absolute Error: {}".format(reg_type, mse, mae) print "Samples. Plaintiff: {}. Defendant: {}. Total: {}. Number of tokens: {}".format(dtm_plaintiff.shape[0], dtm_defendant.shape[0], X.shape[0], X.shape[1]) print "Predictors for Defendants:\n{}".format(min_coefs) print "\nPredictors for Plaintiffs:\n{}\n\n".format(max_coefs)
def text_classification_tradition():
    train_df = read_data('./data/train_set.csv')
    test_df = read_data('./data/test_a.csv')
    data = pd.concat([train_df, test_df], axis=0)
    print(data.shape)
    """
    Traditional text representation methods:
    1. One-hot
    2. BOW (Bag of Words)
    3. N-gram
    4. TF-IDF
    The text-representation interfaces in sklearn.feature_extraction.text expect input of the form:
        corpus = [
        ...     'This is the first document.',
        ...     'This document is the second document.',
        ...     'And this is the third one.',
        ...     'Is this the first document?',
        ... ]
    See: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text
    """
    # from sklearn.preprocessing import OneHotEncoder
    #
    # # the set of all words in the corpus
    # words_set = set(' '.join(list(data['text'])).split(' '))
    #
    # # number each word to get its index (this step could also use sklearn's LabelEncoder)
    # word2idx = {}
    # idx2word = {}
    # for i, word in enumerate(words_set):
    #     word2idx[word] = i + 1
    #     idx2word[i + 1] = word
    # print(word2idx)
    # """
    # {'6981': 1, '6307': 2, '5367': 3, '1066': 4,...}
    # """
    #
    # # OneHotEncoder takes index values of shape=(N_words, 1) and outputs the one-hot vectors word_onehot
    # idx = list(word2idx.values())
    # idx = np.array(idx).reshape(len(idx), -1)
    # print(idx.shape)  # (2958, 1)
    # print(idx)
    # """
    # [[   1]
    #  [   2]
    #  [   3]
    #  ...
    #  [2956]
    #  [2957]
    #  [2958]]
    # """
    # onehotenc = OneHotEncoder()
    # onehotenc.fit(idx)
    # word_onehot = onehotenc.transform(idx).toarray()
    # for i, word_onehot_i in enumerate(word_onehot):
    #     print("{0}\t-->\t{1}".format(idx2word[i + 1], word_onehot_i))
    # """
    # 6981  --> [1. 0. 0. ... 0. 0. 0.]
    # 6307  --> [0. 1. 0. ... 0. 0. 0.]
    # """
    #
    # # usage: given a word, look up its idx, then take the one-hot vector from word_onehot
    # x = word_onehot[word2idx['6981']]
    # print(x)  # the one-hot vector for the idx of word 6981

    # 2. BOW: CountVectorizer
    corpus = data['text'].values
    vectorizer = CountVectorizer(max_features=3000)
    vectorizer.fit(corpus)  # train the feature extractor on all train and test text

    X_train_all = vectorizer.transform(train_df['text'].values)
    y_train_all = train_df['label'].values
    X_test = vectorizer.transform(test_df['text'].values)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, test_size=0.1, random_state=2020)

    clf = RidgeClassifier()
    clf.fit(X_train, y_train)
    y_valid_pred = clf.predict(X_valid)
    print("f1 score: %.6f" % f1_score(y_valid, y_valid_pred, average='macro'))
    """
    f1 score: 0.820636
    """
    y_test_pred = clf.predict(X_test)
    test_df['label'] = y_test_pred
    print(test_df.shape)  # (50000, 2)
    test_df[['label']].to_csv('./data/submission_bow_20200725.csv', index=False)
    print(test_df['label'].value_counts())
    """
    1     11305
    0     10942
    2      8012
    3      5798
    4      3311
    5      2740
    6      1975
    7      1563
    9      1134
    8      1128
    10     1085
    11      548
    12      322
    13      137
    Name: label, dtype: int64
    """

    # 3. N-gram: CountVectorizer(ngram_range=(1, N))
    # 4. TF-IDF: TfidfVectorizer
    corpus = data['text'].values
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
    vectorizer.fit(corpus)
    X_train_all = vectorizer.transform(train_df['text'].values)
    y_train_all = train_df['label'].values
    X_test = vectorizer.transform(test_df['text'].values)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, test_size=0.1, random_state=2020)

    clf = RidgeClassifier()
    clf.fit(X_train, y_train)
    y_valid_pred = clf.predict(X_valid)
    print("f1 score: %.6f" % f1_score(y_valid, y_valid_pred, average='macro'))
    """
    f1 score: 0.897664
    """
    y_test_pred = clf.predict(X_test)
    test_df['label'] = y_test_pred
    print(test_df.shape)
    # test_df[['label']].to_csv('./data/submission_tfidf_20200725.csv', index=False)
    print(test_df['label'].value_counts())
    """
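# The N-gram variant mentioned in step 3 above, as a short sketch reusing
# corpus and train_df from that function (parameters are illustrative):
# CountVectorizer with ngram_range=(1, N) counts unigrams through N-grams
# in a single vocabulary.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=3000)
ngram_vectorizer.fit(corpus)
X_train_ngram = ngram_vectorizer.transform(train_df['text'].values)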
X_train, X_val, y_train, y_val = train_test_split(dataset.content.values,
                                                  y,
                                                  stratify=y,
                                                  shuffle=True,
                                                  random_state=42,
                                                  test_size=0.1)
# print(X_train, X_val)
# print(X_train.shape)
# print(X_val.shape)

tfidf = TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)  # transform only: reuse the vocabulary fitted on the training data
# print(X_train_tfidf, X_val_tfidf)

count_vect = CountVectorizer(analyzer='word')
count_vect.fit(dataset['content'])  # vocabulary learned from the full dataset
X_train_count = count_vect.transform(X_train)
X_val_count = count_vect.transform(X_val)
# print(X_train_count, X_val_count)
# print(dataset.head(5))

#### Linear SVM
# SGDClassifier with the default hinge loss is a linear SVM trained with SGD
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
# print('lsvm using count vectors accuracy: %s' % accuracy_score(y_pred, y_val))


def pre_pro(text):
    print("Entered the pre_pro preprocessing function")
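# As a design alternative to the split-then-vectorize steps above, sklearn's
# Pipeline keeps the vectorizer fitted on the training split only and applies
# it automatically at prediction time. A minimal sketch reusing the
# X_train/X_val split from above (the step names 'vect' and 'lsvm' are arbitrary):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='word')),
    ('lsvm', SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)),
])
pipe.fit(X_train, y_train)                          # fits vectorizer and classifier on X_train only
print(accuracy_score(y_val, pipe.predict(X_val)))   # predict() only transforms X_val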
# for test
for topic in os.listdir(path):
    topic_path = os.path.join(path, topic)
    for file in os.listdir(topic_path):
        pf = os.path.join(topic_path, file)
        with open(pf, 'r', encoding="utf8") as f:
            text = f.read()
        X_test10.append(text)
        Y_test10.append(topic)

# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import preprocessing

count_vector = CountVectorizer()
count_vector.fit(X_train10)  # learn the vocabulary once from the training texts
X_transform10 = count_vector.transform(X_train10)

# label encoder
le = preprocessing.LabelEncoder()
le.fit(Y_train10)
Y_transform10 = le.transform(Y_train10)

# for validation data
X_valid_transform10 = count_vector.transform(X_valid10)
Y_valid_transform10 = le.transform(Y_valid10)

# for test data
X_test_transform10 = count_vector.transform(X_test10)
Y_test_transform10 = le.transform(Y_test10)
london_text = london_tweets['text'].tolist() paris_text = paris_tweets['text'].tolist() # combine all text into one long list of tweets all_tweets = new_york_text + london_text + paris_text # create labels for tweets by location; 0 = new york, 1 = london, 2 = paris labels = [0] * len(new_york_text) + [1] * len(london_text) + [2] * len( paris_text) # divide set into train and test set train_data, test_data, train_labels, test_labels = train_test_split( all_tweets, labels, test_size=0.2, random_state=1) # create a counter and transform train/test data counter = CountVectorizer() counter.fit(train_data + test_data) train_counts = counter.transform(train_data) test_counts = counter.transform(test_data) # create a NB classifier, fit it with the training data and create predictions on the test data to evaluate the model classifier = MultinomialNB() classifier.fit(train_counts, train_labels) predictions = classifier.predict(test_counts) # classify by using accuracy_score # print(accuracy_score(test_labels, predictions)) # classify by using confusion matrix # print(confusion_matrix(test_labels, predictions))
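# A quick usage sketch for the fitted model above: classify a new, made-up
# tweet. The string is illustrative and the predicted label depends on the
# trained model (0 = new york, 1 = london, 2 = paris).
sample_tweet = ["the tube is delayed again this morning"]
sample_counts = counter.transform(sample_tweet)
print(classifier.predict(sample_counts))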
def read_gbif_extract_csv(
    file_name="../data/gbif_extract.csv",
    output_file="../data/gbif_extract_canonicalName_short.csv",
):
    """
    Load the GBIF CSV database and output a CSV file of short names with keys,
    used to detect canonical names.
    See the search_canonicalName() function in the scispacy_lib.py module.
    """
    important_cols = ["key", "canonicalName"]
    df = pd.read_csv(file_name)
    print(df.shape, list(df.columns))
    print("-important columns:")
    df = df[important_cols]
    print(df.shape, list(df.columns))
    # print(df.head(10))
    for c in list(df.columns):
        if c.find("Key") == -1:
            df[c] = df[c].apply(str).str.lower()
    print(df.shape, list(df.columns))
    print("canonicalName:")
    print(df.head(10))

    df["name_lst"] = df["canonicalName"].apply(lambda x: x.split(" "))
    df["size"] = df["name_lst"].apply(lambda x: len(x))
    print(df["size"].describe())

    ##### output
    print("Output file with shortened canonical names:", output_file)
    print("- used to find canonical names in the abstracts of scientific papers")
    tot_rows = df.shape[0]
    ts = time.time()
    total_read = 0
    total_write = 0
    dct = {}
    with open(output_file, "w") as f:
        f.write("canonicalName_word;tab_key\n")
        for i, r in df.iterrows():
            total_read += 1
            if time.time() - ts > 10:  # progress report every 10 seconds
                ts = time.time()
                print(
                    total_read,
                    "/",
                    tot_rows,
                    round(total_read / tot_rows * 100, 2),
                    "%",
                )
            if r["key"] is None:
                continue
            if r["size"] == 1:
                name = r["canonicalName"]
                # dct[name] = [r['key']]
                # print(name, dct[name], 'name first in')
                f.write(name + ";" + str(r["key"]) + "\n")
                total_write += 1
                continue
            for name in r["name_lst"]:
                f.write(name + ";" + str(r["key"]) + "\n")
                total_write += 1

    print("Data report:", "total_read", total_read, "total_write", total_write)
    print("End file generated:", output_file)

    if False:  # disabled: this block ran out of memory on the full dataset
        text = df["canonicalName"].values
        # create the transform
        vectorizer_canonicalName = CountVectorizer()
        # tokenize and build vocab
        vectorizer_canonicalName.fit(text)
        # summarize
        print("len(vectorizer_canonicalName.vocabulary_):",
              len(vectorizer_canonicalName.vocabulary_))
        print("canonicalName:")
        i = 0
        for k, v in vectorizer_canonicalName.vocabulary_.items():
            print(k, v)
            i += 1
            if i > 10:
                break

        # encode a document
        text = "psygmatocerus guianensis"
        vector = vectorizer_canonicalName.transform([text])
        vector.toarray()
        text_out = vectorizer_canonicalName.inverse_transform(vector)
        print(text_out)

        dataX = []
        dataY = []
        for i, r in df.iterrows():
            dataY.append(int(r["key"]))
            v = vectorizer_canonicalName.transform([r["canonicalName"]])
            dataX.append(list(v.toarray()[0]))
        n_patterns = len(dataX)
        print("Total Patterns: ", n_patterns)
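# Since the disabled block above exhausts memory by densifying one row per
# name (v.toarray()), a sketch of a sparse alternative: vectorize all names in
# one call and keep the CSR matrix sparse end to end. Assumes the same df as
# above; the output path is hypothetical.
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_canonicalName = CountVectorizer()
X = vectorizer_canonicalName.fit_transform(df["canonicalName"].values)  # stays sparse (CSR)
y = df["key"].astype(int).values
sparse.save_npz("../data/canonicalName_bow.npz", X)  # hypothetical output file
print("Total Patterns:", X.shape[0], "Vocabulary size:", X.shape[1])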
labels.append(content[0])
    texts.append(content[1:])

# create a dataframe using texts and labels
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'])

# label encode the target variable
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)  # reuse the encoding fitted on the training labels

#%%
"""
Count Vector is a matrix notation of the dataset in which every row represents
a document from the corpus, every column represents a term from the corpus,
and every cell represents the frequency count of a particular term in a
particular document.
"""
# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)
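# To make the Count Vector description above concrete, a tiny worked example
# (the two documents are illustrative):
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the cat sat on the cat"]
cv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
X = cv.fit_transform(docs)

print(cv.get_feature_names())  # ['cat', 'on', 'sat', 'the']  (get_feature_names_out in sklearn >= 1.0)
print(X.toarray())
# rows = documents, columns = terms, cells = term frequency in that document:
# [[1 0 1 1]
#  [2 1 1 2]]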
print('begin')

if __name__ == '__main__':
    # read the data
    train = pd.read_csv(r'../../../DATA/TRAIN_1/train.tsv', sep='\t')
    labels = np.array(train['Sentiment'])
    test = pd.read_csv(r'../../../DATA/TRAIN_1/test.tsv', sep='\t')
    print(train.shape)
    print(test.shape)
    train_size = train.shape[0]
    test_size = test.shape[0]
    bag_size = 17441

    # preprocessing: fit the vectorizer on the phrases of both splits
    ct = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
    ct.fit(pd.concat([train['Phrase'], test['Phrase']]))
    train_vec = ct.transform(train['Phrase'])
    # test_vec = ct.transform(test['Phrase'])
    print(train_vec.shape)
    # print(test_vec.shape)
    # word_bag = ct.vocabulary_
    train_one_hot = train_vec.toarray()
    # test_one_hot = test_vec.toarray()
    print('train_one_hot size = ', len(train_one_hot))
    input_size = train_vec.shape[1]

    state = torch.load('../task2/task_1.pt')
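# A minimal sketch of feeding the bag-of-words rows into a PyTorch model,
# assuming a plain linear classifier: the output width 5 assumes five
# sentiment classes, and the linear layer is a hypothetical stand-in for the
# model whose weights live in task_1.pt.
import torch
import torch.nn as nn

X = torch.tensor(train_one_hot, dtype=torch.float32)  # (train_size, input_size)
y = torch.tensor(labels, dtype=torch.long)

model = nn.Linear(input_size, 5)  # hypothetical stand-in for the saved model
logits = model(X)
print(logits.shape)  # (train_size, 5)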
list_val = pd.to_numeric(df_val['label'])
list_val = list(list_val)
df_val['label'] = list_val

# clean the tweets, applying each filter to the already-cleaned column so the
# passes accumulate instead of overwriting one another
df_val['cleaned_tweet'] = df_val.tweet.apply(lambda x: ' '.join(
    [word for word in x.split() if not word.startswith('@')]))
df_val['cleaned_tweet'] = df_val.cleaned_tweet.apply(lambda x: ' '.join(
    [word for word in x.split() if not word.startswith('#')]))
df_val['cleaned_tweet'] = df_val.cleaned_tweet.apply(lambda x: ' '.join(
    [word for word in x.split() if not word.startswith(' ')]))

tweetVal_train, tweetVal_test, label_train, label_test = train_test_split(
    df_val['cleaned_tweet'], df_val['label'], test_size=0.25)

cv = CountVectorizer()
cv.fit(tweetVal_train)
tweetVal_train_vec = cv.transform(tweetVal_train)
# print(tweetVal_train_vec)
# print('tweetVal_train', tweetVal_train)
# print('label_train', label_train)

reg = LogisticRegression(random_state=0)
reg.fit(tweetVal_train_vec, label_train)
label_pred = reg.predict(cv.transform(tweetVal_test))
print("accuracy Score :", mt.accuracy_score(label_test, label_pred))
# print("Precision Score:", mt.precision_score(label_test, label_pred))
# cm = confusion_matrix(label_test, label_pred)
# print(cm)
df = pd.read_csv('./datasets/combined_data.csv') df = df.drop(columns='Unnamed: 0') df['label'] = df['label'].map({'true': 0, 'false': 1, 'misleading': 1}) X = df['text'] y = df['label'] X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) cvec = CountVectorizer(min_df=2, max_features=5000, ngram_range=(1, 2), stop_words=None) cvec.fit(X_train) @app.route('/') def home(): return 'Thanks for checking out our misinformation classifier!' # route 1: show a form to the user @app.route('/form') def form(): # use flask's render_template function to display an html page return render_template('form.html') # route 2: accept the form submission and do something fancy with it
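# A heavily hedged sketch of what route 2 might look like: the route path, the
# form field 'user_input', and the fitted classifier 'model' are all
# hypothetical (only cvec is fitted above), and `request` must be imported
# from flask.
from flask import request

@app.route('/submit')
def submit():
    user_input = request.args['user_input']  # hypothetical form field
    X = cvec.transform([user_input])         # vectorize with the fitted CountVectorizer
    pred = model.predict(X)[0]               # hypothetical classifier; 0 = true, 1 = false/misleading
    return 'misleading' if pred == 1 else 'not misleading'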
data_x_test = pd.DataFrame()

OHE = OneHotEncoder()
for feature in one_hot_feature:
    OHE.fit(data_x[feature].values.reshape(-1, 1))
    train_a = OHE.transform(x_train[feature].values.reshape(-1, 1))
    valid_a = OHE.transform(x_valid[feature].values.reshape(-1, 1))
    test_a = OHE.transform(data_test[feature].values.reshape(-1, 1))
    data_x_train = sparse.hstack((data_x_train, train_a))
    data_x_valid = sparse.hstack((data_x_valid, valid_a))
    data_x_test = sparse.hstack((data_x_test, test_a))
print 'one_hot finish'

# note: when a tokenizer is supplied it takes precedence over token_pattern,
# so tokens here are produced by splitting on single spaces
CVec = CountVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b',
                       tokenizer=lambda x: x.split(' '))
# CVec = CountVectorizer()
for feature in vector_feature:
    CVec.fit(data_x[feature])
    train_a = CVec.transform(x_train[feature])
    valid_a = CVec.transform(x_valid[feature])
    test_a = CVec.transform(data_test[feature])
    data_x_train = sparse.hstack((data_x_train, train_a))
    data_x_valid = sparse.hstack((data_x_valid, valid_a))
    data_x_test = sparse.hstack((data_x_test, test_a))
    # feature importance mapping: record which vocabulary entry came from which raw feature
    df_tmp = pd.DataFrame(CVec.get_feature_names(), columns=['val'])
    df_tmp['feature'] = feature
    df_feature_map = pd.concat([df_feature_map, df_tmp])
print 'countvec finish'

df_feature_map.to_csv(save_path + "feature_important_mapping_cut.csv")
sparse.save_npz(save_path + "data_x_train_cut.npz", data_x_train)
x_train.to_csv(save_path + "x_train_cut.csv", index=None)