def train_and_predict_m8(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM7, stemmer_type='porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=5, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 5), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting Ridge Classifier...")
    clf = RidgeClassifier(class_weight='auto', alpha=1, normalize=True)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'alpha': [0.1, 0.3, 1, 3, 10], 'normalize': [True, False]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def train_and_predict_m1(train, test, labels):
    print("Training M1 (randomState = %d) ..." % randomState)

    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM1, stemmer_type='porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                                 token_pattern=r'\w{1,}', ngram_range=(1, 3), smooth_idf=1, sublinear_tf=1,
                                 stop_words=ML_STOP_WORDS)
    vectorizer.fit(trainData)
    X = vectorizer.transform(trainData)
    X_test = vectorizer.transform(testData)

    ## Use Stemmer post TF-IDF to check if things change
    # print (X)
    print("X.shape: ", X.shape)
    print("X_test.shape: ", X_test.shape)

    ## Create the pipeline
    # 07/02 - RandomizedPCA/PCA does not work on sparse input (so cannot be applied on output of Vectorizer)
    # JimingYe says LDA did not give much benefit.
    clf = Pipeline([('svd', TruncatedSVD(random_state=randomState, n_components=330)),
                    ('scl', StandardScaler()),
                    ('svm', SVC(random_state=randomState, cache_size=500, C=12))])

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'svd__n_components': [200, 250, 300], 'svm__C': [10, 12]}
def tfidf_ize(train, test, node_info):
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    vectorizer.fit(node_info.abstract.as_matrix())
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # Intersection kernel
        table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)

    vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    vectorizer.fit(node_info.abstract.as_matrix())
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # Intersection kernel
        table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    return train, test
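# The `minimum(...).sum(1)` pattern above is the histogram-intersection kernel applied
# row-wise to sparse TF-IDF matrices. A minimal self-contained sketch (toy corpus, all
# names hypothetical) of the same computation:
from sklearn.feature_extraction.text import TfidfVectorizer

docs_a = ["graph neural networks", "sparse matrix kernels"]
docs_b = ["neural networks for graphs", "kernel methods on matrices"]
vec = TfidfVectorizer().fit(docs_a + docs_b)
A = vec.transform(docs_a)
B = vec.transform(docs_b)
# Row-wise intersection kernel: sum over features of min(A_ij, B_ij).
intersection = A.minimum(B).sum(axis=1)  # shape (2, 1), one score per row pair
print(intersection)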
def fit(self, comments, y=None):
    # get the google bad word list
    # with open("google_badlist.txt") as f:
    with open("my_badlist.txt") as f:
        badwords = [l.strip() for l in f.readlines()]
    self.badwords_ = badwords
    print("vectorizing")
    if self.word:
        if self.tokenizer_func is not None:
            def build_tokenizer(func):
                regexp = re.compile(ur"\b\w\w+\b")
                tokenizer = lambda doc: [func(word) for word in regexp.findall(doc)]
                return tokenizer
            tokenizer = build_tokenizer(self.tokenizer_func)
        else:
            tokenizer = None
        countvect = TfidfVectorizer(ngram_range=self.word_range, binary=False,
                                    tokenizer=tokenizer, min_df=2)
        countvect.fit(comments)
        self.countvect = countvect
    if self.char:
        countvect_char = TfidfVectorizer(ngram_range=self.char_range, analyzer="char", binary=False)
        countvect_char.fit(comments)
        self.countvect_char = countvect_char
    return self
def train_and_predict_m3(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM3, stemmer_type='porter')

    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True)
    testData = modified_cleanup(test, stemmer, is_train = False)
    """

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 6), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    clf = SGDClassifier(random_state=randomState, n_jobs=1, penalty='l2', loss='huber',
                        n_iter=50, class_weight='auto', learning_rate='optimal', epsilon=1)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter': [30, 50, 80, 100, 200], 'loss': ['huber'],
                  'epsilon': [0.3, 1], 'alpha': [0.0001, 0.0003, 0.001]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def compute_tf_idf_vectorizer(data_path="/Users/HyNguyen/Documents/Research/Data/stories",
                              save_path="exsum/tf_idf_vectorizer_200_05.pickle",
                              min_df=200, max_df=0.5):
    """
    Detail:
    Params:
        data_path: data directory
        save_path: where the fitted vectorizer is saved; suffix 200_05 means min_df=200, max_df=0.5 (fraction of documents)
        min_df: lower document-frequency bound
        max_df: upper document-frequency bound
    """
    dataset = loadData(data_path)
    documents = []
    for counter, sample in enumerate(dataset):
        filename, contents, highlights = sample
        content_str = ""
        for content in contents:
            if content[-1] != ".":
                content += "."
            content_str += " " + content
        documents.append(content_str)
    tf_idf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                        stop_words=stopwords.words('english'))
    tf_idf_vectorizer.fit(documents)
    with open(save_path, mode="wb") as f:
        pickle.dump(tf_idf_vectorizer, f)
    # note: the fitted attribute is vocabulary_ (the bare `vocabulary` constructor
    # parameter is None here and has no length)
    print("Tf-idf Vectorizer: length of vocabulary: ", len(tf_idf_vectorizer.vocabulary_))
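# A minimal companion sketch (assumed usage, not from the source): load the vectorizer
# pickled by the function above and transform new text with it.
import pickle

with open("exsum/tf_idf_vectorizer_200_05.pickle", "rb") as f:
    vec = pickle.load(f)
matrix = vec.transform(["some new document to score ."])
print(matrix.shape)  # (1, vocabulary size)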
def test_tfidfvectorizer_invalid_idf_attr():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
    expected_idf_len = len(vect.idf_)
    invalid_idf = [1.0] * (expected_idf_len + 1)
    assert_raises(ValueError, setattr, copy, 'idf_', invalid_idf)
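# The test above exercises the validation in sklearn's idf_ setter. For contrast, a
# small sketch of the valid case (toy docs are an assumption): idf weights whose length
# matches the vocabulary can be copied onto a vectorizer built with that vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["pizza pizza burger", "burger fries", "fries pizza"]
vect = TfidfVectorizer(use_idf=True).fit(docs)
copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
copy.idf_ = vect.idf_  # same length as the vocabulary, so the setter accepts it
# both vectorizers should now produce identical matrices
print((copy.transform(docs) != vect.transform(docs)).nnz == 0)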
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):
        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):
        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)
        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)
        return self._vec.transform(data_arr).toarray()
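# A hypothetical usage sketch for the transformer above (toy DataFrame, values are
# assumptions): each row's active Medical_Keyword_* flags are joined into a
# pseudo-document, so TF-IDF down-weights keywords that nearly every row sets.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
cols = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df = pd.DataFrame(rng.randint(0, 2, size=(10, 48)), columns=cols)
mk = MedicalKeywordTfIdf().fit(df)
print(mk.transform(df).shape)  # (10, number of keywords kept by min_df/max_df)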
def ridge_003():
    print('*** CLEANING ***')
    tfidf_wrd = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='word',
                                ngram_range=(1, 3), lowercase=True, stop_words='english',
                                min_df=3, max_df=0.5)
    tfidf_wrd.fit(train_set['tweet'])
    X_train_wrd = tfidf_wrd.transform(train_set['tweet'])
    X_test_wrd = tfidf_wrd.transform(test_set['tweet'])

    tfidf_char = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='char',
                                 ngram_range=(4, 10), lowercase=True, stop_words='english',
                                 min_df=3, max_df=0.5)
    tfidf_char.fit(train_set['tweet'])
    X_train_char = tfidf_char.transform(train_set['tweet'])
    X_test_char = tfidf_char.transform(test_set['tweet'])

    y_train = np.array(train_set.ix[:, 4:])

    print('*** TRAINING ***')
    mdl_wrd = model.ridge(X_train_wrd, y_train)
    mdl_char = model.ridge(X_train_char, y_train)

    print('*** PREDICTING ***')
    test_prediction_wrd = mdl_wrd.predict(X_test_wrd)
    test_prediction_char = mdl_char.predict(X_test_char)
    test_prediction = (test_prediction_wrd + test_prediction_char) / 2

    print('*** OUTPUTTING ***')
    output('results/ridge_003.csv', test_prediction)
def processEssay(self, testidx, trainidx):
    # process essay
    self.rawdata['essay'] = self.rawdata['essay'].apply(clean)
    self.trdata = self.rawdata['essay'].ix[trainidx]
    self.testdata = self.rawdata['essay'].ix[testidx]
    trainessay = np.array(self.trdata.fillna('Missing'))
    testessay = np.array(self.testdata.fillna('Missing'))
    tfidfEs = TfidfVectorizer(min_df=4, max_features=500)
    tfidfEs.fit(trainessay)
    #=======================================================================
    # #process need statement
    # self.rawdata['need_statement'] = self.rawdata['need_statement'].apply(clean)
    # self.trdata = self.rawdata['need_statement'].ix[trainidx]
    # self.testdata = self.rawdata['need_statement'].ix[testidx]
    # trainneedst = np.array(self.trdata.fillna('Missing'))
    # testneedst = np.array(self.testdata.fillna('Missing'))
    # tfidfNs = TfidfVectorizer(min_df=3, max_features=20)
    # tfidfNs.fit(trainneedst)
    #
    # #process short desc
    # self.rawdata['short_description'] = self.rawdata['short_description'].apply(clean)
    # self.trdata = self.rawdata['short_description'].ix[trainidx]
    # self.testdata = self.rawdata['short_description'].ix[testidx]
    # trainshortd = np.array(self.trdata.fillna('Missing'))
    # testshortd = np.array(self.testdata.fillna('Missing'))
    # tfidfSd = TfidfVectorizer(min_df=3, max_features=20)
    # tfidfSd.fit(trainshortd)
    #
    # self.exdata_train = sp.hstack((tfidfEs.transform(trainessay), tfidfNs.transform(trainneedst), tfidfSd.transform(trainshortd)))
    # self.exdata_test = sp.hstack((tfidfEs.transform(testessay), tfidfNs.transform(testneedst), tfidfSd.transform(testshortd)))
    #=======================================================================
    self.exdata_train = tfidfEs.transform(trainessay)  # only use the essay
    self.exdata_test = tfidfEs.transform(testessay)
def _calculate_tfidf_vectorizer(base_corpus_name=BASE_CORPUS_NAME):
    index_to_token = load_index_to_item(get_index_to_token_path(base_corpus_name))
    token_to_index = {v: k for k, v in index_to_token.items()}
    train_lines = _load_train_lines()
    tfidf_vectorizer = TfidfVectorizer(tokenizer=get_tokens_sequence, vocabulary=token_to_index)
    tfidf_vectorizer.fit(train_lines)
    return tfidf_vectorizer
def num_feat_select(n, k):
    tfidf = TfidfVectorizer(max_features=n, strip_accents='unicode',
                            tokenizer=MyTokenizer(), analyzer='word')
    tfidf.fit(train['tweet'])
    trainf = tfidf.transform(train['tweet'])
    testf = tfidf.transform(test['tweet'])
    trainlab = np.array(train.ix[:, 4:])
    knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn.fit(trainf, trainlab)
    print 'here'
    tim = time.time()
    n = 10  # note: n is re-used here as the number of 1000-row prediction batches
    pred = []
    for i in range(0, n):
        pred.extend(knn.predict(testf[(i * 1000):((i + 1) * 1000)]))
        print(i)
    print "time: " + str(time.time() - tim)
    # RMSE:
    testlab = np.array(test.ix[:, 4:])
    err = format(np.sqrt(np.sum(np.array(np.array(pred - testlab) ** 2) / (testf.shape[0] * 24.0))))
    print err
def main():
    print "loading data.."
    traindata = list(np.array(p.read_table('/Users/lyj/Downloads/train.tsv'))[:, 2])
    testdata = list(np.array(p.read_table('/Users/lyj/Downloads/test.tsv'))[:, 2])
    y = np.array(p.read_table('/Users/lyj/Downloads/train.tsv'))[:, -1]

    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                          use_idf=1, smooth_idf=1, sublinear_tf=1)
    rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1, fit_intercept=True,
                               intercept_scaling=1.0, class_weight=None, random_state=None)

    X_all = traindata + testdata
    lentrain = len(traindata)

    print "fitting pipeline"
    tfv.fit(X_all)
    print "transforming data"
    X_all = tfv.transform(X_all)

    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(rd, X, y, cv=20, scoring='roc_auc'))

    print "training on full data"
    rd.fit(X, y)
    pred = rd.predict_proba(X_test)[:, 1]
    testfile = p.read_csv('/Users/lyj/Downloads/data/test.tsv', sep="\t", na_values=['?'], index_col=1)
    pred_df = p.DataFrame(pred, index=testfile.index, columns=['label'])
    pred_df.to_csv('benchmark.csv')
    print "submission file created.."
class TfidfBuilder:
    def __init__(self, filtered_out_words=[]):
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(tokenizer=self.get_tokens)
        self.filtered_out_words = filtered_out_words

    def filter(self, word):
        result = True
        if word in self.filtered_out_words:
            result = False
        return result

    def get_tokens(self, text):
        all_tokens = nltk.word_tokenize(text)
        filtered_tokens = [word for word in all_tokens if self.filter(word)]
        lemmatized_tokens = [self.lemmatizer.lemmatize(word) for word in filtered_tokens]
        return lemmatized_tokens

    def to_tfidf(self, documents):
        self.tfidf.fit(documents)
        return self.tfidf

    def to_tfidf_vector(self, document):
        return self.tfidf.transform([document]).toarray()
def scoring(self):
    '''
    Scoring articles based on their frequency of usage on Wikipedia
    '''
    vectorizer = TfidfVectorizer()
    vectorizer.fit(self.articles)
    idf_score = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

    ## Opening pickle of Wikipedia word frequencies
    with open('wiki_freq.pickle', 'r') as f:
        wiki_freq = pickle.load(f)

    for i, doc in enumerate(self.docs):
        total_score = 0
        doc_word_score = []
        for word in doc:
            word_score = 0
            try:
                word_score = wiki_freq[word] * idf_score[word]
                total_score += word_score
                doc_word_score.append((word, word_score))
            except KeyError:  # word missing from either lookup table
                pass
        doc_word_score.sort(key=lambda x: x[1], reverse=True)
        self.doc_scores.append(total_score / (float(len(doc) + 1)))
        self.article_info[i]['topwords'] = doc_word_score[0:25]
        self.article_info[i]['score'] = self.doc_scores[i]

    self.article_info.sort(key=lambda x: x['score'], reverse=True)
    self.article_rank = {rank: key for rank, key in enumerate(self.article_info, 1)}
def _train(self, train_data, resources):
    sample_length = len(train_data)
    dict_status_path = os.path.join(root_dic,
                                    'dict_vectorizer_{}.status'.format(sample_length))
    if os.path.isfile(dict_status_path):
        dictVectorizer = joblib.load(dict_status_path)
    else:
        dictVectorizer = DictVectorizer()
        dictVectorizer.fit(train_data[self.features].fillna(0).to_dict('records'))
        joblib.dump(dictVectorizer, dict_status_path)

    tfidf_status_path = os.path.join(root_dic,
                                     'tfidf_vectorizer_{}.status'.format(sample_length))
    if os.path.isfile(tfidf_status_path):
        tfidf = joblib.load(tfidf_status_path)
    else:
        tfidf = TfidfVectorizer(min_df=40, max_features=300)
        tfidf.fit(train_data.essay)
        joblib.dump(tfidf, tfidf_status_path)

    resources['dictVectorizer'] = dictVectorizer
    resources['tfidf'] = tfidf
    print 'Head Processing Completed'
    return train_data, resources
def vectorize(data, new_doc, local=False):
    """ Converts data and new doc to vectors that can be used in KNN """
    vectorizer = TfidfVectorizer(use_idf=True)
    glossaries = dict(map(lambda x: (x, data.tag_glossary(x)), data.tags()))
    vectorizer.fit(glossaries.values())  # Get all glossaries for all tags
    glossary_bows = vectorizer.transform(glossaries.values())
    glossary_bows = dict(zip(glossaries.keys(), glossary_bows))
    zero_vector = sparse.csc_matrix((1, len(vectorizer.get_feature_names())))

    descriptors = []
    doc_tags = map(lambda x: (x[0], x[1]['tags']), data.data['items'].items())
    for key, tags in doc_tags:
        bows = map(lambda x: glossary_bows[x], tags)
        descriptor = (sum(bows) + zero_vector)  #/ float(len(tags) + 1)
        descriptors += [(key, descriptor)]

    # Get all tags for the new document
    new_doc_descriptor = sum(map(lambda x: glossary_bows[x], new_doc['tags'])) + zero_vector

    if local:
        return (descriptors, new_doc_descriptor, vectorizer)
    return (descriptors, new_doc_descriptor)
def create_vectorizer(self, names):
    # create the transform
    vectorizer = TfidfVectorizer(stop_words='english')
    # tokenize and build vocab
    vectorizer.fit(names)
    return vectorizer
def trainTFIDF2(bow21features, bow2kfold, test):
    idx = (test[0][:, 0]).astype(int)
    tfv = TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='ascii',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                          use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english')
    pipeline = Pipeline(
        [('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5,
                              random_state=None, tol=0.0)),
         ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
         ('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
                     probability=False, tol=0.001, cache_size=200, class_weight=None,
                     verbose=False, max_iter=-1, random_state=None))])

    tfidf2CrossValidationTest = None
    if toTestModel:
        tfidf2CrossValidationTest = tfidfCrossValidation(tfv, pipeline, bow2kfold)

    trainData, lblsTrain, testData, lblstest = bow21features
    tfv.fit(trainData)
    X_train = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    if isinstance(lblsTrain, list):
        lblsTrain = lblsTrain[0]
    lblsTrain = lblsTrain.astype(int)
    pipeline.fit(X_train, lblsTrain)
    predictions = pipeline.predict(X_test)
    finalResults = pd.DataFrame({"id": idx, "prediction": predictions})
    return tfidf2CrossValidationTest, finalResults
def main(files):
    word2vec_file_path = files[0]
    output_file_path = files[1]
    input_file_paths = files[2:]
    print 'processing files:', input_file_paths, 'using word vector file', word2vec_file_path
    print 'outputting to', output_file_path

    vocab = []
    line_counter = 0
    vectorizer = TfidfVectorizer(input='filename')
    vectorizer.fit(input_file_paths)
    for word in vectorizer.vocabulary_:
        word = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", word)
        if not (word in vocab):
            vocab.append(word.encode('ascii', 'replace'))
    print "len vocab =", len(vocab)

    with open(output_file_path, 'w') as output_file:
        with open(word2vec_file_path) as word2vec:
            while True:
                line = word2vec.readline()
                if not line:
                    break
                else:
                    tokens = tokenize(line)
                    word, vector = tokens[0], tokens[1:]
                    word = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", word)
                    if word in vocab:
                        output_file.write(word + ' ')
                        for token in vector:
                            output_file.write(token + ' ')
                        output_file.write('\n')
                        del vocab[vocab.index(word)]
                        line_counter += 1
    print 'len file =', line_counter
def extract_tfidf_nmf_feats(self, df_data, n_components):
    """ Extract tfidf features using nmf. """
    df_feat = pd.DataFrame(index=range(df_data.shape[0]))
    tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
    tsvd = TruncatedSVD(n_components=n_components, random_state=2016)
    nmf = NMF(solver='cd', n_components=n_components, init='nndsvda', random_state=0, tol=1e-3)
    df_data['q'].to_csv('q', index=False)
    df_data['t'].to_csv('t', index=False)
    df_data['d'].to_csv('d', index=False)
    print('fitting in tfidf')
    tfidf.set_params(input='filename')
    tfidf.fit(['q', 't', 'd'])
    tfidf.set_params(input='content')
    for col in ['d', 't', 'q', 'b']:
        print('process column', col)
        txt = df_data[col]
        tfidf_mat = tfidf.transform(txt)
        nd_feat = nmf.fit_transform(tfidf_mat)
        tmp = pd.DataFrame(nd_feat, columns=[col + '_tfidf_nmf_comp' + str(i)
                                             for i in range(n_components)])
        df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True)
    saveit(df_feat, 'df_tfidf_nmf_feats')
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]

    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    #     'This is the first document.',
    #     'This is the second second document.',
    #     'And the third one.',
    #     'Is this the first document?',
    # ]
    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())

    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary_
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
def get_vectorizer():
    if os.path.isfile(j_vec):
        vec = joblib.load(j_vec)
        return vec
    touch(j_vec)

    # load french stop words list
    STOPWORDS = []
    with open("stop-words_french_1_fr.txt", "r") as f:
        STOPWORDS += f.read().split("\n")
    with open("stop-words_french_2_fr.txt", "r") as f:
        STOPWORDS += f.read().split("\n")
    STOPWORDS = set(STOPWORDS)

    vec = TfidfVectorizer(
        min_df=1,
        max_features=123456,
        stop_words=STOPWORDS,
        strip_accents="unicode",
        smooth_idf=True,
        norm="l2",
        sublinear_tf=False,
        use_idf=True,
        ngram_range=(1, 3),
    )
    df_test = pd.read_csv(f_test, sep=";")
    vec.fit(iterText(df_test))
    joblib.dump(vec, j_vec)
    return vec
def train_and_predict_m6(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM6, stemmer_type='snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 3), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting K-Nearest Neighbors...")
    clf = KNeighborsClassifier(p=2, n_neighbors=5)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'n_neighbors': [3, 4, 5, 6, 7], 'weights': ['uniform', 'distance'],
                  'leaf_size': [1, 3, 5, 10]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def vectorize(data, new_doc, local=False):
    """ Vectorize the data as described in file docstring. """
    # Generator for all glossaries
    glossaries = lambda: (data.tag_glossary(t) for t in data.tags())

    # Create the bag of words descriptors for each glossary
    vectorizer = TfidfVectorizer(use_idf=True)
    vectorizer.fit(glossaries())
    tag_bows = dict(zip(data.tags(), vectorizer.transform(glossaries())))

    # Count the number of occurrences for each tag
    tag_counter = Counter()
    for i in data.items():
        tag_counter.update(data.item(i)['tags'])

    # Generator for lists of tags for each item
    item_tags = (data.item(i)['tags'] for i in data.items())

    # The number of dimensions in the bow vector
    v_dim = len(vectorizer.get_feature_names())

    # lambda function to create descriptors
    create_desc = lambda x: create_descriptor(x, tag_bows, tag_counter, v_dim,
                                              len(data.data['items']))

    # Create descriptors for all known documents and new document
    item_descriptors = [create_desc(tags) for tags in item_tags]
    new_doc_descriptor = create_desc(new_doc['tags'])

    # For analysis or use in other vectorizers, also return the vectorizer itself
    if local:
        return (zip(data.items(), item_descriptors), new_doc_descriptor, vectorizer)

    # Associate document ids with descriptors and return.
    return (zip(data.items(), item_descriptors), new_doc_descriptor)
def train_and_predict_m7(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM7, stemmer_type='snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=5, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 5), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting Passive-Aggressive Classifier...")
    clf = PassiveAggressiveClassifier(random_state=randomState, loss='squared_hinge',
                                      n_iter=100, C=0.01)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'C': [0.003, 0.01, 0.03, 0.1], 'loss': ['hinge', 'squared_hinge'],
                  'n_iter': [5, 10, 30, 100, 300]}
    #param_grid = {'C': [0.003, 0.01, 0.03, 0.1, 0.3, 1], 'loss': ['hinge'], 'n_iter': [5, 10, 30, 100, 300, 1000]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def train_and_predict_m4(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM4, stemmer_type='porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 6), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    clf = LogisticRegression(random_state=randomState, penalty='l2', C=12, class_weight='auto')

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    #param_grid = {'C': [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 30], 'penalty': ['l2']}
    param_grid = {'C': [1, 3, 5, 6, 7, 8, 10, 11, 12], 'penalty': ['l2']}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def train_and_predict_m5(train, test, labels):
    # Beautiful soup cleanup and stemming (just to mix it up)
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train=True, pretag='full')
    testData = modified_cleanup(test, stemmer, is_train=False, pretag='full')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 3), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting Multinomial Naive Bayes...")
    clf = MultinomialNB(alpha=0.03)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # param_grid = {'alpha': [0.01, 0.03, 0.1, 0.3, 1]}
    param_grid = {'alpha': [0.01, 0.03]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def calc_tfidf_cosine(file_name):
    print "calculating cosine similarity"
    data = pd.read_csv(file_name)
    prod_titles = list(data.apply(lambda x: '%s' % (x['product_title']), axis=1))
    queries = list(data.apply(lambda x: '%s' % (x['query']), axis=1))

    # custom stoplist, supplied to stop_words as a **LIST**
    # (this improved the score over the default stop words)
    stoplist = list('for a of the and to in with an on oz lbs. lbs ft ft. in. ml inch cu. cu ft. ft up cm oz. mm ounce'.split())

    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 5),
                          use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words=stoplist)  # pass the list itself, not the string 'stoplist'

    # Fit TFIDF
    tfv.fit(prod_titles)
    prod_title_tfidf = tfv.transform(prod_titles)
    # transpose for matrix multiplication and division
    pt_tfidf_T = np.transpose(prod_title_tfidf)
    print pt_tfidf_T.shape

    tfv.fit(queries)
    query_tfidf = tfv.transform(queries)
    q_tfidf_T = np.transpose(query_tfidf)
    print q_tfidf_T.shape

    cosine_tfidf = cosine_similarity(pt_tfidf_T[0:1], q_tfidf_T[0:1])
    print cosine_tfidf
    return cosine_tfidf
def tfIDFeats(ids, data):
    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 5),
                          use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    # Fit TFIDF
    tfv.fit(data)
    X = tfv.transform(data)

    # Initialize SVD
    svd = TruncatedSVD(n_components=350)
    # Initialize the standard scaler
    scl = StandardScaler(with_mean=False)
    if X.shape[1] > 350:
        X = svd.fit_transform(X)
        X = scl.fit_transform(X, ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X, ids)
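# The SVD-on-TF-IDF step above is latent semantic analysis (LSA). A compact,
# self-contained sketch of the same idea on assumed toy data; note PCA, used for
# plotting above, only works once the matrix has been densified by the SVD step:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat", "the dog sat", "cats and dogs", "quantum field theory"]
X = TfidfVectorizer().fit_transform(docs)               # sparse (4, n_terms)
X_lsa = TruncatedSVD(n_components=2).fit_transform(X)   # dense (4, 2)
print(X_lsa.shape)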
x_text = X.Text
x_sm = train = X[feature_cols_sm]
# Converting data frame to sparse matrix
x_sm = scipy.sparse.csr_matrix(x_sm.values)
y = dataset.IsCyberbullying  # Target

# 1.2) Feature Extraction (Textual Features)
# The terms' weights were calculated using Term Frequency - Inverse Document Frequency (TF-IDF)
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=50000)
tfidf_vect.fit(x_text)
x_text_tfidf = tfidf_vect.transform(x_text)

# 1.3) Feature Selection (Textual Features)
# Feature selection using a chi-square score was applied for each machine learning
# algorithm to select relevant textual features.
# COMMENT OUT the following block when experimenting with different feature sizes per classifier
clf = svm.SVC()
for x in range(5, 23, 15):
    test = SelectKBest(score_func=chi2, k=x)
    fit = test.fit(x_sm, y)
    x_s = fit.transform(x_sm)
    scores = cross_val_score(clf, x_s, y, cv=10)
    # print(scores)
test = SelectKBest(score_func=chi2, k=15)
#f = open("../datasets/classifier2_datasetA_pickle.pkl", "rb") f = open("../datasets/classifier1_datasetA_Combined_pickle.pkl", "rb") filter_tweets = pickle.load(f) y = pickle.load(f) print(len(filter_tweets)) print(len(y)) def baseform(input): ans=[] for i in word_tokenize(input): ans.append(nltk.wordnet.WordNetLemmatizer().lemmatize(i)) #Alternative ans.append(PorterStemmer().stem(i)) return ans train_X, test_X, train_y, test_y = train_test_split(filter_tweets, y, test_size = 0.25, random_state = 42) vectorizer = TfidfVectorizer(ngram_range = (1,2),tokenizer=baseform,max_features=2000)#Remove max features parameter for 2nd classifier vectorizer.fit(train_X, train_y) print(len(train_X)) print(len(test_X)) train_X = vectorizer.transform(train_X) test_X = vectorizer.transform(test_X) print(train_X.shape) print(test_X.shape) train_X = train_X.toarray() test_X = test_X.toarray() from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import accuracy_score from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import confusion_matrix clf_RandomForest = RandomForestClassifier(n_estimators=120) clf_RandomForest.fit(train_X, train_y)
print('Done!\n')

## Split data for train and test the model
print("\n\n#############################################################################################################")
print("\nTraining Classifier...")
# train_size and test_size must sum to at most 1.0; the original 0.75/0.85 pair would raise a ValueError
X_train, X_test, y_train, y_test = train_test_split(whole_text, class_label,
                                                    train_size=0.75, test_size=0.25)

# Vectorization
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)
joblib.dump(vectorizer, filename_vec)

## Deletion of repeated columns.
variance_filter = VarianceThreshold()
variance_filter.fit(X_train, y_train)
joblib.dump(variance_filter, filename_variance)
X_train = variance_filter.transform(X_train)
X_test = variance_filter.transform(X_test)

percentile_filter = SelectPercentile(f_classif, percentile=50)
percentile_filter.fit(X_train, y_train)
X_train = percentile_filter.transform(X_train)
X_test = percentile_filter.transform(X_test)
def tfidf_transform(weights):
    tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(), norm=None)
    tfidf.fit(weights.Tweet)
    features = pd.Series(tfidf.get_feature_names())
    transformed = tfidf.transform(weights.Tweet)
    return features, transformed
f1 = open(fpTestWLabel, 'r')
arrLabels = f1.read().strip().split('\n')
f1.close()
f1 = open(fpTestWLocation, 'r')
arrLocations = f1.read().strip().split('\n')
f1.close()
for i in range(0, len(arrItems)):
    item = arrItems[i]
    X_TestW.append(item)
    y_TestW.append(arrLabels[i])
    l_TestW.append(arrLocations[i])
    lstAllText.append(item)

vectorizer = TfidfVectorizer(ngram_range=(1, 4), max_features=1000)
model = vectorizer.fit(lstAllText)
vec_total_all = model.transform(lstAllText).toarray()
vec_train_all = model.transform(X_Train).toarray()
vec_testP_all = model.transform(X_TestP).toarray()
vec_testW_all = model.transform(X_TestW).toarray()

#pca = PCA(n_components=100)
print('prepare to fit transform')
# modelPCA = pca.fit(vec_total_all)
# vec_train = modelPCA.transform(vec_train_all)
# vec_testP = modelPCA.transform(vec_testP_all)
# vec_testW = modelPCA.transform(vec_testW_all)
vec_train = vec_train_all
vec_testP = vec_testP_all
vec_testW = vec_testW_all
print('end fit transform')
train = pd.read_csv('C:/Users/Jie.Hu/Desktop/Data Science/Practice/Kaggle_nlp_1/train.csv').fillna("unknown")
test = pd.read_csv('C:/Users/Jie.Hu/Desktop/Data Science/Practice/Kaggle_nlp_1/test.csv').fillna("unknown")

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    #token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 3))
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 4))
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

train_features = hstack([train_word_features, train_char_features])
test_features = hstack([test_word_features, test_char_features])
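# A possible next step for the snippet above (assumed, not from the source): in this
# multi-label toxic-comment setup, the stacked word+char matrix is typically fed to one
# linear model per label column; the column names below follow the dataset's schema.
from sklearn.linear_model import LogisticRegression

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for col in label_cols:
    clf = LogisticRegression(solver='liblinear')
    clf.fit(train_features, train[col])
    # submission probabilities would come from clf.predict_proba(test_features)[:, 1]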
# comments_test = transform_com.transform(test['comment_text'])
# gc.collect()
word_vect = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='word',
                            token_pattern=r'\w{1,}', ngram_range=(1, 1), max_features=20000)
char_vect = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char',
                            ngram_range=(1, 4), max_features=20000)
word_vect.fit(pd.concat([train['comment_text'], test['comment_text']], axis=0))
char_vect.fit(pd.concat([train['comment_text'], test['comment_text']], axis=0))

comments_train_word = word_vect.transform(train_mes)
comments_train_char = char_vect.transform(train_mes)
comments_valid_word = word_vect.transform(valid_mes)
comments_valid_char = char_vect.transform(valid_mes)
comments_test_word = word_vect.transform(test['comment_text'])
comments_test_char = char_vect.transform(test['comment_text'])

comments_train = hstack((comments_train_word, comments_train_char))
comments_valid = hstack((comments_valid_word, comments_valid_char))
comments_test = hstack((comments_test_word, comments_test_char))

import xgboost as xgb
'''
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2, num_rounds=400):
    param = {}
    param['objective'] = 'binary:logistic'
    return txt

#print(raw_data.head())
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_data[['text', 'text_length', 'Punct_pc']], raw_data['label'],
    test_size=0.2, random_state=123)

'''print(pd.crosstab(Y_train,columns = 'label',normalize=True))
print(pd.crosstab(Y_test,columns = 'label',normalize=True))
print(X_train.head())'''

Tfidf_Vect = TfidfVectorizer(analyzer=clean_data)
Tfidf_vect_fit = Tfidf_Vect.fit(X_train['text'])
X_train_Tfidf_vect = Tfidf_vect_fit.transform(X_train['text'])
X_test_Tfidf_vect = Tfidf_vect_fit.transform(X_test['text'])

X_train_vect = pd.concat([
    X_train[['text_length', 'Punct_pc']].reset_index(drop=True),
    pd.DataFrame(X_train_Tfidf_vect.toarray())
], axis=1)
X_test_vect = pd.concat([
    X_test[['text_length', 'Punct_pc']].reset_index(drop=True),
    pd.DataFrame(X_test_Tfidf_vect.toarray())
], axis=1)
#print (xtrain.shape)
#print (xvalid.shape)

# Using TF-IDF as features
tfv = TfidfVectorizer(min_df=1, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3),
                      use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

## Fit and transform reviews text to sparse TF-IDF features matrix
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

# Using Bag of Words as features
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), stop_words='english')

# Fitting Count Vectorizer to both training and test sets
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)
print("Se cargaron datos de clasificación en: " + str(tiempoFinal)) ################################################################################ ########## Vectorizacion ########## ################################################################################ print("----------------------------------") print("\n Creando la representacion numerica (vectorizando tweets)") tiempo0 = time() # max_features : int or None, default=None #If not None, build a vocabulary that only consider the top max_features # ordered by term frequency across the corpus. # This parameter is ignored if vocabulary is not None. vectorizador = TfidfVectorizer(max_features=54) #vectorizador = TfidfVectorizer(max_features=74) #TfidfVectorizer: Convert a collection of raw documents to a matrix of TF-IDF features. vectorizador.fit(bolsaDePalabras) #fit(raw_documents[, y]) Learn vocabulary and idf from training set. x_matrizSetEntrenamientoVect = vectorizador.fit_transform( x_setEntrenamientoTweets) #Learn vocabulary and idf, return term-document matrix. #This is equivalent to fit followed by transform, but more efficiently implemented. #Parameters: raw_documents : iterable #an iterable which yields either str, unicode or file objects #Returns: #X : sparse matrix, [n_samples, n_features] #Tf-idf-weighted document-term matrix. print(x_matrizSetEntrenamientoVect) #transform(raw_documents, copy=True)[source]. #Uses the vocabulary and document frequencies (df) learned by fit (or fit_transform).
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, "kfold"] = f

# creating test and train dataframes
for fold_ in range(5):
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)

    # tfidf vectorizer initialization
    tfidf = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
    # print(cv)

    # fit the vectorizer to the real/fake tweets (tweet text)
    tfidf.fit(train_df.text)

    # transform into sparse term-document matrices
    xtrain = tfidf.transform(train_df.text)
    xtest = tfidf.transform(test_df.text)
    xvalidate = tfidf.transform(validation_df.text)
    # print(xtrain)
    # print("_"*50)
    # print(xtest)

    # Logistic Regression Model
    logistic_model = linear_model.LogisticRegression(solver="liblinear")

    # fit logistic model
    logistic_model.fit(xtrain, train_df.target)
per_word_vocab = dict()
for substitutes_dump in substs_list:
    loader = Substs_loader(data_name, lemmatizing_method='all',
                           drop_duplicates=False, count_lemmas_weights=True)
    df = loader.get_substitutes(substitutes_dump, topk)
    substs_texts = df['substs']
    for word in df.word.unique():
        mask = (df.word == word)
        vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", min_df=min_df, max_df=max_df)
        vec.fit(substs_texts[mask])
        words = set([w for w in vec.vocabulary_])
        if word not in per_word_vocab:
            per_word_vocab[word] = words
        else:
            per_word_vocab[word].update(words)

for word in per_word_vocab:
    per_word_vocab[word] = {w: i for i, w in enumerate(per_word_vocab[word])}

data = dict()
for substitutes_dump in substs_list:
    loader = Substs_loader(data_name, lemmatizing_method='all',
                           drop_duplicates=False, count_lemmas_weights=True)
def feature_engineering(self, sentence_df, input_column, output_column, method='tfidf', model_param={}):
    """
    Build text features.
    :param sentence_df:
    :param input_column:
    :param output_column:
    :param method:
    :param model_param:
    :return:
    """
    method = method.lower()
    cut_words = sentence_df[input_column]
    print(cut_words)
    if method == 'tfidf':
        """
        TF-IDF (term frequency / inverse document frequency) is one of the most popular
        IR (information retrieval) techniques, used to analyze how important a word is
        in a document. Studies show about 83% of text-based recommender systems use TF-IDF.
        TF-IDF measures the importance of words within a document.
        For example, "the" is common in any document, so TF-IDF does not consider "the"
        important to a document's character. In IT-related topics, by contrast, "python"
        is used often, so TF-IDF treats "python" as an important feature word for
        identifying the topic and category.
        """
        vectorizer = TfidfVectorizer(norm=None, stop_words=None)
        # At prediction time, new words must be added and the weights recomputed
        if self.transformer:
            # vectorizer.vocabulary = self.transformer.vocabulary_
            x_train_feature = self.transformer.transform(cut_words)
        else:
            self.transformer = vectorizer.fit(cut_words)
            x_train_feature = self.transformer.transform(cut_words)
        output_value = list(x_train_feature.toarray())
    elif method == 'count':
        """
        Bag of Words is a representation model for document data.
        It simply counts how many times each word appears in a document.
        Bag-of-Words is commonly used for clustering, classification, and topic modeling
        by weighting distinctive words and related terms.
        """
        vectorizer = CountVectorizer(stop_words=None, token_pattern=r"(?u)\b\w+\b")
        # At prediction time, new words must be added and the weights recomputed
        if self.transformer:
            vectorizer.vocabulary = self.transformer.vocabulary_
        self.transformer = vectorizer.fit(cut_words)
        x_train_feature = self.transformer.transform(cut_words)
        output_value = list(x_train_feature.toarray())
    elif method == 'word2vec':
        """
        Word2vec is good at grouping similar words and makes highly accurate guesses
        about a word's meaning from its context.
        Internally it has two different algorithms: CBoW (Continuous Bag-of-Words)
        and the skip-gram model.
        Skip-gram predicts the context from a target word;
        CBoW predicts the target word from its context.
        """
        output_value = []
    else:
        output_value = []
    # convert np array to dataframe series
    sentence_df[output_column] = output_value
    # # No need to persist the feature transformer at prediction time
    # if is_training:
    #     # Save the feature transformer to fs; it must be used for ML prediction
    #     self.save_vectorizer(feature_transformer_path, feature_transformer_name)
    return sentence_df
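# The word2vec branch above is left unimplemented. A minimal sketch of what it might
# look like with gensim (an assumed dependency; `vector_size` is the gensim 4.x name,
# older releases call it `size`); document vectors are a simple mean of word vectors,
# and `cut_words` stands for the same column of pre-tokenized text as in the method above.
import numpy as np
from gensim.models import Word2Vec

tokenized = [str(s).split() for s in cut_words]
w2v = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, sg=1)
doc_vecs = [np.mean([w2v.wv[w] for w in toks], axis=0) if toks else np.zeros(100)
            for toks in tokenized]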
classification = model.predict(X_test)
classification = pd.DataFrame(classification)
classification = classification.set_index(a.index)
a.index
classification.index
list(a)
data = pd.concat([a[['raw_Comments']], classification], axis=1)
data.head
data.shape
#data.to_excel("Software_Calls_Classification.xlsx")

os.chdir(r"C:\Users\SG185314\Desktop\Education\py\Software")
trftran = tfidfconverter.fit(df["cleaned_Comments"])
y = trftran.fit_transform(df["cleaned_Comments"])

with open('text_classifier', 'wb') as picklefile:
    pickle.dump(clf, picklefile)
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)
with open('tfidf_transform', 'wb') as picklefile:
    pickle.dump(trftran, picklefile)
with open('tfidf_transform', 'rb') as training_model:
    transformation = pickle.load(training_model)
from sklearn.pipeline import Pipeline
from sklearn import svm
import numpy as np

if __name__ == '__main__':
    model_file_name = 'model'
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--training_data_file', type=str, default='train.json')
    args = parser.parse_args()
    training_data_file = args.training_data_file
    with open(training_data_file, 'rb') as f:
        train_data = json.loads(f.read())
    x_train = [t['data'] for t in train_data]
    y_train = [t['label'] for t in train_data]
    tv = TfidfVectorizer(max_features=50000, ngram_range=(1, 5))
    tv.fit(x_train)
    classifier = svm.LinearSVC()
    classifier.fit(tv.transform(x_train), y_train)
    joblib.dump(classifier, model_file_name)
    # Pipeline = ([('vect', TfidfVectorizer()), ('lsvc', svm.LinearSVC())])
    # parameter = {'vect__max_feature': 15000, 'vect__ngram_range': [(1, 5)], 'lsvc': }
except: print(" error:", sys.exc_info(), 'sentence:', sentence) return "" goods_nms = [] goods_nms = datas['goods_nm'].values with Pool(processes=cpu_count()) as pool: goods_nms = pool.map(word_analysis, goods_nms) # for i, row in datas.iterrows(): # goods_nms.append(word_analysis(row.goods_nm)) # if i%10000 == 0: # print(datetime.datetime.now(),': word_analysis:', i) # datas['goods_nm_ana'] = goods_nms # 형태소 분석 완료 datas = datas.drop(columns=['goods_nm']) ctv = TfidfVectorizer() ctv.fit(datas['goods_nm_ana']) with open('cate_suggest_ctv.dump', 'wb') as f: pickle.dump(ctv, f, pickle.HIGHEST_PROTOCOL) with open('cate_suggest_datas.dump', 'wb') as f: pickle.dump(datas, f, pickle.HIGHEST_PROTOCOL) print(datetime.datetime.now(), ':_fin_')
    pearsonr(df_train['severe_toxic'].values, feature_values)[0],
    pearsonr(df_train['obscene'].values, feature_values)[0],
    pearsonr(df_train['threat'].values, feature_values)[0],
    pearsonr(df_train['insult'].values, feature_values)[0],
    pearsonr(df_train['identity_hate'].values, feature_values)[0]
))
exit()

#stemmer = PorterStemmer()
#my_tokenizer = lambda sentence: [stemmer.stem(t) for t in wordpunct_tokenize(sentence.lower())]
#my_tokenizer = lambda sentence: [stemmer.stem(t) for t in sentence.lower().split()]
my_tokenizer = lambda s: preprocess_string(s)

char_trigrams = TfidfVectorizer(min_df=10, max_df=0.75, strip_accents='ascii', analyzer='char',
                                ngram_range=(3, 3), sublinear_tf=True)
char_trigrams.fit(list(X_train.values) + list(X_test.values))

# TODO min_df=3 ?!? (Helps score but it seems wrong)
#word_vect = TfidfVectorizer(min_df=50, max_df=0.75, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', sublinear_tf=True, stop_words='english', ngram_range=(2, 2))
word_ngrams = TfidfVectorizer(min_df=3, max_df=0.75, strip_accents='unicode', analyzer='word',
                              tokenizer=my_tokenizer, sublinear_tf=True, stop_words='english',
                              ngram_range=(1, 2))
word_ngrams.fit(list(X_train.values) + list(X_test.values))

#word_unigrams = TfidfVectorizer(min_df=50, max_df=0.75, strip_accents='unicode', analyzer='word', tokenizer=my_tokenizer, sublinear_tf=True, stop_words='english')
#word_unigrams = TfidfVectorizer(min_df=50, max_df=0.75, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', sublinear_tf=True, stop_words='english')
#word_unigrams.fit(list(X_train.values) + list(X_test.values))

# TODO Can we do something with PCA here ?
#pca = PCA()

def calc_feature_sparse(data, feature_function):
    return csr_matrix(np.reshape(data.map(feature_function).values, (data.shape[0], 1)))
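# On the PCA TODO above: sklearn's PCA requires dense input, so it cannot be applied
# directly to these sparse TF-IDF matrices; TruncatedSVD is the usual sparse-friendly
# substitute. A minimal sketch (the component count is an assumption):
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, random_state=0)
word_ngrams_svd = svd.fit_transform(word_ngrams.transform(X_train.values))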
    Corpus.loc[index, 'text_final'] = str(Final_words)

#print(Corpus['text_final'].head())

# Step - 2: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'], Corpus['label'], test_size=0.3)

# Step - 3: Label encode the target variable - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

# Step - 4: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in a document is in comparison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Step - 5: Now we can run different algorithms to classify our data and check for accuracy
# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

train = pd.read_csv('data/raw/train.csv.zip').fillna(' ')
test = pd.read_csv('data/raw/test.csv.zip').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])
train_text_flagged = train[train.iloc[:, 2:].sum(axis=1) > 0]['comment_text']

flagged_word_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                          strip_accents='unicode',
                                          analyzer='word',
                                          token_pattern=r'\w{1,}',
                                          stop_words='english',
                                          ngram_range=(1, 2),
                                          max_features=20000)
flagged_word_vectorizer.fit(train_text_flagged)
print('Finished fitting flagged_word_vectorizer')

train_flagged_features = flagged_word_vectorizer.transform(train_text)
test_flagged_features = flagged_word_vectorizer.transform(test_text)
print('Finished transforming flagged_word_vectorizer')

pickle.dump(train_flagged_features, open('src/data/train_flagged_features.sav', 'wb'))
pickle.dump(test_flagged_features, open('src/data/test_flagged_features.sav', 'wb'))
# Split data into train and validation and create TF-IDF vectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

train_x, valid_x, train_y, valid_y = \
    train_test_split(df['clean_text'], df['product'],
                     test_size=0.2, random_state=42)

encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
# Regex: r'\w{1,}' matches one or more word characters (letters, digits, or underscore)
tfidf_vect.fit(df['clean_text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# Logistic regression
from sklearn.linear_model import LogisticRegression
# (the bare call below is a no-op that just displays the default parameters)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                   penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
                   verbose=0, warm_start=False)
model = LogisticRegression().fit(xtrain_tfidf, train_y)

# Compute model accuracy and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(classification_report(valid_y, model.predict(xvalid_tfidf), \
    trainDF['tweets'], trainDF['labels'], train_size=0.8, test_size=0.2,
    stratify=trainDF['labels'])

# encode test_y and train_y
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=280)
tfidf_vect.fit(trainDF['tweets'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xtest_tfidf = tfidf_vect.transform(test_x)

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['tweets'])

# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xtest_count = count_vect.transform(test_x)

# retrieve the code of the 'personal' class
codeSubj = 0
for i, item in enumerate(encoder.classes_):
    if item == 'personal':
        codeSubj = i
t_start = time.time()

"""=====================================================================================================================
1 Data preprocessing
"""
df_train = pd.read_csv('../data/train_set.csv')
df_train.drop(columns='article', inplace=True)
df_test = pd.read_csv('../data/test_set.csv')
df_test.drop(columns='article', inplace=True)
df_all = pd.concat(objs=[df_train, df_test], axis=0, sort=True)
y_train = (df_train['class'] - 1).values

"""=====================================================================================================================
2 Feature engineering
"""
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, max_features=1000)
vectorizer.fit(df_all['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])

"""=====================================================================================================================
3 Save locally
"""
data = (x_train, y_train, x_test)
fp = open('./data_tfidf_1000.pkl', 'wb')
pickle.dump(data, fp)
fp.close()

t_end = time.time()
print("Raw data digitized into tfidf features; total time: {} min".format((t_end - t_start) / 60))
class CustomTextVectorizer():
    def __init__(self, **args):
        self.data = args['data']
        self.labels = args.get('labels')
        self.stop_words = args.get('stop_words')
        self.fn = args.get('fn')
        self.vecs = None
        if self.fn is None:
            self.fn = 'training_vecs.csv'
        # best so far:
        #   stop_words = self.stop_words,
        #   ngram_range = (2, 3),
        #   max_df = .2,
        #   max_features = 16000
        # after normalization & new stop words:
        #   stop_words = self.stop_words,
        #   ngram_range = (2, 3),
        #   max_df = .7,
        #   max_features = 128000
        # ~.5 precision & recall on test data
        # works with culling:
        #   analyzer = 'char',
        #   stop_words = self.stop_words,
        #   ngram_range = (3, 4),
        #   max_df = .7,
        #   max_features = 6000
        # 48k features best on train set.
        if args.get('vectorizer') is not None:
            self.vectorizer = args.get('vectorizer')
        else:
            self.vectorizer = TfidfVectorizer(stop_words=self.stop_words,
                                              ngram_range=(2, 3),
                                              max_df=.9,
                                              max_features=16000)
        self.vectorizer.fit(self.data['text'])
        print(len(self.vectorizer.vocabulary_.items()))

    def vectorize(self, data):
        self.vecs = self.vectorizer.transform(data)
        return self.vectorizer

    def write(self):
        with open(os.path.abspath(os.path.dirname(__file__) + './' + self.fn), 'w') as outf:
            outf.write('id,' + ','.join(
                ['f' + str(i) for i in range(len(self.vecs.toarray()[0]))]) + ',label\n')
            for index, itm in enumerate(self.vecs.toarray()):
                current_row = str(self.data['id'][index]) + ',' + ','.join(
                    list(str(f) for f in itm))
                if self.labels is not None:
                    current_row += ',' + str(int(self.labels[index]))
                outf.write(current_row + '\n')
        outf.close()  # redundant: the with-block already closed the file

    def dump(self):
        return self.vecs
import matplotlib.pyplot as plt
import numpy as np

# 3k3.json is the source file
f = open("3k3.json", "r")
content = f.readlines()
f.close()

# get text from tweets
l = []
for i in range(len(content)):
    k = content[i].split(",")
    l.append(k[3][7:])

vectorizer = TfidfVectorizer()
vectorizer.fit(l)
vector = vectorizer.transform([l[0]])

# reshape the idf vector to 2d (dropping one entry if the length is odd)
if len(vectorizer.idf_) % 2 == 1:
    a = vectorizer.idf_[:-1]
else:
    a = vectorizer.idf_
c = a.reshape((int(len(a) / 2), 2))

kmeans = KMeans(n_clusters=3)
kmeans.fit(c)
r = kmeans.cluster_centers_
plt.scatter(c[:, 0], c[:, 1], s=50, c='b')
plt.scatter(r[0][0], r[0][1], s=200, c='g', marker='s')
plt.scatter(r[1][0], r[1][1], s=200, c='r', marker='s')
# imports this snippet relies on
import json
import time
from collections import defaultdict

import numpy as np
import scipy as sp
import scipy.spatial
from sklearn.feature_extraction.text import TfidfVectorizer


def namestring_match(external_names, internal_names, THRESH_1, THRESH_2,
                     doBestMatch=False, loadPriorRoughmatch=False,
                     save_path='matching__temp.json'):
    # get feature vectors for vocab_subset + sequela
    if not loadPriorRoughmatch:
        start_time = time.time()
        # note that authors with many publications will have bigger idf components
        vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
        vectorizer.fit(internal_names)

        rough_edgelist = []  # (external name, internal name)
        curPosition = 0
        blockSize = 50000
        N_vocab = len(internal_names)

        # estimated 30 min for entire list
        while curPosition < N_vocab:
            print("cur block: {} : {}".format(curPosition, curPosition + blockSize))
            # get next block of embedding words to match
            if (curPosition + blockSize) < N_vocab:
                # loop over vocab by subsetting in blocks
                vocab_subset = internal_names[curPosition:(curPosition + blockSize)]
            else:
                vocab_subset = internal_names[curPosition:]  # avoid idx overflow
            combined_words = external_names + vocab_subset

            # create feature vector for this block; rows are L2 normalized
            tf_idf_matrix = vectorizer.transform(combined_words).toarray()
            external_features = tf_idf_matrix[:len(external_names), :]  # first portion of concatenated list
            internal_features = tf_idf_matrix[len(external_names):, :]  # second portion of concatenated list

            # match external strings to internal strings
            similarity_matrix = np.matmul(external_features, internal_features.T)

            # first-pass cutoff
            # similarity_scores = similarity_matrix.flatten()
            # cutoff = np.percentile(similarity_scores, P_THRESH_1)  # consider iterative filtering
            cutoff = THRESH_1
            print("cutoff: {}".format(cutoff))

            for i_row, ext_name in enumerate(external_names):  # for each disease in the sequelae list
                internal_idxs = np.where(similarity_matrix[i_row, :] > cutoff)[0]
                for i in internal_idxs:
                    int_name = vocab_subset[i]
                    rough_edgelist.append((ext_name, int_name))
                    if PRINT_VERBOSE:  # just print a few of these to get a flavor
                        print(ext_name, ':', int_name)

            loopTime = time.time()
            print("elapsed: {}".format(loopTime - start_time))
            curPosition += blockSize

        # save and export rough matches since this step takes ~30 min
        save_obj = {
            'rough_edgelist': rough_edgelist,
            'external_names': external_names,
            'internal_names': internal_names
        }
        with open(save_path, 'w') as f:
            json.dump(save_obj, f)

        N_roughmatches = len(rough_edgelist)  # expected ~1,500,000
        # if this is big, another pass will be necessary
        print("N rough matches: {}".format(N_roughmatches))
    else:
        with open(save_path, 'r') as f:
            data = json.load(f)
        rough_edgelist = data['rough_edgelist']
        external_names = data['external_names']
        internal_names = data['internal_names']

    #################
    # 2nd filtering step - 2-grams
    print(rough_edgelist[:200])

    # filter as a second pass
    vectorizer_2 = TfidfVectorizer(min_df=1, analyzer=ngrams_finetooth)
    secondpass_words = external_names + [e[1] for e in rough_edgelist]

    # build data structure for fast indexing
    d_externalIdxs, d_internalIdxs = {}, {}
    L_ext = len(external_names)
    for i, w in enumerate(secondpass_words[:L_ext]):
        d_externalIdxs[w] = i
    for i, w in enumerate(secondpass_words[L_ext:]):
        d_internalIdxs[w] = L_ext + i

    tf_idf_matrix2 = vectorizer_2.fit_transform(secondpass_words).toarray()  # rows are L2 normalized

    scores = []
    for i_edge, edge in enumerate(rough_edgelist):
        if not (i_edge % 100000):
            print("edge number {}".format(i_edge))
        ext_name = edge[0]
        int_name = edge[1]
        s_idx = d_externalIdxs[ext_name]
        t_idx = d_internalIdxs[int_name]
        v_s = tf_idf_matrix2[s_idx, :]
        v_t = tf_idf_matrix2[t_idx, :]
        similarity_score = 1 - sp.spatial.distance.cosine(v_s, v_t)
        scores.append(similarity_score)
    scores = np.nan_to_num(scores)  # catch the ' ' and '3' that slipped through

    # second-pass cutoff
    # cutoff = np.percentile(scores, P_THRESH_2)  # consider iterative filtering
    cutoff = THRESH_2
    print("cutoff: {}".format(cutoff))

    edgelist = []
    # d_matches := external (key) -> internal (value)
    d_matches = defaultdict(list)  # convenience, alternative representation
    for idx, edge in enumerate(rough_edgelist):
        if scores[idx] > cutoff:
            # cast to float so json.dump can serialize the numpy scalar
            new_edge = (edge[0], edge[1], float(scores[idx]))
            edgelist.append(new_edge)
            d_matches[edge[0]].append((edge[1], float(scores[idx])))
            print(new_edge)

    if doBestMatch:  # match external name to single best internal name
        edgelist_bestmatch = []
        for key in d_matches:
            # renamed from `scores` to avoid shadowing the array above
            match_scores = [t[1] for t in d_matches[key]]
            tokens = [t[0] for t in d_matches[key]]
            max_score_idx = np.argmax(match_scores)
            best_token = tokens[max_score_idx]
            edgelist_bestmatch.append((key, best_token, match_scores[max_score_idx]))
        # todo: should probably leave the punctuation in place when matching in prior steps -
        # currently it is reg-exp'ed out before finding n-grams
        edgelist = edgelist_bestmatch

    save_obj = {
        'rough_edgelist': rough_edgelist,
        'external_names': external_names,
        'internal_names': internal_names,
        'edgelist': edgelist
    }
    with open(save_path, 'w') as f:
        json.dump(save_obj, f)

    return edgelist
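The analyzers passed to the two vectorizers (ngrams and ngrams_finetooth) and the PRINT_VERBOSE flag are not defined in this excerpt. A minimal sketch of what they might look like, assuming character n-gram analyzers with punctuation stripped (consistent with the "reg-exp'ing it out" todo above); the n values here are illustrative guesses, with the "2-grams" comment suggesting shorter n-grams for the fine pass:

import re

def ngrams(string, n=3):
    # hypothetical rough-pass analyzer: lowercase, strip punctuation, emit character n-grams
    string = re.sub(r'[,-./]', '', string.lower())
    return [string[i:i + n] for i in range(len(string) - n + 1)]

def ngrams_finetooth(string, n=2):
    # hypothetical fine-pass analyzer; shorter n-grams per the "2-grams" comment above
    string = re.sub(r'[,-./]', '', string.lower())
    return [string[i:i + n] for i in range(len(string) - n + 1)]

PRINT_VERBOSE = False  # module-level flag assumed by namestring_match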
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2)

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['tweet'])

# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                             max_features=20000)
tfidf_vect.fit(trainDF['tweet'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                   ngram_range=(2, 3), max_features=20000)
tfidf_vect_ngram.fit(trainDF['tweet'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf (the original excerpt is truncated mid-call here; the
# remaining arguments and fit/transform calls are assumed to follow the same
# pattern as the other vectorizers)
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}',
                                         ngram_range=(2, 3), max_features=20000)
tfidf_vect_ngram_chars.fit(trainDF['tweet'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)
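With the feature matrices built, a quick way to compare the representations is to fit the same classifier on each and score it on the held-out split. A minimal sketch on the word-level TF-IDF features; the choice of LogisticRegression here is illustrative, not from the original excerpt:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(max_iter=1000)
clf.fit(xtrain_tfidf, train_y)
# accuracy on the 20% validation split created above
print("word-level TF-IDF accuracy:", accuracy_score(valid_y, clf.predict(xvalid_tfidf)))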
def get_accuracy2():
    labels, texts = [], []
    # read the bad- and good-word dataset files
    with open("Dataset//BadWords.txt") as fp:
        data = fp.readlines()
    for abc in data:
        labels.append("0")
        texts.append(abc)
    with open("Dataset//Goodwords.txt") as fp:
        data = fp.readlines()
    for abc in data:
        labels.append("1")
        texts.append(abc)

    trainDF = pandas.DataFrame()
    trainDF['text'] = texts
    trainDF['label'] = labels

    # split the dataset into training and validation datasets
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
        trainDF['text'], trainDF['label'])

    # label encode the target variable
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)  # reuse the encoder fitted on train_y

    # create a count vectorizer object
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    count_vect.fit(trainDF['text'])

    # transform the training and validation data using count vectorizer object
    xtrain_count = count_vect.transform(train_x)
    xvalid_count = count_vect.transform(valid_x)

    # word level tf-idf
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                 max_features=5000)
    tfidf_vect.fit(trainDF['text'])
    xtrain_tfidf = tfidf_vect.transform(train_x)
    xvalid_tfidf = tfidf_vect.transform(valid_x)

    # ngram level tf-idf
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                       ngram_range=(2, 3), max_features=5000)
    tfidf_vect_ngram.fit(trainDF['text'])
    xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
    xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}',
                                             ngram_range=(2, 3), max_features=5000)
    tfidf_vect_ngram_chars.fit(trainDF['text'])
    xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
    xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

    # Logistic Regression on Count Vectors
    # (the original comments said "Naive Bayes", but the code fits LogisticRegression,
    # matching the "LC" labels in the result string)
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_count, train_y, xvalid_count)
    stri = "LC, Count Vectors: " + str(accuracy) + " + "

    # Logistic Regression on Word Level TF-IDF Vectors
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_tfidf, train_y, xvalid_tfidf)
    stri = stri + "LC, WordLevel TF-IDF: " + str(accuracy) + " + "

    # Logistic Regression on Ngram Level TF-IDF Vectors
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    stri = stri + "LC, N-Gram Vectors: " + str(accuracy) + " + "

    # Logistic Regression on Character Level TF-IDF Vectors
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
    stri = stri + "LC, CharLevel Vectors: " + str(accuracy) + " + "

    return stri
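The train_model helper called above is not defined in this excerpt. Judging from the argument order at the call sites (validation labels, classifier, training features, training labels, validation features), a minimal sketch might be:

from sklearn.metrics import accuracy_score

def train_model(valid_y, classifier, feature_vector_train, label, feature_vector_valid):
    # fit the classifier on the training features, then score on the validation features
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return accuracy_score(valid_y, predictions)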
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_train)

# transform documents to document-term matrix
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

# preprocessing: binarize the label sets for multilabel classification
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y_train)
Y = multilabel_binarizer.transform(y_train)

tfidf_vect = TfidfVectorizer(analyzer='word', max_features=90000)
tfidf_vect.fit(X_train)  # learn vocabulary and idf from training set
X_data_tfidf = tfidf_vect.transform(X_train)
# assume there is no prior test set
X_test_tfidf = tfidf_vect.transform(X_test)

# Training a classifier

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

def train_model(classifier, X_train, y_train, X_test):
    # fit on the training features and return predictions for the test features
    # (the original excerpt is truncated after the fit call; the return below is assumed)
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)
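Because the targets went through MultiLabelBinarizer, the single-label classifiers imported above need a one-vs-rest wrapper to handle the binarized label matrix. A minimal usage sketch; the OneVsRestClassifier wrapper is an assumption, not part of the original excerpt:

from sklearn.multiclass import OneVsRestClassifier

# one binary LogisticRegression per label column of Y
ovr_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
y_pred = train_model(ovr_clf, X_data_tfidf, Y, X_test_tfidf)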
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from operator import itemgetter

corpus = ['This is the first document kaa',
          'This is the second second document.',
          'And the third one.',
          'Is this the first?']

vectorizer = TfidfVectorizer(max_df=0.5, max_features=40)
# print(vectorizer)
vectorizer.fit(corpus)
vocab = vectorizer.vocabulary_
X_vocab = sorted(vocab.items(), key=itemgetter(1))
# print(vocab)
# print(X_vocab)

baseline_vectorizer = CountVectorizer(vocabulary=vocab)
X_base = baseline_vectorizer.fit_transform(corpus).toarray()
# print(baseline_vectorizer)
print(X_base)

train = []
for i in range(X_base.shape[0]):
    user_rating = X_base[i].nonzero()[0]
    print(user_rating)
    train.append((i, user_rating))
print(train)

# for j in range(4):
#     item_rating = X_base.tocsc().T[j].nonzero()[1]
#     print(item_rating)  # [2] [0 1] [0 3] [0]
#     train.append((item_rating[0], j))
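The commented-out item loop above would fail as written: X_base was densified with .toarray(), and a NumPy ndarray has no .tocsc() method. A minimal working sketch of the item-side pass operates on the columns of the dense matrix directly:

# iterate items (columns): which documents contain each vocabulary term
for j in range(X_base.shape[1]):
    item_rating = X_base[:, j].nonzero()[0]  # row indices of documents with a nonzero count
    print(item_rating)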
x_train = df['sentence']
y_train = df['label']
# x_train, x_test, y_train, y_test = train_test_split(df['sentence'], df['label'],
#                                                     test_size=0.01, random_state=42)

# Feature extraction: word-level and character-level TF-IDF, stacked side by side
tfidf_vectorizer1 = TfidfVectorizer(analyzer='word', stop_words=None,
                                    ngram_range=(1, 3), min_df=1,
                                    max_features=100000)
tfidf_vectorizer2 = TfidfVectorizer(analyzer='char', stop_words=None,
                                    ngram_range=(1, 4), max_features=50000)
tfidf_vectorizer1.fit(df['sentence'].values)
tfidf_vectorizer2.fit(df['sentence'].values)
vec1 = tfidf_vectorizer1.transform(x_train)
vec2 = tfidf_vectorizer2.transform(x_train)
x_train = hstack([vec1, vec2])  # requires: from scipy.sparse import hstack

# Construct model and train
svm = LinearSVC(verbose=True)
print("Start training...")
svm.fit(x_train, y_train)
# y_pred_test = svm.predict(x_test)

# Prepare prediction data
print(df_test['sentence'].shape)
vec1 = tfidf_vectorizer1.transform(df_test['sentence'])
vec2 = tfidf_vectorizer2.transform(df_test['sentence'])
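The excerpt stops after transforming the test sentences. A sketch of the natural next step, stacking the two test feature blocks in the same word + char layout used for training and predicting with the fitted model (variable names follow the snippet):

from scipy.sparse import hstack

x_test = hstack([vec1, vec2])  # must match the column layout of the training features
y_pred = svm.predict(x_test)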