def train():
    pro_dir = Path(__file__).absolute().parent.parent
    train_file = pro_dir / 'input' / 'processed' / 'train_second.csv'
    test_file = pro_dir / 'input' / 'processed' / 'predict_second.csv'
    test_a_file = pro_dir / 'input' / 'processed' / 'predict_first.csv'
    columns = ['jieba']
    stop_words = get_stop_words()

    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    test_a = pd.read_csv(test_a_file)

    corpus = []
    for df in [train, test, test_a]:
        for col in columns:
            ser = df[col]
            ser = ser[ser.isnull() == False]  # some samples contain NaN
            v = ser.apply(
                lambda x: [w for w in x.split(" ") if w not in stop_words]).values.tolist()
            corpus.extend(v)

    # note: `size` and `iter` are the gensim < 4.0 parameter names
    # (renamed to `vector_size` and `epochs` in gensim 4.x)
    model = Word2Vec(corpus,
                     size=FLAGS.hidden_dim,
                     window=FLAGS.window,
                     min_count=len(columns) * FLAGS.min_count,
                     sg=1,
                     iter=FLAGS.iter,
                     workers=multiprocessing.cpu_count())

    vector_file = pro_dir / 'input' / 'word2vec' / "my_w2v_{dim}_{iter}_{wd}.txt".format(
        dim=FLAGS.hidden_dim, iter=FLAGS.iter, wd=FLAGS.window)
    vector_file.parent.mkdir(mode=0o755, exist_ok=True)
    model.wv.save_word2vec_format(str(vector_file), binary=False)
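# Note: train() above relies on a module-level FLAGS object (hidden_dim, window,
# min_count, iter) plus pandas, pathlib.Path, gensim's Word2Vec, multiprocessing
# and get_stop_words(), all imported elsewhere in the original module. A minimal
# argparse-based stand-in for FLAGS (hypothetical default values, not the
# project's actual settings) could look like this:
import argparse

_flags_parser = argparse.ArgumentParser()
_flags_parser.add_argument('--hidden_dim', type=int, default=300)
_flags_parser.add_argument('--window', type=int, default=5)
_flags_parser.add_argument('--min_count', type=int, default=5)
_flags_parser.add_argument('--iter', type=int, default=10)
FLAGS, _ = _flags_parser.parse_known_args()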
def remove_stop_words(words):
    stop_words = get_stop_words()
    final_words = []
    for word in words:
        if len(word) > 1 and word not in stop_words:
            final_words.append(word)
    return final_words
def get_tf_vectorizer_data(posts):
    tf_vectorizer = utils.get_model(os.path.join(ROOT, "outputs", "tf.pkl"))
    if tf_vectorizer is None:
        tf_vectorizer = CountVectorizer(max_df=0.6, min_df=0.01,
                                        stop_words=utils.get_stop_words())
        tf_vectorizer.fit(posts)
        utils.save_model(tf_vectorizer, os.path.join(ROOT, 'outputs', 'tf.pkl'))
    return tf_vectorizer.transform(posts)
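# Illustrative usage of get_tf_vectorizer_data (hypothetical data): the first
# call fits a CountVectorizer and caches it under outputs/tf.pkl via
# utils.save_model; later calls reuse the cached model and only transform.
# example_posts = ["first post about music", "another post about something else"]
# tf_matrix = get_tf_vectorizer_data(example_posts)   # scipy sparse matrix
# print(tf_matrix.shape)                              # (n_posts, n_vocabulary_terms)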
def create_word_cloud(no_topics, lda, feature_names):
    for i in range(0, no_topics):
        d = dict(zip(utils.traverse(feature_names), lda.components_[i]))
        wc = wordcloud.WordCloud(background_color='white', max_words=50,
                                 stopwords=utils.get_stop_words())
        image = wc.generate_from_frequencies(d)
        # build the output path with pathlib; concatenating a Path and a raw
        # Windows-style string with `+` raises a TypeError
        image.to_file(str(WHERE_OUTPUTS / 'outputs' / 'Topic{}.png'.format(i + 1)))
        plt.figure()
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")
        plt.show()
def extract_tf_idf(df):
    posts = df['text'].tolist()
    tf_idf_model = utils.get_model(os.path.join(ROOT, "outputs", "tfidf.pkl"))
    if tf_idf_model is None:
        tf_idf_model = TfidfVectorizer(stop_words=utils.get_stop_words(),
                                       ngram_range=(1, 2))
        tf_idf_model.fit(posts)
        utils.save_model(tf_idf_model, os.path.join(ROOT, 'outputs', 'tfidf.pkl'))
    tf_idf_matrix = tf_idf_model.transform(posts)
    tf_idf_dataframe = pd.DataFrame(columns=['id', 'tfidf'])
    tf_idf_dataframe['id'] = df['id'].tolist()
    tf_idf_dataframe['tfidf'] = helpers.reduce_damnation(tf_idf_matrix)
    return tf_idf_dataframe
def display(label, parser, summarizer):
    label.delete(1.0, END)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # used to count the words in a summary
    global displayCount
    counter = 0
    displayCount += 1
    '''for item in summarizer(parser.document, SENTENCES_COUNT):
        for word in parser.tokenize_words(item):
            counter += 1
    print(" wordcount ", counter)
    print("--------------------")
    if displayCount % 3 == 0:
        print("--------ARTICLE ", runCount + 26, "------------")
    # END SUMMARY COUNT CODE'''

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        label.insert(END, sentence)
def get_meaningful_words_tf_idf_difference(df):
    path = ROOT + '/outputs/MeaningfulWords.pkl'
    path_object = pathlib.Path(path)
    if path_object.exists():
        return pd.read_pickle(path)
    df_neg = utils.get_abusive_df(df)
    df_pos = utils.get_no_abusive_df(df)
    posts = [' '.join(df_neg['text'].tolist()), ' '.join(df_pos['text'].tolist())]
    tfidf = utils.get_model(os.path.join(ROOT, "outputs", "tfidf.pkl"))
    if tfidf is None:
        tfidf = TfidfVectorizer(stop_words=utils.get_stop_words(), ngram_range=(1, 2))
        tfidf.fit(posts)
        utils.save_model(tfidf, os.path.join(ROOT, 'outputs', 'tfidf.pkl'))
    x = tfidf.transform(posts)
    x = x[0, :] - x[1, :]
    df_tf_idf = pd.DataFrame(x.toarray(), columns=tfidf.get_feature_names())
    df_tf_idf = df_tf_idf.sort_values(by=0, axis=1, ascending=False)
    df_tf_idf.to_pickle(path)
    return df_tf_idf
#!/usr/bin/env python
# coding: utf8
import os
import re

from tqdm import tqdm
import pandas as pd
# Chinese word segmentation tool
import jieba

from utils import get_stop_words

stop_words = get_stop_words()
fill_value = "CSFxe"
# user_dict = './yan_word.txt'


def clean_str(stri):
    stri = re.sub(r'[a-zA-Z0-9]+', '', stri)
    if stri == '':
        return fill_value
    return stri.strip()


def _filter_stop_words(word_list):
    _filter_words = [w for w in word_list if w not in stop_words and len(w) > 0]
    x = " ".join(_filter_words)
    return x


data_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "input")
from nlp.stemmers import Stemmer
from utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10


if __name__ == "__main__":
    # url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    url = "https://www.npr.org/2018/10/21/658921379/futuristic-dreams-turn-to-nightmare-in-electric-state"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = lexSum(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print('')
    print('')

    summarizer = luhnSum(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print('')
    print('')
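# HtmlParser, Tokenizer, lexSum and luhnSum are used above but imported elsewhere
# in the original module. The script closely mirrors the usage example from the
# sumy library; with upstream sumy the corresponding imports would be the lines
# below (an assumption -- the project may instead expose these from its own
# local parsers/summarizers packages, as it does for Stemmer):
# from sumy.parsers.html import HtmlParser
# from sumy.parsers.plaintext import PlaintextParser
# from sumy.nlp.tokenizers import Tokenizer
# from sumy.summarizers.lex_rank import LexRankSummarizer as lexSum
# from sumy.summarizers.luhn import LuhnSummarizer as luhnSum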
    np.save(utils.get_y_train_path(save_directory), y_train)
    np.save(utils.get_y_test_path(save_directory), y_test)
    return embeddings


if __name__ == '__main__':
    input_train = pnd.read_csv(params.INPUT_TRAIN_FILENAME, sep=';')
    # input_test = pnd.read_csv(params.INPUT_TEST_FILENAME, sep=';')
    y = pnd.read_csv(utils.get_labels_path(), sep=';')

    drug_names_path = utils.get_drug_names_path()
    drug_names_df = pnd.read_csv(drug_names_path)
    drug_names_set = set(drug_names_df[params.DRUG_NAME_COL])

    stop_words = utils.compute_stop_words(input_train.question, max_df=STOP_WORDS_TFIDF_MAX_DF) if COMPUTE_STOP_WORDS \
        else utils.get_stop_words(STOP_WORDS_FILEPATH)
    if COMPUTE_STOP_WORDS:
        print("stop words: %s" % ', '.join(stop_words))

    fast_text_embedding = FastTextEmbedding(input_train.question,
                                            y.intention,
                                            drug_description_embedding=False,
                                            drug_names_set=drug_names_set,
                                            stop_words=stop_words,
                                            model_path=MODEL_PATH,
                                            do_correction=True,
                                            verbose=True)
    utils.create_dir(EMBEDDING_DIRPATH)
    fast_text_embedding.run(save_directory=EMBEDDING_DIRPATH)
def get_chi_wordlist(datas, num, key):
    # file_name_all = '/home/ren/law_crawler/data_law_tf/data_law_tf_all.pkl'
    # in_file = open(file_name_all, 'rb')
    # datas = pickle.load(in_file)
    # in_file.close()
    # a = ['我的', '的', '我的']
    # print(a.count('我'))
    # print('---')
    # datas = [{'ob_content_seg': ['我的', '我的', '阿噗'], 'ob_label': 1},
    #          {'ob_content_seg': ['我的', '我的', '阿噗'], 'ob_label': 0},
    #          {'ob_content_seg': ['我', '我他', '阿噗大'], 'ob_label': 1},
    #          {'ob_content_seg': ['我的', '我的哦哦', '阿大噗'], 'ob_label': 0}]
    word_dict = {}
    # label_num = {label1: num1, label2: num2, ...}
    label_num = {}
    for i in range(len(datas)):
        data = datas[i]
        if not label_num.get(data['ob_label']):
            label_num[data['ob_label']] = 1
        else:
            label_num[data['ob_label']] += 1
        ob_content_seg = data[key]
        if ob_content_seg:
            for word in set(ob_content_seg):
                if word.strip() != '':
                    if not word_dict.get(word.strip()):
                        word_dict[word.strip()] = {}
                        word_dict[word.strip()][data['ob_label']] = 1
                    else:
                        if not word_dict[word.strip()].get(data['ob_label']):
                            word_dict[word.strip()][data['ob_label']] = 1
                        else:
                            word_dict[word.strip()][data['ob_label']] += 1
    # for a in word_dict:
    #     print(a, word_dict[a])
    print(len(word_dict))
    print(label_num)
    label_list = list(label_num.keys())
    print(label_list)
    # word_dict = {word1: {label1: , label2: , ..., 'chi': {label1: , label2: , ...}}, word2: {}, ...}
    for word in word_dict:
        for label in label_list:
            if not word_dict[word].get(label):
                word_dict[word][label] = 0
        # compute the CHI score of the word for each label
        word_dict[word]['chi'] = {}
        for label in label_list:
            a = word_dict[word][label]
            b = 0
            for i in label_list:
                if i != label:
                    b += word_dict[word][i]
            c = label_num[label] - a
            tmp = 0
            for i in label_num:
                if i != label:
                    tmp += label_num[i]
            d = tmp - b
            word_dict[word]['chi'][label] = float((a * c - b * d)**2) / float(
                (a + b) * (c + d))
    word_list = list(word_dict.items())
    # drop stop words and single-character words
    stop_words_list = utils.get_stop_words()
    i = 0
    while i < len(word_list):
        if len(word_list[i][0]) < 2 or word_list[i][0] in stop_words_list:
            del word_list[i]
        else:
            i += 1
    sort_lists = []
    for label in label_list:
        lst = sorted(word_list, key=lambda x: x[1]['chi'][label], reverse=True)
        words = [word[0] for word in lst]
        sort_lists.extend(words[:num])
    sort_lists = set(sort_lists)
    return list(sort_lists)
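# Illustrative call on toy data, reusing the commented-out test records inside
# get_chi_wordlist: keep the `num` highest-CHI words per label after dropping
# stop words and single-character tokens (values here are only examples).
# toy_datas = [{'ob_content_seg': ['我的', '我的', '阿噗'], 'ob_label': 1},
#              {'ob_content_seg': ['我', '我他', '阿噗大'], 'ob_label': 0}]
# keywords = get_chi_wordlist(toy_datas, num=2, key='ob_content_seg')
# print(keywords)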
def main(*args):
    # load stop words
    stop_words = get_stop_words()

    plot = const.PLOT_DEFAULT
    print_ = const.PRINT_DEFAULT
    max_features = None
    random_state = const.RANDOM_STATE_DEFAULT
    order = -1  # default descending order
    wordcloud_n = None
    wordcloud_ = False
    cos_sim = False
    even_distrib = const.EVEN_DISTRIB_DEFAULT
    plt.rcParams.update({'font.size': const.FONT_SIZE_DEFAULT})
    pre_vec = False
    limit_size = False
    min_df = 1
    max_df = 1.0
    param_compare = False

    # parse command line arguments
    for arg in args:
        k = arg.split("=")[0]
        v = arg.split("=")[1]
        if k == 'plot':
            plot = utils.str_to_bool(v)
        elif k == 'print':
            print_ = utils.str_to_bool(v)
        elif k == 'max_features':
            max_features = int(v)
        elif k == 'stop_words':
            if utils.str_to_bool(v) == False:
                stop_words = None
        elif k == 'random_state':
            random_state = int(v)
        elif k == 'order':
            order = int(v)
        elif k == 'wordcloud':
            wordcloud_ = utils.str_to_bool(v)
        elif k == 'wordcloud_n':
            wordcloud_n = int(v)
        elif k == 'cos_sim':
            cos_sim = utils.str_to_bool(v)
        elif k == 'font_size':
            plt.rcParams.update({'font.size': int(v)})
        elif k == 'even_distrib':
            even_distrib = utils.str_to_bool(v)
        elif k == 'pre_vec':
            pre_vec = utils.str_to_bool(v)
        elif k == 'limit_size':
            limit_size = utils.str_to_bool(v)
        elif k == 'min_df':
            min_df = int(v)
        elif k == 'max_df':
            max_df = float(v)
            if max_df > 1:
                max_df = int(max_df)
        elif k == 'param_compare':
            param_compare = utils.str_to_bool(v)
        else:
            print("Unknown param: {}".format(k))

    if print_:
        print()
        print("-- Analysis config --")
        print("even_distrib: {}".format(even_distrib))
        print("stop_words: {}".format(stop_words != None))
        print("max_features: {}".format(max_features))
        print("random_state: {}".format(random_state))
        print("wordcloud: {}".format(wordcloud_))
        print("wordcloud_n: {}".format(wordcloud_n))
        print("order: {}".format(order))
        print("cos_sim: {}".format(cos_sim))
        print("param_compare: {}".format(param_compare))
        print("pre_vec: {}".format(pre_vec))
        print("limit_size: {}".format(limit_size))
        print("min_df: {}".format(min_df))
        print("max_df: {}".format(max_df))
        print("plot: {}".format(plot))
        print("--------------------")
        print()

    gen_spotify_df = pd.read_csv(const.GEN_SPOTIFY)
    clean_spotify_df = pd.read_csv(const.CLEAN_SPOTIFY)
    if even_distrib == False:
        clean_spotify_df = pd.read_csv(const.CLEAN_UNEVEN_SPOTIFY)
    gen_deezer_df = pd.read_csv(const.GEN_DEEZER)
    clean_deezer_df = pd.read_csv(const.CLEAN_DEEZER)
    if even_distrib == False:
        clean_deezer_df = pd.read_csv(const.CLEAN_UNEVEN_DEEZER)
    datasets = [
        (const.SPOTIFY, clean_spotify_df),
        (const.DEEZER, clean_deezer_df),
    ]

    vectorizer = CountVectorizer(
        stop_words=stop_words,
        ngram_range=(1, 1),
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        binary=True,
    )

    # word clouds
    if wordcloud_:
        top_n = gen_word_cloud_grid(
            const.SPOTIFY,
            clean_spotify_df,
            vectorizer=vectorizer,
            n=wordcloud_n,
            order=order,
            random_state=random_state,
            print_=print_
        )
        spotify_shared, spotify_unique = get_shared_words(top_n)
        top_n = gen_word_cloud_grid(
            const.DEEZER,
            clean_deezer_df,
            vectorizer=vectorizer,
            n=wordcloud_n,
            order=order,
            random_state=random_state,
            print_=print_
        )
        deezer_shared, deezer_unique = get_shared_words(top_n)
        if print_:
            print()
            print("Spotify: count shared={}".format(
                len(spotify_shared) / len(spotify_unique)))
            print("Deezer: count shared={}".format(
                len(deezer_shared) / len(deezer_unique)))
            print()

    # cosine similarity
    if cos_sim:
        for name, dataset in datasets:
            if pre_vec:
                dataset = utils.get_vectorized_df(dataset, vectorizer)
            print("{} class data similarity analysis...".format(name))
            for i in dataset.y.unique():
                class_df = utils.get_class_based_data(
                    dataset,
                    i,
                    random_state=random_state,
                    include_other_classes=True,
                    even_distrib=False,
                    limit_size=limit_size,
                    print_=True,
                )
                if pre_vec == False:
                    class_df = utils.get_vectorized_df(class_df, vectorizer)
                pos_df = utils.get_class_based_data(class_df, 1)
                pos_df.pop('y')
                ave_pos = utils.get_average_cos_sim(pos_df.values)
                neg_df = utils.get_class_based_data(class_df, -1.0)
                neg_df.pop('y')
                ave_neg = utils.get_average_cos_sim(neg_df.values)
                ave_between = utils.get_average_cos_sim(
                    pos_df.values, neg_df.values)
                print("class {}".format(i))
                print("data shape: {}".format(class_df.shape))
                print("average positive cosine similarity: {}".format(ave_pos))
                print("average negative cosine similarity: {}".format(ave_neg))
                print("average between cosine similarity: {}".format(ave_between))
                print("(pos - between) + (neg - between) percentage = {}".format(
                    (ave_pos - ave_between) / ave_pos + (ave_neg - ave_between) / ave_neg
                ))
                print()

    if param_compare:
        # min_df / max_df vs pos_sim, neg_sim, between_sim
        params_grid = {
            'min_df': [i for i in range(1, 15)],
            'max_df': np.arange(0.1, 1.0, 0.1),
        }
        for name, dataset in datasets:
            for i in dataset.y.unique():
                df = utils.get_class_based_data(
                    dataset,
                    i,
                    random_state=random_state,
                    include_other_classes=True,
                    even_distrib=False,
                    limit_size=limit_size,
                )
                for p, v in params_grid.items():
                    print("Comparing cosine similarity vs {} for {} Class {} data...".format(p, name, i))
                    vectorizer = CountVectorizer(
                        stop_words=stop_words,
                        ngram_range=(1, 1),
                        min_df=min_df,
                        max_df=max_df,
                        max_features=max_features,
                        binary=True,
                    )
                    pos_sim = []
                    neg_sim = []
                    between_sim = []
                    diff = []
                    for j in range(len(v)):
                        vectorizer.set_params(**{p: v[j]})
                        class_df = utils.get_vectorized_df(df, vectorizer)
                        pos_df = utils.get_class_based_data(class_df, 1)
                        pos_df.pop('y')
                        ave_pos = utils.get_average_cos_sim(pos_df.values)
                        neg_df = utils.get_class_based_data(class_df, -1.0)
                        neg_df.pop('y')
                        ave_neg = utils.get_average_cos_sim(neg_df.values)
                        ave_between = utils.get_average_cos_sim(
                            pos_df.values, neg_df.values)
                        pos_sim.append(ave_pos)
                        neg_sim.append(ave_neg)
                        between_sim.append(ave_between)
                        diff.append((ave_pos - ave_between) / ave_pos
                                    + (ave_neg - ave_between) / ave_neg)
                    plt.figure()
                    plt.title("{} Class {}: {} vs cosine similarity".format(name, i, p))
                    pos_sim = np.array(list(zip(v, pos_sim)))
                    neg_sim = np.array(list(zip(v, neg_sim)))
                    between_sim = np.array(list(zip(v, between_sim)))
                    diff = np.array(list(zip(v, diff)))
                    plt.plot(pos_sim[:, 0], pos_sim[:, 1], label='pos sim')
                    plt.plot(neg_sim[:, 0], neg_sim[:, 1], label='neg sim')
                    plt.plot(between_sim[:, 0], between_sim[:, 1], label='between sim')
                    plt.plot(diff[:, 0], diff[:, 1], label='sim difference (%)')
                    plt.xlabel(p)
                    plt.legend()

    # grid search eval
    if plot:
        plt.draw()
        plt.show()
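# Minimal driver sketch (not part of the original snippet): main() expects plain
# "key=value" tokens, so command-line arguments such as
# "cos_sim=True min_df=2 max_df=0.8" can be forwarded to it directly.
import sys

if __name__ == '__main__':
    main(*sys.argv[1:])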