def main():
    # Parameters
    use_stopwords = True
    character_level = False  # whether tokens are single characters instead of words
    raw_set_name = 'THUCNews'
    stopwords_name = 'cn_stopwords_punctuations.csv'
    new_set_name = 'THUCNews.txt'
    cut_set_name = 'THUCNews_jieba.txt'

    # Paths
    init_path = os.getcwd()
    data_dir = os.path.join(init_path, 'datasets')
    stopwords_path = os.path.join(data_dir, stopwords_name)
    raw_set_path = os.path.join(data_dir, raw_set_name)
    new_set_path = os.path.join(data_dir, new_set_name)
    cut_set_path = os.path.join(data_dir, cut_set_name)

    # Transforming and dumping
    x_texts, y_labels = get_texts_from_source(raw_set_path)
    pd.DataFrame({'label': y_labels, 'text': x_texts})\
        .to_csv(new_set_path, sep='\t', index=False, header=True)

    if use_stopwords:
        stopwords = pp.get_stopwords(stopwords_path)
        x_texts = pp.tokenize_texts(x_texts, stopwords, character_level)
    else:
        x_texts = pp.tokenize_texts(x_texts, character_level=character_level)
    pd.DataFrame({'label': y_labels, 'text': x_texts})\
        .to_csv(cut_set_path, sep='\t', index=False, header=True)
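# Illustrative sketch only (not the project's actual pp module): a minimal
# version of what pp.tokenize_texts appears to do above, i.e. jieba word-level
# or single-character tokenization with optional stopword filtering.
import jieba

def tokenize_texts_sketch(texts, stopwords=None, character_level=False):
    stopwords = set(stopwords or [])
    tokenized = []
    for text in texts:
        tokens = list(text) if character_level else jieba.lcut(text)
        tokens = [t for t in tokens if t.strip() and t not in stopwords]
        tokenized.append(' '.join(tokens))
    return tokenized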
def titile_tokens():
    cnt = 0
    try:
        sw_path = os.path.join(cwd, "text/news_stopwords.txt")
        sw_list = prep.get_stopwords(sw_path)
        conn = MongoClient("127.0.0.1", 27017)
        db = conn.netease
        target = db.token_war.find({})
        for i in target:
            if "title_keywords" not in i.keys() or not i['title_keywords']:
                cnt += 1
                title_keywords = []
                if len(i['keywords']) > 1:  # the article has keywords
                    title_keywords = i['title'] + '\t\t' + ' '.join(i['keywords'])
                else:
                    title_keywords = i['title']
                tokens_string = ' '.join(
                    prep.tokenize(title_keywords, sw_list=sw_list, language='CN'))
                db.token_war.update_one(
                    {'number': i['number']},
                    {'$set': {'title_keywords': tokens_string}})
        print "%d titles tokenized!" % cnt
        conn.close()
    except Exception as e:
        print "From titile_tokens:\n\tUnexpected Error: {}".format(e)
def testwebhook():
    if request.method == 'GET':
        return verify_webhook(request)
    elif request.method == 'POST':
        return do_conversational_flow(bot, request, utterances, model,
                                      encoder, vector, stemmer, get_stopwords())
    else:
        return 'Error de solicitud'  # "Request error"
def get_new_vec(model, docs, sw_path=default_sw_path, language="CN"):
    sw_list = get_stopwords(sw_path)
    new_doc_vec = []
    try:
        for doc in docs:
            tokens = tokenize(doc, sw_list, language=language)
            doc_vec = model.infer_vector(tokens)
            new_doc_vec.append(doc_vec)
        return new_doc_vec
    except Exception as e:
        print "From get_new_vec:\n\tUnexpected Error: {}".format(e)
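# Usage sketch (hedged): 'doc2vec_news.model' and the sample documents are
# illustrative; assumes a gensim Doc2Vec model trained on the same corpus.
from gensim.models import Doc2Vec

d2v = Doc2Vec.load('doc2vec_news.model')            # hypothetical saved model
new_docs = [u'一段新的新闻正文', u'另一段新的新闻正文']  # hypothetical raw documents
new_vectors = get_new_vec(d2v, new_docs)            # one inferred vector per document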
def __iter__(self):
    sw_list = get_stopwords(self.sw_path)
    f = open(self.file_path)
    csv_reader = csv.reader(f, delimiter='\t')
    for i, line in enumerate(csv_reader):
        if i + 1 > self.lines:
            self.lines = i + 1
        self.label_list.append(line[0])  # get the doc label
        tag = "%s_%s" % (self.file_name, str(i))
        yield doc2vec.TaggedDocument(tokenize(line[1], sw_list, self.t), tags=[tag])
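# Training sketch (hedged): streams the TaggedDocument objects yielded by the
# __iter__ above into gensim Doc2Vec. "TaggedCorpus" and its constructor
# arguments are hypothetical stand-ins for the surrounding class; keyword
# names assume gensim >= 3.4.
from gensim.models import doc2vec

corpus = TaggedCorpus(file_path='datasets/THUCNews_jieba.txt',                # hypothetical class and paths
                      sw_path='datasets/cn_stopwords_punctuations.csv')
model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
model.save('doc2vec_news.model')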
def main():
    crawl_state = crawl.main()
    sp = os.path.join(cwd, "text/news_stopwords.txt")
    sw_list = prep.get_stopwords(sp)
    a = art_tokenize()
    b = rm_sp_tokens(sw_list)
    c = pos_clean_tokens()
    titile_tokens()
    title_clean_tokens()
    crawl_state = crawl_state + '\n' + a + '\n' + b + '\n' + c
    print "From the mongodb_io.py:"
    print crawl_state
    return crawl_state
def extract_bag_of_words(df, use_cached=True):
    if use_cached and os.path.isfile('./cache/_cached_bag_of_words.pkl'):
        with open('./cache/_cached_bag_of_words.pkl', 'rb') as infile:
            tokens = pickle.load(infile)
        return tokens

    stopwords = get_stopwords()
    tokens = {}
    for n in range(df.shape[0]):
        if not n % 1000:
            print(n)
        tokens[df.iloc[n]['status_id']] = pipe(df.iloc[n]['message'], stopwords)

    with open('./cache/_cached_bag_of_words.pkl', 'wb') as outfile:
        pickle.dump(tokens, outfile)
    return tokens
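# Usage sketch (hedged): './cache' must exist before the first uncached call,
# otherwise opening the pickle file for writing fails. 'posts_df' is a
# hypothetical DataFrame with the 'status_id' and 'message' columns the
# function expects; pipe() and get_stopwords() come from the surrounding project.
os.makedirs('./cache', exist_ok=True)
bags = extract_bag_of_words(posts_df, use_cached=False)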
def get_word_count(x):
    """Return normal, unique, stop, unusual, and total word counts for a text Series."""
    wc = x.apply(lambda text: len(word_tokenize(text)))
    unique_wc = x.apply(lambda text: len(np.unique(tokenizer(text))))
    stop_wc = x.apply(lambda text: len(get_stopwords(tokenizer(text))))
    unusual_wc = x.apply(lambda text: len(get_unusual_words(tokenizer(text))))
    return pd.DataFrame({
        '{}_word_count'.format(x.name): wc,
        '{}_unique_word_count'.format(x.name): unique_wc,
        '{}_stopword_count'.format(x.name): stop_wc,
        '{}_unusual_word_count'.format(x.name): unusual_wc,
        '{}_total_word_count'.format(x.name): wc + stop_wc + unusual_wc,
    })
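# Usage sketch (hedged): the sample DataFrame is illustrative; assumes the
# word_tokenize / tokenizer / get_stopwords / get_unusual_words helpers used
# above are importable in this module.
sample = pd.DataFrame({'review': ['This movie was great fun.',
                                  'Terrible plot, but the photography was nice.']})
counts = get_word_count(sample['review'])   # column names are prefixed with 'review_'
print(counts)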
def tokenize():
    content = request.args.get('content', None, type=str)
    print "From view.py tokenize: %s" % content
    sw_list = prep.get_stopwords(
        "/home/skipper/study/python/project/text/news_stopwords.txt")
    tokens = prep.tokenize(content, sw_list, language="CN")
    for i in tokens:
        print i
    tokens_string = '/'.join(tokens)
    print tokens_string
    pos_tag, pos_string = prep.pos_test(tokens)
    print "From view.py tokenize: %s\n%s\n%s\n" % (tokens_string, pos_tag, pos_string)
    detail = {}
    detail['tokens_string'] = tokens_string
    detail['pos_tag'] = pos_tag
    detail['pos_string'] = pos_string
    return jsonify(detail)
# Paths
init_path = os.getcwd()
data_dir = os.path.join(init_path, 'datasets')
stopwords_path = os.path.join(data_dir, stopwords_name)
save_dir = os.path.join(init_path, 'saved_models')
model_path = os.path.join(save_dir, model_name)
tokenizer_path = os.path.join(save_dir, tokenizer_name)
labels_path = os.path.join(save_dir, labels_index)
graph = get_default_graph()

# Stopwords and label names
stopwords = pp.get_stopwords(stopwords_path)
with open(labels_path) as f:
    labels_name = f.readlines()
labels_name = [x.split('\t') for x in labels_name]
labels_name = {int(x[0]): x[1] for x in labels_name}

# Model and tokenizer
model = load_model(model_path, compile=True)
with open(tokenizer_path, 'rb') as f:
    tokenizer = pickle.load(f)


def get_pred(texts):
    """Predict the category of each input text.

    Parameters
def main(date_start=None, date_end=None, n_clusters=0, days=5, saving=0, show=0):
    if saving == 1:
        import myvisual as visual
    else:
        import myvisual_show as visual
    sw_path = os.path.join(cwd, "text/news_stopwords.txt")
    sw_list = prep.get_stopwords(sw_path)

    if date_start and date_end:
        s_eles = date_start.split('/')
        s_eles = [int(i) for i in s_eles]
        s_eles = [str(i) for i in s_eles]
        d_st = datetime(int(s_eles[0]), int(s_eles[1]), int(s_eles[2]))
        e_eles = date_end.split('/')
        e_eles = [int(i) for i in e_eles]
        e_eles = [str(i) for i in e_eles]
        d_end = datetime(int(e_eles[0]), int(e_eles[1]), int(e_eles[2]), 23, 59, 59)
        if n_clusters == 0:
            base_path = '_'.join(s_eles) + '-' + '_'.join(e_eles) + '-orign'
        else:
            base_path = '_'.join(s_eles) + '-' + '_'.join(e_eles) + '-Clusters_%s' % str(n_clusters)
        print "The store path: %s" % base_path
    else:
        c_t = time.localtime(time.time())
        tag_time = crawl.get_point_time(days)  # 0 means today 00:00, -1 means tomorrow
        b = str(tag_time).split()[0].split('-')
        b = [int(i) for i in b]
        b = [str(i) for i in b]
        # modify 9/19/2017
        # add
        date_start = '/'.join(b)
        date_end = '/'.join([str(c_t[0]), str(c_t[1]), str(c_t[2])])
        # print "-------"
        # print "%s---%s" % (date_start, date_end)
        d_st = tag_time
        d_end = datetime(c_t[0], c_t[1], c_t[2], c_t[3], c_t[4], c_t[5])
        # print d_st, d_end
        # add done
        if n_clusters == 0:
            base_path = '_'.join(b) + '-' + '_'.join(
                [str(c_t[0]), str(c_t[1]), str(c_t[2])]) + '-orign'
        else:
            base_path = '_'.join(b) + '-' + '_'.join(
                [str(c_t[0]), str(c_t[1]), str(c_t[2])]) + '-Clusters_%s' % str(n_clusters)
        print "The store path: %s" % base_path

    flask_path = '/static/img/' + base_path
    print flask_path
    if os.path.exists(os.path.join(cwd, 'app') + flask_path) and show == 0:
        p_list = ['mds.jpg', 'pca.jpg', 'bar.jpg', 'pie.jpg']
        p_list = [flask_path + '/' + i for i in p_list]
        p_list.append(base_path)
        print "The main function returns this file path list:"
        print p_list
        return p_list
    else:
        path = os.path.join(cwd, 'app') + flask_path
        if saving == 1:
            os.mkdir(path)  # changing
        print "Loading data from the MongoDB......"
        # if date_start and date_end:
        #     target = mgio.get_target_1(d_st, d_end)
        # else:
        #     target = mgio.get_target(tag_time)
        target = mgio.get_target_1(d_st, d_end)
        # changed
        # modify 9/19/2017
        # MongoDB may have no news during the specified time period
        # add
        if not target.count():
            print "No news between %s and %s" % (d_st, d_end)
            print "Crawling the news from the website..."
            # crawl.main()
            mgio.main()
            target = mgio.get_target_1(d_st, d_end)
            # return
            # This happens on the first run of each test; when working in Jupyter the notebook file needs to be recreated
        # add done
        data = prep.convert_to_dataframe(target)
        cnt = len(data)
        doc_id = list(data.id)
        print "The number of articles: %d" % cnt
        print "Details as follows:"
        df = pd.DataFrame(data, columns=['id', 'date', 'title', 'url'])
        print df
        tokens_string = list(data.im_tokens)
        d = {'id': data['id'], 'date': data['date'], 'keywords': tokens_string}
        df = pd.DataFrame(d)
        print df
        if cnt > 0:
            print "Creating TF TDM......"
            tf_matrix, dictionary = mycluster.get_tf_matrix(tokens_string)
            print "The length of dictionary: %d" % len(dictionary)
            print "Creating TF-IDF TDM......"
            tfidf_matrix = mycluster.get_tfidf_matrix(tf_matrix)
            print "Creating the words and weight of every doc......"
            docs_keywords_string, docs_words_weight = mycluster.get_first_n_words(
                tfidf_matrix, 10, dictionary)
            k_w = {}
            for i, ele in enumerate(docs_words_weight):
                keys = ele.keys()
                weight = ele.values()
                k_w['%d_word' % i] = pd.Series(keys)
                k_w['%d_weight' % i] = pd.Series(weight)
            df = pd.DataFrame(k_w)
            print df

            if n_clusters == 0:
                n_clusters = int(sqrt(cnt / 2))
                print "The n_clusters is: %d" % n_clusters
            print "Clustering......"
            labels = mycluster.get_label_list(n_clusters, tfidf_matrix)
            data['cluster'] = labels
            index_list = list(data['cluster'].value_counts().index)
            hot_sort = data['cluster'].value_counts()
            df = pd.DataFrame.from_dict({
                'Cluster Label': hot_sort.index,
                'Article Count': hot_sort.values
            })
            df = pd.DataFrame(df, columns=['Cluster Label', 'Article Count'])
            print df

            for i in index_list:
                print "Cluster Label: %d" % index_list[i]
                print "# The count of articles: %d" % hot_sort[i]
                df = pd.DataFrame.from_dict({
                    'id': list(pd.DataFrame(data.loc[data['cluster'] == i])['id']),
                    'title': list(pd.DataFrame(data.loc[data['cluster'] == i])['title']),
                    'url': list(pd.DataFrame(data.loc[data['cluster'] == i])['url'])})
                df = pd.DataFrame(df, columns=['id', 'title', 'url'])
                print df

            clusters_topic_dict = {}
            clusters_art_number = {}
            clusters_title_keywords = mycluster.get_clusters_detail(n_clusters, data, 'im_title')
            for i, art_list in enumerate(clusters_title_keywords):
                texts = [ele.split() for ele in art_list]
                t = topic.get_topic_string(texts)  # get the cluster topic from the title keywords through the LDA model
                clusters_topic_dict[index_list[i]] = t
                clusters_art_number[index_list[i]] = len(art_list)
            topic_list = []
            art_number_list = []
            c_classes_keywords_sort = []
            for idx in index_list:
                topic_list.append(clusters_topic_dict[idx])
                art_number_list.append(clusters_art_number[idx])
            df = pd.DataFrame.from_dict({'Cluster Label': index_list,
                                         'Article Count': art_number_list,
                                         'Topic Based on Title': topic_list})
            df = pd.DataFrame(df, columns=['Cluster Label', 'Article Count', 'Topic Based on Title'])
            print df

            visual.mds_show(tfidf_matrix, clusters_topic_dict, n_clusters, data, path, saving)
            visual.pca_show(tfidf_matrix, clusters_topic_dict, n_clusters, data, path, saving)

            clusters_cmt_number = {}
            clusters_cmt_detail = []
            clusters_id_list = mycluster.get_clusters_detail(n_clusters, data, 'id')
            for i, cluster_id_list in enumerate(clusters_id_list):
                cmt_detail = mgio.id_get_cmts(cluster_id_list)  # returns a DataFrame
                clusters_cmt_detail.append(cmt_detail)
                clusters_cmt_number[index_list[i]] = len(cmt_detail)
            visual.cluster_barh_new(n_clusters, index_list, clusters_topic_dict,
                                    clusters_art_number, clusters_cmt_number, path, saving)

            cmt_number_list = []
            for idx in index_list:
                cmt_number_list.append(clusters_cmt_number[idx])
            hot_sort = data['cluster'].value_counts()
            df = pd.DataFrame.from_dict({
                'Cluster Label': hot_sort.index,
                'Comment Count': cmt_number_list
            })
            df = pd.DataFrame(df, columns=['Cluster Label', 'Comment Count'])
            print df

            file_path = os.path.join(cwd, 'cmt_stit.txt')
            # file_path = '/home/skipper/study/python/project_v2/cmt_stit.txt'
            stopwords_path = os.path.join(cwd, "text/stopsign.txt")
            # stopwords_path = "/home/skipper/nltk_data/Other_data/stopwords/stopsign.txt"
            d_stit_prop_dict = {}
            s_stit_prop_dict = {}
            for i, cmt_detail in enumerate(clusters_cmt_detail):
                cmt_list = list(cmt_detail['content'])
                svm_label = sentiment.get_stit_label(cmt_list, file_path=file_path,
                                                     sw_path=stopwords_path, name="svm", language="CN")
                snow_label = sentiment.get_snow_label(cmt_list)
                # Add to DataFrame
                cmt_detail['d_label'] = svm_label
                cmt_detail['s_label'] = snow_label
                d_stit_prop_dict[index_list[i]] = sentiment.get_stit_prop(svm_label)
                s_stit_prop_dict[index_list[i]] = sentiment.get_stit_prop(snow_label)

            for i, cmt_detail in enumerate(clusters_cmt_detail):
                print "# The Cluster label: %d" % index_list[i]
                print "# The count of comments: %d" % cmt_number_list[i]
                df = pd.DataFrame(cmt_detail, columns=['vote', 'against', 'd_label', 'content'])
                print df
            visual.cluster_stit_pie(n_clusters, index_list, d_stit_prop_dict,
                                    clusters_topic_dict, path, saving)

            print "Creating the sub-class topics......"
            sub_cluster, sub_weight = mycluster.get_classes(
                n_clusters, data, docs_words_weight, docs_keywords_string)
            classes_keywords = mycluster.get_class_keywords(n_clusters, 3, sub_cluster, sub_weight)
            print "For K-means clustering:"
            mycluster.all_output(n_clusters, classes_keywords, clusters_topic_dict,
                                 sub_cluster, data, d_stit_prop_dict, s_stit_prop_dict)

        p_list = ['mds.jpg', 'pca.jpg', 'bar.jpg', 'pie.jpg']
        p_list = [flask_path + '/' + i for i in p_list]
        p_list.append(base_path)
        print "The main function returns this file path list:"
        print p_list
        return p_list
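# Usage sketch (hedged): an illustrative invocation of the clustering pipeline
# above; the date range and cluster count are made-up values, and the module
# is assumed to run under Python 2 like the rest of this file.
if __name__ == '__main__':
    result_paths = main(date_start='2017/9/1', date_end='2017/9/19',
                        n_clusters=8, days=5, saving=1, show=0)
    print result_paths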