def create_all_indi_word_clouds(infold, outfold, sub='tweets'):
    stopwords = utils.get_stopwords(sub)
    outfold = os.path.join(outfold, sub, 'indi')
    os.makedirs(outfold, exist_ok=True)
    for i, city in enumerate(config.CITIES + ['other']):
        print(city)
        create_indi_word_cloud(infold, outfold, sub, city, stopwords)
def grid_search_dim_red(*args):
    # Dummy search.
    stop_words = get_stopwords()
    n_estimators = (30, 40, 50)
    max_depths = [5, 7, 10, 20]
    criterions = ('gini', 'entropy')
    pca_params = [7, 14, 21, 30]
    i = 1
    for max_depth in max_depths:
        for criterion in criterions:
            for n_estimator in n_estimators:
                for pca_param in pca_params:
                    print(f'{i}. {max_depth}, {criterion}, {n_estimator}, {pca_param}.')
                    vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=.7)
                    clf = RandomForestClassifier(n_estimators=n_estimator,
                                                 criterion=criterion,
                                                 max_depth=max_depth)
                    print(grid_search_with_dim_red_one(clf, vectorizer, *args,
                                                       n=pca_param))
                    i += 1
def get_text_ranks():
    segmenter = Segmenter()
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    corpus = get_corpus()
    adjlist = dict()
    for idx, poem in enumerate(corpus):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." % (idx + 1, len(corpus)))
        for sentence in poem['sentence']:
            segs = list(
                filter(lambda word: word not in stopwords,
                       segmenter.segment(sentence)))
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            for i, seg in enumerate(segs):
                for _, other in enumerate(segs[i + 1:]):
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg][other] + 1 \
                            if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg] + 1 \
                            if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(weight for other, weight in adjlist[word].items())
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
def get_distribution_other(locfile):
    stopwords = utils.get_stopwords()
    freqs = {}
    with open(locfile, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            freqs[parts[0]] = int(parts[1])
    freqs = utils.filter_stopwords(freqs, stopwords, filter_unprintable=True)
    cloud = wordcloud.WordCloud(
        background_color="white").generate_from_frequencies(freqs)
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title('Locations of Other', fontsize=15)
    plt.show()
    plt.clf()
    plt.close()


# get_distribution_other('loc.vocab')
# rank_city_by_uniqueness('tweets', 'images/unique', thres1=5, thres2=5, at_least_ratio=2, results=2)
# rank_city_by_uniqueness('bios', 'images/unique', thres1=2, thres2=2, at_least_ratio=2, results=2)
# get_popuplarity_distribution('images/pop')
def remove_stopwords(sub, d='processed'):
    nlp = spacy.load('en_core_web_sm')
    stopwords = utils.get_stopwords()
    infold = d + '/tokenized'
    outfold = d + '/cleaned'
    subinfold = os.path.join(infold, sub)
    suboutfold = os.path.join(outfold, sub)
    os.makedirs(suboutfold, exist_ok=True)
    files = glob.glob(subinfold + '/*.tok')
    for file in files:
        filename = file[file.rfind('/') + 1:]
        out = open(os.path.join(suboutfold, filename), 'w')
        with open(file, 'r') as f:
            for line in f:
                line = line.strip().replace("’", "'")
                line = line.replace("’", "'")
                line = line.replace("“", '"')
                line = line.replace('”', '"')
                raw_tokens = line.split()
                tokens = [
                    token for token in raw_tokens
                    if len(token) > 0 and not token.lower() in stopwords
                ]
                if len(tokens) > 0:
                    out.write(' '.join(tokens) + '\n')
        out.close()
def get_wordvectors():
    print("wordvectors read ...")
    word_dict = utils.get_word_counts(
        "/../resources/yelp/data/yelp_restaurant_word_counts.txt")
    stop_words = utils.get_stopwords()
    vw_model = utils.get_word2vec_model(
        '../resources/yelp/word2vec/yelp_restaurants_word2vector', ncols, nwin)
    vw_model.syn0 = utils.normalize2(vw_model.syn0)
    glove_dict = utils.get_glove_data('../resources/yelp/glove/',
                                      'vectors_' + str(ncols) + '.txt')
    return vw_model, word_dict, stop_words, glove_dict
def __init__(self, flags):
    # Load the model.
    self.model = get_model(flags.max_len, flags.vocab_size, flags.embedding_dim,
                           flags.lstm_unit, flags.dropout_loss_rate,
                           flags.label_num)
    self.model.load_weights(flags.weight_save_path)
    # Preload the resources used to process the review data.
    self.stopwords = get_stopwords(flags.stopwords_file)
    self.w2i, _ = read_vocab(flags.vocab_file)
    with open(flags.label_file, 'r') as f:
        self.labels = [l.strip() for l in f.readlines()]
    self.classify = ['Not mention', 'Bad', 'Normal', 'Good']
def __init__(self, **kargs):
    """Initialize, load Glove embeddings."""
    super(GloVeSemanticSimilarityMetric, self).__init__()
    get_nltk_data()
    self._glove = get_glove_emb()
    stopwords = get_stopwords()
    logger.info("Glove embeddings and stopwords loaded.")
    for word in stopwords:
        word = word.lower().strip()
        if word in self._glove["tok2id"]:
            self._glove["emb_table"][self._glove["tok2id"][word], :] = 0
def search_synonym(query):
    try:
        solr = connect_solr()
        list_words = ViTokenizer.tokenize(query).split()
        stopwords = utils.get_stopwords()
        words = []  # words remaining after stop-word removal
        for word in list_words:
            if word not in stopwords:
                words.append(word)
    except Exception:
        print("[ERROR] search synonym error: Something went wrong!")
def create_all_duo_word_clouds(infold, outfold, sub):
    stopwords = utils.get_stopwords(sub)
    outfold = os.path.join(outfold, sub, 'duo')
    os.makedirs(outfold, exist_ok=True)
    # for i, city1 in enumerate(config.CITIES + ['other']):
    #     for j, city2 in enumerate(config.CITIES + ['other']):
    stopwords.add('favorite')
    stopwords.add('favourite')
    stopwords.add('mom')
    stopwords.add('mum')
    for i, city1 in enumerate(['nyc']):
        for j, city2 in enumerate(['melbourne']):
            if i <= j:
                create_duo_word_clouds(infold, outfold, sub, city1, city2,
                                       stopwords)
def sentences2idx(texts, words):
    """
    Take in data, output array of word indices that can be fed into the algorithms.
    :param texts: List of texts
    :return: x1, m1. x1[i, :] is the word indices in sentence i,
             m1[i, :] is the mask for sentence i (0 means no word at the location)
    """
    seq = []
    for t in texts:
        # Doing some cleaning of the text
        stopwords = utils.get_stopwords()
        text = t.strip().strip('"')
        text_clean = utils.clean_text(text)
        s = [w for w in text_clean.split(" ") if w not in stopwords]
        s = s[0:MAX_WORDS]
        seq.append(data_io.getSeq(' '.join(s), words))
    x1, m1 = data_io.prepare_data(seq)
    return x1, m1
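# Added usage sketch (not part of the original module): `words` is assumed to be
# the word-to-index vocabulary that data_io.getSeq expects.
def _sentences2idx_demo(words):
    x1, m1 = sentences2idx(["the quick brown fox", "hello world"], words)
    # x1[i, :] holds padded word indices for sentence i; m1[i, j] == 0 marks padding.
    return x1, m1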
def get_tf(text):
    stopdict = utils.get_stopwords()
    tf = {}
    # seg = utils.chinese_segment.get_segment(text)
    word_count = 1
    for word in text.split(' '):
        if word not in stopdict and len(word.strip()) > 1:
            word_count += 1
            if word in tf:
                tf[word] += 1
            else:
                tf[word] = 1
    rs = sorted(iteritems(tf), key=lambda d: d[1], reverse=True)
    rs = [(a[0], a[1] / float(word_count)) for a in rs]
    if len(rs) > 20:
        return rs[:20]
    else:
        return rs
def search(query, page=1):
    try:
        solr = connect_solr()
        list_words = ViTokenizer.tokenize(query).split()
        stopwords = utils.get_stopwords()
        words = []  # words remaining after stop-word removal
        for word in list_words:
            if word not in stopwords:
                words.append(word)
        if len(words) == 0:
            return {"results": [], "numFound": 0}
        else:
            clean_query = ' '.join(words)
            page = int(page)
            results = solr.search(
                "content_clean:{}".format(clean_query),
                **{'fl': '*, score', 'start': "{}".format((page - 1) * 10)})
            return {
                "results": results,
                "numFound": results.raw_response['response']['numFound']
            }
    except Exception:
        print("[ERROR] search error: Something went wrong!")
def vanilla_experiment(dataset, classifiers, tf_idf_params, nlp=NLP):
    train_df = read_data(dataset['train'])
    dev_df = read_data(dataset['dev'])
    stop_words = list(get_stopwords())
    X_train, y_train, X_dev, y_dev = get_train_dev_data(train_df, dev_df, nlp)
    print(len(X_train), len(X_dev), flush=True)
    best_clfs = grid_search(classifiers, tf_idf_params, X_train, y_train,
                            stop_words=stop_words)
    test_on_dev(best_clfs, X_train, y_train, X_dev, y_dev)
    conf_train, conf_dev, labels = get_confusion_matrix(
        best_clfs[0], X_train, y_train, X_dev, y_dev)
    pprint(conf_train)
    pprint(conf_dev)
    return best_clfs
def __init__(self, dataset, model_init):
    self._tokenizer = BertTokenizer.from_pretrained(
        utils.get_transformers(model_init),
        do_lower_case="uncased" in model_init)
    self._glove = get_glove_emb()
    stopwords = get_stopwords()
    for word in stopwords:
        word = word.lower().strip()
        if word in self._glove["tok2id"]:
            self._glove["emb_table"][self._glove["tok2id"][word], :] = 0
    data = []
    logger.info("processing data for wordpiece embedding training")
    for item in tqdm.tqdm(dataset["data"]):
        text = item["text0"]
        if "text1" in item:
            text += " " + item["text1"]
        text_toks = word_tokenize(text)
        data += [x for x in text_toks if x.lower() in self._glove["tok2id"]]
    self._data = data
def processing_data(infile, labelfile, outfile, vocab_file, stopwords_file):
    print('Loading stopwords...')
    stopwords = get_stopwords(stopwords_file)
    print('Loading data...')
    data = pd.read_csv(infile)
    print('Saving labels')
    with open(labelfile, 'w') as f:
        for label in data.columns[2:]:
            f.write(label + '\n')
    # Split each sentence into words.
    print('Splitting content')
    contents = data['content'].tolist()
    seg_contents = segmentData(contents, stopwords)
    if not os.path.exists(vocab_file):
        print('Creating vocabulary...')
        create_vocab(seg_contents, vocab_file, 50000)
    print('Loading vocabulary...')
    w2i, _ = read_vocab(vocab_file)  # word2id
    print('Tokenize...')
    token_contents = [tokenizer(c, w2i) for c in seg_contents]
    data['content'] = token_contents
    # Convert the labels to one-hot form.
    print('One-hot label')
    for col in data.columns[2:]:
        label = data[col].tolist()
        onehot_label = [onehot(l) for l in label]
        data[col] = onehot_label
    print('Saving...')
    data[data.columns[1:]].to_csv(outfile, index=False)
def calculate(raw_data, tag):
    '''tag: which information source the raw news comes from'''
    """Fetch the raw news data."""
    if not raw_data == {}:
        entityId_list = []  # news IDs
        train_titles = []
        publishDateTime_list = []
        count = 0
        for k, v in raw_data.items():
            if v.get('content') is None or v.get('content') == '':
                continue
            count += 1
            entityId_list.append(k)
            if v.get('title') is None or v.get('title') == '':
                train_titles.append(v.get('content'))
            else:
                train_titles.append(v.get('content') + ' ' + v.get('title'))
            publishDateTime_list.append(v.get('publishDateTime'))
        print "get raw data done!"

        """Clustering analysis."""
        hotId2newsId = {}  # news IDs belonging to each hot topic
        hotId_to_be_write = {}
        start = datetime.now()
        train_titles = [
            chinese_segment.get_segment(title) for title in train_titles
        ]
        train_data = [model_train.get_tf(d) for d in train_titles]
        print 'seg take time %s' % (datetime.now() - start).seconds

        '''First clustering pass: group the raw data into a number of hot topics.'''
        start = datetime.now()
        first_cluster_result = model_train.first_cluster(
            train_data, entityId_list, publishDateTime_list)
        print 'first_cluster_result take time %s' % (datetime.now() - start).seconds
        old_hotId_words = utils.get_hotId_words(tag)
        hotId_newsId = []  # news IDs belonging to each hot topic

        '''
        Compare the current hot topics with the existing ones: if a topic is not
        among the existing topics, insert it; otherwise skip it.
        '''
        start = datetime.now()
        for first_result in first_cluster_result:
            terms = model_train.get_tf_from_list(first_result)  # keywords of this hot topic
            hotId, hotIdflag = utils.get_hot_id(
                terms, old_hotId_words)  # produce a hot-topic ID for this cluster
            # if hotId not in old_hotId_words:
            if hotIdflag:
                hotId_to_be_write[hotId] = terms  # a newly created hot topic
            for news in first_result:
                # record = [hotId, news[0], news[2], utils.generateTime(), tag]
                # news[0] and news[2] are the news ID and the creation time
                record = [hotId, news[0], news[2], utils.generateTime()]
                hotId_newsId.append(record)
                if hotId in hotId2newsId:
                    hotId2newsId[hotId].append(news[0])  # hot topic and its news items
                else:
                    hotId2newsId[hotId] = [news[0]]
        utils.write2hotId_newsId(hotId_newsId)
        print "write to newsId done!"
        print 'hot compare take time %s' % (datetime.now() - start).seconds

        """Write the new hot topics."""
        for hotId, terms in hotId_to_be_write.items():
            if hotId in hotId2newsId:
                utils.write2hotId_words(hotId, terms, tag)
        print "write to words done!>>>>>>>>>"

        """Extract the topic keywords for each hot topic."""
        # Compute the IDF over all news collected in this batch.
        stopdict = utils.get_stopwords()
        hotId2title = {}
        df_dict = {}
        n_doc = 0
        for line in train_titles:
            arr = line.split(' ')
            n_doc += 1
            temp_dict = {}  # deduplicate, since we are computing document frequency (DF)
            for tk in arr:
                # if tk in token2id:
                temp_dict[tk] = 1
            for k, _ in temp_dict.items():
                if k in df_dict:
                    df_dict[k] += 1
                else:
                    df_dict[k] = 1
        idf_dict = {}
        for w, df in iteritems(df_dict):
            idf_dict[w] = math.log(n_doc / float(df + 1))

        # Compute the TF-IDF within each hot topic to find its keywords.
        for hotId, newsId_list in hotId2newsId.items():
            tf = {}
            title_words = []
            for newsId in newsId_list:
                content = raw_data[newsId].get('content')
                seg = chinese_segment.get_segment(content)
                for word in seg.split(' '):
                    if word not in stopdict and len(word.strip()) > 1:
                        if word in tf:
                            tf[word] += 1
                        else:
                            tf[word] = 1
            tf_idf = {}
            for word, freq in tf.items():
                if word not in idf_dict:
                    tf_idf[word] = 0
                else:
                    tf_idf[word] = freq * idf_dict[word]
            rs = sorted(iteritems(tf_idf), key=lambda d: d[1], reverse=True)
            if len(rs) >= 3:
                title_words = [rs[0][0], rs[1][0], rs[2][0]]
            else:
                for r in rs:
                    title_words.append(r[0])
            hotId2title[hotId] = title_words

        """Statistical analysis of the news under each hot topic."""
        """order, hotScore, category, totalComments, totalLikes, siteCoverage, area, totalNews"""
        hot_information_data = []  # [{"hotId": 1, "order": 1, "title": "apple"}, {...}]
        update = []
        insert_count = 0
        for hotId, newsId_list in hotId2newsId.items():
            # firstFindSite, lastDays
            keywords = hotId2title[hotId]
            news_count = len(newsId_list)
            totalComments = 0
            totalLikes = 0
            category_dict = {}
            site_set = set()
            local_area_count = 0
            earliestFindTime = "2049"
            firstFindSite = ""
            max_title = ""  # title of the highest-scoring news item
            max_score = -1  # highest news score
            second_title = ""  # title of the second-highest-scoring news item
            second_score = -1  # second-highest news score
            for newsId in newsId_list:
                news_category = raw_data[newsId].get('categoryName')
                news_comments = raw_data[newsId].get('commentCount')
                news_likes = raw_data[newsId].get('joinCount')
                news_site = raw_data[newsId].get('siteName')
                news_source_site = raw_data[newsId].get('sourceSiteName')
                news_publishDateTime = raw_data[newsId].get('publishDateTime')
                news_area = raw_data[newsId].get('siteTypeName')
                news_title = raw_data[newsId].get('title')
                if news_category in category_dict:
                    category_dict[news_category] += 1
                else:
                    category_dict[news_category] = 1
                if news_area == u"本地":  # "local"
                    local_area_count += 1
                # Find the earliest news time.
                if news_publishDateTime < earliestFindTime:
                    earliestFindTime = news_publishDateTime
                    if news_source_site is None or news_source_site == '':
                        firstFindSite = news_site
                    else:
                        firstFindSite = news_source_site
                temp_score = 0
                if news_comments is not None and news_comments.isdigit():
                    totalComments += int(news_comments)
                    temp_score += int(news_comments) * 0.6
                if news_likes is not None and news_likes.isdigit():
                    totalLikes += int(news_likes)
                    temp_score += int(news_likes) * 0.4
                # Find the titles of the two highest-scoring news items.
                if temp_score > max_score:
                    second_score = max_score
                    second_title = max_title
                    max_score = temp_score
                    max_title = news_title
                else:
                    if temp_score > second_score:
                        second_score = temp_score
                        second_title = news_title
                site_set.add(news_site)
            title = [max_title, second_title]
            rs = sorted(iteritems(category_dict), key=lambda d: d[1],
                        reverse=True)  # sort categories by count
            category = rs[0][0]  # take the category with the most news
            all_site_number = 6.  # total number of sites
            siteCoverage = round(len(site_set) / all_site_number, 2)
            if (float(local_area_count) / news_count) > 0.5:
                area = "local"
            else:
                area = "global"
            if tag == 'weibo':
                hotScore = 80 * (totalComments + totalLikes + news_count * siteCoverage)
            else:
                hotScore = 0.8 * totalComments + 1.0 * totalLikes + news_count * siteCoverage
            if tag == 'weibo':
                firstFindSite = '新浪微博'
            # elif tag == 'weixin':
            #     firstFindSite = '微信'
            hotScore = round(math.log(hotScore + 1), 2)
            if hotScore > 30:
                hotScore = 20
            elif hotScore < 30 and hotScore > 20:
                hotScore = 19.8
            # Count how many news items the existing hot topic already has.
            news_count_old = news_count
            if hotId in old_hotId_words.keys():
                news_count_old = utils.get_newsId_count(hotId)
                print 'old total news:%s' % news_count_old
            hot_data_wrap = {
                "hotId": hotId,
                "hotScore": hotScore,
                "title": title,
                "category": category,
                "totalComments": totalComments,
                "totalLikes": totalLikes,
                "siteCoverage": siteCoverage,
                "area": area,
                "keywords": keywords,
                "totalNews": news_count_old,
                "firstFindSite": firstFindSite,
                "datatagcategory": tag,
                "lastDays": 1
            }
            # Time the hot topic was first found.
            if hotId in old_hotId_words:
                update.append(1)
            else:
                hot_data_wrap.setdefault("findTime", utils.generateTime())
                update.append(0)
                insert_count += 1
            hot_data_wrap.setdefault("lastUpdate", utils.generateTime())
            hot_information_data.append(hot_data_wrap)
        print "insert_count/operation_count:", insert_count, '/', len(hot_information_data)
        utils.write2hot_info(hot_information_data, update)
        print "write to hot_info done!>>>>>>>>>"

        """Compute order and hotScore (out of 100)."""
        hotScore_data = utils.get_hotScore_from_hot_info_time_decay(
            'datatagcategory', tag)  # fetch hotId and hotScore
        hotScore_data = sorted(hotScore_data, key=lambda d: d["hotScore"],
                               reverse=True)  # sort by hotScore, descending
        # max_hotScore = hotScore_data[0].get('hotScore')  # take the maximum
        # print max_hotScore
        for i in range(len(hotScore_data)):
            hotScore_data[i].setdefault("order", i + 1)
            # hotScore_data[i].__setitem__("hotScore", int(100 * hotScore_data[i].get("hotScore") / float(max_hotScore)))
        utils.update_hot_info(hotScore_data)
        print "update rank to hot_info done!>>>>>>>>>"
    else:
        print "empty raw data"
def load_data_2_root(data):
    # Combine every line into n-grams and insert them into the trie.
    for word_list in data:
        # tmp holds every n-gram combination built from one line, e.g. for the
        # segmented line 雪落/ 山庄/ 不是/ 一座/ 山庄/ 只是/ 一个/ 客栈:
        # tmp: [['雪落'], ['山庄'], ['不是'], ['一座'], ['山庄'], ['只是'], ['一个'], ['客栈'],
        #       ['雪落', '山庄'], ['山庄', '不是'], ['不是', '一座'], ['一座', '山庄'],
        #       ['山庄', '只是'], ['只是', '一个'], ['一个', '客栈'],
        #       ['雪落', '山庄', '不是'], ['山庄', '不是', '一座'], ['不是', '一座', '山庄'], ...]
        ngrams = generate_ngram(word_list, 3)
        # print(ngrams)
        for d in ngrams:
            root.add(d)
    print('------> 插入成功')  # insertion finished


if __name__ == "__main__":
    root = TrieNode('*', None)
    stopwords = get_stopwords('./data/stopword.txt')
    data = load_data('./data/data.txt', stopwords)
    # Insert the new articles into the trie rooted at `root`.
    load_data_2_root(data)

    # How many top candidates to take.
    topN = 20
    result, add_word = root.find_word(topN)
    # To debug or try other thresholds, print `result` and adjust.
    # print("\n----\n", result)
    print("\n----\n", '增加了 %d 个新词, 词语和得分分别为: \n' % len(add_word))  # "added %d new words; words and scores:"
    print('#############################')
    for word, score in add_word.items():
        print(word + ' ----> ', score)
    print('#############################')
def main(distance_type, refresh):
    '''
    INPUT
        distance_type<string>: Gaussian for gaussian projection OR
                               Sparse for Achlioptas projection
                               default: Sparse
        refresh<bool>: refreshes the data model, recomputing word2idx and bow

    OUTPUT
        dist<float<D,D>>: matrix of distances using the random projections algorithm
                          values are always saved following the pattern
                          <projection>_<eps>_distance_matrix.txt
                          D: original count of documents
    '''
    filename_distance_matrix = '%s_distance_matrix.txt' % (distance_type)
    if refresh:
        devel_path = DATASET_PATH + 'development.json'
        print('reading...\r')
        data = pd.read_json(devel_path, orient='records')
        print('reading...done\r')
        print('')
        print('tokenizing...\r')
        this_stemmer = get_stemmer()
        this_stopwords = get_stopwords()
        tokenfy = lambda x: tokenizer2(
            x, stemmer=this_stemmer, stopwords=this_stopwords)
        data['token_description'] = data['description'].apply(tokenfy)
        print('tokenizing...done\r')
        print('')
        print('indexing...\r')
        word2idx = {}
        data = data2idx(data, word2idx, colname='token_description',
                        new_colname='idx_description')
        print('indexing...done\r')
        print('')
        print('generating bag of words...\r')
        print('')
        bow2 = data2bow(data, word2idx)
        print('generating bag of words...done\r')
    else:
        print('')
        print('retrieving word2idx...')
        word2idx_path = DATASET_PATH + 'word2idx2.txt'
        df = pd.read_csv(word2idx_path, sep=' ', index_col=0, header=None)
        word2idx = {k: v for k, v in zip(df.index, df.iloc[:, 0])}
        print('retrieving word2idx...done')
        print('retrieving bow2...')
        bow2_path = DATASET_PATH + 'bow2.txt'
        df = pd.read_csv(bow2_path, sep=' ', index_col=None, header=None,
                         skiprows=1)
        bow2 = df.as_matrix()
        print('retrieving bow2...done')
    print('')
    print('compute %s distance...\r' % (distance_type))
    dist = bow2dist(bow2, verbose=True, distance_type=distance_type)
    print('compute %s distance...done\r' % (distance_type))
    print('')
    print('storing %s distance matrix...\r' % (distance_type))
    matrix2txt(dist, filename=filename_distance_matrix)
    print('storing %s distance matrix...done\r' % (distance_type))
from utils import get_stopwords

STOPWORDS = get_stopwords()
COLUMNS = [
    "utc_time", "country_name", "country_code", "place_type", "place_name",
    "language", "username", "user_screen_name", "timezone_offset",
    "number_of_friends", "tweet_text", "latitude", "longitude"
]
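# Added illustration (not from the original module): one plausible way the
# constants above could be used. The file name, separator, and loader below are
# assumptions, not the project's actual pipeline.
def load_tweets_example(path="tweets.tsv"):
    import pandas as pd
    # The dump is assumed to be tab-separated with no header row.
    df = pd.read_csv(path, sep="\t", names=COLUMNS, header=None)
    # Strip stopwords from the tweet text column.
    df["tweet_text"] = df["tweet_text"].apply(
        lambda t: " ".join(w for w in str(t).split()
                           if w.lower() not in STOPWORDS))
    return df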
def main(projection_type, eps, store, refresh):
    '''
    INPUT
        projection_type<string>: Gaussian for gaussian projection OR
                                 Sparse for Achlioptas projection
                                 default: Sparse
        eps<float>: threshold for acceptable distortions
                    higher eps -> higher theoretical probability of distortions;
                    eps is bounded between 0 and 1
        refresh<bool>: refreshes the data model, recomputing word2idx and bow
        store<bool>: stores 3 intermediary results: word2idx, bow, proj_bow

    OUTPUT
        dist<float<D,D>>: matrix of distances using the random projections algorithm
                          values are always saved following the pattern
                          <projection>_<eps>_distance_matrix.txt
                          D: original count of documents
        proj_bow<int<v,D>>: projection over the bag-of-words using the random
                            projections algorithm
                            values are always saved following the pattern
                            <projection>_<eps>_bow.txt
                            v<int>: v<<V is the new vocabulary size
                            D<int>: original count of documents
    '''
    startTime = datetime.now()
    dt = startTime.strftime('%Y-%m-%d %H:%M:%S')
    filename_distance_matrix = '%s_%.1f_%s_distance_matrix.txt' % (
        projection_type, eps, dt)
    filename_projection_bow = '%s_%.1f_%s_bow.txt' % (projection_type, eps, dt)
    profiler_breakdown = dict([
        ('io', timedelta(0)), ('tokenizer', timedelta(0)),
        ('indexing', timedelta(0)), ('BoW', timedelta(0)),
        ('rnd_proj', timedelta(0)), ('BoW2Dist', timedelta(0)),
        ('total', timedelta(0))
    ])
    profiler_modelparams = {}
    breakdownTime = startTime
    if refresh:
        devel_path = DATASET_PATH + 'development.json'
        print('reading...\r')
        data = pd.read_json(devel_path, orient='records')
        print('reading...done\r')
        profiler_breakdown['io'] += datetime.now() - breakdownTime
        breakdownTime = datetime.now()
        print('')
        print('tokenizing...\r')
        this_stemmer = get_stemmer()
        this_stopwords = get_stopwords()
        tokenfy = lambda x: tokenizer2(
            x, stemmer=this_stemmer, stopwords=this_stopwords)
        data['token_description'] = data['description'].apply(tokenfy)
        print('tokenizing...done\r')
        profiler_breakdown['tokenizer'] += datetime.now() - breakdownTime
        breakdownTime = datetime.now()
        print('')
        print('indexing...\r')
        word2idx = {}
        data = data2idx(data, word2idx, colname='token_description',
                        new_colname='idx_description')
        print('indexing...done\r')
        profiler_breakdown['indexing'] += datetime.now() - breakdownTime
        breakdownTime = datetime.now()
        print('')
        print('generating bag of words...\r')
        print('')
        bow2 = data2bow(data, word2idx)
        print('generating bag of words...done\r')
        profiler_breakdown['BoW'] += datetime.now() - breakdownTime
        if store:
            breakdownTime = datetime.now()
            print('')
            print('storing indexes...\r')
            word2idx2txt(word2idx, filename='word2idx2.txt')
            print('storing indexes...done\r')
            print('')
            print('storing bag of words...')
            matrix2txt(bow2, filename='bow2.txt')
            print('storing bag of words...done\r')
            profiler_breakdown['io'] += datetime.now() - breakdownTime
    else:
        breakdownTime = datetime.now()
        print('')
        print('retrieving word2idx...')
        word2idx_path = DATASET_PATH + 'word2idx2.txt'
        df = pd.read_csv(word2idx_path, sep=' ', index_col=0, header=None)
        word2idx = {k: v for k, v in zip(df.index, df.iloc[:, 0])}
        print('retrieving word2idx...done')
        print('retrieving bow2...')
        bow2_path = DATASET_PATH + 'bow2.txt'
        df = pd.read_csv(bow2_path, sep=' ', index_col=None, header=None,
                         skiprows=1)
        bow2 = df.as_matrix()
        print('retrieving bow2...done')
        profiler_breakdown['io'] += datetime.now() - breakdownTime
    breakdownTime = datetime.now()
    print('')
    print('compute random projection...\r')
    proj = bow2rnd_proj(bow2, projection_type=projection_type, eps=eps)
    print('compute random projection...done new (reduced) dimensions: %dx%d\r'
          % proj.shape)
    profiler_breakdown['rnd_proj'] += datetime.now() - breakdownTime
    if store:
        breakdownTime = datetime.now()
        print('')
        print('storing bag of words...\r')
        matrix2txt(proj, filename=filename_projection_bow)
        print('storing bag of words...done\r')
        profiler_breakdown['io'] += datetime.now() - breakdownTime
    breakdownTime = datetime.now()
    print('')
    print('compute %s distance...\r' % (projection_type))
    proj_dist = bow2dist(proj)
    print('compute %s distance...done\r' % (projection_type))
    profiler_breakdown['BoW2Dist'] += datetime.now() - breakdownTime
    breakdownTime = datetime.now()
    print('')
    print('storing %s distance matrix...\r' % (projection_type))
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    matrix2txt(proj_dist, filename=filename_distance_matrix)
    print('storing %s distance matrix...done\r' % (projection_type))
    profiler_breakdown['io'] += datetime.now() - breakdownTime
    V = proj.shape[0]
    profiler_breakdown['total'] = sum(profiler_breakdown.values(), timedelta())
    profiler_breakdown = {k: str(td) for k, td in profiler_breakdown.items()}
    filename_profiler = '%s_%.1f_%d_profiler_%s.txt' % (projection_type, eps, V, dt)
    dict2txt(profiler_breakdown, filename=filename_profiler)
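# Added sketch (not from the original script): how `eps` relates to the size of
# the random projection via the Johnson-Lindenstrauss lemma. The helper below is
# scikit-learn's; the document counts are illustrative only.
from sklearn.random_projection import johnson_lindenstrauss_min_dim

def min_components(n_docs, eps):
    # Smallest number of projected dimensions that keeps all pairwise distances
    # within a (1 +/- eps) factor with high probability.
    return johnson_lindenstrauss_min_dim(n_samples=n_docs, eps=eps)

# For roughly 10,000 documents, eps=0.3 needs on the order of a thousand
# components, while eps=0.1 needs several thousand: smaller eps means less
# distortion but also less dimensionality reduction.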
def load_data_2_root(data):
    print('------> 插入节点')  # inserting nodes
    # Build the n-gram combinations for every line/sentence.
    for word_list in data:
        # tmp holds every n-gram combination built from one line, e.g.
        # tmp: [['它'], ['是'], ['小'], ['狗'], ['它', '是'], ['是', '小'], ['小', '狗'],
        #       ['它', '是', '小'], ['是', '小', '狗']]
        ngrams = generate_ngram(word_list, 3)
        # Insert the n-grams into the trie that stores their occurrence counts.
        for d in ngrams:
            root.add(d)
    print('------> 插入成功')  # insertion finished


if __name__ == "__main__":
    # root_name = basedir + "/data/root.pkl"
    root_name = basedir + "/data/jianzhu.pkl"
    stopwords = get_stopwords()
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        # The documents alone do not reflect single-word frequencies well,
        # so bring in Jieba's bundled external dictionary.
        dict_name = basedir + '/data/dict.txt'
        # Read the dictionary file and build {word: frequency} for words with frequency > 2.
        word_freq = load_dictionary(dict_name)
        # Build the vocabulary trie.
        root = TrieNode('*', word_freq)
        save_model(root, root_name)
    # Load the new articles.
    # filename = 'data/demo.txt'
    filename = 'data/jianzhu.txt'
    # data is a 2-D list: [[tokens of line 1], [tokens of line 2], ...]
def edits1(word):
    # All strings that are one edit (delete, transpose, replace, insert) away from `word`.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in ALPHABET if b]
    inserts = [a + c + b for a, b in splits for c in ALPHABET]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word, forms):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in forms)


if __name__ == "__main__":
    print("Please wait, reading data files...")
    stopwords = get_stopwords()
    forms = get_corrections(FORMS_FILE, stopwords, FORMS_ENCODING)
    corpus_statistics = corpus_stats(CORPUS, len(forms), stopwords)
    error_statistics = error_stats(ERROR_FILE)
    print("Please enter words to correct them.")
    while True:
        try:
            word = input("> ")
            time1 = time()
            # noinspection PyTypeChecker
            corrections = correct_word(
                word, corpus_statistics, error_statistics,
                compute_possible_corrections(word, forms)
                if ENHANCED_VERSION else forms
            )
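# Added sanity check (not from the original script): for a word of length n,
# edits1 enumerates n deletions, n-1 transpositions, 26*n replacements and
# 26*(n+1) insertions, i.e. at most 54*n + 25 strings before set() removes
# duplicates; known_edits2 then keeps only edit-distance-2 candidates that
# actually occur in `forms`, which keeps the search space small.
def _edits1_size_demo():
    word = "speling"  # 7 letters
    candidates = edits1(word)
    # Upper bound before deduplication: 54 * 7 + 25 == 403.
    assert len(candidates) <= 54 * len(word) + 25
    return len(candidates)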