# -*- coding: utf-8 -*-
import json
import logging
import os
import time
from datetime import datetime

# Helpers such as unicode2str, text_segment, load_stopwords, search and the
# data_dir / *_path globals are defined elsewhere in this repo.


def generate_newsid2pubtime(corpus_file):
    '''
    Build a mapping from each article id in the raw corpus to its publish
    time, converted to a unix timestamp.
    '''
    print 'run generate_newsid2pubtime...'
    id2pubtime = {}
    f = open(corpus_file, 'r')
    line = f.readline()
    while line:
        parts = line.strip().split('\t')
        id_ = parts[1].strip()
        pubtime_str = parts[5].strip()
        if pubtime_str.lower() == 'null':
            line = f.readline()
            continue
        if ':' not in pubtime_str:
            # some records lack the hour/minute part
            pubtime_str += '00:00'
        pub_time = datetime.strptime(unicode2str(pubtime_str),
                                     unicode2str(u'%Y年%m月%d日%H:%M'))
        pub_time = int(time.mktime(pub_time.timetuple()))
        id2pubtime[id_] = pub_time
        line = f.readline()
    f.close()
    print 'finish generating %s id2pubtime elements' % len(id2pubtime)
    return id2pubtime
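# The helper unicode2str is used throughout this section but not shown.
# Below is a minimal sketch of what it is assumed to do (encode unicode to
# a UTF-8 byte string, pass byte strings through unchanged); hypothetical,
# the repo's own implementation may differ:
def unicode2str(text):
    if isinstance(text, unicode):
        return text.encode('utf-8')
    return text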
def segment_train_data(train_data_path, saved_file):
    '''
    Segment the title and content fields of the training data, drop
    stopwords, de-duplicate by newsid and save the result to saved_file.
    '''
    stopwords = load_stopwords()

    def filter_stopwords(words):
        return [r for r in words if r not in stopwords]

    print 'run segment_train_data...'
    f = open(train_data_path, 'r')
    line = f.readline()
    start_time = time.time()
    res = []
    cnt = 0
    newsids = set()
    round_start = time.time()
    while line:
        parts = line.strip().split('\t')
        # skip articles whose newsid has been seen already
        if parts[1].strip() in newsids:
            line = f.readline()
            continue
        newsids.add(parts[1].strip())
        cnt += 1
        parts[3] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[3], is_ret_utf8=True))))
        parts[4] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[4], is_ret_utf8=True))))
        res.append('\t\t'.join(parts))
        if cnt % 1000 == 0:
            round_cost = time.time() - round_start
            round_start = time.time()
            print 'segmenting %s, cost %.3fs, aver=%.3fs' % (
                cnt, round_cost, round_cost / 1000.0)
        line = f.readline()
    f.close()
    end_time = time.time()
    total_cost = (end_time - start_time) / 60.0
    aver_cost = total_cost / float(cnt)
    print 'segmenting all %s records, total cost=%.3fmin, average=%.3fmin' % (
        cnt, total_cost, aver_cost)
    fw = open(data_dir + saved_file, 'w+')
    fw.write('\n'.join(res))
    fw.close()
    print 'res is saved in %s' % saved_file
def segment_train_data(train_data_path, saved_original_file, saved_unique_file):
    '''
    Segment the title and content fields and remove stopwords.
    Two files are generated:
    1. a segmented result that keeps the structure of the original data;
    2. a new data set de-duplicated by newsid, used for indexing
       (unique_news).
    '''
    stopwords = load_stopwords()

    def filter_stopwords(words):
        return [r for r in words if r not in stopwords]

    print 'run segment_train_data...'
    f = open(train_data_path, 'r')
    line = f.readline()
    nid2tc, original_news, unique_news, newsids = {}, [], [], set()
    start_time, round_start, cnt = time.time(), time.time(), 0
    while line:
        parts = line.strip().split('\t')
        # parts is appended by reference: for the first occurrence of a
        # newsid the segmented fields below also show up in original_news;
        # duplicates are filled in later from nid2tc by save_original_data
        original_news.append(parts)
        nid = parts[1].strip()
        if nid in newsids:
            line = f.readline()
            continue
        newsids.add(nid)
        cnt += 1
        parts[3] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[3], is_ret_utf8=True))))
        parts[4] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[4], is_ret_utf8=True))))
        unique_news.append('\t'.join(parts))
        nid2tc[nid] = (parts[3], parts[4])
        if cnt % 1000 == 0:
            round_cost = time.time() - round_start
            round_start = time.time()
            print 'segmenting %s, cost %.3fs, aver=%.3fs' % (
                cnt, round_cost, round_cost / 1000.0)
        line = f.readline()
    f.close()
    end_time = time.time()
    total_cost = (end_time - start_time) / 60.0
    aver_cost = total_cost / float(cnt)
    print 'segmenting all %s records(cnt=%s), total cost=%.3fmin, average=%.3fmin' % (
        len(unique_news), cnt, total_cost, aver_cost)
    save_original_data(original_news, nid2tc, saved_original_file)
    save_unique_data(unique_news, saved_unique_file)
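# save_original_data and save_unique_data are not shown in this section.
# Minimal sketches of what they are assumed to do, inferred from how they
# are called above; hypothetical, the repo's own versions may differ:
def save_unique_data(unique_news, saved_unique_file):
    # each element of unique_news is already a tab-joined record
    fw = open(data_dir + saved_unique_file, 'w+')
    fw.write('\n'.join(unique_news))
    fw.close()


def save_original_data(original_news, nid2tc, saved_original_file):
    # fill every record (duplicates included) with the segmented
    # title/content of its newsid, keeping the original row order
    res = []
    for parts in original_news:
        nid = parts[1].strip()
        if nid in nid2tc:
            parts[3], parts[4] = nid2tc[nid]
        res.append('\t'.join(parts))
    fw = open(data_dir + saved_original_file, 'w+')
    fw.write('\n'.join(res))
    fw.close()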
def cut(self, text):
    '''Tokenize text via the SAE segmentation service, yielding
    (word, start_pos, end_pos) tuples.'''
    text = unicode2str(text)
    content = self._request(text)
    words = json.loads(content, encoding='UTF-8')
    start_pos = 0
    for word in words:
        try:
            yield word['word'], start_pos, start_pos + len(word['word'])
            start_pos += len(word['word'])
        except Exception, e:
            # sae analyzer unknown exception
            logging.warn('sae tokenizer error: %s', e)
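# A hedged usage sketch for the cut generator above, assuming it is a
# method of a tokenizer class wrapping the SAE segmentation service (the
# class itself is not shown in this section, and SaeTokenizer is a
# hypothetical name):
#
#   tokenizer = SaeTokenizer()
#   for word, start, end in tokenizer.cut(u'今天天气不错'):
#       print word, start, end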
def get_recommend_news_by_tfidf_sim():
    '''
    Using the tfidf-based user profile and each article's top keywords
    (topN, set to 20), pick the TopN most similar articles from each
    user's candidate articles.
    '''
    topN = 20
    uids = os.listdir(user_keywords_by_tfidf)
    uid2can_newsids = get_user_candidate_newsids(user_candidate_newsids_path)
    user_recommend_res = []
    cnt = 0
    for uid in uids:
        cnt += 1
        if cnt % 100 == 0:
            print 'recommend %d user: %s' % (cnt, uid)
        user_terms = get_user_tfidf_terms(
            os.path.join(user_keywords_by_tfidf, uid), topN)
        candidate_newsids = uid2can_newsids.get(uid, [])
        if not candidate_newsids:
            continue
        candidate_news_top_terms = get_news_top_terms(candidate_newsids, topN)
        # can_news_vectors is aligned one-to-one with candidate_newsids
        user_vector, can_news_vectors = generate_feature_vectors(
            user_terms, candidate_news_top_terms, topN)
        # the sklearn call computes the cosine distance between the user
        # and all candidate news in one shot; note that it returns
        # 1 - cosine(v1, v2), so a smaller value means more similar
        user_news_distances = distance(user_vector, Y=can_news_vectors,
                                       metric='cosine')
        user_news_distances = zip(candidate_newsids,
                                  user_news_distances.tolist()[0])
        user_news_distances = sorted(user_news_distances, key=lambda d: d[1])
        user_recommend_res.append(
            (uid, [nid for nid, d in user_news_distances][:REC_NUM]))
    fw = open(recommend_res_path, 'w+')
    fw.write('userid,newsid\n')
    cnt = 0
    for uid, rec_news in user_recommend_res:
        cnt += 1
        if cnt % 100 == 0:
            print 'finish %d user: %s, %s' % (cnt, uid, ' '.join(rec_news))
        fw.write('\n'.join(
            [','.join((uid, unicode2str(nid))) for nid in rec_news]))
        fw.write('\n')
    fw.close()
    print 'finish recommending, res saved in %s' % recommend_res_path
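# distance above is assumed to be sklearn.metrics.pairwise.pairwise_distances.
# generate_feature_vectors is not shown in this section; below is a minimal
# sketch under the assumption that it builds weight vectors over the union
# of the user's and the candidates' topN terms (hypothetical; the repo's
# own version may differ):
def generate_feature_vectors(user_terms, candidate_news_top_terms, topN):
    # user_terms and each candidate entry: list of (term, weight) pairs
    vocab = [t for t, w in user_terms]
    for news_terms in candidate_news_top_terms:
        vocab.extend(t for t, w in news_terms)
    vocab = list(set(vocab))
    term2idx = dict((t, i) for i, t in enumerate(vocab))

    def to_vector(term_weights):
        vec = [0.0] * len(vocab)
        for t, w in term_weights:
            vec[term2idx[t]] = float(w)
        return vec

    user_vector = [to_vector(user_terms)]  # shape (1, len(vocab))
    can_news_vectors = [to_vector(nt) for nt in candidate_news_top_terms]
    return user_vector, can_news_vectors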
def generate_tfidf(corpus_path, tfidf_dir):
    '''
    Read the corpus and document ids from the segmented, stopword-free
    document set, compute the tf-idf values, and save each document's
    result to a file named after its document id.
    '''
    newsids, corpus = generate_copurs_from_file(corpus_path)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    terms = vectorizer.get_feature_names()
    weights = tfidf.toarray()
    for i, id_ in enumerate(newsids):
        each_tf_idf = []
        save_file = tfidf_dir + id_
        if i % 100 == 0:
            print i
        # keep only terms with a non-negligible weight, sorted descending
        for j, term in enumerate(terms):
            if weights[i][j] > 0.0001:
                each_tf_idf.append((term, weights[i][j]))
        each_tf_idf = sorted(each_tf_idf, key=lambda d: d[1], reverse=True)
        fw = open(save_file, 'w+')
        fw.write('\n'.join(['%s,%s' % (unicode2str(t), w)
                            for t, w in each_tf_idf]))
        fw.close()
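# vectorizer and transformer are module-level globals in this repo. A
# minimal sketch of how they are assumed to be built with scikit-learn,
# given the fit_transform chaining above (hypothetical; the actual
# construction and parameters may differ):
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer()    # term counts over the space-joined, segmented corpus
transformer = TfidfTransformer()  # turns the count matrix into tf-idf weights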
def get_recommend_news_by_xapian():
    '''
    For each user's keywords, search with xapian; the number of results
    fetched per user is the user's read count + N (tentatively 5).
    '''
    print 'run get_recommend_news...'
    user_keywords = get_user_keywords()
    uid2newsids = get_user_read_list()

    def generate_query_str(keywords):
        return ' '.join(['title:%s content:%s' % (k, k) for k in keywords])

    rec_res = []
    for uid, keywords in user_keywords.items():
        if not keywords:
            continue
        if uid not in uid2newsids:
            continue
        query_str = generate_query_str(keywords)
        read_news = set(uid2newsids.get(uid, []))
        read_num = len(read_news)
        search_res = search(indexed_file_path, query_str,
                            ret_num=read_num + REC_NUM)
        # drop news the user has already read, keep the top REC_NUM
        user_rec_news = [r for r in search_res if r not in read_news]
        rec_res.append((uid, user_rec_news[:REC_NUM]))
    f = open(recommend_res_path, 'w+')
    f.write('userid,newsid\n')
    for uid, rec_news in rec_res:
        f.write('\n'.join(
            [','.join((uid, unicode2str(nid))) for nid in rec_news]))
        f.write('\n')
    f.close()
    print 'finish recommending, res saved in %s' % recommend_res_path
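# search() wraps the xapian query side and is not shown in this section.
# A minimal sketch using the xapian Python bindings, assuming the index
# maps the title:/content: field names to term prefixes and stores the
# newsid as document data (hypothetical; the prefix names must match how
# the index was actually built):
import xapian

def search(db_path, query_str, ret_num=10):
    db = xapian.Database(db_path)
    enquire = xapian.Enquire(db)
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.add_prefix('title', 'T')    # assumed term prefixes
    qp.add_prefix('content', 'C')
    enquire.set_query(qp.parse_query(query_str))
    mset = enquire.get_mset(0, ret_num)
    return [m.document.get_data() for m in mset]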
def save_uid_terms(saved_file, uid_terms_weights):
    '''Save a user's (term, weight) pairs, one "term,weight" per line.'''
    fw = open(saved_file, 'w+')
    fw.write('\n'.join(['%s,%s' % (unicode2str(t), w)
                        for t, w in uid_terms_weights]))
    fw.close()