def preprocess(fin_path):
    # Build one SentenceNodeManager per product id from a comma-separated comment file.
    manager_groups = {}
    with open(fin_path, 'r') as fin:
        for line in fin:
            splits = line.split(',')
            # keep only the digits of the product-id field
            aa = re.findall('\d+', splits[0])[0]
            pid = int(aa)
            try:
                ll = splits[3].replace('\n', '')
                if pid in manager_groups:
                    manager_groups[pid].add_node(SentenceNode(splits[4], extra=int(ll)))
                else:
                    snm = SentenceNodeManager()
                    snm.add_node(SentenceNode(splits[4], extra=int(ll)))
                    manager_groups[pid] = snm
            except Exception as e:
                print e
                print line
    return manager_groups
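# Usage sketch (assumption: the input CSV follows the "pid,creationTime,nickname,label,content"
# row layout written by save_csv() in this project; the file path below is hypothetical).
def _preprocess_demo():
    groups = preprocess('../data/csv/sample-comments.csv')
    for pid, manager in groups.items():
        # one SentenceNodeManager per product id, holding that product's comment sentences
        print pid, len(manager.node_list)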
def text_rank(sentences, num=10, sim_func=get_similarity, pagerank_config={'alpha': 0.85, }):
    """Sort sentences by importance, from most to least important.

    Keyword arguments:
    sentences       -- list of sentences
    sim_func        -- function that computes the similarity of two sentences,
                       given two lists of words
    pagerank_config -- settings passed to networkx's pagerank
    """

    def cal_cos(vec1, vec2):
        # cosine similarity between two equal-length vectors
        def __cal_vecmod(vec):
            mod_result = 0.0
            for i in vec:
                mod_result += i ** 2
            return math.sqrt(mod_result)

        # numerator: dot product of the two vectors
        fenzi = 0.0
        for i in xrange(len(vec1)):
            fenzi += vec1[i] * vec2[i]
        # denominator: product of the two vector norms
        fenmu = __cal_vecmod(vec1) * __cal_vecmod(vec2)
        if fenmu == 0:
            return 0
        else:
            return fenzi / fenmu

    sorted_sentences = []
    snm = SentenceNodeManager()
    for sent in sentences:
        snode = SentenceNode(sent)
        snm.add_node(snode)
    snm.normalize_all_sentnodes()
    vlist = snm.get_vec_list()
    # _source = words

    # build a symmetric similarity graph over all sentence vectors
    sentences_num = len(vlist)
    graph = np.zeros((sentences_num, sentences_num))
    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = cal_cos(vlist[x], vlist[y])
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)  # this is a dict
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences[:num]
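# Usage sketch for text_rank(): the sentences below are made-up placeholders (this assumes
# SentenceNode's default tokenizer handles their language). Each returned item is an AttrDict
# exposing .sentence and .weight, ordered from most to least important.
def _text_rank_demo():
    sentences = ['the battery lasts two days',
                 'battery life is really good',
                 'the screen scratches too easily']
    for item in text_rank(sentences, num=2):
        print item.weight, item.sentence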
def save_csv(jd_result, fout_path=None):
    if not fout_path:
        fout_path = jd_result['title'][:10]
    with open('../data/csv/%s-%s.csv' % (arrow.utcnow().timestamp, fout_path), 'w') as fout:
        fout.write(codecs.BOM_UTF8)
        for item in jd_result['comments']:
            fout.write('%s,%s,%s,0,%s\n' % (
                jd_result['pid'], item['creationTime'], item['nickname'],
                item['content'].replace('\n', '')))
    print 'saved! path = %s' % fout_path


if __name__ == '__main__':
    ITEM_ID = 411082
    MAXPAGE = 20
    # RES = get_jd_rate(ITEM_ID,MAXPAGE)
    jd_res = jd_parser(ITEM_ID, MAXPAGE)
    open('%s-%s-parser_result.json' % (arrow.utcnow().timestamp, jd_res['title'][:10]), 'w').write(
        json.dumps(jd_res, ensure_ascii=False))
    save_csv(jd_res)

    # add sentence
    # st = open('parser_result.json', 'r').read()
    # jd_res = json.loads(st)
    snm = SentenceNodeManager()
    for item in jd_res['comments']:
        snm.add_node(SentenceNode(item['content']))

    import utils.node_vec_utils.node_cluster_utils as CU

    CU.APcluster(snm, '../data/clusters/%s-APresult.json' % jd_res['title'][:10])
    CU.DBSCANcluster(snm, '../data/clusters/%s-DBSCANresult.json' % jd_res['title'][:10])
def amazon_preprocess(start=0, end=10, label_rate=0.65, min_vote=0):
    """
    :param start: start index into the shuffled asin list
    :param end: end index into the shuffled asin list
    :param label_rate: VOTE RANK threshold; reviews scoring >= label_rate are labelled 1, the rest 0
    :param min_vote: only reviews with total_vote greater than this value are loaded
    :return: veclist, sentlist, labellist, tokenlist, nodelist, manager_groups
    """
    # prepare train set
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    # print len(db_inst.distinct('asin'))
    manager_groups = {}
    asin_file = open('%s/process/data/asin.list' % PROJECT_PATH, 'r')
    # for asin in db_inst.distinct('asin'):
    #     asin_file.write('%s\n' % asin)
    lines = asin_file.readlines()
    shuffle(lines)
    # for asin in db_inst.distinct('asin'):
    tlines = lines[start:end]
    review_dicts = {}
    asin_list = []
    for asin in tlines:
        asin = asin.replace('\n', '')
        asin_list.append(asin)
        print 'loading %s' % asin
        # snm.add_node(SentenceNode(splits[4], extra=int(ll)))
        # collect the reviews for this APP
        a_reviews = []
        max_vote = 0  # constant
        for find_item in db_inst.find({"asin": asin, 'total_vote': {"$gt": min_vote}}):
            max_vote = max(find_item['total_vote'], max_vote)
            a_reviews.append(find_item)

        # process item reviews: VOTE RANK
        review_rank = []
        print '%s has %s reviews' % (asin, len(a_reviews))
        snm = SentenceNodeManager()
        for review in a_reviews:
            alpha_const = 0
            T = float(review['total_vote']) / max_vote
            V = 1 / (1.0 + math.exp(-0.01 * (2 * review['up_vote'] - review['total_vote'])))
            # V = float(review['up_vote']) / review['total_vote']
            vote_rank_value = 2 * (T + alpha_const) * (V + alpha_const) / (T + V + 2 * alpha_const)
            if vote_rank_value >= label_rate:
                snm.add_node(
                    SentenceNode(review['reviewText'].lower(),
                                 extra=(int(1), vote_rank_value, review['reviewerID']),
                                 get_pos_func=tag_sents, get_keywords_func=cal_en_tfidf))
            elif vote_rank_value < label_rate:
                snm.add_node(
                    SentenceNode(review['reviewText'].lower(),
                                 extra=(int(0), vote_rank_value, review['reviewerID']),
                                 get_pos_func=tag_sents, get_keywords_func=cal_en_tfidf))
            review_rank.append((review, vote_rank_value))
        manager_groups[asin] = snm
        review_dicts[asin] = review_rank
        # else:
        #     break

    veclist = []
    sentlist = []
    labellist = []
    tokenlist = []
    nodelist = []
    group_nodelist = []
    print 'start normalizing vecs'
    for pid in manager_groups.keys():
        manager = manager_groups[pid]
        # DBSCANcluster(manager, '%s_DBSCANcluster.json' % pid)
        # APcluster(manager, '%s_APcluster.json' % pid)
        manager.normalize_all_sentnodes(tfidf_func=tag_sents)
        veclist.extend(manager.get_vec_list())
        sentlist.extend(manager.get_sent_list())
        gnodelist = []
        for node in manager.node_list:
            labellist.append(node.extra[0])
            tokenlist.append(node.feature2token())
            nodelist.append(node)
            gnodelist.append(node)
        group_nodelist.append(gnodelist)
    print 'end normalizing vecs'
    return veclist, sentlist, labellist, tokenlist, nodelist, manager_groups
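# Worked sketch of the VOTE RANK score computed above. With alpha_const = 0 it is the
# harmonic mean of T (a review's vote count relative to the most-voted review of the app)
# and V (a sigmoid of the up-vote margin). The numbers below are made up for illustration.
def _vote_rank_demo():
    import math
    max_vote = 200                 # most-voted review of this app
    total_vote, up_vote = 50, 40   # hypothetical review
    T = float(total_vote) / max_vote                               # 0.25
    V = 1 / (1.0 + math.exp(-0.01 * (2 * up_vote - total_vote)))  # ~0.574
    vote_rank_value = 2 * T * V / (T + V)                          # ~0.348 -> label 0 at label_rate=0.65
    print vote_rank_value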