def start_search(bid, tbw_list):
    """
    Once clustering is done the search can start: run st().search_topic,
    create the folder, and delete the folder if the topic is not a hot one.
    :param bid: representative blog-post id of each cluster
    :param tbw_list: combined post attributes of this cluster,
                     [(title1, blog1, word1), (title2, blog2, word2)]
    :return:
    """
    short_dir = create_time_file(tbw_list[0][0])  # create the title folder
    corpus_dir = DOC_DIR + '/' + short_dir + 'uid=' + str(bid) + '.txt'
    print "Generated dynamic directory:", corpus_dir
    txt_file = open(corpus_dir, 'w+')
    bid_like = 0  # like volume of this bid cluster
    bid_forward = 0
    bid_comment = 0
    for tbw_tuple in tbw_list:  # [(title1, blog1, word1), (title2, blog2, word2)]
        print bid, "cluster's post title, post, keywords:", tbw_tuple[0], tbw_tuple[1], tbw_tuple[2]
        total_result_list, total_reason_list, lfc_all_num = st().search_topic(bid, tbw_tuple)
        # total_result_list = [[post id, post time, title, post, keywords,
        #                       originator id, originator name, likes, reposts, comments, origin], ]
        for com_index in range(len(total_result_list)):
            for com in total_result_list[com_index]:
                txt_file.write(str(com) + '\n')
            # total_reason_list = [[古阿陌@洗洗睡@人民日报, 南方报@人民日报, yutsfa@人民日报], ]
            for reason in total_reason_list[com_index]:
                print "Repost reason:", reason
                txt_file.write(str(reason) + '\n')
            txt_file.write('\n')
        txt_file.write('\n\n\n--------------------\n')
        # lfc_all_num = [like volume, repost volume, comment volume of the topic]
        print '*******************************'
        print "Like volume of the topic", lfc_all_num[0]
        print "Repost volume of the topic", lfc_all_num[1]
        print "Comment volume of the topic", lfc_all_num[2]
        bid_like += int(lfc_all_num[0])
        bid_forward += int(lfc_all_num[1])
        bid_comment += int(lfc_all_num[2])
    print tbw_list[0][0], 'topic volume of the aggregated cluster:', bid_like, bid_forward, bid_comment
    txt_file.write(str(bid_like) + '\n' +
                   str(bid_forward) + '\n' +
                   str(bid_comment) + '\n')
    txt_file.close()
    if bid_like + bid_forward + bid_comment == 0:
        deleted_dir = DOC_DIR + '/' + 'topic' + '/' + tbw_list[0][0]
        print "Deleting folder that is not a hot topic:", deleted_dir
        shutil.rmtree(deleted_dir)
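
# A minimal driver sketch, assuming the clustering step yields a dict that
# maps each representative post id (bid, a string, matching the 'tp' + bid
# usage elsewhere in this module) to its (title, blog, word) tuples. The
# `clusters` variable and its contents are illustrative assumptions.
if __name__ == '__main__':
    clusters = {
        '40012345': [(u'title1', u'blog1', u'word1'),
                     (u'title2', u'blog2', u'word2')],
    }
    for bid, tbw_list in clusters.items():
        start_search(bid, tbw_list)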
def start_search(time_dir, one_blog):
    """
    Search the Weibo platform for a single blog post.
    :return:
    """
    total_dir = create_topic_file(time_dir, one_blog)
    print total_dir
    print "Searching and crawling..."
    # [blog_id, ptime, topic, content, user_id, user_name, like_num, rpt_num, cmt_num]
    total_result_list, total_reason_list, total_comment_list, total_participants_list, total_originator, lfc_all_num = st().search_topic(one_blog)
    # Write out the content
    write_forward(total_dir, total_reason_list)
    write_comment(total_dir, total_comment_list)
    write_participants(total_dir, total_participants_list)
    # lfc_all_num = [like volume, repost volume, comment volume of the topic]
    print '*******************************'
    print "Like volume of the topic", lfc_all_num[0]
    print "Repost volume of the topic", lfc_all_num[1]
    print "Comment volume of the topic", lfc_all_num[2]
    blog_id = total_result_list[0]
    content = total_result_list[3]
    post_time = total_result_list[1]
    comment_num = lfc_all_num[2]
    like_num = lfc_all_num[0]
    repost_num = lfc_all_num[1]
    result_dict.setdefault(one_blog, [blog_id, content, total_originator, post_time,
                                      comment_num, repost_num, like_num])
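
# A usage sketch, assuming result_dict is the module-level dict this function
# fills, and that a time directory has already been prepared; the directory
# string and post texts below are illustrative assumptions.
result_dict = {}
time_dir = 'topic/2018-01-01/'  # illustrative; normally produced upstream
for one_blog in [u'post text 1', u'post text 2']:
    start_search(time_dir, one_blog)
for blog, row in result_dict.items():
    # row = [blog_id, content, total_originator, post_time, comment_num, repost_num, like_num]
    print blog, row[0], row[4], row[5], row[6]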
def start_search(one_blog, event_name):
    """
    Search the Weibo platform for a single blog post.
    :return:
    """
    time_dir = create_time_file()
    event_news = event_name + '/' + one_blog
    total_dir = create_topic_file(event_news, time_dir)
    print total_dir
    print "Searching and crawling..."
    total_result_list, total_reason_list, total_comment_list, total_participants_list = st().search_topic(one_blog)
    if len(total_result_list) > 0:
        # Write out the content
        write_forward(total_dir, total_reason_list)
        write_comment(total_dir, total_comment_list)
        write_participants(total_dir, total_participants_list)
        # [[blog_id, content, user_name, user_id, ptime, topic, like_num, rpt_num, cmt_num]]
        blog_id = total_result_list[0][0]
        content = total_result_list[0][1]
        originator = total_result_list[0][2]
        originator_id = total_result_list[0][3]
        post_time = total_result_list[0][4]
        topic = total_result_list[0][5]
        like_num = total_result_list[0][6]
        repost_num = total_result_list[0][7]
        comment_num = total_result_list[0][8]
        print "Post id ", blog_id
        print "Post content ", content
        print "Originator nickname ", originator
        print "Originator id ", originator_id
        print "Post time ", post_time
        print "Topic ", topic
        print "Like count ", like_num
        print "Repost count ", repost_num
        print "Comment count ", comment_num
        total_result_list[0].append(total_dir)
        result_dict.setdefault(one_blog, total_result_list[0])
        # [blog_id, content, originator, originator_id, post_time, topic,
        #  like_num, repost_num, comment_num, total_dir]
    else:
        print "Crawling failed"
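
# A follow-up sketch: ranking crawled posts by total engagement, assuming
# result_dict holds the rows stored above, i.e.
# [blog_id, content, originator, originator_id, post_time, topic,
#  like_num, repost_num, comment_num, total_dir]. The helper name is an
# illustrative assumption.
def rank_by_engagement(result_dict):
    rows = result_dict.values()
    # like_num, repost_num, comment_num sit at indices 6..8
    rows.sort(key=lambda r: int(r[6]) + int(r[7]) + int(r[8]), reverse=True)
    return rows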
def start_search(bid, tbw_list):
    """
    Once clustering is done the search can start: run st().search_topic,
    create the folder, and delete the folder if the topic is not a hot one.
    :param bid: representative blog-post id of each cluster
    :param tbw_list: combined post attributes of this cluster,
                     [(title1, blog1, word1), (title2, blog2, word2)]
    :return:
    """
    db = Database()
    eve = Event()
    con = Content()
    e_id = 'tp' + bid
    short_dir = create_time_file(tbw_list[0][0])  # create the title folder
    corpus_dir = DOC_DIR + '/' + short_dir + 'uid=' + str(bid) + '.txt'
    txt_file = open(corpus_dir, 'w+')
    bid_like = 0  # like volume of this bid cluster
    bid_forward = 0
    bid_comment = 0
    for tbw_tuple in tbw_list:  # [(title1, blog1, word1), (title2, blog2, word2)]
        total_result_list, total_reason_list, lfc_all_num = st().search_topic(bid, tbw_tuple)
        # total_result_list = [[post id, post time, title, post, keywords,
        #                       originator id, originator name, likes, reposts, comments, origin], ]
        for com_index in range(len(total_result_list)):
            # Only the repost reasons are written to the corpus file here.
            # total_reason_list = [[古阿陌@洗洗睡@人民日报, 南方报@人民日报, yutsfa@人民日报], ]
            for reason in total_reason_list[com_index]:
                txt_file.write(str(reason) + '\n')
        # lfc_all_num = [like volume, repost volume, comment volume of the topic]
        bid_like += int(lfc_all_num[0])
        bid_forward += int(lfc_all_num[1])
        bid_comment += int(lfc_all_num[2])
    print '*******************************'
    print tbw_list[0][0], '***topic volume of the aggregated cluster***', bid_like, bid_forward, bid_comment
    txt_file.close()
    if bid_like + bid_forward + bid_comment == 0:
        deleted_dir = DOC_DIR + '/' + 'topic' + '/' + tbw_list[0][0]
        pic_dir = deleted_dir.replace('documents', 'static/sna')
        print "Deleting folder that is not a hot topic:", deleted_dir
        shutil.rmtree(deleted_dir)
        shutil.rmtree(pic_dir)
        eve.delete_event(e_id)
    else:
        # Persist via interface_networkscale() and increment:
        main_network(DOC_DIR + '/' + short_dir)
        corpus_dir = short_dir + 'uid=' + str(bid) + '.txt'
        label_dir = short_dir + 'new_label_link.xls'
        sna_dir = 'sna/' + short_dir + 'SNA.png'
        if os.path.exists(DOC_DIR + '/' + label_dir):
            try:
                leader = find_leader(label_dir)
            except Exception:
                leader = 'Unknown'
        else:
            label_dir = 'None'
            sna_dir = 'None'
            leader = 'Unknown'
        db.save_network_scale(e_id, corpus_dir, label_dir, sna_dir, leader)
        db.save_increment(e_id, bid_comment, bid_forward, bid_like)
        print "storing!!!"
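
# find_leader is called above but not defined here. A minimal sketch of what
# it might do, assuming new_label_link.xls stores one node per row with the
# user label in column 0 and its link count in column 1 (this column layout,
# and the header row, are assumptions):
import xlrd

def find_leader(label_dir):
    sheet = xlrd.open_workbook(DOC_DIR + '/' + label_dir).sheet_by_index(0)
    leader, best = 'Unknown', -1
    for row in range(1, sheet.nrows):  # skip the assumed header row
        links = int(sheet.cell_value(row, 1))
        if links > best:
            leader, best = sheet.cell_value(row, 0), links
    return leader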
def start_search(time_dir, one_blog):
    """
    Search the Weibo platform for a single blog post.
    :return:
    """
    total_dir = create_topic_file(time_dir, one_blog)
    print total_dir
    print "Searching and crawling..."
    total_result_list, total_reason_list, total_comment_list, total_participants_list = st().search_topic(one_blog)
    if len(total_result_list) > 0:
        # Write out the content
        write_forward(total_dir, total_reason_list)
        write_comment(total_dir, total_comment_list)
        write_participants(total_dir, total_participants_list)
        # [[blog_id, content, user_name, user_id, ptime, like_num, rpt_num, cmt_num]]
        blog_id = total_result_list[0][0]
        content = total_result_list[0][1]
        originator = total_result_list[0][2]
        originator_id = total_result_list[0][3]
        post_time = total_result_list[0][4]
        like_num = total_result_list[0][5]
        repost_num = total_result_list[0][6]
        comment_num = total_result_list[0][7]
        print "Post id ", blog_id
        print "Post content ", content
        print "Originator nickname ", originator
        print "Originator id ", originator_id
        print "Post time ", post_time
        print "Like count ", like_num
        print "Repost count ", repost_num
        print "Comment count ", comment_num
        result_dict.setdefault(one_blog, total_result_list[0])
        # [blog_id, content, originator, originator_id, post_time,
        #  like_num, repost_num, comment_num]
    else:
        print "Crawling failed"
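
# write_forward / write_comment / write_participants are used above but not
# shown. A minimal sketch of the writer pattern they appear to share, assuming
# each writer owns one plain-text file inside total_dir (the file name is an
# assumption); it mirrors the str(...) + '\n' writing style used elsewhere in
# this module:
import os

def write_forward(total_dir, total_reason_list):
    out = open(os.path.join(total_dir, 'forward.txt'), 'w')
    for reason in total_reason_list:
        out.write(str(reason) + '\n')
    out.close()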
def start_search(bid, tbw_list):
    """
    Once clustering is done the search can start: run st().search_topic,
    create the folder, and delete the folder if the topic is not a hot one.
    :param bid: representative blog-post id of each cluster
    :param tbw_list: combined post attributes of this cluster,
                     [(title1, blog1, word1), (title2, blog2, word2)]
    :return:
    """
    e_id = "tp" + bid
    short_dir = create_time_file(tbw_list[0][0])  # create the title folder
    corpus_dir = DOC_DIR + "/" + short_dir + "uid=" + str(bid) + ".txt"
    print "Generated dynamic directory:", corpus_dir
    txt_file = open(corpus_dir, "w+")
    bid_like = 0  # like volume of this bid cluster
    bid_forward = 0
    bid_comment = 0
    for tbw_tuple in tbw_list:  # [(title1, blog1, word1), (title2, blog2, word2)]
        print bid, "cluster's post title, post, keywords:", tbw_tuple[0], tbw_tuple[1], tbw_tuple[2]
        total_result_list, total_reason_list, lfc_all_num = st().search_topic(bid, tbw_tuple)
        # total_result_list = [[post id, post time, title, post, keywords,
        #                       originator id, originator name, likes, reposts, comments, origin], ]
        for com_index in range(len(total_result_list)):
            for com in total_result_list[com_index]:
                txt_file.write(str(com) + "\n")
            # total_reason_list = [[古阿陌@洗洗睡@人民日报, 南方报@人民日报, yutsfa@人民日报], ]
            for reason in total_reason_list[com_index]:
                print "Repost reason:", reason
                txt_file.write(str(reason) + "\n")
            txt_file.write("\n")
        txt_file.write("\n\n\n--------------------\n")
        # lfc_all_num = [like volume, repost volume, comment volume of the topic]
        print "*******************************"
        print "Like volume of the topic", lfc_all_num[0]
        print "Repost volume of the topic", lfc_all_num[1]
        print "Comment volume of the topic", lfc_all_num[2]
        bid_like += int(lfc_all_num[0])
        bid_forward += int(lfc_all_num[1])
        bid_comment += int(lfc_all_num[2])
    print tbw_list[0][0], "topic volume of the aggregated cluster:", bid_like, bid_forward, bid_comment
    txt_file.write(str(bid_like) + "\n" + str(bid_forward) + "\n" + str(bid_comment) + "\n")
    txt_file.close()
    if bid_like + bid_forward + bid_comment == 0:
        deleted_dir = DOC_DIR + "/" + "topic" + "/" + tbw_list[0][0]
        print "Deleting folder that is not a hot topic:", deleted_dir
        shutil.rmtree(deleted_dir)
    else:
        now = datetime.datetime.now()
        other_style_time = now.strftime("%Y-%m-%d %H:%M:%S")
        # Fields for interface_networkscale():
        event_id = e_id
        corpus_dir = short_dir
        label_dir = ""  # path of the label file
        sna_dir = ""    # path of the SNA image; may be left empty
        leader = ""     # may be left empty
        # Fields for interface_increment():
        check_time = other_style_time
        comment_num = bid_comment
        repost_num = bid_forward
        like_num = bid_like
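
# A persistence sketch for the fields collected in the else-branch above,
# assuming the Database helper used by the fuller revision of this function,
# whose save_network_scale and save_increment calls appear there with these
# argument orders (check_time is gathered above but is not among the observed
# arguments; the wrapper name is an illustrative assumption):
def persist_topic(event_id, corpus_dir, label_dir, sna_dir, leader,
                  comment_num, repost_num, like_num):
    db = Database()
    db.save_network_scale(event_id, corpus_dir, label_dir, sna_dir, leader)
    db.save_increment(event_id, comment_num, repost_num, like_num)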