def get_all_fans(self, all_sub_id_list, fan_id_dict):
    all_fan_url_list = []
    # build every sub's fan-list URL; the first page is fetched first
    for sub_id in all_sub_id_list:
        fan_url = self.get_sub_fan_first_url(sub_id)
        all_fan_url_list.append(fan_url)
    # fetch all first pages in parallel
    first_page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    mt.runMT("get_all_fans first page", get_html_func, all_fan_url_list,
             mt_create_queue_flag, first_page_result_queue)
    # for every fan-list URL, parse page 1 and collect the remaining page URLs
    other_page_url_list = []
    while not first_page_result_queue.empty():
        fan_url, resp = first_page_result_queue.get()
        fan_id_list = []
        ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list)
        #print("the number of fans found on the first page of %s: %s" % (fan_url, len(fan_id_list)))
        self.store_id_to_dict(fan_id_list, fan_id_dict)
        # ttl_fans_num caps at 999, with max_num_per_page ids per page
        page_ttl = ttl_fans_num / max_num_per_page
        if ttl_fans_num % max_num_per_page > 0:
            page_ttl = page_ttl + 1
        if page_ttl > 1:  # has more pages
            for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl hold more fans
                other_url = self.get_fan_page_from_fan_url(fan_url, pagenum)
                other_page_url_list.append(other_url)
    #print("the number of other fan pages to get: %s" % (len(other_page_url_list)))
    mt.runMT("get_all_fans", get_tudou_json, other_page_url_list, False, None,
             store_to_dict, fan_id_dict)
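# mt.runMT is used throughout this module but its source is not in this
# excerpt. The sketch below is inferred from the call sites alone and is an
# assumption, not the real helper: it fans work_func out over url_list with
# one thread per URL (a real implementation would likely use a bounded
# pool) and either queues (url, result) pairs or hands results to a store
# callback.
import threading
import Queue

def runMT(name, work_func, url_list, create_queue_flag,
          result_queue=None, store_func=None, store_obj=None):
    if create_queue_flag and result_queue is None:
        result_queue = Queue.Queue()
    lock = threading.Lock()

    def worker(url):
        result = work_func(url)
        if result is None:
            return  # fetch failed; skip this url
        if result_queue is not None:
            result_queue.put((url, result))  # Queue.Queue is thread-safe
        elif store_func is not None:
            with lock:  # serialize writes into the shared store_obj
                store_func(result, store_obj)

    threads = [threading.Thread(name=name, target=worker, args=(u,))
               for u in url_list]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return result_queue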
def deep_dig_relation(first_iid, deep_level):
    ttl_iid_set = set()
    playinfo_dict = {}
    has_diged_iid_set = set()
    dig_iid_set = set([first_iid])  # only one seed id, taken from argv
    for i in range(deep_level):  # loop from 0 to deep_level - 1
        this_level = i + 1
        print("scraping %s iids at level %s" % (len(dig_iid_set), this_level))
        url_list = []
        for iid in dig_iid_set:  # build the recommendation-json URL for each iid
            url_list.append(get_tudou_tj_json_url(iid))
        html_result_queue = Queue.Queue()
        mt_create_queue_flag = False
        mt.runMT("deep_dig_relation", get_html_func, url_list,
                 mt_create_queue_flag, html_result_queue)
        # parse every fetched response
        while not html_result_queue.empty():
            _url, resp = html_result_queue.get()
            get_tjinfo_from_tudou_response(resp, ttl_iid_set, playinfo_dict)
        has_diged_iid_set = has_diged_iid_set | dig_iid_set
        # next level digs only the newly found iids; the parentheses matter,
        # since `-` binds tighter than `|` and already-dug iids must not recur
        dig_iid_set = (dig_iid_set | ttl_iid_set) - has_diged_iid_set
    relation_html_name = "relation" + str(first_iid) + "_" + str(deep_level) + ".htm"
    write_relation_result(playinfo_dict, relation_html_name)
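# A hypothetical invocation of deep_dig_relation: crawl the related-video
# graph two hops out from a seed item id. The iid below is a placeholder,
# not a real Tudou item id.
if __name__ == "__main__":
    deep_dig_relation("i12345678", 2)  # writes relationi12345678_2.htm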
def get_fans(self, sub_id):
    all_fan_id_list = []
    # the total page count must come from page 1, so this first fetch cannot be multithreaded
    sub_fan_url = self.getsubfanurl(sub_id)  # the sub's fan-list home URL
    resp = func.GetHttpContent(sub_fan_url)
    if resp is None:
        func._print("failed to get html. The url = %s \n" % (sub_fan_url))
        return all_fan_id_list
    fan_id_list = []
    ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list)
    all_fan_id_list = all_fan_id_list + fan_id_list
    page_ttl = ttl_fans_num / max_num_per_page
    if ttl_fans_num % max_num_per_page > 0:
        page_ttl = page_ttl + 1
    #print("the number of fans found from sub id %s: %s" % (sub_id, len(all_fan_id_list)))
    #print("the number of pages of fans of sub id %s: %s" % (sub_id, page_ttl))
    if page_ttl > 1:
        sub_fan_url_list = []
        for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl hold more fans
            sub_fan_url = self.getsubfanurl_page(sub_id, pagenum)
            sub_fan_url_list.append(sub_fan_url)
        mt.runMT("get_fans", get_tudou_json, sub_fan_url_list, False, None,
                 store_to_list, all_fan_id_list)
    #print("found %s fans from sub id %s" % (len(all_fan_id_list), sub_id))
    return all_fan_id_list
def get_all_subs(self, all_fan_id_list, sub_id_dict):
    all_sub_url_list = []
    # build every fan's subscription-list URL; the first page is fetched first
    for fan_id in all_fan_id_list:
        sub_url = self.get_fan_sub_first_url(fan_id)
        all_sub_url_list.append(sub_url)
    # fetch all first pages in parallel
    first_page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    mt.runMT("get_all_subs", get_html_func, all_sub_url_list,
             mt_create_queue_flag, first_page_result_queue)
    # for every sub-list URL, parse page 1 and collect the remaining page URLs
    other_page_sub_url_list = []
    while not first_page_result_queue.empty():
        sub_url, resp = first_page_result_queue.get()
        sub_id_list = []
        ttl_subs_num = self.get_uid_from_tudou_response(resp, sub_id_list)
        print("the number of subs found on the first page of %s: %s" % (sub_url, len(sub_id_list)))
        self.store_id_to_dict(sub_id_list, sub_id_dict)
        # ttl_subs_num caps at 999, with max_num_per_page ids per page
        page_ttl = ttl_subs_num / max_num_per_page
        if ttl_subs_num % max_num_per_page > 0:
            page_ttl = page_ttl + 1
        if page_ttl > 1:  # has more pages
            for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl hold more subs
                other_sub_url = self.get_sub_page_from_sub_url(sub_url, pagenum)
                other_page_sub_url_list.append(other_sub_url)
    print("the number of other sub pages to get: %s" % (len(other_page_sub_url_list)))
    mt.runMT("get_all_subs 2", get_tudou_json, other_page_sub_url_list, False,
             None, store_to_dict, sub_id_dict)
    # leftover from the pre-multithreading serial path, superseded by the calls above:
    #this_counter = self.get_subs(fan_id, sub_id_dict)
    #all_sub_id_counter = all_sub_id_counter + this_counter
def mt_get_html_and_parser(url_dict, xpath_list):
    page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    url_list = url_dict.keys()
    mt.runMT("mt_get_html_and_parser", get_html_func, url_list,
             mt_create_queue_flag, page_result_queue)
    # for every fetched page, run its XPath expressions
    all_result_list = []
    while not page_result_queue.empty():
        url, html = page_result_queue.get()
        #html = html.encode('UTF-8')
        tree = etree.HTML(html)
        idx = url_dict[url]
        result_list = [url]
        for xpath in xpath_list[idx]:
            r = tree.xpath(xpath)
            if len(r) == 0:
                #print("empty result extracted from the url = %s, xpath = %s\n" % (url, xpath))
                result_list.append("extract failed or not found!")
                continue
            elif len(r) > 1:
                print("the url = %s gave %s results for xpath = %s; using the first\n"
                      % (url, len(r), xpath))
            item = r[0].text.strip()
            result_list.append(item)
        if len(result_list) > 1:  # at least one field was extracted
            all_result_list.append(result_list)
    return all_result_list
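# Hypothetical usage of mt_get_html_and_parser: url_dict maps each URL to an
# index into xpath_list, so different pages can use different XPath sets.
# The URLs and XPath expressions below are illustrative placeholders.
url_dict = {
    "http://example.com/item/1": 0,
    "http://example.com/item/2": 1,
}
xpath_list = [
    ['//h1[@class="title"]', '//span[@class="price"]'],
    ['//h1[@id="name"]', '//em[@id="sale-price"]'],
]
for row in mt_get_html_and_parser(url_dict, xpath_list):
    print(row)  # [url, field_1, field_2, ...]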
def mt_get_html_and_parser(url_dict, target_list):
    global target_event
    target_event.clear()
    #print("%s : start checking ....." % (time.strftime("%Y/%m/%d %H:%M:%S")))
    page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    url_list = url_dict.keys()
    mt.runMT("mt_get_html_and_parser", get_html_func, url_list,
             mt_create_queue_flag, page_result_queue)
    # for every fetched page, run both checks against its target
    while not page_result_queue.empty():
        url, html = page_result_queue.get()
        #print html
        idx = url_dict[url]
        target = target_list[idx]
        r1 = target.judge_have_stock(html)
        r2 = target.judge_have_low_price(html)
    # signal any waiters that this round of checks is complete
    target_event.set()
def write_sub_info(self, sub_id_dict, file_name, file_mode='w'):
    # sort sub ids by focus value, highest first
    sub_id_focus_pair_list = sorted(sub_id_dict.items(),
                                    key=lambda x: x[1], reverse=True)
    all_sub_url_list = []
    for _pair in sub_id_focus_pair_list:
        sub_id = _pair[0]
        sub_url = self.getsubinfourl(sub_id)
        all_sub_url_list.append(sub_url)
    # fetch every sub's info page in parallel
    sub_page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    mt.runMT("write_sub_info", get_html_func, all_sub_url_list,
             mt_create_queue_flag, sub_page_result_queue)
    # extract the display name from every fetched page
    sub_url_name_dict = {}
    id_name_xpath = '//*[@id="topTitle"]/h3'
    while not sub_page_result_queue.empty():
        _url, html = sub_page_result_queue.get()
        tree = etree.HTML(html)
        r_list = tree.xpath(id_name_xpath)
        if len(r_list) > 0:
            id_name = r_list[0].text.strip()
            sub_url_name_dict[_url] = id_name
    fp = open(file_name, file_mode)
    for _pair in sub_id_focus_pair_list:
        sub_id = _pair[0]
        focus_num = _pair[1]
        sub_url = self.getsubinfourl(sub_id)
        id_name = str(sub_id)  # fall back to the raw id when no name was extracted
        if sub_url in sub_url_name_dict:
            id_name = sub_url_name_dict[sub_url]
        fp.write('<a href="' + sub_url + '" target="_blank">'
                 + str(id_name.encode("GBK", 'ignore')) + ' </a> '
                 + str(focus_num))
        fp.write('<br>')
    fp.close()
    print("%s subs' urls and focus values written to %s!"
          % (len(sub_id_focus_pair_list), file_name))
    return sub_id_focus_pair_list
def mt_get_html_and_parser(url_dict, xpath_list, currency_list):
    page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    url_list = url_dict.keys()
    mt.runMT("mt_get_html_and_parser", get_html_func, url_list,
             mt_create_queue_flag, page_result_queue)
    # for every fetched page, run its XPath expressions
    all_result_list = []
    while not page_result_queue.empty():
        url, html = page_result_queue.get()
        #html = html.decode('UTF-8', 'replace')  # for display as gbk in a windows cmd shell
        tree = etree.HTML(html)
        idx = url_dict[url]
        result_list = [url]
        cur_rate = currency_list[idx]
        for i in range(len(xpath_list[idx])):
            xpath = xpath_list[idx][i]
            r_list = tree.xpath(xpath)
            if len(r_list) == 0:
                print("%s \n extracted 0 results \n xpath = %s\n" % (url, xpath))
                #result_list.append("extract failed or not found!")
                continue
            elif len(r_list) > 1:
                print("the url = %s gave %s results for xpath = %s\n"
                      % (url, len(r_list), xpath))
            for r in r_list:
                item = r.text.strip()
                # prices start from the second xpath and need currency conversion
                result_list.append(item)
                if i == 1:  # the xpath at index 1 is the price
                    item = "RMB " + change_currency(item, cur_rate)
                    result_list.append(item)
                #c = judge_currency(item)
                #if c is not None:
                #    item = item + " = RMB " + _change_currency(item, c)
        if len(result_list) > 1:  # at least one field was extracted
            all_result_list.append(result_list)
    return all_result_list
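# change_currency is referenced above but not defined in this excerpt. A
# minimal sketch, assuming item is a price string such as "$12.99" and
# cur_rate is the RMB-per-unit exchange rate; both the signature and the
# semantics are assumptions.
import re

def change_currency(item, cur_rate):
    digits = re.sub(r'[^0-9.]', '', item)  # keep only the numeric part
    try:
        value = float(digits)
    except ValueError:
        return item  # leave unparseable prices untouched
    return "%.2f" % (value * float(cur_rate))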