def get_subs(self, fan_id, sub_id_dict):
    """Collect the ids of all subscriptions ("subs") of fan *fan_id*.

    Every id found is stored into *sub_id_dict* via store_id_to_dict.
    Returns the number of ids scraped (0 when the first page fetch fails).
    """
    fan_sub_url = self.get_fan_sub_first_url(fan_id)
    # NOTE(review): every other caller in this file passes the url alone to
    # GetHttpContent; the old two-argument call ("GET", url) matches the
    # reported "get nothing" failure, so it was aligned with the rest.
    resp = func.GetHttpContent(fan_sub_url)
    if resp is None:
        func._print("failed get html .The url = %s \n" % (fan_sub_url))
        return 0
    sub_id_list = []
    ttl_subs_num = self.get_uid_from_tudou_response(resp, sub_id_list, pagenum=1)
    self.store_id_to_dict(sub_id_list, sub_id_dict)
    # ttl_subs_num max is 999, max_num_per_page ids per page
    page_ttl = ttl_subs_num / max_num_per_page
    if (ttl_subs_num % max_num_per_page > 0):
        page_ttl = page_ttl + 1
    sub_id_counter = len(sub_id_list)
    for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl hold more subs
        fan_sub_url = self.get_sub_page(fan_id, pagenum)
        resp = func.GetHttpContent(fan_sub_url)
        if resp is None:
            func._print("failed get html .The url = %s \n" % (fan_sub_url))
            continue  # best-effort: skip a failed page, keep scraping
        sub_id_list = []
        self.get_uid_from_tudou_response(resp, sub_id_list, pagenum)
        self.store_id_to_dict(sub_id_list, sub_id_dict)
        sub_id_counter = sub_id_counter + len(sub_id_list)
    # BUG FIX: the old format string had three %s placeholders but only two
    # arguments, raising TypeError ("not enough arguments") at runtime.
    print("scrapy %s subs found from the fan id %s " % (sub_id_counter, fan_id))
    return sub_id_counter
def get_sub_fan(self, sub_id):
    """Scrape the ids of all fans of subscription *sub_id* (sequential).

    Fetches the first fan page, derives the page count from the reported
    fan total, then walks the remaining pages one by one.
    Returns the accumulated list of fan ids; [] when the first page cannot
    be fetched (pages that fail later are skipped).
    """
    all_fan_id_list = []
    sub_fan_url = self.getsubfanurl(sub_id)
    # get fan's home url from sub_fan_url
    resp = func.GetHttpContent(sub_fan_url)
    if resp is None:
        func._print("failed get html .The url = %s \n" % (sub_fan_url))
        # FIX: return an empty list instead of the old bare return (None)
        # so callers can always treat the result as a list
        return all_fan_id_list
    fan_id_list = []
    ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list, pagenum=1)
    all_fan_id_list = all_fan_id_list + fan_id_list
    page_ttl = ttl_fans_num / max_num_per_page
    if (ttl_fans_num % max_num_per_page > 0):
        page_ttl = page_ttl + 1
    for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl hold more fans
        sub_fan_url = self.getsubfanurl_page(sub_id, pagenum)
        resp = func.GetHttpContent(sub_fan_url)
        if resp is None:
            func._print("failed get html .The url = %s \n" % (sub_fan_url))
            continue  # best-effort: skip a failed page
        fan_id_list = []
        self.get_uid_from_tudou_response(resp, fan_id_list, pagenum)
        all_fan_id_list = all_fan_id_list + fan_id_list
    print("the number of fans found from sub id %s: %s " % (sub_id, len(all_fan_id_list)))
    return all_fan_id_list
def get_fans(self, sub_id):
    """Scrape the ids of all fans of subscription *sub_id* (multi-threaded).

    The first page must be fetched synchronously to learn the total page
    count; the remaining pages are then fetched concurrently through
    mt.runMT, which appends results into *all_fan_id_list* via store_to_list.
    Returns the accumulated list of fan ids; [] when the first page cannot
    be fetched.
    """
    all_fan_id_list = []
    # the page count has to be known before fanning out, so this first
    # request cannot be multi-threaded (original note: 要获取到页数,无法mt)
    sub_fan_url = self.getsubfanurl(sub_id)
    # get fan's home url from sub_fan_url
    resp = func.GetHttpContent(sub_fan_url)
    if resp is None:
        func._print("failed get html .The url = %s \n" % (sub_fan_url))
        # FIX: empty list instead of bare return (None) for a uniform
        # return type, matching the success path
        return all_fan_id_list
    fan_id_list = []
    ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list)
    all_fan_id_list = all_fan_id_list + fan_id_list
    page_ttl = ttl_fans_num / max_num_per_page
    if (ttl_fans_num % max_num_per_page > 0):
        page_ttl = page_ttl + 1
    if (page_ttl > 1):
        sub_fan_url_list = []
        for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl hold more fans
            sub_fan_url_list.append(self.getsubfanurl_page(sub_id, pagenum))
        mt.runMT("get_fans", get_tudou_json, sub_fan_url_list, False, None, store_to_list, all_fan_id_list)
    return all_fan_id_list
def extract(the_url, _xpath_list): html = '' try: html = func.GetHttpContent(the_url) #f = open(file_name, 'w') #f.write(html) #f.close #html = open(file_name).read()#.decode(decoding) #print html except: pass if html is None: return tree = etree.HTML(html) item1 = tree.xpath(_xpath_list[0]) item2 = tree.xpath(_xpath_list[1]) #item3 = tree.xpath(xpath_list[2]) items_dict = dict(zip(item1, item2)) item_no = 0 print(the_url) for goods_name, price in items_dict.iteritems(): item_no = item_no + 1 print "%-3s %-30s \t %s" % (item_no, goods_name.text, price.text)
def get_tudou_json(_url):
    """Fetch the tudou json feed at *_url* and return the uids it lists.

    Returns a list with the 'uid' field of every entry in
    data.dataList, or None when the http fetch fails.
    """
    resp = func.GetHttpContent(_url)
    if resp is None:
        func._print("failed get html .The url = %s \n" % (_url))
        return None
    jsn = json.loads(resp)
    # the unused read of jsn['data']['total'] and the builtin-shadowing
    # local name 'id' were dropped; a comprehension collects the uids
    return [each_data['uid'] for each_data in jsn['data']['dataList']]
def get_html_func(_url):
    """Fetch *_url* and hand back the raw response body.

    Logs a failure message when the fetch yields nothing; the result
    (possibly None) is returned to the caller either way.
    """
    content = func.GetHttpContent(_url)
    if content is None:
        func._print("failed get html .The url = %s \n" % (_url))
    return content
#print json_txt jsn = json.loads(resp) ttl_num = jsn['data']['total'] for each_data in jsn['data']['dataList']: id = each_data['uid'] id_list.append(id) return id_list td_first_video_url = 'http://www.tudou.com/programs/view/KCByQcmiCHc/' td = 'http://tdrec.youku.com/tjpt/tdrec?encode=utf-8&count=20&juid=019fd7i8t9vb9&pcode=20000300&userid=68259393&itemid=202693301&_=1432960070475' td_url = 'http://tdrec.youku.com/tjpt/tdrec?encode=utf-8&count=20&itemid=202693301&pcode=20000300' yk = 'http://ykrec.youku.com/video/packed/list.json?guid=1425299560757n5F&vid=227687152&sid=0&cate=90&apptype=1&pg=1&module=1&pl=20&needTags=1&atrEnable=true&callback=RelationAsync.videoCallback&uid=59780527&t=0.028587819542735815' resp = func.GetHttpContent( yk ) print resp if resp is not None : id_name_xpath = '//*[@id="tjptList"]/li[1]/div[2]/h6/a' play_times_xpath = '//*[@id="tjptList"]/li[1]/div[2]/p[2]' yk_xpath = '//*[@id="relationvideo_async"]/div/div[1]/div[3]/div[1]/a' tree = etree.HTML(resp) r_list = tree.xpath(id_name_xpath) if len(r_list) > 0: print r_list[0].text.strip() r_list = tree.xpath(play_times_xpath) if len(r_list) > 0: print r_list[0].text.strip()