Example #1
0
    def get_all_fans(self, all_sub_id_list, fan_id_dict):
        all_fan_id_counter = 0
        all_fan_url_list = []
        # Build the first-page fan-list URL for every sub.
        for sub_id in all_sub_id_list:
            fan_url = self.get_sub_fan_first_url(sub_id)
            all_fan_url_list.append(fan_url)  # first page
        first_page_result_queue = Queue.Queue()
        mt_create_queue_flag = False
        mt.runMT("get_all_fans", get_html_func, all_fan_url_list,
                 mt_create_queue_flag, first_page_result_queue)
        # For every fan-list URL: parse the first page, then queue the remaining pages.
        other_page_url_list = []  # other pages
        while not first_page_result_queue.empty():
            fan_url, resp = first_page_result_queue.get()
            fan_id_list = []
            ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list)
            fan_id_counter = len(fan_id_list)
            #print("the number of fans found on the first page of %s : %s" % (fan_url, fan_id_counter))
            self.store_id_to_dict(fan_id_list, fan_id_dict)
            # ttl_fans_num is capped at 999; max_num_per_page fans per page (Python 2 floor division).
            page_ttl = ttl_fans_num / max_num_per_page
            if ttl_fans_num % max_num_per_page > 0:
                page_ttl = page_ttl + 1
            if page_ttl > 1:  # more pages to fetch
                for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl
                    other_url = self.get_fan_page_from_fan_url(fan_url, pagenum)
                    other_page_url_list.append(other_url)
        #print("the number of other fan pages to get : %s" % (len(other_page_url_list)))
        mt.runMT("get_all_fans 2", get_tudou_json, other_page_url_list, False,
                 None, store_to_dict, fan_id_dict)
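A minimal driver sketch for the method above; the crawler class name and the seed sub ids are assumptions, only get_all_fans and its two arguments come from the example.

# Hypothetical usage; TudouCrawler and the seed ids are assumed, not from the example.
def collect_fans_sketch():
    crawler = TudouCrawler()      # assumed class exposing get_all_fans()
    seed_sub_ids = ["12345678"]   # assumed seed sub ids
    fan_id_dict = {}              # fan id -> count, filled in place by get_all_fans
    crawler.get_all_fans(seed_sub_ids, fan_id_dict)
    print("collected %s distinct fan ids" % len(fan_id_dict))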
Example #2
0
def deep_dig_relation(first_iid, deep_level):
    ttl_iid_set = set()
    playinfo_dict = {}
    has_diged_iid_set = set()
    dig_iid_set = set([first_iid])  # only one id comes in from argv

    for i in range(deep_level):  # loop from 0 to deep_level - 1
        this_level = i + 1
        print("scraping %s iids at level %s" % (len(dig_iid_set), this_level))
        url_list = []
        for iid in dig_iid_set:  # build the recommendation-json URL for each iid to dig
            url_list.append(get_tudou_tj_json_url(iid))

        html_result_queue = Queue.Queue()
        mt_create_queue_flag = False
        mt.runMT("deep_dig_relation", get_html_func, url_list,
                 mt_create_queue_flag, html_result_queue)
        # Parse every response that came back.
        while not html_result_queue.empty():
            _url, resp = html_result_queue.get()
            get_tjinfo_from_tudou_response(resp, ttl_iid_set, playinfo_dict)

        has_diged_iid_set = has_diged_iid_set | dig_iid_set
        # Next level digs only the newly discovered iids.
        dig_iid_set = (dig_iid_set | ttl_iid_set) - has_diged_iid_set

    relation_html_name = "relation" + str(first_iid) + "_" + str(deep_level) + ".htm"
    write_relation_result(playinfo_dict, relation_html_name)
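A minimal sketch of how deep_dig_relation might be invoked from the command line; the comment above only says the first iid comes from argv, so the exact argv layout and the default depth are assumptions.

# Hypothetical entry point; argv positions and the default depth are assumed.
import sys

if __name__ == "__main__":
    first_iid = sys.argv[1]                                    # item id to start from
    deep_level = int(sys.argv[2]) if len(sys.argv) > 2 else 2  # assumed default depth
    deep_dig_relation(first_iid, deep_level)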
Example #3
0
    def get_fans(self, sub_id):
        all_fan_id_list = []

        # Need the total page count first, so this request cannot go through mt (multithreading).
        sub_fan_url = self.getsubfanurl(sub_id)
        # Fetch the first fan-list page for this sub.
        resp = func.GetHttpContent(sub_fan_url)
        if resp is None:
            func._print("failed to get html. The url = %s \n" % (sub_fan_url))
            return
        fan_id_list = []
        ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list)
        all_fan_id_list = all_fan_id_list + fan_id_list
        page_ttl = ttl_fans_num / max_num_per_page  # Python 2 floor division
        if (ttl_fans_num % max_num_per_page > 0):
            page_ttl = page_ttl + 1
        #print("the number of fans found from sub id %s: %s "% (sub_id, len(all_fan_id_list)))
        #print("the number of pages of fans of sub id %s: %s "% (sub_id, page_ttl ))
        if (page_ttl > 1):
            sub_fan_url_list = []
            for pagenum in range(2, page_ttl +
                                 1):  #next page from 2 to page_ttl , more fans
                sub_fan_url = self.getsubfanurl_page(sub_id, pagenum)
                sub_fan_url_list.append(sub_fan_url)

            mt.runMT("get_fans", get_tudou_json, sub_fan_url_list, False, None,
                     store_to_list, all_fan_id_list)
        #print("found %s fans   from sub id %s: %s "% (len(all_fan_id_list), sub_id  ))
        return all_fan_id_list
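The page-count computation above relies on Python 2 floor division followed by a remainder check; a small sketch of the same arithmetic written as a single ceiling division (the function name and the numbers are illustrative, not from the source):

# Ceiling division without floats; equivalent to the divide-then-add-one-on-remainder
# pattern used in get_fans(). per_page stands in for max_num_per_page.
def page_count(total_items, per_page):
    return (total_items + per_page - 1) // per_page

# e.g. page_count(999, 50) == 20: 19 full pages plus one partial page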
Example #4
0
    def get_all_subs(self, all_fan_id_list, sub_id_dict):
        all_sub_id_counter = 0
        all_sub_url_list = []
        # Build the first-page subscription-list URL for every fan.
        for fan_id in all_fan_id_list:
            sub_url = self.get_fan_sub_first_url(fan_id)
            all_sub_url_list.append(sub_url)  # first page
        first_page_result_queue = Queue.Queue()
        mt_create_queue_flag = False
        mt.runMT("get_all_subs", get_html_func, all_sub_url_list,
                 mt_create_queue_flag, first_page_result_queue)
        # For every subscription-list URL: parse the first page, then queue the remaining pages.
        other_page_sub_url_list = []  # other pages
        while not first_page_result_queue.empty():
            sub_url, resp = first_page_result_queue.get()
            sub_id_list = []
            ttl_subs_num = self.get_uid_from_tudou_response(resp, sub_id_list)
            sub_id_counter = len(sub_id_list)
            print("the number of subs found on the first page of %s : %s" %
                  (sub_url, sub_id_counter))
            self.store_id_to_dict(sub_id_list, sub_id_dict)
            # ttl_subs_num is capped at 999; max_num_per_page subs per page (Python 2 floor division).
            page_ttl = ttl_subs_num / max_num_per_page
            if ttl_subs_num % max_num_per_page > 0:
                page_ttl = page_ttl + 1
            if page_ttl > 1:  # more pages to fetch
                for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl
                    other_sub_url = self.get_sub_page_from_sub_url(sub_url, pagenum)
                    other_page_sub_url_list.append(other_sub_url)
        print("the number of other sub pages to get : %s" %
              (len(other_page_sub_url_list)))
        mt.runMT("get_all_subs 2", get_tudou_json, other_page_sub_url_list,
                 False, None,
                 store_to_dict, sub_id_dict)
        this_counter = self.get_subs(fan_id, sub_id_dict)  # note: fan_id is the last id from the loop above
        all_sub_id_counter = all_sub_id_counter + this_counter
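The store_to_dict callback handed to mt.runMT above is not shown in these examples; a plausible sketch of its behaviour, assuming it receives a parsed id list and the shared dict and counts how often each id appears (both the signature and the counting are assumptions).

# Hypothetical sketch of store_to_dict; neither its real signature nor its body
# is shown in the examples, so everything here is an assumption.
def store_to_dict(id_list, id_dict):
    for _id in id_list:
        # count how many times each id was seen across all fetched pages
        id_dict[_id] = id_dict.get(_id, 0) + 1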
Example #5
0
def mt_get_html_and_parser(url_dict, xpath_list):
    page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    url_list = url_dict.keys()
    mt.runMT("x", get_html_func, url_list, mt_create_queue_flag,
             page_result_queue)
    # For every fetched page, run its xpaths and collect one result row.
    all_result_list = []
    while (not page_result_queue.empty()):
        url, html = page_result_queue.get()
        #html = html.encode('UTF-8')
        tree = etree.HTML(html)
        idx = url_dict[url]
        result_list = []
        result_list.append(url)
        for xpath in xpath_list[idx]:
            r = tree.xpath(xpath)
            if (len(r) == 0):
                #print ("Empty result  extract from the url = %s, xpath   = %s\n " % (url, xpath ))
                result_list.append("extract failed or not found!")
                continue
            elif len(r) > 1:
                print(
                    "The number of results extracted from the url = %s is %s, xpath = %s\n"
                    % (url, len(r), xpath))

            item = r[0].text.strip()
            result_list.append(item)
        if len(result_list) > 1:  # at least one field was extracted besides the url
            all_result_list.append(result_list)
    return all_result_list
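A minimal call sketch showing the argument shapes this function expects: url_dict maps each url to an index into xpath_list, and each xpath_list entry is the list of xpaths for that url. The concrete urls and xpaths below are placeholders.

# Hypothetical call; only the parallel indexing of url_dict and xpath_list follows
# from the function body, the concrete urls and xpaths are made up.
url_dict = {
    "http://example.com/page-a": 0,  # value is an index into xpath_list
    "http://example.com/page-b": 1,
}
xpath_list = [
    ['//*[@id="title"]', '//*[@id="price"]'],  # xpaths for page-a
    ['//*[@id="title"]'],                      # xpaths for page-b
]
rows = mt_get_html_and_parser(url_dict, xpath_list)
for row in rows:
    print(row)  # each row starts with the url, followed by the extracted fields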
Example #6
0
    def get_fans(self, sub_id):
        all_fan_id_list = []

        # Need the total page count first, so this request cannot go through mt (multithreading).
        sub_fan_url = self.getsubfanurl(sub_id)
        # Fetch the first fan-list page for this sub.
        resp = func.GetHttpContent(sub_fan_url)
        if resp is None:
            func._print("failed to get html. The url = %s \n" % (sub_fan_url))
            return
        fan_id_list = []
        ttl_fans_num = self.get_uid_from_tudou_response(resp, fan_id_list)
        all_fan_id_list = all_fan_id_list + fan_id_list
        page_ttl = ttl_fans_num / max_num_per_page  # Python 2 floor division
        if ttl_fans_num % max_num_per_page > 0:
            page_ttl = page_ttl + 1
        print("the number of fans found from sub id %s: %s" % (sub_id, len(all_fan_id_list)))
        print("the number of pages of fans of sub id %s: %s" % (sub_id, page_ttl))
        if page_ttl > 1:
            sub_fan_url_list = []
            for pagenum in range(2, page_ttl + 1):  # pages 2..page_ttl hold the remaining fans
                sub_fan_url = self.getsubfanurl_page(sub_id, pagenum)
                sub_fan_url_list.append(sub_fan_url)

            mt.runMT("get_fans", get_tudou_json, sub_fan_url_list,
                     False, None,
                     store_to_list, all_fan_id_list)
        print("the number of fans found from sub id %s: %s" % (sub_id, len(all_fan_id_list)))
        return all_fan_id_list
Example #7
0
def deep_dig_relation(first_iid, deep_level):
    ttl_iid_set = set()
    playinfo_dict = {}
    has_diged_iid_set = set()
    dig_iid_set = set([first_iid])  # only one id comes in from argv

    for i in range(deep_level):  #loop from 0 to  deep_level - 1
        this_level = i + 1
        print("scrapy iid number %s, level %s " %
              (len(dig_iid_set), this_level))
        url_list = []
        for iid in dig_iid_set:  # build the recommendation-json URL for each iid to dig
            url_list.append(get_tudou_tj_json_url(iid))

        html_result_queue = Queue.Queue()
        mt_create_queue_flag = False
        mt.runMT("deep_dig_relation", get_html_func, url_list,
                 mt_create_queue_flag, html_result_queue)
        # Parse every response that came back.
        while (not html_result_queue.empty()):
            _url, resp = html_result_queue.get()
            get_tjinfo_from_tudou_response(resp, ttl_iid_set, playinfo_dict)

        has_diged_iid_set = has_diged_iid_set | dig_iid_set
        # Next level digs only the newly discovered iids.
        dig_iid_set = (dig_iid_set | ttl_iid_set) - has_diged_iid_set

    relation_html_name = "relation" + str(first_iid) + "_" + str(
        deep_level) + ".htm"
    write_relation_result(playinfo_dict, relation_html_name)
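A tiny worked example of the frontier bookkeeping at the end of each level, with the update read as "everything seen so far minus everything already dug" (the concrete ids are made up):

# Worked example of the per-level set update; the ids are placeholders.
has_diged = set()
dig = {"a"}               # level 1 digs "a"
found = {"a", "b", "c"}   # all iids discovered so far (ttl_iid_set)

has_diged = has_diged | dig       # {"a"}
dig = (dig | found) - has_diged   # {"b", "c"}: level 2 digs only the new iids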
Example #8
0
def mt_get_html_and_parser(url_dict, target_list):
    global target_event
    target_event.clear()
    #print("%s : start checking ....." % (time.strftime("%Y/%m/%d %H:%M:%S")))
    page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    url_list = url_dict.keys()
    mt.runMT("x", get_html_func, url_list, mt_create_queue_flag, page_result_queue)
    # For every html that was fetched, run the stock and price checks.
    while not page_result_queue.empty():
        url, html = page_result_queue.get()
        #print html
        idx = url_dict[url]
        target = target_list[idx]
        r1 = target.judge_have_stock(html)
        r2 = target.judge_have_low_price(html)
    target_event.set()
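target_event is only cleared and set here; a plausible sketch of the surrounding module-level setup and of a caller that waits on it, assuming it is a threading.Event used to signal that one checking pass has finished (that interpretation is an assumption).

# Hypothetical surrounding code; treating target_event as a threading.Event and
# the waiter function below are assumptions, not taken from the examples.
import threading

target_event = threading.Event()

def wait_for_check_pass(timeout_seconds=60):
    # Block until mt_get_html_and_parser() calls target_event.set(), or time out.
    finished = target_event.wait(timeout_seconds)
    if not finished:
        print("checking pass did not finish within %s seconds" % timeout_seconds)
    return finished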
Example #9
0
    def write_sub_info(self, sub_id_dict, file_name, file_mode='w'):
        sub_id_focus_pair_list = sorted(sub_id_dict.items(),
                                        lambda x, y: cmp(x[1], y[1]),
                                        reverse=True)

        all_sub_url_list = []
        for _pair in sub_id_focus_pair_list:
            sub_id = _pair[0]
            sub_url = self.getsubinfourl(sub_id)
            all_sub_url_list.append(sub_url)
        # Fetch every sub's info page in parallel.
        sub_page_result_queue = Queue.Queue()
        mt_create_queue_flag = False
        mt.runMT("get_all_subs", get_html_func, all_sub_url_list,
                 mt_create_queue_flag, sub_page_result_queue)
        # For every fetched page, extract the sub's display name.

        sub_url_name_dict = {}
        id_name_xpath = '//*[@id="topTitle"]/h3'
        while (not sub_page_result_queue.empty()):
            _url, html = sub_page_result_queue.get()
            tree = etree.HTML(html)
            r_list = tree.xpath(id_name_xpath)
            #sub_url_name_dict[_url] = str(_url)
            if len(r_list) > 0:
                id_name = r_list[0].text.strip()
                # print id_name
                sub_url_name_dict[_url] = id_name

        fp = open(file_name, file_mode)
        for _pair in sub_id_focus_pair_list:
            sub_id = _pair[0]
            focus_num = _pair[1]
            sub_url = self.getsubinfourl(sub_id)
            id_name = str(sub_id)
            if sub_url_name_dict.has_key(sub_url):
                id_name = sub_url_name_dict[sub_url]
            fp.write('<a href = "' + sub_url + '" target="_blank">' +
                     (str(id_name.encode("GBK", 'ignore'))) + ' </a>  ' +
                     str(focus_num))
            fp.write('<br>')
        fp.close()
        print("%s sub's url and focus value writed to %s!" %
              (len(sub_id_focus_pair_list), file_name))
        return sub_id_focus_pair_list
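The sorted(..., lambda x, y: cmp(x[1], y[1]), reverse=True) call above relies on Python 2's positional cmp argument; the same descending-by-focus ordering expressed with key= (a sketch, also valid on Python 3) looks like this:

# Equivalent ordering with key= instead of cmp; sub_id_dict stands in for the
# argument of write_sub_info().
sub_id_focus_pair_list = sorted(sub_id_dict.items(),
                                key=lambda pair: pair[1],
                                reverse=True)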
Example #10
0
def mt_get_html_and_parser(url_dict, xpath_list, currency_list):
    page_result_queue = Queue.Queue()
    mt_create_queue_flag = False
    url_list = url_dict.keys()
    mt.runMT("x", get_html_func, url_list, mt_create_queue_flag,
             page_result_queue)
    # For every fetched page, run its xpaths and collect one result row.
    all_result_list = []
    while (not page_result_queue.empty()):
        url, html = page_result_queue.get()
        #html = html.decode('UTF-8', 'replace') # for show in gbk in windows cmd shell
        #print html
        tree = etree.HTML(html)
        idx = url_dict[url]
        result_list = []
        result_list.append(url)
        cur_rate = currency_list[idx]
        for i in range(len(xpath_list[idx])):
            xpath = xpath_list[idx][i]
            r_list = tree.xpath(xpath)
            if (len(r_list) == 0):
                print(" %s \n extract 0 result \n xpath   = %s\n " %
                      (url, xpath))
                #result_list.append("extract failed or not found!")
                continue
            elif len(r_list) > 1:
                print(
                    "The number of results extracted from the url = %s is %s, xpath = %s\n"
                    % (url, len(r_list), xpath))
            for r in r_list:
                item = r.text.strip()
                # From the second xpath onward the extracted values are prices that need converting.
                result_list.append(item)
                if i == 1:  # the price field (second xpath)
                    item = "RMB " + change_currency(item, cur_rate)
                    result_list.append(item)
                #c = judge_currency(item)
                #if c is not None :
                #   item = item + " = RMB " + _change_currency(item, c)
        if len(result_list) > 1:  # at least one field was extracted besides the url
            all_result_list.append(result_list)
    return all_result_list
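A call sketch for this variant; it adds currency_list, indexed in parallel with xpath_list, whose entries are passed to change_currency as cur_rate (whether that is an exchange rate or a currency code is not shown, so the value below is a guess). All concrete urls and xpaths are placeholders.

# Hypothetical call; only the parallel indexing of url_dict, xpath_list and
# currency_list follows from the function body, the concrete values are made up.
url_dict = {"http://example.com/item": 0}
xpath_list = [['//*[@id="name"]', '//*[@id="price"]']]  # index 1 is the price xpath
currency_list = [6.5]  # assumed: value handed to change_currency as cur_rate
rows = mt_get_html_and_parser(url_dict, xpath_list, currency_list)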
Example #11
0
    def write_sub_info(self, sub_id_dict, file_name, file_mode='w'):
        sub_id_focus_pair_list = sorted(sub_id_dict.items(),
                                        lambda x, y: cmp(x[1], y[1]),
                                        reverse=True)

        all_sub_url_list = []
        for _pair in sub_id_focus_pair_list:
            sub_id = _pair[0]
            sub_url = self.getsubinfourl(sub_id)
            all_sub_url_list.append(sub_url)
        # Fetch every sub's info page in parallel.
        sub_page_result_queue = Queue.Queue()
        mt_create_queue_flag = False
        mt.runMT("get_all_subs", get_html_func, all_sub_url_list,
                 mt_create_queue_flag, sub_page_result_queue)
        # For every fetched page, extract the sub's display name.
        sub_url_name_dict = {}
        id_name_xpath = '//*[@id="topTitle"]/h3'
        while not sub_page_result_queue.empty():
            _url, html = sub_page_result_queue.get()
            tree = etree.HTML(html)
            r_list = tree.xpath(id_name_xpath)
            #sub_url_name_dict[_url] = str(_url)
            if len(r_list) > 0:
                id_name = r_list[0].text.strip()
                # print id_name
                sub_url_name_dict[_url] = id_name

        fp = open(file_name, file_mode)
        for _pair in sub_id_focus_pair_list:
            sub_id = _pair[0]
            focus_num = _pair[1]
            sub_url = self.getsubinfourl(sub_id)
            id_name = str(sub_id)
            if sub_url_name_dict.has_key(sub_url):
                id_name = sub_url_name_dict[sub_url]
            fp.write('<a href = "' + sub_url + '" target="_blank">' +
                     (str(id_name.encode("GBK", 'ignore'))) + ' </a>  ' +
                     str(focus_num))
            fp.write('<br>')
        fp.close()
        print("%s subs' urls and focus values written to %s!" %
              (len(sub_id_focus_pair_list), file_name))
        return sub_id_focus_pair_list