Example #1
 def get_car_url(self,response):
     html_body = response.body
     page = response.request.meta.get('page',0)
     city = response.request.meta.get('city',0)
     cookie_old = response.request.meta.get('cookie_old','')
     hxs = Selector(text = html_body)
     cars = hxs.xpath('//ul[@class="car-info-ul clearfix"]/li/a/@href').extract()
     # city = hxs.xpath('//span[@id="current_city"]/text()').extract()
     #print ''.join(city)
     print city
     for car in cars:
        #print car
        car_id = re.compile('\/info\/v\/(\d+)').findall(car)
        if car_id:
          car_id = car_id[0]
        else:
          continue
        print car_id
        url = 'http://ic5u.com%s' % car
        item = {'car_id':car_id,'city':city,'url':url}
        facade.update_item('taoche0515',{'car_id':car_id,'city':city},item)
        #self.mongo_db.car_list_url.update({'url':url,'city':city},item,True)
     if len(cars) == 12:
       next_page = page + 1
       next_url = 'http://ic5u.com/index.php/Home/CarSearch/index/p/%d.html' % next_page
       print 'next_page',next_url
       facade.update_item('taoche_linkbase',{'url':next_url,'city':city},{'url':next_url,'cookie_old':cookie_old,'page':page,'city':city})
       logging.info('Saved next-page link for city %s, page %d: %s' % (city, page, next_url))
       print cookie_old
       return Request(next_url,meta={'q_flag':2,'city':city,'page':next_page,'cookie_old':cookie_old},cookies = {'PHPSESSID':cookie_old})
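
The commented-out call above (self.mongo_db.car_list_url.update(..., True)) suggests that facade.update_item is a thin wrapper over a MongoDB upsert. A minimal sketch of that assumed wrapper using pymongo, with a hypothetical client and database name:

import pymongo

client = pymongo.MongoClient('localhost', 27017)  # hypothetical connection
db = client['crawler']                            # hypothetical database name

def update_item(table_name, where, item):
    # Upsert: replace the document matching `where`, or insert it if absent,
    # mirroring the commented-out collection.update(where, item, True) call.
    # (replace_one(where, item, upsert=True) is the newer pymongo equivalent.)
    db[table_name].update(where, item, upsert=True)
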
 def get_car_url(self, response):
     html_body = response.body
     print html_body
     result = json.loads(html_body)
     page = result.get('data', {}).get('page', 0)
     total = result.get('data', {}).get('total', 0)
     third_ids = result.get('data', {}).get('third_ids', '')
     round_id = result.get('data', {}).get('round_ids', '')
     tags = result.get('data', {}).get('tags', '')
     tags = urllib.quote(tags.encode('utf-8'))
     area = result.get('data', {}).get('area', '')
     if page == 1 and total > 10:
         for pn in range(2, total / 10 + 2):
             new_url = "https://www.innotree.cn/ajax/bigdata/company/getList?type=&page=%d&size=10&first_ids=&second_ids=&third_ids=%s&round=%s&area=%s&tags=%s" % (
                 pn, third_ids, round_id, area, tags)
             logging.info('Discovered new link [%s] from: %s' % (new_url, response.url))
             facade.save_new_yinguoshu(new_url)
             print new_url
     data = result.get('data', {}).get('list', [])
     if len(data) == 0:
         facade.update_item('yinguoshu_no_item', {'from_url': response.url},
                            {'from_url': response.url})
     for item in data:
         item['from_url'] = response.url
         item['crawl_time'] = datetime.datetime.now().strftime(
             "%Y%m%d %H:%M:%S")
         facade.update_item('yinguoshu_item', {'cid': item['cid']}, item)
         #facade.save_yinguoshu(item)
         logging.info('save item:{"cid":"%s"}' % item['cid'])
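
The fan-out above derives the page range from total / 10 + 2, which relies on Python 2 integer division and, when total is an exact multiple of the page size, also schedules one trailing empty page. A small self-contained sketch of a ceiling-division variant (the page size of 10 comes from the URL above; the sample totals are hypothetical):

def page_numbers(total, size=10):
    # Pages 2..last, where last = ceil(total / size); page 1 is the response
    # that triggered the fan-out.
    last = (total + size - 1) // size
    return list(range(2, last + 1))

print(page_numbers(35))  # [2, 3, 4]
print(page_numbers(40))  # [2, 3, 4], while total / 10 + 2 would also request page 5
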
 def process_step4(self, response, page_item):
     body = response.body_as_unicode().encode('utf-8')
     real_url = response.url
     logging.info('Step 4: resolved the encrypted link via redirect, real_url:%s' % real_url)
     if 'www.baidu.com/link?url' in real_url:
         logging.warning("破解失败,链接仍然是百度加密url:%s" % real_url)
         return []
     # Save the search result
     item = {
         'real_url': real_url,
         'crawl_time': math.floor(time.time()),
         'query': response.request.meta.get('query', '')
     }
     self.process_tst_info(target=item, src=response.request.meta)
     logging.debug("通过跳转破解加密连接seitem:%s" % item)
     # Save it
     where = {
         'query': item.get('query', ''),
         'real_url': item.get('real_url', '')
     }
     facade.update_item(self.decode_url_table_name, where, item)
     for parser in self.parsers:
         items = parser.process_page(page_item,
                                     html_body=page_item['html_body']
                                     if 'html_body' in page_item else None,
                                     p_facade=facade)
         for item in items:
             # Pass-through fields copied from the request meta
             self.process_tst_info(target=item, src=response.request.meta)
             parser.save_item(item)
             logging.debug(
                 'Full parsed item:\n  %s' %
                 '  \n '.join(['%s:%s' % (k, v) for k, v in item.items()]))
     return []
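
The loop over self.parsers above assumes each parser exposes process_page(page_item, html_body=..., p_facade=...) returning item dicts, plus save_item(item). A minimal sketch of that assumed interface (the class and its return values are hypothetical):

class DummyParser(object):
    def process_page(self, page_item, html_body=None, p_facade=None):
        # Turn one fetched page into zero or more item dicts.
        return [{'from_url': page_item.get('url', ''),
                 'html_len': len(html_body or '')}]

    def save_item(self, item):
        # Persist one parsed item; stands in for a facade.update_item call.
        print(item)
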
 def parse(self, response):
     info_content = response.body
     url = response.url
     info_json = json.loads(info_content)
     info_json['crawl_time'] = datetime.datetime.now().strftime(
         '%Y-%m-%d %H:%M:%S')
     facade.update_item('koubei_autohome_app', {'from_url': url}, info_json)
Example #5
    def save_item(self, item):
        # The where filter must not be None or empty (an empty filter would update
        # every document); anything with no real key to match on falls back to an id.
        where = {}
        print item['from_url']
        print '**********************'
        for key in self.replace_keys:
            # Only keys actually present in the item can go into the filter.
            if key in item:
                where[key] = item[key]
        if len(where) == 0:
            # Never upsert with an empty filter (it would match every document);
            # fall back to a fixed placeholder id instead.
            where["_id"] = ObjectId('123456789012')
            logging.error(
                'get_update_where error! replace_keys is not configured or a '
                'replace_key is not among the must_keys'
            )
        print where
        #old_item = self.item_col.find_one(where)
        old_item = facade.find_one_item(self.table_name, where)
        if old_item is None:
            item['first_crawl_time'] = datetime.datetime.now().strftime(
                "%Y%m%d %H:%M:%S")
            if 'selled' in item:
                item['sell_time'] = datetime.datetime.now().strftime(
                    "%Y%m%d %H:%M:%S")
        else:
            # Keep the original first-crawl timestamp, falling back to the legacy
            # misspelled key 'frist_crawl_time' found on older records.
            if 'first_crawl_time' in old_item:
                item['first_crawl_time'] = old_item['first_crawl_time']
            else:
                item['first_crawl_time'] = old_item['frist_crawl_time']
            if 'selled' in item and 'sell_time' not in old_item:
                item['sell_time'] = item['crawl_time']
            if 'selled' in item and 'sell_time' in old_item:
                item['sell_time'] = old_item['sell_time']

        logging.debug('Saving parsed item. from url:%s where:%s' %
                      (item['from_url'], where))
        #self.item_col.update(where, item, True)
        facade.update_item(self.table_name, where, item)
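
The filter built from replace_keys above is the piece worth isolating; a small self-contained sketch of the same construction (function name and sample values are hypothetical):

def build_where(item, replace_keys):
    # Mirror of the filter construction in save_item: only keys present in the
    # item are used, so a misconfigured replace_keys yields an empty filter.
    return {key: item[key] for key in replace_keys if key in item}

print(build_where({'car_id': '123456', 'city': 'beijing'}, ['car_id', 'city']))
# {'car_id': '123456', 'city': 'beijing'}
print(build_where({'car_id': '123456'}, []))
# {} -> this is the case the empty-filter guard above protects against
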
Example #6
 def drop_item(self, url):
     # If the rule carries a non-empty drop_item, the item is no longer needed:
     # mark it by adding the field drop_item: 'yes' when updating its status.
     car_id = re.findall(r'infoid=(\d+)', url)[0]
     where = {'car_id': car_id}
     #old_item = self.item_col.find(where)
     old_item = facade.find_one_item(self.table_name, where)
     # find_one_item returns a single document (or None), so guard on it rather
     # than iterating over it.
     if old_item is not None:
         old_item['drop_item'] = 'yes'
         old_item['crawl_time'] = datetime.datetime.now().strftime(
             "%Y%m%d %H:%M:%S")
         #self.item_col.update(where, old_item, True)
         facade.update_item(self.table_name, where, old_item)
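
A quick check of the car_id extraction used in drop_item (the sample URL is hypothetical):

import re

url = 'http://example.com/detail?infoid=8876543'  # hypothetical listing URL
ids = re.findall(r'infoid=(\d+)', url)
print(ids)  # ['8876543']
# drop_item takes ids[0] directly, so it assumes every url carries an infoid.
car_id = ids[0]
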
Example #7
  def extract_links_withrule(self, response, dups, rule):
    logging.debug('Running extract-link. rule:%s' % rule)
    linkinfos = []
    if rule.get('xpath', None) is None or len(rule.get('xpath')) < 1:
      logging.debug("rule没有xpath,不执行具体的解析")
      return linkinfos
    sgml = LinkExtractor(allow_domains=self.allowed_domain, restrict_xpaths=rule['xpath'],
                         allow=rule['allow'], deny=rule['deny'], tags=('a',))
    links = sgml.extract_links(response)
    logging.info('Extracted [%d] item urls from xpath, url[%s] xpath:%s' % (len(links), response.url, rule['xpath']))
    for link in links:
      if link.url in dups: continue
      dups.add(link.url)
      linkinfo = {
          'url': link.url, 'from_url': response.url, 'spider_name': self.name,
          'status': 0, 'schedule_cnt': 0, 'failed_cnt': 0,
          'in_time': math.floor(time.time()), 'update_time': math.floor(time.time()),
          'dont_redirect': self.dont_redirect,
          'parser_ranks': rule.get('parser_ranks', []),
          'link_type': rule.get('link_type', ''),
      }
      if rule.get('rule', None) is not None:
        linkinfo['rule'] = rule.get('rule')
      linkinfos.append(linkinfo)
      logging.debug('target link:%s from_url:%s' % (link.url, response.url))
    logging.debug('[%d] target links extracted from url:%s' % (len(linkinfos), response.url))
    if rule['xpath'] == 'regex':
      print rule['allow']
      html_parser = HTMLParser.HTMLParser()
      txt = html_parser.unescape(response.body)
      urls = re.findall(rule['allow'],txt)
      logging.info('Extracted [%d] item urls from regex, url[%s] xpath:%s' % (len(urls), response.url, rule['xpath']))
      for url in urls:
        if url in dups: continue
        dups.add(url)
        linkinfo = {
            'url': url, 'from_url': response.url, 'spider_name': self.name,
            'status': 0, 'schedule_cnt': 0, 'failed_cnt': 0,
            'in_time': math.floor(time.time()), 'update_time': math.floor(time.time()),
            'dont_redirect': self.dont_redirect,
        }
        linkinfos.append(linkinfo)
        logging.debug('target link:%s from_url:%s' % (url, response.url))
    if len(linkinfos) == 0:
      print str(rule)
      spider_warning = {'spider_name':self.name,'rule':str(rule),'url':response.request.url}
      facade.update_item('spider_warning',{'rule':str(rule),'url':response.request.url},spider_warning)
      logging.warning('spider_name:%s, target rule:%s, url:%s extracted 0 new links, which is not expected' % (self.name, str(rule), response.request.url))

    return linkinfos
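
For reference, a sketch of the rule dict shape this method reads, listing only the keys accessed above (every value is a placeholder):

rule = {
    'xpath': '//ul[@class="list"]',  # restrict_xpaths, or the literal string 'regex'
    'allow': r'/info/v/\d+\.html',   # allow pattern for LinkExtractor / regex mode
    'deny': r'/login',               # deny pattern for LinkExtractor
    'parser_ranks': [],              # copied into each linkinfo
    'link_type': 'item',             # copied into each linkinfo
    'rule': {},                      # optional nested rule copied into linkinfo
}
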