Code example #1
 def determine_level(self, response):
     """
     determine the index level of the current response, so we can decide whether to continue crawling or not.
     level 1: people/[a-z].html
     level 2: people/[A-Z][\d+].html
     level 3: people/[a-zA-Z0-9-]+.html
     level 4: search page, pub/dir/.+
     level 5: profile page
     """
     import re
     url = response.url
     if re.match(".+/[a-z]\.html", url):
         return 1
     elif re.match(".+/[A-Z]\d+.html", url):
         return 2
     elif re.match(".+/people-[a-zA-Z0-9-]+", url):
         return 3
     elif re.match(".+/pub/dir/.+", url):
         return 4
     elif re.match(".+/search/._", url):
         return 4
     elif re.match(".+/pub/.+", url):
         return 5
     log.msg("Crawl cannot determine the url's level: " + url)
     return None
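
A quick, hypothetical check of the level mapping above; the sample URLs below are made up for illustration and are not taken from the project:

# Hypothetical smoke test for the level rules above; the URLs are assumptions.
import re


def classify(url):
    # same level rules as determine_level, inlined so this runs standalone
    if re.match(r".+/[a-z]\.html", url):
        return 1
    elif re.match(r".+/[A-Z]\d+\.html", url):
        return 2
    elif re.match(r".+/people-[a-zA-Z0-9-]+", url):
        return 3
    elif re.match(r".+/pub/dir/.+", url) or re.match(r".+/search/.+", url):
        return 4
    elif re.match(r".+/pub/.+", url):
        return 5
    return None


for u in ["http://www.linkedin.com/directory/people/a.html",   # level 1
          "http://www.linkedin.com/pub/dir/John/Doe",          # level 4
          "http://www.linkedin.com/pub/john-doe/1/2b/3c4"]:    # level 5
    print(classify(u))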
Code example #2
def read_datas(row):

    data = {
        'partNumber': row['件号'.encode('utf-8')],
        'name': row['名称'.encode('utf-8')],
        'category': row['类别'.encode('utf-8')],
        # 'minStock': row['最低库存'.encode('utf-8')],
        'unit': row['单位'.encode('utf-8')],
        'applicableModel': "运5B(D)",
    }
    cate_list = ['一般航材', '工装设备', '消耗品', '化工品', '时控件', '时寿件']
    if not data['name'] or not data['partNumber']:
        logging.warn("件号或名称没有。 件号:%s,名称:%s" %
                     (data['partNumber'], data['name'].decode("utf-8")))
        return None

    if data['category'].decode("utf-8") not in cate_list:
        logging.warn("%s的航材类别有误." % data['name'].decode("utf-8"))
        return None

    if row['最低库存'.encode('utf-8')]:
        if row['最低库存'.encode('utf-8')] <= 0:
            logging.msg("航材(%s)的最低库存应大于0" % data['name'].decode("utf-8"))
            return None
        data['minStock'] = int(row['最低库存'.encode('utf-8')])

    data['statusName'] = data['auditStatus'] = InitialState

    return data
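
The read_datas helpers in this collection consume rows keyed by UTF-8 encoded Chinese column headers; a hypothetical Python 2 driver loop (the CSV file name is an assumption) might look like this:

# -*- coding: utf-8 -*-
# Hypothetical caller for read_datas; 'airmaterial.csv' is a made-up file name.
import csv
import logging

with open('airmaterial.csv', 'rb') as f:          # Python 2: binary mode, byte-string keys
    for row in csv.DictReader(f):
        data = read_datas(row)                    # returns None for rows that fail validation
        if data is not None:
            logging.info("imported part %s", data['partNumber'])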
Code example #3
    def parsePage(self, response):
        sel = Selector(response)

        try:
            try:
                # Number of pages per hotel
                page_list = len(
                    sel.xpath(
                        '/html/body/div/div/div/div[4]/div/div[1]/text()'))
                page = str(
                    sel.xpath('/html/body/div/div/div/div[4]/div/div[1]/a[' +
                              str(page_list - 2) +
                              ']/span/text()')).split(' ')[2].split('\'')[1]
                print "page_list!!!"
                print page_list - 2
                print page

            except:
                page = 0

            for key in range(0, int(page)):
                link = response.url.replace(
                    urlparse(response.url)[4].split('&')[4],
                    'currentPage=' + str(key))
                print urlparse(response.url)[4].split('&')[4]
                yield Request(url=link, callback=self.parseReview)
                print 'sleep 5 secs'
                time.sleep(5)
        except:
            log.msg("Page Error !!!!! " + response.url, level=log.WARNING)
Code example #4
 def parse(self, response):
     """
     default parse method, rule is not useful now
     """
     # import pdb; pdb.set_trace()
     response = response.replace(
         url=HtmlParser.remove_url_parameter(response.url))
     hxs = HtmlXPathSelector(response)
     index_level = self.determine_level(response)
     log.msg("Parse: index level:" + str(index_level))
     if index_level in [1, 2, 3, 4]:
         self.save_to_file_system(index_level, response)
         relative_urls = self.get_follow_links(index_level, hxs)
         if relative_urls is not None:
             for url in relative_urls:
                 log.msg('yield process, url:' + url)
                 yield Request(url, callback=self.parse)
     elif index_level == 5:
         personProfile = HtmlParser.extract_person_profile(hxs)
         linkedin_id = self.get_linkedin_id(response.url)
         linkedin_id = UnicodeDammit(
             urllib.unquote_plus(linkedin_id)).markup
         if linkedin_id:
             personProfile['_id'] = linkedin_id
             personProfile['url'] = UnicodeDammit(response.url).markup
             yield personProfile
Code example #5
File: check.py Project: wuyongdec/openslack-crawler
    def parsePage(self, response):
        sel = Selector(response)

        try:
            try:
                # Number of pages per hotel
                page_list = len(
                    sel.xpath(
                        '/html/body/div/div/div/div[4]/div/div[1]/text()'))
                page = str(
                    sel.xpath('/html/body/div/div/div/div[4]/div/div[1]/a[' +
                              str(page_list - 2) +
                              ']/span/text()')).split(' ')[2].split('\'')[1]
                print "page_list!!!"
                print page_list - 2
                print page
                if int(page) >= 74:
                    print "get!"
                    check.append(response.meta['hotel'])
                    con = json.dumps(check, ensure_ascii=False).encode('utf8')
                    print con
                    f = open('check', 'r+')
                    f.write(con)
                    f.close()

            except:
                page = 0
                print "let page == 0"

            print 'sleep 5 secs'
            time.sleep(5)
        except:
            log.msg("Page Error !!!!! " + response.url, level=log.WARNING)
Code example #6
File: middleware.py Project: yangaoquan/qhpage
 def process_request(self, request, spider):
     # TODO implement complex proxy providing algorithm
     if self.use_proxy(request):
         p = random.choice(PROXIES)
         try:
             request.meta['proxy'] = "http://%s" % p['ip_port']
         except Exception, e:
             logging.msg("Exception %s" % e, _level=logging.CRITICAL)
Code example #7
 def process_exception(self, request, exception, spider):
     # print '----',exception
     # print request.meta['handle_httpstatus_all']
     # print dir(request)
     # if 'proxy' in request.meta.keys():
     proxy = request.meta.get('proxy')
     log.msg('message:%s,url:(%s),failed proxy <%s>' %
             (exception.message, request, proxy))
Code example #8
 def process_request(self, request, spider):
     # TODO implement complex proxy providing algorithm
     if self.use_proxy(request):
         p = random.choice(PROXIES)
         try:
             request.meta['proxy'] = "http://%s" % p['ip_port']
         except Exception, e:
             log.msg("Exception %s" % e, _level=log.CRITICAL)
Code example #9
File: __init__.py Project: eldakar/ascii-mapper
 def handle(self, signum, frame):
     self.exitCode = const.INVALID_EXIT + signum
     msg = "Received signal %s: '%s'; exiting with code %s" % (
         signum, const.signalLookup[signum], self.exitCode)
     # XXX do a terminal write here
     print "\n" + msg
     log = registry.getLogger()
     log.msg(msg)
     sys.exit(self.exitCode)
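
A self-contained sketch of how a handler of this shape is typically registered; the Handler class below is a stand-in, and the project's const/registry modules are not reproduced:

# Hypothetical wiring for a signal handler like the one above; everything here is a stand-in.
import signal
import sys


class Handler(object):
    def handle(self, signum, frame):
        print("Received signal %s; exiting" % signum)
        sys.exit(128 + signum)   # conventional "terminated by signal" exit code


h = Handler()
signal.signal(signal.SIGINT, h.handle)
signal.signal(signal.SIGTERM, h.handle)
signal.pause()                   # Unix only: block until a signal arrives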
Code example #10
    def __call__(self, values):
        if self.__req_fields is None:
            return values

        out_value = []
        for v in arg_to_iter(values):
            if all(key.lower() in v.keys() for key in self.__req_fields):
                out_value.append(v)
            else:
                log.msg("Failed to validate %s => %s" % (v, self.__req_fields), level=log.CRITICAL)
        return out_value
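
A standalone sketch of the same required-fields check, with the loader plumbing (arg_to_iter, self.__req_fields) stripped out; the field names below are made up:

# Minimal illustration of the validation rule: keep only dicts that contain every
# required key (required names are lower-cased before the lookup). 'title' and
# 'price' are assumptions, not fields from the original project.
def keep_complete(values, req_fields):
    out = []
    for v in values:
        if all(key.lower() in v.keys() for key in req_fields):
            out.append(v)
    return out


rows = [{'title': 'a', 'price': 1}, {'title': 'b'}]
print(keep_complete(rows, ['title', 'price']))   # only the first dict survives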
Code example #11
 def process_item(self, item, spider):
     if self.__get_uniq_key() is None:
         self.collection.insert(dict(item))
     else:
         self.collection.update(
             {self.__get_uniq_key(): item[self.__get_uniq_key()]},
             dict(item),
             upsert=True)
     log.msg("Item wrote to MongoDB database %s/%s" %
             (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
             level=log.DEBUG,
             spider=spider)
     return item
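
For context, a minimal sketch of the insert-or-update call this pipeline relies on, using the same older pymongo API as the example; the database, collection, and unique-key names are assumptions:

import pymongo

# Hypothetical setup: local MongoDB, made-up db/collection names, 'url' as the unique key.
collection = pymongo.MongoClient()['scrapy_demo']['items']
item = {'url': 'http://example.com/a', 'title': 'A'}
# upsert=True inserts the document if no match exists, otherwise replaces it
collection.update({'url': item['url']}, dict(item), upsert=True)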
Code example #12
 def __call__(self, values):
     out_values = []
     for v in arg_to_iter(values):
         if isinstance(v, (str, unicode)):
             try:
                 out_values.append(dateutil.parser.parse(str(v), fuzzy=True).strftime(self.format))
             except:
                 log.msg('Failed to convert datetime string: "%s"' % v, level=log.WARNING)
                 out_values.append(None)
         elif isinstance(v, datetime):
             out_values.append(v.strftime(self.format))
         else:
             out_values.append(datetime(v).strftime(self.format))
     return out_values
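
For reference, a small sketch of the dateutil call this processor is built around; the input string and output format are assumptions:

import dateutil.parser

# fuzzy=True lets the parser skip surrounding words; strftime then normalizes the format
print(dateutil.parser.parse("Posted on March 5, 2014", fuzzy=True).strftime("%Y-%m-%d"))
# expected output: 2014-03-05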
Code example #13
 def process_item(self, item, spider):
     valid = True
     for data in item:
         if not data:
             valid = False
             raise DropItem("Missing {0}!".format(data))
     #if valid:
     #self.collection.insert(dict(item))
     self.collection.update({'headline': item['headline']},
                            dict(item),
                            upsert=True)
     logging.msg("Article added to collection!",
                 level=logging.DEBUG,
                 spider=spider)
     return item
Code example #14
File: pipelines.py Project: ArielLaub/newsler
 def process_item(self, item, spider):
     valid = True
     for data in item:
         # here we only check if the data is not null
         # but we could do any crazy validation we want
         if not data:
             valid = False
             raise DropItem("Missing %s of blogpost from %s" %
                            (data, item['url']))
     if valid:
         self.collection.insert(dict(item))
         log.msg("Item wrote to MongoDB database %s/%s" %
                 (settings.MONGODB_DB, settings.MONGODB_COLLECTION),
                 level=log.DEBUG,
                 spider=spider)
     return item
Code example #15
File: qunar.py Project: openslack/openslack-crawler
 def inner_page(self, response):
     log.msg(response.url)
     res = Selector(response)
     item = WebspiderItem()
     if not res.xpath('//h1[@class="sight_info_name"]/@title'):
         # re-request the page and stop; the extract()[0] below would fail on an empty result
         yield Request(url=response.url, dont_filter=True)
         return
     item["detil_title"] = res.xpath('//h1[@class="sight_info_name"]/@title').extract()[0]
     title_ticket = res.xpath('//h3[@class="ticket_item_title ticket_item_title_mainpage"]/text()').extract()
     price = res.xpath('//em[@class="txt_orange"]/strong[not(@style)]/text()').extract()
     if title_ticket and price:
         item["ticket_title"] = ",".join(["--".join(k) for k in zip(title_ticket, price)])
     else:
         item["ticket_title"] = ""
     item["introduce"] = "".join(
         res.xpath('//div[@class="intro_item_des"]/div[@class="module_des_content"]/p/text()').extract()
     )
     yield item
Code example #16
File: middleware.py Project: yaogdu/analysis
 def process_exception(self, request, exception, spider):
     log.msg("Catch a Exception: ******%s******" %repr(exception), level = log.INFO)
     log.msg("request is %s" %request.url, level = log.INFO)
      proxy = request.meta.pop('proxy', None)
     if proxy:
         self.proxies.remove({'ip_port': proxy})
         self.bad.append({'ip_port': proxy})
         log.msg("Proxy %s cannot ******REACHED******, remove it." %proxy, level = log.INFO)
         log.msg("Retry the request %s." %request.url, level = log.INFO)
     return request
Code example #17
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        if retries <= self.max_retry_times:
            log.msg(format="Retrying %(request)s " \
                           "(failed %(retries)d times): %(reason)s",
                    level=log.DEBUG, spider=spider, request=request,
                    retries=retries, reason=reason)
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            # our priority setup is different from super
            retryreq.meta['priority'] = retryreq.meta['priority'] - 10

            return retryreq
        else:
            log.msg(format="Gave up retrying %(request)s " \
                           "(failed %(retries)d times): %(reason)s",
                    level=log.DEBUG, spider=spider, request=request,
                    retries=retries, reason=reason)
Code example #18
    def process_item(self, item, spider):
        if spider.name != 'movieContent':
            return item

        try:
            logging.msg("[+] " + item["MovieName"])
            self.batch.put(
                item['MovieName'], {
                    "Movie:PostUrl": item['PostUrl'],
                    'Movie:Director': item['Director'],
                    "Movie:ReleaseTime": item['ReleaseTime'],
                    "Movie:Area": item['Area'],
                    "Movie:Performers": item['Performers']
                })
            self.batch.send()
        except:
            logging.msg("[-] %s Failed." % item["MovieName"])
        finally:
            self.conn.close()

        return item
Code example #19
File: middleware.py Project: yaogdu/analysis
 def process_request(self, request, spider):
     print 'process_request'
     # TODO implement complex proxy providing algorithm
     if len(self.proxies) < self.MIN_NUM_PROXY:
         log.msg("The volume of proxy-pool is ******LOW******. Now reload proxy-pool.", level = log.INFO)
         self.proxies = self.__getproxies__()
         log.msg("Reload proxies successfully.Now there are %s proxies." %len(self.proxies), level = log.INFO)
         
     p = random.choice(self.proxies)
     try:
         request.meta['proxy'] = p['ip_port']
         log.msg("Add proxy %s" % p['ip_port'], level=log.DEBUG)
     except Exception, e:
         log.msg("Exception %s" % e, _level=log.CRITICAL)
Code example #20
File: mongodb.py Project: wyc1314/CrawlerSystem
    def process_item(self, item, spider):
        book_detail = {
            'book_name': item.get('book_name'),
            'alias_name': item.get('alias_name', []),
            'author': item.get('author', []),
            'book_description': item.get('book_description', ''),
            'book_covor_image_path': item.get('book_covor_image_path', ''),
            'book_covor_image_url': item.get('book_covor_image_url', ''),
            'book_download': item.get('book_download', []),
            'book_file_url': item.get('book_file_url', ''),
            'book_file': item.get('book_file', ''),
            'original_url': item.get('original_url', ''),
            'update_time': datetime.datetime.utcnow(),
        }

        result = self.db['book_detail'].insert(book_detail)
        item["mongodb_id"] = str(result)

        log.msg("Item %s wrote to MongoDB database %s/book_detail" %
                (result, self.MONGODB_DB),
                level=log.DEBUG,
                spider=spider)
        return item
Code example #21
 def inner_page(self, response):
     log.msg(response.url)
     res = Selector(response)
     item = WebspiderItem()
     if not res.xpath('//h1[@class="sight_info_name"]/@title'):
         # re-request the page and stop; the extract()[0] below would fail on an empty result
         yield Request(url=response.url, dont_filter=True)
         return
     item['detil_title'] = res.xpath(
         '//h1[@class="sight_info_name"]/@title').extract()[0]
     title_ticket = res.xpath(
         '//h3[@class="ticket_item_title ticket_item_title_mainpage"]/text()'
     ).extract()
     price = res.xpath(
         '//em[@class="txt_orange"]/strong[not(@style)]/text()').extract()
     if title_ticket and price:
         item['ticket_title'] = ','.join(
             ['--'.join(k) for k in zip(title_ticket, price)])
     else:
         item['ticket_title'] = ''
     item['introduce'] = ''.join(
         res.xpath(
             '//div[@class="intro_item_des"]/div[@class="module_des_content"]/p/text()'
         ).extract())
     yield item
Code example #22
File: mongodb.py Project: AugustLONG/crawler
    def process_item(self, item, spider):
        book_detail = {
            "book_name": item.get("book_name"),
            "alias_name": item.get("alias_name", []),
            "author": item.get("author", []),
            "book_description": item.get("book_description", ""),
            "book_covor_image_path": item.get("book_covor_image_path", ""),
            "book_covor_image_url": item.get("book_covor_image_url", ""),
            "book_download": item.get("book_download", []),
            "book_file_url": item.get("book_file_url", ""),
            "book_file": item.get("book_file", ""),
            "original_url": item.get("original_url", ""),
            "update_time": datetime.datetime.utcnow(),
        }

        result = self.db["book_detail"].insert(book_detail)
        item["mongodb_id"] = str(result)

        log.msg(
            "Item %s wrote to MongoDB database %s/book_detail" % (result, self.MONGODB_DB),
            level=log.DEBUG,
            spider=spider,
        )
        return item
Code example #23
 def close_spider(self, spider, reason):
     if self._dump:
         log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self.get_stats()), \
                 spider=spider)
     self._persist_stats(self.get_stats(), spider)
Code example #24
File: graphite.py Project: webvul/crawler
 def close_spider(self, spider, reason):
     if self._dump:
         log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self.get_stats()), \
                 spider=spider)
     self._persist_stats(self.get_stats(), spider)
Code example #25
File: storage_list.py Project: GSIL-Monitor/wxk
def read_datas(row):

    data = {
        'category': row['类型'.encode('utf-8')],
        'partNumber': row['件号'.encode('utf-8')],
        'serialNum': row['序号'.encode('utf-8')],
        'name': row['名称'.encode('utf-8')],
        'unit': row['单位'.encode('utf-8')],
        # 'flyTime': float(row['飞行小时'.encode('utf-8')]),
        # 'engineTime': float(row['发动机小时'.encode('utf-8')]),
        # 'flightTimes': int(row['起落架次'.encode('utf-8')]),
        'applicableModel': "运5B(D)",
        'storehouse': row['仓库'.encode('utf-8')],
        'minStock': row['最低库存'.encode('utf-8')],
        'shelf': row['架位'.encode('utf-8')],
        'effectiveDate': row['库存有效期'.encode('utf-8')],
        'certificateNum': row['证书编号'.encode('utf-8')],
        'airworthinessTagNum': row['适航标签号'.encode('utf-8')],
        'lastCheckDate': row['上次检查日期'.encode('utf-8')],
        'nextCheckDate': row['下次检查日期'.encode('utf-8')],
        'manufacturer': row['生产厂商'.encode('utf-8')],
        'supplier': row['供应商'.encode('utf-8')],

    }

    if row['数量'.encode('utf-8')]:
        data['quantity'] = int(row['数量'.encode('utf-8')])

    if row['冻结数量'.encode('utf-8')]:
        data['freezingQuantity'] = int(row['冻结数量'.encode('utf-8')])

    if row['起落架次'.encode('utf-8')]:
        data['flightTimes'] = int(row['起落架次'.encode('utf-8')])

    if row['飞行小时'.encode('utf-8')]:
        data['flyTime'] = row['飞行小时'.encode('utf-8')]

    if row['发动机小时'.encode('utf-8')]:
        data['engineTime'] = row['发动机小时'.encode('utf-8')]

    if not data['name'] or not data['partNumber']:
        logging.warn("名称(%s)或件号(%s)不存在。" % (
            data['name'].decode('utf-8'), data['partNumber']))
        return None

    if data['minStock'] and data['minStock'] <= 0:
        logging.msg("航材(%s)的最低库存应大于0" % data['name'].decode("utf-8"))
        return None

    if data['partNumber'] and data['serialNum'] and data['quantity'] != 1:
        logging.warn("件号和序号都存在时,数量必须为1。第%s条数据" % row['序号1'.encode('utf-8')])
        return None

    if data['freezingQuantity'] < 0 or \
            data['freezingQuantity'] > data['quantity']:
        logging.warn("冻结数量应至少为0,且不大于数量。第%s条数据" % row['序号1'.encode('utf-8')])
        return None

    if data['lastCheckDate']:
        try:
            date = datetime.strptime(data['lastCheckDate'], "%Y-%m-%d")
        except Exception as e:
            logging.warn("lastCheckDate is wrong. number:%s" % row['序号1'.encode('utf-8')])
            return

    if data['nextCheckDate']:
        try:
            date = datetime.strptime(data['nextCheckDate'], "%Y-%m-%d")
        except Exception as e:
            logging.warn("nextCheckDate is wrong. number:%s" % row['序号1'.encode('utf-8')])
            return

    am = AirmaterialCategory.query.filter(
        AirmaterialCategory.partNumber == data['partNumber'],
        AirmaterialCategory.category == data['category'],
        AirmaterialCategory.name == data['name']).first()
    if not am:
        logging.warn("该库存对应的航材不存在或库存的名称或类型不对应. 件号:%s名称:%s" % (
            data['partNumber'], data['name'].decode("utf-8")))
        return None
    return data
Code example #26
def spider_closed(spider):
    log.msg('Spider closed: %s' % spider, level=log.INFO)
    print results
Code example #27
    def parseReview(self, response):
        sel = Selector(response)
        review_list = []
        hotel_overview = {}
        # hotel profile

        hotel_url = sel.xpath(
            '/html/body/div/div/div/div[2]/a[1]/@href').extract()

        hotel_overview['url'] = 'http://hotels.ctrip.com' + str(
            hotel_url[0].split('_')[0])
        hotel_overview['total_overall_rating'] = \
            sel.xpath('/html/body/div/div/div/div[1]/div[1]/span[2]/span/text()').extract()[0].strip()

        hotel_overview['per_recomment'] = \
            sel.xpath('/html/body/div/div/div/div[1]/div[1]/span[3]/span/text()').extract()[0].strip()
        hotel_overview['for_biz'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[1]/span/text()').extract()[0].strip())[0]
        hotel_overview['for_friend'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[2]/span/text()').extract()[0].strip())[0]
        hotel_overview['for_couple'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[3]/span/text()').extract()[0].strip())[0]
        hotel_overview['for_family'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[4]/span/text()').extract()[0].strip())[0]
        hotel_overview['for_single'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[5]/span/text()').extract()[0].strip())[0]
        hotel_overview['for_agent'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[6]/span/text()').extract()[0].strip())[0]
        hotel_overview['for_others'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[7]/span/text()').extract()[0].strip())[0]

        hotel_overview['avg_location'] = sel.xpath(
            '/html/body/div/div/div/div[1]/div[3]/p[1]/span/text()').extract(
            )[0].strip()
        hotel_overview['avg_facility'] = sel.xpath(
            '/html/body/div/div/div/div[1]/div[3]/p[2]/span/text()').extract(
            )[0].strip()
        hotel_overview['avg_service'] = sel.xpath(
            '/html/body/div/div/div/div[1]/div[3]/p[3]/span/text()').extract(
            )[0].strip()
        hotel_overview['avg_clean'] = sel.xpath(
            '/html/body/div/div/div/div[1]/div[3]/p[4]/span/text()').extract(
            )[0].strip()
        hotel_overview['all_comment'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="All_Commnet"]/text()').extract()[0].strip())[0]
        hotel_overview['recomment'] = re.findall(
            r'\d+',
            sel.xpath('//*[@id="Recomment"]/text()').extract()[0].strip())[0]
        hotel_overview['no_recomment'] = \
            re.findall(r'\d+', sel.xpath('//*[@id="No_Recoment"]/text()').extract()[0].strip())[0]

        review_list.append(dict(hotel_overview))

        try:
            # Number of reviews per page
            num = len(sel.xpath('/html/body/div/div/div/div[3]/text()'))
            # Hotel Profile

            for flag in xrange(1, num):
                # Review
                item = hotelReview()
                print flag
                author = sel.xpath('/html/body/div/div/div/div[3]/div[' +
                                   str(flag) +
                                   ']/div[1]/p[2]/text()').extract()
                user_type = sel.xpath('/html/body/div/div/div/div[3]/div[' +
                                      str(flag) +
                                      ']/div[1]/p[1]/@title').extract()
                date = sel.xpath('/html/body/div/div/div/div[3]/div[' +
                                 str(flag) + ']/p/span[3]/a/text()').extract()
                room_type = sel.xpath('/html/body/div/div/div/div[3]/div[' +
                                      str(flag) +
                                      ']/div[1]/p[3]/text()').extract()
                review_overall_rating = sel.xpath(
                    '/html/body/div/div/div/div[3]/div[' + str(flag) +
                    ']/p/span[2]/span/text()').extract()
                review_aspect_rating = sel.xpath(
                    '/html/body/div/div/div/div[3]/div[' + str(flag) +
                    ']/p/span[1]/@data-value').extract()
                helpful = sel.xpath('/html/body/div/div/div/div[3]/div[' +
                                    str(flag) +
                                    ']/div[2]/a/span/text()').extract()
                review = sel.xpath('/html/body/div/div/div/div[3]/div[' +
                                   str(flag) + ']/div[2]/text()').extract()

                # print str(response.body).decode('GB2312').encode('utf8')
                filename = response.url.split('?')[1].split('&')[1].split(
                    '=')[1]
                print 'HIIIIIIIIIII'
                print filename
                # item is an object
                item['author'] = author[0].strip()
                item['user_type'] = user_type[0].strip()
                item['date'] = date[0].strip()
                item['room_type'] = room_type[0].strip()
                item['review_overall_rating'] = review_overall_rating[0].strip(
                )
                # """
                # "clean": ["", " ", "卫生:5", " ", "服务:5", " ", "设施:5", " ", "位置:5\r\n", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""]
                # """
                item['location'] = re.findall(
                    r'\d+', review_aspect_rating[0].strip().split(',')[0])[0]
                item['facility'] = re.findall(
                    r'\d+', review_aspect_rating[0].strip().split(',')[1])[0]
                item['service'] = re.findall(
                    r'\d+', review_aspect_rating[0].strip().split(',')[2])[0]
                item['clean'] = re.findall(
                    r'\d+', review_aspect_rating[0].strip().split(',')[3])[0]
                item['review'] = review[0].strip()
                item['helpful'] = re.findall(r'\d+', helpful[0].strip())[0]

                review_list.append(dict(item))

                print review_list
            # Write the file like the pipe
            con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
            self.writeAppendFile(filename, con)
        except:
            log.msg("Review Error !!!!" + response.url, level=log.WARNING)
Code example #28
 def get_linkedin_id(self, url):
     find_index = url.find("www.linkedin.com/")
     if find_index >= 0:
         log.msg(url + " -> " + url[find_index + 13:].replace('/', '-'))
         return url[find_index + 13:].replace('/', '-')
     return None
Code example #29
File: middleware.py Project: yaogdu/analysis
 def process_request(self, request, spider):
     agent = random.choice(AGENTS)
     request.headers['User-Agent'] = agent
     log.msg("Add agent %s" % agent, level=log.DEBUG)
Code example #30
File: middleware.py Project: yaogdu/analysis
    def process_response(self, request, response, spider):

        log.msg("Response Status code is : %s." %repr(response.status), level = log.INFO)
        log.msg("Response Headers is :\n %s." %repr(response.headers), level = log.INFO)
        if response.status == 302 and "Location" in response.headers:
            proxy = request.meta.pop('proxy')
            self.proxies.remove({'ip_port': proxy})
            self.anti.append({'ip_port': proxy})
            log.msg("Proxy %s has been ******ANTIED******, remove it." %proxy, level = log.INFO)
            log.msg("Retry the request %s." %request.url, level = log.INFO)
            log.msg("Original request is %s" %request.url, level = log.INFO)
            log.msg("Response request is %s" %repr(response.request), level = log.INFO)
            return request
        elif response.status == 404:
            proxy = request.meta.pop('proxy')
            self.proxies.remove({'ip_port': proxy})
            self.anti.append({'ip_port': proxy})
            log.msg("Proxy %s is ******NOT WORK******, remove it." %proxy, level = log.INFO)
            log.msg("Retry the request %s." %request.url, level = log.INFO)
            log.msg("Original request is %s" %request.url, level = log.INFO)
            log.msg("Response request is %s" %repr(response.request), level = log.INFO)
            return request
        elif response.status in [403,]:
            proxy = request.meta.pop('proxy')
            self.proxies.remove({'ip_port': proxy})
            self.anti.append({'ip_port': proxy})
            log.msg("Proxy %s has been ******FORBIDDEN******, remove it." %proxy, level = log.INFO)
            log.msg("Retry the request %s." %request.url, level = log.INFO)
            log.msg("Original request is %s" %request.url, level = log.INFO)
            log.msg("Response request is %s" %repr(response.request), level = log.INFO)
            return request
        elif response.status in [500, 501, 502, 503, 504, 505]:
            proxy = request.meta.pop('proxy')
            self.proxies.remove({'ip_port': proxy})
            self.anti.append({'ip_port': proxy})
            log.msg("Proxy %s is ******NOT WORK******, remove it." %proxy, level = log.INFO)
            log.msg("Retry the request %s." %request.url, level = log.INFO)
            log.msg("Original request is %s" %request.url, level = log.INFO)
            log.msg("Response request is %s" %repr(response.request), level = log.INFO)
            return request

        else:
            return response
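
Downloader middlewares like the proxy and retry handlers above are enabled through the Scrapy project settings; a hypothetical entry (module path and priority value are assumptions) looks like:

# settings.py -- hypothetical module path and priority value
DOWNLOADER_MIDDLEWARES = {
    'myproject.middleware.ProxyMiddleware': 543,
}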