Example #1
    def process_item(self, item, spider):
        meta = item['meta_dict']
        if item['error']:
            # rows that failed to parse go to the error collection
            du.error_insert(item, meta)
        else:
            # dispatch the parsed result to the DB handler named by the item
            db_handler_str = item['db_handler']
            du.db_handler_dict[db_handler_str](item['result'], meta)
        return item
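The process_item above is a Scrapy item pipeline method: error items go to an error collection, everything else is dispatched through a name-to-handler table in du. As a rough, hypothetical sketch (the pipeline class path and handler names below are assumptions, not taken from the project), it would be wired up and backed by something like:

# settings.py -- enabling the pipeline (module/class path is hypothetical)
ITEM_PIPELINES = {
    'news_scraper.pipelines.MongoInsertPipeline': 300,
}

# du.py -- one possible shape of the dispatch table used above (assumption):
# each handler receives the parsed result plus the item's meta dict
def insert_news(result, meta):
    ...  # write to the news collection

def insert_announcement(result, meta):
    ...  # write to the announcements collection

db_handler_dict = {
    'news': insert_news,
    'announcement': insert_announcement,
}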
Example #2
    def __init__(self):
        super().__init__()
        self.exchange = ExchangeParser()

        # parameters
        self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
        self.pdfs_dir = utils.PDF_DIR + self.exchange.uptick_name + '/'
        utils.create_pdf_dir(self.pdfs_dir)
        self.latest_date = du.get_latest_date_time(self.exchange.uptick_name,
                                                   self.exchange.tzinfo)

    def parse_news_page(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        stop_scrape_flag = False
        news_list = self.exchange.get_news_list(response)
        if not news_list:
            raise Exception('Error: website structure has changed! '
                            'Maintenance needed!')
        for i, news_row in enumerate(news_list):
            # has to assign new dict every loop
            # otherwise mongodb raises dup key (Id) error
            item = {
                'mkt': self.exchange.uptick_name,
                'mkt_id': self.mkt_id,
                'tzinfo': self.exchange.tzinfo,
                'error': True
            }
            try:  # a row may not be a real news row; parse failures are caught below
                date_time, url, title, misc_fields_dict = self.exchange.get_news_fields(
                    news_row)

                # skip news that is older than the latest record already in the database
                if self.latest_date and date_time < self.latest_date:
                    # todo: detect disordered news list
                    # shenzhen news list has disordered news
                    # stop_scrape_flag = True
                    continue

                # generate file name by date and number of events on that date
                filename = du.get_filename(date_time,
                                           self.exchange.uptick_name)

                # insert record to mongodb
                item['date_time'] = date_time
                item['title'] = title
                item['url'] = url
                item['unique_id'] = filename
                item['error'] = False
                item.update(misc_fields_dict)
                yield item

                utils.save_pdf_url_or_chrome(url, self.pdfs_dir + filename)

            except Exception as e:  # not a news row (or parse failure): record the error, then skip
                item['error'] = {
                    'news_row_html': news_row.extract(),
                    'error_message': '%s: %s' % (e.__class__, str(e)),
                    'row_no': i,
                    'traceback': traceback.format_exc(),
                    'url': response.url
                }
                yield item
                continue

        if self.exchange.keep_follow_pagination and not stop_scrape_flag:
            for url, meta in self.exchange.get_pagination_urls(response):
                yield scrapy.Request(url,
                                     callback=self.parse_news_page,
                                     meta=meta)
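The spider above delegates all site-specific parsing to self.exchange, so its behaviour is easiest to follow against the interface it assumes. A minimal sketch of that assumed ExchangeParser interface, read off the calls in the code (bodies and placeholder values are not the project's implementation):

class ExchangeParser:
    # identifiers and flags the spider reads directly (values here are placeholders)
    uptick_name = 'example_exchange'
    tzinfo = 'Asia/Shanghai'
    keep_follow_pagination = True

    def get_news_list(self, response):
        """Return the list of news-row selectors found on a listing page."""
        raise NotImplementedError

    def get_news_fields(self, news_row):
        """Return (date_time, url, title, misc_fields_dict) for one row."""
        raise NotImplementedError

    def get_pagination_urls(self, response):
        """Yield (url, meta) pairs for the follow-up listing pages."""
        raise NotImplementedError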
Example #4
import datetime
import time

import dateutil.parser as dp
import pytz

# `db` (MongoDB access) and `get_ip_list` (proxy provider) come from the
# surrounding project module and are not shown here.


def update_ip_list():
    try:
        bjtz = pytz.timezone('Asia/Shanghai')
        sydtz = pytz.timezone('Australia/Sydney')

        to_remove_list = []
        for i in db.proxy_col.find():
            # attach timezones with localize()/now(tz): pytz zones attached via
            # replace(tzinfo=...) get a wrong (LMT) offset, and now(tz) already
            # accounts for DST, so no manual adjustment is needed
            exp_time = bjtz.localize(dp.parse(i['expire_time']))
            now = datetime.datetime.now(sydtz)
            secs = (exp_time - now).total_seconds()
            if secs < 10:  # proxy expires within 10 seconds: mark for removal
                print("Calculation")
                print(i)
                print(exp_time)
                print(now)
                print(secs)
                to_remove_list.append(i)

        res = db.proxy_col.delete_many(
            {'_id': {
                '$in': [i['_id'] for i in to_remove_list]
            }})
        num = res.deleted_count
        if num != len(to_remove_list):
            import ipdb
            ipdb.set_trace(context=7)

        # insert new ips
        if num:
            print("Deleted :")
            for i in to_remove_list:
                print(i)
            ip_list = get_ip_list(num)
            print("Insert " + str(ip_list))
            db.insert_proxy(ip_list)
    except Exception:  # drop into a debugger on any unexpected failure
        import ipdb
        ipdb.set_trace(context=7)
Example #5
    def __init__(self):
        super().__init__()
        self.exchange = ExchangeParser()

        # parameters
        self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
        # todo: change uptick_name to col_name
        self.pdfs_dir = utils.PDF_DIR + self.exchange.col_name + '/'
        utils.create_pdf_dir(self.pdfs_dir)
        # private
        # if self.exchange.is_multi_source_exchange:
        self.latest_date = utils.create_date_time_tzinfo(
            '30 DEC 2017', self.exchange.tzinfo)
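Here the start date is pinned to a fixed value instead of the latest database record. The helper utils.create_date_time_tzinfo is not shown in these examples; a hedged sketch of what such a helper could look like (assuming the timezone is passed as a name string), using dateutil for parsing and pytz.localize() to avoid the wrong-offset pitfall of replace(tzinfo=...):

import dateutil.parser as dp
import pytz

def create_date_time_tzinfo(date_str, tz_name):
    # parse e.g. '30 DEC 2017' and attach the exchange timezone correctly
    naive = dp.parse(date_str)
    return pytz.timezone(tz_name).localize(naive)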
Example #6
    def closed(self, reason):
        self.logger.info('spider closed: ' + reason)
        # release the shared MongoDB connection when the spider finishes
        du.close_mongo_access()
Example #7
    def process_request(self, request, spider):
        # route every outgoing request through a random proxy from the pool
        ip = random.choice(db.get_proxy_list())
        request.meta['proxy'] = ip
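process_request here belongs to a Scrapy downloader middleware that routes each request through a random proxy. Two points worth noting: the middleware only takes effect once it is registered in the project settings, and Scrapy expects request.meta['proxy'] to be a scheme-qualified address such as 'http://1.2.3.4:8080', so get_proxy_list() is assumed to return strings in that form. A sketch of the registration (the module/class path is hypothetical):

# settings.py -- registering the proxy middleware (path is hypothetical)
DOWNLOADER_MIDDLEWARES = {
    'news_scraper.middlewares.RandomProxyMiddleware': 543,
}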
Example #8
                to_remove_list.append(i)

        res = db.proxy_col.delete_many(
            {'_id': {
                '$in': [i['_id'] for i in to_remove_list]
            }})
        num = res.deleted_count
        if num != len(to_remove_list):
            import ipdb
            ipdb.set_trace(context=7)

        # insert new ips
        if num:
            print("Deleted :")
            for i in to_remove_list:
                print(i)
            ip_list = get_ip_list(num)
            print("Insert " + str(ip_list))
            db.insert_proxy(ip_list)
    except Exception:  # drop into a debugger on any unexpected failure
        import ipdb
        ipdb.set_trace(context=7)


if __name__ == "__main__":
    # seed the pool with 20 proxies, then keep it topped up every 5 seconds
    db.insert_proxy(get_ip_list(20))
    time.sleep(2)
    while True:
        update_ip_list()
        time.sleep(5)
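update_ip_list() and the proxy middleware both go through a small db module (proxy_col, insert_proxy, get_proxy_list) that these examples do not show. A hedged sketch of a MongoDB-backed version, with assumed database, collection and field names:

import pymongo

_client = pymongo.MongoClient('mongodb://localhost:27017')
proxy_col = _client['scraper']['proxy_pool']

def insert_proxy(ip_list):
    # each proxy is stored as its own document, e.g.
    # {'ip': 'http://1.2.3.4:8080', 'expire_time': '2018-01-01 12:00:00'}
    if ip_list:
        proxy_col.insert_many(ip_list)

def get_proxy_list():
    # return just the proxy addresses for random.choice() in the middleware
    return [doc['ip'] for doc in proxy_col.find()]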