def process_item(self, item, spider):
    meta = item['meta_dict']
    if item['error']:
        du.error_insert(item, meta)
    else:
        db_handler_str = item['db_handler']
        du.db_handler_dict[db_handler_str](item['result'], meta)
    return item
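# A minimal sketch, not the repo's actual code, of the dispatch table that
# process_item relies on: du.db_handler_dict maps the string stored in
# item['db_handler'] to an insert function taking (result, meta). The handler
# names and functions below are hypothetical examples for illustration only.

def announcement_insert(result, meta):
    # hypothetical: write one announcement document to its collection
    print('inserting announcement', result, meta)


def holiday_insert(result, meta):
    # hypothetical: write one trading-holiday document to its collection
    print('inserting holiday', result, meta)


db_handler_dict = {
    'announcement': announcement_insert,
    'holiday': holiday_insert,
}

# With such a mapping, an item like
# {'error': False, 'db_handler': 'announcement', 'result': {...}, 'meta_dict': {...}}
# is routed to announcement_insert(result, meta), while items with a truthy
# 'error' field go to du.error_insert instead.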
def __init__(self):
    super().__init__()
    self.exchange = ExchangeParser()
    # parameters
    self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
    self.pdfs_dir = utils.PDF_DIR + self.exchange.uptick_name + '/'
    utils.create_pdf_dir(self.pdfs_dir)
    self.latest_date = du.get_latest_date_time(self.exchange.uptick_name,
                                               self.exchange.tzinfo)
def parse_news_page(self, response):
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    stop_scrape_flag = False
    news_list = self.exchange.get_news_list(response)
    if not news_list:
        raise Exception('Error: Website Structure Has Been Changed!' +
                        ' Maintenance Needed!')
    for i, news_row in enumerate(news_list):
        # has to assign a new dict every loop,
        # otherwise mongodb raises a duplicate key (Id) error
        item = {
            'mkt': self.exchange.uptick_name,
            'mkt_id': self.mkt_id,
            'tzinfo': self.exchange.tzinfo,
            'error': True
        }
        try:
            # news row won't have error
            date_time, url, title, misc_fields_dict = \
                self.exchange.get_news_fields(news_row)
            # database has previous news and scraped news is older than database
            if self.latest_date and date_time < self.latest_date:
                # todo: detect disordered news list
                # shenzhen news list has disordered news
                # stop_scrape_flag = True
                continue
            # generate file name by date and number of events on that date
            filename = du.get_filename(date_time, self.exchange.uptick_name)
            # insert record to mongodb
            item['date_time'] = date_time
            item['title'] = title
            item['url'] = url
            item['unique_id'] = filename
            item['error'] = False
            item.update(misc_fields_dict)
            yield item
            utils.save_pdf_url_or_chrome(url, self.pdfs_dir + filename)
        except Exception as e:
            # not a news row, skip it
            item['error'] = {
                'news_row_html': news_row.extract(),
                'error_message': '%s: %s' % (e.__class__, str(e)),
                'row_no': i,
                'traceback': traceback.format_exc(),
                'url': response.url
            }
            yield item
            continue
    if self.exchange.keep_follow_pagination and not stop_scrape_flag:
        for url, meta in self.exchange.get_pagination_urls(response):
            yield scrapy.Request(url, callback=self.parse_news_page, meta=meta)
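# A sketch of the ExchangeParser interface implied by the calls in
# parse_news_page above; the attribute values, selectors, and method bodies
# are placeholders, not the repo's real implementation.

class ExchangeParser:
    uptick_name = 'example_exchange'   # market / collection identifier
    tzinfo = 'Asia/Shanghai'           # timezone tag stored with each item
    keep_follow_pagination = True      # whether to follow pagination links

    def get_news_list(self, response):
        # return the list of row selectors, one per announcement
        return response.css('table.news tr')

    def get_news_fields(self, news_row):
        # return (date_time, url, title, misc_fields_dict) for one row
        raise NotImplementedError

    def get_pagination_urls(self, response):
        # yield (url, meta) pairs for the next listing pages
        return []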
def update_ip_list():
    try:
        bjtz = pytz.timezone('Asia/Shanghai')
        sydtz = pytz.timezone('Australia/Sydney')
        to_remove_list = []
        # collect proxies that expire within the next 10 seconds
        for i in db.proxy_col.find():
            # expire_time strings are interpreted as Beijing time
            exp_time = dp.parse(i['expire_time']).replace(tzinfo=bjtz)
            now = datetime.datetime.now().replace(tzinfo=sydtz)
            if time.localtime().tm_isdst:
                # subtract an hour while local (Sydney) daylight saving is in effect
                now = now + datetime.timedelta(hours=-1)
            secs = (exp_time - now).total_seconds()
            if secs < 10:
                print("Calculation")
                print(i)
                print(exp_time)
                print(now)
                print(secs)
                to_remove_list.append(i)
        res = db.proxy_col.delete_many(
            {'_id': {'$in': [i['_id'] for i in to_remove_list]}})
        num = res.deleted_count
        if num != len(to_remove_list):
            import ipdb
            ipdb.set_trace(context=7)
        # insert new ips to replace the ones just removed
        if num:
            print("Deleted :")
            for i in to_remove_list:
                print(i)
            ip_list = get_ip_list(num)
            print("Insert " + str(ip_list))
            db.insert_proxy(ip_list)
    except:
        import ipdb
        ipdb.set_trace(context=7)
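# A sketch (assumption) of the proxy document shape update_ip_list works with.
# The code above only requires an '_id' (added by MongoDB) and an 'expire_time'
# string that dateutil can parse and that is interpreted as Beijing time; the
# 'ip' field name below is a guess to match what the downloader middleware puts
# into request.meta['proxy'], since the real schema is not shown in this excerpt.

example_proxy_doc = {
    'ip': 'http://1.2.3.4:8080',           # proxy URL handed to Scrapy requests
    'expire_time': '2018-01-05 14:30:00',  # provider-supplied expiry timestamp
}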
def __init__(self):
    super().__init__()
    self.exchange = ExchangeParser()
    # parameters
    self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
    # todo: change uptick_name to col_name
    self.pdfs_dir = utils.PDF_DIR + self.exchange.col_name + '/'
    utils.create_pdf_dir(self.pdfs_dir)
    # private
    # if self.exchange.is_multi_source_exchange:
    self.latest_date = utils.create_date_time_tzinfo(
        '30 DEC 2017', self.exchange.tzinfo)
def closed(self, reason):
    self.logger.info('spider closed: ' + reason)
    du.close_mongo_access()
def process_request(self, request, spider):
    ip = random.choice(db.get_proxy_list())
    request.meta['proxy'] = ip
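# How a middleware like this is typically enabled in a Scrapy project's
# settings.py; the module path and priority are assumptions for illustration,
# not taken from this repo.

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomProxyMiddleware': 543,
}

# Scrapy's proxy support reads request.meta['proxy'] as a proxy URL such as
# 'http://1.2.3.4:8080', so db.get_proxy_list() should return entries in that form.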
if __name__ == "__main__":
    db.insert_proxy(get_ip_list(20))
    time.sleep(2)
    while True:
        update_ip_list()
        time.sleep(5)