def test():
    # logging.basicConfig(level=logging.INFO)
    query = HKEX_API(from_date=n_yearsago(n=1), to_date=today())
    for data in query.data:
        try:
            url = data.file_link
            # url = 'https://www.dropbox.com/'
            print(url)
            pdf = PDF(url)
            pdf_obj = pdf.pdf_obj
            f = AuditFee(pdf_obj)
            tab_sum = []
            for table in f.tables:
                tab_sum.append(table.summary)
        except KeyboardInterrupt:
            break
        except Exception as e:
            # print(e)
            result = {
                'table_summary': e,
                'ERROR': True,
                'url': url,
            }
            write_to_csv(result, 'result_3.csv')
            continue
        else:
            # print('ok')
            result = {
                'table_summary': list(filter(None, tab_sum)),
                'ERROR': None,
                'url': url,
            }
            write_to_csv(result, 'result_3.csv')
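# A minimal sketch of the write_to_csv() helper used above, under the assumption
# that it simply appends one result dict as a CSV row (this is not necessarily
# the project's actual implementation). It writes a header the first time the
# file is created.
import csv
import os


def write_to_csv(row: dict, path: str) -> None:
    # Append a single dict as one CSV row, adding a header for a new file.
    file_exists = os.path.isfile(path)
    with open(path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(row.keys()))
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)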
def set_payloads(from_date: str, to_date: str = today('%Y%m%d'), stock_id: str = None, doc: str = None) -> dict:
    HKEX_API.date_fmt_validator(from_date, '%Y%m%d')
    HKEX_API.date_fmt_validator(to_date, '%Y%m%d')
    if over_a_year(from_date=from_date) and stock_id is None:
        ytd = n_yearsago(1)
        raise ValueError(
            f'A query spanning more than a year must specify stock_id, e.g. "1"; a global query can only go back to "{ytd}"'
        )
    payloads = {
        'sortDir': '0',
        'sortByOptions': 'DateTime',
        'category': '0',
        'market': 'SEHK',
        'stockId': stock_id or '-1',
        'documentType': '-1',
        'fromDate': from_date,
        'toDate': to_date,
        'title': '',
        'searchType': '1',
        't1code': '40000',
        't2Gcode': '-2',
        # 40100: annual report, 40200: half-year report, 40300: quarterly report
        't2code': HKEX_API.doc_code.get(doc, None) or '40100',
        'rowRange': '100',
        'lang': 'EN',
    }
    return payloads
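# A minimal sketch of the over_a_year() check relied on above, assuming it simply
# tests whether from_date lies more than 365 days in the past (the project's real
# helper may differ).
from datetime import datetime, timedelta


def over_a_year(from_date: str, fmt: str = '%Y%m%d') -> bool:
    # True when from_date is more than a year before today.
    return datetime.strptime(from_date, fmt) < datetime.now() - timedelta(days=365)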
def get_data(self) -> list:
    return HKEX_API.call_api(endpoint=HKEX_API.endpoint, payloads=self.payloads)


if __name__ == '__main__':
    # pass
    # print(_get_data())
    # query = HKEX_API(from_date=yesterday(), to_date=today())
    query = HKEX_API(from_date=n_yearsago(n=1), to_date=today())
    print(query.doc)
    print(query.payloads['t2code'])
    print(len(query.get_data()))
    print(len([i for i in query.data]))
    # query = HKEX_API(from_date=n_yearsago(n=1), to_date=today(), doc='half_year_report')
    print('>>>>>>')
    query.doc = 'half_year_report'
    print(query.doc)
    print(query.payloads['t2code'])
    print(len(query.get_data()))
    print(len([i for i in query.data]))
    print('>>>>>>')
    print(query.from_date)
    query.from_date = yesterday()
    print(query.from_date)
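# A minimal, hypothetical sketch of what HKEX_API.call_api() could look like:
# a GET request against the search endpoint with the payloads as query
# parameters. The endpoint URL and the response shape are assumptions, not the
# project's confirmed implementation.
import requests


def call_api(endpoint: str, payloads: dict) -> list:
    # Fire the query and return the parsed result list (response shape assumed).
    resp = requests.get(endpoint, params=payloads, timeout=30)
    resp.raise_for_status()
    return resp.json().get('result', [])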
def start(action):
    # Load the URLs already fetched today
    global fetched_url_list
    json_txt = helper.readFile('./logs/goat-%s.json' % helper.today())
    try:
        if json_txt:
            fetched_url_list = json.loads(json_txt)
    except:
        fetched_url_list = []
    f = open('./keyword.json')
    txt = f.read()
    f.close()
    key_list = json.loads(txt)
    # De-duplicate the keywords
    # key_list = list(set(key_list))
    key_list = helper.delRepeat(key_list)
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue holding the data collected by the worker processes
    q = Queue()
    # Queue holding page links that errored out
    error_page_url_queue = Queue()
    # TODO: key_list = ['DUNK']
    for key in key_list:
        key = key.replace('\n', '')
        helper.log('[INFO] now key = ' + key, platform)
        # Men's shoes, price low to high
        if fetch_page(1, 'PRICE_LOW_HIGH', key, q, error_page_url_queue, crawl_counter):
            helper.log('[INFO] => fetch_page is done, 1, PRICE_LOW_HIGH', platform)
        # Men's shoes, price high to low
        if fetch_page(1, 'PRICE_HIGH_LOW', key, q, error_page_url_queue, crawl_counter):
            helper.log('[INFO] => fetch_page is done, 1, PRICE_HIGH_LOW', platform)
        # Women's shoes, price low to high
        if fetch_page(2, 'PRICE_LOW_HIGH', key, q, error_page_url_queue, crawl_counter):
            helper.log('[INFO] => fetch_page is done, 2, PRICE_LOW_HIGH', platform)
        # Women's shoes, price high to low
        if fetch_page(2, 'PRICE_HIGH_LOW', key, q, error_page_url_queue, crawl_counter):
            helper.log('[INFO] => fetch_page is done, 2, PRICE_HIGH_LOW', platform)
        # Youth shoes, price low to high
        if fetch_page(5, 'PRICE_LOW_HIGH', key, q, error_page_url_queue, crawl_counter):
            helper.log('[INFO] => fetch_page is done, 5, PRICE_LOW_HIGH', platform)
        # Youth shoes, price high to low
        if fetch_page(5, 'PRICE_HIGH_LOW', key, q, error_page_url_queue, crawl_counter):
            helper.log('[INFO] => fetch_page is done, 5, PRICE_HIGH_LOW', platform)
        # # Infant shoes, price low to high
        # if fetch_page(6, 'PRICE_LOW_HIGH', key, q, error_page_url_queue, crawl_counter):
        #     helper.log('[INFO] => fetch_page is done, 6, PRICE_LOW_HIGH', platform)
        # # Infant shoes, price high to low
        # fetch_page(6, 'PRICE_HIGH_LOW', key, q, error_page_url_queue, crawl_counter)
        # helper.log('[INFO] => fetch_page is done, 6, PRICE_HIGH_LOW', platform)
    # goods_spider = GoodsSpider('https://www.goat.com/sneakers/force-savage-pro-baseball-cleat-880144-410', 1, Queue(), crawl_counter)
    # goods_spider.start()
    # Re-process links that errored out
    # while not error_page_url_queue.empty():
    #     error_page_url_list = []
    #     while not error_page_url_queue.empty():
    #         error_page_url_list.append(error_page_url_queue.get())
    #     error_page_men_url_list = [{'url': url_data.get('url'), 'count': url_data.get('count')} for url_data in error_page_url_list if url_data.get('gender') == 1]
    #     fetch_page([{'url': url_data.get('url'), 'count': url_data.get('count')} for url_data in error_page_men_url_list], 1, q, error_page_url_queue, {
    #         'mnid': 'men_shoes',
    #         'Ns': 'sku.bestSeller | 1',
    #         'isAjax': 'true'
    #     }, crawl_counter)
    helper.log('done', platform)
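# A minimal sketch of the helper.delRepeat() de-duplication used above, assuming
# it removes repeated keywords while preserving their original order (unlike
# list(set(...)), which would reorder them). The name del_repeat is illustrative
# only, not the project's actual helper.
def del_repeat(items):
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]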
def run(self):
    '''Parse the page source'''
    time.sleep(3.6)
    global platform
    global error_detail_url
    try:
        slug = self.url.replace('https://www.goat.com/sneakers/', '')
        html = helper.get(self.url, returnText=True, platform=platform)
        if html:
            json_data = re.compile(r'window.__context__.*')
            json_data = json_data.findall(html)[0]
            json_data = json_data.replace('window.__context__ = ', '')
            json_data = json_data.replace('</script>', '')
            json_data = json.loads(json_data)
            json_data = json_data.get('default_store')
            json_data = json_data.get('product-templates')
            product_json = json_data.get('slug_map').get(slug)
            name = product_json.get('name')
            number = product_json.get('sku')
            color_value = product_json.get('details')
            color_name = name.split('\'')[1] if '\'' in name else ''
            size_list = product_json.get('formatted_available_sizes_new_v2')
            size_price_list = [{
                'size': float(data.get('size')),
                'price': float(data.get('price_cents') / 100),
                'isInStock': True
            } for data in size_list]
            # print({
            #     'name': name,
            #     'number': number,
            #     'color_value': color_value,
            #     'color_name': color_name,
            #     'size_price_list': size_price_list,
            # })
            img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
            if not img_downloaded:
                img_url = product_json.get('original_picture_url')
                result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
                if result == 1:
                    # Upload to Qiniu
                    qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
                    img_downloaded = True
            mongo.insert_pending_goods(
                name, number, self.url, size_price_list, ['%s.jpg' % number],
                self.gender, color_value, platform, '5bbf4561c7e854cab45218ba',
                self.crawl_counter, color_name, img_downloaded)
            fetched_url_list.append(self.url)
            helper.writeFile(json.dumps(fetched_url_list), './logs/goat-%s.json' % helper.today())
        else:
            error_counter = error_detail_url.get(self.url, 1)
            error_detail_url[self.url] = error_counter + 1
            helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
            if error_counter < 3:
                self.q.put(self.url)
    except Exception as e:
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
        helper.log(e, platform)
        if error_counter < 3:
            self.q.put(self.url)
    finally:
        helper.log('[INFO] %s is done' % self.url, platform)
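# A self-contained sketch of the window.__context__ extraction performed in
# run(), pulled out as a standalone function. The regex and the surrounding
# markup are assumptions based on the replace() calls above; the real page
# structure may differ.
import json
import re


def extract_context(html: str) -> dict:
    # Capture the JSON assigned to window.__context__ inside a <script> tag.
    match = re.search(r'window\.__context__\s*=\s*(\{.*?\})\s*</script>', html, re.S)
    return json.loads(match.group(1)) if match else {}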
def parse_training_arguments():
    parser = argparse.ArgumentParser(
        description="GAN Training",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--subset-fraction", type=float, default=1.,
        help="fraction of the full data being used (for testing out the code)")
    parser.add_argument(
        "--num-workers", type=int, default=0,
        help="number of workers in torch.DataLoader for parallel loading")
    parser.add_argument("--n-epoch", type=int, default=25,
                        help="number of epochs for training")
    parser.add_argument("--batch-size", type=int, default=32,
                        help="batch size")
    parser.add_argument("--feature-dim", type=int, default=100,
                        help="feature space (noise) dimension")
    parser.add_argument("--lr", type=float, default=None,
                        help="learning rate (overwrites --lrG and --lrD)")
    parser.add_argument(
        "--time-scale", type=float, default=1.,
        help=("time scale in learning rate (used with --lr to overwrite --lrD and --lrG), see\n"
              "\t- GANs trained by a two time-scale update rule converge to a local Nash equilibrium. "
              "M. Heusel, H. Ramsauer, T. Unterthiner, B. Nessler, and S. Hochreiter - 2017\n"
              "\t- https://sthalles.github.io/advanced_gans/ suggests using 4."))
    parser.add_argument("--lrG", type=float, default=1e-4,
                        help="generator learning rate")
    parser.add_argument("--lrD", type=float, default=1e-4,
                        help="discriminator learning rate")
    parser.add_argument("--net-size", type=int, default=64,
                        help="network size")
    parser.add_argument("--G-batch", action="store_true",
                        help="batch normalization in training G")
    parser.add_argument("--D-batch", action="store_true",
                        help="batch normalization in training D")
    parser.add_argument("--alpha-region", type=float, default=1.0,
                        help="weight for the region (C, T, L) classification")  # ignored when logging
    parser.add_argument("--n-step-eval", type=int, default=10,
                        help="number of epochs between evaluations")
    parser.add_argument("--eval-size", type=int, default=2000,
                        help="number of test images generated for evaluation")
    parser.add_argument("--exp", type=str, default=today(),
                        help="experiment suffix")
    parser.add_argument("--run-name", type=str, default=now(),
                        help="run name (defaults to a time tag)")
    parser.add_argument(
        "--continued", action="store_true",
        help=("continue from checkpoint:\n"
              "automatically activated if --checkpoint is provided; \n"
              "uses the default checkpoint (EXP/RUN_NAME.ckpt) if no --checkpoint is provided"))
    parser.add_argument("--checkpoint", type=str, default=None,
                        help="the checkpoint to be loaded")
    parser.add_argument("--no-pbar", action="store_true",
                        help="no progress bar")
    parser.add_argument(
        "--diter", type=int, default=1,
        help="number of discriminator iterations per generator iteration")
    parser.add_argument("--data-augm", action="store_true",
                        help="run small data augmentation")
    return parser
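# A hypothetical helper sketching how --lr and --time-scale are presumably
# consumed downstream, per the help text above: --lr overwrites both rates and
# --time-scale scales the discriminator rate, as in the two time-scale update
# rule (TTUR). This is an assumption, not the project's confirmed training code.
def resolve_learning_rates(args):
    # Apply --lr / --time-scale on top of --lrG / --lrD when --lr is given.
    if args.lr is not None:
        args.lrG = args.lr
        args.lrD = args.lr * args.time_scale
    return args


# Usage sketch:
#     args = resolve_learning_rates(parse_training_arguments().parse_args())
#     print(args.lrG, args.lrD)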
def find_file(url):
    query = HKEX_API(from_date=n_yearsago(n=1), to_date=today())
    # 0-based index of the entry whose file_link matches the given url
    idx = [i for i, data in enumerate(query.data) if data.file_link == url][0]
    print(idx + 1)
    print([data for data in query.data][idx])