def test():
    # logging.basicConfig(level=logging.INFO)
    query = HKEX_API(from_date=n_yearsago(n=1), to_date=today())
    for data in query.data:
        try:
            url = data.file_link
            # url = 'https://www.dropbox.com/'
            print(url)

            pdf = PDF(url)
            pdf_obj = pdf.pdf_obj
            f = AuditFee(pdf_obj)
            tab_sum = []
            for table in f.tables:
                tab_sum.append(table.summary)
        except KeyboardInterrupt:
            break
        except Exception as e:
            # print(e)
            result = {
                'table_summary': e,
                'ERROR': True,
                'url': url,
            }
            write_to_csv(result, 'result_3.csv')
            continue
        else:
            # print('ok')
            result = {
                'table_summary': list(filter(None, tab_sum)),
                'ERROR': None,
                'url': url,
            }
            write_to_csv(result, 'result_3.csv')
Example #2
    def set_payloads(from_date: str,
                     to_date: str = today('%Y%m%d'),
                     stock_id: str = None,
                     doc: str = None) -> dict:

        HKEX_API.date_fmt_validator(from_date, '%Y%m%d')
        HKEX_API.date_fmt_validator(to_date, '%Y%m%d')

        if over_a_year(from_date=from_date) and stock_id is None:
            ytd = n_yearsago(1)
            raise ValueError(
                f'A query spanning more than a year must specify a stock_id (e.g. "1"); '
                f'a global query can only go back as far as "{ytd}"'
            )

        payloads = {
            'sortDir': '0',
            'sortByOptions': 'DateTime',
            'category': '0',
            'market': 'SEHK',
            'stockId': stock_id or '-1',
            'documentType': '-1',
            'fromDate': from_date,
            'toDate': to_date,
            'title': '',
            'searchType': '1',
            't1code': '40000',
            't2Gcode': '-2',
            't2code': HKEX_API.doc_code.get(doc, None) or
            '40100',  # 40100: annual report, 40200: half_year_report, 40300: quarterly_report
            'rowRange': '100',
            'lang': 'EN'
        }

        return payloads
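
The dict returned above is meant to be sent as query parameters to the HKEX news search endpoint; Example #3 below shows the class doing this through HKEX_API.call_api. As a rough illustration only, a direct request could look like the sketch below (it assumes set_payloads is callable as a static helper, that HKEX_API.endpoint holds the search URL, and that n_yearsago/today return '%Y%m%d' strings as the validator expects):

import requests

# Hypothetical direct call; HKEX_API.call_api (Example #3) wraps this pattern.
payloads = HKEX_API.set_payloads(from_date=n_yearsago(n=1), to_date=today('%Y%m%d'))
resp = requests.get(HKEX_API.endpoint, params=payloads, timeout=30)
resp.raise_for_status()
print(resp.json())  # assuming the endpoint returns a JSON list of filings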
Example #3

    def get_data(self) -> list:
        return HKEX_API.call_api(endpoint=HKEX_API.endpoint,
                                 payloads=self.payloads)


if __name__ == '__main__':
    # pass
    # print(_get_data())
    # query = HKEX_API(from_date=yesterday(), to_date=today())
    query = HKEX_API(from_date=n_yearsago(n=1), to_date=today())
    print(query.doc)
    print(query.payloads['t2code'])
    print(len(query.get_data()))
    print(len([i for i in query.data]))
    # query = HKEX_API(from_date=n_yearsago(n=1), to_date=today(), doc='half_year_report')
    print('>>>>>>')
    query.doc = 'half_year_report'
    print(query.doc)
    print(query.payloads['t2code'])
    print(len(query.get_data()))
    print(len([i for i in query.data]))
    print('>>>>>>')
    print(query.from_date)
    query.from_date = yesterday()
    print(query.from_date)
Example #4
def start(action):
    # Load the URLs that were already crawled today
    global fetched_url_list
    json_txt = helper.readFile('./logs/goat-%s.json' % helper.today())
    try:
        if json_txt:
            fetched_url_list = json.loads(json_txt)
    except Exception:  # fall back to an empty list if the log cannot be parsed
        fetched_url_list = []
    f = open('./keyword.json')
    txt = f.read()
    f.close()
    key_list = json.loads(txt)
    # Deduplicate the keyword list
    # key_list = list(set(key_list))
    key_list = helper.delRepeat(key_list)
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue to hold the data fetched by the worker processes
    q = Queue()
    # Queue for page URLs that failed to fetch
    error_page_url_queue = Queue()

    # TODO:
    key_list = ['DUNK']
    for key in key_list:
        key = key.replace('\n', '')
        helper.log('[INFO] now key = ' + key, platform)
        # Men's shoes first, price low to high
        if fetch_page(1, 'PRICE_LOW_HIGH', key, q, error_page_url_queue,
                      crawl_counter):
            helper.log('[INFO] => fetch_page is done, 1, PRICE_LOW_HIGH',
                       platform)
        # Men's shoes, price high to low
        if fetch_page(1, 'PRICE_HIGH_LOW', key, q, error_page_url_queue,
                      crawl_counter):
            helper.log('[INFO] => fetch_page is done, 1, PRICE_HIGH_LOW',
                       platform)
        # Women's shoes, price low to high
        if fetch_page(2, 'PRICE_LOW_HIGH', key, q, error_page_url_queue,
                      crawl_counter):
            helper.log('[INFO] => fetch_page is done, 2, PRICE_LOW_HIGH',
                       platform)
        # Women's shoes, price high to low
        if fetch_page(2, 'PRICE_HIGH_LOW', key, q, error_page_url_queue,
                      crawl_counter):
            helper.log('[INFO] => fetch_page is done, 2, PRICE_HIGH_LOW',
                       platform)
        # Youth shoes, price low to high
        if fetch_page(5, 'PRICE_LOW_HIGH', key, q, error_page_url_queue,
                      crawl_counter):
            helper.log('[INFO] => fetch_page is done, 5, PRICE_LOW_HIGH',
                       platform)
        # Youth shoes, price high to low
        if fetch_page(5, 'PRICE_HIGH_LOW', key, q, error_page_url_queue,
                      crawl_counter):
            helper.log('[INFO] => fetch_page is done, 5, PRICE_HIGH_LOW',
                       platform)
        #     # Infant shoes, price low to high
        #     if fetch_page(6, 'PRICE_LOW_HIGH', key, q, error_page_url_queue, crawl_counter):
        #         helper.log('[INFO] => fetch_page is done, 6, PRICE_LOW_HIGH', platform)
        #         # Infant shoes, price high to low
        #         fetch_page(6, 'PRICE_HIGH_LOW', key, q, error_page_url_queue, crawl_counter)
        #         helper.log('[INFO] => fetch_page is done, 6, PRICE_HIGH_LOW', platform)

    # goods_spider = GoodsSpider('https://www.goat.com/sneakers/force-savage-pro-baseball-cleat-880144-410', 1, Queue(), crawl_counter)
    # goods_spider.start()

    # Handle the page URLs that errored out
    # while not error_page_url_queue.empty():
    #     error_page_url_list = []
    #     while not error_page_url_queue.empty():
    #         error_page_url_list.append(error_page_url_queue.get())

    #     error_page_men_url_list = [{'url': url_data.get('url'), 'count': url_data.get('count')} for url_data in error_page_url_list if url_data.get('gender') == 1]
    #     fetch_page([{'url': url_data.get('url'), 'count': url_data.get('count')} for url_data in error_page_men_url_list], 1, q, error_page_url_queue, {
    #         'mnid': 'men_shoes',
    #         'Ns': 'sku.bestSeller | 1',
    #         'isAjax': 'true'
    #     }, crawl_counter)
    helper.log('done', platform)
Example #5
 def run(self):
     '''
     Parse the product page source
     '''
     time.sleep(3.6)
     global platform
     global error_detail_url
     try:
         slug = self.url.replace('https://www.goat.com/sneakers/', '')
         html = helper.get(self.url, returnText=True, platform=platform)
         if html:
             json_data = re.compile(r'window.__context__.*')
             json_data = json_data.findall(html)[0]
             json_data = json_data.replace('window.__context__ = ', '')
             json_data = json_data.replace('</script>', '')
             json_data = json.loads(json_data)
             json_data = json_data.get('default_store')
             json_data = json_data.get('product-templates')
             product_json = json_data.get('slug_map').get(slug)
             name = product_json.get('name')
             number = product_json.get('sku')
             color_value = product_json.get('details')
             color_name = name.split('\'')[1] if '\'' in name else ''
             size_list = product_json.get(
                 'formatted_available_sizes_new_v2')
             size_price_list = [{
                 'size':
                 float(data.get('size')),
                 'price':
                 float(data.get('price_cents') / 100),
                 'isInStock':
                 True
             } for data in size_list]
             # print({
             #     'name': name,
             #     'number': number,
             #     'color_value': color_value,
             #     'color_name': color_name,
             #     'size_price_list': size_price_list,
             # })
             img_downloaded = mongo.is_pending_goods_img_downloaded(
                 self.url)
             if not img_downloaded:
                 img_url = product_json.get('original_picture_url')
                 result = helper.downloadImg(
                     img_url,
                     os.path.join('.', 'imgs', platform, '%s.jpg' % number))
                 if result == 1:
                     # Upload the image to Qiniu
                     qiniuUploader.upload_2_qiniu(
                         platform, '%s.jpg' % number,
                         './imgs/%s/%s.jpg' % (platform, number))
                     img_downloaded = True
             mongo.insert_pending_goods(
                 name, number, self.url, size_price_list,
                 ['%s.jpg' % number], self.gender, color_value, platform,
                 '5bbf4561c7e854cab45218ba', self.crawl_counter, color_name,
                 img_downloaded)
             fetched_url_list.append(self.url)
             helper.writeFile(json.dumps(fetched_url_list),
                              './logs/goat-%s.json' % helper.today())
         else:
             error_counter = error_detail_url.get(self.url, 1)
             error_detail_url[self.url] = error_counter + 1
             helper.log(
                 '[ERROR] error timer = %s, url = %s' %
                 (error_counter, self.url), platform)
             if error_counter < 3:
                 self.q.put(self.url)
     except Exception as e:
         error_counter = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = error_counter + 1
         helper.log(
             '[ERROR] error timer = %s, url = %s' %
             (error_counter, self.url), platform)
         helper.log(e, platform)
         if error_counter < 3:
             self.q.put(self.url)
     finally:
         helper.log('[INFO] %s is done' % self.url, platform)
Example #6
def parse_training_arguments():
    parser = argparse.ArgumentParser(
        description="GAN Training",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--subset-fraction",
        type=float,
        default=1.,
        help="fraction of the full data being used (for testing out the code)")
    parser.add_argument(
        "--num-workers",
        type=int,
        default=0,
        help="number of workers in torch.DataLoader for parallel")
    parser.add_argument("--n-epoch",
                        type=int,
                        default=25,
                        help="number of epochs for training")

    parser.add_argument("--batch-size",
                        type=int,
                        default=32,
                        help="batch size")
    parser.add_argument("--feature-dim",
                        type=int,
                        default=100,
                        help="feature space (noise) dimension")
    parser.add_argument("--lr",
                        type=float,
                        default=None,
                        help="learning rate (overwrites --lrG and --lrD)")
    parser.add_argument(
        "--time-scale",
        type=float,
        default=1.,
        help=(
            "time scale in learning rate (used with --lr to overwrite --lrD and --lrG), see\n"
            "\t- GANs trained by a two time-scale update rule converge to a local Nash equilibrium, "
            "M. Heusel, H. Ramsauer, T. Unterthiner, B. Nessler, and S. Hochreiter - 2017\n"
            "\t- https://sthalles.github.io/advanced_gans/ suggests using 4."))
    parser.add_argument("--lrG",
                        type=float,
                        default=1e-4,
                        help="generator learning rate")
    parser.add_argument("--lrD",
                        type=float,
                        default=1e-4,
                        help="discriminator learning rate")
    parser.add_argument("--net-size",
                        type=int,
                        default=64,
                        help="network size")

    parser.add_argument("--G-batch",
                        action="store_true",
                        help="Batch normalization in training G")
    parser.add_argument("--D-batch",
                        action="store_true",
                        help="Batch normalization in training D")

    parser.add_argument("--alpha-region",
                        type=float,
                        default=1.0,
                        help="weight for the region (C, T, L) classification")

    # ignored when logging
    parser.add_argument("--n-step-eval",
                        type=int,
                        default=10,
                        help="number of epochs between evaluation")
    parser.add_argument("--eval-size",
                        type=int,
                        default=2000,
                        help="number of test images generated for evaluation")
    parser.add_argument("--exp",
                        type=str,
                        default=today(),
                        help="experiment suffix")
    parser.add_argument("--run-name",
                        type=str,
                        default=now(),
                        help="run name (default to be a time tag)")
    parser.add_argument(
        "--continued",
        action="store_true",
        help=(
            "continue from a checkpoint:\n"
            "automatically activated if --checkpoint is provided;\n"
            "uses the default checkpoint (EXP/RUN_NAME.ckpt) if no --checkpoint is provided"))
    parser.add_argument("--checkpoint",
                        type=str,
                        default=None,
                        help="the checkpoint to be loaded")
    parser.add_argument("--no-pbar",
                        action="store_true",
                        help="no progress bar")
    parser.add_argument(
        "--diter",
        type=int,
        default=1,
        help="number of iter of discriminator for 1 iter of gene")
    parser.add_argument("--data-augm",
                        action="store_true",
                        help="Run data small data augmentation")

    return parser
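
Since the function returns the parser rather than parsed arguments, the caller is expected to invoke parse_args itself. A minimal, hypothetical invocation (attribute names follow the flags declared above) might look like:

parser = parse_training_arguments()
args = parser.parse_args()
# argparse maps --n-epoch to args.n_epoch, --batch-size to args.batch_size, etc.
print(args.n_epoch, args.batch_size, args.lrG, args.lrD)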

def find_file(url):
    query = HKEX_API(from_date=n_yearsago(n=1), to_date=today())
    # 0-based index of the first record whose file_link matches the given url
    idx = [i for i, data in enumerate(query.data) if data.file_link == url][0]
    print(idx + 1)  # 1-based position of the match
    print(list(query.data)[idx])