#!/usr/bin/env python
# encoding=utf-8
"""
Database query interface.
"""
import sys
import os
from scpy.logger import get_logger

LOGGER = get_logger()

# Make the parent directory of this file's directory importable.
ROOT_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.path.pardir))
if ROOT_PATH not in sys.path:
    sys.path.append(ROOT_PATH)

# These imports intentionally follow the sys.path adjustment above.
# NOTE(review): presumably `config` is resolved through ROOT_PATH -- confirm
# before moving these imports to the top of the file.
from psycopg2 import pool
from psycopg2.extras import RealDictCursor
from config import PG_HOST, PG_PORT, SC_CRAWLER_PG_QUERY

# Module-level connection pool (1 to 20 connections), created at import time.
# The remaining connection keyword arguments come from SC_CRAWLER_PG_QUERY.
sc_crawler_pg_cnn_pool = pool.ThreadedConnectionPool(
    minconn=1,
    maxconn=20,
    host=PG_HOST,
    port=PG_PORT,
    **SC_CRAWLER_PG_QUERY)


# NOTE(review): the source is cut off inside the definition below, so the
# visible fragment is preserved as a comment instead of being completed by
# guesswork. Restore the full body from the original file.
#
# def sc_crawler_insert_one(pg_sql, data=None):
#     conne = sc_crawler_pg_cnn_pool.getconn()
#     try:
#         cursor = conne.cursor()
#         if data:
from scpy.date_extractor import extract_first_date

# S3 handles built once at import time from the AWS_* settings: a low-level
# client (used for pagination below) and a resource object.
client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION_NAME)
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION_NAME)

# Alternation of every value in BID_REGION_CODE_TABLE, spliced into the
# pattern below. The values are inserted unescaped.
region_code = u"|".join(BID_REGION_CODE_TABLE.values())

# Matches strings shaped like "<region>/<anything>/YYYY/M/D/<anything>.json".
# The pattern is anchored at the start only. Backslashes are doubled so the
# literal carries no invalid escape sequences; the compiled pattern is
# unchanged.
reg_key = re.compile(
    u"^(?:%s)/.*/\\d{4}/\\d{1,2}/\\d{1,2}/.*\\.json" % region_code)

s3util = aws_s3_sc.S3Util()
logger = get_logger(__file__)


def search_item():
    """Set up a paginated S3 ``list_objects`` listing and the prefix list.

    NOTE(review): only the setup statements below are available; how the
    paginator, ``prefixes`` and ``count`` are used afterwards must be
    confirmed against the full function.
    """
    paginator = client.get_paginator('list_objects')
    # NOTE(review): presumably top-level key prefixes, one per region;
    # 'shan_xi-' and 'shan_xi_' are kept exactly as written -- verify.
    prefixes = [
        'gan_su', 'guang_dong', 'guang_xi', 'gui_zhou', 'hai_nan',
        'he_bei', 'he_nan', 'hei_long_jiang', 'hu_bei', 'hu_nan',
        'ji_lin', 'jiang_su', 'jiang_xi', 'liao_ning', 'nei_meng_gu',
        'ning_xia', 'qing_hai', 'quan_guo', 'shan_dong', 'shan_xi-',
        'shan_xi_', 'shang_hai', 'si_chuan', 'tai_wan', 'tian_jin',
        'xi_zhang', 'xiang_gang', 'xing_jiang', 'yun_nan', 'zhe_jiang',
    ]
    count = 0
import json
import copy
import tornado.web
import tornado.ioloop
from tornado.concurrent import run_on_executor
from tornado.web import HTTPError
from concurrent.futures import ThreadPoolExecutor
from scpy.threadlocal import set_threadlocal
from scpy.logger import get_logger
import uuid
from utils.pgutil import PgUtil
from db import postgresql_sc

logger = get_logger()
PG = PgUtil()
# Size of the thread pool attached to InfoHandler below.
MAX_WORKERS = 512
database = postgresql_sc.PostgresqlUtil()

# NOTE(review): presumably the field names of an announcement record --
# verify against the code that consumes this list.
announce_keys = [
    "purchaser",
    "purchase_agent",
    "purchase_category",
    "title",
    "url",
    "region",
    "published_ts",
    "announced_ts",
    "winning_ts",
    "announce_type",
    "amount",
    "unit",
    "currency",
]
# NOTE(review): presumably the field names of a winning-result record --
# verify against the code that consumes this list.
result_keys = ["winning_company", "winning_amount", "unit", "currency"]


class InfoHandler(tornado.web.RequestHandler):
    """Tornado request handler that owns a class-level thread pool.

    NOTE(review): ``executor`` is presumably the pool picked up by methods
    decorated with ``run_on_executor`` -- confirm against the handler
    methods.
    """

    # One pool of MAX_WORKERS threads, created when the class is defined and
    # shared by every instance of the handler.
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)