Example #1
 def __init__(self, timeout=None, service_args=None):
     self.logger = getLogger(__name__)
     self.chrome_options = None
     self.first_time_browse = True
     self.timeout = timeout
     self.browser = True
     # current_page = the page number we are currently on
     self.current_session_access_counter = 0
     self.reset_session_pages = 0
     # the following works out which page to jump to next
     self.databaseConnection = DBKits()
     self.website_info_DB_operation = WebsiteInfoDBOperation(
         db_engine=self.databaseConnection)
     self.query_condition = {WebsiteInfo.website_id == WEBSITE_ID_TO_CRAWL}
     self.query_result = self.website_info_DB_operation.query_record(
         query_conditions=self.query_condition)
     self.current_proxy: ProxyInfo = None
     if self.query_result.count() == 1:
         website_message: WebsiteInfo = self.query_result.first()
         self.start_URL = website_message.start_up_url
         self.current_page = website_message.current_page_number
         self.go_to_page_string = website_message.website_goto_page_str
         self.next_page_string = website_message.website_next_page_str
     else:
         raise RuntimeError('No website record found for ID %d' % WEBSITE_ID_TO_CRAWL)
     self.new_browser()
Example #2
def reget_proxy():
    http = urllib3.PoolManager()
    try:
        r = http.request(
            'GET',
            'http://d.jghttp.golangapi.com/getip?num=1&type=3&pro=&city=0&yys=0&port=1&pack=3283&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
        )
        if r.status == 200:
            database_connection = DBKits()
            database_operation = ProxyInfoDBOperation(
                db_engine=database_connection)
            # each line of the response holds whitespace-separated "ip:port" entries
            for file_content in r.data.decode('utf-8').splitlines():
                for proxy_string in file_content.split():
                    proxy_info = ProxyInfo()
                    proxy_info.proxy_ip = proxy_string.split(':')[0]
                    proxy_info.proxy_port = proxy_string.split(':')[1]
                    proxy_info.proxy_source = '极光免费'
                    proxy_info.proxy_type = 'http'
                    proxy_info.added_time = datetime.now()
                    proxy_info.status = 0
                    database_operation.insert_record(
                        proxy_info, skip_duplicated_record=False)
                    proxy_info = None
        else:
            return None
    except Exception as excepinfo:
        error(excepinfo)
        return -1
    return None
Example #3
 def __init__(self, category=None, *args, **kwargs):
     super(GdgovdataSpider, self).__init__(*args, **kwargs)
     self.databaseConnection = DBKits()
     self.require_to_read_DB_operation = \
         RequirementToReadDBOperation(db_engine=self.databaseConnection)
     self.sourcing_announcement_to_read_DB_operation = \
         SourcingAnnouncementToReadDBOperation(db_engine=self.databaseConnection)
     self.contract_to_read_DB_operation = \
         ContractInfoToReadDBOperation(db_engine=self.databaseConnection)
     self.list_items_map_DB_Operation = \
         ListItemsMapDBOperation(db_engine=self.databaseConnection)
     self.sourcing_plans_to_read_DB_operation = \
         SourcingPlanToReadDBOperation(db_engine=self.databaseConnection)
     self.website_info_DB_operation = \
         WebsiteInfoDBOperation(db_engine=self.databaseConnection)
     self.duplicate_record_qty = 0
     self.items_processor = ListItemsProcessor()
     self.items_processor.read_item_parameter(
         db_connection=self.databaseConnection,
         websiteID=WEBSITE_ID_TO_CRAWL)
     self.website_info: WebsiteInfo = self.website_info_DB_operation.query_record(
         {WebsiteInfo.website_id == WEBSITE_ID_TO_CRAWL}).first()
     self.duplicate_record_condition = None
     self.DB_operation = None
Example #4
def mark_unavailable_proxy(proxy_id=None):
    database_connection = DBKits()
    database_operation = ProxyInfoDBOperation(db_engine=database_connection)
    query_conditions = {ProxyInfo.proxy_id == proxy_id}
    proxy_data_for_update = ProxyInfo()
    proxy_data_for_update.status = 1  # status 1 marks the proxy as unavailable
    database_operation.update_record(query_conditions=query_conditions,
                                     update_data=proxy_data_for_update)
Example #5
def mark_used_proxy(proxy_id=None):
    database_connection = DBKits()
    database_operation = ProxyInfoDBOperation(db_engine=database_connection)
    query_conditions = {ProxyInfo.proxy_id == proxy_id}
    proxy_data_for_update = ProxyInfo()
    proxy_data_for_update.last_used_time = datetime.now()
    database_operation.update_record(query_conditions=query_conditions,
                                     update_data=proxy_data_for_update)
Example #6
def make_website_info():
    website_data = WebsiteInfo()
    # set the record fields
    website_data.website_name = '广东省采购中心-省直-采购需求'
    website_data.start_up_url = 'http://www.gdgpo.gov.cn/queryMoreInfoList/channelCode/0005.html'
    website_data.website_crawl_scope = 'gdgpo.gov.cn'
    # end of record fields
    database_connection = DBKits()
    database_operation = WebsiteInfoDBOperation(db_engine=database_connection)
    database_operation.insert_record(website_data, skip_duplicated_record=True)
Example #7
def make_website_info_contract():
    website_data = WebsiteInfo()
    # set the record fields
    website_data.website_name = '广东省采购中心-省直-采购合同1'
    website_data.start_up_url = START_URL_CONTRACT_INFO_TO_BE_READ
    website_data.website_crawl_scope = 'gdgpo.gov.cn'
    website_data.current_page_number = 16000
    # end of record fields
    database_connection = DBKits()
    database_operation = WebsiteInfoDBOperation(db_engine=database_connection)
    database_operation.insert_record(website_data, skip_duplicated_record=True)
Example #8
def make_website_info_accept():
    website_data = WebsiteInfo()
    # set the record fields
    website_data.website_name = '广东省采购中心-省直-履约验收'
    website_data.start_up_url = START_URL_ACCEPT_TO_BE_READ
    website_data.website_crawl_scope = 'gdgpo.gov.cn'
    website_data.current_page_number = 1
    website_data.website_next_page_str = '//a[@class="aborder2"]/span[contains(.,"下一页")]'
    website_data.website_goto_page_str = '//input[@id="pointPageIndexId"]'
    # end of record fields
    database_connection = DBKits()
    database_operation = WebsiteInfoDBOperation(db_engine=database_connection)
    database_operation.insert_record(website_data, skip_duplicated_record=True)
Example #9
def get_random_http_proxy():
    database_connection = DBKits()
    database_operation = ProxyInfoDBOperation(db_engine=database_connection)
    get_all_available_proxy = {
        ProxyInfo.status == 0,
        ProxyInfo.proxy_type == 'http'
    }
    record_sets = database_operation.query_record(query_conditions=get_all_available_proxy).\
        order_by(ProxyInfo.added_time.desc(),
                 ProxyInfo.last_used_time.asc())
    if record_sets.count() > 0:
        return_proxy_info: ProxyInfo = record_sets.first()
        mark_used_proxy(return_proxy_info.proxy_id)
        return return_proxy_info
    return None
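
Taken together, the proxy helpers in these examples suggest a simple rotation pattern. The sketch below only illustrates how they might be combined; the urllib3 call and the target URL are assumptions, not part of the original code.

import urllib3

# hedged sketch: pick a proxy, use it, and flag it if the request fails
proxy = get_random_http_proxy()
if proxy is None:
    reget_proxy()                  # fetch a fresh batch from the provider
    proxy = get_random_http_proxy()
if proxy is not None:
    proxy_url = 'http://%s:%s' % (proxy.proxy_ip, proxy.proxy_port)
    try:
        http = urllib3.ProxyManager(proxy_url)
        http.request('GET', 'http://example.com')   # assumed target URL
    except Exception:
        mark_unavailable_proxy(proxy.proxy_id)       # proxy is no longer handed out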
Example #10
def read_proxy_info():
    database_connection = DBKits()
    database_operation = ProxyInfoDBOperation(db_engine=database_connection)

    with open('./proxy.txt', 'r') as proxy_file:
        for file_content in proxy_file:
            for proxy_string in file_content.split():
                proxy_info = ProxyInfo()
                proxy_info.proxy_ip = proxy_string.split(':')[0]
                proxy_info.proxy_port = proxy_string.split(':')[1]
                proxy_info.proxy_source = '极光免费'
                proxy_info.proxy_type = 'http'
                proxy_info.added_time = datetime.now()
                proxy_info.status = 0
                database_operation.insert_record(proxy_info,
                                                 skip_duplicated_record=False)
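
read_proxy_info splits each line on whitespace and each entry on ':', so ./proxy.txt is expected to hold plain ip:port entries. A hypothetical file (addresses invented for illustration) could look like this:

192.0.2.10:8080
192.0.2.11:3128 192.0.2.12:80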
Example #11
def update_website_info():
    website_data_to_update = WebsiteInfo()
    website_data_to_query = WebsiteInfo()
    # set the record fields
    website_data_to_query.start_up_url = 'http://www.gdgpo.gov.cn/queryMoreInfoList/channelCode/0005.html'
    website_data_to_query.website_crawl_scope = 'gdgpo.gov.cn'
    # end of record fields
    database_connection = DBKits()
    database_operation = WebsiteInfoDBOperation(db_engine=database_connection)
    website_data_to_update.current_page_number = 800
    query_conditions = {
        WebsiteInfo.start_up_url == website_data_to_query.start_up_url,
        WebsiteInfo.website_crawl_scope
        == website_data_to_query.website_crawl_scope
    }
    database_operation.update_record(record_type=WebsiteInfo,
                                     query_conditions=query_conditions,
                                     update_data=website_data_to_update)
Example #12
def make_website_items_mapping(website_id=-1):
    filename_test = './list_items_map4sourcing_plan.txt'
    database_connection = DBKits()
    database_operation = WebsiteInfoDBOperation(db_engine=database_connection)
    line_count = 0
    with open(filename_test, 'r') as mapping_file:
        for file_content in mapping_file:
            # the first line is a header row and is skipped
            if line_count > 0:
                fields = file_content.strip().split(',')
                list_item = ListItemsMap()
                if website_id == -1:
                    list_item.website_id = int(fields[0])
                else:
                    list_item.website_id = website_id
                list_item.result_is_list = fields[1] == 'True'
                list_item.list_index = int(fields[2])
                list_item.struct_member_name = fields[3]
                list_item.xpath_string = fields[4]
                list_item.trim_enter = fields[5] == 'True'
                list_item.trim_space = fields[6] == 'True'
                list_item.is_url = fields[7] == 'True'
                list_item.is_abstract_url = fields[8] == 'True'
                database_operation.insert_record(record_data=list_item,
                                                 skip_duplicated_record=False)
            line_count += 1
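
From the column indices used above, each data row of list_items_map4sourcing_plan.txt carries nine comma-separated fields: website_id, result_is_list, list_index, struct_member_name, xpath_string, trim_enter, trim_space, is_url, is_abstract_url, preceded by one header row that line_count skips. A hypothetical row (all values invented for illustration) might look like this:

13,True,0,announcement_title,//ul[@class="m_m_c_list"]/li/a/text(),True,True,False,False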
Example #13
 def __init__(self):
     self.databaseConnection = DBKits()
     self.sourcing_message_to_read_DB_operation = SourcingAnnouncementToReadDBOperation(
         db_engine=self.databaseConnection)
Example #14
def test_db():
    conditions = {ProxyInfo.status == 1}
    database_connection = DBKits()
    proxy_operation = ProxyInfoDBOperation(db_engine=database_connection)
    print(proxy_operation.query_record(conditions).count())