def __init__(self, logger=None):
    """Initialise the logger and the shared strategy/spider map.

    :param logger: logger supplied by the caller (e.g. the engine). When it
        is falsy, a logger named 'Monitor' is requested from ``get_logger``.
    """
    # Try to use the logger handed in by the engine first.
    if not logger:
        logger = get_logger('Monitor', True)
    self.logger = logger

    # Kept on the instance because it will be used in the spider plugins.
    self.StrategySpiderMap_dict = GlobalConfig.StrategySpiderMap_dict

    self.globalize_strategystatus()
def __init__(self):
    """Create the 'Core' logger and build the four core components.

    The Scheduler, Downloader, Uploader and Monitor instances are
    constructed in that order and each receives the same logger object.
    """
    # Core modules share the same logger.
    shared_logger = get_logger('Core', True)
    self.logger = shared_logger

    self.Scheduler = Scheduler(shared_logger)
    self.Downloader = Downloader(shared_logger)
    self.Uploader = Uploader(shared_logger)
    self.Monitor = Monitor(shared_logger)
def __init__(self, logger=None):
    """Bind the logger and the shared global containers to the instance.

    :param logger: logger supplied by the caller. When it is falsy, a
        logger named 'Downloader' is requested from ``get_logger``.
    """
    if not logger:
        logger = get_logger('Downloader', True)
    self.logger = logger

    # From GlobalConfig
    self.StrategySpiderMap_dict = GlobalConfig.StrategySpiderMap_dict

    # From GlobalQueues
    self.StrategyTaskQueue_dict = GlobalQueues.StrategyTaskQueue_dict
    self.UploadResults_queue = GlobalQueues.UploadResults_queue

    # From GlobalDicts
    self.StrategyGroup_dict = GlobalDicts.StrategyGroup_dict
    self.globalize_spiderthreads()  # GlobalDicts
def __init__(self, logger=None):
    """Bind the logger and shared globals, then run the start-up steps.

    :param logger: logger supplied by the caller (e.g. the engine). When it
        is falsy, a logger named 'Scheduler' is requested from
        ``get_logger``.
    """
    # Try to use the logger handed in by the engine first.
    if not logger:
        logger = get_logger('Scheduler', True)
    self.logger = logger

    self.StrategySpiderMap_dict = GlobalConfig.StrategySpiderMap_dict
    self.SpiderThreads_dict = GlobalDicts.SpiderThreads_dict

    # Start-up steps; the original call order is preserved.
    self.globalize_queues()
    self.globalize_strategygroup()
    self.load_strategies()
# -*- coding:utf-8 -*- ''' Created on 2017年6月19日 @author: Thinkpad ''' import json import copy import global_vars.global_config as GlobalConfig from log.log import get_logger logger = get_logger('strategy') class Strategy(): def __init__(self, StrategyID='TEST', Timeout=0, WaitTime=1, RetryTime=3, AdditionParams={}, Encoding='utf-8', FragmentalUpload=False, FragmentalAmount=5, ContentException=[], CookieUse=False): self.StrategyID = StrategyID self.Encoding = Encoding self.Timeout = Timeout self.WaitTime = WaitTime self.RetryTime = RetryTime self.AdditionParams = AdditionParams
import requests from lxml import html import log.log as log import random import traceback import re import os from concurrent.futures import ThreadPoolExecutor as TPE from concurrent.futures import as_completed proxyLogger = log.get_logger(__name__) httpsCount = 0 httpCount = 0 def proxyScraper(): """ Parses htmlobject using xpath search pulling IP, port, Country, type, HTTPS and Time discovered information for a single request :param htmlobject: :return: proxyDict """ uri = 'https://free-proxy-list.net/' pageContent = requests.get(url=uri, headers=rand_useragent(), timeout=10) tree = html.fromstring(pageContent.content) proxyIP = [item for item in tree.xpath('//table/tbody/tr/td[1]/text()')]
# !/usr/bin/env python3.7 from datetime import datetime as dt from sqlalchemy.orm import sessionmaker import log.log as log from database.db_base import Base, engine, URLs, RapplerURLs dbLogger = log.get_logger(__name__) dateFormat = "%Y-%m-%d" Base.metadata.create_all(engine, checkfirst=True) DBSession = sessionmaker(bind=engine) session = DBSession() def recentRecords(TableName): """ search db table for dynamic column, return max 1000 'recentResult' is a list if no table is present from db, return an empty list ###SQL QUERY: SELECT composite FROM table ORDER BY id DESC """ records = [] dbRecords = session.query(TableName).order_by(
# -*- coding:utf-8 -*- ''' Created on 2017年6月19日 @author: Thinkpad ''' import json import copy import global_vars.global_dicts as GlobalDicts import global_vars.global_config as GlobalConfig from log.log import get_logger logger = get_logger('Task') class Task(): def __init__(self, TaskID=0, StrategyID='TEST', TaskType=0, TaskContent='', TaskStatus=0, AdditionParams={}, Encoding=None): # TaskType = [0, 1, 2] # 0: plain return {TaskIns_received : {FileName : FileContent, ...}, TaskIns_generated_1 : {}, ...} # 1: deep .. # 2: reservation # TaskStatus = [0, 1, 2] # 0: fail
def upload_results_fragment(logger, *args, **kws):
    """Placeholder that is not implemented yet.

    Only resolves which logger would be used: the given ``logger``, or the
    'server_interact' logger from ``get_logger`` when ``logger`` is falsy.
    Extra positional and keyword arguments are accepted and ignored.

    :return: None
    """
    # TODO: add a sign in additional params of tasks?
    method_logger = logger if logger else get_logger('server_interact')
def get_proxy(logger, *args, **kw):
    """Placeholder that is not implemented yet.

    Only resolves which logger would be used: the given ``logger``, or the
    'server_interact' logger from ``get_logger`` when ``logger`` is falsy.
    Extra positional and keyword arguments are accepted and ignored.

    :return: None
    """
    method_logger = logger if logger else get_logger('server_interact')
def get_captcha(logger, *args, **kws):
    """Placeholder that is not implemented yet.

    Only resolves which logger would be used: the given ``logger``, or the
    'server_interact' logger from ``get_logger`` when ``logger`` is falsy.
    Extra positional and keyword arguments are accepted and ignored.

    :return: None
    """
    method_logger = logger if logger else get_logger('server_interact')
def get_captcha(logger, *args, **kws):
    """Placeholder that is not implemented yet.

    Only resolves which logger would be used: the given ``logger``, or the
    'server_interact' logger from ``get_logger`` when ``logger`` is falsy.

    :return: None
    """
    method_logger = logger if logger else get_logger('server_interact')


def get_proxy(logger, *args, **kw):
    """Placeholder that is not implemented yet.

    Only resolves which logger would be used: the given ``logger``, or the
    'server_interact' logger from ``get_logger`` when ``logger`` is falsy.

    :return: None
    """
    method_logger = logger if logger else get_logger('server_interact')


if __name__ == '__main__':
    # Manual smoke run: every call below shares one logger.
    methods_shared_logger = get_logger('server_interact', True)

    get_strategy(methods_shared_logger)
    get_task('TEST_1', 10, methods_shared_logger)

    # Example of a pack with two entries:
    # UploadPack = {0 : ("TaskStatus", "zipfile_md5_task0"), 1 : ("TaskStatus", "zipfile_md5_task1")}
    UploadPack = {
        0: ("TaskStatus", "emlwZmlsZV9tZDVfdGFzazA="),
    }
    ComfirmInfo = upload_results(UploadPack, methods_shared_logger)

    # Imported here, inside the script section, as in the original.
    from data_structure.status import MachineStatus, StrategyStatus

    MachineStatusIns = MachineStatus()
    # Two strategy statuses are created against the same machine status;
    # the name is rebound, so only the 'TEST_2' instance stays referenced.
    StrategyStatusIns = StrategyStatus(MachineStatusIns, 'TEST_1')
    StrategyStatusIns = StrategyStatus(MachineStatusIns, 'TEST_2')
def __init__(self, StrategyID):
    """Run the parent initialiser and create a per-strategy logger.

    :param StrategyID: passed in explicitly because the strategy instance
        is not loaded yet when the spider instance is first created. It is
        used to build the logger name ``'Spider_Test_<StrategyID>'``.
    """
    # Load the default params from the parent class.
    super(Spider_Test, self).__init__()

    self.logger = get_logger('Spider_Test_%s' % str(StrategyID), True)
def __init__(self, logger=None):
    """Bind the logger and the shared upload-results queue.

    :param logger: logger supplied by the caller (e.g. the engine). When it
        is falsy, a logger named 'Uploader' is requested from
        ``get_logger``.
    """
    # Try to use the logger handed in by the engine first.
    if not logger:
        logger = get_logger('Uploader', True)
    self.logger = logger

    self.UploadResults_queue = GlobalQueues.UploadResults_queue
# -*- coding:utf-8 -*- ''' Created on 2017年6月23日 @author: Thinkpad ''' import json import base64 import os import time from flask import Flask from flask import request from log.log import get_logger logger = get_logger('server_service') app = Flask(__name__) def StrategyGenerator(): # initial strategies TEST_1_1_json = '{"RetryTime": 2, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_1", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}' TEST_2_1_json = '{"RetryTime": 2, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_2", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}' StrategyGroup_1_dict = {'TEST_1': TEST_1_1_json, 'TEST_2': TEST_2_1_json} StrategyGroupJson_1 = json.dumps(StrategyGroup_1_dict) # new strategies TEST_1_2_json = '{"RetryTime": 3, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_1", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}' TEST_2_2_json = '{"RetryTime": 3, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_2", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}' StrategyGroup_2_dict = {'TEST_1': TEST_1_2_json, 'TEST_2': TEST_2_2_json} StrategyGroupJson_2 = json.dumps(StrategyGroup_2_dict)
# -*- coding:utf-8 -*- ''' Created on 2017年6月19日 @author: Thinkpad ''' import json from collections import Counter from utils.decorators import Singleton from log.log import get_logger logger = get_logger('status') @Singleton class MachineStatus(): # MachineStatus is instantiated once for a engine startup def __init__(self, user='******'): self._user = user # usually IP of vps self._MachineStatusCollector_dict = { } # {StrategyID : strategy_status_collector, ...} def machine_status_collector(self): return self._MachineStatusCollector_dict def get_user(self): return self._user def get_json( self ): # when sending pass the _MachineStatusCollector_dict into upload_status directly MachineStatusJson = json.dumps(self._MachineStatusCollector_dict,