def __init__(self, start_date, end_date):
    """Set up the Chicago Public Library events scraper.

    Args:
        start_date: first date (inclusive) of the crawl window.
        end_date: last date (inclusive) of the crawl window.
    """
    Spider.__init__(self)
    # ApiBase carries the site root and the date window; dates are
    # formatted as ISO year-month-day for the remote API.
    ApiBase.__init__(self, 'https://chipublib.bibliocommons.com/',
                     start_date, end_date, date_format='%Y-%m-%d')
def __init__(self, *arg, **argdict):
    """Initialize all extraction-rule attributes, then delegate to Spider."""
    # XPath extraction rules; populated later from the crawl-rule config.
    self.rule = ''
    self.titleXpath = ''
    self.descriptionXpath = ''
    # NOTE(review): attribute name spelling ("Lenght") kept — external code
    # may reference it.
    self.descriptionLenght = 0
    self.contentXpath = ''
    self.linkXpath = ''
    self.imgUrlXpath = ''
    self.imageNum = 0
    self.goodRemarkXpath = ''
    self.badRemarkXpath = ''
    self.zhunfaRemarkXpath = ''
    self.videoUrlXpath = ''
    self.pubDateXpath = ''
    self.guidXpath = ''
    self.rule_id = ''
    self.checkTxtXpath = ''
    # Crawl bookkeeping.
    self.max_deepth = 0
    self.is_duplicate = False
    self.last_md5 = ''
    self.next_request_url = ''
    self.next_page_url_prefix = ''
    Spider.__init__(self, *arg, **argdict)
    # Per-crawl runtime state.
    self.currentNode = None
    self.isDone = False
    self.isFirstListPage = True
def __init__(self, **kwargs):
    """Load the crawl template from a config file or an inline config string.

    Keyword Args:
        config_file: path to a JSON config file (takes precedence).
        config: inline JSON config string.
        test_url: single entry URL, for testing or crawling one page.
        max_pages: maximum number of pages to crawl (None = unlimited).
        test_extractor / test_entity: extractor / entity test hooks.

    Raises:
        Exception: if neither config_file nor config is supplied.
    """
    Spider.__init__(self, **kwargs)
    self.config_file = kwargs.get('config_file', None)
    config = kwargs.get('config', None)
    if self.config_file:
        # Fix: the original passed a bare open() to jsonload and never
        # closed the handle; the with-block releases it deterministically.
        with open(self.config_file) as fp:
            jconfig = jsonload(fp)
    elif config:
        jconfig = jsonloads(config)
    else:
        logger.critical('config_file or config is expected')
        raise Exception('config_file or config is expected')
    self.template = config_parse(jconfig)
    # Single entry URL, for testing or crawling one specific page.
    self.test_url = kwargs.get('test_url', None)
    # Page-count cap; normalized to int when provided.
    self.max_pages = kwargs.get('max_pages', None)
    self.max_pages = int(self.max_pages) if self.max_pages is not None else None
    # Extractor test hook.
    self.test_extractor = kwargs.get('test_extractor', None)
    # Entity test hook.
    self.test_entity = kwargs.get('test_entity', None)
def __init__(self, **kwargs):
    """Load the crawl template from a config file or an inline config string.

    Keyword Args:
        config_file: path to a JSON config file (takes precedence).
        config: inline JSON config string.
        test_url: single entry URL, for testing or crawling one page.
        max_pages: maximum number of pages to crawl (None = unlimited).
        test_extractor / test_entity: extractor / entity test hooks.

    Raises:
        Exception: if neither config_file nor config is supplied.
    """
    Spider.__init__(self, **kwargs)
    self.config_file = kwargs.get('config_file', None)
    config = kwargs.get('config', None)
    if self.config_file:
        # Fix: the original leaked the file handle via jsonload(open(...));
        # the with-block guarantees it is closed.
        with open(self.config_file) as fp:
            jconfig = jsonload(fp)
    elif config:
        jconfig = jsonloads(config)
    else:
        logger.critical('config_file or config is expected')
        raise Exception('config_file or config is expected')
    self.template = config_parse(jconfig)
    # Single entry URL, for testing or crawling one specific page.
    self.test_url = kwargs.get('test_url', None)
    # Page-count cap; normalized to int when provided.
    self.max_pages = kwargs.get('max_pages', None)
    self.max_pages = int(self.max_pages) if self.max_pages is not None else None
    # Extractor test hook.
    self.test_extractor = kwargs.get('test_extractor', None)
    # Entity test hook.
    self.test_entity = kwargs.get('test_entity', None)
def __init__(self, start_date, end_date):
    """Set up the Great Lakes events scraper for the given date window.

    Args:
        start_date: first date (inclusive) of the crawl window.
        end_date: last date (inclusive) of the crawl window.
    """
    Spider.__init__(self)
    # Site dates render as e.g. "March 05" while requests use ISO dates,
    # hence the two different format strings.
    SpiderBase.__init__(self, 'https://greatlakes.org/',
                        start_date, end_date,
                        date_format='%B %d',
                        request_date_format='%Y-%m-%d')
def __init__(self, *args, **kwargs):
    """Assign a unique worker id and build the raw-response Item class."""
    Spider.__init__(self, *args, **kwargs)
    # Worker id is "hostname_ip"; dots are replaced so the id is safe to
    # use as a key/identifier.
    raw_id = "%s_%s" % (socket.gethostname(), get_ip_address())
    self.worker_id = raw_id.replace('.', '-')
    self.gen_field = self._yield_field()
    # Dynamically create an Item subclass with one field per BASE_FIELD name.
    field_map = dict(zip(BASE_FIELD, self.gen_field))
    self.base_item_cls = type("RawResponseItem", (Item, ), field_map)
def __init__(self, userName='', password='', *args, **kwargs):
    """Store login credentials; bail out if either one is blank.

    A credential consisting only of spaces counts as blank.
    """
    Spider.__init__(self, *args, **kwargs)
    blank_user = userName.replace(' ', '') == ''
    blank_pass = password.replace(' ', '') == ''
    if blank_user or blank_pass:
        print('请输入账号密码')
        self.closed('退出')
    else:
        self.userName = userName
        self.password = password
def __init__(self, crawler, *args, **kwargs):
    """Initialize the spider and load seed URLs per the configured mode.

    Args:
        crawler: Scrapy crawler whose settings supply SEED_MODE and,
            depending on the mode, SEEDS_FILE or REDIS_CLIENT.
    """
    # Fix: the original used a Python 2 print statement, a syntax error
    # under Python 3 (the rest of this file uses print()).
    print("wwj debug in scrapy spider init")
    Spider.__init__(self, name=None, **kwargs)
    self.seed_mode = crawler.settings.get('SEED_MODE')
    if self.seed_mode == 'seeds':
        seeds_file = crawler.settings.get('SEEDS_FILE')
        self.load_seeds(seeds_file)
    elif self.seed_mode == 'redis':
        # NOTE(review): the path is read but not used here — presumably the
        # redis client is wired up elsewhere; confirm.
        redis_client_path = crawler.settings.get('REDIS_CLIENT')
def __init__(self, userName='', password='', *args, **kwargs):
    """Validate and store login credentials.

    A credential consisting only of spaces counts as blank.
    """
    Spider.__init__(self, *args, **kwargs)
    # str.replace leaves an empty (falsy) string when only spaces were given.
    if not userName.replace(' ', '') or not password.replace(' ', ''):
        print('请输入账号密码')
        self.closed('退出')
    else:
        self.userName = userName
        self.password = password
def __init__(self, *args, **kwargs):
    """Set up worker identity and the dynamic raw-response Item class."""
    Spider.__init__(self, *args, **kwargs)
    # Worker id is "hostname_ip"; dots replaced to make it identifier-safe.
    host_and_ip = "%s_%s" % (socket.gethostname(), get_ip_address())
    self.worker_id = host_and_ip.replace('.', '_')
    # Item subclass with one entry per BASE_FIELD name.
    # NOTE(review): repeat(Field()) reuses a single Field instance for
    # every key — confirm sharing field metadata is intended.
    fields = dict(zip(BASE_FIELD, repeat(Field())))
    self.base_item_cls = type("RawResponseItem", (Item, ), fields)
    # Redis connection is established later, not at construction time.
    self.redis_conn = None
def __init__(self, start_date, end_date):
    """Set up the Great Lakes iCal feed reader for the given date window.

    Args:
        start_date: first date (inclusive) of the crawl window.
        end_date: last date (inclusive) of the crawl window.
    """
    feed_url = 'https://greatlakes.org/events/?ical=1&tribe_display=list'
    Spider.__init__(self)
    ApiBase.__init__(self, feed_url, start_date, end_date,
                     date_format='%Y-%m-%d')
    # Event times in the feed are local to Chicago.
    tz = timezone('America/Chicago')
    self.reader = ICal.from_url(self.base_url, tz)
def __init__(self):
    """Prepare the POST form data and request headers for the plot query."""
    Spider.__init__(self)
    # NOTE(review): the 'zipcode' field carries a date string — presumably
    # the remote form reuses the field name; confirm against the site.
    self.formdata = {
        'zipcode': '2016/03/19',
        'syr': '2016',   # start year
        'smo': '03',     # start month
        'sdy': '19',     # start day
        'B1': 'PLOT',    # submit-button value
    }
    # Desktop Chrome user agent so the server returns the full page.
    self.headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                       '50.0.2661.75 Safari/537.36'),
    }
def __init__(self, *arg, **argdict):
    """Reset every extraction rule to its default, then init Spider."""
    self.titleXpath = ''
    self.descriptionXpath = ''
    # NOTE(review): attribute name spelling ("Lenght") kept — external code
    # may reference it.
    self.descriptionLenght = 0
    self.linkXpath = ''
    self.imgUrlXpath = ''
    self.imageNum = 1
    self.videoUrlXpath = ''
    self.pubDateXpath = ''
    self.guidXpath = ''
    self.rule_id = ''
    self.checkTxtXpath = ''
    self.is_remove_namespaces = False
    Spider.__init__(self, *arg, **argdict)
    # Node currently being parsed; None until the crawl starts.
    self.currentNode = None
# NOTE(review): this definition has been collapsed onto one line, and the
# inline '#' comments make everything after the first '#' a comment in this
# form — the original multi-line layout (which statements were live vs
# commented out) cannot be recovered with confidence, so the line is kept
# byte-identical. In particular, 'self.arch = LORUrlBuf(...)' appears to be
# commented out while self.arch is used later (self.arch.urls / .append /
# .dump) — confirm against the original file whether that assignment is
# live or self.arch is set elsewhere. Likewise 'with open(...)' that
# produces start_urls may or may not be commented out; start_urls is used
# by self.arch_n = len(start_urls).
def __init__(self, name=None, **kwargs): Spider.__init__(self, name, **kwargs) # self.arch = LORUrlBuf(DATA_BASE_PATH + '/arch.pkl') self.topic = LORUrlBuf(DATA_BASE_PATH + '/topic.pkl') # with open('arch_urls.txt', 'r') as f: start_urls = f.readlines() self.arch_n = len(start_urls) if not self.arch.urls and not self.topic.urls: for url in start_urls: self.arch.append(url[:-1]) #Dump all the urls self.arch.dump() try: with open(DATA_BASE_PATH + '/topic_num.pkl', 'rb') as f: n = pk.load(f) self.topic_n = max(n, len(self.topic.urls)) except Exception as e: print(e)
def __init__(self):
    """Build the POST form data and headers for querying yesterday's data.

    Dates are converted from the Gregorian calendar to the ROC (民國)
    calendar expected by the remote ASP.NET form (ROC year = year - 1911).
    """
    from datetime import timedelta  # local import; avoids touching file-level imports
    Spider.__init__(self)
    # Fix: the original computed date.today().day - 1 directly, which raises
    # ValueError on the first day of any month. timedelta rolls over
    # month/year boundaries correctly.
    yesterday = date.today() - timedelta(days=1)
    today = datetime(yesterday.year, yesterday.month, yesterday.day)

    def date_trans(date_):
        # 'YYYY/MM/DD' (Gregorian) -> 'ROC-year/MM/DD'.
        year = int(date_.split('/')[0]) - 1911
        return '/'.join([str(year)] + date_.split('/')[1:])

    # (VIEWSTATE, EVENTVALIDATION) tokens scraped from the live page.
    self.temp = get_viewstate()
    self.formdata = {
        "ctl00$ScriptManager_Master": "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery",
        "ctl00$ucLogin$txtMemberID": "",
        "ctl00$ucLogin$txtPassword": "",
        "ctl00$ucLogin$txtValCode": "",
        "ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar": "S",
        "ctl00$contentPlaceHolder$txtSTransDate": date_trans(today.strftime('%Y/%m/%d')),
        "ctl00$contentPlaceHolder$txtETransDate": date_trans(today.strftime('%Y/%m/%d')),
        "ctl00$contentPlaceHolder$txtMarket": "全部市場",
        "ctl00$contentPlaceHolder$hfldMarketNo": "ALL",
        "ctl00$contentPlaceHolder$txtProduct": "全部產品",
        "ctl00$contentPlaceHolder$hfldProductNo": "ALL",
        "ctl00$contentPlaceHolder$hfldProductType": "A",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": self.temp[0],
        "__EVENTVALIDATION": self.temp[1],
        "__ASYNCPOST": "true",
        "ctl00$contentPlaceHolder$btnQuery": "查詢"}
    self.headers = {'Referer': self.start_urls,
                    'Accept': ' application/json, text/javascript, */*',
                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                    'X-Requested-With': 'XMLHttpRequest',
                    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
                    }
def __init__(self, *arg, **argdict):
    """Initialize extraction-rule attributes, then delegate to Spider."""
    # XPath extraction rules; populated later from the crawl-rule config.
    self.rule = ''
    self.titleXpath = ''
    self.descriptionXpath = ''
    # NOTE(review): attribute name spelling ("Lenght") kept — external code
    # may reference it.
    self.descriptionLenght = 0
    self.linkXpath = ''
    self.imgUrlXpath = ''
    self.imageNum = 1
    self.videoUrlXpath = ''
    self.pubDateXpath = ''
    self.guidXpath = ''
    self.rule_id = ''
    self.checkTxtXpath = ''
    self.is_remove_namespaces = False
    # Crawl bookkeeping.
    self.last_md5 = ''
    self.next_request_url = ''
    Spider.__init__(self, *arg, **argdict)
    # Per-crawl runtime state.
    self.currentNode = None
    self.isDone = False
    self.isFirstListPage = True
def __init__(self):
    """Prepare the ASP.NET postback form data for a fixed product query.

    Dates are in the ROC (民國) calendar, e.g. "105/04/22" = 2016-04-22.
    """
    Spider.__init__(self)
    # (VIEWSTATE, EVENTVALIDATION) tokens scraped from the live page.
    self.temp = get_viewstate()
    self.formdata = {
        "ctl00$ScriptManager_Master": "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery",
        "ctl00$ucLogin$txtMemberID": "",
        "ctl00$ucLogin$txtPassword": "",
        "ctl00$ucLogin$txtValCode": "",
        "ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar": "S",
        # Hard-coded single-day window (ROC calendar).
        "ctl00$contentPlaceHolder$txtSTransDate": "105/04/22",
        "ctl00$contentPlaceHolder$txtETransDate": "105/04/22",
        "ctl00$contentPlaceHolder$txtMarket": "全部市場",
        "ctl00$contentPlaceHolder$hfldMarketNo": "ALL",
        # Fixed product: FE 冬瓜 (winter melon).
        "ctl00$contentPlaceHolder$txtProduct": "FE 冬瓜",
        "ctl00$contentPlaceHolder$hfldProductNo": "FE",
        "ctl00$contentPlaceHolder$hfldProductType": "B",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": self.temp[0],
        "__EVENTVALIDATION": self.temp[1],
        "__ASYNCPOST": "true",
        "ctl00$contentPlaceHolder$btnQuery": "查詢",
    }
    self.headers = {
        'Referer': self.start_urls,
        'Accept': ' application/json, text/javascript, */*',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    }
def __init__(self, **kwargs):
    """Configure a generic product spider from keyword arguments.

    Keyword Args:
        start_urls: ';'-separated list of entry URLs (required).
        allowed_domains: optional ';'-separated domain whitelist; when
            absent, the hostnames of start_urls are used.
        pagination_xpath / item_xpath / title_xpath / img_xpath /
        price_xpath / description_xpath: extraction XPaths (required).
    """
    # Shallow copy so later mutation cannot affect the caller's dict.
    kwargs = dict(kwargs)
    self.logger.info(u'Spider arguments:\n{}'.format(pp.pformat(kwargs)))
    Spider.__init__(self, **kwargs)
    self.settings = get_project_settings()
    self.start_urls = kwargs['start_urls'].split(';')
    if 'allowed_domains' in kwargs and kwargs['allowed_domains'] is not None:
        self.allowed_domains = kwargs['allowed_domains'].split(';')
    else:
        # Default the whitelist to the entry URLs' own hosts.
        self.allowed_domains = []
        for url in self.start_urls:
            parsed_url = urlparse(url)
            self.allowed_domains.append(parsed_url.hostname)
    self.pagination_xpath = kwargs['pagination_xpath']
    self.item_xpath = kwargs['item_xpath']
    self.title_xpath = kwargs['title_xpath']
    self.img_xpath = kwargs['img_xpath']
    self.price_xpath = kwargs['price_xpath']
    # Fix: raw string — the original "\d+\,\d+" relied on invalid string
    # escapes (\d, \,); the compiled pattern is unchanged. Matches prices
    # like "1,99" (comma decimal separator).
    self.price_regex = re.compile(r"\d+,\d+")
    self.description_xpath = kwargs['description_xpath']
def __init__(self, *args, **kwargs):
    """Initialize both base classes.

    Fix: the original called ``SpiderBase.__init__(*args, **kwargs)``
    without ``self``, so the first positional argument (or nothing) would
    have been bound as the instance — SpiderBase never initialized this
    object. ``self`` is now passed explicitly, matching the sibling
    spiders in this file.
    """
    Spider.__init__(self)
    SpiderBase.__init__(self, *args, **kwargs)
def __init__(self):
    """Initialize the spider with empty video-collection state."""
    Spider.__init__(self)
    # Total number of videos; unknown (None) until a listing is parsed.
    self.__video_count__ = None
    # URLs of videos discovered so far.
    self.__video_links__ = []
def __init__(self):
    """Build the POST form data and headers for querying yesterday's data.

    Dates are converted from the Gregorian calendar to the ROC (民國)
    calendar expected by the remote ASP.NET form (ROC year = year - 1911).
    """
    from datetime import timedelta  # local import; avoids touching file-level imports
    Spider.__init__(self)
    # Fix: date.today().day - 1 raises ValueError on the first day of any
    # month; timedelta handles month/year rollover correctly.
    yesterday = date.today() - timedelta(days=1)
    today = datetime(yesterday.year, yesterday.month, yesterday.day)

    def date_trans(date_):
        # 'YYYY/MM/DD' (Gregorian) -> 'ROC-year/MM/DD'.
        year = int(date_.split('/')[0]) - 1911
        return '/'.join([str(year)] + date_.split('/')[1:])

    # (VIEWSTATE, EVENTVALIDATION) tokens scraped from the live page.
    self.temp = get_viewstate()
    self.formdata = {
        "ctl00$ScriptManager_Master": "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery",
        "ctl00$ucLogin$txtMemberID": "",
        "ctl00$ucLogin$txtPassword": "",
        "ctl00$ucLogin$txtValCode": "",
        "ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar": "S",
        "ctl00$contentPlaceHolder$txtSTransDate": date_trans(today.strftime('%Y/%m/%d')),
        "ctl00$contentPlaceHolder$txtETransDate": date_trans(today.strftime('%Y/%m/%d')),
        "ctl00$contentPlaceHolder$txtMarket": "全部市場",
        "ctl00$contentPlaceHolder$hfldMarketNo": "ALL",
        "ctl00$contentPlaceHolder$txtProduct": "全部產品",
        "ctl00$contentPlaceHolder$hfldProductNo": "ALL",
        "ctl00$contentPlaceHolder$hfldProductType": "A",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": self.temp[0],
        "__EVENTVALIDATION": self.temp[1],
        "__ASYNCPOST": "true",
        "ctl00$contentPlaceHolder$btnQuery": "查詢"
    }
    self.headers = {
        'Referer': self.start_urls,
        'Accept': ' application/json, text/javascript, */*',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
    }
def __init__(self, *args, **kwargs):
    """Initialize the base Spider; the redis connection is created later."""
    Spider.__init__(self, *args, **kwargs)
    # Deferred: established when the spider is opened, not at construction.
    self.redis_conn = None
def __init__(self, *args, **kwargs):
    """Initialize the base Spider, then apply keyword configuration.

    Note: positional ``*args`` are accepted but intentionally not
    forwarded, mirroring the original behavior.
    """
    Spider.__init__(self)
    self._set_config(**kwargs)
def __init__(self, driver_path='/Users/liulizhe/Desktop/python_file/chromedriver'):
    """Start a Chrome WebDriver session for browser-driven scraping.

    Args:
        driver_path: filesystem path to the chromedriver binary. Defaults
            to the previously hard-coded location, so existing callers are
            unaffected; other machines can now pass their own path.
    """
    Spider.__init__(self)
    self.browser = webdriver.Chrome(driver_path)
def __init__(self, *args, **kwargs):
    """Initialize both cooperating bases with identical arguments."""
    # Both bases receive the full argument list, in this fixed order.
    BaseSpider.__init__(self, *args, **kwargs)
    SD.__init__(self, *args, **kwargs)
def __init__(self, name=None, **kwargs):
    """Pass the spider name and keyword options straight to Spider."""
    Spider.__init__(self, name, **kwargs)