Exemple #1
0
 def __init__(self, start_date, end_date):
     """Initialise the spider for https://chipublib.bibliocommons.com/.

     ``Spider.__init__`` is run without arguments, then ``ApiBase.__init__``
     receives the site URL, ``start_date``, ``end_date`` (both forwarded
     unchanged) and the ``'%Y-%m-%d'`` date format.
     """
     Spider.__init__(self)
     site_url = 'https://chipublib.bibliocommons.com/'
     ApiBase.__init__(self, site_url, start_date, end_date,
                      date_format='%Y-%m-%d')
Exemple #2
0
    def __init__(self, *arg, **argdict):
        """Initialize the object's attributes.

        All positional and keyword arguments are handed to
        ``Spider.__init__`` unchanged. The rule/XPath/paging attributes get
        their defaults before that call; ``currentNode``, ``isDone`` and
        ``isFirstListPage`` are assigned after it.
        """
        defaults_before_base_init = (
            ('rule', ''),
            ('titleXpath', ''),
            ('descriptionXpath', ''),
            ('descriptionLenght', 0),
            ('contentXpath', ''),
            ('linkXpath', ''),
            ('imgUrlXpath', ''),
            ('imageNum', 0),
            ('goodRemarkXpath', ''),
            ('badRemarkXpath', ''),
            ('zhunfaRemarkXpath', ''),
            ('videoUrlXpath', ''),
            ('pubDateXpath', ''),
            ('guidXpath', ''),
            ('rule_id', ''),
            ('checkTxtXpath', ''),
            ('max_deepth', 0),
            ('is_duplicate', False),
            ('last_md5', ''),
            ('next_request_url', ''),
            ('next_page_url_prefix', ''),
        )
        for attr_name, initial in defaults_before_base_init:
            setattr(self, attr_name, initial)

        Spider.__init__(self, *arg, **argdict)

        defaults_after_base_init = (
            ('currentNode', None),
            ('isDone', False),
            ('isFirstListPage', True),
        )
        for attr_name, initial in defaults_after_base_init:
            setattr(self, attr_name, initial)
Exemple #3
0
    def __init__(self, **kwargs):
        """Build the spider from a JSON config given as a file or as text.

        All keyword arguments are forwarded to ``Spider.__init__``. The ones
        read here are:

        config_file -- path of a JSON config file; used when truthy.
        config -- JSON text; used only when ``config_file`` is falsy.
        test_url -- stored as ``self.test_url``.
        max_pages -- converted with ``int()`` unless it is ``None``.
        test_extractor -- stored as ``self.test_extractor``.
        test_entity -- stored as ``self.test_entity``.

        Raises ``Exception`` when neither ``config_file`` nor ``config`` is
        truthy.
        """
        Spider.__init__(self, **kwargs)

        self.config_file = kwargs.get('config_file', None)
        config = kwargs.get('config', None)
        if self.config_file:
            # Context manager so the file handle is closed as soon as the
            # config is parsed, even if parsing raises.
            with open(self.config_file) as config_fp:
                jconfig = jsonload(config_fp)
        elif config:
            jconfig = jsonloads(config)
        else:
            logger.critical('config_file or config is expected')
            raise Exception('config_file or config is expected')

        self.template = config_parse(jconfig)

        # Single entry URL to crawl; usable for testing or for crawling one
        # page on its own.
        self.test_url = kwargs.get('test_url', None)

        # Number of pages to crawl.
        max_pages = kwargs.get('max_pages', None)
        self.max_pages = int(max_pages) if max_pages is not None else None

        # Extractor test.
        self.test_extractor = kwargs.get('test_extractor', None)

        # Entity test.
        self.test_entity = kwargs.get('test_entity', None)
Exemple #4
0
    def __init__(self, *arg, **argdict):
        """Initialize the object's attributes.

        All positional and keyword arguments are handed to
        ``Spider.__init__`` unchanged. The rule/XPath/paging attributes get
        their defaults before that call; ``currentNode``, ``isDone`` and
        ``isFirstListPage`` are assigned after it.
        """
        defaults_before_base_init = (
            ('rule', ''),
            ('titleXpath', ''),
            ('descriptionXpath', ''),
            ('descriptionLenght', 0),
            ('contentXpath', ''),
            ('linkXpath', ''),
            ('imgUrlXpath', ''),
            ('imageNum', 0),
            ('goodRemarkXpath', ''),
            ('badRemarkXpath', ''),
            ('zhunfaRemarkXpath', ''),
            ('videoUrlXpath', ''),
            ('pubDateXpath', ''),
            ('guidXpath', ''),
            ('rule_id', ''),
            ('checkTxtXpath', ''),
            ('max_deepth', 0),
            ('is_duplicate', False),
            ('last_md5', ''),
            ('next_request_url', ''),
            ('next_page_url_prefix', ''),
        )
        for attr_name, initial in defaults_before_base_init:
            setattr(self, attr_name, initial)

        Spider.__init__(self, *arg, **argdict)

        defaults_after_base_init = (
            ('currentNode', None),
            ('isDone', False),
            ('isFirstListPage', True),
        )
        for attr_name, initial in defaults_after_base_init:
            setattr(self, attr_name, initial)
Exemple #5
0
    def __init__(self, **kwargs):
        """Build the spider from a JSON config given as a file or as text.

        All keyword arguments are forwarded to ``Spider.__init__``. The ones
        read here are:

        config_file -- path of a JSON config file; used when truthy.
        config -- JSON text; used only when ``config_file`` is falsy.
        test_url -- stored as ``self.test_url``.
        max_pages -- converted with ``int()`` unless it is ``None``.
        test_extractor -- stored as ``self.test_extractor``.
        test_entity -- stored as ``self.test_entity``.

        Raises ``Exception`` when neither ``config_file`` nor ``config`` is
        truthy.
        """
        Spider.__init__(self, **kwargs)

        self.config_file = kwargs.get('config_file', None)
        config = kwargs.get('config', None)
        if self.config_file:
            # Context manager so the file handle is closed as soon as the
            # config is parsed, even if parsing raises.
            with open(self.config_file) as config_fp:
                jconfig = jsonload(config_fp)
        elif config:
            jconfig = jsonloads(config)
        else:
            logger.critical('config_file or config is expected')
            raise Exception('config_file or config is expected')

        self.template = config_parse(jconfig)

        # Single entry URL to crawl; usable for testing or for crawling one
        # page on its own.
        self.test_url = kwargs.get('test_url', None)

        # Number of pages to crawl.
        max_pages = kwargs.get('max_pages', None)
        self.max_pages = int(max_pages) if max_pages is not None else None

        # Extractor test.
        self.test_extractor = kwargs.get('test_extractor', None)

        # Entity test.
        self.test_entity = kwargs.get('test_entity', None)
Exemple #6
0
 def __init__(self, start_date, end_date):
     """Initialise the spider for https://greatlakes.org/.

     ``Spider.__init__`` is run without arguments, then
     ``SpiderBase.__init__`` receives the site URL, ``start_date`` and
     ``end_date`` (forwarded unchanged), ``date_format='%B %d'`` and
     ``request_date_format='%Y-%m-%d'``.
     """
     Spider.__init__(self)
     site_url = 'https://greatlakes.org/'
     SpiderBase.__init__(self, site_url, start_date, end_date,
                         date_format='%B %d',
                         request_date_format='%Y-%m-%d')
Exemple #7
0
    def __init__(self, *args, **kwargs):
        """Initialise the spider, its worker id and its dynamic item class.

        Arguments are forwarded to ``Spider.__init__``. ``worker_id`` is
        "<hostname>_<ip address>" with every '.' replaced by '-'.
        ``base_item_cls`` is a new ``Item`` subclass named
        "RawResponseItem" whose attributes pair each entry of
        ``BASE_FIELD`` with a value drawn from ``self.gen_field``.
        """
        Spider.__init__(self, *args, **kwargs)

        host_and_ip = "%s_%s" % (socket.gethostname(), get_ip_address())
        self.worker_id = host_and_ip.replace('.', '-')

        self.gen_field = self._yield_field()
        item_attrs = dict(zip(BASE_FIELD, self.gen_field))
        self.base_item_cls = type("RawResponseItem", (Item, ), item_attrs)
	def __init__(self, userName='', password='', *args, **kwargs):
		"""Store the login credentials, or report that they are missing.

		Remaining arguments are forwarded to ``Spider.__init__``. When both
		``userName`` and ``password`` are non-empty after removing spaces
		they are stored on the instance; otherwise a message is printed
		and ``self.closed`` is called.
		"""
		Spider.__init__(self, *args, **kwargs)
		if userName.replace(' ', '') != '' and password.replace(' ', '') != '':
			self.userName = userName
			self.password = password
		else:
			print('请输入账号密码')
			self.closed('退出')
Exemple #9
0
 def __init__(self, crawler, *args, **kwargs):
     """Initialise the spider and read its seed source from crawler settings.

     ``crawler.settings`` is queried for ``SEED_MODE``. In ``'seeds'`` mode
     the ``SEEDS_FILE`` setting is passed to ``self.load_seeds``; in
     ``'redis'`` mode the ``REDIS_CLIENT`` setting is read.

     Positional ``*args`` are accepted but are not forwarded to
     ``Spider.__init__``, which is always called with ``name=None``.
     """
     # Debug trace; this is Python 2 print-statement syntax.
     print "wwj debug in scrapy spider init"
     Spider.__init__(self, name=None, **kwargs)
     self.seed_mode= crawler.settings.get('SEED_MODE')
     if(self.seed_mode == 'seeds'):
         seeds_file = crawler.settings.get('SEEDS_FILE')
         self.load_seeds(seeds_file)
     elif (self.seed_mode == 'redis'):
         # NOTE(review): the value is read but never used in the visible
         # code -- this branch looks truncated; confirm against the full file.
         redis_client_path = crawler.settings.get('REDIS_CLIENT')
Exemple #10
0
 def __init__(self, userName='', password='', *args, **kwargs):
     """Store the login credentials, or report that they are missing.

     Remaining arguments are forwarded to ``Spider.__init__``. When both
     ``userName`` and ``password`` are non-empty after removing spaces
     they are stored on the instance; otherwise a message is printed and
     ``self.closed`` is called.
     """
     Spider.__init__(self, *args, **kwargs)
     if userName.replace(' ', '') != '' and password.replace(' ', '') != '':
         self.userName = userName
         self.password = password
     else:
         print('请输入账号密码')
         self.closed('退出')
Exemple #11
0
    def __init__(self, *args, **kwargs):
        """Initialise the spider, its worker id and its dynamic item class.

        Arguments are forwarded to ``Spider.__init__``. ``worker_id`` is
        "<hostname>_<ip address>" with every '.' replaced by '_'.
        ``base_item_cls`` is a new ``Item`` subclass named
        "RawResponseItem" with one attribute per entry of ``BASE_FIELD``.
        ``redis_conn`` starts out as ``None``.
        """
        Spider.__init__(self, *args, **kwargs)

        host_and_ip = "%s_%s" % (socket.gethostname(), get_ip_address())
        self.worker_id = host_and_ip.replace('.', '_')

        # One Field() instance is created and shared by every attribute.
        shared_field = Field()
        item_attrs = {field_name: shared_field for field_name in BASE_FIELD}
        self.base_item_cls = type("RawResponseItem", (Item, ), item_attrs)

        self.redis_conn = None
Exemple #12
0
 def __init__(self, start_date, end_date):
     """Initialise the spider for the greatlakes.org iCal events feed.

     ``Spider.__init__`` is run without arguments, then ``ApiBase.__init__``
     receives the feed URL, ``start_date``, ``end_date`` (forwarded
     unchanged) and the ``'%Y-%m-%d'`` date format. Finally ``self.reader``
     is built with ``ICal.from_url`` from ``self.base_url`` and the
     'America/Chicago' timezone.
     """
     Spider.__init__(self)
     ApiBase.__init__(
         self,
         'https://greatlakes.org/events/?ical=1&tribe_display=list',
         start_date,
         end_date,
         date_format='%Y-%m-%d')
     # self.base_url is presumably set by ApiBase.__init__ -- it is not
     # assigned anywhere in this method.
     self.reader = ICal.from_url(self.base_url, timezone('America/Chicago'))
 def __init__(self):
     """Initialise the spider with fixed form data and request headers.

     ``self.formdata`` holds the hard-coded form fields (date 2016/03/19)
     and ``self.headers`` holds a single hard-coded 'User-Agent' entry.
     """
     Spider.__init__(self)

     self.formdata = dict(
         zipcode='2016/03/19',
         syr='2016',
         smo='03',
         sdy='19',
         B1='PLOT')

     # Adjacent literals are joined into one string by the compiler.
     user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                   '50.0.2661.75 Safari/537.36')
     self.headers = {'User-Agent': user_agent}
Exemple #14
0
    def __init__(self, *arg, **argdict):
        """Initialize the object's attributes.

        All positional and keyword arguments are handed to
        ``Spider.__init__`` unchanged. The XPath and rule attributes get
        their defaults before that call; ``currentNode`` is assigned after
        it.
        """
        defaults_before_base_init = (
            ('titleXpath', ''),
            ('descriptionXpath', ''),
            ('descriptionLenght', 0),
            ('linkXpath', ''),
            ('imgUrlXpath', ''),
            ('imageNum', 1),
            ('videoUrlXpath', ''),
            ('pubDateXpath', ''),
            ('guidXpath', ''),
            ('rule_id', ''),
            ('checkTxtXpath', ''),
            ('is_remove_namespaces', False),
        )
        for attr_name, initial in defaults_before_base_init:
            setattr(self, attr_name, initial)

        Spider.__init__(self, *arg, **argdict)

        self.currentNode = None
Exemple #15
0
 def __init__(self, name=None, **kwargs):
     """Initialise the spider and load its archive/topic URL buffers.

     ``name`` and ``kwargs`` are forwarded to ``Spider.__init__``.

     ``arch_urls.txt`` (relative to the working directory) is read line by
     line; ``self.arch_n`` is its line count. When both URL buffers are
     empty, every line is added to ``self.arch`` without its line
     terminator and the buffer is dumped.

     ``self.topic_n`` is set from ``topic_num.pkl`` as the larger of the
     stored number and the current topic buffer length.
     """
     Spider.__init__(self, name, **kwargs)

     self.arch = LORUrlBuf(DATA_BASE_PATH + '/arch.pkl')
     self.topic = LORUrlBuf(DATA_BASE_PATH + '/topic.pkl')

     with open('arch_urls.txt', 'r') as f:
         start_urls = f.readlines()
     self.arch_n = len(start_urls)

     if not self.arch.urls and not self.topic.urls:
         for url in start_urls:
             # Drop only a real trailing newline. Slicing off the last
             # character unconditionally would truncate the final URL
             # when the file does not end with a newline.
             self.arch.append(url[:-1] if url.endswith('\n') else url)
         # Dump all the urls
         self.arch.dump()

     try:
         # NOTE: pickle is only safe on trusted data; this file is assumed
         # to be written by this project -- confirm.
         with open(DATA_BASE_PATH + '/topic_num.pkl', 'rb') as f:
             n = pk.load(f)
             self.topic_n = max(n, len(self.topic.urls))
     except Exception as e:
         # NOTE(review): best-effort load; on failure self.topic_n is left
         # unset.
         print(e)
    def __init__(self):
        """Prepare the POST form data and headers for a query dated yesterday.

        ``self.temp`` holds the result of ``get_viewstate()``; its first two
        items fill the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` form
        fields. Both the start and end date fields are set to yesterday,
        written with the year reduced by 1911.
        """
        # Local import: timedelta is needed only for the date fix below.
        from datetime import timedelta

        Spider.__init__(self)

        # Subtracting a timedelta stays valid on the first day of a month,
        # where building a date from ``day - 1`` would use day 0 and raise
        # ValueError. It also reads the clock once instead of three times.
        yesterday = date.today() - timedelta(days=1)

        def date_trans(date_):
            """Convert a Gregorian 'YYYY/MM/DD' string to its Minguo-year form."""
            parts = date_.split('/')
            parts[0] = str(int(parts[0]) - 1911)
            return '/'.join(parts)

        query_date = date_trans(yesterday.strftime('%Y/%m/%d'))

        self.temp = get_viewstate()
        self.formdata = {
            "ctl00$ScriptManager_Master":
            "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery",
            "ctl00$ucLogin$txtMemberID": "",
            "ctl00$ucLogin$txtPassword": "",
            "ctl00$ucLogin$txtValCode": "",
            "ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar": "S",
            "ctl00$contentPlaceHolder$txtSTransDate": query_date,
            "ctl00$contentPlaceHolder$txtETransDate": query_date,
            "ctl00$contentPlaceHolder$txtMarket": "全部市場",
            "ctl00$contentPlaceHolder$hfldMarketNo": "ALL",
            "ctl00$contentPlaceHolder$txtProduct": "全部產品",
            "ctl00$contentPlaceHolder$hfldProductNo": "ALL",
            "ctl00$contentPlaceHolder$hfldProductType": "A",
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": self.temp[0],
            "__EVENTVALIDATION": self.temp[1],
            "__ASYNCPOST": "true",
            "ctl00$contentPlaceHolder$btnQuery": "查詢"}
        self.headers = {'Referer': self.start_urls,
                        'Accept': ' application/json, text/javascript, */*',
                        'Content-Type':
                        'application/x-www-form-urlencoded; charset=UTF-8',
                        'X-Requested-With': 'XMLHttpRequest',
                        'User-Agent':
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
                        }
    def __init__(self, *arg, **argdict):
        """Initialize the object's attributes.

        All positional and keyword arguments are handed to
        ``Spider.__init__`` unchanged. The rule/XPath/paging attributes get
        their defaults before that call; ``currentNode``, ``isDone`` and
        ``isFirstListPage`` are assigned after it.
        """
        defaults_before_base_init = (
            ('rule', ''),
            ('titleXpath', ''),
            ('descriptionXpath', ''),
            ('descriptionLenght', 0),
            ('linkXpath', ''),
            ('imgUrlXpath', ''),
            ('imageNum', 1),
            ('videoUrlXpath', ''),
            ('pubDateXpath', ''),
            ('guidXpath', ''),
            ('rule_id', ''),
            ('checkTxtXpath', ''),
            ('is_remove_namespaces', False),
            ('last_md5', ''),
            ('next_request_url', ''),
        )
        for attr_name, initial in defaults_before_base_init:
            setattr(self, attr_name, initial)

        Spider.__init__(self, *arg, **argdict)

        defaults_after_base_init = (
            ('currentNode', None),
            ('isDone', False),
            ('isFirstListPage', True),
        )
        for attr_name, initial in defaults_after_base_init:
            setattr(self, attr_name, initial)
    def __init__(self):
        """Prepare the POST form data and headers for one fixed query.

        ``self.temp`` holds the result of ``get_viewstate()``; its first two
        items fill the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` form
        fields. The dates (105/04/22) and the product (FE) are hard-coded.
        """
        Spider.__init__(self)

        self.temp = get_viewstate()
        viewstate_items = self.temp

        form_pairs = [
            ("ctl00$ScriptManager_Master",
             "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery"),
            ("ctl00$ucLogin$txtMemberID", ""),
            ("ctl00$ucLogin$txtPassword", ""),
            ("ctl00$ucLogin$txtValCode", ""),
            ("ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar", "S"),
            ("ctl00$contentPlaceHolder$txtSTransDate", "105/04/22"),
            ("ctl00$contentPlaceHolder$txtETransDate", "105/04/22"),
            ("ctl00$contentPlaceHolder$txtMarket", "全部市場"),
            ("ctl00$contentPlaceHolder$hfldMarketNo", "ALL"),
            ("ctl00$contentPlaceHolder$txtProduct", "FE 冬瓜"),
            ("ctl00$contentPlaceHolder$hfldProductNo", "FE"),
            ("ctl00$contentPlaceHolder$hfldProductType", "B"),
            ("__EVENTTARGET", ""),
            ("__EVENTARGUMENT", ""),
            ("__VIEWSTATE", viewstate_items[0]),
            ("__EVENTVALIDATION", viewstate_items[1]),
            ("__ASYNCPOST", "true"),
            ("ctl00$contentPlaceHolder$btnQuery", "查詢"),
        ]
        self.formdata = dict(form_pairs)

        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
        self.headers = {
            'Referer': self.start_urls,
            'Accept': ' application/json, text/javascript, */*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': user_agent,
        }
Exemple #19
0
    def __init__(self, **kwargs):
        """Configure the spider entirely from keyword arguments.

        All keyword arguments are logged and forwarded to
        ``Spider.__init__``.

        Required keys (a missing one raises ``KeyError``):
        ``start_urls`` (';'-separated string), ``pagination_xpath``,
        ``item_xpath``, ``title_xpath``, ``img_xpath``, ``price_xpath``
        and ``description_xpath``.

        Optional key: ``allowed_domains`` (';'-separated string). When it
        is absent or ``None``, the hostname of each start URL is used.
        """
        self.logger.info(u'Spider arguments:\n{}'.format(pp.pformat(kwargs)))
        Spider.__init__(self, **kwargs)
        self.settings = get_project_settings()

        self.start_urls = kwargs['start_urls'].split(';')

        allowed_domains = kwargs.get('allowed_domains')
        if allowed_domains is not None:
            self.allowed_domains = allowed_domains.split(';')
        else:
            self.allowed_domains = [
                urlparse(url).hostname for url in self.start_urls
            ]

        self.pagination_xpath = kwargs['pagination_xpath']
        self.item_xpath = kwargs['item_xpath']
        self.title_xpath = kwargs['title_xpath']
        self.img_xpath = kwargs['img_xpath']
        self.price_xpath = kwargs['price_xpath']
        # Raw string: in a normal literal "\d" and "\," are invalid escape
        # sequences, which newer Python versions warn about. The pattern
        # text itself is unchanged.
        self.price_regex = re.compile(r"\d+\,\d+")
        self.description_xpath = kwargs['description_xpath']
 def __init__(self, *args, **kwargs):
     """Initialise both base classes.

     ``Spider.__init__`` is run without arguments; every positional and
     keyword argument is forwarded to ``SpiderBase.__init__``.
     """
     Spider.__init__(self)
     # Calling __init__ through the class does not bind an instance, so
     # self has to be passed explicitly; without it the first positional
     # argument (or nothing at all) would be taken as the instance.
     SpiderBase.__init__(self, *args, **kwargs)
Exemple #21
0
 def __init__(self):
     """Initialise the spider with no video links and an unset video count."""
     Spider.__init__(self)
     self.__video_count__ = None
     self.__video_links__ = list()
Exemple #22
0
    def __init__(self):
        """Prepare the POST form data and headers for a query dated yesterday.

        ``self.temp`` holds the result of ``get_viewstate()``; its first two
        items fill the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` form
        fields. Both the start and end date fields are set to yesterday,
        written with the year reduced by 1911.
        """
        # Local import: timedelta is needed only for the date fix below.
        from datetime import timedelta

        Spider.__init__(self)

        # Subtracting a timedelta stays valid on the first day of a month,
        # where building a date from ``day - 1`` would use day 0 and raise
        # ValueError. It also reads the clock once instead of three times.
        yesterday = date.today() - timedelta(days=1)

        def date_trans(date_):
            """Convert a Gregorian 'YYYY/MM/DD' string to its Minguo-year form."""
            parts = date_.split('/')
            parts[0] = str(int(parts[0]) - 1911)
            return '/'.join(parts)

        query_date = date_trans(yesterday.strftime('%Y/%m/%d'))

        self.temp = get_viewstate()
        self.formdata = {
            "ctl00$ScriptManager_Master":
            "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery",
            "ctl00$ucLogin$txtMemberID": "",
            "ctl00$ucLogin$txtPassword": "",
            "ctl00$ucLogin$txtValCode": "",
            "ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar": "S",
            "ctl00$contentPlaceHolder$txtSTransDate": query_date,
            "ctl00$contentPlaceHolder$txtETransDate": query_date,
            "ctl00$contentPlaceHolder$txtMarket": "全部市場",
            "ctl00$contentPlaceHolder$hfldMarketNo": "ALL",
            "ctl00$contentPlaceHolder$txtProduct": "全部產品",
            "ctl00$contentPlaceHolder$hfldProductNo": "ALL",
            "ctl00$contentPlaceHolder$hfldProductType": "A",
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": self.temp[0],
            "__EVENTVALIDATION": self.temp[1],
            "__ASYNCPOST": "true",
            "ctl00$contentPlaceHolder$btnQuery": "查詢"
        }
        self.headers = {
            'Referer': self.start_urls,
            'Accept': ' application/json, text/javascript, */*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent':
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
        }
 def __init__(self, *args, **kwargs):
     """Forward all arguments to ``Spider.__init__`` and start with
     ``redis_conn`` set to ``None``.
     """
     Spider.__init__(
         self,
         *args,
         **kwargs)
     self.redis_conn = None
Exemple #24
0
    def __init__(self, *args, **kwargs):
        """Initialise the spider and apply its keyword configuration.

        ``Spider.__init__`` is run without arguments; positional ``args``
        are accepted but unused. The keyword arguments are handed to
        ``self._set_config``.
        """
        Spider.__init__(self)
        config_options = kwargs
        self._set_config(**config_options)
Exemple #25
0
 def __init__(
         self,
         chromedriver_path='/Users/liulizhe/Desktop/python_file/chromedriver'):
     """Initialise the spider and start a Chrome browser session.

     chromedriver_path -- value passed as the single positional argument
         of ``webdriver.Chrome``. The default is the previously
         hard-coded location, so existing no-argument callers behave as
         before; other machines can now supply their own path.
     """
     Spider.__init__(self)
     self.browser = webdriver.Chrome(chromedriver_path)
Exemple #26
0
 def __init__(self, *args, **kwargs):
     """Run ``BaseSpider.__init__`` and then ``SD.__init__``, each with the
     same positional and keyword arguments.
     """
     for base_cls in (BaseSpider, SD):
         base_cls.__init__(self, *args, **kwargs)
Exemple #27
0
 def __init__(self, name=None, **kwargs):
     """Forward ``name`` and all keyword arguments to ``Spider.__init__``;
     nothing else is initialised here.
     """
     Spider.__init__(
         self,
         name,
         **kwargs)