def __init__(self):
     #settings.overrides['DEPTH_LIMIT'] = 0
     settings.set('ITEM_PIPELINES', {'gentest1.pipelines.Gentest1Pipeline':1000})
     settings.set('LOG_FILE', 'log.txt')
     for n,d in self.documents.iteritems():
         self.allowed_domains.append(d['allow_url'])
         self.start_urls.append(d['start_url'])
Example #2
    def process_item(self, item, spider):
        name = item['name']
        picture_url = item['picture_url']
        count = item['count']
        spider_name = item['spider_name']

        # Build the CREATE TABLE statement for this spider's table
        create_table_sql = '''create table if not exists {}(id int unsigned primary key auto_increment not null,name varchar(40),picture_url varchar(300) default '',count int unsigned default 1)'''.format(
            spider_name)

        try:
            if settings.get('FLAG') == False:
                self.cs.execute(create_table_sql)
        except:
            settings.set('FLAG', 'True')

        sql = 'insert into {} values (null,"{}","{}",{});'.format(
            spider_name, name, picture_url, count)

        self.cs.execute(sql)
        # Fetch the data
        # data = self.cs.fetchall()

        self.conn.commit()

        return item
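
The pipeline above builds its INSERT with str.format, which breaks on quotes in the scraped values and is open to SQL injection. A minimal sketch of the same step using the driver's parameter binding (assuming the same MySQLdb/pymysql-style cursor on self.cs; the table name still has to be formatted in because identifiers cannot be bound, so spider_name is assumed to be a trusted value):

    def process_item(self, item, spider):
        # Let the driver escape the values instead of interpolating them.
        sql = 'insert into {} (name, picture_url, count) values (%s, %s, %s)'.format(
            item['spider_name'])
        self.cs.execute(sql, (item['name'], item['picture_url'], item['count']))
        self.conn.commit()
        return item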
 def __init__(self, *args, **kwargs):
     super(SunshineRewardsSpider, self).__init__(*args, **kwargs)
     settings.set('RETRY_HTTP_CODES', [500, 503, 504, 400, 408, 404] )
     settings.set('RETRY_TIMES', 5 )
     settings.set('REDIRECT_ENABLED', True)
     settings.set('METAREFRESH_ENABLED', True)
     settings.set('USER_AGENT', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36')
 def __init__(self, *args, **kwargs):
     super(SplenderSpider, self).__init__(*args, **kwargs)
     settings.set('RETRY_HTTP_CODES', [500, 503, 504, 400, 408, 404] )
     settings.set('RETRY_TIMES', 5 )
     settings.set('REDIRECT_ENABLED', True)
     settings.set('METAREFRESH_ENABLED', True)
     settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36')
Example #5
 def set_cookie(response):
     """
     Save the cookie.
     :param response: Scrapy response object
     """
     cookie = "".join([
         i.decode("UTF-8") for i in response.headers.getlist('Set-Cookie')
     ])
     settings.set("DAJIE_COOKIE", cookie)
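
The cookie stored in DAJIE_COOKIE above is presumably attached to later requests. A hedged sketch of how that could look (the setting name comes from the example; the method name and usage are illustrative, and `import scrapy` plus the same module-level settings object are assumed):

    def make_search_request(self, url):
        # Reuse the cookie captured by set_cookie() on a follow-up request.
        cookie = settings.get("DAJIE_COOKIE", "")
        return scrapy.Request(url, headers={"Cookie": cookie}, callback=self.parse)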
Example #6
 def __setting_fake_info_IP(self, signal):
     '''
     set fake ip
     '''
     package = signal.data
     self.fake_pool['ip'] = package.forge_info
     ip = []
     for fake_info in self.fake_pool['ip']:
         ip.append(fake_info.forgr_info)
     if len(ip):
         settings.set('HTTP_PROXY', ip)
Example #7
 def __setting_fake_info_UA(self, signal):
     '''
     set fake ua
     '''
     package = signal.data
     self.fake_pool['ua'] = package.forge_info
     ua = []
     for fake_info in self.fake_pool['ua']:
         ua.append(fake_info.forgr_info)
     if len(ua):
         settings.set('USER_AGENT', ua)
Example #8
    def __init__(self, *a, **kw):
        super(SuperSpider, self).__init__(*a, **kw)

        self.rd = redis.Redis(settings.get("REDIS_HOST"),
                              settings.get("REDIS_PORT"),
                              db=settings.get("MAIN_REDIS_DB"))

        domain = settings.get("DOMAIN")
        self.domain = Domain(domain)
        self.rule = Rule(domain)
        settings.set('DOWNLOAD_DELAY', float(self.domain["download_delay"]))
        settings.set('CONCURRENT_REQUESTS', int(self.domain["concurrent_requests"]))
 def __init__(self, *args, **kwargs):
   super(RegularSpider, self).__init__(*args, **kwargs)
   self.start_urls.append(kwargs["base_url"])
   # self.allowed_domains = kwargs["allowed_domains"]
   self.allowed_domains = [kwargs["allowed_domains"]]
   # if 'depth' in kwargs:
   settings.set(self.name, int(kwargs["depth"]), priority='cmdline')
   if 'username' in kwargs:
       self.username = kwargs['username']
   if 'password' in kwargs:
       self.password = kwargs['password']
   if 'login' in kwargs:
       self.login = True
    def __init__(self, crawl_date = None, region = None, *args, **kwargs):
        super(ParkingSpider, self).__init__(*args, **kwargs)
        # Work out the crawl date
        now = datetime.now()
        yesterday = now - timedelta(days=1)
        yesterday_str = yesterday.strftime('%Y-%m-%d')

        today_str = now.strftime('%Y-%m-%d')

        if crawl_date == 'today':
            self.crawl_date = today_str
        elif crawl_date == 'yesterday':
            self.crawl_date = yesterday_str
        else:
            self.crawl_date = crawl_date if crawl_date else yesterday_str
        #help(settings)
        settings.set('crawl_date', self.crawl_date)
        #settings['crawl_date'] = self.crawl_date
        self.now = now.strftime('%Y%m%d_%H%M')

        # Build the initial crawl URL
        url = settings['INIT_RENT_PARKING_URL']

        # Crawl by region
        self.region = region if region != None else 'all'
        self.region_num = self.map_region(self.region)
        if self.region_num != None:
            url += '&region1=%s' % self.region_num

        self.start_urls.append(url)

        # Save path for property listing pages
        self.list_output_path = './output/list_rent_parking/%s/%s/%s/' % (self.crawl_date, self.region, self.now)
        self.list_output_path = self.list_output_path.strip('\r')
        if not os.path.exists(self.list_output_path):
            os.makedirs(self.list_output_path)

        # Save path for property detail pages
        self.detail_output_path = './output/detail_rent_parking/%s/%s/%s/' % (self.crawl_date, self.region, self.now)
        self.detail_output_path = self.detail_output_path.strip('\r')
        if not os.path.exists(self.detail_output_path):
            os.makedirs(self.detail_output_path)

        # Save path for property images
        self.image_output_path = './output/image_rent_parking/'
        self.image_output_path = self.image_output_path.strip('\r')
        if not os.path.exists(self.image_output_path):
            os.makedirs(self.image_output_path)
        
        self.go_next = settings['GO_NEXT']
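
The three output-path blocks above repeat the same strip/exists/makedirs sequence. On Python 3.2+ this collapses into os.makedirs(path, exist_ok=True); a small hypothetical helper, as a sketch:

    def ensure_dir(path):
        # Create the directory (and any parents) if it does not exist yet.
        path = path.strip('\r')
        os.makedirs(path, exist_ok=True)
        return path

    # e.g. self.list_output_path = ensure_dir('./output/list_rent_parking/%s/%s/%s/'
    #                                         % (self.crawl_date, self.region, self.now))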
Example #11
    def __init__(self,**kwargs):
        
        if not 'config' in kwargs:
            err =  'failed to find seed file (config=*.conf)'
            print err
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
#         self.keywords = kwargs['keywords']
        self.load_conf(config)
        if self.Sleep_Flag=='SEARCH_ENGINE_SLEEP' or self.Sleep_Flag=='true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
        
        log_filename = self.conf_name.replace('.conf','')+'.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        #redis key
        self.meta_next_url = meta_redis_key()
        # Initialize Redis
        self.init_redis()
        self.redis_keyword = get_redis_key(self.conf_name)
        # Register signals
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle,signal=signals.spider_idle)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Save the URLs fetched in this run to check for overlap with the previous run: {keyword: md5(url)}
        self.urlmd5 = dict()
        self.log_writer = open('log.dat','a+') 
        self.date_from_url_re = re.compile("[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")
Example #12
    def __init__(self, **kwargs):
        # problem report
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 1010000
        # Mongo
        settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'newcar', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')
        self.nationp = dict()
        self.npcounts = 0
        # nation select
        self.browser = webdriver.PhantomJS(
            executable_path=settings['PHANTOMJS_PATH'])

        # desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        # proxy = webdriver.Proxy()
        # proxy.proxy_type = ProxyType.MANUAL
        # proxy.http_proxy = self.getProxy()
        # proxy.add_to_capabilities(desired_capabilities)

        # self.browser.start_session(desired_capabilities)
        # self.browser.set_page_load_timeout(12)
        # self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
        # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
        # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
        super(CarSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #13
    def __init__(self, **kwargs):

        # Store the search keyword in REFERER_NAME
        if len(kwargs) != 0:
            self._search_name = kwargs['search_name']
            settings.set("REFERER_NAME", self._search_name)
        else:
            self._search_name = settings.get("REFERER_NAME")

        # URLs for the JSON requests
        self._cookie_url = "https://so.dajie.com/job/search?keyword={}&from=job&clicktype=blank"
        self._url = 'https://so.dajie.com/job/ajax/search/filter?keyword={' \
                    '}&order=0&city&recruitType&salary&experience&page={}&positionFunction&_CSRFToken&ajax=1'

        super().__init__(**kwargs)
Example #14
class BookSpider(scrapy.Spider):
    name = 'BookSpider'

    settings.set('CONCURRENT_REQUESTS',100)

    def start_requests(self):
        return [
            scrapy.Request('https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python&pvid=186e2514605040b4987bfc7a62e3d5e0',callback=self.parseBookList)
            ]
    def parseBookList(self,response):       
        hrefs = response.xpath('//*[@id="J_goodsList"]/ul/li/div/div[1]/a/@href[starts-with(.,"//item.jd.com/")]').extract()
        for href in hrefs:
            result = re.match('.*/(\d+).*',href)
            productId = result.group(1)
            yield scrapy.Request('https:' + href,meta={'productId':productId,'url':'https:' + href},callback=self.parseBook)
    def parseBook(self, response):
        values = response.xpath('//*[@id="name"]/div[1]/text()').extract()
        title = ''
        for value in values:
            title += value
        title =  title.strip()
        press = response.xpath('//*[@id="parameter2"]/li[1]/a/text()').extract()[0]
        ISBN = response.xpath('//*[@id="parameter2"]/li[2]/@title').extract()[0]
        productId = response.meta['productId']
        url = response.meta['url']
        bookItem = NewBookItem()
        bookItem['url'] = url
        bookItem['title'] = title
        bookItem['press'] = press
        bookItem['ISBN'] = ISBN
        bookItem['productId'] = productId
        return bookItem
Example #15
    def __init__(self, crawl_date = None, region = None, *args, **kwargs):
        super(ResidenceImageSpider, self).__init__(*args, **kwargs)
        # Work out the crawl date
        now = datetime.now()
        yesterday = now - timedelta(days=1)
        yesterday_str = yesterday.strftime('%Y-%m-%d')

        today_str = now.strftime('%Y-%m-%d')

        if crawl_date == 'today':
            self.crawl_date = today_str
        elif crawl_date == 'yesterday':
            self.crawl_date = yesterday_str
        else:
            self.crawl_date = crawl_date if crawl_date else yesterday_str
        #help(settings)
        settings.set('crawl_date', self.crawl_date)
        #settings['crawl_date'] = self.crawl_date
        self.now = now.strftime('%Y%m%d_%H%M')
Example #16
    def __init__(self, search_name, *args, **kwargs):
        # Lagou login credentials
        self._username = settings['USERNAME']
        self._password = settings['PASSWORD']

        # login_cookies
        self.login_cookies = None

        # Store the search keyword in REFERER_NAME
        self._search_name = search_name
        settings.set("REFERER_NAME", self._search_name)

        # Starting page number
        self.page_no = settings['START_PAGE_NUM']

        # URL for the JSON request
        self._url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&kd={}&pn={}&'

        super(LagouSpider, self).__init__(*args)
Example #17
    def __init__(self):
        # Setting scrapy params
        settings.set('RETRY_HTTP_CODES', [503, 504, 400, 408, 404] )
        settings.set('RETRY_TIMES', 5)
        settings.set('REDIRECT_ENABLED', True)
        settings.set('METAREFRESH_ENABLED', True)

        # Excluded categories
        self.EXCLUDE_CATEGORY_LEVEL_1 = [
            # 'designers',
            'look book'
        ]
 def __init__(self, **kwargs):
     # problem report
     super(CarSpider, self).__init__(**kwargs)
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 1010000
     # Mongo
     settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'newcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
     self.nationp = dict()
     self.npcounts=0
Example #19
class PycoderSpider(CrawlSpider):
    name = "tut"
    settings.set('DEPTH_LIMIT', 1)
    start_urls = ['https://en.belstu.by']

    rules = (Rule(LinkExtractor(), callback='parse_url', follow=True), )

    def parse_url(self, response):
        item = MyItem()
        item['url'] = response.url
        links = response.xpath("//a[@href]/@href").extract()
        item['links'] = links
        item['countOfLinks'] = len(links)
        texts = response.xpath('//text()').extract()
        words = []
        for t in texts:
            t = ' '.join(t.split()).strip()
            if len(t) > 0:
                curWords = re.sub('[^0-9a-zA-Z]+', '*', t).split('*')
                curWords = list(filter(None, curWords))
                words.extend(curWords)
        
        item['countOfWords'] = len(words)

        frequency = []
        frequencyWords = []


        for w in words:
            l = len(w)
            finded = next((x for x in frequency if x['countLetters'] == l), None)

            if finded:
                finded['countWords'] = finded['countWords'] + 1
            else:
                newLength = LengthDictionary()
                newLength['countLetters'] = l
                newLength['countWords'] = 1
                frequency.append(newLength)
            
            findedWord = next((t for t in frequencyWords if t['word'] == w), None)
            if findedWord:
                findedWord['count'] = findedWord['count'] + 1
            else:
                newWord = WordsDictionary()
                newWord['word'] = w
                newWord['count'] = 1
                frequencyWords.append(newWord)    
        
        item['frequencyOfLength'] = frequency
        item['frequencyOfWords'] = frequencyWords

        return item
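
The two frequency loops above rescan the frequency and frequencyWords lists for every word, which is quadratic. collections.Counter produces both tallies in one pass; a sketch that keeps the original item classes (LengthDictionary and WordsDictionary come from the example's project):

    from collections import Counter

    def build_frequencies(words):
        word_counts = Counter(words)                    # word   -> occurrences
        length_counts = Counter(len(w) for w in words)  # length -> number of words

        frequency = []
        for letters, count in length_counts.items():
            entry = LengthDictionary()
            entry['countLetters'] = letters
            entry['countWords'] = count
            frequency.append(entry)

        frequencyWords = []
        for word, count in word_counts.items():
            entry = WordsDictionary()
            entry['word'] = word
            entry['count'] = count
            frequencyWords.append(entry)

        return frequency, frequencyWords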
Example #20
 def __init__(self, **kwargs):
     # args
     super(CarSpider, self).__init__(**kwargs)
     #problem report
     self.mailer = MailSender.from_settings(settings)
     self.counts=0
     # Mongo
     settings.set('CrawlCar_Num', carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'usedcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
     # MySQL
     mysqldb = MySQLdb.connect("192.168.1.94", "root", "Datauser@2017", "usedcar", port=3306)
     mysqldbc = mysqldb.cursor()
     # read
     mysqldbc.execute("select newcarurl from che58")
     items = mysqldbc.fetchall()
     self.urllist=[]
     df =pybloom.BloomFilter(carnum,0.01)
     for i in items:
         j=i[0]
         md5i= hashlib.md5(j)
         rf = df.add(md5i)
         if not rf:
             self.urllist.append(j)
Example #21
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 800000

        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')

        with codecs.open("D:/county.txt", "r", "utf-8") as f:
            filecontent = f.read()
            # print(filecontent)
            indexlist = re.findall("\d+\_\d+\_\d+|\d+\_\d+", filecontent)
            indexlist.append("0")
            # print(indexlist)
            datalist = re.findall("\[(.*?)\]", filecontent, re.S)
            # print(datalist)
        self.datadict = {}
        for index in indexlist:
            self.datadict[index] = datalist[indexlist.index(index)]

        # print(self.datadict)

        self.browser = webdriver.PhantomJS(
            executable_path=settings['PHANTOMJS_PATH'])
        # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
        # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
        super(CarSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
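
A side note on the loop that fills self.datadict above: datalist[indexlist.index(index)] rescans indexlist for every key and maps duplicate keys to the first position. If the two lists are the same length and positionally aligned (which the regexes suggest, but is an assumption), the mapping can be built in one pass:

    # Assumes indexlist and datalist have equal length and line up positionally.
    self.datadict = dict(zip(indexlist, datalist))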
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)

        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 2000000
        #MonGo
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'newcar', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')

        with open("blm/" + settings['MONGODB_DB'] + "/yiche_city.txt") as f:
            content = f.read()
            f.close()
        obj = json.loads(content)
        self.city_id_list = []
        for city in obj:
            self.city_id_list.append(city['cityId'])

        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        desired_capabilities[
            "phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

        self.browser = webdriver.PhantomJS(
            executable_path="/home/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
            desired_capabilities=desired_capabilities)
        # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
        # self.browser = webdriver.PhantomJS(executable_path="D:/phantomjs", desired_capabilities=desired_capabilities)
        self.browser.set_page_load_timeout(10)
        super(CarSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #23
    def __init__(self, **kwargs):

        if not 'config' in kwargs:
            err = 'failed to find seed file (config=*.conf)'
            print err
        if 'startdate' in kwargs:
            self.startdate = kwargs['startdate']
        else:
            self.startdate = (
                datetime.datetime.now() -
                datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
        if 'enddate' in kwargs:
            self.enddate = kwargs['enddate']
        else:
            self.enddate = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
        self.load_conf(config)
        if self.Sleep_Flag == 'SEARCH_ENGINE_SLEEP' or self.Sleep_Flag == 'true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY',
                         float(self.SE_Sleep_Base),
                         priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')

        log_filename = self.conf_name.replace('.conf', '') + '.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        # Initialize Redis
        self.init_redis()
        self.redis_keyword = get_redis_key()
        # Register signals
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle, signal=signals.spider_idle)
        sig.connect(self.close, signal=signals.spider_closed)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        self.conn_local = mysql.connect('meta', host='localhost')
        self.conn_local_cursor = self.conn_local.cursor()
        #        self.conn_local_cursor.execute('set global autocommit=1')
        try:
            self.meta_ip = get_meta_ip(network_card='enp7s0')
        except:
            self.meta_ip = get_meta_ip(network_card='eth0')
        # Initialize the state in the meta database
        self.init_state()
Example #24
    def __init__(self,
                 _code_list_path='',
                 _proxy_list_path='',
                 _concurrent_requests=32):
        super(mango, self).__init__()
        print('========== code list path =======', _code_list_path)
        self.code_list_path = _code_list_path
        self.proxy_list_path = _proxy_list_path

        # initialize redis
        self.r = redis.StrictRedis(host='localhost', port=6379, db=0)
        self.p = self.r.pubsub()
        # self.p.subscribe(**{'scrapy-channel': self.my_handler})
        # self.p.run_in_thread(sleep_time=0.001)

        option = webdriver.ChromeOptions()
        option.add_argument('headless')
        option.add_argument('blink-settings=imagesEnabled=false')
        option.add_argument('--ignore-certificate-errors')
        option.add_argument('--ignore-ssl-errors')
        option.add_argument("--no-sandbox")
        option.add_argument("--disable-impl-side-painting")
        option.add_argument("--disable-setuid-sandbox")
        option.add_argument("--disable-seccomp-filter-sandbox")
        option.add_argument("--disable-breakpad")
        option.add_argument("--disable-client-side-phishing-detection")
        option.add_argument("--disable-cast")
        option.add_argument("--disable-cast-streaming-hw-encoding")
        option.add_argument("--disable-cloud-import")
        option.add_argument("--disable-popup-blocking")
        option.add_argument("--disable-session-crashed-bubble")
        option.add_argument("--disable-ipv6")

        self.driver = webdriver.Chrome(
            executable_path='./data/chromedriver.exe', chrome_options=option)

        # Override CONCURRENT_REQUESTS at runtime (instead of settings.py)
        settings.set('CONCURRENT_REQUESTS', _concurrent_requests)
 def __init__(self, **kwargs):
     super(CarSpider, self).__init__(**kwargs)
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 2000000
     self.dbname = 'usedcar_evaluation'
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'usedcar_evaluation', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #26
 def __init__(self,**kwargs):
     print "test"
     # problem report
     self.counts = 0
     self.carnum = 500000
     # Mongo
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'usedcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #27
    def __init__(self,**kwargs):
        super(CarSpider,self).__init__(**kwargs)
        self.mailer=MailSender.from_settings(settings)
        self.counts=0
        self.carnum=800000

        settings.set('CrawlCar_Num',self.carnum,priority='cmdline')
        settings.set('MONGODB_DB','carbusiness',priority='cmdline')
        settings.set('MONGODB_COLLECTION',website,priority='cmdline')
Example #28
 def __init__(self, **kwargs):
     # report bug session
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 50000
     # Mongo setting
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'newcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
 def __init__(self, **kwargs):
     # problem report
     super(CarSpider, self).__init__(**kwargs)
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 1010000
     # Mongo
     settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'newcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
     self.nationp = dict()
     self.npcounts = 0
     self.browser = webdriver.PhantomJS(
         executable_path="/home/phantomjs-2.1.1-linux-x86_64/bin/phantomjs")
     super(CarSpider, self).__init__()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
 def __init__(self, **kwargs):
     super(CarSpider, self).__init__(**kwargs)
     #problem report
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 50000
     #mongo
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'newcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #31
 def __init__(self, **kwargs):
     print "do initial"
     super(CarSpider, self).__init__(**kwargs)
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 50000
     #MonGo
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'network', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
     print "finish initial"
Example #32
    def __init__(self,**kwargs):
        
        if not 'config' in kwargs:
            err =  'failed to find seed file (config=*.conf)'
            print err
        if 'startdate' in kwargs:
            self.startdate = kwargs['startdate']
        else:
            self.startdate = (datetime.datetime.now()-datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
        if 'enddate' in kwargs:
            self.enddate = kwargs['enddate']
        else:
            self.enddate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
        self.load_conf(config)
        if self.Sleep_Flag=='SEARCH_ENGINE_SLEEP' or self.Sleep_Flag=='true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
        
        log_filename = self.conf_name.replace('.conf','')+'.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        # Initialize Redis
        self.init_redis()
        self.redis_keyword = get_redis_key()
        # Register signals
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle,signal=signals.spider_idle)
        sig.connect(self.close,signal=signals.spider_closed)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        
        self.conn_local = mysql.connect('meta',host='localhost')
        self.conn_local_cursor = self.conn_local.cursor()
#        self.conn_local_cursor.execute('set global autocommit=1')
        try:
            self.meta_ip = get_meta_ip(network_card='enp7s0')
        except:
            self.meta_ip = get_meta_ip(network_card='eth0')
        # Initialize the state in the meta database
        self.init_state()
Example #33
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 80000
        self.city_list = [
            "hk", "you", "ls", "qh", "sjz", "san", "ty", "wx", "wc", "wn", "xa"
        ]

        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #34
 def __init__(self, **kwargs):
     #args
     super(CarSpider, self).__init__(**kwargs)
     #carnum
     self.carnum = 200000
     # problem report
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     # Mongo
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'newcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
     self.df = BloomFilter(capacity=self.carnum * 1.1, error_rate=0.001)
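
Example #20 above and this one both size a pybloom BloomFilter to the expected number of records. The usual dedupe pattern is to add a digest of each URL and skip it when add() reports it was already present; a sketch under that assumption (hashing the URL string itself rather than the md5 object, and encoding it so the same code also works on Python 3):

    def should_crawl(self, url):
        # BloomFilter.add returns True when the key was (probably) already added,
        # so a False return means this URL has not been seen before.
        seen = self.df.add(hashlib.md5(url.encode('utf-8')).hexdigest())
        return not seen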
    def __init__(self,**kwargs):
        super(CarSpider,self).__init__(**kwargs)
        self.mailer=MailSender.from_settings(settings)
        self.counts=0
        self.carnum=800000

        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')

        self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
        super(CarSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #36
    def __init__(self, **kwargs):
        # args
        super(CarSpider, self).__init__(**kwargs)

        # problem report
        self.mailer = MailSender.from_settings(settings)
        self.carnum = 2000000000
        self.counts = 0
        self.today = date.today()
        # Mongo
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'usedcar_evaluation', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #37
 def __init__(self, **kwargs):
     super(CarSpider, self).__init__(**kwargs)
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 800000
     self.city_count = 0
     self.headers = {
         "app-version": "C 3.9.17",
         "User-Agent": "EquityPrice/3.9.17",
     }
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'residual_value', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    def __init__(self, **kwargs):
        # args
        super(CarSpider, self).__init__(**kwargs)

        # problem report
        self.mailer = MailSender.from_settings(settings)
        self.carnum = 2000000000
        self.counts = 0
        # Mongo
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'usedcar_evaluation', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')
        self.citylist=["1","2","3","4","5","6","8","9","10","11","12","13","14","15",
                       "16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32"]
Example #39
    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self)
        if 'mining_job_id' in kwargs:
            self.mining_job_id = kwargs['mining_job_id']
        if 'site_id' in kwargs:
            self.site_id = kwargs['site_id']
        if 'preview' in kwargs:
            self.preview = 1
        if 'iteration' in kwargs:
            self.iteration = kwargs['iteration']
        if 'management_node' in kwargs:
            self.management_node = kwargs['management_node']
        if 'username' in kwargs:
            self.username = kwargs['username']
        if 'password' in kwargs:
            self.password = kwargs['password']
        if 'proxy' in kwargs:
            self.proxy = kwargs['proxy']
        if 'robots_obey' in kwargs:
            settings.set('ROBOTSTXT_OBEY', int(kwargs['robots_obey']), priority='cmdline')
        if 'url' in kwargs:
            self.start_urls.append(kwargs['url'] + self.url_fragmentanchor)
        if 'extract' in kwargs:
            self.extract = kwargs['extract']
        if 'maxjobs' in kwargs:
            self.maxjobs = int(kwargs['maxjobs'])
        if 'protocol' in kwargs:
            self.protocol = kwargs['protocol']
        if 'maximum_try' in kwargs:
            self.maximum_try = kwargs['maximum_try']
        if 'on_demand' in kwargs:
            self.on_demand = kwargs['on_demand']
        if 'debug_id' in kwargs:
            self.debug_id = kwargs['debug_id']
        if 'stale_limit_seconds' in kwargs:
            self.stale_limit = int(kwargs['stale_limit_seconds'])
        if 'subspider_detector' in kwargs:
            self.subspider_detector = True
            self.required_fields = self.subspider_detect_fields
            # Sending max items to be scraped.
            if 'max_items_count' in kwargs:
                self.max_items_count = int(kwargs['max_items_count'])
                # set spider_valid_cutoff, default 80 percent of max_items_count
                spider_valid_cutoff = kwargs.get("valid_cutoff")
                if spider_valid_cutoff:
                    self.spider_valid_cutoff = int(spider_valid_cutoff)
                else:
                    self.spider_valid_cutoff = int(self.max_items_count * 0.8)

                # this will reduce extra requstes after a close_spider call
                settings.overrides['CONCURRENT_REQUESTS'] = 1

        self.debug = int(kwargs.get('debug', '0'))
        if 'download_delay' in kwargs or hasattr(self, 'download_delay'):
            download_delay = float(kwargs.get('download_delay', getattr(self, 'download_delay', 0)))
            settings.set('DOWNLOAD_DELAY', download_delay, priority='cmdline')
            if download_delay > 0:
                settings.set('AUTOTHROTTLE_ENABLED', True, priority='cmdline')
        if self.allowed_domain_bynetloc:
            self.allowed_domains.append(urlparse.urlparse(kwargs['url']).netloc) # set list of domain allowed to crawl

        self.default_job_field_getters.update({
                                               'url': lambda self, response, item: response.url,
                                               'date': lambda self, response, item: datetime.now().strftime('%Y/%m/%d'),
                                               'language': lambda self, response, item: self.language if hasattr(self, 'language') else None
                                               })
        if self.extract_logo:
            self.default_job_field_getters.update({'autoextracted_logo_urls': self.get_logos})
        if self.extract_email:
            self.default_job_field_getters.update({'autoextracted_emails': self.get_emails})
        if self.extract_salary:
            self.default_job_field_getters.update({'autoextracted_salaries': self.get_salaries})
        if self.extract_website:
            self.default_job_field_getters.update({'autoextracted_company_websites': self.get_websites})
        self.default_fields = self.default_job_field_getters.keys()
        self.validate_parse_job_wrapper = validate(fields_to_check=self.required_fields)(type(self).parse_job_wrapper)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #40
    def __init__(self):
        self.redis = RedisUtil()
        self.article_reqs = []
        self.appnameid = get_appnameid('sina')
        settings.set('LOG_FILE', self.name+'.log', priority='cmdline')
Example #41
from scrapy_redis import connection
from scrapy.conf import settings


from IsvServiceInfo.spiders import IsvServiceInfoSpider

# scrapy api
from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.conf import settings

def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Closing reactor", level=log.INFO)
    reactor.stop()


# crawl responsibly
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
crawler = Crawler(settings)

# stop reactor when spider closes
crawler.signals.connect(spider_closing, signal=signals.spider_closed)

crawler.configure()
crawler.crawl(IsvServiceInfoSpider())
crawler.start()
reactor.run()
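
The driver script above relies on the legacy scrapy.conf settings singleton, Crawler.configure(), and scrapy.log, all of which were deprecated and later removed from Scrapy. Roughly the same behaviour with the current API would look like this sketch (the spider class and user agent are carried over from the example):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) "
                 "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")

    process = CrawlerProcess(settings)
    process.crawl(IsvServiceInfoSpider)  # pass the spider class, not an instance
    process.start()                      # starts the reactor and blocks until the crawl finishes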
    def __init__(self):
        self.redis = RedisUtil()
        self.appnameid = get_appnameid('toutiao')
        settings.set('LOG_FILE', self.name+'.log', priority='cmdline')
Example #43
    def __init__(self):
        self.appnameid = get_appnameid('ifeng')
        settings.set('LOG_FILE', self.name+'.log', priority='cmdline')
# set LOG_ENABLED=FALSE on scrapy
from scrapy.conf import settings
settings.set('LOG_ENABLED', False, priority='cmdline')
 def __init__(self,proxy, *a, **kwargs):
     super(ZhihuLoginSpider, self).__init__(*a, **kwargs)
     self.user_names = []
     if proxy:
         settings.set('PROXY', proxy)
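
Most snippets on this page mutate the module-level scrapy.conf.settings object from inside __init__, which current Scrapy no longer supports (scrapy.conf was deprecated and later removed, and the crawler is already configured by the time the spider is instantiated). The supported per-spider mechanism is the custom_settings class attribute; a minimal sketch with illustrative values:

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'
        # Applied with 'spider' priority before the crawl starts.
        custom_settings = {
            'DOWNLOAD_DELAY': 1.0,
            'RETRY_TIMES': 5,
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64)',
        }

        def start_requests(self):
            yield scrapy.Request('https://example.com', callback=self.parse)

        def parse(self, response):
            yield {'url': response.url}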
Example #46
from __future__ import print_function
from collections import namedtuple

import datetime
import dateutil.parser
import re

from jira.client import JIRA

import scrapy
from scrapy.http import Request
from scrapy.conf import settings


# Log only at INFO level -- change to 'DEBUG' for extremely verbose output.
settings.set('LOG_LEVEL', 'INFO')

SERVER = 'https://openedx.atlassian.net'

# Regex to match the duration field ("14d 22h 5m", "2h 33m", or "1m 10s")
DURATION_REGEX = re.compile(r'((?P<days>\d+?)d)?((?P<hours>\d+?)h)?((?P<minutes>\d+?)m)?((?P<seconds>\d+?)s)?')

# All states in the ospr jira ticket workflow
OSPR_STATES = [
    'Needs Triage',
    'Waiting on Author',
    'Blocked by Other Work',
    'Product Review',
    'Community Manager Review',
    'Awaiting Prioritization',
    'Engineering Review',
Example #47
 def process_request(self, request, spider):
     ua = random.choice(settings.get("USER_AGENT_LIST"))
     settings.set("USER_AGENT", ua)
     if ua:
         request.headers.setdefault("User-Agent", ua)
Example #48
import collections
import urlparse

import scrapy
from scrapy.conf import settings
from fake_useragent import UserAgent


DOWNLOADER_MIDDLEWARES = {
  'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
  'scrapper.RandomUserAgentMiddleware': 400
}

settings.set('DOWNLOADER_MIDDLEWARES', DOWNLOADER_MIDDLEWARES)
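
# Note: 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware' is the
# pre-1.0 location; in later Scrapy releases the built-in middleware lives at
# 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware'.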


URLS = []

for line in open('dataset/top-1m.csv'):
  parts = line.split(',')
  url = ''.join(parts[1:]).strip()
  if not url.startswith('http'):
    url = 'http://' + url + '/'
  URLS.append(url)


class RandomUserAgentMiddleware(object):
  def __init__(self):
    super(RandomUserAgentMiddleware, self).__init__()

    self.ua = UserAgent()