def __init__(self):
    # settings.overrides['DEPTH_LIMIT'] = 0
    settings.set('ITEM_PIPELINES', {'gentest1.pipelines.Gentest1Pipeline': 1000})
    settings.set('LOG_FILE', 'log.txt')
    for n, d in self.documents.items():
        self.allowed_domains.append(d['allow_url'])
        self.start_urls.append(d['start_url'])
def process_item(self, item, spider):
    name = item['name']
    picture_url = item['picture_url']
    count = item['count']
    spider_name = item['spider_name']
    # Build the SQL statements
    create_table_sql = '''create table if not exists {}(
        id int unsigned primary key auto_increment not null,
        name varchar(40),
        picture_url varchar(300) default '',
        count int unsigned default 1)'''.format(spider_name)
    try:
        if settings.get('FLAG') == False:
            self.cs.execute(create_table_sql)
    except:
        settings.set('FLAG', 'True')
    sql = 'insert into {} values (null,"{}","{}",{});'.format(spider_name, name, picture_url, count)
    self.cs.execute(sql)
    # Fetch data
    # data = self.cs.fetchall()
    self.conn.commit()
    return item
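The INSERT above interpolates scraped values straight into the SQL string, which breaks on quotes in the data and is open to injection. A minimal parameterized sketch, assuming the same MySQLdb-style cursor (`self.cs`) and item fields; the table name still has to be formatted in, since placeholders only cover values:

def process_item(self, item, spider):
    # %s placeholders let the driver escape the values safely.
    sql = 'insert into {} values (null, %s, %s, %s)'.format(item['spider_name'])
    self.cs.execute(sql, (item['name'], item['picture_url'], item['count']))
    self.conn.commit()
    return item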
def __init__(self, *args, **kwargs):
    super(SunshineRewardsSpider, self).__init__(*args, **kwargs)
    settings.set('RETRY_HTTP_CODES', [500, 503, 504, 400, 408, 404])
    settings.set('RETRY_TIMES', 5)
    settings.set('REDIRECT_ENABLED', True)
    settings.set('METAREFRESH_ENABLED', True)
    settings.set('USER_AGENT', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36')
def __init__(self, *args, **kwargs):
    super(SplenderSpider, self).__init__(*args, **kwargs)
    settings.set('RETRY_HTTP_CODES', [500, 503, 504, 400, 408, 404])
    settings.set('RETRY_TIMES', 5)
    settings.set('REDIRECT_ENABLED', True)
    settings.set('METAREFRESH_ENABLED', True)
    settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36')
def set_cookie(response):
    """
    Save the cookie.
    :param response: a Scrapy response object
    """
    cookie = "".join([
        i.decode("UTF-8") for i in response.headers.getlist('Set-Cookie')
    ])
    settings.set("DAJIE_COOKIE", cookie)
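A cookie saved into settings this way is not applied automatically; a later request has to attach it itself. A minimal sketch of how a spider method might reuse it (the URL and callback here are placeholders):

import scrapy

def follow_up_request(self):
    # Attach the saved cookie string to a later request by hand.
    return scrapy.Request(
        'https://so.dajie.com/',  # placeholder URL
        headers={'Cookie': settings.get('DAJIE_COOKIE')},
        callback=self.parse,
    )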
def __setting_fake_info_IP(self, signal):
    '''set fake ip'''
    package = signal.data
    self.fake_pool['ip'] = package.forge_info
    ip = []
    for fake_info in self.fake_pool['ip']:
        ip.append(fake_info.forgr_info)
    if len(ip):
        settings.set('HTTP_PROXY', ip)
def __setting_fake_info_UA(self, signal):
    '''set fake ua'''
    package = signal.data
    self.fake_pool['ua'] = package.forge_info
    ua = []
    for fake_info in self.fake_pool['ua']:
        ua.append(fake_info.forgr_info)
    if len(ua):
        settings.set('USER_AGENT', ua)
def __init__(self, *a, **kw):
    super(SuperSpider, self).__init__(*a, **kw)
    self.rd = redis.Redis(settings.get("REDIS_HOST"),
                          settings.get("REDIS_PORT"),
                          db=settings.get("MAIN_REDIS_DB"))
    domain = settings.get("DOMAIN")
    self.domain = Domain(domain)
    self.rule = Rule(domain)
    settings.set('DOWNLOAD_DELAY', float(self.domain["download_delay"]))
    settings.set('CONCURRENT_REQUESTS', int(self.domain["concurrent_requests"]))
def __init__(self, *args, **kwargs):
    super(RegularSpider, self).__init__(*args, **kwargs)
    self.start_urls.append(kwargs["base_url"])
    # self.allowed_domains = kwargs["allowed_domains"]
    self.allowed_domains = [kwargs["allowed_domains"]]
    if 'depth' in kwargs:
        settings.set(self.name, int(kwargs["depth"]), priority='cmdline')
    if 'username' in kwargs:
        self.username = kwargs['username']
    if 'password' in kwargs:
        self.password = kwargs['password']
    if 'login' in kwargs:
        self.login = True
def __init__(self, crawl_date=None, region=None, *args, **kwargs):
    super(ParkingSpider, self).__init__(*args, **kwargs)
    # Work out the crawl date
    now = datetime.now()
    yesterday = now - timedelta(days=1)
    yesterday_str = yesterday.strftime('%Y-%m-%d')
    today_str = now.strftime('%Y-%m-%d')
    if crawl_date == 'today':
        self.crawl_date = today_str
    elif crawl_date == 'yesterday':
        self.crawl_date = yesterday_str
    else:
        self.crawl_date = crawl_date if crawl_date else yesterday_str
    settings.set('crawl_date', self.crawl_date)
    # settings['crawl_date'] = self.crawl_date
    self.now = now.strftime('%Y%m%d_%H%M')
    # Build the initial crawl URL
    url = settings['INIT_RENT_PARKING_URL']
    # Optionally restrict the crawl to one region
    self.region = region if region is not None else 'all'
    self.region_num = self.map_region(self.region)
    if self.region_num is not None:
        url += '&region1=%s' % self.region_num
    self.start_urls.append(url)
    # Save path for parking listing pages
    self.list_output_path = './output/list_rent_parking/%s/%s/%s/' % (self.crawl_date, self.region, self.now)
    self.list_output_path = self.list_output_path.strip('\r')
    if not os.path.exists(self.list_output_path):
        os.makedirs(self.list_output_path)
    # Save path for detail pages
    self.detail_output_path = './output/detail_rent_parking/%s/%s/%s/' % (self.crawl_date, self.region, self.now)
    self.detail_output_path = self.detail_output_path.strip('\r')
    if not os.path.exists(self.detail_output_path):
        os.makedirs(self.detail_output_path)
    # Save path for images
    self.image_output_path = './output/image_rent_parking/'
    self.image_output_path = self.image_output_path.strip('\r')
    if not os.path.exists(self.image_output_path):
        os.makedirs(self.image_output_path)
    self.go_next = settings['GO_NEXT']
def __init__(self, **kwargs):
    if 'config' not in kwargs:
        err = 'failed to find seed file (config=*.conf)'
        print(err)
    # if 'keywords' not in kwargs:
    #     err = 'failed to find seed file (keywords=*.dat)'
    #     print(err)
    config = kwargs['config']
    # self.keywords = kwargs['keywords']
    self.load_conf(config)
    if self.Sleep_Flag == 'SEARCH_ENGINE_SLEEP' or self.Sleep_Flag == 'true' or not self.Sleep_Flag:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
        settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
    else:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
    log_filename = self.conf_name.replace('.conf', '') + '.log'
    settings.set('LOG_FILE', log_filename, priority='cmdline')
    # Redis key
    self.meta_next_url = meta_redis_key()
    # Initialize Redis
    self.init_redis()
    self.redis_keyword = get_redis_key(self.conf_name)
    # Register signals
    sig = SignalManager(dispatcher.Any)
    sig.connect(self.idle, signal=signals.spider_idle)
    self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # URLs fetched this run, as {keyword: md5(url)}, used to check
    # whether this crawl duplicates the previous one
    self.urlmd5 = dict()
    self.log_writer = open('log.dat', 'a+')
    self.date_from_url_re = re.compile("[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")
def __init__(self, **kwargs):
    # problem report
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 1010000
    # Mongo
    settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    self.nationp = dict()
    self.npcounts = 0
    # nation select
    self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
    # desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # proxy = webdriver.Proxy()
    # proxy.proxy_type = ProxyType.MANUAL
    # proxy.http_proxy = self.getProxy()
    # proxy.add_to_capabilities(desired_capabilities)
    # self.browser.start_session(desired_capabilities)
    # self.browser.set_page_load_timeout(12)
    # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
    # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
    super(CarSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    # Store the search keyword in REFERER_NAME
    if len(kwargs) != 0:
        self._search_name = kwargs['search_name']
        settings.set("REFERER_NAME", self._search_name)
    else:
        self._search_name = settings.get("REFERER_NAME")
    # URLs for the JSON requests
    self._cookie_url = "https://so.dajie.com/job/search?keyword={}&from=job&clicktype=blank"
    self._url = 'https://so.dajie.com/job/ajax/search/filter?keyword={' \
                '}&order=0&city&recruitType&salary&experience&page={}&positionFunction&_CSRFToken&ajax=1'
    super().__init__(**kwargs)
class BookSpider(scrapy.Spider):
    name = 'BookSpider'
    settings.set('CONCURRENT_REQUESTS', 100)

    def start_requests(self):
        return [
            scrapy.Request('https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python&pvid=186e2514605040b4987bfc7a62e3d5e0',
                           callback=self.parseBookList)
        ]

    def parseBookList(self, response):
        hrefs = response.xpath('//*[@id="J_goodsList"]/ul/li/div/div[1]/a/@href[starts-with(.,"//item.jd.com/")]').extract()
        for href in hrefs:
            result = re.match('.*/(\d+).*', href)
            productId = result.group(1)
            yield scrapy.Request('https:' + href,
                                 meta={'productId': productId, 'url': 'https:' + href},
                                 callback=self.parseBook)

    def parseBook(self, response):
        values = response.xpath('//*[@id="name"]/div[1]/text()').extract()
        title = ''
        for value in values:
            title += value
        title = title.strip()
        press = response.xpath('//*[@id="parameter2"]/li[1]/a/text()').extract()[0]
        ISBN = response.xpath('//*[@id="parameter2"]/li[2]/@title').extract()[0]
        productId = response.meta['productId']
        url = response.meta['url']
        bookItem = NewBookItem()
        bookItem['url'] = url
        bookItem['title'] = title
        bookItem['press'] = press
        bookItem['ISBN'] = ISBN
        bookItem['productId'] = productId
        return bookItem
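Calling settings.set at class-definition time, as BookSpider does, mutates the global settings object for every spider in the process. Scrapy's documented way to scope a setting to one spider is the custom_settings class attribute; a minimal sketch:

import scrapy

class BookSpider(scrapy.Spider):
    name = 'BookSpider'
    # Per-spider override, applied when the crawler builds its settings.
    custom_settings = {
        'CONCURRENT_REQUESTS': 100,
    }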
def __init__(self, crawl_date=None, region=None, *args, **kwargs):
    super(ResidenceImageSpider, self).__init__(*args, **kwargs)
    # Work out the crawl date
    now = datetime.now()
    yesterday = now - timedelta(days=1)
    yesterday_str = yesterday.strftime('%Y-%m-%d')
    today_str = now.strftime('%Y-%m-%d')
    if crawl_date == 'today':
        self.crawl_date = today_str
    elif crawl_date == 'yesterday':
        self.crawl_date = yesterday_str
    else:
        self.crawl_date = crawl_date if crawl_date else yesterday_str
    settings.set('crawl_date', self.crawl_date)
    # settings['crawl_date'] = self.crawl_date
    self.now = now.strftime('%Y%m%d_%H%M')
def __init__(self, search_name, *args, **kwargs):
    # Lagou login credentials
    self._username = settings['USERNAME']
    self._password = settings['PASSWORD']
    # login_cookies
    self.login_cookies = None
    # Store the search keyword in REFERER_NAME
    self._search_name = search_name
    settings.set("REFERER_NAME", self._search_name)
    # Starting page number
    self.page_no = settings['START_PAGE_NUM']
    # URL for the JSON request
    self._url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&kd={}&pn={}&'
    super(LagouSpider, self).__init__(*args)
def __init__(self):
    # Setting scrapy params
    settings.set('RETRY_HTTP_CODES', [503, 504, 400, 408, 404])
    settings.set('RETRY_TIMES', 5)
    settings.set('REDIRECT_ENABLED', True)
    settings.set('METAREFRESH_ENABLED', True)
    # Excluded categories
    self.EXCLUDE_CATEGORY_LEVEL_1 = [
        # 'designers',
        'look book'
    ]
def __init__(self, **kwargs):
    # problem report
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 1010000
    # Mongo
    settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    self.nationp = dict()
    self.npcounts = 0
class PycoderSpider(CrawlSpider):
    name = "tut"
    settings.set('DEPTH_LIMIT', 1)
    start_urls = ['https://en.belstu.by']
    rules = (Rule(LinkExtractor(), callback='parse_url', follow=True),)

    def parse_url(self, response):
        item = MyItem()
        item['url'] = response.url
        links = response.xpath("//a[@href]/@href").extract()
        item['links'] = links
        item['countOfLinks'] = len(links)
        texts = response.xpath('//text()').extract()
        words = []
        for t in texts:
            t = ' '.join(t.split()).strip()
            if len(t) > 0:
                curWords = re.sub('[^0-9a-zA-Z]+', '*', t).split('*')
                curWords = list(filter(None, curWords))
                words.extend(curWords)
        item['countOfWords'] = len(words)
        frequency = []
        frequencyWords = []
        for w in words:
            l = len(w)
            finded = next((x for x in frequency if x['countLetters'] == l), None)
            if finded:
                finded['countWords'] = finded['countWords'] + 1
            else:
                newLength = LengthDictionary()
                newLength['countLetters'] = l
                newLength['countWords'] = 1
                frequency.append(newLength)
            findedWord = next((t for t in frequencyWords if t['word'] == w), None)
            if findedWord:
                findedWord['count'] = findedWord['count'] + 1
            else:
                newWord = WordsDictionary()
                newWord['word'] = w
                newWord['count'] = 1
                frequencyWords.append(newWord)
        item['frequencyOfLength'] = frequency
        item['frequencyOfWords'] = frequencyWords
        return item
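The two next()-based scans above make the tallying quadratic in the number of words. A sketch of the same counts using collections.Counter, with plain dicts standing in for the LengthDictionary and WordsDictionary items:

from collections import Counter

def count_frequencies(words):
    # Counter does the per-key tallying in one linear pass each.
    by_length = Counter(len(w) for w in words)
    by_word = Counter(words)
    frequency = [{'countLetters': l, 'countWords': c} for l, c in by_length.items()]
    frequencyWords = [{'word': w, 'count': c} for w, c in by_word.items()]
    return frequency, frequencyWords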
def __init__(self, **kwargs):
    # args
    super(CarSpider, self).__init__(**kwargs)
    # problem report
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    # Mongo
    settings.set('CrawlCar_Num', carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'usedcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    # mysql
    mysqldb = MySQLdb.connect("192.168.1.94", "root", "Datauser@2017", "usedcar", port=3306)
    mysqldbc = mysqldb.cursor()
    # read
    mysqldbc.execute("select newcarurl from che58")
    items = mysqldbc.fetchall()
    self.urllist = []
    df = pybloom.BloomFilter(carnum, 0.01)
    for i in items:
        j = i[0]
        md5i = hashlib.md5(j).hexdigest()
        # add() returns True when the element was already present,
        # so only previously unseen URLs are kept.
        rf = df.add(md5i)
        if not rf:
            self.urllist.append(j)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    with codecs.open("D:/county.txt", "r", "utf-8") as f:
        filecontent = f.read()
    # print(filecontent)
    indexlist = re.findall("\d+\_\d+\_\d+|\d+\_\d+", filecontent)
    indexlist.append("0")
    # print(indexlist)
    datalist = re.findall("\[(.*?)\]", filecontent, re.S)
    # print(datalist)
    self.datadict = {}
    for index in indexlist:
        self.datadict[index] = datalist[indexlist.index(index)]
    # print(self.datadict)
    self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
    # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
    # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
    super(CarSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 2000000
    # Mongo
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    with open("blm/" + settings['MONGODB_DB'] + "/yiche_city.txt") as f:
        content = f.read()
    obj = json.loads(content)
    self.city_id_list = []
    for city in obj:
        self.city_id_list.append(city['cityId'])
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    desired_capabilities["phantomjs.page.settings.userAgent"] = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36')
    self.browser = webdriver.PhantomJS(
        executable_path="/home/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
        desired_capabilities=desired_capabilities)
    # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
    # self.browser = webdriver.PhantomJS(executable_path="D:/phantomjs", desired_capabilities=desired_capabilities)
    self.browser.set_page_load_timeout(10)
    super(CarSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    if 'config' not in kwargs:
        err = 'failed to find seed file (config=*.conf)'
        print(err)
    if 'startdate' in kwargs:
        self.startdate = kwargs['startdate']
    else:
        self.startdate = (datetime.datetime.now() - datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
    if 'enddate' in kwargs:
        self.enddate = kwargs['enddate']
    else:
        self.enddate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # if 'keywords' not in kwargs:
    #     err = 'failed to find seed file (keywords=*.dat)'
    #     print(err)
    config = kwargs['config']
    self.load_conf(config)
    if self.Sleep_Flag == 'SEARCH_ENGINE_SLEEP' or self.Sleep_Flag == 'true' or not self.Sleep_Flag:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
        settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
    else:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
    log_filename = self.conf_name.replace('.conf', '') + '.log'
    settings.set('LOG_FILE', log_filename, priority='cmdline')
    # Initialize Redis
    self.init_redis()
    self.redis_keyword = get_redis_key()
    # Register signals
    sig = SignalManager(dispatcher.Any)
    sig.connect(self.idle, signal=signals.spider_idle)
    sig.connect(self.close, signal=signals.spider_closed)
    self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    self.conn_local = mysql.connect('meta', host='localhost')
    self.conn_local_cursor = self.conn_local.cursor()
    # self.conn_local_cursor.execute('set global autocommit=1')
    try:
        self.meta_ip = get_meta_ip(network_card='enp7s0')
    except:
        self.meta_ip = get_meta_ip(network_card='eth0')
    # Initialize the state rows in the meta database
    self.init_state()
def __init__(self, _code_list_path='', _proxy_list_path='', _concurrent_requests=32):
    super(mango, self).__init__()
    print('========== code list path =======', _code_list_path)
    self.code_list_path = _code_list_path
    self.proxy_list_path = _proxy_list_path
    # initialize redis
    self.r = redis.StrictRedis(host='localhost', port=6379, db=0)
    self.p = self.r.pubsub()
    # self.p.subscribe(**{'scrapy-channel': self.my_handler})
    # self.p.run_in_thread(sleep_time=0.001)
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    option.add_argument('blink-settings=imagesEnabled=false')
    option.add_argument('--ignore-certificate-errors')
    option.add_argument('--ignore-ssl-errors')
    option.add_argument("--no-sandbox")
    option.add_argument("--disable-impl-side-painting")
    option.add_argument("--disable-setuid-sandbox")
    option.add_argument("--disable-seccomp-filter-sandbox")
    option.add_argument("--disable-breakpad")
    option.add_argument("--disable-client-side-phishing-detection")
    option.add_argument("--disable-cast")
    option.add_argument("--disable-cast-streaming-hw-encoding")
    option.add_argument("--disable-cloud-import")
    option.add_argument("--disable-popup-blocking")
    option.add_argument("--disable-session-crashed-bubble")
    option.add_argument("--disable-ipv6")
    self.driver = webdriver.Chrome(executable_path='./data/chromedriver.exe', chrome_options=option)
    # change concurrent threads in setting.py
    settings.set('CONCURRENT_REQUESTS', _concurrent_requests)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 2000000
    self.dbname = 'usedcar_evaluation'
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'usedcar_evaluation', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self,**kwargs): print "test" # problem report self.counts = 0 self.carnum = 500000 # Mongo settings.set('CrawlCar_Num', self.carnum, priority='cmdline') settings.set('MONGODB_DB', 'usedcar', priority='cmdline') settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs):
    # report bug session
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 50000
    # Mongo setting
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs):
    # problem report
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 1010000
    # Mongo
    settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    self.nationp = dict()
    self.npcounts = 0
    self.browser = webdriver.PhantomJS(executable_path="/home/phantomjs-2.1.1-linux-x86_64/bin/phantomjs")
    super(CarSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    # problem report
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 50000
    # mongo
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs): print "do initial" super(CarSpider, self).__init__(**kwargs) self.mailer = MailSender.from_settings(settings) self.counts = 0 self.carnum = 50000 #MonGo settings.set('CrawlCar_Num', self.carnum, priority='cmdline') settings.set('MONGODB_DB', 'network', priority='cmdline') settings.set('MONGODB_COLLECTION', website, priority='cmdline') print "finish initial"
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 80000
    self.city_list = [
        "hk", "you", "ls", "qh", "sjz", "san", "ty", "wx", "wc", "wn", "xa"
    ]
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs):
    # args
    super(CarSpider, self).__init__(**kwargs)
    # carnum
    self.carnum = 200000
    # problem report
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    # Mongo
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    self.df = BloomFilter(capacity=self.carnum * 1.1, error_rate=0.001)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
    super(CarSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    # args
    super(CarSpider, self).__init__(**kwargs)
    # problem report
    self.mailer = MailSender.from_settings(settings)
    self.carnum = 2000000000
    self.counts = 0
    self.today = date.today()
    # Mongo
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'usedcar_evaluation', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    self.city_count = 0
    self.headers = {
        "app-version": "C 3.9.17",
        "User-Agent": "EquityPrice/3.9.17",
    }
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'residual_value', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs):
    # args
    super(CarSpider, self).__init__(**kwargs)
    # problem report
    self.mailer = MailSender.from_settings(settings)
    self.carnum = 2000000000
    self.counts = 0
    # Mongo
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'usedcar_evaluation', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    self.citylist = ["1", "2", "3", "4", "5", "6", "8", "9", "10", "11", "12", "13", "14", "15",
                     "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32"]
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self)
    if 'mining_job_id' in kwargs:
        self.mining_job_id = kwargs['mining_job_id']
    if 'site_id' in kwargs:
        self.site_id = kwargs['site_id']
    if 'preview' in kwargs:
        self.preview = 1
    if 'iteration' in kwargs:
        self.iteration = kwargs['iteration']
    if 'management_node' in kwargs:
        self.management_node = kwargs['management_node']
    if 'username' in kwargs:
        self.username = kwargs['username']
    if 'password' in kwargs:
        self.password = kwargs['password']
    if 'proxy' in kwargs:
        self.proxy = kwargs['proxy']
    if 'robots_obey' in kwargs:
        settings.set('ROBOTSTXT_OBEY', int(kwargs['robots_obey']), priority='cmdline')
    if 'url' in kwargs:
        self.start_urls.append(kwargs['url'] + self.url_fragmentanchor)
    if 'extract' in kwargs:
        self.extract = kwargs['extract']
    if 'maxjobs' in kwargs:
        self.maxjobs = int(kwargs['maxjobs'])
    if 'protocol' in kwargs:
        self.protocol = kwargs['protocol']
    if 'maximum_try' in kwargs:
        self.maximum_try = kwargs['maximum_try']
    if 'on_demand' in kwargs:
        self.on_demand = kwargs['on_demand']
    if 'debug_id' in kwargs:
        self.debug_id = kwargs['debug_id']
    if 'stale_limit_seconds' in kwargs:
        self.stale_limit = int(kwargs['stale_limit_seconds'])
    if 'subspider_detector' in kwargs:
        self.subspider_detector = True
        self.required_fields = self.subspider_detect_fields
        # Sending max items to be scraped.
        if 'max_items_count' in kwargs:
            self.max_items_count = int(kwargs['max_items_count'])
        # Set spider_valid_cutoff; defaults to 80 percent of max_items_count.
        spider_valid_cutoff = kwargs.get("valid_cutoff")
        if spider_valid_cutoff:
            self.spider_valid_cutoff = int(spider_valid_cutoff)
        else:
            self.spider_valid_cutoff = int(self.max_items_count * 0.8)
        # This will reduce extra requests after a close_spider call.
        settings.overrides['CONCURRENT_REQUESTS'] = 1
    self.debug = int(kwargs.get('debug', '0'))
    if 'download_delay' in kwargs or hasattr(self, 'download_delay'):
        download_delay = float(kwargs.get('download_delay', getattr(self, 'download_delay', 0)))
        settings.set('DOWNLOAD_DELAY', download_delay, priority='cmdline')
        if download_delay > 0:
            settings.set('AUTOTHROTTLE_ENABLED', True, priority='cmdline')
    if self.allowed_domain_bynetloc:
        # Set list of domains allowed to crawl.
        self.allowed_domains.append(urlparse.urlparse(kwargs['url']).netloc)
    self.default_job_field_getters.update({
        'url': lambda self, response, item: response.url,
        'date': lambda self, response, item: datetime.now().strftime('%Y/%m/%d'),
        'language': lambda self, response, item: self.language if hasattr(self, 'language') else None
    })
    if self.extract_logo:
        self.default_job_field_getters.update({'autoextracted_logo_urls': self.get_logos})
    if self.extract_email:
        self.default_job_field_getters.update({'autoextracted_emails': self.get_emails})
    if self.extract_salary:
        self.default_job_field_getters.update({'autoextracted_salaries': self.get_salaries})
    if self.extract_website:
        self.default_job_field_getters.update({'autoextracted_company_websites': self.get_websites})
    self.default_fields = self.default_job_field_getters.keys()
    self.validate_parse_job_wrapper = validate(fields_to_check=self.required_fields)(type(self).parse_job_wrapper)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __init__(self):
    self.redis = RedisUtil()
    self.article_reqs = []
    self.appnameid = get_appnameid('sina')
    settings.set('LOG_FILE', self.name + '.log', priority='cmdline')
from scrapy_redis import connection
from scrapy.conf import settings
from IsvServiceInfo.spiders import IsvServiceInfoSpider
# scrapy api
from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import Crawler


def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Closing reactor", level=log.INFO)
    reactor.stop()

# crawl responsibly
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
crawler = Crawler(settings)

# stop reactor when spider closes
crawler.signals.connect(spider_closing, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(IsvServiceInfoSpider())
crawler.start()
reactor.run()
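The Crawler/configure API above is from pre-1.0 Scrapy. In current Scrapy the same run-a-spider-from-a-script flow is wrapped by CrawlerProcess, which manages the reactor itself; a minimal sketch keeping the same spider class:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
})
process.crawl(IsvServiceInfoSpider)
process.start()  # blocks until the crawl finishes and stops the reactor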
def __init__(self):
    self.redis = RedisUtil()
    self.appnameid = get_appnameid('toutiao')
    settings.set('LOG_FILE', self.name + '.log', priority='cmdline')
def __init__(self):
    self.appnameid = get_appnameid('ifeng')
    settings.set('LOG_FILE', self.name + '.log', priority='cmdline')
# set LOG_ENABLED=False on scrapy
from scrapy.conf import settings
settings.set('LOG_ENABLED', False, priority='cmdline')
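scrapy.conf was removed in Scrapy 1.x; the supported way to get a mutable settings object outside a running crawl is get_project_settings. A minimal sketch of the same override in the current API:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('LOG_ENABLED', False, priority='cmdline')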
def __init__(self, proxy, *a, **kwargs):
    super(ZhihuLoginSpider, self).__init__(*a, **kwargs)
    self.user_names = []
    if proxy:
        settings.set('PROXY', proxy)
from __future__ import print_function

from collections import namedtuple
import datetime
import dateutil.parser
import re

from jira.client import JIRA
import scrapy
from scrapy.http import Request
from scrapy.conf import settings

# Log only at INFO level -- change to 'DEBUG' for extremely verbose output.
settings.set('LOG_LEVEL', 'INFO')

SERVER = 'https://openedx.atlassian.net'

# Regex to match the duration field ("14d 22h 5m", "2h 33m", or "1m 10s")
DURATION_REGEX = re.compile(r'((?P<days>\d+?)d)?((?P<hours>\d+?)h)?((?P<minutes>\d+?)m)?((?P<seconds>\d+?)s)?')

# All states in the ospr jira ticket workflow
OSPR_STATES = [
    'Needs Triage',
    'Waiting on Author',
    'Blocked by Other Work',
    'Product Review',
    'Community Manager Review',
    'Awaiting Prioritization',
    'Engineering Review',
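Every group in DURATION_REGEX is optional, so a match always succeeds and missing parts come back as None; note the pattern has no whitespace handling, so spaces must be stripped first. A hedged sketch of turning a duration string into a timedelta (parse_duration is an illustrative helper, not from the source):

def parse_duration(text):
    # Strip spaces so "2h 33m" matches as "2h33m"; group names line up
    # with timedelta's keyword arguments (days, hours, minutes, seconds).
    parts = DURATION_REGEX.match(text.replace(' ', '')).groupdict()
    return datetime.timedelta(**{k: int(v) for k, v in parts.items() if v})

# parse_duration('2h 33m') == datetime.timedelta(hours=2, minutes=33)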
def process_request(self, request, spider):
    ua = random.choice(settings.get("USER_AGENT_LIST"))
    settings.set("USER_AGENT", ua)
    if ua:
        request.headers.setdefault("User-Agent", ua)
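Writing the chosen UA back into the global USER_AGENT on every request is unnecessary and racy under concurrent downloads; setting the header on the request alone is enough. A minimal per-request variant, assuming the same USER_AGENT_LIST setting (the class name here is ours):

import random

class PerRequestUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Pick a UA per request; no need to mutate global settings.
        ua = random.choice(spider.settings.getlist("USER_AGENT_LIST"))
        request.headers["User-Agent"] = ua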
import collections
import urlparse

import scrapy
from scrapy.conf import settings
from fake_useragent import UserAgent

DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'scrapper.RandomUserAgentMiddleware': 400
}
settings.set('DOWNLOADER_MIDDLEWARES', DOWNLOADER_MIDDLEWARES)

URLS = []
for line in open('dataset/top-1m.csv'):
    parts = line.split(',')
    url = ''.join(parts[1:]).strip()
    if not url.startswith('http'):
        url = 'http://' + url + '/'
    URLS.append(url)


class RandomUserAgentMiddleware(object):
    def __init__(self):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()