def prepare(self, fromtext=False, start_idx=0, end_idx=100): if not fromtext: host = settings.get("REDIS_HOST", REDIS_HOST) port = settings.get("REDIS_PORT", REDIS_PORT) self.r = _default_redis(host, port) uids_set = UIDS_SET.format(spider=self.name) log.msg(format="Load uids from %(uids_set)s", level=log.WARNING, uids_set=uids_set) uids = self.r.smembers(uids_set) if uids == []: log.msg(format="Not load any uids from %(uids_set)s", level=log.WARNING, uids_set=uids_set) else: uids = [] fname = "uidlist_20140103.txt" log.msg(format="Load uids from %(uids_set)s", level=log.WARNING, uids_set=fname) if os.getcwd()[-8:] == "cron4win": f = open("../test/%s" % fname, "r") else: f = open("./test/%s" % fname, "r") count = 0 for line in f.readlines(): count += 1 if count >= start_idx and count <= end_idx: uids.append(int(line.strip().split(",")[0])) elif count < start_idx: pass else: break if uids == []: log.msg(format="Not load any uids from %(uids_set)s", level=log.WARNING, uids_set=fname) f.close() return uids
def login(self, response): """Generate a login request.""" # from scrapy.shell import inspect_response # inspect_response(response) hxs = HtmlXPathSelector(response) email = settings.get('FOLHA_USER') password = settings.get('FOLHA_PASS') challenge = hxs.select("//form[@name='login']/input[@name='challenge']/@value").extract()[0] password_challenge = hashlib.md5(challenge + hashlib.md5(password).hexdigest()).hexdigest() data = {'email': email, 'password_challenge': password_challenge, 'password': password, 'challenge': challenge, r'auth.x': '1', r'auth.y': '1', 'auth': 'Autenticar' } return [FormRequest.from_response(response, formname='login', formdata=data, callback=self.check_login_response)]
def __init__(self, *args, **kwargs): join_multivalued = settings.get('CSV_JOIN_MULTIVALUED', None) if join_multivalued: kwargs['join_multivalued'] = join_multivalued kwargs['delimiter'] = settings.get('CSV_DELIMITER', ',') kwargs['fields_to_export'] = settings.get('EXPORT_FIELDS', None) super(CSVItemExporter, self).__init__(*args, **kwargs)
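A hedged sketch of how such a CSV exporter is typically wired up; the module path, delimiter, and field names below are assumptions, not values from the original project:

# settings.py -- hypothetical values consumed by the exporter above
FEED_EXPORTERS = {
    'csv': 'myproject.exporters.CSVItemExporter',  # assumed module path
}
CSV_DELIMITER = ';'
CSV_JOIN_MULTIVALUED = '|'
EXPORT_FIELDS = ['title', 'price', 'url']  # example field names only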
def open_spider(self, spider): res = super(MongoDBPipeline, self).open_spider(spider) if not res: return spider.username = self.username spider.password = self.password self.ensure_index(RackUsage) self.ensure_index(RackServers) old_invoices = [i for i in self.mongodb[RackServers._collection_name].find( dict( cloud_account_id=self.user_id, invoice_id={"$exists": True, "$ne": ""}, enddate={"$exists": True} ))] spider.old_invoices = [i['invoice_id'] for i in old_invoices] urls = settings.get('URLS') base_url = settings.get('BASE_URL') specific_urls = settings.get("SPECIFIC_URLS") if not self.base_url: self.base_url = base_url specific_url = specific_urls.get(self.base_url, {}) for attr, url in urls.items(): if attr in specific_url: url = specific_url[attr] setattr(spider, attr, urljoin(self.base_url, url))
def prepare(self, fromtext=False, start_idx=0, end_idx=100): if not fromtext: host = settings.get('REDIS_HOST', REDIS_HOST) port = settings.get('REDIS_PORT', REDIS_PORT) self.r = _default_redis(host, port) uids_set = UIDS_SET.format(spider=self.name) log.msg(format='Load uids from %(uids_set)s', level=log.WARNING, uids_set=uids_set) uids = self.r.smembers(uids_set) if uids == []: log.msg(format='Not load any uids from %(uids_set)s', level=log.WARNING, uids_set=uids_set) else: uids = [] fname = 'uid_about_marine' log.msg(format='Load uids from %(uids_set)s', level=log.WARNING, uids_set=fname) f = open('./source/%s' % fname) count = 0 for line in f: count += 1 if count >= start_idx and count <= end_idx: uids.append(int(line.strip())) elif count > start_idx: break else: pass if uids == []: log.msg(format='Not load any uids from %(uids_set)s', level=log.WARNING, uids_set=fname) f.close() return uids
def connect(): return psycopg2.connect( user=settings.get('PG_USER'), dbname=settings.get('PG_DBNAME'), host=settings.get('PG_HOST'), password=settings.get('PG_PASSWORD') )
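A usage sketch for the helper above, assuming a reachable PostgreSQL instance; the query is only an example:

conn = connect()
try:
    # psycopg2 cursors support the context-manager protocol (>= 2.5)
    with conn.cursor() as cur:
        cur.execute("SELECT version()")
        print(cur.fetchone())
    conn.commit()
finally:
    conn.close()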
def __init__(self,name=None,*args,**kwargs): '''Initialize weibospider Parameters ---------- login : login status (True/False) start_urls : default urls to start crawling login_url : url to login redis_server : redis server Connected ids_toCrawl_name : name of toCrawl ids Queue ids_crawled_name : name of crawled ids Queue ids_processing_name: name of ids crawling now Queue ids_problem_name : name of ids sucked Queue ''' super(WeiboSpider,self).__init__(name,*args,**kwargs) self.login = False self.start_urls = [] self.login_url = self.weibo.login(self.username, self.password) self.redis_server = redis.Redis(self.REDIS_HOST,self.REDIS_PORT) self.ids_toCrawl_name = settings.get('REDIS_TOCRAWL_QUEUE' ,'user_ids_toCrawl' ) self.ids_crawled_name = settings.get('REDIS_CRAWLED_QUEUE' ,'user_ids_crawled' ) self.ids_processing_name = settings.get('REDIS_PROCESSING_QUEUE','user_ids_processing') self.ids_problem_name = settings.get('REDIS_PROBLEM_QUEUE' ,'user_ids_problem' ) if self.login_url: self.start_urls.append(self.login_url)
def __init__(self, emotion='smile face', start='20100101', end='20151221', interval=30, *args, **kwargs): super(FlickrSpider, self).__init__(*args, **kwargs) self.emotion = emotion self.start_date = start self.js_bin = settings.get('JS_BIN') self.js_wait = settings.get('JS_WAIT') start_date = int(time.mktime(time.strptime(start, '%Y%m%d'))) end_date = int(time.mktime(time.strptime(end, '%Y%m%d'))) ONEDAYSECONDS = 24 * 3600 num = (end_date - start_date) // (int(interval) * ONEDAYSECONDS) startlist = [(start_date + i * int(interval) * ONEDAYSECONDS, start_date + (i + 1) * int(interval) * ONEDAYSECONDS) for i in xrange(num)] self.start_urls = [ "http://www.flickr.com/search/?text={0}&view_all=1&media=photos&min_upload_date={1}&max_upload_date={2}" .format(emotion, *dateInterval) for dateInterval in startlist ] self.conn = sqlite3.connect('%s%s.db' % (emotion, start)) self._create_table() service_args = ['--proxy=%s' % settings.get('HTTP_PROXY'), '--proxy-type=http', '--load-images=false'] self.driver = webdriver.PhantomJS( executable_path=self.js_bin, service_args=service_args) self.driver.set_window_size(1920, 1080) self.rules = [ Rule(LinkExtractor(allow=['search/?text=%s&view_all=1' % emotion]), )]
def after_login(self, response): hxs = HtmlXPathSelector(response) alert = hxs.select('//ul[@class="message-alert"]').extract() if alert: print "Invalid login" raise CloseSpider(alert) return self.log.msg("Parsing current usage") meta = {} for region in settings.get("REGIONS"): item = HPCloudService(region=region, number=0) meta = {'item': item} headers = { 'X-Requested-With':'XMLHttpRequest', 'Accept':'application/json, text/javascript, */*; q=0.01', } yield Request(self._FILES_URL.format(region=region), headers=headers, callback=self.parse_files, meta=meta, errback=self.current_error) for zone in settings.get("ZONES"): item = HPCloudService(region=region, number=0) meta = {'item': item, 'zone': zone} yield Request(self._SERVERS_URL.format(region=region, zone=zone), callback=self.parse_servers, meta=meta, errback=self.current_error) yield Request(url=self._BILLS_URL, callback=self.parse_invoices)
def get_start_urls(self): """Extracts urls from a text file into the list of URLs to crawl""" if not settings.get('URLS'): raise ValueError('No text file. Use -s URLS=somefile.txt') with open(settings.get('URLS')) as data: return [line.rstrip('\r\n') for line in data]
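A hedged usage sketch, assuming this helper is a method of a spider module that already imports scrapy; the spider would then be run with something like `scrapy crawl myspider -s URLS=urls.txt`:

def start_requests(self):
    # Feed every line of the URLS file into the crawl queue.
    for url in self.get_start_urls():
        yield scrapy.Request(url, callback=self.parse)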
def _loop(self, args, opts): if settings.get('MEMDEBUG_WITH_GUPPY', False) and guppy: heapy = guppy.hpy() task = Task().next(locked=0, completed=0) if task: task.lock() cmd = ['python', os.path.join(os.getcwd(), 'scrapy-ctl.py'), 'run'] cmd.append('--task-id=%s'%task.id) if opts.child_logfile: cmd.append('--logfile=%s'%opts.child_logfile) cmd.append('--child') task.start = datetime.now() process = subprocess.Popen(cmd, shell=False, stderr=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True) task.result, task.errors = process.communicate() task.finish = datetime.now() task.completed = 1 task.save() timetext.LANG = 'en' total = task.finish - task.start log.msg('Finished: %s(%s) in %s'%(task.name, task.id, timetext.stringify(total)), level=log.INFO, domain=task.domain) if settings.get('MEMDEBUG_WITH_GUPPY', False) and guppy: log.msg(heapy.heap(), level=log.DEBUG) heapy.setref() else: time.sleep(30)
def from_settings(cls,settings): ret = { 'mongo_server':settings.get('MONGODB_SERVER',MONGODB_SERVER), 'mongo_port':settings.get('MONGODB_PORT',MONGODB_PORT), 'mongo_db_name':settings.get('MONGODB_DB',MONGODB_DB), } return cls(**ret)
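A sketch of the constructor this from_settings() would feed; the class name and from_crawler hook are assumptions, only the keyword names come from the code above:

class MongoDBPipeline(object):  # hypothetical class name
    def __init__(self, mongo_server, mongo_port, mongo_db_name):
        self.mongo_server = mongo_server
        self.mongo_port = mongo_port
        self.mongo_db_name = mongo_db_name

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hands us the crawler; delegate to the settings-based factory.
        return cls.from_settings(crawler.settings)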
def load(self, task): ''' Gets task for the spider, loads the tasks's module code and applies code from configuration to the spider. ''' self.task = task configuration = None if settings.get('TASKS'): available_tasks = settings.get('TASKS') if available_tasks.has_key(task.name): try: configuration = load_object(available_tasks[task.name]) except Exception, (ErrorMessage): log.msg('Could not load configuration for task %s' % task.name, level=log.ERROR) log.msg(ErrorMessage, level=log.DEBUG, domain='tripcentral.ca') configuration = configuration(task, self) if hasattr(configuration, 'start_urls'): setattr(self, 'start_urls', configuration.start_urls) if hasattr(configuration, 'rules'): setattr(self, 'rules', configuration.rules) if hasattr(configuration, 'parse_start_url'): setattr(self, 'parse_start_url', configuration.parse_start_url) self.start_urls = self.get_start_urls() self._compile_rules() else: log.msg('%s is not defined in settings.TASKS' % task.name, level=log.ERROR, domain=task.domain )
def __init__(self, *args, **kwargs): kwargs['fields_to_export'] = settings.getlist('EXPORT_FIELDS') or None kwargs['encoding'] = settings.get('EXPORT_ENCODING', 'utf-8') delimiter = settings.get('CSV_DELIMITER', '|') kwargs['delimiter'] = delimiter kwargs['include_headers_line'] = False super(ProductCSVExporter, self).__init__(*args, **kwargs)
def __init__(self): self.queries = settings.get('GOOGLER_QUERIES') self.pages_to_get = settings.get('GOOGLER_PAGES_TO_GET_FROM_ENGINE') self.engines = googler.utils.loading.load_modules("googler.engines", settings.get('GOOGLER_USE_ENGINES')) self.load_crawling_config() self.forbid_regexps = map(re.compile, settings.get('GOOGLER_FORBID_URLS')) super(GooglerSpider, self).__init__()
def parse_item_wrapper(self, response): """Wrapper for parse_item enabling exception notifications.""" try: item = self.parse_item(response) return item except Exception, ex: url = None if response.url: url = response.url quarantine_database = get_quarantine_database() if quarantine_database and settings.get('QUARANTINE_MODE'): e = { 'exception': str(type(ex)), 'stacktrace': traceback.format_exc(), 'link': url } quarantine_database.save_exception(e) if settings.get('DEBUG'): self.log('Spider Exception trying to parse: ' + url) self.log(str(type(ex)) + " - " + traceback.format_exc()) if not isinstance(ex, DropItem): self.log_exceptions += 1 raise
def __init__(self, *args, **kwargs): delimiter = settings.get('CSV_DELIMITER', ',') kwargs['delimiter'] = delimiter fields_to_export = settings.get('FIELDS_TO_EXPORT', []) if fields_to_export: kwargs['fields_to_export'] = fields_to_export super(YellowpagesItemExporter, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs): delimiter = settings.get("CSV_DELIMITER", ",") kwargs["delimiter"] = delimiter fields_to_export = settings.get("FIELDS_TO_EXPORT", []) if fields_to_export: kwargs["fields_to_export"] = fields_to_export super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs)
def __init__(self): # {{{ db_url = settings.get("DB_URL") table_name = settings.get("DB_TABLE") if not db_url or not table_name: raise NotConfigured self.engine = create_engine(db_url, echo=False) self.metadata = MetaData(bind=self.engine) self.table = Table(table_name, self.metadata, autoload=True)
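A hedged sketch of a matching process_item for the pipeline above, assuming the item's keys line up with the reflected table's columns:

def process_item(self, item, spider):
    # Insert each scraped item inside its own transaction.
    with self.engine.begin() as conn:
        conn.execute(self.table.insert().values(**dict(item)))
    return item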
def __init__(self): connection = pymongo.MongoClient( settings.get('MONGODB_SERVER'), settings.get('MONGODB_PORT') ) db = connection[settings.get('MONGODB_DB')] self.collection = db[settings.get('MONGODB_TABLE')]
def from_settings(cls, settings): server = redis.Redis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT')) persist = settings.get('SCHEDULER_PERSIST', True) timeout = settings.get('DUPEFILTER_TIMEOUT', 600) retries = settings.get('SCHEDULER_ITEM_RETRIES', 3) return cls(server, persist, timeout, retries)
def from_settings(cls, settings): cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0) cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0) cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90) cls.THUMBS = settings.get('IMAGES_THUMBS', {}) cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD) cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD) store_uri = settings['IMAGES_STORE'] return cls(store_uri)
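An example of the settings this images pipeline consumes; the store path and thumbnail sizes are placeholders:

# settings.py
IMAGES_STORE = '/tmp/images'
IMAGES_EXPIRES = 30
IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}
IMAGES_MIN_WIDTH = 110
IMAGES_MIN_HEIGHT = 110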
def __init__(self): self.dbpool = adbapi.ConnectionPool( "MySQLdb", db=settings.get("DATABASE_NAME"), user=settings.get("DATABASE_USER"), passwd=settings.get("DATABASE_PASSWORD"), cursorclass=MySQLdb.cursors.DictCursor, charset="utf8", use_unicode=True, )
def push_data(items): data = { 'items': json.dumps(items), } encoded_data = urllib.urlencode(data) remote_server = '127.0.0.1:8000' if settings.get('REMOTE_SERVER'): remote_server = settings.get('REMOTE_SERVER') text = urllib2.urlopen('http://%s/robot/push' % remote_server, encoded_data).read() print text
def login(self, resp): user = settings.get('VIETNAMWORK_USERNAME') password = settings.get('VIETNAMWORK_PASSWORD') return FormRequest.from_response(resp, method='POST', formdata={'form[username]': user, 'form[password]': password}, callback=self.check_login, dont_filter=True )
def __init__(self, id='keywordSpider'): self.rules = rules.rules self.seeds = json.JSONDecoder('utf-8').decode(''.join(open(settings.get('SEEDS')).readlines())) self.id = id self.start_urls = [] self.ts = datetime.now() self.domain = settings.get('DOMAIN') self.seed = self.seeds.get(self.domain) self.rule = self.rules.get(self.domain) self.getQueryWord()
def get_books(self): mongo_uri = settings.get("MONGO_URI") db_name = settings.get("DB_NAME") auth = settings.get("AUTH") client = pymongo.MongoClient(mongo_uri) db = client[db_name] if auth: db.authenticate(**auth) books = set(i['book_id'] for i in db['book_index'].find({'source_id': 21}, {'book_id': 1})) self.books = books
def __init__(self): dbargs = settings.get('DB_CONNECT') db_server = settings.get('DB_SERVER') dbpool = adbapi.ConnectionPool(db_server,**dbargs) self.dbpool = dbpool # refresh the lists of already-seen ids d = self.dbpool.runInteraction(self.update_feed_seen_ids) d.addErrback(self._database_error) u = self.dbpool.runInteraction(self.update_user_seen_ids) u.addErrback(self._database_error)
def process_request(self, request, spider): hostname = urlparse(request.url).hostname solr = pysolr.Solr(settings.get('SOLR_CONNECTION'), timeout=10) query = 'domain:*' + hostname.split(".")[-2] + '.onion*' if solr.search(query).hits > settings.get('MAX_PER_DOMAIN'): # Do not execute this request request.meta['proxy'] = "" msg = "Ignoring request {}, more than {} sites crawled from this domain.".format(request.url, settings.get('MAX_PER_DOMAIN')) log.msg(msg, level=log.INFO) raise IgnoreRequest()
def __init__(self, **kwargs): super(ChuangshiIndexSpider, self).__init__(**kwargs) mongo_uri = settings.get("MONGO_URI") db_name = settings.get("DB_NAME") auth = settings.get("AUTH") client = pymongo.MongoClient(mongo_uri) db = client[db_name] if auth: db.authenticate(**auth) books = set(i['book_id'] for i in db['book_index'].find({'source_id': 8}, {'book_id': 1})) self.books = books
def process_request(self, request, spider): addproxy = random.randrange(0, 1000) if addproxy > 500: request.meta['proxy'] = settings.get('HTTP_PROXY')
def open_spider(self, spider): file_path = settings.get("FILE_PATH") self.file = open(file_path, 'w', encoding='utf-8')
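A hedged completion of the pipeline above; writing items as JSON lines is an assumption about the intended output format:

import json

def process_item(self, item, spider):
    # Append each item to the file opened in open_spider as one JSON line.
    self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
    return item

def close_spider(self, spider):
    self.file.close()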
def process_request(self, request, spider): ua = random.choice(settings.get('USER_AGENT_LIST')) if ua: request.headers.setdefault('User-Agent', ua)
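An example of the settings this middleware expects; the agent strings and middleware path are placeholders:

# settings.py
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
]
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,  # assumed path
}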
def __init__(self, *args, **kwargs): export_fields = settings.get('FIELDS_TO_EXPORT', []) if export_fields: kwargs['fields_to_export'] = export_fields super(TopVNCsvItemExporter, self).__init__(*args, **kwargs)
class ZhipinSpider(scrapy.Spider): name = "boss" allowed_domains = ["www.zhipin.com"] current_page = 1 # starting page number start_urls = [ "https://www.zhipin.com/mobile/jobs.json?city=" + settings.get("BOSS_CITY_CODE") + "&query=" + settings.get("LANGUAGE"), ] custom_settings = { "ITEM_PIPELINES": { 'tutorial.pipelines.ZhipinPipeline': 300, }, "DOWNLOADER_MIDDLEWARES": { 'tutorial.middlewares.ZhipinMiddleware': 299, # 'tutorial.middlewares.ProxyMiddleware':301 }, "DEFAULT_REQUEST_HEADERS": { 'Accept': 'application/json', 'Accept-Language': 'zh-CN,zh;q=0.9', 'User-Agent': 'Mozilla/5.0 (Linux; Android 9.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Mobile Safari/537.36', 'Referer': 'https://www.zhipin.com/', 'X-Requested-With': "XMLHttpRequest", "cookie": "lastCity=101020100; JSESSIONID=" "; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1532401467,1532435274,1532511047,1532534098; __c=1532534098; __g=-; __l=l=%2Fwww.zhipin.com%2F&r=; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101020100-p100103%2F; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1532581213; __a=4090516.1532500938.1532516360.1532534098.11.3.7.11" } } def parse(self, response): js = json.loads(response.body) html = js['html'] q = Selector(text=html) items = q.css('.item') host = 'https://www.zhipin.com' x = 1 redis_host = settings.get('REDIS_HOST') redis_port = settings.get('REDIS_PORT') # initialize redis pool = redis.ConnectionPool(host=redis_host, port=redis_port, decode_responses=True) r = redis.Redis(connection_pool=pool) setkey = settings.get('REDIS_POSITION_KEY') sleep_seconds = int(settings.get('SLEEP_SECONDS')) # read once before the loop so the pagination sleep below also works when no items are returned for item in items: url = host + item.css('a::attr(href)').extract_first() position_name = item.css('h4::text').extract_first() # position name salary = item.css('.salary::text').extract_first() or '' # salary work_year = item.css( '.msg em:nth-child(2)::text').extract_first() or '不限' # required experience educational = item.css( '.msg em:nth-child(3)::text').extract_first() # education level meta = { "position_name": position_name, "salary": salary, "work_year": work_year, "educational": educational } time.sleep(int(random.uniform(sleep_seconds, sleep_seconds + 20))) position_id = url.split("/")[-1].split('.')[0] print(position_id) if (r.sadd(setkey, position_id)) == 1: yield Request(url, callback=self.parse_item, meta=meta) max_page = int(settings.get('MAX_PAGE')) if self.current_page < max_page: self.current_page += 1 api_url = "https://www.zhipin.com/mobile/jobs.json?city=" + settings.get( "BOSS_CITY_CODE") + "&query=" + settings.get( "LANGUAGE") + "&page=" + str(self.current_page) time.sleep(int(random.uniform(sleep_seconds, sleep_seconds + 20))) yield Request(api_url, callback=self.parse) pass def parse_item(self, response): item = TutorialItem() q = response.css item['address'] = q('.location-address::text').extract_first() item['create_time'] = q('.job-tags .time::text').extract_first() item['body'] = q('.text').xpath('string(.)').extract_first() item['company_name'] = q('.business-info h4::text').extract_first() item['postion_id'] = response.url.split("/")[-1].split('.')[0] item = dict(item, **response.meta) yield item
class A91Spider(scrapy.Spider): name = '91' allowed_domains = ['www.91porn.com'] # start_urls = ['https://www.google.com/'] start_urls = ['http://www.91porn.com/v.php?next=watch&page=2859'] proxies_ = settings.get('PROXIES') max_page = 4698 cookies = settings.get("COOKIES") # def start_requests(self): for url in self.start_urls: yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': random.choice(self.proxies_)}, cookies=self.cookies) def parse(self, response): # print(response.text) doc = PyQuery(response.text) rows = doc('.videos-text-align').items() for row in rows: try: # print(row.text()) split = row.text().strip() item = a91Item() r1 = split.split("积分:") item['score'] = r1[1] r2 = r1[0].split("留言:") item['msg'] = r2[1] r3 = r2[0].split("收藏:") item['favorite'] = r3[1] r4 = r3[0].split("查看:") item["views"] = r4[1] r5 = r4[0].split("作者:") item['author'] = r5[1] r6 = r5[0].split("添加时间:") item['add_time'] = r6[1].strip() r7 = r6[0].split(" ") if r6[0].lower().startswith("hd"): item['time'] = r6[0][3:8] item['title'] = r6[0][8:] else: item['time'] = r6[0][0:5] item['title'] = r6[0][5:] if utils.time_cmp(item['time'], min_time) < 0: continue img = row.find(".img-responsive") if img is not None: item['img'] = img.attr('src') else: item['img'] = None href = doc('.videos-text-align a').eq(0).attr("href") item['video_url'] = self.get_video_url(href) item['cell_url'] = href yield item except Exception as e: text = doc('span.pagingnav').text() error_list.append(text) print("parse" + e.__str__()) navs = doc(".pagingnav a") navs_eq = navs.eq(navs.length - 1) if navs_eq.text() == "»" and navs_eq.attr("href") is not None: print(prefix + navs_eq.attr("href")) yield scrapy.Request(prefix + navs_eq.attr("href"), callback=self.parse, meta={'proxy': random.choice(self.proxies_)}, cookies=self.cookies) def get_video_url(self, href): mget = utils.mget(href) if mget is None: return "" doc = PyQuery(mget.text) split = doc('#player_one script').eq(0).text().replace("\"", "").split("(")[2].split(",") with open("D:\develop\Python\scrapy\demo1\demo1\spiders\md5.js", "r") as f: data_func = f.read() # 读取js文件 tk = execjs.compile(data_func) # 编译执行js代码 a = "NS0tQCoqBCQKMg0AWjwoUwFaEGcTPzUBIRNBPCcAUi14AmRpIgwfS3QjETYLDVw7BigDKARRIQgLLXJ3N2x6MjALMEIaZR4NOy1nLhgEO3AqYiExCBNzHy0Obj8/A389KnA+R1AcORw8GihrIggYAiswMk00CSoFDX5hUhUjB3dydQ8BFX8UIzsdaUIrJgAd" b = "eec6NrNNPaOz9QejKhxWwwt7mjyDhT5X5h1Xnfx28IzNGteOelRRH+lqFG7Fz/OFSOamyVO4nh1lV5KCd7UzlF5fxcWneh5syBp44ecplNmZlbM2dtQ4zMokD63gvdRN8FUqO8BUw/X5" '''strencode()''' tk = tk.call('strencode', split[0], split[1]) # 调用函数 token为js里面的函数 a为传的参数 # tk = tk.call('strencode', a, b) # 调用函数 token为js里面的函数 a为传的参数 tk = str(tk).split('src=\'')[1].split("'")[0] # print('tk', tk) return tk
def closed(self, reason): # Write to TXT File with open(settings.get('STOCKLIST_FILE'), 'w') as txtfile: for row in self.stocklist_mainboard + self.stocklist_gem: txtfile.write(row[0] + ',' + row[1] + '\n')
def __init__(self): self.db = sa.create_engine(settings.get('MYSQLDB'), encoding='utf-8') self.conn = self.db.raw_connection() self.cursor = self.conn.cursor() # self.table = self.table[spider.name] self.count = 0
class AppWashingtonPostSpider(scrapy.Spider): name = 'app.Washington.post' limittime = settings.get('CRAWL_START_DATE') entry_point = { 'world': 'world/', 'business': 'business/', 'national': 'national/', 'politics': 'politics/', } headers = { 'authority': "www.washingtonpost.com", 'cache-control': "max-age=0,no-cache", 'upgrade-insecure-requests': "1", 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36", 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 'accept-encoding': "gzip, deflate, br", 'accept-language': "zh-CN,zh;q=0.9,en;q=0.8", } # querystring = {"id": "fB7vVe1tAPpwur", # "contentConfig": '{"path":"/world/?query=/WashingtonPost/Production/Digital/Queries/site-service/world/global-wire-feed&limit=15&offset=51"}', # "uri": "/pb/world/", # "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"} # # querystring = {"id": "f0TOnrBpFnCqur", # "contentConfig": '{"path":"/national/?query=/WashingtonPost/Production/Digital/Pages-Tablet/business/_module-content/refresh-query-ipadbiz&limit=15&offset=20"}', # "uri": "/pb/business/", # "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"} # # querystring = {"id": "fnFGqW1K2HCqur", # "contentConfig": '{"path":"/national/?query=/WashingtonPost/Production/Digital/Pages-Tablet/national/_module-content/refresh-query-ipadnational&limit=15&offset=20"}', # "uri": "/pb/national/", # "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"} # # querystring = {"id": "fzUfk61n8wBqur", # "contentConfig": '{"path":"/politics/?limit=15&offset=20"}', # "uri": "/pb/politics/", # "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"} def start_requests(self): for key in self.entry_point.keys(): yield Request(url='https://www.washingtonpost.com/{}'.format( self.entry_point[key]), method='GET', callback=self.parse, headers=self.headers, dont_filter=True) def parse(self, response): links = response.css('.story-headline a::attr(href)').extract() for link in set(links): yield Request(url=link, method='GET', callback=self.content_parse, headers=self.headers) def content_parse(self, response): date = response.css('.author-timestamp::attr(content)').extract_first() date = helper.list2str( re.findall('(\d{4}-\d{2}-\d{2}|\d{2}:\d{2})', date)) if date == None or len(date) == 0: return try: if helper.compare_time(date, self.limittime) < 0: return except: return pipleitem = CctvOpinionmonitor4Item() pipleitem['date'] = date id = re.findall('(\d{2,4}/).*', response.url) pipleitem['id'] = id[0] if len(id) > 0 else None pipleitem['url'] = response.url pipleitem['title'] = response.css('.title::text').extract_first() pipleitem['source'] = 'WashingtonPost' pipleitem['content'] = helper.list2str( response.css('.paywall p').xpath('string(.)').extract()) pipleitem['editor'] = response.css('.author::text').extract_first() pipleitem['views'] = None pipleitem['image_urls'] = helper.list2str( response.css('.paywall img::attr(src)').extract()) pipleitem['video_urls'] = helper.list2str( response.css('.paywall video::attr(src)').extract()) pipleitem['share'] = None pipleitem['like'] = None pipleitem['dislike'] = None url = re.findall('"@id":(\S*)', response.text)[0] rs = requests.get(url='{asset(url:' + url + '){totalCommentCount}}', headers=self.headers).text pipleitem['comment'] = re.findall('\d*', rs)[0] if len(re.findall('\d*', rs)) > 0 else None pipleitem['crawl_time'] = helper.get_localtimestamp() return pipleitem
def __init__(self): log.init_log(settings.get('LOG_DIR')) logging.info("spider start......") print "spider start......" logging.info("fafafa")
def __init__(self): log.init_log(settings.get('LOG_DIR'))
def __set_page_range(self): self.__range_list['start'] = settings.get('WEIBO_INFO_START_PAGE') self.__range_list['end'] = settings.get('WEIBO_INFO_END_PAGE')
def from_settings(cls, settings): server = redis.Redis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT')) persist = settings.get('SCHEDULER_PERSIST', True) up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10) hits = settings.get('QUEUE_HITS', 10) window = settings.get('QUEUE_WINDOW', 60) mod = settings.get('QUEUE_MODERATED', False) timeout = settings.get('DUPEFILTER_TIMEOUT', 600) ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60) add_type = settings.get('SCHEDULER_TYPE_ENABLED', False) add_ip = settings.get('SCHEDULER_IP_ENABLED', False) retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3) ip_regex = settings.get('IP_ADDR_REGEX', '.*') my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) return cls(server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex)
def process_request(self, request, spider): if 'proxy' not in request.meta: proxy = getattr(spider, 'proxy', settings.get('PROXY')) if proxy: request.meta['proxy'] = proxy
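A usage sketch showing the per-spider override the middleware above checks before falling back to the PROXY setting; the spider name and proxy address are placeholders:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    # Overrides the global PROXY setting for this spider only.
    proxy = 'http://10.0.0.5:3128'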
def process_request(self, request, spider): request.meta['proxy'] = settings.get('HTTP_PROXY') logger.debug('using proxy %s' % request.meta['proxy'] )
def __init__(self): self.redis_db = redis.Redis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT'), db=1, password=settings.get('REDIS_PASSWD')) self.url_uuid = "news_uuid"
class MySpider(RedisSpider): name = 'caijing_yicai_Agu' allowed_domains = ['www.yicai.com'] ori_path = settings.get('ORI_PATH') encoding = "utf-8" start_urls = [ "https://www.yicai.com/news/gushi/", ] headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0' } def start_requests(self): for url in self.start_urls: yield scrapy.Request(url, callback=self.parse, headers=self.headers, dont_filter=True) def parse(self, response): start_url = response.url try: data = htmlparser.Parser(response.body.decode(self.encoding)) except Exception as e: print('response failed %s' % e) return org_list = data.xpathall('''//div[@id="newslist"]/a''') # for org in org_list[:5]: for org in org_list: if org: title = org.xpath('''//h2/text()''').text().strip() ctime = org.xpath('''//div[@class="author"]/span''').regex( '(\d+-\d+-\d+ \d+:\d+)').text().strip() org_url = org.xpath('''//@href''').text().strip() if title: url = urljoin(start_url, org_url) print(url) ctime = local_timestamp(ctime) item = {'ctime': ctime, 'title': title} print(item) yield scrapy.Request(url, callback=self.detail_parse, meta={'item': item}, headers=self.headers, dont_filter=True) def detail_parse(self, response): item = response.meta['item'] try: data = htmlparser.Parser(response.body.decode(self.encoding)) except Exception as e: print('second response failed %s' % e) return url = response.url contents = [] # full text content content_list = data.xpathall('''//div[@class="m-txt"]/p''') for con in content_list: con = con.text().strip() if con: contents.append(con) content_x = data.xpath('''//div[@class="m-txt"]''').data content_xml = content_x label = {} img_list = data.xpathall('''//div[@class="m-txt"]//img''') if img_list: for count, image in enumerate(img_list): image_dict = {} image_url = image.xpath('//@src').text().strip() if image_url: image_url = urljoin(url, image_url) node = '#image{}#'.format(count) file_name = image_url.split('/')[-1] image_dict['url'] = image_url image_dict['name'] = '' image_dict['file_name'] = file_name label[node] = image_dict table_list = data.xpathall('''//div[@class="m-txt"]//table''') if table_list: for count, table in enumerate(table_list): table_dict = {} node = "#table{}#".format(count) table_sele = table.data table_dict['table_xml'] = table_sele node_p = "<p>" + node + "</p>" content_x = content_x.replace(table_sele, node_p) label[node] = table_dict xml = htmlparser.Parser(content_x) web_contents = [] # content shown on the web side (tables replaced with node placeholders) content_list = xml.xpathall('''//p''') for con in content_list: con = con.text().strip() if con: web_contents.append(con) breadcrumb = ["首页", "新闻", "A股"] article_info = {} channel = 'A股' accessory = [] # attachments # all_acc = data.xpathall('''//div[@class="ewb-info-con"]//a''') # if all_acc: # for acc in all_acc: # temp = {} # acc_url = acc.xpath('//@href').text().strip() # if acc_url and '@' not in acc_url: # acc_url = urljoin(url, acc_url) # name = acc.text().strip() # file_name = acc_url.split('/')[-1].split('=')[-1] # temp['url'] = acc_url # temp['name'] = name # temp['file_name'] = file_name # dir_path = os.path.join(self.ori_path, self.dir_name) # if not os.path.isdir(dir_path): # os.makedirs(dir_path) # path = os.path.join(dir_path, file_name) # dow_img_acc(path, acc_url) # # file_content = parse_main(path) # temp['file_content'] = '' # file_content # accessory.append(temp) gtime = int(time.time()) main_business = '' source = data.xpath( '''//div[@class="title f-pr"]/p/span/text()''').text().strip() webname = '第一财经' domain = self.allowed_domains[0] uid = add_uuid(url) item["collection_name"] = "news_finance_yicai_raw" # collection name item["url"] = url # link item["uid"] = uid # dedup id item["contents"] = contents # content for data processing item["web_contents"] = web_contents # content used by the front end item["article_info"] = article_info # article metadata item["label"] = label # images and tables item["accessory"] = accessory # attachments item["gtime"] = gtime # crawl timestamp item['breadcrumb'] = breadcrumb # breadcrumb item['channel'] = channel # channel item["spider_name"] = self.name # spider name item["webname"] = webname # site name item["domain"] = domain # domain item["source"] = source # source item["main_business"] = main_business # related industry item['path'] = '' # attachment path yield item
def from_crawler(cls, crawler): return cls(mongo_uri=crawler.settings.get('MONGODB_URI'), mongo_db=crawler.settings.get('MONGODB_DATABASE', 'items'))
def process_request(self, request, spider): request.meta['proxy'] = settings.get('HTTP_PROXY')
class OnionSpider(CrawlSpider): name = "OnionSpider" ALLOWED_DOMAINS = settings.get('ALLOWED_DOMAINS') if ALLOWED_DOMAINS and os.path.isfile(ALLOWED_DOMAINS): # Read a list of URLs from file # Create the target file list with open(ALLOWED_DOMAINS) as f: allowed_domains = f.read().splitlines() # Make it to Python list allowed_domains = filter(None, allowed_domains) # Remove empty strings else: allowed_domains = ["onion"] TARGET_SITES = settings.get('TARGET_SITES') if TARGET_SITES and os.path.isfile(TARGET_SITES): # Read a list of URLs from file # Create the target file list with open(TARGET_SITES) as f: start_urls = f.read().splitlines() # Make it to Python list start_urls = filter(None, start_urls) # Remove empty strings else: start_urls = [ 'https://ahmia.fi/address/', ] rules = (Rule(LinkExtractor(), callback='parse_item', follow=True), ) def parse_item(self, response): hxs = Selector(response) item = CrawledWebsiteItem() # Also the header item['header'] = response.headers item['url'] = response.url # Add the domain domain = urlparse(item['url']).hostname item['domain'] = domain title_list = hxs.xpath('//title/text()').extract() h1_list = hxs.xpath("//h1/text()").extract() item['h1'] = " ".join(h1_list) h2_list = hxs.xpath("//h2/text()").extract() item['h2'] = " ".join(h2_list) title = ' '.join(title_list) item['title'] = title encoding = self.detect_encoding(response) decoded_html = response.body.decode(encoding, 'ignore') html_text = self.html2string(decoded_html) words = self.extract_words(html_text) item['text'] = title + " " + " ".join(words) # For each link on this page item['links'] = [] links = hxs.xpath('//a') for link in links: link_obj = {} # Extract the link's URL link_str = " ".join(link.xpath('@href').extract()) link_obj['link'] = link_str.replace("\n", "") # Extract the links value link_name_str = " ".join(link.xpath('text()').extract()) link_name_str = link_name_str.replace("\n", "") link_name_str = link_name_str.lstrip() link_name_str = link_name_str.rstrip() link_obj['link_name'] = link_name_str item['links'].append(link_obj) return item def detect_encoding(self, response): return response.headers.encoding or "utf-8" def html2string(self, decoded_html): """HTML 2 string converter. Returns a string.""" converter = html2text.HTML2Text() converter.ignore_links = True string = converter.handle(decoded_html) return string def extract_words(self, html_string): """Create a word list.""" string_list = re.split(r' |\n|#|\*', html_string) # Cut a word list that is larger than 10000 words if len(string_list) > 10000: string_list = string_list[0:10000] words = [] for word in string_list: # Word must be longer than 0 letter # And shorter than 45 # The longest word in a major English dictionary is # Pneumonoultramicroscopicsilicovolcanoconiosis (45 letters) if len(word) > 0 and len(word) <= 45: words.append(word) return words
def open_spider(self, spider): db_config = settings.get("MONGODB_CONFIG") self.client = MongoClient(db_config["url"]) db = self.client[db_config["db"]] self.collection = db.line
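A hedged completion: the matching close_spider that releases the Mongo client opened above.

def close_spider(self, spider):
    self.client.close()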
def __init__(self): '''Initialization.''' self.dbItem = settings.get('DB_ITEM')
class NewsYorkbbsCaSpider(scrapy.Spider): name = 'news.yorkbbs.ca' limittime = settings.get('CRAWL_START_DATE') entry_point = { '综合': ['https://news.yorkbbs.ca/api/getlist', 'world'], '本地': ['https://news.yorkbbs.ca/api/getlist', 'local'], '专题': ['https://news.yorkbbs.ca/api/getMore/topics', 'breakingnews'], } param = {'type': None, 'pageIndex': '1', 'pageSize': '20'} commentpar = { 'articleId': None, 'origin': 'news', 'device': 'pc', 'pageIndex': '1', 'pageSize': '20' } headers = { 'Accept': '*/*', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Origin': 'http://news.yorkbbs.ca', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' } commentheaders = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Origin': 'http://news.yorkbbs.ca', 'Referer': None, 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' } def start_requests(self): for key in self.entry_point.keys(): self.param['type'] = self.entry_point[key][1] yield FormRequest(url=self.entry_point[key][0], formdata=self.param, callback=self.parse, headers=self.headers, dont_filter=True, meta={'type': key}) def parse(self, response): jsbd = json.loads(response.text) for item in jsbd['result']: if 'contentid' not in item.keys() or len(str( item['contentid'])) == 0: continue if response.meta['type'] == '专题': yield Request( url='https://news.yorkbbs.ca/breakingnews/{}'.format( item['contentid']), method='GET', callback=self.zhuanti_parse, headers=self.headers, dont_filter=True) else: yield Request(url='https://news.yorkbbs.ca/{}'.format( item['contentid']), method='GET', callback=self.content_parse, headers=self.headers, meta={'id': item['contentid']}, dont_filter=True) def zhuanti_parse(self, response): links = response.css('.g-burst-left li .ig a::attr(href)').extract() for li in links: if re.search('news.yorkbbs.ca', li) == None: continue id = re.findall('\d+', li)[0] yield Request(url='https://news.yorkbbs.ca/{}'.format(id), method='GET', callback=self.content_parse, headers=self.headers, meta={'id': id}, dont_filter=True) def content_parse(self, response): date = response.xpath( '//div[@class="fl times"]/text()').extract_first() if date == None or len(date) == 0: return try: if helper.compare_time(date, self.limittime) < 0: return except: return pipleitem = CctvOpinionmonitor4Item() pipleitem['date'] = date pipleitem['id'] = response.meta['id'] if 'id' in response.meta.keys() else None pipleitem['url'] = response.url pipleitem['title'] = response.css('title::text').extract_first() pipleitem['source'] = re.sub( '来源:', '', response.xpath('//div[@class="fl origin"]/text()').extract_first()) pipleitem['content'] = helper.list2str( response.css('.news-detail-cont').xpath('string(.)').extract()) pipleitem['editor'] = None pipleitem['views'] = None pipleitem['image_urls'] = helper.list2str( response.css('.news-detail-cont img::attr(src)').extract()) pipleitem['video_urls'] = None pipleitem['share'] = None pipleitem['like'] = response.css( '#support .num-total::text').extract_first() pipleitem['dislike'] = response.css( '#against .num-total::text').extract_first() self.commentheaders['Referer'] = response.url self.commentpar['articleId'] = response.meta['id'] html = requests.post( url='https://comment.yorkbbs.ca/api/comment/getComment', data=self.commentpar, headers=self.commentheaders) pipleitem['comment'] = json.loads(html.text)['totalCount'] pipleitem['crawl_time'] = helper.get_localtimestamp() return pipleitem
class AppLeparisienSpider(scrapy.Spider): name = 'app.leparisien' limittime = settings.get('CRAWL_START_DATE') entry_point = { 'politique': 'http://www.leparisien.fr/politique/', 'economie': 'http://www.leparisien.fr/economie/', 'societe': 'http://www.leparisien.fr/societe/' } headers = { 'Accept': '*/*', 'Host': 'www.leparisien.fr', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' } def start_requests(self): for key in self.entry_point.keys(): yield Request(url=self.entry_point.get(key), method='GET', callback=self.parse, headers=self.headers, dont_filter=True) def parse(self, response): links = response.css('a::attr(href)').extract() for link in set(links): if link == None or re.search( 'www.leparisien.fr', link) == None or re.search( '\d{2}-\d{2}-\d{4}', link) == None: continue if re.match('(https|http):', link) == None: link = 'http:' + link yield Request(url=link, method='GET', callback=self.content_parse, headers=self.headers) def content_parse(self, response): date = response.xpath( '//meta[@property="article:published_time"]/@content' ).extract_first() if date == None or len(date) == 0: return date = re.findall('\d+[-:]\d+[-:]*\d*', date) try: if helper.compare_time(helper.list2str(date), self.limittime) < 0: return except: return pipleitem = CctvOpinionmonitor5Item() id = re.findall('\d{7,}', response.url) pipleitem['date'] = helper.list2str(date) pipleitem['id'] = id[0] if len(id) > 0 else None pipleitem['url'] = response.url pipleitem['title'] = response.css('title::text').extract_first() pipleitem['source'] = response.xpath( '//span[@class="margin_top_sm ui_bold"]/text()').extract_first() pipleitem['content'] = helper.list2str( response.xpath( 'string(//div[@class="article-section margin_bottom_article"])' ).extract()) pipleitem['editor'] = response.xpath( '//span[@class="margin_top_sm ui_bold"]/text()').extract_first() pipleitem['views'] = None pipleitem['image_urls'] = helper.list2str( input=response.css('article.grid img::attr(src)').extract(), prefix='http://www.leparisien.fr') pipleitem['video_urls'] = helper.list2str( response.xpath('//iframe[@allow="autoplay"]/@src').extract()) pipleitem['share'] = None pipleitem['like'] = None pipleitem['dislike'] = None pipleitem['comment'] = None pipleitem['crawl_time'] = helper.get_localtimestamp() return pipleitem
def __init__(self): connection = MongoClient(settings.get('MONGODB_URI')) db = connection[settings['MONGODB_DATABASE']] # db.authenticate(settings['MONGODB_USERNAME'], settings['MONGODB_PASSWORD']) self.collection = db[settings['CRAWLER_COLLECTION']]
def process_request(self, request, spider): request.headers['cookie'] = settings.get('BOSS_COOKIE')
class ChinatimesComSpider(scrapy.Spider): name = 'chinatimes.com' limittime = settings.get('CRAWL_START_DATE') # 'https://www.chinatimes.com/politic/total/' entry_point = { '政治': 'politic', '财经': 'money', '国际': 'world', '两岸': 'chinese', '军事': 'armament', '社会': 'society', '言论': 'opinion', } headers = { 'authority': "www.chinatimes.com", 'upgrade-insecure-requests': "1", 'user-agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36", 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 'referer': "https://www.chinatimes.com/opinion/", 'accept-encoding': "gzip, deflate, br", 'accept-language': "zh-CN,zh;q=0.9,en;q=0.8,co;q=0.7", 'Cache-Control': "no-cache", 'Host': "www.chinatimes.com", } # 'https://www.chinatimes.com/opinion/PageListTotal/?page=2&_=1561981086807' def start_requests(self): for key in self.entry_point.keys(): yield Request(url='https://www.chinatimes.com/{}/PageListTotal/?page=1'.format(self.entry_point[key]), method='GET', callback=self.parse, headers=self.headers, dont_filter=True) def parse(self, response): jsbd = json.loads(response.text) for item in jsbd['list'] if 'list' in jsbd.keys() else []: if 'HyperLink' not in item.keys() or len(str(item['HyperLink'])) == 0: continue date = '{date} {time}'.format(date=item['ArticleDate'],time=item['ArticleTime']) date = helper.formatTime(date) id = item['Id'] if 'Id' in item.keys() else None yield Request(url='https://www.chinatimes.com{}'.format(item['HyperLink']), method='GET', callback=self.content_parse, headers=self.headers, meta={'date': date, 'id': id}) # 'https://www.chinatimes.com/realtimenews/20190701003505-260407' def content_parse(self, response): date = response.meta['date'] if 'date' in response.meta.keys() else None if date == None or len(date) == 0: return try: if helper.compare_time(date, self.limittime) < 0: return except: return pipleitem = CctvOpinionmonitor4Item() pipleitem['date'] = date pipleitem['id'] = response.meta['id'] if 'id' in response.meta.keys() else None pipleitem['url'] = response.url pipleitem['title'] = response.css('.title::text').extract_first() pipleitem['source'] = response.xpath('//div[@name="source"]/@content').extract_first() pipleitem['content'] = helper.list2str(response.css('.article-body p').xpath('string(.)').extract()) pipleitem['editor'] = response.css('.author a::text').extract_first() pipleitem['views'] = None pipleitem['image_urls'] = helper.list2str(response.css('img::attr(src)').extract()) pipleitem['video_urls'] = helper.list2str(response.css('video::attr(src)').extract()) pipleitem['share'] = None pipleitem['like'] = None pipleitem['dislike'] = None pipleitem['comment'] = None pipleitem['crawl_time'] = helper.get_localtimestamp() return pipleitem
class CaseNumberSpider(scrapy.Spider): name = 'casenumber' allowed_domains = ['www.itslaw.com'] custom_settings = { "LOG_LEVEL": "DEBUG", # "DOWNLOAD_TIMEOUT": 5, # "DOWNLOAD_DELAY": 0.2, "DOWNLOADER_MIDDLEWARES": { # 'itslaw.middlewares.ProxyMiddleware': 543, "itslaw.middlewares.ItslawDownloaderMiddleware": 534 }, "DEFAULT_REQUEST_HEADERS": { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36", "Referer": "https://www.itslaw.com/search?searchMode=judgements&sortType=1&conditions=trialYear%2B1994%2B7%2B1994", # "Cookie": "_t=0e9084b2-59b6-4cab-985f-be99b553e944; sessionId=49f99a6a-99e0-438a-8181-f3757aa8e267; LXB_REFER=mail.qq.com; _u=f0c76f8f-8df1-4e56-832a-7aa5fe6118c4; Hm_lvt_bc6f194cb44b24b9f44f1c8766c28008=1554555977,1554601580,1554601590,1554601609; Hm_lvt_e496ad63f9a0581b5e13ab0975484c5c=1554555977,1554601580,1554601591,1554601609; _i=bf039e9d-6188-4c4a-8bce-b4bd757b6b67; _p=032dd594-483e-4ec8-ab62-773cf754fdb9; Hm_lpvt_e496ad63f9a0581b5e13ab0975484c5c=1554601618; Hm_lpvt_bc6f194cb44b24b9f44f1c8766c28008=1554601618", }, "ITEM_PIPELINES": { 'itslaw.pipelines.ConditionPipeline': 300, } } settings = get_project_settings() redis_host = settings.get("REDIS_HOST") redis_port = settings.get("REDIS_PORT") proxy_server = settings.get("PROXY_SERVER") proxy_user = settings.get("PROXY_USER") proxy_pass = settings.get("PROXY_PASS") proxy_auth = "Basic " + base64.urlsafe_b64encode( bytes((proxy_user + ":" + proxy_pass), "ascii")).decode("utf8") pool = ConnectionPool(host=redis_host, port=redis_port, db=0) r = Redis(connection_pool=pool) count = os.getenv("COUNT", "") key = f'conditions:case{count}' # key = f'conditions:error' def start_requests(self): # $env:COUNT="" self.name += self.count while True: self.r.sdiffstore(self.key, self.key, "conditions:crawled") self.r.sdiffstore(self.key, self.key, "conditions:pages") self.r.sdiffstore(self.key, self.key, "conditions:beyond") left = self.r.sdiffstore(self.key, self.key, "conditions:noresult") self.logger.info(f"left {left} condition combinations to crawl.") urls = self.r.srandmember(self.key, number=10000) if not urls: break for url in urls: yield Request(str(url, encoding="utf-8"), dont_filter=True) def parse(self, response): url = response.url res = json.loads(response.body_as_unicode()) code = res["result"]["code"] message = res["result"]["message"] self.logger.debug(message) if 0 != code: error_essage = res["result"]["errorMessage"] self.logger.debug(error_essage) return try: data = res["data"] except Exception as e: self.logger.debug(e) yield Request(url=response.url, dont_filter=True) return searchResult = data["searchResult"] total_count = searchResult["totalCount"] if 0 == total_count: self.r.sadd("conditions:noresult", url) elif total_count <= 20: judgements = searchResult["judgements"] for each in judgements: jid = each["id"] yield JudgementItem(id=jid) else: self.r.sadd("conditions:crawled", url) elif total_count <= 400: self.r.sadd("conditions:pages", url) else: self.r.sadd("conditions:beyond", url)
def __init__(self, *args, **kwargs): kwargs['fields_to_export'] = settings.getlist('EXPORT_FIELDS') or None kwargs['encoding'] = settings.get('EXPORT_ENCODING', 'utf-8') super(CSVkwItemExporter, self).__init__(*args, **kwargs)
class ConditionSpider(scrapy.Spider): name = 'condition' allowed_domains = ['www.itslaw.com'] custom_settings = { # "LOG_LEVEL": "DEBUG", "DOWNLOAD_TIMEOUT": 5, # "DOWNLOAD_DELAY": 0.2, "DOWNLOADER_MIDDLEWARES": { 'itslaw.middlewares.ProxyMiddleware': 543, # "itslaw.middlewares.ItslawDownloaderMiddleware": 534 }, "DEFAULT_REQUEST_HEADERS": { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36", "Referer": "https://www.itslaw.com/search?searchMode=judgements&sortType=1&conditions=trialYear%2B1994%2B7%2B1994", "Cookie": "_t=0e9084b2-59b6-4cab-985f-be99b553e944; showSubSiteTip=false; subSiteCode=bj; LXB_REFER=www.wusong.com; Hm_lvt_bc6f194cb44b24b9f44f1c8766c28008=1555339418,1555339440,1555339451; Hm_lvt_e496ad63f9a0581b5e13ab0975484c5c=1555339418,1555339440,1555339451; sessionId=53b834b2-5dc8-4be5-889f-c5c425f51fc6; _u=8768e601-6c73-4ff3-941a-99f77f09b573; Hm_lpvt_bc6f194cb44b24b9f44f1c8766c28008=1557581284; Hm_lpvt_e496ad63f9a0581b5e13ab0975484c5c=1557581284", }, "ITEM_PIPELINES": { 'itslaw.pipelines.ConditionPipeline': 300, } } settings = get_project_settings() redis_host = settings.get("REDIS_HOST") redis_port = settings.get("REDIS_PORT") proxy_server = settings.get("PROXY_SERVER") proxy_user = settings.get("PROXY_USER") proxy_pass = settings.get("PROXY_PASS") proxy_auth = "Basic " + base64.urlsafe_b64encode(bytes((proxy_user + ":" + proxy_pass), "ascii")).decode("utf8") pool = ConnectionPool(host=redis_host, port=redis_port, db=0) r = Redis(connection_pool=pool) key = f'condition:searchword{os.getenv("COUNT", "")}' # key = f'conditions:error' def start_requests(self): # $env:COUNT="" while True: left = self.r.sdiffstore(self.key, self.key, "condition:crawled") self.logger.info(f"[*] left {left} condition combinations to crawl.") urls = self.r.srandmember(self.key, number=10000) if not urls: break for url in urls: yield Request(str(url, encoding="utf-8"), dont_filter=True) def parse(self, response): url = response.url try: res = json.loads(response.body_as_unicode()) except Exception as e: return code = res["result"]["code"] message = res["result"]["message"] self.logger.debug(message) if 0 != code: error_essage = res["result"]["errorMessage"] self.logger.debug(error_essage) self.r.sadd("condition:error", response.url) return try: data = res["data"] except Exception as e: self.r.sadd("condition:error", response.url) self.logger.debug(e) yield Request(url=response.url, dont_filter=True) return searchResult = data["searchResult"] judgements = searchResult.get("judgements", []) for each in judgements: jid = each["id"] yield JudgementItem(id=jid) else: self.r.sadd("condition:crawled", url)