def parse(self, response): xs = Selector(response) base_objects = [] base_elem = self.scraper.get_base_elem() rpt = response.request.meta['rpt'] page_num = response.request.meta['page_num'] page = self.pages[page_num - 1] follow_page_num = response.request.meta['follow_page_num'] if rpt.page_type == 'MP': if self.current_output_num_mp_response_bodies < self.conf[ 'OUTPUT_NUM_MP_RESPONSE_BODIES']: self.current_output_num_mp_response_bodies += 1 self.log( "Response body ({url})\n\n***** RP_MP_{num}_START *****\n{resp_body}\n***** RP_MP_{num}_END *****\n\n" .format(url=response.url, resp_body=response.body, num=self.current_output_num_mp_response_bodies), logging.INFO) if rpt.content_type == 'J': json_resp = None try: json_resp = json.loads(response.body_as_unicode()) except ValueError: msg = "JSON response for MP could not be parsed!" self.log(msg, logging.ERROR) if json_resp: try: jsonpath_expr = parse(base_elem.x_path) except JsonPathLexerError: msg = "JsonPath for base elem could not be processed!" self.dds_logger.error(msg) raise CloseSpider() base_objects = [ match.value for match in jsonpath_expr.find(json_resp) ] if len(base_objects) > 0: base_objects = base_objects[0] else: base_objects = response.xpath(base_elem.x_path) if (len(base_objects) == 0): self.log( "{cs}No base objects found.{ce}".format( cs=self.bcolors["INFO"], ce=self.bcolors["ENDC"]), logging.ERROR) if (self.conf['MAX_ITEMS_READ']): items_left = min( len(base_objects), self.conf['MAX_ITEMS_READ'] - self.items_read_count) base_objects = base_objects[0:items_left] for obj in base_objects: item_num = self.items_read_count + 1 self.tmp_non_db_results[item_num] = {} page_str = str(page_num) + '(' + str(follow_page_num) + ')' self.dds_logger.info("") self.dds_logger.info( self.bcolors['BOLD'] + '--------------------------------------------------------------------------------------' + self.bcolors['ENDC']) self.struct_log( "{cs}Starting to crawl item {i} from page {p}.{ce}".format( i=str(item_num), p=page_str, cs=self.bcolors["HEADER"], ce=self.bcolors["ENDC"])) self.dds_logger.info( self.bcolors['BOLD'] + '--------------------------------------------------------------------------------------' + self.bcolors['ENDC']) item = self.parse_item(response, obj, rpt.page_type, item_num) item._dds_item_page = page item._dds_item_page_num = page_num item._dds_item_follow_page_num = follow_page_num item._dds_item_id = item_num item._dds_id_str = str(item._dds_item_page_num) + '(' + str( item._dds_item_follow_page_num) + ')-' + str(item._dds_item_id) if item: only_main_page_idfs = True idf_elems = self.scraper.get_id_field_elems() for idf_elem in idf_elems: if idf_elem.request_page_type != 'MP': only_main_page_idfs = False is_double = False if only_main_page_idfs: item, is_double = self._check_for_double_item(item) # Don't go on reading detail pages when... 
# No detail page URLs defined or # DOUBLE item with only main page IDFs and no standard update elements to be scraped from detail pages or # generally no attributes scraped from detail pages cnt_sue_detail = self.scraper.get_standard_update_elems_from_detail_pages( ).count() cnt_detail_scrape = self.scraper.get_from_detail_pages_scrape_elems( ).count() if self.scraper.get_detail_page_url_elems().count() == 0 or \ (is_double and cnt_sue_detail == 0) or cnt_detail_scrape == 0: self.non_db_results[id( item)] = self.tmp_non_db_results[item_num].copy() yield item else: #self.run_detail_page_request() url_elems = self.scraper.get_detail_page_url_elems() for url_elem in url_elems: if not url_elem.scraped_obj_attr.save_to_db: url_before = self.tmp_non_db_results[item_num][ url_elem.scraped_obj_attr.name] url, applied = self._replace_placeholders( url_before, item, item_num, True) self.tmp_non_db_results[item_num][ url_elem.scraped_obj_attr.name] = url else: url_before = item[url_elem.scraped_obj_attr.name] url, applied = self._replace_placeholders( url_before, item, item_num, True) item[url_elem.scraped_obj_attr.name] = url if len(applied) > 0: msg = "Detail page URL placeholder(s) applied (item {id}): {a}".format( a=str(applied), id=item._dds_id_str) self.log(msg, logging.DEBUG) self.log("URL before: " + url_before, logging.DEBUG) self.log("URL after : " + url, logging.DEBUG) dp_rpt = self.scraper.get_rpt_for_scraped_obj_attr( url_elem.scraped_obj_attr) kwargs = self.dp_request_kwargs[ dp_rpt.page_type].copy() if 'meta' not in kwargs: kwargs['meta'] = {} kwargs['meta']['page_num'] = page_num kwargs['meta']['follow_page_num'] = follow_page_num kwargs['meta']['item'] = item kwargs['meta']['from_page'] = dp_rpt.page_type kwargs['meta']['item_num'] = item_num kwargs['meta']['rpt'] = dp_rpt if 'headers' in kwargs: kwargs['headers'] = self._do_req_info_replacements( item, item_num, page, kwargs['headers'], "HEADERS") if 'body' in kwargs: body_before = kwargs['body'] kwargs['body'] = kwargs['body'].replace( '{page}', str(page)) kwargs[ 'body'], applied = self._replace_placeholders( kwargs['body'], item, item_num, True) if len(applied) > 0: msg = "Request info placeholder(s) applied (item {id}): {a}".format( a=str(applied), id=item._dds_id_str) self.log(msg, logging.DEBUG) self.log("BODY before: " + body_before, logging.DEBUG) self.log("BODY after : " + kwargs['body'], logging.DEBUG) if 'cookies' in kwargs: kwargs['cookies'] = self._do_req_info_replacements( item, item_num, page, kwargs['cookies'], "COOKIES") form_data = None if dp_rpt.request_type == 'F' and dp_rpt.form_data: form_data = json.loads(dp_rpt.form_data).copy() form_data = self._do_req_info_replacements( item, item_num, page, form_data, "FORM DATA") if url_elem == url_elems[len(url_elems) - 1]: kwargs['meta']['last'] = True else: kwargs['meta']['last'] = False self._set_meta_splash_args() #logging.info(str(kwargs)) self.log( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", logging.INFO) msg = "{cs}Calling {dp} URL for item {id}...{ce}".format( dp=dp_rpt.page_type, id=item._dds_id_str, cs=self.bcolors["HEADER"], ce=self.bcolors["ENDC"]) self.log(msg, logging.INFO) msg = "URL : {url}".format(url=url) self.log(msg, logging.INFO) self._log_request_info(dp_rpt, form_data, kwargs) self.log( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", logging.INFO) if dp_rpt.request_type == 'R': yield response.follow( url, callback=self.parse_item, method=dp_rpt.method, 
dont_filter=dp_rpt.dont_filter, **kwargs) else: yield FormRequest(url, callback=self.parse_item, method=dp_rpt.method, formdata=form_data, dont_filter=dp_rpt.dont_filter, **kwargs) for key, value in list(item.items()): #Fixing some extremely weird Python 2 encoding failure, 2017-06-29 if type(value).__name__ == 'str': try: value = value.decode('utf-8') except AttributeError: pass if value and (type(value).__name__ in ['str', 'unicode']) and '{page}' in value: msg = "Applying page placeholder on {k}...".format( k=key) self.log(msg, logging.DEBUG) self.log("Value before: " + value, logging.DEBUG) value = value.replace('{page}', str(page)) item[key] = value self.log("Value after: " + value, logging.DEBUG) else: self.log("Item could not be read!", logging.ERROR) mir_reached = False if self.conf['MAX_ITEMS_READ'] and ( self.conf['MAX_ITEMS_READ'] - self.items_read_count <= 0): mir_reached = True if self.scraper.follow_pages_url_xpath and not mir_reached: if not self.conf['NUM_PAGES_FOLLOW'] or follow_page_num < self.conf[ 'NUM_PAGES_FOLLOW']: url = response.xpath( self.scraper.follow_pages_url_xpath).extract_first() if url is not None: self._set_meta_splash_args() follow_page = '' if self.scraper.follow_pages_page_xpath: follow_page = response.xpath( self.scraper.follow_pages_page_xpath ).extract_first() form_data_orig = None if self.scraper.get_follow_page_rpts().count() > 0: f_rpt = self.scraper.get_follow_page_rpts()[0] form_data_orig = self.scraper.get_follow_page_rpts( )[0].form_data else: f_rpt = self.scraper.get_main_page_rpt() form_data_orig = self.scraper.get_main_page_rpt( ).form_data kwargs, form_data = self._prepare_mp_req_data( self.fp_request_kwargs, form_data_orig, page, follow_page) follow_page_num += 1 kwargs['meta']['page_num'] = page_num kwargs['meta']['follow_page_num'] = follow_page_num kwargs['meta']['rpt'] = f_rpt self._log_page_info(page_num, follow_page_num, url, f_rpt, form_data, kwargs) if f_rpt.request_type == 'R': yield response.follow(url, callback=self.parse, method=f_rpt.method, dont_filter=f_rpt.dont_filter, **kwargs) else: url = response.urljoin(url) yield FormRequest(url, callback=self.parse, method=f_rpt.method, formdata=form_data, dont_filter=f_rpt.dont_filter, **kwargs)
def parse(self, response):
    type = response.css('head').xpath(
        './meta[@property="og:type"]/@content').get().split('.')[1]
    if not type == "movie":
        yield None
        return
    titleSection = response.css('.subpage_title_block')
    if titleSection is None:
        yield None
        return
    idMovie = titleSection.css(".parent").xpath(
        "./h3/a/@href").get().split('/')[2]
    if idMovie in self.moviesScrapped:
        yield None
        return
    movieYear = titleSection.css('.nobr').xpath(
        './text()').get().strip().replace(')', '(').split('(')[1].split(' ')[0]
    if movieYear is None:
        yield None
        return
    if (int(movieYear) < 1980 or int(movieYear) > 1989):
        yield None
        return
    movieName = titleSection.xpath('./div/h3/a/text()').get()
    actorList = response.css('.cast_list').xpath('./tr')[1::]
    nextScrap = []
    for c in actorList:
        if self.documentscount >= 5000:
            yield None
            raise CloseSpider('Number of documents reached')
        if (len(c.xpath('./td').getall()) < 3):
            continue
        actorURL = c.xpath('./td/a/@href').get()
        actorId = actorURL.split('/')[2]
        actorName = c.xpath(
            './td[@class="primary_photo"]//a/img/@alt').get()
        actorRole = c.xpath(
            './td[@class="character"]/text()').get().strip().replace("\n", "")
        if actorRole == '':
            actorRole = c.xpath('./td[@class="character"]/a/text()').get()
        if actorURL is not None:
            nextScrap.append({
                "url": self.allowed_domains[0] + actorURL,
                "id": actorId
            })
        yield {
            "movie_id": idMovie,
            "movie_name": movieName,
            "movie_year": movieYear,
            "actor_name": actorName,
            "actor_id": actorId,
            "role_name": actorRole
        }
        self.documentscount = self.documentscount + 1
    self.moviesScrapped.append(idMovie)
    for a in nextScrap:
        if a['id'] not in self.actorsScrapped:
            self.actorsScrapped.append(a['id'])
            next_page = "https://" + a['url']
            yield Request(next_page, callback=self.parse_artist)
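# Added note: the parse() callback above relies on spider-level state (self.moviesScrapped,
# self.actorsScrapped, self.documentscount) and a parse_artist callback that are not shown in
# this excerpt. The sketch below is an assumption of how that state could be initialized; the
# class name, domain list, and parse_artist stub are illustrative only, not from the original.
import scrapy


class ImdbDecadeSpider(scrapy.Spider):
    name = "imdb_decade"                  # hypothetical spider name
    allowed_domains = ["www.imdb.com"]    # self.allowed_domains[0] is used above to build actor URLs

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.moviesScrapped = []    # movie ids already yielded
        self.actorsScrapped = []    # actor ids already queued for parse_artist
        self.documentscount = 0     # records yielded so far (CloseSpider raised at 5000)

    def parse_artist(self, response):
        # placeholder for the actor-page callback requested above
        pass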
def parse_page(self, response): # inspect_response(response, self) if 'reaction_units/more' in response.url: json_data = json.loads(response.body_as_unicode().replace( 'for (;;);', '')) post_html = json_data.get('domops')[0][-1].get('__html') structural_json_data = self._create_structed_json_data(json_data) else: main_content_id = response.css( '#pagelet_timeline_main_column>div::attr(id)').extract_first() if not main_content_id: raise CloseSpider('Main content id not found') main_script = response.xpath( f'//script/text()[contains(.,"{main_content_id}") ' f'and contains(.,"content:")]').extract_first() main_id = re.search(r'container_id\:"(.*?)"', main_script).group(1) post_html = response.css(f'#{main_id}').extract_first() post_html = post_html.replace('-->', '').replace('<!--', '') sel = Selector(text=post_html) posts = sel.xpath( '//div[@class="_1xnd"]' '/div[@class and not(descendant::*[contains(@class,"uiMorePagerPrimary")])]' ) page_name = response.meta.get('page_name') or \ response.css('#pageTitle::text').extract_first() page_name = page_name.split('-')[0].rstrip() page_id = response.meta.get('page_id') for post in posts: loader = FacebookPostItemLoader(selector=post) loader.add_value('page_name', page_name) loader.add_value('page_id', page_id) loader.add_css('post_id', 'input[name*="identifier"]::attr(value)') post_id = loader.get_output_value('post_id') loader.add_value( 'post_url', f'https://www.facebook.com/{page_id}/posts/{post_id}') loader.add_xpath( 'post_text', './/div[@data-testid="post_message"]' '//text()[not(ancestor::span[@class="text_exposed_hide"])]') loader.add_css('image_urls', '.mtm a::attr(data-ploi)', MapCompose(lambda v: v.split('?')[0])) loader.add_css( 'video_url', '.fsm>a::attr(href)', MapCompose(response.urljoin, lambda v: v if 'videos' in v else None, lambda v: v.split('?')[0])) if 'reaction_units/more' in response.url: post_json_data = structural_json_data.get(post_id) else: # inspect_response(response, self) post_script = response.xpath( f'//script/text()[contains(.,"{post_id}") ' f'and (contains(.,"post_fbid") or contains(.,"photo_fbid"))]' ).extract_first() post_script = re.search( r'onPageletArrive\((\{.*\})', post_script).group(1).split('all_phases')[0] + '}' json_data = demjson.decode(post_script) json_data = json_data.get('jsmods').get( 'pre_display_requires')[0][3][1].get('__bbox') variables = json_data.get('variables') post_json_data = json_data.get('result').get('data').get( 'feedback') loader.add_value( 'comment_count', post_json_data.get('comment_count').get('total_count')) loader.add_value('reaction_count', post_json_data.get('reaction_count').get('count')) loader.add_value('share_count', post_json_data.get('share_count').get('count')) comment_json = post_json_data.get('display_comments') edges = comment_json.get('edges') for edge in edges: comment_loader = FacebookCommentItemLoader() node = edge.get('node') comment_loader.add_value('comment_id', node.get('id')) try: comment_loader.add_value('comment_text', node.get('body').get('text')) except AttributeError: pass author = node.get('author') comment_loader.add_value('author_name', author.get('name')) comment_loader.add_value('author_id', author.get('id')) comment_loader.add_value('author_url', author.get('www_url')) loader.add_value('comments', comment_loader.load_item()) yield loader.load_item() # TODO: Fetch first 50 comments # page_info = comment_json.get('page_info') # has_next_comment_page = page_info.get('has_next_page') # if has_next_comment_page: # end_cursor = 
page_info.get('end_cursor') # variables['after'] = end_cursor # variables['before'] = None # # # yield Request( # # url='https://www.facebook.com/api/graphql/', # # method='POST', # # body=json.dumps(body), # # callback=self.parse_next_comment, # # headers=headers, # # ) async_get_token = response.xpath( '//script/text()[contains(.,"async_get_token")]').extract_first( ) or response.body_as_unicode() async_get_token = re.search(r'"async_get_token"\:"(.*?)"', async_get_token).group(1) next_page = sel.css( '.uiMorePagerPrimary::attr(ajaxify)').extract_first() if next_page: next_url = response.urljoin(next_page) extra_params = urllib.parse.urlencode({ '__a': 1, 'fb_dtsg_ag': async_get_token }) next_url += '&' + extra_params yield Request(next_url, callback=self.parse_page, meta={ 'page_name': page_name, 'page_id': page_id })
class ZoominTvSpider(scrapy.Spider):
    name = "zoomin.tv"
    allowed_domains = ["zoomin.tv"]
    callbacked = False
    pids = [
        'corporateusahddp', 'corporateuk', 'corporateke', 'corporatees',
        'corporatelatamdp', 'corporatecataldp', 'corporatenl', 'corporatevla',
        'corporatede', 'corporateit', 'corporatefr', 'corporatewal',
        'corporatebradp', 'corporatetr', 'corporateswedp', 'corporateru',
        'corporatejp', 'corporatechinacndp', 'corporatearabdp'
    ]

    # start_urls = (
    #     'http://www.zoomin.tv/',
    # )
    # http://blackbird.zoomin.tv/ProgramXml/.json?feedtype=json&pid=corporateusahddp&vtype=direct&aid=754116
    # http://zoomin.tv/video/#!v/754116/

    def __init__(self, url, uuid, upload_url, callback, check_video_url=None, *args, **kwargs):
        super(ZoominTvSpider, self).__init__(*args, **kwargs)
        print 'init', url
        self.config = ConfigParser.ConfigParser()
        self.config.read("config/config.ini")
        self.uuid = uuid
        self.upload_url = upload_url
        self.callback = callback
        self.check_video_url = check_video_url
        # initialize db
        with open("config/database.cnf") as f:
            config = json.load(f)
        db_cls = get_database(config.get("database_type", None))
        self.db = db_cls(**config.get("database", {}))
        self.start_urls.append(url)

    def parse(self, response):
        print 'parsePlayurl', response.url
        try:
            video_id = self._match_id(self.start_urls[0])
        except AssertionError, e:
            raise CloseSpider('link not supported')
        logger.warn('[parse]' + self.start_urls[0] + ' [uuid]' + self.uuid + ' [video_id]' + video_id)
        if self.check_db():
            return
        video = None
        for pid in self.pids:
            getinfo_url = 'http://blackbird.zoomin.tv/ProgramXml/.json?feedtype=json&pid=%s&vtype=direct&aid=%s' % (
                pid, video_id)
            resp = requests.get(getinfo_url)
            info = resp.json()
            print info
            if len(info['programme']) > 0:
                video = info['programme'][0]
                break
        video_url = video['videourl']
        endpoint, backet, obj = service.utils.paseUploadUrl(self.upload_url)
        print endpoint, backet, obj
        result = service.utils.uploadVideoByUrl(video_url, endpoint, backet, obj)
        if not result:
            raise CloseSpider('upload oss failed')
        filesize = video['videosize']
        length = int(video['videoduration']) / 1000.0
        title = video['title']
        print 'filesize', filesize
        # callback
        data = {
            "video_id": self.uuid,
            "state": 1,
            "message": u'成功',
            "length": length,
            "play_id": self.uuid,
            "size": filesize,
            "cover": '',
            "title": title
        }
        self.callbacked = service.utils.callback_result(self.callback, data=data)
        logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid)
        video_data = {
            'title': title,
            'video_id': video_id,
            'author': self.name,
            'publish': time.strftime('%Y-%m-%d %H:%M:%S'),
            'page_url': self.start_urls[0],
            'video_length': length,
            'video_size': filesize,
            'video_url': video_url,
            'easub_uuid': self.uuid
        }
        self.db.save_video(video_data)
def parse(self, response):
    # x_path test
    checker = response.request.meta['checker']
    rpt = response.request.meta['rpt']
    if self.conf['OUTPUT_RESPONSE_BODY']:
        self.log(
            "Response body ({url})\n\n***** RP_START *****\n{resp_body}\n***** RP_END *****\n\n"
            .format(url=response.url, resp_body=response.body.decode('utf-8')),
            logging.INFO)

    if checker.checker_type == '4':
        self.log(
            "{cs}No 404 result ({c} checker type).{ce}".format(
                c=str(checker), cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self.dds_logger.info("{cs}Item kept.{ce}".format(
                cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]))
        return

    if rpt.content_type == 'J':
        json_resp = json.loads(response.body_as_unicode())
        try:
            jsonpath_expr = parse(checker.checker_x_path)
        except JsonPathLexerError:
            msg = "Invalid checker JSONPath ({c})!".format(c=str(checker))
            self.dds_logger.error(msg)
            raise CloseSpider()
        test_select = [match.value for match in jsonpath_expr.find(json_resp)]
        #self.log(unicode(test_select), logging.INFO)
    else:
        try:
            test_select = response.xpath(checker.checker_x_path).extract()
        except ValueError:
            self.log("Invalid checker XPath ({c})!".format(c=str(checker)),
                     logging.ERROR)
            return

    if len(test_select) > 0 and checker.checker_x_path_result == '':
        self.log(
            "{cs}Elements for XPath found on page (no result string defined) ({c}). Delete reason.{ce}"
            .format(c=str(checker), cs=self.bcolors["ERROR"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
        return
    elif len(test_select) > 0 and test_select[0] == checker.checker_x_path_result:
        self.log(
            "{cs}XPath result string '{s}' found on page ({c}). Delete reason.{ce}"
            .format(s=checker.checker_x_path_result, c=str(checker),
                    cs=self.bcolors["ERROR"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
        return
    else:
        self.log(
            "{cs}XPath result string not found ({c}).{ce}".format(
                c=str(checker), cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self.dds_logger.info("{cs}Item kept.{ce}".format(
                cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]))
        return
def parse(self, response): ''' this part parses the response, then call the request again for the next pages and so on ''' print("") print("") print("") print("") print(" ======== " + self.name + " from " + str(self.page_0) + " to " + str(self.page_1) + " ========") print("page ============================ ", str(self.page)) print("page ============================ ", str(self.page)) print("iterations =======================", str(self.iters)) print("timestamp ======================= ", datetime.datetime.now()) print("time since start ==================== ", datetime.datetime.now() - self.start_time) #uncomment below to check IP one by one #yield scrapy.Request('http://checkip.dyndns.org/', headers = {'Connection': 'close'}, callback=self.check_ip, dont_filter = True) #uncomment to check IP one by one #randomizes the user agents to make detection harder ua_files = open('ua_files.txt').read().splitlines() user_agents = random.choice(ua_files) url = self.url.replace('__pagenum__', str((self.page * 50))) print('attempts on this page ============================', str(self.attempts + 1)) print("user agent ====================", user_agents) #headers for the request, might need checking once in a while whether it match the actual request headers headers = { 'accept': '*/*', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'en-US,en;q=0.9' #,'if-none-match-' : '55b03-20443a68390f59aa1bc448bc3b42fa6e' , 'referer': self.referer.replace('__pagenum__', str(self.page)), 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'user-agent': user_agents, 'x-api-source': 'pc', 'x-requested-with': 'XMLHttpRequest', 'Connection': 'close' } yield scrapy.Request(url=url, callback=self.parse, headers=headers, dont_filter=True) data = json.loads(response.text) #print(data) #every region has different error patterns, as shown below. Need to be checked manually #give up after 50 tries, also the possibility of the category have less than 160 pages if self.attempts <= 10: if self.region in ['id', 'vn', 'th']: self.check_corrupt(data=data, zeroes=5, pagenum=50) elif self.region == 'ph': self.check_corrupt(data=data, zeroes=5, pagenum=45) elif self.region == 'my': self.check_corrupt(data=data, zeroes=3, pagenum=50) print("data corrupted ================ ", self.corrupt) else: print( "data corrupted ================ but gave up trying on this page" ) self.corrupt = 0 self.breaker = 1 pass print("data corrupted ================ ", self.corrupt) self.iters += 1 #if hit max page, that call the cleaning function ##if you're using unbatched pagination and wants to use single process cleaning, uncomment the cleaning func ##if you're using unbatched pagination and wants to use single process cleaning with an integrated dataframe (ie not reading the entire printed JSON object), uncomment the df processs ##the integrated df method should be more pythonic and efficient. 
It is still somewhat unstable though, use with caution if self.page >= self.page_max + 1: #if you're using unbatched pagination and wants to use single process cleaning, uncomment this #cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory) ##if you're using unbatched pagination and wants to use single process cleaning with an integrated dataframe (ie not reading the entire printed JSON object), uncomment this '''if 'rank' not in self.df: self.df['rank'] = np.arange(len(self.df)) print(' =================== raw' + self.name + '.csv') self.df.to_csv('raw' + self.name + '.csv', index=False) print(' =================== raw' + self.name + '.csv')''' raise CloseSpider("====MAX PAGE HAS BEEN REACHED!==== ") ##if you're using unbatched pagination and wants to use single process cleaning, you can comment out this entire elif part as it becomes redundant elif self.page >= self.page_1: #self.cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory) '''if 'rank' not in self.df: self.df['rank'] = np.arange(len(self.df)) print(' =================== raw' + self.name + '.csv') self.df.to_csv('raw' + self.name + '.csv', index=False) print(' =================== raw' + self.name + '.csv')''' raise CloseSpider("====MAX PAGE HAS BEEN REACHED!==== ") #if error occurs, and max threshold is hit, print JSON as-is elif self.corrupt == 0 and self.breaker == 1: cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory) if data['items'] is None: raise CloseSpider("====NO DATA IS RETURNED!==== ") elif data['query_rewrite'] is None: raise CloseSpider("====DATA CORRUPTED!====") else: raise CloseSpider("==== UNKNOWN ERROR ==== ") #if error occurs, print out the corresponding error types, then loop to scrapthe same page again elif self.corrupt == 1: self.attempts += 1 print("Something went wrong!, retry attempts ===== ", self.attempts) if data['items'] is None: print( "Error =========== data[item] is None, no data is returned!" 
) time.sleep(5) elif self.corrupt == 1: print( "Error =========== data is corrupted!, retrying in 5 secs") time.sleep(5) #if OK, then print the acquired JSON data to a JSON file to be compiled later by the cleaning function else: with open(os.path.join( 'raw_shopee/raw_shopee_' + self.region + '/' + self.name, 'data_q_' + str(self.page) + '.json'), 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=4) #data = data['items'] #data.update({'page':self.page}) #data.update({'rating_star': obj['items'][k]['item_rating']['rating_star']}) #data.update({'timestamp':pd.datetime.now().replace(microsecond=0)}) #df = pd.DataFrame ##if you're using unbatched pagination and wants to use single process cleaning with an integrated dataframe (ie not reading the entire printed JSON object), uncomment this section '''k=0 for j in data['items']: j.update({'page_num':self.page}) j.update({'rating_star': data['items'][k]['item_rating']['rating_star']}) j.update({'timestamp':pd.datetime.now().replace(microsecond=0)}) for m in range(0,6): j.update({'star_' + str(5-m): data['items'][k]['item_rating']['rating_count'][m]}) self.df = self.df.append(j, ignore_index = True) k+=1 #if 'rank' not in df: # df['rank'] = np.arange(len(data)) if 'category' not in self.df: self.df['category'] = self.category if 'subcategory' not in self.df: self.df['subcategory'] = self.subcategory if 'subsubcategory' not in self.df: self.df['subsubcategory'] = self.subsubcategory if 'platform' not in self.df: self.df['platform'] = 'shopee' if 'region' not in data: self.df['region'] =self.region if 'engine_ver' not in self.df: self.df['engine_ver'] = 'v0.4.2' #print(self.df)''' self.page += 1 self.attempts = 0
def parse_weibo(self, response): """解析网页中的微博信息""" keyword = response.meta.get('keyword') for sel in response.xpath("//div[@class='card-wrap']"): info = sel.xpath( "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']" ) if info: weibo = WeiboItem() weibo['id'] = sel.xpath('@mid').extract_first() weibo['bid'] = sel.xpath( '(.//p[@class="from"])[last()]/a[1]/@href').extract_first( ).split('/')[-1].split('?')[0] weibo['user_id'] = info[0].xpath( 'div[2]/a/@href').extract_first().split('?')[0].split( '/')[-1] weibo['screen_name'] = info[0].xpath( 'div[2]/a/@nick-name').extract_first() txt_sel = sel.xpath('.//p[@class="txt"]')[0] retweet_sel = sel.xpath('.//div[@class="card-comment"]') retweet_txt_sel = '' if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'): retweet_txt_sel = retweet_sel[0].xpath( './/p[@class="txt"]')[0] content_full = sel.xpath( './/p[@node-type="feed_list_content_full"]') is_long_weibo = False is_long_retweet = False if content_full: if not retweet_sel: txt_sel = content_full[0] is_long_weibo = True elif len(content_full) == 2: txt_sel = content_full[0] retweet_txt_sel = content_full[1] is_long_weibo = True is_long_retweet = True elif retweet_sel[0].xpath( './/p[@node-type="feed_list_content_full"]'): retweet_txt_sel = retweet_sel[0].xpath( './/p[@node-type="feed_list_content_full"]')[0] is_long_retweet = True else: txt_sel = content_full[0] is_long_weibo = True weibo['text'] = txt_sel.xpath( 'string(.)').extract_first().replace('\u200b', '').replace( '\ue627', '') weibo['article_url'] = self.get_article_url(txt_sel) weibo['location'] = self.get_location(txt_sel) if weibo['location']: weibo['text'] = weibo['text'].replace( '2' + weibo['location'], '') weibo['text'] = weibo['text'][2:].replace(' ', '') if is_long_weibo: weibo['text'] = weibo['text'][:-6] weibo['at_users'] = self.get_at_users(txt_sel) weibo['topics'] = self.get_topics(txt_sel) reposts_count = sel.xpath( './/a[@action-type="feed_list_forward"]/text()' ).extract_first() try: reposts_count = re.findall(r'\d+.*', reposts_count) except TypeError: print('cookie无效或已过期,请按照' 'https://github.com/dataabc/weibo-search#如何获取cookie' ' 获取cookie') raise CloseSpider() weibo['reposts_count'] = reposts_count[ 0] if reposts_count else '0' comments_count = sel.xpath( './/a[@action-type="feed_list_comment"]/text()' ).extract_first() comments_count = re.findall(r'\d+.*', comments_count) weibo['comments_count'] = comments_count[ 0] if comments_count else '0' attitudes_count = sel.xpath( '(.//a[@action-type="feed_list_like"])[last()]/em/text()' ).extract_first() weibo['attitudes_count'] = (attitudes_count if attitudes_count else '0') created_at = sel.xpath( '(.//p[@class="from"])[last()]/a[1]/text()').extract_first( ).replace(' ', '').replace('\n', '').split('前')[0] weibo['created_at'] = util.standardize_date(created_at) source = sel.xpath('(.//p[@class="from"])[last()]/a[2]/text()' ).extract_first() weibo['source'] = source if source else '' pics = '' is_exist_pic = sel.xpath( './/div[@class="media media-piclist"]') if is_exist_pic: pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract() pics = [pic[2:] for pic in pics] pics = [ re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics ] pics = ['http://' + pic for pic in pics] video_url = '' is_exist_video = sel.xpath( './/div[@class="thumbnail"]/a/@action-data') if is_exist_video: video_url = is_exist_video.extract_first() video_url = unquote( str(video_url)).split('video_src=//')[-1] video_url = 'http://' + video_url if not retweet_sel: 
weibo['pics'] = pics weibo['video_url'] = video_url else: weibo['pics'] = '' weibo['video_url'] = '' weibo['retweet_id'] = '' if retweet_sel and retweet_sel[0].xpath( './/div[@node-type="feed_list_forwardContent"]/a[1]'): retweet = WeiboItem() retweet['id'] = retweet_sel[0].xpath( './/a[@action-type="feed_list_like"]/@action-data' ).extract_first()[4:] retweet['bid'] = retweet_sel[0].xpath( './/p[@class="from"]/a/@href').extract_first().split( '/')[-1].split('?')[0] info = retweet_sel[0].xpath( './/div[@node-type="feed_list_forwardContent"]/a[1]' )[0] retweet['user_id'] = info.xpath( '@href').extract_first().split('/')[-1] retweet['screen_name'] = info.xpath( '@nick-name').extract_first() retweet['text'] = retweet_txt_sel.xpath( 'string(.)').extract_first().replace('\u200b', '').replace( '\ue627', '') retweet['article_url'] = self.get_article_url( retweet_txt_sel) retweet['location'] = self.get_location(retweet_txt_sel) if retweet['location']: retweet['text'] = retweet['text'].replace( '2' + retweet['location'], '') retweet['text'] = retweet['text'][2:].replace(' ', '') if is_long_retweet: retweet['text'] = retweet['text'][:-6] retweet['at_users'] = self.get_at_users(retweet_txt_sel) retweet['topics'] = self.get_topics(retweet_txt_sel) reposts_count = retweet_sel[0].xpath( './/ul[@class="act s-fr"]/li/a[1]/text()' ).extract_first() reposts_count = re.findall(r'\d+.*', reposts_count) retweet['reposts_count'] = reposts_count[ 0] if reposts_count else '0' comments_count = retweet_sel[0].xpath( './/ul[@class="act s-fr"]/li[2]/a[1]/text()' ).extract_first() comments_count = re.findall(r'\d+.*', comments_count) retweet['comments_count'] = comments_count[ 0] if comments_count else '0' attitudes_count = retweet_sel[0].xpath( './/a[@action-type="feed_list_like"]/em/text()' ).extract_first() retweet['attitudes_count'] = (attitudes_count if attitudes_count else '0') created_at = retweet_sel[0].xpath( './/p[@class="from"]/a[1]/text()').extract_first( ).replace(' ', '').replace('\n', '').split('前')[0] retweet['created_at'] = util.standardize_date(created_at) source = retweet_sel[0].xpath( './/p[@class="from"]/a[2]/text()').extract_first() retweet['source'] = source if source else '' retweet['pics'] = pics retweet['video_url'] = video_url retweet['retweet_id'] = '' yield {'weibo': retweet, 'keyword': keyword} weibo['retweet_id'] = retweet['id'] # print(weibo) yield {'weibo': weibo, 'keyword': keyword}
def parse_detail(self, response): if self.killed: raise CloseSpider("Spider already died.") if not response.body: self.error_count += 1 if self.error_count >= self.error_threshold: self.logger.error('[ JobPageRequestException ] {url}'.format(url=response.url.encode('utf-8'))) self.sqllogger.log_error_page( hash_code = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')), web_id = self.web_id, url = response.url.encode('utf-8'), meta = response.meta, html_path = html_path, crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status = 'FAILED', error_message= "Empty request's response" ) yield None return if self.use_proxy: proxy = choice(self.proxies) self.logger.info('[ JobPageRetry ] {url} with proxy {proxy}'.format(url=response.url.encode('utf-8'), proxy=proxy)) yield scrapy.Request(response.url, callback=self.parse_detail , meta={'proxy': proxy}) return else: self.logger.info('[ JobPageRetry ] {url}'.format(url=response.url.encode('utf-8'))) yield scrapy.Request(response.url, callback=self.parse_detail) return self.error_count = 0 try: html_path = self.html_path.format(dttm=datetime.now().strftime('%Y%m%d_%H%M%S')) with open(html_path, 'w') as f: f.write(response.text.encode('utf-8')) self.logger.info('[ HTMLArchived ] {url}'.format(url=response.url.encode('utf-8'))) except Exception as e: self.logger.error('[ HTMLArchiveException ] {url}'.format(url=response.url.encode('utf-8'))) try: ret = {} ret['company'] = response.xpath('.//h1[@itemprop="hiringOrganization"]/a/span/text()').extract_first() ret['pos'] = response.xpath('.//div[@class="job-detail-top col-xs-12"]/h2/a/text()').extract_first() ret['etype'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[0]) ret['loc'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[1]) ret['sal'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[2]) ret['hour'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[4]) ret['desc'] = '|'.join([i.strip() for i in response.xpath('.//div[@itemprop="responsibilities"]/text()').extract()]) ret['qual'] = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="skills"]').extract_first().split('\n')] if i]) ret['benef'] = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="incentives"]').extract_first().replace('<li>','\n').split('\n')] if i]) ret['pdate'] = self.convert_pdate(response.xpath('.//div[@itemprop="datePosted"]/text()').extract_first()) if ret['pdate'].split()[0].split('-')[0] == "2017": self.logger.info("[ JobEndReached ] 2017 reached") self.killed = 1 raise CloseSpider("2017 reached") for key in ret.keys(): if ret[key]: ret[key] = ret[key].strip().encode('utf-8') _hash = hash_dn(ret['desc'],ret['company']) #log result to MySQL try: self.sqllogger.log_crawled_page( hash_code = _hash, position = ret['pos'], employer = ret['company'], exp = '', salary = ret['sal'], location = ret['loc'], web_id = self.web_id, url = response.url.encode('utf-8'), meta = response.meta, html_path = html_path, crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S'), post_time = ret['pdate'], job_status = 'SUCCESS', error_message= '' ) self.logger.info('[ RDSLogged ] {url}'.format(url=response.url.encode('utf-8'))) except 
exc.IntegrityError as e: if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold: self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold") self.killed = 1 raise CloseSpider("Crawled record reached") elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold: self.repeat_count += 1 self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count) yield None return else: raise e self.repeat_count = 0 yield ret except CloseSpider as e: raise CloseSpider(e.message) except Exception as e: self.logger.error('[ JobDetailException ] {url} {html_path} {e}'.format(url=response.url.encode('utf-8'),html_path=html_path.encode('utf-8'),e=e)) self.sqllogger.log_error_page( hash_code = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')), web_id = self.web_id, url = response.url.encode('utf-8'), meta = response.meta, html_path = html_path, crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status = 'FAILED', error_message= e )
            website_url=website_url,
            website_key=json_key,
            settings=settings)
        #logging.info('结束网站爬虫'+json_key+':'+url_key+':'+website_urls[url_key]+'-'+time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()))
        wait = runner.join()
        wait.addBoth(lambda _: reactor.stop())
        # Block the process until the spider run has finished.
        reactor.run()
        #end_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
        #information = "开始爬虫时间:"+ begin_time + "\n爬虫结束时间: "+ end_time + " 凤凰类别数据爬虫完毕"
        #email_object.send_information(information,"完成凤凰类别数据爬虫通知",True)
        #print "通知成功"
        #end_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())  # end time
        #info_spider = ' begin at :'+begin_time+' end at :'+end_time
        #logging.info(info_spider)
        os._exit(0)
    except BaseException, error:
        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        information = "time: " + end_time + "错误:" + str(error) + '\n'
        email_object.send_information(information)
        logging.exception(error)
        raise CloseSpider('爬虫识别')
        os._exit(1)
    finally:
        read_json_file.changejson(settings['SPLIT_JSON_FILE'])
def parse_detail(self, response): if self.killed: raise CloseSpider("Spider already died.") if not response.body: self.error_count += 1 if self.error_count >= self.error_threshold: self.logger.error('[ JobPageRequestException ] {url}'.format( url=response.url.encode('utf-8'))) self.sqllogger.log_error_page( hash_code=hash_dn(response.url.encode('utf-8'), datetime.now().strftime('%Y%m%d%H%M%S')), web_id=self.web_id, url=response.url.encode('utf-8'), meta=response.meta, html_path=html_path, crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status='FAILED', error_message="Empty request's response") yield None return if self.use_proxy: proxy = choice(self.proxies) self.logger.info( '[ JobPageRetry ] {url} with proxy {proxy}'.format( url=response.url.encode('utf-8'), proxy=proxy)) yield scrapy.Request(response.url, callback=self.parse_detail, meta={'proxy': proxy}) return else: self.logger.info('[ JobPageRetry ] {url}'.format( url=response.url.encode('utf-8'))) yield scrapy.Request(response.url, callback=self.parse_detail) return self.error_count = 0 try: html_path = self.html_path.format( dttm=datetime.now().strftime('%Y%m%d_%H%M%S')) with open(html_path, 'w') as f: f.write(response.text.encode('utf-8')) self.logger.info('[ HTMLArchived ] {url}'.format( url=response.url.encode('utf-8'))) except Exception as e: self.logger.error('[ HTMLArchiveException ] {url}'.format( url=response.url.encode('utf-8'))) try: ret = {} head = {} row = response.xpath( '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/p|//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/ul' )[1:] topic = response.xpath( '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]//b/u/text()' ).extract() head['amnt'] = u'\u0e2d\u0e31\u0e15\u0e23\u0e32' head[ 'sal'] = u'\u0e40\u0e07\u0e34\u0e19\u0e40\u0e14\u0e37\u0e2d\u0e19' head[ 'benef'] = u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e34\u0e01\u0e32\u0e23' head[ 'req'] = u'\u0e04\u0e38\u0e13\u0e2a\u0e21\u0e1a\u0e31\u0e15\u0e34\u0e1c\u0e39\u0e49\u0e2a\u0e21\u0e31\u0e04\u0e23' head[ 'loc_det'] = u'\u0e2a\u0e16\u0e32\u0e19\u0e17\u0e35\u0e48\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e07\u0e32\u0e19' head['loc'] = u'\u0e08\u0e31\u0e07\u0e2b\u0e27\u0e31\u0e14' ret['pos'], ret['desc'] = [ self.clean_tag(x) for x in response.xpath( '//div[@class="w3-theme-l4"]/div').extract() ] ret['pdate'] = self.cdate[response.url] ret['company'] = self.comnm[response.url] del self.cdate[response.url] del self.comnm[response.url] ret['loc'] = '' ret['sal'] = '' for key in head.keys(): try: idx = topic.index(head[key]) except ValueError: continue ret[key] = '|'.join([ i for i in [ remove_tags(i) for i in row[idx].xpath('./text()|./li').extract() ] if i ]) if ret['pdate'].split()[-1] == "2560": self.killed += 1 raise CloseSpider("2017 reached") for key in ret.keys(): if ret[key]: ret[key] = ' '.join( ret[key].strip().split()).encode('utf-8') _hash = hash_dn(ret['desc'], ret['company']) try: self.sqllogger.log_crawled_page( hash_code=_hash, position=ret['pos'], employer=ret['company'], exp='', salary=ret['sal'], location=ret['loc'], web_id=self.web_id, url=response.url.encode('utf-8'), meta=response.meta, html_path=html_path, crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), post_time=ret['pdate'], job_status='SUCCESS', error_message='') self.logger.info('[ RDSLogged ] {url}'.format( url=response.url.encode('utf-8'))) except exc.IntegrityError as e: if e.orig.args[ 0] == 1062 and self.repeat_count >= self.repeat_threshold: self.logger.info( "[ JobEndReached ] 
crawled record reached exceeding threshold" ) self.killed = 1 raise CloseSpider("Crawled record reached") elif e.orig.args[ 0] == 1062 and self.repeat_count < self.repeat_threshold: self.repeat_count += 1 self.logger.info( "[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count) yield None return else: raise e self.repeat_count = 0 for key in ret.keys(): if not ret[key]: del ret[key] yield ret except CloseSpider as e: raise CloseSpider(e.message) except Exception as e: self.logger.error( '[ JobDetailException ] {url} {html_path} {e}'.format( url=response.url.encode('utf-8'), html_path=html_path.encode('utf-8'), e=e)) self.sqllogger.log_error_page( hash_code=hash_dn(response.url.encode('utf-8'), datetime.now().strftime('%Y%m%d%H%M%S')), web_id=self.web_id, url=response.url.encode('utf-8'), meta=response.meta, html_path=html_path, crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status='FAILED', error_message=e)
def process_response(self, request, response, spider):
    if response.status == 402:
        raise CloseSpider('402 proxy no use')
    else:
        return response
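# Added note: a downloader middleware like the process_response() above only runs if it is
# enabled in the project settings. The snippet below is a minimal sketch of that registration;
# the module path "myproject.middlewares.ProxyStatusMiddleware" and the priority value 543 are
# illustrative assumptions, not taken from the original project.
#
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyStatusMiddleware': 543,
}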
def parse(self, response):
    global ult
    datas = response.selector.xpath('//tr/td[@class="date"]/text()').extract()
    links = response.selector.xpath('//tr/td[5]/a/@href').extract()
    descs = response.selector.xpath('//tr/td[5]/a[@href]/text()').extract()
    for data, desc, link in zip(datas, descs, links):
        if desc.encode('utf-8') != ult and ult == '':
            with open('baseDos.txt', 'a+') as arq:
                arq.write(data.strip() + '\n')
                arq.write(desc.encode('utf-8') + '\n')
                arq.write(link + '\n\n')
                arq.close()
            i = 0
        elif desc.encode('utf-8') != ult:
            with open('aux.txt', 'a+') as arq:
                arq.write(data.strip() + '\n')
                arq.write(desc.encode('utf-8') + '\n')
                arq.write(link + '\n\n')
                arq.close()
            i = 1
        else:
            i = 0
            break
    if not i:
        if os.path.exists('aux.txt'):
            os.remove('baseDos.txt')
            os.rename('aux.txt', 'baseDos.txt')
        # HERE YOU PUT THE TOKEN OF YOUR PAGE ON FACEBOOK
        access_token = "HERE YOU PUT THE TOKEN OF YOUR PAGE ON FACEBOOK"
        api = facebook.GraphAPI(access_token)
        total = 0
        arq = open('baseDos.txt', 'r')
        linhas = arq.readlines()
        for i in linhas:
            if i == '\n':
                total = total + 1
        a = 0
        b = 3
        lista = []
        for i in range(total):
            xxx = linhas[a:b]
            lista.append(xxx)
            xxx = ''
            a = b + 1
            b = a + 3
        lista.reverse()
        for i in lista:
            x = ''.join(i)
            api.put_wall_post(x)
        raise CloseSpider('[+] BASE ATUALIZADA [+]')
    else:
        try:
            proxima_pagina = response.xpath(
                '//a[@href and contains(.,"next")]/@href').extract()[0]
            if proxima_pagina:
                yield scrapy.Request(url=proxima_pagina, callback=self.parse)
        except:
            pass
def twse_mining_Data_Parse(self, response):
    if (not (self.is_TPEX_open and self.is_TWSE_open)):
        print(self.se_status)
        pass
    else:
        local_Co_ids = []
        if (self.TPEX_First_Run):
            local_Co_ids = self.Co_ids
        else:
            local_Co_ids = self.possible_Co_ids_TWSE
        for data in response.xpath('body'):
            domain = urlParse.urlparse(response.url).hostname
            print('First RUN:', self.TWSE_First_Run)
            print('爬取開始')
            print(f'網域:{domain}')
            for co_id in local_Co_ids:
                self.not_manual_cancel = sg.one_line_progress_meter(
                    '目前爬取進度', self.current, self.isExist - 1, 'Stock',
                    '運行時請勿點擊視窗,顯示沒有回應請勿關閉,為正常現象。\nElapsed Time 為已運行時間\nTime Remaining 為剩餘時間\nEstimated Total Time 為估計完成時間',
                    no_titlebar=False,
                    orientation='h')
                if (not self.not_manual_cancel and self.current < self.isExist - 1):
                    Button = sg.popup_yes_no('是否取消?', '取消爬取')
                    if (Button == 'Yes'):
                        sg.popup('已手動取消!')
                        raise CloseSpider("使用者取消!")
                items = StockPrice_items()
                print('First RUN:', self.TWSE_First_Run)
                print(co_id)
                twse_get = ''
                twse_get = str(
                    response.xpath(f'//td[text()="{co_id}"]//text()').get())
                twse_co_name = str(
                    data.xpath(
                        f'//td[text()="{co_id}"]/following-sibling::td[1]//text()').get())
                print(twse_get)
                if (twse_get == 'None' and (not twse_co_name.isnumeric())):
                    if (self.TPEX_First_Run):
                        print(f'股號 {co_id} 不存在於交易所,可能為TPEX的股號,丟入至暫存中...')
                        self.possible_Co_ids_TPEX.append(co_id)
                        continue
                    else:
                        print(f'股號 {co_id} 不存在兩邊交易所,丟入到未存在股號中...')
                        self.noExist.append(co_id)
                        continue
                else:
                    print('TWSE GET ITEMS')
                    self.current += 1
                    twse_price = str(
                        data.xpath(
                            f'//td[text()="{co_id}"]/following-sibling::td[6]//text()').get())
                    twse_price = twse_price.replace(',', '')
                    print(twse_price)
                    if (self.is_number(twse_price)):
                        twse_price = float(twse_price)
                    else:
                        twse_price = None
                    items['CO_ID'] = str(co_id)
                    items['CO_SHORT_NAME'] = str(twse_co_name)
                    items['Price'] = twse_price
                    items['SUB_DATA_TYPE'] = 'TWSE'
                    items['SYear'] = str(self.Year)
                    items['SDate'] = str(self.Date)
                    items['DATA_TYPE'] = self.Type
                    yield (items)
        self.TWSE_First_Run = False
        yield scrapy.Request(self.se_urls[0],
                             callback=self.tpex_mining_Data_Parse,
                             dont_filter=True)
def process_item(self, response, spider):
    self.count += 1
    if (self.count == 5):
        print("======in test pipeline========")
        raise CloseSpider("in exception")
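# Added note: for the counting pipeline above to receive items it has to be enabled in the
# project settings; also note that Scrapy documents the hook as process_item(self, item, spider),
# so the "response" parameter above actually receives the item. The registration below is a
# minimal sketch; the module path "myproject.pipelines.TestCountPipeline" and the order value 300
# are illustrative assumptions.
#
# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.TestCountPipeline': 300,
}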
def parse(self, response): # def get_proxy(): # return requests.get("http://127.0.0.1:5010/get/").content # # def delete_proxy(proxy): # requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) #或者可以设置随机ip #不需要在这里设置,在retry中间件中设置即可 #轮询使用ip,假设有500可用ip,一分钟500个页面,对服务器来说相当于每台主机访问一页面 # while response.status == 403 or response.status == 302: # # print(response.status) # # print(response.meta) # # # delete_proxy(response.headers) # # #删除proxy # # # 获取proxy # # proxy = get_proxy() # # print("使用新代理:" + str(proxy)) # # #如果proxy_pool耗尽,暂时暂停爬虫或者更换目标网站,移动端或者wap,或者各大网站的cache # # response = scrapy.Request(url=response.url, meta={'proxy':'http://' + str(proxy)}) # # print(type(response)) # print("有respose") item = LearningItem() #爬取书名 #作者有联合作者,会和译者一样放在一个span里面,单个作者单独放在文本为 作者 的span 的后面的同级a节点,所以也要分类讨论 #或者作者无链接——不会,会有search #单个作者也会用一组嵌套的span括住 #翻译者的链接也是author,既然是爬取图书,就没有关系了,如果要研究翻译相关的话,主数据库有译者字段 def is_exist(item_argv, xpath1, **xpath2): # item[item_argv] = info.xpath(xpath1).extract().strip() try: item[item_argv] = info.xpath(xpath1).extract() except: print(str(item_argv) + "出错") item[item_argv] = '' if len(item[item_argv]) == 1: item[item_argv] = item[item_argv][0].strip() # if len(item[item_argv]) == 0 and item[item_argv] != '': # # item[item_argv] = '' # return item[item_argv][0].strip() if len(item[item_argv]) == 1 else item[item_argv] return item[item_argv] # try: #先确定豆瓣会出错的几种方式 #返回403 #返回200,但需登陆 #返回此应用出错 # print("尝试爬取") # except: # print() # print("被ban!!!!!!!!!!!!!") #只会停止其中一个协程,其他要逐渐停止,强行ctrl + z 会导致后面的链接被添加到filter中,以后都不会再被爬取 if response.status != 200: #不知道会不会将缺少 '/"的页面重定向到别的地方,导致状态码变为301,改next_page的代码 #shell后发现不会,重定向会直接返回200的response,服务器补全了后面的 / raise CloseSpider('强制停止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!') # time.sleep(600) # raise CloseSpider() # return ##这里写ADSL拨号或者换ip的逻辑 # print() # return print("此时的URL为:" + str(response.url)) # writer_link_list = [] # series_link_list = [] try: info = response.xpath(u'//*[@id="info"]')[0] except: raise CloseSpider("出现200以外的错误,此时的url为 %s" % response.url) #在这里一并处理了作者列表和翻译者列表 #判断有无作者 #判断有无翻译者 #翻译者以上的author link 的text 加入到作者列表中 #如无翻译者,则author link 的 text 默认为全是作者 #容易出错,比如出现个志愿者什么的,举例而已 #作者节点:作者节点的下一个同辈span节点的所有前同辈a节点,因为作者节点排第一,没有其他节点会影响它 #先确定是两种模式的哪一种 #直接写四种模式,用 a = b or c = d的写法,一句 #如果以某个字段为基准,比如出版社以上的a tag 为作者,以下为翻译者的话,当出版社字段不存在,就会出错,所以还是以自身为基准,爬虫会更具健壮性 #有冒号无嵌套 w_name1 = info.xpath( u'//span[./text()="作者:"]/following-sibling::span[1]/preceding-sibling::a' ) #有冒号有嵌套 w_name2 = info.xpath(u'//span[./text()="作者:"]/parent::span/a') #无冒号无嵌套 w_name3 = info.xpath( u'//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a' ) #无冒号有嵌套 w_name4 = info.xpath(u'//span[./text()=" 作者"]/parent::span/a') if w_name1: item['writers'] = w_name1.xpath("./text()").extract() item['writers_link'] = w_name1.xpath("./@href").extract() elif w_name2: item['writers'] = w_name2.xpath("./text()").extract() item['writers_link'] = w_name2.xpath("./@href").extract() elif w_name3: item['writers'] = w_name3.xpath("./text()").extract() item['writers_link'] = w_name3.xpath("./@href").extract() elif w_name4: item['writers'] = w_name4.xpath("./text()").extract() item['writers_link'] = w_name4.xpath("./@href").extract() else: item['writers'] = '' item['writers_link'] = '' #————————————————————————————————————————————————————————————————————————————————————————————————————————————————# #译者 # contains(@name,'na') #有冒号无嵌套 t_name1 = info.xpath( u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"search")]' ) #有冒号有嵌套 t_name2 = info.xpath( 
u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"author")]' ) #无冒号无嵌套 #选中属性中包含某个字符串的href #链接可以直接爬取了,但是中文字段还是要靠后续的处理和提取 #出错 #仍有问题,无法替换和正确拼接 # t_name3 = info.xpath(u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search") or contains(@href,"author")]') t_name3 = info.xpath( u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search")]' ) #无冒号有嵌套 t_name4 = info.xpath( u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"author")]' ) if t_name4: item['translators'] = t_name4.xpath("./text()").extract() item['translators_link'] = t_name4.xpath("./@href").extract() elif t_name3: item['translators'] = t_name3.xpath("./text()").extract() item['translators_link'] = t_name3.xpath("./@href").extract() elif t_name2: item['translators'] = t_name2.xpath("./text()").extract() item['translators_link'] = t_name2.xpath("./@href").extract() elif t_name1: item['translators'] = t_name1.xpath("./text()").extract() item['translators_link'] = t_name1.xpath("./@href").extract() else: item['translators'] = '' item['translators_link'] = '' #————————————————————————————————————————————————————————————————————————————————————————————————————————————————# item["publish"] = is_exist( "publish", u'//span[./text()="出版社:"]/following::text()[1]') item["publish_date"] = is_exist( "publish_date", u'//span[./text()="出版年:"]/following::text()[1]') item["pages"] = is_exist( "pages", u'//span[./text()="页数:"]/following::text()[1]') item["price"] = is_exist( "price", u'//span[./text()="定价:"]/following::text()[1]') item["binding"] = is_exist( "binding", u'//span[./text()="装帧:"]/following::text()[1]') item["ISBN"] = is_exist( "ISBN", u'//span[./text()="ISBN:"]/following::text()[1]') item["orgin_name"] = is_exist( "orgin_name", u'//span[./text()="原作名:"]/following::text()[1]') item["series"] = is_exist( "series", u'//span[./text()="丛书:"]/following::a[1]/text()') item["series_link"] = is_exist( "series_link", u'//span[./text()="丛书:"]/following-sibling::a[1]/@href') # item["summary"] = is_exist("summary",) # item["w_summary"] = is_exist("w_summary",) item["catalog"] = is_exist("catalog", '//*[contains(@id,"dir_")]/text()') item["tag"] = is_exist("tag", '//*[@id="db-tags-section"]/div/span/a/text()') item["series_info"] = is_exist( "series_info", '//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()' ) # item["readers"] = is_exist("readers",).extract().strip() # item["title"] = is_exist("title",).extract().strip() # item["url"] = is_exist("url",).extract().strip() # item["score"] = is_exist("score",).extract().strip() try: item['title'] = response.xpath( "//*[@id='wrapper']/h1/span/text()").extract_first() except: item['title'] = '' item['url'] = response.url.replace("https://book.douban.com/subject/", "").strip('/') try: item['score'] = response.css( '#interest_sectl > div > div.rating_self.clearfix > strong::text' ).extract_first().strip() if item['score'] == '': item['score'] = '0' except: item['score'] = '0' # try: # item['publish'] = info.xpath().extract_first().strip() # except: # item['publish'] = '' # try: # item['publish_date'] = info.xpath(u'//span[./text()="出版年:"]/following::text()[1]').extract_first().strip() # except: # item['publish_date'] = '' # try: # item['pages'] = info.xpath(u'//span[./text()="页数:"]/following::text()[1]').extract_first().strip() # except: # item['pages'] = '' # try: # item['price'] = info.xpath(u'//span[./text()="定价:"]/following::text()[1]').extract_first().strip() # except: # item['price'] = '' # try: # item['binding'] = 
info.xpath(u'//span[./text()="装帧:"]/following::text()[1]').extract_first().strip() # except: # item['binding'] = '' # try: # item['ISBN'] = info.xpath(u'//span[./text()="ISBN:"]/following::text()[1]').extract_first().strip() # except: # item['ISBN'] = '' # try: # item['orgin_name'] = info.xpath(u'//span[./text()="原作名:"]/following::text()[1]').extract_first().strip() # except: # item['orgin_name'] = '' # try: # item['series'] = info.xpath(u'//span[./text()="丛书:"]/following::a[1]/text()').extract_first().strip() # except: # item['series'] = '' # try: # item['series_link'] = info.xpath(u'//span[./text()="丛书:"]/following-sibling::a[1]/@href').extract_first().strip() # except: # item['series_link'] = '' #这里有两种情况,一种有折叠,一种没有,先提取包含折叠内容的,没有再提取另一个 try: summary = response.xpath( '//*[@id="link-report"]/span/div/div[@class="intro"]/p/text()') if summary: item['summary'] = summary.extract() else: item['summary'] = response.xpath( '//*[@id="link-report"]/div[1]/div/p/text()').extract() # if len(item['summary']) == 0 and item['summary'] != '': # # item['summary'] = '' except: item['summary'] = '' try: w_summary = response.css( '#content > div > div.article > div.related_info > div:nth-child(4) > span.all.hidden > div > p::text' ) if w_summary: item['w_summary'] = w_summary.extract() else: item['w_summary'] = response.css( '#content > div > div.article > div.related_info > div:nth-child(4) > span.short > div > p::text' ).extract() # if len(item['w_summary']) == 0 and item['w_summary'] != '': # # item['w_summary'] = '' except: item['w_summary'] = '' # try: # #出错 # # item['catalog'] = response.xpath('//*[contains(@id,"full") and contains(@id,"dir")]/text()').extract() # item['catalog'] = response.xpath('//*[contains(@id,"dir_")]/text()').extract() # except: # item['catalog'] = '' # try: # item['tag'] = response.xpath('//*[@id="db-tags-section"]/div/span/a/text()').extract() # except: # item['tag'] = '' # try: # #丛书信息会随机抽取 # item['series_info'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()').extract() # except: # item['series_info'] = '' try: item['readers'] = response.css( '#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span::text' ).extract_first() if item['readers'] is None: item['readers'] = '0' except: item['readers'] = '0' # '//*[@id="link-report"]/div[1]/div/p'/div/div[@class="intro"]/p/text() # if w_name_mode1: # # w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","") # w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()') # #如果能捕获作者名字,则写入,否则,为span嵌套模式 # if w_name: # item['writer'] = w_name.extract() # else: # item['writer'] = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()') # / # writer_name_type2 = links.xpath('//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","") # writer_name_type3 = # #单个作者节点已经完成,需要完成一组的作者节点,具体参考大学教材 # #一组作者节点同一组翻译者节点 # #翻译者节点:翻译者节点的下一个span节点 # #一组翻译者的已经解决,单个翻译者的参考傅雷 # # link_extract = item.extract() # if "author" in link: # # print(item.xpath('./@href').extract()) # #这里可以缩减 # writer_link_list.append(link) # #存储完整的网址,日后爬取可以少一个拼接网址的逻辑,加快爬取速度,硬盘开销不大 # if "search" in link: # link = "https://book.douban.com/" + link # writer_link_list.append(link) # if "series" in link: # series_link_list.append(link) # item['writer_link'] = writer_link_list # item['series_link'] = 
series_link_list # # item['writer'] = response.xpath(u'//span[./text()="作者:"]/following::a[2]') # # # // *[ @ id = "info"] / a[1] # # item['publish'] = response.xpath(u'//span[./text()="出版社:"]/following::text()[1]') # # item['orgin_name'] = response.xpath(u'//span[./text()="原作名:"]/following::text()[1]') # #这里只是其中一种情况,还有一种,要增加对应的try...except,以及中文图书没有翻译的问题,全半角符号的问题 # c = ""#单个翻译者 # try: # if a: # item['translator'] = a[0].xpath('./a/text()').extract() # if b: # item['translator'] = b[0].xpath('./a/text()').extract() # except: # item['translator'] = '' #有效评分人数 # if item['readers']: # v = int(item['readers']) # else: # v = 0 # #入选top250的最低人数 # m = 10000 # #书本得分 # if item['score']: # R = float(item['score']) # else: # R = 0 # # C是所有书本的得分平均分,都存在数据库中,取个大概值就行了 # C = 7 item["weighting"] = 0 item['seen'] = 0 yield item # item['p_date'] # item['total_pages'] # item['price'] # item['binding'] # item['series'] # item['ISBN'] # item['summary'] # item['w_introduce'] # item['ca'] # item['tag'] # item['s_info'] # item['score'] # item['readers'] # print(item['title']) # all = response.xpath("string(//*[@id='info'])") # all = # print(all.extract()) # print(all.extract()[0].replace("\n","")) # print(all.extract()[0].replace("\n","").replace(" ","")) # print(type(all.extract())) # yield item #id一般固定,可以忽略css的变化 #先不清洗,换取爬取的速度提升 # all = response.xpath('//*[@id="info"]') # all = all.extract()[0].replace("\n","").replace("\t","").split("<br>") # for item in all: # print(item.replace('<spanclass="pl">',"").replace("</span>","").replace("""<divid="info"class="">""","").replace("</div>","").replace("</a>","").replace("""<aclass=""href=""","").replace("<span>","").replace("<ahref=","")) # all = response.xpath(u'//span[./text()=" 作者"]/following::text()') # print(all) #mysql批量写入,不要每次写入 # #抽取"喜欢这本书的用户也喜欢"的链接 link = LinkExtractor( restrict_xpaths=('//*[@id="db-rec-section"]/div//dl//dd')) links = link.extract_links(response) #如果链接是直接相关的话,也可以用response.follow,会返回一个url实例,然后可以yield相关的url: # links = response.xpath('//*[@id="db-rec-section"]/div//dl//dd').extract() # for link in links: # yield response.follow(link,callback=self.parse) for link in links: # print("弹出一个url") # if link.url.endswith('/'): # pass # else: # link.url = link.url + "/" #没有"/"作为结尾的话,网址会重定向,不必要,但是可能是识别爬虫的依据 yield scrapy.Request(url=link.url, callback=self.parse)
def parse_item(self, response): if (self.count < int(self.limit)): item = MyItem() item['url'] = response.url p = r"^\S*article\/view\/\S*$" a = r"^(\s*Abstrak\s*$)|(^\s*Abstract\s*$)" if (re.match(p, item['url'])): journal = JournalItem() article = ArticleItem() references = ReferencesItem() author = AuthorItem() item['title'] = response.css('title::text').getall() dc = "//meta[@name='DC.{}']/@content" citation = "//meta[@name='citation_{}']/@content" author_name = response.xpath( dc.format('Creator.PersonalName')).extract() abstract = response.xpath( dc.format('Description')).extract_first() doi = response.xpath( dc.format('Identifier.DOI')).extract_first() issn = response.xpath(dc.format('Source.ISSN')).extract_first() issue = response.xpath( dc.format('Source.Issue')).extract_first() volume = response.xpath( dc.format('Source.Volume')).extract_first() title = response.xpath(dc.format('Title')).extract_first() uri = response.xpath( dc.format('Identifier.URI')).extract_first() journal_title = response.xpath( citation.format('journal_title')).extract_first() author_institution = response.xpath( citation.format('author_institution')).extract() date = response.xpath(citation.format('date')).extract_first() keyword = response.xpath( citation.format('keywords')).extract_first() pdf_uri = response.xpath( citation.format('pdf_url')).extract_first() language = response.xpath( citation.format('language')).extract_first() if not abstract: abstract = response.xpath( '//*[text()[re:test(., "{}")]]/parent::*//text()'. format(a)).extract() article['title'] = title article['abstract'] = abstract article['doi'] = doi article['uri'] = uri article['pdf_uri'] = pdf_uri article['publication_date'] = date article['keyword'] = keyword article['issn'] = issn article['language'] = language journal['title'] = journal_title journal['issn'] = issn journal['issue'] = issue journal['volume'] = volume author['name'] = author_name author['affiliate'] = author_institution #Match reference with regex pattern = "^(\s*References\s*$)|(^\s*Referensi\s*$)" pattern2 = r"^[a-zA-Z/[]|['__']{2}" pattern3 = r"\s?[a-zA-Z0-9\.\ ]{1}$" result = response.xpath( '//*[text()[re:test(., "{}")]]/parent::*//text()'.format( pattern)).extract() #Remove control character like \n,\t, etc. t = dict.fromkeys(range(32)) ref = [ x.translate(t) for x in result if x.translate(t) and x.translate(t) != "References" and x.translate(t) != "Referensi" and len(x) > 20 ] references['title'] = "" references['classification'] = "" if len(ref) > 0: data = pd.read_csv( '/home/bandreg/Skripsi/Program/JournalCrawler/scrapy_app/scrapy_app/spiders/data2.csv', index_col=None) vectorizer = CountVectorizer() X1 = vectorizer.fit_transform(data['Reference'].values) test = vectorizer.transform(ref) model = joblib.load( '/home/bandreg/Skripsi/Program/JournalCrawler/scrapy_app/scrapy_app/spiders/model.sav' ) result = model.predict(test) references['title'] = ref references['classification'] = result #Count item self.count += 1 yield { 'journal': journal, 'item': item, 'article': article, 'author': author, 'references': references } else: raise CloseSpider('limit reached')
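# The classification step above re-reads data2.csv, refits the CountVectorizer
# and reloads model.sav for every article that carries references, which is
# expensive. A hedged sketch of doing that work once and reusing it (class and
# attribute names are assumptions for illustration):
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

class ReferenceClassifier(object):
    """Fit the vectorizer and load the pickled model once per spider run."""

    def __init__(self, csv_path, model_path):
        data = pd.read_csv(csv_path, index_col=None)
        self.vectorizer = CountVectorizer()
        self.vectorizer.fit(data['Reference'].values)
        self.model = joblib.load(model_path)

    def classify(self, references):
        """Predict a label for each reference string."""
        return self.model.predict(self.vectorizer.transform(references))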
def parse_subscriptions_period_variants(self, response): if len(self.devices) < 1: self.log("[[ORANGECH]] No devices collected on previous steps. Stopping!") return self.current_step = 'PROCESS_PLANS_DEVICES' if not self._browser_load_page_with_tries(devices_url): self.errors.append("Failed to load page with PhantomJS: %s" % devices_url) raise CloseSpider("Failed to load page with PhantomJS: %s" % devices_url) # reset to SIM-only time.sleep(30) el = self._browser.find_element_by_xpath("//div[@class='product-item'][not(@id)]//button[contains(text(), 'Select')]") self._do_browser_action_tries(el.click) time.sleep(30) if not self._browser_load_page_with_tries(response.url): self.errors.append("Failed to load page with PhantomJS: %s" % response.url) raise CloseSpider("Failed to load page with PhantomJS: %s" % response.url) for self.current_period in ['12', '24']: if self.current_period not in self.processed_priceplans: self.processed_priceplans[self.current_period] = {} if len(self.priceplans) < 1: return self.log('[[ORANGECH]] Processing period: %s months' % self.current_period) drop_down_el = self._browser.find_element_by_xpath("//form[@id='form_subscription_length']//a[@class='select2-choice']") self._do_browser_action_tries(drop_down_el.click) el = self._browser.find_element_by_xpath("//ul[@id='select2-results-6']/li/div[contains(text(), '%s')]" % self.current_period) self._do_browser_action_tries(el.click) for plan_name_base in sorted(self.priceplans): # plan_formdata = self.priceplans_formdata[plan_name_base] self.log('[[ORANGECH]] Processing base price plan %s with period %s months' % (plan_name_base, self.current_period)) drop_down_el = self._browser.find_element_by_xpath("//form[@id='form_subscription_choice']//a[@class='select2-choice']") self._do_browser_action_tries(drop_down_el.click) el = self._browser.find_element_by_xpath("//ul[@id='select2-results-4']/li/div[contains(text(), '%s')]" % plan_name_base) self._do_browser_action_tries(el.click) if plan_name_base not in self.processed_priceplans[self.current_period]: self.processed_priceplans[self.current_period][plan_name_base] = set() for i, variant in enumerate(sorted(self.priceplans_variants[plan_name_base])): grouped_key = ";".join(["%s:%s" % (key, variant[key]) for key in sorted(variant.keys())]) if grouped_key in self.processed_priceplans[self.current_period][plan_name_base]: continue if 'young' in plan_name_base.lower(): plan_name = plan_name_base[:] for key, value in variant.items(): if 'Young' in self.priceplans[plan_name_base][key][value]['name']: plan_name = plan_name + ' ' + self.priceplans[plan_name_base][key][value]['name'].replace("Orange Young", "").replace("Young", "").strip() for key, value in variant.items(): if 'Young' not in self.priceplans[plan_name_base][key][value]['name']: plan_name = plan_name + ', ' + self.priceplans[plan_name_base][key][value]['name'] else: plan_name = plan_name_base + " " + ", ".join([self.priceplans[plan_name_base][key][variant[key]]['name'] for key in sorted(variant.keys())]) price = sum([int(self.priceplans[plan_name_base][key][variant[key]]['price']) for key in variant.keys()]) meta = { 'plan_name_base': plan_name_base, 'grouped_key': grouped_key, 'plan_name': plan_name, 'per_month': price } self.log('[[ORANGECH]] Selecting price plan %s with period %s months' % (plan_name, self.current_period)) for key, value in variant.items(): el = self._browser.find_element_by_xpath("//input[@name='%s'][@value='%s']" % (key, value)) self._do_browser_action_tries(el.click) time.sleep(5) 
self.log('[[ORANGECH]] Clicking period again: %s months' % self.current_period) el = self._browser.find_element_by_xpath("//select[@name='contract_length']/option[@value='%s']" % self.current_period) if not el.is_selected(): self._do_browser_action_tries(el.click) time.sleep(5) self.log('[[ORANGECH]] Loading device prices for price plan: %s, %s months' % (plan_name, self.current_period)) # time.sleep(30) if not self._browser_load_page_with_tries(devices_url): self.errors.append("Failed to load page with PhantomJS: %s" % devices_url) raise CloseSpider("Failed to load page with PhantomJS: %s" % devices_url) hxs = HtmlXPathSelector(text=self._browser.page_source) for item in self.parse_device_prices_for_priceplan(hxs, meta): yield item self.processed_priceplans[self.current_period][plan_name_base].add(grouped_key) if not self._browser_load_page_with_tries(subscriptions_url): self.errors.append("Failed to load page with PhantomJS: %s" % subscriptions_url) raise CloseSpider("Failed to load page with PhantomJS: %s" % subscriptions_url)
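# The browser steps above rely on fixed time.sleep() pauses between clicks.
# A hedged sketch of the same interaction with Selenium explicit waits, which
# return as soon as the element is actually clickable (helper name and the
# 30-second timeout are assumptions):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def click_when_clickable(browser, xpath, timeout=30):
    """Wait for the element located by xpath to become clickable, then click it."""
    element = WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    element.click()
    return element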
def parse_item_page(self, response): if self.close_down: raise CloseSpider() item = IpropertyItem() item['url'] = response.url item['scraped_date'] = time.strftime("%Y-%m-%d %H:%M:%S") # categories # pre-filled with None for x in xrange(1, 7): item['cat_{}'.format(x)] = None categories = [ x for x in response.css("div.breadcrumbs-ld a::text").extract() if x != 'Home' ] for index, cat in enumerate(categories): if index > 5: raise CloseSpider("Category tree too long: {}".format( ','.join(categories))) item['cat_{}'.format(index + 1)] = cat # unique ID result = re.search(r'.+-(\d+)$', response.url) if result: item['unique_id'] = result.group(1) # title item['title'] = next( iter(response.css("h1.main-title::text").extract()), '') if item['title'][-3:] == '...': item['title'] = next(iter(response.css("title ::text").extract()), '') # price item['price'] = next(iter(response.css("h2.price::text").extract()), '').replace('RM', '').replace(',', '').strip() # address item['address'] = next( iter(response.css(".building-info-one h2::attr(title)").extract()), '') # item details details = {} for d in response.css("ul.infos>li::text").extract(): if ':' not in d: details.setdefault('facility', []).append(d.strip()) else: splitted = d.split(' : ') if len(splitted) == 2: details[splitted[0].strip()] = splitted[1].strip() # bedroom if 'Bedrooms' in details: item['bedroom'] = details['Bedrooms'] else: item['bedroom'] = next( iter( response.css( ".ld_mis_detail p.room span.bedroom::attr(title)"). extract()), '').replace('Bedrooms', '').strip() # bathroom if 'Bathrooms' in details: item['bathroom'] = details['Bathrooms'] else: item['bathroom'] = next( iter( response.css( ".ld_mis_detail p.room span.bathroom::attr(title)"). extract()), '').replace('Bathrooms', '').strip() item['carpark'] = next( iter( response.css(".ld_mis_detail p.room span.garage::attr(title)"). extract()), '').replace('Car parks', '').strip() item['agent_name'] = next( iter(response.css("#agent-info .name a::text").extract()), '') item['agent_url'] = next( iter(response.css("#agent-info .name a::text").extract()), '') item['agent_phone'] = next( iter(response.css("#agentPhone::attr(value)").extract()), '') item['images'] = list( set(response.css("ul.gallery a::attr(href)").extract())) item['property_type'] = details.get('Property Type:', '') item['tenure'] = details.get('Tenure', '') item['land_area'] = details.get('Land Area', '') item['builtup'] = details.get('Built-Up', '') item['occupancy'] = details.get('Occupancy', '') item['furnishing'] = details.get('Furnishing', '') item['posted_date'] = details.get('Posted Date', '') item['facing_direction'] = details.get('Facing Direction', '') item['facility'] = details.get('facility', []) item['description'] = ' '.join([x for x in response.css("div.detail-info-wide ::text").extract() if x.strip() != ''])\ .replace("\n", ' ').replace("\r", " ").replace(" ", " ") # expired expired = False for tag in response.css("h6 ::text").extract(): if 'expired listing' in tag.lower(): expired = True break item['expired'] = expired yield item
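# Two observations on parse_item_page() above, with hedged sketches.
# First, agent_url is filled from the same "a::text" query as agent_name, so it
# ends up holding the agent's name rather than a link; the href attribute is
# presumably what was intended:
#     item['agent_url'] = response.css("#agent-info .name a::attr(href)").extract_first(default='')
# Second, the repeated next(iter(...extract()), '') idiom is equivalent to
# Scrapy's built-in default handling (css_first is an assumed helper name):
def css_first(response, query, default=''):
    """Shorthand for the next(iter(response.css(query).extract()), default) pattern."""
    return response.css(query).extract_first(default=default)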
def parse(self, response): # Parse articles flux_state_script = response \ .xpath("//script[contains(., 'window.FLUX_STATE')]/text()") if not flux_state_script: raise CloseSpider(reason='FLUX_STATE not found') flux_state_json = flux_state_script.extract_first()[20:] flux_state = json.loads(flux_state_json) articles = flux_state['adSearch']['data']['ads'] print(articles) for article in articles: yield { 'search': self.search_id, 'url': article['url'], 'original_id': article['list_id'], 'title': article['subject'], 'description': article['body'], 'price': article['price'][0], 'charges_included': LeboncoinSpider.get_attribute(article, 'charges_included', lambda x: bool(int(x))), 'publication_date': self.get_publication_date(article), 'real_estate_type': LeboncoinSpider.get_attribute(article, 'real_estate_type', None, None, True), 'rooms': LeboncoinSpider.get_attribute(article, 'rooms', int), 'furnished': LeboncoinSpider.get_attribute(article, 'furnished', lambda x: bool(int(x))), 'surface': LeboncoinSpider.get_attribute(article, 'square', int), 'images': LeboncoinSpider.get_images(article), 'zipcode': article['location']['zipcode'], 'city': article['location']['city'], 'ges': LeboncoinSpider.get_attribute(article, 'ges'), 'energy_rate': LeboncoinSpider.get_attribute(article, 'energy_rate'), } # Follow pagination (max=nbr_of_pages) if self.cur_nbr_of_pages < self.nbr_of_pages: self.cur_nbr_of_pages += 1 next_url = '{}/p-{}'.format(self.start_urls[0], self.cur_nbr_of_pages) yield response.follow(next_url, self.parse)
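# flux_state_script.extract_first()[20:] assumes the script text starts with
# exactly "window.FLUX_STATE = " (20 characters). A hedged sketch that pulls
# the JSON out with a regex instead, so extra whitespace or a trailing
# semicolon does not break parsing (extract_flux_state is an assumed name):
import json
import re

def extract_flux_state(script_text):
    """Return the object assigned to window.FLUX_STATE, or None if absent."""
    match = re.search(r'window\.FLUX_STATE\s*=\s*(\{.*\})\s*;?\s*$', script_text, re.S)
    if match is None:
        return None
    return json.loads(match.group(1))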
def you_get(self): command = ['you-get', '--json', self.start_urls[0]] print command stdout, stderr = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() print 'stdout', stdout, 'stderr', stderr if len(stdout) < 2: return False logger.info('[you-get]' + '[uuid]' + self.uuid) video = json.loads(stdout) if 'streams' not in video: return False title = video['title'] srcs = [] for key in video['streams'].keys(): print key if 'src' in video['streams'][key]: srcs = video['streams'][key]['src'] print srcs break concatfile = 'cache/' + self.uuid + '.txt' mp4file = 'cache/' + self.uuid + '.mp4' for idx, src in enumerate(srcs): src_path = 'cache/' + self.uuid + '_' + str(idx) + '.mp4' _, success = service.utils.download_file(src, src_path) if not success: return False open(concatfile, 'a+').write('file ' + string.replace(src_path, 'cache/', '') + "\n") length = service.utils.mergeVideo(mp4file, concatfile) print '[merged video duration]', length if length == 0: return False filesize = os.path.getsize(mp4file) endpoint, backet, obj = service.utils.paseUploadUrl(self.upload_url) print endpoint, backet, obj uploadResult = service.utils.uploadVideo(mp4file, endpoint, backet, obj) print 'uploadResult:', uploadResult if not uploadResult: return False logger.warn('[uploadVideo]' + '[uuid]' + self.uuid) data = { "video_id": self.uuid, "state": 1, "message": u'成功', "length": length, "play_id": self.uuid, "size": filesize, "cover": '', "title": title } self.callbacked = service.utils.callback_result(self.callback, data=data) logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid) video_data = { 'title': title, 'video_id': self.video_id, 'author': self.name, 'publish': time.strftime('%Y-%m-%d %H:%M:%S'), 'page_url': self.start_urls[0], 'video_length': length, 'video_size': filesize, 'video_url': '', 'easub_uuid': self.uuid } self.db.save_video(video_data) raise CloseSpider('finished')
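# The you-get invocation above checks only the length of stdout. A hedged
# Python 3 sketch of the same call that also checks the exit code
# (run_you_get_json is an assumed name):
import json
import subprocess

def run_you_get_json(url):
    """Run `you-get --json <url>` and return the parsed metadata, or None."""
    proc = subprocess.run(['you-get', '--json', url],
                          capture_output=True, text=True)
    if proc.returncode != 0 or len(proc.stdout) < 2:
        return None
    return json.loads(proc.stdout)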
def process_item(self, item, spider): if isinstance(item, Huangye88KunmingItem): # sql = """insert into kuchuan_all(id, app_package, down, trend) VALUES(%s, %s, %s, %s) ON DUPLICATE KEY UPDATE app_package=VALUES(app_package), down=VALUES(down), down=VALUES(trend)""" sql = """insert into jianjie_huangye88_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, Huangye88LiuzhouItem): sql = """insert into jianjie_huangye88_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, ShunqiLiuzhouItem): sql = """insert into jianjie_shunqi_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, ShunqiKunmingItem): sql = """insert into jianjie_shunqi_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, MinglujiLiuzhouItem): sql = """insert into jianjie_mingluji_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, MinglujiKunmingItem): sql = """insert into jianjie_mingluji_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, ShunqiAllItem): sql = """insert into jianjie_shunqi_all (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['city'] ] # print(str(item['comp_url']) + ' ' + str(item['comp_name'])) # if len(self.item_list) == 500: # sql = """insert into jianjie_shunqi_all_copy (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)""" # self.cursor.executemany(sql, self.item_list) # self.conn.commit() # self.item_list.clear() # print('200 insert') # else: # self.item_list.append([item['comp_url'], item['comp_name'], item['intro'], item['city']]) elif isinstance(item, Huangye88AllItem): sql = """insert into jianjie_huangye88_all (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['city'] ] elif isinstance(item, Huangye88AotuItem): sql = """insert into jianjie_huangye88_aotu (comp_url, comp_name, intro, posi, shengshi, cat) VALUES(%s, %s, %s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['posi'], item['shengshi'], item['cat'] ] elif isinstance(item, WuyouAllItem): sql = """insert into jianjie_wuyou_all (comp_url, comp_name, intro, area) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['area'] ] elif isinstance(item, huang114AllItem): sql = """insert into jianjie_114_all_copy (comp_url, comp_name, link_man, tel, email, addr, intro) VALUES(%s, %s, %s, %s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['link_man'], item['tel'], item['email'], item['addr'], item['intro'] ] elif isinstance(item, ZhizaoAllItem): sql = """insert into jianjie_zhizao_all (comp_url, comp_name, addr, intro) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['addr'], item['intro'] ] elif isinstance(item, Ca800Item): sql = """insert into jianjie_ca800_all (comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['cat_url'], item['cat'], item['loc'], 
item['sheng'], item['shi'], item['intro'] ] elif isinstance(item, JiqirenItem): sql = """insert into jianjie_jiqiren_all (zhuying, comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)""" args = [ item['zhuying'], item['comp_url'], item['comp_name'], item['cat_url'], item['cat'], item['loc'], item['sheng'], item['shi'], item['intro'] ] elif isinstance(item, JiqirenItem): sql = """insert into ChuanItem (zhuying, comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)""" args = [ item['zhuying'], item['comp_url'], item['comp_name'], item['cat_url'], item['cat'], item['loc'], item['sheng'], item['shi'], item['intro'] ] else: raise CloseSpider('no item match...') try: self.cursor.execute(sql, args) self.conn.commit() # print(str(item['comp_url']) + ' ' + str(item['comp_name'])) except pymysql.err.InterfaceError: print('reconnect mysql...') time.sleep(3) self.__init__() self.process_item(item, spider)
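# The isinstance() chain above repeats the same insert logic per item class and
# contains a second `elif isinstance(item, JiqirenItem)` branch that can never
# run, because the first JiqirenItem branch already matches. A hedged sketch of
# the same dispatch driven by a lookup table, which makes such duplicates
# impossible (TABLE_MAP and build_insert are assumed names; only a few entries
# are shown, and the item classes are the ones imported for the pipeline above):
TABLE_MAP = {
    Huangye88KunmingItem: ('jianjie_huangye88_kunming', ['comp_url', 'comp_name', 'intro']),
    Huangye88LiuzhouItem: ('jianjie_huangye88_liuzhou', ['comp_url', 'comp_name', 'intro']),
    ShunqiAllItem: ('jianjie_shunqi_all', ['comp_url', 'comp_name', 'intro', 'city']),
    # ...remaining item classes follow the same (table, columns) pattern
}

def build_insert(item):
    """Return (sql, args) for the table registered for this item's class."""
    for item_cls, (table, columns) in TABLE_MAP.items():
        if isinstance(item, item_cls):
            placeholders = ', '.join(['%s'] * len(columns))
            sql = 'insert into {} ({}) VALUES({})'.format(
                table, ', '.join(columns), placeholders)
            return sql, [item[col] for col in columns]
    raise CloseSpider('no item match...')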
def hooks(self, d): if d['status'] == 'finished': filename = d['filename'] l = filename.split('.') ext = l[len(l) - 1] print ext jsonfile = string.replace(filename, ext, 'info.json') info = json.loads(open(jsonfile).read()) outpath = 'cache/' + self.uuid + '_.mp4' length = service.utils.coverterMp4(filename, outpath) print length, outpath if not length: logger.error('error trancode mp4' + self.uuid) raise CloseSpider('covert failed') total_bytes = os.path.getsize(outpath) endpoint, backet, obj = service.utils.paseUploadUrl( self.upload_url) print endpoint, backet, obj result = service.utils.uploadVideo(outpath, endpoint, backet, obj) # os.remove('cache/' + info['id'] + '*') if not result: self.logger.error('upload video error', self.uuid) raise CloseSpider('upload oss failed') print 'easub_uuid', result if 'thumbnail' in info: cover = service.utils.get_clip_cover_url( info['thumbnail'], self.uuid) else: cover = '' data = { "video_id": self.uuid, "state": 1, "message": u'成功', "length": length, "play_id": self.uuid, "size": total_bytes, "cover": cover, "title": info['title'] } self.callbacked = service.utils.callback_result(self.callback, data=data) logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid) video_data = { 'title': info['title'], 'video_id': self.video_id, 'author': info['extractor'], 'publish': time.strftime('%Y-%m-%d %H:%M:%S'), 'page_url': info['webpage_url'], 'video_length': length, 'video_size': total_bytes, 'video_url': '', 'easub_uuid': self.uuid, 'cover': cover } self.db.save_video(video_data) if d['status'] == 'error': print 'error', d['filename'] raise CloseSpider('download failed')
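# hooks() above is a youtube-dl progress hook: it receives a dict carrying at
# least 'status' and 'filename'. A hedged sketch of how such a hook is
# registered (the options and example URL are illustrative, not taken from the
# spider's actual configuration):
import youtube_dl

ydl_opts = {
    'outtmpl': 'cache/%(id)s.%(ext)s',
    'writeinfojson': True,               # produces the .info.json read in hooks()
    'progress_hooks': [lambda d: None],  # the spider would pass self.hooks here
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/some-video'])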
def parse(self, response):
    print 'parsePlayurl', response.url
    try:
        video_id = self._match_id(self.start_urls[0])
    except AssertionError, e:
        raise CloseSpider('link not supported')
def get_detail_post(self, response): """ get detail post :param response: :return: """ if self.close_down: raise CloseSpider('OVER NUMBER_POST') post_title = response.meta['post_title'] post_link = response.meta['post_link'] author = response.xpath( '//div[@class="details__author"]//a/img/@alt').extract_first() public_date = response.xpath( '//div[@class="details__meta"]/div[@class="meta"]/time/text()' ).extract_first() public_date = datetime.strptime(public_date, '%H:%M - %d/%m/%Y') public_date = public_date.timestamp() * 1000 div_body = response.xpath('//div[@class="pswp-content"]') arr_summary = div_body.xpath('//div[@class="sapo"]//text()').extract() summary = '' for i in arr_summary: i = re.sub('\s\s+', ' ', i) summary += i summary = summary.strip() div_content = div_body.xpath('//div[@class="cms-body detail"]/div/div') content = '' for _ in div_content: arr_content = _.xpath('//text()').extract() for i in arr_content: i = re.sub('\s\s+', ' ', i) content += i.strip() tag = '' try: div_tag = response.xpath('//div[@class="details__tags"]/a') for _ in div_tag: str_tag = _.xpath('//text()').extract_first() tag = str_tag.strip('') + '/' except: pass id_picture = str(uuid.uuid1()) + str(uuid.uuid1()) item = CrawlNewsItem() item_image = ImageItem() item['tbl_tag'] = 'tbl_news' item['id_picture'] = id_picture if 'source_title' in self.arr_detail: item['source_title'] = 'thanh nien' if 'source_link' in self.arr_detail: item['source_link'] = 'https://thanhnien.vn/' if 'category_title' in self.arr_detail: item['category_title'] = self.category_title if 'category_link' in self.arr_detail: item['category_link'] = self.category_link if 'post_title' in self.arr_detail: item['post_title'] = post_title if 'post_link' in self.arr_detail: item['post_link'] = post_link if 'sumary' in self.arr_detail: item['sumary'] = summary if 'content' in self.arr_detail: item['content'] = content if 'author' in self.arr_detail: item['author'] = author or '' if 'update_time' in self.arr_detail: item['update_time'] = int(round(time.time() * 1000)) if 'public_date' in self.arr_detail: item['public_date'] = public_date if 'tag' in self.arr_detail: item['tag'] = tag yield item arr_image = div_body.xpath('//img/@src').extract() arr_image = list(set(arr_image)) for i in arr_image: if i.find('https://image.thanhnien.vn') == 0: item_image['tbl_tag'] = 'tbl_images' item_image['id_picture'] = id_picture item_image['image'] = i yield item_image
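# In the tag loop above, `tag = str_tag.strip('') + '/'` overwrites `tag` on
# every pass (and strip('') removes nothing), so at most one tag survives;
# `_.xpath('//text()')` is also an absolute query, so it matches from the
# document root rather than the current <a>. A hedged sketch that gathers every
# tag text into one '/'-separated string (extract_tags is an assumed name):
def extract_tags(response):
    """Collect all tag texts from the details__tags block."""
    texts = response.xpath('//div[@class="details__tags"]/a//text()').extract()
    return '/'.join(t.strip() for t in texts if t.strip())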
def parse_page(self, response): ''' Parse the given page selecting the posts. Then ask recursively for another page. ''' # #open page in browser for debug # from scrapy.utils.response import open_in_browser # open_in_browser(response) #select all posts for post in response.xpath( "//div[contains(@data-ft,'top_level_post_id')]"): many_features = post.xpath('./@data-ft').get() date = [] date.append(many_features) date = parse_date(date, {'lang': self.lang}) current_date = datetime.strptime( date, '%Y-%m-%d %H:%M:%S') if date is not None else date if current_date is None: date_string = post.xpath('.//abbr/text()').get() date = parse_date2([date_string], {'lang': self.lang}) current_date = datetime(date.year, date.month, date.day) if date is not None else date date = str(date) #if 'date' argument is reached stop crawling if self.date > current_date: raise CloseSpider('Reached date: {}'.format(self.date)) new = ItemLoader(item=FbcrawlItem(), selector=post) if abs(self.count) + 1 > self.max: raise CloseSpider( 'Reached max num of post: {}. Crawling finished'.format( abs(self.count))) self.logger.info('Parsing post n = {}, post_date = {}'.format( abs(self.count) + 1, date)) new.add_xpath('comments', './div[2]/div[2]/a[1]/text()') new.add_value('date', date) new.add_xpath('post_id', './@data-ft') new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") #page_url #new.add_value('url',response.url) #returns full post-link in a list post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() temp_post = response.urljoin(post[0]) self.count -= 1 yield scrapy.Request(temp_post, self.parse_post, priority=self.count, meta={'item': new}) #load following page, try to click on "more" #after few pages have been scraped, the "more" link might disappears #if not present look for the highest year not parsed yet #click once on the year and go back to clicking "more" #new_page is different for groups if self.group == 1: new_page = response.xpath( "//div[contains(@id,'stories_container')]/div[2]/a/@href" ).extract() else: new_page = response.xpath( "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href" ).extract() #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ if not new_page: self.logger.info( '[!] "more" link not found, will look for a "year" link') #self.k is the year link that we look for if response.meta['flag'] == self.k and self.k >= self.year: xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() if new_page: new_page = response.urljoin(new_page[0]) self.k -= 1 self.logger.info( 'Found a link for year "{}", new_page = {}'.format( self.k, new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: while not new_page: #sometimes the years are skipped this handles small year gaps self.logger.info( 'Link not found for year {}, trying with previous year {}' .format(self.k, self.k - 1)) self.k -= 1 if self.k < self.year: raise CloseSpider( 'Reached date: {}. 
Crawling finished'.format( self.date)) xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() self.logger.info( 'Found a link for year "{}", new_page = {}'.format( self.k, new_page)) new_page = response.urljoin(new_page[0]) self.k -= 1 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: self.logger.info('Crawling has finished with no errors!') else: new_page = response.urljoin(new_page[0]) if 'flag' in response.meta: self.logger.info( 'Page scraped, clicking on "more"! new_page = {}'.format( new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': response.meta['flag']}) else: self.logger.info( 'First page scraped, clicking on "more"! new_page = {}'. format(new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k})
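# The year-fallback branch above rebuilds the same XPath string in two places.
# A hedged sketch of that lookup as one helper (find_year_link is an assumed
# name; the XPath is the one already used above):
def find_year_link(response, year):
    """Return the absolute URL of the 'year' pagination link, or None."""
    xpath = ("//div/a[contains(@href,'time') and contains(text(),'{}')]/@href"
             .format(year))
    href = response.xpath(xpath).extract_first()
    return response.urljoin(href) if href else None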
def start_requests(self): with open('config.json', 'r') as f: data = json.load(f) for i in data.items(): if i[0] == self.name: self.config.append(i) print(i[0]) f.close() for v in self.config: if len(v[1]) == 1: self.Index_Url = v[1][0]['Index_Url'] print( "At Time %s : 爬虫开始爬取层数为1的页面Title = %s , Index_Url = %s " % (time.ctime(), v[0], self.Index_Url), file=self.log) Max_Page = v[1][0]['Max_Page'] Final_Url = v[1][0]['Final_Url'] One_Xpath = v[1][0]['One_Xpath'] if Max_Page: headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36" } response = requests.get(self.Index_Url, headers=headers) soup = BeautifulSoup(response.content, "lxml") result = str(soup.select(Max_Page['soup'])) pageNums = re.search(Max_Page['re'], result).group() if Final_Url: url = re.sub(Final_Url, "{limit}", self.Index_Url) real_url = url.format(limit=pageNums) else: real_url = self.Index_Url request = Request(real_url, callback=self.parse) request.meta['One_Xpath'] = One_Xpath yield request if len(v[1]) == 2: self.Index_Url = v[1][0]['Index_Url'] print( "At Time %s : 爬虫开始爬取层数为2的页面Title = %s , Index_Url = %s " % (time.ctime(), v[0], self.Index_Url), file=self.log) print("!!!!!!!!!!!!!!!!!!!!!!!!!Index_Url = %s" % self.Index_Url) Max_Page = v[1][0]['Max_Page'] #Head_Url = v[1][0]['Head_Url'] Post_Data = v[1][0]['Post_Data'] Two_Xpath = v[1][1]['Two_Xpath'] headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36" } response = requests.get(self.Index_Url, headers=headers) soup = BeautifulSoup(response.content, "lxml") result = str(soup.select(Max_Page['soup'])) pageNums = re.search(Max_Page['re'], result).group() #urls = re.sub(Head_Url,"%s",self.Index_Url) if Post_Data: self.flag = 1 urls = get_HeadUrl(self.Index_Url, self.flag) if urls == -1: raise CloseSpider( "______________________________ 构造url失败,爬取结束,请查看日志!_____________________________" ) postdata = "" if Post_Data: keys = list(Post_Data.keys()) for key in keys: if Post_Data[key]: if re.search(Post_Data[key], str(soup)): postdata += (key + "=" + str( (re.search(Post_Data[key], str(soup)).group())).replace( "\"", "") + "&") else: postdata += (key + "=" + Post_Data[key] + "&") else: postdata += (key + "={page}&") if not postdata: urls = urls.replace("%s", "{page}") else: urls = urls % postdata for i in range(1, int(pageNums)): url = urls.format(page=str(i)) request = Request(url, callback=self.parse) request.meta['Two_Xpath'] = Two_Xpath yield request elif len(v[1]) == 3: self.Index_Url = v[1][0]['Index_Url'] print( "At Time %s : 爬虫开始爬取层数为3的页面Title = %s , Index_Url = %s " % (time.ctime(), v[0], self.Index_Url), file=self.log) Max_Page = v[1][0]['Max_Page'] #Head_Url = v[1][0]['Head_Url'] Post_Data = v[1][0]['Post_Data'] Valid_Url = v[1][1]['Valid_Url'] Three_Xpath = v[1][2]['Three_Xpath'] headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36" } response = requests.get(self.Index_Url, headers=headers) soup = BeautifulSoup(response.content, "lxml") result = str(soup.select(Max_Page['soup'])) pageNums = re.search(Max_Page['re'], result).group() #urls = re.sub(Head_Url,"%s",self.Index_Url) print("最大页数是:%s" % pageNums) if Post_Data: self.flag = 1 urls = get_HeadUrl(self.Index_Url, self.flag) if urls == -1: raise CloseSpider( "______________________________ 构造url失败,爬取结束,请查看日志!_____________________________" ) #print urls postdata = "" if Post_Data: 
keys = list(Post_Data.keys()) for key in keys: if Post_Data[key]: if re.search(Post_Data[key], str(soup)): postdata += (key + "=" + quote_plus( (re.search(Post_Data[key], str(soup)).group()).replace( '"', "")) + "&") else: postdata += (key + "=" + Post_Data[key] + "&") else: postdata += (key + "={page}&") if not postdata: urls = urls.replace("%s", "{page}") else: urls = urls % postdata for i in range(1, int(pageNums)): url = urls.format(page=str(i)) request = Request(url, callback=self.parse_first) request.meta['Valid_Url'] = Valid_Url request.meta['Three_Xpath'] = Three_Xpath yield request
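# start_requests() above expects config.json to map each spider name to a list
# of one, two or three "layer" dicts. A hedged, illustrative example of the
# single-layer shape it reads (keys are the ones accessed in the code above;
# values are made-up placeholders). Two- and three-layer entries additionally
# carry Post_Data, Two_Xpath / Valid_Url and Three_Xpath dicts.
EXAMPLE_CONFIG = {
    "some_spider_name": [
        {
            "Index_Url": "http://example.com/list?page=1",
            "Max_Page": {"soup": "div.pager", "re": r"\d+"},
            "Final_Url": r"\d+$",
            # One_Xpath also holds the Lost_* selectors that parse() reads
            "One_Xpath": {"Lost_Xpath": "//div[@class='item']"},
        }
    ]
}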
def parse_item(self, response):
    item = item_Noticia()
    # HEADLINE
    item['titularNoticia'] = response.xpath(XPATH_NOTICIA_TITULO).extract()[0]
    # LINK
    item['linkNoticia'] = response.url
    # KEYWORDS
    # Keywords are stored in the format "A,B,C"
    item['keywordsNoticia'] = []
    try:
        keywords = response.xpath(XPATH_NOTICIA_KEYWORDS).extract()[0].split(",")
        for keyword in keywords:
            item['keywordsNoticia'].append(keyword.strip())
    except:
        item['keywordsNoticia'] = []
    # SUMMARY
    item['resumenNoticia'] = response.xpath(XPATH_NOTICIA_RESUMEN).extract()
    # AUTHORS
    # When there is more than one author, each one sits in a separate tag
    item['autorNoticia'] = []
    autores = response.xpath(XPATH_NOTICIA_AUTORES).extract()
    for autor in autores:
        item['autorNoticia'].append(autor.strip())
    # LOCATIONS
    # Not shown in the article. Sometimes the location appears next to the author, like this:
    # Juan Pérez. Barcelona
    # which clashes with authors who sign with the initials of their first and last names,
    # e.g. J. P.
    item['localizacionNoticia'] = []
    # DATE
    # Found inside the article as "YYYY-MM-ddThh:mm:ssZ"
    try:
        item['fechaPublicacionNoticia'] = response.xpath(
            XPATH_NOTICIA_FECHA_PUBLICACION).extract()[0]
    except:
        return
    # PHOTO CAPTION
    # 3 cases: 1) No photo. 2) Caption but NO credit. 3) Caption and credit
    try:
        pieDeFoto = response.xpath(XPATH_NOTICIA_FOTO_PIE).extract()[0].strip()
        item['pieDeFotoNoticia'] = pieDeFoto.split("(")[0].strip()
    except:
        item['pieDeFotoNoticia'] = ""
        item['firmaDeFotoNoticia'] = ""
    # PHOTO CREDIT
    try:
        item['firmaDeFotoNoticia'] = pieDeFoto.split("(")[1].split(")")[0].strip()
    except:
        item['firmaDeFotoNoticia'] = ""
    # BODY
    listPartesCuerpo = response.xpath(XPATH_NOTICIA_CUERPO).extract()
    cuerpoNoticia = "".join(listPartesCuerpo)
    cuerpoNoticia = TAG_RE.sub('', cuerpoNoticia)
    item['cuerpoNoticia'] = cuerpoNoticia
    # TAGS
    item['tagsNoticia'] = []
    tagsNoticia = response.xpath(XPATH_NOTICIA_TAGS).extract()
    for tag in tagsNoticia:
        item['tagsNoticia'].append(tag)
    # TEST ZONE
    #self.newsCount+=1
    if self.newsCount > 10:
        raise CloseSpider("\x1b[1;33m" + "Test news items collected" + "\033[0;m")
    yield item
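# A standalone version of the caption/credit split used for the photo fields
# above: "Caption text (Photographer)" -> ("Caption text", "Photographer").
# split_caption is an assumed name for illustration.
def split_caption(raw):
    caption = raw.split("(")[0].strip()
    credit = raw.split("(")[1].split(")")[0].strip() if "(" in raw else ""
    return caption, credit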
def parse(self, response): item = CollectorSpiderItem() One_Xpath = response.meta.get('One_Xpath', None) Two_Xpath = response.meta.get('Two_Xpath', None) Three_Xpath = response.meta.get('Three_Xpath', None) if One_Xpath: for i in response.xpath(One_Xpath['Lost_Xpath']): item['lost_url'] = response.url item['lost_from'] = "" if not re.search( One_Xpath['Lost_From'], response.url).group() else re.search( One_Xpath['Lost_From'], response.url).group() item['lost_id'] = format_string("" if not i.xpath( One_Xpath['Lost_Id'] if One_Xpath['Lost_Id'] else "/" ).extract() else i.xpath( One_Xpath['Lost_Id'] if One_Xpath['Lost_Id'] else "/"). extract()[0]) item['lost_title'] = format_string("" if not i.xpath( One_Xpath['Lost_Title'] if One_Xpath['Lost_Title'] else "/" ).extract() else i.xpath( One_Xpath['Lost_Title'] if One_Xpath['Lost_Title'] else "/" ).extract()[0]) item['lost_describe'] = format_string("" if not i.xpath( One_Xpath['Lost_Describe'] if One_Xpath['Lost_Describe'] else "/").extract() else i.xpath( One_Xpath['Lost_Describe'] if One_Xpath['Lost_Describe'] else "/").extract()[0]) item['lost_person'] = format_string("" if not i.xpath( One_Xpath['Lost_Person'] if One_Xpath['Lost_Person'] else "/").extract() else i.xpath( One_Xpath['Lost_Person'] if One_Xpath['Lost_Person'] else "/").extract()[0]) item['lost_time'] = format_time( format_string("" if not i.xpath( One_Xpath['Lost_Time'] if One_Xpath['Lost_Time'] else "/").extract() else i.xpath( One_Xpath['Lost_Time'] if One_Xpath['Lost_Time'] else "/").extract()[0])) item['lost_location'] = One_Xpath['Lost_Location'][1] + ( format_string("" if not i.xpath(One_Xpath['Lost_Location'][ 0] if One_Xpath['Lost_Location'][0] else "/").extract( ) else i.xpath(One_Xpath['Lost_Location'][0] if One_Xpath['Lost_Location'][0] else "/"). extract()[0])) item['lost_mid'] = hashlib.md5( (item['lost_from'] + item['lost_id'] + item['lost_describe'] + item['lost_time']).encode('utf-8')).hexdigest()[8:-8] if os.path.exists( "/home/hong/文档/sina_working/2to3_test/filter.bloom"): #token = str(item['lost_url'])+str(item['lost_id'])+str(item['lost_describe']) token = item['lost_mid'] if self.bf.__contains__(token): print( "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroied_______________" % (time.ctime(), token), file=self.log) self.log.close() #time.sleep(10) raise CloseSpider( "______________________________ item已经捕获重复,爬取结束!_____________________________" ) yield item elif Two_Xpath: for i in response.xpath(Two_Xpath['Lost_Xpath']): #item['lost_mid'] = resposne.url item['lost_url'] = response.url item['lost_from'] = "" if not re.search( Two_Xpath['Lost_From'], response.url).group() else re.search( Two_Xpath['Lost_From'], response.url).group() item['lost_id'] = format_string("" if not i.xpath( Two_Xpath['Lost_Id'] if Two_Xpath['Lost_Id'] else "/" ).extract() else i.xpath( Two_Xpath['Lost_Id'] if Two_Xpath['Lost_Id'] else "/"). 
extract()[0]) item['lost_title'] = format_string("" if not i.xpath( Two_Xpath['Lost_Title'] if Two_Xpath['Lost_Title'] else "/" ).extract() else i.xpath( Two_Xpath['Lost_Title'] if Two_Xpath['Lost_Title'] else "/" ).extract()[0]) item['lost_describe'] = format_string("" if not i.xpath( Two_Xpath['Lost_Describe'] if Two_Xpath['Lost_Describe'] else "/").extract() else i.xpath( Two_Xpath['Lost_Describe'] if Two_Xpath['Lost_Describe'] else "/").extract()[0]) item['lost_person'] = format_string("" if not i.xpath( Two_Xpath['Lost_Person'] if Two_Xpath['Lost_Person'] else "/").extract() else i.xpath( Two_Xpath['Lost_Person'] if Two_Xpath['Lost_Person'] else "/").extract()[0]) item['lost_time'] = format_time( format_string("" if not i.xpath( Two_Xpath['Lost_Time'] if Two_Xpath['Lost_Time'] else "/").extract() else i.xpath( Two_Xpath['Lost_Time'] if Two_Xpath['Lost_Time'] else "/").extract()[0])) item['lost_location'] = Two_Xpath['Lost_Location'][ 1] + format_string("" if not i.xpath( Two_Xpath['Lost_Location'][0] if Two_Xpath['Lost_Location'][0] else "/").extract( ) else i.xpath(Two_Xpath['Lost_Location'][0] if Two_Xpath['Lost_Location'][0] else "/"). extract()[0]) item['lost_mid'] = hashlib.md5( (item['lost_from'] + item['lost_id'] + item['lost_describe'] + item['lost_time']).encode('utf-8')).hexdigest()[8:-8] #time_temp = re.search(r'\d+-\d+-\d+',str(item['lost_time'])).group() #if not re.search(r'20',time_temp): # time_temp = "20"+time_temp #print "time_temp = %s"%time_temp #time_stamp = datetime.datetime(int(re.search(r'\d+',time_temp).group()),int(re.search(r'(?<=-)\d+',time_temp).group()),int(re.search(r'\d+$',time_temp).group())) #if time.mktime(time_stamp.timetuple()) < time.mktime(self.one_month_ago.timetuple()): # print >> self.log,"At Time %s , the item[%s] : the datetime is overtimed._____________"%(time.ctime(),time_stamp) # raise CloseSpider("_____________________________The datetime is overtimed,爬取结束!!_______________________") if os.path.exists( "/home/hong/文档/sina_working/2to3_test/filter.bloom"): #token = str(item['lost_url'])+str(item['lost_id'])+str(item['lost_describe']) token = item['lost_mid'] if self.bf.__contains__(token): #self.log.write("TRUE,存在重复元素,到达这里没有?") print( "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroied_______________" % (time.ctime(), token), file=self.log) self.log.close() #time.sleep(10) raise CloseSpider( "______________________________ item已经捕获重复,爬取结束!_____________________________" ) yield item else: item['lost_url'] = response.url item['lost_from'] = "" if not re.search( Three_Xpath['Lost_From'], response.url).group() else re.search( Three_Xpath['Lost_From'], response.url).group() item['lost_id'] = format_string("" if not response.xpath( Three_Xpath['Lost_Id'] if Three_Xpath['Lost_Id'] else "/" ).extract() else response.xpath( Three_Xpath['Lost_Id'] if Three_Xpath['Lost_Id'] else "/"). 
extract()[0]) item['lost_title'] = format_string("" if not response.xpath( Three_Xpath['Lost_Title'] if Three_Xpath['Lost_Title'] else "/" ).extract() else response.xpath( Three_Xpath['Lost_Title'] if Three_Xpath['Lost_Title'] else "/" ).extract()[0]) item['lost_describe'] = format_string("" if not response.xpath( Three_Xpath['Lost_Describe'] if Three_Xpath['Lost_Describe'] else "/").extract() else response.xpath( Three_Xpath['Lost_Describe'] if Three_Xpath['Lost_Describe'] else "/").extract()[0]) item['lost_person'] = format_string("" if not response.xpath( Three_Xpath['Lost_Person'] if Three_Xpath['Lost_Person'] else "/").extract() else response.xpath( Three_Xpath['Lost_Person'] if Three_Xpath['Lost_Person'] else "/").extract()[0]) item['lost_time'] = format_time( format_string("" if not response.xpath( Three_Xpath['Lost_Time'] if Three_Xpath['Lost_Time'] else "/").extract() else response.xpath( Three_Xpath['Lost_Time'] if Three_Xpath['Lost_Time'] else "/").extract()[0])) #print(type(Three_Xpath['Lost_Location'][1])) item['lost_location'] = Three_Xpath['Lost_Location'][ 1] + format_string("" if not response.xpath( Three_Xpath['Lost_Location'][0] if Three_Xpath['Lost_Location'][0] else "/" ).extract()[0] else response.xpath( Three_Xpath['Lost_Location'][0] if Three_Xpath['Lost_Location'][0] else "/").extract()[0]) item['lost_mid'] = hashlib.md5( (item['lost_from'] + item['lost_id'] + item['lost_describe'] + item['lost_time']).encode('utf-8')).hexdigest()[8:-8] if os.path.exists( "/home/hong/文档/sina_working/2to3_test/filter.bloom"): #token = str(item['lost_url']+item['lost_id']+item['lost_describe']) token = item['lost_mid'] if self.bf.__contains__(token): print( "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroied_______________" % (time.ctime(), token), file=self.log) self.log.close() #time.sleep(10) raise CloseSpider( "______________________________ url已经捕获重复,爬取结束!_____________________________" ) yield item
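# All three branches above build the same dedup token: an md5 of four item
# fields with the first and last eight hex characters dropped, which is then
# probed against the bloom filter. A hedged sketch of that token construction
# as one helper (make_mid is an assumed name):
import hashlib

def make_mid(item):
    """Build the 16-character md5 fragment used as lost_mid above."""
    raw = (item['lost_from'] + item['lost_id'] +
           item['lost_describe'] + item['lost_time']).encode('utf-8')
    return hashlib.md5(raw).hexdigest()[8:-8]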
def spider_closed(self, spider):
    self.file.close()
    raise CloseSpider('Shutdown by ctrl-c')
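# spider_closed() above runs from the spider_closed signal, so the spider is
# already shutting down when it fires and the CloseSpider it raises presumably
# adds nothing beyond a logged error; the file cleanup is what matters. A
# hedged sketch of the usual signal wiring (the class skeleton is illustrative):
import scrapy
from scrapy import signals

class ExampleSpider(scrapy.Spider):
    name = 'example'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # cleanup only; nothing needs to be raised here
        pass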
def _set_start_urls(self, scrape_url): self.start_urls = [] if self.scraper.pagination_type in [ 'R', 'F', ]: if not self.scraper.pagination_page_replace: msg = 'Please provide a pagination_page_replace context corresponding to pagination_type!' self.dds_logger.error(msg) raise CloseSpider() if self.scraper.pagination_type == 'R': try: pages = self.scraper.pagination_page_replace pages = pages.split(',') if len(pages) > 3: raise Exception pages = list(range(*list(map(int, pages)))) except Exception: msg = 'Pagination_page_replace for pagination_type "RANGE_FUNCT" ' +\ 'has to be provided as python range function arguments ' +\ '[start], stop[, step] (e.g. "1, 50, 10", no brackets)!' self.dds_logger.error(msg) raise CloseSpider() pages = self.limit_page_nums(pages) if self.scraper.pagination_type == 'F': try: pages = self.scraper.pagination_page_replace pages = pages.strip(', ') pages = ast.literal_eval("[" + pages + ",]") except: msg = 'Wrong pagination_page_replace format for pagination_type "FREE_LIST", ' +\ "Syntax: 'Replace string 1', 'Another replace string 2', 'A number 3', ..." self.dds_logger.error(msg) raise CloseSpider() pages = self.limit_page_nums(pages) if self.scraper.pagination_type in [ 'R', 'F', ]: append_str = self.scraper.pagination_append_str if scrape_url[-1:] == '/' and append_str[0:1] == '/': append_str = append_str[1:] self.pages = pages if self.conf['MAX_PAGES_READ']: self.pages = self.pages[0:self.conf['MAX_PAGES_READ']] for page in self.pages: url = scrape_url + append_str.format(page=page) self.start_urls.append(url) if not self.scraper.pagination_on_start and not self.conf[ 'START_PAGE']: self.start_urls.insert(0, scrape_url) self.pages.insert(0, "") if self.scraper.pagination_type in [ 'N', 'O', ]: self.start_urls.append(scrape_url) self.pages = [ "", ] num = len(self.start_urls) if (num == 1): url_str = 'URL' else: url_str = 'URLs' self.log( "Scraper set to run on {num} start {url_str}.".format( num=num, url_str=url_str), logging.INFO)
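# A minimal standalone illustration of how the RANGE_FUNCT pagination above
# turns a "start, stop[, step]" string into appended start URLs (the URL and
# append string are made-up examples, not taken from a real scraper config):
replace = "1, 50, 10"
pages = list(range(*map(int, replace.split(','))))        # [1, 11, 21, 31, 41]
start_urls = ['https://example.com/list' + '?page={page}'.format(page=p)
              for p in pages]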