def parse(self, response):
    title = text_strip(remove_tags(response.css('h1').extract_first()))
    category = text_strip(
        response.css('.ask6new_mbx li:nth-child(3) dl dt a::text').extract_first())
    sub_category = text_strip(
        response.css('.ask6new_mbx li:nth-child(4) dl dt a::text').extract_first())

    question_item = Question()
    question_item['title'] = title
    question_item['category'] = category if category else 'N/A'
    question_item['sub_category'] = sub_category if sub_category else 'N/A'
    question_item['source_name'] = 'jia.com'
    question_item['source_url'] = response.url
    question_item['entry_url'] = response.url

    answers = response.css('.con_text.con_text2, .content_text2').extract()
    answers = map(text_strip, answers)
    answers = map(text_clear, answers)
    answers = filter(text_filter, answers)

    answer_items = []
    for answer in answers:
        answer_body = remove_tags(answer, keep=['br'])
        answer_item = Answer()
        answer_item['body'] = answer_body
        answer_items.append(answer_item)

    question_item['answers'] = answer_items
    yield question_item
def process_item(self, item, spider):
    # Clean up the introduction: strip tags and citation markers like [1].
    item['introduction'] = ''.join(
        re.sub(r'\[\d+\]', '', remove_tags(x)) for x in item['introduction'])

    # Clean up techniques.
    for heading, text in item['content']['Techniques'].items():
        item['content']['Techniques'][heading] = re.sub(
            r'\[\d+\]', '', remove_tags(text))

    # Clean up legal issues.
    for heading, text in item['content']['Legal issues'].items():
        item['content']['Legal issues'][heading] = re.sub(
            r'\[\d+\]', '', remove_tags(text))

    # Clean up methods to prevent scraping.
    item['content']['Methods to prevent web scraping'] = [
        remove_tags(x)
        for x in item['content']['Methods to prevent web scraping']
    ]

    # Clean up references.
    item['content']['References'] = [
        re.sub('"', '', remove_tags(x))
        for x in item['content']['References']
    ]
    return item
def get_data(urls):
    data = []
    for url in urls:
        driver.get(url)
        time.sleep(5)

        title = ""
        try:
            title = driver.find_element_by_xpath(
                '//*[@id="pdp-product-title"]/span/span[3]').get_attribute(
                    "innerHTML")
            title = remove_tags(title).replace("&amp;", "&")
        except Exception:
            title = ""

        text = ""
        try:
            # //*[@id="main-content-inside"]/div[2]/div/section/div[2]/div/div[2]/p
            text = driver.find_element_by_xpath(
                '//*[@id="main-content-inside"]/div[2]/div/section/div[2]/div/div[2]/p'
            ).get_attribute("innerHTML")
            text = remove_tags(text).replace("&amp;", "&")
        except Exception:
            text = ""

        if len(text.split()) > 8:
            data.append({
                "description": text,
                "title": title,
                "category": cat + "_" + sub_cat
            })
    return data
def parse(self, response):
    if re.match('https://www.homes.co.jp/distil.+', response.url):
        time.sleep(5)
        yield scrapy.Request(url=response.request.meta['redirect_urls'][0],
                             callback=self.parse,
                             dont_filter=True)
    else:
        enabled_stations = response.css(
            'ul.checkboxLinkList li label span a::text').extract()
        disabled_stations = response.css(
            'ul.checkboxLinkList li.disabled label span.linkName::text'
        ).extract()
        railway_name = response.css('span.linkNameAll').extract_first()
        pref_name = re.sub('https://www.homes.co.jp/chintai/(.+)/.+/$',
                           r"\1", response.url)
        stations = enabled_stations + disabled_stations
        for station_name in stations:
            item_loader = HomesStationItemLoader(item=HomesStationItem())
            item_loader.add_value('web_site', 'HOMES')
            item_loader.add_value('pref_name', self.prefs[pref_name])
            item_loader.add_value('railway', remove_tags(railway_name))
            item_loader.add_value('station', remove_tags(station_name))
            item_loader.add_value('url', response.url)
            yield item_loader.load_item()
def _convert(data):
    if t not in ['join', 'list'] and isinstance(data, list):
        data = TakeFirst()(data)
    if type(data) in [str, unicode]:
        data = data.strip()
    elif type(data) in [int, float, datetime]:
        data = str(data)
    else:
        return data

    if t == 'join':
        sep = inf.get('sep', u' ')
        return Join(sep)(data)
    elif t == 'list':
        sep = inf.get('sep', u' ')
        return remove_tags(Join(sep)(data)).strip()
    elif t == 'text':
        return remove_tags(data).strip()
    elif t == 'clean':
        cleaner = Cleaner(style=True, scripts=True, javascript=True,
                          links=True, meta=True)
        return cleaner.clean_html(data)
    elif t == 'unesc':
        return HTMLParser().unescape(data)
    elif t == 'base64':
        return base64.decodestring(data)
    elif t == 'sub':
        frm = inf.get('from')
        to = inf.get('to')
        return re.sub(frm, to, data)
    elif t == 'jpath':
        qs = inf.get('query')
        return jsonpath.jsonpath(json.loads(data), qs)
    elif t == 'map':
        m = inf.get('map')
        d = inf.get('default')
        return m.get(data, d)
    elif t == 'int':
        return int(float(data))
    elif t == 'float':
        return float(data)
    elif t == 'date':
        fmt = inf.get('fmt', 'auto')
        tz = inf.get('tz', '+00:00')
        return parse_date(data, fmt, tz)
    elif t == 'cst':
        fmt = inf.get('fmt', 'auto')
        return parse_date(data, fmt, '+08:00')
    else:
        return data
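# The `_convert` above closes over two free variables -- `t`, a converter tag,
# and `inf`, its options dict -- presumably bound by an enclosing field-mapping
# loop that is not shown here. A minimal, hypothetical sketch of how such a
# closure could be produced (`make_converter` is not from the original code):
import re
from w3lib.html import remove_tags

def make_converter(t, inf):
    def _convert(data):
        if t == 'text':
            return remove_tags(data).strip()
        if t == 'sub':
            return re.sub(inf.get('from'), inf.get('to'), data)
        if t == 'int':
            return int(float(data))
        return data
    return _convert

to_text = make_converter('text', {})
print(to_text('<p>hello <b>world</b></p>'))  # -> hello world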
def process_item(self, item, spider):
    i = item['summary'][0]
    i = remove_tags(i)
    i = replace_escape_chars(i)
    item['summary'][0] = i

    i = item['job_title'][0]
    i = remove_tags(i)
    i = replace_escape_chars(i)
    item['job_title'][0] = i
    return item
def parse_article(self, response):
    articulos = articleswiki()
    articulo = article()
    articulos["link"] = response.meta["link"]
    articulo['title'] = remove_tags(
        response.css("#firstHeading").extract()[0])
    for p in response.css(".mw-parser-output > p").extract():
        paragraph = remove_tags(p)
        if not paragraph.isspace():
            articulo['paragraph'] = paragraph
            break
    articulos["body"] = articulo
    return articulos
def parse_article(self, response):
    title = response.meta['title']
    url = response.meta['url']
    article_id = response.meta['article_id']
    time = response.meta['time']
    print(article_id, time, title, url)

    article = response.xpath('//div[@itemprop="articleBody"]')
    print(remove_tags(article.extract_first()))

    item = ArticleItem(title=title,
                       article_id=article_id,
                       time=time,
                       content=remove_tags(article.extract_first()))
    yield item
def parse_content(self, response):
    titulo = response.css('div.tituloreal::text').extract_first()
    contenido = response.css('div.cuerpoarticulo').extract_first()
    url = response.url
    timestamp = datetime.datetime.today().timestamp()
    # date = response.css('div.cuerpoarticulo b::text').extract_first().split(',')[1].replace('-', '')
    # date_time = datetime.datetime.strptime(date, '%d %b. %Y.')
    yield {
        'titulo': titulo,
        'contenido': remove_tags(contenido),
        'url': url,
        'timestamp': timestamp,
        'record_type': 'r1',
        'source': 'acbcom'
    }
    # Get related news.
    for related in response.css(
            'div.cuerpoarticulo2 a.negro::attr(href)').extract():
        yield response.follow(related, callback=self.parse_content)
    # Top news.
    for top in response.css(
            'div.menucontenido3 a.blanco::attr(href)').extract():
        yield response.follow(top, callback=self.parse_content)
def parse_blog(self, response):
    # HTML content
    blog_id = response.xpath('/html/head/link[4]/@href').get().strip('/node/')
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@id="block-zerohedge-page-title"]/h1/span/text()').get()
    blog['author'] = response.xpath(
        '//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[1]/span/a/text()'
    ).get()
    blog['published_date'] = convert_date(
        response.xpath(
            '//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[2]/span/text()'
        ).get())
    blog['content'] = remove_tags("\n".join(
        response.xpath(
            '//*[@id="block-zerohedge-content"]/article/div/div[1]/p').getall()))
    blog['content_html'] = "".join(
        response.xpath(
            '//*[@id="block-zerohedge-content"]/article/div/div[1]/p').getall())
    blog['links'] = get_links(blog['content_html'])
    yield blog

    # Stats requests
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = requests.get(
        'https://www.zerohedge.com/statistics-ajax?entity_ids={}'.format(
            blog_id)).json()[blog_id]
    stat['likes'] = None
    stat['comments'] = requests.get(
        'https://www.zerohedge.com/coral-talk-comment-counts?nids={}'.format(
            blog_id)).json()[blog_id]
    yield stat

    # Comments requests. The "query" value is the Coral Talk embed-stream
    # GraphQL document sent verbatim by the site's comment widget; it is kept
    # unchanged and split into adjacent string literals purely for layout.
    coral_query = (
        "query CoralEmbedStream_Embed($assetId: ID, $assetUrl: String, $commentId: ID!, $hasComment: Boolean!, $excludeIgnored: Boolean, $sortBy: SORT_COMMENTS_BY!, $sortOrder: SORT_ORDER!) {\n me {\n id\n state {\n status {\n username {\n status\n __typename\n }\n banned {\n status\n __typename\n }\n suspension {\n until\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n asset(id: $assetId, url: $assetUrl) {\n ...CoralEmbedStream_Configure_asset\n ...CoralEmbedStream_Stream_asset\n ...CoralEmbedStream_AutomaticAssetClosure_asset\n __typename\n }\n ...CoralEmbedStream_Stream_root\n ...CoralEmbedStream_Configure_root\n}\n\nfragment CoralEmbedStream_Stream_root on RootQuery {\n me {\n state {\n status {\n username {\n status\n __typename\n }\n banned {\n status\n __typename\n }\n suspension {\n until\n __typename\n }\n __typename\n }\n __typename\n }\n ignoredUsers {\n id\n __typename\n }\n role\n __typename\n }\n settings {\n organizationName\n __typename\n }\n ...TalkSlot_StreamFilter_root\n ...CoralEmbedStream_Comment_root\n __typename\n}\n\nfragment CoralEmbedStream_Comment_root on RootQuery {\n me {\n ignoredUsers {\n id\n __typename\n }\n __typename\n }\n ...TalkSlot_CommentInfoBar_root\n ...TalkSlot_CommentAuthorName_root\n ...TalkEmbedStream_DraftArea_root\n ...TalkEmbedStream_DraftArea_root\n __typename\n}\n\nfragment TalkEmbedStream_DraftArea_root on RootQuery {\n __typename\n}\n\nfragment CoralEmbedStream_Stream_asset on Asset {\n comment(id: $commentId) @include(if: $hasComment) {\n ...CoralEmbedStream_Stream_comment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n id\n title\n url\n isClosed\n created_at\n settings {\n moderation\n infoBoxEnable\n infoBoxContent\n premodLinksEnable\n questionBoxEnable\n questionBoxContent\n questionBoxIcon\n closedTimeout\n closedMessage\n disableCommenting\n disableCommentingMessage\n charCountEnable\n charCount\n requireEmailConfirmation\n __typename\n }\n "
        "totalCommentCount @skip(if: $hasComment)\n comments(query: {limit: 50000, excludeIgnored: $excludeIgnored, sortOrder: $sortOrder, sortBy: $sortBy}) @skip(if: $hasComment) {\n nodes {\n ...CoralEmbedStream_Stream_comment\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n ...TalkSlot_StreamFilter_asset\n ...CoralEmbedStream_Comment_asset\n __typename\n}\n\nfragment CoralEmbedStream_Comment_asset on Asset {\n __typename\n id\n ...TalkSlot_CommentInfoBar_asset\n ...TalkSlot_CommentReactions_asset\n ...TalkSlot_CommentAuthorName_asset\n}\n\nfragment CoralEmbedStream_Stream_comment on Comment {\n id\n status\n user {\n id\n __typename\n }\n ...CoralEmbedStream_Comment_comment\n __typename\n}\n\nfragment CoralEmbedStream_Comment_comment on Comment {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n}\n\nfragment CoralEmbedStream_Comment_SingleComment on Comment {\n id\n body\n created_at\n status\n replyCount\n tags {\n tag {\n name\n __typename\n }\n __typename\n }\n user {\n id\n username\n __typename\n }\n status_history {\n type\n __typename\n }\n action_summaries {\n __typename\n count\n current_user {\n id\n __typename\n }\n }\n editing {\n edited\n editableUntil\n __typename\n }\n ...TalkSlot_CommentInfoBar_comment\n ...TalkSlot_CommentReactions_comment\n ...TalkSlot_CommentAvatar_comment\n ...TalkSlot_CommentAuthorName_comment\n ...TalkSlot_CommentContent_comment\n ...TalkEmbedStream_DraftArea_comment\n ...TalkEmbedStream_DraftArea_comment\n __typename\n}\n\nfragment TalkEmbedStream_DraftArea_comment on Comment {\n __typename\n ...TalkSlot_DraftArea_comment\n}\n\nfragment CoralEmbedStream_Stream_singleComment on Comment {\n id\n status\n user {\n id\n __typename\n }\n ...CoralEmbedStream_Comment_SingleComment\n __typename\n}\n\nfragment CoralEmbedStream_Configure_root on RootQuery {\n __typename\n ...CoralEmbedStream_Settings_root\n}\n\nfragment CoralEmbedStream_Settings_root on RootQuery {\n __typename\n}\n\nfragment CoralEmbedStream_Configure_asset on Asset {\n __typename\n ...CoralEmbedStream_AssetStatusInfo_asset\n ...CoralEmbedStream_Settings_asset\n}\n\nfragment CoralEmbedStream_AssetStatusInfo_asset on Asset {\n id\n closedAt\n isClosed\n __typename\n}\n\nfragment CoralEmbedStream_Settings_asset on Asset {\n id\n settings {\n moderation\n premodLinksEnable\n questionBoxEnable\n questionBoxIcon\n questionBoxContent\n __typename\n }\n __typename\n}\n\nfragment CoralEmbedStream_AutomaticAssetClosure_asset on Asset {\n id\n closedAt\n __typename\n}\n\nfragment TalkSlot_StreamFilter_root "
        "on RootQuery {\n ...TalkViewingOptions_ViewingOptions_root\n __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_root on RootQuery {\n __typename\n}\n\nfragment TalkSlot_CommentInfoBar_root on RootQuery {\n ...TalkModerationActions_root\n __typename\n}\n\nfragment TalkModerationActions_root on RootQuery {\n me {\n id\n __typename\n }\n __typename\n}\n\nfragment TalkSlot_CommentAuthorName_root on RootQuery {\n ...TalkAuthorMenu_AuthorName_root\n __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_root on RootQuery {\n __typename\n ...TalkSlot_AuthorMenuActions_root\n}\n\nfragment TalkSlot_StreamFilter_asset on Asset {\n ...TalkViewingOptions_ViewingOptions_asset\n __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_asset on Asset {\n __typename\n}\n\nfragment TalkSlot_CommentInfoBar_asset on Asset {\n ...TalkModerationActions_asset\n ...TalkPermalink_Button_asset\n __typename\n}\n\nfragment TalkModerationActions_asset on Asset {\n id\n __typename\n}\n\nfragment TalkPermalink_Button_asset on Asset {\n url\n __typename\n}\n\nfragment TalkSlot_CommentReactions_asset on Asset {\n ...VoteButton_asset\n __typename\n}\n\nfragment VoteButton_asset on Asset {\n id\n __typename\n}\n\nfragment TalkSlot_CommentAuthorName_asset on Asset {\n ...TalkAuthorMenu_AuthorName_asset\n __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_asset on Asset {\n __typename\n}\n\nfragment TalkSlot_CommentInfoBar_comment on Comment {\n ...CollapseCommentButton_comment\n ...TalkModerationActions_comment\n ...TalkPermalink_Button_comment\n ...TalkInfoBar_moveReportButton_Comment\n ...TalkInfoBar_addEdiableClass_Comment\n __typename\n}\n\nfragment CollapseCommentButton_comment on Comment {\n id\n replyCount\n __typename\n}\n\nfragment TalkModerationActions_comment on Comment {\n id\n status\n user {\n id\n __typename\n }\n tags {\n tag {\n name\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment TalkPermalink_Button_comment on Comment {\n id\n __typename\n}\n\nfragment TalkInfoBar_moveReportButton_Comment on Comment {\n id\n __typename\n}\n\nfragment TalkInfoBar_addEdiableClass_Comment on Comment {\n id\n editing {\n __typename\n editableUntil\n }\n __typename\n}\n\nfragment TalkSlot_CommentReactions_comment on Comment {\n ...TalkDisableDeepReplies_disableDeepReplies_Comment\n ...VoteButton_comment\n __typename\n}\n\nfragment TalkDisableDeepReplies_disableDeepReplies_Comment on Comment {\n id\n __typename\n}\n\nfragment VoteButton_comment on Comment {\n id\n action_summaries {\n __typename\n ... on UpvoteActionSummary {\n count\n current_user {\n id\n __typename\n }\n __typename\n }\n ... "
        "on DownvoteActionSummary {\n count\n current_user {\n id\n __typename\n }\n __typename\n }\n }\n __typename\n}\n\nfragment TalkSlot_CommentAvatar_comment on Comment {\n ...UserAvatar_comment\n __typename\n}\n\nfragment UserAvatar_comment on Comment {\n user {\n avatar\n __typename\n }\n __typename\n}\n\nfragment TalkSlot_CommentAuthorName_comment on Comment {\n ...TalkAuthorMenu_AuthorName_comment\n __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_comment on Comment {\n __typename\n id\n user {\n username\n __typename\n }\n ...TalkSlot_AuthorMenuActions_comment\n}\n\nfragment TalkSlot_CommentContent_comment on Comment {\n ...TalkPluginRichText_CommentContent_comment\n __typename\n}\n\nfragment TalkPluginRichText_CommentContent_comment on Comment {\n body\n richTextBody\n __typename\n}\n\nfragment TalkSlot_DraftArea_comment on Comment {\n ...TalkPluginRichText_Editor_comment\n __typename\n}\n\nfragment TalkPluginRichText_Editor_comment on Comment {\n body\n richTextBody\n __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_root on RootQuery {\n ...TalkIgnoreUser_IgnoreUserAction_root\n __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_root on RootQuery {\n me {\n id\n __typename\n }\n __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_comment on Comment {\n ...TalkIgnoreUser_IgnoreUserAction_comment\n ...TalkDrupalUserId_DrupalProfile_comment\n __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_comment on Comment {\n user {\n id\n __typename\n }\n ...TalkIgnoreUser_IgnoreUserConfirmation_comment\n __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserConfirmation_comment on Comment {\n user {\n id\n username\n __typename\n }\n __typename\n}\n\nfragment TalkDrupalUserId_DrupalProfile_comment on Comment {\n user {\n id\n __typename\n }\n __typename\n}\n"
    )
    payload = {
        "query": coral_query,
        "variables": {
            "assetId": "",
            "assetUrl": blog['url'],
            "commentId": "",
            "hasComment": False,
            "excludeIgnored": False,
            "sortBy": "CREATED_AT",
            "sortOrder": "DESC"
        },
        "operationName": "CoralEmbedStream_Embed"
    }
    yield scrapy.Request('https://talk.zerohedge.com/api/v1/graph/ql',
                         method='POST',
                         body=json.dumps(payload),
                         headers={'Content-Type': 'application/json'},
                         callback=self.process_comments)
def parse(self, response):
    sel = Selector(response)
    for i in sel.css('ul.pj-rank-cate-list>li.pj-rank-cate-item'):
        cid = int(i.css('a::attr(data-id)').extract()[0])
        cname = remove_tags(i.extract()).strip()
        yield self.make_tags_ajax_request(cid, cname)
def parse_rss(self, response):
    posts_link = response.css('item link::text').extract()[:self.POST_PER_RSS]
    posts_title = response.css('item title::text').extract()[:self.POST_PER_RSS]
    posts_descriptions = response.css(
        'item description::text').extract()[:self.POST_PER_RSS]
    posts_descriptions = [remove_tags(desc) for desc in posts_descriptions]

    meta_data = {
        'blog_name': response.css('channel>title::text').extract()[0],
        'blog_url': response.url,
        'posts_link': posts_link,
        'next_link': posts_link[1:],
        'posts_full_content': [],
        'posts_content': posts_descriptions,
        'posts_title': posts_title
    }

    if posts_link:
        yield scrapy.Request(url=posts_link[0],
                             callback=self.parse,
                             meta=meta_data)
    else:
        result = {
            'type': 'blog',
            'blog_name': meta_data['blog_name'],
            'blog_url': meta_data['blog_url']
        }
        yield result
        self.BLOG_LIMIT -= 1
def extract_post_data(response):
    # xpath().extract() returns a list of all instances that match the xpath
    # string, i.e. post_titles will be a list of all post titles in the order
    # encountered, post_users will be all users in the order encountered, etc.
    post_titles = response.xpath(
        "//div[@class='postbody']//h3/a/text()").extract()
    post_users = response.xpath(
        "//div[@class='postbody']//span[@class='username']/text()").extract()
    post_contents = response.xpath(
        "//div[@class='postbody']//div[@class='content']").extract()
    post_datestrings = response.xpath(
        "//div[@class='postbody']//p[@class='author']/text()[3]").extract()

    # Zip the lists together, and for each set make a new Post object and add
    # it to a list; return the list.
    posts = []
    for title, author, content, datestring in zip(
            post_titles, post_users, post_contents, post_datestrings):
        new_post = Post()
        new_post['user'] = author
        new_post['title'] = title
        # join is used here as content for each post is a list, which we want
        # to flatten into a string.
        new_post['content'] = ''.join(remove_tags(content))
        # Clean the date string: "Wed Feb 06, 2019 12:37 am" => "WedFeb0620191237am"
        datestring = datestring.replace("\n", "").replace("\t", "").replace(
            " ", "").replace(":", "").replace(",", "")
        # Create a datetime object from the cleaned string using the strptime
        # format string: %a:day(Wed), %b:month(Feb), %d:day(06), %Y:year(2019),
        # %I%M:12-hour time(1237), %p:am/pm(am)
        dt = datetime.datetime.strptime(datestring, "%a%b%d%Y%I%M%p")
        new_post['datetime'] = dt
        posts.append(new_post)
    return posts
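# A standalone sanity check of the date handling above: the cleaning chain and
# the "%a%b%d%Y%I%M%p" format string can be exercised without a spider.
import datetime

raw = "Wed Feb 06, 2019 12:37 am"
cleaned = (raw.replace("\n", "").replace("\t", "")
              .replace(" ", "").replace(":", "").replace(",", ""))
print(cleaned)                                                # WedFeb0620191237am
print(datetime.datetime.strptime(cleaned, "%a%b%d%Y%I%M%p"))  # 2019-02-06 00:37:00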
def parse_article(response):
    link = response.url
    title = response.css("title ::text").extract_first()
    author = response.css(
        "div.tec--container div.tec--article__body-grid div.tec--author__info p a::text"
    ).extract_first()
    image = response.css(
        "div.tec--container div.tec--article__header-grid header figure img::attr(data-src)"
    ).extract_first()
    # text = response.css("div.tec--container div.tec--article__body p::text").getall()
    text = response.css("div.tec--container div.tec--article__body").getall()
    if text:
        text = text[0][:text[0].find("<p><span>Cupons")]
        text = remove_tags(text)
        text = text.replace('</div', '')
    notice = NoticiasItem(title=title,
                          author=author,
                          text=text,
                          link=link,
                          image=image,
                          source="Tecmundo",
                          uuid=str(uuid.uuid4()))
    yield notice
def parse_post(self, response): print("parse_post ", response.url) # text = ''.join( response.css( 'div.container div.col-md-8 > p' ).extract() ) text = response.css('div.entry.clearfix').extract()[0] text = remove_tags(text) print("text", text) yield {"text": text}
def parse_news(self, response):
    date = response.css(
        "div.content-time-published.margin .time-modified.margin::text"
    ).extract_first()
    title = response.css(
        "span#id-blasting-tv-masthead-video-title::text").extract_first()
    subtitle = response.css("h2.title-h2::text").extract_first()
    try:
        article = remove_tags_with_content(
            response.css(
                "div.article-body.p402_premium.template-a").extract_first(),
            which_ones=('div', 'script'))
    except Exception:
        article = remove_tags_with_content(
            response.css("div#article-body-p1").extract_first(),
            which_ones=('div', 'a', 'script'))
    article = remove_tags(article)
    article = replace_escape_chars(article, which_ones=('\n',))
    article = re.sub(r'http\S+', '', article).strip()
    yield {
        'article': article,
        'subtitle': subtitle,
        'title': title,
        'date': date,
        'link': response.url,
        'website': 'blasting'
    }
def parse_news(self, response):
    article_body = []
    for column in response.css("div.mc-column.content-text"):
        text_with_tags = column.css("div.mc-column.content-text").extract_first()
        text_without_tags = remove_tags(text_with_tags)
        article_body.append(text_without_tags.strip())
    article_body = ''.join(article_body)

    dateTime = response.css(
        "p.content-publication-data__updated time::text").extract_first()
    if dateTime:
        date = dateTime.split()[0]
    else:
        date = None

    yield {
        'article': article_body,
        'subtitle': response.css("h2.content-head__subtitle::text").extract_first(),
        'title': response.css("h1.content-head__title::text").extract_first(),
        'date': date,
        'link': response.url,
        'website': 'globo'
    }
def parse(self, response):
    title = text_strip(response.css('h2.ask-question-title::text').extract_first())
    body = text_strip(response.css('div.common-answer-text::text').extract_first())
    category = text_strip(
        response.css('.breadcrumb-pre:nth-child(4) a::text').extract_first())
    sub_category = text_strip(
        response.css('.breadcrumb-pre:nth-child(5) a::text').extract_first())

    question_item = Question()
    question_item['title'] = title
    question_item['body'] = body
    question_item['category'] = category if category else 'N/A'
    question_item['sub_category'] = sub_category if sub_category else 'N/A'
    question_item['source_name'] = 'to8to.com'
    question_item['source_url'] = response.url
    question_item['entry_url'] = response.url

    answers = response.css('.common-answer-text h3').extract()
    answers = map(text_strip, answers)
    answers = map(text_clear, answers)
    answers = filter(text_filter, answers)

    answer_items = []
    for answer in answers:
        answer_body = remove_tags(answer, keep=['br'])
        answer_item = Answer()
        answer_item['body'] = answer_body
        answer_items.append(answer_item)

    question_item['answers'] = answer_items
    yield question_item
def parse_abstract(self, response):
    meta = response.meta
    abstract_text = remove_tags(
        response.xpath(
            './/div[@class="page-title"]/following-sibling::p[1]'
        ).extract_first().strip()).strip()
    meta['Abstract'] = abstract_text

    links = response.xpath(".//a/@href").extract()
    html_link = None
    for link in links:
        if "fulltext" in link:
            html_link = "https://www.lens.org" + link
            break
    meta['HTML_Link'] = html_link

    if html_link is not None:
        yield scrapy.Request(
            html_link,
            callback=self.parse_full_text,
            meta=meta,
        )
    else:
        self.insert_patent(meta)
def parse_unit(self, response):
    print(response.url)
    title = response.css("h1.title-3283765216::text")[0].extract()
    price = response.css("span.currentPrice-2872355490")[0].css(
        "span::text")[0].extract()
    location = response.css("span.address-2932131783::text").extract_first()
    description = remove_tags(
        response.css("div.descriptionContainer-2901313666").extract_first())
    # Note: each attribute is collected as a two-element set literal, so the
    # name/value order within it is not guaranteed.
    attributes = [{
        attr.css("dt::text").extract_first(),
        attr.css("dd::text").extract_first()
    } for attr in response.css("dl.itemAttribute-2841032265")]
    uploadTime = response.css("div.datePosted-1350605722")[0].css(
        "span::attr(title)").extract_first()
    if uploadTime is None:
        uploadTime = response.css("div.datePosted-1350605722")[0].css(
            "time::attr(title)").extract_first()
    yield {
        "title": title,
        "url": response.url,
        "price": price,
        "location": location,
        "description": description,
        "attributes": attributes,
        "datetime": uploadTime
    }
def process_item(self, item, spider):
    name = item['name']
    print("""insert into recipes values (?, ?)""", (self.couter, name))
    self.cursor.execute("""insert into recipes values (?, ?)""",
                        (self.couter, name))

    description_counter = 1
    for description in item['description']:
        # print("""insert into order values (?, ?, ?)""",
        #       (self.couter, description_counter, remove_tags(description)))
        self.cursor.execute(
            """insert into recipes_order values (?, ?, ?)""",
            (self.couter, description_counter,
             remove_tags(description).strip()))
        description_counter += 1

    for product_and_qty in item['products']:
        data = remove_tags(product_and_qty).split('-')
        product = data[0].strip()
        try:
            qty = data[1].strip()
        except IndexError:
            qty = 'На вкус'  # Russian: "to taste"
        if product not in self.products:
            self.products.append(product)
            self.cursor.execute("""insert into products values (?, ?)""",
                                (self.product_counter, product))
            self.product_counter += 1
        self.cursor.execute(
            """insert into main values (?, (select distinct id from products where name = ?), ?)""",
            (self.couter, product, qty))

    self.couter += 1
    return item
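# The pipeline above relies on a cursor and counters initialised elsewhere.
# A minimal sketch of what that setup might look like -- the table layouts are
# inferred from the insert statements above, the column types are guesses, and
# the RecipePipeline/open_spider wiring is hypothetical:
import sqlite3

class RecipePipeline:
    def open_spider(self, spider):
        self.connection = sqlite3.connect('recipes.db')
        self.cursor = self.connection.cursor()
        self.cursor.executescript("""
            create table if not exists recipes (id integer, name text);
            create table if not exists recipes_order (recipe_id integer, step integer, body text);
            create table if not exists products (id integer, name text);
            create table if not exists main (recipe_id integer, product_id integer, qty text);
        """)
        self.couter = 1           # recipe id counter (spelling kept from the original)
        self.product_counter = 1
        self.products = []

    def close_spider(self, spider):
        self.connection.commit()
        self.connection.close()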
def parse_items(self, response):
    global count
    x = response.xpath("//section[@class='userbody']")
    # print(x)
    if x:
        for sel in x:
            # print(sel)
            item = craigslistItem()
            # item['title'] = sel.xpath("//span[@class='postingtitletext']/text()").extract()
            # item['link'] = response.url
            item['desc'] = remove_tags(
                sel.xpath("//section[@id='postingbody']").extract()[0].replace(
                    '<br>', ' ').replace('\n', ' '))
            # for i in item['desc']
            print(item['desc'])
            count += 1
            print("Pages so far = ", count)
            with open("item_desc", "a") as i:
                i.write(str(item['desc']) + '\n')
            '''if re.search('shoe',str(item['title'][0]), re.IGNORECASE):
                price = -1
                try:
                    price = item['price'][0][4:]
                except:
                    pass
                if float(price)>0.0 and float(price)<300.00:
                    count+=1
                    print count
                    with open("webpages/result"+str(count)+".html","wb") as f:
                        f.write(response.body)
                    with open("item_links","a") as i:
                        i.write(str(item)+'\n')'''
    if count == 2000:
        raise CloseSpider('Downloaded 2000 pages')
def parse_post(self, response): print("parse_post ", response.url) text = response.css( 'div.container.container_single-article').extract()[0] text = remove_tags(text) print("text", text) yield {"text": text}
def parse(self, response): for item in response.css(".feed-item"): date_tag = item.css('a.answer-date-link') last_updated_tag = date_tag.css('::text').extract_first() created_tag = date_tag.css('::attr(data-tooltip)').extract_first() if not created_tag: created_tag = last_updated_tag yield { 'url': item.css('link::attr(href)').extract_first(), 'question': item.css('h2>a::text').extract_first(), 'answer': remove_tags( item.css('textarea.content::text').extract_first()), 'created': get_date(created_tag), 'last_updated': get_date(last_updated_tag), 'likes': int(item.css('span.js-voteCount::text').extract_first()), } next_href = response.css( '.zm-invite-pager span:last-of-type>a::attr(href)').extract_first( ) print(next_href) if next_href: yield response.follow(next_href, callback=self.parse)
def parse_post(self, response): print("parse_post ", response.url) text = ' '.join( filter(lambda s: 'function' not in s, response.css('div#story_body p').extract())) text = remove_tags(text) print("text", text) yield {"text": text}
def parse_item(self, response):
    txt = remove_tags(response.css('div.article').extract()[0])
    for ip, port, prot in re.findall(
            r'(\d+\.\d+\.\d+\.\d+):(\d+)@(HTTPS?)', txt):
        yield ProxyHunterItem(prot=prot, ip=ip, port=port)
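# The proxy pattern above is easy to sanity-check in isolation; every match
# yields an (ip, port, protocol) triple. The addresses below are illustrative:
import re

sample = "Fresh list: 203.0.113.5:8080@HTTP 198.51.100.7:443@HTTPS"
print(re.findall(r'(\d+\.\d+\.\d+\.\d+):(\d+)@(HTTPS?)', sample))
# [('203.0.113.5', '8080', 'HTTP'), ('198.51.100.7', '443', 'HTTPS')]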
def parse_list_page(self, response):
    para = ""
    for sub_block in response.css('div.content-body'):
        for p in sub_block.xpath('.//p'):
            para += remove_tags(p.get().strip())
    yield {
        "article": para,
    }
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    guid = hxs.select('//div[@class="postContent"]//h2//a//@href').extract()
    title = hxs.select('//div[@class="postContent"]//h2//a//text()').extract()
    content = hxs.select('//div[@class="postContent"]').extract()
    date = hxs.select('//small[@class="timeStamp"]').extract()

    counter = 0
    items = []
    for it in guid:
        item = MassItem()
        item['guid'] = it
        item['date'] = datetime.strptime(
            re.sub(r'^\n[ ]+', '',
                   re.sub(r'[ ]+$', '', remove_tags(date[counter]))),
            '%b %d, %Y%H:%M %p')  # .strftime('%Y-%m-%d %H:%M')
        item['content'] = remove_tags(content[counter])
        item['content'] = item['content'].replace("\n", " ")
        item['title'] = title[counter]
        counter = counter + 1
        items.append(item)
    return items
def parse_item(self, response):
    paragraphs = response.selector.xpath("//p").extract()
    item = Sports4Item()
    item['feed'] = "".join(
        remove_tags(paragraph).encode('utf-8') for paragraph in paragraphs)
    yield item
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = self.base_url if self.base_url else response_url

    def clean_url(u):
        return urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))

    def clean_text(t):
        return replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text))
                    for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
def parse(self, response):
    if re.match('https://www.homes.co.jp/distil.+', response.url):
        time.sleep(5)
        yield scrapy.Request(url=response.request.meta['redirect_urls'][0],
                             callback=self.parse,
                             dont_filter=True)
    else:
        railway_boxes = response.css('div.mod-checkList.rosen.fitting fieldset')
        for railway_box in railway_boxes:
            railway_company = remove_tags(
                railway_box.css('legend').extract_first())
            names_and_counts = railway_box.css('ul li label')
            for name_and_count in names_and_counts:
                item_loader = HomesRailwayItemLoader(item=HomesRailwayItem())
                bukken_count = remove_tags(
                    name_and_count.css('span').extract_first())
                railway_url = name_and_count.css('a::attr(href)').extract_first()
                railway_name = name_and_count.extract()
                railway_name = remove_tags(railway_name)
                railway_name = re.sub(r'\n|\s', '', railway_name)
                if railway_name is None:
                    railway_name = 'no_bukken'
                pref_name = re.sub('https://www.homes.co.jp/(.+)/line/$',
                                   r"\1", response.url)
                item_loader.add_value('web_site', 'HOMES')
                item_loader.add_value('pref_name', self.prefs[pref_name])
                item_loader.add_value('railway_company', railway_company)
                item_loader.add_value('railway', railway_name)
                item_loader.add_value('bukken_count', bukken_count)
                item_loader.add_value('url', railway_url)
                yield item_loader.load_item()
def process_item(self, item, spider):
    if item['title'] and item['author'] and item['date'] and item['text'] and item['link']:
        if not item['link'] in self.urls_seen:
            # ''.join keeps this working on Python 3, where filter returns an
            # iterator rather than a string.
            item['text'] = remove_tags(
                remove_tags_with_content(
                    replace_escape_chars(''.join(
                        filter(lambda x: x in string.printable,
                               item['text'][25:]))),
                    which_ones=('div', 'img', 'script')))
            item['title'] = ''.join(
                filter(lambda x: x in string.printable, item['title']))
            self.urls_seen.add(item['link'])
            return item
        else:
            raise DropItem('Duplicate item %s' % item)
    else:
        raise DropItem('Missing fields %s' % item)
def parse(self, response):
    # The hrefs are re-extracted per container below, so no separate
    # whole-page extraction is needed.
    containers = response.selector.xpath(
        '//div[contains(@class, "c-container")]')
    for container in containers:
        href = container.xpath('h3/a/@href').extract()[0]
        title = remove_tags(container.xpath('h3/a').extract()[0])
        c_abstract = container.xpath(
            'div/div/div[contains(@class, "c-abstract")]').extract()
        abstract = ""
        if len(c_abstract) > 0:
            abstract = remove_tags(c_abstract[0])
        request = scrapy.Request(href, callback=self.parse_url)
        request.meta['title'] = title
        request.meta['abstract'] = abstract
        yield request
def process_item(self, item, spider):
    if 'RemoveTagsPipeline' not in getattr(spider, 'pipelines', []):
        return item

    fields = ['episodeDescription', 'brandDescription', 'channelDescription']
    for field in fields:
        if field in item:
            log.msg('RemoveTagsPipeline %s' % (field), level=log.DEBUG)
            if item[field]:
                html = item[field][0]
                # str.replace returns a new string, so the result must be
                # reassigned for the CDATA markers to actually be stripped.
                html = html.replace('<![CDATA[', '')
                html = html.replace(']]>', '')
                item[field] = [remove_tags(html)]
    return item
def parse_item(self, response):
    self.log('This is a wood ID page %s' % response.url)
    hxs = HtmlXPathSelector(response)
    DBitem = WoodDbItem()
    values = []
    description = hxs.select('//td/p[@style="text-align: left;"]').extract()
    for i in description:
        values.append(re.split(':', remove_tags(i))[1])
    DBitem['name'] = values[0]
    DBitem['latin'] = values[1]
    DBitem['distribution'] = values[2]
    DBitem['size'] = values[3]
    DBitem['density'] = values[4].split()[0]
    DBitem['sg'] = values[5]
    DBitem['janka'] = values[6]
    DBitem['MoR'] = values[7]
    DBitem['EM'] = values[8].split()[0]
    DBitem['crush_strength'] = values[9]
    DBitem['shrink'] = values[10]
    return DBitem
def parse_post(self, response):
    print("parse_post ", response.url)
    text = response.css('div.apos-rich-text-item.apos-item').extract()[0]
    text = remove_tags(text)
    print("text", text)
    yield {"text": text}
def parse_topic(self, response):
    data = CloudFormationObjectDataItem()

    # Save the url.
    data['url'] = response.request.url

    # Get the title of this resource.
    resource_type = response.xpath("//h1[@class='topictitle']/text()").extract()
    data['topic_title'] = resource_type[0]

    # Clean up the resource name.
    resource_name = unicode(data['topic_title'])
    resource_name = resource_name.replace('\n', ' ')
    resource_name = resource_name.replace('::', ' ')
    resource_name = resource_name.replace(u'\xa0', ' ')  # non-breaking space
    resource_name = re.sub(" +", " ", resource_name)
    resource_name = resource_name.replace(' ', '_')
    data['resource_name'] = resource_name
    data['ruby_comment_name'] = resource_name.replace('-', "::")
    data['ruby_class_name'] = "".join(resource_name.split("_")[1:])
    data['ruby_class_name_underscore'] = "_".join(
        resource_name.lower().split("_")[1:])

    data['variables'] = []
    data['list_variables'] = []
    mixins = set()

    # Iterate over the lists of variables.
    variablelists = response.xpath("//div[@class='variablelist']/dl")
    for variablelist in variablelists:
        variable_titles = variablelist.xpath("dt")
        variable_descriptions = variablelist.xpath("dd")
        if len(variable_titles) == len(variable_descriptions):
            # Iterate over the variables in this list.
            for i in range(len(variable_titles)):
                variable_data = {}
                variable_name = remove_tags(
                    variable_titles[i].xpath("span").extract()[0])
                variable_name = variable_name.replace(u'\xa0', ' ')  # non-breaking space
                variable_data["name"] = variable_name
                variable_data["name_underscore"] = inflection.underscore(
                    variable_name)

                paragraphs = variable_descriptions[i].xpath("p").extract()
                for paragraph in paragraphs:
                    clean_paragraph = remove_tags(paragraph)
                    if clean_paragraph.startswith("Type:"):
                        variable_data["type"] = clean_paragraph.replace(
                            "Type:", "", 1).strip()
                    elif clean_paragraph.startswith("Type"):
                        variable_data["type"] = clean_paragraph.replace(
                            "Type", "", 1).strip()
                    elif clean_paragraph.startswith("Required:"):
                        variable_data["required"] = clean_paragraph.replace(
                            "Required:", "", 1).strip()
                    elif clean_paragraph.startswith("Required"):
                        variable_data["required"] = clean_paragraph.replace(
                            "Required", "", 1).strip()

                # If there is no type or no required flag, set some defaults.
                if "type" not in variable_data:
                    variable_data["type"] = "String"
                if "required" not in variable_data:
                    variable_data["required"] = "No"
                # TODO: keep track of entries without a type or required element.

                # Is it tags?
                if variable_data["name"] == "Tags":
                    mixins.add("Taggable")

                # Is it a list?
                lower_type = variable_data["type"].lower()
                if (lower_type.startswith("list of ")
                        or lower_type.startswith("a list of ")
                        or lower_type.endswith(" list")):
                    data["list_variables"].append(variable_data)
                else:
                    data["variables"].append(variable_data)

    # Clean up the list of mixins.
    data['mixins'] = list(mixins)
    yield data
def parse_post(self, response):
    print("parse_post ", response.url)
    text = response.css('div.post-body.wysiwyg').extract()[0]
    text = remove_tags(text)
    print("text", text)
    yield {"text": text}
def parse_post(self, response):
    print("parse_post ", response.url)
    text = response.css('article').extract()[0]
    text = remove_tags(text)
    print("text", text)
    yield {"text": text}
def test_remove_tags(self):
    # make sure it always returns unicode
    assert isinstance(remove_tags('no tags'), unicode)
    assert isinstance(remove_tags('no tags', which_ones=('p',)), unicode)
    assert isinstance(remove_tags('<p>one tag</p>'), unicode)
    assert isinstance(remove_tags('<p>one tag</p>', which_ones=('p')), unicode)
    assert isinstance(remove_tags('<a>link</a>', which_ones=('b',)), unicode)

    # text without tags
    self.assertEqual(remove_tags(u'no tags'), u'no tags')
    self.assertEqual(remove_tags(u'no tags', which_ones=('p', 'b',)), u'no tags')

    # text with tags
    self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
    self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)),
                     u'<p>one p tag</p>')
    self.assertEqual(
        remove_tags(u'<b>not will removed</b><i>i will removed</i>',
                    which_ones=('i',)),
        u'<b>not will removed</b>i will removed')

    # text with tags and attributes
    self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>'),
                     u'texty')
    self.assertEqual(
        remove_tags(u'<p align="center" class="one">texty</p>',
                    which_ones=('b',)),
        u'<p align="center" class="one">texty</p>')

    # text with empty tags
    self.assertEqual(remove_tags(u'a<br />b<br/>c'), u'abc')
    self.assertEqual(remove_tags(u'a<br />b<br/>c', which_ones=('br',)), u'abc')

    # test keep arg
    self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('br',)),
                     u'a<br />b<br/>c')
    self.assertEqual(remove_tags(u'<p>a<br />b<br/>c</p>', keep=('p',)),
                     u'<p>abc</p>')
    self.assertEqual(
        remove_tags(u'<p>a<br />b<br/>c</p>', keep=('p', 'br', 'div')),
        u'<p>a<br />b<br/>c</p>')
def parse(self, data):
    return remove_tags(data).strip()
def parse(self, data):
    if type(data) not in [str, unicode]:
        data = str(data)
    return remove_tags(data).strip()
def parse(self, response):
    n_page = 0
    text = remove_tags(response.css("div.col-md-12.editor-block").extract()[0])
    print("text", text)
    yield {"text": text}
def remove_html(text):
    return remove_tags(text)
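# For reference, w3lib.html.remove_tags -- the helper wrapped above and used
# throughout these examples -- strips markup while keeping the text; its
# which_ones/keep arguments choose which tags to drop or preserve:
from w3lib.html import remove_tags

print(remove_tags('<p>one <b>bold</b> word</p>'))          # one bold word
print(remove_tags('<p>a<br/>b</p>', keep=('br',)))         # a<br/>b
print(remove_tags('<p>a<br/>b</p>', which_ones=('br',)))   # <p>ab</p>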
def parse_post(self, response):
    print("parse_post ", response.url)
    text = ''.join(response.css('div.node > div.content > p').extract())
    text = remove_tags(text)
    print("text", text)
    yield {"text": text}
def parse(self, response):
    n_page = 0
    text = remove_tags(response.css("article.PageArticle").extract()[0])
    print("text", text)
    yield {"text": text}
def parse(self, response):
    n_page = 0
    text = remove_tags(response.css("#listin").extract()[0])
    print("text", text)
    yield {"text": text}