def parse(self, response):
    ts = time.time()
    html_name = 'txt/artist/artist' + str(ts) + '.txt'
    file = codecs.open(html_name, 'w+', 'utf-8')
    # file.write(response.url)
    # file.write('\n')
    for body in response.css('div.layoutSingleColumn h3').extract():
        body = body.encode(response.encoding)
        body = remove_tags(body)
        print "Header"
        print(body)
        try:
            file.write(body)
        except AttributeError:
            print(AttributeError)
            sys.exit(0)
    for body in response.css('div.layoutSingleColumn p').extract():
        body = body.encode(response.encoding)
        body = remove_tags(body)
        print "Paragraph"
        print(body)
        try:
            file.write(body)
        except AttributeError:
            print(AttributeError)
            sys.exit(0)
    file.close()

def test_remove_tags(self):
    # text with tags
    self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
    self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)),
                     u'<p>one p tag</p>')
    self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                     u'<b>not will removed</b>i will removed')

def parse(self, response):
    max_position = ''
    koma = ','
    headers = response.headers
    itemselector = Selector(response).xpath('//div[@class="content"]')
    if headers['Content-Type'] == 'application/json;charset=utf-8':
        data = json.loads(response.body)
        itemselector = Selector(text=data['items_html']).xpath('//div[@class="content"]')
        max_position = data['min_position']
    yield Request(
        "https://twitter.com/i/search/timeline?f=tweets&vertical=default"
        "&q=%22demam%20berdarah%22%20OR%20dbd%20OR%20dhf%20OR%20%22dengue%20fever%22"
        "%20OR%20%22dengue%20hemorrhagic%22%20OR%20%22sakit%20db%22%20lang%3Aid"
        "%20since%3A" + self.start + "%20until%3A" + self.end +
        "&src=typd&include_available_features=1&include_entities=1"
        "&max_position=" + max_position + "&reset_error_state=false",
        callback=self.parse, method="GET",)
    for sel in itemselector:
        self.index += 1
        item = TwitterscrapingItem()
        item['index'] = self.index
        item['userid'] = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-header"]/a/@data-user-id').extract()))
        item['username'] = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-header"]/a/span[@class="username js-action-profile-name"]/b/text()').extract()))
        item['fullname'] = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-header"]/a/strong/text()').extract()))
        text_tweet = ''.join(map(unicode.strip, sel.xpath(
            'p[@class="TweetTextSize js-tweet-text tweet-text"]').extract()))
        item['text_tweet'] = remove_tags(text_tweet).replace('\n',' ').replace('\u',' ')
        item['original_text_tweet'] = text_tweet
        hash_tags = koma.join(map(unicode.strip, sel.xpath(
            'p[@class="TweetTextSize js-tweet-text tweet-text"]'
            '/a[@class="twitter-hashtag pretty-link js-nav"]').extract()))
        item['hash_tags'] = remove_tags(hash_tags)
        item['time_tweet'] = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-header"]/small[@class="time"]/a/@title').extract()))
        item['lang'] = ''.join(map(unicode.strip, sel.xpath(
            'p[@class="TweetTextSize js-tweet-text tweet-text"]/@lang').extract()))
        retweets = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-footer"]'
            '/div[@class="ProfileTweet-actionList js-actions"]'
            '/div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]'
            '/button[@class="ProfileTweet-actionButton js-actionButton js-actionRetweet"]'
            '/div[@class="IconTextContainer"]').extract()))
        item['retweets'] = remove_tags(retweets).strip()
        favorite = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-footer"]'
            '/div[@class="ProfileTweet-actionList js-actions"]'
            '/div[@class="ProfileTweet-action ProfileTweet-action--favorite js-toggleState"]'
            '/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]'
            '/div[@class="IconTextContainer"]').extract()))
        item['favorite'] = remove_tags(favorite).strip()
        item['place_id'] = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/@data-place-id').extract()))
        item['place'] = ''.join(map(unicode.strip, sel.xpath(
            'div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/span[@class="u-hiddenVisually"]/text()').extract()))
        item['max_position'] = max_position
        yield item

def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_tags(b'no tags'), six.text_type)
    assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
    assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
    assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
    assert isinstance(remove_tags(u'no tags'), six.text_type)
    assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
    assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
    assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)

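The two test functions above pin down the w3lib.html helpers that the remaining snippets rely on: remove_tags(text, which_ones=(), keep=(), encoding=None) strips markup while keeping the text, and remove_tags_with_content(text, which_ones=(), encoding=None) drops the listed tags together with their contents. A minimal standalone sketch of the usual combination (the sample HTML string is made up for illustration):

from w3lib.html import remove_tags, remove_tags_with_content

sample = u'<div><script>alert(1)</script><p>Hello <b>world</b></p></div>'  # hypothetical input
# First drop <script> elements together with their contents...
cleaned = remove_tags_with_content(sample, which_ones=('script',))
# ...then strip the remaining markup, optionally keeping selected tags.
print(remove_tags(cleaned))               # -> Hello world
print(remove_tags(cleaned, keep=('b',)))  # -> Hello <b>world</b>
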
def parse(self, response):
    hrefs = response.selector.xpath('//div[contains(@class, "c-container")]/h3/a/@href').extract()
    containers = response.selector.xpath('//div[contains(@class, "c-container")]')
    for container in containers:
        href = container.xpath('h3/a/@href').extract()[0]
        title = remove_tags(container.xpath('h3/a').extract()[0])
        c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
        abstract = ""
        if len(c_abstract) > 0:
            abstract = remove_tags(c_abstract[0])
        request = scrapy.Request(href, callback=self.parse_url)
        request.meta['title'] = title
        request.meta['abstract'] = abstract
        yield request

def parse_item(self, response):
    province = response.css('.dqwz>a:last-child::attr(title)').re_first(ur'2017年(.+?)省?本科')
    school = response.css('.nr>h2::text').extract_first()
    count = len(response.xpath('//div[@id="ivs_content"]/table//tr[1]/td').extract())
    for row in response.xpath('//div[@id="ivs_content"]/table//tr[position()>1]'):
        fields = [remove_tags(i).strip() for i in row.css('td').extract()]
        if count == 4:
            del fields[0]
        if len(fields) == 3:
            rowspan_count = [e.css('::attr(rowspan)').extract_first(1) for e in row.css('td')][-3:]
            rowspan_value = fields
            rowspans = len([i for i in rowspan_count if i > 1])
        elif len(fields) + rowspans == 3:
            new_fields = []
            fields.reverse()
            for k, v in zip(rowspan_count, rowspan_value):
                if k == 1:
                    new_fields.append(fields.pop())
                else:
                    new_fields.append(v)
            fields = new_fields
        else:
            continue
        yield ShmecItem(
            province=province,
            school=school,
            major=fields[0],
            require=fields[1],
            remark=fields[2],
        )

def make_it_clean(line):
    '''
    Clean the text of HTML tags, CSS styles and JavaScript.
    (string) line -- input text
    '''
    cleari = remove_tags(line)
    soline = re.compile("(\<.+\n)", re.DOTALL)
    boline = re.compile("(.+\>)", re.DOTALL)
    alline = re.compile("\<.+\>", re.DOTALL)
    cleari = re.sub(soline, '', cleari)
    cleari = re.sub(boline, '', cleari)
    cssline = re.compile(r"\{.+\}{1}", re.DOTALL)
    cleari = re.sub(cssline, ' ', cleari)
    cleari = re.sub("async=\"async\"\n", '', cleari)
    cleari = re.sub("src=.+\"", '', cleari)
    cleari = re.sub("var\s_.+\)", '', cleari)
    cleari = re.sub("function.+\"\)", '', cleari)
    cleari = re.sub("document.+\);", " ", cleari)
    cleari = re.sub("function.+\)", " ", cleari)
    cleari = re.sub("«", " «", cleari)
    cleari = re.sub("»", "» ", cleari)
    cleari = re.sub("&rarr;", "→", cleari)
    cleari = re.sub(r'&nbsp;', ' ', cleari)
    cleari = re.sub(r'(—)|(–)', '-', cleari)
    cleari = re.sub(r'\t{2,}', ' ', cleari)
    cleari = re.sub(r'\s{2,}', ' ', cleari)
    cleari = re.sub(r'\n{2,}', '\n', cleari)
    cleari = re.sub(r"(\<\!\-\-.*\-\-\>)", '', cleari)
    return cleari

def parse(self, response):
    s = Selector(response)
    next_link = s.xpath('//div[@class="w-button-more"]/a/@href').extract()
    if len(next_link):
        yield Request("https://mobile.twitter.com" + next_link[0], callback=self.parse)
    itemselector = Selector(response).xpath('//*[@id="main_content"]/div/div[3]/table')
    # regex = re.compile(r"([\\]+u\d*)", re.MULTILINE)
    for sel in itemselector:
        self.index += 1
        item = TwitterscrapingItem()
        item['index'] = self.index
        item['username'] = ''.join(
            map(unicode.strip, sel.xpath('tr[1]/td[2]/a/div/text()').extract()))
        tweet = remove_tags(''.join(
            map(unicode.strip, sel.xpath('tr[2]/td/div').extract()))
        ).replace('&amp;', '&').replace('&nbsp;', '').replace('\n ', '').replace('\n ', '').replace('\n', '').replace('\u', ' ')
        item['text_tweet'] = u'' + tweet
        item['original_tweet'] = ''.join(sel.xpath('tr[2]/td/div/div').extract())
        item['time_tweet'] = ''.join(
            map(unicode.strip, sel.xpath('tr[1]/td[3]/a/text()').extract()))
        item['url'] = ''.join(
            map(unicode.strip, sel.xpath('tr[2]/td/div/@data-id').extract()))
        item['data_id'] = ''.join(
            map(unicode.strip, sel.xpath('tr[3]/td/span[1]/a/@href').extract()))
        yield item

def parse_speech(self, response):
    paragraphs = response.css('p')[:-1]  # last p contains pagination
    text = remove_tags(''.join(paragraphs.extract()))
    l = ParlamentHuSpeechLoader(item=Speech(), selector=response,
                                scheme='parlament.hu/people')
    l.add_value('text', text)
    l.add_value('type', 'speech')
    l.add_value('sources', [response.url])
    l.add_xpath('position', '//b[1]/text()')
    l.add_xpath('video', '//table//tr[6]//td[2]/a/@href')
    l.add_xpath('creator_id', '//table//tr[2]//td[2]/a/@href',
                re=r'ogy_kpv\.kepv_adat\?p_azon=(\w\d+)')
    l.add_value('event_id', response.meta['event_id'])
    date = response.xpath('//table//tr[1]/th/text()').re(r'\d{4}\.\d{2}.\d{2}\.')
    time = response.meta.get('time')
    if date:
        date = date[0]
        if time:
            date += time[0]
        l.add_value('date', date)
    item = l.load_item()
    yield item
    if 'creator_id' in item:
        yield scrapy.Request(
            self.get_api_url(self.PERSON_ENDPOINT,
                             params={'p_azon': item['creator_id']['identifier']}),
            callback=self.parse_person,
            meta={'p_azon': item['creator_id']['identifier']})

def _extract_features(self, sel, item):
    description_xpath = '//div[@id="tab1"]/ul/li'
    data = sel.xpath(description_xpath).extract()
    if len(data) > 0:
        data = [remove_tags(v).strip().replace('&nbsp;', ' ').replace('&gt;', '>').strip() for v in data]
        data = filter(None, data)
        item['description'] = '<br>'.join(data)

def parse_linklist(text, remove_tags=False):
    data = []
    for row in text.split('\n'):
        rowparts = row.strip().split(' ')
        if len(rowparts) < 2:
            break
        time = rowparts[0]
        if rowparts[1].startswith('<') and rowparts[1].endswith('>'):
            url = rowparts[1][1:-1]
            textparts = rowparts[2:]
        else:
            url = ''
            textparts = rowparts[1:]
        text = ' '.join(textparts)
        if remove_tags:
            text = html.remove_tags(text)
        data.append({
            'time': time,
            'url': url,
            'text': text
        })
    return data

def _extract_description(self, sel, item):
    # NOTE: the bare return below short-circuits this method; the extraction code after it never runs
    return
    desc_xpath = '//div[@id="item-overview"]/ul/li/node()'
    data = sel.xpath(desc_xpath).extract()
    if len(data) != 0:
        data = [remove_tags(v.strip()) for v in data]
        description = ';'.join(data).replace(':;', ':').replace('from;', 'from ')
        item['description'] = description

def clean_tags_from_affiliations(value):
    """Clean the affiliation string for an author."""
    for affiliation in value.get('affiliations', []):
        # Remove tag AND content of any prefix like <label><sup>1</sup></label>
        affiliation['value'] = remove_tags_with_content(affiliation['value'], ('label',))
        # Now remove all tags but KEEP content
        affiliation['value'] = remove_tags(affiliation['value'])
        # Remove random whitespaces
        affiliation['value'] = clean_whitespace_characters(affiliation['value'])
    return value

def _extract_links(self, response_text, response_url, response_encoding):
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]

def process_response(self, request, response, spider):
    # clean body
    orig_body = response.body_as_unicode()
    body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
    body = remove_tags(remove_comments(body))
    terms = tokenize(body.lower())
    request.meta['terms'] = terms
    request.meta['body'] = body
    return response

def parse(self, response):
    """ Parse the response page """
    # Skip error URLs
    if response.status != 200:
        return
    data = json.loads(response.text)
    title = data['title']
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')
    description = data['description']
    data = data['content']
    # Remove <script>, <sup>, <math> tags with the content
    paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
    # Remove the rest of the tags without removing the content
    paragraph = remove_tags(paragraph)
    # Replace &amp; with &
    paragraph = paragraph.replace('&amp;', '&')
    # Replace &#39; with '
    paragraph = paragraph.replace('&#39;', "'")
    paragraph = paragraph.replace('’', "'")
    paragraph = paragraph.replace('“', "'")
    paragraph = paragraph.replace('”', "'")
    # Replace any remaining HTML entities with a space
    paragraph = re.sub("&.....;", ' ', paragraph)
    paragraph = re.sub("&....;", ' ', paragraph)
    # Replace 'U.S.' with 'US':
    paragraph = paragraph.replace('U.S.', 'US')
    # Some more replacements to improve the default tokenization
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\t', '')
    text = title + '\n\n' + description + '\n\n' + paragraph
    # Create the directory
    dirname = 'data/qplum'
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    elif not os.path.isdir(dirname):
        os.remove(dirname)
        os.makedirs(dirname, exist_ok=True)
    # Save the title and the text both
    filename = '{}/{}'.format(dirname, title)
    f = open(filename, 'w')
    f.write(text)
    f.close()

def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding), clean_text(text))
            for url, _, text in links_text]

def _extract_description(self, sel, item):
    description_xpath = '//div[@class="description"]/ul/li/node()'
    data = sel.xpath(description_xpath).extract()
    if len(data) > 0:
        data = [remove_tags(v.strip()) for v in data]
        description = ''
        for index, desc in enumerate(data):
            if index % 2 == 0:
                description += desc
            else:
                description += desc + ';'
        item['description'] = description

def parse_item(self, response):
    if(collect().count() < 10000):
        # print '*******', response.url
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@id='articleNew']/h1/text()").extract()
        if len(titles) == 0:
            return
        title = ''.join(titles).strip()
        txts = hxs.select("//div[@id='articleNew']/p").extract()
        conteudo = remove_comments(remove_tags(''.join(txts)))
        i = Artigo()
        i['url'] = response.url
        i['nome'] = title
        i['conteudo'] = conteudo
        # opiniao = {"url": response.url, "nome": title, "conteudo": conteudo}
        opiniao2 = {"conteudo": conteudo}
        # collect().insert(opiniao)  # "Opinioes" collection: all opinions collected from the Painel do Leitor
        ##########################################################################################
        # Filter by content and route to different collections
        # Filter by content and save to a file
        arqfile = leitorOpiniao
        frase = conteudo.split()
        if "Dilma" in frase:
            # database()['dilma'].insert(opiniao2)  # "dilma" collection
            arqfile = leitorDilma
        elif "Copa" in frase:
            # database()['copa'].insert(opiniao2)  # "copa" collection
            arqfile = leitorCopa
        elif "Palmeiras" in frase:
            # database()['palmeiras'].insert(opiniao2)  # "palmeiras" collection
            arqfile = leitorPalmeiras
        arq = open(arqfile, 'a')
        arq.writelines(str(opiniao2))
        arq.close()
        # yield i
        # Status message on screen
        print '##########################################################'
        # print ("TOTAL DE OPINIOES: %d" % collect().count())
        print ("Salvando em %s " % arqfile)
        print '##########################################################'
    else:
        print 'Fim de scraping leitor'
        exit()

def _extract_features(self, sel, item):
    description_xpath = '//ul[@class="ul_description"]/li/node()'
    data = sel.xpath(description_xpath).extract()
    if len(data) > 0:
        data = [remove_tags(v).strip().replace('&nbsp;', ' ').replace('&gt;', '>') for v in data]
        data = filter(None, data)
        description = ''
        for index, desc in enumerate(data):
            if index % 2 == 0:
                description += desc
            else:
                description += desc + ';'
        item['description'] = description

def _extract_description(self, sel, item):
    description_xpath = '//div[@class="itemAttr"]/div[@class="section"]/table//tr/node()'
    data = sel.xpath(description_xpath).extract()
    if len(data) > 0:
        data = [remove_tags(v).strip().replace('\t', '').replace('\n', '') for v in data]
        data = filter(None, data)
        description = ''
        for index, desc in enumerate(data):
            if index % 2 == 0:
                description += desc
            else:
                description += desc + ';'
        item['description'] = description

def parse(self, response):
    if response.url.find('redirect') > 0:
        print('Invalid url')
        return
    title = text_clear(text_strip(response.css('h1::text').extract_first()))
    body = text_clear(text_strip(response.css('.index_center .content::text').extract_first()))
    category = text_strip(response.css('.breadCrumbList li:nth-child(2) a b::text').extract_first())
    sub_category = text_strip(response.css('.breadCrumbList li:nth-child(3) a b::text').extract_first())
    if category:
        category = category.replace('问答', '')
    if sub_category:
        sub_category = sub_category.replace('问答', '')
    question_item = Question()
    question_item['title'] = title
    question_item['body'] = body
    question_item['category'] = category if category else 'N/A'
    question_item['sub_category'] = sub_category if sub_category else 'N/A'
    question_item['source_name'] = '17house.com'
    question_item['source_url'] = response.url
    question_item['entry_url'] = response.url
    answers = response.css('.list .top').extract()
    answers = map(text_strip, answers)
    answers = map(text_clear, answers)
    answers = filter(text_filter, answers)
    answer_items = []
    for answer in answers:
        answer_body = remove_tags(answer, keep=['br'])
        if not answer_body:
            continue
        answer_item = Answer()
        answer_item['body'] = answer_body
        answer_items.append(answer_item)
    if len(answer_items) == 0:
        print("No Answer")
        return
    question_item['answers'] = answer_items
    print(question_item)

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    curr_url = response.url
    txt = hxs.select('//body')
    if txt:
        txt = remove_tags(txt.extract()[0])
        self.db.add_to_index(curr_url, txt)
        # for word in self.db.separate_words(txt): print word
    urls = hxs.select('//a[contains(@href,".html")]/@href')
    if urls:
        urls = urls.extract()
        # self.db.commit()
        for url in urls:
            if url.find("'") != -1:
                continue
            url = url.split('#')[0]
            if url[0:4] != 'http':
                url = '%s%s' % (base_url, url)
            if urlparse.urlsplit(url)[1].split(':')[0].startswith('www.newyorker.com'):
                link_text = remove_tags(url)
                self.db.add_link_ref(curr_url, url, link_text)
                yield Request(url, self.parse)

def parse_profile(self, response):
    title = response.xpath('//title/text()').extract()[0].strip()
    create_time = response.xpath('//em[@id="post-date"]/text()').extract()[0].strip()
    source = response.xpath('//a[@id="post-user"]/text()').extract()[0].strip()
    body = response.body.strip()
    tag_content = response.xpath('//div[@id="js_content"]').extract()[0].strip()
    content = remove_tags(tag_content).strip()
    item = WeixinItem()
    item['title'] = title
    item['create_time'] = create_time
    item['source'] = source
    item['body'] = body
    item['content'] = content
    return item

def parse_store_data(self, response):
    """
    Yield a GeojsonPointItem of the store's data
    """
    # Pull the data off the stores page
    store = json.loads(remove_tags(
        response.xpath('//script[@type="application/ld+json"]')[1:].extract()[0]))
    store_hours = self.hours(store)
    yield GeojsonPointItem(
        ref=store['url'],
        lat=store['geo']['latitude'],
        lon=store['geo']['longitude'],
        addr_full=store['address']['streetAddress'],
        city=store['address']['addressLocality'],
        postcode=store['address']['postalCode'],
        name=store['branchOf']['name'],
        phone=store['telephone'],
        opening_hours=store_hours
    )

def parse(self, response):
    post_links = response.xpath('//div[contains(@id, "CollapsiblePanel")]')
    for post in post_links:
        title = post.xpath('./div[@class="CollapsiblePanelTab"]/text()[normalize-space() and not(ancestor::em)]').get()
        description = post.xpath('./div[@class="CollapsiblePanelContent"]//text()[normalize-space()]').getall()
        description = [remove_tags(p).strip() for p in description]
        description = ' '.join(description).strip()
        date = post.xpath('./div[@class="CollapsiblePanelTab"]/em/text()').get()
        item = ItemLoader(item=PostedcorsItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)
        yield item.load_item()

def parse_post(self, response):
    title = response.xpath('//h1/text()').get()
    description = response.xpath(
        '//div[@class="post"]//text()[normalize-space()]').getall()
    description = [remove_tags(p).strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath(
        '//article[@class="intro center"]/time/text()').get()
    item = ItemLoader(item=DirektnabankarsItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()

def parse_post(self, response):
    title = response.xpath('//h1/text()[normalize-space()]').get()
    description = response.xpath(
        '//div[@class="contained-9 centered body-copy"]//text()[normalize-space() and not(ancestor::em)]'
    ).getall()
    description = [remove_tags(p).strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//em/text()').get()
    item = ItemLoader(item=ObsrsItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()

def save_artic_to_es(self):
    Tec_article = TechnologyType()
    Tec_article.title = self['title']
    Tec_article.link_url = self['link_url']
    Tec_article.content = remove_tags(self['content'])
    Tec_article.source = self['source']
    Tec_article.time = self['time']
    Tec_article.comment_num = self['comment_num']  # comment count
    Tec_article.read_num = self['read_num']  # read count
    Tec_article.meta.id = self['url_object_id']
    # gen_suggest expects an iterable of (text, weight) pairs, hence the trailing comma
    Tec_article.suggest = gen_suggest(Tec_article._doc_type.index, ((Tec_article.title, 10),))
    Tec_article.save()
    redis_cli.incr("BokeYuan")
    return

def parse(self, response):
    reported_date = dt.datetime.utcnow().isoformat()
    table = response.xpath('//table[@class="table bg-gray"]')
    for tbl in table:
        header = [hd for hd in tbl.xpath('./thead//th//text()').extract()]
        for idx, row in enumerate(tbl.xpath('.//tbody//tr')):
            row = [remove_tags(cell) for cell in row.xpath('./td').extract()]
            raw_item = {header[idx]: cell for idx, cell in enumerate(row)}
            raw_item.update(port_name='Mundra',
                            provider_name=self.provider,
                            reported_date=reported_date)
            yield normalize.process_item(raw_item)

def parse_post(self, response):
    title = response.xpath('//div[@class="newsDetail"]/h1/text()').get()
    description = response.xpath(
        '//div[@class="perex"]//text()|//div[@class="htmlText"]//text()[normalize-space() and not(ancestor::ul | ancestor::h3)]'
    ).getall()
    description = [remove_tags(p).strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//time/text()').get()
    item = ItemLoader(item=AxaskItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()

def processText(value):
    """Process to get text and clean specific characters.

    Arguments:
        value {string} -- input value

    Returns:
        string -- output value
    """
    if value:
        value = replace_escape_chars(value)
        value = remove_tags(value)
        value = value.replace('#ft5_slash#', '/').replace('\\/', '/')
        return value
    else:
        return ''

def parse_store_data(self, response):
    """
    Yield a GeojsonPointItem of the store's data
    """
    # Pull the data off the stores page
    store = json.loads(
        remove_tags(
            response.xpath('//script[@type="application/ld+json"]')[1:].extract()[0]))
    store_hours = self.hours(store)
    yield GeojsonPointItem(ref=store['url'],
                           lat=store['geo']['latitude'],
                           lon=store['geo']['longitude'],
                           addr_full=store['address']['streetAddress'],
                           city=store['address']['addressLocality'],
                           postcode=store['address']['postalCode'],
                           name=store['branchOf']['name'],
                           phone=store['telephone'],
                           opening_hours=store_hours)

def parse_info(self, response):
    html = response.text
    item = LiepinItem()
    title_res = re.compile(r'<h1 title="(.*)">')
    item['title'] = re.findall(title_res, html)[0]
    price_res = re.compile(r'<strong>(.*)</strong>\r\n.*<p class="cname">')
    item['price'] = re.findall(price_res, html)[0]
    data_res = re.compile(r'<p.class="msg.ltype".title="(.*)">')
    data = re.findall(data_res, html)[0]
    res = data.replace(' ', '')
    data_list = res.split('|')
    if len(data_list) == 4:
        item['site'] = data_list[0]
        if '无' in data_list[1]:
            item['experience'] = '无工作经验'
        else:
            item['experience'] = data_list[1]
        if '招' in data_list[2]:
            item['education'] = '无'
        else:
            item['education'] = data_list[2]
        item['time'] = data_list[3]
    else:
        item['site'] = data_list[0]
        item['experience'] = data_list[1]
        if '招' in data_list[2]:
            item['education'] = '无'
            item['time'] = data_list[3]
        else:
            item['education'] = data_list[2]
            item['time'] = data_list[4]
    description_res = re.compile(
        r'<div class="bmsg job_msg inbox">(.*)<div class="mt10">', re.S)
    description = re.findall(description_res, html)[0]
    item['description'] = remove_tags(description).strip()
    item['website'] = '51job'
    print(item['title'])
    print(item['price'])
    print(item['site'])
    print(item['experience'])
    print(item['education'])
    print(item['time'])
    print(item['description'])
    yield item

def parse_mesa(self, response):
    filename = "mesa_" + response.meta['mesa'] + '.html'
    with open(filename, 'wb') as f:
        f.write(response.body)
    item = OnpeCrawlerItem()
    ubigeo = response.xpath("//table[@class='table14']//tr[2]//td").extract()
    ubigeo = [remove_tags(i) for i in ubigeo]
    item['content_results'] = response.xpath(
        "//div[@class='contenido-resultados']").extract_first()
    item['department'] = ubigeo[0]
    item['province'] = ubigeo[1]
    item['district'] = ubigeo[2]
    item['local'] = ubigeo[3]
    item['address'] = ubigeo[4]
    mesa_info = response.xpath(
        "//table[@class='table15']//tr[2]//td/text()").extract()
    item['electors'] = mesa_info[0]
    item['voters'] = mesa_info[1]
    item['acta_status'] = mesa_info[2].strip()
    item['resolutions'] = response.xpath(
        '//div[contains(@class, "pbot30_acta")]/text()[3]').extract_first().strip()
    item['resolutions_note'] = response.xpath(
        '//div[contains(@class, "pbot30_acta")]/p[2]/text()').extract_first()
    votes = response.xpath(
        '//div[@class="cont-tabla1"]//td/text()').extract()
    item['votes_ppk'] = votes[3].strip()
    item['votes_fp'] = votes[5].strip()
    item['votes_blank'] = votes[7].strip()
    item['votes_null'] = votes[9].strip()
    item['votes_contested'] = votes[11].strip()
    item['votes_total'] = votes[13].strip()
    item['table_number'] = response.xpath(
        "//table[@class='table13']//td/text()").extract_first()
    item['copy_number'] = response.xpath(
        '//table[@class="table13"]//td/text()').extract()[1].strip()
    href = response.xpath('//a/@href').extract_first()
    item['acta_image_url'] = "{}/{}".format(self.start_url, href)
    filename = "acta_mesa_" + item['table_number'] + '.pdf'
    urlretrieve(item['acta_image_url'], filename)
    return item

def _parse_topic_response(self, response):
    """
    Parses various topics
    e.g. www.investopedia.com/terms/o/oddlottheory.asp
    """
    # Get the title first
    title = response.css('title::text').extract_first()
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')
    # Get the first div with class content
    content = response.css('div.content')
    if isinstance(content, list) and len(content) > 0:
        content = content[0]
    else:
        content = response.css('div.roth__content')[0]
    text = title + '\n\n'
    for child in content.xpath('//p'):
        # Get the text from this child <p></p> tag
        paragraph = child.extract()
        # Remove tags including <p> and <a>
        paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()
        # Replace '&amp;' with '&'
        paragraph = paragraph.replace('&amp;', '&')
        # Add to the file
        text += paragraph + '\n'
    # Create the directory
    dirname = 'data/investopedia'
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    elif not os.path.isdir(dirname):
        os.remove(dirname)
        os.makedirs(dirname, exist_ok=True)
    # Save the text
    name = response.url.split('/')[-1]
    filename = '{}/{}'.format(dirname, name)
    f = open(filename, 'w')
    f.write(text)
    f.close()

def parse_detail(self, response):
    """
    Parse an individual article page.
    :param response:
    :return:
    """
    url = response.url
    post_id = re.findall(r'p/(\d*)', url)
    if post_id:
        post_id = post_id[0]
    else:
        return
    blog_id = re.findall(r'currentBlogId=(\d*)', response.text)
    if blog_id:
        blog_id = blog_id[0]
    else:
        return
    cnblogs_item = TechnicalArticleItem()
    title = response.css('#cb_post_title_url::text').extract_first()
    publish_time = response.css('#post-date::text').extract_first()
    if publish_time:
        publish_time = datetime.datetime.strptime(publish_time, '%Y-%m-%d %H:%M')
    abstract = response.css('#cnblogs_post_body').extract_first()
    if abstract:
        abstract = remove_tags(abstract)[:300]
    cnblogs_item['url_object_id'] = get_md5(url)
    cnblogs_item['url'] = url
    cnblogs_item['title'] = title
    cnblogs_item['article_type'] = response.meta['article_type']
    cnblogs_item['data_source'] = '博客园'
    cnblogs_item['read_num'] = self.get_read_num(post_id)
    cnblogs_item['comment_num'] = self.get_comment_num(post_id)
    cnblogs_item['praise_num'] = 0
    cnblogs_item['collection_num'] = 0
    cnblogs_item['publish_time'] = publish_time
    cnblogs_item['abstract'] = remove_t_r_n(abstract)
    cnblogs_item['tags'] = self.get_tags(blog_id, post_id)
    yield cnblogs_item

def get_tags(self, blog_id, post_id):
    """
    Fetch the tags of a post.
    :param blog_id:
    :param post_id:
    :return:
    """
    res = requests.get(
        'https://www.cnblogs.com/mvc/blog/CategoriesTags.aspx?blogApp=quanxiaoha'
        '&blogId=%s&postId=%s' % (blog_id, post_id))
    try:
        json_dict = json.loads(res.text)
        tags = remove_tags(json_dict['Tags']).replace('标签: ', '')
    except Exception as e:
        tags = ''
    return tags

def parse_item(self, response):
    links = dict()
    link_titles = set()
    url = response.url.split('#')[0].lower()
    url_head = url.split('/pages/')[0] + '/pages/'
    title = response.xpath('//meta[@name="DC.title"]/@content').extract_first()
    if title and title.endswith('- NHS Choices'):
        title = title.rstrip(' NHS Choices').rstrip(' -')
    subjects = response.xpath('//meta[@name="DC.Subject"][@scheme="NHSC.Ontology"]/@content').extract_first().split(', ')
    subjects = [s.lower() for s in subjects if s]
    if not subjects:
        subjects = [title.lower()]
    description = clean_text(response.xpath('//meta[@name="DC.description"]/@content').extract_first())
    raw_page_content = response.xpath('//div[@class="main-content healthaz-content clear"]/.').extract_first()
    page_content = clean_text(replace_entities(remove_tags(raw_page_content)))
    for a in response.xpath('//div[@class="main-content healthaz-content clear"]/descendant::a'):
        label = a.xpath('text()').extract_first()
        href = a.xpath('@href').extract_first()
        if href and label:
            href = self.base_url + href.lstrip('/')
            href = href.lower()
            label = clean_text(label)
            if '/conditions/' in href and url_head not in href:
                link_titles.add(label)
                if href in links:
                    links[href]['count'] += 1
                else:
                    links[href] = {
                        'count': 1,
                        'label': label
                    }
            if url_head in href and href != url:
                print("********************", href)
                yield scrapy.Request(href, self.parse_item)
    article = NhsItem()
    article['url'] = url
    article['title'] = title
    article['subjects'] = subjects
    article['description'] = description
    article['page_content'] = str(page_content)
    article['links'] = links
    article['link_titles'] = list(link_titles)
    yield article

def parse(self, response):
    # remove <script> tags from <p> elements
    for text in response.css('p'):
        yield {
            'text': remove_tags(
                remove_tags_with_content(text.extract(), ('script', )))
        }
    # add new URLs that are descendants of the request URL (same domain)
    for next_page in response.css('div > a'):
        a_tag = next_page.extract()
        if "href=" in a_tag:
            link = (a_tag.split('href="')[1]).split('"')[0]
            if link.count("/") > 2:
                if link.split("/")[2] in response.request.url:
                    yield response.follow(next_page, self.parse)

def clean_data(self):
    try:
        self["praise_num"] = extract_num("".join(self["praise_num"]))
    except BaseException:
        self["praise_num"] = 0
    self["comments_num"] = extract_num("".join(self["comments_num"]))
    self["create_time"] = datetime.datetime.fromtimestamp(
        self["create_time"]).strftime(SQL_DATETIME_FORMAT)
    try:
        self["update_time"] = datetime.datetime.fromtimestamp(
            self["update_time"]).strftime(SQL_DATETIME_FORMAT)
    except:
        self["update_time"] = self["create_time"]
    self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
    self["content"] = remove_tags(self["content"])

def save_artic_to_es(self):
    Tec_article = TechnologyType()
    Tec_article.title = self['title']
    Tec_article.time = self['time']
    Tec_article.content = remove_tags(self['content'])
    Tec_article.link_url = self['link_url']
    Tec_article.meta.id = self['zhihu_id']
    Tec_article.source = self['source']
    Tec_article.tag = self['topic']
    Tec_article.read_num = self['click_num']
    Tec_article.comment_num = self['comment_num']
    Tec_article.suggest = gen_suggest(Tec_article._doc_type.index,
                                      ((Tec_article.title, 10), (Tec_article.tag, 7)))
    Tec_article.save()
    redis_cli.incr("zhihu")
    return

def parse2(self, response):
    try:
        timeout = WebDriverWait(self.driver, 10)
    except:
        print("Timed out waiting for page load.")
        self.driver.quit()
    title = Selector(response).xpath(
        '//div[@class="leftContainer"]/div/div/div/div/'
        'a/img[@id="coverImage"]/@alt')
    genre = Selector(response).xpath(
        '//div[@class="rightContainer"]/div/div/'
        'div[@class="bigBoxBody"]/div/div/div[@class="left"]/a/text()')
    rating = Selector(response).xpath(
        '//div[@class="leftContainer"]/div/div[@id="metacol"]/'
        'div[@id="bookMeta"]/span/span[@class="average"]/text()')
    reviews = Selector(response).xpath(
        '//div[@id="bookReviews"]/'
        'div[@class="friendReviews elementListBrown"]')
    for review in reviews:
        try:
            item = GoodreadsItem()
            item['title'] = title.extract()[0]
            item['rating'] = rating.extract()[0]
            item['book_url'] = response.meta['book_url']
            item['genre'] = genre.extract()[0]
            item['link_url'] = review.xpath(
                './/div/div/link/@href').extract()[0]
            item['reviewDate'] = review.xpath(
                './/div/div/div/div/a/text()').extract()[0]
            item['user'] = review.xpath(
                './/div/div/div/div/span/a/text()').extract()[0]
            review_text = review.xpath(
                './/div/div/div/'
                'div[@class="reviewText stacked"]/span/'
                'span[2]/text()').extract()[0]
            # remove html tags
            item['review'] = remove_tags(review_text)
        except IndexError as e:
            print(e, ": title: ", item['title'], "user: ", item['user'])
            logger.error(e.args[0])
            raise
        yield item

def save_to_es(self):
    news = NewsType()
    news.url = self['url']
    news.meta.id = self['url_md5']
    news.title = self['title']
    news.create_time = self['create_time']
    news.source = self['source']
    news.source_url = self['source_url']
    # this helper is pretty handy
    news.content = remove_tags(self['content'])
    news.suggest = gen_suggests(NewsType._doc_type.index, ((news.title, 10),))
    news.save()
    redis_cli.incr("news_count")
    return

def parse_post(self, response, date):
    print(response)
    title = response.xpath('//h1/text()').get()
    description = response.xpath(
        '//div[@class="new-item-content"]').getall()
    if description:
        comments = re.sub(r'<!--[\S\s]*?-->', '', str(description[0]))
        description = remove_tags(str(comments)).strip()
    else:
        description = ''
    item = ItemLoader(item=KnejaItem(), response=response)
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date.strip())
    return item.load_item()

def parse_detail(self, response):
    self.logger.info('Parse function called on %s', response.url)
    head = response.selector.css('.articleName')
    body = response.selector.css('.art_context')
    content = body.xpath('.//div[@class="art_contextBox"]').extract_first()
    if content is None:
        content = body.extract_first()
    article_item = ArticleItem(
        title=head.xpath('.//h1/text()').extract_first(),
        date_time=head.xpath('.//span[@class="pr20"]/text()').extract_first(),
        source=head.xpath('.//a[@rel="nofollow"]/text()').extract_first(),
        content=remove_tags(content))
    yield article_item

def parse_investimento_noticias(self, response, info):
    content = response.css('.itemIntroText p').getall() + \
        response.css('.itemFullText p').getall()
    content = ''.join([remove_tags(x) for x in content])
    date = response.css('time::attr(datetime)').get()
    date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S%z')
    news_obj = {
        'title': response.xpath('//header/h1/text()').get(),
        'content': content,
        'date': date,  # response.css('time::text').get().strip()
        'author': 'None',
        'url': info['url'],
        'site': info['site'],
        'tick': info['tick']
    }
    yield Noticia(news_obj)

def parse(self, response):
    for item in response.xpath("//div[@class='auction ']"):
        logging.debug('item:' + str(item))
        location = item.xpath(
            "normalize-space(.//button[contains(@class,'location-link')])").get()
        logging.debug('Location:' + str(location))
        # filter non SC items
        if not (bool(re.search('SC', location)) or bool(re.search('South Carolina', location))):
            logging.debug('SC NOT found: ')
            continue
        adItem = ClassifiedscraperItem()
        adItem.set_all(None)
        adItem['source'] = self.name
        adItem['location'] = location
        # response.request.url
        request_url_base = furl.furl(response.request.url).origin
        logging.debug('request_url_base:' + request_url_base)
        link_raw = item.xpath(".//a/@href").get(default='/not-found')
        item_link = request_url_base + link_raw
        logging.debug('link:' + item_link)
        adItem['link'] = item_link
        title_raw = remove_tags(
            item.xpath("normalize-space(.//p[@class='auctionTitle'])").get())
        logging.debug('title_raw:' + title_raw)
        adItem['title'] = title_raw
        # image_link_raw = remove_tags(item.xpath(".//img/@src").get())
        # logging.debug('image_link_raw: ' + image_link_raw)
        # adItem['image_link'] = image_link_raw
        # raw_post_date = item.xpath("normalize-space(.//div[contains(@class,'auctionLocation')]/span)").get()
        # logging.debug('raw_post_date:' + raw_post_date)
        # adItem['post_date'] = raw_post_date.split('ST')[0].strip()
        # raw_desc = item.xpath("normalize-space(.//div[@class='col-sm-12'])").get()
        # logging.debug('raw_post_date:' + raw_post_date)
        # adItem['description'] = raw_desc
        yield adItem

def save_to_es(self):
    article = ArticleType()
    article.title = self['title']
    article.create_date = self["create_date"]
    article.content = remove_tags(self["content"])
    article.front_image_url = self["front_image_url"]
    if "front_image_path" in self:
        article.front_image_path = self["front_image_path"]
    article.praise_nums = self["praise_nums"]
    article.fav_nums = self["fav_nums"]
    article.comment_nums = self["comment_nums"]
    article.url = self["url"]
    article.tags = self["tags"]
    article.meta.id = self["url_object_id"]
    article.save()
    return

def save_to_es(self):
    crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
    create_time = datetime.datetime.fromtimestamp(self["create_time"]).strftime(SQL_DATE_FORMAT)
    update_time = datetime.datetime.fromtimestamp(self["update_time"]).strftime(SQL_DATE_FORMAT)
    article = ZhihuAnswerType()
    article.meta.id = self['zhihu_id']
    article.url = self['url']
    article.question_id = self['question_id']
    article.content = remove_tags(self['content'])
    article.praise_num = self['praise_num']
    article.comments_num = self['comments_num']
    article.create_time = create_time
    article.update_time = update_time
    article.crawl_time = crawl_time
    article.save()
    return

def parse_plaintext_format(text):
    data = []
    for row in text.split('\n'):
        rowparts = row.strip().split(' ')
        if len(rowparts) < 2:
            break
        timestamp = rowparts[0]
        if rowparts[1].startswith('<') and rowparts[1].endswith('>'):
            url = rowparts[1][1:-1]
            textparts = rowparts[2:]
        else:
            url = ''
            textparts = rowparts[1:]
        text = remove_tags(' '.join(textparts))
        data.append({'timestamp': timestamp, 'url': url, 'text': text})
    return data

def parse(self, response):
    data = json.loads(response.text)
    next_page = data['payload']['nextPage']
    for post in data['payload']['articles']:
        title = post['title']
        date = post['createdAt']
        description = remove_tags(post['html'])
        item = ItemLoader(item=UnibankmkItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)
        yield item.load_item()
    if next_page:
        yield response.follow(base.format(next_page), self.parse)

def parse_tvn24bis(self, response):
    url = response.url
    art_id = url.split(',')[-1].split('.')[0]
    date = response.css(
        'article.detail header time::attr("datetime")').extract_first()
    date = date.split(' ')
    time = date[1][0:4]
    date = date[0]
    title = response.css("article.detail header h1 ::text").extract_first()
    title = replace_escape_chars(title).strip()
    lead = response.css("div.content p.lead ::text").extract_first()
    lead = replace_escape_chars(lead).strip()
    text = response.xpath(
        '//div[@class="content"]/p[not(contains(@clas, "rules") or contains(@clas, "footer"))]/text()'
    ).extract()
    text = ' || '.join(text)
    text = remove_tags(text)
    text = replace_escape_chars(text)
    text = clear_text(text)
    autor = response.css(
        "div.content div.footer ::text").extract()[1].split('/')
    if len(autor) > 1:
        source = autor[1]
        source = source.strip().replace('Źródło: ', '')
        autor = autor[0].strip().replace('Autor: ', '')
    else:
        source = ''
        autor = autor[0].strip().replace('Autor: ', '')
    yield {
        'id': art_id,
        'url': url,
        'date': date,
        'time': time,
        'title': ''.join(title),
        'lead': lead,
        'text': text,
        'autor': autor,
        'source': source
    }

def parse_post(self, response):
    title = response.xpath(
        '//div[@class="number_list pj"]/h3/text()').get()
    description = response.xpath(
        '//div[@class="number_list pj"]//text()[normalize-space() and not(ancestor::h3 | ancestor::i | ancestor::video)]'
    ).getall()
    description = [remove_tags(p).strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//i/text()').get()
    item = ItemLoader(item=NbsrsItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()

def parse_article(self, response):
    lalitem = LalItem()
    data = response.meta['data']
    lalitem['url'] = response.url
    lalitem['title'] = data['title']
    # lalitem['authors'] = data['author'].split('; ')
    lalitem['authors'] = data['author'].replace('; ', ', ')
    lalitem['doi'] = data['doi']
    lalitem['journal'] = response.css(
        '.article-journal-name li strong::text').extract_first('')
    if not lalitem['journal']:
        lalitem['journal'] = data['name'].split(',')[0]
    lalitem['year'] = int(data['years'])
    lalitem['keywords'] = ''
    lalitem['abs_img_url'] = response.css(
        'img[alt="Fig. 1"]::attr(data-src)').extract_first(default='')
    abstract_text = response.css('#articleBody p').extract()
    abstract_list = []
    if abstract_text:
        # Some articles have no abstract; when one is present, its format also varies.
        for element in abstract_text:
            if '©' in element:
                break
            else:
                abstract_match = re.match(r'<.+?>(.*)</.+>', element, re.S)
                abstract_list.append(abstract_match.group(1))
    lalitem['abstract'] = ''.join(abstract_list)
    lalitem['_id'] = get_md5(lalitem['url'])
    lalitem['company'] = self.name
    # Request glgoo to fetch the citation count
    glgoo_url = 'https://xs.glgoo.top/scholar?'
    headers = {
        'User_Agent': 'Mozilla/5.0',
        'Referer': 'https://gf1.jwss.site/'
    }
    yield Request(url=glgoo_url + urlencode({'q': remove_tags(lalitem['title'])}) +
                  '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                  headers=headers,
                  meta={'lalitem': lalitem},
                  dont_filter=True,
                  callback=self.get_citation)

def parse_item(self, response):
    item = ScifibotItem()
    # clean body
    orig_body = response.body_as_unicode()
    body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
    body = remove_tags(remove_comments(body))
    tokens = tokenize(body.lower())
    # decide if the page is interesting
    if not is_relevant(tokens):
        stats.inc_value('scifi/filtered_out')
        # probably not scifi page
        return
    item['keywords'] = tokens
    item['page'] = orig_body
    item['url'] = response.url
    return item

def parse_item(self, response):
    item = ItemLoader(item=OlxAdItem(), response=response)
    item.default_output_processor = TakeFirst()
    # address
    item.add_value('address', response.url)
    # title
    item.add_xpath('title', '//div[@class="offer-titlebox"]/h1/text()')
    # id of advertising
    item.add_xpath(
        'number',
        '//div[@class="offer-titlebox__details"]/em/small/text()',
        re='(\d+)')
    # info
    th = response.xpath('//table[@class="item"]//th/text()').extract()
    td = [
        re.sub('([\t\n]+)', ', ', remove_tags(s).strip())
        for s in response.xpath(
            '//table[@class="item"]//td[@class="value"]/strong').extract()
    ]
    item.item.setdefault('info', {th: td for th, td in zip(th, td)})
    # price if it exists, check https://www.olx.ua/rabota/
    price = response.xpath(
        '//div[@class="price-label"]/strong/text()').get()
    # TODO: handle the currency so everything is reported in hryvnia
    item.add_value('price', price.strip() if price is not None else 'Бесплатно')
    # phone TODO
    phone = '+380'
    item.add_value('phone', phone)
    # description
    item.add_xpath('description', '//div[@id="textContent"]//text()',
                   Compose(MapCompose(str.strip), Join('. '),
                           lambda s: normalize('NFKC', s)),
                   re='[^\n\r]+')
    # images links
    item.add_xpath('images', '//*[@class="photo-glow"]/img/@src')
    yield item.load_item()

def parse_item(self, response):
    error_location = False
    error_category = False
    item = response.meta['item']
    item['title'] = response.xpath(
        '//a[@class="heading detail-title"]/@title').extract()[0]
    item['offer_id'] = response.xpath(
        '//div[@itemtype="http://schema.org/JobPosting"]/@id').extract()[0]
    item['lang_code'] = 'en-US'
    item['date'] = self.dt.strftime('%Y%m%d')
    item['description'] = remove_tags(
        response.xpath('//div[@itemprop="description"]').extract()[0])
    item['location_name'] = response.xpath(
        '//span[@itemprop="name"]/text()').extract()[0]
    item['category_name'] = response.xpath(
        '//span[@itemprop="occupationalCategory"]/text()').extract()[0]
    # GEONAME MANAGEMENT
    try:
        item['geoname_id'] = self.geoCache.getGeonameId(item['location_name'])
        item['country_code'] = self.geoCache.getCountryCode(item['location_name'])
    except:
        error_message = "%s location not found in GeoName" % str(item['location_name'])
        print error_message
        error_location = True
        self.beBeeLogger.failure(item['offer_id'], error_message)
    # CATEGORY MANAGEMENT
    category_id = self.categoryMapper(item['category_name'])
    if category_id:
        item['category_id'] = category_id
    else:
        error_message = "category not found: %s" % str(item['category_name'])
        print error_message
        error_category = True
        self.beBeeLogger.failure(item['offer_id'], error_message)
    if not (error_location or error_category):
        self.beBeeLogger.success(item['offer_id'])
        return item

def parse_topic_response(self, response):
    """
    Parse the content
    """
    # Get the title first
    title = response.css('title::text').extract_first()
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')
    content = response.css('div#mw-content-text')
    # Just extract all the '<p></p>' children from this
    text = title + '\n\n'
    for child in content.xpath('//p'):
        # Get the text from this child <p></p> tag
        paragraph = child.extract()
        # Remove <script>, <sup>, <math> tags with the content
        paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)
        # Replace '&amp;' with '&'
        paragraph = paragraph.replace('&amp;', '&')
        # Add to the file
        text += paragraph + '\n'
    # Create the directory
    dirname = 'data/wikipedia'
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    elif not os.path.isdir(dirname):
        os.remove(dirname)
        os.makedirs(dirname, exist_ok=True)
    # Save the text
    name = response.url.split('/')[-1]
    filename = '{}/{}'.format(dirname, name)
    f = open(filename, 'w')
    f.write(text)
    f.close()

def parse_zhuanye(self, response):
    item = response.meta['item']
    majors = {}
    for e in response.css(u'.schoolIntro_con2>div'):
        cls = e.css('::attr(class)').extract_first()
        if cls == 'catTitle':
            cat1 = e.css('h2::text').re_first(u'开设(.+)专业')
            majors[cat1] = {}
        elif cls == 'majorCon':
            cat2 = e.css('h3::text').re_first(u'■ (.+)(')
            majors[cat1][cat2] = [remove_tags(i) for i in e.css('ul>li').extract()]
        else:
            pass
    item['majors'] = majors
    yield item
