Example #1
	def parse(self, response):
		ts = time.time()
		html_name = 'txt/artist/artist' + str(ts) + '.txt'
		file = codecs.open(html_name, 'w+', 'utf-8')
		
		# file.write(response.url)
		# file.write('\n')

		for body in response.css('div.layoutSingleColumn h3').extract():
			body = body.encode(response.encoding)
			body = remove_tags(body)
			print "Header"
			print(body)
			try:
				file.write(body)
			except AttributeError:
				print(AttributeError)
				sys.exit(0)

		for body in response.css('div.layoutSingleColumn p').extract():
			body = body.encode(response.encoding)
			body = remove_tags(body)
			print "Paragraph"
			print(body)
			try:
				file.write(body)
			except AttributeError:
				print(AttributeError)
				sys.exit(0)

		file.close()
Example #2
    def test_remove_tags(self):
        # text with tags
        self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
        self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)), u'<p>one p tag</p>')

        self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>i will removed')
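
The two filtering modes exercised by this test can be seen side by side in a minimal sketch (assuming w3lib is installed; which_ones removes only the listed tags, keep removes every tag except the listed ones, and the two arguments are not meant to be combined):

from w3lib.html import remove_tags

sample = u'<div><p>hello <b>world</b></p></div>'
remove_tags(sample)                      # -> u'hello world'
remove_tags(sample, which_ones=('b',))   # -> u'<div><p>hello world</p></div>'
remove_tags(sample, keep=('p',))         # -> u'<p>hello world</p>'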
Example #3
	def parse(self, response):
		max_position = ''
		koma = ','
		headers = response.headers
		itemselector = Selector(response).xpath('//div[@class="content"]')

		if headers['Content-Type'] == 'application/json;charset=utf-8':
			data = json.loads(response.body)
			itemselector = Selector(text=data['items_html']).xpath('//div[@class="content"]')
			max_position = data['min_position']
			yield Request("https://twitter.com/i/search/timeline?f=tweets&vertical=default&q=%22demam%20berdarah%22%20OR%20dbd%20OR%20dhf%20OR%20%22dengue%20fever%22%20OR%20%22dengue%20hemorrhagic%22%20OR%20%22sakit%20db%22%20lang%3Aid%20since%3A"+self.start+"%20until%3A"+self.end+"&src=typd&include_available_features=1&include_entities=1&max_position="+max_position+"&reset_error_state=false", 
					callback=self.parse, 
					method="GET",)
		
		for sel in itemselector:
			self.index += 1
			item = TwitterscrapingItem()
			item['index'] = self.index
			item['userid'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/@data-user-id').extract()))
			item['username'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/span[@class="username js-action-profile-name"]/b/text()').extract()))
			item['fullname'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/strong/text()').extract()))
			text_tweet = ''.join(
				map(unicode.strip, sel.xpath('p[@class="TweetTextSize  js-tweet-text tweet-text"]').extract()))
			item['text_tweet'] = remove_tags(text_tweet).replace('\n',' ').replace('\u',' ')
			item['original_text_tweet'] = text_tweet
			hash_tags = koma.join(
				map(unicode.strip, sel.xpath('p[@class="TweetTextSize  js-tweet-text tweet-text"]'
					'/a[@class="twitter-hashtag pretty-link js-nav"]').extract()))
			item['hash_tags'] = remove_tags(hash_tags)
			item['time_tweet'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/small[@class="time"]/a/@title').extract()))
			item['lang'] = ''.join(
				map(unicode.strip, sel.xpath('p[@class="TweetTextSize  js-tweet-text tweet-text"]/@lang').extract()))
			retweets = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-footer"]'
					'/div[@class="ProfileTweet-actionList js-actions"]'
					'/div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]'
					'/button[@class="ProfileTweet-actionButton  js-actionButton js-actionRetweet"]'
					'/div[@class="IconTextContainer"]').extract()))
			item['retweets'] = remove_tags(retweets).strip()
			favorite = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-footer"]'
					'/div[@class="ProfileTweet-actionList js-actions"]'
					'/div[@class="ProfileTweet-action ProfileTweet-action--favorite js-toggleState"]'
					'/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]'
					'/div[@class="IconTextContainer"]').extract()))
			item['favorite'] = remove_tags(favorite).strip()
			item['place_id'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/@data-place-id').extract()))	
			item['place'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/span[@class="u-hiddenVisually"]/text()').extract()))	
			item['max_position'] = max_position

			yield item
Example #4
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(remove_tags(b'no tags'), six.text_type)
     assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
     assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
     assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
     assert isinstance(remove_tags(u'no tags'), six.text_type)
     assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
     assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
     assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)
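
As the assertions above check, remove_tags accepts both bytes and unicode and always returns text; a minimal sketch of that behaviour (assuming the default utf-8 decoding of byte input):

from w3lib.html import remove_tags

remove_tags(b'<p>caf\xc3\xa9</p>')   # -> u'caf\xe9' (bytes in, unicode out)
remove_tags(u'<p>caf\xe9</p>')       # -> u'caf\xe9' (unicode passes through)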
Example #5
 def parse(self, response):
     hrefs = response.selector.xpath('//div[contains(@class, "c-container")]/h3/a/@href').extract()
     containers = response.selector.xpath('//div[contains(@class, "c-container")]')
     for container in containers:
         href = container.xpath('h3/a/@href').extract()[0]
         title = remove_tags(container.xpath('h3/a').extract()[0])
         c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
         abstract = ""
         if len(c_abstract) > 0:
             abstract = remove_tags(c_abstract[0])
         request = scrapy.Request(href, callback=self.parse_url)
         request.meta['title'] = title
         request.meta['abstract'] = abstract
         yield request
Example #6
    def parse_item(self, response):

        province = response.css('.dqwz>a:last-child::attr(title)').re_first(ur'2017年(.+?)省?本科')
        school = response.css('.nr>h2::text').extract_first()
        count = len(response.xpath('//div[@id="ivs_content"]/table//tr[1]/td').extract())
        for row in response.xpath('//div[@id="ivs_content"]/table//tr[position()>1]'):
            fields = [remove_tags(i).strip() for i in row.css('td').extract()]
            if count == 4:
                del fields[0]
            if len(fields) == 3:
                rowspan_count = [e.css('::attr(rowspan)').extract_first(1) for e in row.css('td')][-3:]
                rowspan_value = fields
                rowspans = len([i for i in rowspan_count if i > 1])
            elif len(fields) + rowspans == 3:
                new_fields = []
                fields.reverse()
                for k, v in zip(rowspan_count, rowspan_value):
                    if k == 1:
                        new_fields.append(fields.pop())
                    else:
                        new_fields.append(v)
                fields = new_fields
            else:
                continue

            yield ShmecItem(
                province=province,
                school=school,
                major=fields[0],
                require=fields[1],
                remark=fields[2],
            )
Example #7
def make_it_clean(line):
	'''
	Clean the text of HTML tags, CSS styles and JS
	(string) line - input text
	'''
	cleari = remove_tags(line)
	soline = re.compile("(\<.+\n)", re.DOTALL)
	boline = re.compile("(.+\>)", re.DOTALL)
	alline = re.compile("\<.+\>", re.DOTALL)
	cleari = re.sub(soline, '', cleari)
	cleari = re.sub(boline, '', cleari)
	cssline = re.compile(r"\{.+\}{1}", re.DOTALL)
	cleari = re.sub(cssline, ' ', cleari)
	cleari = re.sub("async=\"async\"\n", '', cleari)
	cleari = re.sub("src=.+\"", '', cleari)
	cleari = re.sub("var\s_.+\)", '', cleari)
	cleari = re.sub("function.+\"\)", '', cleari)
	cleari = re.sub("document.+\);", " ", cleari)
	cleari = re.sub("function.+\)", " ", cleari)
	cleari = re.sub("&laquo;", " «", cleari)
	cleari = re.sub("&raquo;", "» ", cleari)
	cleari = re.sub("&rarr;", "→", cleari)
	cleari = re.sub(r'&nbsp;', ' ', cleari)
	cleari = re.sub(r'(&mdash;)|(&ndash;)', '-', cleari)
	cleari = re.sub(r'\t{2,}', ' ', cleari)
	cleari = re.sub(r'\s{2,}', ' ', cleari)
	cleari = re.sub(r'\n{2,}', '\n', cleari)
	cleari = re.sub(r"(\<\!\-\-.*\-\-\>)", '', cleari)

	return cleari
Example #8
	def parse(self, response):
		s = Selector(response)
		next_link = s.xpath('//div[@class="w-button-more"]/a/@href').extract()
		if len(next_link):
			yield Request("https://mobile.twitter.com"+next_link[0], callback=self.parse)
		itemselector = Selector(response).xpath('//*[@id="main_content"]/div/div[3]/table')
		#regex = re.compile(r"([\\]+u\d*)", re.MULTILINE)
		for sel in itemselector:
			self.index += 1
			item = TwitterscrapingItem()
			item['index'] = self.index
			item['username'] = ''.join(
				map(unicode.strip, sel.xpath('tr[1]/td[2]/a/div/text()').extract()))
			tweet = remove_tags(''.join(
				map(unicode.strip, sel.xpath('tr[2]/td/div').extract()))
				).replace('&amp','&').replace('  ','').replace('\n      ','').replace('\n    ','').replace('\n','').replace('\u',' ')
			item['text_tweet'] = u''+tweet
			item['original_tweet'] = ''.join(sel.xpath('tr[2]/td/div/div').extract())
			item['time_tweet'] = ''.join(
				map(unicode.strip, sel.xpath('tr[1]/td[3]/a/text()').extract()))
			item['url'] = ''.join(
				map(unicode.strip, sel.xpath('tr[2]/td/div/@data-id').extract()))
			item['data_id'] = ''.join(
				map(unicode.strip, sel.xpath('tr[3]/td/span[1]/a/@href').extract()))
			yield item
Example #9
    def parse_speech(self, response):
        paragraphs = response.css('p')[:-1]  # last p contains pagination
        text = remove_tags(''.join(paragraphs.extract()))

        l = ParlamentHuSpeechLoader(item=Speech(), selector=response,
            scheme='parlament.hu/people')
        l.add_value('text', text)
        l.add_value('type', 'speech')
        l.add_value('sources', [response.url])
        l.add_xpath('position', '//b[1]/text()')
        l.add_xpath('video', '//table//tr[6]//td[2]/a/@href')
        l.add_xpath('creator_id', '//table//tr[2]//td[2]/a/@href',
            re=r'ogy_kpv\.kepv_adat\?p_azon=(\w\d+)')
        l.add_value('event_id', response.meta['event_id'])

        date = response.xpath(
            '//table//tr[1]/th/text()').re(r'\d{4}\.\d{2}.\d{2}\.')
        time = response.meta.get('time')
        if date:
            date = date[0]
            if time:
                date += time[0]
            l.add_value('date', date)
        item = l.load_item()
        yield item
        if 'creator_id' in item:
            yield scrapy.Request(self.get_api_url(
                self.PERSON_ENDPOINT, params={
                    'p_azon': item['creator_id']['identifier']}),
                callback=self.parse_person, meta={
                    'p_azon': item['creator_id']['identifier']})
Example #10
 def _extract_features(self, sel, item):
     description_xpath = '//div[@id="tab1"]/ul/li'
     data = sel.xpath(description_xpath).extract()
     if len(data) > 0 :
         data = [remove_tags(v).strip().replace('&nbsp;',' ').replace('&gt;','>').strip()  for v in data]
         data = filter(None,data)
         item['description'] = '<br>'.join(data)
Example #11
def parse_linklist(text, remove_tags=False):
    data = []

    for row in text.split('\n'):
        rowparts = row.strip().split(' ')
        if len(rowparts) < 2:
            break
        time = rowparts[0]
        if rowparts[1].startswith('<') and rowparts[1].endswith('>'):
            url = rowparts[1][1:-1]
            textparts = rowparts[2:]
        else:
            url = ''
            textparts = rowparts[1:]
        text = ' '.join(textparts)
        if remove_tags:
            text = html.remove_tags(text)
        data.append(
            {
                'time': time,
                'url': url,
                'text': text
            }
        )
    return data
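
A possible invocation of the helper above (assuming html here is w3lib.html; the rows are made up for illustration):

sample = '12:00 <http://example.com/a> First <b>entry</b>\n12:05 plain text row'
parse_linklist(sample, remove_tags=True)
# -> [{'time': '12:00', 'url': 'http://example.com/a', 'text': 'First entry'},
#     {'time': '12:05', 'url': '', 'text': 'plain text row'}]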
Example #12
 def _extract_description(self, sel, item):
     return
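      # NOTE: the bare return above short-circuits this method, so the extraction code below never runs in the original source.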
     desc_xpath = '//div[@id="item-overview"]/ul/li/node()'
     data = sel.xpath(desc_xpath).extract()
     if len(data) != 0:
         data = [remove_tags(v.strip()) for v in data]
         description = ';'.join(data).replace(':;',':').replace('from;','from ')
         item['description'] = description
Example #13
def clean_tags_from_affiliations(value):
    """Clean the affiliaton string for an author."""
    for affiliation in value.get('affiliations', []):
        # Remove tag AND content of any prefix like <label><sup>1</sup></label>
        affiliation['value'] = remove_tags_with_content(affiliation['value'], ('label',))
        # Now remove all tags but KEEP content
        affiliation['value'] = remove_tags(affiliation['value'])
        # Remove random whitespaces
        affiliation['value'] = clean_whitespace_characters(affiliation['value'])
    return value
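
The two helpers used above differ in whether the tag's content survives; in isolation (a minimal sketch with a made-up affiliation string):

from w3lib.html import remove_tags, remove_tags_with_content

raw = u'<label><sup>1</sup></label><i>CERN</i>, Geneva'
step1 = remove_tags_with_content(raw, ('label',))   # drops <label> and its content -> u'<i>CERN</i>, Geneva'
step2 = remove_tags(step1)                          # drops the remaining tags, keeps the text -> u'CERN, Geneva'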
Example #14
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example #15
    def process_response(self, request, response, spider):
        # clean body
        orig_body = response.body_as_unicode()
        body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
        body = remove_tags(remove_comments(body))
        terms = tokenize(body.lower())
        request.meta['terms'] = terms
        request.meta['body'] = body

        return response
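
tokenize is a project-specific helper that is not shown here; purely as an assumption, a tokenizer over the cleaned body could look something like this:

import re

def tokenize_sketch(text):
    # hypothetical stand-in for the project's tokenize(): lowercase word extraction
    return re.findall(r'[a-z0-9]+', text.lower())

tokenize_sketch(u'Cleaned BODY text, 42.')   # -> ['cleaned', 'body', 'text', '42']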
Example #16
    def parse(self, response):
        """
        Parse the response page
        """
        # Skip error URLs
        if response.status != 200:
            return

        data = json.loads(response.text)

        title = data['title']
        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        description = data['description']
        data = data['content']

        # Remove <script>, <sup>, <math> tags with the content
        paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)

        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Replace &#39; with '
        paragraph = paragraph.replace('&#39;', "'")
        paragraph = paragraph.replace('&rsquo;', "'")
        paragraph = paragraph.replace('&ldquo;', "'")
        paragraph = paragraph.replace('&rdquo;', "'")
        # Replace &nbsp; with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

        # Replace 'U.S.' with 'US':
        paragraph = paragraph.replace('U.S.', 'US')

        # Some more replacements to improve the default tokenization
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\t', '')

        text = title + '\n\n' + description + '\n\n' + paragraph

        # Create the directory
        dirname = 'data/qplum'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the title and the text both
        filename = '{}/{}'.format(dirname, title)
        f = open(filename, 'w')
        f.write(text)
        f.close()
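
The manual entity replacements above can largely be collapsed into w3lib's own helper; a sketch of that alternative (not what this snippet does, just a related option):

from w3lib.html import replace_entities

replace_entities(u'Fish &amp; chips &ndash; &#39;fresh&#39;')   # -> u"Fish & chips \u2013 'fresh'"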
Example #17
File: regex.py Project: 0326/scrapy
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]
Example #18
 def _extract_description(self, sel, item):
     description_xpath = '//div[@class="description"]/ul/li/node()'
     data = sel.xpath(description_xpath).extract()
     if len(data) > 0:
         data = [remove_tags(v.strip()) for v in data]
         description = ''
         for index,desc in enumerate(data):
             if index % 2 == 0:
                 description += desc
             else :
                 description += desc + ';'
         item['description'] = description
Example #19
    def parse_item(self, response):
        if collect().count() < 10000:
            # print '*******', response.url
            hxs = HtmlXPathSelector(response)
            titles = hxs.select("//div[@id='articleNew']/h1/text()").extract()

            if len(titles) == 0: return

            title = ''.join(titles).strip()

            txts = hxs.select("//div[@id='articleNew']/p").extract()
            conteudo = remove_comments(remove_tags(''.join(txts)))

            i = Artigo()
            i['url'] = response.url
            i['nome'] = title
            i['conteudo'] = conteudo

            # opiniao = {"url": response.url, "nome": title, "conteudo": conteudo}
            opiniao2 = {"conteudo": conteudo}

            # collect().insert(opiniao)  # Opinioes collection: every opinion collected from the Painel do Leitor

            ######################################################################
            # Filter by content and route to different collections
            # Filter by content and save to file

            arqfile = leitorOpiniao
            frase = conteudo.split()
            if "Dilma" in frase:
                # database()['dilma'].insert(opiniao2)        # dilma collection
                arqfile = leitorDilma
            elif "Copa" in frase:
                # database()['copa'].insert(opiniao2)         # copa collection
                arqfile = leitorCopa
            elif "Palmeiras" in frase:
                # database()['palmeiras'].insert(opiniao2)    # palmeiras collection
                arqfile = leitorPalmeiras
            arq = open(arqfile, 'a')
            arq.writelines(str(opiniao2))
            arq.close()
            # yield i  # message on screen

            print '##########################################################'
            # print ("TOTAL DE OPINIOES: %d" % collect().count())
            print ("Salvando em %s " % arqfile)
            print '##########################################################'

        else:
            print 'Fim de scraping leitor'
            exit()
Example #20
 def _extract_features(self, sel, item):
     description_xpath = '//ul[@class="ul_description"]/li/node()'
     data = sel.xpath(description_xpath).extract()
     if len(data) > 0 :
         data = [remove_tags(v).strip().replace('&nbsp;',' ').replace('&gt;','>')  for v in data]
         data = filter(None,data)
         description = ''
         for index,desc in enumerate(data):
             if index % 2 == 0:
                 description += desc
             else :
                 description += desc + ';'
         item['description'] = description
Example #21
 def _extract_description(self, sel, item):
     description_xpath = '//div[@class="itemAttr"]/div[@class="section"]/table//tr/node()'
     data = sel.xpath(description_xpath).extract()
     if len(data) > 0 :
         data = [remove_tags(v).strip().replace('\t','').replace('\n','')  for v in data]
         data = filter(None,data)
         description = ''
         for index,desc in enumerate(data):
             if index % 2 == 0:
                 description += desc
             else :
                 description += desc + ';'
         item['description'] = description
Example #22
    def parse(self, response):
        if response.url.find('redirect') > 0:
            print('Invalid url')

            return

        title = text_clear(text_strip(response.css('h1::text').extract_first()))
        body = text_clear(text_strip(response.css('.index_center .content::text').extract_first()))
        category = text_strip(response.css('.breadCrumbList li:nth-child(2) a b::text').extract_first())
        sub_category = text_strip(response.css('.breadCrumbList li:nth-child(3) a b::text').extract_first())

        if category:
            category = category.replace('问答', '')

        if sub_category:
            sub_category = sub_category.replace('问答', '')

        question_item = Question()
        question_item['title'] = title
        question_item['body'] = body
        question_item['category'] = category if category else 'N/A'
        question_item['sub_category'] = sub_category if sub_category else 'N/A'
        question_item['source_name'] = '17house.com'
        question_item['source_url'] = response.url
        question_item['entry_url'] = response.url

        answers = response.css('.list .top').extract()
        answers = map(text_strip, answers)
        answers = map(text_clear, answers)
        answers = filter(text_filter, answers)

        answer_items = []
        for answer in answers:
            answer_body = remove_tags(answer, keep=['br'])

            if not answer_body:
                continue

            answer_item = Answer()
            answer_item['body'] = answer_body

            answer_items.append(answer_item)

        if len(answer_items) == 0:
            print("No Answer")
            return

        question_item['answers'] = answer_items

        print(question_item)
Example #23
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        curr_url = response.url
        txt = hxs.select('//body')
        if txt: 
            txt = remove_tags(txt.extract()[0])
            self.db.add_to_index(curr_url, txt)
            #for word in self.db.separate_words(txt): print word
		
        urls =  hxs.select('//a[contains(@href,".html")]/@href')
        if urls:
            urls = urls.extract()
            #self.db.commit()
            for url in urls: 
                if url.find("'")!=-1 : continue
                url=url.split('#')[0]
                if url[0:4] !='http': 
                    url = '%s%s'%(base_url, url)
                if urlparse.urlsplit(url)[1].split(':')[0].startswith('www.newyorker.com'):
                    link_text = remove_tags(url)
                    self.db.add_link_ref(curr_url, url, link_text)                
                    yield Request(url, self.parse)                        
Example #24
 def parse_profile(self, response):
     title = response.xpath('//title/text()').extract()[0].strip()
     create_time = response.xpath('//em[@id="post-date"]/text()').extract()[0].strip()
     source = response.xpath('//a[@id="post-user"]/text()').extract()[0].strip()
     body = response.body.strip()
     tag_content = response.xpath('//div[@id="js_content"]').extract()[0].strip()
     content = remove_tags(tag_content).strip()
     item = WeixinItem()
     item['title'] = title
     item['create_time'] = create_time
     item['source'] = source
     item['body'] = body
     item['content'] = content
     return item
Example #25
 def parse_store_data(self, response):
     """ Yield a GeojsonPointItem of the store's data """ # Pull the data off the stores page
     store = json.loads(remove_tags(response.xpath('//script[@type="application/ld+json"]')[1:].extract()[0]))
     store_hours = self.hours(store)
     yield GeojsonPointItem(
         ref=store['url'],
         lat=store['geo']['latitude'],
         lon=store['geo']['longitude'],
         addr_full=store['address']['streetAddress'],
         city=store['address']['addressLocality'],
         postcode=store['address']['postalCode'],
         name=store['branchOf']['name'],
         phone=store['telephone'],
         opening_hours=store_hours
     )
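
The remove_tags-then-json.loads step works because stripping the <script> wrapper leaves only the JSON-LD payload; a standalone sketch with a made-up payload:

import json
from w3lib.html import remove_tags

script = u'<script type="application/ld+json">{"geo": {"latitude": 51.5, "longitude": -0.1}}</script>'
store = json.loads(remove_tags(script))
store['geo']['latitude']   # -> 51.5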
Example #26
	def parse(self, response):
		post_links = response.xpath('//div[contains(@id, "CollapsiblePanel")]')
		for post in post_links:
			title = post.xpath('./div[@class="CollapsiblePanelTab"]/text()[normalize-space()and not(ancestor::em)]').get()
			description = post.xpath('./div[@class="CollapsiblePanelContent"]//text()[normalize-space()]').getall()
			description = [remove_tags(p).strip() for p in description]
			description = ' '.join(description).strip()
			date = post.xpath('./div[@class="CollapsiblePanelTab"]/em/text()').get()

			item = ItemLoader(item=PostedcorsItem(), response=response)
			item.default_output_processor = TakeFirst()
			item.add_value('title', title)
			item.add_value('description', description)
			item.add_value('date', date)

			yield item.load_item()
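
The getall/remove_tags/join idiom used here (and in several of the later spiders) in isolation, with made-up fragments:

from w3lib.html import remove_tags

parts = [u'<p>First sentence.</p>', u'  Second <em>sentence</em>. ']
description = ' '.join(remove_tags(p).strip() for p in parts).strip()
# -> u'First sentence. Second sentence.'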
Example #27
    def parse_post(self, response):
        title = response.xpath('//h1/text()').get()
        description = response.xpath(
            '//div[@class="post"]//text()[normalize-space()]').getall()
        description = [remove_tags(p).strip() for p in description]
        description = ' '.join(description).strip()
        date = response.xpath(
            '//article[@class="intro center"]/time/text()').get()

        item = ItemLoader(item=DirektnabankarsItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example #28
    def parse_post(self, response):
        title = response.xpath('//h1/text()[normalize-space()]').get()
        description = response.xpath(
            '//div[@class="contained-9 centered body-copy"]//text()[normalize-space() and not(ancestor::em)]'
        ).getall()
        description = [remove_tags(p).strip() for p in description]
        description = ' '.join(description).strip()
        date = response.xpath('//em/text()').get()

        item = ItemLoader(item=ObsrsItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example #29
    def save_artic_to_es(self):
        Tec_article = TechnologyType()
        Tec_article.title = self['title']
        Tec_article.link_url = self['link_url']
        Tec_article.content = remove_tags(self['content'])
        Tec_article.source = self['source']
        Tec_article.time = self['time']
        Tec_article.comment_num = self['comment_num']  # comment count
        Tec_article.read_num = self['read_num']  # read count
        Tec_article.meta.id = self['url_object_id']

        Tec_article.suggest = gen_suggest(Tec_article._doc_type.index,
                                          ((Tec_article.title, 10),))
        Tec_article.save()
        redis_cli.incr("BokeYuan")
        return
Example #30
    def parse(self, response):
        reported_date = dt.datetime.utcnow().isoformat()
        table = response.xpath('//table[@class="table bg-gray"]')

        for tbl in table:
            header = [hd for hd in tbl.xpath('./thead//th//text()').extract()]
            for idx, row in enumerate(tbl.xpath('.//tbody//tr')):
                row = [
                    remove_tags(cell) for cell in row.xpath('./td').extract()
                ]

                raw_item = {header[idx]: cell for idx, cell in enumerate(row)}
                raw_item.update(port_name='Mundra',
                                provider_name=self.provider,
                                reported_date=reported_date)
                yield normalize.process_item(raw_item)
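
The header/cell pairing above in miniature (made-up HTML cells):

from w3lib.html import remove_tags

header = ['Vessel', 'ETA']
row = [remove_tags(cell) for cell in ['<td>MSC Anna</td>', '<td>2020-01-01</td>']]
raw_item = {header[idx]: cell for idx, cell in enumerate(row)}
# -> {'Vessel': 'MSC Anna', 'ETA': '2020-01-01'}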
Example #31
    def parse_post(self, response):
        title = response.xpath('//div[@class="newsDetail"]/h1/text()').get()
        description = response.xpath(
            '//div[@class="perex"]//text()|//div[@class="htmlText"]//text()[normalize-space() and not(ancestor::ul | ancestor::h3)]'
        ).getall()
        description = [remove_tags(p).strip() for p in description]
        description = ' '.join(description).strip()
        date = response.xpath('//time/text()').get()

        item = ItemLoader(item=AxaskItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example #32
def processText(value):
    """process to get text, clean specifix character
    
    Arguments:
        value {string} -- input value
    
    Returns:
        string -- out put value
    """
    if value:
        value = replace_escape_chars(value)
        value = remove_tags(value)
        value = value.replace('#ft5_slash#', '/').replace('\\/', '/')
        return value
    else:
        return ''
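
A possible call to processText (input made up; by default replace_escape_chars drops \n, \t and \r before the tags are removed):

processText(u'<b>10\\/20</b>\n')   # -> u'10/20'
processText('')                    # -> '' (falsy input is returned as an empty string)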
Example #33
 def parse_store_data(self, response):
     """ Yield a GeojsonPointItem of the store's data """  # Pull the data off the stores page
     store = json.loads(
         remove_tags(
             response.xpath('//script[@type="application/ld+json"]')
             [1:].extract()[0]))
     store_hours = self.hours(store)
     yield GeojsonPointItem(ref=store['url'],
                            lat=store['geo']['latitude'],
                            lon=store['geo']['longitude'],
                            addr_full=store['address']['streetAddress'],
                            city=store['address']['addressLocality'],
                            postcode=store['address']['postalCode'],
                            name=store['branchOf']['name'],
                            phone=store['telephone'],
                            opening_hours=store_hours)
Example #34
    def parse_info(self, response):
        html = response.text
        item = LiepinItem()
        title_res = re.compile(r'<h1 title="(.*)">')
        item['title'] = re.findall(title_res, html)[0]
        price_res = re.compile(r'<strong>(.*)</strong>\r\n.*<p class="cname">')
        item['price'] = re.findall(price_res, html)[0]

        data_res = re.compile(r'<p.class="msg.ltype".title="(.*)">')
        data = re.findall(data_res, html)[0]
        res = data.replace('&nbsp;&nbsp;', '')
        data_list = res.split('|')

        if len(data_list) == 4:
            item['site'] = data_list[0]
            if '无' in data_list[1]:
                item['experience'] = '无工作经验'
            else:
                item['experience'] = data_list[1]
            if '招' in data_list[2]:
                item['education'] = '无'
            else:
                item['education'] = data_list[2]
            item['time'] = data_list[3]
        else:
            item['site'] = data_list[0]
            item['experience'] = data_list[1]
            if '招' in data_list[2]:
                item['education'] = '无'
                item['time'] = data_list[3]
            else:
                item['education'] = data_list[2]
                item['time'] = data_list[4]

        description_res = re.compile(
            r'<div class="bmsg job_msg inbox">(.*)<div class="mt10">', re.S)
        description = re.findall(description_res, html)[0]
        item['description'] = remove_tags(description).strip()
        item['website'] = '51job'
        print(item['title'])
        print(item['price'])
        print(item['site'])
        print(item['experience'])
        print(item['education'])
        print(item['time'])
        print(item['description'])
        yield item
Example #35
    def parse_mesa(self, response):
        filename = "mesa_" + response.meta['mesa'] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)

        item = OnpeCrawlerItem()
        ubigeo = response.xpath(
            "//table[@class='table14']//tr[2]//td").extract()
        ubigeo = [remove_tags(i) for i in ubigeo]
        item['content_results'] = response.xpath(
            "//div[@class='contenido-resultados']").extract_first()
        item['department'] = ubigeo[0]
        item['province'] = ubigeo[1]
        item['district'] = ubigeo[2]
        item['local'] = ubigeo[3]
        item['address'] = ubigeo[4]

        mesa_info = response.xpath(
            "//table[@class='table15']//tr[2]//td/text()").extract()
        item['electors'] = mesa_info[0]
        item['voters'] = mesa_info[1]
        item['acta_status'] = mesa_info[2].strip()
        item['resolutions'] = response.xpath(
            '//div[contains(@class, "pbot30_acta")]/text()[3]').extract_first(
            ).strip()
        item['resolutions_note'] = response.xpath(
            '//div[contains(@class, "pbot30_acta")]/p[2]/text()'
        ).extract_first()

        votes = response.xpath(
            '//div[@class="cont-tabla1"]//td/text()').extract()
        item['votes_ppk'] = votes[3].strip()
        item['votes_fp'] = votes[5].strip()
        item['votes_blank'] = votes[7].strip()
        item['votes_null'] = votes[9].strip()
        item['votes_contested'] = votes[11].strip()
        item['votes_total'] = votes[13].strip()

        item['table_number'] = response.xpath(
            "//table[@class='table13']//td/text()").extract_first()
        item['copy_number'] = response.xpath(
            '//table[@class="table13"]//td/text()').extract()[1].strip()
        href = response.xpath('//a/@href').extract_first()
        item['acta_image_url'] = "{}/{}".format(self.start_url, href)
        filename = "acta_mesa_" + item['table_number'] + '.pdf'
        urlretrieve(item['acta_image_url'], filename)
        return item
Example #36
    def _parse_topic_response(self, response):
        """
        Parses various topics
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        # Get the first div with class content
        content = response.css('div.content')
        if isinstance(content, list) and len(content) > 0:
            content = content[0]
        else:
            content = response.css('div.roth__content')[0]

        text = title + '\n\n'
        for child in content.xpath('//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove tags including <p> and <a>
            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Add to the file
            text += paragraph + '\n'

        # Create the directory
        dirname = 'data/investopedia'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the text
        name = response.url.split('/')[-1]
        filename = '{}/{}'.format(dirname, name)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Example #37
    def parse_detail(self, response):
        """
        Parse the detail page
        :param response:
        :return:
        """
        url = response.url
        post_id = re.findall(r'p/(\d*)', url)
        if post_id:
            post_id = post_id[0]
        else:
            return

        blog_id = re.findall(r'currentBlogId=(\d*)', response.text)
        if blog_id:
            blog_id = blog_id[0]
        else:
            return

        cnblogs_item = TechnicalArticleItem()
        title = response.css('#cb_post_title_url::text').extract_first()

        publish_time = response.css('#post-date::text').extract_first()
        if publish_time:
            publish_time = datetime.datetime.strptime(publish_time,
                                                      '%Y-%m-%d %H:%M')

        abstract = response.css('#cnblogs_post_body').extract_first()
        if abstract:
            abstract = remove_tags(abstract)[:300]

        cnblogs_item['url_object_id'] = get_md5(url)
        cnblogs_item['url'] = url
        cnblogs_item['title'] = title
        cnblogs_item['article_type'] = response.meta['article_type']
        cnblogs_item['data_source'] = '博客园'
        cnblogs_item['read_num'] = self.get_read_num(post_id)
        cnblogs_item['comment_num'] = self.get_comment_num(post_id)
        cnblogs_item['praise_num'] = 0
        cnblogs_item['collection_num'] = 0

        cnblogs_item['publish_time'] = publish_time
        cnblogs_item['abstract'] = remove_t_r_n(abstract)
        cnblogs_item['tags'] = self.get_tags(blog_id, post_id)
        # pass

        yield cnblogs_item
Example #38
    def get_tags(self, blog_id, post_id):
        """
        Get the post tags
        :param blog_id:
        :param post_id:
        :return:
        """
        res = requests.get(
            'https://www.cnblogs.com/mvc/blog/CategoriesTags.aspx?blogApp=quanxiaoha'
            '&blogId=%s&postId=%s' % (blog_id, post_id))
        try:
            json_dict = json.loads(res.text)
            tags = remove_tags(json_dict['Tags']).replace('标签: ', '')
        except Exception as e:
            tags = ''

        return tags
Example #39
    def parse_item(self, response):
        links = dict()
        link_titles = set()

        url = response.url.split('#')[0].lower()
        url_head = url.split('/pages/')[0] + '/pages/'

        title = response.xpath('//meta[@name="DC.title"]/@content').extract_first()
        if title and title.endswith('- NHS Choices'):
            title = title.rstrip(' NHS Choices').rstrip(' -')
        subjects = response.xpath('//meta[@name="DC.Subject"][@scheme="NHSC.Ontology"]/@content').extract_first().split(', ')
        subjects = [s.lower() for s in subjects if s]
        if not subjects:
            subjects = [title.lower()]
        description = clean_text(response.xpath('//meta[@name="DC.description"]/@content').extract_first())
        raw_page_content = response.xpath('//div[@class="main-content healthaz-content clear"]/.').extract_first()
        page_content = clean_text(replace_entities(remove_tags(raw_page_content)))
        for a in response.xpath('//div[@class="main-content healthaz-content clear"]/descendant::a'):
            label = a.xpath('text()').extract_first()
            href = a.xpath('@href').extract_first()
            if href and label:
                href = self.base_url + href.lstrip('/')
                href = href.lower()
                label = clean_text(label)
                if '/conditions/' in href and url_head not in href:
                    link_titles.add(label)
                    if href in links:
                        links[href]['count'] += 1
                    else:
                        links[href] = {
                            'count': 1,
                            'label': label
                        }
                if url_head in href and href != url:
                    print("********************", href)
                    yield scrapy.Request(href, self.parse_item)

        article = NhsItem()
        article['url'] = url
        article['title'] = title
        article['subjects'] = subjects
        article['description'] = description
        article['page_content'] = str(page_content)
        article['links'] = links
        article['link_titles'] = list(link_titles)
        yield article
Example #40
    def parse(self, response):
        # remove <script> tags from <p> elements
        for text in response.css('p'):
            yield {
                'text':
                remove_tags(
                    remove_tags_with_content(text.extract(), ('script', )))
            }

        # add new URLs that are descendants of the request URL (same domain)
        for next_page in response.css('div > a'):
            a_tag = next_page.extract()
            if "href=" in a_tag:
                link = (a_tag.split('href="')[1]).split('"')[0]
                if link.count("/") > 2:
                    if link.split("/")[2] in response.request.url:
                        yield response.follow(next_page, self.parse)
Example #41
    def clean_data(self):
        try:
            self["praise_num"] = extract_num("".join(self["praise_num"]))
        except BaseException:
            self["praise_num"] = 0
        self["comments_num"] = extract_num("".join(self["comments_num"]))

        self["create_time"] = datetime.datetime.fromtimestamp(
            self["create_time"]).strftime(SQL_DATETIME_FORMAT)
        try:
            self["update_time"] = datetime.datetime.fromtimestamp(
                self["update_time"]).strftime(SQL_DATETIME_FORMAT)
        except:
            self["update_time"] = self["create_time"]

        self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
        self["content"] = remove_tags(self["content"])
Example #42
 def save_artic_to_es(self):
     Tec_article = TechnologyType()
     Tec_article.title = self['title']
     Tec_article.time = self['time']
     Tec_article.content = remove_tags(self['content'])
     Tec_article.link_url = self['link_url']
     Tec_article.meta.id = self['zhihu_id']
     Tec_article.source = self['source']
     Tec_article.tag = self['topic']
     Tec_article.read_num = self['click_num']
     Tec_article.comment_num = self['comment_num']
     Tec_article.suggest = gen_suggest(Tec_article._doc_type.index,
                                       ((Tec_article.title, 10),
                                        (Tec_article.tag, 7)))
     Tec_article.save()
     redis_cli.incr("zhihu")
     return
Example #43
    def parse2(self, response):
        try:
            timeout = WebDriverWait(self.driver, 10)
        except:
            print("Timed out waiting for page load.")
            self.driver.quit()

        title = Selector(response).xpath(
            '//div[@class="leftContainer"]/div/div/div/div/ \
                    a/img[@id="coverImage"]/@alt')
        genre = Selector(
            response).xpath('//div[@class="rightContainer"]/div/div/ \
                    div[@class="bigBoxBody"]/div/div/div[@class="left"]/a/text()'
                            )
        rating = Selector(response).xpath(
            '//div[@class="leftContainer"]/div/div[@id="metacol"]/ \
                    div[@id="bookMeta"]/span/span[@class="average"]/text()')
        reviews = Selector(response).xpath('//div[@id="bookReviews"]/ \
                    div[@class="friendReviews elementListBrown"]')

        for review in reviews:
            try:
                item = GoodreadsItem()
                item['title'] = title.extract()[0]
                item['rating'] = rating.extract()[0]
                item['book_url'] = response.meta['book_url']
                item['genre'] = genre.extract()[0]
                item['link_url'] = review.xpath(
                    './/div/div/link/@href').extract()[0]
                item['reviewDate'] = review.xpath(
                    './/div/div/div/div/a/text()').extract()[0]
                item['user'] = review.xpath(
                    './/div/div/div/div/span/a/text()').extract()[0]

                review_text = review.xpath('.//div/div/div/ \
                                div[@class="reviewText stacked"]/span/ \
                                span[2]/text()').extract()[0]
                # remove html tags
                item['review'] = remove_tags(review_text)

            except IndexError as e:
                print(e, ": title: ", item['title'], "user: ", item['user'])
                logger.error(e.args[0])
                raise

            yield item
Example #44
    def save_to_es(self):
        news = NewsType()
        news.url = self['url']
        news.meta.id = self['url_md5']
        news.title = self['title']
        news.create_time = self['create_time']
        news.source = self['source']
        news.source_url = self['source_url']
        # this helper is quite handy
        news.content = remove_tags(self['content'])

        news.suggest = gen_suggests(NewsType._doc_type.index, ((news.title, 10),))

        news.save()

        redis_cli.incr("news_count")
        return
Example #45
    def parse_post(self, response, date):
        print(response)
        title = response.xpath('//h1/text()').get()
        description = response.xpath(
            '//div[@class="new-item-content"]').getall()
        if description:
            comments = re.sub(r'<!--[\S\s]*?-->', '', str(description[0]))
            description = remove_tags(str(comments)).strip()
        else:
            description = ''

        item = ItemLoader(item=KnejaItem(), response=response)
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date.strip())

        return item.load_item()
Example #46
    def parse_detail(self, response):
        self.logger.info('Parse function called on %s', response.url)

        head = response.selector.css('.articleName')
        body = response.selector.css('.art_context')

        content = body.xpath('.//div[@class="art_contextBox"]').extract_first()
        if content is None:
            content = body.extract_first()

        article_item = ArticleItem(
            title=head.xpath('.//h1/text()').extract_first(),
            date_time=head.xpath(
                './/span[@class="pr20"]/text()').extract_first(),
            source=head.xpath('.//a[@rel="nofollow"]/text()').extract_first(),
            content=remove_tags(content))
        yield article_item
Example #47
    def parse_investimento_noticias(self, response, info):
        content = response.css('.itemIntroText p').getall() +\
            response.css('.itemFullText p').getall()
        content = ''.join([remove_tags(x) for x in content])

        date = response.css('time::attr(datetime)').get()
        date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S%z')
        news_obj = {
            'title': response.xpath('//header/h1/text()').get(),
            'content': content,
            'date': date,  #response.css('time::text').get().strip()
            'author': 'None',
            'url': info['url'],
            'site': info['site'],
            'tick': info['tick']
        }
        yield Noticia(news_obj)
Example #48
    def parse(self, response):

        for item in response.xpath("//div[@class='auction  ']"):
            logging.debug('item:' + str(item))
            location = item.xpath(
                "normalize-space(.//button[contains(@class,'location-link')])"
            ).get()
            logging.debug('Location:' + str(location))
            # filter non SC items
            if not (bool(re.search('SC', location))
                    or bool(re.search('South Carolina', location))):
                logging.debug('SC NOT found: ')
                continue

            adItem = ClassifiedscraperItem()
            adItem.set_all(None)
            adItem['source'] = self.name
            adItem['location'] = location

            #response.request.url
            request_url_base = furl.furl(response.request.url).origin
            logging.debug('request_url_base:' + request_url_base)
            link_raw = item.xpath(".//a/@href").get(default='/not-found')
            item_link = request_url_base + link_raw
            logging.debug('link:' + item_link)
            adItem['link'] = item_link

            title_raw = remove_tags(
                item.xpath(
                    "normalize-space(.//p[@class='auctionTitle'])").get())
            logging.debug('title_raw:' + title_raw)
            adItem['title'] = title_raw

            #image_link_raw = remove_tags(item.xpath(".//img/@src").get())
            #logging.debug('image_link_raw: ' + image_link_raw)
            #adItem['image_link'] = image_link_raw

            #raw_post_date = item.xpath("normalize-space(.//div[contains(@class,'auctionLocation')]/span)").get()
            #logging.debug('raw_post_date:'  + raw_post_date)
            #adItem['post_date'] = raw_post_date.split('ST')[0].strip()

            #raw_desc = item.xpath("normalize-space(.//div[@class='col-sm-12'])").get()
            #logging.debug('raw_post_date:'  + raw_post_date)
            #adItem['description'] = raw_desc

            yield adItem
Example #49
    def save_to_es(self):
        article = ArticleType()
        article.title = self['title']
        article.create_date = self["create_date"]
        article.content = remove_tags(self["content"])
        article.front_image_url = self["front_image_url"]
        if "front_image_path" in self:
            article.front_image_path = self["front_image_path"]
        article.praise_nums = self["praise_nums"]
        article.fav_nums = self["fav_nums"]
        article.comment_nums = self["comment_nums"]
        article.url = self["url"]
        article.tags = self["tags"]
        article.meta.id = self["url_object_id"]

        article.save()
        return
Example #50
    def save_to_es(self):
        crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
        create_time = datetime.datetime.fromtimestamp(self["create_time"]).strftime(SQL_DATE_FORMAT)
        update_time = datetime.datetime.fromtimestamp(self["update_time"]).strftime(SQL_DATE_FORMAT)
        article = ZhihuAnswerType()
        article.meta.id = self['zhihu_id']
        article.url = self['url']
        article.question_id = self['question_id']
        article.content = remove_tags(self['content'])
        article.praise_num = self['praise_num']
        article.comments_num = self['comments_num']
        article.create_time = create_time
        article.update_time = update_time
        article.crawl_time = crawl_time

        article.save()
        return
Example #51
def parse_plaintext_format(text):
    data = []

    for row in text.split('\n'):
        rowparts = row.strip().split(' ')
        if len(rowparts) < 2:
            break
        timestamp = rowparts[0]
        if rowparts[1].startswith('<') and rowparts[1].endswith('>'):
            url = rowparts[1][1:-1]
            textparts = rowparts[2:]
        else:
            url = ''
            textparts = rowparts[1:]
        text = remove_tags(' '.join(textparts))
        data.append({'timestamp': timestamp, 'url': url, 'text': text})
    return data
Example #52
	def parse(self, response):
		data = json.loads(response.text)
		next_page = data['payload']['nextPage']
		for post in data['payload']['articles']:
			title = post['title']
			date = post['createdAt']
			description = remove_tags(post['html'])

			item = ItemLoader(item=UnibankmkItem(), response=response)
			item.default_output_processor = TakeFirst()
			item.add_value('title', title)
			item.add_value('description', description)
			item.add_value('date', date)

			yield item.load_item()
		if next_page:
			yield response.follow(base.format(next_page), self.parse)
Example #53
    def parse_tvn24bis(self, response):
        url = response.url
        art_id = url.split(',')[-1].split('.')[0]

        date = response.css(
            'article.detail header time::attr("datetime")').extract_first()
        date = date.split(' ')
        time = date[1][0:4]
        date = date[0]

        title = response.css("article.detail header h1 ::text").extract_first()
        title = replace_escape_chars(title).strip()

        lead = response.css("div.content p.lead ::text").extract_first()
        lead = replace_escape_chars(lead).strip()

        text = response.xpath(
            '//div[@class="content"]/p[not(contains(@clas, "rules") or contains(@clas, "footer"))]/text()'
        ).extract()

        text = ' || '.join(text)
        text = remove_tags(text)
        text = replace_escape_chars(text)
        text = clear_text(text)

        autor = response.css(
            "div.content div.footer ::text").extract()[1].split('/')
        if len(autor) > 1:
            source = autor[1]
            source = source.strip().replace('Źródło: ', '')
            autor = autor[0].strip().replace('Autor: ', '')
        else:
            source = ''
            autor = autor[0].strip().replace('Autor: ', '')

        yield {
            'id': art_id,
            'url': url,
            'date': date,
            'time': time,
            'title': ''.join(title),
            'lead': lead,
            'text': text,
            'autor': autor,
            'source': source
        }
Example #54
    def parse_post(self, response):
        title = response.xpath(
            '//div[@class="number_list pj"]/h3/text()').get()
        description = response.xpath(
            '//div[@class="number_list pj"]//text()[normalize-space() and not(ancestor::h3 | ancestor::i | ancestor::video)]'
        ).getall()
        description = [remove_tags(p).strip() for p in description]
        description = ' '.join(description).strip()
        date = response.xpath('//i/text()').get()

        item = ItemLoader(item=NbsrsItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example #55
File: osa.py Project: QiliWu/lalsci
    def parse_article(self, response):
        lalitem = LalItem()
        data = response.meta['data']
        lalitem['url'] = response.url
        lalitem['title'] = data['title']
        # lalitem['authors'] = data['author'].split('; ')
        lalitem['authors'] = data['author'].replace('; ', ', ')
        lalitem['doi'] = data['doi']
        lalitem['journal'] = response.css(
            '.article-journal-name li strong::text').extract_first('')
        if not lalitem['journal']:
            lalitem['journal'] = data['name'].split(',')[0]

        lalitem['year'] = int(data['years'])
        lalitem['keywords'] = ''
        lalitem['abs_img_url'] = response.css(
            'img[alt="Fig. 1"]::attr(data-src)').extract_first(default='')

        abstract_text = response.css('#articleBody p').extract()
        abstract_list = []
        if abstract_text:
            # Some articles have no abstract; when one exists, its format also varies.
            for element in abstract_text:
                if '©' in element:
                    break
                else:
                    abstract_match = re.match(r'<.+?>(.*)</.+>', element, re.S)
                    abstract_list.append(abstract_match.group(1))
        lalitem['abstract'] = ''.join(abstract_list)

        lalitem['_id'] = get_md5(lalitem['url'])
        lalitem['company'] = self.name

        ## query glgoo to fetch the citation
        glgoo_url = 'https://xs.glgoo.top/scholar?'
        headers = {
            'User_Agent': 'Mozilla/5.0',
            'Referer': 'https://gf1.jwss.site/'
        }
        yield Request(url=glgoo_url +
                      urlencode({'q': remove_tags(lalitem['title'])}) +
                      '&hl=zh-CN&as_sdt=0&as_vis=1&oi=scholart',
                      headers=headers,
                      meta={'lalitem': lalitem},
                      dont_filter=True,
                      callback=self.get_citation)
Example #56
    def parse_item(self, response):
        item = ScifibotItem()
        # clean body
        orig_body = response.body_as_unicode()
        body = remove_tags_with_content(orig_body,
            which_ones=('script', 'head'))
        body = remove_tags(remove_comments(body))
        tokens = tokenize(body.lower())
        # decide if the page is interesting
        if not is_relevant(tokens):
            stats.inc_value('scifi/filtered_out') # probably not scifi page
            return

        item['keywords'] = tokens
        item['page'] = orig_body
        item['url'] = response.url
        return item
Example #57
    def parse_item(self, response):
        item = ItemLoader(item=OlxAdItem(), response=response)
        item.default_output_processor = TakeFirst()

        # # address
        item.add_value('address', response.url)
        # # title
        item.add_xpath('title', '//div[@class="offer-titlebox"]/h1/text()')
        #
        # # id of advertising
        item.add_xpath(
            'number',
            '//div[@class="offer-titlebox__details"]/em/small/text()',
            re='(\d+)')

        # info
        th = response.xpath('//table[@class="item"]//th/text()').extract()
        td = [
            re.sub('([\t\n]+)', ', ',
                   remove_tags(s).strip())
            for s in response.xpath(
                '//table[@class="item"]//td[@class="value"]/strong').extract()
        ]
        item.item.setdefault('info', {th: td for th, td in zip(th, td)})

        # price if exists check https://www.olx.ua/rabota/
        price = response.xpath(
            '//div[@class="price-label"]/strong/text()').get()
        # process the currency so that everything is reported in hryvnia
        item.add_value('price',
                       price.strip() if price is not None else 'Бесплатно')
        #
        # # phone ToDO
        phone = '+380'
        item.add_value('phone', phone)
        #
        # # description
        item.add_xpath('description',
                       '//div[@id="textContent"]//text()',
                       Compose(MapCompose(str.strip), Join('. '),
                               lambda s: normalize('NFKC', s)),
                       re='[^\n\r]+')
        # # images links
        item.add_xpath('images', '//*[@class="photo-glow"]/img/@src')

        yield item.load_item()
Example #58
    def parse_item(self, response):
        error_location = False
        error_category = False

        item = response.meta['item']
        item['title'] = response.xpath(
            '//a[@class="heading detail-title"]/@title').extract()[0]
        item['offer_id'] = response.xpath(
            '//div[@itemtype="http://schema.org/JobPosting"]/@id').extract()[0]
        item['lang_code'] = 'en-US'
        item['date'] = self.dt.strftime('%Y%m%d')
        item['description'] = remove_tags(
            response.xpath('//div[@itemprop="description"]').extract()[0])
        item['location_name'] = response.xpath(
            '//span[@itemprop="name"]/text()').extract()[0]
        item['category_name'] = response.xpath(
            '//span[@itemprop="occupationalCategory"]/text()').extract()[0]

        # GEONAME MANAGEMENT
        try:
            item['geoname_id'] = self.geoCache.getGeonameId(
                item['location_name'])
            item['country_code'] = self.geoCache.getCountryCode(
                item['location_name'])
        except:
            error_message = "%s location not found in GeoName" % str(
                item['location_name'])
            print error_message
            error_location = True
            self.beBeeLogger.failure(item['offer_id'], error_message)

        # CATEGORY MANAGEMENT
        category_id = self.categoryMapper(item['category_name'])
        if category_id:
            item['category_id'] = category_id
        else:
            error_message = "category not found: %s" % str(
                item['category_name'])
            print error_message
            error_category = True
            self.beBeeLogger.failure(item['offer_id'], error_message)

        if not (error_location or error_category):
            self.beBeeLogger.success(item['offer_id'])

        return item
Example #59
    def parse_topic_response(self, response):
        """
        Parse the content
        """

        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        content = response.css('div#mw-content-text')

        # Just extract all the '<p></p>' children from this
        text = title + '\n\n'
        for child in content.xpath('//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove <script>, <sup>, <math> tags with the content
            paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
            # Remove the rest of the tags without removing the content
            paragraph = remove_tags(paragraph)

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Add to the file
            text += paragraph + '\n'

        # Create the directory
        dirname = 'data/wikipedia'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the text
        name = response.url.split('/')[-1]
        filename = '{}/{}'.format(dirname, name)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Example #60
    def parse_zhuanye(self, response):

        item = response.meta['item']
        majors = {}

        for e in response.css(u'.schoolIntro_con2>div'):
            cls = e.css('::attr(class)').extract_first()
            if cls == 'catTitle':
                cat1 = e.css('h2::text').re_first(u'开设(.+)专业')
                majors[cat1] = {}
            elif cls == 'majorCon':
                cat2 = e.css('h3::text').re_first(u'■ (.+)(')
                majors[cat1][cat2] = [remove_tags(i) for i in e.css('ul>li').extract()]
            else:
                pass

        item['majors'] = majors
        yield item