Example 1
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Each organic result lives in an h3.r node inside the results div.
        for sel in hxs.select(
                '//div[@id="ires"]//li[@class="g"]//h3[@class="r"]'):
            name = u''.join(sel.select(".//text()").extract())
            url = _parse_url(sel.select('.//a/@href').extract()[0])
            region = _get_region(url)
            if url:
                if self.download_html:
                    # Fetch the result page itself; parse_item emits the item.
                    yield Request(url=url,
                                  callback=self.parse_item,
                                  meta={
                                      'name': name,
                                      'query': response.meta['query']
                                  })
                else:
                    # No page download requested: emit the item directly.
                    yield GoogleSearchItem(
                        url=url,
                        name=name,
                        query=response.meta['query'],
                        crawled=datetime.datetime.utcnow().isoformat())

        # Follow the "next page" link in the pagination table, if present.
        next_page = hxs.select(
            '//table[@id="nav"]//td[contains(@class, "b") and position() = last()]/a'
        )
        if next_page:
            url = self._build_absolute_url(
                response,
                next_page.select('.//@href').extract()[0])
            yield Request(url=url,
                          callback=self.parse,
                          meta={'query': response.meta['query']})
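
The _parse_url, _get_region, and _build_absolute_url helpers are defined elsewhere in the spider and are not part of this excerpt. As a rough sketch, _build_absolute_url presumably just resolves the (possibly relative) pagination href against the URL of the current page; a minimal, hypothetical version could look like this:

    # Hypothetical sketch of the _build_absolute_url helper used above;
    # the original implementation is not shown in this excerpt.
    from urllib.parse import urljoin

    def _build_absolute_url(self, response, href):
        # response.url is the page the link was extracted from, so a
        # relative href such as "/search?q=...&start=10" becomes absolute.
        return urljoin(response.url, href)

Note also that HtmlXPathSelector and its select() method come from old Scrapy releases; current Scrapy exposes the same functionality as Selector and response.xpath().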
Example 2
    def parse_item(self, response):
        name = response.meta['name']
        query = response.meta['query']
        url = response.url
        # Keep at most the first 256 KiB of the raw page body.
        html = response.body[:1024 * 256]
        timestamp = datetime.datetime.utcnow().isoformat()
        yield GoogleSearchItem({'name': name,
                                'url': url,
                                'html': html,
                                'region': self.region,
                                'query': query,
                                'crawled': timestamp})
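
GoogleSearchItem itself is defined outside these excerpts. Judging only from the keys the examples populate, it is presumably a Scrapy Item along these lines (a hypothetical sketch inferred from usage, not the original definition):

    import scrapy

    # Fields inferred from the dictionaries the three examples yield;
    # the real item may declare more or fewer fields.
    class GoogleSearchItem(scrapy.Item):
        name = scrapy.Field()
        url = scrapy.Field()
        html = scrapy.Field()
        region = scrapy.Field()
        query = scrapy.Field()
        crawled = scrapy.Field()
        merged = scrapy.Field()
        priority = scrapy.Field()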
Example 3
    def parse_item(self, response):

        name = response.meta['name']
        query = response.meta['query']
        first_title = response.meta['first_title']
        priority = response.meta['priority']
        url = response.url

        # Parse the webpage
        article = Article(url)
        try:
            article.download()  # may fail on a timeout
            article.parse()

            # Keep only the paragraphs that mention the query.
            paragraphs = article.text.split('\n')
            merged = ' '.join(
                [p for p in paragraphs if query.lower() in p.lower()])

            tokens = nltk.word_tokenize(merged)  # split into tokens
            # Lower-cased query tokens, so the query itself is never replaced.
            query_tokens = [qt.lower() for qt in nltk.word_tokenize(query)]

            # Word replacement is controlled by the spider argument
            # -a replace=True (default) / False.
            if self.replace:
                # Candidates: adverbs (RB) and adjectives (JJ) that are not
                # capitalised and are not part of the query.
                replaceable_words = [
                    word.lower() for word, pos in nltk.pos_tag(tokens)
                    if pos in ('RB', 'JJ') and not word[0].istitle()
                    and word.lower() not in query_tokens
                ]

                # Work out how many words to replace.
                length = len(replaceable_words)

                try:
                    self.percent = int(self.percent)
                    if self.percent < 0:
                        self.percent = 0
                    elif self.percent > 100:
                        self.percent = 100
                except (AttributeError, TypeError, ValueError):
                    self.percent = 20  # fall back to the default percentage

                # random.sample requires an integer sample size.
                percentage = int(round(self.percent * length / 100))
                random_words = random.sample(replaceable_words, percentage)

                # Replace the sampled words.
                replaced_text = []
                if random_words:  # there are words to replace
                    for w in tokens:
                        # Replace only if the word was sampled and WordNet
                        # has at least one synset for it.
                        if w.lower() in random_words and wordnet.synsets(w.lower()):
                            synonyms = find_synonyms(w)
                            if synonyms:
                                new_w = synonyms[0]
                                # Preserve the original capitalisation.
                                if w.istitle():
                                    new_w = new_w.capitalize()
                                replaced_text.append(new_w)
                            else:  # no synonyms found: keep the original word
                                replaced_text.append(w)
                        else:  # word not sampled or not in WordNet
                            replaced_text.append(w)
                else:  # nothing to replace: keep the original tokens
                    replaced_text = tokens

            else:  # replacement disabled: keep the original tokens
                replaced_text = tokens

            if replaced_text:
                self.final_dict[first_title] += join_punctuation(
                    replaced_text) + ' '

            # Rewrite the whole CSV on every parsed page.
            with open('output.csv', 'w') as csv_file:
                writer = csv.writer(csv_file, delimiter='\t')
                for key, value in self.final_dict.items():
                    writer.writerow([key, value])

            yield GoogleSearchItem({
                'name': name,
                'url': url,
                'merged': join_punctuation(replaced_text),
                'query': query,
                'priority': priority
            })
        except Exception:
            # Download or parsing failed: emit an item with empty text.
            yield GoogleSearchItem({
                'name': name,
                'url': url,
                'merged': "",
                'query': query,
                'priority': priority
            })
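
Example 3 relies on two helpers, find_synonyms and join_punctuation, that are not shown. Hypothetical sketches consistent with how they are called (find_synonyms returns a list of candidate synonyms; join_punctuation returns a string with punctuation attached to the preceding token):

    from nltk.corpus import wordnet

    def find_synonyms(word):
        # Collect WordNet lemma names that differ from the input word.
        synonyms = []
        for synset in wordnet.synsets(word.lower()):
            for lemma in synset.lemmas():
                candidate = lemma.name().replace('_', ' ')
                if candidate.lower() != word.lower() and candidate not in synonyms:
                    synonyms.append(candidate)
        return synonyms

    def join_punctuation(tokens, punctuation='.,;:!?'):
        # Re-join NLTK tokens into a sentence, gluing punctuation to the
        # previous word instead of leaving a space before it.
        out = []
        for tok in tokens:
            if tok in punctuation and out:
                out[-1] += tok
            else:
                out.append(tok)
        return ' '.join(out)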