Example #1
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Each search result is an <h3 class="r"> heading inside the
        # Google results container.
        for sel in hxs.select(
                '//div[@id="ires"]//li[@class="g"]//h3[@class="r"]'):
            name = u''.join(sel.select(".//text()").extract())
            url = _parse_url(sel.select('.//a/@href').extract()[0])
            region = _get_region(url)  # derived from the URL; unused below
            if url:
                if self.download_html:
                    yield Request(url=url,
                                  callback=self.parse_item,
                                  meta={
                                      'name': name,
                                      'query': response.meta['query']
                                  })
                else:
                    yield GoogleSearchItem(
                        url=url,
                        name=name,
                        query=response.meta['query'],
                        crawled=datetime.datetime.utcnow().isoformat())

        # Follow the "Next" pagination link, if present.
        next_page = hxs.select(
            '//table[@id="nav"]//td[contains(@class, "b") and position() = last()]/a'
        )
        if next_page:
            url = self._build_absolute_url(
                response,
                next_page.select('.//@href').extract()[0])
            yield Request(url=url,
                          callback=self.parse,
                          meta={'query': response.meta['query']})
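
Example #1 assumes several helpers that are not shown: _parse_url, _get_region, and _build_absolute_url (it also assumes the usual imports, e.g. datetime, Request from scrapy.http, and HtmlXPathSelector from scrapy.selector). A minimal sketch of what those helpers might look like, given the era of the API used (the names come from the example; the bodies below are assumptions, not the project's actual code):

    import urlparse  # Python 2, matching the HtmlXPathSelector-era Scrapy API

    def _parse_url(href):
        # Sketch: Google wraps result links in a /url?q=... redirect; unwrap it.
        if href.startswith('/url?'):
            query = urlparse.parse_qs(urlparse.urlparse(href).query)
            return query.get('q', [''])[0]
        return href

    def _get_region(url):
        # Sketch: guess a region from the last domain label, e.g. 'uk' for .co.uk.
        netloc = urlparse.urlparse(url).netloc
        return netloc.rsplit('.', 1)[-1]

    def _build_absolute_url(self, response, href):
        # Sketch (a spider method): resolve a relative pagination href
        # against the current page's URL.
        return urlparse.urljoin(response.url, href)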
Example #2
    def parse_item(self, response):
        name = response.meta['name']
        query = response.meta['query']
        url = response.url
        # Keep at most the first 256 KB of the page body.
        html = response.body[:1024 * 256]
        timestamp = datetime.datetime.utcnow().isoformat()
        yield GoogleSearchItem({'name': name,
                                'url': url,
                                'html': html,
                                'region': self.region,
                                'query': query,
                                'crawled': timestamp})
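
All three examples yield a GoogleSearchItem, whose definition is not shown. A plausible Scrapy Item covering the fields used in these snippets (the field set is inferred from the examples, not taken from the project):

    from scrapy.item import Item, Field

    class GoogleSearchItem(Item):
        name = Field()
        url = Field()
        html = Field()      # only populated when download_html is set
        region = Field()
        query = Field()
        crawled = Field()
        merged = Field()    # used by Example #3
        priority = Field()  # used by Example #3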
Example #3
    def parse_item(self, response):

        name = response.meta['name']
        query = response.meta['query']
        first_title = response.meta['first_title']
        priority = response.meta['priority']
        url = response.url

        # Download and parse the web page
        article = Article(url)
        try:
            article.download()  # sometimes fails with a timeout
            article.parse()

            # Keep only the paragraphs that mention the query.
            paragraphs = article.text.split('\n')
            merged = ' '.join(
                [p for p in paragraphs if query.lower() in p.lower()])

            tokens = nltk.word_tokenize(merged)  # split into tokens
            # Lowercased query tokens, so words from the query itself
            # are never replaced.
            query_tokens = [qt.lower() for qt in nltk.word_tokenize(query)]

            # Word replacement can be disabled with replace=False
            # (it defaults to True).
            if self.replace:
                # Candidate words: adverbs (RB) and adjectives (JJ) that are
                # not capitalised and do not occur in the query
                replaceable_words = [
                    word.lower() for word, pos in nltk.pos_tag(tokens)
                    if pos in ('RB', 'JJ') and not word[0].istitle()
                    and word.lower() not in query_tokens
                ]

                # Calculate how many words to replace
                length = len(replaceable_words)

                try:
                    self.percent = int(self.percent)
                    if self.percent < 0:
                        self.percent = 0
                    elif self.percent > 100:
                        self.percent = 100
                except (TypeError, ValueError):
                    self.percent = 20  # fall back to the 20% default

                percentage = int(round(self.percent * length / 100))
                random_words = random.sample(replaceable_words, percentage)

                # Replace words
                replaced_text = []
                if random_words:  # there are words to replace
                    for w in tokens:
                        # Replace only words that were sampled and that
                        # WordNet knows (i.e. that have synsets).
                        if (w.lower() in random_words
                                and wordnet.synsets(w.lower())):
                            synonyms = find_synonyms(w)
                            if synonyms:
                                new_w = synonyms[0]
                                # Preserve the original capitalisation.
                                if w.istitle():
                                    new_w = new_w.capitalize()
                                replaced_text.append(new_w)
                            else:  # no synonyms found - keep the original word
                                replaced_text.append(w)
                        else:  # word not sampled or not in WordNet
                            replaced_text.append(w)
                else:  # nothing to replace - keep the original tokens
                    replaced_text = tokens

            else:  # replacement disabled - keep the original tokens
                replaced_text = tokens

            if replaced_text:
                self.final_dict[first_title] += join_punctuation(
                    replaced_text) + ' '

            # Rewrite the accumulated results after every parsed page.
            with open('output.csv', 'w') as csv_file:
                writer = csv.writer(csv_file, delimiter='\t')
                for key, value in self.final_dict.items():
                    writer.writerow([key, value])

            yield GoogleSearchItem({
                'name': name,
                'url': url,
                'merged': join_punctuation(replaced_text),
                'query': query,
                'priority': priority
            })
        except Exception:  # download or parse failed (e.g. a timeout)
            yield GoogleSearchItem({
                'name': name,
                'url': url,
                'merged': "",
                'query': query,
                'priority': priority
            })
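
Example #3 also relies on two helpers that are not shown, find_synonyms and join_punctuation. A minimal sketch of plausible implementations (the names come from the example; these bodies are assumptions):

    import string
    from nltk.corpus import wordnet

    def find_synonyms(word):
        # Sketch: collect WordNet lemma names for the word, skipping the
        # word itself and duplicates.
        synonyms = []
        for synset in wordnet.synsets(word.lower()):
            for lemma in synset.lemma_names():
                candidate = lemma.replace('_', ' ')
                if candidate.lower() != word.lower() and candidate not in synonyms:
                    synonyms.append(candidate)
        return synonyms

    def join_punctuation(tokens):
        # Sketch: join tokens with spaces, but attach punctuation such as
        # ',' or '.' directly to the preceding token.
        out = ''
        for token in tokens:
            if token in string.punctuation:
                out += token
            else:
                out += (' ' if out else '') + token
        return out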