def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # Each organic result is an <h3 class="r"> inside the results container.
    for sel in hxs.select('//div[@id="ires"]//li[@class="g"]//h3[@class="r"]'):
        name = u''.join(sel.select('.//text()').extract())
        url = _parse_url(sel.select('.//a/@href').extract()[0])
        region = _get_region(url)
        if len(url):
            if self.download_html:
                # Fetch the result page itself; parse_item emits the item.
                yield Request(url=url, callback=self.parse_item,
                              meta={'name': name,
                                    'query': response.meta['query']})
            else:
                yield GoogleSearchItem(
                    url=url, name=name,
                    query=response.meta['query'],
                    crawled=datetime.datetime.utcnow().isoformat())
    # Follow the "Next" link in the pagination table, if present.
    next_page = hxs.select(
        '//table[@id="nav"]//td[contains(@class, "b") and position() = last()]/a')
    if next_page:
        url = self._build_absolute_url(
            response, next_page.select('.//@href').extract()[0])
        yield Request(url=url, callback=self.parse,
                      meta={'query': response.meta['query']})
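# The helpers used above (_parse_url, _get_region, _build_absolute_url) are
# not shown in this section. Below is a minimal sketch of plausible
# implementations, assuming Google wraps each result link as
# "/url?q=<target>&..." and that the region can be read from the last label
# of the result's hostname; the project's real versions may differ.
try:
    from urlparse import urljoin, urlparse, parse_qs  # Python 2, matching the old Scrapy API above
except ImportError:
    from urllib.parse import urljoin, urlparse, parse_qs  # Python 3


def _parse_url(href):
    # Unwrap a Google redirect link; fall back to the raw href.
    query = parse_qs(urlparse(href).query)
    return query.get('q', [href])[0]


def _get_region(url):
    # Crude guess: the last hostname label ("uk", "de", "com", ...).
    return urlparse(url).netloc.rsplit('.', 1)[-1]


def _build_absolute_url(self, response, href):
    # Resolve a relative pagination href against the current page URL.
    return urljoin(response.url, href)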
def parse_item(self, response):
    name = response.meta['name']
    query = response.meta['query']
    url = response.url
    # Cap the stored HTML at 256 KiB to keep items a manageable size.
    html = response.body[:1024 * 256]
    timestamp = datetime.datetime.utcnow().isoformat()
    yield GoogleSearchItem({'name': name, 'url': url, 'html': html,
                            'region': self.region, 'query': query,
                            'crawled': timestamp})
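# GoogleSearchItem is referenced but not defined in this section. A minimal
# Scrapy Item covering every field the callbacks here populate might look
# like this; a sketch, not necessarily the project's actual definition.
from scrapy.item import Item, Field


class GoogleSearchItem(Item):
    name = Field()
    url = Field()
    html = Field()
    region = Field()
    query = Field()
    crawled = Field()
    merged = Field()
    priority = Field()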
def parse_item(self, response):
    name = response.meta['name']
    query = response.meta['query']
    first_title = response.meta['first_title']
    priority = response.meta['priority']
    url = response.url
    # Parse the webpage
    article = Article(url)
    try:
        article.download()  # sometimes fails with a timeout
        article.parse()
        paragraphs = article.text.split('\n')
        # Keep only the paragraphs that mention the query.
        merged = ' '.join(p for p in paragraphs if query.lower() in p.lower())
        tokens = nltk.word_tokenize(merged)  # split into tokens
        # Never replace the query's own words.
        query_tokens = [qt.lower() for qt in nltk.word_tokenize(query)]

        # Word replacement can be skipped with -a replace=True(default)/False
        if self.replace:
            # Candidates: adverbs and adjectives that are not capitalised
            # and are not part of the query.
            replaceable_words = [
                word.lower() for word, pos in nltk.pos_tag(tokens)
                if pos in ('RB', 'JJ')
                and not word[0].isupper()
                and word.lower() not in query_tokens
            ]

            # Clamp the ratio of words to replace to 0-100 percent.
            length = len(replaceable_words)
            try:
                self.percent = int(self.percent)
                if self.percent < 0:
                    self.percent = 0
                elif self.percent > 100:
                    self.percent = 100
            except (TypeError, ValueError):
                self.percent = 20
            percentage = int(round(self.percent * length / 100))
            random_words = random.sample(replaceable_words, percentage)

            # Replace the sampled words with their first synonym.
            replaced_text = []
            if len(random_words) > 0:  # there are words to replace
                for w in tokens:
                    # Word is replaceable and exists in WordNet.
                    if w.lower() in random_words and wordnet.synsets(w.lower()):
                        synonyms = find_synonyms(w)
                        if len(synonyms) > 0:
                            new_w = synonyms[0]
                            if w.istitle():
                                # Keep the original capitalisation.
                                new_w = new_w.capitalize()
                            replaced_text.append(new_w)
                        else:
                            # No synonyms found: keep the original word.
                            replaced_text.append(w)
                    else:
                        # Word is not in WordNet: keep it.
                        replaced_text.append(w)
            else:
                # Nothing to replace: keep the original tokens.
                replaced_text = tokens
        else:
            # Replacement disabled: keep the original tokens.
            replaced_text = tokens

        if len(replaced_text) != 0:
            self.final_dict[first_title] += join_punctuation(replaced_text) + ' '
        # Rewrite the whole accumulated dictionary on every item.
        with open('output.csv', 'w') as csv_file:
            writer = csv.writer(csv_file, delimiter='\t')
            for key, value in self.final_dict.items():
                writer.writerow([key, value])

        yield GoogleSearchItem({'name': name, 'url': url,
                                'merged': join_punctuation(replaced_text),
                                'query': query, 'priority': priority})
    except Exception:
        # Download or parsing failed: emit the item with empty text.
        yield GoogleSearchItem({'name': name, 'url': url, 'merged': "",
                                'query': query, 'priority': priority})
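# find_synonyms and join_punctuation are also external to this section. The
# sketches below show one plausible reading, assuming WordNet lemma names as
# the synonym source and a fixed set of trailing punctuation marks; they are
# illustrations, not the project's actual helpers.
from nltk.corpus import wordnet


def find_synonyms(word):
    # Distinct single-word WordNet synonyms, excluding the word itself.
    synonyms = []
    for synset in wordnet.synsets(word.lower()):
        for lemma in synset.lemma_names():
            if '_' not in lemma and lemma != word.lower() and lemma not in synonyms:
                synonyms.append(lemma)
    return synonyms


def join_punctuation(tokens, punctuation=",.;:?!'"):
    # Re-join NLTK tokens with spaces, gluing punctuation onto the
    # preceding word so "word ," becomes "word,".
    out = []
    for tok in tokens:
        if tok in punctuation and out:
            out[-1] += tok
        else:
            out.append(tok)
    return ' '.join(out)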