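# Shared imports for the snippets below, reconstructed from the calls they
# make (an assumption, since the original file header is not shown). The
# Python 2 revision kept commented out further down would use
# `from urlparse import urlparse` instead of the Python 3 form.
from urllib.parse import urlparse

from scrapy.selector import Selector
from newspaper import Article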
def parse_item(self, response):
    print
    print 'general website processing', len(self.start_urls)
    sel = Selector(response)

    ## Get meta info from website
    title = sel.xpath('//title/text()').extract()
    if len(title) > 0:
        title = title[0].encode('utf-8')
    contents = sel.xpath('/html/head/meta[@name="description"]/@content').extract()
    content = ' '.join([c.encode('utf-8') for c in contents]).strip()
    fromurl = response.request.headers['Referer']
    tourl = response.url
    depth = response.request.meta['depth']

    # get search item
    search_item = SearchItem.django_model.objects.get(id=self.search_id)

    # new page: save it only if no PageItem exists for this url yet
    if not PageItem.django_model.objects.filter(url=tourl).exists():
        newpage = PageItem()
        newpage['searchterm'] = search_item
        newpage['title'] = title
        newpage['content'] = content
        newpage['url'] = tourl
        newpage['depth'] = depth
        newpage.save()

    print fromurl, '--title:', title, '-', response.url, ' depth:', depth

    # get from_id, to_id
    from_page = PageItem.django_model.objects.get(url=fromurl)
    from_id = from_page.id
    to_page = PageItem.django_model.objects.get(url=tourl)
    to_id = to_page.id

    # new link: save it only if no LinkItem connects these two pages yet
    if not LinkItem.django_model.objects.filter(from_id=from_id).filter(to_id=to_id).exists():
        newlink = LinkItem()
        newlink['searchterm'] = search_item
        newlink['from_id'] = from_id
        newlink['to_id'] = to_id
        newlink.save()
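# The methods in this file assume PageItem, LinkItem and SearchItem are
# scrapy_djangoitem items, since they reach the backing Django model through
# the `django_model` attribute. A minimal sketch of those definitions; the
# Page, Link and Search model names and the `core.models` import path are
# hypothetical, not confirmed by this file.
from scrapy_djangoitem import DjangoItem
from core.models import Page, Link, Search  # hypothetical models module

class PageItem(DjangoItem):
    django_model = Page

class LinkItem(DjangoItem):
    django_model = Link

class SearchItem(DjangoItem):
    django_model = Search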
def parse_site(self, response):
    ## Get the selector for xpath parsing or from newspaper
    def crop_emptyel(arr):
        return [u for u in arr if u != ' ']

    domain, path = urlparse(response.url).hostname, urlparse(response.url).path
    url = "https://" + str(domain + path)
    print(url)

    article = Article(url)
    article.download()
    # Parse the article and fetch the authors' names
    article.parse()

    ## Get meta info from website
    title = article.title
    content = article.text
    #print("content: ", content)

    sel = Selector(response)
    if not title:  # newspaper returns an empty string when it finds no title
        title = sel.xpath('//title/text()').extract()
        if len(title) > 0:
            title = title[0].strip().lower()
    if not content:  # fall back to xpath heuristics when newspaper finds no text
        content = 'none'
        contents = []  # guard in case none of the selectors below matches
        if len(crop_emptyel(sel.xpath('//div//article//p/text()').extract())) > 1:
            contents = crop_emptyel(sel.xpath('//div//article//p/text()').extract())
            print('divarticle')
        elif len(crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())) > 1:
            print('article')
            contents = crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())) > 1:
            print('3')
            contents = crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())) > 1:
            print('4')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())) > 1:
            print('6')
            contents = crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())) > 0:
            print('7')
            contents = crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())) > 1:
            contents = crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())) > 1:
            print('descr')
            contents = crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())) > 1:
            print('div contains article')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())) > 1:
            print('2')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())) > 0:
            print('5')
            contents = crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())) > 0:
            print('1')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())) > 0:
            contents = crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())
        # join the unicode strings directly; encoding to utf-8 here would break the join on Python 3
        content = ' '.join(contents).strip().lower()
    #print('content:', content)

    # get search item
    search_item = SearchItem.django_model.objects.get(term=self.search_key)

    # save item only if no PageItem exists for this url yet
    if not PageItem.django_model.objects.filter(url=response.url).exists():
        if len(content) > 0:
            if CheckQueryinReview(self.keywords, title, content):
                if domain not in unwanted_domains:
                    newpage = PageItem()
                    newpage['searchterm'] = search_item
                    newpage['title'] = title
                    newpage['content'] = content
                    newpage['url'] = url
                    newpage['depth'] = 0
                    newpage['review'] = True
                    newpage.save()
                    print(newpage)
                    return newpage
    else:
        return None
'''
# Earlier Python 2 revision of parse_site, kept commented out for reference.
def parse_site(self, response):
    print 'parsing', response.url

    ## Get the selector for xpath parsing
    sel = Selector(response)

    def crop_emptyel(arr):
        return [u for u in arr if u != ' ']

    ## Get meta info from website
    title = sel.xpath('//title/text()').extract()
    if len(title) > 0:
        title = title[0].encode('utf-8').strip().lower()
    content = 'none'
    contents = []  # guard in case none of the selectors below matches
    if len(crop_emptyel(sel.xpath('//div//article//p/text()').extract())) > 1:
        contents = crop_emptyel(sel.xpath('//div//article//p/text()').extract())
        print 'divarticle'
    elif len(crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())) > 1:
        print 'article'
        contents = crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())) > 1:
        print '3'
        contents = crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())) > 1:
        print '4'
        contents = crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())) > 1:
        print '6'
        contents = crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())) > 0:
        print '7'
        contents = crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())) > 1:
        contents = crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())) > 1:
        print 'descr'
        contents = crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())
    elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())) > 1:
        print 'div contains article'
        contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())
    elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())) > 1:
        print '2'
        contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())) > 0:
        print '5'
        contents = crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())
    elif len(crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())) > 0:
        print '1'
        contents = crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())
    elif len(crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())) > 0:
        contents = crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())
    content = ' '.join([c.encode('utf-8') for c in contents]).strip().lower()
    print 'title:', title
    print 'content:', content

    # get search item
    search_item = SearchItem.django_model.objects.get(term=self.search_key)

    if not PageItem.django_model.objects.filter(url=response.url).exists():
        if len(content) > 0:
            if self.search_key in content or self.search_key in title:
                newpage = PageItem()
                newpage['searchterm'] = search_item
                newpage['title'] = title
                newpage['content'] = content
                newpage['url'] = response.url
                newpage['depth'] = 0
                newpage['review'] = True
                #newpage.save()
                return newpage
    else:
        return None
'''
def parse_site(self, response):
    #print("Current depth: ", response.request.meta)

    # Exclude empty elements
    def crop_emptyel(arr):
        return [u for u in arr if u != ' ']

    domain = urlparse(response.url).hostname

    a = Article(response.url)
    a.download()
    a.parse()

    ## Get meta info from website
    title = a.title
    sel = Selector(response)
    if not title:  # newspaper returns an empty string when it finds no title
        title = sel.xpath('//title/text()').extract()
        if len(title) > 0:
            title = title[0].strip().lower()

    content = a.text.replace('\n', '')
    # If we couldn't get the content using the newspaper library, then try the
    # Selector with several filters
    if not content:  # newspaper returns an empty string, not None
        content = 'none'
        contents = []  # guard in case none of the selectors below matches
        if len(crop_emptyel(sel.xpath('//div//article//p/text()').extract())) > 1:
            contents = crop_emptyel(sel.xpath('//div//article//p/text()').extract())
            #print('divarticle')
        elif len(crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())) > 1:
            #print('article')
            contents = crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())) > 1:
            #print('using method 3')
            contents = crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())) > 1:
            #print('using method 4')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())) > 1:
            #print('using method 5')
            contents = crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())) > 0:
            #print('using method 6')
            contents = crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())) > 1:
            #print('using method 7')
            contents = crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())) > 1:
            #print('using descr')
            contents = crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())) > 1:
            #print('using div contains article')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())) > 1:
            #print('using method 8')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())) > 0:
            #print('using method 9')
            contents = crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())
        elif len(crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())) > 0:
            #print('using method 10')
            contents = crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())
        elif len(crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())) > 0:
            contents = crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())
        content = ' '.join(contents).strip().lower()

    #print('title:', title)
    #print('content:', content)

    # get search item
    search_item = SearchItem.django_model.objects.get(term=self.search_key)

    # save item
    # If the PageItem object does not exist
    if not PageItem.django_model.objects.filter(url=response.url).exists():
        # check the content and create a PageItem if the content contains any
        # of the keywords and the page domain is allowed
        if len(content) > 0:
            if check_query_in_review(self.keywords, title, content):
                if domain not in unwanted_domains:
                    newpage = PageItem()
                    newpage['searchterm'] = search_item
                    newpage['title'] = title
                    newpage['content'] = content
                    newpage['url'] = response.url
                    newpage['depth'] = 0
                    newpage['review'] = True
                    # newpage.save()  # the newpage object will be saved in the pipeline
                    return newpage
            else:
                pass  # no keywords in title or content
    else:
        return None
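# check_query_in_review and unwanted_domains are used above but not defined in
# this file. A minimal sketch of both, assuming the helper simply tests whether
# any keyword occurs in the title or the body text; the blocklist entries are
# placeholders, not the project's actual values.
unwanted_domains = ('facebook.com', 'twitter.com')  # hypothetical blocklist

def check_query_in_review(keywords, title, content):
    """Return True if any keyword occurs in the title or the content."""
    title = (title or '').lower()
    content = (content or '').lower()
    return any(k.lower() in title or k.lower() in content for k in keywords)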
def parse_item(self, response):
    sel = Selector(response)
    print("Current depth: ", response.request.meta['depth'])

    ## Get meta info from website
    title = sel.xpath('//title/text()').extract()
    if len(title) > 0:
        title = title[0]

    ## We are not that interested in the content, which is why we don't apply
    ## more filters to force having something in the content field.
    ## We are more interested in the depth, fromurl and tourl that we will use
    ## for our PageRank algorithm.
    contents = sel.xpath('/html/head/meta[@name="description"]/@content').extract()
    content = ' '.join(contents).strip()

    # Get source url (the Referer header arrives as bytes)
    fromurl = response.request.headers['Referer'].decode('ascii', 'ignore')
    # Get current url
    tourl = response.url
    # Get depth
    depth = response.request.meta['depth']

    # Get search item using its id.
    search_item = SearchItem.django_model.objects.get(id=self.search_id)

    # If a PageItem linked to the current url does not exist, create a new one.
    if not PageItem.django_model.objects.filter(url=tourl).exists():
        newpage = PageItem()
        newpage['searchterm'] = search_item
        newpage['title'] = title
        newpage['content'] = content
        newpage['url'] = tourl
        newpage['depth'] = depth
        newpage.save()  # can't use the pipeline because execution can finish here

    # Get the source PageItem and the current PageItem, then take their ids
    from_page = PageItem.django_model.objects.get(url=fromurl)
    from_id = from_page.id
    to_page = PageItem.django_model.objects.get(url=tourl)
    to_id = to_page.id

    # Create a link from the previous page to the current page.
    # If no LinkItem connects the source and current pages yet, create one.
    if not LinkItem.django_model.objects.filter(from_id=from_id).filter(to_id=to_id).exists():
        newlink = LinkItem()
        newlink['searchterm'] = search_item
        newlink['from_id'] = from_id
        newlink['to_id'] = to_id
        newlink.save()
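# For context, a sketch of how these callbacks could be wired into a Scrapy
# CrawlSpider. The SearchSpider name, the DEPTH_LIMIT value and the rule are
# assumptions for illustration; only parse_item and parse_site come from the
# code above.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SearchSpider(CrawlSpider):
    name = 'search'
    # response.request.meta['depth'] is filled in by Scrapy's depth middleware
    custom_settings = {'DEPTH_LIMIT': 2}

    # Follow every outgoing link, recording the page/link graph for PageRank
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    def parse_start_url(self, response):
        # Pages reached directly from the start urls go through the article
        # extraction path instead
        return self.parse_site(response)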