    def parse_item(self, response):
        print('general website processing', len(self.start_urls))
        sel = Selector(response)

        ## Get meta info from the website
        title = sel.xpath('//title/text()').extract()
        if len(title) > 0:
            title = title[0]

        contents = sel.xpath('/html/head/meta[@name="description"]/@content').extract()
        content = ' '.join(contents).strip()

        # Scrapy header values are bytes in Python 3
        fromurl = response.request.headers['Referer'].decode('utf-8')
        tourl = response.url
        depth = response.request.meta['depth']

        # Get the search item this crawl belongs to
        search_item = SearchItem.django_model.objects.get(id=self.search_id)

        # Create a new page record if this url has not been seen yet
        if not PageItem.django_model.objects.filter(url=tourl).exists():
            newpage = PageItem()
            newpage['searchterm'] = search_item
            newpage['title'] = title
            newpage['content'] = content
            newpage['url'] = tourl
            newpage['depth'] = depth
            newpage.save()

        print(fromurl, '--title:', title, '-', response.url, ' depth:', depth)

        # Look up the ids of the source and target pages
        from_page = PageItem.django_model.objects.get(url=fromurl)
        from_id = from_page.id
        to_page = PageItem.django_model.objects.get(url=tourl)
        to_id = to_page.id

        # Create the link record if it does not exist yet
        if not LinkItem.django_model.objects.filter(from_id=from_id).filter(
                to_id=to_id).exists():
            newlink = LinkItem()
            newlink['searchterm'] = search_item
            newlink['from_id'] = from_id
            newlink['to_id'] = to_id
            newlink.save()
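This callback relies on machinery a bare spider does not provide: meta['depth'] is filled in by Scrapy's built-in DepthMiddleware and the Referer header by RefererMiddleware, and both assume a spider that actually follows links. A minimal sketch of such a spider follows; the class and constructor names (GeneralSpider, search_id, start_url) are assumptions for illustration, not taken from the examples.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class GeneralSpider(CrawlSpider):
    name = 'general'
    custom_settings = {'DEPTH_LIMIT': 2}  # meta['depth'] is set by DepthMiddleware

    rules = (
        # Follow every link and hand each fetched page to parse_item
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def __init__(self, search_id=None, start_url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.search_id = search_id            # id of the SearchItem row
        self.start_urls = [start_url] if start_url else []

Note that start pages arrive without a Referer header, so a callback that reads headers['Referer'] unconditionally only works for pages reached by following links.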
Example 3
    def parse_site(self, response):
        ## Get the parsed article from newspaper, with xpath fallbacks below

        # Exclude empty elements
        def crop_emptyel(arr):
            return [u for u in arr if u != ' ']

        domain = urlparse(response.url).hostname
        path = urlparse(response.url).path
        # Note: this rebuilds the url with a forced https scheme
        url = "https://" + domain + path
        print(url)

        # Download the page again with newspaper and parse it
        article = Article(url)
        article.download()
        article.parse()

        ## Get meta info from the website
        title = article.title
        content = article.text

        sel = Selector(response)  # needed by the title and content fallbacks below
        # newspaper uses empty strings, not None, for missing fields,
        # so test for falsiness rather than comparing against None
        if not title:
            title = sel.xpath('//title/text()').extract()
            if len(title) > 0:
                title = title[0].strip().lower()

        # Fall back to a series of xpath heuristics when newspaper finds no text
        if not content:
            content = 'none'
            contents = []
            if len(crop_emptyel(sel.xpath('//div//article//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div//article//p/text()').extract())
                print('divarticle')
            elif len(crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())) > 1:
                print('article')
                contents = crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())) > 1:
                print('3')
                contents = crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())) > 1:
                print('4')
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())) > 1:
                print('6')
                contents = crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())) > 0:
                print('7')
                contents = crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())) > 1:
                print('descr')
                contents = crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())) > 1:
                print('div contains article')
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())) > 1:
                print('2')
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())) > 0:
                print('5')
                contents = crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())) > 0:
                print('1')
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())) > 0:
                contents = crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())

            # contents stays empty when no heuristic matched, leaving content as 'none'
            if contents:
                content = ' '.join(contents).strip().lower()

        # Get the search item this crawl belongs to
        search_item = SearchItem.django_model.objects.get(term=self.search_key)

        # Save the page only if it is new, has content, matches the keywords
        # and its domain is not blacklisted
        if not PageItem.django_model.objects.filter(url=response.url).exists():
            if len(content) > 0:
                if CheckQueryinReview(self.keywords, title, content):
                    if domain not in unwanted_domains:
                        newpage = PageItem()
                        newpage['searchterm'] = search_item
                        newpage['title'] = title
                        newpage['content'] = content
                        newpage['url'] = url
                        newpage['depth'] = 0
                        newpage['review'] = True
                        newpage.save()
                        print(newpage)
                        return newpage
        else:
            return None
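Throughout these examples PageItem, LinkItem and SearchItem behave like DjangoItems from scrapy_djangoitem: fields are assigned dict-style, the wrapped Django model is reachable through the django_model attribute, and save() writes through the ORM. A sketch of what the declarations might look like; the app name, model names and field lists are assumptions inferred from the usage above, not taken from the original project.

from scrapy_djangoitem import DjangoItem
from myapp.models import Page, Link, Search  # hypothetical app and models

class SearchItem(DjangoItem):
    # Wraps a Search model with at least a `term` field and an implicit `id`
    django_model = Search

class PageItem(DjangoItem):
    # Wraps a Page model with searchterm, title, content, url, depth, review
    django_model = Page

class LinkItem(DjangoItem):
    # Wraps a Link model with searchterm, from_id, to_id
    django_model = Link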
Example 5
    def parse_site(self, response):

        # Exclude empty elements
        def crop_emptyel(arr):
            return [u for u in arr if u != ' ']

        domain = urlparse(response.url).hostname

        # Download the page again with newspaper and parse it (newspaper's
        # download(input_html=response.text) could reuse the body Scrapy
        # already fetched and avoid the second request)
        a = Article(response.url)
        a.download()
        a.parse()

        ## Get meta info from the website
        title = a.title

        sel = Selector(response)
        # newspaper uses empty strings, not None, for missing fields,
        # so test for falsiness rather than comparing against None
        if not title:
            title = sel.xpath('//title/text()').extract()
            if len(title) > 0:
                title = title[0].strip().lower()

        content = a.text.replace('\n', '')

        # If newspaper could not extract the content, fall back to a series
        # of xpath heuristics
        if not content:
            content = 'none'
            contents = []
            if len(crop_emptyel(sel.xpath('//div//article//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div//article//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//article[contains(@class,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div[contains(@id,"content")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"body")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//section[contains(@class,"text")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())) > 0:
                contents = crop_emptyel(sel.xpath('//div[contains(@itemprop,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div//article[contains(@itemprop,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div[contains(@id,"description")]//span/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//div/text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())) > 1:
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"article")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())) > 0:
                contents = crop_emptyel(sel.xpath('//p[contains(@class,"lead")]//text()').extract())
            elif len(crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())) > 0:
                contents = crop_emptyel(sel.xpath('//div[contains(@class,"text")]//p/text()').extract())
            elif len(crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())) > 0:
                contents = crop_emptyel(sel.xpath('/html/head/meta[@name="description"]/@content').extract())

            # contents stays empty when no heuristic matched, leaving content as 'none'
            if contents:
                content = ' '.join(contents).strip().lower()
        # Get the search item this crawl belongs to
        search_item = SearchItem.django_model.objects.get(term=self.search_key)

        # Create a new PageItem only if the url is unseen, the content is
        # non-empty, it matches the keywords and the domain is allowed
        if not PageItem.django_model.objects.filter(url=response.url).exists():
            if len(content) > 0:
                if check_query_in_review(self.keywords, title, content):
                    if domain not in unwanted_domains:
                        newpage = PageItem()
                        newpage['searchterm'] = search_item
                        newpage['title'] = title
                        newpage['content'] = content
                        newpage['url'] = response.url
                        newpage['depth'] = 0
                        newpage['review'] = True
                        # The newpage object is saved in the pipeline
                        return newpage
        else:
            return None
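The keyword filter (check_query_in_review here, CheckQueryinReview in Example 3) is not shown in these snippets; judging by its call site it returns True when any of the spider's keywords occurs in the title or content. A plausible sketch of that contract, purely as an assumption:

# Hypothetical helper: the real check_query_in_review is not part of the
# examples, so this only illustrates a plausible contract for its call site.
def check_query_in_review(keywords, title, content):
    """Return True if any keyword appears in the title or the content."""
    title = (title or '').lower()
    content = (content or '').lower()
    return any(kw.lower() in title or kw.lower() in content for kw in keywords)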
Example 6
    def parse_item(self, response):
        sel = Selector(response)
        print("Current depth: ", response.request.meta['depth'])

        ## Get meta info from the website
        title = sel.xpath('//title/text()').extract()
        if len(title) > 0:
            title = title[0]

        ## We are not that interested in the content here, so no extra filters
        ## are applied to force the content field to be non-empty. The depth,
        ## fromurl and tourl matter more, since they feed the PageRank algorithm.
        contents = sel.xpath('/html/head/meta[@name="description"]/@content').extract()
        content = ' '.join(contents).strip()

        # Get the source url (Scrapy header values are bytes in Python 3)
        fromurl = response.request.headers['Referer'].decode('ascii', 'ignore')
        # Get the current url
        tourl = response.url
        # Get the crawl depth
        depth = response.request.meta['depth']

        # Get the search item using its id
        search_item = SearchItem.django_model.objects.get(id=self.search_id)

        # If no PageItem exists for the current url yet, create a new one
        if not PageItem.django_model.objects.filter(url=tourl).exists():
            newpage = PageItem()
            newpage['searchterm'] = search_item
            newpage['title'] = title
            newpage['content'] = content
            newpage['url'] = tourl
            newpage['depth'] = depth
            # Saved directly: the pipeline can't be used because execution may end here
            newpage.save()

        # Get the source PageItem and the current PageItem, and their ids
        from_page = PageItem.django_model.objects.get(url=fromurl)
        from_id = from_page.id
        to_page = PageItem.django_model.objects.get(url=tourl)
        to_id = to_page.id

        # Create a link from the previous page to the current page,
        # unless a LinkItem connecting the two already exists
        if not LinkItem.django_model.objects.filter(from_id=from_id).filter(
                to_id=to_id).exists():
            newlink = LinkItem()
            newlink['searchterm'] = search_item
            newlink['from_id'] = from_id
            newlink['to_id'] = to_id
            newlink.save()
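One caveat worth noting: the filter(...).exists() checks in these callbacks are not atomic, so two responses for the same url handled close together can both pass the check and save duplicate rows. Django's get_or_create performs the lookup and insert in a single call; a sketch of that variant, assuming the hypothetical models sketched earlier and a unique constraint on the url field:

# Race-safer variant of the exists()-then-save pattern; field names follow
# the assumed Page/Link models above, with Page.url declared unique.
page, page_created = PageItem.django_model.objects.get_or_create(
    url=tourl,
    defaults={'searchterm': search_item, 'title': title,
              'content': content, 'depth': depth},
)

link, link_created = LinkItem.django_model.objects.get_or_create(
    from_id=from_id, to_id=to_id,
    defaults={'searchterm': search_item},
)

Without the unique constraint, get_or_create narrows the race window but cannot eliminate it; the constraint is what guarantees a single row per url.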