Exemple #1
0
    def get_question(self, selector, response):
        """Build a LazyTweetQuestion item from a question node.

        `selector` is the node the relative XPaths are evaluated against;
        `response` supplies the URL whose last path segment becomes the
        question id.
        """
        # XPaths that begin with a dot are resolved relative to `selector`.
        meta_xpath = './/span[@class="post-meta"]'
        content_xpath = (
            './/span[@class="post-body"]'
            '//span[@class="post-status"]/descendant-or-self::text()'
        )
        # No leading dot here, so this one is evaluated from the document
        # root (the original author marked the tags field as "not useful").
        tags_xpath = '//*[@id="post-tags"]/ul/li/a/text()'
        date_xpath = meta_xpath + '//span[@class="timestamp"]/text()'
        answer_count_xpath = meta_xpath + '//a[last()]/text()'

        loader = XPathItemLoader(item=LazyTweetQuestion(), selector=selector)
        loader.add_xpath('question_content', content_xpath)
        loader.add_xpath('question_tags', tags_xpath)
        loader.add_xpath('asking_date', date_xpath)
        loader.add_value('asker', self.get_user(selector.select(meta_xpath)))
        loader.add_xpath('number_of_answers', answer_count_xpath)
        loader.add_value('question_id', response.url.split('/')[-1])
        print(loader.get_output_value('question_tags'))
        return loader.load_item()
Exemple #2
0
 def get_question(self, selector, response):
     """Load a LazyTweetQuestion from `selector` and return the item.

     The question id is the last '/'-separated segment of `response.url`;
     the asker is whatever `self.get_user` returns for the post-meta span.
     """
     post_meta = './/span[@class="post-meta"]'
     # (field, xpath) pairs loaded before the asker is resolved. A leading
     # dot makes an XPath relative to `selector`; the tags XPath has none.
     leading_fields = (
         ('question_content',
          './/span[@class="post-body"]'
          '//span[@class="post-status"]/descendant-or-self::text()'),
         # marked "not useful" by the original author
         ('question_tags', '//*[@id="post-tags"]/ul/li/a/text()'),
         ('asking_date', post_meta + '//span[@class="timestamp"]/text()'),
     )

     question_loader = XPathItemLoader(item=LazyTweetQuestion(),
                                       selector=selector)
     for field_name, field_xpath in leading_fields:
         question_loader.add_xpath(field_name, field_xpath)

     meta_node = selector.select(post_meta)
     question_loader.add_value('asker', self.get_user(meta_node))
     question_loader.add_xpath('number_of_answers',
                               post_meta + '//a[last()]/text()')

     question_id = response.url.split('/')[-1]
     question_loader.add_value('question_id', question_id)
     print(question_loader.get_output_value('question_tags'))
     return question_loader.load_item()
Exemple #3
0
 def get_answer(self, selector, response):
     """Build a LazyTweetAnswer item from one answer node.

     `selector` is the answer node the relative XPaths are evaluated
     against; `response` supplies the URL whose last path segment is stored
     as the question id. Returns the loaded item.
     """
     answer_loader = XPathItemLoader(item=LazyTweetAnswer(),
                                     selector=selector)
     answer_loader.add_value('question_id', response.url.split('/')[-1])
     answer_loader.add_value('answerer', self.get_user(
         selector.select('.//span[@class="answer-meta"]')))
     answer_loader.add_xpath('answer_content', ''.join([
         './/span[@class="answer-body"]',
         '//span[@class="answer-status"]//descendant-or-self::text()'
         ]))
     print(answer_loader.get_output_value('answer_content'))
     # The former `a = input()` debugging pause was removed: it blocked the
     # crawl on stdin for every answer and, on Python 2, evaluated whatever
     # was typed as an expression.
     return answer_loader.load_item()
Exemple #4
0
    def process_item(self, task_id):
        """Parse the stored full report for `task_id` into an NrcParsedReport.

        This is a generator. It yields nothing when
        `self.db.loadScrapedFullReport` returns None; otherwise it yields the
        single loaded item and, once the consumer resumes the generator,
        calls `self.item_completed(task_id)`.
        """
        report = self.db.loadScrapedFullReport(task_id)
        if report is None:
            return

        # Clamp every character above code point 127 to chr(127) so the body
        # handed to TextResponse is pure ASCII.
        text = "".join(chr(min(ord(c), 127)) for c in report["full_report_body"])
        response = TextResponse(
            url=report["full_report_url"],
            body=text.encode("utf-8"),  # must have utf-8 here
        )
        loader = XPathItemLoader(NrcParsedReport(), response=response)
        loader.add_value("reportnum", task_id)

        # Each pattern entry is indexed as (field name, regex).
        for p in self.compile_patterns():
            loader.add_value(p[0], text, TakeFirst(), unicode.strip, re=p[1])

        county = loader.get_output_value("county")
        pattern = self.get_area_code_pattern(county)
        if pattern:
            loader.add_value("areaid", county)
            # Raw strings keep \s and \d as regex escapes rather than relying
            # on Python passing unknown string escapes through unchanged.
            loader.add_value("blockid", text, TakeFirst(), unicode.strip,
                             re=r"%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
            loader.add_value("blockid", text, TakeFirst(), unicode.strip,
                             re=r"BLOCK[\s]+([\d]+)")

        item = loader.load_item()

        yield item
        self.item_completed(task_id)
Exemple #5
0
 def get_answer(self, selector, response):
     """Load a LazyTweetAnswer from `selector` and return the item.

     The question id is the last '/'-separated segment of `response.url`;
     the answerer is whatever `self.get_user` returns for the answer-meta
     span found under `selector`.
     """
     answer_loader = XPathItemLoader(item=LazyTweetAnswer(),
                                     selector=selector)
     answer_loader.add_value('question_id', response.url.split('/')[-1])

     meta_node = selector.select('.//span[@class="answer-meta"]')
     answer_loader.add_value('answerer', self.get_user(meta_node))

     answer_loader.add_xpath(
         'answer_content', ''.join([
             './/span[@class="answer-body"]',
             '//span[@class="answer-status"]//descendant-or-self::text()'
         ]))
     print(answer_loader.get_output_value('answer_content'))
     # No `input()` here any more: the leftover debugging call stalled the
     # spider waiting on stdin and, under Python 2, eval()'d the typed text.
     return answer_loader.load_item()
Exemple #6
0
    def get_answer(self, selector, question_loader):
        """Build a YahooAnswer item from one answer node.

        `selector` is the answer node the relative XPaths are evaluated
        against; `question_loader` is the loader of the owning question,
        from which the `question_id` output value is copied. Returns the
        loaded item.
        """
        rate_up_xpath = ''.join([
            './/div[@class="utils-container"]',
            '//li[@class="rate-up"]',
            ])

        answer_loader = XPathItemLoader(item=YahooAnswer(), selector=selector)
        answer_loader.add_xpath('answer_id', './@id')
        answer_loader.add_xpath('answer_content', './/div[@class="qa-container"]//div[@class="content"]//text()')
        answer_loader.add_value('answerer', self.get_user(selector))
        answer_loader.add_value('question_id', question_loader.get_output_value('question_id'))
        answer_loader.add_xpath('answering_date', ''.join([
            './/div[@class="qa-container"]//ul[@class="meta"]',
            '/li[1]/abbr/@title'
            ]))
        # The rating text is collected from both the span and its <strong>.
        answer_loader.add_xpath('marks', rate_up_xpath + '//span[@class="seo-rated"]/text()')
        answer_loader.add_xpath('marks', rate_up_xpath + '//span[@class="seo-rated"]//strong/text()')

        # Good marks: the leading token of the rating text.
        # NOTE(review): assumes the 'marks' output value is a single string
        # (as the original code did) -- confirm against the item definition.
        marks = answer_loader.get_output_value('marks')
        # str.find() returns -1 (truthy) on a miss and 0 (falsy) on a hit at
        # index 0, so test membership instead; also tolerate missing marks.
        if marks and 'good' in marks:
            answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])

        # Best answer: flagged through the node's class attribute. A node
        # without a class attribute is simply not the best answer.
        class_values = selector.select('./@class').extract()
        is_best = bool(class_values) and 'best' in class_values[0]
        answer_loader.add_value('is_best_answer', 1 if is_best else 0)

        return answer_loader.load_item()
 def process_item(self, task_id):
     """Turn the scraped full report of `task_id` into an NrcParsedReport.

     Generator: ends immediately without yielding when no report is stored
     for the task. Otherwise yields one loaded item; after the consumer
     resumes the generator, `self.item_completed(task_id)` is called.
     """
     report = self.db.loadScrapedFullReport(task_id)
     if report is None:
         return

     text = report['full_report_body']
     # Every character above code point 127 is replaced by chr(127).
     text = "".join(chr(min(ord(c), 127)) for c in text)
     report_response = TextResponse(url=report['full_report_url'],
                                    body=text.encode('utf-8'))  # must have utf-8 here
     loader = XPathItemLoader(NrcParsedReport(), response=report_response)
     loader.add_value('reportnum', task_id)

     patterns = self.compile_patterns()

     # p[0] is the target field name, p[1] the regex applied to the text.
     for p in patterns:
         loader.add_value(p[0], text, TakeFirst(), unicode.strip, re=p[1])

     county = loader.get_output_value('county')
     pattern = self.get_area_code_pattern(county)
     if pattern:
         loader.add_value('areaid', county)
         # Raw string literals: \s and \d are regex escapes, not string
         # escapes, so they should not live in ordinary string literals.
         loader.add_value('blockid', text, TakeFirst(), unicode.strip,
                          re=r"%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
         loader.add_value('blockid', text, TakeFirst(), unicode.strip,
                          re=r"BLOCK[\s]+([\d]+)")

     item = loader.load_item()

     yield item
     self.item_completed(task_id)
    def parse(self, response):
        """Yield the question, its asker, and each signed answer with its author.

        The question id is the integer parsed from the last path segment of
        `response.url`. Answers whose author name output value is None are
        skipped.
        """
        page = Selector(response)
        answers_xpath = '//div[@id="zh-question-answer-wrap"]/div[contains(@class, "zm-item-answer")]'
        asker_xpath = '//div[contains(@class, "zh-question-followers-sidebar")]//a[contains(@class, "zm-item-link-avatar")]'
        author_xpath = './/div[contains(@class, "zm-item-answer-author-info")]/h3//a[2]'

        # Question
        question_id = int(response.url.split('/')[-1])
        q_loader = XPathItemLoader(item=ZhiHuQ(), selector=page)
        q_loader.add_xpath('title', '//div[@id="zh-question-title"]/h2/text()')
        q_loader.add_xpath('content', '//div[@id="zh-question-detail"]//text()')
        q_loader.add_value('id', question_id)

        # Asker: the first avatar link in the followers sidebar
        asker_loader = XPathItemLoader(item=ZhiHuU(), selector=page)
        asker_loader.add_xpath('name', asker_xpath + '[1]/@title')
        asker_loader.add_xpath('url', asker_xpath + '[1]/@href')
        asker_name = asker_loader.get_output_value('name')
        asker_loader.add_value('id', generate_uid(asker_name))

        # Attach the asker to the question, then emit both
        q_loader.add_value('user', asker_loader.load_item())
        yield q_loader.load_item()
        yield asker_loader.load_item()

        # Answers
        for answer_node in page.xpath(answers_xpath):
            answer_loader = XPathItemLoader(item=ZhiHuA(), selector=answer_node)
            answer_loader.add_xpath('id', './@data-aid')
            answer_loader.add_value('qid', q_loader.get_output_value('id'))
            answer_loader.add_xpath('content', './/div[contains(@class, "zm-item-rich-text")]//text()')
            answer_loader.add_xpath('score', './/div[contains(@class, "zm-item-vote")]/a[contains(@class, "zm-item-vote-count")]/@data-votecount')

            # Answer author; anonymous users yield no name
            user_loader = XPathItemLoader(item=ZhiHuU(), selector=answer_node)
            user_loader.add_xpath('name', author_xpath + '/text()')
            user_loader.add_xpath('url', author_xpath + '/@href')

            author_name = user_loader.get_output_value('name')
            if author_name is None:
                continue

            user_loader.add_value('id', generate_uid(author_name))
            answer_loader.add_value('asr', user_loader.load_item())
            yield answer_loader.load_item()
            yield user_loader.load_item()
Exemple #9
0
    def get_user(self, selector, response, label):
        """Load a StackOverflowUser from the user-details div under `selector`.

        `response` and `label` are accepted but not used by this method.
        When a user link was extracted it is also stored as `user_id`.
        """
        details_link = './/div[contains(@class, "user-details")]/a'

        user_loader = XPathItemLoader(item=StackOverflowUser(),
                                      selector=selector)
        user_loader.add_xpath('user_name', details_link + '/text()')
        user_loader.add_xpath('user_link', details_link + '/@href')

        link = user_loader.get_output_value('user_link')
        if link:
            # The link itself doubles as the user id.
            user_loader.add_value('user_id', link)

        return user_loader.load_item()
Exemple #10
0
 def get_user(self, selector):
     """Build a LazyTweetUser item from the first link under `selector`.

     The text of the first child anchor is loaded as `twitter_username`.
     `twitter_url` is only added when a username was actually extracted.
     Returns the loaded item.
     """
     user_loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
     user_loader.add_xpath('twitter_username', './a[1]/text()')

     username = user_loader.get_output_value('twitter_username')
     # Joining the base URL with a missing username raised TypeError and
     # aborted the whole item; a profile URL without a username would be
     # meaningless anyway, so skip the field instead.
     if username:
         user_loader.add_value(
             'twitter_url', ''.join([r'http://twitter.com/', username]))
     return user_loader.load_item()
Exemple #11
0
 def get_user(self, selector):
     """Load a LazyTweetUser from `selector` and return the item.

     `twitter_username` is the text of the first child anchor. When one is
     found, `twitter_url` is the Twitter base URL followed by that name;
     otherwise the URL field is left unset.
     """
     user_loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
     user_loader.add_xpath('twitter_username', './a[1]/text()')

     twitter_username = user_loader.get_output_value('twitter_username')
     if not twitter_username:
         # Nothing extracted: building the URL would raise TypeError on the
         # missing name, so return the item without a twitter_url.
         return user_loader.load_item()

     user_loader.add_value('twitter_url',
                           r'http://twitter.com/' + twitter_username)
     return user_loader.load_item()
 def parse(self, response):
     """Yield one price-page request for every ticket listing on the page.

     Each request carries its partially filled item loader in
     `meta['loader']` and is handled by `self.parse_price`.
     """
     # (field, xpath) pairs, all relative to a single ticket node
     field_xpaths = (
         ('eventname', './/span[@class="summary listingEventName"]/text()'),
         ('eventlocation', './/div[@class="divVenue location"]/text()'),
         ('ticketslink', './/a[@class="divEventDetails url"]/@href'),
     )

     tickets = HtmlXPathSelector(response).select(self.tickets_list_xpath)
     for ticket in tickets:
         loader = XPathItemLoader(ComparatorItem(), selector=ticket)
         # strip every extracted value and join the results per field
         loader.default_input_processor = MapCompose(unicode.strip)
         loader.default_output_processor = Join()
         for field_name, field_xpath in field_xpaths:
             loader.add_xpath(field_name, field_xpath)

         link = loader.get_output_value("ticketslink")
         print("Here is ticket link \n" + link)
         target = urljoin(response.url, "https://www.ticketcity.com/" + link)
         yield scrapy.Request(target,
                              meta={'loader': loader},
                              callback=self.parse_price,
                              dont_filter=True)
Exemple #13
0
    def get_user(self, selector):
        """Build a YahooUser item from the user markup under `selector`.

        Returns the loaded item, or None when no `user_name` value was
        collected. `user_id` is taken from the `show=` parameter of the
        profile URL and is only set when that URL is present and matches
        the expected profile pattern.
        """
        user_loader = XPathItemLoader(item=YahooUser(), selector=selector)
        user_loader.add_xpath('user_name', './/span[contains(@class, "user")]//span[contains(@class, "fn")]/text()')
        user_loader.add_xpath('user_url', './/span[@class="user"]//a[@class="url"]/@href')

        # No name collected means there is no user to report.
        if not user_loader.get_collected_values('user_name'):
            return None

        # Previously re.match(...).group(1) ran unconditionally, before the
        # name check, so a missing or non-matching profile URL raised
        # instead of letting the method return normally.
        user_url = user_loader.get_output_value('user_url')
        profile = None
        if user_url:
            profile = re.match(
                r'http://answers\.yahoo\.com/my/profile\?show=(.*)', user_url)
        if profile:
            user_loader.add_value('user_id', profile.group(1))

        return user_loader.load_item()
    def parse(self, response):
        """Yield one price-page request for every ticket listing on the page.

        Each request carries its partially filled item loader in
        `meta['loader']` and is handled by `self.parse_price`.
        """
        # (field, xpath) pairs, all relative to a single ticket node
        field_xpaths = (
            ('eventname', './/*[@class="productionsEvent"]/text()'),
            ('eventlocation', './/*[@class = "productionsVenue"]/span[@itemprop  = "name"]/text()'),
            ('ticketslink', './/*/a[@class = "btn btn-primary"]/@href'),
            ('eventdate', './/*[@class = "productionsDate"]/text()'),
            ('eventcity', './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressLocality"]/text()'),
            ('eventstate', './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressRegion"]/text()'),
            ('eventtime', './/*[@class = "productionsTime"]/text()'),
        )

        tickets = HtmlXPathSelector(response).select(self.tickets_list_xpath)
        for ticket in tickets:
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # strip every extracted value and join the results per field
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            for field_name, field_xpath in field_xpaths:
                loader.add_xpath(field_name, field_xpath)

            link = loader.get_output_value("ticketslink")
            print("Here is ticket link \n" + link)
            # NOTE(review): `bandname` is not defined in this method;
            # presumably a module-level name -- confirm.
            relative_url = "concerts/" + bandname + "-tickets/" + bandname + "-" + link
            target = urljoin(response.url, relative_url)
            yield scrapy.Request(target,
                                 meta={'loader': loader},
                                 callback=self.parse_price,
                                 dont_filter=True)
Exemple #15
0
    def parse_question_page(self, response):
        """Yield every answer item of a question page, then the question item.

        The best answer (when `self.best_answer_xpath` matches) is yielded
        first, followed by the remaining answers found under
        `self.answer_xpath`, and finally the loaded YahooQuestion.
        """
        hxs = HtmlXPathSelector(response)
        # The unused `answers_loader` local was dropped: it built a loader
        # and a YahooAnswer on every page and was never read.
        question_loader = XPathItemLoader(item=YahooQuestion(), selector=hxs)

        # question id: the `qid` query parameter of the request URL
        question_loader.add_value(
            'question_id',
            ''.join(parse_qs(urlparse(response.request.url).query)['qid']))

        # question title
        question_loader.add_xpath(
            'question_title',
            self.question_xpath + '//h1[contains(@class, "subject")]/text()')

        # question content
        question_loader.add_xpath(
            'question_content',
            self.question_xpath + '//div[contains(@class, "content")]/text()')

        # question status
        question_loader.add_xpath(
            'status', self.question_xpath + '//div[@class="hd"]//h2/text()')

        # question url, rebuilt from the id loaded above
        question_loader.add_value('question_url', ''.join([
            'http://answers.yahoo.com/question/index?qid=',
            question_loader.get_output_value('question_id')
            ]))

        # question date
        question_loader.add_xpath('asking_date', ''.join([
            self.question_xpath,
            '//div[@class="qa-container"]//ul[@class="meta"]',
            '/li[1]/abbr/@title'
            ]))

        # import date: local time at which the page was parsed
        question_loader.add_value(
            'import_date',
            time.strftime("%Y-%m-%d %A %X %Z", time.localtime()))

        # asking user
        question_loader.add_value(
            'asker', self.get_user(hxs.select(self.question_xpath)))

        # interesting marks
        question_loader.add_xpath('number_of_interesting_marks', ''.join([
            '//ul[@id="yan-question-tools"]',
            '//li[@id="yan-starthis"]',
            '//span[contains(@class,"star-count")]/text()'
            ]))

        # number of answers
        question_loader.add_xpath('number_of_answers', ''.join([
            self.answer_xpath,
            '/div[@class="hd"]',
            '/h3/text()'
            ]))

        # category of the question item
        question_loader.add_xpath(
            'category', ''.join([self.category_xpath, '//li//a//text()']))

        # best answer
        best_answer_selector = hxs.select(self.best_answer_xpath)
        if best_answer_selector:
            yield self.get_answer(best_answer_selector, question_loader)

        # other answers
        for ans_selector in hxs.select(self.answer_xpath).select('.//li/div[@class="answer"]'):
            yield self.get_answer(ans_selector, question_loader)

        yield question_loader.load_item()
Exemple #16
0
    def parse(self, response):
        """Yield the question, its asker, and each signed answer with its author.

        The question id is the integer parsed from the last '/'-separated
        segment of `response.url`. Answers whose author name output value is
        None are skipped; the others are counted in `answer_number`.
        """
        sel = Selector(response)
        answers_xpath = '//div[@id="zh-question-answer-wrap"]/div[contains(@class, "zm-item-answer")]'
        # NOTE(review): asker_xpath is not referenced again in the visible
        # code; the asker XPaths below repeat this expression literally.
        asker_xpath = '//div[contains(@class, "zh-question-followers-sidebar")]//a[contains(@class, "zm-item-link-avatar")]'
        answer_number = 0

        # use ItemLoader to populate the data
        # question
        q_id = int(response.url.split('/')[-1])
        q_loader = XPathItemLoader(item=ZhiHuQ(), selector=sel)
        q_loader.add_xpath('title', '//div[@id="zh-question-title"]/h2/text()')
        q_loader.add_xpath('content',
                           '//div[@id="zh-question-detail"]//text()')
        q_loader.add_value('id', q_id)

        # asker information: first avatar link in the followers sidebar
        asker_loader = XPathItemLoader(item=ZhiHuU(), selector=sel)
        asker_loader.add_xpath(
            'name',
            '//div[contains(@class, "zh-question-followers-sidebar")]//a[contains(@class, "zm-item-link-avatar")][1]/@title'
        )
        asker_loader.add_xpath(
            'url',
            '//div[contains(@class, "zh-question-followers-sidebar")]//a[contains(@class, "zm-item-link-avatar")][1]/@href'
        )
        asker_loader.add_value(
            'id', generate_uid(asker_loader.get_output_value('name')))
        print asker_loader.get_output_value('name')

        # add user to question field
        q_loader.add_value('user', asker_loader.load_item())

        # yield question and asker
        yield q_loader.load_item()
        yield asker_loader.load_item()

        # generate answer information
        for ans_selector in sel.xpath(answers_xpath):
            answer_loader = XPathItemLoader(item=ZhiHuA(),
                                            selector=ans_selector)
            answer_loader.add_xpath('id', './@data-aid')
            answer_loader.add_value('qid', q_loader.get_output_value('id'))
            answer_loader.add_xpath(
                'content',
                './/div[contains(@class, "zm-item-rich-text")]//text()')
            answer_loader.add_xpath(
                'score',
                './/div[contains(@class, "zm-item-vote")]/a[contains(@class, "zm-item-vote-count")]/@data-votecount'
            )

            # answerer info
            user_loader = XPathItemLoader(item=ZhiHuU(), selector=ans_selector)
            # some users are anonymous and have no name link
            user_loader.add_xpath(
                'name',
                './/div[contains(@class, "zm-item-answer-author-info")]/h3//a[2]/text()'
            )
            user_loader.add_xpath(
                'url',
                './/div[contains(@class, "zm-item-answer-author-info")]/h3//a[2]/@href'
            )
            if user_loader.get_output_value('name') is not None:
                # print user_loader.get_output_value('name').encode('utf-8')

                # count only the answers that are actually yielded
                answer_number += 1
                user_loader.add_value(
                    'id', generate_uid(user_loader.get_output_value('name')))
                answer_loader.add_value('asr', user_loader.load_item())
                yield answer_loader.load_item()
                yield user_loader.load_item()
            else:
                continue

        # NOTE(review): 'num' is added after the question item was already
        # yielded above, and load_item() is not called again in the visible
        # code, so the count does not appear to reach the yielded item --
        # confirm whether the question should be yielded after the loop.
        q_loader.add_value('num', answer_number)
        print q_loader.get_output_value('num')