def get_question(self, selector, response):
    """Populate a ``LazyTweetQuestion`` item from one question node.

    Args:
        selector: selector positioned on the question markup; the
            relative (``.//``) expressions below are evaluated against it.
        response: page response; the last ``/``-separated segment of
            ``response.url`` is stored as ``question_id``.

    Returns:
        The item returned by the loader's ``load_item()``.
    """
    # Both add_xpath() and selector.select() need the leading dot so the
    # expression is resolved relative to `selector`.
    post_meta = './/span[@class="post-meta"]'
    loader = XPathItemLoader(item=LazyTweetQuestion(), selector=selector)

    loader.add_xpath(
        'question_content',
        './/span[@class="post-body"]'
        + '//span[@class="post-status"]/descendant-or-self::text()')
    # This expression has no leading dot, so it is not scoped to `selector`;
    # the field was already flagged as "not useful".
    loader.add_xpath('question_tags', '//*[@id="post-tags"]/ul/li/a/text()')
    loader.add_xpath(
        'asking_date',
        './/span[@class="post-meta"]//span[@class="timestamp"]/text()')

    asker = self.get_user(selector.select(post_meta))
    loader.add_value('asker', asker)
    loader.add_xpath('number_of_answers', post_meta + '//a[last()]/text()')
    loader.add_value('question_id', response.url.split('/')[-1])

    # Debug output; the parenthesised form prints the same text under the
    # Python 2 print statement.
    print(loader.get_output_value('question_tags'))
    return loader.load_item()
def get_question(self, selector, response):
    """Load the question fields found under ``selector`` into an item.

    Args:
        selector: node the relative XPath expressions are evaluated against.
        response: page response; ``question_id`` is taken from the text
            after the final ``/`` of ``response.url``.

    Returns:
        The ``LazyTweetQuestion`` item built by ``load_item()``.
    """
    loader = XPathItemLoader(item=LazyTweetQuestion(), selector=selector)
    meta_xpath = './/span[@class="post-meta"]'

    # Expressions that begin with "." are relative to `selector`; the
    # question_tags one is absolute (and was noted as "not useful").
    leading_fields = (
        ('question_content',
         './/span[@class="post-body"]'
         '//span[@class="post-status"]/descendant-or-self::text()'),
        ('question_tags',
         '//*[@id="post-tags"]/ul/li/a/text()'),
        ('asking_date',
         './/span[@class="post-meta"]//span[@class="timestamp"]/text()'),
    )
    for field_name, xpath in leading_fields:
        loader.add_xpath(field_name, xpath)

    # The asker is parsed from the post-meta span by the sibling helper.
    loader.add_value('asker', self.get_user(selector.select(meta_xpath)))
    loader.add_xpath('number_of_answers', meta_xpath + '//a[last()]/text()')

    url_parts = response.url.split('/')
    loader.add_value('question_id', url_parts[-1])

    # Debug output of the collected tags.
    print(loader.get_output_value('question_tags'))
    return loader.load_item()
def get_answer(self, selector, response):
    """Populate a ``LazyTweetAnswer`` item from one answer node.

    Args:
        selector: selector positioned on the answer markup; the relative
            (``.//``) expressions are evaluated against it.
        response: page response; the last ``/``-separated segment of
            ``response.url`` is stored as ``question_id``.

    Returns:
        The item returned by the loader's ``load_item()``.
    """
    answer_loader = XPathItemLoader(item=LazyTweetAnswer(), selector=selector)
    answer_loader.add_value('question_id', response.url.split('/')[-1])
    answer_loader.add_value(
        'answerer',
        self.get_user(selector.select('.//span[@class="answer-meta"]')))
    answer_loader.add_xpath('answer_content', ''.join([
        './/span[@class="answer-body"]',
        '//span[@class="answer-status"]//descendant-or-self::text()',
    ]))

    # Debug output of the extracted text. Deliberately no stdin pause here:
    # a blocking input() call would stall the whole crawl on every answer
    # and, under Python 2, would evaluate whatever is typed.
    print(answer_loader.get_output_value('answer_content'))
    return answer_loader.load_item()
def process_item(self, task_id):
    """Parse one scraped full report into an ``NrcParsedReport`` item.

    Generator: yields the loaded item, then calls
    ``self.item_completed(task_id)`` once the consumer resumes it. When
    ``self.db.loadScrapedFullReport(task_id)`` returns ``None`` nothing is
    yielded.

    Args:
        task_id: identifier passed to the DB lookup and stored as
            ``reportnum``.
    """
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return

    # Clamp every character whose ordinal exceeds 127 to chr(127) so the
    # body is plain ASCII before it is encoded.
    text = "".join(chr(min(ord(c), 127)) for c in report["full_report_body"])

    # must have utf-8 here
    response = TextResponse(url=report["full_report_url"],
                            body=text.encode("utf-8"))
    loader = XPathItemLoader(NrcParsedReport(), response=response)
    loader.add_value("reportnum", task_id)

    # Each entry is indexed as (field name, regex); the regex is applied to
    # the whole report text.
    for entry in self.compile_patterns():
        loader.add_value(entry[0], text, TakeFirst(), unicode.strip,
                         re=entry[1])

    county = loader.get_output_value("county")
    area_pattern = self.get_area_code_pattern(county)
    if area_pattern:
        loader.add_value("areaid", county)
        # Area-specific block number is added first, ahead of the generic
        # BLOCK match below.
        area_block_re = "%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % area_pattern
        loader.add_value("blockid", text, TakeFirst(), unicode.strip,
                         re=area_block_re)
    loader.add_value("blockid", text, TakeFirst(), unicode.strip,
                     re="BLOCK[\s]+([\d]+)")

    yield loader.load_item()
    self.item_completed(task_id)
def get_answer(self, selector, response):
    """Load the answer fields found under ``selector`` into an item.

    Args:
        selector: node the relative XPath expressions are evaluated against.
        response: page response; ``question_id`` is the text after the
            final ``/`` of ``response.url``.

    Returns:
        The ``LazyTweetAnswer`` item built by ``load_item()``.
    """
    loader = XPathItemLoader(item=LazyTweetAnswer(), selector=selector)
    loader.add_value('question_id', response.url.split('/')[-1])

    # The answerer is parsed from the answer-meta span by the sibling helper.
    answerer = self.get_user(selector.select('.//span[@class="answer-meta"]'))
    loader.add_value('answerer', answerer)

    loader.add_xpath(
        'answer_content',
        './/span[@class="answer-body"]'
        + '//span[@class="answer-status"]//descendant-or-self::text()')

    # Debug output only. No input() pause follows it: waiting on stdin
    # inside a callback blocks the crawl, and Python 2's input() evaluates
    # the typed text as an expression.
    print(loader.get_output_value('answer_content'))
    return loader.load_item()
def get_answer(self, selector, question_loader):
    """Populate a ``YahooAnswer`` item from one answer node.

    Args:
        selector: selector positioned on the answer element; its ``id`` and
            ``class`` attributes and descendants are read.
        question_loader: loader of the owning question; its ``question_id``
            output is copied onto the answer.

    Returns:
        The item returned by the loader's ``load_item()``.
    """
    answer_loader = XPathItemLoader(item=YahooAnswer(), selector=selector)
    answer_loader.add_xpath('answer_id', './@id')
    answer_loader.add_xpath(
        'answer_content',
        './/div[@class="qa-container"]//div[@class="content"]//text()')
    answer_loader.add_value('answerer', self.get_user(selector))
    answer_loader.add_value(
        'question_id', question_loader.get_output_value('question_id'))
    answer_loader.add_xpath('answering_date', ''.join([
        './/div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title',
    ]))

    # The rating text is collected from the span itself and from a nested
    # <strong>, both into the same 'marks' field.
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]/text()',
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]//strong/text()',
    ]))

    # NOTE(review): assumes the 'marks' output processor yields a single
    # string (or nothing) - confirm against the item definition.
    marks = answer_loader.get_output_value('marks')
    # Membership test instead of str.find(): find() returns -1 (truthy) when
    # 'good' is absent and 0 (falsy) when it is at the start.
    if marks and 'good' in marks:
        answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])

    # An element without a class attribute is treated as "not best" rather
    # than raising IndexError.
    class_values = selector.select('./@class').extract()
    is_best = bool(class_values) and 'best' in class_values[0]
    answer_loader.add_value('is_best_answer', 1 if is_best else 0)
    return answer_loader.load_item()
def process_item(self, task_id):
    """Turn the stored full report for ``task_id`` into a parsed item.

    This is a generator. If the database has no report for ``task_id`` it
    finishes without yielding; otherwise it yields one ``NrcParsedReport``
    item and, when resumed, reports completion through
    ``self.item_completed(task_id)``.
    """
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return

    # Any character above code point 127 is replaced by chr(127), leaving
    # an ASCII-only body.
    ascii_chars = [chr(min(ord(ch), 127)) for ch in report['full_report_body']]
    text = "".join(ascii_chars)

    # must have utf-8 here
    body = text.encode('utf-8')
    parsed = XPathItemLoader(
        NrcParsedReport(),
        response=TextResponse(url=report['full_report_url'], body=body))
    parsed.add_value('reportnum', task_id)

    def capture(field_name, regex):
        # Add the first stripped regex capture from the report text.
        parsed.add_value(field_name, text, TakeFirst(), unicode.strip,
                         re=regex)

    for spec in self.compile_patterns():
        capture(spec[0], spec[1])

    county = parsed.get_output_value('county')
    pattern = self.get_area_code_pattern(county)
    if pattern:
        parsed.add_value('areaid', county)
        # Area-qualified block number goes in before the generic one.
        capture('blockid', "%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
    capture('blockid', "BLOCK[\s]+([\d]+)")

    item = parsed.load_item()
    yield item
    self.item_completed(task_id)
def parse(self, response):
    """Yield the question, its asker, and each named answer with its author.

    The question id is the integer after the last ``/`` of ``response.url``.
    Answers whose author name cannot be extracted (anonymous users) are
    skipped entirely: neither the answer nor a user item is yielded.

    Yields, in order: the ``ZhiHuQ`` item, the asker's ``ZhiHuU`` item, then
    for each kept answer a ``ZhiHuA`` item followed by its ``ZhiHuU`` item.
    """
    sel = Selector(response)
    answers_xpath = '//div[@id="zh-question-answer-wrap"]/div[contains(@class, "zm-item-answer")]'
    asker_xpath = '//div[contains(@class, "zh-question-followers-sidebar")]//a[contains(@class, "zm-item-link-avatar")]'
    author_xpath = './/div[contains(@class, "zm-item-answer-author-info")]/h3//a[2]'

    # --- question -------------------------------------------------------
    question_id = int(response.url.split('/')[-1])
    q_loader = XPathItemLoader(item=ZhiHuQ(), selector=sel)
    q_loader.add_xpath('title', '//div[@id="zh-question-title"]/h2/text()')
    q_loader.add_xpath('content', '//div[@id="zh-question-detail"]//text()')
    q_loader.add_value('id', question_id)

    # --- asker: first avatar link in the followers sidebar --------------
    asker_loader = XPathItemLoader(item=ZhiHuU(), selector=sel)
    asker_loader.add_xpath('name', asker_xpath + '[1]/@title')
    asker_loader.add_xpath('url', asker_xpath + '[1]/@href')
    asker_name = asker_loader.get_output_value('name')
    asker_loader.add_value('id', generate_uid(asker_name))

    # Attach the asker to the question before the question item is built.
    q_loader.add_value('user', asker_loader.load_item())
    yield q_loader.load_item()
    yield asker_loader.load_item()

    # --- answers --------------------------------------------------------
    for ans_selector in sel.xpath(answers_xpath):
        answer_loader = XPathItemLoader(item=ZhiHuA(), selector=ans_selector)
        answer_loader.add_xpath('id', './@data-aid')
        answer_loader.add_value('qid', q_loader.get_output_value('id'))
        answer_loader.add_xpath(
            'content',
            './/div[contains(@class, "zm-item-rich-text")]//text()')
        answer_loader.add_xpath(
            'score',
            './/div[contains(@class, "zm-item-vote")]/a[contains(@class, "zm-item-vote-count")]/@data-votecount')

        user_loader = XPathItemLoader(item=ZhiHuU(), selector=ans_selector)
        user_loader.add_xpath('name', author_xpath + '/text()')
        user_loader.add_xpath('url', author_xpath + '/@href')

        # Anonymous authors have no name; skip the whole answer.
        author_name = user_loader.get_output_value('name')
        if author_name is None:
            continue

        user_loader.add_value('id', generate_uid(author_name))
        answer_loader.add_value('asr', user_loader.load_item())
        yield answer_loader.load_item()
        yield user_loader.load_item()
def get_user(self, selector, response, label):
    """Populate a ``StackOverflowUser`` item from a user-details block.

    Args:
        selector: node containing a ``div`` whose class includes
            ``user-details``; the link inside it supplies name and URL.
        response: unused; kept so existing callers keep working.
        label: unused; kept so existing callers keep working.

    Returns:
        The item returned by the loader's ``load_item()``.
    """
    details_xpath = './/div[contains(@class, "user-details")]'
    user_loader = XPathItemLoader(item=StackOverflowUser(), selector=selector)
    user_loader.add_xpath('user_name', details_xpath + '/a/text()')
    user_loader.add_xpath('user_link', details_xpath + '/a/@href')

    # Read the link output once and reuse it.
    user_link = user_loader.get_output_value('user_link')
    if user_link:
        # NOTE(review): user_id receives the link value unchanged; confirm
        # whether an id was meant to be parsed out of it.
        user_loader.add_value('user_id', user_link)
    return user_loader.load_item()
def get_user(self, selector):
    """Populate a ``LazyTweetUser`` item from a meta node.

    The username is the text of the first ``<a>`` child of ``selector``;
    the profile URL is ``http://twitter.com/`` followed by that username.

    Args:
        selector: node whose first anchor child holds the username.

    Returns:
        The item returned by the loader's ``load_item()``. ``twitter_url``
        is left unset when no username could be extracted.
    """
    user_loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    user_loader.add_xpath('twitter_username', './a[1]/text()')

    # Only build the URL when a username was found: joining with a missing
    # value raises TypeError, and an empty one yields a bare base URL.
    username = user_loader.get_output_value('twitter_username')
    if username:
        user_loader.add_value(
            'twitter_url', ''.join([r'http://twitter.com/', username]))
    return user_loader.load_item()
def get_user(self, selector):
    """Build a ``LazyTweetUser`` item from the first link under ``selector``.

    Args:
        selector: node whose first ``<a>`` child carries the Twitter
            username as its text.

    Returns:
        The loaded item. ``twitter_username`` comes from the link text;
        ``twitter_url`` is ``http://twitter.com/<username>`` and is only
        added when a username was extracted.
    """
    loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    loader.add_xpath('twitter_username', './a[1]/text()')

    handle = loader.get_output_value('twitter_username')
    if not handle:
        # No anchor text: concatenating would fail on a missing value or
        # store just the site root, so the URL field is left unset.
        return loader.load_item()

    loader.add_value('twitter_url', r'http://twitter.com/' + handle)
    return loader.load_item()
def parse(self, response):
    """Request the price page for every ticket listed in ``response``.

    For each node matched by ``self.tickets_list_xpath`` a loader is filled
    with the event name, location and ticket link, and a request for the
    ticket URL is yielded with that loader in ``meta['loader']`` for
    ``self.parse_price`` to continue.
    """
    selector = HtmlXPathSelector(response)
    field_xpaths = (
        ('eventname', './/span[@class="summary listingEventName"]/text()'),
        ('eventlocation', './/div[@class="divVenue location"]/text()'),
        ('ticketslink', './/a[@class="divEventDetails url"]/@href'),
    )

    for ticket in selector.select(self.tickets_list_xpath):
        loader = XPathItemLoader(ComparatorItem(), selector=ticket)
        # Strip every extracted value and join the pieces on output.
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        for field_name, xpath in field_xpaths:
            loader.add_xpath(field_name, xpath)

        ticket_link = loader.get_output_value("ticketslink")
        print("Here is ticket link \n" + ticket_link)

        # The link is appended to the site root, then resolved against the
        # page URL.
        ticketsURL = urljoin(response.url,
                             "https://www.ticketcity.com/" + ticket_link)
        yield scrapy.Request(ticketsURL,
                             meta={'loader': loader},
                             callback=self.parse_price,
                             dont_filter=True)
def get_user(self, selector):
    """Populate a ``YahooUser`` item from a question or answer node.

    Args:
        selector: node containing the ``span`` with class ``user``.

    Returns:
        The loaded item when a user name was collected, otherwise ``None``.
        ``user_id`` is the ``show=`` value of the profile URL and is only
        set when the URL is present and has the expected form.
    """
    user_loader = XPathItemLoader(item=YahooUser(), selector=selector)
    user_loader.add_xpath(
        'user_name',
        './/span[contains(@class, "user")]//span[contains(@class, "fn")]/text()')
    user_loader.add_xpath(
        'user_url', './/span[@class="user"]//a[@class="url"]/@href')

    # A node without a profile link (or with an unexpected URL) must not
    # crash here; the user_name check below decides whether a user exists.
    user_url = user_loader.get_output_value('user_url')
    profile = None
    if user_url:
        profile = re.match(
            r'http://answers\.yahoo\.com/my/profile\?show=(.*)', user_url)
    if profile is not None:
        user_loader.add_value('user_id', profile.group(1))

    if user_loader.get_collected_values('user_name'):
        return user_loader.load_item()
    return None
def parse(self, response):
    """Request the price page for every production listed in ``response``.

    For each node matched by ``self.tickets_list_xpath`` a loader is filled
    with the event fields below, and a request for the ticket URL is
    yielded with that loader in ``meta['loader']`` for ``self.parse_price``.
    The URL path is built from the ``bandname`` name defined outside this
    function.
    """
    selector = HtmlXPathSelector(response)
    field_xpaths = (
        ('eventname', './/*[@class="productionsEvent"]/text()'),
        ('eventlocation', './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()'),
        ('ticketslink', './/*/a[@class = "btn btn-primary"]/@href'),
        ('eventdate', './/*[@class = "productionsDate"]/text()'),
        ('eventcity', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()'),
        ('eventstate', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()'),
        ('eventtime', './/*[@class = "productionsTime"]/text()'),
    )

    for ticket in selector.select(self.tickets_list_xpath):
        loader = XPathItemLoader(ComparatorItem(), selector=ticket)
        # Strip every extracted value and join the pieces on output.
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        for field_name, xpath in field_xpaths:
            loader.add_xpath(field_name, xpath)

        ticket_link = loader.get_output_value("ticketslink")
        print("Here is ticket link \n" + ticket_link)

        # Relative path "concerts/<band>-tickets/<band>-<link>", resolved
        # against the page URL.
        relative_path = ("concerts/" + bandname + "-tickets/"
                         + bandname + "-" + ticket_link)
        ticketsURL = urljoin(response.url, relative_path)
        yield scrapy.Request(ticketsURL,
                             meta={'loader': loader},
                             callback=self.parse_price,
                             dont_filter=True)
def parse_question_page(self, response):
    """Yield every answer item on a question page, then the question item.

    The question id is the ``qid`` query parameter of
    ``response.request.url``; a URL without it raises ``KeyError``.

    Yields:
        The best answer (when ``self.best_answer_xpath`` matches), each
        answer under ``self.answer_xpath``, and finally the
        ``YahooQuestion`` item.
    """
    hxs = HtmlXPathSelector(response)
    question_loader = XPathItemLoader(item=YahooQuestion(), selector=hxs)

    # question id (from the request URL's query string)
    query = parse_qs(urlparse(response.request.url).query)
    question_loader.add_value('question_id', ''.join(query['qid']))
    # question title
    question_loader.add_xpath(
        'question_title',
        self.question_xpath + '//h1[contains(@class, "subject")]/text()')
    # question content
    question_loader.add_xpath(
        'question_content',
        self.question_xpath + '//div[contains(@class, "content")]/text()')
    # question status
    question_loader.add_xpath(
        'status', self.question_xpath + '//div[@class="hd"]//h2/text()')
    # question url, rebuilt from the loaded id
    question_loader.add_value('question_url', ''.join([
        'http://answers.yahoo.com/question/index?qid=',
        question_loader.get_output_value('question_id'),
    ]))
    # question date
    question_loader.add_xpath('asking_date', ''.join([
        self.question_xpath,
        '//div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title',
    ]))
    # import date: local time at parsing
    question_loader.add_value(
        'import_date',
        time.strftime("%Y-%m-%d %A %X %Z", time.localtime()))
    # asking user
    question_loader.add_value(
        'asker', self.get_user(hxs.select(self.question_xpath)))
    # interesting marks
    question_loader.add_xpath('number_of_interesting_marks', ''.join([
        '//ul[@id="yan-question-tools"]',
        '//li[@id="yan-starthis"]',
        '//span[contains(@class,"star-count")]/text()',
    ]))
    # number of answers
    question_loader.add_xpath('number_of_answers', ''.join([
        self.answer_xpath,
        '/div[@class="hd"]',
        '/h3/text()',
    ]))
    # category of the question item
    question_loader.add_xpath(
        'category', ''.join([self.category_xpath, '//li//a//text()']))

    # Answers are yielded before the question itself. get_answer() copies
    # question_id from question_loader, so the loader is passed along.
    best_answer_selector = hxs.select(self.best_answer_xpath)
    if best_answer_selector:
        yield self.get_answer(best_answer_selector, question_loader)

    answer_nodes = hxs.select(self.answer_xpath).select(
        './/li/div[@class="answer"]')
    for ans_selector in answer_nodes:
        yield self.get_answer(ans_selector, question_loader)

    yield question_loader.load_item()
def parse(self, response):
    """Yield the question (with its answer count), asker, answers and authors.

    The question id is the integer after the last ``/`` of ``response.url``.
    Answers whose author name cannot be extracted (anonymous users) are
    skipped and not counted.

    All answers are parsed before anything is yielded, so the ``num`` field
    (number of kept answers) is in the question loader by the time
    ``load_item()`` builds the question item. The yield order is: question,
    asker, then each answer followed by its author.
    """
    sel = Selector(response)
    answers_xpath = '//div[@id="zh-question-answer-wrap"]/div[contains(@class, "zm-item-answer")]'
    asker_xpath = '//div[contains(@class, "zh-question-followers-sidebar")]//a[contains(@class, "zm-item-link-avatar")]'
    author_xpath = './/div[contains(@class, "zm-item-answer-author-info")]/h3//a[2]'

    # question
    q_id = int(response.url.split('/')[-1])
    q_loader = XPathItemLoader(item=ZhiHuQ(), selector=sel)
    q_loader.add_xpath('title', '//div[@id="zh-question-title"]/h2/text()')
    q_loader.add_xpath('content', '//div[@id="zh-question-detail"]//text()')
    q_loader.add_value('id', q_id)

    # asker information: first avatar link in the followers sidebar
    asker_loader = XPathItemLoader(item=ZhiHuU(), selector=sel)
    asker_loader.add_xpath('name', asker_xpath + '[1]/@title')
    asker_loader.add_xpath('url', asker_xpath + '[1]/@href')
    asker_name = asker_loader.get_output_value('name')
    asker_loader.add_value('id', generate_uid(asker_name))
    print(asker_name)
    asker_item = asker_loader.load_item()

    # add user to question field
    q_loader.add_value('user', asker_item)

    # Parse the answers first, buffering their items, so the count is known
    # before the question item is loaded.
    answer_number = 0
    answer_items = []
    for ans_selector in sel.xpath(answers_xpath):
        answer_loader = XPathItemLoader(item=ZhiHuA(), selector=ans_selector)
        answer_loader.add_xpath('id', './@data-aid')
        answer_loader.add_value('qid', q_loader.get_output_value('id'))
        answer_loader.add_xpath(
            'content',
            './/div[contains(@class, "zm-item-rich-text")]//text()')
        answer_loader.add_xpath(
            'score',
            './/div[contains(@class, "zm-item-vote")]/a[contains(@class, "zm-item-vote-count")]/@data-votecount')

        # answerer info; anonymous users have no name
        user_loader = XPathItemLoader(item=ZhiHuU(), selector=ans_selector)
        user_loader.add_xpath('name', author_xpath + '/text()')
        user_loader.add_xpath('url', author_xpath + '/@href')
        author_name = user_loader.get_output_value('name')
        if author_name is None:
            continue

        answer_number += 1
        user_loader.add_value('id', generate_uid(author_name))
        user_item = user_loader.load_item()
        answer_loader.add_value('asr', user_item)
        answer_items.append(answer_loader.load_item())
        answer_items.append(user_item)

    q_loader.add_value('num', answer_number)
    print(q_loader.get_output_value('num'))

    # yield question and asker, then the buffered answer/author items
    yield q_loader.load_item()
    yield asker_item
    for item in answer_items:
        yield item