def parse(self, response):
    # The index page lays movie links out in two fixed-width table columns.
    movies_1 = response.xpath('//td[@width="313"]').extract()[0]
    movie_links1 = Selector(text=movies_1).xpath('//a/@href').extract()
    movies_2 = response.xpath('//td[@width="252"]').extract()[0]
    movie_links2 = Selector(text=movies_2).xpath('//a/@href').extract()
    movie_links1.extend(movie_links2)
    for link in movie_links1:
        # The links are relative, so prepend the site root before requesting.
        next_link = 'http://www.onthesetofnewyork.com/' + link
        yield Request(next_link, callback=self.other_parse_meth)
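# For context, a minimal sketch of the Scrapy scaffolding this parse()
# assumes. The spider class, its name, start_urls, and the other_parse_meth
# body are assumptions for illustration, not part of the original.
import scrapy
from scrapy import Request, Selector


class MovieLocationSpider(scrapy.Spider):
    name = 'onthesetofnewyork'  # hypothetical name
    start_urls = ['http://www.onthesetofnewyork.com/']

    def other_parse_meth(self, response):
        # Placeholder for the detail-page callback referenced above.
        self.logger.info('Visited %s', response.url)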
def parse(self, response):
    # GET URL
    # questions = Selector(response).xpath('//div[@class="summary"]/h3')
    #
    # for question in questions:
    #     item = QLinkItem()
    #     item['title'] = question.xpath(
    #         'a[@class="question-hyperlink"]/text()').extract()[0]
    #     item['url'] = question.xpath(
    #         'a[@class="question-hyperlink"]/@href').extract()[0]
    #     yield item

    ###### get q info
    # item = QuestionItem()
    # title = Selector(response).xpath('//div[@id="question-header"]/h1/a/text()').extract()[0]
    # url = Selector(response).xpath('//div[@id="question-header"]/h1/a/@href').extract()[0]
    # mainbar = Selector(response).xpath('//div[@id="question"]')
    # votes = Selector(response).xpath('//div[@id="question"]/table/tr/td/div/span/text()').extract()[0]
    # desc = str(Selector(response).xpath('//td[@class="postcell"]/div/div[@class="post-text"]').extract()[0]).strip()
    # tags = Selector(response).xpath('//td[@class="postcell"]/div/div[@class="post-taglist"]/a/text()').extract()
    # start_time = Selector(response).xpath('//td[@class="post-signature owner"]/div/div/span[@class="relativetime"]/@title').extract()[0]
    # update_time = Selector(response).xpath('//div[@class="user-action-time"]/span[@class="relativetime"]/@title').extract()[0]
    # user_id = Selector(response).xpath(
    #     '//td[@class="post-signature owner"]/div/div[@class="user-details"]/a/@href').extract()
    # item['source'] = 'stackoverflow'
    # item['title'] = title
    # item['url'] = url
    # item['votes'] = votes
    # item['desc'] = desc
    # item['tag'] = tags
    # item['user_id'] = user_id
    # item['start_time'] = start_time
    # item['update_time'] = update_time
    # yield item

    ##### get answer info
    # Accepted answers carry an extra CSS class, so collect both kinds.
    answers = Selector(response).xpath('//div[@class="answer"]')
    ac_answer = Selector(response).xpath('//div[@class="answer accepted-answer"]')
    answers.extend(ac_answer)
    url = Selector(response).xpath('//div[@id="question-header"]/h1/a/@href').extract()[0]
    print('Answers found: %d' % len(answers))
    for answer in answers:
        item = AnswerItem()
        votes = answer.xpath('table/tr/td/div/span/text()').extract()[0]
        desc = answer.xpath('table/tr/td[@class="answercell"]/div[@class="post-text"]').extract()[0]
        user_ids = answer.xpath('table/tr/td[@class="answercell"]/table/tr/td[@class="post-signature"]/div/div[@class="user-details"]/a/@href').extract()
        action_time = answer.xpath('table/tr/td[@class="answercell"]/table/tr/td[@class="post-signature"]/div/div[@class="user-action-time"]/span/@title').extract()
        # One timestamp means the answer was never edited; with two, the
        # edit time comes first and the original post time second.
        if len(action_time) == 1:
            start_time = action_time[0]
            update_time = action_time[0]
        elif len(action_time) == 2:
            start_time = action_time[1]
            update_time = action_time[0]
        else:
            # Guard against layouts with no timestamps, which would
            # otherwise leave start_time/update_time undefined.
            start_time = update_time = ''
        # Two user links mean an editor plus the original author.
        if len(user_ids) == 1:
            user_id = user_ids[0]
        elif len(user_ids) == 2:
            user_id = user_ids[1]
        else:
            user_id = ''
        item['votes'] = votes
        item['desc'] = desc
        item['user_id'] = user_id
        item['start_time'] = start_time
        item['update_time'] = update_time
        item['url'] = url
        yield item
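# The loop above fills an AnswerItem with six fixed fields. A minimal item
# definition that would satisfy it, inferred from the assignments above;
# the original items module is not shown, so treat this as a sketch.
import scrapy


class AnswerItem(scrapy.Item):
    votes = scrapy.Field()
    desc = scrapy.Field()
    user_id = scrapy.Field()
    start_time = scrapy.Field()
    update_time = scrapy.Field()
    url = scrapy.Field()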
def parse_link(self, response):
    """Find all resources that coincide with what was specified and
    also find all followable links.

    Arguments:
        response -- returned Request object

    Returns:
        generator -- yields Resource Items followed by followable Requests
    """
    url = response.url
    if url in self.parsed:
        return
    else:
        self.parsed.add(url)
    mimetype = response.headers['Content-Type']
    base_url, base_path = ResourceSpider.get_baseurl(url)

    # Extract links.
    if 'text/javascript' in mimetype:
        # Perhaps other cases as well, but I've seen JS text files be
        # parsed when it contained HTML. Use an empty selection;
        # a bare Selector() raises ValueError.
        resources = Selector(text='')
    else:
        try:
            # Consider including background-image found in [style], however
            # only inline styles can be retrieved. Don't forget to extract
            # 'url(' prefix and ')' suffix when implemented!
            resources = response.css('[href],[src]')
        except AttributeError:
            resources = Selector(text='')

    # We only want to crawl normal stuff. hrefs will be used to filter
    # returned Requests.
    hrefs = resources.css('[href]::attr(href)').extract()
    resources = resources.css('[src]::attr(src)').extract()
    resources.extend(hrefs)

    requests = set()
    for link in resources:
        link = link.strip()
        linkp = urlparse(link)
        if linkp.scheme in ('mailto', 'tel') or link.startswith(
                ('#', 'mailto:', 'tel:')):
            # URLs to ignore.
            continue
        elif not linkp.netloc:
            # Fix a relative URL.
            link = base_url + os.path.join('/', base_path, link)
        elif not linkp.scheme:
            # Fix URL scheme.
            link = 'http://%s' % link

        # Determine if examined this URL before.
        mimetype = size = None
        if link not in self.seen:
            self.seen.add(link)
            # Get URL header information: mimetype, size
            if self.optimize:
                mimetype, encoding = mimetypes.guess_type(link)
            if mimetype is None:
                mimetype, size = self.get_header_info(link)

        if mimetype:
            # Yield Items.
            if any(mt in mimetype for mt in self.mimetypes) and link not in self.found:
                size = ResourceSpider.bytes2human(
                    int(size) if size is not None else size)
                count = len(self.found) + 1
                if count == 1:
                    log.msg('%5s %-16s %-8s %-64s' %
                            ('COUNT', 'MIMETYPE', 'SIZE', 'REFERRER'),
                            level=log.INFO)
                log.msg('%4d: %-16s %-8s %-64s' %
                        (count, mimetype, size, link), level=log.INFO)
                # MIME type format example: 'text/html; charset=utf-8'
                self.found.add(link)
                yield ResourceItem(url=link, mimetype=mimetype,
                                   size=size, referrer=url)

        # Build Requests.
        if self.follow and any(href.strip() in link for href in hrefs):
            requests.add(link)

    # Yield Requests after having yielded Items.
    for link in requests:
        if link not in self.requested and self.isallowed(link):
            self.requested.add(link)
            yield Request(link, callback=self.parse_link)
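# parse_link() relies on helpers that are not shown (get_baseurl, bytes2human,
# get_header_info, isallowed). A sketch of the two static ones, assuming
# get_baseurl splits a URL into scheme://host plus the directory path and
# bytes2human renders a byte count as a short human-readable string; the
# real implementations may differ.
import os
from urllib.parse import urlparse  # Python 2: from urlparse import urlparse


class ResourceSpiderHelpers(object):
    @staticmethod
    def get_baseurl(url):
        # e.g. ('http://example.com', '/dir/of') for http://example.com/dir/of/page
        parts = urlparse(url)
        return '%s://%s' % (parts.scheme, parts.netloc), os.path.dirname(parts.path)

    @staticmethod
    def bytes2human(size):
        # None passes through so missing sizes stay blank in the log.
        if size is None:
            return None
        for unit in ('B', 'K', 'M', 'G', 'T'):
            if size < 1024:
                return '%.1f%s' % (size, unit)
            size /= 1024.0
        return '%.1fP' % size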