def downloadFreshDocuments():
    """Based on the media items in the data/item directory, download 10
    webpages using a query taken from the first line of each item file.
    Keep querying until 10 pages have been successfully downloaded.
    Sleep for 60 seconds before requesting URL results from the metaSearch
    engine; this throttling worked with Google's AJAX API on 2/10/15.
    Once pages are downloaded, parse, stem, and store them with the
    storePageAndHeader function.
    """
    for query, itemName, itemType in mediaItems('data/item'):
        count = 10 - len(db.lookupUrlsForItem(itemName, itemType))
        start = 0
        while count > 0:
            time.sleep(60)
            urlList, start = metaSearch.executeQuery(db, query, count, start)
            for url in urlList:
                webPage, headers = spider.fetch(url)
                if webPage:
                    storePageAndHeader(url, webPage, headers, itemName, itemType)
                    count -= 1
                    if count <= 0:
                        break
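mediaItems is referenced above but not shown. A hedged sketch of what such a generator might look like, assuming each item file lives under data/item/<itemType>/<itemName> and holds its search query on the first line (the real helper may derive these fields differently):

import os

def mediaItems(itemDir):
    # Hypothetical sketch: yield (query, itemName, itemType) tuples from
    # per-type subdirectories, reading the query from each file's first line.
    for itemType in os.listdir(itemDir):
        typeDir = os.path.join(itemDir, itemType)
        if not os.path.isdir(typeDir):
            continue
        for itemName in os.listdir(typeDir):
            with open(os.path.join(typeDir, itemName)) as f:
                query = f.readline().strip()
            yield query, itemName, itemType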
def jd_save2db(data, opt):
    """Fetch the JD price for a SKU, evaluate the JS/JSON response with PyV8,
    and persist the enriched record via save2db."""
    skus = data['skuid']
    url = 'http://p.3.cn/prices/mgets?skuIds=J_%s&type=1' % skus  # (',J_'.join(skus))
    o = {'referer': opt.url}
    resp = spider.fetch(url, o)
    jscode = resp.text  # e.g. [{'p': '769.00', 'm': '859.00', 'id': 'J_954086'}]
    obj = Global()
    info = None
    with PyV8.JSContext(obj) as ctx:
        c = ctx.eval(jscode)
        info = PyV8.convert(c)
    print '++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
    data['price'] = info[0]['p']
    data['old_price'] = info[0]['m']
    data['title'] = data['name'].decode('utf8')
    save2db(data, opt)
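The PyV8 round-trip above exists only to evaluate what is essentially a JSON array. Assuming the mgets endpoint returns standard double-quoted JSON (as it typically does), a plain json.loads would be enough; a minimal sketch:

import json

def parse_price_response(text):
    # Hypothetical alternative to the PyV8 evaluation above: treat the body
    # as JSON, e.g. [{"id": "J_954086", "p": "769.00", "m": "859.00"}].
    info = json.loads(text)
    return {'price': info[0]['p'], 'old_price': info[0]['m']}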
def cache(self):
    """Fetches photo via HTTP and caches it."""
    try:
        self.logger.info("Fetching and caching photo '%s'" % self.title)
        content_types = ['image/jpeg', 'image/gif', 'image/png']
        resp, content = spider.fetch(self.photo_url,
                                     valid_content_types=content_types)
        self.logger.debug('HTTP Status: %s' % str(resp.status))
        if resp.status == 200:
            self.logger.debug('Saving photo to cache')
            # Write in binary mode, since the content is image data
            f = open(self._get_cached_original_fn(), 'wb')
            f.write(content)
            f.close()
    except:
        self.logger.exception("Problem caching photo!")
from spider import fetch

if __name__ == '__main__':
    fetch()
def parse(self): """Fetches Tumblr API data and parses it.""" self.logger.info("Fetching API data at '%s'" % self.api_url) self.http_response, self.http_content = spider.fetch(self.api_url) self.logger.info("Parsing API data for entries...") t = tumblr.parse(self.api_url) for post in t.posts: try: if post.type == 'regular': self.logger.info("Tumblr post type: regular") e = Post() e.title = post.title e.summary = post.content e.content = post.content elif post.type == 'link': if 'link' in self.excluded_types: self.logger.debug("Skipping Tumblr link") continue else: self.logger.info("Tumblr post type: link") e = Link() e.title = post.title e.summary = post.content e.content = post.content e.url = post.related e.comments = post.url elif post.type == 'quote': self.logger.info("Tumblr post type: quote") e = Quote() e.summary = post.content # Chop the smart quotes that Tumblr automatically # adds to to a quote e.summary = e.summary.lstrip("“").rstrip("”") e.content = e.summary # Get the quote's citation, and, if possible its source e.citation = post.source try: soup = BeautifulSoup(e.citation) e.citation_url = soup.find('a').get('href') e.via = e.citation_url except AttributeError: e.citation_url = None elif post.type == 'photo': self.logger.info("Tumblr post type: photo") e = Photo() e.photo_type = 'tumblr' e.title = '' e.summary = post.caption #e.content = e.summary # post.urls is a dictionary of photo URLs keyed by size. # Let's get the big one. e.photo_url = post.urls['500'] e.cached_url = config.IMAGES_URL + '/' + e._get_cached_original_shortname() self.logger.debug("Tumblr photo URL: '%s'" % e.photo_url) e.cache() e.set_dimensions() e.set_content() # Conversation, Video, and Audio post types aren't # going to be implemented for a while elif post.type == 'conversation': # TODO: Support Tumblr conversations self.logger.info("Tumblr post type: conversation") continue #e = Conversation() elif post.type == 'video': # TODO: Support Tumblr videos self.logger.info("Tumblr post type: video") continue #e = Video() elif post.type == 'audio': # TODO: Support Tumblr audio self.logger.info("Tumblr post type: audio") continue #e = Audio() e.source.name = self.name e.source.url = self.url if e.url == '': e.url = post.url e.author = self.owner e.date = post.date e.date_parsed = parse_date(post.date) self.logger.debug("Tumblr post date: %s" % e.date_as_string(e.date_parsed)) self.logger.info("Entry title: '%s'" % e.title) self.logger.debug("Entry URL: '%s'" % e.url) self.entries.append(e) except AttributeError: # FIXME: Why is this exception handler here??? pass
def parse(self): """Fetches the contents of the weblog's feed and parses it. Each entry in the feed becomes an Entry object, and each entry attribute is normalized.""" self.logger.info("Fetching feed '%s'" % self.feed_url) self.http_response, self.http_content = spider.fetch(self.feed_url) self.logger.info("Parsing feed for entries...") feed_data = feedparser.parse(self.feed_url) self.id = feed_data.feed.get("id", "") self.name = feed_data.feed.get("title", self.name) self.generator = feed_data.feed.get("generator", None) self.url = feed_data.feed.get("link", self.url) self.logger.debug("Weblog URL: '%s'" % self.url) self.tagline = feed_data.feed.get("tagline", "") self.updated = feed_data.feed.get("updated", None) self.updated_parsed = feed_data.feed.get("updated_parsed", None) self.rights = feed_data.feed.get("rights", None) self.atom = self.is_atom(feed_data.version) for entry in feed_data.entries: # This method will be inherited by all other feed-based # sources; because we assume that the only difference between # feeds of type Weblog, Linklog, and Commentlog is the # presentation of their entries, instantiating the appropriate # entry class here means that we don't have to write new # parse() methods for Linklog and Commentlog. if self.type == "linklog": e = Link() # NOTE: The following is a workaround for a feedparser bug. # http://code.google.com/p/feedparser/issues/detail?id=129 if self.is_delicious(): self.atom = False elif self.type == "commentlog": e = Quote() else: e = Post() e.source.name = self.name e.source.url = self.url e.atom = self.atom e.title = entry.get("title", "") self.logger.info("Entry title: '%s'" % e.title) e.author = entry.get("author", self.owner) try: e.author_url = entry.author_detail["href"] except (AttributeError, KeyError): e.author_url = None e.summary = entry.get("summary", "") # Need to get 'content[x]["value"]', not just 'content', # and we prefer something marked "text/html" try: html_types = ["text/html", "application/xhtml+xml", "application/xml", "text/xml", "text/plain"] for content in entry["content"]: if content["type"] in html_types: e.content = content["value"] continue except (KeyError, IndexError, AttributeError): e.content = e.summary # Atom weblog feeds should used 'rel="related"' for # the linked page, so need to make sure we get that link # and not the 'alternate' or 'via' link. e.url = entry.get("link", None) try: for link in entry.links: if link["rel"] == "via": e.via = link["href"] break else: e.via = None for link in entry.links: if link["rel"] == "related": e.url = link["href"] break else: e.url = entry.link except AttributeError: # In rare cases, entry.links is not populated pass e.comments = entry.get("comments", None) # 'alternate' represents the linklog entry itself, # which is often a comments page if self.type == "linklog": e.comments = e.get_delicious_url() if e.comments is None: try: for link in entry.links: if link["rel"] == "alternate": e.comments = link["href"] except AttributeError: # In rare cases, entry.links is not populated pass # Now, get tags/categories try: if len(entry.tags) > 0: for tag in entry.tags: e.tags.append(tag.term) except AttributeError: # No tags! Forget it. 
pass # Nix the comments property if it's the same link as the permalink if e.url == e.comments: e.comments = None # Put together the Atom <source> info, if applicable if e.atom: e.atom_source = AtomSource() try: e.atom_source.id = self.id e.atom_source.title = sanitize(self.name) e.atom_source.url = self.url e.atom_source.updated = self.updated except AttributeError: self.logger.exception("Trouble getting Atom source for '%s'!" % self.name) e.atom_source = None else: e.atom_source = None # DATES! e.date = entry.get("date") e.date_parsed = entry.get("date_parsed") self.logger.debug("Entry date: %s" % e.date_as_string(e.date_parsed)) e.published = entry.get("published", e.date) e.published_parsed = entry.get("published_parsed", e.date_parsed) e.updated = entry.get("updated", e.date) e.updated_parsed = entry.get("updated_parsed", e.date_parsed) e.created = entry.get("created", e.date) e.created_parsed = entry.get("created_parsed", e.date_parsed) # Build GUID # Use backup_id if feed doesn't provide one already backup_id = e.get_tag_uri(e.date_parsed, e.url) e.id = entry.get("id", backup_id) # Done parsing this entry self.entries.append(e)
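The backup GUID comes from e.get_tag_uri, which is not shown in this snippet. For illustration only, a tag: URI in the RFC 4151 style could be derived roughly like this (a sketch under assumed inputs, not the project's actual implementation):

import time
from urlparse import urlparse  # urllib.parse on Python 3

def get_tag_uri(date_parsed, url):
    # Build a tag: URI such as tag:example.com,2015-02-10:/some/permalink
    # from the entry's host, date (a time.struct_time), and path.
    parts = urlparse(url)
    day = time.strftime('%Y-%m-%d', date_parsed)
    return 'tag:%s,%s:%s' % (parts.netloc, day, parts.path)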
def test_one():
    import pdb
    pdb.set_trace()  # drop into the debugger before fetching
    res = fetch(url)  # `url` and `parse_category` are assumed to be defined at module level
    doc = fromstring(res.text.decode('utf-8', errors='ignore'))
    parse_category(doc)
def extract_product_detail(url, doc, res=''):
    '''Extract product details from a detail page and save them to the database.'''
    # JD Worldwide ("global purchase") pages have no breadcrumb; skip them.
    if not doc.xpath('//div[@class="breadcrumb"]'):
        logger.info('extract_product_detail --- url %s: global-purchase page, skipping' % url)
        return
    # Flash-sale pages (breadcrumb starts at "首页"/home) are not handled for now.
    if doc.xpath('//div[@class="breadcrumb"]//a/text()')[0] == u"首页":
        logger.info('extract_product_detail --- url %s: flash-sale page, skipping for now' % url)
        return
    _this_dao = Product.select().where(Product.purl == url).first()
    if _this_dao:
        logger.info('extract_product_detail --- product %s exists' % url)
        return
    # pid
    pid = re.search('http://item.jd.com/(?P<id>\d*).html', url).groupdict()['id']
    # product brand
    brand = doc.xpath('//*[@id="parameter-brand"]/li/a[1]')[0].text_content()
    # Some detail pages do not contain a brand image, so leave it empty
    _brand_img = doc.xpath('//*[@id="extInfo"]/div[1]/a/img/@src')
    if _brand_img:
        brand_img = _brand_img[0]
        brand_img = perfect_href(brand_img)
    else:
        brand_img = ''
    # product images
    imgs = doc.xpath('//div[@class="spec-items"]/ul/li/img/@src')
    fix_img = lambda x: re.sub('/n5/', '/imgzone/', "http:" + x)
    imgs = map(fix_img, imgs)
    img_first = imgs.pop(0)
    # product name
    pname = doc.xpath('//div[@id="product-intro"]//div[@id="itemInfo"]//h1')[0].text_content()
    # price
    _price_url = "http://p.3.cn/prices/get?skuid=J_{pid}"
    price = None
    _price_res = fetch(_price_url.format(pid=pid))
    if _price_res.status_code == 200:
        price = json.loads(_price_res.text)[0]['p']
    else:
        raise Exception("Could not parse price")
    # breadcrumb == category
    _cat_body = parse_category(url, doc, res)
    if not ProductAndCategory.select().where(ProductAndCategory.pid == pid).first():
        _cat_body.update({'pid': int(pid)})
        ProductAndCategory.create(**_cat_body)
    data = {
        'pid': pid,
        'purl': url,
        'pname': pname.encode('utf-8'),
        'brand': brand.encode('utf-8'),
        'brand_img': brand_img,
        'product_img': img_first,
        'price': price,
        'extra': json.dumps({'img': imgs}),
    }
    try:
        Product.create(**data)
        logger.info('product success save --- url: %s' % url)
    except IntegrityError:
        logger.info('product failed, duplicate --- url: %s' % url)
    except Exception:
        ex = traceback.format_exc()
        logger.error('product failed, exception --- url: %s\n %s' % (url, ex))
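perfect_href is referenced above but not defined in this snippet; a plausible sketch of such a helper, assuming it only needs to normalize protocol-relative or bare image URLs:

def perfect_href(href):
    # Hypothetical helper: make sure an image URL has an http scheme.
    if href.startswith('//'):
        return 'http:' + href
    if href.startswith('http'):
        return href
    return 'http://' + href.lstrip('/')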