def downloadFreshDocuments ( ):
    """
    Based on the media items in data/item directory, download 10 webpages
    with a query from the first line in each item file. Keep querying until
    10 pages have been successfullly download. Sleep for 60 seconds before
    requesting url results from the metaSearch engine. This throttling was
    successful with Google's ajax api on 2/10/15.

    Once pages are downloaded, parse, stem, and store them with the
    storePageAndHeader function.
    """

    for query, itemName, itemType in mediaItems( 'data/item' ):
        count = 10 - len(db.lookupUrlsForItem( itemName, itemType ))
        start = 0
        while ( count > 0 ):
            time.sleep( 60 )
            urlList, start = metaSearch.executeQuery( db, query, count, start )

            for url in urlList:
                webPage, headers = spider.fetch( url )

                if ( webPage ):
                    storePageAndHeader( url, webPage, headers, itemName, itemType )

                    count -= 1
                    if ( count <= 0 ):
                        break
Example No. 2
def jd_save2db(data, opt):
    """Fetch the JD.com price for the given SKU, attach it to the data dict, and save it."""
    skus = data['skuid']
    url = 'http://p.3.cn/prices/mgets?skuIds=J_%s&type=1' % skus  # (',J_'.join(skus))
    o = {'referer': opt.url}
    resp = spider.fetch(url, o)
    jscode = resp.text  # e.g. [{'p': '769.00', 'm': '859.00', 'id': 'J_954086'}]
    obj = Global()
    info = None
    # Evaluate the response in a V8 context and convert it to Python objects.
    with PyV8.JSContext(obj) as ctx:
        c = ctx.eval(jscode)
        info = PyV8.convert(c)
    data['price'] = info[0]['p']
    data['old_price'] = info[0]['m']
    data['title'] = data['name'].decode('utf8')
    save2db(data, opt)
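The inline comment shows that the mgets endpoint already returns a JSON array, so the PyV8 evaluation above could likely be replaced with a plain json.loads call, much as Example No. 8 below does for the related prices/get endpoint. A minimal sketch under that assumption (the helper name is illustrative, and spider.fetch is assumed to return a response with a .text attribute):

import json

def jd_prices_from_json(resp_text):
    # Expected shape: [{"p": "769.00", "m": "859.00", "id": "J_954086"}]
    info = json.loads(resp_text)
    return info[0]['p'], info[0]['m']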
Example No. 3
 def cache(self):
     """Fetches the photo via HTTP and caches it locally."""
     try:
         self.logger.info("Fetching and caching photo '%s'" % self.title)
         content_types = [
             'image/jpeg',
             'image/gif',
             'image/png'
         ]
         resp, content = spider.fetch(self.photo_url, valid_content_types=content_types)
         self.logger.debug('HTTP Status: %s' % str(resp.status))
         if resp.status == 200:
             self.logger.debug('Saving photo to cache')
             # Write in binary mode, since the content is image data.
             f = open(self._get_cached_original_fn(), 'wb')
             f.write(content)
             f.close()
     except Exception:
         self.logger.exception("Problem caching photo!")
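The valid_content_types argument suggests that spider.fetch rejects responses whose Content-Type is not in the given list. A minimal sketch of that kind of guard, assuming an httplib2-style client (which matches the (response, content) pair and resp.status used above); this is an illustration, not the real spider.fetch:

import httplib2

def fetch_checked(url, valid_content_types=None):
    # Hypothetical stand-in for spider.fetch with a Content-Type guard.
    h = httplib2.Http()
    resp, content = h.request(url, "GET")
    content_type = resp.get('content-type', '').split(';')[0].strip()
    if valid_content_types and content_type not in valid_content_types:
        raise ValueError("Unexpected Content-Type: %s" % content_type)
    return resp, content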
Example No. 4
from spider import fetch

if __name__ == '__main__':
    fetch()
Example No. 5
 def parse(self):
     """Fetches Tumblr API data and parses it."""
     self.logger.info("Fetching API data at '%s'" % self.api_url)
     self.http_response, self.http_content = spider.fetch(self.api_url)
     self.logger.info("Parsing API data for entries...")
     t = tumblr.parse(self.api_url)
     for post in t.posts:
         try:
             if post.type == 'regular':
                 self.logger.info("Tumblr post type: regular")
                 e = Post()
                 e.title = post.title
                 e.summary = post.content
                 e.content = post.content
             elif post.type == 'link':
                 if 'link' in self.excluded_types:
                     self.logger.debug("Skipping Tumblr link")
                     continue
                 else:
                     self.logger.info("Tumblr post type: link")
                     e = Link()
                     e.title = post.title
                     e.summary = post.content
                     e.content = post.content
                     e.url = post.related
                     e.comments = post.url
             elif post.type == 'quote':
                 self.logger.info("Tumblr post type: quote")
                 e = Quote()
                 e.summary = post.content
                 # Chop the smart quotes that Tumblr automatically
                 # adds to a quote
                 e.summary = e.summary.lstrip("&#8220;").rstrip("&#8221;")
                 e.content = e.summary
                 # Get the quote's citation, and, if possible its source
                 e.citation = post.source
                 try:
                     soup = BeautifulSoup(e.citation)
                     e.citation_url = soup.find('a').get('href')
                     e.via = e.citation_url
                 except AttributeError:
                     e.citation_url = None
             elif post.type == 'photo':
                 self.logger.info("Tumblr post type: photo")
                 e = Photo()
                 e.photo_type = 'tumblr'
                 e.title = ''
                 e.summary = post.caption
                 #e.content = e.summary
                 # post.urls is a dictionary of photo URLs keyed by size.
                 # Let's get the big one.
                 e.photo_url = post.urls['500']
                 e.cached_url = config.IMAGES_URL + '/' + e._get_cached_original_shortname()
                 self.logger.debug("Tumblr photo URL: '%s'" % e.photo_url)
                 e.cache()
                 e.set_dimensions()
                 e.set_content()
             # Conversation, Video, and Audio post types aren't 
             # going to be implemented for a while
             elif post.type == 'conversation':
                 # TODO: Support Tumblr conversations
                 self.logger.info("Tumblr post type: conversation")
                 continue
                 #e = Conversation()
             elif post.type == 'video':
                 # TODO: Support Tumblr videos
                 self.logger.info("Tumblr post type: video")
                 continue
                 #e = Video()
             elif post.type == 'audio':
                 # TODO: Support Tumblr audio
                 self.logger.info("Tumblr post type: audio")
                 continue
                 #e = Audio()
             e.source.name = self.name
             e.source.url = self.url
             if e.url == '':
                 e.url = post.url
             e.author = self.owner
             e.date = post.date
             e.date_parsed = parse_date(post.date)
             self.logger.debug("Tumblr post date: %s" % e.date_as_string(e.date_parsed))
             self.logger.info("Entry title: '%s'" % e.title)
             self.logger.debug("Entry URL: '%s'" % e.url)
             self.entries.append(e)
         except AttributeError:
             # FIXME: Why is this exception handler here???
             pass
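One caveat in the quote handling above: str.lstrip and str.rstrip treat their argument as a set of characters rather than a literal prefix or suffix, so stripping "&#8220;" can also remove leading digits or ampersands that belong to the quote itself. A minimal sketch of a stricter helper (the function name is illustrative, not part of the original code):

def strip_smart_quote_entities(text):
    # Remove the exact leading/trailing entity strings instead of a character set.
    if text.startswith("&#8220;"):
        text = text[len("&#8220;"):]
    if text.endswith("&#8221;"):
        text = text[:-len("&#8221;")]
    return text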
Example No. 6
 def parse(self):
     """Fetches the contents of the weblog's feed and parses it.
     Each entry in the feed becomes an Entry object, and each entry 
     attribute is normalized."""
     self.logger.info("Fetching feed '%s'" % self.feed_url)
     self.http_response, self.http_content = spider.fetch(self.feed_url)
     self.logger.info("Parsing feed for entries...")
     feed_data = feedparser.parse(self.feed_url)
     self.id = feed_data.feed.get("id", "")
     self.name = feed_data.feed.get("title", self.name)
     self.generator = feed_data.feed.get("generator", None)
     self.url = feed_data.feed.get("link", self.url)
     self.logger.debug("Weblog URL: '%s'" % self.url)
     self.tagline = feed_data.feed.get("tagline", "")
     self.updated = feed_data.feed.get("updated", None)
     self.updated_parsed = feed_data.feed.get("updated_parsed", None)
     self.rights = feed_data.feed.get("rights", None)
     self.atom = self.is_atom(feed_data.version)
     for entry in feed_data.entries:
         # This method will be inherited by all other feed-based
         # sources; because we assume that the only difference between
         # feeds of type Weblog, Linklog, and Commentlog is the
         # presentation of their entries, instantiating the appropriate
         # entry class here means that we don't have to write new
         # parse() methods for Linklog and Commentlog.
         if self.type == "linklog":
             e = Link()
             # NOTE: The following is a workaround for a feedparser bug.
             # http://code.google.com/p/feedparser/issues/detail?id=129
             if self.is_delicious():
                 self.atom = False
         elif self.type == "commentlog":
             e = Quote()
         else:
             e = Post()
         e.source.name = self.name
         e.source.url = self.url
         e.atom = self.atom
         e.title = entry.get("title", "")
         self.logger.info("Entry title: '%s'" % e.title)
         e.author = entry.get("author", self.owner)
         try:
             e.author_url = entry.author_detail["href"]
         except (AttributeError, KeyError):
             e.author_url = None
         e.summary = entry.get("summary", "")
         # Need to get 'content[x]["value"]', not just 'content',
         # and we prefer something marked "text/html"
         try:
             html_types = ["text/html", "application/xhtml+xml", "application/xml", "text/xml", "text/plain"]
             for content in entry["content"]:
                 if content["type"] in html_types:
                     e.content = content["value"]
                     continue
         except (KeyError, IndexError, AttributeError):
             e.content = e.summary
         # Atom weblog feeds should use 'rel="related"' for
         # the linked page, so we need to make sure we get that link
         # and not the 'alternate' or 'via' link.
         e.url = entry.get("link", None)
         try:
             for link in entry.links:
                 if link["rel"] == "via":
                     e.via = link["href"]
                     break
                 else:
                     e.via = None
             for link in entry.links:
                 if link["rel"] == "related":
                     e.url = link["href"]
                     break
                 else:
                     e.url = entry.link
         except AttributeError:
             # In rare cases, entry.links is not populated
             pass
         e.comments = entry.get("comments", None)
         # 'alternate' represents the linklog entry itself,
         # which is often a comments page
         if self.type == "linklog":
             e.comments = e.get_delicious_url()
         if e.comments is None:
             try:
                 for link in entry.links:
                     if link["rel"] == "alternate":
                         e.comments = link["href"]
             except AttributeError:
                 # In rare cases, entry.links is not populated
                 pass
         # Now, get tags/categories
         try:
             if len(entry.tags) > 0:
                 for tag in entry.tags:
                     e.tags.append(tag.term)
         except AttributeError:
             # No tags! Forget it.
             pass
         # Nix the comments property if it's the same link as the permalink
         if e.url == e.comments:
             e.comments = None
         # Put together the Atom <source> info, if applicable
         if e.atom:
             e.atom_source = AtomSource()
             try:
                 e.atom_source.id = self.id
                 e.atom_source.title = sanitize(self.name)
                 e.atom_source.url = self.url
                 e.atom_source.updated = self.updated
             except AttributeError:
                 self.logger.exception("Trouble getting Atom source for '%s'!" % self.name)
                 e.atom_source = None
         else:
             e.atom_source = None
         # DATES!
         e.date = entry.get("date")
         e.date_parsed = entry.get("date_parsed")
         self.logger.debug("Entry date: %s" % e.date_as_string(e.date_parsed))
         e.published = entry.get("published", e.date)
         e.published_parsed = entry.get("published_parsed", e.date_parsed)
         e.updated = entry.get("updated", e.date)
         e.updated_parsed = entry.get("updated_parsed", e.date_parsed)
         e.created = entry.get("created", e.date)
         e.created_parsed = entry.get("created_parsed", e.date_parsed)
         # Build GUID
         # Use backup_id if feed doesn't provide one already
         backup_id = e.get_tag_uri(e.date_parsed, e.url)
         e.id = entry.get("id", backup_id)
         # Done parsing this entry
         self.entries.append(e)
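Note that the content-selection loop above keeps the last matching part rather than honoring the order of html_types. If a strict preference for text/html is wanted, iterating over the types first provides it. A minimal sketch, assuming the same feedparser structure (entry["content"] is a list of dicts with "type" and "value" keys):

def pick_preferred_content(entry, summary=""):
    # Return the first content part whose MIME type matches, in preference order.
    html_types = ["text/html", "application/xhtml+xml", "application/xml",
                  "text/xml", "text/plain"]
    for preferred in html_types:
        for part in entry.get("content", []):
            if part.get("type") == preferred:
                return part.get("value", summary)
    return summary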
Example No. 7
def test_one():
    import pdb
    pdb.set_trace()
    res = fetch(url)
    doc = fromstring(res.text.decode('utf-8', errors='ignore'))
    parse_category(doc)
Example No. 8
def extract_product_detail(url, doc, res=''):
    '''
        desc:
            Extract product details from the detail page and save them to the database.
    '''
    if not doc.xpath('//div[@class="breadcrumb"]'):
        logger.info('extract_product_detail --- url %s  %s' %
                    (url, u'global-purchase page, not processed'))
        return

    # The first breadcrumb link reading u"首页" ("Home") marks a flash-sale page.
    if doc.xpath('//div[@class="breadcrumb"]//a/text()')[0] == u"首页":
        logger.info('extract_product_detail --- url %s  %s' %
                    (url, u'flash-sale pages are not handled for now'))
        return

    _this_dao = Product.select().where(Product.purl == url).first()
    if _this_dao:
        logger.info('extract_product_detail --- product %s exist' % (url))
        return

    # pid
    pid = re.search('http://item.jd.com/(?P<id>\d*).html',
                    url).groupdict()['id']

    # product brand
    brand = doc.xpath('//*[@id="parameter-brand"]/li/a[1]')[0].text_content()
    # Some detail pages do not contain a brand image, so default to an empty string.
    _brand_img = doc.xpath('//*[@id="extInfo"]/div[1]/a/img/@src')
    if _brand_img:
        brand_img = _brand_img[0]
        brand_img = perfect_href(brand_img)
    else:
        brand_img = ''

    # product img
    imgs = doc.xpath('//div[@class="spec-items"]/ul/li/img/@src')
    fix_img = lambda x: re.sub('/n5/', '/imgzone/', "http:" + x)
    imgs = map(fix_img, imgs)
    img_first = imgs.pop(0)

    # pname
    pname = doc.xpath('//div[@id="product-intro"]//div[@id="itemInfo"]//h1'
                      )[0].text_content()

    # price
    _price_url = "http://p.3.cn/prices/get?skuid=J_{pid}"
    price = None
    _price_res = fetch(_price_url.format(pid=pid))

    if _price_res.status_code == 200:
        price = json.loads(_price_res.text)[0]['p']
    else:
        raise Exception("Failed to parse price")

    # breadcrumbs == category
    _cat_body = parse_category(url, doc, res)
    if not ProductAndCategory.select().where(
            ProductAndCategory.pid == pid).first():
        _cat_body.update({'pid': int(pid)})
        ProductAndCategory.create(**_cat_body)

    data = {
        'pid': pid,
        'purl': url,
        'pname': pname.encode('utf-8'),
        'brand': brand.encode('utf-8'),
        'brand_img': brand_img,
        'product_img': img_first,
        'price': price,
        'extra': json.dumps({'img': imgs})
    }
    try:
        Product.create(**data)
        logger.info('product successfully saved --- url: %s' % (url))
    except IntegrityError:
        logger.info('product save failed, duplicate --- url: %s' % (url))
    except Exception:
        ex = traceback.format_exc()
        logger.error('product save failed with exception --- url: %s\n %s' % (url, ex))
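A portability note on the image handling above: under Python 2, map returns a list, so imgs.pop(0) works, but under Python 3 map returns an iterator and that call would fail. A minimal version-agnostic sketch of the same rewrite (the helper name is illustrative):

import re

def normalize_product_imgs(img_srcs):
    # Rewrite thumbnail paths to the full-size image zone and force an http scheme.
    fixed = [re.sub('/n5/', '/imgzone/', 'http:' + src) for src in img_srcs]
    if not fixed:
        return '', []
    return fixed[0], fixed[1:]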