def handler(doc):
    """Fetch metadata for a YouTube link and emit it as a schema.

    A video id is extracted from ``doc['query']`` or ``doc['path']`` for
    "youtube.com" links, or from ``doc['path']`` for "youtu.be" links.  When
    an id is found, the GData JSON feed for that video is requested and its
    "entry" object is emitted as 'rd.attach.link.youtubed', with 'ref_link'
    set to ``doc['url']``.

    Docs for other domains, or whose URL carries no video id, are ignored.
    HTTP errors are logged (404 at debug level, anything else as an error)
    instead of being raised.  Always returns None.
    """
    video_id = None
    if doc['domain'] == "youtube.com":
        match = (youtube_query_regex.search(doc['query'])
                 or youtube_path_regex.search(doc['path']))
        if match and match.group(1):
            video_id = match.group(1)
    elif doc['domain'] == "youtu.be":
        match = youtube_short_path_regex.search(doc['path'])
        if match and match.group(1):
            video_id = match.group(1)

    if video_id is None:
        return

    link = doc['url']
    logger.debug("working on youtube video http://www.youtube.com/watch?v=%s ", video_id)
    gdata_api = "http://gdata.youtube.com/feeds/api/videos/%s?v=2&alt=json" % video_id
    try:
        response = urllib2.build_opener().open(gdata_api)
        try:
            obj = json.load(response)
        finally:
            # Release the connection even when the body is not valid JSON.
            response.close()
    except urllib2.HTTPError as exc:
        if exc.code == 404:
            logger.debug("404 at video: http://www.youtube.com/watch?v=%s",
                         video_id)
        else:
            logger.error("Failed to obtain youtube info: %s", exc)
        return

    entry = obj.get("entry")
    if entry is None:
        # Nothing to attach; avoid a TypeError on the assignment below.
        logger.error("No entry in youtube response for video %s", video_id)
        return
    entry['ref_link'] = link
    emit_schema('rd.attach.link.youtubed', entry)
Example #2
0
    def view(self, viewname, **kw):
        """Run a view query and return the decoded JSON response.

        ``viewname`` is split on '/'; the first part names the design
        document and the second the view.  Remaining keyword arguments are
        passed to ``self.encodeOptions`` to build the query string, except:

        * ``keys`` (when not None) is sent as a JSON body in a POST request
          instead of being encoded into the URL;
        * ``stale`` defaults to 'ok'.  Passing ``stale=None`` removes the
          option entirely; any value other than None or 'ok' fails the
          assertion.

        Returns None when the response status is 404.  Otherwise
        ``self.maybeThrowError`` is given the response before it is decoded.
        """
        options = dict(kw)

        # The javascript impl keeps 'keys' apart from the other params;
        # callers of this method don't have to care.
        keys = options.get('keys')
        if keys is not None:
            options.pop('keys')

        # 'ok' is the only meaningful value for stale, so an explicit None
        # is taken to mean "stale results are *not* ok".
        if 'stale' not in options:
            options['stale'] = 'ok'
        else:
            stale = options['stale']
            assert stale in (None, 'ok'), stale # only ok and None are allowed!
            if stale is None:
                del options['stale']

        parts = viewname.split('/')
        design_uri = self.uri + "_design/" + parts[0]
        view_uri = design_uri + "/_view/" + parts[1] + self.encodeOptions(options)

        if keys is not None:
            resp = self.request("POST", view_uri,
                                headers={"Content-Type": "application/json"},
                                body=json.dumps({'keys': keys}))
        else:
            resp = self.request("GET", view_uri)

        if resp.status == 404:
            return None
        self.maybeThrowError(resp)
        return json.load(resp)
Example #3
0
 def allDocs(self, keys, **options):
     """POST ``keys`` to the "_all_docs" resource and return the decoded JSON.

     ``options`` are turned into the query string by ``self.encodeOptions``.
     ``keys`` must be non-empty (asserted).  ``self.maybeThrowError`` is
     given the response before the body is decoded.
     """
     assert keys, "don't call me if you don't want any docs!"
     query = self.encodeOptions(options)
     target = self.uri + "_all_docs" + query
     json_headers = {"Content-Type": "application/json"}
     payload = json.dumps({'keys': keys})
     resp = self.request("POST", target, headers=json_headers, body=payload)
     self.maybeThrowError(resp)
     return json.load(resp)
def handler(doc):
    """Emit an 'rd.attach.link.img' schema describing a Flickr photo link.

    The photo id is taken from ``doc['path']``: directly for "flickr.com"
    links, or via ``base58decode`` for "flic.kr" short links.  Docs for any
    other domain, or whose path yields no id, are ignored.

    The photo is looked up with the flickr.photos.getInfo REST method; when
    the reply's 'stat' is "ok" a schema holding image/thumbnail URLs, title,
    owner details, description, location and the raw photo record is
    emitted, with 'ref_link' set to ``doc['url']``.  Always returns None.
    """
    domain = doc['domain']
    photo_id = None
    if domain == "flickr.com":
        found = flickr_photo_regex.search(doc['path'])
        if found and found.group(1):
            photo_id = found.group(1)
    elif domain == "flic.kr":
        found = flickr_canonical_photo_regex.search(doc['path'])
        if found and found.group(1):
            photo_id = base58decode(found.group(1))

    if photo_id is None:
        return

    url = doc['url']
    # Response format: http://www.flickr.com/services/api/response.json.html
    options = {
        "method"       : "flickr.photos.getInfo",
        # http://www.flickr.com/services/apps/23470/
        "api_key"      : "f6c619b20c6dbe75f9c940cfdf5c2f44",
        "photo_id"     : photo_id,
        "format"       : "json",
        "nojsoncallback" : "1"
    }
    query = "&".join('%s=%s' % pair for pair in options.items())
    info_api = "http://api.flickr.com/services/rest/?%s" % query

    opener = urllib2.build_opener()
    obj = json.load(opener.open(info_api))
    if obj.get('stat') != "ok":
        return

    photo = obj.get('photo')

    # Both image URLs share everything but the trailing size suffix.
    url_fields = (photo.get('farm'), photo.get('server'),
                  photo.get('id'), photo.get('secret'))
    thumb = FLICKR_PHOTO_URL % (url_fields + ("_s",))
    img = FLICKR_PHOTO_URL % (url_fields + ("",))

    schema = {
        "thumb": thumb,
        "img": img,
        "title": photo.get('title').get('_content'),
        "href": "http://www.flickr.com/%s/%s/" % (
            photo.get('owner').get('nsid'), photo.get('id')),
        "userName": photo.get('owner').get('username'),
        "realName": photo.get('owner').get('realname'),
        "description": photo.get('description').get('_content'),
        "ref_link": url,
    }
    schema['location'] = photo.get('location')
    schema['flickr'] = photo
    emit_schema('rd.attach.link.img', schema)
Example #5
0
def add_schemas(parser, options, args):
    """Add one or more schema documents to the couch.

    ``args`` is a list of file names, each containing one JSON document.
    Every document is passed on its own to the doc model's
    ``create_schema_items`` and the resulting id and revision are printed.

    Problems are reported through ``parser.error``: no file names given, a
    file that cannot be opened or read, or a file holding invalid JSON.
    ``options`` is not used here.
    """
    if not args:
        parser.error("You must supply filenames containing json for the docs")
    dm = model.get_doc_model()
    for arg in args:
        try:
            with open(arg) as f:
                try:
                    vals = json.load(f)
                except ValueError as why:
                    parser.error("file %r has invalid json: %s" % (arg, why))
                    # Only reached if parser.error returns instead of
                    # exiting; never save a missing or stale document.
                    continue
        except IOError:
            parser.error("Failed to open json document %r" % arg)
            continue

        got = dm.create_schema_items([vals])
        # Single parenthesised argument: same output with the print
        # statement and the print function.
        print("Saved doc id %(id)r at rev %(rev)s" % got[0])
Example #6
0
 def gen_corpus_schema_items(self, corpus_name, item_spec="*"):
     """Build schema items from the files of a named corpus directory.

     Files matching "<corpus_dir>/<item_spec>.*" are grouped by base name
     (the file name up to its *first* dot).  For each base name:

     * names ending in 'README' or 'raindrop' are skipped;
     * "<base>.json" is loaded and handed to
       ``self.doc_model.doc_to_schema_item``; invalid JSON is reported
       through ``self.fail``;
     * otherwise "<base>.rfc822.txt" is handed to
       ``self.rfc822_to_schema_item``.

     NOTE(review): this definition looks truncated - the schema item built
     for each base name is not used, returned or yielded in the code shown
     here, and ``cwd``/``num`` are assigned but never read.
     """
     cwd = os.getcwd()
     corpus_dir = self.get_corpus_dir(corpus_name)
     num = 0
     # We try and make life simple for people by auto-determining the
     # 'schema' for some well-known file types (eg, .rfc822.txt)
     pattern = "%s/%s.*" % (corpus_dir, item_spec)
     base_names = set()
     for filename in glob.iglob(pattern):
         path, name = os.path.split(filename)
         try:
             # don't use splitext - we want the *first* dot.  Split only the
             # file name: a dot in a directory component must not truncate
             # the base.
             first, _ = name.split(".", 1)
         except ValueError:
             base = filename
         else:
             base = os.path.join(path, first)
         base_names.add(base)
     for basename in base_names:
         if basename.endswith('README') or basename.endswith('raindrop'):
             continue
         # .json files get first go - they may 'override' what we would
         # otherwise deduce.
         elif os.path.exists(basename + ".json"):
             filename = basename + ".json"
             with open(filename) as f:
                 try:
                     ob = json.load(f)
                 except ValueError as why:
                     self.fail("%r has invalid json: %r" % (filename, why))
                 # XXX - the below is probably broken but none of our
                 # JSON files provide them
                 assert '_attachments' not in ob, "please revisit this code!"
                 for name, data in ob.get('_attachments', {}).iteritems():
                     fname = os.path.join(corpus_dir, data['filename'])
                     with open(fname, 'rb') as attach_f:
                         data['data'] = attach_f.read()
             si = self.doc_model.doc_to_schema_item(ob)
         elif os.path.exists(basename + ".rfc822.txt"):
             # plain rfc822.txt file.
             with open(basename + ".rfc822.txt", 'rb') as f:
                 si = self.rfc822_to_schema_item(f)
Example #7
0
def get_api_handler(options, req):
    """Locate, load and initialise the API implementation for a request.

    ``req['path']`` must have exactly five components:
    db_name/external_name/app_name/class_name/method_name.  The first four
    form the key into the ``_handlers`` cache; a cached entry is returned
    immediately.

    On a cache miss the 'api_endpoints' view of the database is queried on
    the couch at ``options.couchdb_host``/``options.couchdb_port`` for the
    single document implementing [app_name, class_name], and that
    document's 'code' is executed in a copy of ``api_globals``.

    Raises APILoadError for a malformed path, a failed or ambiguous view
    query, a document that is not python code, or code that raises while
    being executed.

    NOTE(review): this definition looks truncated - after executing the
    code nothing is stored in ``_handlers`` or returned in the code shown
    here.
    NOTE(review): several APILoadError calls pass a format string plus
    separate arguments; that is kept as-is on the assumption that
    APILoadError does the formatting - confirm against its definition.
    """
    # path format is "db_name/external_name/app_name/class_name/method_name
    if len(req.get('path', [])) != 5:
        raise APILoadError("invalid api request format")
    dbname = req['path'][0]
    cache_key = tuple(req['path'][:4])
    try:
        return _handlers[cache_key]
    except KeyError:
        # first request for this handler
        pass

    # Load the schemas which declare they implement this end-point
    apiid = req['path'][2:4] # the 'app' name and the 'class' name.
    path = "/%s/_design/raindrop!content!all/_view/api_endpoints" % dbname
    req_options = {'key': json.dumps(apiid), 'include_docs': 'true'}
    uri = path + "?" + urlencode(req_options)

    c = httplib.HTTPConnection(options.couchdb_host, options.couchdb_port)
    try:
        c.request("GET", uri)
        resp = c.getresponse()
        if resp.status != 200:
            raise APILoadError("api query failure (%s: %s) to %s:%s", resp.status,
                               resp.reason, options.couchdb_host, options.couchdb_port)
        result = json.load(resp)
    finally:
        # Always release the socket, including on the error paths above.
        c.close()
    rows = result['rows']
    if not rows:
        raise APILoadError("No such API end-point %s", apiid)
    if len(rows) != 1: # should only be one doc with this criteria!
        raise APILoadError("too many docs say they implement this api!")
    doc = rows[0]['doc']
    if doc.get('content_type') != 'application/x-python' or not doc.get('code'):
        # .get() here too: a doc without 'content_type' must produce the
        # APILoadError, not a KeyError while building its message.
        raise APILoadError("document is not a python implemented API (%s)",
                           doc.get('content_type'))

    # Now dynamically compile the code we loaded.
    # SECURITY: this executes code stored in the database; anyone able to
    # write such a document can run arbitrary python in this process.
    globs = api_globals.copy()
    try:
        exec(doc['code'], globs)
    except Exception as exc:
        raise APILoadError("Failed to initialize api: %s", exc)
Example #8
0
def handler(doc):
    """Expand a shortened link and emit 'rd.attach.link.expanded'.

    ``doc['domain']`` selects an entry in ``SERVICES``.  That entry's
    "regex" is searched against the doc property named by its "link_prop"
    to extract the short hash.  Docs for unknown domains, or without a
    hash, are ignored.

    The service's "api" is then queried with its configured "options" plus
    the hash option produced by "hash_option_function".  When the reply's
    'errorCode' is 0, the record found under the hash in 'results' is
    mapped through the callables in the service's "schema" entry and
    emitted.  Always returns None.
    """
    link = doc
    service = SERVICES.get(link["domain"], None)
    if service is None:
        return

    prop = service.get("link_prop")
    match = service.get("regex").search(link[prop])
    if not (match and match.group(1)):
        return
    short_hash = match.group(1)

    # Work on a copy: the configured options live in the shared SERVICES
    # table and must not accumulate per-request values.
    options = dict(service.get("options"))
    options[service.get("hash_option_name")] = service.get("hash_option_function")(link["url"], short_hash)

    api = "%s?%s" % (service.get("api"), "&".join(["%s=%s" % v for v in options.items()]))

    response = urllib2.build_opener().open(api)
    try:
        obj = json.load(response)
    finally:
        response.close()

    if obj.get("errorCode") != 0:
        return

    shorty = obj.get("results").get(short_hash)
    ss = service.get("schema")
    # XXX not all of these items are actually used, we could trim down
    # the size of these documents if space needed to be saved but for
    # now it's nice to have the extra data in case we want it later
    schema = {
        "short_url": ss.get("short_url")(link, shorty),
        "long_url": ss.get("long_url")(link, shorty),
        "title": ss.get("title")(link, shorty),
        "thumbnail": ss.get("thumbnail")(link, shorty),
        "user_name": ss.get("user_name")(link, shorty),
        "display_name": ss.get("display_name")(link, shorty),
        "user_url": ss.get("user_url")(link, shorty),
        "description": ss.get("description")(link, shorty),
        "extra": shorty,
        "domain": link.get("domain"),
        "ref_link": link["url"],
    }
    emit_schema("rd.attach.link.expanded", schema)
def handler(doc):
    """Emit an 'rd.attach.link.vimeo' schema for a Vimeo video link.

    The video id is extracted from ``doc['url']`` with
    ``vimeo_video_regex``; docs without one are ignored.  The video's record
    is fetched from the Vimeo simple API and emitted with 'ref_link' set to
    ``doc['url']``.  Always returns None.
    """
    found = vimeo_video_regex.search(doc['url'])
    if not (found and found.group(1)):
        return
    video_id = found.group(1)

    # http://vimeo.com/api/docs/simple-api
    info_api = "http://vimeo.com/api/v2/video/%s.json" % video_id

    fetcher = urllib2.build_opener()
    # They don't like the urllib user-agent!
    fetcher.addheaders = [('User-agent', 'Mozilla/5.0')]
    response = fetcher.open(info_api)
    videos = json.load(response)
    fetcher.close()

    # Vimeo always returns a list and we only asked for one video
    schema = videos.pop()
    schema['ref_link'] = doc['url']
    emit_schema('rd.attach.link.vimeo', schema)
def handler(link):
    """Emit an 'rd.attach.link.foursquare' schema for a Foursquare venue link.

    For "4sq.com" links whose path matches ``foursq_path_regex`` the short
    URL is opened and the venue id is taken from the path of the URL it
    resolves to; an HTTP error while doing so is logged and the link is
    ignored.  For "foursquare.com" links the venue id is taken from
    ``link['path']`` directly.  Links without a venue id are ignored.

    The venue is then looked up through the v1 venue API and, when the
    reply is non-empty, emitted with 'ref_link' set to ``link['url']``.
    Always returns None.
    """
    foursq = None
    if link['domain'] == "4sq.com" and foursq_path_regex.search(link['path']):
        try:
            redir = urllib2.build_opener().open(link['url'])
            try:
                # Only the final URL matters; the body is never read.
                final_url = redir.url
            finally:
                redir.close()
            path = urllib2.urlparse.urlparse(final_url).path
            match = foursquare_venue_path_regex.search(path)
            if match and match.group(1):
                foursq = match.group(1)
        except urllib2.HTTPError as e:
            logger.error("link: %s error: %s",link['url'], e)
    elif link['domain'] == "foursquare.com":
        match = foursquare_venue_path_regex.search(link['path'])
        if match and match.group(1):
            foursq = match.group(1)

    if foursq is None:
        return

    options = {
        'vid'    : foursq
    }

    info_api = "http://api.foursquare.com/v1/venue.json?%s" % "&".join(['%s=%s' % v for v in options.items()])
    response = urllib2.build_opener().open(info_api)
    try:
        obj = json.load(response)
    finally:
        response.close()
    if obj:
        obj["ref_link"] = link['url']
        emit_schema('rd.attach.link.foursquare', obj)