def handler(doc):
    video_hash = None
    if doc['domain'] == "youtube.com":
        match = youtube_query_regex.search(doc['query']) or \
                youtube_path_regex.search(doc['path'])
        if match and match.group(1):
            video_hash = match.group(1)
    elif doc['domain'] == "youtu.be":
        match = youtube_short_path_regex.search(doc['path'])
        if match and match.group(1):
            video_hash = match.group(1)
    if video_hash is None:
        return

    link = doc['url']
    logger.debug("working on youtube video http://www.youtube.com/watch?v=%s",
                 video_hash)
    gdata_api = "http://gdata.youtube.com/feeds/api/videos/%s?v=2&alt=json" % video_hash
    try:
        opener = urllib2.build_opener()
        obj = json.load(opener.open(gdata_api))
        opener.close()
        obj = obj.get("entry")
        obj['ref_link'] = link
        emit_schema('rd.attach.link.youtubed', obj)
    except urllib2.HTTPError, exc:
        if exc.code == 404:
            logger.debug("404 at video: http://www.youtube.com/watch?v=%s",
                         video_hash)
        else:
            logger.error("Failed to obtain youtube info: %s", exc)
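# None of the youtube regexes are defined in this excerpt.  A minimal
# sketch of definitions that would satisfy the handler above - treat the
# exact patterns as assumptions; the real module may be stricter:
import re
youtube_query_regex = re.compile(r'(?:^|[?&])v=([\w-]+)')  # ...watch?v=HASH
youtube_path_regex = re.compile(r'^/v/([\w-]+)')           # embed-style /v/HASH
youtube_short_path_regex = re.compile(r'^/([\w-]+)')       # youtu.be/HASH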
def view(self, viewname, **kw):
    # The javascript impl has a painful separation of the 'keys' param
    # from other params - hide that.
    kw = kw.copy()
    keys = kw.get('keys')
    if keys is not None:
        del kw['keys']
    # stale is a little tricky - 'ok' is the only valid option.  So if
    # the user puts 'stale=None' we assume stale is *not* ok!
    if 'stale' in kw:
        stale = kw['stale']
        assert stale in (None, 'ok'), stale  # only ok and None are allowed!
        if stale is None:
            del kw['stale']
        else:
            kw['stale'] = 'ok'
    viewParts = viewname.split('/')
    viewPath = self.uri + "_design/" + viewParts[0] + "/_view/" \
               + viewParts[1] + self.encodeOptions(kw)
    if keys is None:
        resp = self.request("GET", viewPath)
    else:
        resp = self.request("POST", viewPath,
                            headers={"Content-Type": "application/json"},
                            body=json.dumps({'keys': keys}))
    if resp.status == 404:
        return None
    self.maybeThrowError(resp)
    return json.load(resp)
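# A minimal usage sketch.  The view name and keys are illustrative (only
# the 'raindrop!content!all' design-doc name appears elsewhere in this
# code); view() returns None on a 404:
#
#   result = db.view('raindrop!content!all/megaview',
#                    keys=[['rd.core.content', 'schema_id']],
#                    stale='ok')
#   if result is not None:
#       for row in result['rows']:
#           process(row)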
def allDocs(self, keys, **options):
    assert keys, "don't call me if you don't want any docs!"
    uri = self.uri + "_all_docs" + self.encodeOptions(options)
    resp = self.request("POST", uri,
                        headers={"Content-Type": "application/json"},
                        body=json.dumps({'keys': keys}))
    self.maybeThrowError(resp)
    return json.load(resp)
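# Example use - fetch a known set of documents in a single round-trip
# (the ids and the include_docs option are illustrative):
#
#   result = db.allDocs(['doc-id-1', 'doc-id-2'], include_docs='true')
#   docs = [row['doc'] for row in result['rows'] if 'doc' in row]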
def handler(doc):
    photo_id = None
    if doc['domain'] == "flickr.com":
        match = flickr_photo_regex.search(doc['path'])
        if match and match.group(1):
            photo_id = match.group(1)
    elif doc['domain'] == "flic.kr":
        match = flickr_canonical_photo_regex.search(doc['path'])
        if match and match.group(1):
            photo_id = base58decode(match.group(1))
    if photo_id is None:
        return

    url = doc['url']
    # http://www.flickr.com/services/api/response.json.html
    options = {
        "method": "flickr.photos.getInfo",
        # http://www.flickr.com/services/apps/23470/
        "api_key": "f6c619b20c6dbe75f9c940cfdf5c2f44",
        "photo_id": photo_id,
        "format": "json",
        "nojsoncallback": "1",
    }
    info_api = "http://api.flickr.com/services/rest/?%s" % \
               "&".join(['%s=%s' % v for v in options.items()])
    opener = urllib2.build_opener()
    obj = json.load(opener.open(info_api))
    if obj.get('stat') == "ok":
        photo = obj.get('photo')
        thumb = FLICKR_PHOTO_URL % (photo.get('farm'), photo.get('server'),
                                    photo.get('id'), photo.get('secret'), "_s")
        img = FLICKR_PHOTO_URL % (photo.get('farm'), photo.get('server'),
                                  photo.get('id'), photo.get('secret'), "")
        schema = {
            "thumb": thumb,
            "img": img,
            "title": photo.get('title').get('_content'),
            "href": "http://www.flickr.com/photos/%s/%s/" % (photo.get('owner').get('nsid'),
                                                             photo.get('id')),
            "userName": photo.get('owner').get('username'),
            "realName": photo.get('owner').get('realname'),
            "description": photo.get('description').get('_content'),
            "ref_link": url,
        }
        schema['location'] = photo.get('location')
        schema['flickr'] = photo
        emit_schema('rd.attach.link.img', schema)
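# Neither base58decode nor FLICKR_PHOTO_URL is defined in this excerpt.
# Sketches of plausible definitions: flic.kr short urls encode the photo
# id in flickr's base-58 alphabet (no 0, O, I or l), and the photo url
# template follows flickr's documented farm/server/id/secret layout, the
# final %s being a size suffix such as "_s" for a small square.
FLICKR_PHOTO_URL = "http://farm%s.static.flickr.com/%s/%s_%s%s.jpg"

FLICKR_BASE58_ALPHABET = "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ"

def base58decode(s):
    num = 0
    for char in s:
        num = num * 58 + FLICKR_BASE58_ALPHABET.index(char)
    return num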
def add_schemas(parser, options, args):
    """Add one or more schema documents to the couch"""
    if not args:
        parser.error("You must supply filenames containing json for the docs")
    dm = model.get_doc_model()
    for arg in args:
        try:
            with open(arg) as f:
                try:
                    vals = json.load(f)
                except ValueError, why:
                    parser.error("file %r has invalid json: %s" % (arg, why))
        except IOError:
            parser.error("Failed to open json document %r" % arg)
        got = dm.create_schema_items([vals])
        print "Saved doc id %(id)r at rev %(rev)s" % got[0]
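# A file passed to this command holds a single schema document as json.
# A hypothetical example - the field names follow raindrop's rd_* schema
# conventions, but treat the values as illustrative only:
#
#   {
#     "rd_key": ["identity", ["email", "someone@example.com"]],
#     "rd_schema_id": "rd.identity.exists",
#     "rd_source": null
#   }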
def gen_corpus_schema_items(self, corpus_name, item_spec="*"):
    cwd = os.getcwd()
    corpus_dir = self.get_corpus_dir(corpus_name)
    num = 0
    # We try and make life simple for people by auto-determining the
    # 'schema' for some well-known file types (eg, .rfc822.txt)
    pattern = "%s/%s.*" % (corpus_dir, item_spec)
    base_names = set()
    for filename in glob.iglob(pattern):
        try:
            path, name = os.path.split(filename)
            # don't use splitext - we want the *first* dot.
            first, _ = name.split(".", 1)
            base = os.path.join(path, first)
        except ValueError:
            base = filename
        base_names.add(base)
    for basename in base_names:
        if basename.endswith('README') or basename.endswith('raindrop'):
            continue
        # .json files get first go - they may 'override' what we would
        # otherwise deduce.
        elif os.path.exists(basename + ".json"):
            filename = basename + ".json"
            with open(filename) as f:
                try:
                    ob = json.load(f)
                except ValueError, why:
                    self.fail("%r has invalid json: %r" % (filename, why))
            # XXX - the below is probably broken but none of our
            # JSON files provide them
            assert '_attachments' not in ob, "please revisit this code!"
            for name, data in ob.get('_attachments', {}).iteritems():
                fname = os.path.join(corpus_dir, data['filename'])
                with open(fname, 'rb') as attach_f:
                    data['data'] = attach_f.read()
            si = self.doc_model.doc_to_schema_item(ob)
        elif os.path.exists(basename + ".rfc822.txt"):
            # plain rfc822.txt file.
            with open(basename + ".rfc822.txt", 'rb') as f:
                si = self.rfc822_to_schema_item(f)
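# Why the *first* dot matters: a corpus item "msg1.rfc822.txt" must reduce
# to the base name "msg1" so its optional "msg1.json" override is paired
# with it; os.path.splitext would only strip back to the *last* dot:
#
#   >>> "msg1.rfc822.txt".split(".", 1)[0]
#   'msg1'
#   >>> os.path.splitext("msg1.rfc822.txt")[0]
#   'msg1.rfc822'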
def get_api_handler(options, req):
    # path format is "db_name/external_name/app_name/class_name/method_name"
    if len(req.get('path', [])) != 5:
        raise APILoadError("invalid api request format")
    dbname = req['path'][0]
    cache_key = tuple(req['path'][:4])
    try:
        return _handlers[cache_key]
    except KeyError:
        # first request for this handler
        pass

    # Load the schemas which declare they implement this end-point.
    apiid = req['path'][2:4]  # the 'app' name and the 'class' name.
    path = "/%s/_design/raindrop!content!all/_view/api_endpoints" % dbname
    req_options = {'key': json.dumps(apiid), 'include_docs': 'true'}
    uri = path + "?" + urlencode(req_options)
    c = httplib.HTTPConnection(options.couchdb_host, options.couchdb_port)
    c.request("GET", uri)
    resp = c.getresponse()
    if resp.status != 200:
        raise APILoadError("api query failure (%s: %s) to %s:%s",
                           resp.status, resp.reason,
                           options.couchdb_host, options.couchdb_port)
    result = json.load(resp)
    resp.close()
    rows = result['rows']
    if not rows:
        raise APILoadError("No such API end-point %s", apiid)
    if len(rows) != 1:
        # should only be one doc with this criteria!
        raise APILoadError("too many docs say they implement this api!")
    doc = rows[0]['doc']
    if doc.get('content_type') != 'application/x-python' or not doc.get('code'):
        raise APILoadError("document is not a python implemented API (%s)",
                           doc['content_type'])
    # Now dynamically compile the code we loaded.
    globs = api_globals.copy()
    try:
        exec doc['code'] in globs
    except Exception, exc:
        raise APILoadError("Failed to initialize api: %s", exc)
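# The excerpt ends before the compiled handler is cached and returned.  A
# sketch of the likely remainder, assuming the exec'd code exposes a
# callable named 'handler' and that _handlers is the module-level cache
# consulted at the top of the function - both assumptions:
#
#   handler = globs.get('handler')
#   if handler is None:
#       raise APILoadError("API code does not define a 'handler' function")
#   _handlers[cache_key] = handler
#   return handler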
def handler(doc):
    link = doc
    short_hash = None
    service = SERVICES.get(link["domain"])
    if service is not None:
        prop = service.get("link_prop")
        match = service.get("regex").search(link[prop])
        if match and match.group(1):
            short_hash = match.group(1)
    if short_hash is None:
        return

    options = service.get("options")
    options[service.get("hash_option_name")] = \
        service.get("hash_option_function")(link["url"], short_hash)
    api = "%s?%s" % (service.get("api"),
                     "&".join(["%s=%s" % v for v in options.items()]))
    opener = urllib2.build_opener()
    obj = json.load(opener.open(api))
    if obj.get("errorCode") == 0:
        shorty = obj.get("results").get(short_hash)
        ss = service.get("schema")
        # XXX not all of these items are actually used; we could trim down
        # the size of these documents if space needed to be saved, but for
        # now it's nice to have the extra data in case we want it later.
        schema = {
            "short_url": ss.get("short_url")(link, shorty),
            "long_url": ss.get("long_url")(link, shorty),
            "title": ss.get("title")(link, shorty),
            "thumbnail": ss.get("thumbnail")(link, shorty),
            "user_name": ss.get("user_name")(link, shorty),
            "display_name": ss.get("display_name")(link, shorty),
            "user_url": ss.get("user_url")(link, shorty),
            "description": ss.get("description")(link, shorty),
            "extra": shorty,
            "domain": link.get("domain"),
            "ref_link": link["url"],
        }
        emit_schema("rd.attach.link.expanded", schema)
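# SERVICES is not shown in this excerpt.  Each entry must carry the keys
# the handler reads; a sketch of what a bit.ly entry could look like.  The
# option names follow the public bit.ly v1 'expand' API (which returns the
# errorCode/results shape checked above) - treat every detail here as an
# assumption:
import re

SERVICES = {
    "bit.ly": {
        "link_prop": "path",
        "regex": re.compile(r'^/(\w+)'),
        "api": "http://api.bit.ly/expand",
        "options": {"version": "2.0.1", "login": "LOGIN", "apiKey": "APIKEY"},
        "hash_option_name": "hash",
        "hash_option_function": lambda url, short_hash: short_hash,
        "schema": {
            "short_url": lambda link, shorty: link["url"],
            "long_url": lambda link, shorty: shorty.get("longUrl"),
            "title": lambda link, shorty: None,
            "thumbnail": lambda link, shorty: None,
            "user_name": lambda link, shorty: None,
            "display_name": lambda link, shorty: None,
            "user_url": lambda link, shorty: None,
            "description": lambda link, shorty: None,
        },
    },
}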
def handler(doc):
    video_id = None
    # Check for vimeo video urls.
    match = vimeo_video_regex.search(doc['url'])
    if match and match.group(1):
        video_id = match.group(1)
    if video_id is None:
        return

    # http://vimeo.com/api/docs/simple-api
    info_api = "http://vimeo.com/api/v2/video/%s.json" % video_id
    opener = urllib2.build_opener()
    # They don't like the urllib user-agent!
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    obj = json.load(opener.open(info_api))
    opener.close()
    # Vimeo always returns a list and we only asked for one video.
    schema = obj.pop()
    schema['ref_link'] = doc['url']
    emit_schema('rd.attach.link.vimeo', schema)
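# vimeo_video_regex is not defined in this excerpt.  A sketch that matches
# plain video urls like http://vimeo.com/1234567 (an assumption - the real
# pattern may also cover channel or group urls):
import re
vimeo_video_regex = re.compile(r'vimeo\.com/(\d+)')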
def handler(link):
    foursq = None
    if link['domain'] == "4sq.com" and foursq_path_regex.search(link['path']):
        try:
            # 4sq.com short links redirect to the full foursquare.com
            # venue page - follow the redirect and parse the final path.
            opener = urllib2.build_opener()
            redir = opener.open(link['url'])
            path = urllib2.urlparse.urlparse(redir.url).path
            match = foursquare_venue_path_regex.search(path)
            if match and match.group(1):
                foursq = match.group(1)
        except urllib2.HTTPError, e:
            logger.error("link: %s error: %s", link['url'], e)
    elif link['domain'] == "foursquare.com":
        match = foursquare_venue_path_regex.search(link['path'])
        if match and match.group(1):
            foursq = match.group(1)
    if foursq is None:
        return

    options = {'vid': foursq}
    info_api = "http://api.foursquare.com/v1/venue.json?%s" % \
               "&".join(['%s=%s' % v for v in options.items()])
    opener = urllib2.build_opener()
    obj = json.load(opener.open(info_api))
    if obj:
        obj["ref_link"] = link['url']
        emit_schema('rd.attach.link.foursquare', obj)
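# Neither regex is shown in this excerpt.  Plausible sketches: 4sq.com
# short links carry an opaque hash in the path, and venue pages live at
# /venue/<numeric id> (both patterns are assumptions):
import re
foursq_path_regex = re.compile(r'^/(\w+)')
foursquare_venue_path_regex = re.compile(r'/venue/(\d+)')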