import urllib

import tangelo


def run(*dataspec, **query):
    # Forward the request to the Census API: positional arguments form the
    # dataset path, keyword arguments form the query string.
    f = urllib.urlopen("http://api.census.gov/data/" + "/".join(dataspec) + "?" + urllib.urlencode(query))
    response = tangelo.empty_response()
    response['result'] = f.read()
    return response
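# A quick illustration of how the proxy above maps its arguments.  The
# dataset and variable names below are illustrative guesses, not values the
# service prescribes; "for" is passed via a dict because it is a Python
# keyword.
if __name__ == "__main__":
    print run("2012", "acs5", **{"get": "B01003_001E", "for": "state:*"})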
import itertools

import impala
import tangelo


def run(database, table, start_time, end_time, center, degree, host="mongo", port="21000", fields="true"):
    response = tangelo.empty_response()

    # The degree of separation must be an integer.
    try:
        degree = int(degree)
    except ValueError:
        response["error"] = "argument 'degree' must be an integer"
        return response

    client = impala.ImpalaBeeswaxClient(host + ':' + port)
    client.connect()

    # Track the interlocutors seen so far, and their distance from the center.
    talkers = set([center])
    distance = {center: 0}

    current_talkers = list(talkers)
    all_results = []
    for i in range(degree):
        query = build_query(database, table, start_time, end_time, current_talkers)
        qResults = client.execute(query)
        # Honor the fields argument instead of the hard-coded "true" the
        # original passed here.
        results = convert_results(qResults, fields)

        # Collect the names appearing in this round of results.
        current_talkers = list(itertools.chain(*map(lambda x: [x["emailto"], x["emailfrom"]], results)))
        current_talkers = list(set(current_talkers))
        talkers = talkers.union(current_talkers)

        # Record the distance from the center for newly seen talkers.
        for t in current_talkers:
            if t not in distance:
                distance[t] = i + 1

        all_results.append(results)

    # Construct a D3-friendly node-link structure, starting with an index
    # map of the talkers.
    talkers = list(talkers)
    talker_index = {name: index for (index, name) in enumerate(talkers)}

    all_results = itertools.chain(*all_results)

    edges = []
    ident = 0
    for result in all_results:
        source = result["emailfrom"]
        target = result["emailto"]
        ident += 1
        rec = {"source": talker_index[source], "target": talker_index[target], "id": str(ident)}
        edges.append(rec)

    talkers = [{"email": n, "distance": distance[n]} for n in talkers]

    response["result"] = {"nodes": talkers, "edges": edges}
    return response
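# Neither build_query() nor convert_results() is defined in this module.  A
# minimal sketch of what build_query() might look like, assuming an email
# table with emailfrom, emailto, and dt columns holding YYYY-MM-DD strings
# (the schema is a guess for illustration; the real helper may differ):
def build_query(database, table, start_time, end_time, talkers):
    quoted = ", ".join("'%s'" % (t) for t in talkers)
    return ("SELECT emailfrom, emailto FROM %s.%s "
            "WHERE dt >= '%s' AND dt < '%s' "
            "AND emailfrom != '' AND emailto != '' "
            "AND (emailfrom IN (%s) OR emailto IN (%s))" %
            (database, table, start_time, end_time, quoted, quoted))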
import datetime

import bson.json_util
import pymongo
import tangelo


def run(servername, dbname, datatype, by=None, datemin=None, datemax=None, charity=None):
    # Construct an empty response object.
    response = tangelo.empty_response()

    # Establish a connection to the MongoDB server.
    try:
        conn = pymongo.Connection(servername)
    except pymongo.errors.AutoReconnect as e:
        response["error"] = "error: %s" % (e.message)
        return bson.json_util.dumps(response)

    # Extract the requested database and collection.
    db = conn[dbname]
    if datatype == "transactions":
        coll = db["charitynet.normalized.transactions"]
        conditions = [{"date": {"$ne": None}}]
        if datemin is not None and datemax is not None:
            date_min = datetime.datetime.strptime(datemin, "%Y-%m-%d")
            date_max = datetime.datetime.strptime(datemax, "%Y-%m-%d")
            conditions.append({"date": {"$gte": date_min}})
            conditions.append({"date": {"$lt": date_max}})
        if charity is not None:
            conditions.append({"charity_id": int(charity)})
        pipeline = []
        if len(conditions) > 0:
            pipeline.append({"$match": {"$and": conditions}})
        if by == "month":
            group = {"year": {"$year": "$date"}, "month": {"$month": "$date"}}
        else:
            group = "$county"
        pipeline.append({"$group": {"_id": group, "amount": {"$sum": "$amount"}}})
        result = coll.aggregate(pipeline)
        if by == "month":
            response = [[d["_id"], float(d["amount"])] for d in result["result"] if d["_id"] is not None]
        else:
            response = [["%05d" % d["_id"], float(d["amount"])] for d in result["result"] if d["_id"] is not None]
    elif datatype == "population":
        coll = db["census"]
        response = [[d["_id"], int(d["pop2010"])] for d in coll.find()]
    elif datatype == "charities":
        coll = db["charitynet.normalized.transactions"]
        result = coll.aggregate([{"$group": {"_id": "$charity_id", "count": {"$sum": 1}}},
                                 {"$sort": {"count": -1}}])
        response = [[d["_id"], d["_id"], d["count"]] for d in result["result"]]
    else:
        response["error"] = "error: unknown datatype requested"

    # Convert to JSON and return the result.
    return bson.json_util.dumps(response)
def stream(self, key=None, action="next"):
    if action != "show":
        # Check for the key parameter.
        if key is None:
            raise cherrypy.HTTPError("400 Required Query Parameter Missing",
                                     "The streaming API requires a 'key' query parameter")

        # Check that the key actually exists.
        if key not in self.streams:
            raise cherrypy.HTTPError("404 Key Not Found",
                                     "The key '%s' does not reference any existing stream" % (key))

    # Construct a container object.
    result = tangelo.empty_response()

    # Perform the requested action.
    actions = ["next", "delete", "show"]
    if action == "next":
        # Grab the stream in preparation for running it.
        stream = self.streams[key]

        # Attempt to run the stream via its next() method - if this yields a
        # result, then continue; if the next() method raises StopIteration,
        # then there are no more results to retrieve; if any other exception
        # is raised, this is treated as an error.
        try:
            result["stream_finished"] = False
            result["result"] = stream.next()
        except StopIteration:
            result["stream_finished"] = True
            del self.streams[key]
        except:
            del self.streams[key]
            raise cherrypy.HTTPError("501 Error in Python Service",
                                     "Caught exception while executing stream service keyed by %s:<br><pre>%s</pre>" % (key, traceback.format_exc()))
    elif action == "delete":
        del self.streams[key]
        result["result"] = "OK"
    elif action == "show":
        raise cherrypy.HTTPError("501 Unimplemented",
                                 "The 'show' action in the Tangelo streaming API has not yet been implemented")
    else:
        raise cherrypy.HTTPError("400 Bad Query Parameter",
                                 "The 'action' parameter must be one of: %s" % (", ".join(actions)))

    try:
        result = json.dumps(result)
    except TypeError:
        raise cherrypy.HTTPError("501 Bad Response from Python Service",
                                 "The stream keyed by %s returned a non JSON-serializable result: %s" % (key, result["result"]))

    return result
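# A sketch of a client driving the streaming API above: poll the "next"
# action until the stream reports completion.  The service URL is an
# assumption for illustration; only the "key" and "action" query parameters
# come from the method itself.
import json
import urllib

def consume_stream(stream_url, key):
    # Yield each result from the stream until stream_finished is reported.
    while True:
        f = urllib.urlopen("%s?key=%s&action=next" % (stream_url, key))
        payload = json.loads(f.read())
        if payload.get("stream_finished"):
            return
        yield payload["result"]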
import bson.json_util
import pymongo
import tangelo


def run(servername, dbname, collname, file_hash=None, data=None):
    # Construct an empty response object.
    response = tangelo.empty_response()

    # If no file hash was passed in, give an error.
    #
    # TODO(choudhury): see comment below about error codes, etc.
    if file_hash is None:
        response['error'] = "no file hash"
        return bson.json_util.dumps(response)

    # Establish a connection to the MongoDB server.
    try:
        conn = pymongo.Connection(servername)
    except pymongo.errors.AutoReconnect as e:
        response['error'] = "error: %s" % (e.message)
        return bson.json_util.dumps(response)

    # Extract the requested database and collection.
    db = conn[dbname]
    coll = db[collname]

    # If no data field was specified, treat this as a read request;
    # otherwise, write the data to the database.
    if data is None:
        # Create a search schema for finding the record with the appropriate
        # hash.
        schema = {'file_hash': file_hash}

        # Apply the schema to retrieve documents.
        response['result'] = [d for d in coll.find(schema)]
    else:
        # Convert the JSON object "data" to a Python object.
        try:
            pydata = bson.json_util.loads(data)
        except ValueError as e:
            response['error'] = e.message
            return bson.json_util.dumps(response)

        # Insert the decoded object (the original inserted the raw JSON
        # string, leaving pydata unused).
        coll.insert({'file_hash': file_hash, 'data': pydata})

        # Return a success code.
        response['result'] = "ok"

    # Convert to JSON and return the result.
    return bson.json_util.dumps(response)
import datetime

import bson.json_util
import pymongo
import tangelo


def run(servername, dbname, datatype, datemin="2012-01-01", datemax="2012-02-01"):
    # Construct an empty response object.
    response = tangelo.empty_response()

    # Establish a connection to the MongoDB server.
    try:
        conn = pymongo.Connection(servername)
    except pymongo.errors.AutoReconnect as e:
        response['error'] = "error: %s" % (e.message)
        return bson.json_util.dumps(response)

    # Extract the requested database and collection.
    db = conn[dbname]
    if datatype == "full":
        # Output the number of donors per county.  Zero-pad the county code
        # so that state codes less than 10 are output correctly.
        coll = db["charitynet.normalized.donors.counties"]
        result = coll.find()
        # Do not use the normal 'result' field, so that the URL can be used
        # directly in a Vega specification.
        response = [["%05d" % d['_id'], int(d['value'])] for d in result if d['_id'] is not None]
    elif datatype == "bycounty":
        coll = db["charitynet.normalized.transactions"]
        result = coll.aggregate([{"$group": {"_id": "$county", "amount": {"$sum": "$amount"}}}])
        response = [["%05d" % d['_id'], float(d['amount'])] for d in result["result"] if d["_id"] is not None]
    elif datatype == "bydate":
        date_min = datetime.datetime.strptime(datemin, "%Y-%m-%d")
        date_max = datetime.datetime.strptime(datemax, "%Y-%m-%d")
        coll = db["charitynet.normalized.transactions"]
        query = {"$and": [{"date": {"$gte": date_min}}, {"date": {"$lt": date_max}}]}
        group = {"_id": "$county", "amount": {"$sum": "$amount"}}
        result = coll.aggregate([{"$match": query}, {"$group": group}])
        response = [["%05d" % d['_id'], float(d['amount'])] for d in result["result"] if d["_id"] is not None]
    elif datatype == "population":
        coll = db["census"]
        response = [[d["_id"], int(d["pop2010"])] for d in coll.find()]
    else:
        response['error'] = "error: unknown datatype requested"

    # Convert to JSON and return the result.
    return bson.json_util.dumps(response)
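# Note the design choice above: the map-oriented branches return a bare
# array of [fips, value] pairs rather than the usual {"result": ...}
# wrapper, e.g. (illustrative values only):
#
#   [["01001", 12345.67], ["01003", 8910.11]]
#
# so that the service URL can be referenced directly from a Vega data spec.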
def run(text=""): # Create an empty result container. response = empty_response(); response['result'] = []; # If nothing passed in, return an empty result. if text == "": return response # Otherwise, perform named entity recognition. sentences = nltk.sent_tokenize(text) chunks = [nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(s))) for s in sentences] # Now find all tagged chunks that are not whole sentences - gather the leaves of such # chunks into strings, and place them in the list of named entities. for c in chunks: for subtree in filter(lambda x: x.node != 'S', c.subtrees()): response['result'].append( (subtree.node, ' '.join(map(lambda x: x[0], subtree.leaves())) ) ) return response
import bson.json_util
import pymongo
import tangelo


def run(servername, dbname, collname, name=None, data=None, code=None):
    # Construct an empty response object.
    response = tangelo.empty_response()

    # If no name was passed in, give an error.
    #
    # TODO(choudhury): see comment below about error codes, etc.
    if name is None:
        response['error'] = "no name"
        return bson.json_util.dumps(response)

    # Establish a connection to the MongoDB server.
    try:
        conn = pymongo.Connection(servername)
    except pymongo.errors.AutoReconnect as e:
        response['error'] = "error: %s" % (e.message)
        return bson.json_util.dumps(response)

    # Extract the requested database and collection.
    db = conn[dbname]
    coll = db[collname]

    # If no data or code field was specified, treat this as a read request;
    # otherwise, write the data to the database.
    if data is None and code is None:
        # Create a search schema for finding the record with the appropriate
        # name.
        schema = {'_id': name}

        # Apply the schema to retrieve documents.
        response['result'] = [d for d in coll.find(schema)]
    else:
        # Save the record under the given name.
        coll.save({'_id': name, 'data': data, 'code': code})

        # Return a success code.
        response['result'] = "ok"

    # Convert to JSON and return the result.
    return bson.json_util.dumps(response)
def run(server, db, coll, method="find", query=None, limit=1000, fields=None, sort=None, fill=None): # Create an empty response object. response = tangelo.empty_response() # Check the requested method. if method not in ["find", "insert"]: response["error"] = "Unsupported MongoDB operation '%s'" % (method) return bson.json_util.dumps(response) # Decode the query strings into Python objects. try: if query is not None: query = decode(query, "query", response) if fields is not None: fields = decode(fields, "fields", response) if sort is not None: sort = decode(sort, "sort", response) if fill is not None: fill = decode(fill, "fill", response) else: fill = True except ValueError: return bson.json_util.dumps(response) # Cast the limit value to an int. try: limit = int(limit) except ValueError: response["error"] = "Argument 'limit' ('%s') could not be converted to int." % (limit) return bson.json_util.dumps(response) # Create database connection. try: c = pymongo.Connection(server)[db][coll] except pymongo.errors.AutoReconnect: response["error"] = "Could not connect to MongoDB server '%s'" % (server) return bson.json_util.dumps(response) # Perform the requested action. if method == "find": # Do a find operation with the passed arguments. it = c.find(spec=query, fields=fields, limit=limit, sort=sort) # Create a list of the results. if fill: results = [x for x in it] else: results = [] # Create an object to structure the results. retobj = {} retobj["count"] = it.count() retobj["data"] = results # Pack the results into the response object, and return it. response["result"] = retobj else: raise RuntimeError("illegal method '%s' in module 'mongo'") # Return the response object. return bson.json_util.dumps(response)
import bson.json_util
import pymongo
import tangelo
from bson.objectid import ObjectId


def run(servername, dbname, data_coll, name=None, objectid=None, _id=None, accession=None, scientific_name=None, noid=False, noloc=False, maxdepth=100):
    def recursiveHelper(child, depth=0):
        it = c.find({'_id': child})
        phylo = it[0]
        if 'clades' in phylo:
            counter = 0
            for child in phylo['clades']:
                if depth >= maxdepth:
                    phylo['clades'][counter] = str(child)
                else:
                    phylo['clades'][counter] = recursiveHelper(child, depth + 1)
                counter += 1
        if noid:
            del phylo['_id']
        else:
            phylo['_id'] = str(phylo['_id'])
        if 'loc' in phylo:
            if noloc:
                del phylo['loc']
        return phylo

    # Construct an empty response object.
    response = tangelo.empty_response()

    query = dict()

    # Decode the query strings into Python objects.
    try:
        if name is not None:
            decodeAndAdd(name, query, 'sequences.name', response)
        if objectid is not None:
            decodeAndAdd(ObjectId(objectid), query, 'objectid', response)
        if _id is not None:
            decodeAndAdd(_id, query, '_id', response)
        if accession is not None:
            decodeAndAdd(accession, query, 'sequences.accession.source', response)
        if scientific_name is not None:
            decodeAndAdd(scientific_name, query, 'taxonomies.scientific_name', response)
    except ValueError:
        return bson.json_util.dumps(response)

    # Cast the maxdepth value to an int.
    try:
        maxdepth = int(maxdepth)
    except ValueError:
        response['error'] = "Argument 'maxdepth' ('%s') could not be converted to int." % (maxdepth)
        return bson.json_util.dumps(response)

    # Create database connection.
    try:
        c = pymongo.Connection(servername)[dbname][data_coll]
    except pymongo.errors.AutoReconnect:
        response['error'] = "Could not connect to MongoDB server '%s'" % (servername)
        return bson.json_util.dumps(response)

    # If no arguments were given, just search from the root.
    if not query:
        query['rooted'] = True

    it = c.find(query)

    # Create a new tree for the results.
    if it.count() == 1:
        phylo = it[0]
        phylotree = recursiveHelper(phylo['_id'])

        # Convert to JSON and return the result.
        return bson.json_util.dumps(phylotree, sort_keys=True)
    else:
        response['error'] = "Search returned %s object(s) to root the tree" % (it.count())
        response['error'] += "| %s" % (str(query))
        return bson.json_util.dumps(response)
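# decodeAndAdd() is likewise not defined here.  One plausible reading of its
# call sites: stash the (possibly JSON-encoded) value into the query under
# the given key, raising ValueError - which the caller catches - when a
# string value cannot be decoded.  This is a guess at the helper's contract,
# not its actual code:
def decodeAndAdd(value, query, key, response):
    try:
        query[key] = bson.json_util.loads(value) if isinstance(value, basestring) else value
    except ValueError:
        response['error'] = "Argument '%s' ('%s') is not valid JSON." % (key, value)
        raise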
import datetime

import bson.json_util
import pymongo
import tangelo


def run(start_date=None, end_date=None, omit_countries=None, omit_diseases=None):
    # Check for required arguments.
    if start_date is None:
        return tangelo.HTTPStatusCode("422 Missing Parameter", "Required parameter <i>start_date</i> missing.")
    elif end_date is None:
        return tangelo.HTTPStatusCode("422 Missing Parameter", "Required parameter <i>end_date</i> missing.")

    # Convert arguments to date objects.
    try:
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    except ValueError:
        return tangelo.HTTPStatusCode("422 Bad Parameter", "Parameter <i>start_date</i> ('%s') was not in YYYY-MM-DD form." % (start_date))

    try:
        end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    except ValueError:
        return tangelo.HTTPStatusCode("422 Bad Parameter", "Parameter <i>end_date</i> ('%s') was not in YYYY-MM-DD form." % (end_date))

    # See if there are any countries or diseases to omit.
    if omit_countries is None:
        omit_countries = []
    else:
        try:
            omit_countries = bson.json_util.loads(omit_countries)
        except ValueError:
            return tangelo.HTTPStatusCode("422 Bad Parameter", "Parameter <i>omit_countries</i> ('%s') was not JSON-deserializable." % (omit_countries))

    if omit_diseases is None:
        omit_diseases = []
    else:
        try:
            omit_diseases = bson.json_util.loads(omit_diseases)
        except ValueError:
            return tangelo.HTTPStatusCode("422 Bad Parameter", "Parameter <i>omit_diseases</i> ('%s') was not JSON-deserializable." % (omit_diseases))

    # Perform the lookup.
    coll = pymongo.Connection("mongo").canepi.alerts
    query = coll.find({"$and": [{"date": {"$gte": start_date}},
                                {"date": {"$lt": end_date}},
                                {"disease": {"$not": {"$in": omit_diseases}}},
                                {"country": {"$not": {"$in": omit_countries}}}]},
                      fields=["_id", "date", "rating.rating", "disease", "country"])

    # Compute the graph structure.
    nodes = []
    links = []
    diseases = {}
    countries = {}
    for q in query:
        # For each result record, construct an "alert type" node and store
        # it in the node list.
        alert = {"id": str(q["_id"]), "date": q["date"], "rating": q["rating"]["rating"], "type": "alert"}
        nodes.append(alert)

        # Extract the disease and country, and create nodes for them if they
        # don't already exist.
        if q["country"] not in countries:
            countries[q["country"]] = {"id": q["country"], "type": "country"}
        country = countries[q["country"]]

        if q["disease"] not in diseases:
            diseases[q["disease"]] = {"id": q["disease"], "type": "disease"}
        disease = diseases[q["disease"]]

        # Create links between the alert and its country and its disease.
        links += [{"source": alert, "target": country}, {"source": alert, "target": disease}]

    # Add the disease and country nodes to the node list.
    nodes += countries.values() + diseases.values()

    # Create an index map of the nodes.
    nodemap = {value["id"]: index for (index, value) in enumerate(nodes)}

    # Replace the raw entries in the links list with indices into the node
    # array.
    for i, v in enumerate(links):
        links[i]["source"] = nodemap[v["source"]["id"]]
        links[i]["target"] = nodemap[v["target"]["id"]]

    # Create a response object and pack the graph structure into it.
    r = tangelo.empty_response()
    r["result"] = {"nodes": nodes, "links": links}

    # Use the special bson encoder and return the result.
    return bson.json_util.dumps(r)
import datetime
import itertools

import tangelo
from pyspark import SparkContext


def run(host, database, collection, start_time=None, end_time=None, center=None, degree=None):
    response = tangelo.empty_response()

    # Bail with an error if any of the required arguments is missing.
    missing = map(lambda x: x[0],
                  filter(lambda x: x[1] is None,
                         zip(["start_time", "end_time", "center", "degree"],
                             [start_time, end_time, center, degree])))
    if len(missing) > 0:
        response["error"] = "missing required arguments: %s" % (", ".join(missing))
        return response

    # Cast the arguments to the right types.
    #
    # The degree is the degree of separation between the center element and
    # the retrieved nodes - an integer.
    try:
        degree = int(degree)
    except ValueError:
        response["error"] = "argument 'degree' must be an integer"
        return response

    # The start and end times arrive as YYYY-MM-DD strings and are parsed
    # into datetime objects.
    try:
        start_time = datetime.datetime.strptime(start_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'start_time' must be in YYYY-MM-DD format"
        return response

    try:
        end_time = datetime.datetime.strptime(end_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'end_time' must be in YYYY-MM-DD format"
        return response

    # Get a handle to the Spark context, creating one if necessary.
    if SparkContext._active_spark_context is None:
        sc = SparkContext('spark://impaladev.darpa.mil:7077', 'Enron Emailers')
    else:
        sc = SparkContext._active_spark_context

    enronData = sc.textFile('hdfs://localhost:8020/user/bigdata/pgill/enron/email_graph_fixed.txt') \
                  .map(lambda line: line.split('\t')) \
                  .cache()

    def withinTimespan(record):
        recordDate = datetime.datetime.strptime(record[2], "%Y-%m-%d")
        return recordDate >= start_time and recordDate < end_time

    def emptyRecords(record):
        return record[0] != "" and record[1] != ""

    def orderRecord(record):
        if record[1] < record[0]:
            record[0], record[1] = record[1], record[0]
        return record

    enronSpan = enronData.filter(withinTimespan) \
                         .filter(emptyRecords) \
                         .map(orderRecord) \
                         .map(lambda rec: (rec[0], rec[1])) \
                         .distinct() \
                         .cache()

    # Start a set of all interlocutors we're interested in - that includes
    # the center emailer.
    talkers = set([center])

    # Also start a table of distances from the center.
    distance = {center: 0}

    current_talkers = list(talkers)
    all_results = []
    for i in range(degree):
        def emailsInvolved(record):
            return any(keyword in record for keyword in current_talkers)

        results = enronSpan.filter(emailsInvolved).collect()

        # Collect the names.
        current_talkers = list(itertools.chain(*map(lambda x: [x[1], x[0]], results)))
        current_talkers = list(set(current_talkers))
        talkers = talkers.union(current_talkers)

        # Compute updates to everyone's distance from center.
        for t in current_talkers:
            if t not in distance:
                distance[t] = i + 1

        # Save the partial results.
        all_results.append(results)

    # Construct a canonical graph structure from the set of talkers and the
    # list of emails.
    #
    # Start with an index map of the talkers.
    talkers = list(talkers)
    talker_index = {name: index for (index, name) in enumerate(talkers)}

    # Create a chained iterable from all the partial results.
    all_results = itertools.chain(*all_results)

    # Create a list of graph edges suitable for use by D3 - replace each
    # record in the data with one that carries an index into the emailers
    # list.
    edges = []
    ident = 0
    for result in all_results:
        source = result[0]
        target = result[1]
        ident += 1
        rec = {"source": talker_index[source], "target": talker_index[target], "id": str(ident)}
        edges.append(rec)

    talkers = [{"email": n, "distance": distance[n]} for n in talkers]

    # Stuff the graph data into the response object, and return it.
    response["result"] = {"nodes": talkers, "edges": edges}
    return response
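# All of the email-graph variants in this collection (Impala, Spark, Mongo,
# Shark, Rexster) return this same node-link shape, which d3.layout.force()
# can consume directly; illustrative values only:
#
#   {"nodes": [{"email": "center@enron.com", "distance": 0}, ...],
#    "edges": [{"source": 0, "target": 3, "id": "1"}, ...]}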
import datetime
import itertools

import pymongo
import tangelo


def run(host, database, collection, start_time=None, end_time=None, center=None, degree=None):
    response = tangelo.empty_response()

    # Bail with an error if any of the required arguments is missing.
    missing = map(lambda x: x[0],
                  filter(lambda x: x[1] is None,
                         zip(["start_time", "end_time", "center", "degree"],
                             [start_time, end_time, center, degree])))
    if len(missing) > 0:
        response["error"] = "missing required arguments: %s" % (", ".join(missing))
        return response

    # Cast the arguments to the right types.
    #
    # The degree is the degree of separation between the center element and
    # the retrieved nodes - an integer.
    try:
        degree = int(degree)
    except ValueError:
        response["error"] = "argument 'degree' must be an integer"
        return response

    # The start and end times arrive as YYYY-MM-DD strings and are parsed
    # into datetime objects (matching how dates are stored in MongoDB).
    try:
        start_time = datetime.datetime.strptime(start_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'start_time' must be in YYYY-MM-DD format"
        return response

    try:
        end_time = datetime.datetime.strptime(end_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'end_time' must be in YYYY-MM-DD format"
        return response

    # Get a handle to the database collection.
    try:
        c = pymongo.Connection(host)[database][collection]
    except pymongo.errors.AutoReconnect as e:
        response["error"] = "database error: %s" % (e.message)
        return response

    # Start a set of all interlocutors we're interested in - that includes
    # the center emailer.
    talkers = set([center])

    # Also start a table of distances from the center.
    distance = {center: 0}

    current_talkers = list(talkers)
    all_results = []
    for i in range(degree):
        # Construct and send a query to retrieve all records involving the
        # current talkers, occurring within the time bounds specified, and
        # involving two known addresses.
        query = {"$and": [{"date": {"$gte": start_time}},
                          {"date": {"$lt": end_time}},
                          {"source": {"$ne": ""}},
                          {"target": {"$ne": ""}},
                          {"$or": [{"source": {"$in": current_talkers}},
                                   {"target": {"$in": current_talkers}}]}]}
        results = c.find(query, fields=["target", "source"])

        # Collect the names.
        current_talkers = list(itertools.chain(*map(lambda x: [x["target"], x["source"]], results)))
        talkers = talkers.union(current_talkers)

        # Compute updates to everyone's distance from center.
        for t in current_talkers:
            if t not in distance:
                distance[t] = i + 1

        # Rewind and save the cursor.
        results.rewind()
        all_results.append(results)

    # Construct a canonical graph structure from the set of talkers and the
    # list of emails.
    #
    # Start with an index map of the talkers.
    talkers = list(talkers)
    talker_index = {name: index for (index, name) in enumerate(talkers)}

    # Create a chained iterable from all the rewound partial results.
    all_results = itertools.chain(*all_results)

    # Create a list of graph edges suitable for use by D3 - replace each
    # record in the data with one that carries an index into the emailers
    # list.
    edges = []
    for result in all_results:
        source = result["source"]
        target = result["target"]
        ident = str(result["_id"])
        rec = {"source": talker_index[source], "target": talker_index[target], "id": ident}
        edges.append(rec)

    talkers = [{"email": n, "distance": distance[n]} for n in talkers]

    # Stuff the graph data into the response object, and return it.
    response["result"] = {"nodes": talkers, "edges": edges}
    return response
import bson.json_util
import pymongo
import tangelo
from bson.objectid import ObjectId

import api  # project-local helper module


def run(servername, projectName, datasetName, name=None, objectid=None, _id=None, accession=None, scientific_name=None, noid=False, noloc=False, nobranchlength=False, maxdepth=1000):
    def recursiveHelper(child, depth=0):
        it = c.find({'_id': child})
        phylo = it[0]
        if 'clades' in phylo:
            counter = 0
            for child in phylo['clades']:
                if depth >= maxdepth:
                    phylo['clades'][counter] = str(child)
                else:
                    phylo['clades'][counter] = recursiveHelper(child, depth + 1)
                counter += 1
        if noid:
            del phylo['_id']
        else:
            phylo['_id'] = str(phylo['_id'])
        if nobranchlength:
            if 'branch_length' in phylo:
                del phylo['branch_length']
        if 'loc' in phylo:
            if noloc:
                del phylo['loc']
        return phylo

    # Look up the data collection.
    data_coll = api.returnCollectionForObjectByName(projectName, 'PhyloTree', datasetName)
    dbname = api.getMongoDatabase()
    print "initializing to db:", dbname, "collection:", data_coll

    # Construct an empty response object.
    response = tangelo.empty_response()

    query = dict()

    # Decode the query strings into Python objects.
    try:
        if name is not None:
            decodeAndAdd(name, query, 'sequences.name', response)
        if objectid is not None:
            decodeAndAdd(ObjectId(objectid), query, 'objectid', response)
        if _id is not None:
            decodeAndAdd(_id, query, '_id', response)
        if accession is not None:
            decodeAndAdd(accession, query, 'sequences.accession.source', response)
        if scientific_name is not None:
            decodeAndAdd(scientific_name, query, 'taxonomies.scientific_name', response)
    except ValueError:
        return bson.json_util.dumps(response)

    # Cast the maxdepth value to an int.
    try:
        maxdepth = int(maxdepth)
    except ValueError:
        response['error'] = "Argument 'maxdepth' ('%s') could not be converted to int." % (maxdepth)
        return bson.json_util.dumps(response)

    # Create database connection.
    try:
        c = pymongo.Connection(servername)[dbname][data_coll]
    except pymongo.errors.AutoReconnect:
        response['error'] = "Could not connect to MongoDB server '%s'" % (servername)
        return bson.json_util.dumps(response)

    # If no arguments were given, just search from the root.
    if not query:
        query['rooted'] = True

    it = c.find(query)

    # Create a new tree for the results.
    if it.count() == 1:
        phylo = it[0]
        phylotree = recursiveHelper(phylo['_id'])

        # Convert to JSON and return the result.
        return bson.json_util.dumps(phylotree, sort_keys=True)
    else:
        response['error'] = "Search returned %s object(s) to root the tree" % (it.count())
        response['error'] += "| %s" % (str(query))
        return bson.json_util.dumps(response)
def invoke_service(self, module, *pargs, **kwargs):
    # TODO(choudhury): This method should attempt to load the named module,
    # then invoke it with the given arguments.  However, if the named module
    # is "config" or something similar, the method should instead launch a
    # special "config" app, which lists the available app modules, along
    # with docstrings or similar.  It should also allow the user to
    # add/delete search paths for other modules.
    tangelo.content_type("text/plain")

    # Save the system path (be sure to *make a copy* using the list()
    # function) - it will be modified before invoking the service, and must
    # be restored afterwards.
    origpath = list(sys.path)

    # By default, the result should be a bare response that we will place an
    # error message in if something goes wrong; if nothing goes wrong this
    # will be replaced with some other object.
    result = tangelo.empty_response()

    # Store the modpath in the thread-local storage (tangelo.paths() makes
    # use of this per-thread data, so this is the way to get the data across
    # the "module boundary" properly).
    modpath = os.path.dirname(module)
    cherrypy.thread_data.modulepath = modpath
    cherrypy.thread_data.modulename = module

    # Extend the system path with the module's home path.
    sys.path.insert(0, modpath)

    # Import the module if not already imported previously (or if the module
    # to import, or its configuration file, has been updated since the last
    # import).
    try:
        stamp = self.modules.get(module)
        mtime = os.path.getmtime(module)

        config_file = module[:-2] + "json"
        config_mtime = None
        if os.path.exists(config_file):
            config_mtime = os.path.getmtime(config_file)

        if stamp is None or mtime > stamp["mtime"] or (config_mtime is not None and config_mtime > stamp["mtime"]):
            if stamp is None:
                tangelo.log("loading new module: " + module)
            else:
                tangelo.log("reloading module: " + module)

            # Load any configuration the module might carry with it.
            if config_mtime is not None:
                try:
                    with open(config_file) as f:
                        config = json.loads(json_minify(f.read()))
                        if type(config) != dict:
                            msg = "Service module configuration file does not contain a key-value store (i.e., a JSON Object)"
                            tangelo.log(msg)
                            raise TypeError(msg)
                except IOError:
                    tangelo.log("Could not open config file %s" % (config_file))
                    raise
                except ValueError as e:
                    tangelo.log("Error reading config file %s: %s" % (config_file, e))
                    raise
            else:
                config = {}

            cherrypy.config["module-config"][module] = config

            # Remove .py to get the module name.
            name = module[:-3]

            # Load the module.
            service = imp.load_source(name, module)
            self.modules[module] = {"module": service, "mtime": max(mtime, config_mtime)}
        else:
            service = stamp["module"]
    except:
        bt = traceback.format_exc()
        tangelo.log("Error importing module %s" % (tangelo.request_path()), "SERVICE")
        tangelo.log(bt, "SERVICE")
        result = tangelo.HTTPStatusCode("501 Error in Python Service",
                                        "There was an error while trying to import module %s:<br><pre>%s</pre>" % (tangelo.request_path(), bt))
    else:
        # Try to run the service - either it's in a function called "run()",
        # or else it's in a REST API consisting of at least one of "get()",
        # "put()", "post()", or "delete()".
        #
        # Collect the result in a variable - depending on its type, it will
        # be transformed in some way below (by default, to JSON, but it may
        # also raise a cherrypy exception, log itself in a streaming table,
        # etc.).
        try:
            if 'run' in dir(service):
                # Call the module's run() method, passing it the positional
                # and keyword args that came into this method.
                result = service.run(*pargs, **kwargs)
            else:
                # Reaching here means it's a REST API.  Check for the
                # requested method, ensure that it was marked as being part
                # of the API, and call it; or give a 405 error.
                method = cherrypy.request.method
                restfunc = service.__dict__[method.lower()]
                if restfunc is not None and hasattr(restfunc, "restful") and restfunc.restful:
                    result = restfunc(*pargs, **kwargs)
                else:
                    result = tangelo.HTTPStatusCode(405, "Method not allowed")
        except Exception:
            bt = traceback.format_exc()
            tangelo.log("Caught exception while executing service %s" % (tangelo.request_path()), "SERVICE")
            tangelo.log(bt, "SERVICE")
            result = tangelo.HTTPStatusCode("501 Error in Python Service",
                                            "There was an error executing service %s:<br><pre>%s</pre>" % (tangelo.request_path(), bt))

    # Restore the path to what it was originally.
    sys.path = origpath

    # Check the type of the result to decide what result to finally return:
    #
    # 1. If it is an HTTPStatusCode object, raise a cherrypy HTTPError
    #    exception, which will cause the browser to do the right thing.
    #
    # 2. TODO: If it's a Python generator object, log it with the Tangelo
    #    streaming API.
    #
    # 3. If it's a Python dictionary, convert it to JSON.
    #
    # 4. If it's a string, don't do anything to it.
    #
    # This allows the services to return a Python object if they wish, or to
    # perform custom serialization (such as for MongoDB results, etc.).
    if isinstance(result, tangelo.HTTPStatusCode):
        if result.msg:
            raise cherrypy.HTTPError(result.code, result.msg)
        else:
            raise cherrypy.HTTPError(result.code)
    elif "next" in dir(result):
        # Generate a key corresponding to this object, using 100 random
        # bytes from the system - ensure the random key is not already in
        # the table (even though it would be crazy to wind up with a
        # collision).
        #
        # TODO(choudhury): replace this with a call to generate_key().
        # Move the comment above into the generate_key() function.
        key = md5.md5(os.urandom(100)).hexdigest()
        while key in self.streams:
            key = md5.md5(os.urandom(100)).hexdigest()

        # Log the object in the streaming table.
        self.streams[key] = result

        # Create an object describing the logging of the generator object.
        result = tangelo.empty_response()
        result["stream_key"] = key

        # Serialize it to JSON.
        result = json.dumps(result)
    elif not isinstance(result, types.StringTypes):
        try:
            result = json.dumps(result)
        except TypeError as e:
            t = e.message.split("<service.")[1].split()[0]
            msg = "Service %s returned an object of type %s that could not be serialized to JSON" % (tangelo.request_path(), t)
            tangelo.log("Error: %s" % (msg), "SERVICE")
            raise cherrypy.HTTPError("501 Error in Python Service", msg)

    return result
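# invoke_service() above accepts two service shapes: a module exposing
# run(), or a REST module whose get()/put()/post()/delete() functions carry
# a truthy "restful" attribute.  A minimal sketch of the REST shape, using a
# hypothetical local decorator to set the marker (Tangelo provides its own
# equivalent):
import json

def restful(f):
    f.restful = True
    return f

@restful
def get(resource, operation=None):
    # Echo the positional and keyword arguments back as JSON.
    return json.dumps({"resource": resource, "operation": operation})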
def run(database, table, start_time, end_time, center, degree, host="localhost", port=10000, fields="true"): response = tangelo.empty_response() try: degree = int(degree) except ValueError: response["error"] = "argument 'degree' must be an integer" return response client = init_shark(host, port, database) talkers = set([center]) distance = {center: 0} current_talkers = list(talkers) all_results = [] for i in range(degree): query = build_query(database, table, start_time, end_time, current_talkers) client.execute(query) results = client.fetchAll() current_talkers = list(itertools.chain(*map(lambda x: [x.split("\t")[0], x.split("\t")[1]], results))) current_talkers = list(set(current_talkers)) talkers = talkers.union(current_talkers) for t in current_talkers: if t not in distance: distance[t] = i+1 all_results.append(results) talkers = list(talkers) talker_index = {name: index for (index, name) in enumerate(talkers)} all_results = itertools.chain(*all_results) edges = [] ident = 0; for result in all_results: resultArray = result.split("\t") source = resultArray[1] target = resultArray[0] ident += 1 rec = { "source": talker_index[source], "target": talker_index[target], "id": str(ident) } edges.append(rec) talkers = [{"email": n, "distance": distance[n]} for n in talkers] response["result"] = { "nodes": talkers, "edges": edges } return response
import datetime
import itertools

import tangelo
from bulbs.config import Config, DEBUG
from bulbs.rexster import Graph


def run(host, port, graph, start_time=None, days=1, center=None, degree=None):
    response = tangelo.empty_response()

    # Bail with an error if any of the required arguments is missing.
    missing = map(lambda x: x[0],
                  filter(lambda x: x[1] is None,
                         zip(["start_time", "days", "center", "degree"],
                             [start_time, days, center, degree])))
    if len(missing) > 0:
        response["error"] = "missing required arguments: %s" % (", ".join(missing))
        return response

    # Cast the arguments to the right types.
    #
    # The degree is the degree of separation between the center element and
    # the retrieved nodes - an integer.
    try:
        degree = int(degree)
    except ValueError:
        response["error"] = "argument 'degree' must be an integer"
        return response

    # The start time arrives as a YYYY-MM-DD string and is parsed into a
    # datetime object.
    try:
        start_time = datetime.datetime.strptime(start_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'start_time' must be in YYYY-MM-DD format"
        return response

    try:
        days = int(days)
    except ValueError:
        response["error"] = "argument 'days' must be an integer"
        return response

    dateList = [start_time + datetime.timedelta(days=x) for x in range(0, days)]

    config = Config("http://" + host + ":" + port + "/graphs/" + graph)
    config.set_logger(DEBUG)
    g = Graph(config)

    # Start a set of all interlocutors we're interested in - that includes
    # the center emailer.
    talkers = set([center])

    # Also start a table of distances from the center.
    distance = {center: 0}

    current_talkers = list(talkers)
    center_vertex = g.vertices.index.lookup(email=center).next()

    edgeId = 0
    edges = []
    for i in range(degree):
        new_talkers = []
        for talker_email in current_talkers:
            current_vertex = g.vertices.index.lookup(email=talker_email).next()
            for day in dateList:
                dayString = day.strftime('%m/%d/%Y')
                adjacent = current_vertex.bothV(dayString)
                if adjacent is not None:
                    adjacent_talkers = list(set(itertools.chain(*map(lambda x: [x.email], adjacent))))
                    if '' in adjacent_talkers:
                        adjacent_talkers.remove('')
                    for this_talker in adjacent_talkers:
                        newEdge = {"source": this_talker, "target": talker_email, "id": edgeId}
                        edges.append(newEdge)
                        edgeId += 1
                    new_talkers.extend(adjacent_talkers)

        current_talkers.extend(new_talkers)
        current_talkers = list(set(current_talkers))
        talkers = talkers.union(current_talkers)

        # Compute updates to everyone's distance from center.
        for t in current_talkers:
            if t not in distance:
                distance[t] = i + 1

    # Construct a canonical graph structure from the set of talkers and the
    # list of emails.
    #
    # Start with an index map of the talkers.
    talkers = list(talkers)
    talker_index = {name: index for (index, name) in enumerate(talkers)}

    for edge in edges:
        edge["source"] = talker_index[edge["source"]]
        edge["target"] = talker_index[edge["target"]]

    talkers = [{"email": n, "distance": distance[n]} for n in talkers]

    # Stuff the graph data into the response object, and return it.
    response["result"] = {"nodes": talkers, "edges": edges}
    return response
import bson.json_util
import pymongo
import tangelo

import materializedPaths  # project-local helper module


def run_on_collection(servername, dbname, data_coll, boundary_type, _id=None, lng=-1.0, lat=-1.0, radius=0.0, swlng=-1.0, swlat=-1.0, nelng=-1.0, nelat=-1.0, limit=1000, _filter='true'):
    earthRadius = 6378137  # meters

    # Construct an empty response object.
    response = tangelo.empty_response()

    try:
        limit = int(limit)
    except ValueError:
        response['error'] = "Argument (%s), value (%s) could not be converted to int" % ('limit', limit)
        return bson.json_util.dumps(response)

    if limit > 1000 or limit < 1:
        limit = 1000

    if boundary_type == 'circle':
        # Convert types to floats.
        try:
            lng = float(lng)
            lat = float(lat)
            radius = float(radius)
        except ValueError as e:
            response['error'] = e.message + " Argument could not be converted to float."
            return bson.json_util.dumps(response)

        # Check bounds.
        if lng > 180.0 or lng < -180.0:
            response['error'] = "Longitude out of bounds: %s" % (lng)
            return bson.json_util.dumps(response)
        if lat > 90.0 or lat < -90.0:
            response['error'] = "Latitude out of bounds: %s" % (lat)
            return bson.json_util.dumps(response)
        if radius < 0.0:
            response['error'] = "Radius cannot be negative: %s" % (radius)
            return bson.json_util.dumps(response)

        # Convert the radius from meters to radians (i.e., a fraction of the
        # earth's radius), as $centerSphere expects.
        radius = radius / earthRadius

        container = [[lng, lat], radius]
        query = {'loc': {'$within': {'$centerSphere': container}}}
    elif boundary_type == 'rect':
        try:
            swlng = float(swlng)
            swlat = float(swlat)
            nelng = float(nelng)
            nelat = float(nelat)
        except ValueError as e:
            response['error'] = e.message + " Argument could not be converted to float."
            return bson.json_util.dumps(response)

        container = [[swlng, swlat], [nelng, nelat]]
        query = {'loc': {'$within': {'$box': container}}}
    elif boundary_type == 'id':
        pass
    else:
        response['error'] = "Invalid geometry type: %s" % (boundary_type)
        return bson.json_util.dumps(response)

    # Create database connection.
    try:
        c = pymongo.Connection(servername)[dbname][data_coll]
    except pymongo.errors.AutoReconnect:
        response['error'] = "Could not connect to MongoDB server '%s'" % (servername)
        return bson.json_util.dumps(response)

    # Perform the query.
    if boundary_type == 'id':
        mpath = materializedPaths.materializedPaths(servername, dbname, data_coll, data_coll)
        if mpath.checkIfPresent():
            it = mpath.getDescendantsCriteria(_id, "loc")
        else:
            # Materialized paths not present - create them first.
            mpath.generateFromChildTree()
            it = mpath.getDescendantsCriteria(_id, "loc")
    else:
        it = c.find(spec=query, limit=limit)

    # Create a list of the results.
    results = list()

    # If we want to filter to only those locations of items in our range...
    if _filter == 'true':
        count = 0
        try:
            for item in it:
                # For each location of the item, if it's in bounds, create a
                # new marker object and add it to the results; otherwise,
                # skip the point.
                for location in item['loc']:
                    if boundary_type == 'id' or isInBounds(float(location[0]), float(location[1]), container, boundary_type, response):
                        marker = dict()
                        marker['name'] = item['taxonomies'][0]['scientific_name']
                        marker['ID'] = item['_id']
                        marker['lng'] = location[0]
                        marker['lat'] = location[1]
                        results.append(marker)
                        count += 1
        except ValueError:
            return bson.json_util.dumps(response)
        except KeyError:
            return bson.json_util.dumps(response)
    # Otherwise, return documents containing all locations of items in range.
    else:
        results = [x for x in it]

    # Create an object to structure the results.
    retobj = dict()
    retobj['count'] = count if _filter == 'true' else it.count()
    retobj['data'] = results

    # Pack the results in the response object, and return it.
    response['result'] = retobj
    return bson.json_util.dumps(response)
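# isInBounds() is not defined in this module.  A minimal sketch consistent
# with how it is called above, where container is [[lng, lat], radius] (the
# radius already in radians) for 'circle' and [[swlng, swlat], [nelng,
# nelat]] for 'rect'; the real helper may differ, e.g. in how it reports
# errors through the response object:
import math

def isInBounds(lng, lat, container, boundary_type, response):
    if boundary_type == 'rect':
        (swlng, swlat), (nelng, nelat) = container
        return swlng <= lng <= nelng and swlat <= lat <= nelat
    # 'circle': compare the central angle between the point and the circle's
    # center (spherical law of cosines) against the radius in radians.
    (clng, clat), radius = container
    lng, lat, clng, clat = [math.radians(v) for v in (lng, lat, clng, clat)]
    cosc = (math.sin(lat) * math.sin(clat) +
            math.cos(lat) * math.cos(clat) * math.cos(lng - clng))
    # Clamp against floating-point drift before taking the arccosine.
    return math.acos(min(1.0, max(-1.0, cosc))) <= radius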