def search_email_by_community(*args, **param_args):
    """Return all email belonging to one community, optionally narrowed by
    email-address list and free-text query, within the parsed date bounds.

    Path arg 0 is the community name; 400 if it or data_set_id is missing.
    """
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_community(args: %s kwargs: %s)" %
                (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    community = nth(args, 0, '')
    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not community:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing sender")

    email_addrs = parseParam_email_addr(**param_args)
    qs = parseParamTextQuery(**param_args)
    return es_get_all_email_by_community(data_set_id, community, email_addrs, qs,
                                         start_datetime, end_datetime, size)
def post(*arg, **kwarg):
    '''
    This is the main listener for github webhooks.

    Verifies the X-Hub-Signature HMAC against the shared secret before
    accepting the payload; only the 'push' event is handled (queued via
    add_push).  Returns 'OK' on success, otherwise an HTTP error code.
    '''
    # retrieve the headers from the request
    # headers = tangelo.request_headers()  # <- not merged
    headers = cherrypy.request.headers

    # get the raw request body (needed verbatim for signature verification)
    body = tangelo.request_body()
    s = body.read()

    # make sure this is a valid request coming from github
    computed_hash = hmac.new(str(_secret_key), s, hashlib.sha1).hexdigest()
    received_hash = headers.get('X-Hub-Signature', 'sha1=')[5:]
    # constant-time comparison to avoid timing attacks
    if not hmac.compare_digest(computed_hash, received_hash):
        return tangelo.HTTPStatusCode(403, "Invalid signature")

    try:
        obj = json.loads(s)
    except ValueError:  # narrowed from bare except: only JSON decode errors expected here
        return tangelo.HTTPStatusCode(400, "Could not load json object.")

    if headers['X-Github-Event'] == 'push':
        # add a new item to the test queue
        add_push(obj)
    else:
        return tangelo.HTTPStatusCode(400, "Unhandled event")

    return 'OK'
def search_email_by_topic(*args, **param_args):
    """Return all email matching a topic, optionally narrowed by address
    list and free-text query, within the parsed date bounds.

    Requires data_set_id and a topic_index query param; 400 otherwise.
    """
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_topic(args: %s kwargs: %s)" %
                (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not param_args.get("topic_index"):
        return tangelo.HTTPStatusCode(400, "invalid service call - missing topic_index")

    return es_get_all_email_by_topic(data_set_id,
                                     topic=parseParamTopic(**param_args),
                                     email_addrs=parseParam_email_addr(**param_args),
                                     qs=parseParamTextQuery(**param_args),
                                     start_datetime=start_datetime,
                                     end_datetime=end_datetime,
                                     size=size)
def seedSearch(*args):
    """Seed the search daemon with a starting email and return the id of the
    first email it yields.

    Fix: the original called nth(args, 0) with no default, so a request with
    no path argument made urllib.unquote(None) raise instead of returning the
    intended 400.
    """
    email_id = urllib.unquote(nth(args, 0, ''))
    if not email_id:
        return tangelo.HTTPStatusCode(400, "missing argument email_id")

    # Point the daemon at the line for this email, then ask for the next one.
    line_num = findLineNum(email_id)
    seed_url = "{0}/firstemail/{1}".format(daemon_url, line_num)
    request(seed_url)

    next_url = "{0}/getNextEmail".format(daemon_url)
    start_point = request(next_url)
    if not start_point:
        return tangelo.HTTPStatusCode(400, "failed to set starting email")
    return findEmailId(start_point)
def advanced_query(args, kwargs):
    """Translate a string query ('q' param) to a mongo query and execute it.

    Queries containing '~slr~' go through the helium parser; rep 'cjson'
    additionally restricts results to molecules with both a 3d structure
    and a diagram.
    """
    if len(args) != 1 or 'q' not in kwargs:
        raise cherrypy.HTTPError(400)
    query_string = kwargs['q']

    try:
        if '~slr~' in query_string:
            mongo_query = helium_query_to_mongo_query(
                query.to_helium_query(query_string))
        else:
            mongo_query = query.to_mongo_query(query_string)
    except query.InvalidQuery:
        return tangelo.HTTPStatusCode(400, 'Invalid query')

    rep = args[0]
    proj = generate_mongo_projection(rep)
    limit = getlimit(kwargs)

    if rep == 'cjson':
        # Only molecules with both a 3d structure and a diagram are complete.
        complete_data = {'$and': [{'3dStructure': {'$exists': 1}},
                                  {'diagram': {'$exists': 1}}]}
        mongo_query = {'$and': [complete_data, mongo_query]}

    return execute_query(mongo_query, proj, rep, limit)
def get(*pargs, **kwargs):
    """Dispatch GET requests by path-segment count: three segments mean a
    basic query, one segment means an advanced query; anything else is 400."""
    handlers = {3: basic_query, 1: advanced_query}
    handler = handlers.get(len(pargs))
    if handler is None:
        return tangelo.HTTPStatusCode(400, "Invalid request")
    return handler(pargs, kwargs)
def get_top_email_by_text_query(data_set_id, qs, start_datetime, end_datetime, size):
    """Build the email graph for a text query and attach the matching
    attachment hits under graph["attachments"]."""
    if not qs:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing search term(s)")

    date_bounds = (start_datetime, end_datetime)

    # Emails matching the query -> graph of senders/recipients.
    query = _build_email_query(qs=qs, date_bounds=date_bounds)
    tangelo.log("es_search.get_graph_for_text_query(query: %s)" % (query))
    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for community
    query = _build_email_query(qs=qs, date_bounds=date_bounds, attachments_only=True)
    tangelo.log("es_search.get_top_email_by_text_query(attachment-query: %s)" % (query))
    graph["attachments"] = _query_email_attachments(data_set_id, size, query)
    return graph
def search_email_by_conversation(*path_args, **param_args):
    """Return the conversation thread between a sender/recipient pair,
    centered on a specific document (uid + datetime are mandatory)."""
    tangelo.content_type("application/json")
    tangelo.log("search.search_email_by_conversation(path_args[%s] %s)" %
                (len(path_args), str(path_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    # parse the sender address and the recipient address
    sender_list = parseParamEmailSender(**param_args)
    cherrypy.log("\tsender_list: %s)" % str(sender_list))
    recipient_list = parseParamEmailRecipient(**param_args)
    cherrypy.log("\trecipient_list: %s)" % str(recipient_list))

    document_uid = parseParamDocumentUID(**param_args)
    cherrypy.log("\tdocument_uid: %s)" % str(document_uid))
    document_datetime = parseParamDocumentDatetime(**param_args)
    cherrypy.log("\tdocument_datetime: %s)" % str(document_datetime))

    if not document_datetime:
        return tangelo.HTTPStatusCode(
            400,
            "invalid service call - missing mandatory param 'document_datetime'")

    sender_address, recipient_address = parseParamAllSenderAllRecipient(**param_args)
    # Half the budget each side of the anchor document.
    return es_get_conversation(data_set_id, sender_address, recipient_address,
                               start_datetime, end_datetime, size / 2,
                               document_uid, document_datetime)
def getEmail(*args):
    """Look up one email by id and return it with its extracted entities."""
    email = urllib.unquote(nth(args, 0, ''))
    if not email:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    tangelo.content_type("application/json")
    return {
        "email": queryEmail(email),
        "entities": queryEntity(email),
    }
def getAllAttachmentBySender(*args, **kwargs):
    """Return every attachment sent by one sender within the date bounds."""
    tangelo.log("getAttachmentsSender(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    sender = nth(args, 0, '')

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not sender:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing sender")

    tangelo.content_type("application/json")
    return get_attachments_by_sender(data_set_id, sender,
                                     start_datetime, end_datetime, size)
def email_scores(*args):
    """Stub scoring endpoint: echoes the email id and category with an
    empty score list."""
    email_id = unquote(nth(args, 0, ''))
    category = nth(args, 1, 'all')
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email")
    return {"scores": [], "email": email_id, "category": category}
def download(data):
    """Kick off a background thread that logs into a mail account and
    downloads up to `limit` emails into webroot/emails/<user>.

    Returns immediately with {"id": user}; progress and errors are appended
    to the per-user logfile.

    Fixes over the original:
    - `session` was referenced in `finally` even when login itself failed
      (NameError); it is now initialized to None and closed only if set.
    - the outer bare `except:` logged `sys.exc_info()[0]` (an exception
      *class*) via cherrypy.log and then called `.replace` on it, which
      itself raised; it now formats a proper string.
    - dead duplicated error-logging lines after the inner handler removed.
    """
    user = data.get("user")
    if not user:
        return tangelo.HTTPStatusCode(400, "invalid service call missing user")
    passwd = data.get("pass")
    limit = data.get("limit", "2000")
    logfile = "{}/{}.log".format(work_dir, user)
    spit(logfile, "[Start] {}\n".format(user), True)
    cherrypy.log("logfile: {}".format(logfile))

    def download_thread():
        # Runs in a worker thread; all failures are logged, never raised.
        session = None
        try:
            cherrypy.log("Thread Start User: {}".format(user))
            try:
                session = newman_email.login(user, passwd, logfile)
                fldr = "{}/emails/{}".format(webroot, user)
                cherrypy.log("Login User: {}".format(user))
                # Start from a clean download directory each run.
                if os.path.exists(fldr):
                    rmrf(fldr)
                mkdir(fldr)
                spit("{}/output.csv".format(fldr), newman_email.headerrow() + "\n")
                mkdir(fldr + "/emails")
                newman_email.download(session, user, fldr, int(limit), logfile)
                spit(logfile, "[Completed Download] {}\n".format(user))
            except Exception as ex:
                spit(logfile, "[Error] {}\n".format(ex))
                cherrypy.log("[Error] {}\n".format(ex))
            finally:
                if session is not None:
                    newman_email.close_session(session)
        except Exception:
            # Last-ditch handler: format the active exception as a string
            # (the original passed the exception class object to the logger).
            exc_type, exc_value = sys.exc_info()[:2]
            error_info = "{}: {}".format(exc_type.__name__, exc_value)
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))

    thr = threading.Thread(target=download_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {"id": user}
def delete(resource, projname, datatype=None, dataset=None):
    """Delete a whole project, or a single dataset within it.

    'datatype' and 'dataset' must be supplied together (delete one dataset)
    or omitted together (delete the whole project).

    Fix: the bad-resource message contained a literal, unfilled '%s'; it is
    now interpolated with the offending resource name.
    """
    if resource != "project":
        return tangelo.HTTPStatusCode(
            400, "Bad resource type '%s' - allowed types are: project" % resource)

    # (This is expressing xor)
    if (datatype is None) != (dataset is None):
        return tangelo.HTTPStatusCode(
            400,
            "Bad arguments - 'datatype' and 'dataset' must both be specified if either one is specified"
        )

    if datatype is None:
        api.deleteProjectNamed(projname)
    else:
        api.deleteDataset(projname, datatype, dataset)

    return "OK"
def setSelectedDataSet(*args):
    """Select the active data set: warm the email-address cache for it,
    record it as current, and return its index record.

    Fix: the original assigned `_current_data_set_selected` as a function
    local, so the module-level selection was never updated; declared global
    here.  (Assumes `_current_data_set_selected` is a module-level variable
    — consistent with its leading-underscore module naming; confirm against
    the rest of the file.)
    """
    global _current_data_set_selected
    tangelo.content_type("application/json")
    data_set_id = urllib.unquote(nth(args, 0, ''))
    if not data_set_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing data_set_id")
    # Called for its side effect; the original bound the result to an unused
    # local.
    initialize_email_addr_cache(data_set_id)
    _current_data_set_selected = data_set_id
    return _index_record(data_set_id)
def getAttachCount(*args, **kwargs):
    """Return weekly attachment-activity histograms, either for the whole
    data set (no addresses given) or one entry per supplied address."""
    tangelo.content_type("application/json")
    tangelo.log("getAttachCount(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    attach_type = urllib.unquote(nth(args, 0, ''))
    if not attach_type:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing attach_type")
    attach_type = 'all'  # hack for now

    def entry(account_id, activities):
        # One record in the account_activity_list payload.
        return {
            "account_id": account_id,
            "data_set_id": data_set_id,
            "account_start_datetime": start_datetime,
            "account_end_datetime": end_datetime,
            "activities": activities,
        }

    email_address_list = parseParamEmailAddress(**kwargs)
    if not email_address_list:
        # No addresses supplied: one aggregate entry for the whole data set.
        activity = get_total_attachment_activity(
            data_set_id, data_set_id,
            query_function=attachment_histogram,
            sender_email_addr="",
            start=start_datetime,
            end=end_datetime,
            interval="week")
        entries = [entry(data_set_id, activity)]
    else:
        entries = [
            entry(account_id,
                  get_emailer_attachment_activity(
                      data_set_id, account_id,
                      (start_datetime, end_datetime), interval="week"))
            for account_id in email_address_list
        ]
    return {"account_activity_list": entries}
def getRollup(*args):
    """Resolve an entity to its rollup id via the database."""
    entity = urllib.unquote(nth(args, 0, ''))
    if not entity:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_entity_rollup_id, entity) as qry:
            rollup_id = qry.cursor().fetchone()
    tangelo.content_type("application/json")
    return {"rollupId": rollup_id}
def post(*pargs, **kwargs):
    """Queue a celery task named by the dotted path segments, passing the
    raw request body, and return the task id."""
    payload = cherrypy.request.body.read()
    if not pargs:
        return tangelo.HTTPStatusCode(400, "No task module specified")
    # Drop empty segments before joining into the task module path.
    task_module = '.'.join(filter(None, pargs))
    async_result = celery.send_task('%s.run' % task_module, [payload])
    return {'id': async_result.task_id}
def get_attachment_by_id(*args, **kwargs):
    """Stream one attachment back to the client — inline when the mime type
    can be guessed from the filename, otherwise as a forced download."""
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    attachment_id = nth(args, 0, '')
    if not attachment_id:
        attachment_id = parseParamAttachmentGUID(**kwargs)
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" %
                 (data_set_id, attachment_id))

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")
    if not attachment_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    attachment = es().get(index=data_set_id, doc_type="attachments", id=attachment_id)
    if not attachment:
        return tangelo.HTTPStatusCode(
            400, "no attachments found for (index=%s, attachment_id=%s)" %
            (data_set_id, attachment_id))

    attachment = attachment["_source"]
    ext = attachment["extension"]  # unused, but the lookup is kept (missing field should still fail)
    filename = attachment["filename"]
    mime_type = mimetypes.guess_type(filename)[0]
    if mime_type:
        tangelo.content_type(mime_type)
        header("Content-Disposition", 'inline; filename="{}"'.format(filename))
    else:
        tangelo.content_type("application/x-download")
        header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    decoded = base64.b64decode(attachment["contents64"])
    # dump(decoded, filename)
    as_str = str(decoded)
    tangelo.log(str(len(as_str)), "Uploading Attachment - length = ")
    return as_str
def getAccountActivity(*args, **kwargs):
    """Return weekly email-activity histograms, either one aggregate entry
    for the data set or one entry per supplied email address."""
    tangelo.content_type("application/json")
    tangelo.log("getAccountActivity(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    account_type = urllib.unquote(nth(args, 0, ''))
    if not account_type:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing account_type")

    def entry(account_id, activities):
        # One record in the account_activity_list payload.
        return {
            "account_id": account_id,
            "data_set_id": data_set_id,
            "account_start_datetime": start_datetime,
            "account_end_datetime": end_datetime,
            "activities": activities,
        }

    date_bounds = (start_datetime, end_datetime)
    email_address_list = parseParamEmailAddress(**kwargs)
    if not email_address_list:
        entries = [entry(data_set_id,
                         get_email_activity(data_set_id, data_set_id,
                                            date_bounds=date_bounds,
                                            interval="week"))]
    else:
        entries = [
            entry(account_id,
                  get_email_activity(data_set_id, data_set_id, account_id,
                                     date_bounds=date_bounds, interval="week"))
            for account_id in email_address_list
        ]
    return {"account_activity_list": entries}
def getTopRollup(*args):
    """Return the top-N rollup entities, N taken from path arg 0.

    Fix: `amt` came straight from the URL and was string-formatted into the
    SQL LIMIT clause — an injection vector.  It is now validated as an
    integer before being interpolated.
    """
    amt = urllib.unquote(nth(args, 0, ''))
    if not amt:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    try:
        amt = int(amt)
    except ValueError:
        return tangelo.HTTPStatusCode(400, "invalid service call - limit must be an integer")

    stmt = stmt_top_rollup_entities + ("limit {0}".format(amt))

    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [r for r in qry.cursor()]

    rtn = rtn if rtn else []
    tangelo.content_type("application/json")
    return {"entities": rtn}
def setStarred(*args, **kwargs):
    """Set or clear the starred flag on one email.

    Fix: `args[-1]` raised IndexError on an empty args tuple before the
    missing-email_id guard could run; guarded now so the 400 is returned
    instead.
    """
    tangelo.log("setStarred(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    email_id = args[-1] if args else ''
    if not email_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing email_id")

    starred = parseParamStarred(**kwargs)
    return set_starred(data_set_id, [email_id], starred)
def getEmail(*args, **kwargs):
    """Fetch one email by id, highlighting against the text query if given.

    Fix: `args[-1]` raised IndexError on an empty args tuple before the
    missing-email_id guard could run; guarded now so the 400 is returned
    instead.
    """
    tangelo.log("getEmail(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    qs = parseParamTextQuery(**kwargs)

    email_id = args[-1] if args else ''
    if not email_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing email_id")

    return get_email(data_set_id, email_id, qs)
def get(job_id, operation, **kwargs):
    """Report on an async celery job: 'status' returns its state (plus a
    message/meta on failure/progress), 'result' returns its result."""
    job = AsyncResult(job_id, backend=celery.backend)
    if operation == 'status':
        response = {'status': job.state}
        if job.state == states.FAILURE:
            response['message'] = str(job.result)
        elif job.state == 'PROGRESS':
            response['meta'] = str(job.result)
        return response
    if operation == 'result':
        return {'result': job.result}
    return tangelo.HTTPStatusCode(400, "Invalid request")
def email_scores(*args):
    """Return per-topic scores for one email within a category, ordered by
    topic index."""
    email_id = unquote(nth(args, 0, ''))
    category = nth(args, 1, 'all')
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email")

    stmt = (" select score from xref_email_topic_score "
            " where category_id = %s and email_id = %s "
            " order by idx ")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt, category, email_id) as qry:
            scores = [head(r) for r in qry.cursor()]

    tangelo.content_type("application/json")
    return {"scores": scores, "email": email_id, "category": category}
def getAttachmentsSender(*args):
    """List emails with attachments sent from the given address, each row's
    values stringified (utf-8 for text, str() otherwise)."""
    sender = urllib.unquote(nth(args, 0, ''))
    if not sender:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    tangelo.content_type("application/json")

    stmt = (
        " select id, dir, datetime, from_addr, tos, ccs, bccs, subject, attach, bodysize "
        " from email "
        " where from_addr = %s and attach != '' ")

    def as_text(val):
        # Text columns are utf-8 encoded; everything else falls back to str().
        return val.encode('utf-8') if isinstance(val, basestring) else str(val)

    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt, sender) as qry:
            rows = [[as_text(val) for val in row] for row in qry.cursor()]
    return {"sender": sender, "email_attachments": rows}
def execute_query(find, proj, rep, limit):
    """Run a mongo find and convert it to a response, retrying the
    connection up to 10 times on AutoReconnect before giving up with 500."""
    attempts_remaining = 10
    while attempts_remaining > 0:
        try:
            db = connect(config['server'], config['db'])
            tangelo.log(str(find))
            cursor = db['molecules'].find(find, proj)
            cursor.limit(limit)
            return result_to_response(rep, cursor)
        except pymongo.errors.AutoReconnect:
            attempts_remaining -= 1
    return tangelo.HTTPStatusCode(500, 'Unable to connect to mongochem')
def unknown(*args):
    """Fallback handler: any unmatched positional route is a client error."""
    status = tangelo.HTTPStatusCode(400, "invalid service call")
    return status
def unknown(**kwargs):
    """Fallback handler: any unmatched keyword route is reported as 404."""
    status = tangelo.HTTPStatusCode(404, "unknown service call")
    return status
def invoke_service(self, module, *pargs, **kwargs):
    """Load (or reload) the Python service module at `module`, invoke it with
    the given args, and convert its result into an HTTP response.

    Flow: (1) import/reimport the module if it or its JSON config changed
    since last load; (2) call its run() or the RESTful method matching the
    HTTP verb; (3) serialize the result (HTTPStatusCode -> HTTPError,
    generator -> stream, dict -> JSON, string -> as-is).
    """
    # TODO(choudhury): This method should attempt to load the named module,
    # then invoke it with the given arguments. However, if the named
    # module is "config" or something similar, the method should instead
    # launch a special "config" app, which lists the available app modules,
    # along with docstrings or similar. It should also allow the user to
    # add/delete search paths for other modules.
    tangelo.content_type("text/plain")

    # Save the system path (be sure to *make a copy* using the list()
    # function) - it will be modified before invoking the service, and must
    # be restored afterwards.
    origpath = list(sys.path)

    # By default, the result should be an object with error message in
    # if something goes wrong; if nothing goes wrong this will be replaced
    # with some other object.
    result = {}

    # Store the modpath in the thread-local storage (tangelo.paths() makes
    # use of this per-thread data, so this is the way to get the data
    # across the "module boundary" properly).
    modpath = os.path.dirname(module)
    cherrypy.thread_data.modulepath = modpath
    cherrypy.thread_data.modulename = module

    # Extend the system path with the module's home path.
    sys.path.insert(0, modpath)

    # Import the module if not already imported previously (or if the
    # module to import, or its configuration file, has been updated since
    # the last import).
    try:
        stamp = self.modules.get(module)
        mtime = os.path.getmtime(module)

        # "foo.py" -> "foo.json": the module's optional config file.
        config_file = module[:-2] + "json"
        config_mtime = None
        if os.path.exists(config_file):
            config_mtime = os.path.getmtime(config_file)

        # Reload when never loaded, or when either the source or the config
        # is newer than the cached load stamp.
        if (stamp is None or mtime > stamp["mtime"] or
                (config_mtime is not None and config_mtime > stamp["mtime"])):
            if stamp is None:
                tangelo.log("loading new module: " + module)
            else:
                tangelo.log("reloading module: " + module)

            # Load any configuration the module might carry with it.
            if config_mtime is not None:
                try:
                    with open(config_file) as f:
                        # Config may contain comments; minify strips them
                        # before JSON parsing.
                        config = json.loads(json_minify(f.read()))
                        if type(config) != dict:
                            msg = ("Service module configuration file "
                                   + "does not contain a key-value store "
                                   + "(i.e., a JSON Object)")
                            tangelo.log(msg)
                            raise TypeError(msg)
                except IOError:
                    tangelo.log("Could not open config file %s" %
                                (config_file))
                    raise
                except ValueError as e:
                    tangelo.log("Error reading config file %s: %s" %
                                (config_file, e))
                    raise
            else:
                config = {}

            cherrypy.config["module-config"][module] = config

            # Remove .py to get the module name
            name = module[:-3]

            # Load the module.
            service = imp.load_source(name, module)
            # NOTE(review): when there is no config file, config_mtime is
            # None and max(mtime, None) relies on Python 2 mixed-type
            # comparison (None compares less than numbers).
            self.modules[module] = {"module": service,
                                    "mtime": max(mtime, config_mtime)}
        else:
            service = stamp["module"]
    except:
        bt = traceback.format_exc()

        tangelo.log("Error importing module %s" % (tangelo.request_path()),
                    "SERVICE")
        tangelo.log(bt, "SERVICE")

        result = tangelo.HTTPStatusCode("501 Error in Python Service",
                                        Tangelo.literal + "There was an error while " +
                                        "trying to import module " +
                                        "%s:<br><pre>%s</pre>" %
                                        (tangelo.request_path(), bt))
    else:
        # Try to run the service - either it's in a function called
        # "run()", or else it's in a REST API consisting of at least one of
        # "get()", "put()", "post()", or "delete()".
        #
        # Collect the result in a variable - depending on its type, it will
        # be transformed in some way below (by default, to JSON, but may
        # also raise a cherrypy exception, log itself in a streaming table,
        # etc.).
        try:
            if 'run' in dir(service):
                # Call the module's run() method, passing it the positional
                # and keyword args that came into this method.
                result = service.run(*pargs, **kwargs)
            else:
                # Reaching here means it's a REST API. Check for the
                # requested method, ensure that it was marked as being part
                # of the API, and call it; or give a 405 error.
                method = cherrypy.request.method
                restfunc = service.__dict__[method.lower()]
                if (restfunc is not None and hasattr(restfunc, "restful") and
                        restfunc.restful):
                    result = restfunc(*pargs, **kwargs)
                else:
                    result = tangelo.HTTPStatusCode(405,
                                                    "Method not allowed")
        except Exception as e:
            bt = traceback.format_exc()

            tangelo.log("Caught exception while executing service %s" %
                        (tangelo.request_path()), "SERVICE")
            tangelo.log(bt, "SERVICE")

            result = tangelo.HTTPStatusCode("501 Error in Python Service",
                                            Tangelo.literal + "There was an error " +
                                            "executing service " +
                                            "%s:<br><pre>%s</pre>" %
                                            (tangelo.request_path(), bt))

    # Restore the path to what it was originally.
    sys.path = origpath

    # Check the type of the result to decide what result to finally return:
    #
    # 1. If it is an HTTPStatusCode object, raise a cherrypy HTTPError
    # exception, which will cause the browser to do the right thing.
    #
    # 2. TODO: If it's a Python generator object, log it with the Tangelo
    # streaming API.
    #
    # 3. If it's a Python dictionary, convert it to JSON.
    #
    # 4. If it's a string, don't do anything to it.
    #
    # This allows the services to return a Python object if they wish, or
    # to perform custom serialization (such as for MongoDB results, etc.).
    if isinstance(result, tangelo.HTTPStatusCode):
        if result.msg:
            raise cherrypy.HTTPError(result.code, result.msg)
        else:
            raise cherrypy.HTTPError(result.code)
    elif "next" in dir(result):
        # Duck-typed iterator/generator check (Python 2 "next" protocol).
        if self.stream:
            return self.stream.add(result)
        else:
            return json.dumps({"error": "Streaming is not supported " +
                               "in this instance of Tangelo"})
    elif not isinstance(result, types.StringTypes):
        try:
            result = json.dumps(result)
        except TypeError as e:
            msg = Tangelo.literal + "<p>A JSON type error occurred in service " + tangelo.request_path() + ":</p>"
            msg += "<p><pre>" + cgi.escape(e.message) + "</pre></p>"
            raise cherrypy.HTTPError("501 Error in Python Service", msg)

    return result
def export_emails_archive(data_set_id,
                          email_ids=["f9c9c59a-7fe8-11e5-bb05-08002705cb99"]):
    """Export the given emails (and their attachments) as a gzipped tar
    stream: one directory per email id containing <id>.json plus each
    decoded attachment file.

    NOTE(review): mutable default argument for email_ids; it is only
    iterated here so no state leaks between calls, but a None default
    would be safer.
    """
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" %
                 (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")

    # if not email:
    #     return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused
    # by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143',
    # port=9200): Read timed out. (read timeout=10))
    # Raised timeout (60s) because bulk mget of large emails can exceed the
    # 10s default.
    es = Elasticsearch([{
        "host": "10.1.70.143",
        "port": 9200
    }], request_timeout=60)

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es.mget(index=data_set_id,
                     doc_type="emails",
                     body={"docs": [{
                         "_id": id
                     } for id in email_ids]})

    # TODO filename
    filename = "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    # The archive is built entirely in memory and returned as one blob.
    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:
        email = email_source["_source"]

        # Directory entry named after the email id.
        tarinfo_parent = tarfile.TarInfo(name=email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        # The email document itself, serialized as JSON.
        tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".json")
        # TODO -- email transformation
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)
        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es.mget(index=data_set_id,
                                  doc_type="attachments",
                                  body={
                                      "docs": [{
                                          "_id": attch["guid"]
                                      } for attch in email["attachments"]]
                                  })
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                # Attachment bodies are stored base64-encoded in the index.
                attch_data = str(base64.b64decode(attachment["contents64"]))
                tarinfo_attch = tarfile.TarInfo(email["id"] + "/" + filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()
    return string_buffer.getvalue()