def search_email_by_conversation(*path_args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search.search_email_by_conversation(path_args[%s] %s)" % (len(path_args), str(path_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    # parse the sender address and the recipient address
    sender_list = parseParamEmailSender(**param_args)
    cherrypy.log("\tsender_list: %s" % str(sender_list))
    recipient_list = parseParamEmailRecipient(**param_args)
    cherrypy.log("\trecipient_list: %s" % str(recipient_list))

    document_uid = parseParamDocumentUID(**param_args)
    cherrypy.log("\tdocument_uid: %s" % str(document_uid))

    document_datetime = parseParamDocumentDatetime(**param_args)
    cherrypy.log("\tdocument_datetime: %s" % str(document_datetime))
    if not document_datetime:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing mandatory param 'document_datetime'")

    sender_address, recipient_address = parseParamAllSenderAllRecipient(**param_args)

    return es_get_conversation(data_set_id, sender_address, recipient_address,
                               start_datetime, end_datetime, size / 2,
                               document_uid, document_datetime)
def run(usertoken):
    # Create an empty response object.
    response = {}
    collectionNames = []

    # build a custom girder header for authenticated access
    girderheader = {'Girder-Token': usertoken}
    print 'girderheader:', girderheader

    # Look through the collections in girder and return a list of the
    # collections in this local Arbor instance.
    girderlocation = 'http://localhost:9000'
    resp = requests.get(girderlocation + '/api/v1/collection', headers=girderheader)

    # nothing particularly interesting in the response metadata
    #print resp.headers
    #print requests.utils.dict_from_cookiejar(resp.cookies)

    for entry in resp.json():
        collname = entry['name']
        print "found collection:", collname
        collectionNames.append(collname)

    # Pack the results into the response object, and return it.
    response['result'] = collectionNames
    tangelo.log(str(response))
    return json.dumps(response)
def queryEmail(email):
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            rtn = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return rtn if rtn else []
def queryEntity(email):
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_entities_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            rtn = [r for r in qry.cursor()]
            return rtn if rtn else []
def parseParamAllSenderAllRecipient(**kwargs):
    tangelo.log("parseParamAllSenderAllRecipient(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    sender = kwargs.get('sender', '').split(",")[0]
    recipient = kwargs.get('recipient', '').split(",")
    # compare with equality, not identity; 'is not' against a string literal is unreliable
    recipient = [x for x in recipient if x != '' and x is not None]
    return sender, recipient
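# A small usage sketch of the parser above (illustrative inputs, not from a
# live request): only the first 'sender' entry is kept, while every non-empty
# 'recipient' entry is returned.
#
#   sender, recipients = parseParamAllSenderAllRecipient(
#       sender="alice@example.com,bob@example.com",
#       recipient="carol@example.com,dave@example.com")
#   # sender == "alice@example.com"
#   # recipients == ["carol@example.com", "dave@example.com"]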
def module_reload_changed(key):
    """
    Reload a module if it has changed since we last imported it.  This is
    necessary if module a imports script b, script b is changed, and then
    module c asks to import script b.

    :param key: our key used in the WatchList.
    :returns: True if reloaded.
    """
    imp.acquire_lock()
    try:
        modkey = module_sys_modules_key(key)
        if not modkey:
            return False
        found = None
        for second in WatchList:
            secmodkey = module_sys_modules_key(second)
            if secmodkey and sys.modules[modkey] == sys.modules[secmodkey]:
                found = second
                foundmodkey = secmodkey
                break
        if not found:
            return False
        filemtime = module_getmtime(WatchList[found]["file"])
        filemtime = latest_submodule_time(found, filemtime)
        if filemtime > WatchList[found]["time"]:
            tangelo.log("Reloaded %s" % found)
            reload_including_local(sys.modules[foundmodkey])
            for second in WatchList:
                if WatchList[second]["file"] == WatchList[found]["file"]:
                    WatchList[second]["time"] = filemtime
    finally:
        imp.release_lock()
    return True
def scrape_page(team_id, domain_id, trail_id, url, content, user_email):
    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()

    # blacklist of pages to not extract data from
    blacklist = config.get_extraction_blacklist()
    if urlparse(url).netloc not in blacklist:
        (features, errors) = extractors.extractAll(content)
        for error in errors:
            tangelo.log("FEATURE EXTRACTION ERROR: " + error)
        for type, values in features.iteritems():
            connector.insert_entities(url, type, values)
            if len(values) > 0:
                features_in_domain = connector.get_domain_entity_matches(domain_id, type, values)
                if len(features_in_domain) > 0:
                    tangelo.log("INSERTING DOMAIN ENTITIES")
                    tangelo.log(type)
                    connector.insert_domain_entities(str(domain_id), url, type, features_in_domain)
        # we also don't want to export blacklisted pages
        tangelo.log("Calling export")
        export_to_services(domain_id, team_id, trail_id, url, content, user_email, features)
    else:
        tangelo.log("Url: %s IN blacklist" % url)

    id = db.addBrowsePathData(team_id, domain_id, trail_id, url, user_email)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=id, count=count)
    return json.dumps(result)
def run(host, database):
    # Create an empty response object.
    response = {}
    collectionNames = ['select a dataset']

    # Look through the collections in the database and return the names of all
    # collections that match the naming profile, i.e. whose name begins with
    # "seeds_".
    client = MongoClient(host, 27017)
    db = client[database]

    # get a list of all collections (excluding system collections)
    collection_list = db.collection_names(False)
    for coll in collection_list:
        # include only the seeds collections
        if coll[:6] == 'seeds_':
            #print "found seeds:", coll
            collectionNames.append(coll)
    client.close()

    # Pack the results into the response object, and return it.
    response['result'] = collectionNames
    tangelo.log(str(response))
    return json.dumps(response)
def __init__(self, bus, cfg_file=None, logfile=None, pidfile=None,
             webroot=None, hostname=None, port=None):
    SimplePlugin.__init__(self, bus)

    self.finalized = False
    self.pid = os.getpid()
    self.filename = StatusFile.status_filename(self.pid)

    # Validate the arguments before stringifying them; otherwise str(None)
    # would mask a missing value and the None check below could never fire.
    status = dict(zip(StatusFile.fields, map(eval, StatusFile.fields)))
    for k, v in status.iteritems():
        if v is None:
            raise TypeError("argument '%s' cannot be None" % (k))

    self.status = {k: str(v) for k, v in status.iteritems()}
    self.status["pid"] = str(self.pid)
def scrape_page(html, url, userId, userName, trail, domain, org):
    #tangelo.log('USER NAME: ' + userName)
    domain = domain.encode('utf-8')
    org = org.encode('utf-8')
    html = urllib.unquote(html).encode('utf-8')
    url = url.encode('utf-8')

    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(html)
    tangelo.log(features)
    for type, values in features.iteritems():
        connector.insert_entities(url, type, values)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(domain, type, values)
            if len(features_in_domain) > 0:
                connector.insert_domain_entities(domain, url, type, features_in_domain)
                #tangelo.log("EXTRACTED " + str(len(features_in_domain)) + " DOMAIN FEATURES")
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)

    id = db.addBrowsePathData(org, url, userId, userName, trail, domain=domain)
    # get the number of times this url appears in the database
    count = db.getUrlCount(org, url, domain=domain)
    result = dict(id=id, count=count)
    #tangelo.log("POSTED url:" + url + " return: " + str(result))
    return json.dumps(result)
def search_email_by_community(*args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_community(args: %s kwargs: %s)" % (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    community = nth(args, 0, '')
    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not community:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing community")

    email_addrs = parseParam_email_addr(**param_args)
    qs = parseParamTextQuery(**param_args)

    return es_get_all_email_by_community(data_set_id, community, email_addrs, qs,
                                         start_datetime, end_datetime, size)
def getRankedAddresses(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getRankedAddresses(args: %s kwargs: %s)" % (str(args), str(kwargs)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    # TODO - reminder: no 'qs' here, set to ''
    # qs = parseParamTextQuery(**kwargs)
    qs = ''

    # TODO this needs to come from the UI
    size = size if size > 500 else 2500

    ranked_addresses = get_ranked_email_address_from_email_addrs_index(data_set_id, start_datetime, end_datetime, size)
    top_address_list = []
    for i, email_address in enumerate(ranked_addresses["emails"]):
        graph = es_get_all_email_by_address(data_set_id, email_address[0], qs, start_datetime, end_datetime, size)
        top_address_list.append({
            "address_search_url_path": email_address[0],
            "parameters": kwargs,
            "search_results": {
                "mail_sent_count": email_address[6],
                "mail_received_count": email_address[5],
                "mail_attachment_count": email_address[7],
                "query_matched_count": graph["query_hits"],
                "associated_count": len(graph["graph"]["nodes"])
            },
            "TEMPORARY_GRAPH": graph
        })

    return {"top_address_list": top_address_list}
def es_get_exif_emails(data_set_id, size):
    tangelo.log("es_geo.es_get_exif_emails()")
    emails_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=_geo_exif_query())
    tangelo.log("es_geo.es_get_exif_emails(total document hits = %s)" % emails_resp["hits"]["total"])
    docs = [hit["_source"] for hit in emails_resp["hits"]["hits"]]
    return {"total": emails_resp["hits"]["total"], "exif_docs": docs}
def _build_graph_for_emails(index, emails, query_hits):
    nodes = []
    edge_map = {}
    addr_index = {}

    total = count(index, "email_address")
    for email in emails:
        from_addr = email["from"]
        if from_addr not in _EMAIL_ADDR_CACHE[index]:
            tangelo.log("WARNING: From email address not found in cache <%s>" % email)
            continue
        if from_addr not in addr_index:
            nodes.append(_map_node(_EMAIL_ADDR_CACHE[index][from_addr], total))
            addr_index[from_addr] = len(nodes) - 1
        for rcvr_addr in email["to"] + email["cc"] + email["bcc"]:
            if rcvr_addr not in _EMAIL_ADDR_CACHE[index]:
                tangelo.log("WARNING: RCVR email address not found in cache <%s>" % rcvr_addr)
                continue
            if rcvr_addr not in addr_index:
                nodes.append(_map_node(_EMAIL_ADDR_CACHE[index][rcvr_addr], total))
                addr_index[rcvr_addr] = len(nodes) - 1
            # TODO reduce by key instead of mapping? src->target and sum on value
            edge_key = from_addr + "#" + rcvr_addr
            if edge_key not in edge_map:
                edge_map[edge_key] = {"source": addr_index[from_addr], "target": addr_index[rcvr_addr], "value": 1}
            else:
                edge_map[edge_key]["value"] += 1

    return {"graph": {"nodes": nodes, "links": edge_map.values()},
            "rows": [_map_emails_to_row(email) for email in emails],
            "query_hits": query_hits}
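# A minimal, self-contained sketch of the edge aggregation used above: edges
# are keyed by "source#target" so repeated sender/receiver pairs sum their
# weight.  The email dicts here are illustrative stand-ins, not service data,
# and addresses are used directly instead of node indices.
def _aggregate_edges_sketch():
    emails = [
        {"from": "a@x.com", "to": ["b@x.com"], "cc": [], "bcc": []},
        {"from": "a@x.com", "to": ["b@x.com", "c@x.com"], "cc": [], "bcc": []},
    ]
    edge_map = {}
    for email in emails:
        for rcvr in email["to"] + email["cc"] + email["bcc"]:
            key = email["from"] + "#" + rcvr
            edge = edge_map.setdefault(key, {"source": email["from"], "target": rcvr, "value": 0})
            edge["value"] += 1
    # [{'source': 'a@x.com', 'target': 'b@x.com', 'value': 2},
    #  {'source': 'a@x.com', 'target': 'c@x.com', 'value': 1}]
    return edge_map.values()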
def parseParamEmailAddress(**kwargs):
    tangelo.log("parseParamEmailAddress(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    email_regex = re.compile("[^@]+@[^@]+\\.[^@]+")
    key_list = [k for k in kwargs.keys() if email_regex.match(k)]
    tangelo.log("\tkey_list[] = %s" % str(key_list))
    return key_list
def get_top_email_by_text_query(data_set_id, qs, start_datetime, end_datetime, size):
    if not qs:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing search term(s)")

    query = _build_email_query(qs=qs, date_bounds=(start_datetime, end_datetime))
    tangelo.log("es_search.get_top_email_by_text_query(query: %s)" % query)
    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for the community
    query = _build_email_query(qs=qs, date_bounds=(start_datetime, end_datetime), attachments_only=True)
    tangelo.log("es_search.get_top_email_by_text_query(attachment-query: %s)" % query)
    attachments = _query_email_attachments(data_set_id, size, query)
    graph["attachments"] = attachments

    return graph
def query(data):
    url = data['url']
    max_results_per_node = int(data['mrpn'])
    ind = data['index']
    search_terms = data['search_terms']

    es = Elasticsearch([url])
    if esauth.get(url) is not None:
        cred = esauth[url]
        tangelo.log('http://' + cred + '@' + url)
        es = Elasticsearch(['http://' + cred + '@' + url])

    rr = []
    num = 0
    for t in search_terms:
        if t['type'] in ('selection', 'phone', 'email', 'info'):
            num_to_search = t['id']
            if t['type'] == 'selection':
                num_to_search = t['data']
            if t['type'] == 'info':
                num_to_search = t['id'].split('->')[1].strip()
            results = es.search(index=ind,
                                body={"size": max_results_per_node,
                                      "fields": ["_index", "_type", "_id"],
                                      "query": {"match_phrase": {"_all": num_to_search}}})
            num += results['hits']['total']
            for hit in results['hits']['hits']:
                rr.append({'nid': t['id'],
                           'search_term': num_to_search,
                           'eid': hit['_id'],
                           'itype': hit['_type'],
                           'jindex': ind,
                           'url': url})
    return json.dumps({'num': num, 'hits': rr})
def search_email_by_topic(*args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_topic(args: %s kwargs: %s)" % (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not param_args.get("topic_index"):
        return tangelo.HTTPStatusCode(400, "invalid service call - missing topic_index")

    topic = parseParamTopic(**param_args)
    email_addrs = parseParam_email_addr(**param_args)
    qs = parseParamTextQuery(**param_args)

    return es_get_all_email_by_topic(data_set_id, topic=topic, email_addrs=email_addrs, qs=qs,
                                     start_datetime=start_datetime, end_datetime=end_datetime, size=size)
def parseParamEmailIds(**kwargs):
    tangelo.log("parseParamEmailIds(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    # default to '' so a missing parameter yields an empty list instead of an AttributeError
    value = kwargs.get('email_ids', '')
    email_ids = value.split(",")
    return [x for x in email_ids if x != '' and x is not None]
def getHourlyBrowsePathCounts(org, users, trail, domain='default'):
    sql = 'SELECT (unix_timestamp(ts) DIV 3600)*3600 as group_hour, count(1) from datawake_data where org = %s AND domain = %s '
    params = [org.upper(), domain]

    if trail != '*' and trail != '':
        sql = sql + ' AND trail = %s '
        params.append(trail)

    if len(users) > 0:
        param_string = ','.join(['%s' for i in range(len(users))])
        sql = sql + ' AND userId in (' + param_string + ') '
        params.extend(users)

    sql = sql + " GROUP BY group_hour"
    tangelo.log(sql)
    tangelo.log(str(params))
    rows = dbGetRows(sql, params)

    result = []
    delta = 3600
    if len(rows) > 0:
        curr = rows[0][0]
        for row in rows:
            if row[0] is None:
                continue
            dt = row[0]
            # fill any gap between the previous hour bucket and this one with zero counts
            while (dt - curr > 3600):
                curr = curr + delta
                result.append({'ts': curr, 'count': 0})
            result.append({'ts': dt, 'count': row[1]})
            curr = dt

    # add one trailing empty hour
    if len(result) > 0:
        curr = curr + 3600
        result.append({'ts': curr, 'count': 0})

    return result
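# A standalone sketch of the gap filling above, assuming rows of
# (hour_timestamp, count) already sorted by hour; the names here are
# illustrative, not part of the service.
def fill_hour_gaps_sketch(rows, delta=3600):
    result = []
    if rows:
        curr = rows[0][0]
        for ts, count in rows:
            # pad missing hour buckets with zero counts
            while ts - curr > delta:
                curr += delta
                result.append({'ts': curr, 'count': 0})
            result.append({'ts': ts, 'count': count})
            curr = ts
        # add one trailing empty hour
        result.append({'ts': curr + delta, 'count': 0})
    return result

# fill_hour_gaps_sketch([(0, 5), (10800, 2)]) ->
#   [{'ts': 0, 'count': 5}, {'ts': 3600, 'count': 0}, {'ts': 7200, 'count': 0},
#    {'ts': 10800, 'count': 2}, {'ts': 14400, 'count': 0}]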
def get_user(token):
    user = helper.get_user()
    if helper.get_token() != token or user is None:
        user_auth = factory.get_authentication_object(token)
        user = user_auth.get_user_from_token()
        tangelo.log('session.post verified user: ' + str(user))
    return user
def get_attachments_by_sender(data_set_id, sender, start_datetime, end_datetime, size):
    body = _build_email_query(sender_addrs=[sender], date_bounds=(start_datetime, end_datetime), attachments_only=True)
    tangelo.log("get_attachments_by_sender.Query %s" % body)
    attachments_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=body)

    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        attachment_entry = [_source["id"],
                            "PLACEHOLDER",
                            _source["datetime"],
                            _source.get("senders", "")[0],
                            ';'.join(_source.get("tos", "")),
                            ';'.join(_source.get("ccs", "")),
                            ';'.join(_source.get("bccs", "")),
                            _source.get("subject", "")]
        # one row per attachment, sharing the parent email's fields
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return {"sender": sender, "email_attachments": email_attachments}
def get_entity_histogram(index, type, email_addrs=[], qs='', topic_score=None, date_bounds=None, entity_agg_size=10):
    body = entity_histogram_query(email_addrs=email_addrs, qs=qs, topic_score=topic_score,
                                  date_bounds=date_bounds, entity_agg_size=entity_agg_size)
    tangelo.log("get_entity_histogram: query = %s" % body)
    resp = es().search(index=index, doc_type=type, body=body)

    # flatten the per-type aggregation buckets into one list, tagging each
    # bucket with its entity type, then sort by descending document count
    agg = resp["aggregations"]["filtered_entity_agg"]
    entities = []
    for entity_type in ("location", "organization", "person", "misc"):
        entities += [dict(d, **{"type": entity_type}) for d in agg[entity_type]["buckets"]]
    return sorted(entities, key=lambda d: d["doc_count"], reverse=True)
def createResults(field, args_array):
    ## is text search
    if field.lower() not in ["email", "entity"]:
        text = head(args_array)
        if text:
            tangelo.log("text search : %s" % text)
            es = Elasticsearch()
            res = es.search(index="newman", doc_type="emails", size=1000, q=text,
                            body={"fields": ["_id"], "query": {"match_all": {}}})
            ingestESTextResults(jsonGet(['hits', 'hits'], res, []))

    node_vals = getNodeVals(field, args_array)
    colors = {k: v.get("group_id") for k, v in node_vals.iteritems()}
    for k, v in node_vals.iteritems():
        node_vals[k]["color"] = colors.get(k)

    emails = sorted(getEmails(colors, field, args_array), key=lambda x: str(x.get('datetime')))

    idx_lookup = {}
    nodes = []
    for i, o in enumerate(node_vals.iteritems()):
        k, v = o
        idx_lookup[k] = i
        nodes.append({"name": k, "num": v.get("num"), "rank": v.get("rank"),
                      "group": v.get("color"), "community": v.get("comm_id")})

    edges = getEdges(idx_lookup, field, args_array)

    return {'rows': emails, 'graph': {'nodes': nodes, 'links': edges}}
def getAttachFileType(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getAttachFileType(args: %s kwargs: %s)" % (str(args), str(kwargs)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    top_count = int(size)

    attach_type = urllib.unquote(nth(args, 0, ''))
    if not attach_type:
        attach_type = 'all'  # hack for now

    email_address_list = parseParamEmailAddress(**kwargs)
    if not email_address_list:
        file_types = get_top_attachment_types(data_set_id, date_bounds=(start_datetime, end_datetime),
                                              num_top_attachments=top_count)[:top_count]
    else:
        # TODO: implement populating the attachment file-types under individual
        # email-accounts; simulate the result for now
        file_types = get_top_attachment_types(data_set_id, date_bounds=(start_datetime, end_datetime),
                                              num_top_attachments=top_count)[:top_count]

    return {"account_id": data_set_id,
            "data_set_id": data_set_id,
            "account_start_datetime": start_datetime,
            "account_end_datetime": end_datetime,
            "types": file_types}
def getEdges(node_idx, field, args_array):
    with newman_connector() as read_cnx:
        tangelo.log("start edge query")
        with execute_query(*edgeQueryObj(read_cnx.conn(), field, args_array)) as qry:
            tangelo.log("edges : %s" % qry.stmt)
            return [{"source": node_idx.get(from_), "target": node_idx.get(to_), "value": int(weight)}
                    for from_, to_, weight in qry.cursor()]
def upload_file(*args, **kwargs):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        domain_file = kwargs.get("file_upload")
        domain_name = kwargs.get("name")
        domain_description = kwargs.get("description")
        if db.domain_exists(domain_name):
            return json.dumps(dict(success=False))
        if domain_file is None:
            return json.dumps(dict(success=False))

        tangelo.log("read domain file")
        domain_file_lines = domain_file.file.readlines()
        domain_file_lines = map(lambda x: x.strip().replace('\0', ''), domain_file_lines)
        db.add_new_domain(domain_name, domain_description)

        rowkeys = []
        for line in domain_file_lines:
            i = line.index(',')  # split on the first comma
            type = line[:i]
            value = line[i + 1:]
            # strip surrounding double quotes, if present
            if type[0] == '"' and type[-1] == '"':
                type = type[1:-1]
            if value[0] == '"' and value[-1] == '"':
                value = value[1:-1]
            rowkeys.append(domain_name + '\0' + type + '\0' + value)
        result = domain_content_connector.add_new_domain_items(rowkeys)
        return json.dumps(dict(success=result))
    finally:
        domain_content_connector.close()
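# A standalone sketch of the line parsing above: split on the first comma only,
# then strip surrounding double quotes from either field.  This is an
# illustrative helper, not part of the upload service.
def split_domain_line_sketch(line):
    i = line.index(',')
    entity_type, value = line[:i], line[i + 1:]
    if entity_type.startswith('"') and entity_type.endswith('"'):
        entity_type = entity_type[1:-1]
    if value.startswith('"') and value.endswith('"'):
        value = value[1:-1]
    return entity_type, value

# split_domain_line_sketch('phone,"555-1212,ext9"') -> ('phone', '555-1212,ext9')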
def run():
    # Create an empty response object.
    response = {}
    collectionNames = []

    # Look through the collections in the ivaan database and return the names
    # of all collections that match the naming profile for tables.  Only
    # collections beginning with "table_cardiac_" are returned, since only
    # cardiac studies come from the IVAaN database.
    connection = Connection('localhost', 27017)
    db = connection['ivaan']

    # get a list of all collections (excluding system collections)
    collection_list = db.collection_names(False)
    for coll in collection_list:
        # if it is a table, then add it to the response
        if str(coll[:14]) == 'table_cardiac_':
            print "found table:", coll
            # Don't return the prefix in the project name; users don't have to
            # know the cardiac project collection names are prepended.
            collectionNames.append(coll[14:])
    connection.close()

    # if no projects were found at all, return a default name
    if len(collectionNames) == 0:
        collectionNames.append("default")

    # Pack the results into the response object, and return it.
    response['result'] = collectionNames
    tangelo.log(str(response))
    return bson.json_util.dumps(response)
def listAllDataSet():
    tangelo.log("datasource.listAllDataSet()")

    # Ignore index keys in ES that are not in the newman_app.conf; find all
    # the indexes that begin with the index loader prefix.
    indexes = [_index_record(index) for index in index_list()
               if index in data_set_names() or index.startswith(index_creator_prefix())]

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**{})
    email_addrs = get_ranked_email_address_from_email_addrs_index(data_set_id, start_datetime, end_datetime, size)["emails"]
    email_addrs = {email_addr[0]: email_addr for email_addr in email_addrs}

    return {
        "data_set_selected": getDefaultDataSetID(),
        "data_sets": indexes,
        "top_hits": {
            "order_by": "rank",
            "email_addrs": email_addrs
        }
    }
def get_entities(trail_id):
    tangelo.log('Getting entities for trail: %s' % trail_id)
    entities = {}
    entityList = []

    urls = []
    rows = db.getBrowsePathUrls(trail_id)
    for row in rows:
        urls.append(row['url'])

    entity_data_connector = factory.get_entity_data_connector()
    results = entity_data_connector.get_extracted_entities_from_urls(urls)
    tangelo.log('Got entities')

    # count the number of pages each entity appears on
    for result in results:
        for entityType in results[result]:
            for entityName in results[result][entityType]:
                if entityName in entities:
                    entities[entityName]['pages'] = entities[entityName]['pages'] + 1
                else:
                    entities[entityName] = {'type': entityType, 'pages': 1}

    # TODO either figure out how to map the data or do this differently
    for entity in entities:
        entityList.append({'name': entity, 'type': entities[entity]['type'], 'pages': entities[entity]['pages']})

    return json.dumps(entityList)
def es_get_sender_locations(data_set_id, size):
    tangelo.log("es_geo.es_get_sender_locations()")
    emails_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=_geo_xoip_query())
    tangelo.log("es_geo.es_get_sender_locations(total document hits = %s)" % emails_resp["hits"]["total"])
    docs = [_map_geo_response(hit["_source"]) for hit in emails_resp["hits"]["hits"]]
    return {"total": emails_resp["hits"]["total"], "XOIP_locations": docs}
def export(cdr):
    dd_url = '%s/%s/%s/' % (conf.get_deepdive_url(), conf.get_deepdive_user(), conf.get_deepdive_repo())
    headers = {'Authorization': 'Token %s' % conf.get_deepdive_token()}
    r = requests.post(dd_url, headers=headers, data=cdr)
    tangelo.log('Sending page to deepdive at: %s' % r.url)
def _query_email_attachments(index, size, emails_query):
    tangelo.log("_query_email_attachments.Query %s" % emails_query)
    attachments_resp = es().search(index=index, doc_type="emails", size=size, body=emails_query)

    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        attachment_entry = [
            _source["id"],
            "PLACEHOLDER",
            _source["datetime"],
            _source.get("senders", ""),
            ';'.join(_source.get("tos", "")),
            ';'.join(_source.get("ccs", "")),
            ';'.join(_source.get("bccs", "")),
            _source.get("subject", "")
        ]
        # one row per attachment, sharing the parent email's fields
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return email_attachments
def parseParamPhoneNumbers(**kwargs):
    tangelo.log("parseParamPhoneNumbers(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    # default to '' so a missing parameter yields an empty list instead of an AttributeError
    value = kwargs.get('phone_numbers', '')
    phone_numbers = value.split(",")
    return [x for x in phone_numbers if x != '' and x is not None]
def run(tablename):
    # Create an empty response object.
    response = {}
    print "using collection:", tablename

    # Build a query for mongoDB depending on how many filters are enabled.
    # Currently no filters are applied, so match all records.
    querystring = {}
    print "query to perform:", querystring

    # execute the query and collect the matching records from the collection
    connection = Connection('localhost', 27017)
    db = connection['polar']
    dataset_collection = db[tablename]

    # Do a find operation with the passed arguments.
    it = dataset_collection.find(querystring)
    results = [x for x in it]
    connection.close()
    print results

    # convert the local records to a GeoJSON multipoint result

    # define multipoint geometry
    multipoint = ogr.Geometry(ogr.wkbMultiPoint)

    # create point geometry and add it to the multipoint geometry
    for i in range(len(results)):
        point = ogr.Geometry(ogr.wkbPoint)
        fixedlocation = convertStringToFloatPoint(results[i]['stationLng'], results[i]['stationLat'])
        point.AddPoint(fixedlocation['lng'], fixedlocation['lat'])
        multipoint.AddGeometry(point)

    # convert geometry to GeoJSON format
    geojson_multipoint = multipoint.ExportToJson()

    # Pack the results into the response object, and return it.
    response['count'] = it.count()
    response['data'] = geojson_multipoint
    response['result'] = 'OK'
    tangelo.log(str(response))
    return bson.json_util.dumps(response)
def __init__(self, *pargs, **kwargs):
    ws4py.websocket.WebSocket.__init__(self, *pargs, **kwargs)
    url = "ws://%s:%d/ws" % (hostname, port)
    tangelo.log("websocket created at %s:%d/%s (proxy to %s)" % (hostname, port, key, url))
    self.client = VTKWebSocketAB(url, self)
def parseParamEntity(**kwargs):
    tangelo.log("parseParamEntity(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    entity_dict = {k: v.split(",") for k, v in kwargs.iteritems() if k.startswith("entities")}
    return entity_dict
def get(term):
    google_results = search_google(term)
    tangelo.log(google_results)
    onion_results = search_onion(term)
    tangelo.log(onion_results)

    results = google_results + onion_results
    if len(results) != 0:
        return json.dumps(dict(success=True, resultCount=len(results), results=results))
    # make the empty-result case explicit rather than falling through to None
    return json.dumps(dict(success=False, resultCount=0, results=[]))
def post(action, *args, **kwargs):
    tangelo.log("Comments")
    post_data = json.loads(tangelo.request_body().read(), strict=False)

    def unknown(**kwargs):
        return tangelo.HTTPStatusCode(404, "unknown service call")

    return post_actions.get(action, unknown)(**post_data)
def add_trail(trailname, domain, traildescription=u''):
    tangelo.log('datawake_trails POST trailname=%s traildescription=%s domain=%s' % (trailname, traildescription, domain))
    user = helper.get_user()
    org = user.get_org()

    # \w already excludes ':', so a full match means the name is strictly
    # alphanumeric/underscore (replaces the original '^[\w]*(?!:)+$', whose
    # repeated lookahead added nothing)
    invalid = re.match(r'^\w*$', trailname) is None
    if invalid:
        raise ValueError("Trail names must be alphanumeric and not contain a ':'")

    last_row = db.addTrail(org, trailname, traildescription, user.get_email(), domain=domain)
    return json.dumps(dict(success=last_row >= 0))
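# Quick illustrative check of the validation above:
#   re.match(r'^\w*$', 'my_trail')   -> match object (accepted)
#   re.match(r'^\w*$', 'bad:trail')  -> None         (rejected, raises ValueError)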
def getCommunities(*args, **kwargs):
    tangelo.log("getCommunities(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    #top_count = int(urllib.unquote(nth(args, 0, "40")))
    top_count = int(size)

    return {"communities": get_top_communities(data_set_id, date_bounds=(start_datetime, end_datetime),
                                               num_communities=top_count)[:top_count]}
def post(action, *args, **kwargs):
    body = tangelo.request_body().read()
    post_data = json.loads(body, strict=False)

    def unknown(*args):
        return tangelo.HTTPStatusCode(400, "invalid service call")

    tangelo.log(post_data)
    return post_actions.get(action, unknown)(**post_data)
def getRankedEmails(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getRankedEmails(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    return get_ranked_email_address_from_email_addrs_index(data_set_id, start_datetime, end_datetime, size)
def service_status(id, type, url, domain_id, team_id, trail_id, status):
    if UseRestAPI:
        ts = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')
        service_status = restPost('DatawakeXmits',
                                  dict(xmitId=0, recipientId=id, serviceType=type, datawakeUrl=url,
                                       domainId=domain_id, teamId=team_id, trailId=trail_id,
                                       xmitStatus=status, ts=ts))
        tangelo.log(service_status)
        return service_status.xmitId
    else:
        sql = 'insert into datawake_xmit (recipient_id, service_type, datawake_url, domain_id, team_id, trail_id, xmit_status, ts) values (%s,%s,%s,%s,%s,%s,%s,sysdate())'
        params = [id, type, url, domain_id, team_id, trail_id, status]
        return dbCommitSQL(sql, params)
def save_page_selection(selection, domain, url):
    tangelo.log('savePageSelection url=' + str(url) + ' selection=' + selection + ' domain=' + domain)
    user = helper.get_user()
    org = user.get_org()

    postId = db.get_post_id(url)
    row = db.getBrowsePathData(org, postId, domain)
    row_id = -1
    # ensure the user is saving a selection to a post from their own org
    if row['org'] == org:
        row_id = db.addSelection(postId, selection)
    return json.dumps(dict(id=row_id))
def getNodeVals(field, args_array):
    """
    Nodes should be all of the emails an email address is a part of, plus all
    of the email addresses associated with that set of emails.
    """
    with newman_connector() as read_cnx:
        tangelo.log("start node query")
        with execute_query(*nodeQueryObj(read_cnx.conn(), field, args_array)) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            return {item[0]: {'num': int(item[4] + item[5]),
                              'comm_id': item[2],
                              'group_id': item[3],
                              'comm': item[1],
                              'rank': item[6]}
                    for item in qry.cursor()}