def getBrowsePathWithTextSelections(org, startdate, enddate, userlist=[], trail='*', domain=''):
    """
    Return the browse-path graph for an org/time window with one extra
    'selection' node attached for every highlighted text selection saved
    against a visited page.

    Parameters mirror getBrowsePathEdges:
      org       -- organization name
      startdate -- unix-timestamp lower bound as a string, '' for none
      enddate   -- unix-timestamp upper bound as a string, '' for none
      userlist  -- optional list of userIds to restrict to
      trail     -- trail name, '*' for all trails
      domain    -- datawake domain name

    Returns {'nodes': {id: node-dict}, 'edges': [(from_id, to_id), ...]}.
    Database errors from datawake_mysql propagate to the caller.
    """
    # Start from the plain browse-path graph.
    # (The original wrapped everything below in `try: ... except: raise`,
    # which is a no-op; removed so exceptions propagate unchanged.)
    graph = getBrowsePathEdges(org, startdate, enddate, userlist, trail, domain)
    nodes = graph['nodes']
    edges = graph['edges']
    newnodes = {}

    # For each node in the browse path pull any related selections.
    for key, node in nodes.iteritems():
        postIds = node['postIds']
        if len(postIds) == 0:
            continue
        placeholders = ','.join(['%s' for i in range(len(postIds))])
        sql = """ SELECT posts.id, selections.id, unix_timestamp(posts.ts),posts.url,posts.userId,posts.userName,selections.selection
                  FROM datawake_data posts, datawake_selections selections
                  WHERE posts.id = selections.postId and posts.id in (""" + placeholders + ")"
        rows = datawake_mysql.dbGetRows(sql, postIds)
        for row in rows:
            (postid, selectionId, ts, url, userId, userName, selection) = row
            userName = userName.encode()
            # NOTE(review): the doubled '_' below looks like a typo but is kept
            # byte-for-byte so existing consumers see unchanged node ids --
            # confirm before "fixing".
            selection_id = 'selection_' + str(postid) + '_' + str(selectionId) + '_' + '_' + url
            newnodes[selection_id] = {'id': selection_id,
                                      'type': 'selection',
                                      'size': 5,
                                      'groupName': '',
                                      'timestamps': [ts],
                                      'userNames': [userName],
                                      'userIds': [userId],
                                      'data': selection}
            # Hang the selection node off the page it was made on.
            edges.append((key, selection_id))

    nodes.update(newnodes)
    # Url ranks only make sense for a single-user, single-trail query.
    if len(userlist) == 1 and trail != '*':
        nodes = addUrlRankstoNodes(org, nodes, userlist[0], trail, domain=domain)
    return {'nodes': nodes, 'edges': edges}
def getOculusForensicGraph(org, startdate, enddate, userlist=[], trail='*', domain=''):
    """
    Build the forensic view consumed by the Oculus UI: the browse path plus
    every entity extracted from the visited pages, and lookahead features for
    any urls those entities point at.

      org       -- organization name (upper-cased before querying)
      startdate -- unix-timestamp lower bound as a string, '' for none
      enddate   -- unix-timestamp upper bound as a string, '' for none
      userlist  -- optional list of userIds to restrict the query to
      trail     -- trail name, '*' for all trails
      domain    -- datawake domain name

    Returns a dict with keys 'browsePath', 'entities', 'lookaheadFeatures'
    and 'domainLookaheadFeatures'.
    """
    startMillis = int(round(time.time() * 1000))
    entityDataConnector.close()
    org = org.upper()

    command = """ SELECT id,unix_timestamp(ts) as ts,url
                  FROM memex_sotera.datawake_data
                  WHERE org=%s AND domain=%s """
    params = [org, domain]

    # add the user list filter if given
    if len(userlist) > 0:
        # BUG FIX: this used to splice the params *list* into the SQL string
        # (a TypeError at runtime) and then extend params with the '%s'
        # placeholder strings instead of the actual user ids. Now matches the
        # working pattern in getBrowsePathEdges.
        placeholders = ','.join(['%s' for i in range(len(userlist))])
        command = command + " AND userId in (" + placeholders + ") "
        params.extend(userlist)

    # add the trail filter
    if trail != '*':
        command = command + " AND trail = %s"
        params.append(trail)

    # add the time filter to the query
    if (startdate == '' and enddate == ''):
        pass
    elif (startdate != '' and enddate == ''):
        command = command + " AND unix_timestamp(ts) >= %s "
        params.append(startdate)
    elif (startdate == '' and enddate != ''):
        command = command + " AND unix_timestamp(ts) <= %s "
        params.append(enddate)
    else:
        command = command + " AND unix_timestamp(ts) >= %s and unix_timestamp(ts) <= %s "
        params.append(startdate)
        params.append(enddate)

    command = command + " GROUP BY url ORDER BY ts asc "

    db_rows = datawake_mysql.dbGetRows(command, params)
    urls = map(lambda x: x[2], db_rows)
    extracted_features = entityDataConnector.get_extracted_entities_from_urls(urls)

    browsePath = {}
    adj_urls = set([])
    entities = []
    for row in db_rows:
        (id, ts, url) = row
        # Pages with no extracted data contribute nothing to the graph.
        if url not in extracted_features:
            continue
        extracted_features_for_url = extracted_features[url]
        for entity_type, entity_values in extracted_features_for_url.iteritems():
            if entity_type == "info":
                continue
            for entity_value in entity_values:
                if trail is None or trail.strip() == '':
                    trail = "default"
                if id not in browsePath:
                    ext = tldextract.extract(url)
                    browsePath[id] = {'id': id,
                                      'url': url,
                                      'timestamp': ts,
                                      'subdomain': ext.subdomain,
                                      'domain': ext.domain,
                                      'suffix': ext.suffix}
                entity = {'id': id, 'type': entity_type, 'value': entity_value}
                bAdd = True
                if (entity_type == 'email'):
                    # Split user/host so the UI can group emails by mail domain.
                    emailPieces = entity_value.split('@')
                    entity['user_name'] = emailPieces[0]
                    emailURL = 'mailto://' + emailPieces[1]
                    emailExt = tldextract.extract(emailURL)
                    entity['domain'] = emailExt.domain
                    entity['subdomain'] = emailExt.subdomain
                elif (entity_type == 'phone'):
                    # NOTE(review): slice [1:4] on a 10-digit value skips the
                    # first digit -- [0:3] would be the usual area code; kept
                    # as-is, confirm intent.
                    areaCode = ''
                    if (len(entity_value) == 10):
                        areaCode = entity_value[1:4]
                    if (areaCode != ''):
                        entity['area_code'] = areaCode
                else:
                    # Any other entity type is treated as a url to look ahead on.
                    adj_urls.add(entity_value)
                    webExt = tldextract.extract(entity_value)
                    entity['subdomain'] = webExt.subdomain
                    entity['domain'] = webExt.domain
                    entity['suffix'] = webExt.suffix
                if (bAdd):
                    entities.append(entity)

    # Get all the lookahead features
    if (len(adj_urls) > 0):
        lookaheadFeatures = entityDataConnector.get_extracted_entities_from_urls(adj_urls)
        # add place holders for urls with no extracted data
        for adj_url in adj_urls:
            if adj_url not in lookaheadFeatures:
                lookaheadFeatures[adj_url] = {}
        domainLookaheadFeatures = entityDataConnector.get_extracted_domain_entities_from_urls(domain, adj_urls)
    else:
        # NOTE(review): the empty case returns lists where the populated case
        # returns dicts -- kept as-is for consumer compatibility.
        lookaheadFeatures = []
        domainLookaheadFeatures = []

    entityDataConnector.close()
    endMillis = int(round(time.time() * 1000))
    # tangelo.log('Processing time = ' + str((endMillis-startMillis)/1000) + 's');
    return {'browsePath': browsePath,
            'entities': entities,
            'lookaheadFeatures': lookaheadFeatures,
            'domainLookaheadFeatures': domainLookaheadFeatures}
def getBrowsePathEdges(org,startdate,enddate,userlist=[],trail='*',domain=''): print 'getBrowsePathEdges(',startdate,',',enddate,',',userlist,')' org = org.upper() command = """SELECT unix_timestamp(t1.ts) as ts, t1.url,hits,userName,userId,id,trail FROM datawake_data as t1 LEFT JOIN (select url,count(url) as hits from datawake_data WHERE org = %s and domain = %s group by url ) as t2 ON t1.url = t2.url WHERE t1.org = %s and t1.domain = %s """ commandArgs = [org,domain,org,domain] # add the time filter to the query if (startdate == '' and enddate == ''): pass elif (startdate != '' and enddate == ''): command = command +" AND unix_timestamp(t1.ts) >= %s " commandArgs.append(startdate) elif (startdate == '' and enddate != ''): command = command + " AND unix_timestamp(t1.ts) <= %s " commandArgs.append(enddate) else: command = command + " AND unix_timestamp(t1.ts) >= %s and unix_timestamp(t1.ts) <= %s " commandArgs.append(startdate) commandArgs.append(enddate) # add the user filter if (len(userlist) > 0): command = command +" AND " params = ['%s' for i in range(len(userlist))] params = ','.join(params) command = command + " userId in ("+params+") " commandArgs.extend(userlist) # add the trail filter if trail != '*': command = command +" AND " command = command + " trail = %s" commandArgs.append(trail) command = command + " ORDER BY userId,t1.ts asc" rows = datawake_mysql.dbGetRows(command,commandArgs) edges = [] nodes = {} edge_buffer = [] for row in rows: (ts,url,hits,username,userId,postId,trail) = row if trail is None or trail.strip() == '': trail = "default" if url not in nodes: nodes[url] = {'id':url, 'type':'browse path ', 'size':10, 'timestamps':[], 'hits':0, 'userNames':[], 'userIds':[], 'postIds':[], 'trails':[] } nodes[url]['timestamps'].append(ts) nodes[url]['hits'] = hits nodes[url]['userNames'].append(username) nodes[url]['userIds'].append(userId) nodes[url]['postIds'].append(postId) nodes[url]['trails'].append(trail) edge_buffer.append(url) if 
len(edge_buffer) == 2: if (edge_buffer[0] != edge_buffer[1]): if 'chrome://newtab/' not in edge_buffer[1]: users1 = nodes[edge_buffer[0]]['userIds'][-1] users2 = nodes[edge_buffer[1]]['userIds'][-1] if users1 == users2: edges.append((edge_buffer[0],edge_buffer[1])) edge_buffer = [edge_buffer[1]] # set group name from each node for key,value in nodes.iteritems(): domain = 'n/a' if '//' in key: domain = key.split('/')[2] value['groupName'] = domain if len(userlist) == 1 and trail != '*': nodes = addUrlRankstoNodes(org,nodes,userlist[0],trail,domain=domain) return {'nodes':nodes,'edges':edges}
def getOculusForensicGraph(org, startdate, enddate, userlist=[], trail='*', domain=''):
    """
    Forensic graph for the Oculus UI: browse path, extracted entities, and
    lookahead features for urls referenced by those entities.

    NOTE(review): this is a duplicate definition -- an identical
    getOculusForensicGraph appears earlier in this file, and this later one
    wins at import time. Consider deleting one copy.

      org       -- organization name (upper-cased before querying)
      startdate -- unix-timestamp lower bound as a string, '' for none
      enddate   -- unix-timestamp upper bound as a string, '' for none
      userlist  -- optional list of userIds to restrict the query to
      trail     -- trail name, '*' for all trails
      domain    -- datawake domain name

    Returns a dict with keys 'browsePath', 'entities', 'lookaheadFeatures'
    and 'domainLookaheadFeatures'.
    """
    startMillis = int(round(time.time() * 1000))
    entityDataConnector.close()
    org = org.upper()

    command = """ SELECT id,unix_timestamp(ts) as ts,url
                  FROM memex_sotera.datawake_data
                  WHERE org=%s AND domain=%s """
    params = [org, domain]

    # add the user list filter if given
    if len(userlist) > 0:
        # BUG FIX: previously concatenated the params *list* into the SQL
        # string (TypeError) and extended params with the '%s' placeholder
        # strings rather than the user ids themselves.
        placeholders = ','.join(['%s' for i in range(len(userlist))])
        command = command + " AND userId in (" + placeholders + ") "
        params.extend(userlist)

    # add the trail filter
    if trail != '*':
        command = command + " AND trail = %s"
        params.append(trail)

    # add the time filter to the query
    if (startdate == '' and enddate == ''):
        pass
    elif (startdate != '' and enddate == ''):
        command = command + " AND unix_timestamp(ts) >= %s "
        params.append(startdate)
    elif (startdate == '' and enddate != ''):
        command = command + " AND unix_timestamp(ts) <= %s "
        params.append(enddate)
    else:
        command = command + " AND unix_timestamp(ts) >= %s and unix_timestamp(ts) <= %s "
        params.append(startdate)
        params.append(enddate)

    command = command + " GROUP BY url ORDER BY ts asc "

    db_rows = datawake_mysql.dbGetRows(command, params)
    urls = map(lambda x: x[2], db_rows)
    extracted_features = entityDataConnector.get_extracted_entities_from_urls(urls)

    browsePath = {}
    adj_urls = set([])
    entities = []
    for row in db_rows:
        (id, ts, url) = row
        if url not in extracted_features:
            # no extracted data for this page; nothing to add
            continue
        extracted_features_for_url = extracted_features[url]
        for entity_type, entity_values in extracted_features_for_url.iteritems():
            if entity_type == "info":
                continue
            for entity_value in entity_values:
                if trail is None or trail.strip() == '':
                    trail = "default"
                if id not in browsePath:
                    ext = tldextract.extract(url)
                    browsePath[id] = {'id': id,
                                      'url': url,
                                      'timestamp': ts,
                                      'subdomain': ext.subdomain,
                                      'domain': ext.domain,
                                      'suffix': ext.suffix}
                entity = {'id': id, 'type': entity_type, 'value': entity_value}
                bAdd = True
                if (entity_type == 'email'):
                    # split user/host so emails can be grouped by mail domain
                    emailPieces = entity_value.split('@')
                    entity['user_name'] = emailPieces[0]
                    emailURL = 'mailto://' + emailPieces[1]
                    emailExt = tldextract.extract(emailURL)
                    entity['domain'] = emailExt.domain
                    entity['subdomain'] = emailExt.subdomain
                elif (entity_type == 'phone'):
                    areaCode = ''
                    if (len(entity_value) == 10):
                        # NOTE(review): [1:4] skips the first digit; [0:3] would
                        # be the conventional area code -- confirm intent.
                        areaCode = entity_value[1:4]
                    if (areaCode != ''):
                        entity['area_code'] = areaCode
                else:
                    # any other entity type is treated as a url to look ahead on
                    adj_urls.add(entity_value)
                    webExt = tldextract.extract(entity_value)
                    entity['subdomain'] = webExt.subdomain
                    entity['domain'] = webExt.domain
                    entity['suffix'] = webExt.suffix
                if (bAdd):
                    entities.append(entity)

    # Get all the lookahead features
    if (len(adj_urls) > 0):
        lookaheadFeatures = entityDataConnector.get_extracted_entities_from_urls(adj_urls)
        # add place holders for urls with no extracted data
        for adj_url in adj_urls:
            if adj_url not in lookaheadFeatures:
                lookaheadFeatures[adj_url] = {}
        domainLookaheadFeatures = entityDataConnector.get_extracted_domain_entities_from_urls(domain, adj_urls)
    else:
        # NOTE(review): lists here vs dicts above -- kept for compatibility.
        lookaheadFeatures = []
        domainLookaheadFeatures = []

    entityDataConnector.close()
    endMillis = int(round(time.time() * 1000))
    # tangelo.log('Processing time = ' + str((endMillis-startMillis)/1000) + 's');
    return {'browsePath': browsePath,
            'entities': entities,
            'lookaheadFeatures': lookaheadFeatures,
            'domainLookaheadFeatures': domainLookaheadFeatures}