Example #1
0
def getBrowsePathWithTextSelections(org,startdate,enddate,userlist=[],trail='*',domain=''):
    """
    Build the browse path graph and attach a node for every text selection.

    The browse path comes from getBrowsePathEdges, called with the same
    filters. For each browse path node that carries post ids, the
    selections recorded against those posts are read from
    datawake_selections; every selection becomes a node of type
    'selection' joined to its browse path node by an edge.

    Returns a dict {'nodes': ..., 'edges': ...}. When exactly one user is
    given and trail is not '*', the nodes are passed through
    addUrlRankstoNodes before being returned.
    """
    browse_graph = getBrowsePathEdges(org,startdate,enddate,userlist,trail,domain)
    nodes = browse_graph['nodes']
    edges = browse_graph['edges']

    # Fixed head of the selection query; the placeholder list and the
    # closing parenthesis are appended per browse path node.
    query_head = """
                   SELECT posts.id,  selections.id, unix_timestamp(posts.ts),posts.url,posts.userId,posts.userName,selections.selection
                   FROM datawake_data posts, datawake_selections selections
                   WHERE posts.id = selections.postId and posts.id  in ("""

    # Selection nodes are staged here and merged afterwards, so `nodes`
    # keeps its size while it is being iterated.
    selection_nodes = {}
    for browse_key, browse_node in nodes.iteritems():
        post_ids = browse_node['postIds']
        if len(post_ids) == 0:
            continue

        placeholders = ','.join(['%s'] * len(post_ids))
        query = query_head + placeholders + ")"

        for record in datawake_mysql.dbGetRows(query, post_ids):
            post_id = record[0]
            selection_id = record[1]
            ts = record[2]
            url = record[3]
            user_id = record[4]
            user_name = record[5].encode()
            selection_text = record[6]

            # The double underscore before the url is part of the existing id format.
            selection_key = 'selection_' + str(post_id) + '_' + str(selection_id) + '__' + url
            selection_nodes[selection_key] = {
                'id': selection_key,
                'type': 'selection',
                'size': 5,
                'groupName': '',
                'timestamps': [ts],
                'userNames': [user_name],
                'userIds': [user_id],
                'data': selection_text
            }
            edges.append((browse_key, selection_key))

    nodes.update(selection_nodes)

    if len(userlist) == 1 and trail != '*':
        nodes = addUrlRankstoNodes(org,nodes,userlist[0],trail,domain=domain)

    return {'nodes':nodes,'edges':edges}
Example #2
0
def getBrowsePathWithTextSelections(org,startdate,enddate,userlist=[],trail='*',domain=''):
    """
    Build the browse path graph and attach a node for every text selection.

    The browse path comes from getBrowsePathEdges, called with the same
    filters. For each browse path node that carries post ids, the
    selections recorded against those posts are read from
    datawake_selections; every selection becomes a node of type
    'selection' joined to its browse path node by an edge.

    Returns a dict {'nodes': ..., 'edges': ...}. When exactly one user is
    given and trail is not '*', the nodes are passed through
    addUrlRankstoNodes before being returned.
    """
    browse_graph = getBrowsePathEdges(org,startdate,enddate,userlist,trail,domain)
    nodes = browse_graph['nodes']
    edges = browse_graph['edges']

    # Fixed head of the selection query; the placeholder list and the
    # closing parenthesis are appended per browse path node.
    query_head = """
                   SELECT posts.id,  selections.id, unix_timestamp(posts.ts),posts.url,posts.userId,posts.userName,selections.selection
                   FROM datawake_data posts, datawake_selections selections
                   WHERE posts.id = selections.postId and posts.id  in ("""

    # Selection nodes are staged here and merged afterwards, so `nodes`
    # keeps its size while it is being iterated.
    selection_nodes = {}
    for browse_key, browse_node in nodes.iteritems():
        post_ids = browse_node['postIds']
        if len(post_ids) == 0:
            continue

        placeholders = ','.join(['%s'] * len(post_ids))
        query = query_head + placeholders + ")"

        for record in datawake_mysql.dbGetRows(query, post_ids):
            post_id = record[0]
            selection_id = record[1]
            ts = record[2]
            url = record[3]
            user_id = record[4]
            user_name = record[5].encode()
            selection_text = record[6]

            # The double underscore before the url is part of the existing id format.
            selection_key = 'selection_' + str(post_id) + '_' + str(selection_id) + '__' + url
            selection_nodes[selection_key] = {
                'id': selection_key,
                'type': 'selection',
                'size': 5,
                'groupName': '',
                'timestamps': [ts],
                'userNames': [user_name],
                'userIds': [user_id],
                'data': selection_text
            }
            edges.append((browse_key, selection_key))

    nodes.update(selection_nodes)

    if len(userlist) == 1 and trail != '*':
        nodes = addUrlRankstoNodes(org,nodes,userlist[0],trail,domain=domain)

    return {'nodes':nodes,'edges':edges}
Example #3
0
def getOculusForensicGraph(org,
                           startdate,
                           enddate,
                           userlist=[],
                           trail='*',
                           domain=''):
    """
    Collect the browse path and extracted entities for the forensic graph.

    Queries memex_sotera.datawake_data for the posts matching the filters
    (grouped by url, ordered by ascending timestamp), looks up the entities
    extracted from those urls and turns each entity value into an entity
    dict. 'email' and 'phone' entities get type specific keys; every other
    type except 'info' is treated as a website, and its value is also used
    as a url for the lookahead queries.

    Parameters:
        org       -- organization name; upper-cased before querying
        startdate -- lower bound compared to unix_timestamp(ts), '' for none
        enddate   -- upper bound compared to unix_timestamp(ts), '' for none
        userlist  -- user ids to restrict to; empty means no user filter
                     (the list is only read, never mutated)
        trail     -- trail to restrict to, '*' for no trail filter
        domain    -- domain value used in the query and the domain lookahead

    Returns a dict with keys:
        'browsePath'              -- {post id: {id, url, timestamp, subdomain,
                                     domain, suffix}} for posts that yielded
                                     at least one entity
        'entities'                -- list of entity dicts
        'lookaheadFeatures'       -- entities extracted from the website
                                     values, with {} for urls that have none;
                                     [] when there are no website values
        'domainLookaheadFeatures' -- domain entities for the website values;
                                     [] when there are no website values
    """
    entityDataConnector.close()
    org = org.upper()

    command = """
      SELECT id,unix_timestamp(ts) as ts,url
      FROM memex_sotera.datawake_data
      WHERE org=%s AND domain=%s
      """
    params = [org, domain]

    # add the user list filter if given
    if len(userlist) > 0:
        # The placeholders go into the SQL text and the ids into params;
        # both must have one entry per user.
        placeholders = ','.join(['%s'] * len(userlist))
        command = command + " AND "
        command = command + "  userId in (" + placeholders + ") "
        params.extend(userlist)

    # add the trail filter
    if trail != '*':
        command = command + " AND trail = %s"
        params.append(trail)

    # add the time filter to the query
    if startdate != '' and enddate != '':
        command = command + " AND unix_timestamp(ts) >= %s and unix_timestamp(ts) <= %s "
        params.append(startdate)
        params.append(enddate)
    elif startdate != '':
        command = command + " AND unix_timestamp(ts) >= %s "
        params.append(startdate)
    elif enddate != '':
        command = command + "  AND unix_timestamp(ts) <= %s "
        params.append(enddate)

    command = command + " GROUP BY url ORDER BY ts asc "

    db_rows = datawake_mysql.dbGetRows(command, params)
    urls = [row[2] for row in db_rows]
    extracted_features = entityDataConnector.get_extracted_entities_from_urls(
        urls)

    browsePath = {}
    adj_urls = set()
    entities = []
    for (post_id, ts, url) in db_rows:
        if url not in extracted_features:
            continue
        for entity_type, entity_values in extracted_features[url].items():
            if entity_type == "info":
                continue
            for entity_value in entity_values:
                # A post joins the browse path the first time it yields an entity.
                if post_id not in browsePath:
                    ext = tldextract.extract(url)
                    browsePath[post_id] = {
                        'id': post_id,
                        'url': url,
                        'timestamp': ts,
                        'subdomain': ext.subdomain,
                        'domain': ext.domain,
                        'suffix': ext.suffix
                    }

                entity = {
                    'id': post_id,
                    'type': entity_type,
                    'value': entity_value
                }
                if entity_type == 'email':
                    # NOTE(review): assumes the value contains '@'; a value
                    # without one raises IndexError here -- confirm upstream.
                    emailPieces = entity_value.split('@')
                    entity['user_name'] = emailPieces[0]
                    # tldextract is fed a url, so wrap the host part in one.
                    emailExt = tldextract.extract('mailto://' + emailPieces[1])
                    entity['domain'] = emailExt.domain
                    entity['subdomain'] = emailExt.subdomain
                elif entity_type == 'phone':
                    # NOTE(review): for a 10 character value this keeps
                    # characters 1-3, not 0-2 -- confirm that this is the
                    # intended area code.
                    if len(entity_value) == 10:
                        entity['area_code'] = entity_value[1:4]
                else:
                    # Anything else is handled as a website url.
                    adj_urls.add(entity_value)
                    webExt = tldextract.extract(entity_value)
                    entity['subdomain'] = webExt.subdomain
                    entity['domain'] = webExt.domain
                    entity['suffix'] = webExt.suffix

                entities.append(entity)

    # Get all the lookahead features
    if len(adj_urls) > 0:
        lookaheadFeatures = entityDataConnector.get_extracted_entities_from_urls(
            adj_urls)

        # add place holders for urls with no extracted data
        for adj_url in adj_urls:
            if adj_url not in lookaheadFeatures:
                lookaheadFeatures[adj_url] = {}

        domainLookaheadFeatures = entityDataConnector.get_extracted_domain_entities_from_urls(
            domain, adj_urls)
    else:
        lookaheadFeatures = []
        domainLookaheadFeatures = []

    entityDataConnector.close()
    return {
        'browsePath': browsePath,
        'entities': entities,
        'lookaheadFeatures': lookaheadFeatures,
        'domainLookaheadFeatures': domainLookaheadFeatures
    }
Example #4
0
def getBrowsePathEdges(org,startdate,enddate,userlist=[],trail='*',domain=''):
    """
    Build the browse path graph (url nodes and visit edges) for an org/domain.

    Reads the matching rows of datawake_data ordered by userId and
    timestamp. Every distinct url becomes a node that accumulates the
    timestamps, user names, user ids, post ids and trails of its visits,
    plus the hit count of that url within the org/domain. Two consecutive
    rows produce an edge (previous url, current url) when the urls differ,
    both rows belong to the same user and the current url does not contain
    'chrome://newtab/'.

    Parameters:
        org       -- organization name; upper-cased before querying
        startdate -- lower bound compared to unix_timestamp(ts), '' for none
        enddate   -- upper bound compared to unix_timestamp(ts), '' for none
        userlist  -- user ids to restrict to; empty means no user filter
                     (the list is only read, never mutated)
        trail     -- trail to restrict to, '*' for no trail filter
        domain    -- domain value used in the query

    Returns a dict {'nodes': {url: node dict}, 'edges': [(url, url), ...]}.
    When exactly one user is given and trail is not '*', the nodes are
    passed through addUrlRankstoNodes before being returned.
    """
    # A single-argument print emits the same text under the print statement
    # and the print function.
    print('getBrowsePathEdges( %s , %s , %s )' % (startdate, enddate, userlist))
    org = org.upper()
    command = """SELECT unix_timestamp(t1.ts) as ts, t1.url,hits,userName,userId,id,trail
                 FROM datawake_data as t1 LEFT JOIN (select url,count(url) as hits from datawake_data WHERE org = %s and domain = %s group by url ) as t2 ON t1.url = t2.url
                 WHERE t1.org = %s and t1.domain = %s
              """
    commandArgs = [org,domain,org,domain]

    # add the time filter to the query
    if startdate != '' and enddate != '':
        command = command + " AND unix_timestamp(t1.ts) >= %s and unix_timestamp(t1.ts) <= %s "
        commandArgs.append(startdate)
        commandArgs.append(enddate)
    elif startdate != '':
        command = command +" AND unix_timestamp(t1.ts) >= %s "
        commandArgs.append(startdate)
    elif enddate != '':
        command = command + "  AND unix_timestamp(t1.ts) <= %s "
        commandArgs.append(enddate)

    # add the user filter
    if len(userlist) > 0:
        placeholders = ','.join(['%s'] * len(userlist))
        command = command +" AND "
        command = command + "  userId in ("+placeholders+") "
        commandArgs.extend(userlist)

    # add the trail filter
    if trail != '*':
        command = command +" AND "
        command = command + " trail = %s"
        commandArgs.append(trail)

    command = command + " ORDER BY userId,t1.ts asc"
    rows = datawake_mysql.dbGetRows(command,commandArgs)

    edges = []
    nodes = {}
    previous = None  # (url, userId) of the preceding row, None before the first row
    # The row's trail gets its own name so the `trail` argument survives
    # for the ranking step at the end of the function.
    for (ts,url,hits,username,userId,postId,rowTrail) in rows:
        if rowTrail is None or rowTrail.strip() == '': rowTrail = "default"

        if url not in nodes:
            nodes[url] = {'id':url,
                          'type':'browse path ',
                          'size':10,
                          'timestamps':[],
                          'hits':0,
                          'userNames':[],
                          'userIds':[],
                          'postIds':[],
                          'trails':[]
            }
        node = nodes[url]
        node['timestamps'].append(ts)
        node['hits'] = hits
        node['userNames'].append(username)
        node['userIds'].append(userId)
        node['postIds'].append(postId)
        node['trails'].append(rowTrail)

        # Link consecutive visits of the same user, ignoring reloads of the
        # same url and jumps to a new tab page.
        if previous is not None:
            (prevUrl,prevUserId) = previous
            if prevUrl != url and 'chrome://newtab/' not in url and prevUserId == userId:
                edges.append((prevUrl,url))
        previous = (url,userId)

    # set group name from each node: the host part of the url, or 'n/a'
    # when the key has no '//'. A separate name keeps the `domain` argument
    # intact.
    for key,value in nodes.items():
        groupName = 'n/a'
        if '//' in key:  groupName = key.split('/')[2]
        value['groupName'] = groupName

    if len(userlist) == 1 and trail != '*':
        nodes = addUrlRankstoNodes(org,nodes,userlist[0],trail,domain=domain)

    return {'nodes':nodes,'edges':edges}
Example #5
0
def getBrowsePathEdges(org,startdate,enddate,userlist=[],trail='*',domain=''):
    """
    Build the browse path graph (url nodes and visit edges) for an org/domain.

    Reads the matching rows of datawake_data ordered by userId and
    timestamp. Every distinct url becomes a node that accumulates the
    timestamps, user names, user ids, post ids and trails of its visits,
    plus the hit count of that url within the org/domain. Two consecutive
    rows produce an edge (previous url, current url) when the urls differ,
    both rows belong to the same user and the current url does not contain
    'chrome://newtab/'.

    Parameters:
        org       -- organization name; upper-cased before querying
        startdate -- lower bound compared to unix_timestamp(ts), '' for none
        enddate   -- upper bound compared to unix_timestamp(ts), '' for none
        userlist  -- user ids to restrict to; empty means no user filter
                     (the list is only read, never mutated)
        trail     -- trail to restrict to, '*' for no trail filter
        domain    -- domain value used in the query

    Returns a dict {'nodes': {url: node dict}, 'edges': [(url, url), ...]}.
    When exactly one user is given and trail is not '*', the nodes are
    passed through addUrlRankstoNodes before being returned.
    """
    # A single-argument print emits the same text under the print statement
    # and the print function.
    print('getBrowsePathEdges( %s , %s , %s )' % (startdate, enddate, userlist))
    org = org.upper()
    command = """SELECT unix_timestamp(t1.ts) as ts, t1.url,hits,userName,userId,id,trail
                 FROM datawake_data as t1 LEFT JOIN (select url,count(url) as hits from datawake_data WHERE org = %s and domain = %s group by url ) as t2 ON t1.url = t2.url
                 WHERE t1.org = %s and t1.domain = %s
              """
    commandArgs = [org,domain,org,domain]

    # add the time filter to the query
    if startdate != '' and enddate != '':
        command = command + " AND unix_timestamp(t1.ts) >= %s and unix_timestamp(t1.ts) <= %s "
        commandArgs.append(startdate)
        commandArgs.append(enddate)
    elif startdate != '':
        command = command +" AND unix_timestamp(t1.ts) >= %s "
        commandArgs.append(startdate)
    elif enddate != '':
        command = command + "  AND unix_timestamp(t1.ts) <= %s "
        commandArgs.append(enddate)

    # add the user filter
    if len(userlist) > 0:
        placeholders = ','.join(['%s'] * len(userlist))
        command = command +" AND "
        command = command + "  userId in ("+placeholders+") "
        commandArgs.extend(userlist)

    # add the trail filter
    if trail != '*':
        command = command +" AND "
        command = command + " trail = %s"
        commandArgs.append(trail)

    command = command + " ORDER BY userId,t1.ts asc"
    rows = datawake_mysql.dbGetRows(command,commandArgs)

    edges = []
    nodes = {}
    previous = None  # (url, userId) of the preceding row, None before the first row
    # The row's trail gets its own name so the `trail` argument survives
    # for the ranking step at the end of the function.
    for (ts,url,hits,username,userId,postId,rowTrail) in rows:
        if rowTrail is None or rowTrail.strip() == '': rowTrail = "default"

        if url not in nodes:
            nodes[url] = {'id':url,
                          'type':'browse path ',
                          'size':10,
                          'timestamps':[],
                          'hits':0,
                          'userNames':[],
                          'userIds':[],
                          'postIds':[],
                          'trails':[]
            }
        node = nodes[url]
        node['timestamps'].append(ts)
        node['hits'] = hits
        node['userNames'].append(username)
        node['userIds'].append(userId)
        node['postIds'].append(postId)
        node['trails'].append(rowTrail)

        # Link consecutive visits of the same user, ignoring reloads of the
        # same url and jumps to a new tab page.
        if previous is not None:
            (prevUrl,prevUserId) = previous
            if prevUrl != url and 'chrome://newtab/' not in url and prevUserId == userId:
                edges.append((prevUrl,url))
        previous = (url,userId)

    # set group name from each node: the host part of the url, or 'n/a'
    # when the key has no '//'. A separate name keeps the `domain` argument
    # intact.
    for key,value in nodes.items():
        groupName = 'n/a'
        if '//' in key:  groupName = key.split('/')[2]
        value['groupName'] = groupName

    if len(userlist) == 1 and trail != '*':
        nodes = addUrlRankstoNodes(org,nodes,userlist[0],trail,domain=domain)

    return {'nodes':nodes,'edges':edges}
Example #6
0
def getOculusForensicGraph(org,startdate,enddate,userlist=[],trail='*',domain=''):
    """
    Collect the browse path and extracted entities for the forensic graph.

    Queries memex_sotera.datawake_data for the posts matching the filters
    (grouped by url, ordered by ascending timestamp), looks up the entities
    extracted from those urls and turns each entity value into an entity
    dict. 'email' and 'phone' entities get type specific keys; every other
    type except 'info' is treated as a website, and its value is also used
    as a url for the lookahead queries.

    Parameters:
        org       -- organization name; upper-cased before querying
        startdate -- lower bound compared to unix_timestamp(ts), '' for none
        enddate   -- upper bound compared to unix_timestamp(ts), '' for none
        userlist  -- user ids to restrict to; empty means no user filter
                     (the list is only read, never mutated)
        trail     -- trail to restrict to, '*' for no trail filter
        domain    -- domain value used in the query and the domain lookahead

    Returns a dict with keys:
        'browsePath'              -- {post id: {id, url, timestamp, subdomain,
                                     domain, suffix}} for posts that yielded
                                     at least one entity
        'entities'                -- list of entity dicts
        'lookaheadFeatures'       -- entities extracted from the website
                                     values, with {} for urls that have none;
                                     [] when there are no website values
        'domainLookaheadFeatures' -- domain entities for the website values;
                                     [] when there are no website values
    """
    entityDataConnector.close()
    org = org.upper()

    command = """
      SELECT id,unix_timestamp(ts) as ts,url
      FROM memex_sotera.datawake_data
      WHERE org=%s AND domain=%s
      """
    params = [org,domain]

    # add the user list filter if given
    if len(userlist) > 0:
        # The placeholders go into the SQL text and the ids into params;
        # both must have one entry per user.
        placeholders = ','.join(['%s'] * len(userlist))
        command = command +" AND "
        command = command + "  userId in ("+placeholders+") "
        params.extend(userlist)

    # add the trail filter
    if trail != '*':
        command = command +" AND trail = %s"
        params.append(trail)

    # add the time filter to the query
    if startdate != '' and enddate != '':
        command = command + " AND unix_timestamp(ts) >= %s and unix_timestamp(ts) <= %s "
        params.append(startdate)
        params.append(enddate)
    elif startdate != '':
        command = command +" AND unix_timestamp(ts) >= %s "
        params.append(startdate)
    elif enddate != '':
        command = command + "  AND unix_timestamp(ts) <= %s "
        params.append(enddate)

    command = command + " GROUP BY url ORDER BY ts asc "

    db_rows = datawake_mysql.dbGetRows(command,params)
    urls = [row[2] for row in db_rows]
    extracted_features = entityDataConnector.get_extracted_entities_from_urls(urls)

    browsePath = {}
    adj_urls = set()
    entities = []
    for (post_id,ts,url) in db_rows:
        if url not in extracted_features:
            continue
        for entity_type,entity_values in extracted_features[url].items():
            if entity_type == "info":
                continue
            for entity_value in entity_values:
                # A post joins the browse path the first time it yields an entity.
                if post_id not in browsePath:
                    ext = tldextract.extract(url)
                    browsePath[post_id] = {'id':post_id,
                              'url':url,
                              'timestamp':ts,
                              'subdomain':ext.subdomain,
                              'domain':ext.domain,
                              'suffix':ext.suffix
                    }

                entity = {
                    'id':post_id,
                    'type':entity_type,
                    'value':entity_value
                }
                if entity_type == 'email':
                    # NOTE(review): assumes the value contains '@'; a value
                    # without one raises IndexError here -- confirm upstream.
                    emailPieces = entity_value.split('@')
                    entity['user_name'] = emailPieces[0]
                    # tldextract is fed a url, so wrap the host part in one.
                    emailExt = tldextract.extract('mailto://'+emailPieces[1])
                    entity['domain'] = emailExt.domain
                    entity['subdomain'] = emailExt.subdomain
                elif entity_type == 'phone':
                    # NOTE(review): for a 10 character value this keeps
                    # characters 1-3, not 0-2 -- confirm that this is the
                    # intended area code.
                    if len(entity_value) == 10:
                        entity['area_code'] = entity_value[1:4]
                else:
                    # Anything else is handled as a website url.
                    adj_urls.add(entity_value)
                    webExt = tldextract.extract(entity_value)
                    entity['subdomain'] = webExt.subdomain
                    entity['domain'] = webExt.domain
                    entity['suffix'] = webExt.suffix

                entities.append(entity)

    # Get all the lookahead features
    if len(adj_urls) > 0:
        lookaheadFeatures = entityDataConnector.get_extracted_entities_from_urls(adj_urls)

        # add place holders for urls with no extracted data
        for adj_url in adj_urls:
            if adj_url not in lookaheadFeatures:
                lookaheadFeatures[adj_url] = {}

        domainLookaheadFeatures = entityDataConnector.get_extracted_domain_entities_from_urls(domain,adj_urls)
    else:
        lookaheadFeatures = []
        domainLookaheadFeatures = []

    entityDataConnector.close()
    return {
        'browsePath':browsePath,
        'entities':entities,
        'lookaheadFeatures':lookaheadFeatures,
        'domainLookaheadFeatures':domainLookaheadFeatures
    }