Example #1
def search_email_by_conversation(*path_args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search.search_email_by_conversation(path_args[%s] %s)" %
                (len(path_args), str(path_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    # parse the sender address and the recipient address
    sender_list = parseParamEmailSender(**param_args)
    cherrypy.log("\tsender_list: %s" % str(sender_list))

    recipient_list = parseParamEmailRecipient(**param_args)
    cherrypy.log("\trecipient_list: %s" % str(recipient_list))

    document_uid = parseParamDocumentUID(**param_args)
    cherrypy.log("\tdocument_uid: %s" % str(document_uid))

    document_datetime = parseParamDocumentDatetime(**param_args)
    cherrypy.log("\tdocument_datetime: %s" % str(document_datetime))
    if not document_datetime:
        return tangelo.HTTPStatusCode(
            400,
            "invalid service call - missing mandatory param 'document_datetime'"
        )

    sender_address, recipient_address = parseParamAllSenderAllRecipient(
        **param_args)

    return es_get_conversation(data_set_id, sender_address, recipient_address,
                               start_datetime, end_datetime, size / 2,
                               document_uid, document_datetime)
Example #2
def run(usertoken):
    # Create an empty response object.
    response = {}
    collectionNames = []

    # build a custom Girder header for authenticated access
    girderheader = {'Girder-Token': usertoken}
    print 'girderheader:', girderheader

    # Look through the collections in Girder and return a list of the
    # collections that are in this local Arbor instance.
    girderlocation = 'http://localhost:9000'
    resp = requests.get(girderlocation + '/api/v1/collection', headers=girderheader)

    # nothing particularly interesting here
    #print resp.headers
    #print requests.utils.dict_from_cookiejar(resp.cookies)

    for entry in resp.json():
        collname = entry['name']
        print "found collection:", collname
        collectionNames.append(entry['name'])

    # Pack the results into the response object, and return it.
    response['result'] = collectionNames

    # Return the response object.
    tangelo.log(str(response))
    return json.dumps(response)
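The Girder call above assumes the request succeeds; a slightly more defensive variant (a sketch, keeping the same endpoint and header) checks the status code before decoding the body:

import requests
import tangelo

def list_girder_collections(girderlocation, girderheader):
    # Fail soft if Girder is unreachable or returns an error status.
    resp = requests.get(girderlocation + '/api/v1/collection', headers=girderheader)
    if resp.status_code != 200:
        tangelo.log("girder collection listing failed: HTTP %d" % resp.status_code)
        return []
    return [entry['name'] for entry in resp.json()]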
Example #3
def queryEmail(email):
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            rtn = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return rtn if rtn else []
Example #4
def queryEntity(email):
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_entities_by_id,
                           email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            rtn = [r for r in qry.cursor()]
            return rtn if rtn else []
Example #5
def parseParamAllSenderAllRecipient(**kwargs):
    tangelo.log("parseParamAllSenderAllRecipient(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    sender = kwargs.get('sender', '').split(",")[0]
    recipient = kwargs.get('recipient', '').split(",")

    # Filter with truthiness rather than identity: "x is not ''" only worked
    # by accident of string interning.
    recipient = [x for x in recipient if x]
    return sender, recipient
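Because the sender takes only the first element of its comma-separated list while the recipient keeps every non-empty element, a call such as the following (illustrative addresses) behaves like this:

sender, recipients = parseParamAllSenderAllRecipient(
    sender="a@x.com,b@x.com", recipient="c@x.com,,d@x.com")
# sender == "a@x.com"; recipients == ["c@x.com", "d@x.com"]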
Example #6
def module_reload_changed(key):
    """
    Reload a module if it has changed since we last imported it.  This is
    necessary if module a imports script b, script b is changed, and then
    module c asks to import script b.

    :param key: our key used in the WatchList.
    :returns: True if reloaded.
    """
    imp.acquire_lock()
    try:
        modkey = module_sys_modules_key(key)
        if not modkey:
            return False
        found = None
        for second in WatchList:
            secmodkey = module_sys_modules_key(second)
            if secmodkey and sys.modules[modkey] == sys.modules[secmodkey]:
                found = second
                foundmodkey = secmodkey
                break
        if not found:
            return False
        filemtime = module_getmtime(WatchList[found]["file"])
        filemtime = latest_submodule_time(found, filemtime)
        if filemtime > WatchList[found]["time"]:
            tangelo.log("Reloaded %s" % found)
            reload_including_local(sys.modules[foundmodkey])
            for second in WatchList:
                if WatchList[second]["file"] == WatchList[found]["file"]:
                    WatchList[second]["time"] = filemtime
    finally:
        imp.release_lock()
    return True
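module_reload_changed reads two fields from each WatchList entry, so the registry it assumes looks roughly like the sketch below (hypothetical helper; the real WatchList is populated elsewhere, and module_getmtime presumably wraps os.path.getmtime):

import os

WatchList = {}

def watch_module(key, path):
    # module_reload_changed() compares entry["time"] against the file's
    # current modification time to decide whether to reload.
    WatchList[key] = {"file": path, "time": os.path.getmtime(path)}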
Example #7
def scrape_page(team_id, domain_id, trail_id, url, content, user_email):
    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')

    connector = factory.get_entity_data_connector()

    # blacklist of pages to not extract data from
    blacklist = config.get_extraction_blacklist()
    if urlparse(url).netloc not in blacklist:
        (features,errors) = extractors.extractAll(content)
        for error in errors:
            tangelo.log("FEATURE EXTRACTION ERROR: "+error)

        for type,values in features.iteritems():
            connector.insert_entities(url,type,values)
            if len(values) > 0:
                features_in_domain = connector.get_domain_entity_matches(domain_id,type,values)
                if len(features_in_domain) > 0:
                    tangelo.log("INSERTING DOMAIN ENTITIES")
                    tangelo.log(type)
                    connector.insert_domain_entities(str(domain_id),url, type, features_in_domain)
        # we also don't want to export blacklisted pages.
        tangelo.log("Calling export")
        export_to_services(domain_id, team_id, trail_id, url, content, user_email, features)
    else:
        tangelo.log("Url: %s IN blacklist" % url)

    id = db.addBrowsePathData(team_id, domain_id, trail_id, url, user_email)

    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=id, count=count)
    return json.dumps(result)
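Note that only the hostname is compared against the extraction blacklist, so paths and query strings never defeat the check; for example (illustrative host):

from urlparse import urlparse  # Python 2, as in the listing

blacklist = set(['tracker.example.com'])
url = 'http://tracker.example.com/page?q=1'
print urlparse(url).netloc in blacklist  # True -> the page is skipped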
Example #8
def run(host, database):
    # Create an empty response object.
    response = {}
    collectionNames = ['select a dataset']

    # Look through the collections in the given database and return the names
    # of all collections that match the seeds naming profile, i.e. collection
    # names that begin with "seeds_".

    client = MongoClient(host, 27017)
    db = client[database]
    # get a list of all collections (excluding system collections)
    collection_list = db.collection_names(False)
    for coll in collection_list:
        # include only the seeds collections
        if coll[:6] == 'seeds_':
            #print "found seeds:", coll
            collectionNames.append(coll)

    client.close()

    # Pack the results into the response object, and return it.
    response['result'] = collectionNames

    # Return the response object.
    tangelo.log(str(response))
    return json.dumps(response)
Example #9
    def __init__(self,
                 bus,
                 cfg_file=None,
                 logfile=None,
                 pidfile=None,
                 webroot=None,
                 hostname=None,
                 port=None):
        SimplePlugin.__init__(self, bus)

        self.finalized = False
        self.pid = os.getpid()
        self.filename = StatusFile.status_filename(self.pid)

        tangelo.log("here")
        # Capture the constructor arguments by name, and validate before
        # stringifying: str(None) would otherwise mask a missing argument.
        self.status = {
            k: v
            for k, v in zip(StatusFile.fields, map(eval, StatusFile.fields))
        }
        for k, v in self.status.iteritems():
            if v is None:
                raise TypeError("argument '%s' cannot be None" % (k))
        self.status = {k: str(v) for k, v in self.status.iteritems()}
        self.status["pid"] = str(self.pid)
        tangelo.log("there")
Example #10
def scrape_page(html, url, userId, userName, trail, domain, org):
    #tangelo.log('USER NAME: ' + userName)
    domain = domain.encode('utf-8')
    org = org.encode('utf-8')
    html = urllib.unquote(html).encode('utf-8')
    url = url.encode('utf-8')

    connector = factory.get_entity_data_connector()
    (features,errors) = extractors.extractAll(html)
    tangelo.log(features)
    for type,values in features.iteritems():
        connector.insert_entities(url,type,values)
        #for value in values:
        #    tangelo.log("EXTRACTED: "+type+"\t"+value)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(domain,type,values)
            if len(features_in_domain) > 0:
                connector.insert_domain_entities(domain,url, type, features_in_domain)
                #tangelo.log("EXTRACTED "+str(len(features_in_domain))+" DOMAIN FEATURES")

    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: "+error)

    id = db.addBrowsePathData(org, url, userId, userName, trail, domain=domain)
    # get number of times this url appears in the database
    count = db.getUrlCount(org, url, domain=domain)
    result = dict(id=id, count=count)

    #tangelo.log("POSTED url:" + url + "  return: " + str(result))
    return json.dumps(result)
Example #11
def search_email_by_community(*args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_community(args: %s kwargs: %s)" %
                (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **param_args)
    community = nth(args, 0, '')

    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing data_set_id")
    if not community:
        return tangelo.HTTPStatusCode(400,
                                      "invalid service call - missing sender")

    email_addrs = parseParam_email_addr(**param_args)

    qs = parseParamTextQuery(**param_args)

    return es_get_all_email_by_community(data_set_id, community, email_addrs,
                                         qs, start_datetime, end_datetime,
                                         size)
Example #12
def getRankedAddresses(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getRankedAddresses(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    # TODO - reminder no 'qs' here set to ''
    # qs = parseParamTextQuery(**kwargs)
    qs = ''

    # TODO this needs to come from UI
    size = size if size > 500 else 2500

    ranked_addresses = get_ranked_email_address_from_email_addrs_index(data_set_id, start_datetime, end_datetime, size)
    top_address_list = []
    for i, email_address in enumerate(ranked_addresses["emails"]):
        graph = es_get_all_email_by_address(data_set_id, email_address[0], qs, start_datetime, end_datetime, size)

        top_address_list.append({
            "address_search_url_path": email_address[0],
            "parameters": kwargs,
            "search_results": {
                "mail_sent_count": email_address[6],
                "mail_received_count": email_address[5],
                "mail_attachment_count": email_address[7],
                "query_matched_count": graph["query_hits"],
                "associated_count": len(graph["graph"]["nodes"])
            },
            "TEMPORARY_GRAPH": graph
        })

    return {"top_address_list": top_address_list}
Example #13
def es_get_exif_emails(data_set_id, size):
    tangelo.log("es_geo.es_get_exif_emails()")

    emails_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=_geo_exif_query())
    tangelo.log("es_geo.es_get_exif_emails(total document hits = %s)" % emails_resp["hits"]["total"])
    docs = [hit["_source"] for hit in emails_resp["hits"]["hits"]]
    return {"total": emails_resp["hits"]["total"], "exif_docs": docs}
Example #14
def _build_graph_for_emails(index, emails, query_hits):
    nodes = []
    edge_map = {}
    addr_index = {}

    total = count(index, "email_address")
    print total

    for email in emails:
        from_addr = email["from"]
        if from_addr not in _EMAIL_ADDR_CACHE[index]:
            tangelo.log("WARNING: From email address not found in cache <%s>" % email)
            continue

        if from_addr not in addr_index:
            nodes.append(_map_node(_EMAIL_ADDR_CACHE[index][from_addr], total))
            addr_index[from_addr] = len(nodes) - 1
        for rcvr_addr in email["to"] + email["cc"] + email["bcc"]:
            if rcvr_addr not in _EMAIL_ADDR_CACHE[index]:
                tangelo.log("WARNING: RCVR email address not found in cache <%s>" % rcvr_addr)
                continue

            if rcvr_addr not in addr_index:
                nodes.append(_map_node(_EMAIL_ADDR_CACHE[index][rcvr_addr], total))
                addr_index[rcvr_addr] = len(nodes) - 1
            # TODO reduce by key instead of mapping? src->target and sum on value
            edge_key = from_addr + "#" + rcvr_addr
            if edge_key not in edge_map:
                edge_map[edge_key] = {"source": addr_index[from_addr], "target": addr_index[rcvr_addr], "value": 1}
            else:
                edge_map[edge_key]["value"] += 1

    return {"graph": {"nodes": nodes, "links": edge_map.values()},
            "rows": [_map_emails_to_row(email) for email in emails],
            "query_hits": query_hits}
Example #15
def parseParamEmailAddress(**kwargs):
    tangelo.log("parseParamEmailAddress(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    email_regex = re.compile("[^@]+@[^@]+\\.[^@]+")
    key_list = [k for k in kwargs.keys() if email_regex.match(k)]
    tangelo.log("\tkey_list[] = %s" % str(key_list))

    return key_list
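Since the regex is applied to the keyword names rather than their values, only the kwargs keys that look like email addresses survive; for example (illustrative arguments):

keys = parseParamEmailAddress(**{"alice@example.com": "", "size": "10"})
# keys == ["alice@example.com"]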
Example #16
def get_top_email_by_text_query(data_set_id, qs, start_datetime, end_datetime,
                                size):

    if not qs:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing search term(s)")

    query = _build_email_query(qs=qs,
                               date_bounds=(start_datetime, end_datetime))
    tangelo.log("es_search.get_top_email_by_text_query(query: %s)" % (query))

    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"],
                                    results["total"])

    # Get attachments for community
    query = _build_email_query(qs=qs,
                               date_bounds=(start_datetime, end_datetime),
                               attachments_only=True)
    tangelo.log("es_search.get_top_email_by_text_query(attachment-query: %s)" %
                (query))
    attachments = _query_email_attachments(data_set_id, size, query)
    graph["attachments"] = attachments

    return graph
Example #17
def query(data):
    url = data['url']
    max_results_per_node = int(data['mrpn'])
    indd = data['index']
    search_terms = data['search_terms']
    es = Elasticsearch([url])
    if esauth.get(url) is not None:
        cred = esauth[url]
        tangelo.log('http://' + cred + '@' + url)
        es = Elasticsearch(['http://' + cred + '@' + url])
    ind = indd
    rr = []
    num = 0
    for t in search_terms:
        if t['type'] in ('selection', 'phone', 'email', 'info'):
            num_to_search = t['id']
            if t['type'] == 'selection':
                num_to_search = t['data']
            if t['type'] == 'info':
                num_to_search = t['id'].split('->')[1].strip()
            results = es.search(index=ind,
                                body={"size": max_results_per_node,
                                      "fields": ["_index", "_type", "_id"],
                                      "query": {"match_phrase": {"_all": num_to_search}}})
            num += results['hits']['total']
            for hit in results['hits']['hits']:
                rr.append({'nid': t['id'], 'search_term': num_to_search, 'eid': hit['_id'],
                           'itype': hit['_type'], 'jindex': ind, 'url': url})

    return json.dumps({'num': num, 'hits': rr})
Example #18
def search_email_by_topic(*args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_topic(args: %s kwargs: %s)" %
                (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **param_args)

    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing data_set_id")

    if not param_args.get("topic_index"):
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing topic_index")
    topic = parseParamTopic(**param_args)

    email_addrs = parseParam_email_addr(**param_args)

    qs = parseParamTextQuery(**param_args)

    return es_get_all_email_by_topic(data_set_id,
                                     topic=topic,
                                     email_addrs=email_addrs,
                                     qs=qs,
                                     start_datetime=start_datetime,
                                     end_datetime=end_datetime,
                                     size=size)
Example #19
def parseParamEmailIds(**kwargs):
    tangelo.log("parseParamEmailIds(kwargs[%s] %s)" %
                (len(kwargs), str(kwargs)))
    # Default to an empty string so a missing parameter yields an empty list
    # instead of an AttributeError.
    value = kwargs.get('email_ids', '')
    email_ids = value.split(",")

    return [x for x in email_ids if x]
Example #20
def getHourlyBrowsePathCounts(org, users, trail, domain='default'):
    sql = 'SELECT (unix_timestamp(ts) DIV 3600)*3600  as group_hour, count(1) from datawake_data where org = %s AND domain = %s '
    params = [org.upper(), domain]
    if trail != '*' and trail != '':
        sql = sql + ' AND trail = %s '
        params.append(trail)
    if len(users) > 0:
        param_string = ','.join(['%s' for i in range(len(users))])
        sql = sql + ' AND userId in (' + param_string + ') '
        params.extend(users)
    sql = sql + " GROUP BY group_hour"

    tangelo.log(sql)
    tangelo.log(str(params))
    rows = dbGetRows(sql, params)
    result = []
    delta = 3600
    if len(rows) > 0:
        curr = rows[0][0]
    for row in rows:
        if row[0] is None:
            continue
        print 'row ', row
        dt = row[0]
        while (dt - curr > delta):
            curr = curr + delta
            result.append({'ts': curr, 'count': 0})
        result.append({'ts': dt, 'count': row[1]})
        curr = dt

    # add one hour
    if len(result) > 0:
        curr = curr + 3600
        result.append({'ts': curr, 'count': 0})

    return result
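The while loop zero-fills any missing hours between consecutive rows, so a sparse result set comes back as a dense hourly series; for example (hypothetical rows of (group_hour, count)):

# rows = [(3600, 4), (10800, 2)] produces:
# [{'ts': 3600, 'count': 4},
#  {'ts': 7200, 'count': 0},    # gap filled with a zero count
#  {'ts': 10800, 'count': 2},
#  {'ts': 14400, 'count': 0}]   # trailing hour appended at the end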
Example #21
def get_user(token):
    user = helper.get_user()
    if helper.get_token() != token or user is None:
        user_auth = factory.get_authentication_object(token)
        user = user_auth.get_user_from_token()
        tangelo.log('session.post verified user: ' + str(user))
    return user
Example #22
def get_attachments_by_sender(data_set_id, sender, start_datetime, end_datetime, size):

    # fields= ["id", "dir", "datetime", "from", "tos", "ccs", "bccs", "subject", "attach", "bodysize"]
    # fields= ["id", "datetime", "senders", "tos", "ccs", "bccs", "subject", "attachments.filename"]
    # body={"filter":{"exists":{"field":"attachments"}}, "query":{"match":{"senders":sender}}}

    body = _build_email_query(sender_addrs=[sender], date_bounds=(start_datetime, end_datetime), attachments_only=True)
    tangelo.log("get_attachments_by_sender.Query %s" % body)

    attachments_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=body)

    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        attachment_entry = [_source["id"],
                            "PLACEHOLDER",
                            _source["datetime"],
                            _source.get("senders","")[0],
                            ';'.join(_source.get("tos","")),
                            ';'.join(_source.get("ccs","")),
                            ';'.join(_source.get("bccs","")),
                            _source.get("subject","")]
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return {"sender":sender, "email_attachments":email_attachments}
Example #23
def get_entity_histogram(index,
                         type,
                         email_addrs=[],
                         qs='',
                         topic_score=None,
                         date_bounds=None,
                         entity_agg_size=10):
    tangelo.log("get_entity_histogram(index=%s, type=%s)" % (index, type))
    body = entity_histogram_query(email_addrs=email_addrs,
                                  qs=qs,
                                  topic_score=topic_score,
                                  date_bounds=date_bounds,
                                  entity_agg_size=entity_agg_size)

    tangelo.log("get_entity_histogram: query = %s" % body)

    resp = es().search(index=index, doc_type=type, body=body)
    return sorted([
        dict(d, **{"type": entity_type})
        for entity_type in ("location", "organization", "person", "misc")
        for d in resp["aggregations"]["filtered_entity_agg"][entity_type]["buckets"]
    ],
                  key=lambda d: d["doc_count"],
                  reverse=True)
Example #24
def createResults(field, args_array):

    # text search
    if field.lower() not in ["email", "entity"]:
        text = head(args_array)
        if text:
            tangelo.log("text search : %s" % text)
            es = Elasticsearch()
            res = es.search(index="newman", doc_type="emails", size=1000, q=text, body={"fields": ["_id"], "query": {"match_all": {}}})

            ingestESTextResults(jsonGet(['hits', 'hits'], res, []))

    node_vals = getNodeVals(field, args_array)
    colors = {k: v.get("group_id") for k, v in node_vals.iteritems()}

    for k,v in node_vals.iteritems():
        node_vals[k]["color"] = colors.get(k)
    emails = sorted(getEmails(colors, field, args_array), key=lambda x: str(x.get('datetime')))
    idx_lookup = {}
    nodes = []

    for i, o in enumerate(node_vals.iteritems()):
        k, v = o
        idx_lookup[k] = i
        #nodes.append({"name": k, "num": v.get("num"), "rank": v.get("rank"), "group": v.get("color"), "community": colors.get(v.get("comm"))})
        nodes.append({"name": k, "num": v.get("num"), "rank": v.get("rank"), "group": v.get("color"), "community": v.get("comm_id")})
    edges = getEdges(idx_lookup, field, args_array)

    results = {'rows': emails, 'graph': {'nodes': nodes, 'links': edges}}

    return results
Example #26
def getAttachFileType(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getAttachFileType(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    top_count = int(size)

    attach_type = urllib.unquote(nth(args, 0, ''))
    if not attach_type:
        attach_type = 'all'  # hack for now

    email_address_list = parseParamEmailAddress(**kwargs)

    if not email_address_list:
        file_types = get_top_attachment_types(data_set_id, date_bounds=(start_datetime, end_datetime), num_top_attachments=top_count)[:top_count]
    else:
        # TODO: implement populating the attachment file-types under individual email-accounts; simulate result for now
        file_types = get_top_attachment_types(data_set_id, date_bounds=(start_datetime, end_datetime), num_top_attachments=top_count)[:top_count]

    result = {
        "account_id": data_set_id,
        "data_set_id": data_set_id,
        "account_start_datetime": start_datetime,
        "account_end_datetime": end_datetime,
        "types": file_types
    }

    return result
Example #27
def getEdges(node_idx, field, args_array):
    with newman_connector() as read_cnx:
        tangelo.log("start edge query")
        with execute_query(*edgeQueryObj(read_cnx.conn(), field, args_array)) as qry:    
            tangelo.log("edges : %s" % qry.stmt)
            return [{"source": node_idx.get(from_), "target": node_idx.get(to_), "value": int(weight)} 
                    for from_, to_, weight in qry.cursor()]
Example #28
def upload_file(*args, **kwargs):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        domain_file = kwargs.get("file_upload")
        domain_name = kwargs.get("name")
        domain_description = kwargs.get("description")
        if not db.domain_exists(domain_name):
            if domain_file is not None:
                tangelo.log("read domain file")
                domain_file_lines = domain_file.file.readlines()
                domain_file_lines = map(lambda x: x.strip().replace('\0',''), domain_file_lines)
                db.add_new_domain(domain_name, domain_description)
                rowkeys = []
                for line in domain_file_lines:
                    i = line.index(',')  # split on the first comma
                    type = line[:i]
                    value = line[i+1:]
                    if type[0] == '"' and type[-1] == '"':
                        type = type[1:-1]
                    if value[0] == '"' and value[-1] == '"':
                        value = value[1:-1]
                    rowkeys.append(domain_name + '\0' + type + '\0' + value)
                result = domain_content_connector.add_new_domain_items(rowkeys)
                return json.dumps(dict(success=result))
            else:
                return json.dumps(dict(success=False))
        else:
            return json.dumps(dict(success=False))
    finally:
        domain_content_connector.close()
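Splitting on the first comma only means commas inside the value survive; a quick trace of the parse for an illustrative line:

line = '"phone","555-1234,ext 9"'
i = line.index(',')              # index of the first comma only
type, value = line[:i], line[i+1:]
# type == '"phone"'            -> 'phone' after quote stripping
# value == '"555-1234,ext 9"'  -> '555-1234,ext 9' after quote stripping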
Example #29
def run():
    # Create an empty response object.
    response = {}
    collectionNames = []

    # look through the collections in the ivaan database and return the name of all collections
    # that match the naming profile for tables.  This is matching to see if the collection name
    # begins with "table_cardiac" since it is only returning cardiac studies from the IVAaN database

    connection = Connection('localhost', 27017)
    db = connection['ivaan']
    # get a list of all collections (excluding system collections)
    collection_list = db.collection_names(False)
    for coll in collection_list:
        # if it is a table, then add it to the response
        if coll[:14] == 'table_cardiac_':
            print "found table:", coll
            # don't return the prefix in the project name. Users don't have to know the
            # cardiac project collection names are prepended
            collectionNames.append(coll[14:])

    connection.close()

    # if no projects found at all, return a default name
    if len(collectionNames) == 0:
        collectionNames.append("default")

    # Pack the results into the response object, and return it.
    response['result'] = collectionNames

    # Return the response object.
    tangelo.log(str(response))
    return bson.json_util.dumps(response)
Example #30
def listAllDataSet():

    tangelo.log("datasource.listAllDataSet()")

    # Ignore index keys in ES that are not in the newman_app.conf
    # Find all the indexes that begin with the index loader prefix
    indexes = [
        _index_record(index) for index in index_list() if
        index in data_set_names() or index.startswith(index_creator_prefix())
    ]

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**{})

    email_addrs = get_ranked_email_address_from_email_addrs_index(
        data_set_id, start_datetime, end_datetime, size)["emails"]
    email_addrs = {email_addr[0]: email_addr for email_addr in email_addrs}

    return {
        "data_set_selected": getDefaultDataSetID(),
        "data_sets": indexes,
        "top_hits": {
            "order_by": "rank",
            "email_addrs": email_addrs
        }
    }
Example #31
def get_entities(trail_id):
    tangelo.log('Getting entities for trail: %s' % trail_id)
    entities = {}
    entityList = []
    urls = []
    rows = db.getBrowsePathUrls(trail_id)
    for row in rows:
        urls.append(row['url'])

    entity_data_connector = factory.get_entity_data_connector()
    results = entity_data_connector.get_extracted_entities_from_urls(urls)

    tangelo.log('Got entities')

    for result in results:
        for entityType in results[result]:
            for entityName in results[result][entityType]:
                if entityName in entities:
                    entities[entityName]['pages'] += 1
                else:
                    entities[entityName] = {'type': entityType, 'pages': 1}
    # TODO: either figure out how to map the data or do this differently
    for entity in entities:
        entityList.append({'name': entity, 'type': entities[entity]['type'], 'pages': entities[entity]['pages']})
    return json.dumps(entityList)
Example #32
def es_get_sender_locations(data_set_id, size):
    tangelo.log("es_geo.es_get_sender_locations()")

    emails_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=_geo_xoip_query())
    tangelo.log("es_geo.es_get_sender_locations(total document hits = %s)" % emails_resp["hits"]["total"])
    docs = [_map_geo_response(hit["_source"]) for hit in emails_resp["hits"]["hits"]]
    return {"total": emails_resp["hits"]["total"], "XOIP_locations": docs}
Example #33
def export(cdr):

    dd_url = '%s/%s/%s/' % (conf.get_deepdive_url(), conf.get_deepdive_user(), conf.get_deepdive_repo())

    headers = {'Authorization': 'Token %s' % conf.get_deepdive_token()}
    r = requests.post(dd_url, headers=headers, data=cdr)
    tangelo.log('Sending page to deepdive at: %s' % r.url)
Example #34
def _query_email_attachments(index, size, emails_query):
    tangelo.log("_query_email_attachments.Query %s" % emails_query)

    attachments_resp = es().search(index=index,
                                   doc_type="emails",
                                   size=size,
                                   body=emails_query)

    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        attachment_entry = [
            _source["id"], "PLACEHOLDER", _source["datetime"],
            _source.get("senders", ""), ';'.join(_source.get("tos", "")),
            ';'.join(_source.get("ccs", "")), ';'.join(_source.get("bccs",
                                                                   "")),
            _source.get("subject", "")
        ]
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return email_attachments
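Each hit contributes one row per attachment: the shared entry is copied, the PLACEHOLDER slot is overwritten with the attachment guid, and the filename plus a trailing 0 are appended. Roughly, for an email with two attachments (illustrative values):

# attachment_entry = [id, "PLACEHOLDER", datetime, senders, tos, ccs, bccs, subject]
# yields two rows:
#   [id, guid_1, datetime, ..., subject, "report.pdf", 0]
#   [id, guid_2, datetime, ..., subject, "photo.jpg", 0]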
Example #35
def parseParamPhoneNumbers(**kwargs):
    tangelo.log("parseParamPhoneNumbers(kwargs[%s] %s)" %
                (len(kwargs), str(kwargs)))
    # Default to an empty string so a missing parameter yields an empty list
    # instead of an AttributeError.
    value = kwargs.get('phone_numbers', '')
    phone_numbers = value.split(",")

    return [x for x in phone_numbers if x]
Example #37
def run(tablename):
    # Create an empty response object.
    response = {}
    print "using collection:", tablename

    # No filters are enabled yet, so build an empty query that matches every
    # record in the collection.
    querystring = {}
    print "query to perform:", querystring

    # Execute the query and return the matching records from the collection.
    connection = Connection('localhost', 27017)
    db = connection['polar']
    dataset_collection = db[tablename]

    # Do a find operation with the passed arguments.
    it = dataset_collection.find(querystring)
    results = [x for x in it]

    connection.close()

    print results

    # convert from the local json to a geojson multipoint result

    #latitudes = [30, 30, 30]
    #longitudes = [10, 20, 30]

    # define multipoint geometry
    multipoint = ogr.Geometry(ogr.wkbMultiPoint)

    # create point geometry and add to multipoint geometry
    for i in range(len(results)):
        point = ogr.Geometry(ogr.wkbPoint)
        fixedlocation = convertStringToFloatPoint(results[i]['stationLng'], results[i]['stationLat'])
        point.AddPoint(fixedlocation['lng'], fixedlocation['lat'])
        multipoint.AddGeometry(point)

    # convert geometry to GeoJSON format
    geojson_multipoint = multipoint.ExportToJson()

    # Pack the results into the response object, and return it.
    response['count'] = it.count()
    response['data'] = geojson_multipoint

    response['result'] = 'OK'

    # Return the response object.
    tangelo.log(str(response))
    return bson.json_util.dumps(response)
Example #40
        def __init__(self, *pargs, **kwargs):
            ws4py.websocket.WebSocket.__init__(self, *pargs, **kwargs)

            url = "ws://%s:%d/ws" % (hostname, port)

            tangelo.log("websocket created at %s:%d/%s (proxy to %s)" % (hostname, port, key, url))

            self.client = VTKWebSocketAB(url, self)
Example #41
def parseParamEntity(**kwargs):
    tangelo.log("parseParamEntity(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    entity_dict = {
        k: v.split(",")
        for k, v in kwargs.iteritems() if k.startswith("entities")
    }

    return entity_dict
Example #42
def get(term):
    google_results = search_google(term)
    tangelo.log(google_results)
    onion_results = search_onion(term)
    tangelo.log(onion_results)
    results = google_results + onion_results
    if len(results) != 0:
        return json.dumps(dict(success=True, resultCount=len(results), results=results))
Example #44
def post(action, *args, **kwargs):
    tangelo.log("Comments")
    post_data = json.loads(tangelo.request_body().read(), strict=False)

    def unknown(**kwargs):
        return tangelo.HTTPStatusCode(404, "unknown service call")

    return post_actions.get(action, unknown)(**post_data)
Example #45
def add_trail(trailname, domain, traildescription=u''):
    tangelo.log('datawake_trails POST trailname=%s traildescription=%s domain=%s' % (trailname, traildescription, domain))
    user = helper.get_user()
    org = user.get_org()
    invalid = re.match('^[\w]*(?!:)+$', trailname) is None
    if invalid:
        raise ValueError("Trail names must be alphanumeric and not contain a ':'")
    last_row = db.addTrail(org, trailname, traildescription, user.get_email(), domain=domain)
    return json.dumps(dict(success=last_row >= 0))
Example #46
def getCommunities(*args, **kwargs):
    tangelo.log("getCommunities(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    #top_count = int(urllib.unquote(nth(args, 0, "40")))
    top_count = int(size)

    return {"communities": get_top_communities(data_set_id, date_bounds=(start_datetime, end_datetime), num_communities=top_count)[:top_count]}
Example #48
def post(action, *args, **kwargs):
    body = tangelo.request_body().read()
    post_data = json.loads(body, strict=False)

    def unknown(*args):
        return tangelo.HTTPStatusCode(400, "invalid service call")

    tangelo.log(post_data)
    return post_actions.get(action, unknown)(**post_data)
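Both post() services dispatch on an action name with an unknown() fallback, so post_actions is just a dict mapping action names to callables; a hypothetical registry (the real handlers are not shown in this listing):

import json

def save(**kwargs):
    # Hypothetical handler; simply reports success.
    return json.dumps(dict(success=True))

post_actions = {'save': save}
# post_actions.get('save', unknown)(**post_data) calls save(**post_data);
# any unregistered action falls through to unknown().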
Example #51
def getRankedEmails(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getRankedEmails(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)

    return get_ranked_email_address_from_email_addrs_index(
        data_set_id, start_datetime, end_datetime, size)
Example #52
def service_status(id, type, url, domain_id, team_id, trail_id, status):
    if UseRestAPI:
        ts = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')
        service_status = restPost('DatawakeXmits', dict(xmitId=0, recipientId=id, serviceType=type, datawakeUrl=url, domainId=domain_id, teamId=team_id, trailId=trail_id, xmitStatus=status, ts=ts))
        tangelo.log(service_status)
        return service_status.xmitId
    else:
        sql = 'insert into datawake_xmit (recipient_id, service_type, datawake_url, domain_id, team_id, trail_id, xmit_status, ts) values(%s,%s,%s,%s,%s,%s,%s,sysdate())'
        params = [id, type, url, domain_id, team_id, trail_id, status]
        return dbCommitSQL(sql, params)
Example #53
def save_page_selection(selection, domain, url):
    tangelo.log('savePageSelection url=' + str(url) + ' selection=' + selection + ' domain=' + domain)
    user = helper.get_user()
    org = user.get_org()
    postId = db.get_post_id(url)
    row = db.getBrowsePathData(org, postId, domain)
    row_id = -1
    if row['org'] == org:  # ensure the user is saving a selection to a post from their org
        row_id = db.addSelection(postId, selection)
    return json.dumps(dict(id=row_id))
Example #54
def getNodeVals(field, args_array):
    """
    nodes should be all of the emails an email addr is a part of, and then
    all of the email addrs associated with that set of emails
    """
    with newman_connector() as read_cnx:
        tangelo.log("start node query")
        with execute_query(*nodeQueryObj(read_cnx.conn(), field, args_array)) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            return {item[0]: {'num': int(item[4] + item[5]),
                              'comm_id': item[2],
                              'group_id': item[3],
                              'comm': item[1],
                              'rank': item[6]}
                    for item in qry.cursor()}