Example 1
def getRankedAddresses(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getRankedAddresses(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    # TODO - reminder no 'qs' here set to ''
    # qs = parseParamTextQuery(**kwargs)
    qs = ''

    # TODO this needs to come from UI
    size = size if size > 500 else 2500

    ranked_addresses = get_ranked_email_address_from_email_addrs_index(data_set_id, start_datetime, end_datetime, size)
    top_address_list = []
    for i, email_address in enumerate(ranked_addresses["emails"]):
        graph = es_get_all_email_by_address(data_set_id, email_address[0], qs, start_datetime, end_datetime, size)

        top_address_list.append({
            "address_search_url_path" : email_address[0],
            "parameters" : kwargs,
            "search_results" : {
                "mail_sent_count" : email_address[6],
                "mail_received_count" : email_address[5],
                "mail_attachment_count" : email_address[7],
                "query_matched_count" : graph["query_hits"],
                "associated_count" : len(graph["graph"]["nodes"])
            },
            "TEMPORARY_GRAPH" : graph
        })


    return {"top_address_list" : top_address_list}
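
Tangelo maps query-string parameters onto a handler's kwargs, so a client exercises getRankedAddresses with a plain HTTP GET. A minimal sketch of such a call; the host, port, service path, and parameter names are illustrative guesses based on parseParamDatetime, not the project's documented API:

import requests

# Hypothetical endpoint; Tangelo serves a Python file at its URL path.
url = "http://localhost:8080/newman/getRankedAddresses"
params = {
    "data_set_id": "sample_dataset",   # assumed names mirroring parseParamDatetime(**kwargs)
    "start_datetime": "2001-01-01",
    "end_datetime": "2002-01-01",
    "size": 2500,
}
resp = requests.get(url, params=params)
top_addresses = resp.json()["top_address_list"]   # shape matches the handler's return value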
Example 2
def search_email_by_community(*args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_community(args: %s kwargs: %s)" %
                (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **param_args)
    community = nth(args, 0, '')

    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing data_set_id")
    if not community:
        return tangelo.HTTPStatusCode(400,
                                      "invalid service call - missing sender")

    email_addrs = parseParam_email_addr(**param_args)

    qs = parseParamTextQuery(**param_args)

    return es_get_all_email_by_community(data_set_id, community, email_addrs,
                                         qs, start_datetime, end_datetime,
                                         size)
Example 3
def topic_list(*args, **kwargs):
    category = nth(args, 0, 'all')
    #tangelo.log("category %s" %(category))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)
    tangelo.content_type("application/json")
    return get_categories(data_set_id)
Example 4
def stream_next(key):
    if key not in streams:
        tangelo.http_status(404, "No Such Key")
        return {
            "error": "Stream key does not correspond to an active stream",
            "stream": key
        }
    else:
        # Grab the stream in preparation for running it.
        stream = streams[key]

        # Attempt to run the stream via its next() method - if this
        # yields a result, then continue; if the next() method raises
        # StopIteration, then there are no more results to retrieve; if
        # any other exception is raised, this is treated as an error.
        try:
            return stream.next()
        except StopIteration:
            del streams[key]

            tangelo.http_status(204, "Stream Finished")
            return "OK"
        except:
            del streams[key]
            tangelo.http_status(500, "Streaming Service Exception")
            tangelo.content_type("application/json")
            return tangelo.util.traceback_report(
                error="Caught exception while executing stream service",
                stream=key)
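
A client drives stream_next by polling until the 204 "Stream Finished" status arrives. A rough sketch of that loop, assuming the stream key came from a prior stream-start call and that the service is mounted at /stream (both assumptions for illustration):

import requests

key = "0123456789abcdef"                # hypothetical key from stream_start
base = "http://localhost:8080/stream"   # assumed mount point

while True:
    resp = requests.get("{}/next/{}".format(base, key))
    if resp.status_code == 204:         # server hit StopIteration
        break
    if resp.status_code != 200:         # 404/500 carry a JSON error report
        raise RuntimeError(resp.text)
    print(resp.json())                  # one value yielded by stream.next()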
Example 5
def search_email_by_topic(*args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_topic(args: %s kwargs: %s)" %
                (str(args), str(param_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **param_args)

    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing data_set_id")

    if not param_args.get("topic_index"):
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing topic_index")
    topic = parseParamTopic(**param_args)

    email_addrs = parseParam_email_addr(**param_args)

    qs = parseParamTextQuery(**param_args)

    return es_get_all_email_by_topic(data_set_id,
                                     topic=topic,
                                     email_addrs=email_addrs,
                                     qs=qs,
                                     start_datetime=start_datetime,
                                     end_datetime=end_datetime,
                                     size=size)
Example 6
def getAttachFileType(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getAttachFileType(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    
    top_count = int(size)

    attach_type = urllib.unquote(nth(args, 0, ''))
    if not attach_type:
        attach_type = 'all'  # hack for now

    email_address_list = parseParamEmailAddress(**kwargs)

    if not email_address_list:
        file_types = get_top_attachment_types(data_set_id, date_bounds=(start_datetime, end_datetime), num_top_attachments=top_count)[:top_count]
    else:
        # TODO: implement populating the attachment file-types under individual email-accounts; simulate result for now
        file_types = get_top_attachment_types(data_set_id, date_bounds=(start_datetime, end_datetime), num_top_attachments=top_count)[:top_count]

    result = {
        "account_id": data_set_id,
        "data_set_id": data_set_id,
        "account_start_datetime": start_datetime,
        "account_end_datetime": end_datetime,
        "types": file_types
    }

    return result
Example 7
def get_graph_for_entity(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("entity.get_graph_for_entity(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)
    email_address_list = parseParamEmailAddress(**kwargs)
    entity_dict = parseParamEntity(**kwargs)
    # TODO set from UI
    size = size if size > 500 else 2500

    qs = parseParamTextQuery(**kwargs)

    query = _build_email_query(email_addrs=email_address_list,
                               qs=qs,
                               entity=entity_dict,
                               date_bounds=(start_datetime, end_datetime))
    tangelo.log("entity.get_graph_for_entity(query: %s)" % (query))

    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"],
                                    results["total"])

    # Get attachments for community
    query = _build_email_query(email_addrs=email_address_list,
                               qs=qs,
                               entity=entity_dict,
                               date_bounds=(start_datetime, end_datetime),
                               attachments_only=True)
    tangelo.log("entity.get_graph_by_entity(attachment-query: %s)" % (query))
    attachments = _query_email_attachments(data_set_id, size, query)
    graph["attachments"] = attachments

    return graph
Example 8
def getEmail(*args):
    email = urllib.unquote(nth(args, 0, ''))
    if not email:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")

    tangelo.content_type("application/json")
    return {"email": queryEmail(email), "entities": queryEntity(email)}
Example 9
def last_save(*args):
    tangelo.content_type("application/json")
    saves = list(glob.iglob('{}/*.json'.format(auto_save_dir)))
    if len(saves) > 0:
        f = max(saves, key=os.path.getctime)
        return slurp(f)
    return {'trainings': []}
Example 10
def queryEmail(email):
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            rtn = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return rtn if rtn else []
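
queryEmail and the other database-backed handlers share the newman_connector and execute_query context managers, whose definitions are not part of this listing. A plausible minimal sketch of execute_query, consistent with how it is called here (parameterized statement, a .stmt attribute for logging, a .cursor() accessor, cleanup on exit) and assuming a DB-API driver:

from contextlib import contextmanager

class _Query(object):
    def __init__(self, stmt, cur):
        self.stmt = stmt        # handlers log qry.stmt
        self._cur = cur

    def cursor(self):           # handlers fetch from qry.cursor()
        return self._cur

@contextmanager
def execute_query(conn, stmt, *params):
    # Hypothetical stand-in for the project's helper, not its actual code.
    cur = conn.cursor()
    try:
        cur.execute(stmt, params or None)
        yield _Query(stmt, cur)
    finally:
        cur.close()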
Example 11
def server_save(*args, **kwargs):
    print("inside Server save")
    f = kwargs.get('name')
    data = kwargs.get('data')
    spit("{}/{}".format(user_save_dir, f), json.dumps(data))
    tangelo.content_type("application/json")
    return {'saved': f}
Example 12
def getExportable(*args):
    stmt = (" SELECT id, subject FROM email WHERE exportable='true' ")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [[str(val) for val in row] for row in qry.cursor()]
            return {"emails": rtn}
Example 13
def get_topics_by_query(*args, **kwargs):
    tangelo.content_type("application/json")
    algorithm = kwargs.get('algorithm', 'lingo')
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)
    email_address_list = parseParamEmailAddress(**kwargs)

    # TODO -------------------------------------------------------------------------
    # TODO  REMEMBER TO EVALUATE QUERY TERMS -- VERY IMPORTANT for good clustering!
    # TODO -------------------------------------------------------------------------
    query_terms = ''
    # TODO set from UI
    analysis_field = kwargs.get("analysis_field", "_source.body")
    # TODO set from UI
    num_returned = 20

    clusters = get_dynamic_clusters(data_set_id,
                                    "emails",
                                    email_addrs=email_address_list,
                                    query_terms=query_terms,
                                    topic_score=None,
                                    entity={},
                                    date_bounds=(start_datetime, end_datetime),
                                    cluster_fields=[analysis_field],
                                    cluster_title_fields=["_source.subject"],
                                    algorithm=algorithm,
                                    max_doc_pool_size=500)

    return {"topics": clusters[:num_returned]}
Example 14
def search_email_by_conversation(*path_args, **param_args):
    tangelo.content_type("application/json")
    tangelo.log("search.search_email_by_conversation(path_args[%s] %s)" %
                (len(path_args), str(path_args)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    # parse the sender address and the recipient address
    sender_list = parseParamEmailSender(**param_args)
    cherrypy.log("\tsender_list: %s)" % str(sender_list))

    recipient_list = parseParamEmailRecipient(**param_args)
    cherrypy.log("\trecipient_list: %s)" % str(recipient_list))

    document_uid = parseParamDocumentUID(**param_args)
    cherrypy.log("\tdocument_uid: %s)" % str(document_uid))

    document_datetime = parseParamDocumentDatetime(**param_args)
    cherrypy.log("\tdocument_datetime: %s)" % str(document_datetime))
    if not document_datetime:
        return tangelo.HTTPStatusCode(
            400,
            "invalid service call - missing mandatory param 'document_datetime'"
        )

    sender_address, recipient_address = parseParamAllSenderAllRecipient(
        **param_args)

    return es_get_conversation(data_set_id, sender_address, recipient_address,
                               start_datetime, end_datetime, size / 2,
                               document_uid, document_datetime)
Example 15
def auto_save(*args, **kwargs):
    cherrypy.log("saved")
    f = "session_{}.json".format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
    spit("{}/{}".format(auto_save_dir, f), json.dumps(kwargs))
    remove_old_files()
    tangelo.content_type("application/json")
    return {'saved': f}
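
auto_save and last_save lean on the small spit and slurp file helpers used throughout these services. Their definitions are not shown in this listing; a plausible sketch consistent with the call sites (spit appends text, with a truthy third argument starting the file fresh, as in download's spit(logfile, ..., True); slurp returns a file's full contents):

def spit(path, text, overwrite=False):
    # Hypothetical helper: append text to path, or truncate first when overwrite is set.
    with open(path, 'w' if overwrite else 'a') as f:
        f.write(text)

def slurp(path):
    # Hypothetical helper: read the whole file back.
    with open(path) as f:
        return f.read()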
Example 16
def get_graph_for_entity(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("entity.get_graph_for_entity(args: %s kwargs: %s)" % (str(args), str(kwargs)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    email_address_list = parseParamEmailAddress(**kwargs)
    entity_dict = parseParamEntity(**kwargs)
    # TODO set from UI
    size = size if size > 500 else 2500

    qs = parseParamTextQuery(**kwargs)

    query = _build_email_query(email_addrs=email_address_list, qs=qs, entity=entity_dict, date_bounds=(start_datetime, end_datetime))
    tangelo.log("entity.get_graph_for_entity(query: %s)" % (query))

    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for community
    query = _build_email_query(email_addrs=email_address_list, qs=qs, entity=entity_dict, date_bounds=(start_datetime, end_datetime), attachments_only=True)
    tangelo.log("entity.get_graph_by_entity(attachment-query: %s)" % (query))
    attachments = _query_email_attachments(data_set_id, size, query)
    graph["attachments"] = attachments

    return graph
Example 17
def queryEmail(email):
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            rtn = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return rtn if rtn else []
Example 18
def auto_save(*args, **kwargs):
    cherrypy.log("saved")
    f = "session_{}.json".format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
    spit("{}/{}".format(auto_save_dir, f), json.dumps(kwargs))
    remove_old_files()
    tangelo.content_type("application/json")
    return {'saved': f}
Example 19
def getEmail(*args):
    email = urllib.unquote(nth(args, 0, ''))
    if not email:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")

    tangelo.content_type("application/json")
    return {"email": queryEmail(email), "entities": queryEntity(email)}
Example 20
def getRankedAddresses(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getRankedAddresses(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)
    # TODO - reminder no 'qs' here set to ''
    # qs = parseParamTextQuery(**kwargs)
    qs = ''

    # TODO this needs to come from UI
    size = size if size > 500 else 2500

    ranked_addresses = get_ranked_email_address_from_email_addrs_index(
        data_set_id, start_datetime, end_datetime, size)
    top_address_list = []
    for i, email_address in enumerate(ranked_addresses["emails"]):
        graph = es_get_all_email_by_address(data_set_id, email_address[0], qs,
                                            start_datetime, end_datetime, size)

        top_address_list.append({
            "address_search_url_path": email_address[0],
            "parameters": kwargs,
            "search_results": {
                "mail_sent_count": email_address[6],
                "mail_received_count": email_address[5],
                "mail_attachment_count": email_address[7],
                "query_matched_count": graph["query_hits"],
                "associated_count": len(graph["graph"]["nodes"])
            },
            "TEMPORARY_GRAPH": graph
        })

    return {"top_address_list": top_address_list}
Example 21
def searchStarred(*args, **kwargs):
    tangelo.log("email.searchStarred(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))
    tangelo.content_type("application/json")

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)

    size = size if size > 500 else 2500

    # TODO set from UI
    query_terms = ''
    email_address_list = []

    query = _build_email_query(email_addrs=email_address_list,
                               qs=query_terms,
                               date_bounds=(start_datetime, end_datetime),
                               starred=True)
    tangelo.log("email.searchStarred(query: %s)" % (query))

    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"],
                                    results["total"])

    # Get attachments for community
    query = _build_email_query(email_addrs=email_address_list,
                               qs=query_terms,
                               date_bounds=(start_datetime, end_datetime),
                               attachments_only=True,
                               starred=True)
    tangelo.log("email.searchStarred(attachment-query: %s)" % (query))
    attachments = _query_email_attachments(data_set_id, size, query)
    graph["attachments"] = attachments

    return graph
Example 22
def last_save(*args):
    tangelo.content_type("application/json")
    saves = list(glob.iglob('{}/*.json'.format(auto_save_dir)))
    if len(saves) > 0:
        f = max(saves, key=os.path.getctime)
        return slurp(f)
    return {'trainings': []}
Example 23
def searchStarred(*args, **kwargs):
    tangelo.log("email.searchStarred(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    size = size if size > 500 else 2500

    # TODO set from UI
    query_terms = ''
    email_address_list = []

    query = _build_email_query(email_addrs=email_address_list, qs=query_terms, date_bounds=(start_datetime, end_datetime), starred=True)
    tangelo.log("email.searchStarred(query: %s)" % (query))

    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for community
    query = _build_email_query(email_addrs=email_address_list, qs=query_terms, date_bounds=(start_datetime, end_datetime), attachments_only=True, starred=True)
    tangelo.log("email.searchStarred(attachment-query: %s)" % (query))
    attachments = _query_email_attachments(data_set_id, size, query)
    graph["attachments"] = attachments

    return graph
Example 24
def stream_next(key):
    if key not in streams:
        tangelo.http_status(404, "No Such Key")
        return {"error": "Stream key does not correspond to an active stream",
                "stream": key}
    else:
        # Grab the stream in preparation for running it.
        stream = streams[key]

        # Attempt to run the stream via its next() method - if this
        # yields a result, then continue; if the next() method raises
        # StopIteration, then there are no more results to retrieve; if
        # any other exception is raised, this is treated as an error.
        try:
            return stream.next()
        except StopIteration:
            del streams[key]

            tangelo.http_status(204, "Stream Finished")
            return "OK"
        except:
            del streams[key]

            tangelo.http_status(500, "Streaming Service Exception")
            tangelo.content_type("application/json")

            error_code = tangelo.util.generate_error_code()

            tangelo.util.log_traceback("STREAM", error_code,
                                       "Offending stream key: %s" % (key),
                                       "Uncaught exception while executing service %s" % (tangelo.request_path()))
            return tangelo.util.error_report(error_code)
Example 25
def spacy_save(*args, **kwargs):
    print("inside spacy save")
    f = kwargs.get('name')
    data = kwargs.get('data')
    data_spacy_format = modify_output(data)
    spit("{}/{}".format(user_save_dir, f), json.dumps(data_spacy_format))
    tangelo.content_type("application/json")
    return {'saved': f, 'data': data_spacy_format}
Example 26
def train_spacy(*args, **kwargs):
    print("inside training spacy")
    f = kwargs.get('name')
    data = kwargs.get('data')
    data_spacy_format = modify_output(data)
    train_entity_extractor(data_spacy_format)
    # spit("{}/{}".format(user_save_dir, f), json.dumps(data_spacy_format))
    tangelo.content_type("application/json")
    return {'saved': f}
Example 27
def export_emails_archive(data_set_id, email_ids=["f9c9c59a-7fe8-11e5-bb05-08002705cb99"]):
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")
    # if not email:
    #     return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200): Read timed out. (read timeout=10))
    es = Elasticsearch([{"host" : "10.1.70.143", "port" : 9200}], request_timeout=60)
    # TODO can implement with multiple doc_types and combine attachments in
    emails = es.mget(index=data_set_id, doc_type="emails", body={"docs":[{"_id":id} for id in email_ids]})


    # TODO filename
    filename = "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:

        email = email_source["_source"]

        tarinfo_parent = tarfile.TarInfo(name=email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".json")
        # TODO -- email transformation
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es.mget(index=data_set_id, doc_type="attachments", body={"docs":[{"_id":attch["guid"]} for attch in email["attachments"]]})
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))

                tarinfo_attch = tarfile.TarInfo(email["id"]+"/"+filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()

    return string_buffer.getvalue()
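
export_emails_archive is Python 2 code (cStringIO, 0755-style octal literals). The same build-a-gzipped-tar-in-memory pattern ports to Python 3 with io.BytesIO, bytes payloads, and 0o644 octal syntax; a minimal sketch for a single email dict with an "id" key, as used above:

import io
import json
import tarfile
import time

def email_to_targz(email):
    buf = io.BytesIO()
    with tarfile.open(mode='w:gz', fileobj=buf) as tar:
        data = json.dumps(email).encode('utf-8')   # tar members need bytes
        info = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".json")
        info.size = len(data)
        info.mode = 0o644
        info.mtime = time.time()
        tar.addfile(info, io.BytesIO(data))
    return buf.getvalue()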
Example 28
def getDomains(*args):
    stmt = (
        "SELECT SUBSTRING_INDEX(email_addr, '@', -1) as eml, count(1) from email_addr group by eml"
    )
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [[str(val) for val in row] for row in qry.cursor()]
            return {"domains": rtn}
Example 29
def getCommunities(*args, **kwargs):
    tangelo.log("getCommunities(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    # top_count = int(urllib.unquote(nth(args, 0, "40")))
    top_count = int(size)

    return {"communities": get_top_communities(data_set_id, date_bounds=(start_datetime, end_datetime), num_communities=top_count)[:top_count]}
Example 30
def setSelectedDataSet(*args):
    tangelo.content_type("application/json")
    data_set_id = urllib.unquote(nth(args, 0, ''))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")

    resp = initialize_email_addr_cache(data_set_id)
    _current_data_set_selected = data_set_id
    return _index_record(data_set_id)
Example 31
def getRankedEmails(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getRankedEmails(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)

    return get_ranked_email_address_from_email_addrs_index(
        data_set_id, start_datetime, end_datetime, size)
Example 32
def getDomains(*args):
    stmt = (
        "SELECT SUBSTRING_INDEX(email_addr, '@', -1) as eml, count(1) from email_addr group by eml"
    )
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [[str(val) for val in row] for row in qry.cursor()]
            return {"domains": rtn}
Example 33
def getExportable(*args):
    stmt = (
        " SELECT id, subject FROM email WHERE exportable='true' "
    )
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [[str(val) for val in row] for row in qry.cursor()]
            return {"emails": rtn}
Example 34
def download(data):
    user = data.get("user")

    if not user:
        return tangelo.HTTPStatusCode(400, "invalid service call missing user")

    passwd = data.get("pass")
    limit = data.get("limit", "2000")
    logfile = "{}/{}.log".format(work_dir, user)
    spit(logfile, "[Start] {}\n".format(user), True)
    cherrypy.log("logfile: {}".format(logfile))

    def download_thread():
        try:
            cherrypy.log("Thread Start User: {}".format(user))

            try:
                session = newman_email.login(user, passwd, logfile)
                fldr = "{}/emails/{}".format(webroot, user)
                cherrypy.log("Login User: {}".format(user))

                if os.path.exists(fldr):
                    rmrf(fldr)

                mkdir(fldr)

                spit("{}/output.csv".format(fldr),
                     newman_email.headerrow() + "\n")

                mkdir(fldr + "/emails")

                newman_email.download(session, user, fldr, int(limit), logfile)

                spit(logfile, "[Completed Download] {}\n".format(user))
            except Exception as ex:
                spit(logfile, "[Error] {}\n".format(ex))
                cherrypy.log("[Error] {}\n".format(ex))
            except:
                spit(logfile, "[Error]")
                cherrypy.log("[Error]")
                error_info = str(sys.exc_info()[1])
                cherrypy.log(error_info)
                spit(logfile,
                     "[Error] {}\n".format(error_info.replace('\n', ' ')))

            finally:
                newman_email.close_session(session)

        except:
            error_info = str(sys.exc_info()[1])
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))

    thr = threading.Thread(target=download_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {"id": user}
Example 35
def stream_start(url, kwargs):
    content = tangelo.server.analyze_url(url).content

    if content is None or content.type != Content.Service:
        tangelo.http_status(500, "Error Opening Streaming Service")
        return {"error": "could not open streaming service"}
    else:
        # Extract the path to the service and the list of positional
        # arguments.
        module_path = content.path
        pargs = content.pargs

        # Get the service module.
        try:
            service = modules.get(module_path)
        except:
            tangelo.http_status(501, "Error Importing Streaming Service")
            tangelo.content_type("application/json")
            return tangelo.util.traceback_report(
                error="Could not import module %s" % (module_path))
        else:
            # Check for a "stream" function inside the module.
            if "stream" not in dir(service):
                tangelo.http_status(400, "Non-Streaming Service")
                return {
                    "error":
                    "The requested streaming service does not implement a 'stream()' function"
                }
            else:
                # Call the stream function and capture its result.
                try:
                    stream = service.stream(*pargs, **kwargs)
                except Exception:
                    result = tangelo.util.traceback_report(
                        error=
                        "Caught exception during streaming service execution",
                        module=tangelo.request_path())

                    tangelo.log_warning(
                        "STREAM", "Could not execute service %s:\n%s" %
                        (tangelo.request_path(), "\n".join(
                            result["traceback"])))

                    tangelo.http_status(500,
                                        "Streaming Service Raised Exception")
                    tangelo.content_type("application/json")
                    return result
                else:
                    # Generate a key corresponding to this object.
                    key = tangelo.util.generate_key(streams)

                    # Log the object in the streaming table.
                    streams[key] = stream

                    # Create an object describing the logging of the generator object.
                    return {"key": key}
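
stream_start only accepts a module that defines a stream() function, and stream_next then calls .next() on whatever it returns until StopIteration. The smallest module satisfying that contract is a generator function; a sketch of a hypothetical service, not one of the project's actual modules:

# mystream.py - a minimal Tangelo streaming service
def stream(*pargs, **kwargs):
    # Each yield becomes one response from the streaming "next" endpoint.
    for i in range(10):
        yield {"sequence": i}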
Example 36
def getRollup(*args):
    entity = urllib.unquote(nth(args, 0, ""))
    if not entity:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")

    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_entity_rollup_id, entity) as qry:
            rtn = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return {"rollupId": rtn}
Example 37
def tables(*args):
    with impalaopen(":".join(settings.IMPALA)) as curr:
        curr.execute("show tables")
        tangelo.content_type("application/json")
        return json.dumps({
            'tables': [
                table[0][:-20] for table in curr
                if table[0].endswith("tracks_comms_joined")
            ]
        })
Example 38
    def plugin(self, *path, **args):
        # Refresh the plugin registry.
        if self.plugins:
            error = self.plugins.refresh()
            if error is not None:
                tangelo.content_type("text/plain")
                tangelo.http_status(400, "Bad Plugin Configuration")
                return error

        return self.execute_analysis(args)
Example 39
def setExportMany(data):
    emails = data.get('emails', [])
    exportable = 'true' if data.get('exportable', True) else 'false'
    stmt = (" UPDATE email SET exportable=%s WHERE id = %s ")
    with newman_connector() as cnx:
        for email in emails:
            with execute_nonquery(cnx.conn(), stmt, exportable, email) as qry:
                pass
    tangelo.content_type("application/json")
    return {'exported': emails}
Example 40
def setSelectedDataSet(*args):
    tangelo.content_type("application/json")
    data_set_id = urllib.unquote(nth(args, 0, ''))
    if not data_set_id:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing data_set_id")

    resp = initialize_email_addr_cache(data_set_id)
    _current_data_set_selected = data_set_id
    return _index_record(data_set_id)
Example 41
def topic_list(*args):
    category = nth(args, 0, 'all')
    with newman_connector() as read_cnx:
        stmt = (" select idx, value, docs from topic_category "
                " where category_id = %s "
                " order by idx ")
        with execute_query(read_cnx.conn(), stmt, category) as qry:
            rtn = [r for r in qry.cursor()]
            tangelo.content_type("application/json")
            return {"categories": rtn}
Example 42
def getAttachCount(*args, **kwargs):
    tangelo.content_type("application/json")
    tangelo.log("getAttachCount(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)

    attach_type = urllib.unquote(nth(args, 0, ''))
    if not attach_type:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing attach_type")

    attach_type = 'all'  # hack for now
    email_address_list = parseParamEmailAddress(**kwargs)

    if not email_address_list:
        activity = get_total_attachment_activity(
            data_set_id,
            data_set_id,
            query_function=attachment_histogram,
            sender_email_addr="",
            start=start_datetime,
            end=end_datetime,
            interval="week")
        result = {
            "account_activity_list": [{
                "account_id": data_set_id,
                "data_set_id": data_set_id,
                "account_start_datetime": start_datetime,
                "account_end_datetime": end_datetime,
                "activities": activity
            }]
        }

    else:
        result = {
            "account_activity_list": [{
                "account_id": account_id,
                "data_set_id": data_set_id,
                "account_start_datetime": start_datetime,
                "account_end_datetime": end_datetime,
                "activities": get_emailer_attachment_activity(
                    data_set_id,
                    account_id,
                    (start_datetime, end_datetime),
                    interval="week")
            } for account_id in email_address_list]
        }

    return result
Example 43
def download(data):
    user = data.get("user")
    
    if not user:
        return tangelo.HTTPStatusCode(400, "invalid service call missing user")

    passwd = data.get("pass")
    limit = data.get("limit", "2000")
    logfile = "{}/{}.log".format(work_dir, user)
    spit(logfile, "[Start] {}\n".format(user), True)
    cherrypy.log("logfile: {}".format(logfile))

    def download_thread():
        try:
            cherrypy.log("Thread Start User: {}".format(user))

            try: 
                session = newman_email.login(user, passwd, logfile)
                fldr = "{}/emails/{}".format(webroot, user)    
                cherrypy.log("Login User: {}".format(user))

                if os.path.exists(fldr):
                    rmrf(fldr)

                mkdir(fldr)

                spit("{}/output.csv".format(fldr), newman_email.headerrow() + "\n")

                mkdir(fldr + "/emails")

                newman_email.download(session, user, fldr, int(limit), logfile)

                spit(logfile, "[Completed Download] {}\n".format(user))
            except Exception as ex:
                spit(logfile, "[Error] {}\n".format(ex))
                cherrypy.log("[Error] {}\n".format(ex))
            except:
                spit(logfile, "[Error]")
                cherrypy.log("[Error]")
                error_info = str(sys.exc_info()[1])
                cherrypy.log(error_info)
                spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))

            finally: 
                newman_email.close_session(session)

        except:
            error_info = str(sys.exc_info()[1])
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))

    thr = threading.Thread(target=download_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {"id": user}
Example 44
def getRollup(*args):
    entity = urllib.unquote(nth(args, 0, ''))
    if not entity:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")

    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_entity_rollup_id,
                           entity) as qry:
            rtn = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return {"rollupId": rtn}
Example 45
def exif_emails(*args, **kwargs):
    tangelo.log("geo.exif_emails(args: %s kwargs: %s)" %
                (str(args), str(kwargs)))
    tangelo.content_type("application/json")

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(
        **kwargs)

    qs = parseParamTextQuery(**kwargs)

    return es_get_exif_emails(data_set_id, size)
Example 46
def getRankedEmails(*args):
    tangelo.content_type("application/json")
    stmt = (
        " select email_addr, community, community_id, group_id, rank, total_received, total_sent "
        " from email_addr "
        " where rank > 0 "
        " order by cast(rank as decimal(4,4)) desc")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [[str(val) for val in row] for row in qry.cursor()]
            return {"emails": rtn}
Example 47
def ingest(data):
    cfg = "{}/conf/{}".format(base_dir, data.get('conf', 'target.cfg'))
    logname = "ingest_{}".format(fmtNow())
    teefile = "{}/{}.tee.log".format(work_dir, logname)
    errfile = "{}/{}.err.log".format(work_dir, logname)
    logfile = "{}/{}.status.log".format(work_dir, logname)

    cherrypy.log("Ingest config: {}".format(cfg))
    cherrypy.log("Ingest logfile: {}".format(logfile))

    def ingest_thread():
        cherrypy.log("Ingest Started:")
        try:
            cherrypy.log("started: {}".format(fmtNow()))
            spit(logfile, "[Started] {} \n".format(fmtNow()))

            args = ["./bin/rebuild_all.sh"]
            cherrypy.log("running: {}".format(" ".join(args)))
            spit(logfile, "[Running] {} \n".format(" ".join(args)))

            with open(teefile, 'w') as t, open(errfile, 'w') as e:
                kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir }
                rebuildp = subprocess.Popen(args, **kwargs)
                out, err = rebuildp.communicate()
                cherrypy.log("rebuild complete: {}".format(fmtNow()))
                rtn = rebuildp.returncode
                if rtn != 0:
                    spit(logfile, "[Error] rebuild return with non-zero code: {} \n".format(rtn))
                    return
                    
            args = ["./bin/ingest.sh", cfg]
            cherrypy.log("running ingest: {}".format(" ".join(args)))
            spit(logfile, "[Running] {} \n".format(" ".join(args)))

            with open(teefile, 'w') as t, open(errfile, 'w') as e:
                kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir, 'bufsize' : 1 }
                subp = subprocess.Popen(args, **kwargs)
                out, err = subp.communicate()
                cherrypy.log("complete: {}".format(fmtNow()))
                rtn = subp.returncode
                if rtn != 0:
                    spit(logfile, "[Error] return with non-zero code: {} \n".format(rtn))
                else:
                    spit(logfile, "[Complete]")
        except:
            error_info = str(sys.exc_info()[1])
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))

    thr = threading.Thread(target=ingest_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log': logname}
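
ingest, download, and extract_pst all name their log files with a fmtNow helper that is not part of this listing. Given the session_{} filenames in auto_save, a plausible one-line definition (an assumption) is:

import datetime

def fmtNow():
    # Hypothetical helper: filesystem-safe timestamp for log names.
    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')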
Example 48
def setExportMany(data):
    emails = data.get('emails', [])
    exportable = 'true' if data.get('exportable', True) else 'false'
    stmt = (" UPDATE email SET exportable=%s WHERE id = %s ")
    with newman_connector() as cnx:
        for email in emails:
            with execute_nonquery(cnx.conn(), stmt, exportable, email) as qry:
                pass
    tangelo.content_type("application/json")
    return {'exported': emails}
Example 49
def getRankedEmails(*args):
    tangelo.content_type("application/json")    
    stmt = (
        " select email_addr, community, community_id, group_id, rank, total_received, total_sent "
        " from email_addr "
        " where rank > 0 "
        " order by cast(rank as decimal(4,4)) desc" 
    )
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [[str(val) for val in row] for row in qry.cursor()]
            return {"emails": rtn}
Example 50
def topic_list(*args):
    category = nth(args, 0, 'all')
    with newman_connector() as read_cnx:
        stmt = (
            " select idx, value, docs from topic_category "
            " where category_id = %s "
            " order by idx "
        ) 
        with execute_query(read_cnx.conn(), stmt, category) as qry:
            rtn = [r for r in qry.cursor()]
            tangelo.content_type("application/json")
            return {"categories": rtn}
Example 51
def getTopRollup(*args):
    amt = urllib.unquote(nth(args, 0, ""))
    if not amt:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")

    # amt comes straight from the URL; cast to int so only a number is formatted into the SQL.
    stmt = stmt_top_rollup_entities + ("limit {0}".format(int(amt)))
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [r for r in qry.cursor()]
            rtn = rtn if rtn else []
            tangelo.content_type("application/json")
            return {"entities": rtn}
Example 52
def getAllAttachmentBySender(*args, **kwargs):
    tangelo.log("getAttachmentsSender(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    sender = nth(args, 0, '')
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not sender:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing sender")

    tangelo.content_type("application/json")

    return get_attachments_by_sender(data_set_id, sender, start_datetime, end_datetime, size)
Example 53
def extract_pst(*args, **kwargs):
    cherrypy.log("search.extract_pst(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))

    ingest_id = kwargs.get("ingest-id")
    ingest_file = kwargs.get("file")
    type = kwargs.get("type", "pst")

    # path = "{}/{}".format(ingest_parent_dir, type)
    if not ingest_id or not type or not ingest_file:
        raise TypeError("Encountered a 'None' value for 'ingest_id', 'type', or 'ingest_file'!")

    # Add the prefix for the newman indexes
    ingest_id = index_prefix + ingest_id

    logname = "pst_{}".format(fmtNow())
    ingester_log = "{}/{}.ingester.log".format(work_dir, logname)
    # errfile = "{}/{}.err.log".format(work_dir, logname)
    service_status_log = "{}/{}.status.log".format(work_dir, logname)

    spit(service_status_log, "[Start] email address={}\n".format(ingest_id), True)

    def extract_thread():
        try:
            args = ["./bin/ingest.sh", ingest_id, ingest_parent_dir, ingest_file, type]

            cherrypy.log("running pst: {}".format(" ".join(args)))
            spit(service_status_log, "[Running] {} \n".format(" ".join(args)))

            with open(ingester_log, 'w') as t:
                kwargs = {'stdout': t, 'stderr': t, 'cwd': base_dir, 'bufsize' : 1 }
                subp = subprocess.Popen(args, **kwargs)
                out, err = subp.communicate()

                # TODO should never see this line  - remove this
                cherrypy.log("complete: {}".format(fmtNow()))

                rtn = subp.returncode
                if rtn != 0:
                    spit(service_status_log, "[Error] return with non-zero code: {} \n".format(rtn))
                else:
                    spit(service_status_log, "[Done Ingesting data.  Reloading the email_addr cache.]")
                    initialize_email_addr_cache(ingest_id, update=True)
                    spit(service_status_log, "[Complete.]")
        except:
            error_info = str(sys.exc_info()[1])
            spit(service_status_log, "[Error] {}\n".format(error_info.replace('\n', ' ')))
            # cherrypy.log(error_info)

    thr = threading.Thread(target=extract_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log': logname}
Example 54
def getEmail(*args, **kwargs):
    tangelo.log("getEmail(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    qs = parseParamTextQuery(**kwargs)

    email_id = args[-1] if args else ''
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email_id")

    return get_email(data_set_id, email_id, qs)
Example 55
def setStarred(*args, **kwargs):
    tangelo.log("setStarred(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    email_id = args[-1] if args else ''
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email_id")

    starred = parseParamStarred(**kwargs)

    return set_starred(data_set_id, [email_id], starred)
Example 56
    def execute_analysis(self, query_args):
        # Hide the identity/version number of the server technology in the
        # response headers.
        cherrypy.response.headers["Server"] = ""

        # Analyze the URL.
        analysis = analyze_url(cherrypy.request.path_info)
        directive = analysis.directive
        content = analysis.content

        # If any "directives" were found (i.e., redirections) perform them here.
        if directive is not None:
            if directive.type == Directive.HTTPRedirect:
                raise cherrypy.HTTPRedirect(analysis.directive.argument)
            elif directive.type == Directive.InternalRedirect:
                raise cherrypy.InternalRedirect(analysis.directive.argument)
            elif directive.type == Directive.ListPlugins:
                tangelo.content_type("application/json")
                plugin_list = self.plugins.plugin_list() if self.plugins else []
                return json.dumps(plugin_list)
            else:
                raise RuntimeError("fatal internal error:  illegal directive type code %d" % (analysis.directive.type))

        # If content was actually found at the URL, perform any htaccess updates
        # now.
        do_auth = self.auth_update and (content is None or content.type != Content.NotFound)
        if do_auth:
            self.auth_update.update(analysis.reqpathcomp, analysis.pathcomp)

        # Serve content here, either by serving a static file, generating a
        # directory listing, executing a service, or barring the client entry.
        if content is not None:
            if content.type == Content.File:
                if content.path is not None:
                    return cherrypy.lib.static.serve_file(content.path)
                else:
                    raise cherrypy.HTTPError("403 Forbidden", "The requested path is forbidden")
            elif content.type == Content.Directory:
                if content.path is not None:
                    return Tangelo.dirlisting(content.path, cherrypy.request.path_info)
                else:
                    raise cherrypy.HTTPError("403 Forbidden", "Listing of this directory has been disabled")
            elif content.type == Content.Service:
                cherrypy.thread_data.pluginpath = analysis.plugin_path
                return self.invoke_service(content.path, *content.pargs, **query_args)
            elif content.type == Content.NotFound:
                raise cherrypy.HTTPError("404 Not Found", "The path '%s' was not found" % (content.path))
            else:
                raise RuntimeError("fatal error: illegal content type code %d" % (content.type))
        else:
            raise RuntimeError("fatal internal error:  analyze_url() returned analysis without directive or content")
Example 57
def getTarget(*args):
    # returns the user whose email is being analyzed
    # TODO: read from file or config
    target = getOpt('target')
    stmt = (
        " select e.email_addr, e.community, e.community_id, e.group_id, e.total_received, e.total_sent, e.rank "
        " from email_addr e "
        " where e.email_addr = %s "
    )
    tangelo.content_type("application/json")        
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt, target) as qry:
            rtn = [[str(val) for val in row] for row in qry.cursor()]
            return {"email": rtn}
Example 58
def stream_start(url, kwargs):
    content = tangelo.server.analyze_url(url).content

    if content is None or content.type != Content.Service:
        tangelo.http_status(500, "Error Opening Streaming Service")
        return {"error": "could not open streaming service"}
    else:
        # Extract the path to the service and the list of positional
        # arguments.
        module_path = content.path
        pargs = content.pargs

        # Get the service module.
        try:
            service = modules.get(module_path)
        except:
            tangelo.http_status(500, "Error Importing Streaming Service")
            tangelo.content_type("application/json")

            error_code = tangelo.util.generate_error_code()

            tangelo.util.log_traceback("STREAM", error_code, "Could not import module %s" % (tangelo.request_path()))
            return tangelo.util.error_report(error_code)
        else:
            # Check for a "stream" function inside the module.
            if "stream" not in dir(service):
                tangelo.http_status(400, "Non-Streaming Service")
                return {"error": "The requested streaming service does not implement a 'stream()' function"}
            else:
                # Call the stream function and capture its result.
                try:
                    stream = service.stream(*pargs, **kwargs)
                except Exception:
                    tangelo.http_status(500, "Streaming Service Raised Exception")
                    tangelo.content_type("application/json")

                    error_code = tangelo.util.generate_error_code()

                    tangelo.util.log_traceback("STREAM", error_code, "Could not execute service %s" % (tangelo.request_path()))
                    return tangelo.util.error_report(error_code)
                else:
                    # Generate a key corresponding to this object.
                    key = tangelo.util.generate_key(streams)

                    # Log the object in the streaming table.
                    streams[key] = stream

                    # Create an object describing the logging of the generator object.
                    return {"key": key}
Example 59
def getAttachmentsSender(*args):
    sender = urllib.unquote(nth(args, 0, ''))
    if not sender:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")

    tangelo.content_type("application/json")        
    stmt = (
        " select id, dir, datetime, from_addr, tos, ccs, bccs, subject, attach, bodysize "
        " from email "
        " where from_addr = %s and attach != '' "
    )
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt, sender) as qry:
            rtn = [[val.encode('utf-8') if isinstance(val, basestring) else str(val) for val in row] for row in qry.cursor()]
            return {"sender": sender, "email_attachments": rtn}