Esempio n. 1
0
def generate_user_nodes(src, dst):
    """Write one ``transientId`` node row per record read from ``src``.

    Args:
        src: Path of the input file. It is opened for reading and passed to
            ``json_lines_file`` (presumably one JSON record per line).
        dst: Destination handed to ``gremlin_writer`` for the node CSV.

    Returns:
        ``dst``, unchanged.

    Raises:
        KeyError: If a record (assumed to be a dict) lacks an exported field.
    """
    # Column headers handed to the writer; each carries a ``:String`` suffix.
    columns = [
        "uid:String", "user_agent:String", "device:String", "os:String",
        "browser:String", "email:String", "type:String",
    ]
    # Record keys copied verbatim into the node's attribute map.
    fields = ("uid", "user_agent", "device", "os", "browser", "email", "type")

    with open(src) as records_file, \
            gremlin_writer(GremlinNodeCSV, dst, attributes=columns) as writer:
        for record in json_lines_file(records_file):
            writer.add(
                _id=record["uid"],
                attribute_map={field: record[field] for field in fields},
                label="transientId",
            )
    return dst
Esempio n. 2
0
def generate_website_csv(urls, titles, dst):
    """Write one website node row per item yielded by ``generate_websites``.

    Args:
        urls: Forwarded as the first argument of ``generate_websites``.
        titles: Forwarded as the second argument of ``generate_websites``.
        dst: Destination handed to ``gremlin_writer`` for the node CSV.
    """
    columns = ["url:String", "title:String"]
    with gremlin_writer(GremlinNodeCSV, dst, attributes=columns) as writer:
        for site in generate_websites(urls, titles):
            # The website's url doubles as the node id.
            properties = {"url": site.url, "title": site.title}
            writer.add(
                _id=site.url,
                attribute_map=properties,
                label=WEBSITE_LABEL,
            )
Esempio n. 3
0
def generate_persistent_nodes(src, dst):
    """Write one ``persistentId`` node row per record read from ``src``.

    Args:
        src: Path of the input file. It is opened for reading and passed to
            ``json_lines_file`` (presumably one JSON record per line).
        dst: Destination handed to ``gremlin_writer`` for the node CSV.
    """
    columns = ["pid:String"]
    with open(src) as records_file, \
            gremlin_writer(GremlinNodeCSV, dst, attributes=columns) as writer:
        for record in json_lines_file(records_file):
            # The pid serves both as the node id and as its only attribute.
            writer.add(
                _id=record["pid"],
                attribute_map={"pid": record["pid"]},
                label="persistentId",
            )
def generate_website_group_edges(website_group_json, dst):
    """Write one edge from each website group to each of its websites.

    Args:
        website_group_json: Path of the input file. It is opened for reading
            and passed to ``json_lines_file``. Each record is expected to
            expose ``"id"`` and ``"websites"``.
        dst: Destination handed to ``gremlin_writer`` for the edge CSV.
    """
    with open(website_group_json) as groups_file, \
            gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
        for group in json_lines_file(groups_file):
            group_id = group["id"]
            for member in group["websites"]:
                # NOTE(review): the label constant is spelled exactly as it
                # is referenced here; its definition is outside this block.
                writer.add(
                    _id=get_id(group_id, member, {}),
                    _from=group_id,
                    to=member,
                    label=WEBISTE_GROUP_EDGE_LABEL,
                    attribute_map={},
                )
def generate_persistent_id_edges(src, dst):
    """Write a ``has_identity`` edge from each pid to each transient id.

    Args:
        src: Path of the input file. It is opened for reading and passed to
            ``json_lines_file``. Each record is expected to expose ``"pid"``
            and ``"transientIds"``.
        dst: Destination handed to ``gremlin_writer`` for the edge CSV.
    """
    with open(src) as records_file, \
            gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
        for record in json_lines_file(records_file):
            for transient_id in record["transientIds"]:
                # The pid is looked up per edge, so a record without
                # transient ids never touches its "pid" key.
                writer.add(
                    _id=get_id(record["pid"], transient_id, {}),
                    _from=record["pid"],
                    to=transient_id,
                    label="has_identity",
                    attribute_map={},
                )
Esempio n. 6
0
def generate_website_group_nodes(website_group_json, dst):
    """Write one website-group node row per record in the input file.

    Args:
        website_group_json: Path of the input file. It is opened for reading
            and passed to ``json_lines_file``. Each record is expected to
            expose ``"id"``, ``"url"`` and a ``"category"`` mapping with
            ``"name"`` and ``"code"``.
        dst: Destination handed to ``gremlin_writer`` for the node CSV.
    """
    columns = ["url:String", "category:String", "categoryCode:String"]
    with open(website_group_json) as groups_file, \
            gremlin_writer(GremlinNodeCSV, dst, attributes=columns) as writer:
        for group in json_lines_file(groups_file):
            node_id = group["id"]
            url = group["url"]
            category = group["category"]
            # The nested category is flattened into two node attributes.
            properties = {
                "url": url,
                "category": category["name"],
                "categoryCode": category["code"],
            }
            writer.add(
                _id=node_id,
                attribute_map=properties,
                label=WEBSITE_GROUP_LABEL,
            )
def generate_identity_group_nodes(src, dst):
    """Write a node row for every identity group that has persistent ids.

    Args:
        src: Path of the input file. It is opened for reading and passed to
            ``json_lines_file``. Each record is expected to expose
            ``"persistentIds"``, ``"igid"`` and ``"type"``.
        dst: Destination handed to ``gremlin_writer`` for the node CSV.
    """
    columns = ["igid:String", "type:String"]
    with open(src) as groups_file, \
            gremlin_writer(GremlinNodeCSV, dst, attributes=columns) as writer:
        for group in json_lines_file(groups_file):
            if not group["persistentIds"]:
                # Groups with no persistent ids produce no node.
                continue
            writer.add(
                _id=group["igid"],
                attribute_map={key: group[key] for key in ("igid", "type")},
                label="identityGroup",
            )
def generate_identity_group_edges(src, dst):
    """Write a ``member`` edge from each identity group to its persistent ids.

    Args:
        src: Path of the input file. It is opened for reading and passed to
            ``json_lines_file``. Each record is expected to expose
            ``"persistentIds"`` and ``"igid"``.
        dst: Destination handed to ``gremlin_writer`` for the edge CSV.
    """
    with open(src) as groups_file, \
            gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
        for group in json_lines_file(groups_file):
            # A falsy "persistentIds" value (e.g. empty list) yields no edges.
            for persistent_id in group["persistentIds"] or ():
                writer.add(
                    _id=get_id(group["igid"], persistent_id, {}),
                    _from=group["igid"],
                    to=persistent_id,
                    attribute_map={},
                    label="member",
                )
def generate_user_website_edges(src_map, dst):
    """Generate edges between user nodes and website nodes.

    Args:
        src_map: Mapping holding two paths. ``"urls"`` points to a
            comma-separated file whose first column is an integer id and
            whose second column is the website that id maps to. ``"facts"``
            points to a file read through ``json_lines_file``; each record
            is expected to expose ``"uid"`` and a ``"facts"`` list.
        dst: Destination handed to ``gremlin_writer`` for the edge CSV.

    Returns:
        ``dst``, unchanged.

    Raises:
        KeyError: If a fact's ``"fid"`` has no entry in the urls file, or a
            record/fact lacks one of the fields read below.
        ValueError: If the first column of a urls row is not an integer.
    """
    # The csv module asks for files opened with newline="" so that line
    # breaks embedded in quoted fields are parsed correctly.
    with open(src_map["urls"], newline="") as url_file:
        fact_to_website = {
            int(row[0]): row[1]
            for row in csv.reader(url_file, delimiter=",")
            if row  # csv.reader yields [] for blank lines; skip them
        }

    attrs = [
        "ts:Date",
        "visited_url:String",
        "uid:String",
        "state:String",
        "city:String",
        "ip_address:String",
    ]
    with open(src_map["facts"]) as facts_file, \
            gremlin_writer(GremlinEdgeCSV, dst, attributes=attrs) as writer:
        for data in json_lines_file(facts_file):
            for fact in data["facts"]:
                timestamp = _parse_ts(fact["ts"])
                website_id = fact_to_website[fact["fid"]]
                uid = data["uid"]
                # Key order is kept stable because the map is also fed to
                # get_id, which may depend on it.
                attr_map = {
                    "ts": timestamp,
                    "visited_url": website_id,
                    "uid": uid,
                    "state": fact["state"],
                    "city": fact["city"],
                    "ip_address": fact["ip_address"],
                }
                edge_id = get_id(uid, website_id, attr_map)
                try:
                    writer.add(
                        _id=edge_id,
                        _from=uid,
                        to=website_id,
                        label="visited",
                        attribute_map=attr_map,
                    )
                except Exception:
                    # Best effort: a single bad edge is logged together with
                    # its source fact and the export carries on.
                    logger.exception("Something went wrong while creating an edge")
                    logger.info(json.dumps({"uid": uid, **fact}))

    return dst
Esempio n. 10
0
def generate_ip_loc_edges_from_facts(src, dst):
    """Write a ``uses`` edge from each user to each distinct IP location.

    Args:
        src: Path of the input file. It is opened for reading and passed to
            ``json_lines_file``. Each record is expected to expose ``"uid"``
            and a ``"facts"`` list whose items carry ``"state"``, ``"city"``
            and ``"ip_address"``.
        dst: Destination handed to ``gremlin_writer`` for the edge CSV.
    """
    with open(src) as facts_file, \
            gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
        for record in json_lines_file(facts_file):
            # A set collapses repeated locations within one record, so each
            # location yields a single edge for that record.
            distinct_locations = {
                IPLoc(fact["state"], fact["city"], fact["ip_address"])
                for fact in record["facts"]
            }
            for location in distinct_locations:
                target_id = get_id(location)
                writer.add(
                    _id=get_edge_id(record["uid"], target_id, {}),
                    _from=record["uid"],
                    to=target_id,
                    label="uses",
                    attribute_map={},
                )
Esempio n. 11
0
def generate_ip_loc_nodes_from_facts(src, dst):
    """Write one ``IP`` node row per distinct location found in ``src``.

    Args:
        src: Path of the input file. It is opened for reading and passed to
            ``json_lines_file``. Each record is expected to expose a
            ``"facts"`` list whose items carry ``"state"``, ``"city"`` and
            ``"ip_address"``.
        dst: Destination handed to ``gremlin_writer`` for the node CSV.
    """
    columns = ["state:String", "city:String", "ip_address:String"]
    with open(src) as facts_file, \
            gremlin_writer(GremlinNodeCSV, dst, attributes=columns) as writer:
        # All locations are gathered first so that a location shared by
        # several records is written only once.
        distinct_locations = {
            IPLoc(fact["state"], fact["city"], fact["ip_address"])
            for record in json_lines_file(facts_file)
            for fact in record["facts"]
        }
        for location in distinct_locations:
            properties = {
                "state": location.state,
                "city": location.city,
                "ip_address": location.ip_address,
            }
            writer.add(
                _id=get_id(location),
                attribute_map=properties,
                label="IP",
            )