Code example #1
File: add.py Project: samhays/aws-admartech-samples
def build_user_identitity_knowledge(persistent_ids_facts_file,
                                    transient_ids_facts_file, dst):
    """
    Generate some facts about user identities.

    Several pieces of information are generated here:
        * transient id type: cookie | device
        * transient id email (randomly selected from the persistent id's emails)
        * transient id user agent (
            if the transient id type is cookie, a workstation user agent is
            generated, otherwise a mobile one
        )
        * derivatives of the user agent:
            * device family (if type is device)
            * OS
            * browser
    """
    user_emails = {}
    fake = Faker()
    fake.add_provider(UserAgentProvider)

    logger.info("Creating emails per transient ids")
    # create fake emails for devices with persistent ids
    with open(persistent_ids_facts_file) as f_h:
        for data in json_lines_file(f_h):
            nemail = random.randint(1, len(data["transientIds"]))
            emails = [fake.email() for _ in range(nemail)]
            for transient_id in data["transientIds"]:
                user_emails[transient_id] = random.choice(emails)

    # create fake emails for devices without persistent ids
    with open(transient_ids_facts_file) as t_f_h:
        for data in json_lines_file(t_f_h):
            if data["uid"] not in user_emails:
                user_emails[data["uid"]] = fake.email()

    logger.info("Writing down user identity facts")
    with open(dst, "w") as f_h:
        for transient_id, email in user_emails.items():
            type_ = random.choice(["cookie", "device"])
            user_agent_str = fake.user_agent_from_type(type_)

            user_agent = parse(user_agent_str)
            device = user_agent.device.family
            operating_system = user_agent.os.family
            browser = user_agent.browser.family

            f_h.write(
                json.dumps({
                    "transient_id": transient_id,
                    "user_agent": uset_agent_str,
                    "device": device,
                    "os": operating_system,
                    "browser": browser,
                    "email": data,
                    "type": type_,
                }) + "\n")
Code example #2
def generate_user_nodes(src, dst):
    """Generate user node csv file."""
    attributes = [
        "uid:String",
        "user_agent:String",
        "device:String",
        "os:String",
        "browser:String",
        "email:String",
        "type:String",
    ]
    with open(src) as src_data:
        with gremlin_writer(GremlinNodeCSV, dst,
                            attributes=attributes) as writer:
            for data in json_lines_file(src_data):
                writer.add(
                    _id=data["uid"],
                    attribute_map={
                        "uid": data["uid"],
                        "user_agent": data["user_agent"],
                        "device": data["device"],
                        "os": data["os"],
                        "browser": data["browser"],
                        "email": data["email"],
                        "type": data["type"],
                    },
                    label="transientId",
                )
        return dst
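The gremlin_writer, GremlinNodeCSV and GremlinEdgeCSV helpers are also project-internal. Judging by the attribute lists, they target Amazon Neptune's bulk-load CSV format, where each node row carries the ~id and ~label system columns plus typed property columns. Under that assumption, one output row of the file above might look roughly like this (all values invented for illustration):

~id,uid:String,user_agent:String,device:String,os:String,browser:String,email:String,type:String,~label
t-123,t-123,"Mozilla/5.0 (iPhone; ...)",iPhone,iOS,Mobile Safari,jane.doe@example.com,device,transientId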
Code example #3
File: add.py Project: samhays/aws-admartech-samples
def generate_identity_groups(persistent_ids_file,
                             distribution,
                             dst,
                             _seed=None):
    """Write facts about identity_group mapping."""
    if _seed is not None:
        random.seed(_seed)

    with open(persistent_ids_file) as f_h:
        pids = [data["pid"] for data in json_lines_file(f_h)]

    random.shuffle(pids)

    sizes, weights = zip(*[[k, v] for k, v in distribution.items()])
    i = 0
    with open(dst, "w") as f_h:
        while i < len(pids):
            size, *_ = random.choices(sizes, weights=weights)
            size = min(size, len(pids) - i)
            persistent_ids = [pids[i + j] for j in range(size)]
            type_ = "household" if len(
                persistent_ids) < COMPANY_MIN_SIZE else "company"
            f_h.write(
                json.dumps({
                    "igid": hash_(persistent_ids),
                    "type": type_,
                    "persistentIds": persistent_ids,
                }) + "\n")
            # advance even if size was 0, meaning that this persistent id
            # does not belong to any identity_group
            i += size or 1
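The distribution argument maps identity-group sizes to sampling weights for random.choices. A hypothetical invocation (sizes and weights invented for illustration; a size of 0 leaves a persistent id outside any identity group, as the final comment notes):

# Hypothetical distribution: mostly small household-sized groups,
# rarely a larger one; 0 means "no identity group at all".
distribution = {0: 0.30, 2: 0.40, 3: 0.20, 6: 0.08, 15: 0.02}
generate_identity_groups("persistent_ids.jsonl", distribution,
                         "identity_groups.jsonl")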
Code example #4
def generate_persistent_nodes(src, dst):
    """Generate persistent node csv file."""
    with open(src) as f_h:
        with gremlin_writer(GremlinNodeCSV, dst,
                            attributes=["pid:String"]) as writer:
            for data in json_lines_file(f_h):
                writer.add(
                    _id=data["pid"],
                    attribute_map={"pid": data["pid"]},
                    label="persistentId",
                )
Code example #5
def extend_with_user_identity_information(user_identity_file_path):
    """Coroutine which generates user identity facts based on transient id."""
    with open(user_identity_file_path) as f_h:
        user_id_data = {data["transient_id"]: data for data in json_lines_file(f_h)}

    data = yield

    while data is not None:
        transformed = {**data, **user_id_data[data["uid"]]}
        del transformed["transient_id"]
        data = yield transformed
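Like any generator-based coroutine, this one must be primed with next() before the first send(); extend_facts_file (code example #12 below) does exactly that. A minimal standalone driver, assuming a hypothetical identity file in which "t-123" appears as a transient_id:

cor = extend_with_user_identity_information("user_identity.jsonl")
next(cor)  # advance to the first bare `yield`
enriched = cor.send({"uid": "t-123", "facts": []})  # merged row comes back
cor.close()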
Code example #6
def generate_website_group_edges(website_group_json, dst):
    """Generate website group edges CSV."""
    with open(website_group_json) as f_h:
        with gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
            for data in json_lines_file(f_h):
                root_id = data["id"]
                websites = data["websites"]
                for website in websites:
                    writer.add(_id=get_id(root_id, website, {}),
                               _from=root_id,
                               to=website,
                               label=WEBISTE_GROUP_EDGE_LABEL,
                               attribute_map={})
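get_id is another project-internal helper; across these examples it is called both with edge endpoints plus an attribute map (examples #6, #7, #11) and with a single IPLoc tuple (examples #13, #14), so it presumably hashes whatever it is given into a deterministic id. A sketch under that assumption:

import hashlib
import json

def get_id(*parts):
    # Assumption: derive a stable id by hashing the string forms of all
    # arguments; the real helper may differ.
    payload = json.dumps([str(part) for part in parts])
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()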
Code example #7
def generate_persistent_id_edges(src, dst):
    """Generate persistentID edges based on union-find datastructure."""
    with open(src) as f_h:
        with gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
            for data in json_lines_file(f_h):
                for node in data["transientIds"]:
                    persistent_to_transient = {
                        "_id": get_id(data["pid"], node, {}),
                        "_from": data["pid"],
                        "to": node,
                        "label": "has_identity",
                        "attribute_map": {},
                    }
                    writer.add(**persistent_to_transient)
Code example #8
def generate_website_group_nodes(website_group_json, dst):
    """Generate website groups csv."""
    attributes = ["url:String", "category:String", "categoryCode:String"]
    with open(website_group_json) as f_h:
        with gremlin_writer(GremlinNodeCSV, dst,
                            attributes=attributes) as writer:
            for data in json_lines_file(f_h):
                writer.add(_id=data["id"],
                           attribute_map={
                               "url": data["url"],
                               "category": data["category"]["name"],
                               "categoryCode": data["category"]["code"]
                           },
                           label=WEBSITE_GROUP_LABEL)
Code example #9
def generate_identity_group_nodes(src, dst):
    """Generate identity_group csv file with nodes."""
    attrs = ["igid:String", "type:String"]
    with open(src) as f_h:
        with gremlin_writer(GremlinNodeCSV, dst, attributes=attrs) as writer:
            for data in json_lines_file(f_h):
                if data["persistentIds"]:
                    writer.add(
                        _id=data["igid"],
                        attribute_map={
                            "igid": data["igid"],
                            "type": data["type"]
                        },
                        label="identityGroup",
                    )
Code example #10
def generate_identity_group_edges(src, dst):
    """Generate identity_group edge csv file."""
    with open(src) as f_h:
        with gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
            for data in json_lines_file(f_h):
                persistent_ids = data["persistentIds"]
                if persistent_ids:
                    for persistent_id in persistent_ids:
                        identity_group_to_persistent = {
                            "_id": get_id(data["igid"], persistent_id, {}),
                            "_from": data["igid"],
                            "to": persistent_id,
                            "attribute_map": {},
                            "label": "member",
                        }
                        writer.add(**identity_group_to_persistent)
Code example #11
def generate_user_website_edges(src_map, dst):
    """Generate edges between user nodes and website nodes."""
    with open(src_map["urls"]) as url_file:
        fact_to_website = {}
        for row in csv.reader(url_file, delimiter=","):
            fact_to_website[int(row[0])] = row[1]

    with open(src_map["facts"]) as facts_file:
        attrs = [
            "ts:Date",
            "visited_url:String",
            "uid:String",
            "state:String",
            "city:String",
            "ip_address:String",
        ]
        with gremlin_writer(GremlinEdgeCSV, dst, attributes=attrs) as writer:
            for data in json_lines_file(facts_file):
                for fact in data["facts"]:
                    timestamp = _parse_ts(fact["ts"])
                    website_id = fact_to_website[fact["fid"]]
                    loc_attrs = {
                        "state": fact["state"],
                        "city": fact["city"],
                        "ip_address": fact["ip_address"],
                    }
                    attr_map = {
                        "ts": timestamp,
                        "visited_url": website_id,
                        "uid": data["uid"],
                        **loc_attrs,
                    }
                    user_to_website = {
                        "_id": get_id(data["uid"], website_id, attr_map),
                        "_from": data["uid"],
                        "to": website_id,
                        "label": "visited",
                        "attribute_map": attr_map,
                    }
                    try:
                        writer.add(**user_to_website)
                    except Exception:
                        logger.exception("Something went wrong while creating an edge")
                        logger.info(json.dumps({"uid": data["uid"], **fact}))

    return dst
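The src_map["urls"] file is read as a plain two-column CSV mapping an integer fact id to a website node id. Hypothetical rows (both columns invented for illustration; the real id format is not shown in these excerpts):

1001,website-node-0001
1002,website-node-0002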
Code example #12
def extend_facts_file(fact_file_path, ip_loc_file_path, user_identity_file_path):
    """Extend facts file with additional information."""
    ip_loc_cor = extend_with_iploc_information(ip_loc_file_path)
    user_identity_cor = extend_with_user_identity_information(user_identity_file_path)

    next(ip_loc_cor)
    next(user_identity_cor)

    dst = f"{fact_file_path}.tmp"
    with open(fact_file_path) as f_h:
        with open(dst, "w") as f_dst:
            for data in json_lines_file(f_h):
                transformed_row = user_identity_cor.send(ip_loc_cor.send(data))
                f_dst.write(json.dumps(transformed_row) + "\n")

        ip_loc_cor.close()
        user_identity_cor.close()

    os.rename(dst, fact_file_path)
Code example #13
def generate_ip_loc_edges_from_facts(src, dst):
    """Generate ip location csv file with edges."""
    with open(src) as f_h:
        with gremlin_writer(GremlinEdgeCSV, dst, attributes=[]) as writer:
            for data in json_lines_file(f_h):
                uid_locations = set()
                for fact in data["facts"]:
                    uid_locations.add(
                        IPLoc(fact["state"], fact["city"], fact["ip_address"]))

                for location in uid_locations:
                    loc_id = get_id(location)
                    writer.add(
                        _id=get_edge_id(data["uid"], loc_id, {}),
                        _from=data["uid"],
                        to=loc_id,
                        label="uses",
                        attribute_map={},
                    )
Code example #14
def generate_ip_loc_nodes_from_facts(src, dst):
    """Generate ip location csv file with nodes."""
    attrs = ["state:String", "city:String", "ip_address:String"]
    with open(src) as f_h:
        with gremlin_writer(GremlinNodeCSV, dst, attributes=attrs) as writer:
            locations = set()
            for data in json_lines_file(f_h):
                for fact in data["facts"]:
                    locations.add(
                        IPLoc(fact["state"], fact["city"], fact["ip_address"]))

            for location in locations:
                writer.add(
                    _id=get_id(location),
                    attribute_map={
                        "state": location.state,
                        "city": location.city,
                        "ip_address": location.ip_address,
                    },
                    label="IP",
                )
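The set-based deduplication in examples #13 and #14 works because IPLoc is a namedtuple: instances hash and compare by value, so identical (state, city, ip_address) triples collapse to a single element. A quick check:

from collections import namedtuple

IPLoc = namedtuple("IPLoc", "state, city, ip_address")
locs = {IPLoc("WA", "Seattle", "10.0.0.1"),
        IPLoc("WA", "Seattle", "10.0.0.1")}
print(len(locs))  # 1 -- value-based hashing deduplicates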
Code example #15
def extend_with_iploc_information(ip_loc_file_path):
    """Coroutine which generates ip location facts based on transient id."""
    with open(ip_loc_file_path) as f_h:
        loc_data = {data["transient_id"]: data["loc"] for data in json_lines_file(f_h)}

    data = yield

    def get_sane_ip_location(uid, facts, max_ts_difference=3600):
        """
        Given a transient id and its facts, add information about IP/location.

        The process is semi-deterministic:
            1. Choose a location at random from the given list of locations.
            2. Keep returning this location as long as the timestamp
               difference lies within `max_ts_difference`.
            3. Otherwise, start again from 1).
        """
        facts = [None] + sorted(facts, key=lambda x: x["ts"])
        ptr1, ptr2 = itertools.tee(facts, 2)
        next(ptr2, None)

        loc_fact = random.choice(loc_data[uid])

        for previous_item, current in zip(ptr1, ptr2):
            if (
                previous_item is None
                or current["ts"] - previous_item["ts"] > max_ts_difference
            ):
                loc_fact = random.choice(loc_data[uid])
            yield {**current, **loc_fact}

    while data is not None:
        transformed = data.copy()
        transformed["facts"] = list(
            get_sane_ip_location(uid=data["uid"], facts=data["facts"])
        )
        data = yield transformed
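The ptr1/ptr2 manipulation above is the standard pairwise idiom: tee the iterator, advance one copy, and zip, so each step sees (previous, current). Prepending None makes the first pair (None, first_fact), which forces a fresh random location for the very first fact. In isolation:

import itertools

facts = [None, 1, 2, 3]
ptr1, ptr2 = itertools.tee(facts, 2)
next(ptr2, None)  # shift the second copy forward by one
print(list(zip(ptr1, ptr2)))  # [(None, 1), (1, 2), (2, 3)]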
Code example #16
File: add.py Project: samhays/aws-admartech-samples
def build_iploc_knowledge(
    ip_facts_file,
    persistent_ids_facts_file,
    identity_group_facts_file,
    transient_ids_facts_file,
    dst,
):
    """
    Given some fact files, generate random locations and IP addresses in a sane way.

    It's like a funnel. At the very top you have identity groups, then persistent
    nodes, then transient nodes.

    The logic can be simplified to:
        * identity groups = select a few (at most 8, with very low probability) IP addresses
        * persistent nodes = select a few IP addresses from the group above
        * transient nodes = select a few IP addresses from the group above

    This way the context data is sane. Each transient node's IPs are a subset of its
    persistent id's IPs, which in turn are a subset of the identity group's IPs.

    The probabilities make it highly probable that transient nodes are within the same
    city and the same state. The same goes for persistent nodes.
    """
    IPLoc = namedtuple("IPLoc", "state, city, ip_address")

    with open(ip_facts_file) as f_h:
        ip_cidrs_by_state_city = list(json_lines_file(f_h))

    knowledge = {
        "identity_group": {},
        "persistent_id": {},
        "transient_ids": {}
    }

    def random_ip_loc():
        state_count, *_ = random.choices([1, 2], weights=[0.98, 0.02])
        for state_data in random.choices(ip_cidrs_by_state_city,
                                         k=state_count):
            city_count, *_ = random.choices([1, 2, 3, 4],
                                            weights=[0.85, 0.1, 0.04, 0.01])
            for city_data in random.choices(state_data["cities"],
                                            k=city_count):
                random_cidr = random.choice(city_data["cidr_blocks"])
                yield IPLoc(
                    state=state_data["state"],
                    city=city_data["city"],
                    ip_address=str(random.choice(
                        get_ip_addresses(random_cidr))),
                )

    def random_ip_loc_from_group(locations):
        # compute weights: each successive item is half as likely as the previous one
        weights = [1]
        for _ in locations[:-1]:
            weights.append(weights[-1] / 2)

        count = len(locations)
        random_count, *_ = random.choices(list(range(1, count + 1)),
                                          weights=weights)
        return list(set(random.choices(locations, k=random_count)))

    logger.info("Creating Identity group / persistent ids IP facts")
    with open(identity_group_facts_file) as f_h:
        for data in json_lines_file(f_h):
            locations = knowledge["identity_group"][data["igid"]] = list(
                set(random_ip_loc()))

            for persistent_id in data["persistentIds"]:
                knowledge["persistent_id"][
                    persistent_id] = random_ip_loc_from_group(locations)

    logger.info("Creating persistent / transient ids IP facts")
    with open(persistent_ids_facts_file) as f_h:
        for data in json_lines_file(f_h):
            persistent_id = data["pid"]
            # handle the case where the persistent id does not belong to any identity group
            if persistent_id not in knowledge["persistent_id"]:
                knowledge["persistent_id"][
                    persistent_id] = random_ip_loc_from_group(
                        list(set(random_ip_loc())))
            for transient_id in data["transientIds"]:
                knowledge["transient_ids"][
                    transient_id] = random_ip_loc_from_group(
                        knowledge["persistent_id"][persistent_id])
        # now assign random ip location for transient ids without persistent ids
        logger.info("Processing remaining transient ids facts")
        with open(transient_ids_facts_file) as t_f_h:
            for data in json_lines_file(t_f_h):
                if data["uid"] not in knowledge["transient_ids"]:
                    knowledge["transient_ids"][data["uid"]] = list(
                        set(
                            random_ip_loc_from_group(  # "transient group" level
                                random_ip_loc_from_group(  # "persistent group" level
                                    list(set(random_ip_loc())
                                         )  # "identity group" level
                                ))))

    with open(dst, "w") as f_h:
        for key, data in knowledge["transient_ids"].items():
            f_h.write(
                json.dumps({
                    "transient_id": key,
                    "loc": [item._asdict() for item in data]
                }) + "\n")