Example #1
def run_leak_detection(crawl_name, req_df, title_dict=None, device_ids=None):
    crawl_data_dir = get_crawl_data_path(crawl_name)
    if DEBUG:
        print("%d reqs from %d channels" %
              (len(req_df), req_df.channel_id.nunique()))

    if device_ids is None:
        # we factory reset the device before the manual_v2 crawls
        if crawl_name.endswith("manual_v2"):
            device_ids = TV_ID_MAP[get_ott_device_mac(crawl_data_dir)]
        else:
            device_ids = TV_ID_MAP_V1[get_ott_device_mac(crawl_data_dir)]
    print("Will search for the following IDs", device_ids)
    leak_df = detect_leaks_in_requests(req_df, device_ids, title_dict)

    # remove false positives where the channel name appears in the hostname
    # e.g. the accuradio channel talking to accuradio.com
    # remove_ch_name_url_false_positives(leak_df)

    if DEBUG:
        for id_type in device_ids.keys():
            num_leaks = leak_df[leak_df.id_type ==
                                id_type].channel_id.nunique()
            if num_leaks:
                print("%d channels leaked %s" % (num_leaks, id_type))

    return leak_df, device_ids
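# Usage sketch (not part of the original example): the crawl name is
# hypothetical, and load_df() is the request loader used in Example #4 below.
def example_run_leak_detection(crawl_name="roku-manual_v2"):
    req_df = load_df(crawl_name, "http_req")
    leak_df, device_ids = run_leak_detection(crawl_name, req_df)
    # number of channels that leaked each ID type
    return leak_df.groupby("id_type").channel_id.nunique()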
def get_distinct_ssl_conns(crawl_name, name_resolution=True):
    df = pd.DataFrame([])
    crawl_data_dir = get_crawl_data_path(crawl_name)
    post_process_dir = join(crawl_data_dir, 'post-process')
    print("Loading distinct SSL connections from %s " % post_process_dir)

    for txt_path in glob(join(post_process_dir, "*.ssl_connections")):
        filename = basename(txt_path)
        channel_id = filename.split("-")[0]
        #print(txt_path)
        tmp_df = pd.read_csv(txt_path,
                             sep='|',
                             encoding='utf-8',
                             index_col=None)
        tmp_df['channel_id'] = channel_id
        tmp_df["ssl.record.content_type"] = tmp_df[
            "ssl.record.content_type"].astype(str)
        # print(df["ssl.record.content_type"].value_counts())
        # print("before", len(df))
        tmp_df = tmp_df[tmp_df["ssl.record.content_type"].str.contains("22")]
        df = df.append(tmp_df.drop_duplicates("tcp.stream"))
    assert len(df)

    # replace dots in column names with underscores
    # mapping = {old_col: old_col.replace(".", "_") for old_col in df.columns}
    # df.rename(columns=mapping, inplace=True)
    replace_in_column_names(df, ".", "_")
    return df
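# Sketch (assumption): TLS record content type 22 marks handshake records, so
# after the filter above each row is one distinct TLS connection attempt.
def example_count_ssl_conns(crawl_name="roku-manual_v2"):
    ssl_df = get_distinct_ssl_conns(crawl_name)
    # distinct TLS connections started by each channel
    return ssl_df.groupby("channel_id")["tcp_stream"].nunique()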
Example #3
def detect_openwpm_leaks(crawl_name):
    req_df = pd.DataFrame([])
    crawl_data_dir = get_crawl_data_path(crawl_name)

    for openwpm_db_path in glob(
            join(crawl_data_dir, "openwpm-data/*/crawl-data.sqlite")):
        tmp_df = load_reqs_as_df(openwpm_db_path)
        req_df = req_df.append(tmp_df)
    return run_leak_detection(crawl_name, req_df)
Example #4
def search_for_video_titles(crawl_name):
    crawl_data_dir = get_crawl_data_path(crawl_name)
    title_list = join(crawl_data_dir, "post-process/global_imdb_titles.json")
    title_dict = defaultdict(set)
    req_df = load_df(crawl_name, "http_req")
    for l in open(title_list):
        channel_id, ts, title = l.rstrip().split("\t")
        title_dict[channel_id].add(title)

    run_leak_detection(crawl_name, req_df, title_dict, device_ids={})
def get_playback_detection_results(crawl_name):
    playback_detected = dict()
    crawl_dir = get_crawl_data_path(crawl_name)
    for log_file in glob(join(crawl_dir, "logs", "*.log")):
        channel_id = basename(log_file).rsplit("-", 1)[0]
        for l in open(log_file):
            if "SMART_CRAWLER: Playback detected on channel" in l:
                time_str = l.split('[')[1].split(']')[0]
                # the following works with python 3
                # timestamp = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()
                playback_detected[channel_id] = time_str
    return playback_detected
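# Sketch (assumption): converting the bracketed log time into an epoch
# timestamp with the Python 3 format string from the commented-out line above.
def playback_detection_timestamps(crawl_name):
    from datetime import datetime
    detected = get_playback_detection_results(crawl_name)
    return {channel_id: datetime.strptime(ts, '%Y-%m-%d %H:%M:%S.%f').timestamp()
            for channel_id, ts in detected.items()}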
def print_crawl_summary(crawl_name):
    detected = get_playback_detection_results(crawl_name)
    crawl_dir = get_crawl_data_path(crawl_name)
    crawl_statuses = get_crawl_status(crawl_dir)
    n_success = sum(1 for status in crawl_statuses.values()
                    if status == "TERMINATED")
    print("Crawl summary:", crawl_name, crawl_dir)
    print("---------------------")
    print("Total channels", len(crawl_statuses))
    print("Successful crawls", n_success)
    print("Results", Counter(crawl_statuses.values()))
    print("Playback detected in", len(detected))
    return n_success
def get_last_smart_launch_times(crawl_name):
    """Use timestamps.txt files to find the beginning of smart launch for each channel.
    
    We will use this to only consider leaks from the last launch.
    """
    last_smart_launch_times = {}
    crawl_dir = get_crawl_data_path(crawl_name)
    for fname in glob(join(crawl_dir, "logs/*-timestamps.txt")):
        channel_name = basename(fname).split("-")[0]
        for l in open(fname):
            # find the beginning of the smart launch
            if "key-seq-01" in l and "key-0" not in l:
                last_smartlaunch_ts = l.rstrip().split(",")[-1]
                last_smart_launch_times[channel_name] = last_smartlaunch_ts

    return last_smart_launch_times
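# Sketch (assumption): the docstring above says these timestamps are used to
# keep only leaks from the last launch. Column names such as frame_time_epoch
# are borrowed from the TCP example further down and may differ elsewhere.
def filter_after_last_launch(df, last_smart_launch_times):
    def after_launch(row):
        launch_ts = last_smart_launch_times.get(row["channel_id"])
        return launch_ts is not None and float(row["frame_time_epoch"]) >= float(launch_ts)
    return df[df.apply(after_launch, axis=1)]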
def get_http_df(crawl_name, drop_from_unfinished=True):
    # print("Will load HTTP dataframe for", crawl_data_dir)
    crawl_data_dir = get_crawl_data_path(crawl_name)
    h1_requests, h1_responses, h1_dns = get_http1_df(crawl_data_dir)
    h2_requests, h2_responses, h2_dns = get_http2_df(crawl_data_dir)
    requests = h1_requests.append(h2_requests, sort=False)
    responses = h1_responses.append(h2_responses, sort=False)

    crawl_statuses = get_crawl_status(crawl_data_dir)
    add_channel_crawl_status(requests, crawl_statuses, drop_from_unfinished)
    add_channel_crawl_status(responses, crawl_statuses, drop_from_unfinished)

    playback_detected = get_playback_detection_results(crawl_name)
    requests['playback'] = requests['channel_id'].map(
        lambda x: x in playback_detected)
    responses['playback'] = responses['channel_id'].map(
        lambda x: x in playback_detected)

    add_adblocked_status(requests, check_by_url=True)
    requests["tcp_stream"] = pd.to_numeric(requests["tcp_stream"])
    requests["tcp_dstport"] = pd.to_numeric(requests["tcp_dstport"])
    return replace_nan(requests), replace_nan(responses), h1_dns
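# Usage sketch (assumption): the crawl name is hypothetical; get_http_df()
# returns merged HTTP/1 and HTTP/2 requests, responses and HTTP/1 DNS records.
def example_http_overview(crawl_name="roku-manual_v2"):
    requests, responses, dns_df = get_http_df(crawl_name)
    # fraction of requests sent by channels where playback was detected
    return requests.playback.mean(), len(requests), len(responses)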
def get_n_successful_channels(crawl_name):
    """Return the number channels crawled without problem."""
    crawl_dir = get_crawl_data_path(crawl_name)
    crawl_statuses = get_crawl_status(crawl_dir)
    return sum(1 for status in crawl_statuses.values()
               if status == "TERMINATED")
def get_distinct_tcp_conns(crawl_name,
                           name_resolution=True,
                           drop_from_unfinished=True,
                           http_requests=None,
                           adBlockStat=True):
    df = pd.DataFrame([])
    crawl_data_dir = get_crawl_data_path(crawl_name)
    tv_ip = get_tv_ip_addr(crawl_data_dir)
    post_process_dir = join(crawl_data_dir, 'post-process')
    print("Loading distinct TCP connections from %s " % post_process_dir)
    if name_resolution:
        # rIP2NameDB, _ = load_dns_data(crawl_data_dir)
        _, ip_2_domains_by_channel = load_dns_data_from_pcap_csvs(
            crawl_data_dir)

    pattern = "*.tcp_streams"
    if DEBUG:
        pattern = "com.w*.tcp_streams"
    #DEBUG com.amazon.rialto.cordova.webapp.webappb656e57
    for txt_path in glob(join(post_process_dir, pattern)):
        filename = basename(txt_path)
        channel_id = filename.split("-")[0]
        #print(txt_path)
        tmp_df = pd.read_csv(txt_path,
                             sep=',',
                             encoding='utf-8',
                             index_col=None,
                             error_bad_lines=False)
        tmp_df['channel_id'] = channel_id
        tmp_df['mitm_attempt'] = 0
        # tmp_df['mitm_fail'] = 0
        # take distinct TCP connections
        df = df.append(tmp_df.drop_duplicates("tcp.stream"))
    assert len(df)
    # replace dots in column names with underscores
    # mapping = {old_col: old_col.replace(".", "_") for old_col in df.columns}
    # df.rename(columns=mapping, inplace=True)
    replace_in_column_names(df, ".", "_")
    # only take outgoing TCP packets
    df = df[df.ip_src == tv_ip]
    if name_resolution and ip_2_domains_by_channel is not None:
        add_hostname_col_by_dns(df, ip_2_domains_by_channel, "ip_dst")

    # add human readable timestamps
    df['timestamp'] = df['frame_time_epoch'].map(
        lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))
    channel_df = read_channel_details_df()
    try:
        add_channel_details(df, channel_df)
    except Exception as e:
        # missing channel metadata
        print("ERR", e)
        pass

    playback_detected = get_playback_detection_results(crawl_name)
    df['playback'] = df['channel_id'].map(lambda x: x in playback_detected)
    crawl_statuses = get_crawl_status(crawl_data_dir)
    add_channel_crawl_status(df, crawl_statuses, drop_from_unfinished)
    http_hostnames = get_http_hostnames(crawl_name, http_requests)
    tls_snis = get_tls_snis(crawl_name)
    # 1- use host header from the HTTP request if available
    # 2- for connections to port 443: use SNI
    # 3- use DNS records if 1 and 2 fails
    df['host'] = df.apply(lambda x: get_hostname_for_tcp_conn(
        x, http_hostnames, tls_snis, ip_2_domains_by_channel),
                          axis=1)
    df = replace_nan(df)

    if adBlockStat:
        add_adblocked_status(df)

    df['domain'] = df.host.map(
        lambda x: get_fld("http://" + x, fail_silently=True))
    return df
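# Usage sketch (assumption): the crawl name is hypothetical. Lists the most
# frequently contacted registrable domains across all distinct TCP connections.
def example_top_tcp_domains(crawl_name="roku-manual_v2", n=10):
    tcp_df = get_distinct_tcp_conns(crawl_name)
    return tcp_df.domain.value_counts().head(n)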