Example #1
def get_list_statistics(file):
    """
    For every policy, open up the list CSV file.  Read the number of
    entries to get the number of lists per file, then count the number
    of \n characters per list entry to figure out how many items are in
    each list.  Compute the average number of items per list and report back.

    In:     list CSV file to read.
    Out:    Policy object to return to the Pool list at the end.
    """
    with open(parser_output_dir + file, "r") as fp:
        csv_reader = reader(fp)
        elements = list(csv_reader)

    policy_stats = Policy(file, len(elements))
    num_items = []
    for l in elements:
        content_string = l[4]
        policy_stats.lists.append(content_string)
        num_items.append(content_string.count("\n") + 1)
    policy_stats.avg_list_len = sum(num_items) / len(num_items)

    # Update progress bar
    with index.get_lock():
        index.value += 1
        print_progress_bar(index.value,
                           len(random_files),
                           prefix="Sentence Statistics Progress:",
                           suffix="Complete",
                           length=50)

    return policy_stats
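
Note: every example in this listing calls a print_progress_bar helper that is not shown. Below is a minimal sketch of what such a helper typically looks like; the parameter names (prefix, suffix, length) come from the calls above, while the body itself is an assumption rather than the original implementation.

# Hypothetical sketch of the print_progress_bar helper used throughout these
# examples; only the parameter names are taken from the calls above.
def print_progress_bar(iteration, total, prefix="", suffix="", length=50, fill="#"):
    """Render a single-line terminal progress bar."""
    percent = "{0:.1f}".format(100 * (iteration / float(total)))
    filled_length = int(length * iteration // total)
    bar = fill * filled_length + "-" * (length - filled_length)
    # "\r" rewrites the same terminal line on every call
    print("\r{} |{}| {}% {}".format(prefix, bar, percent, suffix), end="\r")
    if iteration == total:  # move to a new line once the bar completes
        print()
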
Example #2
def verify(policy, ground_truth):
    """
    This function will verify that the HTML we scraped is actually a privacy
    policy.  (For example, we need to reject HTML which turns out to be an
    article about privacy or a pointer to policies as opposed to a privacy policy.)
    We accomplish this by comparing against a ground truth.  We build our ground
    truth by constructing a bag of words from human-verified privacy policies.
    HTML which does not pass the verification process will be logged then
    deleted.

    In:     policy filename, ground-truth text
    Out:    cosine similarity score of the ground truth and the policy document
            (0 if the file is not English, -2 if it is a duplicate policy)
    """
    if policy == ".DS_Store":
        return 0
    
    with open(policies_html_dir + policy, "r") as fp:
        html_contents = fp.read()
    html_contents = remove_company_names(strip_text(html_contents), policy[:-5]) + " "
    
    # verify majority of the contents are english-language, discard if not
    if not is_english(dictionary, html_contents):
        # print(policy + " is not english")
        return 0

    if is_duplicate_policy(html_contents, policy, policy_dict):
        # print("this is a duplicate policy")
        return -2
    
    # Create the Document Term Matrix and pandas dataframe
    # https://www.machinelearningplus.com/nlp/cosine-similarity/
    documents = [ground_truth, html_contents]
    vectorizer = TfidfVectorizer()
    sparse_matrix = vectorizer.fit_transform(documents)
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, 
            columns=vectorizer.get_feature_names(),
            index=["ground_truth", "corp"])

    # calculate cosine similarity of the ground truth and the policy
    # sim[0,1] is the value we actually care about
    sim = cosine_similarity(df, df)
    
    # Update progress bar
    with index.get_lock():
        index.value += 1
        print_progress_bar(index.value,
                           len(files),
                           prefix="Verification Progress:",
                           suffix="Complete",
                           length=50)

    return sim[0,1]
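
verify (and crawl below) also relies on an is_duplicate_policy helper that the listing does not include. One plausible implementation is sketched here, assuming policy_dict maps a hash of the stripped text to the first policy seen with that content; the body is an assumption, not the original code.

import hashlib

# Hypothetical sketch: policy_dict is assumed to map a content hash to the
# name of the first policy observed with that exact text.
def is_duplicate_policy(text, name, policy_dict):
    """Return True if an identical policy text has already been recorded."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    if digest in policy_dict:
        return True              # same text already seen under another name
    policy_dict[digest] = name   # remember this policy for later comparisons
    return False
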
Example #3
def extract_sentences(file):
    """
    Reads in a CSV file from the pre-generated parser output, looks at
    every line to gather sentences from it, applies the input
    ruleset to those sentences, and returns statistics.
    """
    policy_stats = Policy(file, rule_dict)

    with open(parser_output_dir + file, "r") as fp:
        csv_reader = reader(fp)
        elements = list(csv_reader)

    sentence_list = []
    for elem in elements:  # for every possible object
        sentences = sent_tokenize(elem[-1])
        for sentence in sentences:  # for every sentence in that object
            rule_hits = apply_sentence_rules(sentence, rule_dict)
            policy_stats.lengths.append(len(sentence.split()))
            sentence_list.append((len(sentence.split()), sentence, rule_hits))
            # loop through every rule name in the dict and increment the
            # policy_stats count if that rule hit this sentence
            for name in policy_stats.rule_hits:
                if name in rule_hits:
                    policy_stats.rule_hits[name] += 1
            policy_stats.sentences.append(rule_hits)

    # write sentences to csv file
    headings = ("Number of Words", "Sentence Text", "Rule Hits")
    with open(output_folder + file + "_sentences.csv", "w") as fp:
        csv_writer = writer(fp)
        csv_writer.writerow(headings)
        csv_writer.writerows(sentence_list)

    # create bar graphs of policy's sentence rule hits
    generate_rule_bar_fig(policy_stats.rule_hits,
                          output_folder + file[:-4] + "_rule_bar.pdf")

    # Update progress bar
    with index.get_lock():
        index.value += 1
        print_progress_bar(index.value,
                           len(random_files),
                           prefix="Sentence Statistics Progress:",
                           suffix="Complete",
                           length=50)

    return policy_stats
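
extract_sentences depends on apply_sentence_rules and a rule_dict defined elsewhere. The following is a minimal sketch under the assumption that rule_dict maps rule names to regular expressions and that the helper returns the names of the rules matching a given sentence; the example rules are purely illustrative.

import re

# Illustrative rule_dict shape: rule name -> regular expression (assumed).
rule_dict = {
    "SHARE": r"\b(share|disclose|sell)\b",
    "COLLECT": r"\b(collect|gather|obtain)\b",
    "GOOD": r"\S",  # catch-all rule that counts every non-empty sentence
}

def apply_sentence_rules(sentence, rules):
    """Return the list of rule names whose pattern matches the sentence."""
    return [name for name, pattern in rules.items()
            if re.search(pattern, sentence, re.IGNORECASE)]
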
Example #4
def generate_rule_hist_figs(files, rule_hits, lengths, num_files, rule_dict,
                            outfile):
    """
    Creates aggregate representation of the rule_vals dictionaries
    collected from every successfully parsed file.  Produces histograms
    of every sentence parsing rule and presents them as a single image.
    Does not include "GOOD" rules.

    In:     list of filenames,
            list of every policy's rule_hits dictionary,
            list of per-policy sentence lengths,
            the number of files that were inspected for sentences,
            the rule dict providing the names of rules as keys,
            string filepath to output the figure to.
    Out:    figure containing histograms of all rules.
    """
    num_files = len(rule_hits)
    rows = ceil(sqrt(len(rule_dict)) + 1) + 1
    cols = ceil(sqrt(len(rule_dict))) - 1
    # fig = plt.figure(figsize=(rows*10,cols*10))
    # i = 0
    # for i, (name, rule) in enumerate(rule_dict.items(), start=1):
    #     count = [d[name] for d in rule_hits]
    #     subfig = fig.add_subplot(rows,cols,i)
    #     subfig.set_xlabel(name + " Rule Hit Count")
    #     subfig.set_ylabel("Number of Policies")
    #     subfig.xaxis.set_major_locator(MaxNLocator(integer=True))
    #     subfig.hist(count, num_files, rwidth=0.5)
    #     print(i)
    # len_boxplot = fig.add_subplot(rows-1,1,5)
    # len_boxplot.set_xlabel("Sentence Length per Policy")
    # len_boxplot.set_ylabel("")
    # filenames = ["_".join(i.split("_", 2)[:2]) for i in files]
    # len_boxplot.boxplot(lengths)
    # fig.tight_layout()
    # fig.savefig(outfile)
    fig = plt.figure(figsize=(rows * 10, cols * 10))
    gs = fig.add_gridspec(rows, cols)
    r = 0
    c = 0
    for i, (name, rule) in enumerate(rule_dict.items(), start=1):
        count = [d[name] for d in rule_hits]
        # standalone_fig = hist(count, bins=arange(num_files + 1) - 0.5)
        standalone_fig = plt.figure()
        plt.hist(count,
                 bins=[
                     0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160,
                     180, 200, 250, 300, 350, 400
                 ],
                 rwidth=0.5,
                 figure=standalone_fig)
        plt.xlabel(name + " Rule Hit Count", figure=standalone_fig)
        plt.ylabel("# of Policies", figure=standalone_fig)
        standalone_fig.savefig(outfile[:-4] + "_" + name + ".pdf")
        subfig = fig.add_subplot(gs[r, c])
        subfig.set_xlabel(name + " Rule Hit Count")
        subfig.set_ylabel("# of Policies")
        # subfig.hist(count, bins=arange(num_files + 1) - 0.5)
        subfig.hist(count,
                    bins=[
                        0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140,
                        160, 180, 200, 250, 300, 350, 400
                    ],
                    rwidth=0.5)
        if c < cols - 1:
            c += 1
        else:
            c = 0
            r += 1
        print_progress_bar(i,
                           len(rule_dict.items()) + 1,
                           prefix="Rule Histograms Progress:",
                           suffix="Complete",
                           length=50)
    if c != 0:
        r += 1
    len_boxplot = fig.add_subplot(gs[r:, :])
    len_boxplot.set_xlabel("Sentence Length per Policy")
    len_boxplot.set_ylabel("")
    len_boxplot.tick_params(bottom=False, labelbottom=False)
    len_boxplot.boxplot(lengths)
    print_progress_bar(i + 1,
                       len(rule_dict.items()) + 1,
                       prefix="Rule Histograms Progress:",
                       suffix="Complete",
                       length=50)
    fig.tight_layout()
    fig.savefig(outfile)
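
The grid sizing at the top of generate_rule_hist_figs deliberately over-allocates rows so the per-rule histograms fill the upper part of the gridspec and the remaining full-width rows are left for the sentence-length boxplot. A quick check of the arithmetic for a hypothetical twelve-rule rule_dict:

from math import ceil, sqrt

# Worked example of the grid sizing for a hypothetical 12-rule rule_dict.
n_rules = 12
rows = ceil(sqrt(n_rules) + 1) + 1   # ceil(4.46) + 1 = 6
cols = ceil(sqrt(n_rules)) - 1       # ceil(3.46) - 1 = 3
print(rows, cols)  # 6 3 -> the 12 histograms occupy rows 0-3,
                   # leaving rows 4-5 for the full-width boxplot (gs[4:, :])
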
Example #5
def crawl(domain):
    """
    Primary function for the process pool.
    Crawl websites for links to privacy policies.  First check whether
    the website can be reached at all, then find the list of policy links
    on the first page.  Then loop through those links to see whether they
    are valid policies.  Keep statistics in every subprocess for the
    summary at the end.

    In:     domain landing page string
    Out:    CrawlReturn obj containing links, statistics about links,
            output files, etc.
    """
    # first get the domain landing page (prepend a scheme if the domain has none)
    full_url = domain if ("http" in domain) else "http://" + domain
    # full_url = full_url if ("https://" in full_url) else full_url.replace("http://", "https://")
    # domain_html = request(full_url, driver)
    domain_html = request(full_url)
    if strip_text(domain_html) == "":
        failed_access_domain = CrawlReturn(domain, False)
        failed_access_domains.append(failed_access_domain)
        with index.get_lock():  # Update progress bar
            index.value += 1
            print_progress_bar(index.value,
                               len(domain_list),
                               prefix="Crawling Progress:",
                               suffix="Complete",
                               length=50)
        return failed_access_domain

    # get links from domain landing page, return if none found
    links = find_policy_links(full_url, domain_html)
    if len(links) == 0:
        no_link_domain = CrawlReturn(domain, True)
        no_link_domains.append(no_link_domain)
        with index.get_lock():  # Update progress bar
            index.value += 1
            print_progress_bar(index.value,
                               len(domain_list),
                               prefix="Crawling Progress:",
                               suffix="Complete",
                               length=50)
        return no_link_domain

    # go down the link rabbit hole to download each page's HTML and verify that it is a policy
    retobj = CrawlReturn(domain, True)
    domain_successful_links = []
    domain_failed_links = []
    depth_count = 0
    output_count = 0
    for link in links:
        # link_html = request(link, driver)
        link_html = request(link)
        link_contents = strip_text(link_html)
        # link_dict[link] = domain
        # print(link_dict)

        # check whether we could even see this policy
        if link_contents == "":
            domain_failed_links.append(link)
            retobj.add_link(link, 0.0, "N/A", "N/A", False, False, False)
            continue  # policy is empty, skip this whole thing

        # add links on this page to the list to be visited if they are new
        if depth_count < max_crawler_depth:
            depth_count += 1
            new_links = find_policy_links(full_url, link_html)
            for l in new_links:
                if l not in links:
                    links.append(l)

        # get similarity score, check against the score threshold to see if policy
        sim_score = verify(link_contents, ground_truth)
        is_policy = sim_score >= cos_sim_threshold

        # if this page is a policy, check duplicate then write out to file
        if is_policy:
            if is_duplicate_policy(link_contents, domain, policy_dict):
                retobj.add_link(link, 0.0, "N/A", "N/A", True, True, True)
                continue  # we've already seen this policy, skip
            domain_successful_links.append(link)
            output_count += 1
            html_outfile = html_outfolder + domain[:-4] + "_" + str(
                output_count) + ".html"
            with open(html_outfile, "a") as fp:
                fp.write(link_html)
            stripped_outfile = stripped_outfolder + domain[:-4] + "_" + str(
                output_count) + ".txt"
            with open(stripped_outfile, "a") as fp:
                fp.write(link_contents)
            retobj.add_link(link, sim_score, html_outfile, stripped_outfile,
                            True, True, False)

        # this isn't a policy, so just add it to the stats and continue
        else:
            if is_duplicate_policy(link_contents, domain, policy_dict):
                retobj.add_link(link, 0.0, "N/A", "N/A", True, False, True)
                continue  # we've already seen this policy, skip
            domain_failed_links.append(link)
            retobj.add_link(link, sim_score, "N/A", "N/A", True, False, False)

    # check whether at least one link in the domain was successful
    successful_links.extend(domain_successful_links)
    failed_links.extend(domain_failed_links)
    if not any(link.valid for link in retobj.link_list):
        failed_link_domains.append(retobj)
    else:
        successful_domains.append(retobj)

    with index.get_lock():  # Update progress bar
        index.value += 1
        print_progress_bar(index.value,
                           len(domain_list),
                           prefix="Crawling Progress:",
                           suffix="Complete",
                           length=50)
    return retobj
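
crawl builds on request, strip_text, and find_policy_links helpers that are outside this listing. Below is a rough sketch of a find_policy_links implementation, assuming it scans anchor tags for privacy-related keywords and returns absolute URLs; the keyword list and the body are assumptions, not the original code.

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Hypothetical sketch: collect absolute URLs of anchors whose text or href
# mentions a privacy-related keyword.
def find_policy_links(base_url, html):
    keywords = ("privacy", "policy")  # illustrative keyword list
    links = []
    for anchor in BeautifulSoup(html, "html.parser").find_all("a", href=True):
        text = anchor.get_text().lower()
        href = anchor["href"].lower()
        if any(k in text or k in href for k in keywords):
            absolute = urljoin(base_url, anchor["href"])  # resolve relative links
            if absolute not in links:
                links.append(absolute)
    return links
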
Example #6
    def train(self):

        self.writer = tf.summary.create_file_writer(self.save_ckpt_path)
        os.makedirs(self.save_images_dir)
        os.makedirs(self.save_models_dir)

        for epoch_idx in range(1, self.epochs + 1):

            # Use metrics to store the loss history
            gen_total_loss_metrics = tf.keras.metrics.Mean("gen_total_loss")
            gen_gan_loss_metrics = tf.keras.metrics.Mean("gen_gan_loss")
            gen_l1_loss_metrics = tf.keras.metrics.Mean("gen_l1_loss")
            disc_loss_metrics = tf.keras.metrics.Mean("disc_loss")

            # Train
            for batch_idx, (input_image, target) in self.train_dataset.enumerate():
                gen_total_loss, gen_gan_loss, gen_l1_loss, disc_loss = self.train_step(input_image, target)

                # Average the loss over the batch
                gen_total_loss = gen_total_loss.numpy().mean()
                gen_gan_loss = gen_gan_loss.numpy().mean()
                gen_l1_loss = gen_l1_loss.numpy().mean()
                disc_loss = disc_loss.numpy().mean()

                # Feed the batch losses into the metrics (running mean over the epoch)
                gen_total_loss_metrics(gen_total_loss)
                gen_gan_loss_metrics(gen_gan_loss)
                gen_l1_loss_metrics(gen_l1_loss)
                disc_loss_metrics(disc_loss)

                # Print training step log
                prefix_log = 'Train -> [Epoch: {}/{}] | [Batch: {}/{}]'.format(epoch_idx,
                                                                               self.epochs,
                                                                               batch_idx,
                                                                               self.train_dataset_size + 1)

                suffix_log = "[gen_total_loss: {:1.5f}" \
                             " | gen_gan_loss {:1.5f}" \
                             " | gen_l1_loss {:1.5f}" \
                             " | disc_loss {:1.5f}]".format(float(gen_total_loss),
                                                            float(gen_gan_loss),
                                                            float(gen_l1_loss),
                                                            float(disc_loss))

                print_progress_bar(iteration=int(batch_idx),
                                   total=int(self.train_dataset_size + 1),
                                   prefix=prefix_log,
                                   suffix=suffix_log)

            # Generate and save sample images from one test batch
            for example_input, example_target in self.test_dataset.take(1):
                self.generate_and_save_images(example_input, example_target, epoch_done=epoch_idx)


            ###################
            ### TensorBoard ###
            ###################

            # Write the epoch's metric results to TensorBoard
            write_to_tensorboard_data = {"gen_total_loss": gen_total_loss_metrics.result(),
                                         "gen_gan_loss": gen_gan_loss_metrics.result(),
                                         "gen_l1_loss": gen_l1_loss_metrics.result(),
                                         "disc_loss": disc_loss_metrics.result()}

            self.write_to_tensorboard(data_dict=write_to_tensorboard_data, epoch_done=epoch_idx, condition='train')

            # Save model
            self.save_model(epoch_done=epoch_idx)
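
train() hands its metric results to a write_to_tensorboard method that is not part of the listing. A minimal sketch of what such a method could look like is below, assuming tf is the tensorflow module already imported by this class and that each metric is logged as a scalar under the given condition with the epoch index as the step; the body is an assumption, not the original implementation.

    def write_to_tensorboard(self, data_dict, epoch_done, condition):
        # Hypothetical sketch: log every metric in data_dict as a scalar,
        # e.g. "train/gen_total_loss", with the epoch index as the step.
        with self.writer.as_default():
            for name, value in data_dict.items():
                tf.summary.scalar(condition + "/" + name, value, step=epoch_done)
            self.writer.flush()
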
Example #7
def process_policy(fname):
    """
    Entry function for each subprocess.  Reads in the HTML contents and
    stripped text of the input policy filename, creates all the output
    files needed for this policy, instantiates a bs4 object and an
    object to hold statistics about the policy, walks the bs4 tree,
    outputs each tag-type's list to its own CSV file, then builds
    the sequential list of all elements in the HTML file, then hands
    everything off to the sentence extraction phase.

    In:     policy filename.
    Out:    tuple containing the policy's rule_hits dict, the filename, and
            the list of sentence lengths (or None if the policy is skipped).
    """
    with open(dataset_html + fname, "r") as fp:
        html_contents = fp.read()
    with open(dataset_text + fname[:-5] + ".txt", "r") as fp:
        auto_stripped_text = fp.read()
    if html_contents == "":
        print("Skipping " + fname + " because it has no html contents.")
        # this isn't considered failure because html empty isn't the parser's fault
        return None
    if auto_stripped_text == "":
        print("Skipping " + fname + " because it has no text contents.")
        # this isn't considered failure because if the whole text is empty, there's no way to compare
        return None

    # build all the output files
    outfile_sequential = parser_output_folder + fname[:-5] + timestamp + "_sequential.txt"
    outfile_paragraphs = parser_output_folder + fname[:-5] + timestamp + "_paragraphs.csv"
    outfile_headers = parser_output_folder + fname[:-5] + timestamp + "_headers.csv"
    outfile_lists = parser_output_folder + fname[:-5] + timestamp + "_lists.csv"
    outfile_compare = parser_output_folder + fname[:-5] + timestamp + "_compare.txt"
    outfile_rule_bar = tokenizer_output_folder + fname[:-5] + timestamp + "_rule_bar.png"
    outfile_sentences = tokenizer_output_folder + fname[:-5] + timestamp + "_sentences.csv"

    # walk tree to parse all the beautiful soup tags and build comparison text
    try:
        soup = BeautifulSoup(html_contents, "html.parser")
    except Exception as e:
        print("Skipping " + fname +
              " because it can't be read by BeautifulSoup.")
        return None  # if there's no soup, we don't care
    parser = ParserData(rule_dict)
    walk_tree(remove_bad_tags(soup), parser)

    # output the parsed tags to their appropriate files
    if len(parser.paragraph_list) > 0:
        write_tag_list_to_csv(parser, parser.paragraph_list,
                              outfile_paragraphs)
    if len(parser.header_list) > 0:
        write_tag_list_to_csv(parser, parser.header_list, outfile_headers)
    if len(parser.list_list) > 0:
        write_tag_list_to_csv(parser, parser.list_list, outfile_lists)

    # go through entire sequential list to build sequential file
    out_string = ""
    for element in parser.seq_list:
        out_string = out_string + element.tag_type + str(
            element.tag_index) + "\n" + element.content_string + "\n"
    with open(outfile_sequential, "a") as fp:
        fp.write(out_string)

    # Update progress bar
    with index.get_lock():
        index.value += 1
        print_progress_bar(index.value,
                           len(files),
                           prefix="Parsing-Tokenizing Progress:",
                           suffix="Complete",
                           length=50)

    # Decide whether the parsing was successful
    remaining_sentences = compare_parsed_text(parser.seq_list,
                                              auto_stripped_text)
    # take a full lock here because err.txt & success.txt are shared files
    lock = Lock()
    if len(remaining_sentences) > 5:
        # parsing failed --> don't bother doing anything else to this policy
        lock.acquire()
        try:
            num_failed_policies.value += 1
            with open(outfile_compare, "a") as fp:
                fp.write("\n\n".join(remaining_sentences) + "\n")
            with open(parser_output_folder + "err.txt", "a") as fp:
                fp.write(fname[:-5] + " has " + str(len(remaining_sentences)) +
                         " left.\n")
        finally:
            lock.release()
        return None
    else:
        # parsing succeeded --> sentence-tokenize as much as possible
        # from the parsed elements
        extract_sentences(parser, outfile_sentences, outfile_rule_bar)
        lock.acquire()
        try:
            with open(parser_output_folder + "success.txt", "a") as fp:
                fp.write(fname[:-5] + " has " + str(parser.rule_hits["GOOD"]) +
                         " good sentences.\n")
        finally:
            lock.release()
        return (parser.rule_hits.copy(), fname, parser.sentence_lengths)
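
All of the worker functions above bump a shared index counter inside index.get_lock() before redrawing the progress bar. The sketch below shows how such a counter is usually wired into a multiprocessing.Pool; the initializer pattern and the _worker stub are assumptions about the surrounding driver code, and print_progress_bar refers to the sketch given after Example #1.

import multiprocessing as mp

index = None  # set in each worker process by the pool initializer

def _init_worker(shared_index):
    """Expose the shared counter as the global `index` the workers update."""
    global index
    index = shared_index

def _worker(item):
    with index.get_lock():  # same update pattern as the examples above
        index.value += 1
        print_progress_bar(index.value, 4, prefix="Progress:", suffix="Complete", length=50)
    return item

if __name__ == "__main__":
    counter = mp.Value("i", 0)  # integer counter shared across processes
    with mp.Pool(2, initializer=_init_worker, initargs=(counter,)) as pool:
        pool.map(_worker, range(4))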