Example #1
def get_token_counts(data_path, max_sent, max_highlight):

    doc_paths = [
        os.path.join(data_path, file) for file in os.listdir(data_path)
    ]

    num_docs = len(doc_paths)
    counts_inp = defaultdict(int)
    counts_hl = defaultdict(int)

    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()
        doc = read_document(doc_path)

        for sent in doc["sentences"][:max_sent]:
            tokens = preprocess_tokens(sent["tokens"], doc["entities"])
            for token in tokens:
                counts_inp[token] += 1

        for sent in doc["highlights"][:max_highlight]:
            tokens = preprocess_tokens(sent["tokens"], doc["entities"])
            for token in tokens:
                counts_hl[token] += 1

    return counts_inp, counts_hl
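
The examples on this page index the result of read_document in a consistent way. The sketch below is a minimal, hypothetical fixture matching those accesses (a "url" string, an "entities" map from id to surface form, and "sentences"/"highlights" lists of token dicts); the concrete values are invented placeholders, not real corpus data, and the counting loop mirrors get_token_counts without the file I/O or entity preprocessing.

from collections import defaultdict

# Hypothetical fixture: field names follow how `doc` is indexed in these
# examples; the values are made up for illustration.
example_doc = {
    "url": "http://example.com/article",                   # placeholder
    "entities": {"@entity1": "john smith"},                 # id -> surface form
    "sentences": [
        {"tokens": ["@entity1", "visited", "paris", "."], "score": 1},
        {"tokens": ["the", "trip", "was", "short", "."], "score": 0},
    ],
    "highlights": [
        {"tokens": ["@entity1", "in", "paris"]},
    ],
}

# The counting loop from get_token_counts, minus file I/O and preprocessing.
counts_inp = defaultdict(int)
for sent in example_doc["sentences"]:
    for token in sent["tokens"]:
        counts_inp[token] += 1
print(dict(counts_inp))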
Example #2
def process_document(document_path, output_dir, lead):

    output_path = os.path.join(output_dir, os.path.split(document_path)[1])
    doc = read_document(document_path)
    summary_text = build_summary(doc, lead)
    with open(output_path, "w") as f:
        f.write(summary_text)
Example #3
def process_document(args):
    document_path, output_dir, rouge_settings = args

    output_path = os.path.join(output_dir, os.path.split(document_path)[1])
    doc = read_document(document_path)
    summary_text = build_summary(doc, rouge_settings)
    with open(output_path, "w") as f:
        f.write(summary_text)
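
The second process_document takes a single args tuple, which is the calling convention multiprocessing.Pool.map expects. The driver below is a sketch under that assumption; run_all, the worker count, and the argument names are illustrative, not part of the original code.

from multiprocessing import Pool

def run_all(document_paths, output_dir, rouge_settings, workers=4):
    # One (path, output_dir, settings) tuple per document, matching the
    # unpacking at the top of process_document(args).
    jobs = [(path, output_dir, rouge_settings) for path in document_paths]
    pool = Pool(workers)
    try:
        pool.map(process_document, jobs)
    finally:
        pool.close()
        pool.join()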
Example #4
def test_read(data_path):

    doc_paths = [
        os.path.join(data_path, file) for file in os.listdir(data_path)
    ]
    doc_paths.sort()
    num_docs = len(doc_paths)

    bad_paths = list()

    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()
        try:
            read_document(doc_path)
        except ValueError as e:
            bad_paths.append((str(e), doc_path))

    return bad_paths
Example #5
def process_document(args): 
    document_path, output_dir, max_input, max_highlight, no_overwrite = args
    output_path = os.path.join(output_dir, os.path.split(document_path)[1])
    if no_overwrite and os.path.exists(output_path):
        return
    print document_path

    doc = read_document(document_path)

    meta = init_doc_meta(doc, max_input)
    data = list()
    for highlight in doc["highlights"][:max_highlight]:
        backbone, support, alignments = find_highlight_alignments(
            highlight, doc, meta)
        data.append([backbone, support, alignments])

    with open(output_path, "w") as f:
        f.write(yaml.dump(data))
Example #6
def build_summary(doc_path, align_path, summary_path):

    backbones = []
    used = set()
    with open(align_path, "r") as f:
        alignments = yaml.load(f)
        for backbone, support, ta in alignments:
            if backbone is not None and backbone not in used:
                backbones.append(backbone)
                used.add(backbone)

    doc = read_document(doc_path)

    lines = list()
    for b in backbones:
        tokens = doc["sentences"][b]["tokens"]
        sent_str = " ".join(replace_entities(tokens, doc["entities"]))
        lines.append(sent_str)

    with open(summary_path, "w") as f:
        f.write("\n".join(lines))
Example #7
    def display_example(example):

        example = int(example)

        doc_path = app.config["DOC_PATHS"][example]
        align_path = app.config["ALIGN_PATHS"][example]
        doc = read_document(doc_path)
        doc_tokens = [replace_entities(s["tokens"], doc["entities"])
                      for s in doc["sentences"][:25]]
        highlight_tokens = [replace_entities(s["tokens"], doc["entities"])
                            for s in doc["highlights"][:4]]

        i = 0  # running token id across all document tokens

        doc_token_ids = list()
        for tokens in doc_tokens:
            token_ids = list()
            for token in tokens:
                token_ids.append(i)
                i += 1
            doc_token_ids.append(token_ids)


        backbone_ids = list()
        alignments = list()
        with open(align_path, "r") as f:
            #backbones, support, alignments 
            data = yaml.load(f)
            for backbone, support, alignment in data:
                if backbone is not None:
                    backbone_ids.append(doc_token_ids[backbone])
                else:
                    backbone_ids.append(list())
                alignments.append(alignment)
        return render_template("default.html", doc_tokens=doc_tokens,
            highlights=highlight_tokens, alignments=alignments,
            alignments_json=json.dumps(alignments),
            backbone_ids=json.dumps(backbone_ids))
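
display_example appears to assume a Flask app whose config carries the document and alignment paths. The wiring below is a sketch under that assumption: the route rule and config values are placeholders, and registering the view this way presumes display_example is reachable at module level, whereas in the original it looks nested inside another function.

from flask import Flask

app = Flask(__name__)
app.config["DOC_PATHS"] = ["/path/to/docs/example0"]      # placeholder paths
app.config["ALIGN_PATHS"] = ["/path/to/aligns/example0"]

# Map /example/<example> onto the view; <example> arrives as a string,
# which is why display_example starts with int(example).
app.add_url_rule("/example/<example>", "display_example", display_example)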
Example #8
def main():

    import argparse

    hlp = "View a random document"

    parser = argparse.ArgumentParser(description=hlp)
    parser.add_argument('--corpus', required=True, help="Corpus to use.",
        choices=["dailymail", "cnn"])
    parser.add_argument('--data-path', required=True, 
        help="Path to Cheng&Lapata data.")
    parser.add_argument('--split', required=True, help="Data split to use.",
        choices=["train", "dev", "test"])
    parser.add_argument('--replace-entities', default=False, 
        action="store_true")
    parser.add_argument('--pproc', default=False, 
        action="store_true")

    args = parser.parse_args()

    arg2split = {"test": "test", "train": "training", "dev": "validation"}
    split = arg2split[args.split]

    data_path = os.path.join(args.data_path, args.corpus, split)
    doc_paths = [os.path.join(data_path, file) 
                 for file in os.listdir(data_path)]
    doc_paths.sort()
    random.shuffle(doc_paths)

    doc = read_document(doc_paths[0])

    print("url")
    print("===")
    print(doc["url"])

    print("\nINPUT")
    print("=====")
    for s, sent in enumerate(doc["sentences"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"]) 
        sent_str = " ".join(tokens)
        line = "{}) [{}] {}".format(s, sent["score"], sent_str)
        print(textwrap.fill(line, subsequent_indent="   "))

    print("\nENTITIES")
    print("========")
    for id, entity in sorted(doc["entities"].items(), key=lambda x: x[0]):
        print("{:10} :: {}".format(id, entity))

    print("\nHIGHLIGHTS")
    print("==========")

    for s, sent in enumerate(doc["highlights"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"]) 
        sent_str = " ".join(tokens)
        line = "{}) {}".format(s, sent_str)
        print(textwrap.fill(line, subsequent_indent="   "))
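
A small usage sketch for the command-line interface above, driven without a shell; the parser is rebuilt locally so the snippet stands alone, and the data path is a placeholder.

import argparse
import os

parser = argparse.ArgumentParser(description="View a random document")
parser.add_argument("--corpus", required=True, choices=["dailymail", "cnn"])
parser.add_argument("--data-path", required=True)
parser.add_argument("--split", required=True, choices=["train", "dev", "test"])
parser.add_argument("--replace-entities", default=False, action="store_true")
parser.add_argument("--pproc", default=False, action="store_true")

args = parser.parse_args(
    ["--corpus", "cnn", "--data-path", "/path/to/data", "--split", "dev"])

arg2split = {"test": "test", "train": "training", "dev": "validation"}
print(os.path.join(args.data_path, args.corpus, arg2split[args.split]))
# /path/to/data/cnn/validation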
Example #9
def collect_split_stats(data_dir, alignments_dir, vocab_out):

    document_paths = get_document_paths(data_dir)
    alignments_paths = get_document_paths(alignments_dir)

    backbone_counts = list()
    highlight_counts = list()
    support_counts = list()
    aligned_counts = list()
    unaligned_ent_counts = list()
    unaligned_counts = list()
    unaligned_common_counts = list()

    for doc_path, align_path in izip(document_paths, alignments_paths):
        
        if os.path.basename(doc_path) != os.path.basename(align_path):
            raise Exception(
                "Alignments directory does not contain one file for every "
                "file in data path.")
        
        doc = read_document(doc_path)

        with open(align_path, "r") as f:
            alignments = yaml.load(f)
        
        backbone_count = 0
        for a in xrange(len(alignments)):
            backbone, support, token_alignments = alignments[a]
            if backbone is not None:
                backbone_count += 1
            support_counts.append(len(support))

            highlight_tokens = doc["highlights"][a]["tokens"]
            pp_highlight_tokens = preprocess_tokens(
                    highlight_tokens, doc["entities"]) 

            aligned_tokens = list()
            unaligned_tokens = list()
            unaligned_common_tokens = list()
            unaligned_entity_tokens = list()

            for token, align in izip(pp_highlight_tokens, token_alignments):
                if align == unk_id or align == sw_id: 
                    unaligned_tokens.append(token)
                    if token in vocab_out:
                        unaligned_common_tokens.append(token)
                    elif token == "__ENTITY__":
                        unaligned_entity_tokens.append(token)
                else:
                    aligned_tokens.append(token)
                
            unaligned_ent_counts.append(len(unaligned_entity_tokens))

            aligned_counts.append(len(aligned_tokens))
            unaligned_counts.append(len(unaligned_tokens))
            unaligned_common_counts.append(len(unaligned_common_tokens))

        backbone_counts.append(backbone_count)
        highlight_counts.append(len(alignments))

    print "% highlights w/o alignments", \
        1 - np.sum(backbone_counts) / np.sum(highlight_counts)
    print "macro avg. support", np.mean(support_counts)

    aligned_counts = np.array(aligned_counts)
    unaligned_counts = np.array(unaligned_counts)
    unaligned_common_counts = np.array(unaligned_common_counts)
    total_tokens = aligned_counts + unaligned_counts
    
    macro_avg_align_recall = (aligned_counts / total_tokens).mean()
    micro_avg_align_recall = aligned_counts.sum() / total_tokens.sum()
    
    macro_avg_unalign_recall = (unaligned_common_counts / total_tokens).mean()
    micro_avg_unalign_recall = \
        unaligned_common_counts.sum() / total_tokens.sum()

    macro_avg_unalign_ent_recall = (unaligned_ent_counts / total_tokens).mean()

    macro_avg_max_recall = \
        ((unaligned_common_counts + aligned_counts) / total_tokens).mean()
    micro_avg_max_recall = \
        (unaligned_common_counts.sum() + aligned_counts.sum()) \
        / total_tokens.sum()

    print "avg. token count", total_tokens.mean()
    print "macro avg. align. recall", macro_avg_align_recall
    print "micro avg. align. recall", micro_avg_align_recall
    print "macro avg. unalign. recall", macro_avg_unalign_recall
    print "micro avg. unalign. recall", micro_avg_unalign_recall
    print "macro avg. unalign. ent recall", macro_avg_unalign_ent_recall

    print "macro avg. max recall", macro_avg_max_recall
    print "micro avg. max recall", micro_avg_max_recall
Example #10
def process_example(doc_path, align_path):

    print doc_path
    doc = read_document(doc_path)

    sent2token_ids = list()
    sent2pretty_tokens = list()
    sent2tokens = list()

    id = 0
    for sent in doc["sentences"]:
        token_ids = list()
        pretty_tokens = replace_entities(sent["tokens"], doc["entities"])
        pp_tokens = preprocess_tokens(sent["tokens"], doc["entities"])
        for token in pretty_tokens:
            token_ids.append(id)
            id += 1

        sent2token_ids.append(token_ids)
        sent2pretty_tokens.append(pretty_tokens)
        sent2tokens.append(pp_tokens)

    hl_tokens_pretty = replace_entities(doc["highlights"][0]["tokens"],
                                        doc["entities"])
    hl_tokens = preprocess_tokens(doc["highlights"][0]["tokens"],
                                  doc["entities"])

    with open(align_path, "r") as f:
        backbone, supports, alignments = yaml.load(f)[0]

    token_ids_flat = list(["<S>"])
    token_ids_flat.extend(sent2token_ids[backbone])
    pretty_tokens_flat = list(["<S>"])
    pretty_tokens_flat.extend(sent2pretty_tokens[backbone])
    tokens_flat = list(["<S>"])
    tokens_flat.extend(sent2tokens[backbone])

    for support in supports:
        token_ids_flat.append("<B>")
        token_ids_flat.extend(sent2token_ids[support])
        pretty_tokens_flat.append("<B>")
        pretty_tokens_flat.extend(sent2pretty_tokens[support])
        tokens_flat.append("<B>")
        tokens_flat.extend(sent2tokens[support])

    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index)
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(-1)
            else:
                relative_alignments.append(-99)

    print
    print len(supports)
    print pretty_tokens_flat
    print hl_tokens_pretty
    print relative_alignments
    print [pretty_tokens_flat[a] if a > -1 else -1 for a in relative_alignments]

    print [a + len(vocab2id_out) if a > -1 else a for a in relative_alignments]

    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index + len(id2vocab_out))
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(vocab2id_out[hl_tokens[i]])
            else:
                relative_alignments.append(vocab2id_out["__UNK__"])
    print relative_alignments

    backbone_data_items = list()
    backbone_data_items.append(vocab2id_in.get("<S>"))
    for token in sent2tokens[backbone]:
        backbone_data_items.append(
            vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    backbone_data_str = " ".join(str(i) for i in backbone_data_items)

    print sent2tokens[backbone]
    print [
        vocab2id_in.get(token, vocab2id_in["__UNK__"])
        for token in sent2tokens[backbone]
    ]
    print backbone_data_str
    print

    support_data_items = list()

    for support in supports:
        print sent2tokens[support]
        print [
            vocab2id_in.get(token, vocab2id_in["__UNK__"])
            for token in sent2tokens[support]
        ]
        print
        support_data_items.append(vocab2id_in["<B>"])
        for token in sent2tokens[support]:
            support_data_items.append(
                vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    support_data_items.append(vocab2id_in["<B>"])

    support_data_str = " ".join(str(i) for i in support_data_items)

    relative_alignments = ([vocab2id_out["<D>"]] + relative_alignments
                           + [vocab2id_out["<E>"]])
    target_data_str = " ".join(str(i) for i in relative_alignments)

    print "THEDATA"
    print "======="
    print backbone_data_str
    print support_data_str
    print target_data_str

    print
    print [id2vocab_in[i] for i in backbone_data_items]
    print [id2vocab_in[i] for i in support_data_items]
    print [
        i if i < len(id2vocab_out)
        else pretty_tokens_flat[i - len(id2vocab_out)]
        for i in relative_alignments
    ]

    return " | ".join([backbone_data_str, support_data_str, target_data_str])
Example #11
def collect_split_stats(data_path):

    doc_paths = [
        os.path.join(data_path, file) for file in os.listdir(data_path)
    ]

    num_docs = len(doc_paths)

    num_highlights = list()
    num_inputs = list()
    num_input_tokens = list()
    num_highlight_tokens = list()

    doc_len_tokens = list()
    doc_len_tokens_trunc = list()
    ref_len_tokens = list()

    num_ref_trunc75_tokens = list()
    num_ref_trunc250_tokens = list()
    num_ref_truncNA_tokens = list()
    num_ref_trunc75_sents = list()
    num_ref_trunc250_sents = list()
    num_ref_truncNA_sents = list()

    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()
        doc = read_document(doc_path)
        num_highlights.append(len(doc["highlights"]))
        num_inputs.append(len(doc["sentences"]))

        doc_i_len_tokens = 0
        doc_i_len_tokens_trunc = 0

        for s, sent in enumerate(doc["sentences"]):
            tokens = replace_entities(sent["tokens"], doc["entities"])
            num_input_tokens.append(len(tokens))
            doc_i_len_tokens += len(tokens)
            if s < 25:
                doc_i_len_tokens_trunc += len(tokens)

        doc_len_tokens.append(doc_i_len_tokens)
        doc_len_tokens_trunc.append(doc_i_len_tokens_trunc)

        ref_i_len_tokens = 0
        hl_tokens = list()
        hl_tokens_flat = list()
        for highlight in doc["highlights"]:
            tokens = replace_entities(highlight["tokens"], doc["entities"])
            num_highlight_tokens.append(len(tokens))
            hl_tokens.append(tokens)
            hl_tokens_flat.extend(tokens)
            ref_i_len_tokens += len(tokens)

        ref_len_tokens.append(ref_i_len_tokens)

        ref_text = "\n".join([" ".join(tokens) for tokens in hl_tokens])
        ref_text_flat = " ".join(hl_tokens_flat)

        ref_trunc75 = ref_text[:75]
        ref_trunc75_flat = ref_text_flat[:75]
        num_ref_trunc75_tokens.append(len(ref_trunc75_flat.split()))
        num_ref_trunc75_sents.append(len(ref_trunc75.split("\n")))

        ref_trunc250 = ref_text[:250]
        ref_trunc250_flat = ref_text_flat[:250]
        num_ref_trunc250_tokens.append(len(ref_trunc250_flat.split()))
        num_ref_trunc250_sents.append(len(ref_trunc250.split("\n")))

        ref_truncNA = ref_text
        ref_truncNA_flat = ref_text_flat
        num_ref_truncNA_tokens.append(len(ref_truncNA_flat.split()))
        num_ref_truncNA_sents.append(len(ref_truncNA.split("\n")))

    sys.stdout.write("\n")
    sys.stdout.flush()

    percentiles = [20, 30, 40, 50, 60, 70, 80, 90, 95, 99]

    def make_data_row(data):

        row_data = [np.mean(data), np.median(data), np.std(data), np.max(data)]
        row_data.extend(np.percentile(data, percentiles))
        return row_data

    df_data = list()
    df_data.append(make_data_row(num_inputs))
    df_data.append(make_data_row(doc_len_tokens))
    df_data.append(make_data_row(doc_len_tokens_trunc))
    df_data.append(make_data_row(num_input_tokens))

    df_data.append(make_data_row(num_highlights))
    df_data.append(make_data_row(ref_len_tokens))
    df_data.append(make_data_row(num_highlight_tokens))

    df_data.append(make_data_row(num_ref_trunc75_sents))
    df_data.append(make_data_row(num_ref_trunc75_tokens))
    df_data.append(make_data_row(num_ref_trunc250_sents))
    df_data.append(make_data_row(num_ref_trunc250_tokens))
    df_data.append(make_data_row(num_ref_truncNA_sents))
    df_data.append(make_data_row(num_ref_truncNA_tokens))


    columns = pd.MultiIndex.from_tuples(
        [("", "mean"), ("", "median"), ("", "std"), ("", "max")] + \
        [("Percentile", "{}th".format(p)) for p in percentiles])

    index = [
        "inp. len. (sents.)", "inp. len. (tok.)",
        "inp. len. trunc25sent (tok.)", "inp. sent. len. (toks.)",
        "hl. len. (sents.)", "hl. len. (tok.)", "hl. sent. len. (toks.)",
        "ref[:75] len. (sents.)", "ref[:75] len. (tok.)",
        "ref[:250] len. (sents.)", "ref[:250] len. (tok.)",
        "ref[:+inf] len. (sents.)", "ref[:+inf] len. (tok.)"
    ]

    df = pd.DataFrame(df_data, columns=columns, index=index)
    df_str_lines = str(df).split("\n")

    print("\n".join(df_str_lines[:2]) + "\n")
    print("\n".join(df_str_lines[2:6]) + "\n")
    print("\n".join(df_str_lines[6:9]) + "\n")
    print("\n".join(df_str_lines[9:11]) + "\n")
    print("\n".join(df_str_lines[11:13]) + "\n")
    print("\n".join(df_str_lines[13:15]) + "\n")