Example #1
import sys


def yaml_iter():
    # input_lines, n_data, read_line_document and process are assumed to
    # come from the enclosing module scope; the generator streams one
    # (filename, result) pair per input document while reporting progress.
    for i, line in enumerate(input_lines, 1):
        sys.stdout.write("Aligned {:8.4f}% \r".format(100 * i / n_data))
        sys.stdout.flush()
        doc = read_line_document(line)
        res = process(doc)
        yield (doc.filename, res)
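A plausible driver for this generator, assuming the (filename, result) pairs are meant to be streamed to disk as a multi-document YAML file (which would match the read path in Example #4); the output path here is hypothetical:

import yaml

# Hypothetical sink: dump_all consumes the generator lazily, writing one
# YAML document per (filename, result) pair without holding them in memory.
with open("alignments.yaml", "w") as f:
    yaml.safe_dump_all(yaml_iter(), f)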
Example #2
import sys


def load_documents(documents_path):
    """Read newline-delimited documents; return them in order plus a
    filename -> document index."""
    name2doc = dict()
    docs = list()
    print("Loading documents from {} ...".format(documents_path))
    with open(documents_path, "r") as f:
        for i, line in enumerate(f, 1):
            sys.stdout.write("{:10d} documents read.\r".format(i))
            sys.stdout.flush()
            doc = read_line_document(line)
            name2doc[doc.filename] = doc
            docs.append(doc)
    print("")
    return docs, name2doc
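A minimal usage sketch, assuming a newline-delimited documents file whose lines parse via read_line_document; the path is hypothetical:

docs, name2doc = load_documents("data/documents.txt")
first = docs[0]
assert name2doc[first.filename] is first  # list and index share objects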
Example #3
import os
import re
import sys
from collections import defaultdict


def main():

    import argparse

    hlp = "Compute input vocabulary."

    # The first positional argument of ArgumentParser is prog, so the help
    # text belongs in description.
    parser = argparse.ArgumentParser(description=hlp)
    parser.add_argument("--documents",
                        required=True,
                        help="Path to preprocessed documents.")
    parser.add_argument("--output", required=True, help="Path to write vocab.")
    parser.add_argument("--size",
                        required=True,
                        type=int,
                        help="Number of most frequent vocab words to keep.")
    parser.add_argument(
        "--special",
        nargs="+",
        default=["<E>", "<D>", "<S>", "<B>", "__UNK__", "__ENT__"])

    args = parser.parse_args()

    assert args.size > 0
    counts = defaultdict(int)

    # Create the output directory if it does not exist yet.
    vocab_dir = os.path.dirname(args.output)
    if vocab_dir != "" and not os.path.exists(vocab_dir):
        os.makedirs(vocab_dir)

    with open(args.documents, "r") as f:
        for i, line in enumerate(f, 1):
            sys.stdout.write("\rRead {:7d} documents".format(i))
            sys.stdout.flush()
            doc = read_line_document(line)
            for tokens in doc.highlights:
                for token in tokens:
                    # ents is a module-level collection of entity tags;
                    # entity tokens are excluded from the count, and digits
                    # are collapsed to "D" before counting.
                    if token.ne not in ents:
                        counts[re.sub(r"\d", "D", token.lower())] += 1

    # dict.items() is not sortable in Python 3; sort a list of the
    # (word, count) pairs by descending count instead.
    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    vocab = args.special + [w for w, c in counts[:args.size]]

    with open(args.output, "w") as f:
        f.write("\n".join(vocab))
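One detail worth noting: the re.sub(r"\d", "D", ...) call rewrites every digit as the letter D before counting, so all numerals share a handful of vocabulary entries instead of each distinct number consuming its own slot. A quick illustration:

import re

print(re.sub(r"\d", "D", "won 12 of 1984 games"))
# -> won DD of DDDD games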
Example #4
import os
import sys

import yaml


def main():

    import argparse

    help_msg = ("Format data for sentence generation task using neural "
                "network implemented in Torch/Lua.")
    parser = argparse.ArgumentParser(description=help_msg)
    parser.add_argument('--documents',
                        required=True,
                        help="Path to preprocessed data.")
    parser.add_argument('--alignments',
                        required=True,
                        help="Path to alignments data.")
    parser.add_argument('--output', required=True, help="File to write data.")
    parser.add_argument('--input-vocab',
                        required=True,
                        help="Path to input vocab.")
    parser.add_argument('--output-vocab',
                        required=True,
                        help="Path to output vocab.")
    parser.add_argument('--entity-mode',
                        required=True,
                        choices=["1-tag", "3-tags"])

    args = parser.parse_args()

    output_dir = os.path.dirname(args.output)
    if output_dir != '' and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Reading alignments from {} ...".format(args.alignments))

    name2alignments = dict()
    with open(args.alignments, "r") as f:
        # Each YAML document is a (filename, alignments) pair; safe_load_all
        # avoids the unsafe default loader removed in recent PyYAML releases.
        for i, (filename, alignments) in enumerate(yaml.safe_load_all(f), 1):
            sys.stdout.write("\rRead {:7d} alignments".format(i))
            sys.stdout.flush()
            name2alignments[filename] = alignments
    print("")

    print("Reading input vocab from {} ...".format(args.input_vocab))
    id2vocab_in, vocab2id_in = read_vocab(args.input_vocab)

    print("Reading output vocab from {} ...".format(args.output_vocab))
    id2vocab_out, vocab2id_out = read_vocab(args.output_vocab)

    print("Reading documents from {} ...".format(args.documents))
    print("Writing data to {} ...".format(args.output))

    with open(args.documents, "r") as f, open(args.output, "w") as o:
        for i, line in enumerate(f, 1):
            sys.stdout.write("\rRead {:7d} documents".format(i))
            sys.stdout.flush()
            doc = read_line_document(line)
            # Documents without a matching alignment entry cannot be
            # formatted, so they are skipped with a warning.
            if doc.filename not in name2alignments:
                print("\nSkipping {}, no alignment found.".format(
                    doc.filename))
                continue
            alignments = name2alignments[doc.filename]
            dls = process_example(doc, alignments, vocab2id_in, id2vocab_in,
                                  vocab2id_out, id2vocab_out, args.entity_mode)
            for dl in dls:
                o.write(dl)
    print("")