def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path",
                        type=str,
                        required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path",
                        type=str,
                        required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--serialization-dir",
                        "-s",
                        type=str,
                        required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-size",
                        type=int,
                        required=False,
                        default=10000,
                        help="Maximum vocabulary size.")
    parser.add_argument("--vocab-namespace",
                        type=str,
                        required=False,
                        default="vampire",
                        help="Namespace to use for the vocabulary files.")
    parser.add_argument("--tokenize",
                        action='store_true',
                        help="Whether to tokenize the input.")
    parser.add_argument("--tokenizer-type",
                        type=str,
                        default="just_spaces",
                        help="Tokenizer type: just_spaces | spacy")
    parser.add_argument("--reference-corpus-path",
                        type=str,
                        required=False,
                        help="Path to an optional reference corpus.")
    args = parser.parse_args()

    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)

    vocabulary_dir = os.path.join(args.serialization_dir, "vocabulary")

    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    tokenized_train_examples = load_data(args.train_path, args.tokenize,
                                         args.tokenizer_type)
    tokenized_dev_examples = load_data(args.dev_path, args.tokenize,
                                       args.tokenizer_type)

    print("fitting count vectorizer...")

    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=args.vocab_size,
                                       token_pattern=r'\b[^\d\W]{3,30}\b')
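    # the token_pattern above restricts the vocabulary to tokens of 3-30
    # non-digit word characters (roughly, alphabetic tokens)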

    text = tokenized_train_examples + tokenized_dev_examples

    count_vectorizer.fit(text)

    vectorized_train_examples = count_vectorizer.transform(
        tokenized_train_examples)
    vectorized_dev_examples = count_vectorizer.transform(
        tokenized_dev_examples)

    # add @@unknown@@ token vector
    vectorized_train_examples = sparse.hstack(
        (np.array([0] * len(tokenized_train_examples))[:, None],
         vectorized_train_examples))
    vectorized_dev_examples = sparse.hstack(
        (np.array([0] * len(tokenized_dev_examples))[:, None],
         vectorized_dev_examples))
    master = sparse.vstack(
        [vectorized_train_examples, vectorized_dev_examples])

    # generate background frequency
    print("generating background frequency...")
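    # bgfreq maps every vocabulary entry (including @@UNKNOWN@@) to its
    # relative frequency over the combined train and dev counts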
    vocab = ["@@UNKNOWN@@"] + count_vectorizer.get_feature_names()
    bgfreq = dict(zip(vocab, np.array(master.sum(0))[0] / master.sum()))

    print("saving data...")
    save_sparse(vectorized_train_examples,
                os.path.join(args.serialization_dir, "train.npz"))
    save_sparse(vectorized_dev_examples,
                os.path.join(args.serialization_dir, "dev.npz"))

    write_to_json(
        bgfreq,
        os.path.join(args.serialization_dir, f"{args.vocab_namespace}.bgfreq"))

    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir,
                                    f"{args.vocab_namespace}.txt"))
    write_list_to_file(['*tags', '*labels', f"{args.vocab_namespace}"],
                       os.path.join(vocabulary_dir,
                                    "non_padded_namespaces.txt"))
Example #2
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path", type=str, required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path", type=str, required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--train-mentions-path", type=str, required=False,
                        help="Path to the train mentions jsonl file.")
    parser.add_argument("--dev-mentions-path", type=str, required=False,
                        help="Path to the dev mentions jsonl file.")
    parser.add_argument("--serialization-dir", "-s", type=str, required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-namespace", type=str, required=False, default="vampire",
                        help="Namespace to use for the vocabulary files.")
    parser.add_argument("--vocab-size", type=int, required=False, default=10000,
                        help="Vocabulary set size")
    parser.add_argument("--tokenize", action='store_true',
                        help="Whether to tokenize the input")
    parser.add_argument("--tokenizer-type", type=str, default="just_spaces",
                        help="Tokenizer type: just_spaces | spacy")
    # naming convention: if you want a field called "doc text", you should name it "doc_text"
    # parser.add_argument("--token-field-names", type=str, nargs="*", default=["text"],
    #                     help="token field names separable by space like, \"doc\", \"entity_mentions\". "
    #                          "Naming Convention: if you want a field called \"doc text\","
    #                          " you should name it \"doc_text\"")
    parser.add_argument("--global-repr", action='store_true',
                        help="Whether to use document-level information")
    parser.add_argument("--unique-sentence", action='store_true',
                        help="Use each sentence in a document only once")
    parser.add_argument("--only-mentions", action='store_true',
                        help="Only use mentions")
    parser.add_argument("--entity_as_doc", "-e", action='store_true',
                        help="Whether to model persona")
    parser.add_argument("--remove-entity-name", action='store_true',
                        help="Whether to remove labeled entity name from the sentences")

    args = parser.parse_args()

    global_repr = args.global_repr
    print("Using document information" if global_repr else "Discarding document information")
    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)

    ser_dir = args.serialization_dir if not args.remove_entity_name else args.serialization_dir + "_namefree"
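    # the "_namefree" suffix keeps output with entity names removed separate
    # from the default output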
    if not os.path.exists(ser_dir):
        os.mkdir(ser_dir)
    vocabulary_dir = os.path.join(ser_dir, "vocabulary")
    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    # {token_field_name1: [sentence0_field1, sentence1_field1, ...],
    #  token_field_name2: [sentence0_field2, sentence1_field2, ...]}
    train_examples = load_data(data_path=args.train_path, tokenize=args.tokenize, tokenizer_type=args.tokenizer_type,
                               entity_as_doc=args.entity_as_doc, remove_entity_name=args.remove_entity_name)
    dev_examples = load_data(data_path=args.dev_path, tokenize=args.tokenize, tokenizer_type=args.tokenizer_type,
                             entity_as_doc=args.entity_as_doc, remove_entity_name=args.remove_entity_name)

    # debug-only dump of the tokenized examples to a hard-coded local path;
    # re-enabling the bare `return` below would skip all of the preprocessing
    # that follows
    # with open(f"/home/lzy/proj/neural_persona/examples/movies/vampire_persona_namefree/train.jsonl", "w") as f:
    #     for datum in train_examples:
    #         json.dump(datum, f)
    #         f.write("\n")
    # with open(f"/home/lzy/proj/neural_persona/examples/movies/vampire_persona_namefree/dev.jsonl", "w") as f:
    #     for datum in dev_examples:
    #         json.dump(datum, f)
    #         f.write("\n")
    # return

    print("fitting count vectorizer...")
    count_vectorizer = CountVectorizer(stop_words='english', max_features=args.vocab_size,
                                       token_pattern=r'\b[^\d\W]{3,30}\b')

    text = create_text(train_examples, args.unique_sentence, args.only_mentions, args.entity_as_doc) + \
           create_text(dev_examples, args.unique_sentence, args.only_mentions, args.entity_as_doc)

    # master is simply the vectorized document corpus (no duplicate documents)
    master = count_vectorizer.fit_transform(text)
    if args.entity_as_doc:
        vectorized_train_examples = [{"docid": example["docid"],
                                      "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                                             count_vectorizer.transform(example["text"]))).tocsc()}
                                     for example in train_examples]
        vectorized_dev_examples = [{"docid": example["docid"],
                                    "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                                           count_vectorizer.transform(example["text"]))).tocsc()}
                                   for example in dev_examples]
    else:
        vectorized_train_examples = [{"docid": example["docid"],
                                      "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                                             count_vectorizer.transform(example["text"]))).tocsc(),
                                      "entities": example["entities"],
                                      }
                                     for example in train_examples]
        vectorized_dev_examples = [{"docid": example["docid"],
                                    "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                                           count_vectorizer.transform(example["text"]))).tocsc(),
                                    "entities": example["entities"]}
                                   for example in dev_examples]

    # add @@unknown@@ token vector for both doc and entity representation
    # this decision is for code simplicity
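    # below, each document's sentence-term matrix is collapsed into a single
    # bag-of-words vector; when entities are kept, each entity also retains the
    # sentence rows indexed by its mention positions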
    if args.entity_as_doc:
        vectorized_train_examples = [{"docid": example["docid"],
                                      "text": np.asarray(example["text"].sum(0)).squeeze(0)}
                                     for example in vectorized_train_examples]
        vectorized_dev_examples = [{"docid": example["docid"],
                                    "text": np.asarray(example["text"].sum(0)).squeeze(0)}
                                   for example in vectorized_dev_examples]
    else:
        vectorized_train_examples = [{"docid": example["docid"],
                                      "text": np.asarray(example["text"].sum(0)).squeeze(0),
                                      "entities": [{"label": entity["name"],
                                                    "text": example["text"][entity["mentions"]]}
                                                   for entity in example["entities"]]}
                                     for example in vectorized_train_examples]
        vectorized_dev_examples = [{"docid": example["docid"],
                                    "text": np.asarray(example["text"].sum(0)).squeeze(0),
                                    "entities": [{"label": entity["name"],
                                                  "text": example["text"][entity["mentions"]]}
                                                 for entity in example["entities"]]}
                                   for example in vectorized_dev_examples]

    # vectorized_train_examples = extract_entity_from_doc_as_doc(vectorized_train_examples)
    # vectorized_dev_examples = extract_entity_from_doc_as_doc(vectorized_dev_examples)
    # vectorized_train_examples = extract_context_from_doc_as_doc(vectorized_train_examples)
    # vectorized_dev_examples = extract_context_from_doc_as_doc(vectorized_dev_examples)

    # add @@unknown@@ token vector
    master = sparse.hstack((np.array([0] * master.shape[0])[:, None], master))

    vocab = ["@@UNKNOWN@@"] + count_vectorizer.get_feature_names()
    # generate background frequency
    print("generating background frequency...")
    # bgfreq = dict(zip(count_vectorizer.get_feature_names(), master.toarray().sum(1) / args.vocab_size))
    bgfreq = dict(zip(vocab, np.array(master.sum(0))[0] / master.sum()))

    print("saving data...")
    pickle.dump(vectorized_train_examples, open(os.path.join(ser_dir, "train.pk"), "wb"))
    pickle.dump(vectorized_dev_examples, open(os.path.join(ser_dir, "dev.pk"), "wb"))
    # np.save(os.path.join(ser_dir, "train.pk"), vectorized_train_examples)
    # np.save(os.path.join(ser_dir, "dev.pk"), vectorized_dev_examples)

    write_to_json(bgfreq, os.path.join(ser_dir, f"{args.vocab_namespace}.bgfreq"))
    
    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir, f"{args.vocab_namespace}.txt"))
    write_list_to_file(['*tags', '*labels', args.vocab_namespace],
                       os.path.join(vocabulary_dir, "non_padded_namespaces.txt"))
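
The pickled files written above can be read back with the standard library; a minimal check, with a hypothetical serialization directory:

import os
import pickle

ser_dir = "examples/movies/vampire_persona"  # hypothetical --serialization-dir value
with open(os.path.join(ser_dir, "train.pk"), "rb") as f:
    train_examples = pickle.load(f)
first = train_examples[0]
# each entry holds a document id and a dense bag-of-words vector
print(first["docid"], first["text"].shape)
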
def preprocess_data(train_infile,
                    test_infile,
                    output_dir,
                    train_prefix,
                    test_prefix,
                    vocab_size=None,
                    stopwords=None,
                    keep_num=False,
                    keep_alphanum=False,
                    strip_html=False,
                    lower=True,
                    min_length=3,
                    token_field_name='text'):

    if stopwords == 'mallet':
        print("Using Mallet stopwords")
        stopword_list = read_text(
            os.path.join('neural_persona', 'common', 'stopwords',
                         'mallet_stopwords.txt'))
    elif stopwords == 'snowball':
        print("Using snowball stopwords")
        stopword_list = read_text(
            os.path.join('neural_persona', 'common', 'stopwords',
                         'snowball_stopwords.txt'))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = read_text(
            os.path.join('neural_persona', 'common', 'stopwords',
                         stopwords + '_stopwords.txt'))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = read_jsonlist(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = read_jsonlist(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    all_items = train_items + test_items
    # keep a list (not a set) so the index-based train/test split below stays aligned
    all_texts = [item[token_field_name] for item in all_items]
    n_items = len(all_texts)

    # make vocabulary
    train_parsed = []
    test_parsed = []

    print("Parsing %d documents" % n_items)
    word_counts = Counter()
    doc_counts = Counter()
    count = 0

    vocab = None
    for i, text in tqdm(enumerate(all_texts), total=n_items):
        if i % 1000 == 0 and count > 0:
            print(i)
        tokens, _ = tokenize(text,
                             strip_html=strip_html,
                             lower=lower,
                             keep_numbers=keep_num,
                             keep_alphanum=keep_alphanum,
                             min_length=min_length,
                             stopwords=stopword_set,
                             vocab=vocab)

        # store the parsed documents
        if i < n_train:
            train_parsed.append(tokens)
        else:
            test_parsed.append(tokens)

        # keep track of the number of documents with each word
        word_counts.update(tokens)
        doc_counts.update(set(tokens))

    print("Size of full vocabulary=%d" % len(word_counts))

    print("Selecting the vocabulary")
    most_common = word_counts.most_common(n=vocab_size)
    words, word_counts = zip(*most_common)
    vocab = list(words)
    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", ' '.join(vocab[:10]))
    vocab.sort()

    write_to_json(vocab, os.path.join(output_dir, 'ref.vocab.json'))

    process_subset(train_items, train_parsed, vocab, output_dir, train_prefix)
    if n_test > 0:
        process_subset(test_items, test_parsed, vocab, output_dir, test_prefix)
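
The vocabulary selection above boils down to Counter.most_common over the parsed tokens, followed by a sort; a toy, self-contained illustration of that step (the token lists are made up for the demo):

from collections import Counter

parsed_docs = [["films", "topic", "films"],
               ["films", "topic", "persona"]]
word_counts = Counter()
for tokens in parsed_docs:
    word_counts.update(tokens)

vocab_size = 2
most_common = word_counts.most_common(n=vocab_size)
vocab = sorted(word for word, _ in most_common)
print(vocab)  # ['films', 'topic'], the two most frequent tokens, sorted
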
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path",
                        type=str,
                        required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path",
                        type=str,
                        required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--train-mentions-path",
                        type=str,
                        required=False,
                        help="Path to the train mentions jsonl file.")
    parser.add_argument("--dev-mentions-path",
                        type=str,
                        required=False,
                        help="Path to the dev mentions jsonl file.")
    parser.add_argument("--serialization-dir",
                        "-s",
                        type=str,
                        required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-namespace",
                        type=str,
                        required=False,
                        default="vampire",
                        help="Namespace to use for the vocabulary files.")
    parser.add_argument("--vocab-size",
                        type=int,
                        required=False,
                        default=10000,
                        help="Vocabulary set size")
    parser.add_argument("--tokenize",
                        action='store_true',
                        help="Whether to tokenize the input")
    parser.add_argument("--tokenizer-type",
                        type=str,
                        default="just_spaces",
                        help="Tokenizer type: just_spaces | spacy")
    # naming convention: if you want a field called "doc text", you should name it "doc_text"
    # parser.add_argument("--token-field-names", type=str, nargs="*", default=["text"],
    #                     help="token field names separable by space like, \"doc\", \"entity_mentions\". "
    #                          "Naming Convention: if you want a field called \"doc text\","
    #                          " you should name it \"doc_text\"")
    parser.add_argument("--global-repr",
                        action='store_true',
                        help="Whether to use document-level information")

    args = parser.parse_args()

    global_repr = args.global_repr
    print("Using document information"
          if global_repr else "Discarding document information")
    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)

    vocabulary_dir = os.path.join(args.serialization_dir, "vocabulary")

    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    # {token_field_name1: [sentence0_field1, sentence1_field1, ...],
    #  token_field_name2: [sentence0_field2, sentence1_field2, ...]}
    token_field_names = ["entity_text", "doc_text"]
    named_tokenized_train_examples = load_data(args.train_path, args.tokenize,
                                               args.tokenizer_type,
                                               token_field_names)
    named_tokenized_dev_examples = load_data(args.dev_path, args.tokenize,
                                             args.tokenizer_type,
                                             token_field_names)

    print("fitting count vectorizer...")
    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=args.vocab_size,
                                       token_pattern=r'\b[^\d\W]{3,30}\b')
    if global_repr:
        text = list(set(named_tokenized_train_examples["doc_text"])) + list(
            set(named_tokenized_dev_examples["doc_text"]))
    else:
        train_mentions = json.load(open(args.train_mentions_path))
        dev_mentions = json.load(open(args.dev_mentions_path))
        text = train_mentions + dev_mentions

    # master is simply the vectorized document corpus (no duplicate documents)
    master = count_vectorizer.fit_transform(text)

    named_vectorized_train_examples = {
        token_field_name: count_vectorizer.transform(
            named_tokenized_train_examples[token_field_name])
        for token_field_name in token_field_names
    }
    named_vectorized_dev_examples = {
        token_field_name: count_vectorizer.transform(
            named_tokenized_dev_examples[token_field_name])
        for token_field_name in token_field_names
    }
    # add @@unknown@@ token vector for both doc and entity representation
    # this decision is for code simplicity
    for token_field_name in token_field_names:
        named_vectorized_train_examples[token_field_name] = sparse.hstack(
            (np.array(
                [0] *
                len(named_tokenized_train_examples[token_field_name]))[:,
                                                                       None],
             named_vectorized_train_examples[token_field_name]))
        named_vectorized_dev_examples[token_field_name] = sparse.hstack(
            (np.array(
                [0] *
                len(named_tokenized_dev_examples[token_field_name]))[:, None],
             named_vectorized_dev_examples[token_field_name]))
    # add @@unknown@@ token vector
    master = sparse.hstack((np.array([0] * len(text))[:, None], master))

    vocab = ["@@UNKNOWN@@"] + count_vectorizer.get_feature_names()
    # generate background frequency
    print("generating background frequency...")
    # bgfreq = dict(zip(count_vectorizer.get_feature_names(), master.toarray().sum(1) / args.vocab_size))
    bgfreq = dict(zip(vocab, np.array(master.sum(0))[0] / master.sum()))

    print("saving data...")
    np.savez(os.path.join(args.serialization_dir, "train.npz"),
             **named_vectorized_train_examples)
    np.savez(os.path.join(args.serialization_dir, "dev.npz"),
             **named_vectorized_dev_examples)
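    # note: np.savez stores these scipy sparse matrices as pickled object
    # arrays, so reading them back will likely require
    # np.load(..., allow_pickle=True)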
    # save_named_sparse(named_vectorized_train_examples, os.path.join(args.serialization_dir, "train.npz"))
    # save_named_sparse(named_vectorized_dev_examples, os.path.join(args.serialization_dir, "dev.npz"))

    write_to_json(
        bgfreq,
        os.path.join(args.serialization_dir, f"{args.vocab_namespace}.bgfreq"))

    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir,
                                    f"{args.vocab_namespace}.txt"))
    write_list_to_file(['*tags', '*labels', args.vocab_namespace],
                       os.path.join(vocabulary_dir,
                                    "non_padded_namespaces.txt"))