def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path", type=str, required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path", type=str, required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--serialization-dir", "-s", type=str, required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-size", type=int, required=False, default=10000,
                        help="Maximum size of the vocabulary.")
    parser.add_argument("--vocab-namespace", type=str, required=False, default="vampire",
                        help="Vocabulary namespace used for the output file names.")
    parser.add_argument("--tokenize", action='store_true',
                        help="Whether to tokenize the input.")
    parser.add_argument("--tokenizer-type", type=str, default="just_spaces",
                        help="Tokenizer type: just_spaces | spacy")
    parser.add_argument("--reference-corpus-path", type=str, required=False,
                        help="Path to an (optional) reference corpus.")
    args = parser.parse_args()

    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)

    vocabulary_dir = os.path.join(args.serialization_dir, "vocabulary")
    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    tokenized_train_examples = load_data(args.train_path, args.tokenize, args.tokenizer_type)
    tokenized_dev_examples = load_data(args.dev_path, args.tokenize, args.tokenizer_type)

    print("fitting count vectorizer...")
    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=args.vocab_size,
                                       token_pattern=r'\b[^\d\W]{3,30}\b')
    text = tokenized_train_examples + tokenized_dev_examples
    count_vectorizer.fit(text)

    vectorized_train_examples = count_vectorizer.transform(tokenized_train_examples)
    vectorized_dev_examples = count_vectorizer.transform(tokenized_dev_examples)

    # add @@unknown@@ token vector
    vectorized_train_examples = sparse.hstack(
        (np.array([0] * len(tokenized_train_examples))[:, None], vectorized_train_examples))
    vectorized_dev_examples = sparse.hstack(
        (np.array([0] * len(tokenized_dev_examples))[:, None], vectorized_dev_examples))
    master = sparse.vstack([vectorized_train_examples, vectorized_dev_examples])

    # generate background frequency
    print("generating background frequency...")
    vocab = ["@@UNKNOWN@@"] + count_vectorizer.get_feature_names()
    bgfreq = dict(zip(vocab, np.array(master.sum(0))[0] / master.sum()))

    print("saving data...")
    save_sparse(vectorized_train_examples, os.path.join(args.serialization_dir, "train.npz"))
    save_sparse(vectorized_dev_examples, os.path.join(args.serialization_dir, "dev.npz"))
    write_to_json(bgfreq, os.path.join(args.serialization_dir, f"{args.vocab_namespace}.bgfreq"))
    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir, f"{args.vocab_namespace}.txt"))
    write_list_to_file(['*tags', '*labels', f"{args.vocab_namespace}"],
                       os.path.join(vocabulary_dir, "non_padded_namespaces.txt"))
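# The helpers used above (load_data, save_sparse, write_to_json, write_list_to_file) are
# project-internal and assumed to be imported from the package's utility module; their real
# implementations are not shown in this file. Below is a minimal, self-contained sketch of
# what such helpers could look like -- hypothetical illustrations, not the project's code.
import json

import numpy as np
from scipy import sparse


def save_sparse(matrix, path):
    # Persist a scipy sparse matrix as a compressed .npz file.
    sparse.save_npz(path, sparse.csr_matrix(matrix))


def write_to_json(obj, path):
    # Dump a JSON-serializable object (e.g. the background-frequency dict) to disk.
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)


def write_list_to_file(items, path):
    # Write one vocabulary entry per line, one file per namespace.
    with open(path, "w") as f:
        f.write("\n".join(items) + "\n")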
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path", type=str, required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path", type=str, required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--train-mentions-path", type=str, required=False,
                        help="Path to the train mentions jsonl file.")
    parser.add_argument("--dev-mentions-path", type=str, required=False,
                        help="Path to the dev mentions jsonl file.")
    parser.add_argument("--serialization-dir", "-s", type=str, required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-namespace", type=str, required=False, default="vampire",
                        help="Vocabulary namespace used for the output file names.")
    parser.add_argument("--vocab-size", type=int, required=False, default=10000,
                        help="Vocabulary set size")
    parser.add_argument("--tokenize", action='store_true',
                        help="Whether to tokenize the input")
    parser.add_argument("--tokenizer-type", type=str, default="just_spaces",
                        help="Tokenizer type: just_spaces | spacy")
    # naming convention: if you want a field called "doc text", you should name it "doc_text"
    # parser.add_argument("--token-field-names", type=str, nargs="*", default=["text"],
    #                     help="token field names separated by spaces, e.g. \"doc\", \"entity_mentions\". "
    #                          "Naming convention: if you want a field called \"doc text\", "
    #                          "you should name it \"doc_text\"")
    parser.add_argument("--global-repr", action='store_true',
                        help="Whether to use document-level information")
    parser.add_argument("--unique-sentence", action='store_true',
                        help="Use each sentence in a document only once")
    parser.add_argument("--only-mentions", action='store_true',
                        help="Only use mentions")
    parser.add_argument("--entity_as_doc", "-e", action='store_true',
                        help="Whether to model persona")
    parser.add_argument("--remove-entity-name", action='store_true',
                        help="Whether to remove labeled entity names from the sentences")
    args = parser.parse_args()

    global_repr = args.global_repr
    print("Using document information" if global_repr else "Discarding document information")

    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)
    ser_dir = args.serialization_dir if not args.remove_entity_name else args.serialization_dir + "_namefree"
    if not os.path.exists(ser_dir):
        os.mkdir(ser_dir)
    vocabulary_dir = os.path.join(ser_dir, "vocabulary")
    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    # {token_field_name1: [sentence0-1, sentence1-1, ...],
    #  token_field_name2: [sentence0-2, sentence1-2, ...]}
    train_examples = load_data(data_path=args.train_path,
                               tokenize=args.tokenize,
                               tokenizer_type=args.tokenizer_type,
                               entity_as_doc=args.entity_as_doc,
                               remove_entity_name=args.remove_entity_name)
    dev_examples = load_data(data_path=args.dev_path,
                             tokenize=args.tokenize,
                             tokenizer_type=args.tokenizer_type,
                             entity_as_doc=args.entity_as_doc,
                             remove_entity_name=args.remove_entity_name)

    with open("/home/lzy/proj/neural_persona/examples/movies/vampire_persona_namefree/train.jsonl", "w") as f:
        for datum in train_examples:
            json.dump(datum, f)
            f.write("\n")
    with open("/home/lzy/proj/neural_persona/examples/movies/vampire_persona_namefree/dev.jsonl", "w") as f:
        for datum in dev_examples:
            json.dump(datum, f)
            f.write("\n")
    # NOTE: this early return exits before the vectorization below is run.
    return

    print("fitting count vectorizer...")
    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=args.vocab_size,
                                       token_pattern=r'\b[^\d\W]{3,30}\b')
    text = create_text(train_examples, args.unique_sentence, args.only_mentions, args.entity_as_doc) + \
           create_text(dev_examples, args.unique_sentence, args.only_mentions, args.entity_as_doc)
    # master is simply the vectorized document corpus (no duplicate documents)
    master = count_vectorizer.fit_transform(text)

    if args.entity_as_doc:
        vectorized_train_examples = [
            {"docid": example["docid"],
             "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                    count_vectorizer.transform(example["text"]))).tocsc()}
            for example in train_examples]
        vectorized_dev_examples = [
            {"docid": example["docid"],
             "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                    count_vectorizer.transform(example["text"]))).tocsc()}
            for example in dev_examples]
    else:
        vectorized_train_examples = [
            {"docid": example["docid"],
             "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                    count_vectorizer.transform(example["text"]))).tocsc(),
             "entities": example["entities"]}
            for example in train_examples]
        vectorized_dev_examples = [
            {"docid": example["docid"],
             "text": sparse.hstack((np.array([0] * len(example["text"]))[:, None],
                                    count_vectorizer.transform(example["text"]))).tocsc(),
             "entities": example["entities"]}
            for example in dev_examples]

    # add @@unknown@@ token vector for both doc and entity representations
    # this decision is for code simplicity
    if args.entity_as_doc:
        vectorized_train_examples = [
            {"docid": example["docid"],
             "text": np.asarray(example["text"].sum(0)).squeeze(0)}
            for example in vectorized_train_examples]
        vectorized_dev_examples = [
            {"docid": example["docid"],
             "text": np.asarray(example["text"].sum(0)).squeeze(0)}
            for example in vectorized_dev_examples]
    else:
        vectorized_train_examples = [
            {"docid": example["docid"],
             "text": np.asarray(example["text"].sum(0)).squeeze(0),
             "entities": [{"label": entity["name"],
                           "text": example["text"][entity["mentions"]]}
                          for entity in example["entities"]]}
            for example in vectorized_train_examples]
        vectorized_dev_examples = [
            {"docid": example["docid"],
             "text": np.asarray(example["text"].sum(0)).squeeze(0),
             "entities": [{"label": entity["name"],
                           "text": example["text"][entity["mentions"]]}
                          for entity in example["entities"]]}
            for example in vectorized_dev_examples]

    # vectorized_train_examples = extract_entity_from_doc_as_doc(vectorized_train_examples)
    # vectorized_dev_examples = extract_entity_from_doc_as_doc(vectorized_dev_examples)
    # vectorized_train_examples = extract_context_from_doc_as_doc(vectorized_train_examples)
    # vectorized_dev_examples = extract_context_from_doc_as_doc(vectorized_dev_examples)

    # add @@unknown@@ token vector
    master = sparse.hstack((np.array([0] * master.shape[0])[:, None], master))
    vocab = ["@@UNKNOWN@@"] + count_vectorizer.get_feature_names()

    # generate background frequency
    print("generating background frequency...")
    # bgfreq = dict(zip(count_vectorizer.get_feature_names(), master.toarray().sum(1) / args.vocab_size))
    bgfreq = dict(zip(vocab, np.array(master.sum(0))[0] / master.sum()))

    print("saving data...")
    pickle.dump(vectorized_train_examples, open(os.path.join(ser_dir, "train.pk"), "wb"))
    pickle.dump(vectorized_dev_examples, open(os.path.join(ser_dir, "dev.pk"), "wb"))
    # np.save(os.path.join(ser_dir, "train.pk"), vectorized_train_examples)
    # np.save(os.path.join(ser_dir, "dev.pk"), vectorized_dev_examples)
    write_to_json(bgfreq, os.path.join(ser_dir, f"{args.vocab_namespace}.bgfreq"))
    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir, f"{args.vocab_namespace}.txt"))
    write_list_to_file(['*tags', '*labels', args.vocab_namespace],
                       os.path.join(vocabulary_dir, "non_padded_namespaces.txt"))
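# A hedged usage sketch for the main() above. The script name `preprocess_persona.py` and the
# example paths are hypothetical; the flags match the argparse definitions in this function.
#
#   python preprocess_persona.py \
#       --train-path examples/movies/train.jsonl \
#       --dev-path examples/movies/dev.jsonl \
#       --serialization-dir examples/movies/preprocessed \
#       --tokenize --tokenizer-type spacy --entity_as_doc --remove-entity-name
#
# The pickled output written by main() can be read back as shown below (load_preprocessed is
# an illustrative helper, not part of the project).
import os
import pickle


def load_preprocessed(ser_dir):
    # Read back the pickled train/dev examples produced by main().
    with open(os.path.join(ser_dir, "train.pk"), "rb") as f:
        train = pickle.load(f)
    with open(os.path.join(ser_dir, "dev.pk"), "rb") as f:
        dev = pickle.load(f)
    return train, dev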
def preprocess_data(train_infile, test_infile, output_dir, train_prefix, test_prefix,
                    vocab_size=None, stopwords=None, keep_num=False, keep_alphanum=False,
                    strip_html=False, lower=True, min_length=3, token_field_name='text'):
    if stopwords == 'mallet':
        print("Using Mallet stopwords")
        stopword_list = read_text(os.path.join('neural_persona', 'common', 'stopwords', 'mallet_stopwords.txt'))
    elif stopwords == 'snowball':
        print("Using snowball stopwords")
        stopword_list = read_text(os.path.join('neural_persona', 'common', 'stopwords', 'snowball_stopwords.txt'))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = read_text(os.path.join('neural_persona', 'common', 'stopwords', stopwords + '_stopwords.txt'))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = read_jsonlist(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = read_jsonlist(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    all_items = train_items + test_items
    all_texts = set([item[token_field_name] for item in all_items])
    n_items = len(all_texts)

    # make vocabulary
    train_parsed = []
    test_parsed = []
    print("Parsing %d documents" % n_items)
    word_counts = Counter()
    doc_counts = Counter()
    count = 0
    vocab = None
    for i, text in tqdm(enumerate(all_texts), total=n_items):
        if i % 1000 == 0 and count > 0:
            print(i)
        tokens, _ = tokenize(text, strip_html=strip_html, lower=lower,
                             keep_numbers=keep_num, keep_alphanum=keep_alphanum,
                             min_length=min_length, stopwords=stopword_set, vocab=vocab)
        # store the parsed documents
        if i < n_train:
            train_parsed.append(tokens)
        else:
            test_parsed.append(tokens)
        # keep track of the number of documents with each word
        word_counts.update(tokens)
        doc_counts.update(set(tokens))

    print("Size of full vocabulary=%d" % len(word_counts))

    print("Selecting the vocabulary")
    most_common = word_counts.most_common(n=vocab_size)
    words, word_counts = zip(*most_common)
    vocab = list(words)
    print("Vocab size after filtering = %d" % len(vocab))

    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]
    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", ' '.join(vocab[:10]))
    vocab.sort()

    write_to_json(vocab, os.path.join(output_dir, 'ref.vocab.json'))
    process_subset(train_items, train_parsed, vocab, output_dir, train_prefix)
    if n_test > 0:
        process_subset(test_items, test_parsed, vocab, output_dir, test_prefix)
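# A hedged usage example for preprocess_data(). The paths and prefixes are illustrative only,
# and the helpers it relies on (read_text, read_jsonlist, tokenize, process_subset,
# write_to_json) are assumed to be importable from neural_persona's common utilities.
def build_reference_corpus_example():
    # Build a 10k-word reference vocabulary from train/dev jsonl files (hypothetical paths).
    preprocess_data(train_infile="examples/movies/train.jsonl",
                    test_infile="examples/movies/dev.jsonl",
                    output_dir="examples/movies/reference",
                    train_prefix="train",
                    test_prefix="dev",
                    vocab_size=10000,
                    stopwords="snowball")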
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path", type=str, required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path", type=str, required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--train-mentions-path", type=str, required=False,
                        help="Path to the train mentions jsonl file.")
    parser.add_argument("--dev-mentions-path", type=str, required=False,
                        help="Path to the dev mentions jsonl file.")
    parser.add_argument("--serialization-dir", "-s", type=str, required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-namespace", type=str, required=False, default="vampire",
                        help="Vocabulary namespace used for the output file names.")
    parser.add_argument("--vocab-size", type=int, required=False, default=10000,
                        help="Vocabulary set size")
    parser.add_argument("--tokenize", action='store_true',
                        help="Whether to tokenize the input")
    parser.add_argument("--tokenizer-type", type=str, default="just_spaces",
                        help="Tokenizer type: just_spaces | spacy")
    # naming convention: if you want a field called "doc text", you should name it "doc_text"
    # parser.add_argument("--token-field-names", type=str, nargs="*", default=["text"],
    #                     help="token field names separated by spaces, e.g. \"doc\", \"entity_mentions\". "
    #                          "Naming convention: if you want a field called \"doc text\", "
    #                          "you should name it \"doc_text\"")
    parser.add_argument("--global-repr", action='store_true',
                        help="Whether to use document-level information")
    args = parser.parse_args()

    global_repr = args.global_repr
    print("Using document information" if global_repr else "Discarding document information")

    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)
    vocabulary_dir = os.path.join(args.serialization_dir, "vocabulary")
    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    # {token_field_name1: [sentence0-1, sentence1-1, ...],
    #  token_field_name2: [sentence0-2, sentence1-2, ...]}
    token_field_names = ["entity_text", "doc_text"]
    named_tokenized_train_examples = load_data(args.train_path, args.tokenize,
                                               args.tokenizer_type, token_field_names)
    named_tokenized_dev_examples = load_data(args.dev_path, args.tokenize,
                                             args.tokenizer_type, token_field_names)

    print("fitting count vectorizer...")
    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=args.vocab_size,
                                       token_pattern=r'\b[^\d\W]{3,30}\b')
    if global_repr:
        text = list(set(named_tokenized_train_examples["doc_text"])) + \
               list(set(named_tokenized_dev_examples["doc_text"]))
    else:
        train_mentions = json.load(open(args.train_mentions_path))
        dev_mentions = json.load(open(args.dev_mentions_path))
        text = train_mentions + dev_mentions

    # master is simply the vectorized document corpus (no duplicate documents)
    master = count_vectorizer.fit_transform(text)
    named_vectorized_train_examples = {
        token_field_name: count_vectorizer.transform(named_tokenized_train_examples[token_field_name])
        for token_field_name in token_field_names
    }
    named_vectorized_dev_examples = {
        token_field_name: count_vectorizer.transform(named_tokenized_dev_examples[token_field_name])
        for token_field_name in token_field_names
    }

    # add @@unknown@@ token vector for both doc and entity representations
    # this decision is for code simplicity
    for token_field_name in token_field_names:
        named_vectorized_train_examples[token_field_name] = sparse.hstack(
            (np.array([0] * len(named_tokenized_train_examples[token_field_name]))[:, None],
             named_vectorized_train_examples[token_field_name]))
        named_vectorized_dev_examples[token_field_name] = sparse.hstack(
            (np.array([0] * len(named_tokenized_dev_examples[token_field_name]))[:, None],
             named_vectorized_dev_examples[token_field_name]))

    # add @@unknown@@ token vector
    master = sparse.hstack((np.array([0] * len(text))[:, None], master))
    vocab = ["@@UNKNOWN@@"] + count_vectorizer.get_feature_names()

    # generate background frequency
    print("generating background frequency...")
    # bgfreq = dict(zip(count_vectorizer.get_feature_names(), master.toarray().sum(1) / args.vocab_size))
    bgfreq = dict(zip(vocab, np.array(master.sum(0))[0] / master.sum()))

    print("saving data...")
    np.savez(os.path.join(args.serialization_dir, "train.npz"), **named_vectorized_train_examples)
    np.savez(os.path.join(args.serialization_dir, "dev.npz"), **named_vectorized_dev_examples)
    # save_named_sparse(named_vectorized_train_examples, os.path.join(args.serialization_dir, "train.npz"))
    # save_named_sparse(named_vectorized_dev_examples, os.path.join(args.serialization_dir, "dev.npz"))
    write_to_json(bgfreq, os.path.join(args.serialization_dir, f"{args.vocab_namespace}.bgfreq"))
    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir, f"{args.vocab_namespace}.txt"))
    write_list_to_file(['*tags', '*labels', args.vocab_namespace],
                       os.path.join(vocabulary_dir, "non_padded_namespaces.txt"))
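# Reading the .npz archives written above: np.savez wraps each scipy sparse matrix in a
# 0-d object array, so loading generally requires allow_pickle=True and unwrapping with
# .item(). This is a sketch under that assumption; load_named_vectors is an illustrative
# helper (not part of the project), and the default field names mirror token_field_names.
import numpy as np


def load_named_vectors(path, field_names=("entity_text", "doc_text")):
    # Return {field_name: sparse matrix} for the fields stored in the archive.
    archive = np.load(path, allow_pickle=True)
    return {name: archive[name].item() for name in field_names}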