from allennlp.common.testing import AllenNlpTestCase
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter


class TestSimpleWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSimpleWordSplitter, self).setUp()
        self.word_splitter = SimpleWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = ["this", "(", "sentence", ")", "has", "'", "crazy", "'", '"',
                           "punctuation", '"', "."]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_contraction(self):
        sentence = "it ain't joe's problem; would've been yesterday"
        expected_tokens = ["it", "ai", "n't", "joe", "'s", "problem", ";", "would", "'ve",
                           "been", "yesterday"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        sentences = ["This is a sentence",
                     "This isn't a sentence.",
                     "This is the 3rd sentence.",
                     "Here's the 'fourth' sentence."]
        batch_split = self.word_splitter.batch_split_words(sentences)
        separately_split = [self.word_splitter.split_words(sentence) for sentence in sentences]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
                assert batch_word.text == separate_word.text

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        sentence = "mr. and mrs. jones, etc., went to, e.g., the store"
        expected_tokens = ["mr.", "and", "mrs.", "jones", ",", "etc.", ",", "went", "to", ",",
                           "e.g.", ",", "the", "store"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens
with open(in_file, "r") as in_fp:
    for line in tqdm(in_fp.readlines()):
        struct = jsondecoder.decode(line)
        hypothesis = struct["claim"]
        premise_idx = 0
        for sentence in struct["predicted_sentences"]:
            underlined_title = sentence[0]
            label = 0  # placeholder, but must be a valid index
            premise = sentence[3]

            # Prefix the premise sentence with [ TITLE ] (from source article)
            title = underlined_title.replace("_", " ")
            title_words = tokenizer.split_words(title)
            tokenized_title = " ".join([t.text for t in title_words])
            premise = "[ " + tokenized_title + " ] " + premise

            # Truncate overly long premises to at most max_sent_len words
            premise_words = premise.split(" ")
            if len(premise_words) > max_sent_len:
                premise = " ".join(premise_words[:max_sent_len])

            # Index line: claim id, premise index, page title, sentence id
            info = str(struct["id"]) + "\t" + str(premise_idx) + "\t"
            info = info + str(sentence[0]) + "\t" + str(sentence[1])

            premise_fp.write(premise + "\n")
            hypothesis_fp.write(hypothesis + "\n")
            label_fp.write(str(label) + "\n")
            index_fp.write(info + "\n")
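# The loop above relies on names that are set up elsewhere in this script:
# tokenizer, jsondecoder, max_sent_len, and the four output handles premise_fp,
# hypothesis_fp, label_fp, and index_fp. A minimal sketch of that setup is shown
# below; every file name and the word cap are illustrative assumptions, not the
# repository's actual values.
import json
from contextlib import ExitStack

from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

max_sent_len = 110  # assumed per-premise word cap
jsondecoder = json.JSONDecoder()
tokenizer = SimpleWordSplitter()

with ExitStack() as stack:
    premise_fp = stack.enter_context(open("premises.txt", "w"))       # assumed name
    hypothesis_fp = stack.enter_context(open("hypotheses.txt", "w"))  # assumed name
    label_fp = stack.enter_context(open("labels.txt", "w"))           # assumed name
    index_fp = stack.enter_context(open("index.tsv", "w"))            # assumed name
    # ... the tokenize-and-write loop shown above runs here ...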
import argparse
import json
import os

from tqdm import tqdm
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# from retrieval.fever_doc_db import FeverDocDB

parser = argparse.ArgumentParser()
parser.add_argument("--in_file", type=str, required=True)
parser.add_argument("--out_file", type=str, required=True)
args = parser.parse_args()

in_file = args.in_file
out_file = args.out_file
if os.path.exists(out_file):
    raise ValueError("Output already exists")

jsondecoder = json.JSONDecoder()
jsonencoder = json.JSONEncoder()
tokenizer = SimpleWordSplitter()

print("Tokenizing")
with open(in_file, "r") as in_fp:
    with open(out_file, "w") as out_fp:
        for line in tqdm(in_fp.readlines()):
            struct = jsondecoder.decode(line)
            tok = tokenizer.split_words(struct["claim"])
            tokenized = " ".join([t.text for t in tok])
            struct["claim"] = tokenized
            result = jsonencoder.encode(struct)
            out_fp.write(result + "\n")
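# Quick sanity check of the per-line transformation performed by the script above.
# The input line is invented for illustration, and the printed output is only an
# expectation based on the splitter's contraction handling, not captured output.
import json

from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

tokenizer = SimpleWordSplitter()
line = '{"id": 1, "claim": "This claim wouldn\'t\'ve been tokenized."}'
struct = json.loads(line)
struct["claim"] = " ".join(t.text for t in tokenizer.split_words(struct["claim"]))
print(json.dumps(struct))
# Prints something like:
# {"id": 1, "claim": "This claim would n't 've been tokenized ."}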