Code example #1
import argparse
import os
from itertools import zip_longest

import ujson
from tqdm import tqdm

from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter


def grouper(iterable, n, fillvalue=None):
    """Standard itertools recipe: collect lines into fixed-length chunks."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def main():
    """
    Given a parallel corpus, partitions examples into training, development, and test sets.

    Provided output will be a directory containing the partitions:
    <corpus_name> /
        <corpus_name>_train.jsonl
        <corpus_name>_development.jsonl
        <corpus_name>_test.jsonl
        partition_info.txt

    when given a parallel corpus <corpus_name>.jsonl
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    project_root = os.path.abspath(os.path.realpath(os.path.join(
        os.path.dirname(  # Escape out into project directory.
            os.path.dirname( # Escape out into scripts directory.
                os.path.realpath(__file__))))))
    parser.add_argument("--corpus-path", type=str,
                        help="Path to the parallel JSON lines corpus.")
    parser.add_argument("--save-dir", type=str,
                        default=project_root,
                        help="Directory to store the train-dev-test split directory.")
    args = parser.parse_args()
    corpus_name = os.path.basename(args.corpus_path).split('.')[0]
    out_file_path = os.path.join(args.save_dir, corpus_name + "_tokenized.jsonl")
    out_file = open(out_file_path, 'w', encoding='utf-8')

    # Language-specific tokenizers.
    en_tokenizer = SpacyWordSplitter(language='en_core_web_sm')
    fr_tokenizer = SpacyWordSplitter(language='fr_core_news_sm')

    print("Tokenizing utterances for {}...".format(corpus_name))
    with open(args.corpus_path, encoding='utf-8') as f:
        for lines in tqdm(grouper(f, 100, '')):
            # When the grouper collects a group smaller than the batch, padding
            # is done via empty strings.
            # Check for them explicitly before continuing.
            examples = [ujson.loads(line.strip()) for line in filter(lambda l: l, lines)]
            en_utterances = [ex['en'] for ex in examples]
            fr_utterances = [ex['fr'] for ex in examples]

            en_utterances_tokenized = en_tokenizer.batch_split_words(en_utterances)
            fr_utterances_tokenized = fr_tokenizer.batch_split_words(fr_utterances)

            for i, ex in enumerate(examples):
                ex_tokenized = {
                    'id': ex['id'],
                    'en': ' '.join([token.text for token in en_utterances_tokenized[i]]),
                    'fr': ' '.join([token.text for token in fr_utterances_tokenized[i]])
                }
                ujson.dump(ex_tokenized, out_file, ensure_ascii=False)
                out_file.write('\n')

    out_file.close()


if __name__ == "__main__":
    main()
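
A rough usage sketch (not part of the original script): reading the tokenized output back. The file name here is hypothetical; the 'id'/'en'/'fr' keys and single-space joining follow the loop above.

import ujson

with open("my_corpus_tokenized.jsonl", encoding="utf-8") as f:  # hypothetical file name
    for line in f:
        example = ujson.loads(line)
        en_tokens = example['en'].split(' ')  # tokens were joined with single spaces
        fr_tokens = example['fr'].split(' ')
        print(example['id'], len(en_tokens), len(fr_tokens))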
Code example #2
import spacy

from allennlp.common.testing import AllenNlpTestCase
from allennlp.data.tokenizers.token import Token
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter


class TestSpacyWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSpacyWordSplitter, self).setUp()
        self.word_splitter = SpacyWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = [
            "this", "(", "sentence", ")", "has", "'", "crazy", "'", '"',
            "punctuation", '"', "."
        ]
        tokens = self.word_splitter.split_words(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "would've" is kept together, while "ain't" is not.
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = [
            "it", "ai", "n't", "joe", "'s", "problem", ";", "would", "been",
            "yesterday"
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones'   house  \x0b  55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that "etc." is not kept as a single token --- we could special-case this if we wanted.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = [
            "Mr.", "and", "Mrs.", "Jones", ",", "etc", ".", ",", "went", "to",
            ",", "e.g.", ",", "the", "store"
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        sentences = [
            "This is     a sentence", "This isn't a sentence.",
            "This is the 3rd     sentence."
            "Here's the 'fourth' sentence."
        ]
        batch_split = self.word_splitter.batch_split_words(sentences)
        separately_split = [
            self.word_splitter.split_words(sentence) for sentence in sentences
        ]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split,
                                                     separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence,
                                                 separate_sentence):
                assert batch_word.text == separate_word.text

    def test_keep_spacy_tokens(self):
        word_splitter = SpacyWordSplitter()
        sentence = "This should be an allennlp Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
Code example #3
import allennlp
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
spw = SpacyWordSplitter(pos_tags=True, ner=True)

sentence = 'I am going to the store in France. George Washington is awesome.'
sentences = [
    'I am going to the store in France.', 'George Washington is awesome',
    'I like ice cream.', 'The vikings are awesome people from Normandy'
]

# Split a single sentence; each token carries POS and NER annotations.
out = spw.split_words(sentence)
for o in out:
    print(o.pos_)  # part-of-speech tag for each token

for o in out:
    print(o.ent_type_)  # named-entity type for each token

# Split a batch of sentences in one call.
sent = spw.batch_split_words(sentences)

for out in sent:
    for o in out:
        print(o.pos_)  # part-of-speech tags, sentence by sentence

for out in sent:
    for o in out:
        print(o.ent_type_)  # named-entity types, sentence by sentence
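
A small self-contained follow-up sketch (not from the original snippet): with the default keep_spacy_tokens=False, split_words returns allennlp Token objects, so the text, POS tag, and entity type used above can be inspected together in one pass.

from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

splitter = SpacyWordSplitter(pos_tags=True, ner=True)
for token in splitter.split_words('George Washington is awesome.'):
    print(token.text, token.pos_, token.ent_type_)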