Code example #1
def read_wsc(dir_path):
    """Read the SuperGLUE WSC train/val/test jsonl splits into pandas DataFrames.

    Relies on ``os``, ``pandas as pd``, ``WhitespaceTokenizer`` and the
    project's ``read_jsonl_superglue`` helper being importable in this module.
    """
    df_dict = dict()
    tokenizer = WhitespaceTokenizer()
    meta_data = dict()
    meta_data['noun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
    meta_data['pronoun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
    for fold in ['train', 'val', 'test']:
        jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
        df = read_jsonl_superglue(jsonl_path)
        samples = []
        for i in range(len(df)):
            text = df.loc[i, 'text']
            if fold != 'test':
                label = df.loc[i, 'label']
            target = df.loc[i, 'target']
            span1_index = target['span1_index']
            span2_index = target['span2_index']
            span1_text = target['span1_text']
            span2_text = target['span2_text']
            # Convert the token-level span indices from 'target' into
            # character-level {'start', 'end'} offsets for the noun and pronoun.
            # offsets[i] is the (start, end) character span of the i-th token.
            tokens, offsets = tokenizer.encode_with_offsets(text, str)
            pos_start1 = offsets[span1_index][0]
            pos_end1 = pos_start1 + len(span1_text)
            pos_start2 = offsets[span2_index][0]
            pos_end2 = pos_start2 + len(span2_text)
            if fold == 'test':
                samples.append({
                    'text': text,
                    'noun': {
                        'start': pos_start1,
                        'end': pos_end1
                    },
                    'pronoun': {
                        'start': pos_start2,
                        'end': pos_end2
                    }
                })
            else:
                samples.append({
                    'text': text,
                    'noun': {
                        'start': pos_start1,
                        'end': pos_end1
                    },
                    'pronoun': {
                        'start': pos_start2,
                        'end': pos_end2
                    },
                    'label': label
                })
        df = pd.DataFrame(samples)
        df_dict[fold] = df
    return df_dict, meta_data
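A minimal standalone sketch (not part of the snippet above) of the encode_with_offsets call that read_wsc relies on. The import path gluonnlp.data.tokenizers is an assumption, and the sample sentence and token index are made up for illustration:

from gluonnlp.data.tokenizers import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
text = 'Mark told Pete many lies about himself .'
tokens, offsets = tokenizer.encode_with_offsets(text, str)
# offsets[i] is the (start, end) character span of tokens[i] inside `text`
span1_index = 0                          # token index of the noun ('Mark')
start = offsets[span1_index][0]
end = start + len('Mark')
assert text[start:end] == 'Mark'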
Code example #2
File: clean_tok_corpus.py Project: tlby/gluon-nlp
def get_tokenizer(tokenizer, lang=None):
    """Return a tokenizer for a name ('moses', 'whitespace', 'jieba'),
    or pass an existing BaseTokenizer instance through unchanged."""
    if isinstance(tokenizer, BaseTokenizer):
        return tokenizer
    else:
        if tokenizer == 'moses':
            return MosesTokenizer(lang=lang)
        elif tokenizer == 'whitespace':
            return WhitespaceTokenizer()
        elif tokenizer == 'jieba':
            return JiebaTokenizer()
        else:
            raise NotImplementedError
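Hypothetical usage of the helper above; the tokenizer names are the ones handled in the snippet, and only MosesTokenizer needs the language code:

tok = get_tokenizer('whitespace')              # lang is ignored for whitespace
moses_tok = get_tokenizer('moses', lang='en')  # Moses requires a language code
print(tok.encode('Hello world !'))             # ['Hello', 'world', '!']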
Code example #3
def test_whitespace_tokenizer():
    """Check WhitespaceTokenizer encode/decode against fixed English/German
    references; EN_SAMPLES, DE_SAMPLES, random_inject_space and the verify_*
    helpers are module-level fixtures defined elsewhere in the test file."""
    tokenizer = WhitespaceTokenizer()
    gt_en_tokenized = [['Four', 'score', 'and', 'seven', 'years', 'ago', 'our', 'fathers', 'brought',
                        'forth', 'on', 'this', 'continent,', 'a', 'new', 'nation,', 'conceived',
                        'in', 'Liberty,', 'and', 'dedicated', 'to', 'the', 'proposition', 'that',
                        'all', 'men', 'are', 'created', 'equal.'],
                       ['In', 'spite', 'of', 'the', 'debate', 'going', 'on', 'for', 'months',
                        'about', 'the', 'photos', 'of', 'Özil', 'with', 'the', 'Turkish',
                        'President', 'Recep', 'Tayyip', 'Erdogan,', 'he', 'regrets', 'the',
                        'return', 'of', 'the', '92-match', 'national', 'player', 'Özil.']]
    gt_de_tokenized = [['Goethe', 'stammte', 'aus', 'einer', 'angesehenen', 'bürgerlichen',
                        'Familie;', 'sein', 'Großvater', 'mütterlicherseits', 'war', 'als',
                        'Stadtschultheiß', 'höchster', 'Justizbeamter', 'der', 'Stadt',
                        'Frankfurt,', 'sein', 'Vater', 'Doktor', 'der', 'Rechte', 'und',
                        'kaiserlicher', 'Rat.'],
                       ['"Das', 'ist', 'eine', 'Frage,', 'die', 'natürlich', 'davon', 'abhängt,',
                        'dass', 'man', 'einmal', 'ins', 'Gespräch', 'kommt,', 'dass', 'man', 'mit',
                        'ihm', 'auch', 'darüber', 'spricht,', 'warum', 'er', 'das', 'eine', 'oder',
                        'andere', 'offenbar', 'so', 'empfunden', 'hat,', 'wie', 'das', 'in',
                        'seinem', 'Statement', 'niedergelegt', 'ist",', 'sagte', 'Grindel', 'im',
                        'Fußball-Podcast', '"Phrasenmäher"', 'der', '"Bild-Zeitung.']]
    for _ in range(2):
        # Inject noise and test for encode
        noisy_en_samples = [random_inject_space(ele) for ele in EN_SAMPLES]
        noisy_de_samples = [random_inject_space(ele) for ele in DE_SAMPLES]
        verify_encode_token(tokenizer, noisy_en_samples + noisy_de_samples,
                            gt_en_tokenized + gt_de_tokenized)
        # Test for decode
        verify_decode(tokenizer, EN_SAMPLES + DE_SAMPLES, str)
        # Test for encode_with_offsets
        verify_encode_token_with_offsets(tokenizer, noisy_en_samples + noisy_de_samples)
    verify_decode_no_vocab_raise(tokenizer)

    # Test for output_type = int
    vocab = Vocab(collections.Counter(sum(gt_en_tokenized + gt_de_tokenized,
                                          [])))
    tokenizer.set_vocab(vocab)
    verify_decode(tokenizer, EN_SAMPLES + DE_SAMPLES, int)
    verify_pickleble(tokenizer, WhitespaceTokenizer)
    verify_encode_token_with_offsets(tokenizer, EN_SAMPLES + DE_SAMPLES)
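A short sketch of the int-output path that the end of the test exercises: building a Vocab from whitespace tokens, attaching it with set_vocab, and encoding to token ids. The import paths are an assumption based on the current gluon-nlp layout:

import collections
from gluonnlp.data import Vocab
from gluonnlp.data.tokenizers import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
sentence = 'all men are created equal.'
vocab = Vocab(collections.Counter(tokenizer.encode(sentence, str)))
tokenizer.set_vocab(vocab)
ids = tokenizer.encode(sentence, int)   # token ids instead of token strings
print(ids)
print(tokenizer.decode(ids))            # should round-trip to the sentence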