import collections
import os

import pandas as pd

# The imports below assume GluonNLP's tokenizer/vocab API; adjust the paths
# to your package layout if they differ.
from gluonnlp.data import Vocab
from gluonnlp.data.tokenizers import (BaseTokenizer, JiebaTokenizer,
                                      MosesTokenizer, WhitespaceTokenizer)


def read_wsc(dir_path):
    """Read the SuperGLUE WSC folds into DataFrames with character-level spans."""
    df_dict = dict()
    tokenizer = WhitespaceTokenizer()
    meta_data = dict()
    meta_data['noun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
    meta_data['pronoun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
    for fold in ['train', 'val', 'test']:
        jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
        df = read_jsonl_superglue(jsonl_path)
        samples = []
        for i in range(len(df)):
            text = df.loc[i, 'text']
            if fold != 'test':
                label = df.loc[i, 'label']
            target = df.loc[i, 'target']
            span1_index = target['span1_index']
            span2_index = target['span2_index']
            span1_text = target['span1_text']
            span2_text = target['span2_text']
            # Convert the token-level span indices into character-level
            # entity offsets, e.g. 'noun': {'start': 0, 'end': 100}.
            tokens, offsets = tokenizer.encode_with_offsets(text, str)
            pos_start1 = offsets[span1_index][0]
            pos_end1 = pos_start1 + len(span1_text)
            pos_start2 = offsets[span2_index][0]
            pos_end2 = pos_start2 + len(span2_text)
            sample = {'text': text,
                      'noun': {'start': pos_start1, 'end': pos_end1},
                      'pronoun': {'start': pos_start2, 'end': pos_end2}}
            # The test fold ships without labels.
            if fold != 'test':
                sample['label'] = label
            samples.append(sample)
        df = pd.DataFrame(samples)
        df_dict[fold] = df
    return df_dict, meta_data
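# `read_jsonl_superglue` is defined elsewhere in the preparation script. The
# sketch below is a guess at a minimal implementation (one JSON record per
# line, collected into a DataFrame); it is not necessarily the script's
# actual code.
import json


def read_jsonl_superglue(jsonl_path):
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)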
def get_tokenizer(tokenizer, lang=None):
    """Return a tokenizer instance, constructing one from its name if needed."""
    if isinstance(tokenizer, BaseTokenizer):
        return tokenizer
    if tokenizer == 'moses':
        return MosesTokenizer(lang=lang)
    elif tokenizer == 'whitespace':
        return WhitespaceTokenizer()
    elif tokenizer == 'jieba':
        return JiebaTokenizer()
    else:
        raise NotImplementedError('Unsupported tokenizer: {}'.format(tokenizer))
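# A small usage sketch for the factory above (assumes GluonNLP's tokenizer
# API, where encode(sentence, str) returns the tokens as a list of strings).
def _demo_get_tokenizer():
    tokenizer = get_tokenizer('whitespace')
    assert tokenizer.encode('Four score and seven years ago', str) == \
        ['Four', 'score', 'and', 'seven', 'years', 'ago']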
def test_whitespace_tokenizer():
    tokenizer = WhitespaceTokenizer()
    # EN_SAMPLES / DE_SAMPLES and the verify_* helpers are module-level
    # fixtures shared by the tokenizer tests.
    gt_en_tokenized = [
        ['Four', 'score', 'and', 'seven', 'years', 'ago', 'our', 'fathers',
         'brought', 'forth', 'on', 'this', 'continent,', 'a', 'new',
         'nation,', 'conceived', 'in', 'Liberty,', 'and', 'dedicated', 'to',
         'the', 'proposition', 'that', 'all', 'men', 'are', 'created',
         'equal.'],
        ['In', 'spite', 'of', 'the', 'debate', 'going', 'on', 'for',
         'months', 'about', 'the', 'photos', 'of', 'Özil', 'with', 'the',
         'Turkish', 'President', 'Recep', 'Tayyip', 'Erdogan,', 'he',
         'regrets', 'the', 'return', 'of', 'the', '92-match', 'national',
         'player', 'Özil.']]
    gt_de_tokenized = [
        ['Goethe', 'stammte', 'aus', 'einer', 'angesehenen', 'bürgerlichen',
         'Familie;', 'sein', 'Großvater', 'mütterlicherseits', 'war', 'als',
         'Stadtschultheiß', 'höchster', 'Justizbeamter', 'der', 'Stadt',
         'Frankfurt,', 'sein', 'Vater', 'Doktor', 'der', 'Rechte', 'und',
         'kaiserlicher', 'Rat.'],
        ['"Das', 'ist', 'eine', 'Frage,', 'die', 'natürlich', 'davon',
         'abhängt,', 'dass', 'man', 'einmal', 'ins', 'Gespräch', 'kommt,',
         'dass', 'man', 'mit', 'ihm', 'auch', 'darüber', 'spricht,',
         'warum', 'er', 'das', 'eine', 'oder', 'andere', 'offenbar', 'so',
         'empfunden', 'hat,', 'wie', 'das', 'in', 'seinem', 'Statement',
         'niedergelegt', 'ist",', 'sagte', 'Grindel', 'im',
         'Fußball-Podcast', '"Phrasenmäher"', 'der', '"Bild-Zeitung.']]
    for _ in range(2):
        # Inject whitespace noise and test encode
        noisy_en_samples = [random_inject_space(ele) for ele in EN_SAMPLES]
        noisy_de_samples = [random_inject_space(ele) for ele in DE_SAMPLES]
        verify_encode_token(tokenizer, noisy_en_samples + noisy_de_samples,
                            gt_en_tokenized + gt_de_tokenized)
        # Test decode
        verify_decode(tokenizer, EN_SAMPLES + DE_SAMPLES, str)
        # Test encode_with_offsets
        verify_encode_token_with_offsets(tokenizer,
                                         noisy_en_samples + noisy_de_samples)
    verify_decode_no_vocab_raise(tokenizer)

    # Test output_type=int once a vocabulary has been attached
    vocab = Vocab(collections.Counter(sum(gt_en_tokenized + gt_de_tokenized,
                                          [])))
    tokenizer.set_vocab(vocab)
    verify_decode(tokenizer, EN_SAMPLES + DE_SAMPLES, int)
    verify_pickleble(tokenizer, WhitespaceTokenizer)
    verify_encode_token_with_offsets(tokenizer, EN_SAMPLES + DE_SAMPLES)
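# `random_inject_space` and the `verify_*` helpers above are defined
# elsewhere in the test module. The sketches below are assumptions about
# minimal implementations for two of them, not the module's actual code:
# the noise injection only varies the amount of whitespace, so the
# whitespace-token sequence must stay unchanged.
import random


def random_inject_space(sentence):
    # Hypothetical helper: append 0-2 extra spaces after each token, so the
    # surface form changes but splitting on whitespace gives the same tokens.
    tokens = sentence.split()
    return ' '.join(tok + ' ' * random.randint(0, 2) for tok in tokens)


def verify_encode_token(tokenizer, samples, gt_tokenized):
    # Hypothetical helper: encode(sample, str) should reproduce the
    # ground-truth token lists.
    for sample, gt in zip(samples, gt_tokenized):
        assert tokenizer.encode(sample, str) == gt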