def __init__(
        self,
        question_path,
        paragraph_path,
        ratio,
        batch_size,
        vocab: Vocab = Ref("model.vocab"),
        batch_first=Ref("model.batch_first", True),
):
    self.vocab = vocab
    question = Field(include_lengths=True, batch_first=batch_first,
                     pad_token=vocab.pad_token)
    question.vocab = vocab
    paragraph = Field(batch_first=batch_first, pad_token=vocab.pad_token)
    paragraph.vocab = vocab
    paragraphs = NestedField(paragraph, include_lengths=True)
    paragraphs.vocab = vocab
    target = Field(sequential=False, use_vocab=False, is_target=True)
    fields = [("question", question), ("paragraphs", paragraphs),
              ("target", target)]
    examples = []
    # read one question per line and `ratio` paragraphs for each question
    with open(paragraph_path) as paragraph_file, \
            open(question_path) as question_file:
        for q in question_file:
            q = q.strip()
            ps = [paragraph_file.readline().strip() for _ in range(ratio)]
            examples.append(Example.fromlist([q, ps, 0], fields))
    BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
    TorchTextDataset.__init__(self, examples, fields)
def make_fields(vocab_count, binary=True):
    text_field = Field(batch_first=True, include_lengths=True,
                       tokenize=lambda x: x.split(' '))
    text_field.vocab = Vocab(vocab_count['text'])
    char_nesting_field = Field(batch_first=True, tokenize=list)
    char_field = NestedField(char_nesting_field,
                             tokenize=lambda x: x.split(' '))
    char_nesting_field.vocab = Vocab(vocab_count['chars'])
    char_field.vocab = Vocab(vocab_count['chars'])
    pos1_field = Field(batch_first=True, sequential=False, use_vocab=False)
    pos2_field = Field(batch_first=True, sequential=False, use_vocab=False)
    pos1_rel_field = Field(sequential=True, batch_first=True)
    pos1_rel_field.vocab = Vocab(vocab_count['pos1_rel'])
    pos2_rel_field = Field(sequential=True, batch_first=True)
    pos2_rel_field.vocab = Vocab(vocab_count['pos2_rel'])
    if binary:
        label_field = Field(sequential=False, batch_first=True)
    else:
        label_field = Field(sequential=False, batch_first=True)
        label_field.vocab = Vocab(vocab_count['relation'], specials=[])
    reltype_field = Field(batch_first=True, sequential=False)
    reltype_field.vocab = Vocab(vocab_count['rel_type'])
    fields_dict = {
        'text': [('text', text_field), ('chars', char_field)],
        'pos1': ('pos1', pos1_field),
        'pos2': ('pos2', pos2_field),
        'pos1_rel': ('pos1_rel', pos1_rel_field),
        'pos2_rel': ('pos2_rel', pos2_rel_field),
        'relation': ('relation', label_field),
        'rel_type': ('rel_type', reltype_field),
    }
    return fields_dict
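# Hedged usage sketch for make_fields (not from the original code base): with
# the legacy torchtext (<=0.8) API, a dict of (name, field) entries like the
# one returned above is consumed via Example.fromdict, where the 'text' key
# feeds both the word-level and the character-level field. The raw record and
# vocab_count below are hypothetical.
from torchtext.data import Example

raw = {'text': 'John works at Google', 'pos1': 0, 'pos2': 3,
       'pos1_rel': 'r0 r1 r2 r3', 'pos2_rel': 'r-3 r-2 r-1 r0',
       'relation': 'works_at', 'rel_type': 'org'}
fields_dict = make_fields(vocab_count, binary=False)  # vocab_count: dict of Counters
ex = Example.fromdict(raw, fields_dict)
print(ex.text)   # ['John', 'works', 'at', 'Google']
print(ex.chars)  # [['J', 'o', 'h', 'n'], ['w', 'o', 'r', 'k', 's'], ...]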
def prepare_fields(pad_t):
    WORD_field = data.Field(use_vocab=False, batch_first=True,
                            sequential=True, pad_token=pad_t)
    WORD_nested_field = NestedField(
        data.Field(use_vocab=False, batch_first=True, sequential=True,
                   pad_token=pad_t))
    PAD_field = data.Field(use_vocab=False, batch_first=True,
                           sequential=True, pad_token=0)
    PAD_nested_field = NestedField(
        data.Field(use_vocab=False, batch_first=True, sequential=True,
                   pad_token=0))
    MASK_nested_field = NestedField(
        data.Field(use_vocab=False, batch_first=True, sequential=True,
                   pad_token=1.))
    fields = {
        'id': data.RawField(),
        'question': data.RawField(),
        'answers': data.RawField(),
        'src': WORD_nested_field,
        'src_mask': PAD_nested_field,
        'doc_mask': MASK_nested_field,
        'target': WORD_field,
        'target_mask': PAD_field,
    }
    return fields
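# Hedged sketch (hypothetical ids; not from the original code base): since
# every field in prepare_fields has use_vocab=False, the pipeline expects
# pre-numericalized input, and the nested fields take one id list per
# document. Example.fromdict wants {key: (name, field)} pairs, so the flat
# dict returned above is wrapped accordingly.
from torchtext.data import Example

fields = prepare_fields(pad_t=0)
ex = Example.fromdict(
    {'id': 'q0', 'question': 'who ?', 'answers': ['x'],
     'src': [[2, 5, 7], [2, 9]],           # token ids, one list per document
     'src_mask': [[1, 1, 1], [1, 1]],
     'doc_mask': [[0., 0., 0.], [0., 0.]],
     'target': [2, 5], 'target_mask': [1, 1]},
    {name: (name, field) for name, field in fields.items()})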
def initialize_fields(self):
    '''initializes fields'''
    # initialize the text field with the spacy tokenizer and no casing
    self.text_field = Field(tokenize='spacy', lower=True, batch_first=True)
    # initialize the tag field without an unknown token
    # (hopefully the train set contains all of the tags)
    self.tag_field = Field(unk_token=None, batch_first=True)
    # initialize the character field
    char_nesting_field = Field(tokenize=list, batch_first=True)
    self.char_field = NestedField(char_nesting_field)
    self.pad_token = self.text_field.pad_token
def lattice_fields(**kwargs):
    """Create lattice text and score fields.

    Args:
        include_lengths (bool): Optionally return the sequence lengths.
        pad (str, optional): Defaults to ``"<blank>"``.

    Returns:
        A ``(confnet_field, score_field)`` pair of ``LatticeMultiField``s.
    """
    include_lengths = kwargs["include_lengths"]
    pad = kwargs.get("pad", "<blank>")

    # word-level field nested inside the confusion-network field
    nesting_field_text = Field(pad_token=pad, use_vocab=True)
    text_field = NestedField(nesting_field_text, pad_token=pad,
                             include_lengths=include_lengths,
                             use_vocab=True)
    # arc posterior scores are floats and bypass the vocabulary
    nesting_field_scores = Field(pad_token=0.0, use_vocab=False,
                                 dtype=torch.float64)
    scores_field = NestedField(nesting_field_scores, use_vocab=False,
                               tokenize=None, dtype=torch.float64,
                               include_lengths=include_lengths)

    confnet_field = LatticeMultiField('confnet', text_field, [])
    score_field = LatticeMultiField('score', scores_field, [])
    return confnet_field, score_field
def define_fields(self):
    self.id_field = Field(sequential=False, tokenize=lambda x: x,
                          use_vocab=True)
    self.tweet_field = Field(sequential=True,
                             tokenize=DataLoader.tokenize_text,
                             include_lengths=False, lower=True,
                             fix_length=self.max_length, use_vocab=True)
    self.timestamp_field = Field(sequential=False, include_lengths=False,
                                 use_vocab=False)
    self.structure_field = Field(
        sequential=True,
        tokenize=lambda x: DataLoader.tokenize_structure(x),
        include_lengths=False,
        fix_length=self.config.max_tweets,
        pad_token=self.config.num_structure_index,
        use_vocab=False)
    self.label_field = Field(sequential=False, use_vocab=False)
    self.tweet_lst_field = NestedField(self.tweet_field,
                                       fix_length=self.config.max_tweets)
    self.timestamp_lst_field = NestedField(
        self.timestamp_field,
        pad_token=str(self.config.size),
        fix_length=self.config.max_tweets)
    self.structure_lst_field = NestedField(
        self.structure_field, fix_length=self.config.max_tweets)

    data_fields = {}
    for key, val in self.config.keys_order.items():
        if key == "post_id":
            data_fields[val] = (val, self.id_field)
        elif key == "content":
            data_fields[val] = (val, self.tweet_lst_field)
        elif key == "label":
            data_fields[val] = (val, self.label_field)
        elif key == "time_delay":
            data_fields[val] = (val, self.timestamp_lst_field)
        elif key == "structure":
            data_fields[val] = (val, self.structure_lst_field)
    self.data_fields = data_fields
class CharCorpus(object):
    def __init__(self, input_folder, min_word_freq, batch_size, wv_file=None):
        # list all the fields
        self.word_field = Field(lower=True)  # [sent len, batch_size]
        self.tag_field = Field(unk_token=None)  # [sent len, batch_size]
        ### BEGIN MODIFIED SECTION: CHARACTER EMBEDDING ###
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(
            self.char_nesting_field)  # [batch_size, sent len, word len]
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=input_folder,
            train="train.txt",
            test="test.txt",
            fields=((("word", "char"), (self.word_field, self.char_field)),
                    ("tag", self.tag_field)))
        ### END MODIFIED SECTION ###
        # convert fields to vocabulary list
        if wv_file:
            self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
            self.embedding_dim = self.wv_model.vector_size
            word_freq = {
                word: self.wv_model.wv.vocab[word].count
                for word in self.wv_model.wv.vocab
            }
            word_counter = Counter(word_freq)
            self.word_field.vocab = Vocab(word_counter,
                                          min_freq=min_word_freq)
            vectors = []
            for word, idx in self.word_field.vocab.stoi.items():
                if word in self.wv_model.wv.vocab.keys():
                    vectors.append(
                        torch.as_tensor(self.wv_model.wv[word].tolist()))
                else:
                    vectors.append(torch.zeros(self.embedding_dim))
            self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi,
                                              vectors=vectors,
                                              dim=self.embedding_dim)
        else:
            self.word_field.build_vocab(self.train_dataset.word,
                                        min_freq=min_word_freq)
        # build vocab for tags and characters
        self.char_field.build_vocab(self.train_dataset.char)  # NEWLY ADDED
        self.tag_field.build_vocab(self.train_dataset.tag)
        # create iterator for batch input
        self.train_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.test_dataset),
            batch_size=batch_size)
        # prepare padding index to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]  # NEWLY ADDED
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
class Robust45Hierarchical(Robust45):
    @staticmethod
    def clean_sentence(string):
        return clean_string(string, sentence_droprate=0, max_length=100)

    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
def get_data_fields(fixed_lengths: int) -> dict:
    """Creates torchtext fields for the I/O pipeline."""
    language = Field(batch_first=True, init_token=None, eos_token=None,
                     pad_token=None, unk_token=None)

    characters = Field(include_lengths=True, batch_first=True,
                       init_token=None, eos_token=END_TOKEN,
                       pad_token=PAD_TOKEN, fix_length=fixed_lengths)

    nesting_field = Field(tokenize=list, pad_token=PAD_TOKEN,
                          batch_first=True, init_token=None,
                          eos_token=END_TOKEN)
    paragraph = NestedField(nesting_field, pad_token=PAD_TOKEN,
                            eos_token=END_TOKEN, include_lengths=True)

    fields = {
        'characters': ('characters', characters),
        'paragraph': ('paragraph', paragraph),
        'language': ('language', language)
    }
    return fields
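# Hedged note on the fields above (legacy torchtext behavior; the variable
# names are illustrative): with include_lengths=True, a plain Field batch
# attribute unpacks into two tensors, while a NestedField batch attribute
# unpacks into three.
#
#   chars, char_lens = batch.characters          # [batch, T], [batch]
#   para, n_words, word_lens = batch.paragraph   # [batch, W, C], [batch],
#                                                # [batch, W]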
def gen_language_model_corpus(dataset_cls: torchtext.datasets.LanguageModelingDataset):
    field_char = NestedField(
        Field(pad_token=PAD_WORD, tokenize=list, init_token=SOS_WORD,
              eos_token=EOS_WORD, batch_first=True),
        pad_token=PAD_WORD)
    field_word = Field(batch_first=True)
    dataset_char = dataset_cls.splits(field_char)
    dataset_word = dataset_cls.splits(field_word)
    # build both vocabularies from the training split
    field_char.build_vocab(dataset_char[0])
    field_word.build_vocab(dataset_word[0])
    return list(zip(dataset_word, dataset_char)), field_word, field_char
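# Hedged usage sketch (assumes WikiText2, one of the LanguageModelingDataset
# subclasses bundled with legacy torchtext, and the PAD/SOS/EOS constants
# used above):
import torchtext

datasets, field_word, field_char = gen_language_model_corpus(
    torchtext.datasets.WikiText2)
train_word, train_char = datasets[0]  # paired word- and char-level splits
print(len(field_word.vocab), len(field_char.vocab))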
def __init__(self, path, batch_size,
             vocab: Vocab = Ref("model.vocab"),
             batch_first=Ref("model.batch_first", True)):
    self.vocab = vocab
    question = Field(include_lengths=True, use_vocab=False,
                     pad_token=vocab.pad_index, batch_first=batch_first)
    paragraph = Field(batch_first=batch_first, pad_token=vocab.pad_index,
                      use_vocab=False)
    paragraphs = NestedField(paragraph, include_lengths=True)
    target = Field(sequential=False, use_vocab=False, is_target=True)
    fields = [("question", question), ("paragraphs", paragraphs),
              ("target", target)]

    import h5py
    self.data = h5py.File(path, "r")
    ds = self.data["examples"]
    ratio = ds.attrs["ratio"]
    TorchTextDataset.__init__(self, self.ExampleWrapper(ds, ratio, fields),
                              fields)
    BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
def get_data_fields():
    """Creates torchtext fields for the I/O pipeline."""
    form = Field(include_lengths=True, batch_first=True, init_token=None,
                 eos_token=None, pad_token=PAD_TOKEN, lower=True)

    pos = Field(include_lengths=True, batch_first=True, init_token=ROOT_TOKEN,
                eos_token=END_TOKEN, pad_token=PAD_TOKEN, unk_token=None)

    nesting_field = Field(tokenize=list, pad_token=PAD_TOKEN,
                          batch_first=True, init_token=None, eos_token=None)
    chars = NestedField(nesting_field, init_token=None, pad_token=PAD_TOKEN,
                        eos_token=None, include_lengths=True)

    fields = {
        'form': ('form', form),
        'pos': ('pos', pos),
        'chars': ('chars', chars)
    }
    return fields
def create_fields(use_prefix=False, use_suffix=False, use_chars=False,
                  lower_words=True, lower_prefixes=True, lower_suffixes=True,
                  lower_chars=False):
    WORDS = Field(batch_first=True, lower=lower_words, init_token='<s>',
                  eos_token='</s>')
    TAGS = Field(batch_first=True, init_token='<s>', eos_token='</s>')
    PREFIXES_2 = Field(batch_first=True, lower=lower_prefixes,
                       init_token='<s>', eos_token='</s>')
    PREFIXES_3 = Field(batch_first=True, lower=lower_prefixes,
                       init_token='<s>', eos_token='</s>')
    SUFFIXES_2 = Field(batch_first=True, lower=lower_suffixes,
                       init_token='<s>', eos_token='</s>')
    SUFFIXES_3 = Field(batch_first=True, lower=lower_suffixes,
                       init_token='<s>', eos_token='</s>')
    CHARS = NestedField(Field(batch_first=True, lower=lower_chars,
                              pad_token='<cpad>', unk_token='<cunk>',
                              tokenize=list, init_token='<w>',
                              eos_token='</w>'),
                        init_token='<s>', eos_token='</s>')

    field_odict = collections.OrderedDict({
        'words': WORDS,
        'tags': TAGS,
        'prefs_2': None,
        'prefs_3': None,
        'suffs_2': None,
        'suffs_3': None,
        'chars': None,
    })
    if use_prefix:
        field_odict['prefs_2'] = PREFIXES_2
        field_odict['prefs_3'] = PREFIXES_3
    if use_suffix:
        field_odict['suffs_2'] = SUFFIXES_2
        field_odict['suffs_3'] = SUFFIXES_3
    if use_chars:
        field_odict['chars'] = CHARS
    return field_odict
class IMDB_HAN(IMDB):
    NESTING = Field(sequential=True, batch_first=True, lower=True,
                    use_vocab=True, tokenize=clean_string)
    TEXT = NestedField(NESTING, tokenize=split_sents, include_lengths=True)
    LABEL = Field(sequential=False, use_vocab=False, batch_first=True,
                  preprocessing=process_labels)
def _init_fields(self):
    self.words = Field(batch_first=True, init_token='<s>', eos_token='</s>')
    self.lab = Field(batch_first=True, unk_token=None, pad_token=None)
    self.char = NestedField(Field(batch_first=True, tokenize=list,
                                  unk_token='<cunk>', init_token='<w>',
                                  eos_token='</w>'),
                            init_token='<s>', eos_token='</s>')
    self.labeled_fields = [(self.WORDS_NAME, self.words),
                           (self.CHAR_NAME, self.char),
                           (self.LAB_NAME, self.lab)]
    self.unlabeled_fields = [(self.WORDS_NAME, self.words),
                             (self.CHAR_NAME, self.char)]
    self.logger.info('fields initialized successfully')
class FakeHealth_HAN(FakeHealth):
    NESTING = Field(sequential=True, batch_first=True, lower=True,
                    use_vocab=True, tokenize=clean_string)
    TEXT = NestedField(NESTING, tokenize=sent_tokenize, include_lengths=True)
    LABEL = Field(sequential=False, use_vocab=False, batch_first=True,
                  preprocessing=process_labels)
    ID = Field(sequential=False, use_vocab=False, batch_first=True,
               preprocessing=process_ids)
def load_dataset(config, device):
    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab=False, sequential=False, dtype=torch.long,
                  preprocessing=lambda x: label_dict[x.strip()])
    SEQ = Field(dtype=torch.long, lower=True, batch_first=True,
                preprocessing=lambda x: x[:45], include_lengths=True)
    SENT = Field(dtype=torch.long, lower=True, batch_first=True,
                 preprocessing=lambda x: x[:45], include_lengths=False)
    DOC = NestedField(SENT, tokenize=lambda s: s.strip().split(' </s> '),
                      preprocessing=lambda s: [x for x in s[:45] if x],
                      dtype=torch.long, include_lengths=True)

    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),
              ('abst', SEQ), ('body', DOC)]
    train, test = TabularDataset.splits(path="../stance_data/", format="tsv",
                                        fields=fields,
                                        train=config.train_file,
                                        test=config.test_file)
    train, val = train.split(split_ratio=0.80)

    vectors = GloVe(name="6B", dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)
    # share the document vocabulary with the flat sequence field
    SEQ.vocab = DOC.vocab
    config.vocab_size = len(DOC.vocab)

    train_loader, val_loader, test_loader = Iterator.splits(
        (train, val, test), batch_sizes=(config.batch_size, 256, 256),
        sort_key=lambda x: len(x.body), sort=True, device=device,
        shuffle=True, repeat=False)
    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
def load_dataset(config, device):
    LABEL = Field(sequential=False, dtype=torch.long, use_vocab=False,
                  batch_first=True,
                  preprocessing=lambda x: 1 if float(x) > 0. else 0)
    TARGET = Field(batch_first=True, lower=True, dtype=torch.long,
                   preprocessing=lambda x: x[0].split('_'),
                   include_lengths=True)
    TEXT = Field(dtype=torch.long, lower=True, batch_first=True,
                 preprocessing=lambda x: x[:50])
    LEADS = NestedField(TEXT, dtype=torch.long, include_lengths=True,
                        tokenize=lambda s: s.split('</s>'),
                        preprocessing=lambda x: x[-5:])
    DOC = NestedField(TEXT, dtype=torch.long, include_lengths=True,
                      tokenize=lambda s: s.split('</s>'),
                      preprocessing=lambda x: [s for s in x[1:50] if s])
    DOCS = NestNestedField(DOC, dtype=torch.long, include_lengths=True,
                           tokenize=lambda s: s.split('</p>'),
                           preprocessing=lambda x: x[-5:])

    fields = [('label', LABEL), ('target', TARGET), ('leads', LEADS),
              ('docs', DOCS)]
    train, val, test = TabularDataset.splits(path="../abrt_data/",
                                             format="tsv", fields=fields,
                                             train=config.train_file,
                                             validation=config.dev_file,
                                             test=config.test_file)
    TARGET.build_vocab(train, val, test)
    DOCS.build_vocab(train, val, test)
    config.wvocab_size = len(DOCS.vocab)
    config.tvocab_size = len(TARGET.vocab)

    train_loader, val_loader, test_loader = BucketIterator.splits(
        (train, val, test), sort_key=lambda x: len(x.docs), sort=True,
        batch_sizes=(config.batch_size, 32, 32), device=device, repeat=False)
    return (train_loader, val_loader, test_loader)
def get_data_fields(model_type) -> dict:
    """Creates torchtext fields for the I/O pipeline."""
    language_per_word = Field(include_lengths=True, batch_first=True,
                              init_token=None, eos_token=END_TOKEN,
                              pad_token=PAD_TOKEN)
    language_per_char = Field(include_lengths=True, batch_first=True,
                              init_token=None, eos_token=END_TOKEN,
                              pad_token=PAD_TOKEN)
    characters = Field(include_lengths=True, batch_first=True,
                       init_token=None, eos_token=END_TOKEN,
                       pad_token=PAD_TOKEN)
    nesting_field = Field(tokenize=list, pad_token=PAD_TOKEN,
                          batch_first=True, eos_token=None)
    if model_type != "recurrent":
        paragraph = NestedField(nesting_field, pad_token=PAD_TOKEN,
                                eos_token=END_TOKEN, include_lengths=True)
    else:  # FIXME BACK
        paragraph = Field(include_lengths=True, batch_first=True,
                          init_token=None, eos_token=END_TOKEN,
                          pad_token=PAD_TOKEN)

    fields = {
        'characters': ('characters', characters),
        'paragraph': ('paragraph', paragraph),
        'language_per_word': ('language_per_word', language_per_word),
        'language_per_char': ('language_per_char', language_per_char)
    }
    return fields
def __init__(self, glove=True, device=device):
    self.device = device
    nlp = spacy.load("en_core_web_sm")

    char_nesting = Field(batch_first=True, tokenize=list, lower=True)
    char = NestedField(char_nesting, init_token="<sos>", eos_token="<eos>",
                       tokenize="spacy")
    word = Field(init_token="<sos>", eos_token="<eos>", lower=True,
                 tokenize="spacy")
    label = Field(sequential=False, is_target=True, use_vocab=False)

    self.fields = [("question_char", char), ("question_word", word),
                   ("context_char", char), ("context_word", word),
                   ("answer", label)]
    self.dict_fields = {
        "question": [("question_char", char), ("question_word", word)],
        "context": [("context_char", char), ("context_word", word)],
        "answer": ("answer", label)
    }

    self.train_data = self._get_data("../data/train.jsonl")
    self.dev_data = self._get_data("../data/dev.jsonl")

    char.build_vocab(self.train_data)
    if glove:
        word.build_vocab(self.train_data, vectors=GloVe(name="6B", dim=100))
    else:
        word.build_vocab(self.train_data,
                         vectors=FastText(language='en', max_vectors=30000))
    self.char_vocab = char.vocab
    self.word_vocab = word.vocab

    # collect POS and NER tags over the training data with spaCy
    pos, ner = [], []
    ind2pos, ind2ner = [], []
    for data in tqdm(self.train_data):
        doc = nlp(' '.join(data.question_word + data.context_word))
        pos.extend([t.pos_ for t in doc])  # t - token
        ner.extend([e.label_ for e in doc.ents])
        ind2pos.extend([[self.word_vocab.stoi[str(t)], t.pos_] for t in doc])
        ind2ner.extend([[self.word_vocab.stoi[str(e)], e.label_]
                        for e in doc.ents])
    self.pos_voc = {tag: i for i, tag in enumerate(set(pos))}
    self.ner_voc = {tag: i + 1 for i, tag in enumerate(set(ner))}
    self.ner_voc['None'] = 0
    # default values, used in the DrQA model: unseen words fall back to the
    # generic 'X' POS tag and the 'None' entity type
    self.ind2pos = defaultdict(lambda: self.pos_voc['X'])
    self.ind2ner = defaultdict(lambda: self.ner_voc['None'])
    self.ind2pos.update({tag[0]: self.pos_voc[tag[1]] for tag in ind2pos})
    self.ind2ner.update({tag[0]: self.ner_voc[tag[1]] for tag in ind2ner})
class LyricsArtistHierarchical(LyricsArtist):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
class JiraHierarchical(Jira):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
def test_inference_performance():
    from sklearn.metrics import f1_score
    from torchtext.datasets import SequenceTaggingDataset
    from torchtext.data import Field, NestedField

    WORD = Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
    ENTITY = Field(init_token='<bos>', eos_token='<eos>')

    data_file = tempfile.NamedTemporaryFile(delete=True)
    data_file.write(requests.get(CORA_URL).content)
    data_file.flush()

    fields = [(('text', 'char'), (WORD, CHAR))] + [(None, None)] * 22 + \
             [('entity', ENTITY)]
    dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

    model = Model(model_path='models/neuralParsCit')
    model.parameters['pre_emb'] = os.path.join(os.getcwd(),
                                               'vectors_with_unk.kv')
    f = model.build(training=False, **model.parameters)
    model.reload()

    word_to_id = {v: i for i, v in model.id_to_word.items()}
    char_to_id = {v: i for i, v in model.id_to_char.items()}
    tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

    # write the examples back out as plain text, one sentence per block;
    # open in text mode so writing str works under Python 3
    tf = tempfile.NamedTemporaryFile(delete=False, mode='w')
    tf.write("\n\n".join(
        ["\n".join(example.text) for example in dataset.examples]))
    tf.close()
    train_sentences = load_sentences(tf.name, model.parameters['lower'],
                                     model.parameters['zeros'])
    train_inputs = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                   model.parameters['lower'], True)

    preds = []
    for citation in train_inputs:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]
        preds.append([(w, y_pred[i])
                      for i, w in enumerate(citation['str_words'])])

    assert len(preds) == len(dataset.examples)

    results = []
    for P, T in zip(preds, dataset.examples):
        for p, t in zip(P, zip(T.text, T.entity)):
            results.append((p[1], tag_to_id[t[1]]))

    pred, true = zip(*results)
    eval_metrics = {
        'micro_f1': f1_score(true, pred, average='micro'),
        'macro_f1': f1_score(true, pred, average='macro')
    }
    data_file.close()

    assert eval_metrics == pytest.approx({
        'macro_f1': 0.984,
        'micro_f1': 0.993
    }, abs=0.001)
class SpringDiffTokenHierarchical(SpringDiffToken):
    NESTING_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE_FIELD = NestedField(NESTING_FIELD, tokenize=split_json)
class AppReviewsHierarchical(AppReviews):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
class VulasPairedTokenHierarchical(VulasPairedToken):
    NESTING1_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE1_FIELD = NestedField(NESTING1_FIELD, tokenize=split_json)
    NESTING2_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE2_FIELD = NestedField(NESTING2_FIELD, tokenize=split_json)
class YELP14Hierarchical(YELP14):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents,
                             include_lengths=True)
def __init__(self, args):
    # list all the fields
    self.word_field = Field(lower=True)
    self.event_field = Field(unk_token=None)
    self.entity_field = Field(unk_token=None)
    self.argument_field = Field(unk_token=None)
    self.trigger_pos_field = Field(unk_token=None)
    self.char_nesting_field = Field(tokenize=list)
    self.char_field = NestedField(self.char_nesting_field)
    self.wv = args.wv_file
    # create dataset using built-in parser from torchtext
    self.train_dataset, self.val_dataset, self.test_dataset = SequenceTaggingDataset.splits(
        path=args.input_folder,
        train="train.txt",
        validation="dev.txt",
        test="test.txt",
        fields=((("word", "char"), (self.word_field, self.char_field)),
                ("event", self.event_field),
                ("entity", self.entity_field),
                ("argument", self.argument_field),
                ("trigger_pos", self.trigger_pos_field)),
    )
    # convert fields to vocabulary list
    self.event_field.build_vocab(self.train_dataset.event)
    if args.wv_file:
        print("start loading embedding")
        self.wv_model = gensim.models.KeyedVectors.load_word2vec_format(
            args.wv_file, binary=True)
        print("done loading embedding")
        self.embedding_dim = self.wv_model.vector_size
        word_freq = {
            word: self.wv_model.wv.vocab[word].count
            for word in self.wv_model.wv.vocab
        }
        word_counter = Counter(word_freq)
        self.word_field.vocab = Vocab(word_counter,
                                      min_freq=args.min_word_freq)
        # map each vector/embedding from the word2vec model to the
        # word_field vocabulary
        vectors = []
        print("start loading vec", len(self.word_field.vocab.stoi))
        for word, idx in self.word_field.vocab.stoi.items():
            if word in self.wv_model.wv.vocab.keys():
                vectors.append(
                    torch.as_tensor(self.wv_model.wv[word].tolist()))
            else:
                vectors.append(torch.zeros(self.embedding_dim))
        print("done loading vec")
        del self.wv_model
        self.word_field.vocab.set_vectors(
            stoi=self.word_field.vocab.stoi,
            # list of vector embeddings, ordered according to word_field.vocab
            vectors=vectors,
            dim=self.embedding_dim)
    else:
        self.word_field.build_vocab(self.train_dataset.word,
                                    min_freq=args.min_word_freq)
    self.char_field.build_vocab(self.train_dataset.char)
    self.entity_field.build_vocab(self.train_dataset.entity)
    self.argument_field.build_vocab(self.train_dataset.argument)
    self.trigger_pos_field.build_vocab(self.train_dataset.trigger_pos)
    # create iterator for batch input
    self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
        datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
        batch_size=args.batch_size,
        shuffle=False,
    )
    # prepare padding indices to be ignored during model training/evaluation
    self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
    self.event_pad_idx = self.event_field.vocab.stoi[self.event_field.pad_token]
    self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]
    self.entity_pad_idx = self.entity_field.vocab.stoi[self.entity_field.pad_token]
    self.argument_pad_idx = self.argument_field.vocab.stoi[self.argument_field.pad_token]
class Corpus(object):
    def __init__(self, data_path, vector_path, glove6b, embedding_dim,
                 min_word_freq, max_vocab_size, batch_size, device, test,
                 prefix):
        '''
        class for interacting with dataset

        data_path: root path for dataset directory
        vector_path: path for vector_cache
        glove6b: switch for using glove.6b pretrained embeddings
        embedding_dim: dimension of embedding (50, 100, 200, or 300 for glove.6b)
        min_word_freq: ignore words that don't meet the frequency threshold in the text field
        max_vocab_size: maximum size of the vocabulary of the text field
        batch_size: batch size for data iterators
        device: torch device
        test: switch for using a built-in torchtext dataset (hopefully more likely to work)
        prefix: prefix to be appended to data path
        '''
        # set all of the attributes
        self.data_path, self.vector_path, self.glove6b = data_path, vector_path, glove6b
        self.embedding_dim, self.min_word_freq, self.max_vocab_size = embedding_dim, min_word_freq, max_vocab_size
        self.batch_size = batch_size
        self.device, self.test, self.prefix = device, test, prefix
        # initialize text and tag fields
        self.initialize_fields()
        # load dataset
        self.load_data()
        # build vocabularies from text and tag data
        self.build_vocabularies()
        # build iterators for batches of train, dev, and test sets
        self.initialize_iterators()
        # initialize indices of padding and unknown tokens
        self.init_idxs()

    def initialize_fields(self):
        '''initializes fields'''
        # initialize the text field with the spacy tokenizer and no casing
        self.text_field = Field(tokenize='spacy', lower=True, batch_first=True)
        # initialize the tag field without an unknown token
        # (hopefully the train set contains all of the tags)
        self.tag_field = Field(unk_token=None, batch_first=True)
        # initialize the character field
        char_nesting_field = Field(tokenize=list, batch_first=True)
        self.char_field = NestedField(char_nesting_field)
        self.pad_token = self.text_field.pad_token

    def load_data(self):
        '''load data from file using torchtext'''
        if self.test:
            # built-in datasets
            if self.prefix == 'udpos':
                self.train_set, self.valid_set, self.test_set = UDPOS.splits(
                    fields=((('text', 'char'), (self.text_field, self.char_field)),
                            ('tag', self.tag_field), ('pos', None)),
                    root=self.data_path)
            elif self.prefix == 'conll2000':
                self.train_set, self.valid_set, self.test_set = CoNLL2000Chunking.splits(
                    fields=((('text', 'char'), (self.text_field, self.char_field)),
                            ('pos', None), ('tag', self.tag_field)),
                    root=self.data_path)
        else:
            # load datasets from pre-prepared tsv files
            self.train_set, self.valid_set, self.test_set = SequenceTaggingDataset.splits(
                fields=((('text', 'char'), (self.text_field, self.char_field)),
                        ('tag', self.tag_field)),
                path=self.data_path + '/{}'.format(self.prefix),
                train='train.tsv', validation='dev.tsv', test='test.tsv')

    def build_vocabularies(self):
        '''builds vocabularies for the text and tag data'''
        # if a vector path is provided, make sure the word vectors are handled
        if self.vector_path:
            if self.glove6b:
                # the way to do this is built-in with glove.6b
                self.text_field.build_vocab(
                    self.train_set.text,
                    max_size=self.max_vocab_size,
                    min_freq=self.min_word_freq,
                    vectors='glove.6B.{}d'.format(self.embedding_dim),
                    vectors_cache=self.vector_path,
                    unk_init=torch.Tensor.normal_)
            else:
                # not sure if this is working
                self.text_field.build_vocab(
                    self.train_set.text,
                    max_size=self.max_vocab_size,
                    min_freq=self.min_word_freq,
                    vectors_cache=self.vector_path)
                #############################################################
                # not currently working due to conflict between gensim and  #
                # python version                                            #
                #############################################################
                # self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
                # self.embedding_dim = self.wv_model.vector_size
                # word_freq = {word: self.wv_model.wv.vocab[word].count
                #              for word in self.wv_model.wv.vocab}
                # word_counter = Counter(word_freq)
                # self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
                # vectors = []
                # for word, idx in self.word_field.vocab.stoi.items():
                #     if word in self.wv_model.wv.vocab.keys():
                #         vectors.append(torch.as_tensor(self.wv_model.wv[word].tolist()))
                #     else:
                #         vectors.append(torch.zeros(self.embedding_dim))
                # self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi,
                #                                   vectors=vectors,
                #                                   dim=self.embedding_dim)
        else:
            # no vectors required
            self.text_field.build_vocab(self.train_set.text,
                                        max_size=self.max_vocab_size,
                                        min_freq=self.min_word_freq)
        # build vocabularies for the characters and tags (nothing fancy needed)
        self.char_field.build_vocab(self.train_set.char)
        self.tag_field.build_vocab(self.train_set.tag)

    def initialize_iterators(self):
        '''build iterators for data (by batches) using the bucket iterator'''
        self.train_iter, self.valid_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_set, self.valid_set, self.test_set),
            batch_size=self.batch_size,
            device=self.device,
            random_state=seed)

    def init_idxs(self):
        '''saves indices for padding and unknown tokens'''
        self.text_pad_idx = self.text_field.vocab.stoi[self.text_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
        self.text_unk_idx = self.text_field.vocab.stoi[self.text_field.unk_token]
class IMDBHierarchical(IMDB):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string,
                          fix_length=50)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=Sentence_Tokenize())
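# To make the pattern shared by the *Hierarchical classes concrete, a
# self-contained sketch (legacy torchtext <=0.8; the inline lambdas are
# hypothetical stand-ins for clean_string/split_sents): the outer NestedField
# tokenizes a document into sentences, the inner Field tokenizes each
# sentence into words, and batches come out as [batch, n_sents, n_words].
from torchtext.data import Dataset, Example, Field, Iterator, NestedField

nesting = Field(batch_first=True, tokenize=lambda s: s.split())
text = NestedField(nesting, tokenize=lambda d: d.split('. '))
fields = [('text', text)]
docs = ['the cat sat. the dog barked', 'short doc']
dataset = Dataset([Example.fromlist([d], fields) for d in docs], fields)
text.build_vocab(dataset)  # also sets the nesting field's vocab
batch = next(iter(Iterator(dataset, batch_size=2, sort=False)))
print(batch.text.shape)  # torch.Size([2, 2, 3]) = [batch, n_sents, n_words]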