def __init__(self, fields, readers, data, dirs, sort_key,
             filter_pred=None):
    """Read, join and wrap raw examples, then initialise the parent dataset.

    Args:
        fields: dict mapping a field name to a field object. When both
            ``'src_map'`` and ``'alignment'`` are present, the ``'src'``
            and ``'tgt'`` entries must expose ``base_field``.
        readers: objects with a ``read(a, b, dir)`` method, one per entry
            of ``data``/``dirs``.
        data: indexable items; element 1 and element 0 of each item are
            passed (in that order) to the matching reader.
        dirs: one directory argument per reader.
        sort_key: stored on the instance as ``self.sort_key``.
        filter_pred: forwarded unchanged to the parent constructor.

    An empty corpus (readers that yield nothing) now produces a dataset
    with no examples and no fields instead of failing with
    ``UnboundLocalError``.
    """
    self.sort_key = sort_key
    can_copy = 'src_map' in fields and 'alignment' in fields

    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]

    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    # Bound before the loop so the post-loop code is safe when the readers
    # yield no examples at all.
    ex_fields = {}
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        # Only keep the fields this example actually has a value for.
        ex_fields = {k: [(k, v)] for k, v in fields.items()
                     if k in ex_dict}
        examples.append(Example.fromdict(ex_dict, ex_fields))

    # fields needs to have only keys that examples have as attrs; the
    # mapping built for the last example is used as the reference.
    fields = []
    for nf_list in ex_fields.values():
        assert len(nf_list) == 1
        fields.append(nf_list[0])

    super(Dataset, self).__init__(examples, fields, filter_pred)
def __init__(self, annFile, text_field, transform=None):
    """Build a caption dataset from a COCO annotation file.

    Every caption of every image in ``annFile`` becomes one example with
    a single ``text`` field. ``transform``, when given, is applied to each
    caption before it is wrapped. ``self.max_seq_len`` is the length of
    the longest (transformed) caption plus two.
    """
    from pycocotools.coco import COCO

    coco = COCO(annFile)
    field = [("text", text_field)]

    examples = []
    longest = 0
    for image_id in list(coco.imgs.keys()):
        annotations = coco.loadAnns(coco.getAnnIds(imgIds=image_id))
        for annotation in annotations:
            caption = annotation['caption']
            if transform is not None:
                caption = transform(caption)
            longest = max(longest, len(caption))
            examples.append(Example.fromlist([caption], field))

    self.max_seq_len = longest + 2  # one for <sos> and one for <eos>
    super().__init__(examples=examples, fields=field)
def _load_examples(self, dev_splits, test_splits):
    """Parse the robust04 TSV file and sort its rows into the three splits.

    Each row is converted in place: column 0 goes through
    ``binary_one_hot``, column 1 is parsed with ``ast.literal_eval``,
    column 2 becomes an ``int`` and column 3 is replaced by a fresh integer
    index whose original value is recorded in ``self.doc_id_map``. The row
    is wrapped with ``Example.fromlist`` and appended to
    ``self.dev_examples``, ``self.test_examples`` or
    ``self.train_examples`` depending on whether column 2 is found in
    ``dev_splits`` or ``test_splits``.

    Args:
        dev_splits: container of column-2 values that go to the dev set.
        test_splits: container of column-2 values that go to the test set.
    """
    dataset_path = os.path.join(DATASET_DIR, 'robust04',
                                'robust04.logits_bert_msmarco_mb.tsv')
    with open(dataset_path, 'r') as dataset_tsv:
        for line in dataset_tsv:
            # Drop the line terminator first; otherwise the last column
            # (and hence the value stored in doc_id_map when it is the
            # document id) would keep a trailing "\n".
            data_row = line.rstrip('\n').split('\t')
            data_row[0] = binary_one_hot(data_row[0])  # Convert the label to one-hot
            data_row[1] = ast.literal_eval(data_row[1])  # Convert the logits to a float list
            data_row[2] = int(data_row[2])  # Convert the query id to an integer

            # Convert the document id to an integer
            doc_id_index = len(self.doc_id_map)
            self.doc_id_map[doc_id_index] = data_row[3]
            data_row[3] = doc_id_index

            example = Example.fromlist(data_row, self.fields)
            if data_row[2] in dev_splits:
                self.dev_examples.append(example)
            elif data_row[2] in test_splits:
                self.test_examples.append(example)
            else:
                self.train_examples.append(example)
def __init__(self, root_path, img_dir, filename, fields, train, **kwargs):
    """Load pickled caption data and wrap every caption as an example.

    Args:
        root_path: directory containing ``filename`` and ``img_dir``.
        img_dir: sub-directory joined into each image path.
        filename: pickle file with ``'captions'`` and ``'features'`` keys.
        fields: dict passed to ``Example.fromdict``; ``fields['caption'][1]``
            is stored as ``self.cap_field``.
        train: stored as ``self.train``; together with ``rand_crop`` it
            selects the image transform.
        **kwargs: only ``rand_crop`` is read.
    """
    # NOTE(review): pickle.load can execute arbitrary code; the file must
    # come from a trusted source.
    with open(os.path.join(root_path, filename), 'rb') as f:
        data = pickle.load(f)

    rand_crop = kwargs.get('rand_crop', False)
    if train and rand_crop:
        self.img_transform = preprocess_rc
    else:
        self.img_transform = preprocess_1c
    self.train = train
    self.cap_field = fields['caption'][1]

    def as_record(position, entry):
        # The image file path is only needed when random cropping is on.
        image_id = entry['image_id']
        relative_path = entry['image_path']
        to_load = None
        if rand_crop:
            to_load = os.path.join(root_path, img_dir, relative_path)
        return {
            'image_id': image_id,
            'img_to_load': to_load,
            'img_1c_feat': torch.Tensor(data['features'][image_id]),
            'caption': entry['caption'],
            'caption_id': position,
        }

    records = [as_record(cnt, ex) for cnt, ex in enumerate(data['captions'])]
    examples = [Example.fromdict(record, fields) for record in records]
    super(ImageCaptionDataset, self).__init__(examples, fields.values())
def init_dataloaders(self):
    """Create train/validation datasets, the vocabulary and both iterators.

    Reads the two line-aligned text files named by ``config.data.domain_x``
    and ``config.data.domain_y`` (relative to the project path), pairs
    their lines into examples sharing one text field, splits them with
    ``train_test_split`` and stores the results on ``self.train_ds``,
    ``self.val_ds``, ``self.vocab``, ``self.train_dataloader`` and
    ``self.val_dataloader``.
    """
    config = self.config
    batch_size = config.hp.batch_size
    project_path = config.firelab.project_path
    domain_x_data_path = os.path.join(project_path, config.data.domain_x)
    domain_y_data_path = os.path.join(project_path, config.data.domain_y)

    def read_lines(path):
        with open(path) as f:
            return f.read().splitlines()

    domain_x = read_lines(domain_x_data_path)
    domain_y = read_lines(domain_y_data_path)

    text = Field(init_token='<bos>', eos_token='|', batch_first=True,
                 tokenize=char_tokenize)
    fields = [('domain_x', text), ('domain_y', text)]

    examples = []
    for x_line, y_line in zip(domain_x, domain_y):
        examples.append(Example.fromlist([x_line, y_line], fields))

    train_exs, val_exs = train_test_split(
        examples,
        test_size=config.val_set_size,
        random_state=config.random_seed)

    self.train_ds = Dataset(train_exs, fields)
    self.val_ds = Dataset(val_exs, fields)

    # The vocabulary is built from the training split only.
    text.build_vocab(self.train_ds,
                     max_size=config.hp.get('max_vocab_size'))
    self.vocab = text.vocab

    self.train_dataloader = data.BucketIterator(
        self.train_ds, batch_size, repeat=False)
    self.val_dataloader = data.BucketIterator(
        self.val_ds, batch_size, repeat=False, shuffle=False)
def __init__(self, fields, paths, filter_pred=None, lang_src=False,
             high_oversampling=1, low_oversampling=1):
    """Read tab-separated inflection data from one or more files.

    Each line holds either ``src, tgt, inflection`` or ``src, inflection``.
    When a two-column line is seen, the ``"tgt"`` entry is removed from the
    caller's ``fields`` dict. If ``fields`` has an ``"inflection"`` entry
    the source and inflection are stored separately; otherwise the
    inflection tags and the space-separated source characters are merged
    into a single ``src`` string (optionally prefixed by the language when
    ``lang_src`` is set). Each example is repeated ``low_oversampling``
    times when ``data_setting(path)`` is ``"low"`` and
    ``high_oversampling`` times otherwise.
    """
    if isinstance(paths, str):
        paths = [paths]

    examples = []
    for path in paths:
        with open(path) as f:
            language = None
            if 'language' in fields:
                language = lang_name(path)
            setting = data_setting(path)
            copies = low_oversampling if setting == "low" else high_oversampling

            for line in f:
                ex_dict = {}
                if language is not None:
                    ex_dict["language"] = language

                columns = line.strip().split('\t')
                if len(columns) == 3:
                    src, tgt, inflection = columns
                    ex_dict['tgt'] = tgt
                else:
                    src, inflection = columns
                    # No target column: stop expecting one from here on.
                    fields.pop("tgt", None)

                if "inflection" in fields:
                    ex_dict["src"] = src
                    ex_dict["inflection"] = inflection
                else:
                    pieces = []
                    if language is not None and lang_src:
                        pieces.append(language)
                    # Tags become space separated; literal spaces in the
                    # source are made visible as "<space>".
                    pieces.append(inflection.replace(";", " "))
                    pieces.append(" ".join(
                        "<space>" if c == " " else c for c in src))
                    ex_dict["src"] = " ".join(pieces)

                ex = Example.fromdict(ex_dict, fields)
                examples.extend([ex] * copies)

    fields = dict(chain.from_iterable(fields.values()))
    super(SigmorphonDataset, self).__init__(examples, fields, filter_pred)
def __init__(self, path, model, train_frac=1.0, encoding="utf-8"):
    """Build a span-classification dataset from a JSON-lines file.

    Every line of ``path`` is a JSON object with a ``"text"`` string and a
    ``"targets"`` list; each target contributes one example made of the
    tokenized text, the tokenized span indices, the original ``"span1"``
    value and the ``"label"``.

    Args:
        path: JSON-lines file to read.
        model: object providing ``tokenize(words, get_subword_indices=True)``
            and ``tokenizer.pad_token_id``.
        train_frac: when below 1.0 and ``self.check_for_train_file(path)``
            is true, only the leading fraction of lines is used.
        encoding: text encoding used to open ``path``.
    """
    text_field = Field(sequential=True, use_vocab=False,
                       include_lengths=True, batch_first=True,
                       pad_token=model.tokenizer.pad_token_id)
    fields = [
        ('text', text_field),
        ('span', Field(sequential=False, use_vocab=False, batch_first=True)),
        ('orig_span', Field(sequential=False, use_vocab=False,
                            batch_first=True)),
        ('label', Field(sequential=False, use_vocab=False,
                        batch_first=True)),
    ]

    # The context manager guarantees the handle is closed; previously the
    # file object was left open.
    with open(path, encoding=encoding) as f:
        lines = f.readlines()

    is_train = self.check_for_train_file(path)
    if is_train and train_frac < 1.0:
        red_num_lines = int(len(lines) * train_frac)
        lines = lines[:red_num_lines]

    examples = []
    for line in lines:
        instance = json.loads(line)
        text, subword_to_word_idx = model.tokenize(
            instance["text"].split(), get_subword_indices=True)

        for target in instance["targets"]:
            span_index = self.get_tokenized_span_indices(
                subword_to_word_idx, target["span1"])
            label = target["label"]
            examples.append(Example.fromlist(
                [text, span_index, target["span1"], label], fields))

    super(TaskDataset, self).__init__(examples, fields)
def json_to_dialogue_examples(path_dir: Path, *, fields: List[Tuple[str, Field]], utterance_key: str, role_key: str,
                              text_key: str, sort_key: str, max_sl: int = 1000,
                              target_roles: Optional[List[str]] = None) -> \
        Iterator[Example]:
    """Yield one example per utterance that has preceding context.

    Every ``*.json`` file in ``path_dir`` must contain a list of dicts such as
    [{batch_col: chat_id, utterance_col: [{text_col: message, role_col: role,
    sort_col: timestamp}]}].

    For each dialogue the utterances are sorted with ``sort_key`` (a dict key
    or a callable). Walking through them, the accumulated text and roles of
    all earlier utterances form the context and the current role-prefixed
    utterance forms the response. An example is yielded when the context is
    non-empty, the role is accepted by ``target_roles`` (``None`` accepts
    all) and every tokenized context utterance is shorter than ``max_sl``.
    Each example carries the per-utterance context lengths in ``sl``.

    Raises:
        ValueError: if ``sort_key`` is neither a string nor callable.
    """
    for file in path_dir.glob("*.json"):
        with file.open('r', encoding='utf-8') as fh:
            dialogues = json.load(fh)

        for dialogue in tqdm(dialogues, desc=f'processed file {file}'):
            if isinstance(sort_key, str):
                ordering = itemgetter(sort_key)
            elif callable(sort_key):
                ordering = sort_key
            else:
                raise ValueError("Invalid sort_key provided")

            tokenize = fields[0][1].tokenize
            context_text = ""
            context_roles = ""
            context_lengths = []

            for utterance in sorted(dialogue[utterance_key], key=ordering):
                message = utterance[text_key]
                if isinstance(message, list):
                    message = " ".join(message)
                role_tag = "__" + utterance[role_key] + "__"
                response = role_tag + " " + message

                has_context = context_text.strip() != ""
                if has_context and (target_roles is None
                                    or utterance[role_key] in target_roles):
                    example = Example.fromlist(
                        [context_text.strip(), context_roles.strip(), response],
                        fields)
                    example.sl = list(context_lengths)
                    # one length is expected per role tag in the context
                    assert len(context_lengths) == len(context_roles.split())
                    # contexts containing an overly long utterance are ignored
                    if max(example.sl) < max_sl:
                        yield example

                context_text += " " + response
                context_roles += " " + role_tag
                context_lengths.append(len(tokenize(response)))
def __init__(self, fields, readers, data, dirs, sort_key,
             filter_pred=None):
    """Assemble examples from several readers and initialise the dataset.

    The readers are advanced in lockstep; the dicts they yield are merged
    by ``_join_dicts``. When ``fields`` contains both ``'src_map'`` and
    ``'alignment'``, ``_dynamic_dict`` is applied to every merged dict and
    the per-example source vocabulary it returns is collected in
    ``self.src_vocabs``. The field list handed to the parent constructor is
    derived from the keys of the last example.

    NOTE: the post-loop code relies on the loop having run at least once;
    with no examples ``ex_fields`` is never bound.
    """
    self.sort_key = sort_key
    needs_copy_data = 'src_map' in fields and 'alignment' in fields

    streams = []
    for reader, datum, directory in zip(readers, data, dirs):
        streams.append(reader.read(datum[1], datum[0], directory))

    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for joined in starmap(_join_dicts, zip(*streams)):
        if needs_copy_data:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            vocab, joined = _dynamic_dict(
                joined, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(vocab)

        ex_fields = {}
        for name, field in fields.items():
            if name in joined:
                ex_fields[name] = [(name, field)]
        examples.append(Example.fromdict(joined, ex_fields))

    # fields needs to have only keys that examples have as attrs
    kept_fields = []
    for pairs in ex_fields.values():
        assert len(pairs) == 1
        kept_fields.append(pairs[0])

    super(Dataset, self).__init__(examples, kept_fields, filter_pred)
def extract_features(self, instance: Dict[str, object]) -> Example:
    """Turn one raw review instance into an ``Example``.

    The example's ``syllable_contents`` attribute holds either the
    whitespace-split words (when ``self.tokenizeword`` is truthy) or the
    individual characters of ``instance['review']``, truncated to
    ``self.MAX_LEN - 2`` items and wrapped in ``INIT_TOKEN``/``EOS_TOKEN``.
    Spaces are replaced by ``SPACE_TOKEN``.

    When ``instance`` has a ``'sentiment'`` entry, ``label`` is set to 1.0
    for an int >= 1 or for the strings ``'1.0'``/``'1'``, and to 0.0
    otherwise.

    Raises:
        KeyError, AttributeError, TypeError: if ``instance['review']`` is
            missing or not string-like. The instance is printed first.
            Previously a bare ``except`` swallowed the error and the
            method then failed with an unrelated ``UnboundLocalError``.
        Exception: if the sentiment is neither ``int`` nor ``str``.
    """
    limit = self.MAX_LEN - 2
    try:
        review = instance['review']
        words = [self.SPACE_TOKEN if x == ' ' else x
                 for x in review.split()[:limit]]
        syllables = [self.SPACE_TOKEN if x == ' ' else x
                     for x in review[:limit]]
    except (KeyError, AttributeError, TypeError):
        # Keep the diagnostic print, but surface the real error.
        print(instance)
        raise

    ex = Example()
    contents = words if self.tokenizeword else syllables
    setattr(ex, 'syllable_contents',
            [self.INIT_TOKEN] + contents + [self.EOS_TOKEN])

    if 'sentiment' in instance:
        label = instance['sentiment']
        # Exact type checks are kept so that bool/float labels are still
        # rejected as before.
        if type(label) is int:
            setattr(ex, 'label', 1. if label >= 1 else 0.)
        elif type(label) is str:
            setattr(ex, 'label',
                    1. if (label == '1.0' or label == '1') else 0.)
        else:
            raise Exception("yo label your y correctly...")
    return ex
def _create_sva_examples(
    sens: List[Sequence[str]], fields: List[Tuple[str, Field]]
) -> List[Example]:
    """Create one example per (correct, incorrect) sentence pair.

    Both sentences are split on whitespace. The verb position is the first
    index at which the two token lists differ; the example consists of the
    tokens before that position, the token of the first sentence at that
    position and the token of the second sentence at that position.
    """
    examples = []
    for correct, incorrect in sens:
        good_tokens = correct.split()
        bad_tokens = incorrect.split()

        # Index of the first differing token; falls back to the number of
        # compared pairs when no difference is found.
        verb_index = next(
            (position
             for position, (good, bad) in enumerate(zip(good_tokens, bad_tokens))
             if good != bad),
            min(len(good_tokens), len(bad_tokens)),
        )

        prefix = good_tokens[:verb_index]
        verb = good_tokens[verb_index]
        wrong_verb = bad_tokens[verb_index]
        examples.append(Example.fromlist([prefix, verb, wrong_verb], fields))

    return examples
def __init__(self, fields, path, filter_pred=None, lang_src=False):
    """Read tab-separated inflection data into merged-source examples.

    ``path`` may be one file name or a sequence of them. Non-blank lines
    hold ``src, trg, inflection`` or ``src, inflection``; seeing the
    two-column form removes ``"trg"`` from the caller's ``fields`` dict.
    The ``src`` value of each example is the space-joined inflection tags
    followed by the space-separated source characters, optionally preceded
    by the language when ``lang_src`` is set and ``fields`` has a
    ``'language'`` entry.
    """
    if isinstance(path, str):
        path = [path]

    examples = []
    for p in path:
        with open(p) as f:
            language = None
            if 'language' in fields:
                language = lang_name(p)

            for raw_line in f:
                stripped = raw_line.strip()
                if not stripped:
                    continue

                ex_dict = {}
                if language is not None:
                    ex_dict["language"] = language

                columns = stripped.strip().split('\t')
                if len(columns) == 3:
                    src, trg, inflection = columns
                    ex_dict['trg'] = trg
                else:
                    src, inflection = columns
                    fields.pop("trg", None)  # hmm

                # kludgey stuff for handling inflections
                pieces = []
                if language is not None and lang_src:
                    pieces.append(language)
                pieces.append(inflection.replace(";", " "))
                pieces.append(" ".join(
                    "<space>" if c == " " else c for c in src))
                ex_dict["src"] = " ".join(pieces)

                examples.append(Example.fromdict(ex_dict, fields))

    fields = dict(chain.from_iterable(fields.values()))
    super(SimpleSigmorphonDataset, self).__init__(
        examples, fields, filter_pred)
def __init__(self, fields, readers, data, dirs, sort_key,
             filter_pred=None, tgt_type=None):
    """Assemble keyphrase examples from several readers.

    The readers are advanced in lockstep and their dicts merged with
    ``_join_dicts``. ``_dynamic_dict`` is applied to every merged dict
    (copy data is always built here), so ``fields['src']`` and
    ``fields['tgt']`` must expose ``base_field``. The field list handed to
    the parent constructor is derived from the keys of the last example.

    Args:
        tgt_type: stored as ``self.tgt_type``. An inline note in the
            original lists one2one/original/random/verbatim and says it is
            specified before training; presumably it refers to this value.

    NOTE: with no examples ``ex_fields`` is never bound and the post-loop
    code fails.
    """
    # this is set at line 594 in inputter.py and line 303 in translator.py
    self.tgt_type = tgt_type
    # concatenate multiple tgt sequences with <sep> or keep them separate
    # as a list of seqs (2D tensor)
    self.concat_tgt = False
    self.sort_key = sort_key

    # src_map/alignment are built no matter whether the field is available
    read_iters = []
    for reader, datum, directory in zip(readers, data, dirs):
        read_iters.append(reader.read(datum[1], datum[0], directory))

    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for merged in starmap(_join_dicts, zip(*read_iters)):
        src_field = fields['src']
        tgt_field = fields['tgt']
        # this assumes src_field and tgt_field are both text
        src_ex_vocab, merged = _dynamic_dict(
            merged, src_field.base_field, tgt_field.base_field)
        self.src_vocabs.append(src_ex_vocab)

        ex_fields = {name: [(name, field)]
                     for name, field in fields.items() if name in merged}
        examples.append(Example.fromdict(merged, ex_fields))

    # fields needs to have only keys that examples have as attrs
    dataset_fields = []
    for name_field_pairs in ex_fields.values():
        assert len(name_field_pairs) == 1
        dataset_fields.append(name_field_pairs[0])

    super(KeyphraseDataset, self).__init__(
        examples, dataset_fields, filter_pred)
def __init__(self, path, text_field, newline_eos=True, encoding='utf-8',
             topk=float('inf'), **kwargs):
    """Create a single-example dataset holding the whole file as one stream.

    Args:
        path: text file to read.
        text_field: field whose ``preprocess`` is applied to every line.
        newline_eos: when true, ``'<eos>'`` is appended after each line.
        encoding: text encoding used to open ``path``.
        topk: reading stops once this many lines have been consumed (at
            least one line is always read if the file is non-empty).
        **kwargs: forwarded to the parent constructor.
    """
    fields = [('text', text_field)]
    tokens = []
    with open(path, encoding=encoding) as f:
        for lines_read, line in enumerate(f, start=1):
            tokens.extend(text_field.preprocess(line))
            if newline_eos:
                tokens.append(u'<eos>')
            if lines_read >= topk:
                break

    examples = [Example.fromlist([tokens], fields)]
    super(LanguageModelingDataset, self).__init__(
        examples, fields, **kwargs)
def parse_sentence(self, js, fields, amr):
    """Convert one JSON sentence into an ``Example``, or filter it out.

    ``fields`` maps fixed keys to ``(attribute_name, field)`` entries. The
    graph entry is looked up under ``"simple-parsing"`` when ``amr`` is
    truthy and under ``"combined-parsing"`` otherwise; its attribute
    receives the raw ``(adjpos, adjv)`` tuple, while every other attribute
    receives the field's ``preprocess`` output.

    Returns:
        The example, or ``None`` when ``self.keep_events`` is set and the
        sentence fails the event filter: with ``self.only_keep`` the
        sentence's ``containsEvents`` must equal ``keep_events``; without
        it, ``containsEvents`` must not be below ``keep_events``.
    """
    sent_id = fields["sentence_id"]
    words = fields["words"]
    pos_tags = fields["pos-tags"]
    entity_labels = fields["golden-entity-mentions"]
    graph_key = "simple-parsing" if amr else "combined-parsing"
    adjacency = fields[graph_key]
    trigger_labels = fields["golden-event-mentions"]
    events = fields["all-events"]
    entities = fields["all-entities"]

    sentence = Sentence_ace(json_content=js, graph_field_name=graph_key)
    ex = Example()

    def store(entry, value):
        # entry[0] is the attribute name, entry[1] the field object.
        setattr(ex, entry[0], entry[1].preprocess(value))

    store(sent_id, sentence.sentence_id)
    store(words, sentence.wordList)
    store(pos_tags, sentence.posLabelList)
    store(entity_labels, sentence.entityLabelList)
    setattr(ex, adjacency[0], (sentence.adjpos, sentence.adjv))
    store(trigger_labels, sentence.triggerLabelList)
    store(events, sentence.events)
    store(entities, sentence.entities)

    if self.keep_events is None:
        return ex

    if self.only_keep:
        rejected = sentence.containsEvents != self.keep_events
    else:
        rejected = sentence.containsEvents < self.keep_events
    return None if rejected else ex
def __init__(self, cache_path, fields, **kwargs):
    """Rebuild a translation dataset from an interleaved token cache file.

    The cache stores whitespace-separated tokens, with source sentences on
    even lines and the matching target sentences on odd lines. Token
    totals are printed using a language pair, dataset name and split that
    are parsed from the cache file name (underscore-separated parts 0, 2
    and 3).

    Args:
        cache_path: path to the cache file.
        fields: forwarded to the parent ``Dataset`` constructor.
        **kwargs: forwarded to the parent ``Dataset`` constructor.
    """
    # save_cache interleaves src and trg examples so here we read the cache
    # file having that format in mind. The context manager closes the
    # handle, which the previous inline open() never did.
    with open(cache_path, encoding="utf-8") as cache_file:
        cached_data = [line.split() for line in cache_file]

    cached_data_src = cached_data[0::2]  # Even lines contain source examples
    cached_data_trg = cached_data[1::2]  # Odd lines contain target examples

    assert len(cached_data_src) == len(cached_data_trg), \
        "Source and target data should be of the same length."

    examples = []
    src_dataset_total_number_of_tokens = 0
    trg_dataset_total_number_of_tokens = 0
    for src_tokenized_data, trg_tokenized_data in zip(cached_data_src,
                                                      cached_data_trg):
        ex = Example()
        setattr(ex, "src", src_tokenized_data)
        setattr(ex, "trg", trg_tokenized_data)
        examples.append(ex)

        # Update the number of tokens
        src_dataset_total_number_of_tokens += len(src_tokenized_data)
        trg_dataset_total_number_of_tokens += len(trg_tokenized_data)

    # Print relevant information about the dataset (parsing the cache file name)
    filename_parts = os.path.split(cache_path)[1].split("_")
    src_language, trg_language = \
        ("English", "German") if filename_parts[0] == "en" else ("German", "English")
    dataset_name = "IWSLT" if filename_parts[2] == "iwslt" else "WMT-14"
    dataset_type = "train" if filename_parts[3] == "train" else "val"
    print(
        f"{dataset_type} dataset ({dataset_name}) has {src_dataset_total_number_of_tokens} tokens in the source language ({src_language}) corpus."
    )
    print(
        f"{dataset_type} dataset ({dataset_name}) has {trg_dataset_total_number_of_tokens} tokens in the target language ({trg_language}) corpus."
    )

    # Call the parent class Dataset's constructor
    super().__init__(examples, fields, **kwargs)
def convert_to_dataset(data, kor, eng):
    """
    Clean a parallel-corpus DataFrame and wrap it as a torchtext Dataset.

    Rows whose ``korean`` or ``english`` value is not exactly of type
    ``str`` are dropped; every remaining cell is passed through
    ``clean_text`` before the row is turned into an ``Example``.

    Args:
        data: (DataFrame) pandas DataFrame to be converted
        kor: torchtext Field used for the 'kor' attribute
        eng: torchtext Field used for the 'eng' attribute

    Returns:
        (Dataset) torchtext Dataset containing 'kor' and 'eng' Fields
    """
    fields = [('kor', kor), ('eng', eng)]

    # collect the index labels of rows that miss a proper str value
    missing_rows = []
    for idx, row in data.iterrows():
        if not (type(row.korean) is str and type(row.english) is str):
            missing_rows.append(idx)
    data = data.drop(missing_rows)

    # one Example per surviving row, built from the cleaned cell values
    list_of_examples = []
    for _, row in data.iterrows():
        cleaned = row.apply(lambda x: clean_text(x)).tolist()
        list_of_examples.append(Example.fromlist(cleaned, fields=fields))

    return Dataset(examples=list_of_examples, fields=fields)
def predict(model, field, sentence_beginnings: List[str], max_len=50,
            batch_size=100) -> List[str]:
    """Continue each sentence beginning with the model and return the texts.

    The beginnings are batched with a ``BucketIterator``; for every batch
    the model is run on all but the last input position to obtain its
    state, which seeds an ``InferenceState`` run. The generated sequences
    are converted back to strings with ``itos_many`` (special tokens
    removed) and collected in iteration order of the batches.
    """
    text_fields = [('text', field)]
    examples = [Example.fromlist([beginning], text_fields)
                for beginning in sentence_beginnings]
    dataset = Dataset(examples, text_fields)

    results = []
    for batch in BucketIterator(dataset, batch_size, repeat=False):
        prefix = batch.text.to(DEVICE)
        _, prefix_state = model(prefix[:, :-1], return_state=True)

        settings = {
            "model": model.cached_forward,
            "inputs": prefix_state,
            "vocab": field.vocab,
            "device": DEVICE,
            "max_len": max_len,
            "is_inputs_update_enabled": True,
            "inputs_batch_dim": 1,
            "active_seqs": prefix,
            "sample_from_top": 0.5,
            "temperature": 0.2,
        }
        generated = InferenceState(settings).inference()
        token_ids = [seq.cpu().numpy().tolist() for seq in generated]
        results.extend(
            itos_many(token_ids, field.vocab, remove_special=True))

    return results
def __init__(self, fields, path, filter_pred=None,
             columns=("src", "trg"), label_columns=()):
    """Read examples from one or more tab-separated files.

    ``path`` is either a glob pattern (string) or a list of file names;
    the resulting list is sorted in place. Every non-blank line must have
    exactly one value per entry of ``columns``.

    Note that tsv does not currently allow missing columns (such as when
    translating a file with no trg specified)
    """
    self._columns = columns
    self._src_columns = [
        name for name in columns
        if name != "trg" and name not in label_columns
    ]
    self.label_columns = label_columns

    fields = {name: [(name, field)] for name, field in fields.items()}

    if isinstance(path, str):
        paths = glob(path)
    else:
        paths = path
    assert len(paths) > 0
    paths.sort()

    examples = []
    for file_name in paths:
        with open(file_name) as f:
            for raw_line in f:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                values = stripped.strip().split('\t')
                assert len(values) == len(columns), \
                    "Wrong number of columns"
                ex_dict = dict(zip(columns, values))
                examples.append(Example.fromdict(ex_dict, fields))

    fields = dict(chain.from_iterable(fields.values()))
    super(TSVDataset, self).__init__(examples, fields, filter_pred)
def __init__(self, xmlfile, model, encoding="utf-8"):
    """Build a COPA dataset from an XML file of ``<item>`` elements.

    Each item has a premise (``<p>``) and two alternatives (``<a1>``,
    ``<a2>``). Two examples are produced per item, one per alternative,
    with label 1 for the alternative named by the item's
    ``most-plausible-alternative`` attribute and 0 for the other. The
    ``asks-for`` attribute is encoded as 0 for ``cause`` and 1 for
    ``effect``. ``encoding`` is accepted but not used by this method.
    """
    text_field = Field(sequential=True, use_vocab=False,
                       include_lengths=True, batch_first=True,
                       pad_token=model.tokenizer.pad_token_id)
    non_seq_field = Field(sequential=False, use_vocab=False,
                          batch_first=True)
    fields = [
        ('ID', non_seq_field),
        ('event', text_field),
        ('type_event', non_seq_field),
        ('hyp_event', text_field),
        ('hyp_event_ID', non_seq_field),
        ('label', non_seq_field),
    ]

    examples = []
    root = ET.parse(xmlfile).getroot()
    for item in root.findall('item'):
        item_id = item.attrib['id']
        asks_for = item.attrib['asks-for']
        assert(asks_for in ['cause', 'effect'])
        event_type = 0 if asks_for == "cause" else 1
        # the attribute is 1-based; alternatives are indexed from 0 below
        hyp_label = int(item.attrib['most-plausible-alternative']) - 1

        # Every child is tokenized; only the three known tags are kept.
        parts = {"p": None, "a1": None, "a2": None}
        for child in item:
            tokenized_text = model.tokenize(child.text.lower().split())
            if child.tag in parts:
                parts[child.tag] = tokenized_text

        event = parts["p"]
        for idx, hyp_event in enumerate([parts["a1"], parts["a2"]]):
            row = [item_id, event, event_type, hyp_event, idx,
                   int(hyp_label == idx)]
            examples.append(Example.fromlist(row, fields))

    super(COPADataset, self).__init__(examples, fields)
def __init__(self, path, **kwargs):
    """
    Create a Semeval dataset instance from a JSON-lines file.

    Every line supplies ``qid``, ``qaid``, ``qarel``, ``question`` and
    ``qaquestion``; the two questions are stored both as given and joined
    with spaces. The ``ext_feats`` value of every example is an empty list.
    """
    fields = [
        ('qid', self.QID_FIELD),
        ('qaid', self.QID_FIELD),
        ('label', self.LABEL_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
    ]

    examples = []
    with open(path) as infile:
        for raw_line in infile:
            content = json.loads(raw_line)
            question = content['question']
            related_question = content['qaquestion']

            word_to_doc_cnt = get_pairwise_word_to_doc_freq(
                question, related_question)
            # NOTE(review): the overlap features are computed and then
            # immediately replaced by an empty list; confirm whether this
            # is an intentional switch-off.
            overlap_feats = get_pairwise_overlap_features(
                question, related_question, word_to_doc_cnt)
            overlap_feats = []

            examples.append(Example.fromlist([
                content['qid'],
                content['qaid'],
                content['qarel'],
                question,
                related_question,
                ' '.join(question),
                ' '.join(related_question),
                overlap_feats,
            ], fields))

    super(Semeval, self).__init__(examples, fields, **kwargs)
def greedy_decoding(model: nn.Module, input, fields, maxLen=20):
    """Greedily decode a translation for a single input sentence.

    Args:
        model: module exposing ``encoder(src)`` -> ``(enc_out, hidden)``
            and ``decoder(tgt, hidden, enc_out)`` -> ``(scores, hidden)``.
        input: raw source sentence handed to the source field.
        fields: sequence whose element 0 is the source field and element 1
            the target field.
        maxLen: maximum number of decoding steps.

    Returns:
        List of target-vocabulary tokens, without the ``'</s>'`` marker.

    The source and target fields are now taken from ``fields``; the
    previous code referenced ``src`` and ``tgt``, names that are not
    defined in this function. ``device`` is still read from the enclosing
    module scope.
    """
    src_field, tgt_field = fields[0], fields[1]

    ex = Example.fromlist([input], [('src', src_field)])
    src_tensor = src_field.numericalize([ex.src], device)
    # Decoding starts from the begin-of-sentence token.
    tgt_tensor = torch.tensor([[tgt_field.vocab.stoi['<s>']]], device=device)

    model.eval()
    dec_result = []
    with torch.no_grad():
        enc_out, hidden = model.encoder(src_tensor)
        for _ in range(maxLen):
            dec_step, hidden = model.decoder(tgt_tensor, hidden, enc_out)
            _, top_idx = torch.topk(dec_step, 1)
            token_id = top_idx.item()
            if tgt_field.vocab.itos[token_id] == '</s>':
                break
            dec_result.append(token_id)
            # Feed the prediction back in as the next decoder input.
            tgt_tensor = top_idx.view(1, 1)

    return [tgt_field.vocab.itos[w] for w in dec_result]
def test_single_gpu_batch_parse():
    """Check that ``batch_to_device`` moves supported batch shapes to cuda:0.

    Covers objects without tensors (returned equal to the input), a bare
    tensor, nested lists/dicts/tuples of tensors, namedtuples, a custom
    object with its own ``to()`` and a torchtext ``Batch``.
    """
    trainer = Trainer(gpus=1)
    float_type = 'torch.cuda.FloatTensor'
    long_type = 'torch.cuda.LongTensor'

    def to_gpu(obj):
        return trainer.accelerator.batch_to_device(obj, torch.device('cuda:0'))

    def assert_float_on_gpu0(tensor):
        assert tensor.device.index == 0 and tensor.type() == float_type

    # non-transferrable types
    for primitive in [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}]:
        assert to_gpu(primitive) == primitive

    # batch is just a tensor
    assert_float_on_gpu0(to_gpu(torch.rand(2, 3)))

    # tensor list
    moved = to_gpu([torch.rand(2, 3), torch.rand(2, 3)])
    assert_float_on_gpu0(moved[0])
    assert_float_on_gpu0(moved[1])

    # tensor list of lists
    moved = to_gpu([[torch.rand(2, 3), torch.rand(2, 3)]])
    assert_float_on_gpu0(moved[0][0])
    assert_float_on_gpu0(moved[0][1])

    # tensor dict
    moved = to_gpu([{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}])
    assert_float_on_gpu0(moved[0]['a'])
    assert_float_on_gpu0(moved[0]['b'])

    # tuple of tensor list and list of tensor dict
    moved = to_gpu((
        [torch.rand(2, 3) for _ in range(2)],
        [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)],
    ))
    assert_float_on_gpu0(moved[0][0])
    assert_float_on_gpu0(moved[1][0]['a'])
    assert_float_on_gpu0(moved[1][0]['b'])

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    moved = to_gpu([BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3))
                    for _ in range(2)])
    assert_float_on_gpu0(moved[0].a)

    # non-Tensor that has `.to()` defined
    class CustomBatchType:

        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    moved = to_gpu(CustomBatchType())
    assert moved.a.type() == float_type

    # torchtext.data.Batch
    samples = [
        {'text': 'PyTorch Lightning is awesome!', 'label': 0},
        {'text': 'Please make it work with torchtext', 'label': 1},
    ]
    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process() that numericalizes tokens, but it requires
    # to build dictionary first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    moved = to_gpu(Batch(data=examples, dataset=dataset))
    assert moved.text.type() == long_type
    assert moved.label.type() == long_type
def text_to_example(x: Tuple[str]) -> Example:
    """Wrap the values in ``x`` as an ``Example`` using the enclosing ``fields``."""
    example = Example.fromlist(x, fields)
    return example
def _make_torchtext_dataset(self, data, fields):
    """Wrap every row of ``data`` as an ``Example`` and return a ``Dataset``."""
    examples = []
    for row in data:
        examples.append(Example.fromlist(row, fields))
    return Dataset(examples, fields)
def predict(sentences: List[str], n_lines: int, temperature: float = None,
            max_len: int = None):
    """For each sentence generate `n_lines` lines sequentially to form a dialog.

    In every round the current dialogs are batched into a single batch, the
    language model state is computed from the last ``MAX_CONTEXT_SIZE``
    positions (how the state is obtained depends on ``model_cls_name``),
    one new line per dialog is sampled and appended after an
    ``EOS_TOKEN``. Finally each dialog is split on ``EOS_TOKEN``, empty
    lines are dropped and ``assign_speakers`` is applied.

    ``temperature`` and ``max_len`` fall back to ``DEFAULT_TEMPERATURE``
    and ``DEFAULT_MAX_LINE_LEN`` when falsy.
    """
    dialogs = list(sentences)  # copy, so the caller's list is not mutated
    batch_size = len(dialogs)
    temperature = temperature or DEFAULT_TEMPERATURE
    max_len = max_len or DEFAULT_MAX_LINE_LEN
    is_ensemble = model_cls_name == 'WeightedLMEnsemble'

    def encode_context(text):
        # Run the language model over the context to get its state.
        if model_cls_name == 'CharLMFromEmbs':
            return lm(lm.init_z(text.size(0), 1), text, return_z=True)[1]
        if model_cls_name == 'ConditionalLM':
            start = cudable(torch.zeros(2, len(text), 2048))
            return lm(start, text, style=1, return_z=True)[1]
        if model_cls_name == 'WeightedLMEnsemble':
            start = cudable(torch.zeros(2, 1, len(text), 4096))
            return lm(start, text, return_z=True)[1]
        return lm.gru(lm.embed(text))[1]

    text_fields = [('text', field)]
    for _ in range(n_lines):
        # NOTE(review): each dialog is a str at this point, so
        # EOS_TOKEN.join(d) puts the token between its characters; confirm
        # that this is intended.
        examples = [Example.fromlist([EOS_TOKEN.join(d)], text_fields)
                    for d in dialogs]
        dataset = Dataset(examples, text_fields)
        dataloader = data.BucketIterator(dataset, batch_size,
                                         shuffle=False, repeat=False)
        batch = next(iter(dataloader))  # We have a single batch
        # As we made pad_first we are not afraid of losing information
        text = cudable(batch.text[:, -MAX_CONTEXT_SIZE:])

        z = encode_context(text)

        next_lines = InferenceState({
            'model': lm,
            'inputs': z,
            'vocab': field.vocab,
            'max_len': max_len,
            'bos_token': EOS_TOKEN,  # We start infering a new reply when we see EOS
            'eos_token': EOS_TOKEN,
            'temperature': temperature,
            'sample_type': 'sample',
            'inputs_batch_dim': 2 if is_ensemble else 1,
            'substitute_inputs': True,
            'kwargs': inference_kwargs
        }).inference()

        next_lines = itos_many(next_lines, field.vocab, sep='')
        next_lines = [slice_unfinished_sentence(l) for l in next_lines]
        dialogs = [d + EOS_TOKEN + l for d, l in zip(dialogs, next_lines)]

    split_dialogs = [d.split(EOS_TOKEN) for d in dialogs]
    non_empty = [[s for s in d if s] for d in split_dialogs]
    return [assign_speakers(d) for d in non_empty]
def __call__(self, args):
    """Turn an ``(index, document)`` pair into an ``Example`` using ``self.fields``."""
    index, document = args
    return Example.fromlist([index, document], self.fields)
def translate_a_single_sentence(translation_config):
    """Translate ``translation_config['source_sentence']`` and print the result.

    Keys read from ``translation_config``: ``dataset_path``,
    ``language_direction``, ``dataset_name``, ``model_name``,
    ``source_sentence``, ``decoding_method`` and ``visualize_attention``.
    The model checkpoint is looked up under ``BINARIES_PATH`` and a
    download is attempted when it is missing.

    The checkpoint is now loaded with ``map_location=device`` so that
    weights saved on a GPU can also be restored on a CPU-only machine,
    matching the CPU fallback used when choosing ``device``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU

    # Step 1: Prepare the field processor (tokenizer, numericalizer)
    _, _, src_field_processor, trg_field_processor = get_datasets_and_vocabs(
        translation_config['dataset_path'],
        translation_config['language_direction'],
        translation_config['dataset_name'] == DatasetType.IWSLT.name)
    assert src_field_processor.vocab.stoi[PAD_TOKEN] == trg_field_processor.vocab.stoi[PAD_TOKEN]
    pad_token_id = src_field_processor.vocab.stoi[PAD_TOKEN]  # needed for constructing masks

    # Step 2: Prepare the model
    baseline_transformer = Transformer(
        model_dimension=BASELINE_MODEL_DIMENSION,
        src_vocab_size=len(src_field_processor.vocab),
        trg_vocab_size=len(trg_field_processor.vocab),
        number_of_heads=BASELINE_MODEL_NUMBER_OF_HEADS,
        number_of_layers=BASELINE_MODEL_NUMBER_OF_LAYERS,
        dropout_probability=BASELINE_MODEL_DROPOUT_PROB,
        log_attention_weights=True).to(device)

    model_path = os.path.join(BINARIES_PATH, translation_config['model_name'])
    if not os.path.exists(model_path):
        print(f'Model {model_path} does not exist, attempting to download.')
        model_path = download_models(translation_config)

    # Remap stored tensors onto the device chosen above.
    model_state = torch.load(model_path, map_location=device)
    print_model_metadata(model_state)
    baseline_transformer.load_state_dict(model_state["state_dict"], strict=True)
    baseline_transformer.eval()

    # Step 3: Prepare the input sentence
    source_sentence = translation_config['source_sentence']
    ex = Example.fromlist([source_sentence],
                          fields=[('src', src_field_processor)])  # tokenize the sentence
    source_sentence_tokens = ex.src
    print(f'Source sentence tokens = {source_sentence_tokens}')

    # Numericalize and move the tensor to the selected device
    src_token_ids_batch = src_field_processor.process([source_sentence_tokens], device)

    with torch.no_grad():
        # Step 4: Optimization - compute the source token representations only once
        src_mask, _ = get_masks_and_count_tokens_src(src_token_ids_batch, pad_token_id)
        src_representations_batch = baseline_transformer.encode(src_token_ids_batch, src_mask)

        # Step 5: Decoding process
        if translation_config['decoding_method'] == DecodingMethod.GREEDY:
            target_sentence_tokens = greedy_decoding(
                baseline_transformer, src_representations_batch, src_mask,
                trg_field_processor)
        else:
            beam_decoding = get_beam_decoder(translation_config)
            target_sentence_tokens = beam_decoding(
                baseline_transformer, src_representations_batch, src_mask,
                trg_field_processor)
        print(f'Translation | Target sentence tokens = {target_sentence_tokens}')

        # Step 6: Potentially visualize the encoder/decoder attention weights
        if translation_config['visualize_attention']:
            visualize_attention(baseline_transformer, source_sentence_tokens,
                                target_sentence_tokens)
def extract_features(self, instance: Dict[str, object]) -> Example:
    """Build an ``Example`` whose ``syllable_contents`` wraps the contents.

    The first ``self.MAX_LEN - 2`` items of ``instance['contents']`` are
    taken, spaces are replaced by ``self.SPACE_TOKEN`` and the result is
    enclosed in ``self.INIT_TOKEN`` and ``self.EOS_TOKEN``.
    """
    truncated = instance['contents'][:self.MAX_LEN - 2]
    syllables = []
    for item in truncated:
        syllables.append(self.SPACE_TOKEN if item == ' ' else item)

    ex = Example()
    ex.syllable_contents = [self.INIT_TOKEN] + syllables + [self.EOS_TOKEN]
    return ex
def __getitem__(self, i):
    """Return item ``i`` as an ``Example``, reading ``self.ds`` in chunks.

    A window of ``CACHE_SIZE`` rows starting at ``self.cache_idx`` is kept
    in ``self.cache``; when ``i`` falls outside of it, a new window
    starting at ``i`` is sliced from ``self.ds``.
    """
    outside_window = (i < self.cache_idx
                      or i >= self.cache_idx + self.CACHE_SIZE)
    if outside_window:
        self.cache_idx = i
        self.cache = self.ds[i:i + self.CACHE_SIZE]

    offset = i - self.cache_idx
    return Example.fromlist(self.cache[offset], self.fields)
def process(sample):
    """Convert ``sample`` into an ``Example``: its text twice, then its mapped label."""
    mapped_label = mapping[sample.label]
    values = [sample.text, sample.text, mapped_label]
    return Example.fromlist(values, fields)
def classify_from_file(self, file_name, batch_size: int = 5, delimiter: str = ",", quotechar: str = '"',
                       text_col_name: str = 'text'):
    """
    This method reads in a file, parses it into the correct format and classifies the contents of the file.
    Throws an error when the model is not trained.

    :param file_name: string specifying the location and name of the file that contains the data to classify
    :param delimiter: string specifying the delimiter used in the csv file
    :param quotechar: string specifying the quotechar used in the csv file
    :param text_col_name: string specifying the name of the column containing the mails in the csv file
    :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into \
        the model this can be set lower if memory issues occur
    :return: returns a dict-like mapping from each target name to a list of results, where the result indices \
        from the model have been converted back to the original class names
    """
    assert self.has_trained

    strings = pd.read_csv(file_name, sep=delimiter, quotechar=quotechar)[text_col_name].tolist()

    fields = [('text', self._TEXT)]
    list_of_examples = [Example.fromlist([string], fields) for string in strings]
    dataset = torchtext.data.Dataset(list_of_examples, fields)

    data = Iterator(dataset, batch_size=batch_size, device=torch.device("cpu"), sort=False,
                    sort_within_batch=False, repeat=False, shuffle=False)

    # Set the model to evaluation mode, important because of the Dropout Layers.
    # Done once up front rather than once per batch.
    self.model.to(self.device)
    self.model = self.model.eval()

    predictions = defaultdict(list)
    for item in data:
        # The dataset field is always named 'text', whatever the csv column
        # was called; looking it up by text_col_name failed for any other
        # column name.
        x = item.text
        outputs = self.model(x.to(self.device), tower=self.target_names_list)
        for i, target_name in enumerate(self.target_names_list):
            predictions[target_name].extend(
                outputs[i].detach().cpu().argmax(1).tolist())

    # Map predicted class indices back to their label names.
    results = defaultdict(list)
    for key, indices in predictions.items():
        results[key] = [self._label_names[key][i] for i in indices]
    return results