def __init__(self, path, text_field, label_field, samples=None, cap=None):
    """Load a DailyDialog-style JSON file into torchtext Examples.

    Args:
        path: path to a JSON file holding a list of example dicts.
        text_field: field for the 'text' key.
            NOTE(review): the body uses the module-level TEXT/LABEL
            globals instead of these parameters — confirm intended.
        label_field: field for the 'label' key (see note above).
        samples: optional iterable of indices selecting a data subset.
        cap: optional int capping the number of examples kept.

    Raises:
        TypeError: if ``cap`` is given but is not an int.
    """
    fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if samples:
        examples = [Example.fromdict(data[ix], fields) for ix in samples]
    else:
        examples = [Example.fromdict(d, fields) for d in data]
    if cap:
        if not isinstance(cap, int):
            # BUG FIX: the original `raise ("...")` tried to raise a bare
            # string, which itself raises "TypeError: exceptions must
            # derive from BaseException" with a confusing message.
            raise TypeError(
                "cap needs to be an instance of int, got {}".format(cap))
        if cap < len(examples):
            examples = examples[:cap]
    # Flatten the name->(name, field) dict into the list form the
    # torchtext Dataset constructor expects.
    if isinstance(fields, dict):
        fields, fields_dict = [], fields
        for field in fields_dict.values():
            if isinstance(field, list):
                fields.extend(field)
            else:
                fields.append(field)
    super(DailyDialog, self).__init__(examples, fields)
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Build a dataset from (reader, data, dir) triples, adding target-plan
    and segment-detail auxiliary fields.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
    """
    self.sort_key = sort_key
    # Copy attention needs both a src->id map and a tgt alignment field.
    can_copy = 'src_map' in fields and 'alignment' in fields
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        if "tgt" in ex_dict:
            # Derive the auxiliary target-plan field from the target side.
            _add_tgt_plan(ex_dict, fields['tgt'].base_field, "tgt_plan")
        # NOTE(review): original indentation was lost; assuming segment
        # details are computed for every example, not only when 'tgt'
        # exists — confirm against the project's version.
        _add_segment_details(ex_dict, fields['src'].base_field, "src",
                             "segment_lengths", "segment_count")
        ex_fields = {k: [(k, v)] for k, v in fields.items() if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def annotate(model, doc, inp_fields):
    """Run the relation model over all span pairs in ``doc`` and attach the
    predicted Relation objects to the document.

    Args:
        model: trained relation-extraction model (callable with keyword
            tensor arguments).
        doc: document supplying spans; predicted Relations are added to it
            via ``doc.add`` (side effect).
        inp_fields: dict of torchtext fields used to numericalize each
            example.
    """
    # Removed an unused `examples = []` accumulator from the original.
    for span1, span2, rel_type in span_pair_generator(doc):
        example = process_span_pair(span1, span2, doc, rel_type)
        if example is None:
            continue
        torch_ex = Example.fromdict(example, inp_fields)
        ex_tensors = {}
        for name in inp_fields:
            tgt_fields = inp_fields[name]
            if not isinstance(tgt_fields, list):
                tgt_fields = [tgt_fields]
            data = [getattr(torch_ex, name)]
            # BUG FIX: the inner loop previously rebound `name`, shadowing
            # the outer loop variable; use a distinct identifier.
            for tgt_name, field in tgt_fields:
                ex_tensors[tgt_name] = field.process(data)
        word, length = ex_tensors['text']
        mask = get_mask(word, ex_tensors['pos1'], ex_tensors['pos2'], length)
        logits = model(word=word,
                       chars=ex_tensors['chars'],
                       pos1=ex_tensors['pos1_rel'],
                       pos2=ex_tensors['pos2_rel'],
                       mask=mask)
        _, output = torch.max(logits, dim=1)
        output = [int(x.item()) for x in output]
        relation = Relation(source='nre',
                            label=Label(value=str(output[0])),
                            annotation_from=span1,
                            annotation_to=span2,
                            doc=doc)
        doc.add(relation)
def __init__(self, fields, path, filter_pred=None):
    """Read Sigmorphon tab-separated file(s) into a torchtext dataset.

    Args:
        fields: dict of field-name -> (name, Field). NOTE: mutated in
            place — 'trg' is popped when a line has no target column.
        path: a glob pattern (str) or an explicit list of file paths.
        filter_pred: optional predicate forwarded to the Dataset base.
    """
    paths = glob(path) if isinstance(path, str) else path
    assert len(paths) > 0
    paths.sort()  # deterministic file order
    examples = []
    for p in paths:
        with open(p) as f:
            # Only infer a language tag if the caller declared the field.
            language = lang_name(p) if 'language' in fields else None
            for line in f:
                line = line.strip()
                if line:
                    ex_dict = dict()
                    if language is not None:
                        ex_dict["language"] = language
                    # 3 columns: src, trg, inflection; 2 columns: no trg
                    # (e.g. unlabeled test data).
                    line_fields = line.strip().split('\t')
                    if len(line_fields) == 3:
                        src, trg, inflection = line_fields
                        ex_dict['trg'] = trg
                    else:
                        src, inflection = line_fields
                        # NOTE(review): mutates the caller's fields dict
                        # the first time a targetless line appears —
                        # confirm this is intended. # hmm
                        fields.pop("trg", None)
                    ex_dict["src"] = src
                    ex_dict["inflection"] = inflection
                    ex = Example.fromdict(ex_dict, fields)
                    examples.append(ex)
    # Flatten {name: (name, field)} into {name: field} for the base class.
    fields = dict(chain.from_iterable(fields.values()))
    super(SigmorphonDataset, self).__init__(examples, fields, filter_pred)
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Assemble question/answer/target examples, building the per-example
    copy vocabularies when copy attention is enabled.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
    """
    self.sort_key = sort_key
    # Copy mechanism is active only when both helper fields are declared.
    can_copy = 'src_map' in fields and 'alignment' in fields
    side_iters = [reader.read(datum[1], datum[0], side)
                  for reader, datum, side in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*side_iters)):
        if can_copy:
            # this assumes the ques/ans/tgt fields are all text fields
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict,
                fields['ques'].base_field,
                fields['ans'].base_field,
                fields['tgt'].base_field)
            self.src_vocabs.append(src_ex_vocab)
        ex_fields = {name: [(name, field)]
                     for name, field in fields.items() if name in ex_dict}
        examples.append(Example.fromdict(ex_dict, ex_fields))
    # Keep only (name, field) pairs that the examples carry as attributes.
    fields = []
    for name_field_pairs in ex_fields.values():
        assert len(name_field_pairs) == 1
        fields.append(name_field_pairs[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def __init__(self, fields, path, filter_pred=None, decompose=True):
    """Read Sigmorphon G2P tab-separated file(s) into a torchtext dataset.

    Args:
        fields: dict of field-name -> (name, Field). NOTE: mutated in
            place — 'trg' is popped when a line has no target column.
        path: a glob pattern (str) or an explicit list of file paths.
        filter_pred: optional predicate forwarded to the Dataset base.
        decompose: if True, NFD-normalize the source string.
    """
    paths = glob(path) if isinstance(path, str) else path
    assert len(paths) > 0
    paths.sort()  # deterministic file order
    examples = []
    for p in paths:
        with open(p) as f:
            # Only infer a language tag if the caller declared the field.
            language = g2p_lang_name(p) if 'language' in fields else None
            for line in f:
                line = line.strip()
                if line:
                    ex_dict = dict()
                    if language is not None:
                        ex_dict["language"] = language
                    line_fields = line.strip().split('\t')
                    assert 0 < len(line_fields) <= 2
                    src = line_fields[0]
                    if decompose:
                        # hard-coding the Korean decomposition
                        src = unicodedata.normalize("NFD", src)
                    ex_dict["src"] = src
                    if len(line_fields) == 2:
                        ex_dict['trg'] = line_fields[1]
                    else:
                        # NOTE(review): mutates the caller's fields dict —
                        # confirm this is intended.
                        fields.pop("trg", None)
                    ex = Example.fromdict(ex_dict, fields)
                    examples.append(ex)
    # Flatten {name: (name, field)} into {name: field} for the base class.
    fields = dict(chain.from_iterable(fields.values()))
    super(SigmorphonG2PDataset, self).__init__(examples, fields,
                                               filter_pred)
def __init__(self, root_path, img_dir, filename, fields, train, **kwargs):
    """Load pickled caption data and wrap each caption as an Example.

    Args:
        root_path: dataset root directory.
        img_dir: subdirectory (under root_path) holding raw images.
        filename: pickle file with 'captions' and 'features' entries.
        fields: dict of field-name -> (name, Field).
        train: whether this split is for training.
        **kwargs: recognizes 'rand_crop' — when truthy (and training),
            images are loaded from disk and randomly cropped.
    """
    with open(os.path.join(root_path, filename), 'rb') as f:
        data = pickle.load(f)
    rand_crop = bool(kwargs.get('rand_crop'))
    # Random-crop preprocessing only makes sense during training.
    self.img_transform = (preprocess_rc if (train and rand_crop)
                          else preprocess_1c)
    self.train = train
    self.cap_field = fields['caption'][1]
    raw_examples = []
    for idx, entry in enumerate(data['captions']):
        image_id = entry['image_id']
        raw_examples.append({
            'image_id': image_id,
            # Only keep an on-disk path when we will actually re-crop.
            'img_to_load': (os.path.join(root_path, img_dir,
                                         entry['image_path'])
                            if rand_crop else None),
            'img_1c_feat': torch.Tensor(data['features'][image_id]),
            'caption': entry['caption'],
            'caption_id': idx,
        })
    examples = [Example.fromdict(d, fields) for d in raw_examples]
    super(ImageCaptionDataset, self).__init__(examples, fields.values())
def __init__(self, fields, src_examples_iter, tgt_examples_iter,
             filter_pred=None):
    """Build a dataset from parallel src/tgt example-dict iterators.

    Args:
        fields: dict mapping names to lists of (name, Field) pairs.
        src_examples_iter: iterator of source example dicts.
        tgt_examples_iter: iterator of target example dicts, or None
            (e.g. inference without references).
        filter_pred: optional predicate forwarded to the base class.
    """
    # Copy attention requires both the src_map and alignment fields.
    dynamic_dict = 'src_map' in fields and 'alignment' in fields
    if tgt_examples_iter is not None:
        examples_iter = (self._join_dicts(src, tgt) for src, tgt in
                         zip(src_examples_iter, tgt_examples_iter))
    else:
        examples_iter = src_examples_iter
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in examples_iter:
        if dynamic_dict:
            # [0][1] unwraps the Field from the first (name, Field) pair.
            src_field = fields['src'][0][1]
            tgt_field = fields['tgt'][0][1]
            src_vocab, ex_dict = self._dynamic_dict(
                ex_dict, src_field, tgt_field)
            self.src_vocabs.append(src_vocab)
        ex_fields = {k: v for k, v in fields.items() if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # the dataset's self.fields should have the same attributes as examples
    fields = dict(chain.from_iterable(ex_fields.values()))
    super(DatasetBase, self).__init__(examples, fields, filter_pred)
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Build the dataset, applying ``filter_pred`` eagerly so that
    ``self.src_vocabs`` stays aligned with the surviving examples.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate; applied here AND forwarded to
            the base class.
    """
    self.sort_key = sort_key
    can_copy = 'src_map' in fields and 'alignment' in fields
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        src_ex_vocab = None
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
        ex_fields = {k: [(k, v)] for k, v in fields.items()
                     if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        # Filter before appending so the copy vocab is only stored for
        # examples that are kept (keeps the two lists in lockstep).
        if (filter_pred is not None and filter_pred(ex)) or \
                (filter_pred is None):
            examples.append(ex)
            if can_copy:
                self.src_vocabs.append(src_ex_vocab)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    assert len(examples) == len(self.src_vocabs), \
        "example {}, src_vocabs {}".format(len(examples),
                                           len(self.src_vocabs))
    # NOTE(review): filter_pred is also passed to the base class, which
    # may filter a second time — confirm the double filtering is intended.
    super(Dataset, self).__init__(examples, fields, filter_pred)
def from_raw(cls, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Alternate constructor: read raw inputs and build the dataset.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: must be None; filtering at this stage would
            desynchronize examples from their per-example copy vocabs.

    Raises:
        ValueError: if ``filter_pred`` is not None.
    """
    # BUG FIX: this check was a bare `assert`, which is silently stripped
    # under `python -O`; validate explicitly instead.
    if filter_pred is not None:
        raise ValueError(
            'filter_pred must be None: filtering here corrupts the data')
    can_copy = 'src_map' in fields and 'alignment' in fields
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # src_vocabs is used in collapse_copy_scores and Translator.py
    src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            src_vocabs.append(src_ex_vocab)
        ex_fields = {k: [(k, v)] for k, v in fields.items() if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    return cls(examples, fields, sort_key, src_vocabs)
def __init__(self, fields, src_data, tgt_data, filter_func=None,
             sort_key=None):
    """Pair gloss/translation lines into indexed torchtext examples.

    Args:
        fields: dict of field-name -> Field (wrapped internally into the
            [(name, field)] form Example.fromdict expects).
        src_data: iterable of source (gloss) lines.
        tgt_data: iterable of target (translation) lines.
        filter_func: optional predicate forwarded to the base class.
        sort_key: callable used by the bucketing iterator.
    """
    self.sort_key = sort_key
    ex_fields = {name: [(name, field)] for name, field in fields.items()}
    self.src_vocabs = []
    examples = []
    for idx, (src_line, tgt_line) in enumerate(zip(src_data, tgt_data)):
        # Strip embedded newlines so each side is a single clean sequence.
        ex_dict = {
            "indices": idx,
            "src": src_line.replace("\n", ""),
            "tgt": tgt_line.replace("\n", ""),
        }
        examples.append(Example.fromdict(ex_dict, ex_fields))
    dataset_fields = [pairs[0] for pairs in ex_fields.values()]
    super(TextDataset, self).__init__(examples, dataset_fields, filter_func)
def __init__(self, fields, data, sort_key, filter_pred=None):
    """Build a dataset directly from an iterable of example dicts.

    Args:
        fields: dict mapping field names to torchtext Fields.
        data: iterable of already-joined example dicts (no readers).
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
    """
    self.sort_key = sort_key
    can_copy = 'src_map' in fields and 'alignment' in fields
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in data:
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            # NOTE: this _dynamic_dict variant returns the augmented dict
            # with the copy vocab stored under the "src_ex_vocab" key.
            ex_dict = _dynamic_dict(ex_dict, src_field.base_field,
                                    tgt_field.base_field)
            self.src_vocabs.append(ex_dict["src_ex_vocab"])
        ex_fields = {k: [(k, v)] for k, v in fields.items()
                     if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Read each (reader, data, dir) triple, join the per-side dicts into
    example dicts, and build torchtext Examples (with per-example copy
    vocabularies when copy attention is enabled).

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
    """
    self.sort_key = sort_key
    # Copy attention is active only when both helper fields are declared.
    can_copy = 'src_map' in fields and 'alignment' in fields
    side_iters = [reader.read(datum[1], datum[0], side)
                  for reader, datum, side in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*side_iters)):
        if can_copy:
            # this assumes src and tgt are both text fields
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict,
                fields['src'].base_field,
                fields['tgt'].base_field)
            self.src_vocabs.append(src_ex_vocab)
        ex_fields = {name: [(name, field)]
                     for name, field in fields.items() if name in ex_dict}
        examples.append(Example.fromdict(ex_dict, ex_fields))
    # Keep only (name, field) pairs that the examples carry as attributes.
    fields = []
    for pair_list in ex_fields.values():
        assert len(pair_list) == 1
        fields.append(pair_list[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def make_examples(source, cvt):
    """Convert one MRC source record into a list of per-passage Examples.

    Args:
        source: record with 'documents', 'question', 'question_id',
            'question_type' and 'answers' entries.
        cvt: converter exposing ``convert(question, passage, ...)`` that
            returns tokenized 'input', 'att_mask' and 'seg' entries.

    Returns:
        A list of torchtext Examples, one per selected passage (capped at
        ``args.max_para_num``).
    """
    assert len(source['documents']) > 0
    question = source['question'].strip()
    # One passage per document: the paragraph marked most related.
    passages = [doc['paragraphs'][doc['most_related_para']]
                for doc in source['documents']]
    field_map = {t[0]: t for t in FIELDS}
    ret = []
    for passage in passages[:args.max_para_num]:
        sample = cvt.convert(question, passage, args.max_query_length,
                             args.max_seq_length, to_tensor=False)
        example = {
            'question_id': source['question_id'],
            'question_text': question,
            'question_type': source['question_type'],
            'passage': passage,
            'answers': source['answers'],
            'input_ids': sample['input'],
            'input_mask': sample['att_mask'],
            'segment_ids': sample['seg'],
        }
        ret.append(Example.fromdict(example, field_map))
    return ret
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Build the dataset from (reader, data, dir) triples.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
    """
    self.sort_key = sort_key
    # Copy attention needs both a source map and an alignment field.
    can_copy = 'src_map' in fields and 'alignment' in fields
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        ex_fields = {k: [(k, v)] for k, v in fields.items()
                     if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def __init__(self, fields, readers, data, dirs, filter_pred=None):
    """Build the dataset (older fields layout: name -> [(name, Field)]).

    Args:
        fields: dict mapping names to lists of (name, Field) pairs.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        filter_pred: optional predicate forwarded to the base class.
    """
    # Copy attention requires both the src_map and alignment fields.
    dynamic_dict = 'src_map' in fields and 'alignment' in fields
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(self._join_dicts, zip(*read_iters)):
        if dynamic_dict:
            # [0][1] unwraps the Field from the first (name, Field) pair.
            src_field = fields['src'][0][1]
            tgt_field = fields['tgt'][0][1]
            # this assumes src_field and tgt_field are both text
            src_vocab, ex_dict = self._dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_vocab)
        ex_fields = {k: v for k, v in fields.items() if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # the dataset's self.fields should have the same attributes as examples
    fields = dict(chain.from_iterable(ex_fields.values()))
    super(DatasetBase, self).__init__(examples, fields, filter_pred)
def __init__(self, fields, readers, data, dirs, sort_key,
             filter_pred=None, pointers_file=None):
    """Build the dataset; the first two readers supply src/tgt, an
    optional third iterable supplies labels, and ``pointers_file``
    optionally provides per-example pointer strings for the dynamic dict.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: up to three readers (src, tgt, and optionally labels).
        data: (name, data) pairs; data[2][1] holds raw label values.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
        pointers_file: optional path; each line is the pointer string for
            the example at the same index.
    """
    self.sort_key = sort_key
    can_copy = 'src_map' in fields and 'alignment' in fields
    read_iters = [reader.read(datum[1], datum[0], side)
                  for reader, datum, side
                  in zip(readers[:2], data[:2], dirs[:2])]
    # for the label we can directly read the element
    if len(readers) == 3:
        read_iters += [[{'label': lab} for lab in data[2][1]]]
    if pointers_file is not None:
        with open(pointers_file) as f:
            pointers = [line.strip() for line in f]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for idx, ex_dict in enumerate(starmap(_join_dicts, zip(*read_iters))):
        if can_copy:
            ex_pointers = (pointers[idx] if pointers_file is not None
                           else None)
            # this assumes src and tgt are both text fields
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict,
                fields['src'].base_field,
                fields['tgt'].base_field,
                pointers=ex_pointers)
            self.src_vocabs.append(src_ex_vocab)
        ex_fields = {name: [(name, field)]
                     for name, field in fields.items() if name in ex_dict}
        examples.append(Example.fromdict(ex_dict, ex_fields))
    # fields needs to have only keys that examples have as attrs
    fields = []
    for pair_list in ex_fields.values():
        assert len(pair_list) == 1
        fields.append(pair_list[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def load_dataset(self):
    """Materialize the data-dict generator into a torchtext Dataset.

    Returns:
        A Dataset built from every dict the generator yields, with the
        field mapping flattened from (name, field) pairs to name->field.
    """
    fields = self._get_fields()
    examples = [Example.fromdict(data_dict, fields)
                for data_dict in self._data_dict_generator()]
    # fields.values() yields (name, field) pairs; collapse to a dict.
    return Dataset(examples, dict(fields.values()))
def datapoint2example(datapoint, cws=False):
    """Wrap one datapoint dict as a torchtext Example.

    Args:
        datapoint: dict with 'author', 'book' and 'text' keys.
        cws: if True, use the word-segmented text field (TEXT_cws)
            instead of the plain TEXT field.

    Returns:
        An Example with author/book/text attributes.
    """
    text_field = TEXT_cws if cws else TEXT
    field_map = {
        "author": ("author", AUTHOR),
        "book": ("book", BOOK),
        "text": ("text", text_field),
    }
    return Example.fromdict(datapoint, fields=field_map)
def _to_examples(self, bucket, is_train=False):
    """Turn a bucket of raw items into torchtext Examples.

    Items rejected by ``self._process`` (returning None) are skipped;
    surviving items get dynamic-dict augmentation before wrapping.

    Args:
        bucket: iterable of raw items.
        is_train: forwarded to ``self._process``.

    Returns:
        List of TorchtextExample objects.
    """
    examples = []
    for raw_item in bucket:
        processed = self._process(raw_item, is_train=is_train)
        if processed is None:
            continue
        enriched = self._maybe_add_dynamic_dict(processed, self.fields_dict)
        ex_fields = {name: [(name, field)]
                     for name, field in self.fields_dict.items()
                     if name in enriched}
        examples.append(TorchtextExample.fromdict(enriched, ex_fields))
    return examples
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Build the dataset; target tokens absent from the source are split
    on spaces so the copy mechanism sees sub-tokens it can generate.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
    """
    self.sort_key = sort_key
    can_copy = 'src_map' in fields and 'alignment' in fields
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        ex_fields = {k: [(k, v)] for k, v in fields.items() if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        if 'tgt' in ex_dict.keys():
            ex_tgt = []
            for word in getattr(ex, 'tgt')[0]:
                # Split unseen target tokens into space-separated pieces;
                # keep tokens that already appear in the source whole.
                if word not in getattr(ex, 'src')[0]:
                    ex_tgt += word.split(' ')
                else:
                    ex_tgt.append(word)
            # BUG FIX: the original deleted elements from ex_tgt while
            # enumerating it, which skips the element after each deletion
            # and so misses consecutive empty strings. Rebuild instead.
            ex_tgt = [w for w in ex_tgt if w != '']
            setattr(ex, 'tgt', [ex_tgt])
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def _get_data(self, file):
    """Read a jsonlines file into a Dataset of (context, question, answer).

    Args:
        file: path to a jsonlines file whose records have 'title',
            'passage', 'question' and 'answer' keys.

    Returns:
        A Dataset over examples built with ``self.dict_fields``.
    """
    examples = []
    with jsonlines.open(file) as reader:
        for record in reader:
            # Context is the title and passage joined by a single space.
            ex_dict = {
                "context": record["title"] + ' ' + record["passage"],
                "question": record["question"],
                "answer": record["answer"],
            }
            examples.append(
                Example.fromdict(ex_dict, fields=self.dict_fields))
    return Dataset(examples, self.fields)
def __init__(self, fields, readers, data, dirs, sort_key,
             filter_pred=None, tgt_type=None):
    """Keyphrase dataset: like the base Dataset, but copy-attention
    fields are always built regardless of which fields were declared.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
        tgt_type: arrangement of multiple target sequences; one of
            [one2one, original, random, verbatim].
    """
    # this is set at line 594 in inputter.py and line 303 in translator.py
    self.tgt_type = tgt_type
    # concatenate multiple tgt sequences with <sep> or keep them separate
    # as a list of seqs (2D tensor)
    self.concat_tgt = False
    self.sort_key = sort_key
    # will be specified before training, one of
    # [one2one, original, random, verbatim]
    # build src_map/alignment no matter field is available
    can_copy = True
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        ex_fields = {k: [(k, v)] for k, v in fields.items()
                     if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(KeyphraseDataset, self).__init__(examples, fields, filter_pred)
def __init__(self, fields, path, bpe_path, filter_pred=None, lang_src=False,
             high_oversampling=1, low_oversampling=1):
    """Read a Sigmorphon TSV file plus a parallel BPE-split file.

    Args:
        fields: dict of field-name -> (name, Field). NOTE: mutated in
            place — 'tgt' is popped when a line has no target column.
        path: data file path, or a list whose first element is used.
        bpe_path: file whose lines give '|'-separated BPE splits of src.
        filter_pred: optional predicate forwarded to the Dataset base.
        lang_src: if True, prepend the language token to the source seq.
        high_oversampling: repetition factor for non-"low" settings.
        low_oversampling: repetition factor for the "low" setting.
    """
    examples = []
    if not isinstance(path, str):
        path = path[0]
    # BUG FIX: the original opened both files without ever closing them,
    # leaking file handles; use context managers instead.
    with open(path) as f, open(bpe_path) as f_bpe:
        # Only infer a language tag if the caller declared the field.
        language = lang_name(path) if 'language' in fields else None
        setting = data_setting(path)
        for line, line_bpe in zip(f, f_bpe):
            ex_dict = dict()
            if language is not None:
                ex_dict["language"] = language
            line_fields = line.strip().split('\t')
            if len(line_fields) == 3:
                src, tgt, inflection = line_fields
                ex_dict['tgt'] = tgt
            else:
                src, inflection = line_fields
                # NOTE(review): mutates the caller's fields dict the first
                # time a targetless line appears — confirm intended.
                fields.pop("tgt", None)
            if "inflection" in fields:
                ex_dict["src"] = src
                ex_dict["inflection"] = inflection
            else:
                # Fold the inflection tags and spaced-out characters into
                # a single source sequence.
                respaced_inflection = " ".join(inflection.split(";"))
                respaced_src = " ".join(
                    [c if c != " " else "<space>" for c in src])
                src_seq = []
                if language is not None and lang_src:
                    src_seq.append(language)
                src_seq.extend([respaced_inflection, respaced_src])
                ex_dict["src"] = " ".join(src_seq)
            bpe_src = line_bpe.strip().split('|')
            # Per-BPE-piece lengths, used to re-merge subwords later.
            ex_dict["word_split"] = [len(split) for split in bpe_src]
            ex = Example.fromdict(ex_dict, fields)
            # Oversample by repeating the same Example object.
            if setting == "low":
                examples.extend(ex for _ in range(low_oversampling))
            else:
                examples.extend(ex for _ in range(high_oversampling))
    # Flatten {name: (name, field)} into {name: field} for the base class.
    fields = dict(chain.from_iterable(fields.values()))
    super(SigmorphonDatasetBPE, self).__init__(examples, fields,
                                               filter_pred)
def _deserialize_example(cls, serialized, fields):
    """Rebuild (Example, src_vocab) from a serialized dict.

    The 'src_vocab' entry becomes a MiniVocab; 'src_map'/'alignment'
    entries become tensors; everything else passes through unchanged.

    Args:
        serialized: dict of serialized example attributes.
        fields: dict mapping field names to torchtext Fields.

    Returns:
        Tuple of (Example, src_vocab) where src_vocab may be None.
    """
    ex_dict = {}
    src_vocab = None
    for key, value in serialized.items():
        if key == 'src_vocab':
            src_vocab = MiniVocab(value)
        elif key in ('src_map', 'alignment'):
            ex_dict[key] = torch.tensor(value)
        else:
            ex_dict[key] = value
    ex_fields = {key: [(key, field)]
                 for key, field in fields.items() if key in ex_dict}
    return Example.fromdict(ex_dict, ex_fields), src_vocab
def make_examples(df: pd.DataFrame):
    """Group a token-level DataFrame by sentence and build dataset splits.

    Args:
        df: DataFrame with 'sent', 'word' and 'tag' columns
            (one row per token).

    Returns:
        The result of ``Dataset.split()`` over sentence-level examples.
    """
    examples = []
    fields = {'tag': ('tag', tag_field), 'word': ('word', text_field)}
    # Collapse token rows into one row per sentence: lists of words/tags.
    for _, row in tqdm(
            df.groupby(["sent"]).agg({
                "word": list,
                "tag": list
            }).iterrows()):
        # for row in sent:
        example = Example.fromdict(row, fields)
        examples.append(example)
    return Dataset(examples,
                   fields=[('tag', tag_field),
                           ('word', text_field)]).split()
def build_examples(data: List[Dict[str, str]], src_lang: str,
                   dest_lang: str,
                   logger: Logger) -> Tuple[List[Example], Field, Field]:
    """Tokenize raw translation pairs into torchtext Examples plus fields.

    Args:
        data: list of dicts keyed by language code, one pair per item.
        src_lang: source language code (also the spaCy tokenizer name).
        dest_lang: destination language code.
        logger: logger for progress messages.

    Returns:
        (examples, src_field, dest_field).
    """
    logger.info('BUILD EXAMPLES')
    src_field = Field(lower=True,
                      tokenize='spacy',
                      tokenizer_language=src_lang,
                      include_lengths=True)
    dest_field = Field(init_token='<sos>',
                       eos_token='<eos>',
                       lower=True,
                       tokenize='spacy',
                       tokenizer_language=dest_lang,
                       include_lengths=True)
    field_map = {
        f'{src_lang}': ('src', src_field),
        f'{dest_lang}': ('dest', dest_field),
    }
    examples = [Example.fromdict(data=pair, fields=field_map)
                for pair in tqdm.tqdm(data)]
    logger.info(f'Number of examples: {len(examples):,}')
    return examples, src_field, dest_field
def read_data(X, y, SRC, TRG, preprocess=None, limit=1000):
    """Pair text/summary line sources into a torchtext Dataset.

    Args:
        X: source consumed by LineSentenceGenerator for the texts.
        y: source consumed by LineSentenceGenerator for the summaries.
        SRC: field for the text side.
        TRG: field for the summary side.
        preprocess: optional preprocessing passed to the generators.
        limit: stop after index ``limit``. NOTE(review): the original
            `i > limit` check keeps limit+1 pairs — preserved here;
            confirm whether exactly `limit` was intended.

    Returns:
        A Dataset with 'text' and 'summ' fields.
    """
    examples = []
    fields = {'text-tokens': ('text', SRC), 'summ-tokens': ('summ', TRG)}
    # BUG FIX (readability/safety): the original loop rebound `y`,
    # shadowing the parameter; use distinct names for the loop values.
    text_gen = LineSentenceGenerator(X, preprocess)
    summ_gen = LineSentenceGenerator(y, preprocess)
    for i, (text_tokens, summ_tokens) in enumerate(zip(text_gen, summ_gen)):
        if limit is not None and i > limit:
            break
        examples.append(Example.fromdict(
            {"text-tokens": text_tokens, "summ-tokens": summ_tokens},
            fields=fields))
    return Dataset(examples, fields=[('text', SRC), ('summ', TRG)])
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None):
    """Build the dataset from (reader, data, dir) triples.

    Args:
        fields: dict mapping field names to torchtext Fields.
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
    """
    self.sort_key = sort_key
    can_copy = 'src_map' in fields and 'alignment' in fields
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    # starmap() takes: func, seq
    # and yields: func(*seq[0]), func(*seq[1]), ...
    # e.g. starmap(pow, [(2,5), (3,2), (10,3)]) --> 32 9 1000
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        ex_fields = {k: [(k, v)] for k, v in fields.items()
                     if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(Dataset, self).__init__(examples, fields, filter_pred)
def __init__(self, src_types: List[str], fields, readers: List,
             data: List[Tuple[str, Any]], dirs: List[str], sort_key,
             filter_pred=None, can_copy: bool = False,
             ):
    """Multi-source dataset: each source type has its own field stored
    under the key 'src.<type>'.

    Args:
        src_types: names of the source types.
        fields: dict mapping field names to torchtext Fields (source
            fields keyed as 'src.<type>').
        readers: data readers, one per input side.
        data: (name, data) pairs consumed by each reader.
        dirs: directories passed through to each reader.
        sort_key: callable used by the bucketing iterator.
        filter_pred: optional predicate forwarded to the base class.
        can_copy: whether to build per-example copy vocabularies.
    """
    self.sort_key = sort_key
    read_iters = [r.read(dat[1], dat[0], dir_)
                  for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            tgt_field = fields['tgt']
            # One base field per source type, keyed 'src.<type>'.
            src_types_fields = {
                src_type: fields[f"src.{src_type}"].base_field
                for src_type in src_types
            }
            src_ex_vocab, ex_dict = _dynamic_dict(
                ex_dict, src_types, src_types_fields, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        # end if
        ex_fields = {k: [(k, v)] for k, v in fields.items()
                     if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(MultiSourceAPDataset, self).__init__(examples, fields,
                                               filter_pred)
def __init__(self, fields, path, filter_pred=None, lang_src=False):
    """Read Sigmorphon TSV file(s), flattening inflection tags and
    characters into a single space-separated source sequence.

    Args:
        fields: dict of field-name -> (name, Field). NOTE: mutated in
            place — 'trg' is popped when a line has no target column.
        path: one path (str) or a list of paths.
        filter_pred: optional predicate forwarded to the Dataset base.
        lang_src: if True, prepend the language token to the source seq.
    """
    if isinstance(path, str):
        path = [path]
    examples = []
    for p in path:
        with open(p) as f:
            # Only infer a language tag if the caller declared the field.
            language = lang_name(p) if 'language' in fields else None
            for line in f:
                line = line.strip()
                if line:
                    ex_dict = dict()
                    if language is not None:
                        ex_dict["language"] = language
                    line_fields = line.strip().split('\t')
                    if len(line_fields) == 3:
                        src, trg, inflection = line_fields
                        ex_dict['trg'] = trg
                    else:
                        src, inflection = line_fields
                        # NOTE(review): mutates the caller's fields dict —
                        # confirm intended. # hmm
                        fields.pop("trg", None)
                    # kludgey stuff for handling inflections
                    respaced_inflection = " ".join(inflection.split(";"))
                    respaced_src = " ".join(
                        [c if c != " " else "<space>" for c in src])
                    src_seq = []
                    if language is not None and lang_src:
                        src_seq.append(language)
                    src_seq.extend([respaced_inflection, respaced_src])
                    ex_dict["src"] = " ".join(src_seq)
                    ex = Example.fromdict(ex_dict, fields)
                    examples.append(ex)
    # Flatten {name: (name, field)} into {name: field} for the base class.
    fields = dict(chain.from_iterable(fields.values()))
    super(SimpleSigmorphonDataset, self).__init__(examples, fields,
                                                  filter_pred)