Esempio n. 1
0
    def __init__(self, fields, readers, data, dirs, sort_key,
                 filter_pred=None):
        """Read raw examples from several readers and build the dataset.

        Args:
            fields: mapping from field name to field object. When it holds
                both ``'src_map'`` and ``'alignment'``, the ``'src'`` and
                ``'tgt'`` entries must expose ``base_field``.
            readers: objects exposing ``read(...)``, one per data source.
            data: indexable items aligned with ``readers``; element 1 and
                element 0 of each item are passed to ``read`` in that order.
            dirs: third argument handed to each reader's ``read``.
            sort_key: stored unchanged on ``self.sort_key``.
            filter_pred: forwarded to the parent constructor.
        """
        self.sort_key = sort_key
        can_copy = 'src_map' in fields and 'alignment' in fields

        read_iters = [
            reader.read(dat[1], dat[0], dir_)
            for reader, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        # Bound up front so that readers yielding nothing no longer trigger
        # an UnboundLocalError when ex_fields is inspected after the loop.
        ex_fields = {}
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(
                    ex_dict, src_field.base_field, tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            ex_fields = {name: [(name, field)]
                         for name, field in fields.items()
                         if name in ex_dict}
            examples.append(Example.fromdict(ex_dict, ex_fields))

        # The parent only receives the (name, field) pairs present on the
        # last example that was built (none when there were no examples).
        example_fields = []
        for nf_list in ex_fields.values():
            assert len(nf_list) == 1
            example_fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, example_fields, filter_pred)
	def __init__(self,annFile,text_field,transform=None):
		"""Collect every caption of a COCO annotation file as an example.

		Args:
			annFile: path handed to ``pycocotools.coco.COCO``.
			text_field: field registered under the name ``"text"``.
			transform: optional callable applied to each caption before
				its length is measured and the example is built.
		"""
		from pycocotools.coco import COCO
		coco = COCO(annFile)
		field = [("text",text_field)]
		examples = []
		longest = 0
		for img_id in list(coco.imgs.keys()):
			for ann in coco.loadAnns(coco.getAnnIds(imgIds=img_id)):
				caption = ann['caption']
				if transform is not None:
					caption = transform(caption)
				# Track the longest (possibly transformed) caption seen.
				longest = max(longest, len(caption))
				examples.append(Example.fromlist([caption],field))
		self.max_seq_len = longest + 2 # one for <sos> and one for <eos>
		super().__init__(examples=examples,fields=field)
Esempio n. 3
0
    def _load_examples(self, dev_splits, test_splits):
        """Parse the robust04 TSV file and sort its rows into the splits.

        Each row is converted in place (label, logits, query id, document
        id), wrapped in an ``Example`` and appended to the dev, test or
        train list depending on which split container holds its query id.

        Args:
            dev_splits: container of query ids routed to ``dev_examples``.
            test_splits: container of query ids routed to ``test_examples``.
        """
        tsv_path = os.path.join(DATASET_DIR, 'robust04', 'robust04.logits_bert_msmarco_mb.tsv')
        with open(tsv_path, 'r') as tsv_file:
            for raw_line in tsv_file:
                columns = raw_line.split('\t')
                columns[0] = binary_one_hot(columns[0])  # label -> one-hot
                columns[1] = ast.literal_eval(columns[1])  # logits literal -> Python value
                query_id = int(columns[2])
                columns[2] = query_id

                # Document ids are swapped for a running integer index; the
                # original id stays retrievable through self.doc_id_map.
                doc_index = len(self.doc_id_map)
                self.doc_id_map[doc_index] = columns[3]
                columns[3] = doc_index

                example = Example.fromlist(columns, self.fields)

                if query_id in dev_splits:
                    bucket = self.dev_examples
                elif query_id in test_splits:
                    bucket = self.test_examples
                else:
                    bucket = self.train_examples
                bucket.append(example)
Esempio n. 4
0
    def __init__(self, root_path, img_dir, filename, fields, train, **kwargs):
        """Load pickled captions and features and wrap them as examples.

        Args:
            root_path: directory containing the pickle file and images.
            img_dir: image sub-directory below ``root_path``.
            filename: name of the pickle file below ``root_path``.
            fields: mapping passed to ``Example.fromdict``; its values are
                handed to the parent constructor and ``fields['caption'][1]``
                is kept as ``self.cap_field``.
            train: stored on ``self.train``; together with ``rand_crop`` it
                selects the image transform.
            **kwargs: only the ``rand_crop`` entry is read.
        """
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point this at trusted data.
        with open(os.path.join(root_path, filename), 'rb') as f:
            data = pickle.load(f)

        rand_crop = kwargs.get('rand_crop', False)
        if train and rand_crop:
            self.img_transform = preprocess_rc
        else:
            self.img_transform = preprocess_1c
        self.train = train
        self.cap_field = fields['caption'][1]

        records = []
        for caption_id, entry in enumerate(data['captions']):
            image_id = entry['image_id']
            image_path = entry['image_path']
            # The image file is only referenced when random cropping is on.
            if rand_crop:
                img_to_load = os.path.join(root_path, img_dir, image_path)
            else:
                img_to_load = None
            records.append({
                'image_id': image_id,
                'img_to_load': img_to_load,
                'img_1c_feat': torch.Tensor(data['features'][image_id]),
                'caption': entry['caption'],
                'caption_id': caption_id
            })

        examples = [Example.fromdict(record, fields) for record in records]
        super(ImageCaptionDataset, self).__init__(examples, fields.values())
    def init_dataloaders(self):
        """Build train/validation datasets, the vocabulary and iterators.

        Reads the two line-aligned domain files named in the config, pairs
        their lines into examples, splits them into train and validation
        parts, builds the vocabulary from the training part and creates one
        ``BucketIterator`` per part.
        """
        cfg = self.config
        batch_size = cfg.hp.batch_size
        project_path = cfg.firelab.project_path
        domain_x_data_path = os.path.join(project_path, cfg.data.domain_x)
        domain_y_data_path = os.path.join(project_path, cfg.data.domain_y)

        with open(domain_x_data_path) as x_file:
            domain_x = x_file.read().splitlines()
        with open(domain_y_data_path) as y_file:
            domain_y = y_file.read().splitlines()

        # A single field object is shared by both domains.
        text = Field(init_token='<bos>',
                     eos_token='|',
                     batch_first=True,
                     tokenize=char_tokenize)
        fields = [('domain_x', text), ('domain_y', text)]

        examples = []
        for x_line, y_line in zip(domain_x, domain_y):
            examples.append(Example.fromlist([x_line, y_line], fields))

        train_exs, val_exs = train_test_split(
            examples,
            test_size=cfg.val_set_size,
            random_state=cfg.random_seed)

        train_ds = Dataset(train_exs, fields)
        val_ds = Dataset(val_exs, fields)
        self.train_ds, self.val_ds = train_ds, val_ds

        # The vocabulary is built from the training split only.
        text.build_vocab(self.train_ds,
                         max_size=cfg.hp.get('max_vocab_size'))
        self.vocab = text.vocab

        self.train_dataloader = data.BucketIterator(
            self.train_ds, batch_size, repeat=False)
        self.val_dataloader = data.BucketIterator(
            self.val_ds, batch_size, repeat=False, shuffle=False)
Esempio n. 6
0
    def __init__(self, fields, paths, filter_pred=None, lang_src=False,
                 high_oversampling=1, low_oversampling=1):
        """Read tab-separated inflection files into (oversampled) examples.

        Args:
            fields: mapping consumed by ``Example.fromdict``. Its ``"tgt"``
                entry is removed in place as soon as a line without a
                target column is read.
            paths: one path or an iterable of paths.
            filter_pred: forwarded to the parent constructor.
            lang_src: when true and a language is known, the language is
                prepended to the source sequence.
            high_oversampling: number of copies of each example from files
                whose setting is not ``"low"``.
            low_oversampling: number of copies of each example from files
                whose setting is ``"low"``.
        """
        if isinstance(paths, str):
            paths = [paths]
        examples = []
        for path in paths:
            with open(path) as f:
                language = lang_name(path) if 'language' in fields else None
                setting = data_setting(path)
                copies = low_oversampling if setting == "low" \
                    else high_oversampling

                for line in f:
                    ex_dict = {}
                    if language is not None:
                        ex_dict["language"] = language

                    columns = line.strip().split('\t')
                    if len(columns) == 3:
                        src, tgt, inflection = columns
                        ex_dict['tgt'] = tgt
                    else:
                        # Two-column lines carry no target.
                        src, inflection = columns
                        fields.pop("tgt", None)

                    if "inflection" in fields:
                        ex_dict["src"] = src
                        ex_dict["inflection"] = inflection
                    else:
                        # No dedicated inflection field: fold the tags and
                        # the space-separated characters into the source.
                        tags = inflection.replace(";", " ")
                        chars = " ".join(
                            "<space>" if c == " " else c for c in src)
                        if language is not None and lang_src:
                            src_seq = [language, tags, chars]
                        else:
                            src_seq = [tags, chars]
                        ex_dict["src"] = " ".join(src_seq)

                    ex = Example.fromdict(ex_dict, fields)
                    # The same Example object is repeated, not copied.
                    examples.extend([ex] * copies)

        flat_fields = dict(chain.from_iterable(fields.values()))
        super(SigmorphonDataset, self).__init__(
            examples, flat_fields, filter_pred)
Esempio n. 7
0
    def __init__(self, path, model, train_frac=1.0, encoding="utf-8"):
        """Build span-labelling examples from a JSON-lines file.

        Every line is parsed as JSON; its ``"text"`` is whitespace-split and
        tokenized by ``model``, and one example is emitted per entry of
        ``"targets"``.

        Args:
            path: JSON-lines file to read.
            model: object providing ``tokenize(words,
                get_subword_indices=True)`` and ``tokenizer.pad_token_id``.
            train_frac: when below 1.0 and ``check_for_train_file(path)`` is
                true, only this leading fraction of the lines is used.
            encoding: text encoding used to open ``path``.
        """
        text_field = Field(sequential=True,
                           use_vocab=False,
                           include_lengths=True,
                           batch_first=True,
                           pad_token=model.tokenizer.pad_token_id)
        fields = [
            ('text', text_field),
            ('span', Field(sequential=False, use_vocab=False,
                           batch_first=True)),
            ('orig_span',
             Field(sequential=False, use_vocab=False, batch_first=True)),
            ('label', Field(sequential=False,
                            use_vocab=False,
                            batch_first=True))
        ]

        examples = []
        # The context manager closes the handle, which used to be leaked.
        with open(path, encoding=encoding) as f:
            lines = f.readlines()
        is_train = self.check_for_train_file(path)

        if is_train and train_frac < 1.0:
            # Keep only the leading fraction of the training file.
            red_num_lines = int(len(lines) * train_frac)
            lines = lines[:red_num_lines]

        for line in lines:
            instance = json.loads(line)
            text, subword_to_word_idx = model.tokenize(
                instance["text"].split(), get_subword_indices=True)

            for target in instance["targets"]:
                span_index = self.get_tokenized_span_indices(
                    subword_to_word_idx, target["span1"])
                label = target["label"]
                examples.append(
                    Example.fromlist(
                        [text, span_index, target["span1"], label], fields))

        super(TaskDataset, self).__init__(examples, fields)
Esempio n. 8
0
def json_to_dialogue_examples(path_dir: Path, *, fields: List[Tuple[str, Field]], utterance_key: str, role_key: str,
                              text_key: str, sort_key: str, max_sl: int = 1000,
                              target_roles: Optional[List[str]] = None) -> \
        Iterator[Example]:
    """Yield context/response examples from every ``*.json`` file in a folder.

    Each file must hold a list of dialogues; a dialogue maps
    ``utterance_key`` to a list of utterance dicts carrying ``text_key``,
    ``role_key`` and whatever ``sort_key`` looks up. Utterances are sorted,
    and for each one that has preceding context (and whose role is in
    ``target_roles`` when given) an example is built from the accumulated
    context text, the accumulated role tags and the current utterance.
    The per-utterance token lengths of the context are attached as ``sl``;
    examples whose longest context utterance reaches ``max_sl`` are skipped.

    ``sort_key`` may be a key name or a callable; anything else raises
    ``ValueError``.
    """
    for file in path_dir.glob("*.json"):
        with file.open('r', encoding='utf-8') as fh:
            dialogues = json.load(fh)
        for dialogue in tqdm(dialogues, desc=f'processed file {file}'):
            if isinstance(sort_key, str):
                key = itemgetter(sort_key)
            elif callable(sort_key):
                key = sort_key
            else:
                raise ValueError("Invalid sort_key provided")
            conversation = sorted(dialogue[utterance_key], key=key)

            history = []    # role-prefixed utterances seen so far
            role_tags = []  # "__role__" markers seen so far
            lengths = []    # token count of each utterance in history
            tokenize = fields[0][1].tokenize
            for utterance in conversation:
                ut = utterance[text_key]
                if isinstance(ut, list):
                    ut = " ".join(ut)
                conv_role = "__" + utterance[role_key] + "__"
                text_with_role = conv_role + " " + ut

                context = " ".join(history).strip()
                if context != "" and (target_roles is None
                                      or utterance[role_key] in target_roles):
                    roles = " ".join(role_tags)
                    example = Example.fromlist(
                        [context, roles.strip(), text_with_role], fields)
                    example.sl = list(lengths)
                    # sanity check: one length per role tag in the context
                    assert len(lengths) == len(roles.split())
                    # contexts with an overly long utterance are dropped
                    if max(example.sl) < max_sl:
                        yield example

                history.append(text_with_role)
                role_tags.append(conv_role)
                lengths.append(len(tokenize(text_with_role)))
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None):
        """Read raw examples from several readers and build the dataset.

        Args:
            fields: mapping from field name to field object. When it holds
                both ``'src_map'`` and ``'alignment'``, the ``'src'`` and
                ``'tgt'`` entries must expose ``base_field``.
            readers: objects exposing ``read(...)``, one per data source.
            data: indexable items aligned with ``readers``; element 1 and
                element 0 of each item are passed to ``read`` in that order.
            dirs: third argument handed to each reader's ``read``.
            sort_key: stored unchanged on ``self.sort_key``.
            filter_pred: forwarded to the parent constructor.
        """
        self.sort_key = sort_key
        can_copy = 'src_map' in fields and 'alignment' in fields

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        # Defined before the loop so that an empty read no longer raises
        # UnboundLocalError when the surviving fields are collected below.
        ex_fields = {}
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs; the
        # keys are taken from the last example built (none if no examples).
        kept_fields = []
        for nf_list in ex_fields.values():
            assert len(nf_list) == 1
            kept_fields.append(nf_list[0])
        super(Dataset, self).__init__(examples, kept_fields, filter_pred)
Esempio n. 10
0
 def extract_features(self, instance: Dict[str, object]) -> Example:
     """Convert one raw review record into an ``Example``.

     The ``'review'`` text is cut into units (whitespace-separated words
     when ``self.tokenizeword`` is true, single characters otherwise),
     truncated to ``MAX_LEN - 2`` units and wrapped in the init/EOS tokens
     under the attribute ``syllable_contents``. When the record has a
     ``'sentiment'`` entry, a float ``label`` (1.0 or 0.0) is attached.

     Raises:
         Exception: if ``'sentiment'`` is neither exactly ``int`` nor
             exactly ``str``; any error raised while reading the review
             is re-raised after the record has been printed.
     """
     limit = self.MAX_LEN - 2  # leave room for the init and EOS tokens
     use_words = self.tokenizeword
     try:
         review = instance['review']
         units = review.split()[:limit] if use_words else review[:limit]
     except Exception:
         # Show the offending record, then surface the real error instead
         # of continuing and failing later on an unbound local.
         print(instance)
         raise

     contents = [self.SPACE_TOKEN if x == ' ' else x for x in units]
     ex = Example()
     setattr(ex, 'syllable_contents',
             [self.INIT_TOKEN] + contents + [self.EOS_TOKEN])

     if 'sentiment' in instance:
         label = instance['sentiment']
         # Exact type checks are kept on purpose: bools, floats and other
         # subclasses are still rejected, exactly as before.
         if type(label) is int:
             setattr(ex, 'label', 1. if label >= 1 else 0.)
         elif type(label) is str:
             setattr(ex, 'label', 1. if label in ('1.0', '1') else 0.)
         else:
             raise Exception("yo label your y correctly...")
     return ex
Esempio n. 11
0
    def _create_sva_examples(
        sens: List[Sequence[str]], fields: List[Tuple[str, Field]]
    ) -> List[Example]:
        """Turn (correct, incorrect) sentence pairs into examples.

        Both sentences are whitespace-split; the verb position is the first
        index at which their tokens differ. Each example is built from the
        tokens preceding that index, the token of the first sentence at that
        index and the token of the second sentence at that index.
        """
        examples = []

        for correct, incorrect in sens:
            good_tokens = correct.split()
            bad_tokens = incorrect.split()

            # First mismatching position; falls back to the length of the
            # shorter sentence when all compared tokens agree.
            verb_index = next(
                (pos
                 for pos, (w1, w2) in enumerate(zip(good_tokens, bad_tokens))
                 if w1 != w2),
                min(len(good_tokens), len(bad_tokens)),
            )

            examples.append(
                Example.fromlist(
                    [
                        good_tokens[:verb_index],
                        good_tokens[verb_index],
                        bad_tokens[verb_index],
                    ],
                    fields,
                )
            )

        return examples
Esempio n. 12
0
    def __init__(self, fields, path, filter_pred=None, lang_src=False):
        """Read tab-separated inflection files into examples.

        Args:
            fields: mapping consumed by ``Example.fromdict``. Its ``"trg"``
                entry is removed in place as soon as a line without a
                target column is read.
            path: one path or an iterable of paths.
            filter_pred: forwarded to the parent constructor.
            lang_src: when true and a language is known, the language is
                prepended to the source sequence.
        """
        if isinstance(path, str):
            path = [path]
        examples = []
        for p in path:
            with open(p) as f:
                language = lang_name(p) if 'language' in fields else None

                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue  # blank lines carry no example

                    ex_dict = {}
                    if language is not None:
                        ex_dict["language"] = language

                    columns = line.split('\t')
                    if len(columns) == 3:
                        src, trg, inflection = columns
                        ex_dict['trg'] = trg
                    else:
                        # Two-column lines carry no target.
                        src, inflection = columns
                        fields.pop("trg", None)

                    # The source becomes: [language] tags characters, with
                    # real spaces spelled out as "<space>".
                    tags = inflection.replace(";", " ")
                    chars = " ".join(
                        "<space>" if c == " " else c for c in src)
                    if language is not None and lang_src:
                        src_seq = [language, tags, chars]
                    else:
                        src_seq = [tags, chars]
                    ex_dict["src"] = " ".join(src_seq)

                    examples.append(Example.fromdict(ex_dict, fields))

        flat_fields = dict(chain.from_iterable(fields.values()))
        super(SimpleSigmorphonDataset, self).__init__(examples, flat_fields,
                                                      filter_pred)
Esempio n. 13
0
    def __init__(self, fields, readers, data, dirs, sort_key,
                 filter_pred=None, tgt_type=None):
        """Read raw examples from several readers and build the dataset.

        Copy information is always generated, so ``fields`` must contain
        ``'src'`` and ``'tgt'`` entries exposing ``base_field``.

        Args:
            fields: mapping from field name to field object.
            readers: objects exposing ``read(...)``, one per data source.
            data: indexable items aligned with ``readers``; element 1 and
                element 0 of each item are passed to ``read`` in that order.
            dirs: third argument handed to each reader's ``read``.
            sort_key: stored unchanged on ``self.sort_key``.
            filter_pred: forwarded to the parent constructor.
            tgt_type: stored unchanged on ``self.tgt_type``.
        """
        # this is set at line 594 in inputter.py and line 303 in translator.py
        self.tgt_type = tgt_type
        # concatenate multiple tgt sequences with <sep> or keep them separate as a list of seqs (2D tensor)
        self.concat_tgt = False
        self.sort_key = sort_key

        # build src_map/alignment no matter field is available
        can_copy = True

        read_iters = [
            reader.read(dat[1], dat[0], dir_)
            for reader, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        # Bound before the loop so that readers yielding nothing no longer
        # cause an UnboundLocalError when ex_fields is read afterwards.
        ex_fields = {}
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(
                    ex_dict, src_field.base_field, tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            ex_fields = {name: [(name, field)]
                         for name, field in fields.items()
                         if name in ex_dict}
            examples.append(Example.fromdict(ex_dict, ex_fields))

        # The parent only receives the (name, field) pairs present on the
        # last example that was built (none when there were no examples).
        example_fields = []
        for nf_list in ex_fields.values():
            assert len(nf_list) == 1
            example_fields.append(nf_list[0])

        super(KeyphraseDataset, self).__init__(
            examples, example_fields, filter_pred)
Esempio n. 14
0
    def __init__(self,
                 path,
                 text_field,
                 newline_eos=True,
                 encoding='utf-8',
                 topk=float('inf'),
                 **kwargs):
        """Load a text file as one long token sequence.

        Args:
            path: file to read.
            text_field: field whose ``preprocess`` is applied to each line;
                registered under the name ``'text'``.
            newline_eos: append ``<eos>`` after every line when true.
            encoding: text encoding used to open ``path``.
            topk: stop once this many lines have been read (at least one
                line is always consumed if the file is not empty).
            **kwargs: forwarded to the parent constructor.
        """
        fields = [('text', text_field)]
        tokens = []
        with open(path, encoding=encoding) as f:
            for lines_read, line in enumerate(f, start=1):
                tokens.extend(text_field.preprocess(line))
                if newline_eos:
                    tokens.append(u'<eos>')
                if lines_read >= topk:
                    break

        # The whole corpus is stored as a single example.
        single_example = Example.fromlist([tokens], fields)
        super(LanguageModelingDataset, self).__init__([single_example],
                                                      fields, **kwargs)
Esempio n. 15
0
    def parse_sentence(self, js, fields, amr):
        """Build an ``Example`` from one sentence record, or filter it out.

        Args:
            js: JSON content handed to ``Sentence_ace``.
            fields: mapping from column name to a ``(name, field)`` pair.
            amr: selects the ``"simple-parsing"`` graph column when true,
                ``"combined-parsing"`` otherwise.

        Returns:
            The example, or ``None`` when ``self.keep_events`` is set and
            the sentence's ``containsEvents`` fails the check (inequality
            when ``self.only_keep`` is true, "less than" otherwise).
        """
        sent_id_nf = fields["sentence_id"]
        words_nf = fields["words"]
        pos_nf = fields["pos-tags"]
        entity_labels_nf = fields["golden-entity-mentions"]
        colcc = "simple-parsing" if amr else "combined-parsing"
        adj_nf = fields[colcc]
        labels_nf = fields["golden-event-mentions"]
        events_nf = fields["all-events"]
        entities_nf = fields["all-entities"]

        sentence = Sentence_ace(json_content=js, graph_field_name=colcc)
        ex = Example()

        def store(name_and_field, raw):
            # Attach the preprocessed value under the pair's name.
            name, field = name_and_field
            setattr(ex, name, field.preprocess(raw))

        store(sent_id_nf, sentence.sentence_id)
        store(words_nf, sentence.wordList)
        store(pos_nf, sentence.posLabelList)
        store(entity_labels_nf, sentence.entityLabelList)
        # The adjacency data is stored as-is, without preprocessing.
        setattr(ex, adj_nf[0], (sentence.adjpos, sentence.adjv))
        store(labels_nf, sentence.triggerLabelList)
        store(events_nf, sentence.events)
        store(entities_nf, sentence.entities)

        if self.keep_events is None:
            return ex

        if self.only_keep:
            rejected = sentence.containsEvents != self.keep_events
        else:
            rejected = sentence.containsEvents < self.keep_events
        return None if rejected else ex
Esempio n. 16
0
    def __init__(self, cache_path, fields, **kwargs):
        """Rebuild a translation dataset from a cached token file.

        The cache interleaves source and target examples: even lines hold
        whitespace-separated source tokens, odd lines the matching target
        tokens. Token totals are printed using the language, dataset name
        and split parsed from the underscore-separated cache file name.

        Args:
            cache_path: path of the cache file.
            fields: forwarded to the parent constructor.
            **kwargs: forwarded to the parent constructor.
        """
        # save_cache interleaves src and trg examples so here we read the cache file having that format in mind.
        # The context manager closes the handle, which used to be leaked.
        with open(cache_path, encoding="utf-8") as cache_file:
            cached_data = [line.split() for line in cache_file]

        cached_data_src = cached_data[0::2]  # Even lines contain source examples
        cached_data_trg = cached_data[1::2]  # Odd lines contain target examples

        assert len(cached_data_src) == len(cached_data_trg), "Source and target data should be of the same length."

        examples = []
        src_dataset_total_number_of_tokens = 0
        trg_dataset_total_number_of_tokens = 0
        for src_tokenized_data, trg_tokenized_data in zip(cached_data_src, cached_data_trg):
            ex = Example()
            ex.src = src_tokenized_data
            ex.trg = trg_tokenized_data
            examples.append(ex)

            # Update the number of tokens
            src_dataset_total_number_of_tokens += len(src_tokenized_data)
            trg_dataset_total_number_of_tokens += len(trg_tokenized_data)

        # Print relevant information about the dataset (parsing the cache file name)
        filename_parts = os.path.split(cache_path)[1].split("_")
        src_language, trg_language = ("English", "German") if filename_parts[0] == "en" else ("German", "English")
        dataset_name = "IWSLT" if filename_parts[2] == "iwslt" else "WMT-14"
        dataset_type = "train" if filename_parts[3] == "train" else "val"
        print(
            f"{dataset_type} dataset ({dataset_name}) has {src_dataset_total_number_of_tokens} tokens in the source language ({src_language}) corpus."
        )
        print(
            f"{dataset_type} dataset ({dataset_name}) has {trg_dataset_total_number_of_tokens} tokens in the target language ({trg_language}) corpus."
        )

        # Call the parent class Dataset's constructor
        super().__init__(examples, fields, **kwargs)
Esempio n. 17
0
def convert_to_dataset(data, kor, eng):
    """
    Clean a pandas DataFrame and turn it into a torchtext Dataset.

    Rows whose ``korean`` or ``english`` value is not exactly a ``str`` are
    dropped; every remaining cell is passed through ``clean_text`` before
    the row becomes an Example.

    Args:
        data: (DataFrame) pandas DataFrame to be converted into torchtext Dataset
        kor: torchtext Field containing Korean sentence
        eng: torchtext Field containing English sentence

    Returns:
        (Dataset) torchtext Dataset containing 'kor' and 'eng' Fields
    """
    fields = [('kor', kor), ('eng', eng)]

    # collect the index labels of rows that hold a non-str value
    missing_rows = []
    for idx, row in data.iterrows():
        if type(row.korean) != str or type(row.english) != str:
            missing_rows.append(idx)
    data = data.drop(missing_rows)

    # one Example per remaining row, built from the cleaned cell values
    list_of_examples = []
    for _, row in data.iterrows():
        cleaned = row.apply(lambda x: clean_text(x)).tolist()
        list_of_examples.append(Example.fromlist(cleaned, fields=fields))

    return Dataset(examples=list_of_examples, fields=fields)
Esempio n. 18
0
def predict(model,
            field,
            sentence_beginnings: List[str],
            max_len=50,
            batch_size=100) -> List[str]:
    """Continue each sentence beginning with the model and return the texts.

    The beginnings are wrapped in a one-field dataset, batched with a
    ``BucketIterator`` and completed batch by batch through
    ``InferenceState``; the resulting index sequences are converted back
    with ``itos_many``. Results follow the iterator's batch order.
    """
    fields = [('text', field)]
    examples = [Example.fromlist([beginning], fields)
                for beginning in sentence_beginnings]
    dataloader = BucketIterator(Dataset(examples, fields),
                                batch_size,
                                repeat=False)
    results = []

    for batch in dataloader:
        prefix = batch.text.to(DEVICE)
        # Run the model on everything but the last position to obtain the
        # state from which inference continues.
        _, prefix_state = model(prefix[:, :-1], return_state=True)

        inference_config = {
            "model": model.cached_forward,
            "inputs": prefix_state,
            "vocab": field.vocab,
            "device": DEVICE,
            "max_len": max_len,
            "is_inputs_update_enabled": True,
            "inputs_batch_dim": 1,
            "active_seqs": prefix,
            "sample_from_top": 0.5,
            "temperature": 0.2
        }
        generated = InferenceState(inference_config).inference()

        as_lists = [seq.cpu().numpy().tolist() for seq in generated]
        results.extend(itos_many(as_lists,
                                 field.vocab,
                                 remove_special=True))

    return results
Esempio n. 19
0
    def __init__(self,
                 fields,
                 path,
                 filter_pred=None,
                 columns=("src", "trg"),
                 label_columns=()):
        """
        Read tab-separated files whose columns are named by ``columns``.

        ``path`` is either a glob pattern (string) or a list of paths; the
        list is sorted in place. Blank lines are skipped.

        Note that tsv does not currently allow missing columns (such as when
        translating a file with no trg specified)
        """
        self._columns = columns
        self._src_columns = [
            c for c in columns if not (c == "trg" or c in label_columns)
        ]
        self.label_columns = label_columns

        # Example.fromdict expects each key mapped to a list of pairs.
        fields = {name: [(name, field)] for name, field in fields.items()}

        paths = glob(path) if isinstance(path, str) else path
        assert len(paths) > 0
        paths.sort()

        examples = []
        for p in paths:
            with open(p) as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    values = line.split('\t')
                    assert len(values) == len(columns), \
                        "Wrong number of columns"
                    ex_dict = dict(zip(columns, values))
                    examples.append(Example.fromdict(ex_dict, fields))

        flat_fields = dict(chain.from_iterable(fields.values()))
        super(TSVDataset, self).__init__(examples, flat_fields, filter_pred)
Esempio n. 20
0
    def __init__(self, xmlfile, model, encoding="utf-8"):
        """Build two examples (one per alternative) for every XML ``item``.

        Args:
            xmlfile: XML file parsed with ``ElementTree``.
            model: object providing ``tokenize(words)`` and
                ``tokenizer.pad_token_id``.
            encoding: accepted but not used by this constructor.
        """
        text_field = Field(sequential=True, use_vocab=False, include_lengths=True,
                           batch_first=True, pad_token=model.tokenizer.pad_token_id)
        non_seq_field = Field(sequential=False, use_vocab=False, batch_first=True)
        fields = [('ID', non_seq_field), ('event', text_field), ('type_event', non_seq_field),
                  ('hyp_event', text_field), ('hyp_event_ID',  non_seq_field),
                  ('label', non_seq_field)]

        examples = []
        root = ET.parse(xmlfile).getroot()

        for item in root.findall('item'):
            item_id = item.attrib['id']
            asks_for = item.attrib['asks-for']
            assert(asks_for in ['cause', 'effect'])
            event_type = 0 if asks_for == "cause" else 1

            # The attribute is 1-based; shift to a 0-based alternative index.
            hyp_label = int(item.attrib['most-plausible-alternative']) - 1

            # premise ("p") and the two alternatives ("a1", "a2")
            tokenized = {"p": None, "a1": None, "a2": None}
            for child in item:
                tokens = model.tokenize(child.text.lower().split())
                if child.tag in tokenized:
                    tokenized[child.tag] = tokens

            alternatives = (tokenized["a1"], tokenized["a2"])
            for idx, hyp_event in enumerate(alternatives):
                values = [item_id, tokenized["p"], event_type, hyp_event,
                          idx, int(hyp_label == idx)]
                examples.append(Example.fromlist(values, fields))

        super(COPADataset, self).__init__(examples, fields)
Esempio n. 21
0
    def __init__(self, path, **kwargs):
        """
        Create a Semeval dataset instance from a JSON-lines file.

        Each line must provide the keys ``qid``, ``qaid``, ``qarel``,
        ``question`` and ``qaquestion``; extra keyword arguments are
        forwarded to the parent constructor.
        """
        fields = [
            ('qid', self.QID_FIELD),
            ('qaid', self.QID_FIELD),
            ('label', self.LABEL_FIELD),
            ('sentence_1', self.TEXT_FIELD),
            ('sentence_2', self.TEXT_FIELD),
            ('sentence_1_raw', self.RAW_TEXT_FIELD),
            ('sentence_2_raw', self.RAW_TEXT_FIELD),
            ('ext_feats', self.EXT_FEATS_FIELD),
        ]

        examples = []

        with open(path) as infile:
            for line in infile:
                content = json.loads(line)

                question = content['question']
                qa_question = content['qaquestion']

                word_to_doc_cnt = get_pairwise_word_to_doc_freq(
                    question, qa_question)
                overlap_feats = get_pairwise_overlap_features(
                    question, qa_question, word_to_doc_cnt)
                # NOTE(review): the features computed above are discarded
                # right away, so every example carries an empty ext_feats
                # list -- confirm whether this is intentional.
                overlap_feats = []

                qid, qaid, label = (
                    content['qid'], content['qaid'], content['qarel'])
                values = [
                    qid, qaid, label,
                    content['question'], content['qaquestion'],
                    ' '.join(content['question']),
                    ' '.join(content['qaquestion']),
                    overlap_feats,
                ]

                examples.append(Example.fromlist(values, fields))

        super(Semeval, self).__init__(examples, fields, **kwargs)
Esempio n. 22
0
def greedy_decoding(model: nn.Module, input, fields, maxLen=20):
    """Greedily decode one input sequence into a list of target tokens.

    Args:
        model: module exposing ``encoder`` and ``decoder``; it is switched
            to eval mode and left there.
        input: raw source input handed to ``Example.fromlist``.
        fields: ``(source_field, target_field)``; the source field
            numericalizes the input, and the target field's vocabulary
            supplies the ``'<s>'`` start id and maps ids back to tokens.
        maxLen: maximum number of decoding steps.

    Returns:
        The decoded target tokens, stopping before ``'</s>'``.

    Note:
        ``device`` is read from module scope.
    """
    # Use the fields that were passed in rather than module-level
    # ``src``/``tgt`` globals.
    src_field, tgt_field = fields[0], fields[1]

    ex = Example.fromlist([input], [('src', src_field)])
    src_tensor = src_field.numericalize([ex.src], device)
    tgt_tensor = torch.tensor([[tgt_field.vocab.stoi['<s>']]], device=device)
    model.eval()

    dec_result = []

    with torch.no_grad():
        enc_out, hidden = model.encoder(src_tensor)
        for _ in range(maxLen):
            dec_step, hidden = model.decoder(tgt_tensor, hidden, enc_out)
            _, top_idx = torch.topk(dec_step, 1)
            # Convert to a plain int before indexing the vocabulary list.
            token_id = top_idx.item()
            if tgt_field.vocab.itos[token_id] == '</s>':
                break
            dec_result.append(token_id)
            # The chosen token becomes the next decoder input.
            tgt_tensor = top_idx.view(1, 1)

    return [tgt_field.vocab.itos[w] for w in dec_result]
Esempio n. 23
0
def test_single_gpu_batch_parse():
    """Check that ``batch_to_device`` moves each supported batch layout to ``cuda:0``.

    Covers non-tensor primitives (returned unchanged), a bare tensor, lists,
    nested lists, dicts, a tuple mixing both, namedtuples, a custom object
    exposing ``.to()``, and a ``torchtext`` ``Batch``.
    """
    trainer = Trainer(gpus=1)
    cuda0 = torch.device('cuda:0')
    float_type = 'torch.cuda.FloatTensor'
    long_type = 'torch.cuda.LongTensor'

    def to_gpu(obj):
        # Single place that performs the transfer under test.
        return trainer.accelerator.batch_to_device(obj, cuda0)

    # non-transferrable types must come back equal to what went in
    for primitive in [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}]:
        moved = to_gpu(primitive)
        assert moved == primitive

    # batch is just a tensor
    moved = to_gpu(torch.rand(2, 3))
    assert moved.device.index == 0 and moved.type() == float_type

    # tensor list
    moved = to_gpu([torch.rand(2, 3), torch.rand(2, 3)])
    for pos in (0, 1):
        assert moved[pos].device.index == 0 and moved[pos].type() == float_type

    # tensor list of lists
    moved = to_gpu([[torch.rand(2, 3), torch.rand(2, 3)]])
    for pos in (0, 1):
        assert moved[0][pos].device.index == 0 and moved[0][pos].type() == float_type

    # tensor dict
    moved = to_gpu([{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}])
    for key in ('a', 'b'):
        assert moved[0][key].device.index == 0 and moved[0][key].type() == float_type

    # tuple of tensor list and list of tensor dict
    tensor_list = [torch.rand(2, 3) for _ in range(2)]
    dict_list = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)]
    moved = to_gpu((tensor_list, dict_list))
    assert moved[0][0].device.index == 0 and moved[0][0].type() == float_type
    for key in ('a', 'b'):
        assert moved[1][0][key].device.index == 0
        assert moved[1][0][key].type() == float_type

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    moved = to_gpu([BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)])
    assert moved[0].a.device.index == 0
    assert moved[0].a.type() == float_type

    # non-Tensor that has `.to()` defined
    class CustomBatchType:

        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    moved = to_gpu(CustomBatchType())
    assert moved.a.type() == float_type

    # torchtext.data.Batch
    samples = [
        {'text': 'PyTorch Lightning is awesome!', 'label': 0},
        {'text': 'Please make it work with torchtext', 'label': 1},
    ]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process(), which numericalizes tokens, so the
    # vocabularies have to be built before a Batch can be created.
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    moved = to_gpu(Batch(data=examples, dataset=dataset))

    assert moved.text.type() == long_type
    assert moved.label.type() == long_type
Esempio n. 24
0
 def text_to_example(x: Tuple[str]) -> Example:
     """Build an ``Example`` from the values in ``x`` using the outer ``fields``."""
     example = Example.fromlist(data=x, fields=fields)
     return example
 def _make_torchtext_dataset(self, data, fields):
     """Wrap every row of ``data`` in an ``Example`` and return them as a ``Dataset``.

     :param data: iterable of rows, each passed to ``Example.fromlist``.
     :param fields: field specification shared by the examples and the dataset.
     :return: a ``Dataset`` built from the converted rows and ``fields``.
     """
     converted = []
     for row in data:
         converted.append(Example.fromlist(row, fields))
     return Dataset(converted, fields)
    def predict(sentences: List[str],
                n_lines: int,
                temperature: float = None,
                max_len: int = None):
        """Grow each sentence into a dialog by generating `n_lines` lines in turn.

        :param sentences: starting strings, one per dialog; the list itself is
            copied and never mutated.
        :param n_lines: number of generation rounds to run.
        :param temperature: sampling temperature; a falsy value selects
            ``DEFAULT_TEMPERATURE``.
        :param max_len: maximum generated line length; a falsy value selects
            ``DEFAULT_MAX_LINE_LEN``.
        :return: one entry per input sentence -- the result of
            ``assign_speakers`` applied to that dialog's non-empty lines.
        """
        dialogs = list(sentences)  # private copy of the caller's list
        batch_size = len(dialogs)
        temperature = temperature or DEFAULT_TEMPERATURE
        max_len = max_len or DEFAULT_MAX_LINE_LEN

        for _ in range(n_lines):
            # NOTE(review): each dialog is a string here, so EOS_TOKEN.join()
            # interleaves the token between its characters -- confirm intent.
            examples = []
            for dialog in dialogs:
                joined = EOS_TOKEN.join(dialog)
                examples.append(Example.fromlist([joined], [('text', field)]))

            dataset = Dataset(examples, [('text', field)])
            dataloader = data.BucketIterator(
                dataset, batch_size, shuffle=False, repeat=False)
            batch = next(iter(dataloader))  # everything fits in one batch

            # Keep only the trailing context window (the original author
            # notes the field pads first, so no information is lost).
            context = batch.text[:, -MAX_CONTEXT_SIZE:]
            text = cudable(context)

            # Each model family needs its own way of producing the state `z`.
            if model_cls_name == 'CharLMFromEmbs':
                initial = lm.init_z(text.size(0), 1)
                z = lm(initial, text, return_z=True)[1]
            elif model_cls_name == 'ConditionalLM':
                initial = cudable(torch.zeros(2, len(text), 2048))
                z = lm(initial, text, style=1, return_z=True)[1]
            elif model_cls_name == 'WeightedLMEnsemble':
                initial = cudable(torch.zeros(2, 1, len(text), 4096))
                z = lm(initial, text, return_z=True)[1]
            else:
                embs = lm.embed(text)
                z = lm.gru(embs)[1]

            if model_cls_name != 'WeightedLMEnsemble':
                batch_dim = 1
            else:
                batch_dim = 2

            state_config = {
                'model': lm,
                'inputs': z,
                'vocab': field.vocab,
                'max_len': max_len,
                # A new reply starts being inferred once EOS is seen.
                'bos_token': EOS_TOKEN,
                'eos_token': EOS_TOKEN,
                'temperature': temperature,
                'sample_type': 'sample',
                'inputs_batch_dim': batch_dim,
                'substitute_inputs': True,
                'kwargs': inference_kwargs,
            }
            generated = InferenceState(state_config).inference()

            generated = itos_many(generated, field.vocab, sep='')
            generated = [slice_unfinished_sentence(line) for line in generated]
            dialogs = [
                dialog + EOS_TOKEN + line
                for dialog, line in zip(dialogs, generated)
            ]

        # Split every dialog back into lines, drop empty ones, tag speakers.
        result = []
        for dialog in dialogs:
            lines = [line for line in dialog.split(EOS_TOKEN) if len(line) != 0]
            result.append(assign_speakers(lines))

        return result
Esempio n. 27
0
 def __call__(self, args):
     """Turn an ``(index, document)`` pair into an ``Example`` using ``self.fields``.

     :param args: two-element iterable; unpacking fails for any other length.
     :return: ``Example`` built from ``[index, document]``.
     """
     index, document = args
     row = [index, document]
     return Example.fromlist(row, self.fields)
Esempio n. 28
0
def translate_a_single_sentence(translation_config):
    """Translate one source sentence with the baseline transformer and print it.

    Runs on CUDA when available and on CPU otherwise. The checkpoint is loaded
    with ``map_location`` set to that device so a checkpoint saved from a GPU
    can also be loaded on a CPU-only machine.

    :param translation_config: mapping read for the keys ``'dataset_path'``,
        ``'language_direction'``, ``'dataset_name'``, ``'model_name'``,
        ``'source_sentence'``, ``'decoding_method'`` and
        ``'visualize_attention'``; it is also forwarded as a whole to
        ``download_models`` and ``get_beam_decoder``.
    :return: ``None``; source tokens and the translation are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else
                          "cpu")  # checking whether you have a GPU

    # Step 1: Prepare the field processor (tokenizer, numericalizer)
    _, _, src_field_processor, trg_field_processor = get_datasets_and_vocabs(
        translation_config['dataset_path'],
        translation_config['language_direction'],
        translation_config['dataset_name'] == DatasetType.IWSLT.name)
    # Both vocabularies must agree on the padding id because a single
    # pad_token_id is used below.
    assert src_field_processor.vocab.stoi[
        PAD_TOKEN] == trg_field_processor.vocab.stoi[PAD_TOKEN]
    pad_token_id = src_field_processor.vocab.stoi[
        PAD_TOKEN]  # needed for constructing masks

    # Step 2: Prepare the model
    baseline_transformer = Transformer(
        model_dimension=BASELINE_MODEL_DIMENSION,
        src_vocab_size=len(src_field_processor.vocab),
        trg_vocab_size=len(trg_field_processor.vocab),
        number_of_heads=BASELINE_MODEL_NUMBER_OF_HEADS,
        number_of_layers=BASELINE_MODEL_NUMBER_OF_LAYERS,
        dropout_probability=BASELINE_MODEL_DROPOUT_PROB,
        log_attention_weights=True).to(device)

    model_path = os.path.join(BINARIES_PATH, translation_config['model_name'])
    if not os.path.exists(model_path):
        print(f'Model {model_path} does not exist, attempting to download.')
        model_path = download_models(translation_config)

    # Remap stored tensors onto the active device; without this a checkpoint
    # saved on a GPU cannot be deserialized on the CPU fallback path.
    model_state = torch.load(model_path, map_location=device)
    print_model_metadata(model_state)
    baseline_transformer.load_state_dict(model_state["state_dict"],
                                         strict=True)
    baseline_transformer.eval()

    # Step 3: Prepare the input sentence
    source_sentence = translation_config['source_sentence']
    ex = Example.fromlist([source_sentence],
                          fields=[('src', src_field_processor)
                                  ])  # tokenize the sentence

    source_sentence_tokens = ex.src
    print(f'Source sentence tokens = {source_sentence_tokens}')

    # Numericalize and place the resulting tensor on the selected device
    src_token_ids_batch = src_field_processor.process([source_sentence_tokens],
                                                      device)

    with torch.no_grad():
        # Step 4: Optimization - compute the source token representations only once
        src_mask, _ = get_masks_and_count_tokens_src(src_token_ids_batch,
                                                     pad_token_id)
        src_representations_batch = baseline_transformer.encode(
            src_token_ids_batch, src_mask)

        # Step 5: Decoding process
        if translation_config['decoding_method'] == DecodingMethod.GREEDY:
            target_sentence_tokens = greedy_decoding(
                baseline_transformer, src_representations_batch, src_mask,
                trg_field_processor)
        else:
            beam_decoding = get_beam_decoder(translation_config)
            target_sentence_tokens = beam_decoding(baseline_transformer,
                                                   src_representations_batch,
                                                   src_mask,
                                                   trg_field_processor)
        print(
            f'Translation | Target sentence tokens = {target_sentence_tokens}')

        # Step 6: Potentially visualize the encoder/decoder attention weights
        if translation_config['visualize_attention']:
            visualize_attention(baseline_transformer, source_sentence_tokens,
                                target_sentence_tokens)
Esempio n. 29
0
 def extract_features(self, instance: Dict[str, object]) -> Example:
     """Build an ``Example`` holding the delimited syllable sequence of ``instance``.

     ``instance['contents']`` is cut to ``MAX_LEN - 2`` items (leaving room for
     the two delimiter tokens), every ``' '`` is replaced by ``SPACE_TOKEN``,
     and the result is wrapped in ``INIT_TOKEN`` / ``EOS_TOKEN``.

     :param instance: mapping with a sliceable ``'contents'`` entry.
     :return: ``Example`` with the sequence stored in ``syllable_contents``.
     """
     truncated = instance['contents'][:self.MAX_LEN - 2]
     syllables = []
     for item in truncated:
         if item == ' ':
             syllables.append(self.SPACE_TOKEN)
         else:
             syllables.append(item)
     ex = Example()
     ex.syllable_contents = [self.INIT_TOKEN] + syllables + [self.EOS_TOKEN]
     return ex
Esempio n. 30
0
 def __getitem__(self, i):
     """Return item ``i`` as an ``Example``, reading ``self.ds`` in cached windows.

     When ``i`` lies outside the window starting at ``self.cache_idx`` and
     spanning ``CACHE_SIZE`` items, the window is re-read from ``self.ds``
     starting at ``i``.
     """
     window_start = self.cache_idx
     if i < window_start or i >= window_start + self.CACHE_SIZE:
         # Cache miss: slide the window so that it begins at the request.
         window_start = i
         self.cache_idx = window_start
         self.cache = self.ds[i:i + self.CACHE_SIZE]
     row = self.cache[i - window_start]
     return Example.fromlist(row, self.fields)
Esempio n. 31
0
 def process(sample):
     """Convert ``sample`` into an ``Example`` of (text, text, mapped label).

     The text is supplied twice and the label is translated through the
     outer ``mapping`` before being handed to the outer ``fields``.
     """
     row = [sample.text, sample.text, mapping[sample.label]]
     return Example.fromlist(row, fields)
    def classify_from_file(self,
                           file_name,
                           batch_size: int = 5,
                           delimiter: str = ",",
                           quotechar: str = '"',
                           text_col_name: str = 'text'):
        """
        Read a CSV file, parse its text column into examples and classify them.

        :param file_name: string specifying the location and name of the file that contains the data
        :param delimiter: string specifying the delimiter used in the csv file
        :param quotechar: string specifying the quotechar used in the csv file
        :param text_col_name: string specifying the name of the column containing the mails in the csv file
        :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into
            the model; this can be set lower if memory issues occur
        :return: mapping from each target name to a list of results, where the result indices from the model
            have been converted back to the original class names
        :raises AssertionError: when the model has not been trained
        """
        # Raise explicitly (same exception type as before) so the check also
        # holds when Python runs with assertions disabled.
        if not self.has_trained:
            raise AssertionError("the model must be trained before classifying")

        texts = pd.read_csv(file_name, sep=delimiter,
                            quotechar=quotechar)[text_col_name].tolist()

        # The field is always registered under the name 'text', whatever the
        # CSV column is called.
        fields = [('text', self._TEXT)]

        list_of_examples = [Example.fromlist([text], fields) for text in texts]
        dataset = torchtext.data.Dataset(list_of_examples, fields)

        data = Iterator(dataset,
                        batch_size=batch_size,
                        device=torch.device("cpu"),
                        sort=False,
                        sort_within_batch=False,
                        repeat=False,
                        shuffle=False)

        # Set the model to evaluation mode once, important because of the Dropout Layers
        self.model.to(self.device)
        self.model = self.model.eval()

        predictions = defaultdict(list)

        for item in data:
            # Read the batch attribute by the field name used above; using the
            # CSV column name here failed for any column not called 'text'.
            x = item.text
            outputs = self.model(x.to(self.device),
                                 tower=self.target_names_list)
            for i, target_name in enumerate(self.target_names_list):
                predictions[target_name].extend(
                    outputs[i].detach().cpu().argmax(1).tolist())

        # Translate predicted class indices back into their label names.
        results = defaultdict(list)
        for key, indices in predictions.items():
            results[key] = [self._label_names[key][i] for i in indices]
        return results