def create_examples(self, datas, set_type):
    """Turn positional records into single-sentence ``InputExample`` objects.

    For every record, field 0 becomes the guid, field 1 (with surrounding
    whitespace stripped) becomes ``text_a`` and field 2 becomes the label.

    Args:
        datas: Iterable of indexable records.
        set_type: Split name; for ``'test'`` the label is ``None`` and
            field 2 is never read.

    Returns:
        A list of ``InputExample`` objects in input order.
    """
    is_test = set_type == 'test'
    built = []
    for record in datas:
        identifier = record[0]
        sentence = record[1].strip()
        # Labels go through int() first, so e.g. 1.0 is stored as "1".
        label = None if is_test else str(int(record[2]))
        built.append(
            InputExample(guid=identifier, text_a=sentence, text_b=None,
                         label=label))
    return built
def similarity_with_concepts(self, text, concepts):
    """Score ``text`` against each entry of ``concepts`` in one batch.

    One ``(text, concept)`` pair is built per concept, converted to features
    with ``glue_convert_examples_to_features`` (regression mode, max length
    128) and passed through the model without gradient tracking.

    NOTE(review): this function uses the module-level names ``tokenizer`` and
    ``model`` rather than attributes of ``self`` -- confirm that is intended.

    Args:
        text: Left-hand text of every pair.
        concepts: Iterable of right-hand texts.

    Returns:
        The first row of the transposed first model output, as a Python list
        (presumably one score per concept -- assumes the output has shape
        (len(concepts), 1); verify against the model).
    """
    pairs = []
    for concept in concepts:
        pairs.append(InputExample(guid='0', text_a=text, text_b=concept))

    features = glue_convert_examples_to_features(
        examples=pairs,
        tokenizer=tokenizer,
        max_length=128,
        output_mode='regression',
        label_list=[None],
    )
    input_ids = torch.tensor([item.input_ids for item in features])
    attention_mask = torch.tensor([item.attention_mask for item in features])

    with torch.no_grad():
        model_output = model(input_ids=input_ids,
                             attention_mask=attention_mask)

    first_output = model_output[0]
    return first_output.T.tolist()[0]
def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] text_index = 0 for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = line[text_index] label = line[1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def _create_examples_adv(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # if i == 0: # continue guid = "%s-%s" % (set_type, i) text_a = line label = "0" ## label here doesn't matter examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def _create_examples(self, lines_in, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for i, line in enumerate(lines_in[1:]): guid = "%s-%s" % (set_type, i) text_a = line[1] text_b = '' label = None if set_type == "predict" else line[0] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[7] text_b = line[8] label = line[-1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def similarity_with_concept(self, text, concept):
    """Score a single ``(text, concept)`` pair with ``self.model``.

    The pair is converted to features with
    ``glue_convert_examples_to_features`` (regression mode, max length 128,
    using ``self.tokenizer``), given a leading batch dimension of 1 and run
    through the model without gradient tracking.

    Args:
        text: Left-hand text of the pair.
        concept: Right-hand text of the pair.

    Returns:
        ``outputs[0].item()`` -- the first model output as a Python scalar
        (assumes that output holds exactly one element -- TODO confirm).
    """
    pair = InputExample(guid='0', text_a=text, text_b=concept)
    features = glue_convert_examples_to_features(
        examples=[pair],
        tokenizer=self.tokenizer,
        max_length=128,
        output_mode='regression',
        label_list=[None],
    )
    only_feature = features[0]
    # unsqueeze(0) adds the batch dimension for this single example.
    input_ids = torch.tensor(only_feature.input_ids).unsqueeze(0)
    attention_mask = torch.tensor(only_feature.attention_mask).unsqueeze(0)

    with torch.no_grad():
        model_output = self.model(input_ids=input_ids,
                                  attention_mask=attention_mask)
    return model_output[0].item()
def load_pr_curve_data(world, split):
    """Build question-pair examples for a precision/recall curve.

    Args:
        world: Object providing ``GetDPRCP(test=...)``, which returns
            ``(pairs, tags)``, and an ``id_to_text`` mapping from question id
            to text.
        split: Split name; ``'test'`` selects the test pairs, anything else
            the non-test pairs.

    Returns:
        ``(examples, tags)`` where ``examples`` has one ``InputExample`` per
        id pair (label always ``'0'``) and ``tags`` is passed through
        unchanged from ``GetDPRCP``.
    """
    use_test = split == 'test'
    id_pairs, tags = world.GetDPRCP(test=use_test)

    examples = []
    for index, (qid1, qid2) in enumerate(id_pairs):
        # An empty question breaks the RoBERTa tokenizer (per the original
        # author), so empty text is replaced by a single space.
        question_a = world.id_to_text[qid1] or ' '
        question_b = world.id_to_text[qid2] or ' '
        guid = '{}-{}-pr-{}-{}'.format(split, index, qid1, qid2)
        examples.append(
            InputExample(guid=guid, text_a=question_a, text_b=question_b,
                         label='0'))
    return examples, tags
def get_train_examples(self, data_dir):
    """Load training examples from ``doordash_categorized.pkl``.

    The pickle file is expected to hold an iterable of dict entries with the
    keys ``'key'``, ``'title'`` and ``'labels'`` (itself a dict with a
    ``'category'`` key). Entries whose category is in
    ``self.excluded_labels`` are dropped. The resolved file path is printed.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point this
    at trusted files.

    Args:
        data_dir: Directory containing ``doordash_categorized.pkl``.

    Returns:
        A list of single-sentence ``InputExample`` objects.
    """
    datafile = os.path.join(data_dir, 'doordash_categorized.pkl')
    print(datafile)
    with open(datafile, 'rb') as handle:
        entries = pickle.load(handle)

    examples = []
    for entry in entries:
        guid = entry['key']
        title = entry['title']
        category = entry['labels']['category']
        if category not in self.excluded_labels:
            examples.append(
                InputExample(guid=guid, text_a=title, label=category))
    return examples
def _create_examples(self, lines, set_type): examples = [] for (_, line) in enumerate(lines): guid = "%s-%s" % (set_type, line[0]) try: text_a = line[1].lower() text_b = line[2].lower() label = line[3] except IndexError: print('cannot read the line: ' + line) continue examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def create_examples(filename):
    """Read a CSV file and build unlabeled single-sentence examples.

    The file is loaded with ``pd.read_csv``; for every data row, column 1 is
    used as the guid and column 2 (stripped of surrounding whitespace) as the
    text.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        A list of ``InputExample`` objects with ``text_b`` and ``label`` set
        to ``None``.
    """
    records = pd.read_csv(filename).values.tolist()
    return [
        InputExample(
            guid=record[1],
            text_a=record[2].strip(),
            text_b=None,
            label=None,
        )
        for record in records
    ]
def _create_examples(self, lines: List[List[str]], set_type: str) -> List[InputExample]:
    """Build sentence-pair examples from rows, skipping the header row.

    Args:
        lines: Rows of string columns. Columns 5 and 6 hold the two texts and
            column 0 the raw label, which is passed through
            ``self._preprocess_label``.
        set_type: Split name, used as the guid prefix.

    Returns:
        A list of ``InputExample`` objects with guids ``"<set_type>-<row>"``,
        where ``<row>`` is the row's position in ``lines``.
    """
    return [
        InputExample(guid="%s-%s" % (set_type, row_number),
                     text_a=row[5],
                     text_b=row[6],
                     label=self._preprocess_label(row[0]))
        for row_number, row in enumerate(lines)
        if row_number != 0
    ]
def _get_test_or_dev_helper(self, data_dir, stage): if stage == "dev": language = self.dev_language elif stage == "test": language = self.test_language rows = self._read_xnli_tsv( os.path.join(data_dir, "XNLI-1.0", f"xnli.{stage}.tsv")) examples = [ InputExample( guid=f"{stage}-{i}", text_a=row["sentence1"], text_b=row["sentence2"], label=row["gold_label"], ) for i, row in enumerate(rows) if row["language"] == language ] return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) text_a = line[0] text_b = line[1] if "imho" in text_a or "imo" in text_a: #Gengyu: shoud I remove this two words from data label = "1" else: label = "0" examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = self.get_labels()[0] if set_type == 'test' else line[-1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == 'test': text_a = line[1] label = '-1' else: label = line[0] text_a = line[1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def get_train_examples(self, data_dir):
    """Load the machine-translated MultiNLI training examples.

    Reads
    ``<data_dir>/XNLI-MT-1.0/multinli/multinli.train.<train_language>.tsv``
    through ``self._read_xnli_tsv``. The label value ``"contradictory"`` is
    rewritten to ``"contradiction"``; every other label is kept as is.

    Args:
        data_dir: Root directory containing the ``XNLI-MT-1.0`` folder.

    Returns:
        A list of ``InputExample`` objects with guids ``"train-<row>"``.
    """
    train_path = os.path.join(
        data_dir,
        "XNLI-MT-1.0",
        "multinli",
        f"multinli.train.{self.train_language}.tsv",
    )
    examples = []
    for index, row in enumerate(self._read_xnli_tsv(train_path)):
        premise = row["premise"]
        hypothesis = row["hypo"]
        label = row["label"]
        if label == "contradictory":
            label = "contradiction"
        examples.append(
            InputExample(
                guid=f"train-{index}",
                text_a=premise,
                text_b=hypothesis,
                label=label,
            ))
    return examples
def _create_examples(self, lines, set_type): examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] if set_type == "test": label = -1 else: label = line[0] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines: List[List[str]], set_type: str) -> List[InputExample]:
    """Build sentence-pair examples from rows, skipping the header row.

    Args:
        lines: Rows of string columns. Column 0 is the row id, columns 8 and
            9 hold the two texts and the last column the raw label.
        set_type: Split name, used as the guid prefix. When it starts with
            ``"test"`` the label is ``None``; otherwise the last column is
            passed through ``self._preprocess_label``.

    Returns:
        A list of ``InputExample`` objects with guids ``"<set_type>-<id>"``.
    """
    unlabeled = set_type.startswith("test")
    examples = []
    for position, row in enumerate(lines):
        if position == 0:
            continue  # header row
        guid = "%s-%s" % (set_type, row[0])
        first_text, second_text = row[8], row[9]
        if unlabeled:
            label = None
        else:
            label = self._preprocess_label(row[-1])
        examples.append(
            InputExample(guid=guid, text_a=first_text, text_b=second_text,
                         label=label))
    return examples
def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" test_mode = set_type == "test" q1_index = 1 if test_mode else 3 q2_index = 2 if test_mode else 4 examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) try: text_a = line[q1_index] text_b = line[q2_index] label = None if test_mode else line[5] except IndexError: continue examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, set_type):
    """Build premise/hypothesis examples from one split of the 'anli' dataset.

    The dataset is loaded with ``nlp.load_dataset('anli')`` on every call.
    Integer labels 0, 1 and 2 are mapped to ``"entailment"``, ``"neutral"``
    and ``"contradiction"``; any other label value raises ``KeyError``.

    Args:
        set_type: Key of the split to read from the loaded dataset; also used
            as the guid prefix.

    Returns:
        A list of ``InputExample`` objects with guids ``"<set_type>-<uid>"``.
    """
    from nlp import load_dataset

    label_names = dict(enumerate(("entailment", "neutral", "contradiction")))
    split = load_dataset('anli')[set_type]
    # Labels are mapped for every split, test splits included.
    return [
        InputExample(guid="%s-%s" % (set_type, record['uid']),
                     text_a=record['premise'],
                     text_b=record['hypothesis'],
                     label=label_names[record['label']])
        for record in split
    ]
def _rows2examples(self, rows):
    """Convert rows into examples made of their first one or two sentences.

    Column 1 of each row holds the text serialized as a Python bytes literal
    (for example ``"b'hola'"``). It is parsed, decoded and split into
    sentences with NLTK's Spanish Punkt tokenizer. The first sentence becomes
    ``text_a`` and the second, when present, ``text_b``; later sentences are
    discarded.

    Args:
        rows: Sequence of indexable rows; column 0 is the label and column 1
            the serialized text.

    Returns:
        A list of ``InputExample`` objects with guids ``"test-<row>"``.

    Raises:
        ValueError, SyntaxError: If column 1 is not a valid Python literal.
        IndexError: If the tokenizer returns no sentence for a row.
    """
    import ast

    # Spanish sentence tokenizer
    tokenizer = nltk.data.load("tokenizers/punkt/PY3/spanish.pickle")
    examples = []
    logger.info("Reading examples")
    for i, row in enumerate(tqdm(rows)):
        # The text column was saved using Python's bytes-literal syntax.
        # ast.literal_eval parses literals only, so unlike eval() it cannot
        # execute code embedded in the data file.
        raw_bytes = ast.literal_eval(row[1])
        sentences = tokenizer.tokenize(raw_bytes.decode())
        example = InputExample(
            f"test-{i}",
            sentences[0],
            sentences[1] if len(sentences) > 1 else None,
            label=row[0],
        )
        examples.append(example)
    return examples
def _create_test_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples, rows = [], [] for (_, line) in enumerate(lines): guid = "%s-%s" % (set_type, line[0]) try: text_a = line[1].lower() text_b = line[2].lower() label = line[3] except IndexError: print('cannot read the line: ' + line) continue examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) rows.append(line) return examples, rows
def _load_data(self, dir, set_type):
    """Collect examples from every ``.ann`` file found directly in ``dir``.

    Each file is read with ``self._read_tsv``. A row whose first field starts
    with ``"T"`` supplies the text (third field, with ``"CMV:"`` removed and
    whitespace stripped) and a new guid. A row whose first field starts with
    ``"A"`` supplies the label (third space-separated token of its second
    field) and emits an example using the most recently seen text.

    NOTE(review): ``guid`` and ``text_a`` are only bound by a "T" row, so an
    "A" row that appears before any "T" row raises ``UnboundLocalError``, and
    their values carry over between rows and between files. This presumably
    relies on each "A" row directly following its "T" row -- confirm against
    the annotation files.

    NOTE(review): files are visited in ``os.listdir`` order, which is not
    guaranteed to be sorted, so guid numbering can vary between runs.

    Args:
        dir: Directory to scan (not recursive).
        set_type: Split name, used as the guid prefix.

    Returns:
        A list of single-sentence ``InputExample`` objects.
    """
    examples = []
    for filename in os.listdir(dir):
        if filename.endswith(".ann"):
            lines = self._read_tsv(os.path.join(dir, filename))
            #print(os.path.join(directory, filename))
            for (i, line) in enumerate(lines):
                #line = line.split("\t")
                if line[0].startswith("T"):
                    # The guid counts examples emitted so far, across files.
                    guid = "%s-%s" % (set_type, len(examples))
                    text_a = line[2].replace("CMV:", "").strip()
                if line[0].startswith("A"):
                    label = line[1].split(" ")[2].strip()
                    examples.append(
                        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def _read_tfds_and_create_examples(self, mode):
    """Load one split from tensorflow_datasets into ``self.examples[mode]``.

    Every entry of the split becomes an ``InputExample`` appended to
    ``self.examples[mode]``. Afterwards ``self.labels`` is replaced by the
    union of its previous content and the label strings seen in this split.

    Args:
        mode: Split name passed to ``tfds.load``; also the key of
            ``self.examples`` that receives the new examples.

    Raises:
        RuntimeError: If tensorflow_datasets is not available.
        ValueError: If ``tfds.load`` raises ``KeyError`` for
            ``self.dataset_name``.
    """
    if not is_tfds_available():
        raise RuntimeError(
            "The package tensorflow_datasets can't be imported")

    import tensorflow_datasets as tfds

    try:
        ds, dsinfo = tfds.load(self.dataset_name, split=mode, with_info=True)
    except KeyError as err:
        raise ValueError(
            "The dataset {} does not exists in the tensorflow_datasets catalog."
            .format(self.dataset_name)) from err

    # The feature schema does not change per entry, so check it once.
    feature_names = list(dsinfo.features.keys())
    has_guid = self.guid in feature_names
    has_text_b = self.text_b in feature_names
    label_feature = dsinfo.features[self.label]

    seen_labels = set()
    for ex_index, entry in enumerate(ds):
        if ex_index % 10000 == 0:
            logger.info("Creating example %d", ex_index)

        if has_guid:
            guid = entry[self.guid].numpy()
        else:
            # Fall back to the running index. The previous fallback assigned
            # the builtin ``id`` function itself as the guid.
            guid = ex_index

        if has_text_b:
            text_b = entry[self.text_b].numpy().decode("utf-8")
        else:
            text_b = None

        label = label_feature.int2str(entry[self.label].numpy())
        seen_labels.add(label)

        example = InputExample(guid,
                               entry[self.text_a].numpy().decode("utf-8"),
                               text_b, label)
        self.examples[mode].append(example)

    self.labels = list(set(self.labels).union(seen_labels))
def get_dev_examples(self, data_dir):
    """Load the dev examples from ``<data_dir>/dev.jsonl``.

    Each line is a JSON object with the keys ``'q1'``, ``'q2'`` and
    ``'label'``. The label must be the string ``'1'`` or ``'0'`` and is
    mapped to ``"paraphrase"`` or ``"not-paraphrase"``. The first ten
    examples are printed.

    Args:
        data_dir: Directory containing ``dev.jsonl``.

    Returns:
        A list of ``InputExample`` objects with guids ``"<line index>-dev"``.

    Raises:
        AssertionError: If a label is not ``'1'``/``'0'`` or a text field is
            not a string (these checks are skipped under ``python -O``).
    """
    dev_path = os.path.join(data_dir, "dev.jsonl")
    examples = []
    with open(dev_path, "r") as handle:
        for index, raw_line in enumerate(handle):
            record = json.loads(raw_line.replace("\n", ""))
            guid = "%s-%s" % (index, "dev")
            first_question = record['q1']
            second_question = record['q2']
            raw_label = record['label']
            assert raw_label in ('1', '0')
            if raw_label == "1":
                label = "paraphrase"
            else:
                label = "not-paraphrase"
            assert isinstance(first_question, str), f"Training input {first_question} is not a string"
            assert isinstance(second_question, str), f"Training input {second_question} is not a string"
            assert isinstance(label, str), f"Training label {label} is not a string"
            example = InputExample(guid=guid, text_a=first_question,
                                   text_b=second_question, label=label)
            if index < 10:
                print(example)
            examples.append(example)
    return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # if i == 0: # continue guid = "%s-%s" % (set_type, i) if len(line) < 2: continue text_a = line[0] label = line[1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) # if set_type == "train": # random.shuffle(examples) # print ("Examples are shuffled") return examples
def _create_examples(self, data, set_type): examples = [] raw_texts = data.tweet.values.tolist() raw_labels = data.label.values.tolist() for i in range(0, len(raw_texts)): guid = "%s-%s" % (set_type, i) raw_text = raw_texts[i] raw_label = raw_labels[i] label = raw_label text = self._preprocess_text(raw_text) examples.append( InputExample(guid=guid, text_a=text, text_b=None, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue # guid = "%s-%s" % (set_type, i) guid = line[0] if set_type == 'test': # text_a = line[1] text_a = line[2] label = None else: text_a = line[2] label = line[1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): try: if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = None label = line[0] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) except Exception as e: import pdb pdb.set_trace() print(e) return examples