def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) text_a = tokenization.convert_to_unicode(line[8]) text_b = tokenization.convert_to_unicode(line[9]) label = tokenization.convert_to_unicode(line[-1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] # one line meaning one line for original file for (i, line) in enumerate(lines): # line: <class 'list'>: ['game', '5000万,投资游戏该有多好!【游戏智库每周游报】 休息两相关部门人士求..........'] guid = "%s-%s" % (set_type, i) # train-0 text_a = tokenization.convert_to_unicode(line[1]) # train-05000万,投资游戏该有多好!【游戏智库每周游报】 休息两天-------->unicode label = tokenization.convert_to_unicode(line[0]) # game self.labels.add(label) examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def get_dev_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%d" % (i) language = tokenization.convert_to_unicode(line[0]) if language != tokenization.convert_to_unicode(self.language): continue text_a = tokenization.convert_to_unicode(line[6]) text_b = tokenization.convert_to_unicode(line[7]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, df: pd.DataFrame, set_type: str): """Creates examples for the training and dev sets.""" examples = [] for i, row in df.iterrows(): if set_type == 'test': guid = row['id'] else: guid = row['ID'] text_a = tokenization.convert_to_unicode(row['title']) if pd.isna(row['keyword']): text_b = '' else: text_b = tokenization.convert_to_unicode(row['keyword']) if set_type == 'test': label = None else: label = tokenization.convert_to_unicode(str(row['label'])) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples