def _create_examples(self, lines, set_type):
  """Creates examples for the training, dev, and test sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue  # skip the header row
    guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]))
    if set_type == "test":
      # The test set is unlabeled, so use 0.0 as a dummy label.
      text_a = tokenization.convert_to_unicode(line[-2])
      text_b = tokenization.convert_to_unicode(line[-1])
      label = 0.0
    else:
      text_a = tokenization.convert_to_unicode(line[-3])
      text_b = tokenization.convert_to_unicode(line[-2])
      label = float(line[-1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
def _create_examples(self, lines, set_type):
  """Creates examples for the training, dev, and test sets."""
  examples = []
  for (i, line) in enumerate(lines):
    # Only the test set has a header
    if set_type == "test" and i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    if set_type == "test":
      text_a = tokenization.convert_to_unicode(line[1])
      label = "0"  # the test set is unlabeled, so use "0" as a dummy label
    else:
      text_a = tokenization.convert_to_unicode(line[3])
      label = tokenization.convert_to_unicode(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples
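# --- Hedged usage sketch for the two TSV processors above ---
# Both follow BERT's DataProcessor convention: a caller reads a
# tab-separated file into `lines` (a list of column lists) and passes them
# in together with the split name. The reader below mirrors the `_read_tsv`
# helper from the BERT repo; it is shown here as an assumption for context,
# not as the original loader.
import csv

def _read_tsv(input_file):
  # Each row becomes one `line`: a list of column strings.
  with open(input_file, "r", encoding="utf-8") as f:
    return list(csv.reader(f, delimiter="\t"))

# e.g. examples = processor._create_examples(_read_tsv("dev.tsv"), "dev")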
def _create_examples(self, lines, set_type):
  """Creates multiple-choice examples from parsed json lines."""
  examples = []
  for line in lines:
    qid = line['id']
    question = tokenization.convert_to_unicode(line['question']['stem'])
    # Sort the candidate answers by their letter label so that their
    # order matches self.LABELS.
    answers = np.array([
        tokenization.convert_to_unicode(choice['text'])
        for choice in sorted(
            line['question']['choices'], key=lambda c: c['label'])
    ])
    # The test set has no answer key, so use 'A' as a dummy label.
    label = self.LABELS.index(line.get('answerKey', 'A'))
    examples.append(
        InputExample(qid=qid, question=question, answers=answers,
                     label=label))
  return examples
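# --- Hedged illustration of the input expected by the processor above ---
# The field layout matches a CommonsenseQA-style jsonl schema; the concrete
# values and the LABELS constant below are assumptions, shown only to make
# the indexing logic concrete.
LABELS = ['A', 'B', 'C', 'D', 'E']  # assumed value of self.LABELS

sample_line = {
    'id': 'example-0001',  # dummy id
    'question': {
        'stem': 'Where do you keep spare coins?',  # dummy question
        'choices': [
            {'label': 'B', 'text': 'fountain'},
            {'label': 'A', 'text': 'purse'},
        ],
    },
    'answerKey': 'A',  # omitted in the test split, hence the dummy default
}
# Sorting the choices by 'label' puts 'purse' first, so
# LABELS.index('A') == 0 correctly points at it.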
def _create_examples_variant_D(self, lines, set_type):
  """Creates two-choice examples from parsed json lines."""
  examples = []
  for line in lines:
    qid = line['idx']
    premise = tokenization.convert_to_unicode(line['premise'])
    answers = np.array([
        tokenization.convert_to_unicode(line["choice1"]),
        tokenization.convert_to_unicode(line["choice2"])
    ])
    # The test set has no answer key, so use 0 as a dummy label.
    label = line.get('label', 0)
    examples.append(
        InputExample(qid=qid, question=premise, answers=answers,
                     label=label))
  return examples
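# --- Hedged usage sketch for the two multiple-choice processors above ---
# Both expect `lines` to be already-parsed json objects; variant D follows
# the SuperGLUE COPA schema (premise/choice1/choice2/label/idx). The reader
# below illustrates that calling convention and is not the original loader
# from either codebase.
import json

def _read_jsonl(input_file):
  with open(input_file, "r", encoding="utf-8") as f:
    return [json.loads(l) for l in f if l.strip()]

# e.g. examples = processor._create_examples_variant_D(
#     _read_jsonl("val.jsonl"), "dev")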
import tensorflow as tf
from tqdm import tqdm

import tokenization  # from the BERT repo


def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  #     entire paragraphs or arbitrary spans of text, because the sentence
  #     boundaries are used for the "next sentence prediction" task.
  # (2) Blank lines between documents. Document boundaries are needed so
  #     that the "next sentence prediction" task doesn't span documents.
  print('converting to unicode...')
  for input_file in input_files:
    with tf.gfile.GFile(input_file, "r") as reader:
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          break
        line = line.strip()
        # Empty lines are used as document delimiters.
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)
  print('done')

  # Remove empty documents.
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  print('processing documents...')
  for dupe_index in range(dupe_factor):
    print('dupe_factor', dupe_index)
    for document_index in tqdm(range(len(all_documents))):
      # `create_instances_from_document` is defined elsewhere in the same
      # module; it builds the masked-LM / next-sentence instances for one
      # document.
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

  rng.shuffle(instances)
  return instances
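# --- Hedged usage sketch for create_training_instances above ---
# Mirrors how BERT's create_pretraining_data.py drives the function; the
# numeric values are that script's documented defaults, and "vocab.txt" /
# "corpus.txt" are placeholder paths.
import random

tokenizer = tokenization.FullTokenizer(
    vocab_file="vocab.txt", do_lower_case=True)
rng = random.Random(12345)  # the BERT script's default random seed

instances = create_training_instances(
    input_files=["corpus.txt"],
    tokenizer=tokenizer,
    max_seq_length=128,
    dupe_factor=10,
    short_seq_prob=0.1,
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
    rng=rng)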