def batcher(params, batch):
    # Replace empty sentences with a single period, then join tokens back into strings.
    batch = [sent if sent != [] else ['.'] for sent in batch]
    batch = [' '.join(sent) for sent in batch]

    examples = []
    unique_id = 0
    for sent in batch:
        sent = sent.strip()
        examples.append(
            InputExample(unique_id=unique_id, text_a=sent, text_b=None))
        unique_id += 1

    features = convert_examples_to_features(examples, params['bert'].seq_length, tokenizer)
    # Move inputs to the GPU so they match the mask handling below.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).cuda()
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long).cuda()

    all_encoder_layers, _ = params['bert'](all_input_ids,
                                           token_type_ids=None,
                                           attention_mask=all_input_mask)

    # Get the z-vector: take the output of the layer preceding the probed layer.
    prev_out = all_encoder_layers[params['bert'].layer_no - 1]

    # Build the extended attention mask expected by BertSelfAttention.
    extended_attention_mask = all_input_mask.unsqueeze(1).unsqueeze(2)
    extended_attention_mask = extended_attention_mask.to(
        dtype=next(params['bert'].parameters()).dtype)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

    # Re-apply the self-attention sub-layer of the probed layer to prev_out.
    embeddings = next(params['bert'].children()).encoder.layer[
        params['bert'].layer_no].attention.self(prev_out, extended_attention_mask)

    # Mean-pool over the token dimension.
    embeddings = embeddings.detach().mean(1).cpu().numpy()

    # Optionally keep only one attention head's 64-dimensional slice
    # (or a random subset of dimensions).
    if params['bert'].head_no is not None:
        if params['bert'].head_no == 'random':
            embeddings = embeddings[:, params['bert'].randidx]
        else:
            embeddings = embeddings[:, 64 * params['bert'].head_no:
                                       64 * (params['bert'].head_no + 1)]
    return embeddings
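# Hypothetical setup sketch (not part of the original source): `batcher` above expects
# `params['bert']` to be a module whose forward returns (all_encoder_layers, pooled_output)
# and whose first child exposes `.encoder.layer` -- e.g. a thin wrapper around
# pytorch-pretrained-bert's BertModel -- plus the extra attributes seq_length, layer_no,
# head_no and randidx that batcher reads. The wrapper name, attribute defaults, and the
# assumption that `tokenizer`, `InputExample` and `convert_examples_to_features` come from
# the extract_features example and that a GPU is available are all illustrative.
import numpy as np
import torch.nn as nn
from pytorch_pretrained_bert import BertModel, BertTokenizer

class BertWrapper(nn.Module):
    def __init__(self, name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(name)  # first child, exposes .encoder.layer
        self.seq_length = 128     # max length passed to convert_examples_to_features
        self.layer_no = 11        # transformer layer whose self-attention is probed
        self.head_no = 0          # keep head 0's 64-dim slice; None keeps all heads
        self.randidx = np.random.choice(768, 64, replace=False)  # for head_no == 'random'

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        return self.bert(input_ids, token_type_ids=token_type_ids,
                         attention_mask=attention_mask)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
params = {'bert': BertWrapper().cuda().eval()}
# embeddings = batcher(params, [['A', 'sample', 'sentence', '.']])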
def get_examples(self, sents):
    """Read sentences.

    Args:
        sents (list of str): A list of sentences.

    Returns:
        A list of InputExample.
    """
    examples = []
    unique_id = 0
    for sent in sents:
        line = tokenization.convert_to_unicode(sent)
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
def __getitem__(self, item):
    cur_id = self.sample_counter
    self.sample_counter += 1
    if not self.on_memory:
        # After one epoch we start again from the beginning of the file.
        if cur_id != 0 and (cur_id % len(self) == 0):
            self.file.close()
            self.file = open(self.corpus_path, "r", encoding=self.encoding)

    t1, t2, is_next_label = self.random_sent(item)

    # Tokenize both sentences.
    tokens_a = self.tokenizer.tokenize(t1)
    tokens_b = self.tokenizer.tokenize(t2)

    # Combine into one sample.
    cur_example = InputExample(guid=cur_id,
                               tokens_a=tokens_a,
                               tokens_b=tokens_b,
                               is_next=is_next_label)

    # Transform the sample into features.
    cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)

    cur_tensors = (torch.tensor(cur_features.input_ids),
                   torch.tensor(cur_features.input_mask),
                   torch.tensor(cur_features.segment_ids),
                   torch.tensor(cur_features.lm_label_ids),
                   torch.tensor(cur_features.is_next))

    return cur_tensors
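# Hypothetical usage sketch (class name and constructor arguments are assumptions): the
# 5-tuple returned by __getitem__ above is what a torch DataLoader collates into batches
# for masked-LM + next-sentence-prediction training.
from torch.utils.data import DataLoader

# dataset = BERTDataset(corpus_path="corpus.txt", tokenizer=tokenizer,
#                       seq_len=128, on_memory=True)
# loader = DataLoader(dataset, batch_size=32, shuffle=True)
# for input_ids, input_mask, segment_ids, lm_label_ids, is_next in loader:
#     pass  # feed the batch into a BertForPreTraining-style model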
def __convert_text_to_examples(self, text):
    unique_id = 0
    examples = []
    paragraphs = text.split('\n')
    for sentence in paragraphs:
        examples.append(InputExample(unique_id=unique_id,
                                     text_a=self.__normalize_text(sentence),
                                     text_b=None))
        unique_id += 1
    return examples
def read_sequence(input_sentences):
    examples = []
    unique_id = 0
    for sentence in input_sentences:
        line = tokenization.convert_to_unicode(sentence)
        examples.append(InputExample(unique_id=unique_id, text_a=line, text_b=None))
        unique_id += 1
    return examples
def preprocess(text):
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
def preprocess(text):
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    # Step 4: from the previous step we know the input key is input_ids with shape
    # (1, 400); the outer [] gives a batch size of 1.
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return input_ids
def preprocess(text, flag):
    if flag == 'class':
        text_a = " ".join(tokenizer.tokenize(text))
        example = InputExample(unique_id=None, text_a=text_a, text_b=None)
        feature = convert_class_to_features(example, max_token_len, tokenizer)
    else:
        example = InputSeqExample(guid=None, text_token=text, token_label=None)
        feature = convert_seq_to_features(example, max_token_len, tokenizer, label_list)
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
def preprocess(text):
    """Preprocess text into the input payload expected by the model server."""
    vocab_file = os.environ.get("vocab_file", "./dependency/vocab.txt")
    # Environment variables are strings, so cast the sequence length to int.
    max_token_len = int(os.environ.get("max_token_len", 128))
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
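# Hypothetical usage sketch: the dict returned by `preprocess` above follows the
# "inputs" (columnar) format of the TensorFlow Serving REST predict API; the host,
# port and model name below are assumptions, not taken from the original deployment.
import requests

payload = preprocess("BERT turns raw text into fixed-length id sequences.")
resp = requests.post("http://localhost:8501/v1/models/bert:predict", json=payload)
print(resp.json())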
def _create_examples(self, texts_a, texts_b, labels, set_type):
    """Creates examples for the training and dev sets.

    Parameters
    ----------
    texts_a : list
        List of input texts_a (e.g. the first `sentence` for BERT).
    texts_b : list
        List of input texts_b (e.g. the second `sentence` for BERT);
        a list of None if not required.
    labels : list
        List of input labels.
    set_type : str
        Specifies whether the set is 'train' or 'dev'.

    Returns
    -------
    list
        List of InputExample objects.

    Raises
    ------
    ValueError
        If the lengths of the texts and of the labels are incompatible.
    """
    # Enforce the documented contract that the inputs line up.
    if not (len(texts_a) == len(texts_b) == len(labels)):
        raise ValueError("texts_a, texts_b and labels must have the same length")
    examples = []
    for i, (text_a, text_b, label) in enumerate(zip(texts_a, texts_b, labels)):
        unique_id = f"{self.data_name}-{set_type}-{i}"
        text_a = convert_to_unicode(text_a)
        if text_b is not None:
            text_b = convert_to_unicode(text_b)
        if label is not None:
            if isinstance(label, str) and '[' in label and ']' in label:
                # A list of labels serialized as a string is converted back into an
                # actual list, e.g. "['list', 'of', 'strings']" -> ['list', 'of', 'strings'].
                label = ast.literal_eval(label)
            label = convert_to_unicode(label)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a,
                         text_b=text_b, label=label))
    return examples
def read_examples(str_io):
    """Read a list of `InputExample`s from an input stream."""
    examples = []
    unique_id = 0
    while True:
        line = tokenization.convert_to_unicode(str_io.readline())
        if not line:
            break
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
def convert_lines_to_examples(lines):
    """Read a list of `InputExample`s from a list of text lines."""
    examples = []
    unique_id = 0
    for line in lines:
        line = tokenization.convert_to_unicode(line)
        if not line:
            continue
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
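# Usage sketch for the readers above (illustrative input, not from the original repo):
# a line may hold a single sentence or a " ||| "-separated sentence pair, which the
# regex splits into (text_a, text_b).
examples = convert_lines_to_examples([
    "a single sentence",
    "first sentence ||| second sentence",
])
# examples[0]: text_a="a single sentence",  text_b=None
# examples[1]: text_a="first sentence",     text_b="second sentence"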