def process_data(hp):
    tokenizer = create_tokenizer_from_hub_module(hp)
    train, test = download_and_load_datasets()
    # print(train)
    # train = train.sample(5000)
    # test = test.sample(5000)

    # Use the InputExample class from BERT's run_classifier code to create
    # examples from the data
    train_InputExamples = train.apply(
        lambda x: bert.run_classifier.InputExample(
            guid=None,  # Globally unique ID for bookkeeping, unused in this example
            text_a=x[hp.DATA_COLUMN],
            text_b=None,
            label=x[hp.LABEL_COLUMN]),
        axis=1)
    test_InputExamples = test.apply(
        lambda x: bert.run_classifier.InputExample(
            guid=None,
            text_a=x[hp.DATA_COLUMN],
            text_b=None,
            label=x[hp.LABEL_COLUMN]),
        axis=1)

    # print(tokenizer.tokenize("This here's an example of using the BERT tokenizer"))

    # Convert our train and test examples to InputFeatures that BERT understands.
    label_list = [int(i) for i in hp.label_list.split(",")]
    train_features = run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, hp.MAX_SEQ_LENGTH, tokenizer)
    test_features = run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, hp.MAX_SEQ_LENGTH, tokenizer)
    return train_features, test_features
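# Hypothetical usage sketch (not part of the original code): process_data()
# only reads the attributes referenced above, so any object exposing them
# works; create_tokenizer_from_hub_module() may read additional fields that
# are not shown here. The values below are illustrative only.
from types import SimpleNamespace

hp = SimpleNamespace(
    DATA_COLUMN="sentence",    # column holding the raw text
    LABEL_COLUMN="polarity",   # column holding the integer label
    label_list="0,1",          # comma-separated label ids, parsed above
    MAX_SEQ_LENGTH=128,        # sequence length used for padding/truncation
)
train_features, test_features = process_data(hp)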
def do_eval(self):
    print(f"[INFO] Started working on evaluation...")
    print("[INFO] Preparing test InputExample...")
    test_inputExamples = self.test.apply(
        lambda x: run_classifier.InputExample(
            guid=None,
            text_a=x['sentence'],
            text_b=None,
            label=x['label']),
        axis=1)
    print("[INFO] Done preparing test InputExample...\n")

    label_list = list(range(len(self.labels)))

    print("[INFO] Preparing test features...")
    test_features = run_classifier.convert_examples_to_features(
        test_inputExamples, label_list, self.max_seq_length, self.tokenizer)
    print("[INFO] Done preparing test features...\n")

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=self.max_seq_length,
        is_training=False,
        drop_remainder=False)

    print(f'[INFO] Begin evaluating...!')
    result = self.estimator.evaluate(input_fn=test_input_fn, steps=None)
    print(f"[INFO] Done evaluating...\n")

    for key in sorted(result.keys()):
        print(f"[INFO] {key} = {result[key]}")
def imdb_to_bert_features(records, max_seq_length, repo_path, train=True):
    repo = repo_path
    model = 'uncased_L-12_H-768_A-12'
    pretrained_dir = f'{repo}/{model}'
    vocab_file = os.path.join(pretrained_dir, 'vocab.txt')
    do_lower_case = model.startswith('uncased')

    records_txt_v, records_lbl_v = records[:, 0], records[:, 1:]
    get_id = lambda v: str(np.argmax(v))
    get_str = lambda v: str(v)
    records_txt = list(map(get_str, records_txt_v))
    records_lbl = list(map(get_id, records_lbl_v))
    label_list = ['0', '1']

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    guid = 'train' if train else 'test'
    records_examples = create_bert_examples(records_txt, guid, labels=records_lbl)
    records_features = run_classifier.convert_examples_to_features(
        records_examples, label_list, max_seq_length, tokenizer)
    return records_features
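# Illustrative input (assumed, not from the source): imdb_to_bert_features()
# slices column 0 as the review text and the remaining columns as a one-hot
# label, so a record looks like [text, negative, positive].
import numpy as np

records = np.array([
    ["a gripping, well acted film", 0, 1],  # argmax of (0, 1) -> label '1'
    ["dull and far too long", 1, 0],        # argmax of (1, 0) -> label '0'
], dtype=object)
features = imdb_to_bert_features(records, max_seq_length=128,
                                 repo_path="/path/to/bert_checkpoints")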
def getPrediction(in_sentences):
    labels = ["Negative", "Positive"]
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]  # here, "" is just a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, prediction['probabilities'], labels[prediction['labels']])
            for sentence, prediction in zip(in_sentences, predictions)]
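# Illustrative call (assumes the surrounding notebook defines `estimator`,
# `tokenizer`, `label_list` and MAX_SEQ_LENGTH as globals, as above):
pred_sentences = [
    "That movie was absolutely awful",
    "The plot was thin but the acting carried it",
]
for sentence, probabilities, label in getPrediction(pred_sentences):
    print(f"{label:>8}  {probabilities}  {sentence}")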
def predict(self):
    """Predict and store the predicted label in self.predicted_class."""
    if not self.empty:
        # See run_classifier.py from the BERT repo for more details.
        input_examples = [
            run_classifier.InputExample(guid="",
                                        text_a=self.clean_text,
                                        text_b=None,
                                        label=self.label_list[0])
        ]  # here, "" is just a dummy label
        self.input_features = run_classifier.convert_examples_to_features(
            input_examples, self.label_list, self.max_seq_length, self.tokenizer)
        input_ids = np.array(self.input_features[0].input_ids).reshape(
            -1, self.max_seq_length)
        input_mask = np.array(self.input_features[0].input_mask).reshape(
            -1, self.max_seq_length)
        label_ids = np.array(self.input_features[0].label_id).reshape(-1)
        segment_ids = np.array(self.input_features[0].segment_ids).reshape(
            -1, self.max_seq_length)
        result = SinglePredict.sess.run(
            SinglePredict.tensor_outputs,
            feed_dict={
                SinglePredict.tensor_input_ids: input_ids,
                SinglePredict.tensor_input_mask: input_mask,
                SinglePredict.tensor_label_ids: label_ids,
                SinglePredict.tensor_segment_ids: segment_ids
            })
        self.predicted_class = self.label_list[result]
def model_eval(estimator, processor, input_dir, label_list, max_sequence_length,
               tokenizer, eval_batch_size, output_dir):
    eval_examples = processor.get_dev_examples(input_dir)
    eval_features = run_classifier.convert_examples_to_features(
        eval_examples, label_list, max_sequence_length, tokenizer)
    print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))
    print('  Num examples = {}'.format(len(eval_examples)))
    print('  Batch size = {}'.format(eval_batch_size))
    # Eval will be slightly WRONG on the TPU because it will truncate
    # the last batch.
    eval_steps = int(len(eval_examples) / eval_batch_size)
    eval_input_fn = run_classifier.input_fn_builder(
        features=eval_features,
        seq_length=max_sequence_length,
        is_training=False,
        drop_remainder=True)
    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))
    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
        print("***** Eval results *****")
        for key in sorted(result.keys()):
            print('  {} = {}'.format(key, str(result[key])))
            writer.write("%s = %s\n" % (key, str(result[key])))
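# Illustrative arithmetic for the TPU truncation note in model_eval above
# (the numbers are assumed, not from the source):
num_dev_examples = 1043
eval_batch_size = 32
eval_steps = int(num_dev_examples / eval_batch_size)        # 32 full batches
dropped = num_dev_examples - eval_steps * eval_batch_size   # 19 examples never scored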
def getRatings(self, in_sentences):
    # print(in_sentences)
    # 1
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label="1")
        for x in in_sentences
    ]  # here, "" is just a dummy label
    # 2
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, self.tokenizer
    )  # [<bert.run_classifier.InputFeatures object at 0x0000015AF00D7448>]
    # 3
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False
    )  # function input_fn_builder.<locals>.input_fn at 0x0000015B23F65678
    # 4
    predictions = self.estimator.predict(input_fn=predict_input_fn)
    return predictions
def getPrediction(in_sentences):
    labels = [0, 1, 2]
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]  # here, "" is just a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    indice = 0
    resultado = []
    try:
        for prediction in predictions:
            # from sklearn.preprocessing import label_binarize
            resultado.append((in_sentences[indice],
                              prediction['probabilities'],
                              label_binarize([labels[prediction['labels']]],
                                             classes=[0, 1, 2])[0]))
            print(str(indice) + "###" + str(len(in_sentences)))
            indice = indice + 1
    except Exception as e:
        print(e)
        return resultado
    return resultado
def evaluate(self, data_dir, raw_score_output_dir=None):
    """Evaluate on a directory that contains a "test.tsv".

    :param data_dir:
    :return:
    """
    test_examples = self._processor.get_test_examples(data_dir)
    label_list = self._processor.get_labels()
    tokenizer = self._tokenizer
    test_features = convert_examples_to_features(
        test_examples, label_list, self._config["max_seq_length"], tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features],
                                 dtype=torch.long)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids)
    return self._evaluate(test_data, raw_score_output_dir)
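# Not from the source: self._evaluate() is not shown above. A minimal sketch
# of how a TensorDataset built this way is typically batched and iterated.
from torch.utils.data import DataLoader, SequentialSampler

def iterate_test_batches(test_data, batch_size=32):
    loader = DataLoader(test_data,
                        sampler=SequentialSampler(test_data),
                        batch_size=batch_size)
    for input_ids, input_mask, segment_ids, label_ids in loader:
        yield input_ids, input_mask, segment_ids, label_ids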
def getPrediction(in_sentences):
    labels = ["Not an error", "Is an error"]
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x[0], text_b=x[1], label=0)
        for x in in_sentences
    ]  # here, "" is just a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(
        predict_input_fn,
        checkpoint_path="./" + OUTPUT_DIR + "/model.ckpt-56165",
        yield_single_examples=False)
    prediction_list = []
    i = 0
    for prediction in predictions:
        if hasattr(prediction["labels"], '__len__'):
            for j in range(len(prediction["labels"])):
                prediction_list.append(
                    (in_sentences[i], prediction["probabilities"][j],
                     labels[prediction["labels"][j]]))
                i += 1
        else:
            prediction_list.append(
                (in_sentences[i], prediction["probabilities"],
                 labels[prediction["labels"]]))
            i += 1
    return prediction_list
def get_final_predictions(in_contexts, in_last_sentences, tokenizer,
                          estimator: tf.estimator.Estimator, label_list):
    """Return the log probabilities based on the story context and the endings proposed.

    Parameters
    ----------
    in_contexts: str of the story context
    in_last_sentences: proposed last sentence
    tokenizer: bert tokenizer
    estimator: tf.estimator
    label_list: possible values
    """
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=y, label=0)
        for x, y in zip(in_contexts, in_last_sentences)
    ]  # here, "" is just a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, flags.max_seq_length, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=flags.max_seq_length,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    predictions = [prediction['probabilities'] for prediction in predictions]
    return predictions
def predict(self, sent1, sent2):
    """Predict on a single sentence pair (sent1, sent2); the choice of the
    inputs depends on the task at hand:
    - Relevance (claim, perspective)
    - Stance (claim, perspective)
    - Equivalence (claim + perspective1, perspective2)
    - Evidence (claim + perspective, evidence)

    :param sent1: first sentence of the pair
    :param sent2: second sentence of the pair
    :return: the confidence value of the output label
    """
    label_list = self._processor.get_labels()
    example = InputExample(guid="dummy", text_a=sent1, text_b=sent2,
                           label=label_list[0])
    feature = convert_examples_to_features([example], label_list,
                                           self._config["max_seq_length"],
                                           self._tokenizer)[0]
    self._model.eval()
    with torch.no_grad():
        input_ids_tensor = torch.tensor([feature.input_ids])
        segment_ids_tensor = torch.tensor([feature.segment_ids])
        input_mask_tensor = torch.tensor([feature.input_mask])
        output = self._model(input_ids_tensor, segment_ids_tensor)  # , input_mask_tensor
        # print(output)
        return output.detach().cpu().numpy()[0]
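# Illustrative call (the instance name `clf` is assumed; what the scores mean
# depends on which of the four tasks the checkpoint was trained for):
scores = clf.predict(
    "Animal testing should be banned",
    "Experiments on animals cause unnecessary suffering")
print(scores)  # one raw score per label in self._processor.get_labels()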
def evaluate(self, examples):
    features = run_classifier.convert_examples_to_features(
        examples, self.label_list, self.max_len, self.tokenizer)
    # input_fn_builder in BERT's run_classifier.py takes drop_remainder as a
    # required argument, so pass it explicitly.
    input_fn = run_classifier.input_fn_builder(features=features,
                                               seq_length=self.max_len,
                                               is_training=False,
                                               drop_remainder=False)
    self.estimator.evaluate(input_fn=input_fn, steps=None)
def getListPrediction(self, in_sentences):
    # print(in_sentences)
    # 1
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label="0")
        for x in in_sentences
    ]  # here, "" is just a dummy label
    # 2
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, self.tokenizer)
    # 3
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    # 4
    predictions = self.estimator.predict(input_fn=predict_input_fn)
    return predictions
def getPrediction(in_sentences):
    labels = ["Non-Sensitive", "Sensitive"]
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]  # here, "" is just a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    estimator = run()
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, prediction['probabilities'], labels[prediction['labels']])
            for sentence, prediction in zip(in_sentences, predictions)]

# pred_sentences = [
#     "He drinks apple",
#     "He drinks milk",
#     "A mosquito stings me",
#     "I sting a mosquito",
#     "A niece is a person.",
#     "A giraffe is a person.",
#     "I like to ride my chocolate",
#     "I like to ride my bike",
#     "he put elephant into the jug",
# ]
# predictions = getPrediction(pred_sentences)
# print(predictions, "predictions")
def getPrediction(in_sentences, type_output="features"):
    # A list to map the actual labels to the predictions
    labels = np.unique(train['label'])
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    # Predicting the classes
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    if type_output == "features":
        return [
            prediction['pooled_output']
            for _, prediction in enumerate(predictions)
        ]
    else:
        return [(sentence, prediction['probabilities'], prediction['labels'],
                 labels[prediction['labels']])
                for sentence, prediction in zip(in_sentences, predictions)]
def predict(sentences, predict_fn):
    labels = [0, 1]
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in sentences
    ]  # here, "" is just a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, labels, MAX_SEQ_LENGTH, tokenizer)
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    all_label_ids = []
    for feature in input_features:
        all_input_ids.append(feature.input_ids)
        all_input_mask.append(feature.input_mask)
        all_segment_ids.append(feature.segment_ids)
        all_label_ids.append(feature.label_id)
    pred_dict = {
        'input_ids': all_input_ids,
        'input_mask': all_input_mask,
        'segment_ids': all_segment_ids,
        'label_ids': all_label_ids
    }
    predictions = predict_fn(pred_dict)
    return [(sentence, prediction, label)
            for sentence, prediction, label in zip(
                sentences, predictions['probabilities'], predictions['labels'])]
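# Assumed setup (not shown in the source): predict_fn is typically a callable
# built from an exported SavedModel with the TF 1.x predictor API; the export
# path below is a placeholder.
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model("/path/to/exported_model")
results = predict(["the service was terrible", "great value for money"],
                  predict_fn)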
def embed(self,
          sequences: Union[str, Iterable[str]],
          per_token: bool = _DEFAULT_PER_TOKEN_EMBEDDING) -> np.ndarray:
    """Embeds a sequence or multiple sequences using the BERT model.

    Args:
        sequences (Union[str, Iterable[str]]): the sequence(s) to embed
        per_token (bool): whether to produce an embedding per token or a
            pooled embedding for the whole sequence

    Returns:
        a numpy array with the embedding(s) of the sequence(s)
    """
    single_input = isinstance(sequences, str)
    if single_input:
        sequences = [sequences]

    # Convert sequences into BERT input format
    input_examples = [
        run_classifier.InputExample(guid=None, text_a=sequence, text_b=None,
                                    label=0)
        for sequence in sequences
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, [0], self._input_ids.shape[1], self._tokenizer)

    # Execute the computation graph on the inputs
    output_tensor = self._outputs["sequence_output" if per_token else "pooled_output"]
    feed_dict = {
        self._input_ids: [feature.input_ids for feature in input_features],
        self._input_mask: [feature.input_mask for feature in input_features],
        self._segment_ids: [feature.segment_ids for feature in input_features]
    }
    if self._session is not None:
        output = self._session.run(output_tensor, feed_dict=feed_dict)
    else:
        with tf.compat.v1.Session(graph=self._graph) as session:
            session.run(tf.compat.v1.global_variables_initializer())
            output = session.run(output_tensor, feed_dict=feed_dict)

    if single_input:
        output = output.reshape(output.shape[1:])
    return output
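# Illustrative usage, assuming `bert_embedder` is an instance of the class
# this method belongs to (the class itself is not shown here):
pooled = bert_embedder.embed("BERT produces contextual embeddings")
print(pooled.shape)       # e.g. (768,) for a BERT-Base pooled output
per_token = bert_embedder.embed(["first sentence", "second sentence"],
                                per_token=True)
print(per_token.shape)    # e.g. (2, max_seq_length, 768)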
def predict(self, examples):
    features = run_classifier.convert_examples_to_features(
        examples, self.label_list, self.max_len, self.tokenizer)
    input_fn = run_classifier.input_fn_builder(features=features,
                                               seq_length=self.max_len,
                                               is_training=False,
                                               drop_remainder=False)
    # estimator.predict returns a generator of per-example dicts, so collect
    # the probability vectors rather than indexing the generator directly.
    predictions = self.estimator.predict(input_fn=input_fn)
    return [prediction['probabilities'] for prediction in predictions]
def __create_features(self, pd_dataset, label_list, max_seq_len, tokenizer,
                      data_column, label_column):
    input_examples = pd_dataset.apply(
        lambda x: InputExample(guid=None,
                               text_a=x[data_column],
                               text_b=None,
                               label=x[label_column]),
        axis=1)
    return convert_examples_to_features(input_examples, label_list,
                                        max_seq_len, tokenizer)
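# Hypothetical input (not from the source): this helper only needs a pandas
# DataFrame with one text column and one label column. Inside the owning
# class the call would look roughly like:
#   self.__create_features(df, [0, 1], 64, tokenizer, "text", "sentiment")
import pandas as pd

df = pd.DataFrame({"text": ["cheap and cheerful", "never again"],
                   "sentiment": [1, 0]})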
def data_prep(stressors_list, label_list, MAX_SEQ_LENGTH, tokenizer):
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=1)
        for x in stressors_list
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    return input_features
def get_bert_features(text):
    if len(text) == 1:
        # Pad a single-sentence input with a dummy second sentence.
        in_sentences = [text[0], 'aha']
    else:
        in_sentences = text
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    return input_features
def get_bert_inputs(input, bert_hub_module_handle, bert_max_seqlen):
    tokenizer = create_tokenizer_from_hub_module(bert_hub_module_handle)
    features = run_classifier.convert_examples_to_features(
        input, [0, 1], bert_max_seqlen, tokenizer)
    input_ids = []
    input_mask = []
    segment_ids = []
    for feature in features:
        input_ids.append(feature.input_ids)
        input_mask.append(feature.input_mask)
        segment_ids.append(feature.segment_ids)
    return input_ids, input_mask, segment_ids
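# Illustrative call (assumed values): `input` is expected to be a list of
# InputExample objects, and the TF-Hub handle below is the commonly used
# uncased BERT-Base module; substitute your own.
examples = [
    run_classifier.InputExample(guid="", text_a="what a day", text_b=None, label=0)
]
input_ids, input_mask, segment_ids = get_bert_inputs(
    examples,
    bert_hub_module_handle="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
    bert_max_seqlen=128)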
def convert_sent_to_vec(tokenizer, dataset, DATA_COLUMN='text_1',
                        DATA_COLUMN2='text_2'):
    label_list = [1, 0]
    input_example = dataset.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN],
                                              text_b=x[DATA_COLUMN2],
                                              label=1),
        axis=1)
    features = run_classifier.convert_examples_to_features(
        input_example, cfg.label_list, cfg.MAX_SEQ_LENGTH, tokenizer)
    return features
def create_train_test_features(
    tokenizer: tokenization.FullTokenizer
) -> Tuple[run_classifier.InputFeatures, run_classifier.InputFeatures]:
    train_input_examples, test_input_examples = (
        ParquetFile(data_filename(dataset_name))
        .to_pandas()
        .sample(SAMPLE_SIZE)
        .apply(create_bert_input_example, axis=1)
        for dataset_name in DATASET_NAMES)
    train_features, test_features = (
        run_classifier.convert_examples_to_features(input_examples, LABEL_LIST,
                                                    MAX_SEQ_LENGTH, tokenizer)
        for input_examples in (train_input_examples, test_input_examples))
    return train_features, test_features
def getPrediction(in_sentences):
    # A list to map the actual labels to the predictions
    labels = ["0", "1", "2", "3"]
    # Transforming the test data into BERT accepted form
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]
    # Creating input features for test data
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    # Predicting the classes
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, prediction['probabilities'], prediction['labels'],
             labels[prediction['labels']])
            for sentence, prediction in zip(in_sentences, predictions)]
def getPrediction(estimator, in_sentences, labels, label_list, max_seq_len,
                  tokenizer):
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, max_seq_len, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=max_seq_len,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, pred['probabilities'], labels[pred['labels']])
            for sentence, pred in zip(in_sentences, predictions)]
def do_train(self):
    print("[INFO] Preparing train InputExample...")
    train_inputExamples = self.train.apply(
        lambda x: run_classifier.InputExample(
            guid=None,
            text_a=x['sentence'],
            text_b=None,
            label=x['label']),
        axis=1)
    print("[INFO] Done preparing train InputExample...\n")

    label_list = list(range(len(self.labels)))

    print("[INFO] Preparing train features...")
    train_features = run_classifier.convert_examples_to_features(
        train_inputExamples, label_list, self.max_seq_length, self.tokenizer)
    print("[INFO] Done preparing train features...\n")

    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=self.max_seq_length,
        is_training=True,
        drop_remainder=False)

    num_train_steps = int(
        len(train_features) / self.train_batch_size * self.num_train_epochs)
    num_warmup_steps = int(num_train_steps * self.warmup_proportion)
    print(f"[INFO] No. of train steps: {num_train_steps}")
    print(f"[INFO] No. of warmup steps: {num_warmup_steps}")

    self.estimator = get_estimator(
        self.init_checkpoint,
        self.bert_config_file,
        self.labels,
        self.train_batch_size,
        self.model_dir,
        save_summary_steps=self.save_summary_steps,
        save_checkpoints_steps=self.save_checkpoints_steps,
        learning_rate=self.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    print(f'[INFO] Begin Training...!')
    current_time = datetime.now()
    self.estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print(f"[INFO] Training took time {datetime.now() - current_time} sec..!\n")
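# Sanity check of the step arithmetic in do_train above, with illustrative
# numbers (10,000 training examples, batch size 32, 3 epochs, 10% warmup):
num_train_steps = int(10000 / 32 * 3)          # 937
num_warmup_steps = int(num_train_steps * 0.1)  # 93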
def model_predict(estimator, processor, input_dir, predict_batch_size,
                  label_list, max_sequence_length, tokenizer):
    prediction_examples = processor.get_dev_examples(input_dir)[:predict_batch_size]
    input_features = run_classifier.convert_examples_to_features(
        prediction_examples, label_list, max_sequence_length, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=max_sequence_length,
        is_training=False,
        drop_remainder=True)
    predictions = estimator.predict(predict_input_fn)
    for example, prediction in zip(prediction_examples, predictions):
        print('text_a: %s\ntext_b: %s\nlabel:%s\nprediction:%s\n' %
              (example.text_a, example.text_b, str(example.label),
               prediction['probabilities']))
def classify_sentences(estimator, tokenizer, test_features, testfile):
    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    estimator.evaluate(input_fn=test_input_fn, steps=None)

    # ul_test_path = os.path.join(project_path, 'unlabeled_test_with_noise.tsv')
    # print(ul_test_path)
    ul_test = pd.read_csv(testfile, sep='\n')
    ul_test.head()
    ul_test.columns = ['Text']
    ul_test.head()
    [count, col] = ul_test.shape
    print("count, col", count, col)

    dft2 = ul_test["Text"]
    pred_sentences = []
    for i in range(0, count):
        pred_sentences.append(dft2[i])
    print(len(pred_sentences))

    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=1)
        for x in pred_sentences
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

    # pred_sentences = [
    #     "That movie was absolutely awful",
    #     "Please finish this work by Monday",
    #     "I am going to attend the meeting in Germany",
    #     "The results are very good this year. We can close the deal",
    #     "abhi said I oft manerism",
    #     " utkash said he also so anyone can be confusing.",
    #     " if the say otherwise in great in the dark but you know also I'll be able to text thing ."
    # ]
    print(pred_sentences)
    predictions = getPrediction(pred_sentences, estimator, tokenizer)
    print(predictions)
    return predictions