Exemple #1
0
def process_data(hp):
    """Load the train/test datasets and turn them into BERT input features.

    Args:
        hp: hyper-parameter object. This function reads ``DATA_COLUMN``,
            ``LABEL_COLUMN``, ``MAX_SEQ_LENGTH`` and ``label_list`` (a
            comma-separated string of integer labels) from it, and hands the
            object itself to ``create_tokenizer_from_hub_module``.

    Returns:
        A ``(train_features, test_features)`` tuple, each element being the
        result of ``run_classifier.convert_examples_to_features``.
    """
    tokenizer = create_tokenizer_from_hub_module(hp)
    train, test = download_and_load_datasets()

    def row_to_example(row):
        # guid is only a bookkeeping id and is not used here; text_b stays
        # None because every example is built from a single text column.
        return bert.run_classifier.InputExample(
            guid=None,
            text_a=row[hp.DATA_COLUMN],
            text_b=None,
            label=row[hp.LABEL_COLUMN])

    train_examples = train.apply(row_to_example, axis=1)
    test_examples = test.apply(row_to_example, axis=1)

    # hp.label_list is a string such as "0,1"; the converter gets integers.
    label_list = list(map(int, hp.label_list.split(",")))

    def to_features(examples):
        return run_classifier.convert_examples_to_features(
            examples, label_list, hp.MAX_SEQ_LENGTH, tokenizer)

    train_features = to_features(train_examples)
    test_features = to_features(test_examples)
    return train_features, test_features
    def do_eval(self):
        """Evaluate ``self.estimator`` on ``self.test`` and print each metric.

        One ``InputExample`` is built per row of ``self.test`` (reading its
        ``'sentence'`` and ``'label'`` entries), the examples are converted
        to features, and ``self.estimator.evaluate`` is run over them.
        Progress and the resulting metrics are printed; nothing is returned.
        """
        print("[INFO] Started working on evaluation...")
        print("[INFO] Preparing test InputExample...")

        def row_to_example(row):
            return run_classifier.InputExample(guid=None,
                                               text_a=row['sentence'],
                                               text_b=None,
                                               label=row['label'])

        examples = self.test.apply(row_to_example, axis=1)
        print("[INFO] Done preparing test InputExample...\n")

        # Label ids are the positions 0..len(self.labels)-1.
        label_ids = list(range(len(self.labels)))
        print("[INFO] Preparing test features...")
        features = run_classifier.convert_examples_to_features(
            examples, label_ids, self.max_seq_length, self.tokenizer)
        print("[INFO] Done preparing test features...\n")

        eval_input_fn = run_classifier.input_fn_builder(
            features=features,
            seq_length=self.max_seq_length,
            is_training=False,
            drop_remainder=False)

        print('[INFO] Begin evaluating...!')
        metrics = self.estimator.evaluate(input_fn=eval_input_fn, steps=None)
        print("[INFO] Done evaluating...\n")
        for name in sorted(metrics.keys()):
            value = metrics[name]
            print(f"[INFO]  {name} = {value}")
Exemple #3
0
def imdb_to_bert_features(records, max_seq_length, repo_path, train=True):
    """Convert an array of records into BERT input features.

    Args:
        records: 2-D array indexed as ``records[:, 0]`` (the text) and
            ``records[:, 1:]`` (label scores; the arg-max position becomes
            the label id).
        max_seq_length: sequence length passed to the feature converter.
        repo_path: directory that contains the ``uncased_L-12_H-768_A-12``
            model folder with its ``vocab.txt``.
        train: selects the guid passed to ``create_bert_examples``
            (``'train'`` when true, ``'test'`` otherwise).

    Returns:
        The result of ``run_classifier.convert_examples_to_features``.
    """
    model_name = 'uncased_L-12_H-768_A-12'
    checkpoint_dir = f'{repo_path}/{model_name}'
    vocab_path = os.path.join(checkpoint_dir, 'vocab.txt')
    # Lower-casing follows the model name ("uncased..." vs "cased...").
    lower_case = model_name.startswith('uncased')

    text_column = records[:, 0]
    label_columns = records[:, 1:]

    texts = [str(text) for text in text_column]
    # Labels are stored as strings to match the entries of label_list below.
    label_ids = [str(np.argmax(row)) for row in label_columns]

    label_list = ['0', '1']

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=lower_case)

    if train:
        guid = 'train'
    else:
        guid = 'test'

    examples = create_bert_examples(texts, guid, labels=label_ids)
    return run_classifier.convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer)
Exemple #4
0
def getPrediction(in_sentences):
    """Classify sentences as "Negative" or "Positive".

    Uses the module-level ``label_list``, ``MAX_SEQ_LENGTH``, ``tokenizer``
    and ``estimator``.

    Args:
        in_sentences: iterable of sentences to classify.

    Returns:
        A list of ``(sentence, probabilities, label_name)`` tuples.
    """
    labels = ["Negative", "Positive"]

    # guid "" and label 0 are placeholders; only the text matters here.
    input_examples = []
    for sentence in in_sentences:
        input_examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=sentence,
                                        text_b=None,
                                        label=0))

    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    results = []
    for sentence, prediction in zip(in_sentences,
                                    estimator.predict(predict_input_fn)):
        label_name = labels[prediction['labels']]
        results.append((sentence, prediction['probabilities'], label_name))
    return results
Exemple #5
0
    def predict(self):
        """
        Predict the label of ``self.clean_text`` and store it in
        ``self.predicted_class``.

        Does nothing when ``self.empty`` is true. The converted features are
        also kept in ``self.input_features``.
        """
        if self.empty:
            return

        # See run_classifier.py in the BERT repository for more details.
        # guid "" and the first label are placeholders for the converter.
        example = run_classifier.InputExample(guid="",
                                              text_a=self.clean_text,
                                              text_b=None,
                                              label=self.label_list[0])
        self.input_features = run_classifier.convert_examples_to_features(
            [example], self.label_list, self.max_seq_length, self.tokenizer)

        feature = self.input_features[0]
        row_shape = (-1, self.max_seq_length)
        input_ids = np.array(feature.input_ids).reshape(*row_shape)
        input_mask = np.array(feature.input_mask).reshape(*row_shape)
        label_ids = np.array(feature.label_id).reshape(-1)
        segment_ids = np.array(feature.segment_ids).reshape(*row_shape)

        feed = {
            SinglePredict.tensor_input_ids: input_ids,
            SinglePredict.tensor_input_mask: input_mask,
            SinglePredict.tensor_label_ids: label_ids,
            SinglePredict.tensor_segment_ids: segment_ids,
        }
        result = SinglePredict.sess.run(SinglePredict.tensor_outputs,
                                        feed_dict=feed)
        self.predicted_class = self.label_list[result]
Exemple #6
0
def model_eval(estimator, processor, input_dir, label_list,
               max_sequence_length, tokenizer, eval_batch_size, output_dir):
    """Evaluate ``estimator`` on the dev examples and write the metrics out.

    Args:
        estimator: object whose ``evaluate(input_fn=..., steps=...)`` is run.
        processor: object providing ``get_dev_examples(input_dir)``.
        input_dir: directory handed to the processor.
        label_list: labels passed to the feature converter.
        max_sequence_length: sequence length for conversion and input fn.
        tokenizer: tokenizer passed to the feature converter.
        eval_batch_size: batch size used to derive the number of eval steps.
        output_dir: directory in which ``eval_results.txt`` is written.

    Returns:
        None. Metrics are printed and written to ``eval_results.txt``.
    """
    examples = processor.get_dev_examples(input_dir)
    features = run_classifier.convert_examples_to_features(
        examples, label_list, max_sequence_length, tokenizer)

    started_at = datetime.datetime.now()
    print(f'***** Started evaluation at {started_at} *****')
    print(f'  Num examples = {len(examples)}')
    print(f'  Batch size = {eval_batch_size}')

    # Eval will be slightly WRONG on the TPU because it will truncate
    # the last batch.
    step_count = int(len(examples) / eval_batch_size)
    input_fn = run_classifier.input_fn_builder(
        features=features,
        seq_length=max_sequence_length,
        is_training=False,
        drop_remainder=True)
    result = estimator.evaluate(input_fn=input_fn, steps=step_count)

    finished_at = datetime.datetime.now()
    print(f'***** Finished evaluation at {finished_at} *****')

    results_path = os.path.join(output_dir, "eval_results.txt")
    with tf.gfile.GFile(results_path, "w") as writer:
        print("***** Eval results *****")
        for key in sorted(result.keys()):
            value_text = str(result[key])
            print('  {} = {}'.format(key, value_text))
            writer.write("%s = %s\n" % (key, value_text))
Exemple #7
0
    def getRatings(self, in_sentences):
        """Run ``self.estimator.predict`` over the given sentences.

        Uses the module-level ``label_list`` and ``MAX_SEQ_LENGTH`` together
        with ``self.tokenizer``.

        :param in_sentences: iterable of sentences to score
        :return: whatever ``self.estimator.predict`` returns for the inputs
        """
        # Step 1: wrap each sentence; guid "" and label "1" are placeholders.
        examples = []
        for sentence in in_sentences:
            examples.append(
                run_classifier.InputExample(guid="",
                                            text_a=sentence,
                                            text_b=None,
                                            label="1"))

        # Step 2: convert the examples into input features.
        features = run_classifier.convert_examples_to_features(
            examples, label_list, MAX_SEQ_LENGTH, self.tokenizer)

        # Step 3: build the prediction input function.
        input_fn = run_classifier.input_fn_builder(features=features,
                                                   seq_length=MAX_SEQ_LENGTH,
                                                   is_training=False,
                                                   drop_remainder=False)

        # Step 4: hand the input function to the estimator.
        return self.estimator.predict(input_fn=input_fn)
Exemple #8
0
def getPrediction(in_sentences):
    """Classify each sentence and one-hot encode its predicted label.

    Uses the module-level ``label_list``, ``MAX_SEQ_LENGTH``, ``tokenizer``
    and ``estimator``.

    Args:
        in_sentences: indexable sequence of sentences to classify.

    Returns:
        A list of ``(sentence, probabilities, one_hot_label)`` tuples, where
        ``one_hot_label`` is the predicted label binarized over the classes
        ``[0, 1, 2]``. If an exception is raised while predictions are being
        consumed, it is printed and the tuples collected so far are returned.
    """
    labels = [0, 1, 2]
    # guid "" and label 0 are placeholders; only the text matters here.
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)

    resultado = []
    try:
        # Imported once, and inside the try so that a missing scikit-learn is
        # reported the same best-effort way as any other failure.
        from sklearn.preprocessing import label_binarize

        for indice, prediction in enumerate(predictions):
            one_hot = label_binarize([labels[prediction['labels']]],
                                     classes=labels)[0]
            resultado.append(
                (in_sentences[indice], prediction['probabilities'], one_hot))
            # Progress output: "<index>###<total>".
            print(str(indice) + "###" + str(len(in_sentences)))
    except Exception as e:
        # Best effort: report the problem and keep the partial results.
        print(e)

    return resultado
    '''
    def evaluate(self, data_dir, raw_score_output_dir=None):
        """
        Evaluate on the test examples found in a directory.

        :param data_dir: directory that contains a "test.tsv"
        :param raw_score_output_dir: forwarded unchanged to ``self._evaluate``
        :return: the result of ``self._evaluate`` on the test dataset
        """
        examples = self._processor.get_test_examples(data_dir)
        labels = self._processor.get_labels()

        features = convert_examples_to_features(
            examples, labels, self._config["max_seq_length"],
            self._tokenizer)

        def column(field):
            # One long tensor holding `field` of every feature.
            return torch.tensor([getattr(f, field) for f in features],
                                dtype=torch.long)

        input_ids = column("input_ids")
        input_mask = column("input_mask")
        segment_ids = column("segment_ids")
        label_ids = column("label_id")

        dataset = TensorDataset(input_ids, input_mask, segment_ids, label_ids)
        return self._evaluate(dataset, raw_score_output_dir)
Exemple #10
0
def getPrediction(in_sentences):
    """Classify sentence pairs as "Not an error" or "Is an error".

    Uses the module-level ``label_list``, ``MAX_SEQ_LENGTH``, ``tokenizer``,
    ``estimator`` and ``OUTPUT_DIR``; predictions are read from the
    ``model.ckpt-56165`` checkpoint under ``OUTPUT_DIR``.

    Args:
        in_sentences: indexable sequence of pairs; element ``[0]`` is used as
            ``text_a`` and element ``[1]`` as ``text_b``.

    Returns:
        A list of ``(pair, probabilities, label_name)`` tuples.
    """
    labels = ["Not an error", "Is an error"]

    # guid "" and label 0 are placeholders; only the texts matter here.
    examples = []
    for pair in in_sentences:
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=pair[0],
                                        text_b=pair[1],
                                        label=0))

    features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    input_fn = run_classifier.input_fn_builder(features=features,
                                               seq_length=MAX_SEQ_LENGTH,
                                               is_training=False,
                                               drop_remainder=False)
    checkpoint = "./" + OUTPUT_DIR + "/model.ckpt-56165"
    predictions = estimator.predict(input_fn,
                                    checkpoint_path=checkpoint,
                                    yield_single_examples=False)

    results = []
    cursor = 0  # position of the next unconsumed entry of in_sentences
    for batch in predictions:
        batch_labels = batch["labels"]
        if not hasattr(batch_labels, '__len__'):
            # A single prediction rather than a batch of them.
            results.append((in_sentences[cursor], batch["probabilities"],
                            labels[batch_labels]))
            cursor += 1
            continue
        for offset in range(len(batch_labels)):
            results.append(
                (in_sentences[cursor], batch["probabilities"][offset],
                 labels[batch_labels[offset]]))
            cursor += 1
    return results
Exemple #11
0
def get_final_predictions(in_contexts, in_last_sentences, tokenizer,
                          estimator: tf.estimator.Estimator, label_list):
    """
    Return the ``'probabilities'`` entry of every prediction made for the
    paired story contexts and proposed endings.

    Parameters
    ----------
    in_contexts:            story contexts, used as ``text_a``
    in_last_sentences:      proposed last sentences, used as ``text_b``
    tokenizer:              bert tokenizer
    estimator:              tf.estimator
    label_list:             possible values
    """
    # guid "" and label 0 are placeholders; only the texts matter here.
    examples = []
    for context, ending in zip(in_contexts, in_last_sentences):
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=context,
                                        text_b=ending,
                                        label=0))

    features = run_classifier.convert_examples_to_features(
        examples, label_list, flags.max_seq_length, tokenizer)
    input_fn = run_classifier.input_fn_builder(
        features=features,
        seq_length=flags.max_seq_length,
        is_training=False,
        drop_remainder=False)

    probabilities = []
    for prediction in estimator.predict(input_fn):
        probabilities.append(prediction['probabilities'])
    return probabilities
    def predict(self, sent1, sent2):
        """
        Run the model on a single sentence pair (sent1, sent2); which
        sentences to pass depends on the task at hand:
         - Relevance (claim, perspective)
         - Stance (claim, perspective)
         - Equivalence (claim + perspective1, perspective2)
         - Evidence (claim + perspective, evidence)
        :param sent1: first sentence, used as ``text_a``
        :param sent2: second sentence, used as ``text_b``
        :return: the first row of the model output, as a numpy array
        """
        labels = self._processor.get_labels()
        # guid and label are placeholders required by the converter.
        pair = InputExample(guid="dummy",
                            text_a=sent1,
                            text_b=sent2,
                            label=labels[0])
        features = convert_examples_to_features(
            [pair], labels, self._config["max_seq_length"], self._tokenizer)
        feature = features[0]

        self._model.eval()

        with torch.no_grad():
            ids = torch.tensor([feature.input_ids])
            segments = torch.tensor([feature.segment_ids])
            # Built alongside the other tensors but not passed to the model.
            mask = torch.tensor([feature.input_mask])

            scores = self._model(ids, segments)
            return scores.detach().cpu().numpy()[0]
 def evaluate(self, examples):
     """Evaluate ``self.estimator`` on the given examples.

     Args:
         examples: input examples accepted by
             ``run_classifier.convert_examples_to_features``.

     Returns:
         Whatever ``self.estimator.evaluate`` returns, so that callers can
         inspect the evaluation result instead of losing it.
     """
     features = run_classifier.convert_examples_to_features(
         examples, self.label_list, self.max_len, self.tokenizer)
     input_fn = run_classifier.input_fn_builder(features=features,
                                                seq_length=self.max_len,
                                                is_training=False)
     # steps=None: no explicit limit on the number of evaluation steps.
     return self.estimator.evaluate(input_fn=input_fn, steps=None)
    def getListPrediction(self, in_sentences):
        """Run ``self.estimator.predict`` over the given sentences.

        Uses the module-level ``label_list`` and ``MAX_SEQ_LENGTH`` together
        with ``self.tokenizer``.

        :param in_sentences: iterable of sentences to predict on
        :return: whatever ``self.estimator.predict`` returns for the inputs
        """
        # Step 1: wrap each sentence; guid "" and label "0" are placeholders.
        examples = []
        for sentence in in_sentences:
            examples.append(
                run_classifier.InputExample(guid="",
                                            text_a=sentence,
                                            text_b=None,
                                            label="0"))

        # Step 2: convert the examples into input features.
        features = run_classifier.convert_examples_to_features(
            examples, label_list, MAX_SEQ_LENGTH, self.tokenizer)

        # Step 3: build the prediction input function.
        input_fn = run_classifier.input_fn_builder(features=features,
                                                   seq_length=MAX_SEQ_LENGTH,
                                                   is_training=False,
                                                   drop_remainder=False)

        # Step 4: hand the input function to the estimator.
        return self.estimator.predict(input_fn=input_fn)
Exemple #15
0
def getPrediction(in_sentences):
    """Classify sentences as "Non-Sensitive" or "Sensitive".

    Uses the module-level ``label_list``, ``MAX_SEQ_LENGTH`` and
    ``tokenizer``; the estimator is obtained by calling ``run()``.

    Args:
        in_sentences: iterable of sentences to classify.

    Returns:
        A list of ``(sentence, probabilities, label_name)`` tuples.
    """
    labels = ["Non-Sensitive", "Sensitive"]

    # guid "" and label 0 are placeholders; only the text matters here.
    examples = []
    for sentence in in_sentences:
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=sentence,
                                        text_b=None,
                                        label=0))

    features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    input_fn = run_classifier.input_fn_builder(features=features,
                                               seq_length=MAX_SEQ_LENGTH,
                                               is_training=False,
                                               drop_remainder=False)
    estimator = run()

    results = []
    for sentence, prediction in zip(in_sentences,
                                    estimator.predict(input_fn)):
        label_name = labels[prediction['labels']]
        results.append((sentence, prediction['probabilities'], label_name))
    return results


# pred_sentences = [
#   "He drinks apple",
#   "He drinks milk",
#   "A mosquito stings me",
#   "I sting a mosquito",
#   "A niece is a person.",
#   "A giraffe is a person.",
#   "I like to ride my chocolate",
#   "I like to ride my bike",
#   "he put elephant into the jug",
# ]
# predictions = getPrediction(pred_sentences)

# print(predictions,"predictions")
def getPrediction(in_sentences, type_output="features"):
    """Predict on sentences and return pooled outputs or labelled results.

    Uses the module-level ``train``, ``label_list``, ``MAX_SEQ_LENGTH``,
    ``tokenizer`` and ``estimator``.

    Args:
        in_sentences: iterable of sentences to predict on.
        type_output: ``"features"`` returns the ``'pooled_output'`` of every
            prediction; any other value returns labelled tuples.

    Returns:
        Either a list of pooled outputs, or a list of
        ``(sentence, probabilities, label_id, label)`` tuples.
    """
    # Maps a predicted label id to the actual label value seen in training.
    labels = np.unique(train['label'])

    # guid "" and label 0 are placeholders; only the text matters here.
    examples = []
    for sentence in in_sentences:
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=sentence,
                                        text_b=None,
                                        label=0))
    features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)

    input_fn = run_classifier.input_fn_builder(features=features,
                                               seq_length=MAX_SEQ_LENGTH,
                                               is_training=False,
                                               drop_remainder=False)
    predictions = estimator.predict(input_fn)

    if type_output == "features":
        pooled = []
        for prediction in predictions:
            pooled.append(prediction['pooled_output'])
        return pooled

    results = []
    for sentence, prediction in zip(in_sentences, predictions):
        results.append((sentence, prediction['probabilities'],
                        prediction['labels'], labels[prediction['labels']]))
    return results
Exemple #17
0
def predict(sentences, predict_fn):
    """Run ``predict_fn`` on the given sentences.

    Uses the module-level ``MAX_SEQ_LENGTH`` and ``tokenizer``.

    Args:
        sentences: iterable of sentences to predict on.
        predict_fn: callable taking a dict with the keys ``'input_ids'``,
            ``'input_mask'``, ``'segment_ids'`` and ``'label_ids'`` and
            returning a mapping with ``'probabilities'`` and ``'labels'``.

    Returns:
        A list of ``(sentence, probabilities, label)`` tuples, one per input
        sentence.
    """
    labels = [0, 1]
    # guid "" and label 0 are placeholders; only the text matters here.
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in sentences
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, labels, MAX_SEQ_LENGTH, tokenizer)

    pred_dict = {
        'input_ids': [feature.input_ids for feature in input_features],
        'input_mask': [feature.input_mask for feature in input_features],
        'segment_ids': [feature.segment_ids for feature in input_features],
        'label_ids': [feature.label_id for feature in input_features],
    }

    predictions = predict_fn(pred_dict)
    # Pair the outputs with the sentences that were actually passed in, not
    # with an unrelated module-level name.
    return list(
        zip(sentences, predictions['probabilities'], predictions['labels']))
Exemple #18
0
    def embed(self,
              sequences: Union[str, Iterable[str]],
              per_token: bool = _DEFAULT_PER_TOKEN_EMBEDDING) -> np.ndarray:
        """
        Embed one sequence or several sequences with the BERT model.

        Args:
            sequences (Union[str, Iterable[str]]): the sequence(s) to embed
            per_token (bool): when true the "sequence_output" tensor is
                evaluated, otherwise the "pooled_output" tensor

        Returns:
            a numpy array with the embedding(s) of the sequence(s); for a
            single string input the leading axis is dropped
        """
        is_single = isinstance(sequences, str)
        batch = [sequences] if is_single else sequences

        # Convert the sequences into the BERT input format; guid and label
        # are placeholders required by the converter.
        examples = []
        for text in batch:
            examples.append(
                run_classifier.InputExample(guid=None,
                                            text_a=text,
                                            text_b=None,
                                            label=0))
        features = run_classifier.convert_examples_to_features(
            examples, [0], self._input_ids.shape[1], self._tokenizer)

        output_key = "sequence_output" if per_token else "pooled_output"
        feed = {
            self._input_ids: [f.input_ids for f in features],
            self._input_mask: [f.input_mask for f in features],
            self._segment_ids: [f.segment_ids for f in features],
        }

        # Execute the computation graph on the inputs.
        if self._session is None:
            # No stored session: open a temporary one on the stored graph
            # and initialize its variables before running.
            with tf.compat.v1.Session(graph=self._graph) as session:
                session.run(tf.global_variables_initializer())
                output = session.run(self._outputs[output_key],
                                     feed_dict=feed)
        else:
            output = self._session.run(self._outputs[output_key],
                                       feed_dict=feed)

        if not is_single:
            return output
        return output.reshape(output.shape[1:])
 def predict(self, examples):
     """Return the predicted probabilities for the given examples.

     Args:
         examples: input examples accepted by
             ``run_classifier.convert_examples_to_features``.

     Returns:
         The ``'probabilities'`` entry of the prediction result: a list with
         one entry per yielded prediction, or the mapping's own entry when
         the estimator returns a single dict.
     """
     features = run_classifier.convert_examples_to_features(
         examples, self.label_list, self.max_len, self.tokenizer)
     input_fn = run_classifier.input_fn_builder(features=features,
                                                seq_length=self.max_len,
                                                is_training=False)
     predictions = self.estimator.predict(input_fn=input_fn)
     if isinstance(predictions, dict):
         # An estimator that hands back one mapping can be indexed directly.
         return predictions['probabilities']
     # A tf.estimator-style predict() yields one dict per prediction; such a
     # generator cannot be subscripted, so collect the entries instead.
     return [prediction['probabilities'] for prediction in predictions]
Exemple #20
0
 def __create_features(self, pd_dataset, label_list,
                     max_seq_len, tokenizer,
                     data_column, label_column):
     """Build input features from the rows of ``pd_dataset``.

     Each row becomes an ``InputExample`` whose text is ``row[data_column]``
     and whose label is ``row[label_column]``; the examples are then passed
     to ``convert_examples_to_features`` with ``label_list``,
     ``max_seq_len`` and ``tokenizer``.
     """
     def row_to_example(row):
         # guid and text_b are not used for these single-text examples.
         return InputExample(guid=None,
                             text_a=row[data_column],
                             text_b=None,
                             label=row[label_column])

     examples = pd_dataset.apply(row_to_example, axis=1)
     features = convert_examples_to_features(examples, label_list,
                                             max_seq_len, tokenizer)
     return features
def data_prep(stressors_list, label_list, MAX_SEQ_LENGTH, tokenizer):
    """Convert a list of texts into BERT input features.

    Args:
        stressors_list: iterable of texts, each used as ``text_a``.
        label_list: labels passed to the feature converter.
        MAX_SEQ_LENGTH: sequence length passed to the feature converter.
        tokenizer: tokenizer passed to the feature converter.

    Returns:
        The result of ``run_classifier.convert_examples_to_features``.
    """
    # guid "" and label 1 are placeholders; only the text matters here.
    examples = []
    for text in stressors_list:
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=text,
                                        text_b=None,
                                        label=1))

    return run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
Exemple #22
0
def get_bert_features(text):
    """Convert a sequence of sentences into BERT input features.

    Uses the module-level ``label_list``, ``MAX_SEQ_LENGTH`` and
    ``tokenizer``.

    Args:
        text: sequence of sentences. When it holds exactly one sentence, the
            filler sentence ``'aha'`` is added after it, so two examples are
            converted.

    Returns:
        The result of ``run_classifier.convert_examples_to_features``.
    """
    in_sentences = [text[0], 'aha'] if len(text) == 1 else text

    # guid "" and label 0 are placeholders; only the text matters here.
    examples = []
    for sentence in in_sentences:
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=sentence,
                                        text_b=None,
                                        label=0))

    return run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
def get_bert_inputs(input, bert_hub_module_handle, bert_max_seqlen):
    """Convert examples into the three parallel BERT input lists.

    Args:
        input: examples accepted by
            ``run_classifier.convert_examples_to_features``.
        bert_hub_module_handle: passed to
            ``create_tokenizer_from_hub_module`` to obtain the tokenizer.
        bert_max_seqlen: sequence length passed to the feature converter.

    Returns:
        A tuple ``(input_ids, input_mask, segment_ids)`` of lists holding
        one entry per converted feature.
    """
    tokenizer = create_tokenizer_from_hub_module(bert_hub_module_handle)
    features = run_classifier.convert_examples_to_features(
        input, [0, 1], bert_max_seqlen, tokenizer)

    input_ids, input_mask, segment_ids = [], [], []
    for feature in features:
        input_ids += [feature.input_ids]
        input_mask += [feature.input_mask]
        segment_ids += [feature.segment_ids]

    return input_ids, input_mask, segment_ids
Exemple #24
0
def convert_sent_to_vec(tokenizer,
                        dataset,
                        DATA_COLUMN='text_1',
                        DATA_COLUMN2='text_2'):
    """Convert sentence pairs from ``dataset`` into BERT input features.

    The label list and maximum sequence length are taken from the
    module-level ``cfg`` (``cfg.label_list`` and ``cfg.MAX_SEQ_LENGTH``).

    Args:
        tokenizer: tokenizer passed to the feature converter.
        dataset: object with a row-wise ``apply(func, axis=1)``; each row is
            indexed by the two column names below.
        DATA_COLUMN: column used as ``text_a``.
        DATA_COLUMN2: column used as ``text_b``.

    Returns:
        The result of ``run_classifier.convert_examples_to_features``.
    """
    # Every example carries the placeholder label 1; guid is unused.
    input_example = dataset.apply(lambda x: run_classifier.InputExample(
        guid=None, text_a=x[DATA_COLUMN], text_b=x[DATA_COLUMN2], label=1),
                                  axis=1)
    features = run_classifier.convert_examples_to_features(
        input_example, cfg.label_list, cfg.MAX_SEQ_LENGTH, tokenizer)

    return features
def create_train_test_features(
    tokenizer: tokenization.FullTokenizer
) -> Tuple[run_classifier.InputFeatures, run_classifier.InputFeatures]:
    """Load the datasets named in ``DATASET_NAMES`` and convert them.

    For each dataset name, the parquet file at ``data_filename(name)`` is
    loaded, ``SAMPLE_SIZE`` rows are sampled, and
    ``create_bert_input_example`` is applied to every sampled row. The
    resulting examples are converted with ``LABEL_LIST`` and
    ``MAX_SEQ_LENGTH``. ``DATASET_NAMES`` must yield exactly two names
    (train first, then test) for the unpacking below to succeed.

    Returns:
        A ``(train_features, test_features)`` tuple.
    """
    def load_examples(dataset_name):
        frame = ParquetFile(data_filename(dataset_name)).to_pandas()
        sampled = frame.sample(SAMPLE_SIZE)
        return sampled.apply(create_bert_input_example, axis=1)

    def to_features(input_examples):
        return run_classifier.convert_examples_to_features(
            input_examples, LABEL_LIST, MAX_SEQ_LENGTH, tokenizer)

    # map() is lazy, so each dataset is loaded as the unpacking asks for it.
    train_input_examples, test_input_examples = map(load_examples,
                                                    DATASET_NAMES)
    train_features, test_features = map(
        to_features, (train_input_examples, test_input_examples))

    return train_features, test_features
Exemple #26
0
def getPrediction(in_sentences):
    """Classify sentences into one of the labels "0" to "3".

    Uses the module-level ``label_list``, ``MAX_SEQ_LENGTH``, ``tokenizer``
    and ``estimator``.

    Args:
        in_sentences: iterable of sentences to classify.

    Returns:
        A list of ``(sentence, probabilities, label_id, label_name)`` tuples.
    """
    # Maps a predicted label id to its label name.
    labels = ["0", "1", "2", "3"]

    # Wrap the sentences for BERT; guid "" and label 0 are placeholders.
    examples = []
    for sentence in in_sentences:
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=sentence,
                                        text_b=None,
                                        label=0))

    # Convert the examples into input features.
    features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)

    # Predict the classes.
    input_fn = run_classifier.input_fn_builder(features=features,
                                               seq_length=MAX_SEQ_LENGTH,
                                               is_training=False,
                                               drop_remainder=False)
    results = []
    for sentence, prediction in zip(in_sentences,
                                    estimator.predict(input_fn)):
        results.append((sentence, prediction['probabilities'],
                        prediction['labels'], labels[prediction['labels']]))
    return results
def getPrediction(estimator, in_sentences, labels, label_list, max_seq_len,
                  tokenizer):
    """Classify sentences with the given estimator.

    Args:
        estimator: object whose ``predict(input_fn)`` yields dicts with the
            keys ``'probabilities'`` and ``'labels'``.
        in_sentences: iterable of sentences to classify.
        labels: sequence mapping a predicted label id to its label name.
        label_list: labels passed to the feature converter.
        max_seq_len: sequence length for conversion and input function.
        tokenizer: tokenizer passed to the feature converter.

    Returns:
        A list of ``(sentence, probabilities, label_name)`` tuples.
    """
    # guid "" and label 0 are placeholders; only the text matters here.
    examples = []
    for sentence in in_sentences:
        examples.append(
            run_classifier.InputExample(guid="",
                                        text_a=sentence,
                                        text_b=None,
                                        label=0))

    features = run_classifier.convert_examples_to_features(
        examples, label_list, max_seq_len, tokenizer)
    input_fn = run_classifier.input_fn_builder(features=features,
                                               seq_length=max_seq_len,
                                               is_training=False,
                                               drop_remainder=False)

    results = []
    for sentence, pred in zip(in_sentences, estimator.predict(input_fn)):
        results.append(
            (sentence, pred['probabilities'], labels[pred['labels']]))
    return results
    def do_train(self):
        """Train a new estimator on ``self.train``.

        One ``InputExample`` is built per row of ``self.train`` (reading its
        ``'sentence'`` and ``'label'`` entries) and converted to features.
        The step counts are derived from the number of features, the batch
        size, the number of epochs and the warm-up proportion. The estimator
        created by ``get_estimator`` is stored in ``self.estimator`` and
        trained; progress and the elapsed time are printed.
        """
        print("[INFO] Preparing train InputExample...")

        def row_to_example(row):
            return run_classifier.InputExample(guid=None,
                                               text_a=row['sentence'],
                                               text_b=None,
                                               label=row['label'])

        examples = self.train.apply(row_to_example, axis=1)
        print("[INFO] Done preparing train InputExample...\n")

        # Label ids are the positions 0..len(self.labels)-1.
        label_ids = list(range(len(self.labels)))
        print("[INFO] Preparing train features...")
        features = run_classifier.convert_examples_to_features(
            examples, label_ids, self.max_seq_length, self.tokenizer)
        print("[INFO] Done preparing train features...\n")

        input_fn = run_classifier.input_fn_builder(
            features=features,
            seq_length=self.max_seq_length,
            is_training=True,
            drop_remainder=False)

        # Total steps = examples / batch size * epochs, truncated to an int;
        # the warm-up steps are a fixed proportion of that total.
        total_steps = int(
            len(features) / self.train_batch_size * self.num_train_epochs)
        warmup_steps = int(total_steps * self.warmup_proportion)

        print(f"[INFO] No. of train steps: {total_steps}")
        print(f"[INFO] No. of warmup steps: {warmup_steps}")

        self.estimator = get_estimator(
            self.init_checkpoint,
            self.bert_config_file,
            self.labels,
            self.train_batch_size,
            self.model_dir,
            save_summary_steps=self.save_summary_steps,
            save_checkpoints_steps=self.save_checkpoints_steps,
            learning_rate=self.learning_rate,
            num_train_steps=total_steps,
            num_warmup_steps=warmup_steps)

        print('[INFO] Begin Training...!')
        started_at = datetime.now()
        self.estimator.train(input_fn=input_fn, max_steps=total_steps)
        elapsed_message = (
            f"[INFO] Training took time {datetime.now() - started_at} sec..!\n"
        )
        print(elapsed_message)
Exemple #29
0
def model_predict(estimator, processor, input_dir, predict_batch_size,
                  label_list, max_sequence_length, tokenizer):
    """Predict on the first dev examples and print each result.

    Args:
        estimator: object whose ``predict(input_fn)`` yields dicts with a
            ``'probabilities'`` key.
        processor: object providing ``get_dev_examples(input_dir)``.
        input_dir: directory handed to the processor.
        predict_batch_size: number of leading dev examples to predict on.
        label_list: labels passed to the feature converter.
        max_sequence_length: sequence length for conversion and input fn.
        tokenizer: tokenizer passed to the feature converter.

    Returns:
        None. One block of text is printed per example/prediction pair.
    """
    dev_examples = processor.get_dev_examples(input_dir)
    examples = dev_examples[:predict_batch_size]

    features = run_classifier.convert_examples_to_features(
        examples, label_list, max_sequence_length, tokenizer)
    input_fn = run_classifier.input_fn_builder(
        features=features,
        seq_length=max_sequence_length,
        is_training=False,
        drop_remainder=True)

    report = 'text_a: %s\ntext_b: %s\nlabel:%s\nprediction:%s\n'
    for example, prediction in zip(examples, estimator.predict(input_fn)):
        fields = (example.text_a, example.text_b, str(example.label),
                  prediction['probabilities'])
        print(report % fields)
def classify_sentences(estimator, tokenizer, test_features, testfile):
    """Evaluate on labeled features, then classify the lines of a file.

    Uses the module-level ``MAX_SEQ_LENGTH`` and ``getPrediction``.

    Args:
        estimator: object providing ``evaluate(input_fn=..., steps=...)``;
            it is also forwarded to ``getPrediction``.
        tokenizer: tokenizer forwarded to ``getPrediction``.
        test_features: already converted features used for the evaluation.
        testfile: path of the text file read with ``pd.read_csv``; its single
            column is renamed to ``'Text'``.

    Returns:
        The value returned by ``getPrediction`` for the sentences read.
    """
    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    estimator.evaluate(input_fn=test_input_fn, steps=None)

    # NOTE(review): newer pandas releases appear to reject a line terminator
    # as `sep`; confirm against the pandas version this runs on.
    # NOTE(review): with the default header handling the first line of the
    # file is consumed as the column name; confirm the file has a header.
    ul_test = pd.read_csv(testfile, sep='\n')
    ul_test.columns = ['Text']

    count, col = ul_test.shape
    print("count, col", count, col)

    text_column = ul_test["Text"]
    pred_sentences = [text_column[i] for i in range(count)]
    print(len(pred_sentences))
    print(pred_sentences)

    predictions = getPrediction(pred_sentences, estimator, tokenizer)

    print(predictions)

    return predictions