def create_examples(lines, set_type, labels=None):
    """Wrap raw sentences in BERT ``InputExample`` objects.

    When ``set_type`` is ``'train'`` each line is paired with the string form
    of the matching entry in ``labels``; for any other set type every example
    receives the placeholder label ``'0'``. All examples share ``set_type``
    as their guid and have no second text segment.

    :param lines: iterable of input sentences.
    :param set_type: name of the data split, e.g. ``'train'``.
    :param labels: iterable of labels, read only for the ``'train'`` split.
    :return: list of ``run_classifier.InputExample``.
    """
    if set_type == 'train':
        # zip() stops at the shorter of the two inputs.
        labelled = ((sentence, str(tag)) for sentence, tag in zip(lines, labels))
    else:
        labelled = ((sentence, '0') for sentence in lines)

    return [
        run_classifier.InputExample(guid=set_type, text_a=sentence, text_b=None, label=tag)
        for sentence, tag in labelled
    ]
Example #2
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training, dev and test sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%d" % (set_type, i)
         if set_type == 'test':
             # removing header and invalid data
             if i == 0 or len(line) != 3:
                 print(guid, line)
                 continue
             text_a = tokenization.convert_to_unicode(line[1])
             text_b = tokenization.convert_to_unicode(line[2])
             label = 0  # We will use zero for test as convert_example_to_features doesn't support None
         else:
             # removing header and invalid data
             if i == 0 or len(line) != 6:
                 continue
             text_a = tokenization.convert_to_unicode(line[3])
             text_b = tokenization.convert_to_unicode(line[4])
             label = int(line[5])
         examples.append(
             run_classifier.InputExample(guid=guid,
                                         text_a=text_a,
                                         text_b=text_b,
                                         label=label))
     return examples
Example #3
0
def create_examples(lines, set_type, labels=None):
    """Generate data for the BERT model.

    Builds one single-sentence ``InputExample`` per line. For the ``'train'``
    split the label is the string form of the matching entry in ``labels``;
    every other split gets the placeholder label ``'0'``. The formatted
    ``set_type`` is used as the guid of every example.
    """
    guid = '{}'.format(set_type)

    if guid == 'train':
        # Labels are stringified lazily, one per consumed line.
        labelled_lines = zip(lines, map(str, labels))
    else:
        labelled_lines = ((line, '0') for line in lines)

    examples = []
    for text_a, label in labelled_lines:
        example = run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
        examples.append(example)
    return examples
Example #4
0
def getPrediction(in_sentences):
    '''
  Provide class predictions for a list of input sentences.

  :param in_sentences: sentences to classify; iterated twice (once to build
      the examples, once to pair them with the predictions)
  :return: list of (sentence, probabilities, labels) tuples, one per
      prediction yielded by the estimator
  '''
    # Pre-process input: the guid is left blank and 0 is only a placeholder label.
    input_examples = []
    for sentence in in_sentences:
        input_examples.append(
            run_classifier.InputExample(guid="", text_a=sentence, text_b=None, label=0))

    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, FLAGS.MAX_SEQ_LENGTH, tokenizer)

    # Create the model function input.
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=FLAGS.MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    # Calculate the predictions and pair each one with its source sentence.
    predictions = estimator.predict(predict_input_fn)

    output = []
    for sentence, prediction in zip(in_sentences, predictions):
        output.append((sentence, prediction['probabilities'], prediction['labels']))
    return output
def classify(search_sentence):
    """Return the predicted probability of each emotion for one sentence.

    The sentence is wrapped in a single ``InputExample`` (with a placeholder
    label), converted to features and passed to ``estimator.predict``. The
    probabilities of the first prediction are mapped, in order, onto
    "happy", "sad", "angry" and "surprised", and each one is printed.

    :param search_sentence: text to classify.
    :return: dict mapping each emotion name to its probability. Every value
        stays 0 if the estimator yields no prediction.
    """
    emotion_names = ["happy", "sad", "angry", "surprised"]
    # Bound before the loop so the final return can never hit an unbound
    # local when predict() yields nothing.
    emotion = {name: 0 for name in emotion_names}

    text_a = tokenization.convert_to_unicode(search_sentence)
    label = tokenization.convert_to_unicode("1")  # put dummy input
    examples = [
        run_classifier.InputExample(guid="dev-1",
                                    text_a=text_a,
                                    text_b=None,
                                    label=label)
    ]

    predict_features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    # NOTE(review): drop_remainder=True may discard this lone example when
    # the estimator's batch size is larger than 1 -- confirm the batch size.
    predict_input_fn = run_classifier.input_fn_builder(
        features=predict_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)
    result = estimator.predict(input_fn=predict_input_fn)

    for prediction_index, prediction in enumerate(result):
        # Only the first prediction is reported.
        if prediction_index >= 1:
            break
        # A distinct index name keeps the outer loop counter intact.
        for class_index, class_probability in enumerate(prediction["probabilities"]):
            emotion[emotion_names[class_index]] = class_probability
            print(emotion_names[class_index] + ": " + str(class_probability))

    return emotion
Example #6
0
def predict(sentence):
    """Classify one sentence and report the winning label and its score.

    The sentence is wrapped in a single ``InputExample`` (blank guid,
    placeholder label "0"), converted to features and run through
    ``estimator``. The predictions are loaded into a DataFrame and the value
    in its first row and first column is used as the score vector.

    :param sentence: text to classify.
    :return: dict with ``predicted_label`` (looked up in ``emotions``) and
        ``confidence`` (the largest score as a plain Python number).
    """
    # The guid is blank; "0" is just a dummy label.
    input_examples = [
        run_classifier.InputExample(guid="", text_a=sentence, text_b=None, label="0")
    ]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    df = pd.DataFrame(estimator.predict(input_fn=predict_input_fn))

    # NOTE(review): assumes the first column holds the class scores -- confirm.
    label_index = df.iat[0, 0].argmax()
    confidence = df.iat[0, 0].max()

    result = {}
    result['predicted_label'] = emotions.iat[int(label_index), 0]
    result['confidence'] = confidence.item()
    return result
Example #7
0
def FLTR_Prediction(d,tokenizer,estimator):
    """Score every (question, review) pair of ``d`` and attach the scores per row.

    Each row of ``d`` is expanded by ``QR_pair`` into (question, review)
    pairs, the pairs are run through ``estimator`` in chunks of at most
    ``max_inputs`` rows, and the element at index 1 of every prediction's
    ``probabilities`` is collected. The flat score list is then cut up again
    using the ``num_reviews`` column, and each row gets its slice in a new
    ``FLTR_scores`` column.

    :param d: DataFrame with a ``num_reviews`` column plus whatever ``QR_pair`` reads.
    :param tokenizer: tokenizer handed to ``convert_examples_to_features``.
    :param estimator: object whose ``predict`` yields dicts with a ``probabilities`` entry.
    :return: the re-indexed frame with the ``FLTR_scores`` column added.
    """

    # Work on a frame with a fresh 0..n-1 index; the .at[] writes below rely on it.
    d = d.reset_index(drop=True)
    # QR_pair returns a list of pairs per row; flatten them into one two-column frame.
    test_t = d.apply(QR_pair,axis=1)
    test_t = test_t.tolist()
    flat_list = [item for sublist in test_t for item in sublist]
    test_t = pd.DataFrame(flat_list,columns=['question','review'])
    test_t['question'] = test_t['question'].apply(str)
    test_t['review'] = test_t['review'].apply(str)
    DATA_COLUMN_A = 'question'
    DATA_COLUMN_B = 'review'
    label_list = [0, 1]
    max_inputs = 1000000  # upper bound on rows converted and predicted per pass
    probs = []
    temp_test = test_t.copy()

    # Consume temp_test from the front, one chunk of at most max_inputs rows at a time.
    while len(temp_test)>0:
        line = min(max_inputs,len(temp_test))
        temp = temp_test[:line]

        # label=0 is a placeholder; only the predicted probabilities are used.
        inputExamples = temp.apply(lambda x: run_classifier.InputExample(guid=None,
                                                                         text_a = x[DATA_COLUMN_A],
                                                                         text_b = x[DATA_COLUMN_B],
                                                                         label = 0), 
                                   axis = 1)

        input_features = run_classifier.convert_examples_to_features(inputExamples, 
                                                                     label_list, 
                                                                      FLAGS.max_seq_length, 
                                                                      tokenizer)  

        predict_input_fn = run_classifier.input_fn_builder(features=input_features, 
                                                           seq_length=FLAGS.max_seq_length,
                                                           is_training=False, 
                                                           drop_remainder=False)

        predictions = estimator.predict(predict_input_fn)
        probabilities = [prediction['probabilities'] for prediction in  predictions]
        # Keep only the element at index 1 of each probability vector.
        probs = probs+[item.tolist()[1] for item in probabilities]

        # Drop the rows just processed, or stop once the last chunk is done.
        if len(temp_test)>max_inputs:
            temp_test = temp_test[line:]
            temp_test = temp_test.reset_index(drop=True)
        else:
            temp_test = []

    # Stored on the local pair frame only; test_t is not returned.
    test_t['probilities']=probs
    num_reviews = d['num_reviews'].tolist()
    d['FLTR_scores'] = ''
    # Hand each row the next num_reviews[i] scores from the front of probs.
    for i in range(0,len(d)):
        n = num_reviews[i]
        #print(probs[:n])
        d.at[i,'FLTR_scores'] = probs[:n]
        #print(d.at[i,'FLTR_scores'])
        if i!=len(d)-1:
            probs=probs[n:]

    return d
 def get_predict_examples(self, sentence_pairs):
   """Convert sentence pairs into a list of ``InputExample`` objects.

   Each pair's first and second elements become ``text_a`` and ``text_b``
   after unicode conversion, and the guid is ``predict-<position>``.
   """
   # The label is fixed to 0 because None is not supported when converting
   # examples to features.
   return [
       run_classifier.InputExample(
           guid="predict-%d" % (position),
           text_a=tokenization.convert_to_unicode(pair[0]),
           text_b=tokenization.convert_to_unicode(pair[1]),
           label=0)
       for position, pair in enumerate(sentence_pairs)
   ]
Example #9
0
def get_predict_examples(list1, list2):
    """Pair up two sentence lists as sentence-pair ``InputExample`` objects.

    Sentences are matched by position (stopping at the shorter list),
    converted to unicode, given the guid ``test-<position>`` and the
    placeholder label "0".
    """
    return [
        run_classifier.InputExample(
            guid="test-{}".format(position),
            text_a=tokenization.convert_to_unicode(first),
            text_b=tokenization.convert_to_unicode(second),
            label="0")
        for position, (first, second) in enumerate(zip(list1, list2))
    ]
Example #10
0
def create_examples(lines, set_type, labels=None):
  """Generate data for the BERT model.

  :params lines: (numpy array) input sentences
  :params set_type: (str) eg. train
  :params labels: (numpy array) labels, read only when set_type is 'train'
  :return examples: list of single-sentence InputExample objects
  """
  guid = '{}'.format(set_type)

  # Every split other than 'train' gets the placeholder label '0'.
  if guid != 'train':
      return [
          run_classifier.InputExample(guid=guid, text_a=sentence, text_b=None, label='0')
          for sentence in lines
      ]

  # Training examples carry the string form of their label.
  return [
      run_classifier.InputExample(guid=guid, text_a=sentence, text_b=None, label=str(tag))
      for sentence, tag in zip(lines, labels)
  ]
Example #11
0
def create_examples(lines, set_type, labels=None):
    """Generate data for the BERT model.

    For the 'train' split each line is paired with the string form of its
    label; any other split uses the placeholder label '[0 0 0 0 0 0]'.
    The formatted ``set_type`` is the guid of every example.
    """
    guid = '{}'.format(set_type)

    def build(text, tag):
        # Single-sentence example: there is never a second text segment.
        return run_classifier.InputExample(guid=guid,
                                           text_a=text,
                                           text_b=None,
                                           label=tag)

    if guid == 'train':
        return [build(text, str(tag)) for text, tag in zip(lines, labels)]
    return [build(text, '[0 0 0 0 0 0]') for text in lines]
Example #12
0
def BerQA_train_predict(data,is_training=True):
    """Prepare chunked BertQA inputs and a model function from ``data``.

    Rows of ``data`` are taken ``max_inputs`` at a time, expanded by
    ``qar_pair`` into (senA, senB) pairs labelled 1, and converted to BERT
    features; a run config and a model function are then built.

    NOTE(review): this copy stops right after ``model_fn`` is created --
    nothing visible here uses ``is_training``, ``scores`` or ``model_fn``,
    and ``d`` is never shortened inside the loop. It looks like a truncated
    snippet; confirm against the full function.
    """
    d=data.copy()
    scores = []
    max_inputs = 30000  # rows of d handled per pass
    
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 2
    WARMUP_PROPORTION = 0.1
    # Model configs
    SAVE_CHECKPOINTS_STEPS = 1000
    SAVE_SUMMARY_STEPS = 500
    num_train_steps = 100
    max_steps = 100000
    
    DATA_COLUMN_A = 'senA'
    DATA_COLUMN_B = 'senB'
    LABEL_COLUMN = 'Label'
    label_list = [0, 1]
    
    while(len(d)>0 and num_train_steps<=max_steps):
        line = min(max_inputs,len(d))
        temp = d[:line]
        # qar_pair returns a list of pairs per row; flatten into one two-column frame.
        temp_t = temp.apply(qar_pair,axis=1)
        temp_t=temp_t.tolist()
        flat_list = [item for sublist in temp_t for item in sublist]
        temp_t =pd.DataFrame(flat_list,columns=['senA','senB'])	
        temp_t['Label'] =1
        temp_t['senA']=temp_t['senA'].apply(str)
        temp_t['senB']=temp_t['senB'].apply(str)

        temp_InputExamples = temp_t.apply(lambda x: run_classifier.InputExample(guid=None,
                                                                           text_a = x[DATA_COLUMN_A],
                                                                           text_b = x[DATA_COLUMN_B],
                                                                           label = x[LABEL_COLUMN]), axis = 1)
        
        temp_features = run_classifier.convert_examples_to_features(temp_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
        # The step budget accumulates across passes rather than restarting.
        num_train_steps = int(len(temp_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)+num_train_steps
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
        # Specify output directory and number of checkpoint steps to save
        run_config = tf.estimator.RunConfig(
            model_dir=OUTPUT_DIR,
            save_summary_steps=SAVE_SUMMARY_STEPS,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

        model_fn = model_fn_builder(
            num_labels=len(label_list),
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps)
Example #13
0
def load_data(
        load_fn=standard_loader('twitter'),
        bert_pair_shape=True, split=None, context_extent='last',
        emoji_totext=False,
        random_state=42
):
    """
    Load a dataset and optionally shape it into BERT sentence-pair examples.

    :param load_fn: a loader function which when executed  should return data frame with 'label',
    'response' and 'context'
    :param bert_pair_shape: True if you want it in shape for bert
    :param split: If none full dataset will be returned, else train, test where split corresponds to frac of test
    :param context_extent: 'last' or 'all'
    :param emoji_totext: process emojis to text if True
    :param random_state: seed passed to train_test_split when split is given
    :return: the full dataset when split is None, otherwise a (train, test) pair; the data is a list of
    InputExample objects when bert_pair_shape is True and the loaded data frame otherwise
    """

    # Why do we bother doing this ? We want to pass some arbitrary data not tied to any specific file later.
    res = load_fn()

    # Process emojis if we'd like to
    if emoji_totext:
        res['response'] = res['response'].apply(lambda resp: emoji.demojize(resp))
        res['context'] = res['context'].apply(lambda contexts: [emoji.demojize(c) for c in contexts])

    # Get it into a BERT friendly format
    new_res = []
    if bert_pair_shape:
        # Number the rows from 1 so that each example really gets its own guid
        # (the counter used to stay at 1 for every row).
        for row_number, (_ind, row) in enumerate(res.iterrows(), start=1):
            guid = "%d" % row_number  # A Unique id for each row.
            text_a = tokenization.convert_to_unicode(row['response'])  # The main text which we wish to classify
            # in this Sentence pair classification task, our context is the second sentence.
            text_b = tokenization.convert_to_unicode(context_process(row['context'], context_extent))
            # Sarcasm Label; rows without one default to 'NOT_SARCASM'
            label = row.get('label', 'NOT_SARCASM')

            # Convert every row into a BERT InputExample Object
            new_res.append(run_classifier.InputExample(guid, text_a, text_b, label))
    else:
        new_res = res

    # Split data for test if needed
    if split is None:
        return new_res
    else:
        train, test = train_test_split(new_res, test_size=split, random_state=random_state)
        return train, test
Example #14
0
 def getPrediction(self, in_sentences):
     """Return ``(sentence, encode_vec)`` pairs for the given sentences.

     Each sentence becomes a single-segment ``InputExample`` and is run
     through ``self.estimator``; the ``encode_vec`` entry of every
     prediction is paired with the sentence it came from.
     """
     label_list = [0, 1]

     # The guid is blank and 0 is just a dummy label.
     input_examples = []
     for sentence in in_sentences:
         input_examples.append(
             run_classifier.InputExample(guid="",
                                         text_a=sentence,
                                         text_b=None,
                                         label=0))

     input_features = run_classifier.convert_examples_to_features(
         input_examples, label_list, self.MAX_SEQ_LENGTH, self.tokenizer)
     predict_input_fn = run_classifier.input_fn_builder(
         features=input_features,
         seq_length=self.MAX_SEQ_LENGTH,
         is_training=False,
         drop_remainder=False)
     predictions = self.estimator.predict(predict_input_fn)

     encoded = []
     for sentence, prediction in zip(in_sentences, predictions):
         encoded.append((sentence, prediction['encode_vec']))
     return encoded
Example #15
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         n = 0
         if set_type == "train":
             n = 2222
         elif set_type == "dev":
             n = 3333
         else:
             n = 4444
         uid = str(n) + str(i)
         guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(uid))
         text_a = tokenization.convert_to_unicode(line[0])
         #text_b = tokenization.convert_to_unicode(line[9])
         label = tokenization.convert_to_unicode(line[-1])
         examples.append(
             rc.InputExample(guid=guid, text_a=text_a, label=label))
     return examples
 def scorePredict(self):
     """Run the estimator over ``self.sentences`` and cache the predictions.

     Every sentence becomes a single-segment ``InputExample`` with a blank
     guid and the dummy label "0". The materialised predictions are stored
     on ``self.pred`` and also returned.
     """
     label_list = ['0', '1', '-1']

     input_examples = []
     for sentence in self.sentences:
         # The guid is blank; "0" is just a dummy label.
         input_examples.append(
             run_classifier.InputExample(guid="",
                                         text_a=sentence,
                                         text_b=None,
                                         label="0"))

     input_features = run_classifier.convert_examples_to_features(
         input_examples, label_list, self.MAX_SEQ_LENGTH,
         self.tokenizer)
     predict_input_fn = run_classifier.input_fn_builder(
         features=input_features,
         seq_length=self.MAX_SEQ_LENGTH,
         is_training=False,
         drop_remainder=False)

     # list() drains the prediction generator so the results can be reused.
     self.pred = list(self.estimator.predict(input_fn=predict_input_fn))
     return self.pred
Example #17
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # Only the test set has a header
         if set_type == 'test' and i == 0:
             continue
         #guid = '%s-%s' % (set_type, i)
         guid = tokenization.convert_to_unicode(line[0])
         if set_type == 'test':
             text_a = tokenization.convert_to_unicode(line[3])
             #Set a dummy value. This is not used
             label = 'positive'
         else:
             text_a = tokenization.convert_to_unicode(line[3])
             label = tokenization.convert_to_unicode(line[1])
         examples.append(
             run_classifier.InputExample(guid=guid,
                                         text_a=text_a,
                                         text_b=None,
                                         label=label))
     return examples
  def _create_examples(self, lines, set_type):
    """Creates examples for the training, dev and test sets."""
    examples = []
    lines.pop(0)
    for (i, line) in enumerate(lines):
      #Unqiue ID is created
      guid = "%s-%d" % (set_type, i)
      if set_type=='test':
        # removing header and invalid data

        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = 0 # We will use zero for test as convert_example_to_features doesn't support None
      else:
        #We need to check that these labels are looked at correctly. text_a is the question and text_b is the answer.
        # The label is the thrid variable in the .tsv file that we pass through.
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = int(line[2])
      examples.append(
          run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
Example #19
0
def BerQA_train_predict(data, is_training=True):
    """Train or run the BertQA estimator over ``data`` in chunks.

    Rows of ``data`` are taken ``max_inputs`` at a time, expanded by
    ``qar_pair`` into (senA, senB) pairs labelled 1 and converted to BERT
    features. A fresh run config, model function and estimator are built for
    every chunk. With ``is_training`` true the estimator is trained with an
    early-stopping hook; otherwise its predictions are collected.

    :param data: DataFrame whose rows are passed to ``qar_pair``.
    :param is_training: train when true, predict otherwise.
    :return: when ``is_training is False``, the first ``len(scores)`` rows of
        ``data`` with a ``BERTQA_scores`` column; otherwise nothing is
        returned explicitly.

    NOTE(review): relies on names not defined in this function
    (MAX_SEQ_LENGTH, BATCH_SIZE, OUTPUT_DIR, tokenizer, qar_pair,
    model_fn_builder, input_fn_builder, category) -- presumably module-level;
    confirm.
    """
    d = data.copy()
    scores = []
    max_inputs = 30000  # rows of d handled per pass

    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 2
    WARMUP_PROPORTION = 0.1
    # Model configs
    SAVE_CHECKPOINTS_STEPS = 1000
    SAVE_SUMMARY_STEPS = 500
    num_train_steps = 100
    max_steps = 100000

    DATA_COLUMN_A = 'senA'
    DATA_COLUMN_B = 'senB'
    LABEL_COLUMN = 'Label'
    label_list = [0, 1]

    # Stop when the data is used up or the accumulated step budget passes max_steps.
    while (len(d) > 0 and num_train_steps <= max_steps):
        line = min(max_inputs, len(d))
        temp = d[:line]
        # qar_pair returns a list of pairs per row; flatten into one two-column frame.
        temp_t = temp.apply(qar_pair, axis=1)
        temp_t = temp_t.tolist()
        flat_list = [item for sublist in temp_t for item in sublist]
        temp_t = pd.DataFrame(flat_list, columns=['senA', 'senB'])
        temp_t['Label'] = 1
        temp_t['senA'] = temp_t['senA'].apply(str)
        temp_t['senB'] = temp_t['senB'].apply(str)

        temp_InputExamples = temp_t.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[DATA_COLUMN_A],
                                                  text_b=x[DATA_COLUMN_B],
                                                  label=x[LABEL_COLUMN]),
            axis=1)

        temp_features = run_classifier.convert_examples_to_features(
            temp_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
        # The step budget accumulates across passes rather than restarting.
        num_train_steps = int(
            len(temp_features) / BATCH_SIZE *
            NUM_TRAIN_EPOCHS) + num_train_steps
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
        # Specify output directory and number of checkpoint steps to save
        run_config = tf.estimator.RunConfig(
            model_dir=OUTPUT_DIR,
            save_summary_steps=SAVE_SUMMARY_STEPS,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

        model_fn = model_fn_builder(num_labels=len(label_list),
                                    learning_rate=LEARNING_RATE,
                                    num_train_steps=num_train_steps,
                                    num_warmup_steps=num_warmup_steps)

        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config,
                                           params={"batch_size": BATCH_SIZE})

        # NOTE(review): drop_remainder=True is used for prediction too, so a
        # trailing partial batch may be left unscored -- confirm this is intended.
        input_fn = input_fn_builder(features=temp_features,
                                    seq_length=MAX_SEQ_LENGTH,
                                    is_training=is_training,
                                    drop_remainder=True)

        if is_training:
            print('Beginning Training!')
            early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
                estimator,
                metric_name='loss',
                max_steps_without_decrease=1000,
                min_steps=100)
            current_time = datetime.now()
            #tf.estimator.train_and_evaluate(estimator,train_spec=tf.estimator.TrainSpec(input_fn, hooks=[early_stopping]))
            estimator.train(input_fn=input_fn,
                            max_steps=num_train_steps,
                            hooks=[early_stopping])
            print("Training took time ", datetime.now() - current_time)
        else:
            predictions = estimator.predict(input_fn)
            outputs = [(prediction['probabilities'], prediction['crq'])
                       for prediction in predictions]
            x = [i[0] for i in outputs]
            y = [i[1] for i in outputs]
            print('\n')
            # Reported "accuracy" is the share of probability values above 0.
            print('Accuracy of ' + category + ' is: ' +
                  str(sum(i > 0 for i in x) / len(x)))
            print('\n')
            # Only the 'crq' outputs are kept as scores.
            scores = scores + y

        # Drop the rows just processed, or stop once the last chunk is done.
        if len(d) > max_inputs:
            d = d[line:]
            d = d.reset_index(drop=True)
        else:
            d = []

    if is_training is False:
        # Trim data to the number of scores actually produced.
        data = data[:len(scores)]
        scores = [item.tolist() for item in scores]
        #BERTQA_scores = pd.DataFrame(data=scores)
        data['BERTQA_scores'] = scores
        #data = pd.concat([data,BERTQA_scores],axis=1,ignore_index=True)
        return data
Example #20
0
def main(_):
    """Score the annotated question/review data with the FLTR and BertQA models.

    For current work, we only use the following four categories,
    but you can add others if you would like to.

    Reads ``Annotated_Data.txt`` from ``FLAGS.data_dir``, and for every
    category builds one (question, review) example per review. Each model's
    estimator is loaded from its own ``<category>_<model>`` directory under
    ``FLAGS.model_output_dir`` and used to predict on those examples. The
    per-pair scores are grouped back into one list per question and added as
    a ``<model>_score`` column. The combined result for all categories is
    written to ``test_predictions.txt`` in ``FLAGS.data_dir``.
    """
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]

    models = ['FLTR', 'BertQA']

    data_path = os.path.join(FLAGS.data_dir, 'Annotated_Data.txt')
    # The two converter columns hold Python literals stored as text.
    data = pd.read_csv(data_path,
                       sep='\t',
                       encoding='utf-8',
                       converters={
                           'annotation_score': ast.literal_eval,
                           'reviews': ast.literal_eval
                       })
    # Expose the row number as an 'index' column so pairs can be grouped back later.
    data = data.reset_index()
    # One [index, question, review] triple per review of each row.
    data['qr'] = data[['index', 'question', 'reviews'
                       ]].apply(lambda x: [[x['index'], x['question'], i]
                                           for i in x['reviews']],
                                axis=1)

    d = []
    for category in categories:
        # Flatten this category's triples into one row per (question, review) pair.
        qr = data[data['category'] == category]['qr'].tolist()
        qr = [item for sublist in qr for item in sublist]
        qr = pd.DataFrame(columns=['index', 'question', 'review'], data=qr)
        qr['label'] = 1

        temp = qr.copy()
        temp['question'] = temp['question'].apply(str)
        temp['review'] = temp['review'].apply(str)
        DATA_COLUMN_A = 'question'
        DATA_COLUMN_B = 'review'
        LABEL_COLUMN = 'label'
        label_list = [0, 1]

        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)

        test_InputExamples = temp.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[DATA_COLUMN_A],
                                                  text_b=x[DATA_COLUMN_B],
                                                  label=x[LABEL_COLUMN]),
            axis=1)

        test_features = run_classifier.convert_examples_to_features(
            test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)

        # t collects this category's rows plus one score column per model.
        t = data[data['category'] == category]
        t = t.reset_index(drop=True)
        for model in models:

            OUTPUT_DIR = os.path.join(FLAGS.model_output_dir,
                                      category + '_' + model)
            run_config = tf.estimator.RunConfig(model_dir=OUTPUT_DIR,
                                                save_summary_steps=100,
                                                save_checkpoints_steps=100)

            # Both builders receive the same arguments; only the builder differs.
            model_fn = None
            if model == 'BertQA':
                model_fn = model_fn_builder_BertQA(
                    bert_config=modeling.BertConfig.from_json_file(
                        FLAGS.bert_config_file),
                    num_labels=len(label_list),
                    init_checkpoint=OUTPUT_DIR,
                    learning_rate=FLAGS.learning_rate,
                    num_train_steps=100,
                    num_warmup_steps=100)
            else:
                model_fn = model_fn_builder_FLTR(
                    bert_config=modeling.BertConfig.from_json_file(
                        FLAGS.bert_config_file),
                    num_labels=len(label_list),
                    init_checkpoint=OUTPUT_DIR,
                    learning_rate=FLAGS.learning_rate,
                    num_train_steps=100,
                    num_warmup_steps=100)

            estimator = tf.estimator.Estimator(
                model_fn=model_fn,
                config=run_config,
                params={"batch_size": FLAGS.train_batch_size})

            test_input_fn = run_classifier.input_fn_builder(
                features=test_features,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=False)

            predictions = estimator.predict(test_input_fn)
            probabilities = [
                prediction['probabilities'] for prediction in predictions
            ]
            probabilities = [list(item) for item in probabilities]

            # FLTR keeps the element at index 1, BertQA the element at index 0.
            if model == 'FLTR':
                probabilities = [item[1] for item in probabilities]
            else:
                probabilities = [item[0] for item in probabilities]

            print(model, ' :', probabilities[:10])
            temp[model + '_score'] = probabilities
            # Collapse the per-pair scores into one list per question, keeping
            # the original order (sort=False).
            temp_groupby = temp.groupby(
                ['index', 'question'],
                sort=False)[model + '_score'].apply(list).reset_index(
                    name=model + '_score')
            # NOTE(review): the column-wise concat assumes the grouped rows
            # line up positionally with t -- confirm.
            t = pd.concat([t, temp_groupby[model + '_score']], axis=1)

        if len(d) == 0:
            d = t
        else:
            d = pd.concat([d, t], axis=0, ignore_index=True)

    d.to_csv(os.path.join(FLAGS.data_dir, 'test_predictions.txt'),
             index=None,
             sep='\t',
             mode='w')
Example #21
0
# Combine the two labelled frames, shuffle them, and hold out the last 10% for testing.
d = pd.concat([qa,nqa],axis=0)
d=shuffle(d)
d['question']=d['question'].apply(str)
d['answer']=d['answer'].apply(str)
split = int(len(d)*0.9)
dtrain = d[0:split]
dtest = d[split:]

DATA_COLUMN_A = 'question'
DATA_COLUMN_B = 'answer'
LABEL_COLUMN = 'label'
label_list = [0, 1]

# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = dtrain.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                        text_a = x[DATA_COLUMN_A],
                                                                        text_b = x[DATA_COLUMN_B],
                                                                        label = x[LABEL_COLUMN]), axis = 1)

test_InputExamples = dtest.apply(lambda x: run_classifier.InputExample(guid=None,text_a = x[DATA_COLUMN_A],
                                            text_b = x[DATA_COLUMN_B],label = x[LABEL_COLUMN]), axis = 1)
                                            
# Convert our train and test features to InputFeatures that BERT understands.
train_features = run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

# Compute the number of train and warmup steps from the batch size
# (the literal 2 is the multiplier applied to the batches per pass).
num_train_steps = int(len(train_features) / BATCH_SIZE * 2)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
Example #22
0
# Distinct label values present in the training data, e.g. [0, 1]
label_list = list(np.unique(train['label']))

# The two shape tuples are concatenated to fill the four %d slots.
shape_values = train.shape + test.shape
tf.logging.info('shape of data: train: (%d,%d), test: (%d,%d)' % shape_values)
tf.logging.info('columns of train file: %s' % ','.join(train.columns))

# <------------------ Prepare the training input

tf.logging.info('prepare training input...')


def _row_to_input_example(row):
    """Wrap one data row in BERT's ``run_classifier.InputExample``.

    The guid (a globally unique ID for bookkeeping) is unused in this
    example, and there is no second text segment.
    """
    return run_classifier.InputExample(guid=None,
                                       text_a=row['text'],
                                       text_b=None,
                                       label=row['label'])


# Each data point of both splits is wrapped into an InputExample.
train_InputExamples = train.apply(_row_to_input_example, axis=1)
test_InputExamples = test.apply(_row_to_input_example, axis=1)

# <------------------ Prepare tokenizer and do tokenization

tf.logging.info('prepare tokenizer')

# Checks whether the casing config is consistent with the checkpoint name.
tokenization.validate_case_matches_checkpoint(
    do_lower_case=True,
    init_checkpoint=FLAGS.BERT_INIT_CHKPNT)
Example #23
0
def main(_):
    """Train and evaluate the FLTR question/answer matching classifier.

    Loads the tab-separated data for ``FLAGS.category_name`` (or, for
    "All_Categories", the first 80% of each file listed in ``categories``),
    builds positive (question, answer) pairs plus negative pairs made by
    shuffling the answers, trains the estimator built from
    ``model_fn_builder``, and prints accuracy on a held-out 10% split.
    When ``FLAGS.do_predict`` is set, ``FLTR_Prediction`` is run on the full
    data and the result is written back to the category's ``.txt`` file.

    For current work only the four categories below are used, but others
    can be added.

    Args:
        _: Unused.
    """
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]

    data = None

    if FLAGS.category_name == "All_Categories":
        # Cross-domain pre-training (All_Categories).
        for category in categories:
            category_data_path = os.path.join(FLAGS.data_dir, category + '.txt')
            category_data = pd.read_csv(
                category_data_path, sep='\t', encoding='utf-8',
                converters={'QA': ast.literal_eval,
                            'reviewText': ast.literal_eval})

            # Keep the first 80% of this category. The cut has to be based on
            # the category's own length: `data` is still None on the first
            # iteration, so len(data) would raise TypeError.
            category_data = category_data[:int(len(category_data) * 0.8)]
            if data is None:
                data = category_data
            else:
                data = pd.concat([data, category_data], axis=0)
        # Shuffle and cap the combined set at 60000 rows.
        data = data.sample(n=min(len(data), 60000))
    else:
        data_path = os.path.join(FLAGS.data_dir, FLAGS.category_name + '.txt')
        data = pd.read_csv(
            data_path, sep='\t', encoding='utf-8',
            converters={'QA': ast.literal_eval,
                        'reviewText': ast.literal_eval})

    data['question'] = data['QA'].apply(lambda x: x['questionText'])
    # Use the first answer when there is one, otherwise the padding token.
    data['answer'] = data['QA'].apply(
        lambda x: x['answers'][0]['answerText'] if len(x['answers']) > 0 else PAD_WORD)
    data['num_reviews'] = data['reviewText'].apply(lambda x: len(x))

    train = data[:int(len(data) * 0.8)]

    # Negative pairs: every question paired with an answer drawn from the
    # shuffled answer list.
    list_of_answers = shuffle(list(train['answer']))
    # Explicit copy so that adding the label column does not write into a
    # slice of `data` (avoids pandas' SettingWithCopyWarning).
    qa = train[['question', 'answer']].copy()
    nqa = pd.DataFrame({'question': train['question'].tolist(),
                        'answer': list_of_answers})
    qa['label'] = 1
    nqa['label'] = 0

    d = pd.concat([qa, nqa], axis=0)
    d = shuffle(d)
    d['question'] = d['question'].apply(str)
    d['answer'] = d['answer'].apply(str)
    split = int(len(d) * 0.9)
    dtrain = d[0:split]
    # Copy because a 'prediction' column is assigned to dtest after inference.
    dtest = d[split:].copy()

    DATA_COLUMN_A = 'question'
    DATA_COLUMN_B = 'answer'
    LABEL_COLUMN = 'label'
    label_list = [0, 1]

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=True)

    train_InputExamples = dtrain.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)

    test_InputExamples = dtest.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)

    train_features = run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)
    test_features = run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)

    OUTPUT_DIR = os.path.join(FLAGS.model_output_dir, FLAGS.category_name + '_FLTR')
    tf.gfile.MakeDirs(OUTPUT_DIR)

    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    # Total optimisation steps and the warm-up share of them.
    num_train_steps = int(len(train_features) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(FLAGS.bert_config_file),
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": FLAGS.train_batch_size})

    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    print("Beginning Training!")
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.now() - current_time)

    # drop_remainder=False so that every test row gets a prediction and the
    # prediction list lines up with dtest.
    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    predictions = estimator.predict(test_input_fn)
    x = [np.argmax(prediction['probabilities']) for prediction in predictions]
    dtest['prediction'] = x
    print("The accuracy of FLTR on " + FLAGS.category_name + " is: "
          + str(accuracy_score(dtest.label, dtest.prediction)))

    if FLAGS.do_predict:
        print("Beginning Prediction!")
        data_with_FLTR_predictions = FLTR_Prediction(data, tokenizer, estimator)

        # Replace missing values with the "[PAD]" token before writing.
        if data_with_FLTR_predictions.isnull().values.any():
            data_with_FLTR_predictions = data_with_FLTR_predictions.replace(
                np.nan, "[PAD]", regex=True)

        # NOTE: this overwrites the category's input file in FLAGS.data_dir.
        data_with_FLTR_predictions.to_csv(
            os.path.join(FLAGS.data_dir, FLAGS.category_name + '.txt'),
            index=None, sep='\t', mode='w')
        print("Prediction End!")
def read_classifier_examples(input_file, labels, is_training):
    """Read a SQuAD-style JSON file into a list of ``run_classifier.InputExample``.

    Every question becomes one example whose ``text_a`` is the question
    text, ``text_b`` is the text of its first answer (an empty string when
    the question has no answers) and ``label`` is ``labels[qas_id]``.

    Args:
        input_file: Path of a JSON file with a top-level ``"data"`` list whose
            entries hold ``"paragraphs"``, each with ``"context"`` and ``"qas"``.
        labels: Mapping from question id (``qa["id"]``) to its label.
        is_training: Unused; kept so existing callers keep working.

    Returns:
        The list of examples. Questions whose answer text cannot be found in
        the whitespace-tokenized context are skipped.

    Raises:
        KeyError: If a question id is missing from ``labels``.
    """
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        # 0x202F is NARROW NO-BREAK SPACE.
        return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Split the context on whitespace and remember, for every
            # character, the index of the word it belongs to.
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                answers = qa["answers"]

                if answers:
                    answer = answers[0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]
                    # Only add answers where the text can be exactly recovered
                    # from the document. If this CAN'T happen it's likely due
                    # to weird Unicode stuff so we will just skip the example.
                    #
                    # Note that this means not every question is guaranteed
                    # to yield an example.
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        continue
                else:
                    # No answer available: keep the question with an empty
                    # second segment instead of failing on answers[0].
                    orig_answer_text = ""

                label = labels[qas_id]
                example = run_classifier.InputExample(guid=qas_id,
                                                      text_a=question_text,
                                                      text_b=orig_answer_text,
                                                      label=label)
                examples.append(example)
    return examples
Example #25
0
def main(_):
    """Train and evaluate the BertQA sentence-pair model.

    Loads the tab-separated data for ``FLAGS.category_name`` (or, for
    "All_Categories", the first 10000 rows of each file listed in
    ``categories``), adds the ``FLTR_Top10`` and shuffled ``non_answer``
    columns, expands every row into sentence pairs with ``qar_pair``, trains
    the estimator built from ``model_fn_builder`` and prints the fraction of
    test pairs whose predicted ``scores`` value is greater than zero.

    For current work only the four categories below are used, but others
    can be added.

    Args:
        _: Unused.
    """
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]

    converters = {'reviewText': ast.literal_eval,
                  'FLTR_scores': ast.literal_eval}

    data = None

    # Cross-domain pre-training (All_Categories) to boost the performance.
    if FLAGS.category_name == "All_Categories":
        for category in categories:
            category_data_path = os.path.join(FLAGS.data_dir, category + '.txt')
            category_data = pd.read_csv(category_data_path, sep='\t',
                                        encoding='utf-8', nrows=10000,
                                        converters=converters)
            if data is None:
                data = category_data
            else:
                data = pd.concat([data, category_data], axis=0)
        # Sampling every row shuffles the combined data.
        data = data.sample(n=len(data))
    else:
        data_path = os.path.join(FLAGS.data_dir, FLAGS.category_name + '.txt')
        # The keyword is `converters`; the former `cconverters` spelling made
        # read_csv raise TypeError on this path.
        data = pd.read_csv(data_path, sep='\t', encoding='utf-8',
                           converters=converters)

    data['FLTR_Top10'] = data.apply(FLTR_Top10, axis=1)
    # A shuffled copy of the answers is stored next to each row.
    list_of_answers = shuffle(list(data['answer']))
    data['non_answer'] = list_of_answers

    cut = int(len(data) * 0.8)
    train = data[:cut]
    train = train.sample(n=min(20000, len(train)))
    test = data[cut:]
    print(train.shape, test.shape)

    DATA_COLUMN_A = 'senA'
    DATA_COLUMN_B = 'senB'
    LABEL_COLUMN = 'Label'
    label_list = [0, 1]

    def _pairs_to_frame(rows):
        """Expand rows with qar_pair and flatten the result into one frame.

        qar_pair (defined elsewhere) is expected to return, per row, a list
        of two-item (senA, senB) pairs; every pair is labelled 1.
        """
        per_row_pairs = rows.apply(qar_pair, axis=1).tolist()
        flat_pairs = [pair for pairs in per_row_pairs for pair in pairs]
        frame = pd.DataFrame(flat_pairs, columns=[DATA_COLUMN_A, DATA_COLUMN_B])
        frame[LABEL_COLUMN] = 1
        frame[DATA_COLUMN_A] = frame[DATA_COLUMN_A].apply(str)
        frame[DATA_COLUMN_B] = frame[DATA_COLUMN_B].apply(str)
        return frame

    train = _pairs_to_frame(train)
    test = _pairs_to_frame(test)
    print(train.shape, test.shape)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=True)

    train_InputExamples = train.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)

    test_InputExamples = test.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)

    train_features = run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)
    test_features = run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)

    OUTPUT_DIR = os.path.join(FLAGS.model_output_dir, FLAGS.category_name + "_BertQA")
    tf.gfile.MakeDirs(OUTPUT_DIR)

    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        keep_checkpoint_max=2,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    # Total optimisation steps and the warm-up share of them.
    num_train_steps = int(len(train_features) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(FLAGS.bert_config_file),
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": FLAGS.train_batch_size})

    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    print("Beginning Training!")
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.now() - current_time)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=True)

    predictions = estimator.predict(test_input_fn)
    x = [prediction['scores'] for prediction in predictions]
    print('\n')
    # Reported accuracy is the share of test pairs with a score above zero.
    print("The accuracy of BertQA on " + FLAGS.category_name + " is: "
          + str(sum(i > 0 for i in x) / len(x)))
    print('\n')
Example #26
0
def convert_input(x):
    """Build a single-text ``run_classifier.InputExample`` from record ``x``.

    The example takes its guid from ``x["id"]`` and its text from
    ``x["comment_text"]``; there is no second text and the label is fixed at 0.
    """
    example_fields = {
        "guid": x["id"],
        "text_a": x["comment_text"],
        "text_b": None,
        "label": 0,
    }
    return run_classifier.InputExample(**example_fields)