def create_examples(lines, set_type, labels=None):
    guid = set_type
    examples = []
    if set_type == 'train':
        for line, label in zip(lines, labels):
            text_a = line
            label = str(label)
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    else:
        for line in lines:
            text_a = line
            label = '0'
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training, dev and test sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%d" % (set_type, i)
        if set_type == 'test':
            # removing header and invalid data
            if i == 0 or len(line) != 3:
                print(guid, line)
                continue
            text_a = tokenization.convert_to_unicode(line[1])
            text_b = tokenization.convert_to_unicode(line[2])
            label = 0  # We use zero for test as convert_examples_to_features doesn't support None
        else:
            # removing header and invalid data
            if i == 0 or len(line) != 6:
                continue
            text_a = tokenization.convert_to_unicode(line[3])
            text_b = tokenization.convert_to_unicode(line[4])
            label = int(line[5])
        examples.append(
            run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def create_examples(lines, set_type, labels=None):
    """Create single training/test examples for simple sequence classification."""
    # Generate data for the BERT model
    guid = f'{set_type}'
    examples = []
    if guid == 'train':
        for line, label in zip(lines, labels):
            text_a = line
            label = str(label)
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    else:
        for line in lines:
            text_a = line
            label = '0'
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def getPrediction(in_sentences):
    '''
    Function to provide class predictions for a list of input sentences
    :param in_sentences:
    :return:
    '''
    #labels = ["Negative", "Positive"]
    # pre-process input
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]  # here, "" is a dummy guid and 0 a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, FLAGS.MAX_SEQ_LENGTH, tokenizer)
    # create model function input
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=FLAGS.MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    # calculate the predictions
    predictions = estimator.predict(predict_input_fn)
    output = [(sentence, prediction['probabilities'], prediction['labels'])
              for sentence, prediction in zip(in_sentences, predictions)]
    return output
def classify(search_sentence):
    examples = []
    guid = "dev-1"
    text_a = tokenization.convert_to_unicode(search_sentence)
    label = tokenization.convert_to_unicode("1")  # put a dummy label
    examples.append(
        run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    predict_features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=predict_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)
    result = estimator.predict(input_fn=predict_input_fn)
    for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        if i >= 1:
            break
    emotion = {"happy": 0, "sad": 0, "angry": 0, "surprised": 0}
    _emotion = ["happy", "sad", "angry", "surprised"]
    for (i, class_probability) in enumerate(probabilities):
        emotion[_emotion[i]] = class_probability
        print(_emotion[i] + ": " + str(class_probability))
    return emotion
def predict(sentence):
    input_example = run_classifier.InputExample(
        guid="", text_a=sentence, text_b=None,
        label="0")  # here, "" is a dummy guid and "0" a dummy label
    input_examples = [input_example]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(input_fn=predict_input_fn)
    df = pd.DataFrame(predictions)
    label_index = df.iat[0, 0].argmax()
    confidence = df.iat[0, 0].max()
    predicted_label = emotions.iat[int(label_index), 0]
    return {
        'predicted_label': predicted_label,
        'confidence': confidence.item()
    }
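# A minimal usage sketch for the predict() helper above, assuming `estimator`,
# `tokenizer`, `label_list`, MAX_SEQ_LENGTH, and the `emotions` lookup
# DataFrame have already been initialized as in the snippet.
result = predict("I can't believe how well this worked!")
print(result['predicted_label'], result['confidence'])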
def FLTR_Prediction(d, tokenizer, estimator):
    """Rank all reviews according to how relevant they are to a given question."""
    d = d.reset_index(drop=True)
    test_t = d.apply(QR_pair, axis=1)
    test_t = test_t.tolist()
    flat_list = [item for sublist in test_t for item in sublist]
    test_t = pd.DataFrame(flat_list, columns=['question', 'review'])
    test_t['question'] = test_t['question'].apply(str)
    test_t['review'] = test_t['review'].apply(str)
    DATA_COLUMN_A = 'question'
    DATA_COLUMN_B = 'review'
    label_list = [0, 1]
    max_inputs = 1000000
    probs = []
    temp_test = test_t.copy()
    while len(temp_test) > 0:
        line = min(max_inputs, len(temp_test))
        temp = temp_test[:line]
        inputExamples = temp.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[DATA_COLUMN_A],
                                                  text_b=x[DATA_COLUMN_B],
                                                  label=0),
            axis=1)
        input_features = run_classifier.convert_examples_to_features(
            inputExamples, label_list, FLAGS.max_seq_length, tokenizer)
        predict_input_fn = run_classifier.input_fn_builder(
            features=input_features,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        predictions = estimator.predict(predict_input_fn)
        probabilities = [prediction['probabilities'] for prediction in predictions]
        probs = probs + [item.tolist()[1] for item in probabilities]
        if len(temp_test) > max_inputs:
            temp_test = temp_test[line:]
            temp_test = temp_test.reset_index(drop=True)
        else:
            temp_test = []
    test_t['probabilities'] = probs
    num_reviews = d['num_reviews'].tolist()
    d['FLTR_scores'] = ''
    for i in range(0, len(d)):
        n = num_reviews[i]
        #print(probs[:n])
        d.at[i, 'FLTR_scores'] = probs[:n]
        #print(d.at[i,'FLTR_scores'])
        if i != len(d) - 1:
            probs = probs[n:]
    return d
def get_predict_examples(self, sentence_pairs):
    """Given question pairs, convert them to a list of InputExamples."""
    examples = []
    for (i, qpair) in enumerate(sentence_pairs):
        guid = "predict-%d" % (i)
        # converting questions to utf-8 and creating InputExamples
        text_a = tokenization.convert_to_unicode(qpair[0])
        text_b = tokenization.convert_to_unicode(qpair[1])
        # We add the label as 0, because None is not supported when converting to features
        examples.append(
            run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=0))
    return examples
def get_predict_examples(list1, list2):
    examples = []
    for i, (sent1, sent2) in enumerate(zip(list1, list2)):
        guid = f"test-{i}"
        text_a = tokenization.convert_to_unicode(sent1)
        text_b = tokenization.convert_to_unicode(sent2)
        label = "0"
        examples.append(
            run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def create_examples(lines, set_type, labels=None):
    """Generate data for the BERT model

    :param lines: (numpy array) input sentences
    :param set_type: (str) e.g. train
    :param labels: (numpy array) labels
    :return examples: list
    """
    guid = f'{set_type}'
    examples = []
    if guid == 'train':
        for line, label in zip(lines, labels):
            text_a = line
            label = str(label)
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    else:
        for line in lines:
            text_a = line
            label = '0'
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
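# A minimal sketch of how the examples from create_examples() above typically
# feed into BERT's feature pipeline; `train_sentences`, `train_labels`,
# `label_list`, MAX_SEQ_LENGTH, and `tokenizer` are assumed to be defined
# elsewhere (e.g., label_list = ['0', '1']).
train_examples = create_examples(train_sentences, 'train', labels=train_labels)
train_features = run_classifier.convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)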
def create_examples(lines, set_type, labels=None):
    # Generate data for the BERT model
    guid = f'{set_type}'
    examples = []
    if guid == 'train':
        for line, label in zip(lines, labels):
            text_a = line
            label = str(label)
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    else:
        for line in lines:
            text_a = line
            label = '[0 0 0 0 0 0]'  # dummy multi-label string for non-training data
            examples.append(
                run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def load_data(
        load_fn=standard_loader('twitter'),
        bert_pair_shape=True,
        split=None,
        context_extent='last',
        emoji_totext=False,
        random_state=42
):
    """
    :param load_fn: a loader function which, when executed, returns a data frame with 'label', 'response' and 'context'
    :param bert_pair_shape: True if you want the data shaped for BERT
    :param split: if None the full dataset is returned, else (train, test) where split is the fraction used for test
    :param context_extent: 'last' or 'all'
    :param emoji_totext: process emojis to text if True
    :return:
    """
    # Why do we bother doing this? We want to pass some arbitrary data not tied to any specific file later.
    res = load_fn()

    # Process emojis if we'd like to
    if emoji_totext:
        res['response'] = res['response'].apply(lambda resp: emoji.demojize(resp))
        res['context'] = res['context'].apply(lambda contexts: [emoji.demojize(c) for c in contexts])

    # Get it into a BERT-friendly format
    new_res = []
    if bert_pair_shape:
        i = 1
        for _ind, row in res.iterrows():
            guid = "%d" % i  # A unique id for each row.
            # The main text which we wish to classify
            text_a = tokenization.convert_to_unicode(row['response'])
            # In this sentence-pair classification task, our context is the second sentence.
            text_b = tokenization.convert_to_unicode(context_process(row['context'], context_extent))
            # Sarcasm label
            label = row.get('label', 'NOT_SARCASM')
            # Convert every row into a BERT InputExample object
            new_res.append(run_classifier.InputExample(guid, text_a, text_b, label))
            i += 1  # keep the guid unique per row
    else:
        new_res = res

    # Split data for test if needed
    if split is None:
        return new_res
    else:
        train, test = train_test_split(new_res, test_size=split, random_state=random_state)
        return train, test
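# A hypothetical call to load_data() above, assuming the default twitter
# loader and its data files are available; returns BERT-ready InputExamples
# split 80/20 into train and test.
train, test = load_data(split=0.2, context_extent='all', emoji_totext=True)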
def getPrediction(self, in_sentences):
    label_list = [0, 1]
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0)
        for x in in_sentences
    ]  # here, "" is a dummy guid and 0 a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, self.MAX_SEQ_LENGTH, self.tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=self.MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = self.estimator.predict(predict_input_fn)
    return [(sentence, prediction['encode_vec'])
            for sentence, prediction in zip(in_sentences, predictions)]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        n = 0
        if set_type == "train":
            n = 2222
        elif set_type == "dev":
            n = 3333
        else:
            n = 4444
        uid = str(n) + str(i)
        guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(uid))
        text_a = tokenization.convert_to_unicode(line[0])
        #text_b = tokenization.convert_to_unicode(line[9])
        label = tokenization.convert_to_unicode(line[-1])
        examples.append(
            rc.InputExample(guid=guid, text_a=text_a, label=label))
    return examples
def scorePredict(self):
    sentences = self.sentences
    input_examples = [
        run_classifier.InputExample(guid="", text_a=x, text_b=None, label="0")
        for x in sentences
    ]  # here, "" is a dummy guid and "0" a dummy label
    input_features = run_classifier.convert_examples_to_features(
        input_examples, ['0', '1', '-1'], self.MAX_SEQ_LENGTH, self.tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features,
        seq_length=self.MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = self.estimator.predict(input_fn=predict_input_fn)
    self.pred = list(predictions)
    return self.pred
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header
        if set_type == 'test' and i == 0:
            continue
        #guid = '%s-%s' % (set_type, i)
        guid = tokenization.convert_to_unicode(line[0])
        if set_type == 'test':
            text_a = tokenization.convert_to_unicode(line[3])
            # Set a dummy value. This is not used.
            label = 'positive'
        else:
            text_a = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training, dev and test sets."""
    examples = []
    lines.pop(0)  # drop the header row
    for (i, line) in enumerate(lines):
        # A unique ID is created
        guid = "%s-%d" % (set_type, i)
        if set_type == 'test':
            text_a = tokenization.convert_to_unicode(line[0])
            text_b = tokenization.convert_to_unicode(line[1])
            label = 0  # We use zero for test as convert_examples_to_features doesn't support None
        else:
            # We need to check that these labels are read correctly: text_a is the question and
            # text_b is the answer. The label is the third column in the .tsv file we pass through.
            text_a = tokenization.convert_to_unicode(line[0])
            text_b = tokenization.convert_to_unicode(line[1])
            label = int(line[2])
        examples.append(
            run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def BerQA_train_predict(data, is_training=True):
    d = data.copy()
    scores = []
    max_inputs = 30000
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 2
    WARMUP_PROPORTION = 0.1
    # Model configs
    SAVE_CHECKPOINTS_STEPS = 1000
    SAVE_SUMMARY_STEPS = 500
    num_train_steps = 100
    max_steps = 100000
    DATA_COLUMN_A = 'senA'
    DATA_COLUMN_B = 'senB'
    LABEL_COLUMN = 'Label'
    label_list = [0, 1]
    while (len(d) > 0 and num_train_steps <= max_steps):
        line = min(max_inputs, len(d))
        temp = d[:line]
        temp_t = temp.apply(qar_pair, axis=1)
        temp_t = temp_t.tolist()
        flat_list = [item for sublist in temp_t for item in sublist]
        temp_t = pd.DataFrame(flat_list, columns=['senA', 'senB'])
        temp_t['Label'] = 1
        temp_t['senA'] = temp_t['senA'].apply(str)
        temp_t['senB'] = temp_t['senB'].apply(str)
        temp_InputExamples = temp_t.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[DATA_COLUMN_A],
                                                  text_b=x[DATA_COLUMN_B],
                                                  label=x[LABEL_COLUMN]),
            axis=1)
        temp_features = run_classifier.convert_examples_to_features(
            temp_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
        num_train_steps = int(
            len(temp_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS) + num_train_steps
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
        # Specify output directory and number of checkpoint steps to save
        run_config = tf.estimator.RunConfig(
            model_dir=OUTPUT_DIR,
            save_summary_steps=SAVE_SUMMARY_STEPS,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
        model_fn = model_fn_builder(num_labels=len(label_list),
                                    learning_rate=LEARNING_RATE,
                                    num_train_steps=num_train_steps,
                                    num_warmup_steps=num_warmup_steps)
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config,
                                           params={"batch_size": BATCH_SIZE})
        input_fn = input_fn_builder(features=temp_features,
                                    seq_length=MAX_SEQ_LENGTH,
                                    is_training=is_training,
                                    drop_remainder=True)
        if is_training:
            print('Beginning Training!')
            early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
                estimator,
                metric_name='loss',
                max_steps_without_decrease=1000,
                min_steps=100)
            current_time = datetime.now()
            #tf.estimator.train_and_evaluate(estimator, train_spec=tf.estimator.TrainSpec(input_fn, hooks=[early_stopping]))
            estimator.train(input_fn=input_fn, max_steps=num_train_steps, hooks=[early_stopping])
            print("Training took time ", datetime.now() - current_time)
        else:
            predictions = estimator.predict(input_fn)
            outputs = [(prediction['probabilities'], prediction['crq'])
                       for prediction in predictions]
            x = [i[0] for i in outputs]
            y = [i[1] for i in outputs]
            print('\n')
            print('Accuracy of ' + category + ' is: ' + str(sum(i > 0 for i in x) / len(x)))
            print('\n')
            scores = scores + y
        if len(d) > max_inputs:
            d = d[line:]
            d = d.reset_index(drop=True)
        else:
            d = []
    if is_training is False:
        data = data[:len(scores)]
        scores = [item.tolist() for item in scores]
        #BERTQA_scores = pd.DataFrame(data=scores)
        data['BERTQA_scores'] = scores
        #data = pd.concat([data, BERTQA_scores], axis=1, ignore_index=True)
        return data
def main(_):
    """For current work, we only use the following four categories, but you can add others if you would like to."""
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]
    models = ['FLTR', 'BertQA']
    data_path = os.path.join(FLAGS.data_dir, 'Annotated_Data.txt')
    data = pd.read_csv(data_path,
                       sep='\t',
                       encoding='utf-8',
                       converters={
                           'annotation_score': ast.literal_eval,
                           'reviews': ast.literal_eval
                       })
    data = data.reset_index()
    data['qr'] = data[['index', 'question', 'reviews']].apply(
        lambda x: [[x['index'], x['question'], i] for i in x['reviews']],
        axis=1)
    d = []
    for category in categories:
        qr = data[data['category'] == category]['qr'].tolist()
        qr = [item for sublist in qr for item in sublist]
        qr = pd.DataFrame(columns=['index', 'question', 'review'], data=qr)
        qr['label'] = 1
        temp = qr.copy()
        temp['question'] = temp['question'].apply(str)
        temp['review'] = temp['review'].apply(str)
        DATA_COLUMN_A = 'question'
        DATA_COLUMN_B = 'review'
        LABEL_COLUMN = 'label'
        label_list = [0, 1]
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)
        test_InputExamples = temp.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[DATA_COLUMN_A],
                                                  text_b=x[DATA_COLUMN_B],
                                                  label=x[LABEL_COLUMN]),
            axis=1)
        test_features = run_classifier.convert_examples_to_features(
            test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)
        t = data[data['category'] == category]
        t = t.reset_index(drop=True)
        for model in models:
            OUTPUT_DIR = os.path.join(FLAGS.model_output_dir, category + '_' + model)
            run_config = tf.estimator.RunConfig(model_dir=OUTPUT_DIR,
                                                save_summary_steps=100,
                                                save_checkpoints_steps=100)
            model_fn = None
            if model == 'BertQA':
                model_fn = model_fn_builder_BertQA(
                    bert_config=modeling.BertConfig.from_json_file(FLAGS.bert_config_file),
                    num_labels=len(label_list),
                    init_checkpoint=OUTPUT_DIR,
                    learning_rate=FLAGS.learning_rate,
                    num_train_steps=100,
                    num_warmup_steps=100)
            else:
                model_fn = model_fn_builder_FLTR(
                    bert_config=modeling.BertConfig.from_json_file(FLAGS.bert_config_file),
                    num_labels=len(label_list),
                    init_checkpoint=OUTPUT_DIR,
                    learning_rate=FLAGS.learning_rate,
                    num_train_steps=100,
                    num_warmup_steps=100)
            estimator = tf.estimator.Estimator(
                model_fn=model_fn,
                config=run_config,
                params={"batch_size": FLAGS.train_batch_size})
            test_input_fn = run_classifier.input_fn_builder(
                features=test_features,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=False)
            predictions = estimator.predict(test_input_fn)
            probabilities = [prediction['probabilities'] for prediction in predictions]
            probabilities = [list(item) for item in probabilities]
            if model == 'FLTR':
                probabilities = [item[1] for item in probabilities]
            else:
                probabilities = [item[0] for item in probabilities]
            print(model, ' :', probabilities[:10])
            temp[model + '_score'] = probabilities
            temp_groupby = temp.groupby(
                ['index', 'question'],
                sort=False)[model + '_score'].apply(list).reset_index(name=model + '_score')
            t = pd.concat([t, temp_groupby[model + '_score']], axis=1)
        if len(d) == 0:
            d = t
        else:
            d = pd.concat([d, t], axis=0, ignore_index=True)
    d.to_csv(os.path.join(FLAGS.data_dir, 'test_predictions.txt'),
             index=None,
             sep='\t',
             mode='w')
d = pd.concat([qa, nqa], axis=0)
d = shuffle(d)
d['question'] = d['question'].apply(str)
d['answer'] = d['answer'].apply(str)

split = int(len(d) * 0.9)
dtrain = d[0:split]
dtest = d[split:]

DATA_COLUMN_A = 'question'
DATA_COLUMN_B = 'answer'
LABEL_COLUMN = 'label'
label_list = [0, 1]

# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = dtrain.apply(
    lambda x: run_classifier.InputExample(
        guid=None,  # Globally unique ID for bookkeeping, unused in this example
        text_a=x[DATA_COLUMN_A],
        text_b=x[DATA_COLUMN_B],
        label=x[LABEL_COLUMN]),
    axis=1)
test_InputExamples = dtest.apply(
    lambda x: run_classifier.InputExample(guid=None,
                                          text_a=x[DATA_COLUMN_A],
                                          text_b=x[DATA_COLUMN_B],
                                          label=x[LABEL_COLUMN]),
    axis=1)

# Convert our train and test features to InputFeatures that BERT understands.
train_features = run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = run_classifier.convert_examples_to_features(
    test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

# Compute train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * 2)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
label_list = list(np.unique(train['label']))  # [0, 1]
tf.logging.info('shape of data: train: (%d,%d), test: (%d,%d)' % (train.shape + test.shape))
tf.logging.info('columns of train file: %s' % ','.join(train.columns))

# <------------------ Prepare the training input
tf.logging.info('prepare training input...')
# Use the InputExample class from BERT's run_classifier code to create examples from the data.
# Each data point is wrapped into an InputExample class.
train_InputExamples = train.apply(
    lambda x: run_classifier.InputExample(
        guid=None,  # Globally unique ID for bookkeeping, unused in this example
        text_a=x['text'],
        text_b=None,
        label=x['label']),
    axis=1)
test_InputExamples = test.apply(
    lambda x: run_classifier.InputExample(
        guid=None, text_a=x['text'], text_b=None, label=x['label']),
    axis=1)

# <------------------ Prepare tokenizer and do tokenization
tf.logging.info('prepare tokenizer')
# Checks whether the casing config is consistent with the checkpoint name.
tokenization.validate_case_matches_checkpoint(
    do_lower_case=True, init_checkpoint=FLAGS.BERT_INIT_CHKPNT)
def main(_):
    """For current work, we only use the following four categories, but you can add others if you would like to."""
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]
    data = None
    if FLAGS.category_name == "All_Categories":
        """Cross-domain pre-training (All_Categories)."""
        for category in categories:
            category_data_path = os.path.join(FLAGS.data_dir, category + '.txt')
            category_data = pd.read_csv(category_data_path,
                                        sep='\t',
                                        encoding='utf-8',  #nrows=5000,
                                        converters={'QA': ast.literal_eval,
                                                    'reviewText': ast.literal_eval})
            # keep the training split (80%) of each category for pre-training
            category_data = category_data[:int(len(category_data) * 0.8)]
            if data is None:
                data = category_data
            else:
                data = pd.concat([data, category_data], axis=0)
        data = data.sample(n=min(len(data), 60000))
    else:
        data_path = os.path.join(FLAGS.data_dir, FLAGS.category_name + '.txt')
        data = pd.read_csv(data_path,
                           sep='\t',
                           encoding='utf-8',  #nrows=10000,
                           converters={'QA': ast.literal_eval,
                                       'reviewText': ast.literal_eval})
    data['question'] = data['QA'].apply(lambda x: x['questionText'])
    data['answer'] = data['QA'].apply(
        lambda x: x['answers'][0]['answerText'] if len(x['answers']) > 0 else PAD_WORD)
    data['num_reviews'] = data['reviewText'].apply(lambda x: len(x))

    train = data[:int(len(data) * 0.8)]
    list_of_answers = list(train['answer'])
    list_of_answers = shuffle(list_of_answers)
    qa = train[['question', 'answer']]
    nqa = pd.DataFrame({'question': train['question'].tolist(),
                        'answer': list_of_answers})
    qa['label'] = 1
    nqa['label'] = 0
    d = pd.concat([qa, nqa], axis=0)
    d = shuffle(d)
    d['question'] = d['question'].apply(str)
    d['answer'] = d['answer'].apply(str)
    split = int(len(d) * 0.9)
    dtrain = d[0:split]
    dtest = d[split:]

    DATA_COLUMN_A = 'question'
    DATA_COLUMN_B = 'answer'
    LABEL_COLUMN = 'label'
    label_list = [0, 1]
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=True)
    train_InputExamples = dtrain.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)
    test_InputExamples = dtest.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)
    train_features = run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)
    test_features = run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)

    OUTPUT_DIR = os.path.join(FLAGS.model_output_dir, FLAGS.category_name + '_FLTR')
    tf.gfile.MakeDirs(OUTPUT_DIR)
    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)
    num_train_steps = int(len(train_features) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(FLAGS.bert_config_file),
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": FLAGS.train_batch_size})
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    print("Beginning Training!")
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.now() - current_time)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(test_input_fn)
    x = [np.argmax(prediction['probabilities']) for prediction in predictions]
    dtest['prediction'] = x
    print("The accuracy of FLTR on " + FLAGS.category_name + " is: " +
          str(accuracy_score(dtest.label, dtest.prediction)))

    if FLAGS.do_predict:
        print("Beginning Prediction!")
        data_with_FLTR_predictions = FLTR_Prediction(data, tokenizer, estimator)
        if data_with_FLTR_predictions.isnull().values.any():
            data_with_FLTR_predictions = data_with_FLTR_predictions.replace(
                np.nan, "[PAD]", regex=True)
        data_with_FLTR_predictions.to_csv(
            os.path.join(FLAGS.data_dir, FLAGS.category_name + '.txt'),
            index=None, sep='\t', mode='w')
        print("Prediction End!")
def read_classifier_examples(input_file, labels, is_training):
    """Read a SQuAD json file into a list of InputExamples."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                #if (len(qa["answers"]) != 1):
                if not is_impossible:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        #tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                        #                   actual_text, cleaned_answer_text)
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                label = labels[qas_id]
                example = run_classifier.InputExample(guid=qas_id,
                                                      text_a=question_text,
                                                      text_b=orig_answer_text,
                                                      label=label)
                examples.append(example)
    return examples
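# A hypothetical invocation of the reader above, assuming a SQuAD-style JSON
# file on disk and a `labels` dict keyed by each question's qas_id.
examples = read_classifier_examples("dev-v1.1.json", labels, is_training=False)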
def main(_):
    """For current work, we only use the following four categories, but you can add others if you would like to."""
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]
    data = None
    """Cross-domain pre-training (All_Categories) to boost the performance."""
    if FLAGS.category_name == "All_Categories":
        for category in categories:
            category_data_path = os.path.join(FLAGS.data_dir, category + '.txt')
            category_data = pd.read_csv(category_data_path,
                                        sep='\t',
                                        encoding='utf-8',
                                        nrows=10000,
                                        converters={'reviewText': ast.literal_eval,
                                                    'FLTR_scores': ast.literal_eval})
            if data is None:
                data = category_data
            else:
                data = pd.concat([data, category_data], axis=0)
        data = data.sample(n=len(data))
    else:
        data_path = os.path.join(FLAGS.data_dir, FLAGS.category_name + '.txt')
        data = pd.read_csv(data_path,
                           sep='\t',
                           encoding='utf-8',  #nrows=10000,
                           converters={'reviewText': ast.literal_eval,
                                       'FLTR_scores': ast.literal_eval})
    #data['len_questions'] = data["question"].apply(lambda x: len(x.split()))
    #data = data[data['len_questions'] <= 10]
    data['FLTR_Top10'] = data.apply(FLTR_Top10, axis=1)
    list_of_answers = list(data['answer'])
    list_of_answers = shuffle(list_of_answers)
    data['non_answer'] = list_of_answers

    train = data[:int(len(data) * 0.8)]
    train = train.sample(n=min(20000, len(train)))
    test = data[int(len(data) * 0.8):]
    print(train.shape, test.shape)

    DATA_COLUMN_A = 'senA'
    DATA_COLUMN_B = 'senB'
    LABEL_COLUMN = 'Label'
    label_list = [0, 1]

    train = train.apply(qar_pair, axis=1)
    test = test.apply(qar_pair, axis=1)
    temp = train.tolist()
    flat_list = [item for sublist in temp for item in sublist]
    train = pd.DataFrame(flat_list, columns=['senA', 'senB'])
    train['Label'] = 1
    train['senA'] = train['senA'].apply(str)
    train['senB'] = train['senB'].apply(str)
    temp = test.tolist()
    flat_list = [item for sublist in temp for item in sublist]
    test = pd.DataFrame(flat_list, columns=['senA', 'senB'])
    test['Label'] = 1
    test['senA'] = test['senA'].apply(str)
    test['senB'] = test['senB'].apply(str)
    print(train.shape, test.shape)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=True)
    train_InputExamples = train.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)
    test_InputExamples = test.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)
    train_features = run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)
    test_features = run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)

    OUTPUT_DIR = os.path.join(FLAGS.model_output_dir, FLAGS.category_name + "_BertQA")
    tf.gfile.MakeDirs(OUTPUT_DIR)
    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        keep_checkpoint_max=2,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)
    num_train_steps = int(len(train_features) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(FLAGS.bert_config_file),
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": FLAGS.train_batch_size})
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    print("Beginning Training!")
    current_time = datetime.now()
    #early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
    #    estimator, metric_name='loss', max_steps_without_decrease=1000, min_steps=100)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)  #, hooks=[early_stopping]
    print("Training took time ", datetime.now() - current_time)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=True)
    predictions = estimator.predict(test_input_fn)
    x = [prediction['scores'] for prediction in predictions]
    print('\n')
    print("The accuracy of BertQA on " + FLAGS.category_name + " is: " +
          str(sum(i > 0 for i in x) / len(x)))
    print('\n')
def convert_input(x):
    return run_classifier.InputExample(guid=x["id"],
                                       text_a=x["comment_text"],
                                       text_b=None,
                                       label=0)
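# A sketch of applying convert_input() row-wise over a hypothetical pandas
# DataFrame `df` with 'id' and 'comment_text' columns (names as in the
# function above).
input_examples = df.apply(convert_input, axis=1).tolist()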