Esempio n. 1
0
def train(hp):
    """Train the advanced blocking model and store it in ``hp.model_fn``.

    Args:
        hp (Namespace): hyperparameters; must provide ``lm``, ``train_fn``,
            ``valid_fn``, ``model_fn``, ``batch_size``, ``n_epochs`` and
            ``fp16``.

    Returns:
        None
    """
    # Map the short language-model name to its HuggingFace checkpoint.
    model_names = {'distilbert': 'distilbert-base-uncased',
                   'bert': 'bert-base-uncased',
                   'albert': 'albert-base-v2'}

    # Transformer encoder + mean pooling -> sentence embeddings.
    # (Original continuation lines mixed tabs into a space-indented file.)
    word_embedding_model = models.Transformer(model_names[hp.lm])
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Load the training and validation data.
    reader = Reader()
    trainset = SentencesDataset(examples=reader.get_examples(hp.train_fn),
                                model=model)
    train_dataloader = DataLoader(trainset,
                                  shuffle=True,
                                  batch_size=hp.batch_size)
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2)

    dev_data = SentencesDataset(examples=reader.get_examples(hp.valid_fn),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=hp.batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # 10% of train data for warm-up.
    # NOTE(review): len(train_dataloader) is already the number of batches,
    # so the extra division by batch_size looks doubtful -- confirm intent.
    warmup_steps = math.ceil(
        len(train_dataloader) * hp.n_epochs / hp.batch_size * 0.1)

    # Remove any stale model directory so fit() writes a fresh one.
    if os.path.exists(hp.model_fn):
        import shutil
        shutil.rmtree(hp.model_fn)

    # Train the model.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=hp.n_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=hp.model_fn,
              fp16=hp.fp16,
              fp16_opt_level='O2')
Esempio n. 2
0
    def preparing_data_mnli(self):
        """Prepare MNLI-format data before training.

        Reads the train/dev/test files configured on the instance, converts
        each row into an ``InputExample`` and builds the three DataLoaders
        ``train_dataloader_nli``, ``dev_dataloader_nli`` and
        ``test_dataloader_nli`` (all shuffled, as before).
        """
        def read_mnli(path):
            # NOTE(review): ``error_bad_lines`` is deprecated/removed in
            # recent pandas; ``on_bad_lines='skip'`` is the replacement.
            df = pd.read_table(path, error_bad_lines=False)
            df.sentence1 = df.sentence1.astype(str)
            df.sentence2 = df.sentence2.astype(str)
            df.gold_label = df.gold_label.astype(str)
            df = df[df.gold_label != '-']  # drop rows without a gold label
            df.dropna(inplace=True)
            return df

        def to_samples(rows):
            # Row layout: (guid, sentence1, sentence2, gold_label).
            return [InputExample(guid=row[0],
                                 texts=[row[1], row[2]],
                                 label=self.label2int[row[3]])
                    for row in tqdm(rows)]

        def make_loader(samples):
            data = SentencesDataset(samples, model=self.model)
            return DataLoader(data, shuffle=True,
                              batch_size=self.batch_size)

        train_snli = _create_examples_mnli(read_mnli(self.train_path),
                                           'train_s')
        dev_snli = _create_examples_mnli(read_mnli(self.dev_path), 'dev_s')
        test_snli = _create_examples_mnli(read_mnli(self.test_path), 'test_s')

        # Convert the dataset to a DataLoader ready for training
        self.logger.info("Read train dataset")

        print(len(train_snli))
        train_nli_samples = to_samples(train_snli)
        dev_nli_samples = to_samples(dev_snli)
        test_nli_samples = to_samples(test_snli)

        print(len(train_nli_samples))
        self.train_dataloader_nli = make_loader(train_nli_samples)
        self.dev_dataloader_nli = make_loader(dev_nli_samples)
        self.test_dataloader_nli = make_loader(test_nli_samples)
Esempio n. 3
0
    def train(self, train_examples, dev_examples, dir_path=None):
        """Fine-tune ``self.model`` on the given examples.

        ``dir_path`` is accepted for interface compatibility; the trained
        model is not written to disk (``output_path=None``).
        """
        # Wrap the raw examples for the sentence-transformers pipeline.
        train_ds = SentencesDataset(train_examples, self.model)
        dev_ds = SentencesDataset(dev_examples, self.model)

        loader_train = DataLoader(train_ds, shuffle=True,
                                  batch_size=self.args.train_batch_size)
        loader_dev = DataLoader(dev_ds, shuffle=False,
                                batch_size=self.args.eval_batch_size)

        objective = losses.CosineSimilarityLoss(model=self.model)
        dev_evaluator = EmbeddingSimilarityEvaluator(loader_dev)

        # Warm up over ~warmup_proportion of the optimisation steps.
        total = len(train_ds) * self.args.num_train_epochs
        warmup = math.ceil(total / self.args.train_batch_size
                           * self.args.warmup_proportion)

        self.model.zero_grad()
        self.model.train()
        self.model.fit(train_objectives=[(loader_train, objective)],
                       evaluator=dev_evaluator,
                       epochs=self.args.num_train_epochs,
                       evaluation_steps=10000,
                       warmup_steps=warmup,
                       output_path=None,
                       optimizer_params={'lr': self.args.learning_rate,
                                         'eps': 1e-6,
                                         'correct_bias': False})
Esempio n. 4
0
def main(model_path, model_type, extra_dataset, triplet_margin=1):
    """Continue training a sentence transformer with triplet loss.

    Args:
        model_path: path or name of the pre-trained model to load.
        model_type: "bert" assembles a BERT + mean-pooling encoder for
            ``embedder``; anything else loads ``model_path`` directly.
        extra_dataset: directory read by ``TripletReader``; must contain
            ``train.tsv`` and ``dev.tsv``.
        triplet_margin: margin passed to ``losses.TripletLoss``.  This was
            previously an undefined free name (NameError unless a module
            global happened to exist); it is now an explicit, defaulted
            parameter, which keeps the old call signature working.
    """
    train_batch_size = 64
    num_epochs = 20
    model_save_path = model_path + '_continue_training_' + datetime.now(
    ).strftime("%Y_%m_%d_%H_%M_%S")
    n2c2_reader = TripletReader(extra_dataset)

    # NOTE(review): ``embedder`` is built but never used below; kept to
    # preserve behaviour -- confirm whether it can be removed.
    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)

        # Mean pooling over token embeddings -> one fixed-size vector.
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        embedder = SentenceTransformer(model_path)

    # Load the pre-trained sentence transformer that will be fine-tuned.
    model = SentenceTransformer(model_path)

    logging.info("Read extra training dataset")
    train_data = SentencesDataset(n2c2_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model, triplet_margin=triplet_margin)

    logging.info("Read development dataset")
    dev_data = SentencesDataset(examples=n2c2_reader.get_examples('dev.tsv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    # 10% of train data for warm-up.
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size * 0.1)
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train, evaluating roughly once per epoch.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=math.ceil(len(train_data) / train_batch_size),
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Esempio n. 5
0
    def preparing_data_fever(self):
        """Prepare FEVER SNLI-style data before training.

        Reads the train/dev/test files configured on the instance, converts
        each row into an ``InputExample`` and builds the three DataLoaders
        ``train_dataloader_nli``, ``dev_dataloader_nli`` and
        ``test_dataloader_nli`` (all shuffled, as before).
        """
        def read_fever(path):
            df = pd.read_csv(path)
            df.dropna(inplace=True)
            df.reset_index(drop=True, inplace=True)
            return df

        def to_samples(rows):
            # Row layout: (guid, sentence1, sentence2, gold_label).
            return [InputExample(guid=row[0],
                                 texts=[row[1], row[2]],
                                 label=self.label2int[row[3]])
                    for row in tqdm(rows)]

        def make_loader(samples):
            data = SentencesDataset(samples, model=self.model)
            return DataLoader(data, shuffle=True,
                              batch_size=self.batch_size)

        train_snli = _create_examples_fever(read_fever(self.train_path),
                                            'train_s')
        dev_snli = _create_examples_fever(read_fever(self.dev_path), 'dev_s')
        test_snli = _create_examples_fever(read_fever(self.test_path),
                                           'test_s')

        # Convert the dataset to a DataLoader ready for training
        self.logger.info("Read train dataset")

        self.train_dataloader_nli = make_loader(to_samples(train_snli))
        self.dev_dataloader_nli = make_loader(to_samples(dev_snli))
        self.test_dataloader_nli = make_loader(to_samples(test_snli))
Esempio n. 6
0
    def train_contrastive_model(self,
                                slang_ind,
                                params=None,
                                fold_name='default'):
        """Train a triplet-loss SBERT model on contrastive data for a fold."""
        # Fall back to the default training configuration.
        if params is None:
            params = {'train_batch_size': 16,
                      'num_epochs': 4,
                      'triplet_margin': 1,
                      'outpath': 'SBERT_contrastive'}

        # Write the contrastive CSVs for this fold.
        self.prep_contrastive_training(slang_ind, fold_name=fold_name)

        data_dir = self.out_dir + '/' + fold_name + '/SBERT_data/'
        save_path = data_dir + params['outpath']

        reader = TripletReader(data_dir,
                               s1_col_idx=0,
                               s2_col_idx=1,
                               s3_col_idx=2,
                               delimiter=',',
                               has_header=True)

        encoder = SentenceTransformer('bert-base-nli-mean-tokens')

        train_set = SentencesDataset(
            examples=reader.get_examples('contrastive_train.csv'),
            model=encoder)
        train_loader = DataLoader(train_set,
                                  shuffle=True,
                                  batch_size=params['train_batch_size'])
        objective = losses.TripletLoss(
            model=encoder, triplet_margin=params['triplet_margin'])

        dev_set = SentencesDataset(
            examples=reader.get_examples('contrastive_dev.csv'),
            model=encoder)
        dev_loader = DataLoader(dev_set,
                                shuffle=False,
                                batch_size=params['train_batch_size'])
        dev_evaluator = TripletEvaluator(dev_loader)

        # Warm up the learning rate over ~10% of the training steps.
        warmup = int(len(train_set) * params['num_epochs']
                     / params['train_batch_size'] * 0.1)

        # Train the model.
        encoder.fit(train_objectives=[(train_loader, objective)],
                    evaluator=dev_evaluator,
                    epochs=params['num_epochs'],
                    evaluation_steps=len(dev_set),
                    warmup_steps=warmup,
                    output_path=save_path)
def stratifiedkfoldtest(data):
	"""10-fold stratified CV of a CrossEncoder STS model on ``data``.

	Expects columns 'query_p', 'citation_p' and a binary 'label'.  Each fold
	re-initialises the model, fine-tunes it, scores the test split with the
	module-level ``sts_sim``, sweeps thresholds with ``f1_macro`` and prints
	per-fold and average macro-F1/accuracy.
	"""
	# Deterministic shuffle, then 10 label-stratified splits.
	data = data.sample(frac=1,random_state=1).reset_index(drop=True)
	skf = StratifiedKFold(n_splits=10)
	splits=[(x,y) for x,y in skf.split(data, data['label'])]
	f1list=[]
	acclist=[]
	import torch
	# Report GPU memory before training (debug aid).
	torch.cuda.empty_cache()
	t = torch.cuda.get_device_properties(0).total_memory
	r = torch.cuda.memory_reserved(0) 
	a = torch.cuda.memory_allocated(0)
	f = r-a  # free inside reserved
	print(f"Total:{t/1e+9}, Reserved:{r}, Allocated:{a}, Free:{f}")
	# Single-element hyperparameter grids: batch size, learning rate, epochs.
	for b in [24]:
	  for l in [2e-5]:
	    for e in [4]:
	      for train_index, test_index in splits:
	        #resetting the model for every fold
	        model=CrossEncoder('cross-encoder/stsb-roberta-base',num_labels=1)
	        #train split
	        train=data.loc[train_index]
	        #test split
	        test=data.loc[test_index]
	        #data loaders
	        train_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in train.iterrows()],model)
	        test_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in test.iterrows()],model)
	        train_=DataLoader(train_,batch_size=b)
	        test_=DataLoader(test_)
	        #loss function
	        #training
	        model.fit(train_,epochs=e,optimizer_params={'lr':l})
	        #predictions using encoder similarity
	        y=test['label']
	        dlist=list(test.apply(lambda d:(d['query_p'],d['citation_p']), axis=1))
	        yh=sts_sim(dlist,model)
	        #f1
	        # f1_macro sweeps thresholds; keep the best macro-F1 it found.
	        f1scores,thresholds=f1_macro(y,yh)
	        print(np.nan in f1scores)
	        f1=max(f1scores)
	        f1list.append(f1)
	        print(f1)
	        #accuracy
	        # Binarise the similarity scores at the best-F1 threshold.
	        mthres=thresholds[np.nanargmax(f1scores)]
	        yh1=np.zeros(len(yh))
	        yh1[yh>=mthres]=1
	        f12=metrics.f1_score(y,yh1,average='macro')
	        # NOTE(review): leftover debugger hook -- drops into pdb when the
	        # recomputed F1 disagrees with the sweep's maximum.
	        if f12!=f1:
	        	import pdb
	        	pdb.set_trace()
	        acc=metrics.accuracy_score(y, yh1)
	        print(acc)
	        acclist.append(acc)
	      print(b,l,e)
	      print("Average Macro F1 across folds:",np.mean(f1list))
	      print("Average Acc across folds:",np.mean(acclist))
def kfoldtest(data):
	"""100-fold CV of a CrossEncoder STS model on ``data``.

	Unlike ``stratifiedkfoldtest``, predictions from every fold are pooled
	first and a single threshold sweep (``f1_macro``) then computes macro-F1
	and accuracy over the pooled scores.  Relies on module-level ``sts_sim``
	and ``f1_macro``.
	"""
	# Deterministic shuffle, then 100 unstratified splits.
	data = data.sample(frac=1,random_state=1).reset_index(drop=True)
	skf = KFold(n_splits=100)
	splits=[(x,y) for x,y in skf.split(data)]
	f1list=[]
	acclist=[]
	import torch
	print(torch.cuda.is_available())
	# Single-element hyperparameter grids: batch size, learning rate, epochs.
	for b in [20]:
		for l in [2e-5]:
			for e in [4]:
				# Pooled labels / predictions accumulated across all folds.
				yh=np.array([])
				y=np.array([])
				i=0
				for train_index, test_index in splits:
					i+=1
					print(f"Fold {i}")
					#resetting the model for every fold
					model=CrossEncoder('cross-encoder/stsb-roberta-base',num_labels=1)
					#train split
					train=data.loc[train_index]
					#test split
					test=data.loc[test_index]
					#data loaders
					train_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in train.iterrows()],model)
					test_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in test.iterrows()],model)
					train_=DataLoader(train_,batch_size=b)
					test_=DataLoader(test_)
					#training
					model.fit(train_,epochs=e,optimizer_params={'lr':l})
					#predictions using cos_similarity
					y=np.append(y,test['label'])
					dlist=list(test.apply(lambda d:(d['query_p'],d['citation_p']), axis=1))
					yh=np.append(yh,sts_sim(dlist,model))
				#f1
				# Sweep thresholds over the pooled predictions.
				f1scores,thresholds=f1_macro(y,yh)
				print(np.nan in f1scores)
				f1=max(f1scores)
				f1list.append(f1)
				print(f1)
				#accuracy
				# Binarise at the best-F1 threshold.
				mthres=thresholds[np.nanargmax(f1scores)]
				yh1=np.zeros(len(yh))
				yh1[yh>=mthres]=1
				f12=metrics.f1_score(y,yh1,average='macro')
				# NOTE(review): leftover debugger hook.
				if f12!=f1:
					import pdb
					pdb.set_trace()
				acc=metrics.accuracy_score(y, yh1)
				print(acc)
				acclist.append(acc)
				print(b,l,e)
				print("BERT Fine-Tuned: Average F1 across folds:",np.mean(f1list))
				print("BERT Fine-Tuned: Average Acc across folds:",np.mean(acclist))
    def load_data(self, data_path=DATAPATH):
        """
        Load BERT-training data prepared by BertPreprocess from ``data_path``.
        The directory must contain ``product_df.csv`` and ``function_df.csv``.
        Builds shuffled training DataLoaders for both models and similarity
        evaluators over the last 5% of each dataframe.
        :param data_path: string, directory where data is
        :return:
        """
        self.df1 = pd.read_csv(data_path + "product_df.csv")
        self.df1.columns = [0, 1, 2]
        Logger.logger.info("[INFO] Datasets1 for BERT training is loaded.")

        self.df2 = pd.read_csv(data_path + "function_df.csv")
        self.df2.columns = [0, 1, 2]
        Logger.logger.info("[INFO] Datasets2 for BERT training is loaded.")

        # Training loader for the first model.
        examples1 = self.reader.get_examples(self.df1, modelname='model_1')
        self.train_data1 = SentencesDataset(examples1, model=self.model1)
        self.train_dataloader1 = DataLoader(self.train_data1, shuffle=True,
                                            batch_size=self.batch_size)
        Logger.logger.info("[INFO] First loader initialized")

        # Training loader for the second model.
        examples2 = self.reader.get_examples(self.df2, modelname='model_2')
        self.train_data2 = SentencesDataset(examples2, model=self.model2)
        self.train_dataloader2 = DataLoader(self.train_data2, shuffle=True,
                                            batch_size=self.batch_size)
        Logger.logger.info("[INFO] Second loader initialized")

        # Validation: the last 5% of df1 feeds a similarity evaluator.
        split1 = int(len(self.df1) * 0.95)
        val_examples1 = self.reader.get_examples(self.df1[split1:],
                                                 modelname='model_1')
        dev_loader1 = DataLoader(SentencesDataset(val_examples1,
                                                  model=self.model1),
                                 shuffle=False, batch_size=self.batch_size)
        self.evaluator1 = EmbeddingSimilarityEvaluator(dev_loader1)

        # Validation: the last 5% of df2 feeds a similarity evaluator.
        split2 = int(len(self.df2) * 0.95)
        val_examples2 = self.reader.get_examples(self.df2[split2:],
                                                 modelname='model_2')
        dev_loader2 = DataLoader(SentencesDataset(val_examples2,
                                                  model=self.model2),
                                 shuffle=False, batch_size=self.batch_size)
        self.evaluator2 = EmbeddingSimilarityEvaluator(dev_loader2)
def nlptrain(premodel, ver, tr_data, te_data):
    """Fine-tune a sentence transformer on STS-style data and save it.

    Args:
        premodel: name/path of the pre-trained SentenceTransformer to load.
        ver: output directory for the trained model.
        tr_data: training file name passed to the STS reader.
        te_data: evaluation file name passed to the STS reader.

    Returns:
        A one-element list with a confirmation message (return shape kept
        for existing callers).
    """
    # Print debug information to stdout.
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # Read the dataset.
    train_batch_size = 16
    num_epochs = 4
    model_save_path = ver
    sts_reader = STSDataReader('kt_datasets/kt_benchmark', normalize_scores=True)

    # Load a pre-trained sentence transformer model.
    model = SentenceTransformer(premodel)

    # Convert the dataset to a DataLoader ready for training.
    logging.info("")
    train_data = SentencesDataset(sts_reader.get_examples(tr_data), model)
    train_dataloader = DataLoader(train_data, shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("")
    dev_data = SentencesDataset(examples=sts_reader.get_examples(te_data),
                                model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False,
                                batch_size=train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # 10% of train data for warm-up.
    warmup_steps = math.ceil(len(train_data) * num_epochs
                             / train_batch_size * 0.1)
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    # Renamed from ``list`` (the original shadowed the builtin); the unused
    # ``model_name`` local was also dropped.
    messages = ['model saved in ' + ver + ' directory']

    return messages
def evaluate_language_pair(model, pair_name="cmn-eng", batch_size=32):
    """Bidirectional translation-retrieval accuracy on one Tatoeba pair.

    Embeds both sides of the parallel corpus with ``model``, then for each
    direction retrieves the nearest neighbour in the other language with
    faiss and counts exact alignments (row i should retrieve row i).

    Returns:
        list of two tuples ``(query_lang, target_lang, n_correct, n_total)``.
    """
    lang_1, lang_2 = pair_name.split("-")
    reader_1 = TatoebaReader(TATOEBA_PATH / f"tatoeba.{pair_name}.{lang_1}")
    ds_1 = SentencesDataset(reader_1.get_examples(), model=model)
    loader_1 = DataLoader(ds_1,
                          shuffle=False,
                          batch_size=batch_size,
                          collate_fn=model.smart_batching_collate)

    reader_2 = TatoebaReader(TATOEBA_PATH / f"tatoeba.{pair_name}.{lang_2}")
    ds_2 = SentencesDataset(reader_2.get_examples(), model=model)
    loader_2 = DataLoader(ds_2,
                          shuffle=False,
                          batch_size=batch_size,
                          collate_fn=model.smart_batching_collate)

    # Embed both corpora on GPU.  The ``[0][0]`` picks the first feature
    # dict out of the collated batch -- presumably single-text examples;
    # TODO confirm against smart_batching_collate's output shape.
    model.eval()
    emb_1, emb_2 = [], []
    with torch.no_grad():
        for batch in loader_1:
            emb_1.append(
                model(batch_to_device(batch,
                                      "cuda")[0][0])['sentence_embedding'])
        for batch in loader_2:
            emb_2.append(
                model(batch_to_device(batch,
                                      "cuda")[0][0])['sentence_embedding'])
    emb_1 = torch.cat(emb_1).cpu().numpy()
    emb_2 = torch.cat(emb_2).cpu().numpy()

    # Embeddings are L2-normalised in place before being added, so nearest
    # neighbour under L2 matches highest cosine similarity.
    idx_1 = faiss.IndexFlatL2(emb_1.shape[1])
    faiss.normalize_L2(emb_1)
    idx_1.add(emb_1)
    idx_2 = faiss.IndexFlatL2(emb_2.shape[1])
    faiss.normalize_L2(emb_2)
    idx_2.add(emb_2)

    # A hit is when sentence i retrieves its aligned sentence i (k=1).
    results = []
    _, match = idx_2.search(x=emb_1, k=1)
    results.append((lang_1, lang_2, np.sum(match[:,
                                                 0] == np.arange(len(emb_1))),
                    len(emb_1)))
    _, match = idx_1.search(x=emb_2, k=1)
    results.append((lang_2, lang_1, np.sum(match[:,
                                                 0] == np.arange(len(emb_2))),
                    len(emb_2)))
    return results
Esempio n. 12
0
def train_sbert(model_name, model_save_path):
    """Train SBERT on AllNLI with softmax loss, then test on STSbenchmark.

    The trained model is saved to ``model_save_path``, reloaded from disk
    and evaluated on the STS test split.
    """
    batch_size = 16
    num_epochs = 1

    nli_reader, sts_reader = load_dataset()
    num_labels = nli_reader.get_num_labels()

    # BERT encoder followed by mean pooling over token embeddings.
    encoder = models.BERT(model_name)
    pooler = models.Pooling(encoder.get_word_embedding_dimension(),
                            pooling_mode_mean_tokens=True,
                            pooling_mode_cls_token=False,
                            pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[encoder, pooler])

    # Convert the dataset to a DataLoader ready for training.
    logging.info("Read AllNLI train dataset")
    train_set = SentencesDataset(nli_reader.get_examples('train.gz'),
                                 model=model)
    train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
    objective = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_set = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                               model=model)
    dev_loader = DataLoader(dev_set, shuffle=False, batch_size=batch_size)
    dev_evaluator = EmbeddingSimilarityEvaluator(dev_loader)

    # 10% of the batches per epoch as learning-rate warm-up.
    warmup = math.ceil(len(train_loader) * num_epochs * 0.1)
    logging.info("Warmup-steps: {}".format(warmup))

    # Train the model.
    model.fit(train_objectives=[(train_loader, objective)],
              evaluator=dev_evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup,
              output_path=model_save_path)

    # Reload the saved model and score it on the test split.
    model = SentenceTransformer(model_save_path)
    test_set = SentencesDataset(
        examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size)
    model.evaluate(EmbeddingSimilarityEvaluator(test_loader))
def create_posts_ranking(fl, data_dir, model, validate=None, is_test=False):
    """Build training objects for the answer-ranking objective.

    Reads Q/A threads from the JSON file ``fl`` under ``data_dir`` and turns
    each kept answer into an ``InputExample`` whose label is its normalised
    rank within the thread.  Relies on the module-level globals ``max_size``,
    ``posts_rank_str``, ``num_epochs``, ``batch_size`` and
    ``evaluation_steps``.

    Returns:
        The raw example list when ``is_test`` is true, otherwise the tuple
        ``(train_dataloader, train_loss, evaluator, warmup_steps)``.
    """
    train_posts_ranking = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        for obj in data:
            answers = obj['answers']
            filtered_answers = []
            votes = 1000000
            # Keep only answers whose vote count strictly decreases in file
            # order -- presumably the answers arrive sorted by votes, so
            # this drops ties; verify against the data producer.
            for answer in answers:
                my_votes = answer['a_votes']
                if my_votes < votes:
                    votes = my_votes
                    filtered_answers.append(answer)

            if len(filtered_answers) > 1:
                # Label = rank / n: the first kept answer gets 1.0 and each
                # later answer a proportionally smaller similarity target.
                rank = len(filtered_answers)
                for answer in filtered_answers:
                    dist = rank / len(filtered_answers)
                    disbn.append(answer['a_rank'])
                    rank = rank - 1
                    train_posts_ranking.append(
                        InputExample(texts=[obj['q_text'], answer['a_text']],
                                     label=dist))

    random.shuffle(train_posts_ranking)

    print("data size " + str(len(train_posts_ranking)))

    # Test mode returns the raw examples without building loaders.
    if is_test:
        return train_posts_ranking

    if max_size:
        train_posts_ranking = train_posts_ranking[:max_size]

    # Only build an evaluator when this objective is the validation target.
    evaluator = None
    if posts_rank_str == validate:
        train_posts_ranking, dev_posts_ranking = train_test_split(
            train_posts_ranking, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_posts_ranking, name='posts ranking')

    warmup_steps = math.ceil(
        len(train_posts_ranking) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_posts_ranking = SentencesDataset(train_posts_ranking,
                                                model=model)
    train_dataloader_posts_ranking = DataLoader(train_data_posts_ranking,
                                                shuffle=True,
                                                batch_size=batch_size)
    train_loss_posts_ranking = losses.CosineSimilarityLoss(model=model)

    print('R: Number of training examples: ', len(train_posts_ranking))

    global evaluation_steps
    # NOTE(review): dividing by 0.1 yields 10x the example count, not 10% of
    # it -- was ``* 0.1`` (or ``/ batch_size``) intended?  Confirm.
    evaluation_steps = math.ceil(len(train_posts_ranking) / 0.1)

    return train_dataloader_posts_ranking, train_loss_posts_ranking, evaluator, warmup_steps
Esempio n. 14
0
def dev_config(sts_reader, model, batch_size):
    """Build the STSbenchmark dev dataloader, loss and evaluator.

    Returns a ``(dev_loss, dev_dataloader, dev_evaluator)`` tuple.
    """
    logger.info(f"Read STSbenchmark dev dataset")
    examples = sts_reader.get_examples('sts-dev.csv')
    dataset = SentencesDataset(examples=examples, model=model)
    loader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
    loss = losses.CosineSimilarityLoss(model=model)
    evaluator = EmbeddingSimilarityEvaluator(loader)
    return loss, loader, evaluator
Esempio n. 15
0
def train_config(sts_reader, model, batch_size):
    """Build the STSbenchmark train dataset, loss, dataloader and evaluator.

    Returns ``(train_data, train_loss, train_dataloader, train_evaluator)``.
    """
    logger.info(f"Read STSbenchmark train dataset")
    dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    loss = losses.CosineSimilarityLoss(model=model)
    evaluator = EmbeddingSimilarityEvaluator(loader)
    return dataset, loss, loader, evaluator
Esempio n. 16
0
def fit(model,
        data_pars=None,
        model_pars=None,
        compute_pars=None,
        out_pars=None,
        *args,
        **kw):
    """Fine-tune ``model.model`` on the dataset configured in ``data_pars``.

    Loss, epochs and step counts come from ``compute_pars``; the output
    directory from ``out_pars``.  Fit metrics are stored on
    ``model.fit_metrics`` and ``(model, None)`` is returned.
    """
    log("############ Dataloader setup  #############################")
    (train_reader, val_reader), interal_states = get_dataset(data_pars)

    batch = compute_pars["batch_size"]
    train_loader = DataLoader(
        SentencesDataset(train_reader.get_examples('train.gz'),
                         model=model.model),
        shuffle=True, batch_size=batch)
    val_loader = DataLoader(
        SentencesDataset(val_reader.get_examples('val/sts-dev.csv'),
                         model=model.model),
        shuffle=True, batch_size=batch)

    log("############ Fit setup  ##################################")
    # The loss class is chosen by name from compute_pars (e.g. "SoftmaxLoss").
    loss_cls = getattr(losses, compute_pars["loss"])
    objective = loss_cls(
        model=model.model,
        sentence_embedding_dimension=model.model
        .get_sentence_embedding_dimension(),
        num_labels=train_reader.get_num_labels())
    objective.float()
    evaluator = EmbeddingSimilarityEvaluator(val_loader)
    model.model.float()

    model.fit_metrics = model.model.fit(
        train_objectives=[(train_loader, objective)],
        evaluator=evaluator,
        epochs=compute_pars["num_epochs"],
        evaluation_steps=compute_pars["evaluation_steps"],
        warmup_steps=compute_pars["warmup_steps"],
        output_path=out_pars["model_path"])
    return model, None
def create_hirerachy_examples(fl,
                              data_dir,
                              model,
                              validate=None,
                              is_test=False):
    """Build class-hierarchy training examples from a JSON distance file.

    Similarity labels are inverted distances: the farther apart two
    classes sit in the hierarchy, the closer the label is to 0.
    Relies on the module-level ``max_size``, ``hierarchy_str``,
    ``num_epochs`` and ``batch_size`` settings, and writes the
    module-level ``evaluation_steps``.
    """
    with open(os.path.join(data_dir, fl)) as handle:
        records = json.load(handle)

    max_distance = max((rec['distance'] for rec in records), default=0)

    samples, distances = [], []
    for rec in records:
        # flip the meaning of similarity, since the more distant the two classes, the closer to 0 it should be
        label = (max_distance - rec['distance']) / (max_distance - 1)
        samples.append(
            InputExample(texts=[rec['class1'], rec['class2']], label=label))
        distances.append(rec['distance'])

    # NOTE(review): only the examples are shuffled; `distances` stays in
    # file order, so the stratify argument below may be misaligned — confirm.
    random.shuffle(samples)
    samples, distances = samples[:100000], distances[:100000]

    if max_size:
        samples, distances = samples[:max_size], distances[:max_size]

    if is_test:
        return samples

    evaluator = None

    if hierarchy_str == validate:
        samples, held_out = train_test_split(
            samples, stratify=distances, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            held_out, name='hierarchy')

    # 10% of train data for warm-up
    warmup_steps = math.ceil(len(samples) * num_epochs / batch_size * 0.1)

    dataset = SentencesDataset(samples, model=model)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    loss = losses.CosineSimilarityLoss(model=model)

    print('H: Number of training examples: ', len(samples))

    global evaluation_steps
    # NOTE(review): dividing by 0.1 multiplies the length by 10 — confirm
    # this is intended rather than `* 0.1` (same pattern elsewhere in file).
    evaluation_steps = math.ceil(len(samples) / 0.1)
    return dataloader, loss, evaluator, warmup_steps
def finetune_sbert(model, df, rep_sents, finetune_cfg):
    """Finetune the Sentence-BERT."""
    # configuration
    target_size = finetune_cfg.get("train_size", 200000)
    pair_cap = finetune_cfg.get("sample_per_pair", 5)
    batch_size = finetune_cfg.get("train_batch_size", 32)
    n_epochs = finetune_cfg.get("epochs", 1)

    examples = []
    n_collected = 0
    label_counts = [0, 0]  # index 0 = negative pairs, 1 = positive pairs
    per_label_cap = target_size // 2
    genre_sets = df.genres.apply(set)

    with tqdm(total=target_size, position=0) as bar:
        # Draw random row pairs; label is 1 when the two share a genre.
        while n_collected < target_size:
            a, b = np.random.randint(0, len(df), 2)
            label = int(bool(set.intersection(genre_sets[a], genre_sets[b])))

            if label_counts[label] > per_label_cap:
                continue

            # All cross-product sentence pairs between the two items.
            pairs = np.stack(np.meshgrid(rep_sents[a],
                                         rep_sents[b])).T.reshape(-1, 2)
            if len(pairs) <= pair_cap:
                chosen = pairs
            else:
                idx = np.random.choice(pairs.shape[0],
                                       pair_cap,
                                       replace=False)
                chosen = pairs[idx]

            batch = [InputExample(texts=list(p), label=label) for p in chosen]
            examples.extend(batch)

            n_collected += len(batch)
            label_counts[label] += len(batch)
            bar.update(len(batch))

        # finetune on the sampled pairs
        ds = SentencesDataset(examples, model)
        objective = (
            DataLoader(ds, shuffle=True, batch_size=batch_size),
            losses.ContrastiveLoss(model=model),
        )
        model.fit(train_objectives=[objective],
                  epochs=n_epochs,
                  warmup_steps=100)
        os.makedirs("model/clustering/sbert", exist_ok=True)
        model.save("model/clustering/sbert")
Esempio n. 19
0
    def pretrained_model_score(self, model_name, expected_score):
        """Load a published model, score it on the STS test split, and
        assert the result is within 0.1 points of the expected value."""
        transformer = SentenceTransformer(model_name)
        reader = STSDataReader('../examples/datasets/stsbenchmark')

        dataset = SentencesDataset(
            examples=reader.get_examples("sts-test.csv"), model=transformer)
        loader = DataLoader(dataset, shuffle=False, batch_size=8)
        sts_evaluator = EmbeddingSimilarityEvaluator(loader)

        score = transformer.evaluate(sts_evaluator) * 100
        print(model_name,
              "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
        assert abs(score - expected_score) < 0.1
def main():
    """Evaluate the pretrained NLI mean-tokens model on the STS test set."""
    sbert = SentenceTransformer('bert-base-nli-mean-tokens')

    reader = STSDataReader('datasets/stsbenchmark')

    eval_data = SentencesDataset(
        examples=reader.get_examples('sts-test.csv'),
        model=sbert,
        dataset_cache_id='sts-eval')
    eval_loader = DataLoader(eval_data, shuffle=False, batch_size=16)
    sts_evaluator = EmbeddingSimilarityEvaluator(eval_loader)

    sbert.evaluate(sts_evaluator)
Esempio n. 21
0
    def run(
        self,
        training_data,
        evaluator,
        output_path,
        from_scratch=False,
        loss=SentenceTransformerLoss.cosine_similarity_loss,
        model_name_or_path="roberta-large-nli-stsb-mean-tokens",
        cuda=True,
        **kwargs,
    ):
        """Fine-tune a SentenceTransformer on sentence-pair training data.

        Args:
            training_data: mapping whose values are dicts with
                "sentence1", "sentence2" and "label" keys.
            evaluator: sentence-transformers evaluator run during fit.
            output_path: directory where the trained model is saved.
            from_scratch: build a fresh transformer + mean-pooling model
                instead of loading a pretrained SentenceTransformer.
            loss: SentenceTransformerLoss member; its ``value`` is called
                with the model to construct the training loss.
            model_name_or_path: HF model name or local checkpoint path.
            cuda: move the model to the GPU when True.
            **kwargs: "max_seq_length", "shuffle", "batch_size",
                "num_epochs", "evaluation_steps".
        """
        logger.info(
            f"Running Sentence Transformer Task: {model_name_or_path}, Output path: {output_path}"
        )
        if from_scratch:
            logger.info("Training from scratch")
            # BUG FIX: the original built the Transformer module but never
            # assigned `model`, so every from_scratch=True run crashed with
            # UnboundLocalError below. Wrap the word-embedding module with
            # mean pooling into a usable SentenceTransformer.
            word_embedding_model = models.Transformer(
                model_name_or_path,
                max_seq_length=kwargs.get("max_seq_length", 128))
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension())
            model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])
        else:
            model = SentenceTransformer(model_name_or_path)
        if cuda:
            logger.info("Running model on GPU")
            model.cuda()

        train_examples = [
            InputExample(texts=[data["sentence1"], data["sentence2"]],
                         label=data["label"])
            for data in training_data.values()
        ]
        train_dataset = SentencesDataset(train_examples, model)
        batch_size = kwargs.get("batch_size", 4)
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=kwargs.get("shuffle", True),
            batch_size=batch_size,
        )
        # BUG FIX: warm-up previously read kwargs["train_batch_size"] while
        # the dataloader read kwargs["batch_size"]; use the same key so the
        # 10% warm-up estimate matches the actual step count.
        warmup_steps = math.ceil(
            len(train_examples) * kwargs.get("num_epochs", 3) /
            batch_size * 0.1)  # 10% of train data for warm-up
        train_loss = loss.value(model)
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=kwargs.get("num_epochs", 3),
            evaluation_steps=kwargs.get("evaluation_steps", 500),
            warmup_steps=warmup_steps,
            output_path=output_path,
            evaluator=evaluator,
        )
Esempio n. 22
0
def cli_main():
    """Train a multilingual retrieval model on LCCC dialog data and
    evaluate it with an STC information-retrieval evaluator."""
    # Multilingual checkpoints referenced in the upstream issues:
    #   xlm-r-40langs-bert-base-nli-stsb-mean-tokens (general purpose)
    #   distilbert-multilingual-nli-stsb-quora-ranking (retrieval tasks)
    sbert = SentenceTransformer(
        'distilbert-multilingual-nli-stsb-quora-ranking')

    num_epochs = 10
    train_batch_size = 64
    model_save_path = os.path.join(
        cur_dir, 'output/training_MultipleNegativesRankingLoss-' +
        datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    os.makedirs(model_save_path, exist_ok=True)

    colab_dir = "/content/drive/My Drive/data/nlp"
    train_samples = get_data(os.path.join(colab_dir, "LCCC-large.json"))

    # Wrap the training samples into a dataset/dataloader pair.
    train_dataset = SentencesDataset(train_samples, model=sbert)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(sbert)

    ###### Duplicate Questions Information Retrieval ######
    max_ir_num = 5000
    max_corpus_size = 100000
    ir_queries, ir_corpus, ir_relevant_docs = get_iq_corpus(
        os.path.join(colab_dir, "STC.json"), max_ir_num, max_corpus_size)

    evaluators = [
        evaluation.InformationRetrievalEvaluator(ir_queries, ir_corpus,
                                                 ir_relevant_docs)
    ]
    # Overall score is the last evaluator's score.
    seq_evaluator = evaluation.SequentialEvaluator(
        evaluators, main_score_function=lambda scores: scores[-1])

    logging.info("Evaluate model without training")
    seq_evaluator(sbert, epoch=0, steps=0, output_path=model_save_path)

    # Train the model
    sbert.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=seq_evaluator,
              epochs=num_epochs,
              warmup_steps=1000,
              output_path=model_save_path,
              output_path_ignore_not_empty=True)
def create_linked_posts(fl, data_dir, model, validate=None, is_test=False):
    """Build linked-post training examples from a JSON file of text pairs.

    Pairs whose class is 'relevant' get label 1, all others label 0.
    Relies on the module-level ``max_size``, ``linked_posts_str``,
    ``num_epochs`` and ``batch_size`` settings, and writes the
    module-level ``evaluation_steps``.
    """
    examples = []
    labels = []

    with open(os.path.join(data_dir, fl)) as handle:
        for obj in json.load(handle):
            label = 1 if obj['class'] == 'relevant' else 0
            labels.append(label)

            examples.append(
                InputExample(texts=[obj['text_1'], obj['text_2']],
                             label=label))

    # NOTE(review): only the examples are shuffled; `labels` stays in file
    # order, so the stratify argument below may be misaligned — confirm.
    random.shuffle(examples)

    if is_test:
        return examples

    if max_size:
        examples = examples[:max_size]

    evaluator = None
    if linked_posts_str == validate:
        examples, held_out = train_test_split(
            examples, stratify=labels, test_size=0.1)
        evaluator = BinaryClassificationEvaluator.from_input_examples(
            held_out, name='linked-posts')

    # 10% of train data for warm-up
    warmup_steps = math.ceil(len(examples) * num_epochs / batch_size * 0.1)

    dataset = SentencesDataset(examples, model=model)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    loss = losses.ContrastiveLoss(model=model)

    print('L: Number of training examples: ', len(examples))

    global evaluation_steps
    evaluation_steps = math.ceil(len(examples) / 0.1)

    return dataloader, loss, evaluator, warmup_steps
def create_train_usage(fl, data_dir, model, validate=None, is_test=False):
    """Build usage-similarity training examples from a JSON distance file.

    Distances are min-max normalised and inverted so the closest class
    pairs get labels near 1 and the farthest near 0. Relies on the
    module-level ``max_size``, ``usage_str``, ``num_epochs`` and
    ``batch_size`` settings, and writes the module-level
    ``evaluation_steps``.

    Returns the raw examples when ``is_test`` is True, otherwise a
    ``(dataloader, loss, evaluator, warmup_steps)`` tuple.
    """
    train_usage = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        # Min-max range of the raw distances.
        min_d = min((obj['distance'] for obj in data), default=0)
        max_d = max((obj['distance'] for obj in data), default=0)
        # ROBUSTNESS: the original divided by (max_d - min_d) unguarded and
        # crashed with ZeroDivisionError when all distances are equal.
        span = (max_d - min_d) or 1
        for obj in data:
            # Invert: maximum distance maps to 0, minimum distance to 1.
            dist = (max_d - obj['distance']) / span
            train_usage.append(
                InputExample(texts=[obj['class1'], obj['class2']], label=dist))

    random.shuffle(train_usage)

    if is_test:
        return train_usage

    if max_size:
        train_usage = train_usage[:max_size]

    evaluator = None

    if usage_str == validate:
        train_usage, dev_usage = train_test_split(train_usage, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_usage, name='usage')
    warmup_steps = math.ceil(len(train_usage) * num_epochs / batch_size *
                             0.1)  # 10% of train data for warm-up

    train_data_usage = SentencesDataset(train_usage, model=model)
    train_dataloader_usage = DataLoader(train_data_usage,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_usage = losses.CosineSimilarityLoss(model=model)

    print('U: Number of training examples: ', len(train_usage))

    global evaluation_steps
    # NOTE(review): dividing by 0.1 multiplies the length by 10 — confirm
    # this is intended rather than `* 0.1` (same pattern elsewhere in file).
    evaluation_steps = math.ceil(len(train_usage) / 0.1)

    return train_dataloader_usage, train_loss_usage, evaluator, warmup_steps
    def test_train_stsb(self):
        """Smoke-train a DistilBERT bi-encoder on the STSb train samples
        and check the resulting test score."""
        encoder = models.Transformer('distilbert-base-uncased')
        pooler = models.Pooling(encoder.get_word_embedding_dimension())
        sbert = SentenceTransformer(modules=[encoder, pooler])

        dataset = SentencesDataset(self.stsb_train_samples, sbert)
        loader = DataLoader(dataset, shuffle=True, batch_size=16)
        objective = losses.CosineSimilarityLoss(model=sbert)

        # One epoch with mixed precision; warm up over 10% of the steps.
        sbert.fit(train_objectives=[(loader, objective)],
                  evaluator=None,
                  epochs=1,
                  evaluation_steps=1000,
                  warmup_steps=int(len(loader) * 0.1),
                  use_amp=True)

        self.evaluate_stsb_test(sbert, 80.0)
Esempio n. 26
0
def fine_tune(cfg):
    """
    Function to finetune a model with Infodemic-specific data.

    :param cfg: configuration dictionary
    :return: none
    """
    sbert = SentenceTransformer(cfg['model'])
    # Data loading is still a placeholder: train_samples is None, so the
    # SentencesDataset call below fails until real examples are wired in.
    # Format reference (lines 48-62):
    # https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark_continue_training.py
    train_samples = None
    dataset = SentencesDataset(train_samples, sbert)
    loader = DataLoader(dataset)
    objective = losses.SoftmaxLoss(sbert, num_labels=3)

    # Evaluator catalogue:
    # https://github.com/UKPLab/sentence-transformers/tree/master/sentence_transformers/evaluation
    evaluator = None
    sbert.fit(train_objectives=[(loader, objective)],
              evaluator=evaluator,
              epochs=30,
              evaluation_steps=1000,
              output_path=cfg['model_output'])
Esempio n. 27
0
File: eval.py Progetto: BenfenYU/Gun
def test_self():
    """Evaluate a trained NLI checkpoint on the self dataset with a
    label-accuracy evaluator."""
    reader = Self_csv_DataReader('./self_dataset/')

    # Pick the most recently modified checkpoint under ./output ...
    out_root = './output'
    checkpoints = os.listdir(out_root)
    checkpoints.sort(key=lambda name: os.path.getmtime(out_root + '/' + name))
    model_save_path = os.path.join(out_root, checkpoints[-1])
    # NOTE(review): ... but this hardcoded path immediately overrides the
    # lookup above, making it dead code — confirm whether the pin is intended.
    model_save_path = './output/training_nli_.-pretrained_model-bert-base-chinese-2020-07-30_15-59-13'

    sbert = SentenceTransformer(model_save_path)
    examples, label_text = reader.get_examples("test.csv", _eval=True)
    dataset = SentencesDataset(examples=examples, model=sbert)
    loader = DataLoader(dataset,
                        shuffle=False,
                        batch_size=config.train_batch_size)
    evaluator = LabelAccuracyEvaluator(
        loader,
        softmax_model=Softmax_label(
            model=sbert,
            sentence_embedding_dimension=sbert.get_sentence_embedding_dimension(),
            num_labels=config.train_num_labels),
        label_text=label_text)

    sbert.evaluate(evaluator, output_path=model_save_path)
Esempio n. 28
0
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
# Two dense layers of equal width stacked on top of the pooled embedding.
dan1 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)

# Pipeline: token embeddings -> word weighting -> pooling -> DAN layers.
# (word_embedding_model, word_weights, sts_reader and batch_size are defined
# earlier in the original script, outside this excerpt.)
model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
Esempio n. 29
0
def main(args):
    """Train a sentence-similarity model on a gzipped TSV of scored pairs.

    Args (argparse namespace):
        args.pretrained: optional name/path of a SentenceTransformer to
            continue training; when absent a fresh Japanese BERT +
            mean-pooling model is built.
        args.save_path: directory for timestamped output checkpoints.
        args.data_path: gzipped TSV with 'sentence1', 'sentence2', 'score'.
    """
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout
    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        # Continue training an existing SentenceTransformer checkpoint.
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        # You can specify any huggingface/transformers pre-trained model here,
        # for example bert-base-uncased, roberta-base, xlm-roberta-base.
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet,
        # XLM-R) for mapping tokens to embeddings.
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector.
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training.
    # (Removed dead `train_samples = []` / `val_samples = []` assignments
    # and commented-out ipdb debug lines from the original.)
    logging.info("Read custom train dataset")

    inp_list = []
    with gzip.open(args.data_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Normalize score to range 0 ... 1 (assumes raw scores are on a
            # 0-10 scale — confirm against the dataset).
            score = float(row['score']) / 10
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training. We skip periodic evaluation in this example.
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Esempio n. 30
0
#################################################################################################
#
# Step 3: Train bi-encoder model with both (gold + silver) STSbenchmark dataset - Augmented SBERT
#
#################################################################################################

logging.info(
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".
    format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
# Silver examples: sentence pairs labelled with scores produced in an
# earlier step of the script (silver_data / silver_scores, gold_samples,
# bi_encoder, num_epochs, batch_size and dev_samples are defined outside
# this excerpt).
silver_samples = list(InputExample(texts=[data[0], data[1]], label=score) for \
    data, score in zip(silver_data, silver_scores))

# Train on the union of human-labelled gold and generated silver pairs.
train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model