def test_train_stsb(self):
    # Fine-tune a fresh cross-encoder regression head on the STSb training samples.
    model = CrossEncoder('distilroberta-base', num_labels=1)
    train_dataloader = DataLoader(self.stsb_train_samples, shuffle=True, batch_size=16)
    model.fit(train_dataloader=train_dataloader,
              epochs=1,
              warmup_steps=int(len(train_dataloader) * 0.1))  # warm up over the first 10% of steps
    self.evaluate_stsb_test(model, 75)  # expect a test score of at least 75
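# `self.stsb_train_samples` and `evaluate_stsb_test` belong to the surrounding test
# class, which is not shown in this snippet. A minimal sketch of the evaluation
# helper, assuming held-out examples in `self.stsb_test_samples` (the attribute
# name and the exact scoring convention are assumptions, not original code):
def evaluate_stsb_test(self, model, expected_score):
    from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
    evaluator = CECorrelationEvaluator.from_input_examples(self.stsb_test_samples, name='stsb-test')
    score = evaluator(model) * 100  # Spearman correlation, scaled to 0-100
    print(f"STSb test score: {score:.2f} (expected > {expected_score})")
    assert score > expected_score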
def stratifiedkfoldtest(data):
    # Shuffle once with a fixed seed so folds are reproducible, then build 10 stratified splits.
    data = data.sample(frac=1, random_state=1).reset_index(drop=True)
    skf = StratifiedKFold(n_splits=10)
    splits = list(skf.split(data, data['label']))
    f1list = []
    acclist = []

    import torch
    torch.cuda.empty_cache()
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    f = r - a  # free memory inside the reserved pool
    print(f"Total: {t/1e9:.2f} GB, Reserved: {r/1e9:.2f} GB, Allocated: {a/1e9:.2f} GB, Free: {f/1e9:.2f} GB")

    # Single-point hyperparameter grid; extend the lists to search over more values.
    for b in [24]:        # batch size
        for l in [2e-5]:  # learning rate
            for e in [4]:  # epochs
                for train_index, test_index in splits:
                    # Re-initialise the model for every fold so folds don't leak into each other.
                    model = CrossEncoder('cross-encoder/stsb-roberta-base', num_labels=1)
                    train = data.loc[train_index]
                    test = data.loc[test_index]
                    # Training data loader over (query, citation) pairs with integer labels.
                    train_ = SentencesDataset([InputExample(texts=[d['query_p'], d['citation_p']], label=int(d['label'])) for _, d in train.iterrows()], model)
                    train_ = DataLoader(train_, batch_size=b)
                    # Training
                    model.fit(train_, epochs=e, optimizer_params={'lr': l})
                    # Predictions using cross-encoder similarity scores
                    y = test['label']
                    dlist = list(test.apply(lambda d: (d['query_p'], d['citation_p']), axis=1))
                    yh = sts_sim(dlist, model)
                    # Macro F1 over a sweep of candidate thresholds
                    f1scores, thresholds = f1_macro(y, yh)
                    print("NaNs in F1 sweep:", np.isnan(f1scores).any())
                    f1 = np.nanmax(f1scores)
                    f1list.append(f1)
                    print(f1)
                    # Accuracy at the threshold that maximises macro F1
                    mthres = thresholds[np.nanargmax(f1scores)]
                    yh1 = np.zeros(len(yh))
                    yh1[yh >= mthres] = 1
                    # Sanity check: recomputing F1 at the chosen threshold should match the sweep.
                    f12 = metrics.f1_score(y, yh1, average='macro')
                    if not np.isclose(f12, f1):
                        import pdb; pdb.set_trace()
                    acc = metrics.accuracy_score(y, yh1)
                    print(acc)
                    acclist.append(acc)
                print(b, l, e)
                print("Average Macro F1 across folds:", np.mean(f1list))
                print("Average Acc across folds:", np.mean(acclist))
def kfoldtest(data):
    # Shuffle once with a fixed seed, then build 100 unstratified splits.
    data = data.sample(frac=1, random_state=1).reset_index(drop=True)
    skf = KFold(n_splits=100)
    splits = list(skf.split(data))
    f1list = []
    acclist = []

    import torch
    print(torch.cuda.is_available())

    # Single-point hyperparameter grid; extend the lists to search over more values.
    for b in [20]:        # batch size
        for l in [2e-5]:  # learning rate
            for e in [4]:  # epochs
                # Unlike the stratified variant, predictions are pooled across all
                # folds and the metrics are computed once over the pooled scores.
                yh = np.array([])
                y = np.array([])
                for i, (train_index, test_index) in enumerate(splits, start=1):
                    print(f"Fold {i}")
                    # Re-initialise the model for every fold.
                    model = CrossEncoder('cross-encoder/stsb-roberta-base', num_labels=1)
                    train = data.loc[train_index]
                    test = data.loc[test_index]
                    # Training data loader over (query, citation) pairs with integer labels.
                    train_ = SentencesDataset([InputExample(texts=[d['query_p'], d['citation_p']], label=int(d['label'])) for _, d in train.iterrows()], model)
                    train_ = DataLoader(train_, batch_size=b)
                    # Training
                    model.fit(train_, epochs=e, optimizer_params={'lr': l})
                    # Pool labels and cross-encoder similarity scores across folds
                    y = np.append(y, test['label'])
                    dlist = list(test.apply(lambda d: (d['query_p'], d['citation_p']), axis=1))
                    yh = np.append(yh, sts_sim(dlist, model))
                # Macro F1 over a sweep of thresholds on the pooled predictions
                f1scores, thresholds = f1_macro(y, yh)
                print("NaNs in F1 sweep:", np.isnan(f1scores).any())
                f1 = np.nanmax(f1scores)
                f1list.append(f1)
                print(f1)
                # Accuracy at the threshold that maximises macro F1
                mthres = thresholds[np.nanargmax(f1scores)]
                yh1 = np.zeros(len(yh))
                yh1[yh >= mthres] = 1
                # Sanity check: recomputing F1 at the chosen threshold should match the sweep.
                f12 = metrics.f1_score(y, yh1, average='macro')
                if not np.isclose(f12, f1):
                    import pdb; pdb.set_trace()
                acc = metrics.accuracy_score(y, yh1)
                print(acc)
                acclist.append(acc)
                print(b, l, e)
                print("BERT Fine-Tuned: Average F1 across folds:", np.mean(f1list))
                print("BERT Fine-Tuned: Average Acc across folds:", np.mean(acclist))
examples = []
for topic in topics:
    gold = qrel[topic["number"]].items()
    query = topic["title"].strip()

    for item in gold:
        try:
            # item is (doc_id, relevance_label); skip ids missing from the db.
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except Exception:
            continue
print("finished", len(examples))

#%%
import pickle
from torch.utils.data import DataLoader
from sentence_transformers import SentencesDataset, losses

train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
# Note: this loss is instantiated but never passed to fit() below, so the
# default loss is what is actually used during training.
train_loss = losses.OnlineContrastiveLoss(model=ranker)

ranker.fit(train_dataloader=train_dl,
           epochs=20,
           output_path="ranker/contrastive_loss/",
           save_best_model=True)  # 'best' is only defined when an evaluator is supplied

pickle.dump(
    ranker,
    open("ranker/contrastive_loss/ranker_contrastive_loss_20_epochs.pkl",
         "wb"))

#%%

test_sentence_pairs = []
test_labels = []
for line in test_data:
    # Expected format per line: sentence1<TAB>sentence2<TAB>label
    pair = line.strip('\n').split('\t')
    try:
        sentences = [pair[0], pair[1]]
        label = int(pair[2])
    except (IndexError, ValueError):
        # Skip malformed lines (missing fields or a non-integer label).
        continue
    test_sentence_pairs.append(sentences)
    test_labels.append(label)

from torch import nn
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
evaluator = CEBinaryClassificationEvaluator(test_sentence_pairs, test_labels)

roberta.fit(train_dataloader=train_dataloader,
            evaluator=evaluator,
            epochs=5,
            loss_fct=nn.BCEWithLogitsLoss(),
            output_path='./ssce_save/ssce/')
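# The evaluator can also be run on its own after training; in recent
# sentence-transformers versions CEBinaryClassificationEvaluator returns average
# precision on the held-out pairs (worth verifying for your installed version):
score = evaluator(roberta)
print(f"CEBinaryClassificationEvaluator score: {score:.4f}")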
examples = []
for topic in topics:
    gold = qrel[topic["number"]].items()
    query = topic["title"].strip()

    for item in gold:
        try:
            # item is (doc_id, relevance_label); skip ids missing from the db.
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except Exception:
            continue
print("finished")

from torch.utils.data import DataLoader
train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
ranker.fit(train_dataloader=train_dl, epochs=5, output_path="ranker/base")

from tqdm.notebook import tqdm

run = {}
for topic in tqdm(topics):
    number = topic["number"]
    query = topic["title"]

    extracted_ids = list(qrel[number].keys())

    # Keep only ids that can actually be resolved in the document store.
    doc_ids = []
    for doc_id in extracted_ids:
        try:
            db.lookup_docno(doc_id)
            doc_ids.append(doc_id)
        except Exception:
            continue
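    # --- Hedged sketch (not in the original, which is cut off here) ---
    # Assuming `ranker` is a CrossEncoder, one plausible continuation scores each
    # (query, doc) pair and stores the per-document scores for this topic in `run`.
    pairs = [(query, db.lookup_docno(doc_id)) for doc_id in doc_ids]
    scores = ranker.predict(pairs)  # one relevance score per pair
    run[number] = {doc_id: float(score) for doc_id, score in zip(doc_ids, scores)}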