Example #1
    def __getitem__(self, index) -> InputExample:
        """
            0.5: match
               0.4: word - sentence
                  0.: eng - eng: 100000+
                  0.: thai - thai: 37706
                  0.: thai - eng: 93045
                  0.: eng - thai: 6310
               0.05: sentence - sentence: 6310 - match word from thai to eng then pick random sentences
               0.05: word - word: 80508
            0.5: not match
               0.: eng-eng
               0.: thai-thai
               0.: both
        """
        tha, eng = self.words[self.indices[index]]
        if np.random.rand() > 0.6 or self.true_only:
            out = InputExample(texts=[eng, tha], label=0.8)
        else:
            while True:
                # .item() converts the one-element tensor into a plain int index
                idx = torch.randint(0, len(self), (1, )).item()
                other_tha, _ = self.words[self.indices[idx]]
                if other_tha != tha:
                    break
            out = InputExample(texts=[eng, other_tha], label=0.2)
        return out
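A dataset like this drops straight into the standard sentence-transformers training loop. A minimal sketch, assuming the class above is named `ThaiEngWordDataset` and takes the word-pair list in its constructor (both names are placeholders; `DataLoader`, `losses.CosineSimilarityLoss`, and `model.fit` are the standard API):

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
dataset = ThaiEngWordDataset(word_pairs)  # hypothetical class and argument
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)  # the 0.8 / 0.2 labels act as similarity targets
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)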
Example #2
    def test_multiclass(self):
        transformer = models.Transformer('prajjwal1/bert-tiny')
        model = SentenceTransformer(modules=[
            transformer,
            models.Pooling(transformer.get_word_embedding_dimension())
        ])
        softmax_loss = losses.SoftmaxLoss(
            model, transformer.get_word_embedding_dimension(), num_labels=3)

        samples = [
            InputExample(texts=[
                "Hello Word, a first test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=0),
            InputExample(texts=[
                "Hello Word, a second test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=1),
            InputExample(texts=[
                "Hello Word, a third test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=2)
        ]
        dataloader = DataLoader(samples, batch_size=1)
        evaluator = MulticlassEvaluator(dataloader, softmax_model=softmax_loss)
        result = evaluator(model)

        i = 0
Example #3
def stratifiedkfoldtest(data):
    data = data.sample(frac=1, random_state=1).reset_index(drop=True)
    skf = StratifiedKFold(n_splits=10)
    splits = [(x, y) for x, y in skf.split(data, data['label'])]
    f1list = []
    acclist = []
    import torch
    torch.cuda.empty_cache()
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    f = r - a  # free inside reserved
    print(f"Total:{t / 1e+9}, Reserved:{r}, Allocated:{a}, Free:{f}")
    for b in [24]:  # batch size
        for l in [2e-5]:  # learning rate
            for e in [4]:  # epochs
                for train_index, test_index in splits:
                    # reset the model for every fold
                    model = CrossEncoder('cross-encoder/stsb-roberta-base', num_labels=1)
                    # train/test splits
                    train = data.loc[train_index]
                    test = data.loc[test_index]
                    # data loaders
                    train_ = SentencesDataset(
                        [InputExample(texts=[d['query_p'], d['citation_p']], label=int(d['label']))
                         for _, d in train.iterrows()], model)
                    test_ = SentencesDataset(
                        [InputExample(texts=[d['query_p'], d['citation_p']], label=int(d['label']))
                         for _, d in test.iterrows()], model)
                    train_ = DataLoader(train_, batch_size=b)
                    test_ = DataLoader(test_)
                    # training
                    model.fit(train_, epochs=e, optimizer_params={'lr': l})
                    # predictions using encoder similarity
                    y = test['label']
                    dlist = list(test.apply(lambda d: (d['query_p'], d['citation_p']), axis=1))
                    yh = sts_sim(dlist, model)
                    # f1 (`np.nan in f1scores` never detects NaN, so use np.isnan)
                    f1scores, thresholds = f1_macro(y, yh)
                    print(np.isnan(f1scores).any())
                    f1 = np.nanmax(f1scores)  # nanmax matches the threshold picked by nanargmax below
                    f1list.append(f1)
                    print(f1)
                    # accuracy at the best threshold
                    mthres = thresholds[np.nanargmax(f1scores)]
                    yh1 = np.zeros(len(yh))
                    yh1[yh >= mthres] = 1
                    f12 = metrics.f1_score(y, yh1, average='macro')
                    if f12 != f1:
                        import pdb
                        pdb.set_trace()
                    acc = metrics.accuracy_score(y, yh1)
                    print(acc)
                    acclist.append(acc)
                print(b, l, e)
                print("Average Macro F1 across folds:", np.mean(f1list))
                print("Average Acc across folds:", np.mean(acclist))
Example #4
def kfoldtest(data):
    data = data.sample(frac=1, random_state=1).reset_index(drop=True)
    skf = KFold(n_splits=100)
    splits = [(x, y) for x, y in skf.split(data)]
    f1list = []
    acclist = []
    import torch
    print(torch.cuda.is_available())
    for b in [20]:  # batch size
        for l in [2e-5]:  # learning rate
            for e in [4]:  # epochs
                yh = np.array([])
                y = np.array([])
                i = 0
                for train_index, test_index in splits:
                    i += 1
                    print(f"Fold {i}")
                    # reset the model for every fold
                    model = CrossEncoder('cross-encoder/stsb-roberta-base', num_labels=1)
                    # train/test splits
                    train = data.loc[train_index]
                    test = data.loc[test_index]
                    # data loaders
                    train_ = SentencesDataset(
                        [InputExample(texts=[d['query_p'], d['citation_p']], label=int(d['label']))
                         for _, d in train.iterrows()], model)
                    test_ = SentencesDataset(
                        [InputExample(texts=[d['query_p'], d['citation_p']], label=int(d['label']))
                         for _, d in test.iterrows()], model)
                    train_ = DataLoader(train_, batch_size=b)
                    test_ = DataLoader(test_)
                    # training
                    model.fit(train_, epochs=e, optimizer_params={'lr': l})
                    # accumulate predictions across folds (cross-encoder similarity)
                    y = np.append(y, test['label'])
                    dlist = list(test.apply(lambda d: (d['query_p'], d['citation_p']), axis=1))
                    yh = np.append(yh, sts_sim(dlist, model))
                # f1 (`np.nan in f1scores` never detects NaN, so use np.isnan)
                f1scores, thresholds = f1_macro(y, yh)
                print(np.isnan(f1scores).any())
                f1 = np.nanmax(f1scores)  # nanmax matches the threshold picked by nanargmax below
                f1list.append(f1)
                print(f1)
                # accuracy at the best threshold
                mthres = thresholds[np.nanargmax(f1scores)]
                yh1 = np.zeros(len(yh))
                yh1[yh >= mthres] = 1
                f12 = metrics.f1_score(y, yh1, average='macro')
                if f12 != f1:
                    import pdb
                    pdb.set_trace()
                acc = metrics.accuracy_score(y, yh1)
                print(acc)
                acclist.append(acc)
                print(b, l, e)
                print("BERT Fine-Tuned: Average F1 across folds:", np.mean(f1list))
                print("BERT Fine-Tuned: Average Acc across folds:", np.mean(acclist))
Example #5
def part_gen_constructor(sampler, part_df):
    #question_neg_dict = {}
    for question, df in part_df.groupby("question"):
        pos_answer_list = df["answer"].tolist()
        negs = sampler.sample(question, pos_answer_list)  # use the `sampler` argument, not a global
        #negs = sampler.sample(question, [])
        #neg_mg_df = pd.merge(train_part_tiny, pd.DataFrame(np.asarray(negs[0]).reshape([-1, 1]), columns = ["answer"]), on = "answer", how = "inner")
        #question_neg_dict[question] = neg_mg_df
        for pos_answer in pos_answer_list:
            yield InputExample(texts=[question, pos_answer], label=1)
        for neg_answer in negs[0]:
            yield InputExample(texts=[question, neg_answer], label=0)
Example #6
def load_train_sbert(path_train_data, num_samples):
    df = pd.read_csv(path_train_data)
    if num_samples>0:
        df = df.head(num_samples).copy()
    train_examples = [InputExample(texts=[s1, s2], label=int(l)) \
                for s1,s2,l in zip(list(df.sentence1.values), list(df.sentence2.values), list(df.label.values))]
    return train_examples
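The returned examples feed directly into a DataLoader; with integer 0/1 labels, `losses.ContrastiveLoss` (or `OnlineContrastiveLoss`) is a natural pairing. A sketch, assuming `model` is an existing `SentenceTransformer` and the CSV path is a placeholder:

train_examples = load_train_sbert('train.csv', num_samples=-1)  # -1 keeps all rows
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.ContrastiveLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)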
Example #7
    def train(self, train_df, eval_df):
        """

        :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :return:
        """

        # format training data
        if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in train_df.columns:
            if self.args.do_lower_case:
                train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
                train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()

            train_examples = [
                InputExample(str(i), [text_a, text_b], label)
                for i, (text_a, text_b, label) in enumerate(
                    zip(
                        train_df["text_a"].astype(str),
                        train_df["text_b"].astype(str),
                        train_df["labels"].astype(float),
                    ))
            ]
        else:
            raise KeyError(
                'Training data processing - Required columns not found!')

        # format evaluation data
        if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in eval_df.columns:
            if self.args.do_lower_case:
                eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
                eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()

            evaluator = evaluation.EmbeddingSimilarityEvaluator(
                list(eval_df["text_a"]),
                list(eval_df["text_b"]),
                list(eval_df["labels"]),
                batch_size=self.args.eval_batch_size)
        else:
            raise KeyError(
                'Evaluation data processing - Required columns not found!')

        # Define train dataset, the dataloader and the train loss
        train_dataloader = DataLoader(train_examples,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        train_loss = losses.CosineSimilarityLoss(self.model)

        # Tune the model
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=self.args.num_train_epochs,
            warmup_steps=self.args.warmup_steps,
            optimizer_params={'lr': self.args.learning_rate},
            weight_decay=self.args.weight_decay,
            evaluator=evaluator,
            evaluation_steps=self.args.evaluate_during_training_steps,
            max_grad_norm=self.args.max_grad_norm,
            output_path=self.args.best_model_dir,
            show_progress_bar=self.args.show_progress_bar)
Example #8
def get_binary_experimental_setup():
    # Items
    train_items, valid_items = extract_examples("items")

    # Domains
    train_domains, valid_domains = extract_examples("domains")

    # Regroup items and domains together
    train_examples = train_items + train_domains
    valid_examples = valid_items + valid_domains

    print(
        f"{len(train_examples)} training examples / {len(valid_examples)} validation examples"
    )

    # Postprocess train examples to correct format
    train_examples = [
        InputExample(texts=[sent1, sent2], label=label)
        for (sent1, sent2, label) in train_examples
    ]

    # Get evaluator from valid data
    evaluator = evaluation.BinaryClassificationEvaluator(*zip(*valid_examples),
                                                         batch_size=128)

    return train_examples, evaluator
Example #9
    def pretrained_model_score(self, model_name, expected_score):
        model = SentenceTransformer(model_name)
        sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

        if not os.path.exists(sts_dataset_path):
            util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                          sts_dataset_path)

        train_samples = []
        dev_samples = []
        test_samples = []
        with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                score = float(
                    row['score']) / 5.0  # Normalize score to range 0 ... 1
                inp_example = InputExample(
                    texts=[row['sentence1'], row['sentence2']], label=score)

                if row['split'] == 'dev':
                    dev_samples.append(inp_example)
                elif row['split'] == 'test':
                    test_samples.append(inp_example)
                else:
                    train_samples.append(inp_example)

        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            test_samples, name='sts-test')

        score = model.evaluate(evaluator) * 100
        print(model_name,
              "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
        assert score > expected_score or abs(score - expected_score) < 0.1
Example #10
    def test_LabelAccuracyEvaluator(self):
        """Tests that the LabelAccuracyEvaluator can be loaded correctly"""
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')

        nli_dataset_path = 'datasets/AllNLI.tsv.gz'
        if not os.path.exists(nli_dataset_path):
            util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz',
                          nli_dataset_path)

        label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
        dev_samples = []
        with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                if row['split'] == 'train':
                    label_id = label2int[row['label']]
                    dev_samples.append(
                        InputExample(
                            texts=[row['sentence1'], row['sentence2']],
                            label=label_id))
                    if len(dev_samples) >= 100:
                        break

        train_loss = losses.SoftmaxLoss(
            model=model,
            sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
            num_labels=len(label2int))

        dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=16)
        evaluator = evaluation.LabelAccuracyEvaluator(dev_dataloader,
                                                      softmax_model=train_loss)
        acc = evaluator(model)
        assert acc > 0.2
Example #11
def generate_dataset(_grouped):
    def all_same(_sentences):
        for i in _sentences:
            for j in _sentences:
                if i != j:
                    return False
        return True

    def duplicate(data, k=1):
        for cluster_id, _sentences in data.items():
            tmp = copy.deepcopy(_sentences)
            for i in range(k):
                _sentences += tmp

    dataset = []
    same = copy.deepcopy(_grouped)
    diff = copy.deepcopy(_grouped)
    other = copy.deepcopy(_grouped)
    duplicate(same, k=1)  # duplicate for balanced dataset

    for cluster_id, sentences in same.items():
        while len(sentences) > 1:
            if all_same(sentences):
                break
            choices = random.choices(sentences, k=2)
            if choices[0] != choices[1]:
                dataset.append(
                    InputExample(texts=[choices[0], choices[1]], label=1.0))
                sentences.remove(choices[0])
                sentences.remove(choices[1])

    for cluster_id, sentences in diff.items():
        other_cluster = [
            value for key, value in other.items() if key != cluster_id
        ]
        other_cluster = [
            item for sublist in other_cluster for item in sublist
        ]  # flatten lists
        for sentence in sentences:
            choice = random.choice(other_cluster)
            # the original compared against the stale `choices` variable from the
            # loop above; guard against identical text with the current sentence
            if choice != sentence:
                other_cluster.remove(choice)
                dataset.append(
                    InputExample(texts=[sentence, choice], label=0.0))

    print(f"Dataset length: {len(dataset)}")
    return dataset
Example #12
def create_posts_ranking(fl, data_dir, model, validate=None, is_test=False):
    train_posts_ranking = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        for obj in data:
            answers = obj['answers']
            filtered_answers = []
            votes = 1000000
            for answer in answers:
                my_votes = answer['a_votes']
                if my_votes < votes:
                    votes = my_votes
                    filtered_answers.append(answer)

            if len(filtered_answers) > 1:
                rank = len(filtered_answers)
                for answer in filtered_answers:
                    dist = rank / len(filtered_answers)
                    disbn.append(answer['a_rank'])
                    rank = rank - 1
                    train_posts_ranking.append(
                        InputExample(texts=[obj['q_text'], answer['a_text']],
                                     label=dist))

    random.shuffle(train_posts_ranking)

    print("data size " + str(len(train_posts_ranking)))

    if is_test:
        return train_posts_ranking

    if max_size:
        train_posts_ranking = train_posts_ranking[:max_size]

    evaluator = None
    if posts_rank_str == validate:
        train_posts_ranking, dev_posts_ranking = train_test_split(
            train_posts_ranking, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_posts_ranking, name='posts ranking')

    warmup_steps = math.ceil(
        len(train_posts_ranking) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_posts_ranking = SentencesDataset(train_posts_ranking,
                                                model=model)
    train_dataloader_posts_ranking = DataLoader(train_data_posts_ranking,
                                                shuffle=True,
                                                batch_size=batch_size)
    train_loss_posts_ranking = losses.CosineSimilarityLoss(model=model)

    print('R: Number of training examples: ', len(train_posts_ranking))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_posts_ranking) / 0.1)  # note: dividing by 0.1 multiplies by 10; `* 0.1` may have been intended

    return train_dataloader_posts_ranking, train_loss_posts_ranking, evaluator, warmup_steps
Example #13
    def __iter__(self):
        with open(self.triplets_file, 'r') as fIn:
            for line in fIn:
                qid, pos_id, neg_id = line.strip().split()
                query_text = self.queries[qid]
                pos_text = self.corpus[pos_id]
                neg_text = self.corpus[neg_id]
                yield InputExample(texts=[query_text, pos_text, neg_text])
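Because `__iter__` yields unlabeled (query, positive, negative) triplets, this dataset pairs naturally with a triplet objective. A minimal sketch, assuming the surrounding class is called `TripletsDataset` and takes the file plus lookup dicts (names are placeholders):

triplet_dataset = TripletsDataset(triplets_file, queries, corpus)  # hypothetical constructor
train_dataloader = DataLoader(triplet_dataset, batch_size=32)  # no shuffling for an IterableDataset
train_loss = losses.TripletLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)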
Example #14
def read_dataset(dataset_path, split):
    samples = []
    with open(dataset_path, 'r') as fIn:
        for line in tqdm(fIn):
            line_split, query, doc1, doc2, label = line.strip().split('\t')
            if line_split == split:
                samples.append(
                    InputExample(texts=[query, doc1, doc2], label=int(label)))
    return samples
Example #15
def to_input_example(language_list):
    result = []
    for dataset in language_list:
        result.append(
            InputExample(
                texts=[dataset["sentence1"], dataset["sentence2"]],
                label=(dataset["similarity_score"] / 5),
            ))
    return result
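Examples converted this way (scores normalized to 0-1) can be turned into an evaluator the same way other snippets here do it, via `EmbeddingSimilarityEvaluator.from_input_examples`:

dev_examples = to_input_example(dev_language_list)  # dev_language_list is a placeholder
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_examples, name='sts-dev')
evaluator(model)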
Example #16
    def sentence_bert_data_prepare(intent_list, text_intent_map):
        print("开始整理数据")
        replacement_words1 = ['地方', '位置', '地址']
        replacement_words2 = ['没有时间', '没时间', '没空']
        duplicated = set()  # set membership keeps the pairwise de-dup fast
        examples_train = []
        for intent_a in intent_list:
            for text_a in text_intent_map[intent_a]:
                for intent_b in intent_list:
                    for text_b in text_intent_map[intent_b]:
                        if intent_a == intent_b:
                            label = 1
                        else:
                            label = 0
                        temp_a = text_a + text_b
                        temp_b = text_b + text_a

                        if temp_a in duplicated or temp_b in duplicated:
                            continue
                        duplicated.add(temp_b)
                        examples_train.append(
                            InputExample(guid='guid',
                                         texts=[text_a, text_b],
                                         label=float(label)))
                        # for word1 in replacement_words1:
                        #     if word1 in text_a:
                        #         for word2 in replacement_words1:
                        #             if word1 != word2:
                        #                 new = text_a.replace(word1, word2)
                        #                 examples_train.append(
                        #                     InputExample(guid='guid', texts=[new, text_b], label=float(label)))
                        #     if word1 in text_b:
                        #         for word2 in replacement_words1:
                        #             if word1 != word2:
                        #                 new = text_a.replace(word1, word2)
                        #                 examples_train.append(
                        #                     InputExample(guid='guid', texts=[text_a, new], label=float(label)))
                        #     # if word1 in text_b and word1 in text_a:
                        #     #     for word2 in replacement_words1:
                        #     #         if word1!=word2:
                        #     #             new1=text_a.replace(word1,word2)
                        #     #             new2=text_b.replace(word1,word2)
                        #     #             examples_train.append(InputExample(guid='guid', texts=[new, new], label=float(label)))
                        # for word1 in replacement_words2:
                        #     if word1 in text_a:
                        #         for word2 in replacement_words2:
                        #             if word1 != word2:
                        #                 new = text_a.replace(word1, word2)
                        #                 examples_train.append(
                        #                     InputExample(guid='guid', texts=[new, text_b], label=float(label)))
                        #     if word1 in text_b:
                        #         for word2 in replacement_words2:
                        #             if word1 != word2:
                        #                 new = text_a.replace(word1, word2)
                        #                 examples_train.append(
                        #                     InputExample(guid='guid', texts=[text_a, new], label=float(label)))
        return examples_train
Example #17
def evaluate_sbert(model, batch_size=16):
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

    test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'test':
                score = float(
                    row['score']) / 5.0  #Normalize score to range 0 ... 1
                test_samples.append(
                    InputExample(texts=[row['sentence1'], row['sentence2']],
                                 label=score))

    sentences1 = []
    sentences2 = []
    scores = []

    examples = test_samples

    for example in examples:
        sentences1.append((example.texts[0], 'none'))
        sentences2.append((example.texts[1], 'none'))
        scores.append(example.label)

    _, embeddings1 = model.forward(sentences1, checkpoint=False)
    _, embeddings2 = model.forward(sentences2, checkpoint=False)
    labels = scores

    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
    ]

    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

    eval_pearson_dot, _ = pearsonr(labels, dot_products)
    eval_spearman_dot, _ = spearmanr(labels, dot_products)

    print("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_cosine, eval_spearman_cosine))
    print("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_manhattan, eval_spearman_manhattan))
    print("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_euclidean, eval_spearman_euclidean))
    print("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_dot, eval_spearman_dot))
Example #18
def get_softmax_experimental_setup():
    train_examples, num_labels = extract_classif_examples()

    # Postprocess train examples to correct format
    train_examples = [
        InputExample(texts=[sent, sent], label=label)
        for (sent, label) in train_examples
    ] * 100

    return train_examples, num_labels
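These classification examples pair each sentence with itself plus an integer class label, which matches a `losses.SoftmaxLoss` setup like the one in Example #10. A sketch, assuming `model` is an existing `SentenceTransformer`:

train_examples, num_labels = get_softmax_experimental_setup()
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=num_labels)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)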
Example #19
def get_file_data(filename):
    dataset = []
    data_file = open(filename, 'r', encoding='utf8')
    for line in data_file.readlines()[1:]:
        info = line.split('\t')
        dataset.append(
            InputExample(texts=[info[2].strip(), info[3].strip()],
                         label=float(info[1].strip())))
    data_file.close()
    return dataset
Example #20
 def read(self, data, return_pt=False):
     sentence1 = data['sent1'].tolist()
     sentence2 = data['sent2'].tolist()
     labels = data['label'].tolist()
     if return_pt:
         dataloader = []
         for s1, s2, l in zip(sentence1, sentence2, labels):
             dataloader.append(InputExample(texts=[s1, s2], label=l))
         return dataloader
     return sentence1, sentence2, labels
Example #21
def create_hirerachy_examples(fl,
                              data_dir,
                              model,
                              validate=None,
                              is_test=False):
    train_hierarchy_samples = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        max_distance = 0
        for obj in data:
            if obj['distance'] > max_distance:
                max_distance = obj['distance']
        for obj in data:
            # flip the meaning of similarity, since the more distant the two classes, the closer to 0 it should be
            dist = (max_distance - obj['distance']) / (max_distance - 1)
            train_hierarchy_samples.append(
                InputExample(texts=[obj['class1'], obj['class2']], label=dist))
            disbn.append(obj['distance'])
    random.shuffle(train_hierarchy_samples)
    train_hierarchy_samples = train_hierarchy_samples[:100000]
    disbn = disbn[:100000]

    if max_size:
        train_hierarchy_samples = train_hierarchy_samples[:max_size]
        disbn = disbn[:max_size]

    if is_test:
        return train_hierarchy_samples

    evaluator = None

    if hierarchy_str == validate:
        train_hierarchy_samples, dev_hierarchy_samples = train_test_split(
            train_hierarchy_samples, stratify=disbn, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_hierarchy_samples, name='hierarchy')

    warmup_steps = math.ceil(
        len(train_hierarchy_samples) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_hierarchy = SentencesDataset(train_hierarchy_samples,
                                            model=model)
    train_dataloader_hierarchy = DataLoader(train_data_hierarchy,
                                            shuffle=True,
                                            batch_size=batch_size)
    train_loss_hierarchy = losses.CosineSimilarityLoss(model=model)

    print('H: Number of training examples: ', len(train_hierarchy_samples))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_hierarchy_samples) / 0.1)
    return train_dataloader_hierarchy, train_loss_hierarchy, evaluator, warmup_steps
Example #22
def get_triplet_data(config):
    dev_data = get_file_data(
        os.path.join(config['eval_dir'], config['dev_file']))
    train_file = open(
        os.path.join(config['train_dir'], config['train_triplet_file']))
    train_dataset = []
    for line in train_file.readlines():
        info = line.strip().split('\t')
        anchor, positive, negative = info[0], info[1], info[2]
        train_dataset.append(InputExample(texts=[anchor, positive, negative]))
    train_file.close()
    return train_dataset, dev_data
Example #23
def finetune_sbert(model, df, rep_sents, finetune_cfg):
    """Finetune the Sentence-BERT."""
    # setup
    train_size = finetune_cfg.get("train_size", 200000)
    sample_per_pair = finetune_cfg.get("sample_per_pair", 5)
    train_batch_size = finetune_cfg.get("train_batch_size", 32)
    epochs = finetune_cfg.get("epochs", 1)
    train = []
    n_sampled = 0
    cnts = [0, 0]  # [neg, pos]
    max_label_size = train_size // 2
    genres = df.genres.apply(set)

    with tqdm(total=train_size, position=0) as pbar:
        # sample sentence pairs
        while n_sampled < train_size:
            id1, id2 = np.random.randint(0, len(df), 2)
            label = int(bool(set.intersection(genres[id1], genres[id2])))

            if cnts[label] > max_label_size:
                continue

            sent_pairs = np.stack(np.meshgrid(rep_sents[id1],
                                              rep_sents[id2])).T.reshape(
                                                  -1, 2)
            if len(sent_pairs) <= sample_per_pair:
                samples = sent_pairs
            else:
                samples_idx = np.random.choice(sent_pairs.shape[0],
                                               sample_per_pair,
                                               replace=False)
                samples = sent_pairs[samples_idx]

            inexp = lambda pair: InputExample(texts=list(pair), label=label)
            samples = list(map(inexp, samples))
            train.extend(samples)

            n_sampled += len(samples)
            cnts[label] += len(samples)
            pbar.update(len(samples))

        # run finetune
        train_ds = SentencesDataset(train, model)
        train_obj = (
            DataLoader(train_ds, shuffle=True, batch_size=train_batch_size),
            losses.ContrastiveLoss(model=model),
        )
        model.fit(train_objectives=[train_obj],
                  epochs=epochs,
                  warmup_steps=100)
        os.makedirs("model/clustering/sbert", exist_ok=True)
        model.save("model/clustering/sbert")
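After `model.save`, the fine-tuned encoder reloads by path like any other SentenceTransformer:

model = SentenceTransformer("model/clustering/sbert")
embeddings = model.encode(["an example sentence"])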
Example #24
def load_data(training_data_file, training_split=0.9, batch_size=16):
    train_examples = []
    validation_examples = []
    for row in jsonlines.open(training_data_file):
        query = row["query"]
        for positive in row["positives"]:
            sample = InputExample(texts=[query, positive], label=1.0)
            if random.random() < training_split:
                train_examples.append(sample)
            else:
                validation_examples.append(sample)
        for negative in row["negatives"]:
            sample = InputExample(texts=[query, negative], label=0.0)
            if random.random() < training_split:
                train_examples.append(sample)
            else:
                validation_examples.append(sample)

    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=batch_size)
    return train_dataloader, validation_examples
Example #25
    def __getitem__(self, item):
        query = self.queries[self.queries_ids[item]]
        query_text = query['query']

        pos_id = query['pos'].pop(0)  #Pop positive and add at end
        pos_text = self.corpus[pos_id]
        query['pos'].append(pos_id)

        neg_id = query['neg'].pop(0)  #Pop negative and add at end
        neg_text = self.corpus[neg_id]
        query['neg'].append(neg_id)

        return InputExample(texts=[query_text, pos_text, neg_text])
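Since `__getitem__` returns unlabeled (query, positive, negative) triplets, a dataset like this is typically trained with `losses.MultipleNegativesRankingLoss`, which uses the explicit hard negative and also treats the other in-batch positives as negatives. A sketch, assuming the surrounding class is named `MSMARCODataset` (a placeholder):

train_dataset = MSMARCODataset(queries, corpus)  # hypothetical constructor
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)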
Example #26
    def __getitem__(self, index):
        # Load the code and the descriptions
        code_snippet = self.code_snippets[index]
        positive_desc = self.descriptions[index]
        # negative_desc = self.descriptions[random.randint(0, self.__len__() - 1)]
        negative_candidates = list(range(self.__len__()))
        negative_candidates.remove(index)
        negative_index = random.choice(negative_candidates)
        negative_desc = self.descriptions[negative_index]

        input_example = InputExample(
            texts=[code_snippet, positive_desc, negative_desc])

        return input_example
Example #27
    def run(
        self,
        training_data,
        evaluator,
        output_path,
        from_scratch=False,
        loss=SentenceTransformerLoss.cosine_similarity_loss,
        model_name_or_path="roberta-large-nli-stsb-mean-tokens",
        cuda=True,
        **kwargs,
    ):
        logger.info(
            f"Running Sentence Transformer Task: {model_name_or_path}, Output path: {output_path}"
        )
        if from_scratch:
            logger.info("Training from scratch")
            # the original never assigned the Transformer, leaving `model` undefined;
            # build the standard word-embedding + pooling stack
            word_embedding_model = models.Transformer(
                model_name_or_path,
                max_seq_length=kwargs.get("max_seq_length", 128))
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension())
            model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])
        else:
            model = SentenceTransformer(model_name_or_path)
        if cuda:
            logger.info("Running model on GPU")
            model.cuda()

        train_examples = [
            InputExample(texts=[data["sentence1"], data["sentence2"]],
                         label=data["label"])
            for data in training_data.values()
        ]
        train_dataset = SentencesDataset(train_examples, model)
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=kwargs.get("shuffle", True),
            batch_size=kwargs.get("batch_size", 4),
        )
        warmup_steps = math.ceil(
            len(train_examples) * kwargs.get("num_epochs", 3) /
            kwargs.get("train_batch_size", 4) *
            0.1)  # 10% of train data for warm-up
        train_loss = loss.value(model)
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=kwargs.get("num_epochs", 3),
            evaluation_steps=kwargs.get("evaluation_steps", 500),
            warmup_steps=warmup_steps,
            output_path=output_path,
            evaluator=evaluator,
        )
Example #28
def construct_train_samples(json_obj, neg_random=False):
    train_samples = []
    label = 1
    for t2 in json_obj["pos_tuple_set"]:
        q_index, a_index = t2[0], t2[1]
        q, a = json_obj["index_question_dict"][q_index], json_obj[
            "index_answer_dict"][a_index]
        #q = q + "<sep>"
        train_samples.append(InputExample(texts=[q, a], label=label))
    if neg_random:
        neg_len = len(json_obj["index_answer_dict"])
    label = 0
    for t2 in json_obj["neg_tuple_set"]:
        q_index, a_index = t2[0], t2[1]
        if neg_random:
            q, a = json_obj["index_question_dict"][q_index], json_obj[
                "index_answer_dict"][np.random.randint(0, neg_len)]
        else:
            q, a = json_obj["index_question_dict"][q_index], json_obj[
                "index_answer_dict"][a_index]
        #q = q + "<sep>"
        train_samples.append(InputExample(texts=[q, a], label=label))
    train_indexes = np.random.permutation(np.arange(len(train_samples)))
    return list(map(lambda idx: train_samples[idx], train_indexes))
Example #29
def create_linked_posts(fl, data_dir, model, validate=None, is_test=False):
    train_linked_posts = []
    disbn = []

    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        for obj in data:
            if obj['class'] == 'relevant':
                label = 1
            else:
                label = 0
            disbn.append(label)

            train_linked_posts.append(
                InputExample(texts=[obj['text_1'], obj['text_2']],
                             label=label))
    random.shuffle(train_linked_posts)

    if is_test:
        return train_linked_posts

    if max_size:
        train_linked_posts = train_linked_posts[:max_size]

    evaluator = None
    if linked_posts_str == validate:
        train_linked_posts, dev_linked_posts = train_test_split(
            train_linked_posts, stratify=disbn, test_size=0.1)
        evaluator = BinaryClassificationEvaluator.from_input_examples(
            dev_linked_posts, name='linked-posts')

    warmup_steps = math.ceil(
        len(train_linked_posts) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_linked_posts = SentencesDataset(train_linked_posts, model=model)
    train_dataloader_linked_posts = DataLoader(train_data_linked_posts,
                                               shuffle=True,
                                               batch_size=batch_size)
    train_loss_linked_posts = losses.ContrastiveLoss(model=model)

    print('L: Number of training examples: ', len(train_linked_posts))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_linked_posts) / 0.1)

    return train_dataloader_linked_posts, train_loss_linked_posts, evaluator, warmup_steps
Example #30
def create_train_usage(fl, data_dir, model, validate=None, is_test=False):
    train_usage = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        min_d = 10000000
        max_d = 0
        for obj in data:
            dist = obj['distance']
            if dist < min_d:
                min_d = dist
            if dist > max_d:
                max_d = dist
        for obj in data:
            dist = (max_d - obj['distance']) / (max_d - min_d)
            train_usage.append(
                InputExample(texts=[obj['class1'], obj['class2']], label=dist))

    random.shuffle(train_usage)

    if is_test:
        return train_usage

    if max_size:
        train_usage = train_usage[:max_size]

    evaluator = None

    if usage_str == validate:
        train_usage, dev_usage = train_test_split(train_usage, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_usage, name='usage')
    warmup_steps = math.ceil(len(train_usage) * num_epochs / batch_size *
                             0.1)  # 10% of train data for warm-up

    train_data_usage = SentencesDataset(train_usage, model=model)
    train_dataloader_usage = DataLoader(train_data_usage,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_usage = losses.CosineSimilarityLoss(model=model)

    print('U: Number of training examples: ', len(train_usage))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_usage) / 0.1)

    return train_dataloader_usage, train_loss_usage, evaluator, warmup_steps