def get_triples(cluster_data, max_triples_per_page):
    all25_triples = []
    for c in trange(len(cluster_data)):
        text = cluster_data[c].texts
        t = list(cluster_data[c].label)
        triples = []
        page_done = False
        for i in range(len(t) - 2):
            for j in range(i + 1, len(t) - 1):
                for k in range(i + 2, len(t)):
                    if len(set([t[i], t[j], t[k]])) == 2:
                        if t[i] == t[j]:
                            triples.append(
                                InputExample(texts=[text[i], text[j], text[k]], label=0))
                        elif t[j] == t[k]:
                            triples.append(
                                InputExample(texts=[text[j], text[k], text[i]], label=0))
                        else:
                            triples.append(
                                InputExample(texts=[text[i], text[k], text[j]], label=0))
                        if max_triples_per_page > 0 and len(triples) >= max_triples_per_page:
                            page_done = True
                            break
                if page_done:
                    break
            if page_done:
                break
        all25_triples += triples
    return all25_triples
def prepare_cluster_data(train_pages_to_cluster, test_pages_to_cluster, val_samples):
    ng_train = fetch_20newsgroups(subset='train',
                                  remove=('headers', 'footers', 'quotes'))
    ng_test = fetch_20newsgroups(subset='test',
                                 remove=('headers', 'footers', 'quotes'))
    print(ng_train.target_names)
    ng_train.keys()
    train_cluster_data = []
    test_cluster_data = []
    for i in range(len(ng_train['filenames']) // train_pages_to_cluster):
        train_cluster_data.append(
            InputExample(
                texts=ng_train['data'][i * train_pages_to_cluster:(i + 1) * train_pages_to_cluster],
                label=ng_train['target'][i * train_pages_to_cluster:(i + 1) * train_pages_to_cluster]))
    val_cluster_data = train_cluster_data[-val_samples:]
    train_cluster_data = train_cluster_data[:-val_samples]
    for i in range(len(ng_test['filenames']) // test_pages_to_cluster):
        test_cluster_data.append(
            InputExample(
                texts=ng_test['data'][i * test_pages_to_cluster:(i + 1) * test_pages_to_cluster],
                label=ng_test['target'][i * test_pages_to_cluster:(i + 1) * test_pages_to_cluster]))
    print("Train instances: %5d" % len(train_cluster_data))
    print("Val instances: %5d" % len(val_cluster_data))
    print("Test instances: %5d" % len(test_cluster_data))
    return train_cluster_data, val_cluster_data, test_cluster_data
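# Hedged usage sketch (not part of the original code): wire prepare_cluster_data
# into get_triples above. Assumes both helpers live in the same module and that
# scikit-learn can download 20 Newsgroups; the cluster sizes and triple cap below
# are illustrative values only.
if __name__ == '__main__':
    train_cluster_data, val_cluster_data, test_cluster_data = prepare_cluster_data(
        train_pages_to_cluster=50, test_pages_to_cluster=50, val_samples=50)
    # Derive (anchor, positive, negative)-style triples from the training clusters
    train_triples = get_triples(train_cluster_data, max_triples_per_page=1000)
    print('Sampled %d training triples' % len(train_triples))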
def get_examples(self, filename, max_examples=0):
    """
    filename specifies which data split to use (train.csv, dev.csv, test.csv).
    """
    filepath = os.path.join(self.dataset_folder, filename)
    self.data = preprocess_crr.read_crr_tsv_as_df(filepath)
    self.negative_sampler = negative_sampling.RandomNegativeSampler(
        list(self.data["response"].values), 1)
    examples = []
    for idx, row in enumerate(
            tqdm(self.data.itertuples(index=False), total=len(self.data))):
        context = row[0]
        relevant_response = row[1]
        examples.append(
            InputExample(guid=filename + str(idx) + "_pos",
                         texts=[context, relevant_response],
                         label=1.0))
        ns_candidates, _, _ = self.negative_sampler.sample(
            context, relevant_response)
        for ns in ns_candidates:
            examples.append(
                InputExample(guid=filename + str(idx) + "_neg",
                             texts=[context, ns],
                             label=0.0))
    return examples
def get_frac_triples(cluster_data, num_triples_frac):
    frac_triples = []
    for c in trange(len(cluster_data)):
        text = cluster_data[c].texts
        t = list(cluster_data[c].label)
        triples = []
        for i in range(len(t) - 2):
            for j in range(i + 1, len(t) - 1):
                for k in range(i + 2, len(t)):
                    if len(set([t[i], t[j], t[k]])) == 2:
                        if t[i] == t[j]:
                            triples.append(
                                InputExample(texts=[text[i], text[j], text[k]], label=0))
                        elif t[j] == t[k]:
                            triples.append(
                                InputExample(texts=[text[j], text[k], text[i]], label=0))
                        else:
                            triples.append(
                                InputExample(texts=[text[i], text[k], text[j]], label=0))
        frac_triples += random.sample(triples, len(triples) // num_triples_frac)
    print('No of train triples: %2d' % len(frac_triples))
    return frac_triples
def load_pairwise_data(args, split):
    data = pd.read_csv(
        os.path.join(args.data_dir, "pairwise_pos_%s.csv" % split))
    train_samples = []
    for index, row in data.iterrows():
        train_samples.append(
            InputExample(texts=[row['title_1'], row['title_2']], label=1))
        train_samples.append(
            InputExample(texts=[row['title_2'], row['title_1']], label=1))
    return train_samples
def setUp(self):
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
    if not os.path.exists(sts_dataset_path):
        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                      sts_dataset_path)

    nli_dataset_path = 'datasets/AllNLI.tsv.gz'
    if not os.path.exists(nli_dataset_path):
        util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz',
                      nli_dataset_path)

    # Read NLI
    label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
    self.nli_train_samples = []
    max_train_samples = 10000
    with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'train':
                label_id = label2int[row['label']]
                self.nli_train_samples.append(
                    InputExample(texts=[row['sentence1'], row['sentence2']],
                                 label=label_id))
                if len(self.nli_train_samples) >= max_train_samples:
                    break

    # Read STSB
    self.stsb_train_samples = []
    self.dev_samples = []
    self.test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
            inp_example = InputExample(
                texts=[row['sentence1'], row['sentence2']], label=score)

            if row['split'] == 'dev':
                self.dev_samples.append(inp_example)
            elif row['split'] == 'test':
                self.test_samples.append(inp_example)
            else:
                self.stsb_train_samples.append(inp_example)
def examples_for_q_answers(q_text: str, a_texts: List[str], a_dists: List[float] = None):
    if a_dists is None:
        a_dists = [0] * len(a_texts)
    for a_text, a_dist in zip(a_texts, a_dists):
        yield InputExample("Infer_example", [q_text, a_text], a_dist)
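# Hedged usage sketch for examples_for_q_answers (the strings below are invented,
# not from the original repository): wrap one question and two candidate answers
# into InputExamples for inference; with a_dists omitted, every label defaults to 0.
candidate_answers = ["Use a context manager.", "Call close() explicitly."]
infer_examples = list(
    examples_for_q_answers("How do I close a file in Python?", candidate_answers))
print(len(infer_examples))  # 2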
def main():
    parser = argparse.ArgumentParser(description='Evaluate saved models')
    parser.add_argument('-dt', '--data', default='trec')
    parser.add_argument('-in', '--input_dir', default='~/trec_dataset')
    parser.add_argument('-mp', '--model_path')
    parser.add_argument('-lv', '--level', default='t')
    args = parser.parse_args()
    dataset = args.data
    input_dir = args.input_dir
    model_path = args.model_path
    level = args.level
    if dataset == 'trec':
        test_art_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-article.qrels'
        test_top_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-toplevel.qrels'
        test_hier_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-hierarchical.qrels'
        test_paratext = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/by1test_paratext/by1test_paratext.tsv'
        evaluate_treccar(model_path, test_art_qrels, test_top_qrels,
                         test_hier_qrels, test_paratext, level)
    elif dataset == '20ng':
        pages_to_cluster = 50
        ng_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'))
        test_cluster_data = []
        for i in range(len(ng_test['filenames']) // pages_to_cluster):
            test_cluster_data.append(
                InputExample(
                    texts=ng_test['data'][i * pages_to_cluster:(i + 1) * pages_to_cluster],
                    label=ng_test['target'][i * pages_to_cluster:(i + 1) * pages_to_cluster]))
        print("Test instances: %5d" % len(test_cluster_data))
        evaluate_ng20(model_path, test_cluster_data)
def setUp(self):
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
    if not os.path.exists(sts_dataset_path):
        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                      sts_dataset_path)

    # Read STSB
    self.stsb_train_samples = []
    self.dev_samples = []
    self.test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
            inp_example = InputExample(
                texts=[row['sentence1'], row['sentence2']], label=score)

            if row['split'] == 'dev':
                self.dev_samples.append(inp_example)
            elif row['split'] == 'test':
                self.test_samples.append(inp_example)
            else:
                self.stsb_train_samples.append(inp_example)
def get_examples(self, language, split='train'):
    pairs = self._load_data()
    if language == 'hi':
        pairs = pairs['hindi_headlines']
    elif language == 'pt':
        pairs = pairs['ciper']
    else:
        pairs = pairs['fact_pairs']

    split_point = int(round(len(pairs) * 0.8))
    if split == 'train':
        pairs = pairs[:split_point]
    else:
        pairs = pairs[split_point:]

    examples = []
    for i, item in enumerate(pairs):
        guid = i
        sentence1 = item['lookup_text']
        sentence2 = item['database_text']
        label = item['label']
        examples.append(
            InputExample(guid=guid, texts=[sentence1, sentence2], label=label))
    return examples
def get_examples(self, language):
    with open('../data/xnli/xnli.dev.jsonl', 'r') as json_file:
        json_list = list(json_file)
    xnli_data = [json.loads(line) for line in json_list]

    with open('../data/xnli/xnli.test.jsonl', 'r') as json_file:
        json_list = list(json_file)
    xnli_data += [json.loads(line) for line in json_list]

    xnli_data = [item for item in xnli_data if item['language'] == language]

    examples = []
    for item in xnli_data:
        guid = item['pairID']
        sentence1 = item['sentence1']
        sentence2 = item['sentence2']
        label = item['gold_label']
        examples.append(
            InputExample(guid=guid, texts=[sentence1, sentence2],
                         label=self.map_label(label)))
    return examples
def get_examples(self, filename, max_examples=0):
    """
    filename specifies which data split to use (train.csv, dev.csv, test.csv).
    """
    filepath = os.path.join(self.dataset_folder, filename)
    with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn:
        data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting)
        examples = []
        for id, row in enumerate(data):
            score = float(row[self.score_col_idx])
            if self.normalize_scores:  # Normalize to a 0...1 value
                score = (score - self.min_score) / (self.max_score - self.min_score)

            s1 = row[self.s1_col_idx]
            s2 = row[self.s2_col_idx]
            examples.append(
                InputExample(guid=filename + str(id), texts=[s1, s2], label=score))

            if max_examples > 0 and len(examples) >= max_examples:
                break

    return examples
def get_examples(self):
    bhaav = pd.read_csv(
        "../data/recasted-hindi-nli-data/bhaav/bhaav_recasted.tsv", sep="\t")
    bhaav = bhaav.dropna(subset=['entailment'])
    mr = pd.read_csv(
        "../data/recasted-hindi-nli-data/MR/recasted_movie_review_data.tsv",
        sep="\t")
    pr = pd.read_csv(
        "../data/recasted-hindi-nli-data/PR/recasted_product_review_data.tsv",
        sep="\t")

    examples = []
    idx = 0
    for _, item in pd.concat([bhaav, mr, pr]).iterrows():
        guid = idx
        idx += 1
        sentence1 = item['context']
        sentence2 = item['hypothesis']
        label = item['entailment']
        examples.append(
            InputExample(guid=guid, texts=[sentence1, sentence2],
                         label=self.map_label(label)))
    return examples
def triplets_from_labeled_dataset(input_examples):
    # Create triplets for a [(label, sentence), (label, sentence)...] dataset
    # by using each example as an anchor and selecting randomly a
    # positive instance with the same label and a negative instance with a different label
    triplets = []
    label2sentence = defaultdict(list)
    for inp_example in input_examples:
        label2sentence[inp_example.label].append(inp_example)

    for inp_example in input_examples:
        anchor = inp_example

        # We need at least 2 examples per label to create a triplet
        if len(label2sentence[inp_example.label]) < 2:
            continue

        positive = None
        while positive is None or positive.guid == anchor.guid:
            positive = random.choice(label2sentence[inp_example.label])

        negative = None
        while negative is None or negative.label == anchor.label:
            negative = random.choice(input_examples)

        triplets.append(
            InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

    return triplets
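# Minimal sketch of triplets_from_labeled_dataset on a hand-made labelled set;
# the sentences and labels are invented for illustration, and InputExample is
# assumed to come from sentence_transformers as in the surrounding snippets.
from sentence_transformers.readers import InputExample

labeled = [
    InputExample(guid=str(i), texts=[s], label=l)
    for i, (s, l) in enumerate([
        ("the cat sat on the mat", 0),
        ("a cat was sitting down", 0),
        ("stock prices fell sharply", 1),
        ("markets dropped again today", 1),
    ])
]
toy_triplets = triplets_from_labeled_dataset(labeled)
print(len(toy_triplets))  # one (anchor, positive, negative) triple per input example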
def get_examples(self, filename, max_examples=0):
    """
    filename specifies which data split to use (train.csv, dev.csv, test.csv).
    """
    data = csv.reader(open(os.path.join(self.dataset_folder, filename),
                           encoding="utf-8"),
                      delimiter=self.delimiter,
                      quoting=self.quoting)
    examples = []
    for id, row in enumerate(data):
        try:
            score = float(row[self.score_col_idx])
        except ValueError:
            # Skip rows whose score column cannot be parsed as a float
            print(row[self.score_col_idx])
            continue
        if self.normalize_scores:  # Normalize to a 0...1 value
            score = (score - self.min_score) / (self.max_score - self.min_score)

        s1 = row[self.s1_col_idx]
        s2 = row[self.s2_col_idx]
        examples.append(
            InputExample(guid=filename + str(id), texts=[s1, s2], label=score))

        if max_examples > 0 and len(examples) >= max_examples:
            break

    return examples
def get_examples(self, fn):
    examples = []
    for line in open(fn):
        sent1, sent2, label = line.strip().split('\t')
        examples.append(
            InputExample(guid=self.guid, texts=[sent1, sent2], label=int(label)))
        self.guid += 1
    return examples
def get_data(data_file):
    train_samples = []
    with open(data_file, "r", encoding="utf-8") as f:
        dataset = json.loads(f.read())
        for cur_dialg in dataset:
            # Whether the dialogue is single- or multi-turn, only keep the two
            # most relevant leading utterances
            train_samples.append(
                InputExample(texts=[
                    cur_dialg[0].strip().replace(" ", ""),
                    cur_dialg[1].strip().replace(" ", "")
                ], label=1))
    return train_samples
def trec_dataset(
    directory="datasets/trec/",
    train_filename="train_5500.label",
    test_filename="TREC_10.label",
    validation_dataset_nb=500,
    urls=[
        "http://cogcomp.org/Data/QA/QC/train_5500.label",
        "http://cogcomp.org/Data/QA/QC/TREC_10.label",
    ],
):
    os.makedirs(directory, exist_ok=True)

    ret = []
    for url, filename in zip(urls, [train_filename, test_filename]):
        full_path = os.path.join(directory, filename)
        urllib.request.urlretrieve(url, filename=full_path)

        examples = []
        label_map = {}
        guid = 1
        for line in open(full_path, "rb"):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b"\xf0", b" ").strip().decode().partition(" ")

            # We extract the upper category (e.g. DESC from DESC:def)
            label, _, _ = label.partition(":")

            if label not in label_map:
                label_map[label] = len(label_map)

            label_id = label_map[label]
            guid += 1
            examples.append(InputExample(guid=guid, texts=[text], label=label_id))
        ret.append(examples)

    train_set, test_set = ret
    dev_set = None

    # Create a dev set from train set
    if validation_dataset_nb > 0:
        dev_set = train_set[-validation_dataset_nb:]
        train_set = train_set[:-validation_dataset_nb]

    # For dev & test set, we return triplets (anchor, positive, negative)
    random.seed(42)  # Fix seed, so that we always get the same triplets
    dev_triplets = triplets_from_labeled_dataset(dev_set)
    test_triplets = triplets_from_labeled_dataset(test_set)

    return train_set, dev_triplets, test_triplets
def examples_from_questions_tup(questions: Iterable[Tuple[int, Question]]):
    for q_i, q in questions:
        if q_i % 10000 == 0:
            print("Loading %s" % q_i)
        if q.answers is None:
            continue
        all_q_upvotes = [a.score for a in q.answers]
        all_q_dists = upvotes_to_distance(all_q_upvotes)
        if np.isnan(all_q_dists).any():
            # skip questions with equally-rated answers: these are mostly 0-vote
            # answers, so we know nothing about their relative quality
            continue
        for a_i, a in enumerate(q.answers):
            yield InputExample("%s_%s" % (q_i, a_i), [q.body, a.body], all_q_dists[a_i])
def get_data(data_file):
    train_samples = []
    discard_num = 0
    with open(data_file, "r", encoding="utf-8") as f:
        dataset = json.loads(f.read())
        for cur_dialg in dataset:
            query_sent = data_clean(cur_dialg[0])
            content_sent = data_clean(cur_dialg[1])
            if len(query_sent) == 0 or len(content_sent) == 0:
                discard_num += 1
                continue
            else:
                # Whether the dialogue is single- or multi-turn, only keep the
                # two most relevant leading utterances
                train_samples.append(
                    InputExample(texts=[query_sent, content_sent], label=1))
    return train_samples
def trec_dataset(
    directory="datasets/trec/",
    train_filename="train_5500.label",
    test_filename="TREC_10.label",
    validation_dataset_nb=500,
    urls=[
        "http://cogcomp.org/Data/QA/QC/train_5500.label",
        "http://cogcomp.org/Data/QA/QC/TREC_10.label",
    ],
):
    os.makedirs(directory, exist_ok=True)
    ret = []

    for url, filename in zip(urls, [train_filename, test_filename]):
        full_path = os.path.join(directory, filename)
        urllib.request.urlretrieve(url, filename=full_path)

        examples = []
        label_map = {}
        guid = 0
        for line in open(full_path, "rb"):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b"\xf0", b" ").strip().decode().partition(" ")

            # We extract the upper category (e.g. DESC from DESC:def)
            label, _, _ = label.partition(":")

            if label not in label_map:
                label_map[label] = len(label_map)
            guid += 1
            label_id = label_map[label]
            examples.append(InputExample(guid=guid, texts=[text], label=label_id))
        ret.append(examples)

    # Validation dataset:
    # It doesn't exist in the original dataset,
    # so we create one by splitting the train data
    # Ret[0] is train
    # Ret[1] is test
    # Ret[2] is val
    if validation_dataset_nb > 0:
        ret.append(ret[0][-validation_dataset_nb:])
        ret[0] = ret[0][:-validation_dataset_nb]

    return ret
def get_pairs(cluster_data, balanced):
    pairs = []
    if balanced:
        print('Going to balance the datasets')
    for c in trange(len(cluster_data)):
        text = cluster_data[c].texts
        t = list(cluster_data[c].label)
        pos_pairs, neg_pairs = [], []
        for i in range(len(t) - 1):
            for j in range(i + 1, len(t)):
                if t[i] == t[j]:
                    pos_pairs.append(
                        InputExample(texts=[text[i], text[j]], label=1))
                else:
                    neg_pairs.append(
                        InputExample(texts=[text[i], text[j]], label=0))
        if balanced:
            neg_pairs = random.sample(neg_pairs, len(pos_pairs))
        pairs_loc = pos_pairs + neg_pairs
        random.shuffle(pairs_loc)
        pairs += pairs_loc
    print('No of train pairs: %2d' % len(pairs))
    return pairs
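# Hedged sketch for get_pairs: one synthetic "cluster" whose texts and labels are
# invented for illustration; with balanced=True the negatives are down-sampled to
# match the number of positives.
from sentence_transformers.readers import InputExample

toy_cluster = [InputExample(texts=["doc a", "doc b", "doc c", "doc d"],
                            label=[0, 0, 1, 1])]
toy_pairs = get_pairs(toy_cluster, balanced=True)
print(len(toy_pairs))  # 2 positive + 2 sampled negative pairs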
def trec_dataset(
    directory="datasets/trec/",
    train_filename="train_5500.label",
    test_filename="TREC_10.label",
    validation_dataset_nb=500,
    urls=[
        "https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label",
        "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label",
    ],
):
    os.makedirs(directory, exist_ok=True)

    ret = []
    for url, filename in zip(urls, [train_filename, test_filename]):
        full_path = os.path.join(directory, filename)
        if not os.path.exists(full_path):
            util.http_get(url, full_path)

        examples = []
        label_map = {}
        for guid, line in enumerate(open(full_path, "rb"), start=2):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b"\xf0", b" ").strip().decode().partition(" ")

            if label not in label_map:
                label_map[label] = len(label_map)

            label_id = label_map[label]
            examples.append(InputExample(guid=guid, texts=[text], label=label_id))
        ret.append(examples)

    train_set, test_set = ret
    dev_set = None

    # Create a dev set from train set
    if validation_dataset_nb > 0:
        dev_set = train_set[-validation_dataset_nb:]
        train_set = train_set[:-validation_dataset_nb]

    # For dev & test set, we return triplets (anchor, positive, negative)
    random.seed(42)  # Fix seed, so that we always get the same triplets
    dev_triplets = triplets_from_labeled_dataset(dev_set)
    test_triplets = triplets_from_labeled_dataset(test_set)

    return train_set, dev_triplets, test_triplets
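# Hedged usage sketch for the trec_dataset variant above: download (or reuse) the
# TREC files, split off a dev set, and report split sizes. Assumes network access
# and that triplets_from_labeled_dataset is defined in the same module.
train_set, dev_triplets, test_triplets = trec_dataset()
print("train: %d, dev triplets: %d, test triplets: %d" %
      (len(train_set), len(dev_triplets), len(test_triplets)))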
def read_dataset(train_data_path):
    data = csv.reader(open(os.path.join(train_data_path), encoding="utf-8"),
                      delimiter="\t",
                      quoting=csv.QUOTE_NONE)
    label_map = {}
    train_set = []
    guid = 0
    for line in data:
        text, label = line
        if label not in label_map:
            label_map[label] = len(label_map)

        label_id = label_map[label]
        guid += 1
        train_set.append(InputExample(guid=guid, texts=[text], label=label_id))
    return train_set
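# Illustrative sketch of the tab-separated "text<TAB>label" layout read_dataset
# expects; the file written here is invented for demonstration purposes.
with open("toy_labels.tsv", "w", encoding="utf-8") as f:
    f.write("what is the capital of france\tLOC\n"
            "who wrote hamlet\tHUM\n")
toy_train_set = read_dataset("toy_labels.tsv")
print([(ex.texts[0], ex.label) for ex in toy_train_set])  # label ids assigned in order of first appearance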
def get_examples(self, max_examples: int = None):
    """Get a set of examples as required by SentencesDataset.

    :param max_examples: number of samples to return, defaults to None
    :return: list of InputExample objects
    """
    if max_examples is None:
        max_examples = self.df.shape[0]
    s1 = self.df["sentence1"].iloc[:max_examples].values
    s2 = self.df["sentence2"].iloc[:max_examples].values
    labels = self.df["label"].astype(int).iloc[:max_examples].values
    examples = []
    for guid_id, (sentence_a, sentence_b, label) in enumerate(zip(s1, s2, labels)):
        examples.append(
            InputExample(guid=guid_id,
                         texts=[sentence_a, sentence_b],
                         label=label))
    return examples
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        # You can specify any huggingface/transformers pre-trained model here,
        # for example, bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")

    train_samples = []
    val_samples = []
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)
    # import ipdb; ipdb.set_trace()

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))
    # import ipdb; ipdb.set_trace()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
logging.info(
    "Step 1: Train cross-encoder: ({}) with STSbenchmark".format(model_name))

gold_samples = []
dev_samples = []
test_samples = []

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1

        if row['split'] == 'dev':
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
        elif row['split'] == 'test':
            test_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
        else:
            # As we want to get symmetric scores, i.e. CrossEncoder(A,B) = CrossEncoder(B,A),
            # we pass both combinations to the train set
            gold_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
            gold_samples.append(
                InputExample(texts=[row['sentence2'], row['sentence1']],
                             label=score))

# We wrap gold_samples (which is a List[InputExample]) into a pytorch DataLoader
train_dataloader = DataLoader(gold_samples,
    url='https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip',
    path=zip_save_path)

with ZipFile(zip_save_path, 'r') as zip:
    zip.extractall(dataset_path)

######### Read train data ##########
train_samples_MultipleNegativesRankingLoss = []
train_samples_ConstrativeLoss = []

with open(os.path.join(dataset_path, "classification/train_pairs.tsv"),
          encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        train_samples_ConstrativeLoss.append(
            InputExample(texts=[row['question1'], row['question2']],
                         label=int(row['is_duplicate'])))
        if row['is_duplicate'] == '1':
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question1'], row['question2']], label=1))
            # if A is a duplicate of B, then B is a duplicate of A
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question2'], row['question1']], label=1))

# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(
    train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
    train_dataset_MultipleNegativesRankingLoss,
    shuffle=True,
ranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Base loss
from sentence_transformers import SentencesDataset, losses
from sentence_transformers.readers import InputExample

examples = []
for topic in topics:
    gold = qrel[topic["number"]].items()
    query = topic["title"].strip()
    for item in gold:
        try:
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except:
            continue
print("finished", len(examples))

#%%
from torch.utils.data import DataLoader

train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
train_loss = losses.OnlineContrastiveLoss(model=ranker)
ranker.fit(train_dataloader=train_dl,
           epochs=20,
           output_path="ranker/constrastive_loss/",
           save_best_model=True)
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']],
                                   label=score)

        if row['split'] == 'dev':
            dev_samples.append(inp_example)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(