It was trained on the Natural Questions dataset, a dataset with real questions from Google Search
together with annotated data from Wikipedia providing the answer. For the passages, we encode the
Wikipedia article title together with the individual text passages.

Google Colab Example: https://colab.research.google.com/drive/11GunvCqJuebfeTlgbJWkIMT0xJH6PWF1?usp=sharing
"""
import json
from sentence_transformers import SentenceTransformer, util
import time
import gzip
import os
import torch

# We use the bi-encoder to encode all passages, so that we can use it with semantic search
model_name = 'nq-distilbert-base-v1'
bi_encoder = SentenceTransformer(model_name)
top_k = 5  # Number of passages we want to retrieve with the bi-encoder

# As dataset, we use Simple English Wikipedia. Compared to the full English Wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder
wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
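        # The original snippet is cut off at this point; the lines below are a minimal sketch of
        # how the example typically continues. It assumes each JSON line carries 'title' and
        # 'paragraphs' fields and reuses the bi_encoder / passages / top_k names defined above.
        for paragraph in data['paragraphs']:
            # Encode the Wikipedia article title together with the passage text, as described above
            passages.append([data['title'], paragraph])

# Encode all [title, passage] pairs once, then answer questions with semantic search
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

query = "When was Alan Turing born?"  # hypothetical example question
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]
for hit in hits:
    print(round(hit['score'], 3), passages[hit['corpus_id']])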
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
dev_files.append(arg) else: train_files.append(arg) if not train_files: print("Please pass at least some train files") print("python make_multilingual_sys.py file1.tsv.gz file2.tsv.gz --dev dev1.tsv.gz dev2.tsv.gz") exit() logger.info("Train files: {}".format(", ".join(train_files))) logger.info("Dev files: {}".format(", ".join(dev_files))) ######## Start the extension of the teacher model to multiple languages ######## logger.info("Load teacher model") teacher_model = SentenceTransformer(teacher_model_name) logger.info("Create student model from scratch") word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length) # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) ###### Read Parallel Sentences Dataset ###### train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True) for train_file in train_files: train_data.load_data(train_file, max_sentences=max_sentences_per_trainfile, max_sentence_length=train_max_sentence_length) train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
print(text_df)

!pip install -U sentence-transformers

"""## BERT Sentence Transformers Semantic Search"""
"""
This is a simple application for sentence embeddings: semantic search.
Given a query sentence, this finds the most similar sentences in the corpus.
For each query, the script outputs the top 5 most similar publications in the corpus.
*Used open source code to aid in development
"""
from sentence_transformers import SentenceTransformer
import scipy.spatial
import pickle as pkl

embedder = SentenceTransformer('bert-base-nli-mean-tokens')
sentences = list(text_df['Text Processed'])

# Example query sentences
queries = ['How to evolve architecture for constellations and simulation',
           'Build behavior of complex aerospace and modeling of safety']
query_embeddings = embedder.encode(queries, show_progress_bar=True)
text_embeddings = embedder.encode(sentences, show_progress_bar=True)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 5
print("\nTop 5 most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], text_embeddings, "cosine")[0]
    results = zip(range(len(distances)), distances)
import spacy, random

nlp = spacy.load('en_core_web_lg')

from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Load Movielines & Conversations
movie_lines = {}
for line in open("./cornell movie-dialogs corpus/movie_lines.txt", encoding="latin1"):
    line = line.strip()
    parts = line.split(" +++$+++ ")
    if len(parts) == 5:
        movie_lines[parts[0]] = parts[4]
    else:
        movie_lines[parts[0]] = ""

import json

responses = {}
for line in open("./cornell movie-dialogs corpus/movie_conversations.txt", encoding="latin1"):
    line = line.strip()
    parts = line.split(" +++$+++ ")
    line_ids = json.loads(parts[3].replace("'", '"'))
    for first, second in zip(line_ids[:-1], line_ids[1:]):
        responses[first] = second

import numpy as np


def sentence_mean(nlp, s):
# fastext_raw = fasttext_embedding(fastText_model, data.text.to_list(), dictionary, tfidf_model, False)
fasttext_tfidf = fasttext_embedding(fastText_model, data.text.to_list(), dictionary, tfidf_model, True)
# data['fasttext_raw'] = fastext_raw
data['fasttext_tfidf'] = fasttext_tfidf

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
x = mlb.fit_transform([tuple(int(x) for x in i.split(',')) for i in data.label.to_list()])
data['new_label'] = list(x)
len(data.new_label[0])

### BERT
modelBert = SentenceTransformer('monologg/bert-base-cased-goemotions-original')


def bert_embedding(sentences):
    sentence_embeddings = modelBert.encode(sentences)
    return sentence_embeddings


bert_features = bert_embedding(data.text.to_list())

fabeec = []
for i in range(len(bert_features)):
    fabeec.append(bert_features[i].tolist() + fasttext_tfidf[i].tolist())

#imbalance
def __init__(self, nlp):
    spacy_name = nlp.meta['name']
    model_name = util.name_spacy_to_sentencebert(spacy_name)
    self.model = SentenceTransformer(model_name)
def main(): a = get_args() prev_enc = 0 def train(i): loss = 0 noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None img_out = image_f(noise) micro = None if a.in_txt2 is None else False imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=micro) out_enc = model_clip.encode_image(imgs_sliced[-1]) if a.diverse != 0: imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, norm_in, a.overscan, micro=micro) out_enc2 = model_clip.encode_image(imgs_sliced[-1]) loss += a.diverse * torch.cosine_similarity( out_enc, out_enc2, dim=-1).mean() del out_enc2 torch.cuda.empty_cache() if a.in_img is not None and os.path.isfile(a.in_img): # input image loss += sign * 0.5 * torch.cosine_similarity( img_enc, out_enc, dim=-1).mean() if a.in_txt is not None: # input text loss += sign * torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean() if a.in_txt0 is not None: # subtract text loss += -sign * torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean() if a.sync > 0 and a.in_img is not None and os.path.isfile( a.in_img): # image composition loss -= a.sync * ssim_loss( F.interpolate(img_out, ssim_size).float(), img_in) if a.in_txt2 is not None: # input text for micro details imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=True) out_enc2 = model_clip.encode_image(imgs_sliced[-1]) loss += sign * torch.cosine_similarity(txt_enc2, out_enc2, dim=-1).mean() del out_enc2 torch.cuda.empty_cache() if a.expand > 0: global prev_enc if i > 0: loss += a.expand * torch.cosine_similarity( out_enc, prev_enc, dim=-1).mean() prev_enc = out_enc.detach() del img_out, imgs_sliced, out_enc torch.cuda.empty_cache() assert not isinstance(loss, int), ' Loss not defined, check the inputs' if a.prog is True: lr_cur = lr0 + (i / a.steps) * (lr1 - lr0) for g in optimizer.param_groups: g['lr'] = lr_cur optimizer.zero_grad() loss.backward() optimizer.step() if i % a.fstep == 0: with torch.no_grad(): img = image_f(contrast=a.contrast).cpu().numpy()[0] checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose) pbar.upd() # Load CLIP models model_clip, _ = clip.load(a.model) if a.verbose is True: print(' using model', a.model) xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33} if 'RN' in a.model: a.samples = int(a.samples * xmem[a.model]) if a.multilang is True: model_lang = SentenceTransformer( 'clip-ViT-B-32-multilingual-v1').cuda() def enc_text(txt): if a.multilang is True: emb = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False) else: emb = model_clip.encode_text(clip.tokenize(txt).cuda()) return emb.detach().clone() if a.diverse != 0: a.samples = int(a.samples * 0.5) norm_in = torchvision.transforms.Normalize( (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) out_name = [] if a.in_img is not None and os.path.isfile(a.in_img): if a.verbose is True: print(' ref image:', basename(a.in_img)) img_in = torch.from_numpy( img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda() img_in = img_in[:, :3, :, :] # fix rgb channels in_sliced = slice_imgs([img_in], a.samples, a.modsize, transform=norm_in, overscan=a.overscan)[0] img_enc = model_clip.encode_image(in_sliced).detach().clone() if a.sync > 0: ssim_loss = ssim.SSIM(window_size=11) ssim_size = [s // 8 for s in a.size] img_in = F.interpolate(img_in, ssim_size).float() else: del img_in del in_sliced torch.cuda.empty_cache() out_name.append(basename(a.in_img).replace(' ', '_')) if a.in_txt is not 
None: if a.verbose is True: print(' ref text: ', basename(a.in_txt)) if a.translate: translator = Translator() a.in_txt = translator.translate(a.in_txt, dest='en').text if a.verbose is True: print(' translated to:', a.in_txt) txt_enc = enc_text(a.in_txt) out_name.append(txt_clean(a.in_txt)) if a.in_txt2 is not None: if a.verbose is True: print(' micro text:', basename(a.in_txt2)) a.samples = int(a.samples * 0.75) if a.translate: translator = Translator() a.in_txt2 = translator.translate(a.in_txt2, dest='en').text if a.verbose is True: print(' translated to:', a.in_txt2) txt_enc2 = enc_text(a.in_txt2) out_name.append(txt_clean(a.in_txt2)) if a.in_txt0 is not None: if a.verbose is True: print(' subtract text:', basename(a.in_txt0)) a.samples = int(a.samples * 0.75) if a.translate: translator = Translator() a.in_txt0 = translator.translate(a.in_txt0, dest='en').text if a.verbose is True: print(' translated to:', a.in_txt0) txt_enc0 = enc_text(a.in_txt0) out_name.append('off-' + txt_clean(a.in_txt0)) if a.multilang is True: del model_lang params, image_f = fft_image([1, 3, *a.size], resume=a.resume) image_f = to_valid_rgb(image_f) if a.prog is True: lr1 = a.lrate * 2 lr0 = lr1 * 0.01 else: lr0 = a.lrate optimizer = torch.optim.Adam(params, lr0) sign = 1. if a.invert is True else -1. if a.verbose is True: print(' samples:', a.samples) out_name = '-'.join(out_name) out_name += '-%s' % a.model if 'RN' in a.model.upper() else '' tempdir = os.path.join(a.out_dir, out_name) os.makedirs(tempdir, exist_ok=True) pbar = ProgressBar(a.steps // a.fstep) for i in range(a.steps): train(i) os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(a.out_dir, out_name))) shutil.copy( img_list(tempdir)[-1], os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps))) if a.save_pt is True: torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
title_long.append(title[i]) url_long.append(url[i]) data = { 'article_index': article_long, 'title': title_long, 'snippet': sentence_long, 'summary': summary_long, 'url': url_long } return data content = pd.read_csv('final_content.csv') print("this will take some time if u are running it for many articles") content = summarizer(content) print("running the sentence splitting") snippet_content = pd.DataFrame( sentence_creator(content), columns=['article_index', 'title', 'snippet', 'summary', 'url']) snippet_content.to_excel('snippet_content.xlsx', index=False) sentence = [] for i in range(len(snippet_content)): sentence.append(snippet_content['snippet'][i]) model = SentenceTransformer('model') sentence_content_embeddings = model.encode(sentence) with open('sentence_split_encoder_content', 'wb') as f: pickle.dump(sentence_content_embeddings, f)
# Beginning to calculate features include BERT and TF-IDF; this process can be a bit of bottleneck # TODO: Consider writing these variables to a file to "pre-compute" them if experiments are taking awhile print(" ") class_report.write(" \n") print("===============") class_report.write("===============\n") print("Fitting Features: ") class_report.write("Fitting Features: \n") print(" ") class_report.write('\n') bert_dimension = 0 if Features == "All" or Features == "BERT": # Create BERT Features and add to data frame print('Fitting BERT Features') class_report.write('Fitting BERT Features') model = SentenceTransformer('bert-base-nli-mean-tokens') sentences = df['Sentence'].tolist() sentence_embeddings = model.encode(sentences) encoded_values = pd.DataFrame(np.row_stack(sentence_embeddings)) FeatureNames = [] bert_dimension = encoded_values.shape[1] for x in range(0, bert_dimension): FeatureNames.append("BERT_" + str(x)) training_corpus = encoded_values.head(dataset) test_corpus = encoded_values.tail((df['Set'] == 1).sum()) tf_dimension = 0 if Features == "All" or Features == "TF": # Create TF-IDF Features and add to data frame
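# A possible way to address the "pre-compute" TODO near the top of this snippet (a hedged sketch,
# not part of the original script): cache the encoded BERT matrix to disk on the first run and
# reload it afterwards, reusing the `model` and `sentences` variables defined above. The cache
# file name is hypothetical.
import os
import numpy as np

cache_path = "bert_sentence_embeddings.npy"
if os.path.exists(cache_path):
    sentence_embeddings = np.load(cache_path)
else:
    sentence_embeddings = model.encode(sentences)
    np.save(cache_path, np.asarray(sentence_embeddings))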
def bert_main(): # Tweet data using a Vectorizer tweets = importTweets(True) tweetsk = list(tweets.keys()) tweetsl = list(tweets.values()) # Query data using a Vectorizer queries = importQuery(True) queriesl = list(queries.values()) # Embedding the Tweet data words using a bert model. bert_model = SentenceTransformer('bert-base-nli-mean-tokens') print("-" * 70) print("Embedding the tweet strings with the Bert token model...") tweet_embeddings = bert_model.encode(tweetsl, batch_size=500, show_progress_bar=True) # Embedding the Query data words using a bert model. print("-" * 70) print("Embedding the Query strings with the Bert token model...") query_embeddings = bert_model.encode(queriesl, batch_size=500, show_progress_bar=True) # Calculation the Cosine Similarity for the embedded words. print("-" * 70) print( "Calculating the Cosine Similarity for the Bert embedded Tweets and Queries..." ) Rankings = {} for q in range(0, len(queriesl)): # Dictionary to sort the Cosine Similarity of each document per query. docCurrentQuery = {} for t in range(0, len(tweetsl)): docCurrentQuery[tweetsk[t]] = 1 - spatial.distance.cosine( tweet_embeddings[t], query_embeddings[q]) # Sorting the document in descending order of the Cosine Similarity per query. docCurrentQuery = dict( sorted(docCurrentQuery.items(), key=lambda item: item[1], reverse=True)) # Creating a new dictionary of only the Top 1000 documents for each query. doc_counter = 1 docCurrentQuery_1000 = {} for key, value in docCurrentQuery.items(): if (doc_counter <= 1000): docCurrentQuery_1000[key] = value doc_counter += 1 else: break Rankings[q + 1] = docCurrentQuery_1000 print("-" * 70) print("Creating a results file with all the required details...") # Creating a txt file with the results resultFileCreation(Rankings, True) print("-" * 70) print("Results file is created (visit the dist folder)") print("-" * 70)
class SimiSearch: def __init__(self): self.es = Elasticsearch(maxsize=1000) self.bc = SentenceTransformer('distiluse-base-multilingual-cased') def findSimQuestions(self, q: str, topk: int, minScore=0.5): """ Find similar questions based on cosine similarity to a question q and return top k results Params: q: question that needs searching for similar questions topk: nb of top results returned """ embedding_start = time.time() query_vector = self.bc.encode([q]) query_vector = query_vector[0].tolist() embedding_time = time.time() - embedding_start script_query = { "script_score": { "query": { "multi_match": { "query": q, "type": "bool_prefix", "fields": ["text", "text._2gram", "text._3gram"] } }, "script": { "source": "1+cosineSimilarity(params.query_vector, 'vectorisation')", "params": { "query_vector": query_vector } }, "min_score": minScore + 1 } } #print('encoding time: {}'.format(embedding_time)) search_start = time.time() response = self.es.search(index='qa', body={ "size": topk, "query": script_query, "_source": ['id', 'text', 'rep'] }) search_time = time.time() - search_start #print('search time: {}'.format(search_time)) res = [] reps = [] for r in response['hits']['hits'][:topk]: if r['_source']['rep'] not in reps: reps.append(r['_source']['rep']) res.append({ 'id': r['_source']['id'], 'text': r['_source']['text'], 'score': r['_score'], 'rep': r['_source']['rep'] }) return res
def fit(self, token_lists): """ Generate topic model, dictionary, corpus from token lists :param token_lists: list of document tokens """ try: # create Gensim dictionary & corpus for validation dictionary = Dictionary(token_lists) corpus = [dictionary.doc2bow(text) for text in token_lists] if self.method == "BERT": model = SentenceTransformer(self.pre_trained_name) # convert list of document tokens to list of sentences sentences = [Utility.to_sentence(token_list) for token_list in token_lists] # generate BERT sentence embeddings embeddings = model.encode(sentences, show_progress_bar=True) # reduce dimensionality of all embeddings using umap model umap_model = umap.UMAP( n_neighbors=self.n_neighbors, n_components=self.n_components, min_dist=self.min_dist, metric=self.umap_metric,random_state=self.random_state ).fit(embeddings) umap_embeddings = umap_model.transform(embeddings) # cluster documents using HDBSCAN cluster_model = hdbscan.HDBSCAN( min_cluster_size=self.min_cluster_size, metric=self.cluster_metric, cluster_selection_method=self.cluster_selection_method, prediction_data=self.prediction_data ).fit(umap_embeddings) # get cluster labels labels = cluster_model.labels_ # generate label_docs dataframe label_docs_df = self.get_label_docs_df(sentences, labels) # calculate word importance per topic tf_idf, cv = self.c_tf_idf(label_docs_df.doc.values, m=len(sentences)) self.k = len(np.unique(labels)) self.labels = labels self.dictionary = dictionary self.corpus = corpus self.sentences = sentences self.token_lists = token_lists self.cluster_model = cluster_model self.umap_model = umap_model self.embeddings = embeddings self.umap_embeddings = umap_embeddings self.cv = cv self.tf_idf = tf_idf self.feature_names = cv.get_feature_names() else: raise Exception('method not exist') except Exception: logging.error("exception occured", exc_info=True)
string, convert_to_tensor=True).unsqueeze(0).cpu() output.append(features[string]) return torch.cat(output).to(device), features if __name__ == '__main__': device = 'cuda' if os.path.isdir('/media/palm/BiggerData/dictionaries/'): root_data = '/media/palm/BiggerData/dictionaries/' elif os.path.isdir('/home/palm/PycharmProjects/cp/cp10-work/'): root_data = '/home/palm/PycharmProjects/cp' elif os.path.isdir('/home/palm/PycharmProjects/nlp/cp10-work'): root_data = '/home/palm/PycharmProjects/nlp/' else: raise ValueError('Well, something\'s wrong here') eng_sm = SentenceTransformer(os.path.join(root_data, 'cp10-work')) eng_sm.requires_grad_(False) eng_sm.train(False) embeddings = copy.deepcopy( eng_sm._first_module().auto_model.embeddings).to(device) embeddings.requires_grad_(True) embeddings.train(True) dataset = SentenceTokenized(eng_sm.tokenizer, 'first', language='eng', true_only=True) model = AEPretrainedEmbedding(dataset.vocab_size, embeddings) model.to(device)
import os
from pprint import pprint
from flask import Flask, render_template, jsonify, request
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import codecs, json
import numpy as np
#import gluonnlp as nlp
#import mxnet as mx

SEARCH_SIZE = 10
INDEX_NAME = os.environ['INDEX_NAME']
model = "roberta-base-nli-stsb-mean-tokens"
embedder = SentenceTransformer(model)
#model, vocab = nlp.model.get_model('roberta_12_768_12', dataset_name='openwebtext_ccnews_stories_books_cased', use_decoder=False);
#tokenizer = nlp.data.GPT2BPETokenizer();

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/search')
def analyzer():
    client = Elasticsearch('elasticsearch:9200')
class SemanticEngine: def __init__(self, text_df: pd.DataFrame) -> None: """ Args: text_df (pd.DataFrame): pandas dataframe with fields: ts, text """ self.model = SentenceTransformer("paraphrase-distilroberta-base-v1") self.text_df = text_df.to_numpy() self.embeddings = None def load_embeddings(self, path) -> None: """ load embeddings from pickle file """ with open(path, "rb") as file: self.embeddings = pickle.load(file) def save_embeddings(self, path) -> None: """ save embeddings to pickle file """ with open(path, "wb") as file: pickle.dump(self.embeddings, file) def calc_embeddings(self, corpus: List[str]): """ calculate new embeddings """ if len(corpus) == 0: raise ValueError("corpus is empty") corpus_embeddings = self.model.encode(corpus, convert_to_tensor=True, show_progress_bar=False) self.embeddings = corpus_embeddings def get_top_k(self, query: str, k=5) -> List[Dict]: r"""Get k most similar to query sentences You need to call load_embeddings or calc_embeddings first to use this method Args: query (str): text for which you want to find similar sentences k (int, optional): number of sentences to find. Defaults to 5. Returns: List[Dict[float, str, float]]: List with dictionaries of the following structure: { ts: timestamp of message, score: cosin similarity score text: message text } Example 1: calculate embeddings, save them and get top 5 sentences :: >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t") >>> engine = SemanticEngine(text_df=df) >>> engine.calc_embeddings(df.text.tolist()) >>> engine.save_embeddings("data/embeddings/edu_courses.pkl") >>> query = "посоветуйте каких-нибудь курсов по pytorch" >>> result = engine.get_top_k(query, k=5) >>> for res in result: ... print(res["ts"], res["text"], res["score"], sep="\n") Example 2: load embeddings from file, and get top 5 sentences >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t") >>> engine = SemanticEngine(text_df=df) >>> engine.load_embeddings("data/embeddings/edu_courses.pkl") >>> query = "посоветуйте каких-нибудь курсов по pytorch" >>> result = engine.get_top_k(query, k=5) >>> for res in result: ... print(res["ts"], res["text"], res["score"], sep="\n") """ if self.embeddings is None: raise ValueError( "embeddings are not initialized. Call `load_embeddings` or `calc_embeddings` first" ) if k > len(self.embeddings): warnings.warn(f"""`k` with value of {k} is bigger then number of sentences with value of {len(self.embeddings)}. Value of k is set to {len(self.embeddings)} """) k = len(self.embeddings) query_embedding = self.model.encode([query], convert_to_tensor=True, show_progress_bar=False) hits = util.semantic_search(query_embedding, self.embeddings, top_k=k) hits = hits[0] result = [{ "ts": str(self.text_df[hit["corpus_id"]][0]), "score": str(hit["score"]), "text": self.text_df[hit["corpus_id"]][1], } for hit in hits] return result
def load_my_model():
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    return model
type=str, help="path where pretrained part was stored") parser.add_argument("-p", "--prefix", required=True, type=str, help="prefix to output files") parser.add_argument("-n", "--n", default=16, type=int, help="number of predictors used") args = parser.parse_args() bert = SentenceTransformer('bert-base-nli-mean-tokens') bert_size = 768 loaded_model = HiddenLabelPredictorModel(bert, bert_size, args.n) loaded_model.load_state_dict(torch.load(args.model)) descriptions = [] description_embeddings = {} UIs = [] UI_embedding = [] screen_names = [] trace_to_index = {} i = 0 for package_dir in os.listdir(args.dataset):
from sentence_transformers import SentenceTransformer import scipy.spatial import numpy as np import PrepareData embedder = SentenceTransformer( 'output/training_stsbenchmark_bert-base-uncased-2020-06-19_19-51-26') corpus = PrepareData.load_data() # Corpus with example sentences corpus_embeddings = [] for document in corpus: sentences_embeddings = embedder.encode(document) sentences_embeddings = np.array(sentences_embeddings) document_embedding = np.mean(sentences_embeddings, axis=0) corpus_embeddings.append(document_embedding) # Query sentences: # #similarity_matrix = [] #for first_doc in corpus_embeddings: # similarity_vector = [] # for second_doc in corpus_embeddings: # similarity_vector.append(1 - scipy.spatial.distance.cosine(first_doc, second_doc)) # similarity_matrix.append(similarity_vector) # #similarity_matrix = np.array(similarity_matrix) #print(similarity_matrix) # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
# print("f1 score:",f1_score(y_dev, pred, average='weighted')) # print("acc:",accuracy_score(y_dev, pred)) print(classification_report(y_dev, pred)) clf = BernoulliNB() clf.fit(X_train, y_train) pred = clf.predict(X_dev) print("TFIDF vectorization + NB:") # print("f1 score:",f1_score(y_dev, pred, average='weighted')) # print("acc:",accuracy_score(y_dev, pred)) print(classification_report(y_dev, pred)) ''' multilingual BERT model loading and embedding generation ''' test = pd.read_csv('../Dravidian-CodeMix/tamil_test.csv') model = SentenceTransformer('distiluse-base-multilingual-cased', device='cuda:1') X_train = model.encode(X_train_ori, batch_size=20, show_progress_bar=True) X_dev = model.encode(X_dev_ori, batch_size=20, show_progress_bar=True) X_test = model.encode(X_test_ori, batch_size=20, show_progress_bar=True) clf = MLPClassifier(hidden_layer_sizes=(512, ), max_iter=25) clf.fit(X_train, y_train) pred = clf.predict(X_dev) print("BERT + MLP:") # print("f1 score:",f1_score(y_dev, pred, average='weighted')) # print("acc:",accuracy_score(y_dev, pred)) print(classification_report(y_dev, pred)) ''' Loading Tamil specific pretrained fastText model ''' from pymagnitude import *
class Recommender(): def __init__(self, db_path, pretrained_model='stsb-roberta-large', no_cuda=True): self.device = "cuda" if torch.cuda.is_available( ) and not no_cuda else "cpu" # self.device = 'cpu' self.db_path = db_path self.pretrained_model = pretrained_model self.load_model() self.load_db() def load_model(self): """ Load the SentenceTransformer model base ond :return: """ print(f"SentenceTransformer for model {self.pretrained_model}") self.model = SentenceTransformer(self.pretrained_model, device=self.device) def load_db(self): self.conn = sqlite3.connect(self.db_path) df_chapter = pd.read_sql('select * from chapter', self.conn) df_chapter.sort_values(['chapter_number'], inplace=True) df_section = pd.read_sql('select * from section', self.conn) df_section.sort_values(['chapter_number', 'section_number'], inplace=True) df_text = pd.read_sql('select * from text', self.conn) df_text.sort_values(['chapter_number', 'section_number', 'id'], inplace=True) df_text = pd.merge(df_text, df_chapter.drop(['id'], axis=1), how='left', on='chapter_number') self.df_text = pd.merge(df_text, df_section.drop(['id'], axis=1), how='left', on=['chapter_number', 'section_number']) print(f"DB data text table loaded with shape {self.df_text.shape}") # predict method from run_pplm_discrim_train.py def match(self, input_text, source_tradition, top_labels): print(f"Finding closest passage for {input_text}") # get the candidate sources candidate_text = self.df_text[ self.df_text['chapter_name'].isin(top_labels) & self.df_text['source_tradition'].isin( source_tradition)]['source_text'].tolist() embedding1 = self.model.encode(input_text, convert_to_tensor=True) embedding2 = self.model.encode(candidate_text, convert_to_tensor=True) # compute similarity scores of two embeddings cosine_scores = util.pytorch_cos_sim(embedding1, embedding2) max_match_text = '' max_sim = 0.0 for i in range(len(input_text)): for j in range(len(candidate_text)): sim = cosine_scores[i][j].item() if sim > max_sim: max_match_text = candidate_text[j] max_sim = sim # print(f"New best match: {sim}: {max_match_text}") source = self.df_text[self.df_text['source_text'] == max_match_text]['source_location'].item() return f"{source_tradition[0]}, {source}: {max_match_text}"
from pathlib import Path
from sentence_transformers import SentenceTransformer
import scipy

DFILE = "spanish"
DTYPE = "test"

print("Running for %s / %s" % (DTYPE, DFILE))

print("Loading BERT model")
model = SentenceTransformer('/data/wordembeddings/BERT/bert-large-nli-stsb-mean-tokens')

print("Begin annotation")
with Path("my/sentences/%s_%s_sentences.txt" % (DTYPE, DFILE)).open('r') as reader:
    lines = (line.strip().split('\t') for line in reader)
    sentences = dict((int(number), sentence) for number, sentence in lines)

lnum = 0
with Path("my/sentences/%s_%s_pairs_dist.txt" % (DTYPE, DFILE)).open('w') as writer:
    with Path("my/sentences/%s_%s_pairs.txt" % (DTYPE, DFILE)).open('r') as reader:
        lines = (line.strip().split('\t') for line in reader)
        for n1, n2 in lines:
            lnum += 1
            print("Line %d" % (lnum))
            n1 = int(n1)
            n2 = int(n2)
def generate_embeddings(docs, batch_size, model_name='bert-base-cased', pooling='mean', offset=0): """ Generator function for generating embeddings from strings using a flair model. Takes a list of sentences and returns a list tuple. The first element represents failure (0) or success (1 or 2) and the second element contains a list of embeddings as numpy arrays if successful, and the indices of the failed batch if unsuccessful. The first element is 1, if batch_size embeddings were created :param docs: a list of strings for which embeddings should be created :param batch_size: integer representing how many embeddings should be created at once :param model_name: the model for creating the embeddings. Defaults to document embeddings using BERT-Base :param pooling: the pooling strategy to generate Document Embeddings :param offset: the offset of the integers, for printing out the correct index :return: a tuple (success/failure, embeddings/failed_indices) """ rest = len(docs) % batch_size model = False if pooling == 'mean': embedding = TransformerWordEmbeddings(model_name, layers='-1', allow_long_sentences=True) model = DocumentPoolEmbeddings([embedding], fine_tune_mode='none') elif pooling == 'CLS': model = TransformerDocumentEmbeddings(model_name) if model: for i in range(0, len(docs) - rest, batch_size): sentences = [ Sentence(sentence) for sentence in docs[i:i + batch_size] ] try: model.embed(sentences) print( f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}' ) yield 1, [ sentence.get_embedding().detach().cpu().numpy() for sentence in sentences ] except RuntimeError: print( f'could not embed sentences with index {offset + i} ' f'to {offset + i + batch_size-1}\nstoring in failed index list' ) yield 0, (offset + i, offset + i + batch_size - 1) if rest: sentences = [Sentence(sentence) for sentence in docs[-rest:]] try: model.embed(sentences) print( f'successfully embedded sentences from {len(docs) + offset - rest} to the end' ) yield 1, [ sentence.get_embedding().detach().cpu().numpy() for sentence in sentences ] except RuntimeError: yield 0, (len(docs) - rest, 0) elif pooling == 'SentenceBert': model = SentenceTransformer(model_name) for i in range(0, len(docs) - rest, batch_size): try: embeddings = model.encode(docs[i:i + batch_size]) print( f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}' ) yield 1, embeddings except RuntimeError: print( f'could not embed sentences with index {offset + i} ' f'to {offset + i + batch_size-1}\nstoring in failed index list' ) yield 0, (offset + i, offset + i + batch_size - 1) if rest: try: embeddings = model.encode(docs[-rest:]) print( f'successfully embedded sentences from {len(docs) + offset - rest} to the end' ) yield 1, embeddings except RuntimeError: yield 0, (len(docs) - rest, 0) else: raise Exception("No Valid model")
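# A hedged usage sketch (not from the original file) showing how the (status, payload) protocol
# described in the docstring above could be consumed; `docs` is a hypothetical list of strings and
# the model name is only an illustrative choice.
docs = ["first document", "second document", "third document"]
all_embeddings, failed_batches = [], []
for status, payload in generate_embeddings(docs, batch_size=2, pooling='SentenceBert',
                                           model_name='bert-base-nli-mean-tokens'):
    if status:
        # success: payload is a list/array of embeddings for one batch
        all_embeddings.extend(payload)
    else:
        # failure: payload is the (start, end) index range of the batch that could not be embedded
        failed_batches.append(payload)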
train_batch_size = 16 num_epochs = 4 model_save_path = 'output/training_stsbenchmark_' + model_name.replace( "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings word_embedding_model = models.Transformer(model_name) # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) # Convert the dataset to a DataLoader ready for training logging.info("Read STSbenchmark train dataset") train_samples = [] dev_samples = [] test_samples = [] with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) for row in reader: score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score) if row['split'] == 'dev':
#### Just some code to print debug information to stdout logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased' set_seed(args) # Read the dataset train_batch_size = args.batch_size model_save_path = args.model_path model = SentenceTransformer(model_save_path) folder = '../datasets/temp-sts/STS-data' #'STS2012-gold','STS2013-gold','STS2014-gold','STS2015-gold', names = [ 'STS2012-gold', 'STS2013-gold', 'STS2014-gold', 'STS2015-gold', 'STS2016-gold', 'SICK-data' ] for name in names: sts_reader = STSDataReader(os.path.join(folder, name)) test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples( sts_reader.get_examples('all.tsv'), batch_size=train_batch_size, name=name + '-test')
def __init__(self, model_name: str):
    self.encoder_model = SentenceTransformer(model_name)
    self.classifier = None
def __init__(self, knowledge_index):
    self.knowledge_vecs = knowledge_index["knowledge_vecs"]
    self.model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
    self.threshold = 0.35
sorted_df["Topic"] = sorted_df["Topic"].apply(str) sorted_df["That"] = sorted_df["That"].apply(str) sorted_df["Template"] = sorted_df["Template"].apply(str) # Sort by topic sorted_df = sorted_df.sort_values(by=['Topic']) # print(sorted_df.info()) # print(sorted_df.head()) # print(sorted_df['Topic'].value_counts()) train, test = train_test_split(sorted_df, stratify=sorted_df['Topic']) test, val = train_test_split(test, stratify=test['Topic']) print("Getting the bert-base-nli-mean-tokens model.") model = SentenceTransformer("bert-base-nli-mean-tokens") print("Read AIML QA dataset") train_dataloader = DataLoader(train, shuffle=True, batch_size=train_batch_size) print("Calculate loss") train_loss = losses.CosineSimilarityLoss(model=model) print("Create evaluator") evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val) # Train the model warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up print("training the model...") model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=num_epochs, evaluation_steps=1000,
def __init__( self, document_store: BaseDocumentStore, embedding_model: str, use_gpu: bool = True, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, ): """ :param document_store: An instance of DocumentStore from which to retrieve documents. :param embedding_model: Local path or name of model in Hugging Face's model hub such as ``'deepset/sentence_bert'`` :param use_gpu: Whether to use gpu or not :param model_format: Name of framework that was used for saving the model. Options: - ``'farm'`` - ``'transformers'`` - ``'sentence_transformers'`` :param pooling_strategy: Strategy for combining the embeddings from the model (for farm / transformers models only). Options: - ``'cls_token'`` (sentence vector) - ``'reduce_mean'`` (sentence vector) - ``'reduce_max'`` (sentence vector) - ``'per_token'`` (individual token vectors) :param emb_extraction_layer: Number of layer from which the embeddings shall be extracted (for farm / transformers models only). Default: -1 (very last layer). """ self.document_store = document_store self.model_format = model_format self.pooling_strategy = pooling_strategy self.emb_extraction_layer = emb_extraction_layer logger.info(f"Init retriever using embeddings of model {embedding_model}") if model_format == "farm" or model_format == "transformers": self.embedding_model = Inferencer.load( embedding_model, task_type="embeddings", extraction_strategy=self.pooling_strategy, extraction_layer=self.emb_extraction_layer, gpu=use_gpu, batch_size=4, max_seq_len=512, num_processes=0 ) # Check that document_store has the right similarity function similarity = document_store.similarity # If we are using a sentence transformer model if "sentence" in embedding_model.lower() and similarity != "cosine": logger.warning(f"You seem to be using a Sentence Transformer with the {similarity} function. " f"We recommend using cosine instead. " f"This can be set when initializing the DocumentStore") elif "dpr" in embedding_model.lower() and similarity != "dot_product": logger.warning(f"You seem to be using a DPR model with the {similarity} function. " f"We recommend using dot_product instead. " f"This can be set when initializing the DocumentStore") elif model_format == "sentence_transformers": try: from sentence_transformers import SentenceTransformer except ImportError: raise ImportError("Can't find package `sentence-transformers` \n" "You can install it via `pip install sentence-transformers` \n" "For details see https://github.com/UKPLab/sentence-transformers ") # pretrained embedding models coming from: https://github.com/UKPLab/sentence-transformers#pretrained-models # e.g. 'roberta-base-nli-stsb-mean-tokens' if use_gpu: device = "cuda" else: device = "cpu" self.embedding_model = SentenceTransformer(embedding_model, device=device) if document_store.similarity != "cosine": logger.warning( f"You are using a Sentence Transformer with the {document_store.similarity} function. " f"We recommend using cosine instead. " f"This can be set when initializing the DocumentStore") else: raise NotImplementedError