def load(path):
    """
    Loads an embeddings model and articles database.

    Args:
        path: model path, if None uses default path

    Returns:
        (embeddings, db handle)
    """

    # Fall back to the default model location when no path is given
    path = path if path else Models.modelPath()

    dbfile = os.path.join(path, "articles.sqlite")

    # Guard clause: a stored model must have a config file present
    if not os.path.isfile(os.path.join(path, "config")):
        print("ERROR: loading model: ensure model is present")
        raise FileNotFoundError("Unable to load model from %s" % path)

    print("Loading model from %s" % path)

    # Restore the embeddings index from disk
    embeddings = Embeddings()
    embeddings.load(path)

    # Connect to database file
    db = sqlite3.connect(dbfile)

    return (embeddings, db)
def load():
    """
    Loads the codequestion embeddings model and questions database.

    Returns:
        (embeddings, db handle)
    """

    path = Models.modelPath("stackexchange")
    dbfile = os.path.join(path, "questions.db")

    # Guard clause: refuse to continue when the model config is missing
    if not os.path.isfile(os.path.join(path, "config")):
        print("ERROR: loading model: ensure model is installed")
        print(
            "ERROR: Pre-trained model can be installed by running python -m codequestion.download"
        )
        raise FileNotFoundError(
            "Unable to load codequestion model from %s" % path)

    print("Loading model from %s" % path)

    # Restore the embeddings index from disk
    embeddings = Embeddings()
    embeddings.load(path)

    # Connect to database file
    db = sqlite3.connect(dbfile)

    return (embeddings, db)
def testWords(self):
    """
    Test embeddings backed by word vectors
    """

    # Initialize model path
    path = os.path.join(tempfile.gettempdir(), "model")
    os.makedirs(path, exist_ok=True)

    # Build tokens file. delete=False is required so WordVectors.build can
    # reopen the file by name after this block closes it.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
        tokens = output.name
        for x in self.data:
            output.write(x + "\n")

    try:
        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors, if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)
    finally:
        # Fix: the delete=False tokens file was previously never removed,
        # leaking a temp file on every run
        os.remove(tokens)

    # Create dataset
    data = [(x, row, None) for x, row in enumerate(self.data)]

    # Create embeddings model, backed by word vectors
    embeddings = Embeddings({
        "path": vectors + ".magnitude",
        "storevectors": True,
        "scoring": "bm25",
        "pca": 3,
        "quantize": True
    })

    # Call scoring and index methods
    embeddings.score(data)
    embeddings.index(data)

    # Test search
    self.assertIsNotNone(embeddings.search("win", 1))

    # Generate temp file path
    index = os.path.join(tempfile.gettempdir(), "wembeddings")

    # Test save/load
    embeddings.save(index)
    embeddings.load(index)

    # Test search
    self.assertIsNotNone(embeddings.search("win", 1))
def load():
    """
    Loads a questions database and pre-trained embeddings model

    Returns:
        (db, embeddings)
    """

    print("Loading model")

    path = Models.modelPath("stackexchange")

    # Open the questions database stored alongside the model
    db = sqlite3.connect(os.path.join(path, "questions.db"))

    # Restore the pre-trained embeddings index from the same directory
    embeddings = Embeddings()
    embeddings.load(path)

    return db, embeddings
index_name = 'index'

with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections.
# doc_dict maps uid -> tab-split fields, index_text holds (uid, text, tags).
doc_dict = {}
index_text = []
for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]

    # Long sections are tokenized before indexing
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)

    index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
    embeddings.save(index_name)
elif mode == 'search':
    print("--searching-- %d documents" % (len(index_text)))
    embeddings.load(index_name)

    for query in ("the milestones for our seed round",
                  "what is possible today",
                  "My philosophy has always been don't solve the human",
                  "story about Larry",
                  "biological memory",
                  "short-term memory",
                  "memory blocks and memory stack",
                  "the company where i programmed robots",
                  "nothing to do with us"):
        print(query)

        # Fix: run the search ONCE per query instead of re-running it for
        # each of the 3 result rows; also tolerates < 3 results instead of
        # raising IndexError.
        # search result format: (uid, score)
        for uid, score in embeddings.search(query, 3):
            print("%-20s %s" % (query, doc_dict[uid]))
if args.create_embed_index:
    # Create index: uniqueterms is the list of terms to embed
    embeddings = Embeddings({"method": "transformers",
                             "path": sentence_transformer_path.__str__(),
                             "quantize": True})
    embeddings.index([(uid, text, None) for uid, text in enumerate(uniqueterms)])
    embeddings.save("embedding_index")
else:
    # Load index
    embeddings = Embeddings()

    # HACK: port an embedding_index created on another machine with a
    # different directory structure by rewriting the stored model path.
    # NOTE(review): pickle.load on a shared index file executes arbitrary
    # code if the file is untrusted — acceptable only for local artifacts.
    with open("%s/config" % "embedding_index", "rb") as handle:
        config = pickle.load(handle)

    config["path"] = sentence_transformer_path.__str__()

    with open("%s/config" % "embedding_index", "wb") as handle:
        # Fix: pickle.dump returns None — don't rebind config to it
        pickle.dump(config, handle)

    embeddings.load("embedding_index")

### load predictions on CORD-19 abstracts to create KB
kb = pd.read_csv(predictions_path,
                 usecols=["doc_id", "sentence", "span1", "span2",
                          "relation_tag", "conf", "span1_lemma", "span2_lemma"],
                 sep="\t")
kb.dropna(inplace=True)


def _normalize(column):
    """Strip punctuation, collapse whitespace and drop leading/pure-numeric tokens."""
    # Fix: pass regex=True explicitly — pandas 2.0 changed the str.replace
    # default to regex=False, which would silently treat these patterns as
    # literal text. Raw strings avoid invalid-escape warnings.
    return (column.str.replace(r'[^\w\s]', '', regex=True)
                  .str.replace(r'\s\s+', ' ', regex=True)
                  .str.strip()
                  .str.replace(r'^(\d+\s ?)*|(^[0-9]+)', '', regex=True)
                  .str.replace(r'^[0-9]+$', '', regex=True))


# String cleanups
kb['norm_span1'] = _normalize(kb['span1'])
kb['norm_span2'] = _normalize(kb['span2'])
kb = kb[~((kb.norm_span1 == "") | (kb.norm_span2 == ""))]

# Collect positional indices of rows whose conf value is not parseable as float.
# Fix: catch only the errors float() actually raises instead of a bare except.
badi = []
for i, v in enumerate(kb["conf"]):
    try:
        float(v)
    except (TypeError, ValueError):
        badi.append(i)
class TestEmbeddings(unittest.TestCase):
    """
    Embeddings tests
    """

    def setUp(self):
        """
        Initialize test data.
        """

        # News-style sections shared by every test case
        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day"
        ]

        # Embeddings model backed by sentence-transformers & transformers
        self.embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })

    def testIndex(self):
        """
        Test embeddings.index
        """

        # Index each section as an (id, text, tags) tuple
        documents = [(i, section, None) for i, section in enumerate(self.data)]
        self.embeddings.index(documents)

        # Top match for the query should be the lottery story
        best = self.embeddings.search("feel good story", 1)[0][0]
        self.assertEqual(self.data[best], self.data[4])

    def testSave(self):
        """
        Test embeddings.save
        """

        # Index each section as an (id, text, tags) tuple
        documents = [(i, section, None) for i, section in enumerate(self.data)]
        self.embeddings.index(documents)

        # Round-trip the index through a temp directory
        index = os.path.join(tempfile.gettempdir(), "embeddings")
        self.embeddings.save(index)
        self.embeddings.load(index)

        # Search must still return the lottery story after the reload
        best = self.embeddings.search("feel good story", 1)[0][0]
        self.assertEqual(self.data[best], self.data[4])

    def testSimilarity(self):
        """
        Test embeddings.similarity
        """

        # Highest similarity score should belong to the lottery story
        scores = self.embeddings.similarity("feel good story", self.data)
        best = np.argmax(scores)
        self.assertEqual(self.data[best], self.data[4])

    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Model directory under the system temp path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Write one section per line to a tokens file for vector training.
        # delete=False keeps the file available for WordVectors.build below.
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            output.write("\n".join(self.data) + "\n")

        # Word vectors output path
        vectors = os.path.join(path, "test-300d")

        # Train word vectors if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)

        # Dataset of (id, text, tags) tuples
        dataset = [(i, section, None) for i, section in enumerate(self.data)]

        # Embeddings model backed by the magnitude word vectors
        model = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Score the dataset, then index it
        model.score(dataset)
        model.index(dataset)

        # Search should return a result
        self.assertIsNotNone(model.search("win", 1))