def embeddings(dbfile, vectors, maxsize):
    """
    Builds a sentence embeddings index.

    Args:
        dbfile: input SQLite file
        vectors: vector path
        maxsize: maximum number of documents to process

    Returns:
        embeddings index
    """

    # Embeddings model backed by the supplied vector path, with BM25 scoring,
    # 3-component PCA removal and quantization enabled
    model = Embeddings({"path": vectors, "scoring": "bm25", "pca": 3, "quantize": True})

    # Train the scoring model first when a scoring method is configured
    if model.config.get("scoring"):
        model.score(Index.stream(dbfile, maxsize))

    # Stream documents from the database a second time to build the index
    model.index(Index.stream(dbfile, maxsize))

    return model
def embeddings(dbfile):
    """
    Builds a sentence embeddings index.

    Args:
        dbfile: input SQLite file

    Returns:
        embeddings index
    """

    # Configure an embeddings model over the stackexchange word vectors,
    # storing vectors with the index, scoring via BM25, removing 3 principal
    # components and quantizing the stored vectors
    config = {
        "path": Models.vectorPath("stackexchange-300d.magnitude"),
        "storevectors": True,
        "scoring": "bm25",
        "pca": 3,
        "quantize": True
    }
    model = Embeddings(config)

    # Train the scoring model first when a scoring method is configured
    if model.config.get("scoring"):
        model.score(Index.stream(dbfile))

    # Stream the database a second time to build the embeddings index
    model.index(Index.stream(dbfile))

    return model
def testWords(self):
    """
    Test embeddings backed by word vectors.

    Builds a small word vector model from self.data, indexes the same data
    with scoring + PCA + quantization, then verifies search works before and
    after a save/load round trip.
    """

    # Initialize model path
    path = os.path.join(tempfile.gettempdir(), "model")
    os.makedirs(path, exist_ok=True)

    # Build tokens file (delete=False so the path survives the with-block
    # for WordVectors.build below)
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
        tokens = output.name
        for x in self.data:
            output.write(x + "\n")

    # Word vectors path
    vectors = os.path.join(path, "test-300d")

    try:
        # Build word vectors, if they don't already exist. The original code
        # rebuilt unconditionally, contradicting this stated intent.
        if not os.path.isfile(vectors + ".magnitude"):
            WordVectors.build(tokens, 300, 1, vectors)
    finally:
        # Remove temporary tokens file - delete=False leaves it behind otherwise
        os.remove(tokens)

    # Create dataset
    data = [(x, row, None) for x, row in enumerate(self.data)]

    # Create embeddings model, backed by word vectors
    embeddings = Embeddings({
        "path": vectors + ".magnitude",
        "storevectors": True,
        "scoring": "bm25",
        "pca": 3,
        "quantize": True
    })

    # Call scoring and index methods
    embeddings.score(data)
    embeddings.index(data)

    # Test search
    self.assertIsNotNone(embeddings.search("win", 1))

    # Generate temp file path
    index = os.path.join(tempfile.gettempdir(), "wembeddings")

    # Test save/load
    embeddings.save(index)
    embeddings.load(index)

    # Test search still works after reload
    self.assertIsNotNone(embeddings.search("win", 1))
def train(vector, score):
    """
    Trains an Embeddings model on STS dev + train data.

    Args:
        vector: word vector model path
        score: scoring method (bm25, sif, tfidf or None for averaging)

    Returns:
        trained Embeddings model
    """

    print("Building model")

    model = Embeddings({"path": Models.vectorPath(vector), "scoring": score, "pca": 3})

    # Combine dev and train rows into a single dataset
    rows = STS.read(Models.testPath("stsbenchmark", "sts-dev.csv")) + \
           STS.read(Models.testPath("stsbenchmark", "sts-train.csv"))

    # Tokenize both sentences of each pair; skip rows that tokenize to nothing
    documents = []
    for row in rows:
        tokens = Tokenizer.tokenize(row[2] + " " + row[3])
        if not tokens:
            print("Skipping all stop word string: ", row)
            continue
        documents.append((row[0], tokens, None))

    # Train the scoring model first when a scoring method is configured
    if model.config.get("scoring"):
        model.score(documents)

    # Build embeddings index
    model.index(documents)

    return model
def embeddings(dbfile, vectors, maxsize):
    """
    Builds a sentence embeddings index.

    Args:
        dbfile: input SQLite file
        vectors: path to vectors file or configuration
        maxsize: maximum number of documents to process

    Returns:
        embeddings index
    """

    # Resolve the vectors argument into an embeddings configuration
    model = Embeddings(Index.config(vectors))

    # Train the scoring model first when a scoring method is configured
    if model.config.get("scoring"):
        model.score(Index.stream(dbfile, maxsize))

    # Stream documents from the database a second time to build the index
    model.index(Index.stream(dbfile, maxsize))

    return model