def load(): """ Loads an embeddings model and questions.db database. Returns: (embeddings, db handle) """ path = Models.modelPath("stackexchange") dbfile = os.path.join(path, "questions.db") if os.path.isfile(os.path.join(path, "config")): print("Loading model from %s" % path) embeddings = Embeddings() embeddings.load(path) else: print("ERROR: loading model: ensure model is installed") print( "ERROR: Pre-trained model can be installed by running python -m codequestion.download" ) raise FileNotFoundError( "Unable to load codequestion model from %s" % path) # Connect to database file db = sqlite3.connect(dbfile) return (embeddings, db)
def is_most_relevant(query, sources, threshold=0.2, use_api=False):
    if use_api:
        r = requests.post(url + ':8080/is_most_relevant',
                          data=json.dumps({'query': query, 'sources': sources}))
        return r.json()

    embeddings = Embeddings({
        "method": "transformers",
        "path": "sentence-transformers/bert-base-nli-mean-tokens"
    })

    sections = sources
    similarities = embeddings.similarity(query, sections)

    # Pair each source with its similarity score and sort best first
    result = zip(sources, similarities)
    result = sorted(result, key=lambda x: x[1], reverse=True)
    print(result)

    # Keep only sources that clear the threshold
    result = [x for x in result if x[1] > threshold]
    if result == []:
        return {'result': result}

    # Return the single best matching source
    result = [result[0][0]]
    response = {'result': result}
    return response
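# Illustrative usage of is_most_relevant above, run in local (non-API) mode.
# This is a sketch only: the query and sources are made-up sample strings, not
# data from this project.
if __name__ == "__main__":
    sample_sources = [
        "The weather is sunny today",
        "Stock markets closed higher on Friday",
        "Rain is expected over the weekend"
    ]

    # Prints {'result': [<best matching source>]} or {'result': []} when
    # nothing clears the similarity threshold
    print(is_most_relevant("weather forecast", sample_sources, threshold=0.2))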
def load(path):
    """
    Loads an embeddings model and db database.

    Args:
        path: model path, if None uses default path

    Returns:
        (embeddings, db handle)
    """

    # Default path if not provided
    if not path:
        path = Models.modelPath()

    dbfile = os.path.join(path, "articles.sqlite")

    if os.path.isfile(os.path.join(path, "config")):
        print("Loading model from %s" % path)
        embeddings = Embeddings()
        embeddings.load(path)
    else:
        print("ERROR: loading model: ensure model is present")
        raise FileNotFoundError("Unable to load model from %s" % path)

    # Connect to database file
    db = sqlite3.connect(dbfile)

    return (embeddings, db)
def preloop(self):
    # Create embeddings model, backed by sentence-transformers & transformers
    self.embeddings = Embeddings({"method": "transformers",
                                  "path": "sentence-transformers/bert-base-nli-mean-tokens"})

    self.data = [
        "US tops 5 million confirmed virus cases",
        "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
        "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
        "The National Park Service warns against sacrificing slower friends in a bear attack",
        "Maine man wins $1M from $25 lottery ticket",
        "Make huge profits without work, earn up to $100,000 a day",
    ]
def build(self, components):
    """
    Builds a workflow using components.

    Args:
        components: list of components to add to workflow
    """

    # Clear application
    self.__init__()

    # pylint: disable=W0108
    tasks = []
    for component in components:
        wtype = component.pop("type")
        self.components[wtype] = component

        if wtype == "summary":
            self.pipelines[wtype] = Summary(component.pop("path"))
            tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))

        elif wtype == "segment":
            self.pipelines[wtype] = Segmentation(**self.components["segment"])
            tasks.append(Task(self.pipelines["segment"]))

        elif wtype == "textract":
            self.pipelines[wtype] = Textractor(**self.components["textract"])
            tasks.append(UrlTask(self.pipelines["textract"]))

        elif wtype == "transcribe":
            self.pipelines[wtype] = Transcription(component.pop("path"))
            tasks.append(UrlTask(self.pipelines["transcribe"], r".\.wav$"))

        elif wtype == "translate":
            self.pipelines[wtype] = Translation()
            tasks.append(Task(lambda x: self.pipelines["translate"](x, **self.components["translate"])))

        elif wtype == "embeddings":
            self.embeddings = Embeddings({"method": "transformers", **component})
            self.documents = Documents()
            tasks.append(Task(self.documents.add, unpack=False))

    self.workflow = Workflow(tasks)
def testComplexWorkflow(self):
    """
    Tests a complex workflow
    """

    textractor = Textractor(paragraphs=True, minlength=150, join=True)
    summary = Summary("sshleifer/distilbart-xsum-12-1")

    embeddings = Embeddings({
        "method": "transformers",
        "path": "sentence-transformers/bert-base-nli-mean-tokens"
    })
    documents = Documents()

    def index(x):
        documents.add(x)
        return x

    # Extract text and summarize articles
    articles = Workflow([FileTask(textractor), Task(lambda x: summary(x, maxlength=15))])

    # Complex workflow that extracts text, runs summarization then loads into an embeddings index
    tasks = [WorkflowTask(articles, r".\.pdf$"), Task(index, unpack=False)]

    data = [
        "file://" + Utils.PATH + "/article.pdf",
        "Workflows can process audio files, documents and snippets"
    ]

    # Convert file paths to data tuples
    data = [(x, element, None) for x, element in enumerate(data)]

    # Execute workflow, discard results as they are streamed
    workflow = Workflow(tasks)
    for _ in workflow(data):
        pass

    # Build the embeddings index
    embeddings.index(documents)

    # Cleanup temporary storage
    documents.close()

    # Run search and validate result
    index, _ = embeddings.search("search text", 1)[0]
    self.assertEqual(index, 0)
class Shell(Cmd):
    """
    Query shell.
    """

    def __init__(self):
        super().__init__()

        self.intro = "query shell"
        self.prompt = "(search) "

        self.embeddings = None
        self.data = None

    def preloop(self):
        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({"method": "transformers",
                                      "path": "sentence-transformers/bert-base-nli-mean-tokens"})

        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day",
        ]

    def default(self, line):
        # Get index of the section that best matches the query
        uid = self.embeddings.similarity(line, self.data)[0][0]

        print(self.data[uid])
        print()
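# Sketch of an entry point for the Shell class above; the main() wrapper is an
# assumption, but cmdloop() is the standard cmd.Cmd read-eval loop.
def main():
    Shell().cmdloop()


if __name__ == "__main__":
    main()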
def setUpClass(cls):
    """
    Create single extractor instance.
    """

    sections = [
        "Giants hit 3 HRs to down Dodgers",
        "Giants 5 Dodgers 4 final",
        "Dodgers drop Game 2 against the Giants, 5-4",
        "Blue Jays 2 Red Sox 1 final",
        "Red Sox lost to the Blue Jays, 2-1",
        "Blue Jays at Red Sox is over. Score: 2-1",
        "Phillies win over the Braves, 5-0",
        "Phillies 5 Braves 0 final",
        "Final: Braves lose to the Phillies in the series opener, 5-0",
        "Final score: Flyers 4 Lightning 1",
        "Flyers 4 Lightning 1 final",
        "Flyers win 4-1"
    ]

    # Add unique id to each section to assist with qa extraction
    cls.sections = [(uid, section) for uid, section in enumerate(sections)]

    # Create embeddings model, backed by sentence-transformers & transformers
    cls.embeddings = Embeddings({
        "method": "transformers",
        "path": "sentence-transformers/bert-base-nli-mean-tokens"
    })

    # Create extractor instance
    cls.extractor = Extractor(cls.embeddings, "distilbert-base-cased-distilled-squad")
def build(directory):
    """
    Builds an image embeddings index.

    Args:
        directory: directory with images

    Returns:
        Embeddings index
    """

    embeddings = Embeddings({
        "method": "transformers",
        "path": "clip-ViT-B-32",
        "modelhub": False
    })
    embeddings.index(images(directory))

    return embeddings
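# Hedged example of querying the image index returned by build above. The
# directory and query string are placeholders; the ids returned by search map
# back to whatever ids the images(directory) generator emitted during indexing.
index = build("images")
for uid, score in index.search("a photo of a dog", 5):
    print(uid, score)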
def load():
    """
    Loads a questions database and pre-trained embeddings model

    Returns:
        (db, embeddings)
    """

    print("Loading model")

    path = Models.modelPath("stackexchange")
    dbfile = os.path.join(path, "questions.db")

    # Connect to database file
    db = sqlite3.connect(dbfile)

    # Load embeddings model
    embeddings = Embeddings()
    embeddings.load(path)

    return db, embeddings
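# Minimal follow-up sketch: run a search against the embeddings index returned
# by load above. The query string is illustrative only; results are (id, score)
# tuples that would be joined back to rows in questions.db.
db, embeddings = load()
for uid, score in embeddings.search("read json file python", 5):
    print(uid, score)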
def embeddings(dbfile):
    """
    Builds a sentence embeddings index.

    Args:
        dbfile: input SQLite file

    Returns:
        embeddings index
    """

    embeddings = Embeddings({
        "path": Models.vectorPath("stackexchange-300d.magnitude"),
        "storevectors": True,
        "scoring": "bm25",
        "pca": 3,
        "quantize": True
    })

    # Build scoring index if scoring method provided
    if embeddings.config.get("scoring"):
        embeddings.score(Index.stream(dbfile))

    # Build embeddings index
    embeddings.index(Index.stream(dbfile))

    return embeddings
def embeddings(dbfile, vectors, maxsize):
    """
    Builds a sentence embeddings index.

    Args:
        dbfile: input SQLite file
        vectors: vector path
        maxsize: maximum number of documents to process

    Returns:
        embeddings index
    """

    embeddings = Embeddings({"path": vectors, "scoring": "bm25", "pca": 3, "quantize": True})

    # Build scoring index if scoring method provided
    if embeddings.config.get("scoring"):
        embeddings.score(Index.stream(dbfile, maxsize))

    # Build embeddings index
    embeddings.index(Index.stream(dbfile, maxsize))

    return embeddings
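# Possible driver for the embeddings function above; it reuses paths referenced
# elsewhere in this codebase, and the 1,000,000 document cap is an arbitrary
# example value, not a project default.
index = embeddings("questions.db", Models.vectorPath("stackexchange-300d.magnitude"), 1000000)
index.save(Models.modelPath("stackexchange"))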
def testQA(self):
    """
    Test qa extraction
    """

    # Create embeddings model, backed by sentence-transformers & transformers
    embeddings = Embeddings({
        "method": "transformers",
        "path": "sentence-transformers/bert-base-nli-mean-tokens"
    })

    # Create extractor instance
    extractor = Extractor(embeddings, "distilbert-base-cased-distilled-squad")

    sections = [
        "Giants hit 3 HRs to down Dodgers",
        "Giants 5 Dodgers 4 final",
        "Dodgers drop Game 2 against the Giants, 5-4",
        "Blue Jays 2 Red Sox 1 final",
        "Red Sox lost to the Blue Jays, 2-1",
        "Blue Jays at Red Sox is over. Score: 2-1",
        "Phillies win over the Braves, 5-0",
        "Phillies 5 Braves 0 final",
        "Final: Braves lose to the Phillies in the series opener, 5-0",
        "Final score: Flyers 4 Lightning 1",
        "Flyers 4 Lightning 1 final",
        "Flyers win 4-1"
    ]

    # Add unique id to each section to assist with qa extraction
    sections = [(uid, section) for uid, section in enumerate(sections)]

    questions = ["What team won the game?", "What was score?"]

    execute = lambda query: extractor(sections, [(question, query, question, False) for question in questions])

    answers = execute("Red Sox - Blue Jays")
    self.assertEqual("Blue Jays", answers[0][1])
    self.assertEqual("2-1", answers[1][1])

    # Ad-hoc questions
    question = "What hockey team won?"

    answers = extractor(sections, [(question, question, question, False)])
    self.assertEqual("Flyers", answers[0][1])
def is_relevant_document(query, documents, threshold=0.2, use_api=False):
    if use_api:
        r = requests.post(url + ':8080/is_relevant_document',
                          data=json.dumps({'query': query, 'documents': documents}))
        return r.json()

    embeddings = Embeddings({
        "method": "transformers",
        "path": "sentence-transformers/bert-base-nli-mean-tokens"
    })

    # Split each document into sentences, remembering the source document index
    sources_dictionary = {}
    for i in range(len(documents)):
        document = documents[i]
        document = document.replace('!', '.')
        sentences = document.split('.')
        for sentence in sentences:
            sources_dictionary[sentence] = i

    sources = list(sources_dictionary.keys())
    sources = prune(sources)

    # Score sentences against the query and keep those above the threshold
    is_relevant_response = is_relevant(query, sources, threshold=threshold, use_api=False)
    is_relevant_result = is_relevant_response['result']
    print(is_relevant_result)

    # Map matching sentences back to their parent documents
    indices_result = [sources_dictionary[source] for source in is_relevant_result]
    indices_result = np.unique(indices_result)
    print(indices_result)

    result = [documents[index] for index in indices_result]
    response = {'result': result}
    return response
def testWords(self):
    """
    Test embeddings backed by word vectors
    """

    # Initialize model path
    path = os.path.join(tempfile.gettempdir(), "model")
    os.makedirs(path, exist_ok=True)

    # Build tokens file
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
        tokens = output.name
        for x in self.data:
            output.write(x + "\n")

    # Word vectors path
    vectors = os.path.join(path, "test-300d")

    # Build word vectors, if they don't already exist
    WordVectors.build(tokens, 300, 1, vectors)

    # Create dataset
    data = [(x, row, None) for x, row in enumerate(self.data)]

    # Create embeddings model, backed by word vectors
    embeddings = Embeddings({
        "path": vectors + ".magnitude",
        "storevectors": True,
        "scoring": "bm25",
        "pca": 3,
        "quantize": True
    })

    # Call scoring and index methods
    embeddings.score(data)
    embeddings.index(data)

    # Test search
    self.assertIsNotNone(embeddings.search("win", 1))
def train(vector, score):
    """
    Trains an Embeddings model on STS dev + train data.

    Args:
        vector: word vector model path
        score: scoring method (bm25, sif, tfidf or None for averaging)

    Returns:
        trained Embeddings model
    """

    print("Building model")
    embeddings = Embeddings({
        "path": Models.vectorPath(vector),
        "scoring": score,
        "pca": 3
    })

    rows1 = STS.read(Models.testPath("stsbenchmark", "sts-dev.csv"))
    rows2 = STS.read(Models.testPath("stsbenchmark", "sts-train.csv"))
    rows = rows1 + rows2

    documents = []
    for row in rows:
        tokens = Tokenizer.tokenize(row[2] + " " + row[3])
        if tokens:
            documents.append((row[0], tokens, None))
        else:
            print("Skipping all stop word string: ", row)

    # Build scoring index if scoring method provided
    if embeddings.config.get("scoring"):
        embeddings.score(documents)

    # Build embeddings index
    embeddings.index(documents)

    return embeddings
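# Example invocation of train above. The vector file name mirrors the magnitude
# vectors referenced elsewhere in this codebase and is only a placeholder; bm25
# is one of the scoring methods listed in the docstring.
model = train("stackexchange-300d.magnitude", "bm25")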
def embeddings(index, database):
    """
    Builds an embeddings index.

    Args:
        index: index configuration
        database: database handle with content to index
    """

    # Create embeddings model, backed by sentence-transformers & transformers
    embeddings = Embeddings(index["embeddings"])

    database.execute("SELECT Id, Title FROM articles")

    # Create an index for the list of articles
    articles = [(uid, text, None) for uid, text in database.cur.fetchall()]
    embeddings.index(articles)

    logging.info("Built embedding index over %d stored articles", len(articles))

    # Save index
    embeddings.save(index["path"])
def embeddings(dbfile, vectors, maxsize):
    """
    Builds a sentence embeddings index.

    Args:
        dbfile: input SQLite file
        vectors: path to vectors file or configuration
        maxsize: maximum number of documents to process

    Returns:
        embeddings index
    """

    # Read config and create Embeddings instance
    embeddings = Embeddings(Index.config(vectors))

    # Build scoring index if scoring method provided
    if embeddings.config.get("scoring"):
        embeddings.score(Index.stream(dbfile, maxsize))

    # Build embeddings index
    embeddings.index(Index.stream(dbfile, maxsize))

    return embeddings
class Application:
    """
    Streamlit application.
    """

    def __init__(self):
        """
        Creates a new Streamlit application.
        """

        # Component options
        self.components = {}

        # Defined pipelines
        self.pipelines = {}

        # Current workflow
        self.workflow = []

        # Embeddings index params
        self.embeddings = None
        self.documents = None
        self.data = None

    def number(self, label):
        """
        Extracts a number from a text input field.

        Args:
            label: label to use for text input field

        Returns:
            numeric input
        """

        value = st.sidebar.text_input(label)
        return int(value) if value else None

    def options(self, component):
        """
        Extracts component settings into a component configuration dict.

        Args:
            component: component type

        Returns:
            dict with component settings
        """

        options = {"type": component}
        st.sidebar.markdown("---")

        if component == "summary":
            st.sidebar.markdown("**Summary** \n*Abstractive text summarization*")
            options["path"] = st.sidebar.text_input("Model", value="sshleifer/distilbart-cnn-12-6")
            options["minlength"] = self.number("Min length")
            options["maxlength"] = self.number("Max length")

        elif component in ("segment", "textract"):
            if component == "segment":
                st.sidebar.markdown("**Segment** \n*Split text into semantic units*")
            else:
                st.sidebar.markdown("**Textractor** \n*Extract text from documents*")

            options["sentences"] = st.sidebar.checkbox("Split sentences")
            options["lines"] = st.sidebar.checkbox("Split lines")
            options["paragraphs"] = st.sidebar.checkbox("Split paragraphs")
            options["join"] = st.sidebar.checkbox("Join tokenized")
            options["minlength"] = self.number("Min section length")

        elif component == "transcribe":
            st.sidebar.markdown("**Transcribe** \n*Transcribe audio to text*")
            options["path"] = st.sidebar.text_input("Model", value="facebook/wav2vec2-base-960h")

        elif component == "translate":
            st.sidebar.markdown("**Translate** \n*Machine translation*")
            options["target"] = st.sidebar.text_input("Target language code", value="en")

        elif component == "embeddings":
            st.sidebar.markdown("**Embeddings Index** \n*Index workflow output*")
            options["path"] = st.sidebar.text_area("Embeddings model path", value="sentence-transformers/bert-base-nli-mean-tokens")

        return options

    def build(self, components):
        """
        Builds a workflow using components.

        Args:
            components: list of components to add to workflow
        """

        # Clear application
        self.__init__()

        # pylint: disable=W0108
        tasks = []
        for component in components:
            wtype = component.pop("type")
            self.components[wtype] = component

            if wtype == "summary":
                self.pipelines[wtype] = Summary(component.pop("path"))
                tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))

            elif wtype == "segment":
                self.pipelines[wtype] = Segmentation(**self.components["segment"])
                tasks.append(Task(self.pipelines["segment"]))

            elif wtype == "textract":
                self.pipelines[wtype] = Textractor(**self.components["textract"])
                tasks.append(FileTask(self.pipelines["textract"]))

            elif wtype == "transcribe":
                self.pipelines[wtype] = Transcription(component.pop("path"))
                tasks.append(FileTask(self.pipelines["transcribe"], r".\.wav$"))

            elif wtype == "translate":
                self.pipelines[wtype] = Translation()
                tasks.append(Task(lambda x: self.pipelines["translate"](x, **self.components["translate"])))

            elif wtype == "embeddings":
                self.embeddings = Embeddings({"method": "transformers", **component})
                self.documents = Documents()
                tasks.append(Task(self.documents.add, unpack=False))

        self.workflow = Workflow(tasks)

    def process(self, data):
        """
        Processes the current application action.
        Args:
            data: input data
        """

        if data and self.workflow:
            # Build tuples for embedding index
            if self.documents:
                data = [(x, element, None) for x, element in enumerate(data)]

            # Process workflow
            for result in self.workflow(data):
                if not self.documents:
                    st.write(result)

            # Build embeddings index
            if self.documents:
                # Cache data
                self.data = [x[1] for x in self.documents]

                with st.spinner("Building embedding index...."):
                    self.embeddings.index(self.documents)
                    self.documents.close()

                # Clear workflow
                self.documents, self.pipelines, self.workflow = None, None, None

        if self.embeddings and self.data:
            # Set query and limit
            query = st.text_input("Query")
            limit = min(5, len(self.data))

            st.markdown(
                """
                <style>
                table td:nth-child(1) {
                    display: none
                }
                table th:nth-child(1) {
                    display: none
                }
                table {text-align: left !important}
                </style>
                """,
                unsafe_allow_html=True,
            )

            if query:
                df = pd.DataFrame([{"content": self.data[uid], "score": score} for uid, score in self.embeddings.search(query, limit)])
                st.table(df)

    def run(self):
        """
        Runs Streamlit application.
        """

        st.sidebar.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
        st.sidebar.markdown("# Workflow builder \n*Build and apply workflows to data*")

        # Get selected components
        selected = st.sidebar.multiselect("Select components", ["embeddings", "segment", "summary", "textract", "transcribe", "translate"])

        # Get selected options
        components = [self.options(component) for component in selected]
        st.sidebar.markdown("---")

        # Build or re-build workflow when build button clicked
        build = st.sidebar.button("Build")
        if build:
            with st.spinner("Building workflow...."):
                self.build(components)

        with st.beta_expander("Data", expanded=not self.data):
            data = st.text_area("Input", height=10)

        # Parse text items
        data = [x for x in data.split("\n") if x] if "file://" in data else [data]

        # Process current action
        self.process(data)
from txtai.embeddings import Embeddings
from txtai.tokenizer import Tokenizer

import numpy as np
import sys

# Two candidate sentence-transformers models; the second assignment replaces
# the first, so bert-base-nli-stsb-mean-tokens is the model actually used
embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/roberta-base-nli-stsb-mean-tokens'})
embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/bert-base-nli-stsb-mean-tokens'})

input_file = sys.argv[1]
mode = sys.argv[2]
index_name = 'index'

# Read input sections, one tab-delimited record per line
with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections
doc_dict = {}
index_text = []
for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]

    # Tokenize longer texts before indexing
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)

    index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
from txtai.embeddings import Embeddings

import numpy as np

embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/roberta-base-nli-stsb-mean-tokens'})

# Sections to index (the same sample headlines used elsewhere in this codebase)
sections = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day",
]

# Create an index for the list of sections
embeddings.index([(uid, text, None) for uid, text in enumerate(sections)])

print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

# Run an embeddings search for each query
for query in ("feel good story", "climate change", "health", "war", "wildlife", "asia", "north america", "dishonest junk"):
    # Extract uid of first result
    # search result format: (uid, score)
    uid = embeddings.search(query, 1)[0][0]

    # Print section
    print("%-20s %s" % (query, sections[uid]))
help="which evaluation task") args = parser.parse_args() sentence_transformer_path = Path(args.sent_trans_path) embeddings_index_path = Path(args.embeddings_index_path) predictions_path = Path(args.predictions_path) if args.create_embed_index: #create index #uniqueindex is your list of terms embeddings = Embeddings({"method": "transformers", "path": sentence_transformer_path.__str__(),"quantize":True}) embeddings.index([(uid, text, None) for uid, text in enumerate(uniqueterms)]) embeddings.save("embedding_index") else: #load index embeddings = Embeddings() #hack to port an embedding_index created on another machine with other dir structure with open("%s/config" % "embedding_index", "rb") as handle: config = pickle.load(handle) config["path"] = sentence_transformer_path.__str__() with open("%s/config" % "embedding_index", "wb") as handle: config = pickle.dump(config,handle) embeddings.load("embedding_index") ### load predictions on CORD-19 abstracts to create KB kb = pd.read_csv(predictions_path,usecols=["doc_id","sentence","span1","span2","relation_tag","conf","span1_lemma","span2_lemma"],sep="\t")
class TestEmbeddings(unittest.TestCase):
    """
    Embeddings tests
    """

    def setUp(self):
        """
        Initialize test data.
        """

        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day"
        ]

        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })

    def testIndex(self):
        """
        Test embeddings.index
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSave(self):
        """
        Test embeddings.save
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "embeddings")

        self.embeddings.save(index)
        self.embeddings.load(index)

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSimilarity(self):
        """
        Test embeddings.similarity
        """

        # Get best matching id
        uid = np.argmax(self.embeddings.similarity("feel good story", self.data))

        self.assertEqual(self.data[uid], self.data[4])

    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors, if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
from flask import Flask, render_template, redirect, url_for, request
from flask import jsonify

from txtai.embeddings import Embeddings

# Create embeddings model, backed by sentence-transformers & transformers
embeddings = Embeddings({
    "method": "transformers",
    "path": "sentence-transformers/bert-base-nli-mean-tokens"
})

app = Flask(__name__)

from conversions import rules


def getcode(response):
    # Return the code (last element) of the rule containing this response
    for i in rules:
        if response in i:
            return i[-1]


# Flatten all rule patterns into a list of searchable sections
sections = []
for i in rules:
    for t in i[:-1]:
        sections.append(t)


def bot_response(t):
    userinput = t