def testWords(self):
    """
    Test embeddings backed by word vectors
    """

    # Initialize model path
    path = os.path.join(tempfile.gettempdir(), "model")
    os.makedirs(path, exist_ok=True)

    # Build tokens file
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
        tokens = output.name
        for x in self.data:
            output.write(x + "\n")

    # Word vectors path
    vectors = os.path.join(path, "test-300d")

    # Build word vectors, if they don't already exist
    WordVectors.build(tokens, 300, 1, vectors)

    # Create dataset
    data = [(x, row, None) for x, row in enumerate(self.data)]

    # Create embeddings model, backed by word vectors
    embeddings = Embeddings({"path": vectors + ".magnitude",
                             "storevectors": True,
                             "scoring": "bm25",
                             "pca": 3,
                             "quantize": True})

    # Call scoring and index methods
    embeddings.score(data)
    embeddings.index(data)

    # Test search
    self.assertIsNotNone(embeddings.search("win", 1))

    # Generate temp file path
    index = os.path.join(tempfile.gettempdir(), "wembeddings")

    # Test save/load
    embeddings.save(index)
    embeddings.load(index)

    # Test search
    self.assertIsNotNone(embeddings.search("win", 1))
def testComplexWorkflow(self):
    """
    Tests a complex workflow
    """

    textractor = Textractor(paragraphs=True, minlength=150, join=True)
    summary = Summary("sshleifer/distilbart-xsum-12-1")

    embeddings = Embeddings({"method": "transformers", "path": "sentence-transformers/bert-base-nli-mean-tokens"})
    documents = Documents()

    def index(x):
        documents.add(x)
        return x

    # Extract text and summarize articles
    articles = Workflow([FileTask(textractor), Task(lambda x: summary(x, maxlength=15))])

    # Complex workflow that extracts text, runs summarization then loads into an embeddings index
    tasks = [WorkflowTask(articles, r".\.pdf$"), Task(index, unpack=False)]

    data = ["file://" + Utils.PATH + "/article.pdf",
            "Workflows can process audio files, documents and snippets"]

    # Convert file paths to data tuples
    data = [(x, element, None) for x, element in enumerate(data)]

    # Execute workflow, discard results as they are streamed
    workflow = Workflow(tasks)
    for _ in workflow(data):
        pass

    # Build the embeddings index
    embeddings.index(documents)

    # Cleanup temporary storage
    documents.close()

    # Run search and validate result
    index, _ = embeddings.search("search text", 1)[0]
    self.assertEqual(index, 0)
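# The index() closure above only hands each batch to Documents and returns it so
# results keep streaming through the workflow. The Streamlit application later in
# this section passes Documents.add directly as the task action; assuming
# Documents.add returns the elements it receives, the same workflow can be built
# without the wrapper:
tasks = [WorkflowTask(articles, r".\.pdf$"), Task(documents.add, unpack=False)]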
index_name = 'index'

with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections
doc_dict = {}
index_text = []
for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)
        index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
    embeddings.save(index_name)
elif mode == 'search':
    print("--searching-- %d documents" % (len(index_text)))
    embeddings.load(index_name)

    for query in ("the milestones for our seed round",
                  "what is possible today",
                  "My philosophy has always been don't solve the human",
                  "story about Larry",
                  "biological memory",
                  "short-term memory",
                  "memory blocks and memory stack",
                  "the company where i programmed robots",
                  "nothing to do with us"):
        # Run search once per query, print top 3 results
        # search result format: (uid, score)
        print(query)
        for uid, _ in embeddings.search(query, 3):
            print("%-20s %s" % (query, doc_dict[uid]))
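# The script above references input_file, mode, embeddings and Tokenizer without
# defining them, since they come from elsewhere in the original script. A minimal
# sketch of how they might be supplied, assuming command-line arguments; the
# Tokenizer import path and the embeddings configuration are assumptions, with the
# model matching the other examples in this section.
import sys

from txtai.embeddings import Embeddings
from txtai.tokenizer import Tokenizer

mode, input_file = sys.argv[1], sys.argv[2]

embeddings = Embeddings({"method": "transformers",
                         "path": "sentence-transformers/bert-base-nli-mean-tokens"})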
from txtai.embeddings import Embeddings
import numpy as np

embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/roberta-base-nli-stsb-mean-tokens'})

# Create an index for the list of sections
embeddings.index([(uid, text, None) for uid, text in enumerate(sections)])

print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

# Run an embeddings search for each query
for query in ("feel good story", "climate change", "health", "war", "wildlife", "asia", "north america", "dishonest junk"):
    # Extract uid of first result
    # search result format: (uid, score)
    uid = embeddings.search(query, 1)[0][0]

    # Print section
    print("%-20s %s" % (query, sections[uid]))
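# When a persisted index isn't needed, the same model can score a query directly
# against the list of sections. This mirrors the embeddings.similarity call used in
# the test class below and puts the numpy import above to use.
uid = np.argmax(embeddings.similarity("feel good story", sections))
print("%-20s %s" % ("feel good story", sections[uid]))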
class TestEmbeddings(unittest.TestCase):
    """
    Embeddings tests
    """

    def setUp(self):
        """
        Initialize test data.
        """

        self.data = ["US tops 5 million confirmed virus cases",
                     "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
                     "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
                     "The National Park Service warns against sacrificing slower friends in a bear attack",
                     "Maine man wins $1M from $25 lottery ticket",
                     "Make huge profits without work, earn up to $100,000 a day"]

        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({"method": "transformers",
                                      "path": "sentence-transformers/bert-base-nli-mean-tokens"})

    def testIndex(self):
        """
        Test embeddings.index
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSave(self):
        """
        Test embeddings.save
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "embeddings")

        self.embeddings.save(index)
        self.embeddings.load(index)

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSimilarity(self):
        """
        Test embeddings.similarity
        """

        # Get best matching id
        uid = np.argmax(self.embeddings.similarity("feel good story", self.data))

        self.assertEqual(self.data[uid], self.data[4])

    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors, if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({"path": vectors + ".magnitude",
                                 "storevectors": True,
                                 "scoring": "bm25",
                                 "pca": 3,
                                 "quantize": True})

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
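# A minimal sketch for running this test class directly, assuming the module-level
# imports used above (os, tempfile, unittest, numpy as np, plus the txtai Embeddings
# and WordVectors classes) are in place.
if __name__ == "__main__":
    unittest.main()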
class Application:
    """
    Streamlit application.
    """

    def __init__(self):
        """
        Creates a new Streamlit application.
        """

        # Component options
        self.components = {}

        # Defined pipelines
        self.pipelines = {}

        # Current workflow
        self.workflow = []

        # Embeddings index params
        self.embeddings = None
        self.documents = None
        self.data = None

    def number(self, label):
        """
        Extracts a number from a text input field.

        Args:
            label: label to use for text input field

        Returns:
            numeric input
        """

        value = st.sidebar.text_input(label)
        return int(value) if value else None

    def options(self, component):
        """
        Extracts component settings into a component configuration dict.

        Args:
            component: component type

        Returns:
            dict with component settings
        """

        options = {"type": component}
        st.sidebar.markdown("---")

        if component == "summary":
            st.sidebar.markdown("**Summary** \n*Abstractive text summarization*")
            options["path"] = st.sidebar.text_input("Model", value="sshleifer/distilbart-cnn-12-6")
            options["minlength"] = self.number("Min length")
            options["maxlength"] = self.number("Max length")

        elif component in ("segment", "textract"):
            if component == "segment":
                st.sidebar.markdown("**Segment** \n*Split text into semantic units*")
            else:
                st.sidebar.markdown("**Textractor** \n*Extract text from documents*")

            options["sentences"] = st.sidebar.checkbox("Split sentences")
            options["lines"] = st.sidebar.checkbox("Split lines")
            options["paragraphs"] = st.sidebar.checkbox("Split paragraphs")
            options["join"] = st.sidebar.checkbox("Join tokenized")
            options["minlength"] = self.number("Min section length")

        elif component == "transcribe":
            st.sidebar.markdown("**Transcribe** \n*Transcribe audio to text*")
            options["path"] = st.sidebar.text_input("Model", value="facebook/wav2vec2-base-960h")

        elif component == "translate":
            st.sidebar.markdown("**Translate** \n*Machine translation*")
            options["target"] = st.sidebar.text_input("Target language code", value="en")

        elif component == "embeddings":
            st.sidebar.markdown("**Embeddings Index** \n*Index workflow output*")
            options["path"] = st.sidebar.text_area("Embeddings model path", value="sentence-transformers/bert-base-nli-mean-tokens")

        return options

    def build(self, components):
        """
        Builds a workflow using components.

        Args:
            components: list of components to add to workflow
        """

        # Clear application
        self.__init__()

        # pylint: disable=W0108
        tasks = []
        for component in components:
            wtype = component.pop("type")
            self.components[wtype] = component

            if wtype == "summary":
                self.pipelines[wtype] = Summary(component.pop("path"))
                tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))

            elif wtype == "segment":
                self.pipelines[wtype] = Segmentation(**self.components["segment"])
                tasks.append(Task(self.pipelines["segment"]))

            elif wtype == "textract":
                self.pipelines[wtype] = Textractor(**self.components["textract"])
                tasks.append(FileTask(self.pipelines["textract"]))

            elif wtype == "transcribe":
                self.pipelines[wtype] = Transcription(component.pop("path"))
                tasks.append(FileTask(self.pipelines["transcribe"], r".\.wav$"))

            elif wtype == "translate":
                self.pipelines[wtype] = Translation()
                tasks.append(Task(lambda x: self.pipelines["translate"](x, **self.components["translate"])))

            elif wtype == "embeddings":
                self.embeddings = Embeddings({"method": "transformers", **component})
                self.documents = Documents()
                tasks.append(Task(self.documents.add, unpack=False))

        self.workflow = Workflow(tasks)

    def process(self, data):
        """
        Processes the current application action.

        Args:
            data: input data
        """

        if data and self.workflow:
            # Build tuples for embedding index
            if self.documents:
                data = [(x, element, None) for x, element in enumerate(data)]

            # Process workflow
            for result in self.workflow(data):
                if not self.documents:
                    st.write(result)

            # Build embeddings index
            if self.documents:
                # Cache data
                self.data = [x[1] for x in self.documents]

                with st.spinner("Building embedding index...."):
                    self.embeddings.index(self.documents)

                self.documents.close()

                # Clear workflow
                self.documents, self.pipelines, self.workflow = None, None, None

        if self.embeddings and self.data:
            # Set query and limit
            query = st.text_input("Query")
            limit = min(5, len(self.data))

            # Hide the index column and left-align the results table
            st.markdown("""
                <style>
                table td:nth-child(1) {
                    display: none
                }
                table th:nth-child(1) {
                    display: none
                }
                table {text-align: left !important}
                </style>
            """, unsafe_allow_html=True)

            if query:
                df = pd.DataFrame([{"content": self.data[uid], "score": score} for uid, score in self.embeddings.search(query, limit)])
                st.table(df)

    def run(self):
        """
        Runs Streamlit application.
        """

        st.sidebar.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
        st.sidebar.markdown("# Workflow builder \n*Build and apply workflows to data* ")

        # Get selected components
        selected = st.sidebar.multiselect("Select components", ["embeddings", "segment", "summary", "textract", "transcribe", "translate"])

        # Get selected options
        components = [self.options(component) for component in selected]
        st.sidebar.markdown("---")

        # Build or re-build workflow when build button clicked
        build = st.sidebar.button("Build")
        if build:
            with st.spinner("Building workflow...."):
                self.build(components)

        with st.beta_expander("Data", expanded=not self.data):
            data = st.text_area("Input", height=10)

        # Parse text items
        data = [x for x in data.split("\n") if x] if "file://" in data else [data]

        # Process current action
        self.process(data)
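# A minimal sketch of a module-level entry point for the application above, assuming
# it lives in a file such as workflows.py launched with `streamlit run workflows.py`.
# Note the class targets an older Streamlit release (st.beta_expander); current
# releases expose the same widget as st.expander. Streamlit reruns the script on
# every interaction, so this sketch rebuilds the application each time; persisting
# the instance across reruns (for example with Streamlit's caching) is left out.
if __name__ == "__main__":
    # Create and run application
    Application().run()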