Example #1
    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors from the tokens file
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "wembeddings")

        # Test save/load
        embeddings.save(index)
        embeddings.load(index)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
Example #2
    def testComplexWorkflow(self):
        """
        Tests a complex workflow
        """

        textractor = Textractor(paragraphs=True, minlength=150, join=True)
        summary = Summary("sshleifer/distilbart-xsum-12-1")

        embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })
        documents = Documents()

        def index(x):
            documents.add(x)
            return x

        # Extract text and summarize articles
        articles = Workflow(
            [FileTask(textractor),
             Task(lambda x: summary(x, maxlength=15))])

        # Complex workflow that extracts text, runs summarization then loads into an embeddings index
        tasks = [WorkflowTask(articles, r".\.pdf$"), Task(index, unpack=False)]

        data = [
            "file://" + Utils.PATH + "/article.pdf",
            "Workflows can process audio files, documents and snippets"
        ]

        # Convert file paths to data tuples
        data = [(x, element, None) for x, element in enumerate(data)]

        # Execute workflow, discard results as they are streamed
        workflow = Workflow(tasks)
        for _ in workflow(data):
            pass

        # Build the embeddings index
        embeddings.index(documents)

        # Cleanup temporary storage
        documents.close()

        # Run search and validate result
        index, _ = embeddings.search("search text", 1)[0]
        self.assertEqual(index, 0)
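
As a side note, the "articles" sub-workflow defined above can also be run on its own, since Workflow instances are callable on an iterable of elements. A minimal sketch, reusing the Textractor/Summary objects and the sample PDF path from this test:

# Extract and summarize a single PDF directly (sketch)
for summarized in articles(["file://" + Utils.PATH + "/article.pdf"]):
    print(summarized)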
Example #3
# Imports assumed by this snippet (txtai 2.x module layout)
from txtai.embeddings import Embeddings
from txtai.tokenizer import Tokenizer

# The original snippet expects these to be defined elsewhere; hedged placeholders:
input_file = 'input.tsv'  # assumption: tab-delimited file with a session id column and a text column
mode = 'index'            # assumption: 'index' builds the index, 'search' queries it
embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/bert-base-nli-mean-tokens'})  # assumed configuration

index_name = 'index'

with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections
doc_dict = {}
index_text = []

for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)
        index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
    embeddings.save(index_name)
elif mode == 'search':
    print("--searching-- %d documents" % (len(index_text)))
    embeddings.load(index_name)
    for query in ("the milestones for our seed round", "what is possible today", "My philosophy has always been don't solve the human", "story about Larry", "biological memory", "short-term memory", "memory blocks and memory stack", "the company where i programmed robots", "nothing to do with us"):
    # Extract uid of first result
    # search result format: (uid, score)
        print(query)
        for i in range(0, 3):
            uid = embeddings.search(query, 3)[i][0]
            print("%-20s %s" % (query, doc_dict[uid]))
Example #4
from txtai.embeddings import Embeddings 
import numpy as np

embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/roberta-base-nli-stsb-mean-tokens'})
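
# "sections" is not defined in the original snippet; as an assumption, any list of
# short text passages works, e.g. the headlines used as test data in Example #5
sections = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day"
]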

# Create an index for the list of sections
embeddings.index([(uid, text, None) for uid, text in enumerate(sections)])

print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

# Run an embeddings search for each query
for query in ("feel good story", "climate change", "health", "war", "wildlife", "asia",
              "north america", "dishonest junk"):
    # Extract uid of first result
    # search result format: (uid, score)
    uid = embeddings.search(query, 1)[0][0]

    # Print section
    print("%-20s %s" % (query, sections[uid]))

Example #5
# Imports assumed by these tests (txtai 2.x module layout)
import os
import tempfile
import unittest

import numpy as np

from txtai.embeddings import Embeddings
from txtai.vectors import WordVectors


class TestEmbeddings(unittest.TestCase):
    """
    Embeddings tests
    """
    def setUp(self):
        """
        Initialize test data.
        """

        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day"
        ]

        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })

    def testIndex(self):
        """
        Test embeddings.index
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None)
                               for uid, text in enumerate(self.data)])

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSave(self):
        """
        Test embeddings.save
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None)
                               for uid, text in enumerate(self.data)])

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "embeddings")

        self.embeddings.save(index)
        self.embeddings.load(index)

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSimilarity(self):
        """
        Test embeddings.similarity
        """

        # Get best matching id
        uid = np.argmax(
            self.embeddings.similarity("feel good story", self.data))

        self.assertEqual(self.data[uid], self.data[4])

    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors from the tokens file
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
Example #6
# Imports assumed by this application (txtai and Streamlit module layout of this era)
import pandas as pd
import streamlit as st

from txtai.embeddings import Documents, Embeddings
from txtai.pipeline import Segmentation, Summary, Textractor, Transcription, Translation
from txtai.workflow import FileTask, Task, Workflow


class Application:
    """
    Streamlit application.
    """

    def __init__(self):
        """
        Creates a new Streamlit application.
        """

        # Component options
        self.components = {}

        # Defined pipelines
        self.pipelines = {}

        # Current workflow
        self.workflow = []

        # Embeddings index params
        self.embeddings = None
        self.documents = None
        self.data = None

    def number(self, label):
        """
        Extracts a number from a text input field.

        Args:
            label: label to use for text input field

        Returns:
            numeric input
        """

        value = st.sidebar.text_input(label)
        return int(value) if value else None

    def options(self, component):
        """
        Extracts component settings into a component configuration dict.

        Args:
            component: component type

        Returns:
            dict with component settings
        """

        options = {"type": component}

        st.sidebar.markdown("---")
        if component == "summary":
            st.sidebar.markdown("**Summary**  \n*Abstractive text summarization*")
            options["path"] = st.sidebar.text_input("Model", value="sshleifer/distilbart-cnn-12-6")
            options["minlength"] = self.number("Min length")
            options["maxlength"] = self.number("Max length")

        elif component in ("segment", "textract"):
            if component == "segment":
                st.sidebar.markdown("**Segment**  \n*Split text into semantic units*")
            else:
                st.sidebar.markdown("**Textractor**  \n*Extract text from documents*")

            options["sentences"] = st.sidebar.checkbox("Split sentences")
            options["lines"] = st.sidebar.checkbox("Split lines")
            options["paragraphs"] = st.sidebar.checkbox("Split paragraphs")
            options["join"] = st.sidebar.checkbox("Join tokenized")
            options["minlength"] = self.number("Min section length")

        elif component == "transcribe":
            st.sidebar.markdown("**Transcribe**  \n*Transcribe audio to text*")
            options["path"] = st.sidebar.text_input("Model", value="facebook/wav2vec2-base-960h")

        elif component == "translate":
            st.sidebar.markdown("**Translate**  \n*Machine translation*")
            options["target"] = st.sidebar.text_input("Target language code", value="en")

        elif component == "embeddings":
            st.sidebar.markdown("**Embeddings Index**  \n*Index workflow output*")
            options["path"] = st.sidebar.text_area("Embeddings model path", value="sentence-transformers/bert-base-nli-mean-tokens")

        return options

    def build(self, components):
        """
        Builds a workflow using components.

        Args:
            components: list of components to add to workflow
        """

        # Clear application
        self.__init__()

        # pylint: disable=W0108
        tasks = []
        for component in components:
            wtype = component.pop("type")
            self.components[wtype] = component

            if wtype == "summary":
                self.pipelines[wtype] = Summary(component.pop("path"))
                tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))

            elif wtype == "segment":
                self.pipelines[wtype] = Segmentation(**self.components["segment"])
                tasks.append(Task(self.pipelines["segment"]))

            elif wtype == "textract":
                self.pipelines[wtype] = Textractor(**self.components["textract"])
                tasks.append(FileTask(self.pipelines["textract"]))

            elif wtype == "transcribe":
                self.pipelines[wtype] = Transcription(component.pop("path"))
                tasks.append(FileTask(self.pipelines["transcribe"], r".\.wav$"))

            elif wtype == "translate":
                self.pipelines[wtype] = Translation()
                tasks.append(Task(lambda x: self.pipelines["translate"](x, **self.components["translate"])))

            elif wtype == "embeddings":
                self.embeddings = Embeddings({"method": "transformers", **component})
                self.documents = Documents()
                tasks.append(Task(self.documents.add, unpack=False))

        self.workflow = Workflow(tasks)

    def process(self, data):
        """
        Processes the current application action.

        Args:
            data: input data
        """

        if data and self.workflow:
            # Build tuples for embedding index
            if self.documents:
                data = [(x, element, None) for x, element in enumerate(data)]

            # Process workflow
            for result in self.workflow(data):
                if not self.documents:
                    st.write(result)

            # Build embeddings index
            if self.documents:
                # Cache data
                self.data = [x[1] for x in self.documents]

                with st.spinner("Building embedding index...."):
                    self.embeddings.index(self.documents)
                    self.documents.close()

                # Clear workflow
                self.documents, self.pipelines, self.workflow = None, None, None

        if self.embeddings and self.data:
            # Set query and limit
            query = st.text_input("Query")
            limit = min(5, len(self.data))

            st.markdown(
                """
            <style>
            table td:nth-child(1) {
                display: none
            }
            table th:nth-child(1) {
                display: none
            }
            table {text-align: left !important}
            </style>
            """,
                unsafe_allow_html=True,
            )

            if query:
                df = pd.DataFrame([{"content": self.data[uid], "score": score} for uid, score in self.embeddings.search(query, limit)])
                st.table(df)

    def run(self):
        """
        Runs Streamlit application.
        """

        st.sidebar.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
        st.sidebar.markdown("# Workflow builder  \n*Build and apply workflows to data*  ")

        # Get selected components
        selected = st.sidebar.multiselect("Select components", ["embeddings", "segment", "summary", "textract", "transcribe", "translate"])

        # Get selected options
        components = [self.options(component) for component in selected]
        st.sidebar.markdown("---")

        # Build or re-build workflow when build button clicked
        build = st.sidebar.button("Build")
        if build:
            with st.spinner("Building workflow...."):
                self.build(components)

        with st.beta_expander("Data", expanded=not self.data):
            data = st.text_area("Input", height=10)

        # Parse text items
        data = [x for x in data.split("\n") if x] if "file://" in data else [data]

        # Process current action
        self.process(data)
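
The snippet stops at the class definition. A minimal entry point, an assumption not shown in the original, caches the Application instance so Streamlit's rerun-on-interaction model preserves the built workflow; the script would then be launched with "streamlit run app.py":

# Cache the application across Streamlit reruns
@st.cache(allow_output_mutation=True)
def create():
    return Application()

if __name__ == "__main__":
    create().run()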