Example #1
    def load():
        """
        Loads an embeddings model and questions.db database.

        Returns:
            (embeddings, db handle)
        """

        path = Models.modelPath("stackexchange")
        dbfile = os.path.join(path, "questions.db")

        if os.path.isfile(os.path.join(path, "config")):
            print("Loading model from %s" % path)
            embeddings = Embeddings()
            embeddings.load(path)
        else:
            print("ERROR: loading model: ensure model is installed")
            print(
                "ERROR: Pre-trained model can be installed by running python -m codequestion.download"
            )
            raise FileNotFoundError(
                "Unable to load codequestion model from %s" % path)

        # Connect to database file
        db = sqlite3.connect(dbfile)

        return (embeddings, db)
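
A minimal usage sketch for the load() helper above (illustrative only: the query string is an example and it assumes the pre-trained codequestion model has already been installed):

# Load the embeddings model and the questions database, then run a search
# embeddings.search returns a list of (id, score) tuples
embeddings, db = load()
for uid, score in embeddings.search("how to parse a csv file in python", 5):
    print(uid, score)
db.close()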
Example #2
def is_most_relevant(query, sources, threshold=0.2, use_api=False):
    if use_api:
        r = requests.post(url + ':8080/is_most_relevant',
                          data=json.dumps({
                              'query': query,
                              'sources': sources
                          }))
        return r.json()
    embeddings = Embeddings({
        "method": "transformers",
        "path": "sentence-transformers/bert-base-nli-mean-tokens"
    })
    sections = sources
    similarities = embeddings.similarity(query, sections)
    result = zip(sources, similarities)
    result = sorted(result, key=lambda x: x[1], reverse=True)
    print(result)
    result = [x for x in result if x[1] > threshold]
    if result == []:
        return {'result': result}
    result = [result[0][0]]
    response = {'result': result}
    return response
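
An illustrative local call to is_most_relevant (the query and sources below are made-up examples; use_api is left at its default of False, so no HTTP request is made):

query = "Where is the Eiffel Tower located?"
sources = [
    "The Eiffel Tower is in Paris, France",
    "Bananas are a good source of potassium"
]

# Returns the single best-scoring source when its similarity clears the threshold,
# e.g. {'result': ['The Eiffel Tower is in Paris, France']}
print(is_most_relevant(query, sources, threshold=0.2))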
Example #3
    def load(path):
        """
        Loads an embeddings model and db database.

        Args:
            path: model path, if None uses default path

        Returns:
            (embeddings, db handle)
        """

        # Default path if not provided
        if not path:
            path = Models.modelPath()

        dbfile = os.path.join(path, "articles.sqlite")

        if os.path.isfile(os.path.join(path, "config")):
            print("Loading model from %s" % path)
            embeddings = Embeddings()
            embeddings.load(path)
        else:
            print("ERROR: loading model: ensure model is present")
            raise FileNotFoundError("Unable to load model from %s" % path)

        # Connect to database file
        db = sqlite3.connect(dbfile)

        return (embeddings, db)
Example #4
    def preloop(self):
        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({"method": "transformers", "path": "sentence-transformers/bert-base-nli-mean-tokens"})

        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day",
        ]
Example #5
    def build(self, components):
        """
        Builds a workflow using components.

        Args:
            components: list of components to add to workflow
        """

        # Clear application
        self.__init__()

        # pylint: disable=W0108
        tasks = []
        for component in components:
            wtype = component.pop("type")
            self.components[wtype] = component

            if wtype == "summary":
                self.pipelines[wtype] = Summary(component.pop("path"))
                tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))

            elif wtype == "segment":
                self.pipelines[wtype] = Segmentation(
                    **self.components["segment"])
                tasks.append(Task(self.pipelines["segment"]))

            elif wtype == "textract":
                self.pipelines[wtype] = Textractor(
                    **self.components["textract"])
                tasks.append(UrlTask(self.pipelines["textract"]))

            elif wtype == "transcribe":
                self.pipelines[wtype] = Transcription(component.pop("path"))
                tasks.append(UrlTask(self.pipelines["transcribe"], r".\.wav$"))

            elif wtype == "translate":
                self.pipelines[wtype] = Translation()
                tasks.append(Task(lambda x: self.pipelines["translate"](x, **self.components["translate"])))

            elif wtype == "embeddings":
                self.embeddings = Embeddings({
                    "method": "transformers",
                    **component
                })
                self.documents = Documents()
                tasks.append(Task(self.documents.add, unpack=False))

        self.workflow = Workflow(tasks)
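
An illustrative call to build(), assuming this method belongs to the Streamlit Application class shown in full in Example #19 (the component dicts below are examples; each one must carry a "type" key):

app = Application()
app.build([
    {"type": "segment", "sentences": True},
    {"type": "embeddings", "path": "sentence-transformers/bert-base-nli-mean-tokens"}
])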
Example #6
    def testComplexWorkflow(self):
        """
        Tests a complex workflow
        """

        textractor = Textractor(paragraphs=True, minlength=150, join=True)
        summary = Summary("sshleifer/distilbart-xsum-12-1")

        embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })
        documents = Documents()

        def index(x):
            documents.add(x)
            return x

        # Extract text and summarize articles
        articles = Workflow(
            [FileTask(textractor),
             Task(lambda x: summary(x, maxlength=15))])

        # Complex workflow that extracts text, runs summarization then loads into an embeddings index
        tasks = [WorkflowTask(articles, r".\.pdf$"), Task(index, unpack=False)]

        data = [
            "file://" + Utils.PATH + "/article.pdf",
            "Workflows can process audio files, documents and snippets"
        ]

        # Convert file paths to data tuples
        data = [(x, element, None) for x, element in enumerate(data)]

        # Execute workflow, discard results as they are streamed
        workflow = Workflow(tasks)
        for _ in workflow(data):
            pass

        # Build the embeddings index
        embeddings.index(documents)

        # Cleanup temporary storage
        documents.close()

        # Run search and validate result
        index, _ = embeddings.search("search text", 1)[0]
        self.assertEqual(index, 0)
Example #7
class Shell(Cmd):
    """
    Query shell.
    """

    def __init__(self):
        super().__init__()

        self.intro = "query shell"
        self.prompt = "(search) "

        self.embeddings = None
        self.data = None

    def preloop(self):
        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({"method": "transformers", "path": "sentence-transformers/bert-base-nli-mean-tokens"})

        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day",
        ]

    def default(self, line):
        # Get index of the section that best matches the query
        uid = self.embeddings.similarity(line, self.data)[0][0]
        print(self.data[uid])
        print()
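
A possible entry point for the query shell above, using the standard cmd module's cmdloop (this line is not part of the original snippet):

if __name__ == "__main__":
    Shell().cmdloop()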
Example #8
    def setUpClass(cls):
        """
        Create single extractor instance.
        """

        sections = [
            "Giants hit 3 HRs to down Dodgers", "Giants 5 Dodgers 4 final",
            "Dodgers drop Game 2 against the Giants, 5-4",
            "Blue Jays 2 Red Sox 1 final",
            "Red Sox lost to the Blue Jays, 2-1",
            "Blue Jays at Red Sox is over. Score: 2-1",
            "Phillies win over the Braves, 5-0", "Phillies 5 Braves 0 final",
            "Final: Braves lose to the Phillies in the series opener, 5-0",
            "Final score: Flyers 4 Lightning 1", "Flyers 4 Lightning 1 final",
            "Flyers win 4-1"
        ]

        # Add unique id to each section to assist with qa extraction
        cls.sections = [(uid, section) for uid, section in enumerate(sections)]

        # Create embeddings model, backed by sentence-transformers & transformers
        cls.embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })

        # Create extractor instance
        cls.extractor = Extractor(cls.embeddings,
                                  "distilbert-base-cased-distilled-squad")
Example #9
def build(directory):
    """
    Builds an image embeddings index.

    Args:
        directory: directory with images

    Returns:
        Embeddings index
    """

    embeddings = Embeddings({
        "method": "transformers",
        "path": "clip-ViT-B-32",
        "modelhub": False
    })
    embeddings.index(images(directory))

    return embeddings
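
An illustrative search against the index returned by build() (the directory name and query are assumptions; the ids are whatever the images() generator yielded for each file):

embeddings = build("photos")

# Find the images that best match a natural language description
for uid, score in embeddings.search("beach sunset", 3):
    print(uid, score)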
Example #10
    def load():
        """
        Loads a questions database and pre-trained embeddings model

        Returns:
            (db, embeddings)
        """

        print("Loading model")

        path = Models.modelPath("stackexchange")
        dbfile = os.path.join(path, "questions.db")

        # Connect to database file
        db = sqlite3.connect(dbfile)

        # Loading embeddings model
        embeddings = Embeddings()
        embeddings.load(path)

        return db, embeddings
Example #11
    def embeddings(dbfile):
        """
        Builds a sentence embeddings index.

        Args:
            dbfile: input SQLite file

        Returns:
            embeddings index
        """

        embeddings = Embeddings({
            "path": Models.vectorPath("stackexchange-300d.magnitude"),
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Build scoring index if scoring method provided
        if embeddings.config.get("scoring"):
            embeddings.score(Index.stream(dbfile))

        # Build embeddings index
        embeddings.index(Index.stream(dbfile))

        return embeddings
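
A hedged sketch of building and saving this index, assuming the method is a static member of the same Index class used for Index.stream (the database location below is hypothetical):

path = Models.modelPath("stackexchange")
index = Index.embeddings(os.path.join(path, "questions.db"))
index.save(path)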
Example #12
    def embeddings(dbfile, vectors, maxsize):
        """
        Builds a sentence embeddings index.

        Args:
            dbfile: input SQLite file
            vectors: vector path
            maxsize: maximum number of documents to process

        Returns:
            embeddings index
        """

        embeddings = Embeddings({"path": vectors,
                                 "scoring": "bm25",
                                 "pca": 3,
                                 "quantize": True})

        # Build scoring index if scoring method provided
        if embeddings.config.get("scoring"):
            embeddings.score(Index.stream(dbfile, maxsize))

        # Build embeddings index
        embeddings.index(Index.stream(dbfile, maxsize))

        return embeddings
Example #13
    def testQA(self):
        """
        Test qa extraction
        """

        # Create embeddings model, backed by sentence-transformers & transformers
        embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })

        # Create extractor instance
        extractor = Extractor(embeddings,
                              "distilbert-base-cased-distilled-squad")

        sections = [
            "Giants hit 3 HRs to down Dodgers", "Giants 5 Dodgers 4 final",
            "Dodgers drop Game 2 against the Giants, 5-4",
            "Blue Jays 2 Red Sox 1 final",
            "Red Sox lost to the Blue Jays, 2-1",
            "Blue Jays at Red Sox is over. Score: 2-1",
            "Phillies win over the Braves, 5-0", "Phillies 5 Braves 0 final",
            "Final: Braves lose to the Phillies in the series opener, 5-0",
            "Final score: Flyers 4 Lightning 1", "Flyers 4 Lightning 1 final",
            "Flyers win 4-1"
        ]

        # Add unique id to each section to assist with qa extraction
        sections = [(uid, section) for uid, section in enumerate(sections)]

        questions = ["What team won the game?", "What was score?"]

        execute = lambda query: extractor(sections, [
            (question, query, question, False) for question in questions
        ])

        answers = execute("Red Sox - Blue Jays")
        self.assertEqual("Blue Jays", answers[0][1])
        self.assertEqual("2-1", answers[1][1])

        # Ad-hoc questions
        question = "What hockey team won?"

        answers = extractor(sections, [(question, question, question, False)])
        self.assertEqual("Flyers", answers[0][1])
Example #14
def is_relevant_document(query, documents, threshold=0.2, use_api=False):
    if use_api:
        r = requests.post(url + ':8080/is_relevant_document',
                          data=json.dumps({
                              'query': query,
                              'documents': documents
                          }))
        return r.json()
    embeddings = Embeddings({
        "method": "transformers",
        "path": "sentence-transformers/bert-base-nli-mean-tokens"
    })
    sources_dictionary = {}
    for i in range(len(documents)):
        document = documents[i]
        document = document.replace('!', '.')
        sentences = document.split('.')
        for sentence in sentences:
            sources_dictionary[sentence] = i
    sources = list(sources_dictionary.keys())
    sources = prune(sources)
    is_relevant_response = is_relevant(query,
                                       sources,
                                       threshold=threshold,
                                       use_api=False)
    is_relevant_result = is_relevant_response['result']
    print(is_relevant_result)
    indices_result = [
        sources_dictionary[source] for source in is_relevant_result
    ]
    indices_result = np.unique(indices_result)
    print(indices_result)
    result = [documents[index] for index in indices_result]
    response = {'result': result}
    return response
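
An illustrative call to is_relevant_document (the documents are made-up examples; the function also depends on the is_relevant and prune helpers defined elsewhere in this module):

documents = [
    "The Louvre is a museum in Paris. It houses the Mona Lisa.",
    "Photosynthesis converts light energy into chemical energy."
]

# Returns only the documents containing at least one sentence above the similarity threshold
print(is_relevant_document("famous paintings in Paris", documents, threshold=0.2))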
Example #15
    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors, if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
Example #16
    def train(vector, score):
        """
        Trains an Embeddings model on STS dev + train data.

        Args:
            vector: word vector model path
            score: scoring method (bm25, sif, tfidf or None for averaging)

        Returns:
            trained Embeddings model
        """

        print("Building model")
        embeddings = Embeddings({
            "path": Models.vectorPath(vector),
            "scoring": score,
            "pca": 3
        })

        rows1 = STS.read(Models.testPath("stsbenchmark", "sts-dev.csv"))
        rows2 = STS.read(Models.testPath("stsbenchmark", "sts-train.csv"))

        rows = rows1 + rows2

        documents = []
        for row in rows:
            tokens = Tokenizer.tokenize(row[2] + " " + row[3])

            if tokens:
                documents.append((row[0], tokens, None))
            else:
                print("Skipping string containing only stop words:", row)

        # Build scoring index if scoring method provided
        if embeddings.config.get("scoring"):
            embeddings.score(documents)

        # Build embeddings index
        embeddings.index(documents)

        return embeddings
Example #17
    def embeddings(index, database):
        """
        Builds an embeddings index.

        Args:
            index: index configuration
            database: database handle with content to index
        """

        # Create embeddings model, backed by sentence-transformers & transformers
        embeddings = Embeddings(index["embeddings"])

        database.execute("SELECT Id, Title FROM articles")

        # Create an index for the list of articles
        articles = [(uid, text, None) for uid, text in database.cur.fetchall()]
        embeddings.index(articles)

        logging.info("Built embedding index over %d stored articles", len(articles))

        # Save index
        embeddings.save(index["path"])
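
An illustrative follow-up that reloads the saved index and runs a query (index["path"] refers to the same configuration used above; the query is an example):

embeddings = Embeddings()
embeddings.load(index["path"])

# search returns (id, score) tuples, where id is the article Id indexed above
print(embeddings.search("climate change", 5))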
Example #18
    def embeddings(dbfile, vectors, maxsize):
        """
        Builds a sentence embeddings index.

        Args:
            dbfile: input SQLite file
            vectors: path to vectors file or configuration
            maxsize: maximum number of documents to process

        Returns:
            embeddings index
        """

        # Read config and create Embeddings instance
        embeddings = Embeddings(Index.config(vectors))

        # Build scoring index if scoring method provided
        if embeddings.config.get("scoring"):
            embeddings.score(Index.stream(dbfile, maxsize))

        # Build embeddings index
        embeddings.index(Index.stream(dbfile, maxsize))

        return embeddings
Example #19
class Application:
    """
    Streamlit application.
    """

    def __init__(self):
        """
        Creates a new Streamlit application.
        """

        # Component options
        self.components = {}

        # Defined pipelines
        self.pipelines = {}

        # Current workflow
        self.workflow = []

        # Embeddings index params
        self.embeddings = None
        self.documents = None
        self.data = None

    def number(self, label):
        """
        Extracts a number from a text input field.

        Args:
            label: label to use for text input field

        Returns:
            numeric input
        """

        value = st.sidebar.text_input(label)
        return int(value) if value else None

    def options(self, component):
        """
        Extracts component settings into a component configuration dict.

        Args:
            component: component type

        Returns:
            dict with component settings
        """

        options = {"type": component}

        st.sidebar.markdown("---")
        if component == "summary":
            st.sidebar.markdown("**Summary**  \n*Abstractive text summarization*")
            options["path"] = st.sidebar.text_input("Model", value="sshleifer/distilbart-cnn-12-6")
            options["minlength"] = self.number("Min length")
            options["maxlength"] = self.number("Max length")

        elif component in ("segment", "textract"):
            if component == "segment":
                st.sidebar.markdown("**Segment**  \n*Split text into semantic units*")
            else:
                st.sidebar.markdown("**Textractor**  \n*Extract text from documents*")

            options["sentences"] = st.sidebar.checkbox("Split sentences")
            options["lines"] = st.sidebar.checkbox("Split lines")
            options["paragraphs"] = st.sidebar.checkbox("Split paragraphs")
            options["join"] = st.sidebar.checkbox("Join tokenized")
            options["minlength"] = self.number("Min section length")

        elif component == "transcribe":
            st.sidebar.markdown("**Transcribe**  \n*Transcribe audio to text*")
            options["path"] = st.sidebar.text_input("Model", value="facebook/wav2vec2-base-960h")

        elif component == "translate":
            st.sidebar.markdown("**Translate**  \n*Machine translation*")
            options["target"] = st.sidebar.text_input("Target language code", value="en")

        elif component == "embeddings":
            st.sidebar.markdown("**Embeddings Index**  \n*Index workflow output*")
            options["path"] = st.sidebar.text_area("Embeddings model path", value="sentence-transformers/bert-base-nli-mean-tokens")

        return options

    def build(self, components):
        """
        Builds a workflow using components.

        Args:
            components: list of components to add to workflow
        """

        # Clear application
        self.__init__()

        # pylint: disable=W0108
        tasks = []
        for component in components:
            wtype = component.pop("type")
            self.components[wtype] = component

            if wtype == "summary":
                self.pipelines[wtype] = Summary(component.pop("path"))
                tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))

            elif wtype == "segment":
                self.pipelines[wtype] = Segmentation(**self.components["segment"])
                tasks.append(Task(self.pipelines["segment"]))

            elif wtype == "textract":
                self.pipelines[wtype] = Textractor(**self.components["textract"])
                tasks.append(FileTask(self.pipelines["textract"]))

            elif wtype == "transcribe":
                self.pipelines[wtype] = Transcription(component.pop("path"))
                tasks.append(FileTask(self.pipelines["transcribe"], r".\.wav$"))

            elif wtype == "translate":
                self.pipelines[wtype] = Translation()
                tasks.append(Task(lambda x: self.pipelines["translate"](x, **self.components["translate"])))

            elif wtype == "embeddings":
                self.embeddings = Embeddings({"method": "transformers", **component})
                self.documents = Documents()
                tasks.append(Task(self.documents.add, unpack=False))

        self.workflow = Workflow(tasks)

    def process(self, data):
        """
        Processes the current application action.

        Args:
            data: input data
        """

        if data and self.workflow:
            # Build tuples for embedding index
            if self.documents:
                data = [(x, element, None) for x, element in enumerate(data)]

            # Process workflow
            for result in self.workflow(data):
                if not self.documents:
                    st.write(result)

            # Build embeddings index
            if self.documents:
                # Cache data
                self.data = [x[1] for x in self.documents]

                with st.spinner("Building embedding index...."):
                    self.embeddings.index(self.documents)
                    self.documents.close()

                # Clear workflow
                self.documents, self.pipelines, self.workflow = None, None, None

        if self.embeddings and self.data:
            # Set query and limit
            query = st.text_input("Query")
            limit = min(5, len(self.data))

            st.markdown(
                """
            <style>
            table td:nth-child(1) {
                display: none
            }
            table th:nth-child(1) {
                display: none
            }
            table {text-align: left !important}
            </style>
            """,
                unsafe_allow_html=True,
            )

            if query:
                df = pd.DataFrame([{"content": self.data[uid], "score": score} for uid, score in self.embeddings.search(query, limit)])
                st.table(df)

    def run(self):
        """
        Runs Streamlit application.
        """

        st.sidebar.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
        st.sidebar.markdown("# Workflow builder  \n*Build and apply workflows to data*  ")

        # Get selected components
        selected = st.sidebar.multiselect("Select components", ["embeddings", "segment", "summary", "textract", "transcribe", "translate"])

        # Get selected options
        components = [self.options(component) for component in selected]
        st.sidebar.markdown("---")

        # Build or re-build workflow when build button clicked
        build = st.sidebar.button("Build")
        if build:
            with st.spinner("Building workflow...."):
                self.build(components)

        with st.beta_expander("Data", expanded=not self.data):
            data = st.text_area("Input", height=10)

        # Parse text items
        data = [x for x in data.split("\n") if x] if "file://" in data else [data]

        # Process current action
        self.process(data)
Example #20
from txtai.embeddings import Embeddings
from txtai.tokenizer import Tokenizer

import numpy as np
import sys

# Note: the second assignment overrides the first, so only the bert-base model is used below
embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/roberta-base-nli-stsb-mean-tokens'})
embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/bert-base-nli-stsb-mean-tokens'})

input_file = sys.argv[1]
mode = sys.argv[2]

index_name = 'index'

with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections
doc_dict = {}
index_text = []

for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)
        index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
Example #21
from txtai.embeddings import Embeddings 
import numpy as np

embeddings = Embeddings({'method': 'transformers', 'path': 'sentence-transformers/roberta-base-nli-stsb-mean-tokens'})

# Create an index for the list of sections
embeddings.index([(uid, text, None) for uid, text in enumerate(sections)])

print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

# Run an embeddings search for each query
for query in ("feel good story", "climate change", "health", "war", "wildlife", "asia",
              "north america", "dishonest junk"):
    # Extract uid of first result
    # search result format: (uid, score)
    uid = embeddings.search(query, 1)[0][0]

    # Print section
    print("%-20s %s" % (query, sections[uid]))

Example #22
                        help="which evaluation task")

    args = parser.parse_args()

    sentence_transformer_path = Path(args.sent_trans_path)
    embeddings_index_path = Path(args.embeddings_index_path)
    predictions_path = Path(args.predictions_path)

    if args.create_embed_index:
        # Create index; uniqueterms is the list of terms to index
        embeddings = Embeddings({"method": "transformers", "path": str(sentence_transformer_path), "quantize": True})
        embeddings.index([(uid, text, None) for uid, text in enumerate(uniqueterms)])
        embeddings.save("embedding_index")
    else:
        # Load index
        embeddings = Embeddings()
        # Hack to port an embedding_index created on another machine with a different directory structure
        with open("%s/config" % "embedding_index", "rb") as handle:
            config = pickle.load(handle)
        config["path"] = str(sentence_transformer_path)
        with open("%s/config" % "embedding_index", "wb") as handle:
            pickle.dump(config, handle)
        embeddings.load("embedding_index")

    ### load predictions on CORD-19 abstracts to create KB
    kb = pd.read_csv(predictions_path,usecols=["doc_id","sentence","span1","span2","relation_tag","conf","span1_lemma","span2_lemma"],sep="\t")
Example #23
class TestEmbeddings(unittest.TestCase):
    """
    Embeddings tests
    """
    def setUp(self):
        """
        Initialize test data.
        """

        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day"
        ]

        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })

    def testIndex(self):
        """
        Test embeddings.index
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None)
                               for uid, text in enumerate(self.data)])

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSave(self):
        """
        Test embeddings.save
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None)
                               for uid, text in enumerate(self.data)])

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "embeddings")

        self.embeddings.save(index)
        self.embeddings.load(index)

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSimilarity(self):
        """
        Test embeddings.similarity
        """

        # Get best matching id
        uid = np.argmax(
            self.embeddings.similarity("feel good story", self.data))

        self.assertEqual(self.data[uid], self.data[4])

    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors, if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
Example #24
from flask import Flask, render_template, redirect, url_for, request
from flask import jsonify
from txtai.embeddings import Embeddings

# Create embeddings model, backed by sentence-transformers & transformers
embeddings = Embeddings({
    "method": "transformers",
    "path": "sentence-transformers/bert-base-nli-mean-tokens"
})
app = Flask(__name__)

from conversions import rules


def getcode(response):
    for i in rules:
        if response in i:
            return i[-1]


sections = []

for i in rules:
    for t in i[:-1]:
        sections.append(t)


def bot_response(t):
    userinput = t