Example #1
    def load(path):
        """
        Loads an embeddings model and articles.sqlite database.

        Args:
            path: model path, if None uses default path

        Returns:
            (embeddings, db handle)
        """

        # Default path if not provided
        if not path:
            path = Models.modelPath()

        dbfile = os.path.join(path, "articles.sqlite")

        if os.path.isfile(os.path.join(path, "config")):
            print("Loading model from %s" % path)
            embeddings = Embeddings()
            embeddings.load(path)
        else:
            print("ERROR: loading model: ensure model is present")
            raise FileNotFoundError("Unable to load model from %s" % path)

        # Connect to database file
        db = sqlite3.connect(dbfile)

        return (embeddings, db)
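A minimal usage sketch for the handles returned above, assuming load is callable as shown. The "articles" table and its "id"/"title" columns are assumptions for illustration, not part of the original example:

# Hedged sketch: query the index, then look up matching rows in the SQLite database.
# Table and column names below are assumed; adjust them to the actual schema.
embeddings, db = load(None)
cursor = db.cursor()
for uid, score in embeddings.search("climate change research", 3):
    cursor.execute("SELECT title FROM articles WHERE id = ?", (uid,))
    print(score, cursor.fetchone())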
Example #2
    def load():
        """
        Loads an embeddings model and questions.db database.

        Returns:
            (embeddings, db handle)
        """

        path = Models.modelPath("stackexchange")
        dbfile = os.path.join(path, "questions.db")

        if os.path.isfile(os.path.join(path, "config")):
            print("Loading model from %s" % path)
            embeddings = Embeddings()
            embeddings.load(path)
        else:
            print("ERROR: loading model: ensure model is installed")
            print(
                "ERROR: Pre-trained model can be installed by running python -m codequestion.download"
            )
            raise FileNotFoundError(
                "Unable to load codequestion model from %s" % path)

        # Connect to database file
        db = sqlite3.connect(dbfile)

        return (embeddings, db)
Example #3
    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors, if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "wembeddings")

        # Test save/load
        embeddings.save(index)
        embeddings.load(index)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
Example #4
    def load():
        """
        Loads a questions database and pre-trained embeddings model

        Returns:
            (db, embeddings)
        """

        print("Loading model")

        path = Models.modelPath("stackexchange")
        dbfile = os.path.join(path, "questions.db")

        # Connect to database file
        db = sqlite3.connect(dbfile)

        # Load embeddings model
        embeddings = Embeddings()
        embeddings.load(path)

        return db, embeddings
Example #5
index_name = 'index'

with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections
doc_dict = {}
index_text = []

for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)
        index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
    embeddings.save(index_name)
elif mode == 'search':
    print("--searching-- %d documents" % (len(index_text)))
    embeddings.load(index_name)
    for query in ("the milestones for our seed round", "what is possible today", "My philosophy has always been don't solve the human", "story about Larry", "biological memory", "short-term memory", "memory blocks and memory stack", "the company where i programmed robots", "nothing to do with us"):
    # Extract uid of first result
    # search result format: (uid, score)
        print(query)
        for i in range(0, 3):
            uid = embeddings.search(query, 3)[i][0]
            print("%-20s %s" % (query, doc_dict[uid]))
Example #6
    if args.create_embed_index:
        # Create index
        # uniqueterms is the list of terms to index
        embeddings = Embeddings({"method": "transformers", "path": str(sentence_transformer_path), "quantize": True})
        embeddings.index([(uid, text, None) for uid, text in enumerate(uniqueterms)])
        embeddings.save("embedding_index")
    else:
        # Load index
        embeddings = Embeddings()
        # Hack to port an embedding_index created on another machine with a different directory structure
        with open("%s/config" % "embedding_index", "rb") as handle:
            config = pickle.load(handle)
        config["path"] = str(sentence_transformer_path)
        with open("%s/config" % "embedding_index", "wb") as handle:
            pickle.dump(config, handle)
        embeddings.load("embedding_index")

    ### Load predictions on CORD-19 abstracts to create KB
    kb = pd.read_csv(predictions_path, usecols=["doc_id", "sentence", "span1", "span2", "relation_tag", "conf", "span1_lemma", "span2_lemma"], sep="\t")
    kb.dropna(inplace=True)

    # String cleanups: strip punctuation, collapse whitespace, drop leading/standalone numbers
    kb['norm_span1'] = (kb['span1'].str.replace(r'[^\w\s]', '', regex=True)
                        .str.replace(r'\s\s+', ' ', regex=True)
                        .str.strip()
                        .str.replace(r'^(\d+\s ?)*|(^[0-9]+)', '', regex=True)
                        .str.replace(r'^[0-9]+$', '', regex=True))
    kb['norm_span2'] = (kb['span2'].str.replace(r'[^\w\s]', '', regex=True)
                        .str.replace(r'\s\s+', ' ', regex=True)
                        .str.strip()
                        .str.replace(r'^(\d+\s ?)*|(^[0-9]+)', '', regex=True)
                        .str.replace(r'^[0-9]+$', '', regex=True))
    kb = kb[~((kb.norm_span1 == "") | (kb.norm_span2 == ""))]

    # Collect row positions where the confidence value is not numeric
    badi = []
    for i in range(len(kb["conf"])):
        v = kb["conf"].iloc[i]
        try:
            float(v)
        except (TypeError, ValueError):
            badi.append(i)
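A brief note on the porting hack above: as the code itself shows, the index's config file is a pickled dict whose "path" entry points at the underlying sentence-transformers model, so rewriting that entry before calling embeddings.load("embedding_index") lets an index built on another machine find the model at the local path.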
Example #7
class TestEmbeddings(unittest.TestCase):
    """
    Embeddings tests
    """
    def setUp(self):
        """
        Initialize test data.
        """

        self.data = [
            "US tops 5 million confirmed virus cases",
            "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
            "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
            "The National Park Service warns against sacrificing slower friends in a bear attack",
            "Maine man wins $1M from $25 lottery ticket",
            "Make huge profits without work, earn up to $100,000 a day"
        ]

        # Create embeddings model, backed by sentence-transformers & transformers
        self.embeddings = Embeddings({
            "method": "transformers",
            "path": "sentence-transformers/bert-base-nli-mean-tokens"
        })

    def testIndex(self):
        """
        Test embeddings.index
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None)
                               for uid, text in enumerate(self.data)])

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSave(self):
        """
        Test embeddings.save
        """

        # Create an index for the list of sections
        self.embeddings.index([(uid, text, None)
                               for uid, text in enumerate(self.data)])

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "embeddings")

        self.embeddings.save(index)
        self.embeddings.load(index)

        # Search for best match
        uid = self.embeddings.search("feel good story", 1)[0][0]

        self.assertEqual(self.data[uid], self.data[4])

    def testSimilarity(self):
        """
        Test embeddings.similarity
        """

        # Get best matching id
        uid = np.argmax(
            self.embeddings.similarity("feel good story", self.data))

        self.assertEqual(self.data[uid], self.data[4])

    def testWords(self):
        """
        Test embeddings backed by word vectors
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        # Build word vectors, if they don't already exist
        WordVectors.build(tokens, 300, 1, vectors)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))