Example #1
    def testTokenize(self):
        """
        Test tokenize
        """

        self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
        self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])
Example #2
    def stream(self, dbfile):
        """
        Connects to SQLite file at dbfile and yields parsed tokens for each row.

        Args:
            dbfile: path to the SQLite database file
        """

        # Connection to database file
        db = sqlite3.connect(dbfile)
        cur = db.cursor()

        cur.execute("SELECT Text FROM sections")

        count = 0
        for section in cur:
            # Tokenize text
            tokens = Tokenizer.tokenize(section[0])

            count += 1
            if count % 1000 == 0:
                print("Streamed %d documents" % (count), end="\r")

            # Skip documents with no tokens parsed
            if tokens:
                yield tokens

        print("Iterated over %d total rows" % (count))

        # Free database resources
        db.close()
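A minimal consumer sketch for this generator; "Indexer" is only a stand-in for whatever class defines stream(), and articles.sqlite is an assumed local database with a sections table:

# Hypothetical caller: "Indexer" stands in for the class that defines stream()
indexer = Indexer()

count = 0
for tokens in indexer.stream("articles.sqlite"):
    # Each item is the token list parsed from one section row
    count += 1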
Example #3
    def search(embeddings, cur, query, topn, threshold):
        """
        Executes an embeddings search for the input query. Each returned result is resolved
        to the full section row.

        Args:
            embeddings: embeddings model
            cur: database cursor
            query: query text
            topn: number of documents to return
            threshold: require at least this score to include result

        Returns:
            search results
        """

        if query == "*":
            return []

        # Default threshold if None
        threshold = threshold if threshold is not None else 0.6

        results = []

        # Get list of required and prohibited tokens
        must = [token.strip("+") for token in query.split() if token.startswith("+") and len(token) > 1]
        mnot = [token.strip("-") for token in query.split() if token.startswith("-") and len(token) > 1]

        # Tokenize search query
        query = Tokenizer.tokenize(query)

        # Retrieve topn * 5 to account for duplicate matches
        for uid, score in embeddings.search(query, topn * 5):
            if score >= threshold:
                cur.execute("SELECT Article, Text FROM sections WHERE id = ?", [uid])

                # Get matching row
                sid, text = cur.fetchone()

                # Add result if:
                #   - all required tokens are present or there are no required tokens, AND
                #   - no prohibited tokens are present or there are no prohibited tokens
                if (not must or all([token.lower() in text.lower() for token in must])) and (
                    not mnot or all([token.lower() not in text.lower() for token in mnot])
                ):
                    # Save result
                    results.append((uid, score, sid, text))

        return results
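A hedged usage sketch of the + and - query operators handled above; embeddings and cur stand for an existing embeddings index and an open cursor over the sections table, and the query text itself is only an example:

# "embeddings" and "cur" are assumed to exist; passing None for threshold falls back to 0.6
results = search(embeddings, cur, "+vaccine -mice antibody response", 10, None)

for uid, score, article, text in results:
    # Each result carries the match id, similarity score and the resolved section row
    print("%.4f" % score, article, text[:80])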
Example #4
    def tokenize(text):
        """
        Tokenizes text into tokens, removes domain specific stop words.

        Args:
            text: input text

        Returns:
            tokens
        """

        # Remove additional stop words to improve highlighting results
        return {
            token
            for token in Tokenizer.tokenize(text)
            if token not in Highlights.STOP_WORDS
        }
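Because the return value is a set, token overlap between two snippets can be computed directly; a small sketch assuming Highlights.tokenize is the method above (the sample sentences are made up):

# Sample sentences are illustrative only
a = Highlights.tokenize("antibody response to the vaccine")
b = Highlights.tokenize("vaccine induced antibody levels")

# Share of tokens common to both snippets
overlap = len(a & b) / max(len(a | b), 1)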
Example #5
    def stream(dbfile, maxsize):
        """
        Streams documents from an articles.sqlite file. This method is a generator and will yield one row at a time.

        Args:
            dbfile: input SQLite file
            maxsize: maximum number of documents to process
        """

        # Connection to database file
        db = sqlite3.connect(dbfile)
        cur = db.cursor()

        # Select tagged sentences without an NLP label. NLP labels are set for non-informative sentences.
        query = Index.SECTION_QUERY + " AND tags is not null"

        if maxsize > 0:
            query += " AND article in (SELECT id FROM articles ORDER BY entry DESC LIMIT %d)" % maxsize

        # Run the query
        cur.execute(query)

        count = 0
        for row in cur:
            # Unpack row
            uid, name, text = row

            if not name or not re.search(Index.SECTION_FILTER, name.lower()):
                # Tokenize text
                tokens = Tokenizer.tokenize(text)

                document = (uid, tokens, None)

                count += 1
                if count % 1000 == 0:
                    print("Streamed %d documents" % (count), end="\r")

                # Skip documents with no tokens parsed
                if tokens:
                    yield document

        print("Iterated over %d total rows" % (count))

        # Free database resources
        db.close()
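Since each yielded document is already an (id, tokens, tags) tuple, the generator can be passed straight to an indexing call; a minimal sketch assuming embeddings is an embeddings instance like the one used in the other examples, with maxsize=0 meaning no limit:

# "embeddings" is assumed to be built elsewhere; 0 disables the maxsize limit
embeddings.index(Index.stream("articles.sqlite", 0))
embeddings.save("index")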
Example #6
    def search(embeddings, cur, query, topn):
        """
        Executes an embeddings search for the input query. Each returned result is resolved
        to the full section row.

        Args:
            embeddings: embeddings model
            cur: database cursor
            query: query text
            topn: number of documents to return

        Returns:
            search results
        """

        if query == "*":
            return []

        results = []

        # Get list of required tokens
        must = [
            token.strip("+") for token in query.split()
            if token.startswith("+")
        ]

        # Tokenize search query
        query = Tokenizer.tokenize(query)

        # Retrieve topn * 5 to account for duplicate matches
        for uid, score in embeddings.search(query, topn * 5):
            if score >= 0.6:
                cur.execute("SELECT Article, Text FROM sections WHERE id = ?",
                            [uid])

                # Get matching row
                sid, text = cur.fetchone()

                # Add result if all required tokens are present or there are no required tokens
                if not must or all(
                    [token.lower() in text.lower() for token in must]):
                    # Save result
                    results.append((uid, score, sid, text))

        return results
Example #7
# Invoked as: python script.py <sections file> <index|search>; "embeddings" is
# assumed to be an embeddings instance created earlier in the script
input_file = sys.argv[1]
mode = sys.argv[2]

index_name = 'index'

with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections
doc_dict = {}
index_text = []

for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)
        index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
    embeddings.save(index_name)
elif mode == 'search':
    print("--searching-- %d documents" % (len(index_text)))
    embeddings.load(index_name)
    for query in ("the milestones for our seed round", "what is possible today", "My philosophy has always been don't solve the human", "story about Larry", "biological memory", "short-term memory", "memory blocks and memory stack", "the company where i programmed robots", "nothing to do with us"):
        print(query)

        # Run the search once per query; each result is a (uid, score) tuple
        for uid, score in embeddings.search(query, 3):
            print(uid, score)