Esempio n. 1
0
def load_documents(use_tfidf=False, doc_frequencies=None):
    """Map every ``.txt`` file under ``documents`` to its term vector.

    The ``documents`` directory (relative to the current working directory)
    is walked recursively. Each file whose name ends in ``.txt`` is passed
    through ``listwords`` and then ``countwords``, and the result is stored
    under the file's joined path.

    Args:
        use_tfidf: When true, each term vector is passed through
            ``make_tfidf`` together with ``doc_frequencies`` before it is
            stored.
        doc_frequencies: Forwarded unchanged to ``make_tfidf``; not used
            when ``use_tfidf`` is false.

    Returns:
        A dict keyed by ``os.path.join(root, filename)`` for each matching
        file, holding either the raw term vector or its ``make_tfidf``
        result.
    """
    vectors = {}
    for folder, _subdirs, filenames in os.walk("documents"):
        for filename in filenames:
            # Only plain-text documents are indexed; skip everything else.
            if not filename.endswith(".txt"):
                continue
            path = os.path.join(folder, filename)
            vector = countwords(listwords(path))
            vectors[path] = (
                make_tfidf(vector, doc_frequencies) if use_tfidf else vector
            )
    return vectors
Esempio n. 2
0
def load_documents(use_tfidf=False, doc_frequencies=None,
                   directory="documents"):
    """Build a dict from document paths to their term vectors.

    ``directory`` is walked recursively. Every file whose name ends in
    ``.txt`` is read with ``listwords`` and counted with ``countwords``.

    Args:
        use_tfidf: When true, each term vector is converted with
            ``make_tfidf`` (using ``doc_frequencies``) before being stored.
        doc_frequencies: Passed straight through to ``make_tfidf``; ignored
            when ``use_tfidf`` is false.
        directory: Root directory to scan. Defaults to ``"documents"``, the
            location that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        A dict mapping ``os.path.join(root, filename)`` to the term vector
        (or the ``make_tfidf`` result) of that file. The dict is empty when
        no ``.txt`` file is found under ``directory``.
    """
    out = {}
    # The sub-directory list from os.walk is not needed here.
    for root, _dirs, fileshere in os.walk(directory):
        for fn in fileshere:
            if not fn.endswith(".txt"):
                continue
            whole_pathname = os.path.join(root, fn)
            termvector = countwords(listwords(whole_pathname))
            if use_tfidf:
                out[whole_pathname] = make_tfidf(termvector, doc_frequencies)
            else:
                out[whole_pathname] = termvector
    return out
Esempio n. 3
0
def main():
    """Run an interactive query prompt over the loaded documents.

    Loads the documents with ``load_documents()``, then repeatedly reads a
    line from standard input. Each line is stripped and lower-cased; blank
    lines are ignored. The remaining text is split on whitespace, counted
    with ``countwords`` and handed to ``search``; every hit whose second
    element is truthy is printed. End-of-file (ctrl-D) ends the loop.
    """
    # The frequencies are only consumed by the commented-out tf-idf variant
    # below; the call itself is kept.
    frequencies = document_frequencies()
    # documents = load_documents(use_tfidf=True, doc_frequencies=frequencies)
    documents = load_documents()

    print("press ctrl-D to exit!")
    running = True
    while running:
        try:
            raw_line = input("query> ")
            normalized = raw_line.strip().lower()
            if normalized:
                query = countwords(normalized.split())
                for hit in search(query, documents):
                    # hit[1] is presumably a relevance score; falsy ones
                    # are not shown.
                    if hit[1]:
                        print(hit)
        except EOFError:
            # Finish the prompt line before saying goodbye.
            print()
            running = False
    print("OK thanks!")
Esempio n. 4
0
def main():
    """Prompt for search queries until end-of-file and print the matches.

    The document collection comes from ``load_documents()``. For every
    non-blank input line (after stripping and lower-casing), the words are
    counted with ``countwords`` and passed to ``search`` along with the
    documents. Only hits whose element at index 1 is truthy are printed.
    """
    # Only the commented-out tf-idf variant below uses this value.
    frequencies = document_frequencies()
    # documents = load_documents(use_tfidf=True, doc_frequencies=frequencies)
    documents = load_documents()

    print("press ctrl-D to exit!")
    while True:
        try:
            typed = input("query> ")
            cleaned = typed.strip().lower()
            if cleaned == "":
                continue
            term_counts = countwords(cleaned.split())
            # Lazily drop hits whose second element is falsy.
            matches = (
                hit for hit in search(term_counts, documents) if hit[1]
            )
            for match in matches:
                print(match)
        except EOFError:
            # ctrl-D: end the prompt line, then leave the loop.
            print()
            break
    print("OK thanks!")
Esempio n. 5
0
    def testCountWords(self):
        """Check the per-word tallies ``countwords`` produces for ``words``.

        ``words`` is not defined in this method; it is presumably a fixture
        defined elsewhere in the module -- TODO confirm.
        """
        counts = countwords(words)

        # 'mango' must be reported as 0, so the result has to answer 0 for
        # that key instead of raising.
        expected = (('the', 2), ('fox', 1), ('mango', 0))
        for term, occurrences in expected:
            assert counts[term] == occurrences