# A Model is a matrix (or vector space)
# with features as columns and feature weights as rows.
# We can then do calculations on the matrix,
# for example to compute TF-IDF or similarity between documents.

# Load a model from a folder of text documents:
documents = []
for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):
    # Read inside a context manager so every file handle is closed right
    # after its contents are loaded (the bare .read() call left it open).
    with codecs.open(f, encoding="utf-8") as corpus_file:
        text = corpus_file.read()
    # The document name is the file name without its ".txt" extension.
    name = os.path.splitext(os.path.basename(f))[0]
    documents.append(Document(text, name=name))

m = Model(documents, weight=TFIDF)

# We can retrieve documents by name:
d = m.document(name="lion")

# Single-argument print(...) calls produce the same output whether print is
# a statement or a function; print("") emits a blank line in both cases.
print(d.keywords(top=10))
print("")
print(d.tf("food"))
# TF-IDF is less: "food" is also mentioned with the other animals.
print(d.tfidf("food"))
print("")

# We can compare how similar two documents are.
# This is done by calculating the distance between the document vectors
# (i.e., finding those that are near to each other).

# For example, say we have two vectors with features "x" and "y".
# We can calculate the distance between two points (x, y) in 2-D space:
# d = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
# A Model is a matrix (or vector space)
# with features as columns and feature weights as rows.
# We can then do calculations on the matrix,
# for example to compute TF-IDF or similarity between documents.

# Load a model from a folder of text documents:
documents = []
for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):
    # Read inside a context manager so every file handle is closed right
    # after its contents are loaded (the bare .read() call left it open).
    with codecs.open(f, encoding="utf-8") as corpus_file:
        text = corpus_file.read()
    # The document name is the file name without its ".txt" extension.
    name = os.path.splitext(os.path.basename(f))[0]
    documents.append(Document(text, name=name))

m = Model(documents, weight=TFIDF)

# We can retrieve documents by name:
d = m.document(name="lion")

print(d.keywords(top=10))
print()
print(d.tf("food"))
# TF-IDF is less: "food" is also mentioned with the other animals.
print(d.tfidf("food"))
print()

# We can compare how similar two documents are.
# This is done by calculating the distance between the document vectors
# (i.e., finding those that are near to each other).

# For example, say we have two vectors with features "x" and "y".
# We can calculate the distance between two points (x, y) in 2-D space:
# d = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))