def buscaCorreo2(x):
    """Show the stored e-mail most similar to the one selected in the GUI.

    Reads every ``Correos/*.txt`` file, skipping the first four header
    lines of each, builds a TF-IDF vector-space model over the bodies,
    and pops a message box naming the closest neighbour of the document
    whose numeric id is currently in the Tk variable ``var``.

    :param x: unused Tkinter callback argument (kept for compatibility
        with the widget binding).
    """
    documents = []
    documap = {}
    for archivo in os.listdir("Correos"):
        if archivo.endswith(".txt"):
            # Context manager guarantees the file is closed even if
            # reading raises (the original leaked the handle on error).
            with open("Correos/" + archivo, "r") as f:
                for _ in range(4):  # skip the 4 header lines
                    f.readline()
                mailbody = f.read()
            docu = Document(mailbody, name=archivo)
            documents.append(docu)
            # Map the numeric part of the filename ("12.txt" -> 12).
            documap[int(archivo[0:-4])] = docu
    model = Model(documents=documents, weight=TFIDF)
    docu = documap[int(var.get())]
    # neighbors() returns a (similarity, document) list; take the closest.
    tupla = model.neighbors(docu, top=1)[0]
    tkMessageBox.showinfo("Tk", "El documento que mas se parece es el "
                          + tupla[1].name[0:-4] + ", con un "
                          + str(tupla[0]) + " de similitud")
# For text, a better metric than Euclidean distance
# is called cosine similarity. This is what a Model uses:
# NOTE: modernized from Python 2 print statements to Python 3 print()
# calls, matching the sibling copy of this demo elsewhere in the file.
d1 = m.document(name="lion")
d2 = m.document(name="tiger")
d3 = m.document(name="dolphin")
d4 = m.document(name="shark")
d5 = m.document(name="parakeet")
print("lion-tiger:", m.similarity(d1, d2))
print("lion-dolphin:", m.similarity(d1, d3))
print("dolphin-shark:", m.similarity(d3, d4))
print("dolphin-parakeet:", m.similarity(d3, d5))
print()
print("Related to tiger:")
print(m.neighbors(d2, top=3))  # Top three most similar.
print()
print("Related to a search query ('water'):")
print(m.search("water", top=10))
# In summary:
# A Document:
# - takes a string of text,
# - counts the words in the text,
# - constructs a vector of words (features) and normalized word count (weight).
# A Model:
# - groups multiple vectors in a matrix,
# - tweaks the weight with TF-IDF to find "unique" words in each document,
# For text, a better metric than Euclidean distance
# is called cosine similarity. This is what a Model uses:
d1, d2, d3, d4, d5 = (m.document(name=animal)
                      for animal in ("lion", "tiger", "dolphin",
                                     "shark", "parakeet"))
# Print each labelled pairwise similarity.
for label, left, right in (("lion-tiger:", d1, d2),
                           ("lion-dolphin:", d1, d3),
                           ("dolphin-shark:", d3, d4),
                           ("dolphin-parakeet:", d3, d5)):
    print(label, m.similarity(left, right))
print()
print("Related to tiger:")
print(m.neighbors(d2, top=3))  # Top three most similar.
print()
print("Related to a search query ('water'):")
print(m.search("water", top=10))
# In summary:
# A Document:
# - takes a string of text,
# - counts the words in the text,
# - constructs a vector of words (features) and normalized word count (weight).
# A Model:
# - groups multiple vectors in a matrix,
# - tweaks the weight with TF-IDF to find "unique" words in each document,
from pattern.vector import Document, Model, IG, TF, TFIDF, BINARY
import sys
import os

# Build one Document per sample file; the name of the directory a file
# lives in ("examples/<language>/<file>") is used as the document type,
# i.e. the classification label.
print("Reading sample code and instantiating documents...")
documents = []
exampleDir = "examples/"
for lang_name in os.listdir(exampleDir):  # renamed: 'file' shadowed a builtin
    lang_path = os.path.join(exampleDir, lang_name)
    if os.path.isdir(lang_path):
        for subfile in os.listdir(lang_path):
            sample_path = os.path.join(lang_path, subfile)
            if os.path.isfile(sample_path):
                with open(sample_path, "r") as langDoc:
                    text = langDoc.read()
                doc = Document(text, type=lang_name)
                documents.append(doc)

print("Creating statistical model...")
m = Model(documents=documents, weight=IG)

# Test with sample Java doc
print("Comparing test document...")
with open("coffee.txt", "r") as myfile:
    testFile = myfile.read()
testDoc = Document(testFile, type='Java')
# neighbors() returns a (similarity, document) list, most similar first.
testSimilarities = m.neighbors(testDoc, top=10)
prediction = testSimilarities[0][1].type
confidence = testSimilarities[0][0]
# Use the already-computed 'prediction' instead of re-reading the tuple.
print("LanguageLearn has predicted " + prediction + " with a "
      + str(round(confidence * 100, 2)) + "% confidence")
def recommend_game(this_game):
    """Recommend up to 25 games similar to *this_game*.

    Each candidate's score is a weighted mix of cosine similarities:
    title (0.15), genre (0.2), publisher (0.2), keywords (0.2), plus a
    summary TF-IDF neighbour bonus (0.25).  The top candidates (at most
    40) are then sampled randomly to add variety.

    :param this_game: the game object to base recommendations on.
    :return: list of recommended game ids.
    """
    games = recommendable_games(this_game)
    total_recommendable = games.count()
    # str() instead of __str__(): same output, idiomatic.
    print('Total recommendable games based on ' + this_game.title
          + ": " + str(total_recommendable))

    document_title = Document(this_game.title)
    document_publisher = Document(this_game.publisher)
    document_summary = Document(this_game.summary, top=None, threshold=0,
                                stemmer=None, exclude=[], stopwords=False,
                                language='en')
    document_keywords = Document(', '.join(
        x['name'] for x in this_game.keywords.all().values("name")))
    document_genres = Document(', '.join(
        x['name'] for x in this_game.genres.all().values("name")))

    # format: {"id": id, "score": SUM(similarity * weight)}
    game_similarities = []
    summary_documents = []
    for game in games:
        game = Game.objects.filter(title=game['title'],
                                   platform=game['platform'])[0]
        # distance() is 1 - cosine similarity, so invert it back.
        title_similarity = 1 - distance(document_title.vector,
                                        Document(game.title).vector)
        publisher_similarity = 1 - distance(document_publisher.vector,
                                            Document(game.publisher).vector)
        genre_similarity = 1 - distance(document_genres.vector, Document(
            ', '.join(x['name'] for x in game.genres.all().values("name"))
        ).vector)
        keywords_similarity = 1 - distance(document_keywords.vector, Document(
            ', '.join(x['name'] for x in game.keywords.all().values("name"))
        ).vector)
        score = (0.15 * title_similarity) + (0.2 * genre_similarity) \
            + (0.2 * publisher_similarity) + (0.20 * keywords_similarity)
        summary_documents.append(Document(game.summary, top=None, threshold=0,
                                          stemmer=None, exclude=[],
                                          stopwords=False, language='en',
                                          name=game.id))
        game_similarities.append({"id": game.id, "score": score})

    to_compare = Document(document_summary)
    model = Model(documents=summary_documents, weight=TFIDF)
    # neighbors() returns (similarity, document) pairs, most similar first.
    neighbours = model.neighbors(to_compare, top=total_recommendable)
    for neighbour in neighbours:
        for rec_game in game_similarities:
            if rec_game['id'] == neighbour[1].name:
                rec_game['score'] = rec_game['score'] + 0.25 * neighbour[0]

    recommended = sorted(game_similarities,
                         key=lambda k: -k['score'])[0:total_recommendable]
    # BUG FIX: random.sample raises ValueError when the population is
    # smaller than the sample size; clamp the sample to what exists.
    pool = recommended[0:40] if len(recommended) >= 40 else recommended
    random_selection = random.sample(pool, min(25, len(pool)))
    recommended_ids = [g['id'] for g in random_selection]
    return recommended_ids