from models.tweet import Tweet
import random

# Compile the documents: every tweet whose text mentions "mujica".
doc_complete = [i for i in Tweet.all() if "mujica" in i.data['text'].lower()]

from nltk.stem import SnowballStemmer
lemma = SnowballStemmer("spanish")  # Spanish stemmer (instantiated here, but not applied inside clean())

def clean(tweet):
    return ' '.join(tweet.tokenize_and_clean())

doc_clean = [clean(tweet).split() for tweet in doc_complete]

# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for the LDA model using the gensim library.
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word=dictionary, passes=100)
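# A minimal follow-up sketch, not part of the original cell: inspect the trained topics.
# print_topics() is gensim's standard LdaModel helper and returns (topic_id, topic_string)
# pairs with the top words of each of the 5 topics trained above; num_words=10 is an
# arbitrary choice for illustration.
for topic_id, topic in ldamodel.print_topics(num_topics=5, num_words=10):
    print(topic_id, topic)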
print("Model loaded")

# def get_tweets(palabras):
#     tweets = Tweet.all()
#     tweets_encontrados = set()
#     for t in tweets:
#         for p in palabras:
#             if p.lower() in t.data['text'].lower():
#                 tweets_encontrados.add(t)
#     return tweets_encontrados
# tweets = get_tweets(['@LuisSuarez9', 'luisito', 'pistolero', 'luis suarez'])

tweets = [i for i in Tweet.all() if "@LuisSuarez9" in i.data['text']]
target_tweet = random.sample(tweets, 1)[0]

print("---------------------------------------------")
print("TARGET TWEET TEXT")
print(target_tweet.data['text'])

closest_tweet = None
closest_tweets = list()
distance = Distance(model)  # Distance is the project's own helper; its import is not shown in this cell
for candidate in tweets:
    if distance.jaccard(candidate, target_tweet) < 0.90 and candidate.tweet_id != target_tweet.tweet_id:
        # The original cell is cut off here; assumed intent: collect every candidate
        # under the Jaccard distance threshold.
        closest_tweets.append(candidate)
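# A minimal follow-up sketch, not part of the original cell: list the candidates that
# passed the threshold, closest (smallest Jaccard distance) first. The slice of 5 is an
# arbitrary choice for illustration.
closest_tweets.sort(key=lambda t: distance.jaccard(t, target_tweet))
for t in closest_tweets[:5]:
    print(distance.jaccard(t, target_tweet), t.data['text'])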
def expandir_a_parecidos(tweet):
    """Expand a tweet's tokens with their nearest word2vec neighbour."""
    tokens = tweet.tokenize_and_clean()
    tweet_expandido = set()
    for t in tokens:
        t = t.encode('utf8')
        tweet_expandido.add(t)
        if t in model:
            indexes, metrics = model.cosine(t)
            parecidos_t = model.vocab[indexes][:1]
            tweet_expandido |= set(parecidos_t)
    return tweet_expandido

tweets = random.sample(Tweet.all(), 200)
target_tweet = tweets[75]

print("Target tweet text")
print(target_tweet.data['text'])

target_tweet_expandido = expandir_a_parecidos(target_tweet)
print("Expanded target tweet")
print(target_tweet_expandido)

closest_tweet = None
max_palabras_iguales = 0
for candidate in tweets:
    candidate_expandido = expandir_a_parecidos(candidate)
    # The original cell is cut off here; assumed intent: count the words the expanded
    # sets share and keep the candidate (other than the target itself) with the most matches.
    if candidate.tweet_id == target_tweet.tweet_id:
        continue
    palabras_iguales = len(candidate_expandido & target_tweet_expandido)
    if palabras_iguales > max_palabras_iguales:
        max_palabras_iguales = palabras_iguales
        closest_tweet = candidate
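# A minimal follow-up sketch, not part of the original cell: report the best match
# found by the loop above.
if closest_tweet is not None:
    print("Closest tweet (%d shared expanded words):" % max_palabras_iguales)
    print(closest_tweet.data['text'])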
# def maybe_download(filename, expected_bytes):
#     """Download a file if not present, and make sure it's the right size."""
#     if not os.path.exists(filename):
#         filename, _ = urllib.request.urlretrieve(url + filename, filename)
#     statinfo = os.stat(filename)
#     if statinfo.st_size == expected_bytes:
#         print('Found and verified', filename)
#     else:
#         print(statinfo.st_size)
#         raise Exception(
#             'Failed to verify ' + filename + '. Can you get to it with a browser?')
#     return filename

import tensorflow as tf

# Dump every cleaned token from every tweet into a single space-separated file.
# Opened in binary mode because the tokens are written as UTF-8 encoded bytes.
f = open('data', 'wb')
for tweet in Tweet.all():
    tokens = tweet.tokenize_and_clean()
    for t in tokens:
        f.write(t.encode('utf8') + b' ')
f.close()

filename = 'data'

# Read the data into a list of strings.
def read_data(filename):
    """Read the whole plain-text file as a list of words."""
    with open(filename, 'r') as f:
        data = tf.compat.as_str(f.read()).split()
    return data
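# A minimal follow-up sketch, not part of the original cell: load the dumped corpus back
# and check its size before feeding it to the embedding-training step.
words = read_data(filename)
print('Data size', len(words))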