Example no. 1
from models.tweet import Tweet
import random

# Collect the corpus: every tweet whose text mentions "mujica"
doc_complete = [i for i in Tweet.all() if "mujica" in i.data['text'].lower()]

from nltk.stem import SnowballStemmer
lemma = SnowballStemmer("spanish")  # note: a Snowball stemmer, not a lemmatizer; not used by clean() below


def clean(tweet):
    return ' '.join(tweet.tokenize_and_clean())


doc_clean = [clean(tweet).split() for tweet in doc_complete]

# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Binding gensim's LdaModel class for convenience
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word=dictionary, passes=100)
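
Once training finishes, gensim's LdaModel can report the learned topics directly; a minimal check (the topic and word counts below are arbitrary choices) might look like this:

for topic in ldamodel.print_topics(num_topics=5, num_words=5):
    print(topic)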
Example no. 2
print("Modelo cargado")

# def get_tweets(palabras):
#     tweets = Tweet.all()
#     tweets_encontrados = set()

#     for t in tweets:
#         for p in palabras:
#             if p.lower() in t.data['text'].lower():
#                 tweets_encontrados.add(t)

#     return tweets_encontrados

# tweets = get_tweets(['@LuisSuarez9', 'luisito', 'pistolero', 'luis suarez'])

tweets = [i for i in Tweet.all() if "@LuisSuarez9" in i.data['text']]
target_tweet = random.sample(tweets, 1)[0]

print("---------------------------------------------")
print("TARGET TWEET TEXT")
print(target_tweet.data['text'])

closest_tweet = None
closest_tweets = list()

distance = Distance(model)

for candidate in tweets:
    if distance.jaccard(
            candidate, target_tweet
    ) < 0.90 and candidate.tweet_id != target_tweet.tweet_id:
        # Assumed completion: the original snippet is cut off here; keep the
        # sufficiently similar candidates for later inspection.
        closest_tweets.append(candidate)
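
The Distance helper instantiated above is not shown in this snippet; as a rough, hypothetical sketch of what a Jaccard distance over two tweets' cleaned token sets could look like (the function name is made up):

def jaccard_distance(tweet_a, tweet_b):
    # 1 - |intersection| / |union| of the cleaned token sets
    a = set(tweet_a.tokenize_and_clean())
    b = set(tweet_b.tokenize_and_clean())
    if not a and not b:
        return 0.0
    return 1.0 - len(a & b) / float(len(a | b))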
Example no. 3
def expandir_a_parecidos(tweet):
    """Expand a tweet's token set with each token's nearest word2vec neighbour."""
    tokens = tweet.tokenize_and_clean()
    tweet_expandido = set()

    for t in tokens:
        t = t.encode('utf8')
        tweet_expandido.add(t)
        if model.__contains__(t):
            # nearest neighbours of the token in the embedding space
            indexes, metrics = model.cosine(t)
            # keep only the single closest word
            parecidos_t = model.vocab[indexes][:1]
            tweet_expandido |= set(parecidos_t)

    return tweet_expandido
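
A quick, hypothetical spot check of the expansion (it assumes model exposes the cosine()/vocab interface used above; the token itself is made up):

token = 'uruguay'.encode('utf8')
if model.__contains__(token):
    indexes, metrics = model.cosine(token)
    print(model.vocab[indexes][:1])  # the single nearest neighbour kept above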


tweets = random.sample(Tweet.all(), 200)
target_tweet = tweets[75]

print "Target tweet text"
print target_tweet.data['text']

target_tweet_expandido = expandir_a_parecidos(target_tweet)

print "Target tweet expandido"
print target_tweet_expandido

closest_tweet = None
max_palabras_iguales = 0

for candidate in tweets:
    candidate_expandido = expandir_a_parecidos(candidate)
    # Assumed completion (the original snippet is cut off here): count the
    # expanded tokens shared with the target and keep the best candidate.
    palabras_iguales = len(candidate_expandido & target_tweet_expandido)
    if candidate is not target_tweet and palabras_iguales > max_palabras_iguales:
        max_palabras_iguales = palabras_iguales
        closest_tweet = candidate
Example no. 4
# def maybe_download(filename, expected_bytes):
#   """Download a file if not present, and make sure it's the right size."""
#   if not os.path.exists(filename):
#     filename, _ = urllib.request.urlretrieve(url + filename, filename)
#   statinfo = os.stat(filename)
#   if statinfo.st_size == expected_bytes:
#     print('Found and verified', filename)
#   else:
#     print(statinfo.st_size)
#     raise Exception(
#         'Failed to verify ' + filename + '. Can you get to it with a browser?')
#   return filename
#
import tensorflow as tf  # needed for tf.compat.as_str in read_data() below

# Dump every cleaned tweet token to a plain-text file, space separated.
f = open('data', 'w')

for tweet in Tweet.all():
    tokens = tweet.tokenize_and_clean()
    for t in tokens:
        f.write(t.encode('utf8') + ' ')

f.close()

filename = 'data'


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words"""
    with open(filename, 'r') as f:
        data = tf.compat.as_str(f.read()).split()
    return data
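
A plausible next step, in the spirit of the TensorFlow word2vec tutorial this snippet appears to be adapted from, is simply to load the token list back and check its size:

words = read_data(filename)
print('Data size', len(words))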