import json

import gensim
from gensim import corpora


def main():
    tweets_path = config_lda.get("input_file")
    tweets_clean = load_tweets(tweets_path)
    dictionary = corpora.Dictionary(tweets_clean)
    dtm = [dictionary.doc2bow(text) for text in tweets_clean]

    # 'reduce' is presumably a module-level flag set from the config
    if reduce:
        tweets_clean = remove_terms_low_median(tweets_clean, dictionary, dtm)
        # Rebuild the dictionary and document-term matrix after pruning
        dictionary = corpora.Dictionary(tweets_clean)
        dtm = [dictionary.doc2bow(text) for text in tweets_clean]

    k = config_lda.get("topics")
    if k == 0:
        k = calculate_num_topics()

    ldamodel = gensim.models.LdaMulticore(dtm, num_topics=k, id2word=dictionary,
                                          passes=config_lda.get("passes", 20))

    topics_dict = {}
    for topic in ldamodel.show_topics(formatted=False, num_topics=k,
                                      num_words=config_lda.get("words", 10)):
        topic_words = [(word, str(weight)) for word, weight in topic[1]]
        topics_dict['topic_' + str(topic[0])] = topic_words

    # Write the topics out as JSON, closing the file even on error
    with open(config_lda.get("output_file", "./topics_results.json"), "w") as json_file:
        json_file.write(json.dumps(topics_dict))
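# calculate_num_topics() is not defined in this snippet; it is called with no
# arguments, so the real version presumably reads the corpus from module state.
# A minimal sketch of one common approach, assuming a coherence-based search
# with gensim's CoherenceModel (the candidate range, the 'c_v' metric, and the
# explicit parameters below are assumptions, not this repo's code):
def calculate_num_topics_sketch(dtm, dictionary, texts, k_range=range(2, 16)):
    """Return the k whose LDA model maximizes c_v topic coherence."""
    from gensim.models import CoherenceModel
    best_k, best_score = None, float("-inf")
    for k in k_range:
        model = gensim.models.LdaMulticore(dtm, num_topics=k, id2word=dictionary)
        score = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                               coherence='c_v').get_coherence()
        if score > best_score:
            best_k, best_score = k, score
    return best_k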
def jaccard_similarity(str1, str2):
    # Header reconstructed (only the body appeared here); the measure is the
    # Jaccard similarity of the two tweets' word sets
    str1 = set(str1.split())
    str2 = set(str2.split())
    try:
        return float(len(str1 & str2)) / len(str1 | str2)
    except ZeroDivisionError:
        return 0


# Set up the verbose helper
if opts.verbose:
    def verbose(*args):
        print(*args)
else:
    verbose = lambda *a: None

# Collect the tweets and their identifiers (tweet id and user id)
users, ids = load_tweets(opts.DIR, opts.format, False)

# Strip links from the tweets
clean_users = [[re.sub(r'\bhttps?:\/\/.*[\r\n]*', u'', i) for i in x]
               for x in users]

# Build the histogram data: compare each user's tweets pairwise
histogram_list = []
for tweets in clean_users:
    tweets_1 = list(tweets)
    tweets_2 = list(tweets)
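# Sanity check for the similarity measure above: the two example tweets share
# 3 of 5 distinct words, so the score is 3/5 = 0.6 (word order is ignored).
assert jaccard_similarity("rt check this out", "check this link out") == 0.6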
help="Define el valor minimo de cuentas ") p.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Verbose mode [Off]") opts = p.parse_args() # prepara función de verbose if opts.verbose: def verbose(*args): print(*args) else: verbose = lambda *a: None # Colecta los tweets y sus identificadores (idtweet y idusuario) tweets,ids=load_tweets(opts.DIR,opts.format,mix=opts.mix) # Imprime alguna información sobre los tweets if opts.verbose: for i,tweet in enumerate(tweets[:10]): verbose('Tweet example',i+1,tweet[:100]) verbose("Total tweets : ",len(tweets)) try: verbose("Total usuarios : ",len(set([id for x,id in ids]))) except ValueError: verbose("Total usuarios : ",len(ids)) # Calculamos los features #metemos las stop words en una lista if not opts.stopwords: my_stop_words=[]
"--verbose", action="store_true", dest="verbose", help="Verbose mode [Off]") opts = p.parse_args() # prepara función de verbose if opts.verbose: def verbose(*args): print(*args) else: verbose = lambda *a: None # Colecta los tweets y sus identificadores (idtweet y idusuario) tweets, ids = load_tweets(opts.DIR, opts.format, mix=opts.mix) # Imprime alguna información sobre los tweets if opts.verbose: for i, tweet in enumerate(tweets[:10]): verbose('Tweet example', i + 1, tweet[:100]) verbose("Total tweets : ", len(tweets)) try: verbose("Total usuarios : ", len(set([id for x, id in ids]))) except ValueError: verbose("Total usuarios : ", len(ids)) # Calculamos los features #metemos las stop words en una lista if not opts.stopwords: my_stop_words = []
from load_tweets import load_tweets
from feature_extractors import *
from stats_and_plots import *
import matplotlib.pyplot as plt

tweets = load_tweets('data/sample1.txt')

# Word-frequency features
tdm, vocab = word_counts(tweets)
print(most_used(tdm, vocab, 25))
print(least_used(tdm, vocab, 25))
word_bar(tdm, vocab, 25)
plt.show()

# Per-tweet features
feats = {}
feats['num_hashtags'] = num_hashtags(tweets)
feats['reply'] = reply(tweets)
feats['length'] = length(tweets)
feats['num_retweets'] = num_retweets(tweets)

print(most_hashtags(feats))
hashtag_hist(feats)
plt.show()
print(proportion_replies(feats))
print(avg_tweet_length(tdm))
length_hist(feats)
plt.show()
print(avg_retweets(feats))
num_retweets_hist(feats)
plt.show()
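# The extractors above live in feature_extractors; for illustration, a minimal
# version of num_hashtags might look like the sketch below (assuming each
# tweet is a dict with a 'text' field, as elsewhere in this repo; this is not
# the actual implementation):
def num_hashtags_sketch(tweets):
    """Count the '#' characters in each tweet's text."""
    return [tweet['text'].count('#') for tweet in tweets]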
    :return: (word, count) pairs for every word in the corpus
    '''
    cv = CountVectorizer()
    text = [t['text'] for t in tweets]
    tdm = cv.fit_transform(text)
    # Summing the document-term matrix over its rows gives corpus-wide counts
    word_counts = tdm.sum(axis=0).tolist()[0]
    assert len(cv.get_feature_names()) == len(word_counts)
    return zip(cv.get_feature_names(), word_counts)


if __name__ == '__main__':
    print('Loading...')
    samples = [load_tweets('data/sample%d.txt' % i) for i in range(1, 4)]
    print('Loaded.')
    print('Cleaning...')
    for i, tweets in enumerate(samples):
        tweets = remove_stop_words(tweets)
        tweets = remove_punctuation(tweets)
        tweets = remove_non_english(tweets)
        tweets = remove_links(tweets)
        tweets = remove_digits(tweets)
        tweets = remove_empty(tweets)
        # Write the cleaned tweets back; rebinding the loop variable alone
        # would leave `samples` unchanged
        samples[i] = tweets
    print('Cleaned.')
    print('Extract Trends')
    trend_extract_term_frequency(samples)
    print('Extracted')
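# Self-contained illustration of the counting technique above: summing the
# document-term matrix over its rows yields one corpus-wide count per word
# (the example documents below are made up):
from sklearn.feature_extraction.text import CountVectorizer

docs = ["good day", "good good night"]
cv = CountVectorizer()
tdm = cv.fit_transform(docs)
counts = tdm.sum(axis=0).tolist()[0]
print(dict(zip(cv.get_feature_names(), counts)))  # {'day': 1, 'good': 3, 'night': 1}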
if __name__ == "__main__": warnings.simplefilter("ignore") tweets = None sentiment_dir = "../sentiment/" sentiment_models = { "text_blob": find_text_blob_sentiment, "vader": find_vader_sentiment, } for model_name, model_function in sentiment_models.items(): sentiment_path = os.path.join(sentiment_dir, model_name) + ".pickle" if not os.path.exists(sentiment_path): if tweets is None: tweets = load_tweets() tweets = list(tweets.items()) results = Parallel(find_text_blob_sentiment, tweets, model_name) sentiment = {tweet_id: value for tweet_id, value in results} save_pickle(sentiment, sentiment_path) model_name = "flair" sentiment_path = os.path.join(sentiment_dir, model_name) + ".pickle" if not os.path.exists(sentiment_path): if tweets is None: tweets = load_tweets() tweets = list(tweets.items()) sentiment = find_flair_sentiment(tweets, chunk_len=100000) sentiment_models[model_name] = sentiment
    # Separate the data into train, validation, and test sets
    data, labels = load_polarity()
    class_names = ['negatif', 'positif']
    size_data = len(labels)
    train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
        data, labels, test_size=.9, random_state=42)
    train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
        train, train_labels, test_size=.1, random_state=42)
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)
    val_labels = np.array(val_labels)
elif dataset == 'tweet':
    # For Spanish data, replace load_tweets.load_tweets() with
    # load_tweets.load_tweets_es()
    class_names = load_tweets.transform_emoji()
    train, train_labels = load_tweets.load_tweets()
    train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
        train, train_labels, test_size=.9, random_state=42)
    train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
        train, train_labels, test_size=.2, random_state=42)
    size_data = len(train_labels) + len(test_labels)
elif 'bert' in dataset:
    class_names = ['negative', 'positive']
    df = pd.read_csv(
        'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
        delimiter='\t', header=None)
    # Number of sentence examples from the train set used for the BERT model
    # (limited because the model takes time to run)
    batch_1 = df[:size_batch_bert]
    train_vectors = batch_1[0]
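# Note on the 'tweet' splits above: test_size=.9 keeps only 10% of the data
# for training, and the second split then carves 20% of that off as the
# validation set. With 10,000 labeled tweets, that is roughly 800 train /
# 200 val / 9,000 test examples.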