Example #1
import csv
import pytm  # assumed import; the snippet refers to the topic-model library as `pytm`

# Cleaning helpers (removeUsernames, removeURLs, removeSpecialChar, word_tokenization)
# and `tagger` are assumed to be defined elsewhere in the project.
train_ls = [str(i) for i in range(6, 28)]
corpus = []
count = 0
for d in train_ls:
    fn = "training_tweets/{}.txt".format(d)
    with open(fn, 'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for row in csv_reader:
            t = removeSpecialChar(removeURLs(removeUsernames(row[3])))
            corpus.append(word_tokenization(t, tagger))
            count += 1
    print("File {} is done".format(d))

print("Total tweets are", count)
docs = pytm.DocumentSet(corpus, min_df=5, max_df=0.5)
print("Corpus Created")

# Applying LDA to our dataset
n_topics = 100
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs,
        n_iteration=1000,
        B=1000,
        n_inner_iteration=5,
        n_hyper_iteration=20,
        J=5)
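# (Assumption) In stochastic variational inference terms these arguments most
# likely mean: n_iteration = number of SVI updates, B = minibatch size,
# n_inner_iteration = local variational passes per minibatch, and
# n_hyper_iteration = hyperparameter update steps; the meaning of J and the
# exact semantics should be checked against pytm's documentation.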
print("LDA fitted")

topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
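
The cleaning helpers used above (removeUsernames, removeURLs, removeSpecialChar,
word_tokenization) are not part of the snippet. A minimal sketch of what the first
two presumably do, with illustrative regex patterns; word_tokenization(t, tagger)
is assumed to return a list of tokens produced by the given tagger and is not
sketched here:

import re

def removeURLs(tweet):
    # Assumed helper: drop http(s) links (pattern chosen for illustration)
    return re.sub(r"https?://\S+", " ", tweet)

def removeUsernames(tweet):
    # Assumed helper: drop @mentions
    return re.sub(r"@\w+", " ", tweet)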
Example #2

def removeSpecialChar(original):
    # Reconstructed from a garbled fragment: strip non-alphanumeric characters, line by line
    return " ".join(re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in original.split("\n"))

def remove2Char(tweet):
    # Keep only words with at least three characters
    new_tweet = ""
    for w in tweet.split():
        if len(w) >= 3:
            new_tweet = new_tweet + w + " "
    return new_tweet
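
removeStopWords is used below but, like the other helpers, is not defined in the
visible snippet. A minimal sketch, assuming a plain English stop-word list (the
real helper may rely on NLTK or a project-specific list):

STOP_WORDS = {"the", "a", "an", "and", "or", "is", "are", "to", "of", "in",
              "it", "this", "that", "for", "on", "with"}

def removeStopWords(tweet):
    # Assumed helper: drop common English stop words (word list is illustrative)
    return " ".join(w for w in tweet.split() if w.lower() not in STOP_WORDS)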

only_text = []
with open('all_english_tweets.txt', mode = 'r') as f:
    csv_reader = csv.reader((x.replace('\0', '') for x in f), delimiter = ',')
    for row in csv_reader:
        only_text.append(remove2Char(removeSpecialChar(removeStopWords(removeUsernames(removeURLs(row[1]))))))

docs = pytm.DocumentSet(only_text, min_df=5, max_df=0.5)

# Applying LDA to our dataset
n_topics = 1000
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs, n_iteration=1000, B=1000, n_inner_iteration=5, n_hyper_iteration=20, J=5)

# Getting topics and their alpha values
topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
for k, alpha in enumerate(alphas):
    vocab = docs.get_vocab()
    phi = lda.get_phi(k)
    new_phi = np.around(list(phi), decimals = 3)
    print('topic {0} (alpha = {1})'.format(k, np.around(alpha, decimals = 2)))
    a = sorted(zip(vocab, new_phi), key=lambda x: -x[1])[:10]
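    # Hypothetical continuation (not shown in the original snippet): collect and
    # print the ten highest-probability words of this topic
    top_words = [w for w, p in a]
    topic_list.append(top_words)
    print(" ".join(top_words))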
Example #3

def remove2Char(tweet):
    # Truncated in the snippet; equivalent to the helper defined in Example #2
    new_tweet = " ".join(w for w in tweet.split() if len(w) >= 3)
    return new_tweet

contents = []
years = []

with open('nips-papers/papers.csv', mode = 'r') as f:
    csv_reader = csv.reader((x.replace('\0', '') for x in f), delimiter = ',')
    for row in csv_reader:
        txt = remove2Char(removeSpecialChar(removeStopWords(removeURLs(row[6]))))
        if len(txt) > 500:
            contents.append(txt)
            years.append(row[1])
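# Note: `years` is collected above but not used in the visible part of the
# snippet; presumably it is kept for a later per-year breakdown of the topics.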


print("Corpus Created")
docs = pytm.DocumentSet(contents, min_df=5, max_df=0.5)

# Applying LDA to our dataset
n_topics = 60
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs, n_iteration=1000, B=1000, n_inner_iteration=5, n_hyper_iteration=20, J=5)

# Getting topics and their alpha values
topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
for k, alpha in enumerate(alphas):
    vocab = docs.get_vocab()
    phi = lda.get_phi(k)
    new_phi = np.around(list(phi), decimals = 3)
    print('topic {0} (alpha = {1})'.format(k, np.around(alpha, decimals = 2)))
    a = sorted(zip(vocab, new_phi), key=lambda x: -x[1])[:50]
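
Every example builds its DocumentSet with min_df=5 and max_df=0.5. Assuming pytm
follows the usual document-frequency convention (an integer min_df is an absolute
document count, a float max_df is a fraction of the corpus), the vocabulary pruning
amounts to something like this sketch, which is illustrative only and not pytm's
implementation:

from collections import Counter

def prune_vocab(docs, min_df=5, max_df=0.5):
    # Count in how many documents each term occurs (documents are whitespace-split strings)
    df = Counter()
    for doc in docs:
        df.update(set(doc.split()))
    n_docs = len(docs)
    # Keep terms occurring in at least min_df documents and in at most max_df of all documents
    return {t for t, c in df.items() if c >= min_df and c / n_docs <= max_df}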

only_text = []
max_doc_size = 1000
count = 0
with open('ordered_day_hashtag_dtm.txt', mode='r') as f:
    csv_reader = csv.reader((x.replace('\0', '') for x in f), delimiter=',')
    for row in csv_reader:
        only_text.append(
            remove2Char(removeSpecialChar(removeStopWords(removeURLs(
                row[2])))))
        count += 1
        if count == max_doc_size:
            break

docs = pytm.DocumentSet(only_text, min_df=5, max_df=0.5)

# Applying LDA to our dataset
n_topics = 20
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs,
        n_iteration=1000,
        B=1000,
        n_inner_iteration=5,
        n_hyper_iteration=20,
        J=5)

# Getting topics and their alpha values
topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
for k, alpha in enumerate(alphas):