Esempio n. 1
0
def getKeywords(text, useless):
	"""Return *text* as a TextBlob with every word listed in *useless* removed.

	The text is tokenized with ``TextBlob(text).words``; the surviving tokens
	are re-joined with single spaces, so punctuation and original spacing are
	not preserved in the result.

	Args:
		text: Raw string to tokenize.
		useless: Iterable of words to drop (matched by equality, case-sensitive).

	Returns:
		A ``TextBlob`` built from the remaining words.
	"""
	# Compare by value, not identity: `is` only matches when both names point
	# at the very same string object, so equal words were almost never caught.
	blocked = set(useless)
	# Build a new list instead of removing during iteration; the TextBlob
	# object itself exposes no remove() (see the TextBlob API reference).
	kept = [word for word in TextBlob(text).words if word not in blocked]
	return TextBlob(' '.join(kept))
Esempio n. 2
0
def remove_stopwords(text, stopwords=final_stopwords):
    """Lower-case *text*, drop noisy tokens and stopwords, and re-join the rest.

    A token is dropped when it contains any of the hard-coded fragments below
    (substring match) or when it is a member of *stopwords*.

    Args:
        text: Input string.
        stopwords: Container of words to remove; defaults to the module-level
            ``final_stopwords``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NOTE(review): matching is by substring, as in the original, so e.g.
    # 'the' also drops "other" and 'and' drops "hand" — confirm this is wanted.
    fragments = (
        '@', '#', 'https', 'http', 'www.', '.com', '=', ',', "'", 'the', 'and',
        '\'s',
    )
    tokens = TextBlob(text.lower()).words
    # Filter into a new list: calling list.remove() while iterating the same
    # list skips the element that follows every removal, which let adjacent
    # stopwords/symbol tokens slip through.
    kept = [
        token for token in tokens
        if not any(fragment in token for fragment in fragments)
        and token not in stopwords
    ]
    return ' '.join(kept)
Esempio n. 3
0
# Finish the duration plot: axis label, title, and mean/median markers.
plt.ylabel('How many talks in that duration')
plt.title('TED duration Distribution')
# Keep the handles returned by axvline so the legend entries are bound to
# these two lines explicitly rather than to whichever artists were drawn first.
mean_line = plt.axvline(x=td['duration'].mean(), linestyle='--')
median_line = plt.axvline(
    x=td['duration'].median(), color='#FFFF7F', linestyle='-.')
plt.legend(
    [mean_line, median_line],
    ['mean of duration', 'median of duration'],
    loc='upper right',
)
plt.show()
# Top-10 ranked tags.
# Each `tags` cell is split on single quotes and the bracket/comma/quote
# fragments are discarded. Presumably a cell looks like "['a', 'b']" — verify
# against the dataset.
m = ['[', "'", ',', ']']
# Filter into new lists instead of calling remove() on the list being
# iterated, and skip empty pieces so that piece[0] cannot raise IndexError.
tags_split = [
    [piece for piece in raw.split("'") if piece and piece[0] not in m]
    for raw in title_rank['tags']
]
title_rank['tags_split'] = tags_split
# Unique tags in first-seen order; dict.fromkeys avoids the quadratic
# "is it already in the list" scan.
indi_tag = list(dict.fromkeys(tag for row in tags_split for tag in row))
# Number of tags per talk.
tags_count = [len(row) for row in tags_split]
title_rank['tags_count'] = tags_count

# Empty mapping created here; nothing in the visible code fills it yet.
indi_tag_view = {}
# Map each talk's view count to its list of tags (later duplicates of the same
# view count overwrite earlier ones, as with any dict built from pairs).
view_tag = {
    views: tags
    for views, tags in zip(title_rank.views, title_rank.tags_split)
}