def getKeywords(text, useless):
    """Return ``text`` as a TextBlob with the words listed in ``useless`` removed.

    Args:
        text: Raw string to tokenise with ``TextBlob``.
        useless: Iterable of words to discard. Matching is exact and
            case-sensitive; the entries must be hashable (plain strings).

    Returns:
        A new ``TextBlob`` built from the surviving word tokens joined by
        single spaces, so the original spacing is not preserved.
    """
    # Compare by value, not identity: the tokens are freshly created objects,
    # so an `is` test against the caller's strings never matches.
    blocked = frozenset(useless)
    # Build a filtered list instead of deleting from the blob while iterating.
    kept = [word for word in TextBlob(text).words if word not in blocked]
    return TextBlob(' '.join(kept))
def remove_stopwords(text, stopwords=final_stopwords):
    """Lower-case ``text``, drop noisy tokens and stopwords, and re-join the rest.

    A token is dropped when it contains any of the marker substrings in
    ``symbs`` (URL pieces, punctuation, ``'the'``, ``'and'``, ...) or when it
    is a member of ``stopwords``.

    Args:
        text: Raw string; it is lower-cased before tokenising with ``TextBlob``.
        stopwords: Container of words to discard (tested with ``in``).
            Defaults to the module-level ``final_stopwords``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    symbs = [
        '@', '#', 'https', 'http', 'www.', '.com', '=', ',', "'", 'the',
        'and', '\'s'
    ]
    words = TextBlob(text.lower()).words
    # Filter in a single pass into a new list. Removing items from the list
    # being iterated skips the element that follows each removal, which let
    # some unwanted tokens slip through.
    # NOTE(review): the marker test is a substring match, so 'the' also drops
    # words such as 'other' -- kept as-is; confirm this is intended.
    kept = [
        word for word in words
        if not any(marker in word for marker in symbs)
        and word not in stopwords
    ]
    return ' '.join(kept)
# Finish the duration histogram: label it and mark the mean and median.
plt.ylabel('How many talks in that duration')
plt.title('TED duration Distribution')
plt.axvline(x=td['duration'].mean(), linestyle='--')
plt.axvline(x=td['duration'].median(), color='#FFFF7F', linestyle='-.')
plt.legend(['mean of duration', 'median of duration'], loc='upper right')
plt.show()

# Top-10 ranked tags.
# Fragments that start with one of these characters ( [ ' , ] ) are list
# punctuation left over from splitting, not tag names.
m = ['[', "'", ',', ']']

# Each 'tags' entry is split on single quotes; the fragments between the
# quotes are kept as tag names.
# NOTE(review): assumes every entry is a string that looks like
# "['a', 'b']" -- confirm against the data source.
tags_split = []
for t in title_rank['tags']:
    # Build a filtered list rather than removing from `t` while iterating it
    # (which skips elements), and skip empty fragments so `piece[0]` cannot
    # raise IndexError.
    tags_split.append(
        [piece for piece in t.split("'") if piece and piece[0] not in m]
    )
title_rank['tags_split'] = tags_split

# Unique tags in first-seen order; dict keys deduplicate without the
# quadratic `in list` scan.
indi_tag = list(dict.fromkeys(w for row in tags_split for w in row))

# Number of tags attached to each row.
tags_count = [len(t) for t in title_rank['tags_split']]
title_rank['tags_count'] = tags_count

indi_tag_view = {}
# Map each view count to that row's tag list; rows sharing a view count
# overwrite one another (last one wins).
view_tag = dict(zip(title_rank.views, title_rank.tags_split))