import re

from gensim.parsing.preprocessing import stem


def tweet_preprocess(tweet_text: str):
    """Text preprocessing: strip URLs and special symbols, then stem each token."""
    tweet_text = tweet_text.lower()
    # Remove links pointing at known URL-shortener domains.
    tweet_text = re.sub(
        r"(?:https?://)?(?:(?:0rz\.tw)|(?:1link\.in)|(?:1url\.com)|(?:2\.gp)|(?:2big\.at)|(?:2tu\.us)|(?:3\.ly)|(?:307\.to)|(?:4ms\.me)|(?:4sq\.com)|(?:4url\.cc)|(?:6url\.com)|(?:7\.ly)|(?:a\.gg)|(?:a\.nf)|(?:aa\.cx)|(?:abcurl\.net)|(?:ad\.vu)|(?:adf\.ly)|(?:adjix\.com)|(?:afx\.cc)|(?:all\.fuseurl.com)|(?:alturl\.com)|(?:amzn\.to)|(?:ar\.gy)|(?:arst\.ch)|(?:atu\.ca)|(?:azc\.cc)|(?:b23\.ru)|(?:b2l\.me)|(?:bacn\.me)|(?:bcool\.bz)|(?:binged\.it)|(?:bit\.ly)|(?:bizj\.us)|(?:bloat\.me)|(?:bravo\.ly)|(?:bsa\.ly)|(?:budurl\.com)|(?:canurl\.com)|(?:chilp\.it)|(?:chzb\.gr)|(?:cl\.lk)|(?:cl\.ly)|(?:clck\.ru)|(?:cli\.gs)|(?:cliccami\.info)|(?:clickthru\.ca)|(?:clop\.in)|(?:conta\.cc)|(?:cort\.as)|(?:cot\.ag)|(?:crks\.me)|(?:ctvr\.us)|(?:cutt\.us)|(?:dai\.ly)|(?:decenturl\.com)|(?:dfl8\.me)|(?:digbig\.com)|(?:digg\.com)|(?:disq\.us)|(?:dld\.bz)|(?:dlvr\.it)|(?:do\.my)|(?:doiop\.com)|(?:dopen\.us)|(?:easyuri\.com)|(?:easyurl\.net)|(?:eepurl\.com)|(?:eweri\.com)|(?:fa\.by)|(?:fav\.me)|(?:fb\.me)|(?:fbshare\.me)|(?:ff\.im)|(?:fff\.to)|(?:fire\.to)|(?:firsturl\.de)|(?:firsturl\.net)|(?:flic\.kr)|(?:flq\.us)|(?:fly2\.ws)|(?:fon\.gs)|(?:freak\.to)|(?:fuseurl\.com)|(?:fuzzy\.to)|(?:fwd4\.me)|(?:fwib\.net)|(?:g\.ro.lt)|(?:gizmo\.do)|(?:gl\.am)|(?:go\.9nl.com)|(?:go\.ign.com)|(?:go\.usa.gov)|(?:goo\.gl)|(?:goshrink\.com)|(?:gurl\.es)|(?:hex\.io)|(?:hiderefer\.com)|(?:hmm\.ph)|(?:href\.in)|(?:hsblinks\.com)|(?:htxt\.it)|(?:huff\.to)|(?:hulu\.com)|(?:hurl\.me)|(?:hurl\.ws)|(?:icanhaz\.com)|(?:idek\.net)|(?:ilix\.in)|(?:is\.gd)|(?:its\.my)|(?:ix\.lt)|(?:j\.mp)|(?:jijr\.com)|(?:kl\.am)|(?:klck\.me)|(?:korta\.nu)|(?:krunchd\.com)|(?:l9k\.net)|(?:lat\.ms)|(?:liip\.to)|(?:liltext\.com)|(?:linkbee\.com)|(?:linkbun\.ch)|(?:liurl\.cn)|(?:ln-s\.net)|(?:ln-s\.ru)|(?:lnk\.gd)|(?:lnk\.ms)|(?:lnkd\.in)|(?:lnkurl\.com)|(?:lru\.jp)|(?:lt\.tl)|(?:lurl\.no)|(?:macte\.ch)|(?:mash\.to)|(?:merky\.de)|(?:migre\.me)|(?:miniurl\.com)|(?:minurl\.fr)|(?:mke\.me)|(?:moby\.to)|(?:moourl\.com)|(?:mrte\.ch)|(?:myloc\.me)|(?:myurl\.in)|(?:n\.pr)|(?:nbc\.co)|(?:nblo\.gs)|(?:nn\.nf)|(?:not\.my)|(?:notlong\.com)|(?:nsfw\.in)|(?:nutshellurl\.com)|(?:nxy\.in)|(?:nyti\.ms)|(?:o-x\.fr)|(?:oc1\.us)|(?:om\.ly)|(?:omf\.gd)|(?:omoikane\.net)|(?:on\.cnn.com)|(?:on\.mktw.net)|(?:onforb\.es)|(?:orz\.se)|(?:ow\.ly)|(?:ping\.fm)|(?:pli\.gs)|(?:pnt\.me)|(?:politi\.co)|(?:post\.ly)|(?:pp\.gg)|(?:profile\.to)|(?:ptiturl\.com)|(?:pub\.vitrue.com)|(?:qlnk\.net)|(?:qte\.me)|(?:qu\.tc)|(?:qy\.fi)|(?:r\.im)|(?:rb6\.me)|(?:read\.bi)|(?:readthis\.ca)|(?:reallytinyurl\.com)|(?:redir\.ec)|(?:redirects\.ca)|(?:redirx\.com)|(?:retwt\.me)|(?:ri\.ms)|(?:rickroll\.it)|(?:riz\.gd)|(?:rt\.nu)|(?:ru\.ly)|(?:rubyurl\.com)|(?:rurl\.org)|(?:rww\.tw)|(?:s4c\.in)|(?:s7y\.us)|(?:safe\.mn)|(?:sameurl\.com)|(?:sdut\.us)|(?:shar\.es)|(?:shink\.de)|(?:shorl\.com)|(?:short\.ie)|(?:short\.to)|(?:shortlinks\.co.uk)|(?:shorturl\.com)|(?:shout\.to)|(?:show\.my)|(?:shrinkify\.com)|(?:shrinkr\.com)|(?:shrt\.fr)|(?:shrt\.st)|(?:shrten\.com)|(?:shrunkin\.com)|(?:simurl\.com)|(?:slate\.me)|(?:smallr\.com)|(?:smsh\.me)|(?:smurl\.name)|(?:sn\.im)|(?:snipr\.com)|(?:snipurl\.com)|(?:snurl\.com)|(?:sp2\.ro)|(?:spedr\.com)|(?:srnk\.net)|(?:srs\.li)|(?:starturl\.com)|(?:su\.pr)|(?:surl\.co.uk)|(?:surl\.hu)|(?:t\.cn)|(?:t\.co)|(?:t\.lh.com)|(?:ta\.gd)|(?:tbd\.ly)|(?:tcrn\.ch)|(?:tgr\.me)|(?:tgr\.ph)|(?:tighturl\.com)|(?:tiniuri\.com)|(?:tiny\.cc)|(?:tiny\.ly)|(?:tiny\.pl)|(?:tinylink\.in)|(?:tinyuri\.ca)|(?:tinyurl\.com)|(?:tl\.gd)|(?:tmi\.me)|(?:tnij\.org)|(?:tnw\.to)|(?:tny\.com)|(?:to\.ly)|(?:togoto\.us)|(?:totc\.us)|(?:toysr\.us)|(?:tpm\.ly)|(?:tr\.im)|(?:tra\.kz)|(?:trunc\.it)|(?:twhub\.com)|(?:twirl\.at)|(?:twitclicks\.com)|(?:twitterurl\.net)|(?:twitterurl\.org)|(?:twiturl\.de)|(?:twurl\.cc)|(?:twurl\.nl)|(?:u\.mavrev.com)|(?:u\.nu)|(?:u76\.org)|(?:ub0\.cc)|(?:ulu\.lu)|(?:updating\.me)|(?:ur1\.ca)|(?:url\.az)|(?:url\.co.uk)|(?:url\.ie)|(?:url360\.me)|(?:url4\.eu)|(?:urlborg\.com)|(?:urlbrief\.com)|(?:urlcover\.com)|(?:urlcut\.com)|(?:urlenco\.de)|(?:urli\.nl)|(?:urls\.im)|(?:urlshorteningservicefortwitter\.com)|(?:urlx\.ie)|(?:urlzen\.com)|(?:usat\.ly)|(?:use\.my)|(?:vb\.ly)|(?:vgn\.am)|(?:vl\.am)|(?:vm\.lc)|(?:w55\.de)|(?:wapo\.st)|(?:wapurl\.co.uk)|(?:wipi\.es)|(?:wp\.me)|(?:x\.vu)|(?:xr\.com)|(?:xrl\.in)|(?:xrl\.us)|(?:xurl\.es)|(?:xurl\.jp)|(?:y\.ahoo.it)|(?:yatuc\.com)|(?:ye\.pe)|(?:yep\.it)|(?:yfrog\.com)|(?:yhoo\.it)|(?:yiyd\.com)|(?:youtu\.be)|(?:yuarel\.com)|(?:z0p\.de)|(?:zi\.ma)|(?:zi\.mu)|(?:zipmyurl\.com)|(?:zud\.me)|(?:zurl\.ws)|(?:zz\.gd)|(?:zzang\.kr)|(?:›\.ws)|(?:✩\.ws)|(?:✿\.ws)|(?:❥\.ws)|(?:➔\.ws)|(?:➞\.ws)|(?:➡\.ws)|(?:➨\.ws)|(?:➯\.ws)|(?:➹\.ws)|(?:➽\.ws))/[a-z0-9]*/?",
        "", tweet_text)
    # Replace punctuation and brackets with spaces, collapse whitespace,
    # trim, and drop stray dashes / pipes, then stem each remaining token.
    tweet_text = re.sub(r"[?!,.#…@:*\"/•]", " ", tweet_text)
    tweet_text = re.sub(r"[()]", " ", tweet_text)
    tweet_text = re.sub(r"\s{2,}", " ", tweet_text)
    tweet_text = re.sub(r"(^\s)|(\s$)", "", tweet_text)
    tweet_text = re.sub(r"\s[–\-|]|(--)\s", " ", tweet_text)
    return [stem(t) for t in tweet_text.split()]
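A quick usage sketch for tweet_preprocess; the sample tweet and the expected output are illustrative, not taken from the original code:

sample = "RT @user: Check this out! https://bit.ly/abc123 #NLP (so cool)"
print(tweet_preprocess(sample))
# e.g. ['rt', 'user', 'check', 'thi', 'out', 'nlp', 'so', 'cool']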
def _valid_word(self, word):
    """Check if the given word is valid."""
    # Stopwords check
    if word in self.stopwords:
        return False
    # Punctuation check
    if not word.isalpha():
        return False
    # Accent check
    if not word.isascii():
        return False
    # Vowel + y check
    if set("aeiouy").isdisjoint(word):
        return False
    # Length check
    if not 2 <= len(word) <= 12:
        return False
    # Plural check
    if pluralize(word) in self.codenames_words:
        return False
    # Singular check
    if singularize(word) in self.codenames_words:
        return False
    # Stem check
    if stem(word) in self.stemmed_codenames_words:
        return False
    # Containment check
    for codenames_word in self.codenames_words:
        if codenames_word in word or word in codenames_word:
            return False
    return True
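_valid_word expects its host object to expose stopwords, codenames_words, and stemmed_codenames_words, and it relies on pluralize/singularize helpers (presumably from the Pattern library, i.e. from pattern.en import pluralize, singularize). A minimal sketch of exercising it with a stand-in object; everything below other than _valid_word and stem is invented for illustration:

from types import SimpleNamespace

board = ["apple", "bank"]
ctx = SimpleNamespace(
    stopwords={"the", "and"},
    codenames_words=set(board),
    stemmed_codenames_words={stem(w) for w in board},
)
print(_valid_word(ctx, "orchard"))  # True: passes every check
print(_valid_word(ctx, "apples"))   # False: its singular form is a board word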
"""
Created on Tue Mar 31 10:08:01 2020

@author: Matteo
"""
import re

import numpy as np
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import stem

#%% Train data
with open(
        r'C:\Users\Matteo\Documents\Git\aLDA\data\wikitext-2-raw\wiki.train.raw',
        encoding="utf8") as file:
    dataRaw = file.read()

dataRaw = stem(dataRaw)

# Re-encode nested '=' heading markers so the split below only breaks
# on top-level " = " headings.
data = dataRaw.replace('= = = = ', '+ + + +')
data = data.replace('= = =', '- - -')
data = data.replace('= =', '* *')
data = data.split(" = ")

datagensim = []
regex = re.compile('[^a-zA-Z ]')
for d in data:
    # First argument of sub() is the replacement, second is the input string.
    test = regex.sub('', d)
    # Keep only reasonably long sections, tokenised into words longer than 3 characters.
    if len(test) > 100:
        datagensim += [[i for i in test.split(" ") if len(i) > 3]]
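The snippet stops after building datagensim. A minimal continuation sketch, mirroring the dictionary step that the 20-newsgroups script further below performs (the filtering thresholds are copied from that script; the bag-of-words line is an added illustration):

dct = Dictionary(datagensim)
dct.filter_extremes(keep_n=50000, no_above=0.8)
dct.compactify()
bow_corpus = [dct.doc2bow(doc) for doc in datagensim]  # one sparse bag-of-words vector per section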
import os

import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import (remove_stopwords, strip_punctuation,
                                          strip_non_alphanum, strip_numeric,
                                          strip_multiple_whitespaces, stem)
from nltk.tokenize import word_tokenize

file_dir = os.path.join('C:\\', 'Users', 'cruze', 'Documents', 'CS664')
inputfile = os.path.join(file_dir, 'train_E6oV3lV.csv')
df = pd.read_csv(inputfile)
corpus = df['tweet']
df = []

# Clean each tweet: drop stopwords, punctuation, non-alphanumerics and digits,
# collapse whitespace, then stem.
for msg in corpus:
    string = remove_stopwords(msg)
    string = strip_punctuation(string)
    string = strip_non_alphanum(string)
    string = strip_numeric(string)
    string = strip_multiple_whitespaces(string)
    string = stem(string)
    df.append(string)
corpus = df

#out = pd.DataFrame(data=corpus)
#out.to_csv('chatOut.csv', index_label=False)

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
               for i, _d in enumerate(corpus)]

max_epochs = 50
vec_size = 50
alpha = 0.025

model = Doc2Vec(size=vec_size, alpha=alpha)  # call truncated in the source; see the sketch below
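The Doc2Vec constructor call breaks off in the source. A minimal sketch of how training might continue, assuming the pre-4.0 gensim API implied by the size= keyword; the learning-rate schedule below is an illustrative guess, not taken from the snippet:

model.build_vocab(tagged_data)
for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    model.alpha -= 0.0002          # decay the learning rate each pass
    model.min_alpha = model.alpha  # keep the floor at the current rate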
def __getitem__(self, item):
    return [stem(w) for w in self.full_tweet_split[item]]
def analyze(topics,
            start_prediction=int(time.time() * 1000 - 604800000),
            end_prediction=None,
            select_top=10,
            print_select_tops=True,
            key_words=None):
    start_time = time.time()
    tt = TextTeaser()  # used for extracting summaries
    print("Start date:", start_prediction)

    print("Full texts")
    full_tweets = FullTweets(start_prediction, end_prediction, key_words)  # raw texts
    print("After cleaning")
    full_split = FullTweetTextSplit(start_prediction, end_prediction)  # cleaned and split texts
    print("Stemming")
    full_stemmed = TweetStemmed(start_prediction, end_prediction)  # stemmed texts
    print("Number of papers:", len(full_tweets))

    # Build the dictionary and strip stopwords and hapaxes from it.
    dictionary = corpora.Dictionary(
        tweet_preprocess(item) for item in full_stemmed)
    print("initialization finished")
    stop_list = set(
        'a an the of at on upon in to from out as so such or and those this these that for is was am are \'s been were what when where who will with the www'
        .split())
    stop_ids = [
        dictionary.token2id[stop_word] for stop_word in stop_list
        if stop_word in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1
    ]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()
    print("dictionary finished")

    # Build the TF-IDF model.
    corpus = TFIDFCorpus(full_stemmed, dictionary)
    print('corpus finished')
    # noinspection PyPep8Naming
    tfIdfModel = models.TfidfModel(corpus)
    print('TF-IDF finished')

    # Build the bag-of-words representation of the query topics.
    topics_bow = dictionary.doc2bow(
        [stem(t.replace(' ', '_')) for t in topics])
    topics_tfidf = tfIdfModel[topics_bow]
    print("related topics finished")

    # Build the matrix similarity index.
    print(len(dictionary))
    index = similarities.MatrixSimilarity(tfIdfModel[corpus])
    print("similarity index finished")
    sims0 = index[topics_tfidf]
    sims = sorted(enumerate(sims0), key=lambda item: -item[1])
    print("similarity sorting finished")

    # Remove duplicates: drop consecutive results whose first 32 characters match.
    print(len(full_tweets))
    tweet_texts = [' '.join(t) for t in full_split]
    current_pointer = 0
    while current_pointer < len(sims) - 1:
        this_index = sims[current_pointer][0]
        next_index = sims[current_pointer + 1][0]
        this_text = re.sub(r'rt \S* ', '', tweet_texts[this_index]).strip()
        next_text = re.sub(r'rt \S* ', '', tweet_texts[next_index]).strip()
        if this_text[0:32] == next_text[0:32]:
            del sims[current_pointer + 1]
        else:
            current_pointer += 1
    existing = [False] * len(full_tweets)
    for i in sims:
        existing[i[0]] = True
    print(len(sims))
    print(sims[0:select_top])

    result_str_list = []
    if print_select_tops:
        final_result = sims[0:select_top]
        increment_flag = 1
        for item in final_result:
            t = full_tweets[item[0]]
            result_str = """{increment_flag}.\n{title}\n{authors}\nhttps://arxiv.org/abs/{arxiv_id}\n{abstract}""".format(
                increment_flag=increment_flag,
                title=t.title.replace('\n ', ''),
                authors=t.authors,
                arxiv_id=t.user_id.replace('arXiv:', ''),
                abstract=t.description)
            result_str_list.append(result_str)
            increment_flag += 1
    print(str(time.time() - start_time))

    # Send the recommendation email.
    result_email_text = 'Papers' + '\n\n\n'.join(result_str_list)
    if end_prediction is None:
        end_prediction = int(time.time() * 1000)
        start_prediction = int(time.time() * 1000 - 604800000)
    subject = "This week's " + topics[0] + ' paper recommendations: {start_date}-{end_date}'.format(
        start_date=datetime.fromtimestamp(
            start_prediction / 1000).strftime("%Y.%m.%d"),
        end_date=datetime.fromtimestamp(
            end_prediction / 1000).strftime("%Y.%m.%d"))
    sendEmail(subject, result_email_text)

    # Calculate accuracy (hit rate within the top recommendations).
    acc_count = 0
    top_indexes = [i[0] for i in sims[0:select_top]]
    for _idx, tweet in enumerate(full_tweets):
        if _idx in top_indexes:
            t = tweet.description
            if labelling(t, topics):
                acc_count += 1
    hit = acc_count / select_top

    # Calculate loss (cross entropy).
    q = []
    p = []
    for _idx, tweet in enumerate(full_tweets):
        if not existing[_idx]:
            continue
        q.append(log(sims0[_idx] if sims0[_idx] > 0 else 1e-20))
        p.append(1.0 if labelling(tweet.description, topics) else 0.0)
    loss = -np.dot(p, q)

    # Calculate precision & recall counters: [TP, FP, FN, TN].
    pr_counter = [0, 0, 0, 0]
    for _idx, tweet in enumerate(full_tweets):
        if not existing[_idx]:
            continue
        t = tweet.title.lower() + "." + tweet.description
        if _idx in top_indexes and labelling(t, topics):
            pr_counter[0] += 1
        elif _idx in top_indexes and not labelling(t, topics):
            pr_counter[1] += 1
        elif _idx not in top_indexes and labelling(t, topics):
            pr_counter[2] += 1
        elif _idx not in top_indexes and not labelling(t, topics):
            pr_counter[3] += 1
    accuracy = (pr_counter[0] + pr_counter[3]) / len(sims)

    print(hit)
    print(loss)
    print(acc_count)
    print(accuracy)
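The pr_counter tally is introduced under a "precision & recall" comment, but only accuracy is ever derived from it. A small sketch of the missing step (not in the original; it would sit at the end of analyze, reusing the [TP, FP, FN, TN] layout above):

tp, fp, fn, tn = pr_counter
precision = tp / (tp + fp) if (tp + fp) else 0.0  # share of recommended papers that are relevant
recall = tp / (tp + fn) if (tp + fn) else 0.0     # share of relevant papers that were recommended
print(precision)
print(recall)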
def __iter__(self):
    for tweet in self.full_tweet_split:
        yield [stem(w) for w in tweet]
def get_processed_stems(self):
    return prep.stem(
        prep.remove_stopwords(prep.strip_non_alphanum(self.text))).split()

def get_stems(self):
    return prep.stem(self.text).split()
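Both accessors assume prep is gensim.parsing.preprocessing aliased at import time and that the object carries a raw text attribute. A minimal usage sketch under those assumptions; the Document class and the sample sentence are invented for illustration:

from gensim.parsing import preprocessing as prep

class Document:
    def __init__(self, text):
        self.text = text

    def get_stems(self):
        return prep.stem(self.text).split()

    def get_processed_stems(self):
        return prep.stem(
            prep.remove_stopwords(prep.strip_non_alphanum(self.text))).split()

doc = Document("The cats are running quickly, aren't they?")
print(doc.get_stems())            # every token stemmed, stopwords kept
print(doc.get_processed_stems())  # punctuation and stopwords stripped before stemming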
""" Created on Fri Apr 10 10:14:59 2020 @author: Matteo """ from sklearn.datasets import fetch_20newsgroups import numpy as np from gensim.corpora import Dictionary from gensim.parsing.preprocessing import stem from gensim.models import AuthorTopicModel,LdaModel,LdaMulticore import re #%% Importing directly from raw text newsgroups_train = fetch_20newsgroups(subset='train', remove = 'headers') dataRaw = newsgroups_train.data data = [stem(i) for i in dataRaw] datagensim = [] regex = re.compile('[^a-zA-Z ]') for d in data[:200]: #First parameter is the replacement, second parameter is your input string test = regex.sub('', d) #Out: 'abdE' if len(test)>100: datagensim += [[i.lower() for i in test.split(" ") if len(i)>2]] #gensim.utils.lemmatize( dct = Dictionary(datagensim) dct.filter_extremes(keep_n=50000, no_above=0.8 ) dct.compactify()
from gensim.parsing.preprocessing import stem
from nltk.stem import WordNetLemmatizer


def lemmatize_stemming(text):
    # Lemmatize as a verb first, then apply gensim's Porter stemmer.
    return stem(WordNetLemmatizer().lemmatize(text, pos='v'))
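A quick usage sketch for lemmatize_stemming; it assumes the NLTK WordNet data has already been fetched (e.g. via nltk.download('wordnet')), and the sample tokens are illustrative:

for token in ["running", "went", "studies", "better"]:
    print(token, "->", lemmatize_stemming(token))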