Code Example #1
import re

from gensim.parsing.preprocessing import stem


def tweet_preprocess(tweet_text: str):
    """
    Preprocess tweet text: lowercase it, strip shortened URLs, and remove special characters.
    """
    tweet_text = tweet_text.lower()
    tweet_text = re.sub(
        r"(?:https?://)?(?:(?:0rz\.tw)|(?:1link\.in)|(?:1url\.com)|(?:2\.gp)|(?:2big\.at)|(?:2tu\.us)|(?:3\.ly)|(?:307\.to)|(?:4ms\.me)|(?:4sq\.com)|(?:4url\.cc)|(?:6url\.com)|(?:7\.ly)|(?:a\.gg)|(?:a\.nf)|(?:aa\.cx)|(?:abcurl\.net)|(?:ad\.vu)|(?:adf\.ly)|(?:adjix\.com)|(?:afx\.cc)|(?:all\.fuseurl.com)|(?:alturl\.com)|(?:amzn\.to)|(?:ar\.gy)|(?:arst\.ch)|(?:atu\.ca)|(?:azc\.cc)|(?:b23\.ru)|(?:b2l\.me)|(?:bacn\.me)|(?:bcool\.bz)|(?:binged\.it)|(?:bit\.ly)|(?:bizj\.us)|(?:bloat\.me)|(?:bravo\.ly)|(?:bsa\.ly)|(?:budurl\.com)|(?:canurl\.com)|(?:chilp\.it)|(?:chzb\.gr)|(?:cl\.lk)|(?:cl\.ly)|(?:clck\.ru)|(?:cli\.gs)|(?:cliccami\.info)|(?:clickthru\.ca)|(?:clop\.in)|(?:conta\.cc)|(?:cort\.as)|(?:cot\.ag)|(?:crks\.me)|(?:ctvr\.us)|(?:cutt\.us)|(?:dai\.ly)|(?:decenturl\.com)|(?:dfl8\.me)|(?:digbig\.com)|(?:digg\.com)|(?:disq\.us)|(?:dld\.bz)|(?:dlvr\.it)|(?:do\.my)|(?:doiop\.com)|(?:dopen\.us)|(?:easyuri\.com)|(?:easyurl\.net)|(?:eepurl\.com)|(?:eweri\.com)|(?:fa\.by)|(?:fav\.me)|(?:fb\.me)|(?:fbshare\.me)|(?:ff\.im)|(?:fff\.to)|(?:fire\.to)|(?:firsturl\.de)|(?:firsturl\.net)|(?:flic\.kr)|(?:flq\.us)|(?:fly2\.ws)|(?:fon\.gs)|(?:freak\.to)|(?:fuseurl\.com)|(?:fuzzy\.to)|(?:fwd4\.me)|(?:fwib\.net)|(?:g\.ro.lt)|(?:gizmo\.do)|(?:gl\.am)|(?:go\.9nl.com)|(?:go\.ign.com)|(?:go\.usa.gov)|(?:goo\.gl)|(?:goshrink\.com)|(?:gurl\.es)|(?:hex\.io)|(?:hiderefer\.com)|(?:hmm\.ph)|(?:href\.in)|(?:hsblinks\.com)|(?:htxt\.it)|(?:huff\.to)|(?:hulu\.com)|(?:hurl\.me)|(?:hurl\.ws)|(?:icanhaz\.com)|(?:idek\.net)|(?:ilix\.in)|(?:is\.gd)|(?:its\.my)|(?:ix\.lt)|(?:j\.mp)|(?:jijr\.com)|(?:kl\.am)|(?:klck\.me)|(?:korta\.nu)|(?:krunchd\.com)|(?:l9k\.net)|(?:lat\.ms)|(?:liip\.to)|(?:liltext\.com)|(?:linkbee\.com)|(?:linkbun\.ch)|(?:liurl\.cn)|(?:ln-s\.net)|(?:ln-s\.ru)|(?:lnk\.gd)|(?:lnk\.ms)|(?:lnkd\.in)|(?:lnkurl\.com)|(?:lru\.jp)|(?:lt\.tl)|(?:lurl\.no)|(?:macte\.ch)|(?:mash\.to)|(?:merky\.de)|(?:migre\.me)|(?:miniurl\.com)|(?:minurl\.fr)|(?:mke\.me)|(?:moby\.to)|(?:moourl\.com)|(?:mrte\.ch)|(?:myloc\.me)|(?:myurl\.in)|(?:n\.pr)|(?:nbc\.co)|(?:nblo\.gs)|(?:nn\.nf)|(?:not\.my)|(?:notlong\.com)|(?:nsfw\.in)|(?:nutshellurl\.com)|(?:nxy\.in)|(?:nyti\.ms)|(?:o-x\.fr)|(?:oc1\.us)|(?:om\.ly)|(?:omf\.gd)|(?:omoikane\.net)|(?:on\.cnn.com)|(?:on\.mktw.net)|(?:onforb\.es)|(?:orz\.se)|(?:ow\.ly)|(?:ping\.fm)|(?:pli\.gs)|(?:pnt\.me)|(?:politi\.co)|(?:post\.ly)|(?:pp\.gg)|(?:profile\.to)|(?:ptiturl\.com)|(?:pub\.vitrue.com)|(?:qlnk\.net)|(?:qte\.me)|(?:qu\.tc)|(?:qy\.fi)|(?:r\.im)|(?:rb6\.me)|(?:read\.bi)|(?:readthis\.ca)|(?:reallytinyurl\.com)|(?:redir\.ec)|(?:redirects\.ca)|(?:redirx\.com)|(?:retwt\.me)|(?:ri\.ms)|(?:rickroll\.it)|(?:riz\.gd)|(?:rt\.nu)|(?:ru\.ly)|(?:rubyurl\.com)|(?:rurl\.org)|(?:rww\.tw)|(?:s4c\.in)|(?:s7y\.us)|(?:safe\.mn)|(?:sameurl\.com)|(?:sdut\.us)|(?:shar\.es)|(?:shink\.de)|(?:shorl\.com)|(?:short\.ie)|(?:short\.to)|(?:shortlinks\.co.uk)|(?:shorturl\.com)|(?:shout\.to)|(?:show\.my)|(?:shrinkify\.com)|(?:shrinkr\.com)|(?:shrt\.fr)|(?:shrt\.st)|(?:shrten\.com)|(?:shrunkin\.com)|(?:simurl\.com)|(?:slate\.me)|(?:smallr\.com)|(?:smsh\.me)|(?:smurl\.name)|(?:sn\.im)|(?:snipr\.com)|(?:snipurl\.com)|(?:snurl\.com)|(?:sp2\.ro)|(?:spedr\.com)|(?:srnk\.net)|(?:srs\.li)|(?:starturl\.com)|(?:su\.pr)|(?:surl\.co.uk)|(?:surl\.hu)|(?:t\.cn)|(?:t\.co)|(?:t\.lh.com)|(?:ta\.gd)|(?:tbd\.ly)|(?:tcrn\.ch)|(?:tgr\.me)|(?:tgr\.ph)|(?:tighturl\.com)|(?:tiniuri\.com)|(?:tiny\.cc)|(?:tiny\.ly)|(?:tiny\.pl)|(?:tinylink\.in)|(?:tinyuri\.ca)|(?:tinyurl\.com)|(?:tl\.gd)|(?:tmi\.me)|(?:tnij\.org)|(?:tnw\.to)|(?:tny\.com)|(?:to\.ly)|(?:togoto\.u
s)|(?:totc\.us)|(?:toysr\.us)|(?:tpm\.ly)|(?:tr\.im)|(?:tra\.kz)|(?:trunc\.it)|(?:twhub\.com)|(?:twirl\.at)|(?:twitclicks\.com)|(?:twitterurl\.net)|(?:twitterurl\.org)|(?:twiturl\.de)|(?:twurl\.cc)|(?:twurl\.nl)|(?:u\.mavrev.com)|(?:u\.nu)|(?:u76\.org)|(?:ub0\.cc)|(?:ulu\.lu)|(?:updating\.me)|(?:ur1\.ca)|(?:url\.az)|(?:url\.co.uk)|(?:url\.ie)|(?:url360\.me)|(?:url4\.eu)|(?:urlborg\.com)|(?:urlbrief\.com)|(?:urlcover\.com)|(?:urlcut\.com)|(?:urlenco\.de)|(?:urli\.nl)|(?:urls\.im)|(?:urlshorteningservicefortwitter\.com)|(?:urlx\.ie)|(?:urlzen\.com)|(?:usat\.ly)|(?:use\.my)|(?:vb\.ly)|(?:vgn\.am)|(?:vl\.am)|(?:vm\.lc)|(?:w55\.de)|(?:wapo\.st)|(?:wapurl\.co.uk)|(?:wipi\.es)|(?:wp\.me)|(?:x\.vu)|(?:xr\.com)|(?:xrl\.in)|(?:xrl\.us)|(?:xurl\.es)|(?:xurl\.jp)|(?:y\.ahoo.it)|(?:yatuc\.com)|(?:ye\.pe)|(?:yep\.it)|(?:yfrog\.com)|(?:yhoo\.it)|(?:yiyd\.com)|(?:youtu\.be)|(?:yuarel\.com)|(?:z0p\.de)|(?:zi\.ma)|(?:zi\.mu)|(?:zipmyurl\.com)|(?:zud\.me)|(?:zurl\.ws)|(?:zz\.gd)|(?:zzang\.kr)|(?:›\.ws)|(?:✩\.ws)|(?:✿\.ws)|(?:❥\.ws)|(?:➔\.ws)|(?:➞\.ws)|(?:➡\.ws)|(?:➨\.ws)|(?:➯\.ws)|(?:➹\.ws)|(?:➽\.ws))/[a-z0-9]*/?",
        "", tweet_text)
    tweet_text = re.sub(r"[?!,.#…@:*\"/’•]", " ", tweet_text)
    tweet_text = re.sub(r"[()]", " ", tweet_text)
    tweet_text = re.sub(r"\s{2,}", " ", tweet_text)
    tweet_text = re.sub(r"(^\s)|(\s$)", "", tweet_text)
    tweet_text = re.sub(r"\s[–\-|]|(--)\s", " ", tweet_text)

    return [stem(t) for t in tweet_text.split()]
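
A minimal usage sketch (the sample tweet is made up for illustration and relies on the imports added above):

# Hypothetical usage; the sample tweet is invented.
sample = "Check out our new paper! https://bit.ly/abc123 #NLP @someone"
print(tweet_preprocess(sample))  # -> list of stemmed tokens with the URL and the #/@ symbols stripped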
Code Example #2
    def _valid_word(self, word):
        """
        Check if the given word is valid
        """
        # Stopwords check
        if word in self.stopwords:
            return False

        # Punctuation check
        if not word.isalpha():
            return False

        # Accent check
        if not word.isascii():
            return False

        # Vowel + y check
        if set("aeiouy").isdisjoint(word):
            return False

        # Length check
        if not 2 <= len(word) <= 12:
            return False

        # Plural check
        if pluralize(word) in self.codenames_words:
            return False

        # Singular check
        if singularize(word) in self.codenames_words:
            return False

        # Stem check
        if stem(word) in self.stemmed_codenames_words:
            return False

        # Containment check
        for codenames_word in self.codenames_words:
            if codenames_word in word or word in codenames_word:
                return False

        return True
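
A minimal sketch of the kind of class `_valid_word` appears to belong to. The attribute initialisation and the source of `pluralize`/`singularize` (taken here from the `pattern` library) are assumptions, not part of the original example:

from gensim.parsing.preprocessing import stem
from pattern.en import pluralize, singularize  # assumed source of these helpers


class ClueFilter:  # hypothetical owner class; attribute names mirror _valid_word above
    def __init__(self, stopwords, codenames_words):
        self.stopwords = set(stopwords)
        self.codenames_words = set(codenames_words)
        self.stemmed_codenames_words = {stem(w) for w in codenames_words}

    # ... _valid_word from the example above would be defined here ...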
Code Example #3
"""
Created on Tue Mar 31 10:08:01 2020

@author: Matteo
"""

import numpy as np
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import stem
import re
#%% Train data
with open(
        r'C:\Users\Matteo\Documents\Git\aLDA\data\wikitext-2-raw\wiki.train.raw',
        encoding="utf8") as file:
    dataRaw = file.read()

dataRaw = stem(dataRaw)
data = dataRaw.replace('= = = = ', '+ + + +')
data = data.replace('= = =', '- - -')
data = data.replace('= =', '* *')

data = data.split(" = ")
datagensim = []
regex = re.compile('[^a-zA-Z ]')
for d in data:
    # Keep only ASCII letters and spaces
    test = regex.sub('', d)
    # Skip very short documents; keep tokens longer than 3 characters
    if len(test) > 100:
        datagensim += [[i for i in test.split(" ") if len(i) > 3]]
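
The snippet imports `Dictionary` but stops after tokenising; a hedged continuation, mirroring what example #10 below does, could be:

# Hedged continuation (not in the original): build a gensim Dictionary from the token lists.
dct = Dictionary(datagensim)
dct.filter_extremes(keep_n=50000, no_above=0.8)
dct.compactify()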
Code Example #4
import os

import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

file_dir = os.path.join('C:\\', 'Users', 'cruze', 'Documents', 'CS664')
inputfile = os.path.join(file_dir, 'train_E6oV3lV.csv')
df = pd.read_csv(inputfile)

corpus = df['tweet']

df = []
from gensim.parsing.preprocessing import (remove_stopwords, strip_punctuation,
                                          strip_non_alphanum, strip_numeric,
                                          strip_multiple_whitespaces, stem)
for msg in corpus:
    string = remove_stopwords(msg)
    string = strip_punctuation(string)
    string = strip_non_alphanum(string)
    string = strip_numeric(string)
    string = strip_multiple_whitespaces(string)
    string = stem(string)
    df.append(string)

corpus = df

#out = pd.DataFrame(data=corpus)
#out.to_csv('chatOut.csv', index_label=False)

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(corpus)]

max_epochs = 50
vec_size = 50
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
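
The `Doc2Vec` call is cut off in the example. Below is a hedged sketch of how such a setup is commonly completed using the variables defined above; the remaining parameters, training call, and save path are assumptions, not the original author's code (old-style gensim, where `size` is still accepted):

# Hedged sketch, not the original code.
model = Doc2Vec(size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=max_epochs)
model.save("d2v.model")  # hypothetical output path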
Code Example #5
 def __getitem__(self, item):
     return [stem(w) for w in self.full_tweet_split[item]]
Code Example #6
def analyze(topics,
            start_prediction=int(time.time() * 1000 - 604800000),
            end_prediction=None,
            select_top=10,
            print_select_tops=True,
            key_words=None):
    start_time = time.time()
    tt = TextTeaser()  # used to extract summaries
    print("Start date:", start_prediction)
    print("Full tweets")
    full_tweets = FullTweets(start_prediction, end_prediction,
                             key_words)  # full tweet texts
    print("After preprocessing")
    full_split = FullTweetTextSplit(start_prediction, end_prediction)  # preprocessed
    print("Stemming")
    full_stemmed = TweetStemmed(start_prediction, end_prediction)  # stemmed
    print("Number of papers", len(full_tweets))
    # Build the dictionary and remove its stop words
    dictionary = corpora.Dictionary(
        tweet_preprocess(item) for item in full_stemmed)
    print("initialization finished")
    stop_list = set(
        'a an the of at on upon in to from out as so such or and those this these that for is was am are \'s been were what when where who will with the www'
        .split())
    stop_ids = [
        dictionary.token2id[stop_word] for stop_word in stop_list
        if stop_word in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1
    ]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()
    print("dictionary finished")

    # Build the TF-IDF model
    corpus = TFIDFCorpus(full_stemmed, dictionary)
    print('corpus finished')
    # noinspection PyPep8Naming
    tfIdfModel = models.TfidfModel(corpus)
    print('TF-IDF finished')

    # Build the bag-of-words vector for the query topics
    topics_bow = dictionary.doc2bow(
        [stem(t.replace(' ', '_')) for t in topics])
    topics_tfidf = tfIdfModel[topics_bow]
    print("related topics finished")

    # Build the matrix similarity index
    print(len(dictionary))
    index = similarities.MatrixSimilarity(tfIdfModel[corpus])
    print("similarity index finished")
    sims0 = index[topics_tfidf]
    sims = sorted(enumerate(sims0), key=lambda item: -item[1])
    print("similarity sorting finished")

    # remove duplicates
    print(len(full_tweets))
    tweet_texts = [' '.join(t) for t in full_split]
    current_pointer = 0
    while current_pointer < len(sims) - 1:
        this_index = sims[current_pointer][0]
        next_index = sims[current_pointer + 1][0]
        this_text = re.sub(r'rt \S* ', '', tweet_texts[this_index]).strip()
        next_text = re.sub(r'rt \S* ', '', tweet_texts[next_index]).strip()
        if this_text[0:32] == next_text[0:32]:
            del sims[current_pointer + 1]
        else:
            current_pointer += 1
    existing = [False] * len(full_tweets)
    for i in sims:
        existing[i[0]] = True
    print(len(sims))
    print(sims[0:select_top])

    result_str_list = []
    if print_select_tops:
        final_result = sims[0:select_top]
        increment_flag = 1
        for item in final_result:
            t = full_tweets[item[0]]
            result_str = """{increment_flag}.\n{title}\n{authors}\nhttps://arxiv.org/abs/{arxiv_id}\n{abstract}""".format(
                increment_flag=increment_flag,
                title=t.title.replace('\n ', ''),
                authors=t.authors,
                arxiv_id=t.user_id.replace('arXiv:', ''),
                abstract=t.description)
            result_str_list.append(result_str)

            increment_flag += 1
    print(str(time.time() - start_time))

    # send email
    result_email_text = 'Papers' + '\n\n\n'.join(result_str_list)
    if end_prediction is None:
        end_prediction = int(time.time() * 1000)
    start_prediction = int(time.time() * 1000 - 604800000)
    subject = "This week's " + topics[0] + " paper recommendations: {start_date}-{end_date}".format(
        start_date=datetime.fromtimestamp(
            start_prediction / 1000).strftime("%Y.%m.%d"),
        end_date=datetime.fromtimestamp(
            end_prediction / 1000).strftime("%Y.%m.%d"))
    sendEmail(subject, result_email_text)
    #
    # calculate accuracy
    acc_count = 0
    top_indexes = [i[0] for i in sims[0:select_top]]
    for _idx, tweet in enumerate(full_tweets):
        if _idx in top_indexes:
            t = tweet.description
            if labelling(t, topics):
                acc_count += 1
    hit = acc_count / select_top

    # calculate loss (cross entropy)
    q = []
    p = []
    for _idx, tweet in enumerate(full_tweets):
        if not existing[_idx]:
            continue
        q.append(log(sims0[_idx] if sims0[_idx] > 0 else 1e-20))
        p.append(1.0 if labelling(tweet.description, topics) else 0.0)
    loss = -np.dot(p, q)
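    # i.e. loss = -sum_i p_i * log(sim_i): an (unnormalised) cross-entropy between the
    # 0/1 relevance labels in p and the (clipped) TF-IDF cosine similarities in sims0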

    # calculate precision & recall
    pr_counter = [0, 0, 0, 0]
    for _idx, tweet in enumerate(full_tweets):
        if not existing[_idx]:
            continue
        t = tweet.title.lower() + "." + tweet.description
        if _idx in top_indexes and labelling(t, topics):
            pr_counter[0] += 1
        elif _idx in top_indexes and not labelling(t, topics):
            pr_counter[1] += 1
        elif _idx not in top_indexes and labelling(t, topics):
            pr_counter[2] += 1
        elif _idx not in top_indexes and not labelling(t, topics):
            pr_counter[3] += 1
    accuracy = (pr_counter[0] + pr_counter[3]) / len(sims)

    print(hit)
    print(loss)
    print(acc_count)
    print(accuracy)
Code Example #7
 def __iter__(self):
     for tweet in self.full_tweet_split:
         yield [stem(w) for w in tweet]
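
Examples #5 and #7 look like methods of the same wrapper around a list of tokenised tweets. A self-contained sketch of such a class follows; the class name and constructor are assumptions (the real `TweetStemmed` in example #6 is built from date bounds instead):

from gensim.parsing.preprocessing import stem


class StemmedTweets:  # hypothetical holder for full_tweet_split
    def __init__(self, full_tweet_split):
        self.full_tweet_split = full_tweet_split  # list of token lists

    def __getitem__(self, item):
        return [stem(w) for w in self.full_tweet_split[item]]

    def __iter__(self):
        for tweet in self.full_tweet_split:
            yield [stem(w) for w in tweet]


print(StemmedTweets([["running", "quickly"]])[0])  # -> stemmed tokens of the first tweet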
Code Example #8
 def get_processed_stems(self):
     return prep.stem(
         prep.remove_stopwords(prep.strip_non_alphanum(self.text))).split()
Code Example #9
 def get_stems(self):
     return prep.stem(self.text).split()
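
Examples #8 and #9 assume an object holding raw text in `self.text` and an alias such as `import gensim.parsing.preprocessing as prep`; the alias and the small wrapper class below are assumptions for illustration:

import gensim.parsing.preprocessing as prep  # assumed source of the 'prep' alias


class Document:  # hypothetical holder for self.text
    def __init__(self, text):
        self.text = text

    def get_stems(self):
        return prep.stem(self.text).split()

    def get_processed_stems(self):
        return prep.stem(
            prep.remove_stopwords(prep.strip_non_alphanum(self.text))).split()


print(Document("The cats are running, quickly!").get_processed_stems())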
Code Example #10
"""
Created on Fri Apr 10 10:14:59 2020

@author: Matteo
"""

from sklearn.datasets import fetch_20newsgroups
import numpy as np
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import stem
from gensim.models import AuthorTopicModel, LdaModel, LdaMulticore
import re
#%% Importing directly from raw text
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers',))
dataRaw = newsgroups_train.data
data = [stem(i) for i in dataRaw]

datagensim = []
regex = re.compile('[^a-zA-Z ]')
for d in data[:200]:
    # Keep only ASCII letters and spaces
    test = regex.sub('', d)
    # Skip very short documents; keep lowercased tokens longer than 2 characters
    if len(test) > 100:
        datagensim += [[i.lower() for i in test.split(" ") if len(i) > 2]]
#gensim.utils.lemmatize(
dct = Dictionary(datagensim)
dct.filter_extremes(keep_n=50000, no_above=0.8)
dct.compactify()
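
The snippet imports `LdaModel` but stops after building the dictionary; a hedged continuation (the bag-of-words corpus and all LDA hyper-parameters are assumptions) could look like:

# Hedged continuation, not part of the original snippet.
corpus = [dct.doc2bow(doc) for doc in datagensim]
lda = LdaModel(corpus=corpus, id2word=dct, num_topics=20, passes=5, random_state=0)
for topic_id, words in lda.show_topics(num_topics=5, num_words=8, formatted=False):
    print(topic_id, [w for w, _ in words])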
Code Example #11
from gensim.parsing.preprocessing import stem
from nltk.stem import WordNetLemmatizer


def lemmatize_stemming(text):
    # Lemmatise as a verb first, then apply gensim's Porter stemmer
    return stem(WordNetLemmatizer().lemmatize(text, pos='v'))
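
A quick usage check on single words:

# Hypothetical usage of lemmatize_stemming.
for w in ["running", "studies", "better"]:
    print(w, "->", lemmatize_stemming(w))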