def tokenize_text(body):
    """Strip HTML, lowercase, tokenize on word characters, and drop English stopwords."""
    body = body.lower()
    body = BeautifulSoup(body, "html.parser").get_text()
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    body = tokenizer.tokenize(body)
    stop_words = set(corpus.stopwords.words('english'))  # build the set once, not per word
    return [word for word in body if word not in stop_words]
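# A minimal check of tokenize_text (not part of the original snippet); it assumes
# the imports the function body implies.
from bs4 import BeautifulSoup
from nltk import tokenize, corpus

print(tokenize_text("<p>The Quick Brown Fox jumps!</p>"))
# -> ['quick', 'brown', 'fox', 'jumps']  ("the" is dropped as a stopword)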
def prep_sequence(seq, lookup, size):
    '''Converts a sequence to a vector by summing the embeddings of its tokens.'''
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(seq)
    sentence_vec = np.zeros((size,))
    for token in tokens:
        if token in lookup:
            sentence_vec += lookup[token]
    return torch.from_numpy(sentence_vec).float()
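# Illustrative call with a toy embedding table (values assumed, not from the
# source); tokens missing from `lookup` are simply skipped.
import numpy as np
import torch

lookup = {'good': np.array([1.0, 0.0, 0.0]), 'movie': np.array([0.0, 1.0, 0.0])}
print(prep_sequence('a good movie', lookup, 3))
# -> tensor([1., 1., 0.])  ('a' has no embedding, so it contributes nothing)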
def longest_sentence_length(data1, data2):
    max_length = 0
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    for sent_list in [data1['str1'], data1['str2'], data2['str1'], data2['str2']]:
        for sent in sent_list:
            tokens = tokenizer.tokenize(sent)
            if len(tokens) > max_length:
                max_length = len(tokens)
    return max_length
def prep_sequence(seq, lookup, max_len, gpu=False):
    '''Converts a sequence to a fixed-length vector of token indices.'''
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(seq)
    seq = [lookup[s] for s in tokens]
    while len(seq) < max_len:
        seq.append(0)  # pad with index 0 up to max_len
    if gpu:
        return torch.LongTensor(seq).cuda()
    else:
        return torch.LongTensor(seq)
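# Toy call (index values assumed; `import torch` in scope): indices are padded
# with zeros up to max_len. Note the lookup is indexed directly, so a token
# absent from `lookup` raises a KeyError.
print(prep_sequence('good movie', {'good': 1, 'movie': 2}, max_len=5))
# -> tensor([1, 2, 0, 0, 0])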
def get_tokens(text):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    stw = set(stopwords.words('spanish'))
    tokens_no_sw = [x.lower() for x in tokens if x.lower() not in stw]
    tokens_no_sw = Counter(tokens_no_sw)
    # L2-normalize the term counts so the result is a unit vector
    norma = sum([tokens_no_sw[i] * tokens_no_sw[i] for i in tokens_no_sw])
    norma = sqrt(norma)
    for i in tokens_no_sw:
        tokens_no_sw[i] = tokens_no_sw[i] / norma
    return tokens_no_sw
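# Rough usage sketch (example input assumed): because the counts are
# L2-normalized, two such Counters can be compared with a plain dot product
# to get cosine similarity.
print(get_tokens('el gato y el perro'))
# -> Counter({'gato': 0.707..., 'perro': 0.707...}); 'el' and 'y' are Spanish stopwords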
def tokenize_code(text):
    """Tokenize code strings.

    This extracts runs of word characters (letters, digits, underscores),
    discarding whitespace and punctuation.

    Args:
        text: A code string to be tokenized.

    Returns:
        A list of strings representing the tokens in the code.
    """
    return tokenize.RegexpTokenizer(r'\w+').tokenize(text)
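# Quick demonstration (not in the original; assumes `from nltk import tokenize`):
# operators and punctuation are discarded, while identifiers and digits survive.
print(tokenize_code("for i in range(10): total += i"))
# -> ['for', 'i', 'in', 'range', '10', 'total', 'i']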
def tokenize_comment(comment, voc, voc_index):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    stemmer = stem.SnowballStemmer('russian')
    stop_words = set(corpus.stopwords.words('russian'))  # build once, not per sentence
    result = []
    for sent in tokenize.sent_tokenize(comment):
        filtered = [word for word in tokenizer.tokenize(sent)
                    if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        for word in stemmed:
            if voc.get(word) is None:
                voc[word] = voc_index
                voc_index += 1
        result += stemmed
    return voc_index, result
def read_data(files_loc, s_type='positive'):
    content_list = []
    tag_list = []
    s_tag_list = []
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    files_loc = files_loc + "*.txt"
    stemmer = SnowballStemmer("english")
    for f in glob.glob(files_loc):
        tag_list.append(f)
        s_tag_list.append(s_type)
        with open(f, 'r', encoding='utf-8') as text:
            raw = text.read()
            content = re.sub(r'\d+', '', raw)
            content = tokenizer.tokenize(content)
            content = list(map(lambda x: x.lower(), content))
            content = list(map(lambda x: stemmer.stem(x), content))
            content_list.append(content)
    return content_list, tag_list, s_tag_list
def preprocess(self, text):
    # `unicode(...)` in the original is Python 2; decode defensively on Python 3
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='replace')
    text = text.replace("\t", " ")
    text = text.replace("\r", " ")
    text = text.replace("\n", " ")
    text = text.replace("'", " ")
    text = text.strip('\t\n\r')
    # NLTK: remove punctuation and numbers (gaps=True splits on matches of the pattern)
    tokenizer = tokenize.RegexpTokenizer(r'[-.?!,":;()|0-9 ]', gaps=True)
    word_list = tokenizer.tokenize(text)
    # NLTK: remove English stop words and stem the rest
    stemmer = PorterStemmer()
    filtered_words = [
        stemmer.stem(w) for w in word_list
        if w not in stopwords.words('english')
        and w != '' and w != ' ' and w != '\n' and len(w) > 1
    ]
    return ' '.join(filtered_words)
Topic word extraction
'''
import warnings  # used to silence noisy model warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

doc = []
with open('../data2/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
tokenizer = tk.RegexpTokenizer(r'\w+')       # regex-based tokenization
stopwords = nc.stopwords.words('english')    # stop words to remove
stemmer = sb.SnowballStemmer('english')      # stemmer
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)
dic = gc.Dictionary(lines_tokens)
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)  # build the bag-of-words row
    bow.append(row)
import warnings warnings.filterwarnings("ignore", category=UserWarning) # 忽略用户级警告 import nltk.tokenize as nltoken import nltk.corpus as sc import nltk.stem.snowball as snowball import gensim.models.ldamodel as gm import gensim.corpora as gc doc = list() with open(os.path.dirname(__file__) + "/data/topic.txt") as f: for line in f.readlines(): doc.append(line) tokenizer = nltoken.RegexpTokenizer(r"\w+") # 见空白就拆开 stopwords = sc.stopwords.words("english") # 停止词汇 stemmer = snowball.SnowballStemmer("english") lines_tokens = list() # 被拆开的句子是一个行列式, 一行表示一句, 一列表示一个单词 for line in doc: tokens = tokenizer.tokenize(line.lower()) line_tokens = list() for token in tokens: if token not in stopwords: token = stemmer.stem(token) line_tokens.append(token) line_tokens.append(line_tokens) dic = gc.Dictionary(lines_tokens) # 构建词典, 用于构建词袋
def __init__(self):
    """Start up code that should be run once in order to set up for
    separating sentences by topic"""
    # making sure user has required resources
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    # list of stop words to remove, courtesy of https://gist.github.com/sebleier/554280
    self._stop_words = [
        "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would", "able", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "afterwards", "ah", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "announce", "another", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "arent", "arise", "around", "aside", "ask", "asking", "auth", "available", "away", "awfully", "b", "back", "became", "become", "becomes", "becoming", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "believe", "beside", "besides", "beyond", "biol", "brief", "briefly", "c", "ca", "came", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "couldnt", "date", "different", "done", "downwards", "due", "e", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "former", "formerly", "forth", "found", "four", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h",
"happens", "hardly", "hed", "hence", "hereafter", "hereby", "herein", "heres", "hereupon", "hes", "hi", "hid", "hither", "home", "howbeit", "however", "hundred", "id", "ie", "im", "immediate", "immediately", "importance", "important", "inc", "indeed", "index", "information", "instead", "invention", "inward", "itd", "it'll", "j", "k", "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "made", "mainly", "make", "makes", "many", "may", "maybe", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "moreover", "mostly", "mr", "mrs", "much", "mug", "must", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "nobody", "non", "none", "nonetheless", "noone", "normally", "nos", "noted", "nothing", "nowhere", "obtain", "obtained", "obviously", "often", "oh", "ok", "okay", "old", "omitted", "one", "ones", "onto", "ord", "others", "otherwise", "outside", "overall", "owing", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "said", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "shed", "shes", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "sufficiently", "suggest", "sup", "sure", "take", "taken", "taking", "tell", "tends", "th", "thank", "thanks", "thanx", "thats", "that've", "thence", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "theyd", "theyre", "think", "thou", "though", "thoughh", "thousand", "throug", "throughout", "thru", "thus", "til", "tip", "together", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "unfortunately", "unless", "unlike", "unlikely", "unto", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "wasnt", "way", "wed", "welcome", "went", "werent", "whatever", "what'll", "whats", "whence", "whenever", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "whim", "whither", "whod", "whoever", "whole", "who'll", "whomever", "whos", "whose", "widely", "willing", "wish", "within", "without", "wont", "words", "world", "wouldnt", "www", "x", "yes", "yet", "youd", "youre", "z", 
"zero", "a's", "ain't", "allow", "allows", "apart", "appear", "appreciate", "appropriate", "associated", "best", "better", "c'mon", "c's", "cant", "changes", "clearly", "concerning", "consequently", "consider", "considering", "corresponding", "course", "currently", "definitely", "described", "despite", "entirely", "exactly", "example", "going", "greetings", "hello", "help", "hopefully", "ignored", "inasmuch", "indicate", "indicated", "indicates", "inner", "insofar", "it'd", "keep", "keeps", "novel", "presumably", "reasonably", "second", "secondly", "sensible", "serious", "seriously", "sure", "t's", "third", "thorough", "thoroughly", "three", "well", "wonder" ] # set pattern to tokenize sentence to words self._word_tokenizer = tokenize.RegexpTokenizer( r"[a-zA-Z]{3,}(?:-[a-zA-Z]+)+|[a-zA-Z]{3,}(?:'t)?") # creates lemmatizer self._lemmatizer = WordNetLemmatizer() self._topicNum = 2
def tokenize_code(text): """A very basic procedure for tokenizing code strings.""" return tokenize.RegexpTokenizer(r'\w+').tokenize(text)
def set_up(self):
    self.tokenizer = tokenize.RegexpTokenizer(self.pattern)
"wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" } neg_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b') tzer = tokenize.RegexpTokenizer(r'[A-Za-z_]+') #remove url, unicodes, emojis pat1 = r'@[A-Za-z0-9]+' pat2 = r'https?://[^ ]+' pat3 = r'www\.[^ ]+' pat4 = r'\\u[^ ]+' combined_pat = r'|'.join((pat1, pat2, pat3, pat4)) re_pat = re.compile(combined_pat) tweets = x.tolist() clean_tweets = [] for t in tweets: tweet = tweet_cleaner(t) clean_tweets.append(remove_underscores(tweet)) x = df.text.apply(tweet_cleaner).apply(remove_underscores)
# parameters
batch_size = 500
epoch = 100
hidden_dim = 50
embedding_dim = 50
label_size = 5
max_num = 200
learning_rate = 10**(-3)
layer = 2
use_gpu = torch.cuda.is_available()
tokenizer = tokenize.RegexpTokenizer(r"\w+")
stop_words = set(stopwords.words('english'))

# remove stop words (not used right now)
def text_clean(sentence):
    words = tokenizer.tokenize(sentence)
    words = [w.lower() for w in words if w not in stop_words]
    return words

# load pre-trained embedding matrix
def load_glove_into_dict(glove_path):
    embeddings_ix = {}
    with open(glove_path) as glove_file:
def normalize(s):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s.lower().strip())
    return ' '.join([stemmer.stem(w) for w in words])
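# Hedged example (assumes a module-level `stemmer`, e.g. NLTK's PorterStemmer,
# which the function references but does not define):
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(normalize('  Running, runs and RUNNER!  '))
# -> 'run run and runner' with the Porter stemmer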
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

doc = []
with open('../data/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
tokenizer = tk.RegexpTokenizer(r'\w+')
stopwords = nc.stopwords.words('english')
stemmer = sb.SnowballStemmer('english')
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)
dic = gc.Dictionary(lines_tokens)
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)
    bow.append(row)
n_topics = 2
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)
topics = model.print_topics(num_topics=n_topics, num_words=4)
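# Possible follow-up (not in the source): print_topics returns
# (topic_id, formatted_word_weights) pairs, so the topics can be inspected with:
for topic_id, words in topics:
    print('Topic %d: %s' % (topic_id, words))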
def _tokenize(sentences):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = []
    for sentence in sentences:
        tokens += tokenizer.tokenize(sentence)
    return tokens
def tokenize(self, corpus):
    tokenizer = tokenizers.RegexpTokenizer(r'\w+')
    corpus = corpus.apply(lambda x: tokenizer.tokenize(x))
    return corpus
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if w not in stopwords.words('english')]
    return filtered_words
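# Minimal check (assumes `from nltk import tokenize` and
# `from nltk.corpus import stopwords`):
print(preprocess('This is a simple test sentence'))
# -> ['simple', 'test', 'sentence']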
def word_split(s):
    # use a non-capturing group: RegexpTokenizer relies on re.findall, and a
    # capturing group would make it return only the group matches
    t = nltk_tokenize.RegexpTokenizer(r'[\w-]+(?:\.[\w-]+)*')
    return t.tokenize(s)
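# Example (not from the source; assumes `import nltk.tokenize as nltk_tokenize`):
# dotted and hyphenated names stay whole.
print(word_split('call os.path.join with file-name'))
# -> ['call', 'os.path.join', 'with', 'file-name']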
def punctuation_remove(text):
    tzer = tokenize.RegexpTokenizer(r'[A-Za-z0-9_]+')
    tokenized = tzer.tokenize(text)
    return ' '.join(tokenized)
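# Quick check (not in the original; assumes `from nltk import tokenize`):
# apostrophes split words, since only [A-Za-z0-9_] runs are kept.
print(punctuation_remove("Hello, world! It's 2020."))
# -> 'Hello world It s 2020'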
def tweet_cleaner(text):
    contractions_dict = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have",
        "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
        "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
    }
    neg_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')
    tzer = tokenize.RegexpTokenizer(r'[A-Za-z_]+')
    # remove url, unicodes, emojis
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[^ ]+'
    pat3 = r'www\.[^ ]+'
    pat4 = r'\\u[^ ]+'
    combined_pat = r'|'.join((pat1, pat2, pat3, pat4))
    re_pat = re.compile(combined_pat)
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text().lower()  # lowercase the whole thing here
    bomgone = souped.replace('�', ' ')
    re_cleaned = re_pat.sub(' ', bomgone)
    neg_handled = neg_pattern.sub(lambda x: contractions_dict[x.group()], re_cleaned)
    tokenized = tzer.tokenize(neg_handled)
    return " ".join(tokenized)
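# Illustrative call (assumes `re`, `BeautifulSoup` (with lxml installed) and
# `from nltk import tokenize` are in scope, as the function body implies):
print(tweet_cleaner("@user I can't wait!! https://t.co/xyz"))
# -> 'i cannot wait' (mention and URL stripped, contraction expanded)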
import pickle
import logging
import os
from os.path import dirname
from gensim.models import Word2Vec
from nltk import tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Tokenizer that keeps only word-character runs (drops punctuation).
alpha_regex_tokenizer = tokenize.RegexpTokenizer(r'\w+')
# Vader based sentiment analyzer
sid = SentimentIntensityAnalyzer()
# All reviews seen until now.
overall_reviews = []

def predict(review):
    logging.debug("Predicting review..")
    sentences = clean(review)
    overall_scores = []
    for sentence in sentences:
        logging.debug(
            "Predicting sentiment for {sentence}".format(sentence=sentence))
        scores = sid.polarity_scores(sentence)
        overall_scores.append(scores)
    return overall_scores

def clean(review):
import re
from nltk import tokenize

TOKENIZER = tokenize.RegexpTokenizer(
    r'[\w\'\-]+|(?:[\.,\/#!$\"\?%\^&\*;:{}=\-_`~()\[\]])')
URL_REGEX = re.compile(r'(https?:\/\/[^ )]+)', re.MULTILINE)

def _clean(text):
    # match source code (lines starting with 4 empty spaces)
    text_cleaned = re.sub(r'^\ {4,}.*', '', text, flags=re.MULTILINE)
    text_cleaned = text_cleaned.replace('\n', '')
    text_cleaned = text_cleaned.replace('\r', '')
    return text_cleaned

def _tokens(text):
    tokens = TOKENIZER.tokenize(text)
    filtered = [token for token in tokens if token.isalpha()]
    return filtered

class ReadabilityAnalyzer(object):
    def __init__(self, question):
        self.raw = URL_REGEX.sub('URL', question)
        self.cleaned = _clean(self.raw)

    @property
    def tokens(self):
        return _tokens(self.cleaned)

    @property
    def sents(self):
        return tokenize.sent_tokenize(self.cleaned)
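# Sketch of usage (not from the source):
ra = ReadabilityAnalyzer('Why does https://example.com return 404? My regex fails.')
print(ra.tokens)  # alphabetic tokens only, with the URL collapsed to 'URL'
print(ra.sents)   # sentences from nltk's sent_tokenize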
# uncompyle6 version 3.2.4
# Python bytecode 3.5 (3350)
# Decompiled from: Python 3.5.2 (default, Nov 12 2018, 13:43:14)
# [GCC 5.4.0 20160609]
# Embedded file name: /home/riccardo/Lev_dist.py
# Compiled at: 2018-12-19 15:46:22
# Size of source mod 2**32: 1675 bytes
import stringdist
from nltk import tokenize, pos_tag
from nltk.corpus import stopwords
import time

Personaltokenizer = tokenize.RegexpTokenizer('\\w+')
possible_results = []

def evaluate_query(query, R, pos, vocab):
    start = time.clock()  # note: time.clock() was removed in Python 3.8; time.perf_counter() is the modern equivalent
    if len(query) <= 85 and len(query) > 0:
        query = query.lower()
        query = Personaltokenizer.tokenize(query)
        print('Query as inserted lower case: ', query)
        filtered_query = [
            word for word in query
            if word not in stopwords.words('english')
        ]
        print('Filtered query : ', filtered_query)
        filtered_tagged_query = pos_tag(filtered_query)
        print('Tagged query : ', filtered_tagged_query)
def init_nltk():
    global tokenizer
    global tagger
    tokenizer = tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    tagger = UnigramTagger(brown.tagged_sents())
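# Possible usage (assumes `from nltk import tokenize`,
# `from nltk.tag import UnigramTagger` and `from nltk.corpus import brown`;
# training on the full Brown corpus takes a moment):
init_nltk()
print(tagger.tag(tokenizer.tokenize('The cat sat.')))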
Topic word extraction: an application of Latent Dirichlet Allocation (LDA)
'''
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm  # Latent Dirichlet Allocation model
import gensim.corpora as gc

doc = []
with open('topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
tokenizer = tk.RegexpTokenizer(r'\w+')      # regex matching the token pattern
stopwords = nc.stopwords.words('english')   # stop words: words with no real semantic contribution
stemmer = sb.SnowballStemmer('english')
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)
# Dictionary: encapsulates the mapping between normalized words and their integer ids.
dic = gc.Dictionary(lines_tokens)
# e.g. Dictionary(121 unique tokens: ['cryptographi', 'lot', 'spent', 'studi', 'time']...)
database = "tmdb") movie_dataframe2 = sqlio.read_sql_query(sql, dbConnection) except (Exception , psycopg2.Error) as dbError : print ("Error:", dbError) finally: if(dbConnection): dbConnection.close() import nltk.sentiment.vader from nltk.corpus import stopwords import nltk.tokenize as nt from nltk.stem import PorterStemmer # Tokenization word_tokens = [] tokenizer = nt.RegexpTokenizer(r'\w+') for i in range(0, len(movie_dataframe)): word_tokens.append(tokenizer.tokenize(movie_dataframe['content'][i])) #Stop word removal stop_words = set(stopwords.words('english')) movie_dataframe['word_list'] = movie_dataframe['content'].apply(lambda x: [item for item in x.split() if item not in stop_words]) movie_dataframe['sentiment_scores'] = "" movie_dataframe['total_score'] = ""