def stem(words):
    ps = nltk.PorterStemmer()
    stemmedList = []
    for word in words:
        stemmedList.append(ps.stem(word))
    return stemmedList
    lamda = 0.6
    elmo_layers_weight = [0.0, 1.0, 0.0]
elif (database == "Duc2001"):
    data, labels = fileIO.get_duc2001_data()
    lamda = 1.0
    elmo_layers_weight = [1.0, 0.0, 0.0]
else:
    data, labels = fileIO.get_semeval2017_data()
    lamda = 0.6
    elmo_layers_weight = [1.0, 0.0, 0.0]

# download from https://allennlp.org/elmo
options_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

porter = nltk.PorterStemmer()  # please download nltk
ELMO = word_emb_elmo.WordEmbeddings(options_file, weight_file, cuda_device=0)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=lamda, database=database)
en_model = StanfordCoreNLP(
    r'E:\Python_Files\stanford-corenlp-full-2018-02-27',
    quiet=True)  # download from https://stanfordnlp.github.io/CoreNLP/

try:
    for key, data in data.items():
        lables = labels[key]
        lables_stemed = []
        for lable in lables:
            tokens = lable.split()
            lables_stemed.append(' '.join(porter.stem(t) for t in tokens))
def stemmer(text):
    ps = nltk.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
import PyPDF2
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))  # Get the stop word list for the English corpus
stem = nltk.PorterStemmer()  # Initialise the stemmer


# Read the pdf file
def read_pdf(filename):
    content = ""
    pdfFileObj = open(filename, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    for x in range(0, pdfReader.numPages):
        pageObj = pdfReader.getPage(x)
        content += (pageObj.extractText()).strip()
    return content


# Preprocess the text blob by splitting it into sentences, tokenizing them and doing part-of-speech tagging
def process_data(pdf_content):
    sent_tok = pdf_content.split("\n")  # split into sentences
    word_tok = [nltk.word_tokenize(w) for w in sent_tok]  # split each sentence into its tokens
def stem(desc, stemmer=None):
    stemmer = stemmer or nltk.PorterStemmer()
    return ' '.join(stemmer.stem(w) for w in nltk.word_tokenize(desc))
def nameLDArepresentation(textfile, dictfile, representationfile):
    appname = []
    with open(textfile, 'r') as fin:
        for line in fin:
            line = json.loads(line)
            appname.append(line['name'])
    print("Preprocessing Name Text done!")

    tokenizer = RegexpTokenizer(r'\w+')
    appnametext = []
    count = 0
    for namestr in appname:
        wordlist = tokenizer.tokenize(namestr)
        wordlist_rmstopword = [
            word for word in wordlist if word not in stopwords.words('english')
        ]
        for i in range(len(wordlist_rmstopword)):
            wordlist_rmstopword[i] = nltk.PorterStemmer().stem(
                wordlist_rmstopword[i])
        Lnumber = [
            word for word in wordlist_rmstopword if re.match(r'\d+$', word)
        ]
        Lothers = [
            word for word in wordlist_rmstopword if re.match(r'^_+', word)
        ]
        wordlist_rmLnumber = [
            word for word in wordlist_rmstopword if word not in Lnumber
        ]
        wordlist_rmLothers = [
            word for word in wordlist_rmLnumber if word not in Lothers
        ]
        appnametext.append(wordlist_rmLothers)
        if count % 1000 == 0:
            print(count)
        count = count + 1
    # appname.clear()
    print("Obtain Name Wordlist done!")

    name_dict = corpora.Dictionary(appnametext)
    once_ids = [
        wordid for wordid, docfreq in name_dict.dfs.items() if docfreq == 1
    ]
    name_dict.filter_tokens(once_ids)
    name_dict.save_as_text(dictfile)
    print("Obtain Name dictionary done!")

    corpus_tf = [name_dict.doc2bow(eachappname) for eachappname in appnametext]
    KList = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    for k in KList:
        # lda transform
        lda = models.LdaModel(corpus=corpus_tf,
                              id2word=name_dict,
                              num_topics=k,
                              minimum_probability=0)
        # print('lda:', type(lda))
        # lda.save()
        corpus_lda = lda[corpus_tf]
        print("Obtain Name LDA representation done!")
        featurefile = representationfile + str(k) + ".txt"
        fout = open(featurefile, 'w')
        for doc in corpus_lda:
            line = []
            for i in range(k):
                line.append(0)
            for (fid, fvalue) in doc:
                line[fid] = fvalue
            for item in line:
                t = fout.write(str(item) + '\t')
            t = fout.write('\r\n')
        fout.close()
def str_stemmer(s):
    return " ".join([nltk.PorterStemmer().stem(word) for word in s.lower().split()])
sent = sents[4]
print(sent)

# Segment the words in the sentence with a "tokenizer"
tokens = nltk.word_tokenize(sent)
tokens

# Normalize the tokens
normalized_tokens = [t.lower() for t in tokens]
print('\nNormalized tokens:\n', normalized_tokens)

# Build the vocabulary
vocabulary = sorted(set(normalized_tokens))
print('\nThe vocabulary:\n', vocabulary)

# Stemming
print([nltk.PorterStemmer().stem(t) for t in tokens])

# For example we can show that the stemmer works:
example_tokens = ['lie', 'lied', 'lay', 'lies', 'lying']
stemmed_tokens = [nltk.PorterStemmer().stem(t) for t in example_tokens]
print(stemmed_tokens)

# Let's look at some statistics for the words
ltokens = [nltk.word_tokenize(doc) for doc in news.data[:500]]

# convert the list of lists of tokens (ltokens) into a flat list of tokens
import itertools
tokens_all = list(itertools.chain.from_iterable(ltokens))

# convert the list of tokens to an nltk Text object
x = nltk.Text(t.lower() for t in tokens_all)
print("The text comprises %d normalized tokens." % len(x))
print("The first few are", x[:10])
def __init__(self, tokens):
    """ Constructor. """
    self.stemmer = nltk.PorterStemmer()
    self.tokens = tokens
def nameDoc2vec_representation(textfile, dictfile, representationfile):
    appname = []
    with open(textfile, 'r') as fin:
        for line in fin:
            line = json.loads(line)
            appname.append(line['name'])
    print("Preprocessing Name Text done!")

    tokenizer = RegexpTokenizer(r'\w+')
    appnametext = []
    count = 0
    for namestr in appname:
        wordlist = tokenizer.tokenize(namestr)
        wordlist_rmstopword = [
            word for word in wordlist if word not in stopwords.words('english')
        ]
        # wordlist_rmstopword = wordlist
        for i in range(len(wordlist_rmstopword)):
            wordlist_rmstopword[i] = nltk.PorterStemmer().stem(
                wordlist_rmstopword[i])
        Lnumber = [
            word for word in wordlist_rmstopword if re.match(r'\d+$', word)
        ]
        Lothers = [
            word for word in wordlist_rmstopword if re.match(r'^_+', word)
        ]
        wordlist_rmLnumber = [
            word for word in wordlist_rmstopword if word not in Lnumber
        ]
        wordlist_rmLothers = [
            word for word in wordlist_rmLnumber if word not in Lothers
        ]
        appnametext.append(wordlist_rmLothers)
        if count % 1000 == 0:
            # print(str(count) + "...", end='')
            print(count)
        count = count + 1
    # appname.clear()
    print("Obtain Name Wordlist done!")

    name_dict = corpora.Dictionary(appnametext)
    once_ids = [
        wordid for wordid, docfreq in name_dict.dfs.items() if docfreq == 1
    ]
    name_dict.filter_tokens(once_ids)
    name_dict.save_as_text(dictfile)
    print("Obtain Name dictionary done!")

    # corpus_tf = [name_dict.doc2bow(eachappname) for eachappname in appnametext]
    # # tfidf transform
    # tfidf = models.TfidfModel(corpus_tf)
    # corpus_tfidf = tfidf[corpus_tf]
    KList = [
        50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000
    ]
    # documents = []
    # for i in range(len(appnametext)):
    #     string = "SENT_" + str(i)
    #     sentence = models.doc2vec.LabeledSentence(appnametext[i], labels=[string])
    #     documents.append(sentence)
    file = 'doc2vec/apptext.txt'
    apptext = open(file, 'w')
    for line in appnametext:
        # print(line)
        apptext.write(' '.join(line) + '\n')
    apptext.close()

    for k in KList:
        # lsi transform
        # lsi = models.LsiModel(corpus=corpus_tfidf, id2word=name_dict, num_topics=k)
        documents = models.doc2vec.TaggedLineDocument(file)
        doc2vec = models.Doc2Vec(documents,
                                 size=k,
                                 window=2,
                                 min_count=0,
                                 workers=4)
        featurefile = representationfile + str(k) + ".txt"
        fout = open(featurefile, 'w')
        for i in range(15282):
            valueList = doc2vec.docvecs[i].tolist()
            for j in range(k):
                fout.write(str(valueList[j]) + '\t')
            fout.write('\n')
        fout.close()
def __init__(self):
    self.model = self._load_model()
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.stemmer = nltk.PorterStemmer()
    self.country_fixes = {}
    self.coutries = self._read_countries()
def stem_with_porter(words):
    porter = nltk.PorterStemmer()
    new_words = [porter.stem(w) for w in words]
    return new_words
    try:
        user_desc = json_dict['user']['description'].replace(' ', '')
        if user_desc:
            text += ' &' + user_desc
    except:
        pass
    return text


if __name__ == "__main__":
    data_dir = './data'  ## Set your own file path here.
    x_filename = 'samples.txt'
    y_filename = 'labels.txt'

    porter = nltk.PorterStemmer()  # porter stemmer
    stops = set(stopwords.words('english'))
    stops.add('rt')  # may add personalized stop words

    ## load and process samples
    print('start loading and process samples...')
    words_stat = {}  # record statistics of the df and tf for each word; form: {word: [tf, df, tweet index]}
    tweets = []
    bonuses = []
    cnt = 0
    with open(os.path.join(data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            postprocess_tweet = []
            tweet_obj = json.loads(line.strip(), encoding='utf-8')
            content = tweet_obj['text'].replace("\n", " ")
def setup():
    nltk.download('punkt')
    nltk.download('wordnet')
    porter = nltk.PorterStemmer()
    wnl = nltk.WordNetLemmatizer()
    return [porter, wnl]
def __init__(self):
    nltk.download('stopwords')
    nltk.download('punkt')
    self.stemmer = nltk.PorterStemmer()
    self.stopwords = set(
        stopwords.words('english'))  # + get_stop_words('en'))
nltk, sklearn
'''
import tomotopy as tp
import nltk
from nltk.corpus import stopwords
import re
from sklearn.datasets import fetch_20newsgroups
import itertools

print('Training lda models...')
try:
    # load if a trained model already exists
    mdl = tp.LDAModel.load('trained_lda_model.bin')
except:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x))
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)

    mdl = tp.LDAModel(min_df=5, rm_top=30, k=20, corpus=corpus)
    mdl.train(0)
    print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
        len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))
    print('Removed Top words: ', *mdl.removed_top_words)
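# Note: the except-branch above only initializes the model (mdl.train(0)); for the
# tp.LDAModel.load() call in the try-branch to succeed on a later run, the model still
# has to be trained and written to 'trained_lda_model.bin'. A minimal sketch of that
# follow-up step, assuming the standard tomotopy API (the iteration counts below are
# illustrative, not from the original snippet):
for i in range(0, 1000, 20):
    mdl.train(20)
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))

# persist the trained model so it can be loaded next time
mdl.save('trained_lda_model.bin')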
def Tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    porter_stemmer = nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    return words
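# A tokenizer of this shape (raw string in, list of stemmed tokens out) is typically
# meant to be plugged into a scikit-learn vectorizer; a minimal usage sketch, assuming
# scikit-learn's TfidfVectorizer (the sample sentences are made up for illustration):
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=Tokenizer)
X = vectorizer.fit_transform(["Runners were running quickly", "The runner runs"])
print(sorted(vectorizer.vocabulary_))  # stemmed feature names, e.g. 'run', 'runner', 'quickli'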
def stemming(tokens):
    """ stem tokens """
    porter = nltk.PorterStemmer()
    return [porter.stem(t) for t in tokens]
def descriptionLDArepresentation(textfile, dictfile, representationfile):
    appdescription = []
    # the pattern contains both quote characters, so a triple-quoted raw string is used as delimiter
    htmlcompilers = re.compile(
        r"""<[^>]+>| +|=+|\?+|!+|-+|\*+|\.+|(>)+|\(+|\)+|\^+|\_+|#+|\[+|\,+|(&)+|\/+|\]+|:+|(&39;t)+|(")+|(')+|~+""",
        re.S)
    spacecompilers = re.compile(r'\s+', re.S)
    with open(textfile, 'r') as fin:
        for line in fin:
            line = json.loads(line)
            tmp = htmlcompilers.sub(' ', line['description'])
            tmp = spacecompilers.sub(' ', tmp)
            appdescription.append(tmp)
    print("Preprocessing description Text done!")

    tokenizer = RegexpTokenizer(r'\w+')
    appdescriptiontext = []
    count = 0
    for descriptionstr in appdescription:
        wordlist = tokenizer.tokenize(descriptionstr)
        wordlist_rmstopword = [
            word for word in wordlist if word not in stopwords.words('english')
        ]
        for i in range(len(wordlist_rmstopword)):
            wordlist_rmstopword[i] = nltk.PorterStemmer().stem(
                wordlist_rmstopword[i])
        Lnumber = [
            word for word in wordlist_rmstopword if re.match(r'\d+$', word)
        ]
        Lothers = [
            word for word in wordlist_rmstopword if re.match(r'^_+', word)
        ]
        wordlist_rmLnumber = [
            word for word in wordlist_rmstopword if word not in Lnumber
        ]
        wordlist_rmLothers = [
            word for word in wordlist_rmLnumber if word not in Lothers
        ]
        appdescriptiontext.append(wordlist_rmLothers)
        if count % 1000 == 0:
            print(count)
        count = count + 1
    # appdescription.clear()
    print("Obtain Description Wordlist done!")

    description_dict = corpora.Dictionary(appdescriptiontext)
    once_ids = [
        wordid for wordid, docfreq in description_dict.dfs.items()
        if docfreq == 1
    ]
    description_dict.filter_tokens(once_ids)
    description_dict.save_as_text(dictfile)
    print("Obtain Description dictionary done!")

    corpus_tf = [
        description_dict.doc2bow(eachappdescription)
        for eachappdescription in appdescriptiontext
    ]
    KList = [600, 700, 800, 900, 1000]
    for k in KList:
        # lda transform
        lda = models.LdaModel(corpus=corpus_tf,
                              id2word=description_dict,
                              num_topics=k,
                              minimum_probability=0)
        corpus_lda = lda[corpus_tf]
        print("Obtain Description LDA representation done!")
        featurefile = representationfile + str(k) + ".txt"
        fout = open(featurefile, 'w')
        for doc in corpus_lda:
            line = []
            for i in range(k):
                line.append(0)
            for (fid, fvalue) in doc:
                line[fid] = fvalue
            for item in line:
                t = fout.write(str(item) + '\t')
            t = fout.write('\r\n')
        fout.close()
def norm(arr):
    return np.sqrt(np.sum((arr)**2, axis=0))


def radian(arr1, arr2):
    return np.arccos(np.sum(arr1 * arr2, axis=0) / (norm(arr1) * norm(arr2)))


def distance(arr1, arr2):
    return np.sqrt(np.sum((np.abs(arr1 - arr2))**2, axis=0))


corpus = pd.read_csv("pacifier.csv", usecols=["review_body"])[:10]["review_body"].values
tokenizer = nltk.RegexpTokenizer(r'\w+')  # regex tokenizer that strips punctuation
corpus2 = ["" for i in range(0, corpus.shape[0])]
for i in range(0, corpus.shape[0]):
    lis = tokenizer.tokenize(corpus[i])
    for word in lis:
        corpus2[i] += nltk.PorterStemmer().stem(word) + " "  # keep only the stem of each word in the text
corpus = np.array(corpus2)

tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(corpus).toarray()
feature_name = tfidf_vectorizer.get_feature_names()

sm_dis = np.zeros((tfidf.shape[0], tfidf.shape[0]))
sm_rad = np.zeros((tfidf.shape[0], tfidf.shape[0]))
for j in range(0, tfidf.shape[0]):
    for i in range(j, tfidf.shape[0]):
        sm_dis[j][i] = distance(tfidf[j], tfidf[i])
        sm_dis[i][j] = sm_dis[j][i]
        sm_rad[j][i] = radian(tfidf[j], tfidf[i])
        sm_rad[i][j] = sm_rad[j][i]
# %%
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = text.rstrip('?')
    text = text.rstrip(',')
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text)
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)

    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        # stemmer = SnowballStemmer('english')
        # stemmed_words = [stemmer.stem(word) for word in text]
        stemmed_words = [nltk.PorterStemmer().stem(word.lower()) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)
def main():
    data_dir = './data'
    tweet_source_file = 'samples.txt'

    porter = nltk.PorterStemmer()
    stops = set(stopwords.words('english'))

    ## Load and process sample tweets
    print('start loading and process samples...')
    hashtags_stat = {}  # record statistics of the df and tf for each hashtag; form: {tag: [tf, df, tweet index]}
    hashtags = []
    with open(os.path.join(data_dir, tweet_source_file)) as f:
        for i, line in enumerate(f):
            postprocess_hashtag_list = []
            tweet_obj = json.loads(line.strip(), encoding='utf-8')
            hashtag_list = tweet_obj['entities']['hashtags']
            no_of_hashtags = len(hashtag_list)
            hashtag_text_list = []
            if no_of_hashtags == 0:
                # joined_postprocess_tags = ''
                joined_postprocess_tags = 'void'
                # hashtags.append(joined_postprocess_tags)
            else:
                for j in range(no_of_hashtags):
                    hashtag_text_list.append(hashtag_list[j]['text'])
                joined_tags = ' '.join(hashtag_text_list)
                tags = pre_process(joined_tags, porter)
                for tag in tags:
                    if tag not in stops:
                        postprocess_hashtag_list.append(tag)
                        if tag in hashtags_stat.keys():
                            hashtags_stat[tag][0] += 1
                            if i != hashtags_stat[tag][2]:
                                hashtags_stat[tag][1] += 1
                                hashtags_stat[tag][2] = i
                        else:
                            hashtags_stat[tag] = [1, 1, i]
                joined_postprocess_tags = ' '.join(postprocess_hashtag_list)
            hashtags.append(joined_postprocess_tags)
    # print(hashtags[:50])

    ## Save the statistics of tf and df for each hashtag into file
    print("The number of unique words in data set is %i." % len(hashtags_stat.keys()))
    lowTF_tags = set()
    stats_dir = './stats'
    with open(os.path.join(stats_dir, 'hashtags_statistics.txt'), 'w') as f:
        f.write('TF\tDF\tHASHTAG\n')
        for tag, stat in sorted(hashtags_stat.items(), key=lambda i: i[1], reverse=True):
            f.write('\t'.join([str(m) for m in stat[0:2]]) + '\t' + tag + '\n')
            if stat[0] < 2:
                lowTF_tags.add(tag)
    print("The number of low frequency words is %d." % len(lowTF_tags))

    ## Re-process samples, filter low frequency hashtags...
    features_dir = './features'
    fout = open(os.path.join(features_dir, 'hashtags_processed.txt'), 'w')
    new_hashtags_list = []
    for hashtag in hashtags:
        tags = hashtag.split(' ')
        new = []
        for tag in tags:
            if tag not in lowTF_tags:
                new.append(tag)
        if len(new) == 0:
            new.append('void')
        new_hashtags = ' '.join(new)
        new_hashtags_list.append(new_hashtags)
        fout.write('%s\n' % new_hashtags)
    fout.close()
    print("Preprocessing is completed")
def stem(tokens):
    porter = nltk.PorterStemmer()
    return [porter.stem(x) for x in tokens]
def stem(msg: str) -> str:
    stemmer = nltk.PorterStemmer()
    return ' '.join(stemmer.stem(term) for term in msg.split())
def jerry_learn():
    key_file = 'keys.json'
    with open(key_file) as f:
        keys = json.load(f)
    auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
    auth.set_access_token(keys["access_token"], keys["access_token_secret"])
    api = tweepy.API(auth, wait_on_rate_limit=True)

    today = date.today()
    today = datetime(today.year, today.month, today.day)
    week_ago = today - timedelta(days=1)
    start = week_ago.strftime('%Y-%m-%d %H:%M:%S')[0:10]

    timestamp = []
    user = []
    text = []
    retweet_count = []
    i = 0
    for tweet in tweepy.Cursor(api.search, q='#bitcoin', lang="en", since=start).items():
        i += 1
        timestamp.append(tweet.created_at)
        retweet_count.append(tweet.retweet_count)
        text.append(tweet.text)
        user.append(tweet.user.screen_name)
        if i > 1500:
            break

    start2 = int(round(timestamp[-1].replace(tzinfo=timezone.utc).timestamp()))
    rawlink = "http://api.bitcoincharts.com/v1/trades.csv?symbol=bitstampUSD"
    link = rawlink + "&start=" + str(int(round(start2)))
    filename = wget.download(link)
    btcprice = pd.read_csv(filename, header=None)
    btcprice.columns = ['unixtime', 'price', 'amount']
    converted_time = btcprice['unixtime'].apply(
        lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))

    d = {'timestamp': timestamp, 'user': user, 'text': text, 'retweet': retweet_count}
    df = pd.DataFrame(data=d)
    df.to_csv("most_recent_tweet.csv")

    btcprice['timestamp'] = converted_time
    btcprice2 = btcprice.iloc[::50, :].reset_index()
    del btcprice2['index']
    df2 = df.iloc[::-1].reset_index()
    del df2['index']
    btcprice2['timestamp'] = btcprice2['timestamp'].apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

    def cal_direction(array):
        direction = np.ones(len(array))
        for i in range(len(array) - 1):
            if array[i + 1] - array[i] < 0:
                direction[i + 1] = 0
        return direction

    btcprice2 = btcprice2.assign(direction=cal_direction(btcprice2['price'].values))

    direction_tweet = np.zeros(len(df2))
    for x in range(len(df2)):
        for y in range(len(btcprice2)):
            if btcprice2.loc[y, 'timestamp'] > df2.loc[x, 'timestamp']:
                direction_tweet[x] = btcprice2.loc[y, 'direction']
                break

    stopwords = nltk.corpus.stopwords.words('english')
    ps = nltk.PorterStemmer()

    def clean_text(text):
        text = "".join([word.lower() for word in text if word not in string.punctuation])
        tokens = re.split(r'\W+', text)
        text = [ps.stem(word) for word in tokens if word not in stopwords]
        return text

    tfidf_vec = TfidfVectorizer(analyzer=clean_text)
    x_tfidf = tfidf_vec.fit_transform(df2['text'])
    x_tfidf.columns = tfidf_vec.get_feature_names()
    x_counts_tfidf = pd.DataFrame(x_tfidf.toarray())
    x_feature = pd.concat([df2[['retweet']], x_counts_tfidf], axis=1)

    x_feature2 = x_feature.loc[:int(round(0.8 * len(x_feature))) - 1, :]
    direction_tweet2 = direction_tweet[:int(round(0.8 * len(direction_tweet)))]
    x_est = x_feature.loc[int(round(0.8 * len(x_feature))):, :]
    train_size = int(round(0.8 * len(x_feature2)))
    x_train = x_feature2.loc[:train_size - 1, :]
    x_test = x_feature2.loc[train_size:, :]
    y_train = direction_tweet2[:train_size]
    y_test = direction_tweet2[train_size:]

    rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
    rf_model = rf.fit(x_train, y_train)
    y_pred = rf_model.predict(x_test)

    label = None
    if sum(y_pred == 0) >= sum(y_pred == 1):
        label = 0
    else:
        label = 1
    precision, recall, fscore, support = score(y_test, y_pred, pos_label=label, average='binary')
    val1 = 'Precision: {} / Recall: {} / Accuracy: {}'.format(
        round(precision, 3), round(recall, 3),
        round((y_pred == y_test).sum() / len(y_pred), 3))

    y_est = rf_model.predict(x_est)
    p1 = sum(y_est == 1)
    p0 = sum(y_est == 0)
    val2 = None
    if p1 > p0:
        val2 = ("The random forest model detects an upward trend based on conversations "
                "on Twitter with a probability of " + str(p1 / len(y_est)))
    else:
        val2 = ("The random forest model detects a downward trend based on conversations "
                "on Twitter with a probability of " + str(p0 / len(y_est)))
    return val1, val2
def getBestWords(trainSet):
    # extract features for each review and store them in a list of tuples pertaining to each review
    # this is the training data to be passed to the classifier
    word_freq = nltk.probability.FreqDist()
    label_freq = nltk.probability.ConditionalFreqDist()
    stemmer = nltk.PorterStemmer()

    print("Getting word frequency..")
    i = 0
    for review in trainSet:
        if (review[2] == 'pos'):
            words = [stemmer.stem(x.lower()) for x in review[3]]
            word_freq.update(nltk.probability.FreqDist(words))
            word_freq.update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['pos'].update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['pos'].update(nltk.probability.FreqDist(words))
        elif (review[2] == 'neg'):
            words = [stemmer.stem(x.lower()) for x in review[3]]
            word_freq.update(nltk.probability.FreqDist(words))
            word_freq.update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['neg'].update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['neg'].update(nltk.probability.FreqDist(words))
        if (i % 20 == 0):
            print(".", end="")
        if (i % 1000 == 0):
            print(str(i))
        i = i + 1
    print(str(i) + " Finished")

    pos_words = label_freq['pos'].N()
    neg_words = label_freq['neg'].N()
    total_words = pos_words + neg_words

    word_scores = {}
    print("Calculating word scores..")
    for word, freq in word_freq.items():
        pos_score = nltk.BigramAssocMeasures.chi_sq(label_freq['pos'][word],
                                                    (freq, pos_words),
                                                    total_words)
        neg_score = nltk.BigramAssocMeasures.chi_sq(label_freq['neg'][word],
                                                    (freq, neg_words),
                                                    total_words)
        tag = nltk.pos_tag([word])[0][1]
        if ('VB' in tag or 'NN' in tag or 'RB' in tag or 'JJ' in tag):
            word_scores[word] = pos_score + neg_score

    print("Sorting Word scores..")
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:5000]
    print("Getting Best words..")
    bestwords = set([w for w, s in best])
    return bestwords
# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

# change all to lower case
processed = processed.str.lower()

# remove stop words
stop_words = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

# Reduce words to their stems using a Porter stemmer
ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

# create bag-of-words
all_words = []
for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

# print the total number of words and the 15 most common words
# print('Number of words: {}'.format(len(all_words)))
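# The comment above mentions the 15 most common words, but the corresponding call is not
# part of the original snippet; a minimal sketch of that inspection, assuming the standard
# nltk.FreqDist API:
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))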
def __init__(self, path, StopwordRemoval=False, Stemming=False, Debug=False):
    # run in corpus folder
    for root, directories, documents in os.walk(path):
        # filter
        documents = [f for f in documents if f.endswith('.html')]
        # use a small part of the corpus
        if Debug:
            documents = [
                f for f in documents if int(f.split('.')[0]) < 1000
            ]

    # build index
    self._documents = documents
    if Debug:
        print('stopword removal:{0}, stemming:{1}'.format(
            StopwordRemoval, Stemming))
    if StopwordRemoval:
        self._stop = nltk.corpus.stopwords.words('english')
    if Stemming:
        self._stemmer = nltk.PorterStemmer()

    # index is a dict of dict
    # self._index = defaultdict(lambda: defaultdict(int))
    self._index = defaultdict(dd)
    self.N = 0
    self.docIDs = []
    for document in documents:
        if Debug:
            print('processing document {0}...'.format(document))
        try:
            '''example of document: 21393.html
               documentID: 21393
               split the document name by dot and parse the first part as an int'''
            documentID = int(document.split('.')[0])
            print("parsing Doc {0}".format(self.N))
            self.docIDs.append(documentID)
            # read content and tokenize
            content = open(path + '/' + document, errors='ignore')
            raw = content.read()
            raw = raw.lower()
            tokens = nltk.word_tokenize(raw)
            if StopwordRemoval:
                tokens = [
                    token for token in tokens if token not in self._stop
                ]
            if Stemming:
                tokens = [self._stemmer.stem(token) for token in tokens]
                # tokens = map(lambda token: self._stemmer.stem(token), tokens)
            '''
            index[token][documentID] -- term frequency -- tf
            size of dict index[token] -- document frequency -- df
            '''
            for token in tokens:
                self._index[token][documentID] += 1
            self.N = self.N + 1
        except Exception as e:
            print('error occurred when reading {0}'.format(documentID))
            raise e
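# The docstring above describes how tf and df are encoded in the nested index; a small,
# self-contained illustration of that layout (the term 'retrieval' and document id 21393
# are illustrative values, not taken from the original corpus):
from collections import defaultdict

index = defaultdict(lambda: defaultdict(int))
index['retrieval'][21393] += 1   # one posting: 'retrieval' occurs once in document 21393
tf = index['retrieval'][21393]   # term frequency of 'retrieval' in document 21393
df = len(index['retrieval'])     # document frequency: number of documents containing the term
print(tf, df)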
# print(READ_DATA)

# eliminate punctuation
for char in string.punctuation:
    READ_DATA = READ_DATA.replace(char, ' ')
# print(READ_DATA)

# eliminate numbers
for char in string.digits:
    READ_DATA = READ_DATA.replace(char, ' ')
# print(READ_DATA)

# perform stemming using the nltk stemmer
tokens = nltk.word_tokenize(READ_DATA)
porter = nltk.PorterStemmer()
looper = 0
for token in tokens:
    tokens[looper] = porter.stem(token)
    looper += 1
# print("Stemmed -->")
# print(tokens)

for token in tokens:
    # print(token)
    # check if word exists in dictionary
    if token in list['words']:
        # print(token + ' exists')
        # check if doc already in word's docList
def add_details(self, details=None, commit=False, **kwargs):
    """
    Adds arbitrary key-value pairs to this entry.

    Parameters
    ----------
    details : list
        .. versionadded:: 0.1.8

        List of dict of structure:

        .. code-block::

            [{
                'key': '',
                'value': '',
                'description': ''
            }]

        where the ``description`` is optional and can be omitted.
        If no descriptions are passed at all, you can also use
        `**kwargs` to pass ``key=value`` pairs.
    commit : bool
        If True, the Entry session will be added to the current
        session and the transaction is committed. Can have
        side-effects. Defaults to False.

    """
    ps = nltk.PorterStemmer()

    # build entries here
    detail_list = []

    # parse kwargs
    for k, v in kwargs.items():
        detail_list.append({
            'entry_id': self.id,
            'key': str(k),
            'stem': ps.stem(k),
            'value': v
        })

    # parse details
    if details is not None:
        for detail in details:
            d = {
                'entry_id': self.id,
                'key': detail['key'],
                'stem': ps.stem(detail['key']),
                'value': detail['value']
            }
            if 'description' in detail.keys():
                d['description'] = detail['description']
            detail_list.append(d)

    # build the models
    for detail in detail_list:
        self.details.append(models.Detail(**detail))

    if commit:
        session = object_session(self)
        try:
            session.add(self)
            session.commit()
        except Exception as e:
            session.rollback()
            raise e
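# A minimal usage sketch based on the docstring above; the Entry instance `entry` and the
# example keys/values are illustrative assumptions, not part of the original code:
entry.add_details(
    details=[{'key': 'sensor height', 'value': '2 m', 'description': 'mounting height'}],
    commit=True)

# equivalently, simple key=value pairs can be passed through **kwargs
entry.add_details(commit=True, operator='field team A')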