import nltk
from bs4 import BeautifulSoup


def text_preproc(line):
    """Strip HTML, tokenize, drop stop words and one-character tokens, and stem."""
    text = []
    line = BeautifulSoup(line, "html.parser").get_text(" ")
    porter = nltk.PorterStemmer()
    words = nltk.word_tokenize(line)
    for w in words:
        cleanword = clean_word(w)  # clean_word and STOP_WORDS come from elsewhere in this module
        if cleanword and cleanword not in STOP_WORDS and len(cleanword) > 1:
            text.append(porter.stem(cleanword))
    return text
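# text_preproc() relies on a clean_word() helper and a STOP_WORDS collection defined
# elsewhere in the project. The definitions below are a minimal illustrative sketch,
# not the project's actual implementations.
import re

from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words('english'))


def clean_word(word):
    # Lowercase the token and keep alphabetic characters only.
    return re.sub(r'[^a-z]', '', word.lower())


# With these sketch definitions, text_preproc("<p>The cats are running!</p>")
# yields ['cat', 'run'].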
import csv

import pandas as pd


def load_train(filepath):
    """Load the labelled training CSV, clean each review, and return a shuffled DataFrame."""
    df = pd.DataFrame(columns=['text', 'sent'])
    text = []
    sent = []
    with open(filepath, 'r') as train:
        reader = csv.reader(train, delimiter=',')
        for row in reader:
            review = clean_text(row[0], True)  # clean_text is defined elsewhere in this module
            text.append(review)
            if row[1] == 'neutral':
                sent.append(0)
            elif row[1] == 'positive':
                sent.append(1)
            else:
                sent.append(-1)
    df['text'] = text
    df['sent'] = sent
    # Shuffle the rows so the class order in the file does not leak into training.
    df = df.sample(frac=1).reset_index(drop=True)
    return df
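# Illustrative usage of load_train(); 'data/train.csv' is a hypothetical path, and the
# CSV is assumed to hold "<review text>,<neutral|positive|negative>" rows.
train_df = load_train('data/train.csv')
print(train_df['sent'].value_counts())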
import sys

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

dirr = str(sys.argv[1])
outt = str(sys.argv[2])

# Read the document titles, one per line.
with open('title.txt', 'r') as f:
    data = [line.strip() for line in f]

# Read the stop-word list, one word per line.
with open('stopword.txt', 'r') as f:
    text = [line.strip() for line in f]

# Add a few contraction fragments that survive tokenization.
mystop = {'doesn', 'don', 'won', 'isn', 'aren', 're', 'shouldn'}
stop = mystop.union(text)

# TF-IDF, then LSA (truncated SVD + L2 normalization), then k-means.
TFIDFvectorizer = TfidfVectorizer(stop_words=stop, min_df=0.00001)
x = TFIDFvectorizer.fit_transform(data)
svd = TruncatedSVD(n_components=20)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
U = lsa.fit_transform(x)
kmeans = KMeans(n_clusters=20)
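# The original snippet stops after constructing the KMeans object. A minimal sketch of
# how the clustering step might be completed (an assumption; whether the labels are
# meant to be written to `outt` is not shown above):
labels = kmeans.fit_predict(U)
with open(outt, 'w') as out:
    out.write('\n'.join(str(label) for label in labels))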
import glob

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# Collect the transcript files to read; `path` is the glob pattern set earlier.
filelist = []
for file in glob.glob(path):
    filelist.append(file)


def concatenate_list_data(lines):
    """Join a list of lines into a single string."""
    result = ''
    for element in lines:
        result += str(element)
    return result


text = []
for i in range(len(filelist)):
    # for i in range(10):
    with open(filelist[i], "r") as myfile:
        data = myfile.readlines()
        data = concatenate_list_data(data)
        text.append(data)
    if i % 1000 == 0:
        print('{}th txt file reading...'.format(i))

# Bag-of-words counts with English stop words removed, L1-normalized per document.
vect = CountVectorizer(stop_words="english")
# vect = CountVectorizer()
bow = vect.fit_transform(text).toarray()
norm_bow = normalize(bow, norm='l1', axis=1)
norm_data = pd.DataFrame(norm_bow)
# print(norm_data.shape)
norm_data.to_csv('./total_asrfeat.csv', index=False)
norm_data = pd.read_csv('./total_asrfeat.csv')
norm_data.head(3)
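# An equivalent, more compact way to read the files (illustrative only; the loop above
# remains the actual pipeline). file.read() already returns the lines joined into one
# string, so no concatenation helper is needed in this variant.
def read_all_texts(pattern):
    texts = []
    for i, fname in enumerate(sorted(glob.glob(pattern))):
        with open(fname, "r") as myfile:
            texts.append(myfile.read())
        if i % 1000 == 0:
            print('{}th txt file reading...'.format(i))
    return texts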
# text.append(t)
# f.write(t+'\n')
# f2.write(classes[j]+'\n')
# j = j+1
# f2.close()

# Write one row per stemmed post: its age class and the space-joined tokens.
with open('preprocessedData2.csv', 'w', newline='') as csvfile:
    fieldnames = ['AgeClass', 'post']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    j = -1
    for line in sentence_stemmed:
        t = " ".join(line)
        text.append(t)
        j = j + 1
        writer.writerow({'AgeClass': classes[j], 'post': t})
print("sentence full==============================================")

# posts, test, classes, testclass = train_test_split(text,
#                                                    classes,
#                                                    test_size=0.33,
#                                                    random_state=42, stratify=classes)
#
# print(posts)
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(posts)
# print(X_train_counts)
# tfidf_transformer = TfidfTransformer()
# In[11]:

lemm_a = lemmat(allwords, y)

# In[12]:

# Tokenize each training document, lemmatize the tokens, and rejoin into a string.
text = []
for j in text1:
    a = re.findall(r"[\w']+|[.,!?;]", j)
    lemm_t = lemmat(a, y)
    sentence = ' '.join(lemm_t)
    text.append(sentence)

# In[13]:

# Same preprocessing for the validation documents.
valid_data = []
for j in valid_data1:
    a = re.findall(r"[\w']+|[.,!?;]", j)
    lemm_t = lemmat(a, y)
    sentence1 = ' '.join(lemm_t)
    valid_data.append(sentence1)

# In[14]:

test_data = []
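# lemmat() is defined in an earlier cell of the original notebook; it takes a token
# list plus a second argument `y` whose role is not visible here. A rough illustrative
# stand-in (hypothetical, not the notebook's helper, and omitting `y`) might be:
from nltk.stem import WordNetLemmatizer

_wnl = WordNetLemmatizer()


def lemmat_sketch(tokens):
    # Lemmatize each token with WordNet's default (noun) lemmas.
    return [_wnl.lemmatize(tok) for tok in tokens]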
# for line in lines:
if len(line) != 0:
    tweets.append(json.loads(line))

for tweet in tweets:
    location = tweet['tweet']['user']['location'].lower()
    if 'washington' in location and 'dc' not in location:
        actual_location.append(0)  # 0 for 'WA'
        tweet_text = tweet['tweet']['text'].lower()
        tweet_text = [w for w in tweet_text.split() if w not in stop_words]
        tweet_text = " ".join([stemmer.stem(plural) for plural in tweet_text])
        for b in num:
            tweet_text = tweet_text.replace(b, "")
        for c in string.punctuation:
            tweet_text = tweet_text.replace(c, "")
        text.append(tweet_text)
        print(tweet_text)
        print('*********')
    elif 'massachusetts' in location:
        actual_location.append(1)  # 1 for 'MA'
        tweet_text = tweet['tweet']['text'].lower()
        tweet_text = [w for w in tweet_text.split() if w not in stop_words]
        tweet_text = " ".join([stemmer.stem(plural) for plural in tweet_text])
        for b in num:
            tweet_text = tweet_text.replace(b, "")
        for c in string.punctuation:
            tweet_text = tweet_text.replace(c, "")
        text.append(tweet_text)
        print(tweet_text)
        print('*********')
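# The two branches above run an identical cleanup sequence; a small helper such as the
# hypothetical clean_tweet() below (not part of the original script) would remove the
# duplication. It reuses the script's existing stop_words, stemmer, and num variables.
def clean_tweet(raw_text):
    words = [w for w in raw_text.lower().split() if w not in stop_words]
    cleaned = " ".join(stemmer.stem(w) for w in words)
    for ch in num:
        cleaned = cleaned.replace(ch, "")
    for ch in string.punctuation:
        cleaned = cleaned.replace(ch, "")
    return cleaned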
def main():
    # %% Q1-2: first look
    results = []
    for hashtag in ['gohawks', 'gopatriots', 'patriots', 'sb49']:
        results.append(first_look(hashtag)[1])
    for hashtag in ['nfl', 'superbowl']:
        results.append(first_look(hashtag, plot=True)[1])
    results_df = pd.DataFrame(results)

    # %% Q3: linear regression
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    summary = []
    for hashtag in hashtag_list:
        data = first_look(hashtag)[0]
        X, y = extract_Xy(data)
        summary.append(fit_OLS(X, y, hashtag))
    summary_df = pd.concat(summary, axis=1, ignore_index=True)

    # %% Q4: design new features
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    summary = []
    est_params = []
    est_fitted_values = []
    Xs = []
    ys = []
    filename = 'Q4'
    for hashtag in hashtag_list:
        print('running #{}...'.format(hashtag))
        start_time = time.time()
        X, y, _ = new_features(hashtag, filename)
        ols_summary, est = fit_OLS(X, y, hashtag)
        summary.append(ols_summary)  # append dataframe
        est_params.append(est.params)
        est_fitted_values.append(est.fittedvalues)
        Xs.append(X)
        ys.append(y)
        print('time elapsed: {:.2f}s'.format(time.time() - start_time))
    summary_df = pd.concat(summary, axis=1, ignore_index=True)
    summary_df['mean'] = summary_df.sum(axis=1)
    summary_df['mean'] = summary_df['mean'].drop(
        index=['hashtag', 'mse_total', 'r_squared'])
    summary_df['mean'] = summary_df['mean'].astype(float).round(4) / 6
    est_params_df = pd.concat(est_params, axis=1, ignore_index=True)
    est_params_df.columns = hashtag_list
    hash_number = dict(gohawks=0,
                       gopatriots=1,
                       patriots=2,
                       sb49=3,
                       nfl=4,
                       superbowl=5)

    # %% Q5
    best_feature = ['url_count_sum', 'user_mentions_sum', 'retweets_max']
    for hashtag in hashtag_list:
        fig, ax = plt.subplots(1, 3, figsize=(15, 4))
        i = 0
        for feature in best_feature:
            slope = est_params[hash_number[hashtag]][feature]
            ax[i].scatter(Xs[hash_number[hashtag]][feature],
                          ys[hash_number[hashtag]],
                          label='observed',
                          s=15)
            ax[i].scatter(Xs[hash_number[hashtag]][feature],
                          est_fitted_values[hash_number[hashtag]],
                          label='fitted',
                          s=15)
            ax[i].plot(Xs[hash_number[hashtag]][feature],
                       Xs[hash_number[hashtag]][feature] * slope,
                       'c',
                       label='slope = {:.2f}'.format(slope))
            ax[i].set_title('#{}, feature: {}'.format(hashtag, feature),
                            fontsize=16)
            ax[i].set_xlabel('value of feature', fontsize=16)
            ax[i].set_ylabel('number of tweets for next hour', fontsize=16)
            ax[i].legend(loc='upper left', fontsize=12)
            ax[i].tick_params(labelsize=12)
            i += 1
        plt.tight_layout()
        plt.savefig('results/Q5_#{}.png'.format(hashtag), dpi=300)
        plt.show()

    # %% Q6: split time periods
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    summary = []
    for hashtag in hashtag_list:
        before_df, between_df, after_df = split_periods(hashtag)
        for i, data in enumerate([before_df, between_df, after_df]):
            X, y = extract_Xy(data)
            summary.append(
                fit_OLS(X, y,
                        hashtag + '_' + ['before', 'between', 'after'][i]))
    summary_df = pd.concat(summary, axis=1, ignore_index=True)
    summary_before = summary_df.loc[:, [
        'before' in x for x in summary_df.loc['hashtag']
    ]]
    summary_between = summary_df.loc[:, [
        'between' in x for x in summary_df.loc['hashtag']
    ]]
    summary_after = summary_df.loc[:, [
        'after' in x for x in summary_df.loc['hashtag']
    ]]

    # %% Q7: aggregate all hashtags
    # cache agg data for future use
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    for i, period in enumerate(['before', 'between', 'after']):
        data = []
        for hashtag in hashtag_list:
            data.append(split_periods(hashtag)[i])
        data_agg = pd.concat(data, axis=0)
        data_agg.to_csv('data/aggregated_data_{}.csv'.format(period))

    # %% linear regression
    summary = []
    for i, period in enumerate(['before', 'between', 'after']):
        data_agg = pd.read_csv('data/aggregated_data_{}.csv'.format(period))
        data_agg['hour'] = pd.to_datetime(data_agg['hour'])
        X, y = extract_Xy(data_agg)
        summary.append(fit_OLS(X, y, 'aggregated_' + period))
    summary_df = pd.concat(summary, axis=1, ignore_index=True)

    # %% Q8: random forest, grid search
    param_grid = {
        'max_depth': [10, 20, 40, 60, 80, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000]
    }
    reg = RandomForestRegressor()
    summary_df_rf = gridsearch_periods(reg, param_grid)

    # %% Q9: compare RF and OLS on entire agg dataset
    data_agg_all = []
    for i, period in enumerate(['before', 'between', 'after']):
        data_agg = pd.read_csv('data/aggregated_data_{}.csv'.format(period))
        data_agg['hour'] = pd.to_datetime(data_agg['hour'])
        data_agg_all.append(data_agg)
    data_agg_all = pd.concat(data_agg_all, axis=0, ignore_index=True)
    X, y = extract_Xy(data_agg_all)
    rf = RandomForestRegressor(max_depth=80,
                               max_features='sqrt',
                               min_samples_leaf=4,
                               min_samples_split=5,
                               n_estimators=200)
    rf.fit(X, y)
    y_pred = rf.predict(X)
    mse_rf = metrics.mean_squared_error(y, y_pred)
    mse_OLS = fit_OLS(X, y, 'total').loc['mse_total', 0]
    print(
        'On entire aggregated data, random forest mse: {:.4f}'.format(mse_rf),
        ', OLS mse: {:.4f}'.format(mse_OLS))

    # %% Q10: gradient boosting, grid search
    param_grid = {
        'max_depth': [10, 20, 40, 60, 80, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000]
    }
    reg = GradientBoostingRegressor()
    summary_df_gb = gridsearch_periods(reg, param_grid)

    # %% Q11: MLPRegressor
    X, y = extract_Xy(data_agg_all)
    size_list = [(50, ), (100, ), (200, ), (20, 10), (50, 10)]
    mse_nn = {}
    for size in size_list:
        reg = MLPRegressor(hidden_layer_sizes=size)
        reg.fit(X, y)
        mse_nn.update(
            {str(size): [metrics.mean_squared_error(y, reg.predict(X))]})
    mse_nn_df = pd.DataFrame(mse_nn)

    # %% Q12: standard scaler
    scaler = StandardScaler()
    X_scale = scaler.fit_transform(X)
    reg = MLPRegressor(hidden_layer_sizes=(200, ))
    reg.fit(X_scale, y)
    print(metrics.mean_squared_error(y, reg.predict(X_scale)))

    # %% Q13: grid search for periods
    param_grid = {'hidden_layer_sizes': size_list}
    reg = MLPRegressor()
    summary_df_nn = gridsearch_periods(reg, param_grid)

    # %% Q14: train a rf using all agg data first
    data_agg_all = []
    for i, period in enumerate(['before', 'between', 'after']):
        data_agg = pd.read_csv('data/aggregated_data_{}.csv'.format(period))
        data_agg['hour'] = pd.to_datetime(data_agg['hour'])
        data_agg_all.append(data_agg)
    data_before, data_between, data_after = data_agg_all
    data_agg_all = pd.concat(data_agg_all, axis=0, ignore_index=True)
    X_all, y_all = extract_Xy(data_agg_all)
    X_before, y_before = extract_Xy(data_before)
    X_between, y_between = extract_Xy(data_between)
    X_after, y_after = extract_Xy(data_after)
    rf = RandomForestRegressor(max_depth=80,
                               max_features='sqrt',
                               min_samples_leaf=4,
                               min_samples_split=5,
                               n_estimators=200)
    # load test data and predict
    rf.fit(X_all, y_all)
    mse_test_all_1 = predict_6x(rf, 'rf_all', 1)
    mse_test_all_2 = predict_6x(rf, 'rf_all', 2)
    mse_test_all_3 = predict_6x(rf, 'rf_all', 3)
    rf.fit(X_before, y_before)
    mse_test_before_1 = predict_6x(rf, 'rf_before', 1)
    rf.fit(X_between, y_between)
    mse_test_between_2 = predict_6x(rf, 'rf_between', 2)
    rf.fit(X_after, y_after)
    mse_test_after_3 = predict_6x(rf, 'rf_after', 3)

    # %% Q15 fan base prediction, label base first
    filename = 'data/tweets_#superbowl.txt'
    substring_WA = ['WA', 'Washington', 'Seattle']
    substring_MA = ['MA', 'Massachusetts', 'Boston']
    locations = []
    base = []
    text = []
    with open(filename, 'r') as f:
        for line in f:
            line = json.loads(line)
            location = line['tweet']['user']['location']
            if any([(s in location) and ('DC' not in location)
                    for s in substring_WA]):
                base.append('Washington')
            elif any([s in location for s in substring_MA]):
                base.append('Massachusetts')
            else:
                base.append('other')
            locations.append(location)
            text.append(line['tweet']['text'])
    data = pd.DataFrame({'location': locations, 'base': base, 'text': text})
    data_location = data[data['base'] != 'other'].reset_index(drop=True)

    # %% base classification
    # create custom analyzer remove_num, incorporating the lemmatizer
    analyze = CountVectorizer().build_analyzer()  # default analyzer
    remove_num = lambda doc: [
        word for word in lemmatize(analyze(doc)) if not word.isdigit()
    ]
    # create vectorizer with the above analyzer, vectorize and tfidf_transform
    vectorizer = CountVectorizer(min_df=3,
                                 analyzer=remove_num,
                                 stop_words='english')
    tfidf_transformer = TfidfTransformer(smooth_idf=False)
    # fit and transform on train data
    y = data_location['base']
    y[y == 'Massachusetts'] = 0
    y[y == 'Washington'] = 1
    y = np.array(y).astype(int)
    X = vectorizer.fit_transform(data_location['text'])
    X_tfidf = tfidf_transformer.fit_transform(X)

    # %% reduce dimension then classify
    nmf = NMF(n_components=50, init='random', random_state=0)
    X_tfidf_nmf = nmf.fit_transform(X_tfidf)

    # %% run clf, report ROC, confusion matrix, accuracy, recall, precision
    # %% Naive Bayes
    mnb = MultinomialNB(alpha=0.01)
    clf_report(mnb, X_tfidf_nmf, y, 'Multinomial Naive Bayes')

    # %% Logistic
    lr = LogisticRegression(C=1e6)
    clf_report(lr, X_tfidf_nmf, y, 'Logistic Regression')

    # %% random forest
    rfc = RandomForestClassifier(500, max_depth=10, max_features='auto')
    clf_report(rfc, X_tfidf_nmf, y, 'Random Forest')

    # %% 5 fold CV
    cv_clf = []
    for clf in [mnb, lr, rfc]:
        cv_clf.append(
            np.mean(
                cross_validate(clf,
                               X_tfidf_nmf,
                               y,
                               cv=5,
                               scoring='accuracy',
                               n_jobs=-1)['test_score']))
    print('5 fold CV, mean test accuracy, mnb={:.4f}, lr={:.4f}'
          ', rfc={:.4f}'.format(*cv_clf))

    # %% Q16 predict support team for people from other areas
    data_other = data[data['base'] == 'other'].reset_index(drop=True)
    X_other = vectorizer.transform(data_other['text'])
    X_other_tfidf = tfidf_transformer.transform(X_other)
    X_other_nmf = nmf.transform(X_other_tfidf)
    lr.fit(X_tfidf_nmf, y)
    pred_other = lr.predict(X_other_nmf)

    # %% visualize
    for i, team in enumerate(['Patriots', 'Seahawks']):
        X_team = X_other_nmf[pred_other == i, :]
        plt.scatter(X_team[:, 0],
                    X_team[:, 1],
                    marker='.',
                    alpha=0.5,
                    label='{}, {}'.format(team, len(X_team)))
    plt.legend()
    plt.show()

    # %% cluster MA, WA
    cluster = KMeans(n_clusters=6,
                     n_init=30,
                     max_iter=1000,
                     random_state=0,
                     n_jobs=-1)
    pred = cluster.fit_predict(X_tfidf_nmf)
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    axes[0].scatter(X_tfidf_nmf[:, 0],
                    X_tfidf_nmf[:, 1],
                    marker='.',
                    alpha=0.5,
                    c=pred,
                    label='clustered, log then scale')
    axes[0].legend(loc="upper right")
    axes[1].scatter(X_tfidf_nmf[:, 0],
                    X_tfidf_nmf[:, 1],
                    marker='.',
                    alpha=0.5,
                    c=y,
                    label='labeled')
    axes[1].legend(loc="upper right")
    plt.show()
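# lemmatize() used in the base-classification step above is a project helper that maps
# a list of tokens to their lemmas. An illustrative stand-in (an assumption, not the
# project's implementation) could be:
#
#     from nltk.stem import WordNetLemmatizer
#     _wnl = WordNetLemmatizer()
#
#     def lemmatize(tokens):
#         return [_wnl.lemmatize(tok) for tok in tokens]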
def main():
    all_fms = []
    all_recalls = []
    all_precs = []
    all_aucs = []
    adm_ids = []
    class_labels = []
    text = []
    text_real = []

    # get input data (test data for generation models)
    test_doc_ids = [line.rstrip('\n') for line in open(sys.argv[1])]
    test_texts = [line.rstrip('\n') for line in open(sys.argv[2])]
    test_real_texts = [line.rstrip('\n') for line in open(sys.argv[3])]

    gen_dict = defaultdict(list)
    gen_real_dict = defaultdict(list)
    for n, doc_id in enumerate(test_doc_ids):
        gen_text = test_texts[n]
        gen_dict[doc_id].extend(gen_text.split(' '))
        gen_real_text = test_real_texts[n]
        gen_real_dict[doc_id].extend(gen_real_text.split(' '))

    df = pd.read_csv('input.csv')
    df = df.fillna('')

    # get labels from original DB dump
    for row in df.iterrows():
        row = row[1]
        class_row = np.zeros(len(conditions))
        doc_id = str(row['Document_ID'])
        if doc_id not in gen_dict.keys():
            continue
        for n, dia in enumerate(conditions):
            if row['Diagnosis'] == dia:
                class_row[n] = 1
        adm_ids.append(doc_id)
        class_labels.append(class_row)

    for doc_id in adm_ids:
        text.append((doc_id, gen_dict[doc_id]))
        text_real.append((doc_id, gen_real_dict[doc_id]))

    class_labels = np.array(class_labels)
    skf = StratifiedKFold(n_splits=5, random_state=8)
    for trainval_index, test_index in skf.split(
            text, np.argmax(class_labels, axis=1)):
        xtrainval, xtest = [
            x for i, x in enumerate(text) if i in trainval_index
        ], [x for i, x in enumerate(text_real) if i in test_index]
        ytrainval, ytest = [
            x for i, x in enumerate(class_labels) if i in trainval_index
        ], [x for i, x in enumerate(class_labels) if i in test_index]
        train_size = int(round(len(text) * .7))
        xtrain, xval, ytrain, yval = train_test_split(xtrainval,
                                                      ytrainval,
                                                      train_size=train_size,
                                                      random_state=8)
        print(len(xtrain))
        print(len(xval))
        print(len(xtest))

        doc_length_array = []
        train_x = [x[1] for x in xtrain]

        global word2index
        global embedding_weights
        word2index = {'padding': 0}
        index2word = {0: 'padding'}
        counter = 1
        cc = 0
        # define max doc length for padding
        for x in train_x:
            doc_length_array.append(len(x))
            for word in x:
                if word not in word2index:
                    word2index[word] = counter
                    index2word[counter] = word
                    counter += 1
        global maxlen
        maxlen = int(np.percentile(doc_length_array, 75, axis=0))
        print(np.mean(doc_length_array))
        print(maxlen)

        x_train = [line.rstrip('\n').split(' ') for line in open('train.txt')]
        w2v_model = train_word2vec(x_train,
                                   index2word,
                                   num_features=embedding_dims,
                                   min_word_count=1,
                                   context=10)
        print('absent')
        print(cc)

        global embedding_matrix
        embedding_matrix = np.zeros((len(word2index), embedding_dims))
        for i in range(len(word2index)):
            if index2word[i] in w2v_model:
                embedding_vector = w2v_model.wv[index2word[i]]
                embedding_matrix[i] = embedding_vector

        print('Computing features')
        valid_x = [x[1] for x in xval]
        test_x = [x[1] for x in xtest]
        test_ids = [x[0] for x in xtest]
        vect, X_features, val_X_features, test_X_features = extract_features(
            train_x, valid_x, test_x, 5)

        ytrain = np.array(ytrain)
        yval = np.array(yval)
        ytest = np.array(ytest)

        print('Training loop')
        for index, condition in enumerate(conditions):
            print('Current Condition: {0}'.format(condition))
            train_y = ytrain[:, index]
            valid_y = yval[:, index]
            test_y = ytest[:, index]

            current_fms = []
            current_recalls = []
            current_precs = []
            current_aucs = []
            for y in range(1):
                fm, prec, recall, auc = make_predictions(
                    X_features, train_y, val_X_features, valid_y,
                    test_X_features, test_y, 1, test_ids)
                current_fms.append(fm)
                current_precs.append(prec)
                current_recalls.append(recall)
                current_aucs.append(auc)
                print('\n')

            current_fms = np.array(current_fms)
            current_precs = np.array(current_precs)
            current_recalls = np.array(current_recalls)
            current_aucs = np.array(current_aucs)

            all_fms.extend(current_fms)
            all_precs.extend(current_precs)
            all_recalls.extend(current_recalls)
            all_aucs.extend(current_aucs)

    print('F-measure all')
    print(all_fms)
    print('Precision all')
    print(all_precs)
    print('Recall all')
    print(all_recalls)
    print('Aucs all')
    print(all_aucs)