Example #1
def text_preproc(line):
	text = []
	line = BeautifulSoup(line).get_text(" ")
	porter = nltk.PorterStemmer()
	words = nltk.word_tokenize(line)
	for w in words:
		cleanword = clean_word(w)
		if cleanword and cleanword not in STOP_WORDS and len(cleanword) > 1:
			text.append(porter.stem(cleanword))
	return text
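The function relies on clean_word and STOP_WORDS being defined elsewhere in the module. A minimal usage sketch with hypothetical stand-ins for those helpers (illustrative only, not part of the original project):

import nltk
from bs4 import BeautifulSoup

nltk.download('punkt', quiet=True)  # tokenizer data used by nltk.word_tokenize (newer NLTK may also need 'punkt_tab')

STOP_WORDS = {'the', 'a', 'an', 'and', 'of', 'were'}  # hypothetical stop-word set

def clean_word(w):
    # hypothetical cleaner: lower-case and strip surrounding punctuation
    return w.lower().strip(".,!?;:")

print(text_preproc("<p>The quick brown foxes were running quickly.</p>"))
# roughly: ['quick', 'brown', 'fox', 'run', 'quickli']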
Example #2
def load_train(filepath):
    df = pd.DataFrame(columns=['text', 'sent'])
    text = []
    sent = []

    with open(filepath, 'r') as train:
        reader = csv.reader(train, delimiter=',')
        for row in reader:
            review = clean_text(row[0], True)

            text.append(review)
            if row[1] == 'neutral':
                sent.append(0)
            elif row[1] == 'positive':
                sent.append(1)
            else:
                sent.append(-1)

    df['text'] = text
    df['sent'] = sent
    df = df.sample(frac=1).reset_index(drop=True)
    return df
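load_train expects a headerless two-column CSV (review text, sentiment label) and a clean_text helper defined elsewhere in the module. A hypothetical invocation with a stand-in for that helper and a small example file:

import csv
import pandas as pd

def clean_text(s, lower=True):
    # hypothetical stand-in for the project's clean_text helper
    return s.lower() if lower else s

with open('reviews.csv', 'w', newline='') as f:
    csv.writer(f).writerows([
        ['Great product, works well', 'positive'],
        ['It was okay', 'neutral'],
        ['Terrible support', 'negative'],
    ])

df = load_train('reviews.csv')
print(df)  # shuffled rows with columns ['text', 'sent']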
Example #3
import sys

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

dirr = str(sys.argv[1])
outt = str(sys.argv[2])

f = open('title.txt', 'r')
data = []
for line in f:
    data.append(line.strip())
f.close()

f = open('stopword.txt', 'r')
text = []
for line in f:
    text.append(line.strip())
f.close()

mystop = {'doesn', 'don', 'won', 'isn', 'aren', 're', 'shouldn'}
stop = mystop.union(text)

TFIDFvectorizer = TfidfVectorizer(stop_words=stop, min_df=0.00001)
x = TFIDFvectorizer.fit_transform(data)

svd = TruncatedSVD(n_components=20)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

U = lsa.fit_transform(x)

kmeans = KMeans(n_clusters=20)
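The snippet stops after constructing the KMeans estimator. A hypothetical continuation that fits it on the LSA-reduced matrix U and inspects the clusters (not part of the original script):

import numpy as np

labels = kmeans.fit_predict(U)
print(np.bincount(labels))  # documents per cluster

# map cluster centers back to TF-IDF term space and list top terms per cluster
terms = TFIDFvectorizer.get_feature_names_out()
centroids = svd.inverse_transform(kmeans.cluster_centers_)
for k, centroid in enumerate(centroids):
    top = centroid.argsort()[::-1][:5]
    print(k, [terms[t] for t in top])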
Example #4
    # `path` (a glob pattern) and `filelist` are assumed to be defined
    # earlier in the enclosing function.
    for file in glob.glob(path):
        filelist.append(file)

    def concatenate_list_data(lines):
        # join the lines read from one file into a single string
        result = ''
        for element in lines:
            result += str(element)
        return result

    text = []
    for i in range(len(filelist)):
        # for i in range(10):
        with open(filelist[i], "r") as myfile:
            data = myfile.readlines()
            data = concatenate_list_data(data)
        text.append(data)
        if i % 1000 == 0:
            print('{}th txt file reading...'.format(i))

    vect = CountVectorizer(stop_words="english")
    # vect = CountVectorizer()
    bow = vect.fit_transform(text).toarray()
    norm_bow = normalize(bow, norm='l1', axis=1)
    norm_data = pd.DataFrame(norm_bow)
    # print(norm_data.shape)
    norm_data.to_csv('./total_asrfeat.csv', index=False)

    norm_data = pd.read_csv('./total_asrfeat.csv')
    norm_data.head(3)
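For reference, the CountVectorizer + L1 normalisation step above, shown standalone on a toy corpus (not part of the original script):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

docs = ['the cat sat on the mat', 'the dog barked at the cat']
vect = CountVectorizer(stop_words='english')
bow = vect.fit_transform(docs).toarray()
norm_bow = normalize(bow, norm='l1', axis=1)  # each row sums to 1
print(pd.DataFrame(norm_bow, columns=vect.get_feature_names_out()))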
Example #5
#        text.append(t)
#        f.write(t+'\n')
#        f2.write(classes[j]+'\n')
#        j=j+1
#f2.close()
# `sentence_stemmed`, `classes` and `text` are assumed to be defined
# earlier in the original script.
with open('preprocessedData2.csv', 'w', newline='') as csvfile:
    fieldnames = ['AgeClass', 'post']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for j, line in enumerate(sentence_stemmed):
        t = " ".join(line)
        text.append(t)
        writer.writerow({'AgeClass': classes[j], 'post': t})
print("sentence full==============================================")

#posts, test, classes, testclass = train_test_split(text,
#                                                          classes,
#                                                          test_size=0.33,
#                                                          random_state=42,stratify=classes)
#
#
##print(posts)
#count_vect = CountVectorizer()
#X_train_counts = count_vect.fit_transform(posts)
##print(X_train_counts)
#tfidf_transformer = TfidfTransformer()

# In[11]:

lemm_a = lemmat(allwords, y)

# In[12]:

text = []
sentence = []
a = []
for j in text1:
    a = re.findall(r"[\w']+|[.,!?;]", j)
    lemm_t = lemmat(a, y)
    sentence = ' '.join(lemm_t)
    text.append(sentence)

# In[13]:

valid_data = []
sentence1 = []
a = []
for j in valid_data1:
    a = re.findall(r"[\w']+|[.,!?;]", j)
    lemm_t = lemmat(a, y)
    sentence1 = ' '.join(lemm_t)
    valid_data.append(sentence1)
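# Note: the regex used above keeps word tokens (including apostrophes) and
# standalone punctuation, e.g.
#   re.findall(r"[\w']+|[.,!?;]", "Don't panic, it's fine!")
#   -> ["Don't", 'panic', ',', "it's", 'fine', '!']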

# In[14]:

test_data = []
Example #6
# `lines` (the raw lines of a tweets file), `tweets`, `actual_location`,
# `stop_words`, `stemmer`, `num` and `text` are assumed to be defined
# earlier in the original script.
for line in lines:
    if len(line) != 0:
        tweets.append(json.loads(line))

for tweet in tweets:
    location = tweet['tweet']['user']['location'].lower()
    if 'washington' in location and not 'dc' in location:
        actual_location.append(0)  # 0 for 'WA'
        tweet_text = tweet['tweet']['text'].lower()
        tweet_text = [w for w in tweet_text.split() if w not in stop_words]
        tweet_text = " ".join([stemmer.stem(plural) for plural in tweet_text])
        for b in num:
            tweet_text = tweet_text.replace(b, "")
        for c in string.punctuation:
            tweet_text = tweet_text.replace(c, "")
        text.append(tweet_text)
        print(tweet_text)
        print('*********')

    elif 'massachusetts' in location:
        actual_location.append(1)  # 1 for 'MA'
        tweet_text = tweet['tweet']['text'].lower()
        tweet_text = [w for w in tweet_text.split() if w not in stop_words]
        tweet_text = " ".join([stemmer.stem(plural) for plural in tweet_text])
        for b in num:
            tweet_text = tweet_text.replace(b, "")
        for c in string.punctuation:
            tweet_text = tweet_text.replace(c, "")
        text.append(tweet_text)
        print(tweet_text)
        print('*********')
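Both branches apply the same cleaning sequence (stop-word removal, Porter stemming, digit and punctuation stripping). As a standalone sketch, the steps could be factored into one helper; the names below are illustrative, not from the original script:

import string
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
num = '0123456789'
stop_words = {'the', 'a', 'is', 'are', 'in'}  # stand-in stop-word set

def clean_tweet(raw_text):
    words = [w for w in raw_text.lower().split() if w not in stop_words]
    cleaned = ' '.join(stemmer.stem(w) for w in words)
    for ch in num + string.punctuation:
        cleaned = cleaned.replace(ch, '')
    return cleaned

print(clean_tweet('The Seahawks are playing in Seattle, WA!'))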
Example #7
def main():
    # %% Q1-2: first look
    results = []
    for hashtag in ['gohawks', 'gopatriots', 'patriots', 'sb49']:
        results.append(first_look(hashtag)[1])
    for hashtag in ['nfl', 'superbowl']:
        results.append(first_look(hashtag, plot=True)[1])
    results_df = pd.DataFrame(results)

    # %% Q3: linear regression
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    summary = []
    for hashtag in hashtag_list:
        data = first_look(hashtag)[0]
        X, y = extract_Xy(data)
        summary.append(fit_OLS(X, y, hashtag))
    summary_df = pd.concat(summary, axis=1, ignore_index=True)

    # %% Q4: design new features
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    summary = []
    est_params = []
    est_fitted_values = []
    Xs = []
    ys = []
    filename = 'Q4'
    for hashtag in hashtag_list:
        print('running #{}...'.format(hashtag))
        start_time = time.time()
        X, y, _ = new_features(hashtag, filename)
        smry, est = fit_OLS(X, y, hashtag)
        summary.append(smry)  # append the summary dataframe
        est_params.append(est.params)
        est_fitted_values.append(est.fittedvalues)
        Xs.append(X)
        ys.append(y)
        print('time elapsed: {:.2f}s'.format(time.time() - start_time))

    summary_df = pd.concat(summary, axis=1, ignore_index=True)
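    # the 'mean' column below averages each row over the 6 hashtags (sum / 6);
    # the 'hashtag', 'mse_total' and 'r_squared' rows are excluded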
    summary_df['mean'] = summary_df.sum(axis=1)
    summary_df['mean'] = summary_df['mean'].drop(
        index=['hashtag', 'mse_total', 'r_squared'])
    summary_df['mean'] = summary_df['mean'].astype(float).round(4) / 6
    est_params_df = pd.concat(est_params, axis=1, ignore_index=True)
    est_params_df.columns = hashtag_list

    hash_number = dict(gohawks=0,
                       gopatriots=1,
                       patriots=2,
                       sb49=3,
                       nfl=4,
                       superbowl=5)

    # %% Q5
    best_feature = ['url_count_sum', 'user_mentions_sum', 'retweets_max']

    for hashtag in hashtag_list:
        fig, ax = plt.subplots(1, 3, figsize=(15, 4))
        i = 0
        for feature in best_feature:
            slope = est_params[hash_number[hashtag]][feature]
            ax[i].scatter(Xs[hash_number[hashtag]][feature],
                          ys[hash_number[hashtag]],
                          label='observed',
                          s=15)
            ax[i].scatter(Xs[hash_number[hashtag]][feature],
                          est_fitted_values[hash_number[hashtag]],
                          label='fitted',
                          s=15)
            ax[i].plot(Xs[hash_number[hashtag]][feature],
                       Xs[hash_number[hashtag]][feature] * slope,
                       'c',
                       label='slope = {:.2f}'.format(slope))
            ax[i].set_title('#{}, feature: {}'.format(hashtag, feature),
                            fontsize=16)
            ax[i].set_xlabel('value of feature', fontsize=16)
            ax[i].set_ylabel('number of tweets for next hour', fontsize=16)
            ax[i].legend(loc='upper left', fontsize=12)
            ax[i].tick_params(labelsize=12)
            i += 1
        plt.tight_layout()
        plt.savefig('results/Q5_#{}.png'.format(hashtag), dpi=300)
        plt.show()

    # %% Q6: split time periods
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    summary = []
    for hashtag in hashtag_list:
        before_df, between_df, after_df = split_periods(hashtag)
        for i, data in enumerate([before_df, between_df, after_df]):
            X, y = extract_Xy(data)
            summary.append(
                fit_OLS(X, y,
                        hashtag + '_' + ['before', 'between', 'after'][i]))
    summary_df = pd.concat(summary, axis=1, ignore_index=True)
    summary_before = summary_df.loc[:, [
        'before' in x for x in summary_df.loc['hashtag']
    ]]
    summary_between = summary_df.loc[:, [
        'between' in x for x in summary_df.loc['hashtag']
    ]]
    summary_after = summary_df.loc[:, [
        'after' in x for x in summary_df.loc['hashtag']
    ]]

    # %% Q7: aggregate all hashtags
    # cache agg data for future use
    hashtag_list = [
        'gohawks', 'gopatriots', 'patriots', 'sb49', 'nfl', 'superbowl'
    ]
    for i, period in enumerate(['before', 'between', 'after']):
        data = []
        for hashtag in hashtag_list:
            data.append(split_periods(hashtag)[i])
        data_agg = pd.concat(data, axis=0)
        data_agg.to_csv('data/aggregated_data_{}.csv'.format(period))
    # %% linear regression
    summary = []
    for i, period in enumerate(['before', 'between', 'after']):
        data_agg = pd.read_csv('data/aggregated_data_{}.csv'.format(period))
        data_agg['hour'] = pd.to_datetime(data_agg['hour'])
        X, y = extract_Xy(data_agg)
        summary.append(fit_OLS(X, y, 'aggregated_' + period))
    summary_df = pd.concat(summary, axis=1, ignore_index=True)

    # %% Q8: random forest, grid search
    param_grid = {
        'max_depth': [10, 20, 40, 60, 80, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000]
    }
    reg = RandomForestRegressor()
    summary_df_rf = gridsearch_periods(reg, param_grid)

    # %% Q9: compare RF and OLS on entire agg dataset
    data_agg_all = []
    for i, period in enumerate(['before', 'between', 'after']):
        data_agg = pd.read_csv('data/aggregated_data_{}.csv'.format(period))
        data_agg['hour'] = pd.to_datetime(data_agg['hour'])
        data_agg_all.append(data_agg)
    data_agg_all = pd.concat(data_agg_all, axis=0, ignore_index=True)
    X, y = extract_Xy(data_agg_all)
    rf = RandomForestRegressor(max_depth=80,
                               max_features='sqrt',
                               min_samples_leaf=4,
                               min_samples_split=5,
                               n_estimators=200)
    rf.fit(X, y)
    y_pred = rf.predict(X)
    mse_rf = metrics.mean_squared_error(y, y_pred)
    mse_OLS = fit_OLS(X, y, 'total').loc['mse_total', 0]
    print(
        'On entire aggregated data, random forest mse: {:.4f}'.format(mse_rf),
        ', OLS mse: {:.4f}'.format(mse_OLS))

    # %% Q10: gradient boosting, grid search
    param_grid = {
        'max_depth': [10, 20, 40, 60, 80, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000]
    }
    reg = GradientBoostingRegressor()
    summary_df_gb = gridsearch_periods(reg, param_grid)

    # %% Q11: MLPRegressor
    X, y = extract_Xy(data_agg_all)
    size_list = [(50, ), (100, ), (200, ), (20, 10), (50, 10)]
    mse_nn = {}
    for size in size_list:
        reg = MLPRegressor(hidden_layer_sizes=size)
        reg.fit(X, y)
        mse_nn.update(
            {str(size): [metrics.mean_squared_error(y, reg.predict(X))]})
    mse_nn_df = pd.DataFrame(mse_nn)

    # %% Q12: standard scaler
    scaler = StandardScaler()
    X_scale = scaler.fit_transform(X)
    reg = MLPRegressor(hidden_layer_sizes=(200, ))
    reg.fit(X_scale, y)
    print(metrics.mean_squared_error(y, reg.predict(X_scale)))

    # %% Q13: grid search for periods
    param_grid = {'hidden_layer_sizes': size_list}
    reg = MLPRegressor()
    summary_df_nn = gridsearch_periods(reg, param_grid)

    # %% Q14: train a rf using all agg data first
    data_agg_all = []
    for i, period in enumerate(['before', 'between', 'after']):
        data_agg = pd.read_csv('data/aggregated_data_{}.csv'.format(period))
        data_agg['hour'] = pd.to_datetime(data_agg['hour'])
        data_agg_all.append(data_agg)
    data_before, data_between, data_after = data_agg_all
    data_agg_all = pd.concat(data_agg_all, axis=0, ignore_index=True)
    X_all, y_all = extract_Xy(data_agg_all)
    X_before, y_before = extract_Xy(data_before)
    X_between, y_between = extract_Xy(data_between)
    X_after, y_after = extract_Xy(data_after)

    rf = RandomForestRegressor(max_depth=80,
                               max_features='sqrt',
                               min_samples_leaf=4,
                               min_samples_split=5,
                               n_estimators=200)
    # load test data and predict
    rf.fit(X_all, y_all)
    mse_test_all_1 = predict_6x(rf, 'rf_all', 1)
    mse_test_all_2 = predict_6x(rf, 'rf_all', 2)
    mse_test_all_3 = predict_6x(rf, 'rf_all', 3)

    rf.fit(X_before, y_before)
    mse_test_before_1 = predict_6x(rf, 'rf_before', 1)
    rf.fit(X_between, y_between)
    mse_test_between_2 = predict_6x(rf, 'rf_between', 2)
    rf.fit(X_after, y_after)
    mse_test_after_3 = predict_6x(rf, 'rf_after', 3)

    # %% Q15 fan base prediction, label base first
    filename = 'data/tweets_#superbowl.txt'
    substring_WA = ['WA', 'Washington', 'Seattle']
    substring_MA = ['MA', 'Massachusetts', 'Boston']
    locations = []
    base = []
    text = []
    with open(filename, 'r') as f:
        for line in f:
            line = json.loads(line)
            location = line['tweet']['user']['location']
            if any([(s in location) and ('DC' not in location)
                    for s in substring_WA]):
                base.append('Washington')
            elif any([s in location for s in substring_MA]):
                base.append('Massachusetts')
            else:
                base.append('other')
            locations.append(location)
            text.append(line['tweet']['text'])
    data = pd.DataFrame({'location': locations, 'base': base, 'text': text})
    data_location = data[data['base'] != 'other'].reset_index(drop=True)

    # %% base classification
    # create custom analyzer remove_num, incorporating the lemmatizer
    analyze = CountVectorizer().build_analyzer()  # default analyzer
    remove_num = lambda doc: [
        word for word in lemmatize(analyze(doc)) if not word.isdigit()
    ]
    # create vectorizer with the above analyzer, vectorize and tfidf_transform
    vectorizer = CountVectorizer(min_df=3,
                                 analyzer=remove_num,
                                 stop_words='english')
    tfidf_transformer = TfidfTransformer(smooth_idf=False)
    # fit and transform on train data
    # encode the fan base label: Massachusetts -> 0, Washington -> 1
    y = data_location['base'].map({'Massachusetts': 0, 'Washington': 1})
    y = np.array(y).astype(int)

    X = vectorizer.fit_transform(data_location['text'])
    X_tfidf = tfidf_transformer.fit_transform(X)

    # %% reduce dimension then classify
    nmf = NMF(n_components=50, init='random', random_state=0)
    X_tfidf_nmf = nmf.fit_transform(X_tfidf)

    # %% run clf, report ROC, confusion matrix, accuracy, recall, precision
    # %% Naive Bayes
    mnb = MultinomialNB(alpha=0.01)
    clf_report(mnb, X_tfidf_nmf, y, 'Multinomial Naive Bayes')
    # %% Logistic
    lr = LogisticRegression(C=1e6)
    clf_report(lr, X_tfidf_nmf, y, 'Logistic Regression')
    # %% random forest
    rfc = RandomForestClassifier(500, max_depth=10, max_features='auto')
    clf_report(rfc, X_tfidf_nmf, y, 'Random Forest')
    # %% 5 fold CV
    cv_clf = []
    for clf in [mnb, lr, rfc]:
        cv_clf.append(
            np.mean(
                cross_validate(clf,
                               X_tfidf_nmf,
                               y,
                               cv=5,
                               scoring='accuracy',
                               n_jobs=-1)['test_score']))
    print('5 fold CV, mean test accuracy, mnb={:.4f}, lr={:.4f}'
          ', rfc={:.4f}'.format(*cv_clf))

    # %% Q16 predict support team for people from other areas
    data_other = data[data['base'] == 'other'].reset_index(drop=True)
    X_other = vectorizer.transform(data_other['text'])
    X_other_tfidf = tfidf_transformer.transform(X_other)
    X_other_nmf = nmf.transform(X_other_tfidf)
    lr.fit(X_tfidf_nmf, y)
    pred_other = lr.predict(X_other_nmf)

    #%% visualize
    for i, team in enumerate(['Patriots', 'Seahawks']):
        X_team = X_other_nmf[pred_other == i, :]
        plt.scatter(X_team[:, 0],
                    X_team[:, 1],
                    marker='.',
                    alpha=0.5,
                    label='{}, {}'.format(team, len(X_team)))
    plt.legend()
    plt.show()

    #%% cluster MA, WA
    cluster = KMeans(n_clusters=6,
                     n_init=30,
                     max_iter=1000,
                     random_state=0)
    pred = cluster.fit_predict(X_tfidf_nmf)
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    axes[0].scatter(X_tfidf_nmf[:, 0],
                    X_tfidf_nmf[:, 1],
                    marker='.',
                    alpha=0.5,
                    c=pred,
                    label='clustered, log then scale')
    axes[0].legend(loc="upper right")
    axes[1].scatter(X_tfidf_nmf[:, 0],
                    X_tfidf_nmf[:, 1],
                    marker='.',
                    alpha=0.5,
                    c=y,
                    label='labeled')
    axes[1].legend(loc="upper right")
    plt.show()
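gridsearch_periods (used for Q8, Q10 and Q13 above) is defined elsewhere in the project; presumably it wraps sklearn's GridSearchCV for each time period. A self-contained sketch of that kind of search, on synthetic data and with a reduced grid (illustrative only, not the project's code):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 5))
y_demo = X_demo[:, 0] * 3 + rng.normal(size=200)

param_grid_demo = {'max_depth': [10, None],
                   'min_samples_leaf': [1, 4],
                   'n_estimators': [100, 200]}
gs = GridSearchCV(RandomForestRegressor(random_state=0),
                  param_grid_demo,
                  scoring='neg_mean_squared_error',
                  cv=5,
                  n_jobs=-1)
gs.fit(X_demo, y_demo)
print(gs.best_params_, -gs.best_score_)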
Example #8
def main():

    all_fms = []
    all_recalls = []
    all_precs = []
    all_aucs = []

    adm_ids = []
    class_labels = []
    text = []
    text_real = []

    # get input data (test data for generation models)
    test_doc_ids = [line.rstrip('\n') for line in open(sys.argv[1])]
    test_texts = [line.rstrip('\n') for line in open(sys.argv[2])]
    test_real_texts = [line.rstrip('\n') for line in open(sys.argv[3])]

    gen_dict = defaultdict(list)
    gen_real_dict = defaultdict(list)

    for n, doc_id in enumerate(test_doc_ids):

        gen_text = test_texts[n]
        gen_dict[doc_id].extend(gen_text.split(' '))
        gen_real_text = test_real_texts[n]
        gen_real_dict[doc_id].extend(gen_real_text.split(' '))

    df = pd.read_csv('input.csv')
    df = df.fillna('')

    # get labels from original DB dump
    for row in df.iterrows():
        row = row[1]
        class_row = np.zeros(len(conditions))
        doc_id = str(row['Document_ID'])
        if doc_id not in gen_dict.keys():
            continue
        for n, dia in enumerate(conditions):
            if (row['Diagnosis'] == dia):
                class_row[n] = 1
        adm_ids.append(doc_id)
        class_labels.append(class_row)

    for doc_id in adm_ids:

        text.append((doc_id, gen_dict[doc_id]))
        text_real.append((doc_id, gen_real_dict[doc_id]))

    class_labels = np.array(class_labels)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)
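    # 5-fold CV stratified on the argmax condition label; train/validation
    # folds are drawn from the generated text, the test fold from the real text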

    for trainval_index, test_index in skf.split(
            text, np.argmax(class_labels, axis=1)):

        xtrainval, xtest = [
            x for i, x in enumerate(text) if i in trainval_index
        ], [x for i, x in enumerate(text_real) if i in test_index]
        ytrainval, ytest = [
            x for i, x in enumerate(class_labels) if i in trainval_index
        ], [x for i, x in enumerate(class_labels) if i in test_index]
        train_size = int(round(len(text) * .7))
        xtrain, xval, ytrain, yval = train_test_split(xtrainval,
                                                      ytrainval,
                                                      train_size=train_size,
                                                      random_state=8)
        print(len(xtrain))
        print(len(xval))
        print(len(xtest))
        doc_length_array = []

        train_x = [x[1] for x in xtrain]

        global word2index
        global embedding_weights

        word2index = {'padding': 0}
        index2word = {0: 'padding'}

        counter = 1
        cc = 0

        # define max doc length for padding
        for x in train_x:

            doc_length_array.append(len(x))
            for word in x:
                if word not in word2index:
                    word2index[word] = counter
                    index2word[counter] = word
                    counter += 1
        global maxlen
        maxlen = int(np.percentile(doc_length_array, 75, axis=0))
        print(np.mean(doc_length_array))
        print(maxlen)
        x_train = [line.rstrip('\n').split(' ') for line in open('train.txt')]
        w2v_model = train_word2vec(x_train,
                                   index2word,
                                   num_features=embedding_dims,
                                   min_word_count=1,
                                   context=10)
        print('absent')
        print(cc)
        global embedding_matrix
        embedding_matrix = np.zeros((len(word2index), embedding_dims))
        for i in range(len(word2index)):
            if index2word[i] in w2v_model.wv:
                embedding_vector = w2v_model.wv[index2word[i]]
                embedding_matrix[i] = embedding_vector

        print('Computing features')
        valid_x = [x[1] for x in xval]
        test_x = [x[1] for x in xtest]

        test_ids = [x[0] for x in xtest]

        vect, X_features, val_X_features, test_X_features = extract_features(
            train_x, valid_x, test_x, 5)
        ytrain = np.array(ytrain)
        yval = np.array(yval)
        ytest = np.array(ytest)

        print('Training loop')
        for index, condition in enumerate(conditions):
            print('Current Condition: {0}'.format(condition))

            train_y = ytrain[:, index]
            valid_y = yval[:, index]
            test_y = ytest[:, index]

            current_fms = []
            current_recalls = []
            current_precs = []
            current_aucs = []
            for y in range(1):
                fm, prec, recall, auc = make_predictions(
                    X_features, train_y, val_X_features, valid_y,
                    test_X_features, test_y, 1, test_ids)  # s)
                current_fms.append(fm)
                current_precs.append(prec)
                current_recalls.append(recall)
                current_aucs.append(auc)
            print('\n')
            current_fms = np.array(current_fms)
            current_precs = np.array(current_precs)
            current_recalls = np.array(current_recalls)
            current_aucs = np.array(current_aucs)
            all_fms.extend(current_fms)
            all_precs.extend(current_precs)
            all_recalls.extend(current_recalls)
            all_aucs.extend(current_aucs)
    print('F-measure all')
    print(all_fms)
    print('Precision all')
    print(all_precs)
    print('Recall all')
    print(all_recalls)
    print('Aucs all')
    print(all_aucs)
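The embedding-matrix construction above depends on train_word2vec, which is defined elsewhere in the project. A standalone sketch of the same idea with gensim's Word2Vec on a toy corpus (assumes gensim 4.x; names and data are illustrative):

import numpy as np
from gensim.models import Word2Vec

sentences = [['chest', 'pain', 'and', 'fever'], ['no', 'acute', 'distress']]
embedding_dims = 8
model = Word2Vec(sentences, vector_size=embedding_dims, min_count=1, window=5)

word2index = {'padding': 0}
for sent in sentences:
    for word in sent:
        word2index.setdefault(word, len(word2index))

embedding_matrix = np.zeros((len(word2index), embedding_dims))
for word, idx in word2index.items():
    if word in model.wv:                 # the padding row stays all-zero
        embedding_matrix[idx] = model.wv[word]
print(embedding_matrix.shape)            # (vocabulary size, embedding_dims)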