Example #1
from my_tools import get_bill_data
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# retrieve data from MongoDB
data, _ = get_bill_data()

beyond_intro = data[data['bill_status'] != 'Introduced']

# overlay histograms comparing all introduced bills, bills beyond introduction, and bills that passed
passed_df = data[data['labels'] == 1]

fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)
ax.set_title(
    'Number of Bills Introduced (orange), Beyond Introduced (red), and Passed (green) vs. Time',
    fontdict={'fontsize': 16})
ax.hist(data['intro_date'], bins=500, alpha=.35, color='orange')
ax.hist(beyond_intro['intro_date'], bins=500, alpha=.5, color='r')
ax.hist(passed_df['intro_date'], bins=500, color='g')
ax.set_ylim(0, 400)
plt.show()
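
The `get_bill_data` helper imported from `my_tools` is not shown on this page. A minimal sketch of what such a helper might look like, assuming a local MongoDB instance and placeholder database/collection names (only the `intro_date`, `bill_status`, and `labels` fields are confirmed by the examples):

# Hypothetical sketch of my_tools.get_bill_data -- not the original implementation.
import pandas as pd
from pymongo import MongoClient

def get_bill_data(uri='mongodb://localhost:27017/',
                  db_name='bills', collection_name='bill_details'):
    """Load bill records from MongoDB into DataFrames.

    Returns (labeled bills, bills still in progress), matching how the
    examples above unpack the result.
    """
    client = MongoClient(uri)
    records = list(client[db_name][collection_name].find())
    df = pd.DataFrame(records)
    df['intro_date'] = pd.to_datetime(df['intro_date'])

    # Assumed convention: bills without a final outcome label are still in progress.
    in_progress = df[df['labels'].isna()]
    data = df[df['labels'].notna()]
    return data, in_progress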
Example #2
import os
import pandas as pd
from my_tools import get_bill_data, process_corpus, read_jsonl_file
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix

import joblib  # sklearn.externals.joblib was removed in newer scikit-learn

# get bill data
print('-------------------')
print('Loading original and preprocessed data for vectorizing and modeling...')
data, in_progress = get_bill_data()

corpus_with_labels = read_jsonl_file(
    '/home/ubuntu/galvanize_capstone/data/nlp/corpus_with_labels.jsonl')
corpus_df = pd.DataFrame(list(corpus_with_labels))

X = corpus_df['document']
y = corpus_df['label'].astype(int)

# create stratified train-test split
print('-------------------')
print('Doing train-test split...')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y)  #, random_state = 123)

# Already vectorized using pickle_nlp_boosting_model.py
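
The example stops where pickle_nlp_boosting_model.py takes over. As an illustrative sketch only (hyperparameters and output paths are assumptions, not taken from that script), the vectorizing and modeling step could continue from the split above roughly like this:

# Illustrative continuation -- not the original pickle_nlp_boosting_model.py.
print('-------------------')
print('Vectorizing corpus and fitting model...')
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = RandomForestClassifier(n_estimators=200, n_jobs=-1)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print('Accuracy:  {:.3f}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {:.3f}'.format(precision_score(y_test, y_pred)))
print('Recall:    {:.3f}'.format(recall_score(y_test, y_pred)))
print('F1:        {:.3f}'.format(f1_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))

# Persist the fitted vectorizer and model for reuse.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(model, 'rf_model.pkl')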
Example #3
import numpy as np
import pandas as pd
from pymongo import MongoClient
import pprint
import string
import re
from collections import Counter

from my_tools import get_bill_data, process_corpus

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix

from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams, skipgrams

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  #, ComplementNB unreleased as of 12/14

import matplotlib.pyplot as plt
plt.style.use('ggplot')

data, _ = get_bill_data()  # unpack the (data, in_progress) tuple returned by get_bill_data
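
The snippet ends right after loading the data. Given the NLTK and NMF imports above, a hedged sketch of a topic-extraction step that could follow is shown below; the `body` text column, the stemming choices, and the NMF parameters are assumptions, and the `process_corpus` helper is not reproduced here.

# Illustrative TF-IDF + NMF topic extraction; column names and parameters are assumptions.
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[a-z]+')

def clean(doc):
    tokens = tokenizer.tokenize(doc.lower())
    return ' '.join(stemmer.stem(t) for t in tokens if t not in stop_words)

documents = data['body'].apply(clean)  # 'body' is an assumed text column

tfidf = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf.fit_transform(documents)

nmf = NMF(n_components=10, random_state=123)
W = nmf.fit_transform(tfidf_matrix)  # document-topic weights
H = nmf.components_                  # topic-term weights

# Print the top terms for each latent topic.
terms = np.array(tfidf.get_feature_names_out())  # use get_feature_names() on older scikit-learn
for k, topic in enumerate(H):
    top = terms[topic.argsort()[::-1][:10]]
    print('Topic {}: {}'.format(k, ', '.join(top)))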