target = 'safe_loans'

# Restrict the frame to the model's feature columns plus the target label.
loans = loans[features + [target]]

#train_idx=ps.read_json('module-5-assignment-1-train-idx.json')
# Raw strings: '\U' in a plain literal is a SyntaxError under Python 3.
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-train-idx.json',
          'r') as f:
    train_idx = json.load(f)

#test_idx=ps.read_json('module-5-assignment-1-test-idx.json')
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-validation-idx.json',
          'r') as f:
    validation_idx = json.load(f)

# Split by the assignment-provided positional indices.
train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]
train_matrix, train_output = get_numpy_data(train_data, features, target)
validation_matrix, validation_output = get_numpy_data(validation_data,
                                                      features, target)

safe_loans_raw = loans[loans[target] == +1]   # label +1: safe loan
risky_loans_raw = loans[loans[target] == -1]  # label -1: risky loan
# Parenthesized prints run under both Python 2 and Python 3.
print("Number of safe loans  : %s" % len(safe_loans_raw))
print("Number of risky loans : %s" % len(risky_loans_raw))
# Downsample the majority (safe) class by the risky/safe ratio so the two
# classes end up balanced.
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))
risky_loans = risky_loans_raw
# BUG FIX: pandas DataFrame.sample takes the fraction as `frac=` and the
# RNG seed as `random_state=`.  The original `sample(percentage, seed=1)`
# passed the fraction as the positional integer count `n` and used an
# unsupported `seed` keyword (that is the SFrame API, not pandas).
safe_loans = safe_loans_raw.sample(frac=percentage, random_state=1)

tree = sklearn.tree.DecisionTreeClassifier(max_depth=6)

decision_tree_model = tree.fit(train_matrix, train_output)
# Esempio n. 2 (Example no. 2) — scraped-snippet separator; the stray "0"
# vote count from the scrape is removed here so the file stays importable.
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    Portability fix: the original two-argument form
    ``text.translate(None, string.punctuation)`` exists only in Python 2;
    filtering character-by-character behaves identically on both 2 and 3.
    """
    import string
    return ''.join(ch for ch in text if ch not in string.punctuation)


# Strip punctuation so word counts below match on bare tokens.
products['review_clean'] = products['review'].apply(remove_punctuation)

#Iterating over the words in important words
for word in important_words:
    # One count column per important word: number of occurrences of that
    # word in the cleaned review text.
    products[word] = products['review_clean'].apply(
        lambda s: s.split().count(word))

# number of reviews with the word perfect
# NOTE(review): bare expression — the result is not stored; 2955 is the
# recorded assignment answer.
sum(products['perfect'] > 0)  # 2955

feature_matrix, sentiment = get_numpy_data(products, important_words,
                                           'sentiment')

feature_matrix.shape  #193 features

# Computing the optimal coefficients
# np.zeros(194): presumably 193 word features plus an intercept column
# added by get_numpy_data — TODO confirm against its definition.
coefficients = logistic_regression(feature_matrix,
                                   sentiment,
                                   initial_coefficients=np.zeros(194),
                                   step_size=1e-7,
                                   max_iter=301)

# Computing the scores for those coefficients
scores = np.dot(feature_matrix, coefficients)

# classifying the scores
# Non-positive score -> class -1 (negative sentiment); 27946 such reviews.
negatives = (scores <= 0).astype(int) * (-1)  #27946
#Cleaning the review column
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    Portability fix: the original two-argument form
    ``text.translate(None, string.punctuation)`` exists only in Python 2;
    filtering character-by-character behaves identically on both 2 and 3.
    """
    import string
    return ''.join(ch for ch in text if ch not in string.punctuation)

# Strip punctuation so word counts below match on bare tokens.
products['review_clean'] = products['review'].apply(remove_punctuation)

#Iterating over the words in important words
for word in important_words:
    # One count column per important word: occurrences in the cleaned review.
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
    
# number of reviews with the word perfect
# NOTE(review): bare expression — the result is not stored; 2955 is the
# recorded assignment answer.
sum(products['perfect']>0) # 2955

feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment') 

feature_matrix.shape  #193 features

# Computing the optimal coefficients
# np.zeros(194): presumably 193 word features plus an intercept column
# added by get_numpy_data — TODO confirm against its definition.
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(194),
 step_size=1e-7, max_iter=301)

# Computing the scores for those coefficients
scores = np.dot(feature_matrix, coefficients)

# classifying the scores
# Non-positive score -> -1, positive -> +1; summing the two indicator
# vectors yields the final +/-1 class predictions.
negatives = (scores <= 0).astype(int)*(-1) #27946
positives = (scores > 0).astype(int) #25126

predictions = negatives+positives
# ] — stray closing bracket left over from the scraped snippet; commented
# out so the file stays syntactically valid.

target = 'safe_loans'

# Restrict the frame to the model's feature columns plus the target label.
loans = loans[features + [target]]


#train_idx=ps.read_json('module-5-assignment-1-train-idx.json')
# Raw strings: '\U' in a plain literal is a SyntaxError under Python 3.
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-train-idx.json', 'r') as f:
    train_idx = json.load(f)

#test_idx=ps.read_json('module-5-assignment-1-test-idx.json')
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-validation-idx.json', 'r') as f:
    validation_idx = json.load(f)

# Split by the assignment-provided positional indices.
train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]
train_matrix, train_output = get_numpy_data(train_data, features, target)
validation_matrix, validation_output = get_numpy_data(validation_data, features, target)

safe_loans_raw = loans[loans[target] == +1]   # label +1: safe loan
risky_loans_raw = loans[loans[target] == -1]  # label -1: risky loan
# Parenthesized prints run under both Python 2 and Python 3.
print("Number of safe loans  : %s" % len(safe_loans_raw))
print("Number of risky loans : %s" % len(risky_loans_raw))
# Downsample the majority (safe) class by the risky/safe ratio so the two
# classes end up balanced.
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))
risky_loans = risky_loans_raw
# BUG FIX: pandas DataFrame.sample takes the fraction as `frac=` and the
# RNG seed as `random_state=`.  The original `sample(percentage, seed=1)`
# passed the fraction as the positional integer count `n` and used an
# unsupported `seed` keyword (that is the SFrame API, not pandas).
safe_loans = safe_loans_raw.sample(frac=percentage, random_state=1)

tree = sklearn.tree.DecisionTreeClassifier(max_depth=6)

decision_tree_model = tree.fit(train_matrix, train_output)