# --- Decision-tree setup on LendingClub loan data ---
target = 'safe_loans'
loans = loans[features + [target]]

# Load the assignment-provided train/validation row indices.
# Raw strings: '\U' in a plain literal is a SyntaxError under Python 3.
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-train-idx.json', 'r') as f:
    train_idx = json.load(f)
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-validation-idx.json', 'r') as f:
    validation_idx = json.load(f)

train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]

train_matrix, train_output = get_numpy_data(train_data, features, target)
validation_matrix, validation_output = get_numpy_data(validation_data, features, target)

# Inspect class balance: safe loans are labeled +1, risky loans -1.
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print("Number of safe loans : %s" % len(safe_loans_raw))
print("Number of risky loans : %s" % len(risky_loans_raw))

# Undersample the majority (safe) class down to the risky-class size.
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))
risky_loans = risky_loans_raw
# BUG FIX: pandas DataFrame.sample takes frac= and random_state=; the original
# positional-fraction + seed= call is the GraphLab SFrame API and raises
# TypeError under pandas.
safe_loans = safe_loans_raw.sample(frac=percentage, random_state=1)

tree = sklearn.tree.DecisionTreeClassifier(max_depth=6)
decision_tree_model = tree.fit(train_matrix, train_output)
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    BUG FIX: str.translate(None, chars) is Python-2-only; this filter-join
    produces the same result on both Python 2 and 3.
    """
    import string
    punct = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punct)

products['review_clean'] = products['review'].apply(remove_punctuation)

# One count column per important word: how often the word occurs per review.
# w=word binds the loop variable eagerly (defensive against late binding).
for word in important_words:
    products[word] = products['review_clean'].apply(
        lambda s, w=word: s.split().count(w))

# Number of reviews containing the word 'perfect' (expected: 2955).
sum(products['perfect'] > 0)

feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
feature_matrix.shape  # 193 word features (+ intercept column -> 194)

# Fit logistic-regression coefficients by gradient ascent.
coefficients = logistic_regression(feature_matrix, sentiment,
                                   initial_coefficients=np.zeros(194),
                                   step_size=1e-7, max_iter=301)

# Score each review and classify by sign of the score.
scores = np.dot(feature_matrix, coefficients)
negatives = (scores <= 0).astype(int) * (-1)  # expected count: 27946
# Cleaning the review column.
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    BUG FIX: str.translate(None, chars) is Python-2-only; this filter-join
    produces the same result on both Python 2 and 3.
    """
    import string
    punct = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punct)

products['review_clean'] = products['review'].apply(remove_punctuation)

# One count column per important word: how often the word occurs per review.
# w=word binds the loop variable eagerly (defensive against late binding).
for word in important_words:
    products[word] = products['review_clean'].apply(
        lambda s, w=word: s.split().count(w))

# Number of reviews containing the word 'perfect' (expected: 2955).
sum(products['perfect'] > 0)

feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
feature_matrix.shape  # 193 word features (+ intercept column -> 194)

# Fit logistic-regression coefficients by gradient ascent.
coefficients = logistic_regression(feature_matrix, sentiment,
                                   initial_coefficients=np.zeros(194),
                                   step_size=1e-7, max_iter=301)

# Score each review; classify by sign of the score.
scores = np.dot(feature_matrix, coefficients)
negatives = (scores <= 0).astype(int) * (-1)  # expected: 27946
positives = (scores > 0).astype(int)          # expected: 25126
predictions = negatives + positives           # -1 / +1 class labels
] target = 'safe_loans' loans = loans[features + [target]] #train_idx=ps.read_json('module-5-assignment-1-train-idx.json') with open('C:\Users\Isaac\Course 3/module-5-assignment-1-train-idx.json', 'r') as f: train_idx = json.load(f) #test_idx=ps.read_json('module-5-assignment-1-test-idx.json') with open('C:\Users\Isaac\Course 3/module-5-assignment-1-validation-idx.json', 'r') as f: validation_idx = json.load(f) train_data = loans.iloc[train_idx] validation_data = loans.iloc[validation_idx] train_matrix,train_output=get_numpy_data(train_data,features,target) validation_matrix,validation_output=get_numpy_data(validation_data,features,target) safe_loans_raw = loans[loans[target] == +1] risky_loans_raw = loans[loans[target] == -1] print "Number of safe loans : %s" % len(safe_loans_raw) print "Number of risky loans : %s" % len(risky_loans_raw) percentage = len(risky_loans_raw)/float(len(safe_loans_raw)) risky_loans = risky_loans_raw safe_loans = safe_loans_raw.sample(percentage, seed=1) tree=sklearn.tree.DecisionTreeClassifier(max_depth=6) decision_tree_model=tree.fit(train_matrix,train_output)