def build_model(congress): majority_party = load_majority_party(congress) committee_membership = load_committee_membership(congress) indexed_success_text = load_indexed_success_text() lobbying_data = None #load_lobbying_data(congress) # universe BILLS = Bill.objects.filter(congress=congress).prefetch_related() # compute the most frequent first few words of bills. slurp in all of the titles, # record the counts of all of the prefixes of the titles, and then take the top few, # excluding ones that are prefixes of another popular prefix. title_counts = {} for bill in BILLS: title = bill.title_no_number if title.startswith("Providing for consideration of"): continue # hack, add later for nwords in xrange(4, 10): prefix = " ".join(title.split(" ")[0:nwords]) title_counts[prefix] = title_counts.get(prefix, 0) + 1 title_counts = sorted(title_counts.items(), key=lambda kv: kv[1], reverse=True) pop_title_prefixes = list() for t, c in title_counts: seen = False for tt in list(pop_title_prefixes): if tt.startswith(t + " "): seen = True elif t.startswith(tt + " "): pop_title_prefixes.remove(tt) if seen: continue pop_title_prefixes.append(t) if len(pop_title_prefixes) == 40: break pop_title_prefixes.append( "A joint resolution proposing an amendment to the Constitution") pop_title_prefixes.append("Providing for consideration of") # We create separate models for bills by the bill type (bill, joint resolution, # concurrent resolution, simple resolution) and by whether the bill's status is # introduced or has been reported or more. # # Once a bill has been reported by committee it's chances of success are # of course much higher, since the bills that have not been reported by committee # in historical data are necessarily failed bills. Also the models change # substantially. 
MODEL = dict() for bill_type, model_type in itertools.product(bill_type_map.keys(), (0, 1)): # GET LIST OF BILLS bills = BILLS.filter(bill_type__in=bill_type_map[bill_type]) if model_type == 1: # In model 0, we scan across all bills, because all bills were # in the introduced status at one point. If we filter it to bills whose # current status is introduced, obviously they will all have been # failed bills, which defeats the purpose. In model 1, we # only look at bills that have at least gotten reported so that we can see # of reported bills which make it to success. bills = bills.exclude(current_status=BillStatus.introduced) print bill_type, model_type total = bills.count() if model_type == 0: # for the introduced model, success is getting out of committee total_success = bills.exclude( current_status=BillStatus.introduced).count() else: # for the reported model, success is being enacted (or whatever final status as appropriate for the bill type) total_success = bills.filter( current_status__in=BillStatus.final_status_passed).count() print "\toverall", int(round(100.0 * total_success / total)), "%; N=", total # GET REGRESSION MATRIX INFORMATION # Build a list of sets, one for each bill, containing the binary # factors that apply to the bill. Build a corresponding list of # floats (either 1.0, 0.0) indicating whether the bill was successful. # # Also remember for each binary factor the total count of bills # it applied to and the count of those that were successful. # # And also remember for each binary factor, the short descriptive # text for the factor. factor_success_rate = {} regression_outcomes = [] regression_predictors = [] factor_descriptions = {} #bills = bills[0:100] # REMOVEME for bill in bills: # What's the measured outcome for this bill? Check if the bill # ended in a success state. Allow floating-point values! success = is_success(bill, model_type, indexed_success_text[bill_type]) # Get the binary factors that apply to this bill. 
factors = get_bill_factors(bill, pop_title_prefixes, committee_membership, majority_party, lobbying_data) # maintain a simple list of success percent rates for each factor individually for key, descr, general_descr in factors: if not key in factor_success_rate: factor_success_rate[key] = [0, 0 ] # count of total, successful factor_success_rate[key][0] += 1 factor_success_rate[key][1] += success factor_descriptions[key] = general_descr # build data for a regression regression_outcomes.append(success) regression_predictors.append(set( f[0] for f in factors)) # extract just the key from the (key, descr) tuple # FIRST PASS SIGNIFICANCE CHECK # Reduce the complexity of the regression model by filtering out # factors that, when considered independently, don't have a success # rate that appears to differ from the population success rate. factor_binomial_sig = dict() for key, bill_counts in factor_success_rate.items(): # If there were very few bills with this factor, do not include it in the model. if bill_counts[0] < 15: continue # Create a binomial distribution with a sample size the same as # the number of bills with this factor, and with a probability # of heads equal to the population success rate. distr = scipy.stats.binom(bill_counts[0], float(total_success) / float(total)) # What is the possibility that we would see as many or as few # successes as we do (i.e. two tailed). pless = distr.cdf(bill_counts[1]) # as few == P(count <= observed) pmore = 1.0 - (distr.cdf(bill_counts[1] - 1) if bill_counts[1] > 0 else 0.0) # as many == P(count >= observed) p = min(pless, pmore) if p < .05: factor_binomial_sig[key] = p # LOGISTIC REGRESSION for trial in xrange(2): regression_predictors_map = None regression_beta = None if len(factor_binomial_sig) > 0: # Assign consecutive indices to the remaining factors. regression_predictors_map = dict( reversed(e) for e in enumerate(factor_binomial_sig)) # Build a binary matrix indicating which bills have which factors. 
regression_predictors_2 = [[] for f in regression_predictors_map] for factors in regression_predictors: for fname, findex in regression_predictors_map.items(): regression_predictors_2[findex].append( 1.0 if fname in factors else 0.0) regression_predictors_2 = numpy.array(regression_predictors_2) regression_outcomes = numpy.array(regression_outcomes) # Perform regression. regression_beta, J_bar, l = logistic_regression( regression_predictors_2, regression_outcomes) # Remove factors that are within 1.75 standard error from zero, # and then re-run the regression. if trial == 0: # Get the standard errors (the logistic_regression module # says to do it this way). from numpy import sqrt, diag, abs, median from numpy.linalg import inv try: stderrs = sqrt(diag( inv(J_bar))) # [intercept, beta1, beta2, ...] except numpy.linalg.linalg.LinAlgError as e: print "\t", e break # The standard errors are coming back wacky large for # the factors with VERY large beta. Special-case those. for fname, findex in regression_predictors_map.items(): beta = regression_beta[findex + 1] stderr = stderrs[findex + 1] if abs(beta / stderr) < 1.75 and abs(beta) < 5.0: # This factor's effect is small/non-significant, # so remove it from factor_binomial_sig so that on # next iteration it is excluded from regression. del factor_binomial_sig[fname] # Generate the model for output. 
model = dict() MODEL[(bill_type, model_type == 0)] = model if model_type == 0: model["success_name"] = "sent out of committee to the floor" else: if bill_type == "bill": model["success_name"] = "enacted" elif bill_type == "jr": model["success_name"] = "enacted or passed" else: model["success_name"] = "agreed to" model["count"] = total model["success_rate"] = 100.0 * total_success / total model["bill_type"] = bill_type model["bill_type_descr"] = bill_type_names[bill_type] model["is_introduced_model"] = (model_type == 0) model["regression_predictors_map"] = regression_predictors_map model["regression_beta"] = list( regression_beta) if regression_beta != None else None model_factors = dict() model["factors"] = model_factors for key, bill_counts in factor_success_rate.items(): if key not in factor_binomial_sig: continue print "\t" + key, \ int(round(100.0*bill_counts[1]/bill_counts[0])), "%;", \ "N=", bill_counts[0], \ "p<", int(round(100*factor_binomial_sig[key])), \ "B=", regression_beta[regression_predictors_map[key]+1] model_factors[key] = dict() model_factors[key]["count"] = bill_counts[0] model_factors[key][ "success_rate"] = 100.0 * bill_counts[1] / bill_counts[0] model_factors[key]["regression_beta"] = regression_beta[ regression_predictors_map[key] + 1] model_factors[key]["description"] = factor_descriptions[key] with open("bill/prognosis_model.py", "w") as modelfile: modelfile.write( "# this file was automatically generated by prognosis.py\n") modelfile.write("congress = %d\n" % congress) from pprint import pprint modelfile.write("pop_title_prefixes = ") pprint(pop_title_prefixes, modelfile) modelfile.write("factors = ") pprint(MODEL, modelfile)
def run():
    """Train a logistic regression on the training CSV and export test predictions.

    The script is self-contained and proceeds in these steps:

    0. Set the parameters (seed, number of folds, candidate polynomial
       degrees, candidate gammas, iteration budget).
    1. Load the training data and map the labels through ``(y + 1) / 2``.
    2. Pick the polynomial degree and gamma. The cross-validation call is
       currently commented out, so the first entry of each candidate list is
       used.
    3. Clean the training features (``count_NaN``, ``sanitize_NaN``,
       ``standardize``), expand them with ``build_poly`` and fit the model.
    4. Load the test data, apply the same pipeline re-using the median, mean
       and standard deviation obtained from the training set, predict the
       labels and write the submission CSV.
    """
    # 0. DEFINE PARAMETERS FOR OUR RUN
    seed = 1
    k_fold = 4
    max_iters = 2000
    # not possible yet to run polynomial degrees at the same time.
    degrees = np.array([2])
    gammas = [1e-5, 1e-6, 1e-7]

    # 1. LOAD THE DATA
    print('LOADING THE DATA: ', end=" ")
    DATA_TRAIN_PATH = '../data/train.csv'
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    y = (y+1)/2
    print('DONE')

    # 2. RUN CROSS VALIDATION TO GET BEST gamma
    print('CROSS VALIDATION')
    # degree, gamma, error = cross_validation(y,tX,degrees,gammas,max_iters,k_fold,seed)
    degree, gamma = degrees[0], gammas[0]

    # 3. TRAIN THE MODEL
    # Clean the input first, remembering the statistics needed for the test set.
    train_counted = count_NaN(tX)
    train_clean, median_tr = sanitize_NaN(train_counted)
    train_scaled, mean_tr, std_tr = standardize(train_clean)
    tX = build_poly(train_scaled, degree)
    weights = logistic_regression(y, tX, gamma, max_iters)
    print('Weights on whole set\n', weights)

    # 4. TEST THE MODEL AND EXPORT THE RESULTS
    DATA_TEST_PATH = '../data/test.csv'
    print('IMPORTING TESTING DATA :', end=" ")
    y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    print('DONE')

    # Same pipeline as for training, driven by the training statistics.
    test_counted = count_NaN(tX_test)
    test_clean, median_vec = sanitize_NaN(test_counted, median_tr)
    test_scaled, mean_tr, std_tr = standardize(test_clean, mean_tr, std_tr)
    tX_test_sorted = build_poly(test_scaled, degree)

    OUTPUT_PATH = 'results/output_sanitized_normalization_test_deg_2_gamma_8.csv'
    print('EXPORTING TESTING DATA WITH PREDICTIONS :', end=" ")
    y_pred = predict_labels(np.array(weights), np.array(tX_test_sorted))
    create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
    print('DONE')
def build_model(congress): majority_party = load_majority_party(congress) committee_membership = load_committee_membership(congress) lobbying_data = None #load_lobbying_data(congress) # universe BILLS = Bill.objects.filter(congress=congress).prefetch_related() # compute the most frequent first few words of bills. slurp in all of the titles, # record the counts of all of the prefixes of the titles, and then take the top few, # excluding ones that are prefixes of another popular prefix. title_counts = { } for bill in BILLS: title = bill.title_no_number if title.startswith("Providing for consideration of"): continue # hack, add later for nwords in xrange(4, 10): prefix = " ".join(title.split(" ")[0:nwords]) title_counts[prefix] = title_counts.get(prefix, 0) + 1 title_counts = sorted(title_counts.items(), key = lambda kv : kv[1], reverse=True) pop_title_prefixes = list() for t, c in title_counts: seen = False for tt in list(pop_title_prefixes): if tt.startswith(t + " "): seen = True elif t.startswith(tt + " "): pop_title_prefixes.remove(tt) if seen: continue pop_title_prefixes.append(t) if len(pop_title_prefixes) == 40: break pop_title_prefixes.append("A joint resolution proposing an amendment to the Constitution") pop_title_prefixes.append("Providing for consideration of") # We create separate models for bills by the bill type (H.R., S., H.Res., etc.) # and by whether the bill is introduced/referred or has been reported or more. # Once a bill has been reported by committee it's chances of success are # of course much higher, since the bills that have not been reported by committee # in historical data are necessarily failed bills. Also the models change # substantially. 
MODEL = dict() for (bill_type, bill_type_descr), model_type in itertools.product(BillType, (0, 1)): #if bill_type != BillType.house_bill: continue #if bill_type not in (BillType.house_joint_resolution, BillType.senate_joint_resolution): continue bills = BILLS.filter(bill_type=bill_type) if model_type == 1: # In model 0, we scan across all bills, because all bills were # in the introduced/referred status at one point. If we filter it to bills whose # current status is introduced/referred, obviously they will all have been # failed bills, which defeats the purpose. In model 1, we # only look at bills that have at least gotten reported so that we can see # of reported bills which make it to success. bills = bills.exclude(current_status__in=BillStatus.introduced_statuses) print bill_type_descr, model_type total = bills.count() if model_type == 0: # for the introduced model, success is getting out of committee total_success = bills.exclude(current_status__in=BillStatus.introduced_statuses).count() else: # for the reported model, success is being enacted (or whatever final status as appropriate for the bill type) total_success = bills.filter(current_status__in=BillStatus.final_status_passed).count() print "\toverall", int(round(100.0*total_success/total)), "%; N=", total sorted_bills = { } regression_outcomes = [ ] regression_predictors = [ ] factor_descriptions = { } for bill in bills: #import random # speed this up? #if random.random() < .7: continue # What's the measured binary outcome for this bill? Check if the bill # ended in a success state. success = is_success(bill, model_type) # Get the binary factors that apply to this bill. 
factors = get_bill_factors(bill, pop_title_prefixes, committee_membership, majority_party, lobbying_data) # maintain a simple list of success percent rates for each factor individually for key, descr, general_descr in factors: if not key in sorted_bills: sorted_bills[key] = [0, 0] # count of total, successful sorted_bills[key][0] += 1 if success: sorted_bills[key][1] += 1 factor_descriptions[key] = general_descr # build data for a regression regression_outcomes.append(1.0 if success else 0.0) regression_predictors.append(set( f[0] for f in factors )) # extract just the key from the (key, descr) tuple # check which factors were useful significant_factors = dict() for key, bill_counts in sorted_bills.items(): # create a binomial distribution based on the overall pass rate for this # type of bill (H.R., H.Res., etc.) and a draw the number of bills # within this subset (key) that are passed, and see if it is statistically # different from the overall count. only include statistical differences. 
if bill_counts[0] < 15: continue distr = scipy.stats.binom(bill_counts[0], float(total_success)/float(total)) pless = distr.cdf(bill_counts[1]) pmore = 1.0-distr.cdf(bill_counts[1]) if pless < .015 or pmore < .015 or (total < 100 and (pless < .05 or pmore < .05)): # only show statistically significant differences from the group mean significant_factors[key] = (pless, pmore) # run a logistic regression regression_predictors_map = None regression_beta = None if len(significant_factors) > 0: regression_predictors_map = dict(reversed(e) for e in enumerate(significant_factors)) regression_predictors_2 = [ [] for f in regression_predictors_map ] for factors in regression_predictors: for fname, findex in regression_predictors_map.items(): regression_predictors_2[findex].append(1.0 if fname in factors else 0.0) regression_predictors_2 = numpy.array(regression_predictors_2) regression_outcomes = numpy.array(regression_outcomes) regression_beta, J_bar, l = logistic_regression(regression_predictors_2, regression_outcomes) # Generate the model for output. 
model = dict() MODEL[(bill_type,model_type == 0)] = model if model_type == 0: model["success_name"] = "sent out of committee to the floor" else: if bill_type in (BillType.senate_bill, BillType.house_bill): model["success_name"] = "enacted" elif bill_type in (BillType.senate_joint_resolution, BillType.house_joint_resolution): model["success_name"] = "enacted or passed" else: model["success_name"] = "agreed to" model["count"] = total model["success_rate"] = 100.0*total_success/total model["regression_predictors_map"] = regression_predictors_map model["regression_beta"] = list(regression_beta) if regression_beta != None else None model_factors = dict() model["factors"] = model_factors for key, bill_counts in sorted_bills.items(): if key not in significant_factors: continue pless, pmore = significant_factors[key] print "\t" + key, int(round(100.0*bill_counts[1]/bill_counts[0])), "%; N=", bill_counts[0], "p<", int(round(100*pless)), int(round(100*pmore)), "B=", regression_beta[regression_predictors_map[key]+1] model_factors[key] = dict() model_factors[key]["count"] = bill_counts[0] model_factors[key]["success_rate"] = 100.0*bill_counts[1]/bill_counts[0] model_factors[key]["regression_beta"] = regression_beta[regression_predictors_map[key]+1] model_factors[key]["description"] = factor_descriptions[key] with open("bill/prognosis_model.py", "w") as modelfile: modelfile.write("# this file was automatically generated by prognosis.py\n") modelfile.write("congress = %d\n" % congress) from pprint import pprint modelfile.write("pop_title_prefixes = ") pprint(pop_title_prefixes, modelfile) modelfile.write("factors = ") pprint(MODEL, modelfile)
def build_model(congress): majority_party = load_majority_party(congress) committee_membership = load_committee_membership(congress) indexed_success_text = load_indexed_success_text() lobbying_data = None #load_lobbying_data(congress) # universe BILLS = Bill.objects.filter(congress=congress).prefetch_related() # compute the most frequent first few words of bills. slurp in all of the titles, # record the counts of all of the prefixes of the titles, and then take the top few, # excluding ones that are prefixes of another popular prefix. title_counts = { } for bill in BILLS: title = bill.title_no_number if title.startswith("Providing for consideration of"): continue # hack, add later for nwords in xrange(4, 10): prefix = " ".join(title.split(" ")[0:nwords]) title_counts[prefix] = title_counts.get(prefix, 0) + 1 title_counts = sorted(title_counts.items(), key = lambda kv : kv[1], reverse=True) pop_title_prefixes = list() for t, c in title_counts: seen = False for tt in list(pop_title_prefixes): if tt.startswith(t + " "): seen = True elif t.startswith(tt + " "): pop_title_prefixes.remove(tt) if seen: continue pop_title_prefixes.append(t) if len(pop_title_prefixes) == 40: break pop_title_prefixes.append("A joint resolution proposing an amendment to the Constitution") pop_title_prefixes.append("Providing for consideration of") # We create separate models for bills by the bill type (bill, joint resolution, # concurrent resolution, simple resolution) and by whether the bill's status is # introduced/referred or has been reported or more. # # Once a bill has been reported by committee it's chances of success are # of course much higher, since the bills that have not been reported by committee # in historical data are necessarily failed bills. Also the models change # substantially. 
MODEL = dict() for bill_type, model_type in itertools.product(bill_type_map.keys(), (0, 1)): # GET LIST OF BILLS bills = BILLS.filter(bill_type__in=bill_type_map[bill_type]) if model_type == 1: # In model 0, we scan across all bills, because all bills were # in the introduced/referred status at one point. If we filter it to bills whose # current status is introduced/referred, obviously they will all have been # failed bills, which defeats the purpose. In model 1, we # only look at bills that have at least gotten reported so that we can see # of reported bills which make it to success. bills = bills.exclude(current_status__in=BillStatus.introduced_statuses) print bill_type, model_type total = bills.count() if model_type == 0: # for the introduced model, success is getting out of committee total_success = bills.exclude(current_status__in=BillStatus.introduced_statuses).count() else: # for the reported model, success is being enacted (or whatever final status as appropriate for the bill type) total_success = bills.filter(current_status__in=BillStatus.final_status_passed).count() print "\toverall", int(round(100.0*total_success/total)), "%; N=", total # GET REGRESSION MATRIX INFORMATION # Build a list of sets, one for each bill, containing the binary # factors that apply to the bill. Build a corresponding list of # floats (either 1.0, 0.0) indicating whether the bill was successful. # # Also remember for each binary factor the total count of bills # it applied to and the count of those that were successful. # # And also remember for each binary factor, the short descriptive # text for the factor. factor_success_rate = { } regression_outcomes = [ ] regression_predictors = [ ] factor_descriptions = { } #bills = bills[0:100] # REMOVEME for bill in bills: # What's the measured outcome for this bill? Check if the bill # ended in a success state. Allow floating-point values! 
success = is_success(bill, model_type, indexed_success_text[bill_type]) # Get the binary factors that apply to this bill. factors = get_bill_factors(bill, pop_title_prefixes, committee_membership, majority_party, lobbying_data) # maintain a simple list of success percent rates for each factor individually for key, descr, general_descr in factors: if not key in factor_success_rate: factor_success_rate[key] = [0, 0] # count of total, successful factor_success_rate[key][0] += 1 factor_success_rate[key][1] += success factor_descriptions[key] = general_descr # build data for a regression regression_outcomes.append(success) regression_predictors.append(set( f[0] for f in factors )) # extract just the key from the (key, descr) tuple # FIRST PASS SIGNIFICANCE CHECK # Reduce the complexity of the regression model by filtering out # factors that, when considered independently, don't have a success # rate that appears to differ from the population success rate. factor_binomial_sig = dict() for key, bill_counts in factor_success_rate.items(): # If there were very few bills with this factor, do not include it in the model. if bill_counts[0] < 15: continue # Create a binomial distribution with a sample size the same as # the number of bills with this factor, and with a probability # of heads equal to the population success rate. distr = scipy.stats.binom(bill_counts[0], float(total_success)/float(total)) # What is the possibility that we would see as many or as few # successes as we do (i.e. two tailed). pless = distr.cdf(bill_counts[1]) # as few == P(count <= observed) pmore = 1.0-(distr.cdf(bill_counts[1]-1) if bill_counts[1] > 0 else 0.0) # as many == P(count >= observed) p = min(pless, pmore) if p < .05: factor_binomial_sig[key] = p # LOGISTIC REGRESSION for trial in xrange(2): regression_predictors_map = None regression_beta = None if len(factor_binomial_sig) > 0: # Assign consecutive indices to the remaining factors. 
regression_predictors_map = dict(reversed(e) for e in enumerate(factor_binomial_sig)) # Build a binary matrix indicating which bills have which factors. regression_predictors_2 = [ [] for f in regression_predictors_map ] for factors in regression_predictors: for fname, findex in regression_predictors_map.items(): regression_predictors_2[findex].append(1.0 if fname in factors else 0.0) regression_predictors_2 = numpy.array(regression_predictors_2) regression_outcomes = numpy.array(regression_outcomes) # Perform regression. regression_beta, J_bar, l = logistic_regression(regression_predictors_2, regression_outcomes) # Remove factors that are within 1.75 standard error from zero, # and then re-run the regression. if trial == 0: # Get the standard errors (the logistic_regression module # says to do it this way). from numpy import sqrt, diag, abs, median from numpy.linalg import inv try: stderrs = sqrt(diag(inv(J_bar))) # [intercept, beta1, beta2, ...] except numpy.linalg.linalg.LinAlgError as e: print "\t", e break # The standard errors are coming back wacky large for # the factors with VERY large beta. Special-case those. for fname, findex in regression_predictors_map.items(): beta = regression_beta[findex+1] stderr = stderrs[findex+1] if abs(beta/stderr) < 1.75 and abs(beta) < 5.0: # This factor's effect is small/non-significant, # so remove it from factor_binomial_sig so that on # next iteration it is excluded from regression. del factor_binomial_sig[fname] # Generate the model for output. 
model = dict() MODEL[(bill_type,model_type == 0)] = model if model_type == 0: model["success_name"] = "sent out of committee to the floor" else: if bill_type == "bill": model["success_name"] = "enacted" elif bill_type == "jr": model["success_name"] = "enacted or passed" else: model["success_name"] = "agreed to" model["count"] = total model["success_rate"] = 100.0*total_success/total model["bill_type"] = bill_type model["bill_type_descr"] = bill_type_names[bill_type] model["is_introduced_model"] = (model_type == 0) model["regression_predictors_map"] = regression_predictors_map model["regression_beta"] = list(regression_beta) if regression_beta != None else None model_factors = dict() model["factors"] = model_factors for key, bill_counts in factor_success_rate.items(): if key not in factor_binomial_sig: continue print "\t" + key, \ int(round(100.0*bill_counts[1]/bill_counts[0])), "%;", \ "N=", bill_counts[0], \ "p<", int(round(100*factor_binomial_sig[key])), \ "B=", regression_beta[regression_predictors_map[key]+1] model_factors[key] = dict() model_factors[key]["count"] = bill_counts[0] model_factors[key]["success_rate"] = 100.0*bill_counts[1]/bill_counts[0] model_factors[key]["regression_beta"] = regression_beta[regression_predictors_map[key]+1] model_factors[key]["description"] = factor_descriptions[key] with open("bill/prognosis_model.py", "w") as modelfile: modelfile.write("# this file was automatically generated by prognosis.py\n") modelfile.write("congress = %d\n" % congress) from pprint import pprint modelfile.write("pop_title_prefixes = ") pprint(pop_title_prefixes, modelfile) modelfile.write("factors = ") pprint(MODEL, modelfile)
print("Split data: DONE") #Standardize the training data training_tx, training_tx_mean, training_tx_std = standardize_training( training_tx) #Remove outliers and -999 from the standardized training dataset training_tx, training_y = remove_outliers(training_tx, training_y, 2.3) #Use polynomial terms training_tx = get_polynomial(training_tx, 2) #Get optimal weights print("Compute least squares: START") maxiter, stepsize, lambda_ = 500, 1, 0 weights = logistic_regression(training_y, training_tx, 1, stepsize, maxiter, lambda_) print("Compute least squares: DONE") # print("Weights are:",weights) #Standardize the test data using training mean and std test_tx = standardize_test(test_tx, training_tx_mean, training_tx_std) #Use polynomial terms test_tx = get_polynomial(test_tx, 2) # Perform prediction print("Perform prediction: START") y_pred = predict_labels(weights, test_tx) print("Perform prediction: DONE") # Verify predictions with in house test data
############### Name: Shubham Pareek ############
############### UBID: spareek ############
from logistic_regression import *
from linear_regression import *
from neural_network import *
from preprocessing import *

# Build the four feature/target variants: two data sets ('hod', 'gsc'), each
# with two ways of combining the features ('concatenate', 'subtract').
(X1, y1), (X2, y2), (X3, y3), (X4, y4) = [
    get_feature_matrix(data=source, method=how)
    for source in ('hod', 'gsc')
    for how in ('concatenate', 'subtract')
]

# Run every model on every variant: all logistic regressions first, then all
# linear regressions, then all neural networks.
for fit in (logistic_regression, linear_regression, neural_network):
    for features, targets in ((X1, y1), (X2, y2), (X3, y3), (X4, y4)):
        fit(features, targets)
def exp_three_models():
    """Run repeated hold-out experiments with one logistic model per jet group.

    For every regularisation value in ``list_lambda`` the full training set
    is split ``n_trials`` times into a training and a held-out part. Both
    parts are divided into eight groups by ``split_data_by_jet_num_feature``;
    for each group the training features are standardized, outliers are
    removed, the data is re-standardized and expanded to polynomial terms,
    and a logistic regression is fitted. The held-out part of the group is
    transformed with the training statistics and predicted.

    The accuracy over all groups and the norm of each group's weight vector
    are averaged over the trials, printed, and appended as one tab-separated
    line per lambda to ``lambda_check.txt``.
    """
    training_data_path = "train.csv"

    train_ratio = 0.9
    n_trials = 10
    # Number of rows assigned to the training part; assumes the data set has
    # 250000 rows -- TODO confirm against train.csv.
    limit = int(train_ratio * 250000)

    sd_limit_0 = 3.0
    sd_limit_1 = 2.75
    sd_limit_2 = 2.75
    sd_limit_3 = 2.6
    # Outlier cut-off per group; the four limits are repeated for groups 4-7.
    list_sd_limit = [
        sd_limit_0, sd_limit_1, sd_limit_2, sd_limit_3,
        sd_limit_0, sd_limit_1, sd_limit_2, sd_limit_3,
    ]
    list_poly = [2, 2, 2, 2, 2, 2, 2, 2]  # Polynomial degree per group
    list_lambda = [1.0]  # List of lambdas for trial
    n_groups = len(list_sd_limit)

    # Float array: the running average of the weight norms is fractional.
    weights_average = np.zeros(n_groups)

    maxiter, stepsize, is_newton = 20000, 1e-05, 0

    print('isNewton = ', is_newton)
    print('stepsize = ', stepsize)

    # The context manager closes the results file even if a trial fails.
    with open('lambda_check.txt', 'w') as f:
        f.write('lambda' + '\t' + 'average accuracy' + '\t' + 'average weights' + '\n')

        # Read training data
        training_y_full, training_tx_full, training_ids = load_csv_data(
            training_data_path, sub_sample=False)

        for lambda_ in list_lambda:
            accuracy_average = 0
            print('lambda = ', lambda_)

            for i_trial in range(n_trials):
                # Split the data into test and training
                training_tx, training_y, test_tx, test_y = split_data(
                    training_tx_full, training_y_full, train_ratio, i_trial)

                # Split the training data by jet numbers
                list_training_tx, list_training_y, list_training_ids = split_data_by_jet_num_feature(
                    training_tx, training_y, training_ids[:limit])

                # Split test data into various jet numbers
                list_test_tx, list_test_y, list_test_ids = split_data_by_jet_num_feature(
                    test_tx, test_y, training_ids[:250000 - limit])

                # Norm of the fitted weights for each group. This must be a
                # float array: assigning a norm into an integer array would
                # silently truncate it.
                list_weight_norms = np.zeros(n_groups)

                # Predictions and labels of every group, joined after the loop.
                pred_parts = []
                label_parts = []

                # Loop through jet numbers
                for i in range(n_groups):
                    training_tx_i, training_y_i = list_training_tx[i], list_training_y[i]
                    test_tx_i, test_y_i = list_test_tx[i], list_test_y[i]
                    sd_limit = list_sd_limit[i]

                    # ***************** TRAINING *****************

                    # Standardize the training data
                    training_tx_i, training_tx_i_mean, training_tx_i_std = standardize_training(
                        training_tx_i)

                    # Remove outliers from the standardized training dataset
                    training_tx_i, training_y_i, training_tx_i_out, training_y_i_out = remove_outliers(
                        training_tx_i, training_y_i, sd_limit)

                    # Redo standardization
                    training_tx_i, training_tx_i_mean, training_tx_i_std = redo_standardization(
                        training_tx_i, training_tx_i_mean, training_tx_i_std)

                    # Create polynomial expansions
                    training_tx_i = get_polynomial(training_tx_i, list_poly[i])

                    # Get weights
                    weights_i = logistic_regression(
                        training_y_i, training_tx_i, is_newton, stepsize, maxiter, lambda_)

                    # ***************** PREDICTION *****************

                    # Standardize the test data using training mean and std
                    test_tx_i_standardized = standardize_test(
                        test_tx_i, training_tx_i_mean, training_tx_i_std)

                    # Create polynomial expansion for the test data
                    test_tx_i_standardized = get_polynomial(
                        test_tx_i_standardized, list_poly[i])

                    # Get predictions for the current jet
                    y_pred_i = predict_labels(weights_i, test_tx_i_standardized)
                    # The per-group accuracy is not used below; the call is
                    # kept in case verify_prediction reports it -- TODO confirm.
                    accuracy = verify_prediction(y_pred_i, test_y_i)

                    # Collate data for all jets
                    pred_parts.append(y_pred_i)
                    label_parts.append(test_y_i)

                    list_weight_norms[i] = np.linalg.norm(weights_i)

                y_pred_all = np.concatenate(pred_parts)
                test_y_all = np.concatenate(label_parts)

                accuracy = verify_prediction(y_pred_all, test_y_all)
                print('Accuracy Cross is:', accuracy)

                # Running means over the trials seen so far.
                accuracy_average = (accuracy + (accuracy_average * i_trial)) / (i_trial + 1)
                weights_average = (list_weight_norms + (weights_average * i_trial)) / (i_trial + 1)

            print('Accuracy Average is:', accuracy_average)
            print('Weights Average Norms are:', weights_average)
            f.write(
                str(lambda_) + '\t' + str(accuracy_average) + '\t' +
                str(weights_average) + '\n')
def exp_three_models():
    """Fit one logistic-regression model per jet group, validate it, and write a submission.

    Workflow:

    1. Load ``train.csv``.
    2. For every trial, draw a hold-out split with ``split_data`` and split
       both parts into three groups (jet 0, jet 1, "others") with
       ``split_data_by_jet_num``.
    3. Per group: standardize, drop outliers, expand to degree-2 polynomial
       features and fit ``logistic_regression``.
    4. Print the hold-out accuracy per group and over all groups.
    5. Apply the models of the *last* trial to ``test.csv`` and write the
       predictions to ``output.csv``.

    Rows of the jet-0 group that ``remove_outliers`` rejects are not passed to
    the model; they are predicted as ``-1``.

    Note: the submitted models are fitted on the ``train_ratio`` share of the
    training data, not on the full training set.
    """
    # ---- input params ------------------------------------------------------
    training_data_path = "train.csv"
    test_data_path = "test.csv"
    output_path = "output.csv"
    train_ratio = 0.9
    n_trials = 1
    sd_limit_0 = 2.3
    sd_limit_1 = 2.75
    sd_limit_others = 2.3
    poly_degree = 2
    maxiter, stepsize = 50000, 5e-01
    lambda_ = 0

    #Read training data
    print("Read training data: START")
    training_y_full, training_tx_full, training_ids = load_csv_data(
        training_data_path, sub_sample=False)
    print("Read training data: DONE")

    for i in range(n_trials):
        # Split the data into a training part and a hold-out part.  Without
        # this split the validation code below would read test_tx/test_y
        # before they are ever assigned.
        print("Split data: START")
        training_tx, training_y, test_tx, test_y = split_data(
            training_tx_full, training_y_full, train_ratio, i)
        print("Split data: DONE")

        # The ids are only placeholders of matching length here: the id
        # arrays returned for the two parts are never used for validation.
        print("Split data based on jet number: START")
        training_tx_0, training_y_0, _, training_tx_1, training_y_1, _, \
            training_tx_others, training_y_others, _ \
            = split_data_by_jet_num(training_tx, training_y,
                                    training_ids[:len(training_y)])
        print("Split data based on jet number: DONE")

        #Standardize the training data
        training_tx_0, training_tx_0_mean, training_tx_0_std = standardize_training(
            training_tx_0)
        training_tx_1, training_tx_1_mean, training_tx_1_std = standardize_training(
            training_tx_1)
        training_tx_others, training_tx_others_mean, training_tx_others_std = standardize_training(
            training_tx_others)

        #Remove outliers and -999 from the standardized training dataset
        training_tx_0, training_y_0, _, _ = remove_outliers(
            training_tx_0, training_y_0, sd_limit_0)
        training_tx_1, training_y_1, _, _ = remove_outliers(
            training_tx_1, training_y_1, sd_limit_1)
        training_tx_others, training_y_others, _, _ = remove_outliers(
            training_tx_others, training_y_others, sd_limit_others)

        #Create polynomial expansions
        training_tx_0 = get_polynomial(training_tx_0, poly_degree)
        training_tx_1 = get_polynomial(training_tx_1, poly_degree)
        training_tx_others = get_polynomial(training_tx_others, poly_degree)

        #Get weights (third argument is the literal 1 used by the original code)
        weights_0 = logistic_regression(training_y_0, training_tx_0, 1,
                                        stepsize, maxiter, lambda_)
        weights_1 = logistic_regression(training_y_1, training_tx_1, 1,
                                        stepsize, maxiter, lambda_)
        weights_others = logistic_regression(training_y_others,
                                             training_tx_others, 1, stepsize,
                                             maxiter, lambda_)

        # ---- validation on the hold-out part -------------------------------
        test_tx_0, test_y_0, _, test_tx_1, test_y_1, _, \
            test_tx_others, test_y_others, _ \
            = split_data_by_jet_num(test_tx, test_y,
                                    training_ids[:len(test_y)])

        #Standardize the hold-out data using training mean and std
        test_tx_0 = standardize_test(test_tx_0, training_tx_0_mean,
                                     training_tx_0_std)
        test_tx_1 = standardize_test(test_tx_1, training_tx_1_mean,
                                     training_tx_1_std)
        test_tx_others = standardize_test(test_tx_others,
                                          training_tx_others_mean,
                                          training_tx_others_std)

        # Only the jet-0 group is filtered at prediction time; the rejected
        # rows are scored against a constant -1 prediction below.
        test_tx_0, test_y_0, _, test_y_0_out = remove_outliers(
            test_tx_0, test_y_0, sd_limit_0)

        test_tx_0 = get_polynomial(test_tx_0, poly_degree)
        test_tx_1 = get_polynomial(test_tx_1, poly_degree)
        test_tx_others = get_polynomial(test_tx_others, poly_degree)

        y_pred_0 = predict_labels(weights_0, test_tx_0)
        y_pred_1 = predict_labels(weights_1, test_tx_1)
        y_pred_others = predict_labels(weights_others, test_tx_others)
        y_pred_0_out = np.array([-1] * test_y_0_out.shape[0])

        accuracy = verify_prediction(y_pred_0, test_y_0)
        print('Accuracy 0 is:', accuracy)
        accuracy = verify_prediction(y_pred_1, test_y_1)
        print('Accuracy 1 is:', accuracy)
        accuracy = verify_prediction(y_pred_others, test_y_others)
        print('Accuracy Others is:', accuracy)

        # Predictions and labels are concatenated in the same group order so
        # that they stay aligned row by row.
        y_pred_all = np.concatenate(
            (y_pred_0, y_pred_1, y_pred_others, y_pred_0_out))
        test_y_all = np.concatenate(
            (test_y_0, test_y_1, test_y_others, test_y_0_out))
        accuracy = verify_prediction(y_pred_all, test_y_all)
        print('Accuracy Cross is:', accuracy)

    # ---- submission ----------------------------------------------------------
    #Read test data
    print("Read test data: START")
    test_y, test_tx, test_ids = load_csv_data(test_data_path)
    print("Read test data: DONE")

    test_tx_0, _, test_ids_0, test_tx_1, _, test_ids_1, \
        test_tx_others, _, test_ids_others \
        = split_data_by_jet_num(test_tx, test_y, test_ids)

    test_tx_0 = standardize_test(test_tx_0, training_tx_0_mean,
                                 training_tx_0_std)
    test_tx_1 = standardize_test(test_tx_1, training_tx_1_mean,
                                 training_tx_1_std)
    test_tx_others = standardize_test(test_tx_others, training_tx_others_mean,
                                      training_tx_others_std)

    # Route the ids (instead of the unused test labels) through
    # remove_outliers so that kept and rejected ids come back in exactly the
    # order of the kept and rejected rows.  Concatenating the unfiltered
    # test_ids_0 would pair ids with the wrong predictions.
    # NOTE(review): assumes remove_outliers filters its second argument
    # row-wise with the mask derived from the first one - confirm.
    test_tx_0, test_ids_0, _, test_ids_0_out = remove_outliers(
        test_tx_0, test_ids_0, sd_limit_0)

    #Use polynomial terms
    test_tx_0 = get_polynomial(test_tx_0, poly_degree)
    test_tx_1 = get_polynomial(test_tx_1, poly_degree)
    test_tx_others = get_polynomial(test_tx_others, poly_degree)

    # Perform prediction
    print("Perform prediction: START")
    y_pred_0 = predict_labels(weights_0, test_tx_0)
    y_pred_1 = predict_labels(weights_1, test_tx_1)
    y_pred_others = predict_labels(weights_others, test_tx_others)
    y_pred_0_out = np.array([-1] * test_ids_0_out.shape[0])
    print("Perform prediction: DONE")

    # Same group order for ids and predictions, rejected jet-0 rows last.
    test_ids_all = np.concatenate(
        (test_ids_0, test_ids_1, test_ids_others, test_ids_0_out))
    y_pred_all = np.concatenate(
        (y_pred_0, y_pred_1, y_pred_others, y_pred_0_out))

    # Restore ascending id order for the submission file.
    zipped_list = sorted(zip(test_ids_all, y_pred_all))
    test_ids_all, y_pred_all = zip(*zipped_list)
    test_ids_all = np.array(test_ids_all)
    y_pred_all = np.array(y_pred_all)

    # Create output file
    print("Write CSV output: START")
    create_csv_submission(test_ids_all, y_pred_all, output_path)
    print("Write CSV output: DONE")
# Demo script: generate a data set, train a logistic-regression model on it
# and display the classification result.
#
# Note: At some point I'm going to clean this code up.
#       Right now it still needs to be vectorized.
import matplotlib.pyplot as plt
from logistic_regression import *
from logistic_regression_util import *
import numpy as np
import random
import math
import sys

# Progress messages are written without a trailing newline and completed by
# the following print.  stdout is flushed explicitly so the message is visible
# while the step is still running.  print(...) with a single argument behaves
# the same on Python 2 and Python 3.

#-----------------------------------------------------------------------START - Generating Dataset
sys.stdout.write("Generating dataset...")
sys.stdout.flush()
data, X, Y = generateData(200)
print("complete!")
#-----------------------------------------------------------------------END - Generating Dataset

#-----------------------------------------------------------------------START - Train Model
sys.stdout.write("Training model...")
sys.stdout.flush()
model = logistic_regression()
model.train(X, Y)
print("complete!")
#-----------------------------------------------------------------------END - Train Model

#-----------------------------------------------------------------------START - Plot the results
sys.stdout.write("Displaying classification results...")
sys.stdout.flush()
displayClassification(plt, model, X, data)
print("complete!")
#-----------------------------------------------------------------------END - Plot the results
def exp_three_models():
    """Cross-validate per-jet logistic-regression models over repeated random splits.

    For each of ``n_trials`` trials the training set is split into a training
    and a hold-out part with ``split_data``.  Both parts are divided into four
    jet groups by ``split_data_by_jet_num_2``.  Per jet the training rows are
    standardized, stripped of outliers, re-standardized, expanded to
    polynomial features and fitted with ``logistic_regression``; the model is
    then scored on the hold-out rows of the same jet.

    The per-jet accuracy, the accuracy over all jets and the running average
    of the latter across trials are printed.  Nothing is returned or written
    to disk.
    """
    # ---- input params ------------------------------------------------------
    training_data_path = "train.csv"
    train_ratio = 0.8
    n_trials = 10
    # One outlier limit and one polynomial degree per jet number (0..3).
    list_sd_limit = [2.3, 2.5, 2.75, 2.6]
    list_poly = [2, 2, 2, 2]
    maxiter, stepsize, lambda_, is_newton = 200, 0.1, 1, 1

    #Read training data
    print("Read training data: START")
    training_y_full, training_tx_full, training_ids = load_csv_data(
        training_data_path, sub_sample=False)
    print("Read training data: DONE")

    accuracy_average = 0
    for i_trial in range(n_trials):
        #Split the data into test and training
        print("Split data: START")
        training_tx, training_y, test_tx, test_y = split_data(
            training_tx_full, training_y_full, train_ratio, i_trial)
        print("Split data: DONE")

        # The id arguments are placeholders of matching length (the returned
        # id lists are not used).  Their length is taken from the actual
        # split instead of a hard-coded data-set size of 250000 rows.
        print("Split data based on jet number (2 and 3 separate): START")
        list_training_tx, list_training_y, _ = split_data_by_jet_num_2(
            training_tx, training_y, training_ids[:len(training_y)])
        print("Split data based on jet number (2 and 3 separate): DONE")

        # Split test data into various jet numbers
        list_test_tx, list_test_y, _ = split_data_by_jet_num_2(
            test_tx, test_y, training_ids[:len(test_y)])

        predictions = []
        labels = []

        # Loop through jet numbers
        for i, (sd_limit, degree) in enumerate(zip(list_sd_limit, list_poly)):
            training_tx_i, training_y_i = list_training_tx[i], list_training_y[i]
            test_tx_i, test_y_i = list_test_tx[i], list_test_y[i]

            # ---- training --------------------------------------------------
            #Standardize the training data
            training_tx_i, training_tx_i_mean, training_tx_i_std = standardize_training(
                training_tx_i)

            #Remove outliers and -999 from the standardized training dataset
            training_tx_i, training_y_i, _, _ = remove_outliers(
                training_tx_i, training_y_i, sd_limit)

            #Redo standardization; the updated mean/std are reused for the
            #hold-out rows below
            training_tx_i, training_tx_i_mean, training_tx_i_std = redo_standardization(
                training_tx_i, training_tx_i_mean, training_tx_i_std)

            #Create polynomial expansions
            training_tx_i = get_polynomial(training_tx_i, degree)

            #Get weights
            weights_i = logistic_regression(training_y_i, training_tx_i,
                                            is_newton, stepsize, maxiter,
                                            lambda_)

            # ---- prediction ------------------------------------------------
            #Standardize the test data using training mean and std
            test_tx_i_standardized = standardize_test(
                test_tx_i, training_tx_i_mean, training_tx_i_std)

            #Create polynomial expansion for the test data
            test_tx_i_standardized = get_polynomial(test_tx_i_standardized,
                                                    degree)

            #Get predictions for the current jet
            y_pred_i = predict_labels(weights_i, test_tx_i_standardized)
            accuracy = verify_prediction(y_pred_i, test_y_i)
            # Label the score with the jet it belongs to (was always "0").
            print('Accuracy %d is:' % i, accuracy)

            #Collate data for all jets
            predictions.append(y_pred_i)
            labels.append(test_y_i)

        y_pred_all = np.concatenate(predictions)
        test_y_all = np.concatenate(labels)

        accuracy = verify_prediction(y_pred_all, test_y_all)
        print('Accuracy Cross is:', accuracy)

        # Incremental mean over the trials seen so far.
        accuracy_average = (accuracy +
                            (accuracy_average * i_trial)) / (i_trial + 1)
        print('Accuracy Average is:', accuracy_average)
def exp_three_models():
    """Train one logistic-regression model per jet/feature group and write a submission.

    Training and test data are loaded from ``train.csv`` and ``test.csv`` and
    each is divided into eight groups by ``split_data_by_jet_num_feature``.
    For every group the training rows are standardized, stripped of outliers,
    re-standardized, expanded to degree-2 polynomial features and fitted with
    ``logistic_regression``.  The resulting weights predict the labels of the
    matching test group.  All predictions are finally sorted by test id and
    written to ``output.csv`` via ``create_csv_submission``.
    """
    # ---- input params ------------------------------------------------------
    train_csv = "train.csv"
    test_csv = "test.csv"
    submission_csv = "output.csv"
    # Outlier limits for the eight groups: the same four values, used twice.
    outlier_limits = [2.5, 2.75, 2.75, 2.6] * 2
    poly_degree = 2
    maxiter, stepsize, lambda_, is_newton = 20000, 1e-01, 1, 1

    # ---- read data ---------------------------------------------------------
    train_y, train_tx, train_ids = load_csv_data(train_csv, sub_sample=False)
    eval_y, eval_tx, eval_ids = load_csv_data(test_csv)

    # ---- split data --------------------------------------------------------
    train_tx_groups, train_y_groups, _ = split_data_by_jet_num_feature(
        train_tx, train_y, train_ids)
    eval_tx_groups, _, eval_id_groups = split_data_by_jet_num_feature(
        eval_tx, eval_y, eval_ids)

    prediction_chunks = []
    id_chunks = []
    for group, cutoff in enumerate(outlier_limits):
        # ---- training ------------------------------------------------------
        group_tx, group_mean, group_std = standardize_training(
            train_tx_groups[group])
        group_tx, group_y, _, _ = remove_outliers(
            group_tx, train_y_groups[group], cutoff)
        # Mean/std are refreshed after the outliers are gone; the refreshed
        # values are the ones applied to the test rows.
        group_tx, group_mean, group_std = redo_standardization(
            group_tx, group_mean, group_std)
        group_tx = get_polynomial(group_tx, poly_degree)
        weights = logistic_regression(group_y, group_tx, is_newton, stepsize,
                                      maxiter, lambda_)

        # ---- prediction ----------------------------------------------------
        eval_group_tx = standardize_test(eval_tx_groups[group], group_mean,
                                         group_std)
        eval_group_tx = get_polynomial(eval_group_tx, poly_degree)
        prediction_chunks.append(predict_labels(weights, eval_group_tx))
        id_chunks.append(eval_id_groups[group])

    # Collate all groups in group order.
    y_pred_all = np.concatenate(prediction_chunks)
    test_ids_all = np.concatenate(id_chunks)

    # ---- output ------------------------------------------------------------
    # Bring the (id, prediction) pairs back into ascending id order.
    ordered_pairs = sorted(zip(test_ids_all, y_pred_all))
    ordered_ids, ordered_preds = zip(*ordered_pairs)
    ordered_ids = np.array(ordered_ids)
    ordered_preds = np.array(ordered_preds)
    print('length of test ids', len(ordered_ids))
    print('length of test ys', len(ordered_preds))

    # Create output file
    print("Write CSV output: START")
    create_csv_submission(ordered_ids, ordered_preds, submission_csv)
    print("Write CSV output: DONE")