Ejemplo n.º 1
0
def build_model(congress):
    majority_party = load_majority_party(congress)
    committee_membership = load_committee_membership(congress)
    indexed_success_text = load_indexed_success_text()
    lobbying_data = None  #load_lobbying_data(congress)

    # universe
    BILLS = Bill.objects.filter(congress=congress).prefetch_related()

    # compute the most frequent first few words of bills. slurp in all of the titles,
    # record the counts of all of the prefixes of the titles, and then take the top few,
    # excluding ones that are prefixes of another popular prefix.
    title_counts = {}
    for bill in BILLS:
        title = bill.title_no_number
        if title.startswith("Providing for consideration of"):
            continue  # hack, add later
        for nwords in xrange(4, 10):
            prefix = " ".join(title.split(" ")[0:nwords])
            title_counts[prefix] = title_counts.get(prefix, 0) + 1
    title_counts = sorted(title_counts.items(),
                          key=lambda kv: kv[1],
                          reverse=True)
    pop_title_prefixes = list()
    for t, c in title_counts:
        seen = False
        for tt in list(pop_title_prefixes):
            if tt.startswith(t + " "):
                seen = True
            elif t.startswith(tt + " "):
                pop_title_prefixes.remove(tt)
        if seen: continue
        pop_title_prefixes.append(t)
        if len(pop_title_prefixes) == 40: break
    pop_title_prefixes.append(
        "A joint resolution proposing an amendment to the Constitution")
    pop_title_prefixes.append("Providing for consideration of")

    # We create separate models for bills by the bill type (bill, joint resolution,
    # concurrent resolution, simple resolution) and by whether the bill's status is
    # introduced or has been reported or more.
    #
    # Once a bill has been reported by committee it's chances of success are
    # of course much higher, since the bills that have not been reported by committee
    # in historical data are necessarily failed bills. Also the models change
    # substantially.

    MODEL = dict()

    for bill_type, model_type in itertools.product(bill_type_map.keys(),
                                                   (0, 1)):
        # GET LIST OF BILLS

        bills = BILLS.filter(bill_type__in=bill_type_map[bill_type])
        if model_type == 1:
            # In model 0, we scan across all bills, because all bills were
            # in the introduced status at one point. If we filter it to bills whose
            # current status is introduced, obviously they will all have been
            # failed bills, which defeats the purpose. In model 1, we
            # only look at bills that have at least gotten reported so that we can see
            # of reported bills which make it to success.
            bills = bills.exclude(current_status=BillStatus.introduced)

        print bill_type, model_type

        total = bills.count()

        if model_type == 0:
            # for the introduced model, success is getting out of committee
            total_success = bills.exclude(
                current_status=BillStatus.introduced).count()
        else:
            # for the reported model, success is being enacted (or whatever final status as appropriate for the bill type)
            total_success = bills.filter(
                current_status__in=BillStatus.final_status_passed).count()

        print "\toverall", int(round(100.0 * total_success /
                                     total)), "%; N=", total

        # GET REGRESSION MATRIX INFORMATION

        # Build a list of sets, one for each bill, containing the binary
        # factors that apply to the bill. Build a corresponding list of
        # floats (either 1.0, 0.0) indicating whether the bill was successful.
        #
        # Also remember for each binary factor the total count of bills
        # it applied to and the count of those that were successful.
        #
        # And also remember for each binary factor, the short descriptive
        # text for the factor.

        factor_success_rate = {}
        regression_outcomes = []
        regression_predictors = []
        factor_descriptions = {}
        #bills = bills[0:100] # REMOVEME
        for bill in bills:
            # What's the measured outcome for this bill? Check if the bill
            # ended in a success state. Allow floating-point values!
            success = is_success(bill, model_type,
                                 indexed_success_text[bill_type])

            # Get the binary factors that apply to this bill.
            factors = get_bill_factors(bill, pop_title_prefixes,
                                       committee_membership, majority_party,
                                       lobbying_data)

            # maintain a simple list of success percent rates for each factor individually
            for key, descr, general_descr in factors:
                if not key in factor_success_rate:
                    factor_success_rate[key] = [0, 0
                                                ]  # count of total, successful
                factor_success_rate[key][0] += 1
                factor_success_rate[key][1] += success
                factor_descriptions[key] = general_descr

            # build data for a regression
            regression_outcomes.append(success)
            regression_predictors.append(set(
                f[0] for f in
                factors))  # extract just the key from the (key, descr) tuple

        # FIRST PASS SIGNIFICANCE CHECK

        # Reduce the complexity of the regression model by filtering out
        # factors that, when considered independently, don't have a success
        # rate that appears to differ from the population success rate.

        factor_binomial_sig = dict()
        for key, bill_counts in factor_success_rate.items():
            # If there were very few bills with this factor, do not include it in the model.
            if bill_counts[0] < 15: continue

            # Create a binomial distribution with a sample size the same as
            # the number of bills with this factor, and with a probability
            # of heads equal to the population success rate.
            distr = scipy.stats.binom(bill_counts[0],
                                      float(total_success) / float(total))

            # What is the possibility that we would see as many or as few
            # successes as we do (i.e. two tailed).
            pless = distr.cdf(bill_counts[1])  # as few == P(count <= observed)
            pmore = 1.0 - (distr.cdf(bill_counts[1] - 1) if bill_counts[1] > 0
                           else 0.0)  # as many == P(count >= observed)
            p = min(pless, pmore)
            if p < .05:
                factor_binomial_sig[key] = p

        # LOGISTIC REGRESSION

        for trial in xrange(2):
            regression_predictors_map = None
            regression_beta = None
            if len(factor_binomial_sig) > 0:
                # Assign consecutive indices to the remaining factors.
                regression_predictors_map = dict(
                    reversed(e) for e in enumerate(factor_binomial_sig))

                # Build a binary matrix indicating which bills have which factors.
                regression_predictors_2 = [[]
                                           for f in regression_predictors_map]
                for factors in regression_predictors:
                    for fname, findex in regression_predictors_map.items():
                        regression_predictors_2[findex].append(
                            1.0 if fname in factors else 0.0)
                regression_predictors_2 = numpy.array(regression_predictors_2)
                regression_outcomes = numpy.array(regression_outcomes)

                # Perform regression.
                regression_beta, J_bar, l = logistic_regression(
                    regression_predictors_2, regression_outcomes)

                # Remove factors that are within 1.75 standard error from zero,
                # and then re-run the regression.
                if trial == 0:
                    # Get the standard errors (the logistic_regression module
                    # says to do it this way).
                    from numpy import sqrt, diag, abs, median
                    from numpy.linalg import inv
                    try:
                        stderrs = sqrt(diag(
                            inv(J_bar)))  # [intercept, beta1, beta2, ...]
                    except numpy.linalg.linalg.LinAlgError as e:
                        print "\t", e
                        break

                    # The standard errors are coming back wacky large for
                    # the factors with VERY large beta. Special-case those.
                    for fname, findex in regression_predictors_map.items():
                        beta = regression_beta[findex + 1]
                        stderr = stderrs[findex + 1]
                        if abs(beta / stderr) < 1.75 and abs(beta) < 5.0:
                            # This factor's effect is small/non-significant,
                            # so remove it from factor_binomial_sig so that on
                            # next iteration it is excluded from regression.
                            del factor_binomial_sig[fname]

        # Generate the model for output.
        model = dict()
        MODEL[(bill_type, model_type == 0)] = model
        if model_type == 0:
            model["success_name"] = "sent out of committee to the floor"
        else:
            if bill_type == "bill":
                model["success_name"] = "enacted"
            elif bill_type == "jr":
                model["success_name"] = "enacted or passed"
            else:
                model["success_name"] = "agreed to"
        model["count"] = total
        model["success_rate"] = 100.0 * total_success / total
        model["bill_type"] = bill_type
        model["bill_type_descr"] = bill_type_names[bill_type]
        model["is_introduced_model"] = (model_type == 0)
        model["regression_predictors_map"] = regression_predictors_map
        model["regression_beta"] = list(
            regression_beta) if regression_beta != None else None
        model_factors = dict()
        model["factors"] = model_factors
        for key, bill_counts in factor_success_rate.items():
            if key not in factor_binomial_sig: continue
            print "\t" + key, \
             int(round(100.0*bill_counts[1]/bill_counts[0])), "%;", \
             "N=", bill_counts[0], \
             "p<", int(round(100*factor_binomial_sig[key])), \
             "B=", regression_beta[regression_predictors_map[key]+1]
            model_factors[key] = dict()
            model_factors[key]["count"] = bill_counts[0]
            model_factors[key][
                "success_rate"] = 100.0 * bill_counts[1] / bill_counts[0]
            model_factors[key]["regression_beta"] = regression_beta[
                regression_predictors_map[key] + 1]
            model_factors[key]["description"] = factor_descriptions[key]

    with open("bill/prognosis_model.py", "w") as modelfile:
        modelfile.write(
            "# this file was automatically generated by prognosis.py\n")
        modelfile.write("congress = %d\n" % congress)
        from pprint import pprint
        modelfile.write("pop_title_prefixes = ")
        pprint(pop_title_prefixes, modelfile)
        modelfile.write("factors = ")
        pprint(MODEL, modelfile)
Ejemplo n.º 2
0
def run():
    """Train a logistic regression on the training set and export test predictions.

    Self-contained driver script. The steps are:
        0. Fix the settings: polynomial degree candidates, step-size (gamma)
           candidates, and the iteration budget.
        1. Load the training data and map the labels with (y + 1) / 2.
        2. Pick the degree and gamma. Cross-validation is not run here: the
           first entry of each candidate list is used.
        3. Clean, standardize, and polynomially expand the training features,
           then fit the weights with logistic_regression.
        4. Apply the same preprocessing to the test data (reusing the
           training median/mean/std), predict, and write the submission CSV.
    """
    # 0. Settings. Only a single polynomial degree is supported for now.
    candidate_degrees = np.array([2])
    candidate_gammas = [1e-5, 1e-6, 1e-7]
    max_iters = 2000

    # 1. Training data.
    print('LOADING THE DATA: ', end=" ")
    train_path = '../data/train.csv'
    labels, features, _ = load_csv_data(train_path)
    labels = (labels + 1) / 2
    print('DONE')

    # 2. Model selection: take the first candidate of each list.
    print('CROSS VALIDATION')
    degree, gamma = candidate_degrees[0], candidate_gammas[0]

    # 3. Preprocess the training features, remembering the statistics so the
    #    test set can be transformed identically, then fit.
    features = count_NaN(features)
    features, train_median = sanitize_NaN(features)
    features, train_mean, train_std = standardize(features)
    features = build_poly(features, degree)

    weights = logistic_regression(labels, features, gamma, max_iters)

    print('Weights on whole set\n', weights)

    # 4. Test data: same pipeline, driven by the training statistics.
    test_path = '../data/test.csv'
    print('IMPORTING TESTING DATA :', end=" ")
    _, test_features, test_ids = load_csv_data(test_path)
    print('DONE')

    test_features = count_NaN(test_features)
    test_features, _ = sanitize_NaN(test_features, train_median)
    test_features, _, _ = standardize(test_features, train_mean, train_std)
    test_features = build_poly(test_features, degree)

    # Name of the submission file to produce.
    submission_path = 'results/output_sanitized_normalization_test_deg_2_gamma_8.csv'
    print('EXPORTING TESTING DATA WITH PREDICTIONS :', end=" ")
    predictions = predict_labels(np.array(weights), np.array(test_features))
    create_csv_submission(test_ids, predictions, submission_path)
    print('DONE')
Ejemplo n.º 3
0
def build_model(congress):
	majority_party = load_majority_party(congress)
	committee_membership = load_committee_membership(congress)
	lobbying_data = None #load_lobbying_data(congress)
	
	# universe
	BILLS = Bill.objects.filter(congress=congress).prefetch_related()
	
	# compute the most frequent first few words of bills. slurp in all of the titles,
	# record the counts of all of the prefixes of the titles, and then take the top few,
	# excluding ones that are prefixes of another popular prefix.
	title_counts = { }
	for bill in BILLS:
		title = bill.title_no_number
		if title.startswith("Providing for consideration of"): continue # hack, add later
		for nwords in xrange(4, 10):
			prefix = " ".join(title.split(" ")[0:nwords])
			title_counts[prefix] = title_counts.get(prefix, 0) + 1
	title_counts = sorted(title_counts.items(), key = lambda kv : kv[1], reverse=True)
	pop_title_prefixes = list()
	for t, c in title_counts:
		seen = False
		for tt in list(pop_title_prefixes):
			if tt.startswith(t + " "):
				seen = True
			elif t.startswith(tt + " "):
				pop_title_prefixes.remove(tt)
		if seen: continue
		pop_title_prefixes.append(t)
		if len(pop_title_prefixes) == 40: break
	pop_title_prefixes.append("A joint resolution proposing an amendment to the Constitution")
	pop_title_prefixes.append("Providing for consideration of")
		
	# We create separate models for bills by the bill type (H.R., S., H.Res., etc.)
	# and by whether the bill is introduced/referred or has been reported or more.
	# Once a bill has been reported by committee it's chances of success are
	# of course much higher, since the bills that have not been reported by committee
	# in historical data are necessarily failed bills. Also the models change
	# substantially.
		
	MODEL = dict()
	
	for (bill_type, bill_type_descr), model_type in itertools.product(BillType, (0, 1)):
		#if bill_type != BillType.house_bill: continue
		#if bill_type not in (BillType.house_joint_resolution, BillType.senate_joint_resolution): continue
		
		bills = BILLS.filter(bill_type=bill_type)
		if model_type == 1:
			# In model 0, we scan across all bills, because all bills were
			# in the introduced/referred status at one point. If we filter it to bills whose
			# current status is introduced/referred, obviously they will all have been
			# failed bills, which defeats the purpose. In model 1, we
			# only look at bills that have at least gotten reported so that we can see
			# of reported bills which make it to success.
			bills = bills.exclude(current_status__in=BillStatus.introduced_statuses)

		print bill_type_descr, model_type
		
		total = bills.count()
		
		if model_type == 0:
			# for the introduced model, success is getting out of committee
			total_success = bills.exclude(current_status__in=BillStatus.introduced_statuses).count()
		else:
			# for the reported model, success is being enacted (or whatever final status as appropriate for the bill type)
			total_success = bills.filter(current_status__in=BillStatus.final_status_passed).count()
			
		print "\toverall", int(round(100.0*total_success/total)), "%; N=", total
		
		sorted_bills = { }
		regression_outcomes = [ ]
		regression_predictors = [ ]
		factor_descriptions = { }
		for bill in bills:
			#import random # speed this up?
			#if random.random() < .7: continue
			
			# What's the measured binary outcome for this bill? Check if the bill
			# ended in a success state.
			success = is_success(bill, model_type)
			
			# Get the binary factors that apply to this bill.
			factors = get_bill_factors(bill, pop_title_prefixes, committee_membership, majority_party, lobbying_data)
			
			# maintain a simple list of success percent rates for each factor individually
			for key, descr, general_descr in factors:
				if not key in sorted_bills: sorted_bills[key] = [0, 0] # count of total, successful
				sorted_bills[key][0] += 1
				if success: sorted_bills[key][1] += 1
				factor_descriptions[key] = general_descr
				
			# build data for a regression
			regression_outcomes.append(1.0 if success else 0.0)
			regression_predictors.append(set( f[0] for f in factors )) # extract just the key from the (key, descr) tuple
			
		# check which factors were useful
		significant_factors = dict()
		for key, bill_counts in sorted_bills.items():
			# create a binomial distribution based on the overall pass rate for this
			# type of bill (H.R., H.Res., etc.) and a draw the number of bills
			# within this subset (key) that are passed, and see if it is statistically
			# different from the overall count. only include statistical differences.
			if bill_counts[0] < 15: continue
			distr = scipy.stats.binom(bill_counts[0], float(total_success)/float(total))
			pless = distr.cdf(bill_counts[1])
			pmore = 1.0-distr.cdf(bill_counts[1])
			if pless < .015 or pmore < .015 or (total < 100 and (pless < .05 or pmore < .05)):
				# only show statistically significant differences from the group mean
				significant_factors[key] = (pless, pmore)
				
		# run a logistic regression
		regression_predictors_map = None
		regression_beta = None
		if len(significant_factors) > 0:
			regression_predictors_map = dict(reversed(e) for e in enumerate(significant_factors))
			regression_predictors_2 = [ [] for f in regression_predictors_map ]
			for factors in regression_predictors:
				for fname, findex in regression_predictors_map.items():
					regression_predictors_2[findex].append(1.0 if fname in factors else 0.0)
			regression_predictors_2 = numpy.array(regression_predictors_2)
			regression_outcomes = numpy.array(regression_outcomes)
			regression_beta, J_bar, l = logistic_regression(regression_predictors_2, regression_outcomes)
			
		# Generate the model for output.
		model = dict()
		MODEL[(bill_type,model_type == 0)] = model
		if model_type == 0:
			model["success_name"] = "sent out of committee to the floor"
		else:
			if bill_type in (BillType.senate_bill, BillType.house_bill):
				model["success_name"] = "enacted"
			elif bill_type in (BillType.senate_joint_resolution, BillType.house_joint_resolution):
				model["success_name"] = "enacted or passed"
			else:
				model["success_name"] = "agreed to"
		model["count"] = total
		model["success_rate"] = 100.0*total_success/total
		model["regression_predictors_map"] = regression_predictors_map
		model["regression_beta"] = list(regression_beta) if regression_beta != None else None
		model_factors = dict()
		model["factors"] = model_factors
		for key, bill_counts in sorted_bills.items():
			if key not in significant_factors: continue
			pless, pmore = significant_factors[key]
			print "\t" + key, int(round(100.0*bill_counts[1]/bill_counts[0])), "%; N=", bill_counts[0], "p<", int(round(100*pless)), int(round(100*pmore)), "B=", regression_beta[regression_predictors_map[key]+1]
			model_factors[key] = dict()
			model_factors[key]["count"] = bill_counts[0]
			model_factors[key]["success_rate"] = 100.0*bill_counts[1]/bill_counts[0]
			model_factors[key]["regression_beta"] = regression_beta[regression_predictors_map[key]+1]
			model_factors[key]["description"] = factor_descriptions[key]
			
	with open("bill/prognosis_model.py", "w") as modelfile:
		modelfile.write("# this file was automatically generated by prognosis.py\n")
		modelfile.write("congress = %d\n" % congress)
		from pprint import pprint
		modelfile.write("pop_title_prefixes = ")
		pprint(pop_title_prefixes, modelfile)
		modelfile.write("factors = ")
		pprint(MODEL, modelfile)
def build_model(congress):
	majority_party = load_majority_party(congress)
	committee_membership = load_committee_membership(congress)
	indexed_success_text = load_indexed_success_text()
	lobbying_data = None #load_lobbying_data(congress)
	
	# universe
	BILLS = Bill.objects.filter(congress=congress).prefetch_related()
	
	# compute the most frequent first few words of bills. slurp in all of the titles,
	# record the counts of all of the prefixes of the titles, and then take the top few,
	# excluding ones that are prefixes of another popular prefix.
	title_counts = { }
	for bill in BILLS:
		title = bill.title_no_number
		if title.startswith("Providing for consideration of"): continue # hack, add later
		for nwords in xrange(4, 10):
			prefix = " ".join(title.split(" ")[0:nwords])
			title_counts[prefix] = title_counts.get(prefix, 0) + 1
	title_counts = sorted(title_counts.items(), key = lambda kv : kv[1], reverse=True)
	pop_title_prefixes = list()
	for t, c in title_counts:
		seen = False
		for tt in list(pop_title_prefixes):
			if tt.startswith(t + " "):
				seen = True
			elif t.startswith(tt + " "):
				pop_title_prefixes.remove(tt)
		if seen: continue
		pop_title_prefixes.append(t)
		if len(pop_title_prefixes) == 40: break
	pop_title_prefixes.append("A joint resolution proposing an amendment to the Constitution")
	pop_title_prefixes.append("Providing for consideration of")
		
	# We create separate models for bills by the bill type (bill, joint resolution,
	# concurrent resolution, simple resolution) and by whether the bill's status is
	# introduced/referred or has been reported or more.
	#
	# Once a bill has been reported by committee it's chances of success are
	# of course much higher, since the bills that have not been reported by committee
	# in historical data are necessarily failed bills. Also the models change
	# substantially.
		
	MODEL = dict()
	
	for bill_type, model_type in itertools.product(bill_type_map.keys(), (0, 1)):
		# GET LIST OF BILLS
		
		bills = BILLS.filter(bill_type__in=bill_type_map[bill_type])
		if model_type == 1:
			# In model 0, we scan across all bills, because all bills were
			# in the introduced/referred status at one point. If we filter it to bills whose
			# current status is introduced/referred, obviously they will all have been
			# failed bills, which defeats the purpose. In model 1, we
			# only look at bills that have at least gotten reported so that we can see
			# of reported bills which make it to success.
			bills = bills.exclude(current_status__in=BillStatus.introduced_statuses)

		print bill_type, model_type
		
		total = bills.count()
		
		if model_type == 0:
			# for the introduced model, success is getting out of committee
			total_success = bills.exclude(current_status__in=BillStatus.introduced_statuses).count()
		else:
			# for the reported model, success is being enacted (or whatever final status as appropriate for the bill type)
			total_success = bills.filter(current_status__in=BillStatus.final_status_passed).count()
			
		print "\toverall", int(round(100.0*total_success/total)), "%; N=", total
		
		# GET REGRESSION MATRIX INFORMATION
		
		# Build a list of sets, one for each bill, containing the binary
		# factors that apply to the bill. Build a corresponding list of
		# floats (either 1.0, 0.0) indicating whether the bill was successful.
		#
		# Also remember for each binary factor the total count of bills
		# it applied to and the count of those that were successful.
		#
		# And also remember for each binary factor, the short descriptive
		# text for the factor.
		
		factor_success_rate = { }
		regression_outcomes = [ ]
		regression_predictors = [ ]
		factor_descriptions = { }
		#bills = bills[0:100] # REMOVEME
		for bill in bills:
			# What's the measured outcome for this bill? Check if the bill
			# ended in a success state. Allow floating-point values!
			success = is_success(bill, model_type, indexed_success_text[bill_type])
			
			# Get the binary factors that apply to this bill.
			factors = get_bill_factors(bill, pop_title_prefixes, committee_membership, majority_party, lobbying_data)
			
			# maintain a simple list of success percent rates for each factor individually
			for key, descr, general_descr in factors:
				if not key in factor_success_rate: factor_success_rate[key] = [0, 0] # count of total, successful
				factor_success_rate[key][0] += 1
				factor_success_rate[key][1] += success
				factor_descriptions[key] = general_descr
				
			# build data for a regression
			regression_outcomes.append(success)
			regression_predictors.append(set( f[0] for f in factors )) # extract just the key from the (key, descr) tuple
			
		# FIRST PASS SIGNIFICANCE CHECK
		
		# Reduce the complexity of the regression model by filtering out
		# factors that, when considered independently, don't have a success
		# rate that appears to differ from the population success rate.
			
		factor_binomial_sig = dict()
		for key, bill_counts in factor_success_rate.items():
			# If there were very few bills with this factor, do not include it in the model.
			if bill_counts[0] < 15: continue
			
			# Create a binomial distribution with a sample size the same as
			# the number of bills with this factor, and with a probability
			# of heads equal to the population success rate.
			distr = scipy.stats.binom(bill_counts[0], float(total_success)/float(total))
			
			# What is the possibility that we would see as many or as few
			# successes as we do (i.e. two tailed).
			pless = distr.cdf(bill_counts[1]) # as few == P(count <= observed)
			pmore = 1.0-(distr.cdf(bill_counts[1]-1) if bill_counts[1] > 0 else 0.0) # as many == P(count >= observed)
			p = min(pless, pmore)
			if p < .05:
				factor_binomial_sig[key] = p
				
		# LOGISTIC REGRESSION
		
		for trial in xrange(2):
			regression_predictors_map = None
			regression_beta = None
			if len(factor_binomial_sig) > 0:
				# Assign consecutive indices to the remaining factors.
				regression_predictors_map = dict(reversed(e) for e in enumerate(factor_binomial_sig))
				
				# Build a binary matrix indicating which bills have which factors.
				regression_predictors_2 = [ [] for f in regression_predictors_map ]
				for factors in regression_predictors:
					for fname, findex in regression_predictors_map.items():
						regression_predictors_2[findex].append(1.0 if fname in factors else 0.0)
				regression_predictors_2 = numpy.array(regression_predictors_2)
				regression_outcomes = numpy.array(regression_outcomes)
				
				# Perform regression.
				regression_beta, J_bar, l = logistic_regression(regression_predictors_2, regression_outcomes)
				
				# Remove factors that are within 1.75 standard error from zero,
				# and then re-run the regression.
				if trial == 0:
					# Get the standard errors (the logistic_regression module
					# says to do it this way).
					from numpy import sqrt, diag, abs, median
					from numpy.linalg import inv
					try:
						stderrs = sqrt(diag(inv(J_bar))) # [intercept, beta1, beta2, ...]
					except numpy.linalg.linalg.LinAlgError as e:
						print "\t", e
						break
					
					# The standard errors are coming back wacky large for
					# the factors with VERY large beta. Special-case those.
					for fname, findex in regression_predictors_map.items():
						beta = regression_beta[findex+1]
						stderr = stderrs[findex+1]
						if abs(beta/stderr) < 1.75 and abs(beta) < 5.0:
							# This factor's effect is small/non-significant,
							# so remove it from factor_binomial_sig so that on
							# next iteration it is excluded from regression.
							del factor_binomial_sig[fname]
			
		# Generate the model for output.
		model = dict()
		MODEL[(bill_type,model_type == 0)] = model
		if model_type == 0:
			model["success_name"] = "sent out of committee to the floor"
		else:
			if bill_type == "bill":
				model["success_name"] = "enacted"
			elif bill_type == "jr":
				model["success_name"] = "enacted or passed"
			else:
				model["success_name"] = "agreed to"
		model["count"] = total
		model["success_rate"] = 100.0*total_success/total
		model["bill_type"] = bill_type
		model["bill_type_descr"] = bill_type_names[bill_type]
		model["is_introduced_model"] = (model_type == 0)
		model["regression_predictors_map"] = regression_predictors_map
		model["regression_beta"] = list(regression_beta) if regression_beta != None else None
		model_factors = dict()
		model["factors"] = model_factors
		for key, bill_counts in factor_success_rate.items():
			if key not in factor_binomial_sig: continue
			print "\t" + key, \
				int(round(100.0*bill_counts[1]/bill_counts[0])), "%;", \
				"N=", bill_counts[0], \
				"p<", int(round(100*factor_binomial_sig[key])), \
				"B=", regression_beta[regression_predictors_map[key]+1]
			model_factors[key] = dict()
			model_factors[key]["count"] = bill_counts[0]
			model_factors[key]["success_rate"] = 100.0*bill_counts[1]/bill_counts[0]
			model_factors[key]["regression_beta"] = regression_beta[regression_predictors_map[key]+1]
			model_factors[key]["description"] = factor_descriptions[key]
			
	with open("bill/prognosis_model.py", "w") as modelfile:
		modelfile.write("# this file was automatically generated by prognosis.py\n")
		modelfile.write("congress = %d\n" % congress)
		from pprint import pprint
		modelfile.write("pop_title_prefixes = ")
		pprint(pop_title_prefixes, modelfile)
		modelfile.write("factors = ")
		pprint(MODEL, modelfile)
Ejemplo n.º 5
0
        print("Split data: DONE")

        #Standardize the training data
        training_tx, training_tx_mean, training_tx_std = standardize_training(
            training_tx)

        #Remove outliers and -999 from the standardized training dataset
        training_tx, training_y = remove_outliers(training_tx, training_y, 2.3)

        #Use polynomial terms
        training_tx = get_polynomial(training_tx, 2)

        #Get optimal weights
        print("Compute least squares: START")
        maxiter, stepsize, lambda_ = 500, 1, 0
        weights = logistic_regression(training_y, training_tx, 1, stepsize,
                                      maxiter, lambda_)
        print("Compute least squares: DONE")
        # print("Weights are:",weights)

        #Standardize the test data using training mean and std
        test_tx = standardize_test(test_tx, training_tx_mean, training_tx_std)

        #Use polynomial terms
        test_tx = get_polynomial(test_tx, 2)

        # Perform prediction
        print("Perform prediction: START")
        y_pred = predict_labels(weights, test_tx)
        print("Perform prediction: DONE")

        # Verify predictions with in house test data
Ejemplo n.º 6
0
############### Name: Shubham Pareek ############
############### UBID: spareek        ############

from logistic_regression import *
from linear_regression import *
from neural_network import *
from preprocessing import *

# Build the four feature/label sets: every combination of the two data
# sources and the two feature-construction methods, in a fixed order.
(X1, y1), (X2, y2), (X3, y3), (X4, y4) = [
    get_feature_matrix(data=source, method=how)
    for source in ('hod', 'gsc')
    for how in ('concatenate', 'subtract')
]

# Run each model on every feature set, one model at a time, so the calls
# happen in the order: all logistic, then all linear, then all neural network.
for run_model in (logistic_regression, linear_regression, neural_network):
    for features, labels in ((X1, y1), (X2, y2), (X3, y3), (X4, y4)):
        run_model(features, labels)
def exp_three_models():
    """Cross-validate the per-subset logistic-regression models over a lambda grid.

    For every regularisation strength in ``list_lambda`` the training data is
    split ``n_trials`` times into a training part and a held-out part. Both
    parts are divided into 8 subsets by ``split_data_by_jet_num_feature``; for
    each subset a model is fitted (standardise, remove outliers,
    re-standardise, polynomial expansion, ``logistic_regression``) and used to
    predict the matching held-out subset.

    The running mean of the overall held-out accuracy and of the per-subset
    weight norms is printed and written, one line per lambda, to
    ``lambda_check.txt`` in the working directory.

    Takes no arguments and returns ``None``; reads ``train.csv``.
    """
    training_data_path = "train.csv"
    train_ratio = 0.9
    n_trials = 10
    n_subsets = 8
    # Number of ids handed to the splitter for the training part of a split.
    # NOTE(review): hard-codes a 250000-row training file -- confirm.
    limit = int(train_ratio * 250000)
    sd_limit_0 = 3.0
    sd_limit_1 = 2.75
    sd_limit_2 = 2.75
    sd_limit_3 = 2.6
    # Outlier cut-off per subset; the four limits repeat for subsets 4-7.
    list_sd_limit = [sd_limit_0, sd_limit_1, sd_limit_2, sd_limit_3] * 2
    list_poly = [2] * n_subsets  # polynomial degree used for each subset
    list_lambda = [1.0]  # regularisation strengths to evaluate
    # Float array: the running mean of the weight norms is fractional.
    weights_average = np.zeros(n_subsets)
    maxiter, stepsize, is_newton = 20000, 1e-05, 0
    print('isNewton = ', is_newton)

    print('stepsize = ', stepsize)

    # The context manager guarantees the log is flushed and closed even if a
    # trial raises part-way through the experiment.
    with open('lambda_check.txt', 'w') as f:
        f.write('lambda' + '\t' + 'average accuracy' + '\t' +
                'average weights' + '\n')

        # Read training data
        training_y_full, training_tx_full, training_ids = load_csv_data(
            training_data_path, sub_sample=False)

        for lambda_ in list_lambda:
            accuracy_average = 0
            print('lambda = ', lambda_)
            for i_trial in range(n_trials):

                # Split the data into test and training
                training_tx, training_y, test_tx, test_y = split_data(
                    training_tx_full, training_y_full, train_ratio, i_trial)

                # Split both parts into the per-subset lists; the returned
                # id lists are not needed for cross-validation.
                list_training_tx, list_training_y, _ = (
                    split_data_by_jet_num_feature(
                        training_tx, training_y, training_ids[:limit]))
                list_test_tx, list_test_y, _ = split_data_by_jet_num_feature(
                    test_tx, test_y, training_ids[:250000 - limit])

                # Norm of the fitted weights for each subset. This must be a
                # float array: storing a norm into an integer array would
                # silently truncate it to a whole number.
                list_weight_norms = np.zeros(n_subsets)
                y_pred_parts = []
                test_y_parts = []

                for i in range(n_subsets):
                    training_tx_i = list_training_tx[i]
                    training_y_i = list_training_y[i]
                    test_tx_i = list_test_tx[i]
                    test_y_i = list_test_y[i]
                    sd_limit = list_sd_limit[i]

                    # ---------------------- TRAINING ----------------------
                    # Standardize the training data
                    training_tx_i, tx_mean, tx_std = standardize_training(
                        training_tx_i)

                    # Remove outliers from the standardized training data
                    training_tx_i, training_y_i, _, _ = remove_outliers(
                        training_tx_i, training_y_i, sd_limit)

                    # Redo standardization on the remaining rows
                    training_tx_i, tx_mean, tx_std = redo_standardization(
                        training_tx_i, tx_mean, tx_std)

                    # Create polynomial expansions
                    training_tx_i = get_polynomial(training_tx_i,
                                                   list_poly[i])

                    # Get weights
                    weights_i = logistic_regression(
                        training_y_i, training_tx_i, is_newton, stepsize,
                        maxiter, lambda_)

                    # --------------------- PREDICTION ---------------------
                    # Standardize the held-out data with the training
                    # statistics, then apply the same polynomial expansion.
                    test_tx_i = standardize_test(test_tx_i, tx_mean, tx_std)
                    test_tx_i = get_polynomial(test_tx_i, list_poly[i])

                    # Collect predictions and labels for the overall score
                    y_pred_parts.append(predict_labels(weights_i, test_tx_i))
                    test_y_parts.append(test_y_i)

                    list_weight_norms[i] = np.linalg.norm(weights_i)

                accuracy = verify_prediction(np.concatenate(y_pred_parts),
                                             np.concatenate(test_y_parts))
                print('Accuracy Cross is:', accuracy)

                # Incremental mean over the trials seen so far; at
                # i_trial == 0 this discards whatever the average held before.
                accuracy_average = (accuracy + (accuracy_average * i_trial)
                                    ) / (i_trial + 1)
                weights_average = (list_weight_norms +
                                   (weights_average * i_trial)
                                   ) / (i_trial + 1)

            print('Accuracy Average is:', accuracy_average)
            print('Weights Average Norms are:', weights_average)
            f.write(
                str(lambda_) + '\t' + str(accuracy_average) + '\t' +
                str(weights_average) + '\n')
Ejemplo n.º 8
0
def exp_three_models():
    """Train three per-jet-group logistic models, validate them, write a submission.

    The training file is split into a training part and a held-out part. Each
    part is divided by ``split_data_by_jet_num`` into three groups ("0", "1"
    and "others"). One logistic-regression model is fitted per group on
    standardised, outlier-filtered, degree-2 polynomial features. The models
    are scored on the held-out part, then applied to ``test.csv`` and the
    predictions are written to ``output.csv`` ordered by id.

    Rows of group 0 that ``remove_outliers`` rejects at prediction time are not
    passed to the model; they are assigned the label ``-1`` directly.

    Takes no arguments and returns ``None``.
    """
    training_data_path = "train.csv"
    test_data_path = "test.csv"
    output_path = "output.csv"
    train_ratio = 0.9
    n_trials = 1
    # Number of ids handed to the splitter for the training part of a split.
    # NOTE(review): hard-codes a 250000-row training file -- confirm.
    limit = int(train_ratio * 250000)
    sd_limit_0 = 2.3
    sd_limit_1 = 2.75
    sd_limit_others = 2.3
    maxiter, stepsize = 50000, 5e-01
    lambda_ = 0

    # Read training data
    print("Read training data: START")
    training_y_full, training_tx_full, training_ids = load_csv_data(
        training_data_path, sub_sample=False)
    print("Read training data: DONE")

    for i in range(n_trials):
        # Split the data into test and training. Without this split the
        # validation code below has no held-out set to read and fails with
        # UnboundLocalError on test_tx / test_y.
        print("Split data: START")
        training_tx, training_y, test_tx, test_y = split_data(
            training_tx_full, training_y_full, train_ratio, i)
        print("Split data: DONE")

        print("Split data based on jet number: START")
        (training_tx_0, training_y_0, training_ids_0,
         training_tx_1, training_y_1, training_ids_1,
         training_tx_others, training_y_others, training_ids_others
         ) = split_data_by_jet_num(training_tx, training_y,
                                   training_ids[:limit])
        print("Split data based on jet number: DONE")

        # Standardize the training data
        training_tx_0, training_tx_0_mean, training_tx_0_std = (
            standardize_training(training_tx_0))
        training_tx_1, training_tx_1_mean, training_tx_1_std = (
            standardize_training(training_tx_1))
        training_tx_others, training_tx_others_mean, training_tx_others_std = (
            standardize_training(training_tx_others))

        # Remove outliers from the standardized training data; the rejected
        # rows are not used for fitting.
        training_tx_0, training_y_0, _, _ = remove_outliers(
            training_tx_0, training_y_0, sd_limit_0)
        training_tx_1, training_y_1, _, _ = remove_outliers(
            training_tx_1, training_y_1, sd_limit_1)
        training_tx_others, training_y_others, _, _ = remove_outliers(
            training_tx_others, training_y_others, sd_limit_others)

        # Use polynomial terms
        training_tx_0 = get_polynomial(training_tx_0, 2)
        training_tx_1 = get_polynomial(training_tx_1, 2)
        training_tx_others = get_polynomial(training_tx_others, 2)

        # Fit one model per group
        weights_0 = logistic_regression(training_y_0, training_tx_0, 1,
                                        stepsize, maxiter, lambda_)
        weights_1 = logistic_regression(training_y_1, training_tx_1, 1,
                                        stepsize, maxiter, lambda_)
        weights_others = logistic_regression(training_y_others,
                                             training_tx_others, 1, stepsize,
                                             maxiter, lambda_)

        # ------------------------- VALIDATION -------------------------
        (test_tx_0, test_y_0, test_ids_0,
         test_tx_1, test_y_1, test_ids_1,
         test_tx_others, test_y_others, test_ids_others
         ) = split_data_by_jet_num(test_tx, test_y,
                                   training_ids[:250000 - limit])

        # Standardize the held-out data using training mean and std
        test_tx_0 = standardize_test(test_tx_0, training_tx_0_mean,
                                     training_tx_0_std)
        test_tx_1 = standardize_test(test_tx_1, training_tx_1_mean,
                                     training_tx_1_std)
        test_tx_others = standardize_test(test_tx_others,
                                          training_tx_others_mean,
                                          training_tx_others_std)

        # Only group 0 has its outliers set aside at prediction time
        test_tx_0, test_y_0, test_tx_0_out, test_y_0_out = remove_outliers(
            test_tx_0, test_y_0, sd_limit_0)

        test_tx_0 = get_polynomial(test_tx_0, 2)
        test_tx_1 = get_polynomial(test_tx_1, 2)
        test_tx_others = get_polynomial(test_tx_others, 2)

        y_pred_0 = predict_labels(weights_0, test_tx_0)
        y_pred_1 = predict_labels(weights_1, test_tx_1)
        y_pred_others = predict_labels(weights_others, test_tx_others)

        # Group-0 outliers are labelled -1 without consulting the model
        y_pred_0_out = np.array([-1] * test_y_0_out.shape[0])

        accuracy = verify_prediction(y_pred_0, test_y_0)
        print('Accuracy 0 is:', accuracy)
        accuracy = verify_prediction(y_pred_1, test_y_1)
        print('Accuracy 1 is:', accuracy)
        accuracy = verify_prediction(y_pred_others, test_y_others)
        print('Accuracy Others is:', accuracy)

        y_pred_all = np.concatenate(
            (y_pred_0, y_pred_1, y_pred_others, y_pred_0_out))
        test_y_all = np.concatenate(
            (test_y_0, test_y_1, test_y_others, test_y_0_out))
        accuracy = verify_prediction(y_pred_all, test_y_all)
        print('Accuracy Cross is:', accuracy)

    # --------------------------- SUBMISSION ---------------------------
    # Read test data
    print("Read test data: START")
    test_y, test_tx, test_ids = load_csv_data(test_data_path)
    print("Read test data: DONE")

    (test_tx_0, test_y_0, test_ids_0,
     test_tx_1, test_y_1, test_ids_1,
     test_tx_others, test_y_others, test_ids_others
     ) = split_data_by_jet_num(test_tx, test_y, test_ids)

    test_tx_0 = standardize_test(test_tx_0, training_tx_0_mean,
                                 training_tx_0_std)
    test_tx_1 = standardize_test(test_tx_1, training_tx_1_mean,
                                 training_tx_1_std)
    test_tx_others = standardize_test(test_tx_others, training_tx_others_mean,
                                      training_tx_others_std)

    # Send the ids (not the labels) through remove_outliers so the kept and
    # rejected ids stay aligned with the kept and rejected feature rows.
    # Otherwise the id list keeps its original length and order while the
    # predictions do not, and ids get paired with the wrong predictions.
    # NOTE(review): relies on remove_outliers splitting its second argument
    # row-for-row with the features, as the validation code already assumes
    # for labels -- confirm against its implementation.
    test_tx_0, test_ids_0, test_tx_0_out, test_ids_0_out = remove_outliers(
        test_tx_0, test_ids_0, sd_limit_0)

    # Use polynomial terms
    test_tx_0 = get_polynomial(test_tx_0, 2)
    test_tx_1 = get_polynomial(test_tx_1, 2)
    test_tx_others = get_polynomial(test_tx_others, 2)

    # Perform prediction
    print("Perform prediction: START")
    y_pred_0 = predict_labels(weights_0, test_tx_0)
    y_pred_1 = predict_labels(weights_1, test_tx_1)
    y_pred_others = predict_labels(weights_others, test_tx_others)
    y_pred_0_out = np.array([-1] * len(test_ids_0_out))
    print("Perform prediction: DONE")

    # Ids and predictions are concatenated in the same group order, including
    # the group-0 outliers, so both arrays have one entry per test row.
    test_ids_all = np.concatenate(
        (test_ids_0, test_ids_1, test_ids_others, test_ids_0_out))
    y_pred_all = np.concatenate(
        (y_pred_0, y_pred_1, y_pred_others, y_pred_0_out))

    # Restore id order for the submission file
    zipped_list = sorted(zip(test_ids_all, y_pred_all))
    test_ids_all, y_pred_all = zip(*zipped_list)
    test_ids_all = np.array(test_ids_all)
    y_pred_all = np.array(y_pred_all)

    # Create output file
    print("Write CSV output: START")
    create_csv_submission(test_ids_all, y_pred_all, output_path)
    print("Write CSV output: DONE")
Ejemplo n.º 9
0
#Note: At some point I'm going to clean this code up.
#      Right now it still needs to be vectorized.

import matplotlib.pyplot as plt
from logistic_regression import *
from logistic_regression_util import *
import numpy as np
import random
import math
import sys

# Each stage writes its progress label without a newline and flushes it, so
# the label is visible while the stage is still running; "complete!" then
# finishes the line. print(...) with a single argument behaves the same on
# Python 2 and Python 3, whereas the bare print statement is a SyntaxError
# on Python 3.

#-----------------------------------------------------------------------START - Generating Dataset
sys.stdout.write("Generating dataset...")
sys.stdout.flush()
data, X, Y = generateData(200)
print("complete!")
#-----------------------------------------------------------------------END - Generating Dataset

#-----------------------------------------------------------------------START - Train Model
sys.stdout.write("Training model...")
sys.stdout.flush()
model = logistic_regression()
model.train(X, Y)
print("complete!")
#-----------------------------------------------------------------------END - Train Model

#-----------------------------------------------------------------------START - Plot the results
sys.stdout.write("Displaying classification results...")
sys.stdout.flush()
displayClassification(plt, model, X, data)
print("complete!")
#-----------------------------------------------------------------------END - Plot the results
def exp_three_models():
    """Cross-validate one logistic-regression model per jet-number subset.

    The training data is split ``n_trials`` times into a training part and a
    held-out part. Both parts are divided into 4 subsets by
    ``split_data_by_jet_num_2``; for each subset a model is fitted
    (standardise, remove outliers, re-standardise, polynomial expansion,
    ``logistic_regression``) and scored on the matching held-out subset.

    Prints the accuracy of every subset, the accuracy over all subsets
    combined for each trial, and the running mean of the combined accuracy.

    Takes no arguments and returns ``None``; reads ``train.csv``.
    """
    training_data_path = "train.csv"
    train_ratio = 0.8
    n_trials = 10
    n_subsets = 4
    # Number of ids handed to the splitter for the training part of a split.
    # NOTE(review): hard-codes a 250000-row training file -- confirm.
    limit = int(train_ratio * 250000)
    sd_limit_0 = 2.3
    sd_limit_1 = 2.5
    sd_limit_2 = 2.75
    sd_limit_3 = 2.6
    list_sd_limit = [sd_limit_0, sd_limit_1, sd_limit_2, sd_limit_3]
    list_poly = [2, 2, 2, 2]  # polynomial degree used for each subset
    maxiter, stepsize, lambda_, is_newton = 200, 0.1, 1, 1

    # Read training data
    print("Read training data: START")
    training_y_full, training_tx_full, training_ids = load_csv_data(
        training_data_path, sub_sample=False)
    print("Read training data: DONE")

    accuracy_average = 0
    for i_trial in range(n_trials):
        # Split the data into test and training
        print("Split data: START")
        training_tx, training_y, test_tx, test_y = split_data(
            training_tx_full, training_y_full, train_ratio, i_trial)
        print("Split data: DONE")

        print("Split data based on jet number (2 and 3 separate): START")
        list_training_tx, list_training_y, _ = split_data_by_jet_num_2(
            training_tx, training_y, training_ids[:limit])
        print("Split data based on jet number (2 and 3 separate): DONE")

        # Split the held-out data the same way; the id lists are not needed
        # for cross-validation.
        list_test_tx, list_test_y, _ = split_data_by_jet_num_2(
            test_tx, test_y, training_ids[:250000 - limit])

        y_pred_parts = []
        test_y_parts = []

        # Loop through jet numbers
        for i in range(n_subsets):
            training_tx_i = list_training_tx[i]
            training_y_i = list_training_y[i]
            test_tx_i = list_test_tx[i]
            test_y_i = list_test_y[i]
            sd_limit = list_sd_limit[i]

            # -------------------------- TRAINING --------------------------
            # Standardize the training data
            training_tx_i, tx_mean, tx_std = standardize_training(
                training_tx_i)

            # Remove outliers from the standardized training dataset
            training_tx_i, training_y_i, _, _ = remove_outliers(
                training_tx_i, training_y_i, sd_limit)

            # Redo standardization on the remaining rows
            training_tx_i, tx_mean, tx_std = redo_standardization(
                training_tx_i, tx_mean, tx_std)

            # Create polynomial expansions
            training_tx_i = get_polynomial(training_tx_i, list_poly[i])

            # Get weights
            weights_i = logistic_regression(training_y_i, training_tx_i,
                                            is_newton, stepsize, maxiter,
                                            lambda_)

            # ------------------------- PREDICTION -------------------------
            # Standardize the held-out data using training mean and std,
            # then apply the same polynomial expansion.
            test_tx_i = standardize_test(test_tx_i, tx_mean, tx_std)
            test_tx_i = get_polynomial(test_tx_i, list_poly[i])

            y_pred_i = predict_labels(weights_i, test_tx_i)
            accuracy = verify_prediction(y_pred_i, test_y_i)

            # Collect predictions and labels for the combined score
            y_pred_parts.append(y_pred_i)
            test_y_parts.append(test_y_i)

            # Label the score with the subset it belongs to; a fixed "0"
            # would make every subset's line look like subset 0.
            print('Accuracy', i, 'is:', accuracy)

        accuracy = verify_prediction(np.concatenate(y_pred_parts),
                                     np.concatenate(test_y_parts))
        print('Accuracy Cross is:', accuracy)
        # Incremental mean over the trials seen so far
        accuracy_average = (accuracy +
                            (accuracy_average * i_trial)) / (i_trial + 1)
    print('Accuracy Average is:', accuracy_average)
Ejemplo n.º 11
0
def exp_three_models():
    """Fit one logistic-regression model per subset and write the submission.

    Training and test data are each divided into 8 subsets by
    ``split_data_by_jet_num_feature``. For every subset a model is fitted on
    the training rows (standardise, remove outliers, re-standardise, degree-2
    polynomial expansion, ``logistic_regression``) and used to predict the
    matching test rows. The predictions of all subsets are merged, ordered by
    id and written to ``output.csv``.

    Takes no arguments and returns ``None``.
    """
    # ---------------------------- INPUT PARAMS ----------------------------
    training_data_path = "train.csv"
    test_data_path = "test.csv"
    output_path = "output.csv"
    base_sd_limits = [2.5, 2.75, 2.75, 2.6]
    # The same four outlier cut-offs are used for subsets 0-3 and 4-7
    list_sd_limit = base_sd_limits + base_sd_limits
    maxiter, stepsize, lambda_, is_newton = 20000, 1e-01, 1, 1
    poly_degree = 2
    n_subsets = 8

    # ------------------------------ READ DATA -----------------------------
    training_y, training_tx, training_ids = load_csv_data(training_data_path,
                                                          sub_sample=False)
    test_y, test_tx, test_ids = load_csv_data(test_data_path)

    # ----------------------------- SPLIT DATA -----------------------------
    train_tx_parts, train_y_parts, _ = split_data_by_jet_num_feature(
        training_tx, training_y, training_ids)
    test_tx_parts, _, test_id_parts = split_data_by_jet_num_feature(
        test_tx, test_y, test_ids)

    pred_chunks = []
    id_chunks = []

    for subset in range(n_subsets):
        features = train_tx_parts[subset]
        labels = train_y_parts[subset]
        cutoff = list_sd_limit[subset]

        # ------------------------------ TRAINING --------------------------
        # Standardise, drop the outliers, then standardise again on what
        # is left before expanding to polynomial features.
        features, feat_mean, feat_std = standardize_training(features)
        features, labels, _, _ = remove_outliers(features, labels, cutoff)
        features, feat_mean, feat_std = redo_standardization(
            features, feat_mean, feat_std)
        features = get_polynomial(features, poly_degree)

        weights = logistic_regression(labels, features, is_newton, stepsize,
                                      maxiter, lambda_)

        # ----------------------------- PREDICTION -------------------------
        # The test rows get the training statistics and the same expansion.
        unseen = standardize_test(test_tx_parts[subset], feat_mean, feat_std)
        unseen = get_polynomial(unseen, poly_degree)

        pred_chunks.append(predict_labels(weights, unseen))
        id_chunks.append(test_id_parts[subset])

    # Merge the per-subset results in subset order
    y_pred_all = np.concatenate(pred_chunks)
    test_ids_all = np.concatenate(id_chunks)

    # ------------------------------- OUTPUT -------------------------------
    # Order the (id, prediction) pairs by id for the submission file
    ordered_pairs = sorted(zip(test_ids_all, y_pred_all))
    sorted_ids, sorted_preds = zip(*ordered_pairs)
    test_ids_all = np.array(sorted_ids)
    y_pred_all = np.array(sorted_preds)
    print('length of test ids', len(test_ids_all))
    print('length of test ys', len(y_pred_all))

    # Create output file
    print("Write CSV output: START")
    create_csv_submission(test_ids_all, y_pred_all, output_path)
    print("Write CSV output: DONE")