def getData(rep_id, data_directory, preprocess_data=None):
    '''Build the training set for a single representative.

    Loads the rep's vote file from <data_directory>/<rep_id>, fetches the
    matching bills, and turns each vote into a labelled feature vector.

    NOTE(review): an earlier docstring claimed results are cached under
    TrainingData/<rep_id>, but no caching happens in this function --
    confirm whether caching was removed or lives in a caller.

    Args:
        rep_id: Identifier of the representative (also the vote file name).
        data_directory: Directory holding the per-rep vote JSON files.
        preprocess_data: Optional precomputed preprocessing result; when
            None it is regenerated here from the bills.

    Returns:
        Dict of the form:
        { 'labels': list of feature labels shared by every vector,
          'data': list of point dicts with keys 'option', 'bill',
                  'vote_obj' and 'vector' }
    '''
    print('Getting votes and bills...')
    # Get the training votes for this rep, and the bills they refer to.
    votes = load_json(data_directory + '/' + rep_id)
    bills = getBills(votes)
    print('\tdone.')

    # Generate information about the bills used for later feature generation.
    if preprocess_data is None:
        preprocess_data = preprocess.preprocess(rep_id, bills)

    data_points = []
    all_labels = None
    print("Generating all feature vectors...")

    # Iterate through all the votes and bills and compile the final data set.
    for i, v in enumerate(votes):
        if i % 20 == 0:
            # Crude in-place progress indicator ('\b' erases a char).
            a_string = str(int(float(i) / float(len(votes)) * 100)) + '%'
            sys.stdout.write('\b%s%s' % (a_string, '\b' * len(a_string)))

        # Our final point object to feed into training.
        point = {
            'option': v['option'],
            'bill': bills[i],
            'vote_obj': v
        }

        # Generate the input vector for this vote.
        vector, labels = extract_features.generate_feature_vector(point['bill'], preprocess_data)
        point['vector'] = vector

        # If the labels for this vector differ from the previous ones,
        # the feature generation is messed up or out of order.
        if all_labels is None:
            all_labels = labels
        elif labels != all_labels:
            print("Error: Labels differ on data points. Feature vector generation is messed up.")

        # Ignore abstaining votes unless configured to keep them.
        if point['option'] == "+" or point['option'] == "-" or not config.remove_abstaining_votes:
            data_points.append(point)

    # Use all_labels (not the loop variable 'labels') so an empty vote
    # list returns {'labels': None, 'data': []} instead of raising
    # NameError.
    return {'labels': all_labels, 'data': data_points}
def getTrainingPoints(rep_id):
    '''Build the training tuples for one representative.

    Loads the rep's votes from rep_votes_train/<rep_id> and converts each
    vote into a labelled feature vector.

    Args:
        rep_id: Identifier of the representative (also the vote file name).

    Returns:
        Tuple (labels, data_points) where labels is the feature-label list
        shared by every vector (None if there were no votes) and
        data_points is a list of dicts with keys 'option', 'bill',
        'vote_obj' and 'vector'.
    '''
    print('Getting votes and bills...')
    votes = load_json('rep_votes_train/' + rep_id)
    # NOTE(review): getBills is called as getBills(rep_id, votes) here but
    # as getBills(votes) elsewhere in this file -- confirm the actual
    # signature of getBills.
    bills = getBills(rep_id, votes)
    print('done.')

    # Generate information about the bills used for later feature generation.
    # features_to_use is presumably a module-level setting -- TODO confirm.
    preprocess_data = preprocess.preprocess(rep_id, bills, features_to_use)

    data_points = []
    all_labels = None
    print("Generating all feature vectors")

    # Iterate through all the votes and bills and compile the final data set.
    for i, v in enumerate(votes):
        if i % 200 == 0:
            # Periodic progress report.
            print('%i / %i' % (i, len(votes)))

        # Our final point object to feed into training.
        point = {
            'option': v['option'],
            'bill': bills[i],
            'vote_obj': v
        }

        # Generate the input vector for this vote.
        vector, labels = extract_features.generate_feature_vector(point['bill'], preprocess_data, features_to_use)
        point['vector'] = vector

        # If the labels for this vector differ from the previous ones,
        # the feature generation is messed up or out of order.
        if all_labels is None:
            all_labels = labels
        elif labels != all_labels:
            print("Error: Labels differ on data points. Feature vector generation is messed up.")

        data_points.append(point)

    # Return all_labels (not the loop variable 'labels') so an empty vote
    # list yields (None, []) instead of raising NameError.
    return (all_labels, data_points)
# --- Top-level evaluation script (chunk; relies on names defined earlier
# --- in the file: votes_s, votes, reps, bills, json, np, svm, etc.) ---

# Filter: keep only votes whose chamber is NOT 'senate'.
# NOTE(review): confirm this direction is intended -- the surrounding
# variable name votes_s suggests senate votes were the input.
for v in votes_s:
    if v['chamber'] != 'senate':
        votes.append(v)

# Accumulated predictions per vote, keyed by vote id; each rep's model
# adds its prediction into the running total.
results = {}
for rep_id in reps:
    print '--------------------- ', rep_id
    # Generate preprocess on train bills!
    gen_feature_data.getData(rep_id, 'rep_votes_train', preprocess_data=None)
    pre_data = json.loads(open('preprocess_data/'+rep_id).read())
    model = svm.loadSVM('all_no_summary', rep_id)
    print 'Loaded model and data.'
    #print votes_
    for i, v in enumerate(votes):
        # bills is assumed to be aligned index-for-index with votes --
        # TODO confirm (both are built earlier, outside this chunk).
        vector, _ = extract_features.generate_feature_vector(bills[i], pre_data)
        test_data = np.array(vector)
        prediction = model.predict(test_data)
        print prediction
        if v['id'] not in results:
            results[v['id']] = 0
        # Sum this model's prediction into the tally for this vote.
        results[v['id']] += prediction

# Parse the required-majority string (e.g. '2/3') into a fraction.
# NOTE(review): this loop's body appears to continue past the end of
# this chunk; req_percentage is presumably used further down.
for v in votes:
    nums = v['required'].split('/')
    req_percentage = float(nums[0]) / float(nums[1])