def getData(rep_id, data_directory, preprocess_data=None):
    '''
        Builds the training dataset for a single representative.

        Loads the rep's votes from <data_directory>/<rep_id>, fetches the
        matching bills, runs preprocessing (unless a precomputed
        preprocess_data is supplied), and turns every vote/bill pair into a
        feature vector. Abstaining votes are dropped unless
        config.remove_abstaining_votes is False.

        Args:
            rep_id: identifier of the representative; also the filename of
                the vote JSON inside data_directory.
            data_directory: directory holding the per-rep vote files.
            preprocess_data: optional precomputed preprocess output; when
                None it is generated here via preprocess.preprocess().

        Returns:
            { 'labels': feature labels shared by every vector (None when no
              votes were loaded),
              'data': list of point dicts with keys 'option', 'bill',
              'vote_obj', 'vector' }
    '''
    print('Getting votes and bills...')
    votes = load_json(data_directory + '/' + rep_id)  # training votes for this rep
    bills = getBills(votes)
    print('\tdone.')

    # Generate information about the bills used for later feature generation
    if preprocess_data is None:
        preprocess_data = preprocess.preprocess(rep_id, bills)

    data_points = []
    all_labels = None  # labels from the first vector; every later vector must match

    print("Generating all feature vectors...")

    # Iterate through all the votes and bills and compile the final data set
    for i, v in enumerate(votes):
        if i % 20 == 0:
            # Inline percent-complete progress indicator ('\b' erases a char)
            a_string = str(int(float(i) / float(len(votes)) * 100)) + '%'
            sys.stdout.write('\b%s%s' % (a_string, '\b' * len(a_string)))

        # Our final point object to feed into training. woo
        point = {
            'option': v['option'],
            'bill': bills[i],
            'vote_obj': v
        }

        # Generate the input vector for this vote
        vector, labels = extract_features.generate_feature_vector(point['bill'], preprocess_data)
        point['vector'] = vector

        # If the labels for this vector are different from the previous,
        # the feature generation is messed up or out of order
        if all_labels is None:
            all_labels = labels
        elif labels != all_labels:
            print("Error: Labels differ on data points. Feature vector generation is messed up.")

        # Ignore abstaining votes (unless configured to keep them)
        if point['option'] == "+" or point['option'] == "-" or config.remove_abstaining_votes == False:
            data_points.append(point)

    # BUG FIX: previously returned the loop-local 'labels', which is undefined
    # when 'votes' is empty; return the validated all_labels instead.
    return {'labels': all_labels, 'data': data_points}
def getTrainingPoints(rep_id):
    '''
        Builds the training points for a single representative from the
        'rep_votes_train' vote files.

        Loads the rep's votes, fetches the matching bills, runs preprocessing
        (using the module-level features_to_use), and generates one feature
        vector per vote. Unlike getData, abstaining votes are kept.

        Args:
            rep_id: identifier of the representative; also the filename of
                the vote JSON inside rep_votes_train/.

        Returns:
            (labels, data_points) where labels are the feature labels shared
            by every vector (None when no votes were loaded) and data_points
            is a list of point dicts with keys 'option', 'bill', 'vote_obj',
            'vector'.
    '''
    print('Getting votes and bills...')
    votes = load_json('rep_votes_train/' + rep_id)
    bills = getBills(rep_id, votes)
    print('done.')

    # Generate information about the bills used for later feature generation
    preprocess_data = preprocess.preprocess(rep_id, bills, features_to_use)
    data_points = []
    all_labels = None  # labels from the first vector; every later vector must match

    print("Generating all feature vectors")
    # Iterate through all the votes and bills and compile the final data set
    for i, v in enumerate(votes):
        if i % 200 == 0:
            print('%i / %i' % (i, len(votes)))

        # Our final point object to feed into training. woo
        point = {
            'option': v['option'],
            'bill': bills[i],
            'vote_obj': v
        }

        # Generate the input vector for this vote
        vector, labels = extract_features.generate_feature_vector(point['bill'], preprocess_data, features_to_use)
        point['vector'] = vector

        # If the labels for this vector are different from the previous,
        # the feature generation is messed up or out of order
        if all_labels is None:
            all_labels = labels
        elif labels != all_labels:
            print("Error: Labels differ on data points. Feature vector generation is messed up.")

        data_points.append(point)

    # BUG FIX: previously returned the loop-local 'labels', which is undefined
    # when 'votes' is empty; return the validated all_labels instead.
    return (all_labels, data_points)
# Collect every non-senate vote from votes_s into the working votes list.
for ballot in votes_s:
  if ballot['chamber'] == 'senate':
    continue
  votes.append(ballot)

# Running prediction totals, keyed by vote id.
results = {}

# For each representative: regenerate training data, load the cached
# preprocess output and the trained SVM, then score every vote in `votes`
# and tally the predictions per vote id in `results`.
for rep_id in reps:
  print  '---------------------  ', rep_id
  # Generate preprocess on train bills!
  # NOTE(review): return value is discarded -- presumably called for the side
  # effect of writing preprocess_data/<rep_id>; confirm against gen_feature_data.
  gen_feature_data.getData(rep_id, 'rep_votes_train', preprocess_data=None)
  pre_data = json.loads(open('preprocess_data/'+rep_id).read())
  model = svm.loadSVM('all_no_summary', rep_id)
  print 'Loaded model and data.'
  #print votes_
  for i, v in enumerate(votes):
    # assumes bills lines up index-for-index with votes -- TODO confirm
    vector, _ = extract_features.generate_feature_vector(bills[i], pre_data)
    test_data = np.array(vector)

    prediction = model.predict(test_data)
    print prediction

    # Accumulate this rep's prediction into the running total for the vote id
    if v['id'] not in results:
      results[v['id']] = 0
    results[v['id']] += prediction

    
    
for v in votes:
  nums = v['required'].split('/')
  req_percentage = float(nums[0]) / float(nums[1])