def do_feature_set_analysis(train_instances, test_instances, folds, clf, param_grid, dense, outfile):
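    """Train a model on the full shared feature set and then on each feature
    group individually, printing ROC/classification reports and plotting one
    ROC curve per configuration to `outfile`."""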

    groups = set(train_instances[0].feature_groups.keys()).intersection(test_instances[0].feature_groups.keys())

    print groups
    X_train, y_train, feature_space = pipe.instances_to_matrix(train_instances, dense = dense, groups = groups)
    X_test, y_test = test_instances_to_matrix(feature_space, test_instances, dense = dense)
    model = get_optimal_model(X_train, y_train, folds, clf, param_grid, 'roc_auc')
    y_pred = model.predict(X_test)
    scores =  get_scores(model, X_test) 
    print("Test ROC: %f" % roc_auc_score( y_test, scores))
    print(classification_report(y_test, y_pred))

    fpr, tpr, thresholds = roc_curve(y_test, scores)
    np.set_printoptions(threshold=np.inf)

    for i in range(1, len(fpr), 100):
        print "Theshold: %0.4f  FPR: %0.4f   TPR: %0.4f" % (thresholds[i], fpr[i], tpr[i])

    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=1, label='ALL  (area = %0.4f)' % (roc_auc))


    all_tpr = []

    for g in groups:
        print g

        X_train, y_train, feature_space = pipe.instances_to_matrix(train_instances, groups = [g,], dense = dense)
        X_test, y_test = test_instances_to_matrix(feature_space, test_instances, dense = dense)

        model = get_optimal_model(X_train, y_train,  folds, clf, param_grid, 'roc_auc')
        y_pred = model.predict(X_test)
        scores = get_scores(model, X_test)
        print("Test ROC: %f" % roc_auc_score( y_test, scores))
        print(classification_report(y_test, y_pred))
        fpr, tpr, thresholds = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='%s  (area = %0.4f)' % (g.split("_")[0], roc_auc))
        print "\n"*4

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Feature Set Analysis')
    plt.legend(loc="lower right", prop={'size':12})
    plt.savefig(outfile)
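
# `get_scores` and `test_instances_to_matrix` are called throughout this listing
# but are not defined here. The sketches below are assumptions about what such
# helpers could look like, not the original implementations.
import numpy as np
from scipy.sparse import lil_matrix


def get_scores(model, X):
    # Prefer a margin-based score; otherwise fall back to the positive-class
    # probability (binary classification is assumed, as in the ROC code above).
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return model.predict_proba(X)[:, 1]


def test_instances_to_matrix(feature_space, instances, dense=False):
    # Project instances onto a fixed feature space (feature name -> column index,
    # matching how `feature_space` is inverted elsewhere in this listing) so the
    # test matrix shares its columns with the training matrix. Assumes each
    # instance exposes `feature_groups` ({group: {feature: value}}) and a `label`.
    X = lil_matrix((len(instances), len(feature_space)))
    y = np.array([instance.label for instance in instances])
    for row, instance in enumerate(instances):
        for features in instance.feature_groups.values():
            for name, value in features.items():
                col = feature_space.get(name)
                if col is not None:
                    X[row, col] = value
    X = X.tocsr()
    return (X.toarray(), y) if dense else (X, y)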
def do_feature_selection(train_instances, test_instances, folds, clf,
                         param_grid, dense, outfile):
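    """Rank features by chi-squared p-value, then train a model on increasing
    percentiles of the top-ranked features and plot one ROC curve per
    percentile to `outfile`."""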
    groups = set(train_instances[0].feature_groups.keys()).intersection(
        test_instances[0].feature_groups.keys())

    X_train, y_train, feature_space = pipe.instances_to_matrix(train_instances,
                                                               dense=dense,
                                                               groups=groups)
    X_test, y_test = test_instances_to_matrix(feature_space,
                                              test_instances,
                                              dense=dense)

    all_tpr = []

    (chi2values, pval) = chi2(X_train, y_train)
    feature_indices = [
        i[0] for i in sorted(enumerate(pval), key=lambda x: x[1])
    ]
    index_to_name = {v: k for k, v in feature_space.items()}
    feature_names = [index_to_name[i] for i in feature_indices]

    print feature_indices[0:200]
    print feature_names[0:200]

    for percentile in range(1, 10, 2):
        t0 = time()
        ch2 = SelectPercentile(chi2, percentile=percentile)
        X_train_trans = ch2.fit_transform(X_train, y_train)
        print("done in %fs" % (time() - t0))

        model = get_optimal_model(X_train_trans, y_train, folds, clf,
                                  param_grid, 'roc_auc')

        X_test_trans = ch2.transform(X_test)

        scores = get_scores(model, X_test_trans)
        fpr, tpr, thresholds = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 label='%d  (area = %0.4f)' % (percentile, roc_auc))
        print "\n" * 4

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.savefig(outfile)

    print()
    def label_row(self, row, column_indices, table_offset, congress, chamber,
                  document_type, number, sponsor_indices):
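        """Build an instance from a table row, score it with the trained model,
        resolve its state and sponsors, and insert the result into the
        candidate_earmarks and sponsors tables."""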

        instance = self.get_instance_from_row(row, column_indices)
        X, y, space = pipe.instances_to_matrix(
            [instance], feature_space=self.feature_space, dense=False)
        scores = self.model.decision_function(X)
        fields = [
            'congress', 'chamber', 'document_type', 'number', 'row',
            'row_offset', 'row_length', 'score', 'state', 'sponsors'
        ]
        cmd = "insert into candidate_earmarks (" + ", ".join(
            fields
        ) + ") values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) returning id"
        attributes = instance.attributes
        state = self.geo_coder.get_state(attributes['entity_text'])
        cur = self.conn.cursor()
        if sponsor_indices:
            print sponsor_indices

        sponsors = []
        for index in sponsor_indices:
            try:
                sponsor_cell = attributes['entity_text'].split("|")[index]
                sponsors_in_cell = string_functions.tokenize(
                    string_functions.normalize_no_lower(sponsor_cell))
                for sic in sponsors_in_cell:
                    if sic in self.sponsor_coder.sponsors[congress]:
                        sponsors.append(sic)

            except Exception as e:
                print "Index: %d" % index
                print len(attributes['entity_text'].split("|"))
                print attributes['entity_text']
                logging.exception("SCREW UP")

        sponsors_string = "|".join(sponsors)[:1024]

        cur.execute(cmd, (congress, chamber, document_type, number,
                          attributes['entity_text'], row.offset + table_offset,
                          row.length, scores[0], state, sponsors_string))
        curr_id = cur.fetchone()[0]

        for sponsor in sponsors:
            cur.execute(
                'insert into sponsors (candidate_earmark_id, sponsor) values (%s, %s)',
                (curr_id, sponsor))

        self.conn.commit()
def main():
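    """Load a pickled earmark model (--model) or train one from pickled
    instances (--data), then label report and bill folders in parallel,
    writing candidate earmarks to the database."""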

    parser = argparse.ArgumentParser(description='Match entities to OMB')
    parser.add_argument('--model', required=False, help='path to pickled matching model')
    parser.add_argument('--data', required=False, help='path to pickled instances')

    args = parser.parse_args()

    bills2008 = os.path.join(util.configuration.get_path_to_bills(), "110/bills/hr/hr2764/text-versions/")
    bills2009 = os.path.join(util.configuration.get_path_to_bills(), "111/bills/hr/hr1105/text-versions/")

    years = [ "111", "110","109", "108", "107", "106", "105", "104"]

    reports_base=util.configuration.get_path_to_reports()
    folders = [os.path.join(reports_base, year) for year in years] + [bills2008, bills2009]


    conn = psycopg2.connect(CONN_STRING)

    if args.model:
        feature_space = pickle.load(open(args.model+".feature_space", "rb"))
        model = joblib.load(args.model)
        logging.info("Loaded Model")

    elif args.data:
        keep_group = ['unigram_feature_generator', 'simple_entity_text_feature_generator', 'geo_feature_generator', 'sponsor_feature_generator']
        instances = prepare_earmark_data.load_instances(args.data)
        ignore_groups = [ fg for fg in instances[0].feature_groups.keys() if fg not in keep_group]
        X, y, feature_space = pipe.instances_to_matrix(instances, ignore_groups = ignore_groups,  dense = False)
        clf = svm.LinearSVC(C = 0.01)
        param_grid = {'C': [ 0.01, 0.1]}
        model = diagnostics.get_optimal_model (X, y, 5, clf, param_grid, 'roc_auc')
    else:
        exit()

    geo_coder = GeoCoder()
    sponsor_coder = SponsorCoder()


    earmark_detector = EarmarkDetector(geo_coder, sponsor_coder, conn, model, feature_space)


    p = mp.Pool(mp.cpu_count())
    p.map(label_all, [(folder, earmark_detector) for folder in folders])


    #for folder in folders:
    #   label_all((folder, earmark_detector));

    conn.close()
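
# Hypothetical invocations of the entry point above (script and file names are
# placeholders, not taken from the original project):
#   python label_earmarks.py --model models/earmark_model.pkl
#   python label_earmarks.py --data earmark_instances.pkl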
def do_feature_selection(train_instances, test_instances, folds, clf, param_grid, dense, outfile):
    groups = set(train_instances[0].feature_groups.keys()).intersection(test_instances[0].feature_groups.keys())

    X_train, y_train, feature_space = pipe.instances_to_matrix(train_instances, dense = dense, groups = groups)
    X_test, y_test = test_instances_to_matrix(feature_space, test_instances, dense = dense)

    all_tpr = []

    (chi2values, pval) =  chi2(X_train, y_train)
    feature_indices = [i[0] for i in sorted(enumerate(pval), key=lambda x:x[1])]
    index_to_name = {v:k for k, v in feature_space.items()}
    feature_names = [index_to_name[i] for i in feature_indices]

    print feature_indices[0:200]
    print feature_names[0:200]

    for percentile in range(1, 10, 2):
            t0 = time()
            ch2 = SelectPercentile(chi2, percentile=percentile)
            X_train_trans = ch2.fit_transform(X_train, y_train)
            print("done in %fs" % (time() - t0))

            model = get_optimal_model (X_train_trans, y_train, folds, clf, param_grid, 'roc_auc')

            X_test_trans = ch2.transform(X_test)

            scores = get_scores(model, X_test_trans)
            fpr, tpr, thresholds = roc_curve(y_test, scores)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=1, label='%d  (area = %0.4f)' % (percentile, roc_auc))
            print "\n"*4


    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.savefig(outfile)
    
    print()
    def label_row(self, row, column_indices, table_offset, congress, chamber, document_type, number, sponsor_indices):

        instance = self.get_instance_from_row(row, column_indices)
        X, y, space = pipe.instances_to_matrix([instance,], feature_space = self.feature_space, dense = False)
        scores = self.model.decision_function(X)
        fields = ['congress', 'chamber','document_type','number', 'row', 'row_offset', 'row_length', 'score', 'state', 'sponsors']
        cmd = "insert into candidate_earmarks (" + ", ".join(fields) + ") values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) returning id"
        attributes = instance.attributes
        state = self.geo_coder.get_state(attributes['entity_text'])
        cur = self.conn.cursor()
        if sponsor_indices:
            print sponsor_indices

        sponsors = []
        for index in sponsor_indices:
            try:
                sponsor_cell = attributes['entity_text'].split("|")[index]
                sponsors_in_cell = string_functions.tokenize(string_functions.normalize_no_lower(sponsor_cell))
                for sic in sponsors_in_cell:
                    if sic in self.sponsor_coder.sponsors[congress]:
                        sponsors.append(sic)

            except Exception as e:
                print "Index: %d" % index
                print len(attributes['entity_text'].split("|"))
                print attributes['entity_text']
                logging.exception("SCREW UP")

        sponsors_string = "|".join(sponsors)[:1024]

        cur.execute(cmd, (congress, chamber, document_type, number, attributes['entity_text'], row.offset+table_offset, row.length, scores[0], state, sponsors_string))
        curr_id = cur.fetchone()[0]

        for sponsor in sponsors:
            cur.execute('insert into sponsors (candidate_earmark_id, sponsor) values (%s, %s)', (curr_id,sponsor ))


        self.conn.commit()
def main():
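        """Parse command-line options, configure a LinearSVC or RandomForest with
        its parameter grid, and dispatch to the 'save', 'error', 'grid', or
        'features' subcommand."""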
        parser = argparse.ArgumentParser(description='build classifier')

        parser.add_argument('--train', required=True, help='path to pickled training instances')
        parser.add_argument('--test', required=False, help='path to pickled test instances')
        parser.add_argument('--folds',  required=False, type = int, default = 5, help='number of folds for cv')
        parser.add_argument('--alg', required = True, help = "'rf' for RandomForest, 'svm' for LinearSVC")
        subparsers = parser.add_subparsers(dest = "subparser_name", help='sub-command help')

        parser_grid = subparsers.add_parser('grid', help='tune hyper-parameters')
        parser_grid.add_argument('--scoring', required = True)

        parser_save = subparsers.add_parser('save', help='train and save a model')
        parser_save.add_argument('--scoring', required = True)
        parser_save.add_argument('--outfile', required = True)

        parser_error = subparsers.add_parser('error', help='do error analysis')
        parser_features = subparsers.add_parser('features', help='do feature analysis')
        parser_features.add_argument('--outfile', required = True)

        parser_error = subparsers.add_parser('relabel', help='do error analysis')

        args = parser.parse_args() 

        print "Doing %s" % args.subparser_name
        print "Train: %s" %args.train
        print "Test: %s" % args.test

        if args.alg == 'svm':
            clf = svm.LinearSVC(C = 0.01)
            #param_grid = {'C': [ 0.001, 0.01, 0.1, 0.5, 1, 4, 10, 100]}
            param_grid = {'C': [ 0.01, 0.1]}

            dense = False

        else:
            clf = RandomForestClassifier(n_estimators=10,max_depth=None, random_state = 0,max_features = 'log2', n_jobs = -1)
            param_grid = {'n_estimators' : [10, 30, 50, 100, 300, 500], 'max_features' : ['log2', 'sqrt'] }
            dense = True

        if args.subparser_name == "save":
            

            groups = ['unigram_feature_generator', 'simple_entity_text_feature_generator', 'geo_feature_generator', 'sponsor_feature_generator']

            instances = prepare_earmark_data.load_instances(args.train)
            print instances[0].feature_groups.keys()
            print instances[-1].feature_groups.keys()

            X, y, feature_space = pipe.instances_to_matrix(instances, dense = dense, groups = groups)
            save_model(X, y, feature_space, args.folds, clf, param_grid, args.scoring, args.outfile)

        elif args.subparser_name =="error":

            #this does error analysis on training data only!
            instances = prepare_earmark_data.load_instances(args.train)
            X, y, feature_space = pipe.instances_to_matrix(instances, dense = dense)
            error_analysis_for_labeling(instances, X, y, args.folds, clf, param_grid, args.train)

        elif args.subparser_name =="grid":


            if args.test:
                train_instances = prepare_earmark_data.load_instances(args.train)
                test_instances = prepare_earmark_data.load_instances(args.test)


                groups = set(train_instances[0].feature_groups.keys()).intersection(test_instances[0].feature_groups.keys())

                X_train, y_train, train_feature_space = pipe.instances_to_matrix(train_instances, dense = dense, groups = groups)
                X_test, y_test, test_feature_space = pipe.instances_to_matrix(test_instances, dense = dense, groups = groups)


                keys_train = set(train_feature_space.keys())
                keys_test = set(test_feature_space.keys())
                intersection = list(keys_train & keys_test)
                feature_space = {intersection[i]:i for i in range(len(intersection))}
                X_train, y_train = test_instances_to_matrix(feature_space, train_instances, dense = dense)
                X_test, y_test = test_instances_to_matrix(feature_space, test_instances, dense = dense)



        
                

            else:
                instances = prepare_earmark_data.load_instances(args.train)
                X_train, y_train, feature_space = pipe.instances_to_matrix(instances, dense = dense)
                X_test = None
                y_test = None
                
            do_grid_search(X_train, y_train, args.folds, clf, param_grid, args.scoring, X_test, y_test)
            

        elif args.subparser_name == "features":

            if args.test:
                train_instances = prepare_earmark_data.load_instances(args.train)
                test_instances = prepare_earmark_data.load_instances(args.test)
                
            else:
                # this is just for exposition, would really want to cv over this
                instances = prepare_earmark_data.load_instances(args.train)
                X, y, feature_space = pipe.instances_to_matrix(instances, dense = dense)
                train, test =  split_data_stratified(X, y, test_size = 0.33)
                train_instances = [instances[i] for i in train]
                test_instances = [instances[i] for i in test]
                
            #do_feature_selection(train_instances, test_instances, args.folds, clf, param_grid, dense, args.outfile)

            do_feature_set_analysis(train_instances, test_instances, args.folds, clf, param_grid, dense, args.outfile)
def main():
    parser = argparse.ArgumentParser(description='build classifier')

    parser.add_argument('--train',
                        required=True,
                        help='path to pickled training instances')
    parser.add_argument('--test',
                        required=False,
                        help='path to pickled test instances')
    parser.add_argument('--folds',
                        required=False,
                        type=int,
                        default=5,
                        help='number of folds for cv')
    parser.add_argument('--alg',
                        required=True,
                        help="'rf' for RandomForest, 'svm' for LinearSVC")
    subparsers = parser.add_subparsers(dest="subparser_name",
                                       help='sub-command help')

    parser_grid = subparsers.add_parser('grid', help='tune hyper-parameters')
    parser_grid.add_argument('--scoring', required=True)

    parser_save = subparsers.add_parser('save', help='train and save a model')
    parser_save.add_argument('--scoring', required=True)
    parser_save.add_argument('--outfile', required=True)

    parser_error = subparsers.add_parser('error', help='do error analysis')
    parser_features = subparsers.add_parser('features',
                                            help='do feature analysis')
    parser_features.add_argument('--outfile', required=True)

    parser_error = subparsers.add_parser('relabel', help='do error analysis')

    args = parser.parse_args()

    print "Doing %s" % args.subparser_name
    print "Train: %s" % args.train
    print "Test: %s" % args.test

    if args.alg == 'svm':
        clf = svm.LinearSVC(C=0.01)
        #param_grid = {'C': [ 0.001, 0.01, 0.1, 0.5, 1, 4, 10, 100]}
        param_grid = {'C': [0.01, 0.1]}

        dense = False

    else:
        clf = RandomForestClassifier(n_estimators=10,
                                     max_depth=None,
                                     random_state=0,
                                     max_features='log2',
                                     n_jobs=-1)
        param_grid = {
            'n_estimators': [10, 30, 50, 100, 300, 500],
            'max_features': ['log2', 'sqrt']
        }
        dense = True

    if args.subparser_name == "save":

        groups = [
            'unigram_feature_generator',
            'simple_entity_text_feature_generator', 'geo_feature_generator',
            'sponsor_feature_generator'
        ]

        instances = prepare_earmark_data.load_instances(args.train)
        print instances[0].feature_groups.keys()
        print instances[-1].feature_groups.keys()

        X, y, feature_space = pipe.instances_to_matrix(instances,
                                                       dense=dense,
                                                       groups=groups)
        save_model(X, y, feature_space, args.folds, clf, param_grid,
                   args.scoring, args.outfile)

    elif args.subparser_name == "error":

        #this does error analysis on training data only!
        instances = prepare_earmark_data.load_instances(args.train)
        X, y, feature_space = pipe.instances_to_matrix(instances, dense=dense)
        error_analysis_for_labeling(instances, X, y, args.folds, clf,
                                    param_grid, args.train)

    elif args.subparser_name == "grid":

        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)

            groups = set(
                train_instances[0].feature_groups.keys()).intersection(
                    test_instances[0].feature_groups.keys())

            X_train, y_train, train_feature_space = pipe.instances_to_matrix(
                train_instances, dense=dense, groups=groups)
            X_test, y_test, test_feature_space = pipe.instances_to_matrix(
                test_instances, dense=dense, groups=groups)

            keys_train = set(train_feature_space.keys())
            keys_test = set(test_feature_space.keys())
            intersection = list(keys_train & keys_test)
            feature_space = {
                intersection[i]: i
                for i in range(len(intersection))
            }
            X_train, y_train = test_instances_to_matrix(feature_space,
                                                        train_instances,
                                                        dense=dense)
            X_test, y_test = test_instances_to_matrix(feature_space,
                                                      test_instances,
                                                      dense=dense)

        else:
            instances = prepare_earmark_data.load_instances(args.train)
            X_train, y_train, feature_space = pipe.instances_to_matrix(
                instances, dense=dense)
            X_test = None
            y_test = None

        do_grid_search(X_train, y_train, args.folds, clf, param_grid,
                       args.scoring, X_test, y_test)

    elif args.subparser_name == "features":

        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)

        else:
            # this is just for exposition, would really want to cv over this
            instances = prepare_earmark_data.load_instances(args.train)
            X, y, feature_space = pipe.instances_to_matrix(instances,
                                                           dense=dense)
            train, test = split_data_stratified(X, y, test_size=0.33)
            train_instances = [instances[i] for i in train]
            test_instances = [instances[i] for i in test]

        #do_feature_selection(train_instances, test_instances, args.folds, clf, param_grid, dense, args.outfile)

        do_feature_set_analysis(train_instances, test_instances, args.folds,
                                clf, param_grid, dense, args.outfile)
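
# Hypothetical invocations of the entry point above (script and file names are
# placeholders, not taken from the original project):
#   python build_classifier.py --train train.pkl --alg svm grid --scoring roc_auc
#   python build_classifier.py --train train.pkl --test test.pkl --alg rf \
#       features --outfile feature_sets.png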
def do_feature_set_analysis(train_instances, test_instances, folds, clf,
                            param_grid, dense, outfile):

    groups = set(train_instances[0].feature_groups.keys()).intersection(
        test_instances[0].feature_groups.keys())

    print groups
    X_train, y_train, feature_space = pipe.instances_to_matrix(train_instances,
                                                               dense=dense,
                                                               groups=groups)
    X_test, y_test = test_instances_to_matrix(feature_space,
                                              test_instances,
                                              dense=dense)
    model = get_optimal_model(X_train, y_train, folds, clf, param_grid,
                              'roc_auc')
    y_pred = model.predict(X_test)
    scores = get_scores(model, X_test)
    print("Test ROC: %f" % roc_auc_score(y_test, scores))
    print(classification_report(y_test, y_pred))

    fpr, tpr, thresholds = roc_curve(y_test, scores)
    np.set_printoptions(threshold=np.inf)

    for i in range(1, len(fpr), 100):
        print "Theshold: %0.4f  FPR: %0.4f   TPR: %0.4f" % (thresholds[i],
                                                            fpr[i], tpr[i])

    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=1, label='ALL  (area = %0.4f)' % (roc_auc))

    all_tpr = []

    for g in groups:
        print g

        X_train, y_train, feature_space = pipe.instances_to_matrix(
            train_instances, groups=[g], dense=dense)
        X_test, y_test = test_instances_to_matrix(feature_space,
                                                  test_instances,
                                                  dense=dense)

        model = get_optimal_model(X_train, y_train, folds, clf, param_grid,
                                  'roc_auc')
        y_pred = model.predict(X_test)
        scores = get_scores(model, X_test)
        print("Test ROC: %f" % roc_auc_score(y_test, scores))
        print(classification_report(y_test, y_pred))
        fpr, tpr, thresholds = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 label='%s  (area = %0.4f)' % (g.split("_")[0], roc_auc))
        print "\n" * 4

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Feature Set Analysis')
    plt.legend(loc="lower right", prop={'size': 12})
    plt.savefig(outfile)
def main():

    parser = argparse.ArgumentParser(description='Match entities to OMB')
    parser.add_argument('--model',
                        required=False,
                        help='path to pickled matching model')
    parser.add_argument('--data',
                        required=False,
                        help='path to pickled instances')

    args = parser.parse_args()

    bills2008 = os.path.join(util.configuration.get_path_to_bills(),
                             "110/bills/hr/hr2764/text-versions/")
    bills2009 = os.path.join(util.configuration.get_path_to_bills(),
                             "111/bills/hr/hr1105/text-versions/")

    years = ["111", "110", "109", "108", "107", "106", "105", "104"]

    reports_base = util.configuration.get_path_to_reports()
    folders = [os.path.join(reports_base, year)
               for year in years] + [bills2008, bills2009]

    conn = psycopg2.connect(CONN_STRING)

    if args.model:
        feature_space = pickle.load(open(args.model + ".feature_space", "rb"))
        model = joblib.load(args.model)
        logging.info("Loaded Model")

    elif args.data:
        keep_group = [
            'unigram_feature_generator',
            'simple_entity_text_feature_generator', 'geo_feature_generator',
            'sponsor_feature_generator'
        ]
        instances = prepare_earmark_data.load_instances(args.data)
        ignore_groups = [
            fg for fg in instances[0].feature_groups.keys()
            if fg not in keep_group
        ]
        X, y, feature_space = pipe.instances_to_matrix(
            instances, ignore_groups=ignore_groups, dense=False)
        clf = svm.LinearSVC(C=0.01)
        param_grid = {'C': [0.01, 0.1]}
        model = diagnostics.get_optimal_model(X, y, 5, clf, param_grid,
                                              'roc_auc')
    else:
        exit()

    geo_coder = GeoCoder()
    sponsor_coder = SponsorCoder()

    earmark_detector = EarmarkDetector(geo_coder, sponsor_coder, conn, model,
                                       feature_space)

    p = mp.Pool(mp.cpu_count())
    p.map(label_all, [(folder, earmark_detector) for folder in folders])

    #for folder in folders:
    #   label_all((folder, earmark_detector));

    conn.close()