Exemple #1
0
def simple_model_test(data_filter=DocFilter):
    """Cross-validate one independent linear classifier per core domain.

    For every domain in ``dat.CORE_DOMAINS`` a grid-searched
    ``SGDClassifier`` (hinge loss, alpha tuned on F1) is fitted and scored
    over 5 folds.  The folds are drawn over the ids exposed by
    ``MultiTaskDocFilter`` so that the same ids are used as in the multitask
    model; each fold is then intersected with the ids available for the
    current domain.

    A trivial baseline that predicts ``1`` for every test document is
    recorded alongside the real predictions.

    Results are written to ``simple_acc.csv`` (model) and
    ``stupid_output.csv`` (baseline) in the current working directory.

    Args:
        data_filter: callable invoked as ``data_filter(dat, domain=domain)``.
            The returned object must provide ``available_ids`` and
            ``Xy(ids)``.
    """
    dat = RoBData(test_mode=False)
    dat.generate_data(doc_level_only=True)

    metrics = BinaryMetricsRecorder(domains=dat.CORE_DOMAINS)
    # Baseline recorder: scores the "always predict 1" strategy.
    stupid_metrics = BinaryMetricsRecorder(domains=dat.CORE_DOMAINS)

    # Folds are indices into the multitask id array (use the same ids as the
    # multitask model), not into the per-domain id arrays.
    multitask_docs = MultiTaskDocFilter(dat)
    multitask_uids = np.array(multitask_docs.available_ids)
    kf = KFold(len(multitask_uids), n_folds=5, shuffle=False)

    for domain in dat.CORE_DOMAINS:
        docs = data_filter(dat, domain=domain)
        uids = np.array(docs.available_ids)
        # Single pre-formatted argument: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%d docs obtained for domain: %s" % (len(uids), domain))

        tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='f1')

        for train, test in kf:
            # Keep only the ids of this fold that exist for the domain.
            train_ids = np.intersect1d(uids, multitask_uids[train])
            test_ids = np.intersect1d(uids, multitask_uids[test])

            X_train_d, y_train = docs.Xy(train_ids)
            X_test_d, y_test = docs.Xy(test_ids)

            # A fresh vectorizer per fold, fitted on the training split only.
            vec = InteractionHashingVectorizer(norm=None, non_negative=True, binary=True)
            X_train = vec.fit_transform(X_train_d, low=2)
            X_test = vec.transform(X_test_d)

            clf.fit(X_train, y_train)
            y_preds = clf.predict(X_test)

            metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    metrics.save_csv('simple_acc.csv')
    stupid_metrics.save_csv('stupid_output.csv')
Exemple #2
0
def multitask_test(fold=None, n_folds_total=5, pickle_metrics=False,
                                metrics_out_dir=None):
    """Run the multitask experiment.

    A single grid-searched ``SGDClassifier`` is trained per fold on the
    documents exposed by ``MultiTaskDocFilter``, using base features plus one
    set of interaction features per core domain.  It is then evaluated
    separately for each domain on the held-out ids that are also available
    through ``DocFilter`` for that domain.

    Args:
        fold: index of the single fold to run.  If ``None``, all folds are
            run.
        n_folds_total: number of folds the ids are split into.
        pickle_metrics: if true, the metrics recorder is pickled after each
            domain has been evaluated.
        metrics_out_dir: directory for the pickle and the per-fold CSV.
            Defaults to the current directory when ``None``.

    Output files:
        * ``fold is None``: ``multitask_acc.csv`` in the current working
          directory; the pickle (if requested) is ``metrics.pickle`` in the
          output directory.
        * otherwise: ``multitask.csv`` and ``metrics_<fold>.pickle`` in the
          output directory.
    """
    # Resolve output locations up front so the pickle path is always bound,
    # whether or not a single fold was requested.
    out_dir = os.curdir if metrics_out_dir is None else metrics_out_dir
    if fold is None:
        pickle_name = "metrics.pickle"
    else:
        pickle_name = "metrics_%s.pickle" % fold
    metrics_out_path = os.path.join(out_dir, pickle_name)

    logging.info('loading data into memory')
    dat = RoBData(test_mode=False)
    dat.generate_data(doc_level_only=True)

    logging.info('loading metric recorder')
    metrics = BinaryMetricsRecorder(domains=dat.CORE_DOMAINS)

    logging.info('generating training documents')
    train_docs = MultiTaskDocFilter(dat)
    logging.info('generating training ids')
    train_uids = np.array(train_docs.available_ids)

    logging.info('setting model parameters')
    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='f1')

    no_studies = len(train_uids)
    logging.info('calculating folds')
    kf = KFold(no_studies, n_folds=n_folds_total, shuffle=False)
    if fold is not None:
        # Restrict the run to the requested fold only.
        kf = [list(kf)[fold]]

    for train, test in kf:
        logging.info('new fold starting!')

        X_train_d, y_train, i_train = train_docs.Xyi(train_uids[train])

        # Despite the log text, this builds the per-domain masks over the
        # *training* documents: interactions[d][k] is True when training
        # document k belongs to domain d.
        logging.info('building up test data')
        interactions = {
            domain: [domain == doc_domain for doc_domain in i_train]
            for domain in dat.CORE_DOMAINS
        }

        logging.info('adding test data to vectorizer')
        vec = ModularCountVectorizer()
        vec.builder_clear()

        logging.info('adding base features')
        vec.builder_add_docs(X_train_d, low=10) # add base features

        for domain in dat.CORE_DOMAINS:
            logging.info('adding interactions for domain %s', domain)
            # Single pre-formatted argument: prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("%s / %s added for %s" % (
                np.sum(interactions[domain]), len(interactions[domain]), domain))
            vec.builder_add_interaction_features(X_train_d, interactions=interactions[domain], prefix=domain+"-i-", low=2) # then add interactions

        logging.info('fitting vectorizer')
        X_train = vec.builder_fit_transform()

        logging.info('fitting model')
        clf.fit(X_train, y_train)

        for domain in dat.CORE_DOMAINS:
            test_docs = DocFilter(dat, domain=domain) # test on regular doc model
            domain_uids = np.array(test_docs.available_ids)

            # Held-out ids of this fold that exist for the domain.
            test_uids = np.intersect1d(train_uids[test], domain_uids)
            X_test_d, y_test = test_docs.Xy(test_uids)

            # Build up the test vector: every test document contributes its
            # base features and the interaction features of this domain.
            vec.builder_clear()
            vec.builder_add_docs(X_test_d) # add base features
            vec.builder_add_docs(X_test_d, prefix=domain+'-i-') # add interactions
            X_test = vec.builder_transform()

            y_preds = clf.predict(X_test)
            metrics.add_preds_test(y_preds, y_test, domain=domain)

            if pickle_metrics:
                # Rewritten after every domain, so the file always holds the
                # metrics recorded so far.
                with open(metrics_out_path, 'wb') as out_f:
                    pickle.dump(metrics, out_f)

    if fold is None:
        metrics.save_csv('multitask_acc.csv')
    else:
        # Write next to the pickle, not underneath it: the pickle path is a
        # file and cannot serve as a parent directory.
        # NOTE(review): the name is not fold-specific, so runs of different
        # folds sharing one output directory overwrite each other's CSV.
        metrics.save_csv(os.path.join(out_dir, 'multitask.csv'))