Python RoBData Examples

Programming Language: Python

Namespace/Package Name: cochranenlp.experiments.riskofbias2

Method/Function: RoBData

Examples at hotexamples.com: 7

Python RoBData - 7 examples found. These are the top-rated real-world Python examples of cochranenlp.experiments.riskofbias2.RoBData extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def main():

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    for train, test in kf:

        y_train = docs.y(uids[train])

        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]),
                                             low=7)  # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]),
                                             low=2)  # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='recall')

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_  # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            vec.builder_clear()
            vec.builder_add_interaction_features(
                docs.X(uids[test], domain=domain))  # add base features
            vec.builder_add_interaction_features(
                docs.X_i(uids[test], domain=domain))  # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test),
                                          y_test,
                                          domain=domain)

            del X_test, y_test, y_preds

        del clf

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))

Example #2

Show file

File: mt_mt_skipd.py Project: brucexia6116/cochrane-nlp-experiments

def main(out_dir="results"):
    """Run the two-stage multi-task experiment on the ``skip_domains`` set.

    Stage one fits a sentence classifier (grid search scored on recall) on
    the training ids.  Stage two fits a document classifier (grid search
    scored on F1) whose features combine the document text, domain
    interaction terms, and the sentences the stage-one model predicted as
    positive.  The document model is then tested per domain on ids that
    have a second assessment, alongside a baseline that always predicts 1
    and the labels taken from the second assessment.

    Args:
        out_dir: directory the three metrics CSV files are written to.
    """
    model_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ########################
    # sentence prediction  #
    ########################

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)
    # NOTE(review): `uids` and `no_studies` are assigned here but not read
    # again in this function.
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()
    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_train,
                                                          domain=skip_domains),
                                              low=7)

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing
    # every term with every domain, and the vectorizer (an instance of
    # ModularVectorizer) deals with inserting the actual interaction tokens
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_train, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # setup sentence classifier; the grid covers both the regularisation
    # strength and the weight given to the positive class
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{
            1: i,
            -1: 1
        } for i in np.logspace(0, 2, 5)]
    }
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                            tuned_parameters,
                            scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_train, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performing
    sent_clf = sent_clf.best_estimator_

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.

    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)
    docs = riskofbias.MultiTaskDocFilter(data)
    # iterated below as (doc_text, doc_domain) pairs
    X_train_d = docs.Xyi(uids_train, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='f1')

    # bcw: note that I've amended the y method to
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_train, domain=skip_domains)

    # add interaction features (here both domain + high prob sentences)
    # NOTE(review): `interactions` is only referenced by the commented-out
    # code at the end of the loop below.
    interactions = {domain: [] for domain in skip_domains}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(
            doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(
            doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????

        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        # keep only the sentences predicted positive, joined into one string
        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))
        interaction_domains.append("-s-" + doc_domain)

        # progress indicator: print every tenth document index
        if doc_index % 10 == 0:
            print doc_index
        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count

        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()
    vec.builder_add_docs(docs.X(uids_train, domain=skip_domains),
                         low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_train, domain=skip_domains),
                         low=2)  # add domain interactions
    # removed X_train_d since already been through the generator! (needed reset)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains),
                         low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)

    ############
    # testing  #
    ############

    # Test on each domain in turn
    for domain in skip_domains:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        # labels from pmid_instance=0 are the test targets; labels from
        # pmid_instance=1 are recorded as the "human" predictions
        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #
        high_prob_sents = []

        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)

            # bcw -- I think this (using doc_domain and not
            # domain) was the bug before!
            #doc_domains = [doc_domain] * len(doc_sents)
            doc_domains = [domain] * len(doc_sents)

            doc_X_i = izip(doc_sents, doc_domains)

            sent_vec.builder_clear()
            sent_vec.builder_add_interaction_features(
                doc_sents)  # add base features
            sent_vec.builder_add_interaction_features(
                doc_X_i)  # then add interactions
            doc_sents_X = sent_vec.builder_transform()
            doc_sents_preds = sent_clf.predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        # one interaction label per test document, mirroring the "-s-"
        # prefix used for the training sentence interactions
        sent_domain_interactions = ["-s-" + domain] * len(high_prob_sents)
        domain_interactions = [domain] * len(high_prob_sents)

        print
        print "domain: %s" % domain
        print "High prob sents:"
        print '\n'.join(high_prob_sents)

        # build up test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(izip(X_test_d,
                                  domain_interactions))  # add interactions
        vec.builder_add_docs(
            izip(high_prob_sents,
                 sent_domain_interactions))  # sentence interactions

        X_test = vec.builder_transform()
        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        # baseline: predict 1 for every test document
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="human-performance")))

Example #3

Show file

File: sent_mt_recall_generate_test.py Project: brucexia6116/cochrane-nlp-experiments

def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    f = open('test_data.csv','wb')
    w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human", "algorithm", "top3", "top1"], escapechar="\\")
    w.writeheader()

    # parse the risk of bias data from Cochrane     
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space

    for k_i, (train, test) in enumerate(kf):

        if k_i == 1:
            break

        y_train = docs.y(uids[train])

            
        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=16)

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_ # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)



        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)




            y_df = clf.decision_function(X_test) # get distances from the decision boundary
            # positive distances = more likely to be relevant sentences

            r_len = len(y_preds)
            y_top3 = []
            y_top1 = []
            y_rand = []

            y_uids = np.array(docs.y_uids(uids[test], domain=domain))

            # import pdb; pdb.set_trace()

            for y_uid in np.unique(y_uids):

                mask = np.where(y_uids == y_uid)[0]
                doc_df = y_df[mask]

                doc_top3 = np.argpartition(doc_df, -3)[-3:]
                y_top3.extend(list(mask[doc_top3]))
                
                doc_top1 = np.argmax(doc_df)
                y_top1.append(mask[doc_top1])

                doc_rand = np.random.randint(0, len(doc_df))
                y_rand.append(mask[doc_rand])


            human_sent_indices = np.where(y_test==1)[0]
            algorithm_sent_indices = np.where(y_preds==1)[0]

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            # import pdb; pdb.set_trace()

            for doc_i, (doc, pmid) in enumerate(izip(docs.X(uids[test], domain=domain), docs.iter_pmid(uids[test], domain=domain))):

                row = {"domain": domain,
                       "sent_text": doc,
                       "random": doc_i in y_rand,
                       "human": doc_i in human_sent_indices,
                       "algorithm": doc_i in algorithm_sent_indices,
                       "top3": doc_i in y_top3,
                       "top1": doc_i in y_top1,
                       "pmid": pmid}

                if row["random"] or row["human"] or row["top3"] or row["top1"]:
                    # please note, the sentences will only be included in the analysis if
                    # in the top1 or top3
                    # we do have data on whether the raw classifier has predicted yes/no
                    # 
                    # this in effect means where the classifier picks <= 3 sentences
                    # we use all raw classifier data
                    # where >3 sentences are predicted by raw classifier, only the
                    # top 3 are used; the rest are discarded
                    w.writerow(row)

            del X_test, y_test, y_preds

        del clf



    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    f.close()

Example #4

Show file

def _load_domain_map(filename=os.path.join(cochranenlp.PATH, "data", "domain_names.txt")):
    """Return a dict mapping each synonym in the YAML file to its key.

    The file is read as UTF-8 and parsed as YAML; it is iterated as a
    mapping whose values are iterables of synonyms.  When a synonym is
    listed under more than one key, the key iterated last wins.
    """
    # NOTE(review): yaml.load can construct arbitrary objects; fine for a
    # trusted data file shipped with the package, unsafe for external input.
    with codecs.open(filename, 'rb', 'utf-8') as handle:
        names_to_synonyms = yaml.load(handle)

    return {
        synonym: name
        for name, synonyms in names_to_synonyms.iteritems()
        for synonym in synonyms
    }



# Module-level script: load the risk of bias data and set up the counters
# and viewer used by the loop below.
data = riskofbias.RoBData(test_mode=False)
data.generate_data(doc_level_only=False)

# synonym -> domain name lookup built from the default domain names file
mapper = _load_domain_map()

all_pmids = Counter()
domains_present = defaultdict(Counter)  # a separate Counter per key


b = biviewer.BiViewer()
print "getting all pubmed ids in CDSR..."

# progress bar sized to the number of items in the viewer
p = ProgressBar(len(b))

# NOTE(review): the loop body shown here only advances the progress bar;
# the example appears to be cut off at this point.
for doc in b:
	p.tap()

Example #5

Show file

def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='accuracy')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    interactions = {domain: [] for domain in target_domains}
    for doc_text, doc_domain in zip(X_train_d, i_train):
        for domain in target_domains:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    for domain in target_domains:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_interaction_features(X_train_d,
                                             interactions=interactions[domain],
                                             prefix=domain + "_I_",
                                             low=2)  # then add interactions

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # free some memory now, only need the model
    del X_train_d  # remove references to these
    del X_train
    del y_train
    clf = clf.best_estimator_  # and we only need the best performing, discard the rest

    # Test on each domain in turn

    for domain in target_domains:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '_I_')  # add interactions

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join('results',
                     outputnames.filename(label="human-performance")))

Example #6

Show file

def main():

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ###
    ###    sentence prediction
    ###

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #

    print "First, making sentence prediction model"

    sent_docs = riskofbias.SentFilter(data)

    sent_models = {}  #where the key is the domain name

    sent_vec = modhashvec.InteractionHashingVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**24
    )  # hashing vectorizer so doesn't change per domain in terms of feature space

    for domain in riskofbias.CORE_DOMAINS:

        sent_uids = np.intersect1d(
            uids_train, np.array(sent_docs.get_ids(filter_domain=domain)))
        no_studies = len(sent_uids)

        print "%d docs obtained for domain: %s" % (no_studies, domain)

        tuned_parameters = {
            "alpha": np.logspace(-4, -1, 5),
            "class_weight": [{
                1: i,
                -1: 1
            } for i in np.logspace(-1, 1, 5)]
        }
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='recall')

        X_train_d, y_train = sent_docs.Xy(sent_uids, domain=domain)

        X_train = sent_vec.fit_transform(X_train_d, low=2)

        clf.fit(X_train, y_train)

        sent_models[domain] = clf.best_estimator_
        # import pdb; pdb.set_trace()

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='accuracy')

    # X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    # add interaction features (here both domain + high prob sentences)

    # interactions = {domain:[] for domain in riskofbias.CORE_DOMAINS}

    high_prob_sents = []

    for doc_text, doc_domain in zip(X_train_d, i_train):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_sents_X = sent_vec.transform(doc_sents)

        doc_sents_preds = sent_models[doc_domain].predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))

        print "high prob sents:"

        from collections import Counter
        prob_count = Counter(list(doc_sents_preds))
        print prob_count

        for domain in riskofbias.CORE_DOMAINS:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    # print high_prob_sents

    for domain in riskofbias.CORE_DOMAINS:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_docs(X_train_d,
                             interactions=interactions[domain],
                             prefix=domain + "-i-",
                             low=2)  # then add interactions

    vec.builder_add_docs(high_prob_sents, prefix="-s-", low=2)

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # Test on each domain in turn

    for domain in riskofbias.CORE_DOMAINS:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #

        high_prob_sents = []
        for doc_text in X_test_d:

            doc_sents = sent_tokenizer.tokenize(doc_text)
            doc_sents_X = sent_vec.transform(doc_sents)

            doc_sents_preds = sent_models[domain].predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '-i-')  # add interactions
        vec.builder_add_docs(high_prob_sents, prefix="-s-")

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join('results',
                     outputnames.filename(label="human-performance")))

Example #7

Show file

def main(out_dir="results"):
    """Train the sentence and document models on all ids and pickle them.

    Stage one fits a sentence classifier (grid search scored on recall);
    stage two fits a document classifier (grid search scored on F1) whose
    features include the sentences the stage-one model predicted as
    positive.  The two best estimators are pickled together into
    ``mt_mt_production_models.pck`` in the current working directory.

    Args:
        out_dir: NOTE(review): not read anywhere in this function.
    """

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # every id is used for training here; nothing is held out
    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)

    ########################
    # sentence prediction  #
    ########################

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)
    # NOTE(review): `uids` and `no_studies` are assigned here but not read
    # again in this function.
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()
    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_all,
                                                          domain=skip_domains),
                                              low=7)

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing
    # every term with every domain, and the vectorizer (an instance of
    # ModularVectorizer) deals with inserting the actual interaction tokens
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_all, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # setup sentence classifier; the grid covers both the regularisation
    # strength and the weight given to the positive class
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{
            1: i,
            -1: 1
        } for i in np.logspace(-1, 1, 5)]
    }
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge",
                                          penalty="L2",
                                          shuffle=True),
                            tuned_parameters,
                            scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_all, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performing
    sent_clf = sent_clf.best_estimator_

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.

    ########################
    # document prediction  #
    ########################

    docs = riskofbias.MultiTaskDocFilter(data)
    # iterated below as (doc_text, doc_domain) pairs
    X_train_d = docs.Xyi(uids_all, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2", shuffle=True),
                       tuned_parameters,
                       scoring='f1')

    # bcw: note that I've amended the y method to
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_all, domain=skip_domains)

    # add interaction features (here both domain + high prob sentences)
    # NOTE(review): `interactions` is only referenced by the commented-out
    # code at the end of the loop below.
    interactions = {domain: [] for domain in skip_domains}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(
            doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(
            doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????

        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        # keep only the sentences predicted positive, joined into one string
        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))
        interaction_domains.append("-s-" + doc_domain)

        # progress indicator: print every tenth document index
        if doc_index % 10 == 0:
            print doc_index
        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count

        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()
    vec.builder_add_docs(docs.X(uids_all, domain=skip_domains),
                         low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_all, domain=skip_domains),
                         low=2)  # add domain interactions
    # removed X_train_d since already been through the generator! (needed reset)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains),
                         low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)
    clf = clf.best_estimator_

    # persist both fitted estimators together as one (sentence, document) tuple
    with open('mt_mt_production_models.pck', 'wb') as f:
        pickle.dump((sent_clf, clf), f)