Esempio n. 1
0
def main():

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    for train, test in kf:

        y_train = docs.y(uids[train])

        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]),
                                             low=7)  # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]),
                                             low=2)  # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='recall')

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_  # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            vec.builder_clear()
            vec.builder_add_interaction_features(
                docs.X(uids[test], domain=domain))  # add base features
            vec.builder_add_interaction_features(
                docs.X_i(uids[test], domain=domain))  # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test),
                                          y_test,
                                          domain=domain)

            del X_test, y_test, y_preds

        del clf

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
def main(out_dir="results"):
    model_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ########################
    # sentence prediction  #
    ########################

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()
    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_train,
                                                          domain=skip_domains),
                                              low=7)

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing
    # every term with every domain, and the vectorizer (an instance of
    # ModularVectorizer) deals with inserting the actual interaction tokens
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_train, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # setup sentence classifier
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{
            1: i,
            -1: 1
        } for i in np.logspace(0, 2, 5)]
    }
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                            tuned_parameters,
                            scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_train, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performing
    sent_clf = sent_clf.best_estimator_

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.

    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)
    docs = riskofbias.MultiTaskDocFilter(data)
    X_train_d = docs.Xyi(uids_train, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='f1')

    # bcw: note that I've amended the y method to
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_train, domain=skip_domains)

    # add interaction features (here both domain + high prob sentences)
    interactions = {domain: [] for domain in skip_domains}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(
            doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(
            doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????

        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))
        interaction_domains.append("-s-" + doc_domain)

        if doc_index % 10 == 0:
            print doc_index
        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count

        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()
    vec.builder_add_docs(docs.X(uids_train, domain=skip_domains),
                         low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_train, domain=skip_domains),
                         low=2)  # add domain interactions
    # removed X_train_d since already been through the generator! (needed reset)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains),
                         low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)

    ############
    # testing  #
    ############

    # Test on each domain in turn
    for domain in skip_domains:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #
        high_prob_sents = []

        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)

            # bcw -- I think this (using doc_domain and not
            # domain) was the bug before!
            #doc_domains = [doc_domain] * len(doc_sents)
            doc_domains = [domain] * len(doc_sents)

            doc_X_i = izip(doc_sents, doc_domains)

            sent_vec.builder_clear()
            sent_vec.builder_add_interaction_features(
                doc_sents)  # add base features
            sent_vec.builder_add_interaction_features(
                doc_X_i)  # then add interactions
            doc_sents_X = sent_vec.builder_transform()
            doc_sents_preds = sent_clf.predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        sent_domain_interactions = ["-s-" + domain] * len(high_prob_sents)
        domain_interactions = [domain] * len(high_prob_sents)

        print
        print "domain: %s" % domain
        print "High prob sents:"
        print '\n'.join(high_prob_sents)

        # build up test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(izip(X_test_d,
                                  domain_interactions))  # add interactions
        vec.builder_add_docs(
            izip(high_prob_sents,
                 sent_domain_interactions))  # sentence interactions

        X_test = vec.builder_transform()
        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="human-performance")))
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    f = open('test_data.csv','wb')
    w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human", "algorithm", "top3", "top1"], escapechar="\\")
    w.writeheader()

    # parse the risk of bias data from Cochrane     
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space

    for k_i, (train, test) in enumerate(kf):

        if k_i == 1:
            break

        y_train = docs.y(uids[train])

            
        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=16)

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_ # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)



        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)




            y_df = clf.decision_function(X_test) # get distances from the decision boundary
            # positive distances = more likely to be relevant sentences

            r_len = len(y_preds)
            y_top3 = []
            y_top1 = []
            y_rand = []

            y_uids = np.array(docs.y_uids(uids[test], domain=domain))

            # import pdb; pdb.set_trace()

            for y_uid in np.unique(y_uids):

                mask = np.where(y_uids == y_uid)[0]
                doc_df = y_df[mask]

                doc_top3 = np.argpartition(doc_df, -3)[-3:]
                y_top3.extend(list(mask[doc_top3]))
                
                doc_top1 = np.argmax(doc_df)
                y_top1.append(mask[doc_top1])

                doc_rand = np.random.randint(0, len(doc_df))
                y_rand.append(mask[doc_rand])


            human_sent_indices = np.where(y_test==1)[0]
            algorithm_sent_indices = np.where(y_preds==1)[0]

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            # import pdb; pdb.set_trace()

            for doc_i, (doc, pmid) in enumerate(izip(docs.X(uids[test], domain=domain), docs.iter_pmid(uids[test], domain=domain))):

                row = {"domain": domain,
                       "sent_text": doc,
                       "random": doc_i in y_rand,
                       "human": doc_i in human_sent_indices,
                       "algorithm": doc_i in algorithm_sent_indices,
                       "top3": doc_i in y_top3,
                       "top1": doc_i in y_top1,
                       "pmid": pmid}

                if row["random"] or row["human"] or row["top3"] or row["top1"]:
                    # please note, the sentences will only be included in the analysis if
                    # in the top1 or top3
                    # we do have data on whether the raw classifier has predicted yes/no
                    # 
                    # this in effect means where the classifier picks <= 3 sentences
                    # we use all raw classifier data
                    # where >3 sentences are predicted by raw classifier, only the
                    # top 3 are used; the rest are discarded
                    w.writerow(row)

            del X_test, y_test, y_preds

        del clf



    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    f.close()
Esempio n. 4
0
def _load_domain_map(filename=os.path.join(cochranenlp.PATH, "data", "domain_names.txt")):

    with codecs.open(filename, 'rb', 'utf-8') as f:
        raw_data = yaml.load(f)

    mapping = {}
    for key, value in raw_data.iteritems():
        for synonym in value:
            mapping[synonym] = key

    return mapping



data = riskofbias.RoBData(test_mode=False)
data.generate_data(doc_level_only=False)

mapper = _load_domain_map()

all_pmids = Counter()
domains_present = defaultdict(Counter)


b = biviewer.BiViewer()
print "getting all pubmed ids in CDSR..."

p = ProgressBar(len(b))

for doc in b:
	p.tap()
Esempio n. 5
0
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='accuracy')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    interactions = {domain: [] for domain in target_domains}
    for doc_text, doc_domain in zip(X_train_d, i_train):
        for domain in target_domains:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    for domain in target_domains:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_interaction_features(X_train_d,
                                             interactions=interactions[domain],
                                             prefix=domain + "_I_",
                                             low=2)  # then add interactions

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # free some memory now, only need the model
    del X_train_d  # remove references to these
    del X_train
    del y_train
    clf = clf.best_estimator_  # and we only need the best performing, discard the rest

    # Test on each domain in turn

    for domain in target_domains:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '_I_')  # add interactions

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join('results',
                     outputnames.filename(label="human-performance")))
Esempio n. 6
0
def main():

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ###
    ###    sentence prediction
    ###

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #

    print "First, making sentence prediction model"

    sent_docs = riskofbias.SentFilter(data)

    sent_models = {}  #where the key is the domain name

    sent_vec = modhashvec.InteractionHashingVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**24
    )  # hashing vectorizer so doesn't change per domain in terms of feature space

    for domain in riskofbias.CORE_DOMAINS:

        sent_uids = np.intersect1d(
            uids_train, np.array(sent_docs.get_ids(filter_domain=domain)))
        no_studies = len(sent_uids)

        print "%d docs obtained for domain: %s" % (no_studies, domain)

        tuned_parameters = {
            "alpha": np.logspace(-4, -1, 5),
            "class_weight": [{
                1: i,
                -1: 1
            } for i in np.logspace(-1, 1, 5)]
        }
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='recall')

        X_train_d, y_train = sent_docs.Xy(sent_uids, domain=domain)

        X_train = sent_vec.fit_transform(X_train_d, low=2)

        clf.fit(X_train, y_train)

        sent_models[domain] = clf.best_estimator_
        # import pdb; pdb.set_trace()

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='accuracy')

    # X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    # add interaction features (here both domain + high prob sentences)

    # interactions = {domain:[] for domain in riskofbias.CORE_DOMAINS}

    high_prob_sents = []

    for doc_text, doc_domain in zip(X_train_d, i_train):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_sents_X = sent_vec.transform(doc_sents)

        doc_sents_preds = sent_models[doc_domain].predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))

        print "high prob sents:"

        from collections import Counter
        prob_count = Counter(list(doc_sents_preds))
        print prob_count

        for domain in riskofbias.CORE_DOMAINS:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    # print high_prob_sents

    for domain in riskofbias.CORE_DOMAINS:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_docs(X_train_d,
                             interactions=interactions[domain],
                             prefix=domain + "-i-",
                             low=2)  # then add interactions

    vec.builder_add_docs(high_prob_sents, prefix="-s-", low=2)

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # Test on each domain in turn

    for domain in riskofbias.CORE_DOMAINS:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #

        high_prob_sents = []
        for doc_text in X_test_d:

            doc_sents = sent_tokenizer.tokenize(doc_text)
            doc_sents_X = sent_vec.transform(doc_sents)

            doc_sents_preds = sent_models[domain].predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '-i-')  # add interactions
        vec.builder_add_docs(high_prob_sents, prefix="-s-")

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join('results',
                     outputnames.filename(label="human-performance")))
Esempio n. 7
0
def main(out_dir="results"):

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)
    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)

    ########################
    # sentence prediction  #
    ########################

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()
    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_all,
                                                          domain=skip_domains),
                                              low=7)

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing
    # every term with every domain, and the vectorizer (an instance of
    # ModularVectorizer) deals with inserting the actual interaction tokens
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_all, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # setup sentence classifier
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{
            1: i,
            -1: 1
        } for i in np.logspace(-1, 1, 5)]
    }
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge",
                                          penalty="L2",
                                          shuffle=True),
                            tuned_parameters,
                            scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_all, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performing
    sent_clf = sent_clf.best_estimator_

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.

    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)
    docs = riskofbias.MultiTaskDocFilter(data)
    X_train_d = docs.Xyi(uids_all, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2", shuffle=True),
                       tuned_parameters,
                       scoring='f1')

    # bcw: note that I've amended the y method to
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_all, domain=skip_domains)

    # add interaction features (here both domain + high prob sentences)
    interactions = {domain: [] for domain in skip_domains}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(
            doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(
            doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????

        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))
        interaction_domains.append("-s-" + doc_domain)

        if doc_index % 10 == 0:
            print doc_index
        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count

        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()
    vec.builder_add_docs(docs.X(uids_all, domain=skip_domains),
                         low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_all, domain=skip_domains),
                         low=2)  # add domain interactions
    # removed X_train_d since already been through the generator! (needed reset)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains),
                         low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)
    clf = clf.best_estimator_

    with open('mt_mt_production_models.pck', 'wb') as f:
        pickle.dump((sent_clf, clf), f)