# NB: these scripts assume module-level imports/config not shown in this
# excerpt, e.g. os, csv, codecs, pickle, numpy as np, yaml, izip (itertools),
# Counter/defaultdict (collections), and the scikit-learn 0.1x-era APIs
# (KFold from sklearn.cross_validation, GridSearchCV from sklearn.grid_search,
# SGDClassifier from sklearn.linear_model), plus project-local modules
# (riskofbias, metrics, modhashvec, outputnames, biviewer) and names such as
# sent_tokenizer, skip_domains, and target_domains defined elsewhere in the repo.


def main():
    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}

    vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    for train, test in kf:
        y_train = docs.y(uids[train])

        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7)  # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2)  # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters, scoring='recall')
        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_  # we only need the best performer; discard the rest

        # test on each domain in turn
        for domain in riskofbias.CORE_DOMAINS:
            print "Testing on %s" % domain

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain))  # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain))  # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            del X_test, y_test, y_preds
        del clf

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
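
# Illustrative sketch (not part of the pipeline): the multitask trick above
# works by crossing each token with its domain, so the hashing vectorizer
# sees both a shared copy of every feature and a domain-specific copy.
# Everything below is hypothetical; the real crossing happens inside
# MultiTaskSentFilter.X_i / ModularVectorizer.
def _interaction_tokens_sketch(sentence, domain):
    tokens = sentence.split()  # naive whitespace tokenization, for illustration
    shared = tokens  # shared feature space, pooled across domains
    crossed = ["%s_I_%s" % (domain, t) for t in tokens]  # domain-specific copies
    return shared + crossed

# >>> _interaction_tokens_sketch("patients were blinded", "blinding")
# ['patients', 'were', 'blinded',
#  'blinding_I_patients', 'blinding_I_were', 'blinding_I_blinded']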
def main(out_dir="results"):
    model_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    # filter the data by document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment, those which appear in only one review)
    uids_all = filtered_data.get_ids(pmid_instance=0)  # those with 1 or more assessments (i.e. all)
    uids_double_assessed = filtered_data.get_ids(pmid_instance=1)  # those with 2 (or more) assessments (held out from training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ########################
    # sentence prediction  #
    ########################

    # the first stage is to build the sentence prediction model from the
    # training set
    print "First, making sentence prediction model"

    sent_docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence vectorization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    sent_vec.builder_clear()

    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(
        sent_docs.X(uids_train, domain=skip_domains), low=7)

    # now add interaction features, which cross the domain with the tokens.
    # specifically, the X_i method returns token tuples crossing every term
    # with every domain, and the vectorizer (an instance of ModularVectorizer)
    # inserts the actual interaction tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_train, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # set up the sentence classifier
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)],
    }
    # bcw: are we sure we want to optimize 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                            tuned_parameters, scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_train, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performer
    sent_clf = sent_clf.best_estimator_

    # we now have our multi-task sentence prediction model, which we'll use
    # to make sentence-level predictions for documents.
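
    # Note on the grid above (illustrative): np.logspace(0, 2, 5) gives
    # positive-class weights of roughly [1, 3.16, 10, 31.6, 100], so the
    # search ranges from a balanced loss up to weighting relevant sentences
    # 100x, which (together with scoring='recall') strongly favours models
    # that rarely miss a relevant sentence.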

    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we test on studies with more than one RoB assessment for *each* domain)
    docs = riskofbias.MultiTaskDocFilter(data)

    X_train_d = docs.Xyi(uids_train, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters, scoring='f1')

    # bcw: note that I've amended the y method to return interactions
    # as well (i.e., domain strs)
    y_train = docs.y(uids_train, domain=skip_domains)

    # add interaction features (here both domain + high-probability sentences)
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):
        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)

        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        # bcw -- shouldn't we use the *true* sentence labels here,
        # rather than predictions?
        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        high_prob_sents.append(" ".join(
            [sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
             if sent_pred == 1]))
        interaction_domains.append("-s-" + doc_domain)

        if doc_index % 10 == 0:
            print doc_index

    vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(docs.X(uids_train, domain=skip_domains), low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_train, domain=skip_domains), low=2)  # add domain interactions
    # (X_train_d is not reused here: the generator has already been consumed)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains), low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)

    ############
    # testing  #
    ############

    # test on each domain in turn
    for domain in skip_domains:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0, filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all, uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        # get high-probability sentences from the test data
        #
        high_prob_sents = []
        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)
            # bcw -- I think using doc_domain here (rather than domain)
            # was the bug before!
            doc_domains = [domain] * len(doc_sents)
            doc_X_i = izip(doc_sents, doc_domains)

            sent_vec.builder_clear()
            sent_vec.builder_add_interaction_features(doc_sents)  # add base features
            sent_vec.builder_add_interaction_features(doc_X_i)  # then add interactions
            doc_sents_X = sent_vec.builder_transform()

            doc_sents_preds = sent_clf.predict(doc_sents_X)

            high_prob_sents.append(" ".join(
                [sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                 if sent_pred == 1]))

        sent_domain_interactions = ["-s-" + domain] * len(high_prob_sents)
        domain_interactions = [domain] * len(high_prob_sents)

        print
        print "domain: %s" % domain
        print "High prob sents:"
        print '\n'.join(high_prob_sents)

        # build up the test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(izip(X_test_d, domain_interactions))  # add domain interactions
        vec.builder_add_docs(izip(high_prob_sents, sent_domain_interactions))  # sentence interactions

        X_test = vec.builder_transform()
        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="human-performance")))
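
# The two-stage pattern above in miniature (hypothetical helper, assuming a
# fitted sentence classifier and vectorizer with sklearn-style interfaces):
# first predict the relevant sentences, then feed them back as an extra
# domain-tagged chunk of text for the document-level model.
def _high_prob_sent_features_sketch(doc_text, domain, sent_clf, sent_vec, tokenize):
    sents = tokenize(doc_text)
    preds = sent_clf.predict(sent_vec.transform(sents))
    key_sents = " ".join(s for s, p in zip(sents, preds) if p == 1)
    # the document model then sees (key_sents, "-s-" + domain) as an
    # interaction feature alongside the full document text
    return key_sents, "-s-" + domain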
def main():
    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    f = open('test_data.csv', 'wb')
    w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human",
                           "algorithm", "top3", "top1"], escapechar="\\")
    w.writeheader()

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5),
                        "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}

    vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    for k_i, (train, test) in enumerate(kf):
        if k_i == 1:  # only run the first fold
            break

        y_train = docs.y(uids[train])

        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7)  # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2)  # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters, scoring='recall', n_jobs=16)
        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_  # we only need the best performer; discard the rest

        # test on each domain in turn
        for domain in riskofbias.CORE_DOMAINS:
            print "Testing on %s" % domain

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain))  # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain))  # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)
            # distances from the decision boundary;
            # more positive = more likely to be a relevant sentence
            y_df = clf.decision_function(X_test)

            y_top3 = []
            y_top1 = []
            y_rand = []

            y_uids = np.array(docs.y_uids(uids[test], domain=domain))
            for y_uid in np.unique(y_uids):
                mask = np.where(y_uids == y_uid)[0]
                doc_df = y_df[mask]

                # NB: assumes each study contributes at least 3 sentences
                doc_top3 = np.argpartition(doc_df, -3)[-3:]
                y_top3.extend(list(mask[doc_top3]))

                doc_top1 = np.argmax(doc_df)
                y_top1.append(mask[doc_top1])

                doc_rand = np.random.randint(0, len(doc_df))
                y_rand.append(mask[doc_rand])

            human_sent_indices = np.where(y_test == 1)[0]
            algorithm_sent_indices = np.where(y_preds == 1)[0]

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            for doc_i, (doc, pmid) in enumerate(
                    izip(docs.X(uids[test], domain=domain),
                         docs.iter_pmid(uids[test], domain=domain))):
                row = {"domain": domain,
                       "sent_text": doc,
                       "random": doc_i in y_rand,
                       "human": doc_i in human_sent_indices,
                       "algorithm": doc_i in algorithm_sent_indices,
                       "top3": doc_i in y_top3,
                       "top1": doc_i in y_top1,
                       "pmid": pmid}

                if row["random"] or row["human"] or row["top3"] or row["top1"]:
                    # note: sentences are only included in the analysis if they
                    # are in the top 1 or top 3. we do have data on whether the
                    # raw classifier predicted yes/no; in effect, where the
                    # classifier picks <= 3 sentences we use all of its output,
                    # and where it predicts more than 3, only the top 3 are
                    # kept and the rest are discarded.
                    w.writerow(row)

            del X_test, y_test, y_preds
        del clf

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))

    f.close()
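
# np.argpartition note (illustrative): the top-3 selection above returns the
# indices of the three largest decision values, in arbitrary internal order,
# which is all the CSV output needs.
# >>> df = np.array([0.1, 2.0, -1.0, 0.7, 1.5])
# >>> sorted(np.argpartition(df, -3)[-3:])
# [1, 3, 4]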
def _load_domain_map(filename=os.path.join(cochranenlp.PATH, "data", "domain_names.txt")):
    with codecs.open(filename, 'rb', 'utf-8') as f:
        raw_data = yaml.load(f)

    mapping = {}
    for key, value in raw_data.iteritems():
        for synonym in value:
            mapping[synonym] = key
    return mapping


data = riskofbias.RoBData(test_mode=False)
data.generate_data(doc_level_only=False)

mapper = _load_domain_map()

all_pmids = Counter()
domains_present = defaultdict(Counter)

b = biviewer.BiViewer()

print "getting all pubmed ids in CDSR..."
p = ProgressBar(len(b))
for doc in b:
    p.tap()
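
# _load_domain_map inverts a YAML mapping of canonical domain name -> list of
# synonyms. Hypothetical domain_names.txt contents (the real file ships with
# the cochranenlp data directory):
#
#   Random sequence generation:
#     - randomisation
#     - sequence generation
#
# which would give mapper["randomisation"] == "Random sequence generation".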
def main():
    model_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=True)

    # filter the data by document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment, those which appear in only one review)
    uids_all = filtered_data.get_ids(pmid_instance=0)  # those with 1 or more assessments (i.e. all)
    uids_double_assessed = filtered_data.get_ids(pmid_instance=1)  # those with 2 (or more) assessments (held out from training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    # we need different test ids for each domain
    # (since we test on studies with more than one RoB assessment for *each* domain)
    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters, scoring='accuracy')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    interactions = {domain: [] for domain in target_domains}
    for doc_text, doc_domain in zip(X_train_d, i_train):
        for domain in target_domains:
            interactions[domain].append(domain == doc_domain)

    vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    for domain in target_domains:
        print np.sum(interactions[domain]), "/", len(interactions[domain]), "added for", domain
        vec.builder_add_interaction_features(
            X_train_d, interactions=interactions[domain],
            prefix=domain + "_I_", low=2)  # then add interactions

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # free some memory now; we only need the model
    del X_train_d  # remove references to these
    del X_train
    del y_train
    clf = clf.best_estimator_  # we only need the best performer; discard the rest

    # test on each domain in turn
    for domain in target_domains:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0, filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all, uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=1)
        X_ignore = None  # don't need this bit

        # build up the test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d, prefix=domain + '_I_')  # add interactions
        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(os.path.join('results', outputnames.filename(label="human-performance")))
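
# The interactions masks above in miniature (illustrative values): one boolean
# column per domain marks which training docs should receive that domain's
# prefixed copy of the features. With doc domains
# ["blinding", "randomisation", "blinding"]:
#   interactions["blinding"]      -> [True, False, True]
#   interactions["randomisation"] -> [False, True, False]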
def main():
    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    # filter the data by document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment, those which appear in only one review)
    uids_all = filtered_data.get_ids(pmid_instance=0)  # those with 1 or more assessments (i.e. all)
    uids_double_assessed = filtered_data.get_ids(pmid_instance=1)  # those with 2 (or more) assessments (held out from training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ###
    ### sentence prediction
    ###

    # the first stage is to build one sentence prediction model per domain
    # from the training set
    print "First, making sentence prediction model"

    sent_docs = riskofbias.SentFilter(data)
    sent_models = {}  # keyed by domain name

    # hashing vectorizer, so the feature space does not change per domain
    sent_vec = modhashvec.InteractionHashingVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**24)

    for domain in riskofbias.CORE_DOMAINS:
        sent_uids = np.intersect1d(uids_train,
                                   np.array(sent_docs.get_ids(filter_domain=domain)))
        no_studies = len(sent_uids)
        print "%d docs obtained for domain: %s" % (no_studies, domain)

        tuned_parameters = {
            "alpha": np.logspace(-4, -1, 5),
            "class_weight": [{1: i, -1: 1} for i in np.logspace(-1, 1, 5)],
        }
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters, scoring='recall')

        X_train_d, y_train = sent_docs.Xy(sent_uids, domain=domain)
        X_train = sent_vec.fit_transform(X_train_d, low=2)

        clf.fit(X_train, y_train)
        sent_models[domain] = clf.best_estimator_

    # we need different test ids for each domain
    # (since we test on studies with more than one RoB assessment for *each* domain)
    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters, scoring='accuracy')

    # NB: the next two assignments were commented out in the original source
    # but are required below; restored here
    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    # add interaction features (here both domain + high-probability sentences)
    interactions = {domain: [] for domain in riskofbias.CORE_DOMAINS}
    high_prob_sents = []

    for doc_text, doc_domain in zip(X_train_d, i_train):
        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_sents_X = sent_vec.transform(doc_sents)
        doc_sents_preds = sent_models[doc_domain].predict(doc_sents_X)

        high_prob_sents.append(" ".join(
            [sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
             if sent_pred == 1]))

        print "high prob sents:"
        prob_count = Counter(list(doc_sents_preds))
        print prob_count

        for domain in riskofbias.CORE_DOMAINS:
            interactions[domain].append(domain == doc_domain)

    vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    for domain in riskofbias.CORE_DOMAINS:
        print np.sum(interactions[domain]), "/", len(interactions[domain]), "added for", domain
        vec.builder_add_docs(X_train_d,
                             interactions=interactions[domain],
                             prefix=domain + "-i-",
                             low=2)  # then add interactions

    vec.builder_add_docs(high_prob_sents, prefix="-s-", low=2)

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)

    # test on each domain in turn
    for domain in riskofbias.CORE_DOMAINS:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0, filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all, uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        # get high-probability sentences from the test data
        #
        high_prob_sents = []
        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)
            doc_sents_X = sent_vec.transform(doc_sents)
            doc_sents_preds = sent_models[domain].predict(doc_sents_X)
            high_prob_sents.append(" ".join(
                [sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                 if sent_pred == 1]))

        # build up the test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d, prefix=domain + '-i-')  # add interactions
        vec.builder_add_docs(high_prob_sents, prefix="-s-")  # sentence interactions
        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(os.path.join('results', outputnames.filename(label="human-performance")))
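
# Why a hashing vectorizer keeps the per-domain feature space fixed
# (illustrative sketch using sklearn's stock HashingVectorizer; the project's
# InteractionHashingVectorizer adds interaction-prefix handling on top): no
# vocabulary is fitted, so sentences from any domain always map into the same
# 2**24-dimensional space, and the per-domain models stay mutually compatible
# with a single shared transform.
def _hashing_space_sketch():
    from sklearn.feature_extraction.text import HashingVectorizer
    hv = HashingVectorizer(norm=None, binary=True, ngram_range=(1, 2),
                           n_features=2**24)
    X = hv.transform(["patients were blinded", "allocation was concealed"])
    return X.shape  # (2, 16777216), with no fit step required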
def main(out_dir="results"):
    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (this script builds the production models, so we train on all studies
    # with at least one assessment)
    uids_all = filtered_data.get_ids(pmid_instance=0)

    ########################
    # sentence prediction  #
    ########################

    # the first stage is to build the sentence prediction model from the
    # training set
    print "First, making sentence prediction model"

    sent_docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence vectorization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    sent_vec.builder_clear()

    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(
        sent_docs.X(uids_all, domain=skip_domains), low=7)

    # now add interaction features, which cross the domain with the tokens.
    # specifically, the X_i method returns token tuples crossing every term
    # with every domain, and the vectorizer (an instance of ModularVectorizer)
    # inserts the actual interaction tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_all, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # set up the sentence classifier
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{1: i, -1: 1} for i in np.logspace(-1, 1, 5)],
    }
    # bcw: are we sure we want to optimize 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2", shuffle=True),
                            tuned_parameters, scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_all, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performer
    sent_clf = sent_clf.best_estimator_

    # we now have our multi-task sentence prediction model, which we'll use
    # to make sentence-level predictions for documents.

    ########################
    # document prediction  #
    ########################

    docs = riskofbias.MultiTaskDocFilter(data)

    X_train_d = docs.Xyi(uids_all, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2", shuffle=True),
                       tuned_parameters, scoring='f1')

    # bcw: note that I've amended the y method to return interactions
    # as well (i.e., domain strs)
    y_train = docs.y(uids_all, domain=skip_domains)

    # add interaction features (here both domain + high-probability sentences)
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):
        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)

        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        # bcw -- shouldn't we use the *true* sentence labels here,
        # rather than predictions?
        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        high_prob_sents.append(" ".join(
            [sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
             if sent_pred == 1]))
        interaction_domains.append("-s-" + doc_domain)

        if doc_index % 10 == 0:
            print doc_index

    vec = modhashvec.ModularVectorizer(
        norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(docs.X(uids_all, domain=skip_domains), low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_all, domain=skip_domains), low=2)  # add domain interactions
    # (X_train_d is not reused here: the generator has already been consumed)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains), low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)
    clf = clf.best_estimator_

    with open('mt_mt_production_models.pck', 'wb') as f:
        pickle.dump((sent_clf, clf), f)
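
# Loading the persisted models elsewhere (sketch; same path as above):
# with open('mt_mt_production_models.pck', 'rb') as f:
#     sent_clf, clf = pickle.load(f)
# sent_clf then scores candidate sentences and clf scores documents, given
# feature vectors built with vectorizers configured exactly as in main().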