# Shared imports for the classes in this listing. MiniClassifier, KerasVectorizer,
# ModularVectorizer, get_model and the ris module are assumed to come from the
# robotreviewer package itself and are not imported here.
import glob
import json
import os
import pickle
import uuid

import numpy as np
import robotreviewer
from scipy.sparse import hstack, lil_matrix
from sklearn.feature_extraction.text import HashingVectorizer


class RCTRobot:

    def __init__(self):
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        json_filename = os.path.join(robotreviewer.DATA_ROOT,
                                     'rct/rct_cnn_structure.json')
        self.cnn_clfs = [get_model(json_filename, cnn_weight_file)
                         for cnn_weight_file in cnn_weight_files]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck'))
        self.scale_constants = {
            'cnn': {'mean': 0.15592811611054261,
                    'std': 0.22405916984696986,
                    'weight': 1.6666666666666667},
            'ptyp': {'mean': 0.055155532891381948,
                     'std': 0.22828359573751594},
            'svm': {'mean': -0.75481403525485891,
                    'std': 0.7812955939364481,
                    'weight': 10.0}
        }
        # Weights reflect that we use only 1 SVM model (it produces near-identical
        # results to using 10) and 6 CNN models (faster, and no further reduction
        # in variance from adding more models).
        self.thresholds = {
            'cnn': {'precise': 2.1340457758193034,
                    'sensitive': -0.076709540491855063},
            'cnn_ptyp': {'precise': 3.529609848417909,
                         'sensitive': 0.083502632442633312},
            'svm': {'precise': 1.9185522606237164,
                    'sensitive': 0.093273630980694439},
            'svm_cnn': {'precise': 1.8749128673557529,
                        'sensitive': 0.064481902000491614},
            'svm_cnn_ptyp': {'precise': 3.7674045603568755,
                             'sensitive': 0.1952449060483534},
            'svm_ptyp': {'precise': 3.7358855328111837,
                         'sensitive': 0.42992224964656178}
        }
        # All 'precise' models have been calibrated to 97.6% sensitivity;
        # all 'sensitive' models have been calibrated to 99.1% sensitivity.

    def annotate(self, data):
        # use the best performing models from the validation paper (in draft...)
        filter_class = "svm_cnn_ptyp"
        threshold_class = "precise"

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on a sample of RCT abstracts + aiming for the 95th centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        if "pubmed" in data.data:
            ptyp = 1.0
        else:
            ptyp = 0.0

        X_ti_str = [ti]
        X_ab_str = ['{}\n\n{}'.format(ti, ab)]

        if "svm" in filter_class:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ti, X_ab]))
            svm_scale = (svm_preds - self.scale_constants['svm']['mean']
                         ) / self.scale_constants['svm']['std']

        if "ptyp" in filter_class:
            ptyp = np.array([ptyp])
            ptyp_scale = (ptyp - self.scale_constants['ptyp']['mean']
                          ) / self.scale_constants['ptyp']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = [clf.predict(X_cnn).T[0] for clf in self.cnn_clfs]
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.scale_constants['cnn']['mean']
                         ) / self.scale_constants['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "svm_ptyp":
            y_preds = svm_scale + ptyp_scale
        elif filter_class == "ptyp":
            y_preds = ptyp_scale
        elif filter_class == "svm_cnn_ptyp":
            # weight order must match the stacking order: the single SVM first, then the CNNs
            weights = [self.scale_constants['svm']['weight']] + (
                [self.scale_constants['cnn']['weight']] * len(self.cnn_clfs))
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0, weights=weights) + ptyp_scale

        structured_data = {
            "is_rct": bool(y_preds[0] > self.thresholds[filter_class][threshold_class]),
            "decision_score": y_preds[0],
            "model_class": filter_class
        }
        data.ml["rct"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type": "Trial Design",
            "title": "Is an RCT?",
            "annotations": [],
            "description": "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
class BiasRobot:

    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence in the
        document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """
        self.sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2),
                                     n_features=2**26)

        self.bias_domains = ['Random sequence generation',
                             'Allocation concealment',
                             'Blinding of participants and personnel',
                             'Blinding of outcome assessment',
                             'Incomplete outcome data',
                             'Selective reporting']

        self.top_k = top_k

    def pdf_annotate(self, data, top_k=None):
        """
        Annotate full text of clinical trial report.

        `top_k` can be overridden here, else defaults to the class default
        set in __init__.
        """
        if top_k is None:
            top_k = self.top_k

        doc_text = data.get('parsed_text')

        if not doc_text:
            # we've got to know the text at least
            return data

        doc_len = len(data['text'])

        marginalia = []

        doc_sents = [sent.text for sent in doc_text.sents]
        doc_sent_start_i = [sent.start_char for sent in doc_text.sents]
        doc_sent_end_i = [sent.end_char for sent in doc_text.sents]

        structured_data = []

        for domain in self.bias_domains:
            doc_domains = [domain] * len(doc_sents)
            doc_X_i = zip(doc_sents, doc_domains)

            #
            # build up sentence feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs(doc_sents)

            # uni-bigrams/domain interactions
            self.vec.builder_add_docs(doc_X_i)

            doc_sents_X = self.vec.builder_transform()
            doc_sents_preds = self.sent_clf.decision_function(doc_sents_X)

            # top k sentences, highest-scoring first
            high_prob_sent_indices = np.argsort(doc_sents_preds)[:-top_k - 1:-1]
            high_prob_sents = [doc_sents[i] for i in high_prob_sent_indices]
            high_prob_start_i = [doc_sent_start_i[i] for i in high_prob_sent_indices]
            high_prob_end_i = [doc_sent_end_i[i] for i in high_prob_sent_indices]
            high_prob_prefixes = [doc_text.text[max(0, offset - 20):offset]
                                  for offset in high_prob_start_i]
            high_prob_suffixes = [doc_text.text[offset:min(doc_len, offset + 20)]
                                  for offset in high_prob_end_i]
            high_prob_sents_j = " ".join(high_prob_sents)
            sent_domain_interaction = "-s-" + domain

            #
            # build up document feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs([doc_text.text])

            # uni-bigrams/domain interaction
            self.vec.builder_add_docs([(doc_text.text, domain)])

            # uni-bigrams/relevance interaction
            self.vec.builder_add_docs([(high_prob_sents_j, sent_domain_interaction)])

            X = self.vec.builder_transform()
            bias_pred = self.doc_clf.predict(X)
            bias_class = ["high/unclear", "low"][bias_pred[0]]

            annotation_metadata = [{"content": sent[0],
                                    "position": sent[1],
                                    "uuid": str(uuid.uuid1()),
                                    "prefix": sent[2],
                                    "suffix": sent[3]}
                                   for sent in zip(high_prob_sents,
                                                   high_prob_start_i,
                                                   high_prob_prefixes,
                                                   high_prob_suffixes)]

            structured_data.append({
                "domain": domain,
                "judgement": bias_class,
                "annotations": annotation_metadata})

        data.ml["bias"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = []
        for row in data['bias']:
            marginalia.append({
                "type": "Risk of Bias",
                "title": row['domain'],
                "annotations": row['annotations'],
                "description": "**Overall risk of bias prediction**: {}".format(row['judgement'])
            })
        return marginalia

    @staticmethod
    def get_domains():
        return [u'Random sequence generation',
                u'Allocation concealment',
                u'Blinding of participants and personnel',
                u'Blinding of outcome assessment',
                u'Incomplete outcome data',
                u'Selective reporting']
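# Usage sketch (illustrative only, not part of the original source). `pdf_annotate`
# needs the article object's 'parsed_text' to be a spaCy Doc (it relies on `.sents`,
# `.start_char` and `.end_char`) and the raw string under data['text']; results land
# in data.ml["bias"]. `load_article` is again a hypothetical stand-in for the PDF pipeline.
if __name__ == '__main__':
    bias_bot = BiasRobot(top_k=3)
    article = load_article('example_trial.pdf')   # hypothetical helper
    article = bias_bot.pdf_annotate(article)
    for row in article.ml["bias"]:
        print(row["domain"], "->", row["judgement"])
    print(BiasRobot.get_marginalia(article.ml))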
class RCTRobot:

    def __init__(self):
        # declare the names global before importing, so the keras imports done
        # here are bound at module level (a global statement after the import
        # would be a SyntaxError)
        global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        global Embedding, Convolution1D, MaxPooling1D, K, Model, l2
        from keras.preprocessing import sequence
        from keras.models import load_model, Sequential, Model
        from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        from keras.layers import Embedding
        from keras.layers import Convolution1D, MaxPooling1D
        from keras import backend as K
        from keras.regularizers import l2

        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [load_model(cnn_weight_file)
                         for cnn_weight_file in cnn_weight_files]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(
            vocab_map_file=os.path.join(robotreviewer.DATA_ROOT,
                                        'rct/cnn_vocab_map.pck'),
            stop_words='english')

        with open(os.path.join(robotreviewer.DATA_ROOT,
                               'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

        self.calibration_lr = {}
        with open(os.path.join(robotreviewer.DATA_ROOT,
                               'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)
        with open(os.path.join(robotreviewer.DATA_ROOT,
                               'rct/svm_cnn_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn'] = pickle.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp or ptyp fields.
        If strict=True, then raises an exception when passed any
        contradictory data.

        Returns:
            1  = ptyp is RCT
            0  = ptyp is NOT RCT
            -1 = no ptyp information present
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpected value for 'use_ptyp'")

    def api_annotate(self, articles):
        # use the best performing models from the validation paper (in draft...)
        ensemble_type = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        # require a title and abstract; ptyp is optional
        if not all(("ti" in article and "ab" in article) for article in articles):
            raise Exception("RCT robot requires a full title and abstract")

        prepared_data = [{
            "title": r['ti'],
            "abstract": r['ab'],
            "ptyp": r.get('ptyp'),
            "use_ptyp": "ptyp" in r
        } for r in articles]

        preds = self.predict(prepared_data,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_class,
                             auto_use_ptyp=auto_use_ptyp)
        return preds

    def pdf_annotate(self, data):
        # use the best performing models from the validation paper (in draft...)
        ensemble_type = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on a sample of RCT abstracts + aiming for the 95th centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        preds = self.predict({"title": ti, "abstract": ab},
                             auto_use_ptyp=False)[0]

        structured_data = {
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value'],
            "model_class": preds['model'],
            "threshold_type": preds['threshold_type']
        }
        data.ml["rct"] = structured_data
        return data

    def predict(self, X, get_raw=False, ensemble_type="svm_cnn",
                threshold_type="sensitive", auto_use_ptyp=True):
        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        preds_l = {}

        # calculate ptyp for all
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0
        preds_l['ptyp'] = ptyp_scale

        # thresholds vary per article
        thresholds_all = {k: [] for k in ['precise', 'balanced', 'sensitive']}
        for t in ['precise', 'balanced', 'sensitive']:
            for r in pt_mask:
                if r != -1:
                    thresholds_all[t].append(self.constants['thresholds'][
                        "{}_ptyp".format(ensemble_type)][t])
                else:
                    thresholds_all[t].append(
                        self.constants['thresholds'][ensemble_type][t])

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = ['{}\n\n{}'.format(article.get('title', ''),
                                      article.get('abstract', ''))
                    for article in X]

        if "svm" in ensemble_type:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']
            preds_l['svm'] = svm_scale
            preds_l['svm_ptyp'] = preds_l['svm'] + preds_l['ptyp']
            preds_l['svm_raw'] = svm_preds.tolist()

        if "cnn" in ensemble_type:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = []
            for i, clf in enumerate(self.cnn_clfs):
                cnn_preds.append(clf.predict(X_cnn).T[0])
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']
            preds_l['cnn'] = np.mean(cnn_scale, axis=0)
            preds_l['cnn_raw'] = cnn_preds.T.tolist()
            preds_l['cnn_ptyp'] = preds_l['cnn'] + preds_l['ptyp']

        if ensemble_type == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']] + (
                [self.constants['scales']['cnn']['weight']] * len(self.cnn_clfs))
            preds_l['svm_cnn'] = np.average(np.vstack([svm_scale, cnn_scale]),
                                            axis=0, weights=weights)
            preds_l['svm_cnn_ptyp'] = preds_l['svm_cnn'] + preds_l['ptyp']

            # if svm_cnn then we can also produce calibrated probabilities
            X_calib = np.hstack([svm_preds.reshape(-1, 1),
                                 cnn_preds.T,
                                 pt_mask.reshape(-1, 1)])
            probs = []
            for r in X_calib:
                if r[11] != -1:
                    probs.append(self.calibration_lr['svm_cnn'].predict_proba(
                        [r[:11]])[0][1])
                else:
                    probs.append(self.calibration_lr['svm_cnn_ptyp'].predict_proba(
                        [r])[0][1])
            preds_l['probability'] = probs

        if get_raw:
            return {"svm": svm_preds, "cnn": cnn_preds, "ptyp": pt_mask}

        preds_d = [dict(zip(preds_l, i)) for i in zip(*preds_l.values())]

        out = []
        thresholds_T = [dict(zip(thresholds_all, t))
                        for t in zip(*thresholds_all.values())]
        # i.e. https://stackoverflow.com/questions/5558418/list-of-dicts-to-from-dict-of-lists

        for pred, threshold, used_ptyp in zip(preds_d, thresholds_T, pt_mask):
            row = {}
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(ensemble_type)
            else:
                row['model'] = ensemble_type
            row['score'] = float(pred[row['model']])
            row['threshold_type'] = threshold_type
            row['threshold_value'] = float(threshold[threshold_type])
            row['is_rct'] = bool(row['score'] >= threshold[threshold_type])
            row['is_rct_precise'] = bool(row['score'] >= threshold['precise'])
            row['is_rct_balanced'] = bool(row['score'] >= threshold['balanced'])
            row['is_rct_sensitive'] = bool(row['score'] >= threshold['sensitive'])
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type": "Trial Design",
            "title": "Is an RCT?",
            "annotations": [],
            "description": "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia

    def predict_ris(self, ris_data, ensemble_type="svm_cnn",
                    threshold_type='sensitive', auto_use_ptyp=False):
        simplified = [ris.simplify(article) for article in ris_data]
        preds = self.predict(simplified,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_type,
                             auto_use_ptyp=auto_use_ptyp)
        return preds

    def filter_articles(self, ris_string, ensemble_type="svm_cnn",
                        threshold_type='sensitive', auto_use_ptyp=True,
                        remove_non_rcts=True):
        print('Parsing RIS data')
        ris_data = ris.loads(ris_string)

        # debug dump of the parsed RIS data (json is imported at module level)
        with open("debug.json", 'w') as f:
            json.dump(ris_data, f)

        preds = self.predict_ris(ris_data,
                                 ensemble_type=ensemble_type,
                                 threshold_type=threshold_type,
                                 auto_use_ptyp=auto_use_ptyp)
        out = []
        pred_key_map = {
            "score": "ZS",
            "model": "ZM",
            "threshold_type": "ZT",
            "threshold_value": "ZC",
            "is_rct": "ZR",
            "ptyp_rct": "ZP"
        }
        for ris_row, pred_row in zip(ris_data, preds):
            if remove_non_rcts == False or pred_row['is_rct']:
                # copy over only the prediction fields that have an RIS tag defined
                ris_row.update({pred_key_map[k]: v
                                for k, v in pred_row.items()
                                if k in pred_key_map})
                out.append(ris_row)
        return ris.dumps(out)
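# Usage sketch (illustrative only, not part of the original source). `api_annotate`
# takes a list of records keyed 'ti'/'ab' (with optional 'ptyp' MeSH tags), while
# `filter_articles` takes a raw RIS export and returns an RIS string containing only
# the predicted RCTs, tagged with the ZS/ZM/ZT/ZC/ZR/ZP fields defined above.
# The file name 'search_export.ris' is a placeholder.
if __name__ == '__main__':
    rct_bot = RCTRobot()
    records = [{"ti": "A randomized trial of treatment X versus placebo",
                "ab": "We randomly assigned 200 participants...",
                "ptyp": ["Randomized Controlled Trial"]}]
    for pred in rct_bot.api_annotate(records):
        print(pred["model"], round(pred["score"], 3), pred["is_rct"])

    with open('search_export.ris') as f:
        filtered_ris = rct_bot.filter_articles(f.read())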
class RCTRobot:

    def __init__(self):
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [load_model(cnn_weight_file)
                         for cnn_weight_file in cnn_weight_files]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(
            vocab_map_file=os.path.join(robotreviewer.DATA_ROOT,
                                        'rct/cnn_vocab_map.pck'),
            stop_words='english')

        with open(os.path.join(robotreviewer.DATA_ROOT,
                               'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp or ptyp fields.
        If strict=True, then raises an exception when passed any
        contradictory data.

        Returns:
            1  = ptyp is RCT
            0  = ptyp is NOT RCT
            -1 = no ptyp information present
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpected value for 'use_ptyp'")

    ###
    # Annotate function
    ##
    def annotate(self, data, filename):
        # use the best performing models from the validation paper (in draft...)

        # initialize empty output structure
        structured_data = {
            "filename": filename,
            "is_rct": -1,
            "decision_score": -1
        }
        filter_class = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on a sample of RCT abstracts + aiming for the 95th centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return structured_data

        preds = self.predict({"title": ti, "abstract": ab},
                             auto_use_ptyp=False)[0]

        structured_data.update({
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value']
        })
        print(structured_data)
        return structured_data

    ###
    # Predict function
    ##
    def predict(self, X, filter_class="svm", filter_type="sensitive",
                auto_use_ptyp=True):
        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        # calculate ptyp for all
        # ptyp = np.copy(pt_mask)
        # ptyp = np.array([(article.get('rct_ptyp') == True) * 1. for article in X])
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0

        # thresholds vary per article
        thresholds = []
        for r in pt_mask:
            if r != -1:
                thresholds.append(self.constants['thresholds'][
                    "{}_ptyp".format(filter_class)][filter_type])
            else:
                thresholds.append(
                    self.constants['thresholds'][filter_class][filter_type])

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = ['{}\n\n{}'.format(article.get('title', ''),
                                      article.get('abstract', ''))
                    for article in X]

        if "svm" in filter_class:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = []
            for i, clf in enumerate(self.cnn_clfs):
                print('\t{} of {}'.format(i + 1, len(self.cnn_clfs)))
                cnn_preds.append(clf.predict(X_cnn).T[0])
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "cnn":
            y_preds = np.mean(cnn_scale, axis=0)
        elif filter_class == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']] + (
                [self.constants['scales']['cnn']['weight']] * len(self.cnn_clfs))
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0, weights=weights)

        y_preds += ptyp_scale

        out = []
        for pred, threshold, used_ptyp in zip(y_preds, thresholds, pt_mask):
            row = {}
            row['score'] = float(pred)
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(filter_class)
            else:
                row['model'] = filter_class
            row['threshold_type'] = filter_type
            row['threshold_value'] = float(threshold)
            row['is_rct'] = bool(pred >= threshold)
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type": "Trial Design",
            "title": "Is an RCT?",
            "annotations": [],
            "description": "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
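# Minimal sketch (an assumption-labelled illustration, not part of the original
# source) of the scoring rule the `predict` methods above implement: z-score each
# model's raw output with the stored mean/std, combine the SVM and CNN z-scores as
# a weighted average (one SVM weighted 10.0 vs ~1.67 per CNN), add the scaled ptyp
# indicator when it is available, and compare the result against the chosen threshold.
import numpy as np

def ensemble_score(svm_raw, cnn_raw, ptyp, scales):
    # standardise each model's raw output
    svm_z = (svm_raw - scales['svm']['mean']) / scales['svm']['std']
    cnn_z = (np.asarray(cnn_raw) - scales['cnn']['mean']) / scales['cnn']['std']
    # one weight for the single SVM, one per CNN in the ensemble
    weights = [scales['svm']['weight']] + [scales['cnn']['weight']] * len(cnn_z)
    score = np.average(np.concatenate([[svm_z], cnn_z]), weights=weights)
    if ptyp != -1:  # -1 means no publication-type information was available
        score += (ptyp - scales['ptyp']['mean']) / scales['ptyp']['std']
    return score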