Code example #1
class RCTRobot:
    def __init__(self):
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        json_filename = os.path.join(robotreviewer.DATA_ROOT,
                                     'rct/rct_cnn_structure.json')
        self.cnn_clfs = [
            get_model(json_filename, cnn_weight_file)
            for cnn_weight_file in cnn_weight_files
        ]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck'))

        self.scale_constants = {
            'cnn': {
                'mean': 0.15592811611054261,
                'std': 0.22405916984696986,
                'weight': 1.6666666666666667
            },
            'ptyp': {
                'mean': 0.055155532891381948,
                'std': 0.22828359573751594
            },
            'svm': {
                'mean': -0.75481403525485891,
                'std': 0.7812955939364481,
                'weight': 10.0
            }
        }  # weights used in the mean: we use only 1 SVM model (it produces
        # near-identical results to binning 10) and 6 CNN models (faster than
        # running more, with no further reduction in variance from extra models)
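        # For intuition: with weights [10.0] + [1.6667] * 6, np.average gives
        # the single SVM roughly half of the total weight
        # (10 / (10 + 6 * 1.6667) ≈ 0.5), i.e. about as much influence as the
        # six CNNs combined.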

        self.thresholds = {
            'cnn': {
                'precise': 2.1340457758193034,
                'sensitive': -0.076709540491855063
            },
            'cnn_ptyp': {
                'precise': 3.529609848417909,
                'sensitive': 0.083502632442633312
            },
            'svm': {
                'precise': 1.9185522606237164,
                'sensitive': 0.093273630980694439
            },
            'svm_cnn': {
                'precise': 1.8749128673557529,
                'sensitive': 0.064481902000491614
            },
            'svm_cnn_ptyp': {
                'precise': 3.7674045603568755,
                'sensitive': 0.1952449060483534
            },
            'svm_ptyp': {
                'precise': 3.7358855328111837,
                'sensitive': 0.42992224964656178
            }
        }  # All precise models have been calibrated to 97.6% sensitivity
        # All sensitive models have been calibrated to 99.1% sensitivity

    def annotate(self, data):

        # use the best performing models from the validation paper (in draft...)
        filter_class = "svm_cnn_ptyp"
        threshold_class = "precise"

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        if "pubmed" in data.data:
            ptyp = 1.0
        else:
            ptyp = 0.0

        X_ti_str = [ti]
        X_ab_str = ['{}\n\n{}'.format(ti, ab)]

        if "svm" in filter_class:

            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))

            svm_preds = self.svm_clf.decision_function(hstack([X_ti, X_ab]))
            svm_scale = (svm_preds - self.scale_constants['svm']['mean']
                         ) / self.scale_constants['svm']['std']

        if "ptyp" in filter_class:
            ptyp = np.array([ptyp])
            ptyp_scale = (ptyp - self.scale_constants['ptyp']['mean']
                          ) / self.scale_constants['ptyp']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = [clf.predict(X_cnn).T[0] for clf in self.cnn_clfs]
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.scale_constants['cnn']['mean']
                         ) / self.scale_constants['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "svm_ptyp":
            y_preds = svm_scale + ptyp_scale
        elif filter_class == "ptyp":
            y_preds = ptyp_scale
        elif filter_class == "svm_cnn_ptyp":
            weights = [self.scale_constants['svm']['weight']] + (
                [self.scale_constants['cnn']['weight']] * len(self.cnn_clfs))
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0,
                                 weights=weights) + ptyp_scale

        structured_data = {
            "is_rct":
            bool(y_preds[0] > self.thresholds[filter_class][threshold_class]),
            "decision_score":
            y_preds[0],
            "model_class":
            filter_class
        }

        data.ml["rct"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
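
A minimal usage sketch for the class above, assuming the RobotReviewer data files referenced in __init__ are installed; the Doc stand-in and the example title/abstract are hypothetical, since the real pipeline passes its own document object with .data and .ml attributes.

class Doc(dict):
    """Illustrative stand-in for RobotReviewer's internal document object."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data = dict(self)  # raw source record (checked for a "pubmed" key)
        self.ml = {}            # annotate() writes its output here

doc = Doc(title="Aspirin versus placebo for ...",
          abstract="We randomly assigned participants to ...")
annotated = RCTRobot().annotate(doc)
print(annotated.ml["rct"])  # {'is_rct': ..., 'decision_score': ..., 'model_class': 'svm_cnn_ptyp'}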
Code example #2
class BiasRobot:

    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """


        self.sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26)

        self.bias_domains = ['Random sequence generation', 'Allocation concealment', 'Blinding of participants and personnel', 'Blinding of outcome assessment', 'Incomplete outcome data', 'Selective reporting']

        self.top_k = top_k

    def pdf_annotate(self, data, top_k=None):

        """
        Annotate full text of clinical trial report
        `top_k` can be overridden here, else defaults to the class
        default set in __init__
        """
        if top_k is None:
            top_k = self.top_k


        doc_text = data.get('parsed_text')

        if not doc_text:
            # we've got to know the text at least..
            return data

        doc_len = len(data['text'])



        marginalia = []

        doc_sents = [sent.text for sent in doc_text.sents]
        doc_sent_start_i = [sent.start_char for sent in doc_text.sents]
        doc_sent_end_i = [sent.end_char for sent in doc_text.sents]

        structured_data = []

        for domain in self.bias_domains:

            doc_domains = [domain] * len(doc_sents)
            doc_X_i = zip(doc_sents, doc_domains)

            #
            # build up sentence feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs(doc_sents)

            # uni-bigrams/domain interactions
            self.vec.builder_add_docs(doc_X_i)

            doc_sents_X = self.vec.builder_transform()
            doc_sents_preds = self.sent_clf.decision_function(doc_sents_X)

            high_prob_sent_indices = np.argsort(doc_sents_preds)[:-top_k-1:-1] # top k, with the no. 1 (highest scoring) sentence first
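            # e.g. np.argsort([0.2, 0.9, 0.1, 0.5])[:-3:-1] -> array([1, 3]) when top_k == 2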
            high_prob_sents = [doc_sents[i] for i in high_prob_sent_indices]
            high_prob_start_i = [doc_sent_start_i[i] for i in high_prob_sent_indices]
            high_prob_end_i = [doc_sent_end_i[i] for i in high_prob_sent_indices]
            high_prob_prefixes = [doc_text.text[max(0, offset-20):offset] for offset in high_prob_start_i]
            high_prob_suffixes = [doc_text.text[offset: min(doc_len, offset+20)] for offset in high_prob_end_i]
            high_prob_sents_j = " ".join(high_prob_sents)
            sent_domain_interaction = "-s-" + domain

            #
            # build up document feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs([doc_text.text])

            # uni-bigrams/domain interaction
            self.vec.builder_add_docs([(doc_text.text, domain)])

            # uni-bigrams/relevance interaction
            self.vec.builder_add_docs([(high_prob_sents_j, sent_domain_interaction)])

            X = self.vec.builder_transform()

            bias_pred = self.doc_clf.predict(X)
            bias_class = ["high/unclear", "low"][bias_pred[0]]
            annotation_metadata = [{"content": sent[0],
                                    "position": sent[1],
                                    "uuid": str(uuid.uuid1()),
                                    "prefix": sent[2],
                                    "suffix": sent[3]} for sent in zip(high_prob_sents, high_prob_start_i,
                                       high_prob_prefixes,
                                       high_prob_suffixes)]

            structured_data.append({
                "domain": domain,
                "judgement": bias_class,
                "annotations": annotation_metadata})
        data.ml["bias"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = []

        for row in data['bias']:
            marginalia.append({
                        "type": "Risk of Bias",
                        "title": row['domain'],
                        "annotations": row['annotations'],
                        "description": "**Overall risk of bias prediction**: {}".format(row['judgement'])
                        })
        return marginalia

    @staticmethod
    def get_domains():
        return [u'Random sequence generation',
                u'Allocation concealment',
                u'Blinding of participants and personnel',
                u'Blinding of outcome assessment',
                u'Incomplete outcome data',
                u'Selective reporting']
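
A minimal usage sketch for BiasRobot, assuming a spaCy pipeline that sets sentence boundaries and the bias model files are installed; the Doc stand-in and the input file name are hypothetical, since the real pipeline passes its own document object with an .ml attribute.

import spacy

class Doc(dict):
    """Illustrative stand-in for RobotReviewer's internal document object."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ml = {}  # pdf_annotate() writes its output here

nlp = spacy.load("en_core_web_sm")           # any pipeline with sentence boundaries
full_text = open("trial_report.txt").read()  # plain text of a trial report (hypothetical path)

doc = Doc(text=full_text, parsed_text=nlp(full_text))
annotated = BiasRobot(top_k=3).pdf_annotate(doc)
for row in annotated.ml["bias"]:
    print(row["domain"], "->", row["judgement"])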
Code example #3
class RCTRobot:
    def __init__(self):
        from keras.preprocessing import sequence
        from keras.models import load_model
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        from keras.layers import Embedding
        from keras.layers import Convolution1D, MaxPooling1D
        from keras import backend as K
        from keras.models import Model
        from keras.regularizers import l2
        global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        global Embedding, Convolution1D, MaxPooling1D, K, Model, l2
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [
            load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
        ]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                              stop_words='english')
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

        self.calibration_lr = {}
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)

        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/svm_cnn_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn'] = pickle.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp
        or ptyp fields.
        If strict=True, then raises exception when passed any
        contradictory data
        Returns: 1 = ptyp is RCT
                 0 = ptyp is NOT RCT
                 -1 = no ptyp information present
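        Example (illustrative): {"use_ptyp": True, "ptyp": ["D016449"]} -> 1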
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpected value for 'use_ptyp'")

    def api_annotate(self, articles):

        # use the best performing models from the validation paper (in draft...)
        ensemble_type = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        # require a title and abstract. ptyp optional

        if not all("ti" in article and "ab" in article
                   for article in articles):
            raise Exception("RCT robot requires a full title and abstract")

        prepared_data = [{
            "title": r['ti'],
            "abstract": r['ab'],
            "ptyp": r.get('ptyp'),
            "use_ptyp": "ptyp" in r
        } for r in articles]
        preds = self.predict(prepared_data,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_class,
                             auto_use_ptyp=auto_use_ptyp)

        return preds

    def pdf_annotate(self, data):

        # use the best performing models from the validation paper (in draft...)
        ensemble_type = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        preds = self.predict({
            "title": ti,
            "abstract": ab
        },
                             auto_use_ptyp=False)[0]

        structured_data = {
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value'],
            "model_class": preds['model'],
            "threshold_type": preds['threshold_type']
        }

        data.ml["rct"] = structured_data
        return data

    def predict(self,
                X,
                get_raw=False,
                ensemble_type="svm_cnn",
                threshold_type="sensitive",
                auto_use_ptyp=True):

        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        preds_l = {}
        # calculate ptyp for all
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0
        preds_l['ptyp'] = ptyp_scale

        # thresholds vary per article
        thresholds_all = {k: [] for k in ['precise', 'balanced', 'sensitive']}

        for t in ['precise', 'balanced', 'sensitive']:
            for r in pt_mask:
                if r != -1:
                    thresholds_all[t].append(self.constants['thresholds'][
                        "{}_ptyp".format(ensemble_type)][t])
                else:
                    thresholds_all[t].append(
                        self.constants['thresholds'][ensemble_type][t])

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = [
            '{}\n\n{}'.format(article.get('title', ''),
                              article.get('abstract', '')) for article in X
        ]

        if "svm" in ensemble_type:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']
            preds_l['svm'] = svm_scale
            preds_l['svm_ptyp'] = preds_l['svm'] + preds_l['ptyp']
            preds_l['svm_raw'] = svm_preds.tolist()

        if "cnn" in ensemble_type:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = []
            for i, clf in enumerate(self.cnn_clfs):
                cnn_preds.append(clf.predict(X_cnn).T[0])

            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']
            preds_l['cnn'] = np.mean(cnn_scale, axis=0)
            preds_l['cnn_raw'] = cnn_preds.T.tolist()
            preds_l['cnn_ptyp'] = preds_l['cnn'] + preds_l['ptyp']

        if ensemble_type == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']
                       ] + ([self.constants['scales']['cnn']['weight']] *
                            len(self.cnn_clfs))
            preds_l['svm_cnn'] = np.average(np.vstack([svm_scale, cnn_scale]),
                                            axis=0,
                                            weights=weights)
            preds_l['svm_cnn_ptyp'] = preds_l['svm_cnn'] + preds_l['ptyp']

            # if svm_cnn then we can have probabilities

            X_calib = np.hstack([
                svm_preds.reshape(-1, 1), cnn_preds.T,
                pt_mask.reshape(-1, 1)
            ])
            probs = []

            for r in X_calib:

                if r[11] != -1:
                    probs.append(
                        self.calibration_lr['svm_cnn_ptyp'].predict_proba(
                            [r])[0][1])
                else:
                    probs.append(self.calibration_lr['svm_cnn'].predict_proba(
                        [r[:11]])[0][1])

            preds_l['probability'] = probs

        if get_raw:
            return {"svm": svm_preds, "cnn": cnn_preds, "ptyp": pt_mask}

        preds_d = [dict(zip(preds_l, i)) for i in zip(*preds_l.values())]

        out = []

        thresholds_T = [
            dict(zip(thresholds_all, t)) for t in zip(*thresholds_all.values())
        ]
        # i.e. https://stackoverflow.com/questions/5558418/list-of-dicts-to-from-dict-of-lists
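        # e.g. {"precise": [1.9, 3.5], "sensitive": [0.1, 0.2]}
        #      -> [{"precise": 1.9, "sensitive": 0.1}, {"precise": 3.5, "sensitive": 0.2}]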

        for pred, threshold, used_ptyp in zip(preds_d, thresholds_T, pt_mask):
            row = {}
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(ensemble_type)
            else:
                row['model'] = ensemble_type
            row['score'] = float(pred[row['model']])
            row['threshold_type'] = threshold_type
            row['threshold_value'] = float(threshold[threshold_type])
            row['is_rct'] = bool(row['score'] >= threshold[threshold_type])
            row['is_rct_precise'] = bool(row['score'] >= threshold['precise'])
            row['is_rct_balanced'] = bool(
                row['score'] >= threshold['balanced'])
            row['is_rct_sensitive'] = bool(
                row['score'] >= threshold['sensitive'])
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia

    def predict_ris(self,
                    ris_data,
                    ensemble_type="svm_cnn",
                    threshold_type='sensitive',
                    auto_use_ptyp=False):

        simplified = [ris.simplify(article) for article in ris_data]
        preds = self.predict(simplified,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_type,
                             auto_use_ptyp=auto_use_ptyp)
        return preds

    def filter_articles(self,
                        ris_string,
                        ensemble_type="svm_cnn",
                        threshold_type='sensitive',
                        auto_use_ptyp=True,
                        remove_non_rcts=True):

        print('Parsing RIS data')
        ris_data = ris.loads(ris_string)
        with open("debug.json", 'w') as f:
            json.dump(ris_data, f)
        preds = self.predict_ris(ris_data,
                                 ensemble_type=ensemble_type,
                                 threshold_type=threshold_type,
                                 auto_use_ptyp=auto_use_ptyp)
        out = []

        pred_key_map = {
            "score": "ZS",
            "model": "ZM",
            "threshold_type": "ZT",
            "threshold_value": "ZC",
            "is_rct": "ZR",
            "ptyp_rct": "ZP"
        }

        for ris_row, pred_row in zip(ris_data, preds):
            if not remove_non_rcts or pred_row['is_rct']:
                ris_row.update(
                    {pred_key_map[k]: v
                     for k, v in pred_row.items()})

                out.append(ris_row)
        return ris.dumps(out)
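
A minimal usage sketch for the api_annotate entry point above, assuming the RCT model and calibration files are installed; the article contents are hypothetical. Articles carrying a ptyp field are scored with the corresponding _ptyp ensemble and threshold, the others with the plain svm_cnn ensemble.

articles = [
    {"ti": "Aspirin versus placebo for ...",
     "ab": "We randomly assigned participants to ...",
     "ptyp": ["Randomized Controlled Trial"]},  # MEDLINE publication types, if known
    {"ti": "A retrospective cohort study of ...",
     "ab": "We reviewed the records of ..."},   # no ptyp field supplied
]
for pred in RCTRobot().api_annotate(articles):
    print(pred["model"], round(pred["score"], 2), pred["is_rct"])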
Code example #4
class RCTRobot:
    def __init__(self):
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [
            load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
        ]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                              stop_words='english')
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp
        or ptyp fields.
        If strict=True, then raises exception when passed any
        contradictory data
        Returns: 1 = ptyp is RCT
                 0 = ptyp is NOT RCT
                 -1 = no ptyp information present
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpcted value for 'use_ptyp'")

    ###
    # Annotate function
    ##
    def annotate(self, data, filename):

        # use the best performing models from the validation paper (in draft...)

        # initialize empty output structure
        structured_data = {
            "filename": filename,
            "is_rct": -1,
            "decision_score": -1
        }
        filter_class = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return structured_data

        preds = self.predict({
            "title": ti,
            "abstract": ab
        },
                             auto_use_ptyp=False)[0]

        structured_data.update({
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value']
        })
        print(structured_data)
        return structured_data

    ###
    # Predict function
    ##
    def predict(self,
                X,
                filter_class="svm",
                filter_type="sensitive",
                auto_use_ptyp=True):

        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        # calculate ptyp for all
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0

        # thresholds vary per article
        thresholds = []
        for r in pt_mask:
            if r != -1:
                thresholds.append(self.constants['thresholds'][
                    "{}_ptyp".format(filter_class)][filter_type])
            else:
                thresholds.append(
                    self.constants['thresholds'][filter_class][filter_type])

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = [
            '{}\n\n{}'.format(article.get('title', ''),
                              article.get('abstract', '')) for article in X
        ]

        if "svm" in filter_class:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = []
            for i, clf in enumerate(self.cnn_clfs):
                print('\t{} of {}'.format(i + 1, len(self.cnn_clfs)))
                cnn_preds.append(clf.predict(X_cnn).T[0])

            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "cnn":
            y_preds = np.mean(cnn_scale, axis=0)
        elif filter_class == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']
                       ] + ([self.constants['scales']['cnn']['weight']] *
                            len(self.cnn_clfs))
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0,
                                 weights=weights)

        y_preds += ptyp_scale

        out = []
        for pred, threshold, used_ptyp in zip(y_preds, thresholds, pt_mask):
            row = {}
            row['score'] = float(pred)
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(filter_class)
            else:
                row['model'] = filter_class
            row['threshold_type'] = filter_type
            row['threshold_value'] = float(threshold)
            row['is_rct'] = bool(pred >= threshold)
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
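
A minimal usage sketch for the predict method above, assuming the weight and calibration files are installed; the title and abstract are hypothetical. A single dict is also accepted and is wrapped into a list internally.

preds = RCTRobot().predict(
    [{"title": "Aspirin versus placebo for ...",
      "abstract": "We randomly assigned participants to ..."}],
    filter_class="svm_cnn",
    filter_type="balanced",
    auto_use_ptyp=False,
)
print(preds[0]["model"], preds[0]["score"], preds[0]["is_rct"])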