def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """

        self.sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2),
                                     n_features=2**26)

        self.bias_domains = [
            'Random sequence generation', 'Allocation concealment',
            'Blinding of participants and personnel',
            'Blinding of outcome assessment', 'Incomplete outcome data',
            'Selective reporting'
        ]

        self.top_k = top_k
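
For illustration, here is a minimal sketch of how top-k recall could be applied with a sentence-level classifier like the one above: score every sentence and keep the k highest-scoring ones. The `vectorize` callable is a hypothetical stand-in for the ModularVectorizer pipeline, whose exact usage is not shown in this snippet.

import numpy as np

def top_k_sentences(sentences, sent_clf, vectorize, top_k=3):
    """Return the top_k sentences ranked by classifier decision score."""
    X = vectorize(sentences)                 # sparse matrix, one row per sentence
    scores = sent_clf.decision_function(X)   # higher score = more relevant
    ranked = np.argsort(scores)[::-1]        # sentence indices, best first
    return [sentences[i] for i in ranked[:top_k]]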
Example 2
    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.

        """
        self.top_k = top_k

        self.bias_domains = {
            'RSG': 'Random sequence generation',
            'AC': 'Allocation concealment',
            'BPP': 'Blinding of participants and personnel',
            'BOA': 'Blinding of outcome assessment',
            'IOD': 'Incomplete outcome data',
            'SR': 'Selective reporting'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()
        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)
            with open(vectorizer_loc, 'rb') as f:
                preprocessor = pickle.load(f)
            self.CNN_models[bias_domain] = RationaleCNN(
                preprocessor,
                document_model_architecture_path=arch_loc,
                document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None,
                                            non_negative=True,
                                            binary=True,
                                            ngram_range=(1, 2),
                                            n_features=2**26)
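
The comment block above only sketches the ensembling strategy, so here is a minimal illustration of the idea: average the document-level probabilities from the two model families. The `cnn_prob` and `linear_prob` callables are hypothetical stand-ins for the RationaleCNN and MiniClassifier prediction calls, whose exact signatures are not shown here.

def ensemble_bias_prob(doc_text, domain, cnn_prob, linear_prob):
    """Combine CNN and linear 'risk of bias' probabilities for one domain."""
    p_cnn = cnn_prob(doc_text, domain)        # rationaleCNN prediction
    p_linear = linear_prob(doc_text, domain)  # JAMIA linear multi-task prediction
    return (p_cnn + p_linear) / 2.0           # simple unweighted ensemble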
Example 3
    def __init__(self, top_k=2, min_k=1):
        """
        In most cases, a fixed number of sentences (top_k) will be
        returned for each document, *except* when the decision
        scores are below a threshold (i.e. the implication being
        that none of the sentences are relevant).

        top_k = the default number of sentences to retrieve per
                document
        min_k = ensure that at least min_k sentences are
                always returned
        """

        logging.debug("Loading PICO classifiers")

        self.P_clf = MiniClassifier(robotreviewer.get_data("pico/P_model.npz"))
        self.I_clf = MiniClassifier(robotreviewer.get_data("pico/I_model.npz"))
        self.O_clf = MiniClassifier(robotreviewer.get_data("pico/O_model.npz"))

        logging.debug("PICO classifiers loaded")

        logging.debug("Loading IDF weights")
        # Each .npz stores a pickled sparse IDF vector; .item() unwraps it,
        # .todense().A1 flattens it, and diags(..., 0) places it on a matrix
        # diagonal so term columns can be rescaled by their IDF weights.
        with open(robotreviewer.get_data("pico/P_idf.npz"), 'rb') as f:
            self.P_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        with open(robotreviewer.get_data("pico/I_idf.npz"), 'rb') as f:
            self.I_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        with open(robotreviewer.get_data("pico/O_idf.npz"), 'rb') as f:
            self.O_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        logging.debug("IDF weights loaded")

        self.vec = PICO_vectorizer()
        self.models = [self.P_clf, self.I_clf, self.O_clf]
        self.idfs = [self.P_idf, self.I_idf, self.O_idf]
        self.PICO_domains = ["Population", "Intervention", "Outcomes"]

        # if config.USE_METAMAP:
        #     self.metamap = MetaMap.get_instance()

        self.top_k = top_k
        self.min_k = min_k
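
A minimal sketch of the selection rule described in the docstring: keep up to top_k sentences whose decision scores clear a threshold, but always return at least min_k. The threshold of 0.0 is illustrative only, not a calibrated value.

import numpy as np

def select_sentences(scores, top_k=2, min_k=1, threshold=0.0):
    """Pick sentence indices by score, honouring top_k and min_k."""
    order = np.argsort(scores)[::-1]              # best-scoring first
    above = [i for i in order if scores[i] > threshold]
    chosen = above[:top_k]
    if len(chosen) < min_k:                       # guarantee a minimum
        chosen = list(order[:min_k])
    return chosen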
Example 4
    def __init__(self):
        self.svm_clf = MiniClassifier(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        cnn_weight_files = glob.glob(os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        json_filename = os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_cnn_structure.json')
        self.cnn_clfs = [get_model(json_filename, cnn_weight_file) for cnn_weight_file in cnn_weight_files]
        self.svm_vectorizer = HashingVectorizer(binary=False, ngram_range=(1, 1), stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck'))

        # Weights reflect the ensemble composition: a single SVM model (it gives
        # near-identical results to bagging 10) and 6 CNN models (faster, with no
        # further variance reduction from adding more).
        self.scale_constants = {
            'cnn': {'mean': 0.15592811611054261,
                    'std': 0.22405916984696986,
                    'weight': 1.6666666666666667},
            'ptyp': {'mean': 0.055155532891381948,
                     'std': 0.22828359573751594},
            'svm': {'mean': -0.75481403525485891,
                    'std': 0.7812955939364481,
                    'weight': 10.0}}

        # All 'precise' thresholds have been calibrated to 97.6% sensitivity.
        self.thresholds = {
            'cnn': {'precise': 2.1340457758193034,
                    'sensitive': -0.076709540491855063},
            'cnn_ptyp': {'precise': 3.529609848417909,
                         'sensitive': 0.083502632442633312},
            'svm': {'precise': 1.9185522606237164,
                    'sensitive': 0.093273630980694439},
            'svm_cnn': {'precise': 1.8749128673557529,
                        'sensitive': 0.064481902000491614},
            'svm_cnn_ptyp': {'precise': 3.7674045603568755,
                             'sensitive': 0.1952449060483534},
            'svm_ptyp': {'precise': 3.7358855328111837,
                         'sensitive': 0.42992224964656178}}
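
How these calibration constants are consumed is not shown in this snippet; the sketch below is one plausible reading, in which each model's raw output is z-scored with its stored mean/std and the results are combined with the stored weights.

def combined_score(raw_scores, scale_constants, models=('svm', 'cnn')):
    """Standardize each model's raw score and take the weighted mean."""
    total, weight_sum = 0.0, 0.0
    for name in models:
        c = scale_constants[name]
        z = (raw_scores[name] - c['mean']) / c['std']  # z-score the raw output
        w = c.get('weight', 1.0)                       # 'ptyp' has no weight entry
        total += w * z
        weight_sum += w
    return total / weight_sum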
Example 5
 def __init__(self):
     # Keras is imported lazily here so that importing this module stays cheap;
     # the `global` statements promote the names to module scope for reuse.
     from keras.preprocessing import sequence
     from keras.models import load_model, Sequential, Model
     from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
     from keras.layers import Embedding
     from keras.layers import Convolution1D, MaxPooling1D
     from keras import backend as K
     from keras.regularizers import l2
     global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
     global Embedding, Convolution1D, MaxPooling1D, K, Model, l2
     self.svm_clf = MiniClassifier(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
     cnn_weight_files = glob.glob(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
     self.cnn_clfs = [
         load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
     ]
     self.svm_vectorizer = HashingVectorizer(binary=False,
                                             ngram_range=(1, 1),
                                             stop_words='english')
     self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
         robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                           stop_words='english')
     with open(
             os.path.join(robotreviewer.DATA_ROOT,
                          'rct/rct_model_calibration.json'), 'r') as f:
         self.constants = json.load(f)
Example 6
    def __init__(self, top_k=None):
        
        self.top_k = top_k

        self.bias_domains = {'RSG': 'Random sequence generation',
                             'AC': 'Allocation concealment',
                             'BPP': 'Blinding of participants and personnel',
                             'BOA': 'Blinding of outcome assessment'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()

        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)
            with open(vectorizer_loc, 'rb') as f:
                preprocessor = pickle.load(f)

            # With oov_token unset, words outside the training vocabulary are
            # simply dropped rather than mapped to a shared OOV index.
            preprocessor.tokenizer.oov_token = None

            self.CNN_models[bias_domain] = RationaleCNN(
                preprocessor,
                document_model_architecture_path=arch_loc,
                document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None, non_negative=True, binary=True,
                                            ngram_range=(1, 2), n_features=2**26)
Example 7
 def __init__(self):
     self.svm_clf = MiniClassifier(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
     cnn_weight_files = glob.glob(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
     self.cnn_clfs = [
         load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
     ]
     self.svm_vectorizer = HashingVectorizer(binary=False,
                                             ngram_range=(1, 1),
                                             stop_words='english')
     self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
         robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                           stop_words='english')
     with open(
             os.path.join(robotreviewer.DATA_ROOT,
                          'rct/rct_model_calibration.json'), 'r') as f:
         self.constants = json.load(f)
Example 8
    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """

        self.doc_clf = MiniClassifier(
            robotreviewer.get_data(os.path.join('bias_ab', 'bias_ab.npz')))
        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2))
        self.bias_domains = [
            'random_sequence_generation', 'allocation_concealment',
            'blinding_participants_personnel'
        ]
        self.top_k = top_k
Example 9
class TestMiniClassifier(unittest.TestCase):

    doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
    util = Utilities()

    def test_init(self):
        ''' test for MiniClassifier.__init__() '''
        self.assertTrue(isinstance(self.doc_clf.coef, np.ndarray))
        self.assertTrue(isinstance(self.doc_clf.intercept, float))

    def test_decision_function(self):
        ''' test for MiniClassifier.decision_function(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        dec = self.doc_clf.decision_function(X)  # [ 1.50563252]
        decTest = np.array([1.50563252])
        # np.array_equal / np.array_equiv would fail here: the values are
        # floats, so we compare with a tolerance via np.allclose instead.
        self.assertTrue(np.allclose(dec, decTest))

    def test_predict(self):
        ''' test for MiniClassifier.predict(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        pred = self.doc_clf.predict(X)  # [1]
        self.assertEqual(pred, 1)  # np.int was removed in NumPy >= 1.24

    def test_predict_proba(self):
        ''' tests for MiniClassifier.predict_proba(X) '''
        with open(ex_path + "rationale_robot_data.json", "r",
                  encoding="utf-8") as data:
            data = json.load(data)
        bpl = data["bias_prob_linear"]
        X = self.util.load_sparse_csr("X_data.npz")
        bpl_test = self.doc_clf.predict_proba(X)[0]
        self.assertTrue(abs(bpl - bpl_test) < 0.01)
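
These tests rely on Utilities.load_sparse_csr, which is not shown. A typical implementation of such a helper rebuilds a scipy CSR matrix from arrays saved with np.savez; the actual Utilities version may differ.

import numpy as np
from scipy.sparse import csr_matrix

def load_sparse_csr(filename):
    """Reconstruct a CSR matrix from a .npz produced by np.savez."""
    loader = np.load(filename, allow_pickle=True)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])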