    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """

        self.sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2),
                                     n_features=2**26)

        self.bias_domains = [
            'Random sequence generation', 'Allocation concealment',
            'Blinding of participants and personnel',
            'Blinding of outcome assessment', 'Incomplete outcome data',
            'Selective reporting'
        ]

        self.top_k = top_k
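
# --- Illustrative sketch (not part of the original source): one way the
# `top_k` setting could be applied once sentence-level decision scores are
# available. The helper name and inputs are hypothetical; only the
# 'top-k recall' idea comes from the docstring above.
import numpy as np

def select_top_k_sentences(sentences, scores, top_k=3):
    """Return the top_k highest-scoring sentences, in document order."""
    top_idx = np.argsort(np.asarray(scores))[::-1][:top_k]
    return [sentences[i] for i in sorted(top_idx)]
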
    def __init__(self):
        raw_data = np.load(robotreviewer.get_data('pubmed/pubmed_title_hash_2016_07_24.npz'))
        self.vec_ti = csr_matrix((raw_data['data'], raw_data['indices'], raw_data['indptr']), raw_data['shape'])
        self.pmid_ind = np.load(robotreviewer.get_data('pubmed/pubmed_index_2016_07_24.npz'))['pmid_ind']
        self.vectorizer = HashingVectorizer(binary=True, stop_words='english')
        # load database
        self.connection = sqlite3.connect(robotreviewer.get_data('pubmed/pubmed_rcts_2016_07_24.sqlite'))
        self.c = self.connection.cursor()
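
# --- Illustrative sketch (not part of the original source): how the hashed
# title matrix `vec_ti` and the aligned `pmid_ind` array loaded above could
# be used to find the closest PubMed record for a query title. The method
# name is hypothetical and the real matching logic may differ.
def match_title_sketch(self, title):
    q = self.vectorizer.transform([title])          # hash the query title
    sims = self.vec_ti.dot(q.T).toarray().ravel()   # similarity to every stored title
    best_row = int(sims.argmax())
    return int(self.pmid_ind[best_row])             # PMID of the best match
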
    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.

        """
        self.bias_domains = ['Random sequence generation']
        self.top_k = top_k

        self.bias_domains = {
            'RSG': 'Random sequence generation',
            'AC': 'Allocation concealment',
            'BPP': 'Blinding of participants and personnel',
            'BOA': 'Blinding of outcome assessment',
            'IOD': 'Incomplete outcome data',
            'SR': 'Selective reporting'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()
        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)
            with open(vectorizer_loc, 'rb') as f:
                preprocessor = pickle.load(f)
            self.CNN_models[bias_domain] = RationaleCNN(
                preprocessor,
                document_model_architecture_path=arch_loc,
                document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None,
                                            non_negative=True,
                                            binary=True,
                                            ngram_range=(1, 2),
                                            n_features=2**26)
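
# --- Illustrative sketch (not part of the original source): the "simple
# ensembling approach" referred to in the comment above, combining the CNN
# and the linear (JAMIA) document-level probabilities for one bias domain.
# The equal weighting is an assumption for illustration only.
def ensemble_bias_prob(cnn_prob, linear_prob):
    """Average the two models' probabilities of low risk of bias."""
    return 0.5 * cnn_prob + 0.5 * linear_prob

# e.g. ensemble_bias_prob(0.8, 0.6) gives 0.7 (up to floating-point rounding)
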
    def __init__(self, top_k=None):
        
        self.bias_domains = ['Random sequence generation']
        self.top_k = top_k

        self.bias_domains = {'RSG': 'Random sequence generation',
                             'AC': 'Allocation concealment',
                             'BPP': 'Blinding of participants and personnel',
                             'BOA': 'Blinding of outcome assessment'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()

        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)
            with open(vectorizer_loc, 'rb') as f:
                preprocessor = pickle.load(f)

            preprocessor.tokenizer.oov_token = None

            self.CNN_models[bias_domain] = RationaleCNN(preprocessor,
                                                    document_model_architecture_path=arch_loc,
                                                    document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
                                                n_features=2**26)
Example #5
def cleanup_database(days=1):
    """
    remove any PDFs which have been here for more than
    1 day, then compact the database
    """
    log.info('Cleaning up database')
    conn = sqlite3.connect(robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'),
                           detect_types=sqlite3.PARSE_DECLTYPES)

    d = datetime.now() - timedelta(days=days)
    c = conn.cursor()
    c.execute("DELETE FROM article WHERE timestamp < datetime(?) AND dont_delete=0", [d])
    conn.commit()
    conn.execute("VACUUM") # make the database smaller again
    conn.commit()
    conn.close()
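
# --- Illustrative usage (not part of the original source): the retention
# window is just the `days` argument, e.g. keeping uploaded PDFs for a week
# before they are purged and the database is compacted:
#
#     cleanup_database(days=7)
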
Example #6
    def __init__(self, top_k=2, min_k=1):
        """
        In most cases, a fixed number of sentences (top_k) will be
        returned for each document, *except* when the decision
        scores are below a threshold (i.e. the implication being
        that none of the sentences are relevant).

        top_k = the default number of sentences to retrive per
                document
        min_k = ensure that at at least min_k sentences are
                always returned


        """

        logging.debug("Loading PICO classifiers")

        self.P_clf = MiniClassifier(robotreviewer.get_data("pico/P_model.npz"))
        self.I_clf = MiniClassifier(robotreviewer.get_data("pico/I_model.npz"))
        self.O_clf = MiniClassifier(robotreviewer.get_data("pico/O_model.npz"))

        logging.debug("PICO classifiers loaded")

        logging.debug("Loading IDF weights")
        with open(robotreviewer.get_data("pico/P_idf.npz"), 'rb') as f:
            self.P_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        with open(robotreviewer.get_data("pico/I_idf.npz"), 'rb') as f:
            self.I_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        with open(robotreviewer.get_data("pico/O_idf.npz"), 'rb') as f:
            self.O_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        logging.debug("IDF weights loaded")

        self.vec = PICO_vectorizer()
        self.models = [self.P_clf, self.I_clf, self.O_clf]
        self.idfs = [self.P_idf, self.I_idf, self.O_idf]
        self.PICO_domains = ["Population", "Intervention", "Outcomes"]

        # if config.USE_METAMAP:
        #     self.metamap = MetaMap.get_instance()

        self.top_k = top_k
        self.min_k = min_k
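
# --- Illustrative sketch (not part of the original source): the top_k /
# min_k rule described in the docstring above. The `threshold` value and the
# helper name are assumptions; only the idea (return up to top_k sentences
# scoring above a threshold, but never fewer than min_k) comes from the text.
import numpy as np

def select_pico_sentences(sentences, scores, top_k=2, min_k=1, threshold=0.0):
    order = np.argsort(np.asarray(scores))[::-1]              # best-scoring first
    keep = [i for i in order[:top_k] if scores[i] > threshold]
    if len(keep) < min_k:                                     # guarantee min_k sentences
        keep = list(order[:min_k])
    return [sentences[i] for i in keep]
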
    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """

        self.doc_clf = MiniClassifier(
            robotreviewer.get_data(os.path.join('bias_ab', 'bias_ab.npz')))
        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2))
        self.bias_domains = [
            'random_sequence_generation', 'allocation_concealment',
            'blinding_participants_personnel'
        ]
        self.top_k = top_k
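
# --- Illustrative sketch (not part of the original source): the multi-task
# trick behind the single shared abstract-level classifier above. Plain
# abstract tokens and domain-prefixed copies are hashed into one feature
# space, so a single linear model can score every bias domain. This uses
# sklearn's HashingVectorizer directly to illustrate the idea; it is not the
# ModularVectorizer API.
from sklearn.feature_extraction.text import HashingVectorizer

_hash_vec = HashingVectorizer(norm=None, binary=True, ngram_range=(1, 2))

def domain_interaction_features(abstract_text, domain):
    plain = _hash_vec.transform([abstract_text])
    prefixed_text = ' '.join(domain + '__' + tok for tok in abstract_text.split())
    return plain + _hash_vec.transform([prefixed_text])   # shared + domain-specific features

# e.g. domain_interaction_features(abstract, 'random_sequence_generation')
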
Example #8
class TestMiniClassifier(unittest.TestCase):

    doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
    util = Utilities()

    def test_init(self):
        ''' test for MiniClassifier.__init__() '''
        self.assertTrue(isinstance(self.doc_clf.coef, np.ndarray))
        self.assertTrue(isinstance(self.doc_clf.intercept, float))

    def test_decision_function(self):
        ''' test for MiniClassifier.decision_function(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        dec = self.doc_clf.decision_function(X)  # [ 1.50563252]
        decTest = np.array([1.50563252])
        # np.array_equal / np.array_equiv won't work here: the floating-point
        # values will not match exactly, so compare with np.allclose instead
        self.assertTrue(np.allclose(dec, decTest))

    def test_predict(self):
        ''' test for MiniClassifier.predict(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        pred = self.doc_clf.predict(X)  # [1]
        self.assertEqual(pred, 1)

    def test_predict_proba(self):
        ''' tests for MiniClassifier.predict_proba(X) '''
        with open(ex_path + "rationale_robot_data.json", "r",
                  encoding="utf-8") as data:
            data = json.load(data)
        bpl = data["bias_prob_linear"]
        X = self.util.load_sparse_csr("X_data.npz")
        bpl_test = self.doc_clf.predict_proba(X)[0]
        self.assertTrue(abs(bpl - bpl_test) < 0.01)
Example #9
        "sample_size_bot":SampleSizeBot()}

log.info("Robots loaded successfully! Ready...")
'''

# lastly wait until Grobid is connected
pdf_reader.connect()

# start up Celery service
app = Celery('ml_worker', backend='amqp://', broker='amqp://')

#####
## connect to and set up database
#####
rr_sql_conn = sqlite3.connect(
    robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'),
    detect_types=sqlite3.PARSE_DECLTYPES,
    check_same_thread=False)

c = rr_sql_conn.cursor()
c.execute(
    'CREATE TABLE IF NOT EXISTS doc_queue(id INTEGER PRIMARY KEY, report_uuid TEXT, pdf_uuid TEXT, pdf_hash TEXT, pdf_filename TEXT, pdf_file BLOB, timestamp TIMESTAMP)'
)

c.execute(
    'CREATE TABLE IF NOT EXISTS article(id INTEGER PRIMARY KEY, report_uuid TEXT, pdf_uuid TEXT, pdf_hash TEXT, pdf_file BLOB, annotations TEXT, timestamp TIMESTAMP, dont_delete INTEGER)'
)
c.close()
rr_sql_conn.commit()
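
# --- Illustrative sketch (not part of the original source): inserting a row
# into the doc_queue table created above. The argument values are dummy
# placeholders; in RobotReviewer the web layer supplies them from the
# uploaded PDF.
import sqlite3
from datetime import datetime

def queue_pdf_sketch(conn, report_uuid, pdf_uuid, pdf_hash, filename, pdf_bytes):
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO doc_queue (report_uuid, pdf_uuid, pdf_hash, pdf_filename, pdf_file, timestamp) "
        "VALUES (?, ?, ?, ?, ?, ?)",
        (report_uuid, pdf_uuid, pdf_hash, filename, sqlite3.Binary(pdf_bytes), datetime.now()))
    conn.commit()
    cur.close()
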

Example #10
    def __init__(self):
        
        with open(robotreviewer.get_data(os.path.join('bias_ab', 'bias_prob_clf.pck')), 'rb') as f:
            self.clf = pickle.load(f)

        self.vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english')
Example #11
    def __init__(self):
        with open(robotreviewer.get_data('drugbank/drugbank.pck'), 'rb') as f:
            self.data = pickle.load(f)
            self.description = pickle.load(f)
Example #12
def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")


DEBUG_MODE = str2bool(os.environ.get("DEBUG", "true"))
LOCAL_PATH = "robotreviewer/uploads"
LOG_LEVEL = (logging.DEBUG if DEBUG_MODE else logging.INFO)
# determined empirically by Edward; covers 90% of abstracts
# (crudely and unscientifically adjusted for grobid)
NUM_WORDS_IN_ABSTRACT = 450
import robotreviewer
from robotreviewer import config
logging.basicConfig(level=LOG_LEVEL,
                    format='[%(levelname)s] %(name)s %(asctime)s: %(message)s',
                    filename=robotreviewer.get_data(config.LOG))
log = logging.getLogger(__name__)

log.info("RobotReviewer machine learning tasks starting")

from robotreviewer.textprocessing.pdfreader import PdfReader
pdf_reader = PdfReader()  # launch Grobid process before anything else

from robotreviewer.textprocessing.tokenizer import nlp
''' robots! '''
# from robotreviewer.robots.bias_robot import BiasRobot
from robotreviewer.robots.rationale_robot import BiasRobot
from robotreviewer.robots.pico_robot import PICORobot
from robotreviewer.robots.rct_robot import RCTRobot
from robotreviewer.robots.pubmed_robot import PubmedRobot
from robotreviewer.robots.pico_span_robot import PICOSpanRobot
Example #13
from robotreviewer.util import rand_id
from celery import Celery
from celery.result import AsyncResult
from datetime import datetime
import robotreviewer
import sqlite3
import json

import connexion

celery_app = Celery('robotreviewer.ml_worker', backend='amqp://', broker='amqp://')
celery_tasks = {"api_annotate": celery_app.signature('robotreviewer.ml_worker.api_annotate')}

rr_sql_conn = sqlite3.connect(robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'), detect_types=sqlite3.PARSE_DECLTYPES,  check_same_thread=False)

def queue_documents(body):
    report_uuid = rand_id()
    c = rr_sql_conn.cursor()
    c.execute("INSERT INTO api_queue (report_uuid, uploaded_data, timestamp) VALUES (?, ?, ?)", (report_uuid, json.dumps(body), datetime.now()))
    rr_sql_conn.commit()
    c.close()
    # send async request to Celery
    celery_tasks['api_annotate'].apply_async((report_uuid, ), task_id=report_uuid)
    return json.dumps({"report_id": report_uuid})


def report_status(report_id):
    '''
    check and return status of celery annotation process
    '''
    result = AsyncResult(report_id, app=celery_app)
Example #14
log.info("Loading the robots...")
bots = {"bias_bot": BiasRobot(top_k=3),
        "pico_bot": PICORobot(),
        "pubmed_bot": PubmedRobot(),
        # "ictrp_bot": ICTRPRobot(),
        "rct_bot": RCTRobot(),
        "pico_viz_bot": PICOVizRobot(),
        "sample_size_bot":SampleSizeBot()}
        # "mendeley_bot": MendeleyRobot()}

log.info("Robots loaded successfully! Ready...")

#####
## connect to and set up database
#####
rr_sql_conn = sqlite3.connect(robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'), detect_types=sqlite3.PARSE_DECLTYPES)
c = rr_sql_conn.cursor()

c.execute('CREATE TABLE IF NOT EXISTS article(id INTEGER PRIMARY KEY, report_uuid TEXT, pdf_uuid TEXT, pdf_hash TEXT, pdf_file BLOB, annotations TEXT, timestamp TIMESTAMP, dont_delete INTEGER)')
c.close()
rr_sql_conn.commit()



# lastly wait until Grobid is connected
pdf_reader.connect()

@app.route('/')
def main():
    resp = make_response(render_template('index.html'))
    return resp
Example #15
import logging, os

import sqlite3
def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")


DEBUG_MODE = str2bool(os.environ.get("DEBUG", "true"))
LOCAL_PATH = "robotreviewer/uploads"
LOG_LEVEL = (logging.DEBUG if DEBUG_MODE else logging.INFO)
# determined empirically by Edward; covers 90% of abstracts
# (crudely and unscientifically adjusted for grobid)
NUM_WORDS_IN_ABSTRACT = 450
import robotreviewer
from robotreviewer import config
logging.basicConfig(level=LOG_LEVEL, format='[%(levelname)s] %(name)s %(asctime)s: %(message)s', filename=robotreviewer.get_data(config.LOG))
log = logging.getLogger(__name__)

log.info("RobotReviewer machine learning tasks starting")


from robotreviewer.textprocessing.pdfreader import PdfReader
pdf_reader = PdfReader() # launch Grobid process before anything else


from robotreviewer.textprocessing.tokenizer import nlp

''' robots! '''
# from robotreviewer.robots.bias_robot import BiasRobot
from robotreviewer.robots.rationale_robot import BiasRobot
from robotreviewer.robots.pico_robot import PICORobot