def __init__(self, top_k=3):
    """
    Build the sentence- and document-level bias models.

    `top_k` is the 'top-k recall' setting: top-1 returns the single
    most relevant sentence in the document, top-3 the three most
    relevant. The validation study assessed top-1 and top-3 accuracy;
    top-3 is the suggested default.
    """
    sent_weights = robotreviewer.get_data('bias/bias_sent_level.npz')
    doc_weights = robotreviewer.get_data('bias/bias_doc_level.npz')
    self.sent_clf = MiniClassifier(sent_weights)
    self.doc_clf = MiniClassifier(doc_weights)
    # hashed binary uni/bi-gram features; 2**26 feature slots
    self.vec = ModularVectorizer(norm=None,
                                 non_negative=True,
                                 binary=True,
                                 ngram_range=(1, 2),
                                 n_features=2**26)
    # the six Cochrane risk-of-bias domains handled by this model
    self.bias_domains = ['Random sequence generation',
                         'Allocation concealment',
                         'Blinding of participants and personnel',
                         'Blinding of outcome assessment',
                         'Incomplete outcome data',
                         'Selective reporting']
    self.top_k = top_k
def __init__(self):
    """
    Load the hashed PubMed title matrix, the PMID index, and open the
    RCT article database.
    """
    archive = np.load(robotreviewer.get_data('pubmed/pubmed_title_hash_2016_07_24.npz'))
    # reassemble the sparse title-hash matrix from its CSR components
    self.vec_ti = csr_matrix((archive['data'], archive['indices'], archive['indptr']),
                             archive['shape'])
    index_archive = np.load(robotreviewer.get_data('pubmed/pubmed_index_2016_07_24.npz'))
    self.pmid_ind = index_archive['pmid_ind']
    self.vectorizer = HashingVectorizer(binary=True, stop_words='english')
    # load database and keep a cursor around for lookups
    self.connection = sqlite3.connect(robotreviewer.get_data('pubmed/pubmed_rcts_2016_07_24.sqlite'))
    self.c = self.connection.cursor()
def __init__(self, top_k=3):
    """
    Build the ensemble bias model (rationaleCNN + linear variants).

    `top_k` refers to 'top-k recall'. top-1 recall will return the
    single most relevant sentence in the document, and top-3 recall
    the 3 most relevant. The validation study assessed the accuracy of
    top-3 and top-1 and we suggest top-3 as default.
    """
    self.top_k = top_k

    # abbreviation -> full Cochrane risk-of-bias domain name
    # (the original code first assigned a one-element list here and
    # immediately overwrote it with this dict; dead store removed)
    self.bias_domains = {'RSG': 'Random sequence generation',
                         'AC': 'Allocation concealment',
                         'BPP': 'Blinding of participants and personnel',
                         'BOA': 'Blinding of outcome assessment',
                         'IOD': 'Incomplete outcome data',
                         'SR': 'Selective reporting'}

    ###
    # Here we take a simple ensembling approach in which we combine the
    # predictions made by our rationaleCNN model and the JAMIA (linear)
    # multi task variant.
    ###

    self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']  # CNN domains

    vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
    arch_str = 'robotreviewer/data/keras/models/{}.json'
    weight_str = 'robotreviewer/data/keras/models/{}.hdf5'

    self.CNN_models = OrderedDict()
    # iterate over self.all_domains instead of repeating the literal list
    for bias_domain in self.all_domains:
        # Load vectorizer and keras model for this domain
        vectorizer_loc = vectorizer_str.format(bias_domain)
        arch_loc = arch_str.format(bias_domain)
        weight_loc = weight_str.format(bias_domain)
        # context manager closes the pickle file (was a leaked handle)
        with open(vectorizer_loc, 'rb') as f:
            preprocessor = pickle.load(f)
        self.CNN_models[bias_domain] = RationaleCNN(
            preprocessor,
            document_model_architecture_path=arch_loc,
            document_model_weights_path=weight_loc)

    # Linear domains (these are joint models!)
    self.linear_sent_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_sent_level.npz'))
    self.linear_doc_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_doc_level.npz'))
    self.linear_vec = ModularVectorizer(norm=None, non_negative=True,
                                        binary=True, ngram_range=(1, 2),
                                        n_features=2**26)
def __init__(self, top_k=None):
    """
    Build the ensemble bias model (rationaleCNN + linear variants).

    top_k: if set, the number of rationale sentences to return per
           document; None keeps the caller's default behaviour.
    """
    self.top_k = top_k

    # abbreviation -> full Cochrane risk-of-bias domain name
    # (the original code first assigned a one-element list here and
    # immediately overwrote it with this dict; dead store removed)
    self.bias_domains = {'RSG': 'Random sequence generation',
                         'AC': 'Allocation concealment',
                         'BPP': 'Blinding of participants and personnel',
                         'BOA': 'Blinding of outcome assessment'}

    ###
    # Here we take a simple ensembling approach in which we combine the
    # predictions made by our rationaleCNN model and the JAMIA (linear)
    # multi task variant.
    ###

    self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']  # CNN domains

    vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
    arch_str = 'robotreviewer/data/keras/models/{}.json'
    weight_str = 'robotreviewer/data/keras/models/{}.hdf5'

    self.CNN_models = OrderedDict()
    # iterate over self.all_domains instead of repeating the literal list
    for bias_domain in self.all_domains:
        # Load vectorizer and keras model for this domain
        vectorizer_loc = vectorizer_str.format(bias_domain)
        arch_loc = arch_str.format(bias_domain)
        weight_loc = weight_str.format(bias_domain)
        # context manager closes the pickle file (was a leaked handle)
        with open(vectorizer_loc, 'rb') as f:
            preprocessor = pickle.load(f)
        # NOTE(review): presumably a workaround for an incompatibly
        # pickled keras Tokenizer -- confirm before removing
        preprocessor.tokenizer.oov_token = None
        self.CNN_models[bias_domain] = RationaleCNN(
            preprocessor,
            document_model_architecture_path=arch_loc,
            document_model_weights_path=weight_loc)

    # Linear domains (these are joint models!)
    self.linear_sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
    self.linear_doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
    self.linear_vec = ModularVectorizer(norm=None, non_negative=True,
                                        binary=True, ngram_range=(1, 2),
                                        n_features=2**26)
def cleanup_database(days=1):
    """
    Remove uploaded PDFs older than `days` days (rows with
    dont_delete=1 are kept), then VACUUM to compact the database.

    days: age threshold in days (default 1)
    """
    log.info('Cleaning up database')
    conn = sqlite3.connect(
        robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'),
        detect_types=sqlite3.PARSE_DECLTYPES)
    # try/finally guarantees the connection is closed even if a
    # statement raises (the original leaked it on error)
    try:
        cutoff = datetime.now() - timedelta(days=days)
        c = conn.cursor()
        c.execute("DELETE FROM article WHERE timestamp < datetime(?) AND dont_delete=0",
                  [cutoff])
        conn.commit()
        conn.execute("VACUUM")  # make the database smaller again
        conn.commit()
    finally:
        conn.close()
def __init__(self, top_k=2, min_k=1):
    """
    Load the P/I/O classifiers and their IDF weight matrices.

    In most cases, a fixed number of sentences (top_k) will be returned
    for each document, *except* when the decision scores are below a
    threshold (i.e. the implication being that none of the sentences
    are relevant).

    top_k = the default number of sentences to retrieve per document
    min_k = ensure that at least min_k sentences are always returned
    """
    logging.debug("Loading PICO classifiers")
    # one MiniClassifier per PICO element, stored as P_clf / I_clf / O_clf
    for letter in ("P", "I", "O"):
        clf = MiniClassifier(
            robotreviewer.get_data("pico/{}_model.npz".format(letter)))
        setattr(self, "{}_clf".format(letter), clf)
    logging.debug("PICO classifiers loaded")

    logging.debug("Loading IDF weights")
    # each idf file pickles a sparse matrix; its diagonal becomes the
    # per-feature IDF weighting (P_idf / I_idf / O_idf)
    for letter in ("P", "I", "O"):
        with open(robotreviewer.get_data("pico/{}_idf.npz".format(letter)), 'rb') as f:
            stored = np.load(f, allow_pickle=True, encoding='latin1').item()
        setattr(self, "{}_idf".format(letter), diags(stored.todense().A1, 0))
    logging.debug("IDF weights loaded")

    self.vec = PICO_vectorizer()
    self.models = [self.P_clf, self.I_clf, self.O_clf]
    self.idfs = [self.P_idf, self.I_idf, self.O_idf]
    self.PICO_domains = ["Population", "Intervention", "Outcomes"]
    # if config.USE_METAMAP:
    #     self.metamap = MetaMap.get_instance()
    self.top_k = top_k
    self.min_k = min_k
def __init__(self, top_k=3):
    """
    Build the abstract-level bias model.

    `top_k` is the 'top-k recall' setting: top-1 returns the single
    most relevant sentence in the document, top-3 the three most
    relevant. The validation study assessed top-1 and top-3 accuracy;
    top-3 is the suggested default.
    """
    model_path = robotreviewer.get_data(os.path.join('bias_ab', 'bias_ab.npz'))
    self.doc_clf = MiniClassifier(model_path)
    # hashed binary uni/bi-gram features (default feature count)
    self.vec = ModularVectorizer(norm=None,
                                 non_negative=True,
                                 binary=True,
                                 ngram_range=(1, 2))
    # the three domains this abstract-level model predicts
    self.bias_domains = ['random_sequence_generation',
                         'allocation_concealment',
                         'blinding_participants_personnel']
    self.top_k = top_k
class TestMiniClassifier(unittest.TestCase):
    """Unit tests for MiniClassifier against stored fixture data."""

    doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
    util = Utilities()

    def test_init(self):
        ''' test for MiniClassifier.__init__() '''
        self.assertTrue(isinstance(self.doc_clf.coef, np.ndarray))
        self.assertTrue(isinstance(self.doc_clf.intercept, float))

    def test_decision_function(self):
        ''' test for MiniClassifier.decision_function(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        dec = self.doc_clf.decision_function(X)  # [ 1.50563252]
        # np.float64([...]) relied on scalar-type coercion removed in
        # recent NumPy; build the expected array explicitly instead
        decTest = np.array([1.50563252], dtype=np.float64)
        # exact comparisons (np.array_equal / np.array_equiv) would fail
        # on floating point output, so compare with a tolerance
        self.assertTrue(np.allclose(dec, decTest))

    def test_predict(self):
        ''' test for MiniClassifier.predict(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        pred = self.doc_clf.predict(X)  # [1]
        # np.int was removed in NumPy 1.24 (it was only an alias for the
        # builtin int), so compare against a plain int
        self.assertEqual(pred, 1)

    def test_predict_proba(self):
        ''' tests for MiniClassifier.predict_proba(X) '''
        # use a distinct name for the file handle (the original shadowed
        # it with the parsed JSON object)
        with open(ex_path + "rationale_robot_data.json", "r", encoding="utf-8") as f:
            data = json.load(f)
        bpl = data["bias_prob_linear"]
        X = self.util.load_sparse_csr("X_data.npz")
        bpl_test = self.doc_clf.predict_proba(X)[0]
        self.assertTrue(abs(bpl - bpl_test) < 0.01)
"sample_size_bot":SampleSizeBot()} log.info("Robots loaded successfully! Ready...") ''' # lastly wait until Grobid is connected pdf_reader.connect() # start up Celery service app = Celery('ml_worker', backend='amqp://', broker='amqp://') ##### ## connect to and set up database ##### rr_sql_conn = sqlite3.connect( robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'), detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False) c = rr_sql_conn.cursor() c.execute( 'CREATE TABLE IF NOT EXISTS doc_queue(id INTEGER PRIMARY KEY, report_uuid TEXT, pdf_uuid TEXT, pdf_hash TEXT, pdf_filename TEXT, pdf_file BLOB, timestamp TIMESTAMP)' ) c.execute( 'CREATE TABLE IF NOT EXISTS article(id INTEGER PRIMARY KEY, report_uuid TEXT, pdf_uuid TEXT, pdf_hash TEXT, pdf_file BLOB, annotations TEXT, timestamp TIMESTAMP, dont_delete INTEGER)' ) c.close() rr_sql_conn.commit()
def __init__(self):
    """Load the abstract-level bias probability classifier and its vectorizer."""
    clf_path = robotreviewer.get_data(os.path.join('bias_ab', 'bias_prob_clf.pck'))
    with open(clf_path, 'rb') as f:
        self.clf = pickle.load(f)
    # hashed 1-3 gram features, English stop words removed
    self.vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english')
def __init__(self):
    """Load the drugbank data and descriptions from a single pickle file."""
    with open(robotreviewer.get_data('drugbank/drugbank.pck'), 'rb') as f:
        # two objects were dumped back-to-back into the same file,
        # so they must be loaded in the same order
        self.data = pickle.load(f)
        self.description = pickle.load(f)
def str2bool(v):
    """Interpret a string as a boolean flag ('yes'/'true'/'t'/'1' => True)."""
    return v.lower() in ("yes", "true", "t", "1")

# module-level configuration, driven by the DEBUG environment variable
DEBUG_MODE = str2bool(os.environ.get("DEBUG", "true"))
LOCAL_PATH = "robotreviewer/uploads"
LOG_LEVEL = (logging.DEBUG if DEBUG_MODE else logging.INFO)

# determined empirically by Edward; covers 90% of abstracts
# (crudely and unscientifically adjusted for grobid)
NUM_WORDS_IN_ABSTRACT = 450

import robotreviewer
from robotreviewer import config

logging.basicConfig(level=LOG_LEVEL,
                    format='[%(levelname)s] %(name)s %(asctime)s: %(message)s',
                    filename=robotreviewer.get_data(config.LOG))
log = logging.getLogger(__name__)
log.info("RobotReviewer machine learning tasks starting")

from robotreviewer.textprocessing.pdfreader import PdfReader
pdf_reader = PdfReader()  # launch Grobid process before anything else
from robotreviewer.textprocessing.tokenizer import nlp

''' robots! '''
# from robotreviewer.robots.bias_robot import BiasRobot
from robotreviewer.robots.rationale_robot import BiasRobot
from robotreviewer.robots.pico_robot import PICORobot
from robotreviewer.robots.rct_robot import RCTRobot
from robotreviewer.robots.pubmed_robot import PubmedRobot
from robotreviewer.robots.pico_span_robot import PICOSpanRobot
from robotreviewer.util import rand_id
from celery import Celery
from celery.result import AsyncResult
from datetime import datetime
import robotreviewer
import sqlite3
import json
import connexion

# Celery client used to dispatch annotation jobs to the ml_worker service
celery_app = Celery('robotreviewer.ml_worker', backend='amqp://', broker='amqp://')
celery_tasks = {"api_annotate": celery_app.signature('robotreviewer.ml_worker.api_annotate')}

# module-level connection shared across API requests (check_same_thread=False)
rr_sql_conn = sqlite3.connect(robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'),
                              detect_types=sqlite3.PARSE_DECLTYPES,
                              check_same_thread=False)


def queue_documents(body):
    """
    Persist an uploaded document batch to the api_queue table, kick off
    the async annotation task, and return a JSON string with the new
    report id.
    """
    report_uuid = rand_id()
    c = rr_sql_conn.cursor()
    c.execute("INSERT INTO api_queue (report_uuid, uploaded_data, timestamp) VALUES (?, ?, ?)",
              (report_uuid, json.dumps(body), datetime.now()))
    rr_sql_conn.commit()
    c.close()
    # send async request to Celery; task_id doubles as the report id so
    # status can be polled later
    celery_tasks['api_annotate'].apply_async((report_uuid, ), task_id=report_uuid)
    return json.dumps({"report_id": report_uuid})


def report_status(report_id):
    ''' check and return status of celery annotation process '''
    # NOTE(review): this function's body appears truncated in this view;
    # the remainder is outside the visible chunk
    result = AsyncResult(report_id, app=celery_app)
log.info("Loading the robots...") bots = {"bias_bot": BiasRobot(top_k=3), "pico_bot": PICORobot(), "pubmed_bot": PubmedRobot(), # "ictrp_bot": ICTRPRobot(), "rct_bot": RCTRobot(), "pico_viz_bot": PICOVizRobot(), "sample_size_bot":SampleSizeBot()} # "mendeley_bot": MendeleyRobot()} log.info("Robots loaded successfully! Ready...") ##### ## connect to and set up database ##### rr_sql_conn = sqlite3.connect(robotreviewer.get_data('uploaded_pdfs/uploaded_pdfs.sqlite'), detect_types=sqlite3.PARSE_DECLTYPES) c = rr_sql_conn.cursor() c.execute('CREATE TABLE IF NOT EXISTS article(id INTEGER PRIMARY KEY, report_uuid TEXT, pdf_uuid TEXT, pdf_hash TEXT, pdf_file BLOB, annotations TEXT, timestamp TIMESTAMP, dont_delete INTEGER)') c.close() rr_sql_conn.commit() # lastly wait until Grobid is connected pdf_reader.connect() @app.route('/') def main(): resp = make_response(render_template('index.html')) return resp
import logging, os
import sqlite3


def str2bool(v):
    """Interpret a string as a boolean flag ('yes'/'true'/'t'/'1' => True)."""
    return v.lower() in ("yes", "true", "t", "1")

# module-level configuration, driven by the DEBUG environment variable
DEBUG_MODE = str2bool(os.environ.get("DEBUG", "true"))
LOCAL_PATH = "robotreviewer/uploads"
LOG_LEVEL = (logging.DEBUG if DEBUG_MODE else logging.INFO)

# determined empirically by Edward; covers 90% of abstracts
# (crudely and unscientifically adjusted for grobid)
NUM_WORDS_IN_ABSTRACT = 450

import robotreviewer
from robotreviewer import config

logging.basicConfig(level=LOG_LEVEL,
                    format='[%(levelname)s] %(name)s %(asctime)s: %(message)s',
                    filename=robotreviewer.get_data(config.LOG))
log = logging.getLogger(__name__)
log.info("RobotReviewer machine learning tasks starting")

from robotreviewer.textprocessing.pdfreader import PdfReader
pdf_reader = PdfReader()  # launch Grobid process before anything else
from robotreviewer.textprocessing.tokenizer import nlp

''' robots! '''
# from robotreviewer.robots.bias_robot import BiasRobot
from robotreviewer.robots.rationale_robot import BiasRobot
from robotreviewer.robots.pico_robot import PICORobot