def prepare_articles(articles):
    """
    Tokenizes the full text of each article with spaCy and attaches the
    parsed doc to the article in place.
    """
    parsed_articles = []
    for doc in nlp.pipe((d.get('text', u'') for d in articles),
                        batch_size=1, n_threads=config.SPACY_THREADS):
        parsed_articles.append(doc)
    # attach the parsed text to each article (mutates the articles in place)
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text
    return articles
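
# Illustrative sketch (not called by the app): prepare_articles expects
# article objects that expose dict-style .get('text') and carry a ._spacy
# dict for cached parses. MinimalArticle below is a hypothetical stand-in
# for RobotReviewer's article data objects, just to show the expected shape.
def _example_prepare_articles():
    class MinimalArticle(dict):
        def __init__(self, text):
            super(MinimalArticle, self).__init__(text=text)
            self._spacy = {}  # cache for parsed spaCy docs

    demo = [MinimalArticle(u"Patients were randomized to treatment or placebo.")]
    prepared = prepare_articles(demo)
    # each article now carries a spaCy Doc under _spacy['parsed_text']
    return prepared[0]._spacy['parsed_text']
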
def upload_and_annotate():
    """
    Uploads a batch of PDFs, runs the RobotReviewer annotation, and saves
    the PDFs + annotations to the database.
    Returns the report run uuid + a list of article uuids.
    """
    report_uuid = rand_id()
    pdf_uuids = []
    uploaded_files = request.files.getlist("file")
    c = rr_sql_conn.cursor()
    blobs = [f.read() for f in uploaded_files]
    filenames = [f.filename for f in uploaded_files]
    articles = pdf_reader.convert_batch(blobs)
    parsed_articles = []
    # tokenize full texts here
    for doc in nlp.pipe((d.get('text', u'') for d in articles),
                        batch_size=1, n_threads=config.SPACY_THREADS,
                        tag=True, parse=True, entity=False):
        parsed_articles.append(doc)
    # adjust the tag, parse, and entity values if these are needed later
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text
    for filename, blob, data in zip(filenames, blobs, articles):
        pdf_hash = hashlib.md5(blob).hexdigest()
        pdf_uuid = rand_id()
        pdf_uuids.append(pdf_uuid)
        data = annotate(data, bot_names=["pubmed_bot", "bias_bot", "pico_bot",
                                         "rct_bot", "pico_viz_bot"])
        data.gold['pdf_uuid'] = pdf_uuid
        data.gold['filename'] = filename
        c.execute(
            "INSERT INTO article (report_uuid, pdf_uuid, pdf_hash, pdf_file, annotations, timestamp, dont_delete) VALUES(?, ?, ?, ?, ?, ?, ?)",
            (report_uuid, pdf_uuid, pdf_hash, sqlite3.Binary(blob),
             data.to_json(), datetime.now(), config.DONT_DELETE))
        rr_sql_conn.commit()
    c.close()
    return json.dumps({"report_uuid": report_uuid, "pdf_uuids": pdf_uuids})
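
# Sketch of the `article` table this function writes to, inferred from the
# column list in the INSERT above. The column types are assumptions, not the
# project's actual schema.
_EXAMPLE_ARTICLE_DDL = """
CREATE TABLE IF NOT EXISTS article (
    report_uuid TEXT,
    pdf_uuid    TEXT,
    pdf_hash    TEXT,
    pdf_file    BLOB,
    annotations TEXT,      -- JSON produced by data.to_json()
    timestamp   TIMESTAMP,
    dont_delete INTEGER
);
"""
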
def annotate(report_uuid):
    """
    Takes a report uuid as input, fetches the PDFs queued under that id,
    annotates them, and saves the annotations in the database.
    """
    pdf_uuids, pdf_hashes, filenames, blobs, timestamps = [], [], [], [], []
    c = rr_sql_conn.cursor()
    # load the PDF data from the queue table
    for pdf_uuid, pdf_hash, filename, pdf_file, timestamp in c.execute(
            "SELECT pdf_uuid, pdf_hash, pdf_filename, pdf_file, timestamp FROM doc_queue WHERE report_uuid=?",
            (report_uuid, )):
        pdf_uuids.append(pdf_uuid)
        pdf_hashes.append(pdf_hash)
        filenames.append(filename)
        blobs.append(pdf_file)
        timestamps.append(timestamp)
    c.close()
    current_task.update_state(state='PROGRESS',
                              meta={'process_percentage': 25,
                                    'task': 'reading PDFs'})
    articles = pdf_reader.convert_batch(blobs)
    parsed_articles = []
    current_task.update_state(state='PROGRESS',
                              meta={'process_percentage': 50,
                                    'task': 'parsing text'})
    # tokenize full texts here
    for doc in nlp.pipe((d.get('text', u'') for d in articles),
                        batch_size=1, n_threads=config.SPACY_THREADS):
        parsed_articles.append(doc)
    # adjust the tag, parse, and entity values if these are needed later
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text
    current_task.update_state(state='PROGRESS',
                              meta={'process_percentage': 75,
                                    'task': 'doing machine learning'})
    for pdf_uuid, pdf_hash, filename, blob, data, timestamp in zip(
            pdf_uuids, pdf_hashes, filenames, blobs, articles, timestamps):
        current_task.update_state(state='PROGRESS',
                                  meta={'process_percentage': 76,
                                        'task': 'processing PDF {}'.format(filename)})
        data = annotate_study(data, bot_names=["rct_bot", "pubmed_bot",
                                               "bias_bot", "pico_bot",
                                               "sample_size_bot"])
        data.gold['pdf_uuid'] = pdf_uuid
        data.gold['filename'] = filename
        c = rr_sql_conn.cursor()
        c.execute(
            "INSERT INTO article (report_uuid, pdf_uuid, pdf_hash, pdf_file, annotations, timestamp, dont_delete) VALUES(?, ?, ?, ?, ?, ?, ?)",
            (report_uuid, pdf_uuid, pdf_hash, sqlite3.Binary(blob),
             data.to_json(), timestamp, config.DONT_DELETE))
        rr_sql_conn.commit()
        c.close()
    # finally, delete the PDFs from the queue
    c = rr_sql_conn.cursor()
    c.execute("DELETE FROM doc_queue WHERE report_uuid=?", (report_uuid, ))
    rr_sql_conn.commit()
    c.close()
    current_task.update_state(state='SUCCESS',
                              meta={'process_percentage': 100,
                                    'task': 'done!'})
    return {"process_percentage": 100, "task": "completed"}
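
# annotate() reports progress through current_task.update_state, which
# suggests it is registered as a Celery task. A minimal polling sketch under
# that assumption (the .delay() call only works if the task decorator is
# present in the real module):
def _example_poll_annotation(report_uuid):
    import time
    async_result = annotate.delay(report_uuid)  # enqueue the task
    while not async_result.ready():
        if async_result.state == 'PROGRESS':
            print(async_result.info.get('task'),
                  async_result.info.get('process_percentage'))
        time.sleep(1)
    return async_result.result  # {"process_percentage": 100, "task": "completed"}
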
def api_annotate(report_uuid):
    """
    Handles annotation tasks sent from the API.
    Strict in datatype handling.
    """
    current_task.update_state(state='PROGRESS',
                              meta={'status': "in process",
                                    'position': "received request, fetching data"})
    c = rr_sql_conn.cursor()
    c.execute("SELECT uploaded_data, timestamp FROM api_queue WHERE report_uuid=?",
              (report_uuid, ))
    result = c.fetchone()
    c.close()
    uploaded_data_s, timestamp = result
    uploaded_data = json.loads(uploaded_data_s)
    articles = uploaded_data["articles"]
    target_robots = uploaded_data["robots"]
    filter_rcts = uploaded_data.get("filter_rcts", "is_rct_balanced")

    # now do the ML
    if filter_rcts != 'none':
        current_task.update_state(state='PROGRESS',
                                  meta={'status': "in process",
                                        'position': "rct_robot classification"})
        # do rct_bot first; articles which fail the RCT filter are skipped
        # by the remaining bots
        results = bots['rct_bot'].api_annotate(articles)
        for a, r in zip(articles, results):
            a['skip_annotation'] = not r[filter_rcts]
            a['rct_bot'] = r
        # and remove rct_bot from the task list if present, so it isn't run twice
        target_robots = [tr for tr in target_robots if tr != "rct_bot"]

    current_task.update_state(state='PROGRESS',
                              meta={'status': "in process",
                                    'position': "tokenizing data"})
    for k in ["ti", "ab", "fullText"]:
        parsed = nlp.pipe((a.get(k, "") for a in articles
                           if not a.get('skip_annotation', False)))
        for current_doc in articles:
            if current_doc.get("skip_annotation"):
                continue
            current_doc['parsed_{}'.format(k)] = next(parsed)

    for bot_name in target_robots:
        current_task.update_state(state='PROGRESS',
                                  meta={'status': "in process",
                                        'position': "{} classification".format(bot_name)})
        results = bots[bot_name].api_annotate(articles)
        for a, r in zip(articles, results):
            if not a.get('skip_annotation', False):
                a[bot_name] = r

    # delete the parsed text (it is not JSON serializable)
    for article in articles:
        for k in ["ti", "ab", "fullText"]:
            article.pop('parsed_{}'.format(k), None)

    c = rr_sql_conn.cursor()
    current_task.update_state(state='PROGRESS',
                              meta={'status': "in process",
                                    'position': "writing the predictions to database"})
    c.execute("INSERT INTO api_done (report_uuid, annotations, timestamp) VALUES(?, ?, ?)",
              (report_uuid, json.dumps(articles), timestamp))
    rr_sql_conn.commit()
    c.close()
    # finally, delete the data from the queue
    c = rr_sql_conn.cursor()
    c.execute("DELETE FROM api_queue WHERE report_uuid=?", (report_uuid, ))
    rr_sql_conn.commit()
    c.close()
    current_task.update_state(state='done')
    return {"status": 100, "task": "completed"}
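
# The shape of the JSON blob that api_annotate() reads from api_queue,
# reconstructed from the keys accessed above ("articles", "robots",
# "filter_rcts", and the per-article "ti"/"ab"/"fullText" fields). The
# field values here are illustrative only.
_EXAMPLE_UPLOADED_DATA = {
    "articles": [
        {"ti": "A trial of X versus placebo",        # title
         "ab": "We randomized 200 patients...",      # abstract
         "fullText": ""}                             # optional full text
    ],
    "robots": ["pico_bot", "bias_bot"],              # bots to run
    "filter_rcts": "is_rct_balanced"                 # or "none" to skip RCT filtering
}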