def prepare_articles(articles):
    # tokenize/parse the full text of each article with spaCy
    parsed_articles = []
    for doc in nlp.pipe((d.get('text', u'') for d in articles),
                        batch_size=1, n_threads=6):
        parsed_articles.append(doc)

    # attach each parsed doc to its article (mutates the articles in place)
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text

    return articles
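
A minimal usage sketch for the snippet above. The Article stand-in class and the en_core_web_sm model are assumptions here; the snippet itself only requires a global nlp object and articles that expose a dict-style .get('text') plus an _spacy dict:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model; any spaCy pipeline works

class Article(dict):
    # hypothetical stand-in: dict-style .get('text') plus an _spacy attribute
    def __init__(self, text):
        super().__init__(text=text)
        self._spacy = {}

articles = [Article("Patients were randomised to two arms."),
            Article("A double-blind, placebo-controlled trial.")]
prepare_articles(articles)
print(articles[0]._spacy['parsed_text'])  # the parsed spaCy Doc

Note that n_threads is a spaCy 1.x/2.x argument; spaCy 3 removed it in favour of n_process.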
Example 2
def upload_and_annotate():
    # uploads a batch of PDFs, runs the RobotReviewer annotation,
    # saves the PDFs + annotations to the database, and
    # returns the report run uuid + list of article uuids

    report_uuid = rand_id()
    pdf_uuids = []

    uploaded_files = request.files.getlist("file")
    c = rr_sql_conn.cursor()

    blobs = [f.read() for f in uploaded_files]
    filenames = [f.filename for f in uploaded_files]

    articles = pdf_reader.convert_batch(blobs)
    parsed_articles = []
    # tokenize full texts here
    for doc in nlp.pipe((d.get('text', u'') for d in articles),
                        batch_size=1,
                        n_threads=config.SPACY_THREADS,
                        tag=True,
                        parse=True,
                        entity=False):
        parsed_articles.append(doc)

    # adjust the tag, parse, and entity values if these are needed later
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text

    for filename, blob, data in zip(filenames, blobs, articles):
        pdf_hash = hashlib.md5(blob).hexdigest()
        pdf_uuid = rand_id()
        pdf_uuids.append(pdf_uuid)
        data = annotate(data,
                        bot_names=[
                            "pubmed_bot", "bias_bot", "pico_bot", "rct_bot",
                            "pico_viz_bot"
                        ])
        data.gold['pdf_uuid'] = pdf_uuid
        data.gold['filename'] = filename

        c.execute(
            "INSERT INTO article (report_uuid, pdf_uuid, pdf_hash, pdf_file, annotations, timestamp, dont_delete) VALUES(?, ?, ?, ?, ?, ?, ?)",
            (report_uuid, pdf_uuid, pdf_hash, sqlite3.Binary(blob),
             data.to_json(), datetime.now(), config.DONT_DELETE))
        rr_sql_conn.commit()
    c.close()

    return json.dumps({"report_uuid": report_uuid, "pdf_uuids": pdf_uuids})
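
The INSERT above implies an article table roughly like the following. The column types are assumptions inferred from the bound values (the real DDL is not shown in the snippet):

import sqlite3

conn = sqlite3.connect("robotreviewer.db")  # hypothetical database path
conn.execute("""
    CREATE TABLE IF NOT EXISTS article (
        report_uuid TEXT,
        pdf_uuid    TEXT,
        pdf_hash    TEXT,       -- md5 of the PDF bytes
        pdf_file    BLOB,
        annotations TEXT,       -- JSON from data.to_json()
        timestamp   TIMESTAMP,
        dont_delete INTEGER
    )""")
conn.commit()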
Example 3
def annotate(report_uuid):
    """
    takes a report uuid as input,
    searches for PDFs stored under that id,
    then saves the annotations in the database
    """
    pdf_uuids, pdf_hashes, filenames, blobs, timestamps = [], [], [], [], []

    c = rr_sql_conn.cursor()

    # load in the PDF data from the queue table
    for pdf_uuid, pdf_hash, filename, pdf_file, timestamp in c.execute(
            "SELECT pdf_uuid, pdf_hash, pdf_filename, pdf_file, timestamp FROM doc_queue WHERE report_uuid=?",
        (report_uuid, )):
        pdf_uuids.append(pdf_uuid)
        pdf_hashes.append(pdf_hash)
        filenames.append(filename)
        blobs.append(pdf_file)
        timestamps.append(timestamp)

    c.close()

    current_task.update_state(state='PROGRESS',
                              meta={
                                  'process_percentage': 25,
                                  'task': 'reading PDFs'
                              })
    articles = pdf_reader.convert_batch(blobs)
    parsed_articles = []

    current_task.update_state(state='PROGRESS',
                              meta={
                                  'process_percentage': 50,
                                  'task': 'parsing text'
                              })
    # tokenize full texts here
    for doc in nlp.pipe((d.get('text', u'') for d in articles),
                        batch_size=1,
                        n_threads=config.SPACY_THREADS):
        parsed_articles.append(doc)

    # adjust the tag, parse, and entity values if these are needed later
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text

    current_task.update_state(state='PROGRESS',
                              meta={
                                  'process_percentage': 75,
                                  'task': 'doing machine learning'
                              })

    for pdf_uuid, pdf_hash, filename, blob, data, timestamp in zip(
            pdf_uuids, pdf_hashes, filenames, blobs, articles, timestamps):

        # "pico_viz_bot",

        # DEBUG
        current_task.update_state(state='PROGRESS',
                                  meta={
                                      'process_percentage': 76,
                                      'task':
                                      'processing PDF {}'.format(filename)
                                  })

        data = annotate_study(data,
                              bot_names=[
                                  "rct_bot", "pubmed_bot", "bias_bot",
                                  "pico_bot", "sample_size_bot"
                              ])

        data.gold['pdf_uuid'] = pdf_uuid
        data.gold['filename'] = filename
        c = rr_sql_conn.cursor()
        c.execute(
            "INSERT INTO article (report_uuid, pdf_uuid, pdf_hash, pdf_file, annotations, timestamp, dont_delete) VALUES(?, ?, ?, ?, ?, ?, ?)",
            (report_uuid, pdf_uuid, pdf_hash, sqlite3.Binary(blob),
             data.to_json(), timestamp, config.DONT_DELETE))
        rr_sql_conn.commit()
        c.close()

    # finally delete the PDFs from the queue
    c = rr_sql_conn.cursor()
    c.execute("DELETE FROM doc_queue WHERE report_uuid=?", (report_uuid, ))
    rr_sql_conn.commit()
    c.close()
    current_task.update_state(state='SUCCESS',
                              meta={
                                  'process_percentage': 100,
                                  'task': 'done!'
                              })
    return {"process_percentage": 100, "task": "completed"}
Example 4
def api_annotate(report_uuid):
    """
    Handles annotation tasks sent from the API.
    Strict about datatype handling.
    """

    current_task.update_state(state='PROGRESS',
                              meta={
                                  'status': "in process",
                                  'position': "received request, fetching data"
                              })

    c = rr_sql_conn.cursor()

    c.execute(
        "SELECT uploaded_data, timestamp FROM api_queue WHERE report_uuid=?",
        (report_uuid, ))
    result = c.fetchone()
    uploaded_data_s, timestamp = result
    uploaded_data = json.loads(uploaded_data_s)

    articles = uploaded_data["articles"]
    target_robots = uploaded_data["robots"]
    filter_rcts = uploaded_data.get("filter_rcts", "is_rct_balanced")

    # now do the ML
    if filter_rcts != 'none':

        current_task.update_state(state='PROGRESS',
                                  meta={
                                      'status': "in process",
                                      'position': "rct_robot classification"
                                  })

        # run rct_bot first
        results = bots['rct_bot'].api_annotate(articles)
        for a, r in zip(articles, results):
            # skip downstream annotation for articles that fail the RCT filter
            a['skip_annotation'] = not r[filter_rcts]
            a['rct_bot'] = r

        # and remove rct_bot from the task list if present, so it isn't run twice
        target_robots = [tr for tr in target_robots if tr != "rct_bot"]

    current_task.update_state(state='PROGRESS',
                              meta={
                                  'status': "in process",
                                  'position': "tokenizing data"
                              })

    for k in ["ti", "ab", "fullText"]:

        parsed = nlp.pipe((a.get(k, "") for a in articles
                           if a.get('skip_annotation', False) == False))
        articles_gen = (a for a in articles)

        while True:
            try:
                current_doc = articles_gen.__next__()
            except StopIteration:
                break
            if current_doc.get("skip_annotation"):
                continue
            else:
                current_doc['parsed_{}'.format(k)] = parsed.__next__()

    for bot_name in target_robots:
        current_task.update_state(state='PROGRESS',
                                  meta={
                                      'status':
                                      "in process",
                                      'position':
                                      "{} classification".format(bot_name)
                                  })
        results = bots[bot_name].api_annotate(articles)
        for a, r in zip(articles, results):
            if not a.get('skip_annotation', False):  # note: singular key, as set above
                a[bot_name] = r

    # delete the parsed text
    for article in articles:
        for k in ["ti", "ab", "fullText"]:
            article.pop('parsed_{}'.format(k), None)
    c = rr_sql_conn.cursor()

    current_task.update_state(state='PROGRESS',
                              meta={
                                  'status': "in process",
                                  'position':
                                  "writing the predictions to database"
                              })

    c.execute(
        "INSERT INTO api_done (report_uuid, annotations, timestamp) VALUES(?, ?, ?)",
        (report_uuid, json.dumps(articles), timestamp))
    rr_sql_conn.commit()
    c.close()

    # finally delete the data from the queue
    c = rr_sql_conn.cursor()
    c.execute("DELETE FROM api_queue WHERE report_uuid=?", (report_uuid, ))
    rr_sql_conn.commit()
    c.close()
    current_task.update_state(state='done')
    return {"status": 100, "task": "completed"}