Ejemplo n.º 1
0
def get_tsne_data(trials):
    """
    Look up the saved TSNE coordinates of the given trials and package them for plotting.

    Points for trials whose relationship is 'relevant' come first (colour '#fc6e2d'),
    followed by points for trials whose relationship is 'included' (colour '#5f4af9').
    @param trials: trial dicts, each providing 'nct_id' and 'relationship'
    @return: dict with the 'x' and 'y' coordinate lists, 'img' (path of the background
             image, cut so that it starts at 'static') and 'colours' (one per point)
    """
    coords = numpy.load(utils.most_recent_tsne())
    labels = numpy.load(utils.most_recent_tfidf_labels())

    def _ids_with(relationship):
        # nct_ids of the trials carrying the requested relationship
        return [entry['nct_id'] for entry in trials
                if entry['relationship'] == relationship]

    def _rows_of(nct_ids):
        # rows of the TSNE matrix whose label is one of nct_ids
        mask = numpy.isin(labels, nct_ids)
        return coords[numpy.nonzero(mask)[0], :]

    relevant_ids = _ids_with('relevant')
    included_ids = _ids_with('included')

    points = _rows_of(relevant_ids)
    colours = ['#fc6e2d'] * points.shape[0]
    if included_ids:
        included_points = _rows_of(included_ids)
        colours = colours + ['#5f4af9'] * included_points.shape[0]
        points = numpy.concatenate((points, included_points), axis=0)

    img_path = utils.most_recent_tsne_img()
    # keep only the part of the path from 'static' onwards
    img_path = img_path[img_path.index('static'):]
    return {
        'x': points[:, 0].tolist(),
        'y': points[:, 1].tolist(),
        'img': img_path,
        'colours': colours,
    }
Ejemplo n.º 2
0
def basicbot2(review_id=None, sess_id=None):
    """
    Use document similarity to recommend trials for a review, based on similarity to
    the trials currently marked as 'included' for that review.

    The votes of user 10 on the review's links and the unvoted links of user 10 are
    deleted first; then the highest-scoring trials (at most 100, minus the included
    trials themselves) are passed to crud.review_trial as 'relevant'.
    @param review_id: PMID of review
    @param sess_id: session ID if transmitting progress via websocket
    @return: False when the review has no included trials, otherwise None
    """
    socketio = SocketIO(message_queue='amqp://localhost') if sess_id else None

    def notify(msg):
        # progress is only reported when a websocket session was supplied
        if socketio is not None:
            socketio.emit('basicbot2_update', {'msg': msg}, room=sess_id)

    conn = dblib.create_con(VERBOSE=True)
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT nct_id FROM review_rtrial WHERE relationship = 'included' AND review_id = %s;",
            (review_id, ))
        trials = cur.fetchall()
        if not trials:
            print('no trials for basicbot2')
            return False
        cur.execute(
            "delete from votes where link_id in (select id from review_rtrial where review_id = %s) and user_id = %s;",
            (review_id, 10))
        conn.commit()
        # NOTE(review): this delete is not restricted to review_id, so it removes the
        # unvoted user-10 links of every review -- confirm that this is intended.
        cur.execute(
            "delete from review_rtrial where upvotes = 0 and downvotes = 0 and user_id = 10;"
        )
        conn.commit()
    finally:
        # release the connection even if one of the statements raises
        conn.close()

    notify('triggering basicbot2')
    tfidf_matrix = utils.most_recent_tfidf()
    ids = np.load(utils.most_recent_tfidf_labels())
    included = [row[0] for row in trials]
    trial_indices = np.where(np.isin(ids, included))[0]
    if trial_indices.size == 0:
        # none of the included trials is present in the TF-IDF labels, so there is
        # nothing to compare against (linear_kernel rejects an empty input)
        print('no included trials found in tfidf labels for basicbot2')
        notify('basicbot2 complete!')
        return
    notify('vectorizing stuff')
    trial_vecs = tfidf_matrix[trial_indices, :]
    cos_sim = linear_kernel(trial_vecs, tfidf_matrix)
    notify('calculating cosine similarity')
    # total similarity of every trial to the set of included trials
    final = cos_sim.sum(axis=0)
    if final.shape[0] > 100:
        top = np.argpartition(final, -100)[-100:]
    else:
        # argpartition raises when asked for more elements than exist
        top = np.arange(final.shape[0])
    top_ranked = set(ids[top]) - set(ids[trial_indices])
    notify('inserting basicbot 2 predictions')
    for nct_id in top_ranked:
        crud.review_trial(review_id, nct_id, False, 'relevant',
                          'basicbot2', 10)
    notify('basicbot2 complete!')
Ejemplo n.º 3
0
def docsim(review_id, sess_id=None):
    """
    Use document similarity to recommend trials based on similarity to the title &
    abstract text of a review.

    The review text is turned into a TF vector, weighted with the IDF values of the
    saved trials vectorizer (restricted to the terms both vocabularies share) and
    compared to the saved trials TF-IDF matrix; the best-scoring trials are passed to
    crud.review_trial as 'relevant'.
    @param review_id: PMID of review
    @param sess_id: session ID if transmitting progress via websocket
    @return: None
    """
    socketio = None
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')

    def notify(msg):
        # progress is only reported when a websocket session was supplied
        if socketio is not None:
            socketio.emit('docsim_update', {'msg': msg}, room=sess_id)
            eventlet.sleep(0)

    notify('started basicbot')
    review = crud.review_medtadata_db(review_id)
    document = (review['title'] + " " + review['abstract']) if review['abstract'] else review['title']
    if not document:
        notify('Unable to make predictions. Basicbot complete')
        return

    tf_transformer = TfidfVectorizer(use_idf=False)
    # the pickle is a locally generated model file; never point this at untrusted data
    with open(utils.most_recent_tfidf_vec(), 'rb') as vec_file:
        trials_vectorizer = pickle.load(vec_file)
    try:
        normalised_tf_vector = tf_transformer.fit_transform([document])
    except ValueError as e:
        # raised when the text yields no usable terms (empty vocabulary)
        print(e)
        notify('Unable to make predictions. Basicbot complete')
        return
    notify('vectorising stuff...')
    tfidf_matrix = scipy.sparse.load_npz(utils.most_recent_tfidf())

    # Terms known to both vectorizers; both index lists are derived from this single
    # list so that position i always refers to the same term in each of them.
    trials_vocab = trials_vectorizer.vocabulary_
    review_vocab = tf_transformer.vocabulary_
    shared = [name for name in tf_transformer.get_feature_names()
              if name in trials_vocab]
    if not shared:
        notify('Unable to make predictions. Basicbot complete')
        return
    idf_indices = np.array([trials_vocab[name] for name in shared])
    tf_indices = np.array([review_vocab[name] for name in shared])

    final_idf = trials_vectorizer.idf_[idf_indices]
    final_tf = np.array(normalised_tf_vector.toarray()[0])[tf_indices]
    review_tfidf = np.asmatrix(final_tf * final_idf)
    tfidf_matrix = tfidf_matrix[:, idf_indices]
    notify('calculating similarity...')
    cos_sim = cosine_similarity(review_tfidf, tfidf_matrix).flatten()
    # NOTE(review): [:-100:-1] selects the 99 highest-scoring rows, not 100 --
    # kept unchanged; confirm whether 100 was intended.
    related_docs_indices = cos_sim.argsort()[:-100:-1]
    ids = np.load(utils.most_recent_tfidf_labels())
    to_insert = ids[related_docs_indices]
    notify('inserting predictions')
    for nct_id in to_insert:
        crud.review_trial(review_id, nct_id, False, 'relevant', 'basicbot1', 3)
    notify('basicbot complete!')
Ejemplo n.º 4
0
def basicbot2_freetext(review_id=None, sess_id=None):
    """
    Use document similarity to recommend trials for a free-text review, based on
    similarity to the trials linked to it in freetext_review_rtrial.

    Nothing is written to the database; the recommendations are returned.
    @param review_id: ID of the review whose freetext_review_rtrial rows are used
    @param sess_id: session ID if transmitting progress via websocket
    @return: list of recommended nct_ids (at most 100, never containing the linked
             trials themselves); an empty list when there is nothing to base
             recommendations on
    """
    socketio = SocketIO(message_queue='amqp://localhost') if sess_id else None

    def notify(msg):
        # progress is only reported when a websocket session was supplied
        if socketio is not None:
            socketio.emit('basicbot2_update', {'msg': msg}, room=sess_id)

    conn = dblib.create_con(VERBOSE=True)
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT nct_id FROM freetext_review_rtrial WHERE review_id = %s;",
            (review_id, ))
        trials = cur.fetchall()
    finally:
        # close exactly once, also when the query raises
        conn.close()
    if not trials:
        print('no trials for basicbot2')
        return []

    notify('triggering basicbot2')
    tfidf_matrix = utils.most_recent_tfidf()
    ids = np.load(utils.most_recent_tfidf_labels())
    linked = [row[0] for row in trials]
    trial_indices = np.where(np.isin(ids, linked))[0]
    if trial_indices.size == 0:
        # none of the linked trials is present in the TF-IDF labels, so there is
        # nothing to compare against (linear_kernel rejects an empty input)
        print('no linked trials found in tfidf labels for basicbot2')
        return []
    notify('vectorizing stuff')
    trial_vecs = tfidf_matrix[trial_indices, :]
    cos_sim = linear_kernel(trial_vecs, tfidf_matrix)
    notify('calculating cosine similarity')
    # total similarity of every trial to the set of linked trials
    final = cos_sim.sum(axis=0)
    if final.shape[0] > 100:
        top = np.argpartition(final, -100)[-100:]
    else:
        # argpartition raises when asked for more elements than exist
        top = np.arange(final.shape[0])
    top_ranked = set(ids[top]) - set(ids[trial_indices])

    return list(top_ranked)
Ejemplo n.º 5
0
def _gen_T_v2(date):
    """
    Build and save the binary trials x reviews matrix of 'included' links.

    Rows follow the saved TF-IDF labels, columns are the distinct review ids of the
    links whose nct_id occurs in those labels; a cell is 1 where the trial is
    included in the review. Three files are written below models/matfac/: the
    sparse matrix (.npz) plus the row and column labels (written with np.save).
    @param date: string that is appended to the three output file names
    """
    con = dblib.create_con(VERBOSE=True)
    try:
        cur = con.cursor()
        cur.execute(
            "SELECT nct_id, review_id from review_rtrial where relationship = 'included';"
        )
        links = cur.fetchall()
    finally:
        # release the connection even if the query raises
        con.close()
    ar = np.array(links)
    rows = np.load(utils.most_recent_tfidf_labels())
    # keep only the links whose trial is present in the TF-IDF labels
    new_ar = ar[np.isin(ar[:, 0], rows)]
    # row position of each link's trial (first match in the labels)
    r_pos = np.array([np.where(rows == x)[0][0] for x in new_ar[:, 0]],
                     dtype=np.intp)
    cols, c_pos = np.unique(new_ar[:, 1], return_inverse=True)
    # Build the matrix sparse from the start instead of allocating a dense
    # len(rows) x len(cols) table first.
    s = scipy.sparse.csr_matrix(
        (np.ones(len(r_pos)), (r_pos, c_pos)),
        shape=(len(rows), len(cols)))
    # repeated (trial, review) pairs are summed by the constructor; reset to 1
    s.data[:] = 1
    scipy.sparse.save_npz('models/matfac/trials_x_reviews_' + date + '.npz', s)
    with open('models/matfac/nct_rows_' + date + ".pickle", "wb") as rows_file:
        np.save(rows_file, rows)
    with open('models/matfac/pmid_cols_' + date + ".pickle", "wb") as cols_file:
        np.save(cols_file, cols)
Ejemplo n.º 6
0
def _scp_upload(keyfile, local_path, remote_target):
    """ print and run an scp of local_path to remote_target, returning its exit status """
    # pass the arguments as a list so that whitespace inside a path cannot split it
    argv = ['scp', '-i', keyfile, local_path, remote_target]
    print(' '.join(argv))
    status = call(argv)
    if status != 0:
        print('scp exited with status %s for %s' % (status, local_path))
    return status


def upload_models():
    """
    upload the latest tfidf and TSNE models to webserver

    Nothing is uploaded when the TF-IDF labels, the TSNE matrix or the TSNE image
    were last modified more than two days ago. Otherwise all five model files are
    copied to the SCP_* host, and the TF-IDF labels and matrix additionally to the
    SCP2_* host below REMOTE_PATH2/models/tfidf/.
    """
    tfidf_matrix = utils.most_recent_tfidf()
    tfidf_vec = utils.most_recent_tfidf_vec()
    tfidf_labels = utils.most_recent_tfidf_labels()
    tsne_matrix = utils.most_recent_tsne()
    tsne_image = utils.most_recent_tsne_img()

    cutoff = datetime.now() - timedelta(days=2)
    for x in [tfidf_labels, tsne_matrix, tsne_image]:
        modified = datetime.fromtimestamp(os.path.getmtime(x))
        print(modified)
        if modified < cutoff:
            print('too old!')
            return

    for x in [tfidf_labels, tsne_matrix, tsne_image, tfidf_matrix, tfidf_vec]:
        _scp_upload(
            config.SCP_KEYFILE, x,
            config.SCP_USER + '@' + config.SCP_HOST + ':' + replace_local_path(x))

    for x in [tfidf_labels, tfidf_matrix]:
        _scp_upload(
            config.SCP2_KEYFILE, x,
            config.SCP2_USER + '@' + config.SCP2_HOST + ':' + config.REMOTE_PATH2
            + '/models/tfidf/' + x.split('/')[-1])
Ejemplo n.º 7
0
def docsim_freetext(document, sess_id=None):
    """
    Use document similarity to recommend trials based on similarity to a free-text
    document.

    The text is turned into a TF vector, weighted with the IDF values of the saved
    trials vectorizer (restricted to the terms both vocabularies share) and compared
    to the saved trials TF-IDF matrix. Nothing is written to the database.
    @param document: text to find similar trials for
    @param sess_id: session ID if transmitting progress via websocket
    @return: list of the nct_ids of the best-scoring trials, best first; an empty
             list when no prediction can be made
    """
    socketio = None
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')

    def notify(msg):
        # progress is only reported when a websocket session was supplied
        if socketio is not None:
            socketio.emit('docsim_update', {'msg': msg}, room=sess_id)
            eventlet.sleep(0)

    notify('started basicbot')
    if not document:
        notify('Unable to make predictions. Basicbot complete')
        return []

    tf_transformer = TfidfVectorizer(use_idf=False)
    # the pickle is a locally generated model file; never point this at untrusted data
    with open(utils.most_recent_tfidf_vec(), 'rb') as vec_file:
        trials_vectorizer = pickle.load(vec_file)
    try:
        normalised_tf_vector = tf_transformer.fit_transform([document])
    except ValueError as e:
        # raised when the text yields no usable terms (empty vocabulary)
        print(e)
        return []
    notify('vectorising stuff...')
    tfidf_matrix = utils.most_recent_tfidf()

    # Terms known to both vectorizers; both index lists are derived from this single
    # list so that position i always refers to the same term in each of them.
    trials_vocab = trials_vectorizer.vocabulary_
    document_vocab = tf_transformer.vocabulary_
    shared = [name for name in tf_transformer.get_feature_names()
              if name in trials_vocab]
    if not shared:
        return []
    idf_indices = np.array([trials_vocab[name] for name in shared])
    tf_indices = np.array([document_vocab[name] for name in shared])

    final_idf = trials_vectorizer.idf_[idf_indices]
    final_tf = np.array(normalised_tf_vector.toarray()[0])[tf_indices]
    review_tfidf = np.asmatrix(final_tf * final_idf)
    tfidf_matrix = tfidf_matrix[:, idf_indices]
    notify('calculating similarity...')
    cos_sim = cosine_similarity(review_tfidf, tfidf_matrix).flatten()
    # NOTE(review): [:-100:-1] selects the 99 highest-scoring rows, not 100 --
    # kept unchanged; confirm whether 100 was intended.
    related_docs_indices = cos_sim.argsort()[:-100:-1]
    ids = np.load(utils.most_recent_tfidf_labels())
    to_insert = ids[related_docs_indices]
    if sess_id:
        # no completion message is emitted here; only yield to the event loop
        eventlet.sleep(0)

    return list(to_insert)