コード例 #1
0
ファイル: bot.py プロジェクト: bailliem/trial2rev
def docsim(review_id, sess_id=None):
    """
    Use document similarity to recommend trials for a review.

    The review's title and abstract are turned into a normalised term-frequency
    vector, weighted with the IDF values of the most recent trials vectoriser,
    and compared by cosine similarity against every row of the stored trials
    TF-IDF matrix. The highest-scoring trials are then recorded against the
    review through crud.review_trial.

    @param review_id: PMID of review
    @param sess_id: session ID if transmitting progress via websocket
    @return: None; when no prediction can be made (no text, no usable tokens,
             or no term shared with the trials vocabulary) nothing is recorded
    """
    socketio = SocketIO(message_queue='amqp://localhost') if sess_id else None

    def progress(msg, pause=True):
        # Progress is only reported when a websocket session was supplied.
        if socketio is None:
            return
        socketio.emit('docsim_update', {'msg': msg}, room=sess_id)
        if pause:
            # Yield to other green threads between the long-running steps.
            eventlet.sleep(0)

    unable_msg = 'Unable to make predictions. Basicbot complete'

    progress('started basicbot')
    review = crud.review_medtadata_db(review_id)
    if review['abstract']:
        document = review['title'] + ' ' + review['abstract']
    else:
        document = review['title']
    if not document:
        progress(unable_msg, pause=False)
        return

    tf_transformer = TfidfVectorizer(use_idf=False)
    # NOTE(review): unpickling executes arbitrary code; this is only safe as
    # long as the vectoriser file comes from a trusted source.
    with open(utils.most_recent_tfidf_vec(), 'rb') as vec_file:
        trials_vectorizer = pickle.load(vec_file)
    try:
        normalised_tf_vector = tf_transformer.fit_transform([document])
    except ValueError:
        # Raised when the text yields no usable tokens (empty vocabulary).
        progress(unable_msg, pause=False)
        return
    progress('vectorising stuff...')

    trials_vocab = trials_vectorizer.vocabulary_
    review_vocab = tf_transformer.vocabulary_
    # Walk the review's terms in column order and keep those the trials
    # vectoriser also knows. Building both index lists from the same term list
    # guarantees that position k refers to the same term in each of them.
    shared_terms = [term for term in sorted(review_vocab, key=review_vocab.get)
                    if term in trials_vocab]
    if not shared_terms:
        # Nothing in common with the trials corpus: no similarity to compute.
        progress(unable_msg, pause=False)
        return
    idf_indices = np.array([trials_vocab[term] for term in shared_terms])
    tf_indices = np.array([review_vocab[term] for term in shared_terms])

    final_idf = trials_vectorizer.idf_[idf_indices]
    final_tf = normalised_tf_vector.toarray()[0][tf_indices]
    # A 2-D ndarray (one row) instead of np.matrix, which newer scikit-learn
    # releases no longer accept.
    review_tfidf = (final_tf * final_idf).reshape(1, -1)
    tfidf_matrix = scipy.sparse.load_npz(utils.most_recent_tfidf())
    tfidf_matrix = tfidf_matrix[:, idf_indices]
    progress('calculating similarity...')

    cos_sim = cosine_similarity(review_tfidf, tfidf_matrix).flatten()
    # Reverse slice over the ascending argsort: best score first, stopping
    # before position -100, i.e. at most the 99 highest-scoring trials.
    related_docs_indices = cos_sim.argsort()[:-100:-1]
    ids = np.load(utils.most_recent_tfidf_labels())
    to_insert = ids[related_docs_indices]
    progress('inserting predictions')

    for trial_id in to_insert:
        crud.review_trial(review_id, trial_id, False, 'relevant', 'basicbot1', 3)
    progress('basicbot complete!')
コード例 #2
0
def upload_models():
    """
    Upload the latest tfidf and TSNE models to the webserver.

    The upload is skipped entirely (after printing 'too old!') if the tfidf
    labels, the TSNE matrix or the TSNE image was last modified more than two
    days ago. Otherwise all five model files are copied with scp to the first
    host, and the tfidf labels and matrix are also copied to the second host
    under REMOTE_PATH2/models/tfidf/.
    """
    tfidf_matrix = utils.most_recent_tfidf()
    tfidf_vec = utils.most_recent_tfidf_vec()
    tfidf_labels = utils.most_recent_tfidf_labels()
    tsne_matrix = utils.most_recent_tsne()
    tsne_image = utils.most_recent_tsne_img()

    oldest_allowed = datetime.now() - timedelta(days=2)
    for path in [tfidf_labels, tsne_matrix, tsne_image]:
        modified = datetime.fromtimestamp(os.path.getmtime(path))
        print(modified)
        if modified < oldest_allowed:
            print('too old!')
            return

    # The scp argument lists are built directly rather than by splitting a
    # command string, so a key file or model path containing whitespace is
    # still passed to scp as a single argument.
    for path in [tfidf_labels, tsne_matrix, tsne_image, tfidf_matrix, tfidf_vec]:
        target = config.SCP_USER + '@' + config.SCP_HOST + ':' + replace_local_path(path)
        cmd = ['scp', '-i', config.SCP_KEYFILE, path, target]
        print(' '.join(cmd))
        call(cmd)

    for path in [tfidf_labels, tfidf_matrix]:
        target = (config.SCP2_USER + '@' + config.SCP2_HOST + ':' +
                  config.REMOTE_PATH2 + '/models/tfidf/' + os.path.basename(path))
        cmd = ['scp', '-i', config.SCP2_KEYFILE, path, target]
        print(' '.join(cmd))
        call(cmd)
コード例 #3
0
def docsim_freetext(document, sess_id=None):
    """
    Use document similarity to recommend trials for a piece of free text.

    The text is turned into a normalised term-frequency vector, weighted with
    the IDF values of the most recent trials vectoriser, and compared by cosine
    similarity against every row of the stored trials TF-IDF matrix.

    @param document: free text to find similar trials for
    @param sess_id: session ID if transmitting progress via websocket
    @return: list of entries from the stored trial label array for at most the
             99 most similar trials, most similar first; an empty list when no
             prediction can be made (no text, no usable tokens, or no term
             shared with the trials vocabulary)
    """
    socketio = None
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
        socketio.emit('docsim_update', {'msg': 'started basicbot'},
                      room=sess_id)
        eventlet.sleep(0)
    if not document:
        if sess_id:
            socketio.emit(
                'docsim_update',
                {'msg': 'Unable to make predictions. Basicbot complete'},
                room=sess_id)
        return []

    # Imported locally so the block does not depend on a module-level import.
    import scipy.sparse

    tf_transformer = TfidfVectorizer(use_idf=False)
    # NOTE(review): unpickling executes arbitrary code; this is only safe as
    # long as the vectoriser file comes from a trusted source.
    with open(utils.most_recent_tfidf_vec(), 'rb') as vec_file:
        trials_vectorizer = pickle.load(vec_file)
    try:
        normalised_tf_vector = tf_transformer.fit_transform([document])
    except ValueError as e:
        # Raised when the text yields no usable tokens (empty vocabulary).
        print(e)
        return []
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'vectorising stuff...'},
                      room=sess_id)
        eventlet.sleep(0)

    trials_vocab = trials_vectorizer.vocabulary_
    text_vocab = tf_transformer.vocabulary_
    # Walk the text's terms in column order and keep those the trials
    # vectoriser also knows. Building both index lists from the same term list
    # guarantees that position k refers to the same term in each of them.
    shared_terms = [
        term for term in sorted(text_vocab, key=text_vocab.get)
        if term in trials_vocab
    ]
    if not shared_terms:
        return []
    idf_indices = np.array([trials_vocab[term] for term in shared_terms])
    tf_indices = np.array([text_vocab[term] for term in shared_terms])

    # docsim passes this helper's result to scipy.sparse.load_npz, i.e. it is
    # a file path there, so the matrix has to be loaded before it can be
    # sliced. A result that is already a sparse matrix is used as-is.
    tfidf_source = utils.most_recent_tfidf()
    if scipy.sparse.issparse(tfidf_source):
        tfidf_matrix = tfidf_source
    else:
        tfidf_matrix = scipy.sparse.load_npz(tfidf_source)

    final_idf = trials_vectorizer.idf_[idf_indices]
    final_tf = normalised_tf_vector.toarray()[0][tf_indices]
    # A 2-D ndarray (one row) instead of np.matrix, which newer scikit-learn
    # releases no longer accept.
    review_tfidf = (final_tf * final_idf).reshape(1, -1)
    tfidf_matrix = tfidf_matrix[:, idf_indices]
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'calculating similarity...'},
                      room=sess_id)
        eventlet.sleep(0)

    cos_sim = cosine_similarity(review_tfidf, tfidf_matrix).flatten()
    # Reverse slice over the ascending argsort: best score first, stopping
    # before position -100, i.e. at most the 99 highest-scoring trials.
    related_docs_indices = cos_sim.argsort()[:-100:-1]
    ids = np.load(utils.most_recent_tfidf_labels())
    to_insert = ids[related_docs_indices]
    if sess_id:
        # No completion message is emitted from here; only yield to other
        # green threads before returning.
        eventlet.sleep(0)

    return list(to_insert)