def _matfac_results(date):
    """Jointly factorize the trials tfidf matrix and the trials-x-reviews
    matrix, then persist the predicted trial/review score matrix.

    @param date: date string used to tag the saved ``models/matfac/*.npy``
        artefacts
    """
    K = 50  # latent-factor dimensionality of the factorization
    # NOTE(review): elsewhere in this module utils.most_recent_tfidf() is
    # treated as a *file path* (passed to scipy.sparse.load_npz) — here its
    # return value is fed straight to TruncatedSVD. Confirm what the helper
    # actually returns; one of the call sites is likely wrong.
    sparse_R = utils.most_recent_tfidf()
    svd = TruncatedSVD(n_components=200)
    R = svd.fit_transform(sparse_R)  # dense, dimensionality-reduced tfidf
    # np.save appends the '.npy' suffix automatically
    np.save('models/matfac/truncated_r_' + date, R)
    # R = np.load('models/matfac/truncated_r_'+date+'.npy').astype('float64')
    T = scipy.sparse.load_npz(utils.most_recent_trialsxreviews())
    numNonZeroT = T.count_nonzero()  # count observed entries before densifying
    T = T.todense().astype('float64')
    # random initial factor matrices, scaled down to keep early updates small
    estP = np.random.random_sample([R.shape[0], K]) / 10
    estQ = np.random.random_sample([R.shape[1], K]) / 10
    estW = np.random.random_sample([T.shape[1], K]) / 10
    PS_K = np.zeros(K, dtype='float64')
    numRow = R.shape[0]    # number of trials
    numCol1 = R.shape[1]   # reduced tfidf feature count
    numCol2 = T.shape[1]   # number of reviews
    numIter = 5000         # gradient-descent iterations
    alpha_par = 0.01       # learning rate
    lambda_par = 0.001     # regularization weight for the R factors
    lambda_t_par = 0.1     # regularization weight for the T factors
    T_est = np.zeros((numRow, numCol2), dtype='float64')
    VERBOSE = 1
    # matfac.run takes its arguments positionally and is order-sensitive;
    # presumably it fills/returns the estimated T matrix — TODO confirm
    # against the matfac extension's signature before reordering anything.
    T_est = np.asarray(
        matfac.run(R, T, estP, estQ, estW, PS_K, numNonZeroT, K, numRow,
                   numCol1, numCol2, numIter, alpha_par, lambda_par,
                   lambda_t_par, T_est, VERBOSE))
    np.save('models/matfac/matfac_results_' + date + '.npy', T_est)
def basicbot2(review_id=None, sess_id=None):
    """ use document similarity to recommend trials for a review based on
    similarity to current included trials

    @param review_id: PMID of review
    @param sess_id: session ID if transitting progress via websocket
    @return: False when the review has no included trials, otherwise None
        (recommendations are written to the DB via crud.review_trial)
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
    conn = dblib.create_con(VERBOSE=True)
    cur = conn.cursor()
    # the seed set: trials already marked 'included' for this review
    cur.execute(
        "SELECT nct_id FROM review_rtrial WHERE relationship = 'included' AND review_id = %s;",
        (review_id, ))
    trials = cur.fetchall()
    if len(trials) < 1:
        print('no trials for basicbot2')
        conn.close()
        return False
    if trials:
        # clear this bot's (user_id 10) previous votes for this review
        cur.execute(
            "delete from votes where link_id in (select id from review_rtrial where review_id = %s) and user_id = %s;",
            (review_id, 10))
        conn.commit()
        # NOTE(review): this DELETE is NOT scoped to review_id — it removes
        # vote-less bot links across *all* reviews. Confirm this global
        # cleanup is intentional.
        cur.execute(
            "delete from review_rtrial where upvotes = 0 and downvotes = 0 and user_id = 10;"
        )
        conn.commit()
        conn.close()
        if sess_id:
            socketio.emit('basicbot2_update', {'msg': 'triggering basicbot2'},
                          room=sess_id)
        # NOTE(review): other functions in this module pass
        # utils.most_recent_tfidf() through scipy.sparse.load_npz before
        # slicing; here it is sliced directly — verify the helper's return type.
        tfidf_matrix = utils.most_recent_tfidf()
        ids = np.load(utils.most_recent_tfidf_labels())
        # flatten [(nct_id,), ...] rows into a plain list of nct_ids
        trials = list(list(zip(*trials))[0])
        ix = np.isin(ids, trials)
        trial_indices = np.where(ix)[0]
        if sess_id:
            socketio.emit('basicbot2_update', {'msg': 'vectorizing stuff'},
                          room=sess_id)
        trial_vecs = tfidf_matrix[trial_indices, :]
        # linear_kernel on tfidf rows == cosine similarity scores
        cos_sim = linear_kernel(trial_vecs, tfidf_matrix)
        if sess_id:
            socketio.emit('basicbot2_update',
                          {'msg': 'calculating cosine similarity'},
                          room=sess_id)
        # aggregate similarity against all seed trials, take the top 100,
        # then drop the seed trials themselves
        final = cos_sim.sum(axis=0)
        top = np.argpartition(final, -100)[-100:]
        top_ranked = set(ids[np.array(top)]) - set(ids[trial_indices])
        if sess_id:
            socketio.emit('basicbot2_update',
                          {'msg': 'inserting basicbot 2 predictions'},
                          room=sess_id)
        for nct_id in top_ranked:
            # user_id 10 identifies basicbot2's votes
            crud.review_trial(review_id, nct_id, False, 'relevant',
                              'basicbot2', 10)
        if sess_id:
            socketio.emit('basicbot2_update', {'msg': 'basicbot2 complete!'},
                          room=sess_id)
def regenerate_tsne():
    """Regenerate the TSNE matrix & plot image with the latest trial data.

    Side effects: writes a dated TSNE matrix and background image under
    models/tsne/ (via _tsne / _new_tsne_img) and uploads the refreshed
    models to the configured remote servers.
    """
    date = datetime.now().date().strftime('%d-%m-%Y')
    tfidf_matrix = scipy.sparse.load_npz(utils.most_recent_tfidf())
    _tsne(tfidf_matrix, date)
    # fix: the rest of this module uses the `np` alias — the bare `numpy`
    # name raised NameError unless numpy was also imported separately
    new_tsne = np.load('models/tsne/tsne_matrix_' + date + '.npy')
    # generate new _tsne background plot
    _new_tsne_img(new_tsne, date)
    # push the fresh models to the remote server(s)
    upload_models()
def docsim(review_id, sess_id=None):
    """ use document similarity to recommend trials based on similarity to
    title & abstract text of review

    @param review_id: PMID of review
    @param sess_id: session ID if transitting progress via websocket
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
        socketio.emit('docsim_update', {'msg': 'started basicbot'},
                      room=sess_id)
        eventlet.sleep(0)
    review = crud.review_medtadata_db(review_id)
    # concatenate title + abstract when an abstract exists
    document = (review['title'] + """
""" + review['abstract']) if review['abstract'] else review['title']
    if not document:
        if sess_id:
            socketio.emit('docsim_update',
                          {'msg': 'Unable to make predictions. Basicbot complete'},
                          room=sess_id)
        return
    tf_transformer = TfidfVectorizer(use_idf=False)
    # fix: open the pickle in binary mode and close the handle — the bare
    # open() leaked the file object and breaks pickle under Python 3
    with open(utils.most_recent_tfidf_vec(), 'rb') as vec_file:
        trials_vectorizer = pickle.load(vec_file)
    normalised_tf_vector = tf_transformer.fit_transform([document])
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'vectorising stuff...'},
                      room=sess_id)
        eventlet.sleep(0)
    tfidf_matrix = scipy.sparse.load_npz(utils.most_recent_tfidf())
    # project the review's term frequencies onto the trials vocabulary:
    # idf_indices — trials-vocab columns for terms shared with the review;
    # tf_indices — review-vector columns for the same shared terms
    idf_indices = [trials_vectorizer.vocabulary_[feature_name]
                   for feature_name in tf_transformer.get_feature_names()
                   if feature_name in trials_vectorizer.vocabulary_.keys()]
    tf_indices = [tf_transformer.vocabulary_[feature_name]
                  for feature_name in trials_vectorizer.get_feature_names()
                  if feature_name in tf_transformer.vocabulary_.keys()]
    # fix: guard against zero vocabulary overlap (consistent with
    # docsim_freetext) — indexing with an empty array crashed here
    if not idf_indices:
        if sess_id:
            socketio.emit('docsim_update',
                          {'msg': 'Unable to make predictions. Basicbot complete'},
                          room=sess_id)
        return
    final_idf = trials_vectorizer.idf_[np.array(idf_indices)]
    final_tf = np.array(normalised_tf_vector.toarray()[0])[np.array(tf_indices)]
    review_tfidf = np.asmatrix(final_tf * final_idf)
    tfidf_matrix = tfidf_matrix[:, np.array(idf_indices)]
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'calculating similarity...'},
                      room=sess_id)
        eventlet.sleep(0)
    cos_sim = cosine_similarity(review_tfidf, tfidf_matrix).flatten()
    # indices of the 99 most similar trials, best first
    related_docs_indices = cos_sim.argsort()[:-100:-1]
    ids = np.load(utils.most_recent_tfidf_labels())
    to_insert = ids[np.array(related_docs_indices)]
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'inserting predictions'},
                      room=sess_id)
        eventlet.sleep(0)
    # user_id 3 identifies basicbot1's votes (renamed loop var: `id` shadowed
    # the builtin)
    for nct_id in to_insert:
        crud.review_trial(review_id, nct_id, False, 'relevant', 'basicbot1', 3)
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'basicbot complete!'},
                      room=sess_id)
        eventlet.sleep(0)
def basicbot2_freetext(review_id=None, sess_id=None):
    """ use document similarity to recommend trials for a review based on
    similarity to current included trials

    @param review_id: PMID of review
    @param sess_id: session ID if transitting progress via websocket
    @return: list of recommended nct_ids (empty when the review has no trials)
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
    conn = dblib.create_con(VERBOSE=True)
    cur = conn.cursor()
    cur.execute(
        "SELECT nct_id FROM freetext_review_rtrial WHERE review_id = %s;",
        (review_id, ))
    trials = cur.fetchall()
    # the connection is only needed for the seed query; close it here once
    # (fix: the original closed it a second time in the empty branch)
    conn.close()
    if len(trials) < 1:
        print('no trials for basicbot2')
        return []
    # trials is non-empty from here on (the redundant `if trials:` guard
    # was removed)
    if sess_id:
        socketio.emit('basicbot2_update', {'msg': 'triggering basicbot2'},
                      room=sess_id)
    # NOTE(review): other functions pass utils.most_recent_tfidf() through
    # scipy.sparse.load_npz before slicing; verify the helper's return type
    tfidf_matrix = utils.most_recent_tfidf()
    ids = np.load(utils.most_recent_tfidf_labels())
    # flatten [(nct_id,), ...] rows into a plain list of nct_ids
    trials = list(list(zip(*trials))[0])
    ix = np.isin(ids, trials)
    trial_indices = np.where(ix)[0]
    if sess_id:
        socketio.emit('basicbot2_update', {'msg': 'vectorizing stuff'},
                      room=sess_id)
    trial_vecs = tfidf_matrix[trial_indices, :]
    # linear_kernel on tfidf rows == cosine similarity scores
    cos_sim = linear_kernel(trial_vecs, tfidf_matrix)
    if sess_id:
        socketio.emit('basicbot2_update',
                      {'msg': 'calculating cosine similarity'},
                      room=sess_id)
    # aggregate similarity to all seed trials, take the top 100, then drop
    # the seed trials themselves
    final = cos_sim.sum(axis=0)
    top = np.argpartition(final, -100)[-100:]
    top_ranked = set(ids[np.array(top)]) - set(ids[trial_indices])
    return list(top_ranked)
def upload_models():
    """ upload the latest tfidf and TSNE models to webserver

    Aborts without uploading if any of the label/TSNE artefacts is older
    than 2 days.
    """
    tfidf_matrix = utils.most_recent_tfidf()
    tfidf_vec = utils.most_recent_tfidf_vec()
    tfidf_labels = utils.most_recent_tfidf_labels()
    tsne_matrix = utils.most_recent_tsne()
    tsne_image = utils.most_recent_tsne_img()
    # staleness check — refuse to publish models older than 2 days
    # (fix: converted Python-2 `print` statements to print() calls, matching
    # the rest of this module)
    for x in [tfidf_labels, tsne_matrix, tsne_image]:
        print(datetime.fromtimestamp(os.path.getmtime(x)))
        if datetime.fromtimestamp(
                os.path.getmtime(x)) < datetime.now() - timedelta(days=2):
            print('too old!')
            return
    # fix: build argv lists directly instead of a string + .split(), which
    # broke on any path containing spaces
    for x in [tfidf_labels, tsne_matrix, tsne_image, tfidf_matrix, tfidf_vec]:
        cmd = [
            'scp', '-i', config.SCP_KEYFILE, x,
            config.SCP_USER + '@' + config.SCP_HOST + ':' +
            replace_local_path(x)
        ]
        print(' '.join(cmd))
        call(cmd)
    # second destination only receives the labels and the tfidf matrix
    for x in [tfidf_labels, tfidf_matrix]:
        cmd = [
            'scp', '-i', config.SCP2_KEYFILE, x,
            config.SCP2_USER + '@' + config.SCP2_HOST + ':' +
            config.REMOTE_PATH2 + '/models/tfidf/' + x.split('/')[-1]
        ]
        print(' '.join(cmd))
        call(cmd)
def docsim_freetext(document, sess_id=None):
    """ use document similarity to recommend trials based on similarity to
    free text (e.g. title & abstract of a review)

    @param document: free text to match against the trials corpus
        (fix: docstring previously described a nonexistent review_id param)
    @param sess_id: session ID if transitting progress via websocket
    @return: list of recommended nct_ids (empty on failure)
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
        socketio.emit('docsim_update', {'msg': 'started basicbot'},
                      room=sess_id)
        eventlet.sleep(0)
    if not document:
        if sess_id:
            socketio.emit(
                'docsim_update',
                {'msg': 'Unable to make predictions. Basicbot complete'},
                room=sess_id)
        return []
    tf_transformer = TfidfVectorizer(use_idf=False)
    # fix: open the pickle in binary mode and close the handle — the bare
    # open() leaked the file object and breaks pickle under Python 3
    with open(utils.most_recent_tfidf_vec(), 'rb') as vec_file:
        trials_vectorizer = pickle.load(vec_file)
    try:
        normalised_tf_vector = tf_transformer.fit_transform([document])
    except ValueError as e:
        # document contained no usable tokens (e.g. only stop words)
        print(e)
        return []
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'vectorising stuff...'},
                      room=sess_id)
        eventlet.sleep(0)
    # NOTE(review): docsim() passes this helper's return through
    # scipy.sparse.load_npz before column-slicing; verify the return type
    tfidf_matrix = utils.most_recent_tfidf()
    # project the document's term frequencies onto the trials vocabulary:
    # idf_indices — trials-vocab columns for shared terms;
    # tf_indices — document-vector columns for the same shared terms
    idf_indices = [
        trials_vectorizer.vocabulary_[feature_name]
        for feature_name in tf_transformer.get_feature_names()
        if feature_name in trials_vectorizer.vocabulary_.keys()
    ]
    tf_indices = [
        tf_transformer.vocabulary_[feature_name]
        for feature_name in trials_vectorizer.get_feature_names()
        if feature_name in tf_transformer.vocabulary_.keys()
    ]
    # no vocabulary overlap — nothing to score
    if not idf_indices:
        return []
    final_idf = trials_vectorizer.idf_[np.array(idf_indices)]
    final_tf = np.array(
        normalised_tf_vector.toarray()[0])[np.array(tf_indices)]
    review_tfidf = np.asmatrix(final_tf * final_idf)
    tfidf_matrix = tfidf_matrix[:, np.array(idf_indices)]
    if sess_id:
        socketio.emit('docsim_update', {'msg': 'calculating similarity...'},
                      room=sess_id)
        eventlet.sleep(0)
    cos_sim = cosine_similarity(review_tfidf, tfidf_matrix).flatten()
    # indices of the 99 most similar trials, best first
    related_docs_indices = cos_sim.argsort()[:-100:-1]
    ids = np.load(utils.most_recent_tfidf_labels())
    to_insert = ids[np.array(related_docs_indices)]
    if sess_id:
        # (dead commented-out 'basicbot complete!' emit removed; the yield
        # to the event loop is kept)
        eventlet.sleep(0)
    return list(to_insert)