def yearwise_SD(Y_terms, chems, **kwargs):
    """Returning overall and year-wise Social Density (SD) values for a set of
    chemical compounds and a set of properties (Y-terms)

    Jaccardian SD(X,Y) = |A(X) intersect A(Y)| / |A(X) union A(Y)|
    """

    # setting up the logger
    logger_disable = kwargs.get('logger_disable', False)
    logfile_path = kwargs.get('logfile_path', None)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)

    msdb.crsr.execute('SELECT COUNT(*) FROM paper;')
    logger.info('Total number of documents in the DB: {}'.format(
        msdb.crsr.fetchall()[0][0]))

    # getting unique authors of Y-terms in different years
    case_sensitives = kwargs.get('case_sensitives', [])
    logger.info('Downloading authors for terms {} in their abstracts'.format(Y_terms))
    R = msdb.get_authors_by_keywords(Y_terms,
                                     cols=['author_id', 'P.date'],
                                     return_papers=False,
                                     case_sensitives=case_sensitives)
    if len(R) == 0:
        raise ValueError('Given property terms are not associated with any '
                         'papers in the database')

    Y_years = np.array([y.year for y in R['date']])
    Y_authors = {y: R['author_id'][Y_years == y] for y in np.unique(Y_years)}
    unique_Y_authors = np.unique(R['author_id'])

    min_yr = np.min(Y_years)
    max_yr = np.max(Y_years)

    logger.info('Downloading is done. The oldest paper was published in {}.'.format(min_yr))
    logger.info('The total number of unique authors is {}.'.format(len(unique_Y_authors)))

    # iterating over chemicals and computing SD for each
    yr_SDs = np.zeros((len(chems), max_yr - min_yr + 1))
    years = np.arange(min_yr, max_yr + 1)

    save_dirname = kwargs.get('save_dirname', None)

    logger.info('Iterating over chemicals for computing social densities began.')
    for i, chm in enumerate(chems):
        if not(i % 1000) or (i == len(chems) - 1):
            logger.info('Iteration {}..'.format(i))
            if save_dirname is not None:
                np.savetxt(os.path.join(save_dirname, 'yr_SDs.txt'), yr_SDs)

        # getting unique authors of this material in different years
        R = msdb.get_authors_by_chemicals([chm],
                                          cols=['author_id', 'P.date'],
                                          years=np.unique(Y_years),
                                          return_papers=False)
        if len(R) == 0:
            continue
        X_years = np.array([y.year for y in R[chm]['date']])
        X_authors = {y: R[chm]['author_id'][X_years == y] for y in np.unique(X_years)}

        overlap_dict, union_dict = yearwise_authors_set_op(X_authors, Y_authors)
        for yr in Y_authors:
            yr_SDs[i, yr - min_yr] = len(overlap_dict[yr]) / len(union_dict[yr])

    return yr_SDs, years
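# A minimal sketch of the year-wise Jaccardian SD computed above, using toy
# author sets instead of database queries. The per-year intersection/union
# step mirrors what yearwise_authors_set_op is assumed to return (overlap and
# union dictionaries keyed by year); the real helper may differ in details.
def _example_yearwise_jaccard_sd():
    # authors who published on the property (Y) and on a chemical (X), by year
    Y_authors = {2010: {1, 2, 3}, 2011: {2, 4}}
    X_authors = {2010: {2, 3, 5}, 2011: {6}}

    sd = {}
    for yr, Y_set in Y_authors.items():
        X_set = X_authors.get(yr, set())
        overlap = Y_set & X_set
        union = Y_set | X_set
        sd[yr] = len(overlap) / len(union)

    return sd   # {2010: 0.5, 2011: 0.0}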
def eval_predictor(predictor_func, gt_func, year_of_pred, **kwargs):
    """Evaluating how accurately the predictions of a given predictor function
    match the actual discoveries returned by a given ground-truth function

    The evaluations are done for individual years starting from a given year
    of prediction up to 2018.
    """

    metric = kwargs.get('metric', 'cumul_precision')
    last_year = kwargs.get('last_year', 2019)
    save_path = kwargs.get('save_path', None)
    return_preds = kwargs.get('return_preds', False)
    logfile_path = kwargs.get('logfile_path', None)
    logger_disable = kwargs.get('logger_disable', False)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)

    """ Generating the Predictions """
    preds = predictor_func(year_of_pred)
    logger.info('Number of actual predictions: {}'.format(len(preds)))
    if metric == 'auc':
        if len(preds) != 2:
            raise ValueError('When asking for the AUC metric, the predictor '
                             'should return a score array too.')
        scores = preds[1]
        preds = preds[0]

    if save_path is not None:
        with open(save_path, 'w') as f:
            f.write('\n'.join(preds) + '\n')

    """ Evaluating the Predictions for the Upcoming Years """
    years_of_eval = np.arange(year_of_pred, last_year)
    iter_list = []   # to hold the precision values or actual discoveries (for AUC)
    for i, yr in enumerate(years_of_eval):
        gt = gt_func(yr)

        if metric == 'cumul_precision':
            # cumulative precision
            iter_list += [np.sum(np.in1d(gt, preds)) / len(preds)]
        elif metric == 'auc':
            # area under the ROC curve
            iter_list += gt.tolist()

    if metric == 'cumul_precision':
        res = np.cumsum(iter_list)
    elif metric == 'auc':
        y = np.zeros(len(preds))
        y[np.isin(preds, iter_list)] = 1
        res = roc_auc_score(y, scores)

    if return_preds:
        return res, preds
    else:
        return res
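# A hedged usage sketch of eval_predictor with toy stand-in callables. The
# predictor and ground-truth functions below are hypothetical; in the real
# pipeline they would query the prediction model and the database.
def _example_eval_predictor():
    # hypothetical predictor: candidate materials proposed in the prediction year
    predictor_func = lambda year_of_pred: np.array(['NaCl', 'TiO2', 'ZnO'])
    # hypothetical ground truth: materials actually discovered in a given year
    gt_by_year = {2015: np.array(['TiO2']), 2016: np.array(['ZnO', 'Fe2O3'])}
    gt_func = lambda yr: gt_by_year.get(yr, np.array([]))

    # cumulative precision over 2015-2016: [1/3, 1/3 + 1/3] = [0.333, 0.667]
    return eval_predictor(predictor_func, gt_func, 2015,
                          metric='cumul_precision', last_year=2017,
                          logger_disable=True)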
def __init__(self, path_to_data, **kwargs):

    self.path_to_data = path_to_data

    self.pars = {}
    for key, def_val in DEFAULT_PARS.items():
        self.pars[key] = kwargs.get(key, def_val)

    # setting up the logger
    logger_disable = kwargs.get('logger_disable', False)
    self.logfile_path = kwargs.get('logfile_path', None)
    self.logger = helpers.set_up_logger(__name__, self.logfile_path, logger_disable)
def __init__(self, path_to_sents, **kwargs):

    self.sents = open(path_to_sents, 'r').read().splitlines()

    # for now we are skipping the preprocessing steps as we
    # will be using this for deepwalk sentences that do not need
    # preprocessing
    # ... PREPROCESSING GOES HERE
    self.to_be_removed = []

    # setting up the logger
    logger_disable = kwargs.get('silent', False)
    self.logger = helpers.set_up_logger(__name__, None, logger_disable)
def cooccurrences(Y_terms, ents, **kwargs):
    """Getting co-occurrences of a given list of entities and a set of
    keywords (Y-terms) in abstracts of the database
    """

    msdb.crsr.execute('SELECT COUNT(*) FROM chemical_paper_mapping;')
    cnt = msdb.crsr.fetchall()[0][0]
    print('Number of rows in chemical-paper-mapping: {}'.format(cnt))

    # setting up the logger
    logger_disable = kwargs.get('logger_disable', False)
    logfile_path = kwargs.get('logfile_path', None)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)

    # downloading papers with Y-terms (Y-papers) and categorizing them yearwise
    logger.info('Downloading papers with terms {} in their abstracts'.format(Y_terms))
    case_sensitives = kwargs.get('case_sensitives', [])
    (_, Y_papers), (_, Y_dates) = msdb.get_papers_by_keywords(
        Y_terms,
        cols=['paper_id', 'date'],
        logical_comb='OR',
        case_sensitives=case_sensitives).items()
    Y_years = np.array([y.year for y in Y_dates])
    Y_distinct_yrs = np.unique(Y_years)

    min_yr = np.min(Y_years)
    max_yr = np.max(Y_years)
    yrs = np.arange(min_yr, max_yr + 1)

    logger.info('{} papers with Y-terms have been downloaded. '
                'The earliest one was published in {}'.format(len(Y_papers), min_yr))

    cocrs = np.zeros((len(ents), len(yrs)))
    ents = np.array(ents)
    for i, yr in enumerate(Y_years):
        yr_loc = yr - min_yr

        # add co-occurrences for all chemicals present in this paper
        present_ents = msdb.get_chemicals_by_paper_ids(int(Y_papers[i]),
                                                       cols=['formula'])
        present_ents_formula = present_ents[int(Y_papers[i])]['formula'] if len(present_ents) > 0 else []
        present_ents_formula = list(set(present_ents_formula).intersection(set(ents)))
        present_ents_locs = [np.where(ents == frml)[0][0] for frml in present_ents_formula]

        for cloc in present_ents_locs:
            cocrs[cloc, yr_loc] += 1

        if not(i % 1000):
            logger.info('{} papers have been reviewed.'.format(i))

    return cocrs, yrs
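# A small, self-contained illustration of how the co-occurrence matrix above
# is filled: rows are entities, columns are years, and each Y-paper increments
# the cells of the entities it mentions. The paper-to-entity links below are
# toy data, not database queries.
def _example_cooccurrence_matrix():
    ents = np.array(['NaCl', 'TiO2', 'ZnO'])
    yrs = np.arange(2010, 2013)                          # 2010, 2011, 2012
    paper_years = [2010, 2010, 2012]                     # years of three Y-papers
    paper_ents = [['NaCl'], ['NaCl', 'ZnO'], ['TiO2']]   # entities found in each paper

    cocrs = np.zeros((len(ents), len(yrs)))
    for yr, present in zip(paper_years, paper_ents):
        yr_loc = yr - yrs[0]
        for frml in set(present).intersection(set(ents)):
            cocrs[np.where(ents == frml)[0][0], yr_loc] += 1

    return cocrs   # NaCl row: [2,0,0], TiO2 row: [0,0,1], ZnO row: [1,0,0]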
def compute_vertex_matrix(db, **kwargs):
    """Forming the vertex matrix of the hypergraph, which is a |E|x|V| matrix
    whose (i,j) element is equal to 1 if hyperedge (article) i includes node j
    and zero otherwise

    The hyperedges are the articles and the nodes are the union of author
    and chemical nodes
    """

    # setting up the logger
    logger_disable = kwargs.get('logger_disable', False)
    logfile_path = kwargs.get('logfile_path', None)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)

    savefile_path = kwargs.get('savefile_path', None)

    nP = db.count_table_rows('paper')
    Pids = db.get_1d_query('SELECT id FROM paper;')
    nA = db.count_table_rows('author')
    Aids = db.get_1d_query('SELECT id FROM author;')
    nE = db.count_table_rows(db.entity_tab)
    Eids = db.get_1d_query('SELECT id FROM {};'.format(db.entity_tab))
    logger.info('#papers={}, #authors={}, #entities={}'.format(nP, nA, nE))

    VM = sparse.lil_matrix((nP, nA + nE), dtype=np.uint8)

    # filling the matrix in batches
    cnt = 0
    batch_size = 500
    logger.info('Starting to fill the vertex matrix with batch size {}'.format(batch_size))
    while cnt < nP:
        inds = np.arange(cnt, min(cnt + batch_size, nP))
        batch_Pids = Pids[inds]

        q_Aids = db.get_LoA_by_PID(batch_Pids)
        q_Eids = db.get_LoE_by_PID(batch_Pids)

        cols = []
        rows = []
        for i, pid in enumerate(batch_Pids):
            # each PID has a number of authors and entities;
            # locate them in the global arrays of author and entity IDs;
            # these locations will be their columns in the vertex matrix
            au_cols = np.where(np.isin(Aids, q_Aids[pid]['id']))[0] if pid in q_Aids else []
            ent_cols = (np.where(np.isin(Eids, q_Eids[pid]['id']))[0] + nA) if pid in q_Eids else []
            cols += [np.concatenate((au_cols, ent_cols))]
            rows += [inds[i] * np.ones(len(au_cols) + len(ent_cols))]

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        VM[rows, cols] = 1

        cnt += batch_size

        if not (cnt % 100000):
            logger.info('{} articles have been processed'.format(cnt))
        if not (cnt % 10000) and (savefile_path is not None):
            sparse.save_npz(savefile_path, VM.tocsc())

    return VM
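# A toy example of the paper-by-node incidence (vertex) matrix built above:
# rows are papers (hyperedges), the first block of columns are authors and the
# remaining columns are entities. The IDs below are made up for illustration.
def _example_vertex_matrix():
    Aids = np.array([10, 11, 12])      # author IDs
    Eids = np.array([100, 101])        # entity IDs
    nA, nE = len(Aids), len(Eids)

    # paper 0 written by authors 10 and 12 about entity 101;
    # paper 1 written by author 11 about entity 100
    paper_nodes = [({10, 12}, {101}), ({11}, {100})]

    VM = sparse.lil_matrix((len(paper_nodes), nA + nE), dtype=np.uint8)
    for row, (authors, entities) in enumerate(paper_nodes):
        au_cols = np.where(np.isin(Aids, list(authors)))[0]
        ent_cols = np.where(np.isin(Eids, list(entities)))[0] + nA
        VM[row, np.concatenate((au_cols, ent_cols))] = 1

    return VM.toarray()   # [[1,0,1,0,1], [0,1,0,1,0]]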
def collect_authors_new_discoveries(self, full_chems, cocrs, Y_terms, yrs, **kwargs):
    """Collecting authors of papers with new co-occurrences (new discoveries)
    and extracting their previous papers on the topic of the property and/or
    the newly studied molecule
    """

    case_sensitives = kwargs.get('case_sensitives', [])
    logfile_path = kwargs.get('logfile_path', None)
    savefile_path = kwargs.get('savefile_path', None)
    start_yr = kwargs.get('start_yr', 2001)
    yr_Y_authors = kwargs.get('yr_Y_authors', None)
    yr_Y_papers = kwargs.get('yr_Y_papers', None)

    logger = set_up_logger(__name__, logfile_path, False)

    if (yr_Y_authors is None) or (yr_Y_papers is None):
        yr_Y_authors, yr_Y_papers = self.get_yearwise_authors_by_keywords(
            Y_terms, return_papers=True, case_sensitives=case_sensitives)

    # analyze years from 2001 to 2018 (note that yrs[-1]=2019)
    disc_dict = {}
    for yr in np.arange(start_yr, yrs[-1]):
        yr_loc = np.where(yrs == yr)[0][0]
        thisyr_Y_papers = yr_Y_papers[yr]

        disc_dict[yr] = {}
        new_discs = find_first_time_cocrs(cocrs, yr_loc)
        logger.info('PROGRESS FOR {}: {} new discoveries found'.format(
            yr, len(new_discs)))

        for i, chm in enumerate(full_chems[new_discs]):
            yr_X_authors, yr_X_papers = self.get_yearwise_authors_by_keywords(
                [chm], chemical=True, return_papers=True)
            thisyr_X_papers = yr_X_papers[yr]

            # papers with co-occurrences
            ov_papers = list(set(thisyr_Y_papers).intersection(set(thisyr_X_papers)))
            disc_dict[yr][chm] = {pid: {} for pid in ov_papers}
            for pid in ov_papers:
                # authors of papers with co-occurrences
                A = self.get_authors_by_paper_id([pid], ['author_id'])
                if len(A) > 0:
                    A = A['author_id']
                disc_dict[yr][chm][pid] = {a: [{}, {}] for a in A}

                for auth in A:
                    """ for the property """
                    # years in which the current author has published a paper on Y, so that ..
                    a_pubY_yrs = [y for y in yr_Y_authors
                                  if auth in yr_Y_authors[y] and y < yr]
                    if len(a_pubY_yrs) > 0:
                        # .. we can restrict the query for his/her papers to those years
                        array_yrs = '({})'.format(','.join([str(y) for y in a_pubY_yrs]))
                        scomm = 'SELECT P.paper_id, YEAR(P.date) FROM paper P \
                                 INNER JOIN paper_author_mapping P2A ON P.paper_id=P2A.paper_id \
                                 WHERE P2A.author_id={} AND (YEAR(P.date) IN {})'.format(auth, array_yrs)
                        # Pa and Ya are the papers and the years of those papers
                        (_, Pa), (_, Ya) = self.execute_and_get_results(
                            scomm, ['paper_id', 'year']).items()
                        uYa = np.unique(Ya)
                        disc_dict[yr][chm][pid][auth][0] = {
                            y: [Pa[i] for i in range(len(Pa))
                                if Ya[i] == y if Pa[i] in yr_Y_papers[y]]
                            for y in uYa
                        }

                    """ for the molecule """
                    a_pubX_yrs = [x for x in yr_X_authors
                                  if auth in yr_X_authors[x] and x < yr]
                    if len(a_pubX_yrs) > 0:
                        array_yrs = '({})'.format(','.join([str(x) for x in a_pubX_yrs]))
                        scomm = 'SELECT P.paper_id, YEAR(P.date) FROM paper P \
                                 INNER JOIN paper_author_mapping P2A ON P.paper_id=P2A.paper_id \
                                 WHERE P2A.author_id={} AND (YEAR(P.date) IN {})'.format(auth, array_yrs)
                        (_, Pa), (_, Ya) = self.execute_and_get_results(
                            scomm, ['paper_id', 'year']).items()
                        uYa = np.unique(Ya)
                        disc_dict[yr][chm][pid][auth][1] = {
                            x: [Pa[i] for i in range(len(Pa))
                                if Ya[i] == x if Pa[i] in yr_X_papers[x]]
                            for x in uYa
                        }

            if i > 0 and not (i % 100):
                logger.info('\t{} materials have been analyzed'.format(i))

        if savefile_path is not None:
            with open(savefile_path, 'wb') as f:
                pickle.dump(disc_dict, f)
            logger.info('The results have been saved in {}'.format(savefile_path))

    return disc_dict
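# find_first_time_cocrs is used above but not defined in this section. Below is
# a plausible sketch of what it could compute (an assumption, not necessarily
# the actual implementation): the rows of the co-occurrence matrix whose first
# nonzero column is exactly the given year index, i.e. chemicals co-occurring
# with the property for the first time in that year.
def _example_find_first_time_cocrs(cocrs, yr_loc):
    has_cocr_now = cocrs[:, yr_loc] > 0
    no_cocr_before = np.sum(cocrs[:, :yr_loc], axis=1) == 0
    return np.where(has_cocr_now & no_cocr_before)[0]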
def complete_affiliations(paper_ids, sql_db, sql_cursor, logfile_path=None):

    logger = helpers.set_up_logger(__name__, logfile_path, False, file_mode='a')

    # initialize the affiliation primary key
    sql_cursor.execute('SELECT aff_id FROM affiliation;')
    all_aff_PKs = sql_cursor.fetchall()
    if len(all_aff_PKs) == 0:
        aff_PK = 0
    else:
        aff_PK = max([a[0] for a in all_aff_PKs]) + 1

    sql_cursor.execute('SELECT aff_scopus_ID FROM affiliation;')
    curr_aff_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]

    sql_cursor.execute('SELECT * FROM author_affiliation_mapping;')
    curr_author_aff_pairs = list(sql_cursor.fetchall())

    pids_array = ','.join([str(p) for p in paper_ids])
    sql_cursor.execute('SELECT doi, paper_id FROM paper WHERE paper_id IN ({});'.format(pids_array))
    RES = sql_cursor.fetchall()
    dois = [a[0] for a in RES]
    paper_ids = [a[1] for a in RES]

    dois_with_nonexisting_authors = []

    for j, doi in enumerate(dois):
        try:
            r = AbstractRetrieval(doi)
        except Scopus429Error:
            print('Scopus resource exhausted. Check your quota.')
            return
        except:
            raise ValueError('Could not download doi {}'.format(doi))

        if r.authors is None:
            continue

        paper_scopus_id_list = [a.auid for a in r.authors]
        for i, scps_id in enumerate(paper_scopus_id_list):
            # if repetitive author, ignore:
            if scps_id in paper_scopus_id_list[:i]:
                continue

            sql_cursor.execute('SELECT author_id \
                                FROM author \
                                WHERE author_scopus_ID = {}'.format(scps_id))
            this_author_PK = sql_cursor.fetchall()
            if len(this_author_PK) == 0:
                if doi not in dois_with_nonexisting_authors:
                    dois_with_nonexisting_authors += [doi]
                logger.info('(CASE NUMBER {}) PAPER_ID {}, DOI {}: author with scopus ID {} does not exist.'.format(
                    306 + len(dois_with_nonexisting_authors), paper_ids[j], doi, scps_id))
                continue
            else:
                this_author_PK = this_author_PK[0][0]

            # directly go to their affiliations
            if r.authors[i].affiliation is not None:
                author_aff_scopus_id_list = np.unique(r.authors[i].affiliation)
            else:
                author_aff_scopus_id_list = []

            for aff_scps_id in author_aff_scopus_id_list:
                if aff_scps_id in curr_aff_scopus_id_list:
                    sql_cursor.execute('SELECT aff_id \
                                        FROM affiliation \
                                        WHERE aff_scopus_ID = {}'.format(aff_scps_id))
                    this_aff_PK = sql_cursor.fetchall()[0][0]

                    # add the pair only if the author/affiliation pair has not
                    # already been added to the mapping table
                    if (this_author_PK, this_aff_PK) not in curr_author_aff_pairs:
                        sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                            VALUES({}, {})'.format(this_author_PK, this_aff_PK))
                        curr_author_aff_pairs += [(this_author_PK, this_aff_PK)]
                        logger.info('{} have been added to A2A.'.format(
                            (r.authors[i].given_name, r.authors[i].surname, this_aff_PK)))
                else:
                    lcn = np.where([x.id == aff_scps_id for x in r.affiliation])[0]
                    if len(lcn) > 0:
                        lcn = lcn[0]
                        aff_name = r.affiliation[lcn].name.replace('"', '\\"')
                        aff_city = r.affiliation[lcn].city
                        aff_country = r.affiliation[lcn].country
                    else:
                        aff_name = 'NA'
                        aff_city = 'NA'
                        aff_country = 'NA'

                    sql_cursor.execute('INSERT INTO affiliation \
                                        VALUES({},"{}","{}","{}","{}");'.format(
                        aff_PK, aff_scps_id, aff_name, aff_city, aff_country))
                    sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                        VALUES({}, {})'.format(this_author_PK, aff_PK))
                    curr_author_aff_pairs += [(this_author_PK, aff_PK)]
                    logger.info('{} have been added to A2A.'.format(
                        (r.authors[i].given_name, r.authors[i].surname, aff_PK)))

                    # update the affiliations list
                    curr_aff_scopus_id_list += [aff_scps_id]
                    aff_PK += 1

        if not(j % 1000):
            np.savetxt('/home/jamshid/codes/data/iter_inds.txt', [j])
            sql_db.commit()
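# A minimal sketch of the insert-if-new pattern used above, written with
# parameterized queries on an in-memory SQLite database as a stand-in for the
# project's MySQL connection. The table name mirrors the one above; the data
# and connection are illustrative only.
def _example_author_affiliation_insert():
    import sqlite3

    conn = sqlite3.connect(':memory:')
    crsr = conn.cursor()
    crsr.execute('CREATE TABLE author_affiliation_mapping (author_id INT, aff_id INT);')

    seen_pairs = set()
    for author_pk, aff_pk in [(1, 7), (1, 7), (2, 7)]:   # toy (author, affiliation) pairs
        if (author_pk, aff_pk) in seen_pairs:
            continue   # skip pairs already written, as in the loop above
        crsr.execute('INSERT INTO author_affiliation_mapping VALUES (?, ?);',
                     (author_pk, aff_pk))
        seen_pairs.add((author_pk, aff_pk))

    conn.commit()
    crsr.execute('SELECT COUNT(*) FROM author_affiliation_mapping;')
    return crsr.fetchone()[0]   # 2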