Example #1

# assumed imports for these snippets (the later examples reuse the same
# project context); `msdb` and `helpers` are project-local modules
import os
import numpy as np
def yearwise_SD(Y_terms, chems, **kwargs):
    """Return overall and year-wise Social Density (SD) values for a set
    of chemical compounds and a set of properties (Y-terms)

    Jaccard SD(X,Y) = |A(X) ∩ A(Y)| / |A(X) ∪ A(Y)|,
    where A(.) is the set of authors publishing on the given term
    """

    # setting up the logger
    logger_disable = kwargs.get('logger_disable', False)
    logfile_path = kwargs.get('logfile_path', None)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)

    msdb.crsr.execute('SELECT COUNT(*) FROM paper;')
    logger.info('Total number of documents in the DB: {}'.format(
        msdb.crsr.fetchall()[0][0]))


    # getting unique authors of Y-terms in different years
    case_sensitives = kwargs.get('case_sensitives',[])
    logger.info('Downloading authors for terms {} in their abstracts'.format(Y_terms))
    R = msdb.get_authors_by_keywords(Y_terms,
                                     cols=['author_id','P.date'],
                                     return_papers=False,
                                     case_sensitives=case_sensitives)
    if len(R)==0:
        raise ValueError('Given property terms are not associated with any papers in the database')
    Y_years = np.array([y.year for y in R['date']])
    Y_authors = {y: R['author_id'][Y_years==y] for y in np.unique(Y_years)}
    unique_Y_authors = np.unique(R['author_id'])
    min_yr = np.min(Y_years)
    max_yr = np.max(Y_years)
    logger.info('Downloading is done. The oldest paper was published in {}.'.format(min_yr))
    logger.info('The total number of unique authors is {}.'.format(len(unique_Y_authors)))

    # iterate over chemicals and compute the SD for each
    yr_SDs = np.zeros((len(chems), max_yr-min_yr+1))
    years = np.arange(min_yr, max_yr+1)
    save_dirname = kwargs.get('save_dirname', None)
    logger.info('Iterating over chemicals for computing social densities began.')
    for i, chm in enumerate(chems):
        if not(i%1000) or (i==len(chems)-1):
            logger.info('Iteration {}..'.format(i))
            if save_dirname is not None:
                np.savetxt(os.path.join(save_dirname, 'yr_SDs.txt'), yr_SDs)

        # getting unique authors of this material in different years
        R = msdb.get_authors_by_chemicals([chm],
                                          cols=['author_id','P.date'],
                                          years=np.unique(Y_years),
                                          return_papers=False)
        if len(R)==0: continue
        X_years = np.array([y.year for y in R[chm]['date']])
        X_authors = {y: R[chm]['author_id'][X_years==y] for y in np.unique(X_years)}
        overlap_dict, union_dict = yearwise_authors_set_op(X_authors, Y_authors)
        for yr in Y_authors:
            yr_SDs[i,yr-min_yr] = len(overlap_dict[yr])/len(union_dict[yr])
        
    return yr_SDs, years
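
A minimal, self-contained sketch of the per-year Jaccard-style SD computed
above, on toy author-ID sets (all values are hypothetical):

X_authors = {2010: {1, 2, 3}, 2011: {2, 4}}   # authors publishing on chemical X
Y_authors = {2010: {2, 3, 5}, 2011: {6}}      # authors publishing on the Y-terms
for yr in Y_authors:
    X = X_authors.get(yr, set())
    overlap, union = X & Y_authors[yr], X | Y_authors[yr]
    print(yr, len(overlap) / len(union))      # 2010 -> 0.5, 2011 -> 0.0

Example #2

# assumed import in addition to the earlier ones
from sklearn.metrics import roc_auc_score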
def eval_predictor(predictor_func,
                   gt_func,
                   year_of_pred,
                   **kwargs):
    """Evaluating a given predictor function in how accurate its predictions
    match the actual discoveries returned by a given ground-truth function

    The evaluations are done for individual years strating from a given year 
    of prediction to 2018.
    """

    metric = kwargs.get('metric', 'cumul_precision')
    last_year = kwargs.get('last_year', 2019)
    save_path = kwargs.get('save_path', None)
    return_preds = kwargs.get('return_preds', False)
    logfile_path = kwargs.get('logfile_path', None)
    logger_disable = kwargs.get('logger_disable',False)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)
    

    """ Generating the Prediction """
    preds = predictor_func(year_of_pred)
    logger.info('Number of actual predictions: {}'.format(len(preds)))
    if metric=='auc':
        if len(preds)!=2:
            raise ValueError('When asking for AUC metric, predictor should return score array too.')
        scores = preds[1]
        preds = preds[0]

    if save_path is not None:
        with open(save_path, 'w') as f:
            f.write('\n'.join(preds)+'\n')
    
    """ Evaluating the Predictions for the Upcoming Years """
    years_of_eval = np.arange(year_of_pred, last_year)
    iter_list = []  # will hold the precision values, or the actual discoveries (for AUC)
    for i, yr in enumerate(years_of_eval):
        gt = gt_func(yr)

        if metric=='cumul_precision':      # Cumulative Precision
            iter_list += [np.sum(np.isin(gt, preds)) / len(preds)]
        elif metric=='auc':    # Area Under Curve
            iter_list += gt.tolist()

    if metric == 'cumul_precision':
        res = np.cumsum(iter_list)
    elif metric == 'auc':
        y = np.zeros(len(preds))
        y[np.isin(preds,iter_list)] = 1
        res = roc_auc_score(y, scores)

    if return_preds:
        return res, preds
    else:
        return res
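
A minimal usage sketch with toy predictor and ground-truth functions (names
and values are hypothetical; the project-local `helpers` module must still be
importable for the logger):

def toy_predictor(year_of_pred):
    # entities predicted at `year_of_pred` to be discovered afterwards
    return np.array(['A', 'B', 'C'])

def toy_gt(year):
    # actual discoveries made in `year`
    return np.array(['B']) if year == 2015 else np.array(['Z'])

res = eval_predictor(toy_predictor, toy_gt, 2015, logger_disable=True)
# res == array([1/3, 1/3, 1/3, 1/3]): 'B' is the only hit over 2015-2018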
Example #3
    def __init__(self, path_to_data, **kwargs):

        self.path_to_data = path_to_data
        self.pars = {}
        for key, def_val in DEFAULT_PARS.items():
            self.pars[key] = kwargs.get(key, def_val)

        # setting up the logger
        logger_disable = kwargs.get('logger_disable', False)
        self.logfile_path = kwargs.get('logfile_path', None)
        self.logger = helpers.set_up_logger(__name__, self.logfile_path, logger_disable)
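
A minimal sketch of the defaults-override pattern used above (the class name,
DEFAULT_PARS contents, and values are hypothetical):

DEFAULT_PARS = {'batch_size': 500, 'seed': 0}

class Dataset:
    def __init__(self, path_to_data, **kwargs):
        self.path_to_data = path_to_data
        # each keyword argument overrides its default; unlisted keys keep defaults
        self.pars = {k: kwargs.get(k, v) for k, v in DEFAULT_PARS.items()}

ds = Dataset('/tmp/data', seed=42)
print(ds.pars)   # {'batch_size': 500, 'seed': 42}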
Example #4
    def __init__(self, path_to_sents, **kwargs):

        with open(path_to_sents, 'r') as f:
            self.sents = f.read().splitlines()
        
        # for now we are skipping the preprocessing steps as we
        # will be using this for deepwalk sentences that do not need
        # preprocessing
        # ... PREPROCESSING GOES HERE
        self.to_be_removed = []

        # setting up the logger
        logger_disable = kwargs.get('silent', False)
        self.logger = helpers.set_up_logger(__name__, None, logger_disable)
Example #5
def cooccurrences(Y_terms, ents, **kwargs):
    """Get yearwise co-occurrences of a given list of entities and
    a set of keywords (Y-terms) in abstracts of the database
    """

    msdb.crsr.execute('SELECT COUNT(*) FROM chemical_paper_mapping;')
    cnt = msdb.crsr.fetchall()[0][0]
    print('Number of rows in chemical-paper-mapping: {}'.format(cnt))
    
    # setting up the logger
    logger_disable = kwargs.get('logger_disable', False)
    logfile_path =   kwargs.get('logfile_path', None)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)

    # downloading papers with Y-terms (Y-papers) and categorizing them yearwise
    logger.info('Downloading papers with terms {} in their abstracts'.format(Y_terms))
    case_sensitives = kwargs.get('case_sensitives', [])
    (_,Y_papers), (_,Y_dates) = msdb.get_papers_by_keywords(Y_terms,
                                                            cols=['paper_id','date'],
                                                            logical_comb='OR',
                                                            case_sensitives=case_sensitives).items()
    Y_years = np.array([y.year for y in Y_dates])
    Y_distinct_yrs = np.unique(Y_years)
    min_yr = np.min(Y_years)
    max_yr = np.max(Y_years)
    yrs = np.arange(min_yr, max_yr+1)

    logger.info('{} papers with Y-terms have been downloaded. '
                'The earliest one was published in {}'.format(len(Y_papers), min_yr))
    cocrs = np.zeros((len(ents), len(yrs)))
    ents = np.array(ents)
    for i,yr in enumerate(Y_years):
        yr_loc = yr - min_yr

        # add co-occurrences to all chemicals present in this paper
        # all chemicals in this paper
        present_ents = msdb.get_chemicals_by_paper_ids(int(Y_papers[i]), cols=['formula'])
        present_ents_formula = present_ents[int(Y_papers[i])]['formula'] if len(present_ents)>0 else []
        present_ents_formula = list(set(present_ents_formula).intersection(set(ents)))
        present_ents_locs = [np.where(ents==frml)[0][0] for frml in present_ents_formula]
        
        for cloc in present_ents_locs:
            cocrs[cloc, yr_loc] += 1
            
        if not(i%1000):
            logger.info('{} papers have been reviewed.'.format(i))

    return cocrs, yrs
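
A minimal, self-contained sketch of how the returned co-occurrence matrix can
be read (toy values; finding first co-occurrence years this way mirrors what
`find_first_time_cocrs` does in a later example):

import numpy as np

yrs = np.arange(2000, 2004)
cocrs = np.array([[0, 0, 2, 1],    # entity 0 first co-occurs with Y in 2002
                  [1, 0, 0, 3]])   # entity 1 first co-occurs with Y in 2000
first_yr = yrs[(np.cumsum(cocrs, axis=1) > 0).argmax(axis=1)]
print(first_yr)                    # [2002 2000]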
Example #6

# assumed additional import for the sparse vertex matrix below
from scipy import sparse
def compute_vertex_matrix(db, **kwargs):
    """Form the vertex matrix of the hypergraph, a |E|x|V| matrix
    whose (i,j) element equals 1 if hyperedge (article) i contains
    node j, and zero otherwise

    The hyperedges are the articles and the nodes are the union of author
    and chemical nodes
    """

    # setting up the logger
    logger_disable = kwargs.get('logger_disable', False)
    logfile_path = kwargs.get('logfile_path', None)
    logger = helpers.set_up_logger(__name__, logfile_path, logger_disable)

    savefile_path = kwargs.get('savefile_path', None)

    nP = db.count_table_rows('paper')
    Pids = db.get_1d_query('SELECT id FROM paper;')
    nA = db.count_table_rows('author')
    Aids = db.get_1d_query('SELECT id FROM author;')
    nE = db.count_table_rows(db.entity_tab)
    Eids = db.get_1d_query('SELECT id FROM {};'.format(db.entity_tab))
    logger.info('#papers={}, #author={}, #entities={}'.format(nP, nA, nE))

    VM = sparse.lil_matrix((nP, nA + nE), dtype=np.uint8)
    # filling the matrix with batches
    cnt = 0
    batch_size = 500
    logger.info(
        'Starting to fill the vertex matrix with batch size {}'.format(
            batch_size))
    while cnt < nP:
        inds = np.arange(cnt, min(cnt + batch_size, nP))
        batch_Pids = Pids[inds]
        q_Aids = db.get_LoA_by_PID(batch_Pids)
        q_Eids = db.get_LoE_by_PID(batch_Pids)

        cols = []
        rows = []
        for i, pid in enumerate(batch_Pids):
            # each PID has a number of authors and entities;
            # locate them in the global arrays of author and entity IDs;
            # these locations are their columns in the vertex matrix
            au_cols = np.where(np.isin(
                Aids, q_Aids[pid]['id']))[0] if pid in q_Aids else []
            ent_cols = np.where(np.isin(
                Eids, q_Eids[pid]['id']))[0] + nA if pid in q_Eids else []

            cols += [np.concatenate((au_cols, ent_cols))]
            rows += [inds[i] * np.ones(len(au_cols) + len(ent_cols))]

        # cast to int: sparse fancy indexing requires integer index arrays
        cols = np.concatenate(cols).astype(int)
        rows = np.concatenate(rows).astype(int)
        VM[rows, cols] = 1

        cnt += batch_size

        if not (cnt % 100000):
            logger.info('{} articles have been processed'.format(cnt))
            if savefile_path is not None:
                sparse.save_npz(savefile_path, VM.tocsc())

    return VM
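
A minimal sketch of what the vertex matrix enables; the node-node projection
below is a standard hypergraph operation, not code from the source repository:

import numpy as np
from scipy import sparse

VM = sparse.lil_matrix((3, 4), dtype=np.uint8)   # 3 articles, 4 nodes
VM[0, [0, 2]] = 1                                # article 0 links nodes 0 and 2
VM[1, [0, 1]] = 1
VM[2, [2, 3]] = 1

# (i,j) = number of articles shared by nodes i and j
co = (VM.T @ VM).toarray()
print(np.diag(co))                               # node degrees: [2 1 2 1]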
Example #7

# assumed additional import; `set_up_logger` and `find_first_time_cocrs` are
# project-local helpers of the source repository
import pickle
    def collect_authors_new_discoveries(self, full_chems, cocrs, Y_terms, yrs,
                                        **kwargs):
        """Collecting authors of papers with new co-occurrences (new discoveries)
        and extracting their previous papers on the topic of the property and/or
        the newly studied molecule
        """

        case_sensitives = kwargs.get('case_sensitives', [])
        logfile_path = kwargs.get('logfile_path', None)
        savefile_path = kwargs.get('savefile_path', None)
        start_yr = kwargs.get('start_yr', 2001)
        yr_Y_authors = kwargs.get('yr_Y_authors', None)
        yr_Y_papers = kwargs.get('yr_Y_papers', None)

        logger = set_up_logger(__name__, logfile_path, False)

        if (yr_Y_authors is None) or (yr_Y_papers is None):
            yr_Y_authors, yr_Y_papers = self.get_yearwise_authors_by_keywords(
                Y_terms, return_papers=True, case_sensitives=case_sensitives)

        # analyze years from `start_yr` (2001 by default) up to, but not
        # including, yrs[-1] (=2019)
        disc_dict = {}
        for yr in np.arange(start_yr, yrs[-1]):
            yr_loc = np.where(yrs == yr)[0][0]
            thisyr_Y_papers = yr_Y_papers[yr]

            disc_dict[yr] = {}
            new_discs = find_first_time_cocrs(cocrs, yr_loc)
            logger.info('PROGRESS FOR {}: {} new discoveries found'.format(
                yr, len(new_discs)))
            for i, chm in enumerate(full_chems[new_discs]):
                yr_X_authors, yr_X_papers = self.get_yearwise_authors_by_keywords(
                    [chm], chemical=True, return_papers=True)
                thisyr_X_papers = yr_X_papers[yr]

                # papers with co-occurrences
                ov_papers = list(
                    set(thisyr_Y_papers).intersection(set(thisyr_X_papers)))
                disc_dict[yr][chm] = {pid: {} for pid in ov_papers}
                for pid in ov_papers:
                    # authors of papers with co-occurrences
                    A = self.get_authors_by_paper_id([pid], ['author_id'])
                    if len(A) > 0: A = A['author_id']
                    disc_dict[yr][chm][pid] = {a: [{}, {}] for a in A}

                    for auth in A:
                        """ for the property """
                        # years in which the current author published a paper on Y, so that ..
                        a_pubY_yrs = [
                            y for y in yr_Y_authors
                            if auth in yr_Y_authors[y] and y < yr
                        ]
                        if len(a_pubY_yrs) > 0:
                            # .. only those years need to be queried for their papers
                            array_yrs = '({})'.format(','.join(
                                [str(y) for y in a_pubY_yrs]))
                            scomm = 'SELECT P.paper_id, YEAR(P.date) FROM paper P \
                                     INNER JOIN paper_author_mapping P2A ON P.paper_id=P2A.paper_id \
                                     WHERE P2A.author_id={} AND (YEAR(P.date) IN {})'.format(
                                auth, array_yrs)
                            # Pa and Ya are the papers and years of those papers
                            (_, Pa), (_, Ya) = self.execute_and_get_results(
                                scomm, ['paper_id', 'year']).items()
                            uYa = np.unique(Ya)
                            disc_dict[yr][chm][pid][auth][0] = {
                                y: [
                                    Pa[i] for i in range(len(Pa))
                                    if Ya[i] == y and Pa[i] in yr_Y_papers[y]
                                ]
                                for y in uYa
                            }
                        """ for the molecule """
                        a_pubX_yrs = [
                            x for x in yr_X_authors
                            if auth in yr_X_authors[x] and x < yr
                        ]
                        if len(a_pubX_yrs) > 0:
                            array_yrs = '({})'.format(','.join(
                                [str(x) for x in a_pubX_yrs]))
                            scomm = 'SELECT P.paper_id, YEAR(P.date) FROM paper P \
                                     INNER JOIN paper_author_mapping P2A ON P.paper_id=P2A.paper_id \
                                     WHERE P2A.author_id={} AND (YEAR(P.date) IN {})'.format(
                                auth, array_yrs)
                            (_, Pa), (_, Ya) = self.execute_and_get_results(
                                scomm, ['paper_id', 'year']).items()
                            uYa = np.unique(Ya)
                            disc_dict[yr][chm][pid][auth][1] = {
                                y: [
                                    Pa[i] for i in range(len(Pa))
                                    if Ya[i] == y and Pa[i] in yr_X_papers[y]
                                ]
                                for y in uYa
                            }

                if i > 0 and not (i % 100):
                    logger.info('\t{} materials have been analyzed'.format(i))

            if savefile_path is not None:
                with open(savefile_path, 'wb') as f:
                    pickle.dump(disc_dict, f)
                logger.info(
                    'The results have been saved in {}'.format(savefile_path))

        return disc_dict
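
A minimal sketch of how the returned nested dictionary can be traversed (the
key names are hypothetical; the leaf value per author is the pair of
year-keyed paper dictionaries filled above):

# disc_dict[year][chemical][paper_id][author_id] == [prior_Y_papers, prior_X_papers]
for yr, chems in disc_dict.items():
    for chm, papers in chems.items():
        for pid, authors in papers.items():
            for auth, (prevY, prevX) in authors.items():
                print(yr, chm, pid, auth, len(prevY), len(prevX))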
Example #8

# assumed imports; pybliometrics is presumed to provide the Scopus client
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus.exception import Scopus429Error
def complete_affiliations(paper_ids, sql_db, sql_cursor, logfile_path=None):

    logger = helpers.set_up_logger(__name__, logfile_path, False, file_mode='a')
    
    # initialize the affiliation primary key
    sql_cursor.execute('SELECT aff_id FROM affiliation;')
    all_aff_PKs = sql_cursor.fetchall()
    if len(all_aff_PKs)==0:
        aff_PK = 0
    else:
        aff_PK = max([a[0] for a in all_aff_PKs]) + 1
        
    sql_cursor.execute('SELECT aff_scopus_ID FROM affiliation;')
    curr_aff_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]
    sql_cursor.execute('SELECT * FROM author_affiliation_mapping;')
    curr_author_aff_pairs = list(sql_cursor.fetchall())

    pids_array = '({})'.format(','.join([str(p) for p in paper_ids]))
    sql_cursor.execute('SELECT doi, paper_id FROM paper WHERE paper_id IN {};'.format(pids_array))
    RES = sql_cursor.fetchall()
    dois = [a[0] for a in RES]
    paper_ids = [a[1] for a in RES]

    dois_with_nonexisting_authors = []
    for j,doi in enumerate(dois):
        
        try:
            r = AbstractRetrieval(doi)
        except Scopus429Error:
            print('Scopus resource exhausted. Check your quota.')
            return
        except Exception as e:
            raise ValueError('Could not download doi {}'.format(doi)) from e
        
        if r.authors is None:
            continue
        
        paper_scopus_id_list = [a.auid for a in r.authors]
        for i,scps_id in enumerate(paper_scopus_id_list):
            # if repetitive author, ignore:
            if scps_id in paper_scopus_id_list[:i]:
                continue

            sql_cursor.execute('SELECT author_id \
                                FROM author \
                                WHERE author_scopus_ID = {}'.format(scps_id))
            
            this_author_PK = sql_cursor.fetchall()
            if len(this_author_PK)==0:
                if doi not in dois_with_nonexisting_authors:
                    dois_with_nonexisting_authors += [doi]
                logger.info('(CASE NUMBER {}) PAPER_ID {}, DOI {}: author with scopus ID {} does not exist.'.format(306+len(dois_with_nonexisting_authors), paper_ids[j], doi, scps_id))
                continue
            else:
                this_author_PK = this_author_PK[0][0]
            
            # directly go to their affiliations
            if r.authors[i].affiliation is not None:
                author_aff_scopus_id_list = np.unique(r.authors[i].affiliation)
            else:
                author_aff_scopus_id_list = []
                
            for aff_scps_id in author_aff_scopus_id_list:
                if aff_scps_id in curr_aff_scopus_id_list:
                    sql_cursor.execute('SELECT aff_id \
                    FROM affiliation \
                    WHERE aff_scopus_ID = {}'.format(aff_scps_id))
                    this_aff_PK = sql_cursor.fetchall()[0][0]

                    # add the pair only if the author/aff. have not already
                    # been added to the mapping table
                    if (this_author_PK, this_aff_PK) not in curr_author_aff_pairs:
                        sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                            VALUES({}, {})'.format(this_author_PK,
                                                                   this_aff_PK))
                        curr_author_aff_pairs += [(this_author_PK, this_aff_PK)]
                        logger.info('{} have been added to A2A.'.format((r.authors[i].given_name,
                                                                         r.authors[i].surname,
                                                                         this_aff_PK)))
                else:
                    lcn = np.where([x.id==aff_scps_id for x in r.affiliation])[0]
                    if len(lcn)>0:
                        lcn = lcn[0]
                        aff_name = r.affiliation[lcn].name.replace('"','\\"')
                        aff_city = r.affiliation[lcn].city
                        aff_country = r.affiliation[lcn].country
                    else:
                        aff_name = 'NA'
                        aff_city = 'NA'
                        aff_country = 'NA'

                    sql_cursor.execute('INSERT INTO affiliation \
                                        VALUES({},"{}","{}","{}","{}");'.format(
                                            aff_PK,
                                            aff_scps_id,
                                            aff_name,
                                            aff_city,
                                            aff_country)
                    )
                    sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                        VALUES({}, {})'.format(this_author_PK, aff_PK))
                    curr_author_aff_pairs += [(this_author_PK, aff_PK)]
                    logger.info('{} have been added to A2A.'.format((r.authors[i].given_name,
                                                                     r.authors[i].surname,
                                                                     aff_PK)))

                    # update the affiliations list
                    curr_aff_scopus_id_list += [aff_scps_id]
                    aff_PK += 1

        if not(j%1000):
            np.savetxt('/home/jamshid/codes/data/iter_inds.txt', [j])
        sql_db.commit()
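
A minimal usage sketch (connection parameters are hypothetical, and the
Scopus API key must already be configured for pybliometrics):

import mysql.connector

sql_db = mysql.connector.connect(user='user', password='pass', database='msdb')
sql_cursor = sql_db.cursor()
complete_affiliations([1, 2, 3], sql_db, sql_cursor, logfile_path='aff.log')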