def make_homolog_links(db):
    """Add gene_abstract links for homologs (using homologene_gene table).

    First copies every existing `gene_abstract` row into
    `homologene_gene_abstract`, then adds one link per homolog: for each gene
    pair sharing a `homologene_id`, every abstract linked to one gene is also
    linked to the other.  Raises on any database error.
    """

    logger.debug('inserting records from `gene_abstract` into `homologene_gene_abstract` table')

    c = getcursor(db)

    try:
        c.execute("""
        insert ignore into `homologene_gene_abstract`
        select * from `gene_abstract`
        """)
    except Exception as e:
        logger.critical('Could not insert records from `gene_abstract` into `homologene_gene_abstract` table.  Error message: %s', e)
        raise

    logger.debug('inserting homolog links from `gene_abstract` and `homologene_gene` into `homologene_gene_abstract`')
    try:
        c.execute("""
        insert ignore into `homologene_gene_abstract`
        select null `id`, h1.`gene` `gene`, ga.`abstract` `abstract`
        from `homologene_gene` h1
        inner join `homologene_gene` h2 on h1.`homologene_id` = h2.`homologene_id`
        inner join `gene_abstract` ga on h2.`gene` = ga.`gene`
        """)
    except Exception as e:
        # bug fix: the format string previously contained a literal 'e'
        # instead of the '%s' placeholder, so the exception text was never
        # interpolated into the log message.
        logger.critical('could not insert homolog links from `gene_abstract` and `homologene_gene` into `homologene_gene_abstract`.  Error message: %s', e)
        raise

    c.close()
    logger.info('inserted records from `gene_abstract` into `homologene_gene_abstract` table')
def ids(db, size = abstract_fetch_batchsize):
    """Find all pubmed ID's of abstracts with `updated` set to null.  Return an
    iterator over lists of abstract PMID's with the given size (for fetching
    multiple abstracts at once.)

    Raises on database errors; the cursor is closed before yielding begins,
    since all rows are fetched up front.
    """

    # bug fix: "Quering" -> "Querying" in the log message
    logger.debug('Querying database for a list of un-fetched abstracts')

    c = getcursor(db)

    try:
        c.execute("""
            select `pubmed_id`
            from `abstract`
            where `updated` is null;
        """)
    except Exception as e:
        logger.critical('Error: could not get a list of un-fetched abstracts from the database.  Error message: %s', e)
        raise

    logger.info('Got list of un-fetched abstracts')

    records = c.fetchall()
    c.close()

    # yield PMIDs in batches of `size`; the last batch may be shorter
    for i in xrange(0, len(records), size):
        yield [r[0] for r in records[i:i+size]]
def count_genes(db):
    """Recompute per-gene abstract counts.

    Fills `gene`.`abstracts` from `gene_abstract` and
    `gene`.`homolog_abstracts` from `homologene_gene_abstract`, counting
    distinct abstracts per gene.  Raises on database errors.
    """

    logger.debug('updating the `gene` table with new abstract counts')

    cur = getcursor(db)

    query = """
        update `gene`
        set `gene`.`abstracts` = (
            select count(distinct `abstract`) 
            from `gene_abstract` 
            where `gene_abstract`.`gene` = `gene`.`entrez_id`),
        `gene`.`homolog_abstracts` = (
            select count(distinct `abstract`) 
            from `homologene_gene_abstract` 
            where `homologene_gene_abstract`.`gene` = `gene`.`entrez_id`);
        """

    try:
        cur.execute(query)
    except Exception as err:
        logger.critical('Error updating the `gene` table with new abstract counts.  Error message: %s', err)
        raise

    cur.close()
    logger.info('updated the `gene` table with new abstract counts')
def check_abstract_counts(db):
    """Compare the abstract count in the whoosh index with the row count of
    the `abstract` table.  Returns True when they match, False otherwise
    (a mismatch indicates an indexing inconsistency)."""

    # count documents currently in the index
    ix = open_index(ABSTRACT_INDEX_PATH)
    indexed = ix.doc_count()
    ix.close()

    # count rows in the abstract table
    cur = getcursor(db)
    cur.execute("""
    select count(*) from `abstract`
    """)
    stored, = cur.fetchone()
    cur.close()

    if indexed != stored:
        logger.warning('database abstract count does not match index abstract count.  database count: %s, index count: %s.  There may have been an error committing the index, or an abstract was removed from the abstract table.  If you can\'t figure out what caused the inconsistency, one way to fix it is to re-build the index from scratch (delete the contents of the index folder, set the `indexed` column in the `abstract` table to null for all rows, and run the updater script.  It will probably take a couple hours.', stored, indexed)
        return False

    logger.info('database abstract count matches index abstract count (this is good.)  abstract count: %s', indexed)
    return True
def update(db, metadata):
    """Write fetched article metadata into the `abstract` table.

    `metadata` is an iterable of dicts, one per article; each row is updated
    by PMID and its `updated` timestamp set to now().  Errors on individual
    rows are logged and skipped, not raised.
    """

    cur = getcursor(db)

    query = """
                update `abstract`
                set title := %s,
                authors := %s,
                abstract := %s,
                pubdate := %s,
                journal := %s,
                volume := %s,
                issue := %s,
                pages := %s,
                review := %s,
                updated := now()
                where pubmed_id = %s;
            """

    for record in metadata:
        # assemble the publication date as YYYY-MM-DD from the parts
        pubdate = '-'.join((record['year'], record['month'], record['day']))
        params = (record['title'], record['authors'], record['abstract'], pubdate,
                  record['journal'], record['volume'], record['issue'],
                  record['pages'], record['review'], record['id'])
        try:
            cur.execute(query, params)
        except Exception as err:
            logger.error('Error updating `abstract` table for PMID %s.  Error message: %s', record['id'], err)

    cur.close()
def cleanup(db):
    """Remove gene-abstract links with gene or abstract IDs set to 0.

    Errors are logged, not raised.
    """

    logger.debug('Removing gene_abstracts with gene or abstract ID set to 0')

    cur = config.getcursor(db)

    query = """
        delete from `gene_abstract`
        where `abstract` = 0
        or `gene` = 0;
        """

    try:
        cur.execute(query)
    except Exception as err:
        logger.error('Error removing bad gene_abstract links, error message: %s', err)

    cur.close()
    logger.info('Removed gene_abstracts with gene or abstract ID set to 0')
def add_new_abstracts(db):
    """Insert any abstract ID that appears in `gene_abstract` but is missing
    from the `abstract` table (as a bare `pubmed_id` row, to be fetched later).
    Raises on database errors."""

    logger.debug('adding abstract id\'s from gene_abstract table to the abstract table')

    cur = getcursor(db)

    query = """
        insert ignore into `abstract` (`pubmed_id`)
        select distinct `abstract` from `gene_abstract`; 
        """

    try:
        cur.execute(query)
    except Exception as err:
        logger.critical('Error inserting add new abstract ID\'s from gene-abstract links into abstract table.  Error message: %s', err)
        raise

    cur.close()
    logger.info('added new abstract ID\'s to abstract table')
def insert(db, source):
    """Insert a source into the database.

    If the source defines its own `insertfunction`, delegate to it and return
    its result.  Otherwise run the source's `insertquery` (formatted with the
    source's data-file path) and return True on success, False on failure.
    """

    logger.debug('Inserting %s into gene_abstract database table', source.filename)

    # check to see if this source has a function for insertion
    if source.insertfunction:
        return source.insertfunction(db, source)

    c = config.getcursor(db)

    # otherwise insert by executing its 'insertquery'
    # bug fix: the cursor was previously never closed on either path (leak)
    try:
        c.execute(source.insertquery.format(path=join(config.datapath, source.filename)))
    except Exception as e:
        logger.error('error inserting %s into gene_abstract database table.  Maybe the format of the file changed?  Error message: %s', source.filename, e)
        return False
    else:
        logger.info('Inserted %s into gene_abstract database table', source.filename)
        return True
    finally:
        c.close()
def remove_bad_abstracts(db):
    """Delete from the `abstract` table every abstract listed in the
    `removed_abstracts` table.  Raises on database errors."""

    logger.debug('removing abstracts in the `removed_abstracts` table from the `abstract` table')

    cur = getcursor(db)

    query = """
        delete `abstract`
        from `abstract`
        inner join `removed_abstracts`
        on `abstract`.`pubmed_id` = `removed_abstracts`.`abstract`
        """

    try:
        cur.execute(query)
    except Exception as err:
        logger.critical('Error removing abstracts in the `removed_abstracts` table from the `abstract` table.  Error message: %s', err)
        raise

    cur.close()
    logger.info('removed abstracts in the `removed_abstracts` table from the `abstract` table')
def remove_bad_links(db):
    """Delete `gene_abstract` links that point at abstracts which no longer
    exist in the `abstract` table.  Raises on database errors."""

    logger.debug('Removing `gene_abstract` links for abstracts not in the `abstract` table')

    cur = getcursor(db)

    query = """
        delete `gene_abstract`
        from `gene_abstract`
        left join `abstract`
        on `gene_abstract`.`abstract` = `abstract`.`pubmed_id`
        where `abstract`.`pubmed_id` is null"""

    try:
        cur.execute(query)
    except Exception as err:
        logger.critical('Error removing `gene_abstract` links for abstracts not in the `abstract` table.  Error message: %s', err)
        raise

    cur.close()
    logger.info('Removed `gene_abstract` links for abstracts not in the `abstract` table')
def find_unfetched_abstracts(db):
    """Record abstracts that were never fetched (rows in `abstract` whose
    `abstract` text column is null) in the `removed_abstracts` table, with
    reason code 2.  Raises on database errors."""

    logger.debug('Finding unfetched abstracts in the `abstract` table (where the `abstract` field is null), and adding them to the `removed_abstract` table')

    cur = getcursor(db)

    query = """
        insert ignore into `removed_abstracts`
        (`abstract`, `removed`, `reason`)
        select `pubmed_id` `abstract`, now() `removed`, 2 `reason`
        from `abstract` a
        where a.`abstract` is null"""

    try:
        cur.execute(query)
    except Exception as err:
        logger.critical('Error adding unfetched abstracts to `removed_abstract` table.  Error message: %s', err)
        raise

    cur.close()
    logger.info('Found unfetched abstracts in the `abstract` table (where the `abstract` field is null), and added them to the `removed_abstract` table')
# Beispiel #12
# 0
def remove_bad_abstracts(ix, db):
    """Delete every abstract listed in the `removed_abstracts` database table
    from the whoosh index `ix`.

    NOTE(review): the writer is not committed here — presumably the caller
    commits the index; confirm.
    """

    cur = getcursor(db)

    # fetch the PMIDs scheduled for removal
    cur.execute("""
    select `abstract` from `removed_abstracts`
    """)

    logger.debug('Removing abstracts in the `removed_abstracts` table from the index')

    # delete each PMID from the index; whoosh terms are unicode
    writer = ix.writer()
    for (pmid,) in cur:
        writer.delete_by_term('pmid', unicode(pmid))

    cur.close()
    logger.info('Removed abstracts in the `removed_abstracts` table from the index')
def clearfiles(db):
    """Delete uploaded gene files more than one day old from the database,
    then clean up orphans: files with no genes and genes with no file."""

    logger.debug('Deleting uploaded files more than 1 day old from database')

    c = getcursor(db)

    # delete uploaded genes and uploaded files more than a day old.
    # bug fix: the comparison was previously '>', which deleted files
    # *newer* than one day, contradicting the stated intent above.
    c.execute("""
    delete uploaded_gene, uploaded_genefile 
    from uploaded_gene
    inner join uploaded_genefile
    on uploaded_gene.`genefile` = uploaded_genefile.`id`
    where uploaded_genefile.`uploaded` < (now() - interval 1 day);
    """)

    # delete uploaded files with no genes
    c.execute("""
    delete uploaded_genefile
    from uploaded_genefile
    left join uploaded_gene
    on uploaded_gene.genefile = uploaded_genefile.id
    where uploaded_gene.`genefile` is null;    
    """)

    # delete uploaded genes with no file
    c.execute("""
    delete uploaded_gene
    from uploaded_gene
    left join uploaded_genefile
    on uploaded_gene.genefile = uploaded_genefile.id
    where uploaded_genefile.`id` is null;
    """)

    c.close()

    logger.info('Finished deleting old files')
def find_too_many_genes(db, maxgenes=1000):
    """Flag abstracts linked to more than `maxgenes` genes (default 1000) by
    inserting them into the `removed_abstracts` table with reason code 1,
    which weeds out genome-wide association papers.  Raises on database
    errors."""

    logger.debug('adding abstracts to `removed_abstracts` table with too many (> %s) associated genes', maxgenes)

    cur = getcursor(db)

    query = """
        insert ignore into `removed_abstracts`
        (`abstract`, `removed`, `reason`)
        select `abstract`, now() `removed`, 1 `reason`
        from `gene_abstract`
        group by `abstract`
        having count(`gene`) > %s
        """

    try:
        cur.execute(query, (maxgenes,))
    except Exception as err:
        logger.critical('Error adding abstracts with too many genes (> %s) to `removed_abstracts` table.  Error message: %s', maxgenes, err)
        raise

    cur.close()
    logger.info('added abstracts to `removed_abstracts` table with too many (> %s) associated genes', maxgenes)
# Beispiel #15
# 0
def articles(db):
    """Yield rows for abstracts that need (re)indexing: fetched abstracts
    whose `indexed` timestamp is missing or older than `updated` /
    `index_dirty`.  Generator; raises on query errors.
    """

    logger.debug('Looking up articles that need to be indexed')

    c = getcursor(db)

    try:
        c.execute("""
            select `pubmed_id`, `title`, `abstract`, `authors`, year(`pubdate`), month(`pubdate`), day(`pubdate`), `journal`, `volume`, `pages`, `review`
            from `abstract`
            where `updated` is not null
            and (`indexed` is null or `indexed` < `updated` or `indexed` < `index_dirty`);
            """)
    except Exception as e:
        logger.critical('Could not look up articles that need to be indexed.  Error message: %s', e)
        # bug fix: don't leak the cursor when the query fails
        c.close()
        raise

    logger.info('Looked up articles that need to be indexed')

    # bug fix: close the cursor even when the consumer abandons the
    # generator before exhausting it (GeneratorExit triggers the finally)
    try:
        for article in c:
            yield article
    finally:
        c.close()
def update_metabolites(db):
    """
    Find metabolites mentioned in new articles, and insert new records into the
    metabolite_abstract table in the database.

    (For each metabolite in the metabolite_info.txt file, search against the
    temporary whoosh index containing only new articles.)

    The metabolite_info.txt record format appears to be: line 1 = HMDB ID,
    line 2 = tab-separated synonyms (first entry is the common name),
    line 3 = tab-separated PubMed references.
    """

    logger.debug('Scanning for metabolites')

    # Don't open the index until this enclosing function is called, because
    # we'll be deleting it and re-creating it in a previous state of the
    # update process.
    ix = open_index(TEMP_METABOLITE_INDEX_PATH)
    cursor = getcursor(db)

    # query parser and searcher
    parser = QueryParser('abstract', ix.schema)
    parser.add_plugin(PhrasePlugin)
    searcher = ix.searcher(weighting=BM25F)

    # First pass: collect every metabolite's common (primary) name, so that
    # a synonym of one metabolite that is the common name of another can be
    # skipped when searching.
    common_name_set = set()
    with open('metabolite_info.txt') as f:
        for line in f:
            if line.startswith('HMDB'):
                synonym_line = f.next().strip()
                synonyms = synonym_line.split('\t')
                common_name_set.add(synonyms[0])

    # Second pass: search abstracts and insert metabolite-abstract links
    with open('metabolite_info.txt') as f:
        for line in f:
            if not line.startswith('HMDB'):
                continue

            hmdb_id = line.strip()

            synonym_line = f.next().strip()
            synonyms = synonym_line.split('\t')
            common_name = synonyms[0]

            # Bug fix: the original removed items from `synonyms` while
            # iterating over it, which skips elements.  Build a filtered
            # list instead: keep the common name itself, drop any synonym
            # that is the common name of some other metabolite.
            # (The unused `printsyn` accumulator, whose only consumer was
            # commented-out mapping-file output, has been dropped.)
            synonyms = [s for s in synonyms
                        if s == common_name or s not in common_name_set]

            reference_line = f.next().strip()
            references = set(reference_line.split('\t'))
            references.discard('\n')

            # query the temporary index for each surviving synonym, as an
            # exact phrase, and accumulate the matching abstract IDs
            for name in synonyms:
                query = '"' + name + '"'
                results = get_abstracts(parser, searcher, query)
                for item in results:
                    references.add(str(item))

            insert_db_records(cursor, hmdb_id, list(references))

    logger.info('updated metabolite-abstract links')
# Beispiel #17
# 0
def write(articles, ix, db, mark_db=True):
    """
    Given a list of articles as tuples, write them to the index ix.
    
    If mark_db is True, then update the database record for each article in 'articles,'
    recording that it has been indexed.

    Each article tuple is (pmid, title, abstract, authors, year, month, day,
    journal, volume, pages, review) — the same column order produced by
    articles().  The index is committed every 100,000 documents and once at
    the end; abstracts are marked as indexed only after the final commit.
    """ 

    logger.debug('writing articles to index')

    # update the index with the articles
    writer = ix.writer()
    
    # keep track of the articles we've written so far, so we cam mark them as
    # indexed after a successful commit
    writtenPMIDs = []

    c = getcursor(db)

    for i, article in enumerate(articles):
        pmid, title, abstract, authors, year, month, day, journal, volume, pages, review = article
        

        # make text fields unicode (db columns arrive as utf-8 byte strings)
        if title is not None:
            title = unicode(title, 'utf-8')
        if abstract is not None:
            abstract = unicode(abstract, 'utf-8')
        if authors is not None:
            authors = unicode(authors, 'utf-8')


        # get associated genes (space-separated ID strings for the index)
        genes = u' '.join([unicode(g[0]) for g in lookup_genes(pmid, c)])
            
        homolog_genes = u' '.join([unicode(g[0]) for g in lookup_homolog_genes(pmid, c)]) 
        
        metabolites = u' '.join([unicode(g[0]) for g in lookup_metabolites(pmid, c)])
        
        # index the document (update_document replaces any existing document
        # with the same pmid rather than adding a duplicate)
        writer.update_document(pmid=pmid, genes=genes, homolog_genes=homolog_genes,
            metabolites=metabolites,
            title=title, abstract=abstract, authors=authors, year=year, month=month, 
            day=day, journal=journal, volume=volume, pages=pages, review=review)
        
        # keep track of the articles we've written so far.
        writtenPMIDs.append(pmid)
        
        # commit the index every 100,000 articles
        # (merge=False skips segment merging for speed; a fresh writer must
        # be opened after each commit)
        if i % 100000 == 0 and i != 0:
            writer.commit(merge=False)
            writer = ix.writer()
            logger.debug('Commiting abstract index.  Abstracts: %s', i)
            
    logger.info('wrote articles to index')


    logger.debug("committing abstract index...")
    writer.commit() 
    logger.info("committed abstract index")


    # mark all of the indexed abstracts as indexed after a successful commit
    if mark_db:
        for pmid in writtenPMIDs:
            mark_as_indexed(pmid, c)

    c.close()