def make_homolog_links(db):
    """Add gene_abstract links for homologs (using homologene_gene table).

    First copies all existing `gene_abstract` rows into
    `homologene_gene_abstract`, then adds one link per homolog: for every
    pair of genes sharing a `homologene_id`, each abstract linked to one
    gene is also linked to the other.

    Raises on any database error (after logging it).
    """
    logger.debug('inserting records from `gene_abstract` into `homologene_gene_abstract` table')
    c = getcursor(db)
    try:
        c.execute("""
            insert ignore into `homologene_gene_abstract`
            select * from `gene_abstract`
            """)
    except Exception as e:
        logger.critical('Could not insert records from `gene_abstract` into `homologene_gene_abstract` table. Error message: %s', e)
        raise

    logger.debug('inserting homolog links from `gene_abstract` and `homologene_gene` into `homologene_gene_abstract`')
    try:
        # self-join homologene_gene on homologene_id to pair each gene with
        # its homologs, then pull in every abstract linked to the homolog
        c.execute("""
            insert ignore into `homologene_gene_abstract`
            select null `id`, h1.`gene` `gene`, ga.`abstract` `abstract`
            from `homologene_gene` h1
            inner join `homologene_gene` h2
            on h1.`homologene_id` = h2.`homologene_id`
            inner join `gene_abstract` ga
            on h2.`gene` = ga.`gene`
            """)
    except Exception as e:
        # BUG FIX: the message previously ended with a literal 'e' instead of
        # a '%s' placeholder, so the exception text was never logged.
        logger.critical('could not insert homolog links from `gene_abstract` and `homologene_gene` into `homologene_gene_abstract`. Error message: %s', e)
        raise
    c.close()
    logger.info('inserted records from `gene_abstract` into `homologene_gene_abstract` table')
def ids(db, size = abstract_fetch_batchsize):
    """Yield lists of PubMed IDs (each list at most `size` long) for
    abstracts whose `updated` column is null, i.e. abstracts that have not
    been fetched yet.  Batching lets the caller fetch several abstracts
    per request."""
    logger.debug('Quering database for a list of un-fetched abstracts')
    cursor = getcursor(db)
    try:
        cursor.execute("""
            select `pubmed_id`
            from `abstract`
            where `updated` is null;
            """)
    except Exception as e:
        logger.critical('Error: could not get a list of un-fetched abstracts from the database. Error message: %s', e)
        raise
    logger.info('Got list of un-fetched abstracts')
    rows = cursor.fetchall()
    cursor.close()
    # hand the PMIDs out in fixed-size batches
    for start in xrange(0, len(rows), size):
        yield [row[0] for row in rows[start:start + size]]
def count_genes(db):
    """Refresh the cached per-gene abstract counts in the `gene` table.

    Sets `gene`.`abstracts` and `gene`.`homolog_abstracts` to the number of
    distinct abstracts linked to each gene in `gene_abstract` and
    `homologene_gene_abstract` respectively.  Raises on database errors.
    """
    logger.debug('updating the `gene` table with new abstract counts')
    cursor = getcursor(db)
    query = """
        update `gene`
        set `gene`.`abstracts` = (
            select count(distinct `abstract`)
            from `gene_abstract`
            where `gene_abstract`.`gene` = `gene`.`entrez_id`),
        `gene`.`homolog_abstracts` = (
            select count(distinct `abstract`)
            from `homologene_gene_abstract`
            where `homologene_gene_abstract`.`gene` = `gene`.`entrez_id`);
        """
    try:
        cursor.execute(query)
    except Exception as e:
        logger.critical('Error updating the `gene` table with new abstract counts. Error message: %s', e)
        raise
    cursor.close()
    logger.info('updated the `gene` table with new abstract counts')
def check_abstract_counts(db):
    """Compare the abstract count in the whoosh index against the row count
    of the `abstract` table.  Returns True when they match, False (with a
    warning logged) when they don't -- a mismatch indicates an index/database
    inconsistency."""
    # count documents currently in the index
    index = open_index(ABSTRACT_INDEX_PATH)
    index_total = index.doc_count()
    index.close()

    # count rows in the abstract table
    cursor = getcursor(db)
    cursor.execute("""
        select count(*)
        from `abstract`
        """)
    (table_total,) = cursor.fetchone()
    cursor.close()

    if index_total != table_total:
        logger.warning('database abstract count does not match index abstract count. database count: %s, index count: %s. There may have been an error committing the index, or an abstract was removed from the abstract table. If you can\'t figure out what caused the inconsistency, one way to fix it is to re-build the index from scratch (delete the contents of the index folder, set the `indexed` column in the `abstract` table to null for all rows, and run the updater script. It will probably take a couple hours.', table_total, index_total)
        return False
    logger.info('database abstract count matches index abstract count (this is good.) abstract count: %s', index_total)
    return True
def update(db, metadata):
    """Write fetched article metadata into the `abstract` table.

    `metadata` is a list of dicts, one per row (keys: title, authors,
    abstract, year, month, day, journal, volume, issue, pages, review, id).
    Failures are logged per-row and skipped, so one bad record does not
    abort the whole batch.
    """
    cursor = getcursor(db)
    query = """
        update `abstract` set
        title := %s,
        authors := %s,
        abstract := %s,
        pubdate := %s,
        journal := %s,
        volume := %s,
        issue := %s,
        pages := %s,
        review := %s,
        updated := now()
        where pubmed_id = %s;
        """
    for record in metadata:
        try:
            # build the pubdate string inside the try so a malformed record
            # is logged rather than raised
            pubdate = record['year'] + '-' + record['month'] + '-' + record['day']
            cursor.execute(query, (record['title'], record['authors'],
                                   record['abstract'], pubdate,
                                   record['journal'], record['volume'],
                                   record['issue'], record['pages'],
                                   record['review'], record['id']))
        except Exception as e:
            logger.error('Error updating `abstract` table for PMID %s. Error message: %s', record['id'], e)
    cursor.close()
def cleanup(db):
    """Remove gene-abstract links whose gene or abstract ID is 0
    (placeholder rows left behind by bad source data)."""
    logger.debug('Removing gene_abstracts with gene or abstract ID set to 0')
    cursor = config.getcursor(db)
    try:
        cursor.execute("""
            delete from `gene_abstract`
            where `abstract` = 0
            or `gene` = 0;
            """)
    except Exception as e:
        # best-effort: log and continue rather than abort the update run
        logger.error('Error removing bad gene_abstract links, error message: %s', e)
    cursor.close()
    logger.info('Removed gene_abstracts with gene or abstract ID set to 0')
def add_new_abstracts(db):
    """Insert any abstract IDs that appear in the `gene_abstract` link table
    but not yet in the `abstract` table (insert ignore skips existing rows).
    Raises on database errors."""
    logger.debug('adding abstract id\'s from gene_abstract table to the abstract table')
    cursor = getcursor(db)
    query = """
        insert ignore into `abstract` (`pubmed_id`)
        select distinct `abstract`
        from `gene_abstract`;
        """
    try:
        cursor.execute(query)
    except Exception as e:
        logger.critical('Error inserting add new abstract ID\'s from gene-abstract links into abstract table. Error message: %s', e)
        raise
    cursor.close()
    logger.info('added new abstract ID\'s to abstract table')
def insert(db, source):
    """Insert a source into the database.

    A source may carry its own insertion function; if so, delegate to it.
    Otherwise run the source's 'insertquery' template against its data file.
    Returns True on success, False on failure (whatever the custom
    insertfunction returns, when one exists).
    """
    logger.debug('Inserting %s into gene_abstract database table', source.filename)

    # prefer the source's dedicated insertion routine when it has one
    if source.insertfunction:
        return source.insertfunction(db, source)

    cursor = config.getcursor(db)
    query = source.insertquery.format(path=join(config.datapath, source.filename))
    try:
        cursor.execute(query)
    except Exception as e:
        logger.error('error inserting %s into gene_abstract database table. Maybe the format of the file changed? Error message: %s', source.filename, e)
        return False
    logger.info('Inserted %s into gene_abstract database table', source.filename)
    return True
def remove_bad_abstracts(db):
    """Delete every row of the `abstract` table whose PMID appears in the
    `removed_abstracts` table.  Raises on database errors."""
    logger.debug('removing abstracts in the `removed_abstracts` table from the `abstract` table')
    cursor = getcursor(db)
    query = """
        delete `abstract`
        from `abstract`
        inner join `removed_abstracts`
        on `abstract`.`pubmed_id` = `removed_abstracts`.`abstract`
        """
    try:
        cursor.execute(query)
    except Exception as e:
        logger.critical('Error removing abstracts in the `removed_abstracts` table from the `abstract` table. Error message: %s', e)
        raise
    cursor.close()
    logger.info('removed abstracts in the `removed_abstracts` table from the `abstract` table')
def remove_bad_links(db):
    """Delete `gene_abstract` rows that reference an abstract with no
    matching row in the `abstract` table (left join finds the orphans).
    Raises on database errors."""
    logger.debug('Removing `gene_abstract` links for abstracts not in the `abstract` table')
    cursor = getcursor(db)
    query = """
        delete `gene_abstract`
        from `gene_abstract`
        left join `abstract`
        on `gene_abstract`.`abstract` = `abstract`.`pubmed_id`
        where `abstract`.`pubmed_id` is null"""
    try:
        cursor.execute(query)
    except Exception as e:
        logger.critical('Error removing `gene_abstract` links for abstracts not in the `abstract` table. Error message: %s', e)
        raise
    cursor.close()
    logger.info('Removed `gene_abstract` links for abstracts not in the `abstract` table')
def find_unfetched_abstracts(db):
    """Add abstracts whose `abstract` text column is still null (i.e. never
    successfully fetched) to the `removed_abstracts` table, with reason
    code 2.  Raises on database errors."""
    logger.debug('Finding unfetched abstracts in the `abstract` table (where the `abstract` field is null), and adding them to the `removed_abstract` table')
    cursor = getcursor(db)
    query = """
        insert ignore into `removed_abstracts`
        (`abstract`, `removed`, `reason`)
        select `pubmed_id` `abstract`, now() `removed`, 2 `reason`
        from `abstract` a
        where a.`abstract` is null"""
    try:
        cursor.execute(query)
    except Exception as e:
        logger.critical('Error adding unfetched abstracts to `removed_abstract` table. Error message: %s', e)
        raise
    cursor.close()
    logger.info('Found unfetched abstracts in the `abstract` table (where the `abstract` field is null), and added them to the `removed_abstract` table')
def remove_bad_abstracts(ix, db):
    """Remove abstracts in the "removed_abstracts" database table from the
    index `ix`.

    Reads every PMID from `removed_abstracts` and deletes the matching
    document (by its 'pmid' term) from the whoosh index.
    """
    c = getcursor(db)

    # get abstract PMIDs from the removed_abstract table
    c.execute("""
        select `abstract`
        from `removed_abstracts`
        """)

    logger.debug('Removing abstracts in the `removed_abstracts` table from the index')

    # for each PMID in the query result, try to delete that abstract from
    # the index.
    writer = ix.writer()
    for row in c:
        pmid = row[0]
        writer.delete_by_term('pmid', unicode(pmid))

    # BUG FIX: commit the writer.  Without a commit the pending deletions are
    # never applied to the index, and the writer keeps holding the index
    # write lock, blocking later writers.
    writer.commit()

    c.close()
    logger.info('Removed abstracts in the `removed_abstracts` table from the index')
def clearfiles(db):
    """Purge stale uploaded gene lists from the database.

    Deletes uploaded gene files (and their genes) that are more than one day
    old, then removes any orphaned rows left on either side of the
    uploaded_gene <-> uploaded_genefile relationship.
    """
    logger.debug('Deleting uploaded files more than 1 day old from database')
    c = getcursor(db)

    # delete uploaded genes and uploaded files more than a day old.
    # BUG FIX: the comparison was '>', which deleted files *newer* than one
    # day and kept the old ones -- the opposite of the stated intent.
    c.execute("""
        delete uploaded_gene, uploaded_genefile
        from uploaded_gene
        inner join uploaded_genefile
        on uploaded_gene.`genefile` = uploaded_genefile.`id`
        where uploaded_genefile.`uploaded` < (now() - interval 1 day);
        """)

    # delete uploaded files with no genes
    c.execute("""
        delete uploaded_genefile
        from uploaded_genefile
        left join uploaded_gene
        on uploaded_gene.genefile = uploaded_genefile.id
        where uploaded_gene.`genefile` is null;
        """)

    # delete uploaded genes with no file
    c.execute("""
        delete uploaded_gene
        from uploaded_gene
        left join uploaded_genefile
        on uploaded_gene.genefile = uploaded_genefile.id
        where uploaded_genefile.`id` is null;
        """)

    c.close()
    logger.info('Finished deleting old files')
def find_too_many_genes(db, maxgenes=1000):
    """Flag abstracts linked to more than `maxgenes` genes by adding them to
    the `removed_abstracts` table (reason code 1) -- a heuristic to weed out
    genome-wide-association papers.  Raises on database errors."""
    logger.debug('adding abstracts to `removed_abstracts` table with too many (> %s) associated genes', maxgenes)
    cursor = getcursor(db)
    query = """
        insert ignore into `removed_abstracts`
        (`abstract`, `removed`, `reason`)
        select `abstract`, now() `removed`, 1 `reason`
        from `gene_abstract`
        group by `abstract`
        having count(`gene`) > %s
        """
    try:
        cursor.execute(query, (maxgenes,))
    except Exception as e:
        logger.critical('Error adding abstracts with too many genes (> %s) to `removed_abstracts` table. Error message: %s', maxgenes, e)
        raise
    cursor.close()
    logger.info('added abstracts to `removed_abstracts` table with too many (> %s) associated genes', maxgenes)
def articles(db):
    """Yield one tuple per abstract that needs (re)indexing: fetched
    abstracts that were never indexed, or were indexed before their last
    update, or before the index was marked dirty."""
    logger.debug('Looking up articles that need to be indexed')
    cursor = getcursor(db)
    query = """
        select `pubmed_id`, `title`, `abstract`, `authors`,
        year(`pubdate`), month(`pubdate`), day(`pubdate`),
        `journal`, `volume`, `pages`, `review`
        from `abstract`
        where `updated` is not null
        and (`indexed` is null
        or `indexed` < `updated`
        or `indexed` < `index_dirty`);
        """
    try:
        cursor.execute(query)
    except Exception as e:
        logger.critical('Could not look up articles that need to be indexed. Error message: %s', e)
        raise
    logger.info('Looked up articles that need to be indexed')
    for record in cursor:
        yield record
    cursor.close()
def update_metabolites(db):
    """
    Find metabolites mentioned in new articles, and insert new records into
    the metabolite_abstract table in the database.

    (For each metabolite in the metabolite_info.txt file, search against the
    temporary whoosh index containing only new articles.)

    metabolite_info.txt format (assumed from the parsing below -- TODO
    confirm): records of three lines -- an HMDB ID line, a tab-separated
    synonym line whose first entry is the common name, and a tab-separated
    reference line.
    """
    logger.debug('Scanning for metabolites')

    # Don't open the index until this enclosing function is called, because
    # we'll be deleting it and re-creating it in a previous state of the
    # update process.
    ix = open_index(TEMP_METABOLITE_INDEX_PATH)
    cursor = getcursor(db)

    # query parser and searcher
    parser = QueryParser('abstract', ix.schema)
    parser.add_plugin(PhrasePlugin)
    searcher = ix.searcher(weighting=BM25F)

    # First pass: collect every metabolite's common name (the first synonym
    # on the line after each HMDB ID) so duplicated names can be filtered
    # out below.
    common_name_set = set()
    with open('metabolite_info.txt') as f:
        for line in f:
            if line.startswith('HMDB'):
                synonym_line = f.next().strip()
                synonyms = synonym_line.split('\t')
                common_name_set.add(synonyms[0])

    # Second pass: search the index for each metabolite's names and insert
    # the matching abstracts into the database.
    with open('metabolite_info.txt') as f:
        for line in f:
            if line.startswith('HMDB'):
                hmdb_id = line.strip()

                synonym_line = f.next().strip()
                synonyms = synonym_line.split('\t')
                common_name = synonyms[0]

                # Keep this metabolite's own common name, but drop any
                # synonym that is the common name of a *different*
                # metabolite (it would attribute that metabolite's hits to
                # this one).
                # BUG FIX: the original removed items from `synonyms` while
                # iterating over it, which skips elements and let some
                # duplicate common names through.
                synonyms = [s for s in synonyms
                            if s == common_name or s not in common_name_set]

                # start from the references already listed in the file
                reference_line = f.next().strip()
                references = set(reference_line.split('\t'))
                references.discard('\n')

                # search the new-article index for each remaining name and
                # accumulate the matching abstract IDs
                for name in synonyms:
                    query = '"' + name + '"'  # phrase query: exact match
                    results = get_abstracts(parser, searcher, query)
                    for item in results:
                        references.add(str(item))

                insert_db_records(cursor, hmdb_id, list(references))

    # release the searcher and cursor (the original leaked both)
    searcher.close()
    cursor.close()
    logger.info('updated metabolite-abstract links')
def write(articles, ix, db, mark_db=True):
    """
    Given a list of articles as tuples, write them to the index ix.

    Each article is a tuple of (pmid, title, abstract, authors, year, month,
    day, journal, volume, pages, review).

    If mark_db is True, then update the database record for each article in
    'articles,' recording that it has been indexed.
    """
    logger.debug('writing articles to index')

    # update the index with the articles
    writer = ix.writer()

    # keep track of the articles we've written so far, so we can mark them as
    # indexed after a successful commit
    writtenPMIDs = []

    c = getcursor(db)

    for i, article in enumerate(articles):
        pmid, title, abstract, authors, year, month, day, journal, volume, pages, review = article

        # make text fields unicode (the database presumably returns utf-8
        # byte strings -- they are decoded here before indexing)
        if title is not None:
            title = unicode(title, 'utf-8')
        if abstract is not None:
            abstract = unicode(abstract, 'utf-8')
        if authors is not None:
            authors = unicode(authors, 'utf-8')

        # get associated genes (space-separated ID strings for the index)
        genes = u' '.join([unicode(g[0]) for g in lookup_genes(pmid, c)])
        homolog_genes = u' '.join([unicode(g[0]) for g in lookup_homolog_genes(pmid, c)])
        metabolites = u' '.join([unicode(g[0]) for g in lookup_metabolites(pmid, c)])

        # index the document (update_document replaces any existing document
        # with the same unique key rather than duplicating it)
        writer.update_document(pmid=pmid, genes=genes, homolog_genes=homolog_genes, metabolites=metabolites, title=title, abstract=abstract, authors=authors, year=year, month=month, day=day, journal=journal, volume=volume, pages=pages, review=review)

        # keep track of the articles we've written so far.
        writtenPMIDs.append(pmid)

        # commit the index every 100,000 articles; a fresh writer is needed
        # after each commit
        if i % 100000 == 0 and i != 0:
            writer.commit(merge=False)
            writer = ix.writer()
            logger.debug('Commiting abstract index. Abstracts: %s', i)

    logger.info('wrote articles to index')

    logger.debug("committing abstract index...")
    writer.commit()
    logger.info("committed abstract index")

    # mark all of the indexed abstracts as indexed after a successful commit
    if mark_db:
        for pmid in writtenPMIDs:
            mark_as_indexed(pmid, c)

    c.close()