def delete(self, context):
    cursor = context.get_cursor()
    try:
        # First, remove the study (connected table records will
        # also be deleted).  But don't delete non-existent studies.
        if self.study_id >= 0:
            study = Study(context, self.study_id)
            study.delete(context)

        # Next, unindex it.
        search_index = canary.search.SearchIndex(context)
        search_index.unindex_record(self)

        # Then, remove the metadata
        cursor.execute("""
            DELETE FROM queued_record_metadata
            WHERE queued_record_id = %s
            """, self.uid)

        # Finally, remove this record itself.
        cursor.execute("""
            DELETE FROM queued_records
            WHERE uid = %s
            """, self.uid)

        if context.config.use_cache:
            context.cache_delete('record:%s' % self.uid)
    except Exception, e:
        context.logger.error('Delete queued record %s (%s)', self.uid, e)
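# A minimal usage sketch for delete() (the uid here is hypothetical; the
# QueuedRecord(context, uid) construction pattern matches the search script
# further below):
#
#     context = Context()
#     record = QueuedRecord(context, 1234)
#     record.delete(context)
#
# The ordering above matters: the study and its dependent rows go first,
# then the search index entry, then the metadata rows, then the record row
# itself, so nothing is left orphaned or stale in the index.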
def search(self, query):
    """Perform a Lucene search for studies."""
    studies = []
    for record in self.searcher.search(query, curated_only=True):
        studies.append(Study(self.context, record.study_id))
    return studies
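# Usage sketch (the owning class isn't shown in this fragment; it wraps a
# lower-level searcher and returns only curated records as Study objects;
# the query string is hypothetical):
#
#     for study in searcher.search('exposures:rabies'):
#         print study.uid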
query_str = options.boolean.join(
    [' "%s" [%s] ' % (term, options.field) for term in args])
#print query_str.strip()

search_index = SearchIndex(context)
hit_list = []
hits, searcher = search_index.search(query_str)
for i, doc in hits:
    hit_list.append(doc.get('uid'))
searcher.close()

output = []
for id in hit_list:
    rec = QueuedRecord(context, int(id))
    if options.locations:
        study = Study(context, rec.study_id)
        for loc in study.locations:
            out = []
            out.extend((id, loc.uid, loc.study_id, loc.feature_id))
            feature = Feature(uid=loc.feature_id)
            feature.load(context)
            out.extend((feature.latitude, feature.longitude, feature.name,
                feature.feature_type, feature.country_code))
            output.append('\t'.join([str(v) for v in out]))
    else:
        mm = rec.get_mapped_metadata(ctm)
        if mm['author']:
            first_author = mm['author'][0]
        else:
            first_author = '-'
        output.append('\t'.join(
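# A worked example of the query built above (hypothetical option values):
# with args = ['west', 'nile'], options.field = 'keyword', and
# options.boolean = 'AND', query_str becomes:
#
#     "west" [keyword] AND "nile" [keyword]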
#!/usr/bin/env python
# $Id$

from canary.context import Context
from canary.loader import QueuedRecord
from canary.study import Study

context = Context()
cursor = context.get_cursor()
cursor.execute("""
    SELECT studies.record_id, study_history.study_id, study_history.modified
    FROM study_history, studies
    WHERE study_history.message = 'Set record status to curated'
    AND study_history.study_id = studies.uid
    ORDER BY studies.record_id ASC, study_history.modified ASC
    """)
for row in cursor:
    print "record_id:%i; study_id:%i; curated:%s" % row
    # update either QUEUED_RECORDS or STUDIES with curated time
    # r = QueuedRecord(con,row[0])
    # r.date_curated = row[2]
    # r.save
    s = Study(context, row[1])
    s.date_curated = row[2]
    s.save(context)
    pmid_list.sort()
    for pmid in pmid_list:
        cursor.execute("""
            SELECT studies.uid AS study_id, queued_records.uid AS queued_record_id
            FROM queued_records, studies
            WHERE queued_records.study_id = studies.uid
            AND studies.status = 2
            AND queued_records.unique_identifier = %s
            """, pmid)
        fields = [d[0] for d in cursor.description]
        desc = dtuple.TupleDescriptor([[f] for f in fields])
        row = cursor.fetchone()
        if row:
            row = dtuple.DatabaseTuple(desc, row)
            study = Study(row['study_id'])
            study.load(cursor)
            studies.append((study, pmid))
else:
    # sort by unique_identifier for easier side-by-side comparison
    cursor.execute("""
        SELECT studies.uid AS study_id, queued_records.uid AS queued_record_id,
            queued_records.unique_identifier
        FROM studies, queued_records
        WHERE studies.uid = queued_records.study_id
        AND studies.status = 2
        AND studies.curator_user_id = %s
        ORDER BY ABS(queued_records.unique_identifier)
        """, curator_id)
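# dtuple wraps a plain DB-API row so columns can be read by name; the
# pattern used above, in miniature:
#
#     fields = [d[0] for d in cursor.description]
#     desc = dtuple.TupleDescriptor([[f] for f in fields])
#     row = dtuple.DatabaseTuple(desc, cursor.fetchone())
#     print row['study_id']    # instead of row[0]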
def makeplot(context, token, records):
    this_module = sys.modules[__name__]
    N = len(linkage_keys)
    ind = arange(N)
    width = 0.35
    meth_type_count = {}
    study_count = dict(zip(linkages, array([0] * len(linkages))))
    for id, type in linkages_attrs:
        meth_type_count[id] = start.copy()
    for rec in records:
        study = Study(context, rec.study_id)
        for attr, key in linkages_attrs:
            if getattr(study, 'has_%s' % key):
                study_count[attr] += 1
                for meth in study.methodologies:
                    meth_type_count[attr][meth.study_type_id] += 1

    # Grid color '0.85' is a light-gray level (rc colors must be strings).
    rc('grid', color='0.85', linestyle='-', linewidth=0.3)
    grid(True)
    yoff = array([0] * N)

    # Reference map of methodology study_type_ids used below:
    s = """
        'experimental' : 1,
        'descriptive' : 2,
        'aggregate' : 3,
        'cross sectional' : 4,
        'cohort' : 5,
        'case control' : 6,
        'disease model' : 7,
    """
    print ['%s %s\n' % (k, v) for k, v in meth_type_count.items()]

    p_exp_x = [meth_type_count[key][1] for key in linkage_keys]
    p_exp = bar(ind, p_exp_x, width, color='#993333', bottom=yoff)
    yoff = yoff + p_exp_x
    p_coh_x = [meth_type_count[key][5] for key in linkage_keys]
    p_coh = bar(ind, p_coh_x, width, color='#FF9933', bottom=yoff)
    yoff = yoff + p_coh_x
    p_csec_x = [meth_type_count[key][4] for key in linkage_keys]
    p_csec = bar(ind, p_csec_x, width, color='#99CC99', bottom=yoff)
    yoff = yoff + p_csec_x
    p_desc_x = [meth_type_count[key][2] for key in linkage_keys]
    p_desc = bar(ind, p_desc_x, width, color='#6666CC', bottom=yoff)
    yoff = yoff + p_desc_x
    #p_agg_x = [meth_type_count[key][3] for key in linkage_keys]
    #p_agg = bar(ind, p_agg_x, width, color='#CCCC00', bottom=yoff)
    #yoff = yoff + p_agg_x
    #p_cc_x = [meth_type_count[key][6] for key in linkage_keys]
    #p_cc = bar(ind, p_cc_x, width, color='#CC66FF', bottom=yoff)
    #yoff = yoff + p_cc_x
    #p_dm_x = [meth_type_count[key][7] for key in linkage_keys]
    #p_dm = bar(ind, p_dm_x, width, color='#993366', bottom=yoff)
    #yoff = yoff + p_dm_x

    precords_x = array([len(records)] * N)
    #precords = bar(ind+width/3, precords_x, width/3, color='#999999', bottom=0)
    precords = plot(precords_x, color='#AAAAAA', linestyle='-', linewidth=1.5)
    pstudies_x = [study_count[k] for k in linkage_keys]
    pstudies = bar(ind + width / 3, pstudies_x, width / 3,
        color='#EEEEEE', bottom=0)

    max_val = max(yoff)
    xlabel('Linkages to Human Health')
    ylabel('# Methodologies by Type for Studies with Linkages')
    title('Animal Sentinels for "%s" (%s records)' % (token, len(records)),
        size=12)
    xticks(ind + width / 2, [linkages[k] for k in linkage_keys],
        rotation=20, size=6)
    step = max_val / 5
    yticks(arange(0, max_val + (step * 3), step))
    legend(
        (p_exp[0], p_coh[0], p_csec[0], p_desc[0], precords[0], pstudies[0]),
        ('Experimental', 'Cohort', 'Cross-Sectional', 'Descriptive',
         '# Studies Total', '# Records w/Linkage'))
    ## legend((p_exp[0], p_desc[0], p_agg[0], p_csec[0], p_coh[0],
    ##     p_cc[0], p_dm[0], precords[0], pstudies[0]),
    ##     ('Experimental', 'Descriptive', 'Aggregate',
    ##     'Cross-Sectional', 'Cohort', 'Case-Control', 'Disease Model',
    #savefig(('%s' % token.replace(' ', '_')))
    show()
    cla()
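# Shape note for the counters in makeplot() above (values hypothetical):
# meth_type_count maps each linkage attribute to a per-study-type tally,
# e.g. {'exposure_linkage': {1: 4, 4: 1, 5: 2}}, keyed by the study_type_id
# map quoted in the function body, and study_count maps each linkage to the
# number of studies exhibiting it.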
def index_record(self, record, writer=None):
    # field, value, store?, index?, token?
    try:
        if not writer:
            had_writer = False
            writer = self.context.get_search_index_writer(False)
        else:
            had_writer = True

        study = Study(self.context, record.study_id)
        self.logger.debug('starting document')
        doc = PyLucene.Document()

        # First, we need to create a unique key so we can later delete
        # if necessary.  Will try simply uid for now.
        doc.add(PyLucene.Field('uid', str(record.uid),
            True, True, False))
        doc.add(PyLucene.Field('all', str(record.uid),
            True, True, False))

        # Second, save internal-use metadata.  These should probably
        # be x'd out at Query-time.
        doc.add(PyLucene.Field('record-status', str(record.status),
            False, True, False))
        doc.add(PyLucene.Field('article-type', str(study.article_type),
            False, True, False))

        source_catalog = self.context.get_source_catalog()
        complete_term_map = source_catalog.get_complete_mapping()
        mapped_metadata = record.get_mapped_metadata(complete_term_map)

        # First, index all the non-multiple metadata fields
        for field in ('abstract', 'affiliation', 'issn', 'journal',
            'pubdate', 'issue', 'pages', 'title', 'volume'):
            val = mapped_metadata.get(field, None)
            if val:
                doc.add(PyLucene.Field(field, val, False, True, True))
                doc.add(PyLucene.Field('all', val, False, True, True))

        # Be sure to index all of (abbrev, full title, issn) as "journal"
        issn = mapped_metadata.get('issn')
        if issn:
            j = Journal()
            j.load_from_issn(self.context, issn)
            no_dash = j.no_dash()
            self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' % \
                (j.journal_title, j.abbreviation, issn))
            doc.add(PyLucene.Field('journal', issn, False, True, True))
            doc.add(PyLucene.Field('journal', no_dash, False, True, True))
            doc.add(PyLucene.Field('all', issn, False, True, True))
            doc.add(PyLucene.Field('all', no_dash, False, True, True))
            if j.abbreviation:
                doc.add(PyLucene.Field('journal', j.abbreviation,
                    False, True, True))
                doc.add(PyLucene.Field('all', j.abbreviation,
                    False, True, True))
            if j.journal_title:
                doc.add(PyLucene.Field('journal', j.journal_title,
                    False, True, True))
                doc.add(PyLucene.Field('all', j.journal_title,
                    False, True, True))

        # If a page range is given, index the first page, assuming
        # the delimiter is '-'
        pages = mapped_metadata.get('pages', None)
        if pages and '-' in pages:
            first_page = pages[0:pages.index('-')]
            doc.add(PyLucene.Field('pages', first_page, False, True, True))
            doc.add(PyLucene.Field('all', first_page, False, True, True))

        # 'unique_identifier' must be specially treated because
        # of the '_'
        val = mapped_metadata.get('unique_identifier', None)
        if val:
            doc.add(PyLucene.Field('unique-identifier', val,
                False, True, True))
            doc.add(PyLucene.Field('all', val, False, True, True))

        # Next, index all the possibly-multiple metadata fields.
        # Give these (especially for author and subject) a little
        # boost, less than for canary UMLS concepts.
        for field in ('author', 'grantnum', 'keyword', 'registrynum',
            'subject'):
            vals = mapped_metadata.get(field, None) or []
            for val in vals:
                doc.add(PyLucene.Field(field, val, False, True, True))
                f = PyLucene.Field('all', val, False, True, True)
                f.setBoost(1.3)
                doc.add(f)

        # If at least one author name is available, index the first
        # author to support first-author searching.  Also, boost it
        # slightly higher than the other authors.
        authors = mapped_metadata.get('author', None)
        if authors:
            doc.add(PyLucene.Field('first-author', authors[0],
                False, True, True))
            f = PyLucene.Field('all', authors[0], False, True, True)
            f.setBoost(1.5)
            doc.add(f)

        # All the booleans
        for bool in ('has_outcomes', 'has_exposures', 'has_relationships',
            'has_interspecies', 'has_exposure_linkage',
            'has_outcome_linkage', 'has_genomic'):
            val = getattr(study, bool)
            # NOTE: I think lucene dislikes '_' in field names ??
            boolstr = bool.replace('_', '-')
            doc.add(PyLucene.Field(boolstr, str(int(val)),
                False, True, False))
            # NOTE: no need to add this to 'all'.  I think.

        # Now, all the UMLS concepts.  Simpler approach to
        # lucene "synonym injection", but it works!  Give it
        # slightly bigger boost than keywords/subjects
        for ctype in ('exposures', 'outcomes', 'risk_factors', 'species'):
            # NOTE: I think lucene dislikes '_' in field names ??
            ctype_search = ctype.replace('_', '-')
            for val in getattr(study, ctype):
                concept = Concept(self.context, val.concept_id)
                for syn in concept.synonyms:
                    doc.add(PyLucene.Field(ctype_search,
                        unicode(syn, 'latin-1'), False, True, True))
                    f = PyLucene.Field('all', unicode(syn, 'latin-1'),
                        False, True, True)
                    f.setBoost(2.0)
                    doc.add(f)

        # And, the locations
        gazeteer = self.context.get_gazeteer()
        locs = []
        for location in study.locations:
            feature = Feature(self.context, uid=location.feature_id)
            feature.load(self.context)
            if gazeteer.fips_codes.has_key((feature.country_code,
                feature.adm1)):
                region_name = gazeteer.fips_codes[(feature.country_code,
                    feature.adm1)]
            else:
                region_name = ''
            full_name = '%s (%s, %s, %s)' % (feature.name,
                gazeteer.feature_codes[feature.feature_type],
                render_capitalized(region_name),
                render_capitalized(gazeteer.country_codes[feature.country_code]))
            doc.add(PyLucene.Field('location',
                unicode(full_name, 'latin-1'), False, True, True))
            doc.add(PyLucene.Field('all',
                unicode(full_name, 'latin-1'), False, True, True))

        # Finally, the methodologies
        for meth in study.methodologies:
            doc.add(PyLucene.Field('methodology',
                meth.get_study_type(text=True), False, True, True))
            doc.add(PyLucene.Field('all',
                meth.get_study_type(text=True), False, True, True))
            # And each exposure route term
            for route in meth.get_routes(True):
                doc.add(PyLucene.Field('exposure_route', route,
                    False, True, True))
                doc.add(PyLucene.Field('all', route, False, True, True))

        writer.addDocument(doc)
        if not had_writer:
            writer.close()
    except Exception, e:
        self.logger.error('Failed to index record: %s', e)
        self.logger.error(traceback.format_exc())
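# A minimal batch-indexing sketch (assumed names follow the patterns above:
# SearchIndex(context) as in delete(), and the writer comes from
# context.get_search_index_writer as in index_record itself).  Sharing one
# writer avoids opening and closing the index once per record:
#
#     search_index = canary.search.SearchIndex(context)
#     writer = context.get_search_index_writer(False)
#     for rec in records:
#         search_index.index_record(rec, writer=writer)
#     writer.close()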
if __name__ == '__main__':
    cmdline = CommandLine()
    cmdline.parse_args()
    context = cmdline.context()
    collector = StatCollector(context)
    collector.add_handlers(
        CuratorHandler(),
        ArticleTypeHandler(),
        ExposureHandler(),
        OutcomeHandler(),
        RiskFactorHandler(),
        SpeciesHandler(),
        LocationHandler(),
        MethodologyTypeHandler(),
        )
    searcher = RecordSearcher(context)
    records = searcher.search('record-status:%i' % QueuedRecord.STATUS_CURATED)
    today = datetime.now()
    one_weeks_records = [rec for rec in records
        if today - Study(context, rec.study_id).date_modified <= timedelta(7)]
    collector.process(one_weeks_records)
    for handler in collector.handlers:
        print handler.__class__, handler.stats