# Export one tab-separated row per location of every curated study record.
# NOTE(review): assumes `out_file` is an already-open writable file and that
# record uids fall within range(4000) -- confirm against the caller.
context = canary.context.Context()
for i in range(4000):
    try:
        rec = QueuedRecord(context, i)
        if not rec \
            or rec.status != rec.STATUS_CURATED:
            # BUGFIX: was "raise 'ValueError'" -- string exceptions are a
            # TypeError in Python 2.6+; raise the real exception class.
            raise ValueError('record not curated')
        study = Study(context, rec.study_id)
        if study.status != study.STATUS_TYPES['curated'] \
            or not (study.ARTICLE_TYPES['traditional'] <= study.article_type
                <= study.ARTICLE_TYPES['curated']):
            raise ValueError('study not curated')
        for loc in study.locations:
            out = [loc.uid, loc.study_id, loc.feature_id]
            feature = Feature(uid=loc.feature_id)
            feature.load(context)
            out.extend((feature.latitude, feature.longitude, feature.name,
                feature.feature_type, feature.country_code))
            # BUGFIX: write one row per location.  The original wrote a
            # single row *after* the loop, emitting only the last location
            # and reusing a stale `out` for studies with no locations.
            out_file.write('\t'.join([str(s) for s in out]) + '\n')
    except Exception:
        # Many uids in range are simply missing or uncurated; skip them.
        continue
out_file.close()
search_index = SearchIndex(context) hit_list = [] hits, searcher = search_index.search(query_str) for i, doc in hits: hit_list.append(doc.get('uid')) searcher.close() output = [] for id in hit_list: rec = QueuedRecord(context, int(id)) if options.locations: study = Study(context, rec.study_id) for loc in study.locations: out = [] out.extend((id, loc.uid, loc.study_id, loc.feature_id)) feature = Feature(uid=loc.feature_id) feature.load(context) out.extend((feature.latitude, feature.longitude, feature.name, feature.feature_type, feature.country_code)) output.append('\t'.join([str(v) for v in out])) else: mm = rec.get_mapped_metadata(ctm) if mm['author']: first_author = mm['author'][0] else: first_author = '-' output.append('\t'.join( (str(rec.uid), rec.title, first_author, rec.source))) print '\n'.join(output)
def index_record(self, record, writer=None): # field, value, store?, index?, token? try: if not writer: had_writer = False writer = self.context.get_search_index_writer(False) else: had_writer = True study = Study(self.context, record.study_id) self.logger.debug('starting document') doc = PyLucene.Document() # First, we need to create a unique key so we can later delete # if necessary. Will try simply uid for now. doc.add(PyLucene.Field('uid', str(record.uid), True, True, False)) doc.add(PyLucene.Field('all', str(record.uid), True, True, False)) # Second, save internal-use metadata. These should probably # be x'd out at Query-time. doc.add( PyLucene.Field('record-status', str(record.status), False, True, False)) doc.add( PyLucene.Field('article-type', str(study.article_type), False, True, False)) source_catalog = self.context.get_source_catalog() complete_term_map = source_catalog.get_complete_mapping() mapped_metadata = record.get_mapped_metadata(complete_term_map) # First index all the non-multiple metadata fields for field in ('abstract', 'affiliation', 'issn', 'journal', 'pubdate', 'issue', 'pages', 'title', 'volume'): val = mapped_metadata.get(field, None) if val: doc.add(PyLucene.Field(field, val, False, True, True)) doc.add(PyLucene.Field('all', val, False, True, True)) # Be sure to index all of (abbrev, full title, issn) as "journal" issn = mapped_metadata.get('issn') if issn: j = Journal() j.load_from_issn(self.context, issn) no_dash = j.no_dash() self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' % \ (j.journal_title, j.abbreviation, issn)) doc.add(PyLucene.Field('journal', issn, False, True, True)) doc.add(PyLucene.Field('journal', no_dash, False, True, True)) doc.add(PyLucene.Field('all', issn, False, True, True)) doc.add(PyLucene.Field('all', no_dash, False, True, True)) if j.abbreviation: doc.add( PyLucene.Field('journal', j.abbreviation, False, True, True)) doc.add( PyLucene.Field('all', j.abbreviation, False, True, True)) if j.journal_title: 
doc.add( PyLucene.Field('journal', j.journal_title, False, True, True)) doc.add( PyLucene.Field('all', j.journal_title, False, True, True)) # If a page range is given, index the first page, assuming # the delimiter is '-' pages = mapped_metadata.get('pages', None) if pages \ and '-' in pages: first_page = pages[0:pages.index('-')] doc.add(PyLucene.Field('pages', first_page, False, True, True)) doc.add(PyLucene.Field('all', first_page, False, True, True)) # 'unique_identifier' must be specially treated because # of the '_' val = mapped_metadata.get('unique_identifier', None) if val: doc.add( PyLucene.Field('unique-identifier', val, False, True, True)) doc.add(PyLucene.Field('all', val, False, True, True)) # Next, index all the possibly-multiple metadata fields # Give these (especially for author and subject) a little # boost, less than for canary UMLS concepts for field in ('author', 'grantnum', 'keyword', 'registrynum', 'subject'): vals = mapped_metadata.get(field, None) for val in vals: doc.add(PyLucene.Field(field, val, False, True, True)) f = PyLucene.Field('all', val, False, True, True) f.setBoost(1.3) doc.add(f) # If at least one author name is available, index the first # author to support first-author searching. Also, boost it # slightly higher than the other authors. authors = mapped_metadata.get('author', None) if authors: doc.add( PyLucene.Field('first-author', authors[0], False, True, True)) f = PyLucene.Field('all', authors[0], False, True, True) f.setBoost(1.5) doc.add(f) # All the booleans for bool in ('has_outcomes', 'has_exposures', 'has_relationships', 'has_interspecies', 'has_exposure_linkage', 'has_outcome_linkage', 'has_genomic'): val = getattr(study, bool) # NOTE: I think lucene dislikes '_' in field names ?? boolstr = bool.replace('_', '-') doc.add( PyLucene.Field(boolstr, str(int(val)), False, True, False)) # NOTE: no need to add this to 'all'. I think. # Now, all the UMLS concepts. 
Simpler approach to # lucene "synonym injection", but it works! Give it # slightly bigger boost than keywords/subjects for ctype in ('exposures', 'outcomes', 'risk_factors', 'species'): # NOTE: I think lucene dislikes '_' in field names ?? ctype_search = ctype.replace('_', '-') for val in getattr(study, ctype): concept = Concept(self.context, val.concept_id) for syn in concept.synonyms: doc.add( PyLucene.Field(ctype_search, unicode(syn, 'latin-1'), False, True, True)) f = PyLucene.Field('all', unicode(syn, 'latin-1'), False, True, True) f.setBoost(2.0) doc.add(f) # And, the locations gazeteer = self.context.get_gazeteer() locs = [] for location in study.locations: feature = Feature(self.context, uid=location.feature_id) feature.load(self.context) if gazeteer.fips_codes.has_key( (feature.country_code, feature.adm1)): region_name = gazeteer.fips_codes[(feature.country_code, feature.adm1)] else: region_name = '' full_name = '%s (%s, %s, %s)' % ( feature.name, gazeteer.feature_codes[feature.feature_type], render_capitalized(region_name), render_capitalized( gazeteer.country_codes[feature.country_code])) doc.add( PyLucene.Field('location', unicode(full_name, 'latin-1'), False, True, True)) doc.add( PyLucene.Field('all', unicode(full_name, 'latin-1'), False, True, True)) # Finally, the methodologies for meth in study.methodologies: doc.add( PyLucene.Field('methodology', meth.get_study_type(text=True), False, True, True)) doc.add( PyLucene.Field('all', meth.get_study_type(text=True), False, True, True)) # And each exposure route term for route in meth.get_routes(True): doc.add( PyLucene.Field('exposure_route', route, False, True, True)) doc.add(PyLucene.Field('all', route, False, True, True)) writer.addDocument(doc) if not had_writer: writer.close() except Exception, e: self.logger.error('Failed to index record: %s', e) self.logger.error(traceback.print_exc())
def render_study (cursor, study, pmid): print 'canaryid: %s pmid: %s curator: %s' % \ (study.uid, pmid, study.curator_user_id) bools = [] for bool_attr in [ 'exposures', 'outcomes', 'relationships', 'interspecies', 'exposure_linkage', 'outcome_linkage', 'genomic', ]: if getattr(study, 'has_%s' % bool_attr): print '\t%s:\tX' % bool_attr else: print '\t%s:\t-' % bool_attr if len(study.exposures) > 0: print 'Exposures:' exps = [exp.term for exp in study.exposures] exps.sort() for exp in exps: print '\t' + exp if len(study.outcomes) > 0: print 'Outcomes:' outs = [out.term for out in study.outcomes] outs.sort() for out in outs: print '\t' + out if len(study.species) > 0: print 'Species:' specs = [sp.term for sp in study.species] specs.sort() for sp in specs: print '\t' + sp if len(study.locations) > 0: print 'Locations:' features = [] for loc in study.locations: feature = Feature(loc.feature_id) feature.load(cursor) features.append(feature) feats = [f.name for f in features] feats.sort() for f in feats: print '\t' + f if len(study.methodologies) > 0: print 'Methodologies:' for meth in study.methodologies: study_type = meth.get_study_type(True) if study_type == 'experimental' \ and meth.is_mesocosm: study_type = 'experimental (mesocosm)' elif study_type == 'cohort' \ and meth.is_enclosure: study_type = 'cohort (enclosure)' print ' ~ '.join([ 'study_type', 'N', 'routes', 'sampling', 'controls', 'timing', ]) print ' ~ '.join([ study_type, meth.sample_size, ', '.join(meth.get_routes(True)), meth.get_sampling(True), meth.get_controls(True), meth.get_timing(True), ]) print '\n -=-=-=-=- \n'
def index_record (self, record, writer=None): # field, value, store?, index?, token? try: if not writer: had_writer = False writer = self.context.get_search_index_writer(False) else: had_writer = True study = Study(self.context, record.study_id) self.logger.debug('starting document') doc = PyLucene.Document() # First, we need to create a unique key so we can later delete # if necessary. Will try simply uid for now. doc.add(PyLucene.Field('uid', str(record.uid), True, True, False)) doc.add(PyLucene.Field('all', str(record.uid), True, True, False)) # Second, save internal-use metadata. These should probably # be x'd out at Query-time. doc.add(PyLucene.Field('record-status', str(record.status), False, True, False)) doc.add(PyLucene.Field('article-type', str(study.article_type), False, True, False)) source_catalog = self.context.get_source_catalog() complete_term_map = source_catalog.get_complete_mapping() mapped_metadata = record.get_mapped_metadata(complete_term_map) # First index all the non-multiple metadata fields for field in ('abstract', 'affiliation', 'issn', 'journal', 'pubdate', 'issue', 'pages', 'title', 'volume'): val = mapped_metadata.get(field, None) if val: doc.add(PyLucene.Field(field, val, False, True, True)) doc.add(PyLucene.Field('all', val, False, True, True)) # Be sure to index all of (abbrev, full title, issn) as "journal" issn = mapped_metadata.get('issn') if issn: j = Journal() j.load_from_issn(self.context, issn) no_dash = j.no_dash() self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' % \ (j.journal_title, j.abbreviation, issn)) doc.add(PyLucene.Field('journal', issn, False, True, True)) doc.add(PyLucene.Field('journal', no_dash, False, True, True)) doc.add(PyLucene.Field('all', issn, False, True, True)) doc.add(PyLucene.Field('all', no_dash, False, True, True)) if j.abbreviation: doc.add(PyLucene.Field('journal', j.abbreviation, False, True, True)) doc.add(PyLucene.Field('all', j.abbreviation, False, True, True)) if j.journal_title: 
doc.add(PyLucene.Field('journal', j.journal_title, False, True, True)) doc.add(PyLucene.Field('all', j.journal_title, False, True, True)) # If a page range is given, index the first page, assuming # the delimiter is '-' pages = mapped_metadata.get('pages', None) if pages \ and '-' in pages: first_page = pages[0:pages.index('-')] doc.add(PyLucene.Field('pages', first_page, False, True, True)) doc.add(PyLucene.Field('all', first_page, False, True, True)) # 'unique_identifier' must be specially treated because # of the '_' val = mapped_metadata.get('unique_identifier', None) if val: doc.add(PyLucene.Field('unique-identifier', val, False, True, True)) doc.add(PyLucene.Field('all', val, False, True, True)) # Next, index all the possibly-multiple metadata fields # Give these (especially for author and subject) a little # boost, less than for canary UMLS concepts for field in ('author', 'grantnum', 'keyword', 'registrynum', 'subject'): vals = mapped_metadata.get(field, None) for val in vals: doc.add(PyLucene.Field(field, val, False, True, True)) f = PyLucene.Field('all', val, False, True, True) f.setBoost(1.3) doc.add(f) # If at least one author name is available, index the first # author to support first-author searching. Also, boost it # slightly higher than the other authors. authors = mapped_metadata.get('author', None) if authors: doc.add(PyLucene.Field('first-author', authors[0], False, True, True)) f = PyLucene.Field('all', authors[0], False, True, True) f.setBoost(1.5) doc.add(f) # All the booleans for bool in ('has_outcomes', 'has_exposures', 'has_relationships', 'has_interspecies', 'has_exposure_linkage', 'has_outcome_linkage', 'has_genomic'): val = getattr(study, bool) # NOTE: I think lucene dislikes '_' in field names ?? boolstr = bool.replace('_', '-') doc.add(PyLucene.Field(boolstr, str(int(val)), False, True, False)) # NOTE: no need to add this to 'all'. I think. # Now, all the UMLS concepts. Simpler approach to # lucene "synonym injection", but it works! 
Give it # slightly bigger boost than keywords/subjects for ctype in ('exposures', 'outcomes', 'risk_factors', 'species'): # NOTE: I think lucene dislikes '_' in field names ?? ctype_search = ctype.replace('_', '-') for val in getattr(study, ctype): concept = Concept(self.context, val.concept_id) for syn in concept.synonyms: doc.add(PyLucene.Field(ctype_search, unicode(syn, 'latin-1'), False, True, True)) f = PyLucene.Field('all', unicode(syn, 'latin-1'), False, True, True) f.setBoost(2.0) doc.add(f) # And, the locations gazeteer = self.context.get_gazeteer() locs = [] for location in study.locations: feature = Feature(self.context, uid=location.feature_id) feature.load(self.context) if gazeteer.fips_codes.has_key((feature.country_code, feature.adm1)): region_name = gazeteer.fips_codes[(feature.country_code, feature.adm1)] else: region_name = '' full_name = '%s (%s, %s, %s)' % (feature.name, gazeteer.feature_codes[feature.feature_type], render_capitalized(region_name), render_capitalized(gazeteer.country_codes[feature.country_code])) doc.add(PyLucene.Field('location', unicode(full_name, 'latin-1'), False, True, True)) doc.add(PyLucene.Field('all', unicode(full_name, 'latin-1'), False, True, True)) # Finally, the methodologies for meth in study.methodologies: doc.add(PyLucene.Field('methodology', meth.get_study_type(text=True), False, True, True)) doc.add(PyLucene.Field('all', meth.get_study_type(text=True), False, True, True)) # And each exposure route term for route in meth.get_routes(True): doc.add(PyLucene.Field('exposure_route', route, False, True, True)) doc.add(PyLucene.Field('all', route, False, True, True)) writer.addDocument(doc) if not had_writer: writer.close() except Exception, e: self.logger.error('Failed to index record: %s', e) self.logger.error(traceback.print_exc())