Beispiel #1
0
    # Dump one tab-separated line per study location for every curated
    # record in uid range [0, 4000).  Records that fail to load or are
    # not fully curated are skipped (best-effort export).
    context = canary.context.Context()
    for i in range(4000):
        try:
            rec = QueuedRecord(context, i)

            # BUGFIX: `raise 'ValueError'` was a string exception, which
            # is a TypeError in Python 2.6+; raise the real exception.
            if not rec or rec.status != rec.STATUS_CURATED:
                raise ValueError('record %s is not curated' % i)

            study = Study(context, rec.study_id)
            if study.status != study.STATUS_TYPES['curated'] \
                or not (study.ARTICLE_TYPES['traditional']
                        <= study.article_type
                        <= study.ARTICLE_TYPES['curated']):
                raise ValueError('study %s is not curated' % rec.study_id)

            for loc in study.locations:
                out = [loc.uid, loc.study_id, loc.feature_id]
                feature = Feature(uid=loc.feature_id)
                feature.load(context)
                out.extend((feature.latitude, feature.longitude,
                    feature.name, feature.feature_type, feature.country_code))
                # BUGFIX: write inside the location loop; the original
                # wrote once after all loops, emitting only the very
                # last row built.
                out_file.write('\t'.join([str(s) for s in out]) + '\n')

        except Exception:
            # Deliberate best-effort: skip any record that fails.
            continue

    out_file.close()
Beispiel #2
0
    search_index = SearchIndex(context)
    hit_list = []
    hits, searcher = search_index.search(query_str)
    for i, doc in hits:
        hit_list.append(doc.get('uid'))
    searcher.close()

    output = []
    for id in hit_list:
        rec = QueuedRecord(context, int(id))
        if options.locations:
            study = Study(context, rec.study_id)
            for loc in study.locations:
                out = []
                out.extend((id, loc.uid, loc.study_id, loc.feature_id))
                feature = Feature(uid=loc.feature_id)
                feature.load(context)
                out.extend((feature.latitude, feature.longitude, feature.name,
                            feature.feature_type, feature.country_code))
                output.append('\t'.join([str(v) for v in out]))
        else:
            mm = rec.get_mapped_metadata(ctm)
            if mm['author']:
                first_author = mm['author'][0]
            else:
                first_author = '-'
            output.append('\t'.join(
                (str(rec.uid), rec.title, first_author, rec.source)))

    print '\n'.join(output)
Beispiel #3
0
    def index_record(self, record, writer=None):
        # field, value, store?, index?, token?
        try:
            if not writer:
                had_writer = False
                writer = self.context.get_search_index_writer(False)
            else:
                had_writer = True

            study = Study(self.context, record.study_id)

            self.logger.debug('starting document')
            doc = PyLucene.Document()

            # First, we need to create a unique key so we can later delete
            # if necessary.  Will try simply uid for now.
            doc.add(PyLucene.Field('uid', str(record.uid), True, True, False))
            doc.add(PyLucene.Field('all', str(record.uid), True, True, False))

            # Second, save internal-use metadata.  These should probably
            # be x'd out at Query-time.
            doc.add(
                PyLucene.Field('record-status', str(record.status), False,
                               True, False))
            doc.add(
                PyLucene.Field('article-type', str(study.article_type), False,
                               True, False))

            source_catalog = self.context.get_source_catalog()
            complete_term_map = source_catalog.get_complete_mapping()
            mapped_metadata = record.get_mapped_metadata(complete_term_map)

            # First index all the non-multiple metadata fields
            for field in ('abstract', 'affiliation', 'issn', 'journal',
                          'pubdate', 'issue', 'pages', 'title', 'volume'):
                val = mapped_metadata.get(field, None)
                if val:
                    doc.add(PyLucene.Field(field, val, False, True, True))
                    doc.add(PyLucene.Field('all', val, False, True, True))

            # Be sure to index all of (abbrev, full title, issn) as "journal"
            issn = mapped_metadata.get('issn')
            if issn:
                j = Journal()
                j.load_from_issn(self.context, issn)
                no_dash = j.no_dash()
                self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' % \
                    (j.journal_title, j.abbreviation, issn))
                doc.add(PyLucene.Field('journal', issn, False, True, True))
                doc.add(PyLucene.Field('journal', no_dash, False, True, True))
                doc.add(PyLucene.Field('all', issn, False, True, True))
                doc.add(PyLucene.Field('all', no_dash, False, True, True))
                if j.abbreviation:
                    doc.add(
                        PyLucene.Field('journal', j.abbreviation, False, True,
                                       True))
                    doc.add(
                        PyLucene.Field('all', j.abbreviation, False, True,
                                       True))
                if j.journal_title:
                    doc.add(
                        PyLucene.Field('journal', j.journal_title, False, True,
                                       True))
                    doc.add(
                        PyLucene.Field('all', j.journal_title, False, True,
                                       True))

            # If a page range is given, index the first page, assuming
            # the delimiter is '-'
            pages = mapped_metadata.get('pages', None)
            if pages \
                and '-' in pages:
                first_page = pages[0:pages.index('-')]
                doc.add(PyLucene.Field('pages', first_page, False, True, True))
                doc.add(PyLucene.Field('all', first_page, False, True, True))

            # 'unique_identifier' must be specially treated because
            # of the '_'
            val = mapped_metadata.get('unique_identifier', None)
            if val:
                doc.add(
                    PyLucene.Field('unique-identifier', val, False, True,
                                   True))
                doc.add(PyLucene.Field('all', val, False, True, True))

            # Next, index all the possibly-multiple metadata fields
            # Give these (especially for author and subject) a little
            # boost, less than for canary UMLS concepts
            for field in ('author', 'grantnum', 'keyword', 'registrynum',
                          'subject'):
                vals = mapped_metadata.get(field, None)
                for val in vals:
                    doc.add(PyLucene.Field(field, val, False, True, True))
                    f = PyLucene.Field('all', val, False, True, True)
                    f.setBoost(1.3)
                    doc.add(f)

            # If at least one author name is available, index the first
            # author to support first-author searching.  Also, boost it
            # slightly higher than the other authors.
            authors = mapped_metadata.get('author', None)
            if authors:
                doc.add(
                    PyLucene.Field('first-author', authors[0], False, True,
                                   True))
                f = PyLucene.Field('all', authors[0], False, True, True)
                f.setBoost(1.5)
                doc.add(f)

            # All the booleans
            for bool in ('has_outcomes', 'has_exposures', 'has_relationships',
                         'has_interspecies', 'has_exposure_linkage',
                         'has_outcome_linkage', 'has_genomic'):
                val = getattr(study, bool)
                # NOTE: I think lucene dislikes '_' in field names ??
                boolstr = bool.replace('_', '-')
                doc.add(
                    PyLucene.Field(boolstr, str(int(val)), False, True, False))
                # NOTE: no need to add this to 'all'.  I think.

            # Now, all the UMLS concepts.  Simpler approach to
            # lucene "synonym injection", but it works!  Give it
            # slightly bigger boost than keywords/subjects
            for ctype in ('exposures', 'outcomes', 'risk_factors', 'species'):
                # NOTE: I think lucene dislikes '_' in field names ??
                ctype_search = ctype.replace('_', '-')
                for val in getattr(study, ctype):
                    concept = Concept(self.context, val.concept_id)
                    for syn in concept.synonyms:
                        doc.add(
                            PyLucene.Field(ctype_search,
                                           unicode(syn, 'latin-1'), False,
                                           True, True))
                        f = PyLucene.Field('all', unicode(syn, 'latin-1'),
                                           False, True, True)
                        f.setBoost(2.0)
                        doc.add(f)

            # And, the locations
            gazeteer = self.context.get_gazeteer()
            locs = []
            for location in study.locations:
                feature = Feature(self.context, uid=location.feature_id)
                feature.load(self.context)
                if gazeteer.fips_codes.has_key(
                    (feature.country_code, feature.adm1)):
                    region_name = gazeteer.fips_codes[(feature.country_code,
                                                       feature.adm1)]
                else:
                    region_name = ''
                full_name = '%s (%s, %s, %s)' % (
                    feature.name, gazeteer.feature_codes[feature.feature_type],
                    render_capitalized(region_name),
                    render_capitalized(
                        gazeteer.country_codes[feature.country_code]))
                doc.add(
                    PyLucene.Field('location', unicode(full_name, 'latin-1'),
                                   False, True, True))
                doc.add(
                    PyLucene.Field('all', unicode(full_name, 'latin-1'), False,
                                   True, True))

            # Finally, the methodologies
            for meth in study.methodologies:
                doc.add(
                    PyLucene.Field('methodology',
                                   meth.get_study_type(text=True), False, True,
                                   True))
                doc.add(
                    PyLucene.Field('all', meth.get_study_type(text=True),
                                   False, True, True))
                # And each exposure route term
                for route in meth.get_routes(True):
                    doc.add(
                        PyLucene.Field('exposure_route', route, False, True,
                                       True))
                    doc.add(PyLucene.Field('all', route, False, True, True))

            writer.addDocument(doc)
            if not had_writer:
                writer.close()
        except Exception, e:
            self.logger.error('Failed to index record: %s', e)
            self.logger.error(traceback.print_exc())
Beispiel #4
0
def render_study (cursor, study, pmid):
    print 'canaryid: %s  pmid: %s  curator: %s' % \
        (study.uid, pmid, study.curator_user_id)
    bools = []
    for bool_attr in [
        'exposures',
        'outcomes', 
        'relationships',
        'interspecies',
        'exposure_linkage',
        'outcome_linkage',
        'genomic',
        ]:
        if getattr(study, 'has_%s' % bool_attr):
            print '\t%s:\tX' % bool_attr
        else:
            print '\t%s:\t-' % bool_attr
    
    if len(study.exposures) > 0:
        print 'Exposures:'
        exps = [exp.term for exp in study.exposures]
        exps.sort()
        for exp in exps:
            print '\t' + exp

    if len(study.outcomes) > 0:
        print 'Outcomes:'
        outs = [out.term for out in study.outcomes]
        outs.sort()
        for out in outs:
            print '\t' + out
    
    if len(study.species) > 0:
        print 'Species:'
        specs = [sp.term for sp in study.species]
        specs.sort()
        for sp in specs:
            print '\t' + sp
     
    if len(study.locations) > 0:
        print 'Locations:'
        features = []
        for loc in study.locations:
            feature = Feature(loc.feature_id)
            feature.load(cursor)
            features.append(feature)
        feats = [f.name for f in features]
        feats.sort()
        for f in feats:
            print '\t' + f
    
    if len(study.methodologies) > 0:
        
        print 'Methodologies:'
        for meth in study.methodologies:
            study_type = meth.get_study_type(True)
            if study_type == 'experimental' \
                and meth.is_mesocosm:
                study_type = 'experimental (mesocosm)'
            elif study_type == 'cohort' \
                and meth.is_enclosure:
                study_type = 'cohort (enclosure)'
                
            print ' ~ '.join([
                'study_type',
                'N',
                'routes',
                'sampling',
                'controls',
                'timing',
                ])
                
            print ' ~ '.join([
                study_type, 
                meth.sample_size, 
                ', '.join(meth.get_routes(True)),
                meth.get_sampling(True), 
                meth.get_controls(True),
                meth.get_timing(True),
                ])
    
    print '\n -=-=-=-=- \n'
Beispiel #5
0
    def index_record (self, record, writer=None):
        # field, value, store?, index?, token?
        try:
            if not writer:
                had_writer = False
                writer = self.context.get_search_index_writer(False)
            else:
                had_writer = True
            
            study = Study(self.context, record.study_id)
            
            self.logger.debug('starting document')
            doc = PyLucene.Document()
            
            # First, we need to create a unique key so we can later delete
            # if necessary.  Will try simply uid for now.
            doc.add(PyLucene.Field('uid', str(record.uid),
                True, True, False))
            doc.add(PyLucene.Field('all', str(record.uid),
                True, True, False))
            
            # Second, save internal-use metadata.  These should probably
            # be x'd out at Query-time.
            doc.add(PyLucene.Field('record-status', str(record.status),
                False, True, False))
            doc.add(PyLucene.Field('article-type', str(study.article_type),
                False, True, False))
            
            source_catalog = self.context.get_source_catalog()
            complete_term_map = source_catalog.get_complete_mapping()
            mapped_metadata = record.get_mapped_metadata(complete_term_map)
            
            # First index all the non-multiple metadata fields
            for field in ('abstract', 'affiliation', 'issn', 
                'journal', 'pubdate', 'issue', 'pages', 'title', 
                'volume'):
                val = mapped_metadata.get(field, None)
                if val:
                    doc.add(PyLucene.Field(field, val,
                        False, True, True))
                    doc.add(PyLucene.Field('all', val,
                        False, True, True))
            
            # Be sure to index all of (abbrev, full title, issn) as "journal"
            issn = mapped_metadata.get('issn')
            if issn:
                j = Journal()
                j.load_from_issn(self.context, issn)
                no_dash = j.no_dash()
                self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' % \
                    (j.journal_title, j.abbreviation, issn))
                doc.add(PyLucene.Field('journal', issn,
                    False, True, True))
                doc.add(PyLucene.Field('journal', no_dash,
                    False, True, True))
                doc.add(PyLucene.Field('all', issn,
                    False, True, True))
                doc.add(PyLucene.Field('all', no_dash,
                    False, True, True))
                if j.abbreviation:
                    doc.add(PyLucene.Field('journal', j.abbreviation,
                        False, True, True))
                    doc.add(PyLucene.Field('all', j.abbreviation,
                        False, True, True))
                if j.journal_title:
                    doc.add(PyLucene.Field('journal', j.journal_title,
                        False, True, True))
                    doc.add(PyLucene.Field('all', j.journal_title,
                        False, True, True))
                
            
            # If a page range is given, index the first page, assuming
            # the delimiter is '-'
            pages = mapped_metadata.get('pages', None)
            if pages \
                and '-' in pages:
                first_page = pages[0:pages.index('-')]
                doc.add(PyLucene.Field('pages', first_page,
                    False, True, True))
                doc.add(PyLucene.Field('all', first_page,
                    False, True, True))
            
            
            # 'unique_identifier' must be specially treated because 
            # of the '_'
            val = mapped_metadata.get('unique_identifier', None)
            if val:
                doc.add(PyLucene.Field('unique-identifier', val,
                    False, True, True))
                doc.add(PyLucene.Field('all', val,
                    False, True, True))
            
            # Next, index all the possibly-multiple metadata fields
            # Give these (especially for author and subject) a little
            # boost, less than for canary UMLS concepts
            for field in ('author', 'grantnum', 'keyword', 'registrynum', 
                'subject'):
                vals = mapped_metadata.get(field, None)
                for val in vals:
                    doc.add(PyLucene.Field(field, val,
                        False, True, True))
                    f = PyLucene.Field('all', val,
                        False, True, True)
                    f.setBoost(1.3)
                    doc.add(f)
            
            # If at least one author name is available, index the first
            # author to support first-author searching.  Also, boost it
            # slightly higher than the other authors.
            authors = mapped_metadata.get('author', None)
            if authors:
                doc.add(PyLucene.Field('first-author', authors[0],
                    False, True, True))
                f = PyLucene.Field('all', authors[0],
                    False, True, True)
                f.setBoost(1.5)
                doc.add(f)

            
            # All the booleans
            for bool in ('has_outcomes', 'has_exposures', 
                'has_relationships', 'has_interspecies', 
                'has_exposure_linkage', 'has_outcome_linkage', 
                'has_genomic'):
                val = getattr(study, bool)
                # NOTE: I think lucene dislikes '_' in field names ??
                boolstr = bool.replace('_', '-')
                doc.add(PyLucene.Field(boolstr, str(int(val)),
                    False, True, False))
                # NOTE: no need to add this to 'all'.  I think.
            
            # Now, all the UMLS concepts.  Simpler approach to
            # lucene "synonym injection", but it works!  Give it
            # slightly bigger boost than keywords/subjects
            for ctype in ('exposures', 'outcomes', 'risk_factors',
                'species'):
                # NOTE: I think lucene dislikes '_' in field names ??
                ctype_search = ctype.replace('_', '-')
                for val in getattr(study, ctype):
                    concept = Concept(self.context, val.concept_id)
                    for syn in concept.synonyms:
                        doc.add(PyLucene.Field(ctype_search, 
                            unicode(syn, 'latin-1'),
                            False, True, True))
                        f = PyLucene.Field('all', unicode(syn, 'latin-1'),
                            False, True, True)
                        f.setBoost(2.0)
                        doc.add(f)

            # And, the locations
            gazeteer = self.context.get_gazeteer()
            locs = []
            for location in study.locations:
                feature = Feature(self.context, uid=location.feature_id)
                feature.load(self.context)
                if gazeteer.fips_codes.has_key((feature.country_code, feature.adm1)):
                    region_name = gazeteer.fips_codes[(feature.country_code, feature.adm1)]
                else:
                    region_name = ''
                full_name = '%s (%s, %s, %s)' % (feature.name, 
                    gazeteer.feature_codes[feature.feature_type],
                    render_capitalized(region_name), 
                    render_capitalized(gazeteer.country_codes[feature.country_code]))
                doc.add(PyLucene.Field('location', unicode(full_name, 'latin-1'),
                    False, True, True))
                doc.add(PyLucene.Field('all', unicode(full_name, 'latin-1'),
                    False, True, True))
                
            # Finally, the methodologies
            for meth in study.methodologies:
                doc.add(PyLucene.Field('methodology', 
                    meth.get_study_type(text=True),
                    False, True, True))
                doc.add(PyLucene.Field('all', 
                    meth.get_study_type(text=True),
                    False, True, True))
                # And each exposure route term
                for route in meth.get_routes(True):
                    doc.add(PyLucene.Field('exposure_route',
                        route, False, True, True))
                    doc.add(PyLucene.Field('all',
                        route, False, True, True))
            
            writer.addDocument(doc)
            if not had_writer:
                writer.close()
        except Exception, e:
            self.logger.error('Failed to index record: %s', e)
            self.logger.error(traceback.print_exc())