def bioconcept_alias_starter():
    """Generate alias records for GO terms and phenotype observables.

    Three sources are read from the BUD session: GO synonyms, phenotype
    CV-term synonyms (``cv_no == 6``) and phenotype CV-term dbxrefs.  Each
    alias is attached to the id of the matching NEX ``Bioconcept``, looked up
    by ``(format_name, class_type)`` key.

    Yields:
        dict or None: one dict per alias (``display_name``, ``source``,
        ``bioconcept_id``, ``date_created``, ``created_by`` and, for dbxrefs,
        ``category``).  ``None`` is yielded, after printing a message, for
        every source row whose bioconcept is missing -- presumably the
        consumer skips ``None``; confirm against the caller.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    # try/finally so both sessions are released even when the consumer stops
    # iterating early or a row raises; a generator's finally block runs on
    # close()/garbage collection as well as on normal exhaustion.
    try:
        key_to_bioconcept = {x.unique_key(): x for x in nex_session.query(Bioconcept).all()}
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}

        def observable_key(cvterm_name):
            # The CV root term 'observable' is looked up under the key 'ypo'.
            observable = cvterm_name.lower()
            if observable == 'observable':
                observable = 'ypo'
            return (create_format_name(observable), 'OBSERVABLE')

        # GO aliases
        go_query = bud_session.query(Go).options(joinedload('go_gosynonyms'))
        for old_goterm in make_db_starter(go_query, 1000)():
            go_key = (get_go_format_name(old_goterm.go_go_id), 'GO')
            if go_key not in key_to_bioconcept:
                print('Go term not found: ' + str(go_key))
                yield None
                continue
            for go_gosynonym in old_goterm.go_gosynonyms:
                synonym = go_gosynonym.gosynonym
                yield {'display_name': synonym.name,
                       'source': key_to_source['SGD'],
                       'bioconcept_id': key_to_bioconcept[go_key].id,
                       'date_created': synonym.date_created,
                       'created_by': synonym.created_by}

        # Phenotype aliases: synonyms
        synonym_query = bud_session.query(CVTermSynonym).join(CVTerm).filter(CVTerm.cv_no == 6)
        for cvtermsynonym in synonym_query.all():
            phenotype_key = observable_key(cvtermsynonym.cvterm.name)
            if phenotype_key not in key_to_bioconcept:
                print('Phenotype not found: ' + str(phenotype_key))
                yield None
                continue
            yield {'display_name': cvtermsynonym.synonym,
                   'source': key_to_source['SGD'],
                   'bioconcept_id': key_to_bioconcept[phenotype_key].id,
                   'date_created': cvtermsynonym.date_created,
                   'created_by': cvtermsynonym.created_by}

        # Phenotype aliases: external database cross-references
        dbxref_query = (bud_session.query(CVTermDbxref).join(CVTerm)
                        .filter(CVTerm.cv_no == 6).options(joinedload('dbxref')))
        for cvterm_dbxref in dbxref_query.all():
            phenotype_key = observable_key(cvterm_dbxref.cvterm.name)
            if phenotype_key not in key_to_bioconcept:
                print('Phenotype not found: ' + str(phenotype_key))
                yield None
                continue
            dbxref = cvterm_dbxref.dbxref
            yield {'display_name': dbxref.dbxref_id,
                   'source': key_to_source['SGD'],
                   'category': dbxref.dbxref_type,
                   'bioconcept_id': key_to_bioconcept[phenotype_key].id,
                   'date_created': dbxref.date_created,
                   'created_by': dbxref.created_by}
    finally:
        bud_session.close()
        nex_session.close()
def phenotype_starter():
    """Generate phenotype records that pair an observable with a qualifier.

    First every BUD ``Phenotype`` whose observable exists in NEX is emitted.
    Then, for phenotype features whose observable is one of
    ``chemical_phenotypes``, the generic observable name is specialised with
    the experiment's chemical name(s) (joined with ' and ') and a phenotype is
    emitted for that chemical-specific observable when it exists in NEX.

    Yields:
        dict or None: phenotype fields (``source``, ``observable``,
        ``qualifier``, ``date_created``, ``created_by`` and, for
        chemical-specific phenotypes, ``description``).  ``None`` is yielded
        for a phenotype feature that has no experiment or no chemicals.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    # try/finally so the sessions are closed even if iteration is abandoned
    # or a row raises.
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        key_to_observable = {x.unique_key(): x for x in nex_session.query(Observable).all()}

        for bud_obj in bud_session.query(Phenotype).all():
            observable_key = (create_format_name(bud_obj.observable).lower(), 'OBSERVABLE')
            if observable_key in key_to_observable:
                yield {'source': key_to_source['SGD'],
                       'observable': key_to_observable[observable_key],
                       'qualifier': bud_obj.qualifier,
                       'date_created': bud_obj.date_created,
                       'created_by': bud_obj.created_by}

        # generic observable -> (text replaced by the chemical, description prefix)
        chemical_rewrites = {
            'resistance to chemicals':
                ('chemicals', 'The level of resistance to exposure to '),
            'chemical compound accumulation':
                ('chemical compound', 'The production and/or storage of '),
            'chemical compound excretion':
                ('chemical compound', 'The excretion from the cell of '),
        }

        chemical_query = (bud_session.query(PhenotypeFeature)
                          .join(PhenotypeFeature.phenotype)
                          .filter(Phenotype.observable.in_(chemical_phenotypes)))
        for bud_obj in make_db_starter(chemical_query, 1000)():
            experiment = bud_obj.experiment
            if experiment is None or not experiment.chemicals:
                # Nothing to specialise the observable with.  The `continue`
                # is required: without it a missing experiment raised
                # AttributeError on the next line, and an empty chemical list
                # produced a key for a nonsensical observable such as
                # 'resistance to '.
                yield None
                continue

            chemical = ' and '.join([x[0] for x in experiment.chemicals])
            old_observable = bud_obj.phenotype.observable
            if old_observable not in chemical_rewrites:
                continue

            placeholder, description_prefix = chemical_rewrites[old_observable]
            new_observable = old_observable.replace(placeholder, chemical)
            description = description_prefix + chemical + '.'

            observable_key = (create_format_name(new_observable).lower(), 'OBSERVABLE')
            if observable_key in key_to_observable:
                yield {'source': key_to_source['SGD'],
                       'observable': key_to_observable[observable_key],
                       'qualifier': bud_obj.phenotype.qualifier,
                       'description': description,
                       'date_created': bud_obj.date_created,
                       'created_by': bud_obj.created_by}
    finally:
        bud_session.close()
        nex_session.close()
def disambig_starter():
    """Generate disambiguation records for every object of ``cls`` in NEX.

    For each object and each attribute name in ``fields`` one record is
    produced mapping the attribute's string value to the object's id.
    ``cls``, ``fields``, ``class_type`` and ``subclass_type`` come from the
    enclosing scope.

    Rules applied per value:
      * values whose ``str()`` is longer than 200 characters are reported
        with a printed message and skipped;
      * a non-None ``doi`` value is lower-cased and prefixed with ``'doi:'``;
      * ``None`` values are skipped;
      * values for which ``is_number`` is true are skipped, except for the
        ``id`` and ``pubmed_id`` fields.

    Yields:
        dict: ``disambig_key``, ``class_type``, ``subclass_type`` and
        ``identifier`` (the object's id).
    """
    nex_session = nex_session_maker()
    # try/finally so the session is closed even if iteration is abandoned or
    # a row raises.
    try:
        for obj in make_db_starter(nex_session.query(cls), 1000)():
            for field in fields:
                field_value = getattr(obj, field)

                # The length limit is checked on the raw value, before any
                # 'doi:' prefix is added.
                if len(str(field_value)) > 200:
                    print('Value too long: ' + str(field_value))
                    continue

                if field == 'doi' and field_value is not None:
                    field_value = 'doi:' + field_value.lower()

                if field_value is None:
                    continue

                if field in ('id', 'pubmed_id') or not is_number(field_value):
                    yield {'disambig_key': str(field_value),
                           'class_type': class_type,
                           'subclass_type': subclass_type,
                           'identifier': obj.id}
    finally:
        nex_session.close()
def reference_paragraph_starter():
    """Generate one paragraph record per BUD abstract with a known reference.

    Each BUD ``Abstract`` is matched to a NEX ``Reference`` by
    ``reference_id``.  The abstract text is stored both verbatim (``text``)
    and as the result of ``link_gene_names`` (``html``).

    Yields:
        dict or None: ``source``, ``text``, ``html`` and ``reference``; or
        ``None`` (after printing a message) when the reference id is not
        present in NEX.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    # try/finally so the sessions are closed even if iteration is abandoned
    # or a row raises.
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        id_to_reference = {x.id: x for x in nex_session.query(Reference).all()}

        for old_abstract in make_db_starter(bud_session.query(Abstract), 1000)():
            reference_id = old_abstract.reference_id
            if reference_id not in id_to_reference:
                print('Reference not found: ' + str(reference_id))
                yield None
                continue

            yield {
                'source': key_to_source['SGD'],
                'text': old_abstract.text,
                'html': link_gene_names(old_abstract.text, set(), nex_session),
                'reference': id_to_reference[reference_id],
            }
    finally:
        bud_session.close()
        nex_session.close()
def bioentity_url_starter():
    """Generate URL records (external and internal links) for bioentities.

    Three groups of records are produced:
      1. URLs attached directly to a feature (``FeatUrl``), one record per
         web display of the URL;
      2. URLs attached to a feature through a dbxref (``DbxrefFeat``), one
         record per URL and web display;
      3. a fixed set of eight SGD links for every NEX ``Locus``.

    For groups 1 and 2 the ``_SUBSTITUTE_THIS_`` placeholder in the URL
    template is replaced according to the URL type.  Note that
    'query by ID assigned by database' substitutes the bioentity's
    ``format_name`` in group 1 but the dbxref id in group 2.

    Yields:
        dict or None: ``display_name``, ``link``, ``source``, ``category``,
        ``bioentity_id`` (plus ``date_created`` / ``created_by`` for groups 1
        and 2).  ``None`` is yielded when the feature id has no NEX
        bioentity, and once more before the record when the URL type is not
        recognised.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    # try/finally so the sessions are closed even if iteration is abandoned
    # or a row raises.
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        id_to_bioentity = {x.id: x for x in nex_session.query(Bioentity).all()}

        # 1. URLs attached directly to features
        feat_url_query = bud_session.query(FeatUrl).options(joinedload('url'))
        for bud_obj in make_db_starter(feat_url_query, 1000)():
            old_url = bud_obj.url
            url_type = old_url.url_type
            link = old_url.url
            bioentity_id = bud_obj.feature_id
            for old_webdisplay in old_url.displays:
                if bioentity_id not in id_to_bioentity:
                    # No matching bioentity; the not-found message is
                    # disabled for this case.
                    yield None
                    continue

                bioentity = id_to_bioentity[bioentity_id]
                if url_type == 'query by SGDID':
                    link = link.replace('_SUBSTITUTE_THIS_', str(bioentity.sgdid))
                elif url_type in ('query by SGD ORF name with anchor',
                                  'query by SGD ORF name',
                                  'query by ID assigned by database'):
                    link = link.replace('_SUBSTITUTE_THIS_', str(bioentity.format_name))
                else:
                    print("Can't handle this url. " + str(old_url.url_type))
                    yield None
                    # NOTE(review): execution deliberately falls through, as
                    # in the original, so a record with the unsubstituted
                    # link is still emitted -- confirm whether a `continue`
                    # was intended here.

                if old_webdisplay.label_location not in category_mapping:
                    category = None
                else:
                    category = category_mapping[old_webdisplay.label_location]
                yield {'display_name': old_webdisplay.label_name,
                       'link': link,
                       'source': key_to_source[create_format_name(old_url.source)],
                       'category': category,
                       'bioentity_id': bioentity_id,
                       'date_created': old_url.date_created,
                       'created_by': old_url.created_by}

        # 2. URLs attached to features through a dbxref
        dbxref_query = bud_session.query(DbxrefFeat).options(
            joinedload('dbxref'), joinedload('dbxref.dbxref_urls'))
        for bud_obj in make_db_starter(dbxref_query, 1000)():
            old_urls = bud_obj.dbxref.urls
            dbxref_id = bud_obj.dbxref.dbxref_id
            bioentity_id = bud_obj.feature_id
            for old_url in old_urls:
                for old_webdisplay in old_url.displays:
                    if bioentity_id not in id_to_bioentity:
                        # No matching bioentity; the not-found message is
                        # disabled for this case.
                        yield None
                        continue

                    bioentity = id_to_bioentity[bioentity_id]
                    url_type = old_url.url_type
                    link = old_url.url
                    if url_type in ('query by SGD ORF name with anchor',
                                    'query by SGD ORF name'):
                        link = link.replace('_SUBSTITUTE_THIS_', bioentity.format_name)
                    elif url_type == 'query by ID assigned by database':
                        link = link.replace('_SUBSTITUTE_THIS_', str(dbxref_id))
                    elif url_type == 'query by SGDID':
                        link = link.replace('_SUBSTITUTE_THIS_', bioentity.sgdid)
                    else:
                        print("Can't handle this url. " + str(old_url.url_type))
                        yield None
                        # NOTE(review): falls through as in the original; a
                        # record with the unsubstituted link follows.

                    if old_webdisplay.label_location not in category_mapping:
                        category = None
                    else:
                        category = category_mapping[old_webdisplay.label_location]
                    yield {'display_name': old_webdisplay.label_name,
                           'link': link,
                           'source': key_to_source[create_format_name(old_url.source)],
                           'category': category,
                           'bioentity_id': bioentity_id,
                           'date_created': old_url.date_created,
                           'created_by': old_url.created_by}

        # 3. Fixed SGD links emitted for every locus
        for locus in nex_session.query(Locus).all():
            yield {'display_name': 'SPELL',
                   'link': 'http://spell.yeastgenome.org/search/show_results?search_string=' + locus.format_name,
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_EXPRESSION',
                   'bioentity_id': locus.id}
            yield {'display_name': 'Gene/Sequence Resources',
                   'link': '/cgi-bin/seqTools?back=1&seqname=' + locus.format_name,
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_SEQUENCE',
                   'bioentity_id': locus.id}
            yield {'display_name': 'ORF Map',
                   'link': '/cgi-bin/ORFMAP/ORFmap?dbid=' + locus.sgdid,
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_SEQUENCE',
                   'bioentity_id': locus.id}
            yield {'display_name': 'GBrowse',
                   'link': 'http://browse.yeastgenome.org/fgb2/gbrowse/scgenome/?name=' + locus.format_name,
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_SEQUENCE',
                   'bioentity_id': locus.id}
            yield {'display_name': 'BLASTN',
                   'link': '/cgi-bin/blast-sgd.pl?name=' + locus.format_name,
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_SEQUENCE_SECTION',
                   'bioentity_id': locus.id}
            yield {'display_name': 'BLASTP',
                   'link': '/cgi-bin/blast-sgd.pl?name=' + locus.format_name + '&suffix=prot',
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_SEQUENCE_SECTION',
                   'bioentity_id': locus.id}
            yield {'display_name': 'Variant Viewer',
                   'link': '/variant-viewer#/' + locus.sgdid,
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_SEQUENCE_OTHER_STRAINS',
                   'bioentity_id': locus.id}
            yield {'display_name': 'Yeast Phenotype Ontology',
                   'link': '/ontology/phenotype/ypo/overview',
                   'source': key_to_source['SGD'],
                   'category': 'LOCUS_PHENOTYPE_ONTOLOGY',
                   'bioentity_id': locus.id}
    finally:
        bud_session.close()
        nex_session.close()
def bioconcept_relation_starter():
    """Generate parent/child relation records between bioconcepts.

    Five groups of relations are produced, in this order:
      1. direct GO relations (``GoPath`` rows with ``generation == 1``);
      2. ``GO_SLIM`` relations linking each child to every 'Yeast GO-Slim'
         term found among its ancestors, walking the 'is a' relations
         already stored in NEX;
      3. phenotype ontology relations from ``CVTermRel``;
      4. 'is a' relations from each chemical-specific observable to its
         generic observable (for observables in ``chemical_phenotypes``);
      5. ``PHENOTYPE_SLIM`` relations from each NEX phenotype to the nearest
         ancestor observable listed in the phenotype slim set.

    Yields:
        dict or None: ``source``, ``relation_type``, ``parent_id``,
        ``child_id`` (plus ``date_created`` / ``created_by`` for group 3).
        ``None`` is yielded, after printing a message, when an endpoint of a
        group 1 or group 4 relation cannot be found, and silently when a
        group 4 phenotype feature has no experiment.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    # try/finally so the sessions are closed even if iteration is abandoned
    # or a row raises.
    try:
        key_to_bioconcept = {x.unique_key(): x for x in nex_session.query(Bioconcept).all()}
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}

        # 1. GO relations
        gopath_query = (bud_session.query(GoPath)
                        .filter(GoPath.generation == 1)
                        .options(joinedload('child'), joinedload('ancestor')))
        for gopath in make_db_starter(gopath_query, 1000)():
            parent_key = (get_go_format_name(gopath.ancestor.go_go_id), 'GO')
            child_key = (get_go_format_name(gopath.child.go_go_id), 'GO')
            if parent_key in key_to_bioconcept and child_key in key_to_bioconcept:
                yield {'source': key_to_source['SGD'],
                       'relation_type': gopath.relationship_type,
                       'parent_id': key_to_bioconcept[parent_key].id,
                       'child_id': key_to_bioconcept[child_key].id}
            else:
                print('Could not find go. Parent: ' + str(parent_key) + ' Child: ' + str(child_key))
                yield None

        # Collect the ids of the GO-Slim terms.  Three GO ids are excluded
        # explicitly; the message below is printed for them as well as for
        # terms that are genuinely missing from NEX.
        excluded_go_ids = ('GO:0008150', 'GO:0003674', 'GO:0005575')
        old_gosets = (bud_session.query(GoSet)
                      .filter(GoSet.name == 'Yeast GO-Slim')
                      .options(joinedload('go')).all())
        slim_ids = set()
        for old_goset in old_gosets:
            go_key = (get_go_format_name(old_goset.go.go_go_id), 'GO')
            if go_key[0] not in excluded_go_ids and go_key in key_to_bioconcept:
                slim_ids.add(key_to_bioconcept[go_key].id)
            else:
                print('GO term not found: ' + str(go_key))

        # 2. GO Slim: group direct 'is a' parents by child ...
        go_child_id_to_parent_ids = {}
        is_a_relations = nex_session.query(Bioconceptrelation).filter(
            Bioconceptrelation.relation_type == 'is a')
        for go_relation in is_a_relations:
            go_child_id_to_parent_ids.setdefault(go_relation.child_id, []).append(go_relation.parent_id)

        # ... then climb level by level, emitting a relation whenever an
        # ancestor is a slim term.  The climb continues past slim terms, and
        # an ancestor reachable at several depths is emitted once per depth.
        for child_id in go_child_id_to_parent_ids:
            parent_ids = go_child_id_to_parent_ids[child_id]
            while len(parent_ids) > 0:
                new_parent_ids = set()
                for parent_id in parent_ids:
                    if parent_id in slim_ids:
                        yield {'source': key_to_source['SGD'],
                               'parent_id': parent_id,
                               'child_id': child_id,
                               'relation_type': 'GO_SLIM'}
                    if parent_id in go_child_id_to_parent_ids:
                        new_parent_ids.update(go_child_id_to_parent_ids[parent_id])
                parent_ids = new_parent_ids

        # 3. Phenotype relations
        cvtermrel_query = bud_session.query(CVTermRel).options(
            joinedload('child'), joinedload('parent'))
        for cvtermrel in cvtermrel_query.all():
            parent_key = (create_format_name(cvtermrel.parent.name.lower()), 'OBSERVABLE')
            child_key = (create_format_name(cvtermrel.child.name.lower()), 'OBSERVABLE')
            # The root term 'observable' is looked up under the key 'ypo'.
            if parent_key == ('observable', 'OBSERVABLE'):
                parent_key = ('ypo', 'OBSERVABLE')
            if parent_key in key_to_bioconcept and child_key in key_to_bioconcept:
                yield {'source': key_to_source['SGD'],
                       'relation_type': cvtermrel.relationship_type,
                       'parent_id': key_to_bioconcept[parent_key].id,
                       'child_id': key_to_bioconcept[child_key].id,
                       'date_created': cvtermrel.date_created,
                       'created_by': cvtermrel.created_by}

        # 4. Chemical-specific observable 'is a' generic observable
        old_phenotype_query = (bud_session.query(OldPhenotype)
                               .filter(OldPhenotype.observable.in_(chemical_phenotypes))
                               .options(joinedload('phenotype_features'),
                                        joinedload('phenotype_features.experiment')))
        for old_phenotype in make_db_starter(old_phenotype_query, 1000)():
            for phenotype_feature in old_phenotype.phenotype_features:
                experiment = phenotype_feature.experiment
                if experiment is None:
                    # Without an experiment there are no chemicals to build
                    # the child observable from; previously this raised
                    # AttributeError on `None.chemicals`.
                    yield None
                    continue

                chemical = ' and '.join([x[0] for x in experiment.chemicals])
                old_observable = old_phenotype.observable
                if old_observable == 'resistance to chemicals':
                    new_observable = old_observable.replace('chemicals', chemical)
                else:
                    new_observable = old_observable.replace('chemical compound', chemical)

                parent_key = (create_format_name(old_observable.lower()), 'OBSERVABLE')
                child_key = (create_format_name(new_observable.lower()), 'OBSERVABLE')

                if parent_key in key_to_bioconcept and child_key in key_to_bioconcept:
                    yield {'source': key_to_source['SGD'],
                           'relation_type': 'is a',
                           'parent_id': key_to_bioconcept[parent_key].id,
                           'child_id': key_to_bioconcept[child_key].id}
                else:
                    print('Could not find phenotype. Parent: ' + str(parent_key) + ' Child: ' + str(child_key))
                    yield None

        # 5. Phenotype Slim
        phenotype_slim = {'cell_death', 'chromosome-plasmid_maintenance', 'intracellular_transport',
                          'mitotic_cell_cycle', 'prion_state', 'stress_resistance', 'budding',
                          'filamentous_growth', 'lifespan', 'sexual_cycle', 'viable', 'inviable',
                          'competitive_fitness', 'viability', 'haploinsufficient', 'haploproficient',
                          'metabolism_and_growth', 'cellular_morphology', 'culture_appearance', 'ypo'}
        for phenotype in nex_session.query(Phenotype).all():
            # Follow the first parent link upwards until a slim observable
            # is reached or the chain ends.
            ancestor = phenotype.observable
            while ancestor is not None and ancestor.format_name not in phenotype_slim:
                if len(ancestor.parents) > 0:
                    ancestor = ancestor.parents[0].parent
                else:
                    ancestor = None
            if ancestor is not None:
                yield {'source': key_to_source['SGD'],
                       'parent_id': ancestor.id,
                       'child_id': phenotype.id,
                       'relation_type': 'PHENOTYPE_SLIM'}
    finally:
        bud_session.close()
        nex_session.close()
def observable_starter():
    """Generate observable records from the BUD phenotype CV terms.

    Every CV term with ``cv_no == 6`` becomes an observable.  Its
    ``ancestor_type`` is the name of the ancestor sitting directly below the
    top of its parent chain; for the top term and its direct children it is
    the term's own name.  In addition, for phenotype features whose
    observable is in ``chemical_phenotypes``, a chemical-specific observable
    is generated by substituting the experiment's chemical name(s) into the
    generic observable name.

    Yields:
        dict or None: ``source``, ``description``, ``display_name``,
        ``ancestor_type``, ``date_created`` and ``created_by``.  ``None`` is
        yielded for a phenotype feature without an experiment or chemicals,
        and (followed by the record itself) when a CV term has no ancestor
        type.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    # try/finally so the sessions are closed even if iteration is abandoned
    # or a row raises.
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}

        old_cvterms = bud_session.query(CVTerm).filter(CVTerm.cv_no == 6).all()

        child_id_to_parent_id = {x.child_id: x.parent_id for x in bud_session.query(CVTermRel).all()}
        id_to_observable = {x.id: x.name for x in old_cvterms}

        observable_to_ancestor = {}
        for old_obj in old_cvterms:
            # ancestry is [term, parent, grandparent, ..., top term, None]:
            # the chain of parent ids terminated by a None sentinel.
            ancestry = [old_obj.id, child_id_to_parent_id.get(old_obj.id)]
            while ancestry[-1] is not None:
                ancestry.append(child_id_to_parent_id.get(ancestry[-1]))

            if len(ancestry) > 2:
                # ancestry[-2] is the top term, so ancestry[-3] is the
                # element directly below it (the term itself when the term
                # is a direct child of the top).
                ancestor_id = ancestry[-3]
            else:
                # The term has no parent: it is its own ancestor type.
                ancestor_id = ancestry[0]
            observable_to_ancestor[old_obj.name] = id_to_observable[ancestor_id]

        # The already-loaded CV terms are reused instead of issuing the
        # identical query a second time.
        for bud_obj in old_cvterms:
            observable = bud_obj.name
            source = key_to_source['SGD']
            ancestor_type = observable_to_ancestor.get(observable)
            if ancestor_type is None:
                print('No ancestor type: ' + str(observable))
                yield None
                # NOTE(review): falls through as in the original, so the
                # record is still emitted with ancestor_type None -- confirm
                # whether a `continue` was intended.
            description = bud_obj.definition
            if observable == 'observable':
                description = 'Features of Saccharomyces cerevisiae cells, cultures, or colonies that can be detected, observed, measured, or monitored.'
            yield {'source': source,
                   'description': description,
                   'display_name': observable,
                   'ancestor_type': ancestor_type,
                   'date_created': bud_obj.date_created,
                   'created_by': bud_obj.created_by}

        # generic observable -> (text replaced by the chemical, description prefix)
        chemical_rewrites = {
            'resistance to chemicals':
                ('chemicals', 'The level of resistance to exposure to '),
            'chemical compound accumulation':
                ('chemical compound', 'The production and/or storage of '),
            'chemical compound excretion':
                ('chemical compound', 'The excretion from the cell of '),
        }

        chemical_query = (bud_session.query(PhenotypeFeature)
                          .join(PhenotypeFeature.phenotype)
                          .filter(Phenotype.observable.in_(chemical_phenotypes)))
        for bud_obj in make_db_starter(chemical_query, 1000)():
            experiment = bud_obj.experiment
            if experiment is None or not experiment.chemicals:
                # Nothing to specialise the observable with.  The `continue`
                # is required: without it a missing experiment raised
                # AttributeError on the next line, and an empty chemical list
                # produced a nonsensical observable such as 'resistance to '.
                yield None
                continue

            chemical = ' and '.join([x[0] for x in experiment.chemicals])
            source = key_to_source['SGD']
            old_observable = bud_obj.phenotype.observable
            if old_observable not in chemical_rewrites:
                continue

            placeholder, description_prefix = chemical_rewrites[old_observable]
            new_observable = old_observable.replace(placeholder, chemical)
            description = description_prefix + chemical + '.'

            # The chemical-specific observable inherits the ancestor type of
            # the generic observable it was derived from.
            ancestor_type = observable_to_ancestor.get(old_observable)
            yield {'source': source,
                   'description': description,
                   'display_name': new_observable,
                   'ancestor_type': ancestor_type,
                   'date_created': bud_obj.date_created,
                   'created_by': bud_obj.created_by}
    finally:
        bud_session.close()
        nex_session.close()