def bioitem_relation_starter():
    """Yield parent/child relation dicts between chemicals.

    Every ``CVTermRel`` row from the BUD session is considered. A relation is
    emitted only when both the parent and the child term resolve to a chemical
    already present in the NEX session; other rows are skipped silently.
    Both sessions are closed once all rows have been consumed.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_chemical = dict((chem.unique_key(), chem) for chem in nex_session.query(Chemical).all())
    key_to_source = dict((src.unique_key(), src) for src in nex_session.query(Source).all())

    relations = bud_session.query(CVTermRel).options(joinedload('child'), joinedload('parent')).all()
    for relation in relations:
        # Chemical keys are (format name truncated to 95 characters, 'CHEMICAL').
        parent_key = (create_format_name(relation.parent.name)[:95], 'CHEMICAL')
        child_key = (create_format_name(relation.child.name)[:95], 'CHEMICAL')

        if parent_key not in key_to_chemical or child_key not in key_to_chemical:
            continue

        yield {
            'source': key_to_source['SGD'],
            'relation_type': relation.relationship_type,
            'parent_id': key_to_chemical[parent_key].id,
            'child_id': key_to_chemical[child_key].id,
            'date_created': relation.date_created,
            'created_by': relation.created_by,
        }

    bud_session.close()
    nex_session.close()
def experiment_relation_starter():
    """Yield parent/child relation dicts between experiments.

    Walks every ``CVTerm`` with ``cv_no == 7`` and each of its parent
    relations. A relation is emitted when both the parent and the child format
    names are known experiment keys; otherwise a message naming both keys is
    printed and the pair is skipped. Both sessions are closed at the end.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_experiment = dict((exp.unique_key(), exp) for exp in nex_session.query(Experiment).all())
    key_to_source = dict((src.unique_key(), src) for src in nex_session.query(Source).all())

    cv_terms = bud_session.query(CVTerm).filter(CVTerm.cv_no==7).options(
        joinedload('parent_rels'), joinedload('parent_rels.parent')).all()
    for cv_term in cv_terms:
        child_key = create_format_name(cv_term.name)

        for rel in cv_term.parent_rels:
            parent_key = create_format_name(rel.parent.name)

            if parent_key not in key_to_experiment or child_key not in key_to_experiment:
                print('Experiment does not exist: ' + str(parent_key) + ' ' + str(child_key))
                continue

            yield {
                'source': key_to_source['SGD'],
                'parent_id': key_to_experiment[parent_key].id,
                'child_id': key_to_experiment[child_key].id,
                'date_created': rel.date_created,
                'created_by': rel.created_by,
            }

    bud_session.close()
    nex_session.close()
def bioitem_tag_starter():
    """Yield ``{'bioitem': dataset, 'tag': tag}`` pairs from the SPELL tag file.

    For each row, column 1 (stripped, minus its last four characters) names
    the dataset and column 2 holds a ``|``-separated list of tag names. Empty
    tag names are ignored. A dataset or tag missing from the lookup tables
    raises ``KeyError``.
    """
    nex_session = nex_session_maker()

    key_to_dataset = dict((ds.unique_key(), ds) for ds in nex_session.query(Dataset).all())
    key_to_tag = dict((tag.unique_key(), tag) for tag in nex_session.query(Tag).all())

    rows = make_file_starter('src/sgd/convert/data/microarray_05_14/SPELL-tags.txt')()
    for row in rows:
        dataset_key = (row[1].strip()[:-4], 'DATASET')

        for raw_tag in row[2].strip().split('|'):
            tag_name = raw_tag.strip()
            if tag_name == '':
                continue
            yield {
                'bioitem': key_to_dataset[dataset_key],
                'tag': key_to_tag[create_format_name(tag_name)]
            }

    nex_session.close()
def experiment_alias_starter():
    """Yield one 'APOID' alias dict per dbxref of each known experiment.

    Iterates ``CVTerm`` rows with ``cv_no == 7``. When the term's format name
    is not a known experiment key, a message is printed and ``None`` is
    yielded in place of an alias. Both sessions are closed at the end.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_experiment = dict((exp.unique_key(), exp) for exp in nex_session.query(Experiment).all())
    key_to_source = dict((src.unique_key(), src) for src in nex_session.query(Source).all())

    cv_terms = bud_session.query(CVTerm).filter(CVTerm.cv_no==7).options(
        joinedload('cv_dbxrefs'), joinedload('cv_dbxrefs.dbxref')).all()
    for cv_term in cv_terms:
        experiment_key = create_format_name(cv_term.name)

        if experiment_key not in key_to_experiment:
            print('Experiment does not exist: ' + str(experiment_key))
            yield None
            continue

        for xref in cv_term.dbxrefs:
            yield {
                'display_name': xref.dbxref_id,
                'source': key_to_source['SGD'],
                'category': 'APOID',
                'experiment_id': key_to_experiment[experiment_key].id,
                'date_created': xref.date_created,
                'created_by': xref.created_by,
            }

    bud_session.close()
    nex_session.close()
def orphan_starter():
    """Yield orphan bioitem dicts for reporters and GO-reference dbxrefs.

    First yields one dict per ``ExperimentProperty`` of type 'Reporter', then
    one dict per ``GorefDbxref`` whose dbxref type is not one of the excluded
    types, with a link and ``bioitem_type`` derived from the dbxref type.

    NOTE: a second ``orphan_starter`` is defined later in this file; that
    later definition rebinds the name, so this version is shadowed.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_source = dict((src.unique_key(), src) for src in nex_session.query(Source).all())

    reporters = bud_session.query(ExperimentProperty).filter(ExperimentProperty.type == 'Reporter').all()
    for reporter in reporters:
        if reporter.type == 'Reporter':
            yield {'display_name': reporter.value, 'source': key_to_source['SGD']}

    # dbxref types that are not turned into orphans here.
    excluded_types = ('GOID', 'EC number', 'DBID Primary', 'PANTHER', 'Prosite')

    # dbxref type -> (URL prefix placed before the dbxref id, bioitem type).
    prefixed_links = {
        'UniProtKB Subcellular Location': ("http://www.uniprot.org/locations/", 'UniProtKB-SubCell'),
        'InterPro': ("http://www.ebi.ac.uk/interpro/entry/", 'InterPro'),
        'UniPathway ID': ('http://www.grenoble.prabi.fr/obiwarehouse/unipathway/upa?upid=', 'UniPathway'),
        'UniProtKB Keyword': ('http://www.uniprot.org/keywords/', 'UniProtKB-KW'),
    }

    # dbxref type -> bioitem type, for types that get no link.
    unlinked_types = {
        'DNA accession ID': 'EMBL',
        'HAMAP ID': 'HAMAP',
        'HAMAP': 'HAMAP',
        'PDB identifier': 'PDB',
        'Protein version ID': 'protein_id',
    }

    for goref_dbxref in bud_session.query(GorefDbxref).all():
        dbxref = goref_dbxref.dbxref
        dbxref_type = dbxref.dbxref_type
        if dbxref_type in excluded_types:
            continue

        source_key = create_format_name(dbxref.source)
        source = key_to_source.get(source_key)
        if source is None:
            print('Source not found: ' + str(source_key))
            # NOTE(review): execution continues after this yield, so the
            # record below is still emitted with source=None -- confirm
            # whether skipping the record was intended.
            yield None

        link = None
        bioitem_type = None
        if dbxref_type == 'UniProt/Swiss-Prot ID':
            # The link comes from the URL template, only when exactly one exists.
            urls = dbxref.urls
            if len(urls) == 1:
                link = urls[0].url.replace('_SUBSTITUTE_THIS_', dbxref.dbxref_id)
            bioitem_type = 'UniProtKB'
        elif dbxref_type == 'Gene ID':
            bioitem_type = dbxref.source
        elif dbxref_type in prefixed_links:
            prefix, bioitem_type = prefixed_links[dbxref_type]
            link = prefix + dbxref.dbxref_id
        elif dbxref_type in unlinked_types:
            bioitem_type = unlinked_types[dbxref_type]

        yield {
            'display_name': dbxref.dbxref_id,
            'link': link,
            'source': source,
            'description': dbxref.dbxref_name,
            'bioitem_type': bioitem_type,
        }

    bud_session.close()
    nex_session.close()
def domain_starter():
    """Yield domain bioitem dicts from data files and from BUD dbxrefs.

    Records are produced, in order, from:

    1. ``domains.tab`` -- one record per row. PANTHER rows are emitted only
       when their id has a description in the PANTHER classification file;
       rows whose source is unknown are reported and skipped.
    2. The TF family/class accession file (JASPAR records).
    3. Two fixed records: the predicted signal peptide (SignalP) and the
       predicted transmembrane domain (TMHMM).
    4. ``Dbxref`` rows of type 'PANTHER' or 'Prosite'.

    Both sessions are closed after the last record has been consumed.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_source = dict((src.unique_key(), src) for src in nex_session.query(Source).all())

    panther_id_to_description = {}
    for row in make_file_starter('src/sgd/convert/data/PANTHER9.0_HMM_classifications.txt')():
        panther_id_to_description[row[0]] = row[1].lower()

    for row in make_file_starter('src/sgd/convert/data/domains.tab')():
        # Normalise the source names used in the file to the Source keys.
        source_key = row[3].strip()
        if source_key.startswith('ProSite'):
            source_key = 'Prosite'
        if source_key.startswith('SignalP'):
            source_key = 'SignalP'
        if source_key.startswith('Hamap'):
            source_key = 'HAMAP'
        if source_key == 'Coils':
            source_key = '-'

        display_name = row[4].strip()
        description = row[5].strip()

        # InterPro columns are read only from 13-column rows.
        interpro_id = None
        interpro_description = None
        if len(row) == 13:
            interpro_id = row[11].strip()
            interpro_description = row[12].strip()

        source_key = create_format_name(source_key)
        source = key_to_source.get(source_key)

        # Empty strings are stored as None.
        description = None if description == '' else description
        interpro_description = None if interpro_description == '' else interpro_description
        interpro_id = None if interpro_id == '' else interpro_id

        if source_key == 'PANTHER':
            if display_name in panther_id_to_description:
                yield {'display_name': display_name,
                       'source': source,
                       'description': panther_id_to_description[display_name],
                       'bioitem_type': source_key,
                       'interpro_id': interpro_id,
                       'interpro_description': interpro_description}
        elif source is not None:
            # The original tested ``source_key is not None``, which is always
            # true, so the "Source not found" branch could never run.
            yield {'display_name': display_name,
                   'source': source,
                   'description': description if description is not None else interpro_description,
                   'bioitem_type': source_key,
                   'interpro_id': interpro_id,
                   'interpro_description': interpro_description}
        else:
            print('Source not found: ' + source_key)

    for row in make_file_starter('src/sgd/convert/data/TF_family_class_accession04302013.txt')():
        description = 'Class: ' + row[4] + ', Family: ' + row[3]
        yield {'display_name': row[0],
               'source': key_to_source['JASPAR'],
               'description': description,
               'bioitem_type': 'JASPAR'}

    yield {'display_name': 'predicted signal peptide',
           'source': key_to_source['SignalP'],
           'description': 'predicted signal peptide',
           'bioitem_type': 'SignalP'}
    yield {'display_name': 'predicted transmembrane domain',
           'source': key_to_source['TMHMM'],
           'description': 'predicted transmembrane domain',
           'bioitem_type': 'TMHMM'}

    dbxrefs = bud_session.query(Dbxref).filter(
        or_(Dbxref.dbxref_type == 'PANTHER', Dbxref.dbxref_type == 'Prosite')).all()
    for bud_obj in dbxrefs:
        dbxref_type = bud_obj.dbxref_type
        source_key = create_format_name(bud_obj.source)
        source = key_to_source.get(source_key)
        if source is None:
            print(source_key)
            # NOTE(review): the record is still emitted below with
            # source=None after this placeholder -- confirm this is intended.
            yield None

        # The query selects type 'Prosite'; 'Prosite ID' is accepted as well
        # because that was the only spelling the original code recognised.
        bioitem_type = None
        if dbxref_type in ('Prosite', 'Prosite ID'):
            bioitem_type = 'Prosite'
        elif dbxref_type == 'PANTHER':
            bioitem_type = 'PANTHER'

        if bioitem_type == 'PANTHER':
            # Look up this dbxref's own id, not the ``display_name`` left
            # over from the domains.tab loop above.
            if bud_obj.dbxref_id in panther_id_to_description:
                yield {'display_name': bud_obj.dbxref_id,
                       'source': source,
                       'description': panther_id_to_description[bud_obj.dbxref_id],
                       'bioitem_type': bioitem_type}
        else:
            yield {'display_name': bud_obj.dbxref_id,
                   'source': source,
                   'description': bud_obj.dbxref_name,
                   'bioitem_type': bioitem_type}

    bud_session.close()
    nex_session.close()
def experiment_starter():
    """Yield experiment dicts from ECO, BUD CV terms, data files and GO codes.

    Records are produced, in order, from:

    1. ``eco.obo`` -- one record per term, with the quoted definition text
       (anything from the first ``[`` onwards removed) as description.
    2. ``CVTerm`` rows with ``cv_no == 7``, categorised via the
       ``large_scale_survey`` / ``classical_genetics`` collections defined
       outside this function.
    3. Four regulation data files; rows whose source (column 11) is unknown
       are reported and skipped.
    4. High-confidence rows of the YeTFaSCo file.
    5. A fixed 'protein abundance' record and the GO evidence codes.

    Both sessions are closed after the last record has been consumed.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_source = dict((src.unique_key(), src) for src in nex_session.query(Source).all())

    for bud_obj in make_obo_file_starter('src/sgd/convert/data/eco.obo')():
        description = None if 'def' not in bud_obj else bud_obj['def']
        if description is not None and description.find('[') >= 0:
            # Drop the trailing "[...]" part and the character before it.
            description = description[:description.find('[')-1]
        if description is not None and description.find('"') >= 0:
            # Strip the surrounding quote characters.
            description = description[1:-1]
        yield {'display_name': bud_obj['name'],
               'source': key_to_source['ECO'],
               'description': description,
               'eco_id': bud_obj['id']}

    for bud_obj in bud_session.query(CVTerm).filter(CVTerm.cv_no==7).all():
        format_name = create_format_name(bud_obj.name)
        if format_name in large_scale_survey:
            category = 'large-scale survey'
        elif format_name in classical_genetics:
            category = 'classical genetics'
        else:
            category = None
        yield {'display_name': bud_obj.name,
               'source': key_to_source['SGD'],
               'description': bud_obj.definition,
               'category': category,
               'date_created': bud_obj.date_created,
               'created_by': bud_obj.created_by}

    # (path, display-name column, fallback column that also supplies eco_id,
    #  whether rows too short to hold column 11 are skipped)
    regulation_files = (
        ('src/sgd/convert/data/2014-05-15_reg_data/Venters_Macisaac_Hu05-12-2014_regulator_lines', 4, 5, False),
        ('src/sgd/convert/data/2014-05-15_reg_data/SGD_data_05_14_2014', 4, 5, False),
        ('src/sgd/convert/data/2014-05-15_reg_data/Madhani_fixed', 5, 4, True),
        ('src/sgd/convert/data/2014-05-15_reg_data/Pimentel_PMID22616008.txt', 4, 5, True),
    )
    for path, name_col, eco_col, skip_short_rows in regulation_files:
        for row in make_file_starter(path)():
            # Column 11 is read below, so a row needs at least 12 columns.
            # The original guard (len >= 10) did not protect that access.
            if skip_short_rows and len(row) < 12:
                continue

            # Read the source from the current row *before* testing it. The
            # original tested the value left over from the previous row (or
            # the previous file) for the last two files.
            source_key = row[11].strip()
            if source_key in key_to_source:
                yield {'display_name': row[name_col] if row[name_col] != '' else row[eco_col],
                       'source': key_to_source[source_key],
                       'eco_id': row[eco_col]}
            else:
                print('Source not found: ' + str(source_key))

    for row in make_file_starter('src/sgd/convert/data/yetfasco_data.txt', delimeter=';')():
        # Values in this file are wrapped in one extra character on each side.
        expert_confidence = row[8][1:-1]
        if expert_confidence == 'High':
            yield {'display_name': row[9][1:-1],
                   'source': key_to_source['YeTFaSCo']}

    yield {'display_name': 'protein abundance', 'source': key_to_source['SGD']}

    # (evidence code, documentation link, description), in emission order.
    go_evidence_codes = (
        ('EXP', 'http://www.geneontology.org/page/exp-inferred-experiment',
         'Inferred from Experiment'),
        ('IDA', 'http://www.geneontology.org/page/ida-inferred-direct-assay',
         'Inferred from Direct Assay'),
        ('IPI', 'http://www.geneontology.org/page/ipi-inferred-physical-interaction',
         'Inferred from Physical Interaction'),
        ('IMP', 'http://www.geneontology.org/page/imp-inferred-mutant-phenotype',
         'Inferred from Mutant Phenotype'),
        ('IGI', 'http://www.geneontology.org/page/igi-inferred-genetic-interaction',
         'Inferred from Genetic Interaction'),
        ('IEP', 'http://www.geneontology.org/page/iep-inferred-expression-pattern',
         'Inferred from Expression Pattern'),
        ('ISS', 'http://www.geneontology.org/page/iss-inferred-sequence-or-structural-similarity',
         'Inferred from Sequence or Structural Similarity'),
        ('ISA', 'http://www.geneontology.org/page/isa-inferred-sequence-alignment',
         'Inferred from Sequence Alignment'),
        ('ISO', 'http://www.geneontology.org/page/iso-inferred-sequence-orthology',
         'Inferred from Sequence Orthology'),
        ('ISM', 'http://www.geneontology.org/page/ism-inferred-sequence-model',
         'Inferred from Sequence Model'),
        ('IGC', 'http://www.geneontology.org/page/igc-inferred-genomic-context',
         'Inferred from Genomic Context'),
        ('IBA', 'http://www.geneontology.org/page/iba-inferred-biological-aspect-ancestor',
         'Inferred from Biological aspect of Ancestor'),
        ('IBD', 'http://www.geneontology.org/page/ibd-inferred-biological-aspect-descendent',
         'Inferred from Biological aspect of Descendent'),
        ('IKR', 'http://www.geneontology.org/page/ikr-inferred-key-residues',
         'Inferred from Key Residues'),
        ('IRD', 'http://www.geneontology.org/page/ird-inferred-rapid-divergence',
         'Inferred from Rapid Divergence'),
        ('RCA', 'http://www.geneontology.org/page/rca-inferred-reviewed-computational-analysis',
         'inferred from Reviewed Computational Analysis'),
        ('TAS', 'http://www.geneontology.org/page/tas-traceable-author-statement',
         'Traceable Author Statement'),
        ('NAS', 'http://www.geneontology.org/page/nas-non-traceable-author-statement',
         'Non-traceable Author Statement'),
        ('IC', 'http://www.geneontology.org/page/ic-inferred-curator',
         'Inferred by Curator'),
        ('ND', 'http://www.geneontology.org/page/nd-no-biological-data-available',
         'No Biological Data Available'),
        ('IEA', 'http://www.geneontology.org/page/automatically-assigned-evidence-codes',
         'Inferred from Electronic Annotation'),
    )
    for code, link, description in go_evidence_codes:
        yield {'display_name': code,
               'source': key_to_source['GO'],
               'link': link,
               'description': description}

    bud_session.close()
    nex_session.close()
def orphan_starter():
    """Yield orphan bioitem dicts for reporters, SO terms and GO-ref dbxrefs.

    Records are produced, in order, from:

    1. ``ExperimentProperty`` rows of type 'Reporter'.
    2. The ``gp_association`` file: every ``(SO:...)`` entry found in the
       eleventh tab-separated column becomes an 'SO' record.
    3. ``GorefDbxref`` rows whose dbxref type is not one of the excluded
       types, with a link and ``bioitem_type`` derived from the dbxref type.

    This is the later of two ``orphan_starter`` definitions in this file, so
    it is the one bound to the name after the module is loaded.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])

    for bud_obj in bud_session.query(ExperimentProperty).filter(ExperimentProperty.type == 'Reporter').all():
        if bud_obj.type == 'Reporter':
            yield {'display_name': bud_obj.value,
                   'source': key_to_source['SGD']}

    # ``with`` guarantees the file is closed even if the consumer stops
    # iterating early or a lookup below raises.
    with open('src/sgd/convert/data/gp_association.559292_sgd') as f:
        for line in f:
            pieces = line.split('\t')
            if len(pieces) < 11:
                continue
            go_extensions = pieces[10]
            if '(SO:' not in go_extensions:
                continue
            # Extensions are separated by ',' or '|'.
            for item in go_extensions.replace(',', '|').split('|'):
                if "(SO:" not in item:
                    continue
                # Text after the first '(' with the final character dropped.
                soid = item.split('(')[1][:-1]
                yield {
                    'display_name': soid,
                    'source': key_to_source['SGD'],
                    'bioitem_type': 'SO'
                }

    for bud_obj in bud_session.query(GorefDbxref).all():
        dbxref = bud_obj.dbxref
        dbxref_type = dbxref.dbxref_type
        if dbxref_type != 'GOID' and dbxref_type != 'EC number' and dbxref_type != 'DBID Primary' \
                and dbxref_type != 'PANTHER' and dbxref_type != 'Prosite':
            source_key = create_format_name(dbxref.source)
            source = None if source_key not in key_to_source else key_to_source[source_key]
            if source is None:
                print('Source not found: ' + str(source_key))
                # NOTE(review): the record is still emitted below with
                # source=None after this placeholder -- confirm this is
                # intended rather than skipping the record.
                yield None

            link = None
            bioitem_type = None
            if dbxref_type == 'UniProt/Swiss-Prot ID':
                # The link is built only when exactly one URL template exists.
                urls = dbxref.urls
                if len(urls) == 1:
                    link = urls[0].url.replace('_SUBSTITUTE_THIS_', dbxref.dbxref_id)
                bioitem_type = 'UniProtKB'
            elif dbxref_type == 'UniProtKB Subcellular Location':
                link = "http://www.uniprot.org/locations/" + dbxref.dbxref_id
                bioitem_type = 'UniProtKB-SubCell'
            elif dbxref_type == 'InterPro':
                link = "http://www.ebi.ac.uk/interpro/entry/" + dbxref.dbxref_id
                bioitem_type = 'InterPro'
            elif dbxref_type == 'DNA accession ID':
                link = None
                bioitem_type = 'EMBL'
            elif dbxref_type == 'Gene ID':
                link = None
                bioitem_type = dbxref.source
            elif dbxref_type == 'HAMAP ID' or dbxref_type == 'HAMAP':
                link = None
                bioitem_type = 'HAMAP'
            elif dbxref_type == 'PDB identifier':
                link = None
                bioitem_type = 'PDB'
            elif dbxref_type == 'Protein version ID':
                link = None
                bioitem_type = 'protein_id'
            elif dbxref_type == 'UniPathway ID':
                link = 'http://www.grenoble.prabi.fr/obiwarehouse/unipathway/upa?upid=' + dbxref.dbxref_id
                bioitem_type = 'UniPathway'
            elif dbxref_type == 'UniProtKB Keyword':
                link = 'http://www.uniprot.org/keywords/' + dbxref.dbxref_id
                bioitem_type = 'UniProtKB-KW'

            yield {'display_name': dbxref.dbxref_id,
                   'link': link,
                   'source': source,
                   'description': dbxref.dbxref_name,
                   'bioitem_type': bioitem_type}

    bud_session.close()
    nex_session.close()