def _siphon_family_interactions(self, triples):
    """Converts the `homodomain_interaction.csv` and
    `heterodomain_interaction.csv` files."""
    ints = []
    # Homodomain interactions involve a single family on both sides
    FIELDS = ("FAM", "ID", "INT_TYPE")
    for row in iterate_csv(self._get_path("homodomain_interaction.csv",
                                          is_db=False),
                           num_skip=1, delimiter="\t", fieldnames=FIELDS):
        ints.append((row["FAM"], row["FAM"], row["INT_TYPE"]))
    FIELDS = ("FAM1", "ID1", "FAM2", "ID2", "INT_TYPE")
    for row in iterate_csv(self._get_path("heterodomain_interaction.csv",
                                          is_db=False),
                           num_skip=1, delimiter="\t", fieldnames=FIELDS):
        ints.append((row["FAM1"], row["FAM2"], row["INT_TYPE"]))
    int_types = set()
    for fam1, fam2, int_type in ints:
        fam1 = O.uri(O.PFAM_ID, fam1)
        fam2 = O.uri(O.PFAM_ID, fam2)
        int_type = O.uri(O.IPFAM_INT_TYPE, int_type)
        blank = B()
        triples.extend([
            (blank, O.RDF.type, O.IPFAM_INT),
            (blank, O.IPFAM_INT_HAS_INT_TYPE, int_type),
            (blank, O.IPFAM_INT_HAS_PFAM, fam1),
            (blank, O.IPFAM_INT_HAS_PFAM, fam2),
        ])
        self._add_if_new(triples, int_types, int_type,
                         O.RDF.type, O.IPFAM_INT_TYPE)
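# `_add_if_new` is used by several converters in this section but is not
# defined here. A minimal sketch of what it is assumed to do, inferred
# from the call sites: emit a typing triple only the first time a
# subject is seen, using a caller-provided set as the seen-set.
def _add_if_new(self, triples, seen, s, p, o):
    """Hypothetical helper: appends (s, p, o) to triples unless s has
    already been seen; marks s as seen."""
    if s not in seen:
        seen.add(s)
        triples.append((s, p, o))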
def _siphon_pdb_entries(self, triples):
    """Converts the `pdb_entry.txt` file."""
    FIELDS = (
        "PDB_ID",       # `pdb_id` varchar(4) NOT NULL DEFAULT 'NULL',
        "HEADER",       # `header` text,
        "TITLE",        # `title` text,
        "DATE",         # `date` date NOT NULL,
        "RESOLUTION",   # `resolution` decimal(5,2) unsigned NOT NULL,
        "EXP_METHOD",   # `expt_method` text NOT NULL,
        "AUTHOR",       # `author` mediumtext,
        "PDB_FILE",     # `pdb_file` int(10) DEFAULT '0',
        "SIFTS_FILE",   # `sifts_file` int(10) DEFAULT '0',
    )
    for row in iterate_csv(self._get_path("pdb_entry.txt"),
                           delimiter="\t", fieldnames=FIELDS):
        pdb_id = U(O.PDBR + row["PDB_ID"].lower())
        res = L(float(row["RESOLUTION"]))
        method = L(self._sanitize(row["EXP_METHOD"]))
        triples.extend([
            (pdb_id, O.RDF.type, O.PDBID),
            (pdb_id, O.PDB_HAS_RESOLUTION, res),
            (pdb_id, O.PDB_HAS_EXP_METHOD, method),
        ])
def _siphon_regions(self, triples):
    """Converts the `pdb_protein_region.txt` file."""
    FIELDS = (
        "REGION",           # int(10)
        "PROT_FAM",         # int(11)
        "PROT_FAM_ACC",     # varchar(45)
        "PDB_ID",           # varchar(4)
        "CHAIN",            # varchar(1)
        "START",            # int(11)
        "START_ICODE",      # varchar(1)
        "END",              # int(11)
        "END_ICODE",        # varchar(1)
        "REGION_SOURCE_DB", # varchar(12)
    )
    for row in iterate_csv(self._get_path("pdb_protein_region.txt"),
                           delimiter="\t", fieldnames=FIELDS):
        region = O.uri(O.IPFAM_REGION, row["REGION"])
        pfam = O.uri(O.PFAM_ID, row["PROT_FAM_ACC"])
        pdb_id_chain = U(O.PDBR + row["PDB_ID"].lower() + "_" + row["CHAIN"])
        triples.extend([
            (region, O.RDF.type, O.IPFAM_REGION),
            (region, O.IPFAM_REGION_INSTANCE_OF, pfam),
            (region, O.IPFAM_REGION_OCCURS_IN, pdb_id_chain),
            (region, O.IPFAM_REGION_STARTS_AT, L(int(row["START"]))),
            (region, O.IPFAM_REGION_STOPS_AT, L(int(row["END"]))),
        ])
def _siphon_pdb_homologues(self, triples):
    """Converts the `pdb_homologs.tab` file."""
    FIELDS = (
        "FEAT_NAME",            # S. cerevisiae systematic name
        "START_COORD_QUERY",    # start coord (aa position) in yeast
        "STOP_COORD_QUERY",     # stop coord (aa position) in yeast
        "START_COORD_TARGET",   # start coord (aa position) in target
        "STOP_COORD_TARGET",    # stop coord (aa position) in target
        "PERCENT_ALIGNED",      # percent of yeast contained in target
        "SCORE",                # log of the expectation value
        "TARGET_PDB_ID",        # PDB identifier
        "TARGET_TAXON_ID",      # target taxon ID
        "TARGET_TAXON_NAME",    # target taxon species name
    )
    for row in iterate_csv(self._get_path("pdb_homologs.tab"),
                           delimiter="\t", fieldnames=FIELDS):
        pdb_id_chain, _ = self._pdb_to_uri(row["TARGET_PDB_ID"])
        match = B()
        triples.extend([
            (match, O.RDF.type, O.SGD_PDB_HOMOLOGY),
            (match, O.SGD_PDB_HAS_QUERY,
             O.uri(O.SGD_FEATURE, row["FEAT_NAME"])),
            (match, O.SGD_PDB_ALIGNMENT,
             L(float(row["PERCENT_ALIGNED"]) / 100.0)),
            (match, O.SGD_PDB_HAS_TARGET, pdb_id_chain),
        ])
def _siphon_cog(self, triples):
    """Converts the `COG.mappings.v{version}.txt` file."""
    FIELDS = ("TAXON.STRING_ID", "START", "STOP", "CLUSTER_ID", "ANNOTATION")
    path = self._get_path("COG.mappings.v{}.txt").format(self._version)
    for row in iterate_csv(path, delimiter="\t", fieldnames=FIELDS,
                           num_skip=1):
        # The first field is `taxon.string_id`; only the taxon of
        # interest is kept
        taxon, _, string_id = row["TAXON.STRING_ID"].partition(".")
        if taxon != self._taxon:
            continue
        string_id = O.uri(O.STRING_ID, string_id)
        cluster_id = row["CLUSTER_ID"]
        if cluster_id.startswith("COG"):
            cluster_id = O.uri(O.COG_CLUSTER_ID, cluster_id)
            triples.extend([(string_id, O.STRING_ID_IN_COG, cluster_id),
                            (cluster_id, O.RDF.type, O.COG_CLUSTER_ID)])
        elif cluster_id.startswith("KOG"):
            cluster_id = O.uri(O.KOG_CLUSTER_ID, cluster_id)
            triples.extend([(string_id, O.STRING_ID_IN_KOG, cluster_id),
                            (cluster_id, O.RDF.type, O.KOG_CLUSTER_ID)])
        elif cluster_id.startswith("NOG"):
            cluster_id = O.uri(O.NOG_CLUSTER_ID, cluster_id)
            triples.extend([(string_id, O.STRING_ID_IN_NOG, cluster_id),
                            (cluster_id, O.RDF.type, O.NOG_CLUSTER_ID)])
def _siphon_pdb_uniprot(self, triples):
    """Converts the `pdb_chain_uniprot.csv` file."""
    FIELDS = ("PDB_ID", "PDB_CHAIN", "SP_ID", "RES_BEGIN", "RES_END",
              "PDB_BEGIN", "PDB_END", "SP_BEGIN", "SP_END")
    for row in iterate_csv(self._get_path("pdb_chain_uniprot.csv"),
                           delimiter="\t", fieldnames=FIELDS):
        pdb_id_chain = self._pdb_uri(row["PDB_ID"], row["PDB_CHAIN"])
        sp_id = self._uniprot_uri(row["SP_ID"])
        triples.append((pdb_id_chain, O.SIFTS_SAME_AS, sp_id))
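# `_pdb_uri` and `_uniprot_uri` are not defined in this section. A
# minimal sketch of what they are assumed to do, based on how PDB chain
# and namespaced URIs are built elsewhere in this file; the exact
# behaviour of `O.uri` and the `O.UNIPROT_ID` namespace is an assumption.
def _pdb_uri(self, pdb_id, chain):
    """Hypothetical helper: URI of a PDB chain, e.g. <PDBR>2x3t_A."""
    return U(O.PDBR + pdb_id.lower() + "_" + chain)

def _uniprot_uri(self, ac):
    """Hypothetical helper: URI of a UniProt accession."""
    return O.uri(O.UNIPROT_ID, ac)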
def _siphon_interactions(self, triples):
    """Converts the `interaction_data.tab` file."""
    FIELDS = (
        "BAIT_FEAT_NAME",   # Bait: feature (ORF) name and gene name (optional)
        "BAIT_STD_NAME",    #
        "HIT_FEAT_NAME",    # Hit: feature (ORF) name and gene name (optional)
        "HIT_STD_NAME",     #
        "EXPERIMENT_TYPE",  # Description of the experiment
        "INTERACTION_TYPE", # 'Genetic' or 'Physical'
        "SOURCE_DATABASE",  # Source database
        "CURATION_TYPE",    # Manual or high-throughput
        "NOTES",            # Free text (useless)
        "PHENOTYPE",        # Phenotype of the interaction (optional)
        "REFERENCE",        # References as 'SGD_REF:' (SGDID) or 'PMID:' (PubMed)
        "CITATION",         # List of citations
    )
    int_types, exp_types, cur_types, sources = set(), set(), set(), set()
    for row in iterate_csv(self._get_path("interaction_data.tab"),
                           delimiter="\t", fieldnames=FIELDS):
        bait = O.uri(O.SGD_FEATURE, self._sanitize(row["BAIT_FEAT_NAME"]))
        hit = O.uri(O.SGD_FEATURE, self._sanitize(row["HIT_FEAT_NAME"]))
        int_type = O.uri(O.SGD_INT_TYPE,
                         self._sanitize(row["INTERACTION_TYPE"]))
        exp_type = O.uri(O.SGD_INT_EXP_TYPE,
                         self._sanitize(row["EXPERIMENT_TYPE"]))
        cur_type = O.uri(O.SGD_INT_CUR_TYPE,
                         self._sanitize(row["CURATION_TYPE"]))
        source = O.uri(O.SGD_INT_SOURCE,
                       self._sanitize(row["SOURCE_DATABASE"]))
        self._add_if_new(triples, int_types, int_type,
                         O.RDF.type, O.SGD_INT_TYPE)
        self._add_if_new(triples, exp_types, exp_type,
                         O.RDF.type, O.SGD_INT_EXP_TYPE)
        self._add_if_new(triples, cur_types, cur_type,
                         O.RDF.type, O.SGD_INT_CUR_TYPE)
        self._add_if_new(triples, sources, source,
                         O.RDF.type, O.SGD_INT_SOURCE)
        interaction = B()
        triples.extend([
            (interaction, O.RDF.type, O.SGD_INT),
            (interaction, O.SGD_INT_HAS_BAIT, bait),
            (interaction, O.SGD_INT_HAS_HIT, hit),
            (interaction, O.SGD_INT_HAS_TYPE, int_type),
            (interaction, O.SGD_INT_HAS_EXP_TYPE, exp_type),
            (interaction, O.SGD_INT_HAS_CUR_TYPE, cur_type),
            (interaction, O.SGD_INT_HAS_SOURCE, source),
        ])
def _siphon_domains(self, triples):
    """Converts the `domains.tab` file.

    The data comes from an InterPro scan over the SGD entries.
    """
    FIELDS = (
        "FEAT_NAME",        # S. cerevisiae systematic name (ID of the input sequence)
        "CRC64",            # CRC of the protein sequence
        "LENGTH",           # Length of the sequence in AA
        "METHOD",           # Analysis method
        "DB_MEMBERS",       # DB members entry for this match
        "DB_DESCRIPTION",   # DB member description for the entry
        "START",            # start of the domain match
        "STOP",             # end of the domain match
        "EVALUE",           # E-value of the match (defined by DB)
        "STATUS",           # Status of the match: T=true, ?=unknown
        "DATE",             # Date of the run
        "IPR_ID",           # InterPro ID
        "IPR_DESCRIPTION",  # InterPro description
        "IPR_GO",           # GO description of the InterPro entry
    )
    for row in iterate_csv(self._get_path("domains.tab"),
                           delimiter="\t", fieldnames=FIELDS):
        feat_id = O.uri(O.SGD_FEATURE, self._sanitize(row["FEAT_NAME"]))
        is_true = L({"T": True, "?": False}[row["STATUS"]])
        # Only confirmed Pfam hits carry a usable DB identifier
        db_id = L(None)
        if row["METHOD"] == "Pfam" and row["STATUS"] == "T":
            db_id = O.uri(O.PFAM_ID, row["DB_MEMBERS"])
        try:
            evalue = L(float(row["EVALUE"]))
        except ValueError:
            evalue = L(-1.0)
        hit = B()
        triples.extend([
            (hit, O.RDF.type, O.SGD_IPR_HIT),
            (hit, O.SGD_IPR_HIT_HAS_ID, L(row["IPR_ID"])),
            (hit, O.SGD_IPR_HIT_HAS_METHOD, L(row["METHOD"])),
            (hit, O.SGD_IPR_HIT_HAS_DB_ID, db_id),
            (hit, O.SGD_IPR_HIT_STARTS_AT, L(int(row["START"]))),
            (hit, O.SGD_IPR_HIT_STOPS_AT, L(int(row["STOP"]))),
            (hit, O.SGD_IPR_HIT_HAS_EVALUE, evalue),
            (hit, O.SGD_IPR_HIT_IS_TRUE, is_true),
            (feat_id, O.SGD_FEATURE_HAS_IPR_HIT, hit),
        ])
def _siphon_interactions(self, triples):
    """Converts the `{taxon}.protein.actions.detailed.v{version}.txt` file."""
    FIELDS = ("ITEM_ID_A", "ITEM_ID_B", "MODE", "ACTION", "A_IS_ACTING",
              "SCORE", "SOURCES", "TRANSFERRED_SOURCES")
    path = self._get_path("{}.protein.actions.detailed.v{}.txt") \
               .format(self._taxon, self._version)
    for row in iterate_csv(path, delimiter="\t", fieldnames=FIELDS,
                           num_skip=1):
        id_a, id_b = row["ITEM_ID_A"], row["ITEM_ID_B"]
        # Both endpoints must belong to the taxon of interest
        if not (id_a.startswith(self._taxon) and
                id_b.startswith(self._taxon)):
            continue
        id_a = O.uri(O.STRING_ID, id_a.split(".", 1)[1])
        id_b = O.uri(O.STRING_ID, id_b.split(".", 1)[1])
        mode = O.uri(O.STRING_ACTION_MODE, row["MODE"])
        triples.append((id_a, mode, id_b))
def _siphon_aliases(self, triples):
    """Converts the `{taxon}.protein.aliases.v{version}.txt` file."""
    FIELDS = ("TAXON", "STRING_ID", "ALIAS_ID", "ALIAS_TYPE")
    path = self._get_path("{}.protein.aliases.v{}.txt") \
               .format(self._taxon, self._version)
    for row in iterate_csv(path, delimiter="\t", fieldnames=FIELDS,
                           num_skip=1):
        string_id = O.uri(O.STRING_ID, row["STRING_ID"])
        triples.append((string_id, O.RDF.type, O.OCELOT.STRING_ID))
        alias_id = row["ALIAS_ID"]
        for alias_type in row["ALIAS_TYPE"].split():
            if alias_type == "SGD" and \
               (alias_id.startswith("S0") or alias_id.startswith("L0")):
                db_alias_id = O.uri(O.SGD_ID, alias_id)
            else:
                # XXX handle UniProt ACs here
                continue
            triples.append((string_id, O.OWL.sameAs, db_alias_id))
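# `iterate_csv` drives every converter above but is defined elsewhere.
# A minimal sketch of the assumed behaviour: skip `num_skip` leading
# lines (headers or comments), then yield each remaining row as a dict
# keyed by `fieldnames`; the signature is an assumption.
import csv

def iterate_csv(path, num_skip=0, **kwargs):
    """Hypothetical stand-in: yields one dict per row of a CSV file."""
    with open(path) as fp:
        for _ in xrange(num_skip):
            next(fp)
        for row in csv.DictReader(fp, **kwargs):
            yield row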
def _siphon_goslims(self, triples):
    """Converts the `go_slim_mapping.tab` file."""
    # XXX there are a few goslim annotations with commas
    FIELDS = (
        "GENE_ORF",     # Systematic gene name
        "GENE_STD",     # Gene name (optional)
        "SGD_ID",       # Gene SGD ID
        "GO_ASPECT",    # P = Process, F = Function, C = Component
        "GO_TERM",      # GO SLIM term
        "GO_ID",        # GO term ID
        "FEATURE_TYPE", # Such as 'ORF' or 'tRNA'
    )
    for row in iterate_csv(self._get_path("go_slim_mapping.tab"),
                           delimiter="\t", fieldnames=FIELDS):
        if row["GO_TERM"] in ("other", "not_yet_annotated"):
            continue
        assert len(row["GO_ID"]), row
        sgd_id = O.uri(O.SGD_ID, self._sanitize(row["SGD_ID"]))
        goslim = O.go_to_uri(self._sanitize(row["GO_ID"]))
        triples.append((sgd_id, O.SGD_ID_HAS_GOSLIM, goslim))
def _siphon_region_interactions(self, triples):
    """Converts the `pdb_protein_region_int.txt` file."""
    FIELDS = (
        "REGION_INT",       # `auto_reg_int` bigint(20) NOT NULL AUTO_INCREMENT,
        "PDB_ID",           # `pdb_id` varchar(4) NOT NULL,
        "REGION_A",         # `region_id_A` int(10) unsigned NOT NULL,
        "REGION_B",         # `region_id_B` int(10) unsigned NOT NULL,
        "IS_INTRACHAIN",    # `intrachain` tinyint(1) NOT NULL,
        "QUALITY_CONTROL",  # `quality_control` int(10) unsigned NOT NULL,
    )
    for row in iterate_csv(self._get_path("pdb_protein_region_int.txt"),
                           delimiter="\t", fieldnames=FIELDS):
        region_int = O.uri(O.IPFAM_REGION_INT, row["REGION_INT"])
        region_a = O.uri(O.IPFAM_REGION, row["REGION_A"])
        region_b = O.uri(O.IPFAM_REGION, row["REGION_B"])
        pdb_id = U(O.PDBR + row["PDB_ID"].lower())
        triples.extend([
            (region_int, O.RDF.type, O.IPFAM_REGION_INT),
            (region_int, O.IPFAM_REGION_INT_OCCURS_IN, pdb_id),
            (region_int, O.IPFAM_REGION_INT_HAS_REGION, region_a),
            (region_int, O.IPFAM_REGION_INT_HAS_REGION, region_b),
        ])
def _siphon_xrefs(self, triples):
    """Converts the `dbxref.tab` file."""
    FIELDS = (
        "XREF_ID",          # Cross-reference ID
        "XREF_ID_SOURCE",   # Cross-reference database ID
        "XREF_ID_TYPE",     # Cross-reference type (like 'PDB chain' or 'PDB best hit')
        "FEAT_NAME",        # ORF name
        "SGD_ID",           # SGD ID
        "UNDOCUMENTED",     # Undocumented
    )
    UNIPROT_TYPES = (self._sanitize("UniProt/TrEMBL ID"),
                     self._sanitize("UniProt/Swiss-Prot ID"))
    for row in iterate_csv(self._get_path("dbxref.tab"),
                           delimiter="\t", fieldnames=FIELDS):
        sgd_id = O.uri(O.SGD_ID, self._sanitize(row["SGD_ID"]))
        xref_source = self._sanitize(row["XREF_ID_SOURCE"])
        xref_type = self._sanitize(row["XREF_ID_TYPE"])
        # Only UniProt cross-references are kept for now
        if xref_source == "EBI" and xref_type in UNIPROT_TYPES:
            xref_id = O.uri(O.UNIPROT_ID, row["XREF_ID"])
        else:
            continue
        triples.append((sgd_id, O.SGD_ID_HAS_XREF, xref_id))
def _siphon_families(self, triples):
    """Converts the `protein_family.txt` file."""
    FIELDS = (
        "FAMILY_INT",       # `auto_prot_fam` int(11) NOT NULL AUTO_INCREMENT,
        "FAMILY_ACC",       # `accession` varchar(45) DEFAULT NULL,
        "FAMILY_ID",        # `identifier` varchar(45) DEFAULT NULL,
        "DESCRIPTION",      # `description` text,
        "COMMENT",          # `comment` longtext,
        "FAMILY_TYPE",      # `type` enum('family','domain','motif','repeat') DEFAULT NULL,
        "SOURCE_DB",        # `source_db` enum('pfama') DEFAULT NULL,
        "COLOUR",           # `colour` varchar(7) DEFAULT NULL,
        "NUMBER_FAM_INT",   # `number_fam_int` int(5) DEFAULT '0',
        "NUMBER_LIG_INT",   # `number_lig_int` int(5) DEFAULT '0',
        "NUMBER_PDBS",      # `number_pdbs` int(5) DEFAULT '0',
    )
    pfam_types = set()
    for row in iterate_csv(self._get_path("protein_family.txt"),
                           num_skip=1, delimiter="\t", fieldnames=FIELDS):
        assert len(row["FAMILY_ACC"]) > 0
        if row["SOURCE_DB"] != "pfama":
            print "Warning: invalid row '{}'".format(row.items())
            continue
        pfam_acc = O.uri(O.PFAM_ID, row["FAMILY_ACC"])
        triples.append((pfam_acc, O.RDF.type, O.PFAM_ID))
        # The family type is optional (None or empty string)
        if not row["FAMILY_TYPE"]:
            continue
        pfam_type = O.uri(O.PFAM_TYPE, row["FAMILY_TYPE"])
        triples.append((pfam_acc, O.PFAM_ID_HAS_TYPE, pfam_type))
        self._add_if_new(triples, pfam_types, pfam_type,
                         O.RDF.type, O.PFAM_TYPE)
def _siphon_gos(self, triples):
    """Converts the `gene_association.sgd` file."""
    FIELDS = (
        "DB",           # always "SGD"
        "SGD_ID",       # SGD ID
        "DONTCARE_ID",  # unused
        "QUALIFIERS",   # e.g. NOT, contributes_to, colocalizes_with; pipe-separated
        "GO_ID",        # GO term ID
        "DB_REFS",      # DB references; pipe-separated
        "ECODE",        # Evidence code for the annotation
        "WITH_FROM",    # With or From optional qualifiers
        "GO_ASPECT",    # P = Process, F = Function, C = Component
        "DB_NAMES",     # DB object names; pipe-separated
        "DB_SYNONYMS",  # DB synonyms; pipe-separated
        "DB_TYPES",     # DB object types; pipe-separated
        "TAXIDS",       # Taxon IDs; pipe-separated
        "DATE",         # Date the GO annotation was defined in YYYYMMDD
        "ASSIGNED_BY",  # always "SGD"
        "EXTENSION",    # unused
        "VARIANTS",     # unused
    )
    for row in iterate_csv(self._get_path("gene_association.sgd"),
                           num_skip=26, delimiter="\t", fieldnames=FIELDS):
        assert len(row["SGD_ID"]), row
        assert len(row["GO_ID"]), row
        sgd_id = O.uri(O.SGD_ID, self._sanitize(row["SGD_ID"]))
        go_term = O.go_to_uri(self._sanitize(row["GO_ID"]))
        annotation = B()
        triples.extend([
            (annotation, O.RDF.type, O.SGD_GO_ANNOTATION),
            (annotation, O.SGD_GO_ANNOTATION_HAS_SGD_ID, sgd_id),
            (annotation, O.SGD_GO_ANNOTATION_HAS_TERM, go_term),
            (annotation, O.SGD_GO_ANNOTATION_HAS_ECODE, L(row["ECODE"])),
        ])
def _siphon_features(self, triples):
    """Converts the `SGD_features.tab` file."""
    # XXX there may be chromosomes not marked as features
    FIELDS = (
        "SGD_ID",           # Primary SGD ID
        "FEAT_TYPE",        # Feature type
        "FEAT_QUALIFIER",   # Feature qualifier (optional)
        "FEAT_NAME",        # Feature name (optional)
        "STD_GENE_NAME",    # Standard gene name (optional)
        "ALIASES",          # Aliases (optional, |-separated)
        "PARENT_FEAT_NAME", # Parent feature name (optional)
        "OTHER_SGD_IDS",    # Secondary SGD IDs (optional, |-separated)
        "CHROMOSOME",       # Chromosome (optional)
        "START_COORD",      # Start coordinate (optional)
        "STOP_COORD",       # Stop coordinate (optional)
        "STRAND",           # Strand (optional)
        "GENETIC_POS",      # Genetic position (optional)
        "COORD_VERSION",    # Coordinate version (optional)
        "SEQ_VERSION",      # Sequence version (optional)
        "DESCRIPTION",      # Description (optional)
    )
    feat_types, qualifiers, chromosomes = set(), set(), set()
    for row in iterate_csv(self._get_path("SGD_features.tab"),
                           delimiter="\t", fieldnames=FIELDS):
        sgd_id = O.uri(O.SGD_ID, self._sanitize(row["SGD_ID"]))
        feat_id = O.uri(O.SGD_FEATURE, self._sanitize(row["FEAT_NAME"]))
        feat_type = O.uri(O.SGD_FEATURE_TYPE, self._sanitize(row["FEAT_TYPE"]))
        triples.extend([
            (sgd_id, O.RDF.type, O.SGD_ID),
            (feat_id, O.RDF.type, O.SGD_FEATURE),
            (sgd_id, O.OWL.sameAs, feat_id),
            (sgd_id, O.SGD_ID_HAS_TYPE, feat_type),
        ])
        self._add_if_new(triples, feat_types, feat_type,
                         O.RDF.type, O.SGD_FEATURE_TYPE)
        for q in filter(lambda q: len(q),
                        self._sanitize_all(row["FEAT_QUALIFIER"])):
            q = O.uri(O.SGD_ID_QUALIFIER, q)
            self._add_if_new(triples, qualifiers, q,
                             O.RDF.type, O.SGD_ID_QUALIFIER)
            triples.append((sgd_id, O.SGD_ID_HAS_QUALIFIER, q))
        for a in filter(lambda a: len(a),
                        self._sanitize_all(row["OTHER_SGD_IDS"])):
            triples.append((sgd_id, O.OWL.sameAs, O.uri(O.SGD_ID, a)))
        parent = row["PARENT_FEAT_NAME"]
        if len(parent):
            parent = O.uri(O.SGD_FEATURE, self._sanitize(parent))
            triples.extend([
                (parent, O.RDF.type, O.SGD_FEATURE),
                (sgd_id, O.DCTERMS.isPartOf, parent),
            ])
        chromosome = row["CHROMOSOME"]
        if len(chromosome):
            chromosome = O.uri(O.SGD_FEATURE, "chromosome_" + chromosome)
            self._add_if_new(triples, chromosomes, chromosome,
                             O.RDF.type, O.SGD_CHROMOSOME)
            triples.append((sgd_id, O.SGD_ID_IN_CHROMOSOME, chromosome))
        if len(row["STRAND"]):
            triples.append((sgd_id, O.SGD_ID_IN_STRAND, L(row["STRAND"])))
        if len(row["START_COORD"]):
            triples.append((sgd_id, O.SGD_ID_STARTS_AT,
                            L(int(row["START_COORD"]))))
        if len(row["STOP_COORD"]):
            triples.append((sgd_id, O.SGD_ID_STOPS_AT,
                            L(int(row["STOP_COORD"]))))
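# `_sanitize` and `_sanitize_all` are referenced throughout this section
# but not defined here. A minimal sketch, assuming `_sanitize` normalizes
# a single value for use in URIs and `_sanitize_all` maps it over a
# |-separated field (the separator documented for `ALIASES` and
# `OTHER_SGD_IDS` above); both the name and signature are assumptions.
def _sanitize_all(self, field):
    """Hypothetical helper: sanitizes each element of a |-separated field."""
    return [self._sanitize(part) for part in field.split("|")]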