class HGNCParser(ReturnParser):

    def __init__(self):
        super(HGNCParser, self).__init__()

        # output data
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])
        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'], ['GeneSymbol'], ['sid'], ['sid', 'taxid'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        hgnc_instance = self.get_instance_by_name('HGNC')
        hgnc_complete_file = hgnc_instance.get_file('hgnc_complete_set.txt')
        self.parse_hgnc_complete_file(hgnc_complete_file)

    def parse_hgnc_complete_file(self, hgnc_complete_file):
        with open(hgnc_complete_file, 'rt') as f:
            # split the header line into field names so it can be zipped with each data row
            header_fields = next(f).strip().split('\t')

            for l in f:
                flds = l.strip().split('\t')

                sid = flds[0]
                gene_symbol = flds[1]
                ncbi_id = flds[18] if len(flds) > 18 else None
                ensembl_id = flds[19] if len(flds) > 19 else None

                all_props = dict(zip(header_fields, flds))
                all_props['sid'] = sid
                all_props['source'] = 'hgnc'

                self.genes.add_node(all_props)

                if ncbi_id:
                    self.gene_maps_gene.add_relationship({'sid': sid}, {'sid': ncbi_id}, {'source': 'hgnc'})
                if ensembl_id:
                    self.gene_maps_gene.add_relationship({'sid': sid}, {'sid': ensembl_id}, {'source': 'hgnc'})
                if gene_symbol:
                    self.gene_maps_genesymbol.add_relationship(
                        {'sid': sid},
                        {'sid': gene_symbol, 'taxid': '9606'},
                        {'source': 'hgnc'})
def __init__(self):
    super(NcbiGeneParser, self).__init__()

    # arguments
    self.arguments = ['taxid']

    # output data
    # both gene IDs and GeneSymbols have the label 'Gene'
    # two different NodeSets are used because only the GeneSymbol nodes need taxid for uniqueness
    self.genes = NodeSet(['Gene'], merge_keys=['sid'], default_props={'source': 'ncbigene'})
    self.genesymbols = NodeSet(['Gene'], merge_keys=['sid', 'taxid'],
                               default_props={'source': 'ncbigene', 'type': 'symbol'})

    self.genesymbol_synonym_genesymbol = RelationshipSet(
        'SYNONYM', ['Gene'], ['Gene'], ['sid', 'taxid'], ['sid', 'taxid'],
        default_props={'source': 'ncbigene'})
    self.gene_maps_genesymbol = RelationshipSet(
        'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid', 'taxid'],
        default_props={'source': 'ncbigene'})
def __init__(self): """ """ super(BigWordListParser, self).__init__() # NodeSets self.words = NodeSet(['Word'], merge_keys=['value'])
import string
from random import randint


class DummyParser(ReturnParser):

    def __init__(self):
        super(DummyParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
        self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])
        self.dummy_knows_fummy = RelationshipSet('KNOWS', ['Dummy'], ['Fummy'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        dummy_instance = self.get_instance_by_name('Dummy')
        dummyfile = dummy_instance.get_file('file.txt')

        target_sids = list(string.ascii_lowercase)

        # Fummy nodes
        for i in range(10):
            self.fummy_nodes.add_node({'sid': i, 'taxid': taxid})

        with open(dummyfile) as f:
            for l in f:
                letter = l.strip()
                self.dummy_nodes.add_node({'sid': letter, 'taxid': taxid})
                self.dummy_knows_fummy.add_relationship({'sid': letter}, {'sid': randint(0, 9)}, {'key': 'value'})
def __init__(self):
    super(LncipediaParser, self).__init__()

    self.genes = NodeSet(['Gene'], merge_keys=['sid'])
    self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'])

    self.gene_codes_transcripts = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'])
    self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
    self.transcript_maps_transcript = RelationshipSet('MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'])
def __init__(self):
    super(RefseqEntityParser, self).__init__()

    # arguments
    self.arguments = ['taxid']

    # define NodeSet and RelationshipSet
    self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'], default_props={'source': 'refseq'})
    self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'refseq'})
def __init__(self):
    super(SwissLipidsParser, self).__init__()

    # define NodeSet and RelationshipSet
    self.lipids = NodeSet(['Lipid'], merge_keys=['sid'])

    self.lipid_fromclass_lipid = RelationshipSet('FROM_LIPID_CLASS', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
    self.lipid_parent_lipid = RelationshipSet('HAS_PARENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
    self.lipid_component_lipid = RelationshipSet('HAS_COMPONENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
    self.lipid_maps_metabolite = RelationshipSet('MAPS', ['Lipid'], ['Metabolite'], ['sid'], ['sid'])
    self.lipid_associates_protein = RelationshipSet('HAS_ASSOCIATION', ['Lipid'], ['Protein'], ['sid'], ['sid'])
def __init__(self): """ :param ensembl_instance: The ENSEMBL DataSource instance. """ super(EnsemblLocusParser, self).__init__() # arguments self.arguments = ['taxid'] # NodeSets self.locus = NodeSet(['Locus'], merge_keys=['uuid'], default_props={'source': 'ensembl'})
class RootTestParser(ReturnParser):

    def __init__(self):
        super(RootTestParser, self).__init__()

        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        for i in range(100):
            self.source.add_node({'source_id': i})
            self.target.add_node({'target_id': i})
from uuid import uuid4


class EnsemblLocusParser(ReturnParser):

    def __init__(self):
        super(EnsemblLocusParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSets
        self.locus = NodeSet(['Locus'], merge_keys=['uuid'], default_props={'source': 'ensembl'})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        ensembl_instance = self.get_instance_by_name('Ensembl')
        datasource_name = ensembl_instance.datasource.name

        ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid, ensembl_instance)

        annotation = GffReader(ensembl_gtf_file_path)

        log.info("Start parsing ENSEMBL gtf file, taxid {}, {}".format(taxid, ensembl_gtf_file_path))

        for r in annotation.records:
            # one line is one unique Locus
            props = {
                'chr': r.chr,
                'annotation_source': r.source,
                'start': int(r.start),
                'end': int(r.end),
                'type': r.type,
                'score': r.score,
                'strand': r.strand,
                'frame': r.frame,
                'taxid': taxid,
                'ref': 'h38',
                'uuid': str(uuid4())
            }
            props.update(r.attributes)

            self.locus.add_node(props)

        log.info("Finished parsing ENSEMBL gtf file.")
def __init__(self):
    super(HmdbParser, self).__init__()

    # NodeSets
    self.metabolites = NodeSet(['Metabolite'], merge_keys=['sid'], default_props={'source': 'hmdb'})

    self.metabolite_map_metabolite = RelationshipSet(
        'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
        default_props={'source': 'hmdb'})
    self.metabolite_associates_protein = RelationshipSet(
        'HAS_ASSOCIATION', ['Metabolite'], ['Protein'], ['sid'], ['sid'],
        default_props={'source': 'hmdb'})
import gzip


class NcbiLegacyGeneParser(ReturnParser):
    """
    Parse legacy gene IDs from gene_history.gz

    #tax_id  GeneID  Discontinued_GeneID  Discontinued_Symbol  Discontinue_Date
    9        -       1246494              repA1                20031113
    9        -       1246495              repA2                20031113
    9        -       1246496              leuA                 20031113
    """

    def __init__(self):
        super(NcbiLegacyGeneParser, self).__init__()

        self.arguments = ['taxid']

        self.legacy_genes = NodeSet(['Gene', 'Legacy'], merge_keys=['sid'],
                                    default_props={'source': 'ncbigene'})
        self.legacy_gene_now_gene = RelationshipSet(
            'REPLACED_BY', ['Gene', 'Legacy'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ncbigene'})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        log.debug(f'Run parser {self.__class__.__name__} for taxID: {taxid}.')
        ncbigene_instance = self.get_instance_by_name('NcbiGene')
        gene_history_file = ncbigene_instance.get_file('gene_history.gz')

        with gzip.open(gene_history_file, 'rt') as f:
            # skip header
            next(f)
            for l in f:
                flds = l.strip().split('\t')
                this_taxid = flds[0]
                if this_taxid == taxid:
                    new_gene_id = flds[1]
                    discontinued_gene_id = flds[2]
                    discontinued_symbol = flds[3]
                    date = flds[4]

                    self.legacy_genes.add_node({
                        'sid': discontinued_gene_id,
                        'date': date,
                        'symbol': discontinued_symbol,
                        'taxid': taxid
                    })

                    # '-' means the gene was discontinued without a replacement
                    if new_gene_id != '-':
                        self.legacy_gene_now_gene.add_relationship(
                            {'sid': discontinued_gene_id}, {'sid': new_gene_id}, {})
def __init__(self):
    super(RefseqRemovedRecordsParser, self).__init__()

    self.arguments = ['taxid']

    self.legacy_ids = set()

    self.legacy_transcripts = NodeSet(['Transcript', 'Legacy'], merge_keys=['sid'],
                                      default_props={'source': 'refseq'})
    self.legacy_transcript_now_transcript = RelationshipSet(
        'REPLACED_BY', ['Transcript'], ['Transcript'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})
    self.legacy_proteins = NodeSet(['Protein', 'Legacy'], merge_keys=['sid'],
                                   default_props={'source': 'refseq'})
    self.legacy_protein_now_protein = RelationshipSet(
        'REPLACED_BY', ['Protein'], ['Protein'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})
    self.gene_codes_legacy_transcript = RelationshipSet(
        'CODES', ['Gene'], ['Transcript', 'Legacy'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})
    self.legacy_transcript_codes_protein = RelationshipSet(
        'CODES', ['Transcript', 'Legacy'], ['Protein'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})
def __init__(self):
    super(MirbaseParser, self).__init__()

    # NodeSets
    self.precursor_mirna = NodeSet(['PrecursorMirna'], merge_keys=['sid'])
    self.mature_mirna = NodeSet(['Mirna'], merge_keys=['sid'])

    # RelationshipSets
    self.precursor_codes_mature = RelationshipSet('PRE', ['PrecursorMirna'], ['Mirna'], ['sid'], ['sid'])
    self.transcript_codes_precursor = RelationshipSet('IS', ['Transcript'], ['PrecursorMirna'], ['sid'], ['sid'])
    self.gene_is_precursor = RelationshipSet('IS', ['Gene'], ['PrecursorMirna'], ['sid'], ['sid'])
def deserialize(cls, source_dir: str, metadata_only: bool = False) -> 'Parser':
    """
    Read from a serialized directory, recreate a Parser that can load to the database.

    :param source_dir: Directory to read from.
    :param metadata_only: Only restore the Parser metadata, skip NodeSets and RelationshipSets.
    :return: A Parser object.
    """
    log.debug(f"Read Parser from {source_dir}.")
    p = cls()

    for file in os.listdir(source_dir):
        if not metadata_only:
            if file.startswith('nodeset_'):
                ns_name = file.replace('.json', '')
                with open(os.path.join(source_dir, file), 'rt') as f:
                    log.debug(f"Deserialize {file}")
                    ns = NodeSet.from_dict(json.load(f))
                    log.debug(f"Num nodes in NodeSet: {len(ns.nodes)}")
                    p.__dict__[ns_name] = ns

            elif file.startswith('relationshipset_'):
                rs_name = file.replace('.json', '')
                with open(os.path.join(source_dir, file), 'rt') as f:
                    log.debug(f"Deserialize {file}")
                    rs = RelationshipSet.from_dict(json.load(f))
                    log.debug(f"Num relationships in RelationshipSet: {len(rs.relationships)}")
                    p.__dict__[rs_name] = rs

        if file == 'parser_data.json':
            with open(os.path.join(source_dir, file), 'rt') as f:
                metadata = json.load(f)
                # TODO add datasource instances to deserializer
                p.name = metadata['name']

    return p
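# Usage sketch for deserialize(). Assumptions (not from the source): the
# directory was produced by the matching serializer and contains the
# 'nodeset_*.json' / 'relationshipset_*.json' files plus 'parser_data.json'
# handled above; the path and the SomeParser class are illustrative only.
#
#     restored = SomeParser.deserialize('/data/serialized/SomeParser')
#     print(restored.name)
#
#     # skip the (potentially large) node/relationship payloads:
#     meta_only = SomeParser.deserialize('/data/serialized/SomeParser', metadata_only=True)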
def __init__(self): """ :param uniprot_instance: The Uniprot instance :param taxid: The taxid """ super(UniprotKnowledgebaseParser, self).__init__() # arguments self.arguments = ['taxid'] # NodeSet self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'uniprot'}) # RelationshipSet self.protein_primary_protein = RelationshipSet('PRIMARY', ['Protein'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'}) self.transcript_codes_protein = RelationshipSet('CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'}) self.protein_maps_protein = RelationshipSet('MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'})
def __init__(self): """ :param ncbigene_instance: NcbiGene Instance :type ncbigene_instance: DataSourceInstance :param taxid: """ super(HGNCParser, self).__init__() # output data self.genes = NodeSet(['Gene'], merge_keys=['sid']) self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid']) self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'], ['GeneSymbol'], ['sid'], ['sid', 'taxid'])
class SomeParser(ReturnParser):

    def __init__(self):
        super(SomeParser, self).__init__()

        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])
        self.rels = RelationshipSet('FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        for i in range(100):
            self.source.add_node({'source_id': i})
            self.target.add_node({'target_id': i})
            self.rels.add_relationship({'source_id': i}, {'target_id': i}, {'source': 'test'})
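# A minimal end-to-end sketch for running a ReturnParser subclass such as
# SomeParser and merging its output. The .merge(graph) calls mirror the UN
# population loader below; the py2neo connection details are assumptions,
# not taken from the source.

from py2neo import Graph

graph = Graph('bolt://localhost:7687', auth=('neo4j', 'password'))  # hypothetical connection

parser = SomeParser()
parser.run_with_mounted_arguments()

# merge the collected NodeSets first, then the RelationshipSet
parser.source.merge(graph)
parser.target.merge(graph)
parser.rels.merge(graph)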
def _add_node(self, node):
    # create NodeSet if necessary
    labels = frozenset(node.labels)
    if labels not in self.nodeSets:
        # get primary keys
        self.nodeSets[labels] = NodeSet(list(labels), merge_keys=self._get_merge_keys(node))
    # add node to NodeSet
    self.nodeSets[labels].nodes.append(node)
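# For context, a minimal sketch of a container that _add_node could live in.
# The nodeSets dictionary keyed by frozenset of labels is taken from the code
# above; the _get_merge_keys heuristic and the py2neo-style node objects
# (a .labels set plus dict-like properties) are assumptions for illustration.

class NodeSetCollector:

    def __init__(self):
        # maps frozenset of labels -> NodeSet
        self.nodeSets = {}

    def _get_merge_keys(self, node):
        # hypothetical heuristic: merge on all properties of the first node seen
        return sorted(node.keys())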
class BigWordListParser(ReturnParser):

    def __init__(self):
        super(BigWordListParser, self).__init__()

        # NodeSets
        self.words = NodeSet(['Word'], merge_keys=['value'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        log.info("Run {}".format(self.__class__.__name__))
        bigwordlist_instance = self.get_instance_by_name('BigWordList')

        # collect the words and a list of the word lists they are mentioned in
        word_to_list = {}

        for i in range(3, 13):
            match_file = bigwordlist_instance.get_file('wlist_match{}.txt'.format(i))
            try:
                log.info("Open {}".format(match_file))
                with open(match_file, 'r') as f:
                    for l in f:
                        word = l.strip()
                        if word in word_to_list:
                            word_to_list[word].append(i)
                        else:
                            word_to_list[word] = [i]
            except TypeError:
                log.info("Cannot open file {}".format(match_file))

        for word, list_of_matches in word_to_list.items():
            node_props = {'value': word}
            for i in list_of_matches:
                node_props["match{}".format(i)] = True
            self.words.add_node(node_props)
def __init__(self): """ :param ensembl_instance: The ENSEMBL DataSource instance. """ super(EnsemblEntityParser, self).__init__() # arguments self.arguments = ['taxid'] # NodeSets self.genes = NodeSet(['Gene'], merge_keys=['sid'], default_props={'source': 'ensembl'}) self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'], default_props={'source': 'ensembl'}) self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'ensembl'}) # RelationshipSets self.gene_codes_transcript = RelationshipSet( 'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'ensembl'}) self.transcript_codes_protein = RelationshipSet( 'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'ensembl'})
def __init__(self): """ :param mesh_instance: NcbiGene Instance :type mesh_instance: DataSourceInstance """ super(GtexMetadataParser, self).__init__() # NodeSets self.tissues = NodeSet(['GtexTissue'], merge_keys=['name']) self.detailed_tissues = NodeSet(['GtexDetailedTissue'], merge_keys=['name']) self.sample = NodeSet(['GtexSample'], merge_keys=['sid']) self.sample_measures_tissue = RelationshipSet('MEASURES', ['GtexSample'], ['GtexTissue'], ['sid'], ['name']) self.sample_measures_detailed_tissue = RelationshipSet( 'MEASURES', ['GtexSample'], ['GtexDetailedTissue'], ['sid'], ['name']) self.tissue_parent_detailed_tissue = RelationshipSet( 'PARENT', ['GtexTissue'], ['GtexDetailedTissue'], ['name'], ['name']) self.tissue_parent_detailed_tissue.unique = True
def __init__(self):
    super(MeshParser, self).__init__()

    # NodeSets
    self.descriptor = NodeSet(['MeshDescriptor'], merge_keys=['sid'])
    self.qualifier = NodeSet(['MeshQualifier'], merge_keys=['sid'])
    self.concept = NodeSet(['MeshConcept'], merge_keys=['sid'])
    self.term = NodeSet(['MeshTerm'], merge_keys=['sid'])

    self.descriptor_allowed_qualifier = RelationshipSet('ALLOWED', ['MeshDescriptor'], ['MeshQualifier'],
                                                        ['sid'], ['sid'])
    self.descriptor_has_concept = RelationshipSet('HAS', ['MeshDescriptor'], ['MeshConcept'], ['sid'], ['sid'])
    self.descriptor_has_concept.unique = True
    self.concept_has_term = RelationshipSet('HAS', ['MeshConcept'], ['MeshTerm'], ['sid'], ['sid'])
    self.concept_has_term.unique = True
    self.concept_related_concept = RelationshipSet('RELATED', ['MeshConcept'], ['MeshConcept'], ['sid'], ['sid'])
    self.concept_related_concept.unique = True
import pronto


class ChebiParser(ReturnParser):

    def __init__(self):
        super(ChebiParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'], merge_keys=['sid'], default_props={'source': 'chebi'})

        self.metabolite_isa_metabolite = RelationshipSet(
            'IS_A', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_rel_metabolite = RelationshipSet(
            'CHEBI_REL', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_maps_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        chebi_instance = self.get_instance_by_name('Chebi')
        obo_file = chebi_instance.get_file('chebi.obo')

        cleaned_obo_file = clean_obo_file(obo_file)

        chebi_ontology = pronto.Ontology(cleaned_obo_file)

        reltypes = set()

        # iterate terms
        for term in chebi_ontology.terms():
            # numeric part of the CHEBI ID ('CHEBI:12345' -> '12345')
            term_sid = term.id.split(':')[1]
            ontology_id = term.id

            self.metabolites.add_node({
                'name': term.name,
                'sid': term_sid,
                'ontology_id': ontology_id,
                'definition': term.definition,
                'alt_ids': list(term.alternate_ids)
            })

            for parent in term.superclasses(distance=1, with_self=False):
                # use the numeric part of the parent ID so it matches the Metabolite merge key
                self.metabolite_isa_metabolite.add_relationship(
                    {'sid': term_sid}, {'sid': parent.id.split(':')[1]}, {})

            # other named relationships
            try:
                for reltype, targets in term.relationships.items():
                    for target in targets:
                        self.metabolite_rel_metabolite.add_relationship(
                            {'sid': term_sid}, {'sid': target.id.split(':')[1]}, {'type': reltype.id})
            except KeyError as e:
                log.error(f"Cannot iterate relationships of term {term_sid}")
                log.error(e)

            # metabolite-MAPS-metabolite
            for xref in term.xrefs:
                if 'HMDB:' in xref.id:
                    hmdb_id = xref.id.strip().split('HMDB:')[1]
                    self.metabolite_maps_metabolite.add_relationship(
                        {'sid': term_sid}, {'sid': hmdb_id}, {})
import csv
import os


def load_wpp_data(base_path, graph):
    """
    Load UN population data.

    :param base_path: Path where the file was downloaded.
    :param graph: The graph instance to merge the data into.
    """
    un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')

    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))

    country = NodeSet(['Country'], merge_keys=['name'])
    age_group_nodes = NodeSet(['AgeGroup'], merge_keys=['group'])
    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])

    countries_added = set()
    age_groups_added = set()

    with open(un_wpp_csv_file, 'rt') as f:
        csv_file = csv.reader(f, delimiter=',', quotechar='"')
        # skip header
        next(csv_file)

        for row in csv_file:
            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
            loc_id = row[0]
            location = row[1]
            time = int(row[4])
            age_group = row[6]
            age_group_start = int(row[7])
            age_group_span = row[8]
            # population counts are given in thousands
            pop_male = int(float(row[9]) * 1000)
            pop_female = int(float(row[10]) * 1000)
            pop_total = int(float(row[11]) * 1000)

            # only take 2019
            if time == 2019:
                if location not in countries_added:
                    country.add_node({'name': location, 'un_id': loc_id})
                    countries_added.add(location)
                if age_group not in age_groups_added:
                    age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
                    age_groups_added.add(age_group)

                country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
                country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
                country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})

    log.info('Load data to Neo4j')
    country.merge(graph)
    age_group_nodes.merge(graph)
    country_total_group.merge(graph)
    country_male_group.merge(graph)
    country_female_group.merge(graph)
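# Hypothetical invocation of load_wpp_data (connection details and download
# path are assumptions, not from the source):
#
#     from py2neo import Graph
#
#     graph = Graph('bolt://localhost:7687', auth=('neo4j', 'password'))
#     load_wpp_data('/data/un_wpp', graph)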
import os


class EnsemblEntityParser(ReturnParser):

    def __init__(self):
        super(EnsemblEntityParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSets
        self.genes = NodeSet(['Gene'], merge_keys=['sid'], default_props={'source': 'ensembl'})
        self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'], default_props={'source': 'ensembl'})
        self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'ensembl'})

        # RelationshipSets
        self.gene_codes_transcript = RelationshipSet(
            'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        ensembl_instance = self.get_instance_by_name('Ensembl')
        datasource_name = ensembl_instance.datasource.name

        # try patched path, if not available take flat
        ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid, ensembl_instance, patched=True)
        if not os.path.exists(ensembl_gtf_file_path):
            ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid, ensembl_instance, patched=False)

        annotation = GffReader(ensembl_gtf_file_path)

        # track IDs that were seen already to avoid duplicate nodes and relationships
        check_gene_ids = set()
        check_transcript_ids = set()
        check_protein_ids = set()
        check_gene_transcript_rels = set()
        check_transcript_protein_rels = set()

        log.info("Start parsing ENSEMBL gtf file, taxid {}, {}".format(taxid, ensembl_gtf_file_path))

        for r in annotation.records:
            # add gene node
            gene_id = r.attributes['gene_id']
            if gene_id not in check_gene_ids:
                props = {'sid': gene_id, 'name': r.attributes['gene_name'], 'taxid': taxid}
                self.genes.add_node(props)
                check_gene_ids.add(gene_id)

            # add transcript node and Gene-CODES-Transcript relationship
            if r.type == 'transcript':
                transcript_id = r.attributes['transcript_id']
                if transcript_id not in check_transcript_ids:
                    self.transcripts.add_node({'sid': transcript_id, 'taxid': taxid})
                    check_transcript_ids.add(transcript_id)

                if gene_id + transcript_id not in check_gene_transcript_rels:
                    self.gene_codes_transcript.add_relationship({'sid': gene_id}, {'sid': transcript_id}, {})
                    check_gene_transcript_rels.add(gene_id + transcript_id)

            # add protein node and Transcript-CODES-Protein relationship
            if r.type == 'CDS':
                protein_id = r.attributes['protein_id']
                transcript_id = r.attributes['transcript_id']
                if protein_id not in check_protein_ids:
                    self.proteins.add_node({'sid': protein_id, 'taxid': taxid})
                    check_protein_ids.add(protein_id)

                if transcript_id + protein_id not in check_transcript_protein_rels:
                    self.transcript_codes_protein.add_relationship({'sid': transcript_id}, {'sid': protein_id}, {})
                    check_transcript_protein_rels.add(transcript_id + protein_id)

        log.info("Finished parsing ENSEMBL gtf file.")