Ejemplo n.º 1
0
class HGNCParser(ReturnParser):
    """Parse the HGNC complete gene set into Gene nodes and MAPS
    relationships to NCBI / ENSEMBL gene IDs and to gene symbols."""

    def __init__(self):
        super(HGNCParser, self).__init__()

        # output data
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])

        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'],
                                              ['sid'], ['sid'])
        self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'],
                                                    ['GeneSymbol'], ['sid'],
                                                    ['sid', 'taxid'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        hgnc_instance = self.get_instance_by_name('HGNC')

        hgnc_complete_file = hgnc_instance.get_file('hgnc_complete_set.txt')
        self.parse_hgnc_complete_file(hgnc_complete_file)

    def parse_hgnc_complete_file(self, hgnc_complete_file):
        """Parse hgnc_complete_set.txt (tab separated, one gene per line).

        :param hgnc_complete_file: Path to the hgnc_complete_set.txt file.
        """
        with open(hgnc_complete_file, 'rt') as f:
            # BUGFIX: split the header into column names. Previously the raw
            # header *string* was zipped with the fields, pairing each field
            # with a single character instead of a column name.
            header = next(f).strip().split('\t')

            for l in f:
                flds = l.strip().split('\t')
                sid = flds[0]
                gene_symbol = flds[1]
                # columns 18/19 (entrez_id / ensembl_gene_id) can be missing
                ncbi_id = flds[18] if len(flds) > 18 else None
                ensembl_id = flds[19] if len(flds) > 19 else None

                # keep all columns as node properties
                all_props = dict(zip(header, flds))
                all_props['sid'] = sid
                all_props['source'] = 'hgnc'

                self.genes.add_node(all_props)

                if ncbi_id:
                    self.gene_maps_gene.add_relationship({'sid': sid},
                                                         {'sid': ncbi_id},
                                                         {'source': 'hgnc'})
                if ensembl_id:
                    self.gene_maps_gene.add_relationship({'sid': sid},
                                                         {'sid': ensembl_id},
                                                         {'source': 'hgnc'})

                if gene_symbol:
                    self.gene_maps_genesymbol.add_relationship({'sid': sid}, {
                        'sid': gene_symbol,
                        'taxid': '9606'
                    }, {'source': 'hgnc'})
Ejemplo n.º 2
0
    def __init__(self):
        """Set up the Source/Target NodeSets and the FOO RelationshipSet."""
        super().__init__()

        # endpoints of the FOO relationship
        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])

        # (Source)-[:FOO]->(Target), matched on the respective id properties
        self.rels = RelationshipSet(
            'FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])
Ejemplo n.º 3
0
    def __init__(self):
        """Set up arguments and output NodeSets/RelationshipSets."""
        super().__init__()

        # parser arguments
        self.arguments = ['taxid']

        # Output data: gene IDs and gene symbols both carry the 'Gene' label.
        # Two NodeSets are used because only GeneSymbol nodes need the taxid
        # as part of their uniqueness key.
        self.genes = NodeSet(
            ['Gene'], merge_keys=['sid'],
            default_props={'source': 'ncbigene'})
        self.genesymbols = NodeSet(
            ['Gene'], merge_keys=['sid', 'taxid'],
            default_props={'source': 'ncbigene', 'type': 'symbol'})

        self.genesymbol_synonym_genesymbol = RelationshipSet(
            'SYNONYM', ['Gene'], ['Gene'], ['sid', 'taxid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})
        self.gene_maps_genesymbol = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})
Ejemplo n.º 4
0
    def __init__(self):
        """Set up the Word NodeSet (unique on 'value')."""
        super().__init__()

        # output NodeSet
        self.words = NodeSet(['Word'], merge_keys=['value'])
Ejemplo n.º 5
0
class DummyParser(ReturnParser):
    """Toy parser for tests: creates Dummy nodes from a text file, ten Fummy
    nodes, and random KNOWS relationships between them."""

    def __init__(self):
        super(DummyParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
        self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])

        self.dummy_knows_fummy = RelationshipSet('KNOWS', ['Dummy'], ['Fummy'],
                                                 ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        """Create the dummy data.

        :param taxid: Taxonomy ID stored on every node.
        """
        dummy_instance = self.get_instance_by_name('Dummy')
        dummyfile = dummy_instance.get_file('file.txt')

        # removed unused local `target_sids` (list of ascii letters that was
        # never read)

        # ten Fummy nodes with integer sids 0..9
        for i in range(10):
            self.fummy_nodes.add_node({'sid': i, 'taxid': taxid})

        # one Dummy node per line; each KNOWS a randomly chosen Fummy
        with open(dummyfile) as f:
            for l in f:
                letter = l.strip()
                self.dummy_nodes.add_node({'sid': letter, 'taxid': taxid})
                self.dummy_knows_fummy.add_relationship({'sid': letter},
                                                        {'sid': randint(0, 9)},
                                                        {'key': 'value'})
Ejemplo n.º 6
0
    def __init__(self):
        """Set up NodeSets for genes/transcripts and their relationships."""
        super().__init__()

        # output nodes
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])
        self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'])

        # output relationships
        self.gene_codes_transcripts = RelationshipSet(
            'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'])
        self.gene_maps_gene = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        self.transcript_maps_transcript = RelationshipSet(
            'MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'])
Ejemplo n.º 7
0
    def __init__(self):
        """Set up arguments and the Transcript/Protein NodeSets."""
        super().__init__()

        # parser arguments
        self.arguments = ['taxid']

        # output NodeSets, all tagged with source 'refseq'
        self.transcripts = NodeSet(
            ['Transcript'], merge_keys=['sid'],
            default_props={'source': 'refseq'})
        self.proteins = NodeSet(
            ['Protein'], merge_keys=['sid'],
            default_props={'source': 'refseq'})
Ejemplo n.º 8
0
    def __init__(self):
        """Set up arguments and the legacy-gene output sets."""
        super().__init__()

        self.arguments = ['taxid']

        # discontinued genes carry an extra 'Legacy' label
        self.legacy_genes = NodeSet(
            ['Gene', 'Legacy'], merge_keys=['sid'],
            default_props={'source': 'ncbigene'})
        # (legacy Gene)-[:REPLACED_BY]->(current Gene)
        self.legacy_gene_now_gene = RelationshipSet(
            'REPLACED_BY', ['Gene', 'Legacy'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ncbigene'})
Ejemplo n.º 9
0
    def __init__(self):
        """Set up arguments and the Dummy/Fummy output sets."""
        super().__init__()

        # parser arguments
        self.arguments = ['taxid']

        # output nodes
        self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
        self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])

        # (Dummy)-[:KNOWS]->(Fummy)
        self.dummy_knows_fummy = RelationshipSet(
            'KNOWS', ['Dummy'], ['Fummy'], ['sid'], ['sid'])
Ejemplo n.º 10
0
    def __init__(self):
        """Set up the Lipid NodeSet and its relationship sets."""
        super().__init__()

        # output nodes
        self.lipids = NodeSet(['Lipid'], merge_keys=['sid'])

        # lipid-to-lipid hierarchy and composition
        self.lipid_fromclass_lipid = RelationshipSet(
            'FROM_LIPID_CLASS', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_parent_lipid = RelationshipSet(
            'HAS_PARENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_component_lipid = RelationshipSet(
            'HAS_COMPONENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])

        # cross references to metabolites and proteins
        self.lipid_maps_metabolite = RelationshipSet(
            'MAPS', ['Lipid'], ['Metabolite'], ['sid'], ['sid'])
        self.lipid_associates_protein = RelationshipSet(
            'HAS_ASSOCIATION', ['Lipid'], ['Protein'], ['sid'], ['sid'])
Ejemplo n.º 11
0
    def __init__(self):
        """Set up arguments and the Locus NodeSet."""
        super().__init__()

        # parser arguments
        self.arguments = ['taxid']

        # Locus nodes are keyed by a generated uuid
        self.locus = NodeSet(
            ['Locus'], merge_keys=['uuid'],
            default_props={'source': 'ensembl'})
Ejemplo n.º 12
0
class RootTestParser(ReturnParser):
    """Minimal test parser that emits 100 Source and 100 Target nodes."""

    def __init__(self):
        super().__init__()

        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        # one Source and one Target node per index
        for idx in range(100):
            self.source.add_node({'source_id': idx})
            self.target.add_node({'target_id': idx})
Ejemplo n.º 13
0
class EnsemblLocusParser(ReturnParser):
    """Parse the ENSEMBL GTF annotation into Locus nodes, one per record."""

    def __init__(self):
        super(EnsemblLocusParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSets
        self.locus = NodeSet(['Locus'],
                             merge_keys=['uuid'],
                             default_props={'source': 'ensembl'})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        """Parse the GTF file for *taxid*, creating one Locus node per record.

        :param taxid: Taxonomy ID of the organism to parse.
        """
        ensembl_instance = self.get_instance_by_name('Ensembl')
        # removed unused local `datasource_name`

        ensembl_gtf_file_path = Ensembl.get_gtf_file_path(
            taxid, ensembl_instance)

        annotation = GffReader(ensembl_gtf_file_path)

        log.info("Start parsing ENSEMBL gtf file, taxid {}, {}".format(
            taxid, ensembl_gtf_file_path))
        for r in annotation.records:

            # one line is one unique Locus
            props = {
                'chr': r.chr,
                'annotation_source': r.source,
                'start': int(r.start),
                'end': int(r.end),
                'type': r.type,
                'score': r.score,
                'strand': r.strand,
                'frame': r.frame,
                'taxid': taxid,
                # NOTE(review): reference build is hard-coded to 'h38' even
                # for non-human taxids — confirm this is intended.
                'ref': 'h38',
                'uuid': str(uuid4())
            }
            # GTF attribute column becomes additional node properties
            props.update(r.attributes)

            self.locus.add_node(props)

        log.info("Finished parsing ENSEMBL gtf file.")
Ejemplo n.º 14
0
    def __init__(self):
        """Set up the Metabolite NodeSet and its relationship sets."""
        super().__init__()

        # output nodes, all tagged with source 'hmdb'
        self.metabolites = NodeSet(
            ['Metabolite'], merge_keys=['sid'],
            default_props={'source': 'hmdb'})

        # cross references to other metabolite databases
        self.metabolite_map_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})
        # metabolite/protein associations
        self.metabolite_associates_protein = RelationshipSet(
            'HAS_ASSOCIATION', ['Metabolite'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})
Ejemplo n.º 15
0
class NcbiLegacyGeneParser(ReturnParser):
    """
    Parse legacy gene IDs from gene_history.gz
    #tax_id GeneID  Discontinued_GeneID     Discontinued_Symbol     Discontinue_Date
    9       -       1246494 repA1   20031113
    9       -       1246495 repA2   20031113
    9       -       1246496 leuA    20031113
    """
    def __init__(self):
        super().__init__()

        self.arguments = ['taxid']

        # discontinued genes carry an extra 'Legacy' label
        self.legacy_genes = NodeSet(['Gene', 'Legacy'],
                                    merge_keys=['sid'],
                                    default_props={'source': 'ncbigene'})
        # (legacy Gene)-[:REPLACED_BY]->(current Gene)
        self.legacy_gene_now_gene = RelationshipSet(
            'REPLACED_BY', ['Gene', 'Legacy'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ncbigene'})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        """Parse gene_history.gz for the given taxid.

        :param taxid: Taxonomy ID to filter on (string comparison against
            column 0 — pass as str).
        """
        log.debug(f'Run parser {self.__class__.__name__} for taxID: {taxid}.')
        ncbigene_instance = self.get_instance_by_name('NcbiGene')
        gene_history_file = ncbigene_instance.get_file('gene_history.gz')

        with gzip.open(gene_history_file, 'rt') as f:
            next(f)  # skip header
            for line in f:
                fields = line.strip().split('\t')
                # skip rows for other organisms
                if fields[0] != taxid:
                    continue
                new_gene_id = fields[1]
                discontinued_gene_id = fields[2]
                self.legacy_genes.add_node({
                    'sid': discontinued_gene_id,
                    'date': fields[4],
                    'symbol': fields[3],
                    'taxid': taxid
                })
                # '-' means discontinued without a replacement gene
                if new_gene_id != '-':
                    self.legacy_gene_now_gene.add_relationship(
                        {'sid': discontinued_gene_id},
                        {'sid': new_gene_id}, {})
Ejemplo n.º 16
0
    def __init__(self):
        """Set up output sets for removed (legacy) RefSeq records."""
        super().__init__()

        self.arguments = ['taxid']

        # IDs seen so far while parsing
        self.legacy_ids = set()

        # removed transcripts/proteins carry an extra 'Legacy' label
        self.legacy_transcripts = NodeSet(
            ['Transcript', 'Legacy'], merge_keys=['sid'],
            default_props={'source': 'refseq'})
        self.legacy_proteins = NodeSet(
            ['Protein', 'Legacy'], merge_keys=['sid'],
            default_props={'source': 'refseq'})

        # legacy record -> current record
        self.legacy_transcript_now_transcript = RelationshipSet(
            'REPLACED_BY', ['Transcript'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'refseq'})
        self.legacy_protein_now_protein = RelationshipSet(
            'REPLACED_BY', ['Protein'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'refseq'})

        # coding relationships involving legacy transcripts
        self.gene_codes_legacy_transcript = RelationshipSet(
            'CODES', ['Gene'], ['Transcript', 'Legacy'], ['sid'], ['sid'],
            default_props={'source': 'refseq'})
        self.legacy_transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript', 'Legacy'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'refseq'})
Ejemplo n.º 17
0
    def __init__(self):
        """Set up the Metabolite NodeSet and the ChEBI relationship sets."""
        super().__init__()

        # output nodes
        self.metabolites = NodeSet(
            ['Metabolite'], merge_keys=['sid'],
            default_props={'source': 'chebi'})

        # ontology hierarchy, other named relations, and cross references
        self.metabolite_isa_metabolite = RelationshipSet(
            'IS_A', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_rel_metabolite = RelationshipSet(
            'CHEBI_REL', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_maps_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
Ejemplo n.º 18
0
    def __init__(self):
        """Set up miRNA NodeSets and their relationship sets."""
        super().__init__()

        # output nodes
        self.precursor_mirna = NodeSet(['PrecursorMirna'], merge_keys=['sid'])
        self.mature_mirna = NodeSet(['Mirna'], merge_keys=['sid'])

        # output relationships
        self.precursor_codes_mature = RelationshipSet(
            'PRE', ['PrecursorMirna'], ['Mirna'], ['sid'], ['sid'])
        self.transcript_codes_precursor = RelationshipSet(
            'IS', ['Transcript'], ['PrecursorMirna'], ['sid'], ['sid'])
        self.gene_is_precursor = RelationshipSet(
            'IS', ['Gene'], ['PrecursorMirna'], ['sid'], ['sid'])
Ejemplo n.º 19
0
    def deserialize(cls, source_dir: str, metadata_only: bool = False) -> 'Parser':
        """
        Read from a serialized directory, recreate a Parser that can load to the database.

        :param source_dir: Directory to read from.
        :param metadata_only: If True, only read parser metadata and skip
            NodeSet/RelationshipSet files.
        :return: A Parser object.
        """
        log.debug(f"Read Parser from {source_dir}.")
        parser = cls()

        for filename in os.listdir(source_dir):
            filepath = os.path.join(source_dir, filename)

            if not metadata_only:
                # attribute name is the file name without the .json suffix
                # (prefix 'nodeset_' / 'relationshipset_' is kept)
                if filename.startswith('nodeset_'):
                    attr_name = filename.replace('.json', '')
                    with open(filepath, 'rt') as f:
                        log.debug(f"Deserialize {f}")
                        nodeset = NodeSet.from_dict(json.load(f))
                        log.debug(f"Num nodes in NodeSet: {len(nodeset.nodes)}")
                        parser.__dict__[attr_name] = nodeset

                elif filename.startswith('relationshipset_'):
                    attr_name = filename.replace('.json', '')
                    with open(filepath, 'rt') as f:
                        log.debug(f"Deserialize {f}")
                        relset = RelationshipSet.from_dict(json.load(f))
                        log.debug(f"Num relationships in RelationshipSet: {len(relset.relationships)}")
                        parser.__dict__[attr_name] = relset

            if filename == 'parser_data.json':
                with open(filepath, 'rt') as f:
                    metadata = json.load(f)
                    # TODO add datasource instances to deserializer
                    parser.name = metadata['name']

        return parser
Ejemplo n.º 20
0
    def __init__(self):
        """Set up arguments, the Protein NodeSet and UniProt relationship sets."""
        super().__init__()

        # parser arguments
        self.arguments = ['taxid']

        # output nodes
        self.proteins = NodeSet(
            ['Protein'], merge_keys=['sid'],
            default_props={'source': 'uniprot'})

        # output relationships
        self.protein_primary_protein = RelationshipSet(
            'PRIMARY', ['Protein'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'uniprot'})
        self.transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'uniprot'})
        self.protein_maps_protein = RelationshipSet(
            'MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'uniprot'})
Ejemplo n.º 21
0
    def __init__(self):
        """Set up the Gene NodeSet and the HGNC MAPS relationship sets."""
        super(HGNCParser, self).__init__()

        # output nodes
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])

        # HGNC gene -> NCBI/ENSEMBL gene
        self.gene_maps_gene = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        # HGNC gene -> gene symbol (symbols need taxid for uniqueness)
        self.gene_maps_genesymbol = RelationshipSet(
            'MAPS', ['Gene'], ['GeneSymbol'], ['sid'], ['sid', 'taxid'])
Ejemplo n.º 22
0
class SomeParser(ReturnParser):
    """Test parser producing 100 Source/Target node pairs linked by FOO."""

    def __init__(self):
        super().__init__()

        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])
        self.rels = RelationshipSet(
            'FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        # one Source, one Target and one FOO relationship per index
        for idx in range(100):
            self.source.add_node({'source_id': idx})
            self.target.add_node({'target_id': idx})
            self.rels.add_relationship(
                {'source_id': idx}, {'target_id': idx}, {'source': 'test'})
 def _add_node(self, node):
     """Append *node* to the NodeSet for its label combination, creating
     the NodeSet lazily on first use.

     :param node: Node object exposing .labels and its merge-key properties.
     """
     # NodeSets are keyed by the hashable frozenset of the node's labels
     labels = frozenset(node.labels)
     # idiom fix: `labels not in` instead of `not labels in`
     if labels not in self.nodeSets:
         # create the NodeSet with this node's primary keys
         self.nodeSets[labels] = NodeSet(
             list(labels), merge_keys=self._get_merge_keys(node))
     # add node to nodeset
     self.nodeSets[labels].nodes.append(node)
Ejemplo n.º 24
0
class BigWordListParser(ReturnParser):
    """Collect words from the BigWordList match files into Word nodes,
    flagging for each word which lists it appears in."""

    def __init__(self):
        super().__init__()

        # output NodeSet
        self.words = NodeSet(['Word'], merge_keys=['value'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        log.info("Run {}".format(self.__class__.__name__))

        bigwordlist_instance = self.get_instance_by_name('BigWordList')

        # map each word to the indices of the word lists it is mentioned in
        word_to_list = {}

        log.info("")
        for list_index in range(3, 13):
            match_file = bigwordlist_instance.get_file(
                'wlist_match{}.txt'.format(list_index))
            try:
                log.info("Open {}".format(match_file))
                with open(match_file, 'r') as f:
                    for line in f:
                        word = line.strip()
                        word_to_list.setdefault(word, []).append(list_index)
            except TypeError:
                # get_file returned None — file missing for this index
                log.info("Cannot open file {}".format(match_file))

        # one Word node per distinct word, with a matchN flag per list
        for word, matched_lists in word_to_list.items():
            node_props = {'value': word}
            for list_index in matched_lists:
                node_props["match{}".format(list_index)] = True

            self.words.add_node(node_props)
Ejemplo n.º 25
0
    def __init__(self):
        """Set up arguments and the ENSEMBL entity NodeSets/RelationshipSets."""
        super().__init__()

        # parser arguments
        self.arguments = ['taxid']

        # output nodes, all tagged with source 'ensembl'
        self.genes = NodeSet(
            ['Gene'], merge_keys=['sid'],
            default_props={'source': 'ensembl'})
        self.transcripts = NodeSet(
            ['Transcript'], merge_keys=['sid'],
            default_props={'source': 'ensembl'})
        self.proteins = NodeSet(
            ['Protein'], merge_keys=['sid'],
            default_props={'source': 'ensembl'})

        # coding relationships
        self.gene_codes_transcript = RelationshipSet(
            'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
Ejemplo n.º 26
0
    def __init__(self):
        """Set up NodeSets and RelationshipSets for GTEx metadata
        (tissues, detailed tissues and samples)."""
        super().__init__()

        # output nodes
        self.tissues = NodeSet(['GtexTissue'], merge_keys=['name'])
        self.detailed_tissues = NodeSet(
            ['GtexDetailedTissue'], merge_keys=['name'])
        self.sample = NodeSet(['GtexSample'], merge_keys=['sid'])

        # sample -> tissue relationships
        self.sample_measures_tissue = RelationshipSet(
            'MEASURES', ['GtexSample'], ['GtexTissue'], ['sid'], ['name'])
        self.sample_measures_detailed_tissue = RelationshipSet(
            'MEASURES', ['GtexSample'], ['GtexDetailedTissue'], ['sid'],
            ['name'])

        # tissue hierarchy; deduplicated on load
        self.tissue_parent_detailed_tissue = RelationshipSet(
            'PARENT', ['GtexTissue'], ['GtexDetailedTissue'], ['name'],
            ['name'])
        self.tissue_parent_detailed_tissue.unique = True
Ejemplo n.º 27
0
    def __init__(self):
        """Set up NodeSets and RelationshipSets for the MeSH vocabulary."""
        super().__init__()

        # output nodes
        self.descriptor = NodeSet(['MeshDescriptor'], merge_keys=['sid'])
        self.qualifier = NodeSet(['MeshQualifier'], merge_keys=['sid'])
        self.concept = NodeSet(['MeshConcept'], merge_keys=['sid'])
        self.term = NodeSet(['MeshTerm'], merge_keys=['sid'])

        # descriptor -> allowed qualifier
        self.descriptor_allowed_qualifier = RelationshipSet(
            'ALLOWED', ['MeshDescriptor'], ['MeshQualifier'], ['sid'], ['sid'])

        # descriptor/concept/term hierarchy; deduplicated on load
        self.descriptor_has_concept = RelationshipSet(
            'HAS', ['MeshDescriptor'], ['MeshConcept'], ['sid'], ['sid'])
        self.descriptor_has_concept.unique = True
        self.concept_has_term = RelationshipSet(
            'HAS', ['MeshConcept'], ['MeshTerm'], ['sid'], ['sid'])
        self.concept_has_term.unique = True
        self.concept_related_concept = RelationshipSet(
            'RELATED', ['MeshConcept'], ['MeshConcept'], ['sid'], ['sid'])
        self.concept_related_concept.unique = True
Ejemplo n.º 28
0
class ChebiParser(ReturnParser):
    """Parse the ChEBI OBO ontology into Metabolite nodes with IS_A,
    named CHEBI_REL and HMDB MAPS relationships."""

    def __init__(self):
        super(ChebiParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'chebi'})
        self.metabolite_isa_metabolite = RelationshipSet(
            'IS_A', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_rel_metabolite = RelationshipSet(
            'CHEBI_REL', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_maps_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        """Parse chebi.obo: one Metabolite node per ontology term."""
        chebi_instance = self.get_instance_by_name('Chebi')

        obo_file = chebi_instance.get_file('chebi.obo')

        cleaned_obo_file = clean_obo_file(obo_file)

        chebi_ontology = pronto.Ontology(cleaned_obo_file)

        # removed unused local `reltypes = set()`

        # iterate terms
        for term in chebi_ontology.terms():

            # numeric part of the ontology id, e.g. '12345' from 'CHEBI:12345'
            term_sid = (term.id).split(':')[1]
            ontology_id = term.id
            self.metabolites.add_node({
                'name': (term.name),
                'sid': term_sid,
                'ontology_id': ontology_id,
                'definition': term.definition,
                'alt_ids': list(term.alternate_ids)
            })

            # direct parents (IS_A)
            # NOTE(review): target uses the full ontology id (e.g. 'CHEBI:123')
            # while the source uses the stripped numeric sid — confirm this
            # asymmetry is intended before changing it.
            for parent in term.superclasses(distance=1, with_self=False):
                self.metabolite_isa_metabolite.add_relationship(
                    {'sid': term_sid}, {'sid': parent.id}, {})

            ## other named relationships
            try:
                for reltype, targets in term.relationships.items():

                    for target in targets:
                        self.metabolite_rel_metabolite.add_relationship(
                            {'sid': term_sid}, {'sid': target.id},
                            {'type': reltype.id})
            except KeyError as e:
                # fixed typo in log message ('relationshis')
                log.error(f"Cannot iterate relationships of term {term_sid}")
                log.error(e)

            # metabolite-MAPS-metabolite via HMDB cross references
            for xref in term.xrefs:
                if 'HMDB:' in xref.id:
                    hmdb_id = xref.id.strip().split('HMDB:')[1]
                    self.metabolite_maps_metabolite.add_relationship(
                        {'sid': term_sid}, {'sid': hmdb_id}, {})
Ejemplo n.º 29
0
def load_wpp_data(base_path, graph):
    """
    Load UN population data (2019 medium variant) into the graph.

    :param base_path: Path where file was downloaded.
    :param graph: Graph connection to merge the data into.
    """
    un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')
    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))

    country = NodeSet(['Country'], ['name'])
    age_group_nodes = NodeSet(['AgeGroup'], ['group'])
    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])

    # dedupe sets so each Country / AgeGroup node is added only once
    countries_added = set()
    age_groups_added = set()

    with open(un_wpp_csv_file, 'rt') as f:
        csv_file = csv.reader(f, delimiter=',', quotechar='"')
        # skip header
        next(csv_file)
        for row in csv_file:
            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
            loc_id = row[0]
            location = row[1]
            time = int(row[4])
            age_group = row[6]
            age_group_start = int(row[7])
            age_group_span = row[8]
            # populations are given in thousands
            pop_male = int(float((row[9])) * 1000)
            pop_female = int(float((row[10])) * 1000)
            pop_total = int(float((row[11])) * 1000)

            # only take 2019
            if time == 2019:
                if location not in countries_added:
                    country.add_node({'name': location, 'un_id': loc_id})
                    countries_added.add(location)
                if age_group not in age_groups_added:
                    age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
                    # BUGFIX: the dedupe set was never updated, so a duplicate
                    # AgeGroup node was added for every country.
                    age_groups_added.add(age_group)

                country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
                country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
                country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})

    log.info('Load data to Neo4j')
    country.merge(graph)
    age_group_nodes.merge(graph)
    country_total_group.merge(graph)
    country_male_group.merge(graph)
    country_female_group.merge(graph)
Ejemplo n.º 30
0
class EnsemblEntityParser(ReturnParser):
    """Parse the ENSEMBL GTF annotation into Gene/Transcript/Protein nodes
    and the CODES relationships between them."""

    def __init__(self):
        super(EnsemblEntityParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSets
        self.genes = NodeSet(['Gene'],
                             merge_keys=['sid'],
                             default_props={'source': 'ensembl'})
        self.transcripts = NodeSet(['Transcript'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'ensembl'})
        self.proteins = NodeSet(['Protein'],
                                merge_keys=['sid'],
                                default_props={'source': 'ensembl'})

        # RelationshipSets
        self.gene_codes_transcript = RelationshipSet(
            'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        """Parse the GTF file for *taxid*, deduplicating nodes and
        relationships while streaming the records.

        :param taxid: Taxonomy ID of the organism to parse.
        """
        ensembl_instance = self.get_instance_by_name('Ensembl')
        # removed unused local `datasource_name`

        # try patched path, if not available take flat
        ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid,
                                                          ensembl_instance,
                                                          patched=True)
        if not os.path.exists(ensembl_gtf_file_path):
            ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid,
                                                              ensembl_instance,
                                                              patched=False)

        annotation = GffReader(ensembl_gtf_file_path)

        # dedupe sets: the GTF repeats ids across records
        check_gene_ids = set()
        check_transcript_ids = set()
        check_protein_ids = set()
        check_gene_transcript_rels = set()
        check_transcript_protein_rels = set()
        log.info("Start parsing ENSEMBL gtf file, taxid {}, {}".format(
            taxid, ensembl_gtf_file_path))
        for r in annotation.records:

            # add gene node (every record carries a gene_id)
            gene_id = r.attributes['gene_id']
            if gene_id not in check_gene_ids:
                props = {
                    'sid': gene_id,
                    'name': r.attributes['gene_name'],
                    'taxid': taxid
                }

                self.genes.add_node(props)
                check_gene_ids.add(gene_id)

            # transcript node and Gene-CODES-Transcript
            # (merged the two previously duplicated `type == 'transcript'`
            # branches; they were mutually independent)
            if r.type == 'transcript':
                transcript_id = r.attributes['transcript_id']
                if transcript_id not in check_transcript_ids:
                    self.transcripts.add_node({'sid': transcript_id,
                                               'taxid': taxid})
                    check_transcript_ids.add(transcript_id)

                if gene_id + transcript_id not in check_gene_transcript_rels:
                    self.gene_codes_transcript.add_relationship(
                        {'sid': gene_id}, {'sid': transcript_id}, {})
                    check_gene_transcript_rels.add(gene_id + transcript_id)

            # protein node and Transcript-CODES-Protein
            # (merged the two previously duplicated `type == 'CDS'` branches)
            if r.type == 'CDS':
                protein_id = r.attributes['protein_id']
                transcript_id = r.attributes['transcript_id']
                if protein_id not in check_protein_ids:
                    self.proteins.add_node({'sid': protein_id,
                                            'taxid': taxid})
                    check_protein_ids.add(protein_id)

                if transcript_id + protein_id not in check_transcript_protein_rels:
                    self.transcript_codes_protein.add_relationship(
                        {'sid': transcript_id}, {'sid': protein_id}, {})
                    check_transcript_protein_rels.add(transcript_id +
                                                      protein_id)

        log.info("Finished parsing ENSEMBL gtf file.")