Ejemplo n.º 1
0
class MirtarbaseParser(ReturnParser):
    """Parser producing (Mirna)-[TARGETS]->(Gene) relationships from the Mirtarbase Excel file."""

    def __init__(self):
        """Create the parser and its single output RelationshipSet."""
        super(MirtarbaseParser, self).__init__()

        # RelationshipSets
        # start nodes are matched on 'name', end nodes on 'sid'
        self.mirna_targets_gene = RelationshipSet(
            'TARGETS', ['Mirna'], ['Gene'], ['name'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser; ``run`` takes no arguments."""
        self.run()

    def run(self):
        """Read the Excel file of the 'Mirtarbase' instance and add one relationship per row."""
        parser_name = self.__class__.__name__
        log.debug("Run {}".format(parser_name))

        instance = self.get_instance_by_name('Mirtarbase')
        excel_path = instance.get_file(FILE_NAME)

        table = pandas.read_excel(excel_path, index_col=None, header=0)
        # rename columns for easier access
        table.columns = [
            'mirtarbase_id',
            'mirna',
            'species_mirna',
            'target_genesymbol',
            'target_entrez',
            'species_target',
            'experiments',
            'support_type',
            'references',
        ]

        for record in table.itertuples():
            mirna_key = {'name': record.mirna.strip()}
            # the Entrez ID is stringified before stripping
            gene_key = {'sid': str(record.target_entrez).strip()}
            rel_props = {
                'experiments': record.experiments,
                'support_type': record.support_type,
                'references': record.references,
                'source': instance.datasource.name,
            }
            self.mirna_targets_gene.add_relationship(mirna_key, gene_key, rel_props)
Ejemplo n.º 2
0
class DummyParser(ReturnParser):
    """Example parser emitting Dummy nodes, Fummy nodes and KNOWS relationships between them."""

    def __init__(self):
        """Declare the ``taxid`` argument and create the output sets."""
        super(DummyParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
        self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])
        self.dummy_knows_fummy = RelationshipSet(
            'KNOWS', ['Dummy'], ['Fummy'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser with the mounted ``taxid`` attribute."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Create ten Fummy nodes and one Dummy node per line of 'file.txt'.

        Every Dummy node gets a KNOWS relationship to a randomly chosen
        Fummy sid in the range 0..9.

        :param taxid: Value stored as 'taxid' property on all nodes.
        """
        instance = self.get_instance_by_name('Dummy')
        path = instance.get_file('file.txt')

        # NOTE(review): this list is never read below
        target_sids = list(string.ascii_lowercase)

        # Fummy nodes
        fummy_count = 10
        for number in range(fummy_count):
            self.fummy_nodes.add_node({'sid': number, 'taxid': taxid})

        with open(path) as handle:
            for raw_line in handle:
                letter = raw_line.strip()
                self.dummy_nodes.add_node({'sid': letter, 'taxid': taxid})

                start_key = {'sid': letter}
                end_key = {'sid': randint(0, 9)}
                self.dummy_knows_fummy.add_relationship(
                    start_key, end_key, {'key': 'value'})
Ejemplo n.º 3
0
def read_daily_report_data_csv_JHU(file):
    """
    Extract data from a single daily report file from JHU.

    The first line of the file is treated as a header and skipped. Every
    following row yields a Country node, a Province node, a DailyReport node
    and the PART_OF / REPORTED relationships between them. Rows without a
    province name use ``'<country>_complete'`` as the province name.

    :param file: Path to the CSV file
    :return: Tuple ``(countries, provinces, updates, province_in_country,
        province_rep_update)`` holding the populated NodeSets and
        RelationshipSets.
    """
    log.info('Read JHU CSV file {}'.format(file))

    countries = NodeSet(['Country'], ['name'])
    provinces = NodeSet(['Province'], ['name'])
    updates = NodeSet(['DailyReport'], ['uuid'])
    province_in_country = RelationshipSet('PART_OF', ['Province'], ['Country'], ['name'], ['name'])
    province_in_country.unique = True
    province_rep_update = RelationshipSet('REPORTED', ['Province'], ['DailyReport'], ['name'], ['uuid'])

    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks
    with open(file, 'rt', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        # skip header
        next(rows)

        for row in rows:
            country = row[1]
            province = row[0]
            # if no name for province, use country name
            if not province:
                province = '{}_complete'.format(country)

            date = parse(row[2])
            # the report is identified by country, province and parsed date
            uuid = country + province + str(date)

            # empty count columns are stored as the string 'na'
            confirmed = int(row[3]) if row[3] else 'na'
            death = int(row[4]) if row[4] else 'na'
            recovered = int(row[5]) if row[5] else 'na'

            # coordinates are optional trailing columns
            lat = row[6] if len(row) >= 7 else None
            long = row[7] if len(row) >= 8 else None

            province_dict = {'name': province}
            if lat and long:
                province_dict['latitude'] = lat
                province_dict['longitude'] = long
            provinces.add_unique(province_dict)

            countries.add_unique({'name': country})

            updates.add_unique(
                {'date': date, 'confirmed': confirmed, 'death': death, 'recovered': recovered, 'uuid': uuid})

            province_in_country.add_relationship({'name': province}, {'name': country}, {'source': 'jhu'})
            province_rep_update.add_relationship({'name': province}, {'uuid': uuid}, {'source': 'jhu'})

    return countries, provinces, updates, province_in_country, province_rep_update
Ejemplo n.º 4
0
class DependingTestParser(ReturnParser):
    """Test parser that only produces FOO relationships from Source to Target."""

    def __init__(self):
        """Create the parser and its RelationshipSet."""
        super(DependingTestParser, self).__init__()
        self.rels = RelationshipSet(
            'FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        """Run the parser; ``run`` takes no arguments."""
        self.run()

    def run(self):
        """Add 100 relationships whose source_id and target_id are both the loop index."""
        relationship_count = 100
        for index in range(relationship_count):
            start_key = {'source_id': index}
            end_key = {'target_id': index}
            self.rels.add_relationship(start_key, end_key, {'source': 'test'})
Ejemplo n.º 5
0
class NcbiLegacyGeneParser(ReturnParser):
    """
    Parse legacy gene IDs from gene_history.gz

    Layout of the tab separated file:

    #tax_id GeneID  Discontinued_GeneID     Discontinued_Symbol     Discontinue_Date
    9       -       1246494 repA1   20031113
    9       -       1246495 repA2   20031113
    9       -       1246496 leuA    20031113
    """

    def __init__(self):
        """Declare the ``taxid`` argument and create the output sets."""
        super(NcbiLegacyGeneParser, self).__init__()

        self.arguments = ['taxid']

        ncbi_source = 'ncbigene'
        self.legacy_genes = NodeSet(
            ['Gene', 'Legacy'],
            merge_keys=['sid'],
            default_props={'source': ncbi_source})
        self.legacy_gene_now_gene = RelationshipSet(
            'REPLACED_BY', ['Gene', 'Legacy'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': ncbi_source})

    def run_with_mounted_arguments(self):
        """Run the parser with the mounted ``taxid`` attribute."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Collect discontinued genes of one taxonomy ID.

        Each matching line adds a legacy gene node; a REPLACED_BY relationship
        is added when the GeneID column is not '-'.

        :param taxid: Taxonomy ID, compared for equality with the first column
            of each line as read from the file (a string).
        """
        log.debug(f'Run parser {self.__class__.__name__} for taxID: {taxid}.')
        instance = self.get_instance_by_name('NcbiGene')
        history_path = instance.get_file('gene_history.gz')

        with gzip.open(history_path, 'rt') as handle:
            # skip header
            next(handle)
            for raw_line in handle:
                columns = raw_line.strip().split('\t')
                if columns[0] != taxid:
                    continue

                replacement_id = columns[1]
                old_id = columns[2]
                old_symbol = columns[3]
                discontinue_date = columns[4]

                legacy_props = {
                    'sid': old_id,
                    'date': discontinue_date,
                    'symbol': old_symbol,
                    'taxid': taxid
                }
                self.legacy_genes.add_node(legacy_props)

                # '-' in the GeneID column means there is no replacement ID
                if replacement_id == '-':
                    continue
                self.legacy_gene_now_gene.add_relationship(
                    {'sid': old_id}, {'sid': replacement_id}, {})
Ejemplo n.º 6
0
class GtexDataParser(ReturnParser):
    """Parser producing (Gene)-[EXPRESSED]->(GtexDetailedTissue) relationships from the GTEx gene median TPM file."""

    def __init__(self):
        """Create the parser, its RelationshipSet and register it with the container."""
        super(GtexDataParser, self).__init__()

        self.gene_expressed_tissue = RelationshipSet('EXPRESSED', ['Gene'],
                                                     ['GtexDetailedTissue'],
                                                     ['sid'], ['name'])

        self.object_sets = [self.gene_expressed_tissue]

        self.container.add_all(self.object_sets)

    def run_with_mounted_arguments(self):
        """Run the parser; ``run`` takes no arguments."""
        self.run()

    def run(self):
        """
        Add one EXPRESSED relationship per gene and tissue column.

        The first two lines of the file are skipped, the third line is the
        header. In every data line the first column holds the gene ID (the
        part after the first '.' is dropped), the second column is ignored
        and all remaining columns are values for the tissue named in the
        header column at the same position. Values are stored as the strings
        read from the file.
        """
        gtex_instance = self.get_instance_by_name('Gtex')

        gtex_mean_gene = gtex_instance.get_file(
            'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz')

        with gzip.open(gtex_mean_gene, 'rt') as f:
            # skip the first two lines
            next(f)
            next(f)
            # header line; drop the line break so the last tissue name is clean
            header_fields = next(f).rstrip('\n').split('\t')

            # iterate data lines
            for line in f:
                # drop the line break so the last value is clean
                flds = line.rstrip('\n').split('\t')
                gene_id = flds[0].split('.')[0]
                data_flds = flds[2:]

                # iterate the other elements with index
                # have the index start at 2 to match the header which also includes the first two columns
                for i, value in enumerate(data_flds, start=2):
                    tissue_detailed_name = header_fields[i]
                    self.gene_expressed_tissue.add_relationship(
                        {'sid': gene_id}, {'name': tissue_detailed_name},
                        {'val': value})
Ejemplo n.º 7
0
class SomeParser(ReturnParser):
    """Test parser producing Source nodes, Target nodes and FOO relationships pairing them."""

    def __init__(self):
        """Create the parser and its output sets."""
        super(SomeParser, self).__init__()

        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])
        self.rels = RelationshipSet(
            'FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        """Run the parser; ``run`` takes no arguments."""
        self.run()

    def run(self):
        """Add 100 Source/Target node pairs and a relationship between the nodes of each pair."""
        pair_count = 100
        for index in range(pair_count):
            self.source.add_node({'source_id': index})
            self.target.add_node({'target_id': index})

            # fresh dicts for the relationship, nothing is shared with the nodes
            start_key = {'source_id': index}
            end_key = {'target_id': index}
            self.rels.add_relationship(start_key, end_key, {'source': 'test'})
Ejemplo n.º 8
0
class HGNCParser(ReturnParser):
    """Parser producing Gene nodes and MAPS relationships from the HGNC complete set file."""

    def __init__(self):
        """Create the parser and its output sets."""
        super(HGNCParser, self).__init__()

        # output data
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])

        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'],
                                              ['sid'], ['sid'])
        self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'],
                                                    ['GeneSymbol'], ['sid'],
                                                    ['sid', 'taxid'])

    def run_with_mounted_arguments(self):
        """Run the parser; ``run`` takes no arguments."""
        self.run()

    def run(self):
        """Fetch 'hgnc_complete_set.txt' from the 'HGNC' instance and parse it."""
        hgnc_instance = self.get_instance_by_name('HGNC')

        hgnc_complete_file = hgnc_instance.get_file('hgnc_complete_set.txt')
        self.parse_hgnc_complete_file(hgnc_complete_file)

    def parse_hgnc_complete_file(self, hgnc_complete_file):
        """
        Parse the tab separated HGNC file.

        Every data line adds a Gene node carrying all columns as properties
        (keyed by the column names of the header line) plus 'sid' and
        'source'. MAPS relationships are added for the values in columns 18
        and 19 and for the gene symbol in column 1, when present.

        :param hgnc_complete_file: Path to the HGNC complete set file.
        """
        with open(hgnc_complete_file, 'rt') as f:
            # split the header into column names; zipping the raw header
            # string would pair single characters with the fields
            header = next(f).strip().split('\t')

            for l in f:
                flds = l.strip().split('\t')
                sid = flds[0]
                gene_symbol = flds[1]
                # stripping the line drops trailing empty columns, so the
                # later columns may be missing
                # NOTE(review): columns 18/19 are presumably the NCBI and
                # Ensembl gene IDs - confirm against the file header
                ncbi_id = flds[18] if len(flds) > 18 else None
                ensembl_id = flds[19] if len(flds) > 19 else None

                all_props = dict(zip(header, flds))
                all_props['sid'] = sid
                all_props['source'] = 'hgnc'

                self.genes.add_node(all_props)

                if ncbi_id:
                    self.gene_maps_gene.add_relationship({'sid': sid},
                                                         {'sid': ncbi_id},
                                                         {'source': 'hgnc'})
                if ensembl_id:
                    self.gene_maps_gene.add_relationship({'sid': sid},
                                                         {'sid': ensembl_id},
                                                         {'source': 'hgnc'})

                if gene_symbol:
                    self.gene_maps_genesymbol.add_relationship({'sid': sid}, {
                        'sid': gene_symbol,
                        'taxid': '9606'
                    }, {'source': 'hgnc'})
Ejemplo n.º 9
0
class MirdbParser(ReturnParser):
    """Parser producing (Mirna)-[TARGETS]->(Transcript) relationships from the Mirdb prediction file."""

    def __init__(self):
        """Declare the ``taxid`` argument and create the output RelationshipSet."""
        super(MirdbParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # RelationshipSets
        self.mirna_targets_transcript = RelationshipSet(
            'TARGETS', ['Mirna'], ['Transcript'], ['name'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser with the mounted ``taxid`` attribute."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Add one relationship per prediction line of the given organism.

        Lines are whitespace separated: miRNA name, target and score. Only
        lines whose miRNA name starts with the prefix looked up in
        ``TAXID_2_MIRPREFIX`` are used.

        :param taxid: Key into ``TAXID_2_MIRPREFIX``.
        """
        instance = self.get_instance_by_name('Mirdb')
        prediction_path = instance.datasource.get_prediction_file(instance)

        source_name = instance.datasource.name
        organism_prefix = TAXID_2_MIRPREFIX[taxid]

        with gzip.open(prediction_path, 'rt') as handle:
            for raw_line in handle:
                columns = raw_line.split()
                mirna_name = columns[0]

                # skip miRNAs of other organisms
                if not mirna_name.startswith(organism_prefix):
                    continue

                transcript_sid = columns[1]
                prediction_score = float(columns[2])

                rel_props = {
                    'score': prediction_score,
                    'source': source_name
                }
                self.mirna_targets_transcript.add_relationship(
                    {'name': mirna_name}, {'sid': transcript_sid}, rel_props)
Ejemplo n.º 10
0
class NcbiGeneOrthologParser(ReturnParser):
    """Parser producing (Gene)-[ORTHOLOG]->(Gene) relationships from gene_orthologs.gz."""

    def __init__(self):
        """Create the parser, its RelationshipSet and register it with the container."""
        super(NcbiGeneOrthologParser, self).__init__()

        self.gene_ortholog_gene = RelationshipSet(
            'ORTHOLOG', ['Gene'], ['Gene'], ['sid'], ['sid'])

        self.object_sets = [self.gene_ortholog_gene]
        self.container.add_all(self.object_sets)

    def run_with_mounted_arguments(self):
        """Run the parser; ``run`` takes no arguments."""
        self.run()

    def run(self):
        """
        Get the Gene-ORTHOLOG-Gene relationships. This is currently not filtered for taxid.

        The first line of the file is skipped; in every other line the
        whitespace separated columns 1 and 4 are used as start and end gene.
        """
        instance = self.get_instance_by_name('NcbiGene')
        ortholog_path = instance.get_file('gene_orthologs.gz')

        with gzip.open(ortholog_path, 'rt') as handle:
            # skip first line
            next(handle)

            for raw_line in handle:
                columns = raw_line.strip().split()
                first_gene = columns[1]
                second_gene = columns[4]

                start_key = {'sid': first_gene}
                end_key = {'sid': second_gene}
                self.gene_ortholog_gene.add_relationship(start_key, end_key, {})
Ejemplo n.º 11
0
def load_wpp_data(base_path, graph):
    """
    Load UN population data.

    Reads 'WPP2019_PopulationByAgeSex_Medium.csv' from ``base_path``, keeps
    only the rows for the year 2019 and merges Country nodes, AgeGroup nodes
    and the CURRENT_TOTAL / CURRENT_MALE / CURRENT_FEMALE relationships into
    the graph. Population values in the file are multiplied by 1000 and
    stored as integers.

    :param base_path: Path where file was downloaded.
    :param graph: Graph object passed to the ``merge`` method of all sets.
    """
    un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')
    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))

    country = NodeSet(['Country'], ['name'])
    age_group_nodes = NodeSet(['AgeGroup'], ['group'])
    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])

    countries_added = set()
    age_groups_added = set()

    # newline='' lets the csv module do its own newline handling
    with open(un_wpp_csv_file, 'rt', newline='') as f:
        csv_file = csv.reader(f, delimiter=',', quotechar='"')
        # skip header
        next(csv_file)
        for row in csv_file:
            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
            # only take 2019; check the year first so other rows are not parsed
            if int(row[4]) != 2019:
                continue

            loc_id = row[0]
            location = row[1]
            age_group = row[6]
            age_group_start = int(row[7])
            age_group_span = row[8]
            # round instead of truncating: float multiplication can land just
            # below the intended integer (1.001 * 1000 == 1000.9999999999999)
            pop_male = int(round(float(row[9]) * 1000))
            pop_female = int(round(float(row[10]) * 1000))
            pop_total = int(round(float(row[11]) * 1000))

            if location not in countries_added:
                country.add_node({'name': location, 'un_id': loc_id})
                countries_added.add(location)
            if age_group not in age_groups_added:
                age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
                # remember the group so it is added only once
                age_groups_added.add(age_group)

            country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
            country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
            country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})

    log.info('Load data to Neo4j')
    country.merge(graph)
    age_group_nodes.merge(graph)
    country_total_group.merge(graph)
    country_male_group.merge(graph)
    country_female_group.merge(graph)
Ejemplo n.º 12
0
class ChebiParser(ReturnParser):
    """Parser producing Metabolite nodes and IS_A / CHEBI_REL / MAPS relationships from chebi.obo."""

    def __init__(self):
        """Create the parser and its output sets."""
        super(ChebiParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'chebi'})
        self.metabolite_isa_metabolite = RelationshipSet(
            'IS_A', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_rel_metabolite = RelationshipSet(
            'CHEBI_REL', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_maps_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})

    @staticmethod
    def _sid_from_ontology_id(ontology_id):
        """Return the part of an ontology ID after the prefix, e.g. 'CHEBI:15377' -> '15377'."""
        return ontology_id.split(':')[1]

    def run_with_mounted_arguments(self):
        """Run the parser; ``run`` takes no arguments."""
        self.run()

    def run(self):
        """
        Parse the cleaned chebi.obo file with pronto.

        Every term becomes a Metabolite node whose 'sid' is the ontology ID
        without its prefix. Direct superclasses become IS_A relationships,
        other named relationships become CHEBI_REL relationships with the
        relationship ID as 'type', and xrefs containing 'HMDB:' become MAPS
        relationships. All relationship end points use the same prefix-free
        'sid' form as the nodes.
        """
        chebi_instance = self.get_instance_by_name('Chebi')

        obo_file = chebi_instance.get_file('chebi.obo')

        cleaned_obo_file = clean_obo_file(obo_file)

        chebi_ontology = pronto.Ontology(cleaned_obo_file)

        # iterate terms
        for term in chebi_ontology.terms():

            ontology_id = term.id
            term_sid = self._sid_from_ontology_id(ontology_id)
            self.metabolites.add_node({
                'name': term.name,
                'sid': term_sid,
                'ontology_id': ontology_id,
                'definition': term.definition,
                'alt_ids': list(term.alternate_ids)
            })

            for parent in term.superclasses(distance=1, with_self=False):
                # strip the prefix so the end node key matches the node 'sid'
                self.metabolite_isa_metabolite.add_relationship(
                    {'sid': term_sid},
                    {'sid': self._sid_from_ontology_id(parent.id)}, {})

            ## other named relationships
            try:
                for reltype, targets in term.relationships.items():

                    for target in targets:
                        # strip the prefix so the end node key matches the node 'sid'
                        self.metabolite_rel_metabolite.add_relationship(
                            {'sid': term_sid},
                            {'sid': self._sid_from_ontology_id(target.id)},
                            {'type': reltype.id})
            except KeyError as e:
                log.error(f"Cannot iterate relationshis of term {term_sid}")
                log.error(e)

            # metabolite-MAPS-metabolite
            for xref in term.xrefs:
                if 'HMDB:' in xref.id:
                    hmdb_id = xref.id.strip().split('HMDB:')[1]
                    self.metabolite_maps_metabolite.add_relationship(
                        {'sid': term_sid}, {'sid': hmdb_id}, {})
Ejemplo n.º 13
0
class EnsemblEntityParser(ReturnParser):
    """Parser producing Gene, Transcript and Protein nodes plus CODES relationships from an ENSEMBL gtf file."""

    def __init__(self):
        """Declare the ``taxid`` argument and create the output sets."""
        super(EnsemblEntityParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSets
        self.genes = NodeSet(['Gene'],
                             merge_keys=['sid'],
                             default_props={'source': 'ensembl'})
        self.transcripts = NodeSet(['Transcript'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'ensembl'})
        self.proteins = NodeSet(['Protein'],
                                merge_keys=['sid'],
                                default_props={'source': 'ensembl'})

        # RelationshipSets
        self.gene_codes_transcript = RelationshipSet(
            'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})

    def run_with_mounted_arguments(self):
        """Run the parser with the mounted ``taxid`` attribute."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Parse the gtf file of one organism.

        Every record contributes its gene. Records of type 'transcript'
        additionally contribute the transcript and the gene-transcript
        relationship; records of type 'CDS' contribute the protein and the
        transcript-protein relationship. Each node and relationship is added
        only once.

        :param taxid: Taxonomy ID used to locate the gtf file and stored as
            'taxid' property on all nodes.
        """
        ensembl_instance = self.get_instance_by_name('Ensembl')

        # try patched path, if not available take flat
        ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid,
                                                          ensembl_instance,
                                                          patched=True)
        if not os.path.exists(ensembl_gtf_file_path):
            ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid,
                                                              ensembl_instance,
                                                              patched=False)

        annotation = GffReader(ensembl_gtf_file_path)

        # IDs and ID pairs that were already added
        check_gene_ids = set()
        check_transcript_ids = set()
        check_protein_ids = set()
        check_gene_transcript_rels = set()
        check_transcript_protein_rels = set()
        log.info("Start parsing ENSEMBL gtf file, taxid {}, {}".format(
            taxid, ensembl_gtf_file_path))
        for r in annotation.records:

            # add gene node
            gene_id = r.attributes['gene_id']
            if gene_id not in check_gene_ids:
                self.genes.add_node({
                    'sid': gene_id,
                    'name': r.attributes['gene_name'],
                    'taxid': taxid
                })
                check_gene_ids.add(gene_id)

            if r.type == 'transcript':
                # add transcript node
                transcript_id = r.attributes['transcript_id']
                if transcript_id not in check_transcript_ids:
                    self.transcripts.add_node({'sid': transcript_id, 'taxid': taxid})
                    check_transcript_ids.add(transcript_id)

                # Gene-CODES-Transcript
                # a tuple key keeps both IDs apart; concatenated strings
                # could collide for different ID pairs
                gene_transcript = (gene_id, transcript_id)
                if gene_transcript not in check_gene_transcript_rels:
                    self.gene_codes_transcript.add_relationship(
                        {'sid': gene_id}, {'sid': transcript_id}, {})
                    check_gene_transcript_rels.add(gene_transcript)

            elif r.type == 'CDS':
                # add protein node
                protein_id = r.attributes['protein_id']
                if protein_id not in check_protein_ids:
                    self.proteins.add_node({'sid': protein_id, 'taxid': taxid})
                    check_protein_ids.add(protein_id)

                # Transcript-CODES-Protein
                transcript_id = r.attributes['transcript_id']
                transcript_protein = (transcript_id, protein_id)
                if transcript_protein not in check_transcript_protein_rels:
                    self.transcript_codes_protein.add_relationship(
                        {'sid': transcript_id}, {'sid': protein_id}, {})
                    check_transcript_protein_rels.add(transcript_protein)

        log.info("Finished parsing ENSEMBL gtf file.")
Ejemplo n.º 14
0
class NcbiGeneParser(ReturnParser):
    """Parser producing gene nodes, gene symbol nodes and SYNONYM / MAPS relationships from an NCBI gene_info file."""

    def __init__(self):
        """Declare the ``taxid`` argument and create the output sets."""
        super(NcbiGeneParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        # both gene IDs and GeneSymbols have the label 'Gene'
        # two different NodeSets are used because only the GeneSymbol nodes need taxid for uniqueness
        self.genes = NodeSet(['Gene'],
                             merge_keys=['sid'],
                             default_props={'source': 'ncbigene'})
        self.genesymbols = NodeSet(['Gene'],
                                   merge_keys=['sid', 'taxid'],
                                   default_props={
                                       'source': 'ncbigene',
                                       'type': 'symbol'
                                   })
        self.genesymbol_synonym_genesymbol = RelationshipSet(
            'SYNONYM', ['Gene'], ['Gene'], ['sid', 'taxid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})
        self.gene_maps_genesymbol = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})

    def run_with_mounted_arguments(self):
        """Run the parser with the mounted ``taxid`` attribute."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Select the gene_info file for the organism and parse it.

        :param taxid: Taxonomy ID; an organism specific file is used when the
            ID is a key of ``TAXID_SPECIFIC_GENEINFO``, otherwise 'gene_info.gz'.
        """
        log.info(f"Run {self.__class__.__name__}")
        ncbigene_instance = self.get_instance_by_name('NcbiGene')

        # get org specific gene_info file if available
        if taxid in TAXID_SPECIFIC_GENEINFO:
            gene_info_file = ncbigene_instance.get_file(
                TAXID_SPECIFIC_GENEINFO[taxid])
        else:
            gene_info_file = ncbigene_instance.get_file('gene_info.gz')

        log.info(gene_info_file)
        self.parse_gene_info(gene_info_file, taxid)

    @staticmethod
    def _parse_header_fields(header):
        """
        Return the column names of the gene_info header line as a tuple.

        Accounts for different formatting of the header line (leading
        "#Format: " or not).

        :raises AttributeError: If the header starts with neither '#Format:'
            nor '#tax'.
        """
        if header.startswith('#Format:'):
            # names are whitespace separated between the ':' and the first '('
            return tuple(header.split(':')[1].split('(')[0].strip().split())
        if header.startswith('#tax'):
            # tab separated names after the leading '#'
            return tuple(header[1:].strip().split('\t'))
        raise AttributeError(
            "File header was reformatted: {0}".format(header))

    def parse_gene_info(self, gene_info_file, taxid):
        """
        Parse a gzipped, tab separated gene_info file for one organism.

        Only lines whose first column equals ``taxid`` are used. The '-'
        placeholder in the symbol and synonym columns is never turned into a
        node or used as an end point of a relationship.

        :param gene_info_file: Path to the gzipped gene_info file.
        :param taxid: Taxonomy ID, compared for equality with the first column
            of each line as read from the file (a string).
        """
        # check sets
        check_ids = set()
        check_ids_symbols = set()

        with gzip.open(gene_info_file, 'rt') as f:

            header_fields = self._parse_header_fields(next(f))

            for l in f:

                flds = l.rstrip().split('\t')

                if flds[0] != taxid:
                    continue

                # (Gene)
                entrez_gene_id = flds[1]
                if entrez_gene_id not in check_ids:
                    props = {'sid': entrez_gene_id, 'taxid': taxid}
                    # update with all fields
                    props.update(zip(header_fields, flds))

                    check_ids.add(entrez_gene_id)
                    self.genes.add_node(props)

                # (GeneSymbol) and (GeneSymbol)-[SYNONYM]-(GeneSymbol)
                primary_symbol = flds[2]
                has_primary = primary_symbol != '-'
                # '-' marks a missing value, no symbol node exists for it
                synonym_symbols = [
                    symbol for symbol in flds[4].split('|') if symbol != '-'
                ]

                if has_primary:
                    # add primary symbol node
                    if primary_symbol not in check_ids_symbols:
                        check_ids_symbols.add(primary_symbol)
                        self.genesymbols.add_node({
                            'sid': primary_symbol,
                            'taxid': taxid
                        })

                    # (Gene)-[MAPS]-(GeneSymbol), primary
                    self.gene_maps_genesymbol.add_relationship(
                        {'sid': entrez_gene_id}, {
                            'sid': primary_symbol,
                            'taxid': taxid
                        }, {'status': 'primary'})

                for synonym in synonym_symbols:
                    # GeneSymbol-[SYNONYM]-GeneSymbol
                    if has_primary:
                        self.genesymbol_synonym_genesymbol.add_relationship(
                            {
                                'sid': synonym,
                                'taxid': taxid
                            }, {
                                'sid': primary_symbol,
                                'taxid': taxid
                            }, {})

                    if synonym not in check_ids_symbols:
                        check_ids_symbols.add(synonym)
                        self.genesymbols.add_node({
                            'sid': synonym,
                            'status': 'synonym',
                            'taxid': taxid
                        })

                    # (Gene)-[MAPS]-(GeneSymbol), synonym
                    self.gene_maps_genesymbol.add_relationship(
                        {'sid': entrez_gene_id}, {
                            'sid': synonym,
                            'taxid': taxid
                        }, {'status': 'synonym'})
Ejemplo n.º 15
0
class UniprotKnowledgebaseParser(ReturnParser):
    """
    Parse proteins and their cross-references from the Uniprot knowledge base.

    Uniprot has extensive mapping data to other data sources.

    Data is in the main Uniprot data file (referred to as Uniprot knowledge base).

    Ensembl:
        DR   Ensembl; ENST00000353703; ENSP00000300161; ENSG00000166913. [P31946-1]
        DR   Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]

    Refseq:
        DR   RefSeq; NP_006752.1; NM_006761.4. [P62258-1]

    The parser fills the following sets:

    - ``proteins``: one node per primary and per secondary accession
    - ``protein_primary_protein``: (primary)-[PRIMARY]-(secondary accession)
    - ``transcript_codes_protein``: transcript-protein relationships for both
      ENSEMBL and RefSeq transcripts
    - ``protein_maps_protein``: Uniprot accession to RefSeq/ENSEMBL protein ID
    """
    def __init__(self):
        """
        Declare the ``taxid`` argument and create the empty output sets.
        """
        super(UniprotKnowledgebaseParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSet
        self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'uniprot'})

        # RelationshipSet
        self.protein_primary_protein = RelationshipSet('PRIMARY', ['Protein'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'})
        self.transcript_codes_protein = RelationshipSet('CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'})
        self.protein_maps_protein = RelationshipSet('MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'})

    def run_with_mounted_arguments(self):
        """Run the parser with the ``taxid`` attribute set on this instance."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Read all knowledge base files for ``taxid`` and fill the output sets.

        Only records whose ``OS`` field contains the organism name registered
        for ``taxid`` in ``TAXID_OS_NAME`` are used.

        :param taxid: The taxid; must be a key of ``TAXID_OS_NAME``
        :raises KeyError: If ``taxid`` is not a key of ``TAXID_OS_NAME``
        """
        uniprot_instance = self.get_instance_by_name('Uniprot')

        knowledgebase_files = uniprot_instance.datasource.get_knowledgebase_files_for_taxid(taxid, uniprot_instance)
        datasource_name = uniprot_instance.datasource.name

        # get organism name from taxid
        os_string_id = TAXID_OS_NAME[taxid]

        # check sets to avoid duplicate nodes/relationships across records and files
        check_protein = set()
        check_p_p_p = set()
        check_t_c_p = set()
        check_p_m_p = set()

        # for now we always run on SPROT and TREMBL
        for kb_file in knowledgebase_files:
            log.debug(f"Parsing {kb_file}")
            with gzip.open(kb_file, 'rt') as f:
                up_parser = EMBLReaderUniProt(f)

                for record in up_parser.records:
                    # check taxon
                    if os_string_id not in record['OS']:
                        continue

                    # the first accession is the primary one, all others are secondary
                    acc_list = record['AC']
                    primary_acc = acc_list[0]
                    secondary_accs = acc_list[1:]

                    # (Protein)
                    # make primary protein with full data
                    desc = record['DE']
                    # the name is the text after 'Full=' in the first ';'-separated part
                    rec_name = desc.split(';')[0].split('Full=')[1]

                    if primary_acc not in check_protein:
                        self.proteins.add_node(
                            {'sid': primary_acc, 'name': rec_name, 'desc': desc, 'category': 'primary',
                             'taxid': taxid})
                        check_protein.add(primary_acc)

                    for secondary_acc in secondary_accs:
                        if secondary_acc not in check_protein:
                            self.proteins.add_node(
                                {'sid': secondary_acc, 'category': 'secondary',
                                 'taxid': taxid})
                            check_protein.add(secondary_acc)

                        # (Protein)-[PRIMARY]-(Protein)
                        # the pair is stored unordered so it is added only once
                        pair = frozenset([primary_acc, secondary_acc])
                        if pair not in check_p_p_p:
                            # the end node is the single secondary accession,
                            # not the whole list of secondary accessions
                            self.protein_primary_protein.add_relationship(
                                {'sid': primary_acc}, {'sid': secondary_acc}, {}
                            )
                            check_p_p_p.add(pair)

                    # (Transcript)-[CODES]-(Protein)
                    # (Protein)-[MAPS]-(Protein)
                    ## RefSeq
                    # ('RefSeq', ['NP_003395']),
                    refseq_mappings = [x[1] for x in record['DR'] if x[0] == 'RefSeq']
                    for refseq_ids in refseq_mappings:
                        for refseq_id in refseq_ids:
                            # remove version from refseq ID
                            refseq_id = refseq_id.split('.')[0]
                            # the second letter tells transcript (NM/NR/XM/XR) from protein (NP/XP)
                            second_letter = refseq_id[1]

                            # (Transcript)-[CODES]-(Protein)
                            if second_letter in ('M', 'R'):
                                for uniprot_acc in acc_list:
                                    key = (refseq_id, uniprot_acc)
                                    if key not in check_t_c_p:
                                        self.transcript_codes_protein.add_relationship(
                                            {'sid': refseq_id}, {'sid': uniprot_acc},
                                            {'source': datasource_name}
                                        )
                                        check_t_c_p.add(key)

                            # (Protein)-[MAPS]-(Protein)
                            if second_letter == 'P':
                                for uniprot_acc in acc_list:
                                    key = (uniprot_acc, refseq_id)
                                    if key not in check_p_m_p:
                                        self.protein_maps_protein.add_relationship(
                                            {'sid': uniprot_acc}, {'sid': refseq_id},
                                            {}
                                        )
                                        check_p_m_p.add(key)

                    ## ensembl
                    ensembl_mappings = [x[1] for x in record['DR'] if x[0] == 'Ensembl']
                    for ensembl_ids in ensembl_mappings:
                        # first entry is used as transcript ID, second as protein ID
                        ensembl_transcript_id = ensembl_ids[0]
                        ensembl_protein_id = ensembl_ids[1]

                        for uniprot_acc in acc_list:
                            # (Transcript)-[CODES]-(Protein)
                            key = (ensembl_transcript_id, uniprot_acc)
                            if key not in check_t_c_p:
                                self.transcript_codes_protein.add_relationship(
                                    {'sid': ensembl_transcript_id}, {'sid': uniprot_acc},
                                    {}
                                )
                                check_t_c_p.add(key)

                            # (Protein)-[MAPS]-(Protein)
                            key = (uniprot_acc, ensembl_protein_id)
                            if key not in check_p_m_p:
                                self.protein_maps_protein.add_relationship(
                                    {'sid': uniprot_acc}, {'sid': ensembl_protein_id},
                                    {}
                                )
                                check_p_m_p.add(key)
Ejemplo n.º 16
0
class RefseqCodesParser(ReturnParser):
    """
    Get mappings from NCBI Gene to Refseq transcripts and proteins.

    Refseq provides a mapping file that contains a gene-transcript-protein
    mapping per line: release86.accession2geneid.gz

    Example line: TaxID, NCBI Gene ID, RefSeq transcript ID, RefSeq protein ID

        9606    100008586       NM_001098405.2  NP_001091875.1

    The parser fills ``gene_codes_transcript`` with (Gene)-[CODES]-(Transcript)
    and ``transcript_codes_protein`` with (Transcript)-[CODES]-(Protein)
    relationships. Versions are removed from the RefSeq IDs.
    """

    def __init__(self):
        """
        Declare the ``taxid`` argument and create the empty RelationshipSets.
        """
        super(RefseqCodesParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # define RelationshipSets
        self.gene_codes_transcript = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.transcript_codes_protein = RelationshipSet('CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'refseq'})

    def run_with_mounted_arguments(self):
        """Run the parser with the ``taxid`` attribute set on this instance."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Read the accession2geneid file and collect the relationships for ``taxid``.

        :param taxid: TaxID; compared as-is to the first column of the file,
            which is read as text
        """
        refseq_instance = self.get_instance_by_name('Refseq')

        refseq_accession2geneid_file = refseq_instance.datasource.get_accession2geneid_file_path(refseq_instance)

        # check sets to avoid duplicates
        check_g_t_rels = set()
        check_t_p_rels = set()

        with gzip.open(refseq_accession2geneid_file, 'rt') as f:
            for line in f:
                flds = line.strip().split('\t')

                # filter on the taxid first: the other columns are only read for
                # matching lines, so blank or short lines of other organisms
                # do not raise an IndexError
                if flds[0] != taxid:
                    continue

                gene_id = flds[1]
                transcript_id = flds[2].split('.')[0]
                protein_id = flds[3].split('.')[0]

                # gene-transcript or transcript-protein pairs can be duplicate
                # if e.g. a gene has one transcript which gives rise to two proteins
                # we thus check for each pair if it was added already
                gene_transcript = (gene_id, transcript_id)
                if gene_transcript not in check_g_t_rels:
                    self.gene_codes_transcript.add_relationship(
                        {'sid': gene_id}, {'sid': transcript_id},
                        {'taxid': taxid}
                    )
                    check_g_t_rels.add(gene_transcript)

                # the gene/transcript relationship is mostly clear
                # but often there are no proteins associated
                if protein_id == 'na':
                    continue

                transcript_protein = (transcript_id, protein_id)
                if transcript_protein not in check_t_p_rels:
                    self.transcript_codes_protein.add_relationship(
                        {'sid': transcript_id},
                        {'sid': protein_id},
                        {'taxid': taxid}
                    )
                    check_t_p_rels.add(transcript_protein)
Ejemplo n.º 17
0
class RefseqRemovedRecordsParser(ReturnParser):
    """
    Parse all removed records from the removed_records files.

    The mappings to gene IDs are in the accession2geneid of the *previous* release.

    Simple approach is to collect all files from all previous releases and collect all records (relationships
    are filtered locally).
    """

    # accession prefixes used to tell transcripts from proteins
    _TRANSCRIPT_PREFIXES = ('NM', 'NR', 'XM', 'XR')
    _PROTEIN_PREFIXES = ('NP', 'XP')

    def __init__(self):
        """
        Declare the ``taxid`` argument and create the empty output sets.
        """
        super(RefseqRemovedRecordsParser, self).__init__()

        self.arguments = ['taxid']

        # accessions (without version) of all legacy transcripts and proteins seen so far
        self.legacy_ids = set()

        self.legacy_transcripts = NodeSet(['Transcript', 'Legacy'], merge_keys=['sid'], default_props={'source': 'refseq'})
        self.legacy_transcript_now_transcript = RelationshipSet('REPLACED_BY', ['Transcript'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.legacy_proteins = NodeSet(['Protein', 'Legacy'], merge_keys=['sid'], default_props={'source': 'refseq'})
        self.legacy_protein_now_protein = RelationshipSet('REPLACED_BY', ['Protein'], ['Protein'],
                                                                ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.gene_codes_legacy_transcript = RelationshipSet('CODES', ['Gene'], ['Transcript', 'Legacy'], ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.legacy_transcript_codes_protein = RelationshipSet('CODES', ['Transcript', 'Legacy'], ['Protein'],
                                                               ['sid'], ['sid'], default_props={'source': 'refseq'})

    def run_with_mounted_arguments(self):
        """Run the parser with the ``taxid`` attribute set on this instance."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Collect the legacy nodes, then their relationships to genes and proteins.

        :param taxid: TaxID; compared as-is to the first column of the files
        """
        # get the nodes first, this also creates a set of all legacy IDs
        self.get_legacy_nodes(taxid)
        # then get the relationships to gene IDs, this only keeps lines whose transcript is in the set of legacy IDs
        self.get_legacy_gene_rels(taxid)

    def _add_legacy_record(self, node_set, replaced_by_set, refseq_acc, version, release, reason, taxid):
        """
        Add one removed record to ``node_set`` unless its accession is already known.

        If ``reason`` contains 'replaced by', a relationship from the removed
        accession to the new accession (last word of ``reason``, version
        removed) is added to ``replaced_by_set``.
        """
        if refseq_acc in self.legacy_ids:
            return

        node_set.add_node(
            {'sid': refseq_acc, 'version': version,
             'status': 'removed', 'removed_in': release, 'reason': reason,
             'taxid': taxid}
        )
        self.legacy_ids.add(refseq_acc)

        if 'replaced by' in reason:
            # replaced by NM_022375 -> NM_022375
            new_accession = reason.rsplit(' ', 1)[1].split('.')[0]
            replaced_by_set.add_relationship(
                {'sid': refseq_acc}, {'sid': new_accession}, {}
            )

    def get_legacy_nodes(self, taxid):
        """
        Create the legacy transcript/protein nodes and their REPLACED_BY relationships.

        ==========================================
        release#.removed-records
        ==========================================
        Content: Tab-delimited report of records that were included in the previous
        release but are not included in the current release.

        Columns:
         1. taxonomy ID
         2. species name
         3. accession.version
         4. refseq release directory accession is included in
              complete + other directories
              '|' delimited
         5. refseq status
              na - not available; status codes are not applied to most genomic records
              INFERRED
              PREDICTED
              PROVISIONAL
              VALIDATED
              REVIEWED
              MODEL
              UNKNOWN - status code not provided; however usually is provided for
                        this type of record
         6. length
         7. removed status
              dead protein: protein was removed when genomic record was reloaded
                            and protein  was not found on the nucleotide update.
                            This is an implied permanent suppress.

              temporarily suppressed: record was temporarily removed and may be
                                      restored at a later date.

              permanently suppressed: record was permanently removed. It is possible
                                      to restore this type of record however at the
                                      time of removal that action is not anticipated.

              replaced by accession:  the accession in column 3 has become a secondary
                                      accession that cited in column 8.

        :param taxid: TaxID; only lines with this value in column 1 are used
        :return: None, the results are stored in the sets of this parser
        """
        refseq_instance = self.get_instance_by_name('Refseq')

        removed_records_files = refseq_instance.find_files(lambda x: 'removed-records' in x and x.endswith('.gz'))

        for file_path in removed_records_files:
            log.debug(f"Parse {file_path}")
            # release number taken from the file name: release<number>.removed-records...
            release = file_path.split('/')[-1].split('.')[0].replace('release', '')
            with gzip.open(file_path, 'rt') as f:
                for line in f:
                    flds = line.strip().split('\t')

                    if flds[0] != taxid:
                        continue

                    refseq_acc, version = flds[2].split('.')
                    # the last column is used as the reason for removal
                    reason = flds[-1]

                    if refseq_acc.startswith(self._TRANSCRIPT_PREFIXES):
                        self._add_legacy_record(
                            self.legacy_transcripts, self.legacy_transcript_now_transcript,
                            refseq_acc, version, release, reason, taxid
                        )
                    elif refseq_acc.startswith(self._PROTEIN_PREFIXES):
                        self._add_legacy_record(
                            self.legacy_proteins, self.legacy_protein_now_protein,
                            refseq_acc, version, release, reason, taxid
                        )

    def get_legacy_gene_rels(self, taxid):
        """
        Get the gene/protein relationships for the legacy Transcripts.

        ==========================================
        release#.accession2geneid
        ==========================================
        Content: Report of GeneIDs available at the time of the RefSeq release.
        Limited to GeneIDs that are associated with RNA or mRNA records with
        accession prefix N[M|R] and X[M|R].

        Columns (tab delimited):

            1: Taxonomic ID
            2: Entrez GeneID
            3: Transcript accession.version
            4: Protein accession.version
               na if no data
               --for example, the NR_ accession prefix is used for RNA
                 so there is no corresponding protein record

        :param taxid: TaxID; only lines with this value in column 1 are used
        :return: None, the results are stored in the sets of this parser
        """
        log.debug("Get relationships from legacy RefSeq IDs to genes.")
        refseq_instance = self.get_instance_by_name('Refseq')

        archived_accession2geneid = refseq_instance.find_files(lambda x: 'accession2geneid' in x and x.endswith('.gz'))

        # check sets to avoid duplicates across the archived files
        check_gene_transcript = set()
        check_transcript_protein = set()

        for file_path in archived_accession2geneid:
            log.debug(f"Parse {file_path}")

            with gzip.open(file_path, 'rt') as f:
                for line in f:
                    flds = line.strip().split('\t')
                    if flds[0] != taxid:
                        continue

                    gene_id = flds[1].strip()
                    transcript_accession = flds[2].strip().split('.')[0]
                    protein_accession = flds[3].strip().split('.')[0]

                    # only transcripts that were found in the removed-records files
                    if transcript_accession not in self.legacy_ids:
                        continue

                    gene_transcript = (gene_id, transcript_accession)
                    if gene_transcript not in check_gene_transcript:
                        self.gene_codes_legacy_transcript.add_relationship(
                            {'sid': gene_id}, {'sid': transcript_accession}, {}
                        )
                        check_gene_transcript.add(gene_transcript)

                    # 'na' in the protein column means there is no protein record,
                    # so there is nothing to link the transcript to
                    if protein_accession == 'na':
                        continue

                    transcript_protein = (transcript_accession, protein_accession)
                    if transcript_protein not in check_transcript_protein:
                        self.legacy_transcript_codes_protein.add_relationship(
                            {'sid': transcript_accession}, {'sid': protein_accession}, {}
                        )
                        check_transcript_protein.add(transcript_protein)
Ejemplo n.º 18
0
class GeneOntologyAssociationParser(ReturnParser):
    """
    Parse GeneOntology Associations from the official UniProt association files.

    There are three different files available:

    - goa_uniprot_all.gaf.gz
    - goa_uniprot_all.gpa.gz
    - goa_uniprot_all.gpi.gz

    The GPA (Gene Product Association) file contains one gene product - GO Term tuple per line. There is additional
    information about the gene products in the GPI (Gene Product Information) file which augments the GPA file.

    The GAF file merges GPA and GPI (by adding the gene product information to each line with an association)
    and thus contains a lot of redundant information on the gene product.

    The GAF file is parsed for the mappings because it contains the mapping as well as the taxonomy ID. It is
    easier to iterate one file instead of generating a gene product - taxonomy ID mapping from the GPI file and
    then read the GPA file.

    More information from the header of the GPA file:

    !This file contains all GO annotations for proteins in the UniProt KnowledgeBase (UniProtKB).
    !
    !It also contains all annotations for protein complexes, identified by ComplexPortal identifiers,
    !and for non-coding RNAs, identified by RNAcentral identifiers
    !
    !Columns:
    !
    !   name                  required? cardinality   GAF column #
    !   DB                    required  1             1
    !   DB_Object_ID          required  1             2 / 17
    !   Qualifier             required  1 or greater  4
    !   GO ID                 required  1             5
    !   DB:Reference(s)       required  1 or greater  6
    !   ECO evidence code     required  1             7 (GO evidence code)
    !   With                  optional  0 or greater  8
    !   Interacting taxon ID  optional  0 or 1        13
    !   Date                  required  1             14
    !   Assigned_by           required  1             15
    !   Annotation Extension  optional  0 or greater  16
    !   Annotation Properties optional  0 or 1        n/a

    And from the header of the GPI file:

    !This file contains additional information for proteins in the UniProt KnowledgeBase (UniProtKB).
    !Protein accessions are represented in this file even if there is no associated GO annotation.
    !
    !Columns:
    !
    !   name                   required? cardinality   GAF column #  Example content
    !   DB                     required  1             1             UniProtKB
    !   DB_Object_ID           required  1             2/17          Q4VCS5-1
    !   DB_Object_Symbol       required  1             3             AMOT
    !   DB_Object_Name         optional  0 or greater  10            Angiomotin
    !   DB_Object_Synonym(s)   optional  0 or greater  11            AMOT|KIAA1071
    !   DB_Object_Type         required  1             12            protein
    !   Taxon                  required  1             13            taxon:9606
    !   Parent_Object_ID       optional  0 or 1        -             UniProtKB:Q4VCS5
    !   DB_Xref(s)             optional  0 or greater  -             WB:WBGene00000035
    !   Properties             optional  0 or greater  -             db_subset=Swiss-Prot|target_set=KRUK,BHFL

    """
    def __init__(self):
        """
        Declare the ``taxid`` argument and create the empty RelationshipSet.
        """
        super(GeneOntologyAssociationParser, self).__init__()

        self.arguments = ['taxid']

        # RelationshipSets
        self.protein_associates_goterm = RelationshipSet(
            'ASSOCIATION', ['Protein'], ['Term'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser with the ``taxid`` attribute set on this instance."""
        self.run(self.taxid)

    def run(self, ref_taxid):
        """
        Select the GAF file for ``ref_taxid`` and parse it.

        The organism specific file is used if ``ref_taxid`` is a key of
        ``TAXID_2_ORG_FILE_NAME``, otherwise the file with all organisms.

        :param ref_taxid: TaxID; compared as-is to the taxon ID read from the file
        """
        go_instance = self.get_instance_by_name('GeneOntology')
        log.debug("Run for {}".format(ref_taxid))
        if ref_taxid in TAXID_2_ORG_FILE_NAME:
            goa_uniprot_gaf_file_name = 'goa_{0}.gaf.gz'.format(
                TAXID_2_ORG_FILE_NAME[ref_taxid])
            goa_uniprot_gaf_file = go_instance.get_file(
                goa_uniprot_gaf_file_name)
        else:
            goa_uniprot_gaf_file = go_instance.get_file(
                'goa_uniprot_all.gaf.gz')

        self.parse_goa_uniprot_gaf_file(goa_uniprot_gaf_file, ref_taxid)

    def parse_goa_uniprot_gaf_file(self, goa_uniprot_gaf_file, ref_taxid):
        """
        Add one (Protein)-[ASSOCIATION]-(Term) relationship per matching GAF line.

        A line matches if its DB column is 'UniProtKB' and the taxon of the gene
        product equals ``ref_taxid``. Header lines (starting with '!') and lines
        without a parsable taxon column are skipped.

        :param goa_uniprot_gaf_file: Path to the gzipped GAF file
        :param ref_taxid: TaxID to filter for
        """
        with gzip.open(goa_uniprot_gaf_file, 'rt') as f:
            for line in f:
                if line.startswith('!'):
                    continue

                flds = line.strip().split('\t')
                db = flds[0]

                try:
                    # the taxon column is 'taxon:9606' or, with an interacting
                    # organism, 'taxon:9606|taxon:11676'; the first entry is
                    # the organism of the gene product
                    taxid = flds[12].split('|')[0].split(':')[1]
                except IndexError:
                    continue

                if taxid != ref_taxid or db != 'UniProtKB':
                    continue

                db_id = flds[1]
                qualifier = flds[3]
                go_id = flds[4]
                evidence = flds[6]

                rel_properties = {'evidence': evidence}
                # the qualifier is only stored if the column is not empty
                if qualifier:
                    rel_properties['qualifier'] = qualifier

                self.protein_associates_goterm.add_relationship(
                    {'sid': db_id}, {'sid': go_id}, rel_properties)
Ejemplo n.º 19
0
class GtexMetadataParser(ReturnParser):
    """
    Parse the GTEx sample attribute file into samples, tissues and detailed
    tissues together with the relationships between them.
    """

    def __init__(self):
        """
        Create the empty NodeSets and RelationshipSets filled by ``run``.
        """
        super(GtexMetadataParser, self).__init__()

        # NodeSets
        self.tissues = NodeSet(['GtexTissue'], merge_keys=['name'])
        self.detailed_tissues = NodeSet(['GtexDetailedTissue'], merge_keys=['name'])
        self.sample = NodeSet(['GtexSample'], merge_keys=['sid'])

        # RelationshipSets
        self.sample_measures_tissue = RelationshipSet(
            'MEASURES', ['GtexSample'], ['GtexTissue'], ['sid'], ['name'])
        self.sample_measures_detailed_tissue = RelationshipSet(
            'MEASURES', ['GtexSample'], ['GtexDetailedTissue'], ['sid'], ['name'])
        self.tissue_parent_detailed_tissue = RelationshipSet(
            'PARENT', ['GtexTissue'], ['GtexDetailedTissue'], ['name'], ['name'])
        self.tissue_parent_detailed_tissue.unique = True

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no arguments."""
        self.run()

    def run(self):
        """
        Read the sample attribute file and add one sample node per row, the
        tissue and detailed tissue of the row, and the three relationships
        connecting them.
        """
        # columns copied one-to-one from each row into the sample node
        sample_columns = (
            'SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'SMRIN', 'SMTS', 'SMTSD',
            'SMUBRID', 'SMTSISCH', 'SMTSPAX', 'SMNABTCH', 'SMNABTCHT',
            'SMNABTCHD', 'SMGEBTCH', 'SMGEBTCHD', 'SMGEBTCHT', 'SMAFRZE',
            'SMGTC', 'SME2MPRT', 'SMCHMPRS', 'SMNTRART', 'SMNUMGPS',
            'SMMAPRT', 'SMEXNCRT', 'SM550NRM', 'SMGNSDTC', 'SMUNMPRT',
            'SM350NRM', 'SMRDLGTH', 'SMMNCPB', 'SME1MMRT', 'SMSFLGTH',
            'SMESTLBS', 'SMMPPD', 'SMNTERRT', 'SMRRNANM', 'SMRDTTL',
            'SMVQCFL', 'SMMNCV', 'SMTRSCPT', 'SMMPPDPR', 'SMCGLGTH',
            'SMGAPPCT', 'SMUNPDRD', 'SMNTRNRT', 'SMMPUNRT', 'SMEXPEFF',
            'SMMPPDUN', 'SME2MMRT', 'SME2ANTI', 'SMALTALG', 'SME2SNSE',
            'SMMFLGTH', 'SME1ANTI', 'SMSPLTRD', 'SMBSMMRT', 'SME1SNSE',
            'SME1PCTS', 'SMRRNART', 'SME1MPRT', 'SMNUM5CD', 'SMDPMPRT',
            'SME2PCTS',
        )

        gtex_instance = self.get_instance_by_name('Gtex')
        attribute_file = gtex_instance.get_file(
            'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt')

        samples_df = pandas.read_csv(attribute_file, sep='\t', header=0,
                                     index_col=False, encoding="utf-8-sig")

        for record in samples_df.itertuples():
            sample_id = record.SAMPID
            tissue = record.SMTS
            detailed_tissue = record.SMTSD

            # 'sid' first, then every attribute column in file-independent fixed order
            sample_props = {'sid': sample_id}
            for column in sample_columns:
                sample_props[column] = getattr(record, column)

            # nodes
            self.sample.add_node(sample_props)
            self.tissues.add_unique({'name': tissue})
            self.detailed_tissues.add_unique({'name': detailed_tissue})

            # relationships
            sample_key = {'sid': sample_id}
            self.sample_measures_tissue.add_relationship(
                sample_key, {'name': tissue}, {})
            self.sample_measures_detailed_tissue.add_relationship(
                {'sid': sample_id}, {'name': detailed_tissue}, {})
            self.tissue_parent_detailed_tissue.add_relationship(
                {'name': tissue}, {'name': detailed_tissue}, {})
Ejemplo n.º 20
0
        # create nodes and relationships
        # add NCBI gene node
        ncbi_gene_nodes.add_node({'gene_id': ncbi_gene_id, 'db': 'ncbi'})
        # add ENSEMBL gene nodes if they not exist already
        for ensembl_gene_id in mapped_ensembl_gene_ids:
            if ensembl_gene_id not in ensembl_gene_ids_added:
                ensembl_gene_nodes.add_node({
                    'gene_id': ensembl_gene_id,
                    'db': 'ensembl'
                })
                ensembl_gene_ids_added.add(ensembl_gene_id)

        # add (:Gene)-[:MAPS]->(:Gene) relationship
        for ensembl_gene_id in mapped_ensembl_gene_ids:
            gene_mapping_rels.add_relationship({'gene_id': ncbi_gene_id},
                                               {'gene_id': ensembl_gene_id},
                                               {'db': 'ncbi'})

# report how many nodes and relationships were collected
print(len(ncbi_gene_nodes.nodes))
print(len(ensembl_gene_nodes.nodes))
print(len(gene_mapping_rels.relationships))

# create index for property 'gene_id' on (Gene) nodes first
print('Create index on Gene nodes')
try:
    graph.schema.create_index('Gene', 'gene_id')
except py2neo.database.ClientError:
    # NOTE(review): the error is ignored on purpose, presumably because the
    # index already exists when the script is run again -- confirm
    pass

# load data, first nodes then relationships
Ejemplo n.º 21
0
def read_daily_report_data_csv_JHU(file):
    """
    Extract data from a single daily report file from JHU.

    Old format (until 03-21-2020)
        Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
    New format:
        FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key

    The format is detected from the header: more than 8 columns means 'new',
    otherwise 'old'. Each data row is converted by the matching row parser
    (`parse_jhu_new_file_row` / `parse_jhu_old_file_row`).

    :param file: Path to the CSV file
    :return: Tuple of (countries, provinces, updates, province_in_country,
        province_rep_update): three NodeSets followed by two RelationshipSets.
    :raises StopIteration: If the file is empty (no header line).
    """
    log.info('Read JHU CSV file {}'.format(file))

    countries = NodeSet(['Country'], ['name'])
    provinces = NodeSet(['Province'], ['name'])
    updates = NodeSet(['DailyReport'], ['uuid'])
    province_in_country = RelationshipSet('PART_OF', ['Province'], ['Country'],
                                          ['name'], ['name'])
    province_in_country.unique = True
    province_rep_update = RelationshipSet('REPORTED', ['Province'],
                                          ['DailyReport'], ['name'], ['uuid'])

    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation requires for files passed to csv.reader.
    with open(file, 'rt', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')

        # The header is only used to tell the two file layouts apart.
        header = next(rows)
        file_type = 'new' if len(header) > 8 else 'old'
        log.info("File type: {}".format(file_type))

        parse_row = parse_jhu_new_file_row if file_type == 'new' else parse_jhu_old_file_row

        for row in rows:
            # csv.reader yields an empty list for blank lines; nothing to parse.
            if not row:
                continue

            country, province, date, confirmed, death, recovered, lat, lon = parse_row(row)

            province_dict = {'name': province}
            # Coordinates are only stored when both values are present.
            if lat and lon:
                province_dict['latitude'] = lat
                province_dict['longitude'] = lon

            # Identifier of one report: country, province and date concatenated.
            report_uuid = country + province + str(date)

            provinces.add_unique(province_dict)
            countries.add_unique({'name': country})
            updates.add_unique({
                'date': date,
                'confirmed': confirmed,
                'death': death,
                'recovered': recovered,
                'uuid': report_uuid
            })

            province_in_country.add_relationship({'name': province},
                                                 {'name': country},
                                                 {'source': 'jhu'})
            province_rep_update.add_relationship({'name': province},
                                                 {'uuid': report_uuid},
                                                 {'source': 'jhu'})

    return countries, provinces, updates, province_in_country, province_rep_update
Ejemplo n.º 22
0
class LncipediaParser(ReturnParser):
    """
    Parser for the Lncipedia GFF annotation file.

    Only 'lnc_RNA' records are processed. They carry the gene and transcript
    IDs as well as alias attributes (`gene_alias_N`, `transcript_alias_N`).
    'exon' records reuse the gene/transcript IDs of their parent 'lnc_RNA'
    record and are ignored.

    Example records (attribute column shortened):

    chr16	lncipedia.org	lnc_RNA	52005479	52026435	.	-	.	ID=lnc-TOX3-1:20;gene_id=lnc-TOX3-1;transcript_id=lnc-TOX3-1:20;gene_alias_1=XLOC_011939;gene_alias_2=linc-SALL1-6;transcript_alias_1=TCONS_00025002;transcript_alias_2=NONHSAT142490;
    chr10	lncipedia.org	exon	8052243	8052735	.	-	.	Parent=GATA3-AS1:5;gene_id=GATA3-AS1;transcript_id=GATA3-AS1:5;gene_alias_1=XLOC_008724;...;transcript_alias_8=NR_104327.1;
    """

    def __init__(self):
        super().__init__()

        # output node sets
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])
        self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'])

        # output relationship sets
        self.gene_codes_transcripts = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'])
        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        self.transcript_maps_transcript = RelationshipSet('MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no mounted arguments."""
        self.run()

    def run(self):
        """
        Read the GFF file and fill the node and relationship sets.

        Every gene ID, transcript ID and (unordered) ID pair is emitted once;
        a single set keeps track of everything that was already added.
        """
        log.debug(f"Run {self.__class__.__name__}")
        lncipedia_instance = self.get_instance_by_name('Lncipedia')
        source_name = lncipedia_instance.datasource.name
        annotation = GffReader(lncipedia_instance.get_file('lncipedia_5_2_hg38.gff'))

        # holds plain IDs (nodes) and frozensets of two IDs (relationships)
        seen = set()

        for record in annotation.records:
            if record.type != 'lnc_RNA':
                continue

            attributes = record.attributes

            # (Gene) node
            gene_id = attributes['gene_id']
            if gene_id not in seen:
                self.genes.add_node({'sid': gene_id, 'source': source_name})
                seen.add(gene_id)

            # (Transcript) node
            transcript_id = attributes['transcript_id']
            if transcript_id not in seen:
                self.transcripts.add_node({'sid': transcript_id, 'source': source_name})
                seen.add(transcript_id)

            # (Gene)-[CODES]->(Transcript)
            codes_pair = frozenset((gene_id, transcript_id))
            if codes_pair not in seen:
                self.gene_codes_transcripts.add_relationship(
                    {'sid': gene_id}, {'sid': transcript_id}, {}
                )
                seen.add(codes_pair)

            # MAPS relationships from the alias attributes
            for key, value in attributes.items():
                if key.startswith('gene_alias'):
                    own_id, target_set = gene_id, self.gene_maps_gene
                elif key.startswith('transcript_alias'):
                    own_id, target_set = transcript_id, self.transcript_maps_transcript
                else:
                    continue

                # keep only the part before the first '.'
                alias_id = value.split('.')[0]

                # no MAPS relationship from an entity to itself
                if own_id == alias_id:
                    continue

                alias_pair = frozenset((own_id, alias_id))
                if alias_pair in seen:
                    continue

                target_set.add_relationship(
                    {'sid': own_id}, {'sid': alias_id}, {'source': source_name}
                )
                seen.add(alias_pair)
Ejemplo n.º 23
0
class HmdbParser(ReturnParser):
    """
    Parse the HMDB metabolites XML file ('hmdb_metabolites.xml').

    Produces:
      - (Metabolite) nodes, one per child element of the XML root
      - (Metabolite)-[MAPS]->(Metabolite) from the HMDB accession to the ChEBI ID
      - (Metabolite)-[HAS_ASSOCIATION]->(Protein) for every associated UniProt ID
    """

    # XML namespace prefix of all HMDB elements
    _NS = '{http://www.hmdb.ca}'

    # Child elements whose text is copied 1:1 into a node property of the same name.
    # The node's 'sid' is taken from the 'accession' element separately.
    _PROPERTY_TAGS = (
        'name',
        'chebi_id',
        'chemspider_id',
        'cs_description',
        'description',
        'chemical_formula',
        'average_molecular_weight',
        'iupac_name',
        'cas_registry_number',
        'smiles',
        'inchi',
        'kegg_id',
    )

    def __init__(self):
        super(HmdbParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'hmdb'})

        # RelationshipSets
        self.metabolite_map_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})
        self.metabolite_associates_protein = RelationshipSet(
            'HAS_ASSOCIATION', ['Metabolite'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no mounted arguments."""
        self.run()

    def run(self, *args, **kwargs):
        """
        Read all metabolites from the XML file and fill the node and relationship sets.

        Positional and keyword arguments are accepted but not used.
        """
        hmdb_instance = self.get_instance_by_name('Hmdb')

        all_metabolites_file = hmdb_instance.get_file('hmdb_metabolites.xml')

        all_metabolites = etree.parse(all_metabolites_file)

        ns = self._NS

        for metabolite in all_metabolites.getroot():
            sid = metabolite.findtext(ns + 'accession')

            # TODO filter empty properties (missing elements are stored as None)
            metabolite_properties = {'sid': sid}
            for tag in self._PROPERTY_TAGS:
                metabolite_properties[tag] = metabolite.findtext(ns + tag)

            self.metabolites.add_node(metabolite_properties)

            # add mapping to Chebi
            chebi_id = metabolite_properties['chebi_id']
            if chebi_id:
                self.metabolite_map_metabolite.add_relationship(
                    {'sid': sid}, {'sid': chebi_id}, {})

            # add association to Proteins
            # find() returns None when the element is missing; compare with
            # None explicitly because an element without children is falsy.
            protein_associations = metabolite.find(ns + 'protein_associations')
            if protein_associations is None:
                continue

            for protein in protein_associations:
                uniprot_id = protein.findtext(ns + 'uniprot_id')
                # a relationship without a target ID cannot match any node
                if not uniprot_id:
                    continue
                self.metabolite_associates_protein.add_relationship(
                    {'sid': sid}, {'sid': uniprot_id}, {})
Ejemplo n.º 24
0
class MeshParser(ReturnParser):
    """
    Parse the MeSH descriptor XML file ('desc<version>.xml').

    Produces nodes for descriptors, qualifiers, concepts and terms and the
    relationships between them:
      - (MeshDescriptor)-[ALLOWED]->(MeshQualifier)
      - (MeshDescriptor)-[HAS]->(MeshConcept)
      - (MeshConcept)-[HAS]->(MeshTerm)
      - (MeshConcept)-[RELATED]->(MeshConcept)
    """

    def __init__(self):
        super(MeshParser, self).__init__()

        # NodeSets
        self.descriptor = NodeSet(['MeshDescriptor'], merge_keys=['sid'])
        self.qualifier = NodeSet(['MeshQualifier'], merge_keys=['sid'])
        self.concept = NodeSet(['MeshConcept'], merge_keys=['sid'])
        self.term = NodeSet(['MeshTerm'], merge_keys=['sid'])

        # RelationshipSets
        self.descriptor_allowed_qualifier = RelationshipSet('ALLOWED', ['MeshDescriptor'], ['MeshQualifier'], ['sid'],
                                                            ['sid'])

        self.descriptor_has_concept = RelationshipSet('HAS', ['MeshDescriptor'], ['MeshConcept'], ['sid'], ['sid'])
        self.descriptor_has_concept.unique = True
        self.concept_has_term = RelationshipSet('HAS', ['MeshConcept'], ['MeshTerm'], ['sid'], ['sid'])
        self.concept_has_term.unique = True
        self.concept_related_concept = RelationshipSet('RELATED', ['MeshConcept'], ['MeshConcept'], ['sid'], ['sid'])
        self.concept_related_concept.unique = True

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no mounted arguments."""
        self.run()

    def run(self):
        """Parse the descriptor XML file."""
        self.parse_xml()

    def parse_xml(self):
        """
        Parse descriptor XML file.

        The file name is built from the version of the 'Mesh' instance
        ('desc<version>.xml'). Qualifier, concept and term nodes are added only
        the first time their UI is seen; relationships are added for every
        occurrence.
        """
        mesh_instance = self.get_instance_by_name('Mesh')

        version = DataSourceVersion.version_from_string(
            mesh_instance.version
        )

        descriptor_xml = mesh_instance.get_file('desc{}.xml'.format(str(version)))
        log.debug("XML file {}".format(descriptor_xml))

        tree = ET.parse(descriptor_xml)
        root = tree.getroot()

        # UIs of the nodes that were already added
        check_qualifier = set()
        check_concepts = set()
        check_terms = set()

        # Iterate the element directly: Element.getchildren() was removed from
        # xml.etree.ElementTree in Python 3.9.
        for descriptor_record in root:
            descriptor_ui = descriptor_record.find('DescriptorUI').text

            # <DescriptorName>
            #  <String>Calcimycin</String>
            # </DescriptorName>
            descriptor_name = descriptor_record.find('.DescriptorName/String').text

            self.descriptor.add_node({'sid': descriptor_ui, 'name': descriptor_name})

            #   <AllowableQualifiersList>
            #   <AllowableQualifier>
            #    <QualifierReferredTo>
            #     <QualifierUI>Q000302</QualifierUI>
            #      <QualifierName>
            #      <String>isolation &amp; purification</String>
            #      </QualifierName>
            #    </QualifierReferredTo>
            #    <Abbreviation>IP</Abbreviation>
            #   </AllowableQualifier>
            #   </AllowableQualifiersList>

            allowed_qualifiers = descriptor_record.findall(
                '.AllowableQualifiersList/AllowableQualifier/QualifierReferredTo')
            for qualifier in allowed_qualifiers:
                qualifier_ui = qualifier.find('.QualifierUI').text

                # add qualifier node if it does not exist yet
                if qualifier_ui not in check_qualifier:
                    qualifier_name = qualifier.find('.QualifierName/String').text
                    self.qualifier.add_node({'sid': qualifier_ui, 'name': qualifier_name})
                    check_qualifier.add(qualifier_ui)

                # add descriptor -> qualifier relationship
                self.descriptor_allowed_qualifier.add_relationship(
                    {'sid': descriptor_ui}, {'sid': qualifier_ui}, {'source': 'mesh'}
                )

            #  <ConceptList>
            #    <Concept PreferredConceptYN="Y">
            #     <ConceptUI>M0000001</ConceptUI>
            #     <ConceptName>
            #      <String>Calcimycin</String>
            #     </ConceptName>
            #     <CASN1Name>4-Benzoxazolecarboxylic acid, ...</CASN1Name>
            #     <RegistryNumber>37H9VM9WZL</RegistryNumber>
            #     <ScopeNote>An ionophorous, polyether antibiotic from Streptomyces chartreusensis. ...
            #     </ScopeNote>
            #     <RelatedRegistryNumberList>
            #      <RelatedRegistryNumber>52665-69-7 (Calcimycin)</RelatedRegistryNumber>
            #     </RelatedRegistryNumberList>
            #     <ConceptRelationList>
            #      <ConceptRelation RelationName="NRW">
            #      <Concept1UI>M0000001</Concept1UI>
            #      <Concept2UI>M0353609</Concept2UI>
            #      </ConceptRelation>
            #     </ConceptRelationList>
            #     <TermList>
            #      <Term  ConceptPreferredTermYN="Y"  IsPermutedTermYN="N"  LexicalTag="NON"  RecordPreferredTermYN="Y">
            #       <TermUI>T000002</TermUI>
            #       <String>Calcimycin</String>
            #       <DateCreated>
            #        <Year>1999</Year>
            #        <Month>01</Month>
            #        <Day>01</Day>
            #       </DateCreated>
            #       <ThesaurusIDlist>
            #        <ThesaurusID>FDA SRS (2014)</ThesaurusID>
            #        <ThesaurusID>NLM (1975)</ThesaurusID>
            #       </ThesaurusIDlist>
            #      </Term>
            #     </TermList>
            #    </Concept>

            concepts = descriptor_record.findall('.ConceptList/Concept')

            for concept in concepts:
                preferred_concept = concept.attrib['PreferredConceptYN']

                concept_ui = concept.find('.ConceptUI').text

                # concept node if not exists
                if concept_ui not in check_concepts:
                    concept_properties = {
                        'sid': concept_ui,
                        'name': concept.find('.ConceptName/String').text,
                    }

                    # <ScopeNote> is optional; store it only when the element exists
                    scope_note = concept.find('.ScopeNote')
                    if scope_note is not None:
                        concept_properties['scope_note'] = scope_note.text

                    self.concept.add_node(concept_properties)

                    check_concepts.add(concept_ui)

                # (Descriptor)--(Concept) relation
                self.descriptor_has_concept.add_relationship({'sid': descriptor_ui}, {'sid': concept_ui},
                                                             {'preferred': preferred_concept})

                # concept relations
                for concept_relation in concept.findall('.ConceptRelationList/ConceptRelation'):
                    left = concept_relation.find('.Concept1UI').text
                    right = concept_relation.find('.Concept2UI').text
                    name = concept_relation.attrib['RelationName']

                    self.concept_related_concept.add_relationship({'sid': left}, {'sid': right}, {'name': name})

                # iterate Terms for concept
                for term in concept.findall('.TermList/Term'):
                    term_ui = term.find('TermUI').text
                    concept_preferred_term = term.attrib['ConceptPreferredTermYN']

                    # Term node if not exists
                    if term_ui not in check_terms:
                        term_name = term.find('.String').text
                        self.term.add_node({'sid': term_ui, 'name': term_name})

                        check_terms.add(term_ui)

                    # (Concept)--(Term)
                    self.concept_has_term.add_relationship({'sid': concept_ui}, {'sid': term_ui},
                                                           {'preferred': concept_preferred_term})
Ejemplo n.º 25
0
class EnsemblMappingParser(ReturnParser):
    """
    Get mappings from ENSEMBL IDs to other databases.

    ENSEMBL dumps common mapping data to files in the `tsv` directory.
    All files share the same column layout:

        gene_stable_id|transcript_stable_id|protein_stable_id|xref|db_name|info_type|source_identity|xref_identity|linkage_type


    ### Transcripts
    Extract (Transcript {ensembl})-[MAPS]-(Transcript {refseq}) mappings from ENSEMBL.

    Mappings to RefSeq are from: Homo_sapiens.GRCh38.91.refseq.tsv.gz

    Only xrefs whose second letter is 'M' or 'R' are used.


    ### Genes
    Extract (Gene {ensembl})-[MAPS]-(Gene {ncbigene}) mappings from ENSEMBL.

    Mappings to NCBI Gene are from: Homo_sapiens.GRCh38.91.entrez.tsv.gz

    Example:
        gene_stable_id|transcript_stable_id|protein_stable_id|xref|db_name|info_type|source_identity|xref_identity|linkage_type
        ENSG00000223972	ENST00000456328	-	102725121	EntrezGene	DEPENDENT	-	-	-


    ### Proteins
    Extract (Protein {ensembl})-[MAPS]-(Protein {uniprot}) mappings from ENSEMBL.

    Mappings to UniProt are from: Homo_sapiens.GRCh38.91.uniprot.tsv.gz

    Example:
        gene_stable_id	transcript_stable_id	protein_stable_id	xref	db_name	info_type	source_identity	xref_identity	linkage_type
        ENSG00000186092	ENST00000335137	ENSP00000334393	Q8NH21	Uniprot/SWISSPROT	DIRECT	100	100	-

    """
    def __init__(self):

        super(EnsemblMappingParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # define NodeSet and RelationshipSet
        self.gene_maps_gene = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_maps_transcript = RelationshipSet(
            'MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.protein_maps_protein = RelationshipSet(
            'MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})

    # define properties that are used in multiple parsing functions
    @property
    def ensembl_instance(self):
        """The 'Ensembl' datasource instance."""
        return self.get_instance_by_name('Ensembl')

    @property
    def datasource_name(self):
        """Name of the datasource of the 'Ensembl' instance."""
        return self.ensembl_instance.datasource.name

    @staticmethod
    def _iter_data_fields(file_path):
        """
        Yield the whitespace-separated fields of every data line of a gzipped TSV file.

        The first line (header) is skipped, blank lines are ignored. Lines are
        read one at a time instead of loading the whole file into memory.

        :param file_path: Path to the gzipped TSV file.
        """
        with gzip.open(file_path, 'rt') as f:
            # skip header; an empty file simply yields nothing
            next(f, None)
            for line in f:
                flds = line.strip().split()
                if flds:
                    yield flds

    def run_gene(self, taxid):
        """
        Add (Gene)-[MAPS]->(Gene) relationships from the 'entrez' TSV file.

        :param taxid: Taxonomy ID used to locate the TSV file.
        """
        ensembl_tsv_entrez_file_path = Ensembl.get_tsv_file_path(
            taxid, 'entrez', self.ensembl_instance)

        log.debug(
            'Ensembl TSV file path: {}'.format(ensembl_tsv_entrez_file_path))

        # unordered ID pairs that were already added
        check_rels = set()

        for flds in self._iter_data_fields(ensembl_tsv_entrez_file_path):
            ensembl_gene_id = flds[0]
            ncbi_gene_id = flds[3]

            pair = frozenset([ensembl_gene_id, ncbi_gene_id])
            if pair not in check_rels:
                self.gene_maps_gene.add_relationship(
                    {'sid': ensembl_gene_id}, {'sid': ncbi_gene_id}, {})
                check_rels.add(pair)

    def run_transcript(self, taxid):
        """
        Add (Transcript)-[MAPS]->(Transcript) relationships from the 'refseq' TSV file.

        :param taxid: Taxonomy ID used to locate the TSV file.
        """
        ensembl_tsv_refseq_file_path = Ensembl.get_tsv_file_path(
            taxid, 'refseq', self.ensembl_instance)
        log.debug(
            'Ensembl TSV file path: {}'.format(ensembl_tsv_refseq_file_path))

        # unordered ID pairs that were already added
        check_rels = set()

        for flds in self._iter_data_fields(ensembl_tsv_refseq_file_path):
            ensembl_transcript_id = flds[1]
            xref_id = flds[3]

            # filter transcripts: keep xrefs whose second letter is 'M' or 'R'
            # (presumably RefSeq accessions like NM_/NR_/XM_/XR_ -- verify).
            # The slice avoids an IndexError on xrefs shorter than two characters.
            if xref_id[1:2] not in ('M', 'R'):
                continue

            pair = frozenset([ensembl_transcript_id, xref_id])
            if pair not in check_rels:
                self.transcript_maps_transcript.add_relationship(
                    {'sid': ensembl_transcript_id}, {'sid': xref_id},
                    {})
                check_rels.add(pair)

    def run_protein(self, taxid):
        """
        Add (Protein)-[MAPS]->(Protein) relationships from the 'uniprot' TSV file.

        Each relationship carries the `taxid` as a property.

        :param taxid: Taxonomy ID used to locate the TSV file.
        """
        ensembl_tsv_uniprot_file_path = Ensembl.get_tsv_file_path(
            taxid, 'uniprot', self.ensembl_instance)
        log.debug(
            'Ensembl TSV file path: {}'.format(ensembl_tsv_uniprot_file_path))

        # unordered ID pairs that were already added
        check_rels = set()

        for flds in self._iter_data_fields(ensembl_tsv_uniprot_file_path):
            ensembl_protein_id = flds[2]
            xref_id = flds[3]

            pair = frozenset([ensembl_protein_id, xref_id])
            if pair not in check_rels:
                # use the taxid this method was called with, not self.taxid,
                # so the property always matches the file that was read
                self.protein_maps_protein.add_relationship(
                    {'sid': ensembl_protein_id}, {'sid': xref_id},
                    {'taxid': taxid})
                check_rels.add(pair)

    def run_with_mounted_arguments(self):
        """Run the parser with the mounted `taxid` argument."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Collect gene, transcript and protein mappings for one taxonomy ID.

        :param taxid: Taxonomy ID used to locate the TSV files.
        """
        self.run_gene(taxid)
        self.run_transcript(taxid)
        self.run_protein(taxid)
Ejemplo n.º 26
0
class SwissLipidsParser(ReturnParser):
    """
    Parse the SwissLipids files 'lipids.tsv.gz' and 'lipids2uniprot.tsv.gz'.

    Produces (Lipid) nodes and these relationships:
      - (Lipid)-[FROM_LIPID_CLASS]->(Lipid)
      - (Lipid)-[HAS_PARENT]->(Lipid)
      - (Lipid)-[HAS_COMPONENT]->(Lipid)
      - (Lipid)-[MAPS]->(Metabolite)
      - (Lipid)-[HAS_ASSOCIATION]->(Protein)
    """

    # characters of the column names that are replaced with '_' to get property names
    _UNSAFE_CHARS = str.maketrans({char: '_' for char in ' []()*/'})

    def __init__(self):

        super(SwissLipidsParser, self).__init__()

        # define NodeSet and RelationshipSet
        self.lipids = NodeSet(['Lipid'], merge_keys=['sid'])

        self.lipid_fromclass_lipid = RelationshipSet('FROM_LIPID_CLASS', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_parent_lipid = RelationshipSet('HAS_PARENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_component_lipid = RelationshipSet('HAS_COMPONENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_maps_metabolite = RelationshipSet('MAPS', ['Lipid'], ['Metabolite'], ['sid'], ['sid'])
        self.lipid_associates_protein = RelationshipSet('HAS_ASSOCIATION', ['Lipid'], ['Protein'], ['sid'], ['sid'])

    @staticmethod
    def _field(flds, index):
        """
        Return the stripped field at `index`, or '' if the row has fewer fields.

        Rows are split after stripping the line, so trailing empty columns are
        missing from `flds`; a missing column is treated like an empty one.
        """
        try:
            return flds[index].strip()
        except IndexError:
            return ''

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no mounted arguments."""
        self.run()

    def run(self):
        """Parse the lipids file and the lipid-to-protein file."""
        swisslipids_instance = self.get_instance_by_name('SwissLipids')

        self.get_lipids(swisslipids_instance)
        self.get_lipid_to_protein(swisslipids_instance)

    def get_lipids(self, instance):
        """
        Lipid ID	Level	Name	Abbreviation*	Synonyms*	Lipid class*	Parent	Components*	SMILES (pH7.3)	InChI (pH7.3)	InChI key (pH7.3)	Formula (pH7.3)	Charge (pH7.3)	Mass (pH7.3)	Exact Mass (neutral form)	Exact m/z of [M.]+	Exact m/z of [M+H]+	Exact m/z of [M+K]+ 	Exact m/z of [M+Na]+	Exact m/z of [M+Li]+	Exact m/z of [M+NH4]+	Exact m/z of [M-H]-	Exact m/z of [M+Cl]-	Exact m/z of [M+OAc]- 	CHEBI	LIPID MAPS	HMDB	PMID
        SLM:000000002	Class	Ceramide (iso-d17:1(4E))	Cer(iso-d17:1(4E))	N-acyl-15-methylhexadecasphing-4-enine	SLM:000399814			CC(C)CCCCCCCCC\\C=C\\[C@@H](O)[C@H](CO)NC([*])=O	InChI=none		C18H34NO3R	0												70846			14685263 | 21325339 | 9603947 | 21926990

        Columns:
            - difficult to select column names, use index
            - the * means the field is a list
            - different field separators in list fields

        0	Lipid ID
        1	Level
        2	Name
        3	Abbreviation*
        4	Synonyms*
        5	Lipid class*
        6	Parent
        7	Components*
        8	SMILES (pH7.3)
        9	InChI (pH7.3)
        10	InChI key (pH7.3)
        11	Formula (pH7.3)
        12	Charge (pH7.3)
        13	Mass (pH7.3)
        14	Exact Mass (neutral form)
        15	Exact m/z of [M.]+
        16	Exact m/z of [M+H]+
        17	Exact m/z of [M+K]+
        18	Exact m/z of [M+Na]+
        19	Exact m/z of [M+Li]+
        20	Exact m/z of [M+NH4]+
        21	Exact m/z of [M-H]-
        22	Exact m/z of [M+Cl]-
        23	Exact m/z of [M+OAc]-
        24	CHEBI
        25	LIPID MAPS
        26	HMDB
        27	PMID

        Every non-empty column is stored as a node property named after the
        header column (with ' ', '[', ']', '(', ')', '*' and '/' replaced by '_').
        Relationships are only created when the referenced ID is not empty.

        :param instance: The datasource instance.
        """

        lipids_file = instance.get_file('lipids.tsv.gz')

        with gzip.open(lipids_file, 'rt', errors="replace") as f:
            # first line is the header
            header = next(f).strip().split('\t')
            header_cypher_safe = [s.translate(self._UNSAFE_CHARS) for s in header]
            log.debug(header_cypher_safe)

            for l in f:
                flds = l.strip().split('\t')

                lipid_sid = flds[0]
                # blank line: no lipid to add
                if not lipid_sid:
                    continue

                # (Lipid) node
                props = {'source': 'swisslipids', 'sid': lipid_sid}

                # add all properties, some are empty but contain whitespaces
                for prop_name, fld in zip(header_cypher_safe, flds):
                    fld = fld.strip()
                    if fld:
                        props[prop_name] = fld

                self.lipids.add_node(props)

                # (Lipid)-[FROM_LIPID_CLASS]-(Lipid)
                for lipid_class_sid in self._field(flds, 5).split('|'):
                    # strip leading/trailing spaces, not always existing
                    lipid_class_sid = lipid_class_sid.strip()
                    # an empty column must not create a relationship to sid ''
                    if lipid_class_sid:
                        self.lipid_fromclass_lipid.add_relationship(
                            {'sid': lipid_sid}, {'sid': lipid_class_sid}, {'source': 'swisslipids'}
                        )

                # (Lipid)-[HAS_PARENT]-(Lipid)
                parent_sid = self._field(flds, 6)
                if parent_sid:
                    self.lipid_parent_lipid.add_relationship(
                        {'sid': lipid_sid}, {'sid': parent_sid}, {'source': 'swisslipids'}
                    )

                # (Lipid)-[COMPONENT]-(Lipid)
                ## e.g. SLM:000000510 (sn1) / SLM:000000418 (sn2)
                for lipid_component in self._field(flds, 7).split('/'):
                    # split into sid and type at the first space
                    lipid_component_sid, sep, lipid_component_type = lipid_component.strip().partition(' ')
                    # entries without a type (including empty entries) are skipped
                    if not sep:
                        continue

                    self.lipid_component_lipid.add_relationship(
                        {'sid': lipid_sid}, {'sid': lipid_component_sid}, {'type': lipid_component_type}
                    )

                # (Lipid)-[MAPS]-(Metabolite), from the CHEBI (24) and HMDB (26) columns
                for column in (24, 26):
                    metabolite_id = self._field(flds, column)
                    if metabolite_id:
                        self.lipid_maps_metabolite.add_relationship(
                            {'sid': lipid_sid}, {'sid': metabolite_id}, {'source': 'swisslipids'}
                        )

    def get_lipid_to_protein(self, instance):
        """
        File: lipids2uniprot.tsv.gz

        Columns:
            - difficult to select column names, use index


        0	metabolite id
        1	UniprotKB IDs
        2	level
        3	metabolite name
        4	abbreviations
        5	synonyms
        6	lipid class
        7	components
        8	PMIDs
        9	SMILES (pH7.3)
        10	InChI (pH7.3)
        11	InChI key (pH7.3)
        12	Formula (pH7.3)
        13	Mass (pH7.3)
        14	Charge (pH7.3)
        15	Exact Mass (neutral form)
        16	Exact m/z of [M.]+
        17	Exact m/z of [M+H]+
        18	Exact m/z of [M+K]+Exact m/z of [M+Na]+
        19	Exact m/z of [M+Li]+
        20	Exact m/z of [M+NH4]+
        21	Exact m/z of [M-H]-
        22	Exact m/z of [M+Cl]-
        23	Exact m/z of [M+OAc]-
        24	ChEBI
        25	LipidMaps
        26	HMDB
        27	Mapping level

        One (Lipid)-[HAS_ASSOCIATION]->(Protein) relationship is added per
        distinct UniProt ID of a row, with the mapping level as 'level' property
        ('' if the column is missing from the row).

        :param instance: The datasource instance.
        """
        lipids_2_protein_file = instance.get_file('lipids2uniprot.tsv.gz')

        # iterate file
        with gzip.open(lipids_2_protein_file, 'rt', errors="replace") as f:
            # skip header; an empty file simply yields nothing
            next(f, None)
            for l in f:
                flds = l.strip().split('\t')
                swisslipids_id = self._field(flds, 0)
                # blank line: nothing to add
                if not swisslipids_id:
                    continue
                mapping_level = self._field(flds, 27)

                # collect UniProt IDs from uniprot fields, contains a '|' separated list
                # G5EC84 | O18037 | P91079 | Q09517 | Q10916 | Q20735 | Q21054 | Q23498 | Q9U3D4
                # note: not always formatted with space: ' | '
                stripped_ids = (u.strip() for u in self._field(flds, 1).split('|'))
                # dict.fromkeys removes duplicates and keeps the order of the file
                uniprot_ids = dict.fromkeys(u for u in stripped_ids if u)

                for up in uniprot_ids:
                    self.lipid_associates_protein.add_relationship(
                        {'sid': swisslipids_id}, {'sid': up}, {'source': 'swisslipids', 'level': mapping_level}
                    )
Ejemplo n.º 27
0
class NcbiHomoloGeneParser(ReturnParser):
    """
    The NCBI HomoloGene parser reads the basic datafile `homologene.data` from HomoloGene.

    The file `homologene.data` is a tab separated list of homology groups.

    Fields: group ID, tax ID, gene ID, gene symbol, unclear? (presumably the protein GI,
    confirm against the HomoloGene README), refseq ID

    Example::

    3	9606	34	ACADM	4557231	NP_000007.1
    3	9598	469356	ACADM	160961497	NP_001104286.1
    3	9544	705168	ACADM	109008502	XP_001101274.1
    3	9615	490207	ACADM	545503811	XP_005622188.1

    Only the group ID (first column) and the gene ID (third column) are used: every
    unordered pair of gene IDs inside one group becomes a `HOMOLOG` relationship.
    """
    def __init__(self):
        super(NcbiHomoloGeneParser, self).__init__()

        # output data
        self.gene_homolog_gene = RelationshipSet('HOMOLOG', ['Gene'], ['Gene'],
                                                 ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no arguments."""
        self.run()

    def run(self):
        """
        Read `homologene.data` and create pairwise `HOMOLOG` relationships per group.

        Lines are consumed in file order and a group is considered finished as soon as a
        line with a different group ID appears, so all lines of one group are assumed to
        be consecutive in the file.
        """
        ncbihomologene_instance = self.get_instance_by_name('NcbiHomoloGene')
        datafile = ncbihomologene_instance.get_file('homologene.data')

        current_group_id = None
        current_group_genes = set()

        with open(datafile) as f:
            for line in f:
                line = line.strip()
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                if not line:
                    continue

                flds = line.split('\t')
                group_id = flds[0]
                gene_id = flds[2]

                # first line of a new group: emit the finished group, then start over
                if current_group_id is not None and group_id != current_group_id:
                    self._add_group_relationships(current_group_genes)
                    current_group_genes = set()

                current_group_id = group_id
                current_group_genes.add(gene_id)

        # the last group is not followed by a line with a new group ID,
        # so it has to be emitted explicitly after the loop
        self._add_group_relationships(current_group_genes)

    def _add_group_relationships(self, gene_ids):
        """Add one `HOMOLOG` relationship for every unordered pair in `gene_ids`."""
        for g1, g2 in combinations(gene_ids, 2):
            self.gene_homolog_gene.add_relationship({'sid': g1},
                                                    {'sid': g2},
                                                    {})
Ejemplo n.º 28
0
class MirbaseParser(ReturnParser):
    def __init__(self):
        """Create the empty node and relationship containers this parser fills."""
        super().__init__()

        # NodeSets: one for precursor miRNAs, one for mature miRNAs, both merged on `sid`
        self.precursor_mirna = NodeSet(['PrecursorMirna'], merge_keys=['sid'])
        self.mature_mirna = NodeSet(['Mirna'], merge_keys=['sid'])

        # RelationshipSets: (PrecursorMirna)-[PRE]->(Mirna)
        self.precursor_codes_mature = RelationshipSet(
            'PRE', ['PrecursorMirna'], ['Mirna'], ['sid'], ['sid'])
        # (Transcript)-[IS]->(PrecursorMirna)
        self.transcript_codes_precursor = RelationshipSet(
            'IS', ['Transcript'], ['PrecursorMirna'], ['sid'], ['sid'])
        # (Gene)-[IS]->(PrecursorMirna)
        self.gene_is_precursor = RelationshipSet(
            'IS', ['Gene'], ['PrecursorMirna'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser; `run` takes no arguments, so it is called directly."""
        self.run()

    def run(self):
        """
        Execute all parsing steps in a fixed order.

        Mature miRNA nodes and precursor miRNA nodes are collected first, followed by the
        precursor/mature, transcript/precursor and gene/precursor relationships.
        """
        parsing_steps = (
            self.get_mature_mirnas,
            self.get_pre_mirnas,
            self.get_pre_mature_relationship,
            self.get_pre_transcript_relationships,
            self.get_gene_pre_relationships,
        )
        for parsing_step in parsing_steps:
            parsing_step()

    @property
    def mirbase_instance(self):
        """
        The datasource instance registered under the name 'Mirbase'.

        Looked up through `get_instance_by_name` on every access; the other methods use
        it to resolve file paths (`get_file`) and the datasource name.
        """
        return self.get_instance_by_name('Mirbase')

    @property
    def pre_mirna_df(self):
        """
        Get precursor miRNA DataFrame from: mirna.txt.gz

        The file is read as a tab separated table without header line. The first column
        becomes the index, the remaining columns are named:

        mir_acc, mir_id, prev_mir_id, desc, sequence, comment, organism_key, dead_flag

        Note: the file is parsed again on every access of this property.
        """
        column_names = [
            'mir_acc', 'mir_id', 'prev_mir_id', 'desc',
            'sequence', 'comment', 'organism_key', 'dead_flag',
        ]
        table_path = self.mirbase_instance.get_file('mirna.txt.gz')
        table = pandas.read_csv(table_path, sep='\t', index_col=0, header=None)
        table.columns = column_names
        return table

    @property
    def mature_mirna_df(self):
        """
        Get mature miRNA DataFrame from: mirna_mature.txt.gz

        The file is read as a tab separated table without header line. The first column
        becomes the index, the remaining columns are named:

        name, prev_name, mir_acc, evidence, ref, similarity, dead_flag

        Note: the file is parsed again on every access of this property.
        """
        column_names = [
            'name', 'prev_name', 'mir_acc', 'evidence',
            'ref', 'similarity', 'dead_flag',
        ]
        table_path = self.mirbase_instance.get_file('mirna_mature.txt.gz')
        table = pandas.read_csv(table_path, sep='\t', index_col=0, header=None)
        table.columns = column_names
        return table

    @property
    def context_df(self):
        """
        Get context DataFrame from: mirna_context.txt.gz

        The file is read as a tab separated table without header line. The first column
        (`auto_mirna`) becomes the index, the remaining columns are named:

        transcript_id, overlap_sense, overlap_type, number, transcript_source, transcript_name

        Example line (tab separated):
        64777, ENST00000545242, +, intron, 15, HGNC_trans_name, ABLIM2-203

        Note: the file is parsed again on every access of this property.
        """
        column_names = [
            'transcript_id', 'overlap_sense', 'overlap_type',
            'number', 'transcript_source', 'transcript_name',
        ]
        table_path = self.mirbase_instance.get_file('mirna_context.txt.gz')
        table = pandas.read_csv(table_path, sep='\t', index_col=0, header=None)
        table.columns = column_names
        return table

    @property
    def mirna_database_url_df(self):
        """
        Database list from: mirna_database_url.txt.gz

        The file is read as a tab separated table without header line. The first column
        (`auto_db`) becomes the index, the remaining columns are named:

        display_name, url

        Example line (tab separated):
        5, EntrezGene, https://www.ncbi.nlm.nih.gov/gene/<?>
        """
        table_path = self.mirbase_instance.get_file('mirna_database_url.txt.gz')
        table = pandas.read_csv(table_path, header=None, index_col=0, sep='\t')
        table.columns = ['display_name', 'url']
        return table

    @property
    def mirna_database_link_df(self):
        """
        Database links from: mirna_database_links.txt.gz

        The file is read as a tab separated table without header line. The first column
        (`auto_mirna`) becomes the index, the remaining columns are named:

        auto_db, link, display_name

        Example line (tab separated):
        64744, 5, 406883, MIRLET7A3
        """
        table_path = self.mirbase_instance.get_file('mirna_database_links.txt.gz')
        table = pandas.read_csv(table_path, header=None, index_col=0, sep='\t')
        table.columns = ['auto_db', 'link', 'display_name']
        return table

    def get_mature_mirnas(self):
        """
        Add one `Mirna` node per row of the mature miRNA table (mirna_mature.txt.gz).

        Of the table columns (name, prev_name, mir_acc, evidence, ref, similarity,
        dead_flag) the accession is stored as `sid`; `name` and `evidence` are copied
        as they are.
        """
        for mature in self.mature_mirna_df.itertuples():
            node_properties = {
                'sid': mature.mir_acc,
                'name': mature.name,
                'evidence': mature.evidence,
            }
            self.mature_mirna.add_node(node_properties)

    def get_pre_mirnas(self):
        """
        Add one `PrecursorMirna` node per precursor miRNA that has an organism entry.

        Precursor miRNAs are stored in a table: mirna.txt.gz

        mir_acc, mir_id, prev_mir_id, desc, sequence, comment, organism_key

        Organism information for the precursor miRNAs is in another table: mirna_species.txt.gz
        Its first column is used as index, the remaining columns are named:

        organism, division, org_name, taxon_id, taxonomy, genome_assembly, genome_accession, ensembl_db

        Both tables are joined on `organism_key` (precursor table) and the index of the
        organism table; `taxon_id` of the matching organism is stored as `taxid`.

        Nothing is returned, the nodes are added to `self.precursor_mirna`.
        """
        species_path = self.mirbase_instance.get_file('mirna_species.txt.gz')

        # organism table, indexed by its first column
        species_df = pandas.read_csv(species_path, header=None, index_col=0, sep='\t')
        species_df.columns = [
            'organism', 'division', 'org_name', 'taxon_id',
            'taxonomy', 'genome_assembly', 'genome_accession', 'ensembl_db',
        ]

        # default (inner) join: precursors without a matching organism row are dropped
        precursors_with_species = self.pre_mirna_df.merge(
            species_df, left_on='organism_key', right_index=True)

        # add precursor miRNA nodes
        for precursor in precursors_with_species.itertuples():
            self.precursor_mirna.add_node({
                'sid': precursor.mir_acc,
                'name': precursor.mir_id,
                'desc': precursor.desc,
                'sequence': precursor.sequence,
                'taxid': precursor.taxon_id,
                # always stored as a string, whatever pandas parsed for the comment
                'comment': str(precursor.comment),
            })

    def get_pre_mature_relationship(self):
        """
        Add a `PRE` relationship from each precursor miRNA to its mature miRNAs.

        Mature miRNAs and precursor miRNAs come from the tables described in the
        respective properties (`mature_mirna_df`, `pre_mirna_df`).

        The mapping is stored in a separate table: mirna_pre_mature.txt.gz

        pre_dbid, mature_dbid, start, end

        It contains the primary key of the precursor and mature miRNA tables and the
        start/end of the mature sequence within the precursor. The primary keys are
        translated to miRBase accessions, start/end are stored as integer properties.

        Nothing is returned, the relationships are added to `self.precursor_codes_mature`.

        :raises KeyError: if the mapping table references a key missing from either table.
        """
        mapping_path = self.mirbase_instance.get_file('mirna_pre_mature.txt.gz')

        # db primary key (DataFrame index) -> miRBase accession, for both tables
        precursor_acc_by_key = {
            pre.Index: pre.mir_acc for pre in self.pre_mirna_df.itertuples()
        }
        mature_acc_by_key = {
            mat.Index: mat.mir_acc for mat in self.mature_mirna_df.itertuples()
        }

        # mapping table: no header, no column is used as index
        mapping_df = pandas.read_csv(mapping_path, header=None, index_col=False, sep='\t')
        mapping_df.columns = ['pre_dbid', 'mature_dbid', 'start', 'end']

        for mapping in mapping_df.itertuples():
            mature_acc = mature_acc_by_key[mapping.mature_dbid]
            precursor_acc = precursor_acc_by_key[mapping.pre_dbid]

            # position of the mature sequence within the precursor
            position = {'start': int(mapping.start), 'end': int(mapping.end)}
            self.precursor_codes_mature.add_relationship(
                {'sid': precursor_acc}, {'sid': mature_acc}, position)

    def get_pre_transcript_relationships(self):
        """
        Add an `IS` relationship from each context transcript to its precursor miRNA.

        MirBase provides the transcriptional context in a single file: mirna_context.txt.gz

        auto_mirna, transcript_id, overlap_sense, overlap_type, number, transcript_source, transcript_name

        Example line (tab separated):
        64777, ENST00000545242, +, intron, 15, HGNC_trans_name, ABLIM2-203

        `auto_mirna` is the index of the context table and is matched against the index
        of the precursor miRNA table (mirna.txt.gz) to obtain the precursor accession.
        `overlap_type` and `number` are stored as relationship properties.
        """
        # left join on both indexes keeps every context row; rows without a matching
        # precursor have no value for `mir_acc`
        context_with_precursor = self.context_df.merge(
            self.pre_mirna_df, how='left', left_index=True, right_index=True)

        for context in context_with_precursor.itertuples():
            overlap = {
                'overlap_type': context.overlap_type,
                'number': context.number,
            }
            self.transcript_codes_precursor.add_relationship(
                {'sid': context.transcript_id}, {'sid': context.mir_acc}, overlap)

    def get_gene_pre_relationships(self):
        """
        Add an `IS` relationship from each linked Entrez gene to its precursor miRNA.

        MiRBase provides links to external databases in a table: mirna_database_links.txt.gz

        `auto_mirna`, `auto_db`, `link`, `display_name`
        Example line (tab separated): 64743, 5, 406882, MIRLET7A2

        auto_mirna is the mirna KEY from: mirna.txt.gz

        auto_db is the DB KEY from: mirna_database_url.txt.gz
        Example line (tab separated): 5, EntrezGene, https://www.ncbi.nlm.nih.gov/gene/<?>

        The link table is joined with the database table (on `auto_db`) and with the
        precursor miRNA table (on the `auto_mirna` index). Only links whose database
        display name is 'EntrezGene' are turned into relationships; `link` is used as
        the gene `sid`.

        Nothing is returned, the relationships are added to `self.gene_is_precursor`.
        """
        # both tables have a `display_name` column: after the merge the link's own
        # name is `display_name_x`, the database name is `display_name_y`
        gene_pre_df = pandas.merge(self.mirna_database_link_df,
                                   self.mirna_database_url_df,
                                   how='left',
                                   left_on='auto_db',
                                   right_index=True)

        final_merge = pandas.merge(gene_pre_df,
                                   self.pre_mirna_df,
                                   how='left',
                                   left_index=True,
                                   right_index=True)

        # identical for every relationship, so resolve it once instead of per row
        source_name = self.mirbase_instance.datasource.name

        for row in final_merge.itertuples():
            if row.display_name_y != 'EntrezGene':
                continue

            # NOTE(review): `link` is passed on exactly as pandas parsed it; verify that
            # its type matches the `sid` of the Gene nodes these relationships attach to.
            self.gene_is_precursor.add_relationship(
                {'sid': row.link}, {'sid': row.mir_acc},
                {'source': source_name})