Ejemplo n.º 1
0
def _build_uniprot_sequences():
    """Return a single dict combining Swissprot and Uniprot isoform sequences."""
    swissprot_file = resource_manager.get_create_resource_file('swissprot',
                                                               cached=True)
    isoform_file = resource_manager.get_create_resource_file('isoforms',
                                                             cached=True)
    logger.info("Loading Swissprot sequences...")
    sequences = load_fasta_sequences(swissprot_file)
    logger.info("Loading Uniprot isoform sequences...")
    # Isoform entries are merged on top of the Swissprot ones
    sequences.update(load_fasta_sequences(isoform_file))
    return sequences
Ejemplo n.º 2
0
def _build_uniprot_entries():
    """Parse the Uniprot entry TSV files into lookup dicts keyed by Uniprot ID.

    Reads both the main 'up' resource and the SARS-CoV-2 supplement, and
    returns gene names, mnemonics (both directions), MGI/RGD cross-references
    (both directions), sequence lengths, the set of reviewed IDs, per-protein
    feature lists, and a flat dict of features keyed by feature ID.
    """
    gene_names = {}
    mnemonics = {}
    mnemonics_reverse = {}
    mgi_by_up = {}
    rgd_by_up = {}
    up_by_mgi = {}
    up_by_rgd = {}
    lengths = {}
    features = {}
    reviewed_ids = set()
    entry_files = [
        resource_manager.get_create_resource_file('up'),
        resource_manager.get_create_resource_file('up_sars_cov2'),
    ]
    for entries_file in entry_files:
        with open(entries_file, 'r') as fh:
            reader = csv.reader(fh, delimiter='\t')
            # Skip the header row
            next(reader)
            for (up_id, gene_name, mnemonic, rgd, mgi, length,
                 reviewed, features_json) in reader:
                if reviewed == 'reviewed':
                    reviewed_ids.add(up_id)
                gene_names[up_id] = gene_name
                mnemonics[up_id] = mnemonic
                mnemonics_reverse[mnemonic] = up_id
                lengths[up_id] = int(length)
                # Only the first of the semicolon-separated MGI/RGD IDs
                # is indexed
                if mgi:
                    first_mgi = mgi.split(';')[0]
                    mgi_by_up[up_id] = first_mgi
                    up_by_mgi[first_mgi] = up_id
                if rgd:
                    first_rgd = rgd.split(';')[0]
                    rgd_by_up[up_id] = first_rgd
                    up_by_rgd[first_rgd] = up_id
                features[up_id] = [feature_from_json(feat)
                                   for feat in json.loads(features_json)]

    # Flatten all features into a dict keyed by feature ID
    features_by_id = {feat.id: feat
                      for feats in features.values()
                      for feat in feats}

    return (gene_names, mnemonics, mnemonics_reverse, mgi_by_up, rgd_by_up,
            up_by_mgi, up_by_rgd, lengths, reviewed_ids, features,
            features_by_id)
Ejemplo n.º 3
0
def _build_hgnc_mappings():
    """Build HGNC name/ID, Uniprot, and Entrez cross-reference dicts.

    Returns a 5-tuple: approved HGNC name -> HGNC ID, HGNC ID -> Uniprot ID
    string, Uniprot ID -> HGNC ID, Entrez ID -> Uniprot ID string, and
    Uniprot ID -> Entrez ID.
    """
    name_to_id = {}
    up_by_hgnc = {}
    hgnc_by_up = {}
    up_by_entrez = {}
    entrez_by_up = {}
    hgnc_file = resource_manager.get_create_resource_file('hgnc')
    with gzip.open(hgnc_file, 'rt', encoding='utf-8') as fh:
        reader = csv.reader(fh, delimiter='\t')
        # Skip the header row
        next(reader)
        for row in reader:
            hgnc_id = row[0][5:]  # drop the "HGNC:" prefix
            if row[3] == 'Approved':
                name_to_id[row[1]] = hgnc_id
            # Uniprot column; may hold several IDs separated by ", "
            uniprot_id = row[6]
            if not uniprot_id:
                continue
            up_by_hgnc[hgnc_id] = uniprot_id
            upids = uniprot_id.split(', ')
            for upid in upids:
                hgnc_by_up[upid] = hgnc_id
            # Entrez mappings are only recorded when a Uniprot ID exists
            entrez_id = row[5]
            if entrez_id:
                for upid in upids:
                    entrez_by_up[upid] = entrez_id
                    up_by_entrez[entrez_id] = uniprot_id

    return name_to_id, up_by_hgnc, hgnc_by_up, up_by_entrez, entrez_by_up
Ejemplo n.º 4
0
def _build_human_mouse_rat():
    """Map human Uniprot IDs to mouse and rat Uniprot IDs via MGI/RGD.

    Uses the HGNC table's MGI/RGD cross-references together with the
    reverse MGI/RGD lookups in the ``um`` module.
    """
    mouse_by_human = {}
    rat_by_human = {}
    hgnc_file = resource_manager.get_create_resource_file('hgnc')
    with gzip.open(hgnc_file, 'rt', encoding='utf-8') as fh:
        reader = csv.reader(fh, delimiter='\t')
        # Skip the header row
        next(reader)
        for row in reader:
            human_id, mgi_id, rgd_id = row[6:9]
            if not human_id:
                continue
            if mgi_id:
                # Take the first listed ID and strip any "MGI:" prefix
                mgi = mgi_id.split(', ')[0]
                if mgi.startswith('MGI:'):
                    mgi = mgi[4:]
                mouse_up = um.uniprot_mgi_reverse.get(mgi)
                if mouse_up:
                    mouse_by_human[human_id] = mouse_up
            if rgd_id:
                # Same treatment for the "RGD:" prefix
                rgd = rgd_id.split(', ')[0]
                if rgd.startswith('RGD:'):
                    rgd = rgd[4:]
                rat_up = um.uniprot_rgd_reverse.get(rgd)
                if rat_up:
                    rat_by_human[human_id] = rat_up
    return mouse_by_human, rat_by_human
Ejemplo n.º 5
0
def _build_refseq_uniprot():
    """Build a mapping from RefSeq protein IDs to lists of Uniprot IDs.

    Returns
    -------
    dict
        Keys are RefSeq IDs, values are lists of Uniprot IDs (a single
        RefSeq ID can map to more than one Uniprot entry).
    """
    refseq_uniprot_file = resource_manager.get_create_resource_file(
        'refseq_uniprot')
    refseq_up = {}
    with gzip.open(refseq_uniprot_file, 'rt', encoding='utf-8') as f:
        # setdefault replaces the original membership-check-then-insert
        # pattern with a single lookup per row
        for refseq_id, up_id in csv.reader(f):
            refseq_up.setdefault(refseq_id, []).append(up_id)
    return refseq_up
Ejemplo n.º 6
0
def _build_uniprot_entries():
    """Parse the Uniprot entries TSV into per-protein lookup dicts.

    Returns gene names, mnemonics (both directions), MGI/RGD
    cross-references (both directions), sequence lengths, the set of
    reviewed IDs, and signal-peptide (begin, end) positions.
    """
    gene_names = {}
    mnemonics = {}
    mnemonics_reverse = {}
    mgi_by_up = {}
    rgd_by_up = {}
    up_by_mgi = {}
    up_by_rgd = {}
    lengths = {}
    signal_peptides = {}
    reviewed_ids = set()
    entries_file = resource_manager.get_create_resource_file('up')
    with open(entries_file, 'r') as fh:
        reader = csv.reader(fh, delimiter='\t')
        # Skip the header row
        next(reader)
        for (up_id, gene_name, mnemonic, rgd, mgi, length, reviewed,
             signal_peptide) in reader:
            if reviewed == 'reviewed':
                reviewed_ids.add(up_id)
            gene_names[up_id] = gene_name
            mnemonics[up_id] = mnemonic
            mnemonics_reverse[mnemonic] = up_id
            lengths[up_id] = int(length)
            # Only the first of the semicolon-separated IDs is indexed
            if mgi:
                first_mgi = mgi.split(';')[0]
                mgi_by_up[up_id] = first_mgi
                up_by_mgi[first_mgi] = up_id
            if rgd:
                first_rgd = rgd.split(';')[0]
                rgd_by_up[up_id] = first_rgd
                up_by_rgd[first_rgd] = up_id
            # Default to no signal peptide; overwrite when the annotation
            # matches the "SIGNAL <beg> <end> " format
            bounds = (None, None)
            if signal_peptide:
                match = re.match(r'SIGNAL (\d+) (\d+) ', signal_peptide)
                if match:
                    beg_pos, end_pos = match.groups()
                    bounds = (int(beg_pos), int(end_pos))
            signal_peptides[up_id] = bounds

    return (gene_names, mnemonics, mnemonics_reverse, mgi_by_up, rgd_by_up,
            up_by_mgi, up_by_rgd, lengths, reviewed_ids, signal_peptides)
Ejemplo n.º 7
0
def _build_uniprot_sec():
    """Build a mapping from secondary to primary Uniprot accessions.

    The 'upsec' resource is a text file with a preamble; data rows
    (secondary accession, primary accession pairs) start two lines after
    the last line beginning with 'Secondary AC'.

    Returns
    -------
    dict
        Keys are secondary accessions, values are lists of primary
        accessions (one secondary ID can map to several primary IDs).
    """
    sec_file = resource_manager.get_create_resource_file('upsec')
    # Fix: use a context manager so the file handle is actually closed
    # (the original called open() and never closed it)
    with open(sec_file, 'rt') as fh:
        lines = fh.readlines()
    # Fix: initialize entry_lines so a file without a 'Secondary AC'
    # header yields an empty mapping instead of raising NameError
    entry_lines = []
    for i, line in enumerate(lines):
        if line.startswith('Secondary AC'):
            # As in the original, the last matching header wins
            entry_lines = lines[i + 2:]

    uniprot_sec = {}
    for line in entry_lines:
        sec_id, prim_id = line.split()
        # setdefault replaces the original try/except KeyError idiom
        uniprot_sec.setdefault(sec_id, []).append(prim_id)
    return uniprot_sec
Ejemplo n.º 8
0
def _get_phospho_site_dataset():
    """Read phosphosite data into dicts keyed by Uniprot ID and by site group.

    Returns
    -------
    tuple
        The first element of the tuple contains the PhosphoSite data keyed
        by Uniprot ID, the second element contains data keyed by site group.
        Both dicts have instances of the PhosphoSite namedtuple as values.
        If the PhosphoSite data file cannot be loaded, returns (None, None).
    """
    # Module-level caches: parsed once on first call, reused afterwards
    global _data_by_up
    global _data_by_site_grp
    phosphosite_data_file = resource_manager.get_create_resource_file('psp')
    if _data_by_up is None or _data_by_site_grp is None:
        with open(phosphosite_data_file, 'r') as fh:
            # Get the csv reader generator
            reader = csv.reader(fh, delimiter='\t')
            # Skip 4 rows
            for _ in range(4):
                next(reader)
            # Build up a dict by protein: up_id -> residue/position -> [sites]
            data_by_up = defaultdict(lambda: defaultdict(list))
            data_by_site_grp = defaultdict(list)
            for row in reader:
                site = PhosphoSite(*row)
                # MOD_RSD is hyphen-delimited; keep the part before the
                # first hyphen (presumably residue+position — the commented
                # line below would have stripped the residue letter)
                res_pos = site.MOD_RSD.split('-')[0]
                #res_pos = res_pos[1:] # DANGEROUS: lookup based on pos alone
                # ACC_ID may be isoform-specific (hyphen suffix); the base
                # accession is the part before the hyphen
                base_acc_id = site.ACC_ID.split('-')[0]
                data_by_up[site.ACC_ID][res_pos].append(site)
                # If the ID was isoform specific, add to the dict for the whole
                # protein
                if base_acc_id != site.ACC_ID:
                    data_by_up[base_acc_id][res_pos].append(site)
                # Catch the handful of isoforms that have a Uniprot ID without
                # the hyphen
                elif site.ACC_ID in _iso_to_ref_map:
                    ref_id = _iso_to_ref_map[site.ACC_ID]
                    data_by_up[ref_id][res_pos].append(site)
                # To catch additional cases, include an entry for the -1 base ID
                else:
                    # NOTE(review): this aliases, not copies — "<base>-1" and
                    # the base ID end up sharing one inner dict
                    data_by_up['%s-1' % base_acc_id] = data_by_up[base_acc_id]
                data_by_site_grp[site.SITE_GRP_ID].append(site)
            # Publish the fully built dicts into the module-level caches
            _data_by_up = data_by_up
            _data_by_site_grp = data_by_site_grp
    return (_data_by_up, _data_by_site_grp)
Ejemplo n.º 9
0
def _build_hgnc_mappings():
    """Build HGNC name -> ID and HGNC ID -> Uniprot ID mappings.

    Only entries whose status is 'Approved' contribute to the name
    mapping; the Uniprot mapping is recorded for every row (the Uniprot
    value may be an empty string).
    """
    name_to_id = {}
    up_by_hgnc = {}
    hgnc_file = resource_manager.get_create_resource_file('hgnc')
    with open(hgnc_file, 'r') as fh:
        reader = csv.reader(fh, delimiter='\t')
        # Skip the header row
        next(reader)
        for row in reader:
            hgnc_id = row[0][5:]  # drop the "HGNC:" prefix
            if row[3] == 'Approved':
                name_to_id[row[1]] = hgnc_id
            # Uniprot
            up_by_hgnc[hgnc_id] = row[6]
    return name_to_id, up_by_hgnc
Ejemplo n.º 10
0
def _build_uniprot_entries():
    """Parse the Uniprot entries file into a set of lookup tables.

    Returns a 16-tuple of mappings keyed by (or mapping to) Uniprot ID:
    gene names, mnemonics (both directions), MGI/RGD cross-references
    (both directions), lengths, the reviewed ID set, features, features
    by feature ID, organisms, Entrez IDs (both directions), and
    MGI/RGD gene-name-to-Uniprot lookups.
    """
    gene_names = {}
    mnemonics = {}
    mnemonics_reverse = {}
    mgi_by_up = {}
    rgd_by_up = {}
    up_by_mgi = {}
    up_by_rgd = {}
    lengths = {}
    features = {}
    reviewed_ids = set()
    organisms = {}
    entrez_by_up = {}
    up_by_entrez = {}
    up_by_mgi_name = {}
    up_by_rgd_name = {}
    entry_files = [resource_manager.get_create_resource_file('up')]
    for entries_file in entry_files:
        with gzip.open(entries_file, 'rt', encoding='utf-8') as fh:
            reader = csv.reader(fh, delimiter='\t')
            # Skip the header row
            next(reader)
            for (up_id, gene_name, mnemonic, rgd, mgi, length, reviewed,
                 organism_id, entrez_id, features_json) in reader:
                if reviewed == 'reviewed':
                    reviewed_ids.add(up_id)
                # Empty gene name strings become explicit Nones
                gene_names[up_id] = gene_name or None
                mnemonics[up_id] = mnemonic
                mnemonics_reverse[mnemonic] = up_id
                lengths[up_id] = int(length)
                # Only the first of the semicolon-separated IDs is indexed
                if mgi:
                    first_mgi = mgi.split(';')[0]
                    mgi_by_up[up_id] = first_mgi
                    up_by_mgi[first_mgi] = up_id
                    up_by_mgi_name[gene_name] = up_id
                if rgd:
                    first_rgd = rgd.split(';')[0]
                    rgd_by_up[up_id] = first_rgd
                    up_by_rgd[first_rgd] = up_id
                    up_by_rgd_name[gene_name] = up_id
                features[up_id] = [feature_from_json(feat)
                                   for feat in json.loads(features_json)]
                organisms[up_id] = organism_id

                # Entrez mappings (semicolon-separated, possibly empty)
                for eid in entrez_id.split(';'):
                    eid = eid.strip()
                    if eid:
                        entrez_by_up[up_id] = eid
                        up_by_entrez[eid] = up_id

    # Flatten all features into a dict keyed by feature ID
    features_by_id = {feat.id: feat
                      for feats in features.values()
                      for feat in feats}

    return (gene_names, mnemonics, mnemonics_reverse, mgi_by_up, rgd_by_up,
            up_by_mgi, up_by_rgd, lengths, reviewed_ids, features,
            features_by_id, organisms, entrez_by_up, up_by_entrez,
            up_by_mgi_name, up_by_rgd_name)
Ejemplo n.º 11
0
def _build_refseq_sequences():
    """Load RefSeq protein sequences from the FASTA resource file."""
    fasta_file = resource_manager.get_create_resource_file('refseq_seq',
                                                           cached=True)
    logger.info("Loading RefSeq protein sequences...")
    # FASTA headers are space-delimited; the ID is the first token
    return load_fasta_sequences(fasta_file, id_delimiter=' ', id_index=0)