Example #1
0
def load_refsnp_merged(fname):
    """
    Load a gzip-compressed, whitespace-delimited file of merged rsids.

    The first column of each line is used as the key (rsid) and the second
    column as the value (merged_rsid); any further columns are ignored. If
    the same rsid occurs on several lines, the last one wins.

    Lines with fewer than two fields (e.g. blank lines) are skipped instead
    of raising IndexError, matching how the dbSNP loaders treat short lines.

    Returns:
        dict mapping rsid -> merged_rsid (both strings, as read).
    """
    refsnp_merged = {}

    debug(f"Loading refsnp_merged file '{fname}'...")

    with gzip.open(fname, "rt", encoding="utf-8") as f:
        for line in f:
            # split() with no argument already ignores leading/trailing
            # whitespace, including the newline.
            fields = line.split()

            if len(fields) < 2:
                continue

            refsnp_merged[fields[0]] = fields[1]

    debug(f"Complete loading refsnp_merged file '{fname}'...")

    return refsnp_merged
Example #2
0
def parse_grch38_dbsnp(fname):
    """
    Parse a gzip-compressed, whitespace-delimited GRCh38 dbSNP text file.

    For every line the first column is taken as the rsid, the second as the
    chromosome and the fourth as the strand; the third column is ignored.
    If an rsid occurs more than once, the last line wins.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale, and lines with fewer than four fields (e.g. blank lines) are
    skipped rather than raising IndexError.

    Returns:
        dict mapping rsid -> (chromosome, strand).
    """
    debug('Began parsing GRCh38')

    db = {}

    with gzip.open(fname, 'rt', encoding='utf-8') as f:
        for line in f:
            fields = line.split()

            if len(fields) < 4:
                continue

            rsid, chromosome, strand = fields[0], fields[1], fields[3]
            db[rsid] = (chromosome, strand)

    debug('Finishing parsing GRCh38')

    return db
Example #3
0
def load_dbsnp_by_coordinate(fname, coordinates, offset=0):
    """
    Read in NCBI dbSNP and return subset of entries keyed by coordinate. E.g.:

        db = {"1:1900500": ["rs123"],
              "3:2900500": ["rs456", "rs789"], ...}

    Args:
        fname: path of a gzip-compressed, whitespace-delimited text file
            whose first three columns are SNP id (without the "rs" prefix),
            chromosome name and position.
        coordinates: container of "chromosome:position" keys to keep; only
            membership tests (``in``) are performed on it.
        offset: integer added to every position before the key is built.

    Returns:
        dict mapping "chromosome:position" to a list of SNP ids. A
        three-column line whose chromosome column is "AltOnly" stores
        ["AltOnly"] for its key instead.

    Raises:
        ValueError: if the position column is not an integer.

    Chromosome names are translated to PLINK numeric codes. Names that have
    no PLINK code (including "AltOnly") are used unchanged when building the
    key; previously they raised KeyError, which made the "AltOnly" handling
    below unreachable.
    """

    db = {}

    plink_map = {str(n): str(n) for n in range(1, 23)}
    plink_map.update({
        "X": "23",
        "Y": "24",
        "PAR": "25",
        "M": "26",
        "MT": "26"
    })

    debug(f"Loading dbSNP file '{fname}'...")

    with gzip.open(fname, "rt", encoding="utf-8") as f:
        for line in f:
            # split() never yields empty strings, so a length check is
            # enough to know the position column is present.
            fields = line.split()
            fields_len = len(fields)

            if fields_len < 3:
                continue

            chromosome_name = fields[1]
            # Fall back to the raw name so unmapped contigs do not abort
            # the whole load.
            chromosome = plink_map.get(chromosome_name, chromosome_name)
            position = str(int(fields[2]) + offset)

            k = chromosome + ":" + position

            if k not in coordinates:
                continue

            if fields_len >= 4:
                db.setdefault(k, []).append("rs" + fields[0])
            elif chromosome_name == "AltOnly":
                db[k] = ["AltOnly"]
            else:
                debug("len(fields) < 4 and not AltOnly: " + str(fields))

    debug(f"Completed loading dbSNP file '{fname}'...")

    return db
Example #4
0
def load_dbsnp_by_snp_id(fname, snp_ids, offset=0):
    """
    Scan NCBI dbSNP and collect the location of every requested SNP id. E.g.:

        db = {"rs123": "1:1900500",
              "rs456": "2:3434343"}

    The file is gzip-compressed, whitespace-delimited text whose first three
    columns are SNP id (without the "rs" prefix), chromosome name and
    position. Chromosome names are translated to PLINK numeric codes and
    ``offset`` is added to the position. A requested SNP whose chromosome
    column is "AltOnly" is stored as the list ["AltOnly"] rather than a
    "chromosome:position" string. Only membership tests are performed on
    ``snp_ids``.
    """

    autosomes = [str(number) for number in range(1, 23)]
    chromosome_codes = dict(zip(autosomes, autosomes))
    chromosome_codes["X"] = "23"
    chromosome_codes["Y"] = "24"
    chromosome_codes["PAR"] = "25"
    chromosome_codes["M"] = "26"
    chromosome_codes["MT"] = "26"

    located = {}

    debug(f"Loading dbSNP file '{fname}'...")

    with gzip.open(fname, "rt", encoding="utf-8") as handle:
        for raw_line in handle:
            columns = raw_line.strip().split()

            # Need at least id, chromosome and a non-empty position.
            if not (len(columns) >= 3 and columns[2] != ""):
                continue

            rsid = "rs" + columns[0]
            if rsid not in snp_ids:
                continue

            contig = columns[1]
            if contig == 'AltOnly':
                located[rsid] = ['AltOnly']
                continue

            located[rsid] = (chromosome_codes[contig] + ':' +
                             str(int(columns[2]) + offset))

    debug(f"Completed loading dbSNP file '{fname}'...")

    return located
Example #5
0
def process_dbsnp(fnames, outfile):
    """
    Flatten per-chromosome dbSNP JSON dumps into one gzip-compressed text file.

    Each entry of ``fnames`` is a bz2-compressed file holding one JSON
    document per line. The chromosome name is taken from the file name (the
    text between "chr" and ".json"). For every record that has at least one
    entry in 'present_obs_movements', the line

        <refsnp_id> <chromosome> <position> <orientation>

    is written to ``outfile + '.gz'``. The output is opened in append mode,
    so an existing file is extended, not overwritten.

    ``orientation`` is '0' for 'plus', '1' for 'minus' and '-' when the first
    gene of the first assembly annotation has any other orientation or when
    no gene/assembly annotation is present.

    Raises:
        ValueError: if a file name does not contain "chr<name>.json".
    """
    with gzip.open(outfile + '.gz', 'at') as out:
        for fname in fnames:

            # Raw pattern with an escaped dot so that only a literal ".json"
            # terminates the chromosome name.
            match = re.search(r'chr(.*)\.json', fname)
            if match is None:
                raise ValueError(
                    f"Cannot determine chromosome from file name '{fname}'")
            chromosome = match.group(1)

            debug(f'Began parsing chr{chromosome} dbsnp file')
            with bz2.open(fname, "rb") as f2:
                for line in f2:
                    d = json.loads(line.decode('utf-8'))
                    snpid = d['refsnp_id']

                    movements = d['present_obs_movements']
                    if not movements:
                        debug(
                            f'rs{snpid} on chr{chromosome} does not have a position available'
                        )
                        continue

                    position = str(
                        movements[0]['allele_in_cur_release']['position'])

                    orientation = '-'
                    assembly_annotation = d['primary_snapshot_data'][
                        'allele_annotations'][0]['assembly_annotation']

                    if len(assembly_annotation) > 0:
                        genes = assembly_annotation[0]['genes']
                        if len(genes) > 0:
                            gene_orientation = genes[0]['orientation']
                            if gene_orientation == 'plus':
                                orientation = '0'
                            elif gene_orientation == 'minus':
                                orientation = '1'
                    else:
                        debug(
                            f'rs{snpid} on chr{chromosome} does not have a orientation information',
                            level=2)

                    print(snpid + " " + chromosome + " " + position + " " +
                          orientation,
                          file=out)

                debug(f'Finished parsing chr{chromosome} dbsnp file')
Example #6
0
def map_using_coord_logic(bim_entries,
                          snps,
                          dbsnp,
                          keep_multi=False,
                          keep_unmapped_rsids=False,
                          skip_rs_ids=False):
    """
    Decide, per bim entry, whether its SNP id should be renamed or dropped
    based on the dbSNP ids found at the entry's "chromosome:position".

    Args:
        bim_entries: iterable of mappings with string values under the keys
            "chromosome", "position" and "snp_id".
        snps: container of SNP ids used to avoid renaming to an id that is
            already present (only consulted when keep_multi is set).
        dbsnp: mapping of "chromosome:position" -> list of SNP ids.
        keep_multi: when several ids share a coordinate, record the
            coordinate and rename to the first id instead of deleting.
        keep_unmapped_rsids: never delete ids that start with "rs".
        skip_rs_ids: ignore entries whose id starts with "rs" entirely.

    Returns:
        (snps_to_delete, snps_to_update, multi_snps) where snps_to_update
        holds (old_id, new_id) pairs and multi_snps holds
        (coordinate, ids) pairs.
    """
    to_delete = []
    to_update = []
    ambiguous = []

    for record in bim_entries:
        coord = record["chromosome"] + ":" + record["position"]
        current_id = record["snp_id"]
        is_rsid = current_id.startswith("rs")

        if is_rsid and skip_rs_ids:
            continue

        # Coordinate unknown to dbSNP: drop unless it is a protected rsid.
        if coord not in dbsnp:
            if keep_unmapped_rsids and is_rsid:
                continue
            debug("NO_MATCH: " + "\t".join(record.values()))
            to_delete.append(current_id)
            continue

        candidates = dbsnp[coord]

        if len(candidates) > 1:
            debug(
                f"Has more than one snp_id dbsnp[{coord}] = {str(candidates)}")
            if keep_multi:
                ambiguous.append((coord, candidates))
                preferred = candidates[0]
                # Skip no-op renames and renames that would duplicate an id
                # already present in the bim.
                if preferred != current_id and preferred not in snps:
                    to_update.append((current_id, preferred))
            elif not (keep_unmapped_rsids and is_rsid):
                to_delete.append(current_id)
            continue

        replacement = candidates[0]
        if replacement != current_id:
            debug(
                f"Rewrote snp_id {current_id} to {replacement} for position {coord}"
            )
            to_update.append((current_id, replacement))

    return to_delete, to_update, ambiguous
Example #7
0
def process_rsmerge(fname, outfile):
    """
    Extract "<refsnp_id> <merged_into>" pairs from a bz2-compressed file with
    one JSON document per line and append them to ``outfile + '.gz'``.

    Only the first entry of 'merged_snapshot_data' -> 'merged_into' is
    written; records where that list is empty are reported via debug and
    produce no output line.
    """
    with gzip.open(outfile + '.gz', 'at') as sink:

        debug('Began parsing Rsmerge file')
        with bz2.open(fname, "rb") as source:
            for raw in source:
                record = json.loads(raw.decode('utf-8'))
                merge_targets = record['merged_snapshot_data']['merged_into']

                if len(merge_targets) == 0:
                    snpid = record['refsnp_id']
                    debug(f'rs{snpid} has no merge info!')
                    continue

                snpid = record['refsnp_id']
                print(snpid + " " + merge_targets[0], file=sink)

        debug('Finished parsing Rsmerge file')
Example #8
0
def map_chromosomes(grch37, grch38_db):
    """
    Combine GRCh37 positions with chromosome/strand values from grch38_db.

    ``grch37`` is a gzip-compressed text file with exactly three
    whitespace-delimited columns per line: rsid, chromosome, position. Only
    lines whose chromosome starts with "NC" are considered. The first two
    characters of the rsid are removed before it is looked up in
    ``grch38_db`` (a mapping rsid -> sequence whose items 0 and 1 are
    chromosome and strand). Ids missing from ``grch38_db`` are skipped.

    An id that is seen again after it has been stored is treated as
    ambiguous and removed from the result.

    Returns:
        (db, multi_entries): db maps rsid -> "chromosome position strand";
        multi_entries is the set of removed ambiguous ids.
    """
    debug('Began mapping GRCh37 chromosomes')

    mapped = {}
    duplicated = set()

    with gzip.open(grch37, 'rt') as handle:
        for raw_line in handle:
            rsid, chromosome, position = raw_line.strip().split()

            if not chromosome.startswith("NC"):
                continue

            rsid = rsid[2:]

            if rsid in mapped:
                duplicated.add(rsid)
                continue

            if rsid not in grch38_db:
                debug(f'{rsid} was not found in GRCh38, therefore no change in chromosome', level=2)
                continue

            grch38_entry = grch38_db[rsid]
            chromosome = grch38_entry[0]
            strand = grch38_entry[1]

            mapped[rsid] = chromosome + " " + position + " " + strand

    # Drop every id that occurred more than once.
    for rsid in duplicated:
        del mapped[rsid]

    debug('Finished mapping GRCh37 chromosomes')

    return mapped, duplicated
Example #9
0
def _queue_coordinate_updates(snp_id, original_coord, updated_coord,
                              coords_to_update, chromosomes_to_update):
    """Append (snp_id, value) to the given lists for each part of the
    "chromosome:position" coordinate that differs between the two inputs."""
    debug(f"original_coord={original_coord} updated_coord={updated_coord}",
          level=2)

    new_chromosome, new_position = updated_coord.split(":")
    original_chromosome, original_position = original_coord.split(":")

    if new_position != original_position:
        coords_to_update.append((snp_id, new_position))

    if new_chromosome != original_chromosome:
        chromosomes_to_update.append((snp_id, new_chromosome))


def map_using_rs_id_logic(snp_map, dbsnp, unmappable_snps):
    """
    Work out which SNPs to delete, rename or move, based on their rs ids.

    Args:
        snp_map: sequence of (snp_id, original_coord, snp_id_new) triples,
            where original_coord is "chromosome:position" and snp_id_new
            equals snp_id when the SNP was not merged. It is iterated twice,
            so it must not be a one-shot iterator.
        dbsnp: mapping of SNP id -> "chromosome:position".
        unmappable_snps: container of SNP ids that are known but have no
            single coordinate; such SNPs are kept (and renamed if merged)
            without any coordinate update.

    Returns:
        (snps_to_delete, snps_to_update, coords_to_update,
        chromosomes_to_update): a list of ids, a list of (old_id, new_id)
        pairs, and two lists of (id, new_value) pairs. Coordinate and
        chromosome updates for a renamed SNP are keyed by the new id.
    """
    snps_to_delete = []
    snps_to_update = []
    coords_to_update = []
    chromosomes_to_update = []

    snps_already_updated = set()

    # Built once; the previous per-SNP list rebuild made the loop quadratic.
    original_snp_ids = {snp[0] for snp in snp_map}

    for snp_id, original_coord, snp_id_new in snp_map:
        # If the snp has been updated (merged)
        if snp_id_new != snp_id:

            # If the merged snp was already in the original
            if snp_id_new in original_snp_ids:
                snps_to_delete.append(snp_id)

            elif snp_id_new in dbsnp:

                # If snp already being updated avoids duplicate snps
                if snp_id_new in snps_already_updated:
                    snps_to_delete.append(snp_id)
                    continue

                snps_to_update.append((snp_id, snp_id_new))
                snps_already_updated.add(snp_id_new)

                _queue_coordinate_updates(snp_id_new, original_coord,
                                          dbsnp[snp_id_new], coords_to_update,
                                          chromosomes_to_update)

            # If snp_id is updated to snp_id_new but unable to update chromosome and position
            elif snp_id_new in unmappable_snps:

                # If snp already being updated avoids duplicate snps
                if snp_id_new in snps_already_updated:
                    snps_to_delete.append(snp_id)
                    continue

                snps_to_update.append((snp_id, snp_id_new))
                snps_already_updated.add(snp_id_new)

                debug(
                    f"{snp_id} was updated to {snp_id_new} but cannot be updated by chr:position due to having multiple positions inside of GRCh37 VCF file"
                )

            else:
                snps_to_delete.append(snp_id)

        # If snp_id was not merged and is the same as snp_id_new (no change)
        else:
            if snp_id in dbsnp:
                _queue_coordinate_updates(snp_id, original_coord,
                                          dbsnp[snp_id], coords_to_update,
                                          chromosomes_to_update)

            elif snp_id in unmappable_snps:
                debug(
                    f"{snp_id} cannot be updated due to having multiple positions inside of GRCh37 VCF file"
                )

            # If snp_id is not in dbsnp it has been deleted
            else:
                snps_to_delete.append(snp_id)

    return snps_to_delete, snps_to_update, coords_to_update, chromosomes_to_update