Example #1
0
    def add_sample(self, sample_id, cnv_file):
        """
        Replace everything stored for one sample with the CNVs in `cnv_file`.

        Existing `samples` and `cnvs` documents matching `sample_id` are
        removed, a new sample record is inserted with status "loading", and
        one `cnvs` document is inserted for every data row of the CSV.

        Args:
            sample_id: sample identifier. It is stored unchanged on each CNV
                document and as str(sample_id) on the sample record.
            cnv_file: iterable of CSV lines whose first row is a header.
                Columns read (0-based): 3 = type, 4 = number of exons,
                5 = start, 6 = stop, 7 = chromosome name (prefixed with "chr"
                here), 10 and 11 = read counts, 12 = read ratio.
        """
        print("Adding CNVs for %s" % sample_id)

        db = self._db
        db.samples.remove({"sample_id": sample_id})
        db.cnvs.remove({"sample_id": sample_id})
        db.cnvs.ensure_index([("sample_id", 1), ("genes", 1)])

        db.samples.insert({"sample_id": str(sample_id), "status": "loading"})

        rows = csv.reader(cnv_file)
        next(rows)  # discard the header row
        for fields in rows:
            contig = "chr" + fields[7]
            first_base = int(fields[5])
            last_base = int(fields[6])
            # Convert chromosome/position pairs to single genome-wide coordinates.
            xstart = genomeloc.get_single_location(contig, first_base)
            xstop = genomeloc.get_single_location(contig, last_base)
            cnv = {
                "sample_id": sample_id,
                "type": fields[3],
                "nexons": int(fields[4]),
                "xstart": xstart,
                "xstop": xstop,
                "genes": self.reference.get_genes_in_region(xstart, xstop),
                "reads": [int(fields[10]), int(fields[11])],
                "read_ratio": float(fields[12]),
            }
            db.cnvs.insert(cnv)
Example #2
0
    def add_sample(self, sample_id, cnv_file):
        """
        Reload the CNV calls of one sample from a CSV file.

        Whatever self.remove_sample() clears for `sample_id` is dropped first;
        then a sample record with status 'loading' is inserted, followed by
        one `cnvs` document per data row of `cnv_file`.

        Args:
            sample_id: sample identifier. Stored unchanged on every CNV
                document and as str(sample_id) on the sample record.
            cnv_file: iterable of CSV lines; the first row is treated as a
                header and skipped. Columns read (0-based): 3 = type,
                4 = number of exons, 5 = start, 6 = stop, 7 = chromosome name
                ('chr' is prepended), 10 and 11 = read counts, 12 = read ratio.
        """
        print("Adding CNVs for %s" % sample_id)
        self.remove_sample(sample_id)
        # silly to have this here
        self._db.cnvs.ensure_index([('sample_id', 1), ('genes', 1)])

        sample_record = {'sample_id': str(sample_id), 'status': 'loading'}
        self._db.samples.insert(sample_record)

        rows = csv.reader(cnv_file)
        next(rows)  # first row is the header
        for fields in rows:
            chrom = 'chr' + fields[7]
            start = int(fields[5])
            stop = int(fields[6])
            xstart = genomeloc.get_single_location(chrom, start)
            xstop = genomeloc.get_single_location(chrom, stop)

            # Keys are added in the same order as before so the stored
            # document layout does not change.
            cnv = {}
            cnv['sample_id'] = sample_id
            cnv['type'] = fields[3]
            cnv['nexons'] = int(fields[4])
            cnv['xstart'] = xstart
            cnv['xstop'] = xstop
            # The gene lookup here takes chromosome-local coordinates.
            cnv['genes'] = self.reference.get_genes_in_region(chrom, start, stop)
            cnv['reads'] = [int(fields[10]), int(fields[11])]
            cnv['read_ratio'] = float(fields[12])
            self._db.cnvs.insert(cnv)
Example #3
0
    def add_sample(self, sample_id, cnv_file):
        """
        Store the CNVs listed in `cnv_file` for `sample_id`, replacing old data.

        Steps: self.remove_sample() clears the previous data, a sample record
        with status 'loading' is inserted, and each data row of the CSV
        becomes one document in the `cnvs` collection.

        Args:
            sample_id: sample identifier. Written as-is on CNV documents and
                as str(sample_id) on the sample record.
            cnv_file: iterable of CSV lines with one header row. Columns read
                (0-based): 3 = type, 4 = number of exons, 5 = start, 6 = stop,
                7 = chromosome name ('chr' is prepended), 10 and 11 = read
                counts, 12 = read ratio.
        """
        print("Adding CNVs for %s" % sample_id)
        self.remove_sample(sample_id)
        cnv_collection = self._db.cnvs
        # silly to have this here
        cnv_collection.ensure_index([('sample_id', 1), ('genes', 1)])

        self._db.samples.insert({'sample_id': str(sample_id), 'status': 'loading'})

        csv_rows = csv.reader(cnv_file)
        next(csv_rows)  # skip the header row
        for record in csv_rows:
            chromosome = 'chr' + record[7]
            region_start = int(record[5])
            region_stop = int(record[6])
            # Genome-wide single-number coordinates of the two CNV ends.
            xstart = genomeloc.get_single_location(chromosome, region_start)
            xstop = genomeloc.get_single_location(chromosome, region_stop)
            cnv = {
                'sample_id': sample_id,
                'type': record[3],
                'nexons': int(record[4]),
                'xstart': xstart,
                'xstop': xstop,
                'genes': self.reference.get_genes_in_region(xstart, xstop),
                'reads': [int(record[10]), int(record[11])],
                'read_ratio': float(record[12]),
            }
            cnv_collection.insert(cnv)
Example #4
0
def create_genome_subset_from_interval_list(interval_list_file):
    """
    Creates a genome subset from an interval list file.

    Each data line is tab-separated with columns chr, start, stop, strand,
    name. Only the first three are read; strand, name and any extra columns
    are ignored. 'chr' is prepended to the chromosome column.
    Coordinates are 1-indexed and inclusive.

    Blank lines and lines starting with '@' (SAM-style header lines, as found
    at the top of Picard interval lists) are skipped instead of raising.

    Args:
        interval_list_file: iterable of lines, e.g. an open text file.

    Returns:
        A GenomeSubsetFilter built from a list of (xstart, xstop) pairs, one
        per data line, in file order.
    """
    intervals = []
    for line in interval_list_file:
        line = line.strip('\n')
        # Neither a blank line nor a header line carries an interval; without
        # this guard they fail with IndexError / ValueError below.
        if not line.strip() or line.startswith('@'):
            continue
        fields = line.split('\t')
        chrom = 'chr' + fields[0]
        start = int(fields[1])
        end = int(fields[2])
        intervals.append((
            genomeloc.get_single_location(chrom, start),
            genomeloc.get_single_location(chrom, end),
        ))
    return GenomeSubsetFilter(intervals)
Example #5
0
def create_genome_subset_from_interval_list(interval_list_file):
    """
    Creates a genome subset from interval list file
    This is a file with cols chr, start, stop, strand, name
    Strand and name are ignored, and actually it could have extra cols too and won't complain
    Coordinates are 1-indexed and inclusive

    Empty lines and '@'-prefixed header lines (the SAM-style header that
    Picard interval lists start with) are ignored rather than causing an
    IndexError / ValueError.

    :param interval_list_file: iterable of text lines (e.g. an open file)
    :return: GenomeSubsetFilter over the (xstart, xstop) pairs, in file order
    """
    intervals = []
    for raw_line in interval_list_file:
        record = raw_line.strip('\n')
        is_blank = not record.strip()
        is_header = record.startswith('@')
        if is_blank or is_header:
            continue

        # Only the first three columns matter; the rest are never looked at.
        contig, first, last = record.split('\t')[:3]
        chrom = 'chr' + contig
        xstart = genomeloc.get_single_location(chrom, int(first))
        xend = genomeloc.get_single_location(chrom, int(last))
        intervals.append((xstart, xend))
    return GenomeSubsetFilter(intervals)
Example #6
0
def load_dbnsfp():
    """
    Condense the per-chromosome dbNSFP 2.1 files into one tab-separated file.

    For every chromosome in CHROMOSOMES the file
    settings.DBNSFP_DIR + 'dbNSFP2.1_variant.' + <chromosome> is read (its
    first line is treated as a header and skipped). One line is written to
    settings.INTERMEDIATE_FILE_DIR + 'dbnsfp.tsv' per variant:

        xpos, ref, alt, polyphen, sift, fathmm, muttaster

    Prediction codes that are not in the lookup tables below are written as
    '.'. Rows whose position cannot be converted to an xpos are skipped.
    """
    polyphen_map = {
        'D': 'probably_damaging',
        'P': 'possibly_damaging',
        'B': 'benign',
    }

    sift_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }

    fathmm_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }

    muttaster_map = {
        'A': 'disease_causing',
        'D': 'disease_causing',
        'N': 'polymorphism',
        'P': 'polymorphism',
    }

    # `with` guarantees the output is flushed and closed, and that each input
    # file is released, even if a malformed row raises part-way through.
    with open(settings.INTERMEDIATE_FILE_DIR + 'dbnsfp.tsv', 'w') as nsfp_file:
        for chrom in CHROMOSOMES:
            print("Reading dbNSFP data for {}".format(chrom))
            chrom_path = settings.DBNSFP_DIR + 'dbNSFP2.1_variant.' + chrom
            with open(chrom_path) as single_chrom_file:
                next(single_chrom_file, None)  # skip the header line
                for line in single_chrom_file:
                    fields = line.strip('\n').split('\t')
                    # Separate name so the outer loop variable is not clobbered.
                    row_chrom, pos, ref, alt = fields[:4]
                    xpos = genomeloc.get_single_location('chr' + row_chrom, int(pos))
                    if not xpos:
                        continue
                    # Fixed column positions of the prediction codes in this
                    # file layout.
                    out_fields = [
                        str(xpos),
                        ref,
                        alt,
                        polyphen_map.get(fields[25], '.'),
                        sift_map.get(fields[23], '.'),
                        fathmm_map.get(fields[39], '.'),
                        muttaster_map.get(fields[33], '.'),
                    ]
                    nsfp_file.write('\t'.join(out_fields) + '\n')
Example #7
0
    def load_population(self, population):
        """
        Load the allele frequencies of one reference population into the annotator.

        `population` is a dict. population['file_type'] selects the input
        format; every (xpos, ref, alt, freq) that is read is handed to
        self._add_population_frequency() together with population['slug'].

        Supported file types:

        - 'vcf': VCF with genotypes at 'file_path'; the frequency is
          get_aaf(variant).
        - 'sites_vcf': VCF at 'file_path'; the frequency is read from the INFO
          key named by 'vcf_info_key' (default 'AF'). If that key contains
          "popmax" and population['name'] contains "1000 Genomes", the maximum
          of EAS_AF, EUR_AF, AFR_AF, AMR_AF and SAS_AF is used instead. A
          missing INFO key counts as frequency 0.
        - 'esp_vcf_dir': every file in 'dir_path' is parsed with
          get_variants_from_esp_file(); the value stored is
          variant[population['counts_key']].
        - 'counts_file': tab-separated chrom, pos, ref, alt, then two numeric
          columns; freq = column 4 / column 5 (0-based). Rows whose column 5
          is 0 are skipped. 'chr' is prepended to the chromosome.
        - 'xbrowse_freq_file': tab-separated xpos, ref, alt, freq.
        - 'tsv_file': one header line, then tab-separated chrom, pos, ref,
          alt, freq (chromosome used as-is).
        - 'sites_vcf_with_counts': VCF at 'file_path'; freq = AC / AN using
          the INFO keys named by 'ac_info_key' and 'an_info_key' (0.0 when AN
          is 0). Variants whose AC or AN cannot be parsed are reported and
          skipped.

        Any other file type is silently ignored. Paths ending in '.gz' or
        '.bgz' are read through gzip. Input files are always closed, also when
        loading fails part-way.
        """
        file_type = population['file_type']

        if file_type == 'vcf':
            vcf_file, progress_file, size = self._open_population_file(
                population['file_path'])
            try:
                progress = get_progressbar(
                    size, 'Loading vcf: {}'.format(population['slug']))
                for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                     genotypes=True,
                                                     genotype_meta=False):
                    progress.update(progress_file.tell())
                    freq = get_aaf(variant)
                    self._add_population_frequency(
                        variant.xpos, variant.ref, variant.alt,
                        population['slug'], freq)
            finally:
                vcf_file.close()

        elif file_type == 'sites_vcf':
            meta_key = population.get('vcf_info_key', 'AF')
            is_1kg_popmax = "popmax" in meta_key.lower() and (
                "1000 Genomes" in population["name"])
            if is_1kg_popmax:
                # The popmax is derived here from the per-super-population
                # frequency fields:
                ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key]

            vcf_file, progress_file, size = self._open_population_file(
                population['file_path'])
            try:
                progress = get_progressbar(
                    size, 'Loading sites vcf: {}'.format(population['slug']))
                for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                     meta_fields=meta_fields):
                    progress.update(progress_file.tell())
                    allele_idx = variant.extras['alt_allele_pos']
                    # With a single field this is just that field's value; for
                    # 1000 Genomes popmax it is the maximum over all fields.
                    # `meta_key` is deliberately not reused as the loop
                    # variable: rebinding it made every variant after the
                    # first fall back to the last field only.
                    freq = max(
                        self._info_frequency(variant.extras, field, allele_idx)
                        for field in meta_fields)
                    self._add_population_frequency(
                        variant.xpos, variant.ref, variant.alt,
                        population['slug'], freq)
            finally:
                vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif file_type == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(
                    os.path.join(population['dir_path'], filename))
                with open(file_path) as f:
                    file_size = os.path.getsize(file_path)
                    progress = get_progressbar(
                        file_size, 'Loading ESP file: {}'.format(filename))
                    for variant in get_variants_from_esp_file(f):
                        progress.update(f.tell())
                        self._add_population_frequency(
                            variant['xpos'], variant['ref'], variant['alt'],
                            population['slug'],
                            variant[population['counts_key']])

        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif file_type == 'counts_file':
            counts_file, progress_file, size = self._open_population_file(
                population['file_path'])
            try:
                progress = get_progressbar(
                    size, 'Loading population: {}'.format(population['slug']))
                for line in counts_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    chrom = 'chr' + fields[0]
                    pos = int(fields[1])
                    xpos = genomeloc.get_single_location(chrom, pos)
                    ref = fields[2]
                    alt = fields[3]
                    if int(fields[5]) == 0:
                        continue  # avoid dividing by a zero denominator
                    freq = float(fields[4]) / float(fields[5])
                    self._add_population_frequency(
                        xpos, ref, alt, population['slug'], freq)
            finally:
                counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif file_type == 'xbrowse_freq_file':
            counts_file, progress_file, size = self._open_population_file(
                population['file_path'])
            try:
                progress = get_progressbar(
                    size, 'Loading population: {}'.format(population['slug']))
                for line in counts_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    xpos = int(fields[0])
                    ref = fields[1]
                    alt = fields[2]
                    freq = float(fields[3])
                    self._add_population_frequency(
                        xpos, ref, alt, population['slug'], freq)
            finally:
                counts_file.close()

        elif file_type == 'tsv_file':
            freq_file, progress_file, size = self._open_population_file(
                population['file_path'])
            try:
                progress = get_progressbar(
                    size, 'Loading population: {}'.format(population['slug']))
                header = next(freq_file)
                print("Header: " + header)
                for line in freq_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    chrom = fields[0]
                    pos = int(fields[1])
                    ref = fields[2]
                    alt = fields[3]
                    freq = float(fields[4])

                    xpos = genomeloc.get_single_location(chrom, pos)
                    self._add_population_frequency(
                        xpos, ref, alt, population['slug'], freq)
            finally:
                freq_file.close()

        elif file_type == 'sites_vcf_with_counts':
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            vcf_file, progress_file, size = self._open_population_file(
                population['file_path'])
            try:
                progress = get_progressbar(
                    size, 'Loading sites vcf: {}'.format(population['slug']))
                for variant in vcf_stuff.iterate_vcf(
                        vcf_file, meta_fields=[ac_info_key, an_info_key]):
                    progress.update(progress_file.tell())

                    alt_allele_pos = variant.extras['alt_allele_pos']
                    # Best effort: a variant whose counts cannot be parsed is
                    # reported and skipped rather than aborting the load.
                    try:
                        ac = int(
                            variant.extras.get(ac_info_key).split(',')
                            [alt_allele_pos].replace("NA", "0"))
                    except Exception as e:
                        print("Couldn't parse AC value %s from %s: %s %s" %
                              (alt_allele_pos, ac_info_key, variant.extras, e))
                        continue

                    try:
                        if "popmax" in ac_info_key.lower():
                            AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                        else:
                            AN_index = 0

                        an = int(
                            variant.extras.get(an_info_key).split(',')
                            [AN_index].replace("NA", "0"))
                    except Exception as e:
                        print("Couldn't parse AN value %s from %s: %s %s" %
                              (alt_allele_pos, an_info_key, variant.extras, e))
                        continue

                    if an == 0:
                        freq = 0.0
                    else:
                        freq = float(ac) / an
                    self._add_population_frequency(
                        variant.xpos, variant.ref, variant.alt,
                        population['slug'], freq)
            finally:
                vcf_file.close()

    def _open_population_file(self, file_path):
        """
        Open a population data file, transparently handling gzip.

        Returns (file_obj, progress_file, size): `file_obj` yields the
        decompressed lines, `progress_file` is the object whose tell() is
        compared against `size` (the on-disk size in bytes) for progress
        reporting. For gzip input that is the underlying compressed file.
        """
        if file_path.endswith(('.gz', '.bgz')):
            file_obj = gzip.open(file_path)
            progress_file = file_obj.fileobj
        else:
            file_obj = open(file_path)
            progress_file = file_obj
        size = os.path.getsize(file_path)
        return file_obj, progress_file, size

    def _info_frequency(self, extras, info_key, allele_idx):
        """
        Return the frequency stored under `info_key` for one alternate allele.

        `extras[info_key]` is a comma-separated string with one value per
        alternate allele; `allele_idx` selects the value. A missing key is
        treated as frequency 0.0.
        """
        raw_value = extras.get(info_key)
        if raw_value is None:
            return 0.0
        return float(raw_value.split(',')[allele_idx])
    def load(self):
        """
        Rebuild the `variants` collection from the per-chromosome dbNSFP 2.9 files.

        The collection is dropped and re-indexed on (xpos, ref, alt). For each
        chromosome in CHROMOSOMES except "chrM" the file
        self._settings.dbnsfp_dir + 'dbNSFP2.9_variant.' + <chromosome> is
        read. Its first line is the header, used to locate columns by name.
        Every following line is upserted as one variant with these fields:

        - rsid: value of rs_dbSNP141, or None when it is '.'
        - polyphen, sift, fathmm, muttaster: the most severe of the
          ';'-separated prediction codes, translated to a label (None for '.')
        - metasvm: the MetaSVM_pred value, which must be the same for all
          ';'-separated entries

        Raises:
            ValueError: if a chromosome/position cannot be converted to an
                xpos, if MetaSVM_pred holds conflicting values, or if a
                prediction code is not listed in `pred_rank`.
            KeyError: if a selected prediction code has no label in its map.
        """
        self._db.drop_collection('variants')
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # load dbnsfp info
        polyphen_map = {
            'D': 'probably_damaging',
            'P': 'possibly_damaging',
            'B': 'benign',
            '.': None
        }

        sift_map = {'D': 'damaging', 'T': 'tolerated', '.': None}

        fathmm_map = {'D': 'damaging', 'T': 'tolerated', '.': None}

        muttaster_map = {
            'A': 'disease_causing',
            'D': 'disease_causing',
            'N': 'polymorphism',
            'P': 'polymorphism',
            '.': None
        }

        #interesting_fields = "rs_dbSNP141   Ancestral_allele  SIFT_score    SIFT_converted_rankscore  SIFT_pred Polyphen2_HDIV_pred   Polyphen2_HVAR_pred   MutationTaster_pred   MutationAssessor_pred FATHMM_pred   MetaSVM_pred    CADD_phred"
        #     LRT_pred        MetaLR_pred     VEST3_rankscore PROVEAN_converted_rankscore     PROVEAN_pred    CADD_raw        CADD_raw_rankscore      GERP++_NR       GERP++_RS       GERP++_RS_rankscore    ESP6500_AA_AF   ESP6500_EA_AF   ARIC5606_AA_AC  ARIC5606_AA_AF  ARIC5606_EA_AC  ARIC5606_EA_AF  ExAC_AC ExAC_AF ExAC_Adj_AC     ExAC_Adj_AF     ExAC_AFR_AC     ExAC_AFR_AF     ExAC_AMR_AC     ExAC_AMR_AF     ExAC_EAS_AC     ExAC_EAS_AF     ExAC_FIN_AC     ExAC_FIN_AF     ExAC_NFE_AC     ExAC_NFE_AF     ExAC_SAS_AC     ExAC_SAS_AF     clinvar_rs      clinvar_clnsig  clinvar_trait"
        #interesting_fields = interesting_fields.split()

        def collapse(scores):
            """Return the single distinct value of a ';'-separated list."""
            s = set(scores.split(";"))
            if len(s) > 1:
                raise ValueError("Couldn't collapse %s" % str(scores))
            return list(s)[0]

        # Prediction codes ordered from most to least severe.
        pred_rank = ['D', 'A', 'T', 'N', 'P', 'B', '.']

        def select_worst(pred_value):
            """Return the most severe code of a ';'-separated list."""
            return min(pred_value.split(";"), key=pred_rank.index)

        for chrom in CHROMOSOMES:
            if chrom == "chrM":
                continue  # no dbNSFP data for chrM

            print("Reading dbNSFP data for {}".format(chrom))
            chrom_path = self._settings.dbnsfp_dir + 'dbNSFP2.9_variant.' + chrom
            with open(chrom_path) as single_chrom_file:
                # Split the header on tabs, exactly like the data rows, so
                # column indexes line up with the row fields.
                header_fields = single_chrom_file.readline().strip("\n").split("\t")
                field_index = {}
                for idx, name in enumerate(header_fields):
                    # First occurrence wins, as with list.index().
                    field_index.setdefault(name, idx)

                # The header was consumed by readline() above, so every line
                # from here on is data; nothing must be skipped.
                for i, line in enumerate(single_chrom_file):
                    if i and not i % 100000:
                        print(i)
                    fields = line.strip('\n').split('\t')
                    # Separate name so the outer loop variable is not clobbered.
                    row_chrom, pos, ref, alt = fields[:4]
                    row_chrom = 'chr' + row_chrom
                    pos = int(pos)
                    xpos = genomeloc.get_single_location(row_chrom, pos)
                    if not xpos:
                        raise ValueError(
                            "Unexpected chr, pos: %s, %s" % (row_chrom, pos))

                    rsid = fields[field_index["rs_dbSNP141"]]
                    annotations_dict = {
                        'rsid':
                        rsid if rsid != '.' else None,
                        'polyphen':
                        polyphen_map[select_worst(
                            fields[field_index["Polyphen2_HVAR_pred"]])],
                        'sift':
                        sift_map[select_worst(fields[field_index["SIFT_pred"]])],
                        'fathmm':
                        fathmm_map[select_worst(
                            fields[field_index["FATHMM_pred"]])],
                        'muttaster':
                        muttaster_map[select_worst(
                            fields[field_index["MutationTaster_pred"]])],
                        'metasvm':
                        collapse(fields[field_index["MetaSVM_pred"]]),
                        #'cadd_phred': collapse(fields[field_index["CADD_phred"]]),
                    }

                    #extras_to_add_now = ["clinvar_rs", "clinvar_clnsig", "clinvar_trait"]
                    #for name in extras_to_add_now:
                    #    annotations_dict[name] = fields[field_index[name]]

                    self._db.variants.update(
                        {'xpos': xpos, 'ref': ref, 'alt': alt},
                        {'$set': annotations_dict},
                        upsert=True)
    def load(self):
        """
        Rebuild the `variants` collection from dbSNP and dbNSFP 2.1.

        The collection is dropped and re-indexed on (xpos, ref, alt). Then:

        1. Every variant of the VCF at self._settings.dbsnp_vcf_file is
           upserted with its VCF id as `rsid`.
        2. For each chromosome in CHROMOSOMES the file
           self._settings.dbnsfp_dir + 'dbNSFP2.1_variant.' + <chromosome> is
           read (first line skipped as header) and the polyphen / sift /
           fathmm / muttaster labels are upserted per variant. Codes that are
           not in the lookup tables are stored as None; rows whose position
           cannot be converted to an xpos are skipped.

        Input files are closed as soon as they have been read, also on error.
        """
        self._db.drop_collection('variants')
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # load dbsnp info
        with open(self._settings.dbsnp_vcf_file) as dbsnp_file:
            for i, variant in enumerate(vcf_stuff.iterate_vcf(dbsnp_file)):
                if not i % 100000:
                    print(i)
                self._db.variants.update(
                    {'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt},
                    {'$set': {'rsid': variant.vcf_id}},
                    upsert=True
                )

        # load dbnsfp info
        polyphen_map = {
            'D': 'probably_damaging',
            'P': 'possibly_damaging',
            'B': 'benign',
        }

        sift_map = {
            'D': 'damaging',
            'T': 'tolerated',
        }

        fathmm_map = {
            'D': 'damaging',
            'T': 'tolerated',
        }

        muttaster_map = {
            'A': 'disease_causing',
            'D': 'disease_causing',
            'N': 'polymorphism',
            'P': 'polymorphism',
        }

        for chrom in CHROMOSOMES:
            print("Reading dbNSFP data for {}".format(chrom))
            chrom_path = self._settings.dbnsfp_dir + 'dbNSFP2.1_variant.' + chrom
            with open(chrom_path) as single_chrom_file:
                for i, line in enumerate(single_chrom_file):
                    if i == 0:
                        continue  # header line
                    if not i % 100000:
                        print(i)
                    fields = line.strip('\n').split('\t')
                    # Separate name so the outer loop variable is not clobbered.
                    row_chrom, pos, ref, alt = fields[:4]
                    row_chrom = 'chr' + row_chrom
                    pos = int(pos)
                    xpos = genomeloc.get_single_location(row_chrom, pos)
                    if not xpos:
                        continue
                    # Fixed column positions of the prediction codes in this
                    # file layout.
                    polyphen = polyphen_map.get(fields[25])
                    sift = sift_map.get(fields[23])
                    fathmm = fathmm_map.get(fields[39])
                    muttaster = muttaster_map.get(fields[33])

                    self._db.variants.update(
                        {'xpos': xpos, 'ref': ref, 'alt': alt},
                        {'$set': {
                            'polyphen': polyphen,
                            'sift': sift,
                            'fathmm': fathmm,
                            'muttaster': muttaster,
                        }},
                        upsert=True
                    )
    def load_population(self, population):
        """
        Take a population and a data source; extract and load it into annotator
        Data source can be VCF file, VCF Counts file, or a counts dir (in the case of ESP data)
        """
        if population['file_type'] == 'vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
            vcf_file.close()

        elif population['file_type'] == 'sites_vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            meta_key = population.get('vcf_info_key', 'AF')

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
            if is_1kg_popmax:
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key,]

            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(progress_file.tell())
                if "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"]):
                    allele_idx = variant.extras['alt_allele_pos']
                    freq = 0
                    for meta_key in meta_fields:
                        freq = max(freq, float(variant.extras.get(meta_key, 0).split(',')[allele_idx]))

                    ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                else:
                    freq = float(variant.extras.get(meta_key, 0).split(',')[variant.extras['alt_allele_pos']])

                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )
            vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                f = open(file_path)
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )
                f.close()
        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file

            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                progress_file = counts_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))

            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            counts_file.close()

        elif population['file_type'] == 'tsv_file':
            if population['file_path'].endswith('.gz'):
                freq_file = gzip.open(population['file_path'])
                progress_file = freq_file.fileobj
            else:
                freq_file = open(population['file_path'])
                progress_file = freq_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])

                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            freq_file.close()

        elif population['file_type'] == 'sites_vcf_with_counts':
            if population['file_path'].endswith('.gz') or population['file_path'].endswith('.bgz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']
                try:
                    ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                except Exception, e:
                    print("Couldn't parse AC value %s from %s: %s" % (alt_allele_pos, ac_info_key, variant.extras), e)
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0

                    an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                except Exception, e:
                    print("Couldn't parse AN value %s from %s: %s" % (alt_allele_pos, an_info_key, variant.extras), e)
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac)/an
                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )
    def load_population_to_annotator(self, population):
        """
        Load the allele frequencies of one reference population into the annotator.

        `population` is a dict describing the data source; population['file_type']
        selects how it is parsed:

        - 'vcf': VCF read from population['file_path'] with genotypes; the
          frequency stored is get_aaf(variant).
        - 'sites_vcf': VCF read from population['file_path']; the frequency is
          the value of the meta field named by population['vcf_info_key']
          (0 when that field is absent), converted to float.
        - 'esp_vcf_dir': every file in population['dir_path'] is parsed with
          get_variants_from_esp_file; the value stored is
          variant[population['counts_key']].
        - 'counts_file': tab separated chrom / pos / ref / alt followed by two
          numeric columns; the frequency is column 5 divided by column 6, and
          rows whose column 6 is 0 are skipped. 'chr' is prepended to chrom.
        - 'xbrowse_freq_file': tab separated xpos / ref / alt / freq.

        For every type except 'esp_vcf_dir', a path ending in '.gz' is read
        through gzip. Each record is passed to
        self._add_population_frequency(xpos, ref, alt, population['slug'], freq).
        Any other file_type is silently ignored.

        Every input file is closed when its records have been consumed, or as
        soon as parsing raises.
        """
        def open_population_file(path):
            # Compressed inputs are recognised purely by their extension.
            if path.endswith('.gz'):
                return gzip.open(path)
            return open(path)

        def report_progress(count):
            # Coarse progress indicator: one line every 10000 records.
            if count % 10000 == 0:
                print(count)

        file_type = population['file_type']

        if file_type == 'vcf':
            with open_population_file(population['file_path']) as vcf_file:
                variants = vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False)
                for i, variant in enumerate(variants):
                    report_progress(i)
                    freq = get_aaf(variant)
                    self._add_population_frequency(
                        variant.xpos,
                        variant.ref,
                        variant.alt,
                        population['slug'],
                        freq
                    )

        elif file_type == 'sites_vcf':
            with open_population_file(population['file_path']) as vcf_file:
                meta_key = population['vcf_info_key']
                variants = vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key])
                for i, variant in enumerate(variants):
                    report_progress(i)
                    freq = float(variant.extras.get(meta_key, 0))
                    self._add_population_frequency(
                        variant.xpos,
                        variant.ref,
                        variant.alt,
                        population['slug'],
                        freq
                    )

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif file_type == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                print("Adding %s" % filename)
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                # One handle per file in the directory: close each before moving on.
                with open(file_path) as esp_file:
                    for i, variant in enumerate(get_variants_from_esp_file(esp_file)):
                        report_progress(i)
                        self._add_population_frequency(
                            variant['xpos'],
                            variant['ref'],
                            variant['alt'],
                            population['slug'],
                            variant[population['counts_key']]
                        )

        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif file_type == 'counts_file':
            with open_population_file(population['file_path']) as counts_file:
                for i, line in enumerate(counts_file):
                    report_progress(i)
                    fields = line.strip('\n').split('\t')
                    chrom = 'chr' + fields[0]
                    pos = int(fields[1])
                    xpos = genomeloc.get_single_location(chrom, pos)
                    ref = fields[2]
                    alt = fields[3]
                    # A zero denominator carries no frequency information.
                    if int(fields[5]) == 0:
                        continue
                    freq = float(fields[4]) / float(fields[5])
                    self._add_population_frequency(
                        xpos,
                        ref,
                        alt,
                        population['slug'],
                        freq
                    )

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif file_type == 'xbrowse_freq_file':
            with open_population_file(population['file_path']) as freq_file:
                for i, line in enumerate(freq_file):
                    report_progress(i)
                    fields = line.strip('\n').split('\t')
                    self._add_population_frequency(
                        int(fields[0]),
                        fields[1],
                        fields[2],
                        population['slug'],
                        float(fields[3])
                    )
    def load(self):
        """
        Rebuild the `variants` collection from the per-chromosome dbNSFP 2.9 files.

        The collection is dropped, re-indexed on (xpos, ref, alt), and then, for
        every chromosome in CHROMOSOMES except chrM, the file
        `<self._settings.dbnsfp_dir>dbNSFP2.9_variant.<chrom>` is read. Each data
        row is upserted as one document keyed on (xpos, ref, alt) carrying the
        fields rsid, polyphen, sift, fathmm, muttaster and metasvm.

        Raises:
            ValueError: if genomeloc cannot map a row's chromosome/position, if
                MetaSVM_pred holds more than one distinct value, or if a
                prediction code is not listed in pred_rank.
            KeyError: if the worst prediction code of a column has no entry in
                that column's translation map.
        """
        self._db.drop_collection('variants')
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # Translation of dbNSFP single-letter prediction codes to stored labels.
        polyphen_map = {
            'D': 'probably_damaging',
            'P': 'possibly_damaging',
            'B': 'benign',
            '.': None
        }

        sift_map = {
            'D': 'damaging',
            'T': 'tolerated',
            '.': None
        }

        fathmm_map = {
            'D': 'damaging',
            'T': 'tolerated',
            '.': None
        }

        muttaster_map = {
            'A': 'disease_causing',
            'D': 'disease_causing',
            'N': 'polymorphism',
            'P': 'polymorphism',
            '.': None
        }

        # Columns such as CADD_phred, clinvar_rs, clinvar_clnsig and
        # clinvar_trait are not loaded by this method.

        def collapse(scores):
            # A ';'-separated value is only accepted when all entries agree.
            distinct = set(scores.split(";"))
            if len(distinct) > 1:
                raise ValueError("Couldn't collapse %s" % str(scores))
            return list(distinct)[0]

        # Prediction codes ordered from most to least severe.
        pred_rank = ['D', 'A', 'T', 'N', 'P', 'B', '.']

        def select_worst(pred_value):
            # Pick the entry that appears earliest in pred_rank.
            worst = len(pred_rank) - 1
            for pred in pred_value.split(";"):
                rank = pred_rank.index(pred)
                if rank < worst:
                    worst = rank
            return pred_rank[worst]

        for chrom in CHROMOSOMES:
            if chrom == "chrM":
                continue  # no dbNSFP data for chrM

            print("Reading dbNSFP data for {}".format(chrom))
            dbnsfp_path = self._settings.dbnsfp_dir + 'dbNSFP2.9_variant.' + chrom
            # `with` guarantees the file is closed even when a row fails to parse.
            with open(dbnsfp_path) as single_chrom_file:
                header = single_chrom_file.readline()
                header_fields = header.strip("\n").split()
                field_index = {name: header_fields.index(name) for name in header_fields}

                # readline() above already consumed the header, so every
                # remaining line is a data row and none may be skipped.
                for i, line in enumerate(single_chrom_file):
                    if i and not i % 100000:
                        print(i)
                    fields = line.strip('\n').split('\t')
                    # Separate names keep the outer loop's `chrom` intact.
                    row_chrom, pos, ref, alt = fields[:4]
                    row_chrom = 'chr' + row_chrom
                    pos = int(pos)
                    xpos = genomeloc.get_single_location(row_chrom, pos)
                    if not xpos:
                        # Positional placeholders: the arguments are a tuple,
                        # so %(name)s would itself raise a TypeError.
                        raise ValueError("Unexpected chr, pos: %s, %s" % (row_chrom, pos))

                    rsid = fields[field_index["rs_dbSNP141"]]
                    annotations_dict = {
                        'rsid': rsid if rsid != '.' else None,
                        'polyphen': polyphen_map[select_worst(fields[field_index["Polyphen2_HVAR_pred"]])],
                        'sift': sift_map[select_worst(fields[field_index["SIFT_pred"]])],
                        'fathmm': fathmm_map[select_worst(fields[field_index["FATHMM_pred"]])],
                        'muttaster': muttaster_map[select_worst(fields[field_index["MutationTaster_pred"]])],
                        'metasvm': collapse(fields[field_index["MetaSVM_pred"]]),
                    }

                    self._db.variants.update(
                        {'xpos': xpos, 'ref': ref, 'alt': alt},
                        {'$set': annotations_dict},
                        upsert=True
                    )
Example #13
0
    def load_dbnsfp(self):
        """
        Rebuild the `variants` collection from the per-chromosome dbNSFP 2.9 files.

        The collection is dropped and re-indexed on (xpos, ref, alt). For every
        chromosome in CHROMOSOMES except chrM, the file
        `<self._settings.dbnsfp_dir[self._genome_version]>dbNSFP2.9_variant.<chrom>`
        is read and each data row is upserted as a document keyed on
        (xpos, ref, alt) with the fields rsid, polyphen, sift, fathmm,
        muttaster and metasvm.

        Raises:
            ValueError: if genomeloc cannot map a row's chromosome/position, if
                MetaSVM_pred holds more than one distinct value, or if a
                prediction code is not listed in pred_rank.
            KeyError: if the worst prediction code of a column has no entry in
                that column's translation map.
        """
        self._db.drop_collection('variants')
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # Translation of dbNSFP single-letter prediction codes to stored labels.
        polyphen_map = {
            'D': 'probably_damaging',
            'P': 'possibly_damaging',
            'B': 'benign',
            '.': None
        }

        sift_map = {'D': 'damaging', 'T': 'tolerated', '.': None}

        fathmm_map = {'D': 'damaging', 'T': 'tolerated', '.': None}

        muttaster_map = {
            'A': 'disease_causing',
            'D': 'disease_causing',
            'N': 'polymorphism',
            'P': 'polymorphism',
            '.': None
        }

        def collapse(scores):
            # A ';'-separated value is only accepted when all entries agree.
            unique_scores = set(scores.split(";"))
            if len(unique_scores) > 1:
                raise ValueError("Couldn't collapse %s" % str(scores))
            return list(unique_scores)[0]

        # Prediction codes ordered from most to least severe.
        pred_rank = ['D', 'A', 'T', 'N', 'P', 'B', '.']

        def select_worst(pred_value):
            # The entry with the smallest pred_rank position wins; an unknown
            # code makes pred_rank.index raise ValueError.
            return min(pred_value.split(";"), key=pred_rank.index)

        for chrom in CHROMOSOMES:
            if chrom == "chrM":
                continue  # no dbNSFP data for chrM

            print("Reading dbNSFP data for {}".format(chrom))
            dbnsfp_path = (
                self._settings.dbnsfp_dir[self._genome_version] +
                'dbNSFP2.9_variant.' + chrom)
            # `with` guarantees the file is closed even when a row fails to parse.
            with open(dbnsfp_path) as single_chrom_file:
                header = single_chrom_file.readline()
                header_fields = header.strip("\n").split()
                field_index = {
                    name: header_fields.index(name)
                    for name in header_fields
                }

                # readline() above already consumed the header, so every
                # remaining line is a data row and none may be skipped.
                for line in tqdm(single_chrom_file):
                    fields = line.strip('\n').split('\t')
                    # Separate names keep the outer loop's `chrom` intact.
                    row_chrom, pos, ref, alt = fields[:4]
                    row_chrom = 'chr' + row_chrom
                    pos = int(pos)
                    xpos = genomeloc.get_single_location(row_chrom, pos)
                    if not xpos:
                        # Positional placeholders: the arguments are a tuple,
                        # so %(name)s would itself raise a TypeError.
                        raise ValueError(
                            "Unexpected chr, pos: %s, %s" % (row_chrom, pos))

                    rsid = fields[field_index["rs_dbSNP141"]]
                    annotations_dict = {
                        'rsid':
                        rsid if rsid != '.' else None,
                        'polyphen':
                        polyphen_map[select_worst(
                            fields[field_index["Polyphen2_HVAR_pred"]])],
                        'sift':
                        sift_map[select_worst(fields[field_index["SIFT_pred"]])],
                        'fathmm':
                        fathmm_map[select_worst(
                            fields[field_index["FATHMM_pred"]])],
                        'muttaster':
                        muttaster_map[select_worst(
                            fields[field_index["MutationTaster_pred"]])],
                        'metasvm':
                        collapse(fields[field_index["MetaSVM_pred"]]),
                    }

                    self._db.variants.update(
                        {'xpos': xpos, 'ref': ref, 'alt': alt},
                        {'$set': annotations_dict},
                        upsert=True)
    def load_dbnsfp(self):
        """
        Rebuild the `variants` collection from the per-chromosome dbNSFP 2.9 files.

        Works on the database
        settings.CUSTOM_ANNOTATOR_SETTINGS.db[self._genome_version]: its
        `variants` collection is dropped and re-indexed on (xpos, ref, alt).
        For every chromosome in CHROMOSOMES except chrM, the file
        `<CUSTOM_ANNOTATOR_SETTINGS.dbnsfp_dir[self._genome_version]>dbNSFP2.9_variant.<chrom>`
        is read and each data row is upserted as a document keyed on
        (xpos, ref, alt) with the fields rsid, polyphen, sift, fathmm,
        muttaster and metasvm.

        Raises:
            ValueError: if genomeloc cannot map a row's chromosome/position, if
                MetaSVM_pred holds more than one distinct value, or if a
                prediction code is not listed in pred_rank.
            KeyError: if the worst prediction code of a column has no entry in
                that column's translation map.
        """
        annotator_settings = settings.CUSTOM_ANNOTATOR_SETTINGS
        db = annotator_settings.db[self._genome_version]
        db.drop_collection('variants')
        db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # Translation of dbNSFP single-letter prediction codes to stored labels.
        polyphen_map = {
            'D': 'probably_damaging',
            'P': 'possibly_damaging',
            'B': 'benign',
            '.': None
        }

        sift_map = {
            'D': 'damaging',
            'T': 'tolerated',
            '.': None
        }

        fathmm_map = {
            'D': 'damaging',
            'T': 'tolerated',
            '.': None
        }

        muttaster_map = {
            'A': 'disease_causing',
            'D': 'disease_causing',
            'N': 'polymorphism',
            'P': 'polymorphism',
            '.': None
        }

        def collapse(scores):
            # A ';'-separated value is only accepted when all entries agree.
            values = set(scores.split(";"))
            if len(values) > 1:
                raise ValueError("Couldn't collapse %s" % str(scores))
            return list(values)[0]

        # Prediction codes ordered from most to least severe.
        pred_rank = ['D', 'A', 'T', 'N', 'P', 'B', '.']

        def select_worst(pred_value):
            # Track the lowest pred_rank position seen among the entries.
            best_index = len(pred_rank) - 1
            for pred in pred_value.split(";"):
                best_index = min(best_index, pred_rank.index(pred))
            return pred_rank[best_index]

        for chrom in CHROMOSOMES:
            if chrom == "chrM":
                continue  # no dbNSFP data for chrM

            print("Reading dbNSFP data for {}".format(chrom))
            dbnsfp_path = annotator_settings.dbnsfp_dir[self._genome_version] + 'dbNSFP2.9_variant.' + chrom
            # `with` guarantees the file is closed even when a row fails to parse.
            with open(dbnsfp_path) as single_chrom_file:
                header = single_chrom_file.readline()
                header_fields = header.strip("\n").split()
                field_index = {name: header_fields.index(name) for name in header_fields}

                # readline() above already consumed the header, so every
                # remaining line is a data row and none may be skipped.
                for line in tqdm(single_chrom_file):
                    fields = line.strip('\n').split('\t')
                    # Separate names keep the outer loop's `chrom` intact.
                    row_chrom, pos, ref, alt = fields[:4]
                    row_chrom = 'chr' + row_chrom
                    pos = int(pos)
                    xpos = genomeloc.get_single_location(row_chrom, pos)
                    if not xpos:
                        # Positional placeholders: the arguments are a tuple,
                        # so %(name)s would itself raise a TypeError.
                        raise ValueError("Unexpected chr, pos: %s, %s" % (row_chrom, pos))

                    rsid = fields[field_index["rs_dbSNP141"]]
                    annotations_dict = {
                        'rsid': rsid if rsid != '.' else None,
                        'polyphen': polyphen_map[select_worst(fields[field_index["Polyphen2_HVAR_pred"]])],
                        'sift': sift_map[select_worst(fields[field_index["SIFT_pred"]])],
                        'fathmm': fathmm_map[select_worst(fields[field_index["FATHMM_pred"]])],
                        'muttaster': muttaster_map[select_worst(fields[field_index["MutationTaster_pred"]])],
                        'metasvm': collapse(fields[field_index["MetaSVM_pred"]]),
                    }

                    db.variants.update(
                        {'xpos': xpos, 'ref': ref, 'alt': alt},
                        {'$set': annotations_dict},
                        upsert=True
                    )