Example 1
def get_cadd(mut, path='/u/sshuai/sshuai/func_score/cadd/v1.3'):
    ''' Get CADD scores with tabix
    '''
    # make chrom string
    mut['chrom'] = mut.chrom.astype(str)
    # create return table
    # Keep SNPs and indels; MNP support is still a TODO
    keep = mut['type'].isin(['SNP', 'DEL', 'INS'])
    cadd = mut[keep].copy()
    if cadd.shape[0] == 0:
        logger.warning('No mutations left in CADD adjustment')
        return None
    logger.info('Retrieving CADD SNP Scores')
    # name for version 1.3.
    # A single file for SNP.
    # Use pre-computed PCAWG indel scores
    snp = 'whole_genome_SNVs.tsv.gz'
    indel = 'PCAWG.INDELS.CADD.v1.3.tsv.gz'
    snp_path = os.path.join(path, snp)
    indel_path = os.path.join(path, indel)
    assert os.path.isfile(
        snp_path), 'Cannot find CADD SNP scores in {}'.format(snp_path)
    assert os.path.isfile(
        indel_path), 'Cannot find CADD PCAWG indel scores in {}'.format(
            indel_path)
    # open one CADD
    tb_snp = tabix.open(snp_path)
    tb_indel = tabix.open(indel_path)
    # row apply
    func = lambda x: query_cadd(x[3], tb_snp, tb_indel, x[0], x[1], x[2], x[4],
                                x[5])
    cadd['fscore'] = cadd.apply(func, axis=1)
    return cadd
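A minimal call sketch for get_cadd, hedged: the chrom/start/end/type/ref/alt column order is inferred from the positional x[0]..x[5] access in the lambda above, and query_cadd plus the score files at the default path come from the surrounding module.

import pandas as pd

# hypothetical single-SNP input table; column order matches the lambda above
mut = pd.DataFrame({'chrom': ['1'], 'start': [1000], 'end': [1001],
                    'type': ['SNP'], 'ref': ['A'], 'alt': ['T']})
scored = get_cadd(mut)  # copy of the kept rows with an added 'fscore' column, or None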
Example 2
def get_variants_by_tabix(sample_vcf,
                          contig=None,
                          start=None,
                          end=None,
                          query_str=None,
                          reference_vcf=None):
    """

    :param sample_vcf: str or pytabix handler;
    :param contig: str;
    :param start: int;
    :param end: int;
    :param query_str: str; a tabix region string such as '1:1000-2000'
    :param reference_vcf: str or pytabix handler;
    :return: list; list of dict
    """

    if isinstance(sample_vcf, str):  # Open sample VCF
        sample_vcf = tabix.open(sample_vcf)

    if query_str:
        records = sample_vcf.querys(query_str)
    else:
        records = sample_vcf.query(contig, start, end)

    # materialize the iterator first: calling len(list(records)) on the raw
    # iterator would consume it before the final loop below
    records = list(records)

    if reference_vcf and len(records) == 0:
        # If sample does not have the record, query reference if given
        if isinstance(reference_vcf, str):  # Open reference VCF
            reference_vcf = tabix.open(reference_vcf)

        records = reference_vcf.query(contig, start - 1, end)

    return [parse_variant(r) for r in records]
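A short usage sketch, assuming 'sample.vcf.gz' is bgzipped and tabix-indexed and that parse_variant is defined in the surrounding module:

variants = get_variants_by_tabix('sample.vcf.gz', contig='1', start=1000, end=2000)
# or with a raw tabix region string
variants = get_variants_by_tabix('sample.vcf.gz', query_str='1:1000-2000')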
Example 3
    def __init__(self, input_path, blacklist_regions=None, bases_order=None):
        """
        Constructs a `Genome` object.
        """
        self.genome = pyfaidx.Fasta(input_path)
        self.chrs = sorted(self.genome.keys())
        self.len_chrs = self._get_len_chrs()
        self._blacklist_tabix = None

        if blacklist_regions == "hg19":
            self._blacklist_tabix = tabix.open(
                pkg_resources.resource_filename(
                    "selene_sdk",
                    "sequences/data/hg19_blacklist_ENCFF001TDO.bed.gz"))
        elif blacklist_regions == "hg38":
            self._blacklist_tabix = tabix.open(
                pkg_resources.resource_filename(
                    "selene_sdk", "sequences/data/hg38.blacklist.bed.gz"))
        elif blacklist_regions is not None:  # user-specified file
            self._blacklist_tabix = tabix.open(blacklist_regions)

        if bases_order is not None:
            bases = [str.upper(b) for b in bases_order]
            self.BASES_ARR = bases
            lc_bases = [str.lower(b) for b in bases]
            self.BASE_TO_INDEX = {
                **{b: ix
                   for (ix, b) in enumerate(bases)},
                **{b: ix
                   for (ix, b) in enumerate(lc_bases)}
            }
            self.INDEX_TO_BASE = {ix: b for (ix, b) in enumerate(bases)}
            self.update_bases_order(bases)
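A hedged construction sketch, assuming this __init__ belongs to selene_sdk's Genome class; the FASTA path is an assumption:

genome = Genome('data/hg38.fa', blacklist_regions='hg38')  # uses the bundled hg38 blacklist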
Example 4
 def init_db(
     self,
     gene_file="/datd/huboqiang/test_hESC/database/refGene.up2000_down2000.promoter.Bsorted.longestTid.bed",
     motifBed_file="/data/Analysis/huboqiang/software/encode-motifs-v1.3/matches.txt.gz"
 ):
     """ reference file used. """
     self.file_geneTSS_tb = tabix.open(gene_file)
     self.file_motifBed_tb = tabix.open(motifBed_file)
Example 5
    def _unpicklable_init(self):
        if not self.initialized:
            self.genome = pyfaidx.Fasta(self.input_path)
            self.chrs = sorted(self.genome.keys())
            self.len_chrs = self._get_len_chrs()
            self._blacklist_tabix = None

            if self.blacklist_regions == "hg19":
                self._blacklist_tabix = tabix.open(
                    pkg_resources.resource_filename(
                        "selene_sdk",
                        "sequences/data/hg19_blacklist_ENCFF001TDO.bed.gz"))
            elif self.blacklist_regions == "hg38":
                self._blacklist_tabix = tabix.open(
                    pkg_resources.resource_filename(
                        "selene_sdk", "sequences/data/hg38.blacklist.bed.gz"))
            elif self.blacklist_regions is not None:  # user-specified file
                self._blacklist_tabix = tabix.open(self.blacklist_regions)

            self.lens = np.array([self.len_chrs[c] for c in self.chrs])
            self.inds = {
                c: ind
                for c, ind in zip(
                    self.chrs, np.concatenate([[0], np.cumsum(self.lens)]))
            }
            if self.memmapfile is not None and os.path.isfile(self.memmapfile):
                # load memmap file
                self.sequence_data = np.memmap(self.memmapfile,
                                               dtype="float32",
                                               mode="r")
                self.sequence_data = np.reshape(
                    self.sequence_data,
                    (4, int(self.sequence_data.shape[0] / 4)))
            else:
                # convert all sequences into encoding
                self.sequence_data = np.zeros((4, self.lens.sum()),
                                              dtype=np.float32)
                for c in self.chrs:
                    sequence = self.genome[c][:].seq
                    encoding = self.sequence_to_encoding(sequence)
                    self.sequence_data[:, self.inds[c]:self.inds[c] +
                                       self.len_chrs[c]] = encoding.T
                if self.memmapfile is not None:
                    # create memmap file
                    mmap = np.memmap(self.memmapfile,
                                     dtype="float32",
                                     mode="w+",
                                     shape=self.sequence_data.shape)
                    mmap[:] = self.sequence_data
                    self.sequence_data = np.memmap(
                        self.memmapfile,
                        dtype="float32",
                        mode="r",
                        shape=self.sequence_data.shape)

            self.initialized = True
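The memmap round-trip above (fill a mode='w+' map, then reopen read-only) is the standard numpy pattern; a minimal standalone sketch:

import numpy as np

data = np.arange(12, dtype="float32").reshape(3, 4)
mm = np.memmap("encoding.dat", dtype="float32", mode="w+", shape=data.shape)
mm[:] = data   # writes through to the backing file
mm.flush()     # make sure all bytes are on disk before reopening
ro = np.memmap("encoding.dat", dtype="float32", mode="r", shape=data.shape)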
Example 6
def getJSONObject(params, positives, negatives, tbCladeSNPsFile,
                  tbSNPcladesFile, snpPanelConfigFile):
    tbSNPclades = tabix.open(tbSNPcladesFile)
    tbCladeSNPs = tabix.open(tbCladeSNPsFile)
    uniqPositives = getUniqueSNPsetTabix(positives, tbSNPclades)
    uniqNegatives = getUniqueSNPsetTabix(negatives, tbSNPclades)
    conflicting = uniqPositives.intersection(uniqNegatives)
    uniqPositives = uniqPositives.difference(conflicting)
    uniqNegatives = uniqNegatives.difference(conflicting)
    if len(uniqPositives) == 0:
        return {"error": "unable to determine clade due to no positive SNPs"}
    warning = None
    if len(conflicting) > 0:
        warning = "conflicting calls for same SNP with names " + ", ".join(
            list(conflicting))
    (ranked, hierarchy) = getRankedSolutionsScratch(uniqPositives,
                                                    uniqNegatives, tbCladeSNPs,
                                                    tbSNPclades)
    if "all" in params:
        result = []
        for r in ranked:
            clade = r[1]
            score = r[4]
            result.append(
                decorateJSONObject(params, clade, score, uniqPositives,
                                   uniqNegatives, tbCladeSNPs, tbSNPclades,
                                   hierarchy, snpPanelConfigFile, conflicting,
                                   warning))
        return result
    else:
        if len(ranked) > 0:
            clade = ranked[0][1]
            score = ranked[0][4]
            decorated = decorateJSONObject(params, clade, score, uniqPositives,
                                           uniqNegatives, tbCladeSNPs,
                                           tbSNPclades, hierarchy,
                                           snpPanelConfigFile, conflicting,
                                           warning)
            if len(ranked) > 1 and "score" in params:
                clade = ranked[1][1]
                score = ranked[1][4]
                decorated["nextPrediction"] = {"clade": clade, "score": score}
            return decorated
        else:
            if len(positives) == 1:
                return {
                    "error":
                    "unable to find " + list(positives)[0] +
                    " on the YFull tree"
                }
            else:
                return {
                    "error":
                    "unable to find any of " + ", ".join(positives) +
                    " on the YFull tree"
                }
Example 7
    def init_resource(self):
        """ init features and other annotation resources """
        if self.config.has_option(self.rv, 'dbsnp'):
            import tabix
            self.resources['dbsnp'] = tabix.open(self.config.get(self.rv, 'dbsnp'))

        self.features = []
        for rname in self.config.options(self.rv):
            featdb = self.config.get(self.rv, rname)
            if featdb.endswith('.featuredb'):
                self.features.append((rname, tabix.open(featdb)))
Example 9
def get_eigen(mut, path='/u/sshuai/sshuai/func_score/eigen/v1.1', coding=True):
    ''' Get eigen scores with tabix
    '''
    # make chrom string
    mut['chrom'] = mut.chrom.astype(str)
    # Eigen only supports chr 1-22
    valid_chrom = [str(i) for i in range(1, 23)]
    mut = mut[mut.chrom.isin(valid_chrom)]
    # create return table
    # SNP only. TO DO: ADD MNP and INDEL Support
    keep = mut['type'] == 'SNP'
    # chrom != X, Y
    eigen = mut[keep].copy()
    if eigen.shape[0] == 0:
        logger.warning('No mutations left in Eigen adjustment')
        return None
    if coding:
        logger.info('Retrieving Eigen Coding Scores')
        # name for version 1.1. A single file for coding.
        name = 'Eigen_hg19_coding_annot_04092016.tab.bgz'
        file_path = os.path.join(path, name)
        assert os.path.isfile(
            file_path), 'Cannot find eigen coding in {}'.format(file_path)
        # open one eigen
        tb = tabix.open(file_path)
        # row apply
        func = lambda x: query_eigen_SNP(tb, x[0], x[1], x[2], x[4], x[5])
        eigen['fscore'] = eigen.apply(func, axis=1)
    else:
        logger.info('Retrieving Eigen Non-Coding Scores')
        # One file per chrom for non-coding. 22 in total.
        file_dict = {
            str(i):
            os.path.join(path,
                         'Eigen_hg19_noncoding_annot_chr{}.tab.bgz'.format(i))
            for i in range(1, 23)
        }
        # check files, must be 22 True
        file_names = np.array(list(file_dict.values()))
        check_file = np.array([os.path.isfile(f) for f in file_names])
        assert np.sum(
            check_file) == 22, 'Cannot find eigen noncoding in {}'.format(
                ", ".join(file_names[~check_file]))
        # open 22 eigen files
        file_dict = {k: tabix.open(v) for k, v in list(file_dict.items())}
        # row apply
        func = lambda x: query_eigen_SNP(file_dict[str(x[0])], x[0], x[1],
                                         x[2], x[4], x[5])
        eigen['fscore'] = eigen.apply(func, axis=1)
    return eigen
Example 10
    def init_resource(self):

        if self.config.has_option(self.rv, 'dbsnp'):
            import tabix
            self.resources['dbsnp'] = tabix.open(
                self.config.get(self.rv, 'dbsnp'))
Example 11
def get_job_results(job_id, job=None):
    filters = request.args.to_dict()
    epacts_filename = job.relative_path("output.epacts.gz")
    with gzip.open(epacts_filename, "rt") as f:
        header = f.readline().rstrip('\n').split('\t')
        if header[1] == "BEG":
            header[1] = "BEGIN"
        if header[0] == "#CHROM":
            header[0] = "CHROM"
    assert len(header) > 0
    headerpos = {x:i for i,x in enumerate(header)}

    if filters.get("region", ""):
        tb = tabix.open(epacts_filename)
        indata = tb.query(chrom, start_pos, end_pos)
    else:
        indata = (x.split("\t") for x in gzip.open(epacts_filename))

    pass_tests = []
    if filters.get("non-monomorphic", False):
        if "AC" not in headerpos:
            raise Exception("Column AC not found")
        ac_index = headerpos["AC"]
        def mono_pass(row):
            if float(row[ac_index])>0:
                return True
            else:
                return False
        pass_tests.append(mono_pass)

    if "max-pvalue" in filters:
        if "PVALUE" not in headerpos:
            raise Exception("Column PVALUE not found")
        pval_index = headerpos["PVALUE"]
        thresh = float(filters.get("max-pvalue", 1))
        def pval_pass(row):
            if row[pval_index] == "NA":
                return False
            if float(row[pval_index])<thresh:
                return True
            else:
                return False
        pass_tests.append(pval_pass)

    def pass_row(row):
        if len(pass_tests)==0:
            return True
        for f in pass_tests:
            if not f(row):
                return False
        return True

    def generate():
        yield "\t".join(header) + "\n"
        for row in indata:
            if row[0].startswith("#"):  # the header row appears only in the full-file path
                continue
            if pass_row(row):
                yield "\t".join(row) + "\n"
    return Response(generate(), mimetype="text/plain")
Example 12
def getLeadSNPs(chrom, snps, IndSigSNPs, params):
    leadSNPs = []
    checked = []
    IndSigSNPs = IndSigSNPs[IndSigSNPs[:, 4].astype(float).argsort()]
    for snp in IndSigSNPs:
        if snp[1] in checked:
            continue
        ldfile = params.refgenome_dir + '/' + params.refpanel + '/' + params.pop + '/' + params.pop + '.chr' + str(
            snp[2]) + '.ld.gz'
        tb = tabix.open(ldfile)
        ld_tmp = tb.querys(snp[2] + ":" + snp[3] + "-" + snp[3])
        inSNPs = []
        inSNPs.append(snp[1])

        for l in ld_tmp:
            if float(l[6]) < params.r2_2:
                continue
            if int(l[1]) != int(snp[3]):
                continue
            if int(l[4]) in IndSigSNPs[:, 3].astype(int):
                rsID = IndSigSNPs[IndSigSNPs[:, 3].astype(int) == int(l[4]),
                                  1][0]
                checked.append(rsID)
                inSNPs.append(rsID)
        leadSNPs.append([
            snp[0], snp[1], snp[2], snp[3], snp[4],
            str(len(inSNPs)), ";".join(inSNPs)
        ])
    leadSNPs = np.array(leadSNPs)
    leadSNPs = leadSNPs[leadSNPs[:, 3].astype(int).argsort()]

    return leadSNPs
Example 13
def getChr15(filedir, snps, Chr15, Chr15cells, chr15dir):
    if int(Chr15) == 1:
        annot = pd.read_table(filedir + "annot.txt", sep="\t")
        annothead = list(annot.columns.values)
        annot = annot.to_numpy()
        annot = annot[ArrayIn(annot[:, 0], snps[:, 0])]
        if Chr15cells[0] == "all":
            Chr15cells = list(annothead[3:len(annothead)])
        for c in Chr15cells:
            snps = np.c_[snps, annot[:, annothead.index(c)]]
        Chr15data = []
        chrom = int(snps[0, 1])
        start = min(snps[:, 2])
        end = max(snps[:, 2])
        if end - start == 0:
            end += 500
            start -= 500
        for i in Chr15cells:
            tb = tabix.open(chr15dir + "/" + str(i) + "_core15.bed.gz")
            tmp = tb.querys(str(chrom) + ":" + str(start) + "-" + str(end))
            for l in tmp:
                if int(l[1]) < int(start):
                    l[1] = str(start)
                if int(l[2]) > int(end):
                    l[2] = str(end)
                Chr15data.append([i, int(l[1]), int(l[2]), int(l[3])])
        # Chr15data = np.array(Chr15data)
        return [snps, Chr15data]
    else:
        return [snps, []]
Example 14
def compute_1000genomes_prs():
    url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/"
    url += "ALL.2of4intersection.20100804.genotypes.vcf.gz"
    tb = tabix.open(url)
    records = tb.query("1", 752720, 752721)
    for record in records:
        print(record)
Example 15
def compute_prs(raw_genotype_file, variants):
    counter = 0
    prs_score = 0.0
    url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/"
    url += "ALL.2of4intersection.20100804.genotypes.vcf.gz"
    tb = tabix.open(url)
    bed_file = open(raw_genotype_file.split(".")[0] + ".bed", "w")

    with open(raw_genotype_file) as genotypes:
        for line in genotypes:
            if line[0] == "#":
                continue
            columns = line.split("\t")
            bed_file.write(columns[1] + "\t" + str(int(columns[2]) - 1) +
                           "\t" + str(int(columns[2])) + "\n")
            try:
                genotype = columns[3][0] + ":" + columns[3][1]
                variant = columns[1] + ":" + columns[2] + ":" + genotype
                score = variants[variant]
                prs_score += float(score)
            except:
                continue
            counter += 1
            print(counter)

    bed_file.close()
    return prs_score
Example 16
def checkAndOpen(db):
    if 'tabix' not in sys.modules:
        return None

    db = os.path.expanduser(db)
    if not os.path.exists(db):
        return None
    # 'source' is a variable used to title the column in the output
    # it is defined by the user in the configuration script step when generating the JSON file
    if os.path.splitext(db)[1] == ".gz" and os.path.exists(db + ".tbi"):
        try:
            database = gzip.open(db)
        except IOError:
            print("WARNING: could not open {}".format(db))
            return None
    elif os.path.splitext(db)[1] == ".vcf":
        abortWithMessage("Error: database file {0} must be compressed with bgzip".format(db))
    elif os.path.splitext(db)[1] == ".gz" and not os.path.exists(db + ".tbi"):
        abortWithMessage("Compressed database is not tabix indexed")
    else:
        abortWithMessage("Error opening database files: {0}".format(db))

    row = database.readline()
    if not row:  # readline() returns an empty string at EOF rather than raising
        print("Empty file {}".format(db))
        return None

    return tabix.open(db)
Example 18
def getNonCandidateSNPs(filedir, snps, min_pos, max_pos):
    chrom = int(snps[0, 1])
    chrcol = 0
    poscol = 1

    tb = tabix.open(filedir + "all.txt.gz")
    tb_snps = tb.querys(
        str(chrom) + ":" + str(min_pos - 500000) + "-" + str(max_pos + 500000))
    tmp = []
    for l in tb_snps:
        tmp.append([int(l[0]), int(l[1]), float(l[2])])
    tmp = np.array(tmp)
    tmp = tmp[ArrayNotIn(tmp[:, poscol], snps[:, 3])]

    ### filter SNPs if there are too many #####
    if len(tmp) > 10000:
        tmp_keep = tmp[tmp[:, 2] < 0.05]
        tmp = tmp[tmp[:, 2] >= 0.05]
        step = int(len(tmp) / (10000 - len(tmp_keep))) + 1
        tmp = tmp[np.arange(0, len(tmp), step)]
        tmp = np.r_[tmp, tmp_keep]

    out = []
    for l in tmp:
        out.append([int(l[0]), int(l[1]), l[2]])
    return out
Example 19
def read_vcf(genotype_files, chrom, start, end):
    x = []
    for record in tabix.open(genotype_files[chrom - 1]).query(
            str(chrom), start, end):
        dose = [_.split(':')[2] if _ != '.' else -1 for _ in record[9:]]
        x.append([float(_) if _ != '.' else -1 for _ in dose])
    return np.array(x)
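A hedged call sketch for read_vcf; the one-bgzipped-file-per-chromosome naming scheme is an assumption:

genotype_files = ['genotypes.chr%d.vcf.gz' % c for c in range(1, 23)]
x = read_vcf(genotype_files, chrom=1, start=752720, end=752722)  # dosage matrix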
Example 20
def load_exclude(file_list):
    exclude_list = []
    if file_list is None:
        return exclude_list
    for filename in file_list:
        exclude_list.append(tabix.open(filename))
    return exclude_list
Example 21
def LoadGenotypes(gtfile, gtind, regdata):
    chrom = regdata["chrom"].values[0]
    start = min(regdata["str.start"])
    end = max(regdata["str.start"])
    positions = list(regdata["str.start"])
    loaded_positions = []
    tb = tabix.open(gtfile)
    records = tb.query(chrom, start - 1, end + 1)
    data = []
    for record in records:
        pos = int(record[1])
        if pos not in positions: continue
        loaded_positions.append(pos)
        data.append([GetFloat(record[i + 2])
                     for i in gtind])  # first two cols are chrom, start


#    assert(len(positions)==len(loaded_positions))
#    assert([positions[i]==loaded_positions[i] for i in range(len(positions))])
    regdata = regdata[regdata["str.start"].apply(
        lambda x: x in loaded_positions)]
    assert all(
        regdata["str.start"].values[i] == loaded_positions[i]
        for i in range(len(loaded_positions)))
    return data, regdata
Example 22
def get_repeats(region):
    chrom = region.split(":")[0]
    self = '/uufs/chpc.utah.edu/common/home/u1021864/analysis/exacresiduals/data/self-chains.gt90.bed.gz'
    seg = '/uufs/chpc.utah.edu/common/home/u1021864/analysis/exacresiduals/data/hgsegmental.bed.gz'
    tb = tabix.open(self)
    rep = []
    for r in tb.querys(region):
        s = int(r[1])
        e = int(r[2])
        rep.append((s, e))
    tb = tabix.open(seg)
    for r in tb.querys(region):
        s = int(r[1])
        e = int(r[2])
        rep.append((s, e))
    return rep
Example 23
def pairwise_indel_finder_query():
    form = pairwise_indel_form()
    if form.validate_on_submit():
        data = form.data
        results = []
        strain_cmp = [data["strain_1"], data["strain_2"]]
        tb = tabix.open(SV_BED_URL)
        query = tb.query(data["chromosome"], data["start"], data["stop"])
        for row in query:
            row = dict(zip(SV_COLUMNS, row))
            row["START"] = int(row["START"])
            row["END"] = int(row["END"])
            if row["STRAIN"] in strain_cmp and \
                MIN_SV_SIZE <= int(row["SIZE"]) <= MAX_SV_SIZE:
                row["site"] = f"{row['CHROM']}:{row['START']}-{row['END']} ({row['SVTYPE']})"
                results.append(row)

        # mark overlaps
        if results:
            results[0]['overlap'] = False
            first = results[0]
            for idx, row in enumerate(results[1:]):
                row["overlap"] = overlaps(first["START"], first["END"],
                                          row["START"], row["END"])
                if row["overlap"]:
                    results[idx]['overlap'] = True
                first = row

            # Filter overlaps
            results = [x for x in results if x['overlap'] is False]
            results = sorted(results, key=lambda x: (x["START"], x["END"]))
            return jsonify(results=results)
        return jsonify(results=[])
    return jsonify({"errors": form.errors})
Example 24
def parseVCF(vcfFile, tbPositionSNPsFile):
    tbPositionSNPs = tabix.open(tbPositionSNPsFile)
    positives = []
    negatives = []
    if isMale(vcfFile):
        vcf_reader = vcf.Reader(filename=vcfFile)
        record = next(vcf_reader)

        while record:
            if record.CHROM == "chrY":
                position = str(record.POS)
                basesString = record.samples[0].gt_bases
                if basesString:
                    allele = parseBases(basesString)
                    if allele:
                        (snp, call) = getPositionSNP(position, allele,
                                                     tbPositionSNPs)
                        if snp:
                            if call == "+":
                                positives.append(snp)
                            else:
                                negatives.append(snp)

            try:
                record = next(vcf_reader)
            except StopIteration:
                record = None

    return positives, negatives
Example 25
def retrive_score(mut, conf):
    ''' Obtain functional scores based on mut and conf
    '''
    conf = conf.sort_values('order')
    score = np.empty(shape=mut.shape[0])
    score[:] = np.nan
    for ix_conf, conf_row in conf.iterrows():
        # logger.info('Retrieving {} - {} - chrom {}'.format(conf_row['name'], conf_row['type'], conf_row['chroms']))
        tb = tabix.open(conf_row['path'])
        for ix, var in mut.iterrows():
            if (var['type'] == conf_row['type'] == 'SNP'
                    or conf_row['type'] == 'ALL'
                    or (var['type'] in ['INS', 'DEL']
                        and conf_row['type'] == 'INDEL')):
                try:
                    query_res = tb.query(var.chrom, var.start, var.end)
                except TabixError:
                    query_res = []
                    # Known error, eigen coding has no chrom X, Y data.
                    if not (conf_row['name']
                            in ['EIGEN_CODING', 'EIGEN_NONCODING']
                            and var.chrom in ['X', 'Y']):
                        logger.warning('Retrieving {} - {} score error for {}:{}-{}'\
                                       .format(conf_row['name'], conf_row['type'],
                                               var.chrom, var.start, var.end))
                if conf_row.ref_ix < 0:
                    # no ref alt info (e.g., LINSIGHT score)
                    score[ix] = np.mean(
                        [float(i[conf_row.score_ix]) for i in query_res])
                else:
                    for res in query_res:
                        if var.ref == res[conf_row.ref_ix] and var.alt == res[
                                conf_row.alt_ix]:
                            score[ix] = float(res[conf_row.score_ix])
    return score
Example 26
def process_identical_query(query_obj):
    """
    Loops through the output of an 'identical' query and processes all entries. The function used to update
    the entries is defined before the loop to reduce the if-clauses evaluated in every iteration (slight
    performance increase)
    """

    # Execute the tabix query, skip query if empty result
    try:
        tb = tabix.open(query_obj.originalFile)
        records = tb.query(query_obj.oldSeqID, 0, query_obj.oldSeqLength)
    except tabix.TabixError:
        return

    # get frequently used attributes for faster lookup
    newID = query_obj.newSeqID

    # open files
    # lock is acquired then released after writing is done
    try:
        lock.acquire()
        with open(query_obj.dependentFile, 'a') as updated_file:
            # Loop through tabix output
            for entry in records:
                # Modify the oldID with the newID
                entry[0] = newID
                # check if the updated coordinate is negative. if yes, the entry is discarded
                if int(entry[1]) < 0:
                    continue
                updated_file.write("\t".join(entry))
                updated_file.write("\n")
    finally:
        lock.release()
Example 27
def getVariants(chrom, start_pos, window_size):
    directory = SNP_DIR  # directory where .gz and .gz.tbi files are stored
    fn = chrom + FILE_END
    readFile = os.path.join(directory, fn)

    # make sure inputs are integers
    start_pos = int(start_pos)
    window_size = int(window_size) - 1

    # open the tabix
    tb = tabix.open(readFile)

    # query for the position
    end_pos = start_pos + window_size

    print("grabbing variants from {} {} {}".format(chrom, str(start_pos),
                                                   str(end_pos)))

    # grab the variant records that fall between start_pos and end_pos
    tb_records = tb.query(chrom, start_pos, end_pos)

    # store tabix data into list
    records = []
    for record in tb_records:
        records.append(record)

    return records
Example 28
def intersect_region(region_file, label, dbi_file, output_folder):
    '''
    intersect dbi with region file (bed format) and write final file in the output folder
    '''
    fin = open(region_file, 'r')
    out_file = os.path.join(output_folder, label+'.bed')
    fout = open(out_file, 'w')
    try:
        dbi = tabix.open(dbi_file)
    except tabix.TabixError:
        print("Can't load tabix file %s" % dbi_file, file=sys.stderr)
        sys.exit(1)
    for line in fin:
        if line.strip().startswith('#') or line.strip() == '':
            continue
        row = line.strip().split()
        chrom = row[0]
        start = int(row[1])
        stop = int(row[2])
        result = dbi.query(chrom, start, stop)
        for x in result:
            print('\t'.join(x), file=fout)
    fin.close()
    fout.close()
    create_tabix(out_file)
Example 29
def gene_activity_matrix(fragments, features, barcodes):
    '''
    Computes the activity of each feature in scATAC-seq
    fragments : fragment file that is bgzipped (provided by cellranger)
    features: chr, start, end, gene
    barcodes: list of barcodes
    returns: gene activity matrix
    '''
    tb = tabix.open(fragments)
    gene_activity = np.zeros((len(barcodes), len(features)))
    barcode_lookup = dict(zip(barcodes, np.arange(
        1, 1 +
        len(barcodes))))  #hashmap to correspond barcodes with index in matrix

    for i in range(features.shape[0]):
        chrom, start, end = features.iloc[i, [0, 1, 2]]
        fragment_df = utils.read_tabix(tb, (chrom, start, end))
        if fragment_df.shape[0] > 0:
            curr_barcodes = fragment_df[3].values
            for b in curr_barcodes:
                z = barcode_lookup.get(b)
                if z:
                    gene_activity[z - 1, i] += 1
        if i % 1000 == 0 or i == features.shape[0] - 1:
            percent_complete = str(
                np.round(100 * ((i + 1) / features.shape[0]), decimals=2))
            print('\r Progress: ' + percent_complete + '%', end="")

    gene_activity = pd.DataFrame(gene_activity,
                                 index=barcodes,
                                 columns=features['gene'].values)

    return gene_activity
Example 30
def extract_CADD_score(arguments, q):
	vcf_record, caddfile = arguments
	
	tb = tabix.open(caddfile)

	chromosome = (vcf_record.CHROM).replace("chr","")
	vcf_record.INFO["RAWCADD"]   = 0
	vcf_record.INFO["PHREDCADD"] = 0

	# Specific for CADD files
	# FIXME: get info about chr or not from provided VCF file
	records = tb.query(chromosome, vcf_record.POS-1, vcf_record.POS)

	# Look for matching mutation
	# Works for SNVs, InDels optimisation is ongoing
	for rec in records:
		if rec[3] == vcf_record.ALT[0]:
			# FIXME: Make requested fields optional through arguments
			vcf_record.INFO["RAWCADD"]   = rec[4]
			vcf_record.INFO["PHREDCADD"] = rec[5]
			break
	
	# workaround since multiprocess can't handle VCF record class objects
	# FIXME: use VCF class records rather than this ugly string
	annotated = VCF_WRITER._map(str, [vcf_record.CHROM, vcf_record.POS, vcf_record.ID, vcf_record.REF]) + [VCF_WRITER._format_alt(vcf_record.ALT), str(vcf_record.QUAL) or '.', VCF_WRITER._format_filter(vcf_record.FILTER), VCF_WRITER._format_info(vcf_record.INFO)]

	# Return results to Queue
	q.put(annotated)
	return(annotated)
Example 31
def get_cadd(config, chrom, start, ref, alt):
    '''
    add cadd of variant
    '''
    tabix_fp = config['data_paths']['cadd']['whole_genome_cadd']
    try:
        tb = tabix.open(tabix_fp)
    except tabix.TabixError:
        logging.warning('{0} not available'.format(tabix_fp))
        return np.nan

    '''if stop != start:
        print('WARNING: the start {0} is different than stop {1}'.format(start, stop))
        return np.nan'''

    try:
        records = tb.querys(str(chrom) + ':' + str(start) + '-' + str(start))
    except tabix.TabixError:
        logging.warning('Error when trying to query {0}-{1}-{2}-{3}'.format(chrom, start, ref, alt))
        return np.nan

    for record in records:
        if record[2] != ref:
            logging.warning('Reference {0} is not the one in CADD for entry {1}-{2}-{3}-{4}'.format(ref, chrom, start, ref, alt))
            return np.nan
        if record[3] == alt:
            return float(record[5])

    logging.warning('Cannot find a CADD entry for {0}-{1}-{2}-{3}'.format(chrom, start, ref, alt))
    return np.nan
Example 32
def calc_all(all_genes, bases_to_exclude, rscu_fh, gerp_fp, genome_fa,
             syn_gerp_out, bed_out):
    """
    Calculates mean gerp score for all Gene objects contained in list
    all_genes and writes values to outfile
    :param all_genes: dict of Gene objects
    :param gerp_fp: path to tabix-indexed gerp file
    :param genome_fa: path to reference genome fasta that has been indexed
        via samtools faidx
    :param outfile: path to output file
    """
    with gzip.open(gerp_fp, 'rt') as gerp_f:
        gerp_header = gerp_f.readline()
        gerp_header = gerp_header.strip().split("\t")

    gerp_tb = tabix.open(gerp_fp)
    genome = pyfaidx.Fasta(genome_fa)

    rscu = read_rscu_f(rscu_fh)

    syn_gerp_out.write("#GENE\tSYN_GERP\n")

    bed_out.write("#CHROM\tPOS\tSTRAND\tGENE\tCDS_POS\tCODON\tRSCU\tGERP\n")

    for gene_obj in all_genes.values():
        gene_obj.calc_syn_gerp(genome, gerp_header, gerp_tb, rscu,
                               bases_to_exclude)
        syn_gerp_out.write("{}\t{}\n".format(gene_obj.gene, \
                                             gene_obj.syn_gerp))
        for line in gene_obj.bed:
            bed_out.write(line)
Example 33
def query_fragments(fragment_file, chrom, start, end):
    """
    Counts number of fragments per barcode in fragment file.
    
    Parameters
    ----------
    
    fragment_file: path to fragment file
    
    chrom: chromosome to query
    
    start: start of query region
    
    end: end of query region
    
    Returns
    -------
    
    records: fragments in given region.
    
    """
    tb = tabix.open(fragment_file)
    results = tb.querys("%s:%d-%d" % (chrom, start, end))
    records = []
    for record in results:
        records.append(record)
    return records
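A short usage sketch; the fragment file path is an assumption (cellranger-atac ships a bgzipped, indexed fragments.tsv.gz):

records = query_fragments('fragments.tsv.gz', 'chr1', 10000, 20000)
print(len(records), 'fragments in region')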
Example 34
def do_peak_feat_row(map_args, def_param=(scores1, scores2)):
    """
    Loop definition for multithreading over table rows within add_peak_features.
    """
    (i, train, BED, opt) = map_args
    peaks = tb.open(BED)  # 'tb' is assumed to be the tabix module (import tabix as tb)
    row = train.iloc[i]
    anchor1, anchor2 = prepare_anchors(row, opt.extension)

    feats1 = get_features(anchor1.chrom, anchor1.start, anchor1.end, peaks, [
        "chrom", "chromStart", "chromEnd", "name", "score", "strand",
        "signalValue", "pValue", "qValue", "peak"
    ], [
        "string", "int64", "int64", "string", "int64", "string", "float64",
        "float64", "float64", "int64"
    ])
    feats2 = get_features(anchor2.chrom, anchor2.start, anchor2.end, peaks, [
        "chrom", "chromStart", "chromEnd", "name", "score", "strand",
        "signalValue", "pValue", "qValue", "peak"
    ], [
        "string", "int64", "int64", "string", "int64", "string", "float64",
        "float64", "float64", "int64"
    ])

    score1 = choose_feat(feats1, "signalValue", opt.collapse_peaks)
    score2 = choose_feat(feats2, "signalValue", opt.collapse_peaks)
    lock.acquire()
    scores1[i] = (score1 + score2) / 2.0
    scores2[i] = np.std([score1, score2])
    lock.release()
Example 35
def get_exons(chrom, start, stop, file):
	tb = tabix.open(file)
	records = tb.query(chrom, start, stop)
	exons = []
	for record in records:
		exons.append(record)
	return exons
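A hedged usage sketch; the exon BED path is an assumption:

exons = get_exons('chr1', 100000, 200000, 'exons.bed.gz')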
Example 36
def Query(tbk_file=None, seqname=None, start=1, end=2,
          idx=None, dtype=None, base_idx=8192):
    """
    The main function for querying.
    tbk_file: input a single .tbk file.
    seqname: chromosome (or the sequence name for tabix).
    start: start position.
    end: end position.
    idx: tabix-indexed position file; defaults to the one named in the .tbk header.
    dtype: value type of the .tbk records; read from the header when omitted.
    base_idx: number of leading header/index bytes that should be skipped.
    """
    if dtype is None:
        ver, dtype, num, idx1 = Header(tbk_file)
        dtype = dtype_map_rev[dtype]
        if idx is None:
            # fall back to the index path recorded in the .tbk header
            idx = idx1
    fmt = dtype_fmt[dtype]
    tb = tabix.open(idx)
    records=tb.query(seqname,start,end)
    lineNum=[record[3] for record in records]
    if len(lineNum)==0:
        return None
    elif len(lineNum)==1:
        return read_one_site(tbk_file,int(lineNum[0]),fmt,base_idx)
    n1,n2=lineNum[0],lineNum[-1]
    return read_multi_site(tbk_file,n1,n2,fmt,base_idx)
Example 37
 def __init__(self, _snp, _ref, _vcf, _restrict, \
                  _num_ctrls, _window, _match_context):
     self.snp = _snp
     self.ref = pyfasta.Fasta(_ref)
     self.vcf = tabix.open(_vcf)
     if _restrict is not None:
         self.restrict = tabix.open(_restrict)
     else: self.restrict = None
     self.chromToKey = {}
     for k in self.ref.keys():
         chrom = k.split()[0]
         self.chromToKey[chrom] = k
     self.num_ctrls = _num_ctrls
     self.window = _window
     self.match_context = _match_context
     if self.match_context >= 0:
         self.snp_context = self.GetContext(self.snp)
Example 38
def main(args):
    chrom, coords = loadCoords(args.bedFile)
    tb = ""
    if chrom:
        tabix.open(
            "/home/evansj/me/data/ExAC/coverage/ftp.broadinstitute.org/pub/ExAC_release/current/coverage/Panel.chr%s.coverage.txt.gz"
            % (chrom,)
        )

    with open(args.outFile, "w") as fout:
        if chrom:
            for st in coords:
                # st is 1-idx in coords
                # tabix needs 0-based
                records = tb.query(chrom, st - 1, st)
                for record in records:
                    thisChrom, pos, mean, median, c1, c5, c10, c15 = record[0:8]
                    print("\t".join((thisChrom, pos, c10)), file=fout)
Example 39
def tabix_vcf(vcf_file, in_chr, in_start, in_stop):
	"""A generator to get records in a VCF given a location."""
	chrom = str(in_chr)
	start = int(in_start)
	stop = int(in_stop)
	try:
		vcf_tb = tabix.open(vcf_file)
		for rec in vcf_tb.query(chrom, start, stop):
			yield rec
	except tabix.TabixError:
		# silently yield nothing if the file cannot be opened or queried
		return
Example 40
def get_tabixhandle(path):
    """Check if a file is zipped and that the index exists
        If something looks weird raise a TabixError
    """
    if not path.endswith('.gz'):
        raise TabixError("File {0} does not end with '.gz'".format(path))
    index_file = path + '.tbi'
    if not os.path.isfile(index_file):
        raise TabixError("No index could be found for {0}".format(path))
    
    return tabix.open(path)
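A short usage sketch; the file name is hypothetical and TabixError is assumed to be defined in the surrounding module:

tb = get_tabixhandle('calls.vcf.gz')
for rec in tb.query('1', 100000, 101000):
    print(rec[:5])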
Example 41
 def __init__(self, args):
     self.args = args
     # parse out TranscriptInfos
     print('Loading transcripts...', file=sys.stderr)
     self.tx_infos = self._parse_tx_infos(args.gencode_gtf)
     self.tx_info_by_id = dict([(info.transcript_id, info) for info in self.tx_infos])
     # open tabix file
     print('Opening tabix file...', file=sys.stderr)
     self.tabix = tabix.open(args.gencode_gtf)
     # open BAM file and iterate over it
     print('Opening BAM file...', file=sys.stderr)
     self.sam_file = pysam.AlignmentFile(args.alignment_bam, 'r')
Example 42
def ld_expand(df, ld_beds):
    """
    Expand a set of SNVs into all SNVs with LD >= 0.8 and return a BedTool of
    the expanded SNPs.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Pandas dataframe with SNVs. The index is of the form chrom:pos where pos
        is the one-based position of the SNV. The columns are chrom, start, end.
        chrom, start, end make a zero-based bed file with the SNV coordinates.

    ld_beds : dict
        Dict whose keys are chromosomes and whose values are filenames of
        tabixed LD bed files. The LD bed files should be formatted like this:
            chr1    14463   14464   14464:51479:0.254183
        where the first three columns indicate the zero-based coordinates of
        a SNV and the fourth column has the one-based coordinate of that
        SNV, the one-based coordinate of another SNV on the same chromosome, and
        the LD between these SNVs (all separated by colons).

    Returns
    -------
    bt : pybedtools.BedTool
        BedTool with input SNVs and SNVs they are in LD with.
    """
    import pybedtools as pbt
    import tabix
    out_snps = []
    for chrom in ld_beds.keys():
        t = tabix.open(ld_beds[chrom])
        tdf = df[df['chrom'].astype(str) == chrom]
        for ind in tdf.index:
            p = tdf.loc[ind, 'end']
            out_snps.append('{}\t{}\t{}\t{}\n'.format(chrom, p - 1, p, ind))
            try:
                r = t.query('{}'.format(chrom), p - 1, p)
                while True:
                    try:
                        n = next(r)
                        p1, p2, r2 = n[-1].split(':')
                        if float(r2) >= 0.8:
                            out_snps.append('{}\t{}\t{}\t{}\n'.format(
                                n[0], int(p2) - 1, int(p2), ind))
                    except StopIteration:
                        break
            except tabix.TabixError:
                continue
    bt = pbt.BedTool(''.join(out_snps), from_string=True)
    bt = bt.sort()
    return bt
Example 43
def get_genotypes(CpG_location):
    import tabix
    import pandas as pd
    tb_file   = "/path/to/file/DF_meth_variants.gz"
    df        = pd.DataFrame(columns=range(0, 782))
    tb        = tabix.open(tb_file)
#    print CpG_location
    records   = tb.querys(CpG_location)
    num       = 0
    for record in records:
        df.loc[num] = record[3:]
        num        += 1
    return(df)
Example 44
 def test_same_aa_different_positions(self):
     ''' check that same_aa() works correctly for different amino acids
     '''
     
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=5, extra='Protein_position=2'))
     lines.append(make_vcf_line(pos=7, extra='Protein_position=3'))
     lines.append(make_vcf_line(pos=8, extra='Protein_position=4'))
     self.write_vcf(lines)
     
     vcf = tabix.open(self.path)
     pairs = [[('1', 7), ('1', 8)]]
     
     self.assertEqual(same_aa(vcf, pairs), [])
Example 45
 def test_same_aa(self):
     ''' check that same_aa() works correctly
     '''
     
     # get the VCF lines
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=2, extra='Protein_position=1'))
     lines.append(make_vcf_line(pos=4, extra='Protein_position=1'))
     self.write_vcf(lines)
     
     vcf = tabix.open(self.path)
     pairs = [[('1', 2), ('1', 4)]]
     
     self.assertEqual(same_aa(vcf, pairs), [[('1', 2), ('1', 4)]])
Example 46
 def __search_pos_bdg(self):
   
     self.pd_frame = {}
     for i,bdg_file in enumerate(self.l_bdg_file):
         tb = tabix.open( bdg_file )
         record = tb.query( self.chrom, self.beg, self.end )
         l_pos  = []
         l_cons = []
         
         l_xticks = [ self.beg+1 ]
         bin_size = int( (self.end-self.beg)/10 )
         bin_size = 10**int(np.log10(bin_size))
         
         
         
         pre_pos  = 0
               
         for rec in record:
             for pos in range( int(rec[1]),int(rec[2]) ):
                 cons = float(rec[3])
               
                 if pre_pos == 0:
                     pre_pos = int(rec[1])
                    
                 """ Only consider the given region. """
                 if self.__is_intersect(pos):
                     
                     """ If bedGraph has gaps, """
                     if pos > pre_pos+1:
                         """ Using zero to fill bedGraph gaps """
                         for p in range( pre_pos+1,pos ):
                             l_pos.append( p )
                             l_cons.append( 0.0 )
                         
                     l_pos.append( pos  )
                     l_cons.append(cons )
                     
                     if pos % bin_size == 0:
                         l_xticks.append( pos )
                    
                 pre_pos  = pos
         
         l_xticks.append( self.end )
         data = { 'pos':l_pos, 'con':l_cons }
         self.pd_frame[ bdg_file ] = pd.DataFrame( data )
         
         if i == 0:
             self.l_xpos   = l_xticks
             self.l_xticks = [ str(tick) for tick in l_xticks ]
Example 47
 def __init__(self, task_queue, results_queue, families={}, phased=False, 
             vep=False, cadd_raw=False, cadd_file=None, cadd_1000g=None, 
             cadd_exac=None, cadd_ESP=None, cadd_InDels=None, 
             thousand_g=None, exac=None, dbNSFP=None, strict=False, 
             verbosity=False):
     Process.__init__(self)
     self.task_queue = task_queue
     self.families = families
     self.results_queue = results_queue
     self.verbosity = verbosity
     self.phased = phased
     self.vep = vep
     self.cadd_raw = cadd_raw
     self.cadd_file = cadd_file
     self.cadd_1000g = cadd_1000g
     self.cadd_exac = cadd_exac
     self.cadd_ESP = cadd_ESP
     self.cadd_InDels = cadd_InDels
     self.thousand_g = thousand_g
     self.exac = exac
     self.dbNSFP = dbNSFP
     self.strict = strict
     self.any_cadd_info = False
     if self.cadd_file:
         self.cadd_file = tabix.open(self.cadd_file)
         self.any_cadd_info = True
     if self.cadd_1000g:
         self.cadd_1000g = tabix.open(self.cadd_1000g)
         self.any_cadd_info = True
     if self.cadd_exac:
         self.cadd_exac = tabix.open(self.cadd_exac)
         self.any_cadd_info = True
     if self.cadd_ESP:
         self.cadd_ESP = tabix.open(self.cadd_ESP)
         self.any_cadd_info = True
     if self.cadd_InDels:
         self.cadd_InDels = tabix.open(self.cadd_InDels)
         self.any_cadd_info = True
     if self.thousand_g:
         self.thousand_g = tabix.open(self.thousand_g)
     if self.exac:
         self.exac = tabix.open(self.exac)
     if self.dbNSFP:
         self.dbNSFP = tabix.open(self.dbNSFP)
Example 48
def get_1mb_snps():
    import tabix 
    tb = tabix.open('snps_all.gz')

    fname = 'newMethPosFile.txt_2-3_col_1'
    snps = {}

    with open(fname) as f:
        for line in f:
            a = line.rstrip('\n').rsplit('\t')
            start = str(int(a[1]) - 1000000)
            stop  = str(int(a[1]) + 1000000)
            pos = a[0] + ":" + start + "-" + stop
            records = tb.querys(pos)
            for record in records:
                snps[record[3]] = 0
    return(snps)
Example 49
 def test_same_aa_missing_protein_positions(self):
     ''' check that same_aa() works correctly when the vars aren't in the CDS
     '''
     
     # if one of the variants in the pair does not have a protein position
     # listed (i.e. residue number), that indicates the variant could be
     # affecting the splice site, so we can't use the pair.
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=5))
     lines.append(make_vcf_line(pos=7))
     lines.append(make_vcf_line(pos=8, extra='Protein_position=4'))
     self.write_vcf(lines)
     
     vcf = tabix.open(self.path)
     pairs = [[('1', 7), ('1', 8)]]
     
     self.assertEqual(same_aa(vcf, pairs), [])
Example 50
 def test_screen_pairs_nonstandard_pair(self):
     ''' test that screen_pairs() works correctly
     '''
     
     # get the VCF lines
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=2))
     lines.append(make_vcf_line(pos=4))
     lines.append(make_vcf_line(pos=5))
     lines.append(make_vcf_line(pos=7))
     lines.append(make_vcf_line(pos=8))
     self.write_vcf(lines)
     
     vcf = tabix.open(self.path)
     # set up a list of 'pairs', where one 'pair' has three variants in it.
     # we exclude 'pairs' where n != 2.
     pairs = [[('1', 2), ('1', 4), ('1', 5)], [('1', 7), ('1', 8)]]
     self.assertEqual(screen_pairs(vcf, pairs, is_not_indel), [[('1', 7), ('1', 8)]])
Example 51
def open_tabix_file(file_path):
    """docstring for open_tabix_file"""
    file_handle = tabix.open(file_path)
    try:
        file_handle.query('1', 1, 100)
    except tabix.TabixError as e:
        logger.warning("Something wrong with tabix file: {0}".format(
            file_path))
        
        file_name, file_extension = os.path.splitext(file_path)
        if file_extension != '.gz':
            raise NotZippedError("File {0} does not seem to be bgzipped".format(
                file_path))
        else:
            raise NotIndexedError("File {0} does not seem to be tabix"\
                                  " indexed".format(file_path))
    return file_handle
    
Example 52
def gerp(vf, af, name="gerp"):
    v = BedTool(vf)
    t = tabix.open(af)

    results = {}

    for var in v:
        try:
            result = 0.0
            num = 0
            for res in t.query(var.chrom, var.start, var.end):
                result += float(res[4])
                num += 1
            if num > 0:
                results[var.name] = result/num
        except:
            pass

    return Series(results, name=name)
Example 53
def get_exac(config, chrom, start, stop, ref, alt):
    '''
    add exac annotation for the variant; in the exac file:
    # AN_Adj is the overall total of alleles (I do not know the difference as compared to AN)
    # AC_Adj is the overall number of mutant alleles observed across populations
    # AF = AC_Adj/AN_Adj
    '''

    tabix_fp = config['data_paths']['exac']['exac']
    tb = tabix.open(tabix_fp)

    if stop != start:
        logging.warning('The start {0} is different than stop {1}'.format(start, stop))
        return np.nan

    # A query returns an iterator over the results.
    records = tb.querys(str(chrom) + ':' + str(start) + '-' + str(stop))

    # if one single position is provided and this position exists, the iterator contains a single record (a single list)
    #    (if multiple alleles in that position, info is comma-separated)
    for record in records:

        # no entry with pass filter
        if record[6] != 'PASS':
            return np.nan

        # the reference is not annotated well!
        if record[3] != ref:
            print('WARNING:the ref {0} is not matching the info in\n {1}'.format(ref, record[0:5]))

        alt_s = record[4].split(',')
        try:
            # split the alt entry in case there are multiple alleles in the same record
            allele_pos = alt_s.index(alt)
            af_s = record[7].split('AF=')[1].split(';')[0].split(',')
            return float(af_s[allele_pos])
        except (ValueError, IndexError):
            # the alt is not in the record (or AF is absent) and thus not in the exac data
            return np.nan

    # the "for record in records" loop is not entered if the tabix query is empty (no entry for that position)
    return np.nan
Example 54
 def test_get_matches(self):
     ''' check that get_matches works correctly
     '''
     
     # get the VCF lines
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=1))
     lines.append(make_vcf_line(pos=2))
     lines.append(make_vcf_line(pos=4))
     lines.append(make_vcf_line(pos=5))
     self.write_vcf(lines)
     
     vcf = tabix.open(self.path)
     pair = [('1', 2), ('1', 4)]
     
     # define the expected lines
     var1 = parse_vcf_line(make_vcf_line(pos=2).split('\t'), self.Variant)
     var2 = parse_vcf_line(make_vcf_line(pos=4).split('\t'), self.Variant)
     
     self.assertEqual(list(get_matches(vcf, pair)), [var1, var2])
Example 55
    def make_refGene_track(self,bed_tabix="/datd/huboqiang/ChIP_human/Week12/Database/refGene.sort.bed.gz"):
        '''
cut -f 2- refGene.txt | awk '{OFS="\t";print $1"__"$12,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$13,$14,$15}' | /data/Analysis/huboqiang/software/UCSC/genePredToBed /dev/stdin /dev/stdout | bedtools sort -i /dev/stdin >refGene.sort.bed
bgzip -fc  refGene.sort.bed >refGene.sort.bed.gz
tabix -p bed -s 1 -b 2 -e 3  refGene.sort.bed.gz
        '''
        tb = tabix.open( bed_tabix )
        record = tb.query( self.chrom, self.beg, self.end )
        
        self.geneGraph = {}
        for rec in record:
            tran,gene = rec[3].split("__")
            if gene not in self.geneGraph:
                self.geneGraph[ gene ] = {}
            if tran not in self.geneGraph[gene]:
                self.geneGraph[ gene ][ tran ] = { 'beg':int(rec[1]),'end':int(rec[2]), 'exon_beg':[],'exon_ext':[],'cds_beg':[],'cds_ext':[], 'strand':rec[5]  }
            
            l_beg = [ int(beg)+int(rec[1]) for beg in rec[11].split(",")[:-1] ]
            l_ext = [ int(ext)             for ext in rec[10].split(",")[:-1] ]
            
            self.geneGraph[ gene ][ tran ][ 'exon_beg' ] = l_beg
            self.geneGraph[ gene ][ tran ][ 'exon_ext' ] = l_ext
            
            cds_beg = int(rec[6])
            cds_end = int(rec[7])
            exon_cnt= int(rec[9])
            
            if cds_beg == cds_end:
                continue
            for i in range( 0,exon_cnt ):
                beg = l_beg[i]
                end = l_beg[i] + l_ext[i]
               
                if cds_end < beg or cds_beg > end:
                    continue
               
                self.geneGraph[ gene ][ tran ][ 'cds_beg' ].append( max(cds_beg,beg) )
                self.geneGraph[ gene ][ tran ][ 'cds_ext' ].append( min(cds_end,end)-max(cds_beg,beg) )
        
        self.__Only_Region()
        self.__Only_Longest_tran()
Example 56
 def test_screen_pairs(self):
     ''' test that screen_pairs() works correctly
     '''
     
     # get the VCF lines
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=1))
     lines.append(make_vcf_line(pos=2))
     lines.append(make_vcf_line(pos=4))
     lines.append(make_vcf_line(pos=5))
     lines.append(make_vcf_line(pos=7))
     lines.append(make_vcf_line(pos=8))
     self.write_vcf(lines)
     
     vcf = tabix.open(self.path)
     pairs = [[('1', 2), ('1', 4)], [('1', 7), ('1', 8)]]
     
     self.assertEqual(screen_pairs(vcf, pairs, is_not_indel), pairs)
     
     # check that the other filter function also works cleanly
     self.assertEqual(screen_pairs(vcf, pairs, is_coding), pairs)
Example 57
def get_coverage_stats(chrom, pos_start, pos_end):
	tb = tabix.open(COVERAGE_FOLDER + COVERAGE_FILE % chrom)
	records = tb.query(chrom, int(pos_start) - EXON_PADDING, int(pos_end) + EXON_PADDING)

	pos_means = []
	for record in records:
		pos_means.append(float(record[COVERAGE_POS_MEAN_INDEX]))

	coverage_stats = []

	# Check that there is coverage data for the exon
	if pos_means:		
		coverage_mean = "{0:.3f}".format(numpy.mean(pos_means))
		coverage_standard_deviation = "{0:.3f}".format(numpy.std(pos_means))
		coverage_max = max(pos_means)
		coverage_min = min(pos_means)
		coverage_stats = [coverage_mean, coverage_standard_deviation, coverage_max, coverage_min]
	else:
		coverage_stats = [0, 0, 0, 0]
	
	return coverage_stats
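A hedged call sketch, assuming the module-level COVERAGE_* constants and EXON_PADDING shown above are configured:

mean, std, cov_max, cov_min = get_coverage_stats('1', 1000000, 1000200)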
Example 58
def get_mnv_candidates(path):
    ''' identify MNV candidates, and their MNV consequences within a VCF.
    
    Args:
        path: path to VCF
    
    Returns:
        list of (variant, mnv_consequence) tuples, where variant is (chrom, pos)
    '''
    
    with open_vcf(path) as vcf:
        exclude_header(vcf)
        header = get_vcf_header(vcf)
        pairs = find_nearby_variants(vcf)
    
    # ensure variants are not indels, are coding, and pairs alter the same amino
    # acid position
    vcf = tabix.open(path)
    pairs = screen_pairs(vcf, pairs, is_not_indel)
    pairs = screen_pairs(vcf, pairs, is_coding)
    pairs = same_aa(vcf, pairs)
    
    pattern = re.compile('[ACGT]')
    
    candidates = {}
    for pair in pairs:
        var1, var2 = list(get_matches(vcf, pair))
        try:
            cq = check_mnv_consequence(var1, var2, pattern)
            candidates[pair[0]] = cq
            candidates[pair[1]] = cq
        except AssertionError:
            print('{0}:{1} and {0}:{2} in {3} have multiple alternative ' \
                'transcripts or odd codon sequences'.format(var1.chrom,
                var1.pos, var2.pos, path))
    
    return candidates
Example 59
#! /usr/local/bin/python

import sys, tabix
from scipy import stats

inputFile = sys.argv[1]
tumorDepth = sys.argv[2]
normalDepth = sys.argv[3]

hIN = open(inputFile, 'r')
tumorDepth_tb = tabix.open(tumorDepth)
normalDepth_tb = tabix.open(normalDepth)

margin1 = 1000
margin2 = 500000
thres = 50

# for sorting
def cmp_chrPos(x1, x2):
    key1 = x1.split('\t')
    key2 = x2.split('\t')
    
    if key1[0] < key2[0]:
        return 1
    elif key1[0] > key2[0]:
        return -1
    else:
        if int(key1[1]) >= int(key2[1]):
            return 1
        else:
            return -1
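cmp_chrPos is a Python 2-style comparator; under Python 3 it would be wrapped with functools.cmp_to_key, e.g. (rows is a hypothetical list of tab-separated lines):

import functools

rows.sort(key=functools.cmp_to_key(cmp_chrPos))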
Example 60
prog_name = sys.argv[0].split('/')[-1]
if len(sys.argv) == 4:
    in_vcf = sys.argv[1]
    in_db_dir = sys.argv[2]
    maf_cut = float(sys.argv[3])
    print("[%s] %s run initiated." % (time.ctime(), prog_name), file=sys.stderr)
else:
    sys.exit("\nUsage: python %s <in.vcf> <in.EVS.vcf.gz.dir> <maf.cut>\n" % prog_name)
# fi

# Init tabix
dbs = {}
for chrom_id in [str(x) for x in range(1, 23)] + ['X', 'Y']: # no mito var in 1000G
    file_to_glob = "%s/ESP6500SI-V2-SSA137.*.chr%s.*.vcf.gz" % (in_db_dir, chrom_id)
    db_file = glob.glob(file_to_glob)[0]
    dbs[chrom_id] = tabix.open(db_file)
#db = tabix.open(in_db)

# Proc VCF
for line in open(in_vcf, "r"):
    flag_printed = False
    if line.startswith('#'):
        print(line.strip())
        continue
    field = line.strip().split('\t')
    chrom = field[0]
    chrom_id = chrom.replace("chr", '')
    chrom_id = 'M' if chrom_id == "MT" else chrom_id
    one_pos = int(field[1])
    chr_pos = "%s:%s" % (chrom_id, one_pos)
    ref = field[3]