def filterAndAnnotation(inputFilePath, outputFilePath, genome_id, is_grc):

    hIN = open(inputFilePath, 'r')
    hOUT = open(outputFilePath, 'w')

    # annotation_dir = config.param_conf.get("annotation", "annotation_dir")
    # filter_same_gene = config.param_conf.getboolean("filter_condition", "filter_same_gene")
    # annotation_dir = param_conf.resource_dir
    filter_same_gene = param_conf.filter_same_gene
    """
    # old procedure
    # ref_gene_bed = annotation_dir + "/refGene.bed.gz"
    ref_exon_bed = annotation_dir + "/refExon.bed.gz"
    ens_gene_bed = annotation_dir + "/ensGene.bed.gz"
    ens_exon_bed = annotation_dir + "/ensExon.bed.gz"
    grch2ucsc_file = annotation_dir + "/grch2ucsc.txt"

    # relationship between GRCh and UCSC chromosome names
    grch2ucsc = {}
    with open(grch2ucsc_file, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            grch2ucsc[F[0]] = F[1]

    ref_gene_tb = pysam.TabixFile(ref_gene_bed)
    ref_exon_tb = pysam.TabixFile(ref_exon_bed)
    ens_gene_tb = pysam.TabixFile(ens_gene_bed)
    ens_exon_tb = pysam.TabixFile(ens_exon_bed)
    """

    annot_utils.gene.make_gene_info(outputFilePath + ".tmp.refGene.bed.gz",
                                    "refseq", genome_id, is_grc, False)
    annot_utils.gene.make_gene_info(outputFilePath + ".tmp.ensGene.bed.gz",
                                    "gencode", genome_id, is_grc, False)
    annot_utils.exon.make_exon_info(outputFilePath + ".tmp.refExon.bed.gz",
                                    "refseq", genome_id, is_grc, False)
    annot_utils.exon.make_exon_info(outputFilePath + ".tmp.ensExon.bed.gz",
                                    "gencode", genome_id, is_grc, False)

    ref_gene_tb = pysam.TabixFile(outputFilePath + ".tmp.refGene.bed.gz")
    ens_gene_tb = pysam.TabixFile(outputFilePath + ".tmp.ensGene.bed.gz")
    ref_exon_tb = pysam.TabixFile(outputFilePath + ".tmp.refExon.bed.gz")
    ens_exon_tb = pysam.TabixFile(outputFilePath + ".tmp.ensExon.bed.gz")

    for line in hIN:

        F = line.rstrip('\n').split('\t')

        # check gene annotation for the side 1
        gene1 = get_gene_info(F[0], F[1], ref_gene_tb, ens_gene_tb)

        # check gene annotation for the side 2
        gene2 = get_gene_info(F[3], F[4], ref_gene_tb, ens_gene_tb)

        # check exon-intron junction annotation for the side 1
        junction1 = get_junc_info(F[0], F[1], ref_exon_tb, ens_exon_tb,
                                  junction_margin)

        # check exon-intron junction annotation for the side 2
        junction2 = get_junc_info(F[3], F[4], ref_exon_tb, ens_exon_tb,
                                  junction_margin)

        sameGeneFlag = 0
        for g1 in gene1:
            for g2 in gene2:
                if g1 == g2 and g1 != "---": sameGeneFlag = 1

        if filter_same_gene == True and sameGeneFlag == 1: continue

        print('\t'.join(F[0:8]) + '\t' + ';'.join(gene1) + '\t' + ';'.join(junction1) + '\t' +
              ';'.join(gene2) + '\t' + ';'.join(junction2) + '\t' +
              F[11] + '\t' + F[12] + '\t' + F[16] + '\t' + F[17], file=hOUT)

    hIN.close()
    hOUT.close()

    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.refGene.bed.gz"])
    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.ensGene.bed.gz"])
    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.refExon.bed.gz"])
    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.ensExon.bed.gz"])

    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.refGene.bed.gz.tbi"])
    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.ensGene.bed.gz.tbi"])
    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.refExon.bed.gz.tbi"])
    subprocess.check_call(
        ["rm", "-rf", outputFilePath + ".tmp.ensExon.bed.gz.tbi"])
Example #2
    def setUp(self):

        self.tabix = pysam.TabixFile(self.filename)
        self.compare = load_and_convert(self.filename)
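load_and_convert is a pysam test helper not shown here; a rough sketch of what it might do, assuming it simply reads the bgzip-compressed source and splits each line into fields:

import gzip

def load_and_convert_sketch(filename):
    # read the bgzip-compressed tabix source file and return rows as lists of fields
    with gzip.open(filename, "rt") as handle:
        return [line.rstrip("\n").split("\t") for line in handle]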
Example #3
    def testManager(self):

        with pysam.TabixFile(self.filename) as tabixfile:
            tabixfile.fetch()
        self.assertEqual(tabixfile.closed, True)
Example #4
def getPairCoverRegionFromBam(inputBam, outputFilePath, inputTabixFile):
    """
        script for obtaining pair read information (mainly the end position, because it cannot be recovered from the BAM file directly)
    """
    ####################
    bamfile = pysam.Samfile(inputBam, "rb")
    tabixfile = pysam.TabixFile(inputTabixFile)
    hOUT = open(outputFilePath + ".tmp", "w")

    ID2info = {}
    tempChr = ""
    tempPos = 0
    checkPositionMargin = 10000000

    tabixErrorMsg = ""
    for read in bamfile.fetch():

        # when moving into a new region, fetch the keys from the tabix-indexed file
        if bamfile.getrname(
                read.tid) != tempChr or int(read.pos +
                                            1) > tempPos + checkPositionMargin:

            tempChr = bamfile.getrname(read.tid)
            tempPos = int(read.pos + 1) - 1

            ID2info = {}
            tabixErrorFlag = 0
            try:
                records = tabixfile.fetch(tempChr, tempPos,
                                          tempPos + checkPositionMargin)
            except Exception as inst:
                # print >> sys.stderr, "%s: %s" % (type(inst), inst.args)
                tabixErrorMsg = str(inst.args)
                tabixErrorFlag = 1

            if tabixErrorFlag == 0:
                for record in records:
                    splt_record = record.split('\t')
                    ID2info[splt_record[3]] = record

        flags = format(int(read.flag), '#014b')[:1:-1]
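        # the expression above: format(int(read.flag), '#014b') gives e.g. '0b000001100011',
        # and the [:1:-1] slice reverses the digits and drops '0b', so flags[i] is SAM flag bit i (LSB first)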

        # skip secondary and supplementary alignments
        if flags[8] == "1" or flags[11] == "1": continue

        # skip pairs where either read is unmapped
        if flags[2] == "1" or flags[3] == "1": continue

        seqID = (read.qname + "/1" if flags[6] == "1" else read.qname + "/2")

        if seqID in ID2info:
            print(ID2info[seqID] + "\t" + bamfile.getrname(read.tid) + ":" +
                  str(read.pos + 1) + "-" + str(read.aend) + "\t" +
                  str(read.mapq),
                  file=hOUT)

    if tabixErrorMsg != "":
        utils.warningMessage(
            "One or more error occured in tabix file fetch, e.g.: " +
            tabixErrorMsg)

    bamfile.close()
    tabixfile.close()
    hOUT.close()
    ####################

    ####################
    hOUT = open(outputFilePath, 'w')
    subprocess.call(["sort", "-k5n", outputFilePath + ".tmp"], stdout=hOUT)
    hOUT.close()
    ####################

    ####################
    subprocess.call(["rm", outputFilePath + ".tmp"])
Example #5
    def setUp(self):
        if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
            self.remote_file = None
        else:
            self.remote_file = pysam.TabixFile(self.url, "r")
        self.local_file = pysam.TabixFile(self.local, "r")
Example #6
    def _add_vcf_file_for_family_set(self,
                                     family_info_list,
                                     vcf_file_path,
                                     reference_populations=None,
                                     vcf_id_map=None,
                                     start_from_chrom=None,
                                     end_with_chrom=None):
        collections = {
            f['family_id']: self._db[f['coll_name']]
            for f in family_info_list
        }
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]
        number_of_families = len(family_info_list)
        sys.stderr.write(
            "Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n"
            % locals())

        for family in family_info_list:
            print("Indexing family: " + str(family))
            collection = collections[family['family_id']]
            collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # check whether some of the variants for this chromosome have been loaded already
        # if yes, start from the last loaded variant, and not from the beginning
        if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
            # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
            vcf_file = compressed_file(vcf_file_path)
            variant = next(
                vcf_stuff.iterate_vcf(vcf_file,
                                      genotypes=False,
                                      indiv_id_list=indiv_id_list,
                                      vcf_id_map=vcf_id_map))
            print(vcf_file_path + "  - chromosome: " + str(variant.chr))
            vcf_file.close()

            position_per_chrom = {}
            for chrom in range(1, 24):
                position_per_chrom[chrom] = defaultdict(int)
                for family in family_info_list:  #variants = collections[family['family_id']].find().sort([('xpos',-1)]).limit(1)
                    variants = list(collections[family['family_id']].find({
                        '$and': [{
                            'xpos': {
                                '$gte': chrom * 1e9
                            }
                        }, {
                            'xpos': {
                                '$lt': (chrom + 1) * 1e9
                            }
                        }]
                    }).sort([('xpos', -1)]).limit(1))
                    if len(variants) > 0:
                        position_per_chrom[chrom][family[
                            'family_id']] = variants[0]['xpos'] - chrom * 1e9
                    else:
                        position_per_chrom[chrom][family['family_id']] = 0

            for chrom in range(1, 24):
                position_per_chrom[chrom] = min(
                    position_per_chrom[chrom].values()
                )  # get the smallest last-loaded variant position for this chromosome across all families

            chr_idx = int(variant.xpos / 1e9)
            start_from_pos = int(position_per_chrom[chr_idx])

            print("Start from: %s - %s (%0.1f%% done)" %
                  (chr_idx, start_from_pos, 100. * start_from_pos /
                   CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
            tabix_file = pysam.TabixFile(vcf_file_path)
            vcf_iter = itertools.chain(
                tabix_file.header,
                tabix_file.fetch(variant.chr.replace("chr", ""),
                                 start_from_pos, int(2.5e8)))
        elif start_from_chrom or end_with_chrom:
            if start_from_chrom:
                print("Start chrom: chr%s" % start_from_chrom)
            if end_with_chrom:
                print("End chrom: chr%s" % end_with_chrom)

            chrom_list = list(map(str, range(1, 23))) + ['X', 'Y']
            chrom_list_start_index = 0
            if start_from_chrom:
                chrom_list_start_index = chrom_list.index(
                    start_from_chrom.replace("chr", "").upper())

            chrom_list_end_index = len(chrom_list)
            if end_with_chrom:
                chrom_list_end_index = chrom_list.index(
                    end_with_chrom.replace("chr", "").upper())

            tabix_file = pysam.TabixFile(vcf_file_path)
            vcf_iter = tabix_file.header
            for chrom in chrom_list[
                    chrom_list_start_index:chrom_list_end_index + 1]:
                print("Will load chrom: " + chrom)
                try:
                    vcf_iter = itertools.chain(vcf_iter,
                                               tabix_file.fetch(chrom))
                except ValueError as e:
                    print("WARNING: " + str(e))

        else:
            vcf_iter = vcf_file = compressed_file(vcf_file_path)
            # TODO handle case where it's one vcf file, not split by chromosome

        size = os.path.getsize(vcf_file_path)

        #progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError(
                        "%s has zero variants to insert. Should not be in buff."
                        % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                family_id = random.choice(list(buff.keys()))

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[
                        family_id]  # if no more variants for this family, delete it

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(
            list)  # will accumulate variants to be inserted all at once
        for variant in vcf_stuff.iterate_vcf(vcf_iter,
                                             genotypes=True,
                                             indiv_id_list=indiv_id_list,
                                             vcf_id_map=vcf_id_map):
            if variant.alt == "*":
                #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple()))
                continue

            try:
                annotation = self._annotator.get_annotation(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                try:
                    family_variant = variant.make_copy(
                        restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict,
                                                 annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(
                            family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        if not collection.find_one({
                                'xpos': family_variant.xpos,
                                'ref': family_variant.ref,
                                'alt': family_variant.alt
                        }):
                            family_id_to_variant_list[family[
                                'family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception as e:
                    sys.stderr.write(
                        "ERROR: on variant %s, family: %s - %s\n" %
                        (variant.toJSON(), family, e))
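The resume logic above hinges on the xpos encoding used throughout this loader, which packs chromosome number and position into a single sortable integer (chromosome number * 1e9 + position); a small illustrative sketch of that convention:

def xpos_encode_sketch(chrom_number, pos):
    # pack (chromosome number, position) into one sortable integer,
    # e.g. chromosome 2, position 12345 -> 2000012345
    return chrom_number * int(1e9) + pos

def xpos_decode_sketch(xpos):
    # invert the packing: 2000012345 -> (2, 12345)
    return xpos // int(1e9), xpos % int(1e9)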
Example #7
def torsid(variantlist, regiontext, build):
    """
    Parameters
    ----------
    variantlist : list
        List of variants in either rs id or other chr_pos, chr_pos_ref, chr_pos_ref_alt, chr_pos_ref_alt_build format.

    Returns
    -------
    rsidlist : list
        Corresponding rs id in the region if found.
        Otherwise returns '.'
    """
    
    if all(x=='.' for x in variantlist):
        raise InvalidUsage('No variants provided')

    variantlist = cleanSNPs(variantlist, regiontext, build)
    
    chrom, startbp, endbp = parseRegionText(regiontext, build)
    chrom = str(chrom).replace('23',"X")

    # Load dbSNP151 SNP names from region indicated
    dbsnp_filepath = ''
    suffix = 'b37'
    if build.lower() in ["hg38", "grch38"]:
        suffix = 'b38'
        dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh38p7', 'All_20180418.vcf.gz')
    else:
        suffix = 'b37'
        dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh37p13', 'All_20180423.vcf.gz')


    # Load dbSNP file
    tbx = pysam.TabixFile(dbsnp_filepath)
    print('Compiling list of known variants in the region from dbSNP151')
    chromcol = []
    poscol = []
    idcol = []
    refcol = []
    altcol = []
    rsid = dict({}) # chr_pos_ref_alt_build (keys) for rsid output (values)
    for row in tbx.fetch(str(chrom), startbp, endbp):
        rowlist = str(row).split('\t')
        chromi = rowlist[0].replace('chr','')
        posi = rowlist[1]
        idi = rowlist[2]
        refi = rowlist[3]
        alti = rowlist[4]
        varstr = '_'.join([chromi, posi, refi, alti, suffix])
        chromcol.append(chromi)
        poscol.append(posi)
        idcol.append(idi)
        refcol.append(refi)
        altcol.append(alti)
        rsid[varstr] = idi
        altalleles = alti.split(',') # could have more than one alt allele (multi-allelic)
        if len(altalleles)>1:
            varstr = '_'.join([chromi, posi, refi, altalleles[0], suffix])
            rsid[varstr] = idi
            for i in np.arange(len(altalleles)-1):
                varstr = '_'.join([chromi, posi, refi, altalleles[i+1], suffix])
                rsid[varstr] = idi
    
    finalvarlist = []
    for variant in variantlist:
        if not variant.startswith('rs'):
            try:
                finalvarlist.append(rsid[variant])
            except KeyError:
                finalvarlist.append('.')
        else:
            finalvarlist.append(variant)
    
    return finalvarlist
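A short standalone illustration of the varstr keys built in the loop above for a multi-allelic dbSNP row (all values below are hypothetical placeholders):

# hypothetical dbSNP VCF fields: CHROM, POS, ID, REF, ALT
rowlist = ["chr1", "1014143", "rs0000001", "C", "T,G"]
suffix = "b38"
chromi = rowlist[0].replace("chr", "")
posi, idi, refi, alti = rowlist[1], rowlist[2], rowlist[3], rowlist[4]

rsid = {"_".join([chromi, posi, refi, alti, suffix]): idi}   # '1_1014143_C_T,G_b38'
for alt in alti.split(","):                                  # one extra key per alt allele
    rsid["_".join([chromi, posi, refi, alt, suffix])] = idi  # '1_1014143_C_T_b38', '1_1014143_C_G_b38'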
Example #8
def main(args, pass_through_args):
    if cram_input(args.bams):
        if "-r" not in pass_through_args and not "--reference" in pass_through_args:
            sys.exit("ERROR: missing reference file required for CRAM. " +
                     "Use -r option. (Run `samplot.py -h` for more help)")
    global HTML
    global HERE

    vcf = pysam.VariantFile(args.vcf)
    vcf_samples = vcf.header.samples
    vcf_samples_set = set(vcf_samples)
    vcf_samples_list = list(vcf_samples)

    annotations = None
    if args.gff:
        annotations = pysam.TabixFile(args.gff)

    filters = [to_exprs(f) for f in args.filter]

    ped_samples = parse_ped(args.ped, vcf_samples)

    # this is empty unless we have a sample with both parents defined.
    dn_row = get_dn_row(ped_samples)

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    names_to_bams = get_names_to_bams(args.bams, args.sample_ids)
    important_regions = None
    if args.important_regions:
        important_regions = read_important_regions(args.important_regions)
    tabledata = []
    # user requested FORMAT fields to add to plot title
    format_field_ids = None
    if args.format:
        format_field_ids = args.format.split(",")

    out_file = sys.stdout
    if args.command_file:
        out_file = open(args.command_file, "w")

    for variant in vcf:
        svtype = variant.info.get("SVTYPE", "SV")
        if args.important_regions:
            if not var_in_important_regions(important_regions, variant.chrom,
                                            variant.start, variant.stop):
                continue

        if svtype in ("BND", "INS"):
            continue
        if variant.stop - variant.start > args.max_mb * 1000000:
            continue
        if variant.stop - variant.start < args.min_bp:
            continue

        gts = [s.get("GT", (None, None)) for s in variant.samples.values()]

        if sum(None in g
               for g in gts) >= args.min_call_rate * len(vcf_samples):
            continue
        if args.max_hets:
            # requisite hets/hom-alts
            if sum(sum(x) >= 1 for x in gts if not None in x) > args.max_hets:
                continue
        if not any(sum(x) > 0 for x in gts if not None in x):
            continue

        test_idxs = [
            i for i, gt in enumerate(gts) if not None in gt and sum(gt) > 0
        ]
        test_samples = [
            s for i, s in enumerate(variant.samples.values()) if i in test_idxs
        ]

        if len(filters) == 0:
            idxs = test_idxs
        else:
            idxs = []
            odict = make_single(dict(variant.info.items()))
            for i, ts in enumerate(test_samples):
                vdict = odict.copy()
                vdict.update(make_single(dict(ts.items())))

                if any(check_expr(vdict, fs) for fs in filters):
                    idxs.append(test_idxs[i])
        if len(idxs) == 0:
            continue
        is_dn = []

        # we call it a de novo if the sample passed the filters but the mom and
        # dad had homref genotypes before filtering.
        # so stringent filtering on the kid and lenient on parents.
        variant_samples = []
        for i in idxs:
            if vcf_samples[i] in names_to_bams:
                variant_samples.append(vcf_samples[i])
        if len(variant_samples) <= 0:
            continue

        bams = [names_to_bams[s] for s in variant_samples]
        if dn_row != "":
            test_sample_names = {s.name for s in test_samples}
            for variant_sample in variant_samples:
                sample = ped_samples[variant_sample]
                if sample.mom is None or sample.dad is None:
                    continue
                if not sample.mom.id in test_sample_names and not sample.dad.id in test_sample_names:
                    is_dn.append(sample.id)

        if len(is_dn) <= 0 and args.dn_only:
            continue

        # save these for the html.
        n_samples = len(variant_samples)
        # semi-colon delimited eases CSV export from HTML
        sample_str = ";".join(variant_samples)
        # dict holding sample to FORMAT title string
        plot_titles = dict()
        if format_field_ids:
            format_attrs = get_format_title(vcf_samples_list, format_field_ids,
                                            variant)
            plot_titles = make_plot_titles(variant_samples, format_attrs)

        # try to get family members
        if args.ped is not None:
            # do DN samples first so we can see parents.
            for variant_sample in is_dn + [
                    x for x in variant_samples if not x in is_dn
            ]:
                s = ped_samples.get(variant_sample)
                if s is None:
                    continue
                if s.mom is not None and not s.mom.id in variant_samples and s.mom.id in vcf_samples_set:
                    variant_samples.append("mom-of-%s[%s]" %
                                           (variant_sample, s.mom.id))
                    bams.append(names_to_bams[s.mom.id])
                if s.dad is not None and not s.dad.id in variant_samples and s.dad.id in vcf_samples_set:
                    variant_samples.append("dad-of-%s[%s]" %
                                           (variant_sample, s.dad.id))
                    bams.append(names_to_bams[s.dad.id])
                for kid in s.kids:
                    if not kid.id in variant_samples and kid.id in vcf_samples_set:
                        variant_samples.append("kid-of-%s[%s]" %
                                               (variant_sample, kid.id))
                        bams.append(names_to_bams[kid.id])
                    if args.max_hets:
                        if len(bams) > 1.5 * args.max_hets:
                            break
                if args.max_hets:
                    if len(bams) > 1.5 * args.max_hets:
                        break
        elif args.min_entries and len(bams) < args.min_entries:
            # extend with some controls:
            hom_ref_idxs = [
                i for i, gt in enumerate(gts)
                if len(gt) == 2 and gt[0] == 0 and gt[1] == 0
            ]
            if len(hom_ref_idxs) > 3:
                random.shuffle(hom_ref_idxs)
                hom_ref_idxs = hom_ref_idxs[:3]

            hom_ref_samples = []
            for i in hom_ref_idxs:
                if vcf_samples[i] in names_to_bams:
                    hom_ref_samples.append(vcf_samples[i])

            to_add_count = args.min_entries - len(bams)
            bams.extend(names_to_bams[s]
                        for s in hom_ref_samples[:to_add_count])
            variant_samples += [
                "control-sample:" + s for s in hom_ref_samples[:to_add_count]
            ]

        data_dict = {
            "chrom": variant.chrom,
            "start": variant.start,
            "end": variant.stop,
            "svtype": svtype,
            "svlength": variant.stop - variant.start,
            "samples": sample_str,
            "nsamples": n_samples,
        }
        if annotations:
            data_dict["overlaps"] = get_overlap(annotations, variant.chrom,
                                                variant.start, variant.stop)
        if dn_row != "":
            data_dict["dn"] = ",".join(is_dn)
        fig_path = os.path.join(
            args.out_dir, "{svtype}_{chrom}_{start}_{end}.{itype}".format(
                itype=args.output_type, **data_dict))
        tabledata.append(data_dict)

        if "CIPOS" in variant.info:
            v = variant.info["CIPOS"]
            cipos = "--start_ci '%s,%s'" % (abs(v[0]), abs(v[1]))
        else:
            cipos = ""
        if "CIEND" in variant.info:
            v = variant.info["CIEND"]
            ciend = "--end_ci '%s,%s'" % (abs(v[0]), abs(v[1]))
        else:
            ciend = ""
        # dynamically set Z to speed drawing and remove noise for larger events
        z = 3
        if variant.stop - variant.start > 2000:
            z = 4
        if variant.stop - variant.start > 10000:
            z = 6
        if variant.stop - variant.start > 20000:
            z = 9

        if args.max_entries:
            bams = bams[:args.max_entries]
            variant_samples = variant_samples[:args.max_entries]

        # update titles based on FORMAT fields requested
        title_list = list()
        for variant_sample in variant_samples:
            if variant_sample in plot_titles:
                title_list.append(plot_titles[variant_sample])
            else:
                title_list.append(variant_sample)

        out_file.write(
            "python {here}/samplot.py {extra_args} -z {z} --minq 0 -n {titles} {cipos} {ciend} {svtype} -c {chrom} -s {start} -e {end} -o {fig_path} -d 1 -b {bams}\n"
            .format(
                here=HERE,
                extra_args=" ".join(pass_through_args),
                bams=" ".join(bams),
                titles=" ".join(title_list),
                z=z,
                cipos=cipos,
                ciend=ciend,
                svtype="-t " + svtype if svtype != "SV" else "",
                fig_path=fig_path,
                chrom=variant.chrom,
                start=variant.start,
                end=variant.stop,
            ))

    if args.command_file:
        out_file.close()

    # update the javascript
    HTML = HTML.replace("[DATA]", json.dumps(tabledata))
    HTML = HTML.replace("[PLOT_TYPE]", args.output_type)
    HTML = HTML.replace("[GFF]", "true" if annotations else "false")
    HTML = HTML.replace("[DENOVO]", "true" if dn_row else "false")

    with open("{out_dir}/index.html".format(out_dir=args.out_dir), "w") as fh:
        print(HTML, file=fh)
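get_overlap is defined elsewhere in samplot; a minimal sketch of an annotation-overlap lookup against the tabix-indexed GFF handle opened above (illustrative only, assuming the desired output is a semicolon-joined list of overlapping records):

def get_overlap_sketch(annotations, chrom, start, stop):
    # query the tabix-indexed GFF for features overlapping the SV interval
    try:
        hits = list(annotations.fetch(chrom, start, stop))
    except ValueError:
        # contig missing from the annotation file (e.g. "chr" prefix mismatch)
        hits = []
    return ";".join(hits)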
Example #9
#! /usr/bin/env python

import sys, pysam

input_file1 = sys.argv[1]
input_file2 = sys.argv[2]
output_file = sys.argv[3]
id_file = sys.argv[4]

id_tb = pysam.TabixFile(id_file)

"""
key2id = {}
with open(id_file, 'r') as hin:
    for line in hin:
        F = line.rstrip('\n').split('\t')
        FF = F[3].split('|')
        for i in [-2, 2, -1, 1, 0]:
            for j in [-2, 2, -1, 1, 0]:
                key = F[0] + '\t' + str(int(F[1]) + i) + '\t' + str(int(F[2]) + j)
                if key not in key_list: continue
                key2id[F[0] + '\t' + F[1] + '\t' + F[2]] = FF[0]
"""


def check_id(chr, start, end):

    tabixErrorFlag = 0
    try:
        records = id_tb.fetch(chr, start - 5, end + 5)
    except Exception as inst:
Example #10
    def handle(
        self,
        file: str,
        organism: str,
        doi: str = None,
        ignore: str = None,
        cpu: int = 1,
        verbosity: int = 1,
        **options
    ):
        """Execute the main function."""
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        try:
            index_file = "{}.tbi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            try:
                index_file = "{}.csi".format(file)
                FileValidator().validate(index_file)
            except ImportingError:
                raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(
                filename=filename, source="GFF_SOURCE", doi=doi
            )
        except ImportingError as e:
            raise CommandError(e)

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()

        chunk_size = cpu * 2

        # Load the GFF3 file
        with open(file) as tbx_file:
            tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
            for row in tqdm(tbx.fetch(parser=pysam.asGTF()), total=get_num_lines(file)):
                if ignore is not None and row.feature in ignore:
                    continue
                tasks.append(
                    pool.submit(feature_file.store_tabix_GFF_feature, row, organism)
                )

                if len(tasks) >= chunk_size:
                    for task in as_completed(tasks):
                        try:
                            task.result()
                        except ImportingError as e:
                            raise CommandError(e)
                    tasks.clear()
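            # note: the else below belongs to the for loop above; it runs once the fetch loop
            # finishes, flushing any tasks still queued in the final partial chunk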
            else:
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()

        pool.shutdown()

        if verbosity > 0:
            self.stdout.write("Loading relationships")

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()

        for item in feature_file.relationships:
            tasks.append(
                pool.submit(
                    feature_file.store_relationship,
                    organism,
                    item["subject_id"],
                    item["object_id"],
                )
            )

        for task in tqdm(as_completed(tasks), total=len(tasks)):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pool.shutdown()

        if feature_file.ignored_attrs is not None:
            self.stdout.write(
                self.style.WARNING(
                    "Ignored attrs: {}".format(feature_file.ignored_attrs)
                )
            )

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
Example #11
def allc_to_bigwig(allc_path, output_prefix, bin_size, mc_contexts,
                   chrom_size_path, strandness):
    """\
    Generate BigWig files from one ALLC file.

    Parameters
    ----------
    allc_path
        {allc_path_doc}
    output_prefix
        Path prefix of the output BigWig file.
    bin_size
        {bw_bin_sizes_doc}
    mc_contexts
        {mc_contexts_doc}
    strandness
        {strandness_doc}
    chrom_size_path
        {chrom_size_path_doc}
        If chrom_size_path provided, will use it to extract ALLC with chrom order,
        but if region provided, will ignore this.
    """
    if strandness not in {"split", "both"}:
        raise ValueError(
            f'strandness need to be "split" or "both", got "{strandness}"')

    chrom_sizes = parse_chrom_size(chrom_size_path)
    chrom_sizes_list = [(k, v) for k, v in chrom_sizes.items()]

    # create bigwig file handles for each case
    # context_handle: key is mC context pattern like CHN, CAN, CGN, value is the output handle
    context_handle = {}
    output_path_collect = {}
    for bw_type in ["frac", "cov"]:
        out_suffix = f"{bw_type}.bw"
        for mc_context in mc_contexts:
            if strandness == "split":
                file_path = output_prefix + f".{mc_context}-Watson.{out_suffix}"
                output_path_collect[(mc_context, "Watson",
                                     out_suffix)] = file_path
                # handle for Watson/+ strand
                w_handle = pyBigWig.open(file_path, "w")
                w_handle.addHeader(chrom_sizes_list)
                context_handle[(mc_context, "+", bw_type)] = w_handle

                file_path = output_prefix + f".{mc_context}-Crick.{out_suffix}"
                output_path_collect[(mc_context, "Crick",
                                     out_suffix)] = file_path
                # handle for Crick/- strand
                c_handle = pyBigWig.open(file_path, "w")
                c_handle.addHeader(chrom_sizes_list)
                context_handle[(mc_context, "-", bw_type)] = c_handle
            else:
                # handle for both strand
                file_path = output_prefix + f".{mc_context}-{strandness}.{out_suffix}"
                output_path_collect[(mc_context, strandness,
                                     out_suffix)] = file_path
                _handle = pyBigWig.open(file_path, "w")
                _handle.addHeader(chrom_sizes_list)
                context_handle[mc_context, bw_type] = _handle

    def _init_counter(_contexts, _strandness):
        if _strandness == "split":
            # a counter for +/- strand separately
            _counter = StrandContextCounter(_contexts)
        else:
            # a counter for both +/- strands
            _counter = ContextCounter(_contexts)
        return _counter

    with pysam.TabixFile(allc_path) as allc:
        allc_chroms = set(allc.contigs)
        for chrom, chrom_size in chrom_sizes.items():
            if chrom not in allc_chroms:
                continue
            counter = _init_counter(mc_contexts, strandness)
            cur_bin = 0
            for line in allc.fetch(chrom):
                _, pos, strand, context, mc, cov, _ = line.split("\t")
                pos = int(pos)
                mc = float(mc)
                cov = float(cov)
                this_bin = (pos - 1) // bin_size
                if this_bin != cur_bin:
                    # dump cur_bin counts
                    bin_start = int(cur_bin * bin_size)
                    write_entry(
                        counter=counter,
                        context_handle=context_handle,
                        mc_contexts=mc_contexts,
                        strandness=strandness,
                        chrom=chrom,
                        bin_start=bin_start,
                        bin_size=bin_size,
                    )
                    # initiate next bin
                    cur_bin = this_bin
                    counter = _init_counter(mc_contexts, strandness)

                # add counts
                if strandness == "split":
                    counter.add(context, strand, mc, cov)
                else:
                    counter.add(context, mc, cov)

            # final bin of the chrom
            bin_start = int(cur_bin * bin_size)
            write_entry(
                counter=counter,
                context_handle=context_handle,
                mc_contexts=mc_contexts,
                strandness=strandness,
                chrom=chrom,
                bin_start=bin_start,
                bin_size=bin_size,
            )
            print(chrom, "finished")

    for handle in context_handle.values():
        handle.close()
    return output_path_collect
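A hypothetical invocation of the function above (file names are placeholders; it expects a tabix-indexed ALLC file and a chromosome sizes file):

out_paths = allc_to_bigwig(
    allc_path="sample.allc.tsv.gz",
    output_prefix="sample",
    bin_size=100,
    mc_contexts=["CGN", "CHN"],
    chrom_size_path="genome.chrom.sizes",
    strandness="both",
)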
Example #12
def annotate(in_vcf_gz_path: str, out_vcf_path: str, annot_bed_path: str):
    chroms = [f"chr{n}" for n in range(1, 23)]

    with pysam.TabixFile(in_vcf_gz_path) as in_vcf_file, pysam.TabixFile(
            annot_bed_path) as annot_bed_file, open(out_vcf_path,
                                                    "w") as out_vcf_file:
        # Make and write headers
        vcf_headers = in_vcf_file.header
        annot_key_str = annot_bed_file.header[0].split("=")[1]
        annot_info_header = f"##INFO=<ID=ANNOT,Key={annot_key_str}>"
        vcf_headers.append(annot_info_header)
        vcf_headers[-1], vcf_headers[-2] = (
            vcf_headers[-2],
            vcf_headers[-1],
        )  # Swap

        for vcf_header in vcf_headers:
            print(vcf_header, file=out_vcf_file)

        # Annotate by the input BED file
        for chrom in chroms:
            var_iter = in_vcf_file.fetch(chrom, parser=pysam.asTuple())
            bed_iter = annot_bed_file.fetch(chrom, parser=pysam.asTuple())
            bed_memory = deque()
            variant = next(var_iter, None)

            while variant is not None:
                var_pos = int(variant[1]) - 1  # 1-based -> 0-based
                var_ref = variant[3]
                var_alt = variant[4]

                # Determine a search region for BED coordinates
                if len(var_ref) == 1:  # Insertion or substitution
                    region_start = var_pos
                    region_end = (var_pos +
                                  2 if len(var_alt) > 1 else var_pos + 1)
                else:  # Deletion
                    region_start = var_pos + 1
                    region_end = region_start + len(var_ref) - 1

                # Get an annotation integer
                annot_int = 0
                stop_bed_iter = False

                # 1. Check the memory of previously checked BED coordinates
                while (len(bed_memory) > 0
                       and int(bed_memory[0][2]) <= region_start):
                    # Remove non-overlapped coordinates
                    bed_memory.popleft()

                for bed in bed_memory:
                    if int(bed[1]) < region_end:  # Overlap
                        annot_int |= int(bed[3])
                    else:
                        stop_bed_iter = True
                        break

                # 2. Continuously iterate over the BED coordinates and check
                if not stop_bed_iter:
                    bed = next(bed_iter, None)

                    while bed is not None:
                        bed_start = int(bed[1])
                        bed_end = int(bed[2])

                        if region_start < bed_end:
                            bed_memory.append(bed)
                            if bed_start < region_end:  # Overlap
                                annot_int |= int(bed[3])
                            else:
                                break

                        bed = next(bed_iter, None)

                print(str(variant) + f";ANNOT={annot_int}", file=out_vcf_file)
                variant = next(var_iter, None)
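The search-region rules above can be checked by hand; a small standalone sketch with made-up coordinates (var_pos is already 0-based, as in the loop above):

def var_region_sketch(var_pos, var_ref, var_alt):
    # mirror the region logic above: substitutions/insertions anchor on the REF base,
    # deletions cover the deleted bases after the anchor base
    if len(var_ref) == 1:  # insertion or substitution
        region_start = var_pos
        region_end = var_pos + 2 if len(var_alt) > 1 else var_pos + 1
    else:  # deletion
        region_start = var_pos + 1
        region_end = region_start + len(var_ref) - 1
    return region_start, region_end

print(var_region_sketch(99, "A", "G"))    # SNV at VCF position 100 -> (99, 100)
print(var_region_sketch(99, "A", "AT"))   # insertion -> (99, 101)
print(var_region_sketch(99, "ATG", "A"))  # deletion -> (100, 102)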
Example #13
def run(args):
    import snpCaller, indelCaller

    pool = mp.Pool(processes=args.cpu)

    if not args.output:
        args.output = os.getcwd()

    os.makedirs(args.output, exist_ok=True)

    end = None
    if not args.end:
        try:
            with open(args.ref + '.fai', 'r') as file:
                for line in file:
                    if line.split('\t')[0] == args.chrom:

                        end = int(line.split('\t')[1])

            if end == None:
                print('%s: contig %s not found in reference.' %
                      (str(datetime.datetime.now()), args.chrom),
                      flush=True)
                sys.exit(2)

        except FileNotFoundError:
            print('%s: Index file .fai required for reference genome file' %
                  (str(datetime.datetime.now())),
                  flush=True)
            sys.exit(2)

    else:
        end = args.end

    if not args.start:
        start = 1
    else:
        start = args.start

    threshold = [
        float(args.neighbor_threshold.split(',')[0]),
        float(args.neighbor_threshold.split(',')[1])
    ]

    dirname = os.path.dirname(__file__)

    if args.exclude_bed in ['hg38', 'hg19', 'mm10', 'mm39']:
        args.exclude_bed = os.path.join(
            dirname,
            'release_data/bed_files/%s_centro_telo.bed.gz' % args.exclude_bed)

    if args.include_bed:
        tbx = pysam.TabixFile(args.include_bed)
        include_intervals = IntervalTree(
            Interval(int(row[1]), int(row[2]), "%s" % (row[1]))
            for row in tbx.fetch(args.chrom, parser=pysam.asBed()))

        include_intervals = IntervalTree(include_intervals.overlap(start, end))

        if include_intervals:
            start = max(start, min(x[0] for x in include_intervals))
            end = min(end, max(x[1] for x in include_intervals))

        else:
            print(
                '%s: No overlap between include_bed file and start/end coordinates'
                % (str(datetime.datetime.now())),
                flush=True)
            return

    in_dict={'chrom':args.chrom, 'start':start, 'end':end, 'sam_path':args.bam, 'fasta_path':args.ref, \
             'mincov':args.mincov,  'maxcov':args.maxcov, 'min_allele_freq':args.min_allele_freq, 'min_nbr_sites':args.min_nbr_sites, \
             'threshold':threshold, 'snp_model':args.snp_model, 'cpu':args.cpu,  'vcf_path':args.output,'prefix':args.prefix,'sample':args.sample, \
            'seq':args.sequencing, 'supplementary':args.supplementary, 'include_bed':args.include_bed, 'exclude_bed':args.exclude_bed}

    snp_vcf = ''
    if args.mode in ['snps', 'snps_unphased', 'both']:
        snp_time = time.time()
        snp_vcf = snpCaller.test_model(in_dict, pool)
        print('\n%s: SNP calling completed for contig %s. Time taken= %.4f\n' %
              (str(datetime.datetime.now()), in_dict['chrom'],
               time.time() - snp_time),
              flush=True)

        if snp_vcf and args.mode in ['snps', 'both']:
            enable_whatshap = '--distrust-genotypes --include-homozygous' if args.enable_whatshap else ''

            print('\n%s: ------WhatsHap SNP phasing log------\n' %
                  (str(datetime.datetime.now())),
                  flush=True)

            run_cmd(
                "whatshap phase %s.vcf.gz %s -o %s.phased.preclean.vcf -r %s --ignore-read-groups --chromosome %s %s"
                % (snp_vcf, in_dict['sam_path'], snp_vcf,
                   in_dict['fasta_path'], in_dict['chrom'], enable_whatshap),
                verbose=True)

            run_cmd(
                "bcftools view -e  'GT=\"0\\0\"' %s.phased.preclean.vcf|bgziptabix %s.phased.vcf.gz"
                % (snp_vcf, snp_vcf))

            print('\n%s: ------SNP phasing completed------\n' %
                  (str(datetime.datetime.now())),
                  flush=True)

            if args.mode == 'both' or args.phase_bam:
                print('\n%s: ------WhatsHap BAM phasing log------\n' %
                      (str(datetime.datetime.now())),
                      flush=True)

                run_cmd(
                    "whatshap haplotag --ignore-read-groups --ignore-linked-read -o %s.phased.bam --reference %s %s.phased.vcf.gz %s --regions %s:%d:%d --tag-supplementary"
                    % (snp_vcf, in_dict['fasta_path'], snp_vcf,
                       in_dict['sam_path'], args.chrom, start, end),
                    verbose=True)

                run_cmd('samtools index %s.phased.bam' % snp_vcf)

                print('\n%s: ------BAM phasing completed-----\n' %
                      (str(datetime.datetime.now())),
                      flush=True)

        else:
            return

    if args.mode in ['indels', 'both']:

        sam_path = '%s.phased.bam' % snp_vcf if args.mode == 'both' else args.bam

        in_dict={'chrom':args.chrom, 'start':start, 'end':end, 'sam_path':sam_path, 'fasta_path':args.ref, \
             'mincov':args.mincov,  'maxcov':args.maxcov, 'min_allele_freq':args.min_allele_freq, 'min_nbr_sites':args.min_nbr_sites, \
             'threshold':threshold, 'snp_model':args.snp_model,'indel_model':args.indel_model, 'cpu':args.cpu,  'vcf_path':args.output,'prefix':args.prefix,'sample':args.sample, 'seq':args.sequencing, \
                'del_t':args.del_threshold,'ins_t':args.ins_threshold,'supplementary':args.supplementary, 'include_bed':args.include_bed\
                , 'exclude_bed':args.exclude_bed,'win_size':args.win_size,'small_win_size':args.small_win_size}
        ind_time = time.time()
        indel_vcf = indelCaller.test_model(in_dict, pool)

        print('%s: Post processing' % (str(datetime.datetime.now())),
              flush=True)

        run_cmd('samtools faidx %s %s > %s/%s.fa' %
                (args.ref, args.chrom, args.output, args.chrom))

        remove_path('%s/ref.sdf' % args.output)

        run_cmd('rtg RTG_MEM=4G format -f fasta %s/%s.fa -o %s/ref.sdf' %
                (args.output, args.chrom, args.output))

        remove_path('%s.vcf.gz' % indel_vcf)

        run_cmd(
            'rtg RTG_MEM=4G vcfdecompose -i %s.raw.vcf.gz --break-mnps -o - -t %s/ref.sdf|rtg RTG_MEM=4G vcffilter -i - --non-snps-only -o  %s.vcf.gz'
            % (indel_vcf, args.output, indel_vcf))

        print('%s: Indel calling completed for contig %s. Time taken= %.4f' %
              (str(datetime.datetime.now()), in_dict['chrom'],
               time.time() - ind_time),
              flush=True)

        if args.mode == 'both':

            if not args.keep_bam:
                os.remove('%s.phased.bam' % snp_vcf)

            final_path = os.path.join(args.output,
                                      '%s.final.vcf.gz' % args.prefix)
            run_cmd(
                'bcftools concat %s.phased.vcf.gz %s.vcf.gz -a -d all |bgziptabix %s'
                % (snp_vcf, indel_vcf, final_path))

    pool.close()
    pool.join()
Example #14
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-vcf", dest="input_vcf_file", type="string",
        help="input vcf file")

    parser.add_option(
        "-f", "--input-fasta", dest="input_fasta_file", type="string",
        help="input fasta file. faidx indexed reference sequence file to "
        "determine INDEL context [%default]")

    parser.add_option(
        "-e", "--input-bed", dest="input_bed_file", type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-r", "--region", dest="region", type="string",
        help="Region string to restrict analysis to. Takes precedence "
        "over --input-bed. [%default]")

    parser.add_option(
        "-m", "--method", dest="methods", action="append", type="choice",
        choices=("mutational-signature",
                 "mutational-signature-profile",
                 "kinship",
                 "format-distribution",
                 "gc-context",
                 "gc-depth-profile"),
        help="methods to apply [%default]")

    parser.add_option(
        "--format-distribution", dest="format_distributions", action="append",
        type="string",
        help="format to compute histograms on. Option can specified multiple times. "
        "At the moment, only integer metrics are supported [%default]")

    parser.add_option(
        "--format-distribution-nbins", dest="format_distributions_nbins", type="int",
        help="number of bins to use for histograms [%default]")

    parser.add_option(
        "--only-variant-positions", dest="only_variant_positions",
        action="store_true",
        help="only use variant positions [%default]")

    parser.add_option(
        "--gc-window-size", dest="gc_window_size", type="int",
        help="(half) window size to use for G+C computation. A size "
        "of 50 means that 50 bases on either side of the variant are "
        "used to compute the G+C content [%default]")

    parser.set_defaults(
        methods=[],
        input_vcf_file=None,
        input_bed_file=None,
        region=None,
        input_fasta_file=None,
        format_distributions=[],
        format_distributions_nbins=1000,
        gc_window_size=50,
        report_step=1000000,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 1:
        options.input_vcf_file = args[0]

    if options.input_vcf_file is None:
        raise ValueError("please supply a VCF file")

    if options.input_fasta_file is None:
        raise ValueError("please supply a FASTA file")

    if "format-distribution" in options.methods and not options.format_distributions:
        raise ValueError("please supply at least one FORMAT field (DP, GQ) "
                         "when --method=format-distribution has been selected")

    if not os.path.exists(options.input_vcf_file):
        raise OSError("input vcf file {} does not exist".format(
            options.input_vcf_file))

    if not os.path.exists(options.input_vcf_file + ".tbi") and not \
       os.path.exists(options.input_vcf_file + ".csi"):
        raise OSError("input vcf file {} needs to be indexed".format(
            options.input_vcf_file))

    if not os.path.exists(options.input_fasta_file):
        raise OSError("input fasta file {} does not exist".format(
            options.input_fasta_file))

    if not os.path.exists(options.input_fasta_file + ".fai"):
        raise OSError("input fasta file {} needs to be indexed".format(
            options.input_fasta_file))

    # update paths to absolute
    options.input_fasta_file = os.path.abspath(options.input_fasta_file)
    options.input_vcf_file = os.path.abspath(options.input_vcf_file)

    # catch issue with empty variant files
    try:
        vcf_in = pysam.VariantFile(options.input_vcf_file)
    except (OSError, ValueError):
        E.warn("could not open variant file - likely to be empty")
        E.stop()
        return 0

    fasta_in = pysam.FastaFile(options.input_fasta_file)

    if options.input_bed_file:
        if not os.path.exists(options.input_bed_file):
            raise OSError("input bed file {} does not exist".format(
                options.input_bed_file))
        bed_in = pysam.TabixFile(options.input_bed_file)
    else:
        bed_in = None

    vcf2stats_count(
        vcf_in, fasta_in, bed_in, options)

    E.stop()
Example #15
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-fastq-file",
                      dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("read-variant", "depth-vcf", "read-list",
                               "coverage-vcf"),
                      help="method to apply [%default]")

    parser.add_option(
        "-e",
        "--input-bed",
        dest="input_bed_file",
        type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-r",
        "--region-string",
        dest="region_string",
        type="string",
        help="region string. Only apply method in specified region. "
        "[%default]")

    parser.add_option("-f",
                      "--reference-fasta-file",
                      dest="reference_fasta_file",
                      help="reference genomic sequence in fasta format. "
                      "[%default]")

    parser.add_option("-s",
                      "--stepper",
                      dest="stepper",
                      type="choice",
                      choices=("nofilter", "samtools", "all"))

    parser.set_defaults(method="read-variant",
                        reference_fasta_file=None,
                        input_bed_file=None,
                        regex_sample_name="([^/]+).bam",
                        stepper="nofilter",
                        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    pysam_in = pysam.AlignmentFile(args[0], "rb")

    if options.input_bed_file:
        if not os.path.exists(options.input_bed_file):
            raise OSError("input bed file {} does not exist".format(
                options.input_bed_file))
        bed_in = pysam.TabixFile(options.input_bed_file)
    else:
        bed_in = None

    if options.region_string is not None:
        itr = generate_from_region(pysam_in,
                                   options.region_string,
                                   stepper=options.stepper)
    elif bed_in is not None:
        itr = generate_from_bed(pysam_in, bed_in, stepper=options.stepper)
    else:
        itr = generate_from_bam(pysam_in, stepper=options.stepper)

    reference_fasta = pysam.FastaFile(options.reference_fasta_file)

    outf = options.stdout
    c = E.Counter()

    if options.method == "read-variant":
        outf.write("chromosome\tposition\tref\ttypes\n")

        for pileupcolumn in itr:
            c.positions_pileup += 1
            reference_base = reference_fasta.fetch(
                pileupcolumn.reference_name, pileupcolumn.reference_pos,
                pileupcolumn.reference_pos + 1)
            matches = []
            bases = set()
            for read in pileupcolumn.pileups:
                qpos = read.query_position
                if qpos is not None:
                    base = read.alignment.query_sequence[qpos]
                else:
                    base = "-"

                matches.append((base, read.alignment.query_name))
                bases.add(base)

            bases = list(bases)
            if len(bases) == 1:
                c.position_noninformative += 1
                if bases[0] == reference_base:
                    c.position_reference += 1
                continue

            c.position_informative += 1

            d = {}
            for base in bases:
                d[base] = ",".join([x[1] for x in matches if x[0] == base])

            outf.write("{}\t{}\t{}\t{}\n".format(pileupcolumn.reference_name,
                                                 pileupcolumn.reference_pos,
                                                 reference_base,
                                                 json.dumps(d)))

    elif options.method in ("depth-vcf", "coverage-vcf"):
        if options.regex_sample_name:
            sample_name = re.search(options.regex_sample_name,
                                    args[0]).groups()[0]
        else:
            sample_name = "unknown"

        outf.write("##fileformat=VCFv4.1\n")
        outf.write("##FORMAT=<ID=GT,Number=1,Type=String,"
                   "Description=\"Genotype\">\n")
        outf.write("##FORMAT=<ID=DP,Number=1,Type=Integer,"
                   "Description=\"Genotype\">\n")
        outf.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\t"
                   "FILTER\tINFO\tFORMAT\t{}\n".format(sample_name))

        is_depth = options.method == "depth-vcf"

        for idx, pileupcolumn in enumerate(itr):

            if idx % 1000 == 0:
                E.info("processed {} positions".format(idx))

            reference_base = reference_fasta.fetch(
                pileupcolumn.reference_name, pileupcolumn.reference_pos,
                pileupcolumn.reference_pos + 1).upper()

            if reference_base == 'A':
                alt_base = 'C'
            else:
                alt_base = 'A'

            if is_depth:
                n = sum([
                    1 for x in pileupcolumn.pileups
                    if not (x.is_del or x.is_refskip)
                ])
            else:
                n = pileupcolumn.n

            outf.write("{}\t{}\t.\t{}\t{}\t.\tPASS\t.\tGT:DP\t0/1:{}\n".format(
                pileupcolumn.reference_name, pileupcolumn.reference_pos,
                reference_base, alt_base, n))

    elif options.method == "read-list":
        outf.write(
            "chromosome\tposition\treference_base\tbase\tquality\tquery_name\n"
        )

        for pileupcolumn in itr:
            reference_base = reference_fasta.fetch(
                pileupcolumn.reference_name, pileupcolumn.reference_pos,
                pileupcolumn.reference_pos + 1)
            matches = []
            for read in pileupcolumn.pileups:
                qpos = read.query_position
                if qpos is not None:
                    base = read.alignment.query_sequence[qpos]
                    quality = read.alignment.query_qualities[qpos]
                else:
                    base = "-"
                    quality = ""

                outf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    pileupcolumn.reference_name, pileupcolumn.reference_pos,
                    reference_base, base, quality, read.alignment.query_name))

    E.info(c)
    # write footer and output benchmark information.
    E.stop()
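
The `generate_from_region`, `generate_from_bed`, and `generate_from_bam` iterators used above are defined elsewhere in the script. As a hedged illustration of what the BED-driven variant might look like, here is a minimal sketch; the name and signature are taken from the call above, but the body is an assumption, not the script's actual code:

def generate_from_bed(pysam_in, bed_in, stepper="nofilter"):
    # Hypothetical sketch: walk each interval of the tabix-indexed BED file and
    # yield the pileup columns falling inside it, as consumed by the loops above.
    for bed in bed_in.fetch(parser=pysam.asBed()):
        for pileupcolumn in pysam_in.pileup(bed.contig, bed.start, bed.end,
                                            stepper=stepper, truncate=True):
            yield pileupcolumn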
Example #16
    def __init__(self, path, binned):
        self._tabixfile = pysam.TabixFile(path)
        self._binned = binned
def gm_main(path, region_file):
    simple_file_name = os.path.basename(region_file)
    simple_file_name = os.path.splitext(simple_file_name)[0]

    regions = get_chrm_start_end(region_file)
    # print(regions[0])
    # for each file in path
    for bed_file in os.listdir(path):

        if bed_file.endswith('bed.gz'):
            # file outputs
            # intermediate quality
            intermediates_path = '/scratch/Shares/layer/nextflow/kristen/fastq_to_vcf/mpileup/chco-exome-analysis/intermediates'
            intermediate_Q_name = bed_file + simple_file_name + 'quality_intermediate.txt'
            intermediate_Q_txt = open(
                os.path.join(intermediates_path, intermediate_Q_name), 'a')
            intermediate_Q_txt.truncate(0)
            intermediate_SB_name = bed_file + simple_file_name + 'strand_bias_intermediate.txt'
            intermediate_SB_txt = open(
                os.path.join(intermediates_path, intermediate_SB_name), 'a')
            intermediate_SB_txt.truncate(0)

            # final quality
            finals_path = '/scratch/Shares/layer/nextflow/kristen/fastq_to_vcf/mpileup/chco-exome-analysis/final_metrics'
            final_Q_name = bed_file + simple_file_name + 'quality_final.txt'
            final_Q_txt = open(os.path.join(finals_path, final_Q_name), 'a')
            final_Q_txt.truncate(0)
            final_SB_name = bed_file + simple_file_name + 'strand_bias_final.txt'
            final_SB_txt = open(os.path.join(finals_path, final_SB_name), 'a')
            final_SB_txt.truncate(0)

            for r in range(len(regions)):
                chrm = regions[r][0]
                start = int(regions[r][1])
                end = int(regions[r][2])
                tbx = pysam.TabixFile(path + '/' + bed_file)

                # list to hold mpileup quality counts
                quality = get_quality(tbx, chrm, start, end)
                for q in quality:
                    print(q[0],
                          '\t',
                          q[1],
                          '\t',
                          q[2],
                          '\t',
                          np.average(q[3]),
                          file=intermediate_Q_txt)

                reads = get_reads(tbx, chrm, start, end)
                counts = get_counts(reads)
                strand_bias = get_strandbias(counts)
                if strand_bias == -1: pass
                else:
                    for sb in strand_bias:
                        print(sb[0][0],
                              '\t',
                              sb[0][1],
                              '\t',
                              sb[0][2],
                              '\t',
                              sb[1],
                              file=intermediate_SB_txt)

            intermediate_Q_txt.close()
            intermediate_SB_txt.close()
            geno_to_exo_main(
                region_file,
                os.path.join(intermediates_path, intermediate_Q_name),
                os.path.join(finals_path, final_Q_name))
            geno_to_exo_main(
                region_file,
                os.path.join(intermediates_path, intermediate_SB_name),
                os.path.join(finals_path, final_SB_name))

        else:
            continue
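
`get_chrm_start_end`, `get_quality`, `get_reads`, `get_counts`, `get_strandbias`, and `geno_to_exo_main` are project helpers not shown here. A minimal sketch of the region parser, assuming the region file is a BED-like, tab-separated file (an assumption; the real helper may differ):

def get_chrm_start_end(region_file):
    # Hypothetical sketch: return a list of [chrom, start, end] records read from
    # a BED-like region file, matching how gm_main() indexes regions[r][0..2].
    regions = []
    with open(region_file) as fh:
        for line in fh:
            fields = line.rstrip('\n').split('\t')
            if len(fields) >= 3 and not line.startswith('#'):
                regions.append([fields[0], fields[1], fields[2]])
    return regions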
Example #18
def filterNonMatchControl(inputFilePath, outputFilePath, controlFile,
                          matchedNormal, controlPanel_num_thres,
                          controlPanel_check_margin):
    """
        script for removing candidate in which 
        non-matched normals have the junction reads
    """

    hIN = open(inputFilePath, 'r')
    hOUT = open(outputFilePath, 'w')

    use_control = True if controlFile != "" else False
    if use_control == True: tabixfile = pysam.TabixFile(controlFile)

    tabixErrorMsg = ""
    for line in hIN:
        F = line.rstrip('\n').split('\t')

        controlFlag = 0
        max_control_sample = "---"
        max_control_num = 0

        if use_control == True:

            inseqSize = (0 if F[7] == "---" else len(F[7]))

            ####################
            # get the records for control junction data for the current position
            tabixErrorFlag = 0
            try:
                records = tabixfile.fetch(
                    F[0],
                    int(F[1]) - controlPanel_check_margin,
                    int(F[2]) + controlPanel_check_margin)
            except Exception as inst:
                # print >> sys.stderr, "%s: %s" % (type(inst), inst.args)
                tabixErrorMsg = str(inst.args)
                tabixErrorFlag = 1
            ####################

            ####################
            # for each record in control junction extracted, check the consistency with the current junction
            # max_control_sample = "---"
            # max_control_num = 0
            if tabixErrorFlag == 0:
                for record_line in records:
                    record = record_line.split('\t')

                    if (F[0] == record[0] and F[3] == record[3] and
                            F[8] == record[8] and F[9] == record[9]):

                        flag = 0
                        # detailed check on the junction position considering inserted sequences
                        if F[8] == "+":
                            expectedDiffSize = (int(F[2]) - int(record[2])) + (
                                inseqSize - int(record[7]))
                            if (F[9] == "+" and int(F[5]) == int(record[5]) -
                                    int(expectedDiffSize)) or (
                                        F[9] == "-"
                                        and int(F[5]) == int(record[5]) +
                                        int(expectedDiffSize)):
                                flag = 1
                        else:
                            expectedDiffSize = (int(F[2]) - int(record[2])) + (
                                int(record[7]) - inseqSize)
                            if (F[9] == "+" and int(F[5]) == int(record[5]) +
                                    int(expectedDiffSize)) or (
                                        F[9] == "-"
                                        and int(F[5]) == int(record[5]) -
                                        int(expectedDiffSize)):
                                flag = 1

                        # if position relationship including inserted sequences matches
                        if flag == 1:
                            controlSamples = record[10].split(';')
                            controlNums = record[11].split(';')

                            for i in range(0, len(controlSamples)):
                                if controlSamples[i] == matchedNormal: continue

                                if int(controlNums[i]) > max_control_num:
                                    max_control_sample = controlSamples[i]
                                    max_control_num = int(controlNums[i])

                                if int(controlNums[i]) >= int(
                                        controlPanel_num_thres):
                                    controlFlag = 1
                                """
                                # if controlSamples[i] != matchedNormal is not None and int(controlNums[i]) >= int(controlPanel_num_thres):
                                # if controlSamples[i] != matchedNormal and int(controlNums[i]) >= int(supportReadThres):
                                    controlFlag = 1
                                    if int(controlNums[i]) > max_control_num:
                                        max_control_sample = controlSamples[i]
                                        max_control_num = int(controlNums[i]) 
                                """

            ####################

        if controlFlag == 0:
            print >> hOUT, "\t".join(
                F) + '\t' + max_control_sample + '\t' + str(max_control_num)

    if tabixErrorMsg != "":
        utils.warningMessage(
            "One or more error occured in tabix file fetch, e.g.: " +
            tabixErrorMsg)

    hIN.close()
    hOUT.close()
    if use_control == True: tabixfile.close()
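
A hedged usage sketch; the file names and thresholds below are illustrative only:

# Illustrative call: drop candidates supported by >= 3 junction reads in any
# non-matched control sample, allowing +/- 5 bp slack around the breakpoints.
# The control panel file is assumed to be bgzip-compressed and tabix-indexed.
filterNonMatchControl("candidates.junction.txt",
                      "candidates.junction.control_filtered.txt",
                      "control_panel.junction.bed.gz",
                      "matched_normal_sample_id",
                      controlPanel_num_thres=3,
                      controlPanel_check_margin=5)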
Example #19
def standardizeSNPsV2(variantlist, regiontxt, build):
    """
    Input: Variant names in any of these formats: rsid, chrom_pos_ref_alt, chrom:pos_ref_alt, chrom:pos_ref_alt_b37/b38 
    Output: chrom_pos_ref_alt_b37/b38 variant ID format, but looks at GTEx variant lookup table first.
    In the case of multi-allelic variants (e.g. rs2211330(T/A,C)), formats such as 1_205001063_T_A,C_b37 are accepted
    If variant ID format is chr:pos, and the chr:pos has a unique biallelic SNV, then it will be assigned that variant
    """
    
    if all(x=='.' for x in variantlist):
        raise InvalidUsage('No variants provided')
    
    if np.nan in variantlist:
        raise InvalidUsage('Missing variant IDs detected in row(s): ' + str([ i+1 for i,x in enumerate(variantlist) if str(x) == 'nan' ]))
    
    # Ensure valid region:
    chrom, startbp, endbp = parseRegionText(regiontxt, build)
    chrom = str(chrom).replace('23',"X")
    
    # Load GTEx variant lookup table for region indicated
    db = client.GTEx_V7
    rsid_colname = 'rs_id_dbSNP147_GRCh37p13'
    if build.lower() in ["hg38", "grch38"]:
        db = client.GTEx_V8
        rsid_colname = 'rs_id_dbSNP151_GRCh38p7'
    collection = db['variant_table']
    variants_query = collection.find(
        { '$and': [ 
            { 'chr': int(chrom.replace('X','23')) }, 
            { 'variant_pos': { '$gte': int(startbp), '$lte': int(endbp) } } 
            ]}
        )
    variants_list = list(variants_query)
    variants_df = pd.DataFrame(variants_list)
    variants_df = variants_df.drop(['_id'], axis=1)
    

    # Load dbSNP151 SNP names from region indicated
    dbsnp_filepath = ''
    suffix = 'b37'
    if build.lower() in ["hg38", "grch38"]:
        suffix = 'b38'
        dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh38p7', 'All_20180418.vcf.gz')
    else:
        suffix = 'b37'
        dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh37p13', 'All_20180423.vcf.gz')
    
    
    # Load dbSNP file
    #delayeddf = delayed(pd.read_csv)(dbsnp_filepath,skiprows=getNumHeaderLines(dbsnp_filepath),sep='\t')
    #dbsnp = dd.from_delayed(delayeddf)
    tbx = pysam.TabixFile(dbsnp_filepath)
    print('Compiling list of known variants in the region from dbSNP151')
    chromcol = []
    poscol = []
    idcol = []
    refcol = []
    altcol = []
    variantid = [] # in chr_pos_ref_alt_build format
    rsids = dict({}) # a multi-allelic variant rsid (key) can be represented in several variantid formats (values)
    for row in tbx.fetch(str(chrom), startbp, endbp):
        rowlist = str(row).split('\t')
        chromi = rowlist[0].replace('chr','')
        posi = rowlist[1]
        idi = rowlist[2]
        refi = rowlist[3]
        alti = rowlist[4]
        varstr = '_'.join([chromi, posi, refi, alti, suffix])
        chromcol.append(chromi)
        poscol.append(posi)
        idcol.append(idi)
        refcol.append(refi)
        altcol.append(alti)
        variantid.append(varstr)
        rsids[idi] = [varstr]
        altalleles = alti.split(',') # could have more than one alt allele (multi-allelic)
        if len(altalleles)>1:
            varstr = '_'.join([chromi, posi, refi, altalleles[0], suffix])
            rsids[idi].append(varstr)
            for i in np.arange(len(altalleles)-1):
                varstr = '_'.join([chromi, posi, refi, altalleles[i+1], suffix])
                rsids[idi].append(varstr)
    
    print('Cleaning and mapping list of variants')
    variantlist = [asnp.split(';')[0].replace(':','_').replace('.','') for asnp in variantlist] # cleaning up the SNP names a bit
    stdvariantlist = []
    for variant in variantlist:
        if variant == '':
            stdvariantlist.append('.')
            continue
        variantstr = variant.replace('chr','')
        if re.search("^23_",variantstr): variantstr = variantstr.replace('23_','X_',1)
        if variantstr.startswith('rs'):
            try:
                # Here's the difference from the first function version (we look at GTEx first)
                if variant in list(variants_df[rsid_colname]):
                    stdvar = variants_df['variant_id'].loc[ variants_df[rsid_colname] == variant].to_list()[0]
                    stdvariantlist.append(stdvar)
                else:
                    stdvariantlist.append(rsids[variantstr][0])
            except:
                stdvariantlist.append('.')
        elif re.search("^\d+_\d+_[A,T,G,C]+_[A,T,C,G]+,*", variantstr.replace('X','23')):
            strlist = variantstr.split('_')
            strlist = list(filter(None, strlist)) # remove empty strings
            try:
                achr, astart, aend = parseRegionText(strlist[0]+":"+strlist[1]+"-"+str(int(strlist[1])+1), build)
                achr = str(achr).replace('23','X')
                if achr == str(chrom) and astart >= startbp and astart <= endbp:
                    variantstr = variantstr.replace("_"+str(suffix),"") + "_"+str(suffix)
                    if len(variantstr.split('_')) == 5:
                        stdvariantlist.append(variantstr)
                    else:
                        raise InvalidUsage(f'Variant format not recognizable: {variant}. Is it from another coordinate build system?', status_code=410)
                else:
                    stdvariantlist.append('.')
            except:
                raise InvalidUsage(f'Problem with variant {variant}', status_code=410)
        elif re.search("^\d+_\d+_*[A,T,G,C]*", variantstr.replace('X','23')):
            strlist = variantstr.split('_')
            strlist = list(filter(None, strlist)) # remove empty strings
            try:
                achr, astart, aend = parseRegionText(strlist[0]+":"+strlist[1]+"-"+str(int(strlist[1])+1), build)
                achr = str(achr).replace('23','X')
                if achr == str(chrom) and astart >= startbp and astart <= endbp:
                    if len(strlist)==3:
                        aref=strlist[2]
                    else:
                        aref=''
                    stdvariantlist.append(fetchSNV(achr, astart, aref, build))
                else:
                    stdvariantlist.append('.')
            except:
                raise InvalidUsage(f'Problem with variant {variant}', status_code=410)
        else:
            raise InvalidUsage(f'Variant format not recognized: {variant}', status_code=410)
    return stdvariantlist
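
`parseRegionText`, `fetchSNV`, `InvalidUsage`, `client`, and `MYDIR` belong to the surrounding application. For orientation, a minimal sketch of what `parseRegionText` might do, assuming input such as '1:205000000-206000000' and the X-as-23 convention used above (the signature is inferred from the calls; the body is an assumption):

import re

def parseRegionText(regiontxt, build):
    # Hypothetical sketch: parse "chrom:start-end" into (chrom, start, end),
    # mapping X to 23 so callers can treat the chromosome numerically.
    # `build` is accepted for interface compatibility; a fuller version would
    # validate the coordinates against that build's chromosome lengths.
    m = re.match(r'^(?:chr)?([0-9Xx]+)[:_]([0-9,]+)[-_]([0-9,]+)$', regiontxt.strip())
    if m is None:
        raise InvalidUsage(f'Invalid region: {regiontxt}', status_code=410)
    chrom = int(m.group(1).upper().replace('X', '23'))
    startbp = int(m.group(2).replace(',', ''))
    endbp = int(m.group(3).replace(',', ''))
    if endbp <= startbp:
        raise InvalidUsage(f'Region end must exceed start: {regiontxt}', status_code=410)
    return chrom, startbp, endbp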
Example #20
def get_part_from_gtf(annotation, reference=None, feature="CDS"):   
    tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
    return [gtf for gtf in tabixfile.fetch(reference=reference) if (gtf.feature == feature)]
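
For example, collecting all CDS records on chr1 from a bgzip-compressed, tabix-indexed GTF might look like this (the file name is illustrative):

cds_records = get_part_from_gtf("annotation.sorted.gtf.gz", reference="chr1", feature="CDS")
for gtf in cds_records[:5]:
    # GTFProxy objects expose the standard columns plus parsed attributes.
    print(gtf.contig, gtf.start, gtf.end, gtf.gene_id)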
Example #21
def main(argv):
    prog = "paleomix vcf_to_fasta"
    usage = "%s [options] --genotype in.vcf --intervals in.bed" % (prog, )

    parser = argparse.ArgumentParser(prog=prog, usage=usage)
    parser.add_argument(
        "--genotype",
        required=True,
        metavar="VCF",
        help="Tabix indexed VCF file; by default the first "
        "sample is used in multi-sample VCFs. Use "
        "--nth-sample option to select another sample.",
    )
    parser.add_argument(
        "--nth-sample",
        default=1,
        type=int,
        metavar="NTH",
        help="Use Nth sample from the VCF, with the first "
        "sample numbered '1' [default: %(default)s].",
    )
    parser.add_argument(
        "--intervals",
        metavar="BED",
        help="Six column BED file; sequences on the same "
        "contig with the same name are assumed to "
        "represent the same gene, and are merged into a "
        "single contiguous FASTA sequence.",
    )
    parser.add_argument(
        "--padding",
        type=int,
        default=10,
        help="Number of bases to expand intervals, when "
        "checking for adjacent indels [%(default)s]",
    )
    parser.add_argument(
        "--whole-codon-indels-only",
        action="store_true",
        default=False,
        help="If true, only indels where (length %% 3) == 0 "
        "are retained [%(default)s]",
    )
    parser.add_argument(
        "--ignore-indels",
        action="store_true",
        default=False,
        help="Do not include indels generated FASTA "
        "sequence [%(default)s].",
    )

    opts = parser.parse_args(argv)

    print("Running vcf_to_fasta", end="", file=sys.stderr)
    if opts.whole_codon_indels_only:
        print(", assuming sequences represents CDS", end="", file=sys.stderr)
    print(file=sys.stderr)

    if not os.path.exists(opts.genotype):
        sys.stderr.write("ERROR: VCF file does not exist.\n")
        return 1
    elif not os.path.exists(opts.genotype + ".tbi"):
        sys.stderr.write("ERROR: VCF file not tabix indexed.\n")
        sys.stderr.write('       To index, run "tabix -p vcf <filename>".\n')
        return 1
    elif opts.nth_sample < 1:
        sys.stderr.write(
            "ERROR: --nth-sample uses 1-based offsets, zero and\n")
        sys.stderr.write("       negative values are not allowed!\n")
        return 1

    # Relevant VCF functions uses zero-based offsets
    opts.nth_sample -= 1

    genotype = pysam.TabixFile(opts.genotype)

    if opts.intervals is None:
        intervals = parse_intervals(genotype)
    else:
        intervals = read_intervals(opts.intervals)

    if intervals is None:
        return 1

    if not check_nth_sample(opts, genotype):
        return 1

    return genotype_genes(opts, intervals, genotype)
Example #22
def annotate_peaks(peaks, gtf_gz, gtf_index, cfg_dict, q, idx, attributes,
                   logger_options):
    """ 
		Input: 
		peaks (list): List of dictionaries containing information on peaks to annotate (see function 'annotate_single_peak')
		gtf_gz (str): Path to gtf.gz file
		gtf_index (str): Path to gtf.gz index 
		cfg_dict (dict): The loaded config containing queries
		q (Queue): The queue to put annotations into
		idx (int): The order in which the annotations should be written to output
		attributes (list): A list of attribute columns to write to output,
		logger_options (dict): A dict for initializing UROPALogger
	"""

    logger = UROPALogger(**logger_options)

    #Open tabix file
    tabix_obj = pysam.TabixFile(gtf_gz, index=gtf_index)

    #For each peak in input peaks, collect all_valid_annotations
    logger.debug("Annotating peaks in chunk {0}".format(idx))
    all_valid_annotations = []
    for peak in peaks:

        #Annotate single peak
        valid_annotations = annotate_single_peak(peak,
                                                 tabix_obj,
                                                 cfg_dict,
                                                 logger=logger)
        all_valid_annotations.extend(valid_annotations)

    tabix_obj.close()

    #Write annotations to best hits and final hits
    logger.debug(
        "Annotated all peaks in chunk {0}. Now adding contents to queue...".
        format(idx))
    content = "\n".join([
        annopeak_to_string(peak, attributes=attributes)
        for peak in all_valid_annotations
    ]) + "\n"
    q.put(("allhits.bed", idx, content))
    q.put(("allhits.txt", idx, content))
    content = ""

    finalhits_content = "\n".join([
        annopeak_to_string(peak, attributes=attributes)
        for peak in all_valid_annotations if peak.get("best_hit", 0) == 1
    ]) + "\n"
    q.put(("finalhits.bed", idx, finalhits_content))
    q.put(("finalhits.txt", idx, finalhits_content))
    finalhits_content = ""

    ## Hits per query if chosen
    if cfg_dict["output_by_query"] == True:
        query_names = [query["name"] for query in cfg_dict["queries"]]
        for name in query_names:
            query_str = "\n".join([
                annopeak_to_string(peak, attributes=attributes)
                for peak in all_valid_annotations
                if peak.get("query_name", "") == name
            ]) + "\n"
            q.put((name + ".bed", idx, query_str))
            q.put((name + ".txt", idx, query_str))

    logger.debug("Job finished for chunk {0}".format(idx))
    return (0)  #success
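
`annotate_peaks` above is written to run in worker processes that push `(filename, chunk_index, content)` tuples onto a shared queue. A rough driver sketch under that assumption (the chunking and pool setup here are illustrative, not the package's actual dispatch code):

import multiprocessing as mp

def run_annotation_in_chunks(peaks, gtf_gz, gtf_index, cfg_dict, attributes,
                             logger_options, n_chunks=4):
    # Hypothetical driver: annotate chunks of peaks in parallel and collect the
    # queued output fragments for a downstream writer.
    manager = mp.Manager()
    q = manager.Queue()
    chunks = [peaks[i::n_chunks] for i in range(n_chunks)]
    pool = mp.Pool(n_chunks)
    for idx, chunk in enumerate(chunks):
        pool.apply_async(annotate_peaks, (chunk, gtf_gz, gtf_index, cfg_dict,
                                          q, idx, attributes, logger_options))
    pool.close()
    pool.join()
    results = []
    while not q.empty():
        results.append(q.get())
    return results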
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-vcf',
        help='Allsites vcf to apply filters to and get callable sites',
        required=True)
    parser.add_argument('-bed',
                        '--bed_repeats',
                        help='BED file with repeat regions listed',
                        required=True)
    parser.add_argument('-ar_bed',
                        '--ar_bed',
                        help='BED file of ancestral repeats',
                        default='None')
    parser.add_argument(
        '-DF',
        '--DepthFilter',
        help=
        'Defines abnormal depth, e.g. 2 means depth above twice or below half the mean depth is abnormal',
        default=2.0,
        type=float)
    parser.add_argument('-mean_depth',
                        '--mean_depth',
                        help='Mean coverage depth of samples',
                        default=44.0)
    parser.add_argument('-N',
                        '--no_individuals',
                        help='Number of individuals in allsites VCF',
                        type=float,
                        default=10.0)
    parser.add_argument(
        '-chr',
        help=
        'Specifies chromosome to extract callable sites for, if ALL will run a job for each, '
        '-chr ALL can only be specified in conjunction with -sub',
        default='ALL')
    parser.add_argument(
        '-pol',
        help=
        'If specified will check if site can be polarised, takes a wga bed file',
        default='None')
    parser.add_argument('-out',
                        help='Output directory and prefix',
                        required=True)
    parser.add_argument('-evolgen',
                        help='If specified will run on lab queue',
                        action='store_true',
                        default=False)
    parser.add_argument('-sub',
                        help='If specified will submit itself to cluster',
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    # variables
    all_sites = args.vcf
    repeat_bed = args.bed_repeats
    line_bed = args.ar_bed
    filter_factor = args.DepthFilter
    all_data_mean_depth = float(args.mean_depth)
    no_indiv = args.no_individuals
    chromosome = args.chr
    pol = args.pol
    out = args.out
    fasta_out = out + '.' + chromosome + '.fa'
    evolgen = args.evolgen

    # submission loop
    if args.sub is True:
        if chromosome == 'ALL':

            # gen chromo list and submit job for each
            grep_cmd = (
                'zcat ' + all_sites +
                ' | head -n 20000 | grep ^##contig | cut -d "," -f 1 | cut -d "=" -f 3 | grep -v ^NODE'
            )
            chromo_list = subprocess.Popen(grep_cmd, stdout=subprocess.PIPE, shell=True)\
                .communicate()[0].split('\n')[:-1]
            output_fasta_list = []
            jid_list = []
            for chromo in chromo_list:
                output_fasta_list.append(out + '.' + chromo + '.fa')
                jid = 'callsites_' + chromo + '.sh'
                jid_list.append(jid)
                command_line = ('callable_sites_from_vcf.py '
                                '-vcf ' + all_sites + ' '
                                '-bed ' + repeat_bed + ' '
                                '-ar_bed ' + line_bed + ' '
                                '-DF ' + str(filter_factor) + ' '
                                '-mean_depth ' + str(all_data_mean_depth) + ' '
                                '-N ' + str(no_indiv) + ' '
                                '-chr ' + chromo + ' '
                                '-pol ' + pol + ' '
                                '-out ' + out)
                q_sub([command_line],
                      out + '.' + chromo,
                      jid=jid,
                      evolgen=evolgen,
                      t=48)

            # cat job for final output
            cat_cmd = 'cat ' + ' '.join(output_fasta_list) + ' > ' + fasta_out
            q_sub([cat_cmd], out + 'cat', evolgen=evolgen, hold=jid_list)
            sys.exit()

        else:
            # submit script for chromosome
            command_line = ('callable_sites_from_vcf.py '
                            '-vcf ' + all_sites + ' '
                            '-bed ' + repeat_bed + ' '
                            '-ar_bed ' + line_bed + ' '
                            '-DF ' + str(filter_factor) + ' '
                            '-mean_depth ' + str(all_data_mean_depth) + ' '
                            '-N ' + str(no_indiv) + ' '
                            '-chr ' + chromosome + ' '
                            '-pol ' + pol + ' '
                            '-out ' + out)
            q_sub([command_line], out, evolgen=evolgen, t=48)
            sys.exit()

    # catch -all specified without -sub
    if args.chr == 'ALL' and args.sub is False:
        sys.exit('"-chr ALL" can only be run in conjunction with "-sub"')

    # calculate depth cutoffs
    lower_depth_limit = all_data_mean_depth / filter_factor
    upper_depth_limit = all_data_mean_depth * filter_factor

    repeats = set()
    # get bed regions per chromo
    for x in open(repeat_bed):
        if x.split()[0] == chromosome:
            repeats |= {y for y in range(int(x.split()[1]), int(x.split()[2]))}

    lines = set()
    # get bed regions per chromo
    if line_bed != 'None':
        for x in gzip.open(line_bed):
            if x.split()[0] == chromosome:
                lines |= {
                    y
                    for y in range(int(x.split()[1]), int(x.split()[2]))
                }

    # loop through allsites for chromosome
    counter = 0
    fasta_string = '>' + chromosome + '\n'
    if pol != 'None':
        wga_bed = pysam.TabixFile(pol)
    else:
        wga_bed = None
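
    # Symbol coding used in the callable-sites FASTA below (inferred from the
    # branches that follow; treat this legend as an assumption, not documented output):
    #   '0' = reference base is N
    #   '1' = missing site, no DP field, abnormal depth, or non-ancestral repeat
    #   'k'/'K' = callable non-repeat site (upper case when polarisable)
    #   'r'/'R' = callable ancestral-repeat site (upper case when polarisable)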

    with open(fasta_out, 'w') as out_fa:
        out_fa.write(fasta_string)
        fasta_string = ''
        prev_position = 0
        for line in VariantFile(all_sites).fetch(chromosome):

            # catch missing sites in allsites (new gatk3.7 feature)
            position = int(line.pos)
            diff = position - prev_position
            if diff != 1:
                missed_bases = ''.join(['1' for i in range(0, diff - 1)])
                fasta_string += missed_bases
            prev_position = position

            # add line break every 60 bases
            if len(fasta_string) >= 60:
                if len(fasta_string) == 60:
                    out_fa.write(fasta_string + '\n')
                    fasta_string = ''
                else:
                    out_fa.write(fasta_string[:60] + '\n')
                    fasta_string = fasta_string[60:]
            counter += 1

            # check for ns
            if line.ref == 'N':
                fasta_string += '0'
                continue

            # depth filter
            try:
                cumulative_depth = line.info["DP"]
            except KeyError:
                fasta_string += '1'
                continue

            locus_mean_depth = cumulative_depth / no_indiv
            if lower_depth_limit <= locus_mean_depth <= upper_depth_limit:

                # repeat filter
                if line.pos not in repeats:

                    # check if polarisable
                    if pol != 'None':
                        can_polarise = polarisable(line, wga_bed)[0]
                        if can_polarise is False:
                            fasta_string += 'k'
                            continue
                        else:
                            fasta_string += 'K'
                            continue
                    else:
                        fasta_string += 'k'
                        continue

                else:
                    if line.pos in lines:

                        # check if polarisable
                        if pol != 'None':
                            can_polarise = polarisable(line, wga_bed)[0]
                            if can_polarise is False:
                                fasta_string += 'r'
                                continue
                            else:
                                fasta_string += 'R'
                                continue
                        else:
                            fasta_string += 'r'
                            continue
                    else:
                        fasta_string += '1'
                    continue

            else:
                fasta_string += '1'
                continue

        out_fa.write(fasta_string + '\n')

    print counter
Example #24
hg38 = Genome(assembly="hg38")
import os
import pysam
import argparse
import pandas as pd

parser = argparse.ArgumentParser(
    description='Process histograms, scatter plots and metaplots')

parser.add_argument('cell_type')
parser.add_argument('tabix_file')
parser.add_argument('fragments')

args = parser.parse_args()
cell_type = args.cell_type

tabix_file = pysam.TabixFile(args.tabix_file)

os.system(
    'gunzip -c {} | bedtools intersect -sorted -c -a /home/John/JohnProject/reference/DHS_adjusted_6mer_bias_adjustedby_30_sorted_no_blacklist.unique.bed -b - > {}/index_cuts_{}_intersect.bed'
    .format(args.fragments, cell_type, cell_type))

from reference.tools import exp_profile, tabix_profile

dhs = pd.read_table(
    "{}/index_cuts_{}_intersect.bed".format(cell_type, cell_type),
    names='dhs_chr adjusted_dhs_start adjusted_dhs_end index_cuts'.split(),
    header=None,
    low_memory=False)

dhs_bias = pd.read_table(
    '/home/John/JohnProject/reference/DHS_with_footprints_and_biases_6mer_adjustedby30.txt.gz',
Example #25
    def func1():
        # opens any tabix file
        with pysam.TabixFile(self.filename) as inf:
            pass
Example #26
def main():
    # Results are collected in a dictionary and printed at the end of the script.
    output_table = OrderedDict({i: {'Ref': 0, 'Mod': 0, 'Oth': 0,
                                    'Ref_SF': 0, 'Mod_SF': 0, 'Oth_SF': 0} for i in range(minLength, maxLength+1)})

    # Max divergence allowed in bwa using the ancient parameters '-n 0.01 -o 2 -l 16500'. This will be used to correct the estimates of spurious alignments.
    MaxDivBWA = {'20': 2, '21': 2,
                 '22': 3, '23': 3,  '24': 3,  '25': 3,  '26': 3,  '27': 3,  '28': 3,  '29': 3,  '30': 3,  '31': 3,  '32': 3,  '33': 3,  '34': 3,  '35': 3,  '36': 3,  '37': 3,  '38': 3,  '39': 3,  '40': 3,  '41': 3,
                 '42': 4, '43': 4, '44': 4, '45': 4, '46': 4, '47': 4, '48': 4, '49': 4, '50': 4, '51': 4, '52': 4, '53': 4, '54': 4, '55': 4, '56': 4, '57': 4, '58': 4, '59': 4, '60': 4}

    start = time.time()
    r = list(range(minLength, maxLength+1))
    with pysam.AlignmentFile(input_file, "rb", check_sq=False) as samfile, pysam.TabixFile(infosites) as tabixfile:
        for chrom in [str(k) for k in range(1, 23)] + ['X']:
            for read in samfile.fetch(chrom, until_eof=True): # until_eof=True prevent pysam to complain if there is no index file.
                Cigar = read.cigarstring
                if (rm_Indels):
                    if 'I' in Cigar or 'D' in Cigar:
                        continue
                # Filter out softclip, hardclip and for MapQuality cutoff
                if 'S' not in Cigar and 'H' not in Cigar and read.mapping_quality >= MQ_cutoff:
                    pos = read.get_reference_positions(full_length=False)
                    site_position = 0
                    passTvFilter = True
                    try:
                        for s in tabixfile.fetch(chrom, pos[0], pos[-1], parser=pysam.asBed()):
                            site_position = int(s[1])
                            reference = s[3]
                            modified = s[4]
                            if(Transversions == True):
                                passTvFilter = not reference + \
                                    modified in ['CT', 'TC', 'GA', 'AG']
                    except ValueError:
                        break
                    if site_position != 0 and passTvFilter:
                        refseq = read.get_reference_sequence()
                        myseq = read.query_sequence
                        bq = read.query_qualities
                        L = min(len(myseq), maxLength)
                        if site_position in pos and L >= minLength:
                            p = site_position
                            # Sequences have different length, we need to align them
                            # update the variables accordingly
                            if len(myseq) != len(refseq):
                                (refseq, myseq, pos, bq) = alnseq(
                                    refseq, myseq, CIGAR=read.cigartuples, basequalities=bq, start=pos[0])
                            # increment counters
                            if deam_filter_skip or is_deaminated(refseq, myseq, terminal_deam, read.is_reverse, isDoubleStranded=DoubleStrand):
                                Allele = myseq[pos.index(p)]
                                BQ = bq[pos.index(p)]
                                if Allele in ['A', 'C', 'G', 'T']:
                                    if BQ >= BQ_cutoff:
                                        ret_type, pass_SF = count(
                                            Allele, Reference=reference, Modified=modified, isReverse=read.is_reverse)
                                        output_table[L][ret_type] += 1
                                        if pass_SF:
                                            output_table[L][ret_type + '_SF'] += 1

    print('bp\tRef\tMod\tOth\tRef_SF\tMod_SF\tOth_SF\tSpuriousAln(95%CI)\tSpuriousAln_SF(95%CI)')
    SpAl = []
    TrAl = []
    for i, elem in sorted(output_table.items()):
        print(i, end='')
        d = MaxDivBWA[str(i)]/i
        for j in ['Ref', 'Mod', 'Oth', 'Ref_SF', 'Mod_SF', 'Oth_SF']:
            print('\t'+str(elem[j]), end='')
        for j in ['', '_SF']:
            TrueAln = float(elem['Ref'+j])
            SpuriousAln = float(elem['Mod'+j]+elem['Oth'+j])
            if TrueAln+SpuriousAln > 0:
                TrueAln1 = max(TrueAln - SpuriousAln*d/(3-d), 0)
                SpuriousAln1 = SpuriousAln / (1-d/3)
                SpAl.append(SpuriousAln1)
                TrAl.append(TrueAln1)
                spu = round(SpuriousAln1 / (SpuriousAln1 + TrueAln1), 4)
                ci = binom_interval(SpuriousAln1, SpuriousAln1+TrueAln1)
            else:
                spu = 0; ci = [0,0]
            print(
                '\t'+str(spu)+' ('+str(round(ci[0], 4))+','+str(round(ci[1], 4))+')', end='')
        print()

    # Split the SpAl and TrAl and then print the cutoffs using the cumulative estimates.
    spal = SpAl[0::2]
    spal_sf = SpAl[1::2]
    tral = TrAl[0::2]
    tral_sf = TrAl[1::2]
    j001 = j01 = j1 = j001sf = j01sf = j1sf = True
    for i in range(0, len(spal)):
        cum_spal = sum(spal[i:])/(sum(spal[i:])+sum(tral[i:]))
        cum_spal_sf = sum(spal_sf[i:])/(sum(spal_sf[i:])+sum(tral_sf[i:]))
        if(cum_spal < 0.001 and j001):
            print('# 0.1% cutoff is', r[i], 'bp')
            j001 = False
        if(cum_spal < 0.01 and j01):
            print('# 1% cutoff is', r[i], 'bp')
            j01 = False
        if(cum_spal < 0.1 and j1):
            print('# 10% cutoff is', r[i], 'bp')
            j1 = False
        if(cum_spal_sf < 0.001 and j001sf):
            print('# 0.1% cutoff with SF is', r[i], 'bp')
            j001sf = False
        if(cum_spal_sf < 0.01 and j01sf):
            print('# 1% cutoff with SF is', r[i], 'bp')
            j01sf = False
        if(cum_spal_sf < 0.1 and j1sf):
            print('# 10% cutoff with SF is', r[i], 'bp')
            j1sf = False

    end = time.time()
    print("#...done in", round((end - start)/60, 3), "minute(s)!")
Example #27
    def setUp(self):

        TestVCF.setUp(self)

        self.tabix = pysam.TabixFile(self.tmpfilename + ".gz")
        self.compare = load_and_convert(self.filename)
Example #28
    filter_value = gnomad_row_fields[6]
    info_fields = [('Filter', filter_value)] + [tuple(kv.split('=')) for kv in gnomad_row_fields[7].split(';')]
    info_fields = filter(lambda kv: kv[0] in NEEDED_GNOMAD_FIELDS_SET, info_fields)
    info_fields = dict(info_fields)
    gnomad_column_values = [info_fields.get(k, '') for k in NEEDED_GNOMAD_FIELDS]

    # check that the clinvar alt allele matches (one of the) gnomAD alt allele(s)    
    #if len(alt_alleles) > 1:
    #    # select the AC/AN numbers corresponding to the specific alt allele
    #    alt_allele_index = alt_alleles.index(alt)    
    #    gnomad_column_values = map(lambda x: x.split(",")[alt_allele_index] if "," in x else x, gnomad_column_values)

    return gnomad_column_values


gnomad_f = pysam.TabixFile(args.gnomad_sites_vcf)
clinvar_f = gzip.open(args.clinvar_table) if args.clinvar_table.endswith('.gz') else open(args.clinvar_table)
clinvar_header = next(clinvar_f).rstrip('\n').split('\t')
clinvar_with_gnomad_header = clinvar_header + NEEDED_GNOMAD_FIELDS
print("\t".join(clinvar_with_gnomad_header))
for i, clinvar_row in enumerate(clinvar_f):
    clinvar_fields = clinvar_row.rstrip('\n').split('\t')
    clinvar_dict = dict(zip(clinvar_header, clinvar_fields))

    chrom = clinvar_dict['chrom']
    pos = int(clinvar_dict['pos'])
    ref = clinvar_dict['ref']
    alt = clinvar_dict['alt']
    gnomad_column_values = get_gnomad_column_values(gnomad_f, chrom, pos, ref, alt)
    
    print("\t".join(clinvar_fields + gnomad_column_values))
Example #29
    def setUp(self):
        IterationTest.setUp(self)
        self.tabix = pysam.TabixFile(self.filename)
Example #30
import sys
import my_utils.seq
import pysam

input_file = sys.argv[1]
output_file = sys.argv[2]
reference = sys.argv[3]
# hgmd_file = sys.argv[4]
spidex_file = sys.argv[4]

key2exists = {}
header2ind = {}

seq_margin = 100

# hgmd_db = pysam.TabixFile(hgmd_file)
spidex_db = pysam.TabixFile(spidex_file)

hout = open(output_file, 'w')
print >> hout, '\t'.join([
    "Cancer_Type", "Sample_Name", "Gene_Symbol", "Mutation_Key", "Motif_Pos",
    "Motif_Seq", "Rel_Pos", "Ref_Base", "Alt_Base", "Mutation_Type",
    "Is_Canonical", "SPIDEX"
])

with open(input_file, 'r') as hin:

    header = hin.readline().rstrip('\n').split('\t')
    for i in range(len(header)):
        header2ind[header[i]] = i

    for line in hin: