def extract_rna_info(chrom_info_file, raw_allelic_counts_dir, genotype_dir,
                     time_step, target_regions_dir):
    """Accumulate per-individual counts for one time step into combined files.

    File paths are built by plain string concatenation, so ``genotype_dir``
    and ``target_regions_dir`` must already end with a path separator.

    Parameters:
        chrom_info_file: passed to ``chromosome.get_all_chromosomes``.
        raw_allelic_counts_dir: passed to ``CombinedFiles`` and ``CountFiles``.
        genotype_dir: prefix for ``all_genotyped_samples.txt``,
            ``snp_tab.h5``, ``snp_index.h5`` and ``haps.h5``.
        time_step: appended (via ``str``) to each individual id and used in
            the ``rna_seq_samples_<time_step>.txt`` file name.
        target_regions_dir: prefix for the ``rna_seq_samples_*.txt`` file.

    Returns:
        The ``CombinedFiles`` object after every individual has been added.
        It is returned still open; closing it is left to the caller.
    """
    # make dictionary of identifier => index mapping
    samp_idx = get_samples_index(genotype_dir + 'all_genotyped_samples.txt')

    # Initialize chromosome objects
    chrom_list = chromosome.get_all_chromosomes(chrom_info_file)

    snp_files = SNPFiles(genotype_dir + 'snp_tab.h5',
                         genotype_dir + 'snp_index.h5',
                         genotype_dir + 'haps.h5')
    try:
        # STEP 1: make combined HDF5 files of AS counts,
        # total mapped read counts, and genotype counts
        individuals = get_individual_array(
            target_regions_dir + 'rna_seq_samples_' + str(time_step) + '.txt')
        combined_files = CombinedFiles(raw_allelic_counts_dir, chrom_list,
                                       time_step)
        for ind in individuals:
            print(ind)
            count_files = CountFiles(raw_allelic_counts_dir,
                                     ind + '_' + str(time_step))
            try:
                combined_files.add_counts(chrom_list, count_files, snp_files,
                                          samp_idx[ind])
            finally:
                # release this individual's count files even when the
                # sample lookup or the accumulation fails
                count_files.close()
    finally:
        # NOTE(review): assumes SNPFiles exposes close() and that
        # combined_files does not read from snp_files after add_counts
        # returns -- confirm against the class definitions.
        snp_files.close()

    return combined_files
Esempio n. 2
0
def main():
    """Sum counts across individuals and write the list of target regions.

    Output goes to ``args.output_file`` (gzip text when the name ends in
    ``.gz``) or to stdout when no output file is given. Progress messages
    are written to stderr.
    """
    sys.stderr.write("cmd: %s\n" % " ".join(sys.argv))

    args = parse_args()

    if not args.output_file:
        out_f = sys.stdout
    elif args.output_file.endswith(".gz"):
        out_f = gzip.open(args.output_file, "wt")
    else:
        out_f = open(args.output_file, "wt")

    try:
        # make dictionary of identifier => index mapping
        samp_idx = get_samples_index(args)

        # read individuals
        individuals = read_individuals(args, samp_idx)

        chrom_list = chromosome.get_all_chromosomes(args.chrom)

        combined_files = CombinedFiles(OUTPUT_DIR, chrom_list)
        snp_files = SNPFiles(args)
        try:
            # STEP 1: make combined HDF5 files of AS counts,
            # total mapped read counts, and genotype counts
            sys.stderr.write("summing genotypes and read counts across "
                             "individuals\n")
            for ind in individuals:
                sys.stderr.write("individual: %s\n" % ind)
                # open count files for this individual
                count_files = CountFiles(args.read_count_dir, ind)
                try:
                    # add counts to combined totals
                    combined_files.add_counts(chrom_list, count_files,
                                              snp_files, samp_idx[ind])
                finally:
                    count_files.close()

            sys.stderr.write("generating list of target regions\n")

            # STEP 2: generate list of target regions centered on test SNPs
            write_target_regions(out_f, args, chrom_list, combined_files,
                                 snp_files)
        finally:
            combined_files.close()
            snp_files.close()
    finally:
        # close (and thereby flush/finalise) the output unless it is stdout,
        # which this function does not own
        if out_f is not sys.stdout:
            out_f.close()
Esempio n. 3
0
def main():
    """Sum counts across individuals and write the list of target regions.

    Output goes to ``args.output_file`` (gzip text when the name ends in
    ``.gz``) or to stdout when no output file is given. Progress messages
    are written to stderr.
    """
    sys.stderr.write("cmd: %s\n" % " ".join(sys.argv))

    args = parse_args()

    if not args.output_file:
        out_f = sys.stdout
    elif args.output_file.endswith(".gz"):
        out_f = gzip.open(args.output_file, "wt")
    else:
        out_f = open(args.output_file, "wt")

    try:
        # make dictionary of identifier => index mapping
        samp_idx = get_samples_index(args)

        # read individuals
        individuals = read_individuals(args, samp_idx)

        chrom_list = chromosome.get_all_chromosomes(args.chrom)

        combined_files = CombinedFiles(OUTPUT_DIR, chrom_list)
        snp_files = SNPFiles(args)
        try:
            # STEP 1: make combined HDF5 files of AS counts,
            # total mapped read counts, and genotype counts
            sys.stderr.write("summing genotypes and read counts across "
                             "individuals\n")
            for ind in individuals:
                sys.stderr.write("individual: %s\n" % ind)
                # open count files for this individual
                count_files = CountFiles(args.read_count_dir, ind)
                try:
                    # add counts to combined totals
                    combined_files.add_counts(chrom_list, count_files,
                                              snp_files, samp_idx[ind])
                finally:
                    count_files.close()

            sys.stderr.write("generating list of target regions\n")

            # STEP 2: generate list of target regions centered on test SNPs
            write_target_regions(out_f, args, chrom_list, combined_files,
                                 snp_files)
        finally:
            combined_files.close()
            snp_files.close()
    finally:
        # close (and thereby flush/finalise) the output unless it is stdout,
        # which this function does not own
        if out_f is not sys.stdout:
            out_f.close()
Esempio n. 4
0
File: bam2h5.py Progetto: Q-KIM/WASP
def main():
    """Tally reads from BAM files into four per-chromosome HDF5 count tracks.

    For every chromosome listed in ``args.chrom`` a carray is created in each
    of the four output files (ref, alt, other allele counts and read counts).
    Chromosomes that have a node in the SNP table are then filled by passing
    every read from every BAM file to ``add_read_count``. Progress is
    reported on stderr.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither ``"uint8"``
            nor ``"uint16"``.
    """
    args = parse_args()

    # NOTE(review): openFile()/getNode() are the legacy camelCase PyTables
    # spellings; confirm the installed PyTables still provides them.
    snp_tab_h5 = tables.openFile(args.snp_tab, "r")
    snp_index_h5 = tables.openFile(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.openFile(args.haplotype, "r")
        ind_idx = lookup_individual_index(args.samples, args.individual)
    else:
        hap_h5 = None
        ind_idx = None

    ref_count_h5 = tables.openFile(args.ref_as_counts, "w")
    alt_count_h5 = tables.openFile(args.alt_as_counts, "w")
    other_count_h5 = tables.openFile(args.other_as_counts, "w")
    read_count_h5 = tables.openFile(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # pick the numpy dtype of the in-memory arrays and the matching
    # upper bound that is handed to add_read_count
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # progress counter: one "." on stderr per 10000 reads, carried across
    # BAM files and chromosomes
    count = 0

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.getNode("/%s" % chrom.name)
        snp_index_array = snp_index_h5.getNode("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.getNode("/%s" % chrom.name)
        else:
            hap_tab = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                samfile.close()
            sys.stderr.write("\n")

        # store results for this chromosome once, after all BAM files have
        # been accumulated; the in-memory arrays are cumulative, so writing
        # them after every BAM file only repeated the same full-length copy
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

    # set track statistics and close HDF5 files
    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")
Esempio n. 5
0
def main():
    """Tally reads from BAM files into HDF5 count tracks and optional text.

    For every chromosome listed in ``args.chrom`` a carray is created in each
    of the four output files (ref, alt, other allele counts and read counts).
    Chromosomes that have a node in the SNP table (and match
    ``args.test_chrom`` when it is set) are filled by passing every read from
    every BAM file to ``add_read_count``. When ``args.txt_counts`` is given,
    ``write_txt_file`` is also called once per processed chromosome.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither ``"uint8"``
            nor ``"uint16"``.
        ValueError: if the haplotype file does not yield exactly one sample
            index for ``args.individual``.
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    # disable warnings that come from pytables when chromosome
    # names are like 1, 2, 3 (instead of chr1, chr2, chr3)
    warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
    else:
        hap_h5 = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # pick the numpy dtype of the in-memory arrays and the matching
    # upper bound that is handed to add_read_count
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # create a txt file to also hold the counts; stays None when not
    # requested so that opening and closing use the same condition
    txt_counts = None
    if args.txt_counts is not None:
        if os.path.splitext(args.txt_counts)[1] == ".gz":
            txt_counts = gzip.open(args.txt_counts, 'wt+')
        else:
            txt_counts = open(args.txt_counts, 'w+')

    # progress counter: one "." on stderr per 10000 reads, carried across
    # BAM files and chromosomes
    count = 0

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        if args.test_chrom:
            if chrom.name != args.test_chrom:
                sys.stderr.write("skipping because not test chrom\n")
                continue

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            sys.stderr.write("skipping %s because chromosome with this name "
                             "not found in SNP table\n" % chrom.name)
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
            _, ind_idx = snptable.SNPTable().get_h5_sample_indices(
                hap_h5, chrom, [args.individual])

            if len(ind_idx) != 1:
                raise ValueError("got sample indices for %d individuals, "
                                 "but expected to get index for one "
                                 "individual (%s)" %
                                 (len(ind_idx), args.individual))
            ind_idx = ind_idx[0]
            sys.stderr.write("index for individual %s is %d\n" %
                             (args.individual, ind_idx))
        else:
            hap_tab = None
            ind_idx = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                samfile.close()
            sys.stderr.write("\n")

        # store results for this chromosome once, after all BAM files have
        # been accumulated (the in-memory arrays are cumulative)
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

        # Emit the text rows once per chromosome. Doing this inside the BAM
        # loop wrote the chromosome again after every BAM file, each time
        # with the counts accumulated so far.
        # columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if txt_counts is not None:
            write_txt_file(txt_counts, chrom, snp_tab, hap_tab, ind_idx,
                           ref_array, alt_array, other_array)

    if txt_counts is not None:
        # close the open txt file handle
        txt_counts.close()

    # check if any of the reads contained an unimplemented CIGAR
    if unimplemented_CIGAR[0] > 0:
        sys.stderr.write("WARNING: Encountered " +
                         str(unimplemented_CIGAR[0]) +
                         " instances of CIGAR codes: " +
                         str(unimplemented_CIGAR[1]) + ". Reads with these "
                         "CIGAR codes were skipped because they "
                         "are currently unimplemented.\n")

    # set track statistics and close HDF5 files
    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")
def main():
    """Write haplotype read counts for each test SNP listed in the input file.

    Reads ``args.input_file`` (gzip when the name ends in ``.gz``) line by
    line. Lines starting with ``#`` are skipped, lines whose second field is
    ``NA`` produce an NA output line, and every other line is treated as a
    test SNP: fields 0, 1, 3 and 4 are read as chromosome name, position,
    ref base and alt base. Results go to stdout, progress to stderr.

    Raises:
        KeyError: if a line names a chromosome that is not in the chromosome
            dictionary built from ``args.chrom``.
    """
    args = parse_args()

    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)
    f = None
    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        # Open in text mode: the loop below compares lines against str
        # literals, which fails on the bytes that gzip.open() yields by
        # default under Python 3.
        if args.input_file.endswith(".gz"):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "r")

        line_count = 0

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        for line in f:
            line_count += 1
            if line_count % 1000 == 0:
                sys.stderr.write(".")

            if line.startswith("#"):
                continue

            words = line.rstrip().split()

            if words[1] == "NA":
                # no SNP defined on this line:
                write_NA_line(sys.stdout)
                continue

            chrom_name = words[0]
            chrom = chrom_dict[chrom_name]

            region_list = get_target_regions(args, chrom, words)

            snp_pos = int(words[1])
            snp_ref_base = words[3]
            snp_alt_base = words[4]
            # TODO: check that SNP ref/alt match?

            snp_region = coord.Coord(chrom, snp_pos, snp_pos)

            # pull out all of the SNPs in the target region(s)
            region_snps = get_region_snps(data_files, region_list, ind_idx)

            # pull out test SNP
            test_snp_list = get_region_snps(data_files, [snp_region], ind_idx)
            if len(test_snp_list) != 1:
                test_snp = None
                sys.stderr.write("WARNING: could not find test SNP at "
                                 "position %s:%d\n" % (chrom.name, snp_pos))
                het_snps = []
            else:
                test_snp = test_snp_list[0]

                # pull out haplotype counts from linked heterozygous SNPs
                het_snps = get_het_snps(region_snps)
                set_snp_counts(data_files, region_list, het_snps, test_snp,
                               args)

            region_read_counts = get_region_read_counts(data_files,
                                                        region_list)

            write_output(sys.stdout, region_list, het_snps, test_snp, snp_pos,
                         region_read_counts, genomewide_read_counts)

        sys.stderr.write("\n")
    finally:
        # release the input file and the data files even if a line fails
        if f is not None:
            f.close()
        data_files.close()
Esempio n. 7
0
def main():
    """Tally reads from BAM files into HDF5 count tracks and optional text.

    For every chromosome listed in ``args.chrom`` a carray is created in each
    of the four output files (ref, alt, other allele counts and read counts).
    Chromosomes that have a node in the SNP table (and match
    ``args.test_chrom`` when it is set) are filled by passing every read from
    every BAM file to ``add_read_count``. When ``args.txt_counts`` is given,
    ``write_txt_file`` is also called once per processed chromosome.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither ``"uint8"``
            nor ``"uint16"``.
        ValueError: if the haplotype file does not yield exactly one sample
            index for ``args.individual``.
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    # disable warnings that come from pytables when chromosome
    # names are like 1, 2, 3 (instead of chr1, chr2, chr3)
    warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
    else:
        hap_h5 = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # pick the numpy dtype of the in-memory arrays and the matching
    # upper bound that is handed to add_read_count
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # create a txt file to also hold the counts; stays None when not
    # requested so that opening and closing use the same condition
    txt_counts = None
    if args.txt_counts is not None:
        if os.path.splitext(args.txt_counts)[1] == ".gz":
            txt_counts = gzip.open(args.txt_counts, 'wt+')
        else:
            txt_counts = open(args.txt_counts, 'w+')

    # progress counter: one "." on stderr per 10000 reads, carried across
    # BAM files and chromosomes
    count = 0

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        if args.test_chrom:
            if chrom.name != args.test_chrom:
                sys.stderr.write("skipping because not test chrom\n")
                continue

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            sys.stderr.write("skipping %s because chromosome with this name "
                             "not found in SNP table\n" % chrom.name)
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
            _, ind_idx = snptable.SNPTable().get_h5_sample_indices(
                hap_h5, chrom, [args.individual])

            if len(ind_idx) != 1:
                raise ValueError("got sample indices for %d individuals, "
                                 "but expected to get index for one "
                                 "individual (%s)" % (len(ind_idx),
                                                      args.individual))
            ind_idx = ind_idx[0]
            sys.stderr.write("index for individual %s is %d\n" %
                             (args.individual, ind_idx))
        else:
            hap_tab = None
            ind_idx = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                samfile.close()
            sys.stderr.write("\n")

        # store results for this chromosome once, after all BAM files have
        # been accumulated (the in-memory arrays are cumulative)
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

        # Emit the text rows once per chromosome. Doing this inside the BAM
        # loop wrote the chromosome again after every BAM file, each time
        # with the counts accumulated so far.
        # columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if txt_counts is not None:
            write_txt_file(txt_counts, chrom, snp_tab, hap_tab, ind_idx,
                           ref_array, alt_array, other_array)

    if txt_counts is not None:
        # close the open txt file handle
        txt_counts.close()

    # check if any of the reads contained an unimplemented CIGAR
    if unimplemented_CIGAR[0] > 0:
        sys.stderr.write("WARNING: Encountered " + str(unimplemented_CIGAR[0])
                         + " instances of CIGAR codes: "
                         + str(unimplemented_CIGAR[1]) + ". Reads with these "
                         "CIGAR codes were skipped because they "
                         "are currently unimplemented.\n")

    # set track statistics and close HDF5 files
    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")
Esempio n. 8
0
def main():
    """Tally reads from BAM files into HDF5 count tracks and an optional table.

    For every chromosome listed in ``args.chrom`` a carray is created in each
    of the four output files (ref, alt, other allele counts and read counts).
    Chromosomes that have a node in the SNP table are filled by passing every
    read from every BAM file to ``add_read_count``. When ``args.text_counts``
    is given, one block of per-SNP rows is collected per chromosome and all
    blocks are written with ``np.savetxt`` at the end.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither ``"uint8"``
            nor ``"uint16"``.
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
        ind_idx = lookup_individual_index(args.samples, args.individual)
    else:
        hap_h5 = None
        ind_idx = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # pick the numpy dtype of the in-memory arrays and the matching
    # upper bound that is handed to add_read_count
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # per-chromosome row blocks that are written to the txt file at the
    # end; only filled when args.text_counts is given
    txt_counts = []

    # progress counter: one "." on stderr per 10000 reads, carried across
    # BAM files and chromosomes
    count = 0

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
        else:
            hap_tab = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                samfile.close()
            sys.stderr.write("\n")

        # store results for this chromosome once, after all BAM files have
        # been accumulated (the in-memory arrays are cumulative)
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

        # Collect the text rows once per chromosome.
        # columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if args.text_counts is not None:
            # A separate name is used for the chromosome column: rebinding
            # `chrom` here replaced the chromosome object with an ndarray,
            # which broke the next BAM file of the same chromosome.
            chrom_col = np.tile(chrom.name, len(snp_tab))
            pos = np.array([snp['pos'] for snp in snp_tab])
            ref = np.array([snp['allele1'] for snp in snp_tab])
            alt = np.array([snp['allele2'] for snp in snp_tab])
            if hap_tab is not None:
                genotype = np.array([str(hap[0]) + "|" + str(hap[1])
                                     for hap in hap_tab])
            else:
                # zero-width placeholder so column_stack adds no
                # genotype column
                genotype = np.empty((len(snp_tab), 0))
            # pos - 1 is used as the index into the count arrays
            txt_counts.append(
                np.column_stack((chrom_col, pos, ref, alt, genotype,
                                 ref_array[pos - 1],
                                 alt_array[pos - 1],
                                 other_array[pos - 1]))
            )

    # write the txt_counts np arrays to a txt file
    if args.text_counts is not None:
        # we use vstack to combine np arrays row-wise into a multi-dimensional
        # array
        np.savetxt(args.text_counts, np.vstack(tuple(txt_counts)),
                   fmt="%1s", delimiter=" ")

    # set track statistics and close HDF5 files
    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")
Esempio n. 9
0
def main():
    """Write haplotype read counts for each test SNP listed in the input file.

    Reads ``args.input_file`` (gzip when ``util.is_gzipped`` says so) line by
    line. Lines starting with ``#`` are skipped, lines whose second field is
    ``NA`` produce an NA output line, and every other line is treated as a
    test SNP: fields 0, 1, 3 and 4 are read as chromosome name, position,
    ref base and alt base. A chromosome name that is not known is retried
    with a ``chr`` prefix; if it is still unknown the line is skipped and a
    warning is printed once per name. Results go to stdout, progress and
    warnings to stderr.
    """
    args = parse_args()
    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)
    f = None
    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        # chromosome names already warned about
        unknown_chrom = set()

        if util.is_gzipped(args.input_file):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "r")

        line_count = 0

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        for line in f:
            line_count += 1
            if line_count % 1000 == 0:
                sys.stderr.write(".")

            if line.startswith("#"):
                continue

            words = line.rstrip().split()

            if words[1] == "NA":
                # no SNP defined on this line:
                write_NA_line(sys.stdout)
                continue

            chrom_name = words[0]
            if (chrom_name not in chrom_dict
                    and not chrom_name.startswith("chr")
                    and "chr" + chrom_name in chrom_dict):
                # the name is only known with a 'chr' prefix
                chrom_name = "chr" + chrom_name

            if chrom_name not in chrom_dict:
                # Can't figure out this chromosome name. This also covers
                # unknown names that already start with 'chr', which used
                # to fall through and reuse the previous line's chromosome.
                # NOTE(review): skipped lines produce no output row, unlike
                # NA lines -- confirm downstream does not rely on row
                # alignment with the input.
                if chrom_name not in unknown_chrom:
                    unknown_chrom.add(chrom_name)
                    sys.stderr.write("WARNING: unknown chromosome '%s'\n"
                                     % chrom_name)
                continue

            chrom = chrom_dict[chrom_name]

            region_list = get_target_regions(args, chrom, words)

            snp_pos = int(words[1])
            snp_ref_base = words[3]
            snp_alt_base = words[4]
            # TODO: check that SNP ref/alt match?

            snp_region = coord.Coord(chrom, snp_pos, snp_pos)

            # pull out all of the SNPs in the target region(s)
            region_snps = get_region_snps(data_files, region_list, ind_idx)

            # pull out test SNP
            test_snp_list = get_region_snps(data_files, [snp_region], ind_idx)
            if len(test_snp_list) != 1:
                test_snp = None
                sys.stderr.write("WARNING: could not find test SNP at "
                                 "position %s:%d\n" % (chrom.name, snp_pos))
                het_snps = []
            else:
                test_snp = test_snp_list[0]

                # pull out haplotype counts from linked heterozygous SNPs
                het_snps = get_het_snps(region_snps)
                set_snp_counts(data_files, region_list, het_snps, test_snp,
                               args)

            region_read_counts = get_region_read_counts(data_files,
                                                        region_list)

            write_output(sys.stdout, region_list, het_snps, test_snp, snp_pos,
                         region_read_counts, genomewide_read_counts)

        sys.stderr.write("\n")
    finally:
        # release the input file and the data files even if a line fails
        if f is not None:
            f.close()
        data_files.close()
Esempio n. 10
0
def main():
    """Count allele-specific and total reads from BAM files into HDF5 tracks.

    Reads the command-line arguments via ``parse_args()``, opens the SNP
    table / SNP index (and optional haplotype) HDF5 files for reading and
    the four count HDF5 files (ref, alt, other, read counts) for writing.
    For every chromosome returned by ``chromosome.get_all_chromosomes`` a
    CArray is created in each output file; chromosomes that have a node in
    the SNP table file are then processed by passing every read of every
    BAM file to ``add_read_count``.  If ``args.txt_counts`` is given, one
    text row per SNP is additionally appended to that file.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither ``"uint8"``
            nor ``"uint16"``.
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
    else:
        hap_h5 = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # progress counter: a "." is written to stderr every 10000 reads
    count = 0
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # optional text file that also holds the per-SNP counts; it is opened
    # in append mode, so rows are added to any existing content
    txt_counts = None
    if args.txt_counts is not None:
        if os.path.splitext(args.txt_counts)[1] == ".gz":
            txt_counts = gzip.open(args.txt_counts, 'a+')
        else:
            txt_counts = open(args.txt_counts, 'a+')

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
            ind_idx = snptable.SNPTable().get_h5_sample_indices(
                hap_h5, chrom, [args.individual])[1]
            if len(ind_idx) != 0:
                ind_idx = ind_idx[0]
            else:
                # individual not found in the haplotype file for this
                # chromosome: proceed without haplotype information
                hap_tab = None
                ind_idx = None
        else:
            hap_tab = None
            ind_idx = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                # release the BAM handle even if reading fails
                samfile.close()

            sys.stderr.write("\n")

        # store results for this chromosome once, after every BAM file has
        # contributed to the in-memory arrays
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

        # write one text row per SNP; columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if txt_counts is not None:
            # NOTE: must not be named `chrom` -- rebinding the loop variable
            # would hand an ndarray to get_sam_iter()/add_read_count()
            chrom_col = np.tile(chrom.name, len(snp_tab))
            pos = np.array([snp['pos'] for snp in snp_tab])
            ref = np.array([snp['allele1'] for snp in snp_tab])
            alt = np.array([snp['allele2'] for snp in snp_tab])
            if hap_tab is not None:
                # NOTE(review): this reads the first two haplotype columns
                # regardless of ind_idx -- confirm that is the intended
                # individual
                genotype = np.array(
                    [str(hap[0]) + "|" + str(hap[1]) for hap in hap_tab])
            else:
                # zero-width column so column_stack omits the genotype
                genotype = np.empty((len(snp_tab), 0))
            # SNP positions are used 1-based here; count arrays are 0-based
            np.savetxt(txt_counts,
                       np.column_stack(
                           (chrom_col, pos, ref, alt, genotype,
                            ref_array[pos - 1], alt_array[pos - 1],
                            other_array[pos - 1])),
                       fmt="%1s",
                       delimiter=" ")

    if txt_counts is not None:
        # close the open txt file handler
        txt_counts.close()

    # only warn when some read regions were actually skipped because of an
    # unimplemented CIGAR code
    if unimplemented_CIGAR[0]:
        sys.stderr.write(
            "WARNING: Encountered " + str(unimplemented_CIGAR[0]) +
            " instances of any of the following CIGAR codes: " +
            str(unimplemented_CIGAR[1]) +
            ". The regions of reads with these CIGAR codes were skipped because these CIGAR codes are currently unimplemented.\n"
        )

    # set track statistics and close HDF5 files

    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")
Esempio n. 11
0
def main():
    """Count allele-specific and total reads from BAM files into HDF5 tracks.

    Reads the command-line arguments via ``parse_args()``, opens the SNP
    table / SNP index (and optional haplotype) HDF5 files for reading and
    the four count HDF5 files (ref, alt, other, read counts) for writing.
    For every chromosome returned by ``chromosome.get_all_chromosomes`` a
    CArray is created in each output file; chromosomes that have a node in
    the SNP table file are then processed by passing every read of every
    BAM file to ``add_read_count``.

    Uses the PEP 8 PyTables names (``open_file`` / ``get_node``) instead of
    the legacy camelCase aliases, which newer PyTables releases no longer
    provide.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither ``"uint8"``
            nor ``"uint16"``.
    """
    args = parse_args()

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
        ind_idx = lookup_individual_index(args.samples, args.individual)
    else:
        hap_h5 = None
        ind_idx = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # progress counter: a "." is written to stderr every 10000 reads
    count = 0
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
        else:
            hap_tab = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                # release the BAM handle even if reading fails
                samfile.close()

            sys.stderr.write("\n")

        # store results for this chromosome once, after every BAM file has
        # contributed to the in-memory arrays
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

    # set track statistics and close HDF5 files

    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")