def extract_rna_info(chrom_info_file, raw_allelic_counts_dir, genotype_dir, time_step, target_regions_dir):
    """Accumulate per-individual counts for one time step into a CombinedFiles.

    Arguments:
      chrom_info_file: passed to chromosome.get_all_chromosomes().
      raw_allelic_counts_dir: directory prefix handed to CombinedFiles and
        to each individual's CountFiles.
      genotype_dir: path prefix (concatenated as-is, so it must end with a
        path separator) under which 'all_genotyped_samples.txt',
        'snp_tab.h5', 'snp_index.h5' and 'haps.h5' are looked up.
      time_step: converted with str() and used both in the sample list
        file name and as the suffix of each sample identifier.
      target_regions_dir: path prefix (concatenated as-is) containing
        'rna_seq_samples_<time_step>.txt'.

    Returns the CombinedFiles object after add_counts() has been called
    once per individual. It is returned without being closed here.
    """
    # make dictionary of identifier => index mapping
    all_genotype_samples_file = genotype_dir + 'all_genotyped_samples.txt'
    samp_idx = get_samples_index(all_genotype_samples_file)

    # Initialize chromosome objects
    chrom_list = chromosome.get_all_chromosomes(chrom_info_file)

    snp_files = SNPFiles(genotype_dir + 'snp_tab.h5',
                         genotype_dir + 'snp_index.h5',
                         genotype_dir + 'haps.h5')

    # NOTE(review): snp_files is only used inside this function, so it is
    # closed before returning; this assumes CombinedFiles keeps no reference
    # to it after add_counts() returns - confirm.
    try:
        # STEP 1: make combined HDF5 files of AS counts,
        # total mapped read counts, and genotype counts
        individuals = get_individual_array(
            target_regions_dir + 'rna_seq_samples_' + str(time_step) + '.txt')
        combined_files = CombinedFiles(raw_allelic_counts_dir, chrom_list,
                                       time_step)

        for ind in individuals:
            print(ind)
            sample_id = ind + '_' + str(time_step)
            count_files = CountFiles(raw_allelic_counts_dir, sample_id)
            try:
                ind_idx = samp_idx[ind]
                combined_files.add_counts(chrom_list, count_files,
                                          snp_files, ind_idx)
            finally:
                # release this individual's count files even on failure
                count_files.close()
    finally:
        snp_files.close()

    return combined_files
Example #2
0
def main():
    """Sum counts across individuals and write the list of target regions.

    Output goes to args.output_file (gzip-compressed when the name ends
    with ".gz") or to stdout when no output file is given. Progress
    messages are written to stderr.
    """
    sys.stderr.write("cmd: %s\n" % " ".join(sys.argv))

    args = parse_args()

    if args.output_file:
        if args.output_file.endswith(".gz"):
            out_f = gzip.open(args.output_file, "wt")
        else:
            out_f = open(args.output_file, "wt")
    else:
        out_f = sys.stdout

    combined_files = None
    snp_files = None
    try:
        # make dictionary of identifier => index mapping
        samp_idx = get_samples_index(args)

        # read individuals
        individuals = read_individuals(args, samp_idx)

        chrom_list = chromosome.get_all_chromosomes(args.chrom)

        combined_files = CombinedFiles(OUTPUT_DIR, chrom_list)
        snp_files = SNPFiles(args)

        # STEP 1: make combined HDF5 files of AS counts,
        # total mapped read counts, and genotype counts
        sys.stderr.write("summing genotypes and read counts across "
                         "individuals\n")
        for ind in individuals:
            # open count files for this individual
            sys.stderr.write("individual: %s\n" % ind)
            count_files = CountFiles(args.read_count_dir, ind)
            try:
                ind_idx = samp_idx[ind]

                # add counts to combined totals
                combined_files.add_counts(chrom_list, count_files,
                                          snp_files, ind_idx)
            finally:
                count_files.close()

        sys.stderr.write("generating list of target regions\n")

        # STEP 2: generate list of target regions centered on test SNPs:
        write_target_regions(out_f, args, chrom_list, combined_files,
                             snp_files)
    finally:
        if combined_files is not None:
            combined_files.close()
        if snp_files is not None:
            snp_files.close()
        # Close the output we opened so buffered (gzip) data is flushed;
        # never close stdout, which this function does not own.
        if out_f is not sys.stdout:
            out_f.close()
Example #3
0
def main():
    """Sum counts across individuals and write the list of target regions.

    Output goes to args.output_file (gzip-compressed when the name ends
    with ".gz") or to stdout when no output file is given. Progress
    messages are written to stderr.
    """
    sys.stderr.write("cmd: %s\n" % " ".join(sys.argv))

    args = parse_args()

    if args.output_file:
        if args.output_file.endswith(".gz"):
            out_f = gzip.open(args.output_file, "wt")
        else:
            out_f = open(args.output_file, "wt")
    else:
        out_f = sys.stdout

    combined_files = None
    snp_files = None
    try:
        # make dictionary of identifier => index mapping
        samp_idx = get_samples_index(args)

        # read individuals
        individuals = read_individuals(args, samp_idx)

        chrom_list = chromosome.get_all_chromosomes(args.chrom)

        combined_files = CombinedFiles(OUTPUT_DIR, chrom_list)
        snp_files = SNPFiles(args)

        # STEP 1: make combined HDF5 files of AS counts,
        # total mapped read counts, and genotype counts
        sys.stderr.write("summing genotypes and read counts across "
                         "individuals\n")
        for ind in individuals:
            # open count files for this individual
            sys.stderr.write("individual: %s\n" % ind)
            count_files = CountFiles(args.read_count_dir, ind)
            try:
                ind_idx = samp_idx[ind]

                # add counts to combined totals
                combined_files.add_counts(chrom_list, count_files,
                                          snp_files, ind_idx)
            finally:
                count_files.close()

        sys.stderr.write("generating list of target regions\n")

        # STEP 2: generate list of target regions centered on test SNPs:
        write_target_regions(out_f, args, chrom_list, combined_files,
                             snp_files)
    finally:
        if combined_files is not None:
            combined_files.close()
        if snp_files is not None:
            snp_files.close()
        # Close the output we opened so buffered (gzip) data is flushed;
        # never close stdout, which this function does not own.
        if out_f is not sys.stdout:
            out_f.close()
Example #4
0
def main():
    """Write the sequence of every exon in options.exon_file to stdout.

    Each non-comment, non-blank input line is whitespace-separated with
    columns: gene_id, gene_name, chrom_name, exon_num, start, end, strand.
    Exons shorter than options.min_size are extended on both sides until
    they span exactly options.min_size bases. One FASTA-style record
    (">gene_name exon N" followed by the sequence) is written per exon.

    NOTE(review): the strand column is not applied to the sequence; confirm
    whether reverse-strand exons should be reverse-complemented.
    NOTE(review): extended coordinates are not clamped to the chromosome
    bounds - confirm get_seq_str() tolerates start < 1 or end past the end.
    """
    options = parse_options()

    util.info.write_info(sys.stdout, options)

    gdb = genome.db.GenomeDB(assembly="hg19")
    seq_track = gdb.open_track("seq")

    with open(options.exon_file) as f:
        for line in f:
            if line.startswith("#"):
                continue

            words = line.split()
            if not words:
                # blank line: nothing to parse
                continue

            gene_name = words[1]
            chrom_name = words[2]
            exon_num = int(words[3])
            start = int(words[4])
            end = int(words[5])

            exon_len = end - start + 1

            if exon_len < options.min_size:
                # Integer arithmetic keeps coordinates as ints on Python 3;
                # the remainder of an odd shortfall goes to the right side
                # so the region reaches min_size rather than min_size - 1.
                shortfall = options.min_size - exon_len
                left_pad = shortfall // 2
                right_pad = shortfall - left_pad
                start = start - left_pad
                end = end + right_pad

                sys.stderr.write("extended exon from %d bp to %d bp\n" %
                                 (exon_len, end - start + 1))

            seq_str = seq_track.get_seq_str(chrom_name, start, end)

            sys.stdout.write(">%s exon %d\n%s\n" %
                             (gene_name, exon_num, seq_str))
Example #5
0
def main():
    """Write gene and merged-exon tables for the genes on options.chrom.

    Genes are read with gff.read_gff() restricted to the region given by
    options.chrom / options.start / options.end. For every gene listed
    under options.chrom one line is written to the gene output file, and
    one line per merged exon to the exon output file. Both use the
    space-separated columns:
        gene_id gene_name chrom_name number start end strand
    where number is the 1-based gene index (gene file) or the 1-based exon
    index within its gene (exon file). Exons of genes with strand == -1
    are numbered in reverse of the order returned by get_merged_exons().
    """
    options = parse_options()

    # 'with' guarantees both outputs are closed even if writing fails midway
    with open(options.exon_output_filename, "w") as exon_out_f, \
            open(options.gene_output_filename, "w") as gene_out_f:

        util.info.write_info(exon_out_f, options)
        util.info.write_info(gene_out_f, options)

        chrom_dict = chromosome.get_chromosome_dict(options.chrom_file)

        # only the per-chromosome gene listing is needed here
        _, _, gene_chrom_dict, _ = \
            gff.read_gff(options.gff, chrom_dict,
                         region_chrom=options.chrom,
                         region_start=options.start,
                         region_end=options.end)

        for gene_num, gene in enumerate(gene_chrom_dict[options.chrom], 1):
            exons = gene.get_merged_exons()

            gene_out_f.write("%s %s %s %d %d %d %d\n" %
                             (gene.gene_id, gene.gene_name, gene.chrom.name,
                              gene_num, gene.start, gene.end, gene.strand))

            if gene.strand == -1:
                exons = exons[::-1]

            for exon_num, ex in enumerate(exons, 1):
                exon_out_f.write("%s %s %s %d %d %d %d\n" %
                                 (gene.gene_id, gene.gene_name,
                                  gene.chrom.name, exon_num,
                                  ex.start, ex.end, gene.strand))
def main():
    """Write per-test-SNP haplotype and region read counts to stdout.

    Reads args.input_file (gzip-compressed when the name ends with ".gz"),
    one test SNP per line. Lines starting with "#" are skipped; lines whose
    second column is "NA" produce an NA output line. For every other line
    the target regions, linked heterozygous SNPs and region read counts
    are gathered and passed to write_output(). A "." is written to stderr
    every 1000 input lines as a progress indicator.

    A chromosome name that is not a key of chrom_dict makes the lookup
    fail; this function does not handle that case.
    """
    args = parse_args()

    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)
    f = None
    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        # Open in text mode: under Python 3 gzip.open() defaults to binary,
        # which would make line.startswith("#") below fail on bytes.
        if args.input_file.endswith(".gz"):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "r")

        line_count = 0

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        for line in f:
            line_count += 1
            if line_count % 1000 == 0:
                sys.stderr.write(".")

            if line.startswith("#"):
                continue

            words = line.rstrip().split()

            if words[1] == "NA":
                # no SNP defined on this line:
                write_NA_line(sys.stdout)
                continue

            chrom_name = words[0]
            chrom = chrom_dict[chrom_name]

            region_list = get_target_regions(args, chrom, words)

            snp_pos = int(words[1])
            # TODO: check that SNP ref/alt (columns 3 and 4) match?

            snp_region = coord.Coord(chrom, snp_pos, snp_pos)

            # pull out all of the SNPs in the target region(s)
            region_snps = get_region_snps(data_files, region_list, ind_idx)

            # pull out test SNP
            test_snp_list = get_region_snps(data_files, [snp_region],
                                            ind_idx)
            if len(test_snp_list) != 1:
                test_snp = None
                sys.stderr.write("WARNING: could not find test SNP at "
                                 "position %s:%d\n" % (chrom.name, snp_pos))
                het_snps = []
            else:
                test_snp = test_snp_list[0]

                # pull out haplotype counts from linked heterozygous SNPs
                het_snps = get_het_snps(region_snps)
                set_snp_counts(data_files, region_list, het_snps, test_snp,
                               args)

            region_read_counts = get_region_read_counts(data_files,
                                                        region_list)

            write_output(sys.stdout, region_list, het_snps, test_snp,
                         snp_pos, region_read_counts,
                         genomewide_read_counts)

        sys.stderr.write("\n")
    finally:
        # release handles even when processing aborts part-way through
        if f is not None:
            f.close()
        data_files.close()
Example #7
0
def main():
    """Write per-test-SNP haplotype and region read counts to stdout.

    Reads args.input_file (opened through gzip when util.is_gzipped() says
    so), one test SNP per line. Lines starting with "#" are skipped; lines
    whose second column is "NA" produce an NA output line. Chromosome names
    missing from chrom_dict are retried with a "chr" prefix; if still
    unknown, the line is skipped and a warning is written to stderr once
    per distinct name. For every remaining line the target regions, linked
    heterozygous SNPs and region read counts are gathered and passed to
    write_output(). A "." is written to stderr every 1000 input lines.
    """
    args = parse_args()
    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)
    f = None
    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        # chromosome names already warned about
        unknown_chrom = set()

        if util.is_gzipped(args.input_file):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "r")

        line_count = 0

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        for line in f:
            line_count += 1
            if line_count % 1000 == 0:
                sys.stderr.write(".")

            if line.startswith("#"):
                continue

            words = line.rstrip().split()

            if words[1] == "NA":
                # no SNP defined on this line:
                write_NA_line(sys.stdout)
                continue

            chrom_name = words[0]
            if chrom_name not in chrom_dict and \
               not chrom_name.startswith("chr"):
                # try adding 'chr' to front of name
                prefixed_name = "chr" + chrom_name
                if prefixed_name in chrom_dict:
                    chrom_name = prefixed_name

            if chrom_name not in chrom_dict:
                # Can't figure out this chromosome name. This also covers
                # unknown names that already start with "chr", which would
                # otherwise leave 'chrom' unset or stale from an earlier
                # line.
                if chrom_name not in unknown_chrom:
                    unknown_chrom.add(chrom_name)
                    sys.stderr.write("WARNING: unknown chromosome '%s'\n" %
                                     chrom_name)
                continue

            chrom = chrom_dict[chrom_name]

            region_list = get_target_regions(args, chrom, words)

            snp_pos = int(words[1])
            # TODO: check that SNP ref/alt (columns 3 and 4) match?

            snp_region = coord.Coord(chrom, snp_pos, snp_pos)

            # pull out all of the SNPs in the target region(s)
            region_snps = get_region_snps(data_files, region_list, ind_idx)

            # pull out test SNP
            test_snp_list = get_region_snps(data_files, [snp_region],
                                            ind_idx)
            if len(test_snp_list) != 1:
                test_snp = None
                sys.stderr.write("WARNING: could not find test SNP at "
                                 "position %s:%d\n" % (chrom.name, snp_pos))
                het_snps = []
            else:
                test_snp = test_snp_list[0]

                # pull out haplotype counts from linked heterozygous SNPs
                het_snps = get_het_snps(region_snps)
                set_snp_counts(data_files, region_list, het_snps, test_snp,
                               args)

            region_read_counts = get_region_read_counts(data_files,
                                                        region_list)

            write_output(sys.stdout, region_list, het_snps, test_snp,
                         snp_pos, region_read_counts,
                         genomewide_read_counts)

        sys.stderr.write("\n")
    finally:
        # release handles even when processing aborts part-way through
        if f is not None:
            f.close()
        data_files.close()