def extract_rna_info(chrom_info_file, raw_allelic_counts_dir, genotype_dir,
                     time_step, target_regions_dir):
    """Sum per-individual counts for one time step into a CombinedFiles object.

    Args:
        chrom_info_file: passed to chromosome.get_all_chromosomes().
        raw_allelic_counts_dir: handed to both CombinedFiles and CountFiles.
        genotype_dir: string prefix for 'all_genotyped_samples.txt',
            'snp_tab.h5', 'snp_index.h5' and 'haps.h5'. It is concatenated
            as-is (no path separator is inserted).
        time_step: appended (via str()) to each individual's identifier to
            form the sample id, and to the 'rna_seq_samples_' file name.
        target_regions_dir: string prefix for the
            'rna_seq_samples_<time_step>.txt' file; concatenated as-is.

    Returns:
        The CombinedFiles object after every individual has been added.
        It is not closed here.
    """
    # make dictionary of identifier => index mapping
    samp_idx = get_samples_index(genotype_dir + 'all_genotyped_samples.txt')

    # Initialize chromosome objects
    chrom_list = chromosome.get_all_chromosomes(chrom_info_file)

    # NOTE(review): snp_files is never closed in this function. Confirm
    # whether the returned combined_files still needs it before adding a
    # close() call here.
    snp_files = SNPFiles(genotype_dir + 'snp_tab.h5',
                         genotype_dir + 'snp_index.h5',
                         genotype_dir + 'haps.h5')

    # STEP 1: make combined HDF5 files of AS counts,
    # total mapped read counts, and genotype counts
    individuals = get_individual_array(
        target_regions_dir + 'rna_seq_samples_' + str(time_step) + '.txt')
    combined_files = CombinedFiles(raw_allelic_counts_dir, chrom_list,
                                   time_step)

    for ind in individuals:
        print(ind)
        sample_id = ind + '_' + str(time_step)
        count_files = CountFiles(raw_allelic_counts_dir, sample_id)
        try:
            # The index lookup sits inside the try so that an unknown
            # individual does not leak the count files that were just opened.
            ind_idx = samp_idx[ind]
            combined_files.add_counts(chrom_list, count_files, snp_files,
                                      ind_idx)
        finally:
            count_files.close()

    return combined_files
def main():
    """Sum counts across individuals, then write the target-region list.

    Output goes to args.output_file (gzip-compressed when the name ends in
    '.gz') or to stdout when no output file is given. Progress messages are
    written to stderr.
    """
    sys.stderr.write("cmd: %s\n" % " ".join(sys.argv))

    args = parse_args()

    if args.output_file:
        if args.output_file.endswith(".gz"):
            out_f = gzip.open(args.output_file, "wt")
        else:
            out_f = open(args.output_file, "wt")
    else:
        out_f = sys.stdout

    try:
        # make dictionary of identifier => index mapping
        samp_idx = get_samples_index(args)

        # read individuals
        individuals = read_individuals(args, samp_idx)

        chrom_list = chromosome.get_all_chromosomes(args.chrom)

        combined_files = CombinedFiles(OUTPUT_DIR, chrom_list)
        try:
            snp_files = SNPFiles(args)
            try:
                # STEP 1: make combined HDF5 files of AS counts,
                # total mapped read counts, and genotype counts
                sys.stderr.write("summing genotypes and read counts "
                                 "across individuals\n")
                for ind in individuals:
                    # open count files for this individual
                    sys.stderr.write("individual: %s\n" % ind)
                    count_files = CountFiles(args.read_count_dir, ind)
                    try:
                        ind_idx = samp_idx[ind]
                        # add counts to combined totals
                        combined_files.add_counts(chrom_list, count_files,
                                                  snp_files, ind_idx)
                    finally:
                        count_files.close()

                # STEP 2: generate list of target regions centered on
                # test SNPs:
                sys.stderr.write("generating list of target regions\n")
                write_target_regions(out_f, args, chrom_list,
                                     combined_files, snp_files)
            finally:
                snp_files.close()
        finally:
            combined_files.close()
    finally:
        # Close the output explicitly so buffered (and gzip-trailer) data
        # is flushed; never close the process-wide stdout.
        if out_f is not sys.stdout:
            out_f.close()
def main():
    """Write a FASTA-style record to stdout for every exon in the exon file.

    Each non-comment line of options.exon_file is whitespace-split into:
    gene id, gene name, chromosome name, exon number, start, end, strand.
    Exons shorter than options.min_size are extended by the same amount on
    both sides before the sequence is fetched from the hg19 "seq" track.
    """
    options = parse_options()

    util.info.write_info(sys.stdout, options)

    gdb = genome.db.GenomeDB(assembly="hg19")
    seq_track = gdb.open_track("seq")

    # Not used below; kept so options.chrom_file is still read as before.
    chrom_dict = chromosome.get_chromosome_dict(options.chrom_file)

    with open(options.exon_file) as f:
        for line in f:
            if line.startswith("#"):
                continue

            words = line.split()

            # gene_id and strand are parsed but not used below.
            gene_id = words[0]
            gene_name = words[1]
            chrom_name = words[2]
            exon_num = int(words[3])
            start = int(words[4])
            end = int(words[5])
            strand = int(words[6])

            exon_len = end - start + 1

            if exon_len < options.min_size:
                # Floor division keeps the coordinates integral on Python 3
                # (plain "/" would turn start/end into floats). When the
                # shortfall is odd the result is one base short of min_size,
                # matching the original integer arithmetic.
                pad = (options.min_size - exon_len) // 2
                start = start - pad
                end = end + pad
                sys.stderr.write("extended exon from %d bp to %d bp\n" %
                                 (exon_len, end - start + 1))

            seq_str = seq_track.get_seq_str(chrom_name, start, end)

            sys.stdout.write(">%s exon %d\n%s\n" %
                             (gene_name, exon_num, seq_str))
def main():
    """Write gene and merged-exon tables for the genes on options.chrom.

    Genes are read from options.gff, restricted to the region given by
    options.chrom / options.start / options.end. One line per gene goes to
    options.gene_output_filename and one line per merged exon goes to
    options.exon_output_filename. Exons are numbered from 1 within each
    gene; for genes with strand == -1 the merged exon list is reversed
    before numbering.
    """
    options = parse_options()

    # "with" guarantees both outputs are flushed and closed even if reading
    # the GFF or writing a record raises.
    with open(options.exon_output_filename, "w") as exon_out_f, \
            open(options.gene_output_filename, "w") as gene_out_f:
        util.info.write_info(exon_out_f, options)
        util.info.write_info(gene_out_f, options)

        chrom_dict = chromosome.get_chromosome_dict(options.chrom_file)

        # Only the per-chromosome gene lists are needed here.
        _, _, gene_chrom_dict, _ = gff.read_gff(options.gff, chrom_dict,
                                                region_chrom=options.chrom,
                                                region_start=options.start,
                                                region_end=options.end)

        genes = gene_chrom_dict[options.chrom]
        for gene_num, gene in enumerate(genes, start=1):
            exons = gene.get_merged_exons()

            gene_out_f.write("%s %s %s %d %d %d %d\n" %
                             (gene.gene_id, gene.gene_name, gene.chrom.name,
                              gene_num, gene.start, gene.end, gene.strand))

            if gene.strand == -1:
                exons = exons[::-1]

            for exon_num, ex in enumerate(exons, start=1):
                exon_out_f.write("%s %s %s %d %d %d %d\n" %
                                 (gene.gene_id, gene.gene_name,
                                  gene.chrom.name, exon_num,
                                  ex.start, ex.end, gene.strand))
def main():
    """Write haplotype/region read counts to stdout for each input line.

    Each non-comment line of args.input_file is whitespace-split; column 0
    is the chromosome name, column 1 the test SNP position (or "NA" when no
    SNP is defined, in which case an NA line is written), and columns 3 and
    4 the SNP ref/alt bases. A "." is written to stderr every 1000 lines as
    a progress indicator.

    Raises:
        KeyError: if a chromosome name is not present in the chromosome
            dictionary built from args.chrom.
    """
    args = parse_args()

    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)
    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        if args.input_file.endswith(".gz"):
            # Text mode is required: gzip.open() defaults to binary, which
            # yields bytes and breaks the str comparisons below.
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file)

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        with f:
            for line_count, line in enumerate(f, start=1):
                if line_count % 1000 == 0:
                    sys.stderr.write(".")

                if line.startswith("#"):
                    continue

                words = line.rstrip().split()

                if words[1] == "NA":
                    # no SNP defined on this line:
                    write_NA_line(sys.stdout)
                    continue

                chrom_name = words[0]
                chrom = chrom_dict[chrom_name]

                region_list = get_target_regions(args, chrom, words)

                snp_pos = int(words[1])
                snp_ref_base = words[3]
                snp_alt_base = words[4]
                # TODO: check that SNP ref/alt match?

                snp_region = coord.Coord(chrom, snp_pos, snp_pos)

                # pull out all of the SNPs in the target region(s)
                region_snps = get_region_snps(data_files, region_list,
                                              ind_idx)

                # pull out test SNP
                test_snp_list = get_region_snps(data_files, [snp_region],
                                                ind_idx)

                if len(test_snp_list) != 1:
                    test_snp = None
                    sys.stderr.write("WARNING: could not find test SNP at "
                                     "position %s:%d\n" %
                                     (chrom.name, snp_pos))
                    het_snps = []
                else:
                    test_snp = test_snp_list[0]

                    # pull out haplotype counts from linked heterozygous SNPs
                    het_snps = get_het_snps(region_snps)
                    set_snp_counts(data_files, region_list, het_snps,
                                   test_snp, args)

                region_read_counts = get_region_read_counts(data_files,
                                                            region_list)

                write_output(sys.stdout, region_list, het_snps, test_snp,
                             snp_pos, region_read_counts,
                             genomewide_read_counts)

        # terminate the line of progress dots
        sys.stderr.write("\n")
    finally:
        data_files.close()
def main():
    """Write haplotype/region read counts to stdout for each input line.

    Each non-comment line of args.input_file is whitespace-split; column 0
    is the chromosome name, column 1 the test SNP position (or "NA" when no
    SNP is defined, in which case an NA line is written), and columns 3 and
    4 the SNP ref/alt bases. A "." is written to stderr every 1000 lines as
    a progress indicator.

    Chromosome names missing from the chromosome dictionary are retried
    with a "chr" prefix. Lines whose chromosome still cannot be resolved
    are skipped, with one warning per distinct name on stderr.
    """
    args = parse_args()

    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)
    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        # chromosome names already warned about, so each is reported once
        unknown_chrom = set()

        if util.is_gzipped(args.input_file):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "r")

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        with f:
            for line_count, line in enumerate(f, start=1):
                if line_count % 1000 == 0:
                    sys.stderr.write(".")

                if line.startswith("#"):
                    continue

                words = line.rstrip().split()

                if words[1] == "NA":
                    # no SNP defined on this line:
                    write_NA_line(sys.stdout)
                    continue

                chrom_name = words[0]
                if (chrom_name not in chrom_dict
                        and not chrom_name.startswith("chr")):
                    # try adding 'chr' to front of name
                    new_chrom_name = "chr" + chrom_name
                    if new_chrom_name in chrom_dict:
                        chrom_name = new_chrom_name

                if chrom_name not in chrom_dict:
                    # Can't figure out this chromosome name. Skip the line
                    # in every such case (including names that already
                    # start with "chr") so a stale `chrom` from a previous
                    # line is never reused.
                    if chrom_name not in unknown_chrom:
                        unknown_chrom.add(chrom_name)
                        sys.stderr.write("WARNING: unknown chromosome "
                                         "'%s'\n" % chrom_name)
                    continue

                chrom = chrom_dict[chrom_name]

                region_list = get_target_regions(args, chrom, words)

                snp_pos = int(words[1])
                snp_ref_base = words[3]
                snp_alt_base = words[4]
                # TODO: check that SNP ref/alt match?

                snp_region = coord.Coord(chrom, snp_pos, snp_pos)

                # pull out all of the SNPs in the target region(s)
                region_snps = get_region_snps(data_files, region_list,
                                              ind_idx)

                # pull out test SNP
                test_snp_list = get_region_snps(data_files, [snp_region],
                                                ind_idx)

                if len(test_snp_list) != 1:
                    test_snp = None
                    sys.stderr.write("WARNING: could not find test SNP at "
                                     "position %s:%d\n" %
                                     (chrom.name, snp_pos))
                    het_snps = []
                else:
                    test_snp = test_snp_list[0]

                    # pull out haplotype counts from linked heterozygous SNPs
                    het_snps = get_het_snps(region_snps)
                    set_snp_counts(data_files, region_list, het_snps,
                                   test_snp, args)

                region_read_counts = get_region_read_counts(data_files,
                                                            region_list)

                write_output(sys.stdout, region_list, het_snps, test_snp,
                             snp_pos, region_read_counts,
                             genomewide_read_counts)

        # terminate the line of progress dots
        sys.stderr.write("\n")
    finally:
        data_files.close()