def overlapping_reads(reads, distance):
    """Yield lists of consecutive reads whose TSS are close to each other.

    Each read is compared with the read that immediately precedes it in
    `reads`. A new group is started whenever the chromosome changes or the
    TSS is more than `distance` away from the previous read's TSS. Because
    the comparison is always made against the previous read, a group can
    span more than `distance` in total.

    The grouping only looks at neighbouring reads, so `reads` is presumably
    expected to be sorted by chromosome and TSS -- confirm against callers.

    Args:
        reads: An iterable of reads, as understood by `bed12.get_tss` and
            `bed12.get_chrom`.
        distance: Largest TSS difference between two consecutive reads for
            them to stay in the same group.

    Yields:
        Non-empty lists of reads. Nothing is yielded for an empty input.
    """
    reads_list = []
    cur_tss = None
    cur_chrom = None
    for read in reads:
        tss = bed12.get_tss(read)
        chrom = bed12.get_chrom(read)
        # An empty group marks the very first read. Testing the group rather
        # than the truthiness of cur_tss keeps a legitimate TSS of 0 from
        # being mistaken for "no previous read".
        if reads_list and ((tss - cur_tss > distance) or (chrom != cur_chrom)):
            yield reads_list
            # Rebind instead of clearing so the list already handed to the
            # consumer is not modified.
            reads_list = []
        reads_list.append(read)
        cur_tss = tss
        cur_chrom = chrom
    if reads_list:
        yield reads_list
def print_read_to_bed12(reads):
    """Collapse a group of reads into one BED12 line written to stdout.

    The blocks of all the reads are merged with
    `bed12.merge_overlapping_blocks`. Chromosome, start and strand come from
    the read with the smallest start, and the score is the number of reads.
    It assumes that the reads are on the same TSS and contain fingerprint
    information in the read's name.

    Args:
        reads: A list of reads
    """
    # NOTE(review): this file contains a second, later definition of
    # print_read_to_bed12 that replaces this one when the module is loaded.
    merged_sizes, merged_starts = bed12.merge_overlapping_blocks(reads)

    # The leftmost read provides the coordinates of the merged record.
    leftmost = sorted(reads, key=bed12.get_start)[0]
    chrom = bed12.get_chrom(leftmost)
    start = bed12.get_start(leftmost)
    end = start + merged_starts[-1] + merged_sizes[-1]
    read_count = len(reads)
    strand = bed12.get_strand(leftmost)

    # The thick region is the first block on the plus strand and the last
    # block otherwise.
    if strand == '+':
        thick = (start, start + merged_sizes[0])
    else:
        thick = (end - merged_sizes[-1], end)

    name = ":".join(str(part) for part in (chrom, start, end, strand))
    fields = [
        chrom,
        start,
        end,
        name,
        read_count,
        strand,
        thick[0],
        thick[1],
        "255,0,0",
        len(merged_sizes),
        ','.join(str(size) for size in merged_sizes),
        ','.join(str(offset) for offset in merged_starts),
    ]
    print('\t'.join(str(field) for field in fields))
def print_read_to_bed12(reads):
    """Merge reads block-wise and print the result as a single BED12 record.

    The record is written to stdout. It assumes that the reads are on the
    same TSS and contain fingerprint information in the read's name.

    Output columns, in order: chrom, start, end, name ("chrom:start:end:strand"),
    score (number of reads), strand, thick start, thick end, color,
    block count, block sizes, block starts.

    Args:
        reads: A list of reads
    """
    sizes, offsets = bed12.merge_overlapping_blocks(reads)

    # bed12: coordinates are anchored on the read with the smallest start
    by_start = sorted(reads, key=bed12.get_start)
    anchor = by_start[0]
    chrom = bed12.get_chrom(anchor)
    start = bed12.get_start(anchor)
    end = start + offsets[-1] + sizes[-1]
    score = len(reads)
    strand = bed12.get_strand(anchor)

    # thick part: first block for '+', last block for anything else
    thick_start, thick_end = (
        (start, start + sizes[0]) if strand == '+' else (end - sizes[-1], end)
    )

    color = "255,0,0"
    block_count = len(sizes)
    sizes_field = ','.join(map(str, sizes))
    offsets_field = ','.join(map(str, offsets))
    name = ":".join(map(str, [chrom, start, end, strand]))

    columns = (
        chrom, start, end, name, score, strand,
        thick_start, thick_end, color,
        block_count, sizes_field, offsets_field,
    )
    print('\t'.join(map(str, columns)))
def main():
    """Merge reads that share a TSS, chromosome and fingerprint.

    Input is either one bed file (-f) or every file found directly inside a
    directory (-d; sub-directories are not visited). The reads are split by
    strand into temporary files. The plus-strand file is sorted with the Unix
    `sort` command by column 1 then numerically by column 2, the minus-strand
    file by column 1 then numerically by column 3. The two sorted files are
    concatenated, consecutive reads with the same `bed12.get_tss` value are
    grouped, and inside each group the reads sharing the same
    (chromosome, fingerprint) pair are merged and printed on stdout with
    `print_read_to_bed12`. Reads whose strand is neither '+' nor '-' are
    dropped.

    Raises:
        subprocess.CalledProcessError: if `sort` or `cat` exits with a
            non-zero status.
    """
    # NOTE(review): another definition of main() appears later in this file
    # and takes precedence once the module is loaded.

    # PARSER TODO: move this code somewhere else
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-d",
        "--directory",
        help="absolute path of the folder containing the bed files")
    group.add_argument("-f", "--file", help="a bed file")
    # NOTE(review): -o is accepted but never read below; the output always
    # goes to stdout.
    parser.add_argument(
        "-o",
        help='name of the output file. Only works if the script is called '
        'with the -f option, ignored otherwise.')
    args = parser.parse_args()

    if args.directory:
        try:
            # Only the first entry of the walk is used: the files located
            # directly in the directory.
            path, _, files = next(os.walk(args.directory))
        except StopIteration:
            # os.walk yields nothing for a directory it cannot list.
            parser.error("cannot read directory: %s" % args.directory)
    elif args.file:
        path = ''
        files = [args.file]
    else:
        parser.error("the path given to -d/-f must not be empty")
    # ENDPARSER

    tmp_dir = tempfile.mkdtemp()
    try:
        plus_path = os.path.join(tmp_dir, '+')
        minus_path = os.path.join(tmp_dir, '-')
        plus_sorted_path = os.path.join(tmp_dir, '+sorted')
        minus_sorted_path = os.path.join(tmp_dir, '-sorted')
        plus_and_minus_sorted_path = os.path.join(tmp_dir, '+-s')

        # create two temporary bed files containing either the reads on the
        # plus strand or the reads on the minus strand
        with open(plus_path, 'w') as plus_out, open(minus_path, 'w') as minus_out:
            for file_name in files:
                with open(os.path.join(path, file_name)) as bed_file:
                    for read in csv.reader(bed_file, delimiter='\t'):
                        strand = bed12.get_strand(read)
                        if strand == '+':
                            plus_out.write('\t'.join(read) + '\n')
                        elif strand == '-':
                            minus_out.write('\t'.join(read) + '\n')

        # check_call instead of call: a failed sort/cat would otherwise
        # silently produce truncated or empty output.
        # plus strand: sort by chromosome, then by column 2
        with open(plus_sorted_path, 'w') as outfile:
            subprocess.check_call(
                ['sort', '-k1,1', '-k2,2n', plus_path], stdout=outfile)

        # minus strand: sort by chromosome, then by column 3
        with open(minus_sorted_path, 'w') as outfile:
            subprocess.check_call(
                ['sort', '-k1,1', '-k3,3n', minus_path], stdout=outfile)

        # concatenate the two sorted files
        with open(plus_and_minus_sorted_path, 'w') as outfile:
            subprocess.check_call(
                ['cat', plus_sorted_path, minus_sorted_path], stdout=outfile)

        with open(plus_and_minus_sorted_path) as bedfile:
            reader = csv.reader(bedfile, delimiter='\t')
            # for each run of consecutive reads on the same tss
            for _, same_tss_reads in itertools.groupby(reader, bed12.get_tss):
                # group the reads by chr and fingerprint
                by_fingerprint = defaultdict(list)
                for read in same_tss_reads:
                    key = (bed12.get_chrom(read), get_fingerprint(read))
                    by_fingerprint[key].append(read)

                # merge and print the reads that have same tss and
                # fingerprint; print_read_to_bed12 as defined in this file
                # takes only the list of reads, so the key is not passed.
                for same_fingerprint_reads in by_fingerprint.values():
                    print_read_to_bed12(same_fingerprint_reads)
    finally:
        # always remove the temporary directory, even when a step failed
        shutil.rmtree(tmp_dir)
def main():
    """Merge reads that share a TSS, chromosome, barcode and fingerprint.

    Input is either one bed file (-f) or every file found directly inside a
    directory (-d; sub-directories are not visited). The reads are written to
    one temporary file per strand. The plus-strand file is sorted numerically
    on column 2 and the minus-strand file numerically on column 3 with the
    Unix `sort` command, and the two results are concatenated. Consecutive
    reads with the same `bed12.get_tss` value are grouped; inside each group
    the reads sharing the same (chromosome, barcode, fingerprint) triple are
    merged and printed on stdout with `print_read_to_bed12`. Reads whose
    strand is neither "+" nor "-" are dropped.

    Raises:
        subprocess.CalledProcessError: if `sort` or `cat` exits with a
            non-zero status.
    """
    # PARSER TODO: move this code somewhere else
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-d", "--directory", help="absolute path of the folder containing the bed files")
    group.add_argument("-f", "--file", help="a bed file")
    # NOTE(review): -o is accepted but never read below; the output always
    # goes to stdout.
    parser.add_argument(
        "-o",
        help="name of the output file. Only works if the script is called "
        "with the -f option, ignored otherwise.",
    )
    args = parser.parse_args()

    if args.directory:
        try:
            # Only the first entry of the walk is used: the files located
            # directly in the directory.
            path, _, files = next(os.walk(args.directory))
        except StopIteration:
            # os.walk yields nothing for a directory it cannot list.
            parser.error("cannot read directory: %s" % args.directory)
    elif args.file:
        path = ""
        files = [args.file]
    else:
        parser.error("the path given to -d/-f must not be empty")
    # ENDPARSER

    tmp_dir = tempfile.mkdtemp()
    try:
        plus_path = os.path.join(tmp_dir, "+")
        minus_path = os.path.join(tmp_dir, "-")
        plus_sorted_path = os.path.join(tmp_dir, "+sorted")
        minus_sorted_path = os.path.join(tmp_dir, "-sorted")
        plus_and_minus_sorted_path = os.path.join(tmp_dir, "+-s")

        # create two temporary bed files containing either the reads on the
        # plus strand or the reads on the minus strand
        with open(plus_path, "w") as plus_out, open(minus_path, "w") as minus_out:
            for file_name in files:
                with open(os.path.join(path, file_name)) as bed_file:
                    for read in csv.reader(bed_file, delimiter="\t"):
                        strand = bed12.get_strand(read)
                        if strand == "+":
                            plus_out.write("\t".join(read) + "\n")
                        elif strand == "-":
                            minus_out.write("\t".join(read) + "\n")

        # Each entry is (sort key, input, output). The chromosome is not part
        # of the sort key here; it is taken into account later through the
        # grouping key.
        sort_jobs = (
            ("-k2,2n", plus_path, plus_sorted_path),
            ("-k3,3n", minus_path, minus_sorted_path),
        )
        for sort_key, unsorted_path, sorted_path in sort_jobs:
            with open(sorted_path, "w") as outfile:
                # check_call instead of call: a failed sort would otherwise
                # silently produce truncated or empty output.
                subprocess.check_call(["sort", sort_key, unsorted_path], stdout=outfile)

        # concatenate the two sorted files
        with open(plus_and_minus_sorted_path, "w") as outfile:
            subprocess.check_call(["cat", plus_sorted_path, minus_sorted_path], stdout=outfile)

        with open(plus_and_minus_sorted_path) as bedfile:
            reader = csv.reader(bedfile, delimiter="\t")
            # for each run of consecutive reads on the same tss
            for _, same_tss_reads in itertools.groupby(reader, bed12.get_tss):
                # group the reads by chr, barcode and fingerprint
                by_key = defaultdict(list)
                for read in same_tss_reads:
                    key = (bed12.get_chrom(read), get_barcode(read), get_fingerprint(read))
                    by_key[key].append(read)

                # merge and print the reads that have similar tss, barcode
                # and fingerprint; print_read_to_bed12 as defined in this
                # file takes only the list of reads, so the key is not passed.
                for same_key_reads in by_key.values():
                    print_read_to_bed12(same_key_reads)
    finally:
        # always remove the temporary directory, even when a step failed
        shutil.rmtree(tmp_dir)