def bin_coordinates_through_genome(input_file, output_file, genome_file, bin_size):
    # Bin coordinates per chromosome, walking chromosomes in sorted name order and
    # keeping a running genome-wide offset (previous_bin) across chromosomes.
    #
    # input_file:  tab/space-delimited lines of "<chr> <position> ..."
    # output_file: one line per bin: "<chr>\t<local_start>\t<global_start>\t<value>"
    # genome_file: FASTA-like file loadable by GenomeLoader (gives chromosome length)
    # bin_size:    width of each bin in bases
    open_file=utils_logging.open_input_file(input_file)
    open_output=utils_logging.open_output_file(output_file)
    all_coordinates_per_chr={}
    genome_loader=GenomeLoader(genome_file)
    previous_bin=0
    all_chr=[]
    # First pass: group the integer coordinates (column 2) by chromosome (column 1),
    # recording each chromosome name the first time it is seen.
    for line in open_file:
        sp_line=line.split()
        all_coordinates=all_coordinates_per_chr.get(sp_line[0])
        if all_coordinates is None:
            all_chr.append(sp_line[0])
            all_coordinates=[]
            all_coordinates_per_chr[sp_line[0]]=all_coordinates
        all_coordinates.append(int(sp_line[1]))
    all_chr.sort()
    for chr in all_chr:
        header, sequence =genome_loader.get_chr(chr)
        # NOTE(review): chr is re-bound to the stripped FASTA header; this assumes the
        # header string matches the name used in the input file — verify with GenomeLoader.
        chr=header.strip()
        chr_len=len(sequence)
        all_coordinates=all_coordinates_per_chr.get(chr)
        all_bins=bin_value_from_array(all_coordinates, bin_size, chr_len)
        for bin,value in enumerate(all_bins):
            # NOTE(review): column 2 is the chromosome-local bin start while column 3
            # adds the cumulative genome offset; if a (start, end) pair was intended,
            # column 3 should probably be previous_bin + (bin+1)*bin_size — confirm.
            open_output.write('%s\t%s\t%s\t%s\n'%(chr, bin*bin_size, (bin*bin_size)+previous_bin, value))
        previous_bin+=len(all_bins)*bin_size
    open_output.close()
def print_distribution_holder(holder, output_file=None, textgraph=None, sort_by_weight=False, reverse=False, nb_bin=None):
    """Render a distribution holder as tab-separated text.

    holder         -- object exposing get_sorted_value_and_weight(reverse=),
                      get_binned_value(nb_bin=) and get_sorted_weight_and_value(reverse=)
    output_file    -- path to write to; when None the text is returned instead
    textgraph      -- when truthy, append an ASCII bar ('|' marks) scaled to the
                      largest weight after each value
    sort_by_weight -- sort rows by weight instead of by value
    reverse        -- reverse the chosen sort order
    nb_bin         -- when set (and not sorting by weight), bin the values first

    Returns output_file when one was given, otherwise the formatted string.
    """
    # Pick the (values, weights) pairing according to the requested ordering/binning.
    if not sort_by_weight:
        if nb_bin is None:
            values, weights = holder.get_sorted_value_and_weight(reverse=reverse)
        else:
            values, weights = holder.get_binned_value(nb_bin=nb_bin)
    else:
        values, weights = holder.get_sorted_weight_and_value(reverse=reverse)
    out = []
    # Write either to the file or into an in-memory list, through one callable.
    if output_file:
        open_output = utils_logging.open_output_file(output_file, pipe=False)
        function = open_output.write
    else:
        function = out.append
    if textgraph:
        # Bars are scaled so the largest weight spans `multiplier` marks; weights
        # below `multiplier` are drawn at natural (unscaled) length.
        multiplier = 200
        mark = '|'
        maximum = max(weights)
        if maximum < multiplier:
            maximum = multiplier
        for i in range(len(values)):
            function('%s\t%s %s\n' % (values[i], (mark * int(float(weights[i]) / maximum * multiplier)), weights[i]))
    else:
        for i in range(len(values)):
            function('%s\t%s\n' % (values[i], weights[i]))
    if output_file:
        open_output.close()
        to_return = output_file
    else:
        to_return = ''.join(out)
    return to_return
def shift_reads(bam_file, fasta_file, output_sam_file):
    # Rewrite a BAM stream as SAM against the sequences in fasta_file, shifting
    # each record through process_one_record and emitting a fresh header.
    #
    # bam_file:        input BAM, streamed via samtools (options='-h' keeps the header)
    # fasta_file:      reference loaded by load_new_fasta; presumably maps
    #                  name -> (header, sequence) — TODO confirm against load_new_fasta
    # output_sam_file: destination SAM (opened as a pipe)
    all_sequences = load_new_fasta(fasta_file)
    stream, process = utils.get_sam_stream(bam_file, options='-h')
    open_output = utils_logging.open_output_file(output_sam_file, pipe=True)
    open_output.write("@HD\tVN:1.0\tSO:unsorted\n")
    # Python 2 only: .values() returns a list that can be sorted in place.
    all_values = all_sequences.values()
    all_values.sort(key=lambda x: x[0])
    for header, sequence in all_values:
        open_output.write("@SQ\tSN:%s\tLN:%s\n" % (header, len(sequence)))
    # read the header to get the read groups
    # Copy @RG lines through, drop every other header line, and stop at the
    # first alignment record (which is handled just below, outside the loop).
    for line in stream:
        if line.startswith('@'):
            if line.startswith('@RG'):
                open_output.write("%s\n" % (line.strip()))
        else:
            break
    # `line` still holds the first alignment record that ended the header loop.
    sam_record = Sam_record(line)
    sam_record = process_one_record(sam_record, all_sequences)
    if sam_record:
        open_output.write(str(sam_record))
    # Remaining alignment records; process_one_record may return a falsy value
    # to drop a record.
    for line in stream:
        sam_record = Sam_record(line)
        sam_record = process_one_record(sam_record, all_sequences)
        if sam_record:
            open_output.write(str(sam_record))
    open_output.close()
def output_all_sites(all_sites, output_sites):
    """Dump every site as one tab-separated row, one column per entry of the
    module-level all_sites_headers list, preceded by a header row."""
    out = utils_logging.open_output_file(output_sites)
    out.write("sites\t%s\n" % ("\t".join(all_sites_headers)))
    for site_name in all_sites.keys():
        site = all_sites.get(site_name)
        # Missing keys come back as None and are stringified like any other value.
        columns = [str(site.get(key)) for key in all_sites_headers]
        out.write("%s\t%s\n" % (site_name, "\t".join(columns)))
    out.close()
def bin_coordinates(input_file, output_file, bin_size):
    """Group the coordinates of input_file (columns: chromosome, position) by
    chromosome, bin each group with bin_value_from_array, and write one line
    per bin: "<chromosome>\t<bin start>\t<value>"."""
    in_stream = utils_logging.open_input_file(input_file)
    out_stream = utils_logging.open_output_file(output_file)
    coords_by_chr = {}
    for line in in_stream:
        fields = line.split()
        # setdefault creates the per-chromosome list on first sight.
        coords_by_chr.setdefault(fields[0], []).append(int(fields[1]))
    for chromosome in coords_by_chr.keys():
        binned = bin_value_from_array(coords_by_chr.get(chromosome), bin_size)
        for index, count in enumerate(binned):
            out_stream.write('%s\t%s\t%s\n' % (chromosome, index * bin_size, count))
    out_stream.close()
def RAD_median_coverage(bam_files,output_file):
    # Stream an mpileup over all bam_files and accumulate one coverage
    # Distribution_holder per sample. Python 2 only (except ..., e / integer `/`).
    #
    # bam_files:   list of BAM paths piled up together in one samtools call
    # output_file: destination table; falls back to stdout when falsy
    try:
        pipeline_param=utils_param.get_pipeline_parameters()
        samtools_dir=pipeline_param.get_samtools_dir()
    except Config_file_error, e:
        #logging.exception('Config_file_error:')
        # Best-effort fallback: rely on samtools being on the PATH.
        logging.warning("You'll need to have samtools in your path")
        samtools_dir=''
    samtools_bin=os.path.join(samtools_dir,"samtools")
    bam_file_str=' '.join(bam_files)
    all_dists=[]
    # -d 100000 raises the per-sample depth cap; -A keeps anomalous read pairs.
    pileup_stream = get_mpileup_from_bam(bam_file_str, genome_file=None, samtools_bin=samtools_bin, options="-d 100000 -A")
    if output_file:
        open_output=utils_logging.open_output_file(output_file)
    else:
        open_output=sys.stdout
    bam_file_names=[]
    for file in bam_files:
        bam_file_names.append(os.path.basename(file))
    open_output.write("Consensus\t%s\n"%("\t".join(bam_file_names)))
    # Prime the holders from the first pileup line; each sample contributes a
    # 3-column group and its depth sits at column (i+1)*3.
    line = pileup_stream.readline()
    sp_line=line.strip().split()
    curr_contig=sp_line[0]
    # NOTE(review): integer division (Python 2 `/`); with N samples this creates
    # N-1 holders, which looks off by one against (i+1)*3 indexing — confirm.
    for i in range(len(sp_line)/3-1):
        all_dists.append(Distribution_holder());
        all_dists[i].add_value(sp_line[(i+1)*3])
    # NOTE(review): this loop parses each remaining line but does nothing with
    # it — the function body appears truncated here; verify against the original.
    for line in pileup_stream:
        sp_line=line.strip().split()
def output_all_sites(all_sites, output_sites):
    # NOTE(review): byte-for-byte duplicate of the earlier output_all_sites
    # definition in this file; this later definition silently shadows it.
    # One of the two should be deleted.
    #
    # Writes a header row followed by one tab-separated row per site, with one
    # column per entry of the module-level all_sites_headers list.
    open_file = utils_logging.open_output_file(output_sites)
    open_file.write("sites\t%s\n"%("\t".join(all_sites_headers)))
    for site_name in all_sites.keys():
        open_file.write("%s\t%s\n"%(site_name,"\t".join([str(all_sites.get(site_name).get(key)) for key in all_sites_headers])))
    open_file.close()