Example #1
def bin_coordinates_through_genome(input_file, output_file, genome_file, bin_size):
    """Bin coordinates per chromosome, also reporting cumulative genome-wide offsets."""
    open_file = utils_logging.open_input_file(input_file)
    open_output = utils_logging.open_output_file(output_file)
    all_coordinates_per_chr = {}
    genome_loader = GenomeLoader(genome_file)
    genome_offset = 0
    all_chr = []
    # Gather the coordinates (column 2), grouped by chromosome (column 1).
    for line in open_file:
        sp_line = line.split()
        all_coordinates = all_coordinates_per_chr.get(sp_line[0])
        if all_coordinates is None:
            all_chr.append(sp_line[0])
            all_coordinates = []
            all_coordinates_per_chr[sp_line[0]] = all_coordinates
        all_coordinates.append(int(sp_line[1]))
    all_chr.sort()
    for chr in all_chr:
        header, sequence = genome_loader.get_chr(chr)
        chr = header.strip()
        chr_len = len(sequence)
        all_coordinates = all_coordinates_per_chr.get(chr)
        all_bins = bin_value_from_array(all_coordinates, bin_size, chr_len)
        for bin, value in enumerate(all_bins):
            # Third column is the bin start shifted by the cumulative length of all
            # previously written chromosomes, giving a genome-wide coordinate.
            open_output.write('%s\t%s\t%s\t%s\n' % (chr, bin * bin_size, (bin * bin_size) + genome_offset, value))
        genome_offset += len(all_bins) * bin_size
    open_output.close()
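The helper `bin_value_from_array` is not shown on this page; below is a minimal, hypothetical sketch of a compatible implementation, assuming it counts how many coordinates fall into each fixed-width bin (the signature is inferred from the two call sites in Examples #1 and #5):

def bin_value_from_array(all_coordinates, bin_size, chr_len=None):
    # Hypothetical reimplementation; the real helper belongs to the
    # surrounding codebase and is not shown in these examples.
    if chr_len is None:
        chr_len = max(all_coordinates) + 1
    nb_bins = (chr_len + bin_size - 1) // bin_size  # ceil(chr_len / bin_size)
    all_bins = [0] * nb_bins
    for coordinate in all_coordinates:
        # Clamp coordinates that fall past the declared chromosome length.
        all_bins[min(coordinate // bin_size, nb_bins - 1)] += 1
    return all_bins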
Example #2
def print_distribution_holder(holder, output_file=None, textgraph=None, sort_by_weight=False, reverse=False, nb_bin=None):
    if not sort_by_weight:
        if nb_bin is None:
            values, weights = holder.get_sorted_value_and_weight(reverse=reverse)
        else:
            values, weights = holder.get_binned_value(nb_bin=nb_bin)
    else:
        values, weights = holder.get_sorted_weight_and_value(reverse=reverse)

    # Write either to the given file or accumulate into a string.
    out = []
    if output_file:
        open_output = utils_logging.open_output_file(output_file, pipe=False)
        function = open_output.write
    else:
        function = out.append
    if textgraph:
        # Draw a text histogram: one bar of '|' marks per value,
        # at most `multiplier` characters wide.
        multiplier = 200
        mark = '|'
        maximum = max(weights)
        if maximum < multiplier:
            maximum = multiplier
        for i in range(len(values)):
            function('%s\t%s %s\n' % (values[i], mark * int(float(weights[i]) / maximum * multiplier), weights[i]))
    else:
        for i in range(len(values)):
            function('%s\t%s\n' % (values[i], weights[i]))
    if output_file:
        open_output.close()
        to_return = output_file
    else:
        to_return = ''.join(out)
    return to_return
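A minimal usage sketch, assuming a `Distribution_holder` that exposes `add_value` plus the accessors called above:

# Hypothetical usage; Distribution_holder is assumed to expose add_value()
# and the get_sorted_*/get_binned_value accessors used above.
holder = Distribution_holder()
for depth in [3, 7, 7, 12, 12, 12, 40]:
    holder.add_value(depth)
print(print_distribution_holder(holder))                  # value/weight table
print(print_distribution_holder(holder, textgraph=True))  # '|' bar chart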
Example #3
def shift_reads(bam_file, fasta_file, output_sam_file):
    all_sequences = load_new_fasta(fasta_file)
    stream, process = utils.get_sam_stream(bam_file, options='-h')
    open_output = utils_logging.open_output_file(output_sam_file, pipe=True)
    # Write a fresh SAM header using the sequences from the new fasta file.
    open_output.write("@HD\tVN:1.0\tSO:unsorted\n")
    all_values = sorted(all_sequences.values(), key=lambda x: x[0])
    for header, sequence in all_values:
        open_output.write("@SQ\tSN:%s\tLN:%s\n" % (header, len(sequence)))
    # Copy the read-group lines from the original header, then process every
    # non-header line as a SAM record.
    for line in stream:
        if line.startswith('@'):
            if line.startswith('@RG'):
                open_output.write("%s\n" % (line.strip()))
            continue
        sam_record = Sam_record(line)
        sam_record = process_one_record(sam_record, all_sequences)
        if sam_record:
            open_output.write(str(sam_record))
    open_output.close()
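`load_new_fasta`, `Sam_record`, and `process_one_record` come from the surrounding codebase. Below is a plausible sketch of `load_new_fasta` only, assuming it maps each sequence name to a `(header, sequence)` tuple as the `@SQ` loop above expects:

def load_new_fasta(fasta_file):
    # Hypothetical sketch: map sequence name -> (header, sequence).
    all_sequences = {}
    header = None
    chunks = []
    with open(fasta_file) as handle:
        for line in handle:
            if line.startswith('>'):
                if header:
                    all_sequences[header] = (header, ''.join(chunks))
                header = line[1:].strip().split()[0]
                chunks = []
            else:
                chunks.append(line.strip())
    if header:
        all_sequences[header] = (header, ''.join(chunks))
    return all_sequences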
Example #4
def output_all_sites(all_sites, output_sites):
    open_file = utils_logging.open_output_file(output_sites)
    # all_sites_headers is a module-level list naming the per-site fields.
    open_file.write("sites\t%s\n" % ("\t".join(all_sites_headers)))
    for site_name, site_info in all_sites.items():
        open_file.write("%s\t%s\n" % (site_name, "\t".join([
            str(site_info.get(key)) for key in all_sites_headers
        ])))
    open_file.close()
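A minimal usage sketch, assuming `all_sites` maps site names to per-site field dicts and the module-level `all_sites_headers` names the columns to write:

# Hypothetical data shapes; both names and values are for illustration only.
all_sites_headers = ['coverage', 'quality']
all_sites = {
    'site_1': {'coverage': 12, 'quality': 37},
    'site_2': {'coverage': 8, 'quality': 41},
}
output_all_sites(all_sites, 'sites.tsv')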
Example #5
def bin_coordinates(input_file, output_file, bin_size):
    open_file = utils_logging.open_input_file(input_file)
    open_output = utils_logging.open_output_file(output_file)
    all_coordinates_per_chr = {}
    # Gather the coordinates (column 2), grouped by chromosome (column 1).
    for line in open_file:
        sp_line = line.split()
        all_coordinates = all_coordinates_per_chr.get(sp_line[0])
        if all_coordinates is None:
            all_coordinates = []
            all_coordinates_per_chr[sp_line[0]] = all_coordinates
        all_coordinates.append(int(sp_line[1]))

    # Sort chromosomes so the output order is deterministic.
    for chr in sorted(all_coordinates_per_chr):
        all_coordinates = all_coordinates_per_chr.get(chr)
        all_bins = bin_value_from_array(all_coordinates, bin_size)
        for bin, value in enumerate(all_bins):
            open_output.write('%s\t%s\t%s\n' % (chr, bin * bin_size, value))
    open_output.close()
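A minimal usage sketch, assuming `utils_logging.open_input_file` accepts a plain file path and the input holds one whitespace-separated "chromosome position" pair per line:

# Hypothetical input file; values chosen only to illustrate the binning.
with open('positions.txt', 'w') as handle:
    handle.write('chr1 150\nchr1 175\nchr1 900\nchr2 40\n')
bin_coordinates('positions.txt', 'binned.txt', bin_size=500)
# binned.txt now holds one line per bin: chromosome, bin start, count, e.g.
# chr1    0      2
# chr1    500    1
# chr2    0      1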
Example #6
def RAD_median_coverage(bam_files, output_file):
    try:
        pipeline_param = utils_param.get_pipeline_parameters()
        samtools_dir = pipeline_param.get_samtools_dir()
    except Config_file_error:
        logging.warning("You'll need to have samtools in your path")
        samtools_dir = ''
    samtools_bin = os.path.join(samtools_dir, "samtools")
    bam_file_str = ' '.join(bam_files)
    all_dists = []
    pileup_stream = get_mpileup_from_bam(bam_file_str, genome_file=None, samtools_bin=samtools_bin,
                                         options="-d 100000 -A")
    if output_file:
        open_output = utils_logging.open_output_file(output_file)
    else:
        open_output = sys.stdout
    bam_file_names = []
    for file in bam_files:
        bam_file_names.append(os.path.basename(file))
    open_output.write("Consensus\t%s\n" % ("\t".join(bam_file_names)))
    # mpileup prints three leading columns (contig, position, reference base)
    # followed by three columns per sample, the first of which is the depth.
    line = pileup_stream.readline()
    sp_line = line.strip().split()
    curr_contig = sp_line[0]
    for i in range(len(sp_line) // 3 - 1):
        all_dists.append(Distribution_holder())
        all_dists[i].add_value(sp_line[(i + 1) * 3])

    for line in pileup_stream:
        sp_line = line.strip().split()
        # Accumulate each sample's depth column into its distribution holder.
        for i in range(len(sp_line) // 3 - 1):
            all_dists[i].add_value(sp_line[(i + 1) * 3])
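`Distribution_holder` is another helper from the same codebase; below is a minimal, hypothetical stand-in, assuming it accumulates values and can report their median (the method names here are guesses, not the real API):

class Distribution_holder(object):
    # Hypothetical, minimal stand-in; the real class lives in the
    # surrounding codebase and offers more accessors.
    def __init__(self):
        self.values = []

    def add_value(self, value):
        self.values.append(int(value))

    def get_median(self):
        # Lower-middle element for even-sized lists; 0 when empty.
        ordered = sorted(self.values)
        return ordered[len(ordered) // 2] if ordered else 0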