Example #1
    input_stream, process = utils_commands.get_output_stream_from_command(
        command)
    tmp_bam_file = output_bam_file = tmp + '_mrk_dup.bam.tmp'
    command = "%s view -bS - >%s" % (samtools_bin, tmp_bam_file)
    output_stream, process = utils_commands.get_input_stream_from_command(
        command)
    nb_reference = 0
    current_reference = None
    first_reads = {}
    second_reads = {}
    for line in input_stream:
        if line.startswith("@"):
            output_stream.write(line)
            continue
        sam_record = Sam_record(line)
        if (sam_record.get_reference_name() != current_reference
                and current_reference is not None):
            # process this consensus
            if current_reference != '*':
                nb_dups, nb_uniq = find_duplicates(first_reads, second_reads,
                                                   distance_threshold)
                total_nb_uniqs += nb_uniq
                total_nb_dups += nb_dups
                nb_fragment += len(second_reads)
            output_reads(output_stream, first_reads, second_reads)
            first_reads = {}
            second_reads = {}
        if sam_record.is_second_read():
            second_reads[sam_record.get_query_name()] = sam_record
        else:
            first_reads[sam_record.get_query_name()] = sam_record
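The `utils_commands.get_output_stream_from_command` and `get_input_stream_from_command` helpers are not shown in any of these excerpts. A minimal sketch of what they might look like, assuming they simply wrap `subprocess.Popen` around a shell command (the project's real helpers may differ):

import subprocess

def get_output_stream_from_command(command):
    # Hypothetical sketch: run `command` and expose its stdout as a
    # text stream the caller can iterate over line by line.
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE,
                               universal_newlines=True)
    return process.stdout, process

def get_input_stream_from_command(command):
    # Hypothetical sketch: run `command` and expose its stdin as a
    # writable text stream (here, the samtools SAM-to-BAM converter).
    process = subprocess.Popen(command, shell=True,
                               stdin=subprocess.PIPE,
                               universal_newlines=True)
    return process.stdin, process

Under this reading, the snippet above streams SAM text out of one samtools process and pipes the processed records into a second samtools process that writes the temporary BAM file.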
Example #2

# get the header
line = input_stream.readline()
while line.startswith("@"):
    output_stream.write(line)
    line = input_stream.readline()
 
while line:
    read1 = Sam_record(line)
    line = input_stream.readline()
    read2 = Sam_record(line)
    if read1.get_query_name() == read2.get_query_name():
        # make sure read1 holds the first read of the pair
        if read1.is_second_read() and read2.is_first_read():
            read1, read2 = read2, read1
        read2.set_reference_name(read1.get_reference_name())
        output_stream.write(str(read1))
        output_stream.write(str(read2))
    else:
        logging.critical("bam file is not sorted by read name")
        input_stream.close()
        output_stream.close()
        #os.remove(output_bam_file+'.bam')
        return
    line = input_stream.readline()
     
return_code = process_input.wait()
print(return_code)
if return_code != 0:
    sys.exit(return_code)
return_code = process_output.wait()
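None of these excerpts show the `Sam_record` class itself. A hypothetical minimal implementation of the interface the examples rely on, assuming standard SAM columns and flag bits (the real class in the source project may differ):

class Sam_record(object):
    """Hypothetical minimal wrapper around one SAM alignment line."""

    def __init__(self, line):
        self.fields = line.rstrip("\n").split("\t")
        self.flag = int(self.fields[1])

    def get_query_name(self):
        return self.fields[0]

    def get_reference_name(self):
        return self.fields[2]

    def set_reference_name(self, name):
        self.fields[2] = name

    def is_first_read(self):
        return bool(self.flag & 0x40)   # first segment in the template

    def is_second_read(self):
        return bool(self.flag & 0x80)   # last segment in the template

    def is_unmapped(self):
        return bool(self.flag & 0x4)

    def is_duplicate_read(self):
        return bool(self.flag & 0x400)  # PCR or optical duplicate

    def get_query_sequence(self):
        return self.fields[9]

    def get_tag(self, tag):
        # optional fields are TAG:TYPE:VALUE, e.g. RG:Z:sample1
        for field in self.fields[11:]:
            name, _type, value = field.split(":", 2)
            if name == tag:
                return value
        return None

    def __str__(self):
        return "\t".join(self.fields) + "\n"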
Example #3
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0
        # process the first read
        # if line.startswith("@"):
        #     # Still in the header. There's no read, exit
        #     return
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)] += 1
            all_sample_coverage[read_groups.get(rg_id)] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print(i)
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig is not None:
                for sample in read_groups.values():
                    all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                                all_sample_duplicate.get(sample), sample=sample)
                    all_sample_coverage[sample] = 0
                    all_sample_duplicate[sample] = 0
            current_contig = sam_record.get_reference_name()

            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1
        if current_contig is not None:
            for sample in read_groups.values():
                all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample), sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0
    finally:
        open_stream.close()
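The `@RG` loop above keeps the most specific label it can find for each read group: sample (`SM`) first, then library (`LB`), then the ID itself. For illustration, with a made-up header line the same mapping can be built compactly:

line = "@RG\tID:rg1\tSM:sampleA\tLB:lib1"  # made-up example header line
rg = dict(field.split(":", 1) for field in line.strip().split()[1:])
print(rg.get("SM") or rg.get("LB") or rg["ID"])  # sampleA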
Example #5
 
 command = "%s view -h %s"%(samtools_bin, bam_file)
 input_stream, process = utils_commands.get_output_stream_from_command(command)
 tmp_bam_file = output_bam_file = tmp + '_mrk_dup.bam.tmp'
 command = "%s view -bS - >%s"%(samtools_bin, tmp_bam_file)
 output_stream, process = utils_commands.get_input_stream_from_command(command)
 nb_reference=0
 current_reference=None
 first_reads={}
 second_reads={}
 for line in input_stream:
     if line.startswith("@"):
         output_stream.write(line)
         continue
     sam_record = Sam_record(line)
     if sam_record.get_reference_name()!=current_reference and not current_reference is None:
         #process this consensus
         if current_reference!='*':
             
             nb_dups, nb_uniq = find_duplicates(first_reads,second_reads, distance_threshold)
             total_nb_uniqs+=nb_uniq
             total_nb_dups+=nb_dups
             nb_fragment+=len(second_reads)
         output_reads(output_stream, first_reads, second_reads)
         first_reads={}
         second_reads={}
     if sam_record.is_second_read():
         second_reads[sam_record.get_query_name()]=sam_record
     else:
         first_reads[sam_record.get_query_name()]=sam_record
     nb_reference+=1
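This excerpt is truncated: the reads accumulated for the final reference are presumably flushed after the loop in the full function. The underlying pattern, grouping records on a sorted key, flushing on every key change, and flushing once more at the end, can be sketched on its own (names here are illustrative, not from the source project):

def group_and_flush(records, key, flush):
    # Sketch of the grouping pattern above: `records` must be sorted by `key`.
    current = None
    group = []
    for record in records:
        if key(record) != current and current is not None:
            flush(current, group)   # key changed: process the finished group
            group = []
        current = key(record)
        group.append(record)
    if current is not None:
        flush(current, group)       # don't forget the last group

# usage: print each reference name with its record count
group_and_flush(
    [("chr1", "r1"), ("chr1", "r2"), ("chr2", "r3")],
    key=lambda rec: rec[0],
    flush=lambda ref, grp: print(ref, len(grp)),
)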
Example #6
from collections import Counter, defaultdict

def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_coverage_reads = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = Counter()
            all_sample_duplicate[sample] = Counter()
            all_sample_coverage_reads[sample] = defaultdict(Counter)
        # process the first read
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            read_sequence = sam_record.get_query_sequence()
            loci = get_loci_from_read(sam_record)
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)][str(loci)] += 1
            all_sample_coverage[read_groups.get(rg_id)][str(loci)] += 1
            all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print(i)
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig is not None:
                for sample in read_groups.values():
                    for loci in all_sample_coverage.get(sample):
                        alleles = all_sample_coverage_reads[sample].get(loci)
                        all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                    all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                    sample=sample)
                    all_sample_coverage[sample] = Counter()
                    all_sample_duplicate[sample] = Counter()
                    all_sample_coverage_reads[sample] = defaultdict(Counter)
            current_contig = sam_record.get_reference_name()

            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                loci = get_loci_from_read(sam_record)
                read_sequence = sam_record.get_query_sequence()
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] += 1
        if current_contig is not None:
            for sample in read_groups.values():
                for loci in all_sample_coverage.get(sample):
                    alleles = all_sample_coverage_reads[sample].get(loci)
                    all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                sample=sample)
                all_sample_coverage[sample] = Counter()
                all_sample_duplicate[sample] = Counter()
                all_sample_coverage_reads[sample] = defaultdict(Counter)
    finally:
        open_stream.close()
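This variant tracks coverage and duplicates per locus rather than per contig, using a `collections.Counter` keyed by the stringified locus and a `defaultdict(Counter)` that maps each locus to per-allele read counts. A quick illustration of that nesting (values here are made up):

from collections import Counter, defaultdict

coverage = Counter()
alleles_by_locus = defaultdict(Counter)

# simulate three reads at the same locus, two sharing one sequence
for seq in ["ACGT", "ACGT", "ACGA"]:
    coverage["contig1:100"] += 1
    alleles_by_locus["contig1:100"][seq] += 1

print(coverage["contig1:100"])            # 3
print(alleles_by_locus["contig1:100"])    # Counter({'ACGT': 2, 'ACGA': 1})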