Example #1
def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference):
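    # Build, per sample, a tally of the allele strings observed across the VCF
    # positions of this contig, then flag the contig as invalid if any sample
    # shows more than two allele strings or more than four are seen in total.
    # Note: positions are visited in dict order; sort them if a stable allele
    # string is required. The genotype-quality filter below is left disabled.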
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
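    # -F 1028 excludes unmapped reads (0x4) and PCR/optical duplicates (0x400);
    # bam_file is not a parameter and is assumed to be a module-level global.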
    command = "samtools view -F 1028 %s %s" % (bam_file, curr_reference)
    stream, process = utils_commands.get_output_stream_from_command(command)
    for line in stream:
        sam_record = Sam_record(line)
        allele_array = []
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        for position in vcf_record_in_one_contig.keys():
            #if vcf_record_in_one_contig.get(position).get_genotype_quality(sample)>20:
            allele_array.append(sequence[position - 1])
            #else:
            #    allele_array.append('.')
        count_with_hash(sample_to_allele[sample], ''.join(allele_array))
        count_with_hash(sample_to_allele['all'], ''.join(allele_array))
    process.wait()
    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)
    all_alleles = set()
    valid = True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(set(alleles.keys()))
        if len(alleles) > 2:
            valid = False
    if len(all_alleles) > 4:
        valid = False
    if not valid:
        print(curr_reference)
Example #3
def set_read1_consensus_to_read2(input_stream, output_stream):
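    # Pair consecutive records by query name; when read1 is mapped, copy its
    # reference onto read2 and mark read2 as mapped at position 1 with a
    # full-length match CIGAR, so both mates end up on read1's consensus.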
    
    #get the header
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    prev_read = Sam_record(line)
    for line in input_stream:
        read = Sam_record(line)
        if prev_read and read.get_query_name() == prev_read.get_query_name():
            if read.is_second_read() and prev_read.is_first_read():
                read1 = prev_read
                read2 = read
            else:
                read2 = prev_read
                read1 = read
            if not read1.is_unmapped():
                read2.set_reference_name(read1.get_reference_name())
                read2.set_unmapped_flag(False)
                read2.set_position(1)
                read2.set_cigar_string("%sM" % len(read2.get_query_sequence()))
            output_stream.write(str(read1))
            output_stream.write(str(read2))
            prev_read = None
        elif prev_read:
            output_stream.write(str(prev_read))
            prev_read = read
        else:
            prev_read = read
    # flush the final read if it never found its mate
    if prev_read:
        output_stream.write(str(prev_read))
Example #4
def load_from_sites_generator(stream):
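    # Generator that re-pairs a name-unsorted SAM stream: each read is held in
    # a dictionary keyed by query name until its mate arrives, then the pair
    # is yielded as (read1, read2). Progress is written to stderr every 10000
    # lines (requires `import sys`).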
    all_unmatched_read1 = {}
    all_unmatched_read2 = {}
    count_line = 0
    for line in stream:
        count_line += 1
        if count_line % 10000 == 0:
            sys.stderr.write('%s %s %s\n' % (count_line, len(all_unmatched_read1), len(all_unmatched_read2)))
        sam_record = Sam_record(line)
        if sam_record.is_first_read():
            sam_record_r1 = sam_record
            sam_record_r2 = all_unmatched_read2.pop(sam_record.get_query_name(), None)
            if not sam_record_r2:
                all_unmatched_read1[sam_record.get_query_name()] = sam_record
        else:
            sam_record_r2 = sam_record
            sam_record_r1 = all_unmatched_read1.pop(sam_record.get_query_name(), None)
            if not sam_record_r1:
                all_unmatched_read2[sam_record.get_query_name()] = sam_record

        if sam_record_r1 and sam_record_r2:
            yield (sam_record_r1, sam_record_r2)
Example #5
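    # Fragment of a duplicate-marking routine (see Example #10 for the fuller
    # version): reads are streamed out of the BAM with samtools, buffered per
    # reference, and each completed reference is handed to find_duplicates().
    # tmp, distance_threshold and the total_nb_*/nb_fragment accumulators are
    # defined earlier in the enclosing function.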
    command = "%s view -h %s" % (samtools_bin, bam_file)
    input_stream, process = utils_commands.get_output_stream_from_command(
        command)
    tmp_bam_file = output_bam_file = tmp + '_mrk_dup.bam.tmp'
    command = "%s view -bS - >%s" % (samtools_bin, tmp_bam_file)
    output_stream, process = utils_commands.get_input_stream_from_command(
        command)
    nb_reference = 0
    current_reference = None
    first_reads = {}
    second_reads = {}
    for line in input_stream:
        if line.startswith("@"):
            output_stream.write(line)
            continue
        sam_record = Sam_record(line)
        if sam_record.get_reference_name() != current_reference and current_reference is not None:
            # process this consensus
            if current_reference != '*':
                nb_dups, nb_uniq = find_duplicates(first_reads, second_reads, distance_threshold)
                total_nb_uniqs += nb_uniq
                total_nb_dups += nb_dups
                nb_fragment += len(second_reads)
            output_reads(output_stream, first_reads, second_reads)
            first_reads = {}
            second_reads = {}
        if sam_record.is_second_read():
            second_reads[sam_record.get_query_name()] = sam_record
        else:
            first_reads[sam_record.get_query_name()] = sam_record
Example #6
def set_read1_consensus_to_read1_and_read2(input_stream, output_stream):
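    # Same idea as set_read1_consensus_to_read2, but for templates carrying
    # three records with the same query name (an assigned read1, an unassigned
    # read1 and a read2): the mapping of the assigned read1 is copied onto the
    # other two, and only those two are written out. test_read_for_assignation()
    # is assumed to classify a record as 'first_assigned', 'first_unassigned'
    # or 'second'.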
    #get the header
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    n_1_read = Sam_record(line)
    line = input_stream.readline()
    n_2_read = Sam_record(line)
    # We need three reads in a row to assign one to the others
    
    for line in input_stream:
        read = Sam_record(line)
        
        if n_1_read and n_2_read and read.get_query_name() == n_1_read.get_query_name() and \
        read.get_query_name() == n_2_read.get_query_name():
            three_reads = {}
            three_reads[test_read_for_assignation(read)] = read
            three_reads[test_read_for_assignation(n_1_read)] = n_1_read
            three_reads[test_read_for_assignation(n_2_read)] = n_2_read
            #All 3 have been found and assigned
            if not three_reads['first_assigned'].is_unmapped():
                three_reads['first_unassigned'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['first_unassigned'].set_unmapped_flag(False)
                three_reads['first_unassigned'].set_position(three_reads['first_assigned'].get_position())
                three_reads['first_unassigned'].set_cigar_string("%sM" % len(three_reads['first_unassigned'].get_query_sequence()))
                
                three_reads['second'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['second'].set_unmapped_flag(False)
                three_reads['second'].set_position(three_reads['first_assigned'].get_position())
                three_reads['second'].set_cigar_string("%sM" % len(three_reads['second'].get_query_sequence()))
            output_stream.write(str(three_reads['first_unassigned']))
            output_stream.write(str(three_reads['second']))
            
            
            n_1_read = None
            n_2_read = None
        elif n_1_read and n_2_read:
            logging.warning('Missing pair for singleton %s: is this file sorted?' % n_2_read.get_query_name())
            output_stream.write(str(n_2_read))
            n_2_read = n_1_read
            n_1_read = read
        elif n_1_read:
            n_2_read = n_1_read
            n_1_read = read
        else:
            n_1_read = read
Example #7
# change_consensus_on_read2: copy read1's reference name onto read2 for every
# pair in a name-sorted BAM, writing the result through samtools sort.
command = "%s view -h %s " % (samtools_bin, input_bam_file)
logging.info(command)
input_stream, process_input = utils_commands.get_output_stream_from_command(command)
command = "%s view -bS - | %s sort - %s" % (samtools_bin, samtools_bin, output_bam_file)
logging.info(command)
output_stream, process_output = utils_commands.get_input_stream_from_command(command)

# get the header
line = input_stream.readline()
while line.startswith("@"):
    output_stream.write(line)
    line = input_stream.readline()

while line:
    read1 = Sam_record(line)
    line = input_stream.readline()
    read2 = Sam_record(line)
    if read1.get_query_name() == read2.get_query_name():
        if read1.is_second_read() and read2.is_first_read():
            read1, read2 = read2, read1
        read2.set_reference_name(read1.get_reference_name())
        output_stream.write(str(read1))
        output_stream.write(str(read2))
    else:
        logging.critical("bam file is not sorted by read name")
        input_stream.close()
        output_stream.close()
        #os.remove(output_bam_file+'.bam')
        break
    # advance to the first read of the next pair
    line = input_stream.readline()
Example #8
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
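    # -F 132 excludes unmapped reads (0x4) and second-in-pair reads (0x80), so
    # only mapped first reads are counted. The @RG header lines map each
    # read-group ID to a sample (falling back to library, then to the ID
    # itself); coverage and duplicate counts are then accumulated per sample
    # and flushed to all_contigs_info each time the contig changes.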
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0
        # process the first read (the line that broke out of the header loop)
        if line.startswith("@"):
            # still in the header: the file contains no reads
            return
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)] += 1
            all_sample_coverage[read_groups.get(rg_id)] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print(i)
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig is not None:
                for sample in read_groups.values():
                    all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                                all_sample_duplicate.get(sample), sample=sample)
                    all_sample_coverage[sample] = 0
                    all_sample_duplicate[sample] = 0
            current_contig = sam_record.get_reference_name()

            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1
        if current_contig is not None:
            for sample in read_groups.values():
                all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample), sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0
    finally:
        open_stream.close()
Example #10
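# Full version of the duplicate-marking fragment in Example #5: stream the BAM
# through samtools, buffer first/second reads per reference, and hand each
# completed reference to find_duplicates(). The total_nb_uniqs, total_nb_dups
# and nb_fragment accumulators are assumed to be initialised earlier in the
# enclosing scope.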
tmp, ext = os.path.splitext(bam_file)

command = "%s view -h %s" % (samtools_bin, bam_file)
input_stream, process = utils_commands.get_output_stream_from_command(command)
tmp_bam_file = output_bam_file = tmp + '_mrk_dup.bam.tmp'
command = "%s view -bS - >%s" % (samtools_bin, tmp_bam_file)
output_stream, process = utils_commands.get_input_stream_from_command(command)
nb_reference = 0
current_reference = None
first_reads = {}
second_reads = {}
for line in input_stream:
    if line.startswith("@"):
        output_stream.write(line)
        continue
    sam_record = Sam_record(line)
    if sam_record.get_reference_name() != current_reference and current_reference is not None:
        # process this consensus
        if current_reference != '*':
            nb_dups, nb_uniq = find_duplicates(first_reads, second_reads, distance_threshold)
            total_nb_uniqs += nb_uniq
            total_nb_dups += nb_dups
            nb_fragment += len(second_reads)
        output_reads(output_stream, first_reads, second_reads)
        first_reads = {}
        second_reads = {}
    if sam_record.is_second_read():
        second_reads[sam_record.get_query_name()] = sam_record
    else:
        first_reads[sam_record.get_query_name()] = sam_record
Example #11
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
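    # Per-locus variant of the read-group summary in Example #8: coverage and
    # duplicate counts are kept per locus in Counter objects, and the read
    # sequences seen at each locus in a defaultdict(Counter), so allele counts
    # can be reported alongside coverage. Assumes `from collections import
    # Counter, defaultdict` and a get_loci_from_read() helper from the
    # enclosing module.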
    command="%s view -h -F 132 %s"%(samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig=None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups={}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_coverage_reads = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = Counter()
            all_sample_duplicate[sample] = Counter()
            all_sample_coverage_reads[sample] = defaultdict(Counter)
        # process the first read
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            read_sequence = sam_record.get_query_sequence()
            loci = get_loci_from_read(sam_record)
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)][str(loci)] += 1
            all_sample_coverage[read_groups.get(rg_id)][str(loci)] += 1
            all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print(i)
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig is not None:
                for sample in read_groups.values():
                    for loci in all_sample_coverage.get(sample):
                        alleles = all_sample_coverage_reads[sample].get(loci)
                        all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                    all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                    sample=sample)

                    all_sample_coverage[sample] = Counter()
                    all_sample_duplicate[sample] = Counter()
                    all_sample_coverage_reads[sample] = defaultdict(Counter)
            current_contig = sam_record.get_reference_name()
            
            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                loci = get_loci_from_read(sam_record)
                read_sequence = sam_record.get_query_sequence()
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] += 1
        if current_contig is not None:
            for sample in read_groups.values():
                for loci in all_sample_coverage.get(sample):
                    alleles = all_sample_coverage_reads[sample].get(loci)
                    all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                sample=sample)
                all_sample_coverage[sample] = Counter()
                all_sample_duplicate[sample] = Counter()
                all_sample_coverage_reads[sample] = defaultdict(Counter)
    finally:
        open_stream.close()