def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference):
    """Count, per sample, the allele strings observed across the variant
    positions of one contig, then print the contig name if the allele
    combinations look invalid.

    vcf_record_in_one_contig -- dict keyed by 1-based variant position
                                (only the keys are used here).
    sample_names             -- iterable of sample names used to initialise
                                the per-sample count hash.
    curr_reference           -- contig name streamed from the module-level
                                ``bam_file``; printed when flagged invalid.
    """
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    # -F 1028: skip reads flagged unmapped (4) or duplicate (1024).
    command = "samtools view -F 1028 %s %s" % (bam_file, curr_reference)
    stream, process = utils_commands.get_output_stream_from_command(command)
    # Fix: iterate positions in sorted (genomic) order instead of arbitrary
    # dict order, so allele strings are deterministic and comparable.
    positions = sorted(vcf_record_in_one_contig.keys())
    for line in stream:
        sam_record = Sam_record(line)
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        # NOTE(review): sequence[position - 1] assumes the read spans the
        # contig from base 1 (consensus-style alignment) -- confirm.
        # A genotype-quality filter (>20) was previously considered here.
        allele = ''.join(sequence[position - 1] for position in positions)
        count_with_hash(sample_to_allele[sample], allele)
        count_with_hash(sample_to_allele['all'], allele)
    process.wait()
    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)
    # Flag the contig when any single sample shows more than 2 alleles or
    # more than 4 distinct alleles are seen overall (presumably a
    # ploidy-based sanity check -- confirm thresholds).
    all_alleles = set()
    valid = True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(set(alleles.keys()))
        if len(alleles) > 2:
            valid = False
    if len(all_alleles) > 4:
        valid = False
    if not valid:
        print(curr_reference)
def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference):
    """Tally the allele string seen by each read across the contig's variant
    positions, per sample, and print the contig name when the allele set
    fails validation (more than 2 alleles in one sample or more than 4
    overall)."""
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    # -F 1028 excludes unmapped and duplicate-flagged reads
    cmd = "samtools view -F 1028 %s %s" % (bam_file, curr_reference)
    stream, process = utils_commands.get_output_stream_from_command(cmd)
    for raw_line in stream:
        record = Sam_record(raw_line)
        seq = record.get_query_sequence()
        rg_sample = record.get_tag("RG")
        # one base per variant position (1-based) -> one allele string
        haplotype = ''.join(seq[pos - 1] for pos in vcf_record_in_one_contig.keys())
        count_with_hash(sample_to_allele[rg_sample], haplotype)
        count_with_hash(sample_to_allele['all'], haplotype)
    process.wait()
    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)
    all_alleles = set()
    valid = True
    for rg_sample in sample_to_allele.keys():
        haplotype_counts = sample_to_allele.get(rg_sample)
        all_alleles.update(haplotype_counts.keys())
        if len(haplotype_counts) > 2:
            valid = False
    if len(all_alleles) > 4:
        valid = False
    if not valid:
        print(curr_reference)
def set_read1_consensus_to_read2(input_stream, output_stream):
    """Copy the SAM header, then walk name-adjacent records pairwise: when a
    read1/read2 pair is found and read1 is mapped, re-anchor read2 onto
    read1's reference (position 1, full-length match CIGAR) before writing
    both out.  Records with no adjacent mate are written through unchanged.

    input_stream  -- text stream of SAM lines, header first.
    output_stream -- stream receiving the rewritten SAM lines.
    """
    # copy the header through
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    # Fix: guard against a header-only file (empty line at EOF).
    prev_read = Sam_record(line) if line else None
    for line in input_stream:
        read = Sam_record(line)
        if prev_read and read.get_query_name() == prev_read.get_query_name():
            # orient the pair so read1/read2 match their first/second flags
            if read.is_second_read() and prev_read.is_first_read():
                read1 = prev_read
                read2 = read
            else:
                read2 = prev_read
                read1 = read
            if not read1.is_unmapped():
                # re-anchor read2 on read1's reference at position 1 with a
                # full-length match CIGAR
                read2.set_reference_name(read1.get_reference_name())
                read2.set_unmapped_flag(False)
                read2.set_position(1)
                read2.set_cigar_string("%sM" % len(read2.get_query_sequence()))
            output_stream.write(str(read1))
            output_stream.write(str(read2))
            prev_read = None
        elif prev_read:
            # previous record had no mate: pass it through unchanged
            output_stream.write(str(prev_read))
            prev_read = read
        else:
            prev_read = read
    # Fix: flush the trailing record -- the original silently dropped a final
    # unpaired read at EOF even though mid-file singletons were written out.
    if prev_read:
        output_stream.write(str(prev_read))
def load_from_sites_generator(stream):
    """Pair SAM records by query name, yielding (read1, read2) tuples as soon
    as both mates of a pair have been seen.  Unmatched records are held in
    memory until their mate arrives; progress is logged to stderr every
    10000 lines."""
    pending_r1 = {}
    pending_r2 = {}
    n_lines = 0
    for raw_line in stream:
        n_lines += 1
        if n_lines % 10000 == 0:
            sys.stderr.write('%s %s %s\n' % (n_lines, len(pending_r1), len(pending_r2)))
        record = Sam_record(raw_line)
        name = record.get_query_name()
        if record.is_first_read():
            r1, r2 = record, pending_r2.pop(name, None)
            if not r2:
                pending_r1[name] = record
        else:
            r1, r2 = pending_r1.pop(name, None), record
            if not r1:
                pending_r2[name] = record
        if r1 and r2:
            yield (r1, r2)
# Duplicate-marking driver fragment: re-stream the BAM as SAM, bucket each
# reference's records into first/second reads, and hand them to
# find_duplicates/output_reads one reference at a time.
# NOTE(review): this chunk appears truncated -- the branch storing first
# reads, the update of current_reference, and the final-reference flush are
# not visible, and total_nb_uniqs/total_nb_dups/nb_fragment/
# distance_threshold/tmp are defined elsewhere.
command = "%s view -h %s" % (samtools_bin, bam_file)
input_stream, process = utils_commands.get_output_stream_from_command(
    command)
# intermediate BAM that will receive the processed records
tmp_bam_file = output_bam_file = tmp + '_mrk_dup.bam.tmp'
command = "%s view -bS - >%s" % (samtools_bin, tmp_bam_file)
output_stream, process = utils_commands.get_input_stream_from_command(
    command)
nb_reference = 0
current_reference = None
first_reads = {}   # query name -> first-in-pair record for current reference
second_reads = {}  # query name -> second-in-pair record for current reference
for line in input_stream:
    if line.startswith("@"):
        # header lines pass straight through
        output_stream.write(line)
        continue
    sam_record = Sam_record(line)
    if sam_record.get_reference_name(
    ) != current_reference and not current_reference is None:
        #process this consensus
        if current_reference != '*':
            # no duplicate detection is run for the '*' (unmapped) bucket
            nb_dups, nb_uniq = find_duplicates(first_reads, second_reads,
                                               distance_threshold)
            total_nb_uniqs += nb_uniq
            total_nb_dups += nb_dups
            nb_fragment += len(second_reads)
            output_reads(output_stream, first_reads, second_reads)
        first_reads = {}
        second_reads = {}
    if sam_record.is_second_read():
        second_reads[sam_record.get_query_name()] = sam_record
def set_read1_consensus_to_read1_and_read2(input_stream, output_stream):
    # Walk a name-sorted SAM stream with a 3-record sliding window looking
    # for triplets sharing one query name; test_read_for_assignation
    # (defined elsewhere) labels them 'first_assigned', 'first_unassigned'
    # and 'second'.  When the assigned first read is mapped, its reference
    # and position are copied onto the other two, which are then written out
    # (the 'first_assigned' record itself is not emitted).
    # NOTE(review): the one or two records still held in n_1_read/n_2_read
    # when the stream ends are never flushed -- confirm this is intentional.
    #get the header
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    n_1_read = Sam_record(line)
    line = input_stream.readline()
    n_2_read = Sam_record(line)
    #We need three read in a row to assign one to the others
    for line in input_stream:
        read = Sam_record(line)
        if n_1_read and n_2_read and read.get_query_name() == n_1_read.get_query_name() and \
            read.get_query_name() == n_2_read.get_query_name():
            three_reads = {}
            three_reads[test_read_for_assignation(read)] = read
            three_reads[test_read_for_assignation(n_1_read)] = n_1_read
            three_reads[test_read_for_assignation(n_2_read)] = n_2_read
            #All 3 have been found and assigned
            if not three_reads['first_assigned'].is_unmapped():
                # re-anchor the unassigned first read and the second read on
                # the assigned read's reference, with full-length match CIGARs
                three_reads['first_unassigned'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['first_unassigned'].set_unmapped_flag(False)
                three_reads['first_unassigned'].set_position(three_reads['first_assigned'].get_position())
                three_reads['first_unassigned'].set_cigar_string("%sM" % len(three_reads['first_unassigned'].get_query_sequence()))
                three_reads['second'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['second'].set_unmapped_flag(False)
                three_reads['second'].set_position(three_reads['first_assigned'].get_position())
                three_reads['second'].set_cigar_string("%sM" % len(three_reads['second'].get_query_sequence()))
            output_stream.write(str(three_reads['first_unassigned']))
            output_stream.write(str(three_reads['second']))
            n_1_read = None
            n_2_read = None
        elif n_1_read and n_2_read:
            # oldest record in the window has no triplet: emit it and slide
            logging.warning('Missing pair for singleton %s: is this file sorted.' % (n_2_read.get_query_name()))
            output_stream.write(str(n_2_read))
            n_2_read = n_1_read
            n_1_read = read
        elif n_1_read:
            # refill the window after a triplet was consumed
            n_2_read = n_1_read
            n_1_read = read
        else:
            n_1_read = read
#change_consensus_on_read2
# Stream a name-sorted BAM, copy read1's reference name onto read2 for each
# pair, and pipe the result through "samtools sort" into output_bam_file.
command = "%s view -h %s " % (samtools_bin, input_bam_file)
logging.info(command)
input_stream, process_input = utils_commands.get_output_stream_from_command(command)
command = "%s view -bS - | %s sort - %s" % (samtools_bin, samtools_bin, output_bam_file)
logging.info(command)
output_stream, process_output = utils_commands.get_input_stream_from_command(command)
# copy the header straight through
line = input_stream.readline()
while line.startswith("@"):
    output_stream.write(line)
    line = input_stream.readline()
# consume the records two at a time (mates must be adjacent)
while line:
    read1 = Sam_record(line)
    line = input_stream.readline()
    if not line:
        # Fix: guard against an odd number of records -- the original fed the
        # empty EOF string to Sam_record.
        logging.critical("no mate found for %s: truncated or unpaired input" % read1.get_query_name())
        break
    read2 = Sam_record(line)
    if read1.get_query_name() == read2.get_query_name():
        # make read1 the first-in-pair record
        if read1.is_second_read() and read2.is_first_read():
            tmp = read1
            read1 = read2
            read2 = tmp
        # put read2 on the same reference as read1
        read2.set_reference_name(read1.get_reference_name())
        output_stream.write(str(read1))
        output_stream.write(str(read2))
    else:
        logging.critical("bam file is not sorted by read name")
    # Fix: advance to the next record -- as written the loop never moved past
    # a processed pair, so it could not terminate.
    line = input_stream.readline()
input_stream.close()
output_stream.close()
#os.remove(output_bam_file+'.bam')
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    """Stream the reads kept by ``samtools view -F 132`` and accumulate, per
    read-group sample, the number of reads and of duplicate-flagged reads for
    each contig, flushing each contig's totals into ``all_contigs_info``.

    bam_file         -- path of the BAM file to scan.
    all_contigs_info -- accumulator exposing add_values(contig, coverage,
                        duplicates, sample=...) (project type).
    samtools_bin     -- path to the samtools executable.
    """
    # -F 132 filters out reads flagged unmapped (4) or second-in-pair (128).
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}  # RG ID -> sample name (SM, else LB, else the ID itself)
    try:
        # Fix: keep `line` defined even when samtools produces no output.
        line = None
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    # header fields look like ID:xxx / SM:xxx / LB:xxx
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0
        # Fix: reinstate the header-only guard the original left commented
        # out -- without it a header line (or None) reaches Sam_record.
        if line is None or line.startswith("@"):
            return
        # process the first read
        # NOTE(review): a read whose RG is absent from the header raises
        # KeyError (read_groups.get returns None) -- confirm inputs always
        # declare their read groups.
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)] += 1
            all_sample_coverage[read_groups.get(rg_id)] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print(i)  # progress marker (parenthesized: valid py2 and py3)
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig is not None:
                # contig boundary: flush and reset every sample's counters
                for sample in read_groups.values():
                    all_contigs_info.add_values(current_contig,
                                                all_sample_coverage.get(sample),
                                                all_sample_duplicate.get(sample),
                                                sample=sample)
                    all_sample_coverage[sample] = 0
                    all_sample_duplicate[sample] = 0
            current_contig = sam_record.get_reference_name()
            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1
        # flush the final contig
        if current_contig is not None:
            for sample in read_groups.values():
                all_contigs_info.add_values(current_contig,
                                            all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample),
                                            sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0
    finally:
        open_stream.close()
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    # Stream the reads kept by "samtools view -F 132" (4=unmapped and
    # 128=second-in-pair filtered out) and count, per read-group sample,
    # total and duplicate-flagged reads for each contig, flushing into
    # all_contigs_info at every contig change.
    # NOTE(review): this module defines this function more than once; only
    # the last definition survives at import time -- consider removing the
    # duplicates.
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}  # RG ID -> sample name (SM, else LB, else the ID itself)
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                # header fields look like ID:xxx / SM:xxx / LB:xxx
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0
        # process the first read
        # if line.startswith("@"):
        #     #Still in the header. There's no read, exit
        #     return
        # NOTE(review): with a header-only file `line` is still a header line
        # here (the commented-out guard above was meant to catch that).
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)] += 1
            all_sample_coverage[read_groups.get(rg_id)] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print i
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name(
            ) and current_contig != None:
                # contig boundary: flush and reset every sample's counters
                for sample in read_groups.values():
                    all_contigs_info.add_values(
                        current_contig, all_sample_coverage.get(sample),
                        all_sample_duplicate.get(sample), sample=sample)
                    all_sample_coverage[sample] = 0
                    all_sample_duplicate[sample] = 0
            current_contig = sam_record.get_reference_name()
            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1
        # flush the final contig
        if current_contig != None:
            for sample in read_groups.values():
                all_contigs_info.add_values(current_contig,
                                            all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample),
                                            sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0
    finally:
        open_stream.close()
# Duplicate-marking driver fragment: re-stream the BAM as SAM, bucket each
# reference's records into first/second reads, and run
# find_duplicates/output_reads once per reference.
# NOTE(review): fragment appears truncated -- total_nb_uniqs, total_nb_dups,
# nb_fragment and distance_threshold are defined elsewhere,
# current_reference is never updated in the visible loop, and the final
# reference is not flushed here.
tmp, ext = os.path.splitext(bam_file)
command = "%s view -h %s" % (samtools_bin, bam_file)
input_stream, process = utils_commands.get_output_stream_from_command(command)
# intermediate BAM that will receive the duplicate-marked records
tmp_bam_file = output_bam_file = tmp + '_mrk_dup.bam.tmp'
command = "%s view -bS - >%s" % (samtools_bin, tmp_bam_file)
output_stream, process = utils_commands.get_input_stream_from_command(command)
nb_reference = 0
current_reference = None
first_reads = {}   # query name -> first-in-pair record for current reference
second_reads = {}  # query name -> second-in-pair record for current reference
for line in input_stream:
    if line.startswith("@"):
        # header lines pass straight through
        output_stream.write(line)
        continue
    sam_record = Sam_record(line)
    if sam_record.get_reference_name() != current_reference and not current_reference is None:
        #process this consensus
        if current_reference != '*':
            # no duplicate detection is run for the '*' (unmapped) bucket
            nb_dups, nb_uniq = find_duplicates(first_reads, second_reads, distance_threshold)
            total_nb_uniqs += nb_uniq
            total_nb_dups += nb_dups
            nb_fragment += len(second_reads)
            output_reads(output_stream, first_reads, second_reads)
        first_reads = {}
        second_reads = {}
    if sam_record.is_second_read():
        second_reads[sam_record.get_query_name()] = sam_record
    else:
        first_reads[sam_record.get_query_name()] = sam_record
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    # Per-locus variant of the read-group coverage scan: for each contig it
    # counts, per sample and per locus (get_loci_from_read, defined
    # elsewhere), total reads, duplicate-flagged reads and the distinct read
    # sequences observed, then flushes everything into all_contigs_info at
    # each contig change.
    # NOTE(review): this add_values call carries extra arguments (loci,
    # alleles=...) compared with the other definitions of this function in
    # this module -- only the last definition survives at import time.
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}  # RG ID -> sample name (SM, else LB, else the ID itself)
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                # header fields look like ID:xxx / SM:xxx / LB:xxx
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}        # sample -> Counter(locus -> reads)
        all_sample_coverage_reads = {}  # sample -> locus -> Counter(sequence -> reads)
        all_sample_duplicate = {}       # sample -> Counter(locus -> duplicates)
        for sample in read_groups.values():
            all_sample_coverage[sample] = Counter()
            all_sample_duplicate[sample] = Counter()
            all_sample_coverage_reads[sample] = defaultdict(Counter)
        #process the first read
        # NOTE(review): with a header-only file `line` is still a header line
        # here and gets fed to Sam_record.
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            read_sequence = sam_record.get_query_sequence()
            loci = get_loci_from_read(sam_record)
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)][str(loci)] += 1
            all_sample_coverage[read_groups.get(rg_id)][str(loci)] += 1
            all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] += 1
        i = 1
        #process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print i
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig != None:
                # contig boundary: flush each sample's per-locus counters
                for sample in read_groups.values():
                    for loci in all_sample_coverage.get(sample):
                        alleles = all_sample_coverage_reads[sample].get(loci)
                        all_contigs_info.add_values(current_contig, loci,
                                                    all_sample_coverage.get(sample).get(loci, 0),
                                                    all_sample_duplicate.get(sample).get(loci, 0),
                                                    alleles=alleles, sample=sample)
                    all_sample_coverage[sample] = Counter()
                    all_sample_duplicate[sample] = Counter()
                    all_sample_coverage_reads[sample] = defaultdict(Counter)
            current_contig = sam_record.get_reference_name()
            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                loci = get_loci_from_read(sam_record)
                read_sequence = sam_record.get_query_sequence()
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] += 1
        # flush the final contig
        if current_contig != None:
            for sample in read_groups.values():
                for loci in all_sample_coverage.get(sample):
                    alleles = all_sample_coverage_reads[sample].get(loci)
                    all_contigs_info.add_values(current_contig, loci,
                                                all_sample_coverage.get(sample).get(loci, 0),
                                                all_sample_duplicate.get(sample).get(loci, 0),
                                                alleles=alleles, sample=sample)
                all_sample_coverage[sample] = Counter()
                all_sample_duplicate[sample] = Counter()
                all_sample_coverage_reads[sample] = defaultdict(Counter)
    finally:
        open_stream.close()