def set_read1_consensus_to_read2(input_stream, output_stream):
    # Copy the SAM header through unchanged
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    prev_read = Sam_record(line)
    for line in input_stream:
        read = Sam_record(line)
        if prev_read and read.get_query_name() == prev_read.get_query_name():
            # Orient the pair so read1 is always the first read
            if read.is_second_read() and prev_read.is_first_read():
                read1 = prev_read
                read2 = read
            else:
                read1 = read
                read2 = prev_read
            if not read1.is_unmapped():
                # Place read2 on read1's consensus reference
                read2.set_reference_name(read1.get_reference_name())
                read2.set_unmapped_flag(False)
                read2.set_position(1)
                read2.set_cigar_string("%sM" % len(read2.get_query_sequence()))
            output_stream.write(str(read1))
            output_stream.write(str(read2))
            prev_read = None
        elif prev_read:
            # prev_read has no mate in the stream: emit it as-is
            output_stream.write(str(prev_read))
            prev_read = read
        else:
            prev_read = read
    if prev_read:
        # Flush a trailing singleton that the loop left buffered
        output_stream.write(str(prev_read))
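A minimal usage sketch for the function above; the input file name is hypothetical, and Sam_record is assumed to come from the project's own SAM utilities:

import sys

# Hypothetical driver: stream a name-sorted SAM file to stdout with read2
# placed on read1's consensus. 'name_sorted.sam' is an assumed file name.
with open('name_sorted.sam') as input_stream:
    set_read1_consensus_to_read2(input_stream, sys.stdout)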
def set_read1_consensus_to_read1_and_read2(input_stream, output_stream):
    # Copy the SAM header through unchanged
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    n_1_read = Sam_record(line)
    line = input_stream.readline()
    n_2_read = Sam_record(line)
    # We need three reads in a row to assign one to the others
    for line in input_stream:
        read = Sam_record(line)
        if n_1_read and n_2_read and read.get_query_name() == n_1_read.get_query_name() and \
           read.get_query_name() == n_2_read.get_query_name():
            # Bucket the three records that share this query name
            three_reads = {}
            three_reads[test_read_for_assignation(read)] = read
            three_reads[test_read_for_assignation(n_1_read)] = n_1_read
            three_reads[test_read_for_assignation(n_2_read)] = n_2_read
            # All three have been found and assigned
            if not three_reads['first_assigned'].is_unmapped():
                # Copy the assigned read's placement to the unassigned first read
                three_reads['first_unassigned'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['first_unassigned'].set_unmapped_flag(False)
                three_reads['first_unassigned'].set_position(three_reads['first_assigned'].get_position())
                three_reads['first_unassigned'].set_cigar_string("%sM" % len(three_reads['first_unassigned'].get_query_sequence()))
                # ... and to the second read
                three_reads['second'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['second'].set_unmapped_flag(False)
                three_reads['second'].set_position(three_reads['first_assigned'].get_position())
                three_reads['second'].set_cigar_string("%sM" % len(three_reads['second'].get_query_sequence()))
            # Only the updated pair is written; the assigned copy is dropped
            output_stream.write(str(three_reads['first_unassigned']))
            output_stream.write(str(three_reads['second']))
            n_1_read = None
            n_2_read = None
        elif n_1_read and n_2_read:
            logging.warning('Missing pair for singleton %s: is this file sorted?' % (n_2_read.get_query_name()))
            output_stream.write(str(n_2_read))
            n_2_read = n_1_read
            n_1_read = read
        elif n_1_read:
            n_2_read = n_1_read
            n_1_read = read
        else:
            n_1_read = read
    # Flush any reads still buffered when the stream ends
    if n_2_read:
        output_stream.write(str(n_2_read))
    if n_1_read:
        output_stream.write(str(n_1_read))
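test_read_for_assignation is called above but not shown in this example. A hypothetical sketch of one plausible bucketing, based purely on the three dictionary keys the caller expects ('first_assigned', 'first_unassigned', 'second'):

def test_read_for_assignation(read):
    # Hypothetical bucketing: the caller only requires that the three records
    # sharing a query name map to these three distinct keys. How the original
    # distinguishes the two first reads is not shown; using mapped status for
    # that is an assumption.
    if read.is_second_read():
        return 'second'
    if read.is_unmapped():
        return 'first_unassigned'
    return 'first_assigned'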
Example No. 3
import sys

def load_from_sites_generator(stream):
    all_unmatched_read1 = {}
    all_unmatched_read2 = {}
    count_line = 0
    for line in stream:
        count_line += 1
        if count_line % 10000 == 0:
            # Progress: lines seen and reads still waiting for their mate
            sys.stderr.write('%s %s %s\n' % (count_line, len(all_unmatched_read1), len(all_unmatched_read2)))
        sam_record = Sam_record(line)
        if sam_record.is_first_read():
            sam_record_r1 = sam_record
            sam_record_r2 = all_unmatched_read2.pop(sam_record.get_query_name(), None)
            if not sam_record_r2:
                all_unmatched_read1[sam_record.get_query_name()] = sam_record
        else:
            sam_record_r2 = sam_record
            sam_record_r1 = all_unmatched_read1.pop(sam_record.get_query_name(), None)
            if not sam_record_r1:
                all_unmatched_read2[sam_record.get_query_name()] = sam_record

        if sam_record_r1 and sam_record_r2:
            yield (sam_record_r1, sam_record_r2)
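A usage sketch for the generator; it assumes header lines have already been stripped from the stream, since every line is parsed as a record:

import sys

# Pair up records from a SAM body streamed on stdin (header assumed removed,
# e.g. produced by 'samtools view' without -h)
for read1, read2 in load_from_sites_generator(sys.stdin):
    print(read1.get_query_name())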
Example No. 4
        sam_record = Sam_record(line)
        if sam_record.get_reference_name() != current_reference and current_reference is not None:
            # Process the consensus we just finished reading
            if current_reference != '*':
                nb_dups, nb_uniq = find_duplicates(first_reads, second_reads,
                                                   distance_threshold)
                total_nb_uniqs += nb_uniq
                total_nb_dups += nb_dups
                nb_fragment += len(second_reads)
            output_reads(output_stream, first_reads, second_reads)
            first_reads = {}
            second_reads = {}
        if sam_record.is_second_read():
            second_reads[sam_record.get_query_name()] = sam_record
        else:
            first_reads[sam_record.get_query_name()] = sam_record
        nb_reference += 1
        if nb_reference % 1000 == 0:
            print("process %s consensus" % nb_reference)
        current_reference = sam_record.get_reference_name()
    # Flush the final consensus: the in-loop check never fires for it because
    # current_reference was just updated to the last record's reference
    if current_reference is not None:
        if current_reference != '*':
            nb_dups, nb_uniq = find_duplicates(first_reads, second_reads,
                                               distance_threshold)
            total_nb_uniqs += nb_uniq
            total_nb_dups += nb_dups
            nb_fragment += len(second_reads)
        output_reads(output_stream, first_reads, second_reads)
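output_reads and find_duplicates are defined elsewhere in the original module. A hypothetical sketch of what output_reads plausibly does, emitting every buffered record for the finished consensus:

def output_reads(output_stream, first_reads, second_reads):
    # Hypothetical: write all buffered first reads, then all second reads.
    # The real helper may interleave mates; only the call site is shown above.
    for read1 in first_reads.values():
        output_stream.write(str(read1))
    for read2 in second_reads.values():
        output_stream.write(str(read2))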
    # 'command' is defined earlier in the original function (not shown); it
    # produces the name-sorted SAM stream read below
    input_stream, process_input = utils_commands.get_output_stream_from_command(command)
    command = "%s view -bS - | %s sort - %s" % (samtools_bin, samtools_bin, output_bam_file)
    logging.info(command)
    output_stream, process_output = utils_commands.get_input_stream_from_command(command)

    # Copy the SAM header through unchanged
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()

    # Records are expected in name-sorted order, so mates are adjacent
    while line:
        read1 = Sam_record(line)
        line = input_stream.readline()
        read2 = Sam_record(line)
        if read1.get_query_name() == read2.get_query_name():
            if read1.is_second_read() and read2.is_first_read():
                read1, read2 = read2, read1
            read2.set_reference_name(read1.get_reference_name())
            output_stream.write(str(read1))
            output_stream.write(str(read2))
        else:
            logging.critical("bam file is not sorted by read name")
            input_stream.close()
            output_stream.close()
            #os.remove(output_bam_file+'.bam')
            return
        line = input_stream.readline()
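The utils_commands helpers used above are not shown. A minimal sketch of what they plausibly wrap with subprocess, as they might appear inside that module; the return shape (stream first, process second) follows how the fragment unpacks them:

import subprocess

def get_output_stream_from_command(command):
    # A stream we read from: the command's stdout
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    return process.stdout, process

def get_input_stream_from_command(command):
    # A stream we write to: the command's stdin
    process = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE)
    return process.stdin, process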
     
Example No. 6
        output_stream.write(line)
        continue
    sam_record = Sam_record(line)
    if sam_record.get_reference_name() != current_reference and current_reference is not None:
        # Process the consensus we just finished reading
        if current_reference != '*':
            nb_dups, nb_uniq = find_duplicates(first_reads, second_reads, distance_threshold)
            total_nb_uniqs += nb_uniq
            total_nb_dups += nb_dups
            nb_fragment += len(second_reads)
        output_reads(output_stream, first_reads, second_reads)
        first_reads = {}
        second_reads = {}
    if sam_record.is_second_read():
        second_reads[sam_record.get_query_name()] = sam_record
    else:
        first_reads[sam_record.get_query_name()] = sam_record
    nb_reference += 1
    if nb_reference % 1000 == 0:
        print("process %s consensus" % nb_reference)
    current_reference = sam_record.get_reference_name()
# Flush the final consensus: the in-loop check never fires for it because
# current_reference was just updated to the last record's reference
if current_reference is not None:
    if current_reference != '*':
        nb_dups, nb_uniq = find_duplicates(first_reads, second_reads, distance_threshold)
        total_nb_uniqs += nb_uniq
        total_nb_dups += nb_dups
        nb_fragment += len(second_reads)
    output_reads(output_stream, first_reads, second_reads)
library_size = estimate_library_size(nb_fragment, total_nb_uniqs)
print("%s fragments" % nb_fragment)