def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ):
    """
    Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference sequence,
    convert the positions on the subsegments to positions on the reads.  Also (optionally) add quality values.
    
    The input file is in SAM format, as shown below.  Each line represents the alignment of a part of a read
    to a reference sequence.  Read pairs are indicated by suffixes in their names.  Normally, the suffixes _L
    and _R indicate the left and right mates of reads (this can be overridden with the --left and --right
    options).  Reads that were not mates have no suffix.
    
        (SAM header lines omitted)
        F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M          * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT           *
        F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S        * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat     *
        F2YP0BU02HREML_R 0  chr21 16386077 255 33M5S        * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc             *
        F2YP0BU02IOF1F_L 0  chr21 17567321 255 7S28M        * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC                *
        F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S   * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt  *
        F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M        * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA             *
        F2YP0BU02JIMJ4_R 0  chr21 22383051 255 19M          * 0 0 CCCTTTATCATTTTTTATT                                *
        F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M     * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT                   *
        F2YP0BU02IODR5_L 0  chr21 30935325 255 37M          * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA              *
        F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M      * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG                      *
        F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M          * 0 0 CACACCTGTAACCCCAGCACTTT                            *
        F2YP0BU02HKC61_R 0  chr21 31678718 255 40M          * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           *
        F2YP0BU02HKC61_R 0  chr21 31678718 255 40M          * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           *
        F2YP0BU02HVA88   16 chr21 31703558 255 1M1D35M8S    * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat       *
        F2YP0BU02JDCF1_L 0  chr21 31816600 255 38M          * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT             *
        F2YP0BU02GZ1GO_R 0  chr21 33360122 255 6S38M        * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC       *
        F2YP0BU02FX387_L 16 chr22 14786201 255 26M          * 0 0 TGGATGAAGCTGGAAACCATCATTCT                         *
        F2YP0BU02IF2NE_R 0  chr22 16960842 255 40M10S       * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc *
        F2YP0BU02F4TVA   0  chr22 19200522 255 49M          * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA  *
        F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M        * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG           *
        F2YP0BU02FS4EM_R 0  chr22 30159364 255 29M          * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG                      *
        F2YP0BU02G197P_L 0  chr22 32044496 255 40M10S       * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc *
        F2YP0BU02FIING   16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG         *
        F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M     * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA              *
    
    The user must provide a mapping file (which might better be called an unmapping file).  This file is usually
    created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single
    read (this means the left and right mates were part of a single read).  The mapping file contains four columns.
    The first two give the mate's name (including the suffix) and the read name.  The last two columns describe how
    much of the full original sequence is missing from the mate.  For example, in the read below, the left mate is
    missing 71 on the right (42 for the linker and 29 for the right half).  The right mate is missing 339 on the left.
    
        left half:  TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA
                    AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC
                    AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA
                    CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC
                    ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG
                    ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG
        linker:     CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA
        right half: CTCCTGCCTCAGCCTCCCGAGTAGTTGGG
    
        mate_name        read_name      offset_to_start offset_from_end
        F2YP0BU02FS4EM_L F2YP0BU02FS4EM         0              71
        F2YP0BU02FS4EM_R F2YP0BU02FS4EM       339               0
    
    The user can also specify a quality scores file, which should look something like this.  Quality values are presumed
    to be PHRED scores, written in space-delimited decimal.
    
        >F2YP0BU02FS4EM
        38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33
        32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35
        31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34
        34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33
        36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36
        39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40
        40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26
        26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39
        40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40
        39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35
        >F2YP0BU02G197P
        40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40
        40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40
        40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40
        40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40
        40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40
        40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40
        40 40 40 39 39 39 40 40
        >F2YP0BU02FIING
        32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19
        19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25
    
    The output file is also SAM:
    
        (SAM header lines omitted)
        F2YP0BU02G7LK5 81  chr21 15557360 255 40M303H        * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT           D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII
        F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S      * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat     AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@
        F2YP0BU02HREML 65  chr21 16386077 255 320H33M5S      * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc             HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH
        F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H      * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC                4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED
        F2YP0BU02IKX84 81  chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt  ;;;[email protected]?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII
        F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M      * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA             IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII
        F2YP0BU02JIMJ4 65  chr21 22383051 255 208H19M        * 0 0 CCCTTTATCATTTTTTATT                                555544E?GE113344I22
        F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M   * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT                   IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII
        F2YP0BU02IODR5 129 chr21 30935325 255 37M154H        * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA              6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@
        F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M    * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG                      BB1552222<<>9==8;;?AA=??A???A
        F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M        * 0 0 CACACCTGTAACCCCAGCACTTT                            IIIIIIIIIIICCCCIIIIIHHH
        F2YP0BU02HKC61 65  chr21 31678718 255 300H40M        * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
        F2YP0BU02HKC61 65  chr21 31678718 255 300H40M        * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
        F2YP0BU02HVA88 16  chr21 31703558 255 1M1D35M8S      * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat       >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF
        F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H        * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT             IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII
        F2YP0BU02GZ1GO 65  chr21 33360122 255 76H6S38M       * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC       BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51
        F2YP0BU02FX387 145 chr22 14786201 255 201H26M        * 0 0 TGGATGAAGCTGGAAACCATCATTCT                         IIHHHHHHHHHHHHHFFFFFFFFFFF
        F2YP0BU02IF2NE 65  chr22 16960842 255 209H40M10S     * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7-
        F2YP0BU02F4TVA 0   chr22 19200522 255 49M            * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA  FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5
        F2YP0BU02HKC61 81  chr22 29516998 255 8S32M300H      * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG           2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA
        F2YP0BU02FS4EM 65  chr22 30159364 255 339H29M        * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG                      IIIIHHEIIIIHHHH??=DDHIIIIIDDD
        F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H     * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII
        F2YP0BU02FIING 16  chr22 45959944 255 3M1I11M1I26M   * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG         :::9:32267=:114244/...446==<<<?@?:9::::AAA
        F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M   * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA              IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII
    
    """
    left_suffix       = "_L"
    right_suffix      = "_R"
    # Read the mapping
    mate_to_read_dict = {}
    i = 0
    for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ):
        line = line.strip()
        if not line.startswith( "#" ):
            fields = line.split()
            if len( fields ) != 4:
                skip_line( "num_fields", i+1, line )
                continue
            mate_name, read_name, s_offset, e_offset = fields
            if mate_name in mate_to_read_dict:
                skip_line( 'two_mate_names', i+1, mate_name )
                continue
            mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) )
    # Read sequence data
    read_to_nucs_dict = {}
    seqs = 0
    fasta_reader = FastaReader( file( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        seq_text_upper = seq.text.upper()
        if seq.name in read_to_nucs_dict:
            if seq_text_upper != read_to_nucs_dict[ seq.name ]:
                skip_line( 'inconsistent_reads', seqs, seq.name )
                continue
        read_to_nucs_dict[ seq.name ] = seq_text_upper
    # Read quality data
    def quality_sequences( f ):
        seq_name  = None
        seq_quals = None
        line_number = 0
        for line in f:
            line_number += 1
            line = line.strip()
            if line.startswith( ">" ):
                if seq_name != None:
                    yield ( seq_name, seq_quals, seq_line )
                seq_name  = sequence_name( line )
                seq_line  = line_number
                seq_quals = []
            elif seq_name is None:
                skip_line( 'no_header', line_number, line )
                continue
            else:
                seq_quals += [ int( q ) for q in line.split() ]
        if seq_name is not None:
            yield ( seq_name, seq_quals, seq_line )
    def sequence_name( s ):
        s = s[ 1: ].strip()
        if not s:
            return ""
        else:
            return s.split()[ 0 ]
    read_to_quals_dict = {}
    # TODO: should we use Dan's fastaNamedReader here?
    for seq_name, quals, line_number in quality_sequences( file( input4 ) ):
        quals = samify_phred_scores( quals )
        if seq_name in read_to_quals_dict:
            if quals != read_to_quals_dict[ seq_name ]:
                skip_line( 'inconsistent_reads', line_number, seq_name )
            continue
        if len( quals ) != len( read_to_nucs_dict[ seq_name ] ):
            skip_line( 'inconsistent_read_lengths', line_number, seq_name )
            continue
        read_to_quals_dict[ seq_name ] = quals
    # process the SAM file
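    # Concatenate the per-chromosome alignments, drop any SAM header lines, and sort by
    # query name (column 1) so that the _L and _R records for a read appear together.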
    tmp_align_file_names = ' '.join( tmp_align_file_name_list )
    combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' )
    command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name )
    run_command( command )
    fout = file( output, 'w+b' )
    has_non_header = False
    i = 0
    for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ):
        line = line.strip()
        if line.startswith( "@" ):
            if has_non_header:
                skip_line( 'sam_headers', i+1, line )
                continue
            fout.write( "%s\n" % line )
            continue
        has_non_header = True
        fields = line.split()
        num_fields = len( fields )
        if num_fields < SAM_MIN_COLUMNS:
            skip_line( 'sam_min_columns', i+1, line )
            continue
        # Set flags for mates
        try:
            flag = int( fields[ SAM_FLAG_COLUMN ] )
        except ValueError:
            skip_line( 'sam_flag', i+1, line )
            continue
        if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ):
            skip_line( 'reads_paired', i+1, line )
            continue
        mate_name = fields[ SAM_QNAME_COLUMN ]
        unmap_it = False
        half = None
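        # BAM_FPAIRED, BAM_FREAD1 and BAM_FREAD2 are assumed to hold the standard SAM flag
        # bits 0x1, 0x40 and 0x80, so an _R mate with flag 0 becomes 65 and an _L mate with
        # flag 16 becomes 145, as in the docstring's example output.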
        if mate_name.endswith( left_suffix ):
            flag += BAM_FPAIRED + BAM_FREAD2
            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
            unmap_it = True
            half = "L"
        elif mate_name.endswith( right_suffix ):
            flag += BAM_FPAIRED + BAM_FREAD1
            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
            unmap_it = True
            half = "R"
        on_plus_strand = ( flag & BAM_FREVERSE == 0 )
        # Convert position from mate to read by adding clipping to cigar
        if not unmap_it:
            read_name = mate_name
        else:
            try:
                read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ]
            except KeyError:
                skip_line( 'missing_mate', i+1, mate_name )
                continue
            cigar = fields[ SAM_CIGAR_COLUMN ]
            cigar_prefix = None
            cigar_suffix = None
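            # The portion of the original read that is absent from this mate (the linker
            # plus the other half) becomes a hard clip (H) on the end facing the other
            # half; an offset on the mate's own outer end becomes a soft clip (S), and a
            # soft-clip prefix also shifts POS left by its length (see below).  Reverse-
            # strand hits swap which end gets the prefix and which gets the suffix.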
            if half == "L": 
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "S" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "H" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "H" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "S" )
            elif half == "R": 
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "H" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "S" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "S" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "H" )
            else:               
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "S" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "S" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "S" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "S" )
            if cigar_prefix != None:
                count, op = cigar_prefix
                cigar = prefix_cigar( "%d%s" % ( count, op ), cigar )
                if op == "S":
                    refPos = int( fields[ SAM_POS_COLUMN ] ) - count
                    fields[ SAM_POS_COLUMN ] = "%d" % refPos
            if cigar_suffix != None:
                count, op = cigar_suffix
                cigar = suffix_cigar( cigar, "%d%s" % ( count, op ) )
            fields[ SAM_QNAME_COLUMN ] = read_name
            fields[ SAM_CIGAR_COLUMN ] = cigar
        # Fetch sequence and quality values, and flip/clip them
        if read_name not in read_to_nucs_dict:
            skip_line( 'missing_seq', i+1, read_name )
            continue
        nucs = read_to_nucs_dict[ read_name ]
        if not on_plus_strand:
            nucs = reverse_complement( nucs )
        quals = None
        if read_to_quals_dict != None:
            if read_name not in read_to_quals_dict:
                skip_line( 'missing_quals', i+1, read_name )
                continue
            quals = read_to_quals_dict[ read_name ]
            if not on_plus_strand:
                quals = reverse_string( quals )
        cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] )
        nucs, quals = clip_for_cigar( cigar, nucs, quals )
        fields[ SAM_SEQ_COLUMN ] = nucs
        if quals != None:
            fields[ SAM_QUAL_COLUMN ] = quals
        # Output the line
        fout.write( "%s\n" % "\t".join( fields ) )
    fout.close()
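# samify_phred_scores() (used above) is defined elsewhere in this module.  The sketch
# below is a hypothetical illustration of what it is assumed to do, not the wrapper's
# actual implementation: SAM quality strings encode PHRED scores as ASCII characters
# offset by 33.
def samify_phred_scores_sketch( phred_scores ):
    # e.g. [ 40, 40, 38 ] -> 'IIG'
    return ''.join( [ chr( q + 33 ) for q in phred_scores ] )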
def split_paired_reads( input2, combined_linker_file_name ):
    """
    Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
    showing where the linker is on each read ( combined_linker_file_name ), split the reads into left and right
    halves.
    
    The input intervals look like this.  Note that they may include multiple intervals for the same read
    ( which should overlap ), and we use the union of them as the linker interval.  Non-overlaps are
    reported to the user, and those reads are not processed.  Starts are origin zero.
    
        #name     strand start len size
        FG3OYDA05FTEES +   219  42 283
        FG3OYDA05FVOLL +   263  41 416
        FG3OYDA05FFL7J +    81  42 421
        FG3OYDA05FOQWE +    55  42 332
        FG3OYDA05FV4DW +   297  42 388
        FG3OYDA05FWAQV +   325  42 419
        FG3OYDA05FVLGA +    90  42 367
        FG3OYDA05FWJ71 +    58  42 276
    
    The output gives each half-sequence on a separate line, like this.  This allows easy sorting of the
    sequences by length, after the fact.
    
        219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
        22  FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG
    """
    # Bob says these should be hard-coded.
    seq_len_lower_threshold = 17
    short_mate_cutoff = 50
    # We need to pass the name of this file back to the caller.
    tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
    mates_file = file( tmp_mates_file_name, "w+b" )
    # Read the linker intervals
    combined_linker_file = file( combined_linker_file_name, "rb" )
    read_to_linker_dict = {}
    i = 0
    for i, line in enumerate( combined_linker_file ):
        line = line.strip()
        if line.startswith( "#" ):
            continue
        if line.find( '#' ) >= 0:
            line = line.split( "#", 1 )[0].rstrip()
        fields = line.split()
        if len( fields ) != 4:
            skip_line( 'num_fields', i+1, line )
            continue
        name, start, length, size = fields
        start = int( start )
        length = int( length )
        size = int( size )
        end = start + length
        if end > size:
            skip_line( 'bad_interval', i+1, name )
            continue
        if name not in read_to_linker_dict:
            read_to_linker_dict[ name ] = ( start, end, size )
            continue
        if read_to_linker_dict[ name ] == None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see below
            continue
        ( s, e, sz ) = read_to_linker_dict[ name ]
        if sz != size:
            skip_line( 'inconsistent_sizes', i+1, name )
            continue
        if s > end or e < start:
            # Non-overlapping intervals, so skip this sequence
            read_to_linker_dict[ name ] = None
            continue
        read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
    combined_linker_file.close()
    # We need to pass the name of this file back to the caller.
    tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
    mates_mapping_file = file( tmp_mates_mapping_file_name, 'w+b' )
    # Process the sequences
    seqs = 0
    fasta_reader = FastaReader( file( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        if seq.name not in read_to_linker_dict:
            if seq.length > seq_len_lower_threshold:
                mates_file.write( "%-3d %s   %s\n" % ( seq.length, seq.name, seq.text ) )
            read_to_linker_dict[ seq.name ] = ""
            continue
        if read_to_linker_dict[ seq.name ] == "":
            skip_line( 'multiple_seqs', seqs, seq.name )
            continue
        if read_to_linker_dict[ seq.name ] == None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see above
            continue
        ( start, end, size ) = read_to_linker_dict[ seq.name ]
        if seq.length != size:
            skip_line( 'wrong_seq_len', seqs, seq.name )
            continue
        left = seq.text[ :start ]
        right = seq.text[ end: ]
        left_is_small = len( left ) <= seq_len_lower_threshold
        right_is_small = len( right ) <= seq_len_lower_threshold
        if left_is_small and right_is_small:
            continue
        if not left_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
        if not right_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
        read_to_linker_dict[ seq.name ] = ""
    combined_linker_file.close()
    mates_file.close()
    mates_mapping_file.close()
    # Create temporary files for short and long mates
    tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
    tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
    tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
    tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
    i = 0
    for i, line in enumerate( file( tmp_mates_file_name, 'rb' ) ):
        fields = line.split()
        seq_len = int( fields[0] )
        seq_name = fields[1]
        seq_text = fields[2]
        if seq_len <= short_mate_cutoff:
            tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
        else:
            tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
    tmp_mates_short.close()
    tmp_mates_long.close()
    return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
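# A minimal sketch (hypothetical helper, not part of this tool) of the interval-union rule
# split_paired_reads applies above: overlapping linker intervals for the same read are
# merged, while non-overlapping intervals mark the read as unusable (None).
def merge_linker_intervals_sketch( existing, new ):
    ( s, e ) = existing
    ( start, end ) = new
    if s > end or e < start:
        # Non-overlapping intervals -- the read will be skipped
        return None
    return ( min( s, start ), max( e, end ) )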
def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
    tmp_align_file_names = []
    if ref_source == 'history':
        # Reference is a fasta dataset from the history
        # Create temporary files to contain the output from lastz executions
        tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
        tmp_align_file_names.append( tmp_short_file_name )
        tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
        tmp_align_file_names.append( tmp_long_file_name )
        seqs = 0
        fasta_reader = FastaReader( open( input1 ) )
        while True:
            # Read the next sequence from the reference dataset.  Note that if the reference contains
            # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
            # of small straggler scaffolds and contigs and it is a computational waste to do each one
            # of these in its own run.  There is an I/O down side to running by subsets (even if they are
            # one sequence per subset), compared to splitting the reference into sizes of 250 mb.  With
            # the subset action, lastz still has to read and parse the entire file for every run (this
            # is true for fasta, but for .2bit files it can access each sequence directly within the file,
            # so the overhead is minimal).
            """
            :> output_file  (this creates the output file, empty)
            while there are more sequences to align
                find the next sequences that add up to 250M, put their names in farf.names
                lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ... 
                  >> output_file
            """
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz.
            # We're doing this a bit differently here since we could be generating a huge
            # number of temporary files.
            tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
            tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
            tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in_file.close()
            # Align short mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_long_file_name
            run_command( command )
            # Remove the temporary file that contains the current sequence
            os.remove( tmp_in_file_name )
    else:
        # Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file
        tbf = TwoBitFile( open( input1, 'rb' ) )
        for chrom in tbf.keys():
            # Align short mates
            tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_short_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_long_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_long_file_name
            run_command( command )
    return tmp_align_file_names
# Example 4
def __main__():
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option(
        '',
        '--ref_name',
        dest='ref_name',
        help='The reference name to change all output matches to')
    parser.add_option(
        '',
        '--ref_source',
        dest='ref_source',
        help='Whether the reference is cached or from the history')
    parser.add_option('',
                      '--ref_sequences',
                      dest='ref_sequences',
                      help='Number of sequences in the reference dataset')
    parser.add_option('',
                      '--source_select',
                      dest='source_select',
                      help='Whether to use a pre-set or cached reference file')
    parser.add_option(
        '',
        '--input1',
        dest='input1',
        help=
        'The name of the reference file if using history or reference base name if using cached'
    )
    parser.add_option('',
                      '--input2',
                      dest='input2',
                      help='The reads file to align')
    parser.add_option(
        '',
        '--pre_set_options',
        dest='pre_set_options',
        help='Which of the pre set options to use, if using pre-sets')
    parser.add_option(
        '',
        '--strand',
        dest='strand',
        help='Which strand of the read to search, if specifying all parameters'
    )
    parser.add_option('',
                      '--seed',
                      dest='seed',
                      help='Seeding settings, if specifying all parameters')
    parser.add_option(
        '',
        '--transition',
        dest='transition',
        help=
        'Number of transitions to allow in each seed hit, if specifying all parameters'
    )
    parser.add_option(
        '',
        '--gfextend',
        dest='gfextend',
        help=
        'Whether to perform gap-free extension of seed hits to HSPs (high scoring segment pairs), if specifying all parameters'
    )
    parser.add_option(
        '',
        '--chain',
        dest='chain',
        help='Whether to perform chaining of HSPs, if specifying all parameters'
    )
    parser.add_option('',
                      '--O',
                      dest='O',
                      help='Gap opening penalty, if specifying all parameters')
    parser.add_option(
        '',
        '--E',
        dest='E',
        help='Gap extension penalty, if specifying all parameters')
    parser.add_option('',
                      '--X',
                      dest='X',
                      help='X-drop threshold, if specifying all parameters')
    parser.add_option('',
                      '--Y',
                      dest='Y',
                      help='Y-drop threshold, if specifying all parameters')
    parser.add_option('',
                      '--K',
                      dest='K',
                      help='Threshold for HSPs, if specifying all parameters')
    parser.add_option(
        '',
        '--L',
        dest='L',
        help='Threshold for gapped alignments, if specifying all parameters')
    parser.add_option(
        '',
        '--entropy',
        dest='entropy',
        help=
        'Whether to involve entropy when filtering HSPs, if specifying all parameters'
    )
    parser.add_option(
        '',
        '--identity_min',
        dest='identity_min',
        help="Minimum identity (don't report matches under this identity)")
    parser.add_option(
        '',
        '--identity_max',
        dest='identity_max',
        help="Maximum identity (don't report matches above this identity)")
    parser.add_option(
        '',
        '--coverage',
        dest='coverage',
        help=
        "The minimum coverage value (don't report matches covering less than this)"
    )
    parser.add_option(
        '',
        '--out_format',
        dest='format',
        help='The format of the output file (sam, diffs, or tabular (general))'
    )
    parser.add_option('', '--output', dest='output', help='The output file')
    parser.add_option('',
                      '--num_threads',
                      dest='num_threads',
                      help='The number of threads to run')
    parser.add_option('',
                      '--lastzSeqsFileDir',
                      dest='lastzSeqsFileDir',
                      help='Directory of local lastz_seqs.loc file')
    (options, args) = parser.parse_args()

    # If the reference sequences are from the history, temporary input files will be created
    # ( 1 for each sequence ), and we'll keep track of them for later removal from disk ( by closing them )
    tmp_in_file_names = []
    # Each thread will create a temporary file to which it writes the output from lastz
    tmp_out_file_names = []
    # Execution of lastz based on job splitting
    commands = []
    if options.ref_name != 'None':
        ref_name = '[nickname=%s]' % options.ref_name
    else:
        ref_name = ''
    # Prepare for commonly-used preset options
    if options.source_select == 'pre_set':
        set_options = '--%s' % options.pre_set_options
    # Prepare for user-specified options
    else:
        set_options = '--%s --%s --gapped --strand=%s --seed=%s --%s O=%s E=%s X=%s Y=%s K=%s L=%s --%s' % \
                    ( options.gfextend, options.chain, options.strand, options.seed,
                      options.transition, options.O, options.E, options.X,
                      options.Y, options.K, options.L, options.entropy )
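    # With hypothetical values (e.g. gfextend='gfextend', chain='chain', strand='both',
    # seed='12of19', transition='transition', O='400', E='30', X='910', Y='9370',
    # K='3000', L='3000', entropy='entropy') this would yield something like:
    #   --gfextend --chain --gapped --strand=both --seed=12of19 --transition O=400 E=30 X=910 Y=9370 K=3000 L=3000 --entropy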
    # Specify input2 and add [fullnames] modifier if output format is diffs
    if options.format == 'diffs':
        input2 = '%s[fullnames]' % options.input2
    else:
        input2 = options.input2
    if options.format == 'tabular':
        # Change output format to general if it's tabular and add field names for tabular output
        format = 'general-'
        tabular_fields = ':score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle'
    elif options.format == 'sam':
        # We currently ALWAYS suppress SAM headers.
        format = 'sam-'
        tabular_fields = ''
    else:
        format = options.format
        tabular_fields = ''
    if options.ref_source == 'history':
        # Reference is a fasta dataset from the history, so split job across number of sequences in the dataset
        try:
            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
            error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
            ref_sequences = int(options.ref_sequences)
            if ref_sequences < 1:
                stop_err(error_msg)
        except:
            stop_err(error_msg)
        seqs = 0
        fasta_reader = FastaReader(open(options.input1))
        while True:
            # Read the next sequence from the reference dataset
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz
            tmp_in = tempfile.NamedTemporaryFile(prefix=seq.name,
                                                 suffix='.fasta')
            tmp_in_name = tmp_in.name
            tmp_in.close()
            tmp_in = file(tmp_in_name, 'w+b')
            # Keep track of our list of temporary input files so we can remove them later by closing them
            tmp_in_file_names.append(tmp_in_name)
            # Write the current sequence to the temporary input file
            tmp_in.write('>%s\n%s\n' % (seq.name, seq.text))
            tmp_in.close()
            # Create a 2nd temporary file to contain the output from lastz execution on the current sequence
            tmp_out = tempfile.NamedTemporaryFile(prefix='%s_out' % seq.name)
            tmp_out_name = tmp_out.name
            tmp_out.close()
            # Keep track of our list of temporary output files so we can merge them into our output dataset
            tmp_out_file_names.append(tmp_out_name)
            # Generate the command line for calling lastz on the current sequence
            command = 'lastz %s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s > %s' % \
                ( tmp_in_name, ref_name, input2, set_options, options.identity_min,
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Append the command line to our list of commands for sending to the LastzJobRunner queue
            commands.append(command)
        # Make sure the value of sequences in the metadata is the
        # same as the number of sequences read from the dataset ( this may not be necessary ).
        if ref_sequences != seqs:
            stop_err(
                "The value of metadata.sequences (%d) differs from the number of sequences read from the reference (%d)."
                % (ref_sequences, seqs))
    else:
        # Reference is a locally cached 2bit file, split job across number of chroms in 2bit file
        tbf = TwoBitFile(open(options.input1, 'r'))
        for chrom in tbf.keys():
            # Create a temporary file to contain the output from lastz execution on the current chrom
            tmp_out = tempfile.NamedTemporaryFile(prefix='%s_out' % chrom)
            tmp_out_name = tmp_out.name
            tmp_out.close()
            # Keep track of our list of temporary output files so we can merge them into our output dataset
            tmp_out_file_names.append(tmp_out_name)
            command = 'lastz %s/%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s >> %s' % \
                ( options.input1, chrom, ref_name, input2, set_options, options.identity_min,
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            commands.append(command)
    job_runner = LastzJobRunner(int(options.num_threads), commands)
    # Merge all of the output from lastz ( currently in temporary files ) into our output dataset
    command = 'cat %s >> %s' % (' '.join(tmp_out_file_names), options.output)
    proc = subprocess.Popen(args=command, shell=True)
    proc.wait()
    # Remove all temporary files from disk by closing them
    for name in tmp_in_file_names:
        try:
            os.remove(name)
        except:
            pass
    for name in tmp_out_file_names:
        try:
            os.remove(name)
        except:
            pass
# Example 5
def __main__():
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '', '--threads', dest='threads', help='The number of threads to use' )
    parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
    parser.add_option( '', '--ref_source', dest='ref_source', help='Whether the reference is self, cached or from the history' )
    parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
    parser.add_option( '', '--mirror', dest='mirror', help='Do or do not report mirror image of all gap-free alignments' )
    parser.add_option( '', '--source_select', dest='source_select', help='Whether to use a pre-set or cached reference file' )
    parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
    parser.add_option( '', '--input2', dest='input2', help='The reads file to align' )
    parser.add_option( '', '--strand', dest='strand', help='Which strand of the read to search, if specifying all parameters' )
    parser.add_option( '', '--match_reward', dest='match_reward', help='Score values for a match (reward)' )
    parser.add_option( '', '--match_penalty', dest='match_penalty', help='Score penalty for a mismatch; defaults to the match reward when not specified' )
    parser.add_option( '', '--gapped', dest='gapped', help='Perform gapped extension of HSPs (or seeds if gap-free extension is not performed) after first reducing them to anchor points' )
    parser.add_option( '', '--gap_open', dest='gap_open', help='Score penalties for opening a gap' )
    parser.add_option( '', '--gap_extend', dest='gap_extend', help='Score penalties for extending a gap' )
    parser.add_option( '', '--ambiguous', dest='ambiguous', help='How ambiguous nucleotides should be treated' )
    parser.add_option( '', '--step', dest='step', help='Offset between the starting positions of successive target words considered for potential seeds' )
    parser.add_option( '', '--masking', dest='masking', help='Dynamically mask the target sequence by excluding any positions that appear in too many alignments from further consideration for seeds' )
    parser.add_option( '', '--seed', dest='seed', help='Seed pattern to use when looking for word matches (or "match" to require exact-match words)' )
    parser.add_option( '', '--match_length', dest='match_length', help='Seeds require a word of this length (bp) with matches in all positions' )
    parser.add_option( '', '--transition', dest='transition', help='Transition settings, affects the number of allowed transition substitutions in each seed' )
    parser.add_option( '', '--xdrop', dest='xdrop', help='Find HSPs using the xdrop extension method with the given termination threshold instead of using the exact match method' )
    parser.add_option( '', '--hspthresh', dest='hspthresh', help='Score threshold for the x-drop extension method' )
    parser.add_option( '', '--entropy', dest='entropy', help='Whether to adjust for entropy when qualifying HSPs in the x-drop extension method' )
    parser.add_option( '', '--chain', dest='chain', help='Perform chaining of HSPs with no penalties' )
    parser.add_option( '', '--ydrop', dest='ydrop', help='Set the threshold for terminating gapped extension' )
    parser.add_option( '', '--ytrim', dest='ytrim', help='Trim back to peak score if y-drop extension encounters end of sequence' )
    parser.add_option( '', '--gappedthresh', dest='gappedthresh', help='Threshold for gapped extension.  Alignments scoring lower are discarded.' )
    parser.add_option( '', '--filter', dest='filter', help='Filter alignments.' )
    parser.add_option( '', '--identity_min', dest='identity_min', help='Minimum for filtering alignments by their percent identity.' )
    parser.add_option( '', '--identity_max', dest='identity_max', help='Maximum for filtering alignments by their percent identity.' )
    parser.add_option( '', '--coverage_min', dest='coverage_min', help='Minimum for filtering alignments by how much of the input sequence they cover.' )
    parser.add_option( '', '--coverage_max', dest='coverage_max', help='Maximum for filtering alignments by how much of the input sequence they cover.' )
    parser.add_option( '', '--nmatch_min', dest='nmatch_min', help='Minimum for filtering alignments by how many bases they match.' )
    parser.add_option( '', '--nmismatch_max', dest='nmismatch_max', help='Maximum for filtering alignments by the number of mismatches.' )
    parser.add_option( '', '--trivial', dest='trivial', help='Do or do not output a trivial self-alignment block if the target and query sequences are identical.' )
    parser.add_option( '', '--inner', dest='inner', help='Perform additional alignment between the gapped alignment blocks using (presumably) more sensitive alignment parameters.' )
    parser.add_option( '', '--shortcuts_for_yasra', dest='shortcuts_for_yasra', help='Shortcut options to support the Yasra mapping assembler' )
    parser.add_option( '', '--out_format', dest='format', help='The format of the output file (sam, diffs, or tabular (general))' )
    parser.add_option( '', '--output', dest='output', help='The output file' )
    parser.add_option( '', '--lastzSeqsFileDir', dest='lastzSeqsFileDir', help='Directory of local lastz_seqs.loc file' )
    ( options, args ) = parser.parse_args()
    # Output version # of tool
    try:
        tmp = tempfile.NamedTemporaryFile().name
        tmp_stdout = open( tmp, 'wb' )
        proc = subprocess.Popen( args='lastz -v', shell=True, stdout=tmp_stdout )
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = None
        for line in open( tmp_stdout.name, 'rb' ):
            if line.lower().find( 'version' ) >= 0:
                stdout = line.strip()
                break
        if stdout:
            sys.stdout.write( '%s\n' % stdout )
        else:
            raise Exception
    except:
        sys.stdout.write( 'Could not determine Lastz version\n' )

    if options.ref_name:
        ref_name = '[nickname=%s]' % options.ref_name
    else:
        ref_name = ''
    set_options = ''
    # Commonly-used preset options
    if options.source_select == 'pre_set':
        # Handle ref_source
        if options.ref_source == 'self':
            # --mirror is available only if ref_source selection is --self
            if options.mirror == 'yes':
                set_options += '--nomirror '
    else:
        # Full set of user-specified options
        # Handle ref_source
        if options.ref_source == 'self':
            # --mirror is available only if ref_source selection is --self
            if options.mirror == 'yes':
                set_options += '--nomirror '
        else:
            # Using --self automatically enables this option
            if options.trivial == 'no':
                set_options += '--notrivial '
        # Handle --match
        if options.match_reward not in [ "", "0" ]:
            if options.match_penalty in [ "", "0" ]:
                match_penalty = options.match_reward
            else:
                match_penalty = options.match_penalty
            set_options += '--match=%s,%s ' % ( options.match_reward, match_penalty )
        # Handle --gapped
        if options.gapped == 'yes':
            set_options += '--gapped '
            if options.gap_open not in [ "" ]:
                if options.gap_extend in [ "" ]:
                    set_options += '--gap=%s ' % options.gap_open
                else:
                    set_options += '--gap=%s,%s ' % ( options.gap_open, options.gap_extend )
            # Handle --ydrop
            if options.ydrop not in [ "", "0" ]:
                set_options += '--ydrop=%s ' % options.ydrop
            # Handle --ytrim
            if options.ytrim == 'no':
                set_options += '--noytrim '
            # Handle --gappedthresh
            if options.gappedthresh not in [ "", "0" ]:
                set_options += '--gappedthresh=%s ' % options.gappedthresh
            # Handle --inner
            if options.inner not in [ "" ]:
                set_options += '--inner=%s ' % options.inner
        else:
            set_options += '--nogapped '
        # Handle --step
        if options.step not in [ "", "0" ]:
            set_options += '--step=%s ' % options.step
        # Handle --masking
        if options.masking not in [ '0' ]:
            set_options += '--masking=%s ' % options.masking
        # Handle --seed
        if options.seed not in [ "no" ]:
            if options.seed == 'match':
                set_options += '--seed=match%s ' % options.match_length
            else:
                set_options += '--seed=%s ' % options.seed
        # Handle --transition
        if options.transition == '0':
            set_options += '--notransition '
        else:
            set_options += '--transition=%s ' % options.transition
        # Handle --xdrop
        if options.xdrop not in [ "", "0" ]:
            set_options += '--xdrop=%s ' % options.xdrop
        # handle --hspthresh
        if options.hspthresh not in [ "", "0" ]:
            set_options += '--hspthresh=%s ' % options.hspthresh
        # Handle --entropy
        if options.entropy == 'no':
            set_options += '--noentropy '
        else:
            set_options += '--entropy '
        # Handle --chain
        if options.chain == 'no':
            set_options += '--nochain '
        else:
            set_options += '--chain '
        # Handle --filter
        if options.filter not in [ "no" ]:
            if options.filter == 'identity':
                identity_min = options.identity_min
                if options.identity_max in [ "", "0" ] or options.identity_max <= identity_min:
                    identity_max = '100'
                else:
                    identity_max = options.identity_max
                set_options += '--filter=identity:%s..%s ' % ( identity_min, identity_max )
            elif options.filter == 'coverage':
                coverage_min = options.coverage_min
                if options.coverage_max in [ "", "0" ] or options.coverage_max <= coverage_min:
                    coverage_max = '100'
                else:
                    coverage_max = options.coverage_max
                set_options += '--filter=coverage:%s..%s ' % ( coverage_min, coverage_max )
            elif options.filter == 'nmatch':
                set_options += '--filter=nmatch:%s%% ' % options.nmatch_min
            elif options.filter == 'nmismatch':
                set_options += '--filter=nmismatch:0..%s ' % options.nmismatch_max
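        # With hypothetical values these branches produce filter strings such as
        # '--filter=identity:95..100', '--filter=coverage:80..100',
        # '--filter=nmatch:50% ' or '--filter=nmismatch:0..5 '.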
    # Handle --strand
    set_options += '--strand=%s ' % options.strand
    # Handle --ambiguous
    if options.ambiguous not in [ "no" ]:
        set_options += '--ambiguous=%s ' % options.ambiguous
    # Handle --shortcuts_for_yasra
    if options.shortcuts_for_yasra not in [ 'none' ]:
        set_options += '--%s ' % ( options.shortcuts_for_yasra )
    # Specify input2 and add [fullnames] modifier if output format is diffs
    if options.format == 'diffs':
        input2 = '%s[fullnames]' % options.input2
    else:
        input2 = options.input2
    if options.format == 'tabular':
        # Change output format to general if it's tabular and add field names for tabular output
        format = 'general-'
        tabular_fields = ':score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle'
    elif options.format == 'sam':
        # We currently need to keep headers.
        format = 'sam'
        tabular_fields = ''
    else:
        format = options.format
        tabular_fields = ''
    # Set up our queues
    threads = int( options.threads )
    lastz_job_queue = LastzJobQueue( threads, slots=SLOTS )
    combine_data_queue = CombineDataQueue( options.output )
    if str( options.ref_source ) in [ 'history', 'self' ]:
        # Reference is a fasta dataset from the history or the dataset containing the target sequence itself,
        # so split job across the number of sequences in the dataset ( this could be a HUGE number ).
        try:
            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
            error_msg = "The reference dataset is missing metadata.  Click the pencil icon in the history item and 'auto-detect' the metadata attributes."
            ref_sequences = int( options.ref_sequences )
            if ref_sequences < 1:
                stop_queues( lastz_job_queue, combine_data_queue )
                stop_err( error_msg )
        except:
            stop_queues( lastz_job_queue, combine_data_queue )
            stop_err( error_msg )
        seqs = 0
        fasta_reader = FastaReader( open( options.input1 ) )
        while True:
            # Read the next sequence from the reference dataset
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz
            tmp_in_fd, tmp_in_name = tempfile.mkstemp( suffix='.in' )
            tmp_in = os.fdopen( tmp_in_fd, 'wb' )
            # Write the current sequence to the temporary input file
            tmp_in.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in.close()
            # Create a 2nd temporary file to contain the output from lastz execution on the current sequence
            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
            os.close( tmp_out_fd )
            # Generate the command line for calling lastz on the current sequence
            command = 'lastz %s%s %s %s --format=%s%s > %s' % ( tmp_in_name, ref_name, input2, set_options, format, tabular_fields, tmp_out_name )
            # Create a job object
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [ tmp_in_name, tmp_out_name ]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue.  Execution will wait at this point if the queue is full.
            lastz_job_queue.put( job, block=True )
        # Make sure the value of sequences in the metadata is the same as the number of
        # sequences read from the dataset.  According to Bob, this may not be necessary.
        if ref_sequences != seqs:
            stop_queues( lastz_job_queue, combine_data_queue )
            stop_err( "The value of metadata.sequences (%d) differs from the number of sequences read from the reference (%d)." % ( ref_sequences, seqs ) )
    else:
        # Reference is a locally cached 2bit file, split job across number of chroms in 2bit file
        tbf = TwoBitFile( open( options.input1, 'r' ) )
        for chrom in tbf.keys():
            # Create a temporary file to contain the output from lastz execution on the current chrom
            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
            os.close( tmp_out_fd )
            command = 'lastz %s/%s%s %s %s --format=%s%s >> %s' % \
                ( options.input1, chrom, ref_name, input2, set_options, format, tabular_fields, tmp_out_name )
            # Create a job object
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [ tmp_out_name ]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue.  Execution will wait at this point if the queue is full.
            lastz_job_queue.put( job, block=True )
    # Stop the lastz_job_queue.
    for t in lastz_job_queue.threads:
        lastz_job_queue.put( STOP_SIGNAL, True )
    # Although all jobs are submitted to the queue, we can't shut down the combine_data_queue
    # until we know that all jobs have been submitted to its queue.  We do this by checking
    # whether all of the threads in the lastz_job_queue have terminated.
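    # (The threshold of 2 assumes that the only threads left at that point are the main
    # thread and the CombineDataQueue's worker thread.)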
    while threading.activeCount() > 2:
        time.sleep( 1 )
    # Now it's safe to stop the combine_data_queue.
    combine_data_queue.put( STOP_SIGNAL )
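
The shutdown sequence above relies on two conventions: every worker thread exits as soon as it dequeues the STOP_SIGNAL sentinel, and once all lastz workers are gone only the main thread and the single combine-data thread remain, which is what the threading.activeCount() > 2 test detects. A minimal, self-contained sketch of that sentinel pattern (hypothetical names, standard library only; not the actual LastzJobQueue implementation):

import threading
import time
from Queue import Queue          # Python 2; the module is named 'queue' in Python 3

STOP_SIGNAL = object()           # unique sentinel that can never be mistaken for a real job

def worker( job_queue ):
    while True:
        job = job_queue.get( block=True )
        if job is STOP_SIGNAL:
            break                # one sentinel shuts down exactly one worker
        # ... run the job here ( e.g. execute a lastz command line ) ...

jobs = Queue( maxsize=4 )        # bounded queue, so put() blocks when all slots are full
threads = [ threading.Thread( target=worker, args=( jobs, ) ) for _ in range( 3 ) ]
for t in threads:
    t.start()
# ... enqueue the real jobs here ...
for t in threads:
    jobs.put( STOP_SIGNAL, True )            # one sentinel per worker thread
while threading.activeCount() > 1:           # only the main thread should be left
    time.sleep( 1 )
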
Example No. 6
def __main__():
    # Parse the command line
    parser = optparse.OptionParser()
    parser.add_option(
        '',
        '--ref_name',
        dest='ref_name',
        help='The reference name to change all output matches to')
    parser.add_option(
        '',
        '--ref_source',
        dest='ref_source',
        help='Whether the reference is cached or from the history')
    parser.add_option('',
                      '--ref_sequences',
                      dest='ref_sequences',
                      help='Number of sequences in the reference dataset')
    parser.add_option('',
                      '--source_select',
                      dest='source_select',
                      help='Whether to use pre-set or cached reference file')
    parser.add_option(
        '',
        '--input1',
        dest='input1',
        help=
        'The name of the reference file if using history or reference base name if using cached'
    )
    parser.add_option('',
                      '--input2',
                      dest='input2',
                      help='The reads file to align')
    parser.add_option(
        '',
        '--pre_set_options',
        dest='pre_set_options',
        help='Which of the pre-set options to use, if using pre-sets')
    parser.add_option(
        '',
        '--strand',
        dest='strand',
        help='Which strand of the read to search, if specifying all parameters'
    )
    parser.add_option('',
                      '--seed',
                      dest='seed',
                      help='Seeding settings, if specifying all parameters')
    parser.add_option(
        '',
        '--transition',
        dest='transition',
        help=
        'Number of transitions to allow in each seed hit, if specifying all parameters'
    )
    parser.add_option(
        '',
        '--gfextend',
        dest='gfextend',
        help=
        'Whether to perform gap-free extension of seed hits to HSPs (high scoring segment pairs), if specifying all parameters'
    )
    parser.add_option(
        '',
        '--chain',
        dest='chain',
        help='Whether to perform chaining of HSPs, if specifying all parameters'
    )
    parser.add_option('',
                      '--O',
                      dest='O',
                      help='Gap opening penalty, if specifying all parameters')
    parser.add_option(
        '',
        '--E',
        dest='E',
        help='Gap extension penalty, if specifying all parameters')
    parser.add_option('',
                      '--X',
                      dest='X',
                      help='X-drop threshold, if specifying all parameters')
    parser.add_option('',
                      '--Y',
                      dest='Y',
                      help='Y-drop threshold, if specifying all parameters')
    parser.add_option('',
                      '--K',
                      dest='K',
                      help='Threshold for HSPs, if specifying all parameters')
    parser.add_option(
        '',
        '--L',
        dest='L',
        help='Threshold for gapped alignments, if specifying all parameters')
    parser.add_option(
        '',
        '--entropy',
        dest='entropy',
        help=
        'Whether to involve entropy when filtering HSPs, if specifying all parameters'
    )
    parser.add_option(
        '',
        '--identity_min',
        dest='identity_min',
        help="Minimum identity (don't report matches under this identity)")
    parser.add_option(
        '',
        '--identity_max',
        dest='identity_max',
        help="Maximum identity (don't report matches above this identity)")
    parser.add_option(
        '',
        '--coverage',
        dest='coverage',
        help=
        "The minimum coverage value (don't report matches covering less than this)"
    )
    parser.add_option('',
                      '--unmask',
                      dest='unmask',
                      help='Whether to convert lowercase bases to uppercase')
    parser.add_option(
        '',
        '--out_format',
        dest='format',
        help='The format of the output file (sam, diffs, or tabular (general))'
    )
    parser.add_option('', '--output', dest='output', help='The output file')
    parser.add_option('',
                      '--lastzSeqsFileDir',
                      dest='lastzSeqsFileDir',
                      help='Directory of local lastz_seqs.loc file')
    (options, args) = parser.parse_args()

    # Output the version of the lastz tool
    try:
        tmp = tempfile.NamedTemporaryFile().name
        tmp_stdout = open(tmp, 'wb')
        proc = subprocess.Popen(args='lastz -v', shell=True, stdout=tmp_stdout)
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = None
        for line in open(tmp_stdout.name, 'rb'):
            if line.lower().find('version') >= 0:
                stdout = line.strip()
                break
        if stdout:
            sys.stdout.write('%s\n' % stdout)
        else:
            raise Exception
    except:
        sys.stdout.write('Could not determine Lastz version\n')

    if options.unmask == 'yes':
        unmask = '[unmask]'
    else:
        unmask = ''
    if options.ref_name:
        ref_name = '[nickname=%s]' % options.ref_name
    else:
        ref_name = ''
    # Prepare for commonly-used preset options
    if options.source_select == 'pre_set':
        set_options = '--%s' % options.pre_set_options
    # Prepare for user-specified options
    else:
        set_options = '--%s --%s --gapped --strand=%s --seed=%s --%s O=%s E=%s X=%s Y=%s K=%s L=%s --%s' % \
                    ( options.gfextend, options.chain, options.strand, options.seed, options.transition,
                      options.O, options.E, options.X, options.Y, options.K, options.L, options.entropy )
    # Specify input2 and add [fullnames] modifier if output format is diffs
    if options.format == 'diffs':
        input2 = '%s[fullnames]' % options.input2
    else:
        input2 = options.input2
    if options.format == 'tabular':
        # Change output format to general if it's tabular and add field names for tabular output
        format = 'general-'
        tabular_fields = ':score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle'
    elif options.format == 'sam':
        # We currently ALWAYS suppress SAM headers.
        format = 'sam-'
        tabular_fields = ''
    else:
        format = options.format
        tabular_fields = ''

    # Set up our queues
    lastz_job_queue = LastzJobQueue(WORKERS, slots=SLOTS)
    combine_data_queue = CombineDataQueue(options.output)

    if options.ref_source == 'history':
        # Reference is a fasta dataset from the history, so split job across
        # the number of sequences in the dataset ( this could be a HUGE number )
        try:
            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
            error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
            ref_sequences = int(options.ref_sequences)
            if ref_sequences < 1:
                stop_queues(lastz_job_queue, combine_data_queue)
                stop_err(error_msg)
        except:
            stop_queues(lastz_job_queue, combine_data_queue)
            stop_err(error_msg)
        seqs = 0
        fasta_reader = FastaReader(open(options.input1))
        while True:
            # Read the next sequence from the reference dataset
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz
            tmp_in_fd, tmp_in_name = tempfile.mkstemp(suffix='.in')
            tmp_in = os.fdopen(tmp_in_fd, 'wb')
            # Write the current sequence to the temporary input file
            tmp_in.write('>%s\n%s\n' % (seq.name, seq.text))
            tmp_in.close()
            # Create a 2nd temporary file to contain the output from lastz execution on the current sequence
            tmp_out_fd, tmp_out_name = tempfile.mkstemp(suffix='.out')
            os.close(tmp_out_fd)
            # Generate the command line for calling lastz on the current sequence
            command = 'lastz %s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s > %s' % \
                ( tmp_in_name, unmask, ref_name, input2, set_options, options.identity_min,
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Create a job object
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [tmp_in_name, tmp_out_name]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue. Execution
            # will wait at this point if the queue is full.
            lastz_job_queue.put(job, block=True)
        # Make sure the value of sequences in the metadata is the same as the
        # number of sequences read from the dataset ( this may not be necessary ).
        if ref_sequences != seqs:
            stop_queues(lastz_job_queue, combine_data_queue)
            stop_err(
                "The value of metadata.sequences (%d) differs from the number of sequences read from the reference (%d)."
                % (ref_sequences, seqs))
    else:
        # Reference is a locally cached 2bit file, split job across number of chroms in 2bit file
        tbf = TwoBitFile(open(options.input1, 'r'))
        for chrom in tbf.keys():
            # Create a temporary file to contain the output from lastz execution on the current chrom
            tmp_out_fd, tmp_out_name = tempfile.mkstemp(suffix='.out')
            os.close(tmp_out_fd)
            command = 'lastz %s/%s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s >> %s' % \
                ( options.input1, chrom, unmask, ref_name, input2, set_options, options.identity_min,
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Create a job object
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [tmp_out_name]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue. Execution
            # will wait at this point if the queue is full.
            lastz_job_queue.put(job, block=True)

    # Stop the lastz_job_queue
    for t in lastz_job_queue.threads:
        lastz_job_queue.put(STOP_SIGNAL, True)
    # Although all jobs are submitted to the queue, we can't shut down the combine_data_queue
    # until we know that all jobs have been submitted to its queue.  We do this by checking
    # whether all of the threads in the lastz_job_queue have terminated.
    while threading.activeCount() > 2:
        time.sleep(1)
    # Now it's safe to stop the combine_data_queue
    combine_data_queue.put(STOP_SIGNAL)
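
For orientation, the cached-reference branch above emits one shell command per chromosome in the 2bit file. Filling the same format string with purely hypothetical values shows the shape of what gets queued (every value below is made up for illustration only):

set_options = '--gfextend --chain --gapped --strand=both --seed=12of19 --transition ' \
              'O=400 E=30 X=910 Y=9370 K=3000 L=3000 --entropy'
command = 'lastz %s/%s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s >> %s' % \
    ( '/cache/hg19.2bit', 'chr21', '[unmask]', '[nickname=hg19]', 'reads.fasta', set_options,
      '90', '100', '50', 'sam-', '', '/tmp/chr21.out' )
print command
# lastz /cache/hg19.2bit/chr21[unmask][nickname=hg19] reads.fasta --gfextend --chain --gapped
#     --strand=both --seed=12of19 --transition O=400 E=30 X=910 Y=9370 K=3000 L=3000 --entropy
#     --ambiguousn --nolaj --identity=90..100 --coverage=50 --format=sam- >> /tmp/chr21.out
# ( printed as a single line; wrapped here for readability )
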
Example No. 7
def __main__():
    # Parse the command line
    parser = optparse.OptionParser()
    parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
    parser.add_option( '', '--ref_source', dest='ref_source', help='Whether the reference is cached or from the history' )
    parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
    parser.add_option( '', '--source_select', dest='source_select', help='Whether to use pre-set or cached reference file' )
    parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
    parser.add_option( '', '--input2', dest='input2', help='The reads file to align' )
    parser.add_option( '', '--pre_set_options', dest='pre_set_options', help='Which of the pre-set options to use, if using pre-sets' )
    parser.add_option( '', '--strand', dest='strand', help='Which strand of the read to search, if specifying all parameters' )
    parser.add_option( '', '--seed', dest='seed', help='Seeding settings, if specifying all parameters' )
    parser.add_option( '', '--transition', dest='transition', help='Number of transitions to allow in each seed hit, if specifying all parameters' )
    parser.add_option( '', '--gfextend', dest='gfextend', help='Whether to perform gap-free extension of seed hits to HSPs (high scoring segment pairs), if specifying all parameters' )
    parser.add_option( '', '--chain', dest='chain', help='Whether to perform chaining of HSPs, if specifying all parameters' )
    parser.add_option( '', '--O', dest='O', help='Gap opening penalty, if specifying all parameters' )
    parser.add_option( '', '--E', dest='E', help='Gap extension penalty, if specifying all parameters' )
    parser.add_option( '', '--X', dest='X', help='X-drop threshold, if specifying all parameters' )
    parser.add_option( '', '--Y', dest='Y', help='Y-drop threshold, if specifying all parameters' )
    parser.add_option( '', '--K', dest='K', help='Threshold for HSPs, if specifying all parameters' )
    parser.add_option( '', '--L', dest='L', help='Threshold for gapped alignments, if specifying all parameters' )
    parser.add_option( '', '--entropy', dest='entropy', help='Whether to involve entropy when filtering HSPs, if specifying all parameters' )
    parser.add_option( '', '--identity_min', dest='identity_min', help="Minimum identity (don't report matches under this identity)" )
    parser.add_option( '', '--identity_max', dest='identity_max', help="Maximum identity (don't report matches above this identity)" )
    parser.add_option( '', '--coverage', dest='coverage', help="The minimum coverage value (don't report matches covering less than this)" )
    parser.add_option( '', '--unmask', dest='unmask', help='Whether to convert lowercase bases to uppercase' )
    parser.add_option( '', '--out_format', dest='format', help='The format of the output file (sam, diffs, or tabular (general))' )
    parser.add_option( '', '--output', dest='output', help='The output file' )
    parser.add_option( '', '--lastzSeqsFileDir', dest='lastzSeqsFileDir', help='Directory of local lastz_seqs.loc file' )
    ( options, args ) = parser.parse_args()

    # Output the version of the lastz tool
    try:
        tmp = tempfile.NamedTemporaryFile().name
        tmp_stdout = open( tmp, 'wb' )
        proc = subprocess.Popen( args='lastz -v', shell=True, stdout=tmp_stdout )
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = None
        for line in open( tmp_stdout.name, 'rb' ):
            if line.lower().find( 'version' ) >= 0:
                stdout = line.strip()
                break
        if stdout:
            sys.stdout.write( '%s\n' % stdout )
        else:
            raise Exception
    except:
        sys.stdout.write( 'Could not determine Lastz version\n' )

    if options.unmask == 'yes':
        unmask = '[unmask]'
    else:
        unmask = ''
    if options.ref_name:
        ref_name = '[nickname=%s]' % options.ref_name
    else:
        ref_name = ''
    # Prepare for commonly-used preset options
    if options.source_select == 'pre_set':
        set_options = '--%s' % options.pre_set_options
    # Prepare for user-specified options
    else:
        set_options = '--%s --%s --gapped --strand=%s --seed=%s --%s O=%s E=%s X=%s Y=%s K=%s L=%s --%s' % \
                    ( options.gfextend, options.chain, options.strand, options.seed, options.transition,
                      options.O, options.E, options.X, options.Y, options.K, options.L, options.entropy )
    # Specify input2 and add [fullnames] modifier if output format is diffs
    if options.format == 'diffs':
        input2 = '%s[fullnames]' % options.input2
    else:
        input2 = options.input2
    if options.format == 'tabular':
        # Change output format to general if it's tabular and add field names for tabular output
        format = 'general-'
        tabular_fields = ':score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle'
    elif options.format == 'sam':
        # We currently ALWAYS suppress SAM headers.
        format = 'sam-'
        tabular_fields = ''
    else:
        format = options.format
        tabular_fields = ''

    # Set up our queues
    lastz_job_queue = LastzJobQueue( WORKERS, slots=SLOTS )
    combine_data_queue = CombineDataQueue( options.output )

    if options.ref_source == 'history':
        # Reference is a fasta dataset from the history, so split job across
        # the number of sequences in the dataset ( this could be a HUGE number )
        try:
            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
            error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
            ref_sequences = int( options.ref_sequences )
            if ref_sequences < 1:
                stop_queues( lastz_job_queue, combine_data_queue )
                stop_err( error_msg )
        except:
            stop_queues( lastz_job_queue, combine_data_queue )
            stop_err( error_msg )
        seqs = 0
        fasta_reader = FastaReader( open( options.input1 ) )
        while True:
            # Read the next sequence from the reference dataset
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz
            tmp_in_fd, tmp_in_name = tempfile.mkstemp( suffix='.in' )
            tmp_in = os.fdopen( tmp_in_fd, 'wb' )
            # Write the current sequence to the temporary input file
            tmp_in.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in.close()
            # Create a 2nd temporary file to contain the output from lastz execution on the current sequence
            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
            os.close( tmp_out_fd )
            # Generate the command line for calling lastz on the current sequence
            command = 'lastz %s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s > %s' % \
                ( tmp_in_name, unmask, ref_name, input2, set_options, options.identity_min, 
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Create a job object
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [ tmp_in_name, tmp_out_name ]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue. Execution 
            # will wait at this point if the queue is full.
            lastz_job_queue.put( job, block=True )
        # Make sure the value of sequences in the metadata is the same as the
        # number of sequences read from the dataset ( this may not be necessary ).
        if ref_sequences != seqs:
            stop_queues( lastz_job_queue, combine_data_queue )
            stop_err( "The value of metadata.sequences (%d) differs from the number of sequences read from the reference (%d)." % ( ref_sequences, seqs ) )
    else:
        # Reference is a locally cached 2bit file, split job across number of chroms in 2bit file
        tbf = TwoBitFile( open( options.input1, 'r' ) )
        for chrom in tbf.keys():
            # Create a temporary file to contain the output from lastz execution on the current chrom
            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
            os.close( tmp_out_fd )
            command = 'lastz %s/%s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s >> %s' % \
                ( options.input1, chrom, unmask, ref_name, input2, set_options, options.identity_min, 
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Create a job object
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [ tmp_out_name ]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue. Execution 
            # will wait at this point if the queue is full.
            lastz_job_queue.put( job, block=True )

    # Stop the lastz_job_queue
    for t in lastz_job_queue.threads:
        lastz_job_queue.put( STOP_SIGNAL, True )
    # Although all jobs are submitted to the queue, we can't shut down the combine_data_queue
    # until we know that all jobs have been submitted to its queue.  We do this by checking
    # whether all of the threads in the lastz_job_queue have terminated.
    while threading.activeCount() > 2:
        time.sleep( 1 )
    # Now it's safe to stop the combine_data_queue
    combine_data_queue.put( STOP_SIGNAL )
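
When --out_format=tabular is selected, the wrappers request lastz 'general-' output (the trailing dash suppresses lastz's header line, mirroring the sam- case noted in the code) with the explicit field list shown above, so every output row is a tab-separated record in that fixed column order. A minimal sketch of reading such a file back into per-row dictionaries (the helper and file name are hypothetical, not part of the wrapper):

GENERAL_FIELDS = ( 'score,name1,strand1,size1,start1,zstart1,end1,length1,text1,'
                   'name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,'
                   'length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle' ).split( ',' )

def read_general_rows( path ):
    # Each non-empty line of 'general-' output is one alignment, tab-separated,
    # with columns in exactly the order that was requested on the command line.
    for line in open( path ):
        line = line.rstrip( '\n' )
        if not line or line.startswith( '#' ):
            continue
        yield dict( zip( GENERAL_FIELDS, line.split( '\t' ) ) )

# Usage (hypothetical file name):
# for row in read_general_rows( 'lastz_output.tabular' ):
#     print row['name2'], row['identity'], row['coverage']
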
Example No. 8
def __main__():
    # Parse the command line
    parser = optparse.OptionParser()
    parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
    parser.add_option( '', '--ref_source', dest='ref_source', help='Whether the reference is cached or from the history' )
    parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
    parser.add_option( '', '--source_select', dest='source_select', help='Whether to use pre-set or cached reference file' )
    parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
    parser.add_option( '', '--input2', dest='input2', help='The reads file to align' )
    parser.add_option( '', '--pre_set_options', dest='pre_set_options', help='Which of the pre-set options to use, if using pre-sets' )
    parser.add_option( '', '--strand', dest='strand', help='Which strand of the read to search, if specifying all parameters' )
    parser.add_option( '', '--seed', dest='seed', help='Seeding settings, if specifying all parameters' )
    parser.add_option( '', '--transition', dest='transition', help='Number of transitions to allow in each seed hit, if specifying all parameters' )
    parser.add_option( '', '--gfextend', dest='gfextend', help='Whether to perform gap-free extension of seed hits to HSPs (high scoring segment pairs), if specifying all parameters' )
    parser.add_option( '', '--chain', dest='chain', help='Whether to perform chaining of HSPs, if specifying all parameters' )
    parser.add_option( '', '--O', dest='O', help='Gap opening penalty, if specifying all parameters' )
    parser.add_option( '', '--E', dest='E', help='Gap extension penalty, if specifying all parameters' )
    parser.add_option( '', '--X', dest='X', help='X-drop threshold, if specifying all parameters' )
    parser.add_option( '', '--Y', dest='Y', help='Y-drop threshold, if specifying all parameters' )
    parser.add_option( '', '--K', dest='K', help='Threshold for HSPs, if specifying all parameters' )
    parser.add_option( '', '--L', dest='L', help='Threshold for gapped alignments, if specifying all parameters' )
    parser.add_option( '', '--entropy', dest='entropy', help='Whether to involve entropy when filtering HSPs, if specifying all parameters' )
    parser.add_option( '', '--identity_min', dest='identity_min', help="Minimum identity (don't report matches under this identity)" )
    parser.add_option( '', '--identity_max', dest='identity_max', help="Maximum identity (don't report matches above this identity)" )
    parser.add_option( '', '--coverage', dest='coverage', help="The minimum coverage value (don't report matches covering less than this)" )
    parser.add_option( '', '--out_format', dest='format', help='The format of the output file (sam, diffs, or tabular (general))' )
    parser.add_option( '', '--output', dest='output', help='The output file' )
    parser.add_option( '', '--num_threads', dest='num_threads', help='The number of threads to run' )
    parser.add_option( '', '--lastzSeqsFileDir', dest='lastzSeqsFileDir', help='Directory of local lastz_seqs.loc file' )
    ( options, args ) = parser.parse_args()

    # If the reference sequences are from the history, temporary input files will be created
    # ( 1 for each sequence ), and we'll keep track of them so they can be removed from disk later
    tmp_in_file_names = []
    # Each thread will create a temporary file to which it writes the output from lastz
    tmp_out_file_names = []
    # Command lines for the lastz jobs produced by splitting the input
    commands = []
    if options.ref_name != 'None':
        ref_name = '[nickname=%s]' % options.ref_name
    else:
        ref_name = ''
    # Prepare for commonly-used preset options
    if options.source_select == 'pre_set':
        set_options = '--%s' % options.pre_set_options
    # Prepare for user-specified options
    else:
        set_options = '--%s --%s --gapped --strand=%s --seed=%s --%s O=%s E=%s X=%s Y=%s K=%s L=%s --%s' % \
                    ( options.gfextend, options.chain, options.strand, options.seed, 
                      options.transition, options.O, options.E, options.X, 
                      options.Y, options.K, options.L, options.entropy )
    # Specify input2 and add [fullnames] modifier if output format is diffs
    if options.format == 'diffs':
        input2 = '%s[fullnames]' % options.input2
    else:
        input2 = options.input2
    if options.format == 'tabular':
        # Change output format to general if it's tabular and add field names for tabular output
        format = 'general-'
        tabular_fields = ':score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle'
    elif options.format == 'sam':
        # We currently ALWAYS suppress SAM headers.
        format = 'sam-'
        tabular_fields = ''
    else:
        format = options.format
        tabular_fields = ''
    if options.ref_source == 'history':
        # Reference is a fasta dataset from the history, so split job across number of sequences in the dataset
        try:
            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
            error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
            ref_sequences = int( options.ref_sequences )
            if ref_sequences < 1:
                stop_err( error_msg )
        except:
            stop_err( error_msg )
        seqs = 0
        fasta_reader = FastaReader( open( options.input1 ) )
        while True:
            # Read the next sequence from the reference dataset
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz
            # NamedTemporaryFile is used only to reserve a unique name; the file is deleted
            # when it is closed and immediately recreated below under the same name.
            tmp_in = tempfile.NamedTemporaryFile( prefix=seq.name, suffix='.fasta' )
            tmp_in_name = tmp_in.name
            tmp_in.close()
            tmp_in = open( tmp_in_name, 'w+b' )
            # Keep track of our list of temporary input files so we can remove them from disk later
            tmp_in_file_names.append( tmp_in_name )
            # Write the current sequence to the temporary input file
            tmp_in.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in.close()
            # Create a 2nd temporary file to contain the output from lastz execution on the current sequence
            tmp_out = tempfile.NamedTemporaryFile( prefix='%s_out' % seq.name )
            tmp_out_name = tmp_out.name
            tmp_out.close()
            # Keep track of our list of temporary output files so we can merge them into our output dataset
            tmp_out_file_names.append( tmp_out_name )
            # Generate the command line for calling lastz on the current sequence
            command = 'lastz %s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s > %s' % \
                ( tmp_in_name, ref_name, input2, set_options, options.identity_min, 
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Append the command line to our list of commands for sending to the LastzJobRunner queue
            commands.append( command )
        # Make sure the value of sequences in the metadata is the
        # same as the number of sequences read from the dataset ( this may not be necessary ).
        if ref_sequences != seqs:
            stop_error( "The value of metadata.sequences (%d) differs from the number of sequences read from the reference ( %d)." % ( ref_sequences, seqs ) )
    else:
        # Reference is a locally cached 2bit file, split job across number of chroms in 2bit file
        tbf = TwoBitFile( open( options.input1, 'r' ) )
        for chrom in tbf.keys():
            # Create a temporary file to contain the output from lastz execution on the current chrom
            tmp_out = tempfile.NamedTemporaryFile( prefix='%s_out' % chrom )
            tmp_out_name = tmp_out.name
            tmp_out.close()
            # Keep track of our list of temporary output files so we can merge them into our output dataset
            tmp_out_file_names.append( tmp_out_name )
            command = 'lastz %s/%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s >> %s' % \
                ( options.input1, chrom, ref_name, input2, set_options, options.identity_min, 
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            commands.append( command )
    # Run all of the lastz commands across the requested number of threads
    job_runner = LastzJobRunner( int( options.num_threads ), commands )
    # Merge all of the output from lastz ( currently in temporary files ) into our output dataset
    command = 'cat %s >> %s' % ( ' '.join( tmp_out_file_names ), options.output )
    proc = subprocess.Popen( args=command, shell=True )
    proc.wait()
    # Remove all temporary files from disk
    for name in tmp_in_file_names:
        try:
            os.remove( name )
        except:
            pass
    for name in tmp_out_file_names:
        try:
            os.remove( name )
        except:
            pass
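
Example No. 8 delegates its command list to LastzJobRunner, whose implementation is not part of this excerpt. A minimal thread-pool sketch with the same calling convention (distribute the shell commands over num_threads workers and return only once every command has finished) might look like the following; this is an assumption about its behaviour, not the actual class:

import subprocess
import threading
from Queue import Queue, Empty   # Python 2; the module is named 'queue' in Python 3

class SimpleCommandRunner( object ):
    """Hypothetical stand-in for LastzJobRunner: run shell commands on a fixed pool of threads."""
    def __init__( self, num_threads, commands ):
        self.queue = Queue()
        for command in commands:
            self.queue.put( command )
        threads = [ threading.Thread( target=self.run_next ) for _ in range( num_threads ) ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()    # the constructor returns only after every queued command has run
    def run_next( self ):
        while True:
            try:
                command = self.queue.get_nowait()
            except Empty:
                return  # queue drained, this worker is done
            proc = subprocess.Popen( args=command, shell=True )
            proc.wait()

# Usage with hypothetical commands:
# SimpleCommandRunner( 4, [ 'echo chr21', 'echo chr22' ] )
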