Esempio n. 1
0
def run( pos_file, neg_file, format, mapping, radix, orders, modname, fold, loo ):

    # Split up 

    # Read integer sequences
    pos_strings = list( rp.io.get_reader( pos_file, format, mapping ) )
    neg_strings = list( rp.io.get_reader( neg_file, format, mapping ) )

    # Determine radix
    if not radix:
        if mapping: radix = mapping.get_out_size()
        else: radix = max( map( max, pos_strings ) + map( max, neg_strings ) ) + 1

    print "Order     TP  ~TP  ~FN   FN   FP  ~FP  ~TN   TN       %    time"

    # Cross validate for various orders
    for order in orders:
        model_factory = lambda d0, d1: rp.models.train( modname, order, radix, d0, d1 )
        if loo: passes = 1
        else: passes = 5 
        cv_engine = rp.cv.CV( model_factory, pos_strings, neg_strings, fold, passes, loo )
        start_time = time.time()
        cv_engine.run()
        seconds = time.time() - start_time

        print "%5d  " % order,
        print cv_engine.cls1, cv_engine.cls2,
        print "  %2.2f    %2.2f" % ( cv_engine.get_success_rate()*100, seconds )
Esempio n. 2
0
def run( ts_fnames, format, mapping, radix, orders, modname, fold, loo, print_results ):

    # Read integer sequences
    radix = 0
    training_sets = []
    for f in ts_fnames:
        strings = []
        for s in rp.io.get_reader( open( f ), format, mapping ):
            if sum( s != -1 ) >= 50:
                strings.append( s )
                radix = max( radix, max( s ) + 1 )
        training_sets.append( strings )
                
    # Determine radix
    if mapping: 
        radix = mapping.get_out_size()

    # Cross validate for various orders
    for order in orders:
        model_factory = lambda d0: rp.models.prob_train( modname, order, radix, d0 )
        if loo: passes = 1
        else: passes = 5 
        cv_engine = rp.cv.MultiCV( model_factory, training_sets, fold, passes, loo, keep_results=print_results )
        start_time = time.time()
        cv_engine.run()
        seconds = time.time() - start_time

        if print_results:
            for r, c in zip( cv_engine.scores, cv_engine.classes ):
                print "\t".join( map( str, list( r ) + list( c ) ) )
        else:
            print cv_engine.get_success_rate()*100, cv_engine.get_summary(), seconds
Esempio n. 3
0
def calc_merit( pos_strings, neg_strings, mapping ):
    """
    Score `mapping` by cross validating a model on the translated strings.

    Both string sets are translated with `mapping.translate`, a model of
    order `max_order( radix )` is cross validated with the module level
    `fold` and `passes` settings, and the sum `cls1.pos + cls2.neg` of the
    finished engine is returned.
    """
    translated_pos = []
    for s in pos_strings:
        translated_pos.append( mapping.translate( s ) )
    translated_neg = []
    for s in neg_strings:
        translated_neg.append( mapping.translate( s ) )

    # Model parameters follow from the size of the mapped alphabet
    out_size = mapping.get_out_size()
    order = max_order( out_size )

    def build_model( d0, d1 ):
        return model.train( order, out_size, d0, d1 )

    engine = cv.CV( build_model, translated_pos, translated_neg, fold=fold, passes=passes )
    engine.run()

    # Merit is TP + TN
    return engine.cls1.pos + engine.cls2.neg
Esempio n. 4
0
def run( pos_file, neg_file, out_dir, format, align_count, mapping ):
    """
    Greedily collapse `mapping` one symbol pair at a time until it has no
    more than `stop_size` output symbols.

    On every round each pair from `all_pairs( symbol_count )` is collapsed
    and scored with `calc_merit`; the best scoring collapse becomes the new
    mapping.  After each round the new symbol count and merit are appended
    to `<out_dir>/merits.txt` and the mapping table is written to
    `<out_dir>/NNN.mapping`, where NNN is the new symbol count.

    pos_file, neg_file -- inputs handed to `io.get_reader` with `format`
                          (read without a mapping)
    align_count        -- passed to `rp.mapping.DNA.reverse_map` when
                          writing the mapping table
    mapping            -- starting mapping
    """
    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )
    try:
        # Read integer sequences
        pos_strings = list( io.get_reader( pos_file, format, None ) )
        neg_strings = list( io.get_reader( neg_file, format, None ) )

        symbol_count = mapping.get_out_size()

        # Collapse
        while symbol_count > stop_size:

            print >> sys.stderr, "Collapsing from:", symbol_count

            best_mapping = None
            best_merit = 0

            pb = ProgressBar( 0, symbol_count * ( symbol_count - 1 ) / 2, 78 )

            count = 0

            for i, j in all_pairs( symbol_count ):

                collapsed = mapping.collapse( i, j )

                merit = calc_merit( pos_strings, neg_strings, collapsed )

                # Always accept the first candidate so that a round in which
                # no collapse scores above zero still produces a mapping
                if best_mapping is None or merit > best_merit:
                    best_merit = merit
                    best_mapping = collapsed

                count += 1
                pb.update_and_print( count, sys.stderr )

            if best_mapping is None:
                # No pair was available to collapse, nothing more can be done
                print >> sys.stderr, "\nNo candidate pairs left, stopping."
                break

            mapping = best_mapping
            symbol_count -= 1

            # Append merit to merit output
            print >>merit_out, symbol_count, best_merit

            print >>sys.stderr, "\nBest Merit %d." % best_merit,

            # Write best mapping to a file
            mapping_out = open( os.path.join( out_dir, "%03d.mapping" % symbol_count ), 'w' )
            try:
                for i, symbol in enumerate( mapping.get_table() ):
                    print >>mapping_out, str.join( '', rp.mapping.DNA.reverse_map( i, align_count ) ), symbol
            finally:
                mapping_out.close()
    finally:
        merit_out.close()
Esempio n. 5
0
def run( pos_file, neg_file, out_file, format, mapping, radix, order, modname ):
    """
    Train the two-class model named `modname` and save it to `out_file`.

    pos_file, neg_file -- inputs handed to `rp.io.get_reader` together
                          with `format` and `mapping`
    radix              -- alphabet size; when false it is taken from
                          `mapping.get_out_size()` if a mapping is given,
                          otherwise from the largest symbol seen plus one
    order              -- model order passed to `rp.models.train`
    """
    # Load both classes completely into memory
    pos_strings = list( rp.io.get_reader( pos_file, format, mapping ) )
    neg_strings = list( rp.io.get_reader( neg_file, format, mapping ) )

    # Fall back to a derived radix when none was supplied
    if not radix:
        if mapping:
            radix = mapping.get_out_size()
        else:
            largest_pos = [ max( s ) for s in pos_strings ]
            largest_neg = [ max( s ) for s in neg_strings ]
            radix = max( largest_pos + largest_neg ) + 1

    # Train, then serialise the resulting model
    trained = rp.models.train( modname, order, radix, pos_strings, neg_strings )
    trained.to_file( out_file )
Esempio n. 6
0
def run( pos_file, out_file, format, mapping, radix, order, modname ):

    # Read integer sequences
    pos_strings = list( rp.io.get_reader( pos_file, format, mapping ) )

    # Determine radix
    if not radix:
        if mapping: radix = mapping.get_out_size()
        else: radix = max( map( max, pos_strings ) ) + 1
               
    # Build model
    print "about to train"
    model = rp.models.prob_train( modname, order, radix, pos_strings )
    print "trained"

    # Write to out file
    print "about to write"
    model.to_file( out_file )
    print "written"
Esempio n. 7
0
 def calc_merit(self, mapping):
     """
     Score `mapping` by cross validating on the stored training data.

     `self.pos_strings` and `self.neg_strings` are translated with
     `mapping.translate`, paired with `self.pos_genome_seqs` and
     `self.neg_genome_seqs`, and cross validated with the module level
     `fold` and `passes` settings.

     Returns the mean of `cls1.pos` and `cls2.neg`, each divided by the
     number of strings in its class times `passes`, always as a float.
     """
     # Apply mapping to strings
     pos_strings = [mapping.translate(s) for s in self.pos_strings]
     neg_strings = [mapping.translate(s) for s in self.neg_strings]
     # Cross validate using those strings
     radix = mapping.get_out_size()
     # NOTE(review): meaning of the constants 5, 1, 4 is defined by
     # rp.models.complex_periodic.train -- confirm there before changing
     model_factory = lambda d0, d1: rp.models.complex_periodic.train(5, 1, 4, radix, d0, d1)
     cv_engine = rp.cv.CV(
         model_factory,
         zip(self.pos_genome_seqs, pos_strings),
         zip(self.neg_genome_seqs, neg_strings),
         fold=fold,
         passes=passes,
     )
     cv_engine.run()
     # Divide as floats explicitly so the rates are not truncated by
     # Python 2 integer division when the counts are ints
     pos_rate = float(cv_engine.cls1.pos) / (len(pos_strings) * passes)
     neg_rate = float(cv_engine.cls2.neg) / (len(neg_strings) * passes)
     return (pos_rate + neg_rate) / 2
Esempio n. 8
0
def calc_merit( training_sets, mapping, modname, modorder ):
    """
    Score `mapping` by cross validating on `training_sets` after
    translating every string with `mapping.translate`.

    With exactly two training sets a two-class model is trained with
    `rp.models.train` and the result is the mean of `cls1.pos` and
    `cls2.neg`, each divided by the size of its set times `passes`
    (always a float).  With more than two sets a model per set is trained
    with `rp.models.prob_train` and the engine's `get_success_rate()` is
    returned.  `fold` and `passes` are module level settings.

    Raises Exception when fewer than two training sets are given.
    """
    # Apply mapping to strings
    training_sets = [ [ mapping.translate( s ) for s in strings ] for strings in training_sets ]
    # Cross validate using those strings
    radix = mapping.get_out_size()

    if len( training_sets ) == 2:
        pos_strings, neg_strings = training_sets
        model_factory = lambda d0, d1: rp.models.train( modname, modorder, radix, d0, d1 )
        cv_engine = rp.cv.CV( model_factory, pos_strings, neg_strings, fold=fold, passes=passes )
        cv_engine.run()
        # Divide as floats explicitly so the rates are not truncated by
        # Python 2 integer division when the counts are ints
        pos_rate = float( cv_engine.cls1.pos ) / ( len( pos_strings ) * passes )
        neg_rate = float( cv_engine.cls2.neg ) / ( len( neg_strings ) * passes )
        return ( pos_rate + neg_rate ) / 2
    elif len( training_sets ) > 2:
        model_factory = lambda d: rp.models.prob_train( modname, modorder, radix, d )
        cv_engine = rp.cv.MultiCV( model_factory, training_sets, fold=fold, passes=passes )
        cv_engine.run()
        return cv_engine.get_success_rate()
    else:
        raise Exception( "No support for '%d' training sets" % len( training_sets ) )
Esempio n. 9
0
    def run(self):
        """
        Search for a good mapping by repeatedly collapsing and expanding
        `self.starting_mapping`.  Loops forever; it has no exit condition.

        Each step scores a random sample of collapses (only while the
        mapping has more than `stop_size` symbols) and a random sample of
        expansions with `self.calc_merit`, and moves to the best one.  If no
        candidate scores above zero the current mapping is kept.

        Every step appends a line to `<out_dir>/merits.txt`.  Whenever the
        step's merit is at least the best seen so far, the mapping is
        written to `<out_dir>/NNN.mapping`.  After more than 50 steps without
        such an improvement the search restarts from the best mapping, and
        after more than 20 steps since the last improvement or forced
        expansion, five rounds of expansion are forced.

        `stop_size`, `samp_size_collapse` and `samp_size_expand` are
        module level settings.
        """
        mapping = self.starting_mapping

        # Open merit output
        merit_out = open(os.path.join(self.out_dir, "merits.txt"), "w")

        # Read training data and build separate sets for sequence and
        # alignment, as returned by self.read_maf
        print >>sys.stderr, "Loading training data"
        self.pos_genome_seqs, self.pos_strings = self.read_maf(self.pos_fname)
        self.neg_genome_seqs, self.neg_strings = self.read_maf(self.neg_fname)

        # Count how many times each atom appears in the training data
        atom_counts = zeros(self.atom_mapping.get_out_size())
        for string in chain(self.pos_strings, self.neg_strings):
            for val in string:
                atom_counts[val] += 1

        # Valid candidates for expansion must occur more than 10 times in the training data
        can_expand = compress(atom_counts > 10, arange(len(atom_counts)))

        # Handling bad columns (those mapping to -1) in the training data is
        # not obvious, so they are not checked for here

        best_merit_overall = 0
        best_mapping_overall = None
        # Step at which the best merit was seen, and step of the last
        # improvement or restart; both are initialised here so they are
        # defined before their first use
        best_merit_overall_index = 0
        restart_counter = 0
        out_counter = 0

        step_counter = 0

        # Step at which expansions were last forced (or the best improved)
        last_force_counter = 0

        print >>sys.stderr, "Searching"

        # Collapse
        while 1:

            symbol_count = mapping.get_out_size()

            best_merit = 0
            best_mapping = None

            clock = time.clock()
            cv_runs = 0

            # First try a bunch of collapses
            if symbol_count > stop_size:
                pairs = all_pairs(symbol_count)
                if len(pairs) > samp_size_collapse:
                    pairs = random.sample(pairs, samp_size_collapse)
                for i, j in pairs:
                    new_mapping = mapping.collapse(i, j)
                    merit = self.calc_merit(new_mapping)
                    cv_runs += 1
                    if merit > best_merit:
                        best_merit = merit
                        best_mapping = new_mapping

            # Also try a bunch of expansions
            elements = random.sample(can_expand, samp_size_expand)
            for i in elements:
                new_mapping = mapping.expand(i)
                # Skip expansions that did not change the mapping size
                if new_mapping.get_out_size() == symbol_count:
                    continue
                merit = self.calc_merit(new_mapping)
                cv_runs += 1
                if merit > best_merit:
                    best_merit = merit
                    best_mapping = new_mapping

            clock = time.clock() - clock

            # Keep the current mapping when no candidate scored above zero,
            # otherwise `mapping` would become None and the step would crash
            if best_mapping is not None:
                mapping = best_mapping

            # Append merit to merit output
            print >> merit_out, step_counter, symbol_count, best_merit
            merit_out.flush()

            if best_merit >= best_merit_overall:
                best_merit_overall = best_merit
                best_mapping_overall = mapping
                # So we know what step the best mapping was encountered at
                best_merit_overall_index = step_counter
                restart_counter = step_counter
                # Reset the counter we use to force expansions
                last_force_counter = step_counter
                # Write best mapping to a file
                mapping_out = open(os.path.join(self.out_dir, "%03d.mapping" % out_counter), "w")
                try:
                    for i, symbol in enumerate(self.atom_mapping.get_table()):
                        # Apply the 'second' mapping to the atom symbol
                        if symbol >= 0:
                            symbol = mapping[symbol]
                        print >> mapping_out, str.join("", TS_DNA.reverse_map(i, self.align_count)), symbol
                finally:
                    mapping_out.close()
                out_counter += 1

            print >>sys.stderr, "%06d, New best merit: %2.2f%%, size: %d, overall best: %2.2f%% at %06d, cvs per sec: %f" % (
                step_counter,
                best_merit * 100,
                mapping.get_out_size(),
                best_merit_overall * 100,
                best_merit_overall_index,
                cv_runs / clock,
            )

            # If we have gone 50 steps without improving over the best, restart from best
            if step_counter > restart_counter + 50:
                print >>sys.stderr, "Restarting from best mapping"
                print >> merit_out, step_counter, "RESTART"
                mapping = best_mapping_overall
                restart_counter = step_counter
                # Immediately force expansions after restart
                last_force_counter = 0

            if step_counter > last_force_counter + 20:
                last_force_counter = step_counter
                print >>sys.stderr, "Forcing expansions"
                print >> merit_out, step_counter, "FORCED EXPANSIONS"
                for _ in range(5):
                    symbol_count = mapping.get_out_size()
                    best_merit = 0
                    best_mapping = None
                    for atom in random.sample(can_expand, samp_size_expand):
                        new_mapping = mapping.expand(atom)
                        if new_mapping.get_out_size() == symbol_count:
                            continue
                        merit = self.calc_merit(new_mapping)
                        if merit > best_merit:
                            best_merit = merit
                            best_mapping = new_mapping
                    # Only move when some expansion actually scored
                    if best_mapping is not None:
                        mapping = best_mapping
            step_counter += 1
Esempio n. 10
0
def run( ts_fnames, out_dir, format, align_count, atom_mapping, mapping, modname, modorder  ):

    samp_size_collapse = 30
    samp_size_expand = 10
    
    if mpi:
        global pypar, node_id, nodes
        # Startup pypar and get some info about what node we are
        pypar = __import__( 'pypar' )
        nodes = pypar.size() 
        node_id = pypar.rank() 
        print "I am node %d of %d" % ( node_id, nodes )
        # Modify these, they get split over nodes
        samp_size_collapse = samp_size_collapse // nodes
        samp_size_expand = samp_size_expand // nodes

    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )

    # Read integer sequences
    message( "Loading training data" )
    
    training_sets = []
    for fname in ts_fnames:
        strings = []
        skipped = 0
        for s in rp.io.get_reader( open( fname ), format, None ):
            # Apply initial mapping
            s = atom_mapping.translate( s )
            # Ensure required columns
            if sum( s != -1 ) < min_cols:
                skipped += 1
                continue
            # Add to set
            strings.append( s )
        # Informational
        message( "Loaded training data from '%s', found %d usable strings and %d skipped" \
            % ( fname, len( strings ), skipped ) )
        training_sets.append( strings )

    # Count how many times each atom appears in the training data, valid 
    # candiates for expansion must occur more than 10 times in the training 
    # data.
    message( "Finding expandable atoms" )
    atom_counts = zeros( atom_mapping.get_out_size() )
    for string in chain( * training_sets ):
        for val in string:
            atom_counts[ int( val ) ] += 1
    can_expand = compress( atom_counts > 10, arange( len( atom_counts ) ) )

    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )

    best_merit_overall = 0
    best_mapping_overall = None
    index_best_merit_overall = 0
    out_counter = 0

    step_counter = 0
    last_force_counter = 0
    
    message(  "Searching" )

    # Collapse
    while 1:

        clock = time.clock()
        cv_runs = 0

        if mpi:
            # Sync up nodes at start of each pass
            pypar.barrier()

        symbol_count = mapping.get_out_size()

        best_i = None
        best_j = None
        best_merit = 0
        best_mapping = None

        # First try a bunch of collapses
        if symbol_count > stop_size:
            # Select some random pairs from the region owned by this node
            pairs = all_pairs( symbol_count )
            if mpi:
                lo, hi = pypar.balance( len( pairs ), nodes, node_id )
                pairs = pairs[lo:hi]
            if len( pairs ) > samp_size_collapse: 
                pairs = random.sample( pairs, samp_size_collapse )
            # Try collapsing each pair 
            for i, j in pairs:
                new_mapping = mapping.collapse( i, j )
                merit = calc_merit( training_sets, new_mapping, modname, modorder  )
                cv_runs += 1
                if merit > best_merit:
                    best_i, best_j = i, j
                    best_merit = merit
                    best_mapping = new_mapping

        # Also try a bunch of expansions
        if mpi:
            lo, hi = pypar.balance( len( can_expand ), nodes, node_id )
            elements = random.sample( can_expand[lo:hi], samp_size_expand )
        else:
            elements = random.sample( can_expand, samp_size_expand )
        for i in elements:
            new_mapping = mapping.expand( i )
            if new_mapping.get_out_size() == symbol_count: 
                continue
            merit = calc_merit( training_sets, new_mapping, modname, modorder  )
            cv_runs += 1
            if merit > best_merit:
                best_i, best_j = i, None
                best_merit = merit
                best_mapping = new_mapping

        clock = time.clock() - clock

        if mpi:
            best_i, best_j, best_merit, cv_runs = sync_nodes( best_i, best_j, best_merit, cv_runs )  
            # Collapse or expand (if j is None) to get the overall best mapping
            if best_j is None:
                best_mapping = mapping.expand( best_i )
            else:
                best_mapping = mapping.collapse( best_i, best_j )

        mapping = best_mapping

        # Append merit to merit output
        if not mpi or node_id == 0:
            print >>merit_out, step_counter, symbol_count, best_merit
            merit_out.flush()

        if best_merit >= best_merit_overall:
            best_merit_overall = best_merit
            best_mapping_overall = best_mapping
            # So we know what step the best mapping was encountered at
            best_merit_overall_index = step_counter
            restart_counter = step_counter
            # Reset the counter we use to force expansions
            last_force_counter = step_counter
            # Write best mapping to a file
            if not mpi or node_id == 0:
                write_mapping( mapping, os.path.join( out_dir, "%03d.mapping" % out_counter ) )
            out_counter += 1

        message( "%06d, New best merit: %2.2f%%, size: %d, overall best: %2.2f%% at %06d, cvs/sec: %f" \
                  % ( step_counter, best_merit * 100, mapping.get_out_size(), best_merit_overall * 100, best_merit_overall_index, cv_runs/clock  ) )

        # If we have gone 50 steps without improving over the best, restart from best
        if step_counter > restart_counter + 50:
            message( "Restarting from best mapping" )
            if not mpi or node_id == 0:
                print >>merit_out, step_counter, "RESTART"
            mapping = best_mapping_overall
            restart_counter = step_counter
            # Immediately force expansions after restart
            last_force_counter = 0

        if step_counter > last_force_counter + 20:
            last_force_counter = step_counter
            message( "Forcing expansions" )
            if not mpi or node_id == 0:
                print >>merit_out, step_counter, "FORCED EXPANSIONS"
            if mpi:
                lo, hi = pypar.balance( len( can_expand ), nodes, node_id )
                my_can_expand = can_expand[lo:hi]
            else:
                my_can_expand = can_expand
            for i in range( 5 ):
                symbol_count = mapping.get_out_size()
                best_merit = 0
                best_i = None
                best_mapping = None
                for i in random.sample( my_can_expand, samp_size_expand ):
                    new_mapping = mapping.expand( i )
                    if new_mapping.get_out_size() == symbol_count: 
                        continue
                    merit = calc_merit( training_sets, new_mapping, modname, modorder  )
                    if merit > best_merit:
                        best_i = i
                        best_merit = merit
                        best_mapping = new_mapping
                if mpi:
                    best_i, best_j, best_merit, cv_runs = sync_nodes( best_i, None, best_merit, 0 )
                    assert best_j == None
                    best_mapping = mapping.expand( best_i )
                if best_mapping:
                    mapping = best_mapping
                
        step_counter += 1
Esempio n. 11
0
def run( pos_file, neg_file, out_dir, format, align_count, mapping ):

    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )

    # Read integer sequences
    pos_strings = list( rp.io.get_reader( pos_file, format, None ) )
    neg_strings = list( rp.io.get_reader( neg_file, format, None ) )

    symbol_count = mapping.get_out_size()

    # Collapse
    while symbol_count > stop_size:

        # Sync nodes on each pass, may not be required
        pypar.barrier()

        if node_id == 0:
            print "Collapsing from:", symbol_count

        pairs = all_pairs( symbol_count )

        # Decide which subrange of all pairs this node will handle
        lo, hi = pypar.balance( len( pairs ), nodes, node_id )

        # Find best collapsed mapping in interval
        best_i, best_j, best_merit = None, None, 0
        for i, j in pairs[lo:hi]:
            merit = calc_merit( pos_strings, neg_strings, mapping.collapse( i, j )  ) 
            if merit > best_merit:
                best_i, best_j, best_merit = i, j, merit
            
        # Aggregate results
        if node_id != 0:
            # Send best i, j, merit to the master
            pypar.send( ( best_i, best_j, merit ), 0 )
        else:
            # I am the master, get results from all other nodes and determine
            # which had the best merit
            for other_node_id in range( 1, nodes ):
                i, j, merit = pypar.receive( other_node_id )
                if merit > best_merit:
                    best_i, best_j, best_merit = i, j, merit
  
        # Collapse the two symbols that resulted in the best merit   
        mapping = mapping.collapse( best_i, best_j )
        symbol_count -= 1
        
        # Ensure only the master writes files
        if node_id == 0:
        
            # Append merit to merit output        
            print >>merit_out, symbol_count, best_merit
        
            print "\nBest Merit %d." % best_merit,
        
            # Write best mapping to a file
            mapping_out = open( os.path.join( out_dir, "%03d.mapping" % symbol_count ), 'w' )
            for i, symbol in enumerate( mapping.get_table() ): 
                print >>mapping_out, str.join( '', rp.mapping.DNA.reverse_map( i, align_count ) ), symbol
            mapping_out.close()