Beispiel #1
0
def _get_timestamp(action):
    """ Return the timestamp pickled with a previous, equal run of action.

        The state file is named after action.ident() and lives in the .state
        subdirectory of the current directory (created here if missing).
        None is returned when there is no state file, when the stored action
        does not compare equal to action, when the stored action carries no
        timestamp, or when anything goes wrong while reading the state.
    """
    try:
        state_dir = '.state'
        if not os.path.exists(state_dir):
            os.mkdir(state_dir)

        state_path = os.path.join(state_dir, grace.filesystem_friendly_name(action.ident()))
        if not os.path.exists(state_path):
            return None

        with open(state_path,'rb') as state_file:
            previous = pickle.load(state_file)

        if action == previous:
            return previous.timestamp if hasattr(previous, 'timestamp') else None

    except Exception as error:
        # Best effort: report the problem and fall through to "no timestamp".
        import traceback
        traceback.print_exc()
        print >> sys.stderr, 'Error making %s, re-running: %s' % (action.ident(), error)

    return None
def _get_timestamp(action):
    """ Return the timestamp pickled with a previous, equal run of action.

        Nothing is looked up (None is returned) when the action's shell name
        matches LOCAL.do_selection. Otherwise the first existing file out of
        action.state_filename() and the old .state/<ident> location decides
        the result: its timestamp is returned only if the stored action
        compares equal, has a timestamp, and any stored timestamp_for names
        the file it was loaded from. Errors are reported and give None.
    """
    if selection.matches(LOCAL.do_selection, [action.shell_name()]):
        return None

    try:
        candidates = [
            action.state_filename(),
            os.path.join('.state', grace.filesystem_friendly_name(action.ident())), #Old location of state files
        ]
        for filename in candidates:
            if not os.path.exists(filename):
                continue

            with open(filename,'rb') as f:
                old = pickle.load(f)

            stale = (
                action != old
                or not hasattr(old, 'timestamp')
                or (hasattr(old, 'timestamp_for') and old.timestamp_for != filename)
            )
            return None if stale else old.timestamp

    except Exception as error:
        # Best effort: report the problem and fall through to "no timestamp".
        import traceback
        traceback.print_exc()
        print >> sys.stderr, 'Error making %s, re-running: %s' % (action.ident(), error)
def get_graph(path, name, suffix):
    """ Read a .userplot file into a numpy array of floats.

        The file is <path>/<filesystem friendly name>-<suffix>.userplot and
        is parsed as one float per line (surrounding whitespace stripped).
    """
    filename = os.path.join(path, grace.filesystem_friendly_name(name) + '-' + suffix + '.userplot')

    # Read under "with" so the handle is closed even if a line fails to parse.
    with open(filename,'rb') as f:
        return numpy.array([ float(item.strip()) for item in f ])
Beispiel #4
0
    def target(self, path, dep, *commands):
        """ Append a make-style rule that runs commands and touches a state file.

            path is a directory or prefix or directory/prefix
            dep is an iterable of dependency names (joined with spaces)
            commands are the commands to execute to produce path

            The state file name is built from path plus the SHA-1 of the
            stripped commands; it is added to self.all and returned.
        """
        prefix = join('state', grace.filesystem_friendly_name(path) + '_')

        digest = hashlib.sha1('\n'.join(item.strip() for item in commands)).hexdigest()
        state_name = prefix + digest
        self.all.append(state_name)

        rule = [
            '',
            '%s : %s' % (state_name, ' '.join(dep)),
            # Forty "?" match the 40 hex characters of any SHA-1 digest.
            '\t@rm -f %s%s' % (prefix, '?'*40),
        ]
        for command in commands:
            rule.append('\t%s' % make_quote(self.submit.replace('%',command)))
        rule.append('\t@touch %s' % state_name)
        self.lines.extend(rule)

        return state_name
Beispiel #5
0
def get_graph(path, name, suffix):
    """ Read a .userplot file into a numpy array of floats.

        The file is <path>/<filesystem friendly name>-<suffix>.userplot and
        is parsed as one float per line (surrounding whitespace stripped).
    """
    filename = os.path.join(
        path,
        grace.filesystem_friendly_name(name) + '-' + suffix + '.userplot')

    # The context manager closes the handle even when a line fails to parse.
    with open(filename, 'rb') as f:
        result = [float(item.strip()) for item in f]
    return numpy.array(result)
Beispiel #6
0
def _run_and_save_state(action, timestamp):
    """ Run action, then pickle it (stamped with timestamp) into .state.

        The state file is named after action.ident(). Any existing state
        file is removed before the action runs. When LOCAL.do_nothing is set
        the action is not run and the result is None. LOCAL.time is raised
        to timestamp if it is older. Returns whatever action.run() returned.
    """
    def state_path(prefix):
        return os.path.join('.state', prefix + grace.filesystem_friendly_name(action.ident()))

    filename = state_path('')
    temp_filename = state_path('temp-')

    # Remove the old state before running, so a failed run leaves none behind.
    if os.path.exists(filename):
        os.unlink(filename)

    result = None if LOCAL.do_nothing else action.run()

    LOCAL.time = max(LOCAL.time, timestamp)
    action.timestamp = timestamp

    # Pickle to a temporary file first and rename it into place afterwards.
    with open(temp_filename,'wb') as f:
        pickle.dump(action, f)
    os.rename(temp_filename, filename)

    return result
 def genbank_callback(name, record):
     """ Make a copy of any genbank files passed in.

         The record is written to reference_genbank_file and also to its own
         <filesystem friendly name>.gbk file under self, then any_genbank[0]
         is set to True.
     """
     from Bio import SeqIO
     
     SeqIO.write([record], reference_genbank_file, 'genbank')
     
     # "with" closes the per-record file even if SeqIO.write raises.
     with open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb') as f:
         SeqIO.write([record], f, 'genbank')
     
     any_genbank[0] = True
Beispiel #8
0
 def genbank_callback(name, record):
     """ Make a copy of any genbank files passed in.

         Writes record to reference_genbank_file, writes it again to a
         separate <filesystem friendly name>.gbk file under self, and
         records that a genbank record was seen by setting any_genbank[0].
     """
     from Bio import SeqIO
     
     SeqIO.write([record], reference_genbank_file, 'genbank')
     
     # Context manager guarantees the copy is closed if writing fails.
     with open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb') as f:
         SeqIO.write([record], f, 'genbank')
     
     any_genbank[0] = True
Beispiel #9
0
def evidence_reader(working_dir, name):
    """ Generate Call objects from <name>-evidence.txt in working_dir.

        The header line must contain exactly 7 tabs, otherwise grace.Error
        is raised (on first iteration, since this is a generator). Every
        following line yields two calls built from its tab separated fields:
        Call(fields[4], fields[1], fields[6]) and then
        Call(fields[5], fields[2], fields[7]).
    """
    filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt')
    
    # "with" closes the file when the header check raises or the consumer
    # abandons the generator early, not only after the last line is read.
    with open(filename,'rb') as f:
        header = f.readline()
        if header.count('\t') != 7:
            raise grace.Error('Old style evidence file. Please re-run nesoni consensus.')
        
        for line in f:
            fields = line.rstrip('\n').split('\t')
            yield Call(fields[4], fields[1], fields[6])
            yield Call(fields[5], fields[2], fields[7])
Beispiel #10
0
def evidence_reader(working_dir, name):
    """ Generate Call objects from <name>-evidence.txt in working_dir.

        The header line must contain exactly 7 tabs, otherwise grace.Error
        is raised (on first iteration, since this is a generator). Every
        following line yields two calls built from its tab separated fields:
        Call(fields[4], fields[1], fields[6]) and then
        Call(fields[5], fields[2], fields[7]).
    """
    filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + "-evidence.txt")

    # The context manager releases the file on the header error and when the
    # generator is closed before exhaustion, as well as on normal completion.
    with open(filename, "rb") as f:
        header = f.readline()
        if header.count("\t") != 7:
            raise grace.Error("Old style evidence file. Please re-run nesoni consensus.")

        for line in f:
            fields = line.rstrip("\n").split("\t")
            yield Call(fields[4], fields[1], fields[6])
            yield Call(fields[5], fields[2], fields[7])
Beispiel #11
0
def debias(args):
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2)

    dirs = args

    for dir_name in dirs:
        for name, seq in io.read_sequences(
                os.path.join(dir_name, 'reference.fa')):
            for suffix, ambig_suffix in [
                ('-depth', '-ambiguous-depth'),
                ('-pairspan-depth', '-ambiguous-pairspan-depth'),
            ]:
                root = grace.filesystem_friendly_name(name)
                full_name = os.path.join(dir_name, root + suffix + '.userplot')
                full_ambig_name = os.path.join(
                    dir_name, root + ambig_suffix + '.userplot')
                if not os.path.exists(full_name): continue
                if not os.path.exists(full_ambig_name): continue

                output_suffix = '-%d.userplot' % radius

                print dir_name, root, output_suffix

                depths = numpy.array(read_unstranded_userplot(full_name))
                ambig_depths = numpy.array(
                    read_unstranded_userplot(full_ambig_name))
                expect = expected_depth(root, seq, depths, ambig_depths,
                                        radius)

                write_unstranded_userplot(
                    os.path.join(dir_name,
                                 root + suffix + '-expected' + output_suffix),
                    expect)

                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name,
                                 root + suffix + '-corrected' + output_suffix),
                    corrected)

                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(
                        dir_name,
                        root + ambig_suffix + '-corrected' + output_suffix),
                    ambig_corrected)
Beispiel #12
0
 def run(self):
     """ Split each table in self.files into .userplot files.

         For every chromosome met in a table, one file per column from the
         fifth heading onward is opened in the self.output_dir workspace,
         named <table name>-<chromosome>-<heading>.userplot, and the column
         values are written one per line. Rows are asserted to cover
         consecutive single positions (Start == pos, End == pos + 1) starting
         from 0 for each chromosome.
     """
     working = io.Workspace(self.output_dir, must_exist=False)
 
     def close_all(handles):
         # Nothing is open before the first chromosome (handles is None).
         if handles:
             for handle in handles:
                 handle.close()
 
     for filename in self.files:
         reader = io.Table_reader(filename)
         
         name = os.path.splitext(os.path.basename(filename))[0]
         
         rname = None
         files = None
         for record in reader:
             chromosome = record['Chromosome']
             if chromosome != rname:
                 # New chromosome: finish the previous files, open a new set.
                 close_all(files)
                 rname = chromosome
                 grace.status('Convert '+name+' '+rname)
                 files = [
                     open(working / (
                         name + 
                         '-' + grace.filesystem_friendly_name(rname) + 
                         '-' + grace.filesystem_friendly_name(heading) + '.userplot'
                     ), 'wb')
                     for heading in reader.headings[4:]
                 ]
                 pos = 0
             assert int(record['Start']) == pos and int(record['End']) == pos + 1
             
             for val, f in zip(record.values()[4:], files):
                 print >> f, val
             
             pos += 1
         
         close_all(files)
         grace.status('')
Beispiel #13
0
def debias(args):
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2) 

    dirs = args
    
    for dir_name in dirs:
        for name, seq in io.read_sequences(os.path.join(dir_name,'reference.fa')):
            for suffix, ambig_suffix in [
                ('-depth', '-ambiguous-depth'),
                ('-pairspan-depth', '-ambiguous-pairspan-depth'),
            ]:
                root = grace.filesystem_friendly_name(name)
                full_name = os.path.join(dir_name, root + suffix + '.userplot')
                full_ambig_name = os.path.join(dir_name, root + ambig_suffix + '.userplot')
                if not os.path.exists(full_name): continue
                if not os.path.exists(full_ambig_name): continue
                
                output_suffix = '-%d.userplot' % radius 

                print dir_name, root, output_suffix
                
                depths = numpy.array( read_unstranded_userplot(full_name) )
                ambig_depths = numpy.array( read_unstranded_userplot(full_ambig_name) )
                expect = expected_depth(root, seq, depths, ambig_depths, radius)
                
                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-expected' + output_suffix),
                    expect) 
                
                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-corrected' + output_suffix),
                    corrected)                 
                
                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + ambig_suffix + '-corrected' + output_suffix),
                    ambig_corrected)                 
Beispiel #14
0
def _get_timestamp(action):
    """ Return the timestamp recorded for an earlier, equal run of action.

        Returns None without looking when the action's shell name matches
        LOCAL.do_selection. Otherwise action.state_filename() and then the
        old .state/<ident> location are tried; the first file that exists
        decides the result. Its timestamp is returned only when the stored
        action compares equal to action, has a timestamp, and any stored
        timestamp_for equals the file name it was read from. Any error is
        printed and results in None.
    """
    if selection.matches(LOCAL.do_selection, [action.shell_name()]):
        return None

    try:
        current_filename = action.state_filename()
        #Old location of state files
        legacy_filename = os.path.join(
            '.state', grace.filesystem_friendly_name(action.ident()))

        for filename in (current_filename, legacy_filename):
            if not os.path.exists(filename):
                continue

            with open(filename, 'rb') as state_file:
                old = pickle.load(state_file)

            if action != old:
                return None

            if not hasattr(old, 'timestamp'):
                return None

            # A state recorded for a different file does not count.
            if hasattr(old, 'timestamp_for') and old.timestamp_for != filename:
                return None

            return old.timestamp

    except Exception as error:
        # Best effort: report the problem and fall through to "no timestamp".
        import traceback
        traceback.print_exc()
        print >> sys.stderr, 'Error making %s, re-running: %s' % (
            action.ident(), error)
Beispiel #15
0
def main(args):
    """ Write the core or unique parts of a reference as FASTA files.

        args holds the options --mincov (int, default 1), --maxdiff (int,
        default 16), --minsize (int, default 200) and --what (default
        'core'), followed by the output directory and one or more working
        directories. With fewer than two positional arguments HELP is
        printed to stderr and grace.Help_shown is raised.

        For each sequence in the first working directory's reference.fa a
        position is "good" when, in every working directory, its depth is
        at least mincov (core) or its ambiguous depth is below mincov
        (unique). Short gaps between good positions are then closed, and
        four files are written to the output directory: the sequence with
        non-good bases as N, the sequence with non-good bases in lower case,
        and the good and non-good parts of at least minsize bases.
    """
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique,
                                        'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    # Refuse an output directory that already holds a reference.fa
    # (presumably a working directory passed by mistake -- TODO confirm).
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(
            path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        # good[i] stays True only while every working directory accepts
        # position i.
        good = [True] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        # Close holes: a run of at most maxdiff non-good positions lying
        # between two good positions is marked good. start is the index just
        # after the most recent good position; its initial value keeps the
        # leading non-good run from being filled.
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        # Whole sequence with non-good bases replaced by N.
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)),
                 'wb')
        io.write_fasta(
            f, name,
            ''.join([(seq[i] if good[i] else 'N') for i in xrange(len(seq))]))
        f.close()

        # Whole sequence with non-good bases lower-cased.
        f = open(
            path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)),
            'wb')
        io.write_fasta(
            f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                              for i in xrange(len(seq))]))
        f.close()

        f_good = open(
            path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)),
            'wb')
        f_nongood = open(
            path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)),
            'wb')
        start = 0
        # One-element lists so that emit() can update the counters in place.
        n_good = [0]
        n_good_bases = [0]

        def emit(i):
            # Write seq[start:i] as one part, unless it is shorter than
            # minsize. start is read from the enclosing scope at call time,
            # so it must still hold the beginning of the run being emitted.
            if i - start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(f_good if good[start] else f_nongood,
                           '%s:%d..%d' % (name, start + 1, i), seq[start:i])

        # Emit a part each time the good flag changes, then the final run.
        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(
            sum(good)), 'bases are ' + what + ', of', grace.pretty_number(
                len(seq)), 'in reference sequence'
        print grace.pretty_number(
            n_good[0]), 'parts at least', grace.pretty_number(
                minsize), 'bases long with', grace.pretty_number(
                    n_good_bases[0]), 'total bases'

        print
Beispiel #16
0
def main(args):
    """ Write the core or unique parts of a reference as FASTA files.

        args holds the options --mincov (int, default 1), --maxdiff (int,
        default 16), --minsize (int, default 200) and --what (default
        'core'), followed by the output directory and one or more working
        directories. With fewer than two positional arguments HELP is
        printed to stderr and grace.Help_shown is raised.

        For each sequence in the first working directory's reference.fa a
        position is "good" when, in every working directory, its depth is
        at least mincov (core) or its ambiguous depth is below mincov
        (unique). Short gaps between good positions are then closed, and
        four files are written to the output directory: the sequence with
        non-good bases as N, the sequence with non-good bases in lower case,
        and the good and non-good parts of at least minsize bases.
    """
    mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
    is_core = (what == 'core') 

    grace.expect_no_further_options(args)
    
    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()
    
    output_dir, working_dirs = args[0], args[1:]
    
    # Refuse an output directory that already holds a reference.fa
    # (presumably a working directory passed by mistake -- TODO confirm).
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'
    
    if not path.exists(output_dir):
        os.mkdir(output_dir)
    
    for name, seq in io.read_sequences(path.join(working_dirs[0],'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)
        
        # good[i] stays True only while every working directory accepts position i.
        good = [ True ] * len(seq)
        
        for working_dir in working_dirs:
            if is_core:
               suffix = '-depth.userplot'
            else:
               suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
               if good[i]:
                   if is_core:
                       good[i] = data[i] >= mincov
                   else:
                       good[i] = data[i] < mincov

        # Close holes: a run of at most maxdiff non-good positions lying
        # between two good positions is marked good. start is the index just
        # after the most recent good position; its initial value keeps the
        # leading non-good run from being filled.
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                 if 0 < i-start <= maxdiff:
                     for j in xrange(start,i): good[j] = True
                     n_holes += 1
                 start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'
        
        
        # Whole sequence with non-good bases replaced by N.
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N')
                      for i in xrange(len(seq)) ])
        )
        f.close()

        # Whole sequence with non-good bases lower-cased.
        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower())
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')
        start = 0
        # One-element lists so that emit() can update the counters in place.
        n_good = [0]
        n_good_bases = [0]    
        def emit(i):
            # Write seq[start:i] as one part, unless it is shorter than
            # minsize. start is read from the enclosing scope at call time,
            # so it must still hold the beginning of the run being emitted.
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1,i),
                seq[start:i]
            )
        # Emit a part each time the good flag changes, then the final run.
        for i in xrange(1,len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()
        
        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'

        print
Beispiel #17
0
def main(args):
    title1, args = grace.get_option_value(args, '--title1', str, None)
    title2, args = grace.get_option_value(args, '--title2', str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [
        name for name, sequence in io.read_sequences(
            os.path.join(working_dir1, 'reference.fa'))
    ]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    n = 1
    while significance([('A', n)], [('T', n)], 1.0) > cutoff:
        n += 1

    print '%g\tsignificance cutoff' % cutoff
    print '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

    print 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (
        title1, title2, title1, title2)

    for sequence_name in sequence_names:
        filename1 = os.path.join(
            working_dir1,
            grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')
        filename2 = os.path.join(
            working_dir2,
            grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')

        for (pos1, ins1, sub1, ref1, conins1,
             consub1), (pos2, ins2, sub2, ref2, conins2,
                        consub2) in itertools.izip(read_file(filename1),
                                                   read_file(filename2)):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status('Testing %s %d' % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                sig = significance(io.decode_evidence(ins1),
                                   io.decode_evidence(ins2), cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status('')
                    print '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (
                        sequence_name, pos1, 'insertion-before', ins1, ins2,
                        sig, conins1, conins2)

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                        what = 'deletion'
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = 'substitution'
                    else:
                        what = 'different mix'
                    grace.status('')
                    print '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (
                        sequence_name, pos1, what, ref1, sub1, sub2, sig,
                        consub1, consub2)

    grace.status('')
    return 0
Beispiel #18
0
    def run(self):
        """ Write the core or unique parts of a reference as FASTA files.

            Settings are read from self.what ('core' or 'unique'),
            self.mincov, self.maxdiff, self.minsize and self.working_dirs;
            output goes to the workspace returned by self.get_workspace()
            and progress is reported through self.log.

            For each sequence of the first working directory's reference a
            position is "good" when, in every working directory, its depth
            is at least mincov (core) or its ambiguous depth is below mincov
            (unique). Short gaps between good positions are then closed, and
            four files are written: the sequence with non-good bases as N,
            the sequence with non-good bases in lower case, and the good and
            non-good parts of at least minsize bases.
        """
        # Former command-line handling, kept for reference:
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
        #is_core = (what == 'core') 
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core') 
        
        workspace = self.get_workspace()
        
        for name, seq in io.read_sequences(working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)
            
            # good[i] stays True only while every working directory accepts position i.
            good = [ True ] * len(seq)
            
            for working_dir in self.working_dirs:
                if is_core:
                   suffix = '-depth.userplot'
                else:
                   suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name+suffix)
                )
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                   if good[i]:
                       if is_core:
                           good[i] = data[i] >= self.mincov
                       else:
                           good[i] = data[i] < self.mincov
    
            # Close holes: a run of at most maxdiff non-good positions lying
            # between two good positions is marked good. start is the index
            # just after the most recent good position; its initial value
            # keeps the leading non-good run from being filled.
            start = -self.maxdiff-1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                     if 0 < i-start <= self.maxdiff:
                         for j in xrange(start,i): good[j] = True
                         n_holes += 1
                     start = i+1
            self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n')
            
            
            # Whole sequence with non-good bases replaced by N.
            f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else 'N')
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            # Whole sequence with non-good bases lower-cased.
            f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else seq[i].lower())
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb')
            f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb')
            start = 0
            # One-element lists so that emit() can update the counters in place.
            n_good = [0]
            n_good_bases = [0]    
            def emit(i):
                # Write seq[start:i] as one part, unless it is shorter than
                # minsize. start is read from the enclosing scope at call
                # time, so it must still hold the beginning of the run.
                if i-start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i-start
                io.write_fasta(
                    f_good if good[start] else f_nongood,
                    '%s:%d..%d' % (name, start+1,i),
                    seq[start:i]
                )
            # Emit a part each time the good flag changes, then the final run.
            for i in xrange(1,len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()
            
            self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n')
            self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n')
            self.log.log('\n')
Beispiel #19
0
    def run(self):
        """ Write positions where two working directories differ to a file.

            Reads self.working_dir1, self.working_dir2, self.cutoff,
            self.title1 / self.title2 (None falls back to the working
            directory's name) and writes to self.prefix + '.txt'.

            For every sequence of the first working directory's reference
            the two <name>-evidence.txt files are read in step. Insertion
            and substitution evidence present in both files is tested with
            significance(), and a tab separated row is written (and flushed)
            for each result that is not None and at most the cutoff.
            Returns 0.
        """
        title1 = self.title1
        title2 = self.title2

        working1 = working_directory.Working(self.working_dir1)
        working2 = working_directory.Working(self.working_dir2)

        cutoff = self.cutoff

        sequence_names = [
            name for name, length in working1.get_reference().get_lengths()
        ]

        if title1 is None:
            title1 = working1.name
        if title2 is None:
            title2 = working2.name

        # Smallest n at which pure 'A' versus pure 'T' evidence of depth n is
        # no longer above the cutoff; reported below as the depth required.
        n = 1
        while significance([('A', n)], [('T', n)], 1.0) > cutoff:
            n += 1

        # "with" closes the report even if reading or testing raises.
        with open(self.prefix + '.txt', 'wb') as f:
            print >> f, '%g\tsignificance cutoff' % cutoff
            print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

            print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (
                title1, title2, title1, title2)

            for sequence_name in sequence_names:
                filename1 = working1 / (
                    grace.filesystem_friendly_name(sequence_name) +
                    '-evidence.txt')
                filename2 = working2 / (
                    grace.filesystem_friendly_name(sequence_name) +
                    '-evidence.txt')

                for (pos1, ins1, sub1, ref1, conins1,
                     consub1), (pos2, ins2, sub2, ref2, conins2,
                                consub2) in itertools.izip(
                                    read_file(filename1),
                                    read_file(filename2)):
                    assert pos1 == pos2 and ref1 == ref2

                    if pos1 % 1000 == 0:
                        grace.status('Testing %s %d' % (sequence_name, pos1))

                    dec_ins1 = io.decode_evidence(ins1)
                    dec_ins2 = io.decode_evidence(ins2)
                    if dec_ins1 and dec_ins2:
                        # Reuse the decoded evidence instead of decoding it
                        # again, as the substitution branch already does.
                        sig = significance(dec_ins1, dec_ins2, cutoff)
                        if sig is not None and sig <= cutoff:
                            print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (
                                sequence_name, pos1, 'insertion-before', ins1,
                                ins2, sig, conins1, conins2)
                            f.flush()

                    dec_sub1 = io.decode_evidence(sub1)
                    dec_sub2 = io.decode_evidence(sub2)
                    if dec_sub1 and dec_sub2:
                        sig = significance(dec_sub1, dec_sub2, cutoff)
                        if sig is not None and sig <= cutoff:
                            # Classify by the first entry of each decoded
                            # evidence.
                            if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                                what = 'deletion'
                            elif dec_sub1[0][0] != dec_sub2[0][0]:
                                what = 'substitution'
                            else:
                                what = 'different mix'
                            print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (
                                sequence_name, pos1, what, ref1, sub1, sub2,
                                sig, consub1, consub2)
                            f.flush()

        grace.status('')
        return 0
Beispiel #20
0
    def run(self):
        """ Write positions where two working directories differ to a file.

            Reads self.working_dir1, self.working_dir2, self.cutoff,
            self.title1 / self.title2 (None falls back to the working
            directory's name) and writes to self.prefix + '.txt'.

            For every sequence of the first working directory's reference
            the two <name>-evidence.txt files are read in step. Insertion
            and substitution evidence present in both files is tested with
            significance(), and a tab separated row is written (and flushed)
            for each result that is not None and at most the cutoff.
            Returns 0.
        """
        title1 = self.title1
        title2 = self.title2
        
        working1 = working_directory.Working(self.working_dir1)
        working2 = working_directory.Working(self.working_dir2)
        
        cutoff = self.cutoff
        
        sequence_names = [ name 
                           for name, length 
                           in working1.get_reference().get_lengths() ]
        
        if title1 is None:
            title1 = working1.name
        if title2 is None:
            title2 = working2.name
            
        # Smallest n at which pure 'A' versus pure 'T' evidence of depth n is
        # no longer above the cutoff; reported below as the depth required.
        n = 1
        while significance([('A',n)],[('T',n)],1.0) > cutoff:
            n += 1

        # The context manager closes the report even if an error interrupts the comparison.
        with open(self.prefix + '.txt','wb') as f:
            print >> f, '%g\tsignificance cutoff' % cutoff
            print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n
                
            print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (title1, title2, title1, title2)
        
            for sequence_name in sequence_names:
                filename1 = working1/(grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')
                filename2 = working2/(grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')
            
                for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
                    assert pos1 == pos2 and ref1 == ref2
                
                    if pos1 % 1000 == 0:
                        grace.status('Testing %s %d' % (sequence_name, pos1))
                
                    dec_ins1 = io.decode_evidence(ins1)
                    dec_ins2 = io.decode_evidence(ins2)
                    if dec_ins1 and dec_ins2:
                        # Reuse the decoded evidence instead of decoding it again,
                        # as the substitution branch below already does.
                        sig = significance(dec_ins1, dec_ins2, cutoff)
                        if sig is not None and sig <= cutoff:
                            print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, 'insertion-before', ins1, ins2, sig, conins1, conins2)
                            f.flush()
                
                    dec_sub1 = io.decode_evidence(sub1)
                    dec_sub2 = io.decode_evidence(sub2)
                    if dec_sub1 and dec_sub2:
                        sig = significance(dec_sub1, dec_sub2, cutoff)
                        if sig is not None and sig <= cutoff:
                            # Classify by the first entry of each decoded evidence.
                            if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                                what = 'deletion'
                            elif dec_sub1[0][0] != dec_sub2[0][0]:
                                what = 'substitution'
                            else:
                                what = 'different mix'
                            print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2)
                            f.flush()
        
        grace.status('')
        return 0
Beispiel #21
0
def old_main(args):
    use_indels, args = grace.get_option_value(args,'--indels',int,1)
    use_reference, args = grace.get_option_value(args,'--reference',int,1)
    make_list, args = grace.get_option_value(args,'--list',int,0)
    fasta_output, args = grace.get_option_value(args,'--fasta',int,0)
    grace.expect_no_further_options(args)
    
    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1
        
    if fasta_output and use_indels:
        print >> sys.stderr, 'Indels will not be included in FASTA output'
        use_indels = 0
    
    working_dirs = args
    
    #reference_data = { } # (ref_name, position, change_type) -> string
    #strain_data = { } # working_dir -> (ref_name, position, change_type) -> string
    
    names = ['reference'] + working_dirs
    
    substitution_calls = { } # ref_name -> [ [ call ] ]
    insertion_calls = { } # ref_name -> [ [ call ] ]
    substitution_evidence = { }
    insertion_evidence = { }
    
    for name, sequence in io.read_sequences(os.path.join(working_dirs[0], 'reference.fa')):
        substitution_calls[name] = [ list(sequence.upper()) ]
        insertion_calls[name] = [ [ '-' ] * len(sequence) ]
        substitution_evidence[name] = [ [ '' ] * len(sequence) ]    
        insertion_evidence[name] = [ [ '' ] * len(sequence) ]    
    
    for working_dir in working_dirs:
        for name in substitution_calls:
            filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt')
            f = open(filename,'rb')
            
            this_substitution_calls = [ ]
            this_insertion_calls = [ ]
            this_substitution_evidence = [ ]
            this_insertion_evidence = [ ]
            
            header = f.readline()
            if header.count('\t') != 5:
                print >> sys.stderr, 'Old style evidence file. Please re-run nesoni consensus.'
                return 1
            
            for line in f:
                fields = line.rstrip('\n').split('\t')
                this_substitution_calls.append(fields[5])
                this_insertion_calls.append(fields[4])
                this_substitution_evidence.append(fields[2])
                this_insertion_evidence.append(fields[1])
            
            substitution_calls[name].append(this_substitution_calls)
            insertion_calls[name].append(this_insertion_calls)
            substitution_evidence[name].append(this_substitution_evidence)
            insertion_evidence[name].append(this_insertion_evidence)
    
    if not use_reference:
        names.pop(0)
        for name in substitution_calls:
            substitution_calls[name].pop(0)
            insertion_calls[name].pop(0)
            substitution_evidence[name].pop(0)
            insertion_evidence[name].pop(0)

    interesting = find_interesting('substitution', substitution_calls, substitution_evidence)
    if use_indels:
        interesting.extend( find_interesting('insertion-before', insertion_calls, insertion_evidence) )

    if not use_indels:
        interesting = [ item for item in interesting if '-' not in item[3] ]
    
    interesting.sort()


    if fasta_output:
        do_fasta_output(names, interesting)
        return 0 

    
    #strain_reference_having_consensus = { } # working_dir -> ref_name -> string
    #
    #for working_dir in working_dirs:
    #    assert working_dir not in strain_data, 'Working directory given twice'
    #    strain_data[working_dir] = { }
    #    
    #    report_file = open(os.path.join(working_dir, 'report.txt'), 'rU')
    #    report_file.readline()
    #    for line in report_file:
    #        ref_name, position, change_type, old, new, evidence = \
    #            line.rstrip('\n').split('\t')
    #        
    #        if change_type == 'deletion':
    #            change_type = 'substitution'
    #        
    #        if not use_indels and \
    #           (change_type == 'insertion-before' or new == '-'):
    #            continue
    #        
    #        key = (ref_name, int(position), change_type)
    #        if key in reference_data:
    #            assert reference_data[key] == old
    #        else:
    #            reference_data[key] = old
    #        
    #        strain_data[working_dir][key] = new
    #    report_file.close()
    #    
    #    strain_reference_having_consensus[working_dir] = { }
    #    ref_have_con_filename = os.path.join(working_dir, 'reference_having_consensus.fa')
    #    for name, sequence in io.read_fasta(ref_have_con_filename):
    #        strain_reference_having_consensus[working_dir][name] = sequence
    #
    #keys = sorted(reference_data)
    #
    ##Fill in any blanks
    #for working_dir in working_dirs:
    #    for key in keys:
    #        if key in strain_data[working_dir]: continue
    #    
    #        # - Positions in report files start from 1 not 0
    #        # - Insertions must be bracketed
    #        lacks_consensus = (
    #            strain_reference_having_consensus[working_dir][key[0]][key[1]-1] == 'N' or
    #            (key[2] == 'insertion-before' and key[1] > 1 and
    #             strain_reference_having_consensus[working_dir][key[0]][key[1]-2] == 'N')
    #        )
    #        
    #        #If there's no consensus, record it as ambiguous
    #        if lacks_consensus:
    #            strain_data[working_dir][key] = 'N'                
    #        else:
    #            strain_data[working_dir][key] = reference_data[key]

 
    #all_data_names = ([ 'reference' ] if use_reference else []) + working_dirs
    #all_data = ([ reference_data ] if use_reference else []) + \
    #           [ strain_data[working_dir] for working_dir in working_dirs ] 
    

    #all_data_names = ([ 'reference' ] if use_reference else []) + working_dirs
    
    

    
    
    ones = ( 1 << len(names) )-1
    
    total_differences = 0
    
    if make_list:
        print '\t'.join(['Partition','Sequence','Position in reference','Change type'] + names + names) 
    
    for i in xrange(1,(1<<len(names))-1,2):
        set1 = [ ]
        set2 = [ ]
        for j in xrange(len(names)):
            if i & (1<<j):
                set1.append(j)
            else:
                set2.append(j)

        if make_list:
            print
            print ', '.join( names[i] for i in set1 ) + '   vs   ' + \
                  ', '.join( names[i] for i in set2 )
            print
                
        n = 0
        for refname, position, change_type, values, has_ambiguous, evidence in interesting: 
            #Skip if *any* ambiguity
            if has_ambiguous:
                continue
            
            if any( values[i] != values[set1[0]] for i in set1[1:] ) or \
               any( values[i] != values[set2[0]] for i in set2[1:] ):
                continue
            
            if make_list:
                if change_type == 'substitution' and '-' in values: change_type = 'deletion'
                print '\t%s\t%d\t%s\t' % (refname,position,change_type) + '\t'.join(values) + '\t' + '\t'.join(evidence) 
            
            n += 1

        total_differences += n

        if not make_list:
            print ', '.join( names[i] for i in set1 ) + '   vs   ' + \
                  ', '.join( names[i] for i in set2 ) + \
                  ': %d differences' %n            

    if not make_list:
        print
        print 'Total: %d' % total_differences


    if make_list:
        print
        print 'Ignored'
        print
    
    n_multiway = 0
    n_ambiguous = 0    
    for refname, position, change_type, values, has_ambiguous, evidence in interesting: 
        confusing = False
        if has_ambiguous:
            n_ambiguous += 1
            confusing = True
        elif len(set(values)) > 2:
            n_multiway += 1
            confusing = True
        
        if make_list and confusing:
            print '\t%s\t%d\t%s\t' % (refname,position,change_type) + '\t'.join(values) + '\t' + '\t'.join(evidence) 

    if not make_list:
        print
        print 'Ambiguities ignored: %d' % n_ambiguous
        print 'Multi-way changes ignored: %d' % n_multiway
    
    assert total_differences + n_ambiguous + n_multiway == len(interesting)
    
    return 0
Beispiel #22
0
def main(args):
    title1, args = grace.get_option_value(args, "--title1", str, None)
    title2, args = grace.get_option_value(args, "--title2", str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [name for name, sequence in io.read_sequences(os.path.join(working_dir1, "reference.fa"))]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    n = 1
    while significance([("A", n)], [("T", n)], 1.0) > cutoff:
        n += 1

    print "%g\tsignificance cutoff" % cutoff
    print "%d\tdepth required to call substitution (greater if there are errors in the reads)" % n

    print "Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s" % (
        title1,
        title2,
        title1,
        title2,
    )

    for sequence_name in sequence_names:
        filename1 = os.path.join(working_dir1, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")
        filename2 = os.path.join(working_dir2, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")

        for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(
            read_file(filename1), read_file(filename2)
        ):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status("Testing %s %d" % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                sig = significance(io.decode_evidence(ins1), io.decode_evidence(ins2), cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status("")
                    print "%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name,
                        pos1,
                        "insertion-before",
                        ins1,
                        ins2,
                        sig,
                        conins1,
                        conins2,
                    )

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    if dec_sub1[0][0] == "-" or dec_sub2[0][0] == "-":
                        what = "deletion"
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = "substitution"
                    else:
                        what = "different mix"
                    grace.status("")
                    print "%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name,
                        pos1,
                        what,
                        ref1,
                        sub1,
                        sub2,
                        sig,
                        consub1,
                        consub2,
                    )

    grace.status("")
    return 0
Beispiel #23
0
    def run(self):
        """ Find the regions of each reference sequence that are "core" or
            "unique" across self.working_dirs, and write them as FASTA
            files into the workspace.

            A position is good if, in every working directory, its depth
            is at least self.mincov (what == 'core', using the
            "-depth.userplot" files) or below self.mincov (what ==
            'unique', using the "-ambiguous-depth.userplot" files).
            Runs of up to self.maxdiff non-good positions lying between
            two good positions are then also marked good.

            For each reference sequence four files are written:
              <name>-<what>.fa            non-good positions replaced by N
              <name>-<what>_masked.fa     non-good positions in lower case
              <name>-<what>_parts.fa      each good run as its own record
              <name>-non<what>_parts.fa   each non-good run as its own record
            Runs shorter than self.minsize are left out of the parts files.

            Reference sequences are read via the first working directory.
        """
        assert self.what in (
            'core',
            'unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core')

        # The depth plot to consult is the same for every sequence and directory
        if is_core:
            suffix = '-depth.userplot'
        else:
            suffix = '-ambiguous-depth.userplot'

        workspace = self.get_workspace()

        for name, seq in io.read_sequences(
                working_directory.Working(self.working_dirs[0]).get_reference(
                ).reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)

            good = [True] * len(seq)

            for working_dir in self.working_dirs:
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name + suffix))
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                    # Once a position has failed in one directory it stays failed
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov

            #Close holes
            # "start" is the position just after the most recent good position.
            # Its initial value makes the first gap look too long to fill, so
            # non-good positions at the very start of the sequence are kept.
            start = -self.maxdiff - 1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                    if 0 < i - start <= self.maxdiff:
                        for j in xrange(start, i):
                            good[j] = True
                        n_holes += 1
                    start = i + 1
            self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

            # "with" closes each output file even if writing fails
            with open(workspace / ('%s-%s.fa' % (friendly_name, self.what)),
                      'wb') as f:
                io.write_fasta(
                    f, name, ''.join([(seq[i] if good[i] else 'N')
                                      for i in xrange(len(seq))]))

            with open(
                    workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)),
                    'wb') as f:
                io.write_fasta(
                    f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                                      for i in xrange(len(seq))]))

            # Split the sequence into maximal runs of equal good/non-good
            # status, as half-open (begin, end) intervals.
            runs = []
            start = 0
            for i in xrange(1, len(seq)):
                if good[i] != good[start]:
                    runs.append((start, i))
                    start = i
            runs.append((start, len(seq)))

            n_good = 0
            n_good_bases = 0
            with open(
                    workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)),
                    'wb') as f_good:
                with open(
                        workspace / ('%s-non%s_parts.fa' %
                                     (friendly_name, self.what)),
                        'wb') as f_nongood:
                    for begin, end in runs:
                        # Runs that are too short appear in neither parts file
                        if end - begin < self.minsize:
                            continue
                        if good[begin]:
                            n_good += 1
                            n_good_bases += end - begin
                            f_out = f_good
                        else:
                            f_out = f_nongood
                        # Record names carry 1-based, inclusive coordinates
                        io.write_fasta(f_out,
                                       '%s:%d..%d' % (name, begin + 1, end),
                                       seq[begin:end])

            self.log.log(
                grace.pretty_number(sum(good)) + ' bases are ' + self.what +
                ', of ' + grace.pretty_number(len(seq)) +
                ' in reference sequence\n')
            self.log.log(
                grace.pretty_number(n_good) + ' parts at least ' +
                grace.pretty_number(self.minsize) + ' bases long with ' +
                grace.pretty_number(n_good_bases) + ' total bases\n')
            self.log.log('\n')