Exemple #1
0
def main(argv):
    """Collect the listed reads of one sequence set and write them to a file.

    Positional arguments, taken from ``argv``:
        argv[0]: path of a list file. Each non-blank line has the form
                 ``<read id>\\t<comma-separated source keys>``; the source
                 keys index the dictionary built below ('U', 'R1', 'R2').
        argv[1]: directory containing the three FASTQ files of the set.
                 The set name is the second '_'-separated field of this
                 path string.
        argv[2]: output prefix; output goes to ``<prefix>.fa``.

    Every listed read found in its source file is registered with a
    DuplicateReadFinder under the label ``<id>_<source><suffix>``, where the
    suffix is 'p' when the line lists exactly two sources, 'u' when it lists
    another number of sources and the first one starts with 'R', and empty
    otherwise. The finder then writes its sequences to the output file.

    Raises:
        IndexError: if ``argv`` has fewer than three items or ``argv[1]``
            contains no underscore.
        ValueError: if a non-blank list line does not split into exactly
            two tab-separated fields.
        KeyError: if a source key is not one of 'U', 'R1', 'R2'.
    """
    fn_list = argv[0]
    seqdir = argv[1]

    # The output file is opened first (as before), but inside a context
    # manager so the handle is released even when a later step raises.
    with open(argv[2] + '.fa', 'wb') as seqfile:
        setname = seqdir.split('_')[1]

        sys.stderr.write('%s> Reading sets...\n' % get_timestamp())
        seq_d = {}
        for key, pattern in (('U', '%s_unpaired_all.fastq'),
                             ('R1', '%s_R1_paired.fastq'),
                             ('R2', '%s_R2_paired.fastq')):
            seq_d[key] = read_seqs(os.path.join(seqdir, pattern % setname))

        sys.stderr.write('%s> Checking for duplicates...\n' % get_timestamp())
        drf = DuplicateReadFinder()
        with open(fn_list) as listfile:
            for line in listfile:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                id_, sources = line.split('\t')
                sources = sources.split(',')

                # The suffix depends only on the source list, so compute it
                # once per line rather than once per source.
                if len(sources) == 2:
                    kk = 'p'
                elif sources[0].startswith('R'):
                    kk = 'u'
                else:
                    kk = ''

                for k in sources:
                    # Reads missing from their source file are skipped.
                    if id_ in seq_d[k]:
                        drf.add_sequence(id_ + '_' + k + kk, seq_d[k][id_])

        sys.stderr.write('%s> Writing sequences...\n' % get_timestamp())
        # The modifier replaces every 'U' with 'T' in whatever text the
        # finder passes to it (presumably RNA -> DNA alphabet; confirm
        # against DuplicateReadFinder.write_sequences).
        drf.write_sequences(seqfile,
                            modify_output=lambda x: x.replace('U', 'T'))

    sys.stderr.write('%s> Done.\n' % get_timestamp())
Exemple #2
0
def main(argv):
    """Stream the listed reads of one sequence set into a duplicate finder.

    Positional arguments, taken from ``argv``:
        argv[0]: path of a list file. Each non-blank line has the form
                 ``<read id>\\t<comma-separated source keys>``.
        argv[1]: directory containing the three FASTQ files of the set.
                 The set name is the second '_'-separated field of this
                 path string.
        argv[2]: output prefix; output goes to ``<prefix>.fa``.

    The list file is turned into a ``wanted`` mapping from read id to an
    output label ``<id>_<source><suffix>``. The suffix is 'p' when the line
    lists exactly two sources, 'u' when it lists another number of sources
    and the first one starts with 'R', and empty otherwise. Each FASTQ file
    is then handed to ``read_seqs`` together with the DuplicateReadFinder
    and the mapping, and finally the finder writes its sequences to the
    output file.

    Raises:
        IndexError: if ``argv`` has fewer than three items or ``argv[1]``
            contains no underscore.
        ValueError: if a non-blank list line does not split into exactly
            two tab-separated fields.
    """
    fn_list = argv[0]
    seqdir = argv[1]

    # The output file is opened first (as before), but inside a context
    # manager so the handle is released even when a later step raises.
    with open(argv[2] + '.fa', 'wb') as seqfile:
        drf = DuplicateReadFinder()
        wanted = dict()

        with open(fn_list) as listfile:
            for line in listfile:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                id_, sources = line.split('\t')
                sources = sources.split(',')

                if len(sources) == 2:
                    kk = 'p'
                elif sources[0].startswith('R'):
                    kk = 'u'
                else:
                    kk = ''

                # NOTE(review): only one label is kept per read id, built
                # from the LAST listed source. An id with two sources thus
                # gets a single label; confirm that read_seqs expects this
                # rather than one label per (id, source) pair.
                wanted[id_] = '%s_%s%s' % (id_, sources[-1], kk)

        setname = seqdir.split('_')[1]
        sys.stderr.write(
            '%s> Reading sequences and checking for duplicates...\n'
            % get_timestamp())

        for fn in ['%s_unpaired_all.fastq',
                   '%s_R1_paired.fastq',
                   '%s_R2_paired.fastq']:
            read_seqs(os.path.join(seqdir, fn % setname), drf, wanted)

        sys.stderr.write('%s> Writing sequences...\n' % get_timestamp())
        # The modifier replaces every 'U' with 'T' in whatever text the
        # finder passes to it (presumably RNA -> DNA alphabet; confirm
        # against DuplicateReadFinder.write_sequences).
        drf.write_sequences(seqfile,
                            modify_output=lambda x: x.replace('U', 'T'))

    sys.stderr.write('%s> Done.\n' % get_timestamp())