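# Removes every read that belongs to one of the given samples from a FASTA
# file. The remaining reads are written to FASTA_FILE-SAMPLES-REMOVED.fa.
#
# Usage:
#
#   ./me FASTA_FILE sample_1,sample_2,[...],sample_N
#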

import sys

import Oligotyping.lib.fastalib as u
from Oligotyping.utils.utils import pretty_print as pp

def sample_name_from_id(read_id):
    """Return the sample name embedded in a read id.

    The sample name is everything before the last underscore of `read_id`
    (e.g. 'site_A_000123' -> 'site_A').  An id that contains no underscore
    yields an empty string.
    """
    return read_id.rpartition('_')[0]


def parse_sample_list(text):
    """Split a comma-separated string of sample names into a list.

    Whitespace around each name is stripped.  Empty entries (produced by a
    trailing or doubled comma) are dropped: an empty name would otherwise
    match every read whose id has no underscore and silently remove it.
    """
    stripped = (part.strip() for part in text.split(','))
    return [name for name in stripped if name]


def remove_samples(fasta_path, sample_names):
    """Copy `fasta_path` to a new FASTA file, skipping reads of `sample_names`.

    The new file is named `fasta_path` + '-SAMPLES-REMOVED.fa'.  Progress is
    reported on stderr every 1000 reads.  Returns the path of the new file.
    """
    output_path = fasta_path + '-SAMPLES-REMOVED.fa'
    # Built once so the per-read membership test does not scan a list.
    unwanted = frozenset(sample_names)

    fasta = u.SequenceSource(fasta_path)
    try:
        output = u.FastaOutput(output_path)
        try:
            while fasta.next():
                if fasta.pos % 1000 == 0:
                    sys.stderr.write('\rreads processed so far: %s' % (pp(fasta.pos)))
                    sys.stderr.flush()

                if sample_name_from_id(fasta.id) in unwanted:
                    continue

                output.store(fasta, split=False)
        finally:
            output.close()
    finally:
        fasta.close()

    return output_path


if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write('usage: %s FASTA_FILE sample_1,sample_2,[...],sample_N\n' % sys.argv[0])
        sys.exit(1)

    new_fasta_path = remove_samples(sys.argv[1], parse_sample_list(sys.argv[2]))
    # Leading '\r' overwrites the in-place progress line.
    sys.stderr.write('\rNew FASTA file .............: %s\n' % new_fasta_path)
# Example #2
# 0
import sys
import operator

import Oligotyping.lib.fastalib as u
from Oligotyping.utils.utils import pretty_print as pp

def sample_name_from_id(read_id):
    """Return the sample name embedded in a read id.

    The sample name is everything before the last underscore of `read_id`
    (e.g. 'site_A_000123' -> 'site_A').  An id that contains no underscore
    yields an empty string.
    """
    return read_id.rpartition('_')[0]


def count_reads_per_sample(read_ids):
    """Return a dict mapping each sample name to its number of reads.

    `read_ids` is any iterable of read id strings; the sample of each id is
    determined with `sample_name_from_id`.
    """
    counts = {}
    for read_id in read_ids:
        name = sample_name_from_id(read_id)
        counts[name] = counts.get(name, 0) + 1
    return counts


def iter_read_ids(fasta):
    """Yield the id of every entry in `fasta`, reporting progress on stderr.

    `fasta` is advanced with its `next()` method until that returns a false
    value; a progress line is rewritten in place every 1000 entries.
    """
    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\rreads processed so far: %d' % (fasta.pos))
            sys.stderr.flush()
        yield fasta.id


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s FASTA_FILE\n' % sys.argv[0])
        sys.exit(1)

    fasta = u.SequenceSource(sys.argv[1])
    try:
        samples = count_reads_per_sample(iter_read_ids(fasta))
        total_reads = fasta.pos
    finally:
        fasta.close()

    # Leading '\r' overwrites the in-place progress line.
    sys.stderr.write('\rSamples and read counts found in the FASTA file:\n')

    # sys.stdout.write is used instead of the print statement so this block
    # parses on both Python 2 and Python 3; the emitted text is unchanged.
    emit = sys.stdout.write
    by_count_desc = sorted(samples.items(), key=operator.itemgetter(1), reverse=True)
    for sample, read_count in by_count_desc:
        emit('%-30s %s\n' % (sample, pp(read_count)))

    emit('\n\n')
    # Two spaces after the colon reproduce the original `print 'label: ', value`.
    emit('Total number of samples:  %s\n' % pp(len(samples)))
    emit('Total number of reads:  %s\n' % pp(total_reads))
    emit('\n')