Exemple #1
0
# After selection: run k-mer counting on the selection directory.
# Each worker thread is handed an equal share of the memory budget.
samples = aaf_kmercount(selection_dir, kl, n, options.nThreads,
                        memSize / options.nThreads)

### Merge output wc files
# Concatenate every per-sample '<sample>.wc' file into '<selection_dir>.wc'.
# The copy is done in Python instead of `cat ... >> ...` through a shell so
# that paths containing spaces or shell metacharacters are handled correctly.
divFile = selection_dir + '.wc'
with open(divFile, 'wb') as merged:
    for sample in samples:
        countfile = sample + '.wc'
        with open(countfile, 'rb') as part:
            # Stream in 1 MiB chunks so the file never has to fit in memory.
            for chunk in iter(lambda: part.read(1 << 20), b''):
                merged.write(chunk)
        # The per-sample file is redundant once its content is merged.
        os.remove(countfile)

### Run kmer_merge
# Write the '#'-prefixed header (k, n and the numbered sample list) first;
# the shell pipeline built below appends its gzip output to the same file.
outFile = selection_dir + '.dat.gz'
handle = smartopen(outFile, 'w')
try:
    handle.write(('#-k {}\n'.format(kl)).encode())
    handle.write(('#-n {}\n'.format(n)).encode())
    for i, sample in enumerate(samples):
        handle.write(('#sample{}: {}\n'.format(i + 1, sample)).encode())
finally:
    # Close the header even if a write fails, so the handle is not leaked.
    handle.close()

# Build the merge pipeline: one quoted '<sample>.pkdat.gz' input per sample,
# then `cut` keeps fields 2, 4, 6, ... (one even-numbered field per sample;
# presumably the per-sample count column - confirm against the merge tool).
inputs = ''.join(" '{}.pkdat.gz'".format(sample) for sample in samples)
cut = [str((i + 1) * 2) for i, _ in enumerate(samples)]
command = ("{} -k s -c -d '0' -a 'T,M,F'".format(filt)
           + inputs
           + ' | cut -f {} | gzip >> {}'.format(','.join(cut), outFile))
print('\n', time.strftime('%c'))
print(command)
Exemple #2
0
#  MA 02110-1301, USA.
#


from AAF import smartopen, present
from optparse import OptionParser
import sys
Usage = "%prog [options] shared_kmer_table"
version = '%prog 20170118.1'
parser = OptionParser(Usage, version = version)
parser.add_option("-n", dest = "filter", type = int, default = 1,
                  help = "k-mer filtering threshold, default = 1")

(options, args) = parser.parse_args()

# Take the table path from the parsed positional arguments rather than
# sys.argv[1]: with "prog -n 2 table", sys.argv[1] is the option "-n".
if not args:
    parser.error('missing shared_kmer_table argument')
table_path = args[0]

kmer_table = smartopen(table_path)
# Output name: input name up to its first '.', plus '_kmerMatrix.tsv'.
outfile = open(table_path.split('.')[0] + '_kmerMatrix.tsv', 'w')
n = options.filter

line1 = kmer_table.readline()
if line1.startswith('#'):  # with header
    outfile.write('kmers')
    for line in kmer_table:
        if line.startswith('#sample'):
            # Header line of the form '#sampleN: name' -> emit the name.
            # NOTE(review): a newline is written after every sample name, so
            # the names do not end up on one header row - confirm intended.
            outfile.write('\t' + line.split(":")[1].strip() + '\n')
        else:
            # NOTE(review): any other '#...' header line also lands here and
            # is written out like a data row - confirm intended.
            fields = line.split()
            outfile.write(fields[0] + '\t')
            # present() is imported from AAF; it is applied to every value
            # column together with the threshold n.
            outfile.write('\t'.join([present(i, n) for i in fields[1:]]) + '\n')
else:
    # No header: the first line is already a data row.
    fields = line1.split()
    outfile.write(fields[0] + '\t')
    outfile.write('\t'.join([present(i, n) for i in fields[1:]]) + '\n')
Exemple #3
0
            # Only reads whose flag marks them as unmapped are recorded.
            if flag[-3] == '1':  # read unmapped
                read_name = line.split()[0]
                # 'both' requires a paired read whose mate is unmapped too.
                if flag[-1] == '1' and flag[-4] == '1':
                    dic[read_name] = 'both'
                # Otherwise (mate mapped, or read not paired) flag[5] picks
                # the mate label. NOTE(review): presumably the first-in-pair
                # bit of a zero-padded bit string - confirm where flag is built.
                elif flag[5] == '1':
                    dic[read_name] = 'R1'
                else:
                    dic[read_name] = 'R2'

# First pass: walk the FASTQ file named by the second argument and route
# each recorded read to the matching gzip output.
out_prefix = sys.argv[1].split('.')[0]
file1 = smartopen(sys.argv[2])
out1 = gzip.open(out_prefix + '_pair1.fq.gz','w')
out3 = gzip.open(out_prefix + '_singleton.fq.gz','w')
for record1 in SeqIO.parse(file1,'fastq'):
    # None when the read was not recorded in dic; such reads are dropped.
    status = dic.get(record1.id)
    if status == 'both':
        SeqIO.write(record1,out1,"fastq")
    elif status == 'R1':
        SeqIO.write(record1,out3,"fastq")
file1.close()
# out3 is left open here; only the pair-1 output is closed at this point.
out1.close()

# Second pass: parse the FASTQ file named by the third argument; the
# '_pair2' output name is built from the first argument up to its first '.'.
file2 = smartopen(sys.argv[3])
out2 = gzip.open(sys.argv[1].split('.')[0] + '_pair2.fq.gz','w')
for record2 in SeqIO.parse(file2,'fastq'):
    # Only reads that were recorded in dic are considered.
    if record2.id in dic:
                pattern[kmer] = line_pattern
    return pattern


Usage = "%prog [options] shared_kmer_table kmer_file"
version = '%prog 20161117.1'
parser = OptionParser(Usage, version = version)
parser.add_option("-n", dest = "filter", type = int, default = 1,
                  help = "k-mer filtering threshold, default = 1")
parser.add_option("-t", dest = "nThreads", type = int, default = 1,
                  help = "number of threads to use, default = 1")
parser.add_option("-G", dest = "memsize", type = float, default = 1,
                  help = "max memory to use (in GB), default = 1")
(options, args) = parser.parse_args()

# Use the parsed positional arguments, not sys.argv[1]/sys.argv[2]: when
# options come first (as the usage string advertises) sys.argv[1] is an
# option such as "-n", not the table path.
if len(args) < 2:
    parser.error('expected arguments: shared_kmer_table kmer_file')
table_path, query_path = args[0], args[1]

kmer_table = smartopen(table_path)
# NOTE(review): 'input' shadows the builtin; the name is kept because code
# outside this excerpt may refer to it.
input = smartopen(query_path)
n = options.filter
nThreads = options.nThreads
memory = options.memsize

# The first line is discarded; the second line decides the input type.
line = input.readline()
line = input.readline()
# Output files go to the current directory, named after the query file's
# base name up to its first '.'.
out_stem = os.path.basename(query_path).split('.')[0]
if line.startswith(tuple('ATCG')):
    # Line begins with a nucleotide letter: k-mer input, pattern output.
    Type = 'kmer'
    output = open(out_stem + '.pattern', 'w')
elif line.startswith(tuple('01')):
    # Line begins with 0/1: pattern input, k-mer output.
    Type = 'pattern'
    # NOTE(review): this prints the '.pattern' name although the file opened
    # below is '.kmer' - looks like leftover debugging output; confirm.
    print(out_stem + '.pattern')
    output = open(out_stem + '.kmer', 'w')
else:
Exemple #5
0
            # Record a label only for reads flagged as unmapped.
            if flag[-3] == '1':  #read unmapped
                # Paired read whose mate is unmapped as well.
                both_unmapped = flag[-1] == '1' and flag[-4] == '1'
                if both_unmapped:
                    label = 'both'
                else:
                    # Mate mapped, or read not paired: flag[5] selects the
                    # mate label. NOTE(review): presumably the first-in-pair
                    # bit of a zero-padded bit string - confirm where flag
                    # is built.
                    label = 'R1' if flag[5] == '1' else 'R2'
                dic[line.split()[0]] = label

# First pass over the FASTQ file given as the second argument: reads labelled
# 'both' go to the pair-1 output, reads labelled 'R1' to the singleton output.
name_stem = sys.argv[1].split('.')[0]
file1 = smartopen(sys.argv[2])
out1 = gzip.open(name_stem + '_pair1.fq.gz', 'w')
out3 = gzip.open(name_stem + '_singleton.fq.gz', 'w')
for record1 in SeqIO.parse(file1, 'fastq'):
    # Reads that were never recorded in dic yield None and are skipped.
    label = dic.get(record1.id)
    if label == 'both':
        SeqIO.write(record1, out1, "fastq")
    elif label == 'R1':
        SeqIO.write(record1, out3, "fastq")
file1.close()
# The singleton output (out3) stays open past this point.
out1.close()

# Second pass: parse the FASTQ file named by the third argument; the
# '_pair2' output name is built from the first argument up to its first '.'.
file2 = smartopen(sys.argv[3])
out2 = gzip.open(sys.argv[1].split('.')[0] + '_pair2.fq.gz', 'w')
for record2 in SeqIO.parse(file2, 'fastq'):
    # Only reads that were recorded in dic are considered.
    if record2.id in dic:
Exemple #6
0
Usage = "%prog [options] <data directory> <sequence format, fasta or fastq>"
version = '%prog 20161212.1'
parser = OptionParser(Usage, version=version)
(options, args) = parser.parse_args()

# Fail with a usage message instead of an IndexError when arguments are
# missing.
if len(args) < 2:
    parser.error('expected arguments: <data directory> <sequence format>')
data_path = args[0]

# A directory means "every file inside it"; anything else is one input file.
if os.path.isdir(data_path):
    file_list_1 = os.listdir(data_path)
    file_list = [os.path.join(data_path, x) for x in file_list_1]
else:
    file_list = [data_path]
seq_form = args[1]
dic = {}
# Running totals accumulated over ALL input files.
c = 0
g = 0
n = 0
total = 0
length = []
for seq_file in file_list:
    # Names ending in '~' are skipped.
    if seq_file.endswith("~"):
        continue
    fh = smartopen(seq_file)
    try:
        for seq_record in SeqIO.parse(fh, seq_form):
            seq = seq_record.seq
            # NOTE(review): only upper-case 'C', 'G' and 'N' are counted;
            # lower-case bases are not - confirm this is intended.
            c += seq.count('C')
            g += seq.count('G')
            n += seq.count('N')
            total += len(seq)
            length.append(len(seq))
    finally:
        # Release the handle even if parsing fails part-way through.
        fh.close()
    # Progress line: the counts shown are cumulative, not per file.
    print(seq_file, c, g, n, total)

# Report 'nan' rather than crashing with ZeroDivisionError when no sequence
# was read or every base is N.
non_n = total - n
print('GC:', float((c + g)) / non_n if non_n else float('nan'))
print('mean:', total / len(length) if length else float('nan'))
print('total:', total)
        fitch = './fitch_kmerX_long'
    else:
        fitch = './fitch_kmerX'
    # Stop here if the selected './'-prefixed binary fails the is_exe() check.
    if not is_exe(fitch):
        print(fitch+' not found. Make sure it is in your PATH or the')
        print('current directory, and that it is executable')
        sys.exit()
else:
    # This branch uses the bare program name (no './' prefix); the '_long'
    # variant is chosen when options.long is set.
    if options.long:
        fitch = 'fitch_kmerX_long'
    else:
        fitch = 'fitch_kmerX'

# Check input files: both must be openable before any work starts.
try:
    kmerTable = smartopen(options.iptf)
except IOError:
    print('Cannot open file', options.iptf)
    # Non-zero status so calling scripts can detect the failure.
    sys.exit(1)

try:
    singleton = open(options.countfs)
except IOError:
    # Report the path that was actually opened (options.countfs); the
    # message previously read a different attribute, options.countf.
    print('Cannot open file', options.countfs)
    sys.exit(1)

### Read header and get sample list
samples = []              # species list
# The k-mer length is the second whitespace-separated token of the first
# line of the k-mer table.
line = kmerTable.readline()
ll = line.split()
kl = int(ll[1])       # kmer length
Exemple #8
0
Usage = "%prog [options] <data directory> <sequence format, fasta or fastq>"
version = '%prog 20161212.1'
parser = OptionParser(Usage, version = version)
(options, args) = parser.parse_args()

# Fail with a usage message instead of an IndexError when arguments are
# missing.
if len(args) < 2:
    parser.error('expected arguments: <data directory> <sequence format>')
data_path = args[0]

# A directory means "every file inside it"; anything else is one input file.
if os.path.isdir(data_path):
    file_list_1 = os.listdir(data_path)
    file_list = [os.path.join(data_path, x) for x in file_list_1]
else:
    file_list = [data_path]
seq_form = args[1]
dic = {}
# Running totals accumulated over ALL input files.
c = 0
g = 0
n = 0
total = 0
length = []
for seq_file in file_list:
    # Names ending in '~' are skipped.
    if seq_file.endswith("~"):
        continue
    fh = smartopen(seq_file)
    try:
        for seq_record in SeqIO.parse(fh, seq_form):
            seq = seq_record.seq
            # NOTE(review): only upper-case 'C', 'G' and 'N' are counted;
            # lower-case bases are not - confirm this is intended.
            c += seq.count('C')
            g += seq.count('G')
            n += seq.count('N')
            total += len(seq)
            length.append(len(seq))
    finally:
        # Release the handle even if parsing fails part-way through.
        fh.close()
    # Progress line: the counts shown are cumulative, not per file.
    print(seq_file, c, g, n, total)

# Report 'nan' rather than crashing with ZeroDivisionError when no sequence
# was read or every base is N.
non_n = total - n
print('GC:', float((c + g)) / non_n if non_n else float('nan'))
print('mean:', total / len(length) if length else float('nan'))
print('total:', total)
                  type=int,
                  default=1,
                  help="k-mer filtering threshold, default = 1")
parser.add_option("-t",
                  dest="nThreads",
                  type=int,
                  default=1,
                  help="number of threads to use, default = 1")
parser.add_option("-G",
                  dest="memsize",
                  type=float,
                  default=1,
                  help="max memory to use (in GB), default = 1")
(options, args) = parser.parse_args()

# Use the parsed positional arguments, not sys.argv[1]/sys.argv[2]: when
# options come first, sys.argv[1] is an option such as "-t", not the
# k-mer table path.
if len(args) < 2:
    parser.error('expected two positional arguments: '
                 'the shared k-mer table and the k-mer/pattern file')
table_path, query_path = args[0], args[1]

kmer_table = smartopen(table_path)
# NOTE(review): 'input' shadows the builtin; the name is kept because code
# outside this excerpt may refer to it.
input = smartopen(query_path)
n = options.filter
nThreads = options.nThreads
memory = options.memsize

# The first line is discarded; the second line decides the input type.
line = input.readline()
line = input.readline()
# Output files go to the current directory, named after the query file's
# base name up to its first '.'.
out_stem = os.path.basename(query_path).split('.')[0]
if line.startswith(tuple('ATCG')):
    # Line begins with a nucleotide letter: k-mer input, pattern output.
    Type = 'kmer'
    output = open(out_stem + '.pattern', 'w')
elif line.startswith(tuple('01')):
    # Line begins with 0/1: pattern input, k-mer output.
    Type = 'pattern'
    # NOTE(review): this prints the '.pattern' name although the file opened
    # below is '.kmer' - looks like leftover debugging output; confirm.
    print(out_stem + '.pattern')
    output = open(out_stem + '.kmer', 'w')
Exemple #10
0
# After selection: run k-mer counting on the selection directory.
# Each worker thread is handed an equal share of the memory budget.
samples = aaf_kmercount(selection_dir, kl, n, options.nThreads,
                        memSize / options.nThreads)

### Merge output wc files
# Concatenate every per-sample '<sample>.wc' file into '<selection_dir>.wc'.
# The copy is done in Python instead of `cat ... >> ...` through a shell so
# that paths containing spaces or shell metacharacters are handled correctly.
divFile = selection_dir + '.wc'
with open(divFile, 'wb') as merged:
    for sample in samples:
        countfile = sample + '.wc'
        with open(countfile, 'rb') as part:
            # Stream in 1 MiB chunks so the file never has to fit in memory.
            for chunk in iter(lambda: part.read(1 << 20), b''):
                merged.write(chunk)
        # The per-sample file is redundant once its content is merged.
        os.remove(countfile)

### Run kmer_merge
# Write the '#'-prefixed header (k, n and the numbered sample list) first,
# encoded as latin-1; the shell pipeline built below appends its gzip
# output to the same file.
outFile = selection_dir + '.dat.gz'
handle = smartopen(outFile, 'w')
try:
    handle.write(('#-k {}\n'.format(kl)).encode('latin-1'))
    handle.write(('#-n {}\n'.format(n)).encode('latin-1'))
    for i, sample in enumerate(samples):
        handle.write(
            ('#sample{}: {}\n'.format(i + 1, sample)).encode('latin-1'))
finally:
    # Close the header even if a write fails, so the handle is not leaked.
    handle.close()

# Build the merge pipeline: one quoted '<sample>.pkdat.gz' input per sample,
# then `cut` keeps fields 2, 4, 6, ... (one even-numbered field per sample;
# presumably the per-sample count column - confirm against the merge tool).
inputs = ''.join(" '{}.pkdat.gz'".format(sample) for sample in samples)
cut = [str((i + 1) * 2) for i, _ in enumerate(samples)]
command = ("{} -k s -c -d '0' -a 'T,M,F'".format(filt)
           + inputs
           + ' | cut -f {} | gzip >> {}'.format(','.join(cut), outFile))
print('\n', time.strftime('%c'))
print(command)