def extract_dbotu_otu_seqs(membershipfn, fasta_derep, otu_seqs_fn):
    """
    Write the representative sequence of every dbOTU to a fasta file.

    Records are emitted in the order they appear in fasta_derep, and each
    written ID is prefixed with 'dbotu'.

    Parameters
    ----------
    membershipfn           membership filename, made by dbotu.call_otus
                           has OTU representative seq IDs in first column, and
                           OTU member seq IDs in rest of row
    fasta_derep            fasta dereplicated; sequence IDs should match those
                           in membershipfn
    otu_seqs_fn            output fasta file to write representative seqs to
    """

    ## Parse membership file
    # A set makes the per-record membership test below constant time. The
    # newline is stripped first so that a row with a single column (no tab)
    # still yields a clean ID.
    with open(membershipfn, 'r') as f:
        otu_reps = {line.rstrip('\n').split('\t')[0] for line in f}

    ## Grab OTU representative seqs in membership file from dereplicated fasta
    ## and write to output
    with open(otu_seqs_fn, 'w') as out:
        for record in util.iter_fst(fasta_derep):
            # Drop the first character of the header to get the bare ID
            sid = record[0][1:]
            seq = record[1]
            if sid in otu_reps:
                out.write('>dbotu' + sid + '\n' + seq + '\n')
    return None
def extract_dbotu_otu_seqs(membershipfn, fasta_derep, otu_seqs_fn):
    """
    Write the representative sequence of every dbOTU to a fasta file.

    Records are emitted in the order they appear in fasta_derep, and each
    written ID is prefixed with 'dbotu'.

    Parameters
    ----------
    membershipfn           membership filename, made by dbotu.call_otus
                           has OTU representative seq IDs in first column, and
                           OTU member seq IDs in rest of row
    fasta_derep            fasta dereplicated; sequence IDs should match those
                           in membershipfn
    otu_seqs_fn            output fasta file to write representative seqs to
    """

    ## Parse membership file
    # A set makes the per-record membership test below constant time. The
    # newline is stripped first so that a row with a single column (no tab)
    # still yields a clean ID.
    with open(membershipfn, 'r') as f:
        otu_reps = {line.rstrip('\n').split('\t')[0] for line in f}

    ## Grab OTU representative seqs in membership file from dereplicated fasta
    ## and write to output
    with open(otu_seqs_fn, 'w') as out:
        for record in util.iter_fst(fasta_derep):
            # Drop the first character of the header to get the bare ID
            sid = record[0][1:]
            seq = record[1]
            if sid in otu_reps:
                out.write('>dbotu' + sid + '\n' + seq + '\n')
    return None
Example #3
0
	def load_db(self):
		"""Load an existing SeqDB fasta (if any) into self.db and self.size.

		Every header is expected to look like '>OTU;size=N'. OTU is
		converted to int and used as the key of both self.db (-> sequence)
		and self.size (-> int(N)). Nothing is loaded when self.fn does not
		exist. Returns self.
		"""
		# Load existing SeqDB (if exists)
		if os.path.exists(self.fn):
			for tag, seq in util.iter_fst(self.fn):
				# Raw string: '\d' in a plain literal is an invalid escape sequence
				otu, size = re.search(r'>(.*);size=(\d+)', tag).groups()
				otu = int(otu)
				self.db[otu] = seq
				self.size[otu] = int(size)
		return self
Example #4
0
def parse_files(f, q):
    """
    Parse fasta and quality files, f and q

    Returns
    -------
    sids       list of sequence IDs (header minus its first character)
    seqs       list of sequences, in the same order as sids
    quals      list of quality records, each split on single spaces

    Raises
    ------
    ValueError    if the two files do not hold the same number of records
    """
    sids = []
    seqs = []

    for sid, seq in util.iter_fst(f):
        sids.append(sid[1:])
        seqs.append(seq)

    quals = [qual.split(' ') for _, qual in util.iter_fst(q)]

    # sids and seqs are filled in lockstep, so only the quality count can
    # disagree. The former chained test `a != b != c` also required
    # len(sids) != len(seqs) and therefore could never trigger.
    if len(seqs) != len(quals):
        raise ValueError('fasta and quality files are not the same length!')

    return sids, seqs, quals
Example #5
0
def dist(fasta):
    """
    Save log10 of the numeric headers of the leading records of a fasta file.

    Each header (minus its first character) is parsed as a float. The
    resulting values are written to 'test100000.out' with np.savetxt.

    Parameters
    ----------
    fasta      input fasta file whose headers are numeric
    """
    # Indentation is spaces only; the former tab/space mix is a TabError on
    # Python 3.
    data = []
    for i, record in enumerate(util.iter_fst(fasta)):
        sid, seq = record[:2]
        logfreq = np.log10(float(sid[1:]))
        data.append(logfreq)
        # NOTE(review): the check runs after the append, so up to 100002
        # values are kept -- confirm whether exactly 100000 was intended.
        if i > 100000:
            break
    data = np.array(data)
    np.savetxt('test100000.out', data, delimiter='\t')
Example #6
0
def dist(fasta):
    """
    Dump log10 of the numeric fasta headers to 'test100000.out'.

    The header of each record, without its first character, is read as a
    float. Reading stops once the record index exceeds 100000 (that record
    is still included).
    """
    log_freqs = []
    numbered_records = enumerate(util.iter_fst(fasta))
    for index, entry in numbered_records:
        header, _ = entry[:2]
        log_freqs.append(np.log10(float(header[1:])))
        if index > 100000:
            break
    np.savetxt('test100000.out', np.array(log_freqs), delimiter='\t')
Example #7
0
def parse_index_file(index_fn, format='fasta'):
    """
    Map sequence IDs to their barcodes.

    Parameters
    ----------
    index_fn     index file name
    format       'fasta': every record from util.iter_fst is stored as
                          {first field: second field}, unmodified
                 'tab':   every line holds an ID and a barcode separated
                          by whitespace
                 any other value yields an empty map

    Returns
    -------
    dict mapping sequence ID -> barcode
    """
    # Map FASTQ sequences to their barcodes
    s2b = {}  # maps sequences to barcodes
    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            s2b[s] = b
    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # with-block closes the handle, which the bare open() never did
        with open(index_fn) as index_file:
            for line in index_file:
                [s, b] = line.rstrip().split()
                s2b[s] = b
    return s2b
def parse_index_file(index_fn, format='fasta'):
    """
    Map sequence IDs to their barcodes.

    Parameters
    ----------
    index_fn     index file name
    format       'fasta': every record from util.iter_fst is stored as
                          {first field: second field}, unmodified
                 'tab':   every line holds an ID and a barcode separated
                          by whitespace
                 any other value yields an empty map

    Returns
    -------
    dict mapping sequence ID -> barcode
    """
    # Map FASTQ sequences to their barcodes
    s2b = {}  # maps sequences to barcodes
    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            s2b[s] = b
    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # with-block closes the handle, which the bare open() never did
        with open(index_fn) as index_file:
            for line in index_file:
                [s, b] = line.rstrip().split()
                s2b[s] = b
    return s2b
Example #9
0
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Replace the numeric header of every fasta record by a frequency.

    The header (minus its first character) is read as a count C and
    converted to C / ((l - k + 1) * n); the sequence is written unchanged.

    Parameters
    ----------
    fasta      input fasta file; headers must parse as floats once the
               first character is removed
    k, l, n    numbers (or numeric strings) used in the formula above
    outfile    path of the file to write
    """
    # The with-block flushes and closes the output; the bare open() it
    # replaces left the handle open.
    with open(outfile, 'w') as out:
        # The denominator is the same for every record: compute it once
        denominator = (float(l) - float(k) + 1) * float(n)
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            freq = float(sid[1:]) / denominator
            # NOTE(review): the header is written with '<' rather than the
            # usual fasta '>' -- left unchanged, confirm it is intended.
            out.write('<' + str(freq) + '\n' + seq + '\n')
Example #10
0
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Replace the numeric header of every fasta record by a frequency.

    The header (minus its first character) is read as a count C and
    converted to C / ((l - k + 1) * n); the sequence is written unchanged.

    Parameters
    ----------
    fasta      input fasta file; headers must parse as floats once the
               first character is removed
    k, l, n    numbers (or numeric strings) used in the formula above
    outfile    path of the file to write
    """
    # Indentation is spaces only (the former tab/space mix is a TabError on
    # Python 3), and the with-block closes the output file.
    with open(outfile, 'w') as out:
        # The denominator is the same for every record: compute it once
        denominator = (float(l) - float(k) + 1) * float(n)
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            freq = float(sid[1:]) / denominator
            # NOTE(review): the header is written with '<' rather than the
            # usual fasta '>' -- left unchanged, confirm it is intended.
            out.write('<' + str(freq) + '\n' + seq + '\n')
Example #11
0
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Rewrite a fasta file with frequencies as headers, dropping rare records.

    The header (minus its first character) is read as a count C and
    converted to freq = C / ((l - k + 1) * n). Only records with
    log10(freq) > -8.5 are written, as '>freq' followed by the sequence.

    Parameters
    ----------
    fasta      input fasta file; headers must parse as floats once the
               first character is removed
    k, l, n    numbers (or numeric strings) used in the formula above
    outfile    path of the file to write
    """
    # The with-block flushes and closes the output; the bare open() it
    # replaces left the handle open.
    with open(outfile, 'w') as out:
        # The denominator is the same for every record: compute it once
        denominator = (float(l) - float(k) + 1) * float(n)
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            freq = float(sid[1:]) / denominator
            if np.log10(freq) > -8.5:
                out.write('>' + str(freq) + '\n' + seq + '\n')
Example #12
0
def load_db(fn, trim_len):
    """
    Load an OTU database as {int(otu id): sequence}.

    An empty dict is returned when fn is falsy. When trim_len is truthy,
    sequences shorter than trim_len are skipped and the others are cut to
    their first trim_len characters.
    """
    otu_db = {}
    if not fn:
        return otu_db
    for header, sequence in util.iter_fst(fn):
        otu_id = int(header)
        if trim_len:
            if len(sequence) < trim_len:
                # too short to be trimmed to the requested length
                continue
            sequence = sequence[:trim_len]
        otu_db[otu_id] = sequence
    return otu_db
Example #13
0
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Rewrite a fasta file with frequencies as headers, dropping rare records.

    The header (minus its first character) is read as a count C and
    converted to freq = C / ((l - k + 1) * n). Only records with
    log10(freq) > -8.5 are written, as '>freq' followed by the sequence.

    Parameters
    ----------
    fasta      input fasta file; headers must parse as floats once the
               first character is removed
    k, l, n    numbers (or numeric strings) used in the formula above
    outfile    path of the file to write
    """
    # Indentation is spaces only (the former tab/space mix is a TabError on
    # Python 3), and the with-block closes the output file.
    with open(outfile, 'w') as out:
        # The denominator is the same for every record: compute it once
        denominator = (float(l) - float(k) + 1) * float(n)
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            freq = float(sid[1:]) / denominator
            if np.log10(freq) > -8.5:
                out.write('>' + str(freq) + '\n' + seq + '\n')
def parse_barcodes_file(map_fn, format='fasta', rc=False):
    """
    Map barcodes to samples.

    Parameters
    ----------
    map_fn     barcodes file name
    format     'fasta': every record from util.iter_fst is read as
                        (sample, barcode)
               'tab':   every line holds a sample and a barcode separated
                        by whitespace
               any other value yields an empty map
    rc         if True, barcodes are reverse-complemented before being used
               as keys

    Returns
    -------
    dict mapping barcode -> sample
    """
    b2s = {}  # maps barcodes to samples
    # Case 1: barcodes file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(map_fn):
            if rc == True:
                # The result used to be stored in an unused variable, so rc
                # had no effect here; mirror the tab-delimited branch.
                b = reverse_complement(b)
            b2s[b] = s
    # Case 2: barcodes file is tab-delimited
    elif format == 'tab':
        # with-block closes the handle, which the bare open() never did
        with open(map_fn) as map_file:
            for line in map_file:
                [s, b] = line.rstrip().split()
                if rc == True:
                    b = reverse_complement(b)
                b2s[b] = s
    # Return map of barcodes to samples
    return b2s
def remove_size_from_headers(raw_derep_in, raw_derep_out):
    """
    Copy a dereplicated fasta while cutting every header at its first ';',
    e.g. 'seq;size=204' becomes 'seq'.

    Parameters
    ----------
    raw_derep_in      fasta file with dereplicated, trimmed reads
                      This is the output from dereplicate_and_sort() which calls
                      3.dereplicate.py.
    raw_derep_out     file name for output file with renamed headers
    """
    with open(raw_derep_out, 'w') as renamed:
        for entry in util.iter_fst(raw_derep_in):
            # Everything from the first ';' onwards is discarded
            header = entry[0].partition(';')[0]
            renamed.write('\n'.join((header, entry[1])) + '\n')
Example #16
0
def parse_barcodes_file(map_fn, format='fasta', rc=False):
    """
    Map barcodes to samples.

    Parameters
    ----------
    map_fn     barcodes file name
    format     'fasta': every record from util.iter_fst is read as
                        (sample, barcode)
               'tab':   every line holds a sample and a barcode separated
                        by whitespace
               any other value yields an empty map
    rc         if True, barcodes are reverse-complemented before being used
               as keys

    Returns
    -------
    dict mapping barcode -> sample
    """
    b2s = {}  # maps barcodes to samples
    # Case 1: barcodes file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(map_fn):
            if rc == True:
                # The result used to be stored in an unused variable, so rc
                # had no effect here; mirror the tab-delimited branch.
                b = reverse_complement(b)
            b2s[b] = s
    # Case 2: barcodes file is tab-delimited
    elif format == 'tab':
        # Read map_fn: the undefined name `bcode_fn` used here before raised
        # NameError. The with-block also closes the handle.
        with open(map_fn) as map_file:
            for line in map_file:
                [s, b] = line.rstrip().split()
                if rc == True:
                    b = reverse_complement(b)
                b2s[b] = s
    # Return map of barcodes to samples
    return b2s
def remove_size_from_headers(raw_derep_in, raw_derep_out):
    """
    Write a copy of raw_derep_in whose headers are truncated at the first
    ';', turning 'seq;size=204' into 'seq'.

    Parameters
    ----------
    raw_derep_in      fasta file with dereplicated, trimmed reads
                      This is the output from dereplicate_and_sort() which calls
                      3.dereplicate.py.
    raw_derep_out     file name for output file with renamed headers
    """
    with open(raw_derep_out, 'w') as out_handle:
        for fields in util.iter_fst(raw_derep_in):
            short_id, _, _ = fields[0].partition(';')
            sequence = fields[1]
            out_handle.write(short_id + '\n')
            out_handle.write(sequence + '\n')
Example #18
0
def Create_table(inlist):
    """
    Build a table of per-file frequencies for every sequence seen.

    Parameters
    ----------
    inlist     text file listing one fasta path per line

    Returns
    -------
    dict mapping sequence -> list with one entry per listed fasta, in file
    order. An entry is the float parsed from the record header (minus its
    first character), or -20 when the sequence is absent from that fasta.
    """
    # `table` rather than `dict`, which shadowed the builtin. Indentation is
    # spaces only; the former tab/space mix is a TabError on Python 3.
    table = {}
    # read in each file:
    with open(inlist) as listing:
        for i, line in enumerate(listing):
            # Strip only the line terminator: slicing off the last character
            # corrupted the final path when the list had no trailing newline.
            fastafile = line.rstrip('\r\n')
            print(i)
            # Open a new column for this file, defaulting to "absent"
            for freqs in table.values():
                freqs.append(-20)
            for record in util.iter_fst(fastafile):
                sid, seq = record[:2]
                sfreq = float(sid[1:])
                if seq in table:
                    table[seq][-1] = sfreq
                else:
                    # First sighting: back-fill the columns of earlier files
                    table[seq] = [-20] * i + [sfreq]
    return table
Example #19
0
def Create_table(inlist):
    """
    Build a table of per-file frequencies for every sequence seen.

    Parameters
    ----------
    inlist     text file listing one fasta path per line

    Returns
    -------
    dict mapping sequence -> list with one entry per listed fasta, in file
    order. An entry is the float parsed from the record header (minus its
    first character), or -20 when the sequence is absent from that fasta.
    """
    # `table` rather than `dict`, which shadowed the builtin
    table = {}
    # read in each file:
    with open(inlist) as listing:
        for i, line in enumerate(listing):
            # Strip only the line terminator: slicing off the last character
            # corrupted the final path when the list had no trailing newline.
            fastafile = line.rstrip('\r\n')
            # print() with one argument behaves the same on Python 2 and 3
            print(i)
            # Open a new column for this file, defaulting to "absent"
            for freqs in table.values():
                freqs.append(-20)
            for record in util.iter_fst(fastafile):
                sid, seq = record[:2]
                sfreq = float(sid[1:])
                # `in` replaces dict.has_key(), which Python 3 removed
                if seq in table:
                    table[seq][-1] = sfreq
                else:
                    # First sighting: back-fill the columns of earlier files
                    table[seq] = [-20] * i + [sfreq]
    return table
def fasta2table(fastaIn, tableOut):
    """
    Convert a set of fasta sequences into a tab-delimited table.

    The first column (OTU_ID) is the record header minus its first
    character; the second column (Sequence) is the sequence. When a header
    occurs more than once, the last sequence wins. Rows follow the dict's
    iteration order; they are not sorted.

    Parameters
    ----------
    fastaIn      input fasta file
    tableOut     path of the table to write
    """
    seqs = {}
    for [otu_number, seq] in util.iter_fst(fastaIn):
        seqs[otu_number[1:]] = seq

    # The with-block closes the table even if a write fails. The separate
    # `keep` dict held exactly the keys of `seqs` and is no longer needed.
    with open(tableOut, 'w') as fid:
        fid.write("OTU_ID" + '\t' + 'Sequence' + '\n')
        for otu_number, seq in seqs.items():
            fid.write(str(otu_number) + '\t' + seq + '\n')
    return None
def parse_index_file(index_fn, format='fasta'):
    """
    Map sequence IDs to their barcodes.

    Parameters
    ----------
    index_fn     index file name
    format       'fasta': every record from util.iter_fst is stored as
                          {first field: second field}, unmodified
                 'tab':   every line holds an ID and a barcode separated
                          by whitespace
                 'fastq': records come from util.iter_fsq; the ID is cut
                          at its last space and loses its first character
                 any other value yields an empty map

    Returns
    -------
    dict mapping sequence ID -> barcode
    """
    s2b = {}  # maps sequences to barcodes
    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # note: I'm pretty sure this won't work for downstream, because you need
            # to remove the first character from sequence ID
            s2b[s] = b
    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # with-block closes the handle, which the bare open() never did
        with open(index_fn) as index_file:
            for line in index_file:
                [s, b] = line.rstrip().split()
                s2b[s] = b
    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # If sequence ID has :Y:0: thing at the end (standard Illumina format), remove it
            # For this kind of fastq line: @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            s2b[s[1:]] = b
    return s2b
Example #22
0
def parse_index_file(index_fn, format='fasta'):
    """
    Map sequence IDs to their barcodes.

    Parameters
    ----------
    index_fn     index file name
    format       'fasta': every record from util.iter_fst is stored as
                          {first field: second field}, unmodified
                 'tab':   every line holds an ID and a barcode separated
                          by whitespace
                 'fastq': records come from util.iter_fsq; the ID is cut
                          at its last space and loses its first character
                 any other value yields an empty map

    Returns
    -------
    dict mapping sequence ID -> barcode
    """
    s2b = {}  # maps sequences to barcodes
    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # note: I'm pretty sure this won't work for downstream, because you need
            # to remove the first character from sequence ID
            s2b[s] = b
    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # with-block closes the handle, which the bare open() never did
        with open(index_fn) as index_file:
            for line in index_file:
                [s, b] = line.rstrip().split()
                s2b[s] = b
    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # If sequence ID has :Y:0: thing at the end (standard Illumina format), remove it
            # For this kind of fastq line: @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            s2b[s[1:]] = b
    return s2b
Example #23
0
import argparse, util

# Print the fasta records whose ID appears in a list of OTUs.
parser = argparse.ArgumentParser()
# The descriptions used to be passed as `default=`, so leaving an option out
# silently tried to open a file literally named after its description.
parser.add_argument('--fst', help='input fasta file', required=True)
parser.add_argument('--otus', help='list of otus', required=True)
args = parser.parse_args()

# A set gives constant-time membership tests in the loop below, and the
# with-block closes the list file.
with open(args.otus) as otu_file:
    otus = set(line.rstrip() for line in otu_file)

for [sid, seq] in util.iter_fst(args.fst):
    if sid in otus:
        # Parenthesised form works on both Python 2 and Python 3
        print('>%s\n%s' % (sid, seq))
Example #24
0
    "GGA": "G",
    "TGG": "W",
    "CGG": "R",
    "AGG": "R",
    "GGG": "G"
}


def get_codons(nt):
    """
    Yield the consecutive, non-overlapping 3-character chunks of nt.

    Calls quit() with an error message when len(nt) is not a multiple of 3.
    Being a generator, nothing happens until the first item is requested.
    """
    if len(nt) % 3:
        quit('error: len(nt) not divisible by 3')
    for start in range(0, len(nt), 3):
        yield nt[start:start + 3]


def translate(nt):
    """
    Translate nucleotide string nt into amino acids.

    nt is upper-cased, split into codons with get_codons, and each codon is
    looked up in codon_table; the results are concatenated in order.
    """
    codons = get_codons(nt.upper())
    return ''.join(codon_table[codon] for codon in codons)


if __name__ == '__main__':
    import sys
    # Translate every record of the fasta given as first argument and print
    # it. The print used to sit outside the loop, so only the last record
    # was shown (and an empty file raised NameError).
    for record in util.iter_fst(sys.argv[1]):
        record[1] = translate(record[1])
        print('\n'.join(record))
Example #25
0
                    help='print command',
                    default=False,
                    action='store_true')
args = parser.parse_args()

# auto settings
if args.lax == True:
    args.b1 = .5
    args.b2 = .5
    args.b3 = 10
    args.b4 = 5
    args.b5 = 'all'
    args.cut = 25

# fix arguments
# Count the aligned sequences without building a throwaway list of records
n = sum(1 for _ in util.iter_fst(args.aln))
# b1 and b2 are given as fractions of the sequence count; turn them into counts
args.b1 = int(args.b1 * n) + 1
args.b2 = int(args.b2 * n) + 1
args.b5 = {'none': 'N', 'half': 'H', 'all': 'A'}[args.b5]

# encode gblocks input
# Every header is replaced by 'seq<i>'; gmap remembers new name -> old header.
gmap = {}
# The with-block closes the temporary alignment even if a write fails
with open('%s.temp.aln' % (args.aln), 'w') as temp:
    print('writing alignmnet to %s.temp.aln' % (args.aln))
    for i, record in enumerate(util.iter_fst(args.aln)):
        old = record[0]
        new = 'seq%d' % (i)
        gmap[new] = old
        record[0] = '>%s' % (new)
        temp.write('\n'.join(record) + '\n')
Example #26
0
# Feature alphabets: codons come from the translate module's codon table,
# amino acids from the fixed one-letter list below.
cds = sorted(translate.codon_table.keys())
aas = sorted('G A L M F W K Q E S P V I C Y H R N D T'.split())
alphabet = nts + cds + aas
# Header columns: gene, then counts and frequencies of each nt / codon / aa
header = ['gene'] + ['count_nt_%s' %(li) for li in nts] + ['count_cd_%s' %(li) for li in cds] + ['count_aa_%s' %(li) for li in aas] + \
    ['freq_nt_%s' %(li) for li in nts] + ['freq_cd_%s' %(li) for li in cds] + ['freq_aa_%s' %(li) for li in aas]
print('\t'.join(header))

# read sequence id map
# (first whitespace-delimited column -> second column of each line)
smap = {}
if args.map:
    # with-block closes the map file, which the bare open() never did
    with open(args.map) as map_file:
        for line in map_file:
            line = line.rstrip().split()
            smap[line[0]] = line[1]

# iterate over sequences
for record in util.iter_fst(args.fst):
    # each record must unpack into exactly two fields: header and sequence
    sid, seq = record

    # skip sequences whose length is not a multiple of 3
    if len(seq) % 3 != 0:
        continue

    # map sequence ids
    # keep the first whitespace-delimited token of the header, minus its
    # first character
    sid = sid.split()[0][1:]
    if args.map:
        # with a map file, rename the id and skip sequences it does not list
        if sid in smap:
            sid = smap[sid]
        else:
            continue

    # count fna features
    # ------------------
args = parser.parse_args()
filemap = args.fmap

# Each row of the file map is split on tabs: column 0 is a fasta path,
# column 1 the sample ID to use for its sequences.
with open(filemap, 'r') as f:
    lines = [row.strip().split('\t') for row in f.readlines()]

fastas = [line[0] for line in lines]
sids = [line[1] for line in lines]

## Relabel sequences in individual fastas. Save each one as *.sb
## Also concatenate all of these fastas into one large fasta, named myDataset.raw_concat.fasta

concat_fasta = args.dataset + '.raw_concat.fasta'
with open(concat_fasta, 'w') as concat:
    for fasta, newsid in zip(fastas, sids):
        new_fasta = fasta + '.sb'
        counter = 0
        with open(new_fasta, 'w') as f:
            print(new_fasta)
            for oldsid, seq in util.iter_fst(fasta):
                counter += 1
                # New label is <sample ID>_<running number within this fasta>
                entry = '>' + newsid + '_' + str(counter) + '\n' + seq + '\n'
                f.write(entry)
                # The same entry goes to the concatenated fasta in one pass
                concat.write(entry)

"""
Relabel sequences in *.raw_trimmed.fasta to have
datasetID--sampleID_N
"""

import os
import argparse
import util

parser = argparse.ArgumentParser()
parser.add_argument('trimmed_dir',
                    help='directory with *.raw_trimmed.fasta files')
args = parser.parse_args()

# All *.raw_trimmed.fasta files directly inside trimmed_dir
files = [
    os.path.join(args.trimmed_dir, i) for i in os.listdir(args.trimmed_dir)
    if i.endswith('.raw_trimmed.fasta')
]

for fname in files:
    # Dataset ID is the file name up to its first '.'. basename() honours the
    # platform's path separator, unlike splitting on '/'.
    dataset = os.path.basename(fname).split('.')[0]
    with open(fname + '.relabeled', 'w') as fnew:
        for sid, seq in util.iter_fst(fname):
            # '>sampleID_N' becomes '>datasetID--sampleID_N'
            sid = '>' + dataset + '--' + sid[1:]
            fnew.write('\n'.join([sid, seq]) + '\n')
                    help='Input fasta sequences (optional)',
                    default='')
parser.add_argument('--map', required=True, help='Input mapping file')
parser.add_argument('--min_count', type=int, help='Minimum read count')
parser.add_argument('--min_samples', type=int,
                    help='Minimum number of samples')
parser.add_argument('--out', help='Output counts matrix')

# Parse command line arguments
args = parser.parse_args()

# Load valid fst seqs
# keep maps each fasta ID (header minus its first character) to 1; it stays
# empty when no fasta was given.
keep = {}
if args.fst:
    for header, _sequence in util.iter_fst(args.fst):
        keep[header[1:]] = 1

# Keep track of samples and otus
samples = {}
otus = {}

# For every line in the mapping file
for line in open(args.map):
    # Load otu name and table of sample counts
    # (the line must hold exactly two tab-separated fields)
    otu, table = line.rstrip().split('\t')
    # When a fasta was supplied, skip otus that are not listed in it
    if len(keep) > 0 and otu not in keep:
        continue
    # The table is a space-separated list of entries
    entries = table.split(' ')
    count = sum(
Example #30
0
import argparse
import util

# parse args
parser = argparse.ArgumentParser()
parser.add_argument('-f', help='FASTA file')
parser.add_argument('-q', help='FASTQ file')
parser.add_argument('-s', help='Subset ids')
args = parser.parse_args()

# load subset
# A set gives constant-time membership tests in the loop below, and the
# with-block closes the id file.
with open(args.s) as subset_file:
    subset = set(line.rstrip() for line in subset_file)

# get iterator
# (-q takes precedence when both are given; with neither, nothing is printed)
iter_seq = ''
if args.f:
    iter_seq = util.iter_fst(args.f)
if args.q:
    iter_seq = util.iter_fsq(args.q)

# subset file
for record in iter_seq:
    # ID is the header minus its first character, cut at the first ';'
    sid = record[0][1:].split(';')[0]
    if sid in subset:
        # Parenthesised form works on both Python 2 and Python 3
        print('\n'.join(record))
    
Example #31
0
parser.add_argument('--minlen', type=int, default=0, help='Minimum length')
parser.add_argument('--num', type=int, default=0, help='Number to keep')
parser.add_argument('--prefix', type=str, default='', help='Prefix to add')
parser.add_argument('--prefix_sep', type=str, default='.',
                    help='Prefix separator')
parser.add_argument('--debug', action='store_true', default=False,
                    help='Debug mode')
args = parser.parse_args()

# initialize variables
keep = {}
remove = {}

# get iterator
# fst / fsq are passed to the reader as given; FST / FSQ read from stdin
if args.fst:
    iter_seq = util.iter_fst(args.fst)
elif args.fsq:
    iter_seq = util.iter_fsq(args.fsq)
elif args.FST:
    iter_seq = util.iter_fst(sys.stdin)
elif args.FSQ:
    iter_seq = util.iter_fsq(sys.stdin)
else:
    quit('error: must specify fst, fsq, FST, or FSQ')

# load IDs/coordinates to keep
if args.keep:
Example #32
0
import util, sys

# Usage: <script> FASTA K -- print every sequence of at least K characters,
# truncated to its first K characters.
fn = sys.argv[1]
k = int(sys.argv[2])


for [sid, seq] in util.iter_fst(fn):

    # Sequences shorter than k are dropped
    if len(seq) >= k:
        # Parenthesised form works on both Python 2 and Python 3
        print('>%s\n%s' % (sid, seq[:k]))
import util

parser = argparse.ArgumentParser()
parser.add_argument('--fst', default='',
                    help='Input fasta sequences (optional)')
parser.add_argument('--map', required=True, help='Input mapping file')
parser.add_argument('--min_count', type=int, help='Minimum read count')
parser.add_argument('--min_samples', type=int,
                    help='Minimum number of samples')
parser.add_argument('--out', help='Output counts matrix')

# Parse command line arguments
args = parser.parse_args()

# Load valid fst seqs
# keep maps each fasta ID (header minus its first character) to 1; it stays
# empty when --fst is left at its default.
keep = {}
if args.fst:
    for header, _sequence in util.iter_fst(args.fst):
        keep[header[1:]] = 1

# Keep track of samples and otus
samples = {}
otus = {}

# For every line in the mapping file
for line in open(args.map):
    # Load otu name and table of sample counts
    # (the line must hold exactly two tab-separated fields)
    otu, table = line.rstrip().split('\t')
    # When a fasta was supplied, skip otus that are not listed in it
    if len(keep) > 0 and otu not in keep:
        continue
    # The table is a space-separated list of entries
    entries = table.split(' ')
    # Number of entries whose integer after the first ':' is at least
    # args.min_count.
    # NOTE(review): --min_count has no default, so args.min_count is None
    # when the option is omitted -- confirm the comparison is meant to run then.
    count = sum([int(entry.split(':')[1]) >= args.min_count for entry in entries])
Example #34
0
print('Parsing dereplication map: {}'.format(args.map_file))
seq_sizes = {}

# Each map line is: seqID <tab> space-separated entries. The number between
# 'size=' and ':1' in every entry is summed into the total size of seqID.
# The file is streamed line by line rather than loaded whole.
with open(args.map_file, 'r') as f:
    for line in f:
        line = line.strip().split('\t')
        seqID = line[0]
        total_size = sum(int(i.split('size=')[1].split(':1')[0]) for i in line[1].split(' '))
        seq_sizes[seqID] = total_size

## Read in the entire fasta file into a dict {seqID: sequence}
print('Reading fasta file: {}'.format(args.fasta_in))
fasta = {}
for sid, seq in util.iter_fst(args.fasta_in):
    # sid in the fasta is something like >444;size=8
    # sid.split(';')[0][1:] returns 444, which is a key in seq_sizes
    sid = sid.split(';')[0][1:]
    newsid = '>' + sid + ';size=' + str(seq_sizes[sid])
    fasta[sid] = {}
    fasta[sid]['new_sid'] = newsid
    fasta[sid]['seq'] = seq

## Get list of sequence IDs in descending size (i.e. largest first)
ordered_seqs = sorted(seq_sizes, key=lambda k: seq_sizes[k], reverse=True)

## Write new fasta file in descending size
print('Writing sorted and relabled fasta: {}'.format(args.fasta_out))
with open(args.fasta_out, 'w') as f:
    # Every record, including the last, ends with a newline so the file can
    # be concatenated with other fastas. The joined string written before
    # left the final sequence line unterminated.
    for s in ordered_seqs:
        f.write(fasta[s]['new_sid'] + '\n' + fasta[s]['seq'] + '\n')