def parse_index_file(index_fn, format='fasta'):
    """Build a dictionary mapping sequence IDs to barcodes from an index file.

    Args:
        index_fn: Path of the index file.
        format: Layout of the index file; one of 'fasta', 'tab' or 'fastq'.

    Returns:
        dict whose keys are sequence IDs and whose values are barcodes.

    Raises:
        ValueError: if `format` is not a supported name, or if a non-blank
            line of a 'tab' file does not hold exactly two fields.
    """
    s2b = {}  # maps sequences to barcodes
    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # The ID is stored exactly as yielded by util.iter_fst.
            # NOTE(review): the original author suspected that the first
            # character of the ID has to be removed for downstream use --
            # confirm against the callers before changing this.
            s2b[s] = b
    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # The context manager closes the handle even if a line is malformed.
        with open(index_fn) as handle:
            for line in handle:
                # split() without arguments splits on any run of whitespace
                # and ignores leading/trailing whitespace.
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                [s, b] = fields
                s2b[s] = b
    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # Drop whatever follows the last space in the ID line, e.g. the
            # ' :N:0:' suffix of
            # @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            # Store the ID without its first character.
            s2b[s[1:]] = b
    else:
        # An unknown format used to yield an empty map without any warning.
        raise ValueError(
            "unknown index file format %r (expected 'fasta', 'tab' or 'fastq')"
            % (format,))
    return s2b
Esempio n. 2
0
def parse_index_file(index_fn, format='fasta'):
    """Build a dictionary mapping sequence IDs to barcodes from an index file.

    Args:
        index_fn: Path of the index file.
        format: Layout of the index file; one of 'fasta', 'tab' or 'fastq'.

    Returns:
        dict whose keys are sequence IDs and whose values are barcodes.

    Raises:
        ValueError: if `format` is not a supported name, or if a non-blank
            line of a 'tab' file does not hold exactly two fields.
    """
    s2b = {}  # maps sequences to barcodes
    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # The ID is stored exactly as yielded by util.iter_fst.
            # NOTE(review): the original author suspected that the first
            # character of the ID has to be removed for downstream use --
            # confirm against the callers before changing this.
            s2b[s] = b
    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # The context manager closes the handle even if a line is malformed.
        with open(index_fn) as handle:
            for line in handle:
                # split() without arguments splits on any run of whitespace
                # and ignores leading/trailing whitespace.
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                [s, b] = fields
                s2b[s] = b
    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # Drop whatever follows the last space in the ID line, e.g. the
            # ' :N:0:' suffix of
            # @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            # Store the ID without its first character.
            s2b[s[1:]] = b
    else:
        # An unknown format used to yield an empty map without any warning.
        raise ValueError(
            "unknown index file format %r (expected 'fasta', 'tab' or 'fastq')"
            % (format,))
    return s2b
Esempio n. 3
0
# Command-line options (the parser object is created earlier in the script).
parser.add_argument('--fastq', help='FASTQ file', required=True)
parser.add_argument('--blast', help='BLAST file', required=True)
parser.add_argument('--sample', help='Sample name', required=True)
parser.add_argument('--prefix', help='OTU prefix', default='bacteria')
parser.add_argument('--out', help='Output prefix (counts)')
args = parser.parse_args()


# ---------------
# Initialize data
# ---------------
print('Initializing data')

# 1. Read FASTQ sequences into dictionary
#    Key: first field of each record with its first character dropped.
#    Value: second field of the record.
seqs = {}
for record in util.iter_fsq(args.fastq):
    sid = record[0][1:]
    seq = record[1]
    seqs[sid] = seq

# 2. Map: microbial contigs to GCF IDs
#    Each line must hold exactly two tab-separated columns; double quotes
#    are stripped before splitting.
contig2gcf = {}
with open('/home/unix/csmillie/aviv/db/refseq/meta/contig2gcf.txt') as contig_fh:
    # The context manager closes the file once the table has been read.
    for line in contig_fh:
        contig, gcf = line.replace('"', '').rstrip().split('\t')
        contig2gcf[contig] = gcf


# 3. Map: GCF IDs to taxonomy
gcf2sp = {}
gcf2gn = {}
for line in open('/home/unix/csmillie/aviv/db/refseq/meta/gcf.taxonomy_table.txt'):
Esempio n. 4
0
# Remaining command-line options (the parser object and the fst/fsq/FST/FSQ
# options it refers to below are declared earlier in the script).
parser.add_argument('--prefix', help='Prefix to add', type=str, default='')
parser.add_argument('--prefix_sep',
                    help='Prefix separator',
                    type=str,
                    default='.')
parser.add_argument('--debug',
                    help='Debug mode',
                    action='store_true',
                    default=False)
args = parser.parse_args()

# get iterator
# fst/fsq are passed to the reader as given; FST/FSQ read from standard input.
# The first option that is set wins.
if args.fst:
    iter_seq = util.iter_fst(args.fst)
elif args.fsq:
    iter_seq = util.iter_fsq(args.fsq)
elif args.FST:
    iter_seq = util.iter_fst(sys.stdin)
elif args.FSQ:
    iter_seq = util.iter_fsq(sys.stdin)
else:
    # sys.exit() rather than quit(): quit() is installed by the site module
    # for interactive sessions and is not guaranteed to exist in a script.
    sys.exit('error: must specify fst, fsq, FST, or FSQ')

# initialize variables
keep = {}
remove = {}

# load IDs/coordinates to keep
if args.keep:
    for line in open(args.keep):
        line = line.rstrip().split('\t')
Esempio n. 5
0
#!/usr/bin/env python
import util
# Print every FASTQ record as a two-line FASTA entry: a '>' header built from
# the record's first field minus its first character, then the second field.
# NOTE(review): iter_fsq() is called without arguments -- presumably it then
# reads from standard input; confirm in util.
q = util.iter_fsq()
for record in q:
    # Parenthesised single-argument print works on both Python 2 and 3.
    print('>%s\n%s' % (record[0][1:], record[1]))
Esempio n. 6
0
    line = line.rstrip().split()
    folder = line[0]
    sample = line[1]
    info[sample] = folder

# Extract info
# args.sid must be of the form '<sample>.<site>'; the sample selects its
# folder from `info` and the site key is translated through `sites`.
[sample, site] = args.sid.split('.')
folder = info[sample]
site = sites[site]

# Get sequence IDs
# Collect the records of every per-organism FASTQ file, keyed by the first
# field of the record with its first character dropped.
seqs = {}
orgs = 'archaea bacteria fungi protozoa viral'.split()
for org in orgs:
    fn = './%s/%s_%s_bowtie2_contam.fastq' % (org, args.sid, org)
    for record in util.iter_fsq(fn):
        if record:
            seqs[record[0][1:]] = record
# Parenthesised single-argument print works on both Python 2 and 3.
print('Found %d sequences' % len(seqs))

# Parse BAM file
# `out` stays open for the code that follows this section.
out = open(args.out, 'w')
fn = '/home/unix/csmillie/Gut_Human/data/%s/%s/outs/possorted_genome_bam.bam' % (
    folder, site)
if not os.path.exists(fn):
    # Only reported; execution continues with the missing path.
    print('BAM file not found')
for line in os.popen('samtools view -f 4 %s' % (fn)):
    read = line.split()[0]
    if read in seqs:

        cell = ''
Esempio n. 7
0
import argparse
import util

# parse args
parser = argparse.ArgumentParser()
parser.add_argument('-f', help='FASTA file')
parser.add_argument('-q', help='FASTQ file')
parser.add_argument('-s', help='Subset ids')
args = parser.parse_args()

# load subset
# One ID per line. A set keeps the per-record membership test below at
# constant time, and the context manager closes the ID file after reading.
with open(args.s) as subset_fh:
    subset = {line.rstrip() for line in subset_fh}

# get iterator
# Falls back to an empty iterable when neither -f nor -q is given; -q takes
# precedence when both are given.
iter_seq = ''
if args.f:
    iter_seq = util.iter_fst(args.f)
if args.q:
    iter_seq = util.iter_fsq(args.q)

# subset file
for record in iter_seq:
    # Record ID: first field without its first character, cut at the
    # first ';'.
    sid = record[0][1:].split(';')[0]
    if sid in subset:
        # Parenthesised single-argument print works on both Python 2 and 3.
        print('\n'.join(record))