def call_starcode_fastq_file(fastq):
    '''Extract barcodes and spikes from a fastq file and run `starcode`
    on each of the two sets.

    Only the second line of every 4-line fastq record is examined. A
    read matching the GFP pattern goes to the barcode set; otherwise a
    read matching the SPIKE pattern goes to the spike set; other reads
    are dropped. The read is cut at `hit.matchlist[0][0]` (presumably
    the start of the match) and kept only if that position lies between
    MIN_BRCD and MAX_BRCD, both included.

    The output file names are built from `fname` and are appended to
    the lists `processed` and `spikessed`; none of these names is
    defined in this function.
    NOTE(review): `fname` is not the `fastq` argument -- presumably a
    module-level name. Confirm that this is intended.

    Arguments:
      fastq: path of the fastq file, opened with `gzopen`.
    '''

    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = fname + '_barcodes.tsv'
    spk_outfname = fname + '_spikes.tsv'

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fastq) as f:
            for lineno, line in enumerate(f):
                # Take sequence only.
                if lineno % 4 != 1:
                    continue
                hit = GFP.match(line)
                if hit is not None:
                    outf = barcode_tempf
                else:
                    hit = SPIKE.match(line)
                    if hit is None:
                        continue
                    outf = spike_tempf
                pos = hit.matchlist[0][0]
                if MIN_BRCD <= pos <= MAX_BRCD:
                    outf.write(line[:pos] + '\n')

        # Close (and thereby flush) before `starcode` reads the files.
        barcode_tempf.close()
        spike_tempf.close()

        subprocess.call([
            'starcode',
            '-t4',
            '-i', barcode_tempf.name,
            '-o', brcd_outfname,
        ])
        subprocess.call([
            'starcode',
            '-t4',
            '-i', spike_tempf.name,
            '-o', spk_outfname,
        ])
    finally:
        # Delete temporary files, also when something failed above.
        # Closing an already closed file is harmless.
        barcode_tempf.close()
        spike_tempf.close()
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    # Save the names of the files processed.
    processed.append(brcd_outfname)
    spikessed.append(spk_outfname)
    return
def call_starcode_fastq_file(fastq):
    '''Extract barcodes and spikes from a fastq file and run `starcode`
    on each of the two sets.

    Only the second line of every 4-line fastq record is examined. A
    read matching the GFP pattern goes to the barcode set; otherwise a
    read matching the SPIKE pattern goes to the spike set; other reads
    are dropped. The read is cut at `hit.matchlist[0][0]` (presumably
    the start of the match) and kept only if that position lies between
    MIN_BRCD and MAX_BRCD, both included.

    The output file names are built from `fname` and are appended to
    the lists `processed` and `spikessed`; none of these names is
    defined in this function.
    NOTE(review): `fname` is not the `fastq` argument -- presumably a
    module-level name. Confirm that this is intended.

    Arguments:
      fastq: path of the fastq file, opened with `gzopen`.
    '''

    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = fname + '_barcodes.tsv'
    spk_outfname = fname + '_spikes.tsv'

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fastq) as f:
            for lineno, line in enumerate(f):
                # Take sequence only.
                if lineno % 4 != 1:
                    continue
                hit = GFP.match(line)
                if hit is not None:
                    outf = barcode_tempf
                else:
                    hit = SPIKE.match(line)
                    if hit is None:
                        continue
                    outf = spike_tempf
                pos = hit.matchlist[0][0]
                if MIN_BRCD <= pos <= MAX_BRCD:
                    outf.write(line[:pos] + '\n')

        # Close (and thereby flush) before `starcode` reads the files.
        barcode_tempf.close()
        spike_tempf.close()

        subprocess.call([
            'starcode',
            '-t4',
            '-i', barcode_tempf.name,
            '-o', brcd_outfname,
        ])
        subprocess.call([
            'starcode',
            '-t4',
            '-i', spike_tempf.name,
            '-o', spk_outfname,
        ])
    finally:
        # Delete temporary files, also when something failed above.
        # Closing an already closed file is harmless.
        barcode_tempf.close()
        spike_tempf.close()
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    # Save the names of the files processed.
    processed.append(brcd_outfname)
    spikessed.append(spk_outfname)
    return
def __init__(self, cst, fwd, rev, samples, spikes=None):
    '''Simple constructor instantiating attributes.

    Arguments:
      cst: constant sequence, compiled as a seeq pattern tolerating
        len(cst) // 5 errors.
      fwd, rev: forward and reverse primer sequences.
      samples: dictionary whose keys are pairs (a, b); the primers
        actually used are the last `a` characters of `fwd` and the
        last `b` characters of `rev`.
      spikes: optional dictionary mapping spike names to spike
        sequences (default: no spikes).

    Raises:
      BadSpecifications: if two primers or two spikes are too close
        to each other, if a spike does not match the constant part,
        or if the sample keys are inconsistent with the primers.
    '''

    # A fresh dictionary per call; a mutable default would be shared.
    if spikes is None:
        spikes = dict()

    # Base multiplexing information.
    # `//` keeps the error tolerance an integer.
    self.cst = seeq.compile(cst, len(cst) // 5)
    self.samples = samples

    # Store the sequences of the primers used
    # in a frozen set for reference.
    self.Lseq = frozenset([fwd[-a:] for (a, b) in self.samples.keys()])
    self.Rseq = frozenset([rev[-b:] for (a, b) in self.samples.keys()])

    # Check that the primers are not too close to each
    # other, otherwise it will be impossible to demultiplex.
    # If all the primers are at a distance greater than
    # twice the cut-off, then no double hit is possible.
    for (a, b) in combinations(self.Lseq, 2):
        if dist_less_than(a, b, 2 * len(fwd) // 5):
            raise BadSpecifications('primer sequences too close', a, b)
    for (a, b) in combinations(self.Rseq, 2):
        if dist_less_than(a, b, 2 * len(rev) // 5):
            raise BadSpecifications('primer sequences too close', a, b)

    # Check that the spikes (if present) are not too close to
    # each other to avoid multiple matches. This does not guarantee
    # that the matches are unique but we have to accommodate the
    # experiments.
    # NOTE(review): the original comment asks for a distance of at
    # least 2; whether `dist_less_than(a, b, 1)` enforces exactly
    # that depends on the helper's semantics -- confirm.
    for (a, b) in combinations(spikes.values(), 2):
        if dist_less_than(a, b, 1):
            raise BadSpecifications('spike sequences too close', a, b)

    # Check that the spikes can be found by looking for the
    # constant part.
    for spseq in spikes.values():
        if self.cst.match(spseq) is None:
            raise BadSpecifications('spike too divergent', spseq)

    # Spikes are not too close. Replace the values of the
    # dictionary by a compiled seeq pattern allowing 1 error.
    self.spikes = dict()
    for spname in spikes:
        self.spikes[spname] = seeq.compile(spikes[spname], 1)

    # Check that the sample specification corresponds to
    # the primers used for the PCR.
    if len(fwd) != max([a for (a, b) in samples.keys()]):
        raise BadSpecifications('inconsistent sample keys')
    if len(rev) != max([b for (a, b) in samples.keys()]):
        raise BadSpecifications('inconsistent sample keys')
def test_matchSuffix(self):
    # Going by the expected values below, `matchSuffix` returns the
    # empty string when nothing matches; with a true second argument
    # the result starts at the match, with a false one it only
    # contains what follows the match.
    pattern = seeq.compile("CGCTAATTAATGGAAT", 3)
    unrelated = "ATGCTGATGCTGGGGG"
    target = "GGGGCGCTAATAATGGAATGGGG"

    # (text, second argument, expected result)
    expectations = (
        (unrelated, True, ""),
        (unrelated, False, ""),
        (target, True, "CGCTAATAATGGAATGGGG"),
        (target, False, "GGGG"),
    )
    for text, flag, expected in expectations:
        self.assertEqual(pattern.matchSuffix(text, flag), expected)
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """This function takes the 2 pair-end sequencing files and extracts
    the barcode making sure that the other read contains the
    transposon.

    The result is a fasta file where each header is a barcode (from
    the first file) and each sequence is the fragment found after the
    transposon in the paired read (from the second file). If the
    output file already exists, nothing is computed.

    Arguments:
      fname_iPCR_PE1: fastq file with the barcode reads; the name of
        the output file is derived from it.
      fname_iPCR_PE2: fastq file with the paired reads.

    Returns:
      The name of the fasta file.
    """

    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # The known parts of the sequences are matched with a Levenshtein
    # automaton. On the reverse read, the end of the transposon
    # corresponds to a 34 bp sequence ending as shown below. We allow
    # up to 5 mismatches/indels. On the forward read, the only known
    # sequence is the CATG after the barcode, which is matched exactly.
    pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

    # Open a file to write
    fname_fasta = re.sub(r'[\_F][w\_].fastq(\.gz)?', 'iPCR.fasta',
                         fname_iPCR_PE1)
    # Substitution failed, append '.fasta' to avoid name collision.
    if fname_fasta == fname_iPCR_PE1:
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    # Skip if file exists.
    if os.path.exists(fname_fasta):
        return fname_fasta

    # Write to a side file and rename it at the very end. Otherwise an
    # interrupted run would leave a truncated fasta file that the
    # check above would take for a finished one on the next call.
    fname_partial = fname_fasta + '.partial'
    finished = False
    try:
        with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
                open(fname_partial, 'w') as outf:
            # Aggregate iterator of f,g iterators -> izip(f,g).
            for lineno, (line1, line2) in enumerate(izip(f, g)):
                # Take sequence only.
                if lineno % 4 != 1:
                    continue
                # Split on "CATG" and take the first fragment.
                # In case there is no "CATG", the barcode will be
                # rejected for being too long.
                brcd = line1.rstrip().split('CATG')[0]
                if not MIN_BRCD < len(brcd) < MAX_BRCD:
                    continue
                # Use a Levenshtein automaton to find the transposon.
                genome = pT2.matchSuffix(line2, False)
                if not genome:
                    continue
                # Select the region from the end of the transposon to
                # the first "CATG", if any.
                genome = genome.split('CATG')[0].rstrip()
                if len(genome) < MIN_GENOME:
                    continue
                outf.write('>%s\n%s\n' % (brcd, genome))
        finished = True
    finally:
        if not finished and os.path.exists(fname_partial):
            os.unlink(fname_partial)

    os.rename(fname_partial, fname_fasta)
    return fname_fasta
def extract_fingerprint_and_GATCGATC(f):
    '''Write one "fingerprint <tab> readout" line per usable read of f.

    The design of the oligo is the following:

       o(12) GATCGATC o(12) CGCACTAATGAATTCGTTGC u(20)

    The nucleotides labelled "o" are oligo-specific random nucleotides;
    those labelled "u" are random UMI nucleotides introduced during the
    linear amplification or the PCR.

    The "fingerprint" is the concatenation of the random nucleotides
    with a constant sequence, i.e.

       o(12) o(12) AGATACAGAGATAATACA u(20).

    The readout is the stretch matching GATCGATC (with at most one
    error). Reads where the constant part or GATCGATC cannot be found,
    or where the part left of the constant is not 30 to 34 nucleotides
    long, are skipped. Output goes to the standard output.
    '''

    constant = seeq.compile(r'CGCACTAATGAATTCGTTGCA', 4)
    site = seeq.compile(r'GATCGATC', 1)

    for read in f:
        # Cut the read around the constant part: oligo-specific
        # nucleotides plus GATCGATC on the left, UMI on the right.
        # No match gives AttributeError, a wrong number of tokens
        # gives ValueError.
        try:
            left, _, umi = constant.match(read.rstrip()).tokenize()
        except (ValueError, AttributeError):
            continue

        # Target length is 32. Allow at most 2 indels.
        if len(left) < 30 or len(left) > 34:
            continue

        # Look for GATCGATC in the window [10:22] of the left part.
        try:
            first, last, _ = site.match(left[10:22]).matchlist[0]
        except AttributeError:
            continue

        # Convert window coordinates to coordinates in `left`.
        first += 10
        last += 10
        readout = left[first:last]
        fingerprint = (left[:first] + left[last:] +
                       'AGATACAGAGATAATACA' + umi)

        # Output fingerprint and GATCGATC
        sys.stdout.write('%s\t%s\n' % (fingerprint, readout))
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """This function takes the 2 pair-end sequencing files and extracts
    the barcode making sure that the other read contains the
    transposon.

    The result is a fasta file where each header is a barcode (from
    the first file) and each sequence is the fragment found after the
    transposon in the paired read (from the second file). If the
    output file already exists, nothing is computed.

    Arguments:
      fname_iPCR_PE1: fastq file with the barcode reads; the name of
        the output file is derived from it.
      fname_iPCR_PE2: fastq file with the paired reads.

    Returns:
      The name of the fasta file.
    """

    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # The known parts of the sequences are matched with a Levenshtein
    # automaton. On the reverse read, the end of the transposon
    # corresponds to a 34 bp sequence ending as shown below. We allow
    # up to 5 mismatches/indels. On the forward read, the only known
    # sequence is the CATG after the barcode, which is matched exactly.
    pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

    # Open a file to write
    fname_fasta = re.sub(r'read[1-2].fastq(\.gz)?', 'iPCR.fasta',
                         fname_iPCR_PE1)
    # Substitution failed, append '.fasta' to avoid name collision.
    if fname_fasta == fname_iPCR_PE1:
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    # Skip if file exists.
    if os.path.exists(fname_fasta):
        return fname_fasta

    # Write to a side file and rename it at the very end. Otherwise an
    # interrupted run would leave a truncated fasta file that the
    # check above would take for a finished one on the next call.
    fname_partial = fname_fasta + '.partial'
    finished = False
    try:
        with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
                open(fname_partial, 'w') as outf:
            # Aggregate iterator of f,g iterators -> izip(f,g).
            for lineno, (line1, line2) in enumerate(izip(f, g)):
                # Take sequence only.
                if lineno % 4 != 1:
                    continue
                # Split on "CATG" and take the first fragment.
                # In case there is no "CATG", the barcode will be
                # rejected for being too long.
                brcd = line1.rstrip().split('CATG')[0]
                if not MIN_BRCD < len(brcd) < MAX_BRCD:
                    continue
                # Use a Levenshtein automaton to find the transposon.
                genome = pT2.matchSuffix(line2, False)
                if not genome:
                    continue
                # Select the region from the end of the transposon to
                # the first "CATG", if any.
                genome = genome.split('CATG')[0].rstrip()
                if len(genome) < MIN_GENOME:
                    continue
                outf.write('>%s\n%s\n' % (brcd, genome))
        finished = True
    finally:
        if not finished and os.path.exists(fname_partial):
            os.unlink(fname_partial)

    os.rename(fname_partial, fname_fasta)
    return fname_fasta
def main(f):
    '''Print "barcode index1 index2 UMI" for the usable reads of a
    fastq stream.

    index2 is what follows the last "+" on the header line of the
    record. The sequence line is split around the constant part (up to
    5 errors): the barcode is the left token, the first 4 characters of
    the right token are the UMI and the characters 4 to 12 are index1.
    Records without a match, with a different number of tokens, with a
    right token shorter than 8 or a barcode shorter than 14 are
    skipped.

    NOTE(review): a right token of 8 to 11 characters passes the length
    test but yields an index1 shorter than 8 characters -- confirm that
    this is intended.
    '''

    constant = seeq.compile('TATAGTGAGTCGTATTAAAAGCGAAAGGGAAACCAGAGGAGC', 5)

    for lineno, line in enumerate(f):
        field = lineno % 4

        # Header line: remember the second index for this record.
        if field == 0:
            index2 = re.sub(r'.*\+', '', line.rstrip())
            continue

        # Everything but the sequence line is ignored.
        if field != 1:
            continue

        hit = constant.match(line.rstrip())
        if hit is None:
            continue
        try:
            barcode, ignore, tail = hit.tokenize()
        except ValueError:
            continue

        if not (len(tail) >= 8 and len(barcode) >= 14):
            continue

        fields = (barcode, tail[4:12], index2, tail[:4])
        sys.stdout.write('%s %s %s %s\n' % fields)
def call_starcode_on_fastq_file(fname_fastq):
    ''' Extracts the gDNA,cDNA reads and spikes and runs starcode on them.

    Only the second line of every 4-line fastq record is examined. If
    it matches the SPIKE pattern, the part before `matchlist[0][0]`
    goes to the spike set; otherwise the first 20 characters go to the
    barcode set. `starcode` is then run on each set. If both output
    files already exist, nothing is computed.

    Arguments:
      fname_fastq: path of the fastq file, opened with `gzopen`.

    Returns:
      The pair (barcode output file name, spike output file name).
    '''

    brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
    spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
    # Substitution failed, append the suffix to avoid name collision.
    if brcd_outfname == fname_fastq:
        brcd_outfname = fname_fastq + '_starcode.txt'
    if spk_outfname == fname_fastq:
        spk_outfname = fname_fastq + '_spikes_starcode.txt'

    if os.path.exists(brcd_outfname) and os.path.exists(spk_outfname):
        return (brcd_outfname, spk_outfname)

    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fname_fastq) as f:
            for lineno, line in enumerate(f):
                # Take sequence only.
                if lineno % 4 != 1:
                    continue
                spike = SPIKE.match(line)
                if spike is not None:
                    spike_tempf.write(line[:spike.matchlist[0][0]] + '\n')
                else:
                    barcode_tempf.write(line[:20] + '\n')

        # Close (and thereby flush) before `starcode` reads the files.
        barcode_tempf.close()
        spike_tempf.close()

        # Call `starcode`. A failing program returns a positive exit
        # status (a negative one only means "killed by a signal"), so
        # anything but 0 is reported.
        starcode_process = subprocess.call([
            'starcode',
            '-t4',
            '--print-clusters',
            '-i', barcode_tempf.name,
            '-o', brcd_outfname,
        ])
        if starcode_process != 0:
            sys.stderr.write("Error during Starcode call on: %s\n" %
                             barcode_tempf.name)

        starcode_process = subprocess.call([
            'starcode',
            '-t4',
            '--print-clusters',
            '-i', spike_tempf.name,
            '-o', spk_outfname,
        ])
        if starcode_process != 0:
            sys.stderr.write("Error during Starcode call on: %s\n" %
                             spk_outfname)
    finally:
        # Delete temporary files, also when something failed above.
        # Closing an already closed file is harmless.
        barcode_tempf.close()
        spike_tempf.close()
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    # Same return value as when the output files already exist.
    return (brcd_outfname, spk_outfname)
help="iPCR reverse reads (gzipped+fastq format)") params = parser.parse_args() fbrcd = open(params.barcodes, 'w') fseqs = open(params.reads, 'w') if params.bdist < 0: bdist = int(max(1, round(0.2 * len(params.b)))) else: bdist = params.bdist if params.ldist < 0: ldist = int(max(1, round(0.1 * len(params.l)))) else: ldist = params.rdist # BHIVE seqs: # T7 promoter (-b) TATAGTGAGTCGTA # LTR sequence (-l) AGCCCTTCCA # HIVRE sequence (-r) CGCTTTTAA T7 = seeq.compile(params.b, bdist) LTR = seeq.compile(params.l, ldist) HIVRE = None if params.r: HIVRE = seeq.compile(params.r, params.rdist) fqline = [""] * 4 with gzip.open(params.ipcr_forward) as r1, gzip.open( params.ipcr_reverse) as r2: for lineno, line in enumerate(r1): if lineno % 4 != 1: continue for i in range(0, 4): fqline[i] = r2.readline() # Match 5' LTR on reverse read (required, there must be 20 nt). l = LTR.matchBest(fqline[1]) if not l:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import sys
import seeq

from gzopen import gzopen

T7 = seeq.compile('TATAGTGAGTCGTATTAAAA', 3)


def main(f, indx):
    '''Print the barcodes of the reads of `f` carrying the index `indx`.

    The sequence line of each 4-line fastq record is split around the
    T7 pattern. The left part is printed if it is longer than 14 and
    shorter than 25 and the right part starts with `indx`. Reads that
    cannot be split this way are silently ignored.
    '''
    for position, read in enumerate(f):
        if position % 4 == 1:
            try:
                barcode, suffix = T7.match(read).split()
                wanted = (14 < len(barcode) < 25
                          and suffix.startswith(indx))
                if wanted:
                    print(barcode)
            except (AttributeError, ValueError, IndexError):
                # No match, or the read could not be split in two.
                pass


if __name__ == '__main__':
    with gzopen(sys.argv[1]) as f:
        main(f, sys.argv[2])
#!/usr/bin/env python # -*- coding:utf-8 -*- import pdb import seeq import sys from gzopen import gzopen from itertools import izip pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5) MIN_BRCD = 15 MAX_BRCD = 25 MIN_GENOME = 15 #pdb.set_trace() outfname = sys.argv[1].split("_")[0] + ".tomap" with gzopen(sys.argv[1]) as f, gzopen(sys.argv[2]) as g, \ open(outfname, 'w') as outf: # Aggregate iterator of f,g iterators -> izip(f,g). for lineno,(line1,line2) in enumerate(izip(f,g)): # Take sequence only. if lineno % 4 != 1: continue # Split on "CATG" and take the first fragment. # In case there is no "CATG", the barcode will be rejected # for being too long. brcd = line1.rstrip().split('CATG')[0] if not MIN_BRCD < len(brcd) < MAX_BRCD: continue # Use a Levenshtein automaton to find the transpsoson. genome = pT2.matchSuffix(line2, False)y if not genome: continue # Select the region from the end of the transposon to
def main(fname):
    '''Demultiplex the DNA and RNA fastq files described in a
    parameter file.

    All white space is removed from each line of the parameter file,
    then:
      - empty lines and lines starting with '#' are ignored;
      - 'name=value' lines are stored in `params`;
      - 'dna-index:' and 'rna-index:' lines open a section;
      - inside a section, 'index,file' lines are stored in `IND[0]`
        (DNA) or `IND[1]` (RNA).
    Anything else, a duplicate index or a missing mandatory parameter
    ('bfs', 'dist', 'dna-seqfile', 'rna-seqfile') prints a message and
    exits with status 1.

    Every fastq record of 'dna-seqfile' (resp. 'rna-seqfile') whose
    index, as returned by `getindex`, is listed in the DNA (resp. RNA)
    section is then appended to the corresponding gzipped output file.

    `params`, `IND`, `lines` and `getindex` are not defined in this
    function.
    NOTE(review): `lines` is used as a 4-item buffer for the current
    record -- presumably a module-level list of length 4; confirm.

    Arguments:
      fname: path of the parameter file.
    '''

    mode = 2

    # Parse parameter file
    with open(fname) as cf:
        for lineno, line in enumerate(cf, 1):
            line = ''.join(line.split())
            if line == '' or line[0] == '#':
                continue
            elif len(line.split('=')) == 2:
                [param, value] = line.split('=')
                params[param] = value
            elif len(line.split(':')) == 2:
                section = line.split(':')[0]
                if section == 'dna-index':
                    mode = 0
                elif section == 'rna-index':
                    mode = 1
                else:
                    print("error in parameter file: Uknown section '{}' in {}, line {}.".format(
                        section, fname, lineno))
                    sys.exit(1)
            elif mode < 2 and len(line.split(',')) == 2:
                [index, fout] = line.split(',')
                if index in IND[mode]:
                    print("duplicate index '{}' in {}, line {}".format(
                        index, fname, lineno))
                    sys.exit(1)
                IND[mode][index] = fout
            else:
                print("unknown parameter '{}' in {}, line {}".format(
                    line, fname, lineno))
                sys.exit(1)

    # Check parameters
    required = ('bfs', 'dist', 'dna-seqfile', 'rna-seqfile')
    missing = [name for name in required if name not in params]
    for name in missing:
        print("missing parameter in {}: '{}' must be defined.".format(
            fname, name))
    if missing:
        sys.exit(1)

    # Compile flanking sequence
    T7 = seeq.compile(params['bfs'], int(params['dist']))

    # One dictionary of open output files for DNA (0) and RNA (1).
    FDICT = [dict(), dict()]
    try:
        # The output files are opened inside the `try` so that those
        # already open get closed if a later one cannot be opened.
        for kind in (0, 1):
            for outname in IND[kind].values():
                if outname not in FDICT[kind]:
                    FDICT[kind][outname] = gzip.open(outname, 'wb')

        # Demultiplex DNA indices, then RNA indices.
        for kind, seqfile in ((0, 'dna-seqfile'), (1, 'rna-seqfile')):
            with gzopen(params[seqfile]) as f:
                # Read fastq file.
                for lineno, line in enumerate(f):
                    lines[lineno % 4] = line
                    # The record is complete with its fourth line.
                    if lineno % 4 == 3:
                        index = getindex(T7, lines[1])
                        if index in IND[kind]:
                            out = FDICT[kind][IND[kind][index]]
                            for l in lines:
                                out.write(l)
    finally:
        for kind in (0, 1):
            for out in FDICT[kind].values():
                out.close()
def call_starcode_on_fastq_file(fname_fastq):
    ''' Extracts the gDNA,cDNA reads and spikes and runs starcode on them.

    Only the second line of every 4-line fastq record is examined. A
    read matching the GFP pattern goes to the barcode set; otherwise a
    read matching the SPIKE pattern goes to the spike set; other reads
    are dropped. The read is cut at `hit.matchlist[0][0]` (presumably
    the start of the match) and kept only if that position lies between
    MIN_BRCD and MAX_BRCD, both included. `starcode` is run on a set
    only if its output file does not exist yet.

    Arguments:
      fname_fastq: path of the fastq file, opened with `gzopen`.

    Returns:
      The pair (barcode output file name, spike output file name).
    '''

    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
    spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
    # Substitution failed, append the suffix to avoid name collision.
    if brcd_outfname == fname_fastq:
        brcd_outfname = fname_fastq + '_starcode.txt'
    if spk_outfname == fname_fastq:
        spk_outfname = fname_fastq + '_spikes_starcode.txt'

    if os.path.exists(brcd_outfname) and os.path.exists(spk_outfname):
        return (brcd_outfname, spk_outfname)

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fname_fastq) as f:
            for lineno, line in enumerate(f):
                # Take sequence only.
                if lineno % 4 != 1:
                    continue
                hit = GFP.match(line)
                if hit is not None:
                    outf = barcode_tempf
                else:
                    hit = SPIKE.match(line)
                    if hit is None:
                        continue
                    outf = spike_tempf
                pos = hit.matchlist[0][0]
                if MIN_BRCD <= pos <= MAX_BRCD:
                    outf.write(line[:pos] + '\n')

        # Close (and thereby flush) before `starcode` reads the files.
        barcode_tempf.close()
        spike_tempf.close()

        # Skip if file exists.
        if not os.path.exists(brcd_outfname):
            # Call `starcode`.
            subprocess.call([
                'starcode',
                '-t4',
                '-i', barcode_tempf.name,
                '-o', brcd_outfname,
            ])

        if not os.path.exists(spk_outfname):
            subprocess.call([
                'starcode',
                '-t4',
                '-i', spike_tempf.name,
                '-o', spk_outfname,
            ])
    finally:
        # Delete temporary files, also when something failed above.
        # Closing an already closed file is harmless.
        barcode_tempf.close()
        spike_tempf.close()
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    return (brcd_outfname, spk_outfname)
# Cut paired fastq reads at the first GATC site and write them in fasta
# format.
#
# Usage: script <fastq file 1> <fastq file 2>
#
# The output file names are the base names of the inputs where
# '.fastq' / '.fastq.gz' is replaced by 'read1.fasta' / 'read2.fasta';
# they are created in the current directory.
import sys
import re
import seeq
import pdb
import os
#from automata import PatternMatcher
from itertools import izip
from gzopen import gzopen

#pdb.set_trace()

fname1 = sys.argv[1]
fname2 = sys.argv[2]

#hind = seeq.compile('AAGCTAGCTT', 1)
# Exact match of the restriction site (no error allowed).
dpn = seeq.compile('GATC', 0)

# Open 2 files to write
out1 = re.sub(r'.fastq(\.gz)?', 'read1.fasta', os.path.basename(fname1))
out2 = re.sub(r'.fastq(\.gz)?', 'read2.fasta', os.path.basename(fname2))

# We cut in enzyme restriction site GATC (DpnII) and make a fasta file
# Or cut in AAGCTAGCTT (HindIII)
with gzopen(fname1) as f, gzopen(fname2) as g, \
        open(out1,'w') as y, open(out2,'w') as z:
    for lineno, (line1, line2) in enumerate(izip(f, g)):
        # Take sequence only (second line of each 4-line record).
        if lineno % 4 != 1: continue
        # Fall back on the whole read when `matchPrefix` returns
        # nothing (presumably because the read contains no GATC).
        seq1 = dpn.matchPrefix(line1, False) or line1.rstrip()
        seq2 = dpn.matchPrefix(line2, False) or line2.rstrip()
        # Keep the pair only if both fragments are longer than 16.
        if len(seq1) > 16 and len(seq2) > 16:
            # The header is the rank of the record in the fastq file.
            y.write('>%d\n' % (lineno / 4))
            y.write(seq1 + '\n')
            # NOTE(review): nothing is written to `z` in the code
            # visible here, so the second output file stays empty;
            # the script looks truncated -- verify against the
            # complete file.
# Count the sequences found after the COMMON pattern in a fastq file.
#
# Usage: script <fastq file>
#
# One "sequence <tab> count" line is written to the standard output
# for every distinct sequence.
import pdb
import sys
import seeq
from collections import defaultdict

COMMON = seeq.compile('CTAGTTGTGGTTTGTCCAAACTCATCGAGCTCGAGA', 3)
PROMD = defaultdict(int)

with open(sys.argv[1]) as f:
    for lineno, line in enumerate(f):
        # Take sequence only (second line of each 4-line record).
        if lineno % 4 != 1:
            continue
        # Only the part after the pattern is counted. The part before
        # it used to be extracted for every read as well but was never
        # used.
        prom = COMMON.matchSuffix(line.rstrip(), False)
        if prom:
            PROMD[prom] += 1

for prom, count in PROMD.items():
    sys.stdout.write('%s\t%d\n' % (prom, count))
def call_starcode_on_fastq_file(fname_fastq):
    ''' Extracts the gDNA,cDNA reads and spikes and runs starcode on them.

    Only the second line of every 4-line fastq record is examined. A
    read matching the GFP pattern goes to the barcode set; otherwise a
    read matching the SPIKE pattern goes to the spike set; other reads
    are dropped. The read is cut at `hit.matchlist[0][0]` (presumably
    the start of the match) and kept only if that position lies between
    MIN_BRCD and MAX_BRCD, both included. `starcode` is run on a set
    only if its output file does not exist yet.

    Arguments:
      fname_fastq: path of the fastq file, opened with `gzopen`.

    Returns:
      The pair (barcode output file name, spike output file name).
    '''

    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
    spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
    # Substitution failed, append the suffix to avoid name collision.
    if brcd_outfname == fname_fastq:
        brcd_outfname = fname_fastq + '_starcode.txt'
    if spk_outfname == fname_fastq:
        spk_outfname = fname_fastq + '_spikes_starcode.txt'

    if os.path.exists(brcd_outfname) and os.path.exists(spk_outfname):
        return (brcd_outfname, spk_outfname)

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fname_fastq) as f:
            for lineno, line in enumerate(f):
                # Take sequence only.
                if lineno % 4 != 1:
                    continue
                hit = GFP.match(line)
                if hit is not None:
                    outf = barcode_tempf
                else:
                    hit = SPIKE.match(line)
                    if hit is None:
                        continue
                    outf = spike_tempf
                pos = hit.matchlist[0][0]
                if MIN_BRCD <= pos <= MAX_BRCD:
                    outf.write(line[:pos] + '\n')

        # Close (and thereby flush) before `starcode` reads the files.
        barcode_tempf.close()
        spike_tempf.close()

        # Skip if file exists.
        if not os.path.exists(brcd_outfname):
            # Call `starcode`.
            subprocess.call([
                'starcode',
                '-t4',
                '-i', barcode_tempf.name,
                '-o', brcd_outfname,
            ])

        if not os.path.exists(spk_outfname):
            subprocess.call([
                'starcode',
                '-t4',
                '-i', spike_tempf.name,
                '-o', spk_outfname,
            ])
    finally:
        # Delete temporary files, also when something failed above.
        # Closing an already closed file is harmless.
        barcode_tempf.close()
        spike_tempf.close()
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    return (brcd_outfname, spk_outfname)