samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() counts_all = [] for protein in proteins: counts = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, protein, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print protein, samplename sample = SamplePat(sample) # NOTE: How do we find what fragment covers the protein? Well, a # protein can happily cross fragments. Since each # codon is independent, we should iterate over codons. We do not # do that for efficiency reasons. Instead, we identify all potential # fragments and split the protein into full codon chunks covered by # a single fragment. fragment_rois = sample.get_fragments_covered(
default=1, help='Include details on number of reads, length of consensus') parser.add_argument('--submit', action='store_true', help='Execute the script in parallel on the cluster') args = parser.parse_args() seq_runs = args.runs adaIDs = args.adaIDs use_pats = args.use_pats use_interactive = args.interactive detail = args.detail submit = args.submit if submit: fork_self(seq_runs, adaIDs=adaIDs, pats=use_pats, detail=detail) sys.exit() samples_pat = lssp(include_wrong=True) samples = lss() samples = samples.loc[samples['seq run'].isin(seq_runs)] if adaIDs is not None: samples = samples.loc[samples.adapter.isin(adaIDs)] if len(seq_runs) >= 2: samples.sort(columns=['patient sample', 'seq run'], inplace=True) for isa, (samplename, sample) in enumerate(samples.iterrows()): sample = SampleSeq(sample) print sample.name, 'seq:', sample['seq run'], sample.adapter,
if PCR is None: PCRs_sample = (1, 2) else: PCRs_sample = [PCR] for PCR_sample in PCRs_sample: bamfilename = sample.get_mapped_filtered_filename( fragment, PCR=PCR_sample, decontaminated=False) if not os.path.isfile(bamfilename): continue #if check_already_decontaminated(sample, fragment, PCR_sample): # continue fork_self(samplename, fragment, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary, PCR=PCR_sample) sys.exit() for fragment in fragments: consensi = { refname: ''.join(load_custom_reference(refname + '_' + fragment)) for refname in refnames } for samplename, sample in samples.iterrows(): sample = SamplePat(sample) try: consensi[samplename] = sample.get_consensus(fragment, PCR=1) except IOError:
if VERBOSE >= 3: print 'adaID ' + adaID + ': fragments ' + ' '.join( fragments_sample) # Iterate over fragments for fragment in fragments_sample: frag_gen = fragment[:2] # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, frag_gen, VERBOSE=VERBOSE, threads=threads, maxreads=maxreads, filter_reads=filter_reads, summary=summary, rescue=use_rescue) continue if summary: sfn = get_map_summary_filename(data_folder, adaID, frag_gen, rescue=use_rescue) with open(sfn, 'w') as f: f.write('Call: python map_to_consensus.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\
submit = args.submit
summary = args.summary

# Specify the dataset
dataset = MiSeq_runs[seq_run]
data_folder = dataset["folder"]

# Branch to the cluster if required
if submit:
    # If no adaID is specified, use all
    if adaID == 0:
        adaIDs = load_adapter_table(data_folder)["ID"]
    else:
        adaIDs = [adaID]
    for adaID in adaIDs:
        fork_self(seq_run, adaID, VERBOSE=VERBOSE, summary=summary)
    sys.exit()

###########################################################################
# The actual script starts here
###########################################################################
# Open BAM
bamfilename = get_last_mapped(data_folder, adaID, type="bam", filtered=True)

# Try to convert to BAM if needed: when the BAM file is missing, copy every
# record of the SAM file with the same basename into a new BAM file.
if not os.path.isfile(bamfilename):
    samfile = pysam.Samfile(bamfilename[:-3] + "sam", "r")
    try:
        bamfile = pysam.Samfile(bamfilename, "wb", template=samfile)
        try:
            for s in samfile:
                bamfile.write(s)
        finally:
            # The writer must be closed before the file is reopened below,
            # otherwise buffered records may not have reached the disk yet.
            bamfile.close()
    finally:
        samfile.close()

bamfile = pysam.Samfile(bamfilename, "rb")
samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() counts_all = [] for protein in proteins: counts = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, protein, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print protein, samplename sample = SamplePat(sample) # NOTE: How do we find what fragment covers the protein? Well, a # protein can happily cross fragments. Since each # codon is independent, we should iterate over codons. We do not # do that for efficiency reasons. Instead, we identify all potential # fragments and split the protein into full codon chunks covered by # a single fragment. fragment_rois = sample.get_fragments_covered(protein, include_coordinates=True)
for adaID in adaIDs: # If the script is called with no fragment, iterate over all samplename = dataset['samples'][dataset['adapters'].index(adaID)] if not fragments: fragments_sample = samples[samplename]['fragments'] else: from re import findall fragments_all = samples[samplename]['fragments'] fragments_sample = [] for fragment in fragments: frs = filter(lambda x: fragment in x, fragments_all) if len(frs): fragments_sample.append(frs[0]) if VERBOSE >= 3: print 'adaID '+adaID+': fragments '+' '.join(fragments_sample) # Iterate over fragments for fragment in fragments_sample: if submit: fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE, maxreads=maxreads, chunk_size=chunksize) continue split_reads(data_folder, adaID, fragment, chunk_size=chunksize, maxreads=maxreads, VERBOSE=VERBOSE)
fragments_sample = [ fr[:2] for fr in samples[samplename]['fragments'] ] else: fragments_sample = fragments if VERBOSE >= 3: print 'adaID:', adaID + ', fragments:', fragments_sample for fragment in fragments_sample: # Submit to the cluster self if requested if submit: fork_self(data_folder, adaID, fragment, VERBOSE=VERBOSE, summary=summary) continue # Get coverage and counts counts = np.load( get_allele_counts_filename(data_folder, adaID, fragment)) if len(counts.shape) == 2: import warnings warnings.warn( 'Counts not divided by read type: will normalize instead of filter!' ) nu_filtered = 1.0 * counts / counts.sum(axis=0) else:
fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments for (samplename_pat, PCR), samplenames_seq in samples_groups.groups.iteritems(): sample_pat = samples_pat.loc[samplename_pat].copy() samples_seq_group = samples_seq.loc[samples_seq.index.isin(samplenames_seq)] sample_pat.samples_seq = samples_seq_group pname = sample_pat.patient PCR = int(PCR) for fragment in fragments: if submit: fork_self(samplename_pat, fragment, VERBOSE=VERBOSE, n_pairs=n_pairs, PCR=PCR, summary=summary) continue if summary: sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR) with open(sfn, 'w') as f: f.write('Call: python filter_mapped_reads.py'+\ ' --samples '+samplename_pat+\ ' --fragments '+fragment+\ ' --verbose '+str(VERBOSE)) if n_pairs != -1: f.write(' --maxreads '+str(n_pairs)) f.write('\n')
from re import findall fragments_all = samples[samplename]['fragments'] fragments_sample = [] for fragment in fragments: frs = filter(lambda x: fragment in x, fragments_all) if len(frs): fragments_sample.append(frs[0]) if VERBOSE >= 3: print 'adaID ' + adaID + ': fragments ' + ' '.join( fragments_sample) # Iterate over fragments for fragment in fragments_sample: if submit: fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE, maxreads=maxreads, chunk_size=chunksize) continue split_reads(data_folder, adaID, fragment, chunk_size=chunksize, maxreads=maxreads, VERBOSE=VERBOSE)
help='Fork the job to the cluster via qsub') parser.add_argument('--no-savefig', action='store_false', dest='savefig', help='Show figure instead of saving it') args = parser.parse_args() seq_run = args.run VERBOSE = args.verbose submit = args.submit maxreads = args.maxreads adaID = args.adaID savefig = args.savefig if submit: fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, savefig=savefig) sys.exit() dataset = load_sequencing_run(seq_run) data_folder = dataset.folder read_len = dataset.cycles // 2 reads_filenames = get_read_filenames(data_folder, adaID, gzip=True) if not os.path.isfile(reads_filenames[0]): reads_filenames = get_read_filenames(data_folder, adaID, gzip=False) title = seq_run + ', ' + adaID quality = quality_score_along_reads(read_len, reads_filenames, randomreads=(maxreads >= 1), maxreads=maxreads,
submit = args.submit summary = args.summary # Specify the dataset dataset = MiSeq_runs[seq_run] data_folder = dataset['folder'] # Branch to the cluster if required if submit: # If no adaID is specified, use all if adaID == 0: adaIDs = load_adapter_table(data_folder)['ID'] else: adaIDs = [adaID] for adaID in adaIDs: fork_self(seq_run, adaID, VERBOSE=VERBOSE, summary=summary) sys.exit() ########################################################################### # The actual script starts here ########################################################################### # Open BAM bamfilename = get_last_mapped(data_folder, adaID, type='bam', filtered=True) # Try to convert to BAM if needed if not os.path.isfile(bamfilename): samfile = pysam.Samfile(bamfilename[:-3] + 'sam', 'r') bamfile = pysam.Samfile(bamfilename, 'wb', template=samfile) for s in samfile:
elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments if submit: for fragment in fragments: for samplename, sample in samples.iterrows(): fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min, PCR=PCR, maxreads=maxreads, use_tests=use_tests) sys.exit() counts_all = [] for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): sample = SamplePat(sample) pname = sample.patient if VERBOSE >= 2: print pname, fragment, samplename refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')
fragments_sample_gen) if frg in fragments] if VERBOSE >= 3: print 'adaID '+adaID+': fragments '+' '.join(fragments_sample) # Iterate over fragments for fragment in fragments_sample: frag_gen = fragment[:2] # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, frag_gen, VERBOSE=VERBOSE, threads=threads, maxreads=maxreads, filter_reads=filter_reads, summary=summary, rescue=use_rescue) continue if summary: sfn = get_map_summary_filename(data_folder, adaID, frag_gen, rescue=use_rescue) with open(sfn, 'w') as f: f.write('Call: python map_to_consensus.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+frag_gen+\ ' --threads '+str(threads)+\ ' --verbose '+str(VERBOSE)) if maxreads != -1:
ind |= samples.index.isin(samples_run.index) samples = samples.loc[ind] for i, (samplename, sample) in enumerate(samples.iterrows()): sample = SampleSeq(sample) seq_run = sample['seq run'] data_folder = sample.sequencing_run.folder adaID = sample.adapter fragments = sample.regions_complete if VERBOSE: print seq_run, adaID if submit: fork_self(samplename, maxreads=maxreads, VERBOSE=VERBOSE) continue if titles is not None: title = titles[i] else: title = None (counts, inserts) = check_premap(data_folder, adaID, fragments, seq_run, samplename, maxreads=maxreads, VERBOSE=VERBOSE, title=title)
# If the script is called with no adaID, iterate over all samples = dataset.samples if adaIDs is not None: samples = samples.loc[samples.adapter.isin(adaIDs)] if VERBOSE >= 2: print samples.index.tolist() # Iterate over all adaIDs for samplename, sample in samples.iterrows(): adaID = str(sample.adapter) # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, VERBOSE=VERBOSE, threads=threads, reference=refname, summary=summary) continue if summary: with open(get_trim_summary_filename(data_folder, adaID), 'w') as f: f.write('Call: python trim_reads_lowq.py --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --threads '+str(threads)+\ ' --reference '+refname+\ ' --verbose '+str(VERBOSE)+'\n') trim_reads(data_folder, adaID, VERBOSE=VERBOSE, summary=summary)
parser.add_argument(
    '--submit',
    action='store_true',
    default=False,
    help='Fork the job to the cluster via qsub')
parser.add_argument(
    '--no-summary',
    action='store_false',
    dest='summary',
    help='Do not save results in a summary file')

# Unpack the command line into plain names
args = parser.parse_args()
seq_run = args.run
VERBOSE = args.verbose
maxreads = args.maxreads
submit = args.submit
summary = args.summary

# If submit, outsource to the cluster and stop here
if submit:
    fork_self(seq_run,
              VERBOSE=VERBOSE,
              maxreads=maxreads,
              summary=summary)
    sys.exit()

# Specify the dataset
dataset = MiSeq_runs[seq_run]
data_folder = dataset['folder']

# Write the equivalent command line to the demultiplex summary file
if summary:
    summary_filename = get_demultiplex_summary_filename(data_folder)
    with open(summary_filename, 'w') as f:
        call = ('Call: python demultiplex.py --run ' + seq_run +
                ' --verbose ' + str(VERBOSE) + '\n')
        f.write(call)

adapters_designed = get_adapters_designed(
    dataset, VERBOSE=VERBOSE, summary=summary)
make_output_folders(
    data_folder, adapters_designed, VERBOSE=VERBOSE, summary=summary)
if VERBOSE >= 2: print 'samples', samples.index.tolist() if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments if submit: for fragment in fragments: for samplename, sample in samples.iterrows(): fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min, PCR=PCR, maxreads=maxreads, use_tests=use_tests) sys.exit() counts_all = [] for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): sample = SamplePat(sample) pname = sample.patient if VERBOSE >= 2: print pname, fragment, samplename
if VERBOSE >= 3: print 'fragments', fragments counts_all = [] for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): if VERBOSE >= 1: print samplename, fragment, if VERBOSE >= 2: print '' if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, PCR=PCR, block_len=block_len, n_reads_per_ali=n_reads_per_ali) continue sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read( get_initial_reference_filename(pname, fragment), 'fasta') refm = np.array(refseq) len_reference = len(refseq) # NOTE: we need consensi to decontaminate, so bamfilename = sample.get_mapped_filtered_filename( fragment, PCR=PCR, decontaminated=(not use_raw_reads)) if not os.path.isfile(bamfilename):
elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments for fragment in fragments: inses = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print fragment, samplename sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR) if not os.path.isfile(fn): warn('No BAM file found', NoDataWarning) continue _, inse = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
# If the input file if missing, skip input_filename = get_input_filename(sample.seqrun_folder, sample.adapter, sample.convert_region(fragment), type='bam', only_chunk=only_chunk, filtered=filtered) if not os.path.isfile(input_filename): if VERBOSE: print 'WARNING: input file not found' continue if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, threads=threads, n_pairs=n_pairs, summary=summary, only_chunks=[only_chunk], filtered=filtered) continue if summary: sfn = get_map_initial_summary_filename(pname, samplename_pat, samplename, fragment, PCR=PCR, only_chunk=only_chunk) with open(sfn, 'w') as f: f.write('Call: python map_to_initial_reference.py'+\ ' --samples '+samplename+\ ' --fragments '+fragment+\ ' --threads '+str(threads)+\ ' --verbose '+str(VERBOSE)) if n_pairs != -1:
use_save = args.save patients = load_patients() if pnames is not None: patients = patients.loc[pnames] data = [] for pname, patient in patients.iterrows(): if VERBOSE >= 1: print patient.code, start, end if submit: fork_self(patient.code, width, gap, start, end, VERBOSE=VERBOSE, freqmin=freqmin, countmin=countmin) continue patient = Patient(patient) ref = patient.get_reference('genomewide') L = len(ref) win_start = start while win_start + width - gap < min(L, end): win_end = min(win_start + width, end, L) if VERBOSE >= 1: print patient.code, win_start, win_end
print 'fragments', fragments for (samplename_pat, PCR), samplenames_seq in samples_groups.groups.iteritems(): sample_pat = samples_pat.loc[samplename_pat].copy() samples_seq_group = samples_seq.loc[samples_seq.index.isin( samplenames_seq)] sample_pat.samples_seq = samples_seq_group pname = sample_pat.patient PCR = int(PCR) for fragment in fragments: if submit: fork_self(samplename_pat, fragment, VERBOSE=VERBOSE, n_pairs=n_pairs, PCR=PCR, summary=summary) continue if summary: sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR) with open(sfn, 'w') as f: f.write('Call: python filter_mapped_reads.py'+\ ' --samples '+samplename_pat+\ ' --fragments '+fragment+\ ' --verbose '+str(VERBOSE)) if n_pairs != -1:
fragments_sample = [] for fragment in fragments: frs = filter(lambda x: fragment in x, fragments_all) if len(frs): fragments_sample.append(frs[0]) if VERBOSE >= 3: print 'adaID '+adaID+': fragments '+' '.join(fragments_sample) make_output_folders(data_folder, adaID, VERBOSE=VERBOSE) for fragment in fragments_sample: # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, fragment, n_reads, iterations_max, VERBOSE=VERBOSE) continue if summary: sfn = get_build_consensus_summary_filename(data_folder, adaID, fragment) with open(sfn, 'w') as f: f.write('Call: python build_consensus_iterative.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+fragment+\ ' --iterations '+str(iterations_max)+\ ' -n '+str(n_reads)+\ ' --verbose '+str(VERBOSE)) f.write('\n')
# Iterate over all requested samples for samplename, sample in samples.iterrows(): sample = SampleSeq(sample) adaID = sample.adapter if not fragments: fragments_sample = sample.regions_generic else: fragments_sample = sorted( set(fragments) & set(sample.regions_generic)) for fragment in fragments_sample: # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE) continue counts, inserts = get_allele_counts(data_folder, adaID, fragment, VERBOSE=VERBOSE) write_counts_files(data_folder, adaID, fragment, counts, inserts, VERBOSE=VERBOSE) if summary: plot_coverage(data_folder,
for samplename, sample in samples_focal.iterrows(): sample = SamplePat(sample) if PCR is None: PCRs_sample = (1, 2) else: PCRs_sample = [PCR] for PCR_sample in PCRs_sample: bamfilename = sample.get_mapped_filtered_filename(fragment, PCR=PCR_sample, decontaminated=False) if not os.path.isfile(bamfilename): continue # if check_already_decontaminated(sample, fragment, PCR_sample): # continue fork_self(samplename, fragment, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary, PCR=PCR_sample) sys.exit() for fragment in fragments: consensi = {refname: "".join(load_custom_reference(refname + "_" + fragment)) for refname in refnames} for samplename, sample in samples.iterrows(): sample = SamplePat(sample) try: consensi[samplename] = sample.get_consensus(fragment, PCR=1) except IOError: print samplename, "file not found" continue for samplename, sample in samples_focal.iterrows(): sample = SamplePat(sample)
if VERBOSE >= 1: print fragment # There is a blacklist of samples which are probably contaminated, # we want to discard those altogether contstr = sample['suspected contamination'] if pd.notnull(contstr) and (fragment in contstr): print 'WARNING: This sample has a suspected contamination! Skipping.' continue # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE, summary=summary, maxreads=maxreads, max_mismatches=max_mismatches, susp_mismatches=susp_mismatches) continue if summary: sfn = get_filter_mapped_summary_filename( data_folder, adaID, fragment) with open(sfn, 'w') as f: f.write('Call: python filter_mapped_reads.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+fragment+\ ' --max-mismatches '+str(max_mismatches)+\ ' --suspicious-mismatches '+str(susp_mismatches)+\
samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments for fragment in fragments: inses = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print fragment, samplename sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read( get_initial_reference_filename(pname, fragment), 'fasta') fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR) if not os.path.isfile(fn): warn('No BAM file found', NoDataWarning) continue
ind |= samples.index.isin(samples_run.index) samples = samples.loc[ind] for i, (samplename, sample) in enumerate(samples.iterrows()): sample = SampleSeq(sample) seq_run = sample['seq run'] data_folder = sample.sequencing_run.folder adaID = sample.adapter fragments = sample.regions_complete if VERBOSE: print seq_run, adaID if submit: fork_self(samplename, maxreads=maxreads, VERBOSE=VERBOSE) continue if titles is not None: title = titles[i] else: title = None (counts, inserts) = check_premap(data_folder, adaID, fragments, seq_run, samplename, maxreads=maxreads, VERBOSE=VERBOSE, title=title) if show and (not submit) and (counts is not None): plt.ion()
if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments counts_all = [] for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): if VERBOSE >= 1: print samplename, fragment, if VERBOSE >= 2: print '' if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, PCR=PCR, block_len=block_len, n_reads_per_ali=n_reads_per_ali) continue sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') refm = np.array(refseq) len_reference = len(refseq) # NOTE: we need consensi to decontaminate, so bamfilename = sample.get_mapped_filtered_filename(fragment, PCR=PCR, decontaminated=(not use_raw_reads)) if not os.path.isfile(bamfilename): continue
samples = samples.loc[samples.adapter.isin(adaIDs)] if VERBOSE >= 2: print samples.index.tolist() # Iterate over all adaIDs for samplename, sample in samples.iterrows(): adaID = str(sample.adapter) # Submit to the cluster self if requested if submit: fork_self( seq_run, adaID, VERBOSE=VERBOSE, threads=threads, reference=refname, summary=summary, trimmed=use_trimmed, subsrate=subsrate, gapopen=gapopen, gapextend=gapextend, maxreads=maxreads) continue make_output_folders( data_folder, adaID, VERBOSE=VERBOSE, summary=summary) if summary: with open(get_premap_summary_filename(data_folder, adaID), 'w') as f: outstr = 'Call: python premap_to_reference.py --run '+seq_run+\ ' --adaIDs '+adaID+\
# Iterate over all requested samples
for samplename, sample in samples.iterrows():
    sample = SampleSeq(sample)
    adaID = sample.adapter

    # Without a fragment request take every generic region of the sample;
    # otherwise keep the requested ones the sample has, in sorted order.
    if fragments:
        fragments_sample = sorted(
            set(fragments).intersection(sample.regions_generic))
    else:
        fragments_sample = sample.regions_generic

    for fragment in fragments_sample:
        # Submit to the cluster self if requested
        if submit:
            fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE)
            continue

        counts, inserts = get_allele_counts(
            data_folder, adaID, fragment, VERBOSE=VERBOSE)
        write_counts_files(
            data_folder, adaID, fragment, counts, inserts, VERBOSE=VERBOSE)

        if summary:
            plot_coverage(data_folder, adaID, fragment, counts,
                          VERBOSE=VERBOSE, savefig=True)

        # Frequencies are derived from the counts only on request
        if write_frequencies:
            nu_filtered = filter_nus(counts)
            write_frequency_files(
                data_folder, adaID, fragment, nu_filtered, VERBOSE=VERBOSE)

            if summary:
                plot_SFS_folded(data_folder, adaID, fragment, nu_filtered,
                                VERBOSE=VERBOSE, savefig=True)
if len(frs): fragments_sample.append(frs[0]) if VERBOSE >= 3: print 'adaID ' + adaID + ': fragments ' + ' '.join( fragments_sample) make_output_folders(data_folder, adaID, VERBOSE=VERBOSE) for fragment in fragments_sample: # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, fragment, n_reads, iterations_max, VERBOSE=VERBOSE) continue if summary: sfn = get_build_consensus_summary_filename( data_folder, adaID, fragment) with open(sfn, 'w') as f: f.write('Call: python build_consensus_iterative.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+fragment+\ ' --iterations '+str(iterations_max)+\ ' -n '+str(n_reads)+\ ' --verbose '+str(VERBOSE))
# Specify the dataset dataset = load_sequencing_run(seq_run) data_folder = dataset.folder # If the script is called with no adaID, iterate over all samples = dataset.samples if adaIDs is not None: samples = samples.loc[samples.adapter.isin(adaIDs)] if VERBOSE >= 2: print samples.index.tolist() # Iterate over all adaIDs for samplename, sample in samples.iterrows(): adaID = str(sample.adapter) # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, VERBOSE=VERBOSE, threads=threads, reference=refname, summary=summary) continue if summary: with open(get_trim_summary_filename(data_folder, adaID), 'w') as f: f.write('Call: python trim_reads_lowq.py --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --threads '+str(threads)+\ ' --reference '+refname+\ ' --verbose '+str(VERBOSE)+'\n') trim_reads(data_folder, adaID, VERBOSE=VERBOSE, summary=summary)
if VERBOSE >= 3: print 'adaIDs', adaIDs # If the script is called with no fragment, iterate over all if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments # Iterate over all requested samples for adaID in adaIDs: for fragment in fragments: # Submit to the cluster self if requested if submit: fork_self(data_folder, adaID, fragment, VERBOSE=VERBOSE, summary=summary) continue # Get cocounts cocounts = get_coallele_counts(data_folder, adaID, fragment, VERBOSE=VERBOSE) ## Check using the allele counts and the diagonal cocounts #counts, _ = get_allele_counts(data_folder, adaID, fragment, # VERBOSE=VERBOSE, # maxreads=2 * maxreads) #cocount = cocounts.sum(axis=0) #count = counts.sum(axis=0) ## Read reference
fragments_sample.append(frs[0]) if 'genomewide' in fragments: fragments_sample.append('genomewide') if VERBOSE >= 3: print 'adaID ' + adaID + ': fragments ' + ' '.join( fragments_sample) for fragment in fragments_sample: # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, fragment, block_len_initial, n_reads_per_ali, store_allele_counts, VERBOSE=VERBOSE) continue if summary: sfn = get_build_consensus_summary_filename(data_folder, adaID, fragment, iterative=False) with open(sfn, 'w') as f: f.write('Call: python build_consensus.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+fragment+\
fragments_sample = [] for fragment in fragments: frs = filter(lambda x: fragment in x, fragments_all) if len(frs): fragments_sample.append(frs[0]) if 'genomewide' in fragments: fragments_sample.append('genomewide') if VERBOSE >= 3: print 'adaID '+adaID+': fragments '+' '.join(fragments_sample) for fragment in fragments_sample: # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, fragment, block_len_initial, n_reads_per_ali, store_allele_counts, VERBOSE=VERBOSE) continue if summary: sfn = get_build_consensus_summary_filename(data_folder, adaID, fragment, iterative=False) with open(sfn, 'w') as f: f.write('Call: python build_consensus.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+fragment+\ ' --block-length '+str(block_len_initial)+\ ' --reads-per-alignment '+str(n_reads_per_ali)+\ ' --verbose '+str(VERBOSE)) if store_allele_counts: f.write(' --allele-counts')
PCR = int(sample.PCR) if PCR == 1: PCR_suffix = 'o' elif PCR ==2: PCR_suffix = 'i' else: raise ValueError('PCR should be only 1 or 2') fragments = [str('F'+fr+PCR_suffix) for fr in sample.regions.split(' ')] adaID = sample.adapter # Submit to the cluster self if requested if submit: if include_tests: raise ValueError('Tests require an interactive shell') fork_self(seq_run, adaID, VERBOSE=VERBOSE, maxreads=maxreads, minisize=minisize, summary=summary) continue make_output_folders(data_folder, adaID, VERBOSE=VERBOSE) if summary: with open(get_divide_summary_filename(data_folder, adaID), 'w') as f: f.write('Call: python trim_and_divide.py --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --minisize '+str(minisize)+\ ' --verbose '+str(VERBOSE)) if maxreads != -1: f.write(' --maxreads '+str(maxreads)) if include_tests: f.write(' --include_tests') f.write('\n')
help='Fork the job to the cluster via qsub') parser.add_argument('--no-summary', action='store_false', dest='summary', help='Do not save results in a summary file') args = parser.parse_args() seq_run = args.run VERBOSE = args.verbose maxreads = args.maxreads submit = args.submit summary = args.summary # If submit, outsource to the cluster if submit: fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary) sys.exit() # Specify the dataset dataset = MiSeq_runs[seq_run] data_folder = dataset['folder'] if summary: with open(get_demultiplex_summary_filename(data_folder), 'w') as f: f.write('Call: python demultiplex.py --run ' + seq_run + ' --verbose ' + str(VERBOSE) + '\n') adapters_designed = get_adapters_designed(dataset, VERBOSE=VERBOSE, summary=summary)
samples = samples.loc[samples.adapter.isin(adaIDs)] if VERBOSE >= 2: print samples.index.tolist() # Iterate over all adaIDs for samplename, sample in samples.iterrows(): adaID = str(sample.adapter) # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, VERBOSE=VERBOSE, threads=threads, reference=refname, summary=summary, trimmed=use_trimmed, subsrate=subsrate, gapopen=gapopen, gapextend=gapextend, maxreads=maxreads) continue make_output_folders(data_folder, adaID, VERBOSE=VERBOSE, summary=summary) if summary: with open(get_premap_summary_filename(data_folder, adaID), 'w') as f:
parser.add_argument(
    "--adaID",
    required=True,
    help="Adapter ID to analyze (e.g. TS2)")
parser.add_argument(
    "--verbose",
    type=int,
    default=0,
    help="Verbosity level [0-3]")
parser.add_argument(
    "--maxreads",
    type=int,
    default=-1,
    help="Maximal number of reads to analyze")
parser.add_argument(
    "--submit",
    action="store_true",
    default=False,
    help="Fork the job to the cluster via qsub")
parser.add_argument(
    "--no-savefig",
    action="store_false",
    dest="savefig",
    help="Show figure instead of saving it")

# Unpack the command line into plain names
args = parser.parse_args()
seq_run = args.run
VERBOSE = args.verbose
submit = args.submit
maxreads = args.maxreads
adaID = args.adaID
savefig = args.savefig

# When submit is set, fork the job and stop here.
# NOTE(review): adaID is not forwarded to fork_self in this call - confirm
# that this is intended.
if submit:
    fork_self(seq_run,
              VERBOSE=VERBOSE,
              maxreads=maxreads,
              savefig=savefig)
    sys.exit()

dataset = load_sequencing_run(seq_run)
data_folder = dataset.folder
read_len = dataset.cycles // 2

# Ask for the gzipped read files first; if the first one does not exist,
# ask again with gzip=False.
reads_filenames = get_read_filenames(data_folder, adaID, gzip=True)
if not os.path.isfile(reads_filenames[0]):
    reads_filenames = get_read_filenames(data_folder, adaID, gzip=False)

title = seq_run + ", " + adaID

# Random read selection is switched on only when a positive maxreads is given
use_random_reads = (maxreads >= 1)
quality = quality_score_along_reads(read_len, reads_filenames,
                                    randomreads=use_random_reads,
                                    maxreads=maxreads,
                                    VERBOSE=VERBOSE)
countmin = args.countmin submit = args.submit use_plot = args.plot use_save = args.save patients = load_patients() if pnames is not None: patients = patients.loc[pnames] data = [] for pname, patient in patients.iterrows(): if VERBOSE >= 1: print patient.code, start, end if submit: fork_self(patient.code, width, gap, start, end, VERBOSE=VERBOSE, freqmin=freqmin, countmin=countmin) continue patient = Patient(patient) ref = patient.get_reference('genomewide') L = len(ref) win_start = start while win_start + width - gap < min(L, end): win_end = min(win_start + width, end, L) if VERBOSE >= 1: print patient.code, win_start, win_end if VERBOSE >= 2: print 'Get region haplotypes'
for fragment in fragments_sample: if VERBOSE >= 1: print fragment # There is a blacklist of samples which are probably contaminated, # we want to discard those altogether contstr = sample['suspected contamination'] if pd.notnull(contstr) and (fragment in contstr): print 'WARNING: This sample has a suspected contamination! Skipping.' continue # Submit to the cluster self if requested if submit: fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE, summary=summary, maxreads=maxreads, max_mismatches=max_mismatches, susp_mismatches=susp_mismatches) continue if summary: sfn = get_filter_mapped_summary_filename(data_folder, adaID, fragment) with open(sfn, 'w') as f: f.write('Call: python filter_mapped_reads.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+fragment+\ ' --max-mismatches '+str(max_mismatches)+\ ' --suspicious-mismatches '+str(susp_mismatches)+\ ' --verbose '+str(VERBOSE)) if maxreads > 0: f.write(' --maxreads '+str(maxreads))