def _subsample_paired(fastq_map):
    """Collect decoded quality scores for selected forward/reverse records.

    Parameters
    ----------
    fastq_map : iterable of (fwd, rev, index)
        ``fwd`` and ``rev`` are handed to ``_read_fastq_seqs``; ``index`` is
        a list of zero-based record positions to keep. Positions are matched
        one at a time in list order, so they must be ascending to all be
        found.

    Returns
    -------
    collections.defaultdict
        ``'forward'`` and ``'reverse'`` keys, each mapping to a list with one
        ``_decode_qual_to_phred33`` result per selected record pair. A key is
        absent when nothing was selected.

    Notes
    -----
    The ``index`` lists are read but never modified.
    """
    qual_sample = collections.defaultdict(list)
    for fwd, rev, index in fastq_map:
        if not index:
            # Nothing was requested from this pair, so skip it entirely.
            continue
        # Walk the requested positions with a cursor rather than popping from
        # the front of the caller's list.
        wanted = iter(index)
        target = next(wanted)
        file_pair = zip(_read_fastq_seqs(fwd), _read_fastq_seqs(rev))
        for i, (fseq, rseq) in enumerate(file_pair):
            if i != target:
                continue
            qual_sample['forward'].append(_decode_qual_to_phred33(fseq[3]))
            qual_sample['reverse'].append(_decode_qual_to_phred33(rseq[3]))
            target = next(wanted, None)
            if target is None:
                # Every requested position was found; stop reading early.
                break
    return qual_sample
def _subsample_single(fastq_map):
    """Collect decoded quality scores for selected records of each file.

    Parameters
    ----------
    fastq_map : iterable of (file, index)
        ``file`` is handed to ``_read_fastq_seqs``; ``index`` is a list of
        zero-based record positions to keep. Positions are matched one at a
        time in list order, so they must be ascending to all be found.

    Returns
    -------
    collections.defaultdict
        A ``'forward'`` key mapping to a list with one
        ``_decode_qual_to_phred33`` result per selected record. The key is
        absent when nothing was selected.

    Notes
    -----
    The ``index`` lists are read but never modified.
    """
    qual_sample = collections.defaultdict(list)
    for file, index in fastq_map:
        if not index:
            # Nothing was requested from this file, so skip it entirely.
            continue
        # Walk the requested positions with a cursor rather than popping from
        # the front of the caller's list.
        wanted = iter(index)
        target = next(wanted)
        for i, seq in enumerate(_read_fastq_seqs(file)):
            if i != target:
                continue
            qual_sample['forward'].append(_decode_qual_to_phred33(seq[3]))
            target = next(wanted, None)
            if target is None:
                # Every requested position was found; stop reading early.
                break
    return qual_sample
def _subsample(fastq_map):
    """Collect decoded quality scores and the shortest selected record length.

    Parameters
    ----------
    fastq_map : iterable of (file, index)
        ``file`` is handed to ``_read_fastq_seqs``; ``index`` is a list of
        zero-based record positions to keep. Positions are matched one at a
        time in list order, so they must be ascending to all be found.

    Returns
    -------
    qual_sample : list
        One ``_decode_qual_to_phred33`` result per selected record, in the
        order encountered.
    min_seq_len : int or float
        Smallest ``len(seq[1])`` over the selected records; stays at
        ``float('inf')`` when no record was selected.

    Notes
    -----
    The ``index`` lists are read but never modified.
    """
    qual_sample = []
    min_seq_len = float('inf')
    for file, index in fastq_map:
        if not index:
            # Nothing was requested from this file, so skip it entirely.
            continue
        # Walk the requested positions with a cursor rather than popping from
        # the front of the caller's list.
        wanted = iter(index)
        target = next(wanted)
        for i, seq in enumerate(_read_fastq_seqs(file)):
            if i != target:
                continue
            min_seq_len = min(min_seq_len, len(seq[1]))
            qual_sample.append(_decode_qual_to_phred33(seq[3]))
            target = next(wanted, None)
            if target is None:
                # Every requested position was found; stop reading early.
                break
    return qual_sample, min_seq_len
def _subsample_single(fastq_map):
    """Collect decoded quality scores and the shortest selected record length.

    Parameters
    ----------
    fastq_map : iterable of (file, index)
        ``file`` is handed to ``_read_fastq_seqs``; ``index`` is a list of
        zero-based record positions to keep. Positions are matched one at a
        time in list order, so they must be ascending to all be found.

    Returns
    -------
    qual_sample : collections.defaultdict
        A ``'forward'`` key mapping to a list with one
        ``_decode_qual_to_phred33`` result per selected record. The key is
        absent when nothing was selected.
    min_seq_len : dict
        ``{'forward': <smallest len(seq[1]) seen>, 'reverse': None}``; the
        forward entry stays at ``float('inf')`` when nothing was selected.

    Notes
    -----
    The ``index`` lists are read but never modified.
    """
    qual_sample = collections.defaultdict(list)
    min_seq_len = {'forward': float('inf'), 'reverse': None}
    for file, index in fastq_map:
        if not index:
            # Nothing was requested from this file, so skip it entirely.
            continue
        # Walk the requested positions with a cursor rather than popping from
        # the front of the caller's list.
        wanted = iter(index)
        target = next(wanted)
        for i, seq in enumerate(_read_fastq_seqs(file)):
            if i != target:
                continue
            min_seq_len['forward'] = min(min_seq_len['forward'], len(seq[1]))
            qual_sample['forward'].append(_decode_qual_to_phred33(seq[3]))
            target = next(wanted, None)
            if target is None:
                # Every requested position was found; stop reading early.
                break
    return qual_sample, min_seq_len
def subsample_paired(sequences: SingleLanePerSamplePairedEndFastqDirFmt,
                     fraction: float
                     ) -> CasavaOneEightSingleLanePerSampleDirFmt:
    """Randomly keep a fraction of the read pairs from every sample.

    For each manifest row, the forward and reverse files are read in lockstep
    via ``_read_fastq_seqs``. A pair is written to both gzip outputs when
    ``random.random() <= fraction``, so the two outputs always contain the
    same pairs in the same order.

    Parameters
    ----------
    sequences
        Input directory format; its manifest is viewed as a DataFrame and its
        ``path`` is where the input files are looked up by basename.
    fraction
        Threshold each per-pair random draw is compared against.

    Returns
    -------
    CasavaOneEightSingleLanePerSampleDirFmt
        A new directory format holding one gzip file per input file, named
        after the input file's basename.
    """
    def _serialize(record):
        # One record becomes its fields joined by newlines, newline-terminated.
        return ('\n'.join(record) + '\n').encode('utf-8')

    result = CasavaOneEightSingleLanePerSampleDirFmt()
    manifest = sequences.manifest.view(pd.DataFrame)

    for _, fwd_path, rev_path in manifest.itertuples():
        fwd_name = os.path.basename(fwd_path)
        rev_name = os.path.basename(rev_path)

        fwd_src = str(sequences.path / fwd_name)
        rev_src = str(sequences.path / rev_name)
        fwd_dst = str(result.path / fwd_name)
        rev_dst = str(result.path / rev_name)

        with gzip.open(fwd_dst, mode='w') as fwd_out, \
                gzip.open(rev_dst, mode='w') as rev_out:
            record_pairs = zip(_read_fastq_seqs(fwd_src),
                               _read_fastq_seqs(rev_src))
            for fwd_rec, rev_rec in record_pairs:
                # A single draw decides the fate of both mates.
                if random.random() <= fraction:
                    fwd_out.write(_serialize(fwd_rec))
                    rev_out.write(_serialize(rev_rec))

    return result
def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
    """Write per-sample sequence counts and a quality-score summary.

    Files written into ``output_dir``:

    * ``per-sample-fastq-counts.csv`` - sequence count per sample.
    * ``demultiplex-summary.png`` / ``.pdf`` - histogram of those counts,
      only when the manifest lists more than one forward file.
    * the rendered ``index.html``, ``overview.html`` and
      ``quality-plot.html`` templates, plus a copy of the ``dist`` assets.
    * ``data.jsonp`` - an ``app.init(...)`` call carrying the subsample size,
      total sequence count, minimum sequence length and the quality
      statistics (forward, then reverse when ``paired``).

    Parameters
    ----------
    output_dir
        Directory that receives every output file.
    data
        Provides ``paired`` and ``directory_format``; the latter's manifest
        is read as CSV with ``sample-id``, ``filename`` and ``direction``
        columns.
    n
        Number of sequences to subsample across all samples. It is capped at
        the total sequence count, and a warning is added to the rendered
        context when that happens.

    NOTE(review): both subsample helpers are unpacked here as
    ``(quality_scores, min_seq_len)`` - confirm that the helper versions in
    scope return that pair.
    """
    paired = data.paired
    data = data.directory_format
    dangers = []
    warnings = []

    manifest = pd.read_csv(os.path.join(str(data), data.manifest.pathspec),
                           header=0, comment='#')
    # Manifest filenames are relative to the directory format; make them
    # usable as paths.
    manifest.filename = manifest.filename.apply(
        lambda x: os.path.join(str(data), x))

    fwd = manifest[manifest.direction == 'forward'].filename.tolist()
    rev = manifest[manifest.direction == 'reverse'].filename.tolist()

    per_sample_fastq_counts = {}
    # Count the reverse reads only when there are no forward reads at all.
    reads = rev if not fwd and rev else fwd
    file_records = []
    for file in reads:
        count = sum(1 for _ in _read_fastq_seqs(file))
        sample_id = manifest.loc[manifest.filename == file,
                                 'sample-id'].iloc[0]
        per_sample_fastq_counts[sample_id] = count
        file_records.append((file, sample_id))

    result = pd.Series(per_sample_fastq_counts)
    result.name = 'Sequence count'
    result.index.name = 'Sample name'
    result.sort_values(inplace=True, ascending=False)
    result.to_csv(os.path.join(output_dir, 'per-sample-fastq-counts.csv'),
                  header=True, index=True)
    # Use a plain int so range()/random.sample() never see a numpy scalar.
    sequence_count = int(result.sum())

    if n > sequence_count:
        n = sequence_count
        warnings.append('A subsample value was provided that is greater than '
                        'the amount of sequences across all samples. The plot '
                        'was generated using all available sequences.')

    # Global record positions to sample, mapped back to per-file positions.
    subsample_ns = sorted(random.sample(range(sequence_count), n))
    link = _link_sample_n_to_file(file_records,
                                  per_sample_fastq_counts,
                                  subsample_ns)
    if paired:
        # The reverse file is found by position, so fwd and rev must be
        # listed in the same sample order.
        sample_map = [(file, rev[fwd.index(file)], link[file])
                      for file in link]
        quality_scores, min_seq_len = _subsample_paired(sample_map)
    else:
        sample_map = [(file, link[file]) for file in link]
        quality_scores, min_seq_len = _subsample_single(sample_map)

    forward_scores = pd.DataFrame(quality_scores['forward'])
    forward_stats = _compute_stats_of_df(forward_scores)

    if (forward_stats.loc['50%'] > 45).any():
        dangers.append('Some of the PHRED quality values are out of range. '
                       'This is likely because an incorrect PHRED offset '
                       'was chosen on import of your raw data. You can learn '
                       'how to choose your PHRED offset during import in the '
                       'importing tutorial.')

    if paired:
        reverse_scores = pd.DataFrame(quality_scores['reverse'])
        reverse_stats = _compute_stats_of_df(reverse_scores)

    show_plot = len(fwd) > 1
    if show_plot:
        # histplot is the maintained replacement for the deprecated distplot.
        ax = sns.histplot(result, kde=False)
        ax.set_xlabel('Number of sequences')
        ax.set_ylabel('Frequency')
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, 'demultiplex-summary.png'))
        fig.savefig(os.path.join(output_dir, 'demultiplex-summary.pdf'))

    html = q2templates.df_to_html(result.to_frame())
    index = os.path.join(TEMPLATES, 'assets', 'index.html')
    overview_template = os.path.join(TEMPLATES, 'assets', 'overview.html')
    quality_template = os.path.join(TEMPLATES, 'assets', 'quality-plot.html')
    context = {
        'result_data': {
            'min': result.min(),
            'median': result.median(),
            'mean': result.mean(),
            'max': result.max(),
            'sum': sequence_count
        },
        'result': html,
        'show_plot': show_plot,
        'paired': paired,
        'tabs': [{
            'title': 'Overview',
            'url': 'overview.html'
        }, {
            'title': 'Interactive Quality Plot',
            'url': 'quality-plot.html'
        }],
        'dangers': dangers,
        'warnings': warnings,
    }
    templates = [index, overview_template, quality_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    # data.jsonp is a script calling app.init(<metadata>, <forward stats>
    # [, <reverse stats>]).
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        json.dump(
            {
                'n': int(n),
                'totalSeqCount': int(sequence_count),
                'minSeqLen': min_seq_len
            }, fh)
        fh.write(',')
        forward_stats.to_json(fh)
        if paired:
            fh.write(',')
            reverse_stats.to_json(fh)
        fh.write(');')
def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
    """Write per-direction sequence counts and quality-score summaries.

    Each manifest column is treated as a read direction. For every direction
    the function counts sequences per sample, subsamples up to ``n`` of them
    and computes quality statistics.

    Files written into ``output_dir``:

    * ``demultiplex-summary-<direction>.png`` / ``.pdf`` - histogram of the
      per-sample counts.
    * ``<direction>-seven-number-summaries.tsv`` - quality statistics, only
      when at least one sequence was subsampled for that direction.
    * ``per-sample-fastq-counts.tsv`` - counts per sample and direction.
    * the rendered ``index.html`` and ``overview.html`` templates, plus
      ``quality-plot.html`` when any direction has statistics, and a copy of
      the ``dist`` assets.
    * ``data.jsonp`` - an ``app.init(...)`` call carrying the subsample sizes,
      total counts, minimum sequence lengths and the forward and reverse
      statistics (``undefined`` where there are none).

    Parameters
    ----------
    output_dir
        Directory that receives every output file.
    data
        Provides ``paired`` and ``directory_format``; the latter's manifest
        is viewed as a DataFrame indexed by sample ID.
    n
        Number of sequences to subsample per direction. It is capped at that
        direction's total count, and a warning is added to the rendered
        context when that happens.
    """
    paired = data.paired
    data = data.directory_format
    summary_columns = ['Minimum', 'Median', 'Mean', 'Maximum', 'Total']

    index = os.path.join(TEMPLATES, 'assets', 'index.html')
    overview_template = os.path.join(TEMPLATES, 'assets', 'overview.html')
    templates = [index, overview_template]

    context = {
        'result_data': pd.DataFrame([], columns=summary_columns),
        'result': pd.DataFrame(),
        'n_samples': {
            'forward': None,
            'reverse': None
        },
        'show_plot': {
            'forward': None,
            'reverse': None
        },
        'paired': paired,
        'tabs': [{
            'title': 'Overview',
            'url': 'overview.html'
        }],
        'dangers': [],
        'warnings': [],
        'length_tables': {
            'forward': None,
            'reverse': None
        },
    }

    manifest = data.manifest.view(pd.DataFrame)
    # Every manifest column is one read direction.
    directions = list(manifest.columns)

    file_records = {'forward': [], 'reverse': []}
    per_sample_fastq_counts = {'forward': {}, 'reverse': {}}
    subsample_size = {'forward': n, 'reverse': n}
    sequence_count = {'forward': None, 'reverse': None}
    links = {'forward': {}, 'reverse': {}}
    qual_stats = {'forward': None, 'reverse': None}
    min_seq_len = {'forward': None, 'reverse': None}

    for sample_id, row in manifest.iterrows():
        for direction in directions:
            filename = row[direction]
            # A sample with no file for this direction shows up as None or
            # NaN in the manifest; skip it.
            if not isinstance(filename, str) and (
                    filename is None or np.isnan(filename)):
                continue

            count = sum(1 for _ in _read_fastq_seqs(filename))
            per_sample_fastq_counts[direction][sample_id] = count
            file_records[direction].append({
                'filename': filename,
                'sample_id': sample_id,
            })

    for direction in directions:
        # Prepare summary
        result = pd.Series(per_sample_fastq_counts[direction])
        result.name = '%s sequence count' % (direction, )
        result.index.name = 'sample ID'
        result.sort_values(inplace=True, ascending=False)

        sequence_count[direction] = int(result.sum())
        if subsample_size[direction] > sequence_count[direction]:
            subsample_size[direction] = sequence_count[direction]
            context['warnings'].append(
                'A subsample value was provided that is greater than the '
                'amount of sequences across all samples for the %s reads. '
                'The plot was generated using all available sequences.' %
                (direction, ))

        # Global record positions to sample, mapped back to per-file
        # positions.
        subsample_ns = sorted(
            random.sample(range(sequence_count[direction]),
                          subsample_size[direction]))
        links[direction] = _link_sample_n_to_file(file_records,
                                                  per_sample_fastq_counts,
                                                  subsample_ns, direction)

        sample_map = list(links[direction].items())
        quality_scores, dir_min_seq_len = _subsample(sample_map)
        min_seq_len[direction] = dir_min_seq_len

        show_plot = len(sample_map) > 0

        ax = sns.histplot(result, kde=False, color='black')
        ax.set_xlabel('Number of sequences')
        ax.set_ylabel('Number of samples')
        fig = ax.get_figure()
        fig.savefig(
            os.path.join(output_dir,
                         'demultiplex-summary-%s.png' % (direction, )))
        fig.savefig(
            os.path.join(output_dir,
                         'demultiplex-summary-%s.pdf' % (direction, )))
        # Reset the figure so the next direction starts from a blank canvas.
        fig.clear()

        df = pd.DataFrame([[
            result.min(),
            result.median(),
            result.mean(),
            result.max(), sequence_count[direction]
        ]],
                          index=['%s reads' % (direction, )],
                          columns=summary_columns)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent row-wise append.
        context['result_data'] = pd.concat([context['result_data'], df])

        html_df = result.to_frame()
        context['result'] = context['result'].join(html_df, how='outer')

        context['n_samples'][direction] = result.count()
        context['show_plot'][direction] = show_plot

        scores = pd.DataFrame(quality_scores)
        if not scores.empty:
            stats = _compute_stats_of_df(scores)
            stats.to_csv(os.path.join(
                output_dir, '%s-seven-number-summaries.tsv' % (direction, )),
                         header=True,
                         index=True,
                         sep='\t')
            length_table = _build_seq_len_table(scores)
            qual_stats[direction] = stats

            if (stats.loc['50%'] > 45).any():
                context['dangers'].append(
                    'Some of the %s PHRED quality values are out of range. '
                    'This is likely because an incorrect PHRED offset was '
                    'chosen on import of your raw data. You can learn how '
                    'to choose your PHRED offset during import in the '
                    'importing tutorial.' % (direction, ))

            context['length_tables'][direction] = length_table

    # The quality-plot tab is only offered when some direction has statistics.
    if qual_stats['forward'] is not None or qual_stats['reverse'] is not None:
        templates.append(os.path.join(TEMPLATES, 'assets',
                                      'quality-plot.html'))
        context['tabs'].append({
            'title': 'Interactive Quality Plot',
            'url': 'quality-plot.html'
        })

    context['result_data'] = \
        q2templates.df_to_html(context['result_data'].transpose())

    # Create a TSV before turning into HTML table
    result_fn = 'per-sample-fastq-counts.tsv'
    result_path = os.path.join(output_dir, result_fn)
    context['result'].to_csv(result_path, header=True, index=True, sep='\t')

    context['result'] = q2templates.df_to_html(context['result'])

    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    # data.jsonp is a script calling app.init(<metadata>, <forward stats>,
    # <reverse stats>); a direction without statistics is written as the
    # literal `undefined`.
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        json.dump(
            {
                'subsampleSize': subsample_size,
                'totalSeqCount': sequence_count,
                'minSeqLen': min_seq_len
            }, fh)
        for direction in ('forward', 'reverse'):
            fh.write(', ')
            stats = qual_stats[direction]
            if stats is not None and not stats.empty:
                stats.to_json(fh)
            else:
                fh.write('undefined')
        fh.write(');')