def hcv_sample(args):
    """ Run the pipeline on one HCV sample paired with its MIDI sample.

    Resolves the main and MIDI FASTQ paths from *args*, prepares scratch
    folders under the results folder, and hands a single sample group to
    process_run().
    """
    main_args = MiCallArgs(args)
    midi_args = MiCallArgs(args, map_midi=True)

    work_dir = os.path.join(args.results_folder, "scratch")
    midi_work_dir = os.path.join(args.results_folder, "scratch_midi")
    makedirs(work_dir)
    # Start the MIDI scratch area from a clean slate; it's recreated later.
    shutil.rmtree(midi_work_dir, ignore_errors=True)

    groups = []
    run_info = RunInfo(groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=args.results_folder,
                       scratch_path=work_dir,
                       is_denovo=args.denovo)
    main_sample = Sample(fastq1=main_args.fastq1,
                         fastq2=main_args.fastq2,
                         bad_cycles_csv=main_args.bad_cycles_csv,
                         scratch_path=work_dir)
    # The MIDI sample reuses the main sample's bad-cycles file.
    midi_sample = Sample(fastq1=midi_args.fastq1,
                         fastq2=midi_args.fastq2,
                         bad_cycles_csv=main_args.bad_cycles_csv,
                         scratch_path=midi_work_dir)
    groups.append(SampleGroup(main_sample, midi_sample))
    process_run(run_info, args)
def single_sample(args):
    """ Run the pipeline on a single sample with no MIDI partner.

    Resolves the FASTQ paths from *args*, prepares a scratch folder under
    the results folder, and hands a one-sample group to process_run().
    """
    resolved = MiCallArgs(args)
    work_dir = os.path.join(args.results_folder, "scratch")
    makedirs(work_dir)

    groups = []
    run_info = RunInfo(groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=args.results_folder,
                       scratch_path=work_dir,
                       is_denovo=args.denovo)
    sample = Sample(fastq1=resolved.fastq1,
                    fastq2=resolved.fastq2,
                    bad_cycles_csv=resolved.bad_cycles_csv,
                    scratch_path=work_dir)
    sample.project_code = args.project_code
    groups.append(SampleGroup(sample))
    process_run(run_info, args)
def link_samples(run_path: str,
                 output_path: str,
                 is_denovo: bool,
                 fastq1s: typing.Optional[typing.Sequence[str]] = None,
                 fastq2s: typing.Optional[typing.Sequence[str]] = None,
                 project_code: typing.Optional[str] = None):
    """ Load the data from a run folder.

    Builds a RunInfo with one SampleGroup per sample (or main/MIDI pair).
    Samples come from one of three places, in priority order:

    1. explicit ``fastq1s``/``fastq2s`` lists,
    2. all FASTQ files in ``run_path`` when no SampleSheet.csv exists
       (sorted and paired alphabetically),
    3. the run's SampleSheet.csv plus its BaseCalls folder.

    :param run_path: MiSeq run folder to read samples from
    :param output_path: results folder; wiped and recreated
    :param is_denovo: True to use de novo assembly instead of mapping
    :param fastq1s: optional explicit forward-read files
    :param fastq2s: optional explicit reverse-read files (required and
        same length if fastq1s is given)
    :param project_code: optional project code copied onto every sample
    :return RunInfo: details about the run and samples
    :raises ValueError: if fastq1s is given without matching fastq2s
    """
    # Start with a clean output tree.
    shutil.rmtree(output_path, ignore_errors=True)
    makedirs(output_path)
    scratch_path = os.path.join(output_path, 'scratch')
    makedirs(scratch_path)

    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    # Read sizes need both RunInfo.xml and the InterOp folder.
    if not (os.path.exists(run_info_path) and os.path.exists(interop_path)):
        read_sizes = None
    else:
        read_sizes = parse_read_sizes(run_info_path)
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes,
                       is_denovo=is_denovo)

    sample_sheet_path = os.path.join(run_path, "SampleSheet.csv")
    if (fastq1s is not None and len(fastq1s) > 0
            or not os.path.exists(sample_sheet_path)):
        if fastq1s is not None and len(fastq1s) > 0:
            # forward files are specified
            if fastq2s is None:
                raise ValueError("Reverse read files must also be specified.")
            elif len(fastq2s) != len(fastq1s):
                raise ValueError(
                    "The same number of forward and reverse read files must be "
                    "specified."
                )
            forward_reverse_pairs = zip(fastq1s, fastq2s)
        else:
            # there is no sample sheet
            # Sort the FASTQ files alphabetically and run them in pairs.
            logger.info(
                "No sample sheet found; running on all FASTQ files in folder {}".format(
                    run_path
                )
            )
            fastq_files = (list(glob(os.path.join(run_path, "*.fastq"))) +
                           list(glob(os.path.join(run_path, "*.fastq.gz"))))
            fastq_files.sort()
            forward_reverse_pairs = []
            for idx in range(0, len(fastq_files), 2):
                forward = fastq_files[idx]
                if idx == len(fastq_files) - 1:
                    # We have an odd number of FASTQ files; ignore this last one.
                    logger.info(
                        "File {} appears extraneous; omitting.".format(forward)
                    )
                    break
                reverse = fastq_files[idx + 1]
                logger.info(
                    "Pairing files {} and {}.".format(forward, reverse)
                )
                forward_reverse_pairs.append((forward, reverse))
        for forward, reverse in forward_reverse_pairs:
            sample = Sample(
                fastq1=os.path.join(run_path, forward),
                fastq2=os.path.join(run_path, reverse),
            )
            sample.project_code = project_code
            sample_groups.append(SampleGroup(sample, midi_sample=None))
    else:
        # a sample sheet is specified
        # Prefer the MiSeq BaseCalls folder, fall back to the run folder.
        fastq_files = list(glob(os.path.join(run_path,
                                             'Data',
                                             'Intensities',
                                             'BaseCalls',
                                             '*_R1_*')) or
                           glob(os.path.join(run_path, '*_R1_*')))
        # NOTE(review): if fastq_files is empty this is [] rather than a
        # path string; only safe because groups will then be empty — verify.
        source_folder = fastq_files and os.path.dirname(fastq_files[0])
        file_names = [os.path.basename(fastq_file)
                      for fastq_file in fastq_files]
        groups = find_groups(file_names, sample_sheet_path)
        for group in groups:
            main_file, midi_file = group.names
            if main_file.startswith('Undetermined'):
                # Skip reads that couldn't be demultiplexed to any sample.
                continue
            main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
            main_sample.project_code = project_code
            if midi_file is None:
                midi_sample = None
            else:
                midi_sample = Sample(fastq1=os.path.join(source_folder,
                                                         midi_file))
                midi_sample.project_code = project_code
            sample_groups.append(SampleGroup(main_sample, midi_sample))

    # Give every sample its rank label, shared bad-cycles file, and its own
    # scratch folder.
    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '{} of {}'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)
    return run_info
def load_samples(data_path):
    """ Load JSON file from the data path, and pull out the arguments for this run.

    Parses the BaseSpace AppSession description, builds a RunInfo with one
    SampleGroup per main/MIDI sample pair, cross-checks sample ids against
    the run, and registers the app result. If the JSON file can't be read,
    a copy is saved for postmortem analysis before re-raising.

    :param str data_path: folder that contains a JSON file in the BaseSpace
        AppSession format.
    :return RunInfo: details about the run and samples
    :raises IOError: if the AppSession file is missing or unreadable
    """
    json_path = os.path.join(data_path, 'input', 'AppSession.json')
    try:
        with open(json_path, 'r') as json_file:
            raw_args = json.load(json_file)
        # Index the AppSession properties by name for easy lookup.
        arg_map = {item['Name']: item
                   for item in raw_args['Properties']['Items']}
        href_app_session = raw_args['Href']

        run = arg_map.get('Input.run-id')
        if run is None:
            # No run attached: skip InterOp error-rate data.
            run_id = interop_path = read_sizes = None
        else:
            run_content = run['Content']
            run_id = run_content['Id']
            interop_path = os.path.join(data_path,
                                        'input',
                                        'runs',
                                        run_id,
                                        'InterOp')
            read_sizes = ReadSizes(
                run_content['SequencingStats']['NumCyclesRead1'],
                run_content['SequencingStats']['NumCyclesRead2'],
                run_content['SequencingStats']['NumCyclesIndex1'],
                run_content['SequencingStats']['NumCyclesIndex2'])
        project_id = arg_map['Input.project-id']['Content']['Id']
        output_path = os.path.join(data_path,
                                   'output',
                                   'appresults',
                                   project_id,
                                   'results')
        makedirs(output_path)
        reports = arg_map['Input.reports']['Items']
        builder_node = arg_map.get('Input.builder')
        if builder_node is None:
            is_denovo = False
        else:
            is_denovo = builder_node['Content'] == 'denovo'
        # NOTE(review): named primer_node but reads 'Input.project_code' —
        # presumably a historical name; it carries the project code.
        primer_node = arg_map.get('Input.project_code')
        if primer_node is None:
            project_code = None
        else:
            project_code = primer_node['Content']
        scratch_path = os.path.join(data_path, 'scratch')
        sample_groups = []
        run_info = RunInfo(sample_groups,
                           reports,
                           interop_path,
                           scratch_path,
                           output_path,
                           read_sizes,
                           href_app_session,
                           is_denovo)
        main_samples = arg_map['Input.sample-ids.main']['Items']
        midi_samples = arg_map['Input.sample-ids.midi']['Items']
        # Main and MIDI sample lists are paired positionally.
        for main_sample_json, midi_sample_json in zip(main_samples,
                                                      midi_samples):
            sample_group = SampleGroup(load_sample(main_sample_json,
                                                   data_path,
                                                   scratch_path,
                                                   project_code),
                                       load_sample(midi_sample_json,
                                                   data_path,
                                                   scratch_path,
                                                   project_code))
            sample_groups.append(sample_group)

        # Do we have run_ids for all sample_ids ?
        if run_id is not None:
            bs = BSrequest()
            all_ids = {s.basespace_id for s in run_info.get_all_samples()}
            sample_id_set = bs.check_run_sample_ids(
                [run_id],
                all_ids)
            if len(sample_id_set) != len(all_ids):
                # Some samples don't belong to this run; disable the
                # run-level error-rate data rather than fail.
                for s in run_info.get_all_samples():
                    if s.basespace_id not in sample_id_set:
                        logger.warning(
                            'Run info not found for %s, skipping error rate data.',
                            s)
                run_info.read_sizes = run_info.interop_path = None
        create_app_result(run_info)
    except IOError:
        if os.path.exists(json_path):
            # copy the input file to the output dir for postmortem analysis
            logger.error("Error occurred while parsing %r.", json_path)
            with open(json_path, 'r') as json_file:
                file_cont = json_file.read()
            out_path = os.path.join(data_path, 'logs', 'AppSession.json')
            with open(out_path, 'w') as json_file:
                json_file.write(file_cont)
        else:
            logger.error("Error: no such file as %r.", json_path)
        raise
    return run_info
def collate_samples(run_info: RunInfo):
    """ Combine all the sample files into run files.

    Concatenates each per-sample CSV into one run-level CSV with a leading
    'sample' column, then moves the per-sample plots and reports into
    shared output folders.

    :param run_info: details of the run and samples
    """
    collated_names = ('remap_counts.csv',
                      'remap_conseq.csv',
                      'conseq_ins.csv',
                      'failed_read.csv',
                      'nuc.csv',
                      'amino.csv',
                      'coord_ins.csv',
                      'conseq.csv',
                      'conseq_all.csv',
                      'conseq_region.csv',
                      'failed_align.csv',
                      'coverage_scores.csv',
                      'g2p.csv',
                      'g2p_summary.csv',
                      'resistance.csv',
                      'mutations.csv',
                      'nuc_mutations.csv',
                      'resistance_fail.csv',
                      'resistance_consensus.csv',
                      'cascade.csv',
                      'merge_lengths.csv')
    for name in collated_names:
        target = os.path.join(run_info.output_path, name)
        with open(target, 'w') as fout:
            writer = csv.writer(fout, lineterminator=os.linesep)
            wrote_header = False
            for sample_info in run_info.get_all_samples():
                source = os.path.join(sample_info.scratch_path, name)
                try:
                    with open(source, 'r') as fin:
                        for row_num, row in enumerate(csv.reader(fin)):
                            if row_num == 0:
                                # Write the header once, from the first
                                # sample file that exists.
                                if not wrote_header:
                                    writer.writerow(['sample'] + row)
                                    wrote_header = True
                            else:
                                writer.writerow([sample_info.name] + row)
                except FileNotFoundError:
                    # A sample may legitimately lack this output file.
                    pass

    resistance_reports_path = os.path.join(run_info.output_path,
                                           'resistance_reports')
    makedirs(resistance_reports_path)
    coverage_maps_path = os.path.join(run_info.output_path, 'coverage_maps')
    genome_coverage_path = os.path.join(coverage_maps_path, 'genome')
    # Creating the genome folder also creates coverage_maps_path.
    makedirs(genome_coverage_path)
    merge_lengths_path = os.path.join(run_info.output_path, 'merge_lengths')
    makedirs(merge_lengths_path)

    for sample_info in run_info.get_all_samples():
        if os.path.exists(sample_info.coverage_maps):
            for map_file in os.listdir(sample_info.coverage_maps):
                safe_file_move(
                    os.path.join(sample_info.coverage_maps, map_file),
                    os.path.join(coverage_maps_path, map_file))
        if os.path.exists(sample_info.contigs_svg):
            safe_file_move(
                sample_info.contigs_svg,
                os.path.join(coverage_maps_path,
                             sample_info.name + '_contigs.svg'))
        if os.path.exists(sample_info.genome_coverage_svg):
            safe_file_move(
                sample_info.genome_coverage_svg,
                os.path.join(genome_coverage_path,
                             sample_info.name + '_genome_coverage.svg'))
        if os.path.exists(sample_info.merge_lengths_svg):
            safe_file_move(
                sample_info.merge_lengths_svg,
                os.path.join(merge_lengths_path,
                             sample_info.name + '_merge_lengths.svg'))
        if os.path.exists(sample_info.resistance_pdf):
            safe_file_move(
                sample_info.resistance_pdf,
                os.path.join(resistance_reports_path,
                             sample_info.name + '_resistance.pdf'))

    try:
        # Remove directory, if it's empty.
        os.rmdir(genome_coverage_path)
    except OSError:
        # Guess it wasn't empty.
        pass
def process(self,
            pssm,
            excluded_seeds=(),
            excluded_projects=(),
            force_gzip=False,
            use_denovo=False):
    """ Process a single sample.

    Runs the full per-sample pipeline in order: trimming, optional
    entropy merging (de novo only), G2P analysis, mapping or de novo
    assembly, alignment, counting, coverage plots, and the cascade
    report. Intermediate files are written under this sample's scratch
    folder.

    :param pssm: the pssm library for running G2P analysis
    :param excluded_seeds: seeds to exclude from mapping
    :param excluded_projects: project codes to exclude from reporting
    :param bool force_gzip: treat FASTQ files as gzipped, even when they
        don't end in .gz
    :param bool use_denovo: True if de novo assembly should be used,
        instead of bowtie2 mapping against references.
    """
    logger.info('Processing %s (%r).', self, self.fastq1)
    scratch_path = self.get_scratch_path()
    makedirs(scratch_path)
    use_gzip = force_gzip or self.fastq1.endswith('.gz')
    sample_info = self.load_sample_info()

    # Trim both reads, recording a summary of what was removed.
    with open(self.read_summary_csv, 'w') as read_summary:
        trim((self.fastq1, self.fastq2),
             self.bad_cycles_csv,
             (self.trimmed1_fastq, self.trimmed2_fastq),
             summary_file=read_summary,
             use_gzip=use_gzip,
             skip=self.skip,
             project_code=sample_info.get('project'))

    if use_denovo:
        # Only de novo runs need read-merge entropy data and its plot.
        logger.info('Running merge_for_entropy on %s.', self)
        with open(self.read_entropy_csv, 'w') as read_entropy_csv:
            merge_for_entropy(self.trimmed1_fastq,
                              self.trimmed2_fastq,
                              read_entropy_csv,
                              scratch_path)
        write_merge_lengths_plot(self.read_entropy_csv,
                                 self.merge_lengths_svg)

    logger.info('Running fastq_g2p on %s.', self)
    with open(self.trimmed1_fastq) as fastq1, \
            open(self.trimmed2_fastq) as fastq2, \
            open(self.g2p_csv, 'w') as g2p_csv, \
            open(self.g2p_summary_csv, 'w') as g2p_summary_csv, \
            open(self.g2p_unmapped1_fastq, 'w') as g2p_unmapped1, \
            open(self.g2p_unmapped2_fastq, 'w') as g2p_unmapped2, \
            open(self.g2p_aligned_csv, 'w') as g2p_aligned_csv, \
            open(self.merged_contigs_csv, 'w') as merged_contigs_csv:
        fastq_g2p(pssm=pssm,
                  fastq1=fastq1,
                  fastq2=fastq2,
                  g2p_csv=g2p_csv,
                  g2p_summary_csv=g2p_summary_csv,
                  unmapped1=g2p_unmapped1,
                  unmapped2=g2p_unmapped2,
                  aligned_csv=g2p_aligned_csv,
                  min_count=DEFAULT_MIN_COUNT,
                  min_valid=MIN_VALID,
                  min_valid_percent=MIN_VALID_PERCENT,
                  merged_contigs_csv=merged_contigs_csv)

    if use_denovo:
        self.run_denovo(excluded_seeds)
    else:
        self.run_mapping(excluded_seeds)

    logger.info('Running sam2aln on %s.', self)
    with open(self.remap_csv) as remap_csv, \
            open(self.aligned_csv, 'w') as aligned_csv, \
            open(self.conseq_ins_csv, 'w') as conseq_ins_csv, \
            open(self.failed_csv, 'w') as failed_csv, \
            open(self.clipping_csv, 'w') as clipping_csv:
        sam2aln(remap_csv,
                aligned_csv,
                conseq_ins_csv,
                failed_csv,
                clipping_csv=clipping_csv)

    logger.info('Running aln2counts on %s.', self)
    if use_denovo:
        contigs_path = self.contigs_csv
    else:
        # Mapping runs have no contigs file; feed an empty stream.
        contigs_path = os.devnull
    with open(self.aligned_csv) as aligned_csv, \
            open(self.g2p_aligned_csv) as g2p_aligned_csv, \
            open(self.clipping_csv) as clipping_csv, \
            open(self.conseq_ins_csv) as conseq_ins_csv, \
            open(self.remap_conseq_csv) as remap_conseq_csv, \
            open(contigs_path) as contigs_csv, \
            open(self.nuc_csv, 'w') as nuc_csv, \
            open(self.nuc_detail_csv, 'w') as nuc_detail_csv, \
            open(self.amino_csv, 'w') as amino_csv, \
            open(self.amino_detail_csv, 'w') as amino_detail_csv, \
            open(self.coord_ins_csv, 'w') as coord_ins_csv, \
            open(self.conseq_csv, 'w') as conseq_csv, \
            open(self.conseq_region_csv, 'w') as conseq_region_csv, \
            open(self.failed_align_csv, 'w') as failed_align_csv, \
            open(self.coverage_summary_csv, 'w') as coverage_summary_csv, \
            open(self.genome_coverage_csv, 'w') as genome_coverage_csv, \
            open(self.conseq_all_csv, "w") as conseq_all_csv, \
            open(self.minimap_hits_csv, "w") as minimap_hits_csv:
        if not use_denovo:
            # Detail files only apply to de novo runs: close and delete
            # the empty ones just created, then pass None instead.
            for f in (amino_detail_csv, nuc_detail_csv):
                f.close()
                os.remove(f.name)
            amino_detail_csv = nuc_detail_csv = None
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   coverage_summary_csv=coverage_summary_csv,
                   clipping_csv=clipping_csv,
                   conseq_ins_csv=conseq_ins_csv,
                   g2p_aligned_csv=g2p_aligned_csv,
                   remap_conseq_csv=remap_conseq_csv,
                   conseq_region_csv=conseq_region_csv,
                   amino_detail_csv=amino_detail_csv,
                   nuc_detail_csv=nuc_detail_csv,
                   genome_coverage_csv=genome_coverage_csv,
                   contigs_csv=contigs_csv,
                   conseq_all_csv=conseq_all_csv,
                   minimap_hits_csv=minimap_hits_csv)

    logger.info('Running coverage_plots on %s.', self)
    os.makedirs(self.coverage_maps)
    with open(self.amino_csv) as amino_csv, \
            open(self.coverage_scores_csv, 'w') as coverage_scores_csv:
        coverage_plot(amino_csv,
                      coverage_scores_csv,
                      coverage_maps_path=self.coverage_maps,
                      coverage_maps_prefix=self.name,
                      excluded_projects=excluded_projects)

    with open(self.genome_coverage_csv) as genome_coverage_csv, \
            open(self.minimap_hits_csv) as minimap_hits_csv:
        if not use_denovo:
            # Minimap hits are only meaningful for de novo assemblies.
            minimap_hits_csv = None
        plot_genome_coverage(genome_coverage_csv,
                             minimap_hits_csv,
                             self.genome_coverage_svg)

    logger.info('Running cascade_report on %s.', self)
    with open(self.g2p_summary_csv) as g2p_summary_csv, \
            open(self.remap_counts_csv) as remap_counts_csv, \
            open(self.aligned_csv) as aligned_csv, \
            open(self.cascade_csv, 'w') as cascade_csv:
        cascade_report = CascadeReport(cascade_csv)
        cascade_report.g2p_summary_csv = g2p_summary_csv
        cascade_report.remap_counts_csv = remap_counts_csv
        cascade_report.aligned_csv = aligned_csv
        cascade_report.generate()
    logger.info('Finished sample %s.', self)