def genreport_rerun(source, working):
    """ Rerun the HCV resistance reports for one run and publish combined CSVs. """
    run_path = os.path.dirname(os.path.dirname(source))
    run_name = os.path.basename(run_path)
    publish_path = os.path.join(working, 'rerun_results', run_name)
    os.makedirs(publish_path)
    print('##', run_name)
    working_paths = split_files(source, working)
    file_names = (os.path.basename(working_path)
                  for working_path in working_paths)
    sorted_file_names = sorted(file_names)
    sample_sheet_path = os.path.join(source, '../../SampleSheet.csv')
    groups = list(find_groups(sorted_file_names,
                              sample_sheet_path,
                              included_projects={'HCV'}))
    for group in groups:
        working_path = os.path.join(working, group.names[0])
        if group.names[1] is None:
            # No MIDI sample in this group: reuse the main sample's folder.
            midi_name = ''
            midi_path = working_path
        else:
            midi_name = group.names[1]
            midi_path = os.path.join(working, group.names[1])
            if not os.path.isdir(midi_path):
                # The MIDI sample failed to produce results.
                midi_name = 'failed MidHCV'
                midi_path = working_path
        print(working_path, midi_name)
        with open(os.path.join(working_path, 'amino.csv')) as amino_csv, \
                open(os.path.join(midi_path, 'amino.csv')) as midi_amino_csv, \
                open(os.path.join(working_path, 'resistance.csv'), 'w') as resistance_csv, \
                open(os.path.join(working_path, 'mutations.csv'), 'w') as mutations_csv, \
                open(os.path.join(working_path, 'resistance_fail.csv'), 'w') as resistance_fail_csv:
            report_resistance(amino_csv,
                              midi_amino_csv,
                              resistance_csv,
                              mutations_csv,
                              resistance_fail_csv)
        sample_name = os.path.basename(working_path)
        with open(os.path.join(working_path, 'resistance.csv')) as resistance_csv, \
                open(os.path.join(working_path, 'mutations.csv')) as mutations_csv, \
                open(os.path.join(working_path, 'resistance_report.pdf'), 'wb') as resistance_report_csv:
            gen_report(resistance_csv,
                       mutations_csv,
                       resistance_report_csv,
                       sample_name=sample_name)

    # Combine each sample's CSV files into one published file per report,
    # adding a leading 'sample' column and writing the header only once.
    for file_name in ('resistance.csv', 'mutations.csv', 'resistance_fail.csv'):
        with open(os.path.join(publish_path, file_name), 'w') as dest:
            dest_writer = csv.writer(dest)
            for i, group in enumerate(groups):
                working_path = os.path.join(working, group.names[0])
                sample_name = os.path.basename(working_path)
                with open(os.path.join(working_path, file_name), 'r') as source_csv:
                    source_reader = csv.reader(source_csv)
                    for j, row in enumerate(source_reader):
                        if j != 0:
                            row.insert(0, sample_name)
                        elif i == 0:
                            row.insert(0, 'sample')
                        else:
                            # Skip repeated header rows from later samples.
                            continue
                        dest_writer.writerow(row)
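A minimal call sketch for the function above; the paths are hypothetical placeholders. Since the run name is taken two directory levels above `source` and `SampleSheet.csv` is expected at that run root, `source` should point at a results subfolder such as `<run>/Results/<version>`.

# Hypothetical locations; replace with a real run's results folder and a scratch area.
source_folder = '/data/runs/190101_M01234/Results/version_X'
working_folder = '/tmp/genreport_rerun_scratch'

# Reruns the HCV resistance reports and publishes combined resistance.csv,
# mutations.csv and resistance_fail.csv under
# <working_folder>/rerun_results/190101_M01234/.
genreport_rerun(source_folder, working_folder)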
def test_find_groups_checks_overrides(tmpdir):
    sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv'
    sample_sheet_path.write_text(BASIC_HEADER + """\
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,,,ACGTACGT,TGCATGCA,,,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,,,AAAAGGGG,CCCCTTTT,,,
""")
    sample_sheet_overrides_path = sample_sheet_path.parent / 'SampleSheetOverrides.csv'
    sample_sheet_overrides_path.write_text("""\
sample,project
Sample1-Proj1_S1,AltProjA
""")
    expected_groups = [
        SampleGroup('Sample1',
                    ('Sample1-Proj1_S1_L001_R1_001.fastq.gz', None),
                    ('AltProjA', None)),
        SampleGroup('Sample2',
                    ('Sample2-Proj2_S2_L001_R1_001.fastq.gz', None),
                    ('Proj2', None))]

    groups = list(find_groups(['Sample1-Proj1_S1_L001_R1_001.fastq.gz',
                               'Sample2-Proj2_S2_L001_R1_001.fastq.gz'],
                              sample_sheet_path))

    assert expected_groups == groups
def link_samples(run_path, data_path):
    """ Load the data from a run folder into the BaseSpace layout. """
    shutil.rmtree(data_path, ignore_errors=True)
    makedirs(data_path)

    results_path = os.path.join(run_path, 'Results', 'basespace')
    makedirs(results_path)
    output_path = os.path.join(data_path, 'output')
    os.symlink(results_path, output_path)
    scratch_path = os.path.join(data_path, 'scratch')
    makedirs(scratch_path)

    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    if not (os.path.exists(run_info_path) and os.path.exists(interop_path)):
        read_sizes = None
    else:
        read_sizes = parse_read_sizes(run_info_path)
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes)

    fastq_files = list(glob(os.path.join(run_path,
                                         'Data',
                                         'Intensities',
                                         'BaseCalls',
                                         '*_R1_*')) or
                       glob(os.path.join(run_path, '*_R1_*')))
    source_folder = fastq_files and os.path.dirname(fastq_files[0])
    file_names = [os.path.basename(fastq_file) for fastq_file in fastq_files]
    groups = find_groups(file_names,
                         os.path.join(run_path, 'SampleSheet.csv'))
    for group in groups:
        main_file, midi_file = group.names
        if main_file.startswith('Undetermined'):
            continue
        main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
        if midi_file is None:
            midi_sample = None
        else:
            midi_sample = Sample(fastq1=os.path.join(source_folder, midi_file))
        sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '({} of {})'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)
    return run_info
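A hedged usage sketch for the loader above; both paths are hypothetical, and the loop only touches attributes that the function itself assigns or reads (`rank`, `name`, `scratch_path`).

run_folder = '/data/runs/190101_M01234'   # hypothetical run folder with SampleSheet.csv
data_folder = '/tmp/basespace_data'       # hypothetical BaseSpace-style data folder

run_info = link_samples(run_folder, data_folder)
for sample in run_info.get_all_samples():
    # link_samples() has already assigned rank and scratch_path to each sample.
    print(sample.rank, sample.name, sample.scratch_path)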
def find_full_groups(fastq_files, sandbox_path):
    groups = list(find_groups([p.name for p in fastq_files],
                              sandbox_path / 'SampleSheet.csv'))
    full_groups = []
    for group in groups:
        full_names = tuple(name and (sandbox_path / name)
                           for name in group.names)
        full_groups.append(SampleGroup(group.enum,
                                       full_names,
                                       group.project_codes))
    return full_groups
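A small illustration of the result shape, assuming `fastq_files` is a list of `pathlib.Path` objects and `sandbox_path` holds the matching `SampleSheet.csv`; the file names below are invented.

from pathlib import Path

sandbox = Path('/tmp/sandbox')   # hypothetical sandbox containing SampleSheet.csv
fastqs = [sandbox / 'Sample1-HCV_S1_L001_R1_001.fastq.gz',
          sandbox / 'Sample1MIDI-MidHCV_S2_L001_R1_001.fastq.gz']

for group in find_full_groups(fastqs, sandbox):
    # group.names now holds full Path objects (or None for a missing MIDI file).
    print(group.enum, group.project_codes, group.names)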
def find_sample_groups(run_path, base_calls_path):
    # noinspection PyBroadException
    try:
        fastq_files = base_calls_path.glob("*_R1_*.fastq.gz")
        sample_sheet_path = run_path / "SampleSheet.csv"
        file_names = [f.name for f in fastq_files]
        sample_groups = list(find_groups(file_names, sample_sheet_path))
        sample_groups.sort(key=lambda group: get_sample_number(group.names[0]),
                           reverse=True)
    except Exception:
        logger.error("Finding sample groups in %s", run_path, exc_info=True)
        (run_path / "errorprocessing").write_text(
            "Finding sample groups failed.\n")
        sample_groups = []
    return sample_groups
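`get_sample_number` is not defined in this section; a plausible sketch, assuming it pulls the numeric part of the `_S<number>_` token that appears in names like `Sample1-HCV_S1_L001_R1_001.fastq.gz`, might look like this.

import re


def get_sample_number(fastq_name):
    # Hypothetical helper: extract the sample number from the `_S<number>_`
    # token, falling back to 0 when the token is missing.
    match = re.search(r'_S(\d+)_', fastq_name)
    return int(match.group(1)) if match else 0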
def test_unmatched_midi_file_not_found(tmpdir):
    sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv'
    sample_sheet_path.write_text(BASIC_HEADER + """\
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1MIDI_MidHCV,Sample1MIDI_MidHCV,,,ACGTACGT,TGCATGCA,,,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,,,AAAAGGGG,CCCCTTTT,,,
""")
    expected_groups = [
        SampleGroup('Sample2',
                    ('Sample2-Proj2_S2_L001_R1_001.fastq.gz', None),
                    ('Proj2', None))]

    groups = list(find_groups(['Sample2-Proj2_S2_L001_R1_001.fastq.gz'],
                              sample_sheet_path))

    assert expected_groups == groups
def test_combine_midi(tmpdir):
    sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv'
    sample_sheet_path.write_text(BASIC_HEADER + """\
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_HCV,Sample1_HCV,,,ACGTACGT,TGCATGCA,,,
CFE_SomeId_10-Jul-2014_N501-N702_Sample1MIDI_MidHCV,Sample1MIDI_MidHCV,,,AAAAGGGG,CCCCTTTT,,,
""")
    expected_groups = [
        SampleGroup('Sample1',
                    ('Sample1-HCV_S1_L001_R1_001.fastq.gz',
                     'Sample1MIDI-MidHCV_S2_L001_R1_001.fastq.gz'),
                    ('HCV', 'MidHCV'))]

    groups = list(find_groups(['Sample1-HCV_S1_L001_R1_001.fastq.gz',
                               'Sample1MIDI-MidHCV_S2_L001_R1_001.fastq.gz'],
                              sample_sheet_path))

    assert expected_groups == groups
def link_samples(run_path: str,
                 output_path: str,
                 is_denovo: bool,
                 fastq1s: typing.Sequence[str] = None,
                 fastq2s: typing.Sequence[str] = None,
                 project_code: str = None):
    """ Load the data from a run folder. """
    shutil.rmtree(output_path, ignore_errors=True)
    makedirs(output_path)

    scratch_path = os.path.join(output_path, 'scratch')
    makedirs(scratch_path)

    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    if not (os.path.exists(run_info_path) and os.path.exists(interop_path)):
        read_sizes = None
    else:
        read_sizes = parse_read_sizes(run_info_path)
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes,
                       is_denovo=is_denovo)

    sample_sheet_path = os.path.join(run_path, "SampleSheet.csv")
    if (fastq1s is not None and len(fastq1s) > 0
            or not os.path.exists(sample_sheet_path)):
        if fastq1s is not None and len(fastq1s) > 0:
            # Forward files are specified explicitly.
            if fastq2s is None:
                raise ValueError("Reverse read files must also be specified.")
            elif len(fastq2s) != len(fastq1s):
                raise ValueError(
                    "The same number of forward and reverse read files must "
                    "be specified.")
            forward_reverse_pairs = zip(fastq1s, fastq2s)
        else:
            # There is no sample sheet:
            # sort the FASTQ files alphabetically and run them in pairs.
            logger.info(
                "No sample sheet found; running on all FASTQ files in "
                "folder {}".format(run_path))
            fastq_files = (list(glob(os.path.join(run_path, "*.fastq"))) +
                           list(glob(os.path.join(run_path, "*.fastq.gz"))))
            fastq_files.sort()
            forward_reverse_pairs = []
            for idx in range(0, len(fastq_files), 2):
                forward = fastq_files[idx]
                if idx == len(fastq_files) - 1:
                    # We have an odd number of FASTQ files; ignore this last one.
                    logger.info(
                        "File {} appears extraneous; omitting.".format(forward))
                    break
                reverse = fastq_files[idx + 1]
                logger.info("Pairing files {} and {}.".format(forward, reverse))
                forward_reverse_pairs.append((forward, reverse))
        for forward, reverse in forward_reverse_pairs:
            sample = Sample(fastq1=os.path.join(run_path, forward),
                            fastq2=os.path.join(run_path, reverse))
            sample.project_code = project_code
            sample_groups.append(SampleGroup(sample, midi_sample=None))
    else:
        # A sample sheet is specified.
        fastq_files = list(glob(os.path.join(run_path,
                                             'Data',
                                             'Intensities',
                                             'BaseCalls',
                                             '*_R1_*')) or
                           glob(os.path.join(run_path, '*_R1_*')))
        source_folder = fastq_files and os.path.dirname(fastq_files[0])
        file_names = [os.path.basename(fastq_file) for fastq_file in fastq_files]
        groups = find_groups(file_names, sample_sheet_path)
        for group in groups:
            main_file, midi_file = group.names
            if main_file.startswith('Undetermined'):
                continue
            main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
            main_sample.project_code = project_code
            if midi_file is None:
                midi_sample = None
            else:
                midi_sample = Sample(fastq1=os.path.join(source_folder,
                                                         midi_file))
                midi_sample.project_code = project_code
            sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '{} of {}'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)
    return run_info
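A hedged call sketch for the explicit FASTQ-list branch of the function above; the folder, file names, and project code are placeholders.

run_info = link_samples(
    run_path='/data/runs/manual_fastqs',   # hypothetical folder holding the FASTQ files
    output_path='/tmp/pipeline_output',
    is_denovo=False,
    fastq1s=['sampleA_R1.fastq.gz', 'sampleB_R1.fastq.gz'],
    fastq2s=['sampleA_R2.fastq.gz', 'sampleB_R2.fastq.gz'],
    project_code='HCV')
for sample in run_info.get_all_samples():
    # fastq1 comes from the Sample constructor call inside link_samples().
    print(sample.rank, sample.fastq1)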