def genreport_rerun(source, working):
    """ Rerun the resistance reports for the HCV groups of one run.

    split_files(source, working) supplies the working paths; their base names
    are grouped by find_groups() using the run's sample sheet, restricted to
    the 'HCV' project. For every group, report_resistance() and gen_report()
    write resistance.csv, mutations.csv, resistance_fail.csv, and
    resistance_report.pdf into the main sample's working folder. The three
    CSV files of all groups are then combined under
    working/rerun_results/<run name>, with a leading 'sample' column.

    :param source: path used to locate the run: the run folder is taken to be
        two levels above it, and the sample sheet is read from
        source/../../SampleSheet.csv
    :param working: folder that holds the per-sample working folders and
        receives the combined results
    :raises OSError: if the publish folder already exists, because
        os.makedirs() is called without exist_ok
    """
    run_path = os.path.dirname(os.path.dirname(source))
    run_name = os.path.basename(run_path)
    publish_path = os.path.join(working, 'rerun_results', run_name)
    os.makedirs(publish_path)
    print('##', run_name)
    working_paths = split_files(source, working)
    sorted_file_names = sorted(os.path.basename(working_path)
                               for working_path in working_paths)
    sample_sheet_path = os.path.join(source, '../../SampleSheet.csv')
    groups = list(
        find_groups(sorted_file_names,
                    sample_sheet_path,
                    included_projects={'HCV'}))
    for group in groups:
        working_path = os.path.join(working, group.names[0])
        if group.names[1] is None:
            # No MIDI sample in this group: reuse the main sample's folder.
            midi_name = ''
            midi_path = working_path
        else:
            midi_name = group.names[1]
            midi_path = os.path.join(working, group.names[1])
            if not os.path.isdir(midi_path):
                # The MIDI sample has no working folder, so fall back to the
                # main sample and flag it in the progress output.
                midi_name = 'failed MidHCV'
                midi_path = working_path
        print(working_path, midi_name)
        with open(os.path.join(working_path, 'amino.csv')) as amino_csv, \
                open(os.path.join(midi_path, 'amino.csv')) as midi_amino_csv, \
                open(os.path.join(working_path, 'resistance.csv'), 'w') as resistance_csv, \
                open(os.path.join(working_path, 'mutations.csv'), 'w') as mutations_csv, \
                open(os.path.join(working_path, 'resistance_fail.csv'), 'w') as resistance_fail_csv:
            report_resistance(amino_csv, midi_amino_csv, resistance_csv,
                              mutations_csv, resistance_fail_csv)
        sample_name = os.path.basename(working_path)
        with open(os.path.join(working_path, 'resistance.csv')) as resistance_csv, \
                open(os.path.join(working_path, 'mutations.csv')) as mutations_csv, \
                open(os.path.join(working_path, 'resistance_report.pdf'), 'wb') as resistance_report_pdf:
            gen_report(resistance_csv,
                       mutations_csv,
                       resistance_report_pdf,
                       sample_name=sample_name)

    # Combine each kind of CSV file across all the groups.
    for file_name in ('resistance.csv', 'mutations.csv',
                      'resistance_fail.csv'):
        # newline='' is what the csv module requires of its files: it stops
        # '\r\r\n' line endings on Windows and keeps newlines embedded in
        # quoted fields intact.
        with open(os.path.join(publish_path, file_name),
                  'w',
                  newline='') as dest:
            dest_writer = csv.writer(dest)
            for group_num, group in enumerate(groups):
                working_path = os.path.join(working, group.names[0])
                sample_name = os.path.basename(working_path)
                # Named source_csv so it does not shadow the source parameter.
                with open(os.path.join(working_path, file_name),
                          'r',
                          newline='') as source_csv:
                    for row_num, row in enumerate(csv.reader(source_csv)):
                        if row_num != 0:
                            row.insert(0, sample_name)
                        elif group_num == 0:
                            # Only the first group's header row is copied.
                            row.insert(0, 'sample')
                        else:
                            continue
                        dest_writer.writerow(row)
Exemple #2
0
def test_find_groups_checks_overrides(tmpdir):
    """ A project in SampleSheetOverrides.csv replaces the sample's project. """
    run_folder = Path(str(tmpdir))
    sample_sheet_path = run_folder / 'SampleSheet.csv'
    sample_sheet_text = BASIC_HEADER + """\
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,,,ACGTACGT,TGCATGCA,,,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,,,AAAAGGGG,CCCCTTTT,,,
"""
    sample_sheet_path.write_text(sample_sheet_text)

    # The overrides file sits beside the sample sheet.
    overrides_path = run_folder / 'SampleSheetOverrides.csv'
    overrides_text = """\
sample,project
Sample1-Proj1_S1,AltProjA
"""
    overrides_path.write_text(overrides_text)

    fastq_names = ['Sample1-Proj1_S1_L001_R1_001.fastq.gz',
                   'Sample2-Proj2_S2_L001_R1_001.fastq.gz']
    overridden_group = SampleGroup(
        'Sample1',
        ('Sample1-Proj1_S1_L001_R1_001.fastq.gz', None),
        ('AltProjA', None))
    untouched_group = SampleGroup(
        'Sample2',
        ('Sample2-Proj2_S2_L001_R1_001.fastq.gz', None),
        ('Proj2', None))
    expected_groups = [overridden_group, untouched_group]

    groups = list(find_groups(fastq_names, sample_sheet_path))

    assert expected_groups == groups
Exemple #3
0
def link_samples(run_path, data_path):
    """ Load the data from a run folder into the BaseSpace layout.

    The data folder is deleted and rebuilt with an 'output' symlink to the
    run's Results/basespace folder and a 'scratch' folder. Samples are
    grouped by find_groups() from the run's *_R1_* files and sample sheet.

    :param run_path: the run folder to read from
    :param data_path: the data folder to rebuild
    :return: the RunInfo object describing the run and its sample groups
    """

    # Start from an empty data folder.
    shutil.rmtree(data_path, ignore_errors=True)
    makedirs(data_path)

    results_path = os.path.join(run_path, 'Results', 'basespace')
    makedirs(results_path)
    output_path = os.path.join(data_path, 'output')
    os.symlink(results_path, output_path)
    scratch_path = os.path.join(data_path, 'scratch')
    makedirs(scratch_path)

    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    has_run_details = (os.path.exists(run_info_path) and
                       os.path.exists(interop_path))
    read_sizes = parse_read_sizes(run_info_path) if has_run_details else None

    # run_info keeps a reference to this list, so the groups appended below
    # are visible through it.
    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes)

    # Look in the BaseCalls folder first, then in the run folder itself.
    fastq_files = glob(os.path.join(run_path,
                                    'Data',
                                    'Intensities',
                                    'BaseCalls',
                                    '*_R1_*'))
    if not fastq_files:
        fastq_files = glob(os.path.join(run_path, '*_R1_*'))
    fastq_files = list(fastq_files)
    if fastq_files:
        source_folder = os.path.dirname(fastq_files[0])
    else:
        source_folder = fastq_files
    file_names = [os.path.basename(fastq_file) for fastq_file in fastq_files]
    sample_sheet_path = os.path.join(run_path, 'SampleSheet.csv')
    for group in find_groups(file_names, sample_sheet_path):
        main_file, midi_file = group.names
        if main_file.startswith('Undetermined'):
            continue
        main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
        midi_sample = None
        if midi_file is not None:
            midi_sample = Sample(fastq1=os.path.join(source_folder, midi_file))
        sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for position, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '({} of {})'.format(position, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)

    return run_info
def find_full_groups(fastq_files, sandbox_path):
    """ Group FASTQ files, then expand each file name to a sandbox path.

    :param fastq_files: path objects whose .name attributes are grouped
    :param sandbox_path: folder path object that holds SampleSheet.csv and is
        used as the parent of every grouped file name
    :return: a list of SampleGroup objects with the same enum and project
        codes as find_groups() reported, but with names joined to
        sandbox_path; empty names are left as they are
    """
    file_names = [fastq_file.name for fastq_file in fastq_files]
    sample_sheet_path = sandbox_path / 'SampleSheet.csv'
    name_groups = list(find_groups(file_names, sample_sheet_path))
    return [
        SampleGroup(
            name_group.enum,
            tuple((sandbox_path / name) if name else name
                  for name in name_group.names),
            name_group.project_codes)
        for name_group in name_groups]
Exemple #5
0
def find_sample_groups(run_path, base_calls_path):
    """ Find a run's sample groups, in descending order of sample number.

    Groups are ordered by get_sample_number() of each group's first file name.
    If anything fails, the error is logged, an "errorprocessing" file is
    written in the run folder, and an empty list is returned.

    :param run_path: path object for the run folder holding SampleSheet.csv
    :param base_calls_path: path object for the folder to search for
        *_R1_*.fastq.gz files
    :return: a list of sample groups, possibly empty
    """
    # noinspection PyBroadException
    try:
        file_names = [fastq_file.name
                      for fastq_file in base_calls_path.glob("*_R1_*.fastq.gz")]
        sample_groups = sorted(
            find_groups(file_names, run_path / "SampleSheet.csv"),
            key=lambda group: get_sample_number(group.names[0]),
            reverse=True)
    except Exception:
        logger.error("Finding sample groups in %s", run_path, exc_info=True)
        error_marker = run_path / "errorprocessing"
        error_marker.write_text("Finding sample groups failed.\n")
        sample_groups = []
    return sample_groups
def find_sample_groups(run_path, base_calls_path):
    """ List the sample groups of a run, largest sample number first.

    The sort key is get_sample_number() applied to the first file name of
    each group. Any exception is logged and recorded in an "errorprocessing"
    file in the run folder, and then an empty list is returned.

    :param run_path: path object for the run folder holding SampleSheet.csv
    :param base_calls_path: path object for the folder to search for
        *_R1_*.fastq.gz files
    :return: a list of sample groups, possibly empty
    """

    def sample_number(group):
        return get_sample_number(group.names[0])

    # noinspection PyBroadException
    try:
        matches = base_calls_path.glob("*_R1_*.fastq.gz")
        sheet_path = run_path / "SampleSheet.csv"
        names = [match.name for match in matches]
        found_groups = list(find_groups(names, sheet_path))
        found_groups.sort(key=sample_number, reverse=True)
    except Exception:
        logger.error("Finding sample groups in %s", run_path, exc_info=True)
        (run_path / "errorprocessing").write_text(
            "Finding sample groups failed.\n")
        return []
    return found_groups
Exemple #7
0
def test_unmatched_midi_file_not_found(tmpdir):
    """ A MIDI entry in the sample sheet with no FASTQ file adds no group. """
    sample_rows = """\
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1MIDI_MidHCV,Sample1MIDI_MidHCV,,,ACGTACGT,TGCATGCA,,,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,,,AAAAGGGG,CCCCTTTT,,,
"""
    sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv'
    sample_sheet_path.write_text(BASIC_HEADER + sample_rows)

    # Only the Sample2 file is supplied, so only Sample2 is expected.
    fastq_names = ['Sample2-Proj2_S2_L001_R1_001.fastq.gz']
    expected_groups = [
        SampleGroup('Sample2',
                    ('Sample2-Proj2_S2_L001_R1_001.fastq.gz', None),
                    ('Proj2', None))]

    groups = list(find_groups(fastq_names, sample_sheet_path))

    assert expected_groups == groups
Exemple #8
0
def test_combine_midi(tmpdir):
    """ A main HCV sample and its MIDI sample land in a single group. """
    sample_rows = """\
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_HCV,Sample1_HCV,,,ACGTACGT,TGCATGCA,,,
CFE_SomeId_10-Jul-2014_N501-N702_Sample1MIDI_MidHCV,Sample1MIDI_MidHCV,,,AAAAGGGG,CCCCTTTT,,,
"""
    sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv'
    sample_sheet_path.write_text(BASIC_HEADER + sample_rows)

    fastq_names = ['Sample1-HCV_S1_L001_R1_001.fastq.gz',
                   'Sample1MIDI-MidHCV_S2_L001_R1_001.fastq.gz']
    combined_group = SampleGroup(
        'Sample1',
        ('Sample1-HCV_S1_L001_R1_001.fastq.gz',
         'Sample1MIDI-MidHCV_S2_L001_R1_001.fastq.gz'),
        ('HCV', 'MidHCV'))
    expected_groups = [combined_group]

    groups = list(find_groups(fastq_names, sample_sheet_path))

    assert expected_groups == groups
Exemple #9
0
def link_samples(
        run_path: str,
        output_path: str,
        is_denovo: bool,
        fastq1s: typing.Optional[typing.Sequence[str]] = None,
        fastq2s: typing.Optional[typing.Sequence[str]] = None,
        project_code: typing.Optional[str] = None):
    """ Load the data from a run folder.

    Samples come from one of three places, checked in this order:

    1. the explicit fastq1s / fastq2s pairs, when fastq1s is not empty;
    2. every *.fastq and *.fastq.gz file directly in run_path, sorted by name
       and paired off, when run_path has no SampleSheet.csv;
    3. the groups that find_groups() builds from the sample sheet and the
       run's *_R1_* files.

    :param run_path: the run folder to read from
    :param output_path: folder for the results; it is deleted and recreated
        with a 'scratch' subfolder
    :param is_denovo: passed through to RunInfo
    :param fastq1s: forward read files, relative to run_path or absolute
    :param fastq2s: reverse read files, in the same order as fastq1s
    :param project_code: assigned to every sample that gets created
    :return: the RunInfo object describing the run and its sample groups
    :raises ValueError: if fastq1s is given without a fastq2s of equal length
    """

    shutil.rmtree(output_path, ignore_errors=True)
    makedirs(output_path)

    scratch_path = os.path.join(output_path, 'scratch')
    makedirs(scratch_path)

    # run_info keeps a reference to this list, so the groups appended below
    # are visible through it.
    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    if not (os.path.exists(run_info_path) and os.path.exists(interop_path)):
        read_sizes = None
    else:
        read_sizes = parse_read_sizes(run_info_path)
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes,
                       is_denovo=is_denovo)

    sample_sheet_path = os.path.join(run_path, "SampleSheet.csv")
    has_fastq_args = fastq1s is not None and len(fastq1s) > 0
    if has_fastq_args or not os.path.exists(sample_sheet_path):
        if has_fastq_args:  # forward files are specified
            if fastq2s is None:
                raise ValueError("Reverse read files must also be specified.")
            if len(fastq2s) != len(fastq1s):
                raise ValueError(
                    "The same number of forward and reverse read files must be "
                    "specified."
                )
            # Relative names are resolved against the run folder; absolute
            # paths pass through os.path.join() unchanged.
            forward_reverse_pairs = [
                (os.path.join(run_path, forward),
                 os.path.join(run_path, reverse))
                for forward, reverse in zip(fastq1s, fastq2s)]

        else:  # there is no sample sheet
            # Sort the FASTQ files alphabetically and run them in pairs.
            logger.info(
                "No sample sheet found; running on all FASTQ files in folder %s",
                run_path)
            fastq_files = (list(glob(os.path.join(run_path, "*.fastq")))
                           + list(glob(os.path.join(run_path, "*.fastq.gz"))))
            fastq_files.sort()
            # The glob results already start with run_path, so they are used
            # as they are. Joining them to run_path a second time doubled a
            # relative run_path, e.g. data/run/data/run/x.fastq.
            forward_reverse_pairs = []
            for forward, reverse in zip(fastq_files[::2], fastq_files[1::2]):
                logger.info("Pairing files %s and %s.", forward, reverse)
                forward_reverse_pairs.append((forward, reverse))
            if len(fastq_files) % 2 == 1:
                # We have an odd number of FASTQ files; ignore this last one.
                logger.info("File %s appears extraneous; omitting.",
                            fastq_files[-1])

        for forward, reverse in forward_reverse_pairs:
            sample = Sample(fastq1=forward, fastq2=reverse)
            sample.project_code = project_code
            sample_groups.append(SampleGroup(sample, midi_sample=None))

    else:  # a sample sheet is specified
        # Look in the BaseCalls folder first, then in the run folder itself.
        fastq_files = list(glob(os.path.join(run_path,
                                             'Data',
                                             'Intensities',
                                             'BaseCalls',
                                             '*_R1_*')) or
                           glob(os.path.join(run_path,
                                             '*_R1_*')))
        source_folder = fastq_files and os.path.dirname(fastq_files[0])
        file_names = [os.path.basename(fastq_file) for fastq_file in fastq_files]
        groups = find_groups(file_names, sample_sheet_path)
        for group in groups:
            main_file, midi_file = group.names
            if main_file.startswith('Undetermined'):
                continue
            main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
            main_sample.project_code = project_code
            if midi_file is None:
                midi_sample = None
            else:
                midi_sample = Sample(fastq1=os.path.join(source_folder, midi_file))
                midi_sample.project_code = project_code
            sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '{} of {}'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)

    return run_info
Exemple #10
0
def genreport_rerun(source, working):
    """ Rerun the resistance reports for the HCV groups of one run.

    split_files(source, working) supplies the working paths; their base names
    are grouped by find_groups() using the run's sample sheet, restricted to
    the 'HCV' project. For every group, report_resistance() and gen_report()
    write resistance.csv, mutations.csv, resistance_fail.csv, and
    resistance_report.pdf into the main sample's working folder. The three
    CSV files of all groups are then combined under
    working/rerun_results/<run name>, with a leading 'sample' column.

    :param source: path used to locate the run: the run folder is taken to be
        two levels above it, and the sample sheet is read from
        source/../../SampleSheet.csv
    :param working: folder that holds the per-sample working folders and
        receives the combined results
    :raises OSError: if the publish folder already exists, because
        os.makedirs() is called without exist_ok
    """
    run_path = os.path.dirname(os.path.dirname(source))
    run_name = os.path.basename(run_path)
    publish_path = os.path.join(working,
                                'rerun_results',
                                run_name)
    os.makedirs(publish_path)
    print('##', run_name)
    working_paths = split_files(source, working)
    sorted_file_names = sorted(os.path.basename(working_path)
                               for working_path in working_paths)
    sample_sheet_path = os.path.join(source, '../../SampleSheet.csv')
    groups = list(find_groups(sorted_file_names,
                              sample_sheet_path,
                              included_projects={'HCV'}))
    for group in groups:
        working_path = os.path.join(working, group.names[0])
        if group.names[1] is None:
            # No MIDI sample in this group: reuse the main sample's folder.
            midi_name = ''
            midi_path = working_path
        else:
            midi_name = group.names[1]
            midi_path = os.path.join(working, group.names[1])
            if not os.path.isdir(midi_path):
                # The MIDI sample has no working folder, so fall back to the
                # main sample and flag it in the progress output.
                midi_name = 'failed MidHCV'
                midi_path = working_path
        print(working_path, midi_name)
        with open(os.path.join(working_path, 'amino.csv')) as amino_csv, \
                open(os.path.join(midi_path, 'amino.csv')) as midi_amino_csv, \
                open(os.path.join(working_path, 'resistance.csv'), 'w') as resistance_csv, \
                open(os.path.join(working_path, 'mutations.csv'), 'w') as mutations_csv, \
                open(os.path.join(working_path, 'resistance_fail.csv'), 'w') as resistance_fail_csv:
            report_resistance(amino_csv,
                              midi_amino_csv,
                              resistance_csv,
                              mutations_csv,
                              resistance_fail_csv)
        sample_name = os.path.basename(working_path)
        with open(os.path.join(working_path, 'resistance.csv')) as resistance_csv, \
                open(os.path.join(working_path, 'mutations.csv')) as mutations_csv, \
                open(os.path.join(working_path, 'resistance_report.pdf'), 'wb') as resistance_report_pdf:
            gen_report(resistance_csv,
                       mutations_csv,
                       resistance_report_pdf,
                       sample_name=sample_name)

    # Combine each kind of CSV file across all the groups.
    for file_name in ('resistance.csv', 'mutations.csv', 'resistance_fail.csv'):
        # newline='' is what the csv module requires of its files: it stops
        # '\r\r\n' line endings on Windows and keeps newlines embedded in
        # quoted fields intact.
        publish_file_path = os.path.join(publish_path, file_name)
        with open(publish_file_path, 'w', newline='') as dest:
            dest_writer = csv.writer(dest)
            for group_num, group in enumerate(groups):
                working_path = os.path.join(working, group.names[0])
                sample_name = os.path.basename(working_path)
                group_file_path = os.path.join(working_path, file_name)
                # Named source_csv so it does not shadow the source parameter.
                with open(group_file_path, 'r', newline='') as source_csv:
                    for row_num, row in enumerate(csv.reader(source_csv)):
                        if row_num != 0:
                            row.insert(0, sample_name)
                        elif group_num == 0:
                            # Only the first group's header row is copied.
                            row.insert(0, 'sample')
                        else:
                            continue
                        dest_writer.writerow(row)