Example #1
0
def link_samples(run_path, data_path):
    """ Rebuild the data folder for a run, and describe the run's samples.

    The data folder is deleted and created again. It gets an ``output``
    symlink that points at ``Results/basespace`` under the run folder, and
    an empty ``scratch`` folder.

    :param run_path: run folder that holds the FASTQ files and SampleSheet.csv
    :param data_path: folder to rebuild in the BaseSpace layout
    :return: RunInfo with a SampleGroup for each group reported by
        find_groups(), except those whose main file name starts with
        'Undetermined'
    """
    shutil.rmtree(data_path, ignore_errors=True)
    makedirs(data_path)

    # Results are written in the run folder, and linked into the data folder.
    results_path = os.path.join(run_path, 'Results', 'basespace')
    makedirs(results_path)
    output_path = os.path.join(data_path, 'output')
    os.symlink(results_path, output_path)
    scratch_path = os.path.join(data_path, 'scratch')
    makedirs(scratch_path)

    # Read sizes are only parsed when RunInfo.xml and InterOp both exist.
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    if os.path.exists(run_info_path) and os.path.exists(interop_path):
        read_sizes = parse_read_sizes(run_info_path)
    else:
        read_sizes = None

    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes)

    # Prefer the sequencer's folder layout, then fall back to the run folder.
    read1_pattern = '*_R1_*'
    fastq_files = glob(os.path.join(run_path,
                                    'Data',
                                    'Intensities',
                                    'BaseCalls',
                                    read1_pattern))
    if not fastq_files:
        fastq_files = glob(os.path.join(run_path, read1_pattern))
    # Only used when at least one group is found below.
    source_folder = (os.path.dirname(fastq_files[0])
                     if fastq_files
                     else fastq_files)
    file_names = list(map(os.path.basename, fastq_files))
    sample_sheet_path = os.path.join(run_path, 'SampleSheet.csv')
    for group in find_groups(file_names, sample_sheet_path):
        main_file, midi_file = group.names
        if main_file.startswith('Undetermined'):
            continue
        main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
        midi_sample = None
        if midi_file is not None:
            midi_sample = Sample(fastq1=os.path.join(source_folder, midi_file))
        sample_groups.append(SampleGroup(main_sample, midi_sample))

    all_samples = list(run_info.get_all_samples())
    sample_count = len(all_samples)
    for position, sample in enumerate(all_samples, 1):
        sample.rank = '({} of {})'.format(position, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)

    return run_info
Example #2
0
    def test_get_all_samples(self):
        """ Samples come back in group order, both samples of a pair first. """
        paired_group = SampleGroup(Sample(fastq1='1a_R1_001.fastq'),
                                   Sample(fastq1='1b_R1_001.fastq'))
        single_group = SampleGroup(Sample(fastq1='2_R1_001.fastq'))
        run_info = RunInfo(sample_groups=[paired_group, single_group])
        expected_fastq_paths = ['1a_R1_001.fastq',
                                '1b_R1_001.fastq',
                                '2_R1_001.fastq']

        fastq_paths = []
        for sample in run_info.get_all_samples():
            fastq_paths.append(sample.fastq1)

        self.assertEqual(expected_fastq_paths, fastq_paths)
Example #3
0
    def test_get_all_samples(self):
        """ A paired group and a single group yield three samples, in order. """
        first_pair = SampleGroup(Sample(fastq1='1a_R1_001.fastq'),
                                 Sample(fastq1='1b_R1_001.fastq'))
        lone_sample = SampleGroup(Sample(fastq1='2_R1_001.fastq'))
        expected_fastq_paths = [
            '1a_R1_001.fastq',
            '1b_R1_001.fastq',
            '2_R1_001.fastq',
        ]

        run_info = RunInfo(sample_groups=[first_pair, lone_sample])
        fastq_paths = list(
            sample.fastq1 for sample in run_info.get_all_samples())

        self.assertEqual(expected_fastq_paths, fastq_paths)
Example #4
0
def hcv_sample(args):
    """ Process a main sample and its MIDI sample together as one group.

    :param args: parsed arguments; results_folder and denovo are read here,
        and the file paths are resolved through MiCallArgs
    """
    main_args = MiCallArgs(args)
    midi_args = MiCallArgs(args, map_midi=True)

    results_folder = args.results_folder
    scratch_path = os.path.join(results_folder, "scratch")
    midi_scratch_path = os.path.join(results_folder, "scratch_midi")
    makedirs(scratch_path)
    # Only removed here, not created.
    shutil.rmtree(midi_scratch_path, ignore_errors=True)

    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=results_folder,
                       scratch_path=scratch_path,
                       is_denovo=args.denovo)

    main_sample = Sample(fastq1=main_args.fastq1,
                         fastq2=main_args.fastq2,
                         bad_cycles_csv=main_args.bad_cycles_csv,
                         scratch_path=scratch_path)
    # The MIDI sample takes its bad cycles file from the main arguments.
    midi_sample = Sample(fastq1=midi_args.fastq1,
                         fastq2=midi_args.fastq2,
                         bad_cycles_csv=main_args.bad_cycles_csv,
                         scratch_path=midi_scratch_path)
    sample_groups.append(SampleGroup(main_sample, midi_sample))

    process_run(run_info, args)
Example #5
0
def single_sample(args):
    """ Process one sample on its own, without a MIDI sample.

    :param args: parsed arguments; results_folder, denovo, and project_code
        are read here, and the file paths are resolved through MiCallArgs
    """
    resolved_args = MiCallArgs(args)
    results_folder = args.results_folder
    scratch_path = os.path.join(results_folder, "scratch")
    makedirs(scratch_path)

    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=results_folder,
                       scratch_path=scratch_path,
                       is_denovo=args.denovo)

    lone_sample = Sample(fastq1=resolved_args.fastq1,
                         fastq2=resolved_args.fastq2,
                         bad_cycles_csv=resolved_args.bad_cycles_csv,
                         scratch_path=scratch_path)
    lone_sample.project_code = args.project_code
    sample_groups.append(SampleGroup(lone_sample))

    process_run(run_info, args)
Example #6
0
def link_samples(
        run_path: str,
        output_path: str,
        is_denovo: bool,
        fastq1s: typing.Optional[typing.Sequence[str]] = None,
        fastq2s: typing.Optional[typing.Sequence[str]] = None,
        project_code: typing.Optional[str] = None):
    """ Load the data from a run folder.

    The output folder is deleted and created again with an empty ``scratch``
    folder inside it. Samples are then found in one of three ways:

    * If fastq1s is not empty, each forward file is paired with the reverse
      file at the same position in fastq2s.
    * Otherwise, if the run folder has no SampleSheet.csv, all ``*.fastq``
      and ``*.fastq.gz`` files in the run folder are sorted by name and
      paired up in that order. An unpaired last file is skipped.
    * Otherwise, the ``*_R1_*`` files are grouped by find_groups() using the
      sample sheet, and groups whose main file starts with 'Undetermined'
      are skipped.

    :param run_path: run folder that holds the FASTQ files
    :param output_path: folder to rebuild for the results
    :param is_denovo: passed along to RunInfo
    :param fastq1s: forward read files, joined onto run_path
    :param fastq2s: reverse read files, joined onto run_path; required when
        fastq1s is given, and must be the same length
    :param project_code: assigned to every sample that gets created
    :return: RunInfo that describes the run and all its samples
    :raises ValueError: when fastq1s is given without a matching fastq2s
    """

    shutil.rmtree(output_path, ignore_errors=True)
    makedirs(output_path)

    scratch_path = os.path.join(output_path, 'scratch')
    makedirs(scratch_path)

    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    # Read sizes are only parsed when RunInfo.xml and InterOp both exist.
    if os.path.exists(run_info_path) and os.path.exists(interop_path):
        read_sizes = parse_read_sizes(run_info_path)
    else:
        read_sizes = None
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes,
                       is_denovo=is_denovo)

    sample_sheet_path = os.path.join(run_path, "SampleSheet.csv")
    are_files_specified = fastq1s is not None and len(fastq1s) > 0
    if are_files_specified or not os.path.exists(sample_sheet_path):
        if are_files_specified:
            if fastq2s is None:
                raise ValueError("Reverse read files must also be specified.")
            if len(fastq2s) != len(fastq1s):
                raise ValueError(
                    "The same number of forward and reverse read files must be "
                    "specified."
                )
            # Specified names are joined onto the run folder.
            forward_reverse_pairs = [
                (os.path.join(run_path, forward),
                 os.path.join(run_path, reverse))
                for forward, reverse in zip(fastq1s, fastq2s)]
        else:  # there is no sample sheet
            # Sort the FASTQ files alphabetically and run them in pairs.
            logger.info(
                "No sample sheet found; running on all FASTQ files in folder %s",
                run_path)
            # glob() results already start with run_path, so they must not be
            # joined onto run_path a second time: that doubles the folder
            # prefix whenever run_path is a relative path.
            fastq_files = sorted(glob(os.path.join(run_path, "*.fastq")) +
                                 glob(os.path.join(run_path, "*.fastq.gz")))
            forward_reverse_pairs = list(zip(fastq_files[::2],
                                             fastq_files[1::2]))
            for forward, reverse in forward_reverse_pairs:
                logger.info("Pairing files %s and %s.", forward, reverse)
            if len(fastq_files) % 2 != 0:
                # We have an odd number of FASTQ files; ignore the last one.
                logger.info("File %s appears extraneous; omitting.",
                            fastq_files[-1])

        for forward, reverse in forward_reverse_pairs:
            sample = Sample(fastq1=forward, fastq2=reverse)
            sample.project_code = project_code
            sample_groups.append(SampleGroup(sample, midi_sample=None))

    else:  # a sample sheet is specified
        read1_pattern = '*_R1_*'
        # Prefer the sequencer's folder layout, then fall back to run folder.
        fastq_files = (glob(os.path.join(run_path,
                                         'Data',
                                         'Intensities',
                                         'BaseCalls',
                                         read1_pattern)) or
                       glob(os.path.join(run_path, read1_pattern)))
        # Only used when at least one group is found below.
        source_folder = (os.path.dirname(fastq_files[0])
                         if fastq_files
                         else None)
        file_names = [os.path.basename(fastq_file) for fastq_file in fastq_files]
        for group in find_groups(file_names, sample_sheet_path):
            main_file, midi_file = group.names
            if main_file.startswith('Undetermined'):
                continue
            main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
            main_sample.project_code = project_code
            if midi_file is None:
                midi_sample = None
            else:
                midi_sample = Sample(fastq1=os.path.join(source_folder, midi_file))
                midi_sample.project_code = project_code
            sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '{} of {}'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)

    return run_info
Example #7
0
def load_samples(data_path):
    """ Read the run's arguments from the AppSession JSON file.

    Reads ``input/AppSession.json`` under the data path, creates the results
    folder for the project, and builds a sample group from each pair of main
    and MIDI sample entries. When a run is given, but the samples found for
    that run don't match the requested samples, the read sizes and InterOp
    path are cleared.

    :param str data_path: folder that contains a JSON file in the BaseSpace
    AppSession format.
    :return RunInfo: details about the run and samples
    """
    json_path = os.path.join(data_path, 'input', 'AppSession.json')
    try:
        with open(json_path, 'r') as json_file:
            raw_args = json.load(json_file)

        # Index the properties by name.
        arg_map = {}
        for item in raw_args['Properties']['Items']:
            arg_map[item['Name']] = item

        href_app_session = raw_args['Href']
        run = arg_map.get('Input.run-id')
        if run is not None:
            run_content = run['Content']
            run_id = run_content['Id']
            interop_path = os.path.join(data_path,
                                        'input',
                                        'runs',
                                        run_id,
                                        'InterOp')
            sequencing_stats = run_content['SequencingStats']
            read_sizes = ReadSizes(sequencing_stats['NumCyclesRead1'],
                                   sequencing_stats['NumCyclesRead2'],
                                   sequencing_stats['NumCyclesIndex1'],
                                   sequencing_stats['NumCyclesIndex2'])
        else:
            run_id = None
            interop_path = None
            read_sizes = None

        project_id = arg_map['Input.project-id']['Content']['Id']
        output_path = os.path.join(data_path,
                                   'output',
                                   'appresults',
                                   project_id,
                                   'results')
        makedirs(output_path)
        reports = arg_map['Input.reports']['Items']

        # Optional settings fall back to False and None.
        builder_node = arg_map.get('Input.builder')
        is_denovo = (builder_node is not None and
                     builder_node['Content'] == 'denovo')
        project_code_node = arg_map.get('Input.project_code')
        project_code = (None
                        if project_code_node is None
                        else project_code_node['Content'])

        scratch_path = os.path.join(data_path, 'scratch')
        sample_groups = []
        run_info = RunInfo(sample_groups,
                           reports,
                           interop_path,
                           scratch_path,
                           output_path,
                           read_sizes,
                           href_app_session,
                           is_denovo)
        main_samples = arg_map['Input.sample-ids.main']['Items']
        midi_samples = arg_map['Input.sample-ids.midi']['Items']
        for main_json, midi_json in zip(main_samples, midi_samples):
            main_sample = load_sample(main_json,
                                      data_path,
                                      scratch_path,
                                      project_code)
            midi_sample = load_sample(midi_json,
                                      data_path,
                                      scratch_path,
                                      project_code)
            sample_groups.append(SampleGroup(main_sample, midi_sample))

        # Do we have run_ids for all sample_ids ?
        if run_id is not None:
            bs = BSrequest()
            requested_ids = {sample.basespace_id
                             for sample in run_info.get_all_samples()}
            found_ids = bs.check_run_sample_ids([run_id], requested_ids)
            if len(found_ids) != len(requested_ids):
                for sample in run_info.get_all_samples():
                    if sample.basespace_id in found_ids:
                        continue
                    logger.warning(
                        'Run info not found for %s, skipping error rate data.',
                        sample)
                run_info.read_sizes = None
                run_info.interop_path = None

        create_app_result(run_info)
    except IOError:
        if not os.path.exists(json_path):
            logger.error("Error: no such file as %r.", json_path)
        else:
            # copy the input file to the output dir for postmortem analysis
            logger.error("Error occurred while parsing %r.", json_path)
            with open(json_path, 'r') as source_file:
                json_text = source_file.read()
            copy_path = os.path.join(data_path, 'logs', 'AppSession.json')
            with open(copy_path, 'w') as copy_file:
                copy_file.write(json_text)
        raise

    return run_info
Example #8
0
def collate_samples(run_info: RunInfo):
    """ Gather the per-sample result files into run-level files and folders.

    Each CSV file is written to the output folder with the rows from every
    sample's copy, and a sample name added as the first column. The header
    comes from the first sample file that has one. Samples that don't have
    the file are skipped. After that, each sample's plots and reports get
    moved into folders under the output folder.

    :param run_info: details of the run and samples
    """
    filenames = ['remap_counts.csv',
                 'remap_conseq.csv',
                 'conseq_ins.csv',
                 'failed_read.csv',
                 'nuc.csv',
                 'amino.csv',
                 'coord_ins.csv',
                 'conseq.csv',
                 'conseq_all.csv',
                 'conseq_region.csv',
                 'failed_align.csv',
                 'coverage_scores.csv',
                 'g2p.csv',
                 'g2p_summary.csv',
                 'resistance.csv',
                 'mutations.csv',
                 'nuc_mutations.csv',
                 'resistance_fail.csv',
                 'resistance_consensus.csv',
                 'cascade.csv',
                 'merge_lengths.csv']
    for filename in filenames:
        collated_path = os.path.join(run_info.output_path, filename)
        with open(collated_path, 'w') as collated_file:
            writer = csv.writer(collated_file, lineterminator=os.linesep)
            is_header_written = False
            for sample_info in run_info.get_all_samples():
                sample_name = sample_info.name
                source_path = os.path.join(sample_info.scratch_path, filename)
                try:
                    with open(source_path, 'r') as source_file:
                        reader = csv.reader(source_file)
                        header = next(reader, None)
                        if header is not None and not is_header_written:
                            writer.writerow(['sample'] + header)
                            is_header_written = True
                        for row in reader:
                            writer.writerow([sample_name] + row)
                except IOError as ex:
                    # A sample without this file is fine, anything else isn't.
                    if ex.errno != errno.ENOENT:
                        raise

    output_path = run_info.output_path
    resistance_reports_path = os.path.join(output_path, 'resistance_reports')
    makedirs(resistance_reports_path)
    coverage_maps_path = os.path.join(output_path, 'coverage_maps')
    genome_coverage_path = os.path.join(coverage_maps_path, 'genome')
    makedirs(genome_coverage_path)
    merge_lengths_path = os.path.join(output_path, 'merge_lengths')
    makedirs(merge_lengths_path)

    # Sample attribute, target folder, and suffix added to the sample name.
    single_files = (
        ('contigs_svg', coverage_maps_path, '_contigs.svg'),
        ('genome_coverage_svg', genome_coverage_path, '_genome_coverage.svg'),
        ('merge_lengths_svg', merge_lengths_path, '_merge_lengths.svg'),
        ('resistance_pdf', resistance_reports_path, '_resistance.pdf'))
    for sample_info in run_info.get_all_samples():
        sample_maps_path = sample_info.coverage_maps
        if os.path.exists(sample_maps_path):
            # Coverage maps keep their own file names.
            for map_file in os.listdir(sample_maps_path):
                safe_file_move(os.path.join(sample_maps_path, map_file),
                               os.path.join(coverage_maps_path, map_file))
        for attribute_name, target_folder, suffix in single_files:
            source_path = getattr(sample_info, attribute_name)
            if os.path.exists(source_path):
                safe_file_move(source_path,
                               os.path.join(target_folder,
                                            sample_info.name + suffix))
    try:
        # Remove directory, if it's empty.
        os.rmdir(genome_coverage_path)
    except OSError:
        # Guess it wasn't empty.
        pass
Example #9
0
def load_samples(data_path):
    """ Read the run's arguments from the AppSession JSON file.

    Reads ``input/AppSession.json`` under the data path, creates the results
    folder for the project, and builds a sample group from each pair of main
    and MIDI sample entries. When a run is given, but the samples found for
    that run don't match the requested samples, the read sizes and InterOp
    path are cleared.

    :param str data_path: folder that contains a JSON file in the BaseSpace
    AppSession format.
    :return RunInfo: details about the run and samples
    """
    json_path = os.path.join(data_path, 'input', 'AppSession.json')
    try:
        with open(json_path, 'r') as json_file:
            raw_args = json.load(json_file)

        # Index the properties by name.
        arg_map = {}
        for item in raw_args['Properties']['Items']:
            arg_map[item['Name']] = item

        href_app_session = raw_args['Href']
        run = arg_map.get('Input.run-id')
        if run is not None:
            run_content = run['Content']
            run_id = run_content['Id']
            interop_path = os.path.join(data_path,
                                        'input',
                                        'runs',
                                        run_id,
                                        'InterOp')
            sequencing_stats = run_content['SequencingStats']
            read_sizes = ReadSizes(sequencing_stats['NumCyclesRead1'],
                                   sequencing_stats['NumCyclesRead2'],
                                   sequencing_stats['NumCyclesIndex1'],
                                   sequencing_stats['NumCyclesIndex2'])
        else:
            run_id = None
            interop_path = None
            read_sizes = None

        project_id = arg_map['Input.project-id']['Content']['Id']
        output_path = os.path.join(data_path,
                                   'output',
                                   'appresults',
                                   project_id,
                                   'results')
        makedirs(output_path)
        reports = arg_map['Input.reports']['Items']

        scratch_path = os.path.join(data_path, 'scratch')
        sample_groups = []
        run_info = RunInfo(sample_groups,
                           reports,
                           interop_path,
                           scratch_path,
                           output_path,
                           read_sizes,
                           href_app_session)
        main_samples = arg_map['Input.sample-ids.main']['Items']
        midi_samples = arg_map['Input.sample-ids.midi']['Items']
        for main_json, midi_json in zip(main_samples, midi_samples):
            main_sample = load_sample(main_json, data_path, scratch_path)
            midi_sample = load_sample(midi_json, data_path, scratch_path)
            sample_groups.append(SampleGroup(main_sample, midi_sample))

        # Do we have run_ids for all sample_ids ?
        if run_id is not None:
            bs = BSrequest()
            requested_ids = {sample.basespace_id
                             for sample in run_info.get_all_samples()}
            found_ids = bs.check_run_sample_ids([run_id], requested_ids)
            if len(found_ids) != len(requested_ids):
                for sample in run_info.get_all_samples():
                    if sample.basespace_id in found_ids:
                        continue
                    logger.warning(
                        'Run info not found for %s, skipping error rate data.',
                        sample)
                run_info.read_sizes = None
                run_info.interop_path = None

        create_app_result(run_info)
    except IOError:
        if not os.path.exists(json_path):
            logger.error("Error: no such file as %r.", json_path)
        else:
            # copy the input file to the output dir for postmortem analysis
            logger.error("Error occurred while parsing %r.", json_path)
            with open(json_path, 'r') as source_file:
                json_text = source_file.read()
            copy_path = os.path.join(data_path, 'logs', 'AppSession.json')
            with open(copy_path, 'w') as copy_file:
                copy_file.write(json_text)
        raise

    return run_info
def main():
    """ Load one sample group from the parsed arguments and process it. """
    logging.basicConfig(level=logging.WARN)
    sample_group = load_sample(parse_args())

    # The run holds only this one group.
    run_info = RunInfo([sample_group])
    sample_group.process_resistance(run_info)