Exemple #1
0
    def _compare_files(self, actual_output_path, expected_output_path):
        """Assert that two files match line for line.

        Trailing whitespace is ignored. When an expected line starts with the
        jacquard timestamp metaheader prefix, the actual line is only required
        to start with that same prefix; the remainder of the line is not
        compared.

        Args:
            actual_output_path: path of the file holding the actual output.
            expected_output_path: path of the file holding the expected output.
        """
        timestamp_prefix = "##jacquard=<timestamp="

        def read_all_lines(path):
            # Close the reader even when reading fails so that a broken test
            # does not leak an open file handle.
            file_reader = vcf.FileReader(path)
            file_reader.open()
            try:
                return list(file_reader.read_lines())
            finally:
                file_reader.close()

        actual = read_all_lines(actual_output_path)
        expected = read_all_lines(expected_output_path)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(expected), len(actual))

        # Lengths are known to be equal here, so zip drops nothing.
        for expected_line, actual_line in zip(expected, actual):
            if expected_line.startswith(timestamp_prefix):
                self.assertStartsWith(actual_line, timestamp_prefix)
            else:
                self.assertEqual(expected_line.rstrip(), actual_line.rstrip())
Exemple #2
0
def execute(args, execution_context):
    """Run the two-pass summarize / zscore flow from args.input to args.output.

    The first pass (_write_to_tmp_file) writes to a temporary file named
    "<output>.tmp"; the second pass (_write_zscores) reads that temporary
    file and writes the final output. The temporary file is removed when the
    function exits, whether it succeeds or fails.

    Args:
        args: parsed arguments; the "input" and "output" attributes are read.
        execution_context: list of metaheader lines that is prepended to the
            metaheaders returned by the summary caller.
    """
    input_file = os.path.abspath(args.input)
    output = os.path.abspath(args.output)

    summary_caller = summarize_caller.SummarizeCaller()

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    tmp_output_file = output + ".tmp"
    tmp_writer = vcf.FileWriter(tmp_output_file)

    try:
        _write_to_tmp_file(summary_caller, vcf_reader, tmp_writer)

        tmp_reader = vcf.VcfReader(vcf.FileReader(tmp_output_file))
        file_writer = vcf.FileWriter(output)

        logger.info("Calculating zscores")
        caller = zscore_caller.ZScoreCaller(tmp_reader)
        metaheaders = execution_context + summary_caller.get_metaheaders()
        _write_zscores(caller, metaheaders, tmp_reader, file_writer)
    finally:
        # Always clean up the intermediate file; previously a failure in
        # either pass left "<output>.tmp" behind. The existence check keeps a
        # failure that happened before the file was created from being masked
        # by a second error raised here.
        if os.path.exists(tmp_output_file):
            os.remove(tmp_output_file)
Exemple #3
0
def execute(args, dummy_execution_context):
    """Expand the input VCF into a tab-delimited file and write a glossary.

    The output starts with a "#"-prefixed, tab-joined header of the selected
    columns, followed by one tab-joined row per VCF record; a column that is
    missing from a record's row dict is written as ".". After the rows are
    written, a glossary file is produced from the reader's metaheaders.

    All writers and the reader are closed even if a step raises.

    Args:
        args: parsed arguments; reads "input", "output", "original_output"
            and "selected_columns_file".
        dummy_execution_context: unused.
    """
    #for the moment, there is no good place to put the execution context
    input_file = os.path.abspath(args.input)
    output_file = os.path.abspath(args.output)
    # Any falsy value (None, "") means "no column specification".
    col_spec = args.selected_columns_file or None

    logger.debug("Expanding [{}] to [{}]", input_file, output_file)
    logger.info("Expanding [{}] to [{}]", args.input, args.original_output)

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    file_writer = vcf.FileWriter(output_file)
    file_writer.open()

    # Tracks whether vcf_reader.open() succeeded so the outer finally only
    # closes a reader that was actually opened.
    reader_opened = False
    try:
        try:
            (columns, glossary_fields) = _get_actual_columns(vcf_reader,
                                                             col_spec)

            file_writer.write("#" + "\t".join(columns) + "\n")

            line_count = 0
            vcf_reader.open()
            reader_opened = True
            for vcf_record in vcf_reader.vcf_records():
                row_dict = _create_row_dict(vcf_reader.split_column_header,
                                            vcf_record)

                new_line = [row_dict[col] if col in row_dict else "."
                            for col in columns]

                file_writer.write("\t".join(new_line) + "\n")
                line_count += 1
                if line_count % 10000 == 0:
                    logger.info("Expanding: {} rows processed", line_count)
            logger.info("Expand complete: {} rows processed", line_count)
        finally:
            file_writer.close()

        glossary_writer = _get_glossary_writer(output_file)
        glossary_writer.open()
        try:
            _create_glossary(vcf_reader.metaheaders,
                             glossary_fields,
                             glossary_writer)
        finally:
            glossary_writer.close()
        logger.info("Wrote glossary to [{}]",
                    os.path.basename(glossary_writer.output_filepath))
    finally:
        # The reader is closed last, matching the original ordering.
        if reader_opened:
            vcf_reader.close()

    logger.debug("Wrote input [{}] to output [{}]", input_file, output_file)
Exemple #4
0
def _sort_vcf(reader, sorted_dir):
    """Write a sorted copy of the reader's VCF and return a reader for it.

    All records are loaded into memory, sorted using the records' own
    ordering, and written (after the metaheaders and column header) to a file
    in sorted_dir that has the same file name as the input reader.

    Args:
        reader: VCF reader to sort; it is opened and closed by this function.
        sorted_dir: directory in which the sorted file is created.

    Returns:
        A MergeVcfReader wrapping the newly written sorted file.
    """
    reader.open()
    try:
        vcf_records = list(reader.vcf_records())
    finally:
        # Release the input file even if reading or parsing a record fails.
        reader.close()
    vcf_records.sort()

    writer = FileWriter(os.path.join(sorted_dir, reader.file_name))
    writer.open()
    try:
        writer.write("\n".join(reader.metaheaders) + "\n")
        writer.write(reader.column_header + "\n")
        for vcf_record in vcf_records:
            writer.write(vcf_record.text())
    finally:
        # Release the output file even if a write fails.
        writer.close()

    return MergeVcfReader(vcf.FileReader(writer.output_filepath))
Exemple #5
0
def execute(args, execution_context):
    """Merge every "*.vcf" file found in args.input into args.output.

    The input files are validated, wrapped in merge readers, sorted, and then
    merged record by record; the compiled metaheaders are written before the
    merged records. The merge readers and the output writer are closed when
    the function exits, whether it succeeds or fails.

    Args:
        args: parsed arguments; reads "input", "output" and "include_all"
            directly and is also passed to _Filter and _get_format_tag_regex.
        execution_context: list of metaheader lines placed between the file
            format header and the merge metaheaders.
    """
    input_path = os.path.abspath(args.input)
    output_path = os.path.abspath(args.output)
    filter_strategy = _Filter(args)
    format_tag_regex = _get_format_tag_regex(args)

    input_files = sorted(glob.glob(os.path.join(input_path, "*.vcf")))
    file_readers = [vcf.FileReader(i) for i in input_files]
    _validate_consistent_samples(file_readers)

    # The writer is created and opened outside the try block, and the reader
    # list starts out empty, so the cleanup below never touches an unbound
    # name. Previously an early failure raised a NameError from the finally
    # clause that hid the real error.
    file_writer = vcf.FileWriter(output_path)
    file_writer.open()
    merge_vcf_readers = []

    try:
        #TODO: jebene: _build_format_tags, _built_sample_list, _build_info_tags,
        #and _build_contigs behave differently. It seems like we could make the
        #signatures of these methods more similar or even combine some methods to
        #reduce excess iterations over the coordinates/vcf_readers
        merge_vcf_readers = _create_vcf_readers(file_readers)
        _validate_consistent_input(merge_vcf_readers, args.include_all)
        merge_vcf_readers = _sort_readers(merge_vcf_readers, output_path)
        format_tags = _get_format_tags(merge_vcf_readers)
        _disambiguate_format_tags(merge_vcf_readers, format_tags)
        format_tags_to_keep = _build_format_tags(format_tag_regex,
                                                 merge_vcf_readers)
        (all_sample_names,
         merge_metaheaders) = _build_sample_list(merge_vcf_readers)

        coordinates = _build_coordinates(merge_vcf_readers)
        info_tags_to_keep = _build_info_tags(coordinates)
        contigs_to_keep = _build_contigs(coordinates)
        incoming_headers = _FILE_FORMAT + execution_context + merge_metaheaders
        headers = _compile_metaheaders(incoming_headers, merge_vcf_readers,
                                       all_sample_names, contigs_to_keep,
                                       format_tags_to_keep, info_tags_to_keep)

        _write_metaheaders(file_writer, headers)

        _merge_records(merge_vcf_readers, coordinates, filter_strategy,
                       all_sample_names, format_tags_to_keep, file_writer)
    finally:
        # Nested so that the writer is still closed if closing one of the
        # readers raises.
        try:
            for vcf_reader in merge_vcf_readers:
                vcf_reader.close()
        finally:
            file_writer.close()
0
def validate_args(args):
    """Validate the expand command's arguments.

    Checks that the optional selected_columns_file exists and yields at least
    one column, and that a VcfReader can be constructed from args.input.

    Args:
        args: parsed arguments; reads "selected_columns_file" and "input".

    Raises:
        utils.UsageError: if the selected columns file is not a file, if it
            cannot be read or produces no columns, or if constructing a
            VcfReader for the input fails.
    """
    if args.selected_columns_file:
        if not os.path.isfile(args.selected_columns_file):
            raise utils.UsageError(("The selected_columns_file [{}] could "
                                    "not be read. Review inputs/usage and "
                                    "try again."), args.selected_columns_file)
        try:
            columns = _read_col_spec(args.selected_columns_file)
        except Exception: #pylint: disable=broad-except
            # An unreadable specification is reported the same way as an
            # empty one. Exception (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            columns = None
        if not columns:
            # The message previously contained a literal ".*" where the file
            # name belongs.
            raise utils.UsageError(
                ("The selected_columns_file [{}] has no rows. Review "
                 "inputs/usage and try "
                 "again").format(args.selected_columns_file))

    try:
        vcf.VcfReader(vcf.FileReader(args.input))
    except Exception: #pylint: disable=broad-except
        raise utils.UsageError(
            ("The expand command requires a VCF file as an "
             "input, but the specified input [{}] contains no VCF "
             "metaheaders. Review inputs and try again.").format(args.input))