Ejemplo n.º 1
0
def _build_coordinates(vcf_readers):
    """Scan every VCF reader and return sorted, deduplicated coordinate records.

    A record is flagged with the multi-alt INFO tag when its locus either has
    several distinct (ref, alt) pairs across files (inferred) or a single
    record carries a comma-delimited alt (explicit).
    """
    coordinate_set = set()
    mult_alts = defaultdict(set)

    for i, vcf_reader in enumerate(vcf_readers):
        logger.info("Reading [{}] ({}/{})",
                    os.path.basename(vcf_reader.file_name),
                    i + 1,
                    len(vcf_readers))
        try:
            vcf_reader.open()

            for vcf_record in vcf_reader.vcf_records():
                coordinate_set.add(vcf_record.get_empty_record())
                # Track every distinct (ref, alt) seen at each locus so
                # multi-alt loci can be inferred across input files.
                ref_alt = vcf_record.ref, vcf_record.alt
                locus = vcf_record.chrom, vcf_record.pos
                mult_alts[locus].add(ref_alt)
        finally:
            # Always release the file handle, even on malformed input.
            vcf_reader.close()

    for vcf_record in coordinate_set:
        ref_alts_for_this_locus = mult_alts[vcf_record.chrom, vcf_record.pos]
        inferred_mult_alt = len(ref_alts_for_this_locus) > 1
        # A single record may itself declare multiple alts (e.g. "A,T").
        explicit_mult_alt = "," in next(iter(ref_alts_for_this_locus))[1]

        if inferred_mult_alt or explicit_mult_alt:
            vcf_record.add_info_field(_MULT_ALT_TAG)

    # sorted() accepts any iterable; the intermediate list() was redundant.
    return sorted(coordinate_set)
Ejemplo n.º 2
0
def _build_coordinates(vcf_readers):
    """Collect empty coordinate records from every reader, tagging multi-alts.

    Returns the records sorted; a record gets the multi-alt INFO tag when its
    locus shows more than one (ref, alt) pair overall, or a comma-separated
    alt in any single record.
    """
    coordinates = set()
    ref_alts_by_locus = defaultdict(set)
    total = len(vcf_readers)

    for index, reader in enumerate(vcf_readers, 1):
        logger.info("Reading [{}] ({}/{})",
                    os.path.basename(reader.file_name), index,
                    total)
        try:
            reader.open()
            for record in reader.vcf_records():
                coordinates.add(record.get_empty_record())
                ref_alts_by_locus[(record.chrom, record.pos)].add(
                    (record.ref, record.alt))
        finally:
            reader.close()

    for record in coordinates:
        locus_ref_alts = ref_alts_by_locus[(record.chrom, record.pos)]
        has_many_pairs = len(locus_ref_alts) > 1
        has_comma_alt = "," in next(iter(locus_ref_alts))[1]

        if has_many_pairs or has_comma_alt:
            record.add_info_field(_MULT_ALT_TAG)

    return sorted(list(coordinates))
Ejemplo n.º 3
0
def _merge_records(vcf_readers, coordinates, filter_strategy, all_sample_names,
                   format_tags_to_keep, file_writer):
    #pylint: disable=too-many-arguments
    """Merge matching records for each coordinate and write them out.

    Progress is logged at roughly 10% increments; final statistics are
    delegated to the filter strategy.
    """
    next_breakpoint = 0
    buffered_readers = _create_buffered_readers(vcf_readers)
    total = len(coordinates)
    row_count = 0

    for row_count, coordinate in enumerate(coordinates, 1):
        matching = _pull_matching_records(filter_strategy, coordinate,
                                          buffered_readers)

        if matching:
            merged = _build_merged_record(coordinate, matching,
                                          all_sample_names,
                                          format_tags_to_keep)
            file_writer.write(merged.text())

        progress = 100 * row_count / total
        if progress > next_breakpoint:
            logger.info("Merging: {} rows processed ({}%)", row_count,
                        next_breakpoint)
            next_breakpoint = 10 * int(progress / 10) + 10

    logger.info("Merge complete: {} rows processed (100%)", row_count)
    filter_strategy.log_statistics()
Ejemplo n.º 4
0
def _merge_records(vcf_readers,
                   coordinates,
                   filter_strategy,
                   all_sample_names,
                   format_tags_to_keep,
                   file_writer):
    #pylint: disable=too-many-arguments
    """Merge matching VCF records for every coordinate and write each merged
    row through file_writer, logging progress at ~10% increments."""
    row_count = 0
    next_breakpoint = 0  # next progress percentage (multiple of 10) to log at
    buffered_readers = _create_buffered_readers(vcf_readers)

    for coordinate in coordinates:
        # Records from the buffered readers that match this coordinate,
        # subject to the caller-supplied filter strategy.
        vcf_records = _pull_matching_records(filter_strategy,
                                             coordinate,
                                             buffered_readers)

        if vcf_records:
            merged_record = _build_merged_record(coordinate,
                                                 vcf_records,
                                                 all_sample_names,
                                                 format_tags_to_keep)
            file_writer.write(merged_record.text())

        row_count += 1
        progress = 100 * row_count / len(coordinates)
        if progress > next_breakpoint:
            # NOTE(review): this logs next_breakpoint (the previous threshold),
            # not the current progress value — confirm that is intentional.
            logger.info("Merging: {} rows processed ({}%)",
                        row_count,
                        next_breakpoint)
            next_breakpoint = 10 * int(progress/10) + 10

    logger.info("Merge complete: {} rows processed (100%)", row_count)
    filter_strategy.log_statistics()
Ejemplo n.º 5
0
    def test_noExceptionOnMalformedMessage(self):
        """A log call with too few args must not raise; it logs a warning line."""
        tool = "foo"
        args = Namespace(subparser_name=tool,
                         log_file=None,
                         verbose=None)
        logger.initialize_logger(args)

        # Three placeholders but only two args: logger should swallow the
        # IndexError and emit a 'Malformed log message' entry instead.
        logger.info("bar {}/{}/{}", "1", "2")

        root_logger = logger.logging.getLogger()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            ["root: INFO: Malformed log message (IndexError: tuple index "
             "out of range)|bar {}/{}/{}|['1', '2']"],
            root_logger.handlers[0].buffer)
Ejemplo n.º 6
0
def _move_tmp_contents_to_original(args):
    """Copy the temp working dir's contents into its parent directory,
    report the final output location, then remove the temp dir."""
    temp_dir = args.temp_working_dir
    # Destination is the parent of the temp working dir.
    dest_dir = os.path.dirname(args.temp_working_dir)

    logger.debug(
        "Moving files from tmp directory [{}] to output "
        "directory [{}]", temp_dir, dest_dir)
    # NOTE(review): distutils is deprecated (removed in Python 3.12);
    # shutil.copytree(temp_dir, dest_dir, dirs_exist_ok=True) is the
    # modern equivalent — confirm before migrating.
    distutils.dir_util.copy_tree(temp_dir, dest_dir)
    logger.debug("Output saved to [{}]", os.path.abspath(args.original_output))
    logger.info("Output saved to [{}]", args.original_output)
    _cleanup(temp_dir)
Ejemplo n.º 7
0
def _write_to_tmp_file(caller, vcf_reader, tmp_writer):
    """Write metaheaders and summary tags from vcf_reader through tmp_writer.

    Both resources are opened here and guaranteed to be closed again, even
    when writing fails part-way through.
    """
    vcf_reader.open()
    try:
        # Opening the writer inside the reader's try block means a failure
        # here still closes the reader (the original leaked it in that case).
        tmp_writer.open()
        try:
            _write_metaheaders(caller, vcf_reader, tmp_writer)
            logger.info("Adding summary tags for [{}]", vcf_reader.file_name)
            _add_tags(caller, vcf_reader, tmp_writer)
        finally:
            tmp_writer.close()
    finally:
        vcf_reader.close()
Ejemplo n.º 8
0
def _write_to_tmp_file(caller, vcf_reader, tmp_writer):
    """Write metaheaders and summary tags from vcf_reader through tmp_writer.

    Opens both resources and guarantees each is closed even if an open or a
    write raises.
    """
    vcf_reader.open()
    try:
        # If tmp_writer.open() raises, the reader is still closed below
        # (previously it leaked in that case).
        tmp_writer.open()
        try:
            _write_metaheaders(caller, vcf_reader, tmp_writer)
            logger.info("Adding summary tags for [{}]", vcf_reader.file_name)
            _add_tags(caller, vcf_reader, tmp_writer)
        finally:
            tmp_writer.close()
    finally:
        vcf_reader.close()
Ejemplo n.º 9
0
    def test_noExceptionOnMalformedMessage(self):
        """A log call with too few args must not raise; it logs a warning line."""
        tool = "foo"
        args = Namespace(subparser_name=tool, log_file=None, verbose=None)
        logger.initialize_logger(args)

        logger.info("bar {}/{}/{}", "1", "2")

        root_logger = logger.logging.getLogger()
        errmsg = root_logger.handlers[0].buffer[0]
        # assertRegexpMatches takes (text, expected_regexp); the original
        # passed them reversed, so it searched for the message inside the
        # pattern string. Also escape the brackets in ['1', '2'] so they
        # match literally instead of forming a character class.
        self.assertRegexpMatches(
            errmsg,
            r"root: INFO: Malformed log message .*"
            r"\|bar {}/{}/{}\|\['1', '2'\]")
Ejemplo n.º 10
0
def _move_tmp_contents_to_original(args):
    """Copy the temp working dir's contents up into its parent directory,
    report the final output location, then delete the temp dir."""
    temp_dir = args.temp_working_dir
    # Destination is the parent of the temp working dir.
    dest_dir = os.path.dirname(args.temp_working_dir)

    logger.debug("Moving files from tmp directory [{}] to output "
                 "directory [{}]",
                 temp_dir,
                 dest_dir)
    # NOTE(review): distutils is deprecated (removed in Python 3.12);
    # consider shutil.copytree(temp_dir, dest_dir, dirs_exist_ok=True).
    distutils.dir_util.copy_tree(temp_dir, dest_dir)
    logger.debug("Output saved to [{}]", os.path.abspath(args.original_output))
    logger.info("Output saved to [{}]", args.original_output)
    _cleanup(temp_dir)
Ejemplo n.º 11
0
    def log_statistics(self):
        """Log excluded-cell and excluded-row percentages/counts at info
        level, plus a per-reason breakdown at debug level."""
        cell_pct = int(round(100 * self.cells_excluded / self.cell_count, 0))
        logger.info("{}% ({}) cells were excluded because (--include_cells={})",
                    cell_pct,
                    self.cells_excluded,
                    self._args.include_cells)

        row_pct = int(round(100 * self.rows_excluded / self.row_count, 0))
        logger.info("{}% ({}) rows were excluded because (--include_rows={})",
                    row_pct,
                    self.rows_excluded,
                    self._args.include_rows)

        for reason, excluded_count in self.excluded_breakdown.items():
            logger.debug("{} cells were excluded with [{}]",
                         excluded_count,
                         reason)
Ejemplo n.º 12
0
    def log_statistics(self):
        """Log excluded-cell and excluded-row percentages/counts, plus a
        debug-level breakdown of exclusion reasons."""
        cell_msg = "{}% ({}) cells were excluded because (--include_cells={})"
        logger.info(cell_msg,
                    int(round(100 * self.cells_excluded / self.cell_count, 0)),
                    self.cells_excluded,
                    self._args.include_cells)

        row_msg = "{}% ({}) rows were excluded because (--include_rows={})"
        logger.info(row_msg,
                    int(round(100 * self.rows_excluded / self.row_count, 0)),
                    self.rows_excluded,
                    self._args.include_rows)

        # One debug line per distinct exclusion reason.
        for key, count in self.excluded_breakdown.items():
            msg = "{} cells were excluded with [{}]"
            logger.debug(msg, count, key)
Ejemplo n.º 13
0
def _sort_readers(vcf_readers, output_path):
    """Return the readers in original order, substituting a sorted copy for
    any reader whose records are out of order.

    Sorted copies are written to a "tmp" directory beside output_path, which
    is created only when at least one reader needs sorting.
    """
    unsorted_readers = _get_unsorted_readers(vcf_readers)
    if unsorted_readers:
        sorted_dir = os.path.join(os.path.dirname(output_path), "tmp")
        if not os.path.isdir(sorted_dir):
            os.makedirs(sorted_dir)

    result = []
    sorted_so_far = 0
    for reader in vcf_readers:
        if reader in unsorted_readers:
            sorted_so_far += 1
            logger.info("Sorting vcf [{}] ({}/{})", reader.file_name,
                        sorted_so_far, len(unsorted_readers))
            reader = _sort_vcf(reader, sorted_dir)
        result.append(reader)
    return result
Ejemplo n.º 14
0
def _get_unsorted_readers(vcf_readers):
    """Return the subset of readers whose VCF records are out of sort order.

    Each reader is opened, scanned until the first out-of-order record (or
    end of file), and always closed again — even if scanning raises.
    """
    unsorted_readers = []
    for i, reader in enumerate(vcf_readers):
        logger.info("Checking sort order of [{}] ({}/{})", reader.file_name,
                    i + 1, len(vcf_readers))
        previous_record = None
        reader.open()
        try:
            for vcf_record in reader.vcf_records():
                if previous_record and vcf_record < previous_record:
                    # Lazy {}-style args keep this consistent with the other
                    # logger calls here (the original eagerly str.format()ed).
                    logger.debug("VCF file:chrom:pos [{}:{}:{}] is out of order",
                                 reader.file_name, vcf_record.chrom,
                                 vcf_record.pos)
                    unsorted_readers.append(reader)
                    break
                previous_record = vcf_record
        finally:
            # Guarantee the handle is released (original leaked on error).
            reader.close()
    return unsorted_readers
Ejemplo n.º 15
0
def execute(args, dummy_execution_context):
    #for the moment, there is no good place to put the execution context
    """Expand a VCF file into a tab-delimited table plus a companion glossary.

    Reads args.input, writes the expanded rows to args.output, then writes a
    glossary derived from the VCF metaheaders.
    """
    input_file = os.path.abspath(args.input)
    output_file = os.path.abspath(args.output)
    # Optional spec file restricting which columns appear in the output.
    col_spec = None
    if args.selected_columns_file:
        col_spec = args.selected_columns_file

    logger.debug("Expanding [{}] to [{}]",
                 input_file,
                 output_file)
    logger.info("Expanding [{}] to [{}]",
                args.input,
                args.original_output)

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    file_writer = vcf.FileWriter(output_file)
    file_writer.open()

    (columns, glossary_fields) = _get_actual_columns(vcf_reader, col_spec)

    # Header row: tab-delimited column names prefixed with '#'.
    file_writer.write("#" + "\t".join(columns) + "\n")

    line_count = 0
    vcf_reader.open()
    for vcf_record in vcf_reader.vcf_records():
        row_dict = _create_row_dict(vcf_reader.split_column_header,
                                    vcf_record)

        # Emit "." for columns missing from this row so every row is complete.
        new_line = []
        for col in columns:
            if col in row_dict:
                new_line.append(row_dict[col])
            else:
                new_line.append(".")

        file_writer.write("\t".join(new_line) + "\n")
        line_count +=1
        if line_count % 10000 == 0:
            logger.info("Expanding: {} rows processed", line_count)
    logger.info("Expand complete: {} rows processed", line_count)

    file_writer.close()

    glossary_writer = _get_glossary_writer(output_file)
    glossary_writer.open()
    _create_glossary(vcf_reader.metaheaders, glossary_fields, glossary_writer)
    glossary_writer.close()
    logger.info("Wrote glossary to [{}]",
                os.path.basename(glossary_writer.output_filepath))

    vcf_reader.close()
    logger.debug("Wrote input [{}] to output [{}]", input_file, output_file)
Ejemplo n.º 16
0
def _sort_readers(vcf_readers, output_path):
    """Return readers in original order, replacing any out-of-order reader
    with a sorted copy written to a tmp dir beside output_path."""
    unsorted_readers = _get_unsorted_readers(vcf_readers)
    sorted_readers = []
    unsorted_count = 0
    if unsorted_readers:
        # Only create the tmp dir when there is something to sort.
        sorted_dir = os.path.join(os.path.dirname(output_path), "tmp")
        if not os.path.isdir(sorted_dir):
            os.makedirs(sorted_dir)

    for reader in vcf_readers:
        if reader in unsorted_readers:
            unsorted_count += 1
            logger.info("Sorting vcf [{}] ({}/{})",
                        reader.file_name,
                        unsorted_count,
                        len(unsorted_readers))
            # Replace the unsorted reader with a reader over the sorted copy.
            reader = _sort_vcf(reader, sorted_dir)
        sorted_readers.append(reader)
    return sorted_readers
Ejemplo n.º 17
0
def execute(args, execution_context):
    """Summarize caller data, then compute zscores, via a temp file.

    Writes summary tags to <output>.tmp, derives zscores from that temp file
    into the final output, then removes the temp file.
    """
    input_file = os.path.abspath(args.input)
    output = os.path.abspath(args.output)

    summary_caller = summarize_caller.SummarizeCaller()

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    tmp_output_file = output + ".tmp"
    tmp_writer = vcf.FileWriter(tmp_output_file)

    _write_to_tmp_file(summary_caller, vcf_reader, tmp_writer)

    # Second pass reads the summarized temp file to compute zscores.
    tmp_reader = vcf.VcfReader(vcf.FileReader(tmp_output_file))
    file_writer = vcf.FileWriter(output)

    logger.info("Calculating zscores")
    caller = zscore_caller.ZScoreCaller(tmp_reader)
    # Final metaheaders combine the run's execution context with the
    # summary caller's own metaheaders.
    metaheaders = execution_context + summary_caller.get_metaheaders()
    _write_zscores(caller, metaheaders, tmp_reader, file_writer)

    os.remove(tmp_output_file)
Ejemplo n.º 18
0
    def test_info(self):
        """info() writes to the root handler buffer and to captured output
        with time|level|tool|message fields."""
        tool = "foo"
        args = Namespace(subparser_name=tool, log_file=None, verbose=None)
        logger.initialize_logger(args)
        current_time = datetime.now()

        logger.info("bar")

        root_logger = logger.logging.getLogger()
        # assertEqual replaces the deprecated assertEquals alias throughout.
        self.assertEqual(["root: INFO: bar"], root_logger.handlers[0].buffer)

        output_lines = self.output.getvalue().rstrip().split("\n")
        self.assertEqual(len(output_lines), 1)

        log_fields = output_lines[0].split("|")
        self.assertEqual(4, len(log_fields))
        log_time_string = log_fields[0]
        log_datetime = datetime.strptime(log_time_string, "%Y-%m-%d %H:%M:%S")
        # The logged timestamp should be within a second of "now".
        seconds_delta = (log_datetime - current_time).total_seconds()
        self.assertLess(seconds_delta, 1)
        self.assertEqual(["INFO", "foo", "bar"], log_fields[1:])
Ejemplo n.º 19
0
def execute(args, execution_context):
    """Summarize caller data then append zscores, using a temp intermediate.

    Summary tags go to <output>.tmp; zscores are computed from that file
    into the final output; the temp file is removed at the end.
    """
    input_file = os.path.abspath(args.input)
    output = os.path.abspath(args.output)

    summary_caller = summarize_caller.SummarizeCaller()

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    tmp_output_file = output + ".tmp"
    tmp_writer = vcf.FileWriter(tmp_output_file)

    _write_to_tmp_file(summary_caller, vcf_reader, tmp_writer)

    # Re-read the summarized temp file for the zscore pass.
    tmp_reader = vcf.VcfReader(vcf.FileReader(tmp_output_file))
    file_writer = vcf.FileWriter(output)

    logger.info("Calculating zscores")
    caller = zscore_caller.ZScoreCaller(tmp_reader)
    metaheaders = execution_context + summary_caller.get_metaheaders()
    _write_zscores(caller, metaheaders, tmp_reader, file_writer)

    os.remove(tmp_output_file)
Ejemplo n.º 20
0
def execute(args, execution_context):
    """Translate each claimed input VCF into a new file in the output dir."""
    validate_args(args)

    output_dir = os.path.abspath(args.output)
    unclaimed_readers, trans_vcf_readers = _claim_readers(args)

    _log_unclaimed_readers(unclaimed_readers)

    total = len(trans_vcf_readers)
    logger.info("Processing [{}] VCF file(s) from [{}]",
                total,
                args.input)

    # Exclusion tags handed to _translate_files for every reader.
    new_tags = [_ExcludeMalformedRef(), _ExcludeMalformedAlt(),
                _ExcludeMissingAlt()]

    for number, reader in enumerate(trans_vcf_readers, 1):
        logger.info("Translating file {}/{} [{}]",
                    number,
                    total,
                    reader.file_name)
        destination = os.path.join(output_dir,
                                   _mangle_output_filename(reader.file_name))
        _translate_files(reader,
                         new_tags,
                         execution_context,
                         FileWriter(destination))

    logger.info("Wrote [{}] VCF file(s)", total)
Ejemplo n.º 21
0
def execute(args, execution_context):
    """Translate each claimed input VCF into a new file in the output dir."""
    validate_args(args)

    output_dir = os.path.abspath(args.output)
    # Readers that cannot be claimed are logged and skipped.
    unclaimed_readers, trans_vcf_readers = _claim_readers(args)

    _log_unclaimed_readers(unclaimed_readers)

    logger.info("Processing [{}] VCF file(s) from [{}]",
                len(trans_vcf_readers), args.input)

    # Exclusion tags passed to _translate_files for every reader.
    new_tags = [
        _ExcludeMalformedRef(),
        _ExcludeMalformedAlt(),
        _ExcludeMissingAlt()
    ]

    for i, trans_vcf_reader in enumerate(trans_vcf_readers):
        logger.info("Translating file {}/{} [{}]", i + 1,
                    len(trans_vcf_readers), trans_vcf_reader.file_name)
        new_filename = _mangle_output_filename(trans_vcf_reader.file_name)
        output_filepath = os.path.join(output_dir, new_filename)
        file_writer = FileWriter(output_filepath)
        _translate_files(
            trans_vcf_reader,
            new_tags,
            execution_context,
            file_writer,
        )

    logger.info("Wrote [{}] VCF file(s)", len(trans_vcf_readers))
Ejemplo n.º 22
0
def _dispatch(modules, arguments):
    """Parse the command line, run the selected command, and clean up.

    Usage errors exit(1) with a help hint; unexpected exceptions are logged
    and also exit(1). The temp working dir is removed in all cases.
    """
    try:
        command, args = _parse_command_line_args(modules, arguments)
        execution_context = _get_execution_context(arguments)

        logger.initialize_logger(args)
        logger.debug("Jacquard run begins")
        logger.debug("cwd|{}", os.getcwd())
        logger.debug("command|{}", " ".join(arguments))

        # Fail fast on predictable problems before executing the command.
        command_validator.preflight(command, args)

        logger.info("Jacquard begins (v{})", __version__)
        logger.info("Saving log to [{}]",
                    os.path.basename(logger.log_filename))
        logger.debug("Writing output to tmp directory [{}]",
                     args.temp_working_dir)

        command.execute(args, execution_context)

        # Promote results from the tmp working dir to the real output dir.
        _move_tmp_contents_to_original(args)

        if logger.WARNING_OCCURRED:
            logger.info("Done. (See warnings above)")
        else:
            logger.info("Done")

    except utils.UsageError as usage_error:
        message = "Jacquard usage problem: {}".format(str(usage_error))
        logger.debug(message)
        print(message, file=sys.stderr)
        # 'args' may be unbound if parsing itself raised; the NameError
        # fallback prints the generic help hint instead.
        try:
            print("See 'jacquard {} --help'.".format(args.subparser_name),
                  file=sys.stderr)
        except NameError:  #could not determine the command
            print("See 'jacquard --help'.", file=sys.stderr)
        sys.exit(1)
    except Exception as exception:  #pylint: disable=broad-except
        logger.error(str(exception))
        logger.error("Jacquard encountered an unanticipated problem. "
                     "Please review log file and contact your sysadmin "
                     "or Jacquard support for assistance.")
        logger.debug(traceback.format_exc())
        sys.exit(1)
    finally:
        try:
            _cleanup(args.temp_working_dir)
        except Exception:  #pylint: disable=broad-except
            pass  #we tried
0
def _dispatch(modules, arguments):
    """Parse the command line, run the selected command, and clean up.

    Usage errors exit(1) with a help hint; unexpected exceptions are logged
    and also exit(1). The temp working dir is removed in all cases.
    """
    try:
        command, args = _parse_command_line_args(modules, arguments)
        execution_context = _get_execution_context(arguments)

        logger.initialize_logger(args)
        logger.debug("Jacquard run begins")
        logger.debug("cwd|{}", os.getcwd())
        logger.debug("command|{}", " ".join(arguments))

        # Validate the command/args combination before doing any work.
        command_validator.preflight(command, args)

        logger.info("Jacquard begins (v{})", __version__)
        logger.info("Saving log to [{}]",
                    os.path.basename(logger.log_filename))
        logger.debug("Writing output to tmp directory [{}]",
                     args.temp_working_dir)

        command.execute(args, execution_context)

        # Promote results from the tmp working dir to the real output dir.
        _move_tmp_contents_to_original(args)

        if logger.WARNING_OCCURRED:
            logger.info("Done. (See warnings above)")
        else:
            logger.info("Done")

    except utils.UsageError as usage_error:
        message = "Jacquard usage problem: {}".format(str(usage_error))
        logger.debug(message)
        print(message, file=sys.stderr)
        # 'args' may be unbound if parsing itself raised; the NameError
        # fallback prints the generic help hint instead.
        try:
            print("See 'jacquard {} --help'.".format(args.subparser_name),
                  file=sys.stderr)
        except NameError: #could not determine the command
            print("See 'jacquard --help'.", file=sys.stderr)
        sys.exit(1)
    except Exception as exception: #pylint: disable=broad-except
        logger.error(str(exception))
        logger.error("Jacquard encountered an unanticipated problem. "
                     "Please review log file and contact your sysadmin "
                     "or Jacquard support for assistance.")
        logger.debug(traceback.format_exc())
        sys.exit(1)
    finally:
        try:
            _cleanup(args.temp_working_dir)
        except Exception: #pylint: disable=broad-except
            pass #we tried
Ejemplo n.º 24
0
def _get_unsorted_readers(vcf_readers):
    """Return the readers whose VCF records are not in sorted order.

    Each reader is opened, scanned until the first out-of-order record (or
    end of file), and always closed — even if scanning raises.
    """
    unsorted_readers = []
    for i, reader in enumerate(vcf_readers):
        logger.info("Checking sort order of [{}] ({}/{})",
                    reader.file_name,
                    i+1,
                    len(vcf_readers)
                    )
        previous_record = None
        reader.open()
        try:
            for vcf_record in reader.vcf_records():
                if previous_record and vcf_record < previous_record:
                    # Lazy {}-style args (consistent with the other logger
                    # calls) replace the eager str.format() of the original.
                    logger.debug("VCF file:chrom:pos [{}:{}:{}] is out of order",
                                 reader.file_name,
                                 vcf_record.chrom,
                                 vcf_record.pos)
                    unsorted_readers.append(reader)
                    break
                previous_record = vcf_record
        finally:
            # Guarantee the handle is released (original leaked on error).
            reader.close()
    return unsorted_readers
Ejemplo n.º 25
0
def execute(args, dummy_execution_context):
    #for the moment, there is no good place to put the execution context
    """Expand a VCF file into a tab-delimited table plus a companion glossary.

    Reads args.input, writes expanded rows to args.output, then writes a
    glossary derived from the VCF metaheaders.
    """
    input_file = os.path.abspath(args.input)
    output_file = os.path.abspath(args.output)
    # Optional spec file restricting which columns appear in the output.
    col_spec = None
    if args.selected_columns_file:
        col_spec = args.selected_columns_file

    logger.debug("Expanding [{}] to [{}]", input_file, output_file)
    logger.info("Expanding [{}] to [{}]", args.input, args.original_output)

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    file_writer = vcf.FileWriter(output_file)
    file_writer.open()

    (columns, glossary_fields) = _get_actual_columns(vcf_reader, col_spec)

    # Header row: tab-delimited column names prefixed with '#'.
    file_writer.write("#" + "\t".join(columns) + "\n")

    line_count = 0
    vcf_reader.open()
    for vcf_record in vcf_reader.vcf_records():
        row_dict = _create_row_dict(vcf_reader.split_column_header, vcf_record)

        # Emit "." for columns missing from this row so every row is complete.
        new_line = []
        for col in columns:
            if col in row_dict:
                new_line.append(row_dict[col])
            else:
                new_line.append(".")

        file_writer.write("\t".join(new_line) + "\n")
        line_count += 1
        if line_count % 10000 == 0:
            logger.info("Expanding: {} rows processed", line_count)
    logger.info("Expand complete: {} rows processed", line_count)

    file_writer.close()

    glossary_writer = _get_glossary_writer(output_file)
    glossary_writer.open()
    _create_glossary(vcf_reader.metaheaders, glossary_fields, glossary_writer)
    glossary_writer.close()
    logger.info("Wrote glossary to [{}]",
                os.path.basename(glossary_writer.output_filepath))

    vcf_reader.close()
    logger.debug("Wrote input [{}] to output [{}]", input_file, output_file)
Ejemplo n.º 26
0
    def test_info_message_args(self):
        """info() with {} placeholders formats args into the logged message."""
        tool = "foo"
        args = Namespace(subparser_name=tool,
                         log_file=None,
                         verbose=None)
        logger.initialize_logger(args)
        current_time = datetime.now()

        logger.info("bar {}/{}", "1", "2")

        root_logger = logger.logging.getLogger()
        # assertEqual replaces the deprecated assertEquals alias throughout.
        self.assertEqual(["root: INFO: bar 1/2"],
                         root_logger.handlers[0].buffer)

        output_lines = self.output.getvalue().rstrip().split("\n")
        self.assertEqual(len(output_lines), 1)

        log_fields = output_lines[0].split("|")
        self.assertEqual(4, len(log_fields))
        log_time_string = log_fields[0]
        log_datetime = datetime.strptime(log_time_string, "%Y-%m-%d %H:%M:%S")
        # The logged timestamp should be within a second of "now".
        seconds_delta = (log_datetime - current_time).total_seconds()
        self.assertLess(seconds_delta, 1)
        self.assertEqual(["INFO", "foo", "bar 1/2"], log_fields[1:])