def _compare_files(self, actual_output_path, expected_output_path):
    """Assert that an actual output file matches the expected output file.

    Lines are compared pairwise with trailing whitespace stripped, except
    the ``##jacquard=<timestamp=`` metaheader, which is compared by prefix
    only because the timestamp differs between runs.
    """
    def read_all_lines(path):
        # FileReader requires explicit open/close; it is not a context manager.
        file_reader = vcf.FileReader(path)
        file_reader.open()
        lines = list(file_reader.read_lines())
        file_reader.close()
        return lines

    actual = read_all_lines(actual_output_path)
    expected = read_all_lines(expected_output_path)

    # assertEquals is a deprecated alias (removed in Python 3.12);
    # assertEqual is behaviorally identical.
    self.assertEqual(len(expected), len(actual))
    for i in range(len(expected)):
        if expected[i].startswith("##jacquard=<timestamp="):
            self.assertStartsWith(actual[i], "##jacquard=<timestamp=")
        else:
            self.assertEqual(expected[i].rstrip(), actual[i].rstrip())
def execute(args, execution_context):
    """Summarize caller evidence, then append zscore annotations.

    Writes summary tags to a temporary ``<output>.tmp`` file, re-reads that
    file to compute zscores, and writes the final output. The temporary
    file is removed even if zscore calculation raises.

    Args:
        args: parsed command-line namespace; uses ``input`` and ``output``.
        execution_context: list of metaheader strings to prepend.
    """
    input_file = os.path.abspath(args.input)
    output = os.path.abspath(args.output)
    summary_caller = summarize_caller.SummarizeCaller()

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    tmp_output_file = output + ".tmp"
    tmp_writer = vcf.FileWriter(tmp_output_file)
    _write_to_tmp_file(summary_caller, vcf_reader, tmp_writer)

    tmp_reader = vcf.VcfReader(vcf.FileReader(tmp_output_file))
    file_writer = vcf.FileWriter(output)
    try:
        logger.info("Calculating zscores")
        caller = zscore_caller.ZScoreCaller(tmp_reader)
        metaheaders = execution_context + summary_caller.get_metaheaders()
        _write_zscores(caller, metaheaders, tmp_reader, file_writer)
    finally:
        # Previously the temp file leaked if zscore calculation raised.
        if os.path.exists(tmp_output_file):
            os.remove(tmp_output_file)
def _build_expanded_row(row_dict, columns):
    """Return one output row's cells in column order; missing columns become '.'."""
    return [row_dict.get(col, ".") for col in columns]


def _write_expanded_records(vcf_reader, columns, file_writer):
    """Write one tab-delimited row per VCF record; return the row count."""
    line_count = 0
    for vcf_record in vcf_reader.vcf_records():
        row_dict = _create_row_dict(vcf_reader.split_column_header, vcf_record)
        file_writer.write("\t".join(_build_expanded_row(row_dict, columns)) + "\n")
        line_count += 1
        # Periodic progress logging for large inputs.
        if line_count % 10000 == 0:
            logger.info("Expanding: {} rows processed", line_count)
    return line_count


def execute(args, dummy_execution_context):
    """Expand a VCF into a tab-delimited text file plus a glossary file.

    Args:
        args: parsed command-line namespace; uses ``input``, ``output``,
            ``original_output``, and ``selected_columns_file``.
        dummy_execution_context: unused.
    """
    #for the moment, there is no good place to put the execution context
    input_file = os.path.abspath(args.input)
    output_file = os.path.abspath(args.output)
    col_spec = args.selected_columns_file if args.selected_columns_file else None

    logger.debug("Expanding [{}] to [{}]", input_file, output_file)
    logger.info("Expanding [{}] to [{}]", args.input, args.original_output)

    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    file_writer = vcf.FileWriter(output_file)
    file_writer.open()

    (columns, glossary_fields) = _get_actual_columns(vcf_reader, col_spec)
    file_writer.write("#" + "\t".join(columns) + "\n")

    vcf_reader.open()
    line_count = _write_expanded_records(vcf_reader, columns, file_writer)
    logger.info("Expand complete: {} rows processed", line_count)
    file_writer.close()

    glossary_writer = _get_glossary_writer(output_file)
    glossary_writer.open()
    _create_glossary(vcf_reader.metaheaders, glossary_fields, glossary_writer)
    glossary_writer.close()
    logger.info("Wrote glossary to [{}]",
                os.path.basename(glossary_writer.output_filepath))

    vcf_reader.close()
    logger.debug("Wrote input [{}] to output [{}]", input_file, output_file)
def _sort_vcf(reader, sorted_dir):
    """Write a sorted copy of reader's VCF into sorted_dir.

    Loads every record into memory, sorts them, writes metaheaders,
    column header, and records to a new file, and returns a fresh
    MergeVcfReader over that sorted file.
    """
    reader.open()
    records = list(reader.vcf_records())
    reader.close()
    records.sort()

    sorted_path = os.path.join(sorted_dir, reader.file_name)
    writer = FileWriter(sorted_path)
    writer.open()
    writer.write("\n".join(reader.metaheaders) + "\n")
    writer.write(reader.column_header + "\n")
    for record in records:
        writer.write(record.text())
    writer.close()

    return MergeVcfReader(vcf.FileReader(writer.output_filepath))
def execute(args, execution_context):
    """Merge all ``*.vcf`` files in the input directory into one output VCF.

    Args:
        args: parsed command-line namespace; uses ``input``, ``output``,
            ``include_all``, and filter/format-tag options.
        execution_context: list of metaheader strings to prepend.
    """
    input_path = os.path.abspath(args.input)
    output_path = os.path.abspath(args.output)
    filter_strategy = _Filter(args)
    format_tag_regex = _get_format_tag_regex(args)

    input_files = sorted(glob.glob(os.path.join(input_path, "*.vcf")))
    file_readers = [vcf.FileReader(i) for i in input_files]
    _validate_consistent_samples(file_readers)

    # Bind these before the try block: previously, an exception raised
    # before merge_vcf_readers was assigned caused a NameError in the
    # finally clause that masked the original error.
    merge_vcf_readers = []
    file_writer = vcf.FileWriter(output_path)
    try:
        file_writer.open()
        #TODO: jebene: _build_format_tags, _built_sample_list, _build_info_tags,
        #and _build_contigs behave differently. It seems like we could make the
        #signatures of these methods more similar or even combine some methods to
        #reduce excess iterations over the coordinates/vcf_readers
        merge_vcf_readers = _create_vcf_readers(file_readers)
        _validate_consistent_input(merge_vcf_readers, args.include_all)
        merge_vcf_readers = _sort_readers(merge_vcf_readers, output_path)

        format_tags = _get_format_tags(merge_vcf_readers)
        _disambiguate_format_tags(merge_vcf_readers, format_tags)
        format_tags_to_keep = _build_format_tags(format_tag_regex,
                                                 merge_vcf_readers)
        (all_sample_names,
         merge_metaheaders) = _build_sample_list(merge_vcf_readers)

        coordinates = _build_coordinates(merge_vcf_readers)
        info_tags_to_keep = _build_info_tags(coordinates)
        contigs_to_keep = _build_contigs(coordinates)

        incoming_headers = _FILE_FORMAT + execution_context + merge_metaheaders
        headers = _compile_metaheaders(incoming_headers,
                                       merge_vcf_readers,
                                       all_sample_names,
                                       contigs_to_keep,
                                       format_tags_to_keep,
                                       info_tags_to_keep)
        _write_metaheaders(file_writer, headers)

        _merge_records(merge_vcf_readers,
                       coordinates,
                       filter_strategy,
                       all_sample_names,
                       format_tags_to_keep,
                       file_writer)
    finally:
        for vcf_reader in merge_vcf_readers:
            vcf_reader.close()
        file_writer.close()
def validate_args(args):
    """Validate expand-command arguments, raising UsageError on any problem.

    Checks that the optional selected_columns_file exists and has rows,
    and that the input parses as a VCF (has metaheaders).

    Raises:
        utils.UsageError: if any check fails.
    """
    if args.selected_columns_file:
        if not os.path.isfile(args.selected_columns_file):
            raise utils.UsageError(("The selected_columns_file [{}] could "
                                    "not be read. Review inputs/usage and "
                                    "try again."), args.selected_columns_file)
        columns = None
        try:
            columns = _read_col_spec(args.selected_columns_file)
        # Narrowed from a bare except, which also swallowed
        # SystemExit/KeyboardInterrupt; any read/parse failure is treated
        # as "no columns" and reported below.
        except Exception:
            pass
        if not columns:
            raise utils.UsageError(
                "The selected_columns_file .* has no rows. Review inputs/usage and try again"
            )
    try:
        vcf.VcfReader(vcf.FileReader(args.input))
    except Exception:
        raise utils.UsageError(
            ("The expand command requires a VCF file as an "
             "input, but the specified input [{}] contains no VCF "
             "metaheaders. Review inputs and try again.").format(args.input))