def test_e2e_fasta_beginning_end_sites(self, fasta_file, bed_file, out_file, sequences, argparser):
    """End-to-end FASTA run combining a mask file, explicit sites and end masking.

    Invokes ``mask.run`` with ``--mask``, ``--mask-sites``,
    ``--mask-from-beginning`` and ``--mask-from-end`` together, then checks
    every record parsed from ``out_file``:

    * the first ``from_beginning`` and last ``from_end`` characters are "N";
    * every interior position listed in the expected removals is "N";
    * every other interior position equals the matching character of the
      reference record in ``sequences``.
    """
    from_beginning = 3
    from_end = 1
    arg_sites = [5, 12]
    # The sites given on the command line are shifted down by one before being
    # merged with the BED fixture's sites. Only membership is tested below, so
    # a set is enough (no ordering needed).
    expected_removals = set(TEST_BED_SEQUENCE + [s - 1 for s in arg_sites])
    args = argparser(
        "-s %s -o %s --mask %s --mask-from-beginning %s --mask-from-end %s --mask-sites %s"
        % (fasta_file, out_file, bed_file, from_beginning, from_end,
           " ".join(str(s) for s in arg_sites)))
    mask.run(args)
    output = SeqIO.parse(out_file, "fasta")
    for record in output:
        reference = str(sequences[record.id].seq)
        masked_seq = str(record.seq)
        assert masked_seq[:from_beginning] == "N" * from_beginning
        assert masked_seq[-from_end:] == "N" * from_end
        # Start enumerate at from_beginning so idx is the position in the
        # full sequence, not in the trimmed slice.
        for idx, site in enumerate(masked_seq[from_beginning:-from_end], from_beginning):
            if idx in expected_removals:
                assert site == "N"
            else:
                assert site == reference[idx]
def test_run_with_mask_sites(self, vcf_file, out_file, argparser, mp_context):
    """Run with ``--mask-sites 2 8`` and check the sites handed to ``mask_vcf``.

    ``mask.mask_vcf`` is replaced by a checker that asserts the first
    argument it receives is ``[1, 7]``.
    """
    def check_mask_sites(mask_sites, *_args, **_kwargs):
        # mask-sites are passed to the CLI as one-indexed
        expected_sites = [1, 7]
        assert mask_sites == expected_sites

    command = "--mask-sites 2 8 -s %s -o %s" % (vcf_file, out_file)
    parsed = argparser(command)
    mp_context.setattr(mask, "mask_vcf", check_mask_sites)
    mask.run(parsed)
def test_e2e_fasta_mask_invalid(self, fasta_file, out_file, sequences, argparser):
    """End-to-end FASTA run with ``--mask-invalid``.

    For every record parsed from ``out_file``, each position must equal the
    reference character when that character is in ``VALID_NUCLEOTIDES`` and
    must be "N" otherwise.
    """
    args = argparser("-s %s -o %s --mask-invalid" % (fasta_file, out_file))
    mask.run(args)
    output = SeqIO.parse(out_file, "fasta")
    for record in output:
        reference = str(sequences[record.id].seq)
        for idx, site in enumerate(reference):
            # Compute the expected character first. Writing
            # ``assert x == site if cond else "N"`` parses as
            # ``assert ((x == site) if cond else "N")``, which asserts the
            # always-truthy string "N" for invalid sites and so never
            # checks them.
            expected = site if site in VALID_NUCLEOTIDES else "N"
            assert record.seq[idx] == expected
def test_e2e_vcf_minimal(self, vcf_file, bed_file, argparser):
    """End-to-end VCF run with only ``-s`` and ``--mask``.

    No output path is given; the test re-reads ``vcf_file`` afterwards and
    checks that it still has VCF header lines and that no remaining record
    sits on a site listed in ``TEST_BED_SEQUENCE``.
    """
    command = "-s %s --mask %s" % (vcf_file, bed_file)
    mask.run(argparser(command))
    with open(vcf_file) as output:
        format_line = output.readline()
        assert format_line.startswith("##fileformat")  # is a VCF
        column_line = output.readline()
        assert column_line.startswith("#CHROM\tPOS\t")  # have a header
        for line in output:
            fields = line.split("\t")
            # POS column, shifted to a zero-indexed site
            zero_indexed_site = int(fields[1]) - 1
            assert zero_indexed_site not in TEST_BED_SEQUENCE
def test_e2e_fasta_minimal(self, fasta_file, bed_file, sequences, argparser):
    """End-to-end FASTA run with only ``-s`` and ``--mask``.

    No output path is given; the test re-parses ``fasta_file`` afterwards
    and checks that positions in ``TEST_BED_SEQUENCE`` are "N" while all
    other positions match the reference record from ``sequences``.
    """
    command = "-s %s --mask %s" % (fasta_file, bed_file)
    mask.run(argparser(command))
    for record in SeqIO.parse(fasta_file, "fasta"):
        reference = sequences[record.id].seq
        for idx, site in enumerate(record.seq):
            expected = "N" if idx in TEST_BED_SEQUENCE else reference[idx]
            assert site == expected
def test_run_with_mask_sites_and_mask_file(self, vcf_file, out_file, bed_file, argparser, mp_context):
    """Run with both ``--mask-sites 20 21`` and ``--mask`` and check the merged sites.

    ``mask.mask_vcf`` is replaced by a checker asserting that the sites it
    receives are the sorted, de-duplicated union of ``TEST_BED_SEQUENCE``
    and ``[19, 20]``.
    """
    def check_mask_sites(mask_sites, *_args, **_kwargs):
        # mask-sites are passed to the CLI as one-indexed
        merged_sites = sorted(set(TEST_BED_SEQUENCE + [19, 20]))
        assert mask_sites == merged_sites

    command = "--mask-sites 20 21 --mask %s -s %s -o %s" % (bed_file, vcf_file, out_file)
    parsed = argparser(command)
    mp_context.setattr(mask, "mask_vcf", check_mask_sites)
    mask.run(parsed)
def test_run_fasta_mask_from_beginning_or_end(self, fasta_file, out_file, argparser, mp_context):
    """Run on a FASTA file with ``--mask-from-beginning 2 --mask-from-end 3``.

    ``mask.mask_fasta`` is replaced by a checker that asserts the
    ``mask_from_beginning`` and ``mask_from_end`` keyword arguments are 2
    and 3 respectively.
    """
    def check_mask_from(*_positional, mask_from_beginning=0, mask_from_end=0):
        assert mask_from_beginning == 2
        assert mask_from_end == 3

    command = "-s %s -o %s --mask-from-beginning 2 --mask-from-end 3" % (fasta_file, out_file)
    parsed = argparser(command)
    mp_context.setattr(mask, "mask_fasta", check_mask_from)
    mask.run(parsed)
def test_run_recognize_vcf(self, bed_file, vcf_file, argparser, mp_context):
    """Check that a VCF input is not routed to ``mask_fasta``.

    ``mask_vcf`` and ``copyfile`` on the ``mask`` module are stubbed out as
    no-ops, while ``mask_fasta`` is replaced by a function that fails the
    test if it is ever called.
    """
    def fail(*args, **kwargs):
        assert False, "Called mask_fasta incorrectly"

    def ignore_mask_vcf(*a, **k):
        return None

    def ignore_copyfile(*args):
        return None

    parsed = argparser("--mask=%s -s %s --no-cleanup" % (bed_file, vcf_file))
    mp_context.setattr(mask, "mask_vcf", ignore_mask_vcf)
    mp_context.setattr(mask, "mask_fasta", fail)
    mp_context.setattr(mask, "copyfile", ignore_copyfile)
    mask.run(parsed)
def test_run_normal_case(self, bed_file, vcf_file, out_file, argparser, mp_context):
    """Run with ``--mask``, ``--sequences`` and ``--output`` and check the call to ``mask_vcf``.

    ``mask.mask_vcf`` is replaced by a checker that verifies the mask sites,
    input path, output path and cleanup flag it receives. After the run,
    ``out_file`` must still exist.
    """
    def check_args(mask_sites, in_file, _out_file, cleanup):
        assert mask_sites == TEST_BED_SEQUENCE, "Wrong mask sites provided"
        assert in_file == vcf_file, "Incorrect input file provided"
        assert _out_file == out_file, "Incorrect output file provided"
        assert cleanup is True, "Cleanup erroneously passed in as False"

    mp_context.setattr(mask, "mask_vcf", check_args)
    command = "--mask=%s --sequences=%s --output=%s" % (bed_file, vcf_file, out_file)
    parsed = argparser(command)
    mask.run(parsed)
    output_still_present = os.path.exists(out_file)
    assert output_still_present, "Output file incorrectly deleted"
def test_run_respect_no_cleanup(self, bed_file, vcf_file, argparser, mp_context):
    """Run with ``--no-cleanup`` and check that the flag reaches ``mask_vcf``.

    ``mask.mask_vcf`` is replaced by a stand-in that asserts ``cleanup`` is
    ``False`` and creates an empty output file. After the run, that output
    file must still exist.
    """
    out_file = os.path.join(os.path.dirname(vcf_file),
                            "masked_" + os.path.basename(vcf_file))

    def make_outfile(mask_sites, in_file, out_file, cleanup=True):
        # Identity check against the singleton, matching the ``is True``
        # check in test_run_normal_case; ``== False`` would also accept 0.
        assert cleanup is False
        # need out_file to exist
        with open(out_file, "w"):
            pass

    mp_context.setattr(mask, "mask_vcf", make_outfile)
    args = argparser("--mask=%s -s %s -o %s --no-cleanup" % (bed_file, vcf_file, out_file))
    mask.run(args)
    assert os.path.exists(out_file), "Output file incorrectly deleted"
def test_run_handle_missing_outfile(self, bed_file, fasta_file, argparser, mp_context):
    """Run without an output option and check the output path that is chosen.

    ``mask.mask_fasta`` is replaced by a stand-in asserting that the output
    path is ``masked_<basename>`` next to the input, and writing a marker
    string there. After the run, ``fasta_file`` itself must contain that
    marker string.
    """
    parsed = argparser("--mask=%s -s %s" % (bed_file, fasta_file))
    directory, filename = os.path.split(fasta_file)
    expected_outfile = os.path.join(directory, "masked_" + filename)

    def check_outfile(mask_sites, in_file, out_file, **kwargs):
        assert out_file == expected_outfile
        with open(out_file, "w") as fh:
            fh.write("test_string")

    mp_context.setattr(mask, "mask_fasta", check_outfile)
    mask.run(parsed)
    with open(fasta_file) as fh:
        contents = fh.read()
        assert contents == "test_string"
def test_e2e_vcf_with_options(self, vcf_file, bed_file, out_file, argparser):
    """End-to-end VCF run with ``--mask``, ``--mask-sites`` and an explicit ``-o``.

    Checks that ``out_file`` has VCF header lines and that no remaining
    record sits on a site from ``TEST_BED_SEQUENCE`` or on one of the sites
    given on the command line (each shifted down by one).
    """
    arg_sites = [5, 12, 14]
    shifted_sites = [s - 1 for s in arg_sites]
    expected_removals = sorted(set(TEST_BED_SEQUENCE + shifted_sites))
    sites_option = " ".join(str(s) for s in arg_sites)
    command = "-s %s -o %s --mask %s --mask-sites %s" % (vcf_file, out_file, bed_file, sites_option)
    mask.run(argparser(command))
    with open(out_file) as output:
        format_line = output.readline()
        assert format_line.startswith("##fileformat")  # is a VCF
        column_line = output.readline()
        assert column_line.startswith("#CHROM\tPOS\t")  # have a header
        for line in output:
            fields = line.split("\t")
            # POS column, re-zero-indexed to match the expected sites
            zero_indexed_site = int(fields[1]) - 1
            assert zero_indexed_site not in expected_removals
def test_run_handle_empty_sequence_file(self, vcf_file, argparser):
    """Truncate the sequence file to zero bytes and expect ``mask.run`` to exit."""
    # Opening in write mode empties the file.
    with open(vcf_file, "w"):
        pass
    command = "-s %s --mask-sites 1" % vcf_file
    parsed = argparser(command)
    with pytest.raises(SystemExit):
        mask.run(parsed)
def test_run_vcf_cannot_mask_beginning_or_end(self, vcf_file, argparser, op):
    """Expect ``mask.run`` to exit when ``--mask-from-<op>`` is used with a VCF file.

    ``op`` is interpolated into the option name.
    NOTE(review): ``op`` is presumably supplied by a parametrization or
    fixture defined outside this block (e.g. "beginning" / "end") - confirm.
    """
    args = argparser("-s %s --mask-from-%s 2" % (vcf_file, op))
    # The exception info is never inspected, so it is not bound to a name.
    with pytest.raises(SystemExit):
        mask.run(args)
def test_run_requires_some_masking(self, vcf_file, argparser):
    """Expect ``mask.run`` to exit when only ``-s`` is given and no masking option."""
    args = argparser("-s %s" % vcf_file)
    # The exception info is never inspected, so it is not bound to a name.
    with pytest.raises(SystemExit):
        mask.run(args)
def test_run_handle_missing_sequence_file(self, vcf_file, argparser):
    """Delete the sequence file and expect ``mask.run`` to exit.

    A masking option is supplied so that the exit can be attributed to the
    missing file. A bare ``-s`` invocation already raises ``SystemExit``
    (see ``test_run_requires_some_masking``), which would let this test
    pass even if the missing-file case were not handled.
    """
    os.remove(vcf_file)
    # Same masking option as test_run_handle_empty_sequence_file.
    args = argparser("-s %s --mask-sites 1" % vcf_file)
    with pytest.raises(SystemExit):
        mask.run(args)
def test_run_handle_empty_mask_file(self, vcf_file, bed_file, argparser):
    """Truncate the mask (BED) file to zero bytes and expect ``mask.run`` to exit."""
    # Opening in write mode empties the file.
    with open(bed_file, "w"):
        pass
    command = "-s %s --mask %s" % (vcf_file, bed_file)
    parsed = argparser(command)
    with pytest.raises(SystemExit):
        mask.run(parsed)
def test_run_handle_missing_mask_file(self, vcf_file, bed_file, argparser):
    """Delete the mask (BED) file and expect ``mask.run`` to exit."""
    os.remove(bed_file)
    command = "-s %s --mask %s" % (vcf_file, bed_file)
    parsed = argparser(command)
    with pytest.raises(SystemExit):
        mask.run(parsed)