def test_write_genbank_sequence(self, tmpdir, genbank_reference):
    """Write one GenBank record and expect the reported count to be one."""
    record = SeqIO.read(genbank_reference, "genbank")
    destination = Path(tmpdir) / "new_sequences.fasta"

    written_count = write_sequences([record], destination, "genbank")

    assert written_count == 1
def test_write_single_set_of_sequences_to_lzma_file(self, tmpdir, sequences):
    """Write FASTA records to an ``.xz`` path and read them back with lzma.

    The reported count must match both the number of input sequences and the
    number of FASTA header lines found in the decompressed output.
    """
    compressed_path = Path(tmpdir) / "new_sequences.fasta.xz"

    written_count = write_sequences(sequences, compressed_path, "fasta")
    assert written_count == len(sequences)

    # Each FASTA record starts with a ">" header line, so counting those
    # lines counts the records that made it into the file.
    with lzma.open(compressed_path, "rt") as handle:
        header_lines = [line for line in handle if line.startswith(">")]
        assert written_count == len(header_lines)
def test_write_sequences_by_external_handle(self, tmpdir, sequences):
    """Write records one at a time through a handle opened by the caller.

    The per-call counts returned by ``write_sequences`` are summed and
    compared with the number of FASTA headers in the resulting file.
    """
    destination = Path(tmpdir) / "new_sequences.fasta"

    with open_file(destination, "w") as out_handle:
        written_total = sum(
            write_sequences(record, out_handle) for record in sequences
        )

    # Count ">" header lines to find how many records were actually written.
    with open(destination, "r") as in_handle:
        header_lines = [line for line in in_handle if line.startswith(">")]
        assert written_total == len(header_lines)
prefixes = "|".join(args.strip_prefixes) pattern = f"^({prefixes})|{pattern}" with open_file(args.output, "w") as output_handle: # In order to prefer the latter files, we have to reverse the order of # the files. sequences = read_sequences(*reversed(sequence_files)) renamed_sequences = rename_sequences(sequences, pattern) deduplicated_sequences = drop_duplicate_sequences( renamed_sequences, args.error_on_duplicate_strains ) try: for sequence in deduplicated_sequences: write_sequences(sequence, output_handle) except DuplicateSequenceError as error: print( f"ERROR: The following strains have duplicate sequences: {error}", file=sys.stderr ) sys.exit(1) # Clean up any open sequence files. for sequence_file in sequence_files: if hasattr(sequence_file, "close"): sequence_file.close() # Clean up any open tarballs. for tar_handle in tar_handles: tar_handle.close()
nargs='+', type=int, help="list of sites to mask") parser.add_argument("--output", required=True, help="FASTA file of output alignment") args = parser.parse_args() begin_length = 0 if args.mask_from_beginning: begin_length = args.mask_from_beginning end_length = 0 if args.mask_from_end: end_length = args.mask_from_end with open_file(args.output, 'w') as outfile: for record in read_sequences(args.alignment): seq = str(record.seq) if args.mask_terminal_gaps: seq = mask_terminal_gaps(seq) start = "N" * begin_length middle = seq[begin_length:-end_length] end = "N" * end_length seq_list = list(start + middle + end) if args.mask_sites: for site in args.mask_sites: seq_list[site - 1] = "N" record.seq = Seq("".join(seq_list)) write_sequences(record, outfile)
def test_write_sequences_from_generator(self, tmpdir, sequences_generator):
    """Write records supplied by a generator fixture to a FASTA file.

    The expected count is hard-coded to three, so the ``sequences_generator``
    fixture is presumably expected to yield exactly three records.
    """
    destination = Path(tmpdir) / "new_sequences.fasta"

    written_count = write_sequences(sequences_generator, destination, "fasta")

    assert written_count == 3
def test_write_sequences(self, tmpdir, sequences):
    """Write the ``sequences`` fixture to FASTA and check the reported count."""
    destination = Path(tmpdir) / "new_sequences.fasta"

    written_count = write_sequences(sequences, destination, "fasta")

    assert written_count == len(sequences)
# same name already exists and if the hash is different. sequence_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest() if record.name in sequence_hash_by_name: # If the hashes differ (multiple entries with the same # strain name but different sequences), we keep the first # sequence and add the strain to a list of duplicates to # report at the end. if sequence_hash_by_name.get(record.name) != sequence_hash: duplicate_strains.add(record.name) # If the current strain has been seen before, don't write # out its sequence again. continue sequence_hash_by_name[record.name] = sequence_hash write_sequences(record, output_handle) if len(duplicate_strains) > 0: error_mode = "ERROR" exit_code = 1 if args.warn_about_duplicates: error_mode = "WARNING" exit_code = 0 print( f"{error_mode}: Detected the following duplicate input strains with different sequences:", file=sys.stderr ) for strain in duplicate_strains: print(textwrap.indent(strain, " "), file=sys.stderr)