def test_read_sequences_from_multiple_files_with_different_compression(
    self, fasta_filename, gzip_fasta_filename, lzma_fasta_filename
):
    """Read three FASTA inputs in one ``read_sequences`` call.

    The fixtures are named for an uncompressed, a gzip and an lzma file;
    together they are expected to yield nine records.
    """
    input_paths = (fasta_filename, gzip_fasta_filename, lzma_fasta_filename)
    records = list(read_sequences(*input_paths, format="fasta"))
    assert len(records) == 9
def test_read_sequences_from_multiple_files_or_buffers(
    self, fasta_filename, additional_fasta_filename
):
    """Pass an already-open file handle together with a filename.

    Six records are expected across the two inputs.
    """
    with open(fasta_filename) as fasta_handle:
        records = read_sequences(
            fasta_handle,
            additional_fasta_filename,
            format="fasta",
        )
        # Count inside the ``with`` so the handle is still open while the
        # records are being consumed.
        record_count = len(list(records))
        assert record_count == 6
except FileNotFoundError as error: print(f"ERROR: {error}", file=sys.stderr) sys.exit(1) else: sequence_files.append(sequence_filename) # Replace whitespace and everything following pipes with nothing. pattern = "( )|(\|.*)" if args.strip_prefixes: prefixes = "|".join(args.strip_prefixes) pattern = f"^({prefixes})|{pattern}" with open_file(args.output, "w") as output_handle: # In order to prefer the latter files, we have to reverse the order of # the files. sequences = read_sequences(*reversed(sequence_files)) renamed_sequences = rename_sequences(sequences, pattern) deduplicated_sequences = drop_duplicate_sequences( renamed_sequences, args.error_on_duplicate_strains ) try: for sequence in deduplicated_sequences: write_sequences(sequence, output_handle) except DuplicateSequenceError as error: print( f"ERROR: The following strains have duplicate sequences: {error}", file=sys.stderr ) sys.exit(1)
nargs='+', type=int, help="list of sites to mask") parser.add_argument("--output", required=True, help="FASTA file of output alignment") args = parser.parse_args() begin_length = 0 if args.mask_from_beginning: begin_length = args.mask_from_beginning end_length = 0 if args.mask_from_end: end_length = args.mask_from_end with open_file(args.output, 'w') as outfile: for record in read_sequences(args.alignment): seq = str(record.seq) if args.mask_terminal_gaps: seq = mask_terminal_gaps(seq) start = "N" * begin_length middle = seq[begin_length:-end_length] end = "N" * end_length seq_list = list(start + middle + end) if args.mask_sites: for site in args.mask_sites: seq_list[site - 1] = "N" record.seq = Seq("".join(seq_list)) write_sequences(record, outfile)
parser.add_argument('--output', type=str, metavar="FASTA", required=True, help="output FASTA") args = parser.parse_args() sequence_hash_by_name = {} duplicate_strains = set() counter = 0 with open(args.output, "w") as output_handle: # Stream sequences from all input files into a single output file, # skipping duplicate records (same strain and sequence) and noting # mismatched sequences for the same strain name. for record in read_sequences(*args.input): counter += 1 if counter % 10000 == 0: print(f"Processed {counter} records") # Hash each sequence and check whether another sequence with the # same name already exists and if the hash is different. sequence_hash = hashlib.sha256(str( record.seq).encode("utf-8")).hexdigest() if record.name in sequence_hash_by_name: # If the hashes differ (multiple entries with the same # strain name but different sequences), we keep the first # sequence and add the strain to a list of duplicates to # report at the end. if sequence_hash_by_name.get(record.name) != sequence_hash: duplicate_strains.add(record.name)
import argparse
import re

from augur.io import open_file, read_sequences, write_sequences


if __name__ == '__main__':
    # Command-line interface: one or more sequence inputs, optional prefixes
    # to strip from sequence ids, and a single output destination.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--sequences",
        nargs="+",
        required=True,
        help="sequences to be sanitized",
    )
    parser.add_argument(
        "--strip-prefixes",
        nargs="+",
        help="prefixes to strip from strain names in the sequences",
    )
    parser.add_argument(
        "--output",
        required=True,
        help="sanitized sequences",
    )
    args = parser.parse_args()

    # Build a regex alternation of the prefixes, anchored at the start of the
    # id. Without prefixes the pattern is empty and substitution is a no-op.
    pattern = ""
    if args.strip_prefixes:
        pattern = "^(" + "|".join(args.strip_prefixes) + ")"

    # Stream every record from the inputs to the output, rewriting only its id.
    with open_file(args.output, "w") as output_handle:
        for sequence in read_sequences(*args.sequences):
            sequence.id = re.sub(pattern, "", sequence.id)
            write_sequences(sequence, output_handle)
type=int, default=10000, help= "number of samples in the global alignment to process at once. Reduce this number to reduce memory usage at the cost of increased run-time." ) parser.add_argument("--output", type=str, required=True, help="FASTA file of output alignment") args = parser.parse_args() # load entire alignment and the alignment of focal sequences (upper case -- probably not necessary) ref = sequence_to_int_array(SeqIO.read(args.reference, 'fasta').seq) alignment_length = len(ref) focal_seqs = read_sequences(args.focal_alignment) focal_seqs_dict = calculate_snp_matrix(focal_seqs, consensus=ref, ignore_seqs=args.ignore_seqs) if focal_seqs_dict is None: print( f"ERROR: There are no valid sequences in the focal alignment, '{args.focal_alignment}', to compare against the full alignment.", "Check your subsampling settings for the focal alignment or consider disabling proximity-based subsampling.", file=sys.stderr) sys.exit(1) seqs = read_sequences(args.alignment) # export priorities fh_out = open(args.output, 'w')
import re

from augur.io import open_file, read_sequences, write_sequences


if __name__ == '__main__':
    # Command-line interface: one or more sequence inputs, optional prefixes
    # to strip from sequence ids, and a single output destination.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--sequences",
        nargs="+",
        required=True,
        help="sequences to be sanitized",
    )
    parser.add_argument(
        "--strip-prefixes",
        nargs="+",
        help="prefixes to strip from strain names in the sequences",
    )
    parser.add_argument(
        "--output",
        required=True,
        help="sanitized sequences",
    )
    args = parser.parse_args()

    # Build a regex alternation of the prefixes, anchored at the start of the
    # id. Without prefixes the pattern is empty and substitution is a no-op.
    pattern = ""
    if args.strip_prefixes:
        pattern = "^(" + "|".join(args.strip_prefixes) + ")"

    with open_file(args.output, "w") as output_handle:
        # Inputs are read last-to-first so that records from the latter files
        # are emitted first (the stated intent is to prefer the latter files).
        inputs_latest_first = reversed(args.sequences)
        for sequence in read_sequences(*inputs_latest_first):
            sequence.id = re.sub(pattern, "", sequence.id)
            write_sequences(sequence, output_handle)
def test_read_sequences_from_multiple_files(
    self, fasta_filename, additional_fasta_filename
):
    """Read two FASTA files in one call and expect six records in total."""
    input_paths = (fasta_filename, additional_fasta_filename)
    records = list(read_sequences(*input_paths, format="fasta"))
    assert len(records) == 6
def test_read_sequences_from_single_file(self, fasta_filename):
    """Read one FASTA file and expect exactly three records."""
    records = list(read_sequences(fasta_filename, format="fasta"))
    assert len(records) == 3
def test_read_single_genbank_record_from_a_path(self, genbank_reference):
    """Read the first GenBank record when the input is given as a ``Path``."""
    reference_path = Path(genbank_reference)
    records = read_sequences(reference_path, format="genbank")
    first_record = next(records)
    assert first_record.id == "KX369547.1"
def test_read_single_fasta_record(self, fasta_filename):
    """Pull only the first record from a FASTA file and check its id."""
    records = read_sequences(fasta_filename, format="fasta")
    first_record = next(records)
    assert first_record.id == "SEQ_1"
parser.add_argument('--input', type=str, nargs="+", metavar="FASTA", required=True, help="input FASTAs") parser.add_argument('--warn-about-duplicates', action="store_true", help="warn the user about duplicate strains instead of exiting with an error. The output will include the first occurrence of a duplicate sequence.") parser.add_argument('--output', type=str, metavar="FASTA", required=True, help="output FASTA") args = parser.parse_args() sequence_hash_by_name = {} duplicate_strains = set() counter = 0 with open_file(args.output, "w") as output_handle: # Stream sequences from all input files into a single output file, # skipping duplicate records (same strain and sequence) and noting # mismatched sequences for the same strain name. In order to # prefer the latter files, we have to reverse the order of the # files. for record in read_sequences(*reversed(args.input)): counter += 1 if counter % 10000 == 0: print(f"Processed {counter} records") # Hash each sequence and check whether another sequence with the # same name already exists and if the hash is different. sequence_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest() if record.name in sequence_hash_by_name: # If the hashes differ (multiple entries with the same # strain name but different sequences), we keep the first # sequence and add the strain to a list of duplicates to # report at the end. if sequence_hash_by_name.get(record.name) != sequence_hash: duplicate_strains.add(record.name)