def test_read_in_clade_definitions_simple():
    """Parse the simple clade fixture and compare it to the expected mapping.

    The expected result maps each clade name to a list of three-element
    tuples, e.g. ``('ctpE', 80, 'D')``.
    """
    fixture_path = "tests/data/clades/simple_clades.tsv"
    expected = {
        'Clade_1': [('ctpE', 80, 'D')],
        'Clade_2': [('nuc', 30641, 'T')],
        'Clade_3': [
            ('nuc', 444295, 'A'),
            ('pks8', 633, 'T'),
        ],
    }

    parsed = read_in_clade_definitions(fixture_path)

    assert parsed == expected
def test_read_in_clade_definitions_inherit_chained():
    """Parse the chained-inheritance fixture and compare to the expected mapping.

    In the expected result, Clade_2 starts with Clade_1's entry and Clade_3
    starts with Clade_2's entries, each followed by one further entry.
    """
    fixture_path = "tests/data/clades/inherit_chained_clades.tsv"
    expected = {
        'Clade_1': [('ctpE', 80, 'D')],
        'Clade_2': [
            ('ctpE', 80, 'D'),
            ('nuc', 30641, 'T'),
        ],
        'Clade_3': [
            ('ctpE', 80, 'D'),
            ('nuc', 30641, 'T'),
            ('pks8', 633, 'T'),
        ],
    }

    parsed = read_in_clade_definitions(fixture_path)

    assert parsed == expected
default=10, type=int, help="don't clean up") parser.add_argument("--nthreads", default=1, type=int, help="Number of threads to use in alignment") args = parser.parse_args() #refname = f"config/reference.gb" refname = args.gbk features = load_features(refname) seqs = SeqIO.parse(args.sequences, 'fasta') ref = SeqIO.read(refname, 'genbank') #clade_designations = read_in_clade_definitions(f"config/clades.tsv") clade_designations = read_in_clade_definitions(args.clade) log_fname = "clade_assignment.log" in_fname = "clade_assignment_tmp.fasta" out_fname = "clade_assignment_tmp_alignment.fasta" output = open(args.output, 'w') print('name\tclade\tparent clades', file=output) # break the sequences into chunks, align each to the reference, and assign clades one-by-one done = False while not done: # generate a chunk with chunk-size sequences chunk = [] while len(chunk) < args.chunk_size and (not done): try:
parser = argparse.ArgumentParser( description="Assign clades to sequences", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--sequences", required=True, help="FASTA file of HA sequences") parser.add_argument("--lineage", required=True, help="lineage of the sequences supplied") args = parser.parse_args() refname = f"config/reference_{args.lineage}_ha.gb" seqs = SeqIO.parse(args.sequences, 'fasta') ref = SeqIO.read(refname, 'genbank') features = load_features(refname) clade_designations = read_in_clade_definitions( f"config/clades_{args.lineage}_ha.tsv") # get sequence as string, CDS seq, amino acid sequence, and start/end pos refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref) alignment = [] for seq in seqs: seq_container = tmpNode() seq_aln = codon_align(seq, refstr, refAA, cds_start, cds_end) if seq_aln is None: print(f"{seq.id}\tnot translatable", file=sys.stdout) continue seq_container.sequences['nuc'] = {i: c for i, c in enumerate(seq_aln)} for fname, feat in features.items(): if feat.type != 'source':
help="process this many sequences at once") parser.add_argument("--nthreads", default=1, type=int, help="Number of threads to use in alignment") args = parser.parse_args() refname = f"defaults/reference_seq.gb" features = load_features(refname) if args.sequences: seqs = SeqIO.parse(args.sequences, 'fasta') else: alignment = SeqIO.parse(args.alignment, 'fasta') ref = SeqIO.read(refname, 'genbank') clade_designations = read_in_clade_definitions(f"defaults/clades.tsv") log_fname = "clade_assignment.log" in_fname = "clade_assignment_tmp.fasta" out_fname = "clade_assignment_tmp_alignment.fasta" output = open(args.output, 'w') print('name\tclade\tparent clades', file=output) # break the sequences into chunks, align each to the reference, and assign clades one-by-one done = False while not done: # if not aligned, align if args.sequences: # generate a chunk with chunk-size sequences chunk = []
group.add_argument("--alignment", help="*aligned* FASTA file of SARS-CoV-2 sequences relative to Wuhan-HU-1 with insertions removed") parser.add_argument("--output", type=str, default='clade_assignment.tsv', help="tsv file to write clade definitions to") parser.add_argument("--keep-temporary-files", action='store_true', help="don't clean up") parser.add_argument("--chunk-size", default=10, type=int, help="process this many sequences at once") parser.add_argument("--nthreads", default=1, type=int, help="Number of threads to use in alignment") args = parser.parse_args() refname = f"config/reference.gb" features = load_features(refname) if args.sequences: seqs = SeqIO.parse(args.sequences, 'fasta') else: alignment = SeqIO.parse(args.alignment, 'fasta') ref = SeqIO.read(refname, 'genbank') clade_designations = read_in_clade_definitions(f"config/clades.tsv") log_fname = "clade_assignment.log" in_fname = "clade_assignment_tmp.fasta" out_fname = "clade_assignment_tmp_alignment.fasta" output = open(args.output, 'w') print('name\tclade\tparent clades', file=output) # break the sequences into chunks, align each to the reference, and assign clades one-by-one done = False while not done: # if not aligned, align if args.sequences: # generate a chunk with chunk-size sequences
def test_read_in_clade_definitions_inheritance_from_self_error():
    """Reading the self-inheritance fixture must raise ``ValueError``."""
    fixture_path = "tests/data/clades/self_inherit_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)
def test_read_in_clade_definitions_inheritance_from_nonexistent_clade_error():
    """Reading the nonexistent-clade-inheritance fixture must raise ``ValueError``."""
    fixture_path = "tests/data/clades/nonexistent_clade_inheritance_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)
def test_read_in_clade_definitions_multiple_inheritance_error():
    """Reading the multiple-inheritance fixture must raise ``ValueError``."""
    fixture_path = "tests/data/clades/multiple_inheritance_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)
def test_read_in_clade_definitions_inherit_cycle_error():
    """Reading the inheritance-cycle fixture must raise ``ValueError``."""
    fixture_path = "tests/data/clades/inherit_cycle_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)