parser.add_argument("--keep-temporary-files", action='store_true', help="don't clean up") parser.add_argument("--chunk-size", default=10, type=int, help="don't clean up") parser.add_argument("--nthreads", default=1, type=int, help="Number of threads to use in alignment") args = parser.parse_args() #refname = f"config/reference.gb" refname = args.gbk features = load_features(refname) seqs = SeqIO.parse(args.sequences, 'fasta') ref = SeqIO.read(refname, 'genbank') #clade_designations = read_in_clade_definitions(f"config/clades.tsv") clade_designations = read_in_clade_definitions(args.clade) log_fname = "clade_assignment.log" in_fname = "clade_assignment_tmp.fasta" out_fname = "clade_assignment_tmp_alignment.fasta" output = open(args.output, 'w') print('name\tclade\tparent clades', file=output) # break the sequences into chunks, align each to the reference, and assign clades one-by-one done = False while not done:
parser = argparse.ArgumentParser( description="Add translations", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument('--tree', type=str, required=True, help="input tree") parser.add_argument('--reference', type=str, required=True, help="reference genbank sequence") parser.add_argument('--translations', type=str, nargs='+', required=True, help="amino acid alignment") parser.add_argument('--genes', type=str, nargs='+', required=True, help="amino acid alignment") parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") args = parser.parse_args() genes = args.genes if type(args.genes)==list else [args.genes] translations = args.translations if type(args.translations)==list else [args.translations] ref = SeqIO.read(args.reference, format='genbank') features = load_features(args.reference) if not set(features.keys())==set(args.genes): print("WARNING: supplied genes don't match the annotation") print("the following features are in the annotation by not supplied as genes:", set(features.keys()).difference(args.genes)) print("the following features are in the supplied as genes but not the annotation:", set(args.genes).difference(features.keys())) T = Phylo.read(args.tree, 'newick') leafs = {n.name for n in T.get_terminals()} node_data = {} root_sequence_translations = {} for gene, translation in zip(genes, translations): seqs = [] for s in SeqIO.parse(translation, 'fasta'): if s.id in leafs:
# Keep only the sequences that have metadata and fall inside the requested
# region and the half-open date interval [time_interval[0], time_interval[1]).
sequences = []
for seq in SeqIO.parse(args.sequences, 'fasta'):
    if seq.name in metadata:
        record = metadata[seq.name]  # look the entry up once instead of three times
        if (time_interval[0] <= record["num_date"] < time_interval[1]
                and record["region"] == region):
            sequences.append(seq)

# Suffix of 20 sampled letters used in the temporary file names below.
tmp_str = "".join(sample('ABCDEFGHILKLMOPQRSTUVWXYZ', 20))
# exist_ok avoids the check-then-create race of `isdir` followed by `mkdir`
# when another process creates 'tmp' in between.
os.makedirs('tmp', exist_ok=True)

print("selected %d for region %s and date interval %f-%f"
      % (len(sequences), region, time_interval[0], time_interval[1]))
features_to_translate = load_features(args.reference_sequence, args.genes)

tmp_file = "tmp/sequence_file_%s_%s.fasta" % (region, tmp_str)
tmp_file_out = "tmp/sequence_file_%s_%s_aln.fasta" % (region, tmp_str)
SeqIO.write(sequences, tmp_file, 'fasta')

# Run the alignment on the selected sequences; a truthy return value is
# treated as a failure and used as the process exit status.
fail = align.run(
    pseudo_args(
        sequences=tmp_file,
        reference_sequence=args.reference_sequence,
        output=tmp_file_out,
        reference_name=None,
        remove_reference=True,
        method='mafft',
        nthreads=1,
        fill_gaps=False,
    )
)
if fail:
    sys.exit(fail)
if __name__ == "__main__": parser = argparse.ArgumentParser( description="Translate nucleotide sequences to amino acid sequences for the requested genes", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument('--sequences', required=True, help='FASTA file of nucleotide sequences to translate') parser.add_argument('--reference-sequence', required=True, help='GenBank or GFF file containing the annotation') parser.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)") parser.add_argument('--output', nargs='+', help="FASTA files of amino acid sequences per gene") args = parser.parse_args() # Load features for requested genes. features = load_features(args.reference_sequence, args.genes) # Load sequences indexed by sequence id. sequences = { sequence.id: str(sequence.seq) for sequence in Bio.SeqIO.parse(args.sequences, "fasta") if "N" not in str(sequence.seq) } #if sorted(set(list(str(sequence.seq)))) == ["A", "C", "G", "T"] # Translate requested genes. translations = {} invalid_samples = set() for feature_name, output_file in zip(args.genes, args.output): translations[feature_name] = translate_feature(sequences, features[feature_name]) records = [