def get_cds(ref):
    """Return the reference sequence together with its coding region.

    Assumes there is one contiguous coding region which might be split into
    multiple sub-proteins like HA1 and HA2. The coding region is taken as the
    union of all features of type ``'CDS'``: it runs from the smallest
    feature start to the largest feature end.

    Parameters
    ----------
    ref
        Reference record exposing ``features`` (each with ``type`` and
        ``location.start`` / ``location.end``) and ``seq``.

    Returns
    -------
    tuple
        ``(refstr, refCDS, refAA, cds_start, cds_end)`` where ``refstr`` is
        the upper-cased full sequence, ``refCDS`` is
        ``refstr[cds_start:cds_end]`` and ``refAA`` is the result of
        ``safe_translate`` applied to ``refCDS``.

    Raises
    ------
    ValueError
        If the reference has no feature of type ``'CDS'``. Previously this
        case surfaced as an obscure ``TypeError`` from slicing with ``inf``.
    """
    cds_locations = [
        feature.location for feature in ref.features if feature.type == 'CDS'
    ]
    if not cds_locations:
        raise ValueError("reference has no features of type 'CDS'")

    cds_start = min(location.start for location in cds_locations)
    cds_end = max(location.end for location in cds_locations)

    refstr = str(ref.seq).upper()
    refCDS = refstr[cds_start:cds_end]
    # translate the slice once instead of re-slicing the reference
    refAA = safe_translate(refCDS)
    return refstr, refCDS, refAA, cds_start, cds_end
def codon_align(seq, refstr, refAA, cds_start, cds_end):
    """Align ``seq`` to the reference so that its coding region is in frame.

    The nucleotide sequence is first aligned pairwise against ``refstr`` and
    every alignment column that is a gap in the reference is dropped. The
    part between ``cds_start`` and ``cds_end`` is then stripped of gaps,
    translated, and aligned to ``refAA`` on the amino-acid level. The
    returned sequence is rebuilt codon by codon from that amino-acid
    alignment.

    Parameters
    ----------
    seq
        Record exposing ``seq`` and ``id``.
    refstr
        Reference nucleotide sequence as a string.
    refAA
        Translation of the reference coding region.
    cds_start, cds_end
        Start and end of the coding region, used as slice bounds on the
        reference-coordinate alignment.

    Returns
    -------
    str or None
        5' UTR + codon-aligned coding region + 3' UTR, or ``None`` when
        either alignment reports a negative score or the translation looks
        broken (more than 5 ``*``/``X`` characters, or more than 5 gaps in
        the aligned reference protein). In the latter case a message is
        written to stderr.
    """
    seqstr = str(seq.seq).upper()
    score, refaln, seqaln = align_pairwise(refstr, seqstr)
    if score < 0:  # did not align
        return None

    ref_aln_array = np.array(list(refaln))
    seq_aln_array = np.array(list(seqaln))

    # strip gaps: keep only the columns where the reference has a base
    seq_aln_array_ungapped = seq_aln_array[ref_aln_array != '-']

    seq5pUTR = "".join(seq_aln_array_ungapped[:cds_start])
    seq3pUTR = "".join(seq_aln_array_ungapped[cds_end:])
    seqCDS = "".join(seq_aln_array_ungapped[cds_start:cds_end])
    seqCDS_ungapped = seqCDS.replace('-', '')
    seqAA = safe_translate(seqCDS_ungapped)

    scoreAA, refalnAA, seqalnAA = align_pairwise(refAA, seqAA)
    too_many_bad_residues = sum(seqAA.count(x) for x in ['*', 'X']) > 5
    if scoreAA < 0 or too_many_bad_residues or refalnAA.count('-') > 5:
        print(seq.id, "didn't translate properly", file=sys.stderr)
        return None

    # Collect the pieces in a list and join once at the end.
    pieces = [seq5pUTR]
    pos = 0
    for _aa_ref, aa_seq in zip(refalnAA, seqalnAA):
        if aa_seq == '-':
            # Gap in the protein alignment: emit a gapped codon without
            # consuming nucleotides. seqCDS_ungapped contains no '-', so
            # there is never a gapped codon to skip over here.
            pieces.append('---')
            continue
        codon = seqCDS_ungapped[pos:pos + 3]
        # an incomplete trailing codon is replaced by a gapped codon
        pieces.append(codon if len(codon) == 3 else '---')
        pos += 3

    pieces.append(seq3pUTR)
    return ''.join(pieces)
for seq in alignment: if seq.id == ref.id: continue # read sequence and all its annotated features seq_container = tmpNode() seq_str = str(seq.seq) seq_container.sequences['nuc'] = { i: c for i, c in enumerate(seq_str) } for fname, feat in features.items(): if feat.type != 'source': seq_container.sequences[fname] = { i: c for i, c in enumerate( safe_translate(feat.extract(seq_str))) } # for each clade, check whether it matches any of the clade definitions in the tsv matches = [] for clade_name, clade_alleles in clade_designations.items(): if is_node_in_clade(clade_alleles, seq_container, ref): matches.append(clade_name) # print the last match as clade assignment and all others as ancestral clades # note that this assumes that clades in the tsv are ordered by order of appearence. # furthermore, this will only work if parent clades don't have definitions that exclude # child clades, i.e. positions can only be additive for this to work. if matches: print( f"{seq.description}\t{matches[-1]}\t{', '.join(matches[:-1])}",
# Read the annotated reference record.
ref = SeqIO.read(args.reference, 'genbank')

# get sequence as string, CDS seq, amino acid sequence, and start/end pos.
# get_cds assumes there is one contiguous coding region which might be
# split into multiple sub-proteins like HA1 and HA2, and takes the min and
# max over the union of all CDS features.
refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

alignment = []
for seq in aln:
    seqstr = str(seq.seq).upper()
    score, refaln, seqaln = align_pairwise(refstr, seqstr)
    if score < 0:  # did not align
        continue
    ref_aln_array = np.array(list(refaln))
    seq_aln_array = np.array(list(seqaln))

    # strip gaps: keep only the alignment columns where the reference
    # has a base
    ungapped = ref_aln_array != '-'
    ref_aln_array_ungapped = ref_aln_array[ungapped]
    seq_aln_array_ungapped = seq_aln_array[ungapped]
# Load the reference record, its annotated features and the clade
# definitions for the requested lineage.
ref = SeqIO.read(refname, 'genbank')
features = load_features(refname)
clade_designations = read_in_clade_definitions(
    f"config/clades_{args.lineage}_ha.tsv")

# get sequence as string, CDS seq, amino acid sequence, and start/end pos
refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

alignment = []
for seq in seqs:
    seq_container = tmpNode()
    seq_aln = codon_align(seq, refstr, refAA, cds_start, cds_end)
    if seq_aln is None:
        # codon_align returned nothing for this sequence: report and skip
        print(f"{seq.id}\tnot translatable", file=sys.stdout)
        continue

    # position -> character lookup for the nucleotide sequence ...
    seq_container.sequences['nuc'] = dict(enumerate(seq_aln))
    # ... and for the translation of every feature except 'source'
    for fname, feat in features.items():
        if feat.type == 'source':
            continue
        seq_container.sequences[fname] = dict(
            enumerate(safe_translate(feat.extract(seq_aln))))

    # names of all clades whose definition this sequence satisfies,
    # in the order in which the definitions are stored
    matches = [
        clade_name
        for clade_name, clade_alleles in clade_designations.items()
        if is_node_in_clade(clade_alleles, seq_container, ref)
    ]
    print(f"{seq.description}\t{', '.join(matches)}", file=sys.stdout)
# random 20-letter string; not used within this fragment
tmp_str = "".join(sample('ABCDEFGHILKLMOPQRSTUVWXYZ', 20))
ref = SeqIO.read(args.reference_sequence, 'genbank')

# get sequence as string, CDS seq, amino acid sequence, and start/end pos
features_to_translate = load_features(args.reference_sequence, args.genes)
refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

# codon-align every sequence; keep only those for which codon_align
# returned a non-empty result
alignment = []
for seq in sequences:
    seq_aln = codon_align(seq, refstr, refAA, cds_start, cds_end)
    if seq_aln:
        seq.seq = Seq.Seq(seq_aln)
        alignment.append(seq)

# NOTE(review): `region` and `time_interval` are not defined anywhere in the
# visible fragment -- confirm they exist in the enclosing scope.
print("selected %d for region %s and date interval %f-%f"%(len(sequences), region, time_interval[0], time_interval[1]))

# translate each requested gene for all aligned sequences and write one
# fasta file per gene
for gene, fname in zip(args.genes, args.output):
    if gene not in features_to_translate:
        continue

    seqs = []
    feature = features_to_translate[gene]
    for seq in alignment:
        try:
            translation = SeqRecord.SeqRecord(
                seq=Seq.Seq(safe_translate(str(feature.extract(seq.seq)))),
                id=seq.name, name=seq.name, description='')
        except Exception:
            # best effort: warn and skip this sequence, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare `except:` would
            print("WARN:",seq.name,"did not translate")
        else:
            seqs.append(translation)

    SeqIO.write(seqs, fname, 'fasta')
for seq in alignment: if seq.id==ref.id: continue if len(seq.seq)!=len(ref.seq): import ipdb; ipdb.set_trace() print(f"ERROR: this file doesn't seem aligned to the reference. {seq.id} as length {len(seq.seq)} while the reference has length {len(ref.seq)}.") sys.exit(1) # read sequence and all its annotated features seq_container = tmpNode() seq_str = str(seq.seq) seq_container.sequences['nuc'] = {i:c for i,c in enumerate(seq_str)} for fname, feat in features.items(): if feat.type != 'source': seq_container.sequences[fname] = {i:c for i,c in enumerate(safe_translate(feat.extract(seq_str)))} # for each clade, check whether it matches any of the clade definitions in the tsv matches = [] for clade_name, clade_alleles in clade_designations.items(): if is_node_in_clade(clade_alleles, seq_container, ref): matches.append(clade_name) # print the last match as clade assignment and all others as ancestral clades # note that this assumes that clades in the tsv are ordered by order of appearence. # furthermore, this will only work if parent clades don't have definitions that exclude # child clades, i.e. positions can only be additive for this to work. if matches: print(f"{seq.description}\t{matches[-1]}\t{', '.join(matches[:-1])}", file=output) else: