Beispiel #1
0
def get_cds(ref):
    '''
    Extract the coding region of a reference record.

    Assumes there is one contiguous coding region which might be
    split into multiple sub-proteins like HA1 and HA2. The region is
    taken as the union of all features of type 'CDS', i.e. it runs from
    the smallest feature start to the largest feature end.

    Parameters
    ----------
    ref : record object
        must expose ``seq`` and ``features``; every feature needs a
        ``type`` and CDS features a ``location`` with ``start``/``end``.

    Returns
    -------
    tuple
        ``(refstr, refCDS, refAA, cds_start, cds_end)`` where ``refstr``
        is the upper-cased sequence, ``refCDS`` the slice
        ``refstr[cds_start:cds_end]`` and ``refAA`` the result of
        ``safe_translate`` applied to that slice.

    Raises
    ------
    ValueError
        if the record has no feature of type 'CDS'.
    '''
    cds_locations = [feature.location for feature in ref.features
                     if feature.type=='CDS']
    if not cds_locations:
        # without this guard an infinite start sentinel would end up as a
        # slice index and fail with an unhelpful TypeError
        raise ValueError("reference %s has no CDS feature"
                         % getattr(ref, 'id', '<unknown>'))

    cds_start = min(location.start for location in cds_locations)
    # the end never drops below 0, matching the previous initial value
    cds_end = max([0] + [location.end for location in cds_locations])

    refstr = str(ref.seq).upper()
    refCDS = refstr[cds_start:cds_end]
    refAA = safe_translate(refCDS)
    return refstr, refCDS, refAA, cds_start, cds_end
Beispiel #2
0
def codon_align(seq, refstr, refAA, cds_start, cds_end):
    '''
    Align a sequence to the reference, placing gaps in the coding region
    as whole codons.

    The sequence is first aligned to ``refstr`` on the nucleotide level and
    reduced to the columns present in the reference. The coding part
    (``cds_start:cds_end``) is then translated without its gaps and aligned
    to ``refAA``; every gap in the aligned amino acid sequence becomes a
    '---' codon in the output.

    Parameters
    ----------
    seq : record object
        must expose ``seq`` and ``id``.
    refstr : str
        reference nucleotide sequence.
    refAA : str
        translation of the reference coding region.
    cds_start, cds_end : int
        boundaries of the coding region in ``refstr``.

    Returns
    -------
    str or None
        the 5' part, the codon-aligned coding region and the 3' part
        concatenated, or None if either alignment has a negative score,
        the translation contains more than 5 '*'/'X' characters, or the
        aligned reference translation contains more than 5 gaps.
    '''
    seqstr = str(seq.seq).upper()
    score, refaln, seqaln = align_pairwise(refstr, seqstr)
    if score<0: # did not align
        return None

    # strip gaps: keep only alignment columns where the reference has a base
    # so that indices below are positions in refstr
    ungapped = np.array(list(refaln))!='-'
    seq_aln_array_ungapped = np.array(list(seqaln))[ungapped]

    seq5pUTR = "".join(seq_aln_array_ungapped[:cds_start])
    seq3pUTR = "".join(seq_aln_array_ungapped[cds_end:])
    seqCDS = "".join(seq_aln_array_ungapped[cds_start:cds_end])
    seqCDS_ungapped = seqCDS.replace('-', '')
    seqAA = safe_translate(seqCDS_ungapped)

    scoreAA, refalnAA, seqalnAA = align_pairwise(refAA, seqAA)
    if scoreAA<0 or sum(seqAA.count(x) for x in ['*', 'X'])>5 or refalnAA.count('-')>5:
        print(seq.id, "didn't translate properly", file=sys.stderr)
        return None

    # thread the ungapped nucleotides back onto the amino acid alignment
    codons = []
    pos = 0
    for _, aa_seq in zip(refalnAA, seqalnAA):
        if aa_seq=='-':
            # a gap consumes no nucleotides: seqCDS_ungapped contains no '-',
            # so pos stays where it is
            codons.append('---')
            continue
        codon = seqCDS_ungapped[pos:pos+3]
        # an incomplete trailing codon is written as a gap
        codons.append(codon if len(codon)==3 else '---')
        pos += 3

    return seq5pUTR + ''.join(codons) + seq3pUTR
Beispiel #3
0
        for seq in alignment:
            if seq.id == ref.id:
                continue
            # read sequence and all its annotated features
            seq_container = tmpNode()
            seq_str = str(seq.seq)
            seq_container.sequences['nuc'] = {
                i: c
                for i, c in enumerate(seq_str)
            }
            for fname, feat in features.items():
                if feat.type != 'source':
                    seq_container.sequences[fname] = {
                        i: c
                        for i, c in enumerate(
                            safe_translate(feat.extract(seq_str)))
                    }

            # for each clade, check whether it matches any of the clade definitions in the tsv
            matches = []
            for clade_name, clade_alleles in clade_designations.items():
                if is_node_in_clade(clade_alleles, seq_container, ref):
                    matches.append(clade_name)

            # print the last match as clade assignment and all others as ancestral clades
            # note that this assumes that clades in the tsv are ordered by order of appearance.
            # furthermore, this will only work if parent clades don't have definitions that exclude
            # child clades, i.e. positions can only be additive for this to work.
            if matches:
                print(
                    f"{seq.description}\t{matches[-1]}\t{', '.join(matches[:-1])}",
    ref = SeqIO.read(args.reference, 'genbank')

    # assuming there is one contiguous coding region which might be
    # split into multiple sub-proteins like HA1 and HA2.
    # loop over all features, pull out min and max of their union
    cds_start, cds_end = np.inf, 0
    for feature in ref.features:
        if feature.type == 'CDS':
            if feature.location.start < cds_start:
                cds_start = feature.location.start
            if feature.location.end > cds_end:
                cds_end = feature.location.end

    refstr = str(ref.seq).upper()
    refCDS = refstr[cds_start:cds_end]
    refAA = safe_translate(refstr[cds_start:cds_end])

    alignment = []
    for seq in aln:
        seqstr = str(seq.seq).upper()
        score, refaln, seqaln = align_pairwise(refstr, seqstr)
        if score < 0:  # did not align
            continue
        ref_aln_array = np.array(list(refaln))
        seq_aln_array = np.array(list(seqaln))

        # strip gaps
        ungapped = ref_aln_array != '-'
        ref_aln_array_ungapped = ref_aln_array[ungapped]
        seq_aln_array_ungapped = seq_aln_array[ungapped]
Beispiel #5
0
    ref = SeqIO.read(refname, 'genbank')
    features = load_features(refname)
    clade_designations = read_in_clade_definitions(
        f"config/clades_{args.lineage}_ha.tsv")

    # get sequence as string, CDS seq, amino acid sequence, and start/end pos
    refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

    alignment = []
    for seq in seqs:
        seq_container = tmpNode()
        seq_aln = codon_align(seq, refstr, refAA, cds_start, cds_end)
        if seq_aln is None:
            print(f"{seq.id}\tnot translatable", file=sys.stdout)
            continue

        seq_container.sequences['nuc'] = {i: c for i, c in enumerate(seq_aln)}
        for fname, feat in features.items():
            if feat.type != 'source':
                seq_container.sequences[fname] = {
                    i: c
                    for i, c in enumerate(safe_translate(feat.extract(
                        seq_aln)))
                }

        matches = []
        for clade_name, clade_alleles in clade_designations.items():
            if is_node_in_clade(clade_alleles, seq_container, ref):
                matches.append(clade_name)
        print(f"{seq.description}\t{', '.join(matches)}", file=sys.stdout)
    tmp_str = "".join(sample('ABCDEFGHILKLMOPQRSTUVWXYZ', 20))

    ref = SeqIO.read(args.reference_sequence, 'genbank')

    # get sequence as string, CDS seq, amino acid sequence, and start/end pos
    features_to_translate = load_features(args.reference_sequence, args.genes)
    refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

    alignment = []
    for seq in sequences:
        seq_aln = codon_align(seq,  refstr, refAA, cds_start, cds_end)
        if seq_aln:
            seq.seq=Seq.Seq(seq_aln)
            alignment.append(seq)

    print("selected %d for region %s and date interval %f-%f"%(len(sequences), region, time_interval[0], time_interval[1]))

    for gene, fname in zip(args.genes, args.output):
        if gene not in features_to_translate:
            continue
        seqs = []
        feature = features_to_translate[gene]
        for seq in alignment:
            try:
                translation =  SeqRecord.SeqRecord(seq=Seq.Seq(safe_translate(str(feature.extract(seq.seq)))),
                                                   id=seq.name, name=seq.name, description='')
                seqs.append(translation)
            except:
                print("WARN:",seq.name,"did not translate")
        SeqIO.write(seqs, fname, 'fasta')
        for seq in alignment:
            if seq.id==ref.id:
                continue
            if len(seq.seq)!=len(ref.seq):
                import ipdb; ipdb.set_trace()
                print(f"ERROR: this file doesn't seem aligned to the reference. {seq.id} as length {len(seq.seq)} while the reference has length {len(ref.seq)}.")
                sys.exit(1)

            # read sequence and all its annotated features
            seq_container = tmpNode()
            seq_str = str(seq.seq)
            seq_container.sequences['nuc'] = {i:c for i,c in enumerate(seq_str)}
            for fname, feat in features.items():
                if feat.type != 'source':
                    seq_container.sequences[fname] = {i:c for i,c in enumerate(safe_translate(feat.extract(seq_str)))}

            # for each clade, check whether it matches any of the clade definitions in the tsv
            matches = []
            for clade_name, clade_alleles in clade_designations.items():
                if is_node_in_clade(clade_alleles, seq_container, ref):
                    matches.append(clade_name)


            # print the last match as clade assignment and all others as ancestral clades
            # note that this assumes that clades in the tsv are ordered by order of appearance.
            # furthermore, this will only work if parent clades don't have definitions that exclude
            # child clades, i.e. positions can only be additive for this to work.
            if matches:
                print(f"{seq.description}\t{matches[-1]}\t{', '.join(matches[:-1])}", file=output)
            else: