Ejemplo n.º 1
0
def test_read_in_clade_definitions_simple():
    """Check the mapping parsed from the ``simple_clades.tsv`` fixture.

    The result is compared against a dict of clade name -> list of
    3-tuples (two strings around an integer).
    """
    fixture_path = "tests/data/clades/simple_clades.tsv"
    expected = {
        'Clade_1': [('ctpE', 80, 'D')],
        'Clade_2': [('nuc', 30641, 'T')],
        'Clade_3': [('nuc', 444295, 'A'), ('pks8', 633, 'T')],
    }

    observed = read_in_clade_definitions(fixture_path)

    assert observed == expected
Ejemplo n.º 2
0
def test_read_in_clade_definitions_inherit_chained():
    """Check the mapping parsed from the ``inherit_chained_clades.tsv`` fixture.

    In the expected result Clade_2 repeats Clade_1's entry before its own,
    and Clade_3 repeats both of those before its own, so order within each
    list matters for the comparison.
    """
    fixture_path = "tests/data/clades/inherit_chained_clades.tsv"
    clade_1 = [('ctpE', 80, 'D')]
    clade_2 = clade_1 + [('nuc', 30641, 'T')]
    clade_3 = clade_2 + [('pks8', 633, 'T')]
    expected = {
        'Clade_1': clade_1,
        'Clade_2': clade_2,
        'Clade_3': clade_3,
    }

    observed = read_in_clade_definitions(fixture_path)

    assert observed == expected
Ejemplo n.º 3
0
                        default=10,
                        type=int,
                        help="don't clean up")
    parser.add_argument("--nthreads",
                        default=1,
                        type=int,
                        help="Number of threads to use in alignment")
    args = parser.parse_args()

    #refname = f"config/reference.gb"
    refname = args.gbk
    features = load_features(refname)
    seqs = SeqIO.parse(args.sequences, 'fasta')
    ref = SeqIO.read(refname, 'genbank')
    #clade_designations = read_in_clade_definitions(f"config/clades.tsv")
    clade_designations = read_in_clade_definitions(args.clade)

    log_fname = "clade_assignment.log"
    in_fname = "clade_assignment_tmp.fasta"
    out_fname = "clade_assignment_tmp_alignment.fasta"

    output = open(args.output, 'w')
    print('name\tclade\tparent clades', file=output)

    # break the sequences into chunks, align each to the reference, and assign clades one-by-one
    done = False
    while not done:
        # generate a chunk with chunk-size sequences
        chunk = []
        while len(chunk) < args.chunk_size and (not done):
            try:
Ejemplo n.º 4
0
    parser = argparse.ArgumentParser(
        description="Assign clades to sequences",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--sequences",
                        required=True,
                        help="FASTA file of HA sequences")
    parser.add_argument("--lineage",
                        required=True,
                        help="lineage of the sequences supplied")
    args = parser.parse_args()

    refname = f"config/reference_{args.lineage}_ha.gb"
    seqs = SeqIO.parse(args.sequences, 'fasta')
    ref = SeqIO.read(refname, 'genbank')
    features = load_features(refname)
    clade_designations = read_in_clade_definitions(
        f"config/clades_{args.lineage}_ha.tsv")

    # get sequence as string, CDS seq, amino acid sequence, and start/end pos
    refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

    alignment = []
    for seq in seqs:
        seq_container = tmpNode()
        seq_aln = codon_align(seq, refstr, refAA, cds_start, cds_end)
        if seq_aln is None:
            print(f"{seq.id}\tnot translatable", file=sys.stdout)
            continue

        seq_container.sequences['nuc'] = {i: c for i, c in enumerate(seq_aln)}
        for fname, feat in features.items():
            if feat.type != 'source':
Ejemplo n.º 5
0
                        help="process this many sequences at once")
    parser.add_argument("--nthreads",
                        default=1,
                        type=int,
                        help="Number of threads to use in alignment")
    args = parser.parse_args()

    refname = f"defaults/reference_seq.gb"
    features = load_features(refname)
    if args.sequences:
        seqs = SeqIO.parse(args.sequences, 'fasta')
    else:
        alignment = SeqIO.parse(args.alignment, 'fasta')

    ref = SeqIO.read(refname, 'genbank')
    clade_designations = read_in_clade_definitions(f"defaults/clades.tsv")

    log_fname = "clade_assignment.log"
    in_fname = "clade_assignment_tmp.fasta"
    out_fname = "clade_assignment_tmp_alignment.fasta"

    output = open(args.output, 'w')
    print('name\tclade\tparent clades', file=output)

    # break the sequences into chunks, align each to the reference, and assign clades one-by-one
    done = False
    while not done:
        # if not aligned, align
        if args.sequences:
            # generate a chunk with chunk-size sequences
            chunk = []
Ejemplo n.º 6
0
    group.add_argument("--alignment", help="*aligned* FASTA file of SARS-CoV-2 sequences relative to Wuhan-HU-1 with insertions removed")
    parser.add_argument("--output", type=str, default='clade_assignment.tsv', help="tsv file to write clade definitions to")
    parser.add_argument("--keep-temporary-files", action='store_true', help="don't clean up")
    parser.add_argument("--chunk-size", default=10, type=int, help="process this many sequences at once")
    parser.add_argument("--nthreads", default=1, type=int, help="Number of threads to use in alignment")
    args = parser.parse_args()

    refname = f"config/reference.gb"
    features = load_features(refname)
    if args.sequences:
        seqs = SeqIO.parse(args.sequences, 'fasta')
    else:
        alignment = SeqIO.parse(args.alignment, 'fasta')

    ref = SeqIO.read(refname, 'genbank')
    clade_designations = read_in_clade_definitions(f"config/clades.tsv")

    log_fname = "clade_assignment.log"
    in_fname = "clade_assignment_tmp.fasta"
    out_fname = "clade_assignment_tmp_alignment.fasta"


    output = open(args.output, 'w')
    print('name\tclade\tparent clades', file=output)

    # break the sequences into chunks, align each to the reference, and assign clades one-by-one
    done = False
    while not done:
        # if not aligned, align
        if args.sequences:
            # generate a chunk with chunk-size sequences
Ejemplo n.º 7
0
def test_read_in_clade_definitions_inheritance_from_self_error():
    """Loading the ``self_inherit_clades.tsv`` fixture must raise ValueError.

    NOTE(review): judging by its name, the fixture holds a clade that
    inherits from itself -- confirm against the fixture file.
    """
    fixture_path = "tests/data/clades/self_inherit_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)
Ejemplo n.º 8
0
def test_read_in_clade_definitions_inheritance_from_nonexistent_clade_error():
    """Loading ``nonexistent_clade_inheritance_clades.tsv`` must raise ValueError.

    NOTE(review): judging by its name, the fixture holds a clade that
    inherits from a clade that is never defined -- confirm against the
    fixture file.
    """
    fixture_path = "tests/data/clades/nonexistent_clade_inheritance_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)
Ejemplo n.º 9
0
def test_read_in_clade_definitions_multiple_inheritance_error():
    """Loading the ``multiple_inheritance_clades.tsv`` fixture must raise ValueError.

    NOTE(review): judging by its name, the fixture holds a clade that
    declares more than one parent -- confirm against the fixture file.
    """
    fixture_path = "tests/data/clades/multiple_inheritance_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)
Ejemplo n.º 10
0
def test_read_in_clade_definitions_inherit_cycle_error():
    """Loading the ``inherit_cycle_clades.tsv`` fixture must raise ValueError.

    NOTE(review): judging by its name, the fixture holds clades whose
    inheritance forms a cycle -- confirm against the fixture file.
    """
    fixture_path = "tests/data/clades/inherit_cycle_clades.tsv"
    with pytest.raises(ValueError):
        read_in_clade_definitions(fixture_path)