Esempio n. 1
0
def test_sequence_conversion():
    """
    Check conversion between FASTA files and sequence objects.

    Covers reading a single nucleotide sequence, a write/read round
    trip of all sequences, writing a single sequence, reading a protein
    sequence and the ``ValueError`` expected for ``invalid.fasta``.
    """
    # Single nucleotide sequence taken from the file
    nuc_path = os.path.join(data_dir, "nuc.fasta")
    nuc_file = fasta.FastaFile()
    nuc_file.read(nuc_path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Storing all sequences in a fresh file and reading them back
    # must yield an equal dictionary
    original_seqs = fasta.get_sequences(nuc_file)
    copy_file = fasta.FastaFile()
    fasta.set_sequences(copy_file, original_seqs)
    restored_seqs = fasta.get_sequences(copy_file)
    assert original_seqs == restored_seqs

    # A single sequence is stored under the "sequence" header
    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    # Protein sequence taken from a file
    prot_path = os.path.join(data_dir, "prot.fasta")
    prot_file = fasta.FastaFile()
    prot_file.read(prot_path)
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    # The content of the invalid file must be rejected
    invalid_path = os.path.join(data_dir, "invalid.fasta")
    invalid_file = fasta.FastaFile()
    invalid_file.read(invalid_path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
Esempio n. 2
0
def test_sequence_conversion():
    """
    Check conversion between FASTA files and sequence objects.

    Covers reading a single nucleotide sequence, a write/read round
    trip of all sequences, writing a single sequence, reading a protein
    sequence (which is expected to emit a ``UserWarning``) and the
    ``ValueError`` expected for ``invalid.fasta``.
    """
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    # zip() stops at the shorter input, so a round trip that loses or
    # renames entries would go unnoticed without comparing the headers
    assert list(seq_dict.keys()) == list(seq_dict2.keys())
    # Cannot compare dicts directly, since the original RNA sequence is
    # now guessed as protein sequence
    for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()):
        assert str(seq1) == str(seq2)

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    # Expect a warning for selenocysteine conversion
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
Esempio n. 3
0
def test_fetch_single_file(as_file_like):
    """
    Fetch two protein entries into a single FASTA file and check that
    exactly two sequences can be read from the result.

    When `as_file_like` is true, ``None`` is passed as file name,
    otherwise a temporary ``fa`` file path is used.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_file("fa")

    fetched = entrez.fetch_single_file(
        ["1L2Y_A", "3O5R_A"], target, "protein", "fasta"
    )
    sequences = fasta.get_sequences(fasta.FastaFile.read(fetched))
    assert len(sequences) == 2
Esempio n. 4
0
def test_fetch_single_file():
    """
    Fetch two protein entries into a single temporary FASTA file and
    check that exactly two sequences can be read from it.
    """
    uids = ["1L2Y_A", "3O5R_A"]
    target = biotite.temp_file("fa")
    fetched = entrez.fetch_single_file(uids, target, "protein", "fasta")

    parsed = fasta.FastaFile()
    parsed.read(fetched)
    sequences = fasta.get_sequences(parsed)
    assert len(sequences) == 2
Esempio n. 5
0
def test_access_high_level():
    """
    Check that all sequences of ``nuc.fasta`` are returned as the
    expected mapping from header to nucleotide sequence.
    """
    nuc_path = os.path.join(data_dir("sequence"), "nuc.fasta")
    sequences = fasta.get_sequences(fasta.FastaFile.read(nuc_path))

    # (header, sequence string, second constructor argument)
    expected_entries = (
        ("dna sequence", "ACGCTACGT", False),
        ("another dna sequence", "A", False),
        ("third dna sequence", "ACGT", False),
        ("rna sequence", "ACGT", False),
        ("ambiguous rna sequence", "ACGTNN", True),
    )
    expected = {
        header: seq.NucleotideSequence(symbols, flag)
        for header, symbols, flag in expected_entries
    }
    assert sequences == expected
Esempio n. 6
0
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` in the given `format` and check that the
    result can be parsed by the matching file class.

    When `as_file_like` is true, ``None`` is passed as target,
    otherwise the temporary directory is used.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target, overwrite=True)

    if format == "fasta":
        parsed = fasta.FastaFile.read(fetched)
        # Test if the file contains any sequences
        n_sequences = len(fasta.get_sequences(parsed))
        assert n_sequences > 0
    elif format == "mmtf":
        parsed = mmtf.MMTFFile.read(fetched)
        mmtf.get_structure(parsed)
    elif format == "pdbx":
        parsed = pdbx.PDBxFile.read(fetched)
        pdbx.get_structure(parsed)
    elif format == "pdb":
        parsed = pdb.PDBFile.read(fetched)
        pdb.get_structure(parsed)
Esempio n. 7
0
def test_fetch_single_file(as_file_like):
    """
    Fetch two protein entries into a single FASTA file and check that
    exactly two sequences can be read from the result.

    When `as_file_like` is true, ``None`` is passed as file name,
    otherwise the name of a named temporary ``.fa`` file is used.
    """
    if as_file_like:
        file = None
    else:
        file = tempfile.NamedTemporaryFile("r", suffix=".fa")

    # Close the temporary file even if the download or an assertion
    # fails, so a failing test does not leak the open file
    try:
        file_name = None if file is None else file.name
        downloaded_file_name = entrez.fetch_single_file(
            ["1L2Y_A", "3O5R_A"], file_name, "protein", "fasta"
        )
        fasta_file = fasta.FastaFile.read(downloaded_file_name)
        prot_seqs = fasta.get_sequences(fasta_file)
        assert len(prot_seqs) == 2
    finally:
        if file is not None:
            file.close()
Esempio n. 8
0
def main():
    """
    Score variant sequences against a given structure.

    Parses the command line, loads the coordinates and native sequence
    of the selected chain, prints log likelihood and perplexity of the
    native sequence and writes the log likelihood of every sequence in
    the given FASTA file to a CSV file with the columns ``seqid`` and
    ``log_likelihood``.
    """
    # Local import: only this function needs the CSV writer
    import csv

    parser = argparse.ArgumentParser(
            description='Score sequences based on a given structure.'
    )
    parser.add_argument(
            'pdbfile', type=str,
            help='input filepath, either .pdb or .cif',
    )
    parser.add_argument(
            'seqfile', type=str,
            help='input filepath for variant sequences in a .fasta file',
    )
    parser.add_argument(
            '--outpath', type=str,
            help='output filepath for scores of variant sequences',
            default='output/sequence_scores.csv',
    )
    parser.add_argument(
            '--chain', type=str,
            help='chain id for the chain of interest', default='A',
    )
    args = parser.parse_args()

    model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
    coords, seq = esm.inverse_folding.util.load_coords(args.pdbfile, args.chain)
    print('Native sequence loaded from structure file:')
    print(seq)
    print('\n')

    ll, _ = esm.inverse_folding.util.score_sequence(
            model, alphabet, coords, seq)
    print('Native sequence')
    print(f'Log likelihood: {ll:.2f}')
    print(f'Perplexity: {np.exp(-ll):.2f}')

    print('\nScoring variant sequences from sequence file..\n')
    infile = FastaFile()
    infile.read(args.seqfile)
    seqs = get_sequences(infile)
    Path(args.outpath).parent.mkdir(parents=True, exist_ok=True)
    with open(args.outpath, 'w') as fout:
        # FASTA headers may contain commas; the CSV writer quotes such
        # fields instead of silently producing extra columns.
        # lineterminator='\n' keeps the output identical to the former
        # hand-written rows for headers without special characters.
        writer = csv.writer(fout, lineterminator='\n')
        writer.writerow(['seqid', 'log_likelihood'])
        # Distinct loop names avoid overwriting the native sequence
        # and its score
        for header, variant_seq in tqdm(seqs.items()):
            variant_ll, _ = esm.inverse_folding.util.score_sequence(
                    model, alphabet, coords, str(variant_seq))
            writer.writerow([header, str(variant_ll)])
    print(f'Results saved to {args.outpath}')
                            spacing=spacing)

    twin = axes.get_shared_x_axes().get_siblings(axes)[0]
    for ax in (axes, twin):
        ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"})
    axes.get_figure().patch.set_facecolor("#181818")


# Using cyclotide sequences as example
# NOTE: in Python '&' binds tighter than '^', so the three '&'-combined
# queries are grouped first and the "Precursor" query is applied to that
# group via '^' (presumably an exclusion - confirm against the query API)
query = (entrez.SimpleQuery("Cyclotide") & entrez.SimpleQuery("cter")
         & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties")
         ^ entrez.SimpleQuery("Precursor"))
uids = entrez.search(query, "protein")
# None is passed as file name; presumably the download is then returned
# as a file-like object instead of being written to disk - TODO confirm
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(uids, None, "protein", "fasta"))
sequence_dict = fasta.get_sequences(fasta_file)
headers = list(sequence_dict.keys())
sequences = list(sequence_dict.values())
# Each sequence is labeled with the last character of its header
labels = [header[-1] for header in headers]

# Perform a multiple sequence alignment
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Only the alignment and the sequence order are used,
# the two remaining return values are discarded
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]
# Reorder the labels in the same way, so they still match the sequences
labels = [labels[i] for i in order]

# Visualize the alignment using the new alignment plotter
fig = plt.figure(figsize=(8.0, 3.7))
ax = fig.add_subplot(111)
plot_alignment_shapes(ax,