def test_sequence_conversion():
    """
    Check conversion between FASTA files and sequence objects.

    Covers reading a nucleotide file, writing its sequences into a new
    file and reading them back, writing a single sequence, reading a
    protein file and the ``ValueError`` expected for ``invalid.fasta``.
    """
    # FASTA file -> nucleotide sequence
    nuc_file = fasta.FastaFile()
    nuc_file.read(os.path.join(data_dir, "nuc.fasta"))
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Sequences -> new FASTA file -> sequences must be lossless
    original_seqs = fasta.get_sequences(nuc_file)
    copy_file = fasta.FastaFile()
    fasta.set_sequences(copy_file, original_seqs)
    restored_seqs = fasta.get_sequences(copy_file)
    assert original_seqs == restored_seqs

    # A single sequence is accessible under the "sequence" header
    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    # FASTA file -> protein sequence
    prot_file = fasta.FastaFile()
    prot_file.read(os.path.join(data_dir, "prot.fasta"))
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    # Converting the content of the invalid file must fail
    invalid_file = fasta.FastaFile()
    invalid_file.read(os.path.join(data_dir, "invalid.fasta"))
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
def test_sequence_conversion():
    """
    Check conversion between FASTA files and sequence objects.

    Covers reading a nucleotide file, a write/read round trip of all of
    its sequences, writing a single sequence, reading a protein file
    (which is expected to emit a ``UserWarning``) and the ``ValueError``
    expected for ``invalid.fasta``.
    """
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    # Cannot compare dicts directly, since the original RNA sequence is
    # now guessed as protein sequence
    # Compare the headers first: pairing the values with zip() would
    # silently truncate and hide a dropped or renamed entry
    assert seq_dict.keys() == seq_dict2.keys()
    for header, original in seq_dict.items():
        assert str(original) == str(seq_dict2[header])

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    # Expect a warning for selenocysteine conversion
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
def test_fetch_single_file(as_file_like):
    """
    Download two protein entries into one FASTA file and count them.

    If `as_file_like` is true, ``None`` is passed as file name to
    ``entrez.fetch_single_file()``, otherwise a temporary ``fa`` file
    name is used.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_file("fa")

    fetched = entrez.fetch_single_file(
        ["1L2Y_A", "3O5R_A"], target, "protein", "fasta"
    )
    sequences = fasta.get_sequences(fasta.FastaFile.read(fetched))
    # One sequence per requested UID
    assert len(sequences) == 2
def test_fetch_single_file():
    """
    Download two protein entries into one temporary FASTA file and
    check that both sequences can be read from it.
    """
    uids = ["1L2Y_A", "3O5R_A"]
    downloaded = entrez.fetch_single_file(
        uids, biotite.temp_file("fa"), "protein", "fasta"
    )

    parsed = fasta.FastaFile()
    parsed.read(downloaded)
    # One sequence per requested UID
    assert len(fasta.get_sequences(parsed)) == 2
def test_access_high_level():
    """
    Check that ``fasta.get_sequences()`` maps every header of
    ``nuc.fasta`` to the expected nucleotide sequence.
    """
    nuc_file = fasta.FastaFile.read(
        os.path.join(data_dir("sequence"), "nuc.fasta")
    )
    sequences = fasta.get_sequences(nuc_file)

    # Header -> expected sequence; the second constructor argument is
    # only true for the entry that contains 'N' symbols
    expected = {
        "dna sequence": seq.NucleotideSequence("ACGCTACGT", False),
        "another dna sequence": seq.NucleotideSequence("A", False),
        "third dna sequence": seq.NucleotideSequence("ACGT", False),
        "rna sequence": seq.NucleotideSequence("ACGT", False),
        "ambiguous rna sequence": seq.NucleotideSequence("ACGTNN", True),
    }
    assert sequences == expected
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` via ``rcsb.fetch()`` in the given `format`
    and check that the result can be parsed.

    If `as_file_like` is true, ``None`` is passed as target, otherwise
    the temporary directory from ``biotite.temp_dir()`` is used.
    """
    target = biotite.temp_dir() if not as_file_like else None
    fetched = rcsb.fetch("1l2y", format, target, overwrite=True)

    if format == "fasta":
        fasta_file = fasta.FastaFile.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(fasta_file)) > 0
    elif format == "pdb":
        pdb.get_structure(pdb.PDBFile.read(fetched))
    elif format == "pdbx":
        pdbx.get_structure(pdbx.PDBxFile.read(fetched))
    elif format == "mmtf":
        mmtf.get_structure(mmtf.MMTFFile.read(fetched))
def test_fetch_single_file(as_file_like):
    """
    Download two protein entries into one FASTA file and count them.

    If `as_file_like` is true, ``None`` is passed as file name to
    ``entrez.fetch_single_file()``, otherwise the name of a named
    temporary ``.fa`` file is used as download target.
    """
    file = None
    if as_file_like:
        file_name = None
    else:
        file = tempfile.NamedTemporaryFile("r", suffix=".fa")
        file_name = file.name

    try:
        downloaded_file_name = entrez.fetch_single_file(
            ["1L2Y_A", "3O5R_A"], file_name, "protein", "fasta"
        )
        fasta_file = fasta.FastaFile.read(downloaded_file_name)
        prot_seqs = fasta.get_sequences(fasta_file)
        assert len(prot_seqs) == 2
    finally:
        # Close (and thereby remove) the temporary file even if the
        # download, the parsing or the assertion fails
        if file is not None:
            file.close()
def main():
    """
    Score variant sequences against a given structure.

    Command line arguments:

    - ``pdbfile``: input structure file path (.pdb or .cif)
    - ``seqfile``: FASTA file with the variant sequences
    - ``--outpath``: output CSV path
      (default ``output/sequence_scores.csv``)
    - ``--chain``: chain ID of the chain of interest (default ``A``)

    The native sequence of the chain is scored first and its log
    likelihood and perplexity are printed. Afterwards every sequence in
    the FASTA file is scored and one ``seqid,log_likelihood`` row per
    sequence is written to the output file, whose parent directory is
    created if necessary.
    """
    # Local import keeps this function self-contained
    import csv

    parser = argparse.ArgumentParser(
        description='Score sequences based on a given structure.'
    )
    parser.add_argument(
        'pdbfile', type=str,
        help='input filepath, either .pdb or .cif',
    )
    parser.add_argument(
        'seqfile', type=str,
        help='input filepath for variant sequences in a .fasta file',
    )
    parser.add_argument(
        '--outpath', type=str,
        help='output filepath for scores of variant sequences',
        default='output/sequence_scores.csv',
    )
    parser.add_argument(
        '--chain', type=str,
        help='chain id for the chain of interest', default='A',
    )
    args = parser.parse_args()

    model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
    coords, seq = esm.inverse_folding.util.load_coords(
        args.pdbfile, args.chain
    )
    print('Native sequence loaded from structure file:')
    print(seq)
    print('\n')

    ll, _ = esm.inverse_folding.util.score_sequence(
        model, alphabet, coords, seq)
    print('Native sequence')
    print(f'Log likelihood: {ll:.2f}')
    print(f'Perplexity: {np.exp(-ll):.2f}')

    print('\nScoring variant sequences from sequence file..\n')
    infile = FastaFile()
    infile.read(args.seqfile)
    seqs = get_sequences(infile)

    Path(args.outpath).parent.mkdir(parents=True, exist_ok=True)
    with open(args.outpath, 'w') as fout:
        # csv.writer quotes headers that contain commas or quotes, which
        # plain string concatenation would turn into malformed rows;
        # '\n' keeps the output identical for ordinary headers
        writer = csv.writer(fout, lineterminator='\n')
        writer.writerow(['seqid', 'log_likelihood'])
        # Distinct loop names, so the native `seq` and its `ll` are not
        # overwritten by the variants
        for header, variant in tqdm(seqs.items()):
            variant_ll, _ = esm.inverse_folding.util.score_sequence(
                model, alphabet, coords, str(variant))
            writer.writerow([header, str(variant_ll)])
    print(f'Results saved to {args.outpath}')
spacing=spacing) twin = axes.get_shared_x_axes().get_siblings(axes)[0] for ax in (axes, twin): ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"}) axes.get_figure().patch.set_facecolor("#181818") # Using cyclotide sequences as example query = (entrez.SimpleQuery("Cyclotide") & entrez.SimpleQuery("cter") & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ entrez.SimpleQuery("Precursor")) uids = entrez.search(query, "protein") fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(uids, None, "protein", "fasta")) sequence_dict = fasta.get_sequences(fasta_file) headers = list(sequence_dict.keys()) sequences = list(sequence_dict.values()) labels = [header[-1] for header in headers] # Perform a multiple sequence alignment matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) # Order alignment according to guide tree alignment = alignment[:, order.tolist()] labels = [labels[i] for i in order] # Visualize the alignment using the new alignment plotter fig = plt.figure(figsize=(8.0, 3.7)) ax = fig.add_subplot(111) plot_alignment_shapes(ax,