def test_search_composite():
    """
    Check that combining two field queries with ``|`` and ``&`` gives
    the union and the intersection, respectively, of the IDs returned
    by the individual queries.
    """
    # The organism name must be spelled out exactly, as the query uses
    # the 'exact_match' operator
    query1 = rcsb.FieldQuery(
        "rcsb_entity_host_organism.scientific_name",
        exact_match="Homo sapiens"
    )
    query2 = rcsb.FieldQuery(
        "exptl.method",
        exact_match="SOLUTION NMR"
    )
    ids_1 = set(rcsb.search(query1))
    ids_2 = set(rcsb.search(query2))
    ids_or = set(rcsb.search(query1 | query2))
    ids_and = set(rcsb.search(query1 & query2))

    assert ids_or == ids_1 | ids_2
    assert ids_and == ids_1 & ids_2
def test_search():
    """
    Check that an 'or' :class:`CompositeQuery` returns exactly the
    IDs that are returned by at least one of its two sub-queries.
    """
    query1 = rcsb.ResolutionQuery(0.0, 0.8)
    query2 = rcsb.MolecularWeightQuery(0, 1000)
    ids_query1 = rcsb.search(query1)
    ids_query2 = rcsb.search(query2)
    ids_comp = sorted(
        rcsb.search(rcsb.CompositeQuery("or", [query1, query2]))
    )

    # The set union drops IDs that are matched by both sub-queries,
    # so each expected ID appears exactly once in the reference list
    ids_union = sorted(set(ids_query1) | set(ids_query2))
    assert ids_comp == ids_union
def test_search_field(field, molecular_definition, params, ref_ids):
    """
    Build a :class:`FieldQuery` from the given arguments and check
    that both the searched IDs and the reported count agree with the
    reference IDs.
    """
    field_query = rcsb.FieldQuery(field, molecular_definition, **params)
    found_ids = rcsb.search(field_query)
    n_found = rcsb.count(field_query)

    # The order of the returned IDs is not relevant for the comparison
    expected_ids = set(ref_ids)
    assert set(found_ids) == expected_ids
    assert n_found == len(ref_ids)
def test_search_invalid():
    """
    Check that searching with a query type the service does not know
    raises a :class:`RequestError`.
    """
    class InvalidQuery(rcsb.SimpleQuery):
        # A query with a made-up query type and parameter
        def __init__(self):
            super().__init__("InvalidQuery", "gibberish")
            self.add_param("foo", "bar")

    # The return value is irrelevant, only the raised exception matters
    with pytest.raises(RequestError):
        rcsb.search(InvalidQuery())
def test_simple_query_types(query_type, params, exp_ids):
    """
    Instantiate the given query type with `params` and compare the
    search result with the expectation.

    If `exp_ids` is an integer, only the number of returned IDs is
    checked, with a relative tolerance of 10 %.
    Otherwise the returned IDs must be equal to `exp_ids`,
    irrespective of their order.
    """
    query = query_type(**params)
    # Show the query in the test output
    print("Query:")
    print(query)
    ids = rcsb.search(query)

    if not isinstance(exp_ids, int):
        assert set(ids) == set(exp_ids)
        return
    assert len(ids) == pytest.approx(exp_ids, rel=0.1)
def test_search_sequence():
    """
    Search for sequences similar to the first sequence of ``1l2y.cif``
    and check via an optimal alignment that each hit reaches at least
    the sequence identity requested in the query.
    """
    IDENTITY_CUTOFF = 0.9

    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(
        ref_sequence, "protein", min_identity=IDENTITY_CUTOFF
    )
    test_ids = rcsb.search(query)

    # The substitution matrix is the same for every hit,
    # so it is created only once
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        alignment = align.align_optimal(
            ref_sequence, test_sequence, matrix, terminal_penalty=False
        )[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
print("\n".join(lines[:10] + ["..."])) ######################################################################## # In many cases you are not interested in a specific structure, but you # want a set of structures that fits your desired criteria. # For this purpose the *RCSB* search API can be used. # At first you have to create a :class:`Query` object for the property you # want to filter. # The :func:`search()` method takes the :class:`Query` and returns a # list of PDB IDs, which itself can be used as input for # :func:`fetch()`. # Likewise, :func:`count()` is used to count the number of matching # PDB IDs. query = rcsb.BasicQuery("HCN1") pdb_ids = rcsb.search(query) print(pdb_ids) print(rcsb.count(query)) files = rcsb.fetch(pdb_ids, "mmtf", gettempdir()) ######################################################################## # This was a simple search for the occurrence of the search term in any # field. # You can also search for a value in a specific field with a # :class:`FieldQuery`. # A complete list of the available fields and their supported operators # is documented # `on this page <https://search.rcsb.org/search-attributes.html>`_. # Query for 'lacA' gene query1 = rcsb.FieldQuery("rcsb_entity_source_organism.rcsb_gene_name.value",
em_count = np.zeros(len(years), dtype=int) tot_count = np.zeros(len(years), dtype=int) # For each year fetch the list of released PDB IDs # and count the number for i, year in enumerate(years): # A query that comprises one year date_query = rcsb.DateQuery(datetime.date(year, 1, 1), datetime.date(year, 12, 31), event="release") xray_query = rcsb.MethodQuery("X-RAY") nmr_query = rcsb.MethodQuery("SOLUTION_NMR") em_query = rcsb.MethodQuery("ELECTRON MICROSCOPY") # Get the amount of structures, that were released in that year # AND were elucidated with the respective method xray_count[i], nmr_count[i], em_count[i] = [ len(rcsb.search(rcsb.CompositeQuery("and", (date_query, q)))) for q in [xray_query, nmr_query, em_query] ] # Get the total amount of structures released in that year tot_count[i] = len(rcsb.search(date_query)) fig, ax = plt.subplots(figsize=(8.0, 5.0)) ax.set_title("PDB release statistics") ax.set_xlim(years[0] - 1, years[-1] + 1) ax.set_xticks(years) ax.set_xticklabels([str(y) for y in years], rotation=45) ax.set_xlabel("Year") ax.set_ylabel("Released structures per year") ax.bar(years, xray_count, color=biotite.colors["darkorange"], label="X-RAY") ax.bar(years, nmr_count,
# Download file in the fast and small binary MMTF format file_path = rcsb.fetch("1l2y", "mmtf", biotite.temp_dir(), overwrite=True) ######################################################################## # In many cases you are not interested in a specific structure, but you # want a set of structures that fits your desired criteria. # For this purpose the *RCSB* SEARCH service can be interfaced. # # At first you have to create :class:`Query` object for the property you # want to filter. # The :func:`search()` method takes the :class:`Query` and returns a # list of PDB IDs, which itself can be used as input for :func:`fetch()`. query = rcsb.ResolutionQuery(0.0, 0.6) pdb_ids = rcsb.search(query) print(pdb_ids) files = rcsb.fetch(pdb_ids, "mmtf", biotite.temp_dir()) ######################################################################## # Not all query types of the SEARCH service are supported yet. But it is # quite easy to implement your needed query type by inheriting # :class:`SimpleQuery`. # # Multiple :class:`SimpleQuery` objects can be 'and'/'or' combined using # a :class:`CompositeQuery`. query1 = rcsb.ResolutionQuery(0.0, 1.0) query2 = rcsb.MolecularWeightQuery(10000, 100000) composite = rcsb.CompositeQuery("and", [query1, query2])
def test_search_empty():
    """
    Check that a molecular weight query for the range from 0 to 1
    gives an empty search result.
    """
    query = rcsb.MolecularWeightQuery(0, 1)
    matches = rcsb.search(query)
    assert len(matches) == 0
# MMTF files are downloaded into a new directory in this path # and the .tar archive is created here base_path = "path/to/directoy" # A Query class for getting all available PDB IDs class HoldingsQuery(rcsb.SimpleQuery): def __init__(self, method="ignore", molecule_type="ignore", has_data=None): super().__init__("HoldingsQuery") self.add_param("experimentalMethod", method) self.add_param("moleculeType", molecule_type) # Obtain all PDB IDs all_id_query = HoldingsQuery() pdb_ids = rcsb.search(all_id_query) # Name for download directory now = datetime.datetime.now() mmtf_dir = os.path.join(base_path, f"mmtf_{now.year:04d}{now.month:02d}{now.day:02d}") if not os.path.isdir(mmtf_dir): os.mkdir(mmtf_dir) # Download all PDB IDs with parallelized HTTP requests with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: for pdb_id in pdb_ids: executor.submit(rcsb.fetch, pdb_id, "mmtf", mmtf_dir) # Create .tar archive file from MMTF files in directory with tarfile.open(mmtf_dir + ".tar", mode="w") as file:
def test_search_basic():
    """
    Check that a basic query for the term ``tc5b`` finds exactly the
    entry ``1L2Y`` and that the count agrees with this.
    """
    query = rcsb.BasicQuery("tc5b")

    found_ids = rcsb.search(query)
    assert found_ids == ["1L2Y"]

    n_found = rcsb.count(query)
    assert n_found == 1
def test_search_invalid(field, params):
    """
    Check that both :func:`search()` and :func:`count()` raise a
    :class:`RequestError` whose message matches ``400``, if the
    field query built from the given arguments is invalid.
    """
    invalid_query = rcsb.FieldQuery(field, **params)
    # Searching is checked first, then counting
    for request in (rcsb.search, rcsb.count):
        with pytest.raises(RequestError, match="400"):
            request(invalid_query)
def test_search_empty():
    """
    Check that a basic query, whose search term matches no entry,
    gives an empty list of IDs and a count of zero.
    """
    query = rcsb.BasicQuery("This will not match any ID")

    found_ids = rcsb.search(query)
    assert found_ids == []

    n_found = rcsb.count(query)
    assert n_found == 0
def test_search_return_type(return_type, expected):
    """
    Check that a basic query for ``tc5b`` gives the expected
    identifiers for the given return type and that the count is equal
    to the number of expected identifiers.
    """
    query = rcsb.BasicQuery("tc5b")

    found = rcsb.search(query, return_type)
    assert found == expected

    n_found = rcsb.count(query, return_type)
    assert n_found == len(expected)
def test_search_structure():
    """
    Check that a structure query based on chain ``A`` of ``1L2Y``
    includes ``1L2Y`` itself in the search result.
    """
    structure_query = rcsb.StructureQuery("1L2Y", chain="A")
    found_ids = rcsb.search(structure_query)
    assert "1L2Y" in found_ids