Exemple #1
0
def test_search_composite():
    """
    Check that combining two queries with ``|`` and ``&`` yields the
    union and the intersection of the individual search results.
    """
    # The organism name has to be a real scientific name: with a
    # misspelled value the exact match would presumably find nothing
    # and the assertions below would only compare empty sets
    query1 = rcsb.FieldQuery("rcsb_entity_host_organism.scientific_name",
                             exact_match="Homo sapiens")
    query2 = rcsb.FieldQuery("exptl.method", exact_match="SOLUTION NMR")
    ids_1 = set(rcsb.search(query1))
    ids_2 = set(rcsb.search(query2))
    ids_or = set(rcsb.search(query1 | query2))
    ids_and = set(rcsb.search(query1 & query2))

    assert ids_or == ids_1 | ids_2
    assert ids_and == ids_1 & ids_2
Exemple #2
0
def test_search():
    """
    Check that an ``"or"`` :class:`CompositeQuery` returns exactly the
    IDs that are found by at least one of its two sub-queries.
    """
    query1 = rcsb.ResolutionQuery(0.0, 0.8)
    query2 = rcsb.MolecularWeightQuery(0, 1000)
    ids_query1 = rcsb.search(query1)
    ids_query2 = rcsb.search(query2)
    ids_comp = sorted(
        rcsb.search(rcsb.CompositeQuery("or", [query1, query2]))
    )
    # The set union removes IDs found by both queries in linear time,
    # instead of scanning a growing list for every single ID
    expected_ids = sorted(set(ids_query1) | set(ids_query2))
    assert ids_comp == expected_ids
Exemple #3
0
def test_search_field(field, molecular_definition, params, ref_ids):
    """
    Run a :class:`FieldQuery` built from the given arguments and
    compare both the found IDs and the reported count with `ref_ids`.
    """
    field_query = rcsb.FieldQuery(field, molecular_definition, **params)
    found_ids = rcsb.search(field_query)
    found_count = rcsb.count(field_query)

    # The order of the returned IDs is not checked
    expected_ids = set(ref_ids)
    assert set(found_ids) == expected_ids
    assert found_count == len(ref_ids)
Exemple #4
0
def test_search_invalid():
    """
    Check that searching with a deliberately nonsensical query raises
    a :class:`RequestError`.
    """
    class InvalidQuery(rcsb.SimpleQuery):
        def __init__(self):
            super().__init__("InvalidQuery", "gibberish")
            self.add_param("foo", "bar")

    with pytest.raises(RequestError):
        # Only the raised exception matters, so the result is not kept
        rcsb.search(InvalidQuery())
Exemple #5
0
def test_simple_query_types(query_type, params, exp_ids):
    """
    Build a query of the given type and compare the search result with
    the expectation.

    If `exp_ids` is an integer, only the number of found IDs is
    checked, with a relative tolerance of 10 %.
    Otherwise the found IDs must be equal to `exp_ids`, ignoring the
    order.
    """
    query = query_type(**params)
    print("Query:")
    print(query)
    found_ids = rcsb.search(query)

    if not isinstance(exp_ids, int):
        assert set(found_ids) == set(exp_ids)
        return
    assert len(found_ids) == pytest.approx(exp_ids, rel=0.1)
Exemple #6
0
def test_search_sequence():
    """
    Search for sequences similar to the first sequence of ``1l2y`` and
    check that each hit reaches the requested minimum identity in an
    optimal alignment with the reference sequence.
    """
    IDENTITY_CUTOFF = 0.9
    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(ref_sequence,
                               "protein",
                               min_identity=IDENTITY_CUTOFF)
    test_ids = rcsb.search(query)

    # The substitution matrix is the same for every hit,
    # so it is created only once
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        # Only the first of the returned alignments is evaluated
        alignment = align.align_optimal(ref_sequence,
                                        test_sequence,
                                        matrix,
                                        terminal_penalty=False)[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
Exemple #7
0
print("\n".join(lines[:10] + ["..."]))

########################################################################
# In many cases you are not interested in a specific structure, but you
# want a set of structures that fits your desired criteria.
# For this purpose the *RCSB* search API can be used.
# First, you have to create a :class:`Query` object for the property
# you want to filter by.
# The :func:`search()` function takes the :class:`Query` and returns a
# list of PDB IDs, which itself can be used as input for
# :func:`fetch()`.
# Likewise, :func:`count()` is used to count the number of matching
# PDB IDs.

query = rcsb.BasicQuery("HCN1")
pdb_ids = rcsb.search(query)
print(pdb_ids)
print(rcsb.count(query))
files = rcsb.fetch(pdb_ids, "mmtf", gettempdir())

########################################################################
# This was a simple search for the occurrence of the search term in any
# field.
# You can also search for a value in a specific field with a
# :class:`FieldQuery`.
# A complete list of the available fields and their supported operators
# is documented
# `on this page <https://search.rcsb.org/search-attributes.html>`_.
# Query for 'lacA' gene
query1 = rcsb.FieldQuery("rcsb_entity_source_organism.rcsb_gene_name.value",
# One counter per year, filled in the loop below
em_count = np.zeros(len(years), dtype=int)
tot_count = np.zeros(len(years), dtype=int)
# For each year search the released PDB IDs
# and count them
for i, year in enumerate(years):
    # A query that comprises one year
    date_query = rcsb.DateQuery(datetime.date(year, 1, 1),
                                datetime.date(year, 12, 31),
                                event="release")
    xray_query = rcsb.MethodQuery("X-RAY")
    nmr_query = rcsb.MethodQuery("SOLUTION_NMR")
    em_query = rcsb.MethodQuery("ELECTRON MICROSCOPY")
    # Get the number of structures that were released in that year
    # AND were elucidated with the respective method
    xray_count[i], nmr_count[i], em_count[i] = [
        len(rcsb.search(rcsb.CompositeQuery("and", (date_query, q))))
        for q in [xray_query, nmr_query, em_query]
    ]
    # Get the total number of structures released in that year
    tot_count[i] = len(rcsb.search(date_query))

fig, ax = plt.subplots(figsize=(8.0, 5.0))
ax.set_title("PDB release statistics")
# Leave one year of margin on both sides of the plotted range
ax.set_xlim(years[0] - 1, years[-1] + 1)
# One tick per year, labels rotated by 45 degrees
ax.set_xticks(years)
ax.set_xticklabels([str(y) for y in years], rotation=45)
ax.set_xlabel("Year")
ax.set_ylabel("Released structures per year")
ax.bar(years, xray_count, color=biotite.colors["darkorange"], label="X-RAY")
ax.bar(years,
       nmr_count,
Exemple #9
0
# Download file in the fast and small binary MMTF format
file_path = rcsb.fetch("1l2y", "mmtf", biotite.temp_dir(), overwrite=True)

########################################################################
# In many cases you are not interested in a specific structure, but you
# want a set of structures that fits your desired criteria.
# For this purpose the *RCSB* SEARCH service can be interfaced.
#
# First, you have to create a :class:`Query` object for the property
# you want to filter by.
# The :func:`search()` function takes the :class:`Query` and returns a
# list of PDB IDs, which itself can be used as input for :func:`fetch()`.

query = rcsb.ResolutionQuery(0.0, 0.6)
pdb_ids = rcsb.search(query)
print(pdb_ids)
files = rcsb.fetch(pdb_ids, "mmtf", biotite.temp_dir())

########################################################################
# Not all query types of the SEARCH service are supported yet. But it is
# quite easy to implement the query type you need by inheriting from
# :class:`SimpleQuery`.
#
# Multiple :class:`SimpleQuery` objects can be 'and'/'or' combined using
# a :class:`CompositeQuery`.

query1 = rcsb.ResolutionQuery(0.0, 1.0)
query2 = rcsb.MolecularWeightQuery(10000, 100000)
composite = rcsb.CompositeQuery("and", [query1, query2])
Exemple #10
0
def test_search_empty():
    """
    Check that a molecular weight query for the range from 0 to 1
    gives an empty search result.
    """
    weight_query = rcsb.MolecularWeightQuery(0, 1)
    assert len(rcsb.search(weight_query)) == 0
# MMTF files are downloaded into a new directory in this path
# and the .tar archive is created here
# NOTE(review): this value looks like a placeholder -- presumably it
# has to be replaced by an existing directory before running the script
base_path = "path/to/directoy"


# A Query class for getting all available PDB IDs
class HoldingsQuery(rcsb.SimpleQuery):
    """
    A :class:`SimpleQuery` of the type ``"HoldingsQuery"``.

    Parameters
    ----------
    method : optional
        Value for the ``"experimentalMethod"`` query parameter.
    molecule_type : optional
        Value for the ``"moleculeType"`` query parameter.
    has_data : optional
        Accepted, but currently not used by this class.
    """

    def __init__(self, method="ignore", molecule_type="ignore", has_data=None):
        super().__init__("HoldingsQuery")
        # Parameters are added in the same order as they are listed here
        query_params = (
            ("experimentalMethod", method),
            ("moleculeType", molecule_type),
        )
        for param_name, param_value in query_params:
            self.add_param(param_name, param_value)


# Obtain all PDB IDs
all_id_query = HoldingsQuery()
pdb_ids = rcsb.search(all_id_query)

# Name for download directory: 'mmtf_' followed by the current date
# formatted as YYYYMMDD
now = datetime.datetime.now()
mmtf_dir = os.path.join(base_path,
                        f"mmtf_{now.year:04d}{now.month:02d}{now.day:02d}")
# Reuse the directory if it already exists
if not os.path.isdir(mmtf_dir):
    os.mkdir(mmtf_dir)

# Download all PDB IDs with parallelized HTTP requests
# NOTE(review): the futures returned by 'submit()' are not kept, so an
# exception raised in a single download would go unnoticed
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for pdb_id in pdb_ids:
        executor.submit(rcsb.fetch, pdb_id, "mmtf", mmtf_dir)

# Create .tar archive file from MMTF files in directory
with tarfile.open(mmtf_dir + ".tar", mode="w") as file:
Exemple #12
0
def test_search_basic():
    """
    Check that a :class:`BasicQuery` for the term ``tc5b`` finds
    exactly the entry ``1L2Y``.
    """
    basic_query = rcsb.BasicQuery("tc5b")

    found_ids = rcsb.search(basic_query)
    assert found_ids == ["1L2Y"]

    hit_count = rcsb.count(basic_query)
    assert hit_count == 1
Exemple #13
0
def test_search_invalid(field, params):
    """
    Check that both :func:`search()` and :func:`count()` raise a
    :class:`RequestError` whose message matches ``400`` for the given
    field query.
    """
    bad_query = rcsb.FieldQuery(field, **params)
    # Same expectation for both request functions, searched first
    for request in (rcsb.search, rcsb.count):
        with pytest.raises(RequestError, match="400"):
            request(bad_query)
Exemple #14
0
def test_search_empty():
    """
    Check that a query without any match gives an empty list of IDs
    and a count of zero.
    """
    no_match_query = rcsb.BasicQuery("This will not match any ID")

    found_ids = rcsb.search(no_match_query)
    assert found_ids == []

    hit_count = rcsb.count(no_match_query)
    assert hit_count == 0
Exemple #15
0
def test_search_return_type(return_type, expected):
    """
    Check that searching for ``tc5b`` with the given `return_type`
    gives the `expected` identifiers and the corresponding count.
    """
    basic_query = rcsb.BasicQuery("tc5b")

    found = rcsb.search(basic_query, return_type)
    assert found == expected

    n_found = rcsb.count(basic_query, return_type)
    assert n_found == len(expected)
Exemple #16
0
def test_search_structure():
    """
    Check that a :class:`StructureQuery` for chain ``A`` of ``1L2Y``
    has ``1L2Y`` itself among its results.
    """
    structure_query = rcsb.StructureQuery("1L2Y", chain="A")
    assert "1L2Y" in rcsb.search(structure_query)