Beispiel #1
0
def fetch_xray(xray_fp, force_download=False):
    """Fetch the list of PDB entries, filter it, and write a yaml file.

    Downloads ``pdb_entry_type.txt`` from the wwPDB FTP server. Each line
    of that file is split on whitespace into a PDB ID, a molecule type
    (protein, nucleic acid, or protein-nucleic acid complex) and the
    experimental method (diffraction or NMR). Only entries whose molecule
    type is 'prot' or 'prot-nuc' and whose method is 'diffraction' are
    kept; their IDs are upper-cased and written with ``write_yaml``.

    Reference:
    http://www.rcsb.org/pdb/static.do?p=general_information/about_pdb/summaries.html

    Args:
        xray_fp (Unicode): The destination yaml file to be written.
        force_download (bool): If true, download the file even if
            the path already exists locally.

    Returns:
        None

    Raises:
        AssertionError: If a download is required and ``xray_fp`` is not
            an absolute path, or if the file does not exist after
            ``write_yaml`` has run.

    """
    # Manually unit tested.
    if isfile(xray_fp) and not force_download:
        print(
            "Found local copy of \"{}.\" Using file:\n\t{}".format(
                basename(xray_fp),
                xray_fp
            )
        )
        return None

    assert os.path.isabs(xray_fp)

    remote_directory = '/pub/pdb/derived_data/'
    remote_file = 'pdb_entry_type.txt'
    domain = 'ftp.wwpdb.org'

    lines = []
    ftp = FTP(domain)
    try:
        ftp.login()
        ftp.cwd(remote_directory)
        ftp.retrlines('RETR {}'.format(remote_file), lines.append)
    finally:
        # Always release the socket, even if login or the transfer fails.
        ftp.close()

    wanted_molecules = ('prot', 'prot-nuc')
    xray = []
    for line in lines:
        columns = line.split()
        # Blank or truncated lines carry no usable record; skip them
        # instead of failing with an IndexError.
        if len(columns) < 3:
            continue

        pdb, p_type, x_type = columns[:3]
        if p_type in wanted_molecules and x_type == 'diffraction':
            xray.append(pdb.upper())

    write_yaml(xray, xray_fp)
    assert isfile(xray_fp)
    return None
Beispiel #2
0
def fetch_obsolete(
        obs_file_path,
        url='http://www.rcsb.org/pdb/rest/getObsolete',
        force_download=False
):
    """Fetch list of obsolete entries.

    Fetch list of obsolete entries, process, and write
    results to a yaml file. The response body is parsed as XML and the
    upper-cased ``structureId`` attribute of every child of the root
    element is collected.

    Args:
        obs_file_path (Unicode): The destination yaml file to be written.
        url (Unicode):  The url address of the data.
        force_download (bool): If true, download the file even if
            the path already exists locally.

    Returns:
        None

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a child element has no ``structureId`` attribute.

    """
    if isfile(obs_file_path) and not force_download:
        print(
            "Found local copy of \"{}.\" Using file:\n\t{}".format(
                basename(obs_file_path),
                obs_file_path
            )
        )
        return None

    # NOTE(review): the default url looks like RCSB's legacy REST
    # endpoint -- confirm it is still being served.
    obs_req = requests.get(url)
    try:
        # Fail with a clear HTTP error rather than handing an error page
        # to the XML parser.
        obs_req.raise_for_status()
        root = ETree.fromstring(obs_req.text)
        obs = [child.attrib['structureId'].upper() for child in root]
    finally:
        # Release the connection even when parsing fails.
        obs_req.close()

    write_yaml(obs, obs_file_path)
    return None
Beispiel #3
0
def uniprot_composite(dirs):
    """Build the UniProt-keyed composite DataFrame and write it to disk.

    Reads ``pdb_initial_composite_df.tsv`` from ``dirs.tsv_data``, passes
    it through ``create_uni_struct``, ``uni_pdb_validation`` and
    ``create_intervals``, and writes the result as tsv, json and yaml
    files at the paths supplied by ``_create_composite_file_paths``.
    Nothing is regenerated when ``_uni_composite_file_exists`` reports an
    existing composite in ``dirs.uni_data``.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths; the
            ``tsv_data`` and ``uni_data`` fields are used.

    Returns:
        None

    Raises:
        AssertionError: If the initial PDB composite tsv is missing, or
            if ``create_intervals`` does not return a DataFrame.

    """
    source_tsv = os.path.join(dirs.tsv_data, "pdb_initial_composite_df.tsv")
    assert os.path.isfile(source_tsv)

    uni_dir = dirs.uni_data
    out_paths = _create_composite_file_paths(
        uni_dir,
        _create_composite_file_names(),
    )
    tsv_out = out_paths["tsv_file"]
    yaml_out = out_paths["yaml_file"]
    json_out = out_paths["json_file"]

    # Guard clause: an existing composite is never overwritten.
    if _uni_composite_file_exists(uni_dir):
        already_done = (
            "A final uni_composite file already exists. Composite "
            "function complete. (Note: remove existing uni_composite "
            'files in the "{}" directory to have them '
            "regenerated."
        )
        print(already_done.format(uni_dir))
        return None

    pdb_frame = pd.read_csv(
        source_tsv,
        sep="\t",
        header=0,
        encoding="utf-8",
        keep_default_na=False,
        na_values=["NULL", "N/A"],
    )

    print("Creating the UniProt composite structure.")
    uni_frame = create_uni_struct(pdb_frame)
    print("Done creating UniProt composite structure.")

    print("Validating UniProt composite structure.")
    uni_pdb_validation(uni_frame, pdb_frame)
    print("Validation complete.")

    print("Assigning missing region designations.")
    uni_frame = create_intervals(pdb_frame, uni_frame)
    print("Done assigning missing regions.")

    assert isinstance(uni_frame, pd.DataFrame)
    uni_frame.to_csv(tsv_out, sep=create_delimiter("\t"), encoding="utf-8")
    uni_frame.to_json(json_out, force_ascii=False)

    # The yaml copy is produced from the json file that was just written.
    write_yaml(read_json(json_out), yaml_out)

    print("Done writing UniProt composite files:")
    for written_path in (tsv_out, yaml_out, json_out):
        print("\t{}".format(written_path))
    print("This is the final UniProt ID DataFrame.")

    return None