Code Example #1
File: fetch_uniprot.py  Project: shellydeforte/PDB
def _write_new_dataframe(self):
     delimiter = create_delimiter('\t')
     self.df.to_csv(self.pdb_seq_fp, sep=delimiter, encoding='utf-8')
     return None
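
The create_delimiter helper used above (and in every example below) is not shown on this page. A minimal sketch of what it might look like, assuming it does nothing more than validate a single-character separator before it is passed to DataFrame.to_csv, is:

def create_delimiter(delimiter):
    """Hypothetical sketch: validate and return a single-character
    separator for pandas' to_csv/read_csv sep argument."""
    if not isinstance(delimiter, str) or len(delimiter) != 1:
        raise ValueError("Delimiter must be a single character.")
    return delimiter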
Code Example #2
File: uni_composite.py  Project: shellydeforte/PDB
def uniprot_composite(dirs):
    """Creates final UniProt DataFrame.

    Create final UniProt DataFrame where the
    UniProt ID provides a unique key.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths.

    """
    pdb_initial_composite_fp = os.path.join(
        dirs.tsv_data,
        "pdb_initial_composite_df.tsv"
    )
    assert os.path.isfile(pdb_initial_composite_fp)

    uni_folder_path = dirs.uni_data
    file_names = _create_composite_file_names()
    paths = _create_composite_file_paths(uni_folder_path, file_names)

    uni_composite_tsv = paths["tsv_file"]
    uni_composite_yaml = paths["yaml_file"]
    uni_composite_json = paths["json_file"]

    if _uni_composite_file_exists(uni_folder_path):
        print(
            "A final uni_composite file already exists. Composite "
            "function complete. (Note: remove existing uni_composite "
            'files in the "{}" directory to have them '
            "regenerated.".format(uni_folder_path)
        )
        return None

    pdb_df = pd.read_csv(
        pdb_initial_composite_fp,
        sep="\t",
        header=0,
        encoding="utf-8",
        keep_default_na=False,
        na_values=["NULL", "N/A"]
    )

    print("Creating the UniProt composite structure.")
    uni_df = create_uni_struct(pdb_df)
    print("Done creating UniProt composite structure.")

    print("Validating UniProt composite structure.")
    uni_pdb_validation(uni_df, pdb_df)
    print("Validation complete.")

    print("Assigning missing region designations.")
    uni_df = create_intervals(pdb_df, uni_df)
    print("Done assigning missing regions.")

    assert isinstance(uni_df, pd.DataFrame)
    delimiter = create_delimiter("\t")
    uni_df.to_csv(uni_composite_tsv, sep=delimiter, encoding="utf-8")
    uni_df.to_json(uni_composite_json, force_ascii=False)

    json_data = read_json(uni_composite_json)
    write_yaml(json_data, uni_composite_yaml)

    print("Done writing UniProt composite files:")
    print("\t{}".format(uni_composite_tsv))
    print("\t{}".format(uni_composite_yaml))
    print("\t{}".format(uni_composite_json))
    print("This is the final UniProt ID DataFrame.")

    return None
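
A hedged usage sketch for the function above: only the directory attributes that these examples actually read (working, tsv_data, uni_data) are modeled, and the ProjectFolders definition below is a stand-in assumption rather than the project's own.

from collections import namedtuple

# Hypothetical stand-in for the project's ProjectFolders named tuple;
# only the attributes referenced in these examples are included.
ProjectFolders = namedtuple('ProjectFolders', ['working', 'tsv_data', 'uni_data'])

dirs = ProjectFolders(
    working='data/working',
    tsv_data='data/tsv',
    uni_data='data/uniprot'
)
uniprot_composite(dirs)  # writes the uni_composite .tsv/.json/.yaml files into uni_data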
Code Example #3
def final_filtering(dirs):
    """Create PDB composite.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths.

    Returns:
        None
    """
    pdb_initial_composite_fp = os.path.join(
        dirs.tsv_data,
        'pdb_initial_composite_df.tsv'
    )

    uni_filtered_path = os.path.join(
        dirs.working,
        'pdb_seq_uni_filtered.tsv'
    )
    if not os.path.exists(pdb_initial_composite_fp):
        df = pd.read_csv(
            uni_filtered_path,
            sep='\t',
            index_col=0,
            keep_default_na=False,
            na_values=['NULL', 'N/A']
        )
        ss_dis = fetch_ss_dis(dirs.working)
        print("Creating PDB composite.")
        df = create_pdb_composite(df, ss_dis, dirs.uni_data)
        print("\nPDB composite finished.")

        print(
            "Removing UniProt entries with < 2 PDB "
            "chains. Starting with {0} rows".format(len(df.index))
        )
        df = filter_single(df)
        print(
            "Entries removed. There are now {0} rows".format(len(df.index))
        )

        print("Writing final PDB chain DataFrame.")
        delimiter = create_delimiter('\t')
        df.to_csv(pdb_initial_composite_fp, sep=delimiter, encoding='utf-8')
        print(
            "Finished writing {}:\n"
            "\t{}\n"
            "This is the final PDB_CHAIN DataFrame.\n"
            "Note that only pdb_chain_uniprot.tsv provides a "
            "unique key".format(
                basename(pdb_initial_composite_fp),
                pdb_initial_composite_fp,
            )
        )
    else:
        print(
            "Found {}. Using local file:\n"
            "\t{}".format(
                basename(pdb_initial_composite_fp),
                pdb_initial_composite_fp
            )
        )
    print("")
    return None
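
filter_single is also not shown on this page. Judging from the messages around it ("Removing UniProt entries with < 2 PDB chains"), one plausible implementation is a pandas group-filter keyed on the UniProt ID column; the column name SP_PRIMARY below is assumed from the SIFTS pdb_chain_uniprot.tsv layout and may not match the project's actual code.

import pandas as pd

def filter_single(df, uni_col='SP_PRIMARY'):
    """Hypothetical sketch: keep only UniProt IDs that are covered
    by at least two rows (PDB chains) in the DataFrame."""
    # groupby().filter() drops every group whose predicate returns False.
    return df.groupby(uni_col).filter(lambda group: len(group) >= 2)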
Code Example #4
def initial_filtering(dirs):
    """Creates a dataframe from pdb_chain_uniprot.tsv.

    Perform initial filtering with pdb_chain_uniprot.tsv
    and supplementary files.

    Supplementary file processing steps:
        1. Removes the PDB_BEG, PDB_END columns.
        2. Converts all PDB IDs to upper case.
        3. Removes any rows where the PDB ID isn't in the xray list.
        4. Removes any rows where the PDB ID is in the obs list.
        5. Removes any rows where the RES_BEG or SP_BEG are < 1.
        6. Removes any rows where the length of the intervals doesn't match.
        7. Removes any rows where the length of the interval is <= 3.
        8. Removes any rows for PDB chains not in ss_dis.
        9. Removes UniProt IDs with < 2 PDB chains.
        10. Adds a column called 'PDB_SEQ' that has the section of the PDB
            chain corresponding to the interval in RES_BEG:RES_END.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths.

    Returns:
        None

    """
    # Return used_local for unit tests because of problems capturing stdout
    # with the logging instance.
    used_local = False
    pdb_seq_fp = os.path.join(dirs.working, 'pdb_seq.tsv')
    msg = getLogger('root')

    if not os.path.exists(pdb_seq_fp):
        obs_fp = os.path.join(dirs.working, 'obs.yaml')
        xray_fp = os.path.join(dirs.working, 'xray.yaml')
        chain_fp = os.path.join(dirs.tsv_data, 'pdb_chain_uniprot.tsv')

        msg.info('START: Initial filtering.')

        msg.debug("START: Fetch ss_dis.tsv.")
        ss_dis = fetch_ss_dis(dirs.working)
        msg.debug("COMPLETE: Fetch ss_dis.tsv.")

        msg.debug("START: Read obs.yaml.")
        obs = read_yaml(obs_fp)
        msg.debug("COMPLETE: Read obs.yaml.")

        msg.debug("START: Read xray.yaml.")
        xray = read_yaml(xray_fp)
        msg.debug("COMPLETE: Read xray.yaml.")

        msg.debug("START: Create initial DataFrame.")
        df = pd.read_csv(
            chain_fp,
            sep='\t',
            header=1,
            encoding='utf-8',
            keep_default_na=False,
            na_values=['NULL', 'N/A'])
        msg.debug("COMPLETE: Create initial DataFrame.")
        msg.debug("Initial DataFrame has {} rows.".format(len(df.index)))

        msg.debug("START: Remove rows where "
                  "the PDB ID is not in the xray list.")
        df = filter_pdb_chain_uniprot(df, obs, xray)
        msg.debug("COMPLETE: Remove rows where "
                  "the PDB ID is not in the xray list.")
        msg.debug("DataFrame now has {} rows.".format(len(df.index)))

        msg.debug("START: Remove entries not in ss_dis "
                  "and add the PDB peptide.")
        df = add_pdbseq_to_df(df, ss_dis)
        msg.debug("COMPLETE: Remove entries not in ss_dis "
                  "and add the PDB peptide.")
        msg.debug("DataFrame now has {} rows.".format(len(df.index)))

        msg.debug("START: Remove UniProt IDs with < 2 pdb chains.")
        df = filter_single(df)
        msg.debug("COMPLETE: Remove UniProt IDs with < 2 pdb chains.")
        msg.debug("DataFrame now has {} rows.".format(len(df.index)))

        msg.debug("START: Writing DataFrame to TSV file.")
        delimiter = create_delimiter('\t')
        df.to_csv(pdb_seq_fp, sep=delimiter, encoding='utf-8')
        msg.debug("COMPLETE: Writing DataFrame to TSV file.")
        msg.info(
            "Wrote {} to:\n\t{}".format(basename(pdb_seq_fp), pdb_seq_fp)
        )
        msg.info('COMPLETE: Initial filtering.')

    else:
        used_local = True
        msg.info(
            "Found and using local {filename}: \n"
            "\t{filepath}".format(
                filename=basename(pdb_seq_fp),
                filepath=pdb_seq_fp
            )
        )
        msg.info('COMPLETE: Initial filtering.')

    return used_local
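
initial_filtering logs through getLogger('root'), so a logging handler has to be configured before calling it. A minimal driver sketch, assuming the three functions on this page run roughly in the order initial filtering, final filtering, composite (the pdb_seq_uni_filtered.tsv read by final_filtering suggests an additional UniProt filtering step between the first two that is not shown here):

import logging

# Configure the root logger so the getLogger('root') messages are emitted.
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')

# Hypothetical driver; dirs is a ProjectFolders named tuple as sketched earlier.
initial_filtering(dirs)    # writes pdb_seq.tsv into dirs.working
final_filtering(dirs)      # writes pdb_initial_composite_df.tsv into dirs.tsv_data
uniprot_composite(dirs)    # writes the final uni_composite files into dirs.uni_data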