Example #1
0
def create():
    """Generate SidechainNet for a single CASP thinning.

    Takes no arguments. Every setting is read from the module-level ``args``
    object: ``proteinnet_in``, ``proteinnet_out``, ``training_set``, ``limit``,
    ``sidechainnet_out``, ``casp_version`` and ``regenerate_scdata``.

    The organized dataset is handed to ``save_data`` together with a path
    inside ``args.sidechainnet_out`` (file name produced by
    ``format_sidechainnet_path``), and that path is printed afterwards.
    """
    # First, parse raw proteinnet files into Python dictionaries for convenience
    pnids = parse_raw_proteinnet(args.proteinnet_in, args.proteinnet_out,
                                 args.training_set)
    # Limit the length of the list for debugging. Slicing with a limit of None
    # keeps every ID.
    pnids = pnids[:args.limit]

    # Using the ProteinNet IDs as a guide, download the relevant sidechain data.
    # The second element of the returned pair is not needed here, and
    # regenerate_scdata is passed by keyword, the same way create_all() does.
    sc_only_data, _ = download_sidechain_data(
        pnids,
        args.sidechainnet_out,
        args.casp_version,
        args.training_set,
        args.limit,
        args.proteinnet_in,
        regenerate_scdata=args.regenerate_scdata)

    # Finally, unify the sidechain data with ProteinNet
    sidechainnet_raw = combine_datasets(args.proteinnet_out, sc_only_data,
                                        args.training_set)

    sidechainnet_outfile = os.path.join(
        args.sidechainnet_out,
        format_sidechainnet_path(args.casp_version, args.training_set))
    sidechainnet = organize_data(sidechainnet_raw, args.proteinnet_out,
                                 args.casp_version, args.training_set)
    save_data(sidechainnet, sidechainnet_outfile)
    print(
        f"SidechainNet for CASP {args.casp_version} written to {sidechainnet_outfile}."
    )
Example #2
0
def create_all():
    """Generate all thinnings of a particular CASP dataset, starting with the largest.

    Takes no arguments. Settings are read from the module-level ``args``
    object: ``proteinnet_in``, ``proteinnet_out``, ``limit``,
    ``sidechainnet_out``, ``casp_version`` and ``regenerate_scdata``.

    Parsing, downloading and combining are done once, for the 100% thinning.
    The combined raw data is then passed to ``organize_data`` once per
    thinning (100, 95, 90, 70, 50, 30), and each result is saved to its own
    file inside ``args.sidechainnet_out``.
    """
    # The raw data is built once for this thinning and reused for all others.
    # NOTE(review): presumably the 100% set contains every smaller thinning -
    # confirm against the ProteinNet data layout.
    largest_thinning = 100
    thinnings = (largest_thinning, 95, 90, 70, 50, 30)

    # First, parse raw proteinnet files into Python dictionaries for convenience
    pnids = parse_raw_proteinnet(args.proteinnet_in, args.proteinnet_out,
                                 largest_thinning)
    # Limit the length of the list for debugging. Slicing with a limit of None
    # keeps every ID.
    pnids = pnids[:args.limit]

    # Using the ProteinNet IDs as a guide, download the relevant sidechain data.
    # The second element of the returned pair is not needed here.
    sc_only_data, _ = download_sidechain_data(
        pnids,
        args.sidechainnet_out,
        args.casp_version,
        largest_thinning,
        args.limit,
        args.proteinnet_in,
        regenerate_scdata=args.regenerate_scdata)

    # Finally, unify the sidechain data with ProteinNet
    sidechainnet_raw_100 = combine_datasets(args.proteinnet_out, sc_only_data,
                                            largest_thinning)

    # Organize and save one output file per thinning from the same raw data.
    for training_set in thinnings:
        sc_outfile = os.path.join(
            args.sidechainnet_out,
            format_sidechainnet_path(args.casp_version, training_set))
        sidechainnet = organize_data(sidechainnet_raw_100, args.proteinnet_out,
                                     args.casp_version, training_set)
        save_data(sidechainnet, sc_outfile)
        print(f"SidechainNet for CASP {args.casp_version} "
              f"({training_set}% thinning) written to {sc_outfile}.")