def create():
    """Build SidechainNet for a single CASP thinning and save it.

    Takes no parameters and returns nothing. All settings are read from the
    ``args`` object in the enclosing scope: ``proteinnet_in``,
    ``proteinnet_out``, ``sidechainnet_out``, ``casp_version``,
    ``training_set``, ``limit`` and ``regenerate_scdata``. The organized
    dataset is handed to ``save_data`` and a confirmation line is printed.
    """
    # Step 1: parse the raw ProteinNet files into Python dictionaries; the
    # result is the list of ProteinNet IDs that guides the download below.
    proteinnet_ids = parse_raw_proteinnet(
        args.proteinnet_in,
        args.proteinnet_out,
        args.training_set,
    )
    # Keep only the first ``args.limit`` entries (a debugging aid).
    proteinnet_ids = proteinnet_ids[:args.limit]

    # Step 2: download the sidechain data for those IDs. The second element
    # of the returned pair is not used here.
    sidechain_data, _ = download_sidechain_data(
        proteinnet_ids,
        args.sidechainnet_out,
        args.casp_version,
        args.training_set,
        args.limit,
        args.proteinnet_in,
        args.regenerate_scdata,
    )

    # Step 3: unify the sidechain data with ProteinNet.
    combined = combine_datasets(
        args.proteinnet_out,
        sidechain_data,
        args.training_set,
    )

    # Step 4: work out the destination, organize the combined data, save it.
    out_name = format_sidechainnet_path(args.casp_version, args.training_set)
    out_path = os.path.join(args.sidechainnet_out, out_name)
    organized = organize_data(
        combined,
        args.proteinnet_out,
        args.casp_version,
        args.training_set,
    )
    save_data(organized, out_path)

    print(f"SidechainNet for CASP {args.casp_version} written to {out_path}.")
def create_all():
    """Build and save every thinning of one CASP dataset, largest first.

    Takes no parameters and returns nothing. Settings are read from the
    ``args`` object in the enclosing scope: ``proteinnet_in``,
    ``proteinnet_out``, ``sidechainnet_out``, ``casp_version``, ``limit``
    and ``regenerate_scdata``. Parsing, downloading and combining are done
    once for the 100 thinning; each thinning (100, 95, 90, 70, 50, 30) is
    then organized from that combined data, passed to ``save_data`` and
    reported with a printed line.
    """
    thinnings = (100, 95, 90, 70, 50, 30)
    largest = thinnings[0]

    # Step 1: parse the raw ProteinNet files into Python dictionaries for
    # the largest thinning; the result is the list of ProteinNet IDs.
    proteinnet_ids = parse_raw_proteinnet(
        args.proteinnet_in,
        args.proteinnet_out,
        largest,
    )
    # Keep only the first ``args.limit`` entries (a debugging aid).
    proteinnet_ids = proteinnet_ids[:args.limit]

    # Step 2: download the sidechain data for those IDs. The second element
    # of the returned pair is not used here.
    sidechain_data, _ = download_sidechain_data(
        proteinnet_ids,
        args.sidechainnet_out,
        args.casp_version,
        largest,
        args.limit,
        args.proteinnet_in,
        regenerate_scdata=args.regenerate_scdata,
    )

    # Step 3: unify the sidechain data with ProteinNet once, for the largest
    # thinning; every thinning below is organized from this combined data.
    combined_largest = combine_datasets(
        args.proteinnet_out,
        sidechain_data,
        largest,
    )

    # Step 4: organize and save each thinning in turn.
    for thinning in thinnings:
        out_name = format_sidechainnet_path(args.casp_version, thinning)
        out_path = os.path.join(args.sidechainnet_out, out_name)
        organized = organize_data(
            combined_largest,
            args.proteinnet_out,
            args.casp_version,
            thinning,
        )
        save_data(organized, out_path)
        message = (
            f"SidechainNet for CASP {args.casp_version} "
            f"({thinning}% thinning) written to {out_path}."
        )
        print(message)