from collections import defaultdict

import numpy as np

# get_header and get_data are assumed to come from chemprop's data utilities
# (chemprop.data.utils in chemprop v1); Args is the script's Tap-based argument class.


def average_duplicates(args: Args):
    """Averages duplicate data points in a dataset."""
    print('Loading data')
    header = get_header(args.data_path)
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)
    print(f'Data size = {len(data):,}')

    # Map SMILES string to lists of targets, remembering first-seen order
    smiles_in_order = []
    smiles_to_targets = defaultdict(list)
    for smiles, targets in zip(data.smiles(), data.targets()):
        smiles_to_targets[smiles].append(targets)
        if len(smiles_to_targets[smiles]) == 1:
            smiles_in_order.append(smiles)

    # Find duplicates
    duplicate_count = 0
    stds = []
    new_data = []
    for smiles in smiles_in_order:
        all_targets = smiles_to_targets[smiles]
        duplicate_count += len(all_targets) - 1
        num_tasks = len(all_targets[0])

        # Collect the non-missing target values for each task
        targets_by_task = [[] for _ in range(num_tasks)]
        for task in range(num_tasks):
            for targets in all_targets:
                if targets[task] is not None:
                    targets_by_task[task].append(targets[task])

        stds.append([
            np.std(task_targets) if len(task_targets) > 0 else 0.0
            for task_targets in targets_by_task
        ])
        means = [
            np.mean(task_targets) if len(task_targets) > 0 else None
            for task_targets in targets_by_task
        ]
        new_data.append((smiles, means))

    print(f'Number of duplicates = {duplicate_count:,}')
    print(f'Duplicate standard deviation per task = '
          f'{", ".join(f"{std:.4e}" for std in np.mean(stds, axis=0))}')
    print(f'New data size = {len(new_data):,}')

    # Save new data
    with open(args.save_path, 'w') as f:
        f.write(','.join(header) + '\n')

        for smiles, avg_targets in new_data:
            f.write(smiles + ',' + ','.join(
                str(value) if value is not None else ''
                for value in avg_targets) + '\n')
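# A minimal usage sketch. ArgsSketch is hypothetical and only mirrors the four
# attributes average_duplicates actually reads; the real Args class lives in
# the surrounding script. chemprop's scripts use typed-argument-parser (Tap)
# in this style.
from typing import List, Optional

from tap import Tap


class ArgsSketch(Tap):
    data_path: str  # Path to the CSV file with (possibly duplicated) data points
    save_path: str  # Path where the averaged, deduplicated CSV is written
    smiles_column: Optional[str] = None  # Column with SMILES (None = first column)
    target_columns: Optional[List[str]] = None  # Targets (None = all non-SMILES columns)


if __name__ == '__main__':
    average_duplicates(ArgsSketch().parse_args())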
from typing import List

import pandas as pd

# ChempropModel, mcts, get_smiles, get_header, and the C_PUCT / MIN_ATOMS
# globals are defined in the surrounding interpret module.


def interpret(args: InterpretArgs) -> pd.DataFrame:
    """Finds rationale substructures via Monte Carlo tree search (mcts) and returns them as a DataFrame."""
    global C_PUCT, MIN_ATOMS

    chemprop_model = ChempropModel(args)

    def scoring_function(smiles: List[str]) -> List[float]:
        return chemprop_model(smiles)[:, args.property_id - 1]

    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    all_smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)
    header = get_header(path=args.data_path)

    property_name = header[args.property_id] if len(header) > args.property_id else 'score'
    print(f'smiles,{property_name},rationale,rationale_score')

    rat_smiles = []
    rat_scores = []
    for smiles in all_smiles:
        score = scoring_function([smiles])[0]
        if score > args.prop_delta:
            # Only search for a rationale if the molecule itself exceeds the property threshold
            rationales = mcts(smiles=smiles,
                              scoring_function=scoring_function,
                              n_rollout=args.rollout,
                              max_atoms=args.max_atoms,
                              prop_delta=args.prop_delta)
        else:
            rationales = []

        if len(rationales) == 0:
            rat_smiles.append('N/A')
            rat_scores.append(0)
            print(f'{smiles},{score:.3f},,')
        else:
            # Among the smallest rationales, report the one with the highest score
            min_size = min(len(x.atoms) for x in rationales)
            min_rationales = [x for x in rationales if len(x.atoms) == min_size]
            rats = sorted(min_rationales, key=lambda x: x.P, reverse=True)
            rat_smiles.append(rats[0].smiles)
            rat_scores.append(rats[0].P)
            print(f'{smiles},{score:.3f},{rats[0].smiles},{rats[0].P:.3f}')

    return pd.DataFrame(list(zip(all_smiles, rat_smiles, rat_scores)),
                        columns=['smiles', 'rationale_smiles', 'rationale_score'])
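# Hedged usage sketch: interpret() already prints CSV rows to stdout, but it
# also returns the rationales as a DataFrame, which can be persisted directly.
# The output filename is illustrative; InterpretArgs is assumed to be the
# Tap-based argument class this module parses (chemprop.args.InterpretArgs in
# chemprop v1).
if __name__ == '__main__':
    df = interpret(InterpretArgs().parse_args())
    df.to_csv('rationales.csv', index=False)  # columns: smiles, rationale_smiles, rationale_score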
def average_duplicates(args):
    print('Loading data')
    header = get_header(args.data_path)
    data = get_data(args.data_path)
    print('Data size = {:,}'.format(len(data)))

    # Map SMILES string to lists of targets
    smiles_to_targets = defaultdict(list)
    for smiles, targets in zip(data.smiles(), data.targets()):
        smiles_to_targets[smiles].append(targets)

    # Find duplicates
    duplicate_count = 0
    stds = []
    new_data = []
    for smiles, all_targets in smiles_to_targets.items():
        duplicate_count += len(all_targets) - 1
        num_tasks = len(all_targets[0])

        targets_by_task = [[] for _ in range(num_tasks)]
        for task in range(num_tasks):
            for targets in all_targets:
                if targets[task] is not None:
                    targets_by_task[task].append(targets[task])

        stds.append([
            np.std(task_targets) if len(task_targets) > 0 else 0.0
            for task_targets in targets_by_task
        ])
        means = [
            np.mean(task_targets) if len(task_targets) > 0 else None
            for task_targets in targets_by_task
        ]
        new_data.append((smiles, means))

    print('Number of duplicates = {:,}'.format(duplicate_count))
    print('Duplicate standard deviation per task = {}'.format(
        ', '.join('{:.4e}'.format(std) for std in np.mean(stds, axis=0))))
    print('New data size = {:,}'.format(len(new_data)))

    # Save new data
    with open(args.save_path, 'w') as f:
        f.write(','.join(header) + '\n')

        for smiles, avg_targets in new_data:
            f.write(smiles + ',' + ','.join(
                str(value) if value is not None else ''
                for value in avg_targets) + '\n')
def predict(): """Renders the predict page and makes predictions if the method is POST.""" if request.method == 'GET': return render_predict() # Get arguments ckpt_id = request.form['checkpointName'] if request.form['textSmiles'] != '': smiles = request.form['textSmiles'].split() elif request.form['drawSmiles'] != '': smiles = [request.form['drawSmiles']] else: # Upload data file with SMILES data = request.files['data'] data_name = secure_filename(data.filename) data_path = os.path.join(app.config['TEMP_FOLDER'], data_name) data.save(data_path) # Check if header is smiles possible_smiles = get_header(data_path)[0] smiles = [possible_smiles ] if Chem.MolFromSmiles(possible_smiles) is not None else [] # Get remaining smiles smiles.extend(get_smiles(data_path)) models = db.get_models(ckpt_id) model_paths = [ os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt') for model in models ] task_names = load_task_names(model_paths[0]) num_tasks = len(task_names) gpu = request.form.get('gpu') train_args = load_args(model_paths[0]) # Build arguments arguments = [ '--test_path', 'None', '--preds_path', os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME']), '--checkpoint_paths', *model_paths ] if gpu is not None: if gpu == 'None': arguments.append('--no_cuda') else: arguments += ['--gpu', gpu] # Handle additional features if train_args.features_path is not None: # TODO: make it possible to specify the features generator if trained using features_path arguments += [ '--features_generator', 'rdkit_2d_normalized', '--no_features_scaling' ] elif train_args.features_generator is not None: arguments += ['--features_generator', *train_args.features_generator] if not train_args.features_scaling: arguments.append('--no_features_scaling') # Parse arguments args = PredictArgs().parse_args(arguments) # Run predictions preds = make_predictions(args=args, smiles=smiles) if all(p is None for p in preds): return render_predict(errors=['All SMILES are invalid']) # Replace invalid smiles with message invalid_smiles_warning = 'Invalid SMILES String' preds = [ pred if pred is not None else [invalid_smiles_warning] * num_tasks for pred in preds ] return render_predict( predicted=True, smiles=smiles, num_smiles=min(10, len(smiles)), show_more=max(0, len(smiles) - 10), task_names=task_names, num_tasks=len(task_names), preds=preds, warnings=["List contains invalid SMILES strings"] if None in preds else None, errors=["No SMILES strings given"] if len(preds) == 0 else None)
def predict(): """Renders the predict page and makes predictions if the method is POST.""" if request.method == 'GET': return render_predict() # Get arguments ckpt_id = request.form['checkpointName'] if request.form['textSmiles'] != '': smiles = request.form['textSmiles'].split() elif request.form['drawSmiles'] != '': smiles = [request.form['drawSmiles']] else: print(" GOT HERE") # Upload data file with SMILES data = request.files['data'] data_name = secure_filename(data.filename) data_path = os.path.join(app.config['TEMP_FOLDER'], data_name) data.save(data_path) # Check if header is smiles possible_smiles = get_header(data_path)[0] smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else [] # Get remaining smiles smiles.extend(get_smiles(data_path)) models = db.get_models(ckpt_id) model_paths = [os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt') for model in models] task_names = load_task_names(model_paths[0]) num_tasks = len(task_names) gpu = request.form.get('gpu') # Create and modify args args = load_args(model_paths[0]) if args.features_path != None: args.features_generator = ["rdkit_2d_normalized"] args.features_path = None preds_path = os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME']) args.test_path = 'None' # TODO: Remove this hack to avoid assert crashing in modify_predict_args args.preds_path = preds_path args.checkpoint_paths = model_paths if gpu is not None: if gpu == 'None': args.no_cuda = True else: args.gpu = int(gpu) modify_predict_args(args) # Run predictions preds = make_predictions(args, smiles=smiles) if all(p is None for p in preds): return render_predict(errors=['All SMILES are invalid']) # Replace invalid smiles with message invalid_smiles_warning = "Invalid SMILES String" preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks for pred in preds] return render_predict(predicted=True, smiles=smiles, num_smiles=min(10, len(smiles)), show_more=max(0, len(smiles)-10), task_names=task_names, num_tasks=len(task_names), preds=preds, warnings=["List contains invalid SMILES strings"] if None in preds else None, errors=["No SMILES strings given"] if len(preds) == 0 else None)
if __name__ == "__main__": args = Args().parse_args() chemprop_model = ChempropModel(checkpoint_dir=args.checkpoint_dir, device=args.device) def scoring_function(smiles: List[str]) -> List[float]: return chemprop_model(smiles)[:, args.property_id - 1] C_PUCT = args.c_puct MIN_ATOMS = args.min_atoms all_smiles = get_smiles(path=args.data_path) header = get_header(path=args.data_path) property_name = header[ args.property_id] if len(header) > args.property_id else 'score' print(f'smiles,{property_name},rationale,rationale_score') for smiles in all_smiles: score = scoring_function([smiles])[0] if score > args.prop_delta: rationales = mcts(smiles=smiles, scoring_function=scoring_function, n_rollout=args.rollout, max_atoms=args.max_atoms, prop_delta=args.prop_delta) else: rationales = []
def predict():
    """Renders the predict page and makes predictions if the method is POST."""
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    checkpoint_name = request.form['checkpointName']

    if 'data' in request.files:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))
    elif request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    else:
        smiles = [request.form['drawSmiles']]

    checkpoint_path = os.path.join(app.config['CHECKPOINT_FOLDER'], checkpoint_name)
    task_names = load_task_names(checkpoint_path)
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')

    # Create and modify args
    parser = ArgumentParser()
    add_predict_args(parser)
    # Parse an empty argument list so the web server's own command-line flags
    # aren't consumed; all values are set programmatically below
    args = parser.parse_args([])

    preds_path = os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME'])
    args.test_path = 'None'  # TODO: Remove this hack to avoid assert crashing in modify_predict_args
    args.preds_path = preds_path
    args.checkpoint_path = checkpoint_path
    args.write_smiles = True

    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    modify_predict_args(args)

    # Run predictions
    preds = make_predictions(args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Remember whether any SMILES were invalid before overwriting the None entries
    contains_invalid_smiles = any(p is None for p in preds)

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks for pred in preds]

    return render_predict(predicted=True,
                          smiles=smiles,
                          num_smiles=min(10, len(smiles)),
                          show_more=max(0, len(smiles) - 10),
                          task_names=task_names,
                          num_tasks=num_tasks,
                          preds=preds,
                          warnings=['List contains invalid SMILES strings'] if contains_invalid_smiles else None,
                          errors=['No SMILES strings given'] if len(preds) == 0 else None)