def encode_allele_datasets(
        allele_datasets,
        max_ic50,
        binary_encoding=False):
    """
    Parameters
    ----------
    allele_datasets : dict
        Dictionary mapping allele names to AlleleDataset named tuples
        with fields "X" and "ic50"

    max_ic50 : float
        Largest IC50 value predictor should return

    binary_encoding : bool (default = False)
        If True, use a binary 1-of-k encoding of amino acids, otherwise
        use integer indices (for a learned vector embedding).

    Returns three dictionaries:
        - mapping from allele name to X (features)
        - mapping from allele name to Y_log_ic50 (continuous outputs)
        - mapping from allele name to Y_ic50 (raw IC50 values)
    """
    X_dict = OrderedDict()
    Y_log_ic50_dict = OrderedDict()
    ic50_dict = OrderedDict()
    for (allele_name, dataset) in allele_datasets.items():
        allele_name = normalize_allele_name(allele_name)
        (X, Y_log_ic50, Y_ic50) = encode_allele_dataset(
            dataset,
            max_ic50=max_ic50,
            binary_encoding=binary_encoding)
        X_dict[allele_name] = X
        Y_log_ic50_dict[allele_name] = Y_log_ic50
        ic50_dict[allele_name] = Y_ic50
    return (X_dict, Y_log_ic50_dict, ic50_dict)
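# A minimal standalone sketch (not the library implementation) of the
# log-transform that Y_log_ic50 is assumed to use here: IC50 values are mapped
# into [0, 1] via 1 - log(ic50) / log(max_ic50), so that max_ic50 maps to 0
# and 1 nM maps to 1. The helper name is hypothetical.
import numpy as np

def log_transform_ic50_sketch(ic50, max_ic50=50000.0):
    """Hypothetical helper illustrating the continuous regression target."""
    y = 1.0 - np.log(ic50) / np.log(max_ic50)
    return np.clip(y, 0.0, 1.0)

# Example: strong binders (low IC50) get targets near 1, weak binders near 0.
# log_transform_ic50_sketch(np.array([1.0, 500.0, 50000.0]))
# -> approximately [1.0, 0.43, 0.0]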
def run():
    args = parser.parse_args(sys.argv[1:])
    print(args)

    total = 0
    order = []
    name_to_record = {}
    for fasta in args.fastas:
        reader = Bio.SeqIO.parse(fasta, "fasta")
        for record in reader:
            total += 1
            if len(record.seq) < 50:
                print("-- Skipping '%s', sequence too short" % (
                    record.description,))
                continue
            parts = record.description.split()
            candidate_strings = [record.description]
            if len(parts) > 1:
                candidate_strings.extend([parts[1], " ".join(parts[1:])])
            name = None
            for candidate_string in candidate_strings:
                name = normalize_allele_name(
                    candidate_string, raise_on_error=False)
                if name is not None:
                    break
            if name is None:
                print("-- Skipping '%s', could not parse allele name" % (
                    record.description,))
                continue
            print("Parsed '%s' as %s" % (record.description, name))
            record.description = name + " " + record.description
            if name in name_to_record:
                old_record = name_to_record[name]
                old_sequence = old_record.seq
                if len(old_sequence) < len(record.seq):
                    print("-- Replacing old record (%d aa) with new (%d aa)" % (
                        len(old_record.seq), len(record.seq)))
                    name_to_record[name] = record
                else:
                    print("-- Skipping, already seen")
            else:
                order.append(name)
                name_to_record[name] = record

    records = [name_to_record[name] for name in order]
    with open(args.out, "w") as fd:
        Bio.SeqIO.write(records, fd, "fasta")
    print("Wrote %d / %d sequences: %s" % (len(records), total, args.out))
def filter_alleles(allele_datasets, min_samples_per_allele=5):
    for (allele_name, dataset) in sorted(
            allele_datasets.items(), key=lambda pair: pair[0]):
        # Want alleles to be gene name + 4 digits, e.g. C0401
        if allele_name.isdigit() or len(allele_name) < 5:
            print("Skipping allele %s" % (allele_name,))
            continue
        allele_name = normalize_allele_name(allele_name)
        ic50_allele = dataset.ic50
        n_samples_allele = len(ic50_allele)
        if n_samples_allele < min_samples_per_allele:
            print("Skipping allele %s due to too few samples: %d" % (
                allele_name, n_samples_allele))
            continue
        binders = ic50_allele <= 500
        if binders.all():
            print("No negative examples for %s" % allele_name)
            continue
        if not binders.any():
            print("No positive examples for %s" % allele_name)
            continue
        yield (allele_name, dataset)
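# Standalone illustration of the filtering criteria above (a sketch, not part
# of the module): an allele is kept only if it has enough measurements and
# contains both binders (IC50 <= 500 nM) and non-binders.
import numpy as np

ic50_example = np.array([25.0, 480.0, 5000.0, 20000.0, 310.0])
binders_example = ic50_example <= 500
keep_example = (
    len(ic50_example) >= 5 and
    binders_example.any() and       # at least one positive example
    not binders_example.all())      # at least one negative example
# keep_example -> True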
def normalize_allele_name_or_return_unknown(s):
    if s is numpy.nan:
        return "UNKNOWN"
    return normalize_allele_name(
        s,
        raise_on_error=False,
        default_value="UNKNOWN")
def run(argv=sys.argv[1:]):
    global GLOBAL_DATA

    # On SIGUSR1, print stack trace
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    configure_logging()

    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    alleles = [
        normalize_allele_name(a, raise_on_error=False) for a in args.allele
    ]
    n_bad_alleles = sum([a is None for a in alleles])
    if n_bad_alleles > 0:
        print("Dropping %d bad alleles" % n_bad_alleles)

    alleles = numpy.array(sorted({a for a in alleles if a}))

    peptides = pandas.read_csv(
        args.input_peptides, nrows=args.max_peptides).peptide.drop_duplicates()
    print("Filtering to valid peptides. Starting at: ", len(peptides))
    peptides = peptides[peptides.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")]
    print("Filtered to: ", len(peptides))
    peptides = peptides.unique()
    num_peptides = len(peptides)

    print("Predictions for %d alleles x %d peptides." % (
        len(alleles), num_peptides))

    if not os.path.exists(args.out):
        print("Creating", args.out)
        os.mkdir(args.out)

    GLOBAL_DATA["predictor"] = args.predictor
    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["cols"] = PREDICTOR_TO_COLS[args.predictor]

    # Write peptide and allele lists to out dir.
    out_peptides = os.path.abspath(os.path.join(args.out, "peptides.csv"))
    pandas.DataFrame({"peptide": peptides}).to_csv(out_peptides, index=False)
    print("Wrote: ", out_peptides)

    manifest_df = []
    for allele in alleles:
        for col in PREDICTOR_TO_COLS[args.predictor]:
            manifest_df.append((allele, col))
    manifest_df = pandas.DataFrame(manifest_df, columns=["allele", "kind"])
    manifest_df["col"] = (manifest_df.allele + " " + manifest_df.kind)
    manifest_df["path"] = manifest_df.col.map(
        lambda s: s.replace("*", "").replace(" ", ".")) + ".npz"

    out_manifest = os.path.abspath(os.path.join(args.out, "alleles.csv"))
    manifest_df.to_csv(out_manifest, index=False)
    col_to_filename = manifest_df.set_index("col").path.map(
        lambda s: os.path.abspath(os.path.join(args.out, s)))
    print("Wrote: ", out_manifest)

    result_df = pandas.DataFrame(
        index=peptides,
        columns=manifest_df.col.values,
        dtype=args.result_dtype)
    result_df[:] = numpy.nan

    if args.reuse_predictions:
        # Allocating this here to hit any memory errors as early as possible.
        is_null_matrix = numpy.ones(
            shape=(result_df.shape[0], len(alleles)), dtype="int8")

        for dirname in args.reuse_predictions:
            if not dirname:
                continue  # ignore empty strings
            if os.path.exists(dirname):
                print("Loading predictions", dirname)
                result_df = load_results(
                    dirname, result_df, dtype=args.result_dtype)
            else:
                print("WARNING: skipping because does not exist", dirname)

        # We rerun any alleles that have nulls for any kind of values
        # (e.g. affinity, percentile rank, elution score).
        for (i, allele) in enumerate(alleles):
            sub_df = manifest_df.loc[manifest_df.allele == allele]
            is_null_matrix[:, i] = result_df[
                sub_df.col.values].isnull().any(axis=1)
        print("Fraction null", is_null_matrix.mean())

        print("Grouping peptides by alleles")
        allele_indices_to_peptides = collections.defaultdict(list)
        for (i, peptide) in tqdm.tqdm(enumerate(peptides), total=len(peptides)):
            (allele_indices,) = numpy.where(is_null_matrix[i])
            if len(allele_indices) > 0:
                allele_indices_to_peptides[tuple(allele_indices)].append(
                    peptide)

        del is_null_matrix

        work_items = []
        print("Assigning peptides to work items.")
        for (indices, block_peptides) in allele_indices_to_peptides.items():
            num_chunks = int(math.ceil(len(block_peptides) / args.chunk_size))
            peptide_chunks = numpy.array_split(block_peptides, num_chunks)
            for chunk_peptides in peptide_chunks:
                work_items.append({
                    'alleles': alleles[list(indices)],
                    'peptides': chunk_peptides,
                })
    else:
        # Same number of chunks for all alleles
        num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
        print("Splitting peptides into %d chunks" % num_chunks)
        peptide_chunks = numpy.array_split(peptides, num_chunks)

        work_items = []
        for chunk_peptides in peptide_chunks:
            work_item = {
                'alleles': alleles,
                'peptides': chunk_peptides,
            }
            work_items.append(work_item)

    print("Work items: ", len(work_items))

    for (i, work_item) in enumerate(work_items):
        work_item["work_item_num"] = i

    # Combine work items to form tasks.
    tasks = []
    peptides_in_last_task = None
    # We sort work_items to put small items first so they get combined.
    for work_item in sorted(work_items, key=lambda d: len(d['peptides'])):
        if peptides_in_last_task is not None and (
                len(work_item['peptides']) + peptides_in_last_task <
                args.chunk_size):
            # Add to last task.
            tasks[-1]['work_item_dicts'].append(work_item)
            peptides_in_last_task += len(work_item['peptides'])
        else:
            # New task
            tasks.append({'work_item_dicts': [work_item]})
            peptides_in_last_task = len(work_item['peptides'])

    print("Collected %d work items into %d tasks" % (
        len(work_items), len(tasks)))

    if args.predictor == "mhcflurry":
        do_predictions_function = do_predictions_mhcflurry
    else:
        do_predictions_function = do_predictions_mhctools

    worker_pool = None
    start = time.time()
    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (do_predictions_function(**task) for task in tasks)
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_predictions_function,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=True)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_predictions_function),
            tasks,
            chunksize=1)

    allele_to_chunk_index_to_predictions = {}
    for allele in alleles:
        allele_to_chunk_index_to_predictions[allele] = {}

    def write_col(col):
        out_path = os.path.join(args.out, col_to_filename[col])
        numpy.savez(out_path, result_df[col].values)
        print(
            "Wrote [%f%% null]:" % (result_df[col].isnull().mean() * 100.0),
            out_path)

    print("Writing all columns.")
    last_write_time_per_column = {}
    for col in result_df.columns:
        write_col(col)
        last_write_time_per_column[col] = time.time()
    print("Done writing all columns. Reading results.")
    for worker_results in tqdm.tqdm(results, total=len(work_items)):
        for (work_item_num, col_to_predictions) in worker_results:
            for (col, predictions) in col_to_predictions.items():
                result_df.loc[
                    work_items[work_item_num]['peptides'], col
                ] = predictions
                if time.time() - last_write_time_per_column[col] > 180:
                    write_col(col)
                    last_write_time_per_column[col] = time.time()

    print("Done processing. Final write for each column.")
    for col in result_df.columns:
        write_col(col)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()

    prediction_time = time.time() - start
    print("Done generating predictions in %0.2f min." % (
        prediction_time / 60.0))
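# Standalone sketch (hypothetical names) of the greedy combining strategy used
# above: work items are sorted smallest-first and merged into the current task
# until adding the next item would reach the chunk size.
def combine_work_items_sketch(work_items, chunk_size):
    """Group dicts with a 'peptides' list into tasks of bounded total size."""
    tasks = []
    peptides_in_last_task = None
    for item in sorted(work_items, key=lambda d: len(d["peptides"])):
        n = len(item["peptides"])
        if peptides_in_last_task is not None and (
                n + peptides_in_last_task < chunk_size):
            tasks[-1]["work_item_dicts"].append(item)
            peptides_in_last_task += n
        else:
            tasks.append({"work_item_dicts": [item]})
            peptides_in_last_task = n
    return tasks

# Example: items of size 2, 3, 4 merge into one task (total 9 < 10) and the
# size-9 item starts a second task.
# combine_work_items_sketch(
#     [{"peptides": list(range(k))} for k in (2, 3, 4, 9)], chunk_size=10)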
def normalize_allele_name_or_return_unknown(s):
    return normalize_allele_name(
        s,
        raise_on_error=False,
        default_value="UNKNOWN")
    if imputer is None:
        imputed_dataset = Dataset.create_empty()
    else:
        imputed_dataset = dataset.impute_missing_values(imputer)

    for allele_name in alleles:
        allele_dataset = dataset.get_allele(allele_name)
        n_allele = len(allele_dataset)

        if allele_name in imputed_dataset.unique_alleles():
            imputed_dataset_allele = imputed_dataset.get_allele(allele_name)
        else:
            imputed_dataset_allele = Dataset.create_empty()

        # Normalize the allele name and skip entries that are just numeric
        # identifiers rather than real allele names.
        allele_name = normalize_allele_name(allele_name)
        if allele_name.isdigit():
            print("Skipping allele %s" % (allele_name,))
            continue

        print("\n=== Training predictor for %s: %d samples" % (
            allele_name, n_allele))

        model = predictor_from_args(args, allele_name)

        json_filename = allele_name + ".json"
        json_path = join(args.output_dir, json_filename)

        hdf_filename = allele_name + ".hdf"
        hdf_path = join(args.output_dir, hdf_filename)
def leave_out_allele_cross_validation(
        model,
        allele_datasets,
        max_ic50,
        binary_encoding=False,
        n_pretrain_epochs=0,
        n_training_epochs=100,
        min_samples_per_allele=5,
        cv_folds=5,
        minibatch_size=128):
    """
    Fit the model for every allele in the dataset and return a DataFrame
    with the following columns:
        allele_name
        dataset_size
        auc_mean, auc_median, auc_std, auc_min, auc_max
        accuracy_mean, accuracy_median, accuracy_std, accuracy_min, accuracy_max
        f1_mean, f1_median, f1_std, f1_min, f1_max
    """
    scores = ScoreCollection()
    X_dict, Y_log_ic50_dict, ic50_dict = encode_allele_datasets(
        allele_datasets=allele_datasets,
        max_ic50=max_ic50,
        binary_encoding=binary_encoding)
    initial_weights = [w.copy() for w in model.get_weights()]
    for allele_name, dataset in filter_alleles(
            allele_datasets, min_samples_per_allele=min_samples_per_allele):
        # Reset the model to its initial weights before each allele.
        model.set_weights(initial_weights)
        X_allele = X_dict[allele_name]
        Y_allele = Y_log_ic50_dict[allele_name]
        ic50_allele = ic50_dict[allele_name]
        if n_pretrain_epochs > 0:
            # Pre-train on data from all other alleles before cross-validating
            # on the held-out allele.
            X_other_alleles = np.vstack([
                X for (other_allele, X) in X_dict.items()
                if normalize_allele_name(other_allele) != allele_name])
            Y_other_alleles = np.concatenate([
                y for (other_allele, y) in Y_log_ic50_dict.items()
                if normalize_allele_name(other_allele) != allele_name])
            print("Pre-training X shape: %s" % (X_other_alleles.shape,))
            print("Pre-training Y shape: %s" % (Y_other_alleles.shape,))
            model.fit(
                X_other_alleles,
                Y_other_alleles,
                nb_epoch=n_pretrain_epochs,
                batch_size=minibatch_size,
                verbose=0)
        print("Cross-validation for %s (%d):" % (allele_name, len(Y_allele)))
        aucs, accuracies, f1_scores = kfold_cross_validation_for_single_allele(
            allele_name=allele_name,
            model=model,
            X=X_allele,
            Y=Y_allele,
            ic50=ic50_allele,
            n_training_epochs=n_training_epochs,
            cv_folds=cv_folds,
            max_ic50=max_ic50,
            minibatch_size=minibatch_size)
        if len(aucs) == 0:
            print("Skipping allele %s" % allele_name)
            continue
        scores.add(allele_name, auc=aucs, accuracy=accuracies, f1=f1_scores)
    return scores.dataframe()
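# Hypothetical sketch of the summary the docstring above describes (the actual
# ScoreCollection implementation may differ): per-fold metrics for each allele
# are reduced to mean/median/std/min/max columns of a DataFrame.
import numpy as np
import pandas as pd

def summarize_scores_sketch(per_allele_metrics):
    """per_allele_metrics: dict mapping allele name -> dict of metric name ->
    list of per-fold values, e.g. {"HLA-A0201": {"auc": [0.91, 0.88, 0.90]}}."""
    rows = []
    for allele_name, metrics in per_allele_metrics.items():
        row = {"allele_name": allele_name}
        for metric_name, values in metrics.items():
            values = np.asarray(values, dtype=float)
            row[metric_name + "_mean"] = values.mean()
            row[metric_name + "_median"] = np.median(values)
            row[metric_name + "_std"] = values.std()
            row[metric_name + "_min"] = values.min()
            row[metric_name + "_max"] = values.max()
        rows.append(row)
    return pd.DataFrame(rows)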
def load_test_data(
        dirpaths,
        sep=r"\s+",
        ic50_base=10.0,
        comment_char="B",
        dataset_name="blind"):
    """
    Load all allele-specific datasets from the given paths, assuming
    filenames have the form:
        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
    Example:
        pred.netmhc.blind.HLA-A-3201-9.xls
        pred.blind.smmpmbec_cpp.Mamu-A-02-9.xls
    where ALLELE could be HLA-A-0201 and LENGTH is an integer.

    Returns a dictionary mapping each allele to a DataFrame with one
    prediction column per distinct predictor, along with the set of
    predictor names encountered.

    If ic50_base is not None, then transform IC50 using ic50_base ** pred.
    """
    # Dictionary mapping each allele to a dictionary keyed by peptide
    # sequence, holding binding predictions and the actual measurement
    # called "meas".
    test_datasets = {}
    predictor_names = set([])

    for dirpath in dirpaths:
        for filename in listdir(dirpath):
            filepath = join(dirpath, filename)
            dot_parts = filename.split(".")
            if len(dot_parts) != 5:
                print("Skipping %s" % filepath)
                continue
            prefixes = dot_parts[:-2]
            interesting_prefixes = {
                substring
                for substring in prefixes
                if substring not in {"pred", "test", dataset_name}
                and not substring.startswith("cv_")
            }
            if len(interesting_prefixes) != 1:
                print("Can't infer predictor name for %s" % filepath)
                continue
            predictor_name = list(interesting_prefixes)[0]
            suffix, ext = dot_parts[-2:]
            dash_parts = suffix.split("-")
            if len(dash_parts) < 2:
                print("Skipping %s due to incorrect format" % filepath)
                continue
            predictor_names.add(predictor_name)
            print("Reading %s" % filepath)
            allele = normalize_allele_name("-".join(dash_parts[:-1]))
            length = int(dash_parts[-1])
            df = pd.read_csv(filepath, sep=sep, comment=comment_char)
            df["dirpath"] = dirpath
            df["predictor"] = predictor_name
            df["allele"] = allele
            df["length"] = length
            if ic50_base is not None:
                df["pred"] = ic50_base ** df["pred"]
                df["meas"] = ic50_base ** df["meas"]

            if allele not in test_datasets:
                test_datasets[allele] = defaultdict(OrderedDict)

            dataset_dict = test_datasets[allele]
            for _, row in df.iterrows():
                sequence = row["sequence"]
                dataset_dict[sequence]["length"] = length
                dataset_dict[sequence]["meas"] = row["meas"]
                dataset_dict[sequence][predictor_name] = row["pred"]

    test_dataframes = {
        allele: pd.DataFrame.from_dict(ic50_values, orient="index")
        for (allele, ic50_values) in test_datasets.items()
    }
    return test_dataframes, predictor_names
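# Standalone illustration of the filename convention parsed above (a
# hypothetical helper; normalization of the allele string is omitted here).
def parse_prediction_filename(filename, dataset_name="blind"):
    """Split e.g. 'pred.netmhc.blind.HLA-A-3201-9.xls' into its fields."""
    dot_parts = filename.split(".")
    assert len(dot_parts) == 5, "expected 5 dot-separated parts"
    prefixes = dot_parts[:-2]
    predictor = [
        p for p in prefixes
        if p not in {"pred", "test", dataset_name} and not p.startswith("cv_")
    ][0]
    suffix = dot_parts[-2]
    dash_parts = suffix.split("-")
    allele = "-".join(dash_parts[:-1])
    length = int(dash_parts[-1])
    return predictor, allele, length

# parse_prediction_filename("pred.netmhc.blind.HLA-A-3201-9.xls")
# -> ("netmhc", "HLA-A-3201", 9)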
def normalize_allele_name_optional(s):
    return normalize_allele_name(s, raise_on_error=False)