Example #1
def encode_allele_datasets(
        allele_datasets,
        max_ic50,
        binary_encoding=False):
    """
    Parameters
    ----------
    allele_datasets : dict
        Mapping from allele name to AlleleDataset, a named tuple with
        fields "X" and "ic50"
    max_ic50 : float
        Largest IC50 value predictor should return
    binary_encoding : bool (default = False)
        If True, use a binary 1-of-k encoding of amino acids; otherwise
        use integer indices suitable for a learned vector embedding.

    Returns three dictionaries:
        - mapping from allele name to X (features)
        - mapping from allele name to Y_log_ic50 (continuous outputs)
        - mapping from allele name to binder_label (binary outputs)
    """
    X_dict = OrderedDict()
    Y_log_ic50_dict = OrderedDict()
    ic50_dict = OrderedDict()
    for (allele_name, dataset) in allele_datasets.items():
        allele_name = normalize_allele_name(allele_name)
        (X, Y_log_ic50, Y_ic50) = encode_allele_dataset(
            dataset,
            max_ic50=max_ic50,
            binary_encoding=binary_encoding)
        X_dict[allele_name] = X
        Y_log_ic50_dict[allele_name] = Y_log_ic50
        ic50_dict[allele_name] = Y_ic50
    return (X_dict, Y_log_ic50_dict, ic50_dict)
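# Usage sketch (hypothetical data, not part of the original example):
# AlleleDataset is assumed to be a namedtuple with "X" and "ic50" fields, as
# described in the docstring, and max_ic50=50000.0 is only an illustrative value.
#
#   allele_datasets = {"HLA-A*02:01": AlleleDataset(X=X_a0201, ic50=ic50_a0201)}
#   X_dict, Y_log_ic50_dict, ic50_dict = encode_allele_datasets(
#       allele_datasets, max_ic50=50000.0, binary_encoding=True)
#   # keys of the returned dicts are the normalized allele names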
Example #2
def run():
    args = parser.parse_args(sys.argv[1:])
    print(args)

    total = 0
    order = []
    name_to_record = {}
    for fasta in args.fastas:
        reader = Bio.SeqIO.parse(fasta, "fasta")
        for record in reader:
            total += 1
            if len(record.seq) < 50:
                print("-- Skipping '%s', sequence too short" % (
                    record.description,))
                continue

            parts = record.description.split()
            candidate_strings = [record.description]
            if len(parts) > 1:
                # Guard against single-token descriptions, which would
                # otherwise raise an IndexError on parts[1].
                candidate_strings.append(parts[1])
                candidate_strings.append(" ".join(parts[1:]))
            name = None
            for candidate_string in candidate_strings:
                name = normalize_allele_name(
                    candidate_string, raise_on_error=False)
                if name is not None:
                    break
            if name is None:
                print("Skipping '%s'" % (record.description,))
                continue

            print("Parsed '%s' as %s" % (record.description, name))
            record.description = name + " " + record.description

            if name in name_to_record:
                old_record = name_to_record[name]
                old_sequence = old_record.seq
                if len(old_sequence) < len(record.seq):
                    print("-- Replacing old record (%d aa) with new (%d aa)" % (
                        len(old_record.seq),
                        len(record.seq)))
                    name_to_record[name] = record
                else:
                    print("-- Skipping, already seen")
            else:
                order.append(name)
                name_to_record[name] = record


    records = [name_to_record[name] for name in order]

    with open(args.out, "w") as fd:
        Bio.SeqIO.write(records, fd, "fasta")

    print("Wrote %d / %d sequences: %s" % (len(records), total, args.out))
def filter_alleles(allele_datasets, min_samples_per_allele=5):
    for (allele_name, dataset) in sorted(
            allele_datasets.items(), key=lambda pair: pair[0]):
        # Want allele names of the form gene + 4 digits, e.g. C0401
        if allele_name.isdigit() or len(allele_name) < 5:
            print("Skipping allele %s" % (allele_name,))
            continue
        allele_name = normalize_allele_name(allele_name)
        ic50_allele = dataset.ic50
        n_samples_allele = len(ic50_allele)
        if n_samples_allele < min_samples_per_allele:
            print("Skipping allele %s due to too few samples: %d" % (
                allele_name, n_samples_allele))
            continue
        binders = ic50_allele <= 500
        if binders.all():
            print("No negative examples for %s" % allele_name)
            continue
        if not binders.any():
            print("No positive examples for %s" % allele_name)
            continue
        yield (allele_name, dataset)
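# Usage sketch (hypothetical data): iterate over the alleles that survive the
# name, sample-count, and binder/non-binder filters above.
#
#   for allele_name, dataset in filter_alleles(
#           allele_datasets, min_samples_per_allele=25):
#       print("Keeping %s: %d measurements" % (allele_name, len(dataset.ic50)))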
Example #4
def normalize_allele_name_or_return_unknown(s):
    if s is numpy.nan:
        return "UNKNOWN"
    return normalize_allele_name(s,
                                 raise_on_error=False,
                                 default_value="UNKNOWN")
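# Usage sketch: normalize a list of raw allele strings where missing values
# are numpy.nan. The exact normalized spellings depend on normalize_allele_name.
#
#   raw_alleles = ["HLA-A*02:01", "A0201", numpy.nan]
#   normalized = [normalize_allele_name_or_return_unknown(a) for a in raw_alleles]
#   # missing or unparseable entries come back as "UNKNOWN"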
Example #5
def run(argv=sys.argv[1:]):
    global GLOBAL_DATA

    # On sigusr1 print stack trace
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    configure_logging()

    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    alleles = [
        normalize_allele_name(a, raise_on_error=False) for a in args.allele
    ]
    n_bad_alleles = sum([a is None for a in alleles])
    if n_bad_alleles > 0:
        print("Dropping %d bad alleles" % n_bad_alleles)

    alleles = numpy.array(sorted({a for a in alleles if a}))

    peptides = pandas.read_csv(
        args.input_peptides,
        nrows=args.max_peptides).peptide.drop_duplicates()
    print("Filtering to valid peptides. Starting at: ", len(peptides))
    peptides = peptides[peptides.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")]
    print("Filtered to: ", len(peptides))
    peptides = peptides.unique()
    num_peptides = len(peptides)

    print("Predictions for %d alleles x %d peptides." %
          (len(alleles), num_peptides))

    if not os.path.exists(args.out):
        print("Creating", args.out)
        os.mkdir(args.out)

    GLOBAL_DATA["predictor"] = args.predictor
    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["cols"] = PREDICTOR_TO_COLS[args.predictor]

    # Write peptide and allele lists to out dir.
    out_peptides = os.path.abspath(os.path.join(args.out, "peptides.csv"))
    pandas.DataFrame({"peptide": peptides}).to_csv(out_peptides, index=False)
    print("Wrote: ", out_peptides)

    manifest_df = []
    for allele in alleles:
        for col in PREDICTOR_TO_COLS[args.predictor]:
            manifest_df.append((allele, col))
    manifest_df = pandas.DataFrame(manifest_df, columns=["allele", "kind"])
    manifest_df["col"] = (manifest_df.allele + " " + manifest_df.kind)
    manifest_df["path"] = manifest_df.col.map(
        lambda s: s.replace("*", "").replace(" ", ".")) + ".npz"
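    # e.g. (illustrative) a col of "HLA-A*02:01 affinity" becomes the path
    # "HLA-A02:01.affinity.npz" after dropping "*" and replacing the space.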
    out_manifest = os.path.abspath(os.path.join(args.out, "alleles.csv"))
    manifest_df.to_csv(out_manifest, index=False)
    col_to_filename = manifest_df.set_index("col").path.map(
        lambda s: os.path.abspath(os.path.join(args.out, s)))
    print("Wrote: ", out_manifest)

    result_df = pandas.DataFrame(index=peptides,
                                 columns=manifest_df.col.values,
                                 dtype=args.result_dtype)
    result_df[:] = numpy.nan

    if args.reuse_predictions:
        # Allocating this here to hit any memory errors as early as possible.
        is_null_matrix = numpy.ones(shape=(result_df.shape[0], len(alleles)),
                                    dtype="int8")

        for dirname in args.reuse_predictions:
            if not dirname:
                continue  # ignore empty strings
            if os.path.exists(dirname):
                print("Loading predictions", dirname)
                result_df = load_results(dirname,
                                         result_df,
                                         dtype=args.result_dtype)
            else:
                print("WARNING: skipping because does not exist", dirname)

        # We rerun any alleles that have nulls for any kind of values
        # (e.g. affinity, percentile rank, elution score).
        for (i, allele) in enumerate(alleles):
            sub_df = manifest_df.loc[manifest_df.allele == allele]
            is_null_matrix[:, i] = result_df[sub_df.col.values].isnull().any(1)
        print("Fraction null", is_null_matrix.mean())

        print("Grouping peptides by alleles")
        allele_indices_to_peptides = collections.defaultdict(list)
        for (i, peptide) in tqdm.tqdm(enumerate(peptides),
                                      total=len(peptides)):
            (allele_indices, ) = numpy.where(is_null_matrix[i])
            if len(allele_indices) > 0:
                allele_indices_to_peptides[tuple(allele_indices)].append(
                    peptide)

        del is_null_matrix

        work_items = []
        print("Assigning peptides to work items.")
        for (indices, block_peptides) in allele_indices_to_peptides.items():
            num_chunks = int(math.ceil(len(block_peptides) / args.chunk_size))
            peptide_chunks = numpy.array_split(block_peptides, num_chunks)
            for chunk_peptides in peptide_chunks:
                work_items.append({
                    'alleles': alleles[list(indices)],
                    'peptides': chunk_peptides,
                })
    else:
        # Same number of chunks for all alleles
        num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
        print("Splitting peptides into %d chunks" % num_chunks)
        peptide_chunks = numpy.array_split(peptides, num_chunks)

        work_items = []
        for (_, chunk_peptides) in enumerate(peptide_chunks):
            work_item = {
                'alleles': alleles,
                'peptides': chunk_peptides,
            }
            work_items.append(work_item)
    print("Work items: ", len(work_items))

    for (i, work_item) in enumerate(work_items):
        work_item["work_item_num"] = i

    # Combine work items to form tasks.
    tasks = []
    peptides_in_last_task = None
    # We sort work_items to put small items first so they get combined.
    for work_item in sorted(work_items, key=lambda d: len(d['peptides'])):
        if peptides_in_last_task is not None and (
                len(work_item['peptides']) + peptides_in_last_task <
                args.chunk_size):

            # Add to last task.
            tasks[-1]['work_item_dicts'].append(work_item)
            peptides_in_last_task += len(work_item['peptides'])
        else:
            # New task
            tasks.append({'work_item_dicts': [work_item]})
            peptides_in_last_task = len(work_item['peptides'])

    print("Collected %d work items into %d tasks" %
          (len(work_items), len(tasks)))

    if args.predictor == "mhcflurry":
        do_predictions_function = do_predictions_mhcflurry
    else:
        do_predictions_function = do_predictions_mhctools

    worker_pool = None
    start = time.time()
    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (do_predictions_function(**task) for task in tasks)
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_predictions_function,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=True)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(partial(call_wrapped_kwargs,
                                                     do_predictions_function),
                                             tasks,
                                             chunksize=1)

    allele_to_chunk_index_to_predictions = {}
    for allele in alleles:
        allele_to_chunk_index_to_predictions[allele] = {}

    def write_col(col):
        out_path = os.path.join(args.out, col_to_filename[col])
        numpy.savez(out_path, result_df[col].values)
        print("Wrote [%f%% null]:" % (result_df[col].isnull().mean() * 100.0),
              out_path)

    print("Writing all columns.")
    last_write_time_per_column = {}
    for col in result_df.columns:
        write_col(col)
        last_write_time_per_column[col] = time.time()
    print("Done writing all columns. Reading results.")

    for worker_results in tqdm.tqdm(results, total=len(tasks)):
        for (work_item_num, col_to_predictions) in worker_results:
            for (col, predictions) in col_to_predictions.items():
                result_df.loc[work_items[work_item_num]['peptides'],
                              col] = predictions
                if time.time() - last_write_time_per_column[col] > 180:
                    write_col(col)
                    last_write_time_per_column[col] = time.time()

    print("Done processing. Final write for each column.")
    for col in result_df.columns:
        write_col(col)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()

    prediction_time = time.time() - start
    print("Done generating predictions in %0.2f min." %
          (prediction_time / 60.0))
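# Reading a written column back (sketch): write_col saves a single unnamed
# array via numpy.savez, so it lands under numpy's default key "arr_0". The
# path below is hypothetical; real paths are listed in the alleles.csv manifest.
#
#   with numpy.load("HLA-A02:01.affinity.npz") as npz:
#       affinities = npz["arr_0"]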
Example #6
def normalize_allele_name_or_return_unknown(s):
    result = normalize_allele_name(s,
                                   raise_on_error=False,
                                   default_value="UNKNOWN")
    return result

    # Fragment from a separate training script (enclosing function truncated):
    if imputer is None:
        imputed_dataset = Dataset.create_empty()
    else:
        imputed_dataset = dataset.impute_missing_values(imputer)

    for allele_name in alleles:
        allele_dataset = dataset.get_allele(allele_name)

        n_allele = len(allele_dataset)

        if allele_name in imputed_dataset.unique_alleles():
            imputed_dataset_allele = imputed_dataset.get_allele(allele_name)
        else:
            imputed_dataset_allele = Dataset.create_empty()

        # normalize allele name to check whether it's just digits
        allele_name = normalize_allele_name(allele_name)
        if allele_name.isdigit():
            print("Skipping allele %s" % (allele_name,))
            continue

        print("\n=== Training predictor for %s: %d samples" % (
            allele_name,
            n_allele))

        model = predictor_from_args(args, allele_name)

        json_filename = allele_name + ".json"
        json_path = join(args.output_dir, json_filename)

        hdf_filename = allele_name + ".hdf"
        hdf_path = join(args.output_dir, hdf_filename)
def leave_out_allele_cross_validation(
        model,
        allele_datasets,
        max_ic50,
        binary_encoding=False,
        n_pretrain_epochs=0,
        n_training_epochs=100,
        min_samples_per_allele=5,
        cv_folds=5,
        minibatch_size=128):
    """
    Fit the model for every allele in the dataset and return a DataFrame
    with the following columns:
            allele_name
            dataset_size
            auc_mean
            auc_median
            auc_std
            auc_min
            auc_max
            accuracy_mean
            accuracy_median
            accuracy_std
            accuracy_min
            accuracy_max
            f1_mean
            f1_median
            f1_std
            f1_min
            f1_max
    """
    scores = ScoreCollection()
    X_dict, Y_log_ic50_dict, ic50_dict = encode_allele_datasets(
        allele_datasets=allele_datasets,
        max_ic50=max_ic50,
        binary_encoding=binary_encoding)
    initial_weights = [w.copy() for w in model.get_weights()]
    for allele_name, dataset in filter_alleles(
            allele_datasets, min_samples_per_allele=min_samples_per_allele):
        model.set_weights(initial_weights)
        X_allele = X_dict[allele_name]
        Y_allele = Y_log_ic50_dict[allele_name]
        ic50_allele = ic50_dict[allele_name]
        if n_pretrain_epochs > 0:
            X_other_alleles = np.vstack([
                X
                for (other_allele, X) in X_dict.items()
                if normalize_allele_name(other_allele) != allele_name])
            Y_other_alleles = np.concatenate([
                y
                for (other_allele, y)
                in Y_log_ic50_dict.items()
                if normalize_allele_name(other_allele) != allele_name])
            print("Pre-training X shape: %s" % (X_other_alleles.shape,))
            print("Pre-training Y shape: %s" % (Y_other_alleles.shape,))
            model.fit(
                X_other_alleles,
                Y_other_alleles,
                nb_epoch=n_pretrain_epochs,
                batch_size=minibatch_size,
                verbose=0)
        print("Cross-validation for %s (%d):" % (allele_name, len(Y_allele)))
        aucs, accuracies, f1_scores = kfold_cross_validation_for_single_allele(
            allele_name=allele_name,
            model=model,
            X=X_allele,
            Y=Y_allele,
            ic50=ic50_allele,
            n_training_epochs=n_training_epochs,
            cv_folds=cv_folds,
            max_ic50=max_ic50,
            minibatch_size=minibatch_size)
        if len(aucs) == 0:
            print("Skipping allele %s" % allele_name)
            continue
        scores.add(allele_name, auc=aucs, accuracy=accuracies, f1=f1_scores)
    return scores.dataframe()
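# Usage sketch (assumed setup; `make_model` is hypothetical): run
# leave-one-allele-out cross-validation and save the per-allele score table.
#
#   scores_df = leave_out_allele_cross_validation(
#       model=make_model(),   # Keras-style model exposing fit/get_weights/set_weights
#       allele_datasets=allele_datasets,
#       max_ic50=50000.0,
#       binary_encoding=True,
#       n_pretrain_epochs=5,
#       cv_folds=5)
#   scores_df.to_csv("cv_scores.csv", index=False)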
Example #9
def load_test_data(
        dirpaths,
        sep="\s+",
        ic50_base=10.0,
        comment_char="B",
        dataset_name="blind"):
    """
    Load all allele-specific datasets from the given path assuming filenames
    have the form:
        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
    Example:
        pred.netmhc.blind.HLA-A-3201-9.xls
        pred.blind.smmpmbec_cpp.Mamu-A-02-9.xls
    where ALLELE could be HLA-A-0201 and LENGTH is an integer

    Combines all loaded files into a dictionary mapping each allele to a
    DataFrame, returned along with the set of predictor names seen.

    Each per-allele DataFrame has one prediction column per distinct
    predictor, plus "length" and "meas" columns.

    If ic50_base is not None, then transform IC50 using ic50_base ** pred
    """

    # dictionary mapping from (allele, sequence) to dictionary of binding
    # predictions and the actual measurement called "meas"
    test_datasets = {}
    predictor_names = set()

    for dirpath in dirpaths:
        for filename in listdir(dirpath):
            filepath = join(dirpath, filename)
            dot_parts = filename.split(".")
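            # e.g. "pred.netmhc.blind.HLA-A-3201-9.xls" splits into
            # ["pred", "netmhc", "blind", "HLA-A-3201-9", "xls"] (5 parts)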
            if len(dot_parts) != 5:
                print("Skipping %s" % filepath)
                continue
            prefixes = dot_parts[:-2]
            interesting_prefixes = {
                substring
                for substring in prefixes
                if substring not in {"pred", "test", dataset_name}
                and not substring.startswith("cv_")
            }
            if len(interesting_prefixes) != 1:
                print("Can't infer predictor name for %s" % filepath)
                continue
            predictor_name = list(interesting_prefixes)[0]
            suffix, ext = dot_parts[-2:]
            dash_parts = suffix.split("-")
            if len(dash_parts) < 2:
                print("Skipping %s due to incorrect format" % filepath)
                continue
            predictor_names.add(predictor_name)
            print("Reading %s" % filepath)
            allele = normalize_allele_name("-".join(dash_parts[:-1]))
            length = int(dash_parts[-1])
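            # e.g. the suffix "HLA-A-3201-9" yields allele "HLA-A-3201"
            # (then normalized) and peptide length 9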
            df = pd.read_csv(filepath, sep=sep, comment=comment_char)
            df["dirpath"] = dirpath
            df["predictor"] = predictor_name
            df["allele"] = allele
            df["length"] = length
            if ic50_base is not None:
                df["pred"] = ic50_base ** df["pred"]
                df["meas"] = ic50_base ** df["meas"]

            if allele not in test_datasets:
                test_datasets[allele] = defaultdict(OrderedDict)

            dataset_dict = test_datasets[allele]
            for _, row in df.iterrows():
                sequence = row["sequence"]
                dataset_dict[sequence]["length"] = length
                dataset_dict[sequence]["meas"] = row["meas"]
                dataset_dict[sequence][predictor_name] = row["pred"]
    test_dataframes = {
        allele: pd.DataFrame.from_dict(
            ic50_values, orient="index")
        for (allele, ic50_values) in test_datasets.items()
    }
    return test_dataframes, predictor_names
def normalize_allele_name_optional(s):
    return normalize_allele_name(s, raise_on_error=False)