Exemple #1
0
    def parse_external_result(self, output):
        """
        Reads the ``.typing.txt`` report that belongs to ``output`` and
        collects the allele pairs listed after the
        "Inferred Allelic Pairs" banner line

        If ``output`` is a directory the report is ``<output>/.typing.txt``,
        otherwise ``.typing.txt`` is appended to ``output`` itself

        :param str output: The path to the output dir (or an output prefix)
        :return: The predicted HLA genotype
        :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
        """
        banner = "------------ Inferred Allelic Pairs -------------"
        suffix = ".typing.txt"
        report = (os.path.join(output, suffix)
                  if os.path.isdir(output) else output + suffix)

        genotype = []
        in_pairs_section = False
        with open(report, "r") as handle:
            for line in handle:
                if in_pairs_section and line.strip() != "":
                    first, second, _ = line.split()
                    # only the first two colon-separated fields are kept
                    for name in (first, second):
                        genotype.append(
                            Allele("HLA-" + ":".join(name.split(":")[:2])))
                # checked after parsing so the banner line itself is skipped
                if banner in line:
                    in_pairs_section = True
        return genotype
 def setUp(self):
     """Build the peptide, allele and transcript fixtures shared by the tests."""
     mhc1_sequences = ("SYFPEITHI", "IHTIEPFYS")
     mhc2_sequences = ("AAAAAASYFPEITHI", "IHTIEPFYSAAAAAA")
     mhc1_names = ("HLA-B*07:02", "HLA-A*02:01")
     mhc2_names = ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
     combined_names = ("DPA1*01:03-DPB1*01:01", "DQA1*06:02-DQB1*06:31")

     self.peptides_mhcI = [Peptide(seq) for seq in mhc1_sequences]
     self.peptides_mhcII = [Peptide(seq) for seq in mhc2_sequences]
     self.mhcI = [Allele(name) for name in mhc1_names]
     self.mhcII = [Allele(name) for name in mhc2_names]
     self.mhcII_combined_alleles = [CombinedAllele(name) for name in combined_names]
     self.transcript = Transcript("")
Exemple #3
0
 def test_allele_factory(self):
     """Allele() yields a CombinedAllele for a combined name and keeps ``prob``."""
     combined = Allele("HLA-DPA1*01:03-DPB1*01:01", prob=1)
     single = Allele("HLA-A*02:01", prob=2)
     expectations = (
         (combined, CombinedAllele, 1),
         (single, Allele, 2),
     )
     for instance, expected_type, expected_prob in expectations:
         self.assertIsInstance(instance, expected_type)
         self.assertEqual(instance.prob, expected_prob)
def read_hla_input(hla_file):
    """
    reads in the hla file (tab separated, with a header line)
    header are defined as:

    A1 - first HLA-A allele in 4-digit notation
    A2 - second HLA-A allele in 4-digit notation
    B1 - first HLA-B allele in 4-digit notation
    B2 - second HLA-B allele in 4-digit notation
    C1 - first HLA-C allele in 4-digit notation
    C2 - second HLA-C allele in 4-digit notation
    A_expression - expression of HLA A gene
    B_expression - expression of HLA B gene
    C_expression - expression of HLA C gene

    Every allele gets the expression of its gene attached as "abundance"
    metadata.

    :param hla_file: path to the tab separated HLA file
    :return: list(Allele) - six alleles per data row; empty if the file has
             only a header
    """
    alleles = []

    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11
    # (ValueError) and text mode already applies universal newlines there.
    with open(hla_file, "r") as f:
        reader = csv.DictReader(f, delimiter='\t')

        for row in reader:
            for n, hla in itr.product([1, 2], ["A", "B", "C"]):
                a = Allele(row[hla + str(n)])
                a.log_metadata("abundance", float(row[hla + "_expression"]))
                alleles.append(a)

    return alleles
Exemple #5
0
    def parse_external_result(self, output):
        """
        Reads ``winner.hla.txt`` from the given output dir and turns every
        non-blank line into two alleles

        After removing "-n" and "-e", a line must consist of three whitespace
        separated fields: a label followed by two allele names whose
        "_"-separated parts 1, 2 and 3 give locus, first and second field

        :param str output: The path to the output dir
        :return: The predicted HLA genotype
        :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
        :raises IOError: if the file cannot be opened or a line does not have
                         the expected format
        """
        result_file = os.path.join(output, "winner.hla.txt")
        try:
            f = open(result_file, "r")
        except IOError:
            raise IOError(
                "File {out} could not be found. Please check your specified output folder"
                .format(out=result_file))

        alleles = []
        # Parsing happens outside the try above so that a format error is not
        # re-reported as a missing file.
        with f:
            for l in f:
                if not l.strip():
                    continue
                try:
                    _, a1, a2 = l.replace("-n",
                                          "").replace("-e",
                                                      "").strip().split()
                    a1 = a1.split("_")
                    a2 = a2.split("_")
                    alleles.extend([
                        Allele("HLA-" + a1[1].upper() + "*" + a1[2] + ":" +
                               a1[3]),
                        Allele("HLA-" + a2[1].upper() + "*" + a2[2] + ":" +
                               a2[3])
                    ])
                except (ValueError, IndexError):
                    # wrong number of fields or an allele name with too few parts
                    raise IOError(
                        "Output format seems incorrect:\n{line}\n. Please check if Polysolver ran correctly."
                        .format(line=l))
        return alleles
def read_hla_input(hla_file):
    """
    reads in the hla file (tab separated, with a header line)
    header are defined as:

    A1 - first HLA-A allele in 4-digit notation
    A2 - second HLA-A allele in 4-digit notation
    B1 - first HLA-B allele in 4-digit notation
    B2 - second HLA-B allele in 4-digit notation
    C1 - first HLA-C allele in 4-digit notation
    C2 - second HLA-C allele in 4-digit notation
    A_expression - expression of HLA A gene
    B_expression - expression of HLA B gene
    C_expression - expression of HLA C gene

    Every allele gets the expression of its gene attached as "abundance"
    metadata.

    :param hla_file: path to the tab separated HLA file
    :return: list(Allele) - six alleles per data row; empty if the file has
             only a header
    """
    alleles = []

    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11
    # (ValueError) and text mode already applies universal newlines there.
    with open(hla_file, "r") as f:
        reader = csv.DictReader(f, delimiter='\t')

        for row in reader:
            for n, hla in itr.product([1, 2], ["A", "B", "C"]):
                a = Allele(row[hla + str(n)])
                a.log_metadata("abundance", float(row[hla + "_expression"]))
                alleles.append(a)

    return alleles
Exemple #7
0
def affinities_from_csv(bindings_file,
                        allele_data=None,
                        peptide_coverage=None):
    ''' Reads a csv of binding affinities into an EpitopePredictionResult indexed by
        (Seq, Method) with one Allele column per remaining csv column.

        When peptide_coverage is given, rows whose peptide is not contained in it are dropped
        and the kept peptides get their covering proteins registered. When allele_data is
        given, every allele is created with its 'frequency' divided by 100.
    '''
    table = pd.read_csv(bindings_file)
    table['Seq'] = table['Seq'].apply(Peptide)

    if peptide_coverage is not None:
        mask = []
        for peptide in table['Seq']:
            covered = peptide in peptide_coverage
            mask.append(covered)
            if covered:
                for protein in peptide_coverage[str(peptide)]:
                    peptide.proteins[protein] = protein
        table = table[mask]

    table = table.set_index(['Seq', 'Method'])

    if allele_data is None:
        table.columns = [Allele(name) for name in table.columns]
    else:
        table.columns = [
            Allele(name, allele_data[name]['frequency'] / 100)
            for name in table.columns
        ]

    return EpitopePredictionResult(table)
Exemple #8
0
    def parse_external_result(self, output):
        """
        Reads the tab separated genotype files
        ``<output>-ClassI.HLAgenotype4digits`` and
        ``<output>-ClassII.HLAgenotype4digits`` and collects the alleles from
        their "Allele 1" and "Allele 2" columns

        A file that cannot be opened only triggers a warning; the alleles of
        the other file are still returned

        :param str output: The output prefix the file suffixes are appended to
        :return: The predicted HLA genotype
        :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
        """
        alleles = []
        sources = (("-ClassI.HLAgenotype4digits", "HLA-I"),
                   ("-ClassII.HLAgenotype4digits", "HLA-II"))
        for suffix, hla_class in sources:
            result_file = output + suffix
            try:
                with open(result_file) as handle:
                    for row in csv.DictReader(handle, delimiter="\t"):
                        alleles.extend([Allele("HLA-" + row["Allele 1"]),
                                        Allele("HLA-" + row["Allele 2"])])
            except IOError as e:
                # every placeholder is filled inside format(); passing error=e
                # to warn() instead made format() raise KeyError
                warnings.warn(
                    "Output file {path} for {hla_class} could not be found. {error}".format(
                        path=result_file, hla_class=hla_class, error=e))

        return alleles
Exemple #9
0
 def test_consistency(self):
     """
     tests all __*__ (including init)
     test has several asserts! If one fails, the following will not be evaluated!
     """
     # repr() of the fixture must be the plain allele name
     self.assertTrue(repr(self.simple) == "HLA-A*02:01")
     # a freshly built allele with the same name must compare equal
     self.assertEqual(self.simple, Allele("HLA-A*02:01"))
     # NOTE(review): this compares a str (repr) with an Allele object, so it
     # presumably passes regardless of how Allele handles the extra ":666"
     # field - confirm whether self.simple was meant on the left-hand side
     self.assertNotEqual(repr(self.simple), Allele("HLA-A*02:01:666"))
 def setUp(self):
     """Build the peptide and allele fixtures (MHC-I and MHC-II) for the tests."""
     # one 9-mer pair for MHC-I; a 9-mer and a 15-mer for MHC-II
     mhc1_sequences = ("SYFPEITHI", "IHTIEPFYS")
     mhc2_sequences = ("SYFPEITHI", "IHTIEPFYSAAAAAA")
     self.peptides_mhcI = [Peptide(seq) for seq in mhc1_sequences]
     self.peptides_mhcII = [Peptide(seq) for seq in mhc2_sequences]

     mhc1_names = ("HLA-B*15:01", "HLA-A*02:01")
     mhc2_names = ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
     self.mhcI = [Allele(name) for name in mhc1_names]
     self.mhcII = [Allele(name) for name in mhc2_names]
def compute_affinities(input_alleles, input_peptides, output_affinities,
                       processes, predictor):
    ''' Predicts binding affinities for all peptides against all alleles, in batches of 256
        peptides, and writes the results to output_affinities as csv (first chunk creates
        the file with a header, later chunks are appended)
    '''
    allele_names = utilities.get_alleles_and_thresholds(input_alleles).index
    alleles = [Allele(name.replace('HLA-', '')) for name in allele_names]
    LOGGER.info('Loaded %d alleles', len(alleles))

    peptides = []
    with open(input_peptides) as f:
        for record in csv.DictReader(f):
            protein_count = len(record['proteins'].split(';'))
            peptides.append((Peptide(record['peptide']), protein_count))

    # peptides occurring in the most proteins are processed first
    peptides.sort(key=lambda item: item[1], reverse=True)
    LOGGER.info('Loaded %d peptides', len(peptides))

    only_peptides = (pep for pep, _ in peptides)
    jobs = ((predictor.lower(), chunk, alleles)
            for chunk in utilities.batches(only_peptides, bsize=256))
    results = utilities.parallel_apply(get_binding_affinity_process, jobs,
                                       processes)

    done = 0
    for bindings in results:
        nothing_written = done == 0
        bindings.to_csv(output_affinities,
                        header=nothing_written,
                        mode='w' if nothing_written else 'a')
        done += len(bindings)
        LOGGER.debug('Processed %d peptides (%.2f%%)...', done,
                     100 * done / len(peptides))
Exemple #12
0
    def parse_external_result(self, output):
        """
        Reads the tab separated genotype files
        ``<output>-ClassI.HLAgenotype4digits`` and
        ``<output>-ClassII.HLAgenotype4digits`` ("Allele 1" / "Allele 2"
        columns, single quotes removed)

        Class I alleles and class II rows mentioning "DRB" are returned as
        alleles; rows mentioning "DQA" and all remaining class II rows are
        combined into one combined allele per (DQA, other) pair

        A file that cannot be opened only triggers a warning

        :param str output: The output prefix the file suffixes are appended to
        :return: The predicted HLA genotype
        :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
        """
        alleles = []

        class1_file = output + "-ClassI.HLAgenotype4digits"
        try:
            with open(class1_file) as c1:
                for row in csv.DictReader(c1, delimiter="\t"):
                    alleles.extend([
                        Allele("HLA-" + row["Allele 1"].replace("'", "")),
                        Allele("HLA-" + row["Allele 2"].replace("'", ""))
                    ])
        except IOError as e:
            # error=e belongs to format(); handing it to warn() made format()
            # raise KeyError instead of emitting the warning
            warnings.warn(
                "Output file {c1} for HLA-I could not be found. {error}".
                format(c1=class1_file, error=e))

        class2_file = output + "-ClassII.HLAgenotype4digits"
        try:
            with open(class2_file) as c2:
                DQA = []
                DQB = []
                for row in csv.DictReader(c2, delimiter="\t"):
                    a1 = row["Allele 1"].replace("'", "")
                    a2 = row["Allele 2"].replace("'", "")
                    if "DRB" in a1 or "DRB" in a2:
                        alleles.extend(
                            [Allele("HLA-" + a1),
                             Allele("HLA-" + a2)])
                    elif "DQA" in a1 or "DQA" in a2:
                        DQA.extend([a1, a2])
                    else:
                        DQB.extend([a1, a2])
                # Pair the chains once, after all rows were read; doing this
                # per row appended the same pairs again for every later row.
                for dq in itertools.product(DQA, DQB):
                    alleles.append(CombinedAllele("HLA-" + "-".join(dq)))
        except IOError as e:
            warnings.warn(
                "Output file {c2} for HLA-II could not be found. {error}".
                format(c2=class2_file, error=e))

        return alleles
def predict_peptide_effects(peptides, alleles=None):
    """
    Predict the peptide effect for all the available methods on the machine

    Predictors that raise an exception are reported on stdout and skipped.

    Args:
        peptides (list of Peptides): Usually an output from read_fasta
        alleles (list of chars): Alleles for which to run the predictors.
            A predictor is skipped when none of them is among its
            supported alleles. None passes no allele restriction on.

    Returns:
        pd.DataFrame: Tidy pd.DataFrame with the columns peptide, method,
                      allele and score. If the method is unable to predict
                      for a particular value the rows are not present. The
                      frame is empty when no predictor produced a result.

    Example:
    >>> peptides = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL"), Peptide("LLGATCMFV")]
    >>> alleles = ['A*02:16', 'B*45:01']
    >>> predict_peptide_effects(peptides, alleles = alleles).head()
                               Seq    Method   allele       score
    0  (F, I, A, S, N, G, V, K, L)       arb  A*02:16  594.691144
    1  (F, I, A, S, N, G, V, K, L)       smm  A*02:16  159.768074
    2  (F, I, A, S, N, G, V, K, L)  smmpmbec  A*02:16  211.977614
    4  (F, I, A, S, N, G, V, K, L)   unitope  A*02:16    0.527849
    5  (L, L, G, A, T, C, M, F, V)       arb  A*02:16    6.784222
    """
    dt = valid_predictors()
    results = []
    for _, predictor in dt.iterrows():
        # subset to valid alleles
        if alleles is not None:
            valid_alleles = predictor["supportedAlleles"].intersection(
                alleles)

            if len(valid_alleles) == 0:
                continue
            valid_alleles = [Allele(al) for al in valid_alleles]
        else:
            valid_alleles = None
        method = predictor["name"]
        print("method: ", method)
        t0 = time.time()

        try:
            results.append(
                EpitopePredictorFactory(method).predict(peptides,
                                                        alleles=valid_alleles))
        except Exception:
            # Best effort: one failing predictor must not abort the others.
            # Exception (not a bare except) so Ctrl-C and SystemExit still
            # propagate.
            print("Error! Unable to run ", method, ": ", sys.exc_info())
        t1 = time.time()
        print("  - runtime: ", str(t1 - t0))

    if not results:
        # nothing to merge - results[0] would raise IndexError
        return pd.DataFrame(columns=["peptide", "method", "allele", "score"])

    df = results[0].merge_results(results[1:]).reset_index()
    dfm = pd.melt(df,
                  id_vars=["Seq", "Method"],
                  var_name="allele",
                  value_name="score")
    dfm = dfm[dfm["score"].notnull()]
    dfm.rename(columns={'Seq': 'peptide', 'Method': 'method'}, inplace=True)
    return dfm
Exemple #14
0
def generate_alleles(allele_file, generated=None):
    """
    Generate allele objects from an allele/frequency file.

    Every non-blank line must hold an allele name and a frequency, separated
    by whitespace, "," or ";". Only alleles whose name (after an optional
    "HLA-" prefix) starts with A, B or C are kept; the frequency becomes the
    allele's ``prob``. Blank lines are ignored.

    :param allele_file: path to the allele file
    :param generated: not used by this function
    :return: list of Allele objects in file order
    """
    result = []
    with open(allele_file, "r") as f:
        for l in f:
            fields = l.replace(",", " ").replace(";", " ").split()
            if not fields:
                # blank line (e.g. trailing newline at the end of the file)
                continue
            al, freq = fields
            # [:1] instead of [0]: a bare "HLA-" name is skipped, not a crash
            if al.split("HLA-")[-1][:1] in ("A", "B", "C"):
                result.append(Allele(al, prob=float(freq)))
    return result
Exemple #15
0
    def test_pareto_front_assembly(self):
        """Run paretosolve() with PCM/SMM on HLA-A*02:01 and print the result."""
        cleavage_predictor = CleavageSitePredictorFactory("PCM")
        epitope_predictor = EpitopePredictorFactory("SMM")
        alleles = [Allele("HLA-A*02:01")]
        thresholds = {}
        for entry in alleles:
            thresholds[entry.name] = 10000

        def at_most(a, b):
            return a <= b

        assembler = ParetoEpitopeAssembly(self.peptides,
                                          cleavage_predictor,
                                          epitope_predictor,
                                          alleles,
                                          thresholds,
                                          at_most,
                                          solver="cbc",
                                          verbosity=0)
        front = assembler.paretosolve()
        print(front)
Exemple #16
0
def run_predictor(pred, dataset):
    """
    Run the epitope predictor named ``pred`` on ``dataset`` for the alleles
    in ``args.allele`` and print the result together with its description.

    A ValueError during prediction or printing is swallowed.

    Returns a tuple (number of results, size of dataset); the number of
    results is 0 when prediction did not succeed.
    """
    engine = EpitopePredictorFactory(pred)
    results = ()
    try:
        allele_objects = [Allele(name) for name in args.allele]
        results = engine.predict(dataset, alleles=allele_objects)
        print(results)
        print(results.describe())
    except ValueError:
        pass

    predicted = len(results)
    total = len(dataset)
    return (predicted, total)
Exemple #17
0
    def test_pareto_assembly(self):
        """Print the SMM predictions, then solve the pareto assembly and print it."""
        cleavage_predictor = CleavageSitePredictorFactory("PCM")
        epitope_predictor = EpitopePredictorFactory("SMM")
        alleles = [Allele("HLA-A*02:01")]
        thresholds = {}
        for entry in alleles:
            thresholds[entry.name] = 10000

        def at_most(a, b):
            return a <= b

        predictions = epitope_predictor.predict(self.peptides, alleles=alleles)
        print(predictions)

        # positional order: peptides, cleavage predictor, epitope predictor,
        # alleles, threshold, comparator
        assembler = ParetoEpitopeAssembly(self.peptides,
                                          cleavage_predictor,
                                          epitope_predictor,
                                          alleles,
                                          thresholds,
                                          at_most,
                                          solver="cbc",
                                          verbosity=1)
        solution = assembler.solve(eps=1e10, order=(1, 0))
        print(solution)
def design_spacers(input_epitopes, input_alleles, top_proteins, top_immunogen,
                   top_alleles, solver, pssm_cleavage, alpha, beta,
                   spacer_length, pssm_epitope, processes, output_spacers):
    ''' Runs OptimalSpacerDesign on the loaded epitopes and writes, for every ordered pair
        of distinct epitopes, a csv row (from, to, score, spacer) to output_spacers.

        Epitopes containing 'X' are discarded. Raises ValueError unless pssm_cleavage is
        'PCM' and pssm_epitope is 'BIMAS'.
    '''

    all_epitopes = list(
        utilities.load_epitopes(input_epitopes, top_immunogen, top_alleles,
                                top_proteins).keys())
    # drop epitopes with the placeholder amino acid 'X'
    epitopes = [e for e in all_epitopes if 'X' not in e]
    LOGGER.debug('Removed %d epitopes with unknown amino acids',
                 len(all_epitopes) - len(epitopes))
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    alleles_df = utilities.get_alleles_and_thresholds(input_alleles)
    # allele names lose their 'HLA-' prefix; the frequency column is divided
    # by 100 (presumably a percentage - confirm against the allele file)
    allele_list = [
        Allele(a.replace('HLA-', ''), prob=row.frequency / 100)
        for a, row in alleles_df.iterrows()
    ]
    # thresholds are keyed by the same prefix-free allele names
    threshold = {
        a.replace('HLA-', ''): row.threshold
        for a, row in alleles_df.iterrows()
    }
    LOGGER.info('Loaded %d alleles', len(allele_list))

    if pssm_cleavage != 'PCM':
        raise ValueError('Only PCM supported as cleavage predictor')
    cleavage_predictor = PCM()  # TODO use factory when it works
    if pssm_epitope != 'BIMAS':
        raise ValueError('Only BIMAS supported as epitope predictor')
    epitope_predictor = BIMAS()  # TODO use factory when it works

    # note: designer is whatever solve() returns, not the design object itself
    designer = OptimalSpacerDesign(
        epitopes,
        cleavage_predictor,
        epitope_predictor,
        allele_list,
        threshold=threshold,
        solver=solver,
        k=spacer_length,
        alpha=alpha,
        beta=beta,
    ).solve(threads=processes)

    LOGGER.info('Writing results...')
    with open(output_spacers, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('from', 'to', 'score', 'spacer'))
        # one row per ordered pair of distinct epitopes
        writer.writerows(
            (ei, ej, designer.adj_matrix[ei, ej], designer.spacer[ei, ej])
            for ei in epitopes for ej in epitopes if ei != ej)
Exemple #19
0
    def parse_external_result(self, output):
        """
        Searches within the defined output dir for the most recently modified
        sub-directory and reads the prediction file
        ``<subdir>/<subdir name>_result.tsv`` from there

        Only the first data row of the tab separated file is used; its
        columns A1, A2, B1, B2, C1 and C2 are turned into alleles

        :param str output: The path to the output dir
        :return: The predicted HLA genotype
        :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
        """
        all_subdirs = [
            os.path.join(output, d) for d in os.listdir(output)
            if os.path.isdir(os.path.join(output, d))
        ]
        latest_subdir = max(all_subdirs, key=os.path.getmtime)
        run_name = os.path.basename(os.path.normpath(latest_subdir))
        result_file = os.path.join(latest_subdir, run_name + "_result.tsv")
        with open(result_file, "r") as f:
            # builtin next() works on Python 2 and 3; the .next() method only
            # exists on Python 2 iterators
            row = next(csv.DictReader(f, delimiter="\t"))
            # a real list, as documented (map() is lazy on Python 3)
            return [
                Allele("HLA-" + row[k])
                for k in ["A1", "A2", "B1", "B2", "C1", "C2"]
            ]
Exemple #20
0
def generate_epitope_result(input, allele_file):
    """
    Builds an EpitopePredictionResult from a tab separated prediction table
    (output of epitopeprediction / neoepitopeprediction) and an allele file
    with one "<allele><TAB><frequency>" entry per line.

    Only table columns that name an allele from the allele file are used.
    Writes a message to stderr and exits with -1 when no such column exists.
    Returns the result together with the method of the table's first row.
    """
    # alleles of the population, keyed by their name
    known_alleles = {}
    with open(allele_file, "r") as handle:
        for line in handle:
            name, frequency = line.split("\t")
            known_alleles[name] = Allele(name, prob=float(frequency))

    table = pandas.read_csv(input, sep="\t")
    method = table.loc[0, "Method"]
    meta_columns = {"Sequence", "Method", "Antigen ID", "Variant"}
    score_columns = [col for col in table.columns if col not in meta_columns]

    scores = {}
    for _, record in table.iterrows():
        positions = collections.defaultdict(list)
        try:
            positions = {
                Protein(pid, gene_id=pid, transcript_id=pid): [0]
                for pid in str(record["Antigen ID"]).split(",")
            }
        except KeyError:
            pass
        peptide = Peptide(record["Sequence"], protein_pos=positions)
        for col in score_columns:
            if col not in known_alleles:
                continue
            scores.setdefault(known_alleles[col], {})[peptide] = float(record[col])

    if not scores:
        sys.stderr.write("HLA alleles of population and HLA used for prediction did not overlap.")
        sys.exit(-1)

    result = EpitopePredictionResult.from_dict(scores)
    result.index = pandas.MultiIndex.from_tuples(
        [(entry, method) for entry in result.index],
        names=['Seq', 'Method'])
    return result, method
def string_of_beads(input_proteins, input_alleles, input_epitopes,
                    input_cleavages, output_vaccine, cocktail, greedy_subtour,
                    max_aminoacids, max_epitopes, min_alleles, min_proteins,
                    min_avg_prot_conservation, min_avg_alle_conservation):
    ''' Builds a TeamOrienteeringIlp over the epitopes that have cleavage scores, solves it
        and writes the selected epitopes to output_vaccine as csv (cocktail, index, epitope).
        Timings of the individual phases are logged at the end.
    '''
    program_start_time = time.time()

    # load proteins
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [
        Allele(a)
        for a in utilities.get_alleles_and_thresholds(input_alleles).index
    ]
    LOGGER.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitopes = utilities.load_epitopes(input_epitopes)
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    # read cleavage scores
    # cleavages maps (from, to) -> score; cleavage_epitopes collects every
    # epitope that appears in the cleavage file
    cleavage_epitopes = set()
    with open(input_cleavages) as f:
        cleavages = {}
        for row in csv.DictReader(f):
            cleavages[(row['from'], row['to'])] = float(row['score'])
            cleavage_epitopes.add(row['from'])
            cleavage_epitopes.add(row['to'])
    LOGGER.info('Loaded %d cleavage scores', len(cleavages))

    # compute edge cost
    edge_cost, vertices, vertices_rewards = [], [], []
    # vertex 0 is the empty string: reward 0 and cost 0 on all its edges
    vertex_to_epitope = [''] + list(cleavage_epitopes)
    for ep_from in vertex_to_epitope:
        vertices.append(ep_from)
        vertices_rewards.append(0 if ep_from ==
                                '' else epitopes[ep_from]['immunogen'])
        # NOTE(review): indexes cleavages for every pair of real epitopes,
        # including ep_from == ep_to - presumably the cleavage file contains
        # all of them; verify, a missing pair raises KeyError
        edge_cost.append([
            cleavages[(ep_from,
                       ep_to)] if ep_from != '' and ep_to != '' else 0.0
            for ep_to in vertex_to_epitope
        ])
    LOGGER.info('Kept %d epitopes with available clevages', len(vertices) - 1)

    # coverage data is computed for the real epitopes only (vertex 0 skipped)
    type_coverage, min_type_coverage, min_avg_type_conservation = utilities.compute_coverage_matrix(
        [epitopes[e] for e in vertex_to_epitope[1:]], min_alleles,
        min_proteins, min_avg_prot_conservation, min_avg_alle_conservation,
        len(proteins), len(alleles))

    # find optimal design
    solver_build_time = time.time()
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertices_rewards,
        edge_cost=edge_cost,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
        max_edge_cost=max_aminoacids,
        max_vertices=max_epitopes,
        lazy_subtour_elimination=not greedy_subtour)
    solver.build_model()
    solver_start_time = time.time()
    result = solver.solve()
    solver_end_time = time.time()

    # print info and save
    with open(output_vaccine, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))
        for i, mosaic in enumerate(result):
            LOGGER.info('Mosaic #%d', i + 1)
            # the last entry of every mosaic is skipped; the second element
            # of each remaining entry is used as the vertex index
            for j, (_, vertex) in enumerate(mosaic[:-1]):
                epitope = epitopes[vertex_to_epitope[vertex]]
                writer.writerow((i, j, epitope['epitope']))
                LOGGER.info('    %s - IG: %.2f', epitope['epitope'],
                            epitope['immunogen'])

    LOGGER.info('==== Stopwatch')
    LOGGER.info('          Total time : %.2f s',
                solver_end_time - program_start_time)
    LOGGER.info('      Pre-processing : %.2f s',
                solver_build_time - program_start_time)
    LOGGER.info(' Model creation time : %.2f s',
                solver_start_time - solver_build_time)
    LOGGER.info('        Solving time : %.2f s',
                solver_end_time - solver_start_time)
Exemple #22
0
def run_sequential(input_epitopes, input_alleles, input_affinities,
                   output_vaccine, num_epitopes, min_alleles, min_proteins,
                   solver, **kwargs):
    ''' Selects epitopes with OptiTope, joins them with EpitopeAssemblyWithSpacer and writes
        one csv row (immunogen, vaccine, spacers, cleavage) to output_vaccine.
        Additional keyword arguments are accepted but not used.
    '''

    # epitopes containing 'X' are ignored
    epitope_data = {
        k: v
        for k, v in utilities.load_epitopes(input_epitopes).items()
        if 'X' not in k
    }
    LOGGER.info('Loaded %d epitopes', len(epitope_data))

    peptide_coverage = {
        # we don't really need the actual protein sequence, just fill it with the id to make it unique
        Peptide(r['epitope']):
        set(Protein(gid, gene_id=gid) for gid in r['proteins'])
        for r in epitope_data.values()
    }

    allele_data = utilities.get_alleles_and_thresholds(input_alleles).to_dict(
        'index')
    # allele names lose their 'HLA-' prefix; frequency is divided by 100
    alleles = [
        Allele(allele.replace('HLA-', ''), prob=data['frequency'] / 100)
        for allele, data in allele_data.items()
    ]
    threshold = {
        allele.replace('HLA-', ''): data['threshold']
        for allele, data in allele_data.items()
    }
    LOGGER.info('Loaded %d alleles', len(threshold))

    # allele_data is passed with its original (un-stripped) keys
    affinities = affinities_from_csv(input_affinities,
                                     allele_data,
                                     peptide_coverage=peptide_coverage)
    LOGGER.info('Loaded %d affinities', len(affinities))

    LOGGER.info('Selecting epitopes...')
    model = OptiTope(affinities, threshold, k=num_epitopes, solver=solver)
    # coverage constraints are only activated when a minimum was given
    if min_alleles is not None:
        model.activate_allele_coverage_const(min_alleles)
    if min_proteins is not None:
        model.activate_antigen_coverage_const(min_proteins)
    selected_epitopes = model.solve()

    LOGGER.info('Creating spacers...')
    vaccine = EpitopeAssemblyWithSpacer(selected_epitopes,
                                        PCM(),
                                        BIMAS(),
                                        alleles,
                                        threshold=threshold,
                                        solver=solver).solve()

    # even positions of the result are treated as epitopes, odd ones as
    # spacers (see the 'spacers' field written below)
    immunogen = sum(epitope_data[str(e)]['immunogen'] for e in vaccine[::2])
    sequence = ''.join(map(str, vaccine))
    cleavage = pcm.DoennesKohlbacherPcm().cleavage_per_position(sequence)

    with open(output_vaccine, 'w') as f:
        writer = csv.DictWriter(
            f, ('immunogen', 'vaccine', 'spacers', 'cleavage'))
        writer.writeheader()
        writer.writerow({
            'immunogen': immunogen,
            'vaccine': sequence,
            'spacers': ';'.join(str(e) for e in vaccine[1::2]),
            'cleavage': ';'.join('%.3f' % c for c in cleavage)
        })
Exemple #23
0
def get_mosaic_solver_instance(logger, input_proteins, input_alleles,
                               input_epitopes, input_overlaps, **kwargs):
    ''' Loads proteins, alleles, epitopes and overlaps and returns a configured
        TeamOrienteeringIlp together with a dict holding the loaded 'proteins',
        'alleles' and 'epitope_data'.

        top_immunogen, top_alleles and top_proteins must be present in kwargs (KeyError
        otherwise); all other options are optional.
    '''
    # required options - pop() without default raises KeyError when missing
    top_immunogen = kwargs.pop('top_immunogen')
    top_alleles = kwargs.pop('top_alleles')
    top_proteins = kwargs.pop('top_proteins')
    # optional options with their defaults
    min_overlap = kwargs.get('min_overlap', 0)
    cocktail = kwargs.get('cocktail', 1)
    greedy_subtour = kwargs.get('greedy_subtour')
    max_epitopes = kwargs.get('max_epitopes')
    max_aminoacids = kwargs.get('max_aminoacids')
    min_alleles = kwargs.get('min_alleles', 0)
    min_proteins = kwargs.get('min_proteins', 0)
    min_avg_prot_conservation = kwargs.get('min_avg_prot_conservation', 0)
    min_avg_alle_conservation = kwargs.get('min_avg_alle_conservation', 0)

    # load proteins
    logger.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    logger.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [
        Allele(a) for a in get_alleles_and_thresholds(input_alleles).index
    ]
    logger.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitope_data = list(
        load_epitopes(input_epitopes, top_immunogen, top_alleles,
                      top_proteins).values())
    logger.info('Loaded %d epitopes', len(epitope_data))

    # load edge cost
    logger.info('Loading overlaps...')
    # an extra vertex with reward 0 is put in front of the epitope rewards
    vertex_rewards = [0] + [b['immunogen'] for b in epitope_data]
    edges = load_edges_from_overlaps(input_overlaps, min_overlap,
                                     [b['epitope'] for b in epitope_data])
    logger.info('Kept %d edges (from %d)', len(edges),
                len(epitope_data) * (len(epitope_data) + 1))

    # compute hla and protein coverage
    logger.info('Computing coverage matrix...')
    type_coverage, min_type_coverage, min_avg_type_conservation = compute_coverage_matrix(
        epitope_data, min_alleles, min_proteins, min_avg_prot_conservation,
        min_avg_alle_conservation, len(proteins), len(alleles))

    # find optimal design
    # both limits start at 0 and are only updated below when a number was given
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertex_rewards,
        edge_cost=edges,
        max_edge_cost=0,
        max_vertices=0,
        lazy_subtour_elimination=not greedy_subtour,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
    )

    if isinstance(max_epitopes, (int, float)):
        solver.update_max_vertices(max_epitopes)

    if isinstance(max_aminoacids, (int, float)):
        solver.update_max_edge_cost(max_aminoacids)

    return solver, {
        'proteins': proteins,
        'alleles': alleles,
        'epitope_data': epitope_data,
    }
Exemple #24
0
from Fred2.EpitopePrediction import EpitopePredictorFactory

####################################################################################################

# Wrap every stripped input line in a Fred2.Core.Peptide object
all_peptides = [Peptide(row.strip()) for row in args.peptides]

# Group the peptides by their length
peptides_by_length = {}
for peptide in all_peptides:
    peptides_by_length.setdefault(len(peptide), []).append(peptide)

# Wrap every raw allele string in a Fred2.Core.Allele object
alleles = [Allele(allele) for allele in args.alleles]

# Instantiate the predictor
predictor = EpitopePredictorFactory("Syfpeithi")


def matrix_max(matrix):
    """Return the highest score a pssm can reach: the sum of the best value of every entry."""
    best_total = 0
    for scores in matrix.values():
        best_total += max(scores.values())
    return best_total


def load_allele_model(allele, length):
    """Returns the SYFPEITHI pssm for a given allele"""
    allele_model = "%s_%i" % (allele, length)
    try:
        return matrix_max(
Exemple #25
0
 def setUp(self):
     """Create the HLA-A*02:01 allele stored as ``self.simple``."""
     self.simple = Allele("HLA-A*02:01")
    file_in = arguments["--input"]
    if not file_in:
        file_in = "./data/binders.csv"
    file_out = arguments["--output"]

    dt = pd.read_csv(file_in)
    dt = dt[dt["Sequence"].notnull()]
    dt = dt[dt["Sequence"].str.len() == 9]

    peptides = [Peptide(peptide) for peptide in dt["Sequence"]]

    dt["allele"] = dt["allele"].str.replace("\*","").\
                   str.replace("(-[a-zA-Z]+)([0-9]{2})([0-9]{2})","\\1*\\2:\\3").\
                   str.replace("w","").\
                   str.replace("HLA-","")
    dt.rename(columns={"Sequence": "peptide"}, inplace=True)

    alleles = [Allele(allele) for allele in dt["allele"].unique().tolist()]
    res = fred2wrap.predict_peptide_effects(
        peptides, alleles=dt["allele"].unique().tolist())
    res["peptide"] = [peptide.tostring() for peptide in res["peptide"]]
    res["allele"] = [str(allele) for allele in res["allele"]]

    res = res.pivot_table(index=["peptide", "allele"],
                          columns='method',
                          values='score').reset_index(None)

    dt_merge = pd.merge(dt, res, on=["peptide", "allele"], how="left")

    dt_merge.to_csv(file_out, index=False)
Exemple #27
0
    all_peptides = dt["mutant_sequence"].append(dt["wt_sequence"]).unique()
    peptides = [Peptide(peptide) for peptide in all_peptides]

    dt["allele"] = dt["allele"].str.replace("\*", "").\
        str.replace(":", "").\
        str.replace("(-[a-zA-Z]+)([0-9]{2})([0-9]{2})", "\\1*\\2:\\3").\
        str.replace("w", "").\
        str.replace("HLA-", "")

    # TODO
    # dt.rename(columns = {"Sequence": "peptide"}, inplace = True)
    alleles = []
    valid_alleles = []
    for allele in dt["allele"].tolist():
        try:
            a = Allele(allele)
            valid_alleles.append(True)
        except:
            a = None
            valid_alleles.append(False)
        alleles.append(a)

    # subset invalid allele names
    dt = dt[pd.Series(valid_alleles)]

    res = fred2wrap.predict_peptide_effects(peptides, alleles=dt["allele"].unique().tolist())
    res["peptide"] = [str(peptide) for peptide in res["peptide"]]
    res["allele"] = [str(allele) for allele in res["allele"]]

    # TODO - change melt order
    res = res.pivot_table(index=["peptide", "allele"],