Example #1
    def test_peptides_from_variants(self):
        coding = {}
        coding['NM_080751'] = MutationSyntax('NM_080751',2629,876,'c.2630C>T','p.Pro877Leu')
        var = Variant('line0',0,20,2621905,'C','T',coding,True,False)
        var.gene = 'TMC2'
        ma = MartsAdapter(biomart="http://ensembl.org")

        vars = [var, Variant("testInsertion", 2, 20, 2621899, "", "AAAAAA", {'NM_080751':MutationSyntax('NM_080751',2625,876,'c.2630C>T','p.Pro877Leu')}, True, False)]

        test = Generator.generate_peptides_from_variants(vars, 9, ma, id_type=EIdentifierTypes.REFSEQ, peptides=None)
        test2 = [x for x in test]
        print(len(test2))

        ts = list()
        #using a tweaked generator that falls back to another sequence source if the sequence is too short with respect to the given variants,
        #in this case a newer/older sequence from BioMart than the one used as reference during annotation
        t = Generator.generate_transcripts_from_variants(vars, ma, id_type=EIdentifierTypes.REFSEQ)
        ts = [x for x in t]
        print(len(ts[0]))
        p = Generator.generate_proteins_from_transcripts(ts, to_stop=True)
        ps = [x for x in p]
        e = Generator.generate_peptides_from_proteins(ps, 9)
        es = [x for x in e]
        print(len(es))

        #print vars
        print(len(vars))
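generate_peptides_from_proteins yields every k-mer, including peptides that do not overlap any variant; the fasta/prediction scripts later in this collection drop those via get_variants_by_protein. A minimal sketch of that filter applied to the es list built above:

        # keep only peptides that actually carry a variant in at least one of their source proteins
        variant_peptides = [pep for pep in es
                            if any(pep.get_variants_by_protein(prot) for prot in pep.proteins.keys())]
        print(len(variant_peptides))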
Example #2
    def test_real_life_test(self):
        mart = MartsAdapter(biomart="http://grch37.ensembl.org/biomart/martservice?query=")

        ano_path = os.path.join(os.path.dirname(inspect.getfile(Fred2)), "Data/examples/test_annovar.out")
        vars = read_annovar_exonic(ano_path)

        peps = set(map(lambda x: str(x), Generator.generate_peptides_from_variants(vars, 9, mart,
                                                                                   EIdentifierTypes.REFSEQ)))

        peps_from_prot = set(map(str, Generator.generate_peptides_from_proteins(
            Generator.generate_proteins_from_transcripts(
            Generator.generate_transcripts_from_variants(vars, mart, EIdentifierTypes.REFSEQ)), 9)))

        self.assertTrue(len(peps - peps_from_prot) == 0)
        self.assertTrue(len(peps_from_prot - peps) == 0)
Example #3
    def test_real_life_test(self):
        mart = MartsAdapter(
            biomart="http://grch37.ensembl.org/biomart/martservice?query=")

        ano_path = os.path.join(os.path.dirname(inspect.getfile(Fred2)),
                                "Data/examples/test_annovar.out")
        vars = read_annovar_exonic(ano_path)

        peps = set(
            map(
                lambda x: str(x),
                Generator.generate_peptides_from_variants(
                    vars, 9, mart, EIdentifierTypes.REFSEQ)))

        peps_from_prot = set(
            map(
                str,
                Generator.generate_peptides_from_proteins(
                    Generator.generate_proteins_from_transcripts(
                        Generator.generate_transcripts_from_variants(
                            vars, mart, EIdentifierTypes.REFSEQ)), 9)))

        self.assertTrue(len(peps - peps_from_prot) == 0)
        self.assertTrue(len(peps_from_prot - peps) == 0)
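The two subset assertions at the end of Examples #2 and #3 together check set equality. Inside the test they could equivalently be written with unittest's assertSetEqual, which also reports the differing peptides on failure; a minimal sketch using the same names as above:

        # equivalent to the pair of subset checks above, with a more informative failure message
        self.assertSetEqual(peps, peps_from_prot)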
Example #4
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at Theo's filter, ligandoqc, fasta-distributions, lica and the morgenstellen server content scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

# MyObject = type('MyObject', (object,), {})
# options = MyObject()
# setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
#
# vt = os.path.splitext(options.var_file)[-1]
# if ".vcf" == vt:
#     vcfvars, accessions = FileReader.read_vcf(options.var_file)
#
# mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
#
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# transcripts = [x for x in transcript_gen if x.vars]
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
# proteins = [x for x in protein_gen if x.vars]
# for p in proteins:
#     p.gene_id = p.vars.values()[0][0].gene
#
#
# for t in transcripts:
#     t.gene_id = t.vars.values()[0].gene
#

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
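    # preds is assumed to be a DataFrame indexed by (Peptide, Method, ...) tuples with one column
    # per allele, so i[0] is the Peptide object and the remaining index entries are the method names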
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
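The binder filter near the end of this script keeps any row in which at least one allele column exceeds the threshold. A standalone illustration of that pandas idiom with made-up scores (toy column names and values, not real Fred2 prediction output):

import pandas as pd

# two peptides scored against two alleles; values are invented for illustration
scores = pd.DataFrame({'A*01:01': [0.10, 0.60],
                       'B*07:02': [0.30, 0.20]},
                      index=['PEPTIDEONE', 'PEPTIDETWO'])
threshold = 0.4256  # the "at least weak binder" example threshold from the --filter help text
# keep rows where any allele column exceeds the threshold
binders = scores[(scores > threshold).any(axis=1)]
print(binders)  # only PEPTIDETWO survives (0.60 > 0.4256)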
Example #5
def make_predictions_from_variants(variants_all, methods, alleles, minlength,
                                   maxlength, martsadapter, protein_db,
                                   identifier, metadata, transcriptProteinMap):
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    prots = [
        p for p in generator.generate_proteins_from_transcripts(
            generator.generate_transcripts_from_variants(
                variants_all, martsadapter, ID_SYSTEM_USED))
    ]

    for peplen in range(minlength, maxlength):
        peptide_gen = generator.generate_peptides_from_proteins(prots, peplen)

        peptides_var = [x for x in peptide_gen]

        # remove peptides which are not 'variant relevant'
        peptides = [
            x for x in peptides_var if any(
                x.get_variants_by_protein(y) for y in x.proteins.keys())
        ]

        # filter out self peptides
        selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides = all_peptides + peptides
        all_peptides_filtered = all_peptides_filtered + filtered_peptides

        results = []

        if len(filtered_peptides) > 0:
            for m in methods:
                try:
                    results.extend([
                        EpitopePredictorFactory(
                            m.split('-')[0],
                            version=m.split('-')[1]).predict(filtered_peptides,
                                                             alleles=alleles)
                    ])
                except Exception:
                    logging.warning(
                        "Prediction for length {length} and allele {allele} not possible with {method}."
                        .format(length=peplen,
                                allele=','.join([str(a) for a in alleles]),
                                method=m))

        if (len(results) == 0):
            continue

        df = results[0].merge_results(results[1:])

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' %
                              (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' %
                                (conv_allele, peplen)] = get_matrix_max_score(
                                    conv_allele, peplen)

        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(
            create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(
            create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(
                    idx + 1, '%s affinity' % c,
                    df.apply(lambda x: create_affinity_values(
                        str(c), int(x['length']), float(x[c]), x['Method'],
                        max_values_matrices, allele_string_map),
                             axis=1))
                df.insert(
                    idx + 2, '%s binder' % c,
                    df.apply(lambda x: create_binder_values(
                        float(x['%s affinity' % c]), x['Method']),
                             axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df['%s score' %
                                        c].map(lambda x: round(x, 4))

        for c in metadata:
            df[c] = df.apply(lambda row: create_metadata_column_value(row, c),
                             axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {
        'prediction_methods': methods,
        'number_of_variants': len(variants_all),
        'number_of_peptides': len(all_peptides),
        'number_of_peptides_after_filtering': len(all_peptides_filtered)
    }

    return pred_dataframes, statistics, all_peptides_filtered
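make_predictions_from_variants only defines the per-length prediction loop; a hypothetical call site might look as follows. The names passed in (variants, alleles, protein_db, sample_id, transcript_protein_map) are assumed to be prepared by the surrounding workflow, and the method/version string is illustrative:

# hypothetical call site; the inputs are assumed to be set up elsewhere in the pipeline
pred_dfs, stats, neo_peptides = make_predictions_from_variants(
    variants_all=variants,
    methods=['syfpeithi-1.0'],           # "<method>-<version>" strings, split on '-' above
    alleles=alleles,
    minlength=8,
    maxlength=12,                        # note: range(minlength, maxlength) excludes maxlength itself
    martsadapter=MartsAdapter(biomart="http://grch37.ensembl.org"),
    protein_db=protein_db,               # object with an exists(sequence) method, per the self-peptide filter
    identifier=sample_id,
    metadata=[],                         # no extra metadata columns requested
    transcriptProteinMap=transcript_protein_map)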
Example #6
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V',
                        '--variations',
                        dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o',
                        "--outfile",
                        dest="outfile_path",
                        help="Created fasta file",
                        required=True)
    parser.add_argument(
        '-d',
        "--digest",
        dest="digest",
        type=int,
        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a',
                        "--alleles",
                        dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument(
        '-p',
        "--predict",
        dest="predict_with",
        help="Method of prediction, needs alleles & length, allowed:[{m}]".
        format(m=PRED_METH))
    parser.add_argument(
        '-f',
        "--filter",
        dest="filter",
        type=float,
        help=
        "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict"
    )
    parser.add_argument('-P',
                        "--Proteins",
                        dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument(
        '-b',
        "--base",
        dest="basefasta_path",
        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0] +
        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at Theo's filter, ligandoqc, fasta-distributions, lica and the morgenstellen server content scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    # MyObject = type('MyObject', (object,), {})
    # options = MyObject()
    # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
    #
    # vt = os.path.splitext(options.var_file)[-1]
    # if ".vcf" == vt:
    #     vcfvars, accessions = FileReader.read_vcf(options.var_file)
    #
    # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
    #
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # transcripts = [x for x in transcript_gen if x.vars]
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    # proteins = [x for x in protein_gen if x.vars]
    # for p in proteins:
    #     p.gene_id = p.vars.values()[0][0].gene
    #
    #
    # for t in transcripts:
    #     t.gene_id = t.vars.values()[0].gene
    #

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org"
                           )  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values(
        )[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins
                if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [
        x for x in peptides if any(
            x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(
            e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
Example #7
    def test_peptides_from_variants(self):
        """
        Create multiple peptides, given a set
        containing heterozygous variants.

        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTGGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)


        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK

        Resulting peptides of length 3:
        KFG +
        KNL +
        NLG +
        KFP +
        FPP +
        PPG +
        KNF +
        NFP +
        FPR +
        PRG +

        GFK +
        GGL +
        GLK +
        GFP +
        FPP +
        PPK +
        GGF +
        GFP +
        FPQ +
        PQK +
        """
        dummy_db = DummyAdapter()

        exp_peps = set([
            'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
            'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP'
        ])
        # 1) INS, SNP, DEL
        dummy_vars = [var_10, var_11, var_12]
        peps = set(
            map(
                lambda x: str(x),
                Generator.generate_peptides_from_variants(
                    dummy_vars, 3, dummy_db, EIdentifierTypes.REFSEQ)))

        peps_from_prot = set(
            map(
                str,
                Generator.generate_peptides_from_proteins(
                    Generator.generate_proteins_from_transcripts(
                        Generator.generate_transcripts_from_variants(
                            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)),
                    3)))

        self.assertTrue(len(peps - peps_from_prot) == 0)
        self.assertTrue(len(peps_from_prot - peps) == 0)
        self.assertTrue(len(peps - exp_peps) == 0)
        self.assertTrue(len(exp_peps - peps) == 0)
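The expected set exp_peps can be reproduced directly from the protein sequences listed in the docstring; a minimal sketch (sequence list copied from the docstring above):

        # all distinct 3-mers of the eight protein sequences from the docstring
        proteins = ['KFG', 'KNLG', 'KFPPG', 'KNFPRG', 'GFK', 'GGLK', 'GFPPK', 'GGFPQK']
        kmers = {seq[i:i + 3] for seq in proteins for i in range(len(seq) - 2)}
        # kmers contains the same 18 peptides as exp_peps in the test above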
Example #8
    def test_peptides_from_variants(self):
        """
        Create multiple peptides, given a set
        containing heterozygous variants.

        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTGGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)


        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK

        Resulting peptides of length 3:
        KFG +
        KNL +
        NLG +
        KFP +
        FPP +
        PPG +
        KNF +
        NFP +
        FPR +
        PRG +

        GFK +
        GGL +
        GLK +
        GFP +
        FPP +
        PPK +
        GGF +
        GFP +
        FPQ +
        PQK +
        """
        dummy_db = DummyAdapter()

        exp_peps = set(['PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL', 'KFG', 'GGF', 'FPQ',
                        'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP'])
        # 1) INS, SNP, DEL
        dummy_vars = [var_10, var_11, var_12]
        peps = set(map(lambda x: str(x), Generator.generate_peptides_from_variants(dummy_vars, 3, dummy_db, EIdentifierTypes.REFSEQ)))

        peps_from_prot = set(map(str, Generator.generate_peptides_from_proteins(Generator.generate_proteins_from_transcripts(
                           Generator.generate_transcripts_from_variants(dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)),
            3)))

        self.assertTrue(len(peps - peps_from_prot) == 0)
        self.assertTrue(len(peps_from_prot - peps) == 0)
        self.assertTrue(len(peps-exp_peps) == 0)
        self.assertTrue(len(exp_peps-peps) == 0)