Example #1
    def generate_features(seq):
        """
        Expects a list of sequences (a list of one for single-sequence input).
        Returns a pandas DataFrame containing 20 unscaled features: 10 from
        modlamp, 10 from custom feature generation.
        """
        import pandas as pd
        from modlamp.descriptors import GlobalDescriptor
        # generate_custom_features is defined elsewhere in this project
        custom_features = pd.Series(seq).apply(generate_custom_features)
        gdesc = GlobalDescriptor(seq)
        gdesc.calculate_all()
        modlamp_features = pd.DataFrame(gdesc.descriptor)
        modlamp_features.columns = gdesc.featurenames
        out = pd.concat([modlamp_features, custom_features], axis=1)
        return out
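A minimal usage sketch, assuming `generate_custom_features` (not shown here) returns the ten custom features for one sequence:

seqs = ["GLFDIVKKVVGALGSL", "KWKLFKKIEKVGQNIR"]  # hypothetical peptides
features = generate_features(seqs)
print(features.shape)  # (2, 20): 10 modlamp columns + 10 custom columns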
Example #2
def calculate_peptide_props(fasta_dict):
    '''
    Given a sequence dictionary (made with get_sequence_dict), returns a
    list of dictionaries. Each dictionary maps the name of a chemical
    property to the calculated value for that property. Designed to be
    written to a CSV file using csv.DictWriter.
    '''
    property_list = []
    for header in fasta_dict:
        s = str(fasta_dict[header].seq)
        t = GlobalDescriptor([s])
        t.calculate_all()
        d = dict(zip(t.featurenames, t.descriptor[0]))
        d['Peptide_name'] = header
        property_list.append(d)
    return property_list
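A sketch of the surrounding workflow, assuming get_sequence_dict wraps Biopython's SeqIO (the function above only needs a mapping from header to a record with a .seq attribute):

import csv
from Bio import SeqIO

fasta_dict = SeqIO.to_dict(SeqIO.parse("peptides.fasta", "fasta"))  # hypothetical input file
rows = calculate_peptide_props(fasta_dict)
with open("peptide_props.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    writer.writeheader()
    writer.writerows(rows)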
Example #3
def propi(seq):
    # seq: list of peptide sequence strings. The descriptor modules are
    # assumed to be imported at module level, e.g.:
    #   import pandas as pd
    #   from modlamp.descriptors import GlobalDescriptor
    #   from propy import AAComposition as AC, Autocorrelation as auto
    #   from propy import CTD, QuasiSequenceOrder as qua, PseudoAAC as PAAC

    # Physicochemical descriptors (modlamp)
    des_fis = GlobalDescriptor(seq)
    des_fis.calculate_all()
    prop_fis = des_fis.descriptor

    # Amino acid composition (list() so the map results survive Python 3)
    amino_comp = list(map(AC.CalculateAAComposition, seq))  # AA
    dipep_comp = list(map(AC.CalculateDipeptideComposition, seq))  # dipeptides

    # Autocorrelation
    moreau_auto = list(map(auto.CalculateNormalizedMoreauBrotoAutoTotal,
                           seq))  # Moreau
    moran_auto = list(map(auto.CalculateMoranAutoTotal, seq))  # Moran
    geary_auto = list(map(auto.CalculateGearyAutoTotal, seq))  # Geary

    # Composition - Transition - Distribution
    ctd = list(map(CTD.CalculateCTD, seq))

    # Quasi-sequence order
    sqa = list(
        map(lambda p: qua.GetQuasiSequenceOrder(p, maxlag=5, weight=0.1), seq))
    secq = list(
        map(lambda p: qua.GetSequenceOrderCouplingNumber(p, d=1), seq))

    amino_comp = pd.DataFrame.from_dict(amino_comp)
    amino_comp.reset_index(drop=True, inplace=True)
    dipep_comp = pd.DataFrame.from_dict(dipep_comp)
    dipep_comp.reset_index(drop=True, inplace=True)

    moreau_auto = pd.DataFrame.from_dict(moreau_auto)
    moreau_auto.reset_index(drop=True, inplace=True)
    moran_auto = pd.DataFrame.from_dict(moran_auto)
    moran_auto.reset_index(drop=True, inplace=True)
    geary_auto = pd.DataFrame.from_dict(geary_auto)
    geary_auto.reset_index(drop=True, inplace=True)

    ctd = pd.DataFrame.from_dict(ctd)
    ctd.reset_index(drop=True, inplace=True)

    # PseudoAAC - Type I
    Hydrophobicity = PAAC._Hydrophobicity
    hydrophilicity = PAAC._hydrophilicity
    residuemass = PAAC._residuemass
    pK1 = PAAC._pK1
    pK2 = PAAC._pK2
    pI = PAAC._pI
    clasI_pse = list(
        map(
            lambda p: PAAC.GetPseudoAAC(
                p,
                lamda=3,
                weight=0.7,
                AAP=[Hydrophobicity, hydrophilicity, residuemass, pK1, pK2, pI]),
            seq))
    clasI_pse = pd.DataFrame.from_dict(clasI_pse)
    clasI_pse.reset_index(drop=True, inplace=True)

    sqa = pd.DataFrame.from_dict(sqa)
    sqa.reset_index(drop=True, inplace=True)
    secq = pd.DataFrame.from_dict(secq)
    secq.reset_index(drop=True, inplace=True)

    prop_fis = pd.DataFrame(prop_fis)
    # Spanish labels for the ten modlamp global descriptors
    # (Length, MW, Charge, ChargeDensity, pI, InstabilityInd, Aromaticity,
    #  AliphaticInd, BomanInd, HydRatio)
    prop_fis.columns = [
        'Longitud', 'MW', 'Carga', 'DensCarga', 'pIso', 'InestInd', 'Aroma',
        'Alifa', 'Boman', 'HidroRa'
    ]

    var = pd.concat([
        amino_comp, dipep_comp, moreau_auto, moran_auto, ctd, clasI_pse, sqa,
        secq, geary_auto, prop_fis
    ],
                    axis=1)
    return var
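A hedged call sketch (propy's descriptor functions expect plain uppercase one-letter sequences):

seqs = ["GLFDIVKKVVGALGSL", "KWKLFKKIEKVGQNIR"]  # hypothetical peptides
features = propi(seqs)
print(features.shape)  # one row per peptide, one column per descriptor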
Example #4
newFeatures = [
    'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity',
    'AliphaticInd', 'BomanInd', 'HydRatio'
]

# write the feature names into the header row of the Excel sheet
for i in range(cols + len(aminoAcid) + 1,
               cols + len(aminoAcid) + len(newFeatures) + 1):
    writingSheet.cell(
        row=1, column=i).value = newFeatures[i - (cols + len(aminoAcid) + 1)]

for i in range(2, rows + 1):  # fill in the feature values row by row
    pepSequence = readingSheet.cell(row=i, column=cols).value
    desc = GlobalDescriptor(pepSequence)
    desc.calculate_all(amide=True)
    # map values to feature names so the columns stay aligned with the header
    # (calculate_all also returns Length and Charge, which are skipped here)
    values = dict(zip(desc.featurenames, desc.descriptor[0]))
    for j, feature in enumerate(newFeatures):
        col = cols + len(aminoAcid) + 1 + j
        writingSheet.cell(row=i, column=col).value = float(values[feature])

writingBook.save(str(outputFile))  # save all data to the output file

################################ TESTING DATA ################################

trainingData = pd.read_csv(r"test.csv")  # reading the CSV test data
trainingData.to_excel(r"test.xlsx", index=None,
                      header=True)  # converting CSV to Excel
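The fragment relies on names set up earlier in the original script. A plausible openpyxl setup, with every value here an assumption:

import pandas as pd
from openpyxl import load_workbook
from modlamp.descriptors import GlobalDescriptor

outputFile = "train_features.xlsx"                 # hypothetical output path
readingSheet = load_workbook("train.xlsx").active  # Excel copy of the training CSV
writingBook = load_workbook(outputFile)
writingSheet = writingBook.active
rows = readingSheet.max_row                        # last data row
cols = readingSheet.max_column                     # column holding the peptide sequence
aminoAcid = list("ACDEFGHIKLMNPQRSTVWY")           # 20 standard residues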
Example #5
    def analyze_generated(self, num, fname='analysis.txt', plot=False):
        """ Method to analyze the generated sequences located in `self.generated`.

        :param num: {int} wanted number of sequences to sample
        :param fname: {str} filename to save analysis info to
        :param plot: {bool} whether to plot an overview of descriptors
        :return: file with analysis info (distances)
        """
        with open(fname, 'w') as f:
            print("Analyzing...")
            f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
            f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
            count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
            f.write("%.1f percent of generated sequences are present in the training data.\n" %
                    ((count / len(self.generated)) * 100))
            d = GlobalDescriptor(self.generated)
            len1 = len(d.sequences)
            d.filter_aa('B')
            len2 = len(d.sequences)
            d.length()
            f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
            f.write("Number of sequences too short:\t%i\n" % (num - len1))
            f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
            f.write("Number of valid unique seqs:\t%i\n" % len2)
            f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
            f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
            f.write("Minimal sequence length:  \t\t%i\n" % np.min(d.descriptor))
            f.write("Maximal sequence length:  \t\t%i\n" % np.max(d.descriptor))
            
            descriptor = 'pepcats'
            seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
            seq_desc.calculate_autocorr(7)
            gen_desc = PeptideDescriptor(d.sequences, descriptor)
            gen_desc.calculate_autocorr(7)
            
            # random comparison set
            self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
            probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
            self.ran.generate_sequences(proba=probas)
            ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
            ran_desc.calculate_autocorr(7)
            
            # amphipathic helices comparison set
            self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
            self.hel.generate_sequences()
            hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
            hel_desc.calculate_autocorr(7)
            
            # distance calculation
            f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
            desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of randomly sampled seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # more simple descriptors
            g_seq = GlobalDescriptor(seq_desc.sequences)
            g_gen = GlobalDescriptor(gen_desc.sequences)
            g_ran = GlobalDescriptor(ran_desc.sequences)
            g_hel = GlobalDescriptor(hel_desc.sequences)
            g_seq.calculate_all()
            g_gen.calculate_all()
            g_ran.calculate_all()
            g_hel.calculate_all()
            sclr = StandardScaler()
            sclr.fit(g_seq.descriptor)
            f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
            desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                       metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance of randomly sampled seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance of amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # hydrophobic moments
            uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
            uh_seq.calculate_moment()
            uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
            uh_gen.calculate_moment()
            uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
            uh_ran.calculate_moment()
            uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
            uh_hel.calculate_moment()
            f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
            f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
            f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
            f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
            f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))
        
        if plot:
            if self.refs:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            a.plot_summary(filename=fname[:-4] + '.png')
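A hedged call sketch, assuming `model` is an instance of the surrounding generator class with `model.sequences` (training set) and `model.generated` (samples) already populated:

model.analyze_generated(num=1000, fname='analysis.txt', plot=True)
# writes distance statistics to analysis.txt and, with plot=True,
# a descriptor overview to analysis.png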
Example #6
def describe_sequences():
    path = r"C:\Users\Patrick\OneDrive - University College Dublin\Bioinformatics\HemolyticStudies\BOTH_peptides.json"

    aa_letters = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    di_letters = ["%s%s" % (a, b) for a in aa_letters for b in aa_letters]
    tri_letters = [
        "%s%s%s" % (a, b, c) for a in aa_letters for b in aa_letters
        for c in aa_letters
    ]
    conjoint_letters = ["A", "I", "Y", "H", "R", "D", "C"]
    letters = {
        1: aa_letters,
        2: di_letters,
        3: tri_letters,
        4: conjoint_letters
    }

    #Conjoint src = https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0828-1

    conjoint_dict = {
        "A": "A",
        "G": "A",
        "V": "A",
        "I": "I",
        "L": "I",
        "F": "I",
        "P": "I",
        "Y": "Y",
        "M": "Y",
        "T": "Y",
        "S": "Y",
        "H": "H",
        "N": "H",
        "Q": "H",
        "W": "H",
        "R": "R",
        "K": "R",
        "D": "D",
        "E": "D",
        "C": "C",
    }

    def counter(string, seq_type):
        '''
        Counts the letters (or di-/tri-letter words) present in a string.

        Returns a dict mapping each letter to its relative frequency.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type in (1, 4):  # conjoint letters are single characters too
            for s in string:
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / l for k in d}
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / (l - 1) for k in d}
        if seq_type == 3:
            for a in range(l - 2):
                s = string[a:a + seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / (l - 2) for k in d}
        return d

    def counter_boolean(string, seq_type):
        '''
        Records which letters (or two-letter words) are present in a string.

        Returns a dict mapping each letter to 1.0 if present, else 0.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] = 1.0
                except KeyError:
                    d[s] = 1.0
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] = 1.0
                except KeyError:
                    d[s] = 1.0
        return d

    def counter_abs(string, seq_type):
        '''
        Counts the letters (or two-letter words) present in a string.

        Returns a dict mapping each letter to its absolute count.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] = d[s] + 1.0
                except KeyError:
                    d[s] = 1.0
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] = d[s] + 1.0
                except KeyError:
                    d[s] = 1.0
        return d

    def residue_distribution(all_residues, seq_type, dp):
        '''
        Takes a string of letters and the type of sequence represented
        (1 = residues, 2 = dipeptides, 3 = tripeptides, 4 = conjoint letters).
        Returns a 1 x n numpy array of relative frequencies, alphabetically
        ordered. Dipeptides and tripeptides are filtered by their corpus
        counts (dp and tp).
        '''
        d = counter(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        elif seq_type == 3:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if tp[i] >= 20]))
        elif seq_type == 4:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]]))

        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    def residue_boolean(all_residues, seq_type, dp):
        '''
        Takes a string of letters and the type of sequence represented.
        Returns a 1 x n numpy array of presence/absence indicators (1.0/0.0),
        alphabetically ordered.
        '''
        d = counter_boolean(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    def residue_abs(all_residues, seq_type, dp):
        '''
        Takes a string of letters and the type of sequence represented.
        Returns a 1 x n numpy array of absolute counts, alphabetically ordered.
        '''
        d = counter_abs(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    with open(path, "r") as f:
        text = f.read()

    peptides = eval(text)["Peptides"]

    train_peptides, test_peptides = train_test_split(peptides,
                                                     test_size=0.15,
                                                     random_state=42)

    train_peptides_seqs = [peptide["seq"] for peptide in train_peptides]

    for peptide in peptides:
        if peptide["seq"] in train_peptides_seqs:
            peptide["train"] = True
        else:
            peptide["train"] = False

    print(len([p for p in peptides if p["train"] == True]))
    print(len([p for p in peptides if p["train"] == False]))

    new_peptides = []
    for peptide in peptides:
        if peptide["train"] == True:
            new_peptide = peptide.copy()
            new_seq = ''.join(reversed(peptide["seq"]))
            new_peptide["seq"] = new_seq
            new_peptides.append(new_peptide)

    #peptides.extend(new_peptides)  # uncomment to add the reversed training sequences
    random.shuffle(peptides)

    print(len([p for p in peptides if p["train"] == True]))
    print(len([p for p in peptides if p["train"] == False]))
    print("doubling complete")

    dp = {i: 0 for i in letters[2]}
    tp = {i: 0 for i in letters[3]}

    name_i = 0

    for peptide in peptides:
        temp_set = set()
        seq = peptide["seq"]
        l = len(seq)
        for a in range(l - 1):
            s = seq[a:a + 2]
            temp_set.add(s)
        for s in temp_set:
            dp[s] = dp[s] + 1

    for peptide in peptides:
        temp_set = set()
        seq = peptide["seq"]
        l = len(seq)
        for a in range(l - 2):
            s = seq[a:a + 3]
            temp_set.add(s)
        for s in temp_set:
            tp[s] = tp[s] + 1

    for peptide in peptides:
        peptide["conjoint_seq"] = "".join(
            [conjoint_dict[letter] for letter in peptide["seq"]])

    for peptide in peptides:

        globdesc = GlobalDescriptor(peptide["seq"])
        globdesc.calculate_all(amide=peptide["cTer"] == "Amidation")

        ctdc = CTD.CalculateC(peptide["seq"])
        ctdc_keys = list(sorted(list([key for key in ctdc])))
        ctdc_vals = np.array([[ctdc[key] for key in ctdc_keys]])

        conjointtriad = ConjointTriad.CalculateConjointTriad(peptide["seq"])
        conjointtriad_keys = list(sorted(list([key for key in conjointtriad])))
        conjointtriad_vals = np.array(
            [[conjointtriad[key] for key in conjointtriad_keys]])

        conjoint_dis = residue_distribution(peptide["conjoint_seq"], 4, None)

        #peptide["GlobalDescriptor"] = globdesc

        #print(peptide["GlobalDescriptor"].descriptor)

        #Eisenberg hydrophobicity consensus
        #Take most of the values from here

        pepdesc = PeptideDescriptor(peptide["seq"], "eisenberg")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        #pepdesc.calculate_profile(append=True, prof_type = "uH")

        pepdesc.load_scale("Ez")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("aasi")
        pepdesc.calculate_global(append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("abhprk")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("charge_acid")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("cougar")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("gravy")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("hopp-woods")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("kytedoolittle")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("ppcali")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("msw")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("charge_phys")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("flexibility")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("bulkiness")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("TM_tend")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("mss")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("t_scale")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("peparc")
        pepdesc.calculate_arc(modality="max", append=True)
        pepdesc.calculate_arc(modality="mean", append=True)

        pepdesc.load_scale("msw")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("polarity")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("pepcats")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("isaeci")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("refractivity")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("z3")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("z5")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        #pepdesc.load_scale("PPCALI")
        #pepdesc.calculate_autocorr(2)
        #peptide["PeptideDescriptor"] = pepdesc

        protein = PyPro()
        protein.ReadProteinSequence(peptide["seq"])
        paac = protein.GetPAAC(lamda=1, weight=0.05)
        paac2 = [[
            paac[a] for a in list(
                sorted([k for k in paac],
                       key=lambda x: int(x.replace("PAAC", ""))))
        ]]

        cTer = np.array([[1 if peptide["cTer"] == "Amidation" else 0]])

        paac = np.array(paac2)

        analysed_seq = ProteinAnalysis(peptide["seq"])
        secondary_structure_fraction = np.array(
            [analysed_seq.secondary_structure_fraction()])

        peptide["TotalDescriptor"] = str(
            np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1))

        try:
            pepid = np.array([[
                int(peptide["id"].replace("HEMOLYTIK", "").replace(
                    "DRAMP", "").replace("DBAASP", ""))
            ]])
        except KeyError:
            pepid = np.array([[0]])  # keep the 2-D shape so np.concatenate works

        pep_train = np.array([[1 if peptide["train"] == True else 0]])

        freq_1d = residue_distribution(peptide["seq"], 1, dp)
        freq_2d = residue_distribution(peptide["seq"], 2, dp)
        freq_3d = residue_distribution(peptide["seq"], 3, dp)
        freq_1dbool = residue_boolean(peptide["seq"], 1, dp)
        freq_2dbool = residue_boolean(peptide["seq"], 2, dp)
        freq_1dabs = residue_abs(peptide["seq"], 1, dp)
        freq_2dabs = residue_abs(peptide["seq"], 2, dp)

        len_peptide = np.array([[len(peptide["seq"])]])

        if peptide["activity"] == "YES":
            pepact = 1
        else:
            pepact = 0
        pepact = np.array([[pepact]])

        peptide_di2 = di2(peptide["seq"])
        peptide_di3 = di3(peptide["conjoint_seq"])

        ####################### AAindex #########################
        to_get = [
            ("CHAM810101", "mean"),   # steric hindrance
            ("CHAM810101", "total"),  # steric hindrance
            ("KYTJ820101", "mean"),   # hydropathy
            ("KLEP840101", "total"),  # charge
            ("KLEP840101", "mean"),   # charge
            ("MITS020101", "mean"),   # amphiphilicity
            ("FAUJ830101", "mean"),   # hydrophobic parameter pi
            ("GOLD730102", "total"),  # residue volume
            ("MEEJ800101", "mean"),   # retention coefficient in HPLC
            ("OOBM850105", "mean"),   # optimized side chain interaction parameter
            ("OOBM850105", "total"),  # optimized side chain interaction parameter
            ("VELV850101", "total"),  # electron-ion interaction parameter
            ("VELV850101", "mean"),   # electron-ion interaction parameter
            ("PUNT030102", "mean"),   # knowledge-based membrane-propensity scale from 3D_Helix
            ("BHAR880101", "mean"),   # average flexibility indices
            ("KRIW790102", "mean"),   # fraction of site occupied by water
            ("PLIV810101", "mean"),   # partition coefficient
            ("ZIMJ680102", "mean"),   # bulkiness
            ("ZIMJ680102", "total"),  # bulkiness
            ("ZHOH040101", "mean"),   # stability scale
            ("CHAM820102", "total"),  # free energy of solubility in water
            # From HemoPI: https://github.com/riteshcanfly/Hemopi/blob/master/pcCalculator.java
            ("HOPT810101", "mean"),   # hydrophilicity
            ("EISD840101", "mean"),   # hydrophobicity
            ("FAUJ880109", "total"),  # net hydrogen
            ("EISD860101", "mean"),   # solvation
        ]

        tetra_peptides = [
            "KLLL",  # src = https://github.com/riteshcanfly/Hemopi/blob/master/tetrapos.txt
            "GCSC",
            "AAAK",
            "KLLS",
            "LGKL",
            "VLKA",
            "LLGK",
            "LVGA",
            "LSDF",
            "SDFK",
            "SWLR",
            "WLRD",
        ]

        tp_bin = []
        for t_p in tetra_peptides:
            if t_p in peptide["seq"]:
                tp_bin.append(1)
            else:
                tp_bin.append(0)
        tp_bin = np.array([tp_bin])

        # aaf(seq, identifier, mode) looks up an AAindex-based value; it is
        # defined elsewhere in the original script
        aminoacidindeces = np.array([[
            aaf(peptide["seq"], identifier, mode)
            for identifier, mode in to_get
        ]])

        peptide["array"] = np.concatenate(
            (
                pepid,
                pep_train,
                pepdesc.descriptor,
                globdesc.descriptor,
                len_peptide,
                cTer,
                secondary_structure_fraction,
                aminoacidindeces,
                ctdc_vals,
                conjointtriad_vals,
                tp_bin,
                freq_1d,
                freq_2d,
                freq_3d,
                freq_1dbool,
                freq_2dbool,
                freq_1dabs,
                freq_2dabs,
                peptide_di2,
                peptide_di3,  #Conjoint Alphabet
                paac,
                pepact,
            ),
            axis=1)
        #print(peptide["TotalDescriptor"])

    x = np.concatenate([peptide["array"] for peptide in peptides], axis=0)

    np.save("peptides_array", x, allow_pickle=False)
Example #7
File: protein.py Project: jancr/ppv
    def _add_features_to_peptide_series(self,
                                        peptide,
                                        index,
                                        n_cluster=-1,
                                        lpvs=None):
        # primary intensity weights d = delta, pd = penalty delta
        # TODO only d_start and d_stop depend on impval; pd_start and pd_stop do not because
        # they are always between a d_start and a d_stop, and should thus be above imp_val!
        # therefore we can write out d_start and d_stop as:
        #   [before_start, after_start], [before_stop, after_stop]
        # thus if we have
        #       raw data     = [0, 0, 5, 5, 7, 7, 5, 5, 0, 0]
        # then for the peptide        3--------------8
        #       before_start, after_start = [ 0, 5 ]
        # but for the peptide               5--6
        #       before_start, after_start = [ 5, 7 ]
        # by making a non-linear model we could formulate the w_start parameter as follows:
        # w_start * (after_start - max(before_start, imp_val))
        # w_start * (after_start - max(before_start, imp_val))
        # which is consistent with how we currently do the grid search (imp_val=4):
        #       d_start = 5 - max(0, 4) = 1
        #       d_start = 7 - max(5, 4) = 2
        if lpvs is None:
            lpvs = set()
        i_start = peptide.start.index
        i_stop = peptide.stop.index

        # MS Delta
        series = pd.Series(np.zeros(len(index)) * np.nan, index=index)
        ms_int = self.ms_intensity_features.type
        series[ms_int, 'start'] = self.start_scores[i_start]
        series[ms_int, 'stop'] = self.stop_scores[i_stop]

        if 4 < len(peptide):
            penalty = SequenceRange(peptide.start + 1,
                                    peptide.stop - 1,
                                    validate=False)
            series[ms_int,
                   'penalty_start'] = self.start_scores[penalty.slice].sum()
            series[ms_int,
                   'penalty_stop'] = self.stop_scores[penalty.slice].sum()
        else:
            series[ms_int, 'penalty_start'] = series[ms_int,
                                                     'penalty_stop'] = 0

        # MS Bool
        b_obs, f_obs = self._calc_observed(peptide)
        series[self.ms_bool_features.type, "first"] = self.h_first[i_start]
        series[self.ms_bool_features.type, "last"] = self.h_last[i_stop]
        series[self.ms_bool_features.type, "observed"] = b_obs

        # MS Frequency
        # ptm weights
        # TODO: should it get extra penalties if there are PTM's between start and end?
        ms_freq = self.ms_frequency_features.type
        series[ms_freq, 'acetylation'] = self.ac_freq[i_start]
        series[ms_freq, 'amidation'] = self.am_freq[i_stop]

        series[ms_freq, 'start'] = self.h_start_freq[i_start]
        series[ms_freq, 'stop'] = self.h_stop_freq[i_stop]
        series[ms_freq, 'observed'] = f_obs
        series[ms_freq, 'sample'] = self.h_sample[peptide.slice].min()
        series[ms_freq, 'ladder'] = \
            self.h_ladder_start[i_start] * self.h_ladder_stop[i_stop]
        series[ms_freq, 'protein_coverage'] = self.protein_coverage
        series[ms_freq, 'cluster_coverage'] = self.cluster_coverage[n_cluster]

        # these are good features, but there may be better ways to extract them
        series[ms_freq,
               'bond'] = self.h_bond[self.get_bond_slice(peptide)].min()

        # MS Counts
        ms_count = self.ms_count_features.type
        series[ms_count, 'start'] = self.start_counts[peptide.start]
        series[ms_count, 'stop'] = self.stop_counts[peptide.stop]
        #  series[ms_count, 'ladder'] = \
        #      self.h_ladder_start[i_start] + self.h_ladder_stop[i_stop]

        ############################################################

        # Chemical
        sequence = self.protein_sequence[peptide.slice]
        peptide_features = GlobalDescriptor(sequence)

        is_amidated = series[ms_freq, 'amidation'] > 0.05
        peptide_features.calculate_all(amide=is_amidated)

        chem = self.chemical_features.type
        for i, name in enumerate(peptide_features.featurenames):
            if name in self.chemical_features.features:
                series[chem, name] = peptide_features.descriptor[0, i]

        # hydrophobic moment, computed once outside the feature-name loop
        eisenberg = PeptideDescriptor(sequence, 'eisenberg')
        eisenberg.calculate_moment()
        series[chem, 'eisenberg'] = eisenberg.descriptor.flatten()[0]

        # Annotations
        series[self.annotations.type, "Known"] = peptide in self.known_peptides
        #  series[self.annotations.type, "Type"] = peptide in self.known_peptides
        series[self.annotations.type, "Cluster"] = n_cluster
        series[self.annotations.type, "Sequence"] = peptide.seq
        series[self.annotations.type, "LPV"] = peptide in lpvs

        series[self.annotations.type, "N Flanking"] = \
            self.get_nflanking_region(peptide.start, self.protein_sequence)
        series[self.annotations.type, "C Flanking"] = \
            self.get_cflanking_region(peptide.stop, self.protein_sequence)
        if f_obs != 0:
            _pep_index = (slice(None), slice(None), peptide.start.pos,
                          peptide.stop.pos)
            series[self.annotations.type,
                   "Intensity"] = self.df.loc[_pep_index, :].sum().sum()
        return series
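The chemical block above is plain modlamp applied to one subsequence; a standalone sketch of that pattern (the sequence value is hypothetical):

from modlamp.descriptors import GlobalDescriptor, PeptideDescriptor

sequence = "GLFDIVKKVVGALGSL"        # hypothetical peptide subsequence
feats = GlobalDescriptor(sequence)
feats.calculate_all(amide=True)      # amide mirrors the amidation-frequency check above
chem = dict(zip(feats.featurenames, feats.descriptor[0]))

eisenberg = PeptideDescriptor(sequence, 'eisenberg')
eisenberg.calculate_moment()
chem['eisenberg'] = eisenberg.descriptor.flatten()[0]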
Example #8
def describe_sequences():
    aa_letters = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    di_letters = ["%s%s" % (a, b) for a in aa_letters for b in aa_letters]
    letters = {1 : aa_letters, 2 : di_letters}
    
    def counter(string, seq_type):
        '''
        Counts the letters (or two-letter words) present in a string.

        Returns a dict mapping each letter to its relative frequency.
        '''
        l = len(string)
        d = {i : 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k : d[k]/l for k in d}
        if seq_type == 2:        
            for a in range(l-1):
                s = string[a:a+seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k : d[k]/(l-1) for k in d}
        return d
        
    def residue_distribution(all_residues, seq_type):
        '''
        Takes a string of letters and the type of sequence represented.
        Returns a 1 x n numpy array of relative frequencies, alphabetically ordered.
        '''
        d = counter(all_residues, seq_type)
        residue_counts = list(sorted([(i, d[i]) for i in letters[seq_type] ]))                              ##Removes ambiguous letters
        r_c = [i[1] for i in residue_counts]
        dis = np.array([r_c,])
        return dis
    
    peptides = [{"seq" : "FLPILASLAAKFGPKLFCLVTKKC", "cTer" : None, "activity" : "YES"},
                {"seq" : "ILGPVISTIGGVLGGLLKNL", "cTer" : "Amidation", "activity" : "YES"},
                {"seq": "GIGGKILSGLKTALKGAAKELASTYLH", "cTer" : None, "activity" : "NO"},
                {"seq": "GIGSAILSAGKSALKGLAKGLAEHFAN", "cTer" : None, "activity" : "NO"},
                {"seq": "FLSLIPHAINAVSAIAKHF", "cTer" : "Amidation", "activity" : "NO"},
    ]
    
    
    for peptide in peptides:
        #print(peptide["id"])
        #print(peptide["seq"])
        
        globdesc = GlobalDescriptor(peptide["seq"])
        globdesc.calculate_all(amide = peptide["cTer"] == "Amidation")
        
        #peptide["GlobalDescriptor"] = globdesc
        
        #print(peptide["GlobalDescriptor"].descriptor)
        
        #Eisenberg hydrophobicity consensus
        #Take most of the values from here
        
        pepdesc = PeptideDescriptor(peptide["seq"], "eisenberg")
        pepdesc.calculate_global()
        pepdesc.calculate_moment(append=True)
        #pepdesc.calculate_profile(append=True, prof_type = "uH")
        
        pepdesc.load_scale("Ez")
        pepdesc.calculate_global(append=True)
        
        pepdesc.load_scale("charge_phys")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        
        pepdesc.load_scale("flexibility")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        
        pepdesc.load_scale("polarity")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        
        pepdesc.load_scale("isaeci")
        pepdesc.calculate_global(append=True)
    
        pepdesc.load_scale("refractivity")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        
        pepdesc.load_scale("z5")
        pepdesc.calculate_global(append=True)
        
        #peptide["PeptideDescriptor"] = pepdesc
    
        peptide["TotalDescriptor"] = str(np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1))
        
        try:
            pepid = np.array([[int(peptide["id"].replace("HEMOLYTIK",""))]])
        except KeyError:
            pepid = np.array([[0]])
        
        freq_1d = residue_distribution(peptide["seq"], 1)
        freq_2d = residue_distribution(peptide["seq"], 2)
        
        len_peptide = np.array([[len(peptide["seq"])]])
        
        if peptide["activity"] == "YES":
            pepact = 1
        else:
            pepact = 0
        pepact = np.array([[pepact]])
        
        peptide_di2 = di2(peptide["seq"])
        
        peptide["array"] = np.concatenate((pepid, pepdesc.descriptor, globdesc.descriptor, len_peptide, 
               freq_1d, 
               #freq_2d, 
               #peptide_di2, 
               pepact,), axis=1)
        #print(peptide["TotalDescriptor"])
        
    
    x = np.concatenate([peptide["array"] for peptide in peptides], axis=0)
    print(x)
    
    np.save("hemolytik_array_custom_tests", x, allow_pickle=False)