Exemple #1
0
def generate_plot(key, my_seq):
    """Plot the sliding-window score profile of a protein and mark its peaks.

    The profile comes from ``ProteinAnalysis.protein_scale`` using the
    module-level ``amino_acids`` scale, a 21-residue window and an edge
    weight of 0.75. Points reported by ``peakdetect`` are overlaid on the
    curve and the figure is saved as ``figs/<key>.png``.

    Args:
        key: Label used in the legend and as the output file name.
        my_seq: Protein sequence handed to ``ProteinAnalysis``.
    """
    analysed_seq = ProteinAnalysis(my_seq)

    window_size = 21
    # Floor division keeps the offset an int on Python 3; with "/" it is a
    # float there, and both range() and list indexing raise TypeError.
    half_window = (window_size + 1) // 2

    scale = analysed_seq.protein_scale(param_dict=amino_acids,
                                       window=window_size,
                                       edge=0.75)

    # Plot positions: each score is shifted by half a window.
    x = range(half_window, len(scale) + half_window)

    lookahead = 7
    minp, maxp = peakdetect(scale, lookahead=(lookahead + 1) // 2)

    # Same value as min(x) - 1, but also defined when `scale` is empty.
    start = half_window - 1

    # First group returned by peakdetect, shifted to plot positions.
    xpeaks = [xp[0] + half_window for xp in minp]
    ypeaks = [scale[xpi - half_window] for xpi in xpeaks]

    # Indices (into `scale`) of every score below the 0.9 threshold.
    t_x = np.array(scale)
    added_min = np.where(t_x < 0.9)[0]

    print(added_min)

    # Second group returned by peakdetect, shifted to plot positions.
    xdpeaks = [xdp[0] + half_window for xdp in maxp]
    ydpeaks = [scale[xdpi - half_window] for xdpi in xdpeaks]

    num_pos = np.where(np.array(ydpeaks) < 0.9)[0].size
    print(num_pos)
    if num_pos == 0 and len(added_min) != 0:
        # None of the detected points is below 0.9: add one sub-threshold
        # point by hand.
        added_val = [scale[i] for i in added_min]
        # NOTE(review): the "- start + 2" offset is kept from the original;
        # it shifts the index of the smallest value by -8 and can select a
        # different element (or go out of range) - confirm the intent.
        minimum = added_val.index(min(added_val)) - start + 2
        print(added_min[minimum])
        print(added_val[minimum])
        xdpeaks.append(added_min[minimum])
        ydpeaks.append(added_val[minimum])

    print("maxs:", np.array(xpeaks) + start)
    print("mins:", np.array(xdpeaks) + start)
    plt.clf()
    plt.plot(x, scale, 'b', xpeaks, ypeaks, 'ro', xdpeaks, ydpeaks, 'go')
    plt.grid(True)
    plt.legend(['Scores for ' + key])
    plt.xlabel('Position')
    plt.ylabel('Score')
    plt.savefig('figs/' + key + '.png')
Exemple #2
0
def generate_plot(key, my_seq):
    """Draw the windowed score curve of ``my_seq`` with its detected peaks.

    Scores are computed by ``ProteinAnalysis.protein_scale`` with the
    module-level ``amino_acids`` scale (window 21, edge weight 0.75). The
    curve and the points returned by ``peakdetect`` are plotted and written
    to ``figs/<key>.png``.

    Args:
        key: Name shown in the legend and used for the output file.
        my_seq: Protein sequence handed to ``ProteinAnalysis``.
    """
    analysed_seq = ProteinAnalysis(my_seq)

    window_size = 21
    # "//" instead of "/": on Python 3 true division yields 11.0, which
    # range() and list indexing reject with TypeError.
    offset = (window_size + 1) // 2

    scale = analysed_seq.protein_scale(param_dict=amino_acids, window=window_size, edge=0.75)

    # x coordinate of every score (window offset applied).
    x = range(offset, len(scale) + offset)

    lookahead = 7
    minp, maxp = peakdetect(scale, lookahead=(lookahead + 1) // 2)

    # Equals min(x) - 1 without failing on an empty profile.
    start = offset - 1

    # First peak group from peakdetect, in plot coordinates.
    xpeaks = [xp[0] + offset for xp in minp]
    ypeaks = [scale[xpi - offset] for xpi in xpeaks]

    # Positions in `scale` whose score is under 0.9.
    t_x = np.array(scale)
    added_min = np.where(t_x < 0.9)[0]

    print(added_min)

    # Second peak group from peakdetect, in plot coordinates.
    xdpeaks = [xdp[0] + offset for xdp in maxp]
    ydpeaks = [scale[xdpi - offset] for xdpi in xdpeaks]

    num_pos = np.where(np.array(ydpeaks) < 0.9)[0].size
    print(num_pos)
    if num_pos == 0 and len(added_min) != 0:
        # No detected point lies below 0.9, so one is appended manually.
        added_val = [scale[i] for i in added_min]
        # NOTE(review): "- start + 2" is preserved as written; it moves the
        # index of the lowest value by -8, so the chosen element may not be
        # the minimum - verify what was intended.
        minimum = added_val.index(min(added_val)) - start + 2
        print(added_min[minimum])
        print(added_val[minimum])
        xdpeaks.append(added_min[minimum])
        ydpeaks.append(added_val[minimum])

    print("maxs:", np.array(xpeaks) + start)
    print("mins:", np.array(xdpeaks) + start)
    plt.clf()
    plt.plot(x, scale, 'b', xpeaks, ypeaks, 'ro', xdpeaks, ydpeaks, 'go')
    plt.grid(True)
    plt.legend(['Scores for ' + key])
    plt.xlabel('Position')
    plt.ylabel('Score')
    plt.savefig('figs/' + key + '.png')
Exemple #3
0
def biopython_protein_scale(inseq, scale, custom_scale_dict=None, window=7):
    """Compute a sliding-window property profile of a sequence with Biopython.

    Args:
        inseq: Sequence to analyse; converted to ``str`` via ssbio's
            ``cast_to_str``.
        scale: One of ``'kd_hydrophobicity'``, ``'bulkiness'`` or ``'custom'``.
        custom_scale_dict: Scale dictionary used when ``scale`` is
            ``'custom'``.
        window: Width of the sliding window.

    Returns:
        The ``protein_scale`` scores with ``window // 2`` infinities added at
        each end.

    Raises:
        ValueError: If ``scale`` is not one of the supported names.
    """
    if scale not in ('kd_hydrophobicity', 'bulkiness', 'custom'):
        raise ValueError('Scale not available')

    # Resolve the scale lazily so only the requested table is looked up.
    if scale == 'custom':
        weights = custom_scale_dict
    elif scale == 'bulkiness':
        weights = bulkiness_one
    else:
        weights = kd_hydrophobicity_one

    sequence_text = ssbio.protein.sequence.utils.cast_to_str(inseq)
    profile = ProteinAnalysis(sequence_text).protein_scale(param_dict=weights,
                                                           window=window)

    # The windowed profile is shorter than the sequence; pad both ends with
    # "inf" so the scores line up with residue positions.
    padding = [float("Inf")] * (window // 2)
    return padding + profile + padding
# Collect per-sequence ProtParam features into the parallel result lists.
for seq in sequences:
    X = ProteinAnalysis(str(seq))
    isoelectricPt.append(X.isoelectric_point())
    aromaticity.append(X.aromaticity())
    aminoPercent.append(X.get_amino_acids_percent())
    secstruct.append(X.secondary_structure_fraction())

    # These features throw Key & Value Errors due to non standard amino acids
    # (i.e. out of the 20 standard ones) e.g. X, U etc.
    # Compute every value BEFORE appending anything: if a later call fails
    # after earlier ones succeeded, appending inside the try would add both
    # the real values and the fallback zeros, leaving the lists misaligned.
    try:
        per_sequence = (
            X.gravy(),
            X.molecular_weight(),
            X.instability_index(),
            X.flexibility(),
            X.protein_scale(ProtParamData.kd, 9, 0.4),
            X.protein_scale(ProtParamData.hw, 9, 0.4),
            X.protein_scale(ProtParamData.em, 9, 0.4),
        )
    except (KeyError, ValueError):
        # Placeholder row for sequences that cannot be analysed.
        per_sequence = (0, 0, 0, [0, 0], [0, 0], [0, 0], [0, 0])

    for target, value in zip(
            (gravy, molweight, instidx, flex, hydrophob, hydrophil, surface),
            per_sequence):
        target.append(value)

isoelectricPt_df = pd.DataFrame(isoelectricPt, columns=['isoelectricPt'])
aromaticity_df = pd.DataFrame(aromaticity, columns=['aromaticity'])
aminoPercent_df = pd.DataFrame()
Exemple #5
0
def openfile():
    """Load a PDB file chosen in a file dialog and fill the analysis globals.

    The function takes no arguments and returns nothing; every result is
    published through ``global`` names. In order, it:

    1. asks for a ``.pdb`` file with a Tk file dialog and parses it with
       ``PDBParser`` under the fixed structure id ``"1e6j"``;
    2. extracts the peptide sequence (``my_seq``, ``m``, ``length``) and
       prints whole-sequence ProtParam statistics;
    3. computes flexibility (``c``, ``flex``, ``flexi``) and the ``kd``
       scale profile with window 10 (``b``, ``hydro``, ``hydroph``);
    4. slides a 10-residue window along the sequence and stores the first
       secondary-structure fraction of each window in ``sec``;
    5. runs freesasa on ``"1e6j.pdb"`` for consecutive 10-residue ranges and
       stores the selected areas in ``acc``, ``access`` and their base-10
       logarithms in ``logacc``.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    # Reset the accumulators so repeated calls do not mix results.
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    # NOTE(review): both filetypes entries are identical.
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # The structure id is fixed regardless of which file was selected.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq is overwritten on every pass, so it ends up holding the sequence
    # of the LAST polypeptide only; it stays unassigned if none is built.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    # Flatten the sequence object into a plain string.
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale passed to protein_scale below.
    # NOTE(review): presumably the Kyte-Doolittle hydropathy scale, although
    # the prints below label the result "Hydrophilicity" - confirm.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    # The same two profiles are computed three times each so that every
    # global name gets its own independent list.
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())

    # Sliding 10-residue window: i counts iterations, j + 1 is the window
    # start and k the index of its last residue (j and k are globals).
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        # Keep only the first of the returned fractions for this window.
        a = list(analyze_seq.secondary_structure_fraction())
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    # Accessibility: one freesasa selection per start residue r, covering
    # residues r..y (a span of 10), for `length` consecutive starts.
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    for i in range(0, f):
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # NOTE(review): the file name is hard-coded rather than the selected
        # `filename`, the structure is re-read on every iteration, and this
        # rebinds the global `structure` to a freesasa object - confirm.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        # Second calculation with explicit parameters; this result is the
        # one used for the selections below.
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        # Every selection area is appended, so each iteration contributes
        # one value per selection (this also rebinds the global `a`).
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)

    # Keep every second value of acc.
    # NOTE(review): presumably this keeps one of the two selections per
    # window; which one depends on the iteration order of `selections`.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    # Base-10 logarithm of each kept area; math.log raises ValueError for 0.
    logacc = [math.log(y, 10) for y in l]

    print(logacc)
Exemple #6
0
class Protein(ProteinAnalysis):
    """A named, categorised protein whose analyses tolerate unknown residues.

    Extends ``ProteinAnalysis`` with a name and a numeric category, and
    overrides several analyses so that sequences containing the residue
    codes X, U or B can still be processed.

    Args:
        category_name: Key of ``category_codes``.
        name: Identifier of the protein.
        sequence: Amino-acid sequence as a string.

    Raises:
        KeyError: If ``category_name`` is not a key of ``category_codes``.
    """

    category_codes = {
        'cyto': 0,
        'mito': 1,
        'secreted': 2,
        'nucleus': 3,
        'blind': 4
    }
    # Reverse lookup (code -> name), derived so the two tables cannot drift.
    inv_category_codes = {
        code: label for label, code in category_codes.items()
    }

    def __init__(self, category_name, name, sequence):
        super(Protein, self).__init__(sequence)
        self._name = name
        self._category = Protein.category_codes[category_name]
        self._sequence = sequence
        if 'X' in sequence or 'U' in sequence or 'B' in sequence:
            self._contains_unknown = True
            # Analysis object over the sequence with X/U/B stripped, used by
            # the scale-based averages below.
            new_seq = self._sequence.replace('U', '')
            new_seq = new_seq.replace('B', '')
            new_seq = new_seq.replace('X', '')
            self._no_unknowns = ProteinAnalysis(new_seq)
        else:
            self._contains_unknown = False
            self._no_unknowns = self

    def get_name(self):
        """Return the protein's name."""
        return self._name

    def get_sequence(self):
        """Return the full sequence, unknown residues included."""
        return self._sequence

    def sequence_length(self):
        """Return the number of residues in the full sequence."""
        # The original referenced the undefined name `sequence` (NameError).
        return len(self._sequence)

    def get_sub_sequence(self, seq_flag):
        """Return a Protein for the whole sequence or one 50-residue end.

        Returns ``self`` for ``WHOLE_SEQUENCE``, a new ``Protein`` over the
        first or last 50 residues for ``N_TERMINAL_50`` / ``C_TERMINAL_50``,
        and ``None`` for any other flag.
        """
        if seq_flag == WHOLE_SEQUENCE:
            return self
        category_name = Protein.inv_category_codes[self._category]
        if seq_flag == N_TERMINAL_50:
            return Protein(category_name, self._name, self._sequence[:50])
        if seq_flag == C_TERMINAL_50:
            return Protein(category_name, self._name, self._sequence[-50:])
        return None

    def get_category(self):
        """Return the numeric category code."""
        return self._category

    def molecular_weight(self):  # overrides base class
        """Return the molecular weight, extrapolating over unknown residues.

        With unknown residues present, the weight of the known residues is
        scaled by (total length / known length), i.e. each unknown residue
        counts as the average known residue.
        """
        if not self._contains_unknown:
            return super(Protein, self).molecular_weight()

        known = ''.join(aa for aa in self._sequence if aa not in UNKNOWNS)
        mw = ProteinAnalysis(known).molecular_weight()
        # float() keeps this a true division on Python 2 as well, where the
        # integer quotient would silently truncate the scaling factor.
        return mw * (float(len(self._sequence)) / len(known))

    def instability_index(self):  # overrides base class
        """Return the instability index, skipping dipeptides with unknowns.

        The sequence is split at X/U/B; DIWV dipeptide weights are summed
        within each fragment and normalised by the number of known residues.
        """
        if not self._contains_unknown:
            return super(Protein, self).instability_index()

        index = ProtParamData.DIWV
        fragments = self._sequence.replace('U', 'X').replace('B', 'X')
        fragments = fragments.split('X')
        score = 0.0
        for fragment in fragments:
            for i in range(len(fragment) - 1):
                this, nextt = fragment[i:i + 2]
                score += index[this][nextt]

        # len(fragments) - 1 is the number of unknown residues removed.
        return (10.0 /
                (len(self._sequence) - len(fragments) + 1)) * score

    def flexibility(self):  # overrides base class
        """Return the flexibility profile, treating X/U/B as 'R'."""
        if not self._contains_unknown:
            return super(Protein, self).flexibility()

        # Replace each unknown with 'R' as a stand-in for an average residue.
        new_sequence = self._sequence.replace('X', 'R')
        new_sequence = new_sequence.replace('U', 'R')
        new_sequence = new_sequence.replace('B', 'R')
        return ProteinAnalysis(new_sequence).flexibility()

    def in_vivo_half_life(self):  # N-end rule
        """Return the half-life looked up from the first residue.

        Uses the module-level ``in_vivo_half_life`` table; an unknown first
        residue yields 5 (approximate average value).
        """
        first = self._sequence[0]
        if first not in UNKNOWNS:
            return in_vivo_half_life[first]
        return 5  # approx avg value

    def has_KDEL(self):  # ER retention signal
        """Return True if the sequence ends with 'KDEL'."""
        return self._sequence[-4:] == 'KDEL'

    def has_KKXX(self):  # ER retention signal
        """Return True if positions -4 and -3 are both 'K' (KKXX tail)."""
        return self._sequence[-4:-2] == 'KK'

    def has_NLS(self):  # NLS signal
        """Return True if 'PKKKRKV' occurs anywhere in the sequence."""
        return 'PKKKRKV' in self._sequence

    def has_Chelsky_sequence(self):  # NLS signal
        """Return True if the pattern K-K/R-X-K/R occurs in the sequence."""
        # re.search returns None when nothing matches; it does not raise
        # AttributeError, so the original try/except was dead code.
        return re.search('K[KR].[KR]', self._sequence) is not None

    def has_PTS(self):  # peroxisomal targeting signal (PTS)
        """Return True if the sequence ends with 'SKL'."""
        return self._sequence[-3:] == 'SKL'

    def hydrophobicity(self):
        """Return the mean of the ``kd`` scale (window 3), unknowns removed."""
        hphob = self._no_unknowns.protein_scale(ProtParamData.kd, 3)
        return np.mean(hphob)

    def surface_accessibility(self):
        """Return the mean of the ``em`` scale (window 3), unknowns removed."""
        em = self._no_unknowns.protein_scale(ProtParamData.em, 3)
        return np.mean(em)

    def transfer_energy(self):
        """Return the mean of the ``ja`` scale (window 3), unknowns removed."""
        te = self._no_unknowns.protein_scale(ProtParamData.ja, 3)
        return np.mean(te)

    def hydrophilicity(self):
        """Return the mean of the ``hw`` scale (window 3), unknowns removed."""
        hphil = self._no_unknowns.protein_scale(ProtParamData.hw, 3)
        return np.mean(hphil)
def _hydrophilicity_class(value):
    """Map a scale score onto the classes 2, 1, -1, -2 (0 on a boundary)."""
    if value > 0.5:
        return 2
    if 0 < value < 0.5:
        return 1
    if -0.4 < value < 0:
        return -1
    if value < -0.4:
        return -2
    return 0


def openfile():
    """Load a PDB file from a dialog and print a per-window antigenicity index.

    The function takes no arguments and returns nothing; results are
    published through the globals ``my_seq``, ``m``, ``a``, ``b``, ``c``,
    ``tupleall`` and ``antigenicity``. For every 10-residue window it prints
    the window, its flexibility and scale values and its secondary-structure
    fractions, then discretises those values in place and combines them into
    ``antigenicity`` (left holding the value of the last window).
    """
    global my_seq
    global antigenicity
    global m, a, c, b
    global tupleall
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    print(root.filename)
    # The structure id is fixed regardless of which file was selected.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq is overwritten each pass: only the last polypeptide is kept.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    m = ''.join(list(my_seq))  # type: str
    print(m)
    length = len(m)  # type: int
    print(length)
    print("Sequence consist of", len(m), "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale passed to protein_scale below.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))

    # One iteration per window start 0 .. length - 11, as in the original
    # i/j/k counters.
    for start in range(length - 10):
        window = m[start:start + 10]
        print("Sequence is = ", window)
        print("Flexibility value = ", c[start])
        print("Hydrophilicity value = ", b[start])
        analyze_seq = ProteinAnalysis(window)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Snapshot of the raw values; note that it shares the list `a`,
        # which is modified just below.
        tupleall = (window, c[start], b[start], a)
        print(tupleall[0], tupleall[2], tupleall[1], tupleall[3])
        # Secondary-structure term: 1 when the first fraction is at least
        # the second, otherwise the second fraction.
        if a[0] >= a[1]:
            a[0] = 1
        else:
            a[0] = a[1]
        # For Hydrophilicity. The original chain tested
        # "b < 0.5 or b > 0", which is true for every real number, so the
        # -1, -2 and 0 classes could never be assigned.
        b[start] = _hydrophilicity_class(b[start])
        # For Flexibility
        c[start] = 1 if c[start] > 1.0 else 0
        # For antigenicity Index (the second term is a constant 0.15 * 1).
        antigenicity = (0.3 * b[start] + 0.15 * 1 + 0.15 * c[start] +
                        0.2 * a[0])
        print("antigenicity", antigenicity)
 def transform(self, X):
     """Return a ``(len(X), 1)`` array with one ProteinAnalysis value per item.

     Each element ``X[i]`` is converted to ``str`` and wrapped in
     ``ProteinAnalysis``; the result of ``protein_scale()`` is stored in row
     ``i`` of the returned array.
     """
     # One row per input item, initialised to zero.
     vec = np.zeros((len(X), 1))
     for i in range(len(X)):
         pa = ProteinAnalysis(str(X[i]))
         # NOTE(review): protein_scale is called without arguments here;
         # Biopython's method takes a scale dict and a window size, and its
         # result would have to fit a single array cell - confirm that this
         # call works as intended.
         vec[i, 0] = pa.protein_scale()
     return vec
def disruptin_table(garnier_file, fasta_file):
    """Build per-residue annotation vectors for every record of a FASTA file.

    Args:
        garnier_file: Iterable of text rows; rows whose first character is
            one of ``H``, ``E``, ``T``, ``C`` are taken as secondary-structure
            strings (tabs removed), one per FASTA record, in order.
        fasta_file: FASTA file (path or handle) parsed with ``SeqIO``.

    Returns:
        A tuple ``(record, p, r, c, h, s)`` of parallel lists with one entry
        per FASTA record: record id, position labels (1-based, as strings),
        residues, class symbols (``+`` for R/K, ``-`` for D/E, ``N`` for
        AVMILPWFG, ``P`` for HSYTCQN, ``?`` otherwise), the ``kd`` scale
        profile (window 9, padded with four zeros at each end) and the
        secondary-structure characters.
    """
    # Secondary-structure string of each record, in file order.
    sec_struct = []
    for row in garnier_file:
        if not row:
            continue
        # NOTE(review): the original also collected rows starting with
        # 'Sequence: ' into a list that was never used; that unused
        # bookkeeping has been dropped.
        if row[0] in 'HETC':
            sec_struct.append(''.join(row.split('\t')))

    record = []
    p = []
    r = []
    c = []
    h = []
    s = []

    # Parse the .fasta file and get the sequence
    for record_number, rec in enumerate(SeqIO.parse(fasta_file, "fasta")):
        sequence = str(rec.seq)

        # Information vectors: position number, residue, class symbol
        # (charged / nonpolar / polar) and secondary structure.
        record.append(rec.id)
        position_vec = []
        residue_vec = []
        charge_sym_vec = []
        sec_struct_vec = []

        for position, aa in enumerate(sequence, start=1):
            position_vec.append(str(position))
            residue_vec.append(str(aa))
            sec_struct_vec.append(
                str(sec_struct[record_number][position - 1]))

            if aa in "RK":
                # For R and K residues a positive charge is given
                symbol = "+"
            elif aa in "DE":
                # For D and E residues a negative charge is given
                symbol = "-"
            elif aa in "AVMILPWFG":
                symbol = "N"
            elif aa in "HSYTCQN":
                symbol = "P"
            else:
                # Residues outside the four classes used to inherit the
                # previous residue's symbol (or raise NameError at the
                # first position); mark them explicitly instead.
                symbol = "?"
            charge_sym_vec.append(symbol)

        # Hydrophobicity on the Kyte and Doolittle scale with a window of 9.
        # The first and last 4 positions are padded with 0 so the values are
        # centred on their approximate position in the sequence. Computed
        # once per record (it was rebuilt for every residue before, and was
        # undefined or stale for an empty sequence).
        prot_ana_seq = ProteinAnalysis(sequence)
        hydro = ([0] * 4 + prot_ana_seq.protein_scale(ProtParamData.kd, 9) +
                 [0] * 4)

        p.append(position_vec)
        r.append(residue_vec)
        c.append(charge_sym_vec)
        h.append(hydro)
        s.append(sec_struct_vec)

    return record, p, r, c, h, s
    '--graph',
    type=bool,
    default=False,
    help='Toggles the creation of a folder with images of the hydropathy graphs'
)

args = parser.parse_args()

# Read the reference FASTA file named by args.input_reference and store each
# record's sequence and id in the parallel lists seq_record / seq_names.
for record in SeqIO.parse(args.input_reference, 'fasta'):
    seq_record.append(str(record.seq))
    seq_names.append(str(record.id))

# Compute one sliding-window profile per reference sequence, using the
# kyle_doolittle scale with a window of 7.
for index in range(len(seq_names)):
    seq = ProteinAnalysis(seq_record[index])
    test = seq.protein_scale(kyle_doolittle, 7)
    seq_analysis.append(test)

all_trends = []
s_regions = []

# Reduce each profile to its region averages by chaining the helpers
# create_cors -> analyze_profile -> filter_profiles -> generate_region_average
# (all defined outside this section).
for index in range(len(seq_analysis)):
    a = generate_region_average(
        filter_profiles(
            analyze_profile(create_cors(seq_analysis[index], 4), [])))
    s_regions.append(a)

# Fold every region list into all_trends; mysort returns the updated
# accumulator, which is fed back into the next call.
for index in range(len(s_regions)):
    all_trends = mysort(s_regions[index], all_trends)
Exemple #11
0
    def featurise(self, data):
        """
        Turn sequence records into a table of ProtParam-derived features.

        For every record the whole sequence, its first 50 residues and its
        last 50 residues are analysed with ``ProteinAnalysis``.

        Parameters:
        -----------
        data : `list` of `Bio.SeqRecord.SeqRecord`
            The records to featurise.

        Returns:
        -------
        featurised_data : `pandas.DataFrame`
            (num_data, features) One row per record, one column per feature.
        """
        # (feature name, key into self.dicts) for every averaged scale, in
        # the order the columns are created.
        scale_features = (
            ("hydrophobicity", 'kd'),
            ("flexibility", 'flex'),
            ("hydrophilicity", 'hw'),
            ("surface_accessibility", 'em'),
            ("janin", 'ja'),
        )

        features = collections.defaultdict(list)

        for example in data:
            # ProteinAnalysis needs a plain string, not a SeqRecord.
            sequence_text = str(example.seq)
            whole = ProteinAnalysis(sequence_text)
            head = ProteinAnalysis(sequence_text[:50])
            tail = ProteinAnalysis(sequence_text[-50:])

            # Whole-sequence scalar properties.
            features["length"].append(whole.length)
            features["molecular_weight"].append(whole.molecular_weight())
            features["isoelectric_point"].append(whole.isoelectric_point())
            features["aromaticity"].append(whole.aromaticity())
            features["instability_index"].append(whole.instability_index())
            features["gravy"].append(whole.gravy())

            reduced, oxidised = whole.molar_extinction_coefficient()
            features["reduced"].append(reduced)
            features["oxidised"].append(oxidised)

            helix, turn, sheet = whole.secondary_structure_fraction()
            features["helix"].append(helix)
            features["turn"].append(turn)
            features["sheet"].append(sheet)

            for ph in (1, 7, 12):
                features["charge_at_ph%d" % ph].append(whole.charge_at_pH(ph))

            # Mean of each windowed scale for the whole sequence and both
            # 50-residue ends.
            for prefix, analysed in (("", whole), ("first50_", head),
                                     ("last50_", tail)):
                for feature_name, dict_key in scale_features:
                    profile = analysed.protein_scale(self.dicts[dict_key],
                                                     window=5,
                                                     edge=1.0)
                    features[prefix + feature_name].append(np.mean(profile))

            # Amino-acid fractions, each multiplied by 5.
            for residue, fraction in whole.get_amino_acids_percent().items():
                features[residue].append(fraction * 5)
            for prefix, analysed in (("first_50_", head), ("last_50_", tail)):
                percentages = analysed.get_amino_acids_percent()
                for residue, fraction in percentages.items():
                    features[prefix + str(residue)].append(fraction * 5)

        return pd.DataFrame.from_dict(features)
        st.write('Thank you for your feedback, it has been recorded!')
    st.markdown('---')

    if show_aa_chart:
        st.subheader('Breakdown of Virus Amino Acids')
        display_aa_chart(virus_df)
        st.markdown('---')

    if show_prot_scale:
        st.subheader(
            'Protein Scale of Virus Sequence (Hydropathicity Amino Acid Scale)'
        )
        # Protein sequence of the virus currently selected in the UI.
        selected_rows = virus_df[virus_df['id'] == virus_select]
        analysis = ProteinAnalysis(selected_rows['protein_sequence'].iloc[0])
        prot_scale = analysis.protein_scale(param_dict=hydropathicity,
                                            window=wsl,
                                            edge=esl)
        # np.arange excludes its stop value, so the stop must be len + 1 to
        # get one x position per score; with len(prot_scale) the x array was
        # one element shorter than y.
        positions = np.arange(1, len(prot_scale) + 1)
        scale_fig = go.Figure(data=go.Scatter(x=positions,
                                              y=np.array(prot_scale)))
        scale_fig.update_layout(xaxis_title='Position',
                                yaxis_title='Hydropathicity')
        st.plotly_chart(scale_fig)
        st.markdown('---')

    if show_df:
        st.subheader('Raw Data')
        st_ms = st.multiselect("Columns",
                               virus_df.columns.tolist(),
                               default=ms_cols)
        # row_limit = st.sidebar.slider("Dataframe rows:", 1, 10, 1)
        st.dataframe(virus_df[virus_df['id'] == virus_select][st_ms].head())
Exemple #13
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO
# Print a ProtParam report for every record of the sample FASTA file.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        # Whole-sequence properties, printed one after another in this order.
        for compute in (
                myprot.count_amino_acids,
                myprot.get_amino_acids_percent,
                myprot.molecular_weight,
                myprot.aromaticity,
                myprot.instability_index,
                myprot.flexibility,
                myprot.isoelectric_point,
                myprot.secondary_structure_fraction,
        ):
            print(compute())
        # Sliding-window profile on the kd scale: window 9, edge weight 0.4.
        print(myprot.protein_scale(ProtParamData.kd, 9, .4))