コード例 #1
0
    def test(self, positive_file, negative_file, sequence_position=10):
        """Evaluate the fitted classifier on two labelled sequence files.

        Lines from ``positive_file`` are labelled 1 and lines from
        ``negative_file`` are labelled 0.  A line is used only when it does
        not contain ``>`` and the character at ``sequence_position`` equals
        ``self.amino_acid``; negative lines containing ``X`` or ``U`` are
        additionally skipped.  The collected examples are shuffled, passed
        to ``self.clf.predict`` and the outcome is handed to ``report``.

        Args:
            positive_file: Path of the file holding positive examples.
            negative_file: Path of the file holding negative examples.
            sequence_position: Index of the candidate residue within each
                line (the original author's test files use 10).

        Raises:
            ValueError: If neither file yields a usable example.
        """
        window_length = 2 * self.window + 1
        # (path, label, residue codes whose presence disqualifies a line)
        sources = (
            (positive_file, 1, ""),
            (negative_file, 0, "XU"),
        )
        examples = []
        for path, label, excluded in sources:
            with open(path) as handle:
                for line in handle:
                    if ">" in line:
                        continue
                    try:
                        residue = line[sequence_position]
                    except IndexError:
                        # Blank or truncated line: there is no residue at
                        # the requested position, so skip it rather than
                        # aborting the whole evaluation.
                        continue
                    if residue != self.amino_acid:
                        continue
                    if any(code in line for code in excluded):
                        continue
                    temp_window = ProteinAnalysis(
                        windower(line, sequence_position,
                                 self.window).strip("\t"))
                    examples.append(
                        (featurify(temp_window, window_length), label))

        if not examples:
            raise ValueError(
                "no usable test sequences found in %r or %r"
                % (positive_file, negative_file))

        random.shuffle(examples)
        test_features, test_labels = zip(*examples)

        test_results = self.clf.predict(test_features)
        report(results=test_results, answers=test_labels, classy=self.clf)
コード例 #2
0
    def analyzeCleaves(self):
        """Enumerate digested peptides and append them to ``self.dpeps``.

        ``i`` walks over the cleave sites in ``self.sites``; ``j`` is the
        number of following cleave sites that are skipped (missed cleaves),
        from 0 up to ``self.misses``.  Every candidate fragment that passes
        ``self.checkLenWeight`` is appended to ``self.dpeps`` as::

            [fragment, length, molecular weight, missed cleaves,
             left flank, right flank, "start-end"]

        For the first site the fragment running from the start of the
        peptide is considered as well.
        """
        for i in range(len(self.sites)):
            for j in range(self.misses + 1):
                start = self.sites[i] + 1
                left = self.peptide[:start]
                try:
                    stop = self.sites[i + j + 1] + 1
                except IndexError:
                    # No cleave site left after skipping j of them: the
                    # fragment runs to the end of the peptide and there is
                    # nothing more to try for this i.
                    right = ''
                    fragment = self.peptide[start:]
                    reached_end = True
                else:
                    right = self.peptide[stop:]
                    fragment = self.peptide[start:stop]
                    reached_end = False

                if i == 0:
                    # N-terminal fragment; note that ``left`` stays rebound
                    # for the fragment record below, as in the original.
                    left = self.peptide[:self.sites[i + j] + 1]
                    if self.checkLenWeight(left):
                        weight = ProteinAnalysis(str(left)).molecular_weight()
                        span = '-'.join((str(1), str(len(left))))
                        self.dpeps.append(
                            [left, len(left), weight, j, '',
                             fragment + right, span])

                if self.checkLenWeight(fragment):
                    weight = ProteinAnalysis(str(fragment)).molecular_weight()
                    span = '-'.join(
                        (str(self.sites[i] + 2),
                         str(self.sites[i] + len(fragment) + 1)))
                    self.dpeps.append(
                        [fragment, len(fragment), weight, j, left, right,
                         span])

                if reached_end:
                    break
コード例 #3
0
def binaryFeatureTable(PosSeqFiles, NegSeqFiles):
    """Build a table of amino-acid fractions for two groups of FASTA files.

    Every record read by ``readfasta`` becomes one row holding the result
    of ``ProteinAnalysis.get_amino_acids_percent()`` plus three extra
    columns: ``Class`` (1 for files in ``PosSeqFiles``, 0 for files in
    ``NegSeqFiles``), ``Length`` and ``ID``.  Positive rows come first.

    Returns:
        pandas.DataFrame with one row per sequence record.
    """
    rows = []
    labelled_groups = ((1, PosSeqFiles), (0, NegSeqFiles))
    for label, paths in labelled_groups:
        for path in paths:
            for rec in readfasta(path):
                row = ProteinAnalysis(str(rec.seq)).get_amino_acids_percent()
                row['Class'] = label
                row['Length'] = len(rec.seq)
                row['ID'] = rec.id
                rows.append(row)
    return pd.DataFrame(rows)
コード例 #4
0
ファイル: seqproperties.py プロジェクト: joseph266394/bactome
def _toPeptide(sequence, molecule, genetic_code=1, to_stop=True):
    '''
    Private function - Takes a sequence (DNA/RNA/amino acid) and
    converts it, translating where necessary, into a ProteinAnalysis
    object.

    @param sequence String: Nucleotide (DNA/RNA) or amino acid
    sequence.
    @param molecule String: Defines the type of molecule (matched
    case-insensitively). Three options are allowed: 'peptide' for amino
    acid sequences, 'DNA' for DNA sequences (requires transcription and
    translation), and 'RNA' for RNA sequence (requires translation).
    @param genetic_code Integer: Genetic code number to be used for
    translation. Default = 1 (Standard Code). For more information,
    see <https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>
    @param to_stop Boolean: Flag to stop translation when first stop
    codon is encountered. Default = True.
    @return: Bio.SeqUtils.ProtParam.ProteinAnalysis object
    @raise ValueError: if molecule is not 'peptide', 'DNA' or 'RNA'.
    '''
    kind = molecule.lower()
    if kind == 'peptide':
        return ProteinAnalysis(sequence)
    if kind == 'rna':
        rna = Seq(str(sequence), generic_rna)
    elif kind == 'dna':
        rna = Seq(str(sequence), generic_dna).transcribe()
    else:
        # Previously this fell through to ``return peptide`` with the name
        # unbound, which surfaced as an opaque UnboundLocalError.
        raise ValueError(
            "molecule must be 'peptide', 'DNA' or 'RNA', got %r" % (molecule,))
    peptide = rna.translate(genetic_code, to_stop=to_stop)
    return ProteinAnalysis(str(peptide))
コード例 #5
0
def gravy(sequence):
    """Return the GRAVY score of ``sequence`` with ``X`` and ``*`` removed.

    The original guard ``'X' or '*' in sequence`` was always true (the
    non-empty string ``'X'`` is truthy), so the characters were always
    stripped and the ``else`` branch could never run.  Stripping is a
    no-op when neither character is present, so it is done
    unconditionally here.

    Args:
        sequence: Amino-acid sequence; must provide ``replace``.

    Returns:
        The value of ``ProteinAnalysis(...).gravy()`` for the cleaned
        sequence.
    """
    cleaned = sequence.replace('X', '').replace('*', '')
    return ProteinAnalysis(cleaned).gravy()
コード例 #6
0
 def test_molecular_weight(self):
     """Test Lassopeptide.molecular_weight"""
     full_core = "MAGICHATTIP"
     # Constant subtracted from both expected weights.
     # NOTE(review): presumably the mass of one water molecule - confirm.
     offset = 18.02

     lasso = Lassopeptide(23, 42, 17, 51, "lead", full_core)
     lasso.c_cut = "TIP"

     expected_full = ProteinAnalysis(
         full_core, monoisotopic=False).molecular_weight() - offset
     # "MAGICHAT" is the core without the trailing "TIP" cut.
     expected_cut = ProteinAnalysis(
         "MAGICHAT", monoisotopic=False).molecular_weight() - offset

     self.assertAlmostEqual(expected_full, lasso.molecular_weight)
     self.assertAlmostEqual(expected_cut, lasso.cut_weight)
コード例 #7
0
def _create_features_biopython(data, local=20, indiv_keys=False):
    """Add Biopython-derived sequence features to ``data`` in place.

    For every entry of ``data['seq']`` (after ``replace_wild_first`` and
    ``replace_selenocysteine``) each feature is computed three times: on
    the whole sequence, on the first ``local`` residues (key suffix
    ``_localfirst``) and on the last ``local`` residues (``_locallast``).
    ``_make_numpy(data)`` is then called.

    Args:
        data: Mapping with a ``'seq'`` entry; feature lists are stored
            under new keys of the same mapping.
        local: Number of residues used for the local windows.
        indiv_keys: When true, each column ``i`` of the three
            ``secondary_structure_fraction*`` entries is stored under its
            own ``..._i`` key and the combined entries are deleted.
            This relies on those entries supporting ``.shape`` and 2-D
            indexing after ``_make_numpy`` - presumably NumPy arrays.
    """
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    # Key template -> function computing the feature from a ProteinAnalysis.
    # ('gravy{}' was commented out in the original feature set.)
    feature_fun = {
        'molecular_weight{}': lambda pa: pa.molecular_weight(),
        'iso_point{}': lambda pa: pa.isoelectric_point(),
        'aromaticity{}': lambda pa: pa.aromaticity(),
        'instability_index{}': lambda pa: pa.instability_index(),
        'flexibility{}': lambda pa: flexibility_index(pa.flexibility()),
        'secondary_structure_fraction{}':
            lambda pa: pa.secondary_structure_fraction(),
    }
    suffixes = ('', '_localfirst', '_locallast')

    for template in feature_fun:
        for suffix in suffixes:
            data[template.format(suffix)] = []

    for raw_seq in data['seq']:
        seq = replace_selenocysteine(replace_wild_first(raw_seq))
        # Whole sequence, leading window, trailing window - same order as
        # the suffixes above.
        regions = (seq, seq[:local], seq[-local:])
        for suffix, region in zip(suffixes, regions):
            pa = ProteinAnalysis(region)
            for template, fun in feature_fun.items():
                data[template.format(suffix)].append(fun(pa))

    _make_numpy(data)

    if indiv_keys:
        ssf_names = [
            'secondary_structure_fraction' + suffix for suffix in suffixes
        ]
        n_components = data[ssf_names[0]].shape[1]
        for i in range(n_components):
            for name in ssf_names:
                data['{}_{}'.format(name, i)] = data[name][:, i]

        for name in ssf_names:
            del data[name]
コード例 #8
0
ファイル: amino_acids.py プロジェクト: GradinaruLab/pepars
def generate_array_of_interest(enrichment_sequences_array, array_property):
    """Compute one ProteinAnalysis property for every sequence.

    Args:
        enrichment_sequences_array: Indexable collection of sequences;
            each element is converted with ``str`` before analysis.
        array_property: ``'aromaticity'`` or ``'flexibility'`` (compared
            after ``str`` conversion).  Any other value produces an empty
            result.

    Returns:
        List with one entry per sequence: the value returned by the
        ProteinAnalysis method of the same name.
    """
    wanted = str(array_property)
    supported = ('aromaticity', 'flexibility')
    results = []
    for position in range(len(enrichment_sequences_array)):
        entry = enrichment_sequences_array[position]
        if wanted in supported:
            analysis = ProteinAnalysis(str(entry))
            results.append(getattr(analysis, wanted)())
    return results
コード例 #9
0
def calculate_pI_from_file(file, output_dir, cutoff_pi, out_CSV_pi):
    """Calculate the pI of all sequences in a FASTA file.

    For each record the pI of the unmodified sequence is computed and
    written with ``output_pI_row`` (modifier ``'0'``).  Its category is
    ``'0'`` when the pI is below ``cutoff_pi`` and ``'1'`` otherwise.
    For category ``'1'`` records the residue substitution described by
    ``define_seq_modifications()['1']`` is applied, the pI is recomputed
    and a second row (modifier ``'1'``) is written.

    A summary line with the number of pI calculations and the elapsed
    time is printed at the end.
    """
    modifications = define_seq_modifications()
    n_computed = 0
    started_at = time.time()

    def classify(value):
        # '0' = below the cutoff, '1' = at or above it.
        return '0' if value < cutoff_pi else '1'

    with open(file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta", alphabet=IUPAC.protein):
            # Metadata is encoded in the '|'-separated description.
            acc_code, organism, EC_code, species, note = get_record_meta(
                record.description.split("|"))

            # Unmodified sequence.
            modifier = '0'
            pi = ProteinAnalysis(''.join(record.seq)).isoelectric_point()
            n_computed += 1
            category = classify(pi)
            output_pI_row(output_dir, out_CSV_pi, file, acc_code, organism,
                          EC_code, species, note, pi, modifier, category)

            if category != '1':
                continue

            # pI not below the cutoff: test the sequence modification.
            modifier = '1'
            rule = modifications[modifier]
            targ = convert_to_one_letter_code_sing(rule['target_res'])
            replacement = convert_to_one_letter_code_sing(rule['replace_res'])
            mod_seq = ''.join(record.seq).replace(targ, replacement)
            pi = ProteinAnalysis(mod_seq).isoelectric_point()
            n_computed += 1
            category = classify(pi)
            output_pI_row(output_dir, out_CSV_pi, file, acc_code, organism,
                          EC_code, species, note, pi, modifier, category)

    elapsed = '{0:.2f}'.format(time.time() - started_at)
    print('--- finished %s sequences in %s seconds ---' %
          (n_computed, elapsed))
コード例 #10
0
def aa_analysis(df, property):
    """Score both sides of each ``Amino_acids`` entry with localCIDER.

    Rows whose ``Amino_acids`` value is null are dropped.  The remaining
    values are split on ``'/'`` into ``AA1`` and ``AA2``, and each side is
    scored with the ``SequenceParameters`` method selected by
    ``property``.  A sequence that cannot be scored gets 0 (best-effort
    behaviour kept from the original).

    The output columns keep their historical ``*_Iso`` names even though
    they hold NCPR or hydropathy values.

    Args:
        df: DataFrame with an ``Amino_acids`` column.  It is not modified.
        property: ``"ncpr"`` (uses ``get_NCPR``) or
            ``"uversky_hydropathy"`` (uses ``get_uversky_hydropathy``).

    Returns:
        A new DataFrame with columns ``AA1_Iso``, ``AA2_Iso`` and
        ``AA_Iso_Delta`` (``AA2_Iso - AA1_Iso``).  For any other
        ``property`` the input ``df`` is returned unchanged.
    """
    if property == "ncpr":
        method_name = "get_NCPR"
    elif property == "uversky_hydropathy":
        method_name = "get_uversky_hydropathy"
    else:
        return df

    def score(sequence):
        # Best effort: unscorable sequences count as 0.  Catch Exception
        # rather than everything so Ctrl-C / SystemExit still propagate.
        try:
            return getattr(SequenceParameters(str(sequence)), method_name)()
        except Exception:
            return 0

    # Work on an explicit copy so adding columns neither touches the
    # caller's frame nor triggers SettingWithCopyWarning.
    df = df[pd.notnull(df['Amino_acids'])].copy()
    df[["AA1", "AA2"]] = df['Amino_acids'].str.split('/', expand=True)
    df["AA1_Iso"] = [score(sequence) for sequence in df["AA1"]]
    df["AA2_Iso"] = [score(sequence) for sequence in df["AA2"]]
    df["AA_Iso_Delta"] = df["AA2_Iso"] - df["AA1_Iso"]
    return df[["AA1_Iso", "AA2_Iso", "AA_Iso_Delta"]]
コード例 #11
0
def find_composition(df_original):
    """Append composition and physico-chemical features to a sequence table.

    For every value in ``df_original['seq']`` the following are computed
    with ``ProteinAnalysis``: amino-acid fractions of the whole sequence,
    of its first ``first_n`` residues and of its last ``last_n`` residues
    (``first_n``/``last_n``/``codes`` are module-level names), plus
    length, molecular weight, GRAVY, mean and standard deviation of the
    flexibility profile, secondary-structure fractions, isoelectric point
    and aromaticity.

    Rows are collected in a list and turned into a DataFrame once;
    ``DataFrame.append`` (used previously) was removed in pandas 2.0 and
    copied the whole frame on every iteration.  The feature frame reuses
    the input's index so the final column-wise concat lines up row for
    row even when the input index is not 0..n-1.

    Args:
        df_original: DataFrame with a ``seq`` column.  It is not modified.

    Returns:
        A copy of ``df_original`` with the feature columns added on the
        right.
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            (ch + '_percent', ch + '_percent_first', ch + '_percent_last'))
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        row = {}

        row['len'] = analysed.length
        (row['ss_helix'], row['ss_turn'],
         row['ss_sheet']) = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # Composition of the whole sequence, its head and its tail.
        regions = (
            ('_percent', analysed),
            ('_percent_first', ProteinAnalysis(sequence[:first_n])),
            ('_percent_last', ProteinAnalysis(sequence[-last_n:])),
        )
        for suffix, region in regions:
            for aa, percent in region.get_amino_acids_percent().items():
                row[aa + suffix] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        flexibility = analysed.flexibility()  # computed once, used twice
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    features = pd.DataFrame(rows, index=df_copy.index)
    # Declared columns first (present even if never filled), followed by
    # any additional keys produced by the analysis.
    extra = [name for name in features.columns if name not in column_names]
    features = features.reindex(columns=column_names + extra)

    return pd.concat([df_copy, features], axis=1)
コード例 #12
0
def calculate_rxn_syst_pI(sequence, rxn_syst, cutoff_pi):
    """
    Calculate the pI of a sequence associated with a reaction system.

    The outcome is stored on ``rxn_syst``:

    * unmodified pI below ``cutoff_pi``: ``seed_MOF = True`` and ``pI``
      is the unmodified value;
    * otherwise the substitution from ``define_seq_modifications()['1']``
      is applied.  If the modified pI is below the cutoff, ``seed_MOF =
      True``, ``req_mod = '1'`` and ``pI`` is the modified value; if not,
      ``seed_MOF = False`` and ``pI`` stays at the unmodified value.

    Returns the same ``rxn_syst`` object.
    """
    modifications = define_seq_modifications()
    native_pi = ProteinAnalysis(sequence).isoelectric_point()

    if native_pi < cutoff_pi:
        rxn_syst.seed_MOF = True
        rxn_syst.pI = native_pi
        return rxn_syst

    # Report the unmodified pI unless the modification succeeds.
    rxn_syst.pI = native_pi

    modifier = '1'
    rule = modifications[modifier]
    # Target and replacement residues as one-letter codes.
    targ = convert_to_one_letter_code_sing(rule['target_res'])
    replacement = convert_to_one_letter_code_sing(rule['replace_res'])
    mod_seq = ''.join(sequence).replace(targ, replacement)
    modified_pi = ProteinAnalysis(mod_seq).isoelectric_point()

    if modified_pi < cutoff_pi:
        rxn_syst.seed_MOF = True
        rxn_syst.req_mod = modifier
        rxn_syst.pI = modified_pi
    else:
        rxn_syst.seed_MOF = False

    return rxn_syst
コード例 #13
0
 def calc_isoelectric_point(self) -> float:
     """
     Compute the isoelectric point of the sequence from ``get_seq()``.

     Uses Biopython's ``ProteinAnalysis``; see
     biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam-pysrc.html

     :return: the sequence's isoelectric point
     """
     sequence = self.get_seq()
     return ProteinAnalysis(sequence).isoelectric_point()
コード例 #14
0
    def _protein_parameters(self, sequence):
        """Calculates physicochemical properties for the amino acid sequence.

        The vector holds, in order: molecular weight, aromaticity,
        instability index, GRAVY, isoelectric point, the first three
        secondary-structure fractions, the two molar extinction
        coefficients and the value of ``self._net_charge(sequence)``.

        Args:
            sequence: str, amino acid sequence.

        Returns:
            property_arr: np array, vector of properties.

        """
        analysis = ProteinAnalysis(sequence)

        scalars = [
            analysis.molecular_weight(),
            analysis.aromaticity(),
            analysis.instability_index(),
            analysis.gravy(),
            analysis.isoelectric_point(),
        ]

        secondary = analysis.secondary_structure_fraction()
        fractions = [secondary[0], secondary[1], secondary[2]]

        extinction = analysis.molar_extinction_coefficient()
        coefficients = [extinction[0], extinction[1]]

        net_charge = self._net_charge(sequence)

        return np.array(scalars + fractions + coefficients + [net_charge])
コード例 #15
0
ファイル: protparam.py プロジェクト: marlanbar/master-thesis
def get_protparam(row, func_name):
    """Return one ProteinAnalysis parameter for ``row``, or NaN on failure.

    Args:
        row: Protein sequence to analyse.
        func_name: Name of the zero-argument ``ProteinAnalysis`` method to
            call (for example ``"gravy"``).

    Returns:
        Whatever the named method returns, or ``np.nan`` if constructing
        the analysis or calling the method raises.

    The constructor now sits inside the ``try`` so an unusable ``row``
    yields NaN like every other failure, and only ``Exception`` is caught
    so KeyboardInterrupt and SystemExit are no longer swallowed.
    """
    try:
        return getattr(ProteinAnalysis(row), func_name)()
    except Exception:
        return np.nan
コード例 #16
0
def calculate_residue_features(temp_dict, sequence):
    """Add composition-weighted scale averages for ``sequence`` to ``temp_dict``.

    Each residue's fraction (``get_amino_acids_percent``) is multiplied by
    its value in the module-level scales ``kd``, ``hw``, ``em`` and ``ja``
    and summed.  The four sums and the residue fractions themselves are
    written into ``temp_dict`` in place; nothing is returned.
    """
    aa_percent = ProteinAnalysis(sequence).get_amino_acids_percent()

    kd_total = 0
    hw_total = 0
    em_total = 0
    ja_total = 0
    for residue, fraction in aa_percent.items():
        kd_total += fraction * kd[residue]
        hw_total += fraction * hw[residue]
        em_total += fraction * em[residue]
        ja_total += fraction * ja[residue]

    summary = {}
    summary["Hydrophobicity"] = kd_total
    summary["Hydrophilicity"] = hw_total
    summary["Surface Fractional Probability"] = em_total
    summary["I2S Transfer Energy Scale"] = ja_total

    temp_dict.update(summary)
    temp_dict.update(aa_percent)
コード例 #17
0
 def transform(self, X):
     """Encode each sequence as its amino-acid fraction vector.

     Args:
         X: Indexable collection of sequences; each element is converted
             with ``str``.

     Returns:
         Array of shape ``(len(X), len(VALID_AMINO_ACIDS))`` whose entry
         ``[i, j]`` is the fraction of ``VALID_AMINO_ACIDS[j]`` in
         ``X[i]`` (0.0 when the residue is not reported).
     """
     vec = np.zeros((len(X), len(VALID_AMINO_ACIDS)))
     for i in range(len(X)):
         # The composition does not depend on j, so compute it once per
         # sequence instead of once per amino-acid column.
         percents = ProteinAnalysis(str(X[i])).get_amino_acids_percent()
         for j, a in enumerate(VALID_AMINO_ACIDS):
             vec[i, j] = percents.get(a, 0.0)
     return vec
コード例 #18
0
def physchem_props(data):
    """Append physicochemical properties to each tab-separated input line.

    The second-to-last field of every line is treated as the sequon.
    Lines whose sequon is empty or contains ``X`` or ``*`` are skipped.
    For the others molecular weight, GRAVY, aromaticity, instability
    index and isoelectric point are appended as five extra columns.

    Args:
        data: Iterable of tab-separated lines.

    Returns:
        List of lines (each ending in a newline), starting with a header
        row.

    If the properties cannot be computed the split line is printed and
    the process exits with status 1.  Only ``Exception`` is caught, so an
    interrupt is no longer reported as a bad line.
    """
    header = "ID\tclass\tindex\tsequon\tsequence\tmol_weight\tgravy\taromaticity\tinstab_index\tiso_point\n"
    new_table = [header]
    for line in data:
        stripped = line.rstrip()
        split_line = stripped.split('\t')
        seq = split_line[-2]  # Sequon, not sequence
        if seq == '' or "X" in seq or '*' in seq:
            continue  # Skip non-usable sequons
        try:
            a_seq = ProteinAnalysis(seq)
            results = [
                a_seq.molecular_weight(),
                a_seq.gravy(),
                a_seq.aromaticity(),
                a_seq.instability_index(),
                a_seq.isoelectric_point(),
            ]
        except Exception:
            print(split_line)
            sys.exit(1)
        new_table.append(
            stripped + "\t{}\t{}\t{}\t{}\t{}\n".format(*results))
    return new_table
コード例 #19
0
def feat_extract(sequences):
    """Build one feature dictionary per protein sequence.

    Each dictionary (a ``defaultdict(float)``) contains the sequence
    length, aromaticity, isoelectric point, the molecular weight (only
    when the sequence has none of ``X``, ``O``, ``U``, ``B``), per-residue
    relative frequencies, counts per property group from the module-level
    ``dic_properties``, and residue frequencies (in units of 1/50) over
    ``sequence[0:50]`` and ``sequence[-51:-1]``.

    NOTE(review): the end window ``[-51:-1]`` leaves out the final
    character - confirm whether that is intended.

    Returns:
        List of feature dictionaries, in input order.
    """
    all_features = []
    for sequence in sequences:
        analysis = ProteinAnalysis(sequence)
        n_residues = len(sequence)
        features = defaultdict(float)

        features["sequence_length"] = n_residues
        features["aromaticty"] = analysis.aromaticity()
        features["isoeletric_point"] = analysis.isoelectric_point()

        has_nonstandard = any(
            code in sequence for code in ('X', 'O', 'U', 'B'))
        if not has_nonstandard:
            features["molecular_weight"] = analysis.molecular_weight()

        for residue in sequence:
            features["relative_fre_{}".format(residue)] += 1 / n_residues
            for group_name in dic_properties:
                if residue in dic_properties[group_name]:
                    features['freq_{}'.format(group_name)] += 1

        head = sequence[0:50]
        tail = sequence[-51:-1]
        for residue in head:
            features["relative_fre_start{}".format(residue)] += 1 / 50
        for residue in tail:
            features["relative_fre_end{}".format(residue)] += 1 / 50

        all_features.append(features)
    return all_features
コード例 #20
0
def featureExtraction(train_df, test_df):
    """Derive peptide attributes for the training and test tables.

    ``train_df`` holds the labels in column 0 and the peptides in column
    1; ``test_df`` holds the peptides in column 0.  Both peptide columns
    are stacked, four ProteinAnalysis attributes are computed per
    peptide, and the result is split back into its two parts.  Peptides
    containing ``X`` or ``Z`` get -1 for every attribute.

    Returns:
        ``(train_features, test_features, Y)`` where ``Y`` is column 0 of
        the original ``train_df`` and the feature frames have the columns
        ``molecular_weight``, ``isoelectric_point``, ``aromaticity`` and
        ``stability`` (the instability index).
    """
    feature_columns = (
        'molecular_weight', 'isoelectric_point', 'aromaticity', 'stability')

    n_train = len(train_df)
    Y = train_df[0]
    # Move the training peptides into column 0 so both frames line up.
    peptides = train_df.drop(columns=0).rename(columns={1: 0})
    big = pd.concat([peptides, test_df], ignore_index=True)
    for column in feature_columns:
        big[column] = 0.0

    for i in range(len(big)):
        peptide = big.iloc[i, 0]
        if 'X' in peptide or 'Z' in peptide:
            # Invalid peptide: flag every attribute with -1.
            values = (-1, -1, -1, -1)
        else:
            model = ProteinAnalysis(peptide)
            values = (
                model.molecular_weight(),
                model.isoelectric_point(),
                model.aromaticity(),
                model.instability_index(),
            )
        for column, value in zip(feature_columns, values):
            big.at[i, column] = value

    big = big.drop(columns=0)
    return big.iloc[:n_train, ], big.iloc[n_train:, ], Y
コード例 #21
0
def getProps(f):
    """
    Load ``f`` with ``myPDB.loader`` and summarise it using Biopython.

    Returns a tuple of (molecular weight of the loaded sequence, maximum
    of its flexibility profile, sum of the loader's ``ASA`` values).
    """
    structure = myPDB.loader(f)
    analysis = ProteinAnalysis(structure.seq)
    weight = analysis.molecular_weight()
    peak_flexibility = np.max(analysis.flexibility())
    total_asa = np.sum(structure.ASA)
    return weight, peak_flexibility, total_asa
コード例 #22
0
def get_protein_features(seq):
    """Compute a named list of whole-protein features for ``seq``.

    The sequence is first passed through ``correct``.  Weight,
    instability index and GRAVY come from the module-level helpers
    ``molecular_weight``, ``instability_index`` and ``hydrophobicity``;
    the remaining values come from ``ProteinAnalysis``.

    Returns:
        ``(names, values)``: the feature names and the matching values,
        with the secondary-structure fractions last.
    """
    seq = correct(seq)
    analysis = ProteinAnalysis(seq)

    weight = molecular_weight(seq)
    iso_point = analysis.isoelectric_point()

    counts = analysis.count_amino_acids()
    acidic = counts['D'] + counts['E']
    basic = counts['K'] + counts['R']
    # Two extinction-coefficient estimates: Tyr/Trp only, and with the
    # additional Cys term.
    coeff_without_cys = counts['Y'] * 1490 + counts['W'] * 5500
    coeff_with_cys = (counts['Y'] * 1490 + counts['W'] * 5500
                      + counts['C'] * 125)

    instability = instability_index(seq)
    gravy = hydrophobicity(seq)
    fractions = list(analysis.secondary_structure_fraction())

    names = [
        'length', 'weight', 'pI', 'neg_charged_residues',
        'pos_charged_residues', 'extinction_coeff1', 'extinction_coeff2',
        'instability_index', 'gravy', 'helix', 'turn', 'sheet'
    ]
    values = [
        len(seq), weight, iso_point, acidic, basic,
        coeff_without_cys, coeff_with_cys, instability, gravy,
    ]
    values.extend(fractions)

    return names, values
コード例 #23
0
def prot_feats_seq(seq):
    """Return the concatenated, L2-normalised feature vector of ``seq``.

    The vector is made of three parts, each normalised on its own: the
    fractions of the 20 standard amino acids, the output of
    ``twomerFromSeq`` and the output of ``threemerFromSeq``.

    ``molecular_weight()`` is called only for its side effect: per the
    original author's note it raises when the sequence contains ``X``,
    which lets callers skip such sequences.
    """
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    text = str(seq)

    def unit_row(values):
        # L2-normalise a 1-D vector and hand back the single row.
        row = normalize(np.atleast_2d(values), norm='l2', copy=True,
                        axis=1, return_norm=False)
        return row[0]

    analysis = ProteinAnalysis(text)
    analysis.molecular_weight()
    percents = analysis.get_amino_acids_percent()

    features = []
    features.extend(unit_row(np.array([percents[r] for r in residues])))
    features.extend(unit_row(np.array(twomerFromSeq(text))))
    features.extend(unit_row(np.array(threemerFromSeq(text))))

    return np.array(features)
コード例 #24
0
ファイル: models.py プロジェクト: Amrithasuresh/BPPRC_v1
 def get_secondary_structure(self):
     """Return the helix, turn and sheet fractions of ``self.sequence``.

     The three values come from
     ``ProteinAnalysis.secondary_structure_fraction()`` and are returned
     as strings formatted with two decimals.
     """
     fractions = ProteinAnalysis(self.sequence).secondary_structure_fraction()
     helix, turn, sheet = (
         "{0:0.2f}".format(fractions[position]) for position in range(3))
     return helix, turn, sheet
コード例 #25
0
def prot_feats(filename):
    """Compute ``prot_feats_seq`` features for every usable FASTA record.

    Records for which the molecular weight or the feature computation
    fails are skipped (per the original author's note,
    ``molecular_weight()`` raises when the sequence contains ``X``).

    Args:
        filename: FASTA file, passed to ``SeqIO.parse``.

    Returns:
        ``(XX, ids)``: array with one feature row per kept record, and
        the list of matching record ids.

    Only ``Exception`` is caught, so interrupting a long run with Ctrl-C
    is no longer silently treated as "skip this record".
    """
    XX = []
    ids = []

    for rec in SeqIO.parse(filename, "fasta"):
        sequence = str(rec.seq)
        try:
            # Cheap validity probe before the expensive feature pass.
            ProteinAnalysis(sequence).molecular_weight()
            feats = list(prot_feats_seq(sequence))
        except Exception:
            continue
        XX.append(feats)
        ids.append(rec.id)

    return np.array(XX), ids
コード例 #26
0
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter nucleotide FASTA records by GC content and aromaticity.

    A record is kept when ``SeqUtils.GC`` of its sequence is at least
    ``filt_gc`` and the aromaticity of its translation is at least
    ``filt_arom``.  The translations of the kept records are written as
    FASTA, and the kept fraction is printed.

    Fixes relative to the previous version:
      * ``record.se`` (AttributeError as soon as a record passed the
        filter) is now ``record.seq``;
      * the result is written to ``output_file`` instead of the
        hard-coded name ``'my_fasta'``;
      * the output handle is closed by a ``with`` block;
      * an input without records prints 0.0 instead of raising
        ZeroDivisionError;
      * each sequence is translated once rather than twice.

    Args:
        input_file: Path of the nucleotide FASTA file to read.
        output_file: Path of the protein FASTA file to write.
        filt_gc: Minimum GC value as returned by ``SeqUtils.GC``.
        filt_arom: Minimum aromaticity of the translated sequence.
    """
    kept = {}  # record id -> translated sequence (later ids overwrite)
    total = 0

    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            total += 1

            calc_gc = SeqUtils.GC(record.seq)

            prot_seq = record.seq.translate()
            calc_arom = ProteinAnalysis(str(prot_seq)).aromaticity()

            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                kept[record.id] = prot_seq

    records = [
        SeqRecord(prot_seq, id=seq_id, description="")
        for seq_id, prot_seq in kept.items()
    ]

    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')

    # Fraction of input records that passed the filter.
    print(len(records) / total if total else 0.0)
コード例 #27
0
ファイル: protParam.py プロジェクト: j-brady/nmrsa
def protParam(seq):
    """Print a ProtParam-style report for the protein sequence ``seq``.

    Printed in order: amino-acid counts, amino-acid fractions, molecular
    weight, GRAVY, isoelectric point, aromaticity and an extinction
    coefficient computed from the W, Y and C counts.  Nothing is
    returned.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120
    # Computed as in the original but not part of the printed report.
    mgml = ext_coeff * (1. / mw)

    for title, table in (("Amino acid count", c_aa),
                         ("Amino acid percent", p_aa)):
        print(title)
        pprint.pprint(table)

    print("Molecular weight")
    print("%f Da" % mw)

    for title, value in (("Gravy", gravy),
                         ("Isoelectric point", isoelectric_point),
                         ("Aromaticity", aromaticity)):
        print(title)
        print(value)

    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
コード例 #28
0
ファイル: lasso_module.py プロジェクト: cschwal/rodeo2
 def set_monoisotopic_mass(self):
     """Compute and store ``self._monoisotopic_weight`` for the core.

     The value is the monoisotopic molecular weight of ``self.core`` with
     every ``X`` removed, plus 2 per bridge (``self._num_bridges``, set
     by ``_set_number_bridges``), minus 18.02.
     """
     self._set_number_bridges()
     bridge_mass = 2 * self._num_bridges
     # dehydration indicative of cyclization
     dehydration = 18.02
     core_without_x = self.core.replace('X', '')
     core_mass = ProteinAnalysis(
         core_without_x, monoisotopic=True).molecular_weight()
     self._monoisotopic_weight = core_mass + bridge_mass - dehydration
コード例 #29
0
def protein_properties(seq):
    """Return a ProtProp record with some protein biochemical properties

    seq is a Bio.Seq.Seq or str representing protein sequence

    The instability index and GRAVY are set to None when their
    computation raises KeyError.
    """
    pa = ProteinAnalysis(seq)

    def value_or_none(compute):
        # Run one ProteinAnalysis method, mapping KeyError to None.
        try:
            return compute()
        except KeyError:
            return None

    aa_counts = pa.count_amino_acids()
    arom = pa.aromaticity()
    isoelec = pa.isoelectric_point()
    instability = value_or_none(pa.instability_index)
    gravy = value_or_none(pa.gravy)

    return ProtProp(aa=str(seq),
                    gravy=gravy,
                    aromaticity=arom,
                    isoelectric_point=isoelec,
                    instability=instability,
                    aa_counts=aa_counts)
コード例 #30
0
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """
    Encode a sequence window as a list of numbers.

    The input is passed through ``clean`` and then ``windower`` (centred
    on the middle of the cleaned sequence, ``window`` residues on each
    side).  Each residue is mapped to an int; 0 stands for ``X`` and is
    also used to pad the vector up to ``2 * window + 1`` entries.
    When ``chemical == 1`` the GRAVY score, instability index and
    aromaticity of the window are appended.
    """
    aa = {"G": 1, "A": 2, "L": 3, "M": 4, "F": 5, "W": 6, "K": 7, "Q": 8, "E": 9, "S": 10, "P": 11, "V": 12, "I": 13,
          "C": 14, "Y": 15, "H": 16, "R": 17, "N": 18, "D": 19, "T": 20, "X": 0}

    cleaned = clean(temp_window)
    centred = windower(sequence=cleaned, position=int(len(cleaned)*.5), wing_size=window)

    vec = [aa[residue] for residue in centred]

    # Pad short windows with zeros up to the full window length.
    shortfall = (window*2)+1 - len(vec)
    if shortfall:
        vec.extend(0 for _ in range(shortfall))

    # Chemical descriptors are optional.
    if chemical == 1:
        analysis = ProteinAnalysis(centred)
        vec.extend((
            analysis.gravy(),
            analysis.instability_index(),
            analysis.aromaticity(),
        ))

    return vec