def Run_Uverskey(Fasta1, Fasta2, OutFile):
    """
    Plot mean hydropathy against mean absolute charge for two sequence sets.

    Every sequence of both FASTA files is reduced to two numbers: the average
    hydropathy and the average absolute charge. The two sets are drawn as
    point clouds labelled "Amyload" and "Disprot", a fixed straight line is
    overlaid, and the figure is written to 'OutFile'.

    The plot is drawn on a dedicated figure that is closed afterwards, so
    repeated calls (or other pyplot users in the same process) do not pile
    their curves onto one shared figure.

    Args:
        Fasta1 (str): path of the FASTA file plotted as "Amyload".
        Fasta2 (str): path of the FASTA file plotted as "Disprot".
        OutFile (str): destination handed to pyplot's savefig.
    """
    amyload_seq = load_fasta_file(Fasta1)
    disprot_seq = load_fasta_file(Fasta2)

    # default=0: letters missing from the mappings are scored as neutral
    # instead of aborting the conversion.
    net_abs_charge = Feature(get_aa2charge(default=0)).then(average_absolute)
    mean_hydropathy = Feature(get_aa2hydropathy(default=0)).then(average)

    uversky_fs = FeatureSet("uversky")
    uversky_fs.add(mean_hydropathy, name="mean_hydropathy")
    uversky_fs.add(net_abs_charge, name="net_abs_charge")

    amyload_uversky_seq = uversky_fs(amyload_seq)
    disprot_uversky_seq = uversky_fs(disprot_seq)

    # First set: pick each feature column by name.
    amyload_data_x = amyload_uversky_seq.columns(feature="mean_hydropathy")[0]
    amyload_data_y = amyload_uversky_seq.columns(feature="net_abs_charge")[0]

    # Second set: compact the result and take the columns in the order the
    # features were added (hydropathy first, charge second).
    disprot_data = compact(disprot_uversky_seq).columns()

    # plt.figure() makes the new figure current, so the pyplot calls below
    # all land on it; it is released even if plotting or saving fails.
    fig = plt.figure()
    try:
        plt.plot(amyload_data_x, amyload_data_y, '.', label="Amyload")
        plt.plot(disprot_data[0], disprot_data[1], '.', label="Disprot")
        # Fixed black reference line through (-0.78, 0.0) and (0.835, 0.5).
        plt.plot([-0.78, 0.835], [0.0, 0.5], 'k')
        plt.xlabel("mean hydrophobicity")
        plt.ylabel("net abs charge")
        plt.legend()
        plt.savefig(OutFile)
    finally:
        plt.close(fig)
def _get_feature_map(index='JOND920101'):
    """
    Build a Feature from an AAindex entry, scoring the '-' symbol as 0.0.

    Args:
        index (str): AAindex entry identifier passed to get_aaindex_file.

    Returns:
        Feature wrapping the loaded mapping, extended with '-' -> 0.0.
    """
    index_mapping = get_aaindex_file(index)
    # Register the '-' symbol directly in the mapping's lookup table.
    index_mapping.mapping['-'] = 0.0
    return Feature(index_mapping)
def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel,
        pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath):
    """
    Run a 2D local Fisher test on two sequence sets and report the result.

    Both FASTA files are quantified by average volume and average
    hydropathy, compared with local_fisher_2d, plotted to "out.png" in the
    current working directory, and finally HTML_Gen is called with the
    report path.

    Args:
        Fasta1, Fasta2: paths of the two FASTA files (first and second
            population passed to local_fisher_2d).
        windows_per_frame, overlap_factor: converted with int() and passed
            to local_fisher_2d.
        xlabel, ylabel, pop1_label, pop2_label: labels forwarded to the
            plotting routine.
        htmlOutDir: directory created if it does not exist yet.
        htmlFname: file name of the HTML report.
        Workdirpath: base directory joined with htmlOutDir and htmlFname to
            form the path given to HTML_Gen.
    """
    if not os.path.exists(htmlOutDir):
        os.makedirs(htmlOutDir)

    positives = load_fasta_file(Fasta1)
    negatives = load_fasta_file(Fasta2)

    # Quantitative features: volume first, hydropathy second.
    volume_feat = Feature(get_aa2volume()).then(average)
    hydropathy_feat = Feature(get_aa2hydropathy()).then(average)
    quantifier = FeatureSet("volume'n'hydropathy")
    for feat in (volume_feat, hydropathy_feat):
        quantifier.add(feat)

    positives_conv = quantifier(positives)
    negatives_conv = quantifier(negatives)

    # Local Fisher test on the two quantified populations.
    fisher_result = local_fisher_2d(
        positives_conv,
        negatives_conv,
        windows_per_frame=int(windows_per_frame),
        overlap_factor=int(overlap_factor),
    )

    # NOTE(review): the image goes to the current working directory, while
    # the HTML report goes under Workdirpath/htmlOutDir - confirm that
    # HTML_Gen expects to find "out.png" there.
    image_path = os.path.join(os.getcwd(), "out.png")
    _plot_local_fisher_2d(
        fisher_result,
        xlabel=xlabel,
        ylabel=ylabel,
        pop1_label=pop1_label,
        pop2_label=pop2_label,
        out_file_path=image_path,
    )

    report_path = os.path.join(Workdirpath, htmlOutDir, htmlFname)
    HTML_Gen(report_path)
def Run_ngrams(fasta1, fasta2, OutFile):
    """
    Fit Zipf's law to the trigram counts of a sequence set and plot the fit.

    The trigram distribution of the sequences in 'fasta2' is fitted with
    zipf_law_fit. The empirical rank/count curve and the fitted curve are
    drawn on log-log axes and saved to 'OutFile'.

    The plot is drawn on a dedicated figure that is closed afterwards, so it
    never mixes with curves left on pyplot's shared figure by other calls.

    Args:
        fasta1 (str): kept for backward compatibility and not read. Earlier
            versions loaded it only to compute pattern features and bigram
            counts whose results were discarded.
        fasta2 (str): path of the FASTA file whose trigrams are fitted.
        OutFile (str): destination handed to pyplot's savefig.
    """
    amyload_pos_seq = load_fasta_file(fasta2)

    result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True)

    # Empirical rank-frequency curve: counts in descending order, rank 1..N.
    counts = sorted(result_fit["ngram_counts"], reverse=True)
    ranks = range(1, len(counts) + 1)

    # Fitted curve: rank**-slope weights, normalised to sum to one and then
    # scaled to the same total count as the empirical data.
    slope = result_fit["slope"]
    harmonic_num = sum(rank ** -slope for rank in ranks)
    total_count = sum(counts)  # loop-invariant, computed once
    fitted_counts = [(rank ** -slope) / harmonic_num * total_count
                     for rank in ranks]

    # plt.figure() makes the new figure current; it is released even if
    # plotting or saving fails.
    fig = plt.figure()
    try:
        plt.plot(ranks, counts, 'k', label="empirical")
        plt.plot(ranks, fitted_counts, 'k--',
                 label="Zipf's law\nslope: {:.2f}".format(slope))
        plt.xlabel('rank')
        plt.ylabel('count')
        plt.xscale('log')
        plt.yscale('log')
        plt.legend()
        plt.savefig(OutFile)
    finally:
        plt.close(fig)
def then_all(self, function, name=None, window=0, **params):
    """
    Append a post-processing step to this feature and return the feature.

    'function' may be a Feature or a plain callable. A Feature is
    deep-copied and used as the post-processor as is. A callable is wrapped
    in a new Feature, and only then are the remaining arguments used.

    The object is modified in place.

    Args:
        function (Feature or function): the post-processor.
        name (str): name for the wrapping feature. If None (default), the
            callable's own __name__ is used.
        window (int): length of the window over which the feature is
            calculated. Defaults to the whole sequence (window=0).
        params (**kwargs): arbitrary parameters passed on to the callable.

    Returns the self to allow feature chaining.
    """
    if not isinstance(function, Feature):
        label = function.__name__ if name is None else name
        appended = Feature(function, name=label, window=window, **params)
    else:
        # Copy, so that later changes to the chain never touch the caller's
        # Feature object.
        appended = copy.deepcopy(function)

    if self._post_feat is not None:
        # A post-processor already exists: extend its own chain.
        self._post_feat.then(appended)
    else:
        self._post_feat = appended

    self._post_feat_name += ">" + appended.name
    return self
import os
import sys

# Make the package importable when the script is run from its own directory.
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import average

# Load data:
seq = load_fasta_file("data/Alphasyn.fasta")

# Build a feature: average polarity (Graham, 1974), AAindex entry: GRAR740102:
feat = Feature(get_aaindex_file("GRAR740102")).then(average)

# Add the feature to new feature set:
fs = FeatureSet("my set")
fs.add(feat)

# Process sequences:
res_seq = fs(seq)

# Export average polarities
res = res_seq.columns()

# Single-argument print(...) gives the same output on Python 2 and also
# parses on Python 3, unlike the bare print statement.
print(res)
# Find and count matches to a pattern 'VT'
fs_aa = FeatureSet("aa patterns")
fs_aa.add(identity)
fs_aa.add(pattern_match, pattern='VT', padded=True)
fs_aa.add(pattern_count, pattern='VT')

result_seq = fs_aa(alphasyn_seq)

# Single-argument print(...) is used throughout: same output on Python 2,
# and valid syntax on Python 3 as well.
for seq in result_seq[:3]:
    print(seq)

# ...and something much more subtle:
# Map a sequence to the hydropathy scale, and search for the pattern 0.0 - 2.0
# with the similarity radius 1.0 in the L1 norm (the 'taxi' metric).
fs_hp = FeatureSet("hydropathy patterns")
fs_hp.add(Feature(get_aa2hydropathy()))
fs_hp.add(Feature(get_aa2hydropathy()).then(pattern_match,
                                            pattern=[0.0, 2.0],
                                            metric='taxi',
                                            radius=1.0))
result_seq2 = fs_hp(alphasyn_seq)

for seq in result_seq2[:2]:
    print(seq)

# Calculate bigram frequencies in 'alphasyn_seq':
result_freq = ngram_count(alphasyn_seq, n=2)
print(result_freq)

# Fit Zipf's law for a trigram distribution in 'amyload_pos_seq':
result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True)

# Calculate the empirical rank-frequency plot:
args = parser.parse_args() # Retrieve AAindex mappings for the properties if and as requested # property1 if args.property1 is not None: try: aa_mapping1 = get_aaindex_file(args.property1) except ValueError: aa_mapping1 = get_aaindex_www(args.property1) try: aa_mapping1.default = float(args.default1) except (TypeError, ValueError): aa_mapping1.default = args.default1 feat1 = Feature(aa_mapping1) else: feat1 = Feature(identity) # property2 if args.property2 is not None: try: aa_mapping2 = get_aaindex_file(args.property2) except ValueError: aa_mapping2 = get_aaindex_www(args.property2) try: aa_mapping2.default = float(args.default2) except (TypeError, ValueError): aa_mapping2.default = args.default2 feat2 = Feature(aa_mapping2) else:
aa_mapping = get_aaindex_www(args.property) # Simplify if and as requested if args.simplify is not None: aa_mapping = simplify(aa_mapping, aa_mapping.__name__+"/"+args.classes, method=args.simplify, k=int(args.classes), iters=int(args.iterations)) # Assign 'default' value for the Mapping try: aa_mapping.default = float(args.default) except (TypeError, ValueError): aa_mapping.default = args.default # Make a Feature from the Mapping feat = Feature(aa_mapping) else: feat = Feature(identity) # Add the Feature to a FeatureSet fs = FeatureSet("fs") fs.add(feat) # And use it to convert the input set conv_seq = fs(input_seq) # Get the alphabet of the converted set alphabet = list(set([element for seq in conv_seq for element in seq.data])) # Prepare the n-gram counts extractor nfs = NgramFeatureSet('ngram_'+args.n, n=int(args.n), alphabet=alphabet,
for record in SeqIO.parse("sequence_2.fasta", "fasta"): length_seqs.append(len(record)) #print((record)) #load the sequence from the file seq = load_fasta_file("sequence_2.fasta") SequenceIds = [] SequenceIds2_list = [] for i in SequenceSet.ids(seq): SequenceIds.append(i) for i in SequenceIds: SequenceIds2 = i[i.find("[") + 1:i.find("]")] SequenceIds2_list.append(SequenceIds2) #gather important protein features polarity = Feature(get_aaindex_file("GRAR740102")).then(average) hydropathy = Feature(get_aaindex_file("KYTJ820101")).then(average) iso_point = Feature(get_aaindex_file("ZIMJ680104")).then(average) pk_COOH = Feature(get_aaindex_file("JOND750102")).then(average) entropy_form = Feature(get_aaindex_file("HUTJ700103")).then(average) melting_point = Feature(get_aaindex_file("FASG760102")).then(average) net_charge = Feature(get_aaindex_file("KLEP840101")).then(average) glycine = Feature(pattern_count, pattern='G') RGD = Feature(pattern_count, pattern='RGD') GFPGER = Feature(pattern_count, pattern='GFPGER') #Build the feature set fs = FeatureSet("my set") fs1 = FeatureSet("test") fs2 = FeatureSet("glycine") fs3 = FeatureSet("GFPGER")
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import identity

# Load the 'data/Alphasyn.fasta' sequence set, which contains several
# peptides from alpha-synuclein deposited in the Amyload database:
alphasyn_seq = load_fasta_file("data/Alphasyn.fasta")

# Retrieve predefined mapping from amino acids to formal charge,
# and AAindex mapping to relative frequency of occurrence (entry: JOND920101)
aa2charge_map = get_aa2charge()
aa2freq_map = get_aaindex_file("JOND920101")
# Single-argument print(...) is used throughout: same output on Python 2,
# and valid syntax on Python 3 as well.
print(aa2charge_map)
print(aa2freq_map)

# Make Feature objects based on Mappings:
charge_feat = Feature(aa2charge_map)
freq_feat = Feature(aa2freq_map)
print(charge_feat)
print(freq_feat)

# And use them to convert 1st sequence in 'alphasyn_seq':
print(charge_feat(alphasyn_seq[0]))
print(freq_feat(alphasyn_seq[0]))

# Make a FeatureSet from a Feature and Mappings:
fs = FeatureSet("basic features")
fs.add(charge_feat)
fs.add(aa2freq_map, name="frequency")
fs.add(get_aa2hydropathy())
print(fs)
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy
from quantiprot.metrics.basic import average

# Local Fisher-test related imports:
from quantiprot.analysis.fisher import local_fisher_2d, _plot_local_fisher_2d
from matplotlib import pyplot as plt

# Load sets of amyloidogenic and non-amyloidogenic peptides:
amyload_pos_seq = load_fasta_file("data/Amyload_positive.fasta")
amyload_neg_seq = load_fasta_file("data/Amyload_negative.fasta")

# Calculate quantitative features: volume and hydropathy
# (each residue is mapped to its value, then the values are averaged)
mean_volume = Feature(get_aa2volume()).then(average)
mean_hydropathy = Feature(get_aa2hydropathy()).then(average)

# One feature set holding both features: volume first, hydropathy second.
fs = FeatureSet("volume'n'hydropathy")
fs.add(mean_volume)
fs.add(mean_hydropathy)

# Quantify both populations with the same feature set.
amyload_pos_conv_seq = fs(amyload_pos_seq)
amyload_neg_conv_seq = fs(amyload_neg_seq)

# Do local Fisher:
result = local_fisher_2d(amyload_pos_conv_seq, amyload_neg_conv_seq,
                         windows_per_frame=5, overlap_factor=5)
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy, get_aa2volume
from quantiprot.utils.mapping import simplify

# Quantification-related imports:
from quantiprot.metrics.basic import identity, average, sum_absolute, uniq_count
from quantiprot.utils.sequence import compact

# Load some data:
alphasyn_seq = load_fasta_file("data/Alphasyn.fasta")

# Prepare Features:
charge_sum_abs_feat = Feature(get_aa2charge()).then(sum_absolute)
hydropathy_average_feat = Feature(get_aa2hydropathy()).then(average)
# Volume is first simplified to k=3 levels; uniq_count then quantifies the
# simplified sequence.
volume_levels_feat = Feature(
    simplify(get_aa2volume(), name="volume levels", k=3)).then(uniq_count)

# Prepare a FeatureSet
fs = FeatureSet("simple quantification")
fs.add(hydropathy_average_feat)
fs.add(charge_sum_abs_feat)
fs.add(volume_levels_feat)

# And use it to quantify protein sequence(s):
result_seq = fs(alphasyn_seq)

# Single-argument print(...) is used: same output on Python 2, and valid
# syntax on Python 3 as well.
print(result_seq)
for seq in result_seq:
    print(seq)
sys.path.insert(0, os.path.abspath('..')) # Uversky plot from quantiprot.utils.io import load_fasta_file from quantiprot.utils.feature import Feature, FeatureSet from quantiprot.utils.sequence import compact from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy from quantiprot.metrics.basic import average, average_absolute from matplotlib import pyplot as plt amyload_seq = load_fasta_file("data/Amyload_positive.fasta") disprot_seq = load_fasta_file("data/Disprot.fasta") # Non-standard letters in Disprot assigned neutral charge and hydropathy: net_abs_charge = Feature(get_aa2charge(default=0)).then(average_absolute) mean_hydropathy = Feature(get_aa2hydropathy(default=0)).then(average) uversky_fs = FeatureSet("uversky") uversky_fs.add(mean_hydropathy, name="mean_hydropathy") uversky_fs.add(net_abs_charge, name="net_abs_charge") amyload_uversky_seq = uversky_fs(amyload_seq) disprot_uversky_seq = uversky_fs(disprot_seq) # First approach to get hydrophobicity/charge pairs amyload_data_x = amyload_uversky_seq.columns(feature="mean_hydropathy")[0] amyload_data_y = amyload_uversky_seq.columns(feature="net_abs_charge")[0] plt.plot(amyload_data_x, amyload_data_y, '.', label="Amyload") # Second approach to get hydrophobicity/charge pairs
# Assign 'default' value for the Mapping try: aa_mapping.default = float(args.default) except (TypeError, ValueError): aa_mapping.default = args.default # Simplify if and as requested if args.simplify is not None: aa_mapping = simplify(aa_mapping, aa_mapping.__name__ + "/" + args.classes, method=args.simplify, k=int(args.classes), iters=int(args.iterations)) # Make a Feature from the Mapping feat = Feature(aa_mapping) else: feat = Feature(identity) # Order quantification if and as requested if args.quantify is not None: quantify_method = { 'sum': sum, 'sum_abs': sum_absolute, 'avg': average, 'avg_abs': average_absolute, 'rec': recurrence, 'det': determinism, 'pal': palindromism, 'ratio_det': ratio_determinism,