Esempio n. 1
0
def Run_Uverskey(Fasta1, Fasta2, OutFile):
    """Save a mean-hydropathy vs. net-absolute-charge scatter plot of two FASTA sets.

    Both sequence sets are converted with the same two-feature FeatureSet
    ("mean_hydropathy" and "net_abs_charge"), plotted as dots on one figure
    together with a fixed black reference line, and written to ``OutFile``.

    The plot is drawn on a figure created for this call and closed afterwards,
    so repeated calls neither overlay earlier plots nor leave figures open.

    Args:
        Fasta1: path given to load_fasta_file; plotted with the label "Amyload".
        Fasta2: path given to load_fasta_file; plotted with the label "Disprot".
        OutFile: destination handed to plt.savefig.
    """
    amyload_seq = load_fasta_file(Fasta1)
    disprot_seq = load_fasta_file(Fasta2)

    # default=0: letters missing from the mappings get neutral charge/hydropathy.
    net_abs_charge = Feature(get_aa2charge(default=0)).then(average_absolute)
    mean_hydropathy = Feature(get_aa2hydropathy(default=0)).then(average)

    uversky_fs = FeatureSet("uversky")
    uversky_fs.add(mean_hydropathy, name="mean_hydropathy")
    uversky_fs.add(net_abs_charge, name="net_abs_charge")

    amyload_uversky_seq = uversky_fs(amyload_seq)
    disprot_uversky_seq = uversky_fs(disprot_seq)

    # Select the columns by feature name for the first set...
    amyload_data_x = amyload_uversky_seq.columns(feature="mean_hydropathy")[0]
    amyload_data_y = amyload_uversky_seq.columns(feature="net_abs_charge")[0]
    # ...and by position (order in which features were added) for the second.
    disprot_data = compact(disprot_uversky_seq).columns()

    # Draw on a dedicated figure instead of whichever one happens to be current.
    fig = plt.figure()
    try:
        plt.plot(amyload_data_x, amyload_data_y, '.', label="Amyload")
        plt.plot(disprot_data[0], disprot_data[1], '.', label="Disprot")

        # Fixed reference line; presumably the Uversky order/disorder
        # boundary - TODO confirm the source of these coordinates.
        plt.plot([-0.78, 0.835], [0.0, 0.5], 'k')
        plt.xlabel("mean hydrophobicity")
        plt.ylabel("net abs charge")
        plt.legend()

        plt.savefig(OutFile)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
Esempio n. 2
0
def _get_feature_map(index='JOND920101'):
    """Return a Feature built from the AAindex entry named by ``index``.

    The mapping is obtained with get_aaindex_file; before it is wrapped in
    a Feature, the symbol '-' is assigned the value 0.0.
    """
    index_mapping = get_aaindex_file(index)
    # Make '-' resolve to 0.0.
    index_mapping.mapping['-'] = 0.0
    return Feature(index_mapping)
Esempio n. 3
0
def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel,
        pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath):
    """Run a 2D local Fisher analysis on two FASTA sets and produce the report.

    Both sets are converted to (mean volume, mean hydropathy) pairs, compared
    with local_fisher_2d, plotted to "out.png" in the current working
    directory, and finally HTML_Gen is called for the report file.

    Args:
        Fasta1: path of the first sequence set (passed to load_fasta_file).
        Fasta2: path of the second sequence set (passed to load_fasta_file).
        windows_per_frame: converted with int() and forwarded to local_fisher_2d.
        overlap_factor: converted with int() and forwarded to local_fisher_2d.
        xlabel, ylabel: axis labels forwarded to the plotting helper.
        pop1_label, pop2_label: population labels forwarded to the plotting helper.
        htmlOutDir: output directory; created if it does not exist.
        htmlFname: name of the HTML file inside ``htmlOutDir``.
        Workdirpath: directory prefix joined with ``htmlOutDir`` and ``htmlFname``.

    Raises:
        OSError: if ``htmlOutDir`` cannot be created and is not already a directory.
    """
    # Create the directory first and tolerate "already exists": avoids the
    # race between a separate existence check and the creation.
    # NOTE(review): the directory is created relative to the current working
    # directory while the HTML path is prefixed with Workdirpath - confirm
    # that both refer to the same location.
    try:
        os.makedirs(htmlOutDir)
    except OSError:
        if not os.path.isdir(htmlOutDir):
            raise

    amyload_pos_seq = load_fasta_file(Fasta1)
    amyload_neg_seq = load_fasta_file(Fasta2)

    # Calculate quantitative features: volume and hydropathy
    mean_volume = Feature(get_aa2volume()).then(average)
    mean_hydropathy = Feature(get_aa2hydropathy()).then(average)

    fs = FeatureSet("volume'n'hydropathy")
    fs.add(mean_volume)
    fs.add(mean_hydropathy)

    amyload_pos_conv_seq = fs(amyload_pos_seq)
    amyload_neg_conv_seq = fs(amyload_neg_seq)

    # Do local Fisher:
    result = local_fisher_2d(amyload_pos_conv_seq,
                             amyload_neg_conv_seq,
                             windows_per_frame=int(windows_per_frame),
                             overlap_factor=int(overlap_factor))

    # Plot local Fisher (the image always goes to <cwd>/out.png):
    _plot_local_fisher_2d(result,
                          xlabel=xlabel,
                          ylabel=ylabel,
                          pop1_label=pop1_label,
                          pop2_label=pop2_label,
                          out_file_path=os.path.join(os.getcwd(), "out.png"))

    HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname))
def Run_ngrams(fasta1, fasta2, OutFile):
    """Fit Zipf's law to trigram counts of a FASTA set and save a rank-count plot.

    The empirical trigram counts (sorted in descending order) and the counts
    predicted by the fitted slope are drawn on log-log axes and written to
    ``OutFile``. The plot uses its own figure, which is closed afterwards.

    Args:
        fasta1: kept for interface compatibility. The earlier implementation
            computed pattern features and bigram counts from this file and
            then discarded them; nothing derived from it reached the plot,
            so the file is no longer read.
        fasta2: path of the sequence set passed to zipf_law_fit.
        OutFile: destination handed to plt.savefig.
    """
    amyload_pos_seq = load_fasta_file(fasta2)
    result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True)

    counts = sorted(result_fit["ngram_counts"], reverse=True)
    ranks = range(1, len(counts) + 1)

    slope = result_fit["slope"]
    # Normalising constant: sum of rank**-slope over all ranks.
    harmonic_num = sum(rank**-slope for rank in ranks)
    # Hoisted out of the comprehension below; it does not depend on the rank.
    total_count = sum(counts)
    fitted_counts = [(rank**-slope) / harmonic_num * total_count
                     for rank in ranks]

    # Draw on a dedicated figure instead of whichever one happens to be current.
    fig = plt.figure()
    try:
        plt.plot(ranks, counts, 'k', label="empirical")
        plt.plot(ranks, fitted_counts, 'k--',
                 label="Zipf's law\nslope: {:.2f}".format(slope))
        plt.xlabel('rank')
        plt.ylabel('count')
        plt.xscale('log')
        plt.yscale('log')
        plt.legend()

        plt.savefig(OutFile)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
Esempio n. 5
0
    def then_all(self, function, name=None, window=0, **params):
        """
        Attach a post-processing stage to this object.

        'function' may be a ready Feature or a plain function. A Feature is
        deep-copied and used as is; a plain function is wrapped in a new
        Feature built from the remaining arguments.

        The object is modified in place: the stage becomes the post-processor
        if none is set yet, otherwise it is chained onto the existing one
        with 'then'. The stage name, prefixed with ">", is appended to the
        stored post-processor name.

        Args:
            function (function): Feature or function to serve as
                a post-processor. The arguments below are used only when
                'function' is not a Feature:
            name (str): name for the new feature. If None (default),
                        the name of 'function' is used.
            window (int): window length passed to the new Feature
                          (default: 0).
            params (**kwargs): extra keyword arguments passed to the new
                               Feature.

        Returns self, so that calls can be chained.
        """
        if not isinstance(function, Feature):
            stage_name = function.__name__ if name is None else name
            stage = Feature(function, name=stage_name, window=window, **params)
        else:
            # Work on a private copy so the caller's Feature stays untouched.
            stage = copy.deepcopy(function)

        if self._post_feat is not None:
            self._post_feat.then(stage)
        else:
            self._post_feat = stage

        self._post_feat_name += ">" + stage.name

        return self
Esempio n. 6
0
import os
import sys

sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import average

# Load data:
seq = load_fasta_file("data/Alphasyn.fasta")

# Build a feature: average polarity (Grantham, 1974), AAindex entry: GRAR740102:
feat = Feature(get_aaindex_file("GRAR740102")).then(average)

# Add the feature to new feature set:
fs = FeatureSet("my set")
fs.add(feat)

# Process sequences:
res_seq = fs(seq)

# Export average polarities
res = res_seq.columns()
# Parenthesised single-argument form: same output on Python 2, valid on Python 3.
print(res)
Esempio n. 7
0
# Find and count matches to a pattern 'VT'
fs_aa = FeatureSet("aa patterns")
fs_aa.add(identity)
fs_aa.add(pattern_match, pattern='VT', padded=True)
fs_aa.add(pattern_count, pattern='VT')

result_seq = fs_aa(alphasyn_seq)

# Show the first three converted sequences.
# (Parenthesised single-argument print: same output on Python 2, valid on Python 3.)
for seq in result_seq[:3]:
    print(seq)

# ...and something much more subtle:
# Map a sequence to the hydropathy scale, and search for the pattern 0.0 - 2.0
# with the similarity radius 1.0 in the L1 norm (the 'taxi' metric).
fs_hp = FeatureSet("hydropathy patterns")
fs_hp.add(Feature(get_aa2hydropathy()))
fs_hp.add(Feature(get_aa2hydropathy()).then(pattern_match, pattern=[0.0, 2.0],
                                            metric='taxi', radius=1.0))
result_seq2 = fs_hp(alphasyn_seq)

for seq in result_seq2[:2]:
    print(seq)

# Calculate bigram frequencies in 'alphasyn_seq':
result_freq = ngram_count(alphasyn_seq, n=2)
print(result_freq)

# Fit Zipf's law for a trigram distribution in 'amyload_pos_seq':
result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True)

# Calculate the empirical rank-frequency plot:
Esempio n. 8
0
# Parse the command-line arguments ('parser' is defined outside this excerpt).
args = parser.parse_args()

# Retrieve AAindex mappings for the properties if and as requested
# property1
if args.property1 is not None:
    # Try get_aaindex_file first; fall back to get_aaindex_www when it
    # raises ValueError.
    try:
        aa_mapping1 = get_aaindex_file(args.property1)
    except ValueError:
        aa_mapping1 = get_aaindex_www(args.property1)
    # Store a float default when the argument converts to one; otherwise
    # (None or non-numeric text) store the raw argument value.
    try:
        aa_mapping1.default = float(args.default1)
    except (TypeError, ValueError):
        aa_mapping1.default = args.default1

    feat1 = Feature(aa_mapping1)
else:
    # No property requested: the Feature wraps the 'identity' function.
    feat1 = Feature(identity)
# property2
if args.property2 is not None:
    try:
        aa_mapping2 = get_aaindex_file(args.property2)
    except ValueError:
        aa_mapping2 = get_aaindex_www(args.property2)
    try:
        aa_mapping2.default = float(args.default2)
    except (TypeError, ValueError):
        aa_mapping2.default = args.default2

    feat2 = Feature(aa_mapping2)
else:
Esempio n. 9
0
        aa_mapping = get_aaindex_www(args.property)

    # Simplify if and as requested
    if args.simplify is not None:
        aa_mapping = simplify(aa_mapping, aa_mapping.__name__+"/"+args.classes,
                              method=args.simplify, k=int(args.classes),
                              iters=int(args.iterations))

    # Assign 'default' value for the Mapping
    try:
        aa_mapping.default = float(args.default)
    except (TypeError, ValueError):
        aa_mapping.default = args.default

    # Make a Feature from the Mapping
    feat = Feature(aa_mapping)
else:
    feat = Feature(identity)

# Add the Feature to a FeatureSet
fs = FeatureSet("fs")
fs.add(feat)

# And use it to convert the input set
conv_seq = fs(input_seq)

# Get the alphabet of the converted set: the distinct elements found in
# the 'data' of every converted sequence. Because the values pass through
# set(), the order of the resulting list is not the order of appearance.
alphabet = list(set([element for seq in conv_seq for element in seq.data]))

# Prepare the n-gram counts extractor
nfs = NgramFeatureSet('ngram_'+args.n, n=int(args.n), alphabet=alphabet,
# Record the length of every record in the FASTA file
# ('length_seqs' is created before this excerpt).
for record in SeqIO.parse("sequence_2.fasta", "fasta"):
    length_seqs.append(len(record))
    #print((record))

# Load the sequences from the same file as a sequence set
seq = load_fasta_file("sequence_2.fasta")
SequenceIds = []
SequenceIds2_list = []
# Collect the identifiers reported by SequenceSet.ids for the loaded set.
for i in SequenceSet.ids(seq):
    SequenceIds.append(i)
# Keep only the text between the first '[' and the first ']' of each
# identifier (assuming identifiers are strings).
# NOTE(review): find() returns -1 when a bracket is missing, so such an
# identifier is sliced silently instead of raising - confirm every
# identifier contains both brackets.
for i in SequenceIds:
    SequenceIds2 = i[i.find("[") + 1:i.find("]")]
    SequenceIds2_list.append(SequenceIds2)

# Gather important protein features.
# Each of the next seven features maps residues through the named AAindex
# entry and then applies 'average'.
polarity = Feature(get_aaindex_file("GRAR740102")).then(average)
hydropathy = Feature(get_aaindex_file("KYTJ820101")).then(average)
iso_point = Feature(get_aaindex_file("ZIMJ680104")).then(average)
pk_COOH = Feature(get_aaindex_file("JOND750102")).then(average)
entropy_form = Feature(get_aaindex_file("HUTJ700103")).then(average)
melting_point = Feature(get_aaindex_file("FASG760102")).then(average)
net_charge = Feature(get_aaindex_file("KLEP840101")).then(average)
# Pattern-based features built on pattern_count for the given motif.
glycine = Feature(pattern_count, pattern='G')
RGD = Feature(pattern_count, pattern='RGD')
GFPGER = Feature(pattern_count, pattern='GFPGER')

# Build the feature sets.
# NOTE(review): four sets are created here; nothing is added to them
# within this excerpt.
fs = FeatureSet("my set")
fs1 = FeatureSet("test")
fs2 = FeatureSet("glycine")
fs3 = FeatureSet("GFPGER")
Esempio n. 11
0
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import identity

# Load the 'data/Alphasyn.fasta' sequence set, which contains several
# peptides from alpha-synuclein deposited in the Amyload database:
alphasyn_seq = load_fasta_file("data/Alphasyn.fasta")

# Retrieve predefined mapping from amino acids to formal charge,
# and AAindex mapping to relative frequency of occurrence (entry: JOND920101)
aa2charge_map = get_aa2charge()
aa2freq_map = get_aaindex_file("JOND920101")
# Every print below uses the parenthesised single-argument form:
# same output on Python 2, valid syntax on Python 3.
print(aa2charge_map)
print(aa2freq_map)

# Make Feature objects based on Mappings:
charge_feat = Feature(aa2charge_map)
freq_feat = Feature(aa2freq_map)
print(charge_feat)
print(freq_feat)

# And use them to convert 1st sequence in 'alphasyn_seq':
print(charge_feat(alphasyn_seq[0]))
print(freq_feat(alphasyn_seq[0]))

# Make a FeatureSet from a Feature and Mappings:
fs = FeatureSet("basic features")
fs.add(charge_feat)
fs.add(aa2freq_map, name="frequency")
fs.add(get_aa2hydropathy())
print(fs)
Esempio n. 12
0
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy
from quantiprot.metrics.basic import average

# Local Fisher-test related imports:
from quantiprot.analysis.fisher import local_fisher_2d, _plot_local_fisher_2d

from matplotlib import pyplot as plt

# Load sets of amyloidogenic and non-amyloidogenic peptides:
amyload_pos_seq = load_fasta_file("data/Amyload_positive.fasta")
amyload_neg_seq = load_fasta_file("data/Amyload_negative.fasta")

# Calculate quantitative features: volume and hydropathy
# (each is a per-residue mapping followed by 'average')
mean_volume = Feature(get_aa2volume()).then(average)
mean_hydropathy = Feature(get_aa2hydropathy()).then(average)

# Both sets are converted with the same FeatureSet, so their columns
# come in the same order: volume first, hydropathy second.
fs = FeatureSet("volume'n'hydropathy")
fs.add(mean_volume)
fs.add(mean_hydropathy)

amyload_pos_conv_seq = fs(amyload_pos_seq)
amyload_neg_conv_seq = fs(amyload_neg_seq)

# Do local Fisher on the positive set against the negative set:
result = local_fisher_2d(amyload_pos_conv_seq,
                         amyload_neg_conv_seq,
                         windows_per_frame=5,
                         overlap_factor=5)
Esempio n. 13
0
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy, get_aa2volume
from quantiprot.utils.mapping import simplify

# Quantification-related imports:
from quantiprot.metrics.basic import identity, average, sum_absolute, uniq_count
from quantiprot.utils.sequence import compact

# Load some data:
alphasyn_seq = load_fasta_file("data/Alphasyn.fasta")

# Prepare Features:
charge_sum_abs_feat = Feature(get_aa2charge()).then(sum_absolute)
hydropathy_average_feat = Feature(get_aa2hydropathy()).then(average)
# The volume mapping is simplified with k=3 before counting unique values.
volume_levels_feat = Feature(
    simplify(get_aa2volume(), name="volume levels", k=3)).then(uniq_count)

# Prepare a FeatureSet
fs = FeatureSet("simple quantification")
fs.add(hydropathy_average_feat)
fs.add(charge_sum_abs_feat)
fs.add(volume_levels_feat)

# And use it to quantify protein sequence(s):
result_seq = fs(alphasyn_seq)
# Parenthesised single-argument print: same output on Python 2,
# valid syntax on Python 3.
print(result_seq)
for seq in result_seq:
    print(seq)
Esempio n. 14
0
sys.path.insert(0, os.path.abspath('..'))

# Uversky plot
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.utils.sequence import compact
from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy
from quantiprot.metrics.basic import average, average_absolute

from matplotlib import pyplot as plt

# Load the two sequence sets to compare on the plot.
amyload_seq = load_fasta_file("data/Amyload_positive.fasta")
disprot_seq = load_fasta_file("data/Disprot.fasta")

# Non-standard letters in Disprot assigned neutral charge and hydropathy:
net_abs_charge = Feature(get_aa2charge(default=0)).then(average_absolute)
mean_hydropathy = Feature(get_aa2hydropathy(default=0)).then(average)

# The features are registered under explicit names so that the columns
# can be selected by name below.
uversky_fs = FeatureSet("uversky")
uversky_fs.add(mean_hydropathy, name="mean_hydropathy")
uversky_fs.add(net_abs_charge, name="net_abs_charge")

amyload_uversky_seq = uversky_fs(amyload_seq)
disprot_uversky_seq = uversky_fs(disprot_seq)

# First approach to get hydrophobicity/charge pairs:
# select each column by feature name and take the first returned column.
amyload_data_x = amyload_uversky_seq.columns(feature="mean_hydropathy")[0]
amyload_data_y = amyload_uversky_seq.columns(feature="net_abs_charge")[0]
plt.plot(amyload_data_x, amyload_data_y, '.', label="Amyload")

# Second approach to get hydrophobicity/charge pairs
Esempio n. 15
0
    # Assign 'default' value for the Mapping
    try:
        aa_mapping.default = float(args.default)
    except (TypeError, ValueError):
        aa_mapping.default = args.default

    # Simplify if and as requested
    if args.simplify is not None:
        aa_mapping = simplify(aa_mapping,
                              aa_mapping.__name__ + "/" + args.classes,
                              method=args.simplify,
                              k=int(args.classes),
                              iters=int(args.iterations))

    # Make a Feature from the Mapping
    feat = Feature(aa_mapping)

else:
    feat = Feature(identity)

# Order quantification if and as requested
if args.quantify is not None:
    quantify_method = {
        'sum': sum,
        'sum_abs': sum_absolute,
        'avg': average,
        'avg_abs': average_absolute,
        'rec': recurrence,
        'det': determinism,
        'pal': palindromism,
        'ratio_det': ratio_determinism,