Example #1
def Run_Uverskey(Fasta1, Fasta2, OutFile):


    amyload_seq = load_fasta_file(Fasta1)
    disprot_seq = load_fasta_file(Fasta2)

    net_abs_charge = Feature(get_aa2charge(default=0)).then(average_absolute)
    mean_hydropathy = Feature(get_aa2hydropathy(default=0)).then(average)

    uversky_fs = FeatureSet("uversky")
    uversky_fs.add(mean_hydropathy, name="mean_hydropathy")
    uversky_fs.add(net_abs_charge, name="net_abs_charge")

    amyload_uversky_seq = uversky_fs(amyload_seq)
    disprot_uversky_seq = uversky_fs(disprot_seq)


    amyload_data_x = amyload_uversky_seq.columns(feature="mean_hydropathy")[0]
    amyload_data_y = amyload_uversky_seq.columns(feature="net_abs_charge")[0]
    plt.plot(amyload_data_x, amyload_data_y,'.', label="Amyload")

    disprot_data = compact(disprot_uversky_seq).columns()
    plt.plot(disprot_data[0], disprot_data[1],'.', label="Disprot")

    plt.plot([-0.78, 0.835], [0.0, 0.5],'k')
    plt.xlabel("mean hydrophobicity")
    plt.ylabel("net abs charge")
    plt.legend()

    plt.savefig(OutFile)
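# Usage sketch (assumption, not part of the original snippet): Run_Uverskey relies
# on the same imports as Example #15 (load_fasta_file, Feature, FeatureSet, compact,
# get_aa2charge, get_aa2hydropathy, average, average_absolute and pyplot as plt).
# The file names below are hypothetical.
Run_Uverskey("data/Amyload_positive.fasta", "data/Disprot.fasta", "uversky.png")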
Example #2
def get_df_from_file(fname):
    """ Loads data from a file to a dataframe.
	
		The identifiers are set as a primary
		key of the dataframe.
		
		The sequence entries are a1, a2,... and so on.
	"""

    # Load the file
    f = load_fasta_file(fname)

    # Get the identifiers and the sequences
    names, dataset = [], []
    for i in range(len(f)):
        dataset.append(f[i].data)
        names.append(f[i].identifier)

    # Generate a header for the dataframe
    headers = ['a' + str(i + 1) for i in range(np.shape(dataset)[1])]

    # Generate dataframe
    df = pd.DataFrame(dataset, columns=headers)
    df['names'] = names
    df = df.set_index('names')

    return df
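# Usage sketch (assumption): the function expects numpy as np, pandas as pd and
# load_fasta_file to be imported, and all sequences in the file to have equal
# length (np.shape(dataset)[1] is only well defined for a rectangular dataset).
# The file name below is hypothetical.
df = get_df_from_file("aligned_sequences.fasta")
print(df.head())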
Example #3
def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel,
        pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath):

    if not os.path.exists(htmlOutDir):
        os.makedirs(htmlOutDir)

    amyload_pos_seq = load_fasta_file(Fasta1)
    amyload_neg_seq = load_fasta_file(Fasta2)

    # Calculate quantitative features: volume and hydropathy
    mean_volume = Feature(get_aa2volume()).then(average)
    mean_hydropathy = Feature(get_aa2hydropathy()).then(average)

    fs = FeatureSet("volume'n'hydropathy")
    fs.add(mean_volume)
    fs.add(mean_hydropathy)

    amyload_pos_conv_seq = fs(amyload_pos_seq)
    amyload_neg_conv_seq = fs(amyload_neg_seq)

    # Do local Fisher:
    result = local_fisher_2d(amyload_pos_conv_seq,
                             amyload_neg_conv_seq,
                             windows_per_frame=int(windows_per_frame),
                             overlap_factor=int(overlap_factor))

    # Plot local Fisher:
    _plot_local_fisher_2d(result,
                          xlabel=xlabel,
                          ylabel=ylabel,
                          pop1_label=pop1_label,
                          pop2_label=pop2_label,
                          out_file_path=os.path.join(os.getcwd(), "out.png"))

    #   plt.savefig(os.path.join(Workdirpath, htmlOutDir, "1.png"))

    HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname))
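# Usage sketch (assumption): the wrapper takes string-typed parameters and converts
# them internally; HTML_Gen and _plot_local_fisher_2d are helpers defined elsewhere
# in the original project. Paths and labels below are hypothetical.
run("Amyload_positive.fasta", "Amyload_negative.fasta",
    windows_per_frame="10", overlap_factor="1",
    xlabel="mean volume", ylabel="mean hydropathy",
    pop1_label="amyloid", pop2_label="non-amyloid",
    htmlOutDir="report", htmlFname="report.html", Workdirpath=os.getcwd())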
Example #4
def Run_ngrams(fasta1, fasta2, OutFile ):

    alphasyn_seq = load_fasta_file(fasta1)
    amyload_pos_seq = load_fasta_file(fasta2)

    fs_aa = FeatureSet("aa patterns")
    fs_aa.add(identity)
    fs_aa.add(pattern_match, pattern='VT', padded=True)
    fs_aa.add(pattern_count, pattern='VT')

    result_seq = fs_aa(alphasyn_seq)

    fs_hp = FeatureSet("hydropathy patterns")
    fs_hp.add(Feature(get_aa2hydropathy()))
    fs_hp.add(Feature(get_aa2hydropathy()).then(pattern_match, pattern=[0.0, 2.0],
                                                metric='taxi', radius=1.0))
    result_seq2 = fs_hp(alphasyn_seq)
    result_freq = ngram_count(alphasyn_seq, n=2)
    result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True)

    counts = sorted(result_fit["ngram_counts"], reverse=True)
    ranks = range(1, len(counts)+1)

    slope = result_fit["slope"]
    harmonic_num = sum([rank**-slope for rank in ranks])
    fitted_counts = [(rank**-slope) / harmonic_num * sum(counts) for rank in ranks]

    plt.plot(ranks, counts, 'k', label="empirical")
    plt.plot(ranks, fitted_counts, 'k--',
             label="Zipf's law\nslope: {:.2f}".format(slope))
    plt.xlabel('rank')
    plt.ylabel('count')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend()

    plt.savefig(OutFile)
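# Usage sketch (assumption, not part of the original snippet): Run_ngrams relies on
# the imports shown in Example #7. File names below are hypothetical.
Run_ngrams("data/Alphasyn.fasta", "data/Amyload_positive.fasta", "zipf_fit.png")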
Example #5
def encoded_seq_from_file(fname, dirname, particle, index):
    """ Function to encode the sequences from
		a file using AAindex. The encoded sequences
		are then padded to maximum length.
	"""

    # Load the fasta file
    f = load_fasta_file(dirname + '/' + fname)
    feat_map = _get_feature_map(index)

    # Get the sequences in a dataset
    dataset = []
    for i in range(len(f)):
        dataset.append(f[i])

    # Create a dictionary with keys as identifiers
    # and their values as the data.
    enc = {}
    if particle == 'virus':
        for seq in dataset:
            seq_id = _change_format_virus(seq.identifier)
            enc[seq_id] = feat_map(seq).data
    elif particle == 'mouse':
        for seq in dataset:
            seq_id = _change_format_mouse(seq.identifier)
            if seq_id not in ids_set_mouse:
                print(seq.identifier, seq_id)
            enc[seq_id] = feat_map(seq).data

    # Pad all sequences to maximum value in the
    # dataset.
    maxlen = max([len(val) for val in enc.values()])
    enc = _pad_encoding(enc, maxlen)

    # Check if all values have lengths
    # equal to maxlen.
    for val in enc.values():
        assert len(val) == maxlen

    return enc
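# Usage sketch (assumption): _get_feature_map, _change_format_virus,
# _change_format_mouse, _pad_encoding and ids_set_mouse are helpers defined
# elsewhere in the original project; the arguments below are hypothetical.
encoded = encoded_seq_from_file("virus_proteins.fasta", "data", "virus", "KYTJ820101")
print(len(encoded))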
Example #6
import os
import sys

sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import average

# Load data:
seq = load_fasta_file("data/Alphasyn.fasta")

# Build a feature: average polarity (Grantham, 1974), AAindex entry: GRAR740102:
feat = Feature(get_aaindex_file("GRAR740102")).then(average)

# Add the feature to new feature set:
fs = FeatureSet("my set")
fs.add(feat)

# Process sequences:
res_seq = fs(seq)

# Export average polarities
res = res_seq.columns()
print(res)
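# A possible follow-up (assumption about the column layout): with a single scalar
# feature, columns()[0] is taken to hold one value per input sequence, so it can
# be paired with the sequence identifiers.
for seq_id, polarity in zip(res_seq.ids(), res[0]):
    print(seq_id, polarity)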
Example #7
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2hydropathy
from quantiprot.metrics.basic import identity

# Ngram-related imports
from quantiprot.metrics.ngram import pattern_match, pattern_count
from quantiprot.analysis.ngram import ngram_count
from quantiprot.analysis.ngram import zipf_law_fit

from matplotlib import pyplot as plt

# Load some data
alphasyn_seq = load_fasta_file("data/Alphasyn.fasta")
amyload_pos_seq = load_fasta_file("data/Amyload_positive.fasta")

# Find and count matches to a pattern 'VT'
fs_aa = FeatureSet("aa patterns")
fs_aa.add(identity)
fs_aa.add(pattern_match, pattern='VT', padded=True)
fs_aa.add(pattern_count, pattern='VT')

result_seq = fs_aa(alphasyn_seq)

for seq in result_seq[:3]:
    print(seq)

# ...and something much more subtle:
# Map a sequence to the hydropathy scale, and search for the pattern [0.0, 2.0]
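# A likely continuation (mirrors Example #4; this snippet is truncated here):
fs_hp = FeatureSet("hydropathy patterns")
fs_hp.add(Feature(get_aa2hydropathy()))
fs_hp.add(Feature(get_aa2hydropathy()).then(pattern_match, pattern=[0.0, 2.0],
                                            metric='taxi', radius=1.0))
result_seq2 = fs_hp(alphasyn_seq)

for seq in result_seq2[:3]:
    print(seq)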
Example #8
# feature2
if args.quantify2 in ['rec', 'det', 'pal', 'ratio_det', 'ratio_pal']:
    feat2 = feat2.then(quantify_method[args.quantify2],
                       metric=args.metric2, radius=float(args.radius2),
                       dim=int(args.dim2), tau=int(args.tau2),
                       det_len=int(args.diaglen2), pal_len=int(args.diaglen2))
else:
    feat2 = feat2.then(quantify_method[args.quantify2])

# Add the features to a FeatureSet
fs = FeatureSet("fs")
fs.add(feat1)
fs.add(feat2)

# Convert and plot input1 sequences in the 2d space
input_seq1 = load_fasta_file(args.input1)
conv_seq1 = fs(input_seq1)
conv_data1_x = conv_seq1.columns(feature=feat1.name)[0]
conv_data1_y = conv_seq1.columns(feature=feat2.name)[0]
plt.plot(conv_data1_x, conv_data1_y, '.', label="input1")

# Convert and plot input2 sequences in the 2d space
if args.input2 is not None:
    input_seq2 = load_fasta_file(args.input2)
    conv_seq2 = fs(input_seq2)
    conv_data2_x = conv_seq2.columns(feature=feat1.name)[0]
    conv_data2_y = conv_seq2.columns(feature=feat2.name)[0]
    plt.plot(conv_data2_x, conv_data2_y, '.', label="input2")

# Show legend and labels
plt.xlabel(feat1.name)
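# A plausible completion of the truncated snippet (assumption, mirroring the other
# plotting examples; the output file name is hypothetical):
plt.ylabel(feat2.name)
plt.legend()
plt.savefig("feature_space.png")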
Example #9

def getLabelIndex(fromFile):
	print(fromFile)
	splitArray = fromFile.split(' ', 1)

	# Remove the first element (the identifier) from the split
	splitArray.pop(0)
	y = splitArray[0].split(' ')
	y = sorted([int(i) for i in y[:-1]])

	return y


# Load the 'benchmark3.fasta' sequence set (twice)
alphasyn_seq = load_fasta_file("./Benchmark/benchmark3.fasta")
alphasyn_seq1 = load_fasta_file("./Benchmark/benchmark3.fasta")

# Get array of lengths
fastaLength, fastaID, count = [],[], 0

for seq in alphasyn_seq1:
	fastaLength.append(len(seq.data))
	fastaID.append(seq.identifier)

for leng in fastaLength: 
	count += leng


print(fastaLength)
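# Usage sketch for getLabelIndex (assumption about the label-line format: the first
# token is an identifier, the remaining space-separated tokens are indices, and the
# trailing token is discarded by y[:-1]):
print(getLabelIndex("P12345 7 3 15 x"))   # prints the input line, then [3, 7, 15]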
Example #10
                            help='num. of classes (default: 3)')
group_simplify.add_argument('-t', '--iterations', default=0,
                            help='num. of iterations for kmeans (default: 0)')

group_ngrams = parser.add_argument_group('N-grams')
group_ngrams.add_argument('-n', '--n', default='1', help='n-gram size (default: 1)')
group_ngrams.add_argument('-m', '--metric', default='identity',
                          choices=['identity', 'taxi', 'euclid', 'sup', 'inf'],
                          help='metric for matching n-grams (default: identity)')
group_ngrams.add_argument('-r', '--radius', default=0.0,
                          help='similarity radius (default: 0.0)')

args = parser.parse_args()

# Load the 'input' sequence set
input_seq = load_fasta_file(args.input, unique=False)

# Retrieve AAindex mapping for the 'property'
if args.property is not None:
    try:
        aa_mapping = get_aaindex_file(args.property)
    except ValueError:
        aa_mapping = get_aaindex_www(args.property)

    # Simplify if and as requested
    if args.simplify is not None:
        aa_mapping = simplify(aa_mapping, aa_mapping.__name__+"/"+args.classes,
                              method=args.simplify, k=int(args.classes),
                              iters=int(args.iterations))

    # Assign 'default' value for the Mapping
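    # (A likely continuation, mirroring Example #16; the snippet is truncated here.)
    try:
        aa_mapping.default = float(args.default)
    except (TypeError, ValueError):
        aa_mapping.default = args.default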
Example #11
import os
import sys

sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.sequence import SequenceSet
from quantiprot.utils.sequence import merge

# Load protein sequences from 'data/Amyload_positive.fasta':
amyload_pos_seq = load_fasta_file("data/Amyload_positive.fasta")

# Display first three sequences:
print(amyload_pos_seq)
for seq in amyload_pos_seq[:3]:
    print(seq)

# Find a sequence 'AMY438|7-13|Sup35' in 'amyload_pos_seq':
my_seq_index = amyload_pos_seq.ids().index("AMY438|7-13|Sup35")
my_seq = amyload_pos_seq[my_seq_index]
print(my_seq)

# And copy the sequence to a new sequence set:
my_seq_set = SequenceSet("my seq set")
my_seq_set.add(my_seq)
print(my_seq_set)

# Try again to add the same sequence to 'my_seq_set' with 'unique' = True:
my_seq_set.add(my_seq)
print(my_seq_set)
Example #12
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.sequence import SequenceSet, compact, subset
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import average
from quantiprot.metrics.ngram import pattern_match, pattern_count, NgramFeatureSet
from quantiprot.metrics.alphabet import PROTEIN

from Bio import SeqIO
#Load sequence
length_seqs = []
for record in SeqIO.parse("sequence_2.fasta", "fasta"):
    length_seqs.append(len(record))
    #print((record))

#load the sequence from the file
seq = load_fasta_file("sequence_2.fasta")
SequenceIds = []
SequenceIds2_list = []
for i in seq.ids():
    SequenceIds.append(i)
for i in SequenceIds:
    SequenceIds2 = i[i.find("[") + 1:i.find("]")]
    SequenceIds2_list.append(SequenceIds2)

#gather important protein features
polarity = Feature(get_aaindex_file("GRAR740102")).then(average)
hydropathy = Feature(get_aaindex_file("KYTJ820101")).then(average)
iso_point = Feature(get_aaindex_file("ZIMJ680104")).then(average)
pk_COOH = Feature(get_aaindex_file("JOND750102")).then(average)
entropy_form = Feature(get_aaindex_file("HUTJ700103")).then(average)
melting_point = Feature(get_aaindex_file("FASG760102")).then(average)
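# A possible continuation (assumption): collect the features into a FeatureSet and
# process the loaded sequences, following the pattern of the other examples
# (FeatureSet is assumed to be imported as shown there).
phys_fs = FeatureSet("physicochemical properties")
phys_fs.add(polarity, name="polarity")
phys_fs.add(hydropathy, name="hydropathy")
phys_fs.add(iso_point, name="isoelectric_point")
phys_fs.add(pk_COOH, name="pK_COOH")
phys_fs.add(entropy_form, name="entropy_of_formation")
phys_fs.add(melting_point, name="melting_point")
conv_seq = phys_fs(seq)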
Example #13
### Quantiprot analysis: writes out tables with amino-acid properties for the segregating sites,
# in order to see the physicochemical differences between the haplotypes
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.sequence import SequenceSet
from quantiprot.utils.sequence import subset, columns
from quantiprot.utils.feature import Feature, FeatureSet

# Conversions-related imports:
from quantiprot.utils.mapping import simplify
from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy, get_aa2volume, get_aa2mj
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import identity
import numpy as np

fapr = load_fasta_file('prot_segregating.fasta')  #load fasta
fs = FeatureSet("myTLRset")
fs.add(get_aa2charge())
fs.add(get_aa2volume())
fs.add(get_aa2mj())
fs.add(get_aa2hydropathy())

convfapr = fs(fapr)
metrics = ["formal_charge", "volume", "miyazawa-jernigan",
           "hydropathy"]  # which metrics of AMK to generate

#print convfapr
for m in metrics:
    with open(m + ".tsv", "w") as f:
        h = np.matrix(columns(convfapr, feature=m, transpose=True))
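        # A plausible continuation (assumption, not in the original snippet):
        # write the per-site values as tab-separated text.
        np.savetxt(f, h, fmt="%s", delimiter="\t")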
Example #14
import os
import sys

sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import FeatureSet
from quantiprot.metrics.aaindex import get_aa2mj
from quantiprot.metrics.rqa import RQAFeatureSet
from quantiprot.metrics.basic import average

from matplotlib import pyplot as plt

# Load the HET-E1 sequence with WD40 repeats:
hete1_seq = load_fasta_file("data/HETE1_PODAS.fasta")

# Prepare FeatureSet for conversion from aa to Miyazawa-Jernigan hydrophobicity:
mj_fs = FeatureSet("mj")
mj_fs.add(get_aa2mj())

# Prepare specialized FeatureSet with basic RQA parameters calculated
# over 100aa window, then smoothed over the 10aa window:
rqa_fs = RQAFeatureSet("rqa",
                       features=['recurrence', 'determinism'],
                       window=100,
                       metric='taxi',
                       radius=4,
                       dim=4,
                       det_len=8)
rqa_fs.then_all(average, window=10)
print(rqa_fs)
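# A possible continuation (assumption, not shown in the original snippet): convert
# the sequence to the Miyazawa-Jernigan scale, compute the smoothed RQA profiles
# and plot them. The output file name is hypothetical.
rqa_seq = rqa_fs(mj_fs(hete1_seq))
for entry in rqa_seq:
    plt.plot(entry.data, label=entry.identifier)
plt.legend()
plt.savefig("hete1_rqa.png")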
Example #15
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

# Uversky plot
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.utils.sequence import compact
from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy
from quantiprot.metrics.basic import average, average_absolute

from matplotlib import pyplot as plt

amyload_seq = load_fasta_file("data/Amyload_positive.fasta")
disprot_seq = load_fasta_file("data/Disprot.fasta")

# Non-standard letters in Disprot assigned neutral charge and hydropathy:
net_abs_charge = Feature(get_aa2charge(default=0)).then(average_absolute)
mean_hydropathy = Feature(get_aa2hydropathy(default=0)).then(average)

uversky_fs = FeatureSet("uversky")
uversky_fs.add(mean_hydropathy, name="mean_hydropathy")
uversky_fs.add(net_abs_charge, name="net_abs_charge")

amyload_uversky_seq = uversky_fs(amyload_seq)
disprot_uversky_seq = uversky_fs(disprot_seq)

# First approach to get hydrophobicity/charge pairs
amyload_data_x = amyload_uversky_seq.columns(feature="mean_hydropathy")[0]
amyload_data_y = amyload_uversky_seq.columns(feature="net_abs_charge")[0]
plt.plot(amyload_data_x, amyload_data_y, '.', label="Amyload")
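# A likely continuation (mirrors Example #1; the snippet is truncated here):
disprot_data = compact(disprot_uversky_seq).columns()
plt.plot(disprot_data[0], disprot_data[1], '.', label="Disprot")

plt.plot([-0.78, 0.835], [0.0, 0.5], 'k')
plt.xlabel("mean hydrophobicity")
plt.ylabel("net abs charge")
plt.legend()
plt.show()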
Example #16
                       default=1,
                       help='rqa: embedding dimension (default: 1)')
group_rqa.add_argument('-u',
                       '--tau',
                       default=0,
                       help='rqa: embedding delay tau (default: 0)')
group_rqa.add_argument(
    '-l',
    '--diaglen',
    default=2,
    help='rqa: minimal diagonal length for det/pal (default: 2)')

args = parser.parse_args()

# Load the 'input' sequence set
input_seq = load_fasta_file(args.input)

# Retrieve AAindex mapping for the 'property'
if args.property is not None:
    try:
        aa_mapping = get_aaindex_file(args.property)
    except ValueError:
        aa_mapping = get_aaindex_www(args.property)

    # Assign 'default' value for the Mapping
    try:
        aa_mapping.default = float(args.default)
    except (TypeError, ValueError):
        aa_mapping.default = args.default

    # Simplify if and as requested
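    # (A likely continuation, mirroring Example #10; the snippet is truncated here.
    # It assumes 'simplify', args.classes and args.iterations are available as in
    # that example.)
    if args.simplify is not None:
        aa_mapping = simplify(aa_mapping, aa_mapping.__name__ + "/" + args.classes,
                              method=args.simplify, k=int(args.classes),
                              iters=int(args.iterations))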