コード例 #1
0
ファイル: msa.py プロジェクト: javicorvi/esmsagen
def summary(msa, output, title):
    '''Write per-column residue frequencies and entropy of an MSA to a CSV.

    The alignment is read from ``msa`` in FASTA format and passed, together
    with a fixed background frequency table, to ``information_content``.
    The result is written to ``output`` with one row per alignment column:
    one field per key of that column's frequency dict plus an ``Entropy``
    field.

    Background amino-acid frequencies in percent (``e_freq_dict`` below holds
    the same numbers as fractions):

       Ala (A) 9.10   Gln (Q) 3.79   Leu (L) 9.87   Ser (S) 6.69
       Arg (R) 5.71   Glu (E) 6.16   Lys (K) 4.99   Thr (T) 5.57
       Asn (N) 3.88   Gly (G) 7.26   Met (M) 2.38   Trp (W) 1.29
       Asp (D) 5.45   His (H) 2.19   Phe (F) 3.92   Tyr (Y) 2.93
       Cys (C) 1.21   Ile (I) 5.70   Pro (P) 4.85   Val (V) 6.88

    Args:
        msa: Alignment source handed to ``AlignIO.read`` (FASTA format).
        output: Destination handed to ``DataFrame.to_csv``.
        title: Not used by this function; kept so existing callers keep
            working.
    '''
    # Amino-acid frequency table ("unit_prot", dated 23/11/2017 in the
    # original comment).
    e_freq_dict = {
        'A': 0.091,
        'R': 0.0571,
        'N': 0.0388,
        'D': 0.0545,
        'C': 0.0121,
        'Q': 0.0379,
        'E': 0.0616,
        'G': 0.0726,
        'H': 0.0219,
        'I': 0.0570,
        'L': 0.0987,
        'K': 0.0499,
        'M': 0.0238,
        'F': 0.0392,
        'P': 0.0485,
        'S': 0.0669,
        'T': 0.0557,
        'W': 0.0129,
        'Y': 0.0293,
        'V': 0.0688
    }
    e_freq_table = FreqTable.FreqTable(e_freq_dict,
                                       FreqTable.FREQ,
                                       alphabet=Alphabet.ProteinAlphabet())
    alignment = AlignIO.read(msa, "fasta", alphabet=Alphabet.ProteinAlphabet())
    summary_align = AlignInfo.SummaryInfo(alignment)
    _total_entropy, entropy_columns, freq_dict_columns = information_content(
        summary_align, e_freq_table=e_freq_table)

    # Collect plain dict rows and build the frame once.  Growing a DataFrame
    # with DataFrame.append copies the whole frame on every iteration and the
    # method no longer exists in pandas 2.x.
    rows = []
    for i in range(len(entropy_columns.values())):
        row = dict(freq_dict_columns[i])
        row['Entropy'] = entropy_columns[i]
        rows.append(row)

    pandas.DataFrame(rows).to_csv(output)
コード例 #2
0
ファイル: test_fileio.py プロジェクト: KIT-MBS/distruct
def test_fileio():
    """Write a generated topology database to disk, read it back, check 'BB1'."""
    from Bio import Alphabet

    force_field = 'test'
    # Custom alphabet whose letters are the building-block names checked below.
    test_alphabet = Alphabet.ProteinAlphabet()
    test_alphabet.size = 3
    test_alphabet.letters = ['BB1', 'BB2']

    database = generate(force_field, [test_alphabet], True,
                        topPath=testFilePath)
    write_topology_database(database, 'test', [test_alphabet],
                            outDir=testFilePath)
    loaded = read_topology_database('test', inDir=testFilePath)
    # The written file is deleted before any assertion runs, so a failing
    # check does not leave it behind.
    os.remove(testFilePath + 'test.xml')

    bb1 = loaded['BB1']
    assert bb1['vertices'] == [
        ('A1', 'A'),
        ('A2', 'A'),
        ('A3', 'A'),
        ('A4', 'A'),
    ]

    bonds = bb1['bondEdges']
    expected_bonds = (
        (('A1', 'A2'), 1.2),
        (('A2', 'A3'), 1.0),
        (('A3', 'A4'), 1.1),
    )
    for edge, length in expected_bonds:
        assert bonds[edge] == approx(length)

    angles = bb1['angleEdges']
    expected_angles = (
        (('A1', 'A3'), 1.90787884028338913),
        (('A2', 'A4'), 1.7719368430701863),
    )
    for edge, angle in expected_angles:
        assert angles[edge] == approx(angle, rel=1e-5)

    improper = bb1['improperEdges'][('A1', 'A4')]
    assert improper == approx(2.065313144262336, rel=1e-5)
コード例 #3
0
ファイル: UniprotIO.py プロジェクト: davidmam/biopython
 def __init__(self,
              elem,
              alphabet=Alphabet.ProteinAlphabet(),
              return_raw_comments=False):
     """Store the element to parse together with the parsing options.

     elem -- object kept as ``self.entry``.
     alphabet -- kept as ``self.alphabet``. The default instance is created
         once, when this function is defined, and is therefore shared by
         every call that omits the argument.
     return_raw_comments -- flag kept as ``self.return_raw_comments``.
     """
     self.entry = elem
     self.alphabet = alphabet
     self.return_raw_comments = return_raw_comments
コード例 #4
0
ファイル: UniprotIO.py プロジェクト: joshainglis/biopython
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Iterate over the entries of a UniProt XML document.

    The XML is parsed incrementally; each completed ``entry`` element is
    handed to ``Parser`` and the parsed result (a SeqRecord) is yielded.
    This generator can be used in Bio.SeqIO.

    Arguments:
     - handle: an object with a ``read`` attribute, or a string holding the
       XML itself (it is wrapped in a StringIO).
     - alphabet: alphabet passed on to ``Parser``; nucleotide alphabets,
       gapped or not, are rejected with ValueError.
     - return_raw_comments: passed on to ``Parser``; True --> comment fields
       are returned as complete XML to allow further processing.

    Because this is a generator, all argument checks run when the first
    item is requested, not when the function is called.
    """
    is_nucleotide = isinstance(alphabet, Alphabet.NucleotideAlphabet) or (
        isinstance(alphabet, Alphabet.Gapped)
        and isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet))
    if is_nucleotide:
        raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if not isinstance(handle, str):
            raise Exception('An XML-containing handler or an XML string must be passed')
        # A bare string is taken to be the XML document itself.
        handle = StringIO(handle)

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
                "No ElementTree module was found. "
                "Use Python 2.5+, lxml or elementtree if you "
                "want to use Bio.SeqIO.UniprotIO.")

    for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
        # Only a fully read <entry> element is ready to be parsed.
        if event != "end" or elem.tag != NS + "entry":
            continue
        record_parser = Parser(elem,
                               alphabet=alphabet,
                               return_raw_comments=return_raw_comments)
        yield record_parser.parse()
        # Clear the element once the consumer has taken the record.
        elem.clear()
コード例 #5
0
def test_generate():
    """Generate a topology database from the test force field and check 'BB1'."""
    from Bio import Alphabet

    force_field = 'test'
    # Custom alphabet whose letters are the building-block names checked below.
    test_alphabet = Alphabet.ProteinAlphabet()
    test_alphabet.size = 3
    test_alphabet.letters = ['BB1', 'BB2']

    database = ffparsergmx.generate(force_field, [test_alphabet], True,
                                    topPath=testFilePath)

    bb1 = database['BB1']
    assert bb1['vertices'] == [
        ('A1', 'A'),
        ('A2', 'A'),
        ('A3', 'A'),
        ('A4', 'A'),
    ]

    bonds = bb1['bondEdges']
    expected_bonds = (
        (('A1', 'A2'), 1.2),
        (('A2', 'A3'), 1.0),
        (('A3', 'A4'), 1.1),
    )
    for edge, length in expected_bonds:
        assert bonds[edge] == approx(length)

    angles = bb1['angleEdges']
    expected_angles = (
        (('A1', 'A3'), 1.90787884028338913),
        (('A2', 'A4'), 1.7719368430701863),
    )
    for edge, angle in expected_angles:
        assert angles[edge] == approx(angle, rel=1e-5)

    # Unlike the angle checks, this one uses approx's default tolerance.
    improper = bb1['improperEdges'][('A1', 'A4')]
    assert improper == approx(2.065313144262336)
コード例 #6
0
ファイル: UniprotIO.py プロジェクト: stuckmke/biopython
 def __init__(self,
              elem,
              alphabet=Alphabet.ProteinAlphabet(),
              return_raw_comments=False):
     """Initialize the class.

     elem -- object kept as ``self.entry``.
     alphabet -- kept as ``self.alphabet``. The default instance is created
         once, when this function is defined, and is therefore shared by
         every call that omits the argument.
     return_raw_comments -- flag kept as ``self.return_raw_comments``.
     """
     self.entry = elem
     self.alphabet = alphabet
     self.return_raw_comments = return_raw_comments
コード例 #7
0
ファイル: msa.py プロジェクト: javicorvi/esmsagen
def conservation(msa_path):
    import numpy as np
    import scipy.stats as sc
    from Bio import AlignIO
    from Bio.Align import AlignInfo
    from Bio.Alphabet import IUPAC
    from Bio.SubsMat import FreqTable
    import Bio.Alphabet as Alphabet
    from Bio import motifs
    for filename in os.listdir(msa_path):
        if filename.endswith(".cluster"):
            alignment = AlignIO.read(msa_path + filename,
                                     "fasta",
                                     alphabet=Alphabet.ProteinAlphabet())
            columns_quantity = []
            columns_frequency = []
            #summary_align = AlignInfo.SummaryInfo(alignment)
            #pssm = summary_align.pos_specific_score_matrix()
            #print pssm
            for x in range(0, len(alignment[0].seq) - 1):
                column = alignment[:, x]
                quantity = letters
                for f in column:
                    print(f)
                    quantity[f] += 1
                double = 20 / len(alignment)
                print len(alignment)
                print(quantity)
                #frequency=list(map(lambda x: x/len(alignment), quantity))
                frequency = dict(
                    map(lambda (k, v): (k, v / len(alignment)),
                        quantity.iteritems()))
                print frequency
                columns_quantity.append(quantity)
                columns_frequency.append(frequency)
            print(columns_quantity)
            '''
            m = motifs.create(alignment,alphabet=Alphabet.ProteinAlphabet())
            print (m)
            
            alfa = summary_align.alignment._alphabet
            base_alpha = Alphabet._get_base_alphabet(alfa) 
            print(summary_align)
            print(alfa)
            print(base_alpha)
            data=summary_align.information_content(5,30)
            print(data)'''

    #n is the number of data points
    ''''n=10
コード例 #8
0
def reduceSeq(infile, outfile, alph):
    """Rewrite the FASTA records of ``infile`` in a reduced amino-acid alphabet.

    Each residue is looked up in the mapping table of the chosen reduced
    alphabet.  '*', '-' and 'U' are copied unchanged, and 'X' is replaced by
    a randomly chosen standard amino acid before the lookup.  Record ids and
    descriptions are carried over.

    Args:
        infile: Path of the FASTA file to read.
        outfile: Destination handed to ``SeqIO.write`` (FASTA format).
        alph: Name of the reduced alphabet: 'Murphy15', 'Murphy10',
            'Murphy8', 'Murphy4', 'PC5' or 'HPModel'.  With any other value
            the output uses a plain protein alphabet and only '*', '-' and
            'U' characters are carried over.

    Raises:
        KeyError: if a residue has no entry in the selected mapping table.
    """
    # Reduced alphabet name -> (alphabet class name, mapping table name),
    # both attributes of Alphabet.Reduced.  Resolved lazily so that only the
    # requested alphabet is touched.
    reduced_names = {
        'Murphy10': ('Murphy10', 'murphy_10_tab'),
        'Murphy15': ('Murphy15', 'murphy_15_tab'),
        'Murphy8': ('Murphy8', 'murphy_8_tab'),
        'Murphy4': ('Murphy4', 'murphy_4_tab'),
        'PC5': ('PC5', 'pc_5_table'),
        'HPModel': ('HPModel', 'hp_model_tab'),
    }
    standard_residues = 'ACDEFGHIKLMNPQRSTVWY'
    passthrough = ('*', '-', 'U')

    # Plain 'r': the 'U' flag was removed in Python 3.11, and text mode
    # already translates newlines on Python 3.
    with open(infile, 'r') as input_handle:
        records = list(SeqIO.parse(input_handle, "fasta"))

    reduced_records = []
    for record in records:
        if alph in reduced_names:
            class_name, table_name = reduced_names[alph]
            alphabet = getattr(Alphabet.Reduced, class_name)()
            table = getattr(Alphabet.Reduced, table_name)
        else:
            alphabet = Alphabet.ProteinAlphabet()
            table = None

        # Collect letters in a list and join once instead of growing a Seq
        # one residue at a time.
        letters = []
        for aa in record:
            if aa in passthrough:
                letters.append(aa)
                continue
            if aa == 'X':
                # random.sample() on a set raises TypeError since Python 3.11.
                aa = random.choice(standard_residues)
            if table is not None:
                letters.append(table[aa])

        reduced = SeqRecord(Seq(''.join(letters), alphabet))
        reduced.id = record.id
        reduced.description = record.description
        reduced_records.append(reduced)

    SeqIO.write(reduced_records, outfile, "fasta")
コード例 #9
0
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Return the molecular weight of a DNA, RNA or protein sequence.

    Every letter has to be unambiguous. Nucleotide sequences are assumed to
    have a 5' phosphate.

        - seq: String or Biopython sequence object.
        - seq_type: 'DNA', 'RNA' or 'protein'. The default (None) takes the
          type from the alphabet of the seq argument, or assumes DNA if the
          seq argument is a string.
        - double_stranded: Add the mass of the complementary strand as well?
          (Nucleotide sequences only.)
        - circular: Is the molecule circular (has no ends)?
        - monoisotopic: Use the monoisotopic mass tables?

    Raises TypeError if seq is neither a string nor a sequence object, or if
    its alphabet is not usable. Raises ValueError for an unknown seq_type, a
    seq_type that contradicts the alphabet, a letter missing from the weight
    table, or a double stranded protein.

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Or, with the sequence alphabet:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Step 1: work out which kind of molecule we are weighing.
    if isinstance(seq, (Seq, MutableSeq)):
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            detected_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            detected_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            detected_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            detected_type = 'protein'
            # The weight tables are keyed by one-letter codes; seq1 needs a
            # plain string as input.
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif isinstance(base_alphabet, Alphabet.Alphabet):
            detected_type = "DNA"  # backward compatibility
        else:
            raise TypeError("%s is not a valid alphabet for mass calculations"
                             % base_alphabet)
        if seq_type and detected_type != seq_type:
            raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                             % (seq_type, detected_type))
        seq_type = detected_type
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibility
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    # Step 2: minimum formatting - drop all whitespace and upper-case.
    letters = ''.join(str(seq).split()).upper()

    # Step 3: choose the weight table for this molecule type.
    if seq_type == 'DNA':
        weight_table = (IUPACData.monoisotopic_unambiguous_dna_weights
                        if monoisotopic
                        else IUPACData.unambiguous_dna_weights)
    elif seq_type == 'RNA':
        weight_table = (IUPACData.monoisotopic_unambiguous_rna_weights
                        if monoisotopic
                        else IUPACData.unambiguous_rna_weights)
    elif seq_type == 'protein':
        weight_table = (IUPACData.monoisotopic_protein_weights
                        if monoisotopic
                        else IUPACData.protein_weights)
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    water = 18.010565 if monoisotopic else 18.0153

    # Step 4: sum the letter weights; one water is subtracted per link
    # between neighbours, plus one more when the ends are joined.
    try:
        weight = sum(weight_table[letter] for letter in letters) \
            - (len(letters) - 1) * water
        if circular:
            weight -= water
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         %(e, seq_type))

    # Step 5: optionally add the complementary strand.
    if seq_type in ('DNA', 'RNA') and double_stranded:
        complement = str(Seq(letters).complement())
        complement_weight = sum(weight_table[letter] for letter in complement) \
            - (len(complement) - 1) * water
        weight += complement_weight
        if circular:
            weight -= water
    elif seq_type == 'protein' and double_stranded:
        raise ValueError('double-stranded proteins await their discovery')

    return weight
コード例 #10
0
# Concatenate every file in 'proteins/' into one temporary FASTA file, run
# MUSCLE on it and parse the Clustal-format output it writes to stdout.
# Text mode ('w') so that the str read from the input files can be written
# on Python 3 as well as Python 2.
fasta = tempfile.NamedTemporaryFile(mode='w')

idents = sorted(os.listdir('proteins'))
for ident in idents:
    # Close each input file as soon as it has been copied.
    with open('proteins/' + ident) as protein_file:
        fasta.write(protein_file.read())
# Make sure the aligner sees the complete file on disk.
fasta.flush()
print('Calculating multiple alignment...')
aligner = subprocess.Popen(
    [
        "/tmp/muscle3.8.31_i86linux64", "-clwstrict", "-in", fasta.name,
        "-out", "-"
    ],
    stdout=subprocess.PIPE,
    # Decode stdout to text so it can be wrapped in StringIO on Python 3.
    universal_newlines=True)
# stderr is not piped, so the second value is always None.
stdout, stderr = aligner.communicate()
align = AlignIO.read(StringIO(stdout),
                     'clustal',
                     alphabet=Alphabet.ProteinAlphabet())

from Bio.SubsMat import MatrixInfo


def score_match(pair, matrix):
    """Look up the score of ``pair`` in ``matrix``, trying both orientations.

    ``pair`` is used as the key when it is present in ``matrix``; otherwise
    the reversed pair (as a tuple) is used.  A KeyError is raised when the
    reversed pair is missing as well.
    """
    key = pair if pair in matrix else tuple(reversed(pair))
    return matrix[key]


def score_pairwise(seq1, seq2, matrix, gap_s, gap_e):
    score = 0
    gap = False
    for i in range(len(seq1)):