Python Trigrams Examples

Programming Language: Python

Namespace/Package Name: Trigrams

Class/Type: Trigrams

Examples at hotexamples.com: 4

Python Trigrams - 4 examples found. These are the top-rated real-world Python examples of Trigrams.Trigrams, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

calculate_probabilities(2)

add_trigram(1)

add_trigrams_from_text(1)

create_trigrams(1)

eliminate_frequences(1)

Example #1

Show file

File: lid.py Project: ktisha/ebook-service

 def load_language_models(self):
     """Load every ``*.dat`` language model found in ``STAT_DIR``.

     The file name without its ``.dat`` suffix is used as the language
     name.  Each line that splits into exactly two tokens is read as
     ``<trigram> <probability>``; other lines are ignored.  The trigram is
     decoded from UTF-8 and lower-cased before being added to the model.

     The language name and its model are appended to ``self.languages`` and
     ``self.models`` together, after the file has been read completely, so
     the two lists stay index-aligned even if reading a file fails.
     """
     for x in listdir(STAT_DIR):
         if not x.endswith(".dat"):
             continue
         new_model = Trigrams()
         # The context manager closes the file even when a line fails to
         # parse (e.g. float() raising ValueError).
         with open(os.path.join(STAT_DIR, x)) as modelfile:
             for line in modelfile:
                 tokens = split(line)
                 if len(tokens) == 2:
                     trigram = lower(unicode(tokens[0], 'utf-8'))
                     probability = float(tokens[1])
                     new_model.add_trigram(trigram, probability)
         self.languages.append(x[:-4])  # strip the ".dat" suffix
         self.models.append(new_model)

Example #2

Show file

File: lid.py Project: ktisha/ebook-service

 def __init__(self):
     """Set up the identifier and load the language models.

     Creates the working ``Trigrams`` instance and the empty ``languages``
     and ``models`` lists, then calls ``load_language_models()`` to fill
     them.  Language models are stored in files named LANGUAGE_NAME
     followed by ``.dat``.
     """
     self.trigrams = Trigrams()
     # languages: list of loaded language models
     # models:    list with the trigram models
     self.languages, self.models = [], []
     self.load_language_models()

Example #3

Show file

File: lid.py Project: ktisha/ebook-service

class Lid:
    """The basic class for the Language Identification Library.

    Attributes:
        trigrams:  working ``Trigrams`` instance for the text being checked.
        languages: names of the loaded language models.
        models:    ``Trigrams`` model for each entry in ``languages``
                   (same index).
    """

    def __init__(self):
        """Create the identifier and load all available language models.

        The models are read by ``load_language_models()`` from ``STAT_DIR``.
        Each model is stored in a file named LANGUAGE_NAME followed by
        ``.dat``.
        """
        self.trigrams = Trigrams()
        self.languages = []  # list of loaded language models
        self.models = []     # list with the trigram models
        self.load_language_models()

    def load_language_models(self):
        """Load every ``*.dat`` language model found in ``STAT_DIR``.

        Each line that splits into exactly two tokens is read as
        ``<trigram> <probability>``; other lines are ignored.  The trigram
        is decoded from UTF-8 and lower-cased before being added.

        The language name and its model are appended together, after the
        file has been read completely, so ``languages`` and ``models`` stay
        index-aligned even if reading a file fails.
        """
        for x in listdir(STAT_DIR):
            if not x.endswith(".dat"):
                continue
            new_model = Trigrams()
            # The context manager closes the file even when a line fails
            # to parse (e.g. float() raising ValueError).
            with open(os.path.join(STAT_DIR, x)) as modelfile:
                for line in modelfile:
                    tokens = split(line)
                    if len(tokens) == 2:
                        trigram = lower(unicode(tokens[0], 'utf-8'))
                        probability = float(tokens[1])
                        new_model.add_trigram(trigram, probability)
            self.languages.append(x[:-4])  # strip the ".dat" suffix
            self.models.append(new_model)

    def checkText(self, text):
        """Check which language a text is.

        Returns:
            A ``(language, answer)`` tuple.  ``answer`` is a dict with the
            key ``'confidence'`` (as returned by ``find_best_language``) and
            the key ``'stat'``, which maps each language name to its
            deviation score from ``count_deviation()``.
        """
        self.trigrams.create_trigrams(text)
        self.trigrams.calculate_probabilities()
        result = self.count_deviation()
        language, confidence = find_best_language(self.languages, result)
        answer = {
            'confidence': confidence,
            'stat': dict(zip(self.languages, result)),
        }
        return language, answer
        # Disabled tie check, kept for reference:
        #     if self.is_results_equal(result):
        #         return '?', res

    @staticmethod
    def count_result_in_percents(results, num):
        """Return each value of ``results`` divided by ``num`` as a float.

        Declared as a static method because it takes no ``self``; this
        makes it callable both on the class and on an instance.
        """
        return [float(result) / float(num) for result in results]

    def count_deviation(self):
        """Score how far the current text is from every loaded model.

        For each trigram of the working ``self.trigrams`` the absolute
        difference between the model's value and the text's value is added
        to that model's score.  A trigram missing from a model adds 1, the
        maximum deviation.

        Returns:
            A list with one score per entry in ``self.languages``, in the
            same order as ``self.models``.
        """
        result = [0] * len(self.languages)
        observed = self.trigrams.trigrams
        for i, model in enumerate(self.models):
            known = model.trigrams
            for x in observed:
                if x in known:
                    result[i] += abs(known[x] - observed[x])
                else:
                    # otherwise set the resulting value to 1 = max. deviation
                    result[i] += 1
        return result

    def is_results_equal(self, results):
        """Return True when all values in ``results`` are equal.

        Empty and single-element sequences count as equal.
        """
        return all(results[x] == results[x + 1]
                   for x in range(len(results) - 1))

Example #4

Show file

File: lidtrainer.py Project: ktisha/ebook-service

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = "Alex Turkin <*****@*****.**>"



import sys, os.path, glob
from Trigrams import Trigrams
from lid import clean_punctuation_from_text

sys.path.insert(1,'../')
import chardet

if __name__ == "__main__":
    # Build one trigram model from every file matched by the glob patterns
    # given on the command line.
    myTrigrams = Trigrams()
    if len(sys.argv) > 1:
        for x in sys.argv[1:]:
            for y in glob.glob(os.path.normcase(x)):
                try:
                    # Only opening/reading is guarded; the context manager
                    # closes the handle even if read() fails.
                    with open(y) as f:
                        string = f.read()
                except IOError as err:
                    # Unreadable files are still skipped (best effort), but
                    # no longer silently.
                    sys.stderr.write("skipping %s: %s\n" % (y, err))
                    continue
                encoding = chardet.detect(string)['encoding']
                if encoding is None:
                    # chardet could not guess an encoding (e.g. for an empty
                    # file); unicode() would raise TypeError on None.
                    sys.stderr.write(
                        "skipping %s: encoding not detected\n" % y)
                    continue
                unistring = unicode(string, encoding)
                cleaned = clean_punctuation_from_text(unistring)
                myTrigrams.add_trigrams_from_text(cleaned)

        myTrigrams.eliminate_frequences(2)
        myTrigrams.calculate_probabilities()