Example #1
0
 def load_language_models(self):
     """Load every ``*.dat`` language model found in STAT_DIR.

     The file name without its ``.dat`` suffix is used as the language
     name.  Each line holding exactly two whitespace-separated tokens is
     read as ``<trigram> <probability>``; the trigram is decoded as UTF-8
     and lower-cased before being added to the model.  Lines with any
     other number of tokens are ignored.

     The language name and its model are appended to ``self.languages``
     and ``self.models`` together, and only after the file was parsed
     completely, so both lists always stay index-aligned.
     """
     for filename in listdir(STAT_DIR):
         if not filename.endswith(".dat"):
             continue
         model = Trigrams()
         # The context manager closes the file even if a line fails to parse.
         with open(os.path.join(STAT_DIR, filename)) as modelfile:
             for line in modelfile:
                 tokens = line.split()
                 if len(tokens) == 2:
                     trigram = unicode(tokens[0], 'utf-8').lower()
                     model.add_trigram(trigram, float(tokens[1]))
         self.languages.append(filename[:-4])
         self.models.append(model)
Example #2
0
 def __init__(self):
     """Build the identifier and load all available language models.

     A fresh Trigrams instance is stored in ``self.trigrams``, the
     language-name and model lists start out empty, and
     load_language_models() is called right away to fill them.
     Language models live in files named LANGUAGE_NAME followed by .dat.
     """
     self.trigrams = Trigrams()
     # Names of the loaded languages and their trigram models.
     self.languages, self.models = [], []
     self.load_language_models()
Example #3
0
class Lid:
    """The basic class for the Language Identification Library.

    Attributes:
        trigrams:  Trigrams instance used by checkText() for the text
                   being checked.
        languages: names of the loaded language models.
        models:    trigram models, index-aligned with ``languages``.
    """

    def __init__(self):
        """Create the identifier and load all language models.

        The models are read by load_language_models() from files named
        LANGUAGE_NAME followed by .dat inside STAT_DIR.
        """
        self.trigrams = Trigrams()
        self.languages = []  # list of loaded language models
        self.models = []     # list with the trigram models
        self.load_language_models()

    def load_language_models(self):
        """Load every ``*.dat`` language model found in STAT_DIR.

        Each line holding exactly two whitespace-separated tokens is read
        as ``<trigram> <probability>``; the trigram is decoded as UTF-8
        and lower-cased.  Lines with any other number of tokens are
        ignored.  A language and its model are registered together, and
        only after the file was parsed completely, so ``self.languages``
        and ``self.models`` stay index-aligned.
        """
        for filename in listdir(STAT_DIR):
            if not filename.endswith(".dat"):
                continue
            model = Trigrams()
            # The context manager closes the file even if parsing fails.
            with open(os.path.join(STAT_DIR, filename)) as modelfile:
                for line in modelfile:
                    tokens = line.split()
                    if len(tokens) == 2:
                        trigram = unicode(tokens[0], 'utf-8').lower()
                        model.add_trigram(trigram, float(tokens[1]))
            self.languages.append(filename[:-4])
            self.models.append(model)

    def checkText(self, text):
        """Check which language a text is.

        Returns a ``(language, answer)`` pair.  ``language`` and the
        confidence value come from find_best_language(); ``answer`` is a
        dict with the keys ``'confidence'`` and ``'stat'``, the latter
        mapping every loaded language name to its deviation as computed
        by count_deviation().
        """
        self.trigrams.create_trigrams(text)
        self.trigrams.calculate_probabilities()
        result = self.count_deviation()
        language, confidence = find_best_language(self.languages, result)
        answer = {
            'confidence': confidence,
            'stat': dict(zip(self.languages, result)),
        }
        # NOTE: is_results_equal() is not consulted here; an earlier draft
        # returned '?' as the language when all deviations were equal.
        return language, answer

    @staticmethod
    def count_result_in_percents(results, num):
        """Return each entry of ``results`` divided by ``num`` as a float.

        Declared static because it takes no ``self``; this makes it
        callable both as ``Lid.count_result_in_percents(...)`` and through
        an instance.  Raises ZeroDivisionError when ``num`` is zero.
        """
        return [float(result) / float(num) for result in results]

    def count_deviation(self):
        """Return one accumulated deviation per loaded model.

        For every trigram of ``self.trigrams`` the absolute difference
        between the model's value and the text's value is added to that
        model's total; a trigram missing from the model adds 1.
        """
        result = [0] * len(self.languages)
        for trigram, observed in self.trigrams.trigrams.items():
            for i, model in enumerate(self.models):
                if trigram in model.trigrams:
                    result[i] += abs(model.trigrams[trigram] - observed)
                else:
                    # otherwise set the resulting value to 1 = max. deviation
                    result[i] += 1
        return result

    def is_results_equal(self, results):
        """Return True when all entries of ``results`` are equal.

        An empty or single-element sequence counts as equal.
        """
        for current, following in zip(results, results[1:]):
            if current != following:
                return False
        return True
Example #4
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = "Alex Turkin <*****@*****.**>"



import sys, os.path, glob
from Trigrams import Trigrams
from lid import clean_punctuation_from_text

sys.path.insert(1,'../')
import chardet

if __name__ == "__main__":
    # Build one trigram model from all files matched by the command-line
    # arguments; each argument is treated as a glob pattern.
    myTrigrams = Trigrams()
    if len(sys.argv) > 1:
        for x in sys.argv[1:]:
            for y in glob.glob(os.path.normcase(x)):
                try:
                    # The context manager closes the file after reading.
                    with open(y) as f:
                        string = f.read()
                except IOError:
                    # Best effort: unreadable files are skipped.
                    continue
                encoding = chardet.detect(string)['encoding']
                if encoding is None:
                    # chardet could not guess an encoding (e.g. empty
                    # file); decoding with None would raise TypeError.
                    continue
                unistring = unicode(string, encoding)
                cleaned = clean_punctuation_from_text(unistring)
                myTrigrams.add_trigrams_from_text(cleaned)

        myTrigrams.eliminate_frequences(2)
        myTrigrams.calculate_probabilities()