def readfile(filename):
    """
    Reads a utf-8 file and returns its tokens.

    The whole file content is passed to ``LM.tokenize`` using that
    function's default mode.

    :param filename: Name of file to be read.
    :returns: Whatever ``LM.tokenize`` returns for the file content.
    """
    # The context manager closes the handle even when reading or decoding
    # fails, which the previous explicit close() call did not guarantee.
    with codecs.open(filename, encoding='utf-8') as f:
        filedata = f.read()
    return LM.tokenize(filedata)
def _readfile(cls, filename):
    """
    Reads a utf-8 file and returns character tokens.

    The file content is tokenized with ``LM.tokenize(..., mode='c')``.

    :param filename: Name of file to be read.
    :returns: Whatever ``LM.tokenize`` returns for the file content.
    """
    # The context manager closes the handle even when reading or decoding
    # fails, which the previous explicit close() call did not guarantee.
    with codecs.open(filename, encoding='utf-8') as f:
        filedata = f.read()
    return LM.tokenize(filedata, mode='c')
def classify(self, text=u''):
    """
    Predicts the language of a given text.

    The text is normalized with ``self.lm.normalize``, tokenized with
    ``LM.tokenize`` in mode ``'c'`` and scored by ``self.lm.calculate``.

    :param text: Unicode text to be classified.
    :returns: ``'unk'`` when ``self.unk`` is set and
        ``self.lm.karbasa`` of the result is below ``self.min_karbasa``;
        otherwise the ``'calc_id'`` entry of the calculation result.
    """
    normalized = self.lm.normalize(text)
    char_tokens = LM.tokenize(normalized, mode='c')
    scores = self.lm.calculate(doc_terms=char_tokens)

    # Unknown-language detection disabled: always report the best match.
    if not self.unk:
        return scores['calc_id']

    # karbasa is only evaluated when unknown detection is enabled.
    if self.lm.karbasa(scores) < self.min_karbasa:
        return 'unk'
    return scores['calc_id']