Beispiel #1
0
class TestLangID(unittest.TestCase):

    def setUp(self):
        self.lm1 = LangID(unk=False)
        self.lm2 = LangID(unk=True)
        self.lm1.train()
        self.lm2.train()

    def test_lang_set(self):
        lang_set = self.lm1.get_lang_set()
        lang_set.sort()
        expected_lang_set = ['en','es','ar','pt']
        expected_lang_set.sort()       
        self.assertEqual(lang_set,expected_lang_set)

    def test_classify_en(self):
        lang = self.lm1.classify(u'hello world')
        self.assertEqual(lang,'en')

    def test_classify_es(self):
        lang = self.lm1.classify(u'hola mis amigos')
        self.assertEqual(lang,'es')

    def test_classify_unk(self):
        lang1 = self.lm1.classify(u'this is la fiesta del mundo')
        self.assertEqual(lang1,'en')
        lang2 = self.lm2.classify(u'this is la fiesta del mundo')
        self.assertEqual(lang2,'unk')
class GetLanguageDyslBolt(BasicBolt):
  """Bolt that labels the text of each incoming tuple with a language."""

  def __init__(self, *args, **kwargs):
    """Build and train the classifier once, when the bolt is created.

    Positional and keyword arguments are accepted but not used, and the
    BasicBolt initializer is not invoked here.
    """
    classifier = LangID()
    self.l = classifier
    classifier.train()

  def process(self, tup):
    """Classify ``tup.values[1]`` and emit ``[tup.values[0], language]``."""
    language = self.l.classify(tup.values[1])
    storm.emit([tup.values[0], language])
Beispiel #3
0
 def setUp(self):
     """Create one LangID per ``unk`` setting and train both."""
     self.lm1, self.lm2 = LangID(unk=False), LangID(unk=True)
     for model in (self.lm1, self.lm2):
         model.train()
 def __init__(self, *args, **kwargs):
   """Create the LangID classifier and train it up front.

   The positional and keyword arguments are accepted but ignored; no
   base-class initializer is called.
   """
   classifier = LangID()
   self.l = classifier
   classifier.train()
Beispiel #5
0
def main():
    """Run the command-line interface.

    Depending on the parsed arguments this does exactly one of the
    following, checked in this order:

    * ``--version``: print the package version.
    * ``--list-langs``: print the languages of the trained model.
    * ``--lang LANG`` with input text: add the text as a training sample
      for LANG and save the training samples.
    * input text only: classify it and print the detected language.
    * nothing to do: print the help text and exit with a non-zero status.

    Successful actions write to stdout and exit with status 0.
    """
    parser = ArgumentParser(description='Do you speak London? A library for Natural Language Identification.')
    parser.add_argument('--version', action='store_true', help='Show version')
    parser.add_argument('--list-langs', action='store_true', help='List supported languages in training data')
    # The help text used to be a copy of the positional argument's help.
    parser.add_argument('--unk', choices=['y', 'n'], default='n',
                        help='Enable the classifier\'s "unk" (unknown language) mode')
    parser.add_argument('--corpus', default='', help='Specify path to custom training-set')
    parser.add_argument('--lang', help='Add training sample for the language specified')
    parser.add_argument('input', nargs='*', help='Input text to classify')
    args = parser.parse_args()

    if args.version:
        # sys.exit(<str>) would print to stderr and exit with status 1,
        # so print explicitly and exit cleanly instead.
        print(__version__)
        sys.exit()

    # argparse restricts --unk to 'y' or 'n'.
    unk = args.unk != 'n'
    input_text = decode_input(args.input)

    if not (args.list_langs or input_text):
        # Nothing to do: show usage and signal the error via the exit status.
        parser.print_help()
        sys.exit('\n')

    # Every remaining action needs a trained model.
    classifier = LangID(unk=unk)
    classifier.train(root=args.corpus)

    if args.list_langs:
        print('Languages: [' + '-'.join(classifier.get_lang_set()) + ']')
        sys.exit()
    elif args.lang:
        # input_text is known to be non-empty here (see the guard above).
        classifier.add_training_sample(text=input_text, lang=args.lang)
        classifier.save_training_samples()
        print('Training Sample for "%s" added successfully.\n' % args.lang)
        sys.exit()
    else:
        lang = classifier.classify(input_text)
        # Single-argument print calls behave the same on Python 2 and 3.
        print('Input text: %s' % (input_text,))
        print('Language: %s' % (lang,))