Exemple #1
0
    def get_response(self, text):
        """Ask the SimSimi web service for a reply to ``text``.

        The language of ``text`` is guessed and sent as the ``lc`` request
        parameter. Only ``ch``, ``en`` and ``fr`` are sent as detected; any
        other guess is replaced by ``self.default_lang``.

        Args:
            text: The message to send to the service.

        Returns:
            The ``response`` field of the JSON reply.

        Raises:
            SimSimiException: If the ``result`` field of the reply is not 100.
        """
        lang = guessLanguage(text)
        if lang == 'UNKNOWN':
            # Undetectable input: treat pure-ASCII text as English.
            lang = 'en' if is_ascii(text) else self.default_lang
        elif lang == 'zh':
            # The service names Chinese 'ch', not 'zh'
            # (see http://developer.simsimi.com/lclist). This must be an
            # assignment; a bare comparison would silently do nothing.
            lang = 'ch'

        # only to handle en, fr, zh in my case
        if lang not in ('ch', 'en', 'fr'):
            lang = self.default_lang

        payload = {
            'key': self.__key,
            'text': text,
            'lc': lang,
            'ft': self.filter_rate,
        }
        resp = requests.get(self.url, params=payload)
        print(resp.url)
        resp_json = resp.json()
        if resp_json['result'] != 100:
            raise SimSimiException(resp_json['result'], resp_json['msg'])
        return resp_json['response']
def set_detected_language(context, event):
    """Set the language of ``context`` to the one guessed from its text.

    Nothing happens unless ``context`` has both a ``SearchableText`` and a
    ``Language`` attribute. The language is only changed when the guess is
    not ``UNKNOWN`` and differs from ``context.Language()``.

    Args:
        context: The content object to inspect and possibly update.
        event: Not used by this function.
    """
    if not (hasattr(context, 'SearchableText') and hasattr(context, 'Language')):
        return

    text = context.SearchableText().strip()
    # Remove the object id as a whole prefix. str.lstrip(chars) treats its
    # argument as a *set of characters* and could eat the beginning of the
    # real text whenever it starts with letters that also occur in the id.
    obj_id = context.id
    if obj_id and text.startswith(obj_id):
        text = text[len(obj_id):]
    text = text.decode('utf-8', 'ignore')

    language = guessLanguage(text)
    logger.debug('Detected language %s', language)
    if language != context.Language() and language != UNKNOWN:
        logger.debug('Set language %s', language)
        context.setLanguage(language)
Exemple #3
0
    def setupclipboard(self, clipboard):
        """Choose a voice language for ``clipboard`` and start playing it.

        The guessed language is used when it is listed in ``self.languages``;
        otherwise the first entry of ``self.languages`` is used. The chosen
        language is stored in ``self.speaker.voice["language"]`` and passed
        to ``text_substitution`` together with the clipboard content.

        Args:
            clipboard: The clipboard content to speak.
        """
        detected = guessLanguage(clipboard)
        voice_lang = detected if detected in self.languages else self.languages[0]
        self.speaker.voice["language"] = voice_lang

        # Decode only when running on Python 3 and raw bytes were handed in.
        if sys.version_info[0] >= 3 and type(clipboard) == bytes:
            clipboard = clipboard.decode('utf-8')

        spoken = text_substitution(clipboard, voice_lang)
        if self.dirname is None:
            self.speaker.add_callback(self.clipcallback)
        self.speaker.play(spoken)
Exemple #4
0
def index(page=1):
    """Render the home page, or store a newly submitted post.

    When the form validates, a ``Post`` is created with the guessed language
    of its body (``'en'`` if the guess is ``'UNKNOWN'`` or longer than five
    characters), committed, and the client is redirected to ``index``.
    Otherwise the requested page of the current user's followed posts is
    rendered with ``index.html``.

    Args:
        page: Page number of followed posts to show.
    """
    form = PostForm()

    if not form.validate_on_submit():
        posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
        return render_template('index.html', title='Home', form=form, posts=posts)

    body = form.post.data
    language = guessLanguage(body)
    if language == 'UNKNOWN' or len(language) > 5:
        language = 'en'

    new_post = Post(
        body=form.post.data,
        timestamp=datetime.utcnow(),
        author=g.user,
        language=language,
    )
    db.session.add(new_post)
    db.session.commit()
    flash('Your post is now live!')
    return redirect(url_for('index'))
Exemple #5
0
    def test_guess(self):
        """Check the guessed language of one sample text per language.

        Each entry of ``cases`` pairs a sample with the value expected from
        ``guessLanguage``. The tag/name/id/info variants are then checked
        against a single French sample.
        """
        cases = [
            ("This is a test of the language checker", "en"),
            ("Verifions que le détecteur de langues marche", "fr"),
            ("Sprawdźmy, czy odgadywacz języków pracuje", "pl"),
            ("авай проверить  узнает ли наш угадатель русски язык", "ru"),
            ("La respuesta de los acreedores a la oferta argentina para salir del default no ha sido muy positiv", "es"),
            ("Сайлау нәтижесінде дауыстардың басым бөлігін ел премьер министрі Виктор Янукович пен оның қарсыласы, оппозиция жетекшісі Виктор Ющенко алды.", "kk"),  # Kazakh
            ("милиция ва уч солиқ идораси ходимлари яраланган. Шаҳарда хавфсизлик чоралари кучайтирилган.", "uz"),  # uzbek
            ("көрбөгөндөй элдик толкундоо болуп, Кокон шаарынын көчөлөрүндө бир нече миң киши нааразылык билдирди.", "ky"),  # kyrgyz
            ("yakın tarihin en çekişmeli başkanlık seçiminde oy verme işlemi sürerken, katılımda rekor bekleniyor.", "tr"),
            ("Daxil olan xəbərlərdə deyilir ki, 6 nəfər Bağdadın mərkəzində yerləşən Təhsil Nazirliyinin binası yaxınlığında baş vermiş partlayış zamanı həlak olub.", "az"),  # Azerbaijani

            (" ملايين الناخبين الأمريكيين يدلون بأصواتهم وسط إقبال قياسي على انتخابات هي الأشد تنافسا منذ عقود", "ar"),
            ("Американське суспільство, поділене суперечностями, збирається взяти активну участь у голосуванні", "uk"),  # ukrainian
            ("Francouzský ministr financí zmírnil výhrady vůči nízkým firemním daním v nových členských státech EU", "cs"),  # czech
            ("biće prilično izjednačena, sugerišu najnovije ankete. Oba kandidata tvrde da su sposobni da dobiju rat protiv terorizma", "hr"),  # croatian
            (" е готов да даде гаранции, че няма да прави ядрено оръжие, ако му се разреши мирна атомна програма", "bg"),  # bulgarian
            ("на јавното мислење покажуваат дека трката е толку тесна, што се очекува двајцата соперници да ја прекршат традицијата и да се појават и на самиот изборен ден.", "mk"),  # macedonian
            ("în acest sens aparţinînd Adunării Generale a organizaţiei, în ciuda faptului că mai multe dintre solicitările organizaţiei privind organizarea scrutinului nu au fost soluţionate", "ro"),  # romanian
            ("kaluan ditën e fundit të fushatës në shtetet kryesore për të siguruar sa më shumë votues.", "sq"),  # albanian
            ("αναμένεται να σπάσουν παράδοση δεκαετιών και να συνεχίσουν την εκστρατεία τους ακόμη και τη μέρα των εκλογών", "el"),  # greek
            (" 美国各州选民今天开始正式投票。据信,", "zh"),  # chinese
            (" Die kritiek was volgens hem bitter hard nodig, omdat Nederland binnen een paar jaar in een soort Belfast zou dreigen te veranderen", "nl"),  # dutch
            ("På denne side bringer vi billeder fra de mange forskellige forberedelser til arrangementet, efterhånden som vi får dem ", "da"),  # danish
            ("Vi säger att Frälsningen är en gåva till alla, fritt och för intet.  Men som vi nämnt så finns det två villkor som måste", "sv"),  # swedish
            ("Nominasjonskomiteen i Akershus KrF har skviset ut Einar Holstad fra stortingslisten. Ytre Enebakk-mannen har plass p Stortinget s lenge Valgerd Svarstad Haugland sitter i", "nb"),  # norwegian
            ("on julkishallinnon verkkopalveluiden yhteinen osoite. Kansalaisten arkielämää helpottavaa tietoa on koottu eri aihealueisiin", "fi"),  # finnish
            ("Ennetamaks reisil ebameeldivaid vahejuhtumeid vii end kurssi reisidokumentide ja viisade reeglitega ning muu praktilise informatsiooniga", "et"),  # estonian
            ("Hiába jön létre az önkéntes magyar haderő, hiába nem lesz többé bevonulás, változatlanul fennmarad a hadkötelezettség intézménye", "hu"),  # hungarian
            ("հարաբերական", "hy"),  # armenian
            ("Hai vấn đề khó chịu với màn hình thường gặp nhất khi bạn dùng laptop là vết trầy xước và điểm chết. Sau đây là vài cách xử lý chú", "vi"),
            ("ii", UNKNOWN),

            # This text has a mix of Hiragana, Katakana and CJK which requires the fix for issue:3 to classify correctly
            ("トヨタ自動車、フィリピンの植林活動で第三者認証取得 トヨタ自動車(株)(以下、トヨタ)は、2007年9月よりフィリピンのルソン島北部に位置するカガヤン州ペニャブラン", 'ja'),
        ]

        # assertEqual rather than the deprecated assertEquals alias, which
        # no longer exists in Python 3.12.
        for text, name in cases:
            self.assertEqual(name, guessLanguage(text))

        text = "Verifions que le détecteur de langues marche"
        self.assertEqual('fr', guessLanguageTag(text))
        self.assertEqual('French', guessLanguageName(text))
        self.assertEqual(26150, guessLanguageId(text))
        self.assertEqual(('fr', 26150, 'French'), guessLanguageInfo(text))
Exemple #6
0
    def _guess_lang(self, wordcontext=None):
        """Return the guessed language, computing it when none is stored yet.

        The guess is made from ``self._corpus`` or, when that is empty, from
        ``wordcontext``. The result is stored in ``self._lang``.

        Args:
            wordcontext: Fallback text used when ``self._corpus`` is empty.

        Returns:
            The stored or freshly guessed language, or ``None`` when the
            guess equals ``LANG_UNKNOWN``.
        """
        if self._lang:
            return self._lang

        words = self._corpus or wordcontext
        lang = guess_language.guessLanguage(words)

        log.info('Guessed lang: %s', lang)

        # Test the fresh guess: self._lang is always falsy at this point, so
        # comparing it with LANG_UNKNOWN could never match and the unknown
        # marker would be stored as if it were a real language.
        if lang == LANG_UNKNOWN:
            lang = None

        self._lang = lang
        return self._lang
def setLanguage(self):
    """Guess and set the language of every ``File`` found in the catalog.

    For each catalog result the object's searchable text (without its
    leading id) is passed to ``guessLanguage``. When the guess differs from
    the object's current language the difference is logged, and unless the
    guess is ``UNKNOWN`` the new language is set on the object.

    Args:
        self: Object providing ``portal_catalog``.
    """
    brains = self.portal_catalog(portal_type='File')
    total = len(brains)
    updated = 0
    for position, brain in enumerate(brains, 1):
        context = brain.getObject()

        text = context.SearchableText().strip()
        # Remove the object id as a whole prefix. str.lstrip(chars) treats
        # its argument as a *set of characters* and could eat the beginning
        # of the real text.
        obj_id = context.id
        if obj_id and text.startswith(obj_id):
            text = text[len(obj_id):]
        text = text.decode('utf-8', 'ignore')

        language = guessLanguage(text)
        current = context.Language()
        if language == current:
            continue

        logger.info('%i/%i' % (position, total))
        logger.info('Content language %s' % current)
        logger.info('Detected language %s' % language)
        if language != UNKNOWN:
            logger.info('Set language %s' % language)
            context.setLanguage(language)
            updated += 1
    logger.info('Set language completed')
    logger.info('%i language settings updated' % updated)
    def __init__(self, item_data):
        """ Populates variables by parsing a provided dictionary.

        The language is guessed from the item summary; when the guess fails,
        the language declared in ``item_data["title_detail"]`` is used. A
        language missing from ``self.language_code`` is replaced by the
        default language and a warning is logged.

        Args:
            item_data (feedparser.FeedParserDict): A dictionary full of item information.

        """
        abstract = BeautifulSoup.BeautifulSoup(item_data["summary"]).text

        language = guessLanguage(abstract)
        # A failed guess is reported as the truthy string 'UNKNOWN', so a
        # plain ``guess or fallback`` would never reach the feed's language.
        if not language or language == 'UNKNOWN':
            language = item_data.get("title_detail", {}).get("language")

        try:
            self.language = self.language_code[language]
        except KeyError:
            self.language = self.language_code[DEFAULT_LANGUAGE_CODE]
            logging.warning('language code "%s" not in the list: %s.',
                            language, ", ".join(self.language_code))
            logging.warning('the abstract was: %s', abstract)
            logging.warning('continue with the default language "%s"', self.language)

        self.webpage_url = item_data["link"]
        self.published_date = item_data["published"]
        self.title = item_data["title"]
        self.abstract = abstract

        # hash from the title and the feed url
        self.id = Item.get_id(item_data)

        self.webpage_text = u""
        if ACTIVE_ACTICLE_EXTRACTOR:
            try:
                # extract the text of the linked web page.
                extractor = Extractor(extractor='ArticleExtractor', url=self.webpage_url)
                self.webpage_text = extractor.getText()
            except Exception as er:
                # Best effort: the item stays usable without the page text.
                logging.warning("can't extract the article")
                logging.warning('url was "%s"', self.webpage_url)
                logging.debug(er)
Exemple #9
0
from sys import argv
from guess_language import guess_language

# Usage: <script> <path>. Prints the guessed language of the given file.
script, path = argv

# Only the first three lines are used as the sample; the context manager
# makes sure the file handle is closed again.
with open(path, 'r') as f:
    text = "".join(f.readline() for _ in range(3))

language = guess_language.guessLanguage(text)
print(language)