async def detect_language(
        self, text: str) -> Optional[Union[Language, List[Language]]]:
    """Detect which language ``text`` names or is written in.

    Two strategies are tried in order, sharing a single HTTP session:

    1. Entities yielded by ``self._detect_entities`` are inspected.
       A ``language``/``country``/``country_flag`` entity whose value
       resolves through ``iso639.find`` to something other than
       "Undetermined" is returned immediately. Otherwise, for
       ``country``/``country_flag`` entities, the value is looked up in
       ``available_langs`` and every language code that resolves is
       returned as a list.
    2. If no entity produced a result, ``self.tr.detect_language`` is asked
       for the language of the text itself.

    Args:
        text: The raw user message.

    Returns:
        A single ``iso639`` language record, a non-empty list of such
        records (country match), or ``None`` when nothing could be
        determined.
    """
    async with ClientSession() as http:
        # 1) Try to detect entity using rasa nlu
        async for each_entity in self._detect_entities(text, http):
            if each_entity.entity not in ('language', 'country',
                                          'country_flag'):
                continue

            raw_language_obj = iso639.find(each_entity.value)
            # "Undetermined" is treated as a false positive from rasa
            # (maybe add some strings comparison later)
            if raw_language_obj and raw_language_obj['name'] != "Undetermined":
                logging.info(
                    f"NLU model detected language: ({text})[{raw_language_obj['name']}]"
                )
                return raw_language_obj

            if each_entity.entity in ('country', 'country_flag'):
                # The entity value is a country name; our dict must have a
                # mapping from that country to its language codes.
                langs = available_langs.get(each_entity.value)
                if langs:
                    logging.info(
                        f"NLU model detected country: ({text})[{each_entity.entity}]"
                    )
                    resolved = [
                        lang_obj
                        for lang_obj in (iso639.find(lang) for lang in langs)
                        if lang_obj and lang_obj['name'] != "Undetermined"
                    ]
                    # Only report a country match when at least one of its
                    # languages resolved; an empty list would otherwise be
                    # returned as if detection had succeeded.
                    if resolved:
                        return resolved

        # 2) Detect what is the language of the speaker.
        # This used to be the ``else`` clause of the loop above; the loop
        # has no ``break``, so that clause ran whenever the loop finished
        # without returning -- exactly what plain sequential code does.
        language = await self.tr.detect_language(text, http)
        logging.info(
            f"Translator detected language: ({text})[{language}]")
        # If user sent message in his own language and we figured out
        # what it is - case closed
        language = iso639.find(language)
        if language and language['name'] != "Undetermined":
            return language

    return None
def bib2std(code):
    """
    Map a bibliographic-variant ISO 639-2 three letter code onto the code
    used by the language detectors.

    The code is looked up with ``iso639.find(iso639_2=code)``:

    * no match          -> the original ``code`` is returned untouched;
    * match with an ISO 639-1 entry -> that ISO 639-1 code is returned;
    * match without one -> the terminological ISO 639-2 code is returned.
    """
    record = iso639.find(iso639_2=code)

    if not record:
        # Lookup failed: hand the code back untranslated. This may be a
        # discontinued code like scc for Serbian (instead of the now
        # standard srp), since these don't appear to be included in the
        # package.
        return code

    if u'iso639_1' in record:
        return record[u'iso639_1']

    return record[u'iso639_2_t']
async def entry(self, context: Context, user: User, db):
    """Open the language dialogue: confirm a known language or ask for one.

    ``bq_state`` in the user's context is always reset to 1. Then:

    * If the user has no ``language_state`` yet and the client supplied a
      ``lang_code`` that ``iso639.find`` resolves to something other than
      "Undetermined", that language is stored on the user and activated via
      ``self.set_language``. A yes/no/stop confirmation is sent and
      ``language_state`` becomes 4.
    * Otherwise the "choose_lang" prompt is sent (with a "skip" button when
      a ``lang_code`` is present) and ``language_state`` becomes 1.

    Returns ``base_state.OK`` in both cases.
    """
    session = user['context']
    session['bq_state'] = 1
    request = context['request']

    # Special case for telegram-like client side language code entity
    if session.get('language_state') is None and request['user']['lang_code']:
        detected = iso639.find(request['user']['lang_code'])
        if detected and detected['name'] != "Undetermined":
            # Update current context
            user['language'] = detected['iso639_1']
            self.set_language(detected['iso639_1'])

            # Ask if user wants to continue with the language
            request['message']['text'] = \
                self.strings["app_confirm_language"].format(detected['native'])
            request['has_buttons'] = True
            request['buttons_type'] = "text"
            request['buttons'] = [
                {"text": self.strings[label]}
                for label in ('yes', 'no', 'stop')
            ]
            session['language_state'] = 4
            self.send(user, context)
            return base_state.OK

    # Send language message
    request['message']['text'] = self.strings["choose_lang"]

    # Add confirmation button to skip
    if request['user']['lang_code']:
        session['lang_code'] = request['user']['lang_code']
        request['has_buttons'] = True
        request['buttons_type'] = "text"
        skip_label = self.strings['skip_lang'].format(
            request['user']['lang_code'])
        request['buttons'] = [{"text": skip_label}]

    # Set user context as 'Was Asked Language Question'
    session['language_state'] = 1

    # Don't forget to add task
    self.send(user, context)
    return base_state.OK
def turn_into_dictionary(input_data: List[str]) -> dict:
    """Transform a list with fic data into a dictionary.

    Every item of ``input_data`` is classified and stored under one key:

    * ``"Key: value"`` (anything containing a colon) -> ``Key`` mapped to
      the stripped value. A value made only of digit groups separated by
      commas (``"1,234"``) is converted to ``int``.
    * ``"OC"`` -> ``"Characters"``.
    * ``"Complete"`` -> ``"Status"``.
    * a language known to ``iso639`` -> ``"Language"`` (its ISO name).
    * a ``/``-separated item containing a member of ``GENRES`` ->
      ``"Genres"`` (list of the ``/``-separated parts).
    * anything else -> ``"Characters"`` (list of comma-separated names).

    Later items overwrite earlier ones that map to the same key.

    Args:
        input_data: The raw metadata strings.

    Returns:
        The dictionary built from the classified items.

    Raises:
        TypeError: If ``input_data`` is not a list.
    """
    if not isinstance(input_data, list):
        raise TypeError(f"'{type(input_data)}' cannot be used here")

    result_dictionary = {}
    for data in input_data:
        if ":" in data:
            # Prefer the conventional ": " separator, but fall back to a
            # bare ":" so items such as "Key:value" or "Key:" no longer
            # raise IndexError. Splitting only once keeps values that
            # themselves contain the separator intact.
            separator = ": " if ": " in data else ":"
            raw_key, _, raw_val = data.partition(separator)
            key = raw_key.strip()
            val = raw_val.strip()
            if match(r"^\d+(,\d+)*$", val):
                val = int(sub(",", "", val))
        elif data == "OC":
            key = "Characters"
            val = data
        elif data == "Complete":
            key = "Status"
            val = data
        else:
            lang = iso639.find(language=data)
            if lang:
                key = "Language"
                val = lang["name"]
            else:
                # Default to a character list; switch to genres as soon as
                # any "/"-separated part is a known genre.
                key = "Characters"
                val = [x.strip() for x in data.split(",")]
                genre_parts = data.split("/")
                if any(part in GENRES for part in genre_parts):
                    key = "Genres"
                    val = genre_parts
        result_dictionary[key] = val
    return result_dictionary
# Emit one output row for every title whose key was selected in `keep`.
if key in keep:
    country_code = line['region']
    language_code = line['language']
    original = line['isOriginalTitle']

    # The literal two-character string backslash-N marks a missing value
    # and is blanked out; real codes are recorded in the lookup tables.
    if country_code == '\\N':
        country_code = ''
    else:
        try:
            country[country_code] = iso3166.countries.get(country_code).name
        except Exception:
            # tgg22 : I was not able to fix these
            # (`except Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit are not swallowed.)
            country[country_code] = "Not an ISO-3166 country?"

    if language_code == '\\N':
        language_code = ''
    else:
        try:
            language[language_code] = iso639.find(language_code)['name']
        except Exception:
            # Unknown to iso639: fall back to the hand-maintained table
            # instead of "Not an ISO-639 language?".
            language[language_code] = fix_iso639[language_code]

    # Normalise the original-title flag to '', "false" or "true".
    if original == '\\N':
        original = ''
    elif original == "0":
        original = "false"
    else:
        original = "true"

    # Per-key running row number, starting at 0 for the first row.
    rowkey[key] = rowkey[key] + 1 if key in rowkey else 0

    print(bar.join([key, str(rowkey[key]), line['title'], country_code,
                    language_code, original]), file=has_alt_out)
def is_valid_language(item: str) -> bool:
    """Return True when ``iso639.find(whatever=item)`` yields a match.

    A result of ``None`` from the lookup is the only outcome reported as
    invalid.
    """
    found = iso639.find(whatever=item)
    return found is not None
for entry in data['entries']: # Read the article article = Article(entry.link) article.download() article.parse() document = article.text words = word_tokenize(document) print "reading article", article.title # Remove punctuation words = [word.lower() for word in words if word.isalpha()] # Get language try: language = iso639.find(detect(entry.title))['name'].lower() stemmer = SnowballStemmer(language) except Exception, e: print str(e) stemmer = SnowballStemmer("english") print "stem language is set to english" # Stem text for word in words: word = stemmer.stem(word) document = ' '.join(words) for keyword in keywords: relevance = Document.get_relevance(document, stemmer.stem(keyword)) if relevance > 0: