def get_lyric_lang(trans_type, code, long=False):
    """Render a human-readable label for a lyric translation.

    Falls back to the bare ``trans_type`` when ``code`` is not a
    recognised ISO-639 code (``iso639.to_name`` raises ValueError).
    """
    try:
        name = iso639.to_name(code)
    except ValueError:
        return trans_type
    if long:
        return f'{name} ({trans_type})'
    return f'[{trans_type[:1]}] {name}'
def __init__(self, file_name, part_of_speech_filter="", extra_filter=""):
    """Load a filtered dictionary for the configured language pair.

    file_name: path of the dictionary file to load.
    part_of_speech_filter / extra_filter: optional filter strings
    forwarded to FilteredDictionary.
    """
    config_ini = ConfigIni.getInstance()
    # NOTE(review): `language` is fetched but never used; kept in case
    # getLanguage() has side effects — TODO confirm and drop.
    language = config_ini.getLanguage()
    base_language = to_name(config_ini.getBaseLanguage()).lower()
    learning_language = to_name(config_ini.getLearningLanguage()).lower()
    # BUG FIX: the original two assignments ended with stray commas,
    # turning these flags into one-element tuples (always truthy).
    self.say_out = config_ini.isSayOut()
    self.show_pattern = config_ini.isShowPattern()
    self.show_note = config_ini.isShowNote()
    self.myFilteredDictionary = FilteredDictionary(file_name, base_language,
                                                   learning_language,
                                                   part_of_speech_filter,
                                                   extra_filter)
def make_title_page(self) -> None:
    """
    Parses the main page for information about the story and author.

    Populates ``self.metadata`` (title, author, dates, language, word
    count, summary, ...) from the already-fetched ``self.main_page`` soup.
    """
    # Summary/stats box of the story page.
    _header = self.main_page.find(
        "div", class_="storysummary formbody defaultcolors"
    )
    # Author link; its href path identifies the author.
    _author = self.main_page.find("a", href=compile(r"^/AuthorStories"))
    # The last table row of the header holds the stats cells; strip
    # non-breaking spaces before feeding them positionally to Header.
    _data = Header(
        *[
            sub(r"\xa0", " ", x.text.strip())
            for x in _header.table.find_all("tr")[-1].find_all("td")
        ]
    )
    self.metadata.title = self.main_page.find("h2").string.strip()
    if not self.metadata.chapters:
        # No chapter list yet: treat as a single chapter named after the title.
        self.metadata.chapters = [self.metadata.title]
    self.metadata.author.name = _author.text
    self.metadata.author.url = self.url.copy().set(path=_author["href"])
    self.metadata.complete = _data.complete
    self.metadata.rating = _data.rating
    self.metadata.updated = _data.updated
    self.metadata.published = _data.published
    # An "updated" date equal to the published date means never updated.
    if self.metadata.updated == self.metadata.published:
        self.metadata.updated = None
    # Page-level lang attribute -> full language name.
    self.metadata.language = iso639.to_name(self.main_page.html["lang"])
    self.metadata.words = _data.words
    self.metadata.summary = _header.find_all("p")[-1].text
    # Fields this source does not provide.
    self.metadata.genres = None
    self.metadata.category = _data.category
    self.metadata.tags = None
    self.metadata.characters = {"couples": None, "singles": None}
def make_langmaterial(record):
    """Build an EAD <langmaterial> element for a MARC record.

    Combines the 546 (language note) datafield with the language code at
    positions 35-37 of the 008 controlfield.  Returns an lxml element
    built with ``E``, or "" when no language information is available.
    """
    langmaterials = record.xpath("./marc:datafield[@tag='546']", namespaces=ns)
    controlfield_008 = record.xpath(
        "./marc:controlfield[@tag='008']", namespaces=ns)[0].text.strip()
    # MARC 008 fixed field: the language code occupies positions 35-37.
    controlfield_langcode = controlfield_008[35:38]
    try:
        converted_langcode = iso639.to_name(controlfield_langcode)
    except Exception:
        # Was a bare `except:`; keep the best-effort fallback but no
        # longer swallow SystemExit/KeyboardInterrupt.
        converted_langcode = False
    if converted_langcode:
        language_element = make_language(controlfield_langcode, converted_langcode)
        language_element_exists = True
    else:
        language_element_exists = False
    if langmaterials:
        langmaterial = langmaterials[0]
        language_note = langmaterial.xpath(
            "./marc:subfield[@code='a']", namespaces=ns)[0].text.strip()
        materials_specified = langmaterial.xpath(
            "./marc:subfield[@code='3']", namespaces=ns)
        if materials_specified:
            language_note += ": {}".format(materials_specified[0].text.strip())
        if language_note == "In English.":
            language = make_language("eng", "English")
            return E.langmaterial("The material is in ", language)
        elif converted_langcode and (converted_langcode in language_note):
            # Split the note around the language name and embed the
            # machine-readable <language> element in between.
            first_half, second_half = language_note.split(converted_langcode, 1)
            return E.langmaterial(first_half, language_element, second_half)
        else:
            return E.langmaterial(language_note)
    elif language_element_exists:
        return E.langmaterial("The material is in ", language_element)
    else:
        return ""
def chunk_sentences(self, text, language='en'):
    """Split ``text`` into sentences, using a language-specific tokenizer
    when NLTK has one for the given ISO-639 code, otherwise the default."""
    import iso639
    language_name = iso639.to_name(language).lower()
    try:
        return sent_tokenize(text, language_name)
    except LookupError:
        # No tokenizer data for that language: fall back to the default.
        return sent_tokenize(text)
def format_language(text):
    """Map a raw language string to its ISO-639 name, or '' when unknown."""
    # TODO: use https://pypi.org/project/pycountry/
    try:
        return iso639.to_name(format_string_word(text))
    except Exception:
        logging.warning(f"could not match language: {text}")
        return ''
def get_reachable_langs(self, lang):
    """Describe ``lang``: its display title plus the languages it can be
    translated to ('to') and from ('from') according to the graph maps."""
    return {
        'name': lang,
        'title': to_name(lang),
        'to': self._src_tgt.get(lang, []),
        'from': self._tgt_src.get(lang, []),
    }
def __init__(self, models_cfg):
    """Build the model registry and translation-direction graph.

    models_cfg: list of model config dicts; each needs 'model', 'source'
    (list) and 'target' (list), and may set 'default', 'include_in_graph'
    (default True) and 'target_to_source'.
    """
    import sys
    self._models = {}
    self._default_model_name = models_cfg[0]['model']
    self._G = nx.DiGraph()
    for cfg in models_cfg:
        if not isinstance(cfg['source'], list) or not isinstance(
                cfg['target'], list):
            log.error("Error in config source and target must be lists")
            sys.exit(1)
        model = Model.create(cfg)
        if model.model in self._models:
            log.error("Model names should be unique")
            sys.exit(1)
        self._models[model.model] = model
        if cfg.get('default'):
            # BUG FIX: this previously assigned a dead local variable
            # `_default_model_name`, so the 'default' flag had no effect.
            self._default_model_name = cfg['model']
        if cfg.get('include_in_graph', True):
            flip_src_tgt = cfg.get('target_to_source', False)
            for src_lang in cfg['source']:
                for tgt_lang in cfg['target']:
                    # This will keep only the last model for a given edge
                    self._G.add_edge(src_lang, tgt_lang, cfg=model)
                    if flip_src_tgt:
                        self._G.add_edge(tgt_lang, src_lang, cfg=model)
    # There may be more than one shortest path between source and target;
    # this returns only one per pair.
    self._shortest_path = nx.shortest_path(self._G)
    _directions = []
    self._src_tgt = {}
    self._tgt_src = {}
    for u, reachable in self._shortest_path.items():
        for v in reachable.keys():
            if u != v:
                display = '{}->{}'.format(to_name(u), to_name(v))
                _directions.append((u, v, display))
                targets = get_or_create(self._src_tgt, u)
                targets.append(v)
                sources = get_or_create(self._tgt_src, v)
                sources.append(u)
    # Present directions sorted by their display label.
    self._directions = sorted(_directions, key=lambda x: x[2])
def _get_language(self, key) -> str:
    """Resolve a language key to a display name, via the local code table
    first and ``iso639`` as a fallback."""
    normalized = key.lower().strip()
    try:
        language = _LANGUAGE_CODES[normalized]
    except KeyError:
        # iso639 sometimes returns "Language; Dialect";
        # keep only the first half.
        language = iso639.to_name(normalized).split(";")[0]
    logging.info('Language: "%s"', language)
    return language
def lookup_communities():
    """Fetch Twitch communities and persist each one to the database.

    For every community listed by the communities API: fetch its detail
    record, resolve its language code to a full name (best effort),
    resolve its owner (which also resolves the owner's game id), then
    upsert the row via the SQLAlchemy session.
    """
    global users_dict
    global games_dict
    json = requests.get(communities_api_url, headers=headers).json()
    for c in json.get('communities'):
        community_id = c.get('_id')
        community_api_url = 'https://api.twitch.tv/kraken/communities/' + str(
            community_id)
        community_json = requests.get(community_api_url, headers=headers).json()
        community_db = Community()
        community_db.id = community_id
        community_db.name = community_json.get('display_name')
        community_db.description = community_json.get('description')
        language_code = community_json.get('language')
        language = language_code
        if language_code:
            # Best effort: keep the raw code when it is not a valid ISO-639 code.
            try:
                language_iso = iso639.to_name(language_code)
                if language_iso:
                    language = language_iso
            except Exception:  # was a bare except
                pass
        community_db.language = language
        community_db.rules = community_json.get('rules')
        community_db.image_url = community_json.get('avatar_image_url')
        # TODO: Also check against empty string
        if community_db.image_url is None:  # was `== None`
            community_db.image_url = 'https://static-cdn.jtvnw.net/jtv_user_pictures/xarth/404_user_70x70.png'  # This is Twitch's default logo
        owner_id = community_json.get('owner_id')
        community_db.owner_id = owner_id
        game_id = lookup_user(community_db, owner_id)
        if game_id is not None:  # was `!= None`
            community_db.game_id = game_id
        try:
            db.session.add(community_db)
            db.session.commit()
            db.session.close()
        except Exception as e:
            print(str(e) + '\n')
            db.session.rollback()
def _get_language(self, key) -> str:
    """Resolve a language key to a display name.

    Keys beginning with "proto-" are title-cased part by part; other keys
    go through the local code table, with ``iso639`` as a fallback.
    """
    normalized = key.lower().strip()
    if normalized.startswith("proto-"):
        # e.g. "proto-indo-european" -> "Proto-Indo-European"
        return "-".join(part.title() for part in normalized.split("-"))
    try:
        language = LANGUAGE_CODES[normalized]
    except KeyError:
        # iso639 sometimes returns "Language; Dialect";
        # keep only the first half.
        language = iso639.to_name(normalized).split(";")[0]
    logging.info('Language: "%s"', language)
    return language
def get_language_details(iso_639_3):
    """ dict container iso639-2, name and native name for an iso-639-3 code """
    # Codes that are not plain ISO-639-3 entries get hand-written data.
    non_iso_langs = {
        "zh-Hans": {
            "code": "zh-Hans",
            "iso-639-1": "zh",
            "english": "Simplified Chinese",
            "native": "简化字",
        },
        "zh-Hant": {
            "code": "zh-Hant",
            "iso-639-1": "zh",
            "english": "Traditional Chinese",
            "native": "正體字",
        },
        "iw": {
            "code": "iw",
            "iso-639-1": "he",
            "english": "Hebrew",
            "native": "עברית"
        },
        "es-419": {
            "code": "es-419",
            "iso-639-1": "es-419",
            "english": "Spanish",
            "native": "Español",
        },
        "multi": {
            "code": "mul",
            "iso-639-1": "en",
            "english": "Multiple Languages",
            "native": "Multiple Languages",
        },
    }
    if iso_639_3 in non_iso_langs:
        return non_iso_langs.get(iso_639_3)
    try:
        return {
            "code": iso_639_3,
            "iso-639-1": iso639.to_iso639_1(iso_639_3),
            "english": iso639.to_name(iso_639_3),
            "native": iso639.to_native(iso_639_3),
        }
    except iso639.NonExistentLanguageError:
        # Unknown code: echo it back in every field.
        return {
            "code": iso_639_3,
            "iso_639_3": iso_639_3,
            "english": iso_639_3,
            "native": iso_639_3,
        }
def get_language_name(iso_code):
    """
    Gets the language name for the given ISO639-2 code.
    """
    if iso_code in LANGUAGES_BY_CODE:
        return LANGUAGES_BY_CODE[iso_code]
    try:
        lang = iso639.to_name(iso_code)
    except iso639.NonExistentLanguageError:
        return None
    # we only show up to the first semi or paren
    LANGUAGES_BY_CODE[iso_code] = re.split(';|\(', lang)[0].strip()
    return LANGUAGES_BY_CODE[iso_code]
def get_language_name(iso_code):
    """
    Gets a language name for a given ISO639-2 code.

    Args:
      iso_code: three character iso_code
    """
    if iso_code in iso_codes:
        return iso_codes[iso_code]
    try:
        lang = iso639.to_name(iso_code)
    except NonExistentLanguageError:
        return None
    # we only show up to the first semi or paren
    iso_codes[iso_code] = re.split(';|\(', lang)[0].strip()
    return iso_codes[iso_code]
def get_trends_by_location(loc_id, count):
    '''Get Trending Tweets by Location.

    loc_id: Twitter WOEID of the location.
    count: number of rows of the trends DataFrame to return.
    Returns a DataFrame with Trends/Volume/Language columns; on failure
    to detect languages, Language is filled with NaN instead.
    '''
    import iso639
    import numpy as np
    from langdetect import detect
    df = pd.DataFrame([])
    try:
        trends = api.trends_place(loc_id)
        df = pd.DataFrame([trending['name'], trending['tweet_volume'],
                           iso639.to_name(detect(trending['name']))]
                          for trending in trends[0]['trends'])
        df.columns = ['Trends', 'Volume', 'Language']
        return (df[:count])
    except Exception as e:
        # Removed a dead `pass` that preceded this fallback.
        print("An exception occurred: ", e)
        # NOTE(review): if api.trends_place itself failed, `trends` is
        # unbound here and this raises NameError — TODO confirm intent.
        df = pd.DataFrame([trending['name'], trending['tweet_volume'], np.nan]
                          for trending in trends[0]['trends'])
        df.columns = ['Trends', 'Volume', 'Language']
        return (df[:count])
def get_stopwords(cls, language_code):
    """Return (and cache) NLTK stopwords for an ISO-639 language code.

    Returns [] when the code is unknown or NLTK has no stopword list for
    any of the code's language names.
    """
    if language_code in cls.STOPWORDS:
        return cls.STOPWORDS[language_code]
    try:
        # iso639 may return "Name; Other"; try each name in turn.
        candidates = [part.strip().lower()
                      for part in iso639.to_name(language_code).split(';')]
    except iso639.NonExistentLanguageError:
        return []
    for candidate in candidates:
        try:
            words = stopwords.words(candidate)
        except Exception:
            continue
        cls.STOPWORDS[language_code] = words
        return words
    return []
def handle_file_upload(file_storage: FileStorage, upload_type: str):
    """Save an uploaded file, convert it to CSV and detect its language.

    file_storage: the incoming werkzeug file.
    upload_type: one of CSV_FILE / TXT_FILE / WHATS_APP_TXT_FILE / ZIP_FILE.
    Returns a dict with the column names, the stored file id, the detected
    language name (lower-cased) and a recommended cluster count.
    """
    filename, file_extension = os.path.splitext(file_storage.filename)
    # Unique stored name so concurrent uploads cannot collide.
    file_uuid = f'{str(uuid.uuid4())}_(unknown)'
    log.info(f'Save file {file_uuid}')
    original_filename = config.custom_input_file(
        f'{file_uuid}{file_extension}')
    csv_filename = config.custom_input_file(f'{file_uuid}.csv')
    file_storage.save(original_filename)
    log.info(f'Extract {upload_type}')
    data = None
    detected_language = None
    try:
        # Convert each supported upload type to a pandas DataFrame.
        if upload_type == CSV_FILE:
            data = pd.read_csv(original_filename)
        elif upload_type == TXT_FILE:
            data = convert_txt_to_csv(original_filename)
        elif upload_type == WHATS_APP_TXT_FILE:
            data = convert_whats_app_to_csv(original_filename)
        elif upload_type == ZIP_FILE:
            data = DirectoryAnalytics(original_filename).pandas_data
        else:
            error_response(f"{file_extension} not supported yet")
        # Detect the language from the first row's concatenated cells.
        sample_text = " ".join(map(str, data.iloc[0].tolist()))
        detected_language = iso639.to_name(detect(sample_text)).lower()
    except Exception as e:
        upload_error_handler(e, original_filename, upload_type)
    if upload_type != CSV_FILE:
        # The converted CSV is kept; the original upload is disposable.
        log.info(f'Remove file: {original_filename}')
        os.remove(original_filename)
    data.to_csv(csv_filename)
    return {
        'cols': [col for col in data.columns],
        'filename': file_uuid,
        'language': detected_language,
        'recommendationSet': calculate_n_clusters_by_category(data.shape[0])
    }
def to_post(self):
    """ Represents a view for WP """
    # iso639 may return "Language; Dialect"; keep only the first part
    # (split(";")[0] is a no-op when there is no ";").
    language = iso639.to_name(self._languages[0]).split(";")[0]
    return {
        "title": self._title,
        "description": self._description,
        "movie_first_url": self._link if self._link else self.find_on_storage(),
        "poster_url": self.backdrop_url(),
        "runtime": self._runtime,
        "writer": self.get_from(self._crew, lambda x: "Writing" in x.department),
        "mpaaratings": self.mpaa(),
        "language": [language],
        "company": self.get_from(self._companies),
        "producer": self.get_from(self._crew, lambda x: "Production" in x.department),
        "director": self.get_from(self._crew, lambda x: x.job == "Director"),
        "country": self.get_from(self._countries),
        "movie_genre": self.get_from(self._genres),
        "movie_year": self._movie_year,
        "actors": self.get_from(self._cast),
        "mpaa": self.mpaa(),
        "img": self.img_url(),
    }
def index(self, q):
    """
    root address of the server. q is the search string that should be
    analyzed. for example server/?q=hello%20World would search for
    hello world.
    :param q: the GET query param to be used
    :return: a respones
    """
    # Decode the URL-encoded query into a plain string.
    query = unquote(q)
    best = detect_langs(query)[0]
    prob, iso_code = best.prob, best.lang
    is_relieable = prob >= RELIABILITY_THRESHOLD
    language_name = to_name(iso_code)
    # Echo back only the first 16 characters of the query.
    shortend_query = query[:16]
    return json.dumps({"query_short": shortend_query,
                       "prob": math.floor(prob * 100),
                       "reliable": is_relieable,
                       "iso_code": iso_code,
                       "lang": language_name}) + "\n"
def getGamesJP():
    """Scrape the Japanese eShop for games and upsert them into MongoDB.

    Iterates candidate NSUIDs, extracts the embedded JSON blob from each
    product page, normalises the fields and writes one document per game
    to ``game_jp_collection``.
    """
    games = []
    for i in range(FIRST_NSUID, FIRST_NSUID + 9999):
        r = requests.get(GUESS_GAMES_GP_URL + str(i))
        r.encoding = 'utf-8'
        if r.status_code != 200:
            continue
        # Cut the embedded JS object down to the bare JSON literal.
        match = re.search(JSON_REGEX, r.text).group(1)
        _s = match.find("NXSTORE")
        match = match[0:_s]
        _s = match.find("};")
        match = match[0:(_s + 1)]
        game = json.loads(match)
        title = html.unescape(game['formal_name'])
        nsuid = int(game['id'])
        try:
            img = game['applications'][0]['image_url']
        except Exception:
            # Was a bare except; fall back to the page's twitter meta image.
            img = BeautifulSoup(r.text, features='lxml').find(
                'meta', {
                    'property': "twitter:image:src"
                }).attrs['content']
        excerpt = game['description']
        date_from = game['release_date_on_eshop']
        try:
            # On sale when the eShop release date is not in the future.
            on_sale = (datetime.datetime.strptime(
                game['release_date_on_eshop'],
                "%Y-%m-%d") <= datetime.datetime.now())
        except ValueError:
            on_sale = False
        publisher = game['publisher']['name']
        # iso639 may return "Language; Other"; keep only the first part.
        # split(';')[0] is a no-op when there is no ';', so the previous
        # three-calls-per-code conditional was redundant.
        language_availability = [
            iso639.to_name(lang['iso_code']).lower().split(';')[0]
            for lang in game['languages']
        ]
        game_jp = copy.deepcopy(game_info)
        game_jp.update({
            "title": title,
            "nsuid": nsuid,
            "img": img,
            "excerpt": excerpt,
            "date_from": date_from,
            "on_sale": on_sale,
            "publisher": publisher,
            "region": ['jp'],
            "language_availability": language_availability,
            "google_titles": getTitleByGoogle(title, 'jp')
        })
        currency, price, jp_discount = getPrice(nsuid)
        if jp_discount is None:  # was `== None`
            game_jp.update({"prices": {
                "JP": {
                    currency: price,
                }
            }})
        else:
            game_jp.update({
                "prices": {
                    "JP": {
                        currency: price,
                        "discount": jp_discount
                    }
                }
            })
        game_jp_collection.find_one_and_update({'title': title},
                                               {"$set": game_jp},
                                               upsert=True)
def lookup_user(parent_db, user_id):
    '''Look up a Twitch user and their associated game.

    parent_db: The parent object (Team or Community) the user is nested under.
    user_id: The user to lookup.

    Performs the necessary API calls to look up the user and their
    associated game (via the GiantBomb API) and returns the associated
    game's id.

    NOTE(review): indentation of this function was reconstructed from a
    whitespace-mangled source — verify the nesting against the original.
    '''
    global games_dict
    global users_dict
    if user_id not in users_dict:
        # users_set.add(user_id)
        user_db = User()
        user_db.id = user_id
        # Have to do another request to get description for user because not included in teams users response
        user_json = requests.get((channel_api_url + str(user_db.id)),
                                 headers=headers).json()
        user_db.name = user_json.get('display_name')
        user_db.description = user_json.get('description')
        if not user_db.description:
            user_db.description = 'This user has no bio.'  # This is what Twitch displays if a user leaves this field empty.
        language_code = user_json.get('language')
        language = language_code
        if language_code:
            # Best effort: keep the raw code when it cannot be resolved.
            try:
                language_iso = iso639.to_name(language_code)
                if language_iso:
                    language = language_iso
            except:
                pass
        user_db.language = language
        user_db.views = user_json.get('views')
        user_db.followers = user_json.get('followers')
        user_db.url = user_json.get('url')
        user_db.created = user_json.get('created_at')
        user_db.updated = user_json.get('updated_at')
        user_db.image_url = user_json.get('logo')
        if user_db.image_url == None:
            user_db.image_url = 'https://static-cdn.jtvnw.net/jtv_user_pictures/xarth/404_user_70x70.png'  # This is Twitch's default user logo
        # Link the user back to whichever parent requested the lookup.
        if isinstance(parent_db, Team):
            user_db.team_ids = [parent_db.id]
        elif isinstance(parent_db, Community):
            user_db.community_id = parent_db.id
        user_game_name = user_json.get('game')
        if user_game_name != None:
            # Get game information from 2nd API
            gb_request_url = gb_api_url + user_game_name + '&resources=game'
            game_response = requests.get(gb_request_url,
                                         headers={'user-agent': 'streamGlean'})
            game_json = None
            if game_response:
                game_json = game_response.json()
                if game_json:
                    game_json = game_json.get('results')
                    if game_json:
                        game_json = game_json[0]
                        game_id = game_json.get('id')
                        if game_id and game_id not in games_dict:
                            # First time we see this game: build a full record.
                            user_db.game_id = game_id
                            game_db = Game()
                            game_db.id = game_id
                            game_db.user_ids = [user_db.id]
                            if isinstance(parent_db, Team):
                                game_db.team_ids = [parent_db.id]
                            elif isinstance(parent_db, Community):
                                game_db.community_ids = [parent_db.id]
                            game_db.name = game_json.get('name')
                            game_db.description = game_json.get('deck')
                            genres = []
                            giantbomb_game_api_url = 'http://www.giantbomb.com/api/game/' + str(
                                game_db.id
                            ) + '?api_key=0ac0037d403fff412c9e9ac9e60a23acc5b2e736&format=json'
                            game_response = requests.get(
                                giantbomb_game_api_url,
                                headers={'user-agent': 'streamGlean'})
                            if game_response:
                                game_response = game_response.json().get(
                                    'results')
                                genres_response = game_response.get('genres')
                                if genres_response:
                                    for genre_dict in game_response.get(
                                            'genres'):
                                        genres.append(genre_dict.get('name'))
                            game_db.genres = genres
                            platforms = []
                            game_platforms = game_json.get('platforms')
                            if (game_platforms
                                    and isinstance(game_platforms, Iterable)):
                                for platform_dict in game_platforms:
                                    platforms.append(platform_dict.get('name'))
                            game_db.platforms = platforms
                            game_db.release_date = game_json.get(
                                'original_release_date')
                            rating = game_json.get('original_game_rating')
                            if rating:
                                # Keep only the ESRB rating, minus its prefix.
                                for d in rating:
                                    if 'ESRB' in d.get('name'):
                                        actual_rating = d.get('name').replace(
                                            'ESRB: ', '')
                                        game_db.rating = actual_rating
                            game_image = game_json.get('image')
                            if game_image:
                                game_image_small_url = game_image.get(
                                    'small_url')
                                if game_image_small_url:
                                    game_db.image_url = game_image_small_url
                                else:
                                    game_db.image_url = 'https://static-cdn.jtvnw.net/jtv_user_pictures/xarth/404_user_70x70.png'  # This is Twitch's default logo
                            games_dict[game_db.id] = game_db
                        elif game_id:
                            # Known game: just attach this user/parent to it.
                            game_db = games_dict[game_id]
                            if user_db.id not in game_db.user_ids:
                                game_db.user_ids += [user_db.id]
                            if isinstance(parent_db, Team):
                                if game_db.team_ids:
                                    if parent_db.id not in game_db.team_ids:
                                        game_db.team_ids += [parent_db.id]
                            elif isinstance(parent_db, Community):
                                if game_db.community_ids:
                                    if parent_db.id not in game_db.community_ids:
                                        game_db.community_ids += [parent_db.id]
        users_dict[user_db.id] = user_db
        # NOTE(review): user_db.game_id may never have been set on this
        # path — presumably User() defaults it; verify against the model.
        return user_db.game_id
    else:
        # Cached user: only update the parent linkage.
        user_db = users_dict[user_id]
        if isinstance(parent_db, Team):
            if user_db.team_ids != None:
                user_db.team_ids += [parent_db.id]
            else:
                user_db.team_ids = [parent_db.id]
        elif isinstance(parent_db, Community):
            user_db.community_id = parent_db.id
        # NOTE(review): this branch returns None implicitly — confirm a
        # `return user_db.game_id` was not lost in transcription.
# Finish the per-country tweet plot (data prepared earlier in the script).
plt.ylabel('Number of tweets from country')
plt.show()
# Print question answers
# Question 2: share of tweets carrying a Twitter language tag, then the
# per-language breakdown of those tags.
print()
print('Question 2:')
print('Of the ' + str(tweets_length) + ' tweets analyzed, about ' +
      str(percent_tagged) + '% of them were LangID tagged.')
print('Twitter provides 33 different language tags.')
for current, count in langs_count.items():
    current_percent = str(round(100 * (count / langs_length), 2))
    try:
        # Prefer the human-readable name for the ISO code.
        print('Of the tweets language tagged, twitter says ' +
              current_percent + '% of them are ' + iso.to_name(current) + '.')
    except:
        # Unknown code: fall back to printing the raw tag.
        print('Of the tweets language tagged, twitter says ' +
              current_percent + '% of them are ' + current + '.')
# Question 3: the same breakdown, but using langid's detected languages.
print()
print('Question 3:')
for current, count in langs2_count.items():
    current_percent = str(round(100 * (count / langs2_length), 2))
    try:
        print('Of the tweets language tagged, langid says ' +
              current_percent + '% of them are ' + iso.to_name(current) + '.')
    except:
        print('Of the tweets language tagged, langid says ' +
              current_percent + '% of them are ' + current + '.')
def lang_detection(self):
    """Detect the language of ``self.text``.

    On success sets ``self.language_code`` (ISO code from langdetect) and
    ``self.language_name`` (full name).  On any failure — e.g. empty or
    undetectable text — returns 'Missing Input' and leaves the attributes
    unchanged.
    """
    try:
        self.language_code = detect(self.text)
        self.language_name = iso639.to_name(self.language_code)
    except Exception:  # was a bare except
        return 'Missing Input'
def convert_iso639_to_lang(self, data):
    """Map objects carrying an ``iso_639_1`` code to full language names.

    Returns [] for an empty or falsy input.
    """
    if not data:
        return []
    return [iso639.to_name(entry.iso_639_1) for entry in data]
def normalize_language(language: str) -> str:
    """Return the lower-cased full English name for an ISO-639 code.

    DOC FIX: the previous docstring claimed a 2-letter code was returned,
    but ``iso639.to_name`` yields the full language name (e.g. 'english').
    """
    return iso639.to_name(language).lower()
def to_lang_name(lang):
    """Return the language name for ``lang``, or False when it is not a
    valid language code according to ``is_valid_lang``."""
    if not is_valid_lang(lang):
        return False
    return iso639.to_name(lang)
async def lang(self, message_object, args):
    """Detect the language of ``args`` and reply with its English and
    native names in the originating channel."""
    detected = langdetect.detect(args)
    reply = "```{0}```Language result: {1}[{2}]".format(
        args, iso639.to_name(detected), iso639.to_native(detected))
    await self.pm.client.send_message(message_object.channel, reply)
def __init__(self, iso_code):
    """Language-graph node: keeps the raw code, a display title, and the
    sets of languages reachable from / into this one."""
    self.sources = set()
    self.targets = set()
    self.language = iso_code
    self.name = iso_code
    # Human-readable name resolved from the ISO code.
    self.title = to_name(iso_code)