def predict(self, tokens):
    sentence = []
    for token in tokens:
        token = to_lower(token)
        # Collect every candidate (root, suffix, tag) analysis for the token
        candidate_analyzes = self.candidate_generator.get_analysis_candidates(token)
        roots = []
        tags = []
        for analysis in candidate_analyzes:
            roots.append(analysis[0])
            tags.append(analysis[2])
        sentence.append(WordStruct(token, roots, [], tags, 0))
    # Index of the highest-scoring analysis for each word
    selected_indices = self.predict_indices(sentence)
    res = []
    for i, j in enumerate(selected_indices):
        # Proper nouns are re-capitalized in the output
        if "Prop" in sentence[i].tags[j]:
            sentence[i].roots[j] = capitalize(sentence[i].roots[j])
        if sentence[i].tags[j] == "Unknown":
            selected_analysis = sentence[i].roots[j] + "+" + sentence[i].tags[j]
        else:
            selected_analysis = sentence[i].roots[j] + "+" + "+".join(sentence[i].tags[j])
        # Derivation boundaries are written as "^DB"
        selected_analysis = selected_analysis.replace("+DB", "^DB")
        res.append(selected_analysis)
    return res
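# A minimal usage sketch for predict(). Loading the model through
# AnalysisScorerModel.create_from_existed_model() mirrors the error_analysis()
# snippet further below; the sample tokens and the assumption that predict()
# lives on that model are illustrative, not taken from the original code.
model = AnalysisScorerModel.create_from_existed_model("lookup_disambiguator_wo_suffix")
tokens = ["ali", "okula", "gitti", "."]
for token, analysis in zip(tokens, model.predict(tokens)):
    # Each selected analysis looks like "root+Tag1+Tag2", with derivation boundaries as "^DB"
    print(token, "->", analysis)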
def __init__(self, f, f_audio, info, session, referer, cw=None):
    self.f_audio = f_audio
    self.cw = cw
    self.title = title = info['title']
    self.id = info['id']
    self.url = f['url']
    self.artist = info.get('uploader')
    self.header = utils.capitalize(get_ie_key(info))
    self.session = session
    self.referer = referer
    self.url_thumb = info.get('thumbnail')
    self.thumb = BytesIO()
    if self.url_thumb:
        downloader.download(self.url_thumb, referer=referer, buffer=self.thumb, session=session)
    ext = get_ext_(self.url, session, referer)
    if not ext:
        print('empty ext')
        if f['_resolution']:
            ext = '.mp4'
        else:
            ext = '.mp3'
    if ext.lower() == '.m3u8':
        try:
            url = playlist2stream(self.url, referer, session=session, n_thread=4)
        except:
            url = M3u8_stream(self.url, referer=referer, session=session, n_thread=4)
        ext = '.mp4'
    else:
        url = self.url
    self.url = LazyUrl(referer, lambda x: url, self, pp=self.pp)
    self.filename = format_filename(title, self.id, ext, header=self.header)
def get_stem_suffix_candidates(self, surface_word):
    candidate_roots = []
    candidate_suffixes = []
    for i in range(1, len(surface_word)):
        candidate_root = surface_word[:i]
        candidate_suffix = surface_word[i:]
        if not self.case_sensitive:
            candidate_root = to_lower(candidate_root)
            candidate_suffix = to_lower(candidate_suffix)
            self._add_candidate_stem_suffix(candidate_root, candidate_suffix,
                                            candidate_roots, candidate_suffixes)
        else:
            candidate_suffix = to_lower(candidate_suffix)
            self._add_candidate_stem_suffix(to_lower(candidate_root), candidate_suffix,
                                            candidate_roots, candidate_suffixes)
            if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(candidate_root):
                self._add_candidate_stem_suffix(capitalize(candidate_root), candidate_suffix,
                                                candidate_roots, candidate_suffixes)
    candidate_suffixes.append("")
    candidate_roots.append(to_lower(surface_word))
    if self.case_sensitive and TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(surface_word):
        candidate_suffixes.append("")
        candidate_roots.append(capitalize(surface_word))
    assert len(candidate_roots) == len(candidate_suffixes)
    TurkishStemSuffixCandidateGenerator._root_transform(candidate_roots)
    if self.asciification:
        candidate_roots = [asciify(candidate_root) for candidate_root in candidate_roots]
        candidate_suffixes = [asciify(candidate_suffix) for candidate_suffix in candidate_suffixes]
    if self.suffix_normalization:
        TurkishStemSuffixCandidateGenerator.suffix_transform(candidate_suffixes)
    return candidate_roots, candidate_suffixes
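# Hedged usage sketch for get_stem_suffix_candidates(): the constructor argument is the
# one used elsewhere in these snippets; the surface word is only an example.
generator = TurkishStemSuffixCandidateGenerator(case_sensitive=True)
roots, suffixes = generator.get_stem_suffix_candidates("Ankara'da")
for root, suffix in zip(roots, suffixes):
    # One candidate split of the surface form into stem and remaining suffix
    print(root, "|", suffix)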
def display_searchable_properties(self, instance):
    name = utils.capitalize(instance.name)
    print("\n----------------------------------------------------------")
    print(f"Search {name} with")
    for i in instance.get_searchable_props():
        print(i)
def top_ten_values(attribute):
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(top_val_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["yield_percent"] = df["yield_percent"] * 100
    if max(df[attribute]) > 1e6:
        df[attribute] = (df[attribute] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={attribute: f"{attribute} (M)"}, inplace=True)
    else:
        df[attribute] = df[attribute].round(5)
    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])]
        + [html.Tr([html.Td(df.iloc[i][col]) for col in df.columns]) for i in range(10)]
    )
    return output_table
def timeseries_plot(y_value, companies, start, end):
    data = get_all_data()
    traces = []
    for c in companies:
        trace = go.Scatter(
            name=c,
            x=data[data["company_name"] == c][start:end].index,
            y=data[data["company_name"] == c][y_value][start:end],
        )
        traces.append(trace)
    layout = go.Layout(
        title=f"Timeseries analysis of {capitalize(y_value)}",
        xaxis={"title": "Date"},
        yaxis={"title": capitalize(y_value)},
    )
    output_plot = go.Figure(data=traces, layout=layout)
    return output_plot
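# Hedged usage sketch: embedding the returned figure in a Dash layout. The dcc import
# path, the component id and the example arguments are assumptions, not taken from the
# original code.
from dash import dcc

graph = dcc.Graph(
    id="timeseries-graph",
    figure=timeseries_plot("revenue", ["ACME Corp", "Globex"], "2019-01-01", "2019-12-31"),
)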
def bottom_ten_prog(attribute):
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(bottom_prog_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["diff"] = df["diff"] * 100
    if max(df["diff"]) > 1e6:
        df["diff"] = (df["diff"] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={"diff": "diff (M)"}, inplace=True)
    elif max(df["diff"]) > 1e3:
        df["diff"] = (df["diff"] / 1e3).round(5).apply("{:,}".format)
        df.rename(columns={"diff": "diff (K)"}, inplace=True)
    else:
        df["diff"] = df["diff"].round(5)
    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])]
        + [html.Tr([html.Td(df.iloc[i][col]) for col in df.columns]) for i in range(10)]
    )
    return output_table
from utils import capitalize

print(capitalize("blabla"))

import numpy as np

np.array([[1, 2, 3], [4, 5, 6]])
# ==========
## Functions
# ==========
# def function_name(parameter, another_param):
#     do something
#     return result


def add_string(pre, ante):
    result = pre + ante
    return result


added = add_string("The", "Netherlands")
print(added)

# ==========
## Modules
# ==========
from utils import capitalize

print(capitalize(added))
def load_data(file_path, max_sentence=0, add_gold_labels=True, case_sensitive=False):
    sentences = []
    sentence = []
    candidate_generator = TurkishStemSuffixCandidateGenerator(case_sensitive=case_sensitive)
    with open(file_path, "r", encoding="UTF-8") as f:
        for i, line in enumerate(f):
            if 0 < max_sentence < i:
                break
            trimmed_line = line.strip(" \r\n\t")
            trimmed_line = trimmed_line.replace("s", "s")
            if trimmed_line.startswith("<S>") or trimmed_line.startswith("<s>"):
                # Sentence start marker
                sentence = []
            elif trimmed_line.startswith("</S>") or trimmed_line.startswith("</s>"):
                # Sentence end marker
                if len(sentence) > 0:
                    sentences.append(sentence)
            elif (len(trimmed_line) == 0 or "<DOC>" in trimmed_line
                  or trimmed_line.startswith("</DOC>")
                  or trimmed_line.startswith("<TITLE>")
                  or trimmed_line.startswith("</TITLE>")):
                pass
            else:
                parses = re.split(r"[\t ]", trimmed_line)
                surface = parses[0]
                candidates = candidate_generator.get_analysis_candidates(surface)
                roots = []
                suffixes = []
                tags = []
                ambiguity_level = 0
                if add_gold_labels:
                    # The first analysis in the line is the gold analysis
                    analyzes = parses[1:]
                    ambiguity_level = len(analyzes)
                    gold_root = get_root_from_analysis(analyzes[0])
                    gold_root = to_lower(gold_root)
                    roots.append(gold_root)
                    gold_suffix = surface[len(gold_root):]
                    if not case_sensitive:
                        gold_suffix = to_lower(gold_suffix)
                    suffixes.append(gold_suffix)
                    gold_tag = standardize_tags(get_tags_from_analysis(analyzes[0]))
                    tags.append(gold_tag)
                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        if (to_lower(candidate_root) != to_lower(gold_root)
                                or "".join(candidate_tag) != "".join(gold_tag)):
                            roots.append(to_lower(candidate_root))
                            suffixes.append(candidate_suffix)
                            tags.append(candidate_tag)
                        elif candidate_suffix != gold_suffix and candidate_root == gold_root:
                            suffixes[0] = candidate_suffix
                else:
                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        roots.append(candidate_root)
                        suffixes.append(candidate_suffix)
                        tags.append(candidate_tag)
                if len(roots) == 0:
                    # No candidate analysis: fall back to generic Noun (and Noun+Prop) tags
                    if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(surface):
                        candidate_tags = candidate_generator.get_tags("", stem_tags=["Noun", "Noun+Prop"])
                    else:
                        candidate_tags = candidate_generator.get_tags("", stem_tags=["Noun"])
                    for candidate_tag in candidate_tags:
                        if "Prop" in candidate_tag:
                            roots.append(capitalize(surface))
                            suffixes.append("")
                            tags.append(candidate_tag)
                        else:
                            roots.append(to_lower(surface))
                            suffixes.append("")
                            tags.append(candidate_tag)
                if not case_sensitive:
                    surface = to_lower(surface)
                    roots = [to_lower(root) for root in roots]
                    suffixes = [to_lower(suffix) for suffix in suffixes]
                current_word = WordStruct(surface, roots, suffixes, tags, ambiguity_level)
                sentence.append(current_word)
    return sentences
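# Hedged usage sketch for load_data(): the corpus path is hypothetical. Each returned
# sentence is a list of WordStruct(surface, roots, suffixes, tags, ambiguity_level).
train_sentences = load_data("data/train.txt", max_sentence=1000, add_gold_labels=True)
print("{} sentences loaded".format(len(train_sentences)))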
from utils import capitalize

print(capitalize("blabla"))

import numpy as np

np.array([[1,2,3],[4,5,6]])
def test_capitalize(self):
    """Tests utils.capitalize()."""
    actual = bool(re.search("^[A-Z]{1}", utils.capitalize('string')))
    expected = True
    self.assertEqual(actual, expected)
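# Several snippets above import capitalize() from a local utils module. A minimal sketch
# of what such a helper could look like (an assumption, not the original implementation);
# the Turkish-aware mapping of "i" to "İ" is only a guess based on the surrounding
# Turkish-morphology code.
def capitalize(word):
    if not word:
        return word
    first = "İ" if word[0] == "i" else word[0].upper()
    return first + word[1:]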
def error_analysis(file_path, output_path, add_gold_labels=True):
    test_data = data_generator(file_path, add_gold_labels=add_gold_labels)
    stemmer = AnalysisScorerModel.create_from_existed_model("lookup_disambiguator_wo_suffix")
    corrects = 0
    total = 0
    ambiguous_count = 0
    ambiguous_corrects = 0
    with open("error_analysis/" + output_path, "w", encoding="UTF-8") as f:
        for sentence_index, sentence in enumerate(test_data):
            if sentence_index % 100 == 0:
                print("{} sentences processed so far.".format(sentence_index))
            scores = stemmer.propogate(sentence)
            for word_index, (score, word) in enumerate(zip(scores, sentence)):
                analysis_scores = {}
                probs = dy.softmax(score)
                analyzes_probs = probs.npvalue()
                max_analysis = ""
                max_prob = 0.0
                for i, (root, analysis, analysis_prob) in enumerate(
                        zip(word.roots, word.tags, analyzes_probs)):
                    analysis_str = "+".join(analysis).replace("+DB", "^DB")
                    if "Prop" in analysis_str:
                        root = capitalize(root)
                    analysis_str = root + "+" + analysis_str
                    # The gold analysis is stored first, so index 0 is the correct one
                    if i == 0:
                        correct_analysis = analysis_str
                    if analysis_prob > max_prob:
                        max_prob = analysis_prob
                        max_analysis = analysis_str
                    analysis_scores[analysis_str] = analysis_prob
                if word.ambiguity_level > 0:
                    ambiguous_count += 1
                if max_analysis == correct_analysis:
                    corrects += 1
                    if word.ambiguity_level > 0:
                        ambiguous_corrects += 1
                else:
                    # Log mis-disambiguated words with their context and candidate scores
                    f.write("Surface: {}\n".format(word.surface_word))
                    f.write("Correct analysis: {}\tSelected analysis: {}\n".format(
                        correct_analysis, max_analysis))
                    if word_index < 2:
                        start = 0
                    else:
                        start = word_index - 2
                    if word_index > len(sentence) - 3:
                        end = len(sentence)
                    else:
                        end = word_index + 3
                    f.write("Context: {}\n".format(
                        " ".join([w.surface_word for w in sentence[start:end]])))
                    for analysis_str, prob in analysis_scores.items():
                        f.write("{}:\t{}\n".format(analysis_str, prob))
                    f.write("\n\n")
                total += 1
    print("Corrects: {}\tTotal: {}\t Accuracy: {}".format(
        corrects, total, corrects * 1.0 / total))
    print("Ambiguous Corrects: {}\tTotal Ambiguous: {}\t Ambiguous Accuracy: {}".format(
        ambiguous_corrects, ambiguous_count, ambiguous_corrects * 1.0 / ambiguous_count))
def get_context_menu(self, api_data, program, cache_file):
    """Get context menu"""
    from addon import plugin
    favorite_marker = ''
    watchlater_marker = ''
    context_menu = []

    # WATCH LATER
    if self._resumepoints.is_activated():
        asset_id = self.get_asset_id(api_data)

        # VRT NU Search API
        if api_data.get('type') == 'episode':
            program_title = api_data.get('program')

        # VRT NU Schedule API (some are missing vrt.whatson-id)
        elif api_data.get('vrt.whatson-id') or api_data.get('startTime'):
            program_title = api_data.get('title')

        if asset_id is not None:
            # We need to ensure forward slashes are quoted
            program_title = to_unicode(quote_plus(from_unicode(program_title)))
            url = url_to_episode(api_data.get('url', ''))
            if self._resumepoints.is_watchlater(asset_id):
                extras = {}
                # If we are in a watchlater menu, move cursor down before removing a favorite
                if plugin.path.startswith('/resumepoints/watchlater'):
                    extras = dict(move_down=True)
                # Unwatch context menu
                context_menu.append((
                    capitalize(localize(30402)),
                    'RunPlugin(%s)' % url_for('unwatchlater', asset_id=asset_id,
                                              title=program_title, url=url, **extras)))
                watchlater_marker = '[COLOR={highlighted}]ᶫ[/COLOR]'
            else:
                # Watch context menu
                context_menu.append((
                    capitalize(localize(30401)),
                    'RunPlugin(%s)' % url_for('watchlater', asset_id=asset_id,
                                              title=program_title, url=url)))

    # FOLLOW PROGRAM
    if self._favorites.is_activated():
        # VRT NU Search API
        if api_data.get('type') == 'episode':
            program_title = api_data.get('program')
            program_type = api_data.get('programType')
            follow_suffix = localize(30410) if program_type != 'oneoff' else ''  # program
            follow_enabled = True

        # VRT NU Suggest API
        elif api_data.get('type') == 'program':
            program_title = api_data.get('title')
            follow_suffix = ''
            follow_enabled = True

        # VRT NU Schedule API (some are missing vrt.whatson-id)
        elif api_data.get('vrt.whatson-id') or api_data.get('startTime'):
            program_title = api_data.get('title')
            follow_suffix = localize(30410)  # program
            follow_enabled = bool(api_data.get('url'))

        if follow_enabled and program:
            # We need to ensure forward slashes are quoted
            program_title = to_unicode(quote_plus(from_unicode(program_title)))
            if self._favorites.is_favorite(program):
                extras = {}
                # If we are in a favorites menu, move cursor down before removing a favorite
                if plugin.path.startswith('/favorites'):
                    extras = dict(move_down=True)
                context_menu.append((
                    localize(30412, title=follow_suffix),  # Unfollow
                    'RunPlugin(%s)' % url_for('unfollow', program=program,
                                              title=program_title, **extras)))
                favorite_marker = '[COLOR={highlighted}]ᵛ[/COLOR]'
            else:
                context_menu.append((
                    localize(30411, title=follow_suffix),  # Follow
                    'RunPlugin(%s)' % url_for('follow', program=program, title=program_title)))

    # GO TO PROGRAM
    if api_data.get('programType') != 'oneoff' and program:
        if plugin.path.startswith(('/favorites/offline', '/favorites/recent',
                                   '/offline', '/recent',
                                   '/resumepoints/continue', '/resumepoints/watchlater',
                                   '/tvguide')):
            context_menu.append((
                localize(30417),  # Go to program
                'Container.Update(%s)' % url_for('programs', program=program, season='allseasons')))

    # REFRESH MENU
    context_menu.append((
        localize(30413),  # Refresh menu
        'RunPlugin(%s)' % url_for('delete_cache', cache_file=cache_file)))

    return context_menu, colour(favorite_marker), colour(watchlater_marker)