Code example #1
    def predict(self, tokens):
        # Build a WordStruct for each token, holding every candidate
        # (root, tag-sequence) analysis produced by the generator.
        sentence = []
        for token in tokens:
            token = to_lower(token)
            candidate_analyzes = self.candidate_generator.get_analysis_candidates(token)
            roots = []
            tags = []
            for analysis in candidate_analyzes:
                roots.append(analysis[0])
                tags.append(analysis[2])
            sentence.append(WordStruct(token, roots, [], tags, 0))
        # Pick one analysis index per word, then render the selection as
        # "root+Tag1+Tag2...", re-capitalizing proper-noun roots and marking
        # derivation boundaries with "^DB".
        selected_indices = self.predict_indices(sentence)
        res = []
        for i, j in enumerate(selected_indices):
            if "Prop" in sentence[i].tags[j]:
                sentence[i].roots[j] = capitalize(sentence[i].roots[j])
            if sentence[i].tags[j] == "Unknown":
                selected_analysis = sentence[i].roots[j] + "+" + sentence[i].tags[j]
            else:
                selected_analysis = sentence[i].roots[j] + "+" + "+".join(sentence[i].tags[j])
                selected_analysis = selected_analysis.replace("+DB", "^DB")
            res.append(selected_analysis)
        return res
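
For illustration, the string assembly at the end of predict() behaves as follows (the root and tag values here are hypothetical, not taken from the source):

roots = ["ev"]                          # selected root for one token
tags = [["Noun", "A3sg", "DB", "Adj"]]  # selected tag sequence
analysis = roots[0] + "+" + "+".join(tags[0])
analysis = analysis.replace("+DB", "^DB")
print(analysis)  # ev+Noun+A3sg^DB+Adj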
Code example #2
    def __init__(self, f, f_audio, info, session, referer, cw=None):
        self.f_audio = f_audio
        self.cw = cw
        self.title = title = info['title']
        self.id = info['id']
        self.url = f['url']
        self.artist = info.get('uploader')
        self.header = utils.capitalize(get_ie_key(info))
        self.session = session
        self.referer = referer

        self.url_thumb = info.get('thumbnail')
        self.thumb = BytesIO()
        if self.url_thumb:
            downloader.download(self.url_thumb, referer=referer, buffer=self.thumb, session=session)

        ext = get_ext_(self.url, session, referer)

        if not ext:
            print('empty ext')
            if f['_resolution']:
                ext = '.mp4'
            else:
                ext = '.mp3'

        if ext.lower() == '.m3u8':
            # HLS playlist: resolve it to a single downloadable stream, falling
            # back to a raw M3u8_stream if the playlist cannot be parsed.
            try:
                url = playlist2stream(self.url, referer, session=session, n_thread=4)
            except Exception:
                url = M3u8_stream(self.url, referer=referer, session=session, n_thread=4)
            ext = '.mp4'
        else:
            url = self.url
        self.url = LazyUrl(referer, lambda x: url, self, pp=self.pp)
        self.filename = format_filename(title, self.id, ext, header=self.header)
Code example #3
    def get_stem_suffix_candidates(self, surface_word):
        # Enumerate every split point of surface_word into a candidate
        # stem (prefix) and suffix (remainder).
        candidate_roots = []
        candidate_suffixes = []
        for i in range(1, len(surface_word)):
            candidate_root = surface_word[:i]
            candidate_suffix = surface_word[i:]
            if not self.case_sensitive:
                candidate_root = to_lower(candidate_root)
                candidate_suffix = to_lower(candidate_suffix)
                self._add_candidate_stem_suffix(candidate_root,
                                                candidate_suffix,
                                                candidate_roots,
                                                candidate_suffixes)
            else:
                candidate_suffix = to_lower(candidate_suffix)
                self._add_candidate_stem_suffix(to_lower(candidate_root),
                                                candidate_suffix,
                                                candidate_roots,
                                                candidate_suffixes)
                if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(
                        candidate_root):
                    self._add_candidate_stem_suffix(capitalize(candidate_root),
                                                    candidate_suffix,
                                                    candidate_roots,
                                                    candidate_suffixes)

        # The whole word with an empty suffix is always a candidate; in
        # case-sensitive mode its capitalized form is tried as well.
        candidate_suffixes.append("")
        candidate_roots.append(to_lower(surface_word))
        if self.case_sensitive and TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(
                surface_word):
            candidate_suffixes.append("")
            candidate_roots.append(capitalize(surface_word))

        assert len(candidate_roots) == len(candidate_suffixes)
        TurkishStemSuffixCandidateGenerator._root_transform(candidate_roots)
        if self.asciification:
            candidate_roots = [
                asciify(candidate_root) for candidate_root in candidate_roots
            ]
            candidate_suffixes = [
                asciify(candidate_suffix)
                for candidate_suffix in candidate_suffixes
            ]
        if self.suffix_normalization:
            TurkishStemSuffixCandidateGenerator.suffix_transform(
                candidate_suffixes)
        return candidate_roots, candidate_suffixes
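
The split enumeration above can be pictured with a small, self-contained sketch (hypothetical input; the _add_candidate_stem_suffix filtering and the root/suffix transforms are ignored here):

word = "evler"
splits = [(word[:i], word[i:]) for i in range(1, len(word))] + [(word, "")]
print(splits)  # [('e', 'vler'), ('ev', 'ler'), ('evl', 'er'), ('evle', 'r'), ('evler', '')]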
Code example #4
    def display_searchable_properties(self, instance):
        name = utils.capitalize(instance.name)

        print("\n----------------------------------------------------------")
        print(f"Search {name} with")

        for prop in instance.get_searchable_props():
            print(prop)
Code example #5
def top_ten_values(attribute):
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(top_val_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["yield_percent"] = df["yield_percent"] * 100
    if df[attribute].max() > 1e6:
        # Rescale large values to millions, with thousands separators.
        df[attribute] = (df[attribute] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={attribute: f"{attribute} (M)"}, inplace=True)
    else:
        df[attribute] = df[attribute].round(5)
    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])] + [
            html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
            for i in range(min(10, len(df)))  # guard against fewer than 10 rows
        ])
    return output_table
Code example #6
def timeseries_plot(y_value, companies, start, end):
    data = get_all_data()
    traces = []
    for c in companies:
        # Filter once per company, then slice the requested date range.
        company_data = data[data["company_name"] == c]
        trace = go.Scatter(
            name=c,
            x=company_data[start:end].index,
            y=company_data[y_value][start:end],
        )
        traces.append(trace)

    layout = go.Layout(
        title=f"Timeseries analysis of {capitalize(y_value)}",
        xaxis={"title": "Date"},
        yaxis={"title": capitalize(y_value)},
    )

    output_plot = go.Figure(data=traces, layout=layout)
    return output_plot
Code example #7
def bottom_ten_prog(attribute):
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(bottom_prog_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["diff"] = df["diff"] * 100
    if df["diff"].max() > 1e6:
        df["diff"] = (df["diff"] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={"diff": "diff (M)"}, inplace=True)
    elif df["diff"].max() > 1e3:
        df["diff"] = (df["diff"] / 1e3).round(5).apply("{:,}".format)
        df.rename(columns={"diff": "diff (K)"}, inplace=True)
    else:
        df["diff"] = df["diff"].round(5)

    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])] + [
            html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
            for i in range(min(10, len(df)))  # guard against fewer than 10 rows
        ])
    return output_table
Code example #8
from utils import capitalize

print(capitalize("blabla"))


import numpy as np

np.array([[1,2,3],[4,5,6]])
Code example #9
# ==========
## Functions
# ==========


# def function_name(parameter, another_param):
# 	do something

# 	return result

def add_string(pre, ante):
    result = pre + ante

    return result

added = add_string("The", "Netherlands")

print(added)



# ==========
## Modules
# ==========

from utils import capitalize

print(capitalize(added))
Code example #10
def load_data(file_path,
              max_sentence=0,
              add_gold_labels=True,
              case_sensitive=False):
    sentences = []
    sentence = []
    candidate_generator = TurkishStemSuffixCandidateGenerator(
        case_sensitive=case_sensitive)
    with open(file_path, "r", encoding="UTF-8") as f:
        for i, line in enumerate(f):
            if 0 < max_sentence < i:
                break
            trimmed_line = line.strip(" \r\n\t")
            if trimmed_line.startswith(("<S>", "<s>")):
                # Sentence start marker: begin a new sentence.
                sentence = []
            elif trimmed_line.startswith(("</S>", "</s>")):
                # Sentence end marker: flush the accumulated words.
                if len(sentence) > 0:
                    sentences.append(sentence)
            elif (len(trimmed_line) == 0 or "<DOC>" in trimmed_line
                  or trimmed_line.startswith("</DOC>")
                  or trimmed_line.startswith("<TITLE>")
                  or trimmed_line.startswith("</TITLE>")):
                # Skip document/title markup and blank lines.
                pass
            else:
                # Token line: surface form followed by its analyses.
                parses = re.split(r"[\t ]", trimmed_line)
                surface = parses[0]
                candidates = candidate_generator.get_analysis_candidates(surface)
                roots = []
                suffixes = []
                tags = []
                ambiguity_level = 0
                if add_gold_labels:
                    # The first analysis is the gold one; keep it at index 0.
                    analyzes = parses[1:]
                    ambiguity_level = len(analyzes)
                    gold_root = get_root_from_analysis(analyzes[0])
                    gold_root = to_lower(gold_root)
                    roots.append(gold_root)
                    gold_suffix = surface[len(gold_root):]
                    if not case_sensitive:
                        gold_suffix = to_lower(gold_suffix)
                    suffixes.append(gold_suffix)
                    gold_tag = standardize_tags(
                        get_tags_from_analysis(analyzes[0]))
                    tags.append(gold_tag)

                    # Append every generated candidate that differs from gold.
                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        if (to_lower(candidate_root) != to_lower(gold_root)
                                or "".join(candidate_tag) != "".join(gold_tag)):
                            roots.append(to_lower(candidate_root))
                            suffixes.append(candidate_suffix)
                            tags.append(candidate_tag)
                        elif candidate_suffix != gold_suffix and candidate_root == gold_root:
                            suffixes[0] = candidate_suffix
                else:
                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        roots.append(candidate_root)
                        suffixes.append(candidate_suffix)
                        tags.append(candidate_tag)
                    if len(roots) == 0:
                        # No candidates at all: back off to Noun tags
                        # (Noun+Prop as well for capitalized words).
                        if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(surface):
                            candidate_tags = candidate_generator.get_tags(
                                "", stem_tags=["Noun", "Noun+Prop"])
                        else:
                            candidate_tags = candidate_generator.get_tags(
                                "", stem_tags=["Noun"])
                        for candidate_tag in candidate_tags:
                            if "Prop" in candidate_tag:
                                roots.append(capitalize(surface))
                            else:
                                roots.append(to_lower(surface))
                            suffixes.append("")
                            tags.append(candidate_tag)
                if not case_sensitive:
                    surface = to_lower(surface)
                    roots = [to_lower(root) for root in roots]
                    suffixes = [to_lower(suffix) for suffix in suffixes]
                current_word = WordStruct(surface, roots, suffixes, tags,
                                          ambiguity_level)
                sentence.append(current_word)
    return sentences
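
For reference, load_data() expects a file shaped roughly like the following, reconstructed from the parsing logic above (the surface forms and analyses are hypothetical; each token line carries one or more whitespace-separated analyses, the first of which is treated as gold):

<DOC>
<TITLE> ... </TITLE>
<S>
evde	ev+Noun+A3sg+Pnon+Loc
kaldım	kal+Verb+Pos+Past+A1sg
</S>
</DOC>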
Code example #11
    def test_capitalize(self):
        """Checks that utils.capitalize() upper-cases the first letter."""
        actual = bool(re.search("^[A-Z]", utils.capitalize('string')))
        expected = True
        self.assertEqual(actual, expected)
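
Several of these examples import capitalize from a local utils module whose source is not shown. A minimal sketch consistent with the test above could look like this (an assumption, not the actual implementation; the Turkish-aware handling of the dotted İ is likewise inferred from the morphology examples):

def capitalize(word):
    # Upper-case only the first character, leaving the rest untouched
    # (unlike str.capitalize(), which also lower-cases the remainder).
    if not word:
        return word
    # Assumed Turkish-aware mapping: 'i' upper-cases to dotted 'İ'.
    first = 'İ' if word[0] == 'i' else word[0].upper()
    return first + word[1:]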
Code example #12
def error_analysis(file_path, output_path, add_gold_labels=True):
    test_data = data_generator(file_path, add_gold_labels=add_gold_labels)
    stemmer = AnalysisScorerModel.create_from_existed_model(
        "lookup_disambiguator_wo_suffix")
    corrects = 0
    total = 0
    ambiguous_count = 0
    ambiguous_corrects = 0
    with open("error_analysis/" + output_path, "w", encoding="UTF-8") as f:
        for sentence_index, sentence in enumerate(test_data):
            if sentence_index % 100 == 0:
                print("{} sentences processed so far.".format(sentence_index))
            scores = stemmer.propogate(sentence)
            for word_index, (score, word) in enumerate(zip(scores, sentence)):
                analysis_scores = {}
                # Softmax over this word's candidate analyses.
                probs = dy.softmax(score)
                analyzes_probs = probs.npvalue()
                max_analysis = ""
                max_prob = 0.0
                for i, (root, analysis, analysis_prob) in enumerate(
                        zip(word.roots, word.tags, analyzes_probs)):
                    analysis_str = "+".join(analysis).replace("+DB", "^DB")
                    if "Prop" in analysis_str:
                        root = capitalize(root)
                    analysis_str = root + "+" + analysis_str
                    if i == 0:
                        correct_analysis = analysis_str
                    if analysis_prob > max_prob:
                        max_prob = analysis_prob
                        max_analysis = analysis_str
                    analysis_scores[analysis_str] = analysis_prob
                if word.ambiguity_level > 0:
                    ambiguous_count += 1
                if max_analysis == correct_analysis:
                    corrects += 1
                    if word.ambiguity_level > 0:
                        ambiguous_corrects += 1
                else:
                    f.write("Surface: {}\n".format(word.surface_word))
                    f.write(
                        "Correct analysis: {}\tSelected analysis: {}\n".format(
                            correct_analysis, max_analysis))
                    if word_index < 2:
                        start = 0
                    else:
                        start = word_index - 2
                    if word_index > len(sentence) - 3:
                        end = len(sentence)
                    else:
                        end = word_index + 3
                    f.write("Context: {}\n".format(" ".join(
                        [w.surface_word for w in sentence[start:end]])))
                    for analysis_str, prob in analysis_scores.items():
                        f.write("{}:\t{}\n".format(analysis_str, prob))
                    f.write("\n\n")
                total += 1

    print("Corrects: {}\tTotal: {}\t Accuracy: {}".format(
        corrects, total, corrects * 1.0 / total))
    print(
        "Ambiguous Corrects: {}\tTotal Ambiguous: {}\t Ambiguous Accuracy: {}".
        format(corrects, total, corrects * 1.0 / total))
Code example #13
    def get_context_menu(self, api_data, program, cache_file):
        """Get context menu"""
        from addon import plugin
        favorite_marker = ''
        watchlater_marker = ''
        context_menu = []
        # Initialised up front so the branches below cannot raise NameError
        # when api_data matches none of the recognised API shapes.
        program_title = None
        follow_enabled = False

        # WATCH LATER
        if self._resumepoints.is_activated():
            asset_id = self.get_asset_id(api_data)

            # VRT NU Search API
            if api_data.get('type') == 'episode':
                program_title = api_data.get('program')

            # VRT NU Schedule API (some are missing vrt.whatson-id)
            elif api_data.get('vrt.whatson-id') or api_data.get('startTime'):
                program_title = api_data.get('title')

            if asset_id is not None and program_title:
                # We need to ensure forward slashes are quoted
                program_title = to_unicode(
                    quote_plus(from_unicode(program_title)))
                url = url_to_episode(api_data.get('url', ''))
                if self._resumepoints.is_watchlater(asset_id):
                    extras = {}
                    # If we are in a watchlater menu, move cursor down before removing a favorite
                    if plugin.path.startswith('/resumepoints/watchlater'):
                        extras = dict(move_down=True)
                    # Unwatch context menu
                    context_menu.append(
                        (capitalize(localize(30402)),
                         'RunPlugin(%s)' % url_for('unwatchlater',
                                                   asset_id=asset_id,
                                                   title=program_title,
                                                   url=url,
                                                   **extras)))
                    watchlater_marker = '[COLOR={highlighted}]ᶫ[/COLOR]'
                else:
                    # Watch context menu
                    context_menu.append(
                        (capitalize(localize(30401)),
                         'RunPlugin(%s)' % url_for('watchlater',
                                                   asset_id=asset_id,
                                                   title=program_title,
                                                   url=url)))

        # FOLLOW PROGRAM
        if self._favorites.is_activated():

            # VRT NU Search API
            if api_data.get('type') == 'episode':
                program_title = api_data.get('program')
                program_type = api_data.get('programType')
                follow_suffix = localize(
                    30410) if program_type != 'oneoff' else ''  # program
                follow_enabled = True

            # VRT NU Suggest API
            elif api_data.get('type') == 'program':
                program_title = api_data.get('title')
                follow_suffix = ''
                follow_enabled = True

            # VRT NU Schedule API (some are missing vrt.whatson-id)
            elif api_data.get('vrt.whatson-id') or api_data.get('startTime'):
                program_title = api_data.get('title')
                follow_suffix = localize(30410)  # program
                follow_enabled = bool(api_data.get('url'))

            if follow_enabled and program:
                program_title = to_unicode(
                    quote_plus(from_unicode(program_title))
                )  # We need to ensure forward slashes are quoted
                if self._favorites.is_favorite(program):
                    extras = {}
                    # If we are in a favorites menu, move cursor down before removing a favorite
                    if plugin.path.startswith('/favorites'):
                        extras = dict(move_down=True)
                    context_menu.append((
                        localize(30412, title=follow_suffix),  # Unfollow
                        'RunPlugin(%s)' % url_for('unfollow',
                                                  program=program,
                                                  title=program_title,
                                                  **extras)))
                    favorite_marker = '[COLOR={highlighted}]ᵛ[/COLOR]'
                else:
                    context_menu.append((
                        localize(30411, title=follow_suffix),  # Follow
                        'RunPlugin(%s)' % url_for(
                            'follow', program=program, title=program_title)))

        # GO TO PROGRAM
        if api_data.get('programType') != 'oneoff' and program:
            if plugin.path.startswith(
                ('/favorites/offline', '/favorites/recent', '/offline',
                 '/recent', '/resumepoints/continue',
                 '/resumepoints/watchlater', '/tvguide')):
                context_menu.append((
                    localize(30417),  # Go to program
                    'Container.Update(%s)' %
                    url_for('programs', program=program, season='allseasons')))

        # REFRESH MENU
        context_menu.append((
            localize(30413),  # Refresh menu
            'RunPlugin(%s)' % url_for('delete_cache', cache_file=cache_file)))

        return context_menu, colour(favorite_marker), colour(watchlater_marker)