Example #1
def get_data_state(name: str):

    try:
        name = name.lower()
        response = requests.get(
            ApiBrazilState.URL_STATE.value.format(state=name), timeout=2)

        print(f"get_data_state >>> {response.status_code}")
        if response.status_code == 200:
            response = response.json()

            if not response.get('error', None):
                return pasrse_json(response)

            else:
                print(
                    "estado não encontrado pela UF. Tentando buscar pelo nome do estado."
                )
                response = requests.get(ApiBrazilState.URL_STATES.value,
                                        timeout=2)
                response = pasrse_json(response.json()).get('data')

                name = strip_accents(name).lower()
                state = [
                    row for row in response
                    if strip_accents(row['state']).lower() == name
                ]

                if state:
                    return state.pop()

    except Exception as exc:
        print(f"erro na requisição: {exc}")

    return None
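Every snippet on this page calls a strip_accents helper that the excerpts themselves don't define. For reference, here is a minimal sketch of the usual unicodedata-based implementation; this is an assumption about the general shape, not any of these projects' actual code:

import unicodedata

def strip_accents(text):
    # Decompose to NFKD, then drop the combining marks (the accents).
    return ''.join(ch for ch in unicodedata.normalize('NFKD', text)
                   if not unicodedata.combining(ch))

# strip_accents('São Paulo') -> 'Sao Paulo'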
Example #2
def gen_filename(record):
    """
    Guess the expected filename from the record.

    Args:
        record (dict): a record of the bibtex entry.

    Returns:
        A string which corresponds to guessed filename (expected to be a pdf).
    """
    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped-down last name for each author
    last_names = []
    for author in record_copy['author']:
        stripped = utils.strip_accents(codecs.decode(author, "ulatex"))
        name = re.sub('([\\{\\}])', '', stripped.split(',')[0])
        name = re.sub('~', ' ', name)
        name = re.sub("\\\\'ı", "i", name)
        name = re.sub("\\\\`ı", "i", name)
        name = re.sub("ı", "i", name)
        name = re.sub('\xf8', 'o', name)
        name = re.sub('\\\\textquotesingle ', "'", name)
        name = name.replace('ł', 'l')
        last_names.append(name)

    # If there are more than 4 authors, use the 'et al.' form
    if len(last_names) > 4:
        prefix = '(' + last_names[0] + ' et al.) '
    else:
        prefix = '(' + ', '.join(last_names) + ') '

    title = utils.get_title(record_copy)
    title = title.replace('$\\Lambda_{\\infty}$ ', 'λ∞')
    title = re.sub('\\\\textendash  ', '- ', title)
    title = utils.strip_accents(codecs.decode(title, "ulatex"))
    title = re.sub('([\\{\\}])', '', title)
    title = re.sub(' *: ', ' - ', title)
    title = re.sub(' *— *', ' - ', title)
    title = re.sub('–', '-', title)
    title = re.sub('/', '-', title)
    # title = re.sub('\\$\\mathplus \\$', '+', title)
    title = re.sub('\\\\textquotesingle ', "'", title)
    title = to_titlecase(title)
    title = re.sub('"', '', title)
    title = re.sub('’', "'", title)
    title = re.sub('\u2010', '-', title)
    title = re.sub('\u2122', '', title)
    title = title.replace('$\\texttt FreeFem++$', 'FreeFem++')
    title = title.replace('$\\lambda _\\Infty $ ', 'λ∞')

    return prefix + title + '.pdf'
Example #4
def get_data_city(name: str):

    try:
        response = requests.get(ApiBrazilCity.URL_CITY.value, timeout=2)

        print(f"get_data_city >>> {response.status_code}")
        if response.status_code == 200:
            name = name.lower()
            data = response.json().get('docs')

            data_city = []
            city_name = None  # avoid a NameError in the log line below when no docs match
            for item in data:
                city_name = strip_accents(item["city_name"]).lower()

                if city_name == name:
                    data_city.append(item)

            data_city.sort(key=lambda x: x["date"])

            print(f"get_data_city >>> {city_name}, {name}")

            return data_city.pop() if len(data_city) else None

    except Exception as exc:
        print(f"erro na requisão: {exc}")

    return None
Example #5
def update_video(vidobj, statusid, filename=None):
    vidobj.statusid = statusid
    vidobj.lastupdated = datetime.now()

    if filename is not None:
        filename = utils.strip_accents(filename)
        filename = utils.clean_special_chars(filename)
        vidobj.filename = filename

    vidobj.save()
Example #6
 def answer(self, question):
     pred_relation = www2fb(get_relation(question, self.questions, self.model, self.index2rel, self.args))
     query_tokens = get_query_text(question, self.questions, self.ent_model, self.index2tag, self.args)
     
     N = min(len(query_tokens), 3)
     
     C = []  # candidate entities
     for n in range(N, 0, -1):
         ngrams_set = find_ngrams(query_tokens, n)
         for ngram_tuple in ngrams_set:
             ngram = " ".join(ngram_tuple)
             ngram = strip_accents(ngram)
             # unigram stopwords have too many candidates so just skip over
             if ngram in stopwords:
                 continue
              ## PROBLEM! - ngram doesn't exist in index - at test-2592 - KeyError: 'p.a.r.c.e. parce'
              try:
                  cand_mids = self.index_ent[ngram]  # search entities
              except KeyError:
                  continue
             C.extend(cand_mids)
          if len(C) > 0:
              break
     
     C_pruned = []
     for mid in set(C):
         if mid in self.index_reach.keys():  # PROBLEM: don't know why this may not exist??
             count_mid = C.count(mid)  # count number of times mid appeared in C
             C_pruned.append((mid, count_mid))
             if pred_relation in self.index_reach[mid]:
                 count_mid = C.count(mid)  # count number of times mid appeared in C
                 C_pruned.append((mid, count_mid))
     
     num_entities_fbsubset = 1959820  # 2M - 1959820 , 5M - 1972702
     C_tfidf_pruned = []
     for mid, count_mid in C_pruned:
         if mid in self.index_names.keys():
             cand_ent_name = pick_best_name(question, self.index_names[mid])
             tfidf = calc_tf_idf(query_tokens, cand_ent_name, count_mid, num_entities_fbsubset, self.index_ent)
             C_tfidf_pruned.append((mid, cand_ent_name, tfidf))
     
     C_tfidf_pruned.sort(key=lambda t: -t[2])
     pred_ent, name_ent, score = C_tfidf_pruned[0]
     
     key = (pred_ent, pred_relation)
     if key not in self.fb_graph:
          return "UNKNOWN"
     result_mid = self.fb_graph[key]
     result_mid = list(result_mid)
     
     result = get_names(self.fb_graph, result_mid)[0]
     return result
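The candidate-generation loop above relies on a find_ngrams helper defined elsewhere in the project. A plausible minimal version, shown here only as an assumption about what it returns:

def find_ngrams(tokens, n):
    # All contiguous n-token windows, as a set of tuples.
    return set(zip(*[tokens[i:] for i in range(n)]))

# find_ngrams("who wrote the iliad".split(), 2)
# -> {('who', 'wrote'), ('wrote', 'the'), ('the', 'iliad')}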
Example #8
    def movie_query(self, title, year, single_query=False, caller_name=None):
        title = strip_accents(title)

        if self.caller_name is None:
            if caller_name is None:
                caller_name = get_caller_name()
            self.caller_name = caller_name

        self.title = source_utils.clean_title(title)
        self.year = year

        full_query = '%s %s' % (title, year)
        use_cache_only = self._get_cache(full_query)
        if use_cache_only:
            return self._get_movie_results()
        skip_set_cache = False

        try:
            self._url = self._find_url()
            if self._url is None:
                self._set_cache(full_query)
                return self._get_movie_results()

            movie = lambda query: self._query_thread(query,
                                                     [self.filter_movie_title])
            queries = [movie(self.title + ' ' + self.year)]

            try:
                alternative_title = replace_text_with_int(self.title)
                if self.title != alternative_title:
                    queries.append(movie(alternative_title + ' ' + self.year))
            except Exception:
                pass

            wait_threads(queries)

            if len(self._temp_results) == 0 and not single_query and not self._request.has_timeout_exc:
                self._set_cache(full_query)
                skip_set_cache = True
                wait_threads([movie(self.title)])

            if not skip_set_cache:
                self._set_cache(full_query)
            return self._get_movie_results()

        except Exception:
            if not skip_set_cache:
                self._set_cache(full_query)
            return self._get_movie_results()
Example #9
def gen_bibkey(record, all_keys):
    """
    Generate a unique bibtex key for the given record.

    Args:
        record (dict): a record of the bibtex entry.
        all_keys (set): a set of existing bibtex keys in the current context.

    Returns:
        A string which corresponds to the newly generated unique bibtex key.
        The new key is also added to 'all_keys'.
    """
    for field in ['year', 'title', 'author']:
        if field not in record:
            record_str = json.dumps(record, sort_keys=True, indent=4, separators=(',', ': '))
            raise ValueError("Missing field '{0}' in bibtex entry:\n{1}".format(field, record_str))

    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped down last name of the first author
    first_author = record_copy['author'][0]
    stripped = utils.strip_accents(codecs.decode(first_author, "ulatex"))
    last_name = stripped.split(',')[0]
    last_name = last_name.replace('ø', 'o')
    last_name = last_name.replace('ł', 'l')
    last_name = re.sub('([^a-zA-Z])', '', last_name)

    # Then get the first 3 initials of the article title
    curated_title = re.sub('([^a-zA-Z])', ' ', utils.get_title(record_copy))
    short_title = ''.join(s[0] for s in curated_title.split())
    short_title += curated_title.split()[-1][1:]
    short_title = short_title[:3].upper()

    # Key is Author:Year:Initials
    basekey = last_name + ":" + record_copy['year'] + ":" + short_title
    bibkey = basekey

    # Assign a unique key
    tail = 'a'
    while bibkey in all_keys:
        bibkey = basekey + tail
        tail = chr(ord(tail) + 1)

    all_keys.add(bibkey)
    return bibkey
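The uniqueness loop at the end is the part worth noting: it retries with suffixes 'a', 'b', ... until the key is free. The same idea in isolation (a sketch; note that after 26 collisions chr() walks past 'z' into punctuation):

def uniquify(basekey, existing):
    key, tail = basekey, 'a'
    while key in existing:
        key = basekey + tail
        tail = chr(ord(tail) + 1)  # 'a' -> 'b' -> ...
    existing.add(key)
    return key

keys = set()
print([uniquify('Doe:2020:ABC', keys) for _ in range(3)])
# ['Doe:2020:ABC', 'Doe:2020:ABCa', 'Doe:2020:ABCb']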
Example #10
    def format_msg_state(self, data, name):
        result = Messages.NOT_FOUND_STATE.value
        flag = None  # avoid a NameError below when data is not a dict

        if isinstance(data, dict):
            state = ParseDictAsObj(data)
            name = strip_accents(state.state).lower().replace(' ', '-')

            flag = download_img(
                UrlFlag.URL_FLAG_STATE.value.format(
                    state=name if name != 'sao-paulo' else name + '1'))

            result = (f"UF: {state.uf}\n"
                      f"Estado: {state.state}\n"
                      f"Confirmados: {state.cases}\n"
                      f"Suspeitos: {state.suspects}\n"
                      f"Casos descartados: {state.refuses}\n"
                      f"Mortes: {state.deaths}")

        return result, flag
Example #11
def generate_documents():
    events = r.keys('event:*:title')
    for event_key in events:
        event_id = id(event_key)
        lang = r.get('event:' + event_id + ':lang')
        docs = r.keys('document:*:' + event_id)
        documents[event_id] = []
        for doc_key in docs:
            doc_id = id(doc_key)
            tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
            document = []
            for tweet_id in tweet_ids:
                # this could be improved...
                tweet = utils.remove_entities(tweet_id)
                tweet = parser.unescape(' '.join(tweet.split()))
                if len(tweet) == 0 or len(tweet.split()) == 0:
                    continue
                tweet = utils.strip_accents(tweet)
                tweet = utils.remove_stopwords(tweet, lang)
                tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
                document.append(tweet)
            documents[event_id].append(' '.join(document))
Example #12
def generate_documents_for(event_id):
    lang = r.get('event:' + event_id + ':lang')
    if lang is None:
        lang = 'spanish'
    docs = r.keys('document:*:' + event_id)
    documents[event_id] = []
    documents_ids[event_id] = []

    keys = []
    for eid in docs:
        keys.append(id(eid))

    docs = set(keys)
    for doc_id in docs:
        #doc_id = id(doc_key)

        # fb could not be resolved, and many documents end up pointing to unsupportedbrowser
        # fb is ignored until this problem is fixed
        url = r.get('document:%s:url' % doc_id)
        if urlparse(url).netloc == 'www.facebook.com':
            continue

        documents_real_ids.append(doc_id)
        tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
        documents_ids[event_id].append(tweet_ids)

        document = []
        for tweet_id in tweet_ids:
            # this could be improved...
            tweet = utils.remove_entities(tweet_id)
            tweet = parser.unescape(' '.join(tweet.split()))
            if len(tweet) == 0 or len(tweet.split()) == 0:
                continue
            tweet = utils.strip_accents(tweet)
            tweet = utils.remove_stopwords(tweet, lang)
            tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
            document.append(tweet)
        documents[event_id].append(' '.join(document))
Example #13
def get_search_terms_news(redis, news_id, lang):
    # get all child pages of the event with id=news_id
    # that have not been processed before*
    keys = redis.keys('page:*:news_%s' % news_id)

    terms = []
    for key in keys:
        id = key.split(':')[1]

        got = redis.get('page:%s:searched' % id)

        # so tweets for an event page can be searched twice
        if got is None or got < 2:
            title = redis.get('page:%s:title' % id)
            title = title.decode('utf-8', errors='ignore')
            title = h.unescape(title)
            title = utils.strip_accents(title)
            title = utils.remove_stopwords(title, lang=lang)
            terms.append(title)

            redis.incr('page:%s:searched' % id)

    print(tag, 'got', len(terms), 'search terms for news')
    return terms
Example #14
def postprocessing_td012(td012):
    table = td012.copy()

    is_rpn = table.rpn > 0
    is_rpint = table.rpint > 0
    is_chaudiere = is_rpint | is_rpn
    is_chaudiere = is_chaudiere | ~table.tv038_puissance_nominale_id.isnull()
    # concatenate all raw text descriptions
    gen_ch_concat_txt_desc = table['tv031_type_generateur'].astype(
        'string').replace(np.nan, '') + ' '
    gen_ch_concat_txt_desc.loc[is_chaudiere] += 'chaudiere '
    gen_ch_concat_txt_desc += table['tv036_type_chaudiere'].astype(
        'string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv030_type_installation"].astype(
        'string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv032_type_generateur"].astype(
        'string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table['tv035_type_chaudiere'].astype(
        'string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table['tv036_type_generation'].astype(
        'string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv030_type_installation"].astype(
        'string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tr004_description"].astype(
        'string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv045_energie"].astype('string').replace(
        np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table['tv046_nom_reseau'].isnull().replace({
        False:
        'réseau de chaleur',
        True:
        ""
    })
    gen_ch_concat_txt_desc = gen_ch_concat_txt_desc.str.lower().apply(
        lambda x: strip_accents(x))

    table['gen_ch_concat_txt_desc'] = gen_ch_concat_txt_desc

    table['gen_ch_concat_txt_desc'] = table['gen_ch_concat_txt_desc'].apply(
        lambda x: clean_str(x))

    # compute gen_ch_lib_infer by text matching score.
    unique_gen_ch = table.gen_ch_concat_txt_desc.unique()
    gen_ch_lib_infer_dict = {
        k: affect_lib_by_matching_score(k, gen_ch_normalized_lib_matching_dict)
        for k in unique_gen_ch
    }
    table['gen_ch_lib_infer'] = table.gen_ch_concat_txt_desc.replace(
        gen_ch_lib_infer_dict)

    # compute the heating energy type

    table['type_energie_chauffage'] = table['tv045_energie'].replace(
        replace_elec_tv045_ener)

    # recover/fix heat pumps (PAC)
    is_pac = (table.coefficient_performance > 2) | (table.rendement_generation
                                                    > 2)
    table.loc[is_pac, 'gen_ch_lib_infer'] = table.loc[
        is_pac, 'coefficient_performance'].replace(pac_dict)
    is_ind = is_pac & (
        ~table.loc[is_pac, 'gen_ch_lib_infer'].isin(pac_dict.values()))
    # fall back to rendement_generation only for rows still unresolved
    table.loc[is_ind, 'gen_ch_lib_infer'] = table.loc[
        is_ind, 'rendement_generation'].replace(pac_dict)
    is_ind = is_pac & (
        ~table.loc[is_pac, 'gen_ch_lib_infer'].isin(pac_dict.values()))
    table.loc[is_ind, 'gen_ch_lib_infer'] = 'pac indeterminee'

    # recover/fix wood stoves
    is_bois = table.gen_ch_concat_txt_desc == 'bois, biomasse bois, biomasse'

    table.loc[is_bois, 'gen_ch_lib_infer'] = table.loc[
        is_bois, 'rendement_generation'].replace(poele_dict)

    is_ind = is_bois & (
        ~table.loc[is_bois, 'gen_ch_lib_infer'].isin(poele_dict.values()))
    table.loc[is_ind, 'gen_ch_lib_infer'] = 'non affecte'

    # recover district-heating networks
    non_aff = table.gen_ch_lib_infer == 'non affecte'

    reseau_infer = non_aff & (table.rendement_generation == 0.97) & (
        table.tr004_description == 'Autres énergies')

    table.loc[reseau_infer, 'gen_ch_lib_infer'] = 'reseau de chaleur'
    table.loc[reseau_infer, 'type_energie_chauffage'] = 'Réseau de chaleurs'

    table['gen_ch_lib_infer_simp'] = table.gen_ch_lib_infer.replace(
        gen_ch_lib_simp_dict)

    # fix electric boilers

    bool_ej = table.gen_ch_lib_infer == 'autres emetteurs a effet joule'
    bool_ce = table.rendement_generation == 0.77

    table.loc[(bool_ej) & (bool_ce),
              'gen_ch_lib_infer'] = 'chaudiere electrique'

    rendement_gen_u = table[[
        'rendement_generation', 'coefficient_performance'
    ]].max(axis=1)

    s_rendement = pd.Series(index=table.index)
    s_rendement[:] = 1
    for rendement in [
            'rendement_distribution_systeme_chauffage',
            'rendement_emission_systeme_chauffage'
    ]:
        r = table[rendement].astype(float)
        r[r == 0] = 1
        r[r.isnull()] = 1
        s_rendement = s_rendement * r

    rendement_gen_u[rendement_gen_u == 0] = 1
    rendement_gen_u[rendement_gen_u.isnull()] = 1
    s_rendement = s_rendement * rendement_gen_u
    table['besoin_chauffage_infer'] = table[
        'consommation_chauffage'] * s_rendement

    return table
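Both postprocessing functions on this page follow the same pattern: concatenate descriptive text columns (treating NaN as empty), normalize with lower() and strip_accents, then compute the expensive label matching once per unique string and broadcast it back. A toy sketch of that pattern; the column names and the mapping rule are illustrative, not the project's:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'type_generateur': ['Chaudière gaz', np.nan],
    'energie': ['Gaz naturel', 'Électricité'],
})

# Concatenate text columns, treating missing values as empty strings.
desc = (df['type_generateur'].fillna('') + ' ' + df['energie'].fillna('')).str.lower()
# the real code also applies strip_accents and clean_str here

# Match each unique string once, then map the result back onto the table.
mapping = {s: 'chaudiere' if 'chaudi' in s else 'autre' for s in desc.unique()}
df['lib_infer'] = desc.map(mapping)
print(df[['lib_infer']])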
Example #15
    def sources(self, simple_info, hostDict, hostprDict):
        if simple_info is None:
            return []

        supported_hosts = hostDict + hostprDict
        sources = []

        try:
            query_type = None
            if simple_info.get('title', None) is not None:
                query_type = 'movie'
                query = '%s %s' % (strip_accents(
                    simple_info['title']), simple_info['year'])
            else:
                query_type = 'episode'
                query = '%s S%sE%s' % (strip_accents(
                    simple_info['show_title']),
                                       simple_info['season_number_xx'],
                                       simple_info['episode_number_xx'])

            if len(supported_hosts) > 0:
                url = self.scraper._find_url()

                def search(url):
                    try:
                        result = self.search(url, query)
                        if result is None:
                            raise requests.exceptions.RequestException()
                        return result
                    except requests.exceptions.RequestException:
                        url = self.scraper._find_next_url(url)
                        if url is None:
                            return []
                        return search(url)

                hoster_results = search(url) if url is not None else []
            else:
                hoster_results = []

            for result in hoster_results:
                quality = source_utils.get_quality(result.title)

                if query_type == 'movie' and not source_utils.filter_movie_title(
                        result.title, simple_info['title'],
                        simple_info['year']):
                    continue

                if query_type == 'episode' and not source_utils.filter_single_episode(
                        simple_info, result.title):
                    continue

                for url in result.urls:
                    domain = re.findall(r"https?:\/\/(www\.)?(.*?)\/.*?",
                                        url)[0][1]

                    if domain not in supported_hosts:
                        continue
                    if any(x in url for x in ['.rar', '.zip', '.iso']):
                        continue

                    quality_from_url = source_utils.get_quality(url)
                    if quality_from_url != 'SD':
                        quality = quality_from_url

                    sources.append({
                        'release_title':
                        strip_non_ascii_and_unprintable(result.title),
                        'source':
                        domain,
                        'quality':
                        quality,
                        'language':
                        'en',
                        'url':
                        url,
                        'info': [],
                        'direct':
                        False,
                        'debridonly':
                        False
                    })

            sources.reverse()

            result_count = len(
                sources) if len(supported_hosts) > 0 else 'disabled'
            tools.log(
                'a4kScrapers.%s.%s: %s' %
                (query_type, self._caller_name, result_count), 'notice')

            return sources
        except Exception:
            traceback.print_exc()
            return sources
Example #16
    bert_reader = BertReader(args)
    ansrini_searcher = build_searcher(args.k1,
                                      args.b,
                                      args.index_path,
                                      args.rm3,
                                      chinese=args.chinese)

    count_hit = [0] * (args.para_num)
    count_total = [0] * (args.para_num)

    all_results = []

    for question_id in trange(len(QAs)):
        start_time = time.time()
        question = strip_accents(
            QAs[question_id]['question'])  # strip accents/diacritics from the question
        if args.chinese and args.toSimplified:
            question = HanziConv.toSimplified(question)
        paragraphs = anserini_retriever(question, ansrini_searcher,
                                        args.para_num)
        if len(paragraphs) == 0:
            continue
        paragraph_texts = []
        paragraph_scores = []
        hit_flag = False
        for paragraph_id, paragraph in enumerate(paragraphs):
            paragraph_texts.append(paragraph['text'])
Example #17
    def get_disc_info(self):
        """
            Returns information about the selected disc

            Inputs:
                None

            Outputs:
                None
        """

        proc = subprocess.Popen(
            [
                '%smakemkvcon' % self.makemkvconPath,
                '-r',
                'info',
                'dev:/dev/sr0',
                '--decrypt',
                '--minlength=%d' % self.minLength,
                '--messages=/tmp/makemkvMessages'
            ],
            stderr=subprocess.PIPE
        )

        (results, errors) = proc.communicate()

        if proc.returncode != 0:
            self.log.error(
                "MakeMKV (get_disc_info) returned status code: %d" % proc.returncode)

        if errors is not None:
            if len(errors) != 0:
                self.log.error("MakeMKV encountered the following error: ")
                self.log.error(errors)
                return False

        foundtitles = int(self._read_mkv_messages("TCOUNT")[0])

        self.log.debug("MakeMKV found {} titles".format(foundtitles))

        if foundtitles > 0:
            for titleNo in set(self._read_mkv_messages("TINFO")):
                durTemp = self._read_mkv_messages("TINFO", titleNo, 9)[0]
                x = time.strptime(durTemp, '%H:%M:%S')
                titleDur = datetime.timedelta(
                    hours=x.tm_hour,
                    minutes=x.tm_min,
                    seconds=x.tm_sec
                ).total_seconds()

                if self.vidType == "tv" and titleDur > self.maxLength:
                    self.log.debug("Excluding Title No.: {}, Title: {}. Exceeds maxLength".format(
                        titleNo,
                        self._read_mkv_messages("TINFO", titleNo, 27)
                    ))
                    continue

                if self.vidType == "movie" and not re.search('00', self._read_mkv_messages("TINFO", titleNo, 27)[0]):
                    self.log.debug("Excluding Title No.: {}, Title: {}. Only want first title".format(
                        titleNo,
                        self._read_mkv_messages("TINFO", titleNo, 27)
                    ))
                    continue

                self.log.debug("MakeMKV title info: Disc Title: {}, Title No.: {}, Title: {}, ".format(
                    self._read_mkv_messages("CINFO", 2),
                    titleNo,
                    self._read_mkv_messages("TINFO", titleNo, 27)
                ))

                title = self._read_mkv_messages("TINFO", titleNo, 27)[0]
                rename_title = utils.strip_accents(title)
                rename_title = utils.clean_special_chars(rename_title)

                self.saveFiles.append({
                    'index': titleNo,
                    'title': title,
                    'rename_title': rename_title,
                })
Example #18
    def episode_query(self,
                      simple_info,
                      auto_query=True,
                      single_query=False,
                      caller_name=None,
                      exact_pack=False):
        simple_info['show_title'] = strip_accents(simple_info['show_title'])

        if self.caller_name is None:
            if caller_name is None:
                caller_name = get_caller_name()
            self.caller_name = caller_name

        simple_info['show_aliases'] = list(set(simple_info['show_aliases']))
        if '.' in simple_info['show_title']:
            no_dot_show_title = simple_info['show_title'].replace('.', '')
            simple_info['show_aliases'].append(no_dot_show_title)

        for alias in simple_info['show_aliases'][:]:  # iterate over a copy; the loop body appends
            if '.' in alias:
                simple_info['show_aliases'].append(alias.replace('.', ''))

        self.simple_info = simple_info
        self.year = simple_info['year']
        self.country = simple_info['country']
        self.show_title = source_utils.clean_title(simple_info['show_title'])
        if self.year in self.show_title:
            self.show_title_fallback = re.sub(
                r'\s+', ' ', self.show_title.replace(self.year, ''))
        else:
            self.show_title_fallback = None

        self.episode_title = source_utils.clean_title(
            simple_info['episode_title'])
        self.season_x = simple_info['season_number']
        self.episode_x = simple_info['episode_number']
        self.season_xx = self.season_x.zfill(2)
        self.episode_xx = self.episode_x.zfill(2)

        #full_query = '%s %s %s %s %s' % (self.show_title, self.year, self.season_xx, self.episode_xx, self.episode_title)
        # use_cache_only = self._get_cache(full_query)
        # if use_cache_only:
        #     return self._get_episode_results()

        try:
            self._url = self._find_url()
            if self._url is None:
                #self._set_cache(full_query)
                return self._get_episode_results()

            if auto_query is False:
                wait_threads([self._episode('')])
                #self._set_cache(full_query)
                return self._get_episode_results()

            def query_results():
                if DEV_MODE:
                    if self.caller_name != 'eztv':
                        wait_threads([
                            self._season(self.show_title +
                                         ' S%s' % self.season_xx)
                        ])
                    else:
                        wait_threads([
                            self._episode(self.show_title + ' S%sE%s' %
                                          (self.season_xx, self.episode_xx))
                        ])
                    return

                # specials
                if self.season_x == '0':
                    wait_threads([
                        self._episode_special(self.show_title +
                                              ' %s' % self.episode_title)
                    ])
                    #self._set_cache(full_query)
                    return

                queries = [
                    self._episode(self.show_title + ' S%sE%s' %
                                  (self.season_xx, self.episode_xx))
                ]

                if single_query:
                    #self._set_cache(full_query)
                    wait_threads(queries)
                    return

                if exact_pack:
                    queries = queries + [
                        self._season_and_pack(self.show_title +
                                              '.S%s.' % self.season_xx)
                    ]
                else:
                    queries = queries + [
                        self._season(self.show_title + ' Season ' +
                                     self.season_x),
                        self._season(self.show_title +
                                     ' S%s' % self.season_xx),
                        self._pack(self.show_title + ' Seasons'),
                        self._season_and_pack(self.show_title + ' Complete')
                    ]

                if simple_info.get('isanime', False) and simple_info.get(
                        'absolute_number', None) is not None:
                    queries.insert(
                        0,
                        self._episode(self.show_title +
                                      ' %s' % simple_info['absolute_number']))

                if self._use_thread_for_info:
                    wait_threads([queries[0]])
                else:
                    wait_threads(queries)

            query_results()
            if len(self._temp_results
                   ) == 0 and self.show_title_fallback is not None:
                self.show_title = self.show_title_fallback
                self.simple_info['show_title'] = self.show_title_fallback
                query_results()

            #self._set_cache(full_query)
            return self._get_episode_results()

        except Exception:
            #self._set_cache(full_query)
            return self._get_episode_results()
Example #19
def postprocessing_ecs_ft(td005):
    td005_ecs = td005.loc[td005.tr011_sous_categorie_fiche_technique_id == '17']
    vr_ecs = td005_ecs.valeur_renseignee.str.lower().apply(lambda x: strip_accents(x))
    sys_ecs_lib_infer_ft = vr_ecs.apply(
        lambda x: affect_lib_by_matching_score(x, gen_ecs_normalized_lib_matching_dict_ft))
Example #20
def lookup_muni(name_muni=None, code_muni=None, verbose=False):
    """ Lookup municipality codes and names.

    Input a municipality NAME or CODE and get the names and codes of the municipality's corresponding state, meso, micro,
    intermediate, and immediate regions. You should not select both code_muni and name_muni

    Parameters
    ----------

    name_muni : str, optional
    The municipality name to be looked up

    code_muni: str, optional
    The municipality code to be looked up

    verbose : bool, optional
    by default False

    Returns
    -------
    data.frame with 13 columns identifying the geographies information of that municipality

    Details Only available from 2010 Census data so far

    Raise
    -------
    Exception if code_muni or name_muni cannot be found

    Example
    -------
    >>> import geobr

    # Lookup table for municipality of Rio de Janeiro
    >>> mun = lookup_muni('Rio de Janeiro)
    or
    >>> mun = lookup_muni(3304557)

    # lookup table for all municipalities
    >>> mun_all = lookup_muni("all")
    """
    # Get metadata with data url addresses
    temp_meta = utils.select_metadata(geo='lookup_muni', year=2010)

    # Read DataFrame available at provided url
    lookup_table_2010 = utils.download_metadata(
        temp_meta.loc[:, 'download_path'].to_list()[0])
    lookup_table_2010['name_muni_format'] = lookup_table_2010[
        'name_muni_format'].str.lower()

    # Search by inputs
    if code_muni == 'all' or name_muni == 'all' or (code_muni is None
                                                    and name_muni is None):
        if verbose:
            print(f"Returning results for all municipalities")
        return lookup_table_2010.iloc[:, :-1]
    elif code_muni is not None:
        if name_muni is not None:
            if verbose:
                print("Ignoring argument name_muni")
        try:
            output = lookup_table_2010[lookup_table_2010['code_muni'] == int(
                code_muni)].iloc[:, :-1]
            if verbose:
                print(
                    f"Returning results for municipality {output.loc[:, 'name_muni'].to_list()[0]}"
                )
            return output
        except KeyError:
            raise Exception(
                f'The `code_muni` argument {code_muni} was not found in the database.'
            )
    elif name_muni is not None:
        # Cleaning from accents and turning into lower cases without spaces
        name_muni = utils.strip_accents(str(name_muni).lower().strip())
        output = lookup_table_2010[lookup_table_2010['name_muni_format'] ==
                                   name_muni]

        if len(output) == 0:
            if verbose:
                print("Please insert a valid municipality name")
            raise Exception(
                f'The `name_muni` argument {name_muni} was not found in the database.'
            )
        else:
            if verbose:
                print(
                    f"Returning results for municipality {output.loc[:, 'name_muni'].to_list()[0]}"
                )
            return output.iloc[:, :-1]
    elif code_muni == 'all' and name_muni == 'all':  # NOTE: unreachable; the first branch already handles 'all'
        if verbose:
            print(
                "Please insert either a municipality name or a municipality code"
            )
Example #21
    def get_disc_info(self):
        """
            Returns information about the selected disc

            Inputs:
                None

            Outputs:
                None
        """

        proc = subprocess.Popen(
            [
                '%smakemkvcon' % self.makemkvconPath,
                '-r',
                'info',
                'disc:%d' % self.discIndex,
                '--decrypt',
                '--minlength=%d' % self.minLength,
                '--messages=/tmp/makemkvMessages'
            ],
            stderr=subprocess.PIPE
        )

        (results, errors) = proc.communicate()

        if proc.returncode != 0:
            self.log.error(
                "MakeMKV (get_disc_info) returned status code: %d" % proc.returncode)

        if errors is not None:
            if len(errors) != 0:
                self.log.error("MakeMKV encountered the following error: ")
                self.log.error(errors)
                return False

        foundtitles = int(self._read_mkv_messages("TCOUNT")[0])

        self.log.debug("MakeMKV found {} titles".format(foundtitles))

        if foundtitles > 0:
            for titleNo in set(self._read_mkv_messages("TINFO")):
                durTemp = self._read_mkv_messages("TINFO", titleNo, 9)[0]
                x = time.strptime(durTemp, '%H:%M:%S')
                titleDur = datetime.timedelta(
                    hours=x.tm_hour,
                    minutes=x.tm_min,
                    seconds=x.tm_sec
                ).total_seconds()

                if self.vidType == "tv" and titleDur > self.maxLength:
                    self.log.debug("Excluding Title No.: {}, Title: {}. Exceeds maxLength".format(
                        titleNo,
                        self._read_mkv_messages("TINFO", titleNo, 27)
                    ))
                    continue

                if self.vidType == "movie" and not re.search('00', self._read_mkv_messages("TINFO", titleNo, 27)[0]):
                    self.log.debug("Excluding Title No.: {}, Title: {}. Only want first title".format(
                        titleNo,
                        self._read_mkv_messages("TINFO", titleNo, 27)
                    ))
                    continue

                self.log.debug("MakeMKV title info: Disc Title: {}, Title No.: {}, Title: {}, ".format(
                    self._read_mkv_messages("CINFO", 2),
                    titleNo,
                    self._read_mkv_messages("TINFO", titleNo, 27)
                ))

                title = self._read_mkv_messages("TINFO", titleNo, 27)[0]
                rename_title = utils.strip_accents(title)
                rename_title = utils.clean_special_chars(rename_title)

                self.saveFiles.append({
                    'index': titleNo,
                    'title': title,
                    'rename_title': rename_title,
                })
Example #22
def process_content(real_content, lang):
    real_content = utils.strip_accents(real_content)
    real_content = utils.remove_stopwords(real_content, lang)
    real_content = utils.stem(real_content, lang)
    return real_content
Example #23
 def cleanText(self, text):
     return utils.getTokensNoUserNoHashtag(utils.strip_accents(text))
Example #24
import json
from tqdm import trange, tqdm

from bert_reader import BertReader
from args import *
from utils import strip_accents


if __name__ == "__main__":
    QAs = convert_squad_to_list("./data/squad_v1.1/dev-v1.1.json")

    bert_reader = BertReader(args)
    all_results = []
    for question_id in trange(len(QAs)):
        question = strip_accents(QAs[question_id]["question"])
        paragraph_texts = [QAs[question_id]["context"]]
        id_ = QAs[question_id]["id"]

        paragraph_scores = [100]

        final_answers = bert_reader.predict(id_, question, paragraph_texts, paragraph_scores)
        print(question, final_answers)

        all_results.append(final_answers)
    with open("pytorch_bert_squad.json", 'w') as fp:
        json.dump(all_results, fp)
Example #25
def postprocessing_td014(td013, td014):
    table = td014.copy()

    table = table.merge(td013[[
        'tr005_description', 'td013_installation_ecs_id',
        'surface_habitable_echantillon'
    ]],
                        on='td013_installation_ecs_id')

    is_chaudiere = table.rpn > 0

    gen_ecs_concat_txt_desc = table["tv027_type_installation"].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc.loc[is_chaudiere] += 'chaudiere '

    gen_ecs_concat_txt_desc += table['tv027_type_systeme'].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv027_type_installation'].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tv032_type_generateur"].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv036_type_generation'].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv037_type_production'].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv040_type_generateur'].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tv040_type_installation"].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tr004_description"].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tv045_energie"].astype('string').replace(
        np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv047_type_generateur'].astype(
        'string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tr005_description'].astype(
        'string').replace(np.nan, '') + ' '

    gen_ecs_concat_txt_desc = gen_ecs_concat_txt_desc.str.lower().apply(
        lambda x: strip_accents(x))

    table['gen_ecs_concat_txt_desc'] = gen_ecs_concat_txt_desc

    table['gen_ecs_concat_txt_desc'] = table['gen_ecs_concat_txt_desc'].apply(
        lambda x: clean_str(x))

    # compute gen_ecs_lib_infer by text matching score.
    unique_gen_ecs = table.gen_ecs_concat_txt_desc.unique()
    gen_ecs_lib_infer_dict = {
        k: affect_lib_by_matching_score(k,
                                        gen_ecs_normalized_lib_matching_dict)
        for k in unique_gen_ecs
    }
    table['gen_ecs_lib_infer'] = table.gen_ecs_concat_txt_desc.replace(
        gen_ecs_lib_infer_dict)
    is_pac = table.coefficient_performance > 2
    table.loc[
        is_pac,
        'gen_ecs_lib_infer'] = "ECS thermodynamique electrique(PAC ou ballon)"
    ecs_ind = table.gen_ecs_lib_infer == 'ecs electrique indeterminee'
    stockage = table.volume_stockage > 20
    table.loc[ecs_ind & stockage,
              'gen_ecs_lib_infer'] = 'ballon a accumulation electrique'
    table.loc[ecs_ind & (~stockage),
              'gen_ecs_lib_infer'] = 'ballon a accumulation electrique'
    table['gen_ecs_lib_infer_simp'] = table.gen_ecs_lib_infer.replace(
        gen_ecs_lib_simp_dict)

    # recover fuel-oil systems
    non_aff = table['gen_ecs_lib_infer'] == 'non affecte'
    fioul = table['tv045_energie'] == 'Fioul domestique'
    table.loc[fioul & non_aff, 'gen_ecs_lib_infer'] = 'chaudiere fioul'

    table['type_energie_ecs'] = table['tv045_energie'].replace(
        replace_elec_tv045_ener)

    table['score_gen_ecs_lib_infer'] = table['gen_ecs_lib_infer'].replace(
        sys_principal_score_lib).astype(float)

    return table