Code example #1
File: translator.py Project: awsmith823/Data-Science
def translate_term(
        search_term,
        language,
        schema_index=None,
        params=None,
):
    """
    Parameters
    ----------
    - search_term (str): 
    - language (str): "english" | "garifuna" | "spanish"
    - schema_index:
    - params (dict):

    Output
    ------
    - translated results | suggestions | None
    """

    # resolve defaults lazily so the index is not loaded at import time
    if schema_index is None:
        schema_index = load_schema_index()
    if params is None:
        params = settings.search_params

    # pre-process
    language = language.lower()
    search_term = process_search_term(search_term)
    len_term = len(search_term)

    # instantiate parser
    parser = QueryParser(fieldname=language, schema=schema_index.schema)
    query = parser.parse(search_term)

    # search
    with schema_index.searcher() as searcher:

        suggested = searcher.suggest(
            fieldname=language,
            text=search_term,
            limit=params["limit"],
            maxdist=params["maxdist"],
            prefix=min(len_term, params["prefix"]),
        )

        if suggested and (search_term not in suggested):
            return {"search_term": search_term, "suggested": suggested}
        else:
            results = searcher.search(query, limit=5)
            if results:
                # format results
                results = pd.DataFrame(
                    data=[
                        split_text(hit["output"], sep="|")
                        for hit in results
                    ],
                    columns=params["languages"],
                )
                return {"search_term": search_term, "results": results}
            else:
                return {"search_term": search_term}
Code example #2
def split_text(text, max_length, recursive_until=None, step=10):
    """Split text with multiple characters according to max length.
    buffer is accumulated with different segments and it will be splitted if it exceed the max length.
    Return a list of string.
    """
    if len(text) <= max_length:
        return [text]
    # break on ASCII and fullwidth CJK punctuation/whitespace
    breaks = list(re.finditer(r'[ \n:：,，﹐。ㄧ?？!！;；、.]', text))
    segments = []
    start_offset = 0
    for k, p in enumerate(breaks):
        if p.end() - start_offset > max_length:
            # cut at the previous break (or at this one, when it is the first)
            end = breaks[k - 1].end() if k > 0 else p.end()
            if end > start_offset:
                segments.append(text[start_offset:end])
                start_offset = end
    # keep whatever remains after the last cut
    if segments and start_offset < len(text):
        segments.append(text[start_offset:])

    if not segments:
        if len(breaks) == 0:
            # no break characters at all: hard-truncate
            # (text[:None] keeps the whole string when recursive_until is None)
            return [text[:recursive_until]]
        # otherwise split roughly in half at the middle break
        mid = len(breaks) // 2
        segments = [
            text[:breaks[mid - 1].end()], text[breaks[mid - 1].end():]
        ]

    # if any segment is still too long, retry with a relaxed limit
    for segment in segments:
        if len(segment) > max_length:
            if recursive_until:
                if max_length + step < recursive_until:
                    return split_text(text,
                                      max_length + step,
                                      recursive_until=recursive_until)
                else:
                    return [text[:recursive_until]]
            else:
                raise Exception(
                    f'split segment is larger than {max_length}\n{segment}\n{text}'
                )
    return segments
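
A small usage sketch with a hypothetical input: split a long sentence into segments of at most 20 characters, letting the limit relax up to 60 if a clean cut is impossible:

text = "first clause, second clause, third clause. a fourth sentence here."
for seg in split_text(text, max_length=20, recursive_until=60):
    print(len(seg), repr(seg))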
Code example #3
    async def send_author_annotation_edit(cls, msg: Message, author_id: int, page: int):
        await cls.bot.send_chat_action(msg.chat.id, 'typing')

        annotation = await AuthorAnnotationAPI.get_by_author_id(author_id)
        if annotation is None:
            # message text: "No information for this author!"
            await cls.try_reply_or_send_message(msg.chat.id, "Нет информации для этого автора!",
                                                reply_to_message_id=msg.message_id)
            return

        msg_parts = split_text(annotation.body)

        # "Страница" is Russian for "Page"; pages are 1-based, hence msg_parts[page - 1]
        text = msg_parts[page-1] + f'\n\n<code>Страница {page}/{len(msg_parts)}</code>'

        keyboard = await get_keyboard(page, len(msg_parts), f"a_ann_{author_id}", only_one=True)

        await cls.bot.edit_message_text(text, chat_id=msg.chat.id, message_id=msg.message_id,
                                        parse_mode="HTML", reply_markup=keyboard)
Code example #4
    async def send_book_annotation(cls, msg: Message, book_id: int, page: int):
        await cls.bot.send_chat_action(msg.chat.id, 'typing')

        annotation = await BookAnnotationAPI.get_by_book_id(book_id)

        if annotation is None:
            # message text: "No annotation for this book!"
            await cls.try_reply_or_send_message(msg.chat.id, "Нет аннотации для этой книги!",
                                                reply_to_message_id=msg.message_id)
            return

        msg_parts = split_text(annotation.body)

        text = msg_parts[page-1] + f'\n<code>Страница {page}/{len(msg_parts)}</code>'

        keyboard = await get_keyboard(page, len(msg_parts), f"b_ann_{book_id}", only_one=True)
        if keyboard is None:
            keyboard = types.InlineKeyboardMarkup()
        keyboard.row(
            # button label: "Назад" = "Back"
            types.InlineKeyboardButton("Назад", callback_data=f"book_detail_{book_id}")
        )

        await cls.bot.edit_message_text(text, chat_id=msg.chat.id, message_id=msg.message_id,
                                        parse_mode="HTML", reply_markup=keyboard)
Code example #5
    # (snippet starts inside a loop over the raw posts, presumably
    #  `for i, content in enumerate(df):` as in code example #8)
    filtered = handle_space_and_newline(filtered)
    if len(filtered) > 5:
        df_clean.append(filtered)
        original.append(content)
    else:
        removed.append(content)
    if i % 1000 == 0:
        print(i)
print(len(df_clean))

#%%
df_clean_2 = []
original_2 = []
for content, origin in zip(df_clean, original):
    if len(content) > 512:
        splitted = split_text(content, 512)
        df_clean_2 += splitted
        for i in range(len(splitted)):
            original_2.append(origin)
    else:
        original_2.append(origin)
        df_clean_2.append(content)
print(len(max(df_clean_2, key=len)))
print(max(df_clean_2, key=len))
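
# Sanity-check sketch (illustrative addition): every split segment keeps a
# pointer to its source post, so the two lists must stay the same length.
assert len(df_clean_2) == len(original_2)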

#%%
filename = 'cleaned_1202_v5'
with open(f"data/{filename}.json", 'w', encoding='utf-8') as f:
    f.write(json.dumps(df_clean_2, ensure_ascii=False))
with open(f"data/{filename}_original.json", 'w', encoding='utf-8') as f:
    f.write(json.dumps(original, ensure_ascii=False))
Code example #6
File: split_text.py Project: shaharv/scripts
def main():
    (infile, resdir, regexp) = Helper.parse_args()
    utils.split_text(infile, resdir, regexp)
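
The snippet ends at the function definition; presumably the script closes with the usual entry-point guard:

if __name__ == '__main__':
    main()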
Code example #7

splitted_comments = [split_text(i, 40, recursive_until=128) for i in comments]

# %%
with open("pretraining_dataset.txt", 'w') as f:
    tmp = '\n\n'.join(['\n'.join(i) for i in splitted_comments])
    f.write(tmp + '\n')

#%% CUSTOM TOKEN
tmp_freq = {}
for comment in tqdm(comments, desc="Parsing"):
    eng_vocabs = re.findall(r"[A-Za-z']+", comment)
    eng_vocabs = [i.lower().strip() for i in eng_vocabs]
    for vocab in eng_vocabs:
        if vocab in tmp_freq:
            tmp_freq[vocab] += 1
        else:
            tmp_freq[vocab] = 1
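
The frequency loop above is the classic manual word count; an equivalent, more compact sketch using collections.Counter (the .strip() is a no-op on [A-Za-z']+ matches, so it is dropped here):

from collections import Counter

tmp_freq = Counter(
    v.lower() for c in comments for v in re.findall(r"[A-Za-z']+", c)
)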
Code example #8
# %%
# define the URL pattern once instead of rebuilding it every iteration
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

df_clean = []
for content in df:
    filtered = re.sub(url_regex, '', content)
    filtered = ''.join(c for c in filtered if c in vocab)
    filtered = handle_space_and_newline(filtered)
    df_clean.append(filtered)
print(len(df_clean))

#%%
df_clean_2 = []
for content in df_clean:
    if len(content) > 511:
        name, post_content = content.split('π')
        for i in split_text(post_content, 511 - len(name)):
            df_clean_2.append(name + ' ' + i)
    else:
        df_clean_2.append(re.sub('π', ' ', content))
print(len(max(df_clean_2, key=len)))
print(max(df_clean_2, key=len))

#%%
filename = 'post_1202_v1'
with open(f"data/{filename}.json", 'w', encoding='utf-8') as f:
    f.write(json.dumps(df_clean_2, ensure_ascii=False))
# with open(f"data/{filename}_original.json", 'w', encoding='utf-8') as f:
#     f.write(json.dumps(df_clean, ensure_ascii=False))
# with open(f"data/{filename}_removed.json", 'w', encoding='utf-8') as f:
#     f.write(json.dumps(df_clean, ensure_ascii=False))