Ejemplo n.º 1
0
def cut_sentences_by_rule(text: str, sentence_delimiters: str = "。!?;"):
    """Split ``text`` into sentence units, paragraph by paragraph.

    Paragraphs are the pieces of ``text`` separated by ``"\\n"``. Inside a
    paragraph a sentence ends at any character contained in
    ``sentence_delimiters``; the delimiter stays attached to the sentence it
    closes. Text left over after the last delimiter of a paragraph becomes a
    sentence of its own.

    Args:
        text: The text to split.
        sentence_delimiters: Characters that terminate a sentence.

    Returns:
        A list of ``SyntacticUnit`` objects in reading order. For each unit,
        ``text`` is the sentence string, ``token`` is the unit's position in
        the returned list, ``index`` is the sentence's position inside its
        paragraph and ``paragraph`` is the paragraph number. Empty lines
        yield no units and do not advance the paragraph number.
    """
    stops = frozenset(sentence_delimiters)
    units: List[SyntacticUnit] = []
    paragraph_no = 0

    for line in text.split("\n"):
        sentence_no = 0
        start = 0  # offset of the first character of the pending sentence

        for pos, ch in enumerate(line):
            if ch not in stops:
                continue
            units.append(
                SyntacticUnit(
                    text=line[start:pos + 1],
                    token=len(units),
                    index=sentence_no,
                    paragraph=paragraph_no,
                )
            )
            start = pos + 1
            sentence_no += 1

        remainder = line[start:]
        if remainder:
            # Trailing text that was not closed by a delimiter.
            units.append(
                SyntacticUnit(
                    text=remainder,
                    token=len(units),
                    index=sentence_no,
                    paragraph=paragraph_no,
                )
            )
        elif sentence_no == 0:
            # Nothing at all on this line: keep the paragraph number as is.
            continue

        paragraph_no += 1

    return units
Ejemplo n.º 2
0
def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Pair each original unit with its filtered form as a ``SyntacticUnit``.

    Positions whose filtered form is the empty string are skipped.

    Args:
        original_units: Sequence of original texts; it drives the iteration.
        filtered_units: Sequence indexed by the same positions as
            ``original_units``; supplies the token of each unit.
        tags: Optional sequence indexed by the same positions. When truthy,
            element ``[1]`` of each entry is passed as the unit's tag;
            otherwise the tag is ``None``.

    Returns:
        A list of ``SyntacticUnit`` objects whose ``index`` attribute is the
        position of the unit in ``original_units``.
    """
    merged = []
    for position, original in enumerate(original_units):
        filtered = filtered_units[position]
        if filtered == '':
            # Nothing survived filtering for this position.
            continue

        if tags:
            tag = tags[position][1]
        else:
            tag = None

        unit = SyntacticUnit(original, filtered, tag)
        unit.index = position
        merged.append(unit)

    return merged
Ejemplo n.º 3
0
def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Build ``SyntacticUnit`` objects from parallel original/filtered units.

    A position is dropped when its entry in ``filtered_units`` is the empty
    string.

    Args:
        original_units: Sequence of original texts, iterated in order.
        filtered_units: Sequence read at the same positions as
            ``original_units``; each entry becomes the unit's token.
        tags: Optional sequence read at the same positions. If truthy, the
            second element of each entry is used as the tag, else ``None``.

    Returns:
        The list of created units; each has ``index`` set to its position in
        ``original_units``.
    """
    result = []
    for idx, raw_text in enumerate(original_units):
        cleaned = filtered_units[idx]
        if cleaned == '':
            continue  # filtered down to nothing, no unit for this position

        unit = SyntacticUnit(raw_text, cleaned, tags[idx][1] if tags else None)
        unit.index = idx
        result.append(unit)
    return result
Ejemplo n.º 4
0
def insert_unit(target_list: List[SyntacticUnit], raw: List[str],
                tokens: List[str], pidx: int, sidx: int) -> None:
    """Append one ``SyntacticUnit`` built from string pieces to ``target_list``.

    Args:
        target_list: List that receives the new unit; modified in place.
        raw: Strings concatenated without a separator to form the unit text.
        tokens: Strings joined with a single space to form the unit token.
        pidx: Value stored as the unit's ``paragraph``.
        sidx: Value stored as the unit's ``index``.
    """
    unit_text = "".join(raw)
    unit_token = " ".join(tokens)
    unit = SyntacticUnit(
        text=unit_text,
        token=unit_token,
        index=sidx,
        paragraph=pidx,
    )
    target_list.append(unit)