Ejemplo n.º 1
0
def add_bs_xml_content(text: str, f: Parsed, lang: str):
    """
    Add content to Parsed object from BeautifulSoup XML parser output.

    The text is processed twice: once sentence by sentence and once as a
    whole. Sentences are obtained by splitting on every '.', '!' or '?'
    (the delimiter itself is dropped by the split).

    Args:
        text: the text to process.
        f: Parsed object that receives the results through its ``add_*``
            methods.
        lang: passed through unchanged to ``stem_text`` and ``filter_text``.
    """
    for raw_sentence in re.split("[.!?]", text):
        # clean_text presumably returns a list of tokens - confirm against
        # its definition; the result is joined with spaces below.
        tokens = clean_text(raw_sentence)
        if len(tokens) <= 1:
            # Nothing is recorded for sentences with at most one element.
            continue

        f.add_content_sent(" ".join(tokens))
        stemmed_tokens = stem_text(tokens, lang)
        f.add_stemmed_sent(" ".join(stemmed_tokens))

        kept_tokens = filter_text(tokens, lang)
        if len(kept_tokens) <= 1:
            # The filtered variants are only recorded when more than one
            # element survives filtering.
            continue

        f.add_filtered_sent(" ".join(kept_tokens))
        f.add_filtered_stemmed_sent(" ".join(stem_text(kept_tokens, lang)))

    # Whole-text variants: these are passed on as returned by the helpers,
    # not joined into strings.
    all_tokens = clean_text(text)
    f.add_content(all_tokens)
    f.add_stemmed(stem_text(all_tokens, lang))

    kept_all = filter_text(all_tokens, lang)
    f.add_filtered(kept_all)
    f.add_filtered_stemmed(stem_text(kept_all, lang))
Ejemplo n.º 2
0
def add_xml_content(root, file: Parsed, language: str):
    """
    Transforms text from xml file into raw/filtered/stemmed forms and adds it to a file object.

    The node's ``text`` and ``tail`` are concatenated (the tail is preceded
    by a single space). If the combined text is empty, nothing is added to
    ``file``. Otherwise the text is processed sentence by sentence and then
    once more as a whole.

    Args:
        root: node exposing ``text`` and ``tail`` attributes, each either a
            string or None (presumably an ElementTree-style element -
            confirm against callers).
        file: Parsed object that receives the results through its ``add_*``
            methods.
        language: passed through unchanged to ``stem_text`` and
            ``filter_text``.
    """

    text = ''
    # Test for None by identity: comparing str(value) with 'None' would also
    # discard a node whose text is literally the word "None".
    if root.text is not None:
        text += root.text

    if root.tail is not None:
        text += ' ' + root.tail

    if text == '':
        return

    # Split after '.', '!' or '?' when followed by one or more spaces; the
    # lookbehind keeps the punctuation attached to its sentence.
    sentences = re.split('(?<=[.!?]) +', text)

    for sentence in sentences:
        # clean_text presumably returns a list of tokens - confirm against
        # its definition; the result is joined with spaces below.
        sentence = clean_text(sentence)

        # Nothing is recorded for sentences with at most one element.
        if len(sentence) <= 1:
            continue

        file.add_content_sent(" ".join(sentence))
        sentence_stemmed = stem_text(sentence, language)
        file.add_stemmed_sent(" ".join(sentence_stemmed))
        sentence_filtered = filter_text(sentence, language)

        # The filtered variants are only recorded when more than one
        # element survives filtering.
        if len(sentence_filtered) > 1:
            file.add_filtered_sent(" ".join(sentence_filtered))
            sentence_filtered_stemmed = stem_text(
                sentence_filtered, language)
            file.add_filtered_stemmed_sent(
                " ".join(sentence_filtered_stemmed))

    text_list = clean_text(text)

    # full text
    file.add_content(text_list)

    # stem the full text
    stemmed = stem_text(text_list, language)
    file.add_stemmed(stemmed)

    # filter the unstemmed full text
    filtered = filter_text(text_list, language)
    file.add_filtered(filtered)

    # stem the filtered text
    filtered_stemmed = stem_text(filtered, language)
    file.add_filtered_stemmed(filtered_stemmed)
Ejemplo n.º 3
0
def add_content(text: str, file: Parsed, language: str):
    """
    Transforms text into raw/filtered/stemmed forms and adds it to a file object.

    The text is processed sentence by sentence and then once more as a
    whole. Sentences are obtained by splitting after '.', '!' or '?' when
    followed by one or more spaces; the punctuation stays with its sentence.

    Args:
        text: the text to process.
        file: Parsed object that receives the results through its ``add_*``
            methods.
        language: passed through unchanged to ``stem_text`` and
            ``filter_text``.
    """

    for chunk in re.split('(?<=[.!?]) +', text):
        # clean_text presumably returns a list of words - confirm against
        # its definition; the result is joined with spaces below.
        words = clean_text(chunk)
        if len(words) <= 1:
            # Nothing is recorded for sentences with at most one element.
            continue

        file.add_content_sent(" ".join(words))
        words_stemmed = stem_text(words, language)
        file.add_stemmed_sent(" ".join(words_stemmed))

        words_filtered = filter_text(words, language)
        if len(words_filtered) <= 1:
            # The filtered variants are only recorded when more than one
            # element survives filtering.
            continue

        file.add_filtered_sent(" ".join(words_filtered))
        words_filtered_stemmed = stem_text(words_filtered, language)
        file.add_filtered_stemmed_sent(" ".join(words_filtered_stemmed))

    # Whole-text variants, passed on as returned by the helpers.
    whole = clean_text(text)
    file.add_content(whole)

    whole_stemmed = stem_text(whole, language)
    file.add_stemmed(whole_stemmed)

    whole_filtered = filter_text(whole, language)
    file.add_filtered(whole_filtered)

    whole_filtered_stemmed = stem_text(whole_filtered, language)
    file.add_filtered_stemmed(whole_filtered_stemmed)