def lemm_sent_process3(text,
                       remove_stopwords=False,
                       summary=False,
                       mode="nltk",
                       withdot=False):
    nlp = en_core_web_sm.load()
    stops = set(stopwords.words("english"))

    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    pos_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            # line = remove_word2(remove_word(line))
            # spacy
            if mode == "spacy":
                # line = remove_word2(remove_word(line))
                doc = nlp(line)
                # Trick spaCy into skipping dependency parsing by setting
                # doc.is_parsed = True, i.e. make it believe a dependency parse
                # has been assigned and sentence boundaries applied that way.
                # doc.is_parsed = True
                sentences = " ".join([
                    word.lemma_ if word.pos_.startswith('N')
                    or word.pos_.startswith('J') or word.pos_.startswith('V')
                    or word.pos_.startswith('R') else word.orth_
                    for word in doc if ((
                        not word.is_space and not word.is_bracket
                        and not word.is_digit and not word.is_left_punct
                        and not word.is_right_punct and not word.is_bracket
                        and not word.is_quote and not word.is_currency and
                        not word.is_punct and word.ent_type_ not in ent_list
                        and (word.lemma_ not in list(html_escape_table +
                                                     stpwords_list2)) and
                        (word.orth_ not in list(html_escape_table +
                                                stpwords_list2)) and
                        #                              not (word.is_stop and remove_stopwords) and
                        word.tag_ not in ["SYM", "HYPH"]
                        and word.lemma_ != "-PRON-"
                        #                              word.lemma_ not in total_stopwords
                    ))
                ])

                if sentences == "": continue
                if withdot: sentences = sentences + "."
                if summary: sentences = ' <s> ' + sentences + " </s> "
                sentences = sentences + "\n"

                text_lines.append(sentences)

    text_lines = "".join(text_lines)
    return text_lines
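
# A minimal usage sketch for lemm_sent_process3, assuming the module-level
# tables from Example 6 (html_escape_table, stpwords_list2, ent_list) are
# already defined; left commented out since those tables ship with the
# surrounding project:
# sample = "The cats were running quickly through the garden."
# print(lemm_sent_process3(sample, mode="spacy", withdot=True))
# # -> one lemmatized, filtered sentence per line, each ending with "."
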
def lemm_sent_process4(text,
                       remove_stopwords=False,
                       summary=False,
                       mode="nltk",
                       withdot=False):
    stops = set(stopwords.words("english"))
    # print('xxx')
    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    pos_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            # line = remove_word2(remove_word(line))
            # spacy
            if mode == "spacy":
                # line = remove_word2(remove_word(line))
                doc = nlp(line)
                # Trick spaCy into skipping dependency parsing by setting
                # doc.is_parsed = True, i.e. make it believe a dependency parse
                # has been assigned and sentence boundaries applied that way.
                # doc.is_parsed = True
                sentences = " ".join([
                    word.lemma_ if word.pos_.startswith('N')
                    or word.pos_.startswith('J') or word.pos_.startswith('V')
                    or word.pos_.startswith('R') else word.orth_
                    for word in doc if ((
                        not word.is_space and not word.is_bracket and
                        # not word.is_digit and
                        not word.is_left_punct and not word.is_right_punct
                        and not word.is_bracket and not word.is_quote
                        and not word.is_currency
                        # not word.is_punct and
                        # word.tag_ not in ["SYM", "HYPH"] and
                        # word.lemma_ != "-PRON-"
                    ))
                ])

                if sentences == "": continue
                # if withdot == True:  sentences = sentences + "."
                if summary: sentences = ' <s> ' + sentences + " </s> "
                sentences = sentences + "\n"
                if not withdot: sentences = sentences.replace(" . ", " ")

                text_lines.append(sentences)

    text_lines = "".join(text_lines)
    return text_lines
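
# A minimal usage sketch for lemm_sent_process4, assuming the module-level nlp
# object from Example 6 has been loaded; shown commented out:
# review = "This charger stopped working after two weeks."
# print(lemm_sent_process4(review, mode="spacy", summary=True))
# # -> each sentence wrapped as " <s> ... </s> " on its own line
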
# Example 3
def get_Lemm_NLTK(text, remove_stopwords=False, summary=False):
    text = text.lower()
    for k, v in contractions.items():
        if k in text:
            text = text.replace(k, v)

    for k in html_escape_table:
        if k in text:
            text = text.replace(k, "")

    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:

            word_tokens = word_tokenize(line)
            word_tokens = pos_tag(word_tokens)  # tag each token with its part of speech
            wnl = WordNetLemmatizer()
            # print(line)
            filtered_sentence = []
            for w in word_tokens:
                word, tag = w[0], w[1]
                # if word not in stop_words:
                wordnet_pos = get_wordnet_pos(tag)  # or wordnet.NOUN
                if wordnet_pos is not None:
                    lemmatize_word = wnl.lemmatize(word, pos=wordnet_pos)  # lemmatize with the mapped WordNet POS
                    filtered_sentence.append(lemmatize_word)
                else:
                    filtered_sentence.append(word)

            line = " ".join(filtered_sentence)
            if summary:
                text_lines.append(" <s> " + line + " </s> ")
            else:
                text_lines.append(line)

    # Make reviewText into a single string
    text = ' '.join(text_lines)

    # text = " ".join(sentences)
    text = remove_word2(text)

    # Optionally, remove stop words

    text = text.split()
    stops = set(stopwords.words("english"))
    if remove_stopwords:
        # `stops or stpwords_list1 or ...` would only test membership in the
        # first non-empty collection, so check each list explicitly
        text = [
            w for w in text
            if w not in stops and w not in stpwords_list1 and w not in stpwords_list2
        ]
    else:
        text = [
            w for w in text
            if w not in html_escape_table and w not in stpwords_list2
        ]
    text = " ".join(text)
    text = text + "."
    return text
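
# A minimal usage sketch for get_Lemm_NLTK, assuming the NLTK data packages
# (punkt, wordnet, averaged_perceptron_tagger) plus the module-level
# contractions, html_escape_table and stopword lists are available:
# print(get_Lemm_NLTK("I'm loving these headphones!", remove_stopwords=True))
# # -> a single lemmatized string ending with "."
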
# Example 4
def lemm_sent_process(text,
                      remove_stopwords=False,
                      summary=False,
                      mode="nltk",
                      withdot=False):
    text = text.lower()
    stops = set(stopwords.words("english"))
    for k, v in contractions.items():
        if k in text:
            text = text.replace(k, v)

    for k in html_escape_table:
        if k in text:
            text = text.replace(k, "")

    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            line = remove_word2(remove_word(line))
            # nltk
            if mode == "nltk":
                # ------------------------------------------------------------------
                word_tokens = word_tokenize(line)
                word_tokens = pos_tag(word_tokens)  # tag each token with its part of speech
                wnl = WordNetLemmatizer()
                # print(line)
                filtered_sentence = []
                for w in word_tokens:
                    word, tag = w[0], w[1]
                    if word in list(html_escape_table + stpwords_list2):
                        continue
                    if remove_stopwords:
                        if word in list(
                                list(stops) + list(stpwords_list1) +
                                list(stpwords_list3)):
                            continue
                    # if word not in stop_words:
                    wordnet_pos = get_wordnet_pos(tag)  # or wordnet.NOUN
                    if wordnet_pos is not None:
                        lemmatize_word = wnl.lemmatize(word, pos=wordnet_pos)  # lemmatize with the mapped WordNet POS
                        filtered_sentence.append(lemmatize_word)
                    else:
                        filtered_sentence.append(word)

                # filtered_sentence = [sent for sent in filtered_sentence if sent != "" ]
                line = " ".join(filtered_sentence)
                # line = remove_word2(remove_word(line))
                if line == "": continue
                if summary:
                    text_lines.append(" <s> " + line + " </s> ")
                else:
                    if withdot:
                        text_lines.append(line + ".\n")
                    else:
                        text_lines.append(line + "\n")
                    # --------------------------------------------------------------------------------
            # spacy
            elif mode == "spacy":
                # line = remove_word2(remove_word(line))
                doc = nlp(line)
                # Trick spaCy into skipping dependency parsing by setting
                # doc.is_parsed = True, i.e. make it believe a dependency parse
                # has been assigned and sentence boundaries applied that way.
                # doc.is_parsed = True
                sentences = " ".join([
                    word.lemma_ if word.pos_.startswith('N')
                    or word.pos_.startswith('J') or word.pos_.startswith('V')
                    or word.pos_.startswith('R') else word.orth_
                    for word in doc
                    if ((not word.is_space and not word.is_bracket
                         and not word.is_digit and not word.is_left_punct
                         and not word.is_right_punct and not word.is_bracket
                         and not word.is_quote and not word.is_currency
                         and not word.is_punct and (word.lemma_ not in list(
                             html_escape_table + stpwords_list2))
                         and not (word.is_stop and remove_stopwords)
                         and word.tag_ != "SYM" and word.tag_ != "HYPH"
                         and word.lemma_ != "-PRON-" and
                         (word.lemma_ not in stpwords_list3))
                        or word.lemma_ in ["the", "not"])
                ])

                if sentences == "": continue
                if withdot:
                    text_lines.append(sentences + ".\n")
                else:
                    text_lines.append(sentences + "\n")
    # --------------------------------------------------------------------------------
    text_lines = "".join(text_lines)
    return text_lines
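
# A minimal usage sketch for lemm_sent_process, assuming remove_word,
# remove_word2 and the stopword tables are defined as in the surrounding module:
# text = "The batteries didn't last long. Sound quality was great!"
# print(lemm_sent_process(text, mode="nltk", remove_stopwords=True, withdot=True))
# print(lemm_sent_process(text, mode="spacy"))
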
# Example 5
def get_pos_sequence(text,
                     remove_stopwords=False,
                     summary=False,
                     mode="spacy",
                     withdot=False):
    text = text.lower()
    stops = set(stopwords.words("english"))
    for k, v in contractions.items():
        if k in text:
            text = text.replace(k, v)

    for k in html_escape_table:
        if k in text:
            text = text.replace(k, "")

    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            line = remove_word2(remove_word(line))
            # nltk
            if mode == "nltk":
                # ------------------------------------------------------------------
                word_tokens = word_tokenize(line)
                word_tokens = pos_tag(word_tokens)  # tag each token with its part of speech
                wnl = WordNetLemmatizer()
                # print(line)
                filtered_sentence = []
                for w in word_tokens:
                    word, tag = w[0], w[1]
                    if word in list(html_escape_table + stpwords_list2):
                        continue
                    if remove_stopwords:
                        if word in list(list(stops) + list(stpwords_list1)):
                            continue
                    # if word not in stop_words:
                    wordnet_pos = get_wordnet_pos(tag)  # or wordnet.NOUN
                    if wordnet_pos is not None:
                        lemmatize_word = wnl.lemmatize(word, pos=wordnet_pos)  # lemmatize with the mapped WordNet POS
                        filtered_sentence.append(lemmatize_word)
                    else:
                        filtered_sentence.append(word)

                # filtered_sentence = [sent for sent in filtered_sentence if sent != "" ]
                line = " ".join(filtered_sentence)
                line = remove_word2(remove_word(line))
                if line == "": continue
                if summary:
                    text_lines.append(" <s> " + line + " </s> ")
                else:
                    if withdot:
                        text_lines.append(line + ".\n")
                    else:
                        text_lines.append(line + "\n")
                    # --------------------------------------------------------------------------------
            # spacy
            elif mode == "spacy":
                doc = nlp(line)
                # Trick spaCy into skipping dependency parsing by setting
                # doc.is_parsed = True, i.e. make it believe a dependency parse
                # has been assigned and sentence boundaries applied that way.
                # doc.is_parsed = True
                # collect (Penn Treebank tag, surface token) pairs for every token
                sent = [
                    (word.tag_, word.orth_) for word in doc
                    # if((not word.is_space and
                    #      not word.is_bracket and
                    #      not word.is_digit and
                    #      not word.is_left_punct and
                    #      not word.is_right_punct and
                    #      not word.is_bracket and
                    #      not word.is_quote and
                    #      not word.is_currency and
                    #      not word.is_punct and
                    #      not (word.is_stop and remove_stopwords) and
                    #      word.tag_ != "SYM" and word.tag_ != "HYPH" and word.tag_ != "HYPH" and word.ent_type_ == ""
                    #      and word.ent_type_ not in ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT",
                    #                                 "EVENT",
                    #                                 "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME",
                    #                                 "PERCENT",
                    #                                 "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]
                    #      and word.lemma_ != "-PRON-" and (word.lemma_ not in stpwords_list2)))
                ]

                if sent == "": continue
                text_lines.append(sent)
    if len(text_lines) != 0:
        pos_lines = []
        for line in text_lines:
            pos_sent = ' '.join([word + '/' + pos for pos, word in line])
            pos_lines.append(pos_sent)

        return text_lines, pos_lines
    else:
        return text_lines, None
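
# A minimal usage sketch for get_pos_sequence in its default spacy mode,
# assuming the same module-level helpers; it returns (tag, token) pairs plus
# "token/TAG" strings, roughly:
# sents, pos_sents = get_pos_sequence("The screen looks great.")
# # sents     -> [[('DT', 'the'), ('NN', 'screen'), ('VBZ', 'looks'), ...]]
# # pos_sents -> ['the/DT screen/NN looks/VBZ great/JJ ./.']
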
# Example 6
import re

import spacy
# gpu = spacy.prefer_gpu()
# print('GPU:', gpu)
from spacy.matcher import Matcher
import en_core_web_sm

# nlp = en_core_web_sm.load(disable = ['ner', 'tagger','parser'])
nlp = en_core_web_sm.load()

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# from stopwords import *

stops = set(stopwords.words("english"))
stpwords_list3 = [
    f.replace("\n", "") for f in open(
        "data_util/stopwords.txt", "r", encoding="utf-8").readlines()
]
stpwords_list3.remove("not")
total_stopwords = list(html_escape_table + stpwords_list2) + list(
    list(stops) + list(stpwords_list1) + list(stpwords_list3))
total_stopwords = set(total_stopwords)
total_stopwords.remove("the")
total_stopwords.remove("not")

ent_list = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
    'QUANTITY', 'ORDINAL', 'CARDINAL'
]
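
# get_wordnet_pos is called by the NLTK branches above but is not defined in
# this listing; the helper below is an assumed sketch of the conventional
# Penn-Treebank-to-WordNet tag mapping such a function usually performs.
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    # map coarse Penn Treebank tag prefixes to WordNet POS constants
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None  # callers fall back to the raw token when no mapping exists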