def lemm_sent_process3(text, remove_stopwords=False, summary=False, mode="nltk", withdot=False):
    nlp = en_core_web_sm.load()
    stops = set(stopwords.words("english"))
    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    pos_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            # line = remove_word2(remove_word(line))
            # spacy
            if mode == "spacy":
                # line = remove_word2(remove_word(line))
                doc = nlp(line)
                # Trick spaCy into skipping the parser by setting doc.is_parsed = True,
                # i.e. make it believe a dependency parse was assigned and that sentence
                # boundaries were applied that way.
                # doc.is_parsed = True
                sentences = " ".join([
                    # Penn Treebank tag prefixes NN*/JJ*/VB*/RB* -> emit the lemma,
                    # everything else keeps its surface form.
                    word.lemma_
                    if word.tag_.startswith('N') or word.tag_.startswith('J')
                    or word.tag_.startswith('V') or word.tag_.startswith('R')
                    else word.orth_
                    for word in doc
                    if (not word.is_space and
                        not word.is_bracket and
                        not word.is_digit and
                        not word.is_left_punct and
                        not word.is_right_punct and
                        not word.is_quote and
                        not word.is_currency and
                        not word.is_punct and
                        word.ent_type_ not in ent_list and
                        word.lemma_ not in list(html_escape_table + stpwords_list2) and
                        word.orth_ not in list(html_escape_table + stpwords_list2) and
                        # not (word.is_stop and remove_stopwords) and
                        word.tag_ not in ["SYM", "HYPH"] and
                        word.lemma_ != "-PRON-"
                        # word.lemma_ not in total_stopwords
                        )
                ])
                if sentences == "":
                    continue
                if withdot:
                    sentences = sentences + "."
                if summary:
                    sentences = ' <s> ' + sentences + " </s> "
                sentences = sentences + "\n"
                text_lines.append(sentences)
    text_lines = "".join(text_lines)
    return text_lines

def lemm_sent_process4(text, remove_stopwords=False, summary=False, mode="nltk", withdot=False):
    stops = set(stopwords.words("english"))
    # print('xxx')
    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    pos_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            # line = remove_word2(remove_word(line))
            # spacy
            if mode == "spacy":
                # line = remove_word2(remove_word(line))
                doc = nlp(line)
                # Trick spaCy into skipping the parser by setting doc.is_parsed = True,
                # i.e. make it believe a dependency parse was assigned and that sentence
                # boundaries were applied that way.
                # doc.is_parsed = True
                sentences = " ".join([
                    word.lemma_
                    if word.tag_.startswith('N') or word.tag_.startswith('J')
                    or word.tag_.startswith('V') or word.tag_.startswith('R')
                    else word.orth_
                    for word in doc
                    if (not word.is_space and
                        not word.is_bracket and
                        # not word.is_digit and
                        not word.is_left_punct and
                        not word.is_right_punct and
                        not word.is_quote and
                        not word.is_currency
                        # not word.is_punct and
                        # word.tag_ not in ["SYM", "HYPH"] and
                        # word.lemma_ != "-PRON-"
                        )
                ])
                if sentences == "":
                    continue
                # if withdot == True: sentences = sentences + "."
                if summary:
                    sentences = ' <s> ' + sentences + " </s> "
                sentences = sentences + "\n"
                if not withdot:
                    sentences = sentences.replace(" . ", " ")
                text_lines.append(sentences)
    text_lines = "".join(text_lines)
    return text_lines

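# A minimal usage sketch, not part of the original pipeline: lemm_sent_process3/4 only
# emit text when mode="spacy" (with the default mode="nltk" every sentence is skipped
# and an empty string comes back). The sample review below is invented for illustration.
def _demo_spacy_lemmatizers():
    sample = "The battery life is great, but the screen scratches too easily."
    strict = lemm_sent_process3(sample, mode="spacy", withdot=True)  # drops digits, punctuation, entities
    loose = lemm_sent_process4(sample, mode="spacy")                 # keeps digits and punctuation
    return strict, loose
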
def get_Lemm_NLTK(text, remove_stopwords=False, summary=False):
    text = text.lower()
    for k, v in contractions.items():
        if k in text:
            text = text.replace(k, v)
    for k in html_escape_table:
        if k in text:
            text = text.replace(k, "")
    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            word_tokens = word_tokenize(line)
            word_tokens = pos_tag(word_tokens)  # POS-tag each token
            wnl = WordNetLemmatizer()
            # print(line)
            filtered_sentence = []
            for w in word_tokens:
                word, tag = w[0], w[1]
                # if word not in stop_words:
                wordnet_pos = get_wordnet_pos(tag)  # or wordnet.NOUN
                if wordnet_pos is not None:
                    lemmatize_word = wnl.lemmatize(word, pos=wordnet_pos)  # lemmatize
                    filtered_sentence.append(lemmatize_word)
                else:
                    filtered_sentence.append(word)
            line = " ".join(filtered_sentence)
            if summary:
                text_lines.append(" <s> " + line + " </s> ")
            else:
                text_lines.append(line)
    # Make reviewText into a single string
    text = ' '.join(text_lines)
    # text = " ".join(sentences)
    text = remove_word2(text)
    # Optionally, remove stop words
    text = text.split()
    stops = set(stopwords.words("english"))
    if remove_stopwords:
        # drop words that appear in any of the stopword lists
        text = [
            w for w in text
            if w not in stops and w not in stpwords_list1 and w not in stpwords_list2
        ]
    else:
        text = [
            w for w in text
            if w not in html_escape_table and w not in stpwords_list2
        ]
    text = " ".join(text)
    text = text + "."
    return text

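# A small usage sketch (assumption, not from the original code): get_Lemm_NLTK returns the
# whole review as one lemmatized, period-terminated string, optionally stopword-filtered.
def _demo_nltk_lemmatizer():
    sample = "the cats were running around the houses"
    kept = get_Lemm_NLTK(sample)
    # e.g. "the cat be run around the house." (exact output depends on remove_word2
    # and the project's stopword tables)
    filtered = get_Lemm_NLTK(sample, remove_stopwords=True)  # additionally drops stopwords
    return kept, filtered
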
def lemm_sent_process(text, remove_stopwords=False, summary=False, mode="nltk", withdot=False):
    text = text.lower()
    stops = set(stopwords.words("english"))
    for k, v in contractions.items():
        if k in text:
            text = text.replace(k, v)
    for k in html_escape_table:
        if k in text:
            text = text.replace(k, "")
    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            line = remove_word2(remove_word(line))
            # nltk
            if mode == "nltk":
                # ------------------------------------------------------------------
                word_tokens = word_tokenize(line)
                word_tokens = pos_tag(word_tokens)  # POS-tag each token
                wnl = WordNetLemmatizer()
                # print(line)
                filtered_sentence = []
                for w in word_tokens:
                    word, tag = w[0], w[1]
                    if word in list(html_escape_table + stpwords_list2):
                        continue
                    if remove_stopwords:
                        if word in list(list(stops) + list(stpwords_list1) + list(stpwords_list3)):
                            continue
                    # if word not in stop_words:
                    wordnet_pos = get_wordnet_pos(tag)  # or wordnet.NOUN
                    if wordnet_pos is not None:
                        lemmatize_word = wnl.lemmatize(word, pos=wordnet_pos)  # lemmatize
                        filtered_sentence.append(lemmatize_word)
                    else:
                        filtered_sentence.append(word)
                # filtered_sentence = [sent for sent in filtered_sentence if sent != ""]
                line = " ".join(filtered_sentence)
                # line = remove_word2(remove_word(line))
                if line == "":
                    continue
                if summary:
                    text_lines.append(" <s> " + line + " </s> ")
                else:
                    if withdot:
                        text_lines.append(line + ".\n")
                    else:
                        text_lines.append(line + "\n")
            # --------------------------------------------------------------------------------
            # spacy
            elif mode == "spacy":
                # line = remove_word2(remove_word(line))
                doc = nlp(line)
                # Trick spaCy into skipping the parser by setting doc.is_parsed = True,
                # i.e. make it believe a dependency parse was assigned and that sentence
                # boundaries were applied that way.
                # doc.is_parsed = True
                sentences = " ".join([
                    word.lemma_
                    if word.tag_.startswith('N') or word.tag_.startswith('J')
                    or word.tag_.startswith('V') or word.tag_.startswith('R')
                    else word.orth_
                    for word in doc
                    if ((not word.is_space and
                         not word.is_bracket and
                         not word.is_digit and
                         not word.is_left_punct and
                         not word.is_right_punct and
                         not word.is_quote and
                         not word.is_currency and
                         not word.is_punct and
                         word.lemma_ not in list(html_escape_table + stpwords_list2) and
                         not (word.is_stop and remove_stopwords) and
                         word.tag_ != "SYM" and word.tag_ != "HYPH" and
                         word.lemma_ != "-PRON-" and
                         word.lemma_ not in stpwords_list3)
                        or word.lemma_ in ["the", "not"])
                ])
                if sentences == "":
                    continue
                if withdot:
                    text_lines.append(sentences + ".\n")
                else:
                    text_lines.append(sentences + "\n")
            # --------------------------------------------------------------------------------
    text_lines = "".join(text_lines)
    return text_lines

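# Usage sketch (illustrative only, sample strings are invented): lemm_sent_process emits one
# cleaned sentence per line; in the nltk branch summary=True wraps each sentence in
# <s> ... </s> markers instead of adding a newline or trailing dot.
def _demo_lemm_sent_process():
    review = "I bought this phone last week. It does not charge properly."
    body = lemm_sent_process(review, mode="nltk", remove_stopwords=False, withdot=True)
    headline = lemm_sent_process("does not charge properly", summary=True)  # nltk mode, <s> ... </s>
    return body, headline
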
def get_pos_sequence(text, remove_stopwords=False, summary=False, mode="spacy", withdot=False):
    text = text.lower()
    stops = set(stopwords.words("english"))
    for k, v in contractions.items():
        if k in text:
            text = text.replace(k, v)
    for k in html_escape_table:
        if k in text:
            text = text.replace(k, "")
    lines = nltk.sent_tokenize(text)
    # Separate out reviewText and summary sentences
    text_lines = []
    for idx, line in enumerate(lines):
        if line == "":
            continue  # empty line
        else:
            line = remove_word2(remove_word(line))
            # nltk
            if mode == "nltk":
                # ------------------------------------------------------------------
                word_tokens = word_tokenize(line)
                word_tokens = pos_tag(word_tokens)  # POS-tag each token
                wnl = WordNetLemmatizer()
                # print(line)
                filtered_sentence = []
                for w in word_tokens:
                    word, tag = w[0], w[1]
                    if word in list(html_escape_table + stpwords_list2):
                        continue
                    if remove_stopwords:
                        if word in list(list(stops) + list(stpwords_list1)):
                            continue
                    # if word not in stop_words:
                    wordnet_pos = get_wordnet_pos(tag)  # or wordnet.NOUN
                    if wordnet_pos is not None:
                        lemmatize_word = wnl.lemmatize(word, pos=wordnet_pos)  # lemmatize
                        filtered_sentence.append(lemmatize_word)
                    else:
                        filtered_sentence.append(word)
                # filtered_sentence = [sent for sent in filtered_sentence if sent != ""]
                line = " ".join(filtered_sentence)
                line = remove_word2(remove_word(line))
                if line == "":
                    continue
                if summary:
                    text_lines.append(" <s> " + line + " </s> ")
                else:
                    if withdot:
                        text_lines.append(line + ".\n")
                    else:
                        text_lines.append(line + "\n")
            # --------------------------------------------------------------------------------
            # spacy
            elif mode == "spacy":
                doc = nlp(line)
                # Trick spaCy into skipping the parser by setting doc.is_parsed = True,
                # i.e. make it believe a dependency parse was assigned and that sentence
                # boundaries were applied that way.
                # doc.is_parsed = True
                sent = [
                    (word.tag_, word.orth_)
                    for word in doc
                    # if (not word.is_space and
                    #     not word.is_bracket and
                    #     not word.is_digit and
                    #     not word.is_left_punct and
                    #     not word.is_right_punct and
                    #     not word.is_quote and
                    #     not word.is_currency and
                    #     not word.is_punct and
                    #     not (word.is_stop and remove_stopwords) and
                    #     word.tag_ != "SYM" and word.tag_ != "HYPH" and
                    #     word.ent_type_ == "" and
                    #     word.ent_type_ not in ["PERSON", "NORP", "FAC", "ORG", "GPE",
                    #                            "LOC", "PRODUCT", "EVENT", "WORK_OF_ART",
                    #                            "LAW", "LANGUAGE", "DATE", "TIME",
                    #                            "PERCENT", "MONEY", "QUANTITY",
                    #                            "ORDINAL", "CARDINAL"] and
                    #     word.lemma_ != "-PRON-" and
                    #     word.lemma_ not in stpwords_list2)
                ]
                if not sent:
                    continue
                text_lines.append(sent)
    if len(text_lines) != 0:
        pos_lines = []
        for line in text_lines:
            pos_sent = ' '.join([word + '/' + pos for pos, word in line])
            pos_lines.append(pos_sent)
        return text_lines, pos_lines
    else:
        return text_lines, None

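# Usage sketch (illustrative only): in spacy mode get_pos_sequence returns, per sentence,
# a list of (tag, token) pairs plus the same sentences rendered as "token/TAG" strings.
def _demo_pos_sequence():
    pairs, tagged = get_pos_sequence("the screen is bright", mode="spacy")
    # pairs  -> e.g. [[('DT', 'the'), ('NN', 'screen'), ('VBZ', 'is'), ('JJ', 'bright')]]
    # tagged -> e.g. ['the/DT screen/NN is/VBZ bright/JJ']
    # (exact tokens depend on remove_word / remove_word2 and the loaded spaCy model)
    return pairs, tagged
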
import re

import spacy
# gpu = spacy.prefer_gpu()
# print('GPU:', gpu)
from spacy.matcher import Matcher
import en_core_web_sm

# nlp = en_core_web_sm.load(disable=['ner', 'tagger', 'parser'])
nlp = en_core_web_sm.load()

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# from stopwords import *

stops = set(stopwords.words("english"))

stpwords_list3 = [
    f.replace("\n", "")
    for f in open("data_util/stopwords.txt", "r", encoding="utf-8").readlines()
]
stpwords_list3.remove("not")

total_stopwords = list(html_escape_table + stpwords_list2) + list(
    list(stops) + list(stpwords_list1) + list(stpwords_list3))
total_stopwords = set(total_stopwords)
# keep the determiner "the" and the negation "not" out of the stopword set
total_stopwords.remove("the")
total_stopwords.remove("not")

ent_list = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
    'QUANTITY', 'ORDINAL', 'CARDINAL'
]
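

# Illustration (not part of the original module): total_stopwords merges every stopword
# source but deliberately removes "the" and "not" from the set, so determiners and
# negation survive any filtering done against it.
def _demo_total_stopwords(tokens):
    """Return only the tokens that are not in total_stopwords (e.g. negations like "not")."""
    return [t for t in tokens if t not in total_stopwords]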