def __init__(self, lang="en", stop_words_path=None):
    """Build the stop-word regex from a custom file or the bundled lists.

    Args:
        lang: ISO language code (e.g. "en", "pt-BR"). When the full code
            has no bundled stop-word list, the regional suffix is stripped
            and the base language is retried.
        stop_words_path: optional path to a custom stop-word file; when
            given it takes precedence over the bundled lists.

    Raises:
        ValueError: when neither *lang* nor its base language has a
            bundled stop-word list and no stop_words_path was supplied.
    """
    if stop_words_path:
        self.__stop_words_pattern = self.build_stop_word_regex_from_file(
            stop_words_path)
        return
    # Validate the language BEFORE fetching the list: stopwordsiso returns
    # an empty set for unknown codes, so the original unconditional fetch
    # was wasted work whose result was discarded anyway.
    if stopwordsiso.has_lang(lang):
        stoplist = stopwordsiso.stopwords(lang)
    else:
        # Retry with the base language of a regional code such as "pt-BR".
        base_lang = lang.split("-")[0].lower()
        if not stopwordsiso.has_lang(base_lang):
            raise ValueError(
                "No bundled stopword list available for {lang}, "
                "initialize Rake with stop_words_path "
                "argument".format(lang=lang))
        stoplist = stopwordsiso.stopwords(base_lang)
    self.__stop_words_pattern = self.build_stop_word_regex(stoplist)
def langmodelload(language, LibLocLang=CurLibLocLang):
    """Load the UDPipe model, question words and filtered stop words.

    Side effects: rebinds the module globals ``model``, ``stop_words`` and
    ``question_words``.  Bundled stop words are run through the tagger and
    any word tagged NOUN/VERB/PRON is dropped from the stop list, so that
    content words are never filtered out of user text.

    Args:
        language: two-letter language code ("en", "ar", "zh", "id", "ko",
            "pt" or "vn").  Any other code leaves the previous model and
            question words untouched (original fall-through behavior).
        LibLocLang: directory containing the ``*.udpipe`` model files.
    """
    global model
    global stop_words
    global question_words

    # Model file and question-word list per supported language code
    # (replaces the original if/elif chain; data is unchanged).
    lang_config = {
        "en": ('english-ewt-ud-2.5-191206.udpipe',
               ['where', 'which', "who", "why", "what", "when", "please",
                "how", "is", "are", "will", "could", "should", "was",
                "were", "do", "did", "can"]),
        "ar": ('arabic-padt-ud-2.5-191206.udpipe',
               ['أين', "أي", "من", "لماذا", "ماذا", "متى", "من فضلك",
                "كيف", "هي", "هي", "سوف", "يمكن", "يجب", "كانت ", " كان ",
                " فعل ", " فعل ", " يمكنه "]),
        "zh": ('chinese-gsdsimp-ud-2.5-191206.udpipe',
               ["哪里", "哪个", "谁", "为什么", "什么", "何时", "请",
                "如何", "是", "将", "可以", "应该", "被", "做"]),
        "id": ('indonesian-gsd-ud-2.5-191206.udpipe',
               ['dimana', 'yang', "siapa", "mengapa", "apa", "ketika",
                "tolong", "bagaimana", "adalah", "adalah", "akan", "bisa",
                "harus", "adalah", "adalah", "adalah", "lakukan ",
                " melakukan ", " bisa "]),
        "ko": ('korean-gsd-ud-2.5-191206.udpipe',
               ['어느', "누가 왜", "무엇", "언제", "제발", "어떻게", "는",
                "은", "의지", "할 수있다", "해야한다", "있었다", "있었다",
                "할", "했다 ", "할 수있다"]),
        "pt": ('portuguese-gsd-ud-2.5-191206.udpipe',
               ['onde', 'qual', "quem", "por que", "o que", "quando",
                "por favor", "como", "é", "vontade", "poderia", "deveria",
                "era", "faz", "fez", "pode"]),
        "vn": ('vietnamese-vtb-ud-2.5-191206.udpipe',
               ['đâu', 'cái nào', "Ai", "tại sao", "gì", "khi", "làm ơn",
                "làm thế nào", "là", "là", "sẽ", "có thể", "nên", "đã",
                "đã", "làm", "đã", "có thể "]),
    }
    config = lang_config.get(language)
    if config is not None:
        model_file, question_words = config
        model = Model(LibLocLang + model_file)

    if stopwords.has_lang(language):
        stop_words_list = []
        for word in list(stopwords.stopwords(language)):
            try:
                sentences = model.tokenize(word)
                for s in sentences:
                    model.tag(s)    # in-place tagging
                    model.parse(s)  # in-place parsing
                # Parse the CoNLL-U output; skiprows=4 jumps past the
                # comment header, columns 2:4 hold (form, upos).
                datause = pd.read_csv(
                    StringIO(model.write(sentences, "conllu")),
                    sep="\t", header=None, skiprows=4)
                pos_tags = datause[datause.columns[2:4]].values.tolist()
                # Keep only non-content words in the stop list.
                if pos_tags[0][1] not in ("NOUN", "VERB", "PRON"):
                    stop_words_list.append(pos_tags[0][0])
            except Exception as exc:
                # Was a bare `except: print()` that swallowed every error
                # and printed a blank line; at least surface the cause.
                print(exc)
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []
def get_lang_stopwords(self, lang=None):
    """Return the stopword list for *lang*, or ``None`` when unavailable.

    Args:
        lang: language name or code understood by pycountry (e.g. "en",
            "eng", "English").

    Returns:
        list[str] of stopwords, or ``None`` when the language cannot be
        resolved, has no two-letter code, or has no usable stopword list.
    """
    if lang is None:
        return None
    try:
        # Standardize to the two-letter code.  lookup() raises
        # LookupError for unknown languages, and some languages have no
        # alpha_2 attribute (AttributeError) — either way there is no
        # usable stopword key, so report "unavailable" instead of crashing.
        lang_stand = pycountry.languages.lookup(lang).alpha_2
    except (LookupError, AttributeError):
        return None
    if not stopwords.has_lang(lang_stand):
        return None
    stop = stopwords.stopwords(lang_stand)
    # A single-entry list is treated as unusable (original behavior kept).
    return list(stop) if len(stop) > 1 else None
# --- Third-party imports -------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
#from nltk.corpus import stopwords
#stop_words = stopwords.words('danish')
import stopwordsiso as stopwords
# NOTE(review): the three calls below discard their results — they only
# demonstrate the stopwordsiso API and run as import-time side effects.
stopwords.langs()  # return a set of all the supported languages
stopwords.has_lang("da")  # check if there is a stopwords for the language
stopwords.stopwords("da")  # danish stopwords
import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import sys
#from nltk.corpus import stopwords;
import nltk
from gensim.models import ldamodel
import gensim.corpora
# NOTE(review): CountVectorizer is imported a second time here —
# harmless but redundant with the first sklearn import above.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle
def langmodelload(language):
    """Load UDPipe model, transliterated question words, POS-filtered stop
    words and polyglot embeddings for *language*, caching everything in
    ``lang_dict[language]``.

    Side effects: rebinds the module globals ``model``, ``stop_words``,
    ``question_words``, ``embeddings`` and updates ``lang_dict``.

    Args:
        language: two-letter language code.  Codes without a mapped model
            file leave the previous ``model`` global untouched (original
            fall-through behavior).
    """
    global stop_words
    global question_words
    global embeddings
    global model
    global lang_dict

    LibLocLang = "./udpipe-ud/"

    # UDPipe model file per supported language code
    # (replaces the original if/elif chain; data is unchanged).
    model_files = {
        "en": 'english-ewt-ud-2.5-191206.udpipe',
        "ar": 'arabic-padt-ud-2.5-191206.udpipe',
        "zh": 'chinese-gsdsimp-ud-2.5-191206.udpipe',
        "id": 'indonesian-gsd-ud-2.5-191206.udpipe',
        "ko": 'korean-gsd-ud-2.5-191206.udpipe',
        "pt": 'portuguese-gsd-ud-2.5-191206.udpipe',
        "vi": 'vietnamese-vtb-ud-2.5-191206.udpipe',
        "hi": 'hindi-hdtb-ud-2.5-191206.udpipe',
        "jp": 'japanese-gsd-ud-2.5-191206.udpipe',
        "es": 'spanish-gsd-ud-2.5-191206.udpipe',
    }
    model_file = model_files.get(language)
    if model_file is not None:
        model = Model(LibLocLang + model_file)

    base_question_words = [
        'where', 'which', "who", "why", "what", "when", "please", "how",
        "is", "are", "will", "could", "should", "was", "were", "do",
        "did", "can"
    ]
    # Transliterate the English question words into the target language.
    question_words = [
        Text(word).transliterate(language) for word in base_question_words
    ]

    # Languages excluded from the POS-filtering pass below (set membership
    # replaces the original chain of `language != "…"` comparisons).
    no_pos_filter = {"hi", "ar", "zh", "vi", "ko", "jp", "id", "ms"}
    if stopwords.has_lang(language) and language not in no_pos_filter:
        stop_words_list = []
        for word in list(stopwords.stopwords(language)):
            try:
                text = Text(word, hint_language_code=language)
                # Keep only non-content words in the stop list.
                if text.pos_tags[0][1] not in ("NOUN", "VERB", "PRON"):
                    stop_words_list.append(text.pos_tags[0][0])
            except Exception as e:
                print(e)
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []

    embeddings = Embedding.load("./polyglot_data/embeddings2/" +
                                language + "/embeddings_pkl.tar.bz2")
    lang_dict[language] = {
        'model': model,
        'embeddings': embeddings,
        'stop_words': stop_words
    }