Example no. 1
0
    def __init__(self, lang="en", stop_words_path=None):
        """Build the stop-word regex pattern for this instance.

        Args:
            lang: ISO language code (e.g. "en" or "en-US"). When a regional
                code is not bundled, falls back to its base code ("en").
            stop_words_path: optional path to a custom stopword file; when
                given it takes precedence over the bundled lists.

        Raises:
            ValueError: if neither ``lang`` nor its base code has a bundled
                stopword list and no ``stop_words_path`` was supplied.
        """
        if stop_words_path:
            self.__stop_words_pattern = self.build_stop_word_regex_from_file(
                stop_words_path)
            return

        # Check availability BEFORE fetching (the original fetched the list
        # for the raw code first, then possibly discarded it).
        if not stopwordsiso.has_lang(lang):
            # Fall back from a regional code such as "en-US" to "en".
            base_lang = lang.split("-")[0].lower()
            if not stopwordsiso.has_lang(base_lang):
                raise ValueError(
                    "No bundled stopword list available for {lang}, "
                    "initialize Rake with stop_words_path "
                    "argument".format(lang=lang))
            lang = base_lang
        self.__stop_words_pattern = self.build_stop_word_regex(
            stopwordsiso.stopwords(lang))
def langmodelload(language, LibLocLang=CurLibLocLang):
    """Load the UDPipe model, question words and POS-filtered stopwords
    for *language*, storing the results in module globals.

    Args:
        language: two-letter language code ("en", "ar", "zh", "id", "ko",
            "pt", "vn").
        LibLocLang: directory containing the ``*.udpipe`` model files.

    Side effects:
        Sets the globals ``model``, ``question_words`` and ``stop_words``.
        For an unrecognised *language*, ``model`` and ``question_words``
        are left untouched and ``stop_words`` is set to an empty list.
    """
    global model
    global stop_words
    global question_words
    ########################
    # Per-language UDPipe model file and question-word list.
    # NOTE(review): Vietnamese is keyed as "vn" here although ISO 639-1
    # uses "vi" — kept as-is for compatibility with existing callers.
    language_resources = {
        "en": ('english-ewt-ud-2.5-191206.udpipe',
               ['where', 'which', "who", "why", "what", "when", "please", "how", "is", "are", "will", "could",
                "should", "was", "were", "do", "did", "can"]),
        "ar": ('arabic-padt-ud-2.5-191206.udpipe',
               ['أين', "أي", "من", "لماذا", "ماذا", "متى", "من فضلك", "كيف", "هي", "هي", "سوف", "يمكن", "يجب",
                "كانت ", " كان ", " فعل ", " فعل ", " يمكنه "]),
        "zh": ('chinese-gsdsimp-ud-2.5-191206.udpipe',
               ["哪里", "哪个", "谁", "为什么", "什么", "何时", "请", "如何", "是", "将", "可以", "应该", "被", "做"]),
        "id": ('indonesian-gsd-ud-2.5-191206.udpipe',
               ['dimana', 'yang', "siapa", "mengapa", "apa", "ketika", "tolong", "bagaimana", "adalah",
                "adalah", "akan", "bisa", "harus", "adalah", "adalah", "adalah", "lakukan ", " melakukan ",
                " bisa "]),
        "ko": ('korean-gsd-ud-2.5-191206.udpipe',
               ['어느', "누가 왜", "무엇", "언제", "제발", "어떻게", "는", "은", "의지", "할 수있다", "해야한다", "있었다", "있었다", "할",
                "했다 ", "할 수있다"]),
        "pt": ('portuguese-gsd-ud-2.5-191206.udpipe',
               ['onde', 'qual', "quem", "por que", "o que", "quando", "por favor", "como", "é", "vontade",
                "poderia", "deveria", "era", "faz", "fez", "pode"]),
        "vn": ('vietnamese-vtb-ud-2.5-191206.udpipe',
               ['đâu', 'cái nào', "Ai", "tại sao", "gì", "khi", "làm ơn", "làm thế nào", "là", "là", "sẽ",
                "có thể", "nên", "đã", "đã", "làm", "đã", "có thể "]),
    }
    if language in language_resources:
        model_file, question_words = language_resources[language]
        model = Model(LibLocLang + model_file)
    ########################
    if stopwords.has_lang(language):
        ########################
        stop_words_list = []
        ########################
        # Keep only stopwords whose first token is not tagged NOUN, VERB
        # or PRON by the UDPipe model.
        for word in list(stopwords.stopwords(language)):
            try:
                sentences = model.tokenize(word)
                ########
                for s in sentences:
                    model.tag(s)  # inplace tagging
                    model.parse(s)  # inplace parsing
                ########
                datause = pd.read_csv(StringIO(model.write(sentences, "conllu")), sep="\t", header=None, skiprows=4)
                PosTagIntention = datause[datause.columns[2:4]].values.tolist()
                if PosTagIntention[0][1] not in ("NOUN", "VERB", "PRON"):
                    stop_words_list.append(PosTagIntention[0][0])
            except Exception as e:
                # Was a bare ``except: print()`` that discarded the error;
                # log it instead (matches the other loader in this file).
                print(e)
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []
 def get_lang_stopwords(self, lang=None):
     """Return the stopword list for ``lang``, or ``None`` when unavailable.

     Args:
         lang: language name or code understood by ``pycountry`` (e.g.
             "English", "en", "eng").

     Returns:
         list of stopwords, or ``None`` when the language cannot be
         resolved or no usable bundled stopword list exists.
     """
     # Standardize to a two-letter ISO 639-1 code. pycountry raises
     # LookupError for unknown codes (and the default lang=None also
     # blows up); treat both as "no stopwords" instead of crashing.
     try:
         lang_stand = pycountry.languages.lookup(lang).alpha_2
     except (LookupError, AttributeError):
         return None
     # Fetch stopwords when the language is bundled.
     if not stopwords.has_lang(lang_stand):
         return None
     stop = stopwords.stopwords(lang_stand)
     # NOTE(review): original kept only lists with MORE than one entry
     # (len > 1, not > 0) — threshold preserved as-is.
     return list(stop) if len(stop) > 1 else None
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

#from nltk.corpus import stopwords
#stop_words = stopwords.words('danish')

import stopwordsiso as stopwords
# Demo calls: the return values are discarded — these lines only
# illustrate the stopwordsiso API and have no effect on module state.
stopwords.langs()  # return a set of all the supported languages
stopwords.has_lang("da")  # check if there is a stopwords for the language
stopwords.stopwords("da")  # danish stopwords

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import sys
#from nltk.corpus import stopwords;
import nltk
from gensim.models import ldamodel
import gensim.corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle
def langmodelload(language):
    """Load per-language resources — UDPipe model, transliterated question
    words, POS-filtered stopwords and polyglot embeddings — and cache them
    in the module-level ``lang_dict``.

    Args:
        language: two-letter language code (see ``model_files`` below).

    Side effects:
        Sets the globals ``model``, ``question_words``, ``stop_words`` and
        ``embeddings``, and stores them under ``lang_dict[language]``.
    """
    ########################
    global stop_words
    global question_words
    global embeddings
    global model
    global lang_dict
    ########################
    LibLocLang = "./udpipe-ud/"
    ########################
    # UDPipe model file per supported language code.
    model_files = {
        "en": 'english-ewt-ud-2.5-191206.udpipe',
        "ar": 'arabic-padt-ud-2.5-191206.udpipe',
        "zh": 'chinese-gsdsimp-ud-2.5-191206.udpipe',
        "id": 'indonesian-gsd-ud-2.5-191206.udpipe',
        "ko": 'korean-gsd-ud-2.5-191206.udpipe',
        "pt": 'portuguese-gsd-ud-2.5-191206.udpipe',
        "vi": 'vietnamese-vtb-ud-2.5-191206.udpipe',
        "hi": 'hindi-hdtb-ud-2.5-191206.udpipe',
        "jp": 'japanese-gsd-ud-2.5-191206.udpipe',
        "es": 'spanish-gsd-ud-2.5-191206.udpipe',
    }
    if language in model_files:
        model = Model(LibLocLang + model_files[language])
    # NOTE(review): an unknown code leaves ``model`` at whatever global
    # value it had before (NameError below on a first call) — behavior
    # preserved from the original.
    ########################
    base_question_words = [
        'where', 'which', "who", "why", "what", "when", "please", "how", "is",
        "are", "will", "could", "should", "was", "were", "do", "did", "can"
    ]
    # Transliterate the English question words into the target language.
    question_words = [
        Text(word).transliterate(language) for word in base_question_words
    ]
    ########################
    # Languages excluded from POS-filtering of stopwords — presumably
    # because tagging is unreliable for them; TODO confirm.
    skip_pos_filter = {"hi", "ar", "zh", "vi", "ko", "jp", "id", "ms"}
    if stopwords.has_lang(language) and language not in skip_pos_filter:
        ########################
        stop_words_list = []
        ########################
        # Keep only stopwords whose first token is not tagged NOUN, VERB
        # or PRON.
        for word in list(stopwords.stopwords(language)):
            try:
                text = Text(word, hint_language_code=language)
                ########################
                if text.pos_tags[0][1] not in ("NOUN", "VERB", "PRON"):
                    stop_words_list.append(text.pos_tags[0][0])
            except Exception as e:
                print(e)
        stop_words = stop_words_list
    else:
        # NOTE(review): this message also fires for the deliberately
        # skipped languages above, not only unsupported ones.
        print(language + " has errors.")
        stop_words = []
    ########################
    ########################

    embeddings = Embedding.load("./polyglot_data/embeddings2/" + language +
                                "/embeddings_pkl.tar.bz2")
    lang_dict[language] = {
        'model': model,
        'embeddings': embeddings,
        'stop_words': stop_words
    }