def __init__(self, lang="en", stop_words_path=None):
    if stop_words_path:
        self.__stop_words_pattern = self.build_stop_word_regex_from_file(
            stop_words_path)
    else:
        stoplist = stopwordsiso.stopwords(lang)
        if not stopwordsiso.has_lang(lang):
            # Fall back to the base language code, e.g. "pt-br" -> "pt"
            lang2 = lang.split("-")[0].lower()
            if not stopwordsiso.has_lang(lang2):
                raise ValueError(
                    "No bundled stopword list available for {lang}; "
                    "initialize Rake with the stop_words_path "
                    "argument".format(lang=lang))
            stoplist = stopwordsiso.stopwords(lang2)
        self.__stop_words_pattern = self.build_stop_word_regex(stoplist)
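
# A minimal standalone sketch (not part of the class above) of the fallback
# the constructor relies on: stopwords-iso only registers bare ISO 639-1 codes,
# so a regional tag such as "pt-BR" has to be reduced to "pt" first.
import stopwordsiso

requested = "pt-BR"
if not stopwordsiso.has_lang(requested):
    base = requested.split("-")[0].lower()  # "pt-BR" -> "pt"
    if stopwordsiso.has_lang(base):
        stoplist = stopwordsiso.stopwords(base)
        print(len(stoplist), "Portuguese stop words loaded")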
def clear(text):
    text = text.lower()
    text = re.sub(r"_+", "", text)       # strip underscores
    text = re.sub(r"\b\d+\b", "", text)  # strip standalone numbers
    # Cache the Portuguese stop word set once instead of rebuilding it per token
    pt_stopwords = stopwords.stopwords("pt")
    text = " ".join(w for w in text.split() if w not in pt_stopwords)
    return text
def remove_stopwords(doc, langs='en', extended_stopwords=None, tokentype='lemma'):
    '''Remove stopwords'''
    if isinstance(langs, str):
        langs = [langs]
    stopword_list = set()
    for l in langs:
        stopword_list.update(stopwords(l))
    if extended_stopwords:
        stopword_list.update(extended_stopwords)
    # STOPWORDS
    stop_ids = [
        idx for idx, value in enumerate(doc[tokentype])
        if value in stopword_list
    ]
    doc_sw_rm = remove_ids_all_keys(doc, stop_ids)
    return doc_sw_rm
def remove_stopwords(text, lang='et'):
    # 'ee' is sometimes used for Estonian; the ISO 639-1 code is 'et'
    if lang == 'ee':
        lang = 'et'
    sw = stopwords(lang)
    for key in sw:
        # str.replace returns a new string, so the result must be reassigned
        text = text.replace(key, "")
    return text
def getStopWords(spacy_model):
    """Stop words tokenized with the default raw analyzer."""
    # for languages available go to: https://github.com/stopwords-iso
    s_words = stopwords.stopwords('en')
    analyzer = partial(rawAnalyzer, spacy_model, [])
    return seq(s_words).flat_map(analyzer).to_list()
def langmodelload(language, LibLocLang=CurLibLocLang):
    global model
    global stop_words
    global question_words

    # Load the UDPipe model and question-word list for the requested language
    if language == "en":
        model = Model(LibLocLang + 'english-ewt-ud-2.5-191206.udpipe')
        question_words = ['where', 'which', "who", "why", "what", "when", "please",
                          "how", "is", "are", "will", "could", "should", "was",
                          "were", "do", "did", "can"]
    elif language == "ar":
        model = Model(LibLocLang + 'arabic-padt-ud-2.5-191206.udpipe')
        question_words = ['أين', "أي", "من", "لماذا", "ماذا", "متى", "من فضلك",
                          "كيف", "هي", "هي", "سوف", "يمكن", "يجب", "كانت ", " كان ",
                          " فعل ", " فعل ", " يمكنه "]
    elif language == "zh":
        model = Model(LibLocLang + 'chinese-gsdsimp-ud-2.5-191206.udpipe')
        question_words = ["哪里", "哪个", "谁", "为什么", "什么", "何时", "请", "如何",
                          "是", "将", "可以", "应该", "被", "做"]
    elif language == "id":
        model = Model(LibLocLang + 'indonesian-gsd-ud-2.5-191206.udpipe')
        question_words = ['dimana', 'yang', "siapa", "mengapa", "apa", "ketika",
                          "tolong", "bagaimana", "adalah", "adalah", "akan", "bisa",
                          "harus", "adalah", "adalah", "adalah", "lakukan ",
                          " melakukan ", " bisa "]
    elif language == "ko":
        model = Model(LibLocLang + 'korean-gsd-ud-2.5-191206.udpipe')
        question_words = ['어느', "누가 왜", "무엇", "언제", "제발", "어떻게", "는", "은",
                          "의지", "할 수있다", "해야한다", "있었다", "있었다", "할",
                          "했다 ", "할 수있다"]
    elif language == "pt":
        model = Model(LibLocLang + 'portuguese-gsd-ud-2.5-191206.udpipe')
        question_words = ['onde', 'qual', "quem", "por que", "o que", "quando",
                          "por favor", "como", "é", "vontade", "poderia", "deveria",
                          "era", "faz", "fez", "pode"]
    elif language == "vn":
        model = Model(LibLocLang + 'vietnamese-vtb-ud-2.5-191206.udpipe')
        question_words = ['đâu', 'cái nào', "Ai", "tại sao", "gì", "khi", "làm ơn",
                          "làm thế nào", "là", "là", "sẽ", "có thể", "nên", "đã",
                          "đã", "làm", "đã", "có thể "]

    if stopwords.has_lang(language):
        stop_words = list(stopwords.stopwords(language))
        stop_words_list = []
        # Keep only stop words that UDPipe does not tag as NOUN, VERB or PRON
        for i in range(0, len(stop_words)):
            try:
                sentences = model.tokenize(stop_words[i])
                for s in sentences:
                    model.tag(s)    # in-place tagging
                    model.parse(s)  # in-place parsing
                datause = pd.read_csv(StringIO(model.write(sentences, "conllu")),
                                      sep="\t", header=None, skiprows=4)
                PosTagIntention = datause[datause.columns[2:4]].values.tolist()
                if (PosTagIntention[0][1] != "NOUN") and \
                        (PosTagIntention[0][1] != "VERB") and \
                        (PosTagIntention[0][1] != "PRON"):
                    stop_words_list.append(PosTagIntention[0][0])
            except Exception:
                # Skip stop words that UDPipe cannot tokenize or tag
                pass
        stop_words = stop_words_list
    else:
        print(language + " has no bundled stopword list.")
        stop_words = []
def get_extractors(extractor_type='count'):
    if extractor_type == 'count':
        transformer = CountVectorizer(preprocessor=clean_text,
                                      stop_words=stopwords('ny'),
                                      lowercase=True)
    elif extractor_type == 'tfidf':
        transformer = TfidfVectorizer(preprocessor=clean_text,
                                      stop_words=stopwords("ny"),
                                      ngram_range=(1, 2))
    elif extractor_type == 'tfidf-transformer':
        transformer = TfidfTransformer(use_idf=False)
    else:
        transformer = CountVectorizer(preprocessor=clean_text,
                                      stop_words=stopwords('ny'))
    return transformer
def build_kw(path, lang='et'):
    outs = {}
    sw = stopwords(lang)
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip("\n")
            parsed = prepare(line, lang)
            if parsed not in sw:
                outs[parsed] = outs.get(parsed, []) + [line]
    return outs
def count_vectorizer(self):
    vectorizer = CountVectorizer(
        preprocessor=clean_text,
        stop_words=stopwords("ny"),
        ngram_range=(1, 2),
        min_df=0.05,
    )
    train_features = vectorizer.fit_transform(self.train)
    test_features = vectorizer.transform(self.test)
    return train_features, test_features
def tfidf_transformer(self):
    vectorizer = CountVectorizer(
        preprocessor=clean_text,
        stop_words=stopwords("ny"),
        ngram_range=(1, 4),
        min_df=0.05,
    )
    train_features = vectorizer.fit_transform(self.train)
    test_features = vectorizer.transform(self.test)
    transformer = get_extractors('tfidf-transformer')
    train_features = transformer.fit_transform(train_features)
    test_features = transformer.transform(test_features)
    return train_features, test_features
def get_lang_stopwords(self, lang=None):
    ## standardize the lang to its two-letter ISO 639-1 code
    lang_stand = pycountry.languages.lookup(lang).alpha_2
    ## fetch stopwords
    if stopwords.has_lang(lang_stand):
        stop = stopwords.stopwords(lang_stand)
        if len(stop) > 1:
            ret = list(stop)
        else:
            ret = None
    else:
        ret = None
    return ret
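
# A hedged standalone sketch of the lookup chain used above: pycountry accepts
# either a language name or a code and exposes the ISO 639-1 code as .alpha_2,
# which is the key that stopwords-iso expects.
import pycountry
import stopwordsiso as stopwords

code = pycountry.languages.lookup("Portuguese").alpha_2  # -> "pt"
if stopwords.has_lang(code):
    print(sorted(stopwords.stopwords(code))[:5])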
def urlParze(url):
    DetectorFactory.seed = 0
    print('attempting to query ' + url)
    try:
        response = requests.get(url, timeout=2)
        response.raise_for_status()
    except Exception as err:
        print(f'Error for: {url} occurred')
        return (), 'zz'
    else:
        html = response.text
        text = text_from_html(html)
        if len(text) < 100:
            return (), 'zz'
        lang = detect(text)
        # Tokenize the text; the pipeline differs by detected language
        text = re.sub(r'[^\w\s]', '', text)
        if lang == 'en' or lang == 'id':
            # English or Indonesian
            text = nltk.word_tokenize(text)
            lowered = [x.lower() for x in text]
            if lang == 'en':
                lemmatizer = WordNetLemmatizer()
                output = [lemmatizer.lemmatize(x) for x in lowered]
            if lang == 'id':
                indLem = indLemm()
                output = [indLem.lemmatize(x) for x in lowered]
        elif lang == 'th':
            output = thaiword(text, keep_whitespace=False)
        elif lang == 'vi':
            output = list(chain.from_iterable(annotator.tokenize(text)))
        elif lang == 'ko':
            output = kParse.morphs(text)
        else:
            print("skipping because of unknown language")
            return (), 'zz'
        # Filter out stop words and return the filtered tokens
        stopL = set(stopwords(lang))
        out = [w for w in output if w not in stopL]
        return tuple(out), lang
def build_kw_json(path, lang='et'):
    outs = {}
    sw = stopwords(lang)
    df = pd.read_csv(path, names=["kw"], dtype={})
    kw_df = df["kw"].astype(str).tolist()
    del df
    kws_ = set()
    for kw in tqdm(kw_df):
        #if lang == 'hr':
        #    kws_.add(kw)
        #else:
        for k in kw.split(';'):
            kws_.add(k.lower())
    return set(kws_)
    # NOTE: the early return above makes the stop-word filtering below unreachable
    for line in list(kws_):
        parsed = prepare(line, lang)
        if parsed not in sw:
            outs[parsed] = outs.get(parsed, []) + [line]
    return outs
data_stats.plot(x='category', y='no of labels', kind='bar', legend=False,
                grid=True, figsize=(8, 8))
plt.title('Number of comments per category')
plt.ylabel('No of occurrences')
plt.xlabel('Category')
#plt.show()
print()

#vocabulary = build_vocabulary()
vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1, 2),
                             stop_words=stopwords('ny'))
train_features = vectorizer.fit_transform(train_texts).toarray()
#test_features = vectorizer.transform(test_texts).toarray()
reduced_vocabulary = []
print('Transformed features shape: ', train_features.shape)
label_ids = train_data['Label_Id']
K = 600
for label_id, label in sorted(encoded_labels.items()):
    train_features_chi2 = chi2(train_features, label_ids == label_id)
    indices = np.argsort(train_features_chi2[0])
    feature_names = np.array(vectorizer.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
def clean(
    texts,
    language="en",
    min_token_freq=2,
    min_token_len=3,
    min_tokens=0,
    max_token_index=-1,
    min_ngram_count=3,
    ignore_words=None,
    remove_names=False,
    sample_size=1,
    verbose=True,
):
    """
    Cleans text body to prepare it for analysis

    Parameters
    ----------
    texts : str or list
        The texts to be cleaned and tokenized
    language : str (default=en)
        The language of the texts
    min_token_freq : int (default=2)
        The minimum allowable frequency of a word inside the corpus
    min_token_len : int (default=3)
        The smallest allowable length of a word
    min_tokens : int (default=0)
        The minimum allowable length of a tokenized text
    max_token_index : int (default=-1)
        The maximum allowable length of a tokenized text
    min_ngram_count : int (default=3)
        The minimum occurrences for an n-gram to be included
    ignore_words : str or list
        Strings that should be removed from the text body
    remove_names : bool (default=False)
        Whether to remove common names
    sample_size : float (default=1)
        The amount of data to be randomly sampled
    verbose : bool (default=True)
        Whether to show a tqdm progress bar for the cleaning steps

    Returns
    -------
    text_corpus, selected_idxs : list, list
        The texts formatted for text analysis as well as the indexes for selected entries
    """
    language = language.lower()

    # Select abbreviation for the lemmatizer, if it's available
    if language in languages.lem_abbr_dict().keys():
        language = languages.lem_abbr_dict()[language]

    if isinstance(texts, str):
        texts = [texts]

    if isinstance(ignore_words, str):
        ignore_words = [ignore_words]
    elif ignore_words is None:
        ignore_words = []

    if stopwords(language) != set():  # the input language has stopwords
        stop_words = stopwords(language)
    # Stemming and normal stopwords are still full language names
    elif language in languages.stem_abbr_dict().keys():
        stop_words = stopwords(languages.stem_abbr_dict()[language])
    elif language in languages.sw_abbr_dict().keys():
        stop_words = stopwords(languages.sw_abbr_dict()[language])
    else:
        stop_words = []

    pbar = tqdm(desc="Cleaning steps complete", total=7, unit="step",
                disable=not verbose)

    # Remove spaces that are greater than one in length
    texts_no_large_spaces = []
    for t in texts:
        # Loop backwards to assure that smaller spaces aren't made large
        for i in range(25, 0, -1):
            large_space = str(i * " ")
            if large_space in t:
                t = t.replace(large_space, " ")
        texts_no_large_spaces.append(t)

    texts_no_websites = []
    for t in texts_no_large_spaces:
        websites = [word for word in t.split() if word[:4] == "http"]
        for w in websites:
            t = t.replace(w, "")
        texts_no_websites.append(t)

    # Remove the references section but maintain the categories if they exist
    # The references are in the text, so this just removes the section and external links
    # References are maintained for references like awards
    texts_no_references = []
    for t in texts_no_websites:
        if "Category:" in t:
            t = re.sub(r"(?<= ==References==).+?(?= Category)", "", t,
                       flags=re.DOTALL)
        else:
            t = t.split("==References==")[0]
        texts_no_references.append(t)

    gc.collect()
    pbar.update()

    texts_no_random_punctuation = []
    # Prevent words from being combined when a user types word/word or word-word or word:word
    for t in texts_no_references:
        t = t.replace("/", " ")
        t = t.replace("-", " ")
        t = t.replace(":", " ")  # split categories so they can be n-grammed
        t = re.sub("==[^>]+==", "", t)  # remove headers
        t = re.sub("< !--[^>]+-- >", "", t)  # remove comments
        texts_no_random_punctuation.append(t)

    texts_no_punctuation = []
    for r in texts_no_random_punctuation:
        texts_no_punctuation.append(
            r.translate(str.maketrans("", "", string.punctuation + "–" + "’")))

    # We lower case after names are removed to allow for filtering out capitalized words
    tokenized_texts = [text.split() for text in texts_no_punctuation]

    gc.collect()
    pbar.update()

    # Add bigrams and trigrams
    bigrams = Phrases(
        sentences=tokenized_texts,
        min_count=min_ngram_count,
        threshold=5.0,
        common_terms=stop_words,
    )  # half the normal threshold
    trigrams = Phrases(
        sentences=bigrams[tokenized_texts],
        min_count=min_ngram_count,
        threshold=5.0,
        common_terms=stop_words,
    )

    tokens_with_ngrams = []
    for text in tqdm(
        tokenized_texts,
        total=len(tokenized_texts),
        desc="n-grams generated",
        unit="texts",
        disable=not verbose,
    ):
        for token in bigrams[text]:
            if token.count("_") == 1:
                # Token is a bigram, so add it to the tokens
                text.insert(0, token)
        for token in trigrams[bigrams[text]]:
            if token.count("_") == 2:
                # Token is a trigram, so add it to the tokens
                text.insert(0, token)
        tokens_with_ngrams.append(text)

    gc.collect()
    pbar.update()

    args = zip(
        tokens_with_ngrams,
        [remove_names] * len(tokens_with_ngrams),
        [ignore_words] * len(tokens_with_ngrams),
    )

    num_cores = os.cpu_count()
    if __name__ == "wikirec.data_utils":
        with Pool(processes=num_cores) as pool:
            tokens_lower = list(
                tqdm(
                    pool.imap(_lower_remove_unwanted, args),
                    total=len(tokens_with_ngrams),
                    desc="Unwanted words removed",
                    unit="texts",
                    disable=not verbose,
                ))

    gc.collect()
    pbar.update()

    # Try lemmatization, and if not available stem, and if not available nothing
    nlp = None
    try:
        nlp = spacy.load(language)
        base_tokens = _lemmatize(tokens=tokens_lower, nlp=nlp, verbose=verbose)
    except OSError:
        try:
            os.system("python -m spacy download {}".format(language))
            nlp = spacy.load(language)
            base_tokens = _lemmatize(tokens=tokens_lower, nlp=nlp, verbose=verbose)
        except Exception:
            pass

    if nlp is None:
        # Lemmatization failed, so try stemming
        stemmer = None
        if language in SnowballStemmer.languages:
            stemmer = SnowballStemmer(language)
        # Correct if the abbreviations were put in
        elif language == "ar":
            stemmer = SnowballStemmer("arabic")
        elif language == "fi":
            stemmer = SnowballStemmer("finnish")
        elif language == "hu":
            stemmer = SnowballStemmer("hungarian")
        elif language == "sv":
            stemmer = SnowballStemmer("swedish")

        if stemmer is not None:
            # Stemming instead of lemmatization
            base_tokens = []
            for tokens in tqdm(
                tokens_lower,
                total=len(tokens_lower),
                desc="Texts stemmed",
                unit="texts",
                disable=not verbose,
            ):
                stemmed_tokens = [stemmer.stem(t) for t in tokens]
                base_tokens.append(stemmed_tokens)
        else:
            # We cannot lemmatize or stem
            base_tokens = tokens_lower

    gc.collect()
    pbar.update()

    token_frequencies = defaultdict(int)
    for tokens in base_tokens:
        for t in list(set(tokens)):
            token_frequencies[t] += 1

    if min_token_len is None or min_token_len is False:
        min_token_len = 0
    if min_token_freq is None or min_token_freq is False:
        min_token_freq = 0

    assert (type(min_token_len) == int
            ), "The 'min_token_len' argument must be an integer if used"
    assert (type(min_token_freq) == int
            ), "The 'min_token_freq' argument must be an integer if used"

    min_len_freq_tokens = []
    for tokens in base_tokens:
        min_len_freq_tokens.append([
            t for t in tokens
            if len(t) >= min_token_len and token_frequencies[t] >= min_token_freq
        ])

    gc.collect()
    pbar.update()

    # Save original length for sampling
    original_len = len(min_len_freq_tokens)

    min_sized_texts = [[i, t] for i, t in enumerate(min_len_freq_tokens)
                       if len(t) > min_tokens]

    args = zip(min_sized_texts, [max_token_index] * len(min_sized_texts))
    if __name__ == "wikirec.data_utils":
        with Pool(processes=num_cores) as pool:
            text_corpus = list(
                tqdm(
                    pool.imap(_subset_and_combine_tokens, args),
                    total=len(min_sized_texts),
                    desc="Texts finalized",
                    unit="texts",
                    disable=not verbose,
                ))

    gc.collect()

    # Sample texts
    if len(text_corpus) > int(sample_size * original_len):
        idxs = [t[0] for t in text_corpus]
        selected_idxs = np.random.choice(a=idxs,
                                         size=int(sample_size * original_len),
                                         replace=False)
    else:
        selected_idxs = [t[0] for t in text_corpus]

    text_corpus = [t[1] for t in text_corpus if t[0] in selected_idxs]

    pbar.update()

    return text_corpus, selected_idxs
def preprocess(tweet, ascii=True, ignore_rt_char=True, ignore_url=True,
               ignore_mention=True, ignore_hashtag=True, letter_only=True,
               remove_stopwords=True, min_tweet_len=3, content_words=True,
               lang='es'):
    # keywords
    key_words = ["coronavirus", "corona", "virus", "coronaoutbreak", "covid-19",
                 "covid19", "2019-ncov", "2019ncov", "sars-cov-2", "sarscov2",
                 "cov-19", "cov19", "covd19", "covd19"]

    # Combine stop word lists from NLTK, stopwords-iso and stop_words
    sword_en = set(stopwords.words('english'))
    sword_es = set(stopwords.words('spanish'))
    stop_words_iso = set(stopwordsiso.stopwords(["es", "en"]))
    reserved_words = ["rt", "fav", "vía", "nofollow", "twitter", "true", "href", "rel"]
    stop_words_es = set(get_stop_words('es'))
    stop_words_en = set(get_stop_words('en'))

    sword = set()
    sword.update(sword_en)
    sword.update(sword_es)
    sword.update(stop_words_en)
    sword.update(stop_words_iso)
    sword.update(stop_words_es)
    sword.update(reserved_words)
    sword.update(key_words)

    # defined here so the early-exit check below works; lang_detect interprets gn
    gn_early_exit = ["nicaragua"]

    if ascii:  # maybe remove lines with ANY non-ascii character
        for c in tweet:
            if not (0 < ord(c) < 127):
                return ''

    #tokens = tag(tweet.lower())  #tweet.lower().split()  # to lower, split
    doc = nlp(tweet.lower())

    res = []
    for token in doc:
        t = token
        token = t.text
        pos = t.pos_
        if lang != 'es' and token in gn_early_exit:
            return ''
        if remove_stopwords and lang == 'es' and token in sword:
            continue
        if ignore_rt_char and token == 'rt':
            continue
        if ignore_url and token.startswith('https:'):
            continue
        if ignore_mention and token.startswith('@'):
            continue
        if ignore_hashtag and token.startswith('#'):
            continue
        if letter_only:
            if not token.isalpha():
                continue
        elif token.isdigit():
            token = '<num>'
        # POS: keep only content words
        if content_words and lang == 'es' and pos not in ["NOUN", "PROPN", "ADV", "ADJ", "VERB"]:  # es
            continue
        if content_words and lang != 'es' and get_tag(token) not in ['n', 'v', 'adj', 'adv'] \
                and pos not in ["NOUN", "PROPN", "ADV", "ADJ", "VERB"]:  # gn
            continue
        # token = t.lemma_ if lang == 'es' else get_stem(token, True)
        res += token,

    # min_tweet_len
    if min_tweet_len and len(res) < min_tweet_len:
        return ''
    else:
        return ' '.join(res)
)
from rubrix._constants import MAX_KEYWORD_LENGTH
from stopwordsiso import stopwords

from .api import EsRecordDataFieldNames

SUPPORTED_LANGUAGES = ["es", "en", "fr", "de"]

DATASETS_RECORDS_INDEX_TEMPLATE = {
    "settings": {
        "number_of_shards": settings.es_records_index_shards,
        "number_of_replicas": settings.es_records_index_replicas,
        "analysis": {
            "analyzer": {
                "multilingual_stop_analyzer": {
                    "type": "stop",
                    "stopwords": [w for w in stopwords(SUPPORTED_LANGUAGES)],
                }
            }
        },
    },
    "index_patterns": [DATASETS_RECORDS_INDEX_NAME.format("*")],
    "mappings": {
        "properties": {
            "event_timestamp": {"type": "date"},
            EsRecordDataFieldNames.words: {
                "type": "text",
                "fielddata": True,
                "analyzer": "multilingual_stop_analyzer",
            },
            # TODO: Not here since it is task dependent
            "tokens": {"type": "text"},
def _get_stopwords(lang: str) -> AbstractSet[str]:
    result = set(stopwords(lang))
    if lang == "en":
        result.update(("'m", "'re", "'s", "'ve", "n't", "nt", "n’t", "’m", "’re",
                       "’s", "’ve"))
        result.difference_update((
            "case", "cases", "help", "home", "information", "man", "million",
            "new", "novel", "state", "states", "system", "today", "uk", "work",
            "world", "year", "years",
        ))
    elif lang == "de":
        result.update((
            "bleiben", "ca.", "echt", "eher", "eigentlich", "fast", "fest",
            "genau", "halt", "klar", "ne", "paar", "sogar", "trotz",
            "wahrscheinlich",
        ))
        result.difference_update((
            "ernst", "jahr", "jahre", "jahren", "mensch", "menschen", "neuen",
            "tag", "tage", "uhr", "wissen", "zeit",
        ))
    return result
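
# Hedged usage sketch for the helper above (assumes _get_stopwords and its
# stopwordsiso import are in scope): the base ISO list is extended with clitic
# forms and pruned of corpus-specific content words.
en_stops = _get_stopwords("en")
assert "n't" in en_stops        # added on top of the stopwords-iso list
assert "case" not in en_stops   # removed because it is a content word here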
import string
import stopwordsiso as stopwords

app = Flask(__name__)

# Loading some models
categor = pd.read_csv('category.csv')
nb = pickle.load(open("random_forest_classi.pkl", "rb"))
cv = pickle.load(open("cv_content.pkl", "rb"))
cv_head = pickle.load(open("cv_head.pkl", "rb"))
col_transform = pickle.load(open("one_hot.pkl", "rb"))

# Stop words
stop_words = stopwords.stopwords("bn")


# ##################### Function section ########################
# NLP preprocess function
# Apply a first round of text cleaning techniques
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
for label in all_labels:
    mask = train_data['Label'] == label
    label_count.append((label, len(train_data[mask])))

le = LabelEncoding(all_labels)
train_data, encoded_labels = le.encode(train_data)

data_stats = pd.DataFrame(label_count, columns=['category', 'no of labels'])
data_stats.plot(x='category', y='no of labels', kind='bar', legend=False,
                grid=True, figsize=(8, 8))
plt.title('Number of comments per category')
plt.ylabel('No of occurrences')
plt.xlabel('Category')
#plt.show()
print()

#vocabulary = build_vocabulary()
vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1, 2),
                             stop_words=stopwords('ny'))
train_features = vectorizer.fit_transform(train_texts).toarray()
#test_features = vectorizer.transform(test_texts).toarray()
reduced_vocabulary = []
print('Transformed features shape: ', train_features.shape)
label_ids = train_data['Label_Id']
K = 900
for label_id, label in sorted(encoded_labels.items()):
    train_features_chi2 = chi2(train_features, label_ids == label_id)
    indices = np.argsort(train_features_chi2[0])
    feature_names = np.array(vectorizer.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
from cltk.stop.classical_hindi.stops import STOPS_LIST

# Cell
tok = WordTokenizer(language='multilingual')  ## libraries that can be used

hi_stopwords = []
with open('../Data/Data/hindi_stopwords.txt', 'r') as fp:
    for w in fp.readlines():
        hi_stopwords.append(str(w[:-1]))

puncts = [
    ">", "+", ":", ";", "*", "’", "●", "•", "-", ".", "''", "``", "'", "|", "",
    "!", ",", "@", "?", "\u200d", "#", "(", ")", "|", "%", "।", "=", "``", "&",
    "[", "]", "/", "'"
]

stop_for_this = hi_stopwords + list(
    stopwords.stopwords(["en", "hi", "ta", "te", "bn"])) + [
        "आएगा", "गए", "गई", "करे", "नही", "हम", "वो", "follow", "दे", "₹", "हर",
        "••••", "▀▄▀", "नही", "अब", "व्हाट्सएप", "॥", "–", "ov", "डॉ",
        "ॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐ", "क्या", "जी", "वो", "╬═╬", "_",
        "backhand_index_pointing_down", "backhand_index_pointing_right", "link",
        "subscribe", "backhand_index_pointing_down_light_skin_tone",
        "backhand_index_pointing_up", "Whatsapp", "Follow", "Tweet", "सब्सक्राइब",
        "Link", "\'\'", "``", "________________________________",
        "_________________________________________"
    ]

# Cell
def preprocess_sent(
    sent,
    params={
def langmodelload(language):
    global stop_words
    global question_words
    global embeddings
    global model
    global lang_dict

    LibLocLang = "./udpipe-ud/"

    # Load the UDPipe model for the requested language
    if language == "en":
        model = Model(LibLocLang + 'english-ewt-ud-2.5-191206.udpipe')
    elif language == "ar":
        model = Model(LibLocLang + 'arabic-padt-ud-2.5-191206.udpipe')
    elif language == "zh":
        model = Model(LibLocLang + 'chinese-gsdsimp-ud-2.5-191206.udpipe')
    elif language == "id":
        model = Model(LibLocLang + 'indonesian-gsd-ud-2.5-191206.udpipe')
    elif language == "ko":
        model = Model(LibLocLang + 'korean-gsd-ud-2.5-191206.udpipe')
    elif language == "pt":
        model = Model(LibLocLang + 'portuguese-gsd-ud-2.5-191206.udpipe')
    elif language == "vi":
        model = Model(LibLocLang + 'vietnamese-vtb-ud-2.5-191206.udpipe')
    elif language == "hi":
        model = Model(LibLocLang + 'hindi-hdtb-ud-2.5-191206.udpipe')
    elif language == "jp":
        model = Model(LibLocLang + 'japanese-gsd-ud-2.5-191206.udpipe')
    elif language == 'es':
        model = Model(LibLocLang + 'spanish-gsd-ud-2.5-191206.udpipe')

    # Transliterate the English question words into the target language
    base_question_words = [
        'where', 'which', "who", "why", "what", "when", "please", "how", "is",
        "are", "will", "could", "should", "was", "were", "do", "did", "can"
    ]
    question_words = []
    for i in range(0, len(base_question_words)):
        question_words.append(
            Text(base_question_words[i]).transliterate(language))

    # Build a POS-filtered stop word list where both stopwords-iso and polyglot
    # POS tagging support the language
    if stopwords.has_lang(language) and language != "hi" and language != "ar" \
            and language != "zh" and language != "vi" and language != "ko" \
            and language != "jp" and language != "id" and language != "ms":
        stop_words = list(stopwords.stopwords(language))
        stop_words_list = []
        # Keep only stop words not tagged as NOUN, VERB or PRON
        for i in range(0, len(stop_words)):
            try:
                text = Text(stop_words[i], hint_language_code=language)
                if (text.pos_tags[0][1] != "NOUN") and \
                        (text.pos_tags[0][1] != "VERB") and \
                        (text.pos_tags[0][1] != "PRON"):
                    stop_words_list.append(text.pos_tags[0][0])
            except Exception as e:
                print(e)
        stop_words = stop_words_list
    else:
        print("Stop word filtering is skipped for " + language + ".")
        stop_words = []

    embeddings = Embedding.load("./polyglot_data/embeddings2/" + language +
                                "/embeddings_pkl.tar.bz2")

    lang_dict[language] = {
        'model': model,
        'embeddings': embeddings,
        'stop_words': stop_words
    }
import lemmy
import lemmy.pipe
import nltk
from polyglot.text import Text
import pycld2 as cld2
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import stopwordsiso as stopwords

lemmatizer = lemmy.load("da")

# Stop words + custom additions
stopwordlist = stopwords.stopwords("da")
stopwordlist.update([
    'du', 'og', 'til', 'kan', 'vores', 'brug', 'dine', 'første', 'ved', 'find',
    'dit', 'mere', 'blevet', 'tager', 'søg', 'http', 'dk', 'søg', 'læs'
])

# Open file and lower case letters
with open("pfa.txt", "r") as file:
    text = file.read().lower()

# Remove numbers from text
text = ''.join([i for i in text if not i.isdigit()])

# Remove all special characters
text = re.sub(r'[-()\"#_/@;:<>{}`+=~|.!?,]', ' ', text)
def __init__(self, config):
    self.all_stopwords = stopwords(["en", "zh"])
    use_cuda_flag = config.get("use_cuda", False)
    self.model = LAC(mode='seg', use_cuda=use_cuda_flag)
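
# Hedged standalone sketch of the call above: passing a list of language codes
# to stopwordsiso.stopwords() returns a single merged set, so one membership
# test covers both English and Chinese tokens.
from stopwordsiso import stopwords

all_stopwords = stopwords(["en", "zh"])
print("the" in all_stopwords, "的" in all_stopwords)  # True True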
list_of_terms = []
with open(f"{sourcedir}/{domain}_{lang}_terms_nes.ann", "r", encoding="utf-8") as f:
    for line in f.readlines():
        s = str(line).replace("OOD_Term", "").replace("Common_Term", "").replace(
            "Specific_Term", "").replace("Named_Entity", "").strip("\n").strip("\t")
        list_of_terms.append(s)

kwp.add_keywords_from_list(list_of_terms)

# Remove unwanted terms from list (single letters, prepositions, stop-words etc.)
abc_list = list(string.ascii_uppercase + string.ascii_lowercase)
kwp.remove_keywords_from_list(abc_list)
kwp.remove_keywords_from_list(word_boundary_list)
for i in stopwords(lang):
    kwp.remove_keyword(i)
    kwp.remove_keyword(i.capitalize())

# Extract the terms
with open(f"{outdir}/{lang}{suffix1}/{domain}{suffix2}full_tok.txt", "r",
          encoding="utf-8") as f:
    sentences = f.readlines()
# print(sentences[-10:])

results = []
for line in tqdm(sentences):
    s = kwp.extract_keywords(line.rstrip())
    results.append(s)
print(results[-10:])

# Remove previously added terms from keyword processor
for i in list_of_terms:
def __init__(self, config: dict):
    self.languages = json.loads(config['general']['languages'])
    self.chunksize = int(config['general']['chunksize'])
    self.rebuild_entire_database = config['general'][
        'rebuild_entire_database'].lower() == 'true'
    self.process_new_files_only = config['general'][
        'process_new_files_only'].lower() == 'true'

    self.data_dir = self.create_dir(ROOT_DIR, config['dir']['data_dir'])
    self.progress_dir = self.create_dir(self.data_dir,
                                        config['dir']['progress_dir'])
    self.spider_specific_dir = self.create_dir(
        ROOT_DIR, config['dir']['spider_specific_dir'])
    self.spiders_dir = self.create_dir(self.data_dir,
                                       config['dir']['spiders_subdir'])
    self.spacy_subdir = self.create_dir(self.data_dir,
                                        config['dir']['spacy_subdir'])
    self.datasets_subdir = self.create_dir(self.data_dir,
                                           config['dir']['datasets_subdir'])
    self.tmp_subdir = self.create_dir(self.data_dir, config['dir']['tmp_subdir'])
    self.corpora_subdir = self.create_dir(self.data_dir,
                                          config['dir']['corpora_subdir'])
    self.slc_subdir = self.create_dir(self.corpora_subdir,
                                      config['dir']['slc_subdir'])
    self.slc_spacy_subdir = self.create_dir(self.slc_subdir,
                                            config['dir']['spacy_subdir'])
    self.jureko_subdir = self.create_dir(self.corpora_subdir,
                                         config['dir']['jureko_subdir'])
    self.jureko_spacy_subdir = self.create_dir(self.jureko_subdir,
                                               config['dir']['spacy_subdir'])
    self.wikipedia_subdir = self.create_dir(self.corpora_subdir,
                                            config['dir']['wikipedia_subdir'])
    self.wikipedia_spacy_subdir = self.create_dir(self.wikipedia_subdir,
                                                  config['dir']['spacy_subdir'])
    self.spider_specific_dir = self.create_dir(
        ROOT_DIR, config['dir']['spider_specific_dir'])
    self.output_dir = self.create_dir(self.data_dir,
                                      config['dir']['output_subdir'])
    self.legal_info_dir = self.create_dir(ROOT_DIR,
                                          config['dir']['legal_info_dir'])

    self.ip = config['postgres']['ip']
    self.port = config['postgres']['port']
    self.user = config['postgres']['user']
    self.password = config['postgres']['password']
    self.db_scrc = config['postgres']['db_scrc']
    self.db_jureko = config['postgres']['db_jureko']
    self.db_slc = config['postgres']['db_slc']
    self.db_wikipedia = config['postgres']['db_wikipedia']
    self.indexes = json.loads(config['postgres']['indexes'])

    self.num_cpus = multiprocessing.cpu_count()

    self.stopwords = stopwords(self.languages)
    # this should be filtered out by the PUNCT pos tag already, but sometimes they are misclassified
    self.stopwords |= {' ', '.', '!', '?'}

    self.counter_types = ['counter_lemma', 'counter_pos', 'counter_tag']
def tfidf_vectorizer(self):
    vectorizer = TfidfVectorizer(preprocessor=clean_text,
                                 stop_words=stopwords("ny"),
                                 ngram_range=(1, 2),
                                 sublinear_tf=True,
                                 min_df=0.05,
                                 norm='l2')  # 'l2', not the digits '12'
    train_features = vectorizer.fit_transform(self.train)
    test_features = vectorizer.transform(self.test)
    return train_features, test_features
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

#from nltk.corpus import stopwords
#stop_words = stopwords.words('danish')
import stopwordsiso as stopwords
stopwords.langs()           # return a set of all the supported languages
stopwords.has_lang("da")    # check if there are stopwords for the language
stopwords.stopwords("da")   # Danish stopwords

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import sys
#from nltk.corpus import stopwords;
import nltk

from gensim.models import ldamodel
import gensim.corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle
from gensim.models import CoherenceModel
import pickle

import gensim
from razdel import tokenize
import regex
import stopwordsiso
from typing import List, Union, Dict, Any, Set

stops = set("""чей свой из-за вполне вообще вроде сюда аж той
россия россии россию россией путин путина путину путиным путине
даю даешь дает даем даете дают""".split())
stops = stops | stopwordsiso.stopwords("ru")

with open("models/classifier.pkl", "rb") as file:
    loanword_clf = pickle.load(file)

with open("models/cb_classifier.pkl", "rb") as file:
    obscene_clf = pickle.load(file)

with open("models/expressive_classifier.pkl", "rb") as file:
    expressive_clf = pickle.load(file)

model = gensim.models.KeyedVectors.load(
    "models/fasttext/araneum_none_fasttextcbow_300_5_2018.model")


def statistics(analysis: List[dict]) -> dict:
    total = len(analysis)
    loanword = len([t for t in analysis if t["loanword"]])
    obscene = len([t for t in analysis if t["obscene"]])
    expressive = len(
        [t for t in analysis if (t["obscene"] or t["expressive"])])
    stats = {
        "loanword_ratio": loanword,
def get_words(self):
    result = stopwords.stopwords(self.language)
    return result