def embers_stem(x):
    """
    DESCRIPTION
        It will do stemming for words in x considering English, Spanish and Portuguese.
    INPUT
        x: a tweet text, or other sentence or paragraph
    OUTPUT
        the tweet text after stemming.
    """
    x = x.lower()
    if not isinstance(x, unicode):
        x = x.decode('utf-8', 'ignore')
    try:
        stemmer = SnowballStemmer('spanish')
        x1 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
        if x1 == '':
            x1 = x
        # print x1
        stemmer = SnowballStemmer('english')
        x2 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
        if x2 == '':
            x2 = x
        # print x2
        stemmer = SnowballStemmer('portuguese')
        x3 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
        if x3 == '':
            x3 = x
        # print x3
        # print 'success'
        return min(x1, x2, x3, key=len)
    except Exception:
        return x
def tokenize(text, stemming=True, stoplist=[], remove_digits=False, lang='en'):
    translator = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))  # map punctuation to space
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        if lang == 'en':
            stemmer = Stemmer()
        elif lang == 'it':
            stemmer = SnowballStemmer('italian')
        elif lang == 'de':
            stemmer = SnowballStemmer('german')
        elif lang == 'fa':
            stemmer = paStemmer()
        analyzer = StemmingAnalyzer(stoplist=stoplist, minsize=1, stemfn=stemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=1)
    tokens = [token.text for token in analyzer(text)]
    if remove_digits:
        tokens = [word for word in tokens
                  if not contains_digits(word) and len(word) >= 2]
    return tokens
def __init__(self, lang):
    s_lang = map_langs.get(lang, lang)
    self.re_digits = re.compile(r"^[0-9]+(?:[,.][0-9]+)*[ºªkKmM]?$")
    self.re_mail = re.compile(
        r"^[a-zA-Z0-9+_\-\.]+@[0-9a-zA-Z][.\-0-9a-zA-Z]*\.[a-zA-Z]+$")
    self.re_url = re.compile(
        r"^[A-Za-z0-9-_]+:\/\/[A-Za-z0-9-_]*(?:\.[A-Za-z0-9-_]+)*|[A-Za-z0-9-_]+(?:\.[A-Za-z0-9-_]+)+$")
    if s_lang in SnowballStemmer.languages:
        self.stemmer = SnowballStemmer(s_lang)
    else:
        self.stemmer = SnowballStemmer('porter')
    self.mapstem = {}
def __init__(self, min_occurrence=10, window=15, from_corpus=False):
    self.min_occurrence = min_occurrence
    self.window = window
    # map words to integers (more memory efficient and faster)
    self.word2int_count = count()
    self.word2int = defaultdict(self.word2int_count.__next__)
    # map city names also to ints
    self.city2int_count = count()
    self.city2int = defaultdict(self.city2int_count.__next__)
    self.stemmer = SnowballStemmer('german')
    self.stopwords = set(stopwords.words('german')).union(STOP_CITIES)
    self.stems = defaultdict(lambda: defaultdict(int))
    self.cores = multiprocessing.cpu_count()
    if from_corpus:
        print("loading spacy", file=sys.stderr, flush=True)
        self.nlp = spacy.load('de', parser=False, tagger=True, entity=False)
        print("done...", file=sys.stderr, flush=True)
def stemLine(text):
    snow = SnowballStemmer('english')
    text = [snow.stem(t) for t in text.split()]
    return ' '.join(text)
class Config(object):
    # TEXT CLEANING
    TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

    # Checking if GPU is available or not
    is_cuda = torch.cuda.is_available()
    if is_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Stop words
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')

    # Model params
    batch_size = 32            # Batch size
    embed_size = 300           # Word2Vec embedding size
    hidden_layers = 2          # Number of hidden layers for the bi-directional LSTM
    hidden_size = 100          # Size of each hidden layer in the LSTM
    output_size = 2            # Output size
    hidden_size_linear = 128   # Fully connected layers
    dropout_keep = 0.51        # Dropout layer probability
    lr = 0.05                  # Learning rate
    epochs = 100               # Number of epochs

    # Directory paths
    model_path = ""            # Trained model path (state_dict.pt file)
    embedding_path = "./Dataset/embedding_matrix.npz"  # Embedding matrix path (.npz file)
    train_path = "./Dataset/trainset.npz"              # Training data file path (.npz)
    test_path = "./Dataset/validset.npz"               # Testing data file path (.npz)
    tokenizer_path = ""        # Tokenizer file path which you can use during inference
    path = "./results/RCNN/0.0005"                     # Directory path to save results
@classmethod
def quadratic(cls, language: 'model.Language'):
    """
    This estimator computes the ratio of new words for a given language.

    :param language: language of the text that needs to be estimated
    :rtype: WordHistoryDifficultyEstimator
    :return: WordHistoryDifficultyEstimator with initialized language and
        word => score map which can be used for determining scores for multiple
        articles in the same language
    """
    estimator = cls(language)

    freq_list = load_language_from_hermit(language.code)
    word_dict = dict()
    for k, v in freq_list.word_info_dict.items():
        word_dict[k] = v.frequency

    stemmer = SnowballStemmer(language.name.lower())
    score_map = defaultdict(int)
    for k, v in word_dict.items():
        score_map[stemmer.stem(k.lower())] += v

    # Normalize: frequent stems get scores near 0, rare stems near 1.
    max_freq = max(score_map.values())
    for k in score_map.keys():
        score_map[k] = (1 - score_map[k] / max_freq) ** 0.5

    estimator.score_map = score_map

    return estimator
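# Hedged sketch of the frequency-to-difficulty transform used by quadratic() above:
# scores fall in [0, 1], with frequent stems near 0 and rare stems near 1. The toy
# words and counts below are made up for illustration and stand in for the
# load_language_from_hermit() frequency list.
from collections import defaultdict

from nltk.stem import SnowballStemmer

toy_freqs = {"cats": 900, "running": 400, "serendipity": 3}

stemmer = SnowballStemmer("english")
score_map = defaultdict(int)
for word, freq in toy_freqs.items():
    score_map[stemmer.stem(word.lower())] += freq

max_freq = max(score_map.values())
scores = {stem: (1 - freq / max_freq) ** 0.5 for stem, freq in score_map.items()}
print(scores)  # {'cat': 0.0, 'run': ~0.75, 'serendip': ~1.0}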
def bag_of_words_spacy2(dataset):
    import spacy
    nlp = spacy.load('es_core_news_md')
    spanishstemmer = SnowballStemmer("spanish")
    all_stopwords = stopwords.words('spanish')
    all_stopwords.extend((
        "saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
        "atentamente", "dias", "estimado", "estimados", "estimada", "atte",
        "hola", "gracia", "caja", "respuesta", "adjunto", "mucha", "me",
        "cordoba", "buen", "ud"))
    removeList = ["no", "nunca"]
    all_stopwords = [e for e in all_stopwords if e not in removeList]
    stopword_set = set(all_stopwords)
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        # Strip combining accents except the tilde on "ñ".
        review = re.sub(
            r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
            r"\1", normalize("NFD", review), 0, re.I)
        review = normalize('NFC', review)
        review = re.sub('[^a-zA-Zá-ú0-9]', ' ', review)
        review = review.lower()
        doc = nlp(review)
        stems = [spanishstemmer.stem(token.text) for token in doc
                 if token.text not in stopword_set]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
def dat_to_db(language: Language, fname: str = "songbd.dat"):
    """
    Builds a .db file out of the file created by pull_only_contents. This is really
    useful for creating a database without using API calls. Currently the only
    officially supported way to create a database.

    :param language: the language to create a database for
    :param fname: the song data to import (generated by pull_only_contents)
    :return: none - will overwrite .db file though
    """
    vocabulary = {}
    with open(fname, "rb") as f:
        songs = pickle.load(f)
    stemmer = SnowballStemmer(language.name)
    for song in songs:
        lyrics = song.lyrics
        lyrics = lyrics.replace("\n", " ")
        lyrics = lyrics.split(" ")
        for word in lyrics:
            if word.isalpha() and not word == "":
                if word not in vocabulary:
                    vocabulary[word] = set()
                vocabulary[word].add(song.name)
    with open(language.file, "wb") as f:
        pickle.dump(vocabulary, f)
def normalizeWords(text):
    '''Text preprocessing'''
    stemmer = SnowballStemmer(language='english')
    test = re.compile(r'\W+', re.UNICODE).split(text[0].lower())
    stop_words = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
        'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
        'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', 'should', 'now', 'html'
    ]
    test = [stemmer.stem(word) for word in test
            if word not in stop_words and word.isalpha() and len(word) > 2]
    return (test, text[1], len(test))
def sentence_stemming(sentence):
    options = {
        "ar": "arabic", "da": "danish", "nl": "dutch", "en": "english",
        "fi": "finnish", "fr": "french", "de": "german", "hu": "hungarian",
        "it": "italian", "no": "norwegian", "pt": "portuguese",
        "ro": "romanian", "ru": "russian", "es": "spanish", "sv": "swedish"
    }
    lang = detect(sentence)
    try:
        stemmer = SnowballStemmer(options[lang])
    except KeyError:
        print("Language not supported")
        sys.exit()
    stemmed = " ".join(stemmer.stem(word) for word in sentence.split())
    stop = set(stopwords.words(options[lang]))
    return " ".join(token for token in word_tokenize(stemmed) if token not in stop)
def words_stemmer(words, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = ["PorterStemmer", "LancasterStemmer", "SnowballStemmer"]
    if type is False or type not in supported_stemmers:
        return words
    else:
        stem_words = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words:
                stem_words.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words:
                stem_words.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words:
                stem_words.append(stemmer.stem(word).encode(encoding))
        return " ".join(stem_words)
def stemming(words_list, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem (or lemmatize) every word in a list with the given stemmer type.

    Args:
        words_list: list of words

    Returns:
        List of stemmed words, each encoded with `encoding`.
    """
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer", "LancasterStemmer", "WordNetLemmatizer"
    ]
    if type is False or type not in supported_stemmers:
        return words_list
    else:
        encoded_list = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_list:
                encoded_list.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_list:
                encoded_list.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_list:
                encoded_list.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":
            wnl = WordNetLemmatizer()
            for word in words_list:
                encoded_list.append(wnl.lemmatize(word).encode(encoding))
        return encoded_list
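# Hedged usage sketch for stemming() above, assuming PorterStemmer, SnowballStemmer,
# LancasterStemmer and WordNetLemmatizer are imported from nltk.stem in this module.
# Note that the function returns UTF-8 encoded bytes, not str tokens.
words = ["running", "flies", "easily"]
print(stemming(words, type="PorterStemmer"))                    # [b'run', b'fli', b'easili']
print(stemming(words, type="SnowballStemmer", lang="english"))  # [b'run', b'fli', b'easili']
print(stemming(words, type="unsupported"))  # unchanged: ['running', 'flies', 'easily']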
def target_stemming_spanish(self, words):
    result = ""
    wordset = words.split(" ")
    stemmer = SnowballStemmer('spanish')
    for word in wordset:
        result += stemmer.stem(word) + "_"
    return result
def bag_of_words_spacy(dataset):
    import spacy
    spanishstemmer = SnowballStemmer("spanish")
    nlp = spacy.load('es_core_news_md')
    nlp.Defaults.stop_words |= {
        "saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
        "atentamente", "dias", "hola", "estimado", "estimados", "estimada", "atte"
    }
    nlp.Defaults.stop_words -= {"no", "nunca"}
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        doc = nlp(review)
        # remove punctuation and stopwords
        words = [t.orth_.lower() for t in doc if not t.is_punct | t.is_stop]
        # lexical_tokens = [t.lower() for t in words if len(t) > 2 and t.isalpha()]
        # (lowercase, drop two-letter words and numbers)
        review = ' '.join(words)
        doc = nlp(review)
        lemmas = [tok.lemma_.lower() for tok in doc]
        stems = [spanishstemmer.stem(token) for token in lemmas]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer", "LancasterStemmer", "WordNetLemmatizer"
    ]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
def __init__(self):
    super(DBRDPreprocessing, self).__init__(
        MultiLineTokenizer(),
        SnowballStemmer('english', ignore_stopwords=True),
        set(stopwords.words('english') + list(string.punctuation) + ["n't", "'t"]),
        [HTMLSymbolFilter()])
def stemming_and_stopwords(text):
    stemmer = SnowballStemmer("english")
    stop = stopwords.words("english")
    text = text.apply(lambda x: x.split())
    # text = text.apply(lambda word_list: [w for w in word_list if w not in stop])
    return text.apply(lambda word_list: " ".join(
        [stemmer.stem(w) for w in word_list if w not in stop]))
def stem_text(text: str, lang_code: str) -> [str]:
    if lang_code in languages.languages:
        tokens = word_tokenize(text)
        stemmer = SnowballStemmer(languages.languages[lang_code])
        stems = [stemmer.stem(token) for token in tokens]
        return stems
    return []
def _create_stemmer(stemmer_type):
    """Initialize a stemmer"""
    return {
        'Porter': PorterStemmer(),
        'Snowball': SnowballStemmer('english'),
        'Lancaster': LancasterStemmer(),
    }[stemmer_type]
def __init__(self, lang):
    lang_ipa = {'es': 'spa-Latn', 'en': 'eng-Latn'}
    lang_stemm = {'es': 'spanish', 'en': 'english'}
    self.lang = lang
    self.stemmer = SnowballStemmer(language=lang_stemm[lang])
    self.epi = epitran.Epitran(lang_ipa[lang])
    self.nlp = self.load_sapcy(lang)
def Snowball(self):
    stemmer = SnowballStemmer('english')
    newTokens = []
    for t in self.rm_stopwords():
        x = stemmer.stem(t)
        if x not in newTokens:
            newTokens.append(x)
    return newTokens
def text_process(self, text):
    # Remove punctuation
    no_punc = [char.lower() for char in text if char not in string.punctuation]
    # Join the characters again to form the string.
    no_punc = ''.join(no_punc)
    # Remove any stopwords
    try:
        no_stopwords = [
            word for word in no_punc.split()
            if word.lower() not in stopwords.words(self.language)
        ]
    except LookupError:
        nltk.download('stopwords')
        no_stopwords = [
            word for word in no_punc.split()
            if word.lower() not in stopwords.words(self.language)
        ]
    result = no_stopwords
    if self.tagging:
        # Tag each word
        tagged_words = self._tag_text(result)
        # Remove unwanted tags
        extracted_tags = self._extract_tags(tagged_words)
        result = extracted_tags
    if self.stemming:
        # Stem it
        stemmer = SnowballStemmer(self.language)
        result = [stemmer.stem(word) for word in result]
    return result
def __init__(self,
             max_edit_distance_dictionary: int = 5,
             prefix_length: int = 10,
             count_threshold: int = 1,
             compact_level: int = 5):
    super().__init__(max_edit_distance_dictionary, prefix_length,
                     count_threshold, compact_level)
    self.stemmer = SnowballStemmer('german')
def get_stem(lang, sentence):
    stemmer = SnowballStemmer(lang)
    stemmed = ''
    for word in casual_tokenize(sentence):
        word = stemmer.stem(word)
        stemmed = stemmed + word + ' '
    return stemmed
def __init__(self):
    # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
    self._tok = MosesTokenizer(lang='en')
    self._stemmer = SnowballStemmer('english')
    self._lemmatizer = TreeTagger(language='english')
    self._stopwords = set(open(STOPWORDS).read().splitlines())
    # stopwords.words('french')
    # self._porter_stemmer = nltk.stem.porter.PorterStemmer()
def stem(word):
    """Supported Snowball languages: 'danish', 'dutch', 'english', 'finnish',
    'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter',
    'portuguese', 'romanian', 'russian', 'spanish', 'swedish'."""
    stemmer = SnowballStemmer("english")
    try:
        word = stemmer.stem(word).encode('utf-8')
    except Exception:
        pass
    return word
def stemLine(title, abstract):
    snow = SnowballStemmer('english')
    title = [snow.stem(t) for t in title.split()]
    abstract = [snow.stem(a) for a in abstract.split()]
    return ' '.join(title) + '\t' + ' '.join(abstract)
def __init__(self):
    self.tokenize = RegexpTokenizer(r'\b([A-Za-z]+)\b')  # remove punctuation
    if ver == 2:
        self.stemmer = SnowballStemmer("english")  # use stemmed version of words
    elif ver == 1:
        self.stemmer = LancasterStemmer()
    else:
        self.stemmer = PorterStemmer()
def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()
    stemmer = SnowballStemmer('english')
    cleaned_text_list = []
    for text in input_text_list:
        text = text.translate(string.punctuation)  # Remove punctuation
        text = text.lower()  # Convert words to lower case and split them
        # text = " ".join(text)

        # Clean the text: drop characters other than A-Za-z0-9(),!?'`
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t.co/[A-Za-z]{10}", " ", text)

        # Remove stopwords once before stemming
        text = text.split()
        text = [word for word in text if word not in stoplist]
        stemmed_words = [stemmer.stem(word) for word in text]
        # ...and remove them again after stemming
        cleanwordlist = [word for word in stemmed_words if word not in stoplist]
        text = " ".join(cleanwordlist)
        cleaned_text_list.append(text)
    return cleaned_text_list