def get_stemmer(self, lang_code):
    """Return a SnowballStemmer for the given language code.

    Supported Snowball languages: danish dutch english finnish french german
    hungarian italian norwegian porter portuguese romanian russian spanish swedish
    """
    if lang_code == "en":
        return nltk.SnowballStemmer("english")
    elif lang_code == "fr":
        return nltk.SnowballStemmer("french")
    elif lang_code == "ge":
        return nltk.SnowballStemmer("german")
    elif lang_code == 'it':
        return nltk.SnowballStemmer("italian")
    else:
        # Fall back to English for any other language code
        return nltk.SnowballStemmer("english")

def text_pre_processing(self, text):
    stemmer = nltk.SnowballStemmer("swedish")
    bag = {}
    # Lowercase and split the string into independent words
    # (keep the Swedish letters å, ä, ö in addition to ASCII word characters).
    # `text` is expected to be a str, so no explicit UTF-8 decoding is needed.
    script = text.lower()
    splitted = re.split(r'[^A-Za-z0-9\wåäöÅÄÖ]+', script)
    # Fill in the bag-of-words with stemmed non-stopwords.
    swedish_stopwords = set(stopwords.words('swedish'))
    extra_stopwords = Stopword().stopwords
    for token in splitted:
        if token not in swedish_stopwords and token not in extra_stopwords:
            word = stemmer.stem(token)
            if word in bag:
                bag[word] += 1
            else:
                bag[word] = 1
    return bag

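# Illustrative sketch (not part of the code above): the same Swedish bag-of-words
# count can be written with collections.Counter. The sample text and the inline
# extra_stopwords parameter are assumptions for demonstration only.
import re
from collections import Counter

import nltk

def swedish_bag_of_words_sketch(text, extra_stopwords=frozenset()):
    stemmer = nltk.SnowballStemmer("swedish")
    tokens = re.split(r'[^A-Za-z0-9\wåäöÅÄÖ]+', text.lower())
    # Drop empty tokens and extra stopwords, then count the stems.
    return Counter(stemmer.stem(t) for t in tokens if t and t not in extra_stopwords)

# Example: swedish_bag_of_words_sketch("Hunden jagade katterna i parken")
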
def FeedReview(text: str, language: str) -> str:
    """
    Remove html tags, stopwords and special characters from review.
    Stem words in review.

    Parameters
    ----------
    - text : str
        review
    - language : str
        language of the review

    Returns
    -------
    str
        prepared review
    """
    # Remove html tags
    text = RemoveHTML(text)
    # Remove stopwords and special characters
    stopwords = nltk.corpus.stopwords.words(language)
    text = RemoveSWSC(text, stopwords)
    # Stem words
    stemmer = nltk.SnowballStemmer(language)
    text = StemText(text, stemmer)
    return text

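# The RemoveHTML / RemoveSWSC / StemText helpers are not shown above. A minimal,
# self-contained sketch of the same pipeline (these stand-ins are assumptions,
# not the original helpers) could look like this; it requires the NLTK stopwords
# corpus to be downloaded.
import re

import nltk

def feed_review_sketch(text: str, language: str = "english") -> str:
    text = re.sub(r"<[^>]+>", " ", text)              # strip HTML tags
    stop = set(nltk.corpus.stopwords.words(language))
    tokens = re.findall(r"[A-Za-z]+", text.lower())   # drop special characters
    stemmer = nltk.SnowballStemmer(language)
    return " ".join(stemmer.stem(t) for t in tokens if t not in stop)

# Example: feed_review_sketch("<p>The movie was surprisingly good!</p>")
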
def groupingArea(data, dict):
    class Inline(object):
        pass
    data_out = Inline()
    # loop over the training_set data
    for quest in data:
        if quest[2] in dict:
            # drop the columns not needed for the bag-of-words analysis
            qs = quest[3:]
            # removing nan string values
            # tmp = [x for x in tmp if str(x) != 'nan']
            # convert the list to a string and replace semicolons with spaces
            # (also solves the problem with integer values)
            qs = ' '.join(str(v) for v in qs)
            qs = qs.replace(";", " ")
            # TOKENIZING
            tokenizer = nltk.RegexpTokenizer(r'\w+')
            qs = tokenizer.tokenize(qs)
            # STOPWORD REMOVAL (handles accented characters)
            stop = nltk.corpus.stopwords.words('italian')
            text = [word for word in qs if word not in stop]
            # TODO: fix unrecognized characters (instead of using re.sub(...))
            text = ' '.join(text)
            text = re.sub(r'[\W_]+', ' ', text)
            # convert the string back to a list
            text = text.split(' ')
            # STEMMING
            stemmer = nltk.SnowballStemmer('italian')
            text = [stemmer.stem(word) for word in text]
            # convert the list to a string and add it to the group
            text = ' '.join(text)
            dict.get(quest[2]).append(text.lower())
    # group the per-area documents into one training list
    D = []
    y_train = []
    for key in dict:
        y_train.append(key)
        tmp = dict.get(key)
        tmp = ' '.join(tmp)
        D.append(tmp)
    data_out.data = D
    data_out.target = y_train
    return data_out

def stemmatize(dct, language):
    stemmer = nltk.SnowballStemmer(language)
    for j in dct:
        tokens = dct[j][1]
        dct[j][1] = [stemmer.stem(token) for token in tokens]
    return dct

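# Usage sketch, assuming nltk is imported and stemmatize is defined as above.
# The dictionary layout is inferred from the assignment to dct[j][1]: each value
# is a list whose second element is a token list; the sample data is an assumption.
docs = {
    "doc1": ["A title", ["running", "runners", "easily"]],
    "doc2": ["Another title", ["cats", "catlike"]],
}
stemmed_docs = stemmatize(docs, "english")
# stemmed_docs["doc1"][1] -> ['run', 'runner', 'easili']
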
def stemInput(l1):
    '''
    returns array of stemmed input
    '''
    snowball = nltk.SnowballStemmer(language='german')
    # Note: the stopword set is built here but not used below.
    stopset = set(stopwords.words('german'))
    stopset |= set("(),")
    l1 = [snowball.stem(l) for l in l1]
    return l1

def __init__(self, source='rawtext', language='dutch', use_lemma=True, use_id=True,
             use_stemming=True, remove_repeating=False, delete_stopwords=False,
             postagging=True, posfilter=None, ngrams=1, lowercase=True,
             zeropunctuation=False, max_paragraph=None, marknegations=False,
             headlinefilter=None, reportparameters=True):
    self.source = source
    self.posfilter = posfilter
    self.ngrams = ngrams
    self.postagging = postagging
    self.headlinefilter = headlinefilter
    self.lowercase = lowercase
    self.marknegations = marknegations
    self.zeropunctuation = zeropunctuation
    self.max_paragraph = max_paragraph
    if use_stemming == True:
        self.stemmer = nltk.SnowballStemmer(language)
    else:
        self.stemmer = None
    if remove_repeating:
        self.repeatReplacer = RepeatReplacer()
    else:
        self.repeatReplacer = None
    if delete_stopwords == True:
        self.stopwords = nltk.corpus.stopwords.words(language)
    else:
        self.stopwords = None
    if source == 'parsed':
        self.use_lemma = use_lemma
        self.use_id = use_id
        if self.use_id:
            self.repeatReplacer = None
        if self.stopwords:
            if use_lemma == False:
                self.stopwords = set(w.id for w in Word.objects.filter(word__in=stopwords))
            if use_lemma == True:
                self.stopwords = set(w.id for w in Lemma.objects.filter(lemma__in=stopwords))
    if source == 'rawtext':
        self.use_lemma = False
        self.use_id = False
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    if posfilter or (postagging == True):
        self.tagger = self.taggerCombo(language)
    if reportparameters == True:
        self.reportParameters()

def main():
    """
    Remember: nltk dependencies are needed on first use:
        nltk.download('punkt')      # tokenizer
        nltk.download('stopwords')
    """
    prep = PrepText()
    stemmer = nltk.SnowballStemmer('english')
    stop = stopwords.words('english')
    stop = stop + settings.academic_words
    data_files = [
        (settings.text_training_data, settings.text_training_prep),
        (settings.text_test_data, settings.text_test_prep),
    ]
    data_cols = [settings.id_colname, settings.text_colname]
    for input_file, output_file in data_files:
        # The files use '||' as a field separator, hence the regex separator
        # and the python engine.
        df = pd.read_csv(input_file, sep=r'\|\|', skiprows=1,
                         engine='python', names=data_cols)
        df[settings.text_colname] = df[settings.text_colname].str.lower()
        df = prep.toke(df, settings.text_colname)
        df = prep.normalize_text(df, settings.text_colname, stop)
        df = prep.stem_text(df, settings.text_colname, stemmer)
        df.to_csv(output_file, index=False)

def find_cos(statment1, statment2):
    statment1 = statment1.lower()
    statment2 = statment2.lower()
    word_tokens = word_tokenize(statment1)
    word_tokens2 = word_tokenize(statment2)
    stop_words = set(stopwords.words('english'))
    snow = nltk.SnowballStemmer('english')
    # Filter out stopwords from both statements.
    filtered_sentence1 = [w for w in word_tokens if w not in stop_words]
    filtered_sentence2 = [w for w in word_tokens2 if w not in stop_words]
    # Stem the remaining words and rebuild the statements.
    statment1 = ' '.join(snow.stem(word) for word in filtered_sentence1)
    statment2 = ' '.join(snow.stem(word) for word in filtered_sentence2)
    # Strip any punctuation left over after stemming.
    statment1 = re.sub(r'[^\w\s]', '', statment1)
    statment2 = re.sub(r'[^\w\s]', '', statment2)
    return get_result(statment1, statment2)

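# get_result() is not shown above. A common way to finish a cosine-similarity
# comparison of the two cleaned statements is TF-IDF plus cosine similarity;
# this stand-in is an assumption, not the original implementation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_result_sketch(statment1, statment2):
    # Vectorise both statements in the same TF-IDF space and compare them.
    tfidf = TfidfVectorizer().fit_transform([statment1, statment2])
    return cosine_similarity(tfidf[0], tfidf[1])[0][0]
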
def tokenize(filename):
    '''
    Tokenizes the sentences and words
    :param filename: path of the file containing the text to be summarized
    '''
    global sentences, sentences_processing, sentence_dictionary
    with io.open(filename, "r", encoding="utf-8") as inputFile:
        for line in inputFile:
            sentences.append(line)
            sentences_processing.append(line.lower().strip())
    counter = 0
    stemmer = nltk.SnowballStemmer('portuguese')
    for sentence in sentences_processing:
        # Replace punctuation and dashes with spaces before splitting.
        sentence = re.sub(r',|\.|-|\(|\)', ' ', sentence)
        tokens = sentence.split()
        actualTokens = removeStopWords(tokens)
        stemmedTokens = [stemmer.stem(word) for word in actualTokens]
        sentence_dictionary[counter] = stemmedTokens
        counter += 1

def stem_tweet(tokens):
    """
    Stemming is the process of reducing a derived word to its original word.
    """
    # Using SnowballStemmer for English
    stemmer = nltk.SnowballStemmer('english')
    return [stemmer.stem(x) for x in tokens]

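# Usage sketch, assuming nltk is imported and the 'punkt' tokenizer data is
# available (the sample tweet is an assumption).
tokens = nltk.word_tokenize("The cats were running faster")
print(stem_tweet(tokens))
# -> ['the', 'cat', 'were', 'run', 'faster']
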
class StringStemmingRuleProvider:
    _porter_stemmer = nltk.PorterStemmer()
    _snowball_stemmer = nltk.SnowballStemmer('english')
    _n_gram = 4
    # _lemmatizer = wordnet.WordNetLemmatizer()

    @staticmethod
    def porter(str_to_stem):
        return [StringStemmingRuleProvider._porter_stemmer.stem(str_to_stem)]

    @staticmethod
    def snowball(str_to_stem):
        return [StringStemmingRuleProvider._snowball_stemmer.stem(str_to_stem)]

    @staticmethod
    def n_gram_yielder(str_to_stem):
        # Yield every character n-gram of length _n_gram.
        nr_of_grams = len(str_to_stem) - StringStemmingRuleProvider._n_gram + 1
        for index in range(0, nr_of_grams):
            yield str_to_stem[index: index + StringStemmingRuleProvider._n_gram]

    @staticmethod
    def n_gram(str_to_stem):
        if len(str_to_stem) <= StringStemmingRuleProvider._n_gram:
            return [str_to_stem]
        return list(StringStemmingRuleProvider.n_gram_yielder(str_to_stem))

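# Usage sketch comparing the three rules (assumes nltk is imported; the sample
# words are arbitrary).
print(StringStemmingRuleProvider.porter("connections"))    # ['connect']
print(StringStemmingRuleProvider.snowball("connections"))  # ['connect']
print(StringStemmingRuleProvider.n_gram("window"))
# -> ['wind', 'indo', 'ndow']
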
def intersection(wordlist1, wordlist2):
    """Calculate the number of intersection words between two comments

    Args:
        wordlist1, wordlist2: the two comments as raw strings
    Return:
        overlap: number of common stemmed words in both comments,
            normalised by the sum of the logs of their lengths
    """
    snowball = nltk.SnowballStemmer("english")
    wordlist1 = nltk.word_tokenize(wordlist1.lower())
    wordlist2 = nltk.word_tokenize(wordlist2.lower())
    wordlist1 = remove_stop_words(wordlist1)
    wordlist2 = remove_stop_words(wordlist2)
    wordlist1 = [snowball.stem(t) for t in wordlist1]
    wordlist2 = [snowball.stem(t) for t in wordlist2]
    norm = math.log(len(wordlist1)) + math.log(len(wordlist2))
    overlap = len(list(set(wordlist1) & set(wordlist2))) / norm
    return overlap

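# remove_stop_words() is not shown above; a minimal stand-in (an assumption) plus
# a usage example. Requires the 'punkt' and 'stopwords' NLTK data.
import math

import nltk

def remove_stop_words(tokens):
    stop = set(nltk.corpus.stopwords.words("english"))
    return [t for t in tokens if t.isalnum() and t not in stop]

score = intersection("The cats were chasing the mouse quickly",
                     "A mouse was chased by two cats yesterday")
print(score)  # higher scores mean more shared stemmed words
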
def stemming(sentences):
    snowball = nltk.SnowballStemmer("english")
    stemmed_sentences = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence.sentence)
        stemmed_tokens = [snowball.stem(t) for t in tokens]
        stemmed_sentences.append(' '.join(stemmed_tokens))
    return stemmed_sentences

def _stem(self, tokens):
    """Stems the tokens with the nltk SnowballStemmer

    :param tokens: list of string
    :return: list of string with word stems
    """
    stemmer = nltk.SnowballStemmer(language='english')
    tokens_stemmed = [stemmer.stem(token) for token in tokens]
    return tokens_stemmed

def process_data(file):
    """
    Before we can analyze our data, we need to process it. This includes:
    - converting all words to lowercase
    - deleting punctuation marks
    - deleting numbers
    - tokenizing tweets to get a list of words
    - deleting stopwords (or keeping them, depending on the results)
    - stemming all words

    This function returns a list of lists containing all stemmed words of
    every tweet from the trainset (data_list).
    """
    data_list = []
    punctuation_numbers = r'[^a-zA-Z ]'
    stopwords = nltk.corpus.stopwords.words("english")  # list of english stopwords
    stemmer = nltk.SnowballStemmer("english")
    with open(file, 'r') as csvfile:
        # collect tweets from the csv data in a list
        reader = csv.reader(csvfile, delimiter=';')
        next(reader, None)  # skip header
        for row in reader:
            data_list.append(row[5])
    with open("complete_dict.p", "rb") as f:
        dict_words = pickle.load(f)
    for index, element in enumerate(data_list):
        element = element.lower()  # utterance to lowercase
        non_alpha = re.findall(punctuation_numbers, element)
        for x in non_alpha:
            element = element.replace(x, '')  # delete punctuation marks and numbers
        element = word_tokenize(element)  # tokenize utterance
        # delete stopwords (depending on results we may not remove stopwords)
        element = [w for w in element if w not in stopwords]
        for i, word in enumerate(element):
            if word in dict_words:
                # replace the word with its dictionary mapping
                element[i] = str(dict_words[word])
            else:
                # otherwise fall back to stemming
                element[i] = str(stemmer.stem(word))
        data_list[index] = element
    return data_list

def stemming(inlist):
    '''
    input a list, returns a list of stemmed words
    '''
    outlist = []
    stemmer = nltk.SnowballStemmer('english')
    for word in inlist:
        outlist.append(stemmer.stem(word))
    return outlist

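# Usage sketch (assumes nltk is imported; the sample words are arbitrary).
print(stemming(['studies', 'studying', 'cries', 'crying']))
# -> ['studi', 'studi', 'cri', 'cri']
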
def resize():
    qs = ''
    if request.method == 'POST':
        # handle the json request
        content = request.get_json(silent=True)
        for k, v in content.items():
            if type(v) is list:
                # handle arrays
                for val in v:
                    qs += val + ' '
            else:
                # handle strings
                qs += v + ' '
    else:
        resp = Response(response='405', status=405, mimetype="application/json")
        return resp
    # LOADING TF-IDF STRUCTURE
    vectorizer = pickle.load(open('/models/TfidfVectorizer.pk', 'rb'))
    # DOWNLOADING stopwords
    nltk.download('stopwords')
    # TOKENIZING
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    qs = tokenizer.tokenize(qs)
    # STOPWORD REMOVAL
    stop = nltk.corpus.stopwords.words('italian')
    text = [word for word in qs if word not in stop]
    # STEMMING
    stemmer = nltk.SnowballStemmer('italian')
    text = [stemmer.stem(word) for word in text]
    # converting list to string
    text = ' '.join(text)
    tl = [text]
    # RETRIEVING FEATURES
    X_test = vectorizer.transform(tl)
    res = classifier(MultinomialNB(alpha=.01), X_test)
    return Response(response=json.dumps({'classes': res}, sort_keys=False, indent=4),
                    status=200,
                    mimetype="application/json")

def clean_data(text):
    ## Remove punctuation
    ## (note: str.translate with a plain punctuation string leaves normal text
    ## unchanged; the regex substitutions below do the actual cleaning)
    text = text.translate(string.punctuation)
    ## Convert words to lower case and split them
    text = text.lower().split()
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r'\\n', " ", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r":", "", text)
    text = re.sub(r"-", "", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r" n ", " ", text)
    text = re.sub(r"\.com", "", text)
    text = re.sub(r"www", "", text)
    ## Stemming
    text = text.split(' ')
    stemmer = nltk.SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

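# Usage sketch, assuming the imports clean_data relies on (re, string, nltk,
# stopwords) are in place and the NLTK stopwords corpus is downloaded; the
# sample sentence is arbitrary and only illustrates the lowercasing, regex
# cleanup and stemming.
print(clean_data("What's the best way to learn programming? Visit example.com!"))
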
def __init__(self):
    self.ngrams_size = 1
    nltk.download('punkt')
    nltk.download('stopwords')
    self.stemmer = nltk.SnowballStemmer('english')
    self.nlp = spacy.load("en_core_web_sm")
    # Only the tokenizer is needed, so drop the heavier spaCy pipeline components.
    self.nlp.remove_pipe('tagger')
    self.nlp.remove_pipe('parser')
    self.nlp.remove_pipe('ner')
    self.ngrams = None

def getStemmetizedText(text_tokens, clean_input=False, printRes=False):
    # If this flag is set, the passed input is assumed to be a raw paragraph
    # and is cleaned/tokenized first.
    if clean_input:
        text_tokens = cleanText(text_tokens)
    stemmer = nltk.SnowballStemmer(language="english")
    res = [stemmer.stem(word) for word in text_tokens]
    if printRes:
        print(res)
    return res

def stem_text(tokens):
    '''
    Stemming is the process of reducing a derived word to its original word.

    Args:
        tokens : The original non-stemmed tokens.

    Returns:
        The stemmed tokens.
    '''
    # Using SnowballStemmer for English
    stemmer = nltk.SnowballStemmer('english')
    return [stemmer.stem(x) for x in tokens]

def tagging(stem=False):
    """
    Tags and chunks words between the two entities
    """
    # set filepath to input
    basepath = os.path.dirname(__file__)
    file_in = os.path.abspath(os.path.join(basepath, 'csv/relevant_sentences.csv'))
    if stem:
        file_out = os.path.abspath(os.path.join(basepath, 'csv/tagged_sentences_stemmed.csv'))
    else:
        file_out = os.path.abspath(os.path.join(basepath, 'csv/tagged_sentences.csv'))

    chunker = set_up_chunker()
    if stem:
        stemmer = nltk.SnowballStemmer('english')

    with open(file_in, 'rb') as csv_in:
        with open(file_out, 'wb') as csv_out:
            # set columns here so they can be more easily changed
            cols = ['pid', 'sent_num', 'true_relation', 'rel_type', 'e1', 'e2',
                    'type1', 'type2', 'start1', 'end1', 'start2', 'end2', 'sentence',
                    'before_tags', 'between_tags', 'after_tags',
                    'before',  # TODO get rid of these, need to change the write cols instead of using update
                    'between', 'after']
            csv_reader = csv.DictReader(csv_in, delimiter=',')
            csv_writer = csv.DictWriter(csv_out, cols, delimiter=',')
            csv_writer.writeheader()
            for row in csv_reader:
                # display progress bar
                sys.stdout.write('.')
                sys.stdout.flush()
                if stem:
                    row.update({'before_tags': pos_and_chunk_tags(row['before'], chunker, stemmer)})
                    row.update({'between_tags': pos_and_chunk_tags(row['between'], chunker, stemmer)})
                    row.update({'after_tags': pos_and_chunk_tags(row['after'], chunker, stemmer)})
                else:
                    row.update({'before_tags': pos_and_chunk_tags(row['before'], chunker)})
                    row.update({'between_tags': pos_and_chunk_tags(row['between'], chunker)})
                    row.update({'after_tags': pos_and_chunk_tags(row['after'], chunker)})
                csv_writer.writerow(row)

def preprocess(text):
    stemmer = nltk.SnowballStemmer('french')
    data = nltk.word_tokenize(text.replace('\n', ' '))
    data = [t.lower() for t in data if t.lower() not in stopwords]
    #data = [re.sub(r"^(d|l|qu|c|s)'", '', w) for w in data if not re.match(r"^(\d+|\.)$", w)]
    #data = [re.sub(r"^(d|l|qu|c|s)'", '', w) for w in data]
    #data = [w for w in data if not re.match(r"^(\d+|\.)$", w)]
    data = [stemmer.stem(t) for t in data]
    return ' '.join(data)

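# Usage sketch. preprocess() relies on a module-level `stopwords` collection that
# is not shown above; building it from NLTK's French list is an assumption.
# Requires the 'punkt' and 'stopwords' NLTK data.
import nltk

stopwords = set(nltk.corpus.stopwords.words('french'))
print(preprocess("Les chats mangeaient rapidement dans la cuisine"))
# prints the stemmed, stopword-free tokens joined by spaces
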
def __init__(self):
    """
    The constructor takes a training data set and trains the classifier
    """
    # The conll2000 chunker is more detailed than the treebank one,
    # i.e. it includes prepositional chunks.
    train_sents = nltk.corpus.conll2000.chunked_sents('train.txt')
    #train_sents = nltk.corpus.treebank_chunk.chunked_sents()
    train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in train_sents]
    self.tagger = nltk.BigramTagger(train_data)
    self.stemmer = nltk.SnowballStemmer('english')

def stem_corpus(self, corpus, lang):
    """Apply stemming on corpus.

    Args:
        corpus: corpus of a web page
        lang: language of a web page

    Returns:
        corpus: stemmed corpus
    """
    log = logging.getLogger('stem_corpus')
    try:
        corpus = corpus.replace("\\t", "")
        corpus = corpus.replace("\\n", "")
        corpus = corpus.replace("\\xa", "")
        corpus = re.sub(r"\d+", " ", corpus)
        # Keep words of at least three characters (hyphens allowed, no leading digit).
        pattern = re.compile(r'(?!\d)[\w\-]{3,}')
        corpus = pattern.findall(corpus)
        new_corpus = []
        # Map the page's language code to the matching Snowball stemmer;
        # unsupported languages yield an empty result, as before.
        languages = {'fr': 'french', 'en': 'english', 'it': 'italian', 'es': 'spanish'}
        if lang in languages:
            stemmer = nltk.SnowballStemmer(languages[lang])
            new_corpus = [stemmer.stem(word) for word in corpus]
        return ' '.join(new_corpus)
    except Exception as e:
        log.debug("type error: {}".format(e))
        log.debug(traceback.format_exc())

def removeStopWords(NativeallWords):
    nltk.download('stopwords')
    nltk.download('punkt')
    # stop = set(stopwords.words('english'))
    # list = [w for w in NativeallWords if w not in stop]
    # nltk.download('averaged_perceptron_tagger')
    # keylist = pos_tag(NativeallWords)
    # list = [word for word, pos in keylist if
    #         pos != 'VBZ' and pos != 'DT' and pos != 'IN' and pos != 'PRP' and pos != 'CC']
    stemmer = nltk.SnowballStemmer("english")
    # Despite the function name, the stopword/POS filtering above is commented out;
    # currently the words are only lowercased and stemmed.
    stemmed = [stemmer.stem(w).lower() for w in NativeallWords]
    return stemmed

def list_stemmer(doc_list):
    n = len(doc_list)
    stemmed_text_list = []
    # Initialize the stemmer
    stemmer = nltk.SnowballStemmer("english")
    for i in range(0, n):
        # According to stackexchange discussions, a list comprehension is much
        # faster for this task than a loop.
        stemmed_text = ' '.join(stemmer.stem(token)
                                for token in nltk.word_tokenize(doc_list[i]))
        stemmed_text_list.append(stemmed_text)
    return stemmed_text_list

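# Usage sketch (assumes nltk is imported and the 'punkt' tokenizer data is
# available; the two sample documents are arbitrary).
docs = ["The runners were running quickly.",
        "Stemming reduces related words to a common stem."]
print(list_stemmer(docs))
# -> ['the runner were run quick .', 'stem reduc relat word to a common stem .']
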
def __init__(self):
    # Set relative NLTK data path for corpora parsing
    corpus_dir = os.path.dirname(os.path.abspath(__file__)) + '/../corpus'
    nltk.data.path.append(corpus_dir)
    self.spanish_stemmer = nltk.SnowballStemmer("spanish")
    self.es_unigram_tagger = self._load_tagger(
        corpus_dir + '/taggers/es_unigram.pickle', 'unigram')
    self.es_bigram_tagger = self._load_tagger(
        corpus_dir + '/taggers/es_bigram.pickle', 'bigram')
    self.en_unigram_model = self._load_model(
        corpus_dir + '/models/en_unigram.pickle', 1)
    self.en_bigram_model = self._load_model(
        corpus_dir + '/models/en_bigram.pickle', 2)

def df_stemmatize_dataframe(dataframe):
    # Note: this assumes the "text" column holds lists of tokens; if it held raw
    # strings, the inner comprehension would stem individual characters.
    stemmer = nltk.SnowballStemmer("english")
    text_column = dataframe["text"]
    new_words = []
    for i in text_column:
        stemmed = [stemmer.stem(token) for token in i]
        new_words.append(stemmed)
    dataframe["text"] = new_words
    return dataframe

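# Usage sketch with a tiny DataFrame whose "text" column already contains token
# lists (assumes nltk is imported; pandas and the sample rows are assumptions).
import pandas as pd

df = pd.DataFrame({"text": [["running", "quickly"], ["cats", "playing"]]})
print(df_stemmatize_dataframe(df)["text"].tolist())
# -> [['run', 'quick'], ['cat', 'play']]
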