def normalize_nltk_snowball(token):
    """Return the English Snowball stem of ``token``.

    A fresh ``SnowballStemmer('english')`` is built on every call.  The
    languages Snowball supports can be listed with
    ``print(" ".join(SnowballStemmer.languages))``.
    """
    return SnowballStemmer('english').stem(token)
def classify(self, sText):
    """Classify ``sText`` as "positive" or "negative".

    The text is tokenized with ``self.tokenize``; every non-punctuation
    token contributes the add-one-smoothed log likelihood of its stemmed
    unigram and of the stemmed bigram it forms with the following token,
    looked up in ``self.pos_dic`` and ``self.neg_dic``.

    Returns "positive" only when the positive score is strictly greater;
    ties (including empty input) return "negative".  No "neutral" label is
    ever produced.

    Raises ZeroDivisionError if a dictionary's total count is zero and at
    least one token is scored.
    """
    tokens = self.tokenize(sText)
    posProbability, negProbability = 0, 0
    posNum = float(sum(self.pos_dic.values()))
    negNum = float(sum(self.neg_dic.values()))
    stemmer = SnowballStemmer("english")

    for position, token in enumerate(tokens[:-1]):
        if isPunctuationMark(token):
            continue
        unigram = stemmer.stem(token)
        second_word = stemmer.stem(tokens[position + 1])
        try:
            bigram = unigram + " " + second_word
        except UnicodeDecodeError:
            # Mixed byte/unicode strings cannot be joined; skip the pair.
            continue
        # Add-one smoothing; logs are summed to avoid underflow.
        posProbability += math.log(float(self.pos_dic.get(bigram, 0) + 1) / posNum)
        posProbability += math.log(float(self.pos_dic.get(unigram, 0) + 1) / posNum)
        negProbability += math.log(float(self.neg_dic.get(bigram, 0) + 1) / negNum)
        negProbability += math.log(float(self.neg_dic.get(unigram, 0) + 1) / negNum)

    # The last token has no following word, so only its unigram is scored.
    # It is stemmed and punctuation-filtered exactly like every other
    # unigram; previously the raw token was looked up, which could not
    # match the stemmed keys used above.
    if tokens and not isPunctuationMark(tokens[-1]):
        last_unigram = stemmer.stem(tokens[-1])
        posProbability += math.log(float(self.pos_dic.get(last_unigram, 0) + 1) / posNum)
        negProbability += math.log(float(self.neg_dic.get(last_unigram, 0) + 1) / negNum)

    if posProbability > negProbability:
        return "positive"
    return "negative"
def parseOutText(f):
    """Return the stemmed body of an opened email file as one string.

    Everything after the first "X-FileName:" marker is taken as the body,
    punctuation is removed, and each whitespace-separated word is stemmed
    with the English Snowball stemmer.  The stems are joined by single
    spaces.  An empty string is returned when the marker is absent.

    example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  # always read from the start of the file
    sections = f.read().split("X-FileName:")
    if len(sections) <= 1:
        return ""

    # Python 2 two-argument translate: identity table + characters to delete.
    no_punct = sections[1].translate(string.maketrans("", ""), string.punctuation)
    stemmer = SnowballStemmer("english")
    return " ".join([stemmer.stem(token.strip()) for token in no_punct.split()])
def _text_to_words(self, text):
    '''
    Process a text and return a list of words.

    The processing performs the following steps:
    - lower-casing of the text
    - tokenization
    - stop-word removal
    - stemming of the remaining words
    '''
    # Lower-case the text.
    text = text.lower().strip()

    # Tokenization.
    tokens = word_tokenize(text, language="english")

    # Drop tokens starting with an apostrophe: the tokenizer turns
    # "I'd like" into ["I", "'d", "like"] and "'d" carries no information.
    tokens = [token for token in tokens if not token.startswith("'")]

    # Stop words: the collection's own list, plus NLTK's common English
    # words, plus punctuation.  A set gives O(1) membership tests instead
    # of scanning a concatenated list for every token.
    # NOTE(review): the original comment said parentheses were to be kept
    # (useful for boolean queries), but all of string.punctuation --
    # parentheses included -- is filtered; confirm which is intended.
    stop_words = set(self.stop_words)
    stop_words.update(string.punctuation)
    stop_words.update(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming.
    stemmer = SnowballStemmer(language="english")
    return [stemmer.stem(word) for word in tokens]
def parseOutText(f):
    """Return the stemmed body of an opened email file and close the file.

    The body is whatever follows the first "X-FileName:" marker.
    Punctuation is removed, every word is stemmed with the English
    Snowball stemmer, and the stems are joined with single spaces.  When
    the marker is missing an empty string is returned.

    Note that ``f`` is closed before returning.

    example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  # rewind in case the caller already read from the file
    parts = f.read().split("X-FileName:")

    stemmer = SnowballStemmer("english")
    words = ""
    if len(parts) > 1:
        body = parts[1].translate(string.maketrans("", ""), string.punctuation)
        words = ' '.join(stemmer.stem(token) for token in body.split())

    f.close()
    return words.strip()
def main():
    """Compare two translation hypotheses per line against a reference.

    Each input line holds ``hyp1 ||| hyp2 ||| ref``.  All three are
    stemmed with the English Snowball stemmer, scored with ``meteor``
    against the set of reference stems, and one number is printed per
    line: 1 if hyp1 scores higher, 0 on a tie, -1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default=baseline_path+'data/hyp1-hyp2-ref',
                        help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
                        help='Number of hypothesis pairs to evaluate')
    # islice(..., None) yields everything, so no sentinel such as
    # sys.maxint is needed when -n is omitted.
    opts = parser.parse_args()

    def sentences():
        # Stream the file lazily rather than loading every sentence.
        with open(opts.input) as handle:
            for line in handle:
                yield [part.strip().split() for part in line.split(' ||| ')]

    english_stemmer = SnowballStemmer("english")

    def stem_all(words):
        return [english_stemmer.stem(word) for word in words]

    for h1, h2, ref in islice(sentences(), opts.num_sentences):
        # Morphological stemming happens before the METEOR comparison.
        h1 = stem_all(h1)
        h2 = stem_all(h2)
        rset = set(stem_all(ref))
        h1_match = meteor(h1, rset)
        h2_match = meteor(h2, rset)
        if h1_match > h2_match:
            verdict = 1
        elif h1_match == h2_match:
            verdict = 0
        else:
            verdict = -1
        print(verdict)
def get_stemm_tags(self, tags):
    """Return the lower-cased English Snowball stems of ``tags``.

    ``tags`` is an iterable of strings.  Previously the argument was
    ignored and ``self.tags`` was always stemmed; the argument is now
    honoured, with ``self.tags`` kept as the fallback when ``tags`` is
    None so existing callers relying on the old behaviour still work.
    """
    source = self.tags if tags is None else tags
    current_stemmer = SnowballStemmer('english')
    return [current_stemmer.stem(tag.lower()) for tag in source]
def tokenize(string, stem=True, entire=False):
    """
    INPUT: string
    OUTPUT: a list of words

    Slashes and hyphens become spaces, the text is tokenized with a
    case-folding PottsTokenizer, listed punctuation characters and digits
    are stripped from every token, empty tokens and stop words are
    dropped, and (when ``stem`` is true) the rest is Snowball-stemmed.
    ``entire`` extends the stop list with ``loadEntireStopWord()``.
    """
    cleaned = string.replace("/", " ").replace("-", " ")
    raw_tokens = PottsTokenizer(preserve_case=False).tokenize(cleaned)

    # Strip these punctuation characters and digits from each token.
    punctuation = re.compile(r'[-.?!,":;$/*()|0-9]')
    stripped = [punctuation.sub("", word) for word in raw_tokens]
    non_empty = [word for word in stripped if word]

    # NLTK English stop words plus a hand-maintained extension.
    STOPWORDS = set(nltk.corpus.stopwords.words('english'))
    STOPWORDS.update(('would','does','got',"doesn't","it's","isn't","don't","i'm","i'll","i've",
                      "=","can't","didn't","etc","+","%","won't","that's","nikon","g","&",
                      "sure", "may", "yet", "ok","haven't","else","maybe","wouldn't","couldn't","via","rt","'","you're","almost","v","there's","#",'well','somehow',
                      'someone', 'something', 'sometime', 'sometimes', 'somewhere'))
    if entire:
        # Optional larger stop-word set.
        STOPWORDS.update(set(loadEntireStopWord()))

    token_list = [word for word in non_empty if word not in STOPWORDS]
    if not stem:
        return token_list

    stemmer = SnowballStemmer("english")
    return [stemmer.stem(token) for token in token_list]
def tokenize(self, document):
    """
    Yield one list of words per sentence of ``document``.

    The lower-cased document is split into sentences with NLTK's
    ``sent_tokenize``.  Each sentence is cleansed and word-tokenized;
    stop words are dropped when ``self.remove_stopwords`` is set and the
    words are Snowball-stemmed when ``self.stemming`` is set.
    """
    sentences = sent_tokenize(utils.to_unicode(document.lower()))
    stemmer = SnowballStemmer("english")

    for sentence in sentences:
        words = list(utils.tokenize(self.cleanse_text(sentence)))
        if self.remove_stopwords:
            words = [w for w in words if w not in self.en_stopwords]
        if self.stemming:
            words = [stemmer.stem(w) for w in words]
        yield words
def parseOutText(f):
    """Return the stemmed body of an opened email file as one string.

    Everything after the first "X-FileName:" marker is treated as the
    body: punctuation is removed, each word is stemmed with the English
    Snowball stemmer, and the stems are joined by single spaces.

    Returns an empty string when the marker is missing.  (Previously that
    case raised UnboundLocalError because the result variable was only
    assigned inside the ``if``.)

    example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  # go back to the beginning of the file
    all_text = f.read()

    # Split off the metadata block.
    content = all_text.split("X-FileName:")
    if len(content) < 2:
        return ""

    # Remove punctuation (Python 2 two-argument translate).
    text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

    stemmer = SnowballStemmer("english")
    # join() avoids the quadratic cost of repeated string concatenation.
    return " ".join(stemmer.stem(word) for word in text_string.split()).strip()
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    Return the set of distinct tokens of ``s``.

    ``s`` is passed through ``unify_units``, stripped of punctuation and
    word-tokenized.

    :param s: text to tokenize
    :param stem: stem each token with the English Snowball stemmer
    :param digit: also strip digit characters before tokenizing
    :param stop: drop NLTK English stop words
    :param use_re: first insert a space before each capitalised word
        that follows another character (splits CamelCase)
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    removable = string.punctuation
    if digit:
        removable += string.digits
    # Python 2 two-argument translate: identity table + characters to delete.
    table = string.maketrans("", "")
    tokens = set(word_tokenize(unify_units(s).translate(table, removable)))

    if stop:
        # Only load the stop list when it is actually needed.
        tokens -= set(stopwords.words('english'))
    if stem:
        stemmer = SnowballStemmer('english')
        tokens = set(stemmer.stem(word) for word in tokens)
    return tokens
def test_spanish(self):
    """Check Spanish Snowball stems, including a past crash case."""
    stemmer = SnowballStemmer('spanish')
    expected_stems = [
        ("Visionado", 'vision'),
        # The word 'algue' was raising an IndexError
        ("algue", 'algu'),
    ]
    for word, expected in expected_stems:
        assert stemmer.stem(word) == expected
def text_cleaner_and_tokenizer(texts):
    """
    Clean, filter and stem a list of Danish sentences.

    For each sentence every symbol in ``punctuation`` and every value in
    ``numbers`` (both defined outside this function) is deleted, the
    result is decoded from UTF-8 and lower-cased, tokenized, stripped of
    Danish stop words, stemmed, and joined back with single spaces.
    Progress is printed every 1000 sentences.

    :param texts: list of unprocessed (byte) strings
    :return: list of unicode strings
    """
    stopword_list = set(stopwords.words('danish'))
    stemmer = SnowballStemmer("danish", ignore_stopwords=False)
    filtered_texts = []

    for count, sentence in enumerate(texts, 1):
        for symbol in punctuation:
            sentence = sentence.replace(symbol, '')
        for num in numbers:
            sentence = sentence.replace(str(num), '')
        sentence = sentence.decode('utf-8').lower()

        kept = [stemmer.stem(word)
                for word in word_tokenize(sentence, language='danish')
                if word not in stopword_list]
        filtered_texts.append(' '.join(kept))

        if count % 1000 == 0:
            print(count)

    print('Done :D!')
    return filtered_texts
def stem_stopword_clean( vett_strings ):
    '''
    Return the unique Italian stems found in a list of strings.

    Each string is split on single spaces, every piece is stemmed with
    the Italian Snowball stemmer, and stems that are Italian stop words
    or were already collected are skipped.  Order of first appearance is
    preserved.

    :param vett_strings: list of strings (possibly multi-word)
    :return: list of distinct stems without stop words
    '''
    # Imported locally, as in the original.
    from nltk.stem.snowball import SnowballStemmer
    from nltk.corpus import stopwords

    stemmer = SnowballStemmer("italian")
    stop = set(stopwords.words('italian'))

    documents = []
    # Set mirror of `documents`: O(1) duplicate checks instead of scanning
    # the growing result list for every word.
    seen = set()
    for frasi in vett_strings:
        for parola in frasi.split(" "):
            stem_parola = stemmer.stem(parola)
            # NOTE(review): the stop-word test is applied to the stem, not
            # to the original word -- confirm that this is intended.
            if stem_parola in stop or stem_parola in seen:
                continue
            seen.add(stem_parola)
            documents.append(stem_parola)
    return documents
def pos_tokenizer(s):
    """Tokenize ``s`` into stemmed verb/noun phrases using POS chunking.

    Words of length <= 2 are dropped, the rest is POS-tagged and chunked
    with the VP / NP / N grammar below.  Every chunk becomes one string of
    its English Snowball stems joined by single spaces; words that do not
    fall inside any chunk are ignored.

    Returns a list of phrase strings.
    """
    from nltk.stem.snowball import SnowballStemmer

    texts = nltk.word_tokenize(s)
    texts = [word for word in texts if len(word) > 2]

    # Pull out noun and verb phrases.
    chunktext = nltk.pos_tag(texts)
    patterns = """
        VP:{<V.*><DT>?<JJ.*>?<NN.*>}
        NP:{<DT>?<JJ>*<NN.*>}
        N:{<NN.*>}
    """
    NPchunker = nltk.RegexpParser(patterns)
    st = SnowballStemmer('english')

    temp = []
    for phrase in NPchunker.parse(chunktext):
        # Chunks are Tree objects with a label(); un-chunked words are plain
        # (word, tag) tuples and are skipped.  This replaces a bare
        # ``except: pass`` that also hid genuine errors.
        if not hasattr(phrase, "label"):
            continue
        temp.append(' '.join(st.stem(word[0]) for word in phrase))
    return temp
def parseOutBody(f):
    """Return the stemmed body of an opened email file.

    The body is the text after the first "X-FileName:" marker, with
    punctuation removed and each word stemmed by the English Snowball
    stemmer.  Every stem is preceded by one space, so a non-empty result
    starts with a space.  An empty string is returned when the marker is
    absent.
    """
    from nltk.stem.snowball import SnowballStemmer
    import string

    f.seek(0)  # rewind to the start of the file
    sections = f.read().split("X-FileName:")
    if len(sections) <= 1:
        return ""

    # Remove punctuation, then split into individual words.
    tokens = sections[1].translate(string.maketrans("", ""), string.punctuation).split()
    stemmer = SnowballStemmer('english')
    return "".join(' ' + stemmer.stem(token.strip()) for token in tokens)
def __init__(self,df, column,n ):
    """Store the ``n`` most frequent word stems of ``df[column]`` in ``self.top``.

    All values of the column are joined into one text, split on
    whitespace and lower-cased; every token is reduced to its ASCII
    letters, empty tokens are dropped, and the rest is stemmed with the
    English Snowball stemmer.

    ``self.top`` is a list of stems ordered from most to least frequent.
    (The old code built unused pandas objects through ``Series.order``,
    which no longer exists in pandas, and exposed the keys in dict order.)
    """
    texto = " ".join(str(x) for x in df[column].values)
    stemmer = SnowballStemmer("english")

    stemm_words = []
    for token in texto.split():
        letters_only = re.sub('[^A-Za-z]+', '', token.lower())
        if not letters_only:
            continue
        try:
            stemm_words.append(str(stemmer.stem(letters_only)))
        except Exception:
            # Best effort, as before: a token the stemmer cannot handle
            # is skipped rather than aborting construction.
            continue

    self.top = [stem for stem, _count in Counter(stemm_words).most_common(n)]
def clean_text(text):
    """Return ``text`` cleaned, stop-word filtered and stemmed.

    Digits are removed, the text is word-tokenized and lower-cased,
    words listed in ``.\\source_data\\stopwords.csv`` and single
    punctuation characters are dropped, and the remainder is stemmed with
    the English Snowball stemmer (``ignore_stopwords=True``).  The stems
    are returned joined by single spaces.
    """
    # Remove digits, then build the lower-cased bag of words.
    without_digits = ''.join(ch for ch in text if not ch.isdigit())
    lowercase = [token.lower() for token in nltk.word_tokenize(without_digits)]

    # Load the project stop-word list (one word per row, no header).
    stopword_frame = pd.read_csv('.\\source_data\\stopwords.csv',encoding='latin1', header=None,names=['word'])
    excluded = list(stopword_frame['word'])

    punct = set(string.punctuation)
    kept = [word for word in lowercase
            if word not in excluded and word not in punct]

    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    return ' '.join(stemmer.stem(word) for word in kept)
def clean_single_word(word, lemmatizing="wordnet"):
    """
    Stem or lemmatize a single word.

    A word searched for in a cleaned bag-of-words must go through the
    same preprocessing as the bag itself.

    Inputs:  - word: A string containing the source word.
             - lemmatizing: One of "porter", "snowball" or "wordnet".

    Output:  - lemma: The resulting clean lemma or stem.

    Raises RuntimeError (after printing a message) for any other
    ``lemmatizing`` value.
    """
    if lemmatizing == "porter":
        return PorterStemmer().stem(word)
    if lemmatizing == "snowball":
        return SnowballStemmer('english').stem(word)
    if lemmatizing == "wordnet":
        return WordNetLemmatizer().lemmatize(word)

    print("Invalid lemmatizer argument.")
    raise RuntimeError
def parseOutText(f):
    """Return the stemmed body of an opened email file as one string.

    The body is the text after the first "X-FileName:" marker.
    Punctuation is removed, each word is stemmed with the English
    Snowball stemmer, and the stems are space-separated.  Returns an
    empty string when the marker is absent.

    example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)
    content = f.read().split("X-FileName:")  # split the metadata off
    if len(content) <= 1:
        return ""

    text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
    stemmer = SnowballStemmer("english")
    spaced = "".join(" " + stemmer.stem(word) for word in text_string.split())
    return spaced.lstrip()
def get_info_from_df(df, labeled=False, check_ratios=True, nostops=True, snowball=True, bigrams=True):
    """
    INPUT: DataFrame, boolean for presence of change_type labels,
           several optional parameters for text processing
    OUTPUT: processed texts, list of ratios, and (if ``labeled``) the
            change_type labels

    Texts come from each row's "text_no_abi" entries, lower-cased with
    newlines removed.  Optional steps, in order: ``add_ratios``, English
    stop-word removal, Snowball stemming, ``add_bigrams``.  ``ratios`` is
    an empty list when ``check_ratios`` is false.
    """
    change_texts, labels, ratios = [], [], []

    for _, row in df.iterrows():
        for position, text in enumerate(row["text_no_abi"]):
            change_texts.append(text.lower().replace("\n", ""))
            if labeled:
                labels.append(row["change_type"][position])

    if check_ratios:
        change_texts, ratios = add_ratios(change_texts)

    if nostops:
        stops = set(stopwords.words("english"))
        change_texts = [" ".join(w for w in text.split(" ") if w not in stops)
                        for text in change_texts]

    if snowball:
        stemmer = SnowballStemmer("english")
        change_texts = [" ".join(stemmer.stem(w) for w in text.split(" "))
                        for text in change_texts]

    if bigrams:
        change_texts = add_bigrams(change_texts)

    if labeled:
        return change_texts, ratios, labels
    return change_texts, ratios
def clean_text(list_o_text):
    """Return each document cleaned, stop-word filtered and stemmed.

    For every document, characters found in ``punctuation`` are replaced
    by spaces, the text is word-tokenized, tokens whose
    punctuation-stripped form is a stop word (NLTK English list,
    punctuation characters, and a few wiki-markup words) are dropped, and
    the rest is Snowball-stemmed and joined into one string.
    """
    stop = set(stopwords.words('english'))
    stop.update(punctuation)
    stop.update(['cite', 'cite_note', 'cite_ref', 'class', 'href', 'id',
                 'redirect', 'ref', 'refer', 'span', 'sup', 'title', 'wiki'])

    snowball = SnowballStemmer('english')

    cleaned = []
    for comic in list_o_text:
        # Replace punctuation with spaces so words stay separated.
        spaced = ''.join(' ' if char in punctuation else char for char in comic)
        kept = [word for word in word_tokenize(spaced)
                if word.strip(punctuation) not in stop]
        cleaned.append(' '.join(snowball.stem(word) for word in kept))
    return cleaned
def tokenize(text):
    """Word-tokenize ``text`` and return the list of Snowball stems.

    The English stemmer is created with ``ignore_stopwords=True``.
    """
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]
def clean_data(data):
    '''
    Stems and removes stop words from training and test data.

    For the 'query', 'product_title' and 'product_description' columns of
    ``data``, each cell has its HTML stripped, every non-alphanumeric
    character replaced by a space, its words stemmed with the English
    Snowball stemmer, and English stop words removed.  The frame is
    modified in place and also returned.

    Fixes over the previous version:
    - stop words are removed word by word; before, the filter iterated
      over the *characters* of the text and deleted single-letter stop
      words from inside every word;
    - UserWarnings are turned into errors only around the HTML parse
      instead of for the whole process;
    - when the parser warns, the raw cell text is used rather than the
      previous row's text (or an unbound variable);
    - the removed ``DataFrame.set_value`` is replaced by ``.at``.
    '''
    stemmer = SnowballStemmer('english')
    stop = set(stopwords.words('english'))

    for column_name in ['query', 'product_title', 'product_description']:
        for index, row in data.iterrows():
            raw_text = row[column_name]
            with warnings.catch_warnings():
                warnings.simplefilter('error', UserWarning)
                try:
                    extracted_data = BeautifulSoup(raw_text, 'lxml').get_text(' ')
                except UserWarning:
                    # The parser objects to this input; keep it as-is.
                    extracted_data = raw_text

            cleaned_data = re.sub('[^a-zA-Z0-9]', ' ', extracted_data)
            stemmed_words = [stemmer.stem(word) for word in cleaned_data.split()]
            kept_words = [word for word in stemmed_words if word not in stop]
            data.at[index, column_name] = unicode(' '.join(kept_words))
    return data
def get_core_words( text ):
    """Return the stemmed, lower-cased nouns of ``text``.

    The text is tokenized and POS-tagged; only tokens whose tag starts
    with 'N' are kept.  They are lower-cased, filtered against NLTK's
    English stop words and a minimum length of 2, and stemmed with the
    English Snowball stemmer.
    """
    # Tokenization, keeping only nouns, lower-cased.
    nouns = [word.lower() for word, pos in pos_tag(word_tokenize(text))
             if pos.startswith('N')]

    # Load the stop list once; it was previously re-read for every token.
    stop_words = set(stopwords.words('english'))
    minlength = 2
    kept = [token for token in nouns
            if token not in stop_words and len(token) >= minlength]

    # Stem to the base form.  Only byte strings need decoding; text that
    # is already unicode is passed through unchanged.
    stemmer = SnowballStemmer("english")
    return [
        stemmer.stem(token.decode("utf8") if isinstance(token, bytes) else token)
        for token in kept
    ]
def processFile(fh):
    """Print the stemmed, upper-cased sentences of a gzipped XML file.

    ``fh`` is a path to a gzip-compressed XML document.  For every
    paragraph matched by ``DOC[@type="story"]/TEXT/P`` the text is split
    into sentences and tokens; purely alphabetic tokens that are not
    English stop words are Snowball-stemmed, and each sentence is printed
    as one upper-case, space-joined line.

    A paragraph that has no text, or whose processing fails, is skipped.
    Always returns True.
    """
    with gzip.open(fh, 'rb') as f:
        tree = etree.parse(f)
    root = tree.getroot()

    r = re.compile('^[a-zA-Z]+$')
    s = SnowballStemmer("english")
    # Loop invariants: previously the stop list was re-read for every
    # token and the tokenizers rebuilt for every paragraph/sentence.
    stop_words = set(stopwords.words('english'))
    sentence_splitter = PunktSentenceTokenizer()
    word_splitter = TreebankWordTokenizer()

    for p in root.xpath('DOC[@type="story"]/TEXT/P'):
        if not p.text:
            continue
        try:
            for sentence in sentence_splitter.sentences_from_text(p.text):
                tokens = word_splitter.tokenize(sentence)
                # Keep alphabetic tokens, drop stop words, stem the rest.
                stop_filtered = [s.stem(w) for w in tokens
                                 if r.match(w) and w.lower() not in stop_words]
                # Single-argument form: works as a Python 2 statement and
                # as a Python 3 call.
                print(" ".join(stop_filtered).upper())
        except Exception:
            # Best effort, as before (but no longer swallowing
            # KeyboardInterrupt/SystemExit): skip a failing paragraph.
            continue
    return True
def prune(doc, stoplist = None, stem = True, english_dictionary_words = False):
    """Tokenize one document and strip it down to its useful words.

    Steps: lower-case and tokenize with ``utils.simple_preprocess``; drop
    punctuation tokens and apply ``rmPunct`` to each word; drop words in
    ``stoplist`` (if given), a few fixed junk tokens and NLTK's English
    stop words; optionally Snowball-stem; optionally keep only words
    accepted by the en_US enchant dictionary (checked after stemming).

    Returns the resulting list of words.
    """
    # Tokenize the document and make it lowercase.
    temp = utils.simple_preprocess(doc.lower())

    # Remove freestanding punctuation and punctuation in words.
    temp = [w for w in temp if w not in string.punctuation]
    temp = [rmPunct(w) for w in temp]

    # Remove words in the passed stoplist.
    if stoplist:
        temp = [w for w in temp if w not in stoplist]

    # Fixed junk tokens plus English stop words, built once: both were
    # previously rebuilt for every single word.
    unwanted = {'[', ']', "'", '\n', 'com'}
    unwanted.update(stopwords.words('english'))
    temp = [w for w in temp if w not in unwanted]

    # Stem the remaining words.
    if stem:
        stemmer = SnowballStemmer('english')
        temp = [stemmer.stem(w) for w in temp]

    if english_dictionary_words:
        d = enchant.Dict("en_US")
        temp = [w for w in temp if d.check(w)]
    return temp
def read_corpus(corpus_file, use_sentiment):
    """Read the corpus and return ``(documents, labels)``.

    Each non-blank line is whitespace-split; tokens from index 3 onwards
    are the text.  With the stop-word switch off (the hard-coded default)
    the text tokens are Snowball-stemmed; with it on they are filtered
    against ``stopwords.txt``.  Either way the document is
    ``find_ngrams(tokens, 2)``.

    The label is token 1 when ``use_sentiment`` is true (2-class problem:
    positive vs negative), otherwise token 0 (6-class problem: books,
    camera, dvd, health, music, software).
    """
    documents = []
    labels = []

    use_stopword = False
    stopword_set = set()
    snowballstemmer = None
    if use_stopword:
        # Read once and close the file; it used to be reopened for every
        # corpus line and never closed.
        with open('stopwords.txt', 'r') as stopwordfile:
            for stop_line in stopwordfile:
                stopword_set.update(stop_line.split(','))
    else:
        # One stemmer for the whole corpus instead of one per line.
        snowballstemmer = SnowballStemmer('english')

    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                # A blank line has neither label nor text.
                continue

            if use_stopword:
                features = [token for token in tokens[3:]
                            if token not in stopword_set]
            else:
                features = [snowballstemmer.stem(word) for word in tokens[3:]]
            documents.append(find_ngrams(features, 2))

            labels.append(tokens[1] if use_sentiment else tokens[0])

    return documents, labels
def parseOutText(f):
    """Return the stemmed body of an opened email file as one string.

    The text after the first "X-FileName:" marker has its punctuation
    removed and each word stemmed with the English Snowball stemmer; the
    stems are joined by single spaces.  Returns an empty string when the
    marker is missing.

    example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    stemmer = SnowballStemmer("english")

    f.seek(0)  # start from the top regardless of prior reads
    sections = f.read().split("X-FileName:")

    words = ""
    if len(sections) > 1:
        # Remove punctuation, then stem word by word.
        body = sections[1].translate(string.maketrans("", ""), string.punctuation)
        words = ' '.join(map(stemmer.stem, body.split()))
    return words
def cleaned_bag_of_words_dataset(data_matrix, stemming=False, stop_words=None, TFIDF=False, ngram_range=(1, 1), max_features=None, length=False, number_in_tweet=False, words_present=[]):
    """Build a dense bag-of-words feature matrix from tweets.

    The tweet text is read from index 2 of every row of ``data_matrix``.
    Texts are lower-cased (and tokenized + Snowball-stemmed when
    ``stemming`` is set), then vectorized with a ``TfidfVectorizer`` or
    ``CountVectorizer``.  Optional extra columns are appended in this
    order: token count (``length``), mean of the integers with absolute
    value below 10 found in the tweet, 0 when there are none
    (``number_in_tweet``), and one 0/1 column per entry of
    ``words_present``.

    ``words_present`` is only read, never modified.
    """
    if stemming:
        stemmer = SnowballStemmer("english")
        tweets = []
        for data_point in data_matrix:
            tokens = word_tokenize(data_point[2].lower().decode("utf8"))
            tweets.append(" ".join([stemmer.stem(word) for word in tokens]))
    else:
        tweets = [data_point[2].lower() for data_point in data_matrix]

    vectorizer_class = TfidfVectorizer if TFIDF else CountVectorizer
    vectorizer = vectorizer_class(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features)
    dataset = vectorizer.fit_transform(tweets).toarray()

    if length:
        lengths = np.array([[len(word_tokenize(data_point[2].decode("utf8")))]
                            for data_point in data_matrix])
        dataset = np.concatenate((dataset, lengths), axis=1)

    if number_in_tweet:
        numbers = []
        for data_point in data_matrix:
            small_numbers = [number
                             for number in list_of_ints_from_string(data_point[2])
                             if abs(number) < 10]
            if small_numbers:
                numbers.append([np.mean(small_numbers)])
            else:
                numbers.append([0])
        dataset = np.concatenate((dataset, numbers), axis=1)

    for word in words_present:
        word_present = np.array(
            [[int(word.lower() in word_tokenize(data_point[2].lower().decode("utf8")))]
             for data_point in data_matrix])
        dataset = np.concatenate((dataset, word_present), axis=1)

    return dataset
def show_entry_fields():
    """Search hh.ru for vacancies and show the results in a new window.

    The search text is read from the ``e1`` entry widget.  The function
    downloads the first page of matching vacancies, lists them in a text
    box, tokenizes the "requirement" snippets, fits an LDA model and a
    TF-IDF/NMF model on the tokens, writes one matched skill to
    ``Output.txt``, draws a word cloud of the TF-IDF terms, and adds two
    action buttons.

    Relies on names defined outside this function: ``e1``, ``root``,
    ``list_skills``, ``scoring``, ``testing`` and the GUI/ML imports.
    """
    # NOTE(review): the query text is concatenated into the URL without
    # URL-encoding.
    url = 'http://api.hh.ru/vacancies?text=' + ( e1.get()) + '&page=0&per_page=100'
    data = requests.get(url).json()
    print("Поиск вакансий")
    # JSON round trip of the already-decoded response.
    p = json.dumps(data)
    res2 = json.loads(p)
    i = 0
    texts = []
    total_word = []
    window = tk.Toplevel(root)
    window.minsize(1300, 1000)
    window.title(u"Вывод данных")
    # Vacancy list: heading label and the text box that receives one line
    # per vacancy.
    w00 = Label(window, text=u"ВАКАНСИИ", font="Times")
    w00.place(relx=0.2, rely=0.01)
    t1 = Text(window, height=60, width=75)
    t1.place(relx=0.01, rely=0.03)
    # Second label and text box, placed in the lower right part.
    w11 = Label(window, text=u"НАПИСАТЬ СОПРОВОДИТЕЛЬНОЕ ПИСЬМО", font="Times")
    w11.place(relx=0.64, rely=0.57)
    t2 = Text(window, height=20, width=70)
    t2.place(relx=0.52, rely=0.6)
    while i < len(res2['items']):
        a = ((res2['items'][i]['id']))  # not used afterwards
        aa = ((res2['items'][i]['snippet']['requirement']))
        print(aa)
        texts.append(aa)
        # Word tokens of the requirement text are pooled across vacancies.
        tokenizer = RegexpTokenizer(r'\w+')
        (total_word.extend(tokenizer.tokenize(str(aa))))
        # One numbered line per vacancy: "<n>) <name> | <area name>".
        aaa = str(i + 1) + ') ' + str(res2['items'][i]['name']) + ' | ' + str( res2['items'][i]['area']['name']) + '\n'
        t1.insert(END, (aaa))
        i = i + 1
    # ---- building the results window
    # NOTE(review): `stopwords` and `stemmer` are assigned but not used
    # below; the filtering uses `en_stop` and the Porter stemmer.
    stopwords = nltk.corpus.stopwords.words('english')
    en_stop = get_stop_words('en')
    stemmer = SnowballStemmer("english")
    # ---- latent Dirichlet allocation
    # NOTE(review): resetting `texts` discards the requirement strings
    # collected in the loop above.
    texts = []
    stopped_tokens = [i for i in total_word if not i in en_stop]
    p_stemmer = PorterStemmer()
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # All stemmed tokens form a single document for the LDA corpus.
    texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.LdaModel(corpus, num_topics=100, id2word=dictionary, passes=20)
    a = ldamodel.print_topics(num_topics=10, num_words=7)
    num_topics = 5
    topic_words = []
    for i in range(num_topics):
        # Ten terms for each of the first five topics, mapped to words.
        tt = ldamodel.get_topic_terms(i, 10)
        topic_words.append([dictionary[pair[0]] for pair in tt])
    jj = 0
    while jj < len(topic_words):
        # The formatted topic text is built but not displayed (the widget
        # it was inserted into is disabled).
        topic11 = ((u"Тема #%d:" % (jj + 1)) + "\n" + "-".join(topic_words[jj]) + "\n")
        jj = jj + 1
    # ---- identifying the core competencies
    # Each token is passed to the vectorizer as its own document.
    vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=.5)
    tfv = vec.fit_transform(stopped_tokens)
    terms = vec.get_feature_names()
    # Skills that occur among the TF-IDF terms.
    result = list(set(list_skills) & set(terms))
    print(result)
    # NOTE(review): raises IndexError when fewer than three skills match;
    # set order is arbitrary, so which skill is written is not fixed.
    text_file = open("Output.txt", "w")
    text_file.write(result[2])
    text_file.close()
    wc = WordCloud(height=1000, width=1000, max_words=1000).generate(" ".join(terms))
    nmf = NMF(n_components=11).fit(tfv)  # fitted model is not used afterwards
    # ---- plot of the term distribution
    w8 = Label(window, text=u"РАСПРЕДЕЛЕНИЕ НАВЫКОВ", font="Times")
    w8.place(relx=0.66, rely=0.01)
    fig = plt.figure(figsize=(5, 5))
    im = plt.imshow(wc)
    # Embed the matplotlib figure in the Tk window.
    canvas = FigureCanvasTkAgg(fig, master=window)
    canvas.show()
    canvas.get_tk_widget().place( relx=0.54, rely=0.03)
    canvas._tkcanvas.place(relx=0.52, rely=0.03)
    # ---- sentiment scoring
    # Action buttons wired to the external `scoring` and `testing` callbacks.
    c = Button(window, text=u"Подтвердить квалификацию", font="Times 14 bold", command=scoring, bg="deep sky blue")
    c.place(relx=0.95, rely=0.97, anchor=SE)
    c1 = Button(window, text=u"Откликнуться", font="Times 14 bold", command=testing, bg="lime green")
    c1.place(relx=0.7, rely=0.97, anchor=SE)
"King Sihanouk declined to chair talks in either place.", "A U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.2", "But in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.", "Left out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians." ] sents = [ "Budget negotiations between the White House and House Republicans were delayed on several issues.", "At issue were provisions that included requiring Federal Health Insurance providers to provide contraceptives to women as Well as a provision to build a road across a wildlife preserve in Alaska.", "The contraceptive issue faced an uncertain future while Clinton likely will veto the road.", "There is disagreement also on how to spend the funding on education.", "This year's budget discussions also have been hampered because it is the first time since budget procedures were established in 1974 that there has been a surplus, preventing agreement on a budget resolution." ] sentences = parser.raw_parse_sents(sents) language = 'english' stemmer = SnowballStemmer(language) stoplist = set(stopwords.words(language)) for sent in sentences: phrases = [] parsestr = unicode(list(sent)[0]) #print 'Sent:', parsestr tokens = Tree.fromstring(parsestr).leaves() print tokens hash_pos_tokens, phrases = get_parse_info(parsestr, stemmer, language, stoplist) check = prune_phrases(phrases, stoplist, stemmer, language) for x in check: print(unicode(x)) print('No. of phrases:', len(check))
import emoji
import tweepy

# Original Working Directory
owd = os.getcwd()

# Twitter App Credentials
# NOTE(review): live-looking API secrets are hard-coded in source; they
# should be rotated and loaded from the environment or a secrets store.
consumer_key = "N6EHubErC6jwd5eDqDBoJ3iW1"
consumer_secret = "oOmJLOmlaEk7bR6R6KYsFguS2yeRmducUOKIWGZ8wmRuQ70nB0"
access_key = "598773124-jLVnqszY1MMYDbeD1vjBYeq6rx5O2QxCxtCm3IFM"
access_secret = "URaUh3VzdJJ6jgLejtny5U4I5uo4wlKGCpDgOwANn0ixZ"

# Shared text-processing helpers, built once at import time.
en_stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()


def write_csv(file_name):
    """Open ``file_name`` for writing and create a CSV writer on it.

    As visible here nothing is written: the file is created (or
    truncated) and closed again.
    """
    with open(file_name, 'w') as f:
        writer = csv.writer(f)


def get_tweets(searchQuery, lang, tweets_max=1000):
    """Authenticate against the Twitter API and prepare a tweet search.

    Only the set-up part of this function is visible in this chunk.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    maxTweets = tweets_max  # Some arbitrary large number
    tweetsPerQry = 100  # this is the max the API permits
    fName = 'emirates_mentioned_tweets'  # We'll store the tweets in a CSV file.
    sinceId = None
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from imblearn.over_sampling import SMOTE
from collections import Counter
import re
import nltk
import csv
from nltk.stem.snowball import SnowballStemmer
# Module-level English stemmer, created with ignore_stopwords=True.
stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
from nltk.corpus import stopwords  # Import the stop word list


def extract_numbers(tags):
    """Parse a comma-separated string of integers.

    Every character other than a digit or a comma is removed, the rest is
    split on commas and each piece is converted with ``int``.  The result
    is whatever ``map`` returns (a list on Python 2, a lazy iterator on
    Python 3).  An empty piece (e.g. from an empty string or ",,") makes
    ``int`` raise ValueError.
    """
    tags_only = re.sub("[^0-9,]", '', tags).split(',')
    tagsList = map(int, tags_only)
    return tagsList


def getSolution(list1, list2, list3, list4):
    """Combine four comma-separated tag strings.

    Only the start of this function is visible in this chunk: each input
    is parsed with ``extract_numbers`` and de-duplicated, and the first
    two are then turned into Counters.
    """
    # De-duplicate the numbers of each input independently.
    list1 = list(set(extract_numbers(list1)))
    list2 = list(set(extract_numbers(list2)))
    list3 = list(set(extract_numbers(list3)))
    list4 = list(set(extract_numbers(list4)))
    # After de-duplication every count in these Counters is 1.
    list1 = Counter(list1)
    list2 = Counter(list2)
from gensim.utils import tokenize
from nltk.stem.snowball import SnowballStemmer
from utils import compose


def _stem_all(tokens):
    """Lazily yield the Russian Snowball stem of every token.

    One stemmer (created with ignore_stopwords=True) is built per token
    stream, on first iteration; previously a new stemmer was constructed
    for every single token.
    """
    stemmer = SnowballStemmer("russian", True)
    for token in tokens:
        yield stemmer.stem(token)


# Processing steps combined by ``compose`` in ``preprocess``:
# tokenization, then stemming.
preprocessors = [
    lambda text: list(tokenize(text)),
    _stem_all,
]

# Tokens removed by ``rm_stop_words``; empty by default.
stop_words = []


def rm_stop_words(text):
    """Return the tokens of ``text`` that are not in ``stop_words``."""
    return [token for token in text if token not in stop_words]


def preprocess(texts):
    """Lazily apply the composed ``preprocessors`` to every text."""
    return (compose(*preprocessors)(text) for text in texts)


if __name__ == "__main__":
    from argparse import ArgumentParser
    from read import read
    from pprint import pprint

    parser = ArgumentParser()
    parser.add_argument("-i", dest="path", type=str, help="path to text")
    args = parser.parse_args()
    pprint(list(preprocess(read(args.path))))
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter
import string
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
from nltk.corpus import stopwords

# Fetch the stop-word corpus at import time so the lookup below can succeed.
nltk.download('stopwords')
stopword_list = stopwords.words('english')
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()


def remove_punc(tokens):
    """Return ``tokens`` without punctuation and a few tokenizer artefacts.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    clean_tokens = []
    for tok in tokens:
        # Substring test against string.punctuation: single punctuation
        # characters are dropped, and so is any token that is a contiguous
        # run of that string (for example "<=") or the empty string.
        if tok not in string.punctuation:
            # Quote markers and the possessive clitic are filtered explicitly.
            if tok != "''" and tok != '``' and tok != "'s":
                clean_tokens.append(tok)
    return clean_tokens


def remove_stopwords(tokens):
    """Collect the tokens that are not in ``stopword_list``.

    NOTE(review): the visible body builds ``tokens_clean`` but has no return
    statement; the rest of the function appears to be missing.
    """
    tokens_clean = []
    for tok in tokens:
        if tok not in stopword_list:
            tokens_clean.append(tok)
class TextSummarizer:
    """Frequency-based extractive summarizer for the text of a .docx file.

    Typical flow: ``input_text`` -> ``tokenize_sentence`` -> ``cal_freq`` ->
    ``compute_sentence`` -> ``sumAvg`` -> ``print_summary``.
    """
    #ps = PorterStemmer()
    stemmer = SnowballStemmer("english")
    # Words ignored when counting: English stop words plus punctuation marks.
    stopWords = set(stopwords.words("english") + list(punctuation))
    text = ""
    sentences = ""

    def tokenize_sentence(self):
        """Split ``self.text`` into word tokens, print them and return them."""
        words = word_tokenize(self.text)
        print(words)
        return words

    def input_text(self):
        """Load the paragraphs of ``crow.docx`` into ``self.text``.

        Paragraphs are joined with single spaces, so the text holds the
        document's words rather than the ``repr`` of a Python list.

        Raises:
            ValueError: if the document has 10 characters or fewer. The file
                is fixed, so retrying could never succeed.
        """
        document = Document('crow.docx')
        self.text = " ".join(para.text for para in document.paragraphs)
        if len(self.text) <= 10:
            raise ValueError("Please input the text as length at least 10")

    def cal_freq(self, words):
        """Count lower-cased occurrences of every non-stop word.

        Args:
            words: iterable of word tokens.

        Returns:
            dict mapping lower-cased word -> occurrence count.
        """
        freqTable = dict()
        for word in words:
            word = word.lower()
            if word in self.stopWords:
                continue
            #word = stemmer.stem(word)
            freqTable[word] = freqTable.get(word, 0) + 1
        return freqTable

    def compute_sentence(self, freqTable):
        """Score each sentence of ``self.text`` by the words it contains.

        A sentence's score is the sum of the frequencies of every
        ``freqTable`` word that occurs in it as a substring of the lower-cased
        sentence. Sentences containing none of the words get no entry.

        Args:
            freqTable: dict of word -> frequency, as built by ``cal_freq``.

        Returns:
            dict mapping sentence -> score. Also stored: ``self.sentences``.
        """
        self.sentences = sent_tokenize(self.text)
        sentenceValue = dict()
        for sentence in self.sentences:
            lowered = sentence.lower()
            for word, frequency in freqTable.items():
                if word in lowered:
                    # Add the word's frequency, not its position in the table.
                    sentenceValue[sentence] = (
                        sentenceValue.get(sentence, 0) + frequency)
        print(sentenceValue)
        return sentenceValue

    def sumAvg(self, sentenceValue):
        """Return the integer-truncated mean sentence score (0 if none)."""
        if not sentenceValue:
            # No sentence was scored; avoid dividing by zero.
            return 0
        sumValues = sum(sentenceValue.values())
        # Average value of a sentence from original text
        return int(sumValues / len(sentenceValue))

    def print_summary(self, sentenceValue, average):
        """Build the summary from the high-scoring sentences.

        Args:
            sentenceValue: dict of sentence -> score.
            average: mean score; a sentence is kept when its score is greater
                than 1.5 times this value.

        Returns:
            The selected sentences in document order, each preceded by one
            space (an empty string when nothing qualifies).
        """
        threshold = 1.5 * average
        return "".join(
            " " + sentence
            for sentence in self.sentences
            if sentence in sentenceValue and sentenceValue[sentence] > threshold)
'type': dataset.pre_clean_len.dtype, 'description': 'Length of the text before cleaning' }, 'dataset_shape': dataset.shape } pprint(data_dict) count = 0 from nltk import PorterStemmer count = 0 stop = stopwords.words('english') stop.remove('not') stop.remove('against') mystemmer = SnowballStemmer("english") length = len(textData) prev = -1 done_count = 0 import time as tm startingTime = tm.time() print("Starting Time is", startingTime) for i in textData: done_count += 1 done_percent = (int)(100 * done_count / length) if done_percent != prev: prev = done_percent print("DONE = ", done_percent, "%.", sep='') textData[count] = cleanText(i)
import nltk
from nltk import stem
from nltk.stem.snowball import SnowballStemmer
from tflearn.layers.core import activation

stemmer = SnowballStemmer(language='swedish')
import numpy
import tflearn
import tensorflow
import random
import json
import pickle

#Get data
# The intents file is read as UTF-8 and parsed into ``data``.
with open("data/intents-mdh.json", encoding='utf-8') as file:
    data = json.load(file)

# Prefer the cached, preprocessed data; fall back to rebuilding it from the
# intents when loading the cache fails.
# NOTE(review): the bare ``except`` also hides errors other than a missing
# cache file (corrupt pickle, wrong tuple size, KeyboardInterrupt).
try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except:
    words = []
    labels = []
    docs_x = []  # token list of every pattern
    docs_y = []  # intent tag of the pattern at the same position in docs_x
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(intent["tag"])
import re, os, json, operator, functools
import numpy as np
from collections import Counter, deque
from nltk.stem.snowball import FrenchStemmer, SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

# sentenceTokenizer = nltk.data.load('tokenizers/punkt/PY3/french.pickle')
stemmer = SnowballStemmer("french")
tokenizer = WordPunctTokenizer()
# stemmer = FrenchStemmer()


# Takes a string and return a list of words
def clean_text(text, stem=False):
    """Normalise French corpus text before tokenisation.

    Args:
        text: raw text to clean.
        stem: not used in the visible part of the body.

    NOTE(review): only the character-normalisation steps are present here; the
    tokenising and return described by the comment above are not visible.
    """
    # Removing curly braces, those are metadata in the corpus
    # The match is greedy: on a line with several {...} groups everything from
    # the first "{" to the last "}" is removed. "." does not cross newlines.
    text = re.sub(r'\{.*}', '', text)
    # Remove x2, x3 etc. (repeating verse annotation)
    text = re.sub(r'(x|X)\d+', '', text)
    # Replacing purely stylistics chars
    # Ligatures and accented letters are mapped to plain lower-case letters.
    text = re.sub(r'æ', 'ae', text)
    text = re.sub(r'œ', 'oe', text)
    text = re.sub(r'[ìíîï]', 'i', text)
    text = re.sub(r'[ýÿ]', 'y', text)
    text = re.sub(r'[òóôõö]', 'o', text)
    text = re.sub(r'[áâãä]', 'a', text)
    text = re.sub(r'ë', 'e', text)
    text = re.sub(r'ñ', 'n', text)
    text = re.sub(r'[ûü]', 'u', text)
    # Every typographic quotation mark becomes a straight double quote.
    text = re.sub(r'[«“”»]', '"', text)
# Reference: https://www.nltk.org/api/nltk.stem.html
import nltk
from nltk.stem.snowball import SnowballStemmer

# List every language the Snowball stemmer supports on one line.
supported_languages = SnowballStemmer.languages
print(" ".join(supported_languages))

# Stem a single Finnish word, showing it before and after.
stemmer = SnowballStemmer("finnish")
word = "lumipallojakaan"
print(word)
stemmed_word = stemmer.stem(word)
print(stemmed_word)
class TweetProcessor:
    """Clean a DataFrame of tweets.

    Input:
        A DataFrame with a ``text`` column holding the tweet bodies.
    Attributes:
        data: DataFrame of tweets
    Procedure (see ``process_data``):
        1. Collapse runs of whitespace and strip the ends
        2. Change the text to lowercase
        3. Drop every non-ASCII character
        4. Remove near-duplicate tweets
        5. Add a ``stemmed`` column
    Output:
        Cleaned DataFrame of tweets
    """
    # Two tweets closer than this Levenshtein distance count as duplicates.
    levenshtein_distance = 20
    stemmer = SnowballStemmer("english")

    def __init__(self, data):
        self.data = data

    def __remove_whitespaces(self):
        """Collapse whitespace runs to one space and strip both ends."""
        # ``apply`` returns a Series aligned with the frame's index, unlike a
        # bare ``map`` object.
        self.data['text'] = self.data['text'].apply(
            lambda tweet: re.sub(r'\s+', ' ', tweet).strip())
        print("Removed whitespaces")

    def __lowercase(self):
        """Lower-case every tweet."""
        self.data['text'] = self.data['text'].apply(
            lambda tweet: tweet.lower())
        print("Lowercased")

    def __filter_alphabetic(self):
        """Drop every character that cannot be encoded as ASCII."""
        # Decode again so the column keeps holding text; the later
        # ``split(" ")`` would fail on bytes.
        self.data['text'] = self.data['text'].apply(
            lambda tweet: tweet.encode('ascii', 'ignore').decode('ascii'))
        print("Filtered alphabetic")

    def __filter_duplicates(self):
        """Remove near-duplicate tweets, keeping the first of each group.

        A tweet is dropped when an earlier tweet lies within
        ``levenshtein_distance`` edits of it. Rows are dropped by their real
        index label and the index is renumbered afterwards.
        """
        texts = list(self.data['text'])
        labels = list(self.data.index)
        total = len(texts)
        duplicates = set()
        for i, a in enumerate(texts):
            os.system('clear')
            print("Filtered: " + str(100 * i / total) + " %")
            # Only look forward, so one member of each close pair survives.
            for j in range(i + 1, total):
                if lv.distance(a, texts[j]) < self.levenshtein_distance:
                    duplicates.add(labels[j])
        self.data = self.data.drop(list(duplicates), errors='ignore')
        self.data = self.data.reset_index(drop=True)
        print("Filtered duplicates")

    def __stem_data(self):
        """Add a ``stemmed`` column with every space-separated word stemmed."""
        self.data['stemmed'] = self.data["text"].apply(lambda tweet: " ".join(
            [self.stemmer.stem(word) for word in tweet.split(" ")]))
        print("Stemmed")

    def process_data(self):
        """Run every cleaning step in order and return the DataFrame."""
        self.__remove_whitespaces()
        self.__lowercase()
        self.__filter_alphabetic()
        self.__filter_duplicates()
        self.__stem_data()
        return self.data
def main():
    """Build a per-category corpus from DBpedia articles and index it in Lucene.

    Command line: ``python create_category_corpus.py NUMBER_TOP_CATEGORY``.

    Steps:
      1. Load the category profiles and connect to the local MongoDB.
      2. Load the category corpus from its pickle cache, or build it from the
         ``article_categories`` collection and the entity index, then cache it.
      3. Copy the ``*.py`` files next to the new index as a backup.
      4. Write one Lucene document per category, with raw and stemmed fields.
    """
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command: python create_category_corpus.py NUMBER_TOP_CATEGORY')
        quit()
    NUMBER_TOP_CATEGORY = int(sys.argv[1])
    print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY))
    print('loading category profiles')
    profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz')
    print('finish loading category profiles')
    system_flag = platform.system()
    cwd = os.getcwd()
    # initialize mongo client
    # The MongoDB port differs between the Windows and non-Windows setups.
    if system_flag == 'Windows':
        client = pymongo.MongoClient("localhost", 27017)
    else:
        client = pymongo.MongoClient("localhost", 58903)
    db = client.wiki2015
    wiki_article_categories = db['article_categories']
    category_corpus = {}
    pkl_filename = 'category_dbpedia_corpus_top%d_fsdm3.pkl.gz' % (
        NUMBER_TOP_CATEGORY)
    if system_flag == 'Windows':
        lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3',
                                            'BM25', True)
    else:
        lucene_dbpedia_fsdm = Lucene_Object(
            '%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True)
    cnt = 0
    if os.path.exists(pkl_filename) == True:
        #if False==True:
        # A cached corpus exists: reuse it instead of querying MongoDB.
        print('loading category corpus')
        category_corpus = load_zipped_pickle(pkl_filename)
    else:
        for item in wiki_article_categories.find():
            # Categories are stored as one "|"-separated string.
            list_category = item['categories'].strip().split('|')
            uri_article = item['uri']
            title = findTitle(uri_article)
            entity_content_dict = {}
            doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex(
                title, 'title', False)
            # Skip articles that have no document in the entity index.
            if doc_entity is None:
                continue
            # Copy each field and its stemmed variant from the entity document.
            for f in [
                    'names', 'attributes', 'categories', 'similar_entities',
                    'related_entities', 'catchall'
            ]:
                entity_content_dict[f] = doc_entity[f]
                entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' + f]
            # Entities with an empty catch-all field carry no usable text.
            if len(entity_content_dict['catchall'].strip()) == 0:
                continue
            # Only the first NUMBER_TOP_CATEGORY categories are considered.
            for cat in list_category[:NUMBER_TOP_CATEGORY]:
                # Keep only categories that are present in the loaded profiles.
                if ('<http://dbpedia.org/resource/Category:' + cat +
                        '>') not in profile:
                    continue
                if cat not in category_corpus:
                    category_corpus[cat] = []
                # Cap every category at 300 entities.
                if len(category_corpus[cat]) < 300:
                    category_corpus[cat].append(entity_content_dict)
            #cnt+=1
            #if cnt>20:
            #break
        print('saving corpus to pkl.gz')
        save_zipped_pickle(category_corpus, pkl_filename)
    client.close()
    # begin write the data into index
    print('begin write into index')
    if system_flag == 'Windows':
        LUCENE_INDEX_DIR = 'mmapDirectory\\category_corpus_dbpedia201510_top' + str(
            NUMBER_TOP_CATEGORY) + '_fsdm3'
    else:
        LUCENE_INDEX_DIR = '%s/mmapDirectory/category_corpus_dbpedia201510_top' % (
            cwd) + str(NUMBER_TOP_CATEGORY) + '_fsdm3'
    # backup code files
    # NOTE(review): the command is passed to the shell as one string, and the
    # non-raw '\c' in the Windows template is an unrecognised escape that
    # newer Python versions warn about.
    cmd = 'robocopy %s %s\code_files *.py' % (
        r'%cd%', LUCENE_INDEX_DIR
    ) if system_flag == 'Windows' else 'cp *.py %s/code_files' % (
        LUCENE_INDEX_DIR)
    os.system(cmd)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    # write data to index
    w = IndexWriter(index_mm, config)
    cnt = 0
    # ``data`` is reused for every category: field name -> (value, field type).
    data = {}
    max_article_num = 0
    stemmer = SnowballStemmer('english')
    for cat, list_entity_dict in category_corpus.items():
        cat_label = cleanSentence(cat, True)
        data.clear()
        data['category'] = (cat, 'StringField')
        data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT')
        data['stemmed_label'] = (stemSentence(cat_label, stemmer, True),
                                 'CUSTOM_FIELD_TEXT')
        data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED')
        # Track the largest category so it can be reported at the end.
        if data['num_articles'][0] > max_article_num:
            max_article_num = data['num_articles'][0]
        for f in [
                'names', 'attributes', 'categories', 'similar_entities',
                'related_entities', 'catchall'
        ]:
            # Concatenate the field over all entities of the category.
            contents = cleanSentence(
                ' '.join([dic[f] for dic in list_entity_dict]), True, ' ')
            data[f] = (contents, 'CUSTOM_FIELD_TEXT_NOT_STORED')
            data['stemmed_' + f] = (stemSentence(contents, stemmer, False),
                                    'CUSTOM_FIELD_TEXT_NOT_STORED')
        #print ('--------------------')
        # need to calculate corpus average length
        addDoc(w, data)
        #cnt+=1
        #if cnt>20:
        #break
    w.close()
    print('max article num=%d' % (max_article_num))
import telebot import requests from bs4 import BeautifulSoup import re from nltk.stem.snowball import SnowballStemmer stemmer = SnowballStemmer("russian") def cleanhtml(raw_html): cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') cleantext = re.sub(cleanr, '', raw_html) return cleantext @bot.message_handler(commands=['donate']) def donate(message): bot.send_message(message.chat.id, "На поддержку проекта") @bot.message_handler(commands=['start']) def send_welcome(message): try: global k bot.send_message( message.chat.id, "Отправьте мне ссылку на RSS ленту: " + "\n\n" + "Список популярных RSS лент: " + "\n" + "Лента ру - /LentaRu" + "\n" + "СПОРТ сегодня - /SportToday" + "\n" + "Travel ru - /TravelRu") k = 0 except:
def cli_main():
    """Score summaries against references with the requested metrics.

    Reads the options via ``default_args``, instantiates each metric named in
    ``args.metrics``, loads summaries/references/articles from a jsonl file or
    from separate text files, prepares the tokenisation each metric needs,
    runs the metrics and writes the scores to ``output_<metrics>.jsonl``.

    A failing metric is reported on stdout and skipped; the remaining metrics
    still run.
    """
    # parser = argparse.ArgumentParser(description=metrics_description, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser = argparse.ArgumentParser(description="predictor")
    # parser.add_argument('--config-file', type=str, help='config file with metric parameters')
    # parser.add_argument('--metrics', type=str, help='comma-separated string of metrics')
    # parser.add_argument('--aggregate', type=bool, help='whether to aggregate scores')
    # parser.add_argument('--jsonl-file', type=str, help='input jsonl file to score')
    # parser.add_argument('--article-file', type=str, help='input article file')
    # parser.add_argument('--summ-file', type=str, help='input summary file')
    # parser.add_argument('--ref-file', type=str, help='input reference file')
    # parser.add_argument('--output-file', type=str, help='output file')
    # parser.add_argument('--eos', type=str, help='EOS for ROUGE (if reference not supplied as list)')
    # args = parser.parse_args()
    args = default_args(parser=parser)

    # =====================================
    # INITIALIZE METRICS
    gin.parse_config_file(args.config_file)
    # Tokenisation flavours the selected metrics need.
    toks_needed = set()
    metrics = [x.strip() for x in args.metrics.split(",")]
    metrics_dict = {}
    # Metric classes are imported lazily so only the requested ones load.
    if "rouge" in metrics:
        from summ_eval.rouge_metric import RougeMetric
        metrics_dict["rouge"] = RougeMetric()
        toks_needed.add("line_delimited")

    if "bert_score" in metrics:
        from summ_eval.bert_score_metric import BertScoreMetric
        bert_score_metric = BertScoreMetric()
        metrics_dict["bert_score"] = bert_score_metric
        toks_needed.add("space")
    if "mover_score" in metrics:
        from summ_eval.mover_score_metric import MoverScoreMetric
        mover_score_metric = MoverScoreMetric()
        metrics_dict["mover_score"] = mover_score_metric
        toks_needed.add("space")
    if "chrf" in metrics:
        from summ_eval.chrfpp_metric import ChrfppMetric
        metrics_dict["chrf"] = ChrfppMetric()
        toks_needed.add("space")
    if "meteor" in metrics:
        from summ_eval.meteor_metric import MeteorMetric
        metrics_dict["meteor"] = MeteorMetric()
        toks_needed.add("space")
    if "bleu" in metrics:
        from summ_eval.bleu_metric import BleuMetric
        metrics_dict["bleu"] = BleuMetric()
        toks_needed.add("space")
    if "cider" in metrics:
        from summ_eval.cider_metric import CiderMetric
        metrics_dict["cider"] = CiderMetric()
        toks_needed.add("stem")

    if "s3" in metrics:
        from summ_eval.s3_metric import S3Metric
        metrics_dict["s3"] = S3Metric()
        toks_needed.add("stem")
    if "rouge_we" in metrics:
        from summ_eval.rouge_we_metric import RougeWeMetric
        metrics_dict["rouge_we"] = RougeWeMetric()
        toks_needed.add("stem")

    if "stats" in metrics:
        from summ_eval.data_stats_metric import DataStatsMetric
        metrics_dict['stats'] = DataStatsMetric()
        toks_needed.add("spacy")
    if "sms" in metrics:
        from summ_eval.sentence_movers_metric import SentenceMoversMetric
        metrics_dict['sms'] = SentenceMoversMetric()
        toks_needed.add("spacy")
    if "summaqa" in metrics:
        from summ_eval.summa_qa_metric import SummaQAMetric
        metrics_dict['summaqa'] = SummaQAMetric()
        toks_needed.add("spacy")
        toks_needed.add("space")
    if "syntactic" in metrics:
        from summ_eval.syntactic_metric import SyntacticMetric
        metrics_dict["syntactic"] = SyntacticMetric()
        toks_needed.add("space")
    if "supert" in metrics:
        from summ_eval.supert_metric import SupertMetric
        metrics_dict['supert'] = SupertMetric()
        toks_needed.add("space")
    if "blanc" in metrics:
        from summ_eval.blanc_metric import BlancMetric
        metrics_dict['blanc'] = BlancMetric()
        toks_needed.add("space")
    # =====================================

    # =====================================
    # READ INPUT
    print("Reading the input")
    ids = []
    articles = []
    references = []
    summaries = []
    bad_lines = 0
    if args.jsonl_file is not None:
        try:
            with open(args.jsonl_file) as inputf:
                for count, line in enumerate(inputf):
                    try:
                        data = json.loads(line)
                        try:
                            ids.append(data['id'])
                        except Exception:
                            # The id is optional; ids are generated below.
                            pass
                        if len(data['decoded']) == 0:
                            bad_lines += 1
                            continue
                        summaries.append(data['decoded'])
                        # references.append(data['reference'])
                        if data.get("reference", None):
                            references.append(data['reference'])
                        else:
                            # there are 10 additional references added, the first is the original
                            references.append(data["references"][0])
                        # if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics:
                        # remove stats
                        if "summaqa" in metrics or "supert" in metrics or "blanc" in metrics:
                            try:
                                articles.append(data['text'])
                            except Exception:
                                # NOTE(review): the handler below catches this
                                # ValueError too, so a missing article is only
                                # counted as a bad line.
                                raise ValueError("You specified summaqa and stats, which "
                                                 "require input articles, but we could not parse the file!")
                    except Exception:
                        # Any malformed line is counted and skipped.
                        bad_lines += 1
        except Exception as e:
            print("Input did not match required format")
            print(e)
            sys.exit()
        print(f"This many bad lines encountered during loading: {bad_lines}")

    # Plain-text inputs, one item per line, replace what the jsonl provided.
    if args.summ_file is not None:
        with open(args.summ_file) as inputf:
            summaries = inputf.read().splitlines()
    if args.ref_file is not None:
        with open(args.ref_file) as inputf:
            references = inputf.read().splitlines()
    # if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics:
    if "summaqa" in metrics or "supert" in metrics or "blanc" in metrics:
        if args.article_file is None and len(articles) == 0:
            raise ValueError("You specified summaqa and stats, which "
                             "require input articles, but we could not parse the file!")
        if len(articles) > 0:
            pass
        else:
            with open(args.article_file) as inputf:
                articles = inputf.read().splitlines()
    if len(ids) == 0:
        ids = list(range(0, len(summaries)))
    # =====================================

    # =====================================
    # TOKENIZATION
    print("Preparing the input")
    references_delimited = None
    summaries_delimited = None
    if len(references) > 0:
        if isinstance(references[0], list):
            if "line_delimited" in toks_needed:
                references_delimited = ["\n".join(ref) for ref in references]
            if "space" in toks_needed:
                references_space = [" ".join(ref) for ref in references]
        elif args.eos is not None:
            if "line_delimited" not in toks_needed:
                raise ValueError('You provided a delimiter but are not using a metric which requires one.')
            if args.eos == "\n":
                references_delimited = [ref.split(args.eos) for ref in references]
            else:
                references_delimited = [f"{args.eos}\n".join(ref.split(args.eos)) for ref in references]
        elif "line_delimited" in toks_needed:
            references_delimited = references
        # NOTE(review): for list-valued references this replaces the joined
        # strings built above with the raw lists. Confirm that is intended.
        if "space" in toks_needed:
            references_space = references

    if isinstance(summaries[0], list):
        if "line_delimited" in toks_needed:
            summaries_delimited = ["\n".join(summ) for summ in summaries]
        if "space" in toks_needed:
            summaries_space = [" ".join(summ) for summ in summaries]
    elif args.eos is not None:
        if "line_delimited" not in toks_needed:
            raise ValueError('You provided a delimiter but are not using a metric which requires one.')
        if args.eos == "\n":
            summaries_delimited = [ref.split(args.eos) for ref in summaries]
        else:
            summaries_delimited = [f"{args.eos}\n".join(ref.split(args.eos)) for ref in summaries]
    elif "line_delimited" in toks_needed:
        summaries_delimited = summaries
    # NOTE(review): same replacement as for the references above.
    if "space" in toks_needed:
        summaries_space = summaries

    if "stem" in toks_needed:
        # Word-tokenise, stem each word and re-join with spaces.
        tokenizer = RegexpTokenizer(r'\w+')
        stemmer = SnowballStemmer("english")
        if isinstance(summaries[0], list):
            summaries_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(" ".join(summ))] for summ in summaries]
            references_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(" ".join(ref))] for ref in references]
        else:
            summaries_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(summ)] for summ in summaries]
            references_stemmed = [[stemmer.stem(word) for word in tokenizer.tokenize(ref)] for ref in references]
        summaries_stemmed = [" ".join(summ) for summ in summaries_stemmed]
        references_stemmed = [" ".join(ref) for ref in references_stemmed]

    if "spacy" in toks_needed:
        nlp = spacy.load('en_core_web_sm')
        # nlp = spacy.load('en_core_web_md')
        disable = ["tagger", "textcat", "lemmatizer"]
        # Named-entity recognition is only kept when summaqa is requested.
        if "summaqa" not in metrics:
            disable.append("ner")
        if isinstance(summaries[0], list):
            summaries_spacy = [nlp(" ".join(text), disable=disable) for text in summaries]
        else:
            summaries_spacy = [nlp(text, disable=disable) for text in summaries]
        if "stats" in metrics:
            summaries_spacy_stats = [[tok.text for tok in summary] for summary in summaries_spacy]
        if "sms" in metrics:
            if isinstance(references[0], list):
                references_spacy = [nlp(" ".join(text), disable=disable) for text in references]
            else:
                references_spacy = [nlp(text, disable=disable) for text in references]
        # this is original for summaqa and stats
        # if "summaqa" in metrics or "stats" in metrics:
        #     if isinstance(articles[0], list):
        #         input_spacy = [nlp(" ".join(text), disable=disable) for text in articles]
        #     else:
        #         input_spacy = [nlp(text, disable=disable) for text in articles]
        #     if "stats" in metrics:
        #         input_spacy_stats = [[tok.text for tok in article] for article in input_spacy]
        # use reference as article for stats
        if "summaqa" in metrics or "stats" in metrics:
            if isinstance(references[0], list):
                input_spacy = [nlp(" ".join(text), disable=disable) for text in references]
            else:
                input_spacy = [nlp(text, disable=disable) for text in references]
            if "stats" in metrics:
                input_spacy_stats = [[tok.text for tok in ref] for ref in input_spacy]
    if "supert" in metrics or "blanc" in metrics:
        inputs_space = articles
    # =====================================

    # =====================================
    # GET SCORES
    if args.aggregate:
        final_output = dict()
    else:
        final_output = defaultdict(lambda: defaultdict(int))
    # import pdb;pdb.set_trace()
    for metric, metric_cls in metrics_dict.items():
        print(f"Calculating scores for the {metric} metric.")
        try:
            if metric == "rouge":
                output = metric_cls.evaluate_batch(summaries_delimited, references_delimited, aggregate=args.aggregate)
                # only rouge uses this input so we can delete it
                del references_delimited
                del summaries_delimited
            elif metric in ('bert_score', 'mover_score', 'chrf', 'meteor', 'bleu'):
                output = metric_cls.evaluate_batch(summaries_space, references_space, aggregate=args.aggregate)
            elif metric in ('s3', 'rouge_we', 'cider'):
                output = metric_cls.evaluate_batch(summaries_stemmed, references_stemmed, aggregate=args.aggregate)
            elif metric == "sms":
                output = metric_cls.evaluate_batch(summaries_spacy, references_spacy, aggregate=args.aggregate)
            elif metric in ('summaqa', 'stats', 'supert', 'blanc'):
                if metric == "summaqa":
                    output = metric_cls.evaluate_batch(summaries_space, input_spacy, aggregate=args.aggregate)
                elif metric == "stats":
                    output = metric_cls.evaluate_batch(summaries_spacy_stats, input_spacy_stats, aggregate=args.aggregate)
                elif metric in ('supert', 'blanc'):
                    output = metric_cls.evaluate_batch(summaries_space, inputs_space, aggregate=args.aggregate)
            if args.aggregate:
                final_output.update(output)
            else:
                # Per-sample scores are keyed by position, not the input id.
                ids = list(range(0, len(ids)))
                for cur_id, cur_output in zip(ids, output):
                    final_output[cur_id].update(cur_output)
        except Exception as e:
            print(e)
            print(f"An error was encountered with the {metric} metric.")
    # =====================================

    # =====================================
    # OUTPUT SCORES
    metrics_str = "_".join(metrics)
    # The output name depends only on the metrics. The former unused
    # ``json_file_end`` local is gone: it dereferenced ``args.jsonl_file`` and
    # crashed here when only --summ-file/--ref-file were supplied.
    output_path = f"output_{metrics_str}.jsonl"
    print(f"saving to {output_path}")
    # with open(f"outputs/{args.output_file}_{json_file_end}_{metrics_str}.jsonl", "w") as outputf:
    with open(output_path, "w") as outputf:
        if args.aggregate:
            json.dump(final_output, outputf)
        else:
            # One JSON object per sample, one per line.
            for key, value in final_output.items():
                value["id"] = key
                json.dump(value, outputf)
                outputf.write("\n")
def main():
    """Build (or load) the training tensors, then train and save the model.

    When ``<args.input>.pickle`` exists the cached ``(x, y)`` pair is loaded.
    Otherwise the input file is read twice: once to build the stem vocabulary
    and count samples, and once to fill the tensors. ``x`` has shape
    ``(samples, maxlen, vocabulary size)`` and ``y`` has shape
    ``(samples, vocabulary size)``, as allocated below.
    """
    args = parse_args()
    maxlen = args.maxlen
    maxlines = args.maxlines
    public = args.only_public
    start_offset = args.start_offset
    stemmer = SnowballStemmer("english")
    if os.path.exists(args.input + ".pickle"):
        print("loading the cached dataset...")
        # NOTE(review): unpickling executes arbitrary code; only load cache
        # files this tool wrote itself.
        with open(args.input + ".pickle", "rb") as fin:
            x, y = pickle.load(fin)
    else:
        # Pass 1: assign an index to every stemmed name part and count samples.
        vocabulary = {}
        samples_num = 0
        with open(args.input, errors="ignore") as fin:
            for lineno, line in enumerate(fin):
                if lineno % 1000 == 0:
                    print("line #%d" % lineno)
                # A non-positive maxlines disables the limit.
                if lineno > maxlines > 0:
                    break
                # NOTE(review): eval() runs whatever expression the line
                # holds; this is unsafe if the input file is not trusted.
                ctx = eval(line)
                word = False
                word_num = 0
                for c in ctx:
                    # The element that follows an ID_S marker is an identifier.
                    if c == ID_S:
                        word = True
                    elif word:
                        word = False
                        # With only_public set, identifiers that start with a
                        # lower-case letter are skipped unless in BUILTINS.
                        if public and c[0].islower() and c not in BUILTINS:
                            continue
                        word_num += 1
                        for part in extract_names(c):
                            part = stemmer.stem(part)
                            # First occurrence gets the next free index.
                            vocabulary.setdefault(part, len(vocabulary))
                samples_num += max(0, word_num - start_offset)
        print("vocabulary:", len(vocabulary), "samples:", samples_num)
        with open(args.output + ".voc", "wb") as fout:
            pickle.dump(vocabulary, fout, protocol=-1)
        x = numpy.zeros((samples_num, maxlen, len(vocabulary)),
                        dtype=numpy.float32)
        y = numpy.zeros((samples_num, len(vocabulary)), dtype=numpy.float32)
        print("the worst is behind - we allocated %s bytes" %
              commaed_int(x.nbytes + y.nbytes))
        # Pass 2: re-read the input and fill the tensors.
        samples_num = 0
        with open(args.input, errors="ignore") as fin:
            for lineno, line in enumerate(fin):
                if lineno % 1000 == 0:
                    print("line #%d" % lineno)
                if lineno > maxlines > 0:
                    break
                ctx = eval(line)
                word = False
                words = []
                for c in ctx:
                    if c == ID_S:
                        word = True
                    elif word:
                        word = False
                        if public and c[0].islower() and c not in BUILTINS:
                            continue
                        # Vocabulary indices of the identifier's stemmed parts.
                        wadd = tuple(vocabulary[stemmer.stem(p)]
                                     for p in extract_names(c))
                        # NOTE(review): identifiers without parts are dropped
                        # here but were counted in pass 1, so fewer rows than
                        # allocated may be filled.
                        if wadd:
                            words.append(wadd)
                for i in range(start_offset, len(words)):
                    # Input: the up to maxlen identifiers preceding position
                    # i, right-aligned in the window.
                    for j in range(maxlen):
                        k = i - maxlen + j
                        if k >= 0:
                            for c in words[k]:
                                x[samples_num, j, c] = 1
                    # Target: identifier i's parts, scaled by the part count.
                    for c in words[i]:
                        y[samples_num, c] = 1
                    y[samples_num] /= len(words[i])
                    samples_num += 1
        if args.cache:
            print("saving the cache...")
            # A failed cache write is reported but does not stop training.
            try:
                with open(args.input + ".pickle", "wb") as fout:
                    pickle.dump((x, y), fout, protocol=-1)
            except Exception as e:
                print(type(e), e)
    print("x:", x.shape)
    print("y:", y.shape)
    print("shuffling...")
    if args.shuffle:
        # Restoring the RNG state before the second shuffle applies the same
        # permutation to x and y.
        numpy.random.seed(777)
        rng_state = numpy.random.get_state()
        numpy.random.shuffle(x)
        numpy.random.set_state(rng_state)
        numpy.random.shuffle(y)
    model = train(x, y, **args.__dict__)
    model.save(args.output, overwrite=True)
print('The text belongs to Charles Dickens\'s "A Tale of Two Cities".')
print(
    'The interviewee attributed their correct guess to how famous the first phrase is.'
)
print(
    'The first four content words were readily recognizable to anyone who has read the book.'
)
print('No function words were needed to identify the source.')
print('\n')
print('\n')
print("_" * 70)
print('QUESTION 3: Stemming and Lemmatization: \n')
# Stem the same word list with three different stemmers for comparison.
# ``filtered_words`` is not defined in this part of the script.
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')
porter_stemming = [porter.stem(w) for w in filtered_words]
lancaster_stemming = [lancaster.stem(w) for w in filtered_words]
snowball_stemming = [snowball.stem(w) for w in filtered_words]
#with wrapping
# NOTE(review): ``format`` shadows the builtin of the same name.
format = '%s'
pieces = [format % (word) for word in porter_stemming]
output = ', '.join(pieces)
# ``fill`` is not defined in this part of the script.
wrapped_porter = fill(output)
print('The Normalized, Filtered Text Stemmed with PorterStemmer is: \n')
print(wrapped_porter)
#with wrapping
format = '%s'
import re
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
from stop_list import closed_class_stop_words
import itertools

s = SnowballStemmer("english")

# Stripping cran.qry for the (ID, query) tuple
# Read through a context manager so the file handle is closed promptly.
with open('cran.qry') as query_handle:
    query_file = query_handle.read()
# Records are split on the capital "I" of the id marker; the text before the
# first one is discarded.
queries = query_file.split('I')
queries.pop(0)
# Within a record the capital "W" separates the id part from the query text.
queries = [tuple(q.split('W')) for q in queries]
# NOTE(review): ``s.stem`` receives the whole query string, not single tokens,
# so it stems the text as one word. Stemming per token would change the
# vocabulary and must be mirrored wherever the abstracts are processed.
queries = [(re.findall(r'\d{3}', i), word_tokenize(s.stem(q)))
           for (i, q) in queries]
queries = [(i, [w for w in q if w not in closed_class_stop_words])
           for (i, q) in queries]
# Strip punctuation later

# Set of all unique words in queries to make dictionary for IDF score
all_queries = [q for (i, q) in queries]
all_queries = set(itertools.chain.from_iterable(all_queries))

# Stripping cran.all.1400 for abstracts
products = pd.read_csv("../input/producto_tabla.csv") products['short_name'] = products.NombreProducto.str.extract('^(\D*)', expand=False) products['brand'] = products.NombreProducto.str.extract('^.+\s(\D+) \d+$', expand=False) w = products.NombreProducto.str.extract('(\d+)(Kg|g)', expand=True) products['weight'] = w[0].astype('float') * w[1].map({'Kg': 1000, 'g': 1}) products['pieces'] = products.NombreProducto.str.extract( '(\d+)p ', expand=False).astype('float') products['short_name_processed'] = ( products['short_name'].map(lambda x: " ".join([ i for i in x.lower().split() if i not in nltk.corpus.stopwords.words("spanish") ]))) stemmer = SnowballStemmer("spanish") products['short_name_processed'] = (products['short_name_processed'].map( lambda x: " ".join([stemmer.stem(i) for i in x.lower().split()]))) short_name_processed_list = products['short_name_processed'].unique() vectorizer = CountVectorizer(analyzer="word", \ tokenizer=None, \ preprocessor=None, \ stop_words=None, \ max_features=1000) products = pd.concat([ products.drop('short_name', axis=1), pd.get_dummies(short_name_processed_list) ],
words = "" if len(content) > 1: text_string = content.translate( string.maketrans("", ""), string.punctuation) # text_string = stripNonAlphaNum(text_string) # text_string = ' '.join(text_string) text_string = strip_accents(text_string) word_list = text_string.split() # remove only non alpha words from the string word_list = [i for i in word_list if i.isalpha()] # stemming stemmer = SnowballStemmer("english") stem_word_list = [ stemmer.stem(word) for word in word_list ] # words = (" " . join(stem_word_list)) # print stem_word_list # # remove stopwords from nltk.corpus import stopwords filtered_words = [ word for word in stem_word_list if word not in stopwords.words('english') ] # print filtered_words # sys.exit('-random utf-8 check')
def text2tokens(text, mode):
    """Tokenise *text* and drop French stop words, punctuation and integers.

    The text is transliterated with ``unidecode`` and split with one regular
    expression that recognises emoticons, HTML tags, @-mentions, hash-tags,
    URLs and words. The patterns are compiled with ``re.VERBOSE`` (so the
    multi-line emoticon pattern may contain spaces and comments) and
    ``re.IGNORECASE``. Every token that is not an emoticon is lower-cased,
    so ``:D`` does not become ``:d``.

    Args:
        text: The string to tokenise.
        mode: ``'t'`` returns the filtered tokens, ``'s'`` returns the
            filtered tokens after French Snowball stemming.

    Returns:
        A list of tokens (``'t'``) or stems (``'s'``). ``None`` for any
        other mode, or when tokenisation fails; in the failure case a
        diagnostic message is printed.
    """
    emoticons_str = r"""
        (?:
            [:=;] # Eyes
            [oO\-]? # Nose (optional)
            [D\)\]\(\]/\\OpP] # Mouth
        )"""
    regex_str = [
        emoticons_str,
        r'<[^>]+>',  # HTML tags
        r'(?:@[\w_]+)',  # @-mentions
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
        r"(?:[a-z][a-z\-_]+[a-z])",  # words with - and _
        r'(?:[\w_]+)',  # other words
        r'(?:\S)',  # anything else
    ]
    tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                           re.VERBOSE | re.IGNORECASE)
    emoticon_re = re.compile(r'^' + emoticons_str + '$',
                             re.VERBOSE | re.IGNORECASE)

    punctuation = list(string.punctuation)
    # Tokens to discard. A set gives O(1) membership tests in the loop below.
    stop = set(stopwords.words('french') + punctuation + [
        '>>', '<<', '<', '>', 'via', 'le', 'les', 'a', 'rt'
    ])
    stemmer = SnowballStemmer('french')

    try:
        tokens = tokens_re.findall(unidecode(text))
        tokens = [
            token if emoticon_re.search(token) else token.lower()
            for token in tokens
        ]

        # Keep every term that is neither a stop word nor an integer literal.
        terms_stop = []
        for term in tokens:
            if term in stop:
                continue
            try:
                int(term)
            except ValueError:
                # Not an integer, so it is a real term.
                terms_stop.append(term)

        if mode == 't':
            return terms_stop
        if mode == 's':
            return [stemmer.stem(term) for term in terms_stop]
    except Exception:
        # Best effort: report the offending input and fall through to None
        # rather than aborting the caller. Unlike a bare ``except:``, this
        # lets KeyboardInterrupt and SystemExit propagate.
        print("Problème dans la tokenisation du text")
        print("texte : ", text, "Type : ", type(text), "Mode : ", mode)
    return None
# Cleaning up the text
messy_sentence = "The point of this example is to _learn how basic text cleaning works_ on *very simple* data."
tokenized_messy_sentence = nltk.word_tokenize(messy_sentence)

# Translation table mapping every punctuation code point to the empty
# string, so str.translate() deletes those characters.
table = dict.fromkeys(map(ord, string.punctuation), '')

# Strip punctuation from each token. A token made only of punctuation
# becomes an empty string and is kept.
cleaned_messy_sentence = [
    raw_token.translate(table) for raw_token in tokenized_messy_sentence
]
print(cleaned_messy_sentence)

# Stemming and Lemmatization
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# One pass over the tokens: each word goes through both stemmers and the
# lemmatizer in turn, then the triples are split into three parallel lists.
normalized_rows = [
    (porter.stem(entry), snowball.stem(entry), wordnet.lemmatize(entry))
    for entry in tokenized
]
porterlemmas = [row[0] for row in normalized_rows]
wordnetlemmas = [row[2] for row in normalized_rows]
snowballlemmas = [row[1] for row in normalized_rows]

print('Porter')
print(porterlemmas)
print('Snowball')
print(snowballlemmas)
tags = [] for text in stopwords: tag = bs(text, "lxml") tags.append(tag.get_text()) #tags """#Convest All to SmallCase""" sm = [] for i in tags: sm.append(i.lower()) """#SnowBall Stem""" import nltk from nltk.stem.snowball import SnowballStemmer snow = SnowballStemmer(language='english') stem = [] for i in sm: z = i.split() strr = "" for j in z: x = snow.stem(j) strr += x strr += " " stem.append(strr) #stem """#task 1 Completed ---
scii_funs() ASCII - Letter Handling @author: Markus.Meister """ import unittest import torch import string import unicodedata import numpy as np from nltk.stem.snowball import SnowballStemmer from nltk.tokenize import RegexpTokenizer # stemmers for words stemmEN = SnowballStemmer('english') stemmDE = SnowballStemmer('german') stemmers = { 'en': stemmEN, 'de': stemmDE, } # tokenizer for words tokenizer = RegexpTokenizer(r'\w+') all_letters = string.ascii_letters + " .,;'" all_numbers = ''.join(list(map(lambda x: str(x), range(10)))) BOUND_LOW_CHARS = 26 #%% -- functions --
print(withoutStop)

# In[ ]:

# Frequency of every entry in withoutStop.
token_series = pd.Series(withoutStop)
bof = token_series.value_counts()
print(bof)

# In[ ]:

from nltk.stem.snowball import SnowballStemmer

# Apply the Spanish Snowball stemmer to each entry.
stemmer = SnowballStemmer("spanish")
stemmed_spanish = list(map(stemmer.stem, withoutStop))
print(stemmed_spanish)

# In[ ]:

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Helper function
import re from nltk.stem.snowball import SnowballStemmer import pymorphy2 from env import project_id, private_key, credentials, stops import json from google.cloud import bigquery from pandas.io import gbq import pandas as pd morph = pymorphy2.MorphAnalyzer() stemmer = SnowballStemmer('russian') def search_user_library(username, q='', mode='title'): try: q = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", q) q = q.lower() words = q.split() words = [w for w in words if not w in stops] words = [stemmer.stem(w) for w in words] if words == '': return "некорректный ввод" if mode == 'author': Query = 'SELECT * FROM dataset.' + username + ' WHERE AUTHOR LIKE \'' for word in words: Query += '%{}'.format(word) Query += '%\''
#!/usr/bin/env python # coding: utf-8 # In[1]: #!/usr/bin/python from nltk.stem.snowball import SnowballStemmer import string import nltk obj_stem = SnowballStemmer('english') # In[2]: def parseOutText(f): """ given an opened email file f, parse out all text below the metadata block at the top (in Part 2, you will also add stemming capabilities) and return a string that contains all the words in the email (space-separated) example use case: f = open("email_file_name.txt", "r") text = parseOutText(f) """
#!/bin/python
from nltk.tokenize import RegexpTokenizer
# Token pattern, alternatives tried left to right: a run of word characters,
# a dollar amount ("$" followed by digits/dots), or any other run of
# non-whitespace. Written as a raw string so the regex escapes are not read
# as (invalid) Python string escapes.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")  # the language has to be given explicitly

# "Middle" words (pronouns, prepositions, articles) that are not expected to
# carry much sentiment.
middle_words = ['and', 'a', 'the', 'am', 'it', 'me', 'with', 'in', 'on', 'by',
                'near', 'this', 'that', 'an', 'there', 'here', 'those']
# Store the stemmed forms in a set for fast membership tests.
middle_words = {stemmer.stem(word) for word in middle_words}


def read_files(tarfname):
    """Read the training and development data from the sentiment tar file.
    The returned object contains various fields that store sentiment data, such as:

    train_data,dev_data: array of documents (array of words)
    train_fnames,dev_fnames: list of filenames of the doccuments (same length as data)
    train_labels,dev_labels: the true string label for each document (same length as data)

    The data is also preprocessed for use with scikit-learn, as:

    count_vec: CountVectorizer used to process the data (for reapplication on new data)
    trainX,devX: array of vectors representing Bags of Words, i.e. documents processed through the vectorizer
    le: LabelEncoder, i.e. a mapper from string labels to ints (stored for reapplication)
    target_labels: List of labels (same order as used in le)
    trainy,devy: array of int labels, one for each document
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz")
    trainname = "train.tsv"
    devname = "dev.tsv"
def _starts_with_ascii_upper(word):
    """Return True when *word* is non-empty and begins with an ASCII capital."""
    return bool(word) and word[0] in string.ascii_uppercase


def features(tokens, index, history):
    """Build the feature dictionary for the token at *index*.

    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags

    The sentence is padded with two placeholder tokens on each side, and the
    history with two on the left, so the two-token context window is always
    defined, even at the sentence boundaries.

    Returns:
        A dict of features describing the token, its two left neighbours and
        its two right neighbours.

    Changes from the earlier implementation, which alter feature values:
      * 'all-ascii' is now really computed; it used to be True for every word.
      * 'all-caps' / 'prev-all-caps' test for an upper-case word instead of
        comparing with ``str.capitalize()``.
      * 'next-all-caps' / 'next-capitalized' describe the next word; they
        used to be computed from the previous word.
      * An empty token no longer raises IndexError.
    Models trained on the old feature values need to be retrained.
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = ([('[START2]', '[START2]'), ('[START1]', '[START1]')]
              + list(tokens)
              + [('[END1]', '[END1]'), ('[END2]', '[END2]')])
    history = ['[START2]', '[START1]'] + list(history)

    # shift the index with 2, to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]

    contains_dash = '-' in word
    contains_dot = '.' in word
    # True only when every character is an ASCII lower-case letter.
    allascii = all(c in string.ascii_lowercase for c in word)

    allcaps = word.isupper()
    capitalized = _starts_with_ascii_upper(word)

    prevallcaps = prevword.isupper()
    prevcapitalized = _starts_with_ascii_upper(prevword)

    nextallcaps = nextword.isupper()
    nextcapitalized = _starts_with_ascii_upper(nextword)

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,

        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,

        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,

        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,

        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,

        'prev-iob': previob,

        'contains-dash': contains_dash,
        'contains-dot': contains_dot,

        'all-caps': allcaps,
        'capitalized': capitalized,

        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,

        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }
import pickle import collections from nltk.stem.snowball import SnowballStemmer stemmer = SnowballStemmer('english', ignore_stopwords=True) def print_key_value(l, k): for key, value in l[:k]: print("{} {}".format(key, value)) def left(l): r = [] for k, v in l: r.append(k) return r def makedict(l): d = dict() for k, v in l: if k not in d: d[k] = v return d rev_stem_dict = dict() def stem(word):