def algorithms(self):
    """Returns a list of stemming algorithms provided by the py-stemmer
    library.
    """
    import Stemmer  # @UnresolvedImport

    return Stemmer.algorithms()
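# Usage sketch for the helper above (assumes the PyStemmer package is
# installed): Stemmer.algorithms() returns the Snowball language names
# accepted by Stemmer.Stemmer().
import Stemmer

print(Stemmer.algorithms())  # e.g. ['danish', 'dutch', 'english', ...]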
def add_subparser(cls, parser):
    subparser = parser.add_parser("set-stemmer", help="Configure a stemmer")
    subparser.set_defaults(run=cls.run)
    subparser.add_argument("language", choices=Stemmer.algorithms(),
                           help="Stemmer language")
def porterStemmer(string):
    """Accepts a string, stems each whitespace-separated word with the
    PyStemmer Porter algorithm, and returns the stemmed string.
    """
    stemmer = Stemmer.Stemmer('porter')
    return ' '.join(stemmer.stemWords(string.split()))
def _create_stemmers():
    """Create stemmers dictionary for all possible languages."""
    stemmers_initialized = {}
    for src_lang in Stemmer.algorithms():
        try:
            dst_lang = _lang_map.get(src_lang)
            if dst_lang:
                stemmers_initialized[dst_lang] = Stemmer.Stemmer(src_lang, 40000)
        except (TypeError, KeyError):
            pass
    return stemmers_initialized
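# Hedged usage sketch for _create_stemmers(): _lang_map is defined elsewhere
# in the original module, so a stand-in mapping from Snowball algorithm names
# to application language codes is assumed here.
_lang_map = {'english': 'en', 'russian': 'ru'}

stemmers = _create_stemmers()
print(stemmers['en'].stemWord('running'))  # 'run'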
def stemming(lang, stemming, words):
    """Stem a list of words.

    :param lang: language of the words to stem
    :param stemming: number of stemming passes to apply
    """
    import Stemmer as stemmer
    try:
        stemmer = stemmer.Stemmer(lang)
        for i in range(stemming):
            words = stemmer.stemWords(words)
        return words
    except KeyError:
        return words
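# Usage sketch for stemming() above; the expected PyStemmer output is shown
# as a comment.
print(stemming('english', 1, ['running', 'easily']))  # ['run', 'easili']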
# search_string = "Sachin Ramesh Tendulkar"
if len(sys.argv) < 3:
    print("Invalid arguments")
    sys.exit(1)

#search_string = ""
#for i in range(2, len(sys.argv)):
#    search_string += sys.argv[i] + " "
#search_string = search_string.strip()

index_file = sys.argv[1]
search_string = sys.argv[2]
index_file = index_file + "inverted_index.txt"

stemmer = Stemmer.Stemmer('english')

field_flag = 0
try:
    field_flag = search_string.index(":")
except ValueError:
    pass

if field_flag == 0:
    search_string = search_string.strip()
    words = search_string.split()
    for word in words:
        stemmed_word = word.lower()
        stemmed_word = stemmer.stemWord(stemmed_word)
        search(index_file, word, stemmed_word, field_flag)
def NMF_2():
    english_stemmer = Stemmer.Stemmer('en')

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            analyzer = super(TfidfVectorizer, self).build_analyzer()
            return lambda doc: english_stemmer.stemWords(analyzer(doc))

    cats = ['comp.graphics', 'comp.os.ms-windows.misc',
            'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
            'rec.autos', 'rec.motorcycles',
            'rec.sport.baseball', 'rec.sport.hockey']
    print("Loading 20 newsgroups dataset for categories:")
    pprint(list(cats))
    newsgroups = fetch_20newsgroups(subset='all', categories=cats)
    print("%d documents" % len(newsgroups.data))
    print("%d categories" % len(newsgroups.target_names))

    print("Creating stemmed TFxIDF representation...")
    t0 = time()
    vect = StemmedTfidfVectorizer(stop_words='english')
    vectors = vect.fit_transform(newsgroups.data)  # TFxIDF representation
    print("Done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % vectors.shape)

    workbook = xlsxwriter.Workbook('partC_NMF.xlsx')

    print("Implementing NMF of dimension 2 on data...")
    nmf_ = NMF(n_components=2)  # alpha value? l1 value?
    nmf_data = nmf_.fit_transform(vectors)
    print("Done.")

    print("Implementing non-linear transform on data...")
    offset = 0.001
    nmf_data_off = np.add(nmf_data, offset)
    log_nmfdata = np.log(nmf_data_off)
    print("Done.")

    labels = newsgroups.target
    labels_2 = []
    # Changing the labels from 0-7 to 0-1
    for mark in labels:
        if mark <= 3:
            labels_2.append(0)
        else:
            labels_2.append(1)

    k = 2
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(nmf_data)
    km.fit(log_nmfdata)
    print("done in %0.3fs" % (time() - t0))

    # Transforming data back
    data2D = km.transform(nmf_data)
    data2D_logarithm = km.transform(log_nmfdata)

    plt.figure(1)
    plt.subplot(221)
    print("Plotting labels of Kmeans algorithm using NMF")
    plt.title('NMF Dim 2 Kmeans Algorithm with NMF')
    plt.scatter(nmf_data[:, 0], nmf_data[:, 1], c=km.labels_)
    plt.subplot(222)
    print("Plotting ground truth")
    plt.title('True labels of data')
    plt.scatter(nmf_data[:, 0], nmf_data[:, 1], c=labels_2)
    plt.subplot(223)
    print("Plotting labels of Kmeans algorithm with nonlinear transform NMF")
    plt.title('NMF Dim 2 Kmeans Algorithm Nonlinear transform')
    plt.scatter(log_nmfdata[:, 0], log_nmfdata[:, 1], c=km.labels_)
    plt.subplot(224)
    print("Plotting ground truth with nonlinear transform")
    plt.title('Ground truth, nonlinear transform')
    plt.scatter(log_nmfdata[:, 0], log_nmfdata[:, 1], c=labels_2)
    plt.show()
    print("Done.")
def algorithms():
    if cext_available:
        return Stemmer.algorithms()
    else:
        return list(_languages.keys())
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    '''
    ix = (merge['brand_name']==merge['brand_name']) & \
        (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]
    '''

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:
        wb = CountVectorizer()
        if 'X_orig' not in locals():
            X_orig = wb.fit_transform(merge[col])
        else:
            X_orig = hstack((X_orig, wb.fit_transform(merge[col])))
    print('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 3, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 100, 1, 0), dtype=bool)]
    print('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''

    '''
    Stemmer
    '''
    # https://github.com/skbly7/usefulness/blob/ed11cd55080d553cf62873999a5e00b154057fbc/textpreprocess.py
    from nltk.tokenize import WordPunctTokenizer
    # This is better for sentences containing unicode, like: u"N\u00faria Espert"
    word_tokenize = WordPunctTokenizer().tokenize
    import Stemmer
    import string

    ps = Stemmer.Stemmer("english")
    _wsre = re.compile(r"\s+")
    _alphanumre = re.compile(r"[\w\-\' ]", re.UNICODE)

    def _removestopwords(txtwords):
        global stoplist  # stoplist = stopwords.words("english")
        if stoplist is None:
            stoplist = frozenset([l.strip() for l in open(STOPFILE).readlines()])
        return [[w for w in t if w not in stoplist] for t in txtwords]

    def _stem(txtwords):
        return [stemmer.stemWords(t) for t in txtwords]

    def _removenonalphanumericchars(txtwords):
        return [[''.join([c for c in w if _alphanumre.search(c) is not None])
                 for w in t] for t in txtwords]

    def _stripallwhitespace(txts):
        return [_wsre.sub("", txt) for txt in txts]

    stemmer = Stemmer.Stemmer("english")

    def textpreprocess(txt, sentencetokenize=False, replacehyphenbyspace=True,
                       wordtokenize=False, lowercase=True, stem=True,
                       removenonalphanumericchars=True, stripallwhitespace=True):
        """
        Note: For html2text, one could also use NCleaner (common.html2text.batch_nclean)
        Note: One could improve the sentence tokenization, by using the
        original HTML formatting in the tokenization.
        Note: We use the Porter stemmer. (Optimization: Shouldn't rebuild the
        PorterStemmer object each time this function is called.)
        """
        if sentencetokenize:
            txts = nltk.word_tokenize(txt)
            #txts = tokenizer.tokenize(txt.split())
        else:
            txts = txt.split()
        txt = None

        if replacehyphenbyspace:
            txts = [t.replace("-", " ") for t in txts]

        if wordtokenize:
            txtwords = [word_tokenize(t) for t in txts]
        else:
            txtwords = [t.split() for t in txts]
        txts = None

        if lowercase:
            txtwords = [[w.lower() for w in t] for t in txtwords]

        if stem:
            txtwords = _stem(txtwords)

        # TODO: Maybe remove Unicode accents?
        # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        if removenonalphanumericchars:
            txtwords = _removenonalphanumericchars(txtwords)

        txtwords = [[w for w in t if w != ""] for t in txtwords]
        txts = [' '.join(words) for words in txtwords]

        if stripallwhitespace:
            for _ in range(2):
                txts = _stripallwhitespace(txts)

        return ' '.join(txts)

    print('[{}] Start stemming'.format(time.time() - start_time))
    merge['stem_name'] = [textpreprocess(s) for s in merge["name"].values]
    print('[{}] Stemming completed'.format(time.time() - start_time))

    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does.
    # One can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe"""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name', 'subcat_1', 'item_condition_id_str'],
        #['brand_name', 'subcat_2', 'item_condition_id_str'],
        #['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2']
    )
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2 ** 30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10 ** 6
        merge[k] = sum(outls_).tolist()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)

    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2 ** 29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }), procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }), procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }), procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(
        time.time() - start_time))

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_stem_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name, X_cat, x_col,
                           X_stem_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.005, beta=0.005, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.005, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  # iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break

    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
import heapq, operator

# Using Stemmer from the PyStemmer package because it's faster than the nltk stemmer
# Package can be downloaded from here: https://pypi.python.org/pypi/PyStemmer
import Stemmer

english_stemmer = Stemmer.Stemmer('en')


# Extension of the normal Tfidf vectorizer so that it stems words before analyzing
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))


# Download the 20 newsgroups dataset
documents = fetch_20newsgroups()

# Sample queries representing the interests of our users
queries = {
    "soccer": "soccer goal league championship striker player score coach football",
    "music": "music album cd lp song singer play listen genre album band",
    "cars": "car motor fuel petrol cylinder steering drive hybrid chassis engine mph",
    "films": "film movie actor director role genre scene camera",
estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, bins=1e5)
words = []
directory = parsed.src_texts
n = parsed.n
output = parsed.o

for filename in os.listdir(directory):
    with open(directory + "/" + filename, "r") as file:
        inp = file.read()
        if parsed.text_encoding:
            inp = inp.decode(parsed.text_encoding)
        if filename != ".DS_Store":
            if parsed.word_type == "stem":
                stemmer = Stemmer.Stemmer('russian')
                words += stemmer.stemWords([inp])
            elif parsed.word_type == "surface_all":
                words += nltk.word_tokenize(inp)
            elif parsed.word_type == "surface_no_pm" or parsed.word_type[:7] == "suffix_":
                inp = inp.translate(None, string.punctuation)
                words += nltk.word_tokenize(inp)
            else:
                words += nltk.word_tokenize(inp)

if parsed.word_type[:7] == "suffix_":
    l = int(parsed.word_type.split("_")[1])
    words = [x[-l:] for x in words]

if parsed.unknown_word_freq:
import Stemmer as ps

# create stemmer class
stemmer = ps.Stemmer('english')


def stemText(words):
    return [stemmer.stemWord(word) for word in words]


def stemArticles(articles):
    return {i: stemText(words) for i, words in articles.items()}
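# Usage sketch for stemText()/stemArticles(); the article ids and words here
# are made up for illustration.
articles = {0: ['connected', 'connection'], 1: ['running', 'runs']}
print(stemArticles(articles))  # {0: ['connect', 'connect'], 1: ['run', 'run']}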
def build_analyzer(self):
    analyzer = super(CountVectorizer, self).build_analyzer()
    if self.is_lemma:
        return lambda doc: [self.wnl.lemmatize(t) for t in analyzer(doc)]
    else:
        return lambda doc: Stemmer.Stemmer('en').stemWords(analyzer(doc))
def graph_data_from_links(links, filter_largest_subgraph=False,
                          ignore_self_loop=True, directed=False):
    print('start graph data')
    import csv, random, io, sys, os
    import collections
    import time
    import string
    import Stemmer
    # http://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
    csv.field_size_limit(sys.maxsize)
    import nltk.tokenize
    from nltk.corpus import stopwords

    stemmer = Stemmer.Stemmer('english')
    stopwords = set(stopwords.words('english')).union(
        set(stopwords.words('french')))
    punc_table = dict((ord(char), ' ') for char in string.punctuation
                      if char not in '_-')

    def tokenize(string):
        string = string.translate(punc_table)  # remove punctuation
        for begin, end in nltk.tokenize.WhitespaceTokenizer().span_tokenize(string):
            word = string[begin:end]
            if not word.isdigit() and word not in stopwords:
                yield word

    X = io.StringIO()
    X_writer = csv.writer(X, delimiter=' ')
    DTM = io.StringIO()
    DTM_writer = csv.writer(DTM, delimiter=' ')

    links = list(csv.reader(io.StringIO(links)))
    print('links loaded')

    if filter_largest_subgraph:
        # amazingly complex algorithm to find the subgraph and filter the links :)
        groups = {}
        groups_sizes = []
        g = 0
        for link in links:
            if len(link) > 1:
                source, target = link[0], link[1]
                if source not in groups and target in groups:
                    groups[source] = groups[target]
                    groups_sizes[groups[target]] += 1
                elif source in groups and target not in groups:
                    groups[target] = groups[source]
                    groups_sizes[groups[source]] += 1
                elif source not in groups and target not in groups:
                    groups[target], groups[source] = g, g
                    groups_sizes.append(0)
                    groups_sizes[g] += 2
                    g += 1
                elif groups[target] != groups[source]:
                    if groups_sizes[groups[target]] > groups_sizes[groups[source]]:
                        for node, group in groups.items():
                            if group == groups[source]:
                                groups[node] = groups[target]
                    else:
                        for node, group in groups.items():
                            if group == groups[target]:
                                groups[node] = groups[source]
        best_group = groups_sizes.index(max(groups_sizes))
        links = [
            link for link in links
            if len(link) > 1 and groups[link[0]] == best_group
        ]
        print('filtered largest subgraph')

    if not directed:
        # another amazing algo to symmetrize the links in case of undirected graphs
        new_links = []
        for link in links:
            new_links.append(link)
            new_links.append([link[1], link[0]] + link[2:])
        links = new_links
        print('symmetry forced')

    nodes_i = {}  # fast lookup of index
    terms_i = {}  # fast lookup of index
    nodes = []  # labels
    terms = []  # dictionary
    stemm_to_lemm = {}

    def node_to_i(node):
        if node in nodes_i:
            return nodes_i[node]
        nodes.append(node)
        i = len(nodes) - 1
        nodes_i[node] = i
        return i

    def term_to_i(term):
        if term in terms_i:
            return terms_i[term]
        terms.append(term)
        i = len(terms) - 1
        terms_i[term] = i
        return i

    print('start making edges', len(links))
    edges = collections.OrderedDict()
    for link in links:
        if len(link) > 1:
            # tokenization
            text = link[2] if len(link) > 2 else ''
            tokens = list(tokenize(text))
            # stemming
            if len(tokens) > 0:  # filter empty links
                start = node_to_i(link[0])
                end = node_to_i(link[1])
                if not ignore_self_loop or start != end:
                    edge_name = '%d,%d' % (start, end)
                    if edge_name not in edges:
                        edges[edge_name] = collections.Counter()
                    doc_terms = edges[edge_name]
                    for token, stemm in zip(tokens, stemmer.stemWords(tokens)):
                        token = token.lower()
                        if stemm in stemm_to_lemm:
                            lemm = stemm_to_lemm[stemm]
                        else:
                            stemm_to_lemm[stemm] = token.lower()
                            lemm = token
                        doc_terms[lemm] += 1
    print('edges made')

    def key_to_order_for_tdm(edge_name):
        start, end = [int(x) for x in edge_name.split(',')]
        return start + end * len(nodes)

    for curr_edge, edge_name in enumerate(
            sorted(edges.keys(), key=key_to_order_for_tdm)):
        start, end = edge_name.split(',')
        X_writer.writerow([start, end, 1])
        for token, count in edges[edge_name].items():
            DTM_writer.writerow([term_to_i(token), curr_edge, count])

    # add empty link to make the matrix square if it's not already a square
    start = end = len(nodes) - 1
    edge_name = '%d,%d' % (start, end)
    if edge_name not in edges:
        X_writer.writerow([start, end, 0])

    labels = io.StringIO()
    labels_writer = csv.writer(labels, delimiter=' ')
    labels_writer.writerow(nodes)
    dictionnary = io.StringIO()
    dictionnary_writer = csv.writer(dictionnary, delimiter=' ')
    dictionnary_writer.writerow(terms)
    print('data done')

    return {
        'edges': X.getvalue(),
        'tdm': DTM.getvalue(),
        'labels': labels.getvalue(),
        'dictionnary': dictionnary.getvalue()
    }
        adjusted_score = ''
    else:
        adjusted_score = float(row[2]) * (1 - similarity_score)
    out_row = row[:3] + row[5:7] + [adjusted_score]
    return out_row


if __name__ == "__main__":
    # Config
    DATA_DIR = '../data/aligner_output/'
    ALIGNMENT_OUTPUT = os.path.join(DATA_DIR, 'ncsl_alignments.csv')
    SCORES = os.path.join(DATA_DIR, 'ncsl_alignments_notext.csv')
    n = 1000  # size of comparison samples
    stemmer = Stemmer.Stemmer('english').stemWord
    logging.basicConfig(level=logging.INFO)

    with open(ALIGNMENT_OUTPUT, 'r', encoding='utf-8') as infile,\
         open(SCORES, 'w') as scorefile:
        reader = csv.reader(infile, delimiter=',', quotechar='"')
        score_writer = csv.writer(scorefile, delimiter=',', quotechar='"',
                                  quoting=csv.QUOTE_MINIMAL)

        # Count total number of rows in data
        m = sum([1 for row in reader])
        infile.seek(0)
import BasesFrases.Base_treinamento
import BasesFrases.Base_teste
import BasesFrases.Stop_words
import Stemmer
import Erros_classificador

# download updates
#nltk.download()

# variables
baseTreinamento = BasesFrases.Base_treinamento.vet_baseTreinamento
baseTeste = BasesFrases.Base_teste.vet_baseTeste
stopWords = BasesFrases.Stop_words.stopWordsNLTK

# apply stemming
frasesComStemmingTreinamento = Stemmer.aplicaStemmer(baseTreinamento)
frasesComStemmingTeste = Stemmer.aplicaStemmer(baseTeste)

# look up each word after breaking them all down to their stems
palavrasTreinamento = Stemmer.buscaPalavras(frasesComStemmingTreinamento)
palavrasTeste = Stemmer.buscaPalavras(frasesComStemmingTeste)

# number of times each word repeats
frequenciaTreinamento = Stemmer.buscaFrequencia(palavrasTreinamento)
frequenciaTeste = Stemmer.buscaFrequencia(palavrasTeste)

# words that do not repeat
palavrasUnicasTreinamento = Stemmer.buscaPalavrasUnicas(frequenciaTreinamento)
palavrasUnicasTeste = Stemmer.buscaPalavrasUnicas(frequenciaTeste)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import Stemmer
import spacy
"""
Program that searches for the emotional words inside a sentence and
obtains their stems.
"""
"""
We use Stemmer's Spanish package to obtain the stems.
"""
stemmer = Stemmer.Stemmer('spanish')
nlp = spacy.load('es')
"""
Some of the words in our dictionary share a stem and have to be handled
specifically so there is no conflict when looking them up.
"""
iguales = [
    "amig", "espos", "enfad", "guap", "habit", "her", "jubil", "novi",
    "odi", "pioj"
]
buscar_iguales = [
    "amigo", "esposo", "enfado", "guapo", "habitante", "herido",
    "jubiloso", "novio", "odio", "piojo"
]
derivables = ["afect", "asesin", "com", "inspir", "libr", "salud", "verd"]
derivadas = [["afecto", "afectivo", "afectiva", "afectuso", "afectividad"],
             ["asesino", "asesinato"],
             ["comida", "comedor"],
             ["inspirado", "inspiración"],
             ["libre", "librar"],
             ["saludar", "saludo"],
             ["verde", "verdoso", "verdear"]]
]:
    FORMAT = F_ZLEGACY

# scanData.py <hgw_file> [--stopcats=<stop category file>]
hgwpath = args[0]  # hgw/gum.xml

TITLE_WEIGHT = 4
STOP_CATEGORY_FILTER = bool(options.stopcats)

# reToken = re.compile('[a-zA-Z\-]+')
reToken = re.compile("[^ \t\n\r`~!@#$%^&*()_=+|\[;\]\{\},./?<>:’'\\\\\"]+")
reAlpha = re.compile("^[a-zA-Z\-_]+$")
NONSTOP_THRES = 100

STEMMER = Stemmer.Stemmer('porter')

# read stop word list from 'lewis_smart_sorted_uniq.txt'
wordList = []
try:
    f = open('lewis_smart_sorted_uniq.txt', 'r')
    for word in f.readlines():
        wordList.append(word.strip())
    f.close()
except IOError:
    print('Stop words cannot be read! Please put "lewis_smart_sorted_uniq.txt" '
          'file containing stop words in this folder.')
    sys.exit(1)

STOP_WORDS = frozenset(wordList)

if STOP_CATEGORY_FILTER:
def algorithms():
    if cext_available:
        return Stemmer.algorithms()
    else:
        return list(_languages.keys())
from Stemmer import *

s = Stemmer('russian')
while True:
    print(s.stemWord(input()))
def PystemStemming(text):
    stemmer_rus = Stemmer.Stemmer('russian')
    stemmer_en = Stemmer.Stemmer('english')
    words = text.split(" ")
    words_out = stemmer_en.stemWords(stemmer_rus.stemWords(words))
    return " ".join(words_out)
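# Usage sketch: chaining the Russian and English stemmers lets one pass
# handle mixed-language text, since each stemmer leaves words of the other
# language essentially untouched.
print(PystemStemming("running кошками"))  # expected: "run кошк"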
def __init__(self):
    self.nepali_stemmer = Stemmer.Stemmer('nepali')  # initializing nepali stemmer
def main():
    stemmer = Stemmer.Stemmer("english")
    print(stemmer.stemWord("cardsing"))
class EnglishTfidfVectorizer(TfidfVectorizer):
    english_stemmer = Stemmer.Stemmer('en')

    def build_analyzer(self):
        analyzer = super(EnglishTfidfVectorizer, self).build_analyzer()
        return lambda doc: self.english_stemmer.stemWords(analyzer(doc))
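# Usage sketch, assuming scikit-learn is installed: tokens are stemmed by
# PyStemmer before TF-IDF weighting, so 'car' and 'cars' map to one feature.
vect = EnglishTfidfVectorizer()
X = vect.fit_transform(["the cars", "my car"])
print(sorted(vect.vocabulary_))  # stemmed vocabulary, e.g. ['car', 'my', 'the']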
import re
import string

import Stemmer

# top 25 most common words in English and "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
STOPWORDS = set([
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it',
    'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this',
    'but', 'his', 'by', 'from', 'developer', 'engineer', 'quận', 'thành',
    'huyện', 'phố', 'city', 'district', 'street'
])
PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
STEMMER = Stemmer.Stemmer('english')


def tokenize(text):
    return text.split()


def lowercase_filter(tokens):
    return [token.lower() for token in tokens]


def punctuation_filter(tokens):
    return [PUNCTUATION.sub('', token) for token in tokens]


def stopword_filter(tokens):
    return [token for token in tokens if token not in STOPWORDS]
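# A stemming filter in the same style as the filters above, plus a small
# analyze() pipeline sketch chaining them in order.
def stem_filter(tokens):
    return STEMMER.stemWords(tokens)


def analyze(text):
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    return stem_filter(tokens)

# e.g. analyze("The developers were running") -> ['develop', 'were', 'run']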
def __init__(self):
    self.alphabet = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    self.allwords = AllWordsDB()
    self.stemmer = Stemmer.Stemmer('russian')
def _get_stemmer_fn(self):
    import Stemmer  # @UnresolvedImport

    stemmer = Stemmer.Stemmer(self.lang)
    stemmer.maxCacheSize = self.cachesize
    return stemmer.stemWord
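# Standalone sketch of what the returned function does (self.lang and
# self.cachesize belong to the surrounding class, so literals are substituted
# here): maxCacheSize bounds PyStemmer's internal cache of previously
# stemmed words.
import Stemmer

stemmer = Stemmer.Stemmer('english')
stemmer.maxCacheSize = 50000  # cache size chosen arbitrarily for this sketch
stem = stemmer.stemWord
print(stem('generously'))  # 'generous'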
def __init__(self, topic_model, api=None):
    self.resource_id = None
    self.stemmer = None
    self.seed = None
    self.case_sensitive = False
    self.bigrams = False
    self.ntopics = None
    self.temp = None
    self.phi = None
    self.term_to_index = None
    self.topics = []

    if not (isinstance(topic_model, dict) and 'resource' in topic_model
            and topic_model['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_topic_model_id(topic_model)
        if self.resource_id is None:
            raise Exception(
                api.error_message(topic_model,
                                  resource_type='topicmodel',
                                  method='get'))
        query_string = ONLY_MODEL
        topic_model = retrieve_resource(api, self.resource_id,
                                        query_string=query_string)
    else:
        self.resource_id = get_topic_model_id(topic_model)

    if 'object' in topic_model and isinstance(topic_model['object'], dict):
        topic_model = topic_model['object']

    if 'topic_model' in topic_model \
            and isinstance(topic_model['topic_model'], dict):
        status = get_status(topic_model)
        if 'code' in status and status['code'] == FINISHED:
            model = topic_model['topic_model']
            self.topics = model['topics']

            if 'language' in model and model['language'] is not None:
                lang = model['language']
                if lang in CODE_TO_NAME:
                    self.stemmer = Stemmer.Stemmer(CODE_TO_NAME[lang])

            self.term_to_index = {
                self.stem(term): index
                for index, term in enumerate(model['termset'])
            }
            self.seed = abs(model['hashed_seed'])
            self.case_sensitive = model['case_sensitive']
            self.bigrams = model['bigrams']
            self.ntopics = len(model['term_topic_assignments'][0])
            self.alpha = model['alpha']
            self.ktimesalpha = self.ntopics * self.alpha
            self.temp = [0] * self.ntopics

            assignments = model['term_topic_assignments']
            beta = model['beta']
            nterms = len(self.term_to_index)
            sums = [
                sum(n[index] for n in assignments)
                for index in range(self.ntopics)
            ]
            self.phi = [[0 for _ in range(nterms)]
                        for _ in range(self.ntopics)]
            for k in range(self.ntopics):
                norm = sums[k] + nterms * beta
                for w in range(nterms):
                    self.phi[k][w] = (assignments[w][k] + beta) / norm

            ModelFields.__init__(self, model['fields'])
        else:
            raise Exception("The topic model isn't finished yet")
    else:
        raise Exception("Cannot create the topic model instance. Could not"
                        " find the 'topic_model' key in the"
                        " resource:\n\n%s" % topic_model)
#ddir = 'E:/workspace/data/cdiscount/'
#wdir = 'C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/'
ddir = '/home/ngaude/workspace/data/cdiscount/'
wdir = '/home/ngaude/workspace/github/kaggle/cdiscount/'

stopwords = []
with open(wdir + 'stop-words_french_1_fr.txt', "r") as f:
    stopwords += f.read().split('\n')
with open(wdir + 'stop-words_french_2_fr.txt', "r") as f:
    stopwords += f.read().split('\n')
stopwords += nltk.corpus.stopwords.words('french')
stopwords += ['voir', 'presentation']
stopwords = set(stopwords)

stemmer = Stemmer.Stemmer('french')

rayon = pd.read_csv(ddir + 'rayon.csv', sep=';')
itocat1 = list(np.unique(rayon.Categorie1))
cat1toi = {cat1: i for i, cat1 in enumerate(itocat1)}
itocat2 = list(np.unique(rayon.Categorie2))
cat2toi = {cat2: i for i, cat2 in enumerate(itocat2)}
itocat3 = list(np.unique(rayon.Categorie3))
cat3toi = {cat3: i for i, cat3 in enumerate(itocat3)}

f_itocat = ddir + 'joblib/itocat'
itocat = (itocat1, cat1toi, itocat2, cat2toi, itocat3, cat3toi)
joblib.dump(itocat, f_itocat)


def normalize_txt(txt):
import json
import sys
import re
import Stemmer
import bisect
import math
from collections import defaultdict
import time

stemmer = Stemmer.Stemmer("english")

STOP_WORDS = set(['whence', 'here', 'show', 'were', 'why', 'n’t', 'the', 'whereupon', 'not', 'more', 'how', 'eight', 'indeed', 'i', 'only', 'via', 'nine', 're', 'themselves', 'almost', 'to', 'already', 'front', 'least', 'becomes', 'thereby', 'doing', 'her', 'together', 'be', 'often', 'then', 'quite', 'less', 'many', 'they', 'ourselves', 'take', 'its', 'yours', 'each', 'would', 'may', 'namely', 'do', 'whose', 'whether', 'side', 'both', 'what', 'between', 'toward', 'our', 'whereby', "'m", 'formerly', 'myself', 'had', 'really', 'call', 'keep', "'re", 'hereupon', 'can', 'their', 'eleven', '’m', 'even', 'around', 'twenty', 'mostly', 'did', 'at', 'an', 'seems', 'serious', 'against', "n't", 'except', 'has', 'five', 'he', 'last', '‘ve', 'because', 'we', 'himself', 'yet', 'something', 'somehow', '‘m', 'towards', 'his', 'six', 'anywhere', 'us', '‘d', 'thru', 'thus', 'which', 'everything', 'become', 'herein', 'one', 'in', 'although', 'sometime', 'give', 'cannot', 'besides', 'across', 'noone', 'ever', 'that', 'over', 'among', 'during', 'however', 'when', 'sometimes', 'still', 'seemed', 'get', "'ve", 'him', 'with', 'part', 'beyond', 'everyone', 'same', 'this', 'latterly', 'no', 'regarding', 'elsewhere', 'others', 'moreover', 'else', 'back', 'alone', 'somewhere', 'are', 'will', 'beforehand', 'ten', 'very', 'most', 'three', 'former', '’re', 'otherwise', 'several', 'also', 'whatever', 'am', 'becoming', 'beside', '’s', 'nothing', 'some', 'since', 'thence', 'anyway', 'out', 'up', 'well', 'it', 'various', 'four', 'top', '‘s', 'than', 'under', 'might', 'could', 'by', 'too', 'and', 'whom', '‘ll', 'say', 'therefore', "'s", 'other', 'throughout', 'became', 'your', 'put', 'per', "'ll", 'fifteen', 'must', 'before', 'whenever', 'anyone', 'without', 'does', 'was', 'where', 'thereafter', "'d", 'another', 'yourselves', 'n‘t', 'see', 'go', 'wherever', 'just', 'seeming', 'hence', 'full', 'whereafter', 'bottom', 'whole', 'own', 'empty', 'due', 'behind', 'while', 'onto', 'wherein', 'off', 'again', 'a', 'two', 'above', 'therein', 'sixty', 'those', 'whereas', 'using', 'latter', 'used', 'my', 'herself', 'hers', 'or', 'neither', 'forty', 'thereupon', 'now', 'after', 'yourself', 'whither', 'rather', 'once', 'from', 'until', 'anything', 'few', 'into', 'such', 'being', 'make', 'mine', 'please', 'along', 'hundred', 'should', 'below', 'third', 'unless', 'upon', 'perhaps', 'ours', 'but', 'never', 'whoever', 'fifty', 'any', 'all', 'nobody', 'there', 'have', 'anyhow', 'of', 'seem', 'down', 'is', 'every', '’ll', 'much', 'none', 'further', 'me', 'who', 'nevertheless', 'about', 'everywhere', 'name', 'enough', '’d', 'next', 'meanwhile', 'though', 'through', 'on', 'first', 'been', 'hereby', 'if', 'move', 'so', 'either', 'amongst', 'for', 'twelve', 'nor', 'she', 'always', 'these', 'as', '’ve', 'amount', '‘re', 'someone', 'afterwards', 'you', 'nowhere', 'itself', 'done', 'hereafter', 'within', 'made', 'ca', 'them'])
# print(type(STOP_WORDS))
STOP_WORDS.add("cite")

query = ""
# store = False
k = 10
weight = {
    "t": 100,
    "i": 20,
    "b": 1,
    "c": 20,
    "l": 0.05,
    "r": 0.05
}
first_words = ""
# SORT_SIZE = 10000
TITLE_SIZE = 2000
def __setstate__(self, state):
    self.__dict__ = state
    self._stemmer = Stemmer.Stemmer("english")
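# The C-backed stemmer can't be pickled, which is why __setstate__ above
# rebuilds it; a matching __getstate__ would drop it before serialization.
# This is a hedged sketch, not the original class's code.
def __getstate__(self):
    state = self.__dict__.copy()
    state.pop('_stemmer', None)
    return state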
def run():
    parser = argparse.ArgumentParser(description="A chat bot")

    # database options
    db_parser = argparse.ArgumentParser(add_help=False)
    db_parser.add_argument(
        '--dbname', default='chains',
        help="Specifies the brain database.")

    # simulation options
    note = ("Note that this option is overridden by database settings and "
            "so is only used at database initialisation time.")
    modelling_parser = argparse.ArgumentParser(add_help=False)
    modelling_parser.add_argument(
        '--chain-order', type=int, default=DEF_CHAIN_ORDER,
        help="Set the simulation chain size parameter. " + note)
    modelling_parser.add_argument(
        '--language', choices=Stemmer.algorithms(), default='english',
        help="Set the simulation language for the stemmer. " + note)

    # learning options
    learning_parser = argparse.ArgumentParser(add_help=False)
    learning_parser.add_argument(
        'infile', metavar='INFILE', nargs='?',
        type=argparse.FileType('r'), default=sys.stdin,
        help="An input file from which to learn")

    # reply options
    reply_parser = argparse.ArgumentParser(add_help=False)
    reply_parser.add_argument(
        'message', metavar='MSG', nargs='+', action='append',
        help="Specify a message to respond to.")

    subparsers = parser.add_subparsers(title='Subcommands', dest='subcommand')
    subparsers.required = True

    ### learn command ###
    learn_subparser = subparsers.add_parser(
        'learn',
        help="add source data to the corpus",
        parents=[learning_parser, db_parser, modelling_parser])
    learn_subparser.set_defaults(func=do_learn)

    ### response command
    reply_subparser = subparsers.add_parser(
        'reply',
        help="send a message to get a reply back",
        parents=[reply_parser, db_parser, modelling_parser])
    reply_subparser.set_defaults(func=do_response)

    ### shell command
    shell_subparser = subparsers.add_parser(
        'shell',
        help="enter an interactive shell",
        parents=[db_parser, modelling_parser])
    shell_subparser.set_defaults(func=do_shell)

    dargs = vars(parser.parse_args())
    for option in ('file', 'message'):
        if dargs.get(option):
            dargs[option] = [x for xs in dargs[option] for x in xs]
    dargs['func'](dargs)
def NMF_NLT_TFIDF():
    english_stemmer = Stemmer.Stemmer('en')

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            analyzer = super(TfidfVectorizer, self).build_analyzer()
            return lambda doc: english_stemmer.stemWords(analyzer(doc))

    cats = ['comp.graphics', 'comp.os.ms-windows.misc',
            'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
            'rec.autos', 'rec.motorcycles',
            'rec.sport.baseball', 'rec.sport.hockey']
    print("Loading 20 newsgroups dataset for categories:")
    pprint(list(cats))
    newsgroups = fetch_20newsgroups(subset='all', categories=cats)
    print("%d documents" % len(newsgroups.data))
    print("%d categories" % len(newsgroups.target_names))

    print("Creating stemmed TFxIDF representation...")
    t0 = time()
    vect = StemmedTfidfVectorizer(stop_words='english')
    vectors = vect.fit_transform(newsgroups.data)  # TFxIDF representation
    print("Done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % vectors.shape)

    workbook = xlsxwriter.Workbook('part3_NMF_NLT.xlsx')
    purityMetricsNames = ['Homogeneity', 'Completeness', 'V-measure',
                          'Adjusted Rand-Index',
                          'Adjusted Mutual Information Score']
    metric_list = {}

    for i in range(1, 21):
        print("Implementing NMF on data...")
        nmf_ = NMF(n_components=i)
        nmf_data = nmf_.fit_transform(vectors)
        print("Done.")

        # Applying non-linear transform
        print("Implementing non-linear transform on data...")
        offset = 0.001
        nmf_data_off = np.add(nmf_data, offset)
        log_nmf_data = np.log(nmf_data_off)
        print("Done.")

        labels = newsgroups.target
        labels_2 = []
        # Changing the labels from 0-7 to 0-1
        for mark in labels:
            if mark <= 3:
                labels_2.append(0)
            else:
                labels_2.append(1)

        k = 2
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
        print("Clustering sparse data with %s" % km)
        t0 = time()
        km.fit(log_nmf_data)
        print("done in %0.3fs" % (time() - t0))

        print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_2, km.labels_))
        print("Completeness: %0.3f" % metrics.completeness_score(labels_2, km.labels_))
        print("V-measure: %0.3f" % metrics.v_measure_score(labels_2, km.labels_))
        print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels_2, km.labels_))
        print("Adjusted Mutual Information Score: %.3f" % metrics.adjusted_mutual_info_score(labels_2, km.labels_))
        print(metrics.confusion_matrix(labels_2, km.labels_))

        purityMetrics = [metrics.homogeneity_score(labels_2, km.labels_),
                         metrics.completeness_score(labels_2, km.labels_),
                         metrics.v_measure_score(labels_2, km.labels_),
                         metrics.adjusted_rand_score(labels_2, km.labels_),
                         metrics.adjusted_mutual_info_score(labels_2, km.labels_)]

        # Writing to .xlsx file (For Confusion Matrix)
        worksheet = workbook.add_worksheet()
        obs = zip(km.labels_, labels_2)
        row = 0
        col = 0
        worksheet.write(row, col, 'Predictions')
        worksheet.write(row, col + 1, 'Actuals')
        worksheet.write(row, col + 6, 'Dimension')
        worksheet.write(row + 1, col + 6, i)

        metric_list = dict(zip(purityMetricsNames, purityMetrics))
        pprint(dict(metric_list))
        for key in metric_list.keys():
            row += 1
            worksheet.write(row, col + 11, key)
            worksheet.write(row, col + 12, metric_list[key])

        row = 0
        col = 0
        for pred, actual in obs:
            row += 1
            worksheet.write(row, col, pred)
            worksheet.write(row, col + 1, actual)

        row = 1
        for things in labels:
            worksheet.write(row, col + 2, things)
            row += 1

    workbook.close()
def tokenizer_snowballer(text):
    stemmer = Stemmer.Stemmer('spanish')
    return [
        stemmer.stemWord(t) for t in token_extract.findall(text.lower())
        if t not in vacias
    ]
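# Usage sketch for tokenizer_snowballer(): token_extract and vacias are
# defined elsewhere in the original module, so hypothetical stand-ins are
# assumed here.
import re
token_extract = re.compile(r'[a-záéíóúüñ]+')
vacias = {'de', 'la', 'los'}  # hypothetical Spanish stop words

print(tokenizer_snowballer('Los gatos corrían de la casa'))  # e.g. ['gat', 'corr', 'cas']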
def twitter_sentiment_analysis(self, custom_user_dictionary_dict, clean_tweet):
    '''
    - Determine the sentiment (polarity) of a tweet.

    Note: this simple model isn't trained to handle sarcasm or ironic
    sentences yet, e.g.: "Hoy es un maravilloso e impresionante día de
    mierda". This is a typically ironic tweet, but this model will
    categorize it as "Positive" by majority of positive words. However,
    words that aren't in positive_dictionary.json or
    negative_dictionary.json can be added by the user and marked as
    "Positive" or "Negative", so a sarcastic tweet could be categorized
    correctly, but only by majority of positive and negative words.
    '''
    import json
    from nltk.tokenize import word_tokenize
    import Stemmer
    from collections import Counter

    try:
        response_data = {}
        tokens = word_tokenize(clean_tweet)
        # Define Stemmer for Spanish Language
        stemmer = Stemmer.Stemmer('spanish')
        # Compare received tweet with the positive and negative
        # custom user dictionary
        positive = map(
            lambda x: x in custom_user_dictionary_dict['positive'],
            stemmer.stemWords(tokens))
        negative = map(
            lambda x: x in custom_user_dictionary_dict['negative'],
            stemmer.stemWords(tokens))
        pos = Counter(positive)[True]
        neg = Counter(negative)[True]
        total = pos + neg
        if total > 0:
            if pos == neg:
                response_data["polarity"] = "NU"  # Neutral
            else:
                if pos > neg:
                    response_data["polarity"] = "P"
                    # response_data["sentiment"] = (pos*100)/total
                else:
                    response_data["polarity"] = "N"
                    # response_data["sentiment"] = (neg*100)/total
            response_data["positive_sentiment_score"] = pos / len(tokens)
            response_data["negative_sentiment_score"] = neg / len(tokens)
            response_data["neutral_sentiment_score"] = (
                len(tokens) - (pos + neg)) / len(tokens)
        else:
            # No token matches any positive or negative word, so the
            # signal is too weak to determine a sentiment score
            response_data["polarity"] = "NU"
            # Note: the neutral score could be 1; if all the words are
            # neutral, it means a neutral dictionary is also needed
            response_data["positive_sentiment_score"] = 0
            response_data["negative_sentiment_score"] = 0
            response_data["neutral_sentiment_score"] = 1
        # We still don't have a way to determine a confidence level
        # with a sentiment analysis model built from a dictionary of
        # positive and negative words
        response_data["confidence"] = "Undefined"
    except Exception as e:
        response_data = {
            "polarity": None,
            "positive_sentiment_score": None,
            "negative_sentiment_score": None,
            "neutral_sentiment_score": None,
            "confidence": "Undefined"
        }
    return json.dumps(response_data)
def __init__(self, topic_model, api=None):
    self.resource_id = None
    self.stemmer = None
    self.seed = None
    self.case_sensitive = False
    self.bigrams = False
    self.ntopics = None
    self.temp = None
    self.phi = None
    self.term_to_index = None
    self.topics = []
    self.api = get_api_connection(api)
    self.resource_id, topic_model = get_resource_dict(
        topic_model, "topicmodel", api=self.api)

    if 'object' in topic_model and isinstance(topic_model['object'], dict):
        topic_model = topic_model['object']

    if 'topic_model' in topic_model \
            and isinstance(topic_model['topic_model'], dict):
        status = get_status(topic_model)
        if 'code' in status and status['code'] == FINISHED:
            self.input_fields = topic_model['input_fields']
            model = topic_model['topic_model']
            self.topics = model['topics']

            if 'language' in model and model['language'] is not None:
                lang = model['language']
                if lang in CODE_TO_NAME:
                    self.stemmer = Stemmer.Stemmer(CODE_TO_NAME[lang])

            self.term_to_index = {self.stem(term): index
                                  for index, term
                                  in enumerate(model['termset'])}
            self.seed = abs(model['hashed_seed'])
            self.case_sensitive = model['case_sensitive']
            self.bigrams = model['bigrams']
            self.ntopics = len(model['term_topic_assignments'][0])
            self.alpha = model['alpha']
            self.ktimesalpha = self.ntopics * self.alpha
            self.temp = [0] * self.ntopics

            assignments = model['term_topic_assignments']
            beta = model['beta']
            nterms = len(self.term_to_index)
            sums = [sum(n[index] for n in assignments)
                    for index in range(self.ntopics)]
            self.phi = [[0 for _ in range(nterms)]
                        for _ in range(self.ntopics)]
            for k in range(self.ntopics):
                norm = sums[k] + nterms * beta
                for w in range(nterms):
                    self.phi[k][w] = (assignments[w][k] + beta) / norm

            missing_tokens = model.get("missing_tokens")
            ModelFields.__init__(self, model['fields'],
                                 missing_tokens=missing_tokens)
        else:
            raise Exception("The topic model isn't finished yet")
    else:
        raise Exception("Cannot create the topic model instance. Could not"
                        " find the 'topic_model' key in the"
                        " resource:\n\n%s" % topic_model)
def getTerms(self, withPositions=False):
    #start = timeit.default_timer()
    # split by whitespace
    terms = re.split('[\s]', self.text)
    stemmer = Stemmer.Stemmer('english')

    # get the english stopwords
    stopwords = []
    with open('snowball_stopwords_EN.txt', 'r') as document:
        stopwords += list(filter(None, re.split("[ \n]", document.read())))
        document.close()

    if withPositions:
        termsPositions = []  # [[term0pos0, term0pos1,...], [term1pos0, term1pos1, term1pos2]

    for pos in range(len(terms)):
        # in case there is more than one term in this split by whitespace list position
        tempTermList = []
        # maintain websites
        url_match = re.findall(
            r'(https?://(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9'
            r'][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?://(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|ww'
            r'w\.[a-zA-Z0-9]+\.[^\s]{2,})', terms[pos])
        # maintain emails
        email_match = re.findall(r'[\w.-]+@[\w.-]+', terms[pos])
        # maintain words with hyphens
        hyphen_match = re.findall(r"([A-Za-z]+-[A-Za-z]+)", terms[pos])
        # maintain apostrophes
        apostrophe_match = re.findall(r"([A-Za-z]+'[A-Za-z]*)", terms[pos])
        # maintain acronyms
        acronyms_match = re.findall(r'\b(?:[a-zA-Z]\.){2,}', terms[pos])
        # maintain siglas
        siglas_match = re.findall(r'\b(?:[A-Z]){2,}', terms[pos])

        if url_match:
            if url_match[0].endswith(').') or url_match[0].endswith('),'):
                # ex: https://www.genomedetective.com/app/typingtool/cov).
                url_match = [url_match[0][:-2]]
            elif url_match[0].endswith(',') or url_match[0].endswith('.') or url_match[0].endswith(')') or \
                    url_match[0].endswith('}'):
                url_match = [url_match[0][:-1]]
            tempTermList = url_match
        elif email_match:
            tempTermList = email_match
        elif hyphen_match:
            tempTermList = hyphen_match
        elif apostrophe_match:
            if apostrophe_match[0].endswith('\''):
                apostrophe_match = [apostrophe_match[0][:-1]]
            tempTermList = apostrophe_match
        elif acronyms_match:
            tempTermList = acronyms_match
        elif siglas_match:
            tempTermList = siglas_match
        else:
            # remove html character entities, ex:
            term = re.sub(r'(&.+;)', '', terms[pos])
            # replaces all non-alphabetic characters by a space, splits on whitespace
            tempTermList = re.split('[\s]', re.sub(r'[^A-Za-z]', ' ', term))
            while '' in tempTermList:
                tempTermList.remove('')
            # lowercases all letters
            tempTermList = [term.lower() for term in tempTermList]
            # Removes stopwords from the list of the terms of the document.
            tempTermList = list(
                filter(lambda term: term not in stopwords, tempTermList))
            # Stems
            tempTermList = [stemmer.stemWord(term) for term in tempTermList]
            # ignores all tokens with less than 3 characters
            tempTermList = list(filter(lambda t: len(t) >= 3, tempTermList))

        if tempTermList != []:
            if withPositions:
                # [[term0pos0, term0pos1,...], [term1pos0, term1pos1, term1pos2]
                for termInd in range(len(tempTermList)):
                    if tempTermList[termInd] in self.terms:
                        termsPositions[self.terms.index(
                            tempTermList[termInd])] += [pos]
                    else:
                        self.terms += [tempTermList[termInd]]
                        termsPositions += [[pos]]
            else:
                self.terms += [
                    term for term in tempTermList if term not in self.terms
                ]

    if withPositions:
        #stop = timeit.default_timer()
        #print('getTerms: {} seconds'.format(stop - start))
        return self.terms, termsPositions
    #stop = timeit.default_timer()
    #print('getTerms: {} seconds'.format(stop - start))
    return self.terms
    u'Полиция Великобритании нашла основателя WikiLeaks, но, не арестовала',
    u'В Стокгольме и Осло сегодня состоится вручение Нобелевских премий'
]
# words = [u'В ДНР жалуются: Россия не дает денег на пенсию',
#          u'Крушение поезда в Индии: число погибших превысило 100 человек',
#          u'ДНР решила создать компьютерные игры о боях за Дебальцево и аэропорт',
#          u'Турция может не пойти в ЕС, а вступить в ШОС',
#          u'На Донбассе задержан минометчик ДНР, который обстреливал Майорск',
#          u'Политолог рассказал, как ускорить деоккупацию Крыма и Донбасса',
#          u'Столтенбкрг обсудил с Трампом будущее НАТО',
#          u'Обама призвал дать Трампу время и не ждать худшего',
#          u'Под Киевом столкнулись грузовик и автобус, есть погибший']

stop_words_list = stop_words.split(" ")
_stemmer = Stemmer('russian')

words = [w.lower() for w in words]  # convert all strings to lower case

unsymboled = []
for word in words:
    # remove punctuation marks with a regular expression
    s = re.sub(r'[.,!?;:{}\[\]()\-_]', '', word)
    unsymboled.append(s)

# split the sentences into individual words
listed = [s.split(" ") for s in unsymboled]

new = []
for sentence in listed:
    s = [i for i in sentence