def ProcessCorpus(V, L):
    """Turn the corpus read from stdin into sparse count vectors on stdout.

    Every input line is expected to hold at least three whitespace-separated
    fields: an id used to look up the label, a second id, and one or more
    tokens.  Tokens are decoded from UTF-8, stemmed and counted per lexicon
    column.

    :param V: mapping from stem to a zero-based column index.
    :param L: mapping from the first field of a line to its numeric label.
    :return: 0 on success, 1 if any exception interrupted processing.
    """
    try:
        stemmer = SpanishStemmer()
        for raw_line in stdin:
            fields = raw_line.split()
            if len(fields) < 3:
                stderr.write('Warning: Short line: \"%s\"\n' % ' '.join(fields))
                continue

            tid, uid = fields[0], fields[1]
            counts = [0 for _ in V]
            for token in fields[2:]:
                column = V.get(stemmer.stem(token.decode('utf-8')), None)
                # Tokens missing from the lexicon are silently ignored.
                if column is not None:
                    counts[column] += 1

            label = L[tid]
            if sum(counts) == 0:
                stderr.write('Warning: %s with null vector. Label: %d\n' % (tid, label))

            # One output line: "<label> 1:<c1> 2:<c2> ... # <tid> <uid>"
            stdout.write('%d ' % label)
            for position, count in enumerate(counts, 1):
                stdout.write('%d:%d ' % (position, count))
            stdout.write('# %s %s\n' % (tid, uid))
        return 0
    except Exception as ex:
        stderr.write('Exception: %s\n' % repr(ex))
        return 1
class Tokenizer(object):
    """
    Extracts the words from the documents retrieved by the `Crawler`.
    """

    def __init__(self, min_long=5):
        """
        Create a `Tokenizer`.

        :param min_long: minimum number of characters a token must have to be
            kept as a valid word. Defaults to five (5).
        """
        self.stemmer = SpanishStemmer()
        self.min_long = min_long

    def obtener_palabras(self, contenido):
        """
        Return the list of distinct words recovered from `contenido`.

        :param contenido: a string with the text content of the document
        :return: a list of strings (unordered, without duplicates), excluding
            Spanish stop words and tokens shorter than `min_long`
        """
        # NOTE(review): the stemmer is applied to the whole content as a single
        # string rather than token by token -- confirm this is intended.
        cont_stemed = self.stemmer.stem(contenido)
        # Split into words and drop duplicates.
        conjunto_palabras = set(re.split(r'\W+', cont_stemed))
        # Read the stop-word list once; the original re-read it (and scanned it
        # linearly) for every candidate word.
        palabras_vacias = frozenset(stopwords.words('spanish'))
        return [
            palabra for palabra in conjunto_palabras
            if palabra not in palabras_vacias
            and len(palabra) >= self.min_long
        ]
class ConceptComparerSpanishStem(ConceptComparerBase):
    """
    Implementation of a concept comparer based on a stemmer for spanish.

    Parameters
    ----------
    None.

    Notes
    -----
    This is a sub-class of
    :py:class:`~lingpy.meaning.concepts.ConceptComparerBase`. It uses a simple
    match of the stem of a given (spanish) string against a given concept
    (that is supposed to be a stemmed spanish word).

    See also
    --------
    ConceptComparerBase
    ConceptGraph

    """

    def __init__(self):
        self.stemmer = SpanishStemmer(True)
        # Strip a parenthesised remark together with one optional leading
        # space. The previous pattern had no quantifier on `[^)]`, so only
        # one-character parentheticals such as "(a)" were removed.
        self.re_brackets = re.compile(r" ?\([^)]*\)")

    def compare_to_concept(self, element, concept):
        """Compares a given element to a concept.

        Parameters
        ----------
        element : str
            The string (for example a lexical item: head or translation) to
            compare to the concept.
        concept : str or object
            The concept to compare to.

        Return
        ------
        match : bool
            True if element matches the given concept, False otherwise.

        Notes
        -----
        The `element` is supposed to be a spanish word, the concept a stemmed
        entry of the spanish Swadesh List. Multi-word elements never match.

        See also
        --------
        spanish_swadesh_list

        """
        element = self.re_brackets.sub("", element).strip()
        if " " in element:
            # Phrases are not stemmed, hence they cannot match a single stem.
            return False
        return bool(self.stemmer.stem(element) == concept)
class OOVclassifier(object):
    """Classify a token as Spanish, foreign (English) or out-of-vocabulary."""

    def __init__(self, stem=False, tagdir='/home/alangb/TWPP'):
        """
        :param stem: when true, affix checks use a stemmer; otherwise they use
            TreeTagger lemmas.
        :param tagdir: path to the TreeTagger installation directory (only
            used when `stem` is false).
        """
        dictionaries = dicts()
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_ES")
        self.ND = dictionaries.norm
        self.SD = dictionaries.lemario
        self.PND = dictionaries.names
        self.stem = stem
        if stem:
            self.stemmer = SpanishStemmer()
        else:
            self.tagger = TreeTagger(TAGLANG='es', TAGDIR=tagdir)

    def dictionary_lookup(self, word):
        """Return whether `word` is in SD, in PND, or among the values of ND."""
        return (word in self.SD or word in self.PND
                or word in self.ND.values())

    def affix_check(self, word):
        """Return whether the stem (or lemma) of `word` is backed by a dictionary.

        Only all-lowercase or title-cased words are considered.  In stem mode
        the word's stem must be a prefix of some entry of SD; in lemma mode the
        TreeTagger lemma must pass `dictionary_lookup`.
        """
        if not (word.islower() or word.istitle()):
            return False
        if self.stem:
            # The original computed the stem but then compared the *unstemmed*
            # word against the dictionary prefixes, which made stemming a
            # no-op; match on the stem instead.
            stem = self.stemmer.stem(word)
            if not stem:
                return False
            return any(entry.startswith(stem) for entry in self.SD)
        lemma = make_tags(self.tagger.tag_text(word))[0].lemma
        return self.dictionary_lookup(lemma)

    def check(self, word):
        """Return a truthy value when `word` is accepted as Spanish."""
        result = self.spanish_dict.check(word)
        if not result:
            result = self.dictionary_lookup(word) or self.affix_check(word)
        return result

    def check_NoES(self, word):
        """Return whether `word` (longer than one character) is in the English dictionary."""
        if len(word) > 1:
            return self.english_dict.check(word)
        return False

    def classify(self, word):
        """Return 1 for Spanish, 2 for English, 0 otherwise."""
        if self.check(word):
            return 1
        if self.check_NoES(word):
            return 2
        return 0
class SpanishStemmer(Normalizer):
    """Normalizer step that stems its input with the wrapped NLTK Spanish stemmer."""

    def __init__(self, next_normalizer=None):
        super(SpanishStemmer, self).__init__(next_normalizer)
        self._stemmer = NLTKSpanishStemmer()

    def _apply_normalizer(self, data):
        """Stem `data`.

        A list or tuple is stemmed element by element and returned as a list;
        any other value is passed to the stemmer as a single word.
        """
        if isinstance(data, (list, tuple)):
            return [self._stemmer.stem(item) for item in data]
        return self._stemmer.stem(data)
class StemmerProcessor(DocumentAtATimeCorpusProcessor):
    """Corpus processor that replaces every word of a document by its stem."""

    def __init__(self):
        super(StemmerProcessor, self).__init__()
        self.stemmer = SpanishStemmer()

    def process_document(self, document):
        """Return a new list holding the stem of each word in `document`."""
        return [self.stemmer.stem(token) for token in document]
def find_top_N_words(lang_entries, top_N, lang):
    """Count the words used in the players' chats and return the most frequent.

    For every chat of every player the words are first tallied as English,
    Spanish or other via `Lang_dicts.lang_index`; then every non-empty word
    (quotes removed, lower-cased) is counted in a `Lang_Dictionary`.  A
    stemmed version of the sentence is printed along the way.

    :param lang_entries: iterable of players; each exposes `c`, a mapping
        whose values are chat strings.
    :param top_N: number of entries to return.
    :param lang: passed through to `Lang_Dictionary`.
    :return: up to `top_N` `(count, word)` tuples, highest count first.
    """
    dictionary = Lang_Dictionary({}, lang)
    for player in lang_entries:
        for chat in player.c:
            # Per-chat tally of the language each word is attributed to.
            language = {'eng': 0, 'spn': 0, 'other': 0, 'tot': 0}
            sentence = player.c[chat]
            newlist = player.c[chat].strip().split(' ')
            newlist = [x.strip("''") for x in newlist]
            for word in newlist:
                language['tot'] += 1
                if word.lower() not in Lang_dicts.lang_index:
                    language['other'] += 1
                else:
                    word = Lang_dicts.lang_index[word.lower()]
                    if word == "english":
                        language['eng'] += 1
                    elif word == "spanish":
                        language['spn'] += 1
                    else:
                        language['other'] += 1
            # Only chats with few unknown words get a stemmer; the stemmer
            # language follows the majority of recognised words.
            if language['other'] < 2 * (language['spn'] + language['eng']):
                print(sentence)
                if language['spn'] > language['eng']:
                    print("SPANISH")
                    stemmer = SpanishStemmer()
                else:
                    print("ENGLISH")
                    stemmer = EnglishStemmer()
            # Rebuild the chat text character by character, then split it into
            # words again; `sentence` is reused for the stemmed output.
            aslist = []
            aslist += sentence
            sentence =""
            j = ''.join(aslist)
            words = j.split(' ')
            for line in words:
                line = str(line).replace('\'', '')
                line = line.replace('""', '')
                line = line.replace('"', '')
                if len(line) > 0:
                    # Same condition as above, so `stemmer` is bound here.
                    if language["other"] < 2 * (language['spn'] + language["eng"]):
                        # NOTE(review): the stemmer receives encoded bytes and
                        # the result is concatenated with a str -- this only
                        # works on Python 2; confirm the target version.
                        sentence += stemmer.stem(line.encode(sys.stdout.encoding, errors = 'replace')) + " "
                        print(sentence)
                    # INEFFICIENT - looking through dictionary each time?
                    if line.lower() not in dictionary.d:
                        dictionary.d[line.lower()] = 0
                    dictionary.d[line.lower()] += 1
    # wthCounts is a list of (count, word) pairs
    wthCounts = []
    # NOTE(review): dict.iteritems() exists only on Python 2.
    for(w,c) in dictionary.d.iteritems():
        wthCounts += [(c,w)]
    # wc is the wthCounts list sorted in descending order
    wc = sorted(wthCounts, reverse=True)
    return wc[:top_N]
def build_paragraph_inv_index(paragraphs, stem):
    """Build an inverted index from word to the paragraphs containing it.

    Stop words (checked before stemming) are skipped.  A paragraph position
    is recorded once per occurrence, so it may appear several times in a
    word's posting list.

    :param paragraphs: sequence of paragraph strings.
    :param stem: when truthy, index the Spanish stem instead of the raw word.
    :return: dict mapping word (or stem) to a list of paragraph positions.
    """
    inverted = {}
    stemmer = SpanishStemmer()
    for position, paragraph in enumerate(paragraphs):
        for token in paragraph.split():
            if token in STOP_WORDS:
                continue
            key = stemmer.stem(token) if stem else token
            inverted.setdefault(key, []).append(position)
    return inverted
def build_index_from_words(words, stem):
    '''
    Count how often each non-stop word occurs.

    Takes:
        - words, a list of strings
        - stem, truthy to count Spanish stems instead of the raw words
    Returns:
        - index, a dictionary mapping each word (or stem) to the number of
          times it appears in the document
    '''
    tally = {}
    stemmer = SpanishStemmer()
    for token in words:
        # Stop words are filtered on their surface form, before stemming.
        if token in STOP_WORDS:
            continue
        key = stemmer.stem(token) if stem else token
        tally[key] = tally.get(key, 0) + 1
    return tally
def generate_stopwords(stopname='stopSpanish.pkl'):
    """
    Build the Spanish stop-word list, pickle it and return it.

    The list merges the stop words provided by `stopwords.words` and
    `get_stop_words`, adds the stem of each of them plus a few extra tokens,
    and finally drops 'no'.  It is written to `{resource_path}/{stopname}`.
    """
    stemmer = SpanishStemmer()
    merged = set(stopwords.words('spanish')).union(set(get_stop_words('spanish')))

    collected = set(merged)
    for surface_form in list(merged):
        collected.add(stemmer.stem(surface_form))

    stopSpanish = list(collected)
    # Stop words not present in the standard lists.
    stopSpanish += ['tra', 'd', 'desc']
    # 'no' is kept in the texts: it helps identify negative categories.
    stopSpanish.remove('no')

    with open(f'{resource_path}/{stopname}', 'wb') as f:
        pickle.dump(stopSpanish, f)
    return stopSpanish
def getfeats(fields, o):
    """
    Build the feature list for one token.

    :param fields: token fields; `fields[0]` is the word, `fields[1]` its POS.
    :param o: offset of this token relative to the instance word; it is
        stringified and used as a prefix of every feature name.
    :return: list of `(feature_name, value)` tuples.
    """
    word = fields[0]
    stemmer = SpanishStemmer()
    prefix = str(o)
    # Presence flags are stored as 0/1 integers.
    with_hyphen = 1 if "-" in word else 0
    with_apostrophe = 1 if "'" in word else 0

    # One-character prefix/suffix and word-shape features are disabled.
    return [
        (prefix + "word", word),
        (prefix + 'pos', fields[1]),
        (prefix + 'prefix2', word[:2]),
        (prefix + 'prefix3', word[:3]),
        (prefix + 'prefix4', word[:4]),
        (prefix + 'suffix2', word[-2:]),
        (prefix + 'suffix3', word[-3:]),
        (prefix + 'suffix4', word[-4:]),
        (prefix + 'is_upper', word.isupper()),
        (prefix + 'is_title', word.istitle()),
        (prefix + 'is_digit', word.isdigit()),
        (prefix + 'with_hypen', with_hyphen),
        (prefix + 'with_apostrophe', with_apostrophe),
        (prefix + 'spanich_stem', stemmer.stem(word)),
    ]
def spanish_swadesh_list(stemmed=True):
    """
    Helper function that returns a list of strings with the entries of the
    spanish Swadesh list.

    :param stemmed: when true (default) the stem of each entry is returned,
        otherwise the stripped entry itself.
    :return: list of strings, or None when the stemmer could not be created.
    """
    try:
        stemmer = SpanishStemmer(True)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        log.warn("Spanish stemmer could not be loaded!")
        return None

    swadesh_entries = []
    swadesh_path = util.data_path('swadesh', 'swadesh_spa.txt')
    for line in util.read_text_file(swadesh_path, lines=True):
        # A line may hold several comma-separated alternatives.
        for e in line.strip().split(","):
            e = e.strip()
            swadesh_entries.append(stemmer.stem(e) if stemmed else e)
    return swadesh_entries
class WTAclassifier(object):
    """Classify a token as correct Spanish, foreign, or a variant to correct."""

    def __init__(self, lemma=False, stem=False):
        """
        :param lemma: also accept words whose lemma is in the extra dictionaries.
        :param stem: also accept words whose stem prefixes a lemario entry.
        """
        self.extra_dicts = Dicts()
        # NOTE(review): "en_EN" is not a usual locale tag -- confirm that the
        # installed enchant provider resolves it.
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_AR")
        self.lemma = lemma
        self.stem = stem
        self.VARIANT_CLASS = 0
        self.SPANISH_CLASS = 1
        self.FOREIGN_CLASS = 2
        if lemma:
            self.lemmatizer = Lemmatizer()
        if stem:
            self.stemmer = SpanishStemmer()

    def check(self, word):
        """Return a truthy value when `word` is accepted as Spanish.

        Checks run in order and stop at the first success: enchant dictionary,
        extra dictionaries, lemma (if enabled), stem prefix (if enabled).
        """
        result = self.spanish_dict.check(word)
        if not result:
            result = self.extra_dicts.is_valid(word)
        if self.lemma and not result:
            lemma = self.lemmatizer.lemmatize(word)
            result = self.extra_dicts.is_valid(lemma)
        if self.stem and not result:
            result = self.affix_check(word)
        return result

    def affix_check(self, word):
        """Return True when the stem of `word` is a prefix of a lemario entry.

        The lemario is looked up by the first character of the stem.
        """
        stem = self.stemmer.stem(word.lower())
        if not stem:
            # An empty stem has no first character to look up (the original
            # raised IndexError here).
            return False
        candidates = self.extra_dicts.lemario.get(stem[0], ())
        return any(entry.startswith(stem) for entry in candidates)

    def check_NoES(self, word):
        """Return whether `word` is in the English dictionary."""
        if len(word) > 1:
            # Tokens like 'x' or 'q' appear in english dict and return True
            return self.english_dict.check(word)
        return False

    def classify(self, word):
        """Return the class number assigned to `word`."""
        if self.check(word):
            return self.SPANISH_CLASS    # Correct word in spanish
        if self.check_NoES(word):
            return self.FOREIGN_CLASS    # Correct word in another language
        return self.VARIANT_CLASS        # Variant word (to correct)

    def is_variant(self, class_number):
        return class_number == self.VARIANT_CLASS

    def is_correct(self, class_number):
        return class_number == self.SPANISH_CLASS

    def is_not_spanish(self, class_number):
        return class_number == self.FOREIGN_CLASS
def steming(text: Text) -> List[Text]:
    """Return the Spanish stem of every token produced by `cleanText(text)`."""
    stemmer = SpanishStemmer()
    stems = []
    for token in cleanText(text):
        stems.append(stemmer.stem(token))
    return stems
def get_vector_matrix(self, freq_floor=50, context_words=3, n_components=1500):
    """Build a reduced feature matrix for the words of `self.corpus`.

    Each word (longer than two characters, not a stop word, not a number) is
    described by counts of its POS tag, the POS tags of its neighbours, its
    dependency relation with the stemmed head, and a few random context
    words.  Words seen `freq_floor` times or fewer are dropped.  The feature
    dicts are vectorised, normalised and reduced with truncated SVD.

    :param freq_floor: words whose 'freq' feature is <= this value are removed.
    :param context_words: number of random context words sampled per
        occurrence.
    :param n_components: number of SVD components to keep.
    :return: tuple `(self.words, vec_matrix)`; rows of `vec_matrix` follow the
        order of `self.words`.
    """
    nlp = es_core_web_md.load()
    STOPWORDS = spacy.es.STOP_WORDS

    def _clean_sent(sent):
        """Return the lower-cased words of `sent` minus stop words and numbers."""
        return [
            lowered for lowered in (raw.lower() for raw in sent)
            if lowered not in STOPWORDS and not lowered.isdigit()
        ]

    def _update_feature(word, feature_name, features):
        """Dirty update: set the feature to its stored count + 1, or to 1."""
        counts = 1
        if word in vectors and feature_name in vectors[word]:
            counts = vectors[word][feature_name] + 1
        features[feature_name] = counts
        return features

    sents = self.corpus.get_sents()
    stemmer = SpanishStemmer()
    # Words are the keys, dicts of features the values.
    vectors = {}

    for sent in sents:
        # TODO: parallelise (e.g. nlp.pipe with batches).
        # Stop words are removed only to pick the context words.
        cleaned_sent = _clean_sent(sent)
        doc = nlp(' '.join(sent))
        n_tokens = len(doc)
        for word_idx in range(n_tokens):
            spacy_word = doc[word_idx]
            word = spacy_word.text.lower()
            pos_tag = spacy_word.pos_
            if len(word) <= 2 or word in STOPWORDS or word.isdigit():
                continue

            features = vectors[word] if word in vectors else {}

            # Context related features (POS of the word and its neighbours).
            features = _update_feature(word, pos_tag, features)
            if word_idx > 0:
                prev_tag = doc[word_idx - 1].pos_
                features = _update_feature(word, prev_tag + '_pos_prev', features)
            # Bound by the number of *tokens*: the original compared against
            # len(sent), which can differ from len(doc) and either skipped
            # the feature or indexed past the end of the document.
            if word_idx < n_tokens - 1:
                post_tag = doc[word_idx + 1].pos_
                features = _update_feature(word, post_tag + '_pos_post', features)

            # Dependency feature; the head of the relation is stemmed.
            dep_type = spacy_word.dep_
            if dep_type != 'ROOT':
                dep_obj = stemmer.stem(spacy_word.head.text.lower())
                feature_name = 'DEP:' + dep_type + '-' + dep_obj
                features = _update_feature(word, feature_name, features)

            # Random context words as features.
            # NOTE(review): the original comment says these are stemmed, but
            # the feature name has always used the unstemmed word; kept as is.
            if cleaned_sent:  # random.choice raises on an empty sequence
                for _ in range(context_words):
                    ctxt_word = random.choice(cleaned_sent)
                    features = _update_feature(word, ctxt_word + '_ctxt_word', features)

            # TODO: add a synset (wordnet) feature.
            features['word'] = word
            # Frequency counting.
            features = _update_feature(word, 'freq', features)
            vectors[word] = features

    # Drop infrequent words.
    words_to_pop = {word for word, f_dict in vectors.items()
                    if f_dict['freq'] <= freq_floor}
    for word in words_to_pop:
        vectors.pop(word)

    # 'freq' is an irrelevant dimension from here on: zero it out.
    for f_dict in vectors.values():
        f_dict['freq'] = 0

    # TODO: normalise the POS contexts, add lemmatised context words and
    # normalise all contexts with a frequency dictionary of every feature.

    # Same order as vectors.values().
    self.words = list(vectors.keys())

    vectorizer = DictVectorizer(dtype=numpy.int32)
    vec_matrix = vectorizer.fit_transform(list(vectors.values()))
    print(vec_matrix.get_shape())

    # Normalisation.
    vec_matrix = normalize(vec_matrix, copy=False)

    # Unsupervised dimensionality reduction: SVD (PCA).
    Trunc_svd = TruncatedSVD(n_components=n_components)
    vec_matrix = Trunc_svd.fit_transform(vec_matrix)

    # NOTE(review): vec_matrix now holds SVD components, not the original
    # feature columns, so this inverse_transform output is of doubtful use.
    print(vectorizer.inverse_transform(vec_matrix))  # -> to see features!
    return self.words, vec_matrix
# Average polarity of the reviews of each category, looked up by word stem
# in `diccionario_polaridad`.
resenhas_categoria = {}
# `with` guarantees the pickle file is closed even if loading fails.
with open('resenhas_por_categoria_dict.pk1', 'rb') as inputt:
    resenhas_categoria = load(inputt)

categoria_polaridad = []
palabras_no_encontradas = 0
total_palabras_encontradas = 0
ss = SpanishStemmer()

for categoria in resenhas_categoria:
    valor_categoria = 0
    palabras_encontradas = 0
    for resenha in resenhas_categoria[categoria]:
        # resenha[0] holds the review text.
        for word in resenha[0].split():
            # Stem once per word instead of once for the test and once for
            # the lookup.  A polarity of 0 still counts as "not found", as in
            # the original truthiness test.
            polaridad = diccionario_polaridad.get(ss.stem(word).lower())
            if polaridad:
                palabras_encontradas += 1
                valor_categoria += polaridad
            else:
                palabras_no_encontradas += 1
    total_palabras_encontradas += palabras_encontradas
    # The average is per review; a category without reviews gets 0 instead of
    # raising ZeroDivisionError.
    cantidad_resenhas = len(resenhas_categoria[categoria])
    polaridad_promedio = valor_categoria / cantidad_resenhas if cantidad_resenhas else 0
    categoria_polaridad.append((categoria, polaridad_promedio))

print()
for cp in categoria_polaridad:
    print(cp)
print()
print('>>>', total_palabras_encontradas, ' palabras encontradas.')
print('>>>', palabras_no_encontradas, ' palabras no encontradas.')
def tokenizer_stemmer_global(document):
    """Tokenize `document` and return the Spanish stem of every token.

    Tokens are maximal runs of word characters and apostrophes.
    """
    stemmer = SpanishStemmer()
    # Raw string: "\w" in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    my_tokenizer = RegexpTokenizer(r"[\w']+")
    return [stemmer.stem(token) for token in my_tokenizer.tokenize(document)]
def stemmer_all(tweet):
    """Lower-case `tweet` and return its stemmed words joined by single spaces.

    The text is split on single spaces; pieces that are empty after stripping
    are dropped.
    """
    stemmer = SpanishStemmer()
    stems = []
    for piece in tweet.lower().split(' '):
        token = piece.strip()
        if token:
            stems.append(stemmer.stem(token))
    return ' '.join(stems)
# -*- coding: utf-8 -*-
"""
Created on Mon May 6 16:07:59 2019

@author: Turing

Builds a {stem: polarity} dictionary from 'senticon.es.xml' and pickles it
to 'diccionario_polaridades.pk1'.
"""
from bs4 import BeautifulSoup as Soup
from _pickle import dump
from nltk.stem.snowball import SpanishStemmer

# Context managers close both files even when parsing or dumping fails.
with open('senticon.es.xml', encoding="utf-8") as xml_file:
    handler = xml_file.read()
soup = Soup(handler, 'lxml')

diccionario_polaridad = {}
ss = SpanishStemmer()
for lemma in soup.find_all('lemma'):
    palabra = lemma.get_text()
    polaridad = float(lemma.attrs["pol"])
    # Spaces are removed from the lemma text before stemming; when two lemmas
    # share a stem the last one wins.
    diccionario_polaridad[ss.stem(palabra.replace(' ', '')).lower()] = polaridad

with open("diccionario_polaridades.pk1", "wb") as output:
    # -1 selects the highest pickle protocol available.
    dump(diccionario_polaridad, output, -1)
def remove_stopwords(text, stopSpanish):
    """Drop the stop words from `text` and stem what remains.

    :param text: whitespace-separated text.
    :param stopSpanish: container of stop words, matched against the raw
        (unstemmed) tokens.
    :return: the stems of the remaining tokens joined by single spaces.
    """
    stemmer = SpanishStemmer()
    stems = []
    for token in text.split():
        if token in stopSpanish:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)
from operator import attrgetter from pylons import config from nltk.stem.snowball import SpanishStemmer stemmer = SpanishStemmer(True) # load swadesh list swadesh_file = codecs.open(os.path.join(os.path.dirname( os.path.realpath( __file__)), "spa.txt"), "r", "utf-8") swadesh_list = [] for line in swadesh_file: line = line.strip() for e in line.split(","): stem = stemmer.stem(e) swadesh_list.append(stem) def init_model(engine): """Call me before using any of the tables or classes in the model""" Session.configure(bind=engine) entry_table = schema.Table('entry', meta.metadata, schema.Column('id', types.Integer, schema.Sequence('entry_seq_id', optional=True), primary_key=True), schema.Column('head', types.Unicode(255)), schema.Column('fullentry', types.Text), schema.Column('is_subentry', types.Boolean), schema.Column('is_subentry_of_entry_id', types.Integer), schema.Column('dictdata_id', types.Integer, schema.ForeignKey('dictdata.id')), schema.Column('book_id', types.Integer, schema.ForeignKey('book.id')),
def remove_pattern(input_txt, pattern):
    """Return `input_txt` with every match of the regex `pattern` removed."""
    for match in re.findall(pattern, input_txt):
        # The matched text is removed literally.  The original passed it back
        # to re.sub as a pattern, so any regex metacharacter inside a match
        # was interpreted instead of matched.
        input_txt = input_txt.replace(match, '')
    return input_txt


# Drop the tweet author
df = df.drop(columns=['user'])

# Remove @usernames from the tweet
df['tiny_tweet'] = np.vectorize(remove_pattern)(df['tweet'], r"@[\w]*")

# Remove punctuation, digits and special characters.  regex=True is explicit
# because pandas >= 2.0 treats the pattern as a literal string by default.
# NOTE(review): this also strips accented letters and 'ñ' -- confirm that is
# acceptable for Spanish tweets.
df['tiny_tweet'] = df['tiny_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

# Drop short words (3 characters or fewer)
df['tiny_tweet'] = df['tiny_tweet'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

# Tokenize the tweets
df['tokens'] = df['tiny_tweet'].apply(lambda x: x.lower().split())

# Stemming
stemmer = SpanishStemmer()
df['stems'] = df['tokens'].apply(lambda x: [stemmer.stem(i) for i in x])

# Save the cleaned dataset
df.to_csv('./processed_tweets.csv', index=False)
class ScoreParagraphs(object):
    """Retrieve the paragraphs that contain the query words and score them.

    Indices are read from `<cwd>/indices/` and law texts from `<cwd>/leyes/`.
    Candidate paragraphs are collected in `self.results`, a DataFrame with
    the columns 'text', 'law' and 'score'.
    """

    def __init__(self, question, words, stem):
        """
        :param question: the original question (used by the 'proximity' method).
        :param words: list of query words.
        :param stem: whether to work on stems; also selects the `_stem`
            variants of the index files.
        """
        self.question = question
        self.stem = stem
        self.stemmer = SpanishStemmer()
        self.words = words
        self.stemmed_words = self.stem_words(self.words)
        self.path_pfx = os.getcwd()
        self.inverted_index = self.load_doc_inverted_index()
        self.doc_names = self.init_doc_names()
        self.paragraph_indices = {}
        self.paragraph_inverted_indices = {}
        self.results = pd.DataFrame(columns=['text', 'law', 'score'])
        self.load_paragraph_indices()
        # Length normaliser used by bm25; manually obtained using bash.
        self.L = 23055.676666666666
        # Per-instance memo tables for the scoring functions.
        self.scores = {'tf': {}, 'idf': {}, 'tfidf': {}, 'n_containing': {},
                       'score': {}}

    def stem_words(self, words):
        """Return the set of distinct words, stemmed when `self.stem` is set.

        The original returned an empty set whenever stemming was disabled,
        which left the unstemmed mode without any query word.
        """
        if not self.stem:
            return set(words)
        return {self.stemmer.stem(word) for word in words}

    def load(self, filename):
        """Return the JSON content of `filename` (UTF-8)."""
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)

    def load_doc_inverted_index(self):
        """Load the word -> document names index."""
        filename = self.path_pfx + '/indices/inverted{}.json'.format(
            self.stem * '_stem')
        return self.load(filename)

    def init_doc_names(self):
        """Return the documents listed for any query word (may repeat)."""
        return [
            doc_name
            for word in self.stemmed_words
            if word in self.inverted_index
            for doc_name in self.inverted_index[word]
        ]

    def load_paragraph_indices(self):
        """Load the paragraph index and inverted index of every candidate document."""
        for doc in self.doc_names:
            filename = self.path_pfx + '/indices/{}.json'.format(
                doc + self.stem * '_stem')
            self.paragraph_indices[doc] = self.load(filename)
            filename = self.path_pfx + '/indices/{}_p.json'.format(
                doc + self.stem * '_stem')
            self.paragraph_inverted_indices[doc] = self.load(filename)

    def where_should_i_look(self):
        '''
        Collect in `self.results` every paragraph that contains a query word.

        Each paragraph is added with a score of 0; returns `self.results`.
        '''
        for law in self.doc_names:
            filename = self.path_pfx + '/leyes/{}.txt'.format(law)
            text = get_text(filename)
            _, paragraph_list = preprocess_text_to_words(text)
            for word in self.stemmed_words:
                paragraphs = self.paragraph_inverted_indices[law].get(word, [])
                results = [[paragraph_list[x], law, 0] for x in paragraphs]
                df_temp = pd.DataFrame(results,
                                       columns=['text', 'law', 'score'])
                # DataFrame.append was removed in pandas 2.0.
                self.results = pd.concat([self.results, df_temp],
                                         ignore_index=True)
        return self.results

    ## http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/
    def tf(self, word, document):
        """Return the (memoised) relative frequency of `word` in `document`."""
        s = (word, document)
        if s not in self.scores['tf']:
            self.scores['tf'][s] = document.words.count(word) / len(
                document.words)
        return self.scores['tf'][s]

    def n_containing(self, word, doclist):
        """Return the (memoised) number of documents containing `word`.

        The memo is keyed on the word alone, so it assumes the same `doclist`
        for the lifetime of the instance.
        """
        s = (word)
        if s not in self.scores['n_containing']:
            self.scores['n_containing'][s] = sum(1 for doc in doclist
                                                 if word in doc.words)
        return self.scores['n_containing'][s]

    def idf(self, word, doclist):
        """Return the (memoised) value log(N / (1 + n_containing))."""
        s = (word)
        if s not in self.scores['idf']:
            self.scores['idf'][s] = math.log(
                len(doclist) / (1 + self.n_containing(word, doclist)))
        return self.scores['idf'][s]

    def tfidf(self, word, document, doclist):
        """Return the (memoised) product tf * idf."""
        s = (word, document)
        if s not in self.scores['tfidf']:
            self.scores['tfidf'][s] = self.tf(word, document) * self.idf(
                word, doclist)
        return self.scores['tfidf'][s]

    def bm25(self, word, document, doclist, k=2.0, b=0.75):
        '''
        Takes,
            word, a string
            document, a blob object
            doclist, a list with blob objects
            k, b, the saturation and length-normalisation parameters
        Returns the (memoised) BM25-style score of `word` in `document`:
            idf * tf * (k + 1) / (tf + k * (1 - b + b * len(document) / L))
        '''
        s = (word, document)
        if s not in self.scores['score']:
            tf = self.tf(word, document)
            length_norm = k * (1 - b + b * len(document) / self.L)
            self.scores['score'][s] = self.idf(word, doclist) * \
                tf * (k + 1) / (tf + length_norm)
        return self.scores['score'][s]

    def score_docs(self, documents, words, method, k, b):
        '''
        Score every document and store the result in `self.results['score']`.

        documents, a list of strings (paragraphs)
        words, a list of strings (words)
        method, one of 'word_count', 'bm25' or 'proximity'
        Returns the list of scores, aligned with `documents`.
        '''
        blobs = [tb(paragraph) for paragraph in documents]
        rv = [0] * len(blobs)
        for word in words:
            for i, blob in enumerate(blobs):
                if method == 'word_count':
                    rv[i] += blob.words.count(word)
                elif method == 'bm25':
                    rv[i] += self.bm25(word, blob, blobs, k, b)
                elif method == 'proximity':
                    rv[i] += jaro_winkler(self.question, blob.string)
        self.results['score'] = rv
        return rv

    def drop_duplicates_and_short_paragraphs(self, min_size):
        """Remove duplicate rows and paragraphs of `min_size` words or fewer."""
        self.results.drop_duplicates(inplace=True)
        long_enough = self.results.text.apply(
            lambda text: len(text.split()) > min_size)
        self.results = self.results[long_enough]

    def load_law_names(self, filename):
        """Replace the 'law' column by the 'Law' name read from a CSV file."""
        rv = pd.read_csv(filename, header=None, names=['law', 'Law'])
        self.results = self.results.merge(rv, on='law')
        del self.results['law']

    def texts(self, top_k, method, k=2, b=0.75):
        '''
        Run the whole pipeline and return the `top_k` best scored paragraphs.
        '''
        words = self.stemmed_words
        if self.stem:
            print('Working with stemmed words: {}'.format(words))
        else:
            print('Working with words: {}'.format(words))
        self.where_should_i_look()
        self.drop_duplicates_and_short_paragraphs(4)
        paragraphs = self.results.text
        # NOTE(review): stem_words returns a set, so repeated words inside a
        # paragraph collapse before scoring -- confirm this is intended.
        texts = [
            ' '.join(self.stem_words(paragraph.split()))
            for paragraph in paragraphs
        ]
        self.score_docs(texts, words, method, k, b)
        self.results.sort_values('score', ascending=False, inplace=True)
        law_filename = self.path_pfx + '/doc/docnames.csv'
        self.load_law_names(law_filename)
        print(self.results.head(top_k))
        return self.results.head(top_k)
lista_articulos_tokens.append(articulo_tokenizado)

for articulo in lista_articulos_tokens:
    print()
    print(' '.join(articulo))

pol_dict = {}
# `with` closes the pickle file even if loading fails.
with open('diccionario_polaridades_senticon.pkl', 'rb') as inputt:
    pol_dict = load(inputt)


def lemmatize_sent(sent):
    """Tokenize `sent`, drop the non-alphabetic tokens and lemmatize the rest."""
    return utils.lemmatize_text(
        utils.remove_unalphabetic_words(nltk.word_tokenize(sent)))


# Average polarity of each article over the words found in the dictionary.
print('article', '\t\t', 'polarity')
for aspect_i, articulo_tokens in enumerate(lista_articulos_tokens):
    aspect_value = 0
    finded_words = 0
    for word in articulo_tokens:
        # Stem once per word.  A polarity of 0 is still treated as "not
        # found", as in the original truthiness test.
        word_polarity = pol_dict.get(ss.stem(word))
        if word_polarity:
            finded_words += 1
            aspect_value += word_polarity
    if finded_words > 0:
        aspect_value = aspect_value / finded_words
        print(aspect_i, '\t', aspect_value)
    else:
        print(aspect_i, 0, 0)
def export_swadesh_entries(input_path, output_path=None):
    """Export the corpus entries whose Spanish side contains a Swadesh word.

    Copies the listed metadata CSV files from `input_path` to `output_path`
    and writes filtered copies of entry.csv/annotation.csv and
    wordlistentry.csv/wordlistannotation.csv.

    :param input_path: directory with the corpus CSV files.
    :param output_path: destination directory.
        NOTE(review): the default None is passed straight to os.path.join,
        so a real path appears to be required -- confirm with callers.
    """
    print("Input: {0}".format(input_path))
    print("Ouput: {0}".format(output_path))

    cr = CorpusReaderDict(input_path)
    print("Data loaded")

    # Tables that are copied unchanged.
    files = [ "book.csv", "component.csv", "corpusversion.csv", "dictdata.csv", "language_iso.csv", "language_bookname.csv", "language_src.csv", "language_tgt.csv", "nondictdata.csv", "wordlistdata.csv", "wordlistconcept.csv" ]
    for f in files:
        shutil.copyfile(os.path.join( input_path, f), os.path.join(output_path, f))

    from nltk.stem.snowball import SpanishStemmer
    stemmer = SpanishStemmer()
    import qlc.utils

    # get stopwords
    stopwords = qlc.utils.stopwords_from_file(os.path.join(os.path.dirname( os.path.realpath( __file__)), "data", "stopwords", "spa.txt"))

    # load swadesh list (one or more comma-separated entries per line; the
    # stem of each entry is stored)
    # NOTE(review): this file handle is never closed.
    swadesh_file = codecs.open(os.path.join(os.path.dirname( os.path.realpath( __file__)), "data", "swadesh", "spa.txt"), "r", "utf-8")
    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    entry_ids = []
    dictdata_ids = cr.dictdata_string_ids
    for dictdata_id in dictdata_ids:
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
        # is there some spanish?
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue

        for entry_id, head, translation in \
                cr.ids_with_heads_with_translations_for_dictdata_id( dictdata_id):
            # Make `translation` always hold the Spanish side.
            if src_language_iso == [ 'spa' ]:
                (head, translation) = (translation, head)

            # NOTE(review): `[^)]` has no quantifier, so only one-character
            # parentheticals such as "(a)" are removed -- confirm intent.
            translation = re.sub(" ?\([^)]\)", "", translation)
            # NOTE(review): an entry that is itself a stop word is exported;
            # verify that this is intended.
            if translation in stopwords:
                entry_ids.append(entry_id)
            else:
                translation = qlc.utils.remove_stopwords(translation, stopwords)
                phrase_stems = qlc.utils.stem_phrase(translation, stemmer, True)
                for stem in phrase_stems:
                    if stem in swadesh_entries:
                        entry_ids.append(entry_id)

    #print(len(entry_ids))
    #return

    input_entry_csv = os.path.join(input_path, "entry.csv")
    output_entry_csv = os.path.join(output_path, "entry.csv")
    input_annotation_csv = os.path.join(input_path, "annotation.csv")
    output_annotation_csv = os.path.join(output_path, "annotation.csv")
    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")
    annotation_dict = collections.defaultdict(list)

    # cache annotations for lookup; the header line (i == 0) is copied as is
    for i, line in enumerate(fileinput.input( input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[ data[_annotation_table_columns['entry_id'] + 1]].append(line)
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")
    count_entries = 0
    # Copy the header, then every selected entry followed by its annotations.
    for i, line in enumerate(fileinput.input( input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        # entry_ids is a list, so this membership test is linear per line.
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)
    fileinput.nextfile()
    output.close()
    output_annotation.close()

    # Wordlists
    # NOTE(review): reads sys.argv[1] rather than `input_path` -- confirm.
    cr = CorpusReaderWordlist(sys.argv[1])
    print("Data loaded")

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    wordlistdata_ids = cr.wordlistdata_string_ids
    # Group the wordlist data ids by the bibtex key prefix of their string id.
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)

    wordlistentry_ids = []
    # NOTE(review): this iterates over `bibtex_key` (the last key string, i.e.
    # its characters), not over `bibtex_keys` -- looks like a typo; confirm.
    for bibtex_key in bibtex_key:
        # first collect all concepts in this book where the spanish counterpart
        # has one of the swadesh words
        concepts = []
        # NOTE(review): `wordlistentry_ids` is still empty here, so this loop
        # body never runs; `wordlistdata_id` is not defined in this function
        # and `dictdata_id` is left over from the dictionary loop above.
        for wordlistentry_id in wordlistentry_ids:
            language_iso = cr.get_language_code_for_wordlistdata_id( wordlistdata_id)
            # is there some spanish?
            if language_iso != ['spa']:
                continue

            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id( dictdata_id):
                counterpart = re.sub(" ?\([^)]\)", "", counterpart)
                if counterpart in stopwords:
                    entry_ids.append(entry_id)
                else:
                    counterpart = qlc.utils.remove_stopwords( counterpart, stopwords)
                    phrase_stems = qlc.utils.stem_phrase( counterpart, stemmer, True)
                    for stem in phrase_stems:
                        if stem in swadesh_entries:
                            concepts.append(concept)

        # now collect the entry ids for those concepts
        # NOTE(review): the list is appended to while it is being iterated.
        for wordlistentry_id in wordlistentry_ids:
            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id( dictdata_id):
                if concept in concepts:
                    wordlistentry_ids.append(entry_id)

    input_entry_csv = os.path.join(input_path, "wordlistentry.csv")
    output_entry_csv = os.path.join(output_path, "wordlistentry.csv")
    input_annotation_csv = os.path.join(input_path, "wordlistannotation.csv")
    output_annotation_csv = os.path.join(output_path, "wordlistannotation.csv")
    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")
    annotation_dict = collections.defaultdict(list)

    # cache the wordlist annotations for lookup; the header is copied as is
    for i, line in enumerate(fileinput.input(input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[data[_wordlistannotation_table_columns['entry_id'] + 1]].append(line)
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")
    count_entries = 0
    for i, line in enumerate(fileinput.input(input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        # NOTE(review): filters on `entry_ids` (dictionary entries), not on
        # `wordlistentry_ids` -- confirm which one is meant.
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)
    fileinput.nextfile()
    output.close()
    output_annotation.close()
# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
for node in combined_graph.nodes():
    if "lang" in combined_graph.node[node] and combined_graph.node[node]["lang"] == "spa":
        # Strip parenthesised remarks.  The previous pattern lacked a
        # quantifier on `[^)]` and only removed one-character parentheticals.
        e = re.sub(r" ?\([^)]*\)", "", node)
        e = e.strip()
        stem = e
        words = e.split(" ")
        if len(words) > 1:
            # Drop stopwords and the empty strings left by repeated spaces.
            # The previous filter (`not w in stopwords or w == ""`) kept the
            # empty strings, so such phrases never reduced to one word.
            words = [w for w in words if w and w not in stopwords]
        if len(words) == 1:
            stem = stemmer.stem(words[0])

        stem = stem + "|stem"
        combined_graph_stemmed.add_node(stem, is_stem=True)
        combined_graph_stemmed.add_edge(stem, node)

# <markdowncell>

# Again we can count the nodes and the number of connected components. We see that the number of connected components decreases, as more nodes are connected into groups now:

# <codecell>

networkx.algorithms.components.number_connected_components(combined_graph_stemmed)

# <codecell>
class TextProcessor:
    """Normalise, lemmatise and stem Spanish/English text.

    The instance holds a TreeTagger wrapper configured for Spanish, the
    NLTK English and Spanish stop-word lists (plus ``'y/o'``) and an NLTK
    ``SpanishStemmer``.
    """

    lemmatizer = None
    stopEnglish = None
    stopSpanish = None
    spanishStemmer = None

    # Compiled once instead of on every call: any ASCII punctuation char.
    _PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

    def __init__(self):
        self.lemmatizer = treetaggerwrapper.TreeTagger(TAGLANG='es')
        self.stopEnglish = stopwords.words('english')
        self.stopSpanish = stopwords.words('spanish')
        self.stopSpanish.append('y/o')
        self.spanishStemmer = SpanishStemmer()

    def _remove_numbers(self, text):
        """Return ``text`` without any digit characters."""
        return ''.join(letter for letter in text if not letter.isdigit())

    def _remove_punctuation(self, text):
        """Return ``text`` with every punctuation character replaced by a space."""
        return self._PUNCTUATION_RE.sub(' ', text)

    def preprocessText(self, text):
        """Lower-case ``text`` and strip punctuation and digits from it."""
        text = text.lower()
        text = self._remove_punctuation(text)
        return self._remove_numbers(text)

    def _content_words(self, text):
        """Yield the whitespace-separated words of ``text`` that are not stop words.

        The byte-order mark is removed *before* the stop-word test so that a
        BOM-prefixed stop word (typically the first word of a file) is still
        filtered; words that are empty after that clean-up are dropped.
        """
        stop_words = set(self.stopEnglish)
        stop_words.update(self.stopSpanish)
        for word in text.split():
            word = word.replace("\ufeff", "")
            if word and word not in stop_words:
                yield word

    def lematizeText(self, text):
        """Return the space-joined lemmas of the non-stop words in ``text``.

        Each word is tagged on its own; the third whitespace-separated field
        of the first tagger result is taken as the lemma.  When the tagger
        returns nothing, or a result with fewer than three fields, the word
        itself is kept.
        """
        lemmas = []
        for word in self._content_words(text):
            tagged = self.lemmatizer.tag_text(word)
            if tagged:
                fields = tagged[0].split()  # expected: word, tag, lemma
                if len(fields) >= 3:
                    word = fields[2]
            lemmas.append(word)
        return " ".join(lemmas)

    def stemText(self, text):
        """Return the space-joined Spanish stems of the non-stop words in ``text``."""
        return " ".join(
            self.spanishStemmer.stem(word) for word in self._content_words(text)
        )
class MLAssistant(Assistant):
    """Assistant that answers "favourite thing" questions with a Keras model.

    A sentence is turned into a bag-of-words vector over ``self.words``
    (Spanish stems), the model scores it against ``self.classes`` and the
    canned answer of the best-scoring class is spoken.
    """

    # Canned reply for each class the model can predict.
    _ANSWERS = {
        'comida':
            'Sin lugar a dudas mi comida preferida son los nachos con queso',
        'color':
            'Mi color preferido es el escarlata.',
        'animal':
            'Me gustan mucho los grandes felinos, pero mi animal preferido es una perra que se llama'
            ' Arale.',
        'juego':
            '¡Me encanta Hollywood Monsters!',
        'libro':
            'No queda muy bien decirlo, pero me han programado para decir siempre la verdad. No tengo'
            ' tiempo para leer, y por tanto no tengo libro preferido.',
        'película':
            'No tengo una película preferida, pero me gustan especialmente las películas de Disney y'
            ' las del Studio Ghibli.',
    }

    def __init__(self, language='en', database_name='memory', memory_table='memory', listen_log_table='listen_log', speak_log_table='speak_log'):
        """Initialise the base assistant and try to load the classifier.

        The model architecture is read from ``modelo_gustos.json`` and its
        weights from ``modelo_gustos.h5``.  Loading is best-effort: on any
        failure an error line is printed and ``self.model`` stays ``None``,
        in which case no sentence is ever classified.
        """
        super().__init__(language, database_name, memory_table, listen_log_table, speak_log_table)
        self.model = None
        try:
            # ``with`` guarantees the file is closed even if read() fails.
            with open('modelo_gustos.json', 'r') as json_file:
                loaded_model_json = json_file.read()
            model = model_from_json(loaded_model_json)
            model.load_weights("modelo_gustos.h5")
            model.compile(loss='mean_squared_error', optimizer='adam', metrics=['binary_accuracy'])
            # Publish the model only once it is fully loaded and compiled.
            self.model = model
        except Exception:
            print('****ERROR: Error cargando modelo...****')
        self.stemmer = SpanishStemmer()
        self.words = [
            '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr',
            'novel', 'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual',
            'prefer', 'jueg', 'com', 'plat', 'animal', 'videojueg'
        ]
        self.classes = [
            'comida', 'color', 'animal', 'juego', 'libro', 'película'
        ]

    def main(self, initial_sentence='¿Qué deseas?'):
        """Speak the prompt, listen once and answer what was recognised."""
        self.speak(initial_sentence, remember=False)
        self.listen()
        self.process_orders(self.last_recognised)
        self.adjust_for_ambient_noise()

    def process_orders(self, sentence):
        """Classify ``sentence`` and speak the matching canned answer.

        A fallback sentence is spoken when no class is confident enough;
        nothing is spoken for a class without a canned answer.
        """
        _class = self.classify_sentence(sentence)
        if not _class:
            self.speak('no estoy segura de lo que me quieres preguntar')
            return
        answer = self._ANSWERS.get(_class)
        if answer is not None:
            self.speak(answer)

    def classify_sentence(self, sentence, min_val=0.5):
        """Return the best class for ``sentence`` or ``None``.

        ``None`` is returned when the best score is below ``min_val`` or
        when the model could not be loaded.
        """
        if self.model is None:
            return None
        best_class, best_score = self._get_classification(sentence)[0]
        if float(best_score) < min_val:
            return None
        return best_class

    def _clean_up_sentence(self, sentence):
        """Tokenise ``sentence`` and return the stems of its lower-cased tokens."""
        return [
            self.stemmer.stem(word.lower())
            for word in nltk.word_tokenize(sentence)
        ]

    def _bow(self, sentence, words):
        """Return a 0/1 array marking which of ``words`` occur in ``sentence``."""
        present = set(self._clean_up_sentence(sentence))
        return np.array([1 if w in present else 0 for w in words])

    def _get_classification(self, sentence):
        """Return ``(class, score)`` pairs for ``sentence``, best score first."""
        array = [self._bow(sentence, self.words)]
        np_array = np.array(array, "float32")
        prediction = self.model.predict(np_array).round(2)[0]
        result = dict(zip(self.classes, prediction))
        # Ascending sort followed by a reversal keeps the original tie order.
        return sorted(result.items(), key=operator.itemgetter(1))[::-1]
# Collect the sentences of every .txt file below files_path.  Line breaks
# are turned into full stops first so that each line ends a sentence.
sents = []
for file_name in utils.find_all_files_in_path('*.txt', files_path):
    # ``with`` closes each file instead of leaking the handle.
    with open(file_name) as text_file:
        text = text_file.read()
    sents += nltk.sent_tokenize(text.replace('\n\n', '.').replace('\n', '.'))

# Lemmatise every sentence once rather than once per aspect.
# NOTE(review): assumes lemmatize_sent() has no side effects and returns a
# re-iterable sequence of words -- confirm against its definition.
lemmatized_sents = [lemmatize_sent(sent) for sent in sents]

print('aspect', '\t\t', 'polarity')
for aspect in aspects:
    aspect_sent_avg_val = 0
    sents_of_aspect_count = 0
    for sent, lemmatized_sent in zip(sents, lemmatized_sents):
        # A sentence counts when the aspect occurs in its raw or
        # lemmatised form.
        if aspect not in sent and aspect not in lemmatized_sent:
            continue
        sents_of_aspect_count += 1
        sent_value = 0
        finded_words = 0
        for word in lemmatized_sent:
            # Stem once; a missing or zero polarity is ignored.
            polarity = pol_dict.get(ss.stem(word))
            if polarity:
                finded_words += 1
                sent_value += polarity
        if finded_words > 0:
            aspect_sent_avg_val += sent_value  # / finded_words
    if sents_of_aspect_count > 0:
        # Average polarity over the sentences that mention the aspect.
        aspect_sent_avg_val = aspect_sent_avg_val / sents_of_aspect_count
        print(aspect, '\t', aspect_sent_avg_val)
    else:
        print(aspect, 0, 0)
def _load_swadesh_stems(stemmer):
    """Return the set of stems of the entries in ``swadesh_spa.txt``.

    The file sits next to this script; every line holds comma-separated
    entries, each of which is stemmed with ``stemmer``.
    """
    path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "swadesh_spa.txt")
    stems = set()
    # ``with`` closes the file, which the previous code never did.
    with codecs.open(path, "r", "utf-8") as swadesh_file:
        for line in swadesh_file:
            for entry in line.strip().split(","):
                stems.add(stemmer.stem(entry))
    return stems


def _matches_swadesh(entry_annotations, stemmer, swadesh_stems):
    """Tell whether an entry has a Spanish one-word Swadesh head/translation.

    For every ``iso-639-3 == spa`` annotation, the head/translation
    annotations starting at the same offset are stemmed (single words
    only) and looked up in ``swadesh_stems``.
    """
    for lang in entry_annotations:
        if lang.value != "iso-639-3" or lang.string != "spa":
            continue
        for other in entry_annotations:
            if other.value not in ("head", "translation"):
                continue
            if other.start != lang.start:
                continue
            # NOTE(review): the pattern only removes parentheses enclosing
            # exactly one character -- confirm that this is intended.
            phrase = re.sub(r" ?\([^)]\)", "", other.string).strip()
            if " " not in phrase and stemmer.stem(phrase) in swadesh_stems:
                return True
    return False


def main(argv):
    """Flag every dictionary entry as filtered unless it is a Swadesh word.

    Entries of the book ``thiesen1998`` are never filtered.  In all other
    books an entry stays unfiltered only when one of its Spanish
    single-word heads or translations stems to an entry of the Spanish
    Swadesh list.  ``argv`` is not used.
    """
    logging.basicConfig(level=logging.INFO)
    conf = appconfig('config:development.ini', relative_to='.')
    if not pylons.test.pylonsapp:
        load_environment(conf.global_conf, conf.local_conf)

    stemmer = SpanishStemmer(True)
    swadesh_stems = _load_swadesh_stems(stemmer)

    for b in quanthistling.dictdata.books.list:
        book = model.meta.Session.query(model.Book).filter_by(bibtex_key=b['bibtex_key']).first()
        if not book:
            continue

        # Parenthesised single argument: valid in Python 2 and 3 alike.
        print("Filtering entries in %s..." % b['bibtex_key'])
        keep_all = b['bibtex_key'] == "thiesen1998"

        for dictdata in book.dictdata:
            entries = model.meta.Session.query(model.Entry).filter(model.Entry.dictdata_id==dictdata.id).order_by("startpage", "pos_on_page").all()
            annotations = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==dictdata.id).all()

            # Group the annotations by the entry they belong to.
            dict_annotations = collections.defaultdict(list)
            for a in annotations:
                dict_annotations[a.entry_id].append(a)

            for e in entries:
                e.filtered = not (
                    keep_all
                    or _matches_swadesh(
                        dict_annotations[e.id], stemmer, swadesh_stems))

    # NOTE(review): a single commit after all books; move it into the loops
    # if smaller transactions are wanted.
    Session.commit()
class TokenRepository:
    """Tokenise documents and keep collection-wide term statistics.

    ``terminos`` maps each term to ``{'CF': collection frequency,
    'DOCS': [documents containing the term]}``; ``tokens`` holds every raw
    token seen.  Three rule lists drive the pipeline: ``reglasEntities``
    (entity extraction on the raw content), ``reglasDocumento`` (content
    normalisation) and ``reglasTokens`` (token-list filtering).
    """

    # Class-level defaults are kept for backward compatibility only;
    # __init__ gives every instance its own containers so that rules and
    # results are not shared (and duplicated) between instances.
    terminos = {}
    tokens = []
    reglasDocumento = []
    reglasTokens = []
    reglasEntities = []
    documentos = []
    fileNameTerminos = "results/terminos.txt"
    lista_vacias = []
    stemmer = None

    def __init__(self):
        self.terminos = {}
        self.tokens = []
        self.documentos = []
        self.lista_vacias = []
        self.reglasEntities = [
            EmailRegla(),
            UrlRegla(),
            FechasRegla(),
            TelefonosRegla(),
            AbreviaturasRegla(),
            NombresPropiosRegla(),
            NumerosRegla(),
        ]
        self.reglasDocumento = [
            MinusculasRegla(),
            TranslateRegla(),
            LimpiarHtmlTagsRegla(),
            LimpiadoBasicoRegla(),
        ]
        self.reglasTokens = [MinMaxCaracteresRegla()]
        self.stemmer = SpanishStemmer()

    def _cargarVacias(self, pathVacias):
        """Return the distinct stop words of the file, in first-seen order.

        The file content goes through the same document rules as the
        documents so that both are normalised identically.
        """
        # Mode 'r' (not 'rt'): codecs.open() appends 'b' itself, and the
        # combination 'rtb' is rejected by Python 3.
        with codecs.open(pathVacias, mode='r', encoding='utf-8') as vacias:
            content = vacias.read()
        for instancia in self.reglasDocumento:
            content = instancia.run(content)
        palabras = []
        vistas = set()
        for palabra in content.strip().split():
            if palabra not in vistas:
                vistas.add(palabra)
                palabras.append(palabra)
        return palabras

    def tokenizar(self, documentos, **options):
        """Tokenise ``documentos`` and rebuild the term statistics.

        :param documentos: objects exposing a ``content`` string; their
            ``terminos`` and ``tokens`` attributes are overwritten.
        :param options: ``pathVacias`` -- optional path of a UTF-8 stop-word
            file; when given, stop words are removed before stemming.
        :return: dict with the keys ``terminos``, ``tokens`` and
            ``documentos``.

        A progress bar is written to stdout and the term table is saved to
        ``self.fileNameTerminos``.
        """
        # Reset the state left by a previous run.
        self.documentos = documentos
        self.tokens = []
        self.terminos = {}
        self.lista_vacias = []

        pathVacias = options.get('pathVacias', None)
        if pathVacias is not None:
            print(u"ANALIZANDO PALABRAS VACIAS")
            self.lista_vacias = self._cargarVacias(pathVacias)
        vacias = set(self.lista_vacias)

        cantidadDocumentos = len(documentos)
        for indexDocumento, documento in enumerate(documentos, 1):
            documento.terminos = {}
            documento.tokens = []
            content = documento.content
            tokensEntities = []

            # Entity rules consume the content and report terms and tokens.
            for instancia in self.reglasEntities:
                response = instancia.run(content)
                content = response['content']
                documento.terminos.update(response['terminos'])
                tokensEntities += response['tokens']

            # Document rules normalise what is left of the content.
            for instancia in self.reglasDocumento:
                content = instancia.run(content)

            tokensAux = self.getTokens(content)
            self.tokens.extend(tokensAux)
            self.tokens.extend(tokensEntities)
            documento.tokens = tokensAux + tokensEntities

            for instancia in self.reglasTokens:
                tokensAux = instancia.run(tokensAux)

            if pathVacias is not None:
                # Build a new list: removing from the list being iterated
                # skipped the element after every removed stop word.
                tokensAux = [t for t in tokensAux if t not in vacias]

            # Stemming applies to plain tokens only, not to entities.
            tokensAux = self.stemmizar(tokensAux)
            documento.terminos.update(self.getTerminos(tokensAux))
            self.saveTerminosGlobal(documento)

            porcentaje = (indexDocumento * 100) / cantidadDocumentos
            sys.stdout.write(u"\r" + str(int(porcentaje)).ljust(3) + u"% \u258F" + (u"\u2588" * int(porcentaje / 2)).ljust(50) + u"\u2595")
            sys.stdout.flush()
        print('\n')

        self.saveTerminosFile()

        return {
            'terminos': self.terminos,
            'tokens': self.tokens,
            'documentos': documentos,
        }

    def getTokens(self, string):
        """Return the whitespace-separated tokens of ``string``."""
        return string.strip().split()

    def getTerminos(self, tokens):
        """Return ``{token: {'CF': occurrences}}`` for ``tokens``."""
        terminos = {}
        for token in tokens:
            if token not in terminos:
                terminos[token] = {'CF': 1}
            else:
                terminos[token]['CF'] += 1
        return terminos

    def stemmizar(self, tokens):
        """Return the stems of ``tokens`` in the same order."""
        return [self.stemmer.stem(token) for token in tokens]

    def saveTerminosGlobal(self, documento):
        """Merge the terms of ``documento`` into the collection statistics.

        The collection frequency grows by the document's own frequency of
        the term (previously known terms only gained 1 per document).
        """
        for termino, datos in documento.terminos.items():
            registro = self.terminos.get(termino)
            if registro is None:
                self.terminos[termino] = {
                    'CF': datos['CF'],
                    'DOCS': [documento],
                }
            else:
                registro['CF'] += datos['CF']
                if documento not in registro["DOCS"]:
                    registro["DOCS"].append(documento)

    def saveTerminos(self, token, documento):
        """Record one occurrence of ``token`` found in ``documento``.

        The former signature had no ``self`` and referenced an undefined
        ``token``, so it could not be called successfully.
        """
        if token not in self.terminos:
            self.terminos[token] = {'CF': 1, 'DOCS': [documento]}
        else:
            self.terminos[token]['CF'] += 1
            if documento not in self.terminos[token]["DOCS"]:
                self.terminos[token]["DOCS"].append(documento)

    def saveTerminosFile(self):
        """Write the term table (ID, term, CF, DF) to ``fileNameTerminos``."""
        with codecs.open(self.fileNameTerminos, mode="w", encoding="utf-8") as archivo:
            archivo.write('ID'.ljust(6) + '|' + 'TERMINO'.ljust(30) + '|'
                          + 'CF'.ljust(6) + '|' + 'DF'.ljust(6) + '\n')
            archivo.write('-' * 50 + '\n')
            for index, termino in enumerate(sorted(self.terminos.keys())):
                datos = self.terminos[termino]
                archivo.write(str(index).ljust(6))
                archivo.write('|')
                archivo.write(termino.ljust(30))
                archivo.write('|')
                archivo.write(str(datos['CF']).ljust(6))
                archivo.write('|')
                # DF: number of documents containing the term.
                archivo.write(str(len(datos['DOCS'])).ljust(6))
                archivo.write('\n')
# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
for node in combined_graph.nodes():
    node_data = combined_graph.node[node]
    # Skip every node that is not explicitly tagged as Spanish.
    if node_data.get("lang") != "spa":
        continue

    # NOTE(review): the pattern only matches a parenthesis pair enclosing
    # exactly one character -- confirm that this is the intended scope.
    cleaned = re.sub(r" ?\([^)]\)", "", node).strip()
    parts = cleaned.split(" ")
    if len(parts) > 1:
        # NOTE(review): empty parts survive this filter, so phrases with
        # doubled spaces never reduce to a single word -- confirm.
        parts = [p for p in parts if p == "" or p not in stopwords]

    if len(parts) == 1:
        # Exactly one word left: use its stem.
        stem_node = stemmer.stem(parts[0])
    else:
        # Otherwise the cleaned phrase itself stands in for the stem.
        stem_node = cleaned
    stem_node = stem_node + "|stem"

    combined_graph_stemmed.add_node(stem_node, is_stem=True)
    combined_graph_stemmed.add_edge(stem_node, node)

# <markdowncell>

# Again we can count the nodes and the number of connected components. We see that the number of connected components decreases, as more nodes are connected into groups now:

# <codecell>

networkx.algorithms.components.number_connected_components(
    combined_graph_stemmed)

# <codecell>
# Walk through the extracted wiki dump line by line.  Text between a
# "<doc ...>" line and its "</doc>" line is accumulated in ``doc``; at each
# "</doc>" the document is split into sentences and every non-stopword is
# indexed by stem, sentence id and document id.
for raw_line in fileinput.input("/Users/ramon/qlc-github/data/eswiki/AA/wiki00"):
    text_line = unicodedata.normalize("NFD", raw_line.strip().decode("utf-8"))

    if text_line.startswith("</doc>"):
        for sentence in tokenizer.tokenize(doc):
            # Keep letters and combining marks only, then lower-case.
            sentence = regex.sub(u"[^\p{L}\p{M}]", " ", sentence).lower()
            for word in sentence.split():
                if word in stopwords:
                    continue
                # Words of up to three characters are indexed unstemmed.
                key = stemmer.stem(word) if len(word) > 3 else word
                sentences_for_stem[key].add(sentence_id)
                docs_for_stem[key].add(doc_id)
            sentence_id += 1
        # Start a fresh document.
        doc = ""
        doc_id += 1
    elif not text_line.startswith("<doc"):
        # Drop anchor tags and append the line to the current document.
        doc += regex.sub("</?a[^>]*>", "", text_line) + " "

    #if doc_id > 500:
    #    break

stem1 = stemmer.stem("continua")
stem2 = stemmer.stem("figura")
def do_stemmer(df, stop_language='spanish'):
    """Add a ``stem`` column to ``df`` and build the stop-word list.

    :param df: data frame with a ``text`` column; it is modified in place.
    :param stop_language: language of the base stop-word list; the English
        list is always added because tickets arrive in both languages.
    :return: tuple ``(df, stop)`` where ``stop`` is the list of stop words
        and stop phrases.

    Each ``text`` value is passed as a whole to the Spanish stemmer and a
    fixed series of substring replacements is applied to the result.
    """
    # Tickets arrive in two languages, so both stop-word lists are used.
    stop = get_stop_words(stop_language) + get_stop_words('english')
    # Project-specific stop words.
    stop += [
        "buenas", "buenos", "cid", "dias", "gracias", "hola", "mucho",
        "mucha", "poder", "proyecto", "please", "saludo", "tardes", "www",
        "habia"
    ]
    stop += [
        'ahora', 'algun', 'alguna', 'amanecia interrumpio',
        'amanecia interrumpio relato', 'amanecia interrumpio relato habian',
        'amanecia interrumpio relato habian dado', 'aquel', 'asi', 'aun',
        'cada', 'vez', 'mas', 'cualquier', 'cosa', 'cuanto', 'dado', 'darse',
        'debe', 'debia', 'despues', 'dia noche', 'dia siguiente', 'diez años',
        'diez mil', 'dijo', 'dijo', 'dio', 'habia', 'mas', 'podia', 'podian',
        'mismo', 'si', 'tal', 'tan', 'puede', 'pueden ser', 'pues', 'puso',
        'toda', 'todas', 'vease tambien', 'primer lugar', 'varias', 'dos',
        # A missing comma used to fuse the next two words into 'haciauno'.
        'largo', 'hacia', 'uno', 'una', 'unos', 'una', 'aquella', 'aquello',
        'aquel', 'hace', 'muchas', 'mucho', 'muchos', 'mucha', 'pueden',
        'puedo', 'unas', 'abrio puerta', 'arriba abajo', 'aqui alla',
        'habian', 'doña', 'don', 'señor', 'señora', 'hizo', 'quedo',
        'fuerza sino', 'quedo perplejo', 'parece haber', 'parece ser',
        'parecia haber', 'mayor parte', 'mañana siguiente', 'media hora',
        'hoy dia', 'iba ser', 'iii pag', 'haber hecho', 'habria podido',
        'hacer cosas', 'hacia arriba', 'hacia atras', 'hacia puerta',
        'hacia tiempo', 'decir verdad', 'dejo caer', 'demasiado tarde',
        'derecha izquierda', 'di cuenta', 'dia anterior', 'dia noche',
        'dia siguiente', 'casi siempre', 'cierto dia', 'cierto modo',
        'cinco años', 'aqui alla', 'arriba abajo', 'aunque solo',
        'año nuevo', 'años edad', 'buena parte', 'ninguna parte',
        'noche anterior', 'noche dia', 'nunca visto', 'partido comunista',
        'podria haber', 'podria ser', 'press cambridge', 'primer lugar',
        'quiere decir', 'quiero decir', 'sentido comun', 'seria mejor',
        'tras haber', 'tres años', 'tres cuatro', 'tres meses', 'voz alta',
        'voz baja',
    ]
    stop_words_generated_tokens = [
        'abajo', 'abrio', 'alla', 'alta', 'amanecia', 'anterior', 'aqui',
        'aren', 'arriba', 'atras', 'aunque', 'año', 'años', 'baja', 'buena',
        'caer', 'cambridge', 'can', 'casi', 'cierto', 'cinco', 'comun',
        'cosas', 'couldn', 'cuatro', 'cuenta', 'decir', 'dejo', 'demasiado',
        'di', 'dia', 'didn', 'diez', 'doesn', 'edad', 'haber', 'habria',
        'hacer', 'hacia', 'hadn', 'hasn', 'haven', 'hecho', 'hora', 'hoy',
        'iba', 'iii', 'isn', 'let', 'll', 'lugar', 'mayor', 'mañana', 'media',
        'mejor', 'meses', 'modo', 'mustn', 'ninguna', 'noche', 'nuevo',
        'nunca', 'pag', 'parece', 'parecia', 'parte', 'partido', 'podido',
        'podria', 'puerta', 'quiere', 'quiero', 're', 'relato', 'sentido',
        'ser', 'seria', 'shan', 'shouldn', 'siempre', 'siguiente', 'sino',
        'solo', 'tambien', 'tarde', 'tiempo', 'tras', 'tres', 've', 'vease',
        'visto', 'wasn', 'weren', 'won', 'wouldn'
    ]
    stop += stop_words_generated_tokens

    # Substring replacements applied, in this order, to every stemmed text.
    replacements = (
        ('fuerza sino', ''),
        ('acceder', 'acceso'),
        ('user', 'usuario'),
        ('access', 'acceso'),
        ('usuarios', 'usuario'),
        ('abrio puerta', ''),
        ('acto seguido', ''),
    )

    ps = SpanishStemmer()
    stems = []
    for text in df["text"]:
        stemmed = ps.stem(text)
        for old, new in replacements:
            stemmed = stemmed.replace(old, new)
        stems.append(stemmed)
    df["stem"] = stems
    return df, stop
#------------------------------
#guardaEnArchivo("OUT_FILES\VocabularioNoStop.txt", textoTokenizadoNoStopWords)
#------------------------------
#grabaEnBD('tokens_sin_numeros', textoTokenizado)
#------------------------------
# Report how many distinct tokens remain without numbers and stop words.
distinct_token_count = len(set(textoTokenizadoNoStopWords))
print("Tokens set sin números ni stopwords -> set(len()): " + str(distinct_token_count))

# Stem every token (stop words included) with the Spanish stemmer.
spanishStemm = SpanishStemmer(ignore_stopwords=False)
tokensStem = [spanishStemm.stem(token) for token in textoTokenizado]
#------------------------------
#grabaEnBD('tokens_stem', tokensStem, update=True)
#------------------------------
#nuevosTokens=lemmatizer("lemmatization-es.txt", textoTokenizado)
# Lemmatise the stop-word-free tokens; the result is also stored in the
# 'tokens_sin_stopwords_lemmas' table (saveToTable=True).
nuevosTokensLemmas = lemmatizerBD(
    "lemmatization-es.txt",
    textoTokenizadoNoStopWords,
    'tokens_sin_stopwords_lemmas',
    saveToTable=True,
)
tokensLemmasSinStopW = []
# Pause so that the database can be inspected before continuing.
input("Checar bd")
#grabaEnBD('tokens_lemmas', nuevosTokens)