def transform(self, text):
    """Return the features of *text* by delegating to ``self.get_features``.

    :param text: input forwarded unchanged to ``get_features``.
    :return: whatever ``get_features`` returns, or ``None`` when it raises
        (the error is reported via ``Util.standard_error`` and printed).
    """
    features = None
    try:
        features = self.get_features(text)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error transform: {0}'.format(e))
    return features
def get_features(self, text, model_type='00'):
    """Return the lexical feature vector of *text*.

    :param text: input forwarded to ``self.get_lexical_features``.
    :param model_type: accepted for interface compatibility; this
        implementation does not read it.
    :return: the result of ``get_lexical_features``, or ``None`` when it
        raises (the error is reported and printed).
    """
    vector = None
    try:
        vector = self.get_lexical_features(text)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error get_features: {0}'.format(e))
    return vector
def tagger(self, text):
    """Describe every token of the lower-cased *text* as a dictionary.

    The text is lower-cased and run through ``self.analysis_pipe``; each
    token becomes a dict with the keys ``text``, ``lemma``, ``stem``,
    ``pos``, ``tag``, ``dep``, ``shape``, ``is_alpha``, ``is_stop``,
    ``is_digit`` and ``is_punct``.

    :return: list of token dictionaries, or ``None`` when anything fails.
    """
    result = None
    try:
        parsed = self.analysis_pipe(text.lower())
        result = [
            {
                'text': tok.text,
                'lemma': tok.lemma_,
                'stem': tok._.stem,
                'pos': tok.pos_,
                'tag': tok.tag_,
                'dep': tok.dep_,
                'shape': tok.shape_,
                'is_alpha': tok.is_alpha,
                'is_stop': tok.is_stop,
                'is_digit': tok.is_digit,
                'is_punct': tok.is_punct,
            }
            for tok in parsed
        ]
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error tagger: {0}'.format(e))
    return result
def __init__(self, lang):
    """Build the Snowball stemmer for *lang* and register the ``stem`` token extension.

    :param lang: ``'es'`` or ``'en'``; any other key raises ``KeyError``
        inside the ``try`` and is reported through ``Util.standard_error``,
        leaving ``self.stemmer`` unset.
    """
    try:
        snowball_name = {'es': 'spanish', 'en': 'english'}[lang]
        self.stemmer = SnowballStemmer(snowball_name)
        # force=True overwrites a previously registered 'stem' extension.
        Token.set_extension('stem', default='', force=True)
    except Exception:
        Util.standard_error(sys.exc_info())
def dependency_child(self, text):
    """List every token of the lower-cased *text* with its head and children.

    Each entry holds the token text (under both ``chunk`` and ``text``),
    its ``pos_``/``dep_``/``tag_``, the head's text and POS, and
    ``children``: a list of child descriptions, or ``None`` when the token
    has no children.

    :return: list of entries; entries built before a failure are kept.
    """
    result = []
    try:
        for tok in self.analysis_pipe(text.lower()):
            entry = dict(
                chunk=tok.text,
                text=tok.text,
                pos_=tok.pos_,
                dep_=tok.dep_,
                tag_=tok.tag_,
                head_text=tok.head.text,
                head_pos=tok.head.pos_,
                children=None,
            )
            kids = list(tok.children)
            if kids:
                entry['children'] = [
                    {
                        'child': kid,
                        'pos_': kid.pos_,
                        'dep_': kid.dep_,
                        'tag_': kid.tag_,
                        'head.text': kid.head.text,
                        'head.pos_': kid.head.pos_,
                    }
                    for kid in kids
                ]
            result.append(entry)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error dependency_child: {0}'.format(e))
    return result
def train(self, file_output='predictive_sentiment', iteration=10, fold=10):
    """Extract features for every configured model and write the CSV report.

    The dataset comes from ``Util.import_dataset()``. For each entry of the
    module-level ``list_model`` the raw texts are converted to normalised
    feature matrices; result rows are written to a ``;``-separated report in
    ``DIR_OUTPUT`` and the classifier of the best-F1 row is pickled into
    ``DIR_MODELS``.

    :param file_output: prefix of the report file and of the pickled model.
    :param iteration: only used in the report file name.
    :param fold: only used in the report file name.
    :return: always ``None``; errors are reported and printed.
    """
    try:
        best_model = None
        best_classifier = None
        best_f1 = 0.0
        date_file = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
        file_report = '{0}_Fold{1}_Iteration{2}_report_{3}.csv'.format(file_output, fold, iteration, date_file)
        output = DIR_OUTPUT + file_report
        label = preprocessing.LabelEncoder()
        # Keep the raw texts under their own names: re-binding x_train/x_test
        # inside the loop would feed feature vectors (not text) to the second
        # and later models.
        raw_train, raw_test, y_train, y_test = Util.import_dataset()
        # Labels do not depend on the model, so encode them once. The test
        # labels reuse the mapping fitted on the training labels; a label
        # unseen during training raises ValueError (handled below).
        y_train = label.fit_transform(y_train)
        y_test = label.transform(y_test)
        with open(output, 'w') as out_csv:
            writer = csv.DictWriter(out_csv, fieldnames=fieldnames, delimiter=';', lineterminator='\n')
            writer.writeheader()
            for model_name, value in list_model.items():
                print('{0}| Start Model: {1}|{0}'.format("#" * 15, model_name))
                # training data
                print('Get train features')
                x_train = [self.fex.get_features(text=text, model_type=value) for text in tqdm(raw_train)]
                x_train = preprocessing.normalize(x_train)
                # test data
                print('Get test features')
                x_test = [self.fex.get_features(text=text, model_type=value) for text in tqdm(raw_test)]
                x_test = preprocessing.normalize(x_test)
                # TODO: create a function that receives the model
                # (classification algorithm), x_train, y_train, x_test and
                # y_test and returns the result rows; until then no rows are
                # produced.
                data_result = []
                writer.writerows(data_result)
                out_csv.flush()
                print('Model {0} save successful!'.format(model_name))
                for row in data_result:
                    f1_j = float(row['f1'])
                    if f1_j > best_f1:
                        best_f1 = f1_j
                        best_model = row['model_name']
                        best_classifier = row['classifier_name']
                        # save model
                        file_model = '{0}{1}_model.sav'.format(DIR_MODELS, file_output)
                        with open(file_model, 'wb') as outfile:
                            pickle.dump(row['classifier'], outfile)
                        print('Model exported in {0}'.format(file_model))
                # The report stays open until the ``with`` block ends so every
                # model can append its rows.
                print('{0}| End Model: {1}|{0}'.format("#" * 15, model_name))
            print('The best model is {0}, {1} with F1 score = {2}'.format(best_model, best_classifier, best_f1))
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error train: {0}'.format(e))
    return None
def get_chunks(self, text):
    """Return the text of every noun chunk found in *text*.

    :return: list of chunk strings, or ``None`` when the analysis fails.
    """
    chunks = None
    try:
        parsed = self.analysis_pipe(text)
        chunks = [span.text for span in parsed.noun_chunks]
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error get_chunks: {0}'.format(e))
    return chunks
def analysis_pipe(self, text):
    """Run *text* through ``self.nlp`` and return the result.

    :return: whatever ``self.nlp(text)`` returns, or ``None`` when the call
        raises (the error is reported and printed).
    """
    try:
        return self.nlp(text)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error analysis_pipe: {0}'.format(e))
    return None
def __call__(self, doc):
    """Store a stem on every content token of *doc* and return *doc*.

    Tokens flagged as punctuation, stop word or digit are skipped; for the
    rest the ``stem`` extension is set to ``self.stemmer.stem(token.text)``.

    :param doc: iterable of tokens exposing ``is_punct``, ``is_stop``,
        ``is_digit``, ``text`` and the ``_`` extension accessor.
    :return: the same *doc* object, also when stemming fails part-way, so a
        pipeline calling this component never receives ``None``.
    """
    try:
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            token._.set('stem', self.stemmer.stem(token.text))
    except Exception:
        # Best effort: report the failure but still hand the document on.
        Util.standard_error(sys.exc_info())
    return doc
def sentence_detection(self, text):
    """Split *text* into sentences.

    :param text: raw text passed to ``self.analysis_pipe``.
    :return: list with the stripped text of every sentence; an empty list
        when the analysis fails.
    """
    result = []
    try:
        doc = self.analysis_pipe(text)
        # ``Span.text`` exists in every spaCy release, whereas ``Span.string``
        # was removed in spaCy 3; after ``strip()`` both give the same value.
        result = [sent.text.strip() for sent in doc.sents]
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error sentence_detection: {0}'.format(e))
    return result
def proper_encoding(text):
    """Reduce *text* to plain ASCII.

    The text is NFD-normalised (splitting accented letters into base letter
    plus combining mark) and every non-ASCII code point is then dropped.

    :return: the ASCII-only string, or ``None`` when the conversion fails.
    """
    result = None
    try:
        decomposed = unicodedata.normalize('NFD', text)
        result = decomposed.encode('ascii', 'ignore').decode("utf-8")
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error proper_encoding: {0}'.format(e))
    return result
def __init__(self):
    """Create the machine-learning object.

    Builds a ``FeatureExtraction`` for Spanish (``lang='es'``) with no
    text-analysis object supplied and stores it in ``self.fex``. Failures
    are reported and printed, leaving ``self.fex`` unset.
    """
    try:
        print('Load Machine Learning....')
        extractor = FeatureExtraction(text_analysis=None, lang='es')
        self.fex = extractor
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error constructor: {0}'.format(e))
def delete_special_patterns(self, text):
    """Replace symbols, punctuation, brackets and operators with spaces.

    Every character matched by one of the patterns below becomes a single
    space (runs are not collapsed); the result is lower-cased.

    :return: the cleaned, lower-cased string, or ``None`` on failure.
    """
    patterns = (
        r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬',  # special characters
        r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`',  # punctuation marks
        r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|',  # brackets and parentheses
        r'\/|\-|\+|\*|\=|\^|\%|\&|\$',  # operators
    )
    result = None
    try:
        cleaned = text
        for pattern in patterns:
            cleaned = re.sub(pattern, ' ', cleaned)
        result = cleaned.lower()
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error delete_special_patterns: {0}'.format(e))
    return result
def language_detector(self, text):
    """Return the language of the first confidently classified sentence.

    The lower-cased text is analysed and its sentences are scanned in
    order; the first one whose ``_.language['score']`` exceeds 0.8 decides
    the answer.

    :return: that sentence's ``_.language['language']`` value, or ``None``
        when no sentence qualifies or the analysis fails.
    """
    result = None
    try:
        parsed = self.analysis_pipe(text.lower())
        result = next(
            (sent._.language['language'] for sent in parsed.sents if sent._.language['score'] > 0.8),
            None,
        )
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error language_detector: {0}'.format(e))
    return result
def dependency(self, text):
    """Describe every noun chunk of the lower-cased *text*.

    :return: list of dicts with the chunk object (``chunk``), its text
        (``text``) and the text and dependency label of its root token
        (``root_text``, ``root_dep``); entries built before a failure are
        kept.
    """
    result = []
    try:
        parsed = self.analysis_pipe(text.lower())
        # Materialise the chunks first so a failure while parsing them
        # leaves ``result`` empty rather than half-filled.
        for span in list(parsed.noun_chunks):
            root = span.root
            result.append(dict(chunk=span, text=span.text, root_text=root.text, root_dep=root.dep_))
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error dependency: {0}'.format(e))
    return result
def load_sapcy(self, lang):
    """Load the spaCy model for *lang* and attach the emoji and stemmer components.

    ``'es'`` loads ``es_core_news_md``; any other value loads
    ``en_core_web_md``. The emoji component is placed first in the pipeline
    and the stemmer is added after the parser under the name ``stemmer``.

    :return: the pipeline, ``None`` when loading fails, or the loaded but
        only partly extended pipeline when adding a component fails.
    """
    result = None
    try:
        stemmer_text = Steaming(lang)  # initialise component
        if lang == 'es':
            result = spacy.load('es_core_news_md')
        else:
            result = spacy.load('en_core_web_md')
        emoji = Emoji(result)
        result.add_pipe(emoji, first=True)
        result.add_pipe(stemmer_text, after='parser', name='stemmer')
        print('Language: {0}\nText Analysis: {1}'.format(lang, result.pipe_names))
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error load_sapcy: {0}'.format(e))
    return result
def clean_text(self, text, **kwargs):
    """Normalise *text* for feature extraction.

    The text is converted with ``str()`` and lower-cased; a text that is
    exactly one e-mail address becomes ``[EMAIL]``; emojis, URLs, mentions
    and hashtags are replaced by ``[EMOJI]``, ``[URL]``, ``[MENTION]`` and
    ``[HASHTAG]`` when the matching flag is enabled; digits are removed;
    special characters are blanked through ``self.delete_special_patterns``;
    stop words are optionally removed; single-character tokens are dropped
    and whitespace is collapsed.

    :param text: value to clean.
    :param kwargs: boolean flags ``url``, ``mention``, ``emoji``,
        ``hashtag``, ``relabel`` and ``stopwords``. A flag that is missing
        or not a ``bool`` counts as ``False``. With ``relabel`` false the
        placeholder labels are removed again instead of being kept.
    :return: the cleaned string (possibly empty), or ``None`` on failure.
    """
    result = None
    try:
        labels = ['EMAIL', 'EMOJI', 'MENTION', 'HASHTAG', 'URL']

        def flag(name):
            # Only genuine booleans enable an option.
            value = kwargs.get(name)
            return value if isinstance(value, bool) else False

        url = flag('url')
        mention = flag('mention')
        emoji = flag('emoji')
        hashtag = flag('hashtag')
        relabel = flag('relabel')
        stopwords = flag('stopwords')

        text_out = str(text).lower()
        text_out = re.sub(
            r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)', '[EMAIL]', text_out)
        if emoji:
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
        if url:
            text_out = re.sub(
                r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                '[URL]', text_out)
        if mention:
            text_out = re.sub("@([A-Za-z0-9_]{1,40})", '[MENTION]', text_out)
        if hashtag:
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASHTAG]', text_out)
        text_out = re.sub("[0-9]", '', text_out)
        if not relabel:
            # Strip every placeholder regardless of which replacement flags
            # were enabled (this used to run only when ``mention`` was set).
            for label in labels:
                text_out = re.sub(r'\[' + label + r'\]', ' ', text_out)
        text_out = self.delete_special_patterns(text_out)
        if stopwords:
            text_out = self.stopwords(text_out)
        # removing any single letter on a string
        text_out = re.sub(r'((?<=^)|(?<= )).((?=$)|(?= ))', ' ', text_out).strip()
        # condense multiple spaces with a single space
        text_out = re.sub(r'\s+', ' ', text_out).strip()
        text_out = text_out.rstrip()
        result = text_out if text_out != ' ' else None
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error clean_text: {0}'.format(e))
    return result
def lexical_diversity(text):
    """Return the ratio of distinct characters to total characters in *text*.

    Code points from U+10000 upwards and URLs are removed and the remainder
    is lower-cased before counting. The ratio is computed over characters,
    not words, and rounded to four decimals.

    :return: the ratio; ``0.0`` when nothing is left after the removals;
        ``None`` when processing fails (for example *text* is not a string).
    """
    result = None
    try:
        text_out = re.sub(r"[\U00010000-\U0010ffff]", '', text)
        text_out = re.sub(
            r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
            r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
            '', text_out)
        text_out = text_out.lower()
        if not text_out:
            # Nothing left to measure: avoid dividing by zero and report no
            # diversity instead of an error.
            result = 0.0
        else:
            result = round(len(set(text_out)) / len(text_out), 4)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error lexical_diversity: {0}'.format(e))
    return result
def weighted_position(tokens_text):
    """Compute two position-based weights over a token list.

    Every token contributes according to the 1-based position of its
    *first* occurrence ``p``: ``1 / p`` to the first value and ``p / size``
    to the second, where ``size`` is the number of tokens.

    :param tokens_text: list of hashable tokens (strings).
    :return: ``(weighted_words, weighted_normalized)``; ``(0.0, 0.0)`` for
        an empty list; ``None`` when processing fails.
    """
    result = None
    try:
        size = len(tokens_text)
        # Record the first position of each token once instead of calling
        # list.index() for every token, which is quadratic.
        first_position = {}
        for position, token in enumerate(tokens_text):
            first_position.setdefault(token, position)
        weighted_words = 0.0
        weighted_normalized = 0.0
        for token in tokens_text:
            rank = 1 + first_position[token]
            weighted_words += 1 / rank
            weighted_normalized += rank / size
        result = (weighted_words, weighted_normalized)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error weighted_position: {0}'.format(e))
    return result
def stopwords(self, text):
    """Remove stop words from *text*.

    A blank ``Spanish`` pipeline is used when ``self.lang`` is ``'es'``,
    otherwise a blank ``English`` one; tokens whose vocabulary entry is
    flagged as a stop word are dropped.

    :return: the remaining tokens joined by single spaces, or ``None`` on
        failure.
    """
    filtered = None
    try:
        if self.lang == 'es':
            nlp = Spanish()
        else:
            nlp = English()
        words = [token.text for token in nlp(text)]
        kept = [word for word in words if not nlp.vocab[word].is_stop]
        filtered = ' '.join(kept)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error stopwords: {0}'.format(e))
    return filtered
def dependency_all(self, text):
    """Describe the root token of every noun chunk in the lower-cased *text*.

    Each entry carries the chunk object, the root token's text, POS,
    dependency label, tag, lemma and stop/punctuation flags, the text and
    POS of the root's head, and one description per child of the root.

    :return: list of entries; entries built before a failure are kept.
    """
    result = []
    try:
        parsed = self.analysis_pipe(text.lower())
        for span in parsed.noun_chunks:
            root = span.root
            entry = {
                'chunk': span,
                'text': root.text,
                'pos_': root.pos_,
                'dep_': root.dep_,
                'tag_': root.tag_,
                'lemma_': root.lemma_,
                'is_stop': root.is_stop,
                'is_punct': root.is_punct,
                'head_text': root.head.text,
                'head_pos': root.head.pos_,
                'children': [
                    {
                        'child': kid,
                        'pos_': kid.pos_,
                        'dep_': kid.dep_,
                        'tag_': kid.tag_,
                        'lemma_': kid.lemma_,
                        'is_stop': kid.is_stop,
                        'is_punct': kid.is_punct,
                        'head.text': kid.head.text,
                        'head.pos_': kid.head.pos_,
                    }
                    for kid in root.children
                ],
            }
            result.append(entry)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error dependency_all: {0}'.format(e))
    return result
def pos_frequency(self, text):
    """Count nouns, verbs, adjectives and all other tokens in *text*.

    Tokens come from ``self.ta.tagger(text)``; each one is counted under
    ``NOUN``, ``VERB`` or ``ADJ`` according to its ``pos`` value and under
    ``ANOTHER`` otherwise.

    :return: dict with the four counters; counts gathered before a failure
        are kept (all zeros when the tagger itself fails).
    """
    dict_token = {'NOUN': 0, 'VERB': 0, 'ADJ': 0, 'ANOTHER': 0}
    tracked = ('NOUN', 'VERB', 'ADJ')
    try:
        for token in self.ta.tagger(text):
            pos = token['pos']
            bucket = pos if pos in tracked else 'ANOTHER'
            dict_token[bucket] += 1
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error pos_frequency: {0}'.format(e))
    return dict_token
def get_lexical_features(self, text):
    """Build the lexical feature vector of *text*.

    The raw text yields ``lexical_diversity``; the text is then cleaned
    through ``self.ta.clean_text`` (labels kept, stop words kept) and
    tokenised with ``TweetTokenizer``. From the tokens come the position
    weights, placeholder-label counts, counts against the module-level
    ``lexical`` word lists, word-length statistics and weighted
    part-of-speech counts.

    The position of each feature in the result follows the insertion order
    of ``vector`` below; consumers rely on that order.

    :return: ``numpy`` array of the feature values, or ``None`` on failure.
    """
    try:
        setting = {'url': True, 'mention': True, 'emoji': True, 'hashtag': True, 'stopwords': False,
                   'relabel': True}
        text_tokenizer = TweetTokenizer()
        tags = ('mention', 'url', 'hashtag', 'emoji', 'rt', 'numero', 'nombre', 'apellido')
        vector = dict()
        vector['lexical_diversity'] = self.lexical_diversity(text)
        text = self.ta.clean_text(text, **setting)
        tokens_text = text_tokenizer.tokenize(text)

        def label_count(label):
            # Number of tokens equal to a placeholder label.
            return float(sum(1 for word in tokens_text if word == label))

        def lexicon_count(key):
            # Number of tokens contained in the ``lexical[key]`` word list.
            return float(sum(1 for word in tokens_text if word in lexical[key]))

        def rounded_or_zero(value):
            # Statistics over an empty sample are NaN; report them as 0.0.
            return round(value if not np.isnan(value) else 0.0, 4)

        vector['weighted_position'], vector['weighted_normalized'] = self.weighted_position(tokens_text)
        vector['label_mention'] = label_count('mention')
        vector['label_url'] = label_count('url')
        vector['label_hashtag'] = label_count('hashtag')
        vector['label_emoji'] = label_count('emoji')
        vector['label_retweets'] = label_count('rt')
        label_word = vector['label_mention'] + vector['label_url'] + vector['label_hashtag']
        label_word = label_word + vector['label_emoji'] + vector['label_retweets']
        vector['label_word'] = float(len(tokens_text) - label_word)

        for key in ('first_person_singular', 'second_person_singular', 'third_person_singular',
                    'first_person_plurar', 'second_person_plurar', 'third_person_plurar'):
            vector[key] = lexicon_count(key)

        # Lengths of the real words (placeholder labels excluded), computed
        # once for the three statistics.
        word_lengths = [len(word) for word in tokens_text if word not in tags]
        vector['avg_word'] = rounded_or_zero(np.nanmean(word_lengths))
        vector['kur_word'] = rounded_or_zero(kurtosis(word_lengths))
        vector['skew_word'] = rounded_or_zero(skew(np.array(word_lengths)))

        # adverbs
        for key in ('adverb_neg', 'adverb_time', 'adverb_place', 'adverb_mode', 'adverb_cant'):
            vector[key] = lexicon_count(key)
        vector['adverb_all'] = float(vector['adverb_neg'] + vector['adverb_time'] + vector['adverb_place'])
        vector['adverb_all'] = float(vector['adverb_all'] + vector['adverb_mode'] + vector['adverb_cant'])

        for key in ('adjetives_neg', 'adjetives_pos', 'who_general', 'who_male', 'who_female'):
            vector[key] = lexicon_count(key)

        # Tag the cleaned text once; each call runs the whole tagger.
        pos_counts = self.pos_frequency(text)
        vector['noun'] = pos_counts['NOUN'] * 0.8
        vector['verb'] = pos_counts['VERB'] * 0.5
        vector['adj'] = pos_counts['ADJ'] * 0.4
        vector['pos_others'] = pos_counts['ANOTHER'] * 0.1
        return np.array(list(vector.values()))
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error get_lexical_features: {0}'.format(e))
    return None
def syntax_patterns(self, text):
    """Collect noun, verb, adverb and adjective phrase patterns from *text*.

    Every sentence of ``self.nlp(text)`` is analysed with
    ``self.dependency_all``. Items that are stop words, punctuation or
    pronouns are skipped. For the remaining items the lower-cased chunk is
    combined with its children and, for non-root items, with its head to
    form phrases. Each phrase is stored under its text; the value is
    ``[word, pos]`` for a single word or a list of ``[word, pos]`` pairs.

    :return: ``{'NOUN': ..., 'VERB': ..., 'ADV': ..., 'ADJ': ...}`` mapping
        to the phrase dictionaries, or ``None`` when processing fails.
    """
    result = None
    try:
        doc = self.nlp(text)
        dict_noun = {}
        dict_verb = {}
        dict_adv = {}
        dict_adj = {}

        def low(value):
            # Lower-cased string form of a chunk, token or text.
            return str(value).lower()

        def phrase(*parts):
            # Build the (key, value) pair of a multi-word pattern from
            # (word, pos) parts given in phrase order.
            key = ' '.join(word for word, _ in parts)
            value = [[word, pos] for word, pos in parts]
            return key, value

        for span in doc.sents:
            for item in self.dependency_all(str(span)):
                pos = item['pos_']
                # Compare the tag for equality: a substring test against
                # 'PRON' is not a POS comparison.
                if item['is_stop'] is True or item['is_punct'] is True or pos == 'PRON':
                    continue
                chunk_text = low(item['chunk'])
                chunk_part = (chunk_text, pos)
                child_parts = [(low(child['child']), child['pos_']) for child in item['children']]

                if pos == 'NOUN':
                    # NOUN
                    dict_noun[chunk_text] = [chunk_text, pos]
                    # Chinking
                    for child_part in child_parts:
                        if child_part[1] == 'ADJ':
                            # ADJ + NOUN
                            key, value = phrase(child_part, chunk_part)
                            dict_noun[key] = value
                            dict_adj[key] = value
                        elif child_part[1] == 'ADP':
                            # ADP + NOUN
                            key, value = phrase(child_part, chunk_part)
                            dict_noun[key] = value
                elif pos in ['PRON', 'PROPN']:
                    # Pronouns were skipped above, so only PROPN reaches here.
                    for child_part in child_parts:
                        if child_part[1] == 'NOUN':
                            # PROPN + NOUN
                            key, value = phrase(chunk_part, child_part)
                            dict_noun[key] = value
                elif pos == 'ADJ':
                    # ADJ
                    dict_adj[chunk_text] = [chunk_text, pos]
                    for child_part in child_parts:
                        if child_part[1] == 'NOUN':
                            # ADJ + NOUN
                            key, value = phrase(chunk_part, child_part)
                            dict_adj[key] = value

                # Head-based patterns only make sense when the item has a
                # head other than itself, i.e. it is not the sentence root.
                if item['dep_'] != 'ROOT':
                    head_pos = item['head_pos']
                    head_part = (low(item['head_text']), head_pos)
                    if head_pos == 'NOUN':
                        for child_part in child_parts:
                            if child_part[1] == 'ADP':
                                # NOUN + ADP + NOUN
                                key, value = phrase(head_part, child_part, chunk_part)
                                dict_noun[key] = value
                    elif head_pos == 'ADJ':
                        for child_part in child_parts:
                            if child_part[1] == 'ADJ':
                                # ADJ + ADJ + NOUN
                                key, value = phrase(head_part, child_part, chunk_part)
                                dict_noun[key] = value
                                dict_adj[key] = value
                    elif head_pos == 'VERB':
                        for child_part in child_parts:
                            if child_part[1] == 'ADJ':
                                # VERB + NOUN + ADJ
                                key, value = phrase(head_part, chunk_part, child_part)
                                dict_verb[key] = value
                            elif child_part[1] == 'ADP':
                                # VERB + ADP + NOUN
                                key, value = phrase(head_part, child_part, chunk_part)
                                dict_verb[key] = value
                    elif str(head_pos) == 'ADV':
                        for child_part in child_parts:
                            if child_part[1] == 'ADV':
                                # ADV + ADV + NOUN
                                key, value = phrase(head_part, child_part, chunk_part)
                                dict_adv[key] = value
                            elif child_part[1] == 'ADJ':
                                # ADV + NOUN + ADJ
                                key, value = phrase(head_part, chunk_part, child_part)
                                dict_adv[key] = value

        result = {'NOUN': dict_noun, 'VERB': dict_verb, 'ADV': dict_adv, 'ADJ': dict_adj}
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error syntax_patterns: {0}'.format(e))
    return result