Example #1
 def token_frequency(model_name, corpus_vec):
     dict_token = {}
     try:
         sep = os.sep
         file_output = DIR_EMBEDDING + 'frequency' + sep + 'frequency_' + model_name + '.csv'
         for list_tokens in corpus_vec:
             for token in list_tokens:
                 if token not in [
                         ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                         '.'
                 ]:
                     if token in dict_token:
                         dict_token[token] += 1
                     else:
                         dict_token[token] = 1
         list_token = [{
             'token': k,
             'freq': v
         } for k, v in dict_token.items()]
         df = pd.DataFrame(list_token, columns=['token', 'freq'])
         df.to_csv(file_output, encoding="utf-8", sep=";", index=False)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error token_frequency: {0}'.format(e))
     return dict_token
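The counting loop above is equivalent to a collections.Counter; a minimal standalone sketch with a made-up corpus_vec:

    from collections import Counter

    skip = {' ', '.', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
    corpus_vec = [['hola', 'mundo', '.'], ['hola']]  # hypothetical input
    counts = Counter(t for tokens in corpus_vec for t in tokens if t not in skip)
    print(counts)  # Counter({'hola': 2, 'mundo': 1})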
Example #2
 def import_words_corpus(self):
     """
     :Version: 1.0
     :Author: Edwin Puertas
     This function  import corpus in spanish and english from SemEval-2018 AIT DISC.
     :param lang: language
     :type lang: Text
     :rtype: Object
     :return: Object embedding
     """
     result = []
     try:
         file_es = 'SemEval-2018_AIT_DISC_ES.csv'
         file_en = 'SemEval-2018_AIT_DISC_EN.csv'
         print('Loading.... {0} corpus'.format(file_es if self.lang ==
                                               'es' else file_en))
         if self.lang == 'es':
             corpus = self.text_analysis.import_corpus(file=file_es)
         else:
             corpus = self.text_analysis.import_corpus(file=file_en)
         result = [i[1] for i in corpus]
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error import_words_corpus: {0}'.format(e))
     return result
Example #3
    def get_features(self, messages, model_type='11111', binary_vad='0000'):
        try:
            # model_type is a 5-bit mask selecting feature blocks, in order:
            # word, syllable, phoneme frequency, first phoneme, all phonemes.
            word_features = self.get_feature_word(messages)
            syllable_features = self.get_feature_syllable(messages)
            phoneme_frequency = self.get_frequency_phoneme(messages)
            one_syllable = self.get_feature_phoneme(messages)
            all_syllable = self.get_feature_phoneme(messages, syllable=True)
            vad_features = self.get_feature_vad(messages, binary=binary_vad)
            result = np.zeros((len(messages), 0), dtype="float32")
            # independent if-checks, so every set bit contributes its features
            if int(model_type[0]) == 1:
                result = np.append(result, word_features, axis=1)
            if int(model_type[1]) == 1:
                result = np.append(result, syllable_features, axis=1)
            if int(model_type[2]) == 1:
                result = np.append(result, phoneme_frequency, axis=1)
            if int(model_type[3]) == 1:
                result = np.append(result, one_syllable, axis=1)
            if int(model_type[4]) == 1:
                result = np.append(result, all_syllable, axis=1)

            result = np.append(result, vad_features, axis=1)
            return result
        except Exception as e:
            Utils.standard_error(sys.exc_info())
            print('Error get_features: {0}'.format(e))
            return None
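For reference, a minimal standalone sketch of the mask-driven concatenation, using dummy feature blocks (the names and shapes here are illustrative, not the real outputs of the methods above):

    import numpy as np

    blocks = {  # hypothetical per-message feature blocks, 3 messages each
        'word': np.ones((3, 4), dtype='float32'),
        'syllable': np.full((3, 2), 2.0, dtype='float32'),
        'phoneme': np.full((3, 1), 3.0, dtype='float32'),
    }
    mask = '101'  # keep the word and phoneme blocks, drop the syllable block
    result = np.zeros((3, 0), dtype='float32')
    for bit, name in zip(mask, blocks):
        if bit == '1':
            result = np.append(result, blocks[name], axis=1)
    print(result.shape)  # (3, 5)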
Example #4
    def __init__(self, lang='es', text_analysis=None):
        try:
            if text_analysis is None:
                self.ta = TextAnalysis(lang=lang)
            else:
                self.ta = text_analysis
            file_lexicon = DIR_INPUT + 'NRC-VAD-Lexicon.txt'
            file_word_embedding_en = DIR_MODELS + 'word_embedding_en.model'
            file_word_embedding_es = DIR_MODELS + 'word_embedding_es.model'
            file_syllable_embedding_en = DIR_MODELS + 'syllable_embedding_en.model'
            file_syllable_embedding_es = DIR_MODELS + 'syllable_embedding_es.model'
            file_phoneme_embedding_en = DIR_MODELS + 'phoneme_embedding_en.model'
            file_phoneme_embedding_es = DIR_MODELS + 'phoneme_embedding_es.model'
            print('Loading Lexicons and Embedding.....')
            # the lexicon load is identical in both branches, so hoist it
            lexicon = self.ta.import_lexicon_vad(file_lexicon, lang=lang)
            if lang == 'es':
                epi = epitran.Epitran('spa-Latn')
                word_embedding = Word2Vec.load(file_word_embedding_es)
                syllable_embedding = Word2Vec.load(file_syllable_embedding_es)
                phoneme_embedding = Word2Vec.load(file_phoneme_embedding_es)
            else:
                epi = epitran.Epitran('eng-Latn')
                word_embedding = Word2Vec.load(file_word_embedding_en)
                syllable_embedding = Word2Vec.load(file_syllable_embedding_en)
                phoneme_embedding = Word2Vec.load(file_phoneme_embedding_en)

            self.epi = epi
            self.lexicon = lexicon
            self.word_embedding = word_embedding
            self.syllable_embedding = syllable_embedding
            self.phoneme_embedding = phoneme_embedding
        except Exception as e:
            Utils.standard_error(sys.exc_info())
            print('Error FeatureExtraction: {0}'.format(e))
Example #5
 def dependency_child(self, text):
     result = []
     try:
         doc = self.analysis_pipe(text.lower())
         for token in doc:
             item = {
                 'chunk': token.text,
                 'text': token.text,
                 'pos_': token.pos_,
                 'dep_': token.dep_,
                 'tag_': token.tag_,
                 'head_text': token.head.text,
                 'head_pos': token.head.pos_,
                 'children': None
             }
             children = list(token.children)
             if children:
                 item['children'] = [{
                     'child': child,
                     'pos_': child.pos_,
                     'dep_': child.dep_,
                     'tag_': child.tag_,
                     'head.text': child.head.text,
                     'head.pos_': child.head.pos_
                 } for child in children]
             result.append(item)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error dependency_child: {0}'.format(e))
     return result
Example #6
 def transform(self, list_messages):
     try:
         result = self.get_features(list_messages)
         return result
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error transform: {0}'.format(e))
Example #7
 def get_frequency_phoneme(self, messages):
     try:
         counter = 0
         model = self.phoneme_embedding
         index2phoneme = list(model.wv.index2word)
         # precompute a phoneme -> index map to avoid O(n) list.index lookups
         phoneme2index = {p: i for i, p in enumerate(index2phoneme)}
         num_features = len(index2phoneme)
         msg_feature_vec = np.zeros((len(messages), num_features),
                                    dtype="float32")
         for msg in tqdm(messages):
             # print('Msg: {0}'.format(msg))
             feature_vec = np.zeros(num_features, dtype="float32")
             list_syllable = [
                 token['syllables'] for token in self.ta.tagger(msg)
                 if token['syllables'] is not None
             ]
             for syllable in list_syllable:
                 for s in syllable:
                     syllable_phonetic = self.epi.transliterate(
                         s, normpunc=True)
                     if syllable_phonetic in phoneme2index:
                         feature_vec[phoneme2index[syllable_phonetic]] += 1
             msg_feature_vec[counter] = feature_vec
             counter += 1
         return msg_feature_vec
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error get_frequency_phoneme: {0}'.format(e))
         return None
Example #8
 def get_feature_syllable(self, messages, syllable_binary='11'):
     try:
         counter = 0
         model = self.syllable_embedding
         num_features = model.vector_size
         index2phoneme_set = set(model.wv.index2word)
         msg_feature_vec = np.zeros((len(messages), num_features),
                                    dtype="float32")
         for msg in tqdm(messages):
             num_phonemes = 1
             feature_vec = []
             # print('Msg: {0}'.format(msg))
             list_syllable = [
                 token['syllables'] for token in self.ta.tagger(msg)
                 if token['syllables'] is not None
             ]
             for syllable in list_syllable:
                 for s in syllable:
                     syllable_phonetic = self.epi.transliterate(
                         s, normpunc=True)
                     if syllable_phonetic in index2phoneme_set:
                         vec = model.wv[syllable_phonetic]
                         feature_vec.append(vec)
                         num_phonemes += 1
             feature_vec = np.array(feature_vec, dtype="float32")
             feature_vec = np.sum(feature_vec, axis=0)
             feature_vec = np.divide(feature_vec, num_phonemes)
             msg_feature_vec[counter] = feature_vec
             counter += 1
         return msg_feature_vec
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error get_feature_syllable: {0}'.format(e))
         return None
Example #9
 def get_feature_word(self, messages):
     try:
         counter = 0
         model = self.word_embedding
         num_features = model.vector_size
         index2word_set = set(model.wv.index2word)
         msg_feature_vec = np.zeros((len(messages), num_features),
                                    dtype="float32")
         for msg in tqdm(messages):
             num_words = 1  # starts at 1, so the mean divides by n + 1 (avoids zero division)
             feature_vec = []
             list_words = [token['text'] for token in self.ta.tagger(msg)]
             for word in list_words:
                 if word in index2word_set:
                     vec = model.wv[word]
                     feature_vec.append(vec)
                 else:
                     feature_vec.append(
                         np.zeros(num_features, dtype="float32"))
                 num_words += 1
             feature_vec = np.array(feature_vec, dtype="float32")
             feature_vec = np.sum(feature_vec, axis=0)
             feature_vec = np.divide(feature_vec, num_words)
             msg_feature_vec[counter] = feature_vec
             counter += 1
         return msg_feature_vec
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error get_feature_word: {0}'.format(e))
         return None
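As a standalone illustration of the mean pooling used here, a sketch over a toy vocabulary (the vectors and words are made up; note that, as above, the divisor is n + 1 because the counter starts at 1):

    import numpy as np

    vocab = {'hola': np.array([1.0, 0.0], dtype='float32'),
             'mundo': np.array([0.0, 1.0], dtype='float32')}  # toy embedding
    words = ['hola', 'mundo', 'oov']
    vecs = [vocab.get(w, np.zeros(2, dtype='float32')) for w in words]
    mean_vec = np.sum(vecs, axis=0) / (1 + len(words))  # n + 1 divisor, as above
    print(mean_vec)  # [0.25 0.25]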
Example #10
 def tagger(self, text):
     result = None
     try:
         list_tagger = []
         doc = self.analysis_pipe(text.lower())
         for token in doc:
             item = {
                 'text': token.text,
                 'lemma': token.lemma_,
                 'stem': token._.stem,
                 'pos': token.pos_,
                 'tag': token.tag_,
                 'dep': token.dep_,
                 'shape': token.shape_,
                 'is_alpha': token.is_alpha,
                 'is_stop': token.is_stop,
                 'is_digit': token.is_digit,
                 'is_punct': token.is_punct,
                 'syllables': token._.syllables
             }
             list_tagger.append(item)
         result = list_tagger
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error tagger: {0}'.format(e))
     return result
Example #11
 def analysis_pipe(self, text):
     doc = None
     try:
         doc = self.nlp(text.lower())
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error analysis_pipe: {0}'.format(e))
     return doc
Example #12
    def part_embedding(self,
                       model_name='part_embedding',
                       size=150,
                       min_count=10,
                       window=5,
                       sample=6e-5,
                       negative=20,
                       alpha=0.03,
                       min_alpha=0.0007,
                       syllable=True):
        """
        :Version: 1.0
        :Author: Edwin Puertas
        This function generated phonemes embedding in spanish and english.
        :param list_doc: list of documents (corpus)
        :type list: Text
        :rtype: dict
        :return: terms by documents
        """
        try:
            start_time = time.time()
            corpus_vec = self.text_analysis.part_vector(self.part_corpus,
                                                        syllable=syllable)

            model = Word2Vec(corpus_vec,
                             cbow_mean=1,
                             workers=self.cores - 1,
                             size=size,
                             min_count=min_count,
                             window=window,
                             sample=sample,
                             negative=negative,
                             alpha=alpha,
                             min_alpha=min_alpha,
                             iter=10)

            model_name = model_name + '_' + self.lang
            file_name = DIR_MODELS + model_name + '.model'
            model.save(file_name)
            print('Model {0} generated successfully!'.format(file_name))

            vocabulary = list(model.wv.vocab)
            print('Vocabulary: {0}'.format(vocabulary))

            self.text_analysis.token_frequency(model_name=model_name,
                                               corpus_vec=corpus_vec)
            # compute processing time
            t_sec = round(time.time() - start_time)
            (t_min, t_sec) = divmod(t_sec, 60)
            (t_hour, t_min) = divmod(t_min, 60)
            time_processing = '{} hour:{} min:{} sec'.format(
                t_hour, t_min, t_sec)
            print('Time Processing: {}'.format(time_processing))
        except Exception as e:
            Utils.standard_error(sys.exc_info())
            print('Error part_embedding: {0}'.format(e))
Example #13
 def stemming(self, text):
     try:
         tokens = word_tokenize(text)
         stemmed = [self.stemmer.stem(word) for word in tokens]
         text = ' '.join(stemmed)
         return text
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error stemming: {0}'.format(e))
         return None
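self.stemmer is not shown in this excerpt; a minimal standalone sketch, assuming an NLTK SnowballStemmer as a stand-in (and that the punkt tokenizer data is installed):

    from nltk.stem import SnowballStemmer
    from nltk.tokenize import word_tokenize

    stemmer = SnowballStemmer('spanish')  # hypothetical stand-in for self.stemmer
    tokens = word_tokenize('corriendo y saltando')  # needs nltk 'punkt' data
    print(' '.join(stemmer.stem(w) for w in tokens))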
Example #14
 def proper_encoding(text):
     result = ''
     try:
         text = unicodedata.normalize('NFD', text)
         text = text.encode('ascii', 'ignore')
         result = text.decode("utf-8")
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error proper_encoding: {0}'.format(e))
     return result
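Usage note: NFD normalization splits each accented character into a base letter plus a combining mark, and the ascii/ignore round trip then drops the marks:

    import unicodedata

    text = unicodedata.normalize('NFD', 'canción')
    print(text.encode('ascii', 'ignore').decode('utf-8'))  # cancion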
Example #15
 def __init__(self, lang='es', text_analysis=None):
     try:
         print('Load Machine Learning')
         if text_analysis is None:
             self.ta = TextAnalysis(lang=lang)
         else:
             self.ta = text_analysis
         self.features = FeatureExtraction(lang=lang, text_analysis=self.ta)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error MachineLearning: {0}'.format(e))
Example #16
 def lemmatization(self, text):
     result = ''
     list_tmp = []
     try:
         doc = self.analysis_pipe(text.lower())
         for token in doc:
             list_tmp.append(str(token.lemma_))
         result = ' '.join(list_tmp)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error lemmatization: {0}'.format(e))
     return result
Example #17
 def get_feature_phoneme(self, messages, syllable=False):
     try:
         counter = 0
         model = self.phoneme_embedding
         num_features = model.vector_size
         index2phoneme_set = set(model.wv.index2word)
         msg_feature_vec = np.zeros((len(messages), num_features),
                                    dtype="float32")
         for msg in tqdm(messages):
             size = 1
             feature_vec = []
             list_syllable = [
                 token['syllables'] for token in self.ta.tagger(msg)
                 if token['syllables'] is not None
             ]
             if syllable:
                 try:
                     first_syllable = str(list_syllable[0][0])
                     # keep only the leading character of the first syllable
                     first_syllable = first_syllable[0] if first_syllable else ''
                     syllable_phonetic = self.epi.transliterate(
                         first_syllable)
                     if syllable_phonetic in index2phoneme_set:
                         vec = model.wv[syllable_phonetic]
                         feature_vec.append(vec)
                     else:
                         feature_vec.append(
                             np.zeros(num_features, dtype="float32"))
                 except Exception as e_epi:
                     print('Error transliterate: {0}'.format(e_epi))
             else:
                 list_phoneme = self.epi.trans_list(msg)
                 size = len(list_phoneme)
                 for phoneme in list_phoneme:
                     if phoneme in index2phoneme_set:
                         vec = model.wv[phoneme]
                         feature_vec.append(vec)
                     else:
                         feature_vec.append(
                             np.zeros(num_features, dtype="float32"))
             # print('Vector: {0}'.format(feature_vec))
             feature_vec = np.array(feature_vec, dtype="float32")
             feature_vec = np.sum(feature_vec, axis=0)
             feature_vec = np.divide(feature_vec, size)
             msg_feature_vec[counter] = feature_vec
             counter += 1
         return msg_feature_vec
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error get_feature_phoneme: {0}'.format(e))
         return None
Example #18
 def get_feature_vad(self, messages, binary='0000'):
     try:
         counter = 0
         num_features = 4
         msg_feature_vec = np.zeros((len(messages), num_features),
                                    dtype="float32")
         for msg in tqdm(messages):
             dict_vad = self.get_vad(msg)
             v = dict_vad['valence']
             a = dict_vad['arousal']
             d = dict_vad['dominance']
             vad = dict_vad['vad']
             # each mask bit enables one component, in order: v, a, d, vad
             row = [v if binary[0] == '1' else 0.0,
                    a if binary[1] == '1' else 0.0,
                    d if binary[2] == '1' else 0.0,
                    vad if binary[3] == '1' else 0.0]
             msg_feature_vec[counter] = row
             counter = counter + 1
         return msg_feature_vec
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error get_feature_vad: {0}'.format(e))
         return None
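For example, the mask '1010' keeps valence and dominance and zeroes the rest; a minimal sketch of the masking expression on made-up scores:

    v, a, d, vad = 0.7, 0.4, 0.2, 0.5  # hypothetical VAD scores
    binary = '1010'
    row = [v if binary[0] == '1' else 0.0,
           a if binary[1] == '1' else 0.0,
           d if binary[2] == '1' else 0.0,
           vad if binary[3] == '1' else 0.0]
    print(row)  # [0.7, 0.0, 0.2, 0.0]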
Example #19
 def stopwords(text):
     result = ''
     try:
         nlp = Spanish() if TextAnalysis.lang == 'es' else English()
         doc = nlp(text)
         token_list = [token.text for token in doc]
         result = ' '.join(word for word in token_list
                           if not nlp.vocab[word].is_stop)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error stopwords: {0}'.format(e))
     return result
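A minimal standalone sketch of the same stop-word check with spaCy's blank Spanish pipeline (the sentence is arbitrary):

    from spacy.lang.es import Spanish

    nlp = Spanish()
    doc = nlp('esta es una prueba sencilla')
    print(' '.join(t.text for t in doc if not nlp.vocab[t.text].is_stop))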
Example #20
 def dependency(self, text):
     result = []
     try:
         doc = self.analysis_pipe(text.lower())
         doc_chunks = list(doc.noun_chunks)
         for chunk in doc_chunks:
             item = {
                 'chunk': chunk,
                 'text': chunk.text,
                 'root_text': chunk.root.text,
                 'root_dep': chunk.root.dep_
             }
             result.append(item)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error dependency: {0}'.format(e))
     return result
Example #21
    def words_embedding(self,
                        model_name='word_embedding',
                        size=300,
                        min_count=50,
                        window=5,
                        sample=6e-5,
                        negative=20,
                        alpha=0.03,
                        min_alpha=0.0007):
        try:
            start_time = time.time()
            corpus_vec = self.text_analysis.sentences_vector(self.corpus)

            model = Word2Vec(corpus_vec,
                             cbow_mean=1,
                             workers=self.cores - 1,
                             size=size,
                             min_count=min_count,
                             window=window,
                             sample=sample,
                             negative=negative,
                             alpha=alpha,
                             min_alpha=min_alpha,
                             iter=10)

            model_name = model_name + '_' + self.lang
            file_name = DIR_MODELS + model_name + '.model'
            model.save(file_name)
            print('Model {0} generated successfully!'.format(model_name))

            vocabulary = list(model.wv.vocab)
            print('Vocabulary: {0}'.format(vocabulary))

            self.text_analysis.token_frequency(model_name=model_name,
                                               corpus_vec=corpus_vec)
            # compute processing time
            t_sec = round(time.time() - start_time)
            (t_min, t_sec) = divmod(t_sec, 60)
            (t_hour, t_min) = divmod(t_min, 60)
            time_processing = '{} hour:{} min:{} sec'.format(
                t_hour, t_min, t_sec)
            print('Time Processing: {}'.format(time_processing))
        except Exception as e:
            Utils.standard_error(sys.exc_info())
            print('Error words_embedding: {0}'.format(e))
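For reference, a minimal standalone training call with the same gensim 3.x parameter names used here (size and iter were renamed to vector_size and epochs in gensim 4; the toy corpus is made up):

    from gensim.models import Word2Vec

    toy_corpus = [['hola', 'mundo'], ['hola', 'word2vec'], ['mundo', 'word2vec']]
    model = Word2Vec(toy_corpus, size=50, min_count=1, window=2,
                     negative=20, alpha=0.03, min_alpha=0.0007, iter=10)
    print(model.wv['hola'].shape)  # (50,)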
Example #22
 def get_similarity(self, model_name):
     dict_vocabulary = {}
     try:
         file_model = "{0}{1}_{2}.model".format(DIR_MODELS, model_name,
                                                self.lang)
         model = Word2Vec.load(file_model, mmap=None)
         vocabulary = list(model.wv.vocab)
         for token in vocabulary:
             # compute the neighbours once and reuse them
             similar = model.wv.most_similar(token)
             dict_vocabulary[token] = similar
             if token != '':
                 print('Token: {0}\nMost Similar:'.format(token))
                 for neighbour in similar:
                     print(neighbour)
         print(vocabulary)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error get_similarity: {0}'.format(e))
     return dict_vocabulary
Example #23
 def delete_special_patterns(text):
     result = ''
     try:
         text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', ' ',
                       text)  # remove special characters
         text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', ' ',
                       text)  # remove punctuation
         text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', ' ',
                       text)  # remove brackets and parentheses
         text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$|\.', ' ',
                       text)  # remove operators
         text = re.sub(r'\b\d+(?:\.\d+)?\s+', ' ',
                       text)  # remove numbers (with optional decimal part)
         result = text.lower()
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error delete_special_patterns: {0}'.format(e))
     return result
Example #24
 def part_vector(self, list_text, syllable=True, size_syllable=0):
     result = []
     try:
         for text in list_text:
             doc = self.analysis_pipe(text.lower())
             for stm in doc.sents:
                 stm = str(stm).rstrip()
                 stm = self.clean_text(stm)
                 if stm != '':
                     print('Sentence: {0}'.format(stm))
                     if syllable:
                         list_syllable = [
                             token['syllables']
                             for token in self.tagger(stm)
                             if token['syllables'] is not None
                         ]
                         list_syllable_phonetic = []
                         # transliterate the first n syllables of each word
                         for word_syllables in list_syllable:
                             n = len(word_syllables) \
                                 if size_syllable == 0 else size_syllable
                             for s in word_syllables[:n]:
                                 syllable_phonetic = self.epi.transliterate(
                                     s, normpunc=True)
                                 if syllable_phonetic not in (
                                         ' ', '', '\ufeff', '1'):
                                     list_syllable_phonetic.append(
                                         syllable_phonetic)
                         result.append(list_syllable_phonetic)
                         print('vector: {0}'.format(list_syllable_phonetic))
                     else:
                         list_phonemes = self.epi.trans_list(stm,
                                                             normpunc=True)
                         list_phonemes = [
                             i for i in list_phonemes
                             if i not in (' ', '', '\ufeff', '1')
                         ]
                         result.append(list_phonemes)
                         print('Vector: {0}'.format(list_phonemes))
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error part_vector: {0}'.format(e))
     return result
Example #25
 def dependency_all(self, text):
     result = []
     try:
         doc = self.analysis_pipe(text.lower())
         for chunk in doc.noun_chunks:
             item = {
                 'chunk': chunk,
                 'text': chunk.root.text,
                 'pos_': chunk.root.pos_,
                 'dep_': chunk.root.dep_,
                 'tag_': chunk.root.tag_,
                 'lemma_': chunk.root.lemma_,
                 'is_stop': chunk.root.is_stop,
                 'is_punct': chunk.root.is_punct,
                 'head_text': chunk.root.head.text,
                 'head_pos': chunk.root.head.pos_,
                 'children': [{
                     'child': child,
                     'pos_': child.pos_,
                     'dep_': child.dep_,
                     'tag_': child.tag_,
                     'lemma_': child.lemma_,
                     'is_stop': child.is_stop,
                     'is_punct': child.is_punct,
                     'head.text': child.head.text,
                     'head.pos_': child.head.pos_
                 } for child in chunk.root.children]
             }
             result.append(item)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error dependency_all: {0}'.format(e))
     return result
Example #26
 def import_corpus(self, file, sep=';', name_id="id", name_text="text"):
     result = []
     try:
         count = 0
         file = DIR_INPUT + file
         df = pd.read_csv(file, sep=sep)
         df.dropna(inplace=True)
         df = df[[name_id, name_text]].values.tolist()
         for row in tqdm(df):
             doc_id = row[0]
             text = str(row[1])
             if text:
                 result.append([doc_id, text])
                 count += 1
         print('# Sentence: {0}'.format(count))
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error import_corpus: {0}'.format(e))
     return result
Example #27
 def load_sapcy(self, lang):
     result = None
     try:
         if lang == 'es':
             result = spacy.load('es_core_news_md', disable=['ner'])
         else:
             result = spacy.load('en_core_web_md', disable=['ner'])
         stemmer_text = Steaming(lang)  # initialise component
         syllables = SpacySyllables(result)
         emoji = Emoji(result)
         result.add_pipe(syllables, after="tagger")
         result.add_pipe(emoji, first=True)
         result.add_pipe(stemmer_text, after='parser', name='stemmer')
         print('Language: {0}\nText Analysis: {1}'.format(
             lang, result.pipe_names))
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error load_sapcy: {0}'.format(e))
     return result
Example #28
    def plot(model_name, size=15):
        try:
            print('Plot {0} embedding...'.format(model_name))
            sep = os.sep
            # create a TSNE model of the vocabulary and plot it
            file_model = DIR_MODELS + model_name + ".model"
            model = Word2Vec.load(file_model, mmap=None)
            labels = []
            tokens = []
            list_vocabulary = list(model.wv.vocab)
            for word in list_vocabulary:
                tokens.append(model.wv[word])  # model.wv[...] avoids the deprecated model[...] access
                labels.append(word)

            tsne_model = TSNE(perplexity=40,
                              n_components=2,
                              init='pca',
                              n_iter=2500,
                              random_state=23)
            new_values = tsne_model.fit_transform(tokens)

            x = []
            y = []
            for value in new_values:
                x.append(value[0])
                y.append(value[1])

            plt.figure(figsize=(size, size))
            for i in range(len(x)):
                plt.scatter(x[i], y[i], marker='X', color='blue')
                plt.annotate(labels[i],
                             xy=(x[i], y[i]),
                             xytext=(5, 2),
                             textcoords='offset points',
                             ha='right',
                             va='bottom')
            file_output = DIR_EMBEDDING + sep + 'images' + sep + model_name + '.png'
            plt.savefig(file_output)
            plt.show()
        except Exception as e:
            Utils.standard_error(sys.exc_info())
            print('Error plot: {0}'.format(e))
Example #29
 def import_dataset(file, **kwargs):
     result = None
     try:
         print('Loading dataset {0}...'.format(file))
         setting = {}
         mini_size = kwargs.get('mini_size') if type(
             kwargs.get('mini_size')) is int else 2
         sep = kwargs.get('sep') if type(kwargs.get('sep')) is str else ';'
         setting['url'] = kwargs.get('url') if type(
             kwargs.get('url')) is bool else False
         setting['mention'] = kwargs.get('mention') if type(
             kwargs.get('mention')) is bool else False
         setting['emoji'] = kwargs.get('emoji') if type(
             kwargs.get('emoji')) is bool else False
         setting['hashtag'] = kwargs.get('hashtag') if type(
             kwargs.get('hashtag')) is bool else False
         setting['lemmatize'] = kwargs.get('lemmatizer') if type(
             kwargs.get('lemmatizer')) is bool else False
         setting['stopwords'] = kwargs.get('stopwords') if type(
             kwargs.get('stopwords')) is bool else False
         data = []
         file_path = DIR_INPUT + file
         raw_data = pd.read_csv(file_path, sep=sep, encoding='UTF-8')
         for i, row in raw_data.iterrows():
             text = TextAnalysis.clean_text(row['Tweet'], **setting)
             len_text = len(text.split(' '))
             if len_text > mini_size:
                 tag = int(row['Intensity'])
                 value = 0
                 if tag > 0:
                     value = 1
                 elif tag < 0:
                     value = -1
                 elif tag == 0:
                     value = 0
                 data.append([text, value])
         result = pd.DataFrame(data, columns=['message', 'valence'])
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error import_dataset: {0}'.format(e))
     return result
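The repeated type checks above can be condensed; a minimal equivalent sketch of the boolean-flag handling (flag names copied from the method, all defaulting to False; the original additionally stores the 'lemmatizer' kwarg under the 'lemmatize' key):

    def read_settings(**kwargs):
        flags = ['url', 'mention', 'emoji', 'hashtag', 'lemmatizer', 'stopwords']
        # keep a kwarg only when it is an actual bool, otherwise default to False
        return {f: kwargs[f] if isinstance(kwargs.get(f), bool) else False
                for f in flags}

    print(read_settings(url=True, emoji='yes'))  # emoji falls back to False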
Example #30
 def sentences_vector(self, list_text):
     result = []
     try:
         setting = {
             'url': True,
             'mention': True,
             'emoji': False,
             'hashtag': True,
             'stopwords': True
         }
         for text in tqdm(list_text):
             text = self.clean_text(text, **setting)
             if text is not None:
                 doc = self.analysis_pipe(text)
                 if doc is not None:
                     vector = [i.text for i in doc]
                     result.append(vector)
     except Exception as e:
         Utils.standard_error(sys.exc_info())
         print('Error sentences_vector: {0}'.format(e))
     return result