コード例 #1
0
def char_grams(data_texts, n):
    """Build one character n-gram frequency vector per text.

    Each element of ``data_texts`` is joined with no separator, sliced into
    every contiguous window of ``n`` characters, and those windows are
    counted into the dictionary returned by ``Utils.ngrams(n)``. Windows that
    are not keys of that dictionary are reported on stdout and left
    uncounted. The counted dictionary is then converted with ``freq_dict``.

    Args:
        data_texts: Iterable of texts, each an iterable of strings.
        n: Length of the character windows.

    Returns:
        list: One ``freq_dict`` result per input text, in input order.
    """
    vectors = []
    for tokens in data_texts:
        # Concatenate the token strings with no separator between them.
        joined = "".join(tokens)

        # Slide a window of n characters across the concatenated text.
        windows = []
        for start in range(len(joined) - n + 1):
            windows.append(joined[start:start + n])
        print(windows)

        # A new counting dictionary is requested for every text.
        # presumably Utils.ngrams pre-populates the accepted n-grams — TODO confirm
        counts = Utils.ngrams(n)
        for gram in windows:
            if gram not in counts:
                # Unknown n-gram: report it and move on without counting.
                print(gram)
                print('El valor no existe en diccionario')
                continue
            counts[gram] += 1
            print(counts[gram])

        vectors.append(freq_dict(counts))
    return vectors
コード例 #2
0
from spacymoji import Emoji
from tweetlib.singleton import Utils
from tweetlib.definitions import TaggingMethod
# import es_core_news_md
# from collections import Counter
# from spacy.tokens import retokenize

# Module-level NLP object, created once when this module is imported by
# calling Utils.load_nlp with the spaCy tagging method.
# TODO: use Singleton of nlp instead, e.g.:
# nlp = Singleton.get_nlp()
nlp = Utils.load_nlp(TaggingMethod.SPACY)

# text = ['🤾', '🏻\u200d', '♀', '️', 'La', 'selección', 'española', 'vence', 'a', 'Noruega', 'y', 'se', 'coloca', 'a', 'un', 'paso','de','ganar','el','Mundial','femenino','de','balonmano','👏','🏻','\n','¡','Muc','…','https://t.co/IbVGXUz8Lb']
# text = [ '\n', '🤾','♀', '👏', '🏻\u200d', '️', 'La', 'selección', 'española', 'vence', 'a', 'Noruega', 'y', '', '🏻','\n','¡','Muc','…', 'https://t.co/IbVGXUz8Lb']
# text = ['🤾', '🏻\u200d', '♀', '️', 'La', 'selección', 'española']
# text = ['🤾', '♀', '️👏', '️👏', 'La', 'selección', 'española']


# TODO: use snake case (snake_case vs. CamelCase) for function identifiers
def rm_emoticons(text: list):
    """Remove the emoji from a corpus.

    Args:
        text (list[str]): List of tokens.

    Returns:
        list[str]: List without emoji.

    NOTE(review): the original docstring also documented an ``nlp``
    argument (the spaCy library, used for its "emoji" attribute), but the
    signature only accepts ``text`` — confirm whether the module-level
    ``nlp`` is meant to be used instead.
    NOTE(review): no executable statements are visible in this portion of
    the body; the contract above is the author's stated intent, not
    something verified against the implementation.
    """
    # nlp = es_core_news_lg.load()
    # list_within_emoji = []
    # emoji = Emoji(nlp)
コード例 #3
0
    def run(self):
        """Preprocess the texts, encode them and dispatch the configured task.

        Reads the texts and labels from ``self.dataset`` and the
        preprocessing, encoding, classification and tagging choices from
        ``self.config``. When ``self.task.name`` is ``'VALIDATE_MODEL'``,
        ``'MODEL_STORAGE'`` or ``'PREDICTION'``, each preprocessing method is
        applied to every text in turn, the texts are encoded, the resulting
        vectors are stacked into ``X`` with ``np.vstack``, and the callable
        looked up in ``dict_task`` is invoked with task-specific arguments.
        For any other task name, only the initial setup runs in the portion
        visible here.

        NOTE(review): no ``return`` statement is visible in this portion; the
        task results are only bound to local names — confirm whether the
        method continues beyond this point.
        """
        # get data and classes from self.dataset
        data = self.dataset.get_data()
        y = self.dataset.get_y()
        preprocessing_list = self.config.get_preprocessing_methods()
        encoding = self.config.get_encoding_method()
        classifier_type = self.config.get_classification_method()
        tagging_method_type = self.config.get_tagging_method()
        # type_user = self.config.get_type_user()
        # vectors = []
        # Initialize the NLP library selected by the tagging method
        nlp = Utils.load_nlp(tagging_method_type)
        # Work on a copy so the preprocessing loop below does not write into
        # `data` itself (copy depth depends on the type of `data` — TODO confirm)
        data_texts = data.copy()

        # getting the callable that implements the requested task
        type_task = dict_task[self.task]

        if self.task.name == 'VALIDATE_MODEL' or self.task.name == 'MODEL_STORAGE' or self.task.name == 'PREDICTION':
            if self.task.name == 'PREDICTION':
                # Prediction works on self.text instead of the dataset copy.
                # NOTE(review): data_texts is now the same object as
                # self.text, so the index assignments below write into
                # self.text in place — confirm this is intended.
                data_texts = self.text
            # Apply every preprocessing method, in list order, to every text.
            for preprocessing in preprocessing_list:
                prep_method = dict_preprocessing[preprocessing]
                # NOTE(review): both branches below perform the same
                # operation (only the loop variable name differs), so the
                # condition currently has no effect.
                if preprocessing.name != 'LOWERCASE' and preprocessing.name != 'REMOVE_STOP_WORDS' and preprocessing.name != 'MENTIONS':
                    for idx, text_prep in enumerate(data_texts):
                        prep = prep_method(text_prep)
                        data_texts[idx] = prep
                else:
                    for idx, text in enumerate(data_texts):
                        prep = prep_method(text)
                        data_texts[idx] = prep

            # apply encoding
            encoding_method = dict_encoding[encoding]

            # Character n-gram encodings take only the texts; the remaining
            # encodings also receive the tagging method name and the nlp
            # object.
            # NOTE(review): the first two branches are identical to each
            # other, and the last two are identical to each other.
            if encoding.name == 'BIGRAM' or encoding.name == 'TRIGRAM' or encoding.name == 'CUATRIGRAM':
                vector_encoding = encoding_method(data_texts)
            elif encoding.name == 'ALL_CHARGRAM':
                vector_encoding = encoding_method(data_texts)
                # vector_encoding = np.concatenate(vectors[0], vectors[1])
            elif encoding.name == 'POS_ALL_CHARGRAM':
                vector_encoding = encoding_method(data_texts,
                                                  tagging_method_type.name,
                                                  nlp)
                # vector_encoding = np.concatenate(vectors[0], vectors[1], vectors[2])
            # POS tagging (any other encoding)
            else:
                vector_encoding = encoding_method(data_texts,
                                                  tagging_method_type.name,
                                                  nlp)

            # Stack the per-text vectors vertically into a single array.
            X = np.vstack(vector_encoding)

            # #classifier (validate_model)
            # y_test, pred_y = self.classifier.classification_method(X, y, classifier_type)
            # accuracy = self.classifier.classification_method(X, y, classifier_type)

            # instance of Classification
            # c = self.classifier

            # Dispatch to the task callable; each task takes different
            # arguments.
            if self.task.name == 'VALIDATE_MODEL':
                # Per the original note: returns a tuple with the list of
                # accuracies and a number that is the average of that list
                # (not verifiable from here — TODO confirm).
                validate_model = type_task(X, y, classifier_type)
            # accuracy = self.classifier.classification_method(X, y, classifier_type)
            # Per the original note: n_value is the number of classes with
            # the best precision (TODO confirm).
            elif self.task.name == 'PREDICTION':
                predict_method = type_task(self.model, X, self.n_value)
            else:
                # Remaining case: 'MODEL_STORAGE'.
                # save_model = self.classifier.save_model(X, y, preprocessing_list, encoding, classifier_type, tagging_method_type, type_task, type_user)
                model_storage = type_task(self.id_model, self.config, X, y,
                                          classifier_type)