Example 1
import pickle
from typing import Dict, List


def tokenize_classes(document_classes: Dict[str, List[str]],
                     load_bigrams: bool = True) -> Dict[str, List[List[str]]]:
    # Tokenize every class of documents and strip stopwords.
    word_classes = {}
    for document_class, documents in document_classes.items():
        word_classes[document_class] = list(sentences_to_words(documents))
        word_classes[document_class] = remove_stopwords(
            word_classes[document_class])

    if load_bigrams:
        # Reuse the previously trained bigram model.
        with open('models/bigrams.pkl', 'rb') as input_file:
            bigram_mod = pickle.load(input_file)
    else:
        # Train a new bigram model on the words of all classes and persist it.
        all_words = []
        for class_words in word_classes.values():
            all_words.extend(class_words)
        bigram_mod = bigrams_model(all_words)
        with open('models/bigrams.pkl', 'wb') as output_file:
            pickle.dump(bigram_mod, output_file)

    # Merge detected bigrams into the token lists, then lemmatize.
    for word_class, words in word_classes.items():
        word_classes[word_class] = apply_bigrams(words, bigram_mod)
        word_classes[word_class] = lemmatization(nlp, word_classes[word_class])

    return word_classes
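Example 1 leans on several helpers (`sentences_to_words`, `remove_stopwords`, `bigrams_model`, `apply_bigrams`, `lemmatization`) and a module-level spaCy `nlp` object that are defined elsewhere in its project. Below is a minimal sketch of what such helpers commonly look like with gensim, NLTK and spaCy; the names and signatures are assumptions inferred from how the example calls them, not the project's actual code.

import gensim
import spacy
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords

# Assumed setup: requires `python3 -m spacy download en_core_web_lg`
# and `nltk.download('stopwords')`.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
stop_words = set(stopwords.words('english'))


def sentences_to_words(documents):
    # Lowercase, strip punctuation/accents and split each document into tokens.
    for document in documents:
        yield gensim.utils.simple_preprocess(str(document), deacc=True)


def remove_stopwords(documents):
    return [[word for word in doc if word not in stop_words] for doc in documents]


def bigrams_model(documents):
    # Train a phrase model and freeze it into a lightweight Phraser.
    return Phraser(Phrases(documents, min_count=5, threshold=100))


def apply_bigrams(documents, bigram_mod):
    # Replace frequent token pairs with single "word_word" tokens.
    return [bigram_mod[doc] for doc in documents]


def lemmatization(nlp, documents, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    # Keep only the lemmas of the selected parts of speech.
    return [[token.lemma_ for token in nlp(' '.join(doc))
             if token.pos_ in allowed_postags]
            for doc in documents]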
Example 2

from typing import List


def tokenize(documents: List[str]) -> List[List[str]]:
    # Tokenize the documents, drop stopwords, detect bigrams and lemmatize.
    document_words = list(sent_to_words(documents))
    document_words = remove_stopwords(document_words)
    document_words = build_bigrams(document_words)
    document_words = lemmatization(nlp, document_words)

    return document_words
Example 3

import gensim
from typing import List


def tokenize_classes(data_words) -> List[list]:
    # Build bigram and trigram models
    # https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Freeze the phrase models into faster Phraser objects
    # (trigram_mod is built here but not used further in this snippet)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Remove stopwords
    data_words_nostops = remove_stopwords(data_words)

    # Form bigrams
    data_words_bigrams = make_bigrams(bigram_mod, data_words_nostops)

    # Requires: python3 -m spacy download en_core_web_lg

    # Lemmatize, keeping only nouns, adjectives, verbs and adverbs
    data_lemmatized = lemmatization(
        data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized
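The `make_bigrams` helper called in Example 3 is not shown; in the usual gensim topic-modeling pattern it simply runs each tokenized document through the frozen phrase model. A minimal sketch under that assumption:

def make_bigrams(bigram_mod, texts):
    # Replace frequently co-occurring token pairs with single "word_word" tokens.
    return [bigram_mod[doc] for doc in texts]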
Example 4
import os
import pickle
from typing import Dict, List


def tokenize_classes(document_classes: Dict[str, List[str]], load_bigrams: bool = True) -> Dict[str, List[List[str]]]:
    # Clean and tokenize every class of documents, then strip stopwords.
    word_classes = {}
    for document_class, documents in document_classes.items():
        cleaned_documents = clean_data(documents)
        word_classes[document_class] = list(sentences_to_words(cleaned_documents))
        word_classes[document_class] = remove_stopwords(word_classes[document_class])

    basePath = os.path.dirname(os.path.abspath(__file__))
    if load_bigrams:
        # Reuse the previously trained bigram model.
        with open(basePath+'/../../models/bigrams.pkl', 'rb') as input_file:
            bigram_mod = pickle.load(input_file)
    else:
        # Train a new bigram model on the words of all classes and persist it.
        all_words = []
        for class_words in word_classes.values():
            all_words.extend(class_words)
        bigram_mod = bigrams_model(all_words)
        with open(basePath+'/../../models/bigrams.pkl', 'wb') as output_file:
            pickle.dump(bigram_mod, output_file)

    # Merge detected bigrams into the token lists, then lemmatize.
    for word_class, words in word_classes.items():
        word_classes[word_class] = extend_bigrams(words, bigram_mod)
        word_classes[word_class] = lemmatization(nlp, word_classes[word_class])

    return word_classes
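For reference, a hypothetical call to the class-based variant above: a dict mapping each class label to its raw documents goes in, and a dict of lemmatized token lists per class comes out. Passing load_bigrams=False trains and saves the bigram model; later runs can reload it instead of retraining.

document_classes = {
    'sports': ['The team won the championship game last night.',
               'The striker scored twice in the second half.'],
    'tech': ['The new phone ships with a faster processor.',
             'Researchers released an open source language model.'],
}

# First run: train and persist the bigram model under models/.
word_classes = tokenize_classes(document_classes, load_bigrams=False)

# Subsequent runs: reload the saved model instead of retraining it.
word_classes = tokenize_classes(document_classes, load_bigrams=True)

# Each class now maps to lists of lemmatized tokens, one list per document.
print(word_classes['sports'][0])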