import os
import pickle
from typing import Dict, List

import gensim
import spacy

# spaCy model used for lemmatization (python3 -m spacy download en_core_web_lg)
nlp = spacy.load('en_core_web_lg')


def tokenize_classes(document_classes: Dict[str, List[str]],
                     load_bigrams: bool = True) -> Dict[str, List[List[str]]]:
    word_classes = {}
    for document_class, documents in document_classes.items():
        word_classes[document_class] = list(sentences_to_words(documents))
        word_classes[document_class] = remove_stopwords(
            word_classes[document_class])

    if load_bigrams:
        # Reuse a previously trained bigram model.
        with open('models/bigrams.pkl', 'rb') as input_file:
            bigram_mod = pickle.load(input_file)
    else:
        # Train the bigram model on the words of every class combined.
        # (The original accumulator shadowed the loop variable, so each class
        # list was extended with itself; use a separate accumulator instead.)
        all_words = []
        for class_words in word_classes.values():
            all_words.extend(class_words)
        bigram_mod = bigrams_model(all_words)
        with open('models/bigrams.pkl', 'wb') as output_file:
            pickle.dump(bigram_mod, output_file)

    for word_class, words in word_classes.items():
        word_classes[word_class] = apply_bigrams(words, bigram_mod)
        word_classes[word_class] = lemmatization(nlp, word_classes[word_class])
    return word_classes
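# The helpers used above (sentences_to_words, remove_stopwords, bigrams_model,
# apply_bigrams, lemmatization) are not defined in this section. A minimal
# sketch of what they might look like, assuming gensim and spaCy as used
# elsewhere in this file; the stop-word list and the Phrases thresholds are
# illustrative choices, not confirmed values:

from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess


def sentences_to_words(documents):
    # Lowercase, strip punctuation/accents, and split each document into tokens.
    for document in documents:
        yield simple_preprocess(document, deacc=True)


def remove_stopwords(document_words):
    return [[word for word in words if word not in STOPWORDS]
            for words in document_words]


def bigrams_model(document_words):
    # Train a phrase model and freeze it into a lightweight Phraser.
    return Phraser(Phrases(document_words, min_count=5, threshold=100))


def apply_bigrams(document_words, bigram_mod):
    # Merge detected collocations, e.g. ['machine', 'learning'] -> ['machine_learning'].
    return [bigram_mod[words] for words in document_words]


def lemmatization(nlp, document_words,
                  allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    # Keep only the lemmas of tokens whose part-of-speech tag is allowed.
    return [[token.lemma_ for token in nlp(' '.join(words))
             if token.pos_ in allowed_postags]
            for words in document_words]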
def tokenize(documents: List[str]) -> List[List[str]]:
    document_words = list(sent_to_words(documents))
    document_words = remove_stopwords(document_words)
    document_words = build_bigrams(document_words)
    document_words = lemmatization(nlp, document_words)
    return document_words
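# How tokenize would be called, assuming its helpers (sent_to_words,
# build_bigrams) mirror the sentences_to_words/apply_bigrams sketches above;
# the documents here are made up for illustration:
sample_documents = [
    'Machine learning models require careful preprocessing.',
    'Topic models work on bags of lemmatized words.',
]
sample_tokens = tokenize(sample_documents)
# -> a list of token lists, e.g. [['machine', 'learning', 'model', ...], ...]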
def tokenize_classes(data_words) -> List[list]:
    # Build the bigram and trigram models
    # https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Freeze the trained phrase models into lightweight Phrasers
    # (the trigram model is built but not applied in this version)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Remove stopwords
    data_words_nostops = remove_stopwords(data_words)

    # Form bigrams
    data_words_bigrams = make_bigrams(bigram_mod, data_words_nostops)

    # python3 -m spacy download en_core_web_lg
    # Lemmatize, keeping only nouns, adjectives, verbs, and adverbs
    data_lemmatized = lemmatization(
        data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized
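# make_bigrams is called above but not defined in this section; a minimal
# sketch, assuming it simply applies the frozen Phraser to each tokenized
# document:
def make_bigrams(bigram_mod, document_words):
    return [bigram_mod[words] for words in document_words]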
def tokenize_classes(document_classes: Dict[str, List[str]],
                     load_bigrams: bool = True) -> Dict[str, List[List[str]]]:
    word_classes = {}
    for document_class, documents in document_classes.items():
        cleaned_documents = clean_data(documents)
        word_classes[document_class] = list(sentences_to_words(cleaned_documents))
        word_classes[document_class] = remove_stopwords(word_classes[document_class])

    basePath = os.path.dirname(os.path.abspath(__file__))
    if load_bigrams:
        # Reuse a previously trained bigram model.
        with open(basePath + '/../../models/bigrams.pkl', 'rb') as input_file:
            bigram_mod = pickle.load(input_file)
    else:
        # Train the bigram model on the words of every class combined
        # (again with a separate accumulator to avoid shadowing).
        all_words = []
        for class_words in word_classes.values():
            all_words.extend(class_words)
        bigram_mod = bigrams_model(all_words)
        with open(basePath + '/../../models/bigrams.pkl', 'wb') as output_file:
            pickle.dump(bigram_mod, output_file)

    for word_class, words in word_classes.items():
        word_classes[word_class] = extend_bigrams(words, bigram_mod)
        word_classes[word_class] = lemmatization(nlp, word_classes[word_class])
    return word_classes
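# How this version would be driven end to end; the class labels and documents
# are hypothetical. Pass load_bigrams=False on the first run so the bigram
# model is trained and written to models/bigrams.pkl, then reload it on later
# runs:
corpus = {
    'sports': ['The team won the championship game.'],
    'science': ['The experiment confirmed the hypothesis.'],
}
word_classes = tokenize_classes(corpus, load_bigrams=False)  # train and save
word_classes = tokenize_classes(corpus, load_bigrams=True)   # reuse saved model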