def kmeans_data_filter(file, output_name, output_dir_path, max_features,
                       n_clusters, n_init, max_iter, random_state):
    filepath = Path(file).resolve()
    df = pd.read_csv(filepath)
    preprocessed_data = preprocess(df.text)

    # Vectorize the cleaned texts with TF-IDF before clustering.
    tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.95,
                                       max_features=max_features)
    tfidf = tfidf_vectorizer.fit_transform(preprocessed_data)

    print('Running K-means algorithm...')
    clusters = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter,
                      random_state=random_state).fit_predict(tfidf)

    # Keep only the documents assigned to the largest cluster.
    cluster_to_use = np.bincount(clusters).argmax()

    output_dir = Path(output_dir_path).resolve()
    output_name = output_name or filepath.name
    output_path = output_dir.joinpath(output_name)
    df[clusters == cluster_to_use].to_csv(output_path, index=False)
    print('Finished processing file.')
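# A minimal usage sketch for kmeans_data_filter(); the CSV path, output
# directory, helper name and hyper-parameter values below are hypothetical
# placeholders, not values taken from the repository.
def _example_kmeans_data_filter_usage():
    kmeans_data_filter(file='data/raw_tweets.csv',
                       output_name='filtered_tweets.csv',
                       output_dir_path='data/filtered',
                       max_features=5000,
                       n_clusters=2,
                       n_init=10,
                       max_iter=300,
                       random_state=42)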
def filter_dataset_using_length(dataset, max_length, min_length):
    dataset_path = Path(dataset).resolve()
    dataset = pd.read_csv(dataset_path)

    print('Filtering dataset:')
    cleaned_text = preprocess(dataset.text)

    # Keep only rows whose cleaned text length (in words) is within the bounds.
    lengths = cleaned_text.apply(lambda x: len(x.split()))
    dataset = dataset[(lengths <= max_length) & (lengths >= min_length)]

    save_path = dataset_path.parent.joinpath('emotion_' + str(len(dataset)) + '.csv')
    dataset.to_csv(save_path, index=False)
    print('File saved under "' + save_path.as_posix() + '"')
def process(documents):
    data_path = Path(os.path.abspath(__file__), '../model_data').resolve()

    # Load the tokenizer fitted during training.
    tokenizer_path = data_path.joinpath('tokenizer.pickle')
    with tokenizer_path.open('rb') as file:
        tokenizer = pickle.load(file)

    cleaned_data = preprocess(documents, stemming=True)
    sequences = [text.split() for text in cleaned_data]
    list_tokenized = tokenizer.texts_to_sequences(sequences)

    input_dim = tokenizer.num_words
    embedding_dim = 100
    input_length = 30
    x_data = pad_sequences(list_tokenized, maxlen=input_length)

    # Sentiment analysis (polarity) predictions.
    sa_weights_path = data_path.joinpath('sentiment_analysis_weights.h5')
    model = sentiment_analysis_model(input_length, input_dim,
                                     embedding_layer=None,
                                     embedding_dim=embedding_dim)
    model.load_weights(sa_weights_path.as_posix())
    sa_results = model.predict_classes(x_data)

    # Emotion recognition predictions, decoded back to class names.
    encoder_path = data_path.joinpath('emotion_classes_encoder.pickle')
    with encoder_path.open('rb') as file:
        encoder = pickle.load(file)

    er_weights_path = data_path.joinpath('emotion_recognition_weights.h5')
    model = emotion_recognition_model(input_length, input_dim, num_classes=4,
                                      embedding_layer=None,
                                      embedding_dim=embedding_dim)
    model.load_weights(er_weights_path.as_posix())
    er_results = model.predict_classes(x_data)

    # One-hot encode the emotion predictions and attach the polarity column.
    results = pd.get_dummies(er_results)
    results.columns = encoder.classes_
    results['polarity'] = sa_results
    return results
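# A minimal usage sketch for process(), assuming the pickled tokenizer, the
# class encoder and the .h5 weight files are present under model_data/; the
# helper name and example texts are hypothetical.
def _example_process_usage():
    sample_documents = pd.Series([
        'I love this so much!',
        'This is the worst day ever.',
    ])
    results = process(sample_documents)
    # results has one indicator column per emotion class plus a 'polarity'
    # column holding the sentiment prediction.
    print(results)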
def get_topics(self, count=5):
    self.dataframe['cleaned'] = preprocess(self.dataframe.text,
                                           lemmatization=True,
                                           no_emoji=True,
                                           no_special_words=True)
    self.num_topics = count

    tokenized_documents = [tweet.split() for tweet in self.dataframe.cleaned]

    # Detect frequent bigrams and merge them into single tokens.
    bigram = gensim.models.Phrases(tokenized_documents, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    bigrams = [bigram_mod[doc] for doc in tokenized_documents]

    # Build the dictionary and bag-of-words corpus for LDA.
    dictionary = gensim.corpora.Dictionary(bigrams)
    dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in bigrams]

    self.model = gensim.models.ldamulticore.LdaMulticore(
        bow_corpus, num_topics=self.num_topics, id2word=dictionary,
        passes=10, workers=self.num_workers, chunksize=100, iterations=400)

    # Parse the printed topics into (topic_id, [(weight, word), ...]) pairs,
    # scaling the word weights to integers.
    topics = [(topic[0], re.findall(r'(\d\.\d+)\*"(.*?)"', topic[1]))
              for topic in self.model.print_topics()]
    topics = [(topic[0],
               list(map(lambda x: (round(1000 * float(x[0])), x[1]), topic[1])))
              for topic in topics]

    # Assign the dominant topic to every document.
    document_topics = [self._get_document_topic(doc) for doc in bow_corpus]
    self.dataframe['topic'] = document_topics
    return (topics, self.dataframe)
def predict_nb(files_dir, model_file, save_path, text_col='text'):
    FILES_DIR = Path(files_dir).resolve()
    RELATIONS_FILE = Path(os.path.abspath(__file__),
                          '../../query_relations.json').resolve()
    with RELATIONS_FILE.open('rb') as file:
        relations = json.load(file)

    # Load the pickled Naive Bayes model.
    model_path = Path(model_file).resolve()
    with model_path.open('rb') as file:
        model = pickle.load(file)

    result_data = []
    for filename in os.listdir(FILES_DIR):
        print('Reading file: "' + filename + '"')
        file_data = pd.read_csv(os.path.join(FILES_DIR, filename))
        cleaned_text = preprocess(file_data[text_col])
        result = model.predict(cleaned_text)

        # Keep only the rows predicted with the file's majority sentiment
        # (4 = positive, 0 = negative).
        result = result == 4 if np.bincount(result).argmax() == 4 else result == 0
        file_data = file_data[result]

        # The hashtag or emoji query in the filename maps to an emotion label.
        query = re.findall(r'(#[^.]+|:.+:)', filename)[0]
        file_data['label'] = relations[query]
        result_data = result_data + [file_data]

    if len(result_data) > 0:
        result_data = pd.concat(result_data)
        path = Path(save_path).resolve()
        result_data.to_csv(path, index=False)
        print('Files saved under "' + save_path + '"')
def predict_from_directory(files_dir, model_weights_file, model_type,
                           tokenizer_file, save_path, input_length=100,
                           embedding_dim=100, text_col='text'):
    FILES_DIR = Path(files_dir).resolve()
    RELATIONS_FILE = Path(os.path.abspath(__file__),
                          '../../query_relations.json').resolve()
    with RELATIONS_FILE.open('rb') as file:
        relations = json.load(file)

    # Load the tokenizer and rebuild the model before loading its weights.
    tokenizer_path = Path(tokenizer_file).resolve()
    with tokenizer_path.open('rb') as file:
        tokenizer = pickle.load(file)

    weights_path = Path(model_weights_file).resolve()
    input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
    model = NLP_MODEL[model_type](input_length, input_dim, None,
                                  embedding_dim=embedding_dim)
    model.load_weights(weights_path.as_posix())

    # Group the files by the emotion their query maps to.
    emotion_data_dict = {}
    for filename in os.listdir(FILES_DIR):
        print('Reading file: "' + filename + '"')
        query = re.findall(r'(#[^.]+|:.+:)', filename)[0]
        emotion = relations[query]
        file_data = pd.read_csv(os.path.join(FILES_DIR, filename))
        dict_data = emotion_data_dict[emotion] if emotion in emotion_data_dict else None
        emotion_data_dict[emotion] = pd.concat([dict_data, file_data])

    result_data = []
    for emotion, dataset in emotion_data_dict.items():
        print('Processing "' + emotion + '" data...')
        cleaned_texts = preprocess(dataset[text_col], quiet=True)
        predict_sequences = [text.split() for text in cleaned_texts]
        list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)
        x_predict = pad_sequences(list_tokenized_predict, maxlen=input_length)

        # Keep only the rows whose predicted score falls inside the score range
        # derived from the mean and standard deviation of the predictions.
        result = model.predict(x_predict)
        mean = np.mean(result)
        std = np.std(result)
        low, high = get_score_range(mean, std)
        print("\tScore Range: {:4f} - {:4f}".format(low, high))
        dataset = dataset[np.all([(result >= low), (result <= high)], axis=0)]
        dataset.insert(0, 'label', emotion)
        result_data = result_data + [dataset]

    if len(result_data) > 0:
        result_data = pd.concat(result_data)
        path = Path(save_path).resolve()
        result_data.to_csv(path, index=False)
        print('Files saved under "' + save_path + '"')
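# A minimal usage sketch for predict_from_directory(), assuming a directory of
# per-query CSV files, a trained tokenizer pickle and matching model weights;
# the helper name, every path and the 'lstm' model key are hypothetical
# placeholders rather than values defined by the repository.
def _example_predict_from_directory_usage():
    predict_from_directory(files_dir='data/scraped_tweets',
                           model_weights_file='model_data/sentiment_analysis_weights.h5',
                           model_type='lstm',
                           tokenizer_file='model_data/tokenizer.pickle',
                           save_path='data/labeled_tweets.csv',
                           input_length=100,
                           embedding_dim=100)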