Example #1
def kmeans_data_filter(file, output_name, output_dir_path, max_features,
                       n_clusters, n_init, max_iter, random_state):
    filepath = Path(file).resolve()
    df = pd.read_csv(filepath)

    preprocessed_data = preprocess(df.text)

    tfidf_vectorizer = TfidfVectorizer(min_df=3,
                                       max_df=0.95,
                                       max_features=max_features)
    tfidf = tfidf_vectorizer.fit_transform(preprocessed_data)

    print('Running K-means algorithm...')

    clusters = KMeans(n_clusters=n_clusters,
                      n_init=n_init,
                      max_iter=max_iter,
                      random_state=random_state).fit_predict(tfidf)

    # Keep only the rows belonging to the most populated cluster.
    cluster_to_use = np.bincount(clusters).argmax()

    output_dir = Path(output_dir_path).resolve()
    output_name = output_name or filepath.name
    output_path = output_dir.joinpath(output_name)

    df[clusters == cluster_to_use].to_csv(output_path, index=None)
    print('Finished processing file.')
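
The function above vectorises the text column with TF-IDF, clusters it with K-means and writes out only the rows that fall in the largest cluster. A minimal sketch of a call might look like the following; the file names and hyperparameters are illustrative, not taken from the source:

# Hypothetical invocation; paths and parameters are only examples.
kmeans_data_filter(file='data/raw_tweets.csv',
                   output_name='filtered_tweets.csv',
                   output_dir_path='data/filtered',
                   max_features=5000,
                   n_clusters=8,
                   n_init=10,
                   max_iter=300,
                   random_state=42)
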
def filter_dataset_using_length(dataset, max_length, min_length):
    dataset_path = Path(dataset).resolve()
    dataset = pd.read_csv(dataset_path)

    print('Filtering dataset:')

    cleaned_text = preprocess(dataset.text)
    # Keep rows whose preprocessed text length (in tokens) falls within the bounds.
    word_counts = cleaned_text.apply(lambda x: len(x.split()))
    dataset = dataset[(word_counts >= min_length) & (word_counts <= max_length)]

    save_path = dataset_path.parent.joinpath('emotion_' + str(len(dataset)) +
                                             '.csv')
    dataset.to_csv(save_path, index=None)

    print('File saved under "' + save_path.as_posix() + '"')
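
filter_dataset_using_length keeps the rows whose preprocessed text length, in tokens, lies inside the given bounds and saves the result next to the input file. A hedged usage sketch; the path and bounds below are only examples:

# Hypothetical invocation; the CSV path and length bounds are illustrative.
filter_dataset_using_length('data/emotion_raw.csv', max_length=30, min_length=3)
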
Example #3
def process(documents):
    data_path = Path(os.path.abspath(__file__), '../model_data').resolve()

    tokenizer_path = data_path.joinpath('tokenizer.pickle')
    with tokenizer_path.open('rb') as file:
      tokenizer = pickle.load(file)

    cleaned_data = preprocess(documents, stemming=True)

    sequences = [text.split() for text in cleaned_data]
    list_tokenized = tokenizer.texts_to_sequences(sequences)

    input_dim = tokenizer.num_words
    embedding_dim = 100
    input_length = 30

    x_data = pad_sequences(list_tokenized, maxlen=input_length)

    # Polarity (sentiment analysis) pass.
    sa_weights_path = data_path.joinpath('sentiment_analysis_weights.h5')
    model = sentiment_analysis_model(input_length,
                                     input_dim,
                                     embedding_layer=None,
                                     embedding_dim=embedding_dim)
    model.load_weights(sa_weights_path.as_posix())
    sa_results = model.predict_classes(x_data)

    encoder_path = data_path.joinpath('emotion_classes_encoder.pickle')
    with encoder_path.open('rb') as file:
      encoder = pickle.load(file)

    # Emotion recognition pass.
    er_weights_path = data_path.joinpath('emotion_recognition_weights.h5')
    model = emotion_recognition_model(input_length,
                                      input_dim,
                                      num_classes=4,
                                      embedding_layer=None,
                                      embedding_dim=embedding_dim)
    model.load_weights(er_weights_path.as_posix())
    er_results = model.predict_classes(x_data)

    results = pd.get_dummies(er_results)
    results.columns = encoder.classes_
    results['polarity'] = sa_results

    return results
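
process expects the pickled tokenizer, the emotion class encoder and both weight files to already be present under model_data/, and returns one row per input document with one-hot emotion columns plus a 'polarity' column. A minimal sketch of driving it; the sample texts are illustrative:

# Hypothetical call; the input texts are only examples.
predictions = process(['I love this new phone!',
                       'The delays at the airport were terrible.'])
print(predictions.head())
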
Example #4
    def get_topics(self, count=5):
        self.dataframe['cleaned'] = preprocess(self.dataframe.text,
                                               lemmatization=True,
                                               no_emoji=True,
                                               no_special_words=True)
        self.num_topics = count

        tokenized_documents = [
            tweet.split() for tweet in self.dataframe.cleaned
        ]
        bigram = gensim.models.Phrases(tokenized_documents,
                                       min_count=5,
                                       threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        bigrams = [bigram_mod[doc] for doc in tokenized_documents]

        dictionary = gensim.corpora.Dictionary(bigrams)
        dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
        bow_corpus = [dictionary.doc2bow(doc) for doc in bigrams]

        self.model = gensim.models.ldamulticore.LdaMulticore(
            bow_corpus,
            num_topics=self.num_topics,
            id2word=dictionary,
            passes=10,
            workers=self.num_workers,
            chunksize=100,
            iterations=400)
        # Parse print_topics() output ('0.123*"term" + ...') into
        # (topic_id, [(weight_in_thousandths, term), ...]) pairs.
        topics = []
        for topic_id, topic_terms in self.model.print_topics():
            terms = re.findall(r'(\d\.\d+)\*"(.*?)"', topic_terms)
            topics.append((topic_id,
                           [(round(1000 * float(weight)), term)
                            for weight, term in terms]))
        document_topics = [self._get_document_topic(doc) for doc in bow_corpus]
        self.dataframe['topic'] = document_topics

        return (topics, self.dataframe)
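
get_topics returns the topics as (topic_id, [(weight, term), ...]) tuples, with weights scaled to thousandths, together with the dataframe annotated with a per-document topic. A hedged sketch of consuming that result; 'analyzer' is only a placeholder for whatever object owns this method:

# 'analyzer' is a hypothetical instance that already holds the dataframe.
topics, labelled_df = analyzer.get_topics(count=5)
for topic_id, terms in topics:
    print(topic_id, ', '.join(term for _, term in terms))
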
Example #5
def predict_nb(files_dir, model_file, save_path, text_col='text'):
    FILES_DIR = Path(files_dir).resolve()
    RELATIONS_FILE = Path(os.path.abspath(__file__),
                          '../../query_relations.json').resolve()

    with RELATIONS_FILE.open('rb') as file:
        relations = json.load(file)

    model_path = Path(model_file).resolve()
    with model_path.open('rb') as file:
        model = pickle.load(file)

    result_data = []

    for filename in os.listdir(FILES_DIR):
        print('Reading file: "' + filename + '"')
        file_data = pd.read_csv(os.path.join(FILES_DIR, filename))
        cleaned_text = preprocess(file_data[text_col])

        result = model.predict(cleaned_text)
        # Keep only the rows predicted with the file's majority label
        # (label 4 if it is the mode, otherwise label 0).
        majority_label = 4 if np.bincount(result).argmax() == 4 else 0
        result = result == majority_label
        file_data = file_data[result]

        query = re.findall(r'(#[^.]+|:.+:)', filename)[0]
        file_data['label'] = relations[query]

        result_data = result_data + [file_data]

    if len(result_data) > 0:
        result_data = pd.concat(result_data)

        path = Path(save_path).resolve()
        result_data.to_csv(path, index=None)

        print('Files saved under "' + save_path + '"')
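
predict_nb walks a directory of per-query CSV files, keeps the rows the pickled model assigns to each file's dominant label, tags them with the emotion mapped in query_relations.json and concatenates everything into a single CSV. A usage sketch with illustrative paths:

# Hypothetical invocation; all paths are only examples.
predict_nb(files_dir='data/queries',
           model_file='models/naive_bayes.pickle',
           save_path='data/labelled_tweets.csv')
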
def predict_from_directory(files_dir,
                           model_weights_file,
                           model_type,
                           tokenizer_file,
                           save_path,
                           input_length=100,
                           embedding_dim=100,
                           text_col='text'):
    FILES_DIR = Path(files_dir).resolve()
    RELATIONS_FILE = Path(os.path.abspath(__file__),
                          '../../query_relations.json').resolve()

    with RELATIONS_FILE.open('rb') as file:
        relations = json.load(file)

    tokenizer_path = Path(tokenizer_file).resolve()
    with tokenizer_path.open('rb') as file:
        tokenizer = pickle.load(file)

    weights_path = Path(model_weights_file).resolve()
    input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
    model = NLP_MODEL[model_type](input_length,
                                  input_dim,
                                  None,
                                  embedding_dim=embedding_dim)
    model.load_weights(weights_path.as_posix())

    emotion_data_dict = {}

    for filename in os.listdir(FILES_DIR):
        print('Reading file: "' + filename + '"')

        query = re.findall(r'(#[^.]+|:.+:)', filename)[0]
        emotion = relations[query]

        file_data = pd.read_csv(os.path.join(FILES_DIR, filename))
        # Group the per-query files by the emotion their query maps to.
        dict_data = emotion_data_dict.get(emotion)
        emotion_data_dict[emotion] = pd.concat([dict_data, file_data])

    result_data = []

    for emotion, dataset in emotion_data_dict.items():
        print('Processing "' + emotion + '" data...')

        cleaned_texts = preprocess(dataset[text_col], quiet=True)
        predict_sequences = [text.split() for text in cleaned_texts]
        list_tokenized_predict = tokenizer.texts_to_sequences(
            predict_sequences)
        x_predict = pad_sequences(list_tokenized_predict, maxlen=input_length)

        result = model.predict(x_predict)
        mean = np.mean(result)
        std = np.std(result)
        low, high = get_score_range(mean, std)
        print("\tScore Range: {:4f} - {:4f}".format(low, high))
        # Flatten the mask in case model.predict() returns a column vector of shape (n, 1).
        mask = np.all([result >= low, result <= high], axis=0).reshape(-1)
        dataset = dataset[mask]
        dataset.insert(0, 'label', emotion)

        result_data = result_data + [dataset]

    if len(result_data) > 0:
        result_data = pd.concat(result_data)

        path = Path(save_path).resolve()
        result_data.to_csv(path, index=None)

        print('Files saved under "' + save_path + '"')
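
predict_from_directory performs the same directory walk with a Keras model instead, grouping files by emotion and keeping only the rows whose predicted score falls inside the per-emotion range. A usage sketch; the paths and the 'lstm' key into NLP_MODEL are assumptions, not taken from the source:

# Hypothetical invocation; paths and the model_type key are only examples.
predict_from_directory(files_dir='data/queries',
                       model_weights_file='models/emotion_weights.h5',
                       model_type='lstm',
                       tokenizer_file='models/tokenizer.pickle',
                       save_path='data/labelled_tweets.csv')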