Code example #1
 def greene_metric(self, min_num_topics=10, step=5, max_num_topics=50, top_n_words=10, tao=10):
     """
     Implements the Greene metric to compute the optimal number of topics. Taken from "How Many Topics?
     Stability Analysis for Topic Models", Greene et al. 2014.
     :param step: Step between the numbers of topics to test
     :param min_num_topics: Minimum number of topics to test
     :param max_num_topics: Maximum number of topics to test
     :param top_n_words: Top n words per topic to use
     :param tao: Number of sampled models to build
     :return: A list with the stability score of each tested number of topics k
     """
     stability = []
     # Build reference topic model
     # Generate tao topic models with tao samples of the corpus
     for k in np.arange(min_num_topics, max_num_topics + 1, step):
         self.infer_topics(k)
         reference_rank = [list(zip(*self.top_words(i, top_n_words)))[0] for i in range(k)]
         agreement_score_list = []
         for t in range(tao):
             tao_corpus = Corpus(source_file_path=self.corpus._source_file_path,
                                 language=self.corpus._language,
                                 n_gram=self.corpus._n_gram,
                                 vectorization=self.corpus._vectorization,
                                 max_relative_frequency=self.corpus._max_relative_frequency,
                                 min_absolute_frequency=self.corpus._min_absolute_frequency,
                                 preprocessor=self.corpus._preprocessor,
                                 sample=True)
             tao_model = type(self)(tao_corpus)
             tao_model.infer_topics(k)
             tao_rank = [list(zip(*tao_model.top_words(i, top_n_words)))[0] for i in range(k)]
             agreement_score_list.append(tom_lib.stats.agreement_score(reference_rank, tao_rank))
         stability.append(np.mean(agreement_score_list))
     return stability
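A minimal usage sketch for the metric above (assumptions hedged: the CSV path is hypothetical, and the Corpus and NonNegativeMatrixFactorization classes are assumed to behave as in the other examples on this page):

# Sketch only: score topic-count stability with greene_metric and plot it.
import matplotlib.pyplot as plt
import numpy as np
from tom_lib.nlp.topic_model import NonNegativeMatrixFactorization
from tom_lib.structure.corpus import Corpus

corpus = Corpus(source_file_path='input/my_corpus.csv',  # hypothetical input file
                language='english',
                vectorization='tfidf',
                max_relative_frequency=0.8,
                min_absolute_frequency=4)
topic_model = NonNegativeMatrixFactorization(corpus)
stability = topic_model.greene_metric(min_num_topics=10, max_num_topics=30, step=5,
                                      top_n_words=10, tao=10)
plt.plot(np.arange(10, 30 + 1, 5), stability, marker='o')
plt.xlabel('Number of topics k')
plt.ylabel('Mean agreement score (stability)')
plt.show()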
Code example #2
def getCorpus(className, startTime, endTime):
    # Parameters
    max_tf = 0.8
    min_tf = 4
    num_topics = 7
    vectorization = 'tfidf'

    MYDIR = os.path.dirname(__file__)

    # Should do whole semester by default
    return Corpus(source_file_path=os.path.join(MYDIR, getDataPathForClass(className)),   # Our own dataset!
                    language='english',     # English stop words
                    vectorization=vectorization,
                    enqueueTime=startTime,
                    dequeueTime=endTime,
                    max_relative_frequency=max_tf,
                    min_absolute_frequency=min_tf)
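A hypothetical call to the helper above (sketch only: the class name and timestamps are made up, and getDataPathForClass plus the enqueueTime/dequeueTime parameters are specific to this project's Corpus fork):

corpus = getCorpus('CS101', startTime='2017-01-01', endTime='2017-06-30')
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))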
Code example #3
__author__ = "Adrien Guille"
__email__ = "*****@*****.**"

# Flask Web server
app = Flask(__name__, static_folder='browser/static', template_folder='browser/templates')

# Parameters
max_tf = 0.8
min_tf = 4
num_topics = 15
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='input/egc_lemmatized.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('browser/static/data'):
    shutil.rmtree('browser/static/data')
os.makedirs('browser/static/data')
Code example #4
File: infer_topics.py  Project: weizhao-2010/TOM
from tom_lib.nlp.topic_model import NonNegativeMatrixFactorization
from tom_lib.structure.corpus import Corpus
from tom_lib.visualization.visualization import Visualization
import nltk

__author__ = "Adrien Guille, Pavel Soriano"
__email__ = "*****@*****.**"

# Download stopwords from NLTK
nltk.download('stopwords')

# Load and prepare a corpus
print('Load documents from CSV')
corpus = Corpus(
    source_file_path='input/egc_lemmatized.csv',
    language='french',  # language for stop words
    vectorization='tfidf',  # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
    max_relative_frequency=0.8,  # ignore words whose relative frequency is greater than max_relative_frequency
    min_absolute_frequency=4,  # ignore words whose absolute frequency is lower than min_absolute_frequency
)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
# print('Estimating the number of topics...')
# viz = Visualization(topic_model)
# viz.plot_greene_metric(min_num_topics=10,
#                        max_num_topics=11,
Code example #5
    plt.xlabel('Years')
    plt.savefig('topicevolution.png')
    plt.show()


#--------------------------------------------------------------------------------------------------------------------------------
start = timer()
print("Creating Papers.csv....")
MakeCompatibleCsv()
ReadMainCsvFile()
print("Initializing corpus....")
corpus = Corpus(
    source_file_path='Papers.csv',
    language='english',  # language for stop words
    vectorization='tfidf',  # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
    n_gram=3,
    max_relative_frequency=0.8,  # ignore words whose relative frequency is greater than max_relative_frequency
    min_absolute_frequency=4,  # ignore words whose absolute frequency is lower than min_absolute_frequency
)

print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))
#print('Vector representation of document 0:\n', corpus.vector_for_document(0))
# Instantiate a topic model
print('Instantiate a topic model...')
topic_model = NonNegativeMatrixFactorization(corpus)
topic_model.infer_topics(num_topics)
ut.save_topic_model(topic_model, 'output/NMF_30topics.tom')

print('Finding global Topics...')
Code example #6
to fit the required package format. For this, we refer to the 'tom_df' created in 1(a)
'''
#required tom_lib format for the input: (already created in step 1)
#id    title    text
#1    Document 1's title    This is the full content of document 1.
#2    Document 2's title    This is the full content of document 2.
#etc.
#https://github.com/AdrienGuille/TOM/blob/master/README.md
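#Illustrative sketch only: the real tom_df comes from step 1(a), but a hypothetical frame
#in the required id/title/text layout could be built like this:
#    import pandas as pd
#    tom_df = pd.DataFrame({'id': [1, 2],
#                           'title': ["Document 1's title", "Document 2's title"],
#                           'text': ['This is the full content of document 1.',
#                                    'This is the full content of document 2.']})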

#use the tom_df from 1(a)
tom_df.to_csv('tom_df.csv', sep='\t', index=False, encoding='utf-8')

tom_lib_corpus = Corpus(source_file_path='tom_df.csv',
                        vectorization='tfidf',
                        n_gram=1,
                        max_relative_frequency=0.8,
                        min_absolute_frequency=2)

topic_model = LatentDirichletAllocation(tom_lib_corpus)

output_notebook()

#we have 2 as the minimum because we want the documents to be clustered and not fall under a single group
#we have 4 as the maximum because we assume that each document can belong to a unique topic, and anything more
#than that is too many for the sample that we have

p = figure(plot_height=250)
p.line(range(2, 4),
       topic_model.arun_metric(min_num_topics=2,
                               max_num_topics=4,
Code example #7
File: infer_topics.py  Project: AdrienGuille/TOM
# coding: utf-8
import tom_lib.utils as ut
from tom_lib.nlp.topic_model import NonNegativeMatrixFactorization
from tom_lib.structure.corpus import Corpus
from tom_lib.visualization.visualization import Visualization

__author__ = "Adrien Guille, Pavel Soriano"
__email__ = "*****@*****.**"

# Load and prepare a corpus
print('Load documents from CSV')
corpus = Corpus(source_file_path='input/egc_lemmatized.csv',
                language='french',  # language for stop words
                vectorization='tfidf',  # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
                max_relative_frequency=0.8,  # ignore words whose relative frequency is greater than max_relative_frequency
                min_absolute_frequency=4)  # ignore words whose absolute frequency is lower than min_absolute_frequency
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))
print('Vector representation of document 0:\n', corpus.vector_for_document(0))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
print('Estimating the number of topics...')
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10,
                       max_num_topics=11,
                       tao=10, step=1,
                       top_n_words=10)
viz.plot_arun_metric(min_num_topics=5,
Code example #8
__author__ = "Adrien Guille"
__email__ = "*****@*****.**"

# Flask Web server
app = Flask(__name__, static_folder='browser/static', template_folder='browser/templates')

# Parameters
max_tf = 0.8
min_tf = 4
num_topics = 30
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='input/egc_lemmatized_.csv',
                language='english',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=30)

# Clean the data directory
if os.path.exists('browser/static/data'):
    shutil.rmtree('browser/static/data')
os.makedirs('browser/static/data')
Code example #9
File: topic_model.py  Project: warmlogic/TOM
    def perplexity_metric(
        self,
        min_num_topics: int = 10,
        max_num_topics: int = 20,
        step: int = 5,
        train_size: float = 0.7,
        verbose: int = 0,
        lda_algorithm: str = None,
        lda_alpha: float = None,
        lda_eta: float = None,
        lda_learning_method: str = None,
        lda_n_jobs: int = None,
        lda_n_iter: int = None,
        random_state=None,
    ):
        """
        Measures perplexity for LDA as computed by scikit-learn.

        http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation.perplexity

        NB: Only supports lda_algorithm 'variational' (sklearn LDA)

        :param step: Step between the numbers of topics to test
        :param min_num_topics: Minimum number of topics to test
        :param max_num_topics: Maximum number of topics to test
        :param train_size: Fraction of the corpus used to train each model; the rest is held out for test perplexity
        :return: Two lists (train perplexities, test perplexities), one value per tested number of topics
        """
        print('=' * 50)
        print('Computing perplexity metric (lower is better)...')
        num_topics_infer = range(min_num_topics, max_num_topics + 1, step)
        train_perplexities = []
        test_perplexities = []
        if self.model_type == 'LDA':
            print(f"Computing perplexity with lda_algorithm='{lda_algorithm}'")
            df_train, df_test = train_test_split(self.corpus.data_frame, train_size=train_size, test_size=1 - train_size)
            corpus_train = Corpus(
                source_filepath=df_train,
                name=self.corpus.name,
                sep=self.corpus._sep,
                language=self.corpus._language,
                n_gram=self.corpus._n_gram,
                vectorization=self.corpus._vectorization,
                max_relative_frequency=self.corpus._max_relative_frequency,
                min_absolute_frequency=self.corpus._min_absolute_frequency,
                max_features=self.corpus.max_features,
                sample=None,
                text_col=self.corpus.text_col,
                full_text_col=self.corpus.full_text_col,
                title_col=self.corpus.title_col,
                author_col=self.corpus.author_col,
                affiliation_col=self.corpus.affiliation_col,
                dataset_col=self.corpus.dataset_col,
                date_col=self.corpus.date_col,
                id_col=self.corpus.id_col,
            )
            tf_test = corpus_train.vectorizer.transform(df_test[corpus_train._text_col].tolist())
            lda_model = type(self)(corpus_train)
            for idx, i in enumerate(num_topics_infer):
                print(f'Topics={i} ({idx + 1} of {len(num_topics_infer)})')
                lda_model.infer_topics(
                    num_topics=i,
                    lda_algorithm=lda_algorithm,
                    lda_alpha=lda_alpha,
                    lda_eta=lda_eta,
                    lda_learning_method=lda_learning_method,
                    lda_n_jobs=lda_n_jobs,
                    lda_n_iter=lda_n_iter,
                    verbose=verbose,
                    random_state=random_state,
                )
                train_perplexities.append(lda_model.model.perplexity(
                    corpus_train.sklearn_vector_space))
                test_perplexities.append(lda_model.model.perplexity(tf_test))
                print(f'\tTrain perplexity={train_perplexities[-1]:.4f}, Test perplexity={test_perplexities[-1]:.4f}')
        else:
            raise TypeError("Computing perplexity is only supported for LDA (lda_algorithm='variational'). Not running.")
        return train_perplexities, test_perplexities
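A minimal usage sketch for the perplexity metric above (hedged: the file path and parameters are hypothetical, and Corpus / LatentDirichletAllocation are assumed to work as in the warmlogic/TOM examples elsewhere on this page):

# Sketch only: compare train/test perplexity across candidate numbers of topics.
import matplotlib.pyplot as plt
from tom_lib.nlp.topic_model import LatentDirichletAllocation
from tom_lib.structure.corpus import Corpus

corpus = Corpus(source_filepath='docs.csv',  # hypothetical input file
                vectorization='tf',          # LDA expects raw term frequencies
                max_relative_frequency=0.8,
                min_absolute_frequency=5)
lda = LatentDirichletAllocation(corpus=corpus)
train_perp, test_perp = lda.perplexity_metric(min_num_topics=10, max_num_topics=20, step=5,
                                              train_size=0.7, lda_algorithm='variational')
ks = list(range(10, 20 + 1, 5))
plt.plot(ks, train_perp, marker='o', label='train')
plt.plot(ks, test_perp, marker='o', label='test')
plt.xlabel('Number of topics k')
plt.ylabel('Perplexity (lower is better)')
plt.legend()
plt.show()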
Code example #10
File: topic_model.py  Project: OPersian/TOM
# coding: utf-8
import tom_lib.utils as ut
from tom_lib.nlp.preprocessor import FrenchLemmatizer
from tom_lib.nlp.topic_model import NonNegativeMatrixFactorization
from tom_lib.structure.corpus import Corpus
from tom_lib.visualization.visualization import Visualization

__author__ = "Adrien Guille, Pavel Soriano"
__email__ = "*****@*****.**"

# Load and prepare a corpus
print('Load documents from CSV')
corpus = Corpus(source_file_path='input/egc.csv',
                language='french',  # language for stop words
                vectorization='tfidf',  # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
                max_relative_frequency=0.8,  # ignore words whose relative frequency is greater than max_relative_frequency
                min_absolute_frequency=4,  # ignore words whose absolute frequency is lower than min_absolute_frequency
                preprocessor=FrenchLemmatizer())  # pre-process documents
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))
print('Vector representation of document 0:\n', corpus.vector_for_document(0))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
print('Estimating the number of topics...')
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10,
                       max_num_topics=30,
                       tao=10, step=1,
Code example #11
File: topic_model.py  Project: warmlogic/TOM
    def greene_metric(
        self,
        min_num_topics: int = 10,
        max_num_topics: int = 20,
        step: int = 5,
        top_n_words: int = 10,
        tao: int = 10,
        sample: float = 0.8,
        verbose: int = 0,
        nmf_init: str = None,
        nmf_solver: str = None,
        nmf_beta_loss: str = None,
        nmf_max_iter: int = None,
        nmf_alpha: float = None,
        nmf_l1_ratio: float = None,
        nmf_shuffle: bool = None,
        lda_algorithm: str = None,
        lda_alpha: float = None,
        lda_eta: float = None,
        lda_learning_method: str = None,
        lda_n_jobs: int = None,
        lda_n_iter: int = None,
        random_state=None,
    ):
        """
        Higher is better.

        Greene, D., O'Callaghan, D., and Cunningham, P.
        How Many Topics? Stability Analysis for Topic Models
        Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2014.
        https://arxiv.org/abs/1404.4606

        :param step: Step between the numbers of topics to test
        :param min_num_topics: Minimum number of topics to test
        :param max_num_topics: Maximum number of topics to test
        :param top_n_words: Top n words per topic to use
        :param tao: Number of sampled models to build
        :return: A list with the stability score of each tested number of topics k
        """
        print('=' * 50)
        print('Computing Greene metric (higher is better)...')
        num_topics_infer = range(min_num_topics, max_num_topics + 1, step)
        stability = []

        # Build reference topic model
        # Generate tao topic models with tao samples of the corpus
        for idx, k in enumerate(num_topics_infer):
            print(f'Topics={k} ({idx + 1} of {len(num_topics_infer)})')
            if self.model_type == 'NMF':
                self.infer_topics(
                    num_topics=k,
                    nmf_init=nmf_init,
                    nmf_solver=nmf_solver,
                    nmf_beta_loss=nmf_beta_loss,
                    nmf_max_iter=nmf_max_iter,
                    nmf_alpha=nmf_alpha,
                    nmf_l1_ratio=nmf_l1_ratio,
                    nmf_shuffle=nmf_shuffle,
                    verbose=verbose,
                    random_state=random_state,
                )
            elif self.model_type == 'LDA':
                self.infer_topics(
                    num_topics=k,
                    lda_algorithm=lda_algorithm,
                    lda_alpha=lda_alpha,
                    lda_eta=lda_eta,
                    lda_learning_method=lda_learning_method,
                    lda_n_jobs=lda_n_jobs,
                    lda_n_iter=lda_n_iter,
                    verbose=verbose,
                    random_state=random_state,
                )
            else:
                raise TypeError(f'Unsupported model type: {self.model_type}')

            reference_rank = [list(zip(*self.top_words(i, top_n_words)))[0] for i in range(k)]
            agreement_score_list = []
            for t in range(tao):
                tao_corpus = Corpus(
                    source_filepath=self.corpus.data_frame,
                    name=self.corpus.name,
                    sep=self.corpus._sep,
                    language=self.corpus._language,
                    n_gram=self.corpus._n_gram,
                    vectorization=self.corpus._vectorization,
                    max_relative_frequency=self.corpus._max_relative_frequency,
                    min_absolute_frequency=self.corpus._min_absolute_frequency,
                    max_features=self.corpus.max_features,
                    sample=sample,
                    text_col=self.corpus.text_col,
                    full_text_col=self.corpus.full_text_col,
                    title_col=self.corpus.title_col,
                    author_col=self.corpus.author_col,
                    affiliation_col=self.corpus.affiliation_col,
                    dataset_col=self.corpus.dataset_col,
                    date_col=self.corpus.date_col,
                    id_col=self.corpus.id_col,
                )
                tao_model = type(self)(tao_corpus)
                if self.model_type == 'NMF':
                    tao_model.infer_topics(
                        num_topics=k,
                        nmf_init=nmf_init,
                        nmf_solver=nmf_solver,
                        nmf_beta_loss=nmf_beta_loss,
                        nmf_max_iter=nmf_max_iter,
                        nmf_alpha=nmf_alpha,
                        nmf_l1_ratio=nmf_l1_ratio,
                        nmf_shuffle=nmf_shuffle,
                        verbose=verbose,
                        random_state=random_state,
                    )
                elif self.model_type == 'LDA':
                    tao_model.infer_topics(
                        num_topics=k,
                        lda_algorithm=lda_algorithm,
                        lda_alpha=lda_alpha,
                        lda_eta=lda_eta,
                        lda_learning_method=lda_learning_method,
                        lda_n_jobs=lda_n_jobs,
                        lda_n_iter=lda_n_iter,
                        verbose=verbose,
                        random_state=random_state,
                    )
                else:
                    raise TypeError(f'Unsupported model type: {self.model_type}')
                tao_rank = [next(zip(*tao_model.top_words(i, top_n_words))) for i in range(k)]
                agreement_score_list.append(agreement_score(reference_rank, tao_rank))
            stability.append(np.mean(agreement_score_list))
            print(f'    Stability={stability[-1]:.4f}')
        return stability
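For context, a simplified illustration of the agreement score that greene_metric averages, following the idea in Greene et al. 2014 (sketch only: this uses plain Jaccard over the full top-n word sets with Hungarian matching, not the depth-weighted Average Jaccard of the paper, and it is not the tom_lib implementation; toy_agreement_score is a hypothetical name):

# Sketch only: match each reference topic to its best-matching sampled topic
# (Hungarian algorithm) and average the Jaccard similarity of their top words.
import numpy as np
from scipy.optimize import linear_sum_assignment

def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b)

def toy_agreement_score(reference_rank, sample_rank):
    sim = np.array([[jaccard(r, s) for s in sample_rank] for r in reference_rank])
    row_ind, col_ind = linear_sum_assignment(-sim)  # maximize total similarity
    return sim[row_ind, col_ind].mean()

ref = [['topic', 'model', 'word'], ['corpus', 'document', 'text']]
smp = [['document', 'corpus', 'text'], ['model', 'topic', 'term']]
print(toy_agreement_score(ref, smp))  # 1.0 means the matched top-word sets are identical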
Code example #12
File: assess_topics.py  Project: warmlogic/TOM
def main(config_infer):
    # get the current datetime string for use in the output directory name
    now_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # Data parameters
    data_dir = config_infer.get('data_dir', '', vars=os.environ)
    data_dir = data_dir or '.'
    data_dir = Path(data_dir)
    docs_filename = config_infer.get('docs_filename', '')
    if not docs_filename:
        raise ValueError(f'docs_filename not specified in {config_filepath}')
    source_filepath = data_dir / docs_filename
    if not source_filepath.exists():
        raise OSError(f'Documents file does not exist: {source_filepath}')
    # Corpus parameters
    id_col = config_infer.get('id_col', None)
    affiliation_col = config_infer.get('affiliation_col', None)
    dataset_col = config_infer.get('dataset_col', None)
    title_col = config_infer.get('title_col', None)
    author_col = config_infer.get('author_col', None)
    date_col = config_infer.get('date_col', None)
    text_col = config_infer.get('text_col', None)
    full_text_col = config_infer.get('full_text_col', None)
    corpus_name = config_infer.get('corpus_name', None)
    corpus_name = '_'.join(corpus_name.split()) if corpus_name else 'corpus'  # remove spaces
    language = config_infer.get('language', None)
    assert (isinstance(language, str) and language in ['english']) or (isinstance(language, list)) or (language is None)
    # ignore words whose relative frequency is greater than max_relative_frequency
    max_relative_frequency = config_infer.getfloat('max_relative_frequency', 0.8)
    # ignore words whose absolute frequency is lower than min_absolute_frequency
    min_absolute_frequency = config_infer.getint('min_absolute_frequency', 5)
    # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
    vectorization = config_infer.get('vectorization', 'tfidf')
    n_gram = config_infer.getint('n_gram', 1)
    max_features = config_infer.get('max_features', None)
    if isinstance(max_features, str):
        if max_features.isnumeric():
            max_features = int(max_features)
        elif max_features == 'None':
            max_features = None
    assert isinstance(max_features, int) or (max_features is None)
    sample = config_infer.getfloat('sample', 1.0)
    # General model parameters
    model_type = config_infer.get('model_type', 'NMF')
    verbose = config_infer.getint('verbose', 0)
    random_state = config_infer.getint('random_state', None)
    # NMF parameters
    nmf_init = config_infer.get('nmf_init', None)
    nmf_solver = config_infer.get('nmf_solver', None)
    nmf_beta_loss = config_infer.get('nmf_beta_loss', 'frobenius')
    nmf_max_iter = config_infer.getint('nmf_max_iter', None)
    nmf_alpha = config_infer.getfloat('nmf_alpha', None)
    nmf_l1_ratio = config_infer.getfloat('nmf_l1_ratio', None)
    nmf_shuffle = config_infer.getboolean('nmf_shuffle', None)
    # LDA parameters
    lda_algorithm = config_infer.get('lda_algorithm', 'variational')
    lda_alpha = config_infer.getfloat('lda_alpha', None)
    lda_eta = config_infer.getfloat('lda_eta', None)
    lda_learning_method = config_infer.get('lda_learning_method', 'batch')
    lda_n_jobs = config_infer.getint('lda_n_jobs', -1)
    lda_n_iter = config_infer.getint('lda_n_iter', None)

    # Assessment config parameters
    min_num_topics = config_infer.getint('min_num_topics', 11)
    max_num_topics = config_infer.getint('max_num_topics', 49)
    step = config_infer.getint('step', 2)
    greene_tao = config_infer.getint('greene_tao', 10)
    greene_top_n_words = config_infer.getint('greene_top_n_words', 10)
    greene_sample = config_infer.getfloat('greene_sample', 0.8)
    arun_iterations = config_infer.getint('arun_iterations', 10)
    brunet_iterations = config_infer.getint('brunet_iterations', 10)
    coherence_w2v_top_n_words = config_infer.getint('coherence_w2v_top_n_words', 10)
    coherence_w2v_size = config_infer.getint('coherence_w2v_size', 100)
    # perplexity_train_size = config_infer.getfloat('perplexity_train_size', 0.7)

    if model_type not in ['NMF', 'LDA']:
        raise ValueError(f"model_type must be 'NMF' or 'LDA', got {model_type}")

    if model_type == 'NMF':
        if (nmf_solver == 'mu') and (nmf_beta_loss not in ['frobenius', 'kullback-leibler', 'itakura-saito']):
            raise ValueError(f"For NMF, 'beta_loss' must be 'frobenius', 'kullback-leibler', or 'itakura-saito', got '{nmf_beta_loss}'")
        if vectorization == 'tf':
            raise ValueError(f"for NMF, 'vectorization' should be 'tfidf', got '{vectorization}'")
    elif model_type == 'LDA':
        if lda_algorithm not in ['variational', 'gibbs']:
            raise ValueError(f"For LDA, 'lda_algorithm' must be 'variational' or 'gibbs', got '{lda_algorithm}'")
        if vectorization == 'tfidf':
            raise ValueError(f"for LDA, 'vectorization' should be 'tf', got '{vectorization}'")

    # Load and prepare a corpus
    logger.info(f'Loading documents: {source_filepath}')
    corpus = Corpus(
        source_filepath=source_filepath,
        name=corpus_name,
        language=language,
        vectorization=vectorization,
        n_gram=n_gram,
        max_relative_frequency=max_relative_frequency,
        min_absolute_frequency=min_absolute_frequency,
        max_features=max_features,
        sample=sample,
        id_col=id_col,
        affiliation_col=affiliation_col,
        dataset_col=dataset_col,
        title_col=title_col,
        author_col=author_col,
        date_col=date_col,
        text_col=text_col,
        full_text_col=full_text_col,
    )
    logger.info(f'Corpus size: {corpus.size:,}')
    logger.info(f'Vocabulary size: {corpus.vocabulary_size:,}')

    # Initialize topic model
    if model_type == 'NMF':
        topic_model = NonNegativeMatrixFactorization(corpus=corpus)
    elif model_type == 'LDA':
        topic_model = LatentDirichletAllocation(corpus=corpus)

    # Estimate the optimal number of topics
    num_topics_infer = range(min_num_topics, max_num_topics + 1, step)
    logger.info(f'Total number of topics to infer: {len(num_topics_infer)}')
    logger.info(f'Topic numbers: {list(num_topics_infer)}')

    output_dir = f'assess_{topic_model.model_type}_{source_filepath.stem}_{now_str}'

    viz = Visualization(topic_model, output_dir=output_dir)

    logger.info('Estimating the number of topics to choose. This could take a while...')
    logger.info(f'Will save results to: {viz.output_dir}')

    logger.info('Assessing Greene metric')
    viz.plot_greene_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        tao=greene_tao,
        top_n_words=greene_top_n_words,
        sample=greene_sample,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )

    logger.info('Assessing Arun metric')
    viz.plot_arun_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        iterations=arun_iterations,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )

    logger.info('Assessing Coherence Word2Vec metric')
    viz.plot_coherence_w2v_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        top_n_words=coherence_w2v_top_n_words,
        w2v_size=coherence_w2v_size,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )

    logger.info('Assessing Brunet metric')
    viz.plot_brunet_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        iterations=brunet_iterations,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )
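A hypothetical way to drive main() above with an in-memory config (sketch only: the key names mirror the reads in main(), but the section name, values, and whether the real script builds config_infer this way are assumptions):

# Sketch only: build a configparser section exposing the keys main() reads.
import configparser

parser = configparser.ConfigParser()
parser['assess'] = {                      # hypothetical section name
    'data_dir': '/path/to/data',          # hypothetical paths and columns
    'docs_filename': 'docs.csv',
    'corpus_name': 'my corpus',
    'language': 'english',
    'text_col': 'text',
    'title_col': 'title',
    'vectorization': 'tfidf',
    'model_type': 'NMF',
    'min_num_topics': '5',
    'max_num_topics': '25',
    'step': '5',
}
main(parser['assess'])  # the section supports .get/.getint/.getfloat/.getboolean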
Code example #13
        similar_word_ids = np.argsort(np.array(similarity)).tolist()
        return similar_word_ids[:nb_words]

if __name__ == '__main__':
    input_file_path = ''
    output_file_path = ''
    if platform.system() == 'Darwin':
        input_file_path = '/Users/adrien/data/HP/HP1.csv'
        output_file_path = '/Users/adrien/data/HP/HP1_lemmatized.csv'
    elif platform.system() == 'Linux':
        input_file_path = '/home/adrien/datasets/HP/HP1.csv'
        output_file_path = '/home/adrien/datasets/HP/HP1_lemmatized.csv'
    print('Loading corpus...')
    corpus = Corpus(source_file_path=input_file_path,
                    vectorization='tf',
                    max_relative_frequency=0.75,
                    min_absolute_frequency=4,
                    preprocessor=EnglishLemmatizer())
    print(' - corpus size:', corpus.size)
    print(' - vocabulary size:', len(corpus.vocabulary))
    corpus.export(output_file_path)

    print('Computing semantic model...')
    print(' - calculating raw frequencies...')
    model = SemanticModel(corpus, window=7)

    print(' - transforming raw frequencies (Positive Pointwise Mutual Information)...')
    model.ppmi_transform(laplace_smoothing=2)

    print(' - smoothing model (SVD)...')
    model.svd_smoothing(dimension=300)
Code example #14
def main(config_browser):
    # Data parameters
    data_dir = config_browser.get('data_dir', '', vars=os.environ)
    data_dir = data_dir or '.'
    data_dir = Path(data_dir)
    docs_filename = config_browser.get('docs_filename', '')
    if not docs_filename:
        raise ValueError(f'docs_filename not specified in {config_filepath}')
    source_filepath = data_dir / docs_filename
    if not source_filepath.exists():
        raise OSError(f'Documents file does not exist: {source_filepath}')
    # Corpus parameters
    id_col = config_browser.get('id_col', None)
    affiliation_col = config_browser.get('affiliation_col', None)
    dataset_col = config_browser.get('dataset_col', None)
    title_col = config_browser.get('title_col', None)
    author_col = config_browser.get('author_col', None)
    date_col = config_browser.get('date_col', None)
    text_col = config_browser.get('text_col', None)
    full_text_col = config_browser.get('full_text_col', None)
    corpus_name = config_browser.get('corpus_name', None)
    corpus_name = '_'.join(
        corpus_name.split()) if corpus_name else 'corpus'  # remove spaces
    language = config_browser.get('language', None)
    assert (isinstance(language, str)
            and language in ['english']) or (isinstance(
                language, list)) or (language is None)
    # ignore words whose relative frequency is greater than max_relative_frequency
    max_relative_frequency = config_browser.getfloat('max_relative_frequency', 0.8)
    # ignore words whose absolute frequency is lower than min_absolute_frequency
    min_absolute_frequency = config_browser.getint('min_absolute_frequency', 5)
    # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
    vectorization = config_browser.get('vectorization', 'tfidf')
    n_gram = config_browser.getint('n_gram', 1)
    max_features = config_browser.get('max_features', None)
    if isinstance(max_features, str):
        if max_features.isnumeric():
            max_features = int(max_features)
        elif max_features == 'None':
            max_features = None
    assert isinstance(max_features, int) or (max_features is None)
    sample = config_browser.getfloat('sample', 1.0)
    # General model parameters
    model_type = config_browser.get('model_type', 'NMF')
    num_topics = config_browser.getint('num_topics', 15)
    verbose = config_browser.getint('verbose', 0)
    random_state = config_browser.getint('random_state', None)
    rename_topics = config_browser.get('rename_topics', None)
    rename_topics = rename_topics.split(',') if rename_topics else None
    merge_topics = config_browser.get('merge_topics', None)
    if merge_topics:
        merge_topics = {
            t.split(':')[0]: t.split(':')[1:][0].split(',')
            for t in merge_topics.split('.') if t
        }
    # must define the state if renaming or merging topics
    if rename_topics or merge_topics:
        assert random_state is not None
    load_if_existing_model = config_browser.getboolean(
        'load_if_existing_model', True)
    # NMF parameters
    nmf_init = config_browser.get('nmf_init', None)
    nmf_solver = config_browser.get('nmf_solver', None)
    nmf_beta_loss = config_browser.get('nmf_beta_loss', 'frobenius')
    nmf_max_iter = config_browser.getint('nmf_max_iter', None)
    nmf_alpha = config_browser.getfloat('nmf_alpha', None)
    nmf_l1_ratio = config_browser.getfloat('nmf_l1_ratio', None)
    nmf_shuffle = config_browser.getboolean('nmf_shuffle', None)
    # LDA parameters
    lda_algorithm = config_browser.get('lda_algorithm', 'variational')
    lda_alpha = config_browser.getfloat('lda_alpha', None)
    lda_eta = config_browser.getfloat('lda_eta', None)
    lda_learning_method = config_browser.get('lda_learning_method', 'batch')
    lda_n_jobs = config_browser.getint('lda_n_jobs', -1)
    lda_n_iter = config_browser.getint('lda_n_iter', None)
    # Web app parameters
    top_words_description = config_browser.getint('top_words_description', 10)
    top_words_cloud = config_browser.getint('top_words_cloud', 5)

    if model_type not in ['NMF', 'LDA']:
        raise ValueError(
            f"model_type must be 'NMF' or 'LDA', got {model_type}")

    if model_type == 'NMF':
        if (nmf_solver == 'mu') and (nmf_beta_loss not in [
                'frobenius', 'kullback-leibler', 'itakura-saito'
        ]):
            raise ValueError(
                f"For NMF, 'beta_loss' must be 'frobenius', 'kullback-leibler', or 'itakura-saito', got '{nmf_beta_loss}'"
            )
        if vectorization == 'tf':
            raise ValueError(
                f"for NMF, 'vectorization' should be 'tfidf', got '{vectorization}'"
            )
    elif model_type == 'LDA':
        if lda_algorithm not in ['variational', 'gibbs']:
            raise ValueError(
                f"For LDA, 'lda_algorithm' must be 'variational' or 'gibbs', got '{lda_algorithm}'"
            )
        if vectorization == 'tfidf':
            raise ValueError(
                f"for LDA, 'vectorization' should be 'tf', got '{vectorization}'"
            )

    if rename_topics:
        assert len(rename_topics) == num_topics

    # Flask Web server
    static_folder = Path('browser/static')
    template_folder = Path('browser/templates')

    # Set up directories for serving files
    tm_folder = Path(
        'data') / f'{model_type}_{source_filepath.stem}_{num_topics}_topics'
    data_folder = tm_folder / 'data'
    model_folder = tm_folder / 'model'
    topic_model_filepath = model_folder / 'model.pickle'

    # Set up sub-directories for serving files
    topic_cloud_folder = data_folder / 'topic_cloud'
    # # author_network_folder = data_folder / 'author_network'
    figs_folder = data_folder / 'figs'

    # ##################################
    # Load or train model
    # ##################################

    if load_if_existing_model and (static_folder /
                                   topic_model_filepath).exists():
        # Load model from disk:
        logger.info(
            f'Loading topic model: {static_folder / topic_model_filepath}')
        topic_model = ut.load_topic_model(static_folder / topic_model_filepath)

        # if loading a model and random_state is set, ensure they match
        if random_state:
            assert topic_model.random_state == random_state

        logger.info(f'Corpus size: {topic_model.corpus.size:,}')
        logger.info(f'Vocabulary size: {topic_model.corpus.vocabulary_size:,}')
    else:
        # Clean the topic model directory
        if (static_folder / tm_folder).exists():
            ut.delete_folder(static_folder / tm_folder)
        (static_folder / tm_folder).mkdir(parents=True, exist_ok=False)

        # Load and prepare a corpus
        logger.info(f'Loading documents: {source_filepath}')
        corpus = Corpus(
            source_filepath=source_filepath,
            name=corpus_name,
            language=language,
            vectorization=vectorization,
            n_gram=n_gram,
            max_relative_frequency=max_relative_frequency,
            min_absolute_frequency=min_absolute_frequency,
            max_features=max_features,
            sample=sample,
            id_col=id_col,
            affiliation_col=affiliation_col,
            dataset_col=dataset_col,
            title_col=title_col,
            author_col=author_col,
            date_col=date_col,
            text_col=text_col,
            full_text_col=full_text_col,
        )
        # Initialize topic model
        if model_type == 'NMF':
            topic_model = NonNegativeMatrixFactorization(corpus=corpus)
        elif model_type == 'LDA':
            topic_model = LatentDirichletAllocation(corpus=corpus)

        logger.info(f'Corpus size: {topic_model.corpus.size:,}')
        logger.info(f'Vocabulary size: {topic_model.corpus.vocabulary_size:,}')

        # Infer topics
        logger.info(f'Inferring {num_topics} topics')
        if model_type == 'NMF':
            topic_model.infer_topics(
                num_topics=num_topics,
                nmf_init=nmf_init,
                nmf_solver=nmf_solver,
                nmf_beta_loss=nmf_beta_loss,
                nmf_max_iter=nmf_max_iter,
                nmf_alpha=nmf_alpha,
                nmf_l1_ratio=nmf_l1_ratio,
                nmf_shuffle=nmf_shuffle,
                verbose=verbose,
                random_state=random_state,
            )
        elif model_type == 'LDA':
            topic_model.infer_topics(
                num_topics=num_topics,
                lda_algorithm=lda_algorithm,
                lda_alpha=lda_alpha,
                lda_eta=lda_eta,
                lda_learning_method=lda_learning_method,
                lda_n_jobs=lda_n_jobs,
                lda_n_iter=lda_n_iter,
                verbose=verbose,
                random_state=random_state,
            )

        # Save model on disk
        logger.info(f'Saving topic model: {topic_model_filepath}')
        ut.save_topic_model(topic_model, static_folder / topic_model_filepath)

    topic_cols_all = [
        ' '.join(tw)
        for tw in topic_model.top_words_topics(num_words=top_words_description)
    ]
    if rename_topics:
        rename = {tc: d for tc, d in zip(topic_cols_all, rename_topics)}
    else:
        rename = None

    # Get the top words for each topic for use around the site
    topic_description = [
        f"Topic {i:2d}: {rename_topics[i] + ' --- ' if rename_topics else None}{', '.join(tw)}"
        for i, tw in enumerate(
            topic_model.top_words_topics(num_words=top_words_description))
    ]

    # Save the top words to CSV
    num_top_words_save = 20
    logger.info(f'Saving top {num_top_words_save} words CSV and XLSX')
    top_words_filename = f'{topic_model.corpus.name}_{topic_model.nb_topics}_topics_top_{num_top_words_save}_words'
    ut.save_top_words(num_top_words_save, topic_model,
                      static_folder / data_folder / top_words_filename)

    # Get the vocabulary and split into sublists
    n_cols = 5
    words_per_col = int(ceil(topic_model.corpus.vocabulary_size / n_cols))
    split_vocabulary = [
        sublist for sublist in ut.chunks(
            [(k, v)
             for k, v in topic_model.corpus.vocabulary.items()], words_per_col)
    ]

    # Export topic cloud
    logger.info('Saving topic cloud')
    ut.save_topic_cloud(topic_model,
                        static_folder / topic_cloud_folder /
                        'topic_cloud.json',
                        top_words=top_words_cloud)

    # # Export per-topic author network using the most likely documents for each topic
    # logger.info('Saving author network details')
    # for topic_id in range(topic_model.nb_topics):
    #     ut.save_json_object(topic_model.corpus.collaboration_network(topic_model.documents_for_topic(topic_id)),
    #                         static_folder / author_network_folder / f'author_network{topic_id}.json')

    logger.info('Done.')

    # ##################################
    # Make plots for the main index page
    # ##################################

    logger.info('Creating plots...')

    # always create these images so they are up to date, and we have the paths based on the variables

    normalized = True
    thresh = 0.1
    freq = '1YS'
    ma_window = None
    savefig = True
    ncols = 7
    nchar_title = 30
    dpi = 72
    figformat = 'png'
    by_affil_list = [False, True]
    if merge_topics:
        merge_topics_list = [False, True]
    else:
        merge_topics_list = [False, False]

    viz = Visualization(topic_model, output_dir=static_folder / figs_folder)

    logger.info(f'Will save figures and figure data to: {viz.output_dir}')

    # count
    docs_over_time_count_line, docs_over_time_count_filepath = viz.plotly_docs_over_time(
        freq=freq,
        count=True,
        by_affil=True,
        ma_window=ma_window,
        output_type='div',
        savedata=True,
    )

    # percent
    docs_over_time_percent_line, docs_over_time_percent_filepath = viz.plotly_docs_over_time(
        freq=freq,
        count=False,
        by_affil=True,
        ma_window=ma_window,
        output_type='div',
        savedata=True,
    )

    # average topic loading
    topic_loading_barplot, topic_loading_filepath = viz.plotly_doc_topic_loading(
        rename=rename,
        normalized=normalized,
        n_words=top_words_description,
        output_type='div',
        savedata=True,
    )

    # topic_heatmap, topic_heatmap_filepath = viz.plotly_heatmap(
    #     rename=rename,
    #     normalized=normalized,
    #     n_words=top_words_description,
    #     annotate=True,
    #     annot_decimals=2,
    #     annot_fontsize=7,
    #     annot_fontcolor='black',
    #     output_type='div',
    #     savedata=False,
    # )

    topic_clustermap, topic_clustermap_filepath, topic_heatmap_filepath = viz.plotly_clustermap(
        rename=rename,
        normalized=normalized,
        n_words=top_words_description,
        annotate=True,
        annot_decimals=2,
        annot_fontsize=7,
        annot_fontcolor='black',
        output_type='div',
        savedata=True,
    )

    totc = []
    totp = []
    # totl = []
    for i, mt in enumerate(merge_topics_list):
        for ba in by_affil_list:
            if (not any(merge_topics_list)) and (i == 1):
                fig_topic_over_time_count = None
            else:
                _, _, fig_topic_over_time_count = viz.plot_topic_over_time_count(
                    rename=rename,
                    merge_topics=merge_topics if mt else None,
                    normalized=normalized,
                    thresh=thresh,
                    freq=freq,
                    n_words=top_words_description,
                    by_affil=ba,
                    ma_window=ma_window,
                    nchar_title=nchar_title,
                    ncols=ncols,
                    savefig=savefig,
                    dpi=dpi,
                    figformat=figformat,
                )
            totc.append(fig_topic_over_time_count)

            if (not any(merge_topics_list)) and (i == 1):
                fig_topic_over_time_percent = None
            else:
                _, _, fig_topic_over_time_percent = viz.plot_topic_over_time_percent(
                    rename=rename,
                    merge_topics=merge_topics if mt else None,
                    normalized=normalized,
                    thresh=thresh,
                    freq=freq,
                    n_words=top_words_description,
                    by_affil=ba,
                    ma_window=ma_window,
                    nchar_title=nchar_title,
                    ncols=ncols,
                    savefig=savefig,
                    dpi=dpi,
                    figformat=figformat,
                )
            totp.append(fig_topic_over_time_percent)

            # if (not any(merge_topics_list)) and (i == 1):
            #     fig_topic_over_time_loading = None
            # else:
            #     _, _, fig_topic_over_time_loading = viz.plot_topic_over_time_loading(
            #         rename=rename,
            #         merge_topics=merge_topics if mt else None,
            #         normalized=normalized,
            #         thresh=thresh,
            #         freq=freq,
            #         n_words=top_words_description,
            #         by_affil=ba,
            #         ma_window=ma_window,
            #         nchar_title=nchar_title,
            #         ncols=ncols,
            #         savefig=savefig,
            #         dpi=dpi,
            #         figformat=figformat,
            #     )
            # totl.append(fig_topic_over_time_loading)

    # _, _, fig_topic_topic_corr_heatmap = viz.plot_heatmap(
    #     rename=rename,
    #     normalized=normalized,
    #     fmt='.2f',
    #     annot_fontsize=12,
    #     n_words=top_words_description,
    #     savefig=savefig,
    #     dpi=dpi,
    #     figformat=figformat,
    # )

    _, fig_topic_topic_corr_clustermap = viz.plot_clustermap(
        rename=rename,
        normalized=normalized,
        fmt='.2f',
        annot_fontsize=12,
        n_words=top_words_description,
        savefig=savefig,
        dpi=dpi,
        figformat=figformat,
    )

    # # debug
    # fig_topic_over_time_count = ''
    # fig_topic_over_time_percent = ''
    # fig_topic_over_time_loading = ''
    # fig_topic_over_time_count_affil = ''
    # fig_topic_over_time_percent_affil = ''
    # fig_topic_over_time_loading_affil = ''
    # fig_topic_topic_corr_heatmap = ''
    # fig_topic_topic_corr_clustermap = ''

    logger.info('Done.')

    # ##################################
    # Print info
    # ##################################

    topic_model.print_topics(num_words=10)

    server = Flask(__name__,
                   static_folder=static_folder,
                   template_folder=template_folder)

    # ##################################
    # Set up topic loading similarity app
    # ##################################

    external_stylesheets = [
        'https://codepen.io/chriddyp/pen/bWLwgP.css',
    ]

    app = dash.Dash(
        __name__,
        server=server,
        routes_pathname_prefix='/topic_loading_similarity/',
        external_stylesheets=external_stylesheets,
    )

    app.title = 'Topic Loading Similarity'
    similarity_col = 'similarity'

    cols_sim = [
        similarity_col,
        topic_model.corpus._title_col,
        topic_model.corpus._dataset_col,
        topic_model.corpus._affiliation_col,
        topic_model.corpus._author_col,
        topic_model.corpus._date_col,
        id_col,
    ]
    cols_nosim = [
        c for c in cols_sim if c in topic_model.corpus.data_frame.columns
    ]

    app.layout = html.Div([
        html.Div([
            html.Div(
                html.P('Drag or click the sliders to describe a topic loading vector. '
                       'The most similar documents are displayed below.'),
                style={'float': 'left'},
            ),
            html.Div(
                html.A('Back to topic browser', id='back-to-main', href='../'),
                style={'float': 'right'},
            ),
        ]),
        html.Div(html.P('')),
        html.Div(
            [
                html.Div([
                    html.Div(
                        dcc.Slider(
                            id=f'slider-topic-{n}',
                            min=0.0,
                            max=1.0,
                            step=0.1,
                            value=0.0,  # starting value
                            updatemode='drag',
                        ),
                        style={
                            'width': '20%',
                            'display': 'inline-block',
                        },
                    ),
                    html.Div(
                        id=f'slider-output-container-{n}',
                        style={
                            'marginLeft': 10,
                            'marginRight': 5,
                            'font-size': 'small',
                            'display': 'inline-block',
                        },
                    ),
                    html.Div(
                        html.Label(topic_description[n]),
                        style={
                            'font-weight': 'bold',
                            'font-size': 'small',
                            'width': '75%',
                            'display': 'inline-block',
                        },
                    ),
                ]) for n in range(topic_model.nb_topics)
            ],
            style={
                'width': '100%',
                'display': 'inline-block'
            },
        ),
        html.Label('Number of documents to display'),
        html.Div(
            dcc.Dropdown(
                id='num-docs-dropdown',
                options=[
                    {
                        'label': '10',
                        'value': 10
                    },
                    {
                        'label': '50',
                        'value': 50
                    },
                    {
                        'label': '100',
                        'value': 100
                    },
                    {
                        'label': '200',
                        'value': 200
                    },
                    {
                        'label': 'All',
                        'value': topic_model.corpus.size
                    },
                ],
                value=10,
                placeholder='Select...',
            ),
            style={
                'width': '10%',
                'display': 'inline-block',
            },
        ),
        html.Div(
            html.A(
                html.Button('Export to CSV'),
                id='download-link',
                download=f'{corpus_name}_topic_loading_similarity.csv',
                href='',
                target='_blank',
            ),
            style={
                'display': 'inline-block',
                'float': 'right',
            },
        ),
        html.Div([
            dt.DataTable(
                id='doc-table',
                data=[],
                columns=[{
                    "name": i,
                    "id": i
                } for i in cols_sim],
                style_table={'overflowX': 'scroll'},
                style_cell={
                    'minWidth': '0px',
                    'maxWidth': '250px',
                    'whiteSpace': 'normal'
                },
                style_cell_conditional=[
                    {
                        'if': {
                            'column_id': similarity_col
                        },
                        'width': '7%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._title_col
                        },
                        'width': '39%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._dataset_col
                        },
                        'width': '6%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._affiliation_col
                        },
                        'width': '14%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._author_col
                        },
                        'width': '12%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._date_col
                        },
                        'width': '7%'
                    },
                    {
                        'if': {
                            'column_id': id_col
                        },
                        'width': '15%'
                    },
                ],
                style_data_conditional=[{
                    'if': {
                        'row_index': 'odd'
                    },
                    'backgroundColor': 'rgb(248, 248, 248)'
                }],
                style_header={
                    'backgroundColor': 'rgb(230, 230, 230)',
                    'fontWeight': 'bold'
                },
                css=[{
                    'selector':
                    '.dash-cell div.dash-cell-value',
                    'rule':
                    'display: inline; white-space: inherit; overflow: inherit; text-overflow: inherit;'
                }],
                editable=False,
                row_deletable=False,
                filter_action='native',
                sort_action='native',
                page_action='native',
                page_current=0,
                page_size=100,
                style_as_list_view=False,
            ),
        ]),
    ])

    for n in range(topic_model.nb_topics):

        @app.callback(
            Output(f'slider-output-container-{n}', 'children'),
            [Input(f'slider-topic-{n}', 'value')],
        )
        def update_output(slider_n_value):
            return f'{slider_n_value:.1f}'

    def filter_data(vector, num_docs=None, round_decimal=None):
        if not num_docs:
            num_docs = 10
        if not round_decimal:
            round_decimal = 4
        doc_ids_sims = topic_model.similar_documents(vector, num_docs=num_docs)
        doc_ids = [x[0] for x in doc_ids_sims]
        result = topic_model.corpus.data_frame.reindex(columns=cols_nosim,
                                                       index=doc_ids)
        result[similarity_col] = [
            round(x[1], round_decimal) for x in doc_ids_sims
        ]
        result[topic_model.corpus._date_col] = result[
            topic_model.corpus._date_col].dt.strftime('%Y-%m-%d')
        return result

    @app.callback(
        Output('doc-table', 'data'),
        [
            Input(f'slider-topic-{n}', 'value')
            for n in range(topic_model.nb_topics)
        ] + [Input('num-docs-dropdown', 'value')],
    )
    def update_table(*args):
        vector = list(args[:-1])
        num_docs = args[-1]
        return filter_data(vector, num_docs).to_dict('records')

    @app.callback(
        Output('download-link', 'href'),
        [
            Input(f'slider-topic-{n}', 'value')
            for n in range(topic_model.nb_topics)
        ] + [Input('num-docs-dropdown', 'value')],
    )
    def update_download_link(*args):
        vector = list(args[:-1])
        num_docs = args[-1]
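        # Prefix the data URI with '%EF%BB%BF' (a percent-encoded UTF-8 byte-order
        # mark) so that spreadsheet applications such as Excel detect the encoding
        # of the downloaded CSV correctly.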
        return 'data:text/csv;charset=utf-8,%EF%BB%BF' + urllib.parse.quote(
            filter_data(vector, num_docs).to_csv(index=False,
                                                 encoding='utf-8'))

    # ##################################
    # Serve pages
    # ##################################

    @server.route('/')
    def index():
        return render_template(
            'index.html',
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            method=type(topic_model).__name__,
            corpus_name=corpus_name,
            corpus_size=topic_model.corpus.size,
            vocabulary_size=topic_model.corpus.vocabulary_size,
            max_relative_frequency=max_relative_frequency,
            min_absolute_frequency=min_absolute_frequency,
            vectorization=vectorization,
            num_topics=num_topics,
            random_state=topic_model.random_state,
            top_words_csv=data_folder / f'{top_words_filename}.csv',
            top_words_xlsx=data_folder / f'{top_words_filename}.xlsx',
            docs_over_time_count_line=docs_over_time_count_line,
            docs_over_time_count_filepath=figs_folder /
            docs_over_time_count_filepath,
            docs_over_time_percent_line=docs_over_time_percent_line,
            docs_over_time_percent_filepath=figs_folder /
            docs_over_time_percent_filepath,
            topic_loading_barplot=topic_loading_barplot,
            topic_loading_filepath=figs_folder / topic_loading_filepath,
            # topic_heatmap=topic_heatmap,
            topic_clustermap=topic_clustermap,
            topic_clustermap_filepath=figs_folder / topic_clustermap_filepath,
            topic_heatmap_filepath=figs_folder / topic_heatmap_filepath,
            fig_topic_over_time_count=figs_folder / totc[0] if totc[0] else None,  # count, original topics, combined affiliations
            fig_topic_over_time_percent=figs_folder / totp[0] if totp[0] else None,  # percent, original topics, combined affiliations
            # fig_topic_over_time_loading=figs_folder / totl[0] if totl[0] else None,  # loading, original topics, combined affiliations
            fig_topic_over_time_count_affil=figs_folder / totc[1] if totc[1] else None,  # count, original topics, split affiliations
            fig_topic_over_time_percent_affil=figs_folder / totp[1] if totp[1] else None,  # percent, original topics, split affiliations
            # fig_topic_over_time_loading_affil=figs_folder / totl[1] if totl[1] else None,  # loading, original topics, split affiliations
            fig_topic_over_time_count_merged=figs_folder / totc[2] if totc[2] else None,  # count, merged topics, combined affiliations
            fig_topic_over_time_percent_merged=figs_folder / totp[2] if totp[2] else None,  # percent, merged topics, combined affiliations
            # fig_topic_over_time_loading_merged=figs_folder / totl[2] if totl[2] else None,  # loading, merged topics, combined affiliations
            fig_topic_over_time_count_affil_merged=figs_folder / totc[3] if totc[3] else None,  # count, merged topics, split affiliations
            fig_topic_over_time_percent_affil_merged=figs_folder / totp[3] if totp[3] else None,  # percent, merged topics, split affiliations
            # fig_topic_over_time_loading_affil_merged=figs_folder / totl[3] if totl[3] else None,  # loading, merged topics, split affiliations
            # fig_topic_topic_corr_heatmap=figs_folder / fig_topic_topic_corr_heatmap,
            fig_topic_topic_corr_clustermap=figs_folder / fig_topic_topic_corr_clustermap,
        )

    @server.route('/topic_cloud.html')
    def topic_cloud():
        return render_template(
            'topic_cloud.html',
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            topic_cloud_filename=topic_cloud_folder / 'topic_cloud.json',
        )

    @server.route('/vocabulary.html')
    def vocabulary():
        return render_template(
            'vocabulary.html',
            topic_ids=topic_description,
            split_vocabulary=split_vocabulary,
            vocabulary_size=topic_model.corpus.vocabulary_size,
        )

    @server.route('/topic/<tid>.html')
    def topic_details(tid: str):
        tid = int(tid)
        # get the most likely documents per topic
        ids = topic_model.documents_for_topic(tid)
        # # get the top 100 documents per topic
        # ids = list(topic_model.top_topic_docs(topics=tid, top_n=100))[0][1]
        documents = []
        for i, document_id in enumerate(ids):
            documents.append((
                i + 1,
                topic_model.corpus.title(document_id).title(),
                ', '.join(topic_model.corpus.dataset(document_id)).title(),
                ', '.join(topic_model.corpus.affiliation(document_id)).title(),
                ', '.join(topic_model.corpus.author(document_id)).title(),
                topic_model.corpus.date(document_id).strftime('%Y-%m-%d'),
                topic_model.corpus.id(document_id),
                document_id,
            ), )

        topic_word_weight_barplot, _ = viz.plotly_topic_word_weight(
            tid,
            normalized=True,
            n_words=20,
            output_type='div',
            savedata=False)
        topic_over_time_percent_line, _ = viz.plotly_topic_over_time(
            tid, count=False, output_type='div', savedata=False)
        topic_affiliation_count_barplot, _ = viz.plotly_topic_affiliation_count(
            tid, output_type='div', savedata=False)

        return render_template(
            'topic.html',
            topic_id=tid,
            description=f"{tid}{': ' + rename_topics[tid] if rename_topics else ''}",
            frequency=round(topic_model.topic_frequency(tid) * 100, 2),
            documents=documents,
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            topic_word_weight_barplot=topic_word_weight_barplot,
            topic_over_time_percent_line=topic_over_time_percent_line,
            topic_affiliation_count_barplot=topic_affiliation_count_barplot,
            # author_network_filename=author_network_folder / f'author_network{tid}.json',
        )

    @server.route('/document/<did>.html')
    def document_details(did: str):
        did = int(did)
        vector = topic_model.corpus.word_vector_for_document(did)
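        # vector is the document's row of the vectorized corpus (one tf or tf-idf
        # weight per vocabulary word); the loop below pairs each weight with its
        # word and sorts descending so the heaviest-weighted terms come first.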
        word_list = []
        for a_word_id in range(len(vector)):
            word_list.append((topic_model.corpus.word_for_id(a_word_id),
                              round(vector[a_word_id], 3), a_word_id))
        word_list = sorted(word_list, key=lambda x: x[1], reverse=True)
        documents = []
        for another_doc in topic_model.corpus.similar_documents(did, 5):
            documents.append((
                topic_model.corpus.title(another_doc[0]).title(),
                ', '.join(topic_model.corpus.author(another_doc[0])).title(),
                topic_model.corpus.date(another_doc[0]).strftime('%Y-%m-%d'),
                ', '.join(topic_model.corpus.affiliation(
                    another_doc[0])).title(),
                ', '.join(topic_model.corpus.dataset(another_doc[0])).title(),
                another_doc[0],
                round(another_doc[1], 3),
            ), )

        doc_topic_loading_barplot, _ = viz.plotly_doc_topic_loading(
            did,
            rename=rename,
            normalized=True,
            n_words=top_words_description,
            output_type='div',
            savedata=False,
        )

        return render_template(
            'document.html',
            doc_id=did,
            words=word_list[:21],
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            documents=documents,
            title=topic_model.corpus.title(did).title(),
            authors=', '.join(topic_model.corpus.author(did)).title(),
            year=topic_model.corpus.date(did).strftime('%Y-%m-%d'),
            short_content=topic_model.corpus.title(did).title(),
            affiliation=', '.join(topic_model.corpus.affiliation(did)).title(),
            dataset=', '.join(topic_model.corpus.dataset(did)).title(),
            id=topic_model.corpus.id(did),
            full_text=topic_model.corpus.full_text(did),
            doc_topic_loading_barplot=doc_topic_loading_barplot,
        )

    @server.route('/word/<wid>.html')
    def word_details(wid: str):
        wid = int(wid)
        documents = []
        for document_id in topic_model.corpus.docs_for_word(wid, sort=True):
            documents.append((
                topic_model.corpus.title(document_id).title(),
                ', '.join(topic_model.corpus.author(document_id)).title(),
                topic_model.corpus.date(document_id).strftime('%Y-%m-%d'),
                ', '.join(topic_model.corpus.affiliation(document_id)).title(),
                ', '.join(topic_model.corpus.dataset(document_id)).title(),
                document_id,
            ), )

        word_topic_loading_barplot, _ = viz.plotly_word_topic_loading(
            wid,
            rename=rename,
            normalized=True,
            n_words=top_words_description,
            output_type='div',
            savedata=False,
        )

        return render_template(
            'word.html',
            word_id=wid,
            word=topic_model.corpus.word_for_id(wid),
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            documents=documents,
            word_topic_loading_barplot=word_topic_loading_barplot,
        )

    @app.server.route('/favicon.ico')
    def favicon():
        return send_from_directory(static_folder / 'images',
                                   request.path[1:],
                                   mimetype='image/vnd.microsoft.icon')

    @server.route('/robots.txt')
    def robots_txt():
        return send_from_directory(static_folder, request.path[1:])

    # @server.url_defaults
    # def hashed_static_file(endpoint, values):
    #     """Flask: add static file's cache invalidator param (last modified time)
    #     to URLs generated by url_for(). Blueprints aware.
    #     """
    #     if 'static' == endpoint or endpoint.endswith('.static'):
    #         filename = values.get('filename')
    #         if filename:
    #             blueprint = request.blueprint
    #             if '.' in endpoint:  # blueprint
    #                 blueprint = endpoint.rsplit('.', 1)[0]

    #             static_folder = server.static_folder
    #             # use blueprint, but dont set `static_folder` option
    #             if blueprint and server.blueprints[blueprint].static_folder:
    #                 static_folder = server.blueprints[blueprint].static_folder

    #             fp = Path(static_folder, filename)
    #             if fp.exists():
    #                 values['_'] = int(fp.stat().st_mtime)

    return app
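
# A hedged usage sketch (the factory's actual name is defined earlier in this
# file and is assumed here to be create_app): the returned Dash app wraps the
# Flask server and could be launched locally with something like
#
#     app = create_app()
#     app.run_server(host='127.0.0.1', port=8050, debug=False)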
Code example #15
0

# Parameters
max_tf = 0.8
min_tf = 4
num_topics = 8
vectorization = 'tfidf'

# Download stopwords from NLTK
nltk.download('stopwords')

# Load and prepare a corpus
print('Load documents from CSV')
corpus = Corpus(source_file_path='input/help_queue_formatted_TOM.csv',   # This is our own file path
                language='english',  # language for stop words
                vectorization=vectorization,  # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
                max_relative_frequency=max_tf,  # ignore words whose relative frequency is greater than max_relative_frequency
                min_absolute_frequency=min_tf)  # ignore words whose absolute frequency is lower than min_absolute_frequency
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
# print('Estimating the number of topics...')
# viz = Visualization(topic_model)
# viz.plot_greene_metric(min_num_topics=10,
#                        max_num_topics=11,
#                        tao=10, step=1,
#                        top_n_words=10)
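
# Optional: a minimal sketch of using the stability scores from greene_metric()
# directly instead of reading them off the plot. This assumes the model class
# defines greene_metric() (it is part of the tom_lib TopicModel base class) and
# that numpy is imported as np; the k range below is illustrative only.
run_stability_check = False  # flip to True to run the (slow) stability analysis
if run_stability_check:
    min_k, max_k, step_k = 5, 20, 1
    stability = topic_model.greene_metric(min_num_topics=min_k, step=step_k,
                                          max_num_topics=max_k, top_n_words=10,
                                          tao=10)
    # greene_metric returns one mean agreement score per tested k, so the index
    # of the maximum maps back to the most stable number of topics
    best_k = min_k + step_k * int(np.argmax(stability))
    print('Most stable number of topics:', best_k)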