コード例 #1
0
def run_topic_model(output_dir,
                    n_topics,
                    content_fields,
                    field_filters=None,
                    field_filter_vals=None,
                    seed=42):
    """Train an LDA topic model on bulk-indexed documents and write its artifacts.

    The documents are read with ``read_bulk_index``, tokenized with
    ``EntitiesTokenizer`` and turned into a bag-of-words corpus.  The following
    files are then written below ``output_dir``: ``corpus.dict``,
    ``corpus.mm``, ``model.gensim``, ``termite.csv``, ``termite.html`` and an
    ``ldavis`` directory that is handed to the ``runLDAvis.R`` script.

    Parameters
    ----------
    output_dir: string
        Directory receiving all generated files.  It is not created here, so
        it is expected to exist already.

    n_topics: int
        Number of topics, passed to ``LDA``.

    content_fields:
        Forwarded unchanged to ``read_bulk_index``.

    field_filters:
        Forwarded unchanged to ``read_bulk_index``. Default is None.

    field_filter_vals:
        Forwarded unchanged to ``read_bulk_index``. Default is None.

    seed: int
        Seed for numpy's global random number generator. Default 42.

    Notes
    -----
    Sets the ``LDAVIS_DIR`` environment variable of the current process.
    """
    np.random.seed(seed)

    documents = read_bulk_index(elastic + "original/", content_fields,
                                field_filters, field_filter_vals)

    # The tokenizer receives a generator of strings (content for each doc).
    corpus = EntitiesTokenizer(documents)
    corpus_bow = CorpusBOW(corpus)

    dict_path = os.path.join(output_dir, 'corpus.dict')
    corpus_path = os.path.join(output_dir, 'corpus.mm')
    corpus_bow.save_dict(dict_path)
    # Serialize and store the corpus
    corpus_bow.serialize(corpus_path)

    # Create LDA model from the stored corpus and dictionary
    topik_lda = LDA(corpus_path, dict_path, n_topics,
                    update_every=1,
                    passes=5)
    topik_lda.save(os.path.join(output_dir, 'model.gensim'))

    # Generate the input for the termite plot, then the plot itself
    termite_csv = os.path.join(output_dir, 'termite.csv')
    topik_lda.termite_data(termite_csv)
    termite = Termite(termite_csv, "Termite Plot")
    termite.plot(os.path.join(output_dir, 'termite.html'))

    # NOTE(review): `documents` was already handed to the tokenizer above; if
    # it is a one-shot generator it may be exhausted by now -- confirm.
    generate_csv_output_file(documents, corpus, corpus_bow, topik_lda.model)

    ldavis_dir = os.path.join(output_dir, 'ldavis')
    to_r_ldavis(corpus_bow, dir_name=ldavis_dir, lda=topik_lda)
    # The environment variable is set right before the R script is launched.
    os.environ["LDAVIS_DIR"] = ldavis_dir
    try:
        return_code = subprocess.call(
            ['Rscript',
             os.path.join(BASEDIR, 'topic-space/R/runLDAvis.R')])
    except (OSError, ValueError):
        # OSError is what subprocess raises when `Rscript` cannot be executed
        # (e.g. R is not installed); ValueError covers invalid arguments.
        logging.warning("Unable to run runLDAvis.R")
    else:
        if return_code != 0:
            logging.warning("runLDAvis.R exited with status %s", return_code)
コード例 #2
0
def run_topic_model(field, subfield, output_dir, n_topics, seed=42):
    """Train an online LDA topic model on an Elasticsearch query result.

    Documents come from ``iter_elastic_query(ES_INSTANCE, ES_INDEX, field,
    subfield)`` and are tokenized with ``EntitiesTokenizer``.  The files
    ``corpus.dict``, ``corpus.mm``, ``termite.csv`` and ``termite.html`` plus
    an ``ldavis`` directory are written below ``output_dir``, and the
    ``runLDAvis.R`` script is launched afterwards.

    Parameters
    ----------
    field:
        Forwarded unchanged to ``iter_elastic_query``.

    subfield:
        Forwarded unchanged to ``iter_elastic_query``.

    output_dir: string
        Directory receiving all generated files.  An existing directory at
        this path is deleted and recreated empty.

    n_topics: int
        Number of topics, passed to ``LDA``.

    seed: int
        Seed for numpy's global random number generator. Default 42.

    Notes
    -----
    Sets the ``LDAVIS_DIR`` environment variable of the current process.
    """
    np.random.seed(seed)

    documents = iter_elastic_query(ES_INSTANCE, ES_INDEX, field, subfield)
    corpus = EntitiesTokenizer(documents)

    # Start from an empty output directory.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    dict_path = os.path.join(output_dir, 'corpus.dict')
    corpus_path = os.path.join(output_dir, 'corpus.mm')

    # Create dictionary
    corpus_bow = CorpusBOW(corpus)
    corpus_bow.save_dict(dict_path)
    # Serialize and store the corpus
    corpus_bow.serialize(corpus_path)

    # Create LDA model from corpus and dictionary.  The keyword is spelled
    # `chunksize` (it was misspelled `chuncksize` before).
    lda = LDA(corpus_path, dict_path, n_topics,
              update_every=1,
              chunksize=10000,
              passes=1)

    # Generate the input for the termite plot, then the plot itself
    termite_csv = os.path.join(output_dir, 'termite.csv')
    lda.termite_data(termite_csv)
    termite = Termite(termite_csv, "Termite Plot")
    termite.plot(os.path.join(output_dir, 'termite.html'))

    # NOTE(review): `documents` was already handed to the tokenizer above; if
    # it is a one-shot generator it may be exhausted by now -- confirm.
    generate_csv_output_file(documents, corpus, corpus_bow, lda.model)

    ldavis_dir = os.path.join(output_dir, 'ldavis')
    to_r_ldavis(corpus_bow, dir_name=ldavis_dir, lda=lda)
    # The environment variable is set right before the R script is launched.
    os.environ["LDAVIS_DIR"] = ldavis_dir
    try:
        return_code = subprocess.call(
            ['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
    except (OSError, ValueError):
        # OSError is what subprocess raises when `Rscript` cannot be executed
        # (e.g. R is not installed); ValueError covers invalid arguments.
        logging.warning("Unable to run runLDAvis.R")
    else:
        if return_code != 0:
            logging.warning("runLDAvis.R exited with status %s", return_code)
コード例 #3
0
ファイル: models.py プロジェクト: adeze/topic_space
def run_topic_model(field, subfield, output_dir, n_topics, seed=42):
    """Build an online LDA model from Elasticsearch documents and plot it.

    The function queries ``iter_elastic_query(ES_INSTANCE, ES_INDEX, field,
    subfield)``, tokenizes the result with ``EntitiesTokenizer`` and stores
    ``corpus.dict``, ``corpus.mm``, ``termite.csv``, ``termite.html`` and an
    ``ldavis`` directory under ``output_dir``.  Finally it launches the
    ``runLDAvis.R`` script through ``Rscript``.

    Parameters
    ----------
    field:
        Forwarded unchanged to ``iter_elastic_query``.

    subfield:
        Forwarded unchanged to ``iter_elastic_query``.

    output_dir: string
        Destination directory.  If it already exists it is removed, then it
        is created again empty.

    n_topics: int
        Number of topics, passed to ``LDA``.

    seed: int
        Seed for numpy's global random number generator. Default 42.

    Notes
    -----
    Sets the ``LDAVIS_DIR`` environment variable of the current process.
    """
    np.random.seed(seed)

    documents = iter_elastic_query(ES_INSTANCE, ES_INDEX, field, subfield)
    corpus = EntitiesTokenizer(documents)

    # Start from an empty output directory.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    dict_path = os.path.join(output_dir, 'corpus.dict')
    corpus_path = os.path.join(output_dir, 'corpus.mm')

    # Create dictionary
    corpus_bow = CorpusBOW(corpus)
    corpus_bow.save_dict(dict_path)
    # Serialize and store the corpus
    corpus_bow.serialize(corpus_path)

    # Create LDA model from corpus and dictionary.  The keyword is spelled
    # `chunksize` (it was misspelled `chuncksize` before).
    lda = LDA(corpus_path, dict_path, n_topics,
              update_every=1, chunksize=10000, passes=1)

    # Generate the input for the termite plot, then the plot itself
    termite_csv = os.path.join(output_dir, 'termite.csv')
    lda.termite_data(termite_csv)
    termite = Termite(termite_csv, "Termite Plot")
    termite.plot(os.path.join(output_dir, 'termite.html'))

    # NOTE(review): `documents` was already handed to the tokenizer above; if
    # it is a one-shot generator it may be exhausted by now -- confirm.
    generate_csv_output_file(documents, corpus, corpus_bow, lda.model)

    ldavis_dir = os.path.join(output_dir, 'ldavis')
    to_r_ldavis(corpus_bow, dir_name=ldavis_dir, lda=lda)
    # The environment variable is set right before the R script is launched.
    os.environ["LDAVIS_DIR"] = ldavis_dir
    try:
        return_code = subprocess.call(
            ['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
    except (OSError, ValueError):
        # OSError is what subprocess raises when `Rscript` cannot be executed
        # (e.g. R is not installed); ValueError covers invalid arguments.
        logging.warning("Unable to run runLDAvis.R")
    else:
        if return_code != 0:
            logging.warning("runLDAvis.R exited with status %s", return_code)
コード例 #4
0
ファイル: run.py プロジェクト: asmeurer/topik
def run_model(data, format='json_stream', tokenizer='simple', n_topics=10, dir_path='./topic_model',
                    model='lda_batch', termite_plot=True, output_file=False, r_ldavis=False,  prefix_value=None,
                    event_value=None, field=None, query='*:*', index=None, subfield=None, seed=42):
    """Run your data through all topik functionality and save all results to a specified directory.

    Parameters
    ----------
    data: string
        Input data (e.g. file or folder or solr/elasticsearch instance).

    format: {'json_stream', 'folder_files', 'large_json', 'solr', 'elastic'}.
        The format of your data input. Default is 'json_stream'.

    tokenizer: {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use. Default is 'simple'. Any other value
        prints a notice and falls back to the simple tokenizer.

    n_topics: int
        Number of topics to find in your data. Default is 10.

    dir_path: string
        Directory path to store all topic modeling results files. Default is `./topic_model`.
        An existing directory at this path is deleted and recreated empty.

    model: {'lda_batch', 'lda_online'}.
        Statistical modeling algorithm to use. Default 'lda_batch'. Any other
        value logs a warning and uses the 'lda_batch' settings.

    termite_plot: bool
        Generate termite plot of your model if True. Default is True.
        The `termite.csv` input file is written regardless of this flag.

    output_file: bool
        Generate a final summary csv file of your results. For each document: text, tokens, lda_probabilities and topic.
        Default is False.

    r_ldavis: bool
        Generate an interactive data visualization of your topics. Default is False.

    prefix_value: string
        For 'large_json' format reader, the prefix value to parse. Required for that format.

    event_value: string
        For 'large_json' format reader, the event value to parse. Required for that format.

    field: string
        For 'json_stream', 'solr' or 'elastic' format readers, the field to parse.
        Required for those formats.

    query: string
        For 'solr' format reader, an optional query. Default is '*:*' to retrieve all documents.

    index:
        For 'elastic' format reader, forwarded to `iter_elastic_query`.

    subfield:
        For 'elastic' format reader, forwarded to `iter_elastic_query`.

    seed: int
        Set random number generator to seed, to be able to reproduce results. Default 42.

    Raises
    ------
    ValueError
        If `format` is unknown or the arguments required by it are missing.

    """
    def _load_documents():
        # Build a fresh document iterator for the requested input format.
        if format == 'folder_files':
            return iter_documents_folder(data)
        if format == 'large_json' and prefix_value is not None and event_value is not None:
            return iter_large_json(data, prefix_value, event_value)
        if format == 'json_stream' and field is not None:
            return iter_document_json_stream(data, field)
        if format == 'solr' and field is not None:
            return iter_solr_query(data, field, query=query)
        if format == 'elastic' and field is not None:
            return iter_elastic_query(data, index, field, subfield)
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError("Invalid input, make sure your passing the appropriate arguments for the different formats")

    np.random.seed(seed)

    documents = _load_documents()

    if tokenizer == 'simple':
        corpus = SimpleTokenizer(documents)
    elif tokenizer == 'collocations':
        corpus = CollocationsTokenizer(documents)
    elif tokenizer == 'entities':
        corpus = EntitiesTokenizer(documents)
    elif tokenizer == 'mixed':
        corpus = MixedTokenizer(documents)
    else:
        print("Processing value invalid, using simple")
        corpus = SimpleTokenizer(documents)

    # Start from an empty results directory.
    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)

    dict_path = os.path.join(dir_path, 'corpus.dict')
    corpus_path = os.path.join(dir_path, 'corpus.mm')

    # Create dictionary
    corpus_bow = CorpusBOW(corpus)
    corpus_bow.save_dict(dict_path)
    # Serialize and store the corpus
    corpus_bow.serialize(corpus_path)

    # Create LDA model from corpus and dictionary
    if model == 'lda_online':
        # To perform lda in online mode set variables update_every, chunksize and passes.
        # The keyword is spelled `chunksize` (it was misspelled `chuncksize` before).
        lda_options = dict(update_every=1, chunksize=10000, passes=1)
    else:
        if model != 'lda_batch':
            logging.warning('model provided not valid. Using lda_batch.')
        # To perform lda in batch mode set update_every=0 and passes=20)
        # https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
        lda_options = dict(update_every=0, passes=20)
    lda = LDA(corpus_path, dict_path, n_topics, **lda_options)

    # Generate the input for the termite plot
    termite_csv = os.path.join(dir_path, 'termite.csv')
    lda.termite_data(termite_csv)
    # Get termite plot for this model
    if termite_plot:
        termite = Termite(termite_csv, "Termite Plot")
        termite.plot(os.path.join(dir_path, 'termite.html'))

    if output_file:
        # Re-read the input with the same reader that was used for training;
        # previously 'solr' and 'elastic' sources were sent to the json_stream
        # reader here.
        documents = _load_documents()
        generate_csv_output_file(documents, corpus, corpus_bow, lda.model)

    if r_ldavis:
        ldavis_dir = os.path.join(dir_path, 'ldavis')
        to_r_ldavis(corpus_bow, dir_name=ldavis_dir, lda=lda)
        # The environment variable is set right before the R script is launched.
        os.environ["LDAVIS_DIR"] = ldavis_dir
        try:
            return_code = subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
        except (OSError, ValueError):
            # OSError is what subprocess raises when `Rscript` cannot be
            # executed (e.g. R is not installed).
            logging.warning("Unable to run runLDAvis.R")
        else:
            if return_code != 0:
                logging.warning("runLDAvis.R exited with status %s", return_code)
        # Serve the generated output for 30 seconds.  The server is started
        # with `cwd=` instead of os.chdir() so that this process keeps its
        # working directory (a relative `dir_path` stays valid on later calls).
        server = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'],
                                  cwd=os.path.join(ldavis_dir, 'output'))
        try:
            webbrowser.open_new_tab('127.0.0.1:8000')
            time.sleep(30)
        finally:
            # Never leave the child HTTP server running, even on errors.
            server.kill()
コード例 #5
0
def run_model(data,
              format='json_stream',
              tokenizer='simple',
              n_topics=10,
              dir_path='./topic_model',
              model='lda_batch',
              termite_plot=True,
              output_file=False,
              r_ldavis=False,
              prefix_value=None,
              event_value=None,
              field=None,
              query='*:*',
              index=None,
              subfield=None,
              seed=42):
    """Run your data through all topik functionality and save all results to a specified directory.

    Parameters
    ----------
    data: string
        Input data (e.g. file or folder or solr/elasticsearch instance).

    format: {'json_stream', 'folder_files', 'large_json', 'solr', 'elastic'}.
        The format of your data input. Default is 'json_stream'.

    tokenizer: {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use. Default is 'simple'. Any other value
        prints a notice and falls back to the simple tokenizer.

    n_topics: int
        Number of topics to find in your data. Default is 10.

    dir_path: string
        Directory path to store all topic modeling results files. Default is `./topic_model`.
        An existing directory at this path is deleted and recreated empty.

    model: {'lda_batch', 'lda_online'}.
        Statistical modeling algorithm to use. Default 'lda_batch'. Any other
        value logs a warning and uses the 'lda_batch' settings.

    termite_plot: bool
        Generate termite plot of your model if True. Default is True.
        The `termite.csv` input file is written regardless of this flag.

    output_file: bool
        Generate a final summary csv file of your results. For each document: text, tokens, lda_probabilities and topic.
        Default is False.

    r_ldavis: bool
        Generate an interactive data visualization of your topics. Default is False.

    prefix_value: string
        For 'large_json' format reader, the prefix value to parse. Required for that format.

    event_value: string
        For 'large_json' format reader, the event value to parse. Required for that format.

    field: string
        For 'json_stream', 'solr' or 'elastic' format readers, the field to parse.
        Required for those formats.

    query: string
        For 'solr' format reader, an optional query. Default is '*:*' to retrieve all documents.

    index:
        For 'elastic' format reader, forwarded to `iter_elastic_query`.

    subfield:
        For 'elastic' format reader, forwarded to `iter_elastic_query`.

    seed: int
        Set random number generator to seed, to be able to reproduce results. Default 42.

    Raises
    ------
    ValueError
        If `format` is unknown or the arguments required by it are missing.

    """
    def _load_id_documents():
        # Build a fresh reader result for the requested input format; the
        # result is split into ids and documents with `unzip` by the caller.
        if format == 'folder_files':
            return iter_documents_folder(data)
        if format == 'large_json' and prefix_value is not None and event_value is not None:
            return iter_large_json(data, prefix_value, event_value)
        if format == 'json_stream' and field is not None:
            return iter_document_json_stream(data, field)
        if format == 'solr' and field is not None:
            return iter_solr_query(data, field, query=query)
        if format == 'elastic' and field is not None:
            return iter_elastic_query(data, index, field, subfield)
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError(
            "Invalid input, make sure your passing the appropriate arguments for the different formats"
        )

    np.random.seed(seed)

    ids, documents = unzip(_load_id_documents())

    if tokenizer == 'simple':
        corpus = SimpleTokenizer(documents)
    elif tokenizer == 'collocations':
        corpus = CollocationsTokenizer(documents)
    elif tokenizer == 'entities':
        corpus = EntitiesTokenizer(documents)
    elif tokenizer == 'mixed':
        corpus = MixedTokenizer(documents)
    else:
        print("Processing value invalid, using simple")
        corpus = SimpleTokenizer(documents)

    # Start from an empty results directory.
    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)

    dict_path = os.path.join(dir_path, 'corpus.dict')
    corpus_path = os.path.join(dir_path, 'corpus.mm')

    # Create dictionary
    corpus_bow = CorpusBOW(corpus)
    corpus_bow.save_dict(dict_path)
    # Serialize and store the corpus
    corpus_bow.serialize(corpus_path)

    # Create LDA model from corpus and dictionary
    if model == 'lda_online':
        # To perform lda in online mode set variables update_every, chunksize and passes.
        lda_options = dict(update_every=1, chunksize=10000, passes=1)
    else:
        if model != 'lda_batch':
            logging.warning('model provided not valid. Using lda_batch.')
        # To perform lda in batch mode set update_every=0 and passes=20)
        # https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
        lda_options = dict(update_every=0, passes=20)
    lda = LDA(corpus_path, dict_path, n_topics, **lda_options)

    # Generate the input for the termite plot
    termite_csv = os.path.join(dir_path, 'termite.csv')
    lda.termite_data(termite_csv)
    # Get termite plot for this model
    if termite_plot:
        termite = Termite(termite_csv, "Termite Plot")
        termite.plot(os.path.join(dir_path, 'termite.html'))

    if output_file:
        # Re-read the input with the same reader that was used for training;
        # previously 'solr' and 'elastic' sources were sent to the json_stream
        # reader here.
        ids, documents = unzip(_load_id_documents())
        generate_csv_output_file(documents, corpus, corpus_bow, lda.model)

    if r_ldavis:
        ldavis_dir = os.path.join(dir_path, 'ldavis')
        to_r_ldavis(corpus_bow, dir_name=ldavis_dir, lda=lda)
        # The environment variable is set right before the R script is launched.
        os.environ["LDAVIS_DIR"] = ldavis_dir
        try:
            return_code = subprocess.call(
                ['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
        except (OSError, ValueError):
            # OSError is what subprocess raises when `Rscript` cannot be
            # executed (e.g. R is not installed).
            logging.warning("Unable to run runLDAvis.R")
        else:
            if return_code != 0:
                logging.warning("runLDAvis.R exited with status %s", return_code)
        # Serve the generated output for 30 seconds.  The server is started
        # with `cwd=` instead of os.chdir() so that this process keeps its
        # working directory (a relative `dir_path` stays valid on later calls).
        server = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'],
                                  cwd=os.path.join(ldavis_dir, 'output'))
        try:
            webbrowser.open_new_tab('127.0.0.1:8000')
            time.sleep(30)
        finally:
            # Never leave the child HTTP server running, even on errors.
            server.kill()
コード例 #6
0
ファイル: run.py プロジェクト: lewismc/topik
def run_model(data_source, source_type="auto", year_field=None, start_year=None, stop_year=None,
                content_field=None, clear_es_index=False,
                tokenizer='simple', n_topics=10, dir_path='./topic_model', model='lda_batch', 
                termite_plot=True, output_file=False, r_ldavis=False, json_prefix=None,  
                seed=42, **kwargs):

    """Run your data through all topik functionality and save all results to a specified directory.

    Parameters
    ----------
    data_source: string
        Input data (e.g. file or folder or solr/elasticsearch instance).

    source_type: string
        The format of your data input, forwarded to `read_input`.
        Default is 'auto'.

    year_field: string
        The field name (if any) that contains the year associated with each document (for filtering).
        Only used when `output_file` is True.

    start_year: int
        For beginning of range filter on year_field values.
        Only used when `output_file` is True.

    stop_year: int
        For end of range filter on year_field values.
        Only used when `output_file` is True.

    content_field: string
        The primary text field to parse.

    clear_es_index: bool
        On true, delete and re-create destination elasticsearch index prior to loading in new documents.  Otherwise leave any previously
        existing documents and just add/update with the new documents.
        NOTE(review): this argument is not referenced in this function body -- confirm where it is meant to be applied.

    tokenizer: {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use, forwarded to `preprocess`. Default is 'simple'.

    n_topics: int
        Number of topics to find in your data. Default is 10.

    dir_path: string
        Directory path to store all topic modeling results files. Default is `./topic_model`.

    model: {'lda_batch', 'lda_online'}.
        Statistical modeling algorithm to use. Default 'lda_batch'. Any other
        value logs a warning and uses the 'lda_batch' settings.

    termite_plot: bool
        Generate termite plot of your model if True. Default is True.

    output_file: bool
        Generate a final summary csv file of your results. For each document: text, tokens, lda_probabilities and topic.
        Default is False.

    r_ldavis: bool
        Generate an interactive data visualization of your topics. Default is False.

    json_prefix: string
        For 'large json' format reader, the prefix value to parse.
        NOTE(review): this argument is not referenced in this function body -- confirm where it is meant to be applied.

    seed: int
        Set random number generator to seed, to be able to reproduce results. Default 42.

    **kwargs:
        Extra keyword arguments, forwarded to both `read_input` and `preprocess`.

    Notes
    -----
    Before returning, the working directory of the process is changed to
    `os.path.dirname(BASEDIR)`.
    """

    np.random.seed(seed)

    raw_data = read_input(data_source, content_field=content_field,
                          source_type=source_type, **kwargs)
    processed_data = preprocess(raw_data, tokenizer_method=tokenizer, **kwargs)

    # Create LDA model from the processed data
    if model == 'lda_online':
        # To perform lda in online mode set variables update_every, chunksize and passes.
        lda_options = dict(update_every=1, chunksize=10000, passes=1)
    else:
        if model != 'lda_batch':
            logging.warning('model provided not valid. Using lda_batch.')
        # To perform lda in batch mode set update_every=0 and passes=20)
        # https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
        lda_options = dict(update_every=0, passes=20)
    lda = LDA(processed_data, n_topics, **lda_options)

    # Get termite plot for this model
    if termite_plot:
        # Generate the input for the termite plot
        csv_path = os.path.join(dir_path, 'termite.csv')
        lda.termite_data(csv_path)
        termite = Termite(csv_path, "Termite Plot")
        termite.plot(os.path.join(dir_path, 'termite.html'))

    if output_file:
        filtered_documents = raw_data.get_data_by_year(start_year, stop_year, year_field)
        generate_csv_output_file(filtered_documents, raw_data,
                                 processed_data, lda.model)

    if r_ldavis:
        ldavis_dir = os.path.join(dir_path, 'ldavis')
        to_r_ldavis(processed_data, dir_name=ldavis_dir, lda=lda)
        # The environment variable is set right before the R script is launched.
        os.environ["LDAVIS_DIR"] = ldavis_dir
        try:
            return_code = subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
        except (OSError, ValueError):
            # OSError is what subprocess raises when `Rscript` cannot be
            # executed (e.g. R is not installed).
            logging.warning("Unable to run runLDAvis.R")
        else:
            if return_code != 0:
                logging.warning("runLDAvis.R exited with status %s", return_code)
        # Serve the generated output for 3 seconds.  The server gets its
        # directory through `cwd=`; the process-wide working directory is set
        # once, at the end of the function.
        server = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'],
                                  cwd=os.path.join(ldavis_dir, 'output'))
        try:
            webbrowser.open_new_tab('127.0.0.1:8000')
            time.sleep(3)
        finally:
            # Never leave the child HTTP server running, even on errors.
            server.kill()
    os.chdir(os.path.dirname(BASEDIR))