Example #1
 def test_elastic_import(self):
     output_args = {'host': 'localhost',
                    'index': INDEX}
     # import data from file into known elastic server
     read_input('{}/test_data_json_stream.json'.format(
                test_data_path), content_field="abstract",
                output_type=ElasticSearchCorpus.class_key(),
                output_args=output_args, synchronous_wait=30)
     iterable_data = read_input("localhost:9200/"+INDEX, content_field="abstract")
     self.assertEqual(len(iterable_data), 100)
Example #2
 def setUp(self):
     self.test_raw_data = read_input('{}/test_data_json_stream.json'.format(
         test_data_path), content_field="abstract",
         output_type="elasticsearch",
         output_args={'host': 'localhost',
                      'index': INDEX},
         synchronous_wait=30)
Example #3
    def setUp(self):
        self.dictionary_values_simple_test_data_1 = [
                'bending', 'sci', 'forget', 'messi', 'skip',
                'hands', 'focus', 'comply', 'colors', 'planning']

        self.dictionary_values_simple_test_data_json_stream = [
            u'limited', u'consolidated', u'magnetic', u'comparatively',
            u'powders', u'waspaloy', u'tensile', u'assembled', u'relationships',
            u'sfft']

        self.corpus_bow_head_2_simple_test_data_1 = [
            (0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1),
            (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]

        self.corpus_bow_head_2_simple_test_data_json_stream = [
            (0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1),
            (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2),
            (15, 2), (16, 1), (17, 1), (18, 1), (19, 2), (20, 2), (21, 2),
            (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1),
            (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1),
            (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]

        raw_data = read_input(os.path.join(test_data_path,
                                           'test_data_json_stream.json'),
                              content_field="abstract",
                              output_type="dictionary")
        self.processed_data = preprocess(raw_data)
Example #4
 def test_simple_tokenizer(self):
     raw_data = read_input(
             source=self.data_json_stream_path,
             content_field="abstract",
             output_type="dictionary")
     _, text = next(iter(raw_data))
     doc_tokens = tokenizer_methods["simple"](text, min_length=1)
     self.assertEqual(doc_tokens, self.solution_simple_tokenizer_test_data_json_stream)
Example #5
 def test_mixed_tokenizer(self):
     raw_data = read_input(
             source=self.data_json_stream_path,
             content_field="abstract",
             output_type="dictionary")
     entities = collect_entities(raw_data)
     _, text = next(iter(raw_data))
     doc_tokens = tokenizer_methods["mixed"](text, entities)
     self.assertEqual(doc_tokens, self.solution_mixed_tokenizer_test_data_json_stream)
Example #6
 def test_collocations_tokenizer(self):
     raw_data = read_input(
             source=self.data_json_stream_path,
             content_field="abstract",
             output_type="dictionary")
     patterns = collect_bigrams_and_trigrams(raw_data, min_bigram_freq=2, min_trigram_freq=2)
     _, text = next(iter(raw_data))
     doc_tokens = tokenizer_methods["collocation"](text, patterns=patterns)
     self.assertEqual(doc_tokens, self.solution_collocations_tokenizer_test_data_json_stream)
Example #7
 def test_entities_tokenizer_json_stream(self):
     raw_data = read_input(
             source=self.data_json_stream_path,
             content_field="abstract",
             output_type="dictionary")
     entities = find_entities(raw_data, freq_min=1)
     _, text = next(iter(raw_data))
     doc_tokens = tokenizer_methods["entities"](text, entities)
     self.assertEqual(doc_tokens, self.solution_entities_tokenizer_test_data_json_stream)
Example #8
 def test_iter_large_json(self):
     iterable_data = read_input('{}/test_data_large_json.json'.format(test_data_path),
                                content_field="text", json_prefix='item._source.isAuthorOf')
     _, first_text = next(iter(iterable_data))
     self.assertEqual(first_text, self.solution_3)
Example #9
 def test_iter_documents_folder_gz(self):
     loaded_dictionaries = read_input(
         '{}/test_data_folder_files_gz'.format(test_data_path),
         content_field="abstract")
     _, first_text = next(iter(loaded_dictionaries))
     self.assertEqual(first_text, self.solution_1)
Example #10
 def test_read_document_json_stream(self):
     iterable_data = read_input('{}/test_data_json_stream.json'.format(
                                test_data_path), content_field="abstract")
     _, first_text = next(iter(iterable_data))
     self.assertEqual(first_text, self.solution_4)
Example #11
def test_invalid_source_load():
    raw_data = read_input(source=os.path.join(module_path, 'data/test_data_json_stream.json'),
                          content_field="abstract")
    raw_data.save("test_file")
    assert_raises(NameError, load_model, "test_file", "Steve")  # attempt to load a model that we know does not exist
Example #12
 def setUp(self):
     raw_data = read_input(
             source=os.path.join(module_path, 'data/test_data_json_stream.json'),
             content_field="abstract")
     self.digested_data = preprocess(raw_data)
     self.model = registered_models[self.model_name](self.digested_data, ntopics=NTOPICS)
Example #13
 def setUp(self):
     raw_data = read_input(
             source=os.path.join(module_path, 'data/test_data_json_stream.json'),
             content_field="abstract")
     self.digested_data = preprocess(raw_data)
     self.model = self._train_model()
Example #14
 def setUp(self):
     self.test_raw_data = read_input('{}/test_data_json_stream.json'.format(
         test_data_path), content_field="abstract")
Example #15
 def setUp(self):
     test_raw_data = read_input('{}/test_data_json_stream.json'.format(
         test_data_path), content_field="abstract")
     self.processed_data = test_raw_data.tokenize()
Example #16
def run_model(data_source, source_type="auto", year_field=None, start_year=None, stop_year=None,
                content_field=None, clear_es_index=False,
                tokenizer='simple', n_topics=10, dir_path='./topic_model', model='lda_batch', 
                termite_plot=True, output_file=False, r_ldavis=False, json_prefix=None,  
                seed=42, **kwargs):

    """Run your data through all topik functionality and save all results to a specified directory.

    Parameters
    ----------
    data_source: string
        Input data source (e.g. a file, a folder, or a Solr/Elasticsearch instance).

    source_type: {'auto', 'json_stream', 'folder_files', 'json_large', 'solr', 'elastic'}
        The format of your input data (e.g. a JSON stream or a folder containing text files).
        Default is 'auto'.

    year_field: string
        The field name (if any) that contains the year associated with each document (for filtering).

    start_year: int
        Beginning of the range filter on year_field values.

    stop_year: int
        End of the range filter on year_field values.

    content_field: string
        The primary text field to parse.

    clear_es_index: bool
        If True, delete and re-create the destination Elasticsearch index before loading the new
        documents. Otherwise, keep any previously existing documents and just add/update them with
        the new documents.

    tokenizer: {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use. Default is 'simple'.

    n_topics: int
        Number of topics to find in your data

    dir_path: string
        Directory path to store all topic modeling results files. Default is `./topic_model`.

    model: {'lda_batch', 'lda_online'}.
        Statistical modeling algorithm to use. Default 'lda_batch'.

    termite_plot: bool
        Generate termite plot of your model if True. Default is True.

    output_file: bool
        Generate a final summary CSV file of your results, listing for each document: text, tokens,
        lda_probabilities and topic. Default is False.

    r_ldavis: bool
        Generate an interactive data visualization of your topics. Default is False.

    json_prefix: string
        For 'large json' format reader, the prefix value to parse.

    seed: int
        Seed for the random number generator, so results can be reproduced. Default is 42.
    """

    np.random.seed(seed)


    raw_data = read_input(data_source, content_field=content_field,
                          source_type=source_type, **kwargs)
    processed_data = preprocess(raw_data, tokenizer_method=tokenizer, **kwargs)

    # Serialize and store the corpus
    # Create LDA model from corpus and dictionary
    if model == 'lda_batch':
        # To perform LDA in batch mode, set update_every=0 and passes=20
        # https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
        lda = LDA(processed_data, n_topics, update_every=0, passes=20)
    elif model == 'lda_online':
        # To perform LDA in online mode, set update_every, chunksize and passes.
        lda = LDA(processed_data, n_topics, update_every=1,
                  chunksize=10000, passes=1)
    else:
        logging.warning('Invalid model provided; using lda_batch.')
        lda = LDA(processed_data, n_topics, update_every=0, passes=20)
    # Get termite plot for this model
    if termite_plot:
        # Generate the input for the termite plot
        csv_path = os.path.join(dir_path, 'termite.csv')
        lda.termite_data(csv_path)
        termite = Termite(csv_path, "Termite Plot")
        termite.plot(os.path.join(dir_path, 'termite.html'))

    if output_file:
        filtered_documents = raw_data.get_data_by_year(start_year, stop_year, year_field)
        df_results = generate_csv_output_file(filtered_documents, raw_data,
                                              processed_data, lda.model)

    if r_ldavis:
        to_r_ldavis(processed_data, dir_name=os.path.join(dir_path, 'ldavis'), lda=lda)
        os.environ["LDAVIS_DIR"] = os.path.join(dir_path, 'ldavis')
        try:
            subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
        except ValueError:
            logging.warning("Unable to run runLDAvis.R")
        os.chdir(os.path.join(dir_path, 'ldavis', 'output'))
        sp = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'])
        webbrowser.open_new_tab('http://127.0.0.1:8000')
        time.sleep(3)
        sp.kill()
    os.chdir(os.path.dirname(BASEDIR))
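
A minimal usage sketch for run_model, based only on the signature and docstring above; the input path and the 'abstract' content field are illustrative assumptions, not values taken from the library's documentation.

    # Hypothetical invocation: the path and field name are assumptions for illustration
    run_model('./data/test_data_json_stream.json',
              source_type='json_stream',
              content_field='abstract',
              tokenizer='simple',
              n_topics=10,
              dir_path='./topic_model',
              model='lda_batch',
              termite_plot=True,
              output_file=False,
              r_ldavis=False,
              seed=42)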