Example 1
    def test_4_convert_encoded_edges_count_for_undirected_graph(self):
        def equal_test(word1, word2):
            key_forward = (str(word2id[word1]), str(word2id[word2]))
            key_backward = (str(word2id[word2]), str(word2id[word1]))
            # The undirected count must equal the sum of both directed
            # counts; .get(..., 0) covers directions that never occurred.
            sum_count = directed.get(key_forward, 0) + directed.get(key_backward, 0)

            if key_forward in undirected:
                self.assertEqual(sum_count, undirected[key_forward])
            elif key_backward in undirected:
                self.assertEqual(sum_count, undirected[key_backward])
            else:
                print('No direct edge between ' + word1 + ' and ' + word2)
                self.assertEqual(sum_count, 0)

        wpp = WordPairsProcessing(max_vocab_size=None, min_count=self.min_count,
                                  dicts_folder=self.dicts_folder, window_size=50,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        directed = wpp.apply(process_num=self.process_num)

        word2id = util.read_two_columns_file_to_build_dictionary_type_specified(
            file=self.dicts_folder + 'dict_merged.txt', key_type=str,
            value_type=int)

        undirected = wpp.convert_encoded_edges_count_for_undirected_graph(
            old_encoded_edges_count_path=self.graph_folder
            + "encoded_edges_count_window_size_" + str(self.max_window_size) + ".txt")

        equal_test('and', ',')
        equal_test('the', '.')
        equal_test('and', '.')
        equal_test('and', 'of')
        equal_test('in', 'of')
        equal_test('.', 'in')
        equal_test('.', ',')  # . and , are not directly connected.
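
The invariant this test checks can be restated with plain dictionaries: converting to an undirected graph should merge the counts of both directed edge directions. A minimal self-contained sketch, where the toy directed dict and the merge_to_undirected helper are illustrative and not part of corpus2graph:

def merge_to_undirected(directed):
    # Merge (a, b) and (b, a) counts under one canonical key.
    undirected = {}
    for (a, b), count in directed.items():
        key = (a, b) if a <= b else (b, a)
        undirected[key] = undirected.get(key, 0) + count
    return undirected

directed = {('1', '2'): 3, ('2', '1'): 2, ('1', '3'): 4}
undirected = merge_to_undirected(directed)
assert undirected[('1', '2')] == 5  # both directions merged: 3 + 2
assert undirected[('1', '3')] == 4  # only one direction existed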
Example 2
wp = WordProcessing(output_folder=dicts_folder,
                    word_tokenizer='',
                    wtokenizer=Tokenizer.mytok,
                    remove_numbers=False,
                    remove_punctuations=False,
                    stem_word=False,
                    lowercase=False)
merged_dict = wp.apply(data_folder=data_folder, process_num=process_num)

sp = SentenceProcessing(
    dicts_folder=dicts_folder,
    output_folder=edges_folder,
    max_window_size=max_window_size,
    local_dict_extension=config['graph']['local_dict_extension'])
word_count_all = sp.apply(data_folder=dicts_folder, process_num=process_num)

wpp = WordPairsProcessing(max_vocab_size=max_vocab_size,
                          min_count=min_count,
                          dicts_folder=dicts_folder,
                          window_size=max_window_size,
                          edges_folder=edges_folder,
                          graph_folder=graph_folder,
                          safe_files_number_per_processor=config['graph']
                          ['safe_files_number_per_processor'])
result = wpp.apply(process_num=process_num)

# igt = networkx_wrapper.IGraphWrapper('Test')
# igt.add_edges_from_file(path=graph_folder+'encoded_edges_count_window_size_5.txt')
print('[corpus2graph] time in seconds:', util.count_time(start_time))
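# Load the encoded edge list into a graph-tool graph; the second timing
# print below includes this step.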
gtw = graph_tool_wrapper.GraphToolWrapper('Test')
gtw.addEdgesFromFile(path=graph_folder +
                     'encoded_edges_count_window_size_5.txt')
print('[corpus2graph] time in seconds:', util.count_time(start_time))
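
The encoded_edges_count_window_size_5.txt file loaded above is written by WordPairsProcessing. A hedged reader sketch, assuming a whitespace-separated "source target count" line format (the helper is illustrative, not a corpus2graph API):

def read_encoded_edges(path):
    """Read an encoded edges file into a {(source, target): count} dict.

    Assumes one edge per line as 'source target count'; adjust the
    parsing if the actual corpus2graph output differs.
    """
    edges = {}
    with open(path) as f:
        for line in f:
            source, target, count = line.split()
            edges[(source, target)] = int(count)
    return edges

# edges = read_encoded_edges(graph_folder + 'encoded_edges_count_window_size_5.txt')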
Example 3
    def test_3_word_pairs_processing(self):
        # test valid vocabulary
        wpp = WordPairsProcessing(max_vocab_size=None, min_count=1,
                                  dicts_folder=self.dicts_folder, window_size=self.max_window_size,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        valid_vocab = wpp.write_valid_vocabulary()
        self.assertEqual(len(valid_vocab), 94)

        wpp = WordPairsProcessing(max_vocab_size=None, min_count=3,
                                  dicts_folder=self.dicts_folder, window_size=self.max_window_size,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        valid_vocab = wpp.write_valid_vocabulary()
        self.assertEqual(len(valid_vocab), 9)

        wpp = WordPairsProcessing(max_vocab_size=self.max_vocab_size, min_count=self.min_count,
                                  dicts_folder=self.dicts_folder, window_size=self.max_window_size,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        valid_vocab = wpp.write_valid_vocabulary()
        self.assertEqual(len(valid_vocab), self.max_vocab_size)

        wpp = WordPairsProcessing(max_vocab_size=None, min_count=self.min_count,
                                  dicts_folder=self.dicts_folder, window_size=self.max_window_size,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        valid_vocab = wpp.write_valid_vocabulary()
        self.assertEqual(len(valid_vocab), 6)

        # test word pairs of a specific window size
        wpp = WordPairsProcessing(max_vocab_size=None, min_count=self.min_count,
                                  dicts_folder=self.dicts_folder, window_size=self.max_window_size,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        result = wpp.apply(process_num=self.process_num)
        word2id = util.read_two_columns_file_to_build_dictionary_type_specified(
            file=self.dicts_folder + 'dict_merged.txt', key_type=str, value_type=int)

        self.assertEqual(result[(str(word2id['and']), str(word2id[',']))], 2)
        self.assertEqual(result[(str(word2id['and']), str(word2id['.']))], 2)
        self.assertEqual(result[(str(word2id['and']), str(word2id['the']))], 1)

        self.assertEqual(result[(str(word2id['the']), str(word2id['of']))], 6)
        self.assertEqual(result[(str(word2id['the']), str(word2id['.']))], 2)
        self.assertEqual(result[(str(word2id['the']), str(word2id['and']))], 3)
        self.assertEqual(result[(str(word2id['the']), str(word2id['in']))], 1)
        self.assertEqual(result[(str(word2id['the']), str(word2id[',']))], 2)

        self.assertEqual(result[(str(word2id['of']), str(word2id['.']))], 3)
        self.assertEqual(result[(str(word2id['of']), str(word2id['the']))], 2)
        self.assertEqual(result[(str(word2id['of']), str(word2id['and']))], 3)
        self.assertEqual(result[(str(word2id['of']), str(word2id['in']))], 2)
        self.assertEqual(result[(str(word2id['of']), str(word2id[',']))], 1)

        self.assertEqual(result[(str(word2id['in']), str(word2id['.']))], 1)
        self.assertEqual(result[(str(word2id['in']), str(word2id['the']))], 5)
        self.assertEqual(result[(str(word2id['in']), str(word2id['and']))], 1)
        self.assertEqual(result[(str(word2id['in']), str(word2id[',']))], 1)

        self.assertEqual(result[(str(word2id[',']), str(word2id['and']))], 2)
        self.assertEqual(result[(str(word2id[',']), str(word2id['in']))], 1)
        self.assertEqual(result[(str(word2id[',']), str(word2id['the']))], 1)

        self.assertEqual(len(result), 20 + 3)  # 3 self loops

        wpp = WordPairsProcessing(max_vocab_size=self.max_vocab_size, min_count=self.min_count,
                                  dicts_folder=self.dicts_folder, window_size=self.max_window_size,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        result = wpp.apply(process_num=self.process_num)
        self.assertEqual(result[(str(word2id['and']), str(word2id['the']))], 1)

        self.assertEqual(result[(str(word2id['the']), str(word2id['of']))], 6)
        self.assertEqual(result[(str(word2id['the']), str(word2id['and']))], 3)

        self.assertEqual(result[(str(word2id['of']), str(word2id['the']))], 2)
        self.assertEqual(result[(str(word2id['of']), str(word2id['and']))], 3)

        self.assertEqual(len(result), 5 + 2)  # 2 self loops
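
The write_valid_vocabulary calls at the top of this test exercise two filters: drop words occurring fewer than min_count times, then, when max_vocab_size is set, keep only the most frequent words. A toy sketch of that selection logic under those assumptions; the word_counts dict and valid_vocabulary helper are illustrative, and the real method also writes its result to disk:

def valid_vocabulary(word_counts, min_count=1, max_vocab_size=None):
    # Keep words occurring at least min_count times ...
    vocab = [w for w, c in word_counts.items() if c >= min_count]
    # ... then, if a cap is given, keep only the most frequent ones.
    if max_vocab_size is not None:
        vocab.sort(key=lambda w: word_counts[w], reverse=True)
        vocab = vocab[:max_vocab_size]
    return vocab

word_counts = {'the': 9, 'of': 7, 'and': 5, 'in': 4, ',': 3, '.': 3, 'graph': 1}
assert len(valid_vocabulary(word_counts, min_count=1)) == 7
assert len(valid_vocabulary(word_counts, min_count=3)) == 6
assert len(valid_vocabulary(word_counts, min_count=3, max_vocab_size=3)) == 3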
Example 4
def main():
    arguments = docopt(__doc__, version='1.0.0')

    data_folder = arguments['<data_dir>']
    if not data_folder.endswith('/'):
        data_folder += '/'
    output_folder = arguments['<output_dir>']
    if not output_folder.endswith('/'):
        output_folder += '/'
    dicts_folder = output_folder + 'dicts_and_encoded_texts/'
    edges_folder = output_folder + 'edges/'
    graph_folder = output_folder + 'graph/'

    util.mkdir_p(output_folder)
    util.mkdir_p(dicts_folder)
    util.mkdir_p(edges_folder)
    util.mkdir_p(graph_folder)

    max_window_size = int(arguments['--max_window_size'])
    process_num = int(arguments['--process_num'])
    min_count = int(arguments['--min_count'])
    max_vocab_size = int(arguments['--max_vocab_size'])
    safe_files_number_per_processor = int(
        arguments['--safe_files_number_per_processor'])

    if arguments['all']:
        start_time = time.time()
        wp = WordProcessing(output_folder=dicts_folder,
                            word_tokenizer='',
                            wtokenizer=Tokenizer.mytok,
                            remove_numbers=False,
                            remove_punctuations=False,
                            stem_word=False,
                            lowercase=False)
        merged_dict = wp.apply(data_folder=data_folder,
                               process_num=process_num)
        sp = SentenceProcessing(dicts_folder=dicts_folder,
                                output_folder=edges_folder,
                                max_window_size=max_window_size,
                                local_dict_extension='.dicloc')
        word_count_all = sp.apply(data_folder=dicts_folder,
                                  process_num=process_num)
        wpp = WordPairsProcessing(
            max_vocab_size=max_vocab_size,
            min_count=min_count,
            dicts_folder=dicts_folder,
            window_size=max_window_size,
            edges_folder=edges_folder,
            graph_folder=graph_folder,
            safe_files_number_per_processor=safe_files_number_per_processor)
        result = wpp.apply(process_num=process_num)
        # wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
        print('time in seconds:', util.count_time(start_time))

    if arguments['wordprocessing']:
        start_time = time.time()
        wp = WordProcessing(output_folder=dicts_folder,
                            word_tokenizer='',
                            wtokenizer=Tokenizer.mytok,
                            remove_numbers=False,
                            remove_punctuations=False,
                            stem_word=False,
                            lowercase=False)
        merged_dict = wp.apply(data_folder=data_folder,
                               process_num=process_num)
        print('time for word processing in seconds:',
              util.count_time(start_time))

    if arguments['sentenceprocessing']:
        start_time = time.time()
        sp = SentenceProcessing(dicts_folder=dicts_folder,
                                output_folder=edges_folder,
                                max_window_size=max_window_size,
                                local_dict_extension='.dicloc')
        word_count_all = sp.apply(data_folder=dicts_folder,
                                  process_num=process_num)
        print('time for sentence processing in seconds:',
              util.count_time(start_time))

    if arguments['wordpairsprocessing']:
        start_time = time.time()
        wpp = WordPairsProcessing(
            max_vocab_size=max_vocab_size,
            min_count=min_count,
            dicts_folder=dicts_folder,
            window_size=max_window_size,
            edges_folder=edges_folder,
            graph_folder=graph_folder,
            safe_files_number_per_processor=safe_files_number_per_processor)
        result = wpp.apply(process_num=process_num)
        # wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
        print('time for word pairs processing in seconds:',
              util.count_time(start_time))
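
main() hands __doc__ to docopt, but the module docstring is not part of this excerpt. A plausible reconstruction using only the commands and options read above; the exact wording and any defaults in the real script may differ:

"""corpus2graph.

Usage:
  corpus2graph (all | wordprocessing | sentenceprocessing | wordpairsprocessing)
               <data_dir> <output_dir>
               [--max_window_size=<size>] [--process_num=<num>]
               [--min_count=<count>] [--max_vocab_size=<size>]
               [--safe_files_number_per_processor=<num>]
"""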