Example #1
def define_dictionary_n_grams(docs, start=1):
    print("define_dictionary_n_grams")

    import topmine_src.phrase_mining as phrase_mining
    import sys
    import topmine_src.utils as utils

    file_name = "vw_remont-i-stroitel_stvo_only_text"
    output_path = "remont_n-grams"

    # represents the minimum number of occurrences you want each phrase to have.
    min_support = 10

    # represents the threshold for merging two words into a phrase; a lower
    # alpha leads to higher recall and lower precision.
    alpha = 4

    # maximum phrase length.
    max_phrase_size = 10

    phrase_miner = phrase_mining.PhraseMining(file_name, min_support,
                                              max_phrase_size, alpha)
    partitioned_docs, index_vocab = phrase_miner.mine()
    frequent_phrases = phrase_miner.get_frequent_phrases(min_support)
    utils.store_partitioned_docs(partitioned_docs)
    utils.store_vocab(index_vocab)
    utils.store_frequent_phrases(frequent_phrases, output_path)
    return {}
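The function above ignores its docs and start arguments and works on the hard-coded corpus file, so it can be exercised with placeholder input; a minimal sketch, assuming the hard-coded paths from the example exist:

# minimal sketch of calling the function above; it reads the hard-coded corpus
# file "vw_remont-i-stroitel_stvo_only_text", stores the partitioned docs and
# vocabulary at their default locations, writes the frequent phrases to
# "remont_n-grams", and returns an empty dict
result = define_dictionary_n_grams(docs=None)
assert result == {}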
Example #2
from topmine_src import phrase_mining
import sys
from topmine_src import utils

arguments = sys.argv
print('Running Phrase Mining...')

file_name = arguments[1]

# represents the minimum number of occurrences you want each phrase to have.
min_support = 10

# represents the threshold for merging two words into a phrase; a lower
# alpha leads to higher recall and lower precision.
alpha = 4

# maximum phrase length.
max_phrase_size = 10

phrase_miner = phrase_mining.PhraseMining(file_name, min_support,
                                          max_phrase_size, alpha)
partitioned_docs, index_vocab = phrase_miner.mine()
frequent_phrases = phrase_miner.get_frequent_phrases(min_support)
utils.store_partitioned_docs(partitioned_docs)
utils.store_vocab(index_vocab)
utils.store_frequent_phrases(frequent_phrases)
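This variant is a standalone script that takes the corpus file as its first command-line argument and writes the frequent phrases to the library's default output path; a minimal sketch of invoking it, where the script name run_phrase_mining.py is a hypothetical placeholder:

# hypothetical invocation; the script above reads the corpus path from sys.argv[1]
# (equivalent to running: python run_phrase_mining.py corpus.txt from a shell)
import subprocess
subprocess.run(["python", "run_phrase_mining.py", "corpus.txt"], check=True)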
Example #3
def main(_):
    stop_word_file = FLAGS.stop_word_file

    file_name = FLAGS.train_file

    min_support = FLAGS.min_support
    max_phrase_size = FLAGS.max_phrase_size
    alpha = FLAGS.alpha
    beta = FLAGS.beta
    iteration = FLAGS.iteration
    num_topics = FLAGS.num_topics
    optimization_iterations = FLAGS.optimization_iterations
    optimization_burnin = FLAGS.optimization_burnin

    import jieba

    # with open(FLAGS.train_file, "r") as frobj:
    # 	examples = [line.strip() for line in frobj]
    # 	print(len(examples), "===before removing duplicate===")
    # 	examples = set(examples)
    # 	tmp = []
    # 	for example in examples:
    # 		re_pattern = "({}{})".format("__label__", "\d.")
    # 		element_list = re.split(re_pattern, example)
    # 		tmp.append(" ".join(list(jieba.cut("".join(element_list[-1].split())))))
    # 	examples = set(tmp)
    # 	print(len(examples), "===after removing duplicate===")

    # multiple training files may be passed, joined with "&"
    train_file_list = FLAGS.train_file.split("&")
    examples = []
    for train_file in train_file_list:
        with open(train_file, "r") as frobj:
            for line in tqdm(frobj):
                try:
                    content = json.loads(line)
                    text = " ".join(content["text"])
                    # skip overly long documents
                    if len(text) >= 512:
                        continue
                    examples.append(text)
                except Exception:
                    # skip lines that are not valid JSON or lack a "text" field
                    continue

    def _get_stopwords(stop_word_path):
        """
		Returns a list of stopwords.
		"""
        stopwords = set()
        with open(stop_word_path, "r") as frobj:
            for line in frobj:
                stopwords.add(line.rstrip())
        return stopwords

    # stopwords = _get_stopwords(FLAGS.stop_word_file)
    stopwords = []

    phrase_miner = phrase_mining.PhraseMining(min_support, max_phrase_size,
                                              alpha)
    partitioned_docs, index_vocab, partitioned_indexer = phrase_miner.mine(
        examples, stopwords)
    frequent_phrases = phrase_miner.get_frequent_phrases(min_support,
                                                         if_only_phrase=False)
    partioned_docs_path = FLAGS.ouput_file + "/partioned_docs.txt"
    utils.store_partitioned_docs(partitioned_docs, path=partioned_docs_path)
    vocab_path = FLAGS.ouput_file + "/vocabs.txt"
    utils.store_vocab(index_vocab, path=vocab_path)

    frequent_phrase_path = FLAGS.ouput_file + "/frequent_phrases.txt"
    utils.store_frequent_phrases(frequent_phrases, path=frequent_phrase_path)
    print("{}: total frequent phrases {}".format(file_name,
                                                 len(frequent_phrases)))

    # print('Running PhraseLDA...')

    # partitioned_docs = utils.load_partitioned_docs(path=partioned_docs_path)
    # vocab_file = utils.load_vocab(path=vocab_path)

    # plda = phrase_lda.PhraseLDA( partitioned_docs, vocab_file, num_topics ,
    # 			alpha, beta, iteration, optimization_iterations, optimization_burnin);

    # document_phrase_topics, most_frequent_topics, topics = plda.run()

    # stored_topics_path = FLAGS.ouput_file + "/doc_phrase_topics.txt"
    # utils.store_phrase_topics(document_phrase_topics,
    # 						 path=stored_topics_path)
    # most_frequent_topic_prefix_path = FLAGS.ouput_file + "/frequent_phrase_topics.txt"
    # utils.store_most_frequent_topics(most_frequent_topics,
    # 								prefix_path=most_frequent_topic_prefix_path)

    import _pickle as pkl
    with open(FLAGS.ouput_file + "/mining_info.pkl", "wb") as fwobj:
        pkl.dump(
            {
                "frequent_phrases": frequent_phrases,
                "index_vocab": index_vocab,
                "partitioned_docs": partitioned_docs,
                "indexer": partitioned_indexer
            }, fwobj)
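All of the mining artifacts are also serialized into a single pickle at the end; a minimal sketch of reloading them later, where output_dir stands in for the directory passed via FLAGS.ouput_file:

# reload the artifacts dumped above; the keys match the dictionary that was pickled
import _pickle as pkl
with open("output_dir/mining_info.pkl", "rb") as frobj:
    mining_info = pkl.load(frobj)
frequent_phrases = mining_info["frequent_phrases"]
index_vocab = mining_info["index_vocab"]
partitioned_docs = mining_info["partitioned_docs"]
partitioned_indexer = mining_info["indexer"]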
Example #4
def main(_):
    stop_word_file = FLAGS.stop_word_file

    file_name = FLAGS.train_file

    min_support = FLAGS.min_support
    max_phrase_size = FLAGS.max_phrase_size
    alpha = FLAGS.alpha
    beta = FLAGS.beta
    iteration = FLAGS.iteration
    num_topics = FLAGS.num_topics
    optimization_iterations = FLAGS.optimization_iterations
    optimization_burnin = FLAGS.optimization_burnin

    # multiple training files may be passed, joined with "&"
    train_file_list = FLAGS.train_file.split("&")
    examples = []
    for train_file in train_file_list:
        with open(train_file, "r") as frobj:
            for line in tqdm(frobj):
                try:
                    content = line.strip()
                    # examples.append(" ".join(list(content)))
                    # clean the raw line, then segment it into space-joined tokens with jieba
                    examples.append(" ".join(list(jieba.cut(clean(content)))))
                except Exception:
                    # skip lines that fail cleaning or segmentation
                    continue

    def _get_stopwords(stop_word_path):
        """
		Returns a list of stopwords.
		"""
        stopwords = set()
        with open(stop_word_path, "r") as frobj:
            for line in frobj:
                stopwords.add(line.rstrip())
        return stopwords

    # stopwords = []
    stopwords = _get_stopwords(FLAGS.stop_word_file)

    print("==total example==", len(examples))

    phrase_miner = phrase_mining.PhraseMining(min_support, max_phrase_size,
                                              alpha)
    partitioned_docs, index_vocab, partitioned_indexer = phrase_miner.mine(
        examples, stopwords)
    frequent_phrases = phrase_miner.get_frequent_phrases(min_support,
                                                         if_only_phrase=False)
    partioned_docs_path = FLAGS.ouput_file + "/partioned_docs.txt"
    utils.store_partitioned_docs(partitioned_docs, path=partioned_docs_path)
    vocab_path = FLAGS.ouput_file + "/vocabs.txt"
    utils.store_vocab(index_vocab, path=vocab_path)

    frequent_phrase_path = FLAGS.ouput_file + "/frequent_phrases.txt"
    utils.store_frequent_phrases(frequent_phrases, path=frequent_phrase_path)
    print("{}: total frequent phrases {}".format(file_name,
                                                 len(frequent_phrases)))

    # print('Running PhraseLDA...')

    # partitioned_docs = utils.load_partitioned_docs(path=partioned_docs_path)
    # vocab_file = utils.load_vocab(path=vocab_path)

    # plda = phrase_lda.PhraseLDA( partitioned_docs, vocab_file, num_topics ,
    # 			alpha, beta, iteration, optimization_iterations, optimization_burnin);

    # document_phrase_topics, most_frequent_topics, topics = plda.run()

    # stored_topics_path = FLAGS.ouput_file + "/doc_phrase_topics.txt"
    # utils.store_phrase_topics(document_phrase_topics,
    # 						 path=stored_topics_path)
    # most_frequent_topic_prefix_path = FLAGS.ouput_file + "/frequent_phrase_topics.txt"
    # utils.store_most_frequent_topics(most_frequent_topics,
    # 								prefix_path=most_frequent_topic_prefix_path)

    import _pickle as pkl
    with open(FLAGS.ouput_file + "/mining_info.pkl", "wb") as fwobj:
        pkl.dump(
            {
                "frequent_phrases": frequent_phrases,
                "index_vocab": index_vocab,
                "partitioned_docs": partitioned_docs,
                "indexer": partitioned_indexer
            }, fwobj)
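Unlike the previous example, this variant actually loads the stop-word list, and _get_stopwords expects a plain text file with one token per line; a minimal sketch of preparing and reading such a file, where the file name stopwords.txt and the example tokens are assumptions:

# hypothetical stop-word file: one token per line, as _get_stopwords expects
with open("stopwords.txt", "w") as fwobj:
    fwobj.write("的\n了\n和\n")

# same logic as the nested _get_stopwords helper above
stopwords = set()
with open("stopwords.txt", "r") as frobj:
    for line in frobj:
        stopwords.add(line.rstrip())
print(stopwords)  # {'的', '了', '和'}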