class Test_ClassSummaryCosineSimilarty(unittest.TestCase):
    """Test Class Summary class.

    """
    def setUp(self):
        """
        setting initial paramater
        Args:
            data: test file name
            split_module: setting the split_module instance
        """
        wn_summary_list = APP_ROOT + '/../../Data/wn_total_summary_51519_limit05_out_put_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()

    def test_summary_class(self):
        """
        test make summary dict
        """
        file_list = self.input_module.get_file_data()
        class_word_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(APP_ROOT + "/../../Data/wn_total_summary_51519_limit05_out_put/" + file.strip())
            self.input_module.input_special_format_file()
            if file.strip() not in class_word_vector:
                word_list = [x.strip() for x in self.input_module.get_file_data()]
                class_word_vector.update({file.strip().replace("_summary.txt", ""): word_list})
        sqlite_twitter_cython = SqliteTwitterSummaryCython(class_word_vector)
        sqlite_twitter_cython.call_sql()
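# A minimal illustration (hypothetical data) of the structure the test above
# hands to SqliteTwitterSummaryCython: class name -> list of summary words.
_example_class_word_vector = {
    "animal": ["dog", "cat", "bird"],
    "sports": ["soccer", "tennis", "baseball"],
}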
class Test_ClassSummaryExclude(unittest.TestCase):
    """Test Class Summary class.

    """
    def setUp(self):
        """
        setting initial paramater
        Args:
            data: test file name
            split_module: setting the split_module instance
        """
        wn_summary_list = APP_ROOT + '/../../Data/wn_total_summary_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()
        self.class_summary_exclude = ClassSummaryExcludeCython()

    def test_summary_class(self):
        """
        test make summary dict
        """
        file_list = self.input_module.get_file_data()
        OUT_PUT_PATH = APP_ROOT + "/../../Data/wn_total_summary_51519_limit05_out_put/"
        for file in file_list:
            self.input_module = InputFileCython(APP_ROOT + "/../../Data/wn_total_summary_51519_limit05/" + file.strip())
            self.input_module.input_special_format_file()
            self.class_summary_exclude.exclude_data(OUT_PUT_PATH, file.strip(), self.input_module.get_file_data())
class Test_ClassSummaryCosineSimilarty(unittest.TestCase):
    """Test Class Summary class.

    """
    def setUp(self):
        """
        setting initial paramater
        Args:
            data: test file name
            split_module: setting the split_module instance
        """
        wn_summary_list = APP_ROOT + '/../../Data/wn_summary_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()

    def test_summary_class(self):
        """
        test make summary dict
        """
        file_list = self.input_module.get_file_data()
        wn_average_vector_list = APP_ROOT + '/../../Data/wn_average_vector_list.txt'
        self.input_module = InputFileCython(wn_average_vector_list)
        self.input_module.input_special_format_file()
        vector_list = self.input_module.get_file_data()
        class_word_vector = {}
        class_average_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(APP_ROOT +
                                                "/../../Data/wn_summary/" +
                                                file.strip())
            self.input_module.input_special_format_file()
            if file.strip() not in class_word_vector:
                word_list = re.sub(
                    r"\]|\[|\'", "",
                    self.input_module.get_file_data()[0].strip())
                class_word_vector.update(
                    {file.strip().replace(".txt", ""): word_list.split(",")})
        for vector in vector_list:
            self.input_module = InputFileCython(
                APP_ROOT + "/../../Data/wn_summary_multi/" + vector.strip())
            self.input_module.input_special_format_file()
            average_vector = []
            if vector.strip() not in class_average_vector:
                for value in self.input_module.get_file_data():
                    value = re.sub(r"\]|\[|\'", "", value.strip())
                    average_vector.extend(
                        float(each_value)
                        for each_value in value.split(" ")
                        if each_value != "")
                class_average_vector.update({
                    vector.strip().replace(".txt_vector.txt", ""):
                    average_vector
                })
        class_summary_cosine_similarity_cython = ClassSummaryCosineSimilarityCython(
            class_word_vector, class_average_vector)
        class_summary_cosine_similarity_cython.summary_class_use_cosine_similarity()
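# ClassSummaryCosineSimilarityCython itself is not shown in this section; as a
# rough sketch of the measure it is named after, this is plain cosine
# similarity in pure Python (the vectors below are hypothetical):
import math


def cosine_similarity_sketch(a, b):
    """Return the cosine of the angle between two equal-length vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)


# Parallel vectors score ~1.0, orthogonal vectors score 0.0.
assert abs(cosine_similarity_sketch([1.0, 2.0], [2.0, 4.0]) - 1.0) < 1e-9
assert cosine_similarity_sketch([1.0, 0.0], [0.0, 1.0]) == 0.0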
class Test_ClassSummary(unittest.TestCase):
    """Test Class Summary class.

    """
    def setUp(self):
        """
        setting initial paramater
        Args:
            data: test file name
            split_module: setting the split_module instance
        """
        wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)

    def test_summary_class(self):
        """
        test make summary dict
        """
        self.input_module.input_fast_large_file()
        wiki_vector = self.input_module.get_vector()
        wn_summary_list = APP_ROOT + '/../../Data/wn_summary_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()
        file_list = self.input_module.get_file_data()
        count = 0
        class_word_vector = {}
        class_average_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(APP_ROOT +
                                                "/../../Data/wn_summary/" +
                                                file.strip())
            self.input_module.input_special_format_file()
            if count == 0:
                class_summary = ClassSummaryCython(
                    file.strip(), self.input_module.get_file_data(),
                    wiki_vector)
            else:
                class_summary = ClassSummaryCython(
                    file.strip(), self.input_module.get_file_data(),
                    wiki_vector, class_word_vector, class_average_vector)
            class_word_vector, class_average_vector = class_summary.summary_class()
            with open(
                    APP_ROOT + "/../../Data/test/" + file.strip() +
                    "_vector.txt", 'w') as fo:
                print(class_average_vector[file.strip()], file=fo)
            count += 1
        class_summary_cosine_similarity_cython = ClassSummaryCosineSimilarityCython(
            class_average_vector)
        class_summary_cosine_similarity_cython.summary_class_use_cosine_similarity()
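# ClassSummaryCython (not shown here) accumulates a per-class average vector.
# A minimal sketch of an element-wise average, assuming all word vectors share
# one dimensionality (the data below is hypothetical):
def average_vector_sketch(vectors):
    """Element-wise mean of a non-empty list of equal-length vectors."""
    return [sum(column) / len(vectors) for column in zip(*vectors)]


print(average_vector_sketch([[1.0, 2.0, 3.0], [3.0, 2.0, 1.0]]))
# -> [2.0, 2.0, 2.0]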
class SqliteTwitterSummary(object):
    """
    Twitter Save to the SQLite
    """
    def __init__(self, class_word_vector):
        """
        Initial Setting
        Get the mecab dict by the yaml
        """
        Twitter = namedtuple("Twitter", ["mecab"])
        config_file = "enviroment_twitter.yml"

        with open(config_file, encoding="utf-8") as cf:
            e = yaml.load(cf)
            twitter = Twitter(e["twitter"]["mecab"])

        self.tagger = MeCab.Tagger("-Owakati -d %s" % twitter.mecab)
        conn = sqlite3.connect('./twitter_data.db')
        self.cur = conn.cursor()
        self.class_word_vector = class_word_vector
        self.class_average_vector = {}
        self.class_word_dict = self.make_class_word_dict()
        #self.__initial_setting_vector()

    def __initial_setting_vector(self):
        # Wiki vector dict
        wiki_vector_file_name = APP_ROOT + '/../Data/jawiki_vector/jawiki_vector_delete_first.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)
        self.input_module.input_fast_large_file()
        self.wiki_vector = self.input_module.get_vector()
        # Make average vector dict
        wiki_average_vector_file_name_list = APP_ROOT + '/../Data/wn_summary_multi_vector_list.txt'
        self.input_module = InputFileCython(wiki_average_vector_file_name_list)
        self.input_module.input_special_format_file()
        summary_vector_file_list = self.input_module.get_file_data()
        for file in summary_vector_file_list:
            read_file = APP_ROOT + "/../Data/wn_summary_multi_vector/" + file
            self.input_module = InputFileCython(read_file)
            self.input_module.input_file_str_list()
            summary_vector_file_list = self.input_module.get_file_data()
            class_name = file.replace("_summary.txt_vector.txt", "")
            if class_name not in self.class_average_vector:
                self.class_average_vector.update({class_name: summary_vector_file_list})
        self.class_summary = ClassSummary("", self.wiki_vector, "")
        self.cosine_similarity = ClassCosineSimilarity("", "")

    def make_class_word_dict(self):
        """
        make remake the data format
        """
        word_class_dict = {}
        for class_name, word_list in self.class_word_vector.items():
            word_dict = {}
            for word in word_list:
                if word not in word_dict:
                    word_dict.update({word: 1})
            if class_name not in word_class_dict:
                word_class_dict.update({class_name: word_dict})
        return word_class_dict

    def call_sql(self):
        """
        call SQlite and save the twitter in the SQLite
        """
        self.cur.execute("""SELECT source_txt, replay_txt FROM ms_rinna;""")
        file_list = os.listdir("./data_latest/")
        #for file in file_list:
        #    os.remove("./data/" + file)
        for source_txt, replay_txt in self.cur.fetchall():
            class_name = self.judge_class(source_txt, replay_txt)
            # class_name = self.judge_class_wiki_vector(source_txt, replay_txt)
            print(class_name)
            print(source_txt)
            print(replay_txt)
            source_file = open("./data_latest/" + class_name + '_source_twitter_data.txt', 'a')
            replay_file = open("./data_latest/" + class_name + '_replay_twitter_data.txt', 'a')
            replay_file.write(self.tagger.parse(source_txt).replace("\n", "") + '\n')
            source_file.write(self.tagger.parse(replay_txt).replace('\n', '') + '\n')
            source_file.close()
            replay_file.close()

    def judge_class(self, source_txt, replay_txt=""):
        """
        Judge word class
        :param source_txt: twitter source text
        :param replay_txt: twitter replay text
        :return: most match class
        """
        class_match_rate = {}
        total_text = []
        source_wakati_text = self.__mecab_method(source_txt.strip())
        total_text.extend(source_wakati_text)
        if replay_txt != "":
            replay_wakati_text = self.__mecab_method(replay_txt.strip())
            total_text.extend(replay_wakati_text)
        for class_name in self.class_word_vector.keys():
            word_match_count = self.__match_word_count(total_text, class_name)
            if class_name not in class_match_rate:
                class_match_rate.update({class_name: 1.0 * word_match_count / len(self.class_word_dict[class_name])})
        if max(class_match_rate.values()) == 0.0:
            return "other"
        else:
            return max(class_match_rate.items(), key=operator.itemgetter(1))[0]

    def judge_class_wiki_vector(self, source_txt, replay_txt=""):
        """
        Judge word class by wiki vector
        :param source_txt: twitter source text
        :param replay_txt: twitter replay text
        :return: most match class
        """
        class_match_rate = {}
        total_text = []
        source_wakati_text = self.__mecab_method(source_txt.strip())
        total_text.extend(source_wakati_text)
        if replay_txt != "":
            replay_wakati_text = self.__mecab_method(replay_txt.strip())
            total_text.extend(replay_wakati_text)
        self.class_summary.summary_vector_word_list(total_text)
        summary_vector = self.class_summary.get_average_vector()
        for class_name, average_vector in self.class_average_vector.items():
            class_match_rate.update({class_name: self.cosine_similarity.cosine_similarity(summary_vector, average_vector)})
        print(class_match_rate)
        if max(class_match_rate.values()) <= 0.1:
            return "other"
        else:
            return max(class_match_rate.items(), key=operator.itemgetter(1))[0]

    def __mecab_method(self, text):
        """
        Call Mecab method split process and choose noum
        :param text:
        :return: only noum
        """
        res = self.tagger.parseToNode("".join(text))
        split_nonum = []
        while res:
            feature = res.feature.split(",")
            if feature[0] == u"名詞":
                split_nonum.append(feature[6])
            res = res.next
        return split_nonum

    def __match_word_count(self, total_text, class_name):
        """
        count matthing word word class
        :param total_text: source text and reply text
        :param class_name: choose class name
        :return: matthing count
        """
        word_match_count = 0
        for word in total_text:
            if word in self.class_word_dict[class_name]:
                word_match_count = word_match_count + 1
        return word_match_count
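# judge_class above scores each class by the number of tweet nouns found in
# that class's word dict, normalised by the dict's size. A dependency-free
# sketch of that scoring (no MeCab, no SQLite; tokens and classes below are
# hypothetical):
def judge_class_sketch(tokens, class_word_dict):
    """Return the best-matching class name, or "other" when nothing matches."""
    rates = {
        name: 1.0 * sum(1 for token in tokens if token in words) / len(words)
        for name, words in class_word_dict.items()
    }
    if max(rates.values()) == 0.0:
        return "other"
    return max(rates.items(), key=lambda item: item[1])[0]


_classes = {"animal": {"dog": 1, "cat": 1}, "sports": {"soccer": 1}}
print(judge_class_sketch(["dog", "walk"], _classes))  # -> "animal"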
class SlackApp():
    """
    Slack Call app
    You preapre the chainer model, You execute the bellow command, you can play the dialogue app
    Example
        python app.py
    """
    def __init__(self, data_model):
        """
        Iniital Setting
        :param data_model: Setting Slack Model. Slack Model has the a lot of paramater
        """
        self.slack_channel = data_model.slack_channel
        self.data = ""
        self.parameter = data_model.parameter_dict
        self.model_name = "../model_word_match/ChainerDialogue"
        self.generation_limit = 200
        """
        We confirm channel number
        https://api.slack.com/methods/channels.list
        """
        self.chan = data_model.chan
        self.usr = data_model.user
        self.mecab_dict = data_model.mecab_dict
        self.Mecab = MeCab.Tagger("-Owakati -d %s" % self.mecab_dict)
        XP.set_library(False, 0)
        self.XP = XP
        wn_summary_list = APP_ROOT + '/../Data/wn_total_summary_51519_limit05_out_put_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()
        file_list = self.input_module.get_file_data()
        self.class_word_vector = self.__make_class_word_vector(file_list)
        self.sqlite_twitter_summary = SqliteTwitterSummary(self.class_word_vector)
        self.word_class_dict = self.sqlite_twitter_summary.make_class_word_dict()
        self.word_class = ""
        self.multi_train_execute = ExecuteAttentionDialogue()
        self.elastic_search = GetAnswer()

    def __make_class_word_vector(self, file_list):
        """
        Make class word vector dict
        :param file_list:
        :return:
        """
        class_word_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(
                APP_ROOT +
                "/../Data/wn_total_summary_51519_limit05_out_put/" +
                file.strip())
            self.input_module.input_special_format_file()
            if file.strip() not in class_word_vector:
                word_list = [x.strip() for x in self.input_module.get_file_data()]
                class_word_vector.update(
                    {file.strip().replace("_summary.txt", ""): word_list})
        return class_word_vector

    def call_method(self):
        """
        Slack api call
        1: read sentence
        2: model return the sentence
        """
        if self.slack_channel.rtm_connect():
            while True:
                self.data = self.slack_channel.rtm_read()
                self.__judge_print()
                time.sleep(1)
        else:
            print("connection Fail")

    def __judge_print(self):
        """
        judge slack call for chainer
        Example:
            chainer:{your sentence}
                chainer return the sentence
            chainer_train:{your sentence}
                start train
        """
        if len(self.data) >= 1 and "text" in self.data[0]:
            input_text = self.data[0]["text"]
            print(input_text)
            if "chainer:" in input_text:
                # predict
                if "?" in input_text or "?" in input_text:
                    replace_input = re.sub("chainer:|\?", "",
                                           input_text.strip())
                    self.elastic_search.search_data(replace_input)
                    if len(self.elastic_search.search_result) > 0:
                        hyp_batch = self.elastic_search.search_result[0]
                        print(hyp_batch)
                        if hyp_batch["image"]:
                            word = hyp_batch["image"] + "\n" + hyp_batch[
                                "title"] + "\n" + hyp_batch[
                                    "abstract"] + "\n" + hyp_batch["url"]
                        else:
                            word = hyp_batch["title"] + "\n" + hyp_batch[
                                "abstract"] + "\n" + hyp_batch["url"]
                    else:
                        word = "No match"
                else:
                    # input sentence
                    src_batch = self.__input_sentence()
                    hyp_batch = self.__predict_sentence(src_batch)
                    word = ''.join(hyp_batch[0]).replace("</s>", "")
                # show predict word
                print(
                    self.slack_channel.api_call("chat.postMessage",
                                                user=self.usr,
                                                channel=self.chan,
                                                text=word))
            if "chainer_train" in self.data[0]["text"]:
                self.__setting_parameter()
                self.__multi_train()

    def __multi_train(self):
        """
        Call multi train
        """
        self.multi_train_execute.train_mulit_model()

    def __input_sentence(self):
        """
        return sentence for chainer predict
        """
        text = self.__mecab_method(self.data[0]["text"].replace(
            "chainer:", ""))
        self.word_class = self.sqlite_twitter_summary.judge_class(
            self.data[0]["text"].replace("chainer:", ""))
        ##  self.word_class = self.sqlite_twitter_summary.judge_class_wiki_vector(self.data[0]["text"].replace("chainer:", ""))
        data = [text]
        src_batch = [
            x + ["</s>"] * (self.generation_limit - len(x) + 1) for x in data
        ]
        return src_batch

    def __predict_sentence(self, src_batch):
        """
        predict sentence
        :param src_batch: get the source sentence
        :return:
        """
        self.model_name = "../model_word_match/ChainerDialogue_" + self.word_class
        print(self.word_class)
        dialogue = EncoderDecoderModelAttention(self.parameter)
        src_vocab = Vocabulary.load(self.model_name + '.srcvocab')
        trg_vocab = Vocabulary.load(self.model_name + '.trgvocab')
        model = AttentionDialogue.load_spec(self.model_name + '.spec', self.XP)
        serializers.load_hdf5(self.model_name + '.weights', model)
        hyp_batch = dialogue.forward_implement(src_batch, None, src_vocab,
                                               trg_vocab, model, False,
                                               self.generation_limit)
        print(hyp_batch)
        return hyp_batch

    def __setting_parameter(self):
        """
        setteing each patamater
        """
        self.parameter["word2vec"] = self.model_name
        train_path = "../twitter/"
        self.parameter["source"] = train_path + "source_twitter_data.txt"
        self.parameter["target"] = train_path + "replay_twitter_data.txt"

    def __mecab_method(self, text):
        """
        Call the mecab method
        :param text: user input text
        :return:
        """
        mecab_text = self.Mecab.parse(text)
        return mecab_text.split(" ")
class ProducerConsumerThreadSqlTwitter(object):
    """
    Producer
    Consumer
    Multi Thread Crawling.
    Using the Consumer Producer pattern
    Reference
        Python Consumer Producer pattern
            http://agiliq.com/blog/2013/10/producer-consumer-problem-in-python/
        Multi Thread Design pattern
    """
    def get_file_data(self, file):
        """
        get file data
        :param file: summary word net data
        :return:
        """
        self.input_module = InputFileCython(file)
        self.input_module.input_special_format_file()
        return self.input_module.get_file_data()

    def producer_run(self):
        """
        Running Producer
        """
        file_list = self.get_file_data(APP_ROOT + '/../Data/wn_total_summary_51519_limit05_out_put_list.txt')
        class_word_vector = {}
        for file in file_list:
            if file.strip() not in class_word_vector:
                word_list = (list(map(lambda x:x.strip(), self.get_file_data(APP_ROOT + "/../Data/wn_total_summary_51519_limit05_out_put/" + file.strip()))))
                class_word_vector.update({file.strip().replace("_summary.txt", ""): word_list})
        global class_queue
        global check_queue
        while True:
            if class_queue not in check_queue.queue:
                try:
                    class_queue.put(class_word_vector)
                    check_queue.put(class_word_vector)
                except queue.Empty:
                    print("Queue Full")
                    pass
                else:
                    log_text = "Produced "
                    print(log_text)
                    time.sleep(random.uniform(0.0, 0.5))

    def consumer_run(self):
        """
        Running Consumer
        """
        global class_queue
        while True:
            try:
                class_word_vector = class_queue.get()
            except class_queue.Empty:
                print("Queue Empty")
                pass
            else:
                log_text = "Consume "
                print(log_text)
                sqlite_twitter = SqliteTwitterSummaryCython(class_word_vector)
                sqlite_twitter.call_sql()
                class_queue.task_done()
                # Setting the wait time, I refered to the bellow link
                #  https://www.w3.org/Protocols/HTTP-NG/http-prob.html
                time.sleep(random.uniform(0.601, 0.602))
# This script runs the parallel multi-thread command program.


if __name__ == '__main__':
    """
    args
       -r: setting word net link list
           Example:
               '/../../Data/wn_summary_list.txt'
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_word_net_list_file', '-r', default='',
                        help='set word net list file')
    args = parser.parse_args()
    #  Word Net File
    wn_summary_split_list = APP_ROOT + "/../../Data/" + args.read_word_net_list_file
    input_module = InputFileCython(wn_summary_split_list)
    input_module.input_special_format_file()
    file_list = input_module.get_file_data()
    # Wiki Vector
    wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
    input_module = InputFileCython(wiki_vector_file_name)
    input_module.input_fast_large_file()
    wiki_vector = input_module.get_vector()

    producerConsumer = ProducerConsumerClassSummary()
    multi_thread_producer_crawl_instance = threading.Thread(
        target=producerConsumer.producer_run, args=(file_list,))
    multi_thread_consumer_crawl_instance = threading.Thread(
        target=producerConsumer.consumer_run, args=(wiki_vector,))
    multi_thread_producer_crawl_instance.start()
    multi_thread_consumer_crawl_instance.start()
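# A minimal runnable sketch of the producer/consumer wiring used above, with
# the queue exceptions on the correct side (put() raises queue.Full, get()
# raises queue.Empty); the payloads below are hypothetical:
import queue
import threading

_work_queue = queue.Queue(maxsize=1)


def _producer_sketch():
    for item in ({"animal": ["dog"]}, {"sports": ["soccer"]}):
        try:
            _work_queue.put(item, timeout=1)
        except queue.Full:
            print("Queue Full")


def _consumer_sketch():
    while True:
        try:
            item = _work_queue.get(timeout=1)
        except queue.Empty:
            break
        print("Consume", item)
        _work_queue.task_done()


_t_producer = threading.Thread(target=_producer_sketch)
_t_consumer = threading.Thread(target=_consumer_sketch)
_t_producer.start()
_t_consumer.start()
_t_producer.join()
_t_consumer.join()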