Example #1
0
    def __init__(self, connected_to_internet):
        """ Initializer. Prepares Config object.
        
        :param connected_to_internet: True - connected to the Internet, False - disconnected
        """
        self.connected_to_internet = connected_to_internet
        self.font_cache = {}
        self.voice_commands_cache = {}
        self.cd_titles = {}
        self.cd_track_names_cache = {}
        self.screensaver_cache = {}
        self.config_class = Config()
        self.config = self.config_class.config
        self.screen_rect = self.config_class.screen_rect
        self.config[LABELS] = self.get_labels()
        self.weather_config = self.get_weather_config()
        self.pygame_screen = self.config_class.pygame_screen
        self.CURRENT_WORKING_DIRECTORY = os.getcwd()
        self.read_storage()
        self.discogs_util = DiscogsUtil(self.k1)
        self.image_util = ImageUtil(self)
        self.file_util = FileUtil(self)

        if (not os.environ.get('PYTHONHTTPSVERIFY', '')
                and getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
        self.podcasts_util = None
        self.db_util = None
        self.bluetooth_util = None
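
The PYTHONHTTPSVERIFY / ssl block above falls back to an unverified SSL context so that HTTPS requests made through the standard library do not fail when certificate verification is unavailable. A minimal, standalone sketch of the same pattern, outside the Peppy Config machinery:

import os
import ssl

def allow_unverified_https():
    # Mirror of the snippet above: only when PYTHONHTTPSVERIFY is unset and the
    # interpreter exposes the private helper (Python 2.7.9+ / 3.4.3+) do we
    # swap in an unverified default HTTPS context.
    if (not os.environ.get('PYTHONHTTPSVERIFY', '')
            and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context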
 def get_refresh_video_barrage(self, cid, row_barrages):
     barrage_file_path = FileUtil.get_barrage_file_path(cid)
      # Check whether a barrage file already exists for this cid. If it does not, all of the current row_barrages
      # are written to the file; if it does exist, only the newly added barrage records need to be found.
     barrage_count = 0
     if FileUtil.is_file_exists(barrage_file_path):
          last_barrage_index = -1  # Index in row_barrages of the last barrage already stored in the file.
         barrage_count = FileUtil.get_file_line_count(barrage_file_path)
         last_n_barrages = FileUtil.get_file_last_n_line_content(barrage_file_path, 5)
         Logger.print_console_info(u"当前文件的最后n条弹幕:\n" + u"\n".join(last_n_barrages))
         for index in xrange(len(row_barrages) - 1, -1, -1):
             if self.__is_same_barrage(last_n_barrages, row_barrages[index]):
                  # Position, within the refreshed barrage list, of the last barrage stored in the file.
                 last_barrage_index = index
                 break
          # The barrage data has not changed.
         if last_barrage_index == (len(row_barrages) - 1):
             row_barrages = []
             Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                            u"\t" + u"弹幕数据没有更新。")
          # Only part of the barrage data is new.
         elif last_barrage_index >= 0:
             Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                            u"\t" + u"有弹幕数据更新:" +
                                            u"\t" + str(len(row_barrages) - last_barrage_index - 1))
             row_barrages = row_barrages[last_barrage_index + 1: len(row_barrages)]
          # All of the barrage data is new.
         elif last_barrage_index == -1:
             Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) + u"\t" +
                                            u"有弹幕数据更新:" + u"\t" + str(len(row_barrages)))
     barrage_count += len(row_barrages)
     Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                    u" 当前弹幕总条数:" + unicode(barrage_count) + u"\n\n")
     return row_barrages
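
get_refresh_video_barrage relies on FileUtil helpers that are not shown here (get_file_line_count and get_file_last_n_line_content). A hypothetical sketch of what such helpers could look like, matching only the call sites above and not the project's actual FileUtil:

import codecs
from collections import deque

def get_file_line_count(file_path):
    # Count the lines of a UTF-8 text file.
    with codecs.open(file_path, "rb", "utf-8") as input_file:
        return sum(1 for _ in input_file)

def get_file_last_n_line_content(file_path, n):
    # Return the last n lines of the file, without trailing newlines.
    with codecs.open(file_path, "rb", "utf-8") as input_file:
        return [line.rstrip(u"\n") for line in deque(input_file, maxlen=n)]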
Example #3
0
 def get_refresh_video_barrage(self, cid, row_barrages):
     barrage_file_path = FileUtil.get_barrage_file_path(cid)
      # Check whether a barrage file already exists for this cid. If it does not, all of the current row_barrages
      # are written to the file; if it does exist, only the newly added barrage records need to be found.
     barrage_count = 0
     if FileUtil.is_file_exists(barrage_file_path):
          last_barrage_index = -1  # Index in row_barrages of the last barrage already stored in the file.
         barrage_count = FileUtil.get_file_line_count(barrage_file_path)
         last_n_barrages = FileUtil.get_file_last_n_line_content(barrage_file_path, 5)
         Logger.print_console_info(u"当前文件的最后n条弹幕:\n" + u"\n".join(last_n_barrages))
         for index in xrange(len(row_barrages) - 1, -1, -1):
             if self.__is_same_barrage(last_n_barrages, row_barrages[index]):
                  # Position, within the refreshed barrage list, of the last barrage stored in the file.
                 last_barrage_index = index
                 break
          # The barrage data has not changed.
         if last_barrage_index == (len(row_barrages) - 1):
             row_barrages = []
             Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                            u"\t" + u"弹幕数据没有更新。")
          # Only part of the barrage data is new.
         elif last_barrage_index >= 0:
             Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                            u"\t" + u"有弹幕数据更新:" +
                                            u"\t" + str(len(row_barrages) - last_barrage_index - 1))
             row_barrages = row_barrages[last_barrage_index + 1: len(row_barrages)]
          # All of the barrage data is new.
         elif last_barrage_index == -1:
             Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) + u"\t" +
                                            u"有弹幕数据更新:" + u"\t" + str(len(row_barrages)))
     barrage_count += len(row_barrages)
     Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                    u" 当前弹幕总条数:" + unicode(barrage_count) + u"\n\n")
     return row_barrages
Example #4
0
File: util.py Project: Rucia1/Peppy
    def __init__(self, connected_to_internet):
        """ Initializer. Prepares Config object. """

        self.connected_to_internet = connected_to_internet
        self.font_cache = {}
        self.image_cache = {}
        self.voice_commands_cache = {}
        self.cd_titles = {}
        self.cd_track_names_cache = {}
        self.screensaver_cache = {}
        self.image_cache_base64 = {}
        self.svg_cache = {}
        self.album_art_url_cache = {}
        self.config_class = Config()
        self.config = self.config_class.config
        self.config[LABELS] = self.get_labels()
        self.weather_config = self.get_weather_config()
        self.PYGAME_SCREEN = self.config[PYGAME_SCREEN]
        self.file_util = FileUtil(self.config)
        self.CURRENT_WORKING_DIRECTORY = os.getcwd()
        self.discogs_util = DiscogsUtil()
        self.COLOR_MAIN = self.color_to_hex(self.config[COLORS][COLOR_BRIGHT])
        self.COLOR_ON = self.color_to_hex(self.config[COLORS][COLOR_CONTRAST])
        self.COLOR_OFF = self.color_to_hex(
            self.config[COLORS][COLOR_DARK_LIGHT])
        self.COLOR_MUTE = self.color_to_hex(self.config[COLORS][COLOR_MUTE])
        if (not os.environ.get('PYTHONHTTPSVERIFY', '')
                and getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
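
color_to_hex is used above but not shown in this excerpt. Assuming the configured colors are (R, G, B) tuples, a plausible sketch is:

def color_to_hex(color):
    # Hypothetical sketch: convert an (R, G, B) tuple into an HTML-style hex string.
    if color is None:
        return None
    return "#%02X%02X%02X" % (color[0], color[1], color[2])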
Example #5
0
 def gen_user_topic_lda_by_time_window(cls, barrage_seg_list, cid):
     dictionary = corpora.Dictionary.load(os.path.join(FileUtil.get_train_model_dir(),
                                                       str(cid) + "-barrage-words.dict"))
      lda_model = models.LdaModel.load(os.path.join(FileUtil.get_train_model_dir(),
                                                    str(cid) + "-barrage-lda.model"))
     time_window_list = TimeWindow.gen_time_window_barrage_info(barrage_seg_list, cid)
     for time_window in time_window_list:
          time_window.gen_user_word_frequency()  # Build per-user word-frequency info for this time window.
          time_window.gen_user_topic_lda(dictionary, lda_model)  # Build per-user LDA topic info for this time window.
     return time_window_list
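
The method above depends on a gensim dictionary and LDA model trained and saved earlier (see gen_corpus_info and gen_lda_model further down). A minimal sketch of those gensim calls in isolation; the model directory, cid and tokens are illustrative only:

import os
from gensim import corpora, models

model_dir = "train_model"   # stand-in for FileUtil.get_train_model_dir()
cid = 2171229               # illustrative cid
dictionary = corpora.Dictionary.load(os.path.join(model_dir, str(cid) + "-barrage-words.dict"))
lda_model = models.LdaModel.load(os.path.join(model_dir, str(cid) + "-barrage-lda.model"))

bow = dictionary.doc2bow([u"666", u"233"])   # bag-of-words for one user's tokens
topic_distribution = lda_model[bow]          # [(topic_id, probability), ...]
print(topic_distribution)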
Example #6
0
def segment_barrages(barrages, cid=None, is_corpus=False):
    if os.path.exists(FileUtil.get_word_segment_result_file_path(cid)):
        # If a saved word-segmentation result file exists, read it directly instead of re-segmenting.
        return load_segment_barrages(cid)

    index = 0
    barrage_seg_list = []
    for barrage in barrages:
        barrage_seg = BarrageSeg(barrage.play_timestamp, barrage.sender_id,
                                 barrage.row_id, index)
        sentence_seg = __segment_sentence(barrage.content)  # Segmentation result of a single barrage
        if len(sentence_seg) <= 0:  # Skip barrages whose words were all filtered out (prevents the date spam at the end of a video)
            continue
        barrage_seg.sentence_seg_list = sentence_seg
        barrage_seg_list.append(barrage_seg)
        index += 1
    if is_corpus is False:
        # Write the segmentation result to a test file so the segmentation can be inspected
        __save_segment_word_to_file(barrage_seg_list, cid)
        # Save the segmentation result as JSON so it can be reused later for z-score analysis
        save_segment_barrages(barrage_seg_list, cid)
        # Use all barrages of video v as the corpus for building the tf-idf model
        corpus = DictConfig.gen_corpus_info(barrage_seg_list, cid)
        # Train the tf-idf model on the segmented barrages
        DictConfig.gen_tfidf_model(corpus, cid)
        # Train the LDA model on the segmented barrages
        DictConfig.gen_lda_model(corpus, cid)
    return barrage_seg_list
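
__segment_sentence is not shown in this excerpt. Assuming it wraps jieba's part-of-speech segmentation plus the DictConfig filters, a much-simplified, hypothetical stand-in could look like this:

# -*- coding: utf-8 -*-
import jieba.posseg as posseg

def segment_sentence(content, stopwords=frozenset()):
    # Cut one barrage into (word, flag) pairs and drop empty tokens and stopwords.
    result = []
    for pair in posseg.cut(content):
        if pair.word.strip() and pair.word not in stopwords:
            result.append((pair.word, pair.flag))
    return result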
Example #7
0
def build_window(danmaku_list, window_size, step_length, parse_dict):
    window_list = []
    current_start = 0
    current_end = current_start + window_size
    current_danmaku = []
    current_index = 0
    if FileUtil.is_file_exists(constants.WINDOW_LOG):
        os.remove(constants.WINDOW_LOG)
    while current_start < danmaku_list[-1].videoSecond:
        logging.info("Building time window " + str(current_index) + "...")
        for danmaku in danmaku_list:
            if current_start <= danmaku.videoSecond <= current_end:
                current_danmaku.append(danmaku)
            elif danmaku.videoSecond > current_end:
                break
        # write_window_log(current_index, current_start, current_end, current_danmaku)
        time_window = TimeWindow(current_index, current_start, current_end)
        time_window.buildUsers(danmakuutil.extract_users(current_danmaku))
        time_window.buildTSCs(len(current_danmaku))
        time_window.buildTSCLength(current_danmaku)
        # time_window.buildEntropy(current_danmaku, parse_dict)
        # time_window.buildUserFeature(danmakuutil.extract_user_feature(current_danmaku, parse_dict, "Word-Frequency"))
        window_list.append(time_window)

        current_index += 1
        current_start += step_length
        current_danmaku = []
        current_end = current_start + window_size

    return window_list
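
The windows produced here overlap whenever step_length is smaller than window_size; each window spans [current_start, current_start + window_size] and the start advances by step_length. A tiny self-contained illustration of that arithmetic:

def window_bounds(last_second, window_size, step_length):
    # Yield (index, start, end) for overlapping windows that slide by step_length seconds.
    index, start = 0, 0
    while start < last_second:
        yield index, start, start + window_size
        index += 1
        start += step_length

# list(window_bounds(25, 10, 5)) ->
# [(0, 0, 10), (1, 5, 15), (2, 10, 20), (3, 15, 25), (4, 20, 30)]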
Example #8
0
def segment_barrages(barrages, cid=None, is_corpus=False):
    if os.path.exists(FileUtil.get_word_segment_result_file_path(cid)):
        # If a saved word-segmentation result file exists, read it directly instead of re-segmenting.
        return load_segment_barrages(cid)

    index = 0
    barrage_seg_list = []
    for barrage in barrages:
        barrage_seg = BarrageSeg(barrage.play_timestamp, barrage.sender_id, barrage.row_id, index)
        sentence_seg = __segment_sentence(barrage.content)  # Segmentation result of a single barrage
        if len(sentence_seg) <= 0:  # Skip barrages whose words were all filtered out (prevents the date spam at the end of a video)
            continue
        barrage_seg.sentence_seg_list = sentence_seg
        barrage_seg_list.append(barrage_seg)
        index += 1
    if is_corpus is False:
        # Write the segmentation result to a test file so the segmentation can be inspected
        __save_segment_word_to_file(barrage_seg_list, cid)
        # Save the segmentation result as JSON so it can be reused later for z-score analysis
        save_segment_barrages(barrage_seg_list, cid)
        # Use all barrages of video v as the corpus for building the tf-idf model
        corpus = DictConfig.gen_corpus_info(barrage_seg_list, cid)
        # Train the tf-idf model on the segmented barrages
        DictConfig.gen_tfidf_model(corpus, cid)
        # Train the LDA model on the segmented barrages
        DictConfig.gen_lda_model(corpus, cid)
    return barrage_seg_list
Example #9
0
 def gen_tfidf_model(cls, corpus, cid):
     # let’s initialize a tfidf transformation:
     logging.debug(u"生成 tfidf 模型!!!")
     tfidf = models.TfidfModel(corpus)
     tfidf.save(
         os.path.join(FileUtil.get_train_model_dir(),
                      str(cid) + "-barrage-tfidf.model"))
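
Once saved, the tf-idf model can be reloaded and applied to new bag-of-words vectors. A short usage sketch (the file name is illustrative):

from gensim import models

tfidf = models.TfidfModel.load("9-barrage-tfidf.model")
bow = [(0, 1), (3, 2)]      # (token_id, raw count) pairs, e.g. from dictionary.doc2bow(...)
weights = tfidf[bow]        # [(token_id, tf-idf weight), ...]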
Example #10
0
def __save_segment_word_to_file(barrage_seg_list, cid):
    # barrage_seg_list -> barrage_seg -> sentence_seg_list -> sentence_seg
    word_segment_file = os.path.join(FileUtil.get_word_segment_dir(),
                                     "test-" + cid + "-seg-result.txt")
    with codecs.open(word_segment_file, "wb", "utf-8") as output_file:
        for barrage_seg in barrage_seg_list:
            for word_seg in barrage_seg.sentence_seg_list:
                output_file.write(word_seg.word + u"\t" + word_seg.flag + u"\n")
Example #11
0
def __save_segment_word_to_file(barrage_seg_list, cid):
    # barrage_seg_list -> barrage_seg -> sentence_seg_list -> sentence_seg
    word_segment_file = os.path.join(FileUtil.get_word_segment_dir(),
                                     "test-" + cid + "-seg-result.txt")
    with codecs.open(word_segment_file, "wb", "utf-8") as output_file:
        for barrage_seg in barrage_seg_list:
            for word_seg in barrage_seg.sentence_seg_list:
                output_file.write(word_seg.word + u"\t" + word_seg.flag +
                                  u"\n")
Example #12
0
def load_segment_barrages(cid):
    json_data = []
    file_path = FileUtil.get_word_segment_result_file_path(cid)
    with codecs.open(file_path, "rb", "utf-8") as input_file:
        for line in input_file:
            json_data.append(line)
    json_str = u"".join(json_data)
    barrage_seg_list_json = json.loads(json_str)
    barrage_seg_list = BarrageSeg.dict2barrageseglist(barrage_seg_list_json)
    return barrage_seg_list
Example #13
0
 def __save_time_window_info_to_file(cls, cid, time_window_list):
     file_path = os.path.join(FileUtil.get_zscore_dir(), str(cid) + "-time-window-info.txt")
     with codecs.open(file_path, "wb", "utf-8") as output_file:
         for time_window in time_window_list:
             time_window_info = unicode(str(time_window.time_window_index)) + u"\t" \
                                + DateTimeUtil.format_barrage_play_timestamp(time_window.start_timestamp) + u"\t" \
                                + DateTimeUtil.format_barrage_play_timestamp(time_window.end_timestamp) + u"\t" \
                                + unicode(str(time_window.barrage_count)) + u"\t" \
                                + unicode(str(time_window.valid_barrage_word_count)) + u"\n"
             output_file.write(time_window_info)
Example #14
0
def load_segment_barrages(cid):
    json_data = []
    file_path = FileUtil.get_word_segment_result_file_path(cid)
    with codecs.open(file_path, "rb", "utf-8") as input_file:
        for line in input_file:
            json_data.append(line)
    json_str = u"".join(json_data)
    barrage_seg_list_json = json.loads(json_str)
    barrage_seg_list = BarrageSeg.dict2barrageseglist(barrage_seg_list_json)
    return barrage_seg_list
Example #15
0
 def __save_similarity_matrix_to_local(cls, sim_matrix, time_window_index):
     matrix_file_name = os.path.join(
         FileUtil.get_similarity_matrix_dir(),
         "matrix-" + str(time_window_index) + ".txt")
     with codecs.open(matrix_file_name, "wb", "utf-8") as output_file:
         np.savetxt(fname=output_file,
                    X=sim_matrix,
                    fmt="%.2f",
                    delimiter="\t",
                    newline="\n")
Example #16
0
    def gen_corpus_info(cls, barrage_seg_list, cid):
        # Collect the segmented words of each barrage
        texts = []
        for barrage_seg in barrage_seg_list:
            text = []
            for word_seg in barrage_seg.sentence_seg_list:
                text.append(word_seg.word)
            texts.append(text)
        # Assign a numeric id to every word appearing in the texts
        dictionary = corpora.Dictionary(texts)
        # store the dictionary, for future reference
        dictionary.save(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-words.dict"))

        logging.debug(dictionary.token2id)
        # Build the corpus from the dictionary (each document becomes a list of (word id, count) pairs)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # store to disk, for later use
        corpora.MmCorpus.serialize(os.path.join(FileUtil.get_train_model_dir(), str(cid) + '-barrage-corpus.mm'),
                                   corpus)
        return corpus
Example #17
0
 def gen_sorted_zscore_file(self, threshold_value=1.0):
     sorted_zscore_file_name = FileUtil.get_cid_from_barrage_file_path(self.zscore_file_path) + "-sorted-zscore.txt"
     with codecs.open(sorted_zscore_file_name, "wb", "utf-8") as output_file:
         for time_window_index, zscore in self.zscore_list:
             if zscore < threshold_value:
                 continue
              total_seconds = time_window_index * self.slide_time_interval  # Second at which this time window starts
             zscore_info = unicode(str(time_window_index)) + u"\t" + unicode(
                 str(zscore)) + u"\t" + DateTimeUtil.format_barrage_play_timestamp(total_seconds) + u"\n"
             # logger.debug(zscore_info)
             output_file.write(zscore_info)
Example #18
0
def build_word2vec_model(barrage_corpus_dirname, barrage_corpus_file_type="txt"):
    train_sentences = TrainSentences(barrage_corpus_dirname, barrage_corpus_file_type)
    """
    min_count: One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus
    are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training
    on those words, so it’s best to ignore them, default 5
    size: Another parameter is the size of the NN layers, which correspond to the “degrees” of freedom the training
     algorithm has, default 100
    workers: training parallelization, to speed up training, default = 1 worker = no parallelization
    """
    model = gensim.models.Word2Vec(train_sentences, min_count=5, size=150, workers=multiprocessing.cpu_count())
    model.save(os.path.join(FileUtil.get_train_model_dir(), "barrage-corpusword2vec-model.txt"))
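
TrainSentences is not shown here. gensim's Word2Vec only needs an iterable that yields one token list per sentence, so a plausible, hypothetical stand-in (assuming one tab-separated barrage per line in each corpus file, as written by the corpus-generation code below) is:

# -*- coding: utf-8 -*-
import codecs
import glob
import os

class TrainSentences(object):
    # Hypothetical stand-in: stream token lists from a directory of corpus files.
    def __init__(self, corpus_dirname, file_type="txt"):
        self.file_paths = glob.glob(os.path.join(corpus_dirname, "*." + file_type))

    def __iter__(self):
        for file_path in self.file_paths:
            with codecs.open(file_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    words = line.strip().split(u"\t")
                    if words and words[0]:
                        yield words

Note that the size argument used above was renamed to vector_size in gensim 4.x.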
Example #19
0
def gen_corpus_words():
    barrage_corpus_files = glob.glob(os.path.join(FileUtil.get_corpus_dir(), "*.txt"))
    file_lists = [barrage_corpus_files[0: 501], barrage_corpus_files[501: 1001], barrage_corpus_files[1001: 1501],
                  barrage_corpus_files[1501: 2001], barrage_corpus_files[2001: 2501], barrage_corpus_files[2501: 3001],
                  barrage_corpus_files[3001: len(barrage_corpus_files)]]
    pools = Pool(7)
    file_index = 0
    for file_list in file_lists:
        file_index += 1
        pools.apply_async(__gen_corpus_words, args=(file_list, "all-corpus-" + str(file_index) + ".txt"))
    pools.close()
    pools.join()
Example #20
0
 def __save_high_emotion_clips_to_file(self, high_emotion_clips, global_zscore_threshold,
                                       left_zscore_threshold, right_zscore_threshould):
     file_path = os.path.join(FileUtil.get_zscore_dir(), self.cid + "-high-emotion-clips.txt")
     with codecs.open(file_path, "wb", "utf-8") as output_file:
         output_file.write(unicode(str(global_zscore_threshold)) + u"\t" +
                           unicode(str(left_zscore_threshold)) + u"\t" +
                           unicode(str(right_zscore_threshould)) + u"\n")
         for emotion_clip in high_emotion_clips:
             str_info = u""
             for item in emotion_clip:
                 str_info += (unicode(str(item)) + u"\t")
             str_info = str_info[0: len(str_info) - 1] + u"\n"
             output_file.write(str_info)
Example #21
0
    def gen_corpus_info(cls, barrage_seg_list, cid):
        # Collect the segmented words of each barrage
        texts = []
        for barrage_seg in barrage_seg_list:
            text = []
            for word_seg in barrage_seg.sentence_seg_list:
                text.append(word_seg.word)
            texts.append(text)
        # Assign a numeric id to every word appearing in the texts
        dictionary = corpora.Dictionary(texts)
        # store the dictionary, for future reference
        dictionary.save(
            os.path.join(FileUtil.get_train_model_dir(),
                         str(cid) + "-barrage-words.dict"))

        logging.debug(dictionary.token2id)
        # Build the corpus from the dictionary (each document becomes a list of (word id, count) pairs)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # store to disk, for later use
        corpora.MmCorpus.serialize(
            os.path.join(FileUtil.get_train_model_dir(),
                         str(cid) + '-barrage-corpus.mm'), corpus)
        return corpus
Example #22
0
def gen_sorted_barrage_file(barrage_file_path):
    barrages = get_barrage_from_txt_file(barrage_file_path)  # The barrages are already sorted in descending order.
    sorted_file_name = FileUtil.get_cid_from_barrage_file_path(barrage_file_path) + "-sorted.txt"
    with codecs.open(sorted_file_name, "wb", "utf-8") as output_file:
        for barrage in barrages:
            play_time_stamp = unicode(str(float(barrage.play_timestamp)))
            # barrage_str = DateTimeUtil.format_barrage_play_timestamp(play_time_stamp) + u"\t" + play_time_stamp \
            #               + u"\t" + barrage.type + u"\t" + barrage.font_size + u"\t" + barrage.font_color + u"\t" \
            #               + barrage.unix_timestamp + u"\t" + barrage.pool + u"\t" + barrage.sender_id + u"\t" \
            #               + barrage.row_id + u"\t" + barrage.content + u"\n"
            barrage_str = play_time_stamp + u"\t" + barrage.type + u"\t" + barrage.font_size + u"\t" \
                          + barrage.font_color + u"\t" + barrage.unix_timestamp + u"\t" + barrage.pool + u"\t" \
                          + barrage.sender_id + u"\t" + barrage.row_id + u"\t" + barrage.content + u"\n"
            output_file.write(barrage_str)
    return barrages
Example #23
0
 def build_dicts(cls):
      if not cls.__HAS_LOAD_USER_DICT:  # The user dictionary has not been loaded yet
          cls.__HAS_LOAD_USER_DICT = True
          # Load the custom barrage dictionary to improve segmentation of barrage-specific words and kaomoji
          jieba.load_userdict(os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt"))
          logging.debug(u"自定义弹幕词典加载成功!!!")
      # Initialize the stop-word list
      cls.__init_stopwords()
      # Initialize the replacement-word dictionary
      cls.__init_replace_words()
      # Initialize the accepted part-of-speech dictionary
      cls.__init_accept_nominal()
      # Initialize the emoji replacement dictionary
      cls.__init_emoji_replace_dict()
      # Initialize the rejected punctuation dictionary
      cls.__init_reject_punctuation_set()
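
jieba.load_userdict expects a plain-text dictionary with one entry per line in the form "word freq tag" (frequency and tag are optional). Entries can also be added programmatically, which is handy for quick experiments; the word below is illustrative, not taken from the project's dictionary file:

# -*- coding: utf-8 -*-
import jieba

jieba.add_word(u"弹幕", freq=1000, tag="n")   # same effect as one user-dictionary line
print("/".join(jieba.cut(u"弹幕数据没有更新")))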
Example #24
0
 def gen_sorted_zscore_file(self, threshold_value=1.0):
     sorted_zscore_file_name = FileUtil.get_cid_from_barrage_file_path(
         self.zscore_file_path) + "-sorted-zscore.txt"
     with codecs.open(sorted_zscore_file_name, "wb",
                      "utf-8") as output_file:
         for time_window_index, zscore in self.zscore_list:
             if zscore < threshold_value:
                 continue
              total_seconds = time_window_index * self.slide_time_interval  # Second at which this time window starts
             zscore_info = unicode(
                 str(time_window_index)) + u"\t" + unicode(
                     str(zscore)
                 ) + u"\t" + DateTimeUtil.format_barrage_play_timestamp(
                     total_seconds) + u"\n"
             # logger.debug(zscore_info)
             output_file.write(zscore_info)
Example #25
0
def gen_sorted_barrage_file(barrage_file_path):
    barrages = get_barrage_from_txt_file(barrage_file_path)  # The barrages are already sorted in descending order.
    sorted_file_name = FileUtil.get_cid_from_barrage_file_path(
        barrage_file_path) + "-sorted.txt"
    with codecs.open(sorted_file_name, "wb", "utf-8") as output_file:
        for barrage in barrages:
            play_time_stamp = unicode(str(float(barrage.play_timestamp)))
            # barrage_str = DateTimeUtil.format_barrage_play_timestamp(play_time_stamp) + u"\t" + play_time_stamp \
            #               + u"\t" + barrage.type + u"\t" + barrage.font_size + u"\t" + barrage.font_color + u"\t" \
            #               + barrage.unix_timestamp + u"\t" + barrage.pool + u"\t" + barrage.sender_id + u"\t" \
            #               + barrage.row_id + u"\t" + barrage.content + u"\n"
            barrage_str = play_time_stamp + u"\t" + barrage.type + u"\t" + barrage.font_size + u"\t" \
                          + barrage.font_color + u"\t" + barrage.unix_timestamp + u"\t" + barrage.pool + u"\t" \
                          + barrage.sender_id + u"\t" + barrage.row_id + u"\t" + barrage.content + u"\n"
            output_file.write(barrage_str)
    return barrages
Example #26
0
def get_parse_dict(danmaku_list):
    logging.info("Starting parsing sentences in Danmaku...")
    parse_dict = dict()
    jieba.load_userdict(constants.USER_DICT_PATH)
    emotion_dict_path = os.path.join(FileUtil.get_project_root_path(),
                                     "WordSegment", "emotion_dict.txt")
    emotion_dict = load_emotion_dict(emotion_dict_path)
    for danmaku in danmaku_list:
        rowId = danmaku.rowId
        if danmaku.content is not None:
            words = wordSegment(emotion_dict, danmaku.content)
            parse_dict[rowId] = words
        else:
            parse_dict[rowId] = None
    logging.info("parse dictionary has generated!")
    return parse_dict
Example #27
0
 def build_dicts(cls):
      if not cls.__HAS_LOAD_USER_DICT:  # The user dictionary has not been loaded yet
          cls.__HAS_LOAD_USER_DICT = True
          # Load the custom barrage dictionary to improve segmentation of barrage-specific words and kaomoji
          jieba.load_userdict(
              os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt"))
          logging.debug(u"自定义弹幕词典加载成功!!!")
      # Initialize the stop-word list
      cls.__init_stopwords()
      # Initialize the replacement-word dictionary
      cls.__init_replace_words()
      # Initialize the accepted part-of-speech dictionary
      cls.__init_accept_nominal()
      # Initialize the emoji replacement dictionary
      cls.__init_emoji_replace_dict()
      # Initialize the rejected punctuation dictionary
      cls.__init_reject_punctuation_set()
Example #28
0
 def __save_high_emotion_clips_to_file(self, high_emotion_clips,
                                       global_zscore_threshold,
                                       left_zscore_threshold,
                                       right_zscore_threshould):
     file_path = os.path.join(FileUtil.get_zscore_dir(),
                              self.cid + "-high-emotion-clips.txt")
     with codecs.open(file_path, "wb", "utf-8") as output_file:
         output_file.write(
             unicode(str(global_zscore_threshold)) + u"\t" +
             unicode(str(left_zscore_threshold)) + u"\t" +
             unicode(str(right_zscore_threshould)) + u"\n")
         for emotion_clip in high_emotion_clips:
             str_info = u""
             for item in emotion_clip:
                 str_info += (unicode(str(item)) + u"\t")
             str_info = str_info[0:len(str_info) - 1] + u"\n"
             output_file.write(str_info)
Example #29
0
def parse_barrage_xml_to_txt(xml_file_path):
    # Read the entire content of the xml file.
    with codecs.open(xml_file_path, "rb", "utf-8") as input_file:
        content = []
        for line in input_file:
            content.append(line)
    content = u"\n".join(content)
    # Fields in the p attribute: playback time, barrage type, font size, font color, unix timestamp, barrage pool, sender id, barrage row id
    pattern = re.compile(r'<d p="(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)">(.*?)</d>', re.S)
    barrages = re.findall(pattern, content)
    if len(barrages) <= 0:
        return None
    txt_file_name = FileUtil.get_cid_from_barrage_file_path(xml_file_path) + ".txt"
    with codecs.open(txt_file_name, "wb", "utf-8") as output_file:
        for barrage in barrages:
            output_file.write(u"\t".join(barrage) + u"\n")
    return barrages
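
The regular expression targets the Bilibili comment XML format, in which (as the regex assumes) every <d> element carries eight comma-separated fields in its p attribute and the comment text as the element body. The same data could also be extracted with the standard-library XML parser; a sketch:

import xml.etree.ElementTree as ElementTree

def parse_barrage_xml(xml_file_path):
    # Return (field_1, ..., field_8, content) tuples for every <d> element.
    tree = ElementTree.parse(xml_file_path)
    barrages = []
    for element in tree.iter("d"):
        fields = element.get("p", "").split(",")
        if len(fields) >= 8 and element.text is not None:
            barrages.append(tuple(fields[:8]) + (element.text,))
    return barrages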
Example #30
0
def gen_corpus_words():
    barrage_corpus_files = glob.glob(
        os.path.join(FileUtil.get_corpus_dir(), "*.txt"))
    file_lists = [
        barrage_corpus_files[0:501], barrage_corpus_files[501:1001],
        barrage_corpus_files[1001:1501], barrage_corpus_files[1501:2001],
        barrage_corpus_files[2001:2501], barrage_corpus_files[2501:3001],
        barrage_corpus_files[3001:len(barrage_corpus_files)]
    ]
    pools = Pool(7)
    file_index = 0
    for file_list in file_lists:
        file_index += 1
        pools.apply_async(__gen_corpus_words,
                          args=(file_list,
                                "all-corpus-" + str(file_index) + ".txt"))
    pools.close()
    pools.join()
Example #31
0
 def load_high_emotion_clips_from_file(cls, cid):
     file_path = os.path.join(FileUtil.get_zscore_dir(), cid + "-high-emotion-clips-lda.txt")
     first_line_flag = True
     high_emotion_clips = []
     global_zscore_threshold = 0
     left_zscore_threshold = 0
     right_zscore_threshould = 0
     with codecs.open(file_path, "rb", "utf-8") as input_file:
         for line in input_file:
             split_info = line.strip().split("\t")
             if first_line_flag:
                 first_line_flag = False
                 global_zscore_threshold = split_info[0]
                 left_zscore_threshold = split_info[1]
                 right_zscore_threshould = split_info[2]
                 continue
             high_emotion_clips.append(split_info)
     return high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould
Example #32
0
def build_word2vec_model(barrage_corpus_dirname,
                         barrage_corpus_file_type="txt"):
    train_sentences = TrainSentences(barrage_corpus_dirname,
                                     barrage_corpus_file_type)
    """
    min_count: One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus
    are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training
    on those words, so it’s best to ignore them, default 5
    size: Another parameter is the size of the NN layers, which correspond to the “degrees” of freedom the training
     algorithm has, default 100
    workers: training parallelization, to speed up training, default = 1 worker = no parallelization
    """
    model = gensim.models.Word2Vec(train_sentences,
                                   min_count=5,
                                   size=150,
                                   workers=multiprocessing.cpu_count())
    model.save(
        os.path.join(FileUtil.get_train_model_dir(),
                     "barrage-corpusword2vec-model.txt"))
Example #33
0
 def load_high_emotion_clips_from_file(cls, cid):
     file_path = os.path.join(FileUtil.get_zscore_dir(),
                              cid + "-high-emotion-clips-lda.txt")
     first_line_flag = True
     high_emotion_clips = []
     global_zscore_threshold = 0
     left_zscore_threshold = 0
     right_zscore_threshould = 0
     with codecs.open(file_path, "rb", "utf-8") as input_file:
         for line in input_file:
             split_info = line.strip().split("\t")
             if first_line_flag:
                 first_line_flag = False
                 global_zscore_threshold = split_info[0]
                 left_zscore_threshold = split_info[1]
                 right_zscore_threshould = split_info[2]
                 continue
             high_emotion_clips.append(split_info)
     return high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould
Example #34
0
def parse_barrage_xml_to_txt(xml_file_path):
    # Read the entire content of the xml file.
    with codecs.open(xml_file_path, "rb", "utf-8") as input_file:
        content = []
        for line in input_file:
            content.append(line)
    content = u"\n".join(content)
    # Fields in the p attribute: playback time, barrage type, font size, font color, unix timestamp, barrage pool, sender id, barrage row id
    pattern = re.compile(
        r'<d p="(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)">(.*?)</d>',
        re.S)
    barrages = re.findall(pattern, content)
    if len(barrages) <= 0:
        return None
    txt_file_name = FileUtil.get_cid_from_barrage_file_path(
        xml_file_path) + ".txt"
    with codecs.open(txt_file_name, "wb", "utf-8") as output_file:
        for barrage in barrages:
            output_file.write(u"\t".join(barrage) + u"\n")
    return barrages
Example #35
0
 def save_barrages_to_local(self, cid, row_barrages, is_corpus=False):
     barrage_count = len(row_barrages)
      if barrage_count <= 0:  # This check is for barrages that are about to be stored in the database.
         return
     barrage_file_path = FileUtil.get_barrage_file_path(cid, is_corpus)
     if is_corpus:
          if barrage_count < 100:  # Batches with fewer than 100 barrages are not kept as corpus data.
              return
          row_barrages = self.sort_barrages(row_barrages)
          # To be used as corpus data there should be at least one barrage per 10 seconds on average, so that the content stays coherent.
         try:
             total_seconds = float(row_barrages[-1][0].strip())
             if (total_seconds / 10) > barrage_count:
                 return
         except Exception as exception:
             print exception
             return
     with codecs.open(barrage_file_path, "ab", "utf-8") as output_file:
         for barrage in row_barrages:
             if barrage is not None:
                 output_file.write(u"\t".join(barrage) + u"\n")
Example #36
0
 def save_barrages_to_local(self, cid, row_barrages, is_corpus=False):
     barrage_count = len(row_barrages)
      if barrage_count <= 0:  # This check is for barrages that are about to be stored in the database.
         return
     barrage_file_path = FileUtil.get_barrage_file_path(cid, is_corpus)
     if is_corpus:
          if barrage_count < 100:  # Batches with fewer than 100 barrages are not kept as corpus data.
              return
          row_barrages = self.sort_barrages(row_barrages)
          # To be used as corpus data there should be at least one barrage per 10 seconds on average, so that the content stays coherent.
         try:
             total_seconds = float(row_barrages[-1][0].strip())
             if (total_seconds / 10) > barrage_count:
                 return
         except Exception as exception:
             print exception
             return
     with codecs.open(barrage_file_path, "ab", "utf-8") as output_file:
         for barrage in row_barrages:
             if barrage is not None:
                 output_file.write(u"\t".join(barrage) + u"\n")
Example #37
0
    def extend_emotion_dict(self):
        barrage_model = gensim.models.Word2Vec.load(os.path.join(FileUtil.get_train_model_dir(),
                                                                 "barrage-corpusword2vec-model.txt"))
        standard_word_dict = {}  # {word, (category, degree, level)}
        for category, word_set in self.emotion_dict.items():
            for word, emotion_degree, emotion_level in word_set:
                standard_word_dict[word] = (category, emotion_degree, emotion_level)
        extend_word_dict = {}  # {word, (category, degree, level)}
        for word, word_info in standard_word_dict.items():
            category = word_info[0]
            emotion_degree = word_info[1]
            emotion_level = word_info[2]
            try:
                similar_word_list = barrage_model.most_similar(positive=[word])
            except Exception as exception:
                logger.info(exception)
                continue
            for index in xrange(0, len(similar_word_list)):
                similar_word, similar = similar_word_list[index]

                similar_degree = float(emotion_degree) * similar
                similar_level = int(emotion_level)
                if (similar_word not in standard_word_dict.keys()) and (similar_word not in extend_word_dict.keys()):
                    extend_word_dict[similar_word] = (category, similar_degree, similar_level)
                elif similar_word in extend_word_dict.keys():
                    last_similar_degree = extend_word_dict[similar_word][1]
                    if last_similar_degree < similar_degree:
                        extend_word_dict[similar_word] = (category, similar_degree, similar_level)

        with codecs.open("extend-emotion-words.txt", "wb", "utf-8") as output_file:
            for word, word_info in extend_word_dict.items():
                category = word_info[0]
                degree = word_info[1]
                level = word_info[2]
                output_file.write(category + u"\t" + word + u"\t" + unicode(str(degree)) + u"\t" +
                                  unicode(str(level)) + u"\n")
Example #38
0
        first_line_flag = True
        high_emotion_clips = []
        global_zscore_threshold = 0
        left_zscore_threshold = 0
        right_zscore_threshould = 0
        with codecs.open(file_path, "rb", "utf-8") as input_file:
            for line in input_file:
                split_info = line.strip().split("\t")
                if first_line_flag:
                    first_line_flag = False
                    global_zscore_threshold = split_info[0]
                    left_zscore_threshold = split_info[1]
                    right_zscore_threshould = split_info[2]
                    continue
                high_emotion_clips.append(split_info)
        return high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould


if __name__ == "__main__":
    zscore = Zscore("2171229", os.path.join(FileUtil.get_zscore_dir(), "hd-zscore-result-lda.txt"), 30, 10, 4)
    # zscore.gen_sorted_zscore_file(threshold_value=5)
    # # zscore.gen_possible_high_emotion_clips()
    high_emotion_clips = zscore.gen_possible_high_emotion_clips()
    for emotion_clip in high_emotion_clips:
        str_info = u""
        for item in emotion_clip:
            str_info += (unicode(str(item)) + u"\t")
        str_info = str_info[0: len(str_info) - 1]
        str_info += u"\n"
        print str_info
Example #39
0
 def gen_lda_model(cls, corpus, cid):
     logging.debug(u"生成 lda 模型!!!")
     lda = models.LdaModel(corpus, num_topics=10)
     lda.save(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-lda.model"))
Example #40
0
 def gen_tfidf_model(cls, corpus, cid):
     # let’s initialize a tfidf transformation:
     logging.debug(u"生成 tfidf 模型!!!")
     tfidf = models.TfidfModel(corpus)
     tfidf.save(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-tfidf.model"))
Example #41
0
            for barrage_seg in barrage_seg_list:
                corpus_words = u""
                if len(barrage_seg.sentence_seg_list) <= 0:
                    continue  # All words of this barrage may have been filtered out, leaving nothing to keep.
                for word_seg in barrage_seg.sentence_seg_list:
                    corpus_words += (word_seg.word + u"\t")
                corpus_words = corpus_words[0: len(corpus_words) - 1] + u"\n"
                output_file.write(corpus_words)


# Build a word2vec model from the corpus
# Params: barrage_corpus_dirname - directory of the barrage corpus
#         barrage_corpus_file_type - file type of the corpus files
def build_word2vec_model(barrage_corpus_dirname, barrage_corpus_file_type="txt"):
    train_sentences = TrainSentences(barrage_corpus_dirname, barrage_corpus_file_type)
    """
    min_count: One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus
    are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training
    on those words, so it’s best to ignore them, default 5
    size: Another parameter is the size of the NN layers, which correspond to the “degrees” of freedom the training
     algorithm has, default 100
    workers: training parallelization, to speed up training, default = 1 worker = no parallelization
    """
    model = gensim.models.Word2Vec(train_sentences, min_count=5, size=150, workers=multiprocessing.cpu_count())
    model.save(os.path.join(FileUtil.get_train_model_dir(), "barrage-corpusword2vec-model.txt"))


if __name__ == "__main__":
    train_sentences = TrainSentences(FileUtil.get_corpus_dir())
    gen_corpus_words()
Example #42
0
    # Build the LDA model from the corpus
    @classmethod
    def gen_lda_model(cls, corpus, cid):
        logging.debug(u"生成 lda 模型!!!")
        lda = models.LdaModel(corpus, num_topics=10)
        lda.save(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-lda.model"))

    # Initialize all dictionary data.
    @classmethod
    def build_dicts(cls):
        if not cls.__HAS_LOAD_USER_DICT:  # The user dictionary has not been loaded yet
            cls.__HAS_LOAD_USER_DICT = True
            # Load the custom barrage dictionary to improve segmentation of barrage-specific words and kaomoji
            jieba.load_userdict(os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt"))
            logging.debug(u"自定义弹幕词典加载成功!!!")
        # Initialize the stop-word list
        cls.__init_stopwords()
        # Initialize the replacement-word dictionary
        cls.__init_replace_words()
        # Initialize the accepted part-of-speech dictionary
        cls.__init_accept_nominal()
        # Initialize the emoji replacement dictionary
        cls.__init_emoji_replace_dict()
        # Initialize the rejected punctuation dictionary
        cls.__init_reject_punctuation_set()


if __name__ == "__main__":
    lda = models.LdaModel.load(os.path.join(FileUtil.get_train_model_dir(), "9-barrage-lda.model"))
Example #43
0
def save_segment_barrages(barrage_seg_list, cid):
    save_file_path = FileUtil.get_word_segment_result_file_path(cid)
    json_str = json.dumps(barrage_seg_list, default=lambda obj: obj.__dict__)
    with codecs.open(save_file_path, "wb", "utf-8") as output_file:
        output_file.write(json_str)
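
default=lambda obj: obj.__dict__ makes json.dumps serialize the nested BarrageSeg objects as plain dictionaries, which is why load_segment_barrages (shown earlier) has to rebuild the objects via BarrageSeg.dict2barrageseglist. A minimal illustration of that round trip with a stand-in class:

import json

class Item(object):
    # Stand-in for BarrageSeg: any object whose state lives entirely in __dict__.
    def __init__(self, play_timestamp, content):
        self.play_timestamp = play_timestamp
        self.content = content

items = [Item(1.5, u"666"), Item(3.0, u"233")]
json_str = json.dumps(items, default=lambda obj: obj.__dict__)
# json.loads returns plain dicts, not Item objects, so callers must rebuild them:
restored = [Item(d["play_timestamp"], d["content"]) for d in json.loads(json_str)]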
Example #44
0
 def __save_similarity_matrix_to_local(cls, sim_matrix, time_window_index):
         matrix_file_name = os.path.join(FileUtil.get_similarity_matrix_dir(),
                                         "matrix-" + str(time_window_index) + ".txt")
         with codecs.open(matrix_file_name, "wb", "utf-8") as output_file:
             np.savetxt(fname=output_file, X=sim_matrix, fmt="%.2f", delimiter="\t", newline="\n")
Example #45
0
        logging.debug(u"生成 lda 模型!!!")
        lda = models.LdaModel(corpus, num_topics=10)
        lda.save(
            os.path.join(FileUtil.get_train_model_dir(),
                         str(cid) + "-barrage-lda.model"))

    # Initialize all dictionary data.
    @classmethod
    def build_dicts(cls):
        if not cls.__HAS_LOAD_USER_DICT:  # The user dictionary has not been loaded yet
            cls.__HAS_LOAD_USER_DICT = True
            # Load the custom barrage dictionary to improve segmentation of barrage-specific words and kaomoji
            jieba.load_userdict(
                os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt"))
            logging.debug(u"自定义弹幕词典加载成功!!!")
        # Initialize the stop-word list
        cls.__init_stopwords()
        # Initialize the replacement-word dictionary
        cls.__init_replace_words()
        # Initialize the accepted part-of-speech dictionary
        cls.__init_accept_nominal()
        # Initialize the emoji replacement dictionary
        cls.__init_emoji_replace_dict()
        # Initialize the rejected punctuation dictionary
        cls.__init_reject_punctuation_set()


if __name__ == "__main__":
    lda = models.LdaModel.load(
        os.path.join(FileUtil.get_train_model_dir(), "9-barrage-lda.model"))
Example #46
0
            # Create the Barrage object
            barrage = Barrage(play_timestamp=barrage_timestamp, sender_id=sender_name, content=content)
            barrages.append(barrage)
        # Map every sender name to a unique numeric id
        dictionary = corpora.Dictionary(sender_name_list)
        dictionary.save("live_sender_name.dict")
        # Then replace each barrage's sender name with the numeric id just generated
        for barrage in barrages:
            barrage.sender_id = str(dictionary.token2id[barrage.sender_id])
        return barrages


if __name__ == "__main__":
    # barrages = get_barrage_from_txt_file("../../data/local/9.txt")
    # file_path = FileUtil.get_word_segment_result_file_path("../../data/local/9.txt")
    # barrage_seg_list = wordseg.segment_barrages(barrages)
    # wordseg.save_segment_barrages(file_path, barrage_seg_list)
    # barrage_seg_list = wordseg.load_segment_barrages(file_path)
    # for barrage_seg in barrage_seg_list:
    #     print str(barrage_seg.play_timestamp), u"\t", u"\t".join([seg.word + u"\t" + seg.flag for seg
    #                                                               in barrage_seg.sentence_seg_list])

    gen_sorted_barrage_file(os.path.join(FileUtil.get_local_data_dir(), "2171229.txt"))

    # parse_barrage_xml_to_txt("4547002.xml")

    # barrages = get_barrage_from_live_text_file(os.path.join(FileUtil.get_project_root_path(), "data", "AlphaGo",
    #                                                         "bilibili", "2016-03-09.txt"))
    # for barrage in barrages:
    #     print barrage.play_timestamp, u"\t", barrage.sender_id, u"\t", barrage.content, u"\n"
Example #47
0
class DictConfig(object):
    __HAS_LOAD_USER_DICT = False  # Tracks whether the user-defined dictionary has been loaded

    # Stop-word dictionary data
    __STOP_WORDS = set([])  # The set of stop words
    # Paths of the stop-word dictionaries; users can add their own.
    __STOP_WORDS_PATH_SET = set([
        os.path.join(FileUtil.get_dict_dir(), "stopwords-zh-dict.txt"),
        os.path.join(FileUtil.get_dict_dir(), "stopwords-en-dict.txt")
    ])
    # Replacement-word dictionary data
    # The order of the replacement rules matters, hence a list: e.g. the rule for !{1,3} must come after the rule for !!!!+.
    __REPLACE_WORDS = []
    __REPLACE_WORDS_PATH_SET = set(
        [os.path.join(FileUtil.get_dict_dir(), "replace-dict.txt")])
    # Kaomoji/emoji replacement dictionary data
    __REPLACE_EMOJI = {}
    __REPLACE_EMOJI_PATH_SET = set(
        [os.path.join(FileUtil.get_dict_dir(), "emoji-dict.txt")])
    # Accepted part-of-speech dictionary ---- currently the code does not filter by part of speech
    __ACCEPT_NOMINAL = set([])
    __ACCEPT_NOMINAL_PATH_SET = set(
        [os.path.join(FileUtil.get_dict_dir(), "accept-nominal-dict.txt")])
    # Dictionary of single punctuation marks to reject
    __REJECT_PUNCTUATION = set([])
    __REJECT_PUNCTUATION_PATH_SET = set(
        [os.path.join(FileUtil.get_dict_dir(), "reject-punctuation-dict.txt")])
    # Degree-adverb dictionary (from HowNet data)
    __DEGREE_ADVERB = {}
    __DEGREE_ADVERB_PATH_SET = set(
        [os.path.join(FileUtil.get_dict_dir(), "degree-adverb-dict.txt")])
    # Negation-word dictionary
    __NEGATIVES = set([])
    __NEGATIVES_PATH_SET = set(
        [os.path.join(FileUtil.get_dict_dir(), "negatives-dict.txt")])
    # Emotion dictionary
    __EMOTION = {}  # Emotion dictionary format: {emotion category: (word, intensity, polarity)}
    __EMOTION_PATH_SET = set([
        os.path.join(FileUtil.get_dict_dir(), "emotion-extend-dict.txt"),
        os.path.join(FileUtil.get_dict_dir(), "emotion-dict.txt")
    ])

    @classmethod
    def get_stopwords_set(cls):
        return cls.__STOP_WORDS

    @classmethod
    def get_stopwords_dict_path_set(cls):
        return cls.__STOP_WORDS_PATH_SET

    @classmethod
    def get_replace_words_list(cls):
        return cls.__REPLACE_WORDS

    @classmethod
    def get_accept_nominal_set(cls):
        return cls.__ACCEPT_NOMINAL

    @classmethod
    def get_emoji_replace_dict(cls):
        return cls.__REPLACE_EMOJI

    @classmethod
    def get_reject_punctuation_dict(cls):
        return cls.__REJECT_PUNCTUATION

    @classmethod
    def get_degree_adverb_dict(cls):
        return cls.__DEGREE_ADVERB

    @classmethod
    def get_negatives_set(cls):
        return cls.__NEGATIVES

    # Load the emotion dictionary directly; format {emotion category: (word, intensity, polarity)}, used for sentiment analysis
    @classmethod
    def load_emotion_dict(cls):
        cls.__EMOTION = {}
        for emotion_dict_path in cls.__EMOTION_PATH_SET:
            with codecs.open(emotion_dict_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    split_info = line.strip().split(u"\t")
                    if len(split_info) < 4:
                        continue
                    category = split_info[0]  # Emotion category
                    word = split_info[1]  # Emotion word
                    degree = split_info[2]  # Emotion intensity
                    level = split_info[3]  # Emotion polarity
                    if category not in cls.__EMOTION.keys():
                        cls.__EMOTION[category] = set([(word, degree, level)])
                    else:
                        cls.__EMOTION[category].add((word, degree, level))
        return cls.__EMOTION

    # Initialize and fill the stop-word list.
    @classmethod
    def __init_stopwords(cls):
        if cls.__STOP_WORDS:
            return
        cls.__STOP_WORDS = set([" ", "\r", "\n", "\t"])
        for stopwords_dict_path in cls.__STOP_WORDS_PATH_SET:
            with codecs.open(stopwords_dict_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    stopwords = line.strip()
                    cls.__STOP_WORDS.add(stopwords)
        logging.debug(u"停用词词典构建完成!!!")

    @classmethod
    def __init_replace_words(cls):
        if cls.__REPLACE_WORDS:
            return
        for replace_words_path in cls.__REPLACE_WORDS_PATH_SET:
            with codecs.open(replace_words_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    split_info = line.strip().split("\t")
                    word_pattern = split_info[0]
                    replace_word = split_info[1]
                    replace_flag = split_info[2]  # Part of speech of the replacement; later used to filter digits and useless punctuation
                    cls.__REPLACE_WORDS.append(
                        (word_pattern, replace_word, replace_flag))
        logging.debug(u"替换词词典构建完成!!!")

    @classmethod
    def __init_accept_nominal(cls):
        if cls.__ACCEPT_NOMINAL:
            return
        for accept_nominal_path in cls.__ACCEPT_NOMINAL_PATH_SET:
            with codecs.open(accept_nominal_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    split_info = line.strip().split("\t")
                    accept_nominal = split_info[0]
                    cls.__ACCEPT_NOMINAL.add(accept_nominal)
        logging.debug(u"接受词性词典加载成功!!!")

    @classmethod
    def __init_emoji_replace_dict(cls):
        if cls.__REPLACE_EMOJI:
            return
        for emoji_dict_path in cls.__REPLACE_EMOJI_PATH_SET:
            with codecs.open(emoji_dict_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    split_info = line.strip().split("\t")
                    if len(split_info) < 2:
                        # An emoji entry normally has two required columns (the emoji and its replacement word); an optional third column is a description.
                        continue
                    # Emoji may repeat in the dictionary (they are too complex to fully check); the last definition wins.
                    emoji = split_info[0]
                    replace_word = split_info[1]
                    cls.__REPLACE_EMOJI[emoji] = replace_word
        logging.debug(u"emoji 替换词典加载完成!!!")

    # Load the dictionary of rejected single punctuation marks
    @classmethod
    def __init_reject_punctuation_set(cls):
        if cls.__REJECT_PUNCTUATION:
            return
        for reject_punctuation_path in cls.__REJECT_PUNCTUATION_PATH_SET:
            with codecs.open(reject_punctuation_path, "rb",
                             "utf-8") as input_file:
                for line in input_file:
                    punctuation = line.strip()
                    cls.__REJECT_PUNCTUATION.add(punctuation)
        logging.debug(u"弃用标点符号词典加载完成!!!")

    # Initialize the degree-adverb dictionary
    @classmethod
    def load_degree_adverb_dict(cls):
        cls.__DEGREE_ADVERB = {}
        for degree_adverb_path in cls.__DEGREE_ADVERB_PATH_SET:
            with codecs.open(degree_adverb_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    split_info = line.strip().split("\t")
                    degree_adverb = split_info[0]
                    score = split_info[1]
                    if degree_adverb not in cls.__DEGREE_ADVERB.keys():
                        cls.__DEGREE_ADVERB[degree_adverb] = float(score)
        logging.debug(u"程度副词词典加载完成!!!")
        return cls.__DEGREE_ADVERB

    # Initialize the negation-word dictionary
    @classmethod
    def load_negatives_set(cls):
        cls.__NEGATIVES = set([])
        for negatives_path in cls.__NEGATIVES_PATH_SET:
            with codecs.open(negatives_path, "rb", "utf-8") as input_file:
                for line in input_file:
                    negative = line.strip()
                    cls.__NEGATIVES.add(negative)
        logging.debug(u"否定词词典加载完成!!!")
        return cls.__NEGATIVES

    # Use all barrages of the experiment video v as the corpus, in preparation for training the tf-idf and LDA models.
    # Based on the segmented, stopword-filtered barrage_seg_list, assign a unique id to every word in the barrages.
    @classmethod
    def gen_corpus_info(cls, barrage_seg_list, cid):
        # Collect the segmented words of each barrage
        texts = []
        for barrage_seg in barrage_seg_list:
            text = []
            for word_seg in barrage_seg.sentence_seg_list:
                text.append(word_seg.word)
            texts.append(text)
        # Assign a numeric id to every word appearing in the texts
        dictionary = corpora.Dictionary(texts)
        # store the dictionary, for future reference
        dictionary.save(
            os.path.join(FileUtil.get_train_model_dir(),
                         str(cid) + "-barrage-words.dict"))

        logging.debug(dictionary.token2id)
        # Build the corpus from the dictionary (each document becomes a list of (word id, count) pairs)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # store to disk, for later use
        corpora.MmCorpus.serialize(
            os.path.join(FileUtil.get_train_model_dir(),
                         str(cid) + '-barrage-corpus.mm'), corpus)
        return corpus

    # Build the tf-idf model from the corpus
    @classmethod
    def gen_tfidf_model(cls, corpus, cid):
        # let’s initialize a tfidf transformation:
        logging.debug(u"生成 tfidf 模型!!!")
        tfidf = models.TfidfModel(corpus)
        tfidf.save(
            os.path.join(FileUtil.get_train_model_dir(),
                         str(cid) + "-barrage-tfidf.model"))

    # Build the LDA model from the corpus
    @classmethod
    def gen_lda_model(cls, corpus, cid):
        logging.debug(u"生成 lda 模型!!!")
        lda = models.LdaModel(corpus, num_topics=10)
        lda.save(
            os.path.join(FileUtil.get_train_model_dir(),
                         str(cid) + "-barrage-lda.model"))

    # Initialize all dictionary data.
    @classmethod
    def build_dicts(cls):
        if not cls.__HAS_LOAD_USER_DICT:  # The user dictionary has not been loaded yet
            cls.__HAS_LOAD_USER_DICT = True
            # Load the custom barrage dictionary to improve segmentation of barrage-specific words and kaomoji
            jieba.load_userdict(
                os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt"))
            logging.debug(u"自定义弹幕词典加载成功!!!")
        # Initialize the stop-word list
        cls.__init_stopwords()
        # Initialize the replacement-word dictionary
        cls.__init_replace_words()
        # Initialize the accepted part-of-speech dictionary
        cls.__init_accept_nominal()
        # Initialize the emoji replacement dictionary
        cls.__init_emoji_replace_dict()
        # Initialize the rejected punctuation dictionary
        cls.__init_reject_punctuation_set()
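
The replacement rules are kept in an ordered list of (word_pattern, replace_word, replace_flag) tuples because order matters (a rule for !!!!+ has to run before one for !{1,3}). Assuming the patterns are ordinary Python regular expressions, applying such a list to a barrage might look like the sketch below; the rules shown are illustrative, not taken from the project's replace-dict.txt:

# -*- coding: utf-8 -*-
import re

def apply_replace_words(content, replace_words):
    # Apply the ordered (pattern, replacement, flag) rules one after another.
    for word_pattern, replace_word, _replace_flag in replace_words:
        content = re.sub(word_pattern, replace_word, content)
    return content

rules = [(u"!!!!+", u"!!!", u"x"), (u"!{1,3}", u"!", u"x")]
print(apply_replace_words(u"233333!!!!!!", rules))   # -> 233333!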
Example #48
0
 def gen_lda_model(cls, corpus, cid):
     logging.debug(u"生成 lda 模型!!!")
     lda = models.LdaModel(corpus, num_topics=10)
     lda.save(
         os.path.join(FileUtil.get_train_model_dir(),
                      str(cid) + "-barrage-lda.model"))
Example #49
0
        left_zscore_threshold = 0
        right_zscore_threshould = 0
        with codecs.open(file_path, "rb", "utf-8") as input_file:
            for line in input_file:
                split_info = line.strip().split("\t")
                if first_line_flag:
                    first_line_flag = False
                    global_zscore_threshold = split_info[0]
                    left_zscore_threshold = split_info[1]
                    right_zscore_threshould = split_info[2]
                    continue
                high_emotion_clips.append(split_info)
        return high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould


if __name__ == "__main__":
    zscore = Zscore(
        "2171229",
        os.path.join(FileUtil.get_zscore_dir(), "hd-zscore-result-lda.txt"),
        30, 10, 4)
    # zscore.gen_sorted_zscore_file(threshold_value=5)
    # # zscore.gen_possible_high_emotion_clips()
    high_emotion_clips = zscore.gen_possible_high_emotion_clips()
    for emotion_clip in high_emotion_clips:
        str_info = u""
        for item in emotion_clip:
            str_info += (unicode(str(item)) + u"\t")
        str_info = str_info[0:len(str_info) - 1]
        str_info += u"\n"
        print str_info
Example #50
0
    parse_dict = dict()
    jieba.load_userdict(constants.USER_DICT_PATH)
    emotion_dict_path = os.path.join(FileUtil.get_project_root_path(),
                                     "WordSegment", "emotion_dict.txt")
    emotion_dict = load_emotion_dict(emotion_dict_path)
    for danmaku in danmaku_list:
        rowId = danmaku.rowId
        if danmaku.content is not None:
            words = wordSegment(emotion_dict, danmaku.content)
            parse_dict[rowId] = words
        else:
            parse_dict[rowId] = None
    logging.info("parse dictionary has generated!")
    return parse_dict


if __name__ == "__main__":
    # Test code
    danmaku_list = getDataSource(constants.DATASOURCE)
    emotion_dict_path = os.path.join(FileUtil.get_project_root_path(),
                                     "WordSegment", "emotion_dict.txt")
    emotion_dict = load_emotion_dict(emotion_dict_path)
    for (key, value_set) in emotion_dict.items():
        print key, u"\t", u"\t".join(value for value in value_set), u"\n"
    for danmaku in danmaku_list:
        if danmaku.content is None:
            continue
        words = wordSegment(emotion_dict, danmaku.content)
        for word in words:
            print word.content
Example #51
0
class EmployeeService:

    dept = Department()
    employee = Employee()
    dept_service = DepartmentService()
    fileUtil = FileUtil("employee.txt")
    #
    # fileUtil.objects = employees
    # fileUtil.construct_file_headers("ID", "First Name","Last Name")
    # fileUtil.construct_file()

    db = DbUtil("employee.db")

    def __init__(self):
        print("Default: Employee Service")

    def check_date_of_birth(self, dob):
        date_format = "%Y-%m-%d"
        try:
            yy, mm, dd = str(dob).split("-")

            dob_entered = date(int(yy), int(mm), int(dd))
            age = self.calculate_age(dob_entered)

            if age < 24:
                return False
            else:
                return True
        except ValueError:
            print(sys.exc_info()[1])

        # dob_entered = datetime.datetime.strptime(dob, date_format)

    def calculate_age(self, dob):
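        # The boolean ((today.month, today.day) < (dob.month, dob.day)) subtracts
        # one year when this year's birthday has not happened yet.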
        today = date.today()
        return today.year - dob.year - ((today.month, today.day) <
                                        (dob.month, dob.day))

    def create_employee_table(self):
        create_table_query = """
            CREATE table employee(
            emp_id varchar(10) primary key,
            fname varchar(50) not null,
            lname varchar(50) not null,
            dob date,
            dept_id varchar(10),
            FOREIGN KEY (dept_id) REFERENCES department(dept_id)            
            );
            """
        try:
            self.db.execute_query(create_table_query)
        except:
            print("Unable to create employee table \n")
            print(sys.exc_info()[1])
            print("\n")

    def save_employee(self, emp: Employee):

        dob = emp.dob
        if self.check_date_of_birth(dob):
            try:
                self.db.execute_dynamic_query(
                    "insert into employee (emp_id,fname,lname,dob,dept_id) values (?,?,?,?,?)",
                    emp.emp_id, emp.fname, emp.lname, emp.dob,
                    emp.dept.dept_id)
                self.db.connection.commit()

                print("Congrats : Employee - " + emp.fname +
                      " details saved \n")
            except sqlite3.IntegrityError:
                print(
                    "Sorry- Unable to save employee details. ID already exists \n"
                )
            except:
                print(
                    "Sorry- Unable to save employee details. Invalid Values entered \n"
                )
                print(sys.exc_info())

        else:
            print(
                "Oops !! - Employee - " + emp.fname +
                " is too young to be registered. To register, you must be at least 24 years old. \n"
            )

    def fetch_all_employees(self):

        employees = []
        query = "select * from employee"
        try:
            self.db.execute_query(query)

            query_result = self.db.fetch_all()

            for emp in query_result:
                self.employee = Employee()
                self.employee.emp_id = emp[0]
                self.employee.fname = emp[1]
                self.employee.lname = emp[2]
                self.employee.dob = emp[3]
                self.employee.dept = self.dept_service.fetch_department_by_id(
                    emp[4])
                employees.append(self.employee)
            print("Employee Details \n")
            print("******************************************\n")

            print("#Id" + "\t \t \t" + "DOB" + "\t\t\t" + "Department" +
                  "\t\t\t" + "fname" + "\t\t\t\t" + "lname" + "\n")
            print(
                "**************************************************************************************************** \n"
            )
            for emp in employees:
                print(emp.emp_id + "\t" + emp.dob + "\t" + emp.dept.dept_name +
                      "\t\t\t\t\t" + emp.fname + " " + emp.lname + "\n")

        except:
            print("Unable to fetch all employees \n")
            print(sys.exc_info()[1])
            print("\n")

        return employees

    def fetch_employee_by_id(self, eid: str):

        self.employee = Employee()
        try:
            self.db.execute_dynamic_query(
                "select * from employee where emp_id = ?", eid)
            query_result = self.db.fetch_one()
            self.employee.emp_id = query_result[0]
            self.employee.fname = query_result[1]
            self.employee.lname = query_result[2]
            self.employee.dob = query_result[3]
            self.employee.dept = self.dept_service.fetch_department_by_id(
                query_result[4])

            print("#Id" + "\t \t \t" + "DOB" + "\t\t\t" + "Department" +
                  "\t\t\t" + "fname" + "\t\t\t\t" + "lname" + "\n")
            print(
                "**************************************************************************************************** \n"
            )
            print(self.employee.emp_id + "\t" + self.employee.dob + "\t" +
                  self.employee.dept.dept_name + "\t\t\t\t\t" +
                  self.employee.fname + " " + self.employee.lname + "\n")
            return self.employee

        except:
            print("Unable to fetch Employee Id - " + eid +
                  " . Please enter correct employee ID.\n")
            print(sys.exc_info()[1])
            print("\n")

    def delete_employee(self, eid):
        try:
            emp_to_be_deleted = self.fetch_employee_by_id(eid)
            self.db.execute_dynamic_query(
                "delete from employee where emp_id=?", eid)
            print("Successfully Deleted Employee - " +
                  emp_to_be_deleted.fname + "\n")
            self.fetch_all_employees()
        except:
            print("Unable to delete Employee Id - " + eid +
                  ". Check employee Id.\n")
            print(sys.exc_info()[1])
            print("\n")

    def update_employee(self, emp: Employee):

        try:
            emp_result = self.fetch_employee_by_id(emp.emp_id)
            try:
                if emp == emp_result:
                    #
                    print("Successfully Updated Employee- " + emp.emp_id +
                          "\n")
                else:
                    date_util = DateUtil()
                    if date_util.check_date_of_birth(emp.dob):
                        self.db.execute_dynamic_query(
                            "update employee set fname = ?, lname =?, dob = ? , dept_id = ? where emp_id=?",
                            emp.fname, emp.lname, emp.dob, emp.dept.dept_id,
                            emp.emp_id)
                        self.db.connection.commit()
                        print("Successfully Updated Employee- " + emp.emp_id +
                              "\n")
                    else:
                        print("Sorry!! Unable to update Employee- " +
                              emp.emp_id + "\n")

            except AttributeError:

                if "fname" in str(sys.exc_info()[1]):
                    print("First Name cannot be null")
                elif "lname" in str(sys.exc_info()[1]):
                    print("Last Name cannot be null")
                elif "dob" in str(sys.exc_info()[1]):
                    print("Date of birth is not in correct format")
            except ValueError:
                print(sys.exc_info()[1])
        except:
            print("Unable to update employee Id - " + str(emp.emp_id) +
                  ". Check employee Id.\n")
            print(sys.exc_info())
            print("\n")
Ejemplo n.º 52
0
                                                          str(cid) + "-barrage-words.dict"))
        lda_model = models.LdaModel.load(os.path.join(FileUtil.get_train_model_dir(),
                                                      str(cid) + "-barrage-lda.model"))
        time_window_list = TimeWindow.gen_time_window_barrage_info(barrage_seg_list, cid)
        for time_window in time_window_list:
            time_window.gen_user_word_frequency()  # generate per-user word-frequency info within this time window.
            time_window.gen_user_topic_lda(dictionary, lda_model)  # generate per-user LDA topic info for the words sent within this time window.
        return time_window_list


if __name__ == "__main__":
    barrage_file_path = "../../data/local/9.txt"
    # "../../data/local/9.txt" "../../data/AlphaGo/bilibili/2016-03-09.txt" "../../data/local/2065063.txt"
    barrages = dataloader.get_barrage_from_txt_file(barrage_file_path)
    # barrages = dataloader.get_barrage_from_live_text_file(barrage_file_path)
    cid = FileUtil.get_cid_from_barrage_file_path(barrage_file_path)
    barrage_seg_list = wordseg.segment_barrages(barrages, cid)
    # time_window_list = TimeWindow.gen_time_window_barrage_info(barrage_seg_list, cid)
    # for time_window in time_window_list:
    #     str_info = ''
    #     for barrage_seg in time_window.barrage_or_seg_list:
    #         for sentence_seg in barrage_seg.sentence_seg_list:
    #             str_info += (sentence_seg.word + sentence_seg.flag + u"\t")
    #     print str(time_window.time_window_index), u"\t", str(time_window.start_timestamp), u"\t",\
    #         str(time_window.end_timestamp), u"\t", str_info

    # time_window_list = TimeWindow.gen_user_word_frequency_by_time_window(barrage_seg_list)
    # with codecs.open(FileUtil.get_word_segment_result_file_path(cid), "wb", "utf-8") as output_file:
    #     for time_window in time_window_list:
    #         str_info = str(time_window.time_window_index) + u"\t"
    #         for user_id, word_frequency in time_window.user_word_frequency_dict.items():
Ejemplo n.º 53
0
                              content=content)
            barrages.append(barrage)
        # map each user's name to a unique numeric id
        dictionary = corpora.Dictionary(sender_name_list)
        dictionary.save("live_sender_name.dict")
        # then replace each barrage's sender name with the numeric id just generated
        for barrage in barrages:
            barrage.sender_id = str(dictionary.token2id[barrage.sender_id])
        return barrages
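
A small hedged sketch of the reverse lookup: gensim's Dictionary supports indexing an id back to its token, so the original sender name can be recovered from the numeric id assigned above (the helper name is illustrative):

from gensim import corpora

def sender_name_from_id(sender_id):
    dictionary = corpora.Dictionary.load("live_sender_name.dict")  # saved above
    return dictionary[int(sender_id)]  # numeric id -> original sender name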


if __name__ == "__main__":
    # barrages = get_barrage_from_txt_file("../../data/local/9.txt")
    # file_path = FileUtil.get_word_segment_result_file_path("../../data/local/9.txt")
    # barrage_seg_list = wordseg.segment_barrages(barrages)
    # wordseg.save_segment_barrages(file_path, barrage_seg_list)
    # barrage_seg_list = wordseg.load_segment_barrages(file_path)
    # for barrage_seg in barrage_seg_list:
    #     print str(barrage_seg.play_timestamp), u"\t", u"\t".join([seg.word + u"\t" + seg.flag for seg
    #                                                               in barrage_seg.sentence_seg_list])

    gen_sorted_barrage_file(
        os.path.join(FileUtil.get_local_data_dir(), "2171229.txt"))

    # parse_barrage_xml_to_txt("4547002.xml")

    # barrages = get_barrage_from_live_text_file(os.path.join(FileUtil.get_project_root_path(), "data", "AlphaGo",
    #                                                         "bilibili", "2016-03-09.txt"))
    # for barrage in barrages:
    #     print barrage.play_timestamp, u"\t", barrage.sender_id, u"\t", barrage.content, u"\n"