Example #1
def keyword_location(file_paht_list):
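    # Count, per location, how many posts mention each keyword in the
    # module-level keywords_list; posts with fewer than 5 jieba tokens are
    # dismissed. getLocation, getKeywordList, getText and filer are helpers
    # defined elsewhere in the module.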
    dismiss_count = 0

    output_dic = {}

    for current_file in file_paht_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                location = getLocation(line_section)
                keyword_list = getKeywordList(line_section)
                current_text = getText(line_section)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if (len(word_list) < 5):
                    dismiss_count += 1
                    continue

                if (location not in output_dic.keys()):
                    print(location)
                    output_dic[location] = [
                        0 for i in range(len(keywords_list))
                    ]
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    output_dic[location][keywords_list.index(keyword)] += 1
            e_t = time()
        print(output_dic)
        print('dismiss count: ' + str(dismiss_count))
        print('current collection process time: ' + str(e_t - s_t))
Example #2
def topic_friends(file_paht_list):
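    # Build, for every friends value, a count vector over the module-level
    # topic list and pickle the result to data/topic_friends.pickle.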
    dismiss_count = 0

    output_dic = {}

    for current_file in file_paht_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                friends = getFriends(line_section)
                current_text = getText(line_section)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if (len(word_list) < 5):
                    dismiss_count += 1
                    continue
                current_topic = getTopic(current_text)
                if (str(friends) not in output_dic.keys()):
                    print(friends)
                    output_dic[str(friends)] = [0 for i in range(len(topic))]
                output_dic[str(friends)][topic.index(current_topic)] += 1
        e_t = time()
        print(output_dic)
        print('dismiss count: ' + str(dismiss_count))
        print('current collection process time: ' + str(e_t - s_t))
    # save the result
    with open('data/topic_friends.pickle', 'wb') as output_file:
        pickle.dump(output_dic, output_file)
Example #3
def topic_keyword(file_paht_list):
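    # Count, for every keyword, how many posts fall into each topic returned
    # by getTopic; posts shorter than 5 tokens are dismissed.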
    dismiss_count = 0
    output_dic = {}

    for current_file in file_paht_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                keyword_list = getKeywordList(line_section)
                # print(keyword_list)
                current_text = getText(line_section)

                # dismiss weibo posts that are too short (< 5 tokens)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if (len(word_list) < 5):
                    dismiss_count += 1
                    continue

                current_topic = getTopic(current_text)
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if (keyword not in output_dic.keys()):
                        print(keyword)
                        output_dic[keyword] = [0 for i in range(len(topic))]
                    output_dic[keyword][topic.index(current_topic)] += 1
            e_t = time()
            print('current file process time: ' + str(e_t - s_t))
        print(output_dic)
        print('dismiss count: ' + str(dismiss_count))
Example #4
def keyword_emotion(file_paht_list):
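    # Tally mood labels (getMood) per keyword across all input files.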
    output_dic = {}
    for current_keyword in keywords_list:
        output_dic[current_keyword] = {}
    for current_file in file_paht_list:
        print(current_file)
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                current_text = getText(line_section)
                # dismiss weibo posts that are too short (< 5 tokens)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if (len(word_list) < 5):
                    continue
                mood = getMood(line_section)
                keyword_list = getKeywordList(line_section)

                # count mood occurrences for every keyword
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if (str(mood) not in output_dic[keyword].keys()):
                        output_dic[keyword][str(mood)] = 0
                    output_dic[keyword][str(mood)] += 1

            e_t = time()
            print('current file process time: ' + str(e_t - s_t))
            print(output_dic)
    return output_dic
Example #5
def extract_word_freq(file_folder_path):
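    # Compute per-city word frequencies for one keyword folder (only '健康'
    # is processed), removing stop words and the keyword's own tokens;
    # FreqDist results are merged in batches of 10000 lines.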
    out_folder = 'data/location_wordfreq/'
    jieba.load_userdict("data/user_dict.txt")
    stop_word = []

    with open('data/stop_word.txt', 'r', encoding='utf-8') as sw_f:
        for item in sw_f:
            stop_word.append(item.strip())
    current_keyword = file_folder_path.split('/')[-1]
    if (current_keyword != '健康'):
        return
    print(current_keyword)
    current_keyword_cut_list = current_keyword.split(',')
    keyword_ban_list = [ban_word for ban_word in current_keyword_cut_list]
    for ban_word in current_keyword_cut_list:
        ban_word_list = jieba.cut(ban_word)
        for cut_ban_word in ban_word_list:
            if (cut_ban_word not in keyword_ban_list):
                keyword_ban_list.append(cut_ban_word)
    print(keyword_ban_list)
    file_name_list = os.listdir(file_folder_path)
    temp_list = [
        file_folder_path + '/' + filename for filename in file_name_list
    ]
    for current_file in temp_list:
        #xxx.txt
        current_city_filename = current_file.split('/')[-1]
        with open(current_file, 'r', encoding='utf-8') as file:
            keyword_out_dic = {}
            weibo_cut_list = []
            line_count = 0
            for temp_line in file:
                weibo_origin = filer.filer(temp_line).replace('/', '')
                weibo_cut = list(jieba.cut(weibo_origin))
                for items in weibo_cut:
                    if ((items not in stop_word)
                            and (items not in keyword_ban_list)
                            and len(items.strip()) > 0):
                        print(items)
                        weibo_cut_list.append(items)
                line_count += 1
                if (line_count >= 10000):
                    fd = FreqDist(weibo_cut_list)  # word-frequency statistics from nltk
                    keyword_out_dic = merege_iter(keyword_out_dic, fd)
                    line_count = 0
                    weibo_cut_list = []

            if (line_count > 0):
                fd = FreqDist(weibo_cut_list)  # word-frequency statistics from nltk
                keyword_out_dic = merege_iter(keyword_out_dic, fd)
                line_count = 0
                weibo_cut_list = []

            sort_dict = sorted(keyword_out_dic.items(),
                               key=lambda d: d[1],
                               reverse=True)

            temp_count = 0
Example #6
    def __init__(self, cfg, logger, job):
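        # Keep config, logger and job, then open this job's history file
        # (<run_dir>/<name>__<env>.job_history) through the filer helper.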

        self.starttime = time.time()
        self.cfg = cfg
        self.logger = logger
        self.job = job

        self.filename = '%s/%s__%s.job_history' % (cfg.run_dir, job['name'], job['env'])
        
        self.file = filer(cfg, logger, self.filename)
Example #7
    def __init__(self, cfg, logger, job):

        self.starttime = time.time()
        self.cfg = cfg
        self.logger = logger
        self.job = job

        self.filename = '%s/%s__%s.job_history' % (cfg.run_dir, job['name'],
                                                   job['env'])

        self.file = filer(cfg, logger, self.filename)
Example #8
def keyword_mentionrepost(file_paht_list):
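    # Accumulate @-mention counts, repost ('//') counts and post totals per
    # keyword in all_keywords_list; posts shorter than 5 tokens are dismissed.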
    dismiss_count = 0
    all_keywords_list = [
        '健康', '事业有成', '发展机会', '生活幸福', '有房', '出名', '家庭幸福', '好工作', '平等机会',
        '白手起家', '成为富人', '个体自由', '安享晚年', '收入足够', '个人努力', '祖国强大', '中国经济持续发展',
        '父辈更好'
    ]

    output_dic = {}
    output_dic['at'] = [0 for i in all_keywords_list]
    output_dic['at_total'] = [0 for i in all_keywords_list]
    output_dic['repost'] = [0 for i in all_keywords_list]
    output_dic['repost_total'] = [0 for i in all_keywords_list]

    print(output_dic)
    for current_file in file_paht_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                location = getLocation(line_section)
                keyword_list = getKeywordList(line_section)
                current_text = getText(line_section)

                current_text = filer.filer(current_text)

                # count the @-mention rate and repost rate first, then remove the @ symbols
                current_at = current_text.count('@')
                current_repost = current_text.count('//')

                current_text = remove_at(current_text)

                word_list = [word for word in jieba.cut(current_text)]
                if (len(word_list) < 5):
                    dismiss_count += 1
                    continue

                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    output_dic['at'][all_keywords_list.index(
                        keyword)] += current_at
                    output_dic['at_total'][all_keywords_list.index(
                        keyword)] += 1
                    output_dic['repost'][all_keywords_list.index(
                        keyword)] += current_repost
                    output_dic['repost_total'][all_keywords_list.index(
                        keyword)] += 1
            e_t = time()
        print(output_dic)
        print('dismiss count: ' + str(dismiss_count))
        print('current collection process time: ' + str(e_t - s_t))
Example #9
def keyword_time(file_paht_list):
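    # For every input file (keyed by the time string in its file name), count
    # keyword occurrences broken down by the provinces/regions in city_list.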
    city_list = [
        '北京', '天津', '内蒙古', '新疆', '河北', '甘肃', '宁夏', '山西', '陕西', '青海', '山东',
        '河南', '安徽', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '上海', '湖北', '湖南', '四川',
        '重庆', '贵州', '云南', '广西', '江西', '福建', '广东', '海南', '西藏', '台湾', '香港', '澳门',
        '海外', '其他'
    ]
    output_dic = {}
    for current_file in file_paht_list:
        print(current_file)
        with open(current_file, 'r', encoding='utf-8') as f:
            current_time = current_file.split('/')[-1].split('.')[0]
            print(current_time)
            if (current_time in output_dic.keys()):
                continue
            output_dic[current_time] = {}
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                current_text = getText(line_section)
                # dismiss weibo posts that are too short (< 5 tokens)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if (len(word_list) < 5):
                    continue

                location = getLocation(line_section)
                if (len(location) == 0):
                    continue
                current_city_list = location.split()
                if (len(current_city_list) == 0):
                    continue
                current_city = current_city_list[0]
                keyword_list = getKeywordList(line_section)

                # count every keyword occurrence by city
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if (keyword not in output_dic[current_time].keys()):
                        print(keyword)
                        output_dic[current_time][keyword] = [
                            0 for i in range(len(city_list))
                        ]
                    output_dic[current_time][keyword][city_list.index(
                        current_city)] += 1

            e_t = time()
            print('current file process time: ' + str(e_t - s_t))
            print(output_dic)
Example #10
def predict(weibo):
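    # Naive-Bayes style scoring: start from the values in pre_probability_dict,
    # multiply in the per-word category probabilities, and return the category
    # with the highest score; posts with fewer than 5 tokens return -1.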
    # print(weibo)
    weibo = filer.filer(weibo)
    word_list = [word for word in jieba.cut(weibo)]
    if len(word_list) < 5:
        return -1

    result_list = list(pre_probability_dict.values())
    for word in word_list:
        if word in pre_category_probability_dict:
            for i in range(0, len(category_list)):
                result_list[i] *= pre_category_probability_dict[word][i]
    max_category = 0
    for i in range(0, len(category_list)):
        if result_list[i] > result_list[max_category]:
            max_category = i
    return category_list[max_category]
Example #11
def keyword_percent(file_paht_list):
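    # Count keyword occurrences over all posts (output_dic_all) and over posts
    # with at least 5 tokens (output_dic_gt5) so the two can be compared.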
    dismiss_count = 0
    output_dic_gt5 = {}
    output_dic_all = {}

    for current_file in file_paht_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')

                current_text = getText(line_section)
                keyword_list = getKeywordList(line_section)
                # dismiss weibo posts that are too short (< 5 tokens)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]

                # count all posts, regardless of length
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if (keyword not in output_dic_all.keys()):
                        print(keyword)
                        output_dic_all[keyword] = 0
                    output_dic_all[keyword] += 1

                if (len(word_list) < 5):
                    dismiss_count += 1
                    continue

                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if (keyword not in output_dic_gt5.keys()):
                        print(keyword)
                        output_dic_gt5[keyword] = 0
                    output_dic_gt5[keyword] += 1
            e_t = time()
            print('all')
            print(output_dic_all)
            print('greater than 5')
            print(output_dic_gt5)
            print('dismiss count: ' + str(dismiss_count))
            print('current collection process time: ' + str(e_t - s_t))
Example #12
def topic_percent(file_paht_list):
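    # Count how many posts fall into each topic; short posts are dismissed.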
    dismiss_count = 0
    output_list = [0 for i in range(len(topic))]
    for current_file in file_paht_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')

                current_text = getText(line_section)
                # dismiss weibo posts that are too short (< 5 tokens)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if (len(word_list) < 5):
                    dismiss_count += 1
                    continue
                current_topic = getTopic(current_text)
                output_list[topic.index(current_topic)] += 1
            e_t = time()
            print(output_list)
            print('dismiss count: ' + str(dismiss_count))
            print('current collection process time: ' + str(e_t - s_t))
Example #13
		formatter_class=argparse.RawDescriptionHelpFormatter,
		description=DESC,
		epilog=textwrap.dedent(EPILOG)
	)
	parser.add_argument("-c", "--config", type=str, metavar="config", help="job_tracker config file", default="job_tracker.cfg")
	parser.add_argument("-j", "--job", type=str, metavar="job", help="Job to analyze", required=True)
	parser.add_argument("-e", "--env", type=str, metavar="env", choices=['P', 'I'], default='I', help="Environment")

	# parse the args and call the appropriate command function
	args = parser.parse_args()

	cfg = setup_config()
	cfg.load_config(args.config)
	logger = logging.getLogger("/tmp/jt-analyzer.log")
	file_name = '%s/%s__%s.job_history' % (cfg.run_dir, args.job, args.env)
	file_handler = filer(cfg, logger, file_name)

	print "\nGetting log entries from %s\n" % file_name
	print "FROM                - UNTIL    : RESULT\n--------------------------------------------------------------"

	job_history = file_handler.read_content()

	for item in job_history:

		# item = dict(eval(item.rstrip()))
		# print item
		# item = dict(eval(item))
		item['epoche_until'] = convert_epoche_to_timestamp(item['epoche_until'])
		item['epoche_from'] = convert_epoche_to_timestamp(item['epoche_from'])
		
		if item['epoche_from'] == item['epoche_until']:
Example #14
    parser.add_argument("-e",
                        "--env",
                        type=str,
                        metavar="env",
                        choices=['P', 'I'],
                        default='I',
                        help="Environment")

    # parse the args and call the appropriate command function
    args = parser.parse_args()

    cfg = setup_config()
    cfg.load_config(args.config)
    logger = logging.getLogger("/tmp/jt-analyzer.log")
    file_name = '%s/%s__%s.job_history' % (cfg.run_dir, args.job, args.env)
    file_handler = filer(cfg, logger, file_name)

    print "\nGetting log entries from %s\n" % file_name
    print "FROM                - UNTIL    : RESULT\n--------------------------------------------------------------"

    job_history = file_handler.read_content()

    for item in job_history:

        # item = dict(eval(item.rstrip()))
        # print item
        # item = dict(eval(item))
        item['epoche_until'] = convert_epoche_to_timestamp(
            item['epoche_until'])
        item['epoche_from'] = convert_epoche_to_timestamp(item['epoche_from'])
Example #15
def year_keyword_location_lda(current_year, current_keyword, mood_list='all'):
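    # Build a per-city corpus for one keyword and year (the mood_list == 'all'
    # case), train a 100-topic LDA model on TF-IDF weighted bags of words, and
    # write the topic list and per-city topic distributions to disk.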
    jieba.load_userdict("data/user_dict.txt")
    stop_word = []

    with open('data/stop_word.txt', 'r', encoding='utf-8') as sw_f:
        for item in sw_f:
            stop_word.append(item.strip())
    current_keyword_cut_list = current_keyword.split(',')
    current_keyword_banned_list = []
    # segment every keyword variant fully
    for temp1 in current_keyword_cut_list:
        cut_list = jieba.cut(temp1)
        for temp2 in cut_list:
            current_keyword_banned_list.append(temp2)

    time_keyword_folder = '/Volumes/data/chinadream/time_keyword_emotion_location'
    timemonth_list = os.listdir(time_keyword_folder)
    current_year_timemonth_list = []
    for current_timemonth in timemonth_list:
        if (current_timemonth[0:4] == str(current_year)):
            current_year_timemonth_list.append(current_timemonth)

    corpus_text = []
    corpus_city = {}
    count = 0
    print('开始抽取语料')
    for current_timemonth in current_year_timemonth_list:
        if (mood_list == 'all'):
            current_mood_path = time_keyword_folder + '/' + current_timemonth
            current_mood_list = os.listdir(current_mood_path)
            for current_mood in current_mood_list:
                if (current_mood == '.DS_Store'):
                    continue
                current_folder_keyword_list = os.listdir(current_mood_path +
                                                         '/' +
                                                         str(current_mood))
                if (current_keyword not in current_folder_keyword_list):
                    continue

                current_city_path = current_mood_path + '/' + str(
                    current_mood) + '/' + current_keyword
                current_city_list = os.listdir(current_city_path)

                for current_city in current_city_list:
                    if (current_city == '.DS_Store'):
                        continue
                    if (current_city in corpus_city):
                        city_index = corpus_city[current_city]
                        origin_text = corpus_text[city_index]
                    else:
                        origin_text = []

                    current_city_file_path = current_city_path + '/' + current_city
                    for temp_line in open(current_city_file_path):
                        weibo_origin = filer.filer(temp_line).replace('/', '')
                        if (len(weibo_origin) == 0):
                            continue
                        weibo_cut = list(jieba.cut(weibo_origin))
                        weibo_cut_list = []
                        for items in weibo_cut:
                            if (items not in stop_word
                                    and len(items.strip()) > 0):
                                if (items in current_keyword_banned_list):
                                    continue
                                weibo_cut_list.append(items)
                        if (len(weibo_cut_list) < 5):
                            continue
                        for current_cut in weibo_cut_list:
                            origin_text.append(current_cut)
                    # no qualifying corpus was extracted for this city
                    if (len(origin_text) == 0):
                        continue
                    if (current_city in corpus_city):
                        # update the corpus for this city
                        corpus_text[city_index] = origin_text
                    else:
                        # add the corpus and update the index
                        corpus_text.append(origin_text)
                        corpus_city[current_city] = count
                        count += 1
    # free memory
    del origin_text
    gc.collect()

    frequency = defaultdict(int)
    for city_file in corpus_text:
        for token in city_file:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 3]
             for text in corpus_text]

    word_count_dict = corpora.Dictionary(texts)

    corpus = [word_count_dict.doc2bow(text) for text in texts]
    print('计算tfidf')
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    del tfidf
    gc.collect()

    print('开始LDA模型构建')
    lda = LdaModel(corpus=corpus_tfidf,
                   id2word=word_count_dict,
                   num_topics=100)
    output_model_folder = 'data/year_keyword_location_lda/model/' + str(
        current_year) + '_' + current_keyword + '_' + mood_list
    if (not os.path.exists(output_model_folder)):
        os.makedirs(output_model_folder)
    model_file = output_model_folder + '/' + str(
        current_year) + '_' + current_keyword + '_' + mood_list + '_lda.model'
    dictionary_file = output_model_folder + '/' + str(
        current_year) + '_' + current_keyword + '_' + mood_list + '_lda.dict'
    corpus_file = output_model_folder + '/' + str(
        current_year) + '_' + current_keyword + '_' + mood_list + '_lda.mm'
    word_count_dict.save(dictionary_file)
    corpora.MmCorpus.serialize(corpus_file, corpus)
    lda.save(model_file)

    # db = conntoMongoKeywordLocation_topic()
    # current_collection = db[collection_name_1]
    # data_toinsert = {
    #     'topic_distri':'1',
    #     'keyword': current_keyword,
    #     'all_topic': str(lda.print_topics(-1))
    # }
    # result = current_collection.insert_one(data_toinsert)

    # write to file
    output_folder = 'result/year_keyword_location_lda/' + str(
        current_year) + '_' + current_keyword + '_' + mood_list
    if (not os.path.exists(output_folder)):
        os.makedirs(output_folder)

    # 20 topics by default
    lda_topic_list = lda.print_topics()
    with open(output_folder + '/lda_topics.txt', 'a+',
              encoding='utf-8') as output_file:
        output_file.write(
            str(current_year) + '_' + current_keyword + '_' + mood_list)
        output_file.write('\n')
        for current_topic in lda_topic_list:
            output_file.write(str(current_topic))
            output_file.write('\n')

    for city_name, city_index in corpus_city.items():
        city_corpus = corpus[city_index]
        doc_lda = lda.get_document_topics(city_corpus)  # topic distribution for this city's document
        # print(doc_lda)

        # db = conntoMongoKeywordLocation()
        # collection_name_1 = str(current_year) + '_' + current_keyword + '_' + mood_list
        # current_collection = db[collection_name_1]
        # data_toinsert = {
        #     'city': city_name,
        #     'topic_distribution': str(doc_lda)
        # }
        # result = current_collection.insert_one(data_toinsert)

        with open(output_folder + '/city_topics.txt', 'a+',
                  encoding='utf-8') as output_file:
            output_city_name = city_name.split('.')[0]
            output_file.write(output_city_name)
            output_file.write('\n')
            output_file.write(str(doc_lda))
            output_file.write('\n')

    return
Example #16
def keyword_location_lda(mongo_server='127.0.0.1'):
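    # For every keyword folder, sample up to 5000 weibo lines per city
    # (skipping posts on the '娱乐' topic), train an 8-topic LDA model on
    # TF-IDF weighted bags of words, and store the per-city topic
    # distributions in MongoDB.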
    jieba.load_userdict("data/user_dict.txt")
    stop_word = []
    weibocityfilefolder = '/Volumes/data/chinadream/city/'

    # keyword_finished = []
    # db = conntoMongoKeywordLocation_topic()
    # for keyword_result in db['topic'].find():
    #     keyword_finished.append(keyword_result['keyword'])

    with open('data/stop_word.txt', 'r', encoding='utf-8') as sw_f:
        for item in sw_f:
            stop_word.append(item.strip())

    keyword_folder = '/Volumes/data/chinadream/keyword_location/'
    folderlist = os.listdir(keyword_folder)
    for current_keyword in folderlist:
        # if(current_keyword in keyword_finished):
        #     continue
        print(current_keyword)
        current_keyword_folder = keyword_folder + current_keyword + '/'
        current_keyword_cut_list = current_keyword.split(',')
        current_keyword_banned_list = []
        if (current_keyword == '个人努力' or current_keyword == '健康'
                or (not os.listdir(current_keyword_folder))):
            continue
        # segment every keyword variant fully
        for temp1 in current_keyword_cut_list:
            cut_list = jieba.cut(temp1)
            for temp2 in cut_list:
                current_keyword_banned_list.append(temp2)

        current_city_file_list = os.listdir(current_keyword_folder)

        # each list in corpus_text holds all the texts of one city
        corpus_text = []
        corpus_city = {}
        count = 0
        for current_city_file in current_city_file_list:
            print(current_city_file)
            corpus_numbers = 0
            origin_text = []
            open_keyword_file_path = current_keyword_folder + current_city_file
            open_keyword_file = linecache.getlines(open_keyword_file_path)
            # open_keyword_file = open(open_keyword_file_path,'r',encoding='utf-8')
            temp_line_num = len(open_keyword_file)
            max_weiboDoc = 5000
            if (temp_line_num < max_weiboDoc):
                for temp_line_lineNum in range(temp_line_num):
                    temp_line = open_keyword_file[temp_line_lineNum]
                    current_topic = getTopic(temp_line)
                    if (current_topic == '娱乐'):
                        continue

                    weibo_origin = filer.filer(temp_line).replace('/', '')
                    if (len(weibo_origin) == 0):
                        continue
                    weibo_cut = list(jieba.cut(weibo_origin))
                    weibo_cut_list = []
                    for items in weibo_cut:
                        if (items not in stop_word and len(items.strip()) > 0):
                            if (items in current_keyword_banned_list):
                                continue
                            weibo_cut_list.append(items)
                    if (len(weibo_cut_list) < 5):
                        continue
                    for current_cut in weibo_cut_list:
                        origin_text.append(current_cut)
            else:
                used_set = set()
                linenumber_list = [i for i in range(temp_line_num)]

                # sample at most max_weiboDoc distinct lines without replacement
                while (len(used_set) < max_weiboDoc
                       and len(linenumber_list) > 0):
                    if (len(linenumber_list) == 1):
                        a = 0
                    else:
                        a = randint(0, len(linenumber_list) - 1)
                    # map the random position back to its original line number
                    line_no = linenumber_list[a]
                    del linenumber_list[a]
                    temp_line = open_keyword_file[line_no]
                    current_topic = getTopic(temp_line)
                    if (current_topic == '娱乐'):
                        continue

                    weibo_origin = filer.filer(temp_line).replace('/', '')
                    if (len(weibo_origin) == 0):
                        continue
                    weibo_cut = list(jieba.cut(weibo_origin))
                    weibo_cut_list = []
                    for items in weibo_cut:
                        if (items not in stop_word and len(items.strip()) > 0):
                            if (items in current_keyword_banned_list):
                                continue
                            weibo_cut_list.append(items)
                    if (len(weibo_cut_list) < 5):
                        continue
                    for current_cut in weibo_cut_list:
                        origin_text.append(current_cut)
                    used_set.add(line_no)
            if (len(origin_text) == 0):
                continue
            linecache.clearcache()
            print(len(origin_text))

            corpus_city[current_city_file] = count
            corpus_text.append(origin_text)
            count += 1

        del origin_text
        gc.collect()

        frequency = defaultdict(int)
        for city_file in corpus_text:
            for token in city_file:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 3]
                 for text in corpus_text]

        word_count_dict = corpora.Dictionary(texts)
        corpus = [word_count_dict.doc2bow(text) for text in texts]
        print('计算tfidf')
        tfidf = TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        del tfidf
        gc.collect()
        print('开始LDA模型构建')
        lda = LdaModel(corpus=corpus_tfidf,
                       id2word=word_count_dict,
                       num_topics=8)
        model_file = 'data/keyword_location/model/' + current_keyword + '_lda.model'

        lda.save(model_file)

        for city_name, city_index in corpus_city.items():
            city_corpus = corpus[city_index]
            doc_lda = lda.get_document_topics(city_corpus)  # topic distribution for this city's document
            db = conntoMongoKeywordLocation()
            current_collection = db[current_keyword]
            data_toinsert = {
                'city': city_name,
                'topic_distribution': str(doc_lda)
            }
            result = current_collection.insert_one(data_toinsert)

        db = conntoMongoKeywordLocation_topic()
        current_collection = db['topic']
        data_toinsert = {
            'keyword': current_keyword,
            'all_topic': str(lda.print_topics(-1))
        }
        result = current_collection.insert_one(data_toinsert)
        #write to file
        # output_file = codecs.open('result/keyword_location/city_topic/' + current_keyword + '_city_topics.txt', 'a+', encoding='utf-8')
        # output_file.write(city_name)
        # output_file.write('\t')
        # output_file.write(str(doc_lda))
        # output_file.write('\n')
    return