def save_ts(topic, excel_name):
    # '0' no means
    start_ts = [1396918800, 1396918800, 1396920000, 1396922400, 1396929600, 1396928400, \
                1397032200, 1397045700, 1397096100, 1397089200, 1397138400]
    end_ts = [1396918900, 1396920300, 1396927000, 1396923400, 1396931000, 1396930000, \
              1397033200, 1397130000, 1397098000, 1397089900, 1397140000]
    s = load_scws()
    for i in range(11):
        # item = OpinionTestTime(topic, str(i), start_ts[i], end_ts[i])
        # fetch the highest-weighted weibo for this child_topic and segment it into terms
        data = xlrd.open_workbook(excel_name)
        table_weibo = data.sheet_by_name(str(i))
        line = table_weibo.row_values(0)  # read the first row of the sheet, i.e. the highest-weighted weibo text
        weibo = line[1]  # the weibo text itself
        term_list = cut(s, weibo.encode('utf8'))
        # print 'term_list:', term_list
        child_topic = json.dumps({str(i): term_list})
        item = OpinionTestTime(topic, child_topic, start_ts[i], end_ts[i])

        # replace any existing record for the same (topic, child_topic, start_ts, end_ts)
        item_exist = db.session.query(OpinionTestTime).filter(OpinionTestTime.topic==topic, \
                                                              OpinionTestTime.child_topic==child_topic, \
                                                              OpinionTestTime.start_ts==start_ts[i], \
                                                              OpinionTestTime.end_ts==end_ts[i]).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
        db.session.commit()
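# --- Hedged usage sketch (not part of the original module) ---
# save_ts() expects an Excel workbook with sheets named '0'..'10', where the first
# row of each sheet holds the top-weighted weibo text in its second column, as read
# above. The topic name and filename below are hypothetical placeholders.
if __name__ == '__main__':
    save_ts(u'example-topic', 'example_subtopic_weights.xls')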
import datetime
from case.model import *
from case.extensions import db
from case.moodlens import pie as pieModule
from case.identify import utils as identifyModule
import search as searchModule
from case.time_utils import ts2datetime, ts2date
from xapian_case.xapian_backend import XapianSearch
from xapian_case.utils import cut, load_scws
from case.dynamic_xapian_weibo import getXapianWeiboByTopic
from case.global_config import XAPIAN_USER_DATA_PATH
from case.Database import Event, EventManager
from case.topic_manage import topics_name_start_end
from flask import Blueprint, url_for, render_template, request, abort, flash, session, redirect, make_response

scws = load_scws()

mod = Blueprint('case', __name__, url_prefix='/index')

xapian_search_weibo = getXapianWeiboByTopic()

em = EventManager()


def acquire_user_by_id(uid):
    user_search = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1)
    result = user_search.search_by_id(int(uid), fields=['name', 'location',
    # db.authenticate('root', 'root')
    db = getattr(connection, usedb)
    return db


def ts2date(timestamp):
    return time.strftime('%Y-%m-%d', time.localtime(timestamp))


def ts2datetime(timestamp):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))


def datetime2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M:%S')))


s = load_scws()

# part-of-speech whitelist for keywords: keep nouns, verbs and adjectives
cx_dict = set(['Ag', 'a', 'an', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz', 'Vg', 'v', 'vd', 'vn', '@', 'j'])
# part-of-speech whitelist for keywords: keep nouns only
cx_dict_noun = set(['Ng', 'n', 'nr', 'ns', 'nt', 'nz'])


def load_black_words():
    one_words = set([line.strip('\r\n') for line in file(EXTRA_BLACK_LIST_PATH)])
    return one_words

black_words = load_black_words()


def cut_words(text):
    '''Segment text into words; filter single words against the blacklist and keep nouns, verbs and adjectives
    input texts: a list of utf-8 texts
#-*-coding=utf-8-*-

import collections
from xapian_case.utils import load_scws, cut

sw = load_scws()

# segment every line of the domain training text and collect all terms
total_keywords_list = []
f = open('../source/domain_training_text.txt')
for line in f:
    text = line.strip()
    terms = cut(sw, text)
    total_keywords_list.extend(terms)
f.close()

# count term frequencies and keep the 100 most common terms
ct = collections.Counter(total_keywords_list)
keywords_results = ct.most_common(100)

# write the top keywords, one per line
fw = open('../source/domain_keywords_20150618.txt', 'w')
for keyword, count in keywords_results:
    fw.write("%s\n" % keyword)
fw.close()
# -*- coding: utf-8 -*-
# gathering snmp data
from __future__ import division
import re
import opencc
import os
from gensim import corpora
import cPickle as pickle
from xapian_case.utils import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

cut_str = load_scws()
cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')

# load the emotion words and extend them with their traditional-Chinese variants
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    # does the weibo contain any emoticon from the specified emoticon set?
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned
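# --- Hedged usage sketch (not part of the original module) ---
# if_emoticoned_weibo() takes a weibo record with a 'text' field and returns 1 when
# the text contains a bracketed emoticon tag found in emotions_words_set. The sample
# record and the '[哈哈]' tag below are hypothetical.
sample_weibo = {'text': '今天心情不错[哈哈]'}
print if_emoticoned_weibo(sample_weibo)  # 1 if '哈哈' is among the loaded emotion words, else 0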
# -*- coding: utf-8 -*-
import os
import scws
import time
import csv
import re
from gensim import corpora
from xapian_case.utils import load_scws, cut, cut_filter
from liblinearutil import svm_read_problem, load_model, predict, save_model, train

sw = load_scws()

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), './')
FEATURE_WORD_PATH = os.path.join(AB_PATH, './svm/dictionary_20150124.txt')
SVM_MODEL_FILE = os.path.join(AB_PATH, './svm/train.model')
TRAIN_DATA_FILE = os.path.join(AB_PATH, './train20150124.csv')
TRAIN_INPUT_FILE = os.path.join(AB_PATH, './svm/train20150124.txt')

dictionary = corpora.Dictionary.load_from_text(FEATURE_WORD_PATH)


def prepare_svm_input_file(texts, dictionary=dictionary):
    """Write the SVM input into a file
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, './svm_test/%s.txt' % pid)
    fw = open(svm_input_path, 'w')
    for text in texts:
        words = cut(sw, text)
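# --- Hedged usage note (not part of the original module) ---
# The function body is truncated in this excerpt; based on the visible code,
# prepare_svm_input_file() would be called with a list of utf-8 texts, segment
# each one with cut(sw, ...), and write a per-process file under ./svm_test/, e.g.:
#     prepare_svm_input_file(['这是一条测试文本', '另一条测试文本'])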