def cut_words_local(input_file, output_file):
    tmp_file = open(input_file, "r")
    f = open(output_file, "w")
    count = 0
    for line in tmp_file.readlines():
        try:
            tid, text = line.split('\t')
        except ValueError:
            # malformed line: skip it instead of silently reusing the previous text
            continue
        if not isinstance(text, str):
            raise ValueError("cut words input text must be string")
        cx_terms = cut(s, text, cx=True)
        terms = [term for term, cx in cx_terms if cx in cx_dict_noun]
        leng = len(terms)
        # f.write(tid+"\t"+str(leng))
        for term in terms:
            f.write(tid + "\t" + str(leng) + "\t" + term + "\n")
            # f.write("\t"+term)
        # f.write("\n")
        count += 1
        if count == 100000:
            break
    f.close()
    tmp_file.close()
    return
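# Minimal usage sketch for cut_words_local (assumption: the module-level
# segmenter s and the noun POS-tag set cx_dict_noun are initialised roughly as
# below; the tag values and file names shown are illustrative, not the
# project's actual configuration). Each input line is expected to be a
# tab-separated "id\ttext" pair.
#
#   from xapian_case.utils import load_scws, cut
#   s = load_scws()
#   cx_dict_noun = set(['n', 'nr', 'ns', 'nt', 'nz'])        # hypothetical noun tags
#   cut_words_local('weibo_texts.txt', 'weibo_terms.txt')    # hypothetical file names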
def domain_zixun_stat():
    texts = []
    # keyword statistics
    total_keywords_list = []

    query_dict["$or"] = [{"category": "domain_keywords_20150617.txt"}]
    query_dict["source_website"] = "baidu_site_search"
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        summary = _encode_utf8(r['summary'])
        text = title + summary
        texts.append(text)

    for text in texts:
        cut_kw = cut(s, text)
        total_keywords_list.extend(cut_kw)

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'domain_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))
def triple_classifier(tweet):
    """content168 is utf-8 encoded
    """
    sentiment = 0
    text = tweet['content168']

    if '//@' in text:
        text = text[:text.index('//@')]
    if not len(text):
        text = remove_at(tweet['content168'])

    emoticon_sentiment = emoticon(pe_set, ne_set, text)
    if emoticon_sentiment in [1, 2]:
        sentiment = 1
        text = ''

    if text != '':
        entries = cut(sw, text)
        entry = [e.decode('utf-8') for e in entries]
        bow = dictionary_1.doc2bow(entry)

        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] <= s[1]:
            sentiment = 1
        else:
            sentiment = 0

    return sentiment
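# Hedged usage sketch for this variant of triple_classifier (assumes the module
# globals pe_set/ne_set, sw, dictionary_1 and step1_score are already loaded).
# The retweet tail after '//@' is dropped first; if that leaves nothing, the
# @-mentions are stripped from the full text instead. Emoticons are checked
# before the word-level score, and only an inconclusive emoticon result falls
# through to the multiplicative step-1 score.
#
#   tweet = {'content168': '转发微博 //@someone: 今天天气不错'}   # illustrative input
#   label = triple_classifier(tweet)   # 0 or 1, depending on the step-1 model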
def save_ts(topic, excel_name):
    # '0' no means
    start_ts = [1396918800, 1396918800, 1396920000, 1396922400, 1396929600, 1396928400,
                1397032200, 1397045700, 1397096100, 1397089200, 1397138400]
    end_ts = [1396918900, 1396920300, 1396927000, 1396923400, 1396931000, 1396930000,
              1397033200, 1397130000, 1397098000, 1397089900, 1397140000]
    s = load_scws()
    for i in range(11):
        #item = OpinionTestTime(topic, str(i), start_ts[i], end_ts[i])
        # fetch the highest-weight weibo for each child_topic and segment it
        data = xlrd.open_workbook(excel_name)
        table_weibo = data.sheet_by_name(str(i))
        line = table_weibo.row_values(0)  # first row of the sheet, i.e. the highest-weight weibo text
        weibo = line[1]  # the weibo text
        term_list = cut(s, weibo.encode('utf8'))
        #print 'term_list:', term_list
        child_topic = json.dumps({str(i): term_list})
        item = OpinionTestTime(topic, child_topic, start_ts[i], end_ts[i])

        # dividing line
        item_exist = db.session.query(OpinionTestTime).filter(OpinionTestTime.topic==topic, \
                                                              OpinionTestTime.child_topic==child_topic, \
                                                              OpinionTestTime.start_ts==start_ts[i], \
                                                              OpinionTestTime.end_ts==end_ts[i]).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
        db.session.commit()
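# Note on the loop above: the delete-then-add sequence is a simple upsert — if a
# row with the same (topic, child_topic, start_ts, end_ts) already exists it is
# removed first, so rerunning save_ts() does not create duplicate rows.
# child_topic is stored as a JSON string mapping the sub-topic index to its
# segmented term list, e.g. '{"3": ["term1", "term2", ...]}' (terms illustrative).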
def gaishu():
    # factor
    yaosu = 'gaishu'
    # topic keywords
    topic = request.args.get('query', default_topic)
    # event label
    event_label = cut(scws, topic)
    # time range: 20130901-20130901
    time_range = request.args.get('time_range', default_timerange)
    # time granularity: 3600
    point_interval = request.args.get('point_interval', None)
    if not point_interval:
        point_interval = default_pointInterval
    else:
        for pi in pointIntervals:
            if pi['en'] == int(point_interval):
                point_interval = pi
                break

    return render_template('index/gaishu.html', yaosu=yaosu, time_range=time_range, \
                           topic=topic, pointInterval=point_interval, pointIntervals=pointIntervals, \
                           gaishu_yaosus=gaishu_yaosus, deep_yaosus=deep_yaosus, event_label=event_label)
def cut_words(text):
    if not isinstance(text, str):
        print 'type_content:', type(text)
        raise ValueError('cut words input text must be string')
    cx_terms = cut(s, text, cx=True)
    return [term for term, cx in cx_terms if cx in cx_dict and term not in black_words]
def domain_stat():
    texts = []
    keywords = get_keywords('keywords_domain_baidu.txt')
    # keyword statistics
    total_keywords_list = []

    query_dict["$or"] = [{"source_category": "keywords_domain_weiboapi.txt"}]
    query_dict["source_website"] = "weibo_api_search_spider"
    count = mongo.master_timeline_weibo.find(query_dict).count()
    results = mongo.master_timeline_weibo.find(query_dict)
    for r in results:
        texts.append(r['text'].encode('utf-8'))

    query_dict["$or"] = [{"category": "keywords_domain_forum.txt"}]
    del query_dict["source_website"]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)

    query_dict["$or"] = [{"category": "keywords_domain_weixin.txt"}]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)

    query_dict["$or"] = [{"category": "keywords_domain_baidu.txt"}]
    query_dict["source_website"] = "baidu_ns_search"
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)

    for text in texts:
        cut_kw = cut(s, text)
        total_keywords_list.extend(cut_kw)

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'domain_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))
def parse(self):
    docid = 0
    with open(self.filename) as f:
        for line in f:
            if docid == 0:
                docid += 1
                continue
            text = line.strip()
            terms = cut(s, text)
            self.corpus[str(docid)] = terms
            docid += 1
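# Structure built by parse(): the first file line (header) is skipped, and each
# following line is segmented and stored under its 1-based offset as a string
# key, so self.corpus looks like {'1': [term, term, ...], '2': [...], ...}.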
def cut_words_noun(text):
    '''word segmentation: filter words on the blacklist and keep only nouns
    input
        text: input text, utf-8 str
    output
        terms: list of keywords
    '''
    if not isinstance(text, str):
        raise ValueError("cut words input text must be string")
    cx_terms = cut(s, text, cx=True)
    return [term for term, cx in cx_terms if cx in cx_dict_noun and term not in black_words]
def prepare_svm_input(texts, y=None, dictionary=dictionary):
    """prepare svm input
    """
    x = []
    if not y:
        y = [1.0 for i in range(0, len(texts))]

    for text in texts:
        words = cut(sw, text)
        feature = dictionary.doc2bow(words)
        x.append(dict(feature))

    return y, x
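# Hedged sketch of the output shape (assumes sw = load_scws() and a gensim
# Dictionary built elsewhere; the concrete word ids below are illustrative):
#
#   y, x = prepare_svm_input(['今天天气不错', '股票大跌'])
#   y == [1.0, 1.0]                        # default labels when y is not given
#   x == [{12: 1, 87: 1}, {3: 1, 45: 1}]   # {dictionary word id: term count} per text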
def prepare_svm_input_file(texts, dictionary=dictionary):
    """write the svm input to a file
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, "./svm_test/%s.txt" % pid)
    fw = open(svm_input_path, "w")
    for text in texts:
        words = cut(sw, text)
        feature = dictionary.doc2bow(words)
        line = "1 " + " ".join([str(wordid + 1) + ":" + str(wordcount) for wordid, wordcount in feature])
        fw.write("%s\n" % line)
    fw.close()

    return svm_input_path
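# Each written line follows the libsvm sparse format "label index:value ...",
# with a dummy label of 1 and word ids shifted to start from 1, e.g.
# (illustrative ids only):
#
#   1 13:2 88:1
#
# Terms not present in the gensim dictionary are dropped by doc2bow and so do
# not appear in the feature line.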
def triple_classifier(tweet):
    """text: utf-8 encoding
    """
    sentiment = 0
    text = tweet['text']  # encode

    #if_empty_retweet = if_empty_retweet_weibo(tweet)
    #if if_empty_retweet:
    #    text = tweet['retweeted_status']['text']

    # if_emoticoned = if_emoticoned_weibo(tweet)
    # if if_emoticoned == 1:
    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        sentiment = emoticon_sentiment
        text = ''

    if text != '':
        entries = cut(cut_str, text)
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        bow = dictionary_1.doc2bow(entry)

        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])

        if s[0] <= s[1]:
            bow = dictionary_2.doc2bow(entry)
            s = [1, 1, 1]
            for pair in bow:
                s[0] = s[0] * (step2_score[pair[0]][0] ** pair[1])
                s[1] = s[1] * (step2_score[pair[0]][1] ** pair[1])
                s[2] = s[2] * (step2_score[pair[0]][2] ** pair[1])
            if s[0] > s[1] and s[0] > s[2]:
                sentiment = HAPPY
            elif s[1] > s[0] and s[1] > s[2]:
                sentiment = SAD
            elif s[2] > s[1] and s[2] > s[0]:
                sentiment = ANGRY

    return sentiment
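# How the two-step score above works: step 1 is a two-column multiplicative
# (naive-Bayes-style) score over the words hit in dictionary_1, and only when
# its first column does not win (s[0] <= s[1]) does step 2 run a three-column
# score of the same form over dictionary_2:
#
#   s[k] = product over hit words w of step2_score[w][k] ** count(w)
#
# Illustrative numbers (not from the real model): two hits scored
# (0.1, 0.6, 0.3) and (0.2, 0.5, 0.3) give s = [0.02, 0.30, 0.09],
# so the second column wins and the result is SAD.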
def neutral_classifier(tweet):
    sentiment = 0
    text = tweet['text']  # encode

    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        sentiment = 0
        text = u''

    if text != u'':
        entries = cut(cut_str, text.encode('utf-8'))
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        bow = dictionary_1.doc2bow(entry)

        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] > s[1]:
            sentiment = 1

    return sentiment
def cut_text(item):
    text = item['text'].encode('utf-8')
    item['terms'] = cut(s, text, cx=False)
    item['topics'] = list(set(item['terms']))
    return item
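# Hedged usage sketch (assumes s = load_scws(); 'topics' is simply the
# deduplicated term list, and the segmentation shown is illustrative):
#
#   item = cut_text({'text': u'转基因食品安全问题'})
#   item['terms']   # scws term list, e.g. ['转基因', '食品', '安全', '问题']
#   item['topics']  # the same terms with duplicates removed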
# -*- coding: utf-8 -*-

import collections
from xapian_case.utils import load_scws, cut

sw = load_scws()

total_keywords_list = []
f = open('../source/domain_training_text.txt')
for line in f:
    text = line.strip()
    terms = cut(sw, text)
    total_keywords_list.extend(terms)
f.close()

ct = collections.Counter(total_keywords_list)
keywords_results = ct.most_common(100)

fw = open('../source/domain_keywords_20150618.txt', 'w')
for keyword, count in keywords_results:
    fw.write("%s\n" % keyword)
fw.close()
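# Output format of the script above: domain_keywords_20150618.txt holds one
# keyword per line, the 100 most frequent first; the counts returned by
# most_common are computed but not written to the file.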
def cut_text(item):
    text = item['text'].encode('utf-8')
    item['terms'] = cut(s, text, cx=False)
    return item
def cut_text(item):
    text = item['content168'].encode('utf-8')
    item['terms'] = cut(s, text, cx=False)
    return item
mongo = _default_mongo()
module_keywords = get_module_keywords()
for bankuai, lanmu, source, source_en, keywords_file in module_keywords:
    query_dict = {"timestamp": {"$gte": START_TS, "$lt": END_TS}, "keywords_hit": True, "rubbish": False}
    if source_en == "weibo_api_search_spider":
        query_dict["source_category"] = keywords_file
        query_dict["source_website"] = source_en
        count = mongo.master_timeline_weibo.find(query_dict).count()
        results = mongo.master_timeline_weibo.find(query_dict)
        for r in results:
            terms = cut(s, r["text"].encode("utf-8"))
            mongo.master_timeline_weibo.update({"_id": r["_id"]}, {"$set": {"terms": terms}})
    else:
        query_dict["category"] = keywords_file
        query_dict["source_website"] = source_en
        count = mongo.boatcol.find(query_dict).count()
        results = mongo.boatcol.find(query_dict)
        for r in results:
            title = _encode_utf8(r["title"])
            content168 = _encode_utf8(r["content168"])
            summary = _encode_utf8(r["summary"])
            text = title + content168 + summary
            r["text"] = text.decode("utf-8")
            terms = cut(s, r["text"].encode("utf-8"))
def sheqi_stat():
    texts = []
    keywords = get_keywords('keywords_corp_baidu.txt')
    # statistics on publishing sources
    author_dict = dict()
    # number of items mentioning each company
    corp_dict = dict()
    # weekly trend of item counts per channel
    source_daily_dict = {
        "微博": {},
        "微信": {},
        "论坛": {},
        "新闻": {}
    }
    # sentiment statistics for the weibo and forum channels
    sentiment_stock_dict = dict()
    sentiment_ustock_dict = dict()
    # keyword statistics
    total_keywords_list = []

    query_dict["$or"] = [{"source_category": "keywords_corp_weiboapi.txt"},
                         {"source_category": "keywords_hot_weiboapi.txt"},
                         {"source_category": "keywords_leader_weiboapi.txt"}]
    query_dict["source_website"] = "weibo_api_search_spider"
    count = mongo.master_timeline_weibo.find(query_dict).count()
    results = mongo.master_timeline_weibo.find(query_dict)
    author_dict["微博"] = count
    for r in results:
        text = r['text'].encode('utf-8')
        isstock = stock_classifier(text)
        texts.append(text)
        try:
            source_daily_dict["微博"][ts2date(r["timestamp"])] += 1
        except KeyError:
            source_daily_dict["微博"][ts2date(r["timestamp"])] = 1
        if isstock:
            try:
                sentiment_stock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_stock_dict[r["sentiment"]] = 1
        else:
            try:
                sentiment_ustock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_ustock_dict[r["sentiment"]] = 1

    query_dict["$or"] = [{"category": "keywords_corp_forum.txt"},
                         {"category": "keywords_hot_forum.txt"},
                         {"category": "keywords_leader_forum.txt"}]
    del query_dict["source_website"]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    author_dict["论坛"] = count
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        isstock = stock_classifier(text)
        texts.append(text)
        try:
            source_daily_dict["论坛"][r["date"]] += 1
        except KeyError:
            source_daily_dict["论坛"][r["date"]] = 1
        if isstock:
            try:
                sentiment_stock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_stock_dict[r["sentiment"]] = 1
        else:
            try:
                sentiment_ustock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_ustock_dict[r["sentiment"]] = 1

    query_dict["$or"] = [{"category": "keywords_corp_weixin.txt"},
                         {"category": "keywords_hot_weixin.txt"},
                         {"category": "keywords_leader_weixin.txt"}]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    author_dict["微信"] = count
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)
        try:
            source_daily_dict["微信"][r["date"]] += 1
        except KeyError:
            source_daily_dict["微信"][r["date"]] = 1

    query_dict["$or"] = [{"category": "keywords_corp_baidu.txt"},
                         {"category": "keywords_hot_baidu.txt"},
                         {"category": "keywords_leader_baidu.txt"}]
    query_dict["source_website"] = "baidu_ns_search"
    results = mongo.boatcol.find(query_dict)
    for r in results:
        try:
            author_dict[r['user_name']] += 1
        except KeyError:
            author_dict[r['user_name']] = 1
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)
        try:
            source_daily_dict["新闻"][r["date"]] += 1
        except KeyError:
            source_daily_dict["新闻"][r["date"]] = 1

    fw = csv.writer(open(result_path + 'sheqi_author_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    results = sorted(author_dict.iteritems(), key=lambda(k, v): v, reverse=True)
    for k, v in results:
        if k == "":
            continue
        fw.writerow((_encode_utf8(k), v))

    for text in texts:
        cut_kw = cut(s, text)
        total_keywords_list.extend(cut_kw)
        for keyword in keywords:
            if keyword in text:
                try:
                    corp_dict[keyword] += 1
                except KeyError:
                    corp_dict[keyword] = 1

    fw = csv.writer(open(result_path + 'sheqi_gongsi_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    results = sorted(corp_dict.iteritems(), key=lambda(k, v): v, reverse=True)
    for k, v in results:
        if k == "":
            continue
        fw.writerow((_encode_utf8(k), v))

    fw = csv.writer(open(result_path + 'sheqi_source_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for source, daily_dict in source_daily_dict.iteritems():
        for date, count in daily_dict.iteritems():
            fw.writerow((_encode_utf8(source), date, count))

    fw = csv.writer(open(result_path + 'sheqi_stock_sentiment_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for sentiment, count in sentiment_stock_dict.iteritems():
        fw.writerow((sentiment, count))

    fw = csv.writer(open(result_path + 'sheqi_nonstock_sentiment_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for sentiment, count in sentiment_ustock_dict.iteritems():
        fw.writerow((sentiment, count))

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'sheqi_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))
def friends_zixun_stat():
    texts = []
    keywords = get_keywords('keywords_friends_baidu.txt')
    # public-opinion heat per mentioned company
    corp_dict = dict()
    # keyword statistics
    total_keywords_list = []

    """
    query_dict["$or"] = [{"source_category": "keywords_friends_weiboapi.txt"}]
    query_dict["source_website"] = "weibo_api_search_spider"
    count = mongo.master_timeline_weibo.find(query_dict).count()
    results = mongo.master_timeline_weibo.find(query_dict)
    for r in results:
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([r['text'].encode('utf-8'), hot])

    query_dict["$or"] = [{"category": "keywords_friends_forum.txt"}]
    del query_dict["source_website"]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([text, hot])
    """

    query_dict["$or"] = [{"category": "keywords_friends_weixin.txt"}]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([text, hot])

    query_dict["$or"] = [{"category": "keywords_friends_baidu.txt"}]
    query_dict["source_website"] = "baidu_ns_search"
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([text, hot])

    for text in texts:
        cut_kw = cut(s, text[0])
        total_keywords_list.extend(cut_kw)
        for keyword in keywords:
            if keyword in text[0]:
                if text[1] == 0:
                    hot = 1
                else:
                    hot = text[1]
                try:
                    corp_dict[keyword] += hot
                except KeyError:
                    corp_dict[keyword] = hot

    fw = csv.writer(open(result_path + 'friends_gongsi_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    results = sorted(corp_dict.iteritems(), key=lambda(k, v): v, reverse=True)
    for k, v in results:
        if k == "":
            continue
        fw.writerow((_encode_utf8(k), v))

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'friends_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))