def cut_words_local(input_file, output_file):
    tmp_file = open(input_file, "r")
    f = open(output_file, "w")
    count = 0
    for line in tmp_file.readlines():
        try:
            tid, text = line.split('\t')
        except ValueError:
            # malformed line: skip it instead of silently reusing the previous text
            continue
        if not isinstance(text, str):
            raise ValueError("cut words input text must be string")
        cx_terms = cut(s, text, cx=True)
        terms = [term for term, cx in cx_terms if cx in cx_dict_noun]
        leng = len(terms)
        # f.write(tid+"\t"+str(leng))
        for term in terms:
            f.write(tid + "\t" + str(leng) + "\t" + term + "\n")
            # f.write("\t"+term)
        # f.write("\n")
        count += 1
        if count == 100000:
            break
    f.close()
    tmp_file.close()
    return
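# Minimal usage sketch for cut_words_local (assumption: the module-level
# segmenter s and the noun POS-tag set cx_dict_noun are initialised roughly as
# below; the tag values and file names shown are illustrative, not the
# project's actual configuration). Each input line is expected to be a
# tab-separated "id\ttext" pair.
#
#   from xapian_case.utils import load_scws, cut
#   s = load_scws()
#   cx_dict_noun = set(['n', 'nr', 'ns', 'nt', 'nz'])        # hypothetical noun tags
#   cut_words_local('weibo_texts.txt', 'weibo_terms.txt')    # hypothetical file names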
def domain_zixun_stat():
    texts = []
    # keyword statistics
    total_keywords_list = []

    query_dict["$or"] = [{"category": "domain_keywords_20150617.txt"}]
    query_dict["source_website"] = "baidu_site_search"
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        summary = _encode_utf8(r['summary'])
        text = title + summary
        texts.append(text)

    for text in texts:
        cut_kw = cut(s, text)
        total_keywords_list.extend(cut_kw)

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'domain_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))
def triple_classifier(tweet):
    """content168 is utf-8 encoded
    """
    sentiment = 0
    text = tweet['content168']

    if '//@' in text:
        text = text[:text.index('//@')]
    if not len(text):
        text = remove_at(tweet['content168'])

    emoticon_sentiment = emoticon(pe_set, ne_set, text)
    if emoticon_sentiment in [1, 2]:
        sentiment = 1
        text = ''

    if text != '':
        entries = cut(sw, text)
        entry = [e.decode('utf-8') for e in entries]
        bow = dictionary_1.doc2bow(entry)

        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] <= s[1]:
            sentiment = 1
        else:
            sentiment = 0

    return sentiment
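# Hedged usage sketch for this variant of triple_classifier (assumes the module
# globals pe_set/ne_set, sw, dictionary_1 and step1_score are already loaded).
# The retweet tail after '//@' is dropped first; if that leaves nothing, the
# @-mentions are stripped from the full text instead. Emoticons are checked
# before the word-level score, and only an inconclusive emoticon result falls
# through to the multiplicative step-1 score.
#
#   tweet = {'content168': '转发微博 //@someone: 今天天气不错'}   # illustrative input
#   label = triple_classifier(tweet)   # 0 or 1, depending on the step-1 model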
def save_ts(topic, excel_name):
    # '0' no means
    start_ts = [1396918800, 1396918800, 1396920000, 1396922400, 1396929600, 1396928400,
                1397032200, 1397045700, 1397096100, 1397089200, 1397138400]
    end_ts = [1396918900, 1396920300, 1396927000, 1396923400, 1396931000, 1396930000,
              1397033200, 1397130000, 1397098000, 1397089900, 1397140000]
    s = load_scws()
    for i in range(11):
        #item = OpinionTestTime(topic, str(i), start_ts[i], end_ts[i])
        # fetch the highest-weight weibo for each child_topic and segment it
        data = xlrd.open_workbook(excel_name)
        table_weibo = data.sheet_by_name(str(i))
        line = table_weibo.row_values(0)  # first row of the sheet, i.e. the highest-weight weibo text
        weibo = line[1]  # the weibo text
        term_list = cut(s, weibo.encode('utf8'))
        #print 'term_list:', term_list
        child_topic = json.dumps({str(i): term_list})
        item = OpinionTestTime(topic, child_topic, start_ts[i], end_ts[i])

        # dividing line
        item_exist = db.session.query(OpinionTestTime).filter(OpinionTestTime.topic==topic, \
                                                              OpinionTestTime.child_topic==child_topic, \
                                                              OpinionTestTime.start_ts==start_ts[i], \
                                                              OpinionTestTime.end_ts==end_ts[i]).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
        db.session.commit()
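# Note on the loop above: the delete-then-add sequence is a simple upsert — if a
# row with the same (topic, child_topic, start_ts, end_ts) already exists it is
# removed first, so rerunning save_ts() does not create duplicate rows.
# child_topic is stored as a JSON string mapping the sub-topic index to its
# segmented term list, e.g. '{"3": ["term1", "term2", ...]}' (terms illustrative).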
def gaishu():
    # factor
    yaosu = 'gaishu'
    # topic keywords
    topic = request.args.get('query', default_topic)
    # event label
    event_label = cut(scws, topic)
    # time range: 20130901-20130901
    time_range = request.args.get('time_range', default_timerange)
    # time granularity: 3600
    point_interval = request.args.get('point_interval', None)
    if not point_interval:
        point_interval = default_pointInterval
    else:
        for pi in pointIntervals:
            if pi['en'] == int(point_interval):
                point_interval = pi
                break

    return render_template('index/gaishu.html', yaosu=yaosu, time_range=time_range, \
                           topic=topic, pointInterval=point_interval, pointIntervals=pointIntervals, \
                           gaishu_yaosus=gaishu_yaosus, deep_yaosus=deep_yaosus, event_label=event_label)
def cut_words(text):
    if not isinstance(text, str):
        print 'type_content:', type(text)
        raise ValueError('cut words input text must be string')
    cx_terms = cut(s, text, cx=True)
    return [term for term, cx in cx_terms if cx in cx_dict and term not in black_words]
def domain_stat():
    texts = []
    keywords = get_keywords('keywords_domain_baidu.txt')
    # keyword statistics
    total_keywords_list = []

    query_dict["$or"] = [{"source_category": "keywords_domain_weiboapi.txt"}]
    query_dict["source_website"] = "weibo_api_search_spider"
    count = mongo.master_timeline_weibo.find(query_dict).count()
    results = mongo.master_timeline_weibo.find(query_dict)
    for r in results:
        texts.append(r['text'].encode('utf-8'))

    query_dict["$or"] = [{"category": "keywords_domain_forum.txt"}]
    del query_dict["source_website"]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)

    query_dict["$or"] = [{"category": "keywords_domain_weixin.txt"}]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)

    query_dict["$or"] = [{"category": "keywords_domain_baidu.txt"}]
    query_dict["source_website"] = "baidu_ns_search"
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)

    for text in texts:
        cut_kw = cut(s, text)
        total_keywords_list.extend(cut_kw)

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'domain_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))
def parse(self):
    docid = 0
    with open(self.filename) as f:
        for line in f:
            if docid == 0:
                docid += 1
                continue
            text = line.strip()
            terms = cut(s, text)
            self.corpus[str(docid)] = terms
            docid += 1
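# Structure built by parse(): the first file line (header) is skipped, and each
# following line is segmented and stored under its 1-based offset as a string
# key, so self.corpus looks like {'1': [term, term, ...], '2': [...], ...}.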
def cut_words_noun(text):
    '''word segmentation: filter words on the blacklist and keep only nouns
    input
        text: input text, utf-8 str
    output
        terms: list of keywords
    '''
    if not isinstance(text, str):
        raise ValueError("cut words input text must be string")
    cx_terms = cut(s, text, cx=True)
    return [term for term, cx in cx_terms if cx in cx_dict_noun and term not in black_words]
def prepare_svm_input(texts, y=None, dictionary=dictionary):
    """prepare svm input
    """
    x = []
    if not y:
        y = [1.0 for i in range(0, len(texts))]

    for text in texts:
        words = cut(sw, text)
        feature = dictionary.doc2bow(words)
        x.append(dict(feature))

    return y, x
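# Hedged sketch of the output shape (assumes sw = load_scws() and a gensim
# Dictionary built elsewhere; the concrete word ids below are illustrative):
#
#   y, x = prepare_svm_input(['今天天气不错', '股票大跌'])
#   y == [1.0, 1.0]                        # default labels when y is not given
#   x == [{12: 1, 87: 1}, {3: 1, 45: 1}]   # {dictionary word id: term count} per text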
def prepare_svm_input_file(texts, dictionary=dictionary):
    """write the svm input to a file
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, "./svm_test/%s.txt" % pid)
    fw = open(svm_input_path, "w")
    for text in texts:
        words = cut(sw, text)
        feature = dictionary.doc2bow(words)
        line = "1 " + " ".join([str(wordid + 1) + ":" + str(wordcount) for wordid, wordcount in feature])
        fw.write("%s\n" % line)
    fw.close()

    return svm_input_path
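# Each written line follows the libsvm sparse format "label index:value ...",
# with a dummy label of 1 and word ids shifted to start from 1, e.g.
# (illustrative ids only):
#
#   1 13:2 88:1
#
# Terms not present in the gensim dictionary are dropped by doc2bow and so do
# not appear in the feature line.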
def triple_classifier(tweet):
    """text: utf-8 encoding
    """
    sentiment = 0
    text = tweet['text']  # encode

    #if_empty_retweet = if_empty_retweet_weibo(tweet)
    #if if_empty_retweet:
    #    text = tweet['retweeted_status']['text']

    # if_emoticoned = if_emoticoned_weibo(tweet)
    # if if_emoticoned == 1:
    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        sentiment = emoticon_sentiment
        text = ''

    if text != '':
        entries = cut(cut_str, text)
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        bow = dictionary_1.doc2bow(entry)

        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])

        if s[0] <= s[1]:
            bow = dictionary_2.doc2bow(entry)
            s = [1, 1, 1]
            for pair in bow:
                s[0] = s[0] * (step2_score[pair[0]][0] ** pair[1])
                s[1] = s[1] * (step2_score[pair[0]][1] ** pair[1])
                s[2] = s[2] * (step2_score[pair[0]][2] ** pair[1])
            if s[0] > s[1] and s[0] > s[2]:
                sentiment = HAPPY
            elif s[1] > s[0] and s[1] > s[2]:
                sentiment = SAD
            elif s[2] > s[1] and s[2] > s[0]:
                sentiment = ANGRY

    return sentiment
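# How the two-step score above works: step 1 is a two-column multiplicative
# (naive-Bayes-style) score over the words hit in dictionary_1, and only when
# its first column does not win (s[0] <= s[1]) does step 2 run a three-column
# score of the same form over dictionary_2:
#
#   s[k] = product over hit words w of step2_score[w][k] ** count(w)
#
# Illustrative numbers (not from the real model): two hits scored
# (0.1, 0.6, 0.3) and (0.2, 0.5, 0.3) give s = [0.02, 0.30, 0.09],
# so the second column wins and the result is SAD.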
def neutral_classifier(tweet):
    sentiment = 0
    text = tweet['text']  # encode

    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        sentiment = 0
        text = u''

    if text != u'':
        entries = cut(cut_str, text.encode('utf-8'))
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        bow = dictionary_1.doc2bow(entry)

        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] > s[1]:
            sentiment = 1

    return sentiment
def cut_text(item):
    text = item['text'].encode('utf-8')
    item['terms'] = cut(s, text, cx=False)
    item['topics'] = list(set(item['terms']))
    return item
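# Hedged usage sketch (assumes s = load_scws(); 'topics' is simply the
# deduplicated term list, and the segmentation shown is illustrative):
#
#   item = cut_text({'text': u'转基因食品安全问题'})
#   item['terms']   # scws term list, e.g. ['转基因', '食品', '安全', '问题']
#   item['topics']  # the same terms with duplicates removed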
# -*- coding: utf-8 -*-

import collections
from xapian_case.utils import load_scws, cut

sw = load_scws()

total_keywords_list = []
f = open('../source/domain_training_text.txt')
for line in f:
    text = line.strip()
    terms = cut(sw, text)
    total_keywords_list.extend(terms)
f.close()

ct = collections.Counter(total_keywords_list)
keywords_results = ct.most_common(100)

fw = open('../source/domain_keywords_20150618.txt', 'w')
for keyword, count in keywords_results:
    fw.write("%s\n" % keyword)
fw.close()
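# Output format of the script above: domain_keywords_20150618.txt holds one
# keyword per line, the 100 most frequent first; the counts returned by
# most_common are computed but not written to the file.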
def cut_text(item):
    text = item['text'].encode('utf-8')
    item['terms'] = cut(s, text, cx=False)
    return item
def cut_text(item):
    text = item['content168'].encode('utf-8')
    item['terms'] = cut(s, text, cx=False)
    return item
mongo = _default_mongo()
module_keywords = get_module_keywords()
for bankuai, lanmu, source, source_en, keywords_file in module_keywords:
    query_dict = {"timestamp": {"$gte": START_TS, "$lt": END_TS}, "keywords_hit": True, "rubbish": False}
    if source_en == "weibo_api_search_spider":
        query_dict["source_category"] = keywords_file
        query_dict["source_website"] = source_en
        count = mongo.master_timeline_weibo.find(query_dict).count()
        results = mongo.master_timeline_weibo.find(query_dict)
        for r in results:
            terms = cut(s, r["text"].encode("utf-8"))
            mongo.master_timeline_weibo.update({"_id": r["_id"]}, {"$set": {"terms": terms}})
    else:
        query_dict["category"] = keywords_file
        query_dict["source_website"] = source_en
        count = mongo.boatcol.find(query_dict).count()
        results = mongo.boatcol.find(query_dict)
        for r in results:
            title = _encode_utf8(r["title"])
            content168 = _encode_utf8(r["content168"])
            summary = _encode_utf8(r["summary"])
            text = title + content168 + summary
            r["text"] = text.decode("utf-8")
            terms = cut(s, r["text"].encode("utf-8"))
def sheqi_stat():
    texts = []
    keywords = get_keywords('keywords_corp_baidu.txt')
    # statistics on publishing sources
    author_dict = dict()
    # number of items mentioning each company
    corp_dict = dict()
    # weekly trend of item counts per channel
    source_daily_dict = {
        "微博": {},
        "微信": {},
        "论坛": {},
        "新闻": {}
    }
    # sentiment statistics for the weibo and forum channels
    sentiment_stock_dict = dict()
    sentiment_ustock_dict = dict()
    # keyword statistics
    total_keywords_list = []

    query_dict["$or"] = [{"source_category": "keywords_corp_weiboapi.txt"},
                         {"source_category": "keywords_hot_weiboapi.txt"},
                         {"source_category": "keywords_leader_weiboapi.txt"}]
    query_dict["source_website"] = "weibo_api_search_spider"
    count = mongo.master_timeline_weibo.find(query_dict).count()
    results = mongo.master_timeline_weibo.find(query_dict)
    author_dict["微博"] = count
    for r in results:
        text = r['text'].encode('utf-8')
        isstock = stock_classifier(text)
        texts.append(text)
        try:
            source_daily_dict["微博"][ts2date(r["timestamp"])] += 1
        except KeyError:
            source_daily_dict["微博"][ts2date(r["timestamp"])] = 1
        if isstock:
            try:
                sentiment_stock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_stock_dict[r["sentiment"]] = 1
        else:
            try:
                sentiment_ustock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_ustock_dict[r["sentiment"]] = 1

    query_dict["$or"] = [{"category": "keywords_corp_forum.txt"},
                         {"category": "keywords_hot_forum.txt"},
                         {"category": "keywords_leader_forum.txt"}]
    del query_dict["source_website"]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    author_dict["论坛"] = count
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        isstock = stock_classifier(text)
        texts.append(text)
        try:
            source_daily_dict["论坛"][r["date"]] += 1
        except KeyError:
            source_daily_dict["论坛"][r["date"]] = 1
        if isstock:
            try:
                sentiment_stock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_stock_dict[r["sentiment"]] = 1
        else:
            try:
                sentiment_ustock_dict[r["sentiment"]] += 1
            except KeyError:
                sentiment_ustock_dict[r["sentiment"]] = 1

    query_dict["$or"] = [{"category": "keywords_corp_weixin.txt"},
                         {"category": "keywords_hot_weixin.txt"},
                         {"category": "keywords_leader_weixin.txt"}]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    author_dict["微信"] = count
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)
        try:
            source_daily_dict["微信"][r["date"]] += 1
        except KeyError:
            source_daily_dict["微信"][r["date"]] = 1

    query_dict["$or"] = [{"category": "keywords_corp_baidu.txt"},
                         {"category": "keywords_hot_baidu.txt"},
                         {"category": "keywords_leader_baidu.txt"}]
    query_dict["source_website"] = "baidu_ns_search"
    results = mongo.boatcol.find(query_dict)
    for r in results:
        try:
            author_dict[r['user_name']] += 1
        except KeyError:
            author_dict[r['user_name']] = 1
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        texts.append(text)
        try:
            source_daily_dict["新闻"][r["date"]] += 1
        except KeyError:
            source_daily_dict["新闻"][r["date"]] = 1

    fw = csv.writer(open(result_path + 'sheqi_author_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    results = sorted(author_dict.iteritems(), key=lambda(k, v): v, reverse=True)
    for k, v in results:
        if k == "":
            continue
        fw.writerow((_encode_utf8(k), v))

    for text in texts:
        cut_kw = cut(s, text)
        total_keywords_list.extend(cut_kw)
        for keyword in keywords:
            if keyword in text:
                try:
                    corp_dict[keyword] += 1
                except KeyError:
                    corp_dict[keyword] = 1

    fw = csv.writer(open(result_path + 'sheqi_gongsi_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    results = sorted(corp_dict.iteritems(), key=lambda(k, v): v, reverse=True)
    for k, v in results:
        if k == "":
            continue
        fw.writerow((_encode_utf8(k), v))

    fw = csv.writer(open(result_path + 'sheqi_source_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for source, daily_dict in source_daily_dict.iteritems():
        for date, count in daily_dict.iteritems():
            fw.writerow((_encode_utf8(source), date, count))

    fw = csv.writer(open(result_path + 'sheqi_stock_sentiment_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for sentiment, count in sentiment_stock_dict.iteritems():
        fw.writerow((sentiment, count))

    fw = csv.writer(open(result_path + 'sheqi_nonstock_sentiment_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for sentiment, count in sentiment_ustock_dict.iteritems():
        fw.writerow((sentiment, count))

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'sheqi_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))
def friends_zixun_stat():
    texts = []
    keywords = get_keywords('keywords_friends_baidu.txt')
    # public-opinion heat per mentioned company
    corp_dict = dict()
    # keyword statistics
    total_keywords_list = []

    """
    query_dict["$or"] = [{"source_category": "keywords_friends_weiboapi.txt"}]
    query_dict["source_website"] = "weibo_api_search_spider"
    count = mongo.master_timeline_weibo.find(query_dict).count()
    results = mongo.master_timeline_weibo.find(query_dict)
    for r in results:
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([r['text'].encode('utf-8'), hot])

    query_dict["$or"] = [{"category": "keywords_friends_forum.txt"}]
    del query_dict["source_website"]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([text, hot])
    """

    query_dict["$or"] = [{"category": "keywords_friends_weixin.txt"}]
    count = mongo.boatcol.find(query_dict).count()
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([text, hot])

    query_dict["$or"] = [{"category": "keywords_friends_baidu.txt"}]
    query_dict["source_website"] = "baidu_ns_search"
    results = mongo.boatcol.find(query_dict)
    for r in results:
        title = _encode_utf8(r['title'])
        content168 = _encode_utf8(r['content168'])
        summary = _encode_utf8(r['summary'])
        text = title + content168 + summary
        if 'hot' in r:
            hot = r['hot']
        else:
            hot = 1
        texts.append([text, hot])

    for text in texts:
        cut_kw = cut(s, text[0])
        total_keywords_list.extend(cut_kw)
        for keyword in keywords:
            if keyword in text[0]:
                if text[1] == 0:
                    hot = 1
                else:
                    hot = text[1]
                try:
                    corp_dict[keyword] += hot
                except KeyError:
                    corp_dict[keyword] = hot

    fw = csv.writer(open(result_path + 'friends_gongsi_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    results = sorted(corp_dict.iteritems(), key=lambda(k, v): v, reverse=True)
    for k, v in results:
        if k == "":
            continue
        fw.writerow((_encode_utf8(k), v))

    ct = collections.Counter(total_keywords_list)
    keywords_results = ct.most_common(50)
    fw = csv.writer(open(result_path + 'friends_keywords_stat_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    for keyword, count in keywords_results:
        fw.writerow((_encode_utf8(keyword), count))