Example #1
    def NlpirTokener(self, raw):
        nlpir_result = ''
        tokens = nlpir.Seg(raw)
        for w in tokens:
#           nlpir_result += w[0] + "/" + w[1]  # with POS tags appended
            nlpir_result += w[0] + ' '  # words only, space-separated
        return nlpir_result
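
A minimal usage sketch, assuming `nlpir.Seg(text)` returns an iterable of `(word, POS)` pairs as the loop above expects; the input sentence and the tags shown are illustrative:

    tokens = list(nlpir.Seg('我们爱北京'))  # e.g. [('我们', 'rr'), ('爱', 'v'), ('北京', 'ns')]
    words_only = ' '.join(w[0] for w in tokens)             # what NlpirTokener returns
    with_pos = ' '.join(w[0] + '/' + w[1] for w in tokens)  # the commented-out POS variant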
Example #2
    def part_from_string(self, string):
        if not string:
            return ""
        part = []
        for t in nlpir.Seg(string):
            part.append(t[0])  # keep the word, drop the POS tag

        part_string = " ".join(part)
        return part_string
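
The same join can be written without the intermediate list, assuming the `(word, POS)` tuple layout used throughout these examples:

    part_string = " ".join(t[0] for t in nlpir.Seg(string))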
Example #3
def extract_statuses_behave(statuses, date_filter=None):
    n = 0
    n_filtered = 0
    n_original = 0
    days = set()
    n_comments = 0.0
    n_repost = 0.0
    n_attitudes = 0.0
    n_url = 0
    n_pic = 0
    n_at = 0
    n_emotion = 0
    n_emotion_pos = 0
    n_emotion_neg = 0
    n_night = 0

    n_contains_I = 0
    n_contains_we = 0
    nTextLength = []

    if date_filter is None: date_filter = 'Dec 12 23:59:59 +0800 2099'
    filter_date = parser.parse(date_filter, fuzzy=True)

    minCreatedAt = parser.parse('Dec 31 23:59:59 +0800 2099', fuzzy=True)
    maxCreatedAt = parser.parse('Jan 01 00:00:01 +0800 1970', fuzzy=True)

    for s in statuses:
        n += 1
        if 'created_at' not in s:
            continue

        created_at = s.get('created_at')
        created = parser.parse(created_at, fuzzy=True) if isinstance(
            created_at, basestring) else utc.localize(created_at)
        if created < minCreatedAt: minCreatedAt = created
        if created > maxCreatedAt: maxCreatedAt = created

        if created > filter_date: continue  # skip statuses after the cutoff

        n_filtered += 1

        text = s.get('text', '')  # guard against statuses without text
        hour = created.hour

        # a status is original if it is not a repost (no retweeted_status)
        # or if the source marks it with an explicit is_original flag
        n_original += 1 if "retweeted_status" not in s or s.get(
            'is_original', 0) == 1 else 0
        # zero-padded so e.g. Jan 12 and Nov 2 cannot collide as "2014112"
        day = int(created.strftime('%Y%m%d'))
        days.add(day)
        n_comments += s.get('comments_count', 0)
        n_repost += s.get('reposts_count', 0)
        n_attitudes += s.get('attitudes_count', 0)

        n_url += 1 if 'http://t.cn/' in text else 0
        # default to '' so len() is never called on the integer fallback
        n_pic += 1 if len(s.get('pic_ids', s.get('original_pic',
                                                 ''))) > 0 else 0
        n_at += text.count('@')  # occurrences of '@' in the text

        l = len(text)
        nTextLength.append(l)
        words = []
        if l > 0:
            t = text.encode('UTF-8')
            words = nlpir.Seg(t)
            for i in words:
                word = i[0]
                if word in set_I: n_contains_I += 1
                if word in set_We: n_contains_we += 1

        exps = re.findall(re_exp, text)
        for exp in exps:
            n_emotion += 1
            if exp in pos_exp:
                n_emotion_pos += 1
            elif exp in neg_exp:
                n_emotion_neg += 1

        n_night += 1 if hour < 6 or hour > 22 else 0

    if n < 1:
        return ['N/A'] * 23  # one placeholder per entry of `result` below

    if n_filtered > 0:
        # convert totals into per-status means
        n_comments /= n_filtered
        n_repost /= n_filtered
        n_attitudes /= n_filtered

    result = [
        n,
        n_filtered,  # total public statuses within the date filter
        n_original,  # original statuses
        n_pic,  # original statuses with pictures
        n_url,  # statuses containing a URL
        n_at,  # @-mention count
        n_contains_we,  # first-person plural usages
        n_contains_I,  # first-person singular usages
        n_night,  # statuses posted at night
        n_emotion,  # total emoticons
        n_emotion_pos,  # positive emoticons
        n_emotion_neg,  # negative emoticons
        numpy.mean(nTextLength)
        if len(nTextLength) > 0 else 'N/A',  # mean text length
        numpy.std(nTextLength) if len(nTextLength) > 0 else 'N/A',  # text length std
        numpy.max(nTextLength) if len(nTextLength) > 0 else 'N/A',  # text length max
        numpy.min(nTextLength) if len(nTextLength) > 0 else 'N/A',  # text length min
        numpy.median(nTextLength)
        if len(nTextLength) > 0 else 'N/A',  # text length median
        len(days),  # number of distinct posting days
        n_comments,  # mean comments per status
        n_repost,  # mean reposts per status
        n_attitudes,  # mean attitudes (likes) per status
        minCreatedAt,  # earliest status timestamp
        maxCreatedAt  # latest status timestamp
    ]
    return [str(i) for i in result]
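
A hedged calling sketch for `extract_statuses_behave`; the two status dicts are fabricated but carry the fields the loop reads, and the module-level names (`parser`, `utc`, `numpy`, `re_exp`, `pos_exp`, `neg_exp`, `set_I`, `set_We`) are assumed to be defined as in the surrounding module:

    statuses = [
        {'created_at': 'Mon May 05 23:30:00 +0800 2014',  # night-time post
         'text': u'我们在这里 http://t.cn/abc @someone',
         'comments_count': 2, 'reposts_count': 1, 'attitudes_count': 3},
        {'created_at': 'Tue May 06 08:00:00 +0800 2014',
         'text': u'早上好', 'is_original': 1},
    ]
    features = extract_statuses_behave(statuses)  # list of 23 stringified features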
Example #4
    def process_paragraph(self, paragraph, encoding='utf-8'):
        r = Result()

        n_sentence_total_number = 1
        n_term_total_number = 0
        n_term_numerals = len(find_numeral(paragraph))
        n_term_in_dic = 0
        n_term_len_gte6 = 0

        n_term_len_gte4 = 0
        n_term_latin = 0

        n_term_at_mention = len(find_at_mention(paragraph))
        n_term_emotion = len(find_emotions(paragraph))
        n_term_hashtag = len(find_hashtag(paragraph))
        n_term_url = len(find_url(paragraph))

        lst = nlpir.Seg(paragraph)
        for t in lst:
            term = t[0].decode('utf-8', 'ignore')
            POS = t[1]

            n_term_total_number += 1

            if is_sentence_separator(term):  # sentence separator: increment the sentence count
                n_sentence_total_number += 1
            else:
                if is_latin(term): n_term_latin += 1
                if len(term) >= 6: n_term_len_gte6 += 1
                if len(term) >= 4: n_term_len_gte4 += 1

            tags = get_term_tags(term)
            if len(tags) > 0:
                n_term_in_dic += 1
                for tag in tags:
                    r.accumulate(tag)

            if self.enablePOS: r.accumulate('POS/%s' % POS)

        r.accumulate('stat/WordCount', value=n_term_total_number)
        # NaN avoids ZeroDivisionError and propagates through the ratios below
        if n_term_total_number == 0: n_term_total_number = float('NaN')

        r.accumulate('stat/WordPerSentence',
                     value=float(n_term_total_number) /
                     n_sentence_total_number)
        r.accumulate('stat/RateDicCover',
                     value=float(n_term_in_dic) / n_term_total_number)
        r.accumulate('stat/RateNumeral',
                     value=float(n_term_numerals) / n_term_total_number)
        r.accumulate('stat/RateSixLtrWord',
                     value=float(n_term_len_gte6) / n_term_total_number)

        r.accumulate('stat/RateFourCharWord',
                     value=float(n_term_len_gte4) / n_term_total_number)
        r.accumulate('stat/RateLatinWord',
                     value=float(n_term_latin) / n_term_total_number)

        r.accumulate('stat/NumAtMention', value=n_term_at_mention)
        r.accumulate('stat/NumEmotion', value=n_term_emotion)
        r.accumulate('stat/NumHashTag', value=n_term_hashtag)
        r.accumulate('stat/NumURLs', value=n_term_url)

        return r
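
A hypothetical driver for the method above: `extractor` stands in for whatever object owns `process_paragraph` and `enablePOS`, and the paragraph is passed as UTF-8 bytes because each segmented term is later decoded from UTF-8:

    extractor.enablePOS = True  # attribute read inside the loop
    r = extractor.process_paragraph(u'今天天气不错!http://t.cn/xyz'.encode('utf-8'))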
Example #5
    def process_file(self, file_path, file_encoding='utf-8-sig'):
        r = Result()

        n_sentence_total_number = 1
        n_term_total_number = 0
        n_term_numerals = 0
        n_term_in_dic = 0
        n_term_len_gte6 = 0

        n_term_len_gte4 = 0
        n_term_latin = 0

        n_term_at_mention = 0
        n_term_emotion = 0
        n_term_hashtag = 0
        n_term_url = 0

        with codecs.open(file_path, 'r', encoding=file_encoding) as fp:
            for line in fp:
                line = line.strip(' \t\r\n').encode('utf-8')
                if len(line) < 1: continue

                n_term_numerals += len(find_numeral(line))
                n_term_at_mention += len(find_at_mention(line))
                n_term_emotion += len(find_emotions(line))
                n_term_hashtag += len(find_hashtag(line))
                n_term_url += len(find_url(line))

                lst = nlpir.Seg(line)
                for t in lst:
                    term = t[0].decode('utf-8', 'ignore')
                    POS = t[1]

                    n_term_total_number += 1

                    if is_sentence_separator(term):  # sentence separator: increment the sentence count
                        n_sentence_total_number += 1
                    else:
                        if is_latin(term): n_term_latin += 1
                        if len(term) >= 6: n_term_len_gte6 += 1
                        if len(term) >= 4: n_term_len_gte4 += 1

                    tags = get_term_tags(term)
                    if len(tags) > 0:
                        n_term_in_dic += 1
                        for tag in tags:
                            r.accumulate(tag)

                    if self.enablePOS: r.accumulate('POS/%s' % POS)

        r.accumulate('stat/WordCount', value=n_term_total_number)
        # NaN avoids ZeroDivisionError and propagates through the ratios below
        if n_term_total_number == 0: n_term_total_number = float('NaN')

        r.accumulate('stat/WordPerSentence',
                     value=float(n_term_total_number) /
                     n_sentence_total_number)
        r.accumulate('stat/DicCoverRate',
                     value=float(n_term_in_dic) / n_term_total_number)
        r.accumulate('stat/Numerals',
                     value=float(n_term_numerals) / n_term_total_number)
        r.accumulate('stat/SixLtr',
                     value=float(n_term_len_gte6) / n_term_total_number)

        r.accumulate('stat/FourCharWord',
                     value=float(n_term_len_gte4) / n_term_total_number)
        r.accumulate('stat/Latin',
                     value=float(n_term_latin) / n_term_total_number)

        r.accumulate('stat/AtMention', value=n_term_at_mention)
        r.accumulate('stat/Emotion', value=n_term_emotion)
        r.accumulate('stat/HashTag', value=n_term_hashtag)
        r.accumulate('stat/URLs', value=n_term_url)

        return r
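
A matching sketch for the file-based variant, writing a small UTF-8 file first; the path is a placeholder and `extractor` is the same hypothetical owner as above:

    with codecs.open('statuses.txt', 'w', encoding='utf-8') as out:
        out.write(u'今天天气不错!\n早点休息。\n')
    r = extractor.process_file('statuses.txt')  # accumulates the same stat/* features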
Example #6
def extractWeibo(fname):
    nStatuses = 0
    nOriginal = 0
    nOrigStatusWithPics = 0
    nContainsUrl = 0
    nContainsMention = 0
    nContainsWe = 0
    nContainsI = 0
    nComposeLate = 0
    nNegExps = 0

    nTextLength = []

    minCreatedAt = parser.parse('Dec 31 23:59:59 +0800 2099', fuzzy=True)
    maxCreatedAt = parser.parse('Jan 01 00:00:01 +0800 1970', fuzzy=True)

    try:
        f = codecs.open(fname, 'r', encoding='utf-8')

        statuses = json.load(f, encoding='utf-8')
        nStatuses += len(statuses)

        for s in statuses:
            text = s['text']
            l = len(text)
            nTextLength.append(l)

            isOriginal = 1 if s.get('retweeted_status', None) is None else 0
            nOriginal += isOriginal

            containsPic = s.get('pic_ids',None) is not None \
                or s.get('pic_urls',None) is not None
            nOrigStatusWithPics += 1 if isOriginal > 0 and containsPic else 0

            containsUrl = 'http://t.cn/' in text
            nContainsUrl += 1 if containsUrl else 0

            containsMention = '@' in text
            nContainsMention += 1 if containsMention else 0

            if l == 0:
                words = []
            else:
                t = text.encode('UTF-8')
                words = nlpir.Seg(t)

            for i in words:
                word = i[0]
                if word in set_I: nContainsI += 1
                if word in set_We: nContainsWe += 1

            createdAt = parser.parse(s['created_at'], fuzzy=True)

            if createdAt < minCreatedAt: minCreatedAt = createdAt
            if createdAt > maxCreatedAt: maxCreatedAt = createdAt

            createdHour = createdAt.hour
            if not (createdHour > 6 and createdHour < 22):
                nComposeLate += 1

            exps = re.findall(re_exp, text)
            for exp in exps:
                if exp in neg_exp:
                    nNegExps += 1

        f.close()

        features = dict()
        features['公开微博总数'] = nStatuses  # total public statuses
        features['原创微博数'] = nOriginal  # original statuses
        features['含图片原创微博数'] = nOrigStatusWithPics  # original statuses with pictures
        features['含URL微博数'] = nContainsUrl  # statuses containing a URL
        features['含@的微博数'] = nContainsMention  # statuses containing @
        features['微博中第一人称复数使用次数'] = nContainsWe  # first-person plural usages
        features['微博中第一人称单数使用次数'] = nContainsI  # first-person singular usages
        features['夜间时段发微博数'] = nComposeLate  # statuses posted at night
        features['含消极表情总数'] = nNegExps  # negative emoticons
        features['公开微博字数平均值'] = numpy.mean(
            nTextLength) if len(nTextLength) > 0 else 'N/A'  # mean text length
        features['公开微博字数STD'] = numpy.std(
            nTextLength) if len(nTextLength) > 0 else 'N/A'  # text length std
        features['公开微博字数MAX'] = numpy.max(
            nTextLength) if len(nTextLength) > 0 else 'N/A'  # text length max
        features['公开微博字数MIN'] = numpy.min(
            nTextLength) if len(nTextLength) > 0 else 'N/A'  # text length min
        features['公开微博字数MEDIAN'] = numpy.median(
            nTextLength) if len(nTextLength) > 0 else 'N/A'  # text length median
        features['最早一条微博发布时间'] = minCreatedAt.date()  # earliest post date
        features['最后一条微博发布时间'] = maxCreatedAt.date()  # latest post date

        for k, v in features.iteritems():
            features[k] = str(v)

        global textLen
        textLen.extend(nTextLength)

        return features

    except IOError:
        return {}
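
A usage sketch, assuming `statuses.json` (a placeholder path) holds a JSON list of status objects shaped like those consumed by Example #3:

    features = extractWeibo('statuses.json')  # returns {} on IOError
    for k in sorted(features):
        print k, features[k]  # Python 2 print, matching the iteritems() above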