Example #1
0
    def process_file(self, file_path, file_encoding='utf-8-sig'):
        """Compute word/sentence statistics for a text file and return a Result.

        Each non-blank line is scanned for numerals, @-mentions, emoticons,
        hashtags and URLs, then segmented with nlpir.Seg; per-term counters
        are accumulated into the Result both as raw counts and as rates over
        the total word count.
        """
        r = Result()

        # Sentence count starts at 1 so text without a trailing separator
        # still counts as one sentence.
        n_sentence_total_number = 1
        n_term_total_number = 0
        n_term_numerals = 0
        n_term_in_dic = 0
        n_term_len_gte6 = 0

        n_term_len_gte4 = 0
        n_term_latin = 0

        n_term_at_mention = 0
        n_term_emotion = 0
        n_term_hashtag = 0
        n_term_url = 0

        with codecs.open(file_path, 'r', encoding=file_encoding) as fp:
            for line in fp:
                line = line.strip(' \t\r\n').encode('utf-8')
                if len(line) < 1: continue

                n_term_numerals += len(find_numeral(line))
                n_term_at_mention += len(find_at_mention(line))
                n_term_emotion += len(find_emotions(line))
                n_term_hashtag += len(find_hashtag(line))
                # BUG FIX: URL hits were previously added to n_term_emotion,
                # inflating stat/Emotion and leaving stat/URLs permanently 0.
                n_term_url += len(find_url(line))

                lst = nlpir.Seg(line)
                for t in lst:
                    term = t[0].decode('utf-8', 'ignore')
                    POS = t[1]

                    n_term_total_number += 1

                    if is_sentence_separator(term):  # sentence separator: bump sentence count
                        n_sentence_total_number += 1
                    else:
                        if is_latin(term): n_term_latin += 1
                        if len(term) >= 6: n_term_len_gte6 += 1
                        if len(term) >= 4: n_term_len_gte4 += 1

                    tags = get_term_tags(term)
                    if len(tags) > 0:
                        n_term_in_dic += 1
                        for tag in tags:
                            r.accumulate(tag)

                    if self.enablePOS: r.accumulate('POS/%s' % POS)

        r.accumulate('stat/WordCount', value=n_term_total_number)
        # NaN makes the rates below come out NaN (instead of raising
        # ZeroDivisionError) for an empty file.
        if n_term_total_number == 0: n_term_total_number = float('NaN')

        # BUG FIX: wrap numerators with float(); the previous plain int/int
        # division truncates every rate to 0 under Python 2 (the sibling
        # process_paragraph already uses float()).
        r.accumulate('stat/WordPerSentence',
                     value=float(n_term_total_number) / n_sentence_total_number)
        r.accumulate('stat/DicCoverRate',
                     value=float(n_term_in_dic) / n_term_total_number)
        r.accumulate('stat/Numerals',
                     value=float(n_term_numerals) / n_term_total_number)
        r.accumulate('stat/SixLtr',
                     value=float(n_term_len_gte6) / n_term_total_number)

        r.accumulate('stat/FourCharWord',
                     value=float(n_term_len_gte4) / n_term_total_number)
        r.accumulate('stat/Latin',
                     value=float(n_term_latin) / n_term_total_number)

        r.accumulate('stat/AtMention', value=n_term_at_mention)
        r.accumulate('stat/Emotion', value=n_term_emotion)
        r.accumulate('stat/HashTag', value=n_term_hashtag)
        r.accumulate('stat/URLs', value=n_term_url)

        return r
Example #2
0
    def process_paragraph(self, paragraph, encoding='utf-8'):
        """Segment one paragraph and accumulate its word/sentence statistics.

        The paragraph is scanned once for numerals, @-mentions, emoticons,
        hashtags and URLs, then segmented with nlpir.Seg; per-term counters
        feed a Result as raw counts plus rates over the total word count.
        """
        result = Result()

        # Sentence counter starts at 1 so text without a trailing
        # separator still counts as one sentence.
        sentence_count = 1
        word_count = 0
        in_dic_count = 0
        len6_count = 0
        len4_count = 0
        latin_count = 0

        # Paragraph-level pattern counts, gathered in a single pass each.
        numeral_count = len(find_numeral(paragraph))
        at_mention_count = len(find_at_mention(paragraph))
        emotion_count = len(find_emotions(paragraph))
        hashtag_count = len(find_hashtag(paragraph))
        url_count = len(find_url(paragraph))

        for entry in nlpir.Seg(paragraph):
            term = entry[0].decode('utf-8', 'ignore')
            pos_tag = entry[1]

            word_count += 1

            if is_sentence_separator(term):
                # sentence separator: bump the sentence counter
                sentence_count += 1
            else:
                if is_latin(term):
                    latin_count += 1
                if len(term) >= 6:
                    len6_count += 1
                if len(term) >= 4:
                    len4_count += 1

            tags = get_term_tags(term)
            if tags:
                in_dic_count += 1
                for tag in tags:
                    result.accumulate(tag)

            if self.enablePOS:
                result.accumulate('POS/%s' % pos_tag)

        result.accumulate('stat/WordCount', value=word_count)
        # NaN keeps the rate computations below from raising on empty input.
        if word_count == 0:
            word_count = float('NaN')

        total = float(word_count)
        result.accumulate('stat/WordPerSentence', value=total / sentence_count)
        result.accumulate('stat/RateDicCover', value=in_dic_count / total)
        result.accumulate('stat/RateNumeral', value=numeral_count / total)
        result.accumulate('stat/RateSixLtrWord', value=len6_count / total)

        result.accumulate('stat/RateFourCharWord', value=len4_count / total)
        result.accumulate('stat/RateLatinWord', value=latin_count / total)

        result.accumulate('stat/NumAtMention', value=at_mention_count)
        result.accumulate('stat/NumEmotion', value=emotion_count)
        result.accumulate('stat/NumHashTag', value=hashtag_count)
        result.accumulate('stat/NumURLs', value=url_count)

        return result
Example #3
0
    def process_file(self, file_path, file_encoding='utf-8-sig'):
        """Compute word/sentence statistics for a text file and return a Result.

        Each non-blank line is scanned for numerals, @-mentions, emoticons,
        hashtags and URLs, then segmented with nlpir.Seg; per-term counters
        are accumulated into the Result both as raw counts and as rates over
        the total word count.
        """
        r = Result()

        # Sentence count starts at 1 so text without a trailing separator
        # still counts as one sentence.
        n_sentence_total_number = 1
        n_term_total_number = 0
        n_term_numerals = 0
        n_term_in_dic = 0
        n_term_len_gte6 = 0

        n_term_len_gte4 = 0
        n_term_latin = 0

        n_term_at_mention = 0
        n_term_emotion = 0
        n_term_hashtag = 0
        n_term_url = 0

        with codecs.open(file_path, 'r', encoding=file_encoding) as fp:
            for line in fp:
                line = line.strip(' \t\r\n').encode('utf-8')
                if len(line) < 1: continue

                n_term_numerals += len(find_numeral(line))
                n_term_at_mention += len(find_at_mention(line))
                n_term_emotion += len(find_emotions(line))
                n_term_hashtag += len(find_hashtag(line))
                # BUG FIX: URL hits were previously added to n_term_emotion,
                # inflating stat/Emotion and leaving stat/URLs permanently 0.
                n_term_url += len(find_url(line))

                lst = nlpir.Seg(line)
                for t in lst:
                    term = t[0].decode('utf-8', 'ignore')
                    POS = t[1]

                    n_term_total_number += 1

                    if is_sentence_separator(term):  # sentence separator: bump sentence count
                        n_sentence_total_number += 1
                    else:
                        if is_latin(term): n_term_latin += 1
                        if len(term) >= 6: n_term_len_gte6 += 1
                        if len(term) >= 4: n_term_len_gte4 += 1

                    tags = get_term_tags(term)
                    if len(tags) > 0:
                        n_term_in_dic += 1
                        for tag in tags:
                            r.accumulate(tag)

                    if self.enablePOS: r.accumulate('POS/%s' % POS)

        r.accumulate('stat/WordCount', value=n_term_total_number)
        # NaN makes the rates below come out NaN (instead of raising
        # ZeroDivisionError) for an empty file.
        if n_term_total_number == 0: n_term_total_number = float('NaN')

        # BUG FIX: wrap numerators with float(); the previous plain int/int
        # division truncates every rate to 0 under Python 2 (the sibling
        # process_paragraph already uses float()).
        r.accumulate('stat/WordPerSentence',
                     value=float(n_term_total_number) / n_sentence_total_number)
        r.accumulate('stat/DicCoverRate',
                     value=float(n_term_in_dic) / n_term_total_number)
        r.accumulate('stat/Numerals',
                     value=float(n_term_numerals) / n_term_total_number)
        r.accumulate('stat/SixLtr',
                     value=float(n_term_len_gte6) / n_term_total_number)

        r.accumulate('stat/FourCharWord',
                     value=float(n_term_len_gte4) / n_term_total_number)
        r.accumulate('stat/Latin',
                     value=float(n_term_latin) / n_term_total_number)

        r.accumulate('stat/AtMention', value=n_term_at_mention)
        r.accumulate('stat/Emotion', value=n_term_emotion)
        r.accumulate('stat/HashTag', value=n_term_hashtag)
        r.accumulate('stat/URLs', value=n_term_url)

        return r
Example #4
0
    def process_paragraph(self, paragraph, encoding='utf-8'):
        """Segment one paragraph and accumulate its word/sentence statistics.

        The paragraph is scanned once for numerals, @-mentions, emoticons,
        hashtags and URLs, then segmented with nlpir.Seg; per-term counters
        feed a Result as raw counts plus rates over the total word count.
        """
        result = Result()

        # Sentence counter starts at 1 so text without a trailing
        # separator still counts as one sentence.
        sentence_count = 1
        word_count = 0
        in_dic_count = 0
        len6_count = 0
        len4_count = 0
        latin_count = 0

        # Paragraph-level pattern counts, gathered in a single pass each.
        numeral_count = len(find_numeral(paragraph))
        at_mention_count = len(find_at_mention(paragraph))
        emotion_count = len(find_emotions(paragraph))
        hashtag_count = len(find_hashtag(paragraph))
        url_count = len(find_url(paragraph))

        for entry in nlpir.Seg(paragraph):
            term = entry[0].decode('utf-8', 'ignore')
            pos_tag = entry[1]

            word_count += 1

            if is_sentence_separator(term):
                # sentence separator: bump the sentence counter
                sentence_count += 1
            else:
                if is_latin(term):
                    latin_count += 1
                if len(term) >= 6:
                    len6_count += 1
                if len(term) >= 4:
                    len4_count += 1

            tags = get_term_tags(term)
            if tags:
                in_dic_count += 1
                for tag in tags:
                    result.accumulate(tag)

            if self.enablePOS:
                result.accumulate('POS/%s' % pos_tag)

        result.accumulate('stat/WordCount', value=word_count)
        # NaN keeps the rate computations below from raising on empty input.
        if word_count == 0:
            word_count = float('NaN')

        total = float(word_count)
        result.accumulate('stat/WordPerSentence', value=total / sentence_count)
        result.accumulate('stat/RateDicCover', value=in_dic_count / total)
        result.accumulate('stat/RateNumeral', value=numeral_count / total)
        result.accumulate('stat/RateSixLtrWord', value=len6_count / total)

        result.accumulate('stat/RateFourCharWord', value=len4_count / total)
        result.accumulate('stat/RateLatinWord', value=latin_count / total)

        result.accumulate('stat/NumAtMention', value=at_mention_count)
        result.accumulate('stat/NumEmotion', value=emotion_count)
        result.accumulate('stat/NumHashTag', value=hashtag_count)
        result.accumulate('stat/NumURLs', value=url_count)

        return result
Example #5
0
def process_iterator(lst_original, lst_segged, to_ration=True, enable_pos=False, encoding='utf-8'):
    """Process parallel lists of original and pre-segmented paragraphs.

    lst_original: raw paragraph strings, scanned for numerals, @-mentions,
        emoticons, hashtags and URLs.
    lst_segged: one segmentation string per paragraph, parsed with
        default_seg into (term, pos) entries.
    to_ration: forwarded to Result.to_list() (the name is a historical typo
        of "to_ratio", kept for interface compatibility).
    enable_pos: when True, also accumulate a 'POS/<tag>' counter per term.

    Raises ValueError when the two lists differ in length.
    Returns Result.to_list(to_ration).
    """
    if len(lst_original) != len(lst_segged):
        raise ValueError('The length of lst_original and lst_segged should be equal!')

    r = Result()

    # Sentence count starts at 1 so text without a trailing separator
    # still counts as one sentence.
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = 0
    n_term_in_dic = 0
    n_term_len_gte6 = 0

    n_term_len_gte4 = 0
    n_term_latin = 0

    n_term_at_mention = 0
    n_term_emotion = 0
    n_term_hashtag = 0
    n_term_url = 0

    for line, seg_str in zip(lst_original, lst_segged):
        line = line.strip(' \t\r\n')  # .encode('utf-8')
        if len(line) < 1:
            continue

        n_term_numerals += len(find_numeral(line))
        n_term_at_mention += len(find_at_mention(line))
        n_term_emotion += len(find_emotions(line))
        n_term_hashtag += len(find_hashtag(line))
        n_term_url += len(find_url(line))

        segged_terms = default_seg(seg_str)
        for t in segged_terms:
            term = t[0].decode(encoding, 'ignore')
            pos = t[1]

            n_term_total_number += 1

            if is_sentence_separator(term):  # sentence separator: bump sentence count
                n_sentence_total_number += 1
            else:
                if is_latin(term): n_term_latin += 1
                if len(term) >= 6: n_term_len_gte6 += 1
                if len(term) >= 4: n_term_len_gte4 += 1

            tags = get_term_tags(term)
            if len(tags) > 0:
                n_term_in_dic += 1
                for tag in tags:
                    r.accumulate(tag)

            if enable_pos:
                r.accumulate('POS/%s' % pos)

    r.accumulate('stat/WordCount', value=n_term_total_number)
    # NaN makes the rates below NaN (instead of raising) on empty input.
    if n_term_total_number == 0: n_term_total_number = float('NaN')

    r.accumulate('stat/WordPerSentence', value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/RateDicCover', value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/RateNumeral', value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/RateSixLtrWord', value=float(n_term_len_gte6) / n_term_total_number)

    r.accumulate('stat/RateFourCharWord', value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/RateLatinWord', value=float(n_term_latin) / n_term_total_number)

    r.accumulate('stat/NumAtMention', value=n_term_at_mention)
    r.accumulate('stat/NumEmotion', value=n_term_emotion)
    r.accumulate('stat/NumHashTag', value=n_term_hashtag)
    r.accumulate('stat/NumURLs', value=n_term_url)

    # BUG FIX: previously returned r.to_list(to_ratio) -- an undefined name
    # (the parameter is spelled to_ration) -- so every call hit a NameError.
    return r.to_list(to_ration)
Example #6
0
def process_iterator(lst_original,
                     lst_segged,
                     to_ration=True,
                     enable_pos=False,
                     encoding='utf-8'):
    """Process parallel lists of original and pre-segmented paragraphs.

    lst_original: raw paragraph strings, scanned for numerals, @-mentions,
        emoticons, hashtags and URLs.
    lst_segged: one segmentation string per paragraph, parsed with
        default_seg into (term, pos) entries.
    to_ration: forwarded to Result.to_list() (the name is a historical typo
        of "to_ratio", kept for interface compatibility).
    enable_pos: when True, also accumulate a 'POS/<tag>' counter per term.

    Raises ValueError when the two lists differ in length.
    Returns Result.to_list(to_ration).
    """
    if len(lst_original) != len(lst_segged):
        raise ValueError(
            'The length of lst_original and lst_segged should be equal!')

    r = Result()

    # Sentence count starts at 1 so text without a trailing separator
    # still counts as one sentence.
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = 0
    n_term_in_dic = 0
    n_term_len_gte6 = 0

    n_term_len_gte4 = 0
    n_term_latin = 0

    n_term_at_mention = 0
    n_term_emotion = 0
    n_term_hashtag = 0
    n_term_url = 0

    for line, seg_str in zip(lst_original, lst_segged):
        line = line.strip(' \t\r\n')  # .encode('utf-8')
        if len(line) < 1:
            continue

        n_term_numerals += len(find_numeral(line))
        n_term_at_mention += len(find_at_mention(line))
        n_term_emotion += len(find_emotions(line))
        n_term_hashtag += len(find_hashtag(line))
        n_term_url += len(find_url(line))

        segged_terms = default_seg(seg_str)
        for t in segged_terms:
            term = t[0].decode(encoding, 'ignore')
            pos = t[1]

            n_term_total_number += 1

            if is_sentence_separator(term):  # sentence separator: bump sentence count
                n_sentence_total_number += 1
            else:
                if is_latin(term): n_term_latin += 1
                if len(term) >= 6: n_term_len_gte6 += 1
                if len(term) >= 4: n_term_len_gte4 += 1

            tags = get_term_tags(term)
            if len(tags) > 0:
                n_term_in_dic += 1
                for tag in tags:
                    r.accumulate(tag)

            if enable_pos:
                r.accumulate('POS/%s' % pos)

    r.accumulate('stat/WordCount', value=n_term_total_number)
    # NaN makes the rates below NaN (instead of raising) on empty input.
    if n_term_total_number == 0: n_term_total_number = float('NaN')

    r.accumulate('stat/WordPerSentence',
                 value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/RateDicCover',
                 value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/RateNumeral',
                 value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/RateSixLtrWord',
                 value=float(n_term_len_gte6) / n_term_total_number)

    r.accumulate('stat/RateFourCharWord',
                 value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/RateLatinWord',
                 value=float(n_term_latin) / n_term_total_number)

    r.accumulate('stat/NumAtMention', value=n_term_at_mention)
    r.accumulate('stat/NumEmotion', value=n_term_emotion)
    r.accumulate('stat/NumHashTag', value=n_term_hashtag)
    r.accumulate('stat/NumURLs', value=n_term_url)

    # BUG FIX: previously returned r.to_list(to_ratio) -- an undefined name
    # (the parameter is spelled to_ration) -- so every call hit a NameError.
    return r.to_list(to_ration)
Example #7
0
def process_iterator(iterator, segmentor=default_seg, enable_pos=True, encoding='utf-8'):
    """Accumulate word/sentence statistics over an iterable of text lines.

    Each non-blank line is scanned for numerals, @-mentions, emoticons,
    hashtags and URLs, then segmented with `segmentor` into (term, pos)
    pairs; per-term counters feed a Result as raw counts plus rates over
    the total word count.  Returns the Result.
    """
    result = Result()

    # Sentence counter starts at 1 so text without a trailing separator
    # still counts as one sentence.
    sentence_count = 1
    word_count = 0
    numeral_count = 0
    in_dic_count = 0
    len6_count = 0
    len4_count = 0
    latin_count = 0
    at_mention_count = 0
    emotion_count = 0
    hashtag_count = 0
    url_count = 0

    for raw_line in iterator:
        raw_line = raw_line.strip(' \t\r\n')  # .encode('utf-8')
        if not raw_line:
            continue

        numeral_count += len(find_numeral(raw_line))
        at_mention_count += len(find_at_mention(raw_line))
        emotion_count += len(find_emotions(raw_line))
        hashtag_count += len(find_hashtag(raw_line))
        url_count += len(find_url(raw_line))

        for term, pos in segmentor(raw_line):
            term = term.decode(encoding, 'ignore')

            word_count += 1

            if is_sentence_separator(term):
                # sentence separator: bump the sentence counter
                sentence_count += 1
            else:
                if is_latin(term):
                    latin_count += 1
                if len(term) >= 6:
                    len6_count += 1
                if len(term) >= 4:
                    len4_count += 1

            tags = get_term_tags(term)
            if tags:
                in_dic_count += 1
                for tag in tags:
                    result.accumulate(tag)

            if enable_pos:
                result.accumulate('POS/%s' % pos)

    result.accumulate('stat/WordCount', value=word_count)
    # NaN keeps the rate computations below from raising on empty input.
    if word_count == 0:
        word_count = float('NaN')

    total = float(word_count)
    result.accumulate('stat/WordPerSentence', value=total / sentence_count)
    result.accumulate('stat/RateDicCover', value=in_dic_count / total)
    result.accumulate('stat/RateNumeral', value=numeral_count / total)
    result.accumulate('stat/RateSixLtrWord', value=len6_count / total)

    result.accumulate('stat/RateFourCharWord', value=len4_count / total)
    result.accumulate('stat/RateLatinWord', value=latin_count / total)

    result.accumulate('stat/NumAtMention', value=at_mention_count)
    result.accumulate('stat/NumEmotion', value=emotion_count)
    result.accumulate('stat/NumHashTag', value=hashtag_count)
    result.accumulate('stat/NumURLs', value=url_count)

    return result