def process_file(self, file_path, file_encoding='utf-8-sig'):
    """Segment a text file line by line and accumulate term statistics.

    Args:
        file_path: path of the input text file.
        file_encoding: codec used to read the file (default strips a BOM).

    Returns:
        Result carrying dictionary-tag counts, optional POS/<tag> counts,
        and stat/* metrics (counts and per-word ratios).
    """
    r = Result()
    # Start at 1 so text without any separator still counts as one sentence.
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = 0
    n_term_in_dic = 0
    n_term_len_gte6 = 0
    n_term_len_gte4 = 0
    n_term_latin = 0
    n_term_at_mention = 0
    n_term_emotion = 0
    n_term_hashtag = 0
    n_term_url = 0
    with codecs.open(file_path, 'r', encoding=file_encoding) as fp:
        for line in fp:
            line = line.strip(' \t\r\n').encode('utf-8')
            if len(line) < 1:
                continue
            # Pattern-based counts are taken on the raw (unsegmented) line.
            n_term_numerals += len(find_numeral(line))
            n_term_at_mention += len(find_at_mention(line))
            n_term_emotion += len(find_emotions(line))
            n_term_hashtag += len(find_hashtag(line))
            # BUG FIX: URL matches were previously added to n_term_emotion,
            # leaving stat/URLs always 0 and inflating stat/Emotion.
            n_term_url += len(find_url(line))
            lst = nlpir.Seg(line)
            for t in lst:
                term = t[0].decode('utf-8', 'ignore')
                POS = t[1]
                n_term_total_number += 1
                if is_sentence_separator(term):
                    # Sentence separator: bump the sentence counter.
                    n_sentence_total_number += 1
                else:
                    if is_latin(term):
                        n_term_latin += 1
                    if len(term) >= 6:
                        n_term_len_gte6 += 1
                    if len(term) >= 4:
                        n_term_len_gte4 += 1
                    tags = get_term_tags(term)
                    if len(tags) > 0:
                        n_term_in_dic += 1
                    for tag in tags:
                        r.accumulate(tag)
                if self.enablePOS:
                    r.accumulate('POS/%s' % POS)
    r.accumulate('stat/WordCount', value=n_term_total_number)
    if n_term_total_number == 0:
        # Avoid ZeroDivisionError below; ratios degrade to NaN.
        n_term_total_number = float('NaN')
    # BUG FIX: cast to float so ratios are not truncated by integer
    # division under Python 2 (matches the sibling process_paragraph).
    r.accumulate('stat/WordPerSentence', value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/DicCoverRate', value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/Numerals', value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/SixLtr', value=float(n_term_len_gte6) / n_term_total_number)
    r.accumulate('stat/FourCharWord', value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/Latin', value=float(n_term_latin) / n_term_total_number)
    r.accumulate('stat/AtMention', value=n_term_at_mention)
    r.accumulate('stat/Emotion', value=n_term_emotion)
    r.accumulate('stat/HashTag', value=n_term_hashtag)
    r.accumulate('stat/URLs', value=n_term_url)
    return r
def process_paragraph(self, paragraph, encoding='utf-8'):
    """Segment a single paragraph and collect term/tag statistics.

    Args:
        paragraph: raw paragraph text handed to the pattern finders and
            to nlpir.Seg.
        encoding: unused here; retained for interface compatibility.

    Returns:
        Result holding tag counts, optional POS/<tag> counts, and
        stat/* metrics (counts plus per-word ratios).
    """
    result = Result()
    # Pattern-based counts over the whole paragraph, taken up front.
    num_numerals = len(find_numeral(paragraph))
    num_mentions = len(find_at_mention(paragraph))
    num_emotions = len(find_emotions(paragraph))
    num_hashtags = len(find_hashtag(paragraph))
    num_urls = len(find_url(paragraph))
    num_sentences = 1  # a paragraph counts as at least one sentence
    num_terms = 0
    num_in_dic = 0
    num_gte6 = 0
    num_gte4 = 0
    num_latin = 0
    for token in nlpir.Seg(paragraph):
        word = token[0].decode('utf-8', 'ignore')
        pos_tag = token[1]
        num_terms += 1
        if is_sentence_separator(word):
            # Sentence boundary reached.
            num_sentences += 1
        else:
            if is_latin(word):
                num_latin += 1
            word_len = len(word)
            if word_len >= 6:
                num_gte6 += 1
            if word_len >= 4:
                num_gte4 += 1
            term_tags = get_term_tags(word)
            if len(term_tags) > 0:
                num_in_dic += 1
            for tag in term_tags:
                result.accumulate(tag)
        if self.enablePOS:
            result.accumulate('POS/%s' % pos_tag)
    result.accumulate('stat/WordCount', value=num_terms)
    if num_terms == 0:
        num_terms = float('NaN')  # ratios fall back to NaN, no division error
    denom = float(num_terms)
    result.accumulate('stat/WordPerSentence', value=denom / num_sentences)
    result.accumulate('stat/RateDicCover', value=num_in_dic / denom)
    result.accumulate('stat/RateNumeral', value=num_numerals / denom)
    result.accumulate('stat/RateSixLtrWord', value=num_gte6 / denom)
    result.accumulate('stat/RateFourCharWord', value=num_gte4 / denom)
    result.accumulate('stat/RateLatinWord', value=num_latin / denom)
    result.accumulate('stat/NumAtMention', value=num_mentions)
    result.accumulate('stat/NumEmotion', value=num_emotions)
    result.accumulate('stat/NumHashTag', value=num_hashtags)
    result.accumulate('stat/NumURLs', value=num_urls)
    return result
def process_file(self, file_path, file_encoding='utf-8-sig'):
    """Segment a text file line by line and accumulate term statistics.

    Args:
        file_path: path of the input text file.
        file_encoding: codec used to read the file (default strips a BOM).

    Returns:
        Result carrying dictionary-tag counts, optional POS/<tag> counts,
        and stat/* metrics (counts and per-word ratios).
    """
    r = Result()
    # Start at 1 so text without any separator still counts as one sentence.
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = 0
    n_term_in_dic = 0
    n_term_len_gte6 = 0
    n_term_len_gte4 = 0
    n_term_latin = 0
    n_term_at_mention = 0
    n_term_emotion = 0
    n_term_hashtag = 0
    n_term_url = 0
    with codecs.open(file_path, 'r', encoding=file_encoding) as fp:
        for line in fp:
            line = line.strip(' \t\r\n').encode('utf-8')
            if len(line) < 1:
                continue
            # Pattern-based counts are taken on the raw (unsegmented) line.
            n_term_numerals += len(find_numeral(line))
            n_term_at_mention += len(find_at_mention(line))
            n_term_emotion += len(find_emotions(line))
            n_term_hashtag += len(find_hashtag(line))
            # BUG FIX: URL matches were previously added to n_term_emotion,
            # leaving stat/URLs always 0 and inflating stat/Emotion.
            n_term_url += len(find_url(line))
            lst = nlpir.Seg(line)
            for t in lst:
                term = t[0].decode('utf-8', 'ignore')
                POS = t[1]
                n_term_total_number += 1
                if is_sentence_separator(term):
                    # Sentence separator: bump the sentence counter.
                    n_sentence_total_number += 1
                else:
                    if is_latin(term):
                        n_term_latin += 1
                    if len(term) >= 6:
                        n_term_len_gte6 += 1
                    if len(term) >= 4:
                        n_term_len_gte4 += 1
                    tags = get_term_tags(term)
                    if len(tags) > 0:
                        n_term_in_dic += 1
                    for tag in tags:
                        r.accumulate(tag)
                if self.enablePOS:
                    r.accumulate('POS/%s' % POS)
    r.accumulate('stat/WordCount', value=n_term_total_number)
    if n_term_total_number == 0:
        # Avoid ZeroDivisionError below; ratios degrade to NaN.
        n_term_total_number = float('NaN')
    # BUG FIX: cast to float so ratios are not truncated by integer
    # division under Python 2 (matches the sibling process_paragraph).
    r.accumulate('stat/WordPerSentence', value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/DicCoverRate', value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/Numerals', value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/SixLtr', value=float(n_term_len_gte6) / n_term_total_number)
    r.accumulate('stat/FourCharWord', value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/Latin', value=float(n_term_latin) / n_term_total_number)
    r.accumulate('stat/AtMention', value=n_term_at_mention)
    r.accumulate('stat/Emotion', value=n_term_emotion)
    r.accumulate('stat/HashTag', value=n_term_hashtag)
    r.accumulate('stat/URLs', value=n_term_url)
    return r
def process_paragraph(self, paragraph, encoding='utf-8'):
    """Run the segmenter over one paragraph and accumulate statistics.

    Args:
        paragraph: raw paragraph text fed to the pattern finders and
            to nlpir.Seg.
        encoding: unused; retained for interface compatibility.

    Returns:
        Result containing tag counts, optional POS/<tag> counts, and
        stat/* metrics (counts plus per-word ratios).
    """
    acc = Result()
    # Whole-paragraph pattern counts, computed before segmentation.
    numerals = len(find_numeral(paragraph))
    mentions = len(find_at_mention(paragraph))
    emotions = len(find_emotions(paragraph))
    hashtags = len(find_hashtag(paragraph))
    urls = len(find_url(paragraph))
    sentence_count = 1  # at least one sentence per paragraph
    word_count = 0
    in_dic = 0
    gte6 = 0
    gte4 = 0
    latin = 0
    for seg in nlpir.Seg(paragraph):
        token = seg[0].decode('utf-8', 'ignore')
        pos = seg[1]
        word_count += 1
        if is_sentence_separator(token):
            sentence_count += 1  # sentence boundary hit
        else:
            if is_latin(token):
                latin += 1
            size = len(token)
            if size >= 6:
                gte6 += 1
            if size >= 4:
                gte4 += 1
            token_tags = get_term_tags(token)
            if len(token_tags) > 0:
                in_dic += 1
            for tag in token_tags:
                acc.accumulate(tag)
        if self.enablePOS:
            acc.accumulate('POS/%s' % pos)
    acc.accumulate('stat/WordCount', value=word_count)
    if word_count == 0:
        word_count = float('NaN')  # ratios fall back to NaN
    acc.accumulate('stat/WordPerSentence', value=float(word_count) / sentence_count)
    acc.accumulate('stat/RateDicCover', value=float(in_dic) / word_count)
    acc.accumulate('stat/RateNumeral', value=float(numerals) / word_count)
    acc.accumulate('stat/RateSixLtrWord', value=float(gte6) / word_count)
    acc.accumulate('stat/RateFourCharWord', value=float(gte4) / word_count)
    acc.accumulate('stat/RateLatinWord', value=float(latin) / word_count)
    acc.accumulate('stat/NumAtMention', value=mentions)
    acc.accumulate('stat/NumEmotion', value=emotions)
    acc.accumulate('stat/NumHashTag', value=hashtags)
    acc.accumulate('stat/NumURLs', value=urls)
    return acc
def process_iterator(lst_original, lst_segged, to_ration=True, enable_pos=False, encoding='utf-8'):
    """Process parallel lists of raw paragraphs and their segmentation strings.

    Args:
        lst_original: raw paragraph strings; pattern counts (numerals,
            mentions, emotions, hashtags, URLs) are taken from these.
        lst_segged: segmentation-result strings, parallel to lst_original.
        to_ration: forwarded to Result.to_list (spelling kept for backward
            compatibility with keyword callers; presumably means "to_ratio").
        enable_pos: when True, accumulate a POS/<tag> counter per term.
        encoding: codec used to decode each segmented term.

    Raises:
        ValueError: when the two input lists differ in length.

    Returns:
        The value of Result.to_list over the accumulated statistics.
    """
    if len(lst_original) != len(lst_segged):
        raise ValueError('The length of lst_original and lst_segged should be equal!')
    r = Result()
    # Start at 1 so text without any separator still counts as one sentence.
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = 0
    n_term_in_dic = 0
    n_term_len_gte6 = 0
    n_term_len_gte4 = 0
    n_term_latin = 0
    n_term_at_mention = 0
    n_term_emotion = 0
    n_term_hashtag = 0
    n_term_url = 0
    for line, seg_str in zip(lst_original, lst_segged):
        line = line.strip(' \t\r\n')
        if len(line) < 1:
            continue
        # Pattern-based counts on the raw (unsegmented) line.
        n_term_numerals += len(find_numeral(line))
        n_term_at_mention += len(find_at_mention(line))
        n_term_emotion += len(find_emotions(line))
        n_term_hashtag += len(find_hashtag(line))
        n_term_url += len(find_url(line))
        segged_terms = default_seg(seg_str)
        for t in segged_terms:
            term = t[0].decode(encoding, 'ignore')
            pos = t[1]
            n_term_total_number += 1
            if is_sentence_separator(term):
                # Sentence separator: bump the sentence counter.
                n_sentence_total_number += 1
            else:
                if is_latin(term):
                    n_term_latin += 1
                if len(term) >= 6:
                    n_term_len_gte6 += 1
                if len(term) >= 4:
                    n_term_len_gte4 += 1
                tags = get_term_tags(term)
                if len(tags) > 0:
                    n_term_in_dic += 1
                for tag in tags:
                    r.accumulate(tag)
            if enable_pos:
                r.accumulate('POS/%s' % pos)
    r.accumulate('stat/WordCount', value=n_term_total_number)
    if n_term_total_number == 0:
        # Avoid ZeroDivisionError below; ratios degrade to NaN.
        n_term_total_number = float('NaN')
    r.accumulate('stat/WordPerSentence', value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/RateDicCover', value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/RateNumeral', value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/RateSixLtrWord', value=float(n_term_len_gte6) / n_term_total_number)
    r.accumulate('stat/RateFourCharWord', value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/RateLatinWord', value=float(n_term_latin) / n_term_total_number)
    r.accumulate('stat/NumAtMention', value=n_term_at_mention)
    r.accumulate('stat/NumEmotion', value=n_term_emotion)
    r.accumulate('stat/NumHashTag', value=n_term_hashtag)
    r.accumulate('stat/NumURLs', value=n_term_url)
    # BUG FIX: previously returned r.to_list(to_ratio) -- an undefined name
    # (the parameter is spelled 'to_ration') -- which raised NameError on
    # every call. Use the actual parameter.
    return r.to_list(to_ration)
def process_iterator(lst_original, lst_segged, to_ration=True, enable_pos=False, encoding='utf-8'):
    """Process parallel lists of raw paragraphs and their segmentation strings.

    Args:
        lst_original: raw paragraph strings; pattern counts (numerals,
            mentions, emotions, hashtags, URLs) are taken from these.
        lst_segged: segmentation-result strings, parallel to lst_original.
        to_ration: forwarded to Result.to_list (spelling kept for backward
            compatibility with keyword callers; presumably means "to_ratio").
        enable_pos: when True, accumulate a POS/<tag> counter per term.
        encoding: codec used to decode each segmented term.

    Raises:
        ValueError: when the two input lists differ in length.

    Returns:
        The value of Result.to_list over the accumulated statistics.
    """
    if len(lst_original) != len(lst_segged):
        raise ValueError('The length of lst_original and lst_segged should be equal!')
    r = Result()
    # Start at 1 so text without any separator still counts as one sentence.
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = 0
    n_term_in_dic = 0
    n_term_len_gte6 = 0
    n_term_len_gte4 = 0
    n_term_latin = 0
    n_term_at_mention = 0
    n_term_emotion = 0
    n_term_hashtag = 0
    n_term_url = 0
    for line, seg_str in zip(lst_original, lst_segged):
        line = line.strip(' \t\r\n')
        if len(line) < 1:
            continue
        # Pattern-based counts on the raw (unsegmented) line.
        n_term_numerals += len(find_numeral(line))
        n_term_at_mention += len(find_at_mention(line))
        n_term_emotion += len(find_emotions(line))
        n_term_hashtag += len(find_hashtag(line))
        n_term_url += len(find_url(line))
        segged_terms = default_seg(seg_str)
        for t in segged_terms:
            term = t[0].decode(encoding, 'ignore')
            pos = t[1]
            n_term_total_number += 1
            if is_sentence_separator(term):
                # Sentence separator: bump the sentence counter.
                n_sentence_total_number += 1
            else:
                if is_latin(term):
                    n_term_latin += 1
                if len(term) >= 6:
                    n_term_len_gte6 += 1
                if len(term) >= 4:
                    n_term_len_gte4 += 1
                tags = get_term_tags(term)
                if len(tags) > 0:
                    n_term_in_dic += 1
                for tag in tags:
                    r.accumulate(tag)
            if enable_pos:
                r.accumulate('POS/%s' % pos)
    r.accumulate('stat/WordCount', value=n_term_total_number)
    if n_term_total_number == 0:
        # Avoid ZeroDivisionError below; ratios degrade to NaN.
        n_term_total_number = float('NaN')
    r.accumulate('stat/WordPerSentence', value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/RateDicCover', value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/RateNumeral', value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/RateSixLtrWord', value=float(n_term_len_gte6) / n_term_total_number)
    r.accumulate('stat/RateFourCharWord', value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/RateLatinWord', value=float(n_term_latin) / n_term_total_number)
    r.accumulate('stat/NumAtMention', value=n_term_at_mention)
    r.accumulate('stat/NumEmotion', value=n_term_emotion)
    r.accumulate('stat/NumHashTag', value=n_term_hashtag)
    r.accumulate('stat/NumURLs', value=n_term_url)
    # BUG FIX: previously returned r.to_list(to_ratio) -- an undefined name
    # (the parameter is spelled 'to_ration') -- which raised NameError on
    # every call. Use the actual parameter.
    return r.to_list(to_ration)
def process_iterator(iterator, segmentor=default_seg, enable_pos=True, encoding='utf-8'):
    """Walk an iterable of text lines, segment each, and gather statistics.

    Args:
        iterator: iterable of raw text lines.
        segmentor: callable mapping a line to (term, pos) pairs.
        enable_pos: when True, also accumulate POS/<tag> counts.
        encoding: codec used to decode each segmented term.

    Returns:
        Result holding tag counts, optional POS counts, and stat/* metrics.
    """
    res = Result()
    sentences = 1  # non-empty input counts as at least one sentence
    words = 0
    cnt_numeral = 0
    cnt_in_dic = 0
    cnt_gte6 = 0
    cnt_gte4 = 0
    cnt_latin = 0
    cnt_mention = 0
    cnt_emotion = 0
    cnt_hashtag = 0
    cnt_url = 0
    for raw in iterator:
        text = raw.strip(' \t\r\n')
        if not text:
            continue
        # Pattern-based counts on the raw (unsegmented) line.
        cnt_numeral += len(find_numeral(text))
        cnt_mention += len(find_at_mention(text))
        cnt_emotion += len(find_emotions(text))
        cnt_hashtag += len(find_hashtag(text))
        cnt_url += len(find_url(text))
        for word, pos in segmentor(text):
            word = word.decode(encoding, 'ignore')
            words += 1
            if is_sentence_separator(word):
                sentences += 1  # sentence boundary
            else:
                if is_latin(word):
                    cnt_latin += 1
                wlen = len(word)
                if wlen >= 6:
                    cnt_gte6 += 1
                if wlen >= 4:
                    cnt_gte4 += 1
                word_tags = get_term_tags(word)
                if len(word_tags) > 0:
                    cnt_in_dic += 1
                for tag in word_tags:
                    res.accumulate(tag)
            if enable_pos:
                res.accumulate('POS/%s' % pos)
    res.accumulate('stat/WordCount', value=words)
    if words == 0:
        words = float('NaN')  # ratios degrade to NaN instead of raising
    denom = float(words)
    res.accumulate('stat/WordPerSentence', value=denom / sentences)
    res.accumulate('stat/RateDicCover', value=cnt_in_dic / denom)
    res.accumulate('stat/RateNumeral', value=cnt_numeral / denom)
    res.accumulate('stat/RateSixLtrWord', value=cnt_gte6 / denom)
    res.accumulate('stat/RateFourCharWord', value=cnt_gte4 / denom)
    res.accumulate('stat/RateLatinWord', value=cnt_latin / denom)
    res.accumulate('stat/NumAtMention', value=cnt_mention)
    res.accumulate('stat/NumEmotion', value=cnt_emotion)
    res.accumulate('stat/NumHashTag', value=cnt_hashtag)
    res.accumulate('stat/NumURLs', value=cnt_url)
    return res