def NlpirTokener(self, raw):
    """Segment raw text with NLPIR and return the tokens joined by spaces."""
    nlpir_result = ''
    tokens = nlpir.Seg(raw)
    for w in tokens:
        # nlpir_result += w[0] + "/" + w[1] + ' '  # variant with POS tags appended
        nlpir_result += w[0] + ' '                 # plain tokens, no POS tags
    return nlpir_result
def part_from_string(self, string):
    """Segment a string with NLPIR and return the tokens joined by spaces."""
    if not string:
        return ""
    part = []
    for t in nlpir.Seg(string):
        part.append(t[0])
    part_string = " ".join(part)
    return part_string
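# A minimal usage sketch for the two segmentation helpers above (kept as
# comments so it does not execute at import time). It assumes the enclosing
# class -- here called Tokenizer, a hypothetical name -- exposes them as
# methods, and that the NLPIR binding's Seg() returns (word, POS) tuples,
# which is what the loops above expect.
#
#   tok = Tokenizer()
#   print(tok.NlpirTokener('今天天气不错'))      # e.g. "今天 天气 不错 "
#   print(tok.part_from_string('今天天气不错'))  # e.g. "今天 天气 不错"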
def extract_statuses_behave(statuses, date_filter=None):
    n = 0
    n_filtered = 0
    n_original = 0
    days = set()
    n_comments = 0.0
    n_repost = 0.0
    n_attitudes = 0.0
    n_url = 0
    n_pic = 0
    n_at = 0
    n_emotion = 0
    n_emotion_pos = 0
    n_emotion_neg = 0
    n_night = 0
    n_contains_I = 0
    n_contains_we = 0
    nTextLength = []

    if date_filter is None:
        date_filter = 'Dec 12 23:59:59 +0800 2099'
    date_limit = parser.parse(date_filter, fuzzy=True)
    minCreatedAt = parser.parse('Dec 31 23:59:59 +0800 2099', fuzzy=True)
    maxCreatedAt = parser.parse('Jan 01 00:00:01 +0800 1970', fuzzy=True)

    for s in statuses:
        n += 1
        if 'created_at' not in s:
            continue
        created_at = s.get('created_at')
        created = parser.parse(created_at, fuzzy=True) if isinstance(
            created_at, basestring) else utc.localize(created_at)
        if created < minCreatedAt:
            minCreatedAt = created
        if created > maxCreatedAt:
            maxCreatedAt = created
        if created > date_limit:
            continue
        n_filtered += 1

        text = s.get('text')
        hour = created.hour
        # A status is original if it does not carry a retweeted_status field.
        n_original += 1 if 'retweeted_status' not in s or s.get('is_original', 0) == 1 else 0
        day = ''.join(
            str(t) for t in [created.year, created.month, created.day])
        days.add(int(day))
        n_comments += s.get('comments_count', 0)
        n_repost += s.get('reposts_count', 0)
        n_attitudes += s.get('attitudes_count', 0)
        n_url += 1 if 'http://t.cn/' in text else 0
        n_pic += 1 if len(s.get('pic_ids', s.get('original_pic', ''))) > 0 else 0
        n_at += sum([1 if i == '@' else 0 for i in text])

        l = len(text)
        nTextLength.append(l)
        words = []
        if l > 0:
            t = text.encode('UTF-8')
            words = nlpir.Seg(t)
        for i in words:
            word = i[0]
            if word in set_I:
                n_contains_I += 1
            if word in set_We:
                n_contains_we += 1

        exps = re.findall(re_exp, text)
        for exp in exps:
            n_emotion += 1
            if exp in pos_exp:
                n_emotion_pos += 1
            elif exp in neg_exp:
                n_emotion_neg += 1

        n_night += 1 if hour < 6 or hour > 22 else 0

    if n < 1:
        return ['N/A'] * 23
    if n_filtered > 0:
        n_comments /= n_filtered
        n_repost /= n_filtered
        n_attitudes /= n_filtered

    result = [
        n,
        n_filtered,        # total number of public statuses
        n_original,        # number of original statuses
        n_pic,             # original statuses containing pictures
        n_url,             # statuses containing a URL
        n_at,              # statuses containing '@' mentions
        n_contains_we,     # uses of first-person plural pronouns
        n_contains_I,      # uses of first-person singular pronouns
        n_night,           # statuses posted at night
        n_emotion,         # total emoticons
        n_emotion_pos,     # positive emoticons
        n_emotion_neg,     # negative emoticons
        numpy.mean(nTextLength) if len(nTextLength) > 0 else 'N/A',    # mean status length
        numpy.std(nTextLength) if len(nTextLength) > 0 else 'N/A',     # std of status length
        numpy.max(nTextLength) if len(nTextLength) > 0 else 'N/A',     # max status length
        numpy.min(nTextLength) if len(nTextLength) > 0 else 'N/A',     # min status length
        numpy.median(nTextLength) if len(nTextLength) > 0 else 'N/A',  # median status length
        len(days),         # number of days with at least one status
        n_comments,        # average comments per status
        n_repost,          # average reposts per status
        n_attitudes,       # average attitudes (likes) per status
        minCreatedAt,      # timestamp of the earliest status
        maxCreatedAt       # timestamp of the latest status
    ]
    return [str(i) for i in result]
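# A minimal sketch of calling extract_statuses_behave, kept as comments. It
# assumes the statuses were loaded from a Weibo API JSON dump; the file name
# statuses.json is hypothetical. The date_filter string follows the same
# 'created_at' format that dateutil's fuzzy parser handles above.
#
#   with codecs.open('statuses.json', 'r', encoding='utf-8') as fp:
#       statuses = json.load(fp)
#   features = extract_statuses_behave(statuses,
#                                      date_filter='Dec 31 23:59:59 +0800 2014')
#   print('\t'.join(features))  # 23 string-valued behavioural features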
def process_paragraph(self, paragraph, encoding='utf-8'):
    r = Result()
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = len(find_numeral(paragraph))
    n_term_in_dic = 0
    n_term_len_gte6 = 0
    n_term_len_gte4 = 0
    n_term_latin = 0
    n_term_at_mention = len(find_at_mention(paragraph))
    n_term_emotion = len(find_emotions(paragraph))
    n_term_hashtag = len(find_hashtag(paragraph))
    n_term_url = len(find_url(paragraph))

    lst = nlpir.Seg(paragraph)
    for t in lst:
        term = t[0].decode('utf-8', 'ignore')
        POS = t[1]
        n_term_total_number += 1
        if is_sentence_separator(term):
            # Sentence separator: increment the sentence counter.
            n_sentence_total_number += 1
        else:
            if is_latin(term):
                n_term_latin += 1
            if len(term) >= 6:
                n_term_len_gte6 += 1
            if len(term) >= 4:
                n_term_len_gte4 += 1
            tags = get_term_tags(term)
            if len(tags) > 0:
                n_term_in_dic += 1
            for tag in tags:
                r.accumulate(tag)
            if self.enablePOS:
                r.accumulate('POS/%s' % POS)

    r.accumulate('stat/WordCount', value=n_term_total_number)
    if n_term_total_number == 0:
        n_term_total_number = float('NaN')  # avoid division by zero below
    r.accumulate('stat/WordPerSentence',
                 value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/RateDicCover',
                 value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/RateNumeral',
                 value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/RateSixLtrWord',
                 value=float(n_term_len_gte6) / n_term_total_number)
    r.accumulate('stat/RateFourCharWord',
                 value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/RateLatinWord',
                 value=float(n_term_latin) / n_term_total_number)
    r.accumulate('stat/NumAtMention', value=n_term_at_mention)
    r.accumulate('stat/NumEmotion', value=n_term_emotion)
    r.accumulate('stat/NumHashTag', value=n_term_hashtag)
    r.accumulate('stat/NumURLs', value=n_term_url)
    return r
def process_file(self, file_path, file_encoding='utf-8-sig'):
    r = Result()
    n_sentence_total_number = 1
    n_term_total_number = 0
    n_term_numerals = 0
    n_term_in_dic = 0
    n_term_len_gte6 = 0
    n_term_len_gte4 = 0
    n_term_latin = 0
    n_term_at_mention = 0
    n_term_emotion = 0
    n_term_hashtag = 0
    n_term_url = 0

    with codecs.open(file_path, 'r', encoding=file_encoding) as fp:
        for line in fp:
            line = line.strip(' \t\r\n').encode('utf-8')
            if len(line) < 1:
                continue
            n_term_numerals += len(find_numeral(line))
            n_term_at_mention += len(find_at_mention(line))
            n_term_emotion += len(find_emotions(line))
            n_term_hashtag += len(find_hashtag(line))
            n_term_url += len(find_url(line))
            lst = nlpir.Seg(line)
            for t in lst:
                term = t[0].decode('utf-8', 'ignore')
                POS = t[1]
                n_term_total_number += 1
                if is_sentence_separator(term):
                    # Sentence separator: increment the sentence counter.
                    n_sentence_total_number += 1
                else:
                    if is_latin(term):
                        n_term_latin += 1
                    if len(term) >= 6:
                        n_term_len_gte6 += 1
                    if len(term) >= 4:
                        n_term_len_gte4 += 1
                    tags = get_term_tags(term)
                    if len(tags) > 0:
                        n_term_in_dic += 1
                    for tag in tags:
                        r.accumulate(tag)
                    if self.enablePOS:
                        r.accumulate('POS/%s' % POS)

    r.accumulate('stat/WordCount', value=n_term_total_number)
    if n_term_total_number == 0:
        n_term_total_number = float('NaN')  # avoid division by zero below
    r.accumulate('stat/WordPerSentence',
                 value=float(n_term_total_number) / n_sentence_total_number)
    r.accumulate('stat/DicCoverRate',
                 value=float(n_term_in_dic) / n_term_total_number)
    r.accumulate('stat/Numerals',
                 value=float(n_term_numerals) / n_term_total_number)
    r.accumulate('stat/SixLtr',
                 value=float(n_term_len_gte6) / n_term_total_number)
    r.accumulate('stat/FourCharWord',
                 value=float(n_term_len_gte4) / n_term_total_number)
    r.accumulate('stat/Latin',
                 value=float(n_term_latin) / n_term_total_number)
    r.accumulate('stat/AtMention', value=n_term_at_mention)
    r.accumulate('stat/Emotion', value=n_term_emotion)
    r.accumulate('stat/HashTag', value=n_term_hashtag)
    r.accumulate('stat/URLs', value=n_term_url)
    return r
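# A minimal sketch of driving the two processing methods above, kept as
# comments. The class name TextAnalyzer and the input paths are hypothetical;
# the sketch only assumes the Result object returned here exposes the
# accumulated 'stat/...' and tag counters as shown in the methods above.
#
#   analyzer = TextAnalyzer()
#   r_file = analyzer.process_file('corpus/user_0001.txt')
#   r_para = analyzer.process_paragraph(u'今天天气不错!'.encode('utf-8'))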
def extractWeibo(fname):
    nStatuses = 0
    nOriginal = 0
    nOrigStatusWithPics = 0
    nContainsUrl = 0
    nContainsMention = 0
    nContainsWe = 0
    nContainsI = 0
    nComposeLate = 0
    nNegExps = 0
    nTextLength = []
    minCreatedAt = parser.parse('Dec 31 23:59:59 +0800 2099', fuzzy=True)
    maxCreatedAt = parser.parse('Jan 01 00:00:01 +0800 1970', fuzzy=True)
    try:
        fp = codecs.open(fname, 'r', encoding='utf-8')
        statuses = json.load(fp, encoding='utf-8')
        nStatuses += len(statuses)
        for s in statuses:
            text = s['text']
            l = len(text)
            nTextLength.append(l)

            isOriginal = 1 if s.get('retweeted_status', None) is None else 0
            nOriginal += isOriginal

            containsPic = s.get('pic_ids', None) is not None \
                or s.get('pic_urls', None) is not None
            nOrigStatusWithPics += 1 if isOriginal > 0 and containsPic else 0

            containsUrl = 'http://t.cn/' in text
            nContainsUrl += 1 if containsUrl else 0

            containsMention = '@' in text
            nContainsMention += 1 if containsMention else 0

            if l == 0:
                words = []
            else:
                t = text.encode('UTF-8')
                words = nlpir.Seg(t)
            for i in words:
                word = i[0]
                if word in set_I:
                    nContainsI += 1
                if word in set_We:
                    nContainsWe += 1

            createdAt = parser.parse(s['created_at'], fuzzy=True)
            if createdAt < minCreatedAt:
                minCreatedAt = createdAt
            if createdAt > maxCreatedAt:
                maxCreatedAt = createdAt
            createdHour = createdAt.hour
            if not (createdHour > 6 and createdHour < 22):
                nComposeLate += 1

            exps = re.findall(re_exp, text)
            for exp in exps:
                if exp in neg_exp:
                    nNegExps += 1
        fp.close()

        f = dict()
        f['公开微博总数'] = nStatuses                # total public statuses
        f['原创微博数'] = nOriginal                  # original statuses
        f['含图片原创微博数'] = nOrigStatusWithPics  # original statuses with pictures
        f['含URL微博数'] = nContainsUrl              # statuses containing a URL
        f['含@的微博数'] = nContainsMention          # statuses containing '@' mentions
        f['微博中第一人称复数使用次数'] = nContainsWe  # first-person plural pronoun uses
        f['微博中第一人称单数使用次数'] = nContainsI   # first-person singular pronoun uses
        f['夜间时段发微博数'] = nComposeLate         # statuses posted at night
        f['含消极表情总数'] = nNegExps               # negative emoticons
        f['公开微博字数平均值'] = numpy.mean(
            nTextLength) if len(nTextLength) > 0 else 'N/A'    # mean status length
        f['公开微博字数STD'] = numpy.std(
            nTextLength) if len(nTextLength) > 0 else 'N/A'    # std of status length
        f['公开微博字数MAX'] = numpy.max(
            nTextLength) if len(nTextLength) > 0 else 'N/A'    # max status length
        f['公开微博字数MIN'] = numpy.min(
            nTextLength) if len(nTextLength) > 0 else 'N/A'    # min status length
        f['公开微博字数MEDIAN'] = numpy.median(
            nTextLength) if len(nTextLength) > 0 else 'N/A'    # median status length
        f['最早一条微博发布时间'] = minCreatedAt.date()  # date of earliest status
        f['最后一条微博发布时间'] = maxCreatedAt.date()  # date of latest status
        for k, v in f.iteritems():
            f[k] = str(v)
        global textLen
        textLen.extend(nTextLength)
        return f
    except IOError:
        return {}
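# A minimal sketch of calling extractWeibo on one user's status dump and
# printing the extracted features, kept as comments. The path is hypothetical;
# the keys are the Chinese feature names filled in above, and an empty dict is
# returned when the file cannot be read.
#
#   features = extractWeibo('data/user_statuses.json')
#   for name, value in features.iteritems():
#       print('%s: %s' % (name, value))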