-
Notifications
You must be signed in to change notification settings - Fork 0
/
pre_emotion.py
150 lines (121 loc) · 4.15 KB
/
pre_emotion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import json
import linecache
import os
import re
import jieba
import numpy as np
from acora import AcoraBuilder
from emotion_cla.emo_cls import classify
from emotion_cla.separate import separate
# Input/output locations: raw tweet files and their emotion-tagged copies.
in_dir = 'data/tweet'
out_dir = 'data/tweet_emo'

# Aho-Corasick matcher over the known emoji markers (one per line in
# data/emoji.txt), built once at import time.  `with` closes the file
# handle, which the original list-comprehension `open(...)` leaked.
with open('data/emoji.txt') as _emoji_file:
    builder = AcoraBuilder([line.strip() for line in _emoji_file])
ac = builder.build()
def load_labelled():
    """Return the set of already-labelled lines (stripped) from the five
    data/content_3000/<i>.txt files, so later sampling can skip them.
    """
    lines = set()
    for i in range(5):
        # `with` guarantees each handle is closed (the original leaked them).
        with open('data/content_3000/{}.txt'.format(i)) as fh:
            for line in fh:
                lines.add(line.strip())
    return lines
# have_lines = load_labelled()
def random_ids(in_name, out_name, lens):
    '''
    Randomly sample up to `lens` distinct lines from `in_name` (a
    tab-separated "label<TAB>text" file) and append the text column to
    `out_name`.  Lines already in the module-level `have_lines` set
    (built by load_labelled, currently disabled) are skipped.

    Returns the set of sampled 1-based line numbers.
    '''
    # Tolerate the `have_lines = load_labelled()` bootstrap being commented
    # out at module level; fall back to "nothing is labelled yet".
    labelled = globals().get('have_lines', set())
    with open(in_name) as fh:
        total = sum(1 for _ in fh)
    # Cannot pick more distinct lines than the file has.
    target = min(lens, total)
    ids = set()
    seen = set()
    with open(out_name, 'a') as out_file:
        # `seen` bounds the loop: it terminates even when too many lines
        # are excluded via `labelled` (the original could spin forever).
        while len(ids) < target and len(seen) < total:
            # linecache is 1-indexed; the original 0-based draw could fetch
            # the empty "line 0" (crashing the unpack) and never the last line.
            num = int(np.random.randint(1, total + 1))
            if num in seen:
                continue
            seen.add(num)
            key = linecache.getline(in_name, num).strip()
            # Compare stripped-to-stripped; the original compared the raw
            # newline-terminated line against stripped entries (never matched).
            if key in labelled:
                continue
            # maxsplit=1 keeps tabs inside the text column intact.
            _, text = key.split('\t', 1)
            ids.add(num)
            out_file.write(text + '\n')
    return ids
def pre_label():
    '''
    Pre-label every tweet file in `in_dir`: attach predicted emotion
    labels for content and title, then append the enriched JSON record
    to the same-named file under `out_dir`.
    '''
    for idx, fname in enumerate(os.listdir(in_dir)):
        print(idx)
        src_path = os.path.join(in_dir, fname)
        for raw in open(src_path):
            record = json.loads(raw)
            record['content_pre_emo'] = classify(separate(record['content']))
            record['title_pre_emo'] = classify(separate(record['title']))
            with open('{}/{}'.format(out_dir, fname), 'a') as out:
                out.write(json.dumps(record, ensure_ascii=False) + '\n')
def get_train_data(in_name):
    '''
    Turn a pre-labelled JSON-lines file into training samples: each
    record's content is appended to data/content/<emotion>.txt as
    "<emotion><TAB><content>".  Only contents of length 11-199 are kept.

    (An earlier emoji/regex filter is disabled; every record passes it.
    The original note said length 5-200, but the code checks 10 < len < 200.)
    '''
    for raw in open(in_name):
        record = json.loads(raw.strip())
        text = record['content']
        # title-based labelling is disabled; only content_pre_emo is used
        emo = record['content_pre_emo']
        # Guard clause replaces the always-true `bingo` flag of the original.
        if not 10 < len(text) < 200:
            continue
        with open('data/content/{}.txt'.format(emo), 'a') as out:
            out.write('{}\t{}\n'.format(emo, text))
def label_split(in_name):
    """
    Split a labelled file into 500-line chunks for annotation work.

    Chunk k (1-based) is appended to `<stem>-(k).txt` next to the input;
    assumes `in_name` ends with a 4-character extension such as '.txt'.
    The original reopened the chunk file for every single line and never
    closed any handle; this keeps one handle per chunk.
    """
    stem = in_name[:-4]
    chunk_file = None
    current_chunk = 0
    try:
        with open(in_name) as src:
            for index, line in enumerate(src):
                chunk = index // 500 + 1
                if chunk != current_chunk:
                    if chunk_file is not None:
                        chunk_file.close()
                    # 'a' preserves the original append semantics.
                    chunk_file = open('{}-({}).txt'.format(stem, chunk), 'a')
                    current_chunk = chunk
                chunk_file.write(line)
                print(index, chunk)
    finally:
        if chunk_file is not None:
            chunk_file.close()
def what_the_fuck(in_dir='data/labelled', out_dir='data/labelled_split'):
    """
    Sort manually-annotated lines into one file per emotion class.

    Each line in the files under `in_dir` starts with a label:
    '1'-'4' are the four emotions, '-' means no emotion (mapped to
    class '0'), 'x' means uncertain and is dropped.  Matching lines
    are appended verbatim to `<out_dir>/<class>.txt`.

    Fixes vs original: removed the unused `labels` list and the unused
    `s = line.split('\\t')[1]`, which raised IndexError on any non-empty
    line without a tab; directories are now parameters (same defaults).
    """
    for in_name in os.listdir(in_dir):
        src_path = os.path.join(in_dir, in_name)
        with open(src_path) as src:
            for line in src:
                if line.strip() == '':
                    continue
                label = line.split('\t')[0]
                # 'x' and anything unexpected is skipped deliberately.
                if label not in ('1', '2', '3', '4', '-'):
                    continue
                if label == '-':
                    label = '0'
                with open('{}/{}.txt'.format(out_dir, label), 'a') as out:
                    out.write(line)
if __name__ == '__main__':
    # Entry point: currently runs only the final stage — splitting the
    # hand-labelled data into per-emotion files.  The commented calls
    # below are earlier pipeline stages (sampling, pre-labelling,
    # chunking) kept for reference.
    # for line in open('data/random_ids.txt'):
    # # for line in open('data/_id.txt'):
    # line = line.strip().split(',')[0]
    # print(line)
    # in_name = 'data/tweet_emo/' + line.strip() + '.txt'
    # get_train_data(in_name)
    # random_ids('data/_id.txt', 100)
    # get_train_data('data/002446.txt')
    # for i in range(5):
    # random_ids('data/content/{}.txt'.format(i), 'data/content_sample_3000/{}.txt'.format(i), 3000)
    # for i in range(1, 5):
    # label_split('data/content_3000/{}.txt'.format(i))
    what_the_fuck()