# -*- coding: utf-8 -*-
"""
IThotline0817.py
Author: Siyuan Liu
Python Version: 3.6
"""
import pandas as pd
import matplotlib.pyplot as plt
import jieba
import jieba.analyse
import jieba.posseg as psg
from nltk import FreqDist
from stop_words import get_stop_words
from gensim import corpora, models
import re
# Synonym dictionary
synonym_file = 'D:\\data\\synonym.xlsx'
synonym = pd.read_excel(synonym_file, header=None)
synonym = dict(zip(synonym[0], synonym[1]))
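# Assumed layout of synonym.xlsx: two columns, with the raw term in
# column 0 and its canonical replacement in column 1.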
# Domain vocabulary: jieba's dictionary format differs from HanLP's, so only
# the word column (the first column) is kept, without frequency or POS
domainwords_file = pd.read_csv("D:\\data\\domainwords.txt", sep="\t", header=None)
domainwords = list(domainwords_file[0])
domain_open = 'D:\\data\\domain_open.txt'  # target path for the jieba-format dictionary (assumed filename)
f = open(domain_open, "w", encoding='utf-8')
for item in domainwords:
    f.write(str(item) + '\n')
f.close()
# Load the domain dictionary
jieba.load_userdict(domain_open)
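# Note: jieba.load_userdict accepts one entry per line in the form
# "word [freq] [POS]", with frequency and POS optional, so the bare
# word column written above is sufficient.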
# Load the incident ticket data
file = 'D:\\data\\incidents_201612.xlsx'
IT = pd.read_excel(file)  # Read the data into a DataFrame
IT.info()  # Check basic information: 169391 rows x 43 columns
TITLE = list(IT.TITLE)
# Remove adjacent duplicate entries in TITLE
def dup_del(l):
    i = 1
    while i < len(l):
        if l[i] == l[i - 1]:
            l.pop(i)
        else:
            i += 1
    return l
TITLE_no_dup = pd.Series(dup_del(TITLE))
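# Example: dup_del(['a', 'a', 'b', 'a']) -> ['a', 'b', 'a']; only adjacent
# repeats are removed, so the same title recurring later in the list is kept.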
#Description of interesting variables
#Analysis by Date and Time
IT.rename(columns={"OPEN DATE": "OPEN_DATE", "OPEN TIME": "OPEN_TIME"}, inplace=True)
IT['DAY'] = pd.to_datetime(IT.OPEN_DATE).dt.date
group_by_day = IT.groupby('DAY')
day_count = group_by_day.count().ID
IT['TIME'] = pd.to_datetime(IT.OPEN_TIME).dt.time
IT['HOUR'] = pd.to_datetime(IT.OPEN_TIME).dt.hour
group_by_hour = IT.groupby('HOUR')
hour_count = group_by_hour.count().ID
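# Quick visual check of the daily and hourly ticket volumes -- a minimal
# sketch; the output paths are illustrative:
day_count.plot(kind='bar', figsize=(12, 4), title='Tickets per day')
plt.tight_layout()
plt.savefig('D:\\data\\tickets_per_day.png')
plt.close()
hour_count.plot(kind='bar', title='Tickets per hour')
plt.tight_layout()
plt.savefig('D:\\data\\tickets_per_hour.png')
plt.close()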
# Take 2016-12-01 as a sample to inspect that day's hourly ticket volume
IT_1201 = IT[IT.OPEN_DATE == "2016-12-01"]
group_by_hour_1201 = IT_1201.groupby('HOUR')
hour_count_1201 = group_by_hour_1201.count().ID
# Analysis by area
area = IT['AREA'].unique()
subarea = IT['SUBAREA'].unique()
group_by_area = IT.groupby(['DAY', 'AREA'])
area_count = group_by_area.count().ID
area_count = area_count.sort_values(ascending=False)
# Analysis by urgency (keep only the first character of URGENCY)
IT.URGENCY = IT.URGENCY.str[:1]
group_by_urgency = IT.groupby('URGENCY')
urgency_count = group_by_urgency.count().ID
group_urgency_day = IT.groupby(['URGENCY', 'DAY'])
urgency_day_count = group_urgency_day.count().ID
#Import stopwords
stopwords_file = open('D:\\data\\analysis\\stopwords.txt', 'r', encoding='utf-8')
stopwords = []
for line in stopwords_file.readlines():
    stopwords.append(line.strip())
stopwords_file.close()
stopwords = stopwords + get_stop_words('en') + [' ', '\n'] + ['almost', 'please', 'jqr']
## Write the stopword file to prepare for jieba.analyse
## (jieba.analyse extracts keywords via TF-IDF)
stopwords_file = open('D:\\data\\newstop.txt', 'w', encoding='utf-8')
for word in stopwords:
    stopwords_file.write(str(word) + '\n')
stopwords_file.close()
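# With the stopword file written, jieba.analyse can use it for TF-IDF
# keyword extraction -- a minimal sketch (the analysis below does its own
# frequency counting instead):
jieba.analyse.set_stop_words('D:\\data\\newstop.txt')
top_keywords = jieba.analyse.extract_tags(' '.join(str(t) for t in TITLE_no_dup), topK=10)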
## Word segmentation -- the result is a single flat list
# psg.lcut() returns word/POS-tag pairs
def cut_word(raw_data):
    cut_result = []
    prbl_cl = list(raw_data)
    for entry in prbl_cl:
        word_cut = psg.lcut(entry)
        cut_result += word_cut
    return cut_result

# Segment everything into one flat list
cut_all = cut_word(TITLE_no_dup)
# Segment per ticket: one list of pairs per TITLE entry
cut_title = TITLE_no_dup.map(psg.lcut)
# Collect every token containing "office" or "outlook"
office = list(set([w.word for w in cut_all if re.search('office', w.word)]))
outlook = list(set([w.word for w in cut_all if re.search('outlook', w.word)]))
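# Tokens like office365 or outlook2013 mix letters and digits and would be
# dropped by the letter+digit filters below; these two lists therefore act
# as whitelists in the cleaning functions.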
# Synonym replacement
def syn_rep(raw_data):
    for w in raw_data:
        if w.word in synonym:
            w.word = synonym[w.word]
    return raw_data
# Clean the segmentation results
# Cleaner for plain jieba.lcut() output (bare strings, no POS tags)
def data_clean(raw_data):
    raw_data = [word.lower() for word in raw_data]
    raw_data = [word for word in raw_data if word not in stopwords]
    raw_data = [word for word in raw_data if not word.isdigit()]
    raw_data = [word for word in raw_data if not re.search('^[a-z]$', word)]
    raw_data = [word for word in raw_data if not re.search(r'\.', word)]
    raw_data = [word for word in raw_data if not re.search('^[a-z]+[0-9]+', word) or word in office or word in outlook]
    raw_data = [word for word in raw_data if len(word) <= 13 or word in office or word in outlook]
    return raw_data
# Cleaner for psg.lcut() output (word/POS pairs): keep nouns (n*), verbs (v*),
# English tokens ('eng') and unknowns ('x'), then apply the same filters
def data_clean_psg(raw_data):
    raw_data = [w.word for w in raw_data if re.search('^n', w.flag) or re.search('^v', w.flag) or w.flag in ['eng', 'x']]
    raw_data = [w.lower() for w in raw_data]
    raw_data = [w for w in raw_data if w not in stopwords]
    raw_data = [w for w in raw_data if not w.isdigit()]
    raw_data = [w for w in raw_data if not re.search('^[a-z]$', w)]
    raw_data = [w for w in raw_data if not re.search(r'\.', w)]
    raw_data = [w for w in raw_data if not re.search('^[a-z]+[0-9]+', w) or w in office or w in outlook]
    raw_data = [w for w in raw_data if not re.search('^[0-9]+[a-z]+', w) or w in office or w in outlook]
    raw_data = [w for w in raw_data if len(w) <= 13 or w in office or w in outlook]
    return raw_data
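# Sanity check (illustrative): run one title through each cleaner and compare
# print(data_clean(jieba.lcut(str(TITLE_no_dup[0]))))
# print(data_clean_psg(psg.lcut(str(TITLE_no_dup[0]))))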
# Group tickets by day and segment each day's titles
daygroup = [group_by_day.get_group(x) for x in group_by_day.groups]
Question_day = []
for i in range(31):
    title_day = dup_del(list(daygroup[i].TITLE))
    Question_day.append(cut_word(title_day))
# Count the AREA distribution within each day
area_day = []
for i in range(31):
    freq = pd.Series(FreqDist(daygroup[i].AREA))
    groupbyarea = freq.sort_values(ascending=False)
    area_day.append(groupbyarea[:10])
hot_day_area = pd.DataFrame(area_day)
hot_day_area.to_excel("D:\\data\\HotArea10.xlsx")
# Apply synonym replacement to the overall, per-ticket and per-day segmentation results
cut_day = pd.Series(Question_day)
no_syn_case = cut_title.map(syn_rep)
no_syn_day = cut_day.map(syn_rep)
no_syn_all = syn_rep(cut_all)
# Clean the segmented words
clean_all = data_clean_psg(no_syn_all)
clean_case = no_syn_case.map(data_clean_psg)
clean_day = no_syn_day.map(data_clean_psg)
## Write out the cleaned data
write_day_file = 'D:\\data\\ByDay.txt'
c = open(write_day_file, encoding='utf-8', mode='w')
for item in clean_day:
    c.write(' '.join(item) + '\n')
c.close()
write_case_file = 'D:\\data\\ByCase.txt'
b = open(write_case_file, 'w', encoding='utf-8')
for item in clean_case:
    b.write(' '.join(item) + '\n')
b.close()
write_all_file = 'D:\\data\\All.txt'
a = open(write_all_file, encoding='utf-8', mode='w')
for item in clean_all:
    a.write(str(item) + '\n')
a.close()
# Word-length statistics, used to choose the length cutoff for dropping words
freq = FreqDist(clean_all)
clean_all = pd.Series(clean_all, name='cut_result')
word_len = clean_all.map(len)
word_len = pd.Series(word_len, name='word_len')
cut_len = pd.concat([clean_all, word_len], axis=1)
group_by_len = cut_len.groupby('word_len')
len_count = group_by_len.count()
group_by_cut = pd.Series(freq)
sort_freq = group_by_cut.sort_values(ascending=False)
appear_once = freq.hapaxes()  # words that occur only once
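# Hapaxes are natural additions to the stopword list or candidates for
# pruning before topic modelling, since a word seen only once carries
# little clustering signal.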
# Find the ten most frequent words for each day
hot_day = []
for i in range(len(clean_day)):
    freq = FreqDist(clean_day[i])
    group_by_word = pd.Series(freq)
    sort_freq = group_by_word.sort_values(ascending=False)
    hot_day.append(sort_freq[:10])
hot_day_stat = pd.DataFrame(hot_day)
hot_day_stat.to_excel("D:\\data\\HotWords5.xlsx")
# Build the LDA model over the per-day documents
word_dict = corpora.Dictionary(clean_day)
corpus_list = [word_dict.doc2bow(doc) for doc in clean_day]
lda = models.ldamodel.LdaModel(corpus=corpus_list, id2word=word_dict, num_topics=10)
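# A minimal sketch for inspecting one day's topic mixture (document 0 is
# just an example index):
doc_topics = lda.get_document_topics(corpus_list[0])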
output_file = 'D:\\data\\lda_output.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    for pattern in lda.show_topics():
        f.write("%s\n" % str(pattern))
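# Persist the trained model for later reuse (the path is illustrative):
lda.save('D:\\data\\lda_201612.model')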