forked from zx576/rhyme
/
utils.py
113 lines (94 loc) · 2.94 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# coding=utf-8
# author = zhouxin
# date = 2017.8.15
# description
# Utility functions used during the debugging stage
from models import Lrc, Word, Rhyme
import jieba.posseg as pseg
import re
import os
import json
from settings import BASEDIR
class Utils:
    """Ad-hoc maintenance helpers for the Lrc, Word and Rhyme tables.

    The methods are meant to be run by hand while debugging: they clean
    up rows, flag duplicates and print or export simple statistics.
    """

    def __init__(self):
        # Credit-line terms (lyricist, composer, mixing, arrangement,
        # lyrics) that must never be reported as frequent words.
        self.exclude = ['作词', '作曲', '混音', '编曲', '歌词']

    def delete_(self, ins):
        """Delete every row of model ``ins`` whose ``re2`` column is empty.

        Args:
            ins: a model class exposing ``delete()`` and an ``re2`` field.
        """
        query = ins.delete().where(ins.re2 == '')
        query.execute()

    def _has_wanted_flag(self, pairs, f):
        """Return True if any ``(token, flag)`` pair has flag ``f`` and a
        token that is not listed in ``self.exclude``."""
        return any(
            flag == f and token not in self.exclude
            for token, flag in pairs
        )

    def get_most_common(self, num, f='n'):
        """Return the frequent words that carry a given part-of-speech flag.

        Args:
            num: threshold; only Word rows with ``re3 > num`` are considered
                (``re3`` is used here as the frequency count).
            f: part-of-speech flag to keep, as produced by ``pseg.cut``.

        Returns:
            A list of ``[word, re3]`` pairs sorted by ``re3`` descending.
            Each Word row contributes at most one entry, even when several
            of its segments match the flag.
        """
        res = []
        for row in Word.select().where(Word.re3 > num):
            # A word may be cut into several segments; one match is enough,
            # so the row is never appended more than once.
            if self._has_wanted_flag(pseg.cut(row.word), f):
                res.append([u'{}'.format(row.word), row.re3])
        res.sort(key=lambda item: item[1], reverse=True)
        return res

    def save_words_freq(self, txtname, num, ins, f='n'):
        """Export the frequent words to a text file under ``BASEDIR``.

        Args:
            txtname: file name, joined onto ``BASEDIR``.
            num: frequency threshold forwarded to ``get_most_common``.
            ins: label used as the key of the debug dump printed to stdout.
            f: part-of-speech flag forwarded to ``get_most_common``.

        The file holds one ``word : count`` line per entry.
        """
        path = os.path.join(BASEDIR, txtname)
        res = self.get_most_common(num, f)
        print({ins: res})
        # Explicit UTF-8: the words are Chinese text and the platform
        # default encoding is not guaranteed to be able to encode them.
        with open(path, 'w', encoding='utf-8') as out:
            out.writelines(
                '{0} : {1}\n'.format(word, freq) for word, freq in res
            )

    def get_total_num_lrc(self, keyword='兄弟'):
        """Count the lyrics (rows with empty ``re1``) that contain ``keyword``.

        Used as a sanity check on the stored data. Every matching song is
        printed together with its running index, then the total is printed.

        Args:
            keyword: literal text to look for; defaults to the original
                hard-coded term.

        Returns:
            The number of lyrics containing ``keyword`` at least once.
        """
        pattern = re.compile(re.escape(keyword))
        count = 0
        for row in Lrc.select().where(Lrc.re1 == ''):
            if not row.lrc:
                # Nothing to search in an empty or missing lyric.
                continue
            hits = pattern.findall(row.lrc)
            if hits:
                count += 1
                print(count, row.music_name, row.music_id, row.singer, hits)
        print(count)
        return count

    def deduplicate(self):
        """Flag repeated songs by setting ``re1 = 'd'``.

        For every song name found among the rows with empty ``re1``, the
        first row returned for that name is kept and all others are marked.
        """
        seen = set()
        for row in Lrc.select().where(Lrc.re1 == ''):
            name = row.music_name
            if name in seen:
                # This name was already handled; querying it again would
                # only repeat the same updates.
                continue
            seen.add(name)
            songs = list(Lrc.select().where(Lrc.music_name == name))
            for dup in songs[1:]:
                dup.re1 = 'd'
                dup.save()

    def optword(self):
        """Flag repeated words by setting ``re2 = 'd'``.

        For every distinct ``word`` value, the first row returned is kept
        and all others are marked.
        """
        seen = set()
        for row in Word.select():
            name = row.word
            if name in seen:
                continue
            seen.add(name)
            words = list(Word.select().where(Word.word == name))
            for dup in words[1:]:
                dup.re2 = 'd'
                dup.save()

    def statistic(self):
        """Print the row counts of the Lrc, Word and Rhyme tables.

        The Word count skips rows flagged as duplicates. ``optword`` stores
        that flag in ``re2``, so the filter is applied to ``re2`` (the
        ``word`` column holds the word text itself, not the marker).
        """
        print(Lrc.select().count())
        print(Word.select().where(Word.re2 != 'd').count())
        print(Rhyme.select().count())
if __name__ == '__main__':
    # Manual entry point: enable whichever maintenance task is needed.
    # Currently only the table statistics are printed.
    #
    # Other tasks, kept disabled:
    #   ul.delete_(Word)
    #   ul.delete_(Rhyme)
    #   ul.get_most_common(10)
    #   ul.get_total_num_lrc()
    #   ul.deduplicate()
    #   ul.save_words_freq('words-frequency-v.txt', 10, 'words', 'v')
    #   ul.open_txt()   # NOTE: no open_txt method is defined on Utils here
    #   ul.optword()
    ul = Utils()
    ul.statistic()