forked from cashgithubs/semtool
/
encode_url.py
123 lines (103 loc) · 3.75 KB
/
encode_url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#coding: utf8
import os
import sys
import re
import urllib
import logging
import pdb
import subprocess
import autopath
from utils.common_handler import CommonHandler
from utils.http_client import HttpClient
from utils.btlog import btlog_init
'''
file format:
$header (one line)
$content (multiple lines)
suggested file encoding: gbk
'''
class EncodeChinese(CommonHandler, HttpClient):
    """Rewrite the keyword segment of URLs via the keyword-encoding endpoint.

    Every data row holds two fields, an identifier and a URL.  The URL must
    contain a ``-KEYWORD-jiudian`` segment.  When the keyword contains
    multi-byte characters it is sent to the encoding endpoint and every
    occurrence of it in the URL is replaced by the endpoint's response.

    ``ToUnicode``, ``LoadList``, ``SaveList`` and ``DoGet`` are inherited
    from the base classes, which are defined outside this file.
    """

    # Because the leading ``.*`` is greedy, group 1 is the text between the
    # last '-' and the final '-jiudian' in the URL.
    _KEYWORD_RE = re.compile(r'.*-(.*)-jiudian')

    # Location of the keyword-encoding service.
    ENCODE_HOST = '192.168.0.233'
    ENCODE_PORT = 80
    ENCODE_URI = "/semtool/php/keyword_encoding.php?word=%s&flag=semtool"

    def is_mutichaset(self, s):
        """Return True when the byte string *s* holds multi-byte characters.

        The check compares the length of *s* with the length of
        ``self.ToUnicode(s)``; a difference means at least one character
        occupies more than one byte.

        Raises:
            Exception: if *s* is already a ``unicode`` object.
        """
        if isinstance(s, unicode):
            raise Exception('param should is str')
        return len(self.ToUnicode(s)) != len(s)

    def run(self, source_file, encoding=None):
        """Create a UTF-8 copy of *source_file* and encode both files.

        The copy is written to ``source_file + ".utf8"``; ``do_file`` is then
        run on the original and on the copy.

        Args:
            source_file: path of the input file.
            encoding: ``'gbk'`` or ``'utf8'``, the encoding of *source_file*.
                When omitted, ``sys.argv[2]`` is used, which is what this
                method always read before the parameter existed.
        """
        if encoding is None:
            encoding = sys.argv[2]
        if encoding not in ('gbk', 'utf8'):
            # Unknown encodings were silently ignored before; keep returning
            # without work, but leave a trace in the log.
            logging.error("unsupported encoding %r, nothing done", encoding)
            return

        dest_file = source_file + ".utf8"
        if encoding == 'gbk':
            self._iconv('gbk', 'utf8', source_file, dest_file)
        else:
            # Round-trip through gbk.  iconv runs with -c, so whatever it
            # cannot convert to gbk is omitted from the final UTF-8 copy.
            self._iconv('utf8', 'gbk', source_file, '.tmp')
            self._iconv('gbk', 'utf8', '.tmp', dest_file)

        self.do_file(source_file)
        self.do_file(dest_file)

    def _iconv(self, from_enc, to_enc, src_path, dest_path):
        """Convert *src_path* with ``iconv -c`` and write it to *dest_path*.

        A non-zero iconv exit status is only logged: the conversion is
        best-effort, as it was when the status was not inspected at all.

        Raises:
            OSError: if the iconv executable cannot be started.
        """
        # Pass an argument list instead of a shell command line so that file
        # names containing spaces or shell metacharacters are taken literally.
        command = ['iconv', '-c', '-f', from_enc, '-t', to_enc, src_path]
        with open(dest_path, 'wb') as out:
            proc = subprocess.Popen(command, stdout=out)
            proc.communicate()
        if proc.returncode != 0:
            logging.warning("iconv from %s to %s on %s exited with status %d",
                            from_enc, to_enc, src_path, proc.returncode)

    def do_file(self, source_file):
        """Encode the data rows of *source_file* into ``source_file + ".encode"``.

        Lines are stripped of trailing/leading newline and carriage-return
        characters; lines of length 0 or 1 are discarded.  The first remaining
        line is the header and is not written to the output.
        """
        line_list = self.LoadList(source_file)
        logging.info("source len: %d", len(line_list))

        rows = []
        for line in line_list:
            line = line.strip("\n").strip("\r")
            if len(line) > 1:
                rows.append(line)

        # rows[0] is the header; everything after it is data.
        dest_list = [self.do_line(row) for row in rows[1:]]
        logging.info("dest_list len: %d", len(dest_list))
        self.SaveList("%s.encode" % source_file, dest_list)

    def do_line(self, line):
        """Return ``"ID\\tURL"`` for one row, with the URL keyword encoded.

        The output is always tab-separated, whatever separator
        ``line_items`` uses for the input.

        Raises:
            ValueError: if the URL has no ``-KEYWORD-jiudian`` segment.
            Exception: if the endpoint's response is shorter than the keyword.
        """
        items = self.line_items(line)
        match = self._KEYWORD_RE.match(items[1])
        if match is None:
            raise ValueError("no '-KEYWORD-jiudian' segment in url: %r" % (items[1],))
        todo_str = match.group(1)

        if not self.is_mutichaset(todo_str):
            # Keyword is single-byte only: the URL is passed through as-is.
            return "%s\t%s" % (items[0], items[1])

        # NOTE(review): the keyword goes into the query string unescaped.
        # Confirm what DoGet and the endpoint expect before adding quoting.
        uri = self.ENCODE_URI % todo_str
        encoded_str = self.DoGet(self.ENCODE_HOST, self.ENCODE_PORT, uri)
        # A response shorter than the keyword is treated as an API failure.
        if len(encoded_str) < len(todo_str):
            raise Exception('encode api error')
        logging.info("E: %s => %s", todo_str, encoded_str)
        new_url = items[1].replace(todo_str, encoded_str)
        return "%s\t%s" % (items[0], new_url)

    def line_items(self, line):
        """Split a tab-separated row into its two fields.

        Raises:
            ValueError: if the row does not have exactly two fields.
        """
        items = line.split("\t")
        if len(items) != 2:
            raise ValueError("expected 2 tab-separated fields, got %d: %r"
                             % (len(items), line))
        return items
class CsvProcessor(EncodeChinese):
    """EncodeChinese variant whose input rows are comma-separated."""

    def line_items(self, line):
        """Split a comma-separated row into its two fields.

        The split is a plain ``str.split(",")``: quoting is not interpreted,
        so a row whose URL itself contains a comma is rejected.

        Raises:
            ValueError: if the row does not have exactly two fields.
        """
        items = line.split(",")
        if len(items) != 2:
            # Carry the offending row in the exception instead of printing it
            # and raising an exception without a message.
            raise ValueError("expected 2 comma-separated fields, got %d: %r"
                             % (len(items), line))
        return items
def test():
    """Run CsvProcessor.do_line over three sample rows and print each result.

    The samples cover a raw non-ASCII keyword, a plain ASCII keyword and an
    already percent-encoded keyword.
    """
    samples = (
        '1,http://a.b.c/-北京-jiudian',
        '2,http://b.c.d/-a-jiudian',
        '3,http://a.b.c/-%E5%8C%97%E4%BA%AC-jiudian',
    )
    processor = CsvProcessor()
    for sample in samples:
        result = processor.do_line(sample)
        print(result)
def usage():
    """Write the command-line synopsis to stderr and exit with status 2.

    This is only called for a malformed command line, so the process must
    not report success; the message goes to stderr to keep stdout clean.

    Raises:
        SystemExit: always, with exit code 2.
    """
    sys.stderr.write('useage: %s filename gbk|utf8\n' % sys.argv[0])
    sys.exit(2)
if __name__ == '__main__':
    # Expected command line: SCRIPT FILENAME gbk|utf8
    cli_args = sys.argv[1:]
    if not (len(cli_args) == 2 and cli_args[1] in ('gbk', 'utf8')):
        usage()
    btlog_init('log/encode.log')
    processor = CsvProcessor()
    processor.run(cli_args[0])