def patch(self, json_file):
    """Write translations from a JSON report back into the source file.

    The report is expected to carry the keys ``file_name``, ``path``,
    ``encoding``, ``data`` and ``md5``. ``data`` maps a line index (as a
    string) to a list of items, each with ``start``, ``end`` and ``trans``;
    for every item the slice ``[start:end]`` of that line is replaced by
    ``trans``.

    Nothing is written when the report has no ``file_name`` key, when
    ``path`` does not exist, or when the file's current MD5 differs from
    the one recorded in the report.

    NOTE: the source file is overwritten in place; no backup copy is made.

    :param json_file: path of the JSON report to apply.
    :return: None
    """
    with open(json_file) as json_data:
        d = json.load(json_data)
    if u'file_name' not in d:
        return

    file_name = d[u'file_name']
    path = d[u'path']
    encoding = d[u'encoding']
    data = d[u'data']
    md5 = d[u'md5']
    print(file_name)
    print(path)
    print(encoding)
    print(data)

    if not os.path.exists(path):
        print('Error: 找不到源文件!!')
        return
    # Refuse to patch a file that changed since the report was generated:
    # the recorded offsets would no longer line up.
    if md5 != get_file_md5(path):
        print('Error: 源文件已经被篡改,不能覆盖!!')
        return
    print('=>验证通过,开始回填数据')

    with open(path) as f:
        lines = f.readlines()
    unicode_lines = [s.decode(encoding) for s in lines]
    print('======')
    print(unicode_lines)

    for k, v in data.items():
        line_no = int(k)
        # Apply replacements right-to-left so that earlier offsets on the
        # same line stay valid after each substitution.
        v.sort(reverse=True, key=lambda x: x['start'])
        for item in v:
            line = unicode_lines[line_no]
            start = item['start']
            end = item['end']
            trans = item['trans']
            unicode_lines[line_no] = line[:start] + trans + line[end:]
    print(unicode_lines)

    # Encode everything before reopening the file for writing, so an
    # encoding error cannot leave the source truncated.
    lines = [s.encode(encoding) for s in unicode_lines]
    with open(path, 'w') as f:
        f.writelines(lines)
    print('<=回填结束')
    print('')
def resolve(self):
    """Collect pattern matches from ``self.path`` and write a JSON report.

    The file's encoding is guessed with ``chardet``. Every line that
    ``self.is_comment`` does not reject is searched with ``self.pattern``
    and then with ``self.pattern_plus``; a ``pattern_plus`` match is
    skipped when ``self.is_item_include`` reports it for the items already
    recorded on that line. Each recorded item stores the full line text,
    the match span, the matched text and a proposed translation.

    When at least one line produced items, the report is written to
    ``<output_path>/<file_name>-<md5 of self.path>-output.json``. The
    output directory is created if missing, even when nothing is written.

    :return: ``{}`` when the file cannot be decoded; ``None`` when no
        match was found (no report is written); otherwise a dict mapping
        every collected string to itself.
    """
    file_name = unicode(os.path.basename(self.path))
    with open(self.path) as f:
        code = chardet.detect(f.read())
        f.seek(0, 0)
        lines = f.readlines()
    encoding = code['encoding']

    try:
        unicode_lines = [s.decode(encoding) for s in lines]
    except (UnicodeDecodeError, TypeError, LookupError) as e:
        # TypeError: chardet could not guess (encoding is None).
        # LookupError: chardet named a codec Python does not provide.
        print('fail to decode file: ', file_name)
        print('except: ', e)
        return {}

    def make_entry(text, start, end, origin, trans):
        # Fixed key order keeps the generated JSON stable and readable.
        return OrderedDict([
            ('text', text),
            ('start', start),
            ('end', end),
            ('origin', origin),
            ('trans', trans),
            ('auto', ''),
        ])

    data = {}
    # dict_map stores the strings that were found.
    dict_map = {}
    # NOTE(review): each line is read once per iteration; this assumes
    # self.transform does not modify unicode_lines - confirm.
    for seq, line in enumerate(unicode_lines):
        if self.is_comment(line):
            continue

        # First regular expression.
        spans = [m.span() for m in re.finditer(self.pattern, line)]
        if spans:
            data[seq] = []
            for start, end in spans:
                origin = line[start:end]
                trans = self.transform(origin, line, start, end,
                                       unicode_lines, seq)
                # First and last characters are dropped from the key
                # (presumably the surrounding quotes - verify).
                dict_map[origin[1:-1]] = origin[1:-1]
                data[seq].append(make_entry(line, start, end, origin, trans))

        # Second regular expression.
        spans = [m.span() for m in re.finditer(self.pattern_plus, line)]
        if spans:
            entries = data.setdefault(seq, [])
            for start, end in spans:
                if self.is_item_include(start, end, entries):
                    continue
                origin = line[start:end]
                trans = self.transform_simple(origin)
                dict_map[origin] = origin
                entries.append(make_entry(line, start, end, origin, trans))

    if not os.path.exists(self.output_path):
        os.makedirs(self.output_path)
    if not data:
        return None

    ret_ordered = OrderedDict([
        ('file_name', file_name),
        ('md5', get_file_md5(self.path)),
        ('path', unicode(self.path)),
        ('encoding', encoding),
        ('data', data),
    ])
    # Serialise before opening the output so a failure here does not leave
    # an empty report file behind.
    payload = json.dumps(ret_ordered, encoding='utf-8', ensure_ascii=False,
                         indent=4)
    out_name = (file_name + '-' + hashlib.md5(self.path).hexdigest() +
                '-output.json')
    with codecs.open(os.path.join(self.output_path, out_name), 'w',
                     encoding="utf-8") as f:
        f.write(payload)
    return dict_map