if catName.endswith(st): containsStoptag = True break for st in [u'中华人民共和国']: if catName.startswith(st): containsStoptag = True break if not containsStoptag: setEleInDict(tagsDict, catName, times/math.pow(b, depth)) cur.execute("SELECT inLinks AS catID FROM category_inlinks WHERE id = %s", [catID]) crawlCategories(cur.fetchall(), levels-1, times, depth+1) res = urllib2.urlopen('https://api.weibo.com/2/statuses/user_timeline.json?source=' + weiboAppKey + '&uid=1990309453&count=100&trim_user=1') data = json.loads(cvt.convert(res.read().decode('utf-8')).encode('utf-8')) statuses = data['statuses'] con = mdb.connect('localhost', 'yumao', 'yumao8899', 'wikidb', charset='utf8'); cur = con.cursor(mdb.cursors.DictCursor) start = time.clock() for i in range(1, len(statuses)): status = statuses[i] text = ' '.join([status['text'], status['retweeted_status']['text']]) if status.has_key('retweeted_status') else status['text'] print '(' + str(i) + ')', text collectTags(text) # print print 'Timeout:', time.clock() - start cur.close()
inside_code = 0x0020 else: inside_code -= 0xfee0 if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符 return uchar return chr(inside_code) # 读取标注数据 with open('/home/room/data1/libo/paddle_project/dataset/rec_data/train.list', 'r', encoding='UTF-8') as f: for line in f: # print(line) # 45 \t 48 \t img_0.jpg \t 福 name, label = line[:-1].split('\t')[-2:] # print('name: %s \nlabel: %s' %(name, label)) label = label.replace(' ', '') label = converter.convert(label) label.lower() new_label = [] for word in label: word = Q2B(word) if is_chinese(word) or is_number(word) or is_english(word): new_label.append(word) if word not in word_list: word_list.append(word) if new_label: datas.append('%s\t%s\n' % (os.path.join('train_images', name), ''.join(new_label))) word_list.sort() # 生成词表 with open('/home/room/data1/libo/paddle_project/dataset/rec_data/vocab.txt', 'w', encoding='UTF-8') as f:
# Convert every *.pt file under a data tree from traditional to simplified
# Chinese, rewriting each file in place line by line.
# (Reconstructed from a whitespace-mangled single physical line.)
import os
import sys

from langconv import Converter

# Root of the data tree; the first CLI argument overrides the default.
path = os.path.join('..', '..', 'data')
argv = sys.argv
if len(argv) > 1:
    path = argv[1]

# Traditional -> simplified Chinese converter.
conv = Converter('zh-hans')

if not os.path.exists(path):
    print('no directory')
    quit()

# BUG FIX: the original opened each file twice without ever closing it
# (leaking two handles per file, and risking a truncated write on some
# platforms), shadowed the outer `path` with the os.walk loop variable, and
# shadowed the builtin `file`.  Context managers and distinct names fix all
# three without changing what is read or written.
for dirpath, dir_list, file_list in os.walk(path):
    for file_name in file_list:
        if not file_name.endswith('.pt'):
            continue
        # NOTE(review): .pt files are read as text — presumably these are
        # text transcripts, not torch checkpoints; confirm with the dataset.
        full_name = os.path.join(dirpath, file_name)
        with open(full_name, 'r') as fh:
            lines = fh.readlines()
        simplified = [conv.convert(line) for line in lines]
        with open(full_name, 'w') as fh:
            fh.writelines(simplified)
if catName.startswith(st): containsStoptag = True break if not containsStoptag: setEleInDict(tagsDict, catName, times / math.pow(b, depth)) cur.execute( "SELECT inLinks AS catID FROM category_inlinks WHERE id = %s", [catID]) crawlCategories(cur.fetchall(), levels - 1, times, depth + 1) res = urllib2.urlopen( 'https://api.weibo.com/2/statuses/user_timeline.json?source=' + weiboAppKey + '&uid=1990309453&count=100&trim_user=1') data = json.loads(cvt.convert(res.read().decode('utf-8')).encode('utf-8')) statuses = data['statuses'] con = mdb.connect('localhost', 'yumao', 'yumao8899', 'wikidb', charset='utf8') cur = con.cursor(mdb.cursors.DictCursor) start = time.clock() for i in range(1, len(statuses)): status = statuses[i] text = ' '.join([ status['text'], status['retweeted_status']['text'] ]) if status.has_key('retweeted_status') else status['text'] print '(' + str(i) + ')', text collectTags(text) # print print 'Timeout:', time.clock() - start