Beispiel #1
0
                if catName.endswith(st):
                    containsStoptag = True
                    break
            for st in [u'中华人民共和国']:
                if catName.startswith(st):
                    containsStoptag = True
                    break
                
            if not containsStoptag:
                setEleInDict(tagsDict, catName, times/math.pow(b, depth))
                cur.execute("SELECT inLinks AS catID FROM category_inlinks WHERE id = %s", [catID])
                crawlCategories(cur.fetchall(), levels-1, times, depth+1)
    
 
res = urllib2.urlopen('https://api.weibo.com/2/statuses/user_timeline.json?source=' + weiboAppKey + '&uid=1990309453&count=100&trim_user=1')
data = json.loads(cvt.convert(res.read().decode('utf-8')).encode('utf-8'))
statuses = data['statuses']

con = mdb.connect('localhost', 'yumao', 'yumao8899', 'wikidb', charset='utf8');
cur = con.cursor(mdb.cursors.DictCursor)

start = time.clock()
for i in range(1, len(statuses)):
    status = statuses[i]
    text = ' '.join([status['text'], status['retweeted_status']['text']]) if status.has_key('retweeted_status') else status['text']
    print '(' + str(i) + ')', text
    collectTags(text)
#     print
print 'Timeout:', time.clock() - start

cur.close()
Beispiel #2
0
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:      # 转完之后不是半角字符返回原来的字符
        return uchar
    return chr(inside_code)


# Read the annotation data: build the cleaned label lines (`datas`) and the
# list of distinct characters seen (`word_list`).
with open('/home/room/data1/libo/paddle_project/dataset/rec_data/train.list', 'r', encoding='UTF-8') as f:
    for line in f:
        # Sample line: 45 \t 48 \t img_0.jpg \t 福
        # Only the last two tab-separated fields (image name, label) are used.
        # Strip just the newline: slicing with line[:-1] would cut off a real
        # character if the final line has no trailing newline.
        name, label = line.rstrip('\n').split('\t')[-2:]
        label = label.replace(' ', '')
        label = converter.convert(label)
        # str.lower() returns a new string; the result must be assigned back,
        # otherwise the label is never actually lowercased.
        label = label.lower()
        new_label = []
        for word in label:
            # Q2B is applied per character before the character-class checks.
            word = Q2B(word)
            if is_chinese(word) or is_number(word) or is_english(word):
                new_label.append(word)
                if word not in word_list:
                    word_list.append(word)
        # Samples whose label has no accepted characters are dropped entirely.
        if new_label:
            datas.append('%s\t%s\n' % (os.path.join('train_images', name), ''.join(new_label)))

word_list.sort()

# 生成词表
with open('/home/room/data1/libo/paddle_project/dataset/rec_data/vocab.txt', 'w', encoding='UTF-8') as f:
Beispiel #3
0
import os
import sys
from langconv import Converter

# Rewrite every '.pt' file under a directory tree in place, passing each line
# through a langconv Converter configured with 'zh-hans'.

# Default data directory (relative to the working directory); the first
# command-line argument overrides it.
path = os.path.join('..', '..', 'data')

argv = sys.argv
if len(argv) > 1:
    path = argv[1]

conv = Converter('zh-hans')

if not os.path.exists(path):
    print('no directory')
    # sys.exit() instead of quit(): quit() is injected by the `site` module
    # and is not guaranteed to exist (e.g. under `python -S`). Exit status is
    # unchanged.
    sys.exit()
# Use a separate name for the walked directory so `path` is not overwritten.
for root, dir_list, file_list in os.walk(path):
    for file_name in file_list:
        if not file_name.endswith('.pt'):
            continue
        file_path = os.path.join(root, file_name)

        # Read and convert everything first; the read handle is closed before
        # the same file is reopened for writing.
        # NOTE(review): no explicit encoding is given, so the platform default
        # is used -- confirm the data files match it.
        with open(file_path, 'r') as src:
            simplified = [conv.convert(line) for line in src]

        with open(file_path, 'w') as dst:
            dst.writelines(simplified)
Beispiel #4
0
                if catName.startswith(st):
                    containsStoptag = True
                    break

            if not containsStoptag:
                setEleInDict(tagsDict, catName, times / math.pow(b, depth))
                cur.execute(
                    "SELECT inLinks AS catID FROM category_inlinks WHERE id = %s",
                    [catID])
                crawlCategories(cur.fetchall(), levels - 1, times, depth + 1)


res = urllib2.urlopen(
    'https://api.weibo.com/2/statuses/user_timeline.json?source=' +
    weiboAppKey + '&uid=1990309453&count=100&trim_user=1')
data = json.loads(cvt.convert(res.read().decode('utf-8')).encode('utf-8'))
statuses = data['statuses']

con = mdb.connect('localhost', 'yumao', 'yumao8899', 'wikidb', charset='utf8')
cur = con.cursor(mdb.cursors.DictCursor)

start = time.clock()
for i in range(1, len(statuses)):
    status = statuses[i]
    text = ' '.join([
        status['text'], status['retweeted_status']['text']
    ]) if status.has_key('retweeted_status') else status['text']
    print '(' + str(i) + ')', text
    collectTags(text)
#     print
print 'Timeout:', time.clock() - start