Beispiel #1
0
def load(source_slug, name_slug, chunk_len=200, debug=False):
    print source_slug
    print name_slug
    i = 0
    bulk = Bulk(source_slug, name_slug, chunk_len)
    slug = "%s_%s" % (source_slug, name_slug)
    parts = get_parts(slug, '\n\n')
    for part in parts:
        # print part
        # print
        if not part:
            continue
        try:
            num, word, desc = part.split('\n', 2)
        except ValueError:
            num, word = part.split('\n', 1)
            desc = ''
        if num[:3] != 'id=':
            print '### WRONG_NUM ###', num, part
        num = num[3:]
        # if num[-2:] == '.d':
        #     num = num[:-2]
        #     # todo: write bool, downloaded or not
        word = word.decode('utf8').strip()
        desc = desc.decode('utf8').strip()
        if (source_slug, name_slug) == ('academic', 'efremova'):
            # print word
            word, desc = desc.split('\n', 1)
            # print word
            word = word.replace(u'́', '')
            # print word
        if (source_slug, name_slug) == ('academic', 'kuznetsov'):
            if desc.find(u'[с прописной буквы]') != -1:
                # todo: бывает и с большой буквы, и с маленькой "Галактика"
                print word
                word = word.capitalize()
                print word
        # word = prettify(word, encoding=False).decode('utf8')
        # desc = prettify(desc, encoding=False).decode('utf8')
        word = remove_end(word, ['1', '2', ' .', ';']).strip()
        if not check_word(word, debug):  # "-" and " "
            continue
        bulk.add(word, desc, num)
        # todo: also save num to db
        # print num
        # print desc
        # print
        i += 1
        # if i > 10:
        #     break
        if not i % 10000:
            print i