def load(source_slug, name_slug, chunk_len=200, debug=False):
    """Parse one dictionary dump and hand its entries to a ``Bulk`` loader.

    The parts come from ``get_parts("<source_slug>_<name_slug>", '\\n\\n')``.
    Every non-empty part is split on newlines into::

        id=<num>
        <word>
        <description, may span several lines or be missing>

    ``word`` and ``desc`` are decoded from UTF-8 and stripped, a couple of
    dictionary-specific fix-ups are applied, trailing markers are removed via
    ``remove_end`` and, if ``check_word`` accepts the word, the entry is
    passed to ``bulk.add(word, desc, num)``.

    Args:
        source_slug: First half of the dump slug; also forwarded to ``Bulk``.
        name_slug: Second half of the dump slug; also forwarded to ``Bulk``.
        chunk_len: Forwarded unchanged to ``Bulk``.
        debug: Forwarded unchanged to ``check_word``.

    Raises:
        ValueError: If a non-empty part contains no newline at all (there is
            then no word line to unpack).

    Progress and diagnostics are printed to stdout. The single-argument
    ``print(...)`` form is used on purpose: under the Python 2 print statement
    it emits exactly the same output, and it also parses on Python 3.
    """
    print(source_slug)
    print(name_slug)

    # The dictionary-specific quirks depend only on the arguments, so they are
    # resolved once instead of on every entry.
    dictionary = (source_slug, name_slug)
    is_efremova = dictionary == ('academic', 'efremova')
    is_kuznetsov = dictionary == ('academic', 'kuznetsov')

    bulk = Bulk(source_slug, name_slug, chunk_len)
    slug = "%s_%s" % (source_slug, name_slug)

    i = 0  # number of entries handed to bulk so far
    for part in get_parts(slug, '\n\n'):
        if not part:
            continue

        try:
            num, word, desc = part.split('\n', 2)
        except ValueError:
            # Only two lines: the entry has an id and a word but no description.
            num, word = part.split('\n', 1)
            desc = ''

        if not num.startswith('id='):
            # NOTE(review): a malformed id line is only reported; the entry is
            # still processed and its first three characters are still dropped.
            print('### WRONG_NUM ### %s %s' % (num, part))
        num = num[3:]
        # TODO: ids may end with '.d' -- strip it and record a boolean
        # (downloaded or not).

        word = word.decode('utf8').strip()
        desc = desc.decode('utf8').strip()

        if is_efremova:
            # For this dictionary the headword is taken from the first line of
            # the description. partition() gives the same result as
            # split('\n', 1) when a newline is present, but does not raise on a
            # one-line description: the description simply becomes empty,
            # matching the "no description" fallback above.
            word, _, desc = desc.partition('\n')
            # Drop the combining accent mark embedded in the headword.
            word = word.replace(u'́', '')

        if is_kuznetsov and u'[с прописной буквы]' in desc:
            # TODO: some words occur both capitalized and lowercase,
            # e.g. "Галактика".
            print(word)
            word = word.capitalize()
            print(word)

        word = remove_end(word, ['1', '2', ' .', ';']).strip()
        # Original author's note on this check: "-" and " "
        # (presumably the kind of words check_word rejects -- confirm).
        if not check_word(word, debug):
            continue

        bulk.add(word, desc, num)  # TODO: also save num to db
        i += 1
        if not i % 10000:
            # Progress report every 10000 accepted entries.
            print(i)