def load(source_slug, name_slug, chunk_len=200, debug=False): print source_slug print name_slug i = 0 bulk = Bulk(source_slug, name_slug, chunk_len) slug = "%s_%s" % (source_slug, name_slug) parts = get_parts(slug, '\n\n') for part in parts: # print part # print if not part: continue try: num, word, desc = part.split('\n', 2) except ValueError: num, word = part.split('\n', 1) desc = '' if num[:3] != 'id=': print '### WRONG_NUM ###', num, part num = num[3:] # if num[-2:] == '.d': # num = num[:-2] # # todo: write bool, downloaded or not word = word.decode('utf8').strip() desc = desc.decode('utf8').strip() if (source_slug, name_slug) == ('academic', 'efremova'): # print word word, desc = desc.split('\n', 1) # print word word = word.replace(u'́', '') # print word if (source_slug, name_slug) == ('academic', 'kuznetsov'): if desc.find(u'[с прописной буквы]') != -1: # todo: бывает и с большой буквы, и с маленькой "Галактика" print word word = word.capitalize() print word # word = prettify(word, encoding=False).decode('utf8') # desc = prettify(desc, encoding=False).decode('utf8') word = remove_end(word, ['1', '2', ' .', ';']).strip() if not check_word(word, debug): # "-" and " " continue bulk.add(word, desc, num) # todo: also save num to db # print num # print desc # print i += 1 # if i > 10: # break if not i % 10000: print i
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) lines = get_lines(slug) for line in lines: word, desc = line.split('#') word = prettify(word) desc = prettify(desc) if not check_word(word, debug): # "-", "." and " " continue bulk.add(word, desc) bulk.process()
def load(chunk_len=200, debug=False): for source_slug, name_slug in slugs: print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) slug = "%s_%s" % (source_slug, name_slug) lines = get_lines(slug) for value in lines: value = prettify(value, remove_dot=False) if not check_word(value, debug): # "-" and "." continue bulk.add(value) bulk.process()
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) parts = get_parts(slug, '\n\n') for desc in parts: desc = prettify(join_lines(desc)) first_words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d]*\.?\.?\.?)', desc, re.UNICODE) if len(first_words) != 1: print u'Ошибка в первом слове:' print '#', desc word = first_words[0] words = [word] cutted = desc[len(word):] other_words = re.findall(u'\W([А-ЯЁ][-А-ЯЁ\d]+\.?\.?\.?)\W', cutted, re.UNICODE) # todo: слова с пробелами: "ВСЕ Ж ТАКИ" (2 случая) for word in other_words: length = cutted.index(word) + len(word) wrong_words = [ u'США', u'США.', u'СССР', u'ССР', u'СЯ', u'СЯ1', u'ЧК', u'СЯ2', u'ТЕ', u'СЯ1-2', u'ЭВМ', u'ВЛКСМ', u'Т-34'] # 70 получено опытным путем и годится только для этого файла if length <= 70 and word not in wrong_words: words.append(word) for i in range(len(words)): word = remove_last_dot(words[i]) if re.search('\d-\d-\d$', word): word = word[:-5] elif re.search('\d-\d$', word): word = word[:-3] elif re.search('\d$', word): word = word[:-1] elif re.search('\d-\d\.\.\.$', word): word = word[:-6] + word[-3:] elif re.search('\d\.\.\.$', word): word = word[:-4] + word[-3:] elif re.search('\d', word): print u'Ошибка! Цифра в слове: ', word words[i] = word words = set(words) # remove duplicates for word in words: if not check_word(word, debug): # "-" and "..." continue bulk.add(word, desc) bulk.process()
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) lines = get_parts(slug, '\n ') for desc in lines:#[:1000]: desc = prettify(join_lines(desc)) words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d\s?]*)\W', desc, re.UNICODE) if not words: bulk.append_desc(desc) continue word = prettify(words[0], encoding=False) if not check_word(word, debug): continue bulk.add(word, desc) bulk.process()
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) lines = get_lines(slug) for line in lines: word, desc = line.split('#', 1) # todo: не покрывает два случая: # - Господин# (ж. р. госпожа)#, владелец, ... # - Приходить# (прийти# ), прибыть, ... # todo: c запятыми можно отдельно повозиться: # - аутентичный (автентичный, отентичный) # - барон, баронет # - бросать деньги (за окно, на ветер) word = prettify(word).upper() desc = prettify(remove_begin(desc, [',', '||']).strip()) if not check_word(word, debug): # "-", "," and " " continue bulk.add(word, desc) bulk.process()