Beispiel #1
0
def load(source_slug, name_slug, chunk_len=200, debug=False):
    print source_slug
    print name_slug
    i = 0
    bulk = Bulk(source_slug, name_slug, chunk_len)
    slug = "%s_%s" % (source_slug, name_slug)
    parts = get_parts(slug, '\n\n')
    for part in parts:
        # print part
        # print
        if not part:
            continue
        try:
            num, word, desc = part.split('\n', 2)
        except ValueError:
            num, word = part.split('\n', 1)
            desc = ''
        if num[:3] != 'id=':
            print '### WRONG_NUM ###', num, part
        num = num[3:]
        # if num[-2:] == '.d':
        #     num = num[:-2]
        #     # todo: write bool, downloaded or not
        word = word.decode('utf8').strip()
        desc = desc.decode('utf8').strip()
        if (source_slug, name_slug) == ('academic', 'efremova'):
            # print word
            word, desc = desc.split('\n', 1)
            # print word
            word = word.replace(u'́', '')
            # print word
        if (source_slug, name_slug) == ('academic', 'kuznetsov'):
            if desc.find(u'[с прописной буквы]') != -1:
                # todo: бывает и с большой буквы, и с маленькой "Галактика"
                print word
                word = word.capitalize()
                print word
        # word = prettify(word, encoding=False).decode('utf8')
        # desc = prettify(desc, encoding=False).decode('utf8')
        word = remove_end(word, ['1', '2', ' .', ';']).strip()
        if not check_word(word, debug):  # "-" and " "
            continue
        bulk.add(word, desc, num)
        # todo: also save num to db
        # print num
        # print desc
        # print
        i += 1
        # if i > 10:
        #     break
        if not i % 10000:
            print i
Beispiel #2
0
def load(chunk_len=200, debug=False):
    print source_slug
    print name_slug
    bulk = Bulk(source_slug, name_slug, chunk_len)
    lines = get_parts(slug, "\n\n")
    for line in lines:
        word, desc = line.split("\n", 1)
        word = prettify(word)
        desc = prettify(desc)
        if not check_word(word, debug):  # "-" and " "
            continue
        bulk.add(word, desc)
    bulk.process()
Beispiel #3
0
def load(chunk_len=200, debug=False):
    print source_slug
    print name_slug
    bulk = Bulk(source_slug, name_slug, chunk_len)
    parts = get_parts(slug, '\n\n')
    for desc in parts:
        desc = prettify(join_lines(desc))
        first_words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d]*\.?\.?\.?)', desc,
                                 re.UNICODE)
        if len(first_words) != 1:
            print u'Ошибка в первом слове:'
            print '#', desc
        word = first_words[0]
        words = [word]
        cutted = desc[len(word):]

        other_words = re.findall(u'\W([А-ЯЁ][-А-ЯЁ\d]+\.?\.?\.?)\W', cutted,
                                 re.UNICODE)
        # todo: слова с пробелами: "ВСЕ Ж ТАКИ" (2 случая)
        for word in other_words:
            length = cutted.index(word) + len(word)
            wrong_words = [
                u'США', u'США.', u'СССР', u'ССР', u'СЯ', u'СЯ1', u'ЧК', u'СЯ2',
                u'ТЕ', u'СЯ1-2', u'ЭВМ', u'ВЛКСМ', u'Т-34']
            # 70 получено опытным путем и годится только для этого файла
            if length <= 70 and word not in wrong_words:
                words.append(word)

        for i in range(len(words)):
            word = remove_last_dot(words[i])
            if re.search('\d-\d-\d$', word):
                word = word[:-5]
            elif re.search('\d-\d$', word):
                word = word[:-3]
            elif re.search('\d$', word):
                word = word[:-1]
            elif re.search('\d-\d\.\.\.$', word):
                word = word[:-6] + word[-3:]
            elif re.search('\d\.\.\.$', word):
                word = word[:-4] + word[-3:]
            elif re.search('\d', word):
                print u'Ошибка! Цифра в слове: ', word
            words[i] = word

        words = set(words)  # remove duplicates

        for word in words:
            if not check_word(word, debug):  # "-" and "..."
                continue
            bulk.add(word, desc)
    bulk.process()
Beispiel #4
0
def load(chunk_len=200, debug=False):
    print source_slug
    print name_slug
    bulk = Bulk(source_slug, name_slug, chunk_len)
    lines = get_parts(slug, '\n   ')
    for desc in lines:#[:1000]:
        desc = prettify(join_lines(desc))
        words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d\s?]*)\W', desc, re.UNICODE)
        if not words:
            bulk.append_desc(desc)
            continue
        word = prettify(words[0], encoding=False)
        if not check_word(word, debug):
            continue
        bulk.add(word, desc)
    bulk.process()