Example #1
def get_tokens(book_id):
    # Collect every token of the book in order, linking each token to its
    # predecessor across sentence boundaries via force_prev_token.
    prev_token = None
    tokens = list()
    for sentence in get_sentences(book_id):
        for token in sentence.get_tokens():
            token.force_prev_token = prev_token
            tokens.append(token)
            prev_token = token
    return tokens
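A minimal usage sketch (the book id 1 and the traversal are illustrative, not part of the original): the force_prev_token links let a consumer walk backwards through the whole book, ignoring sentence boundaries.

tokens = get_tokens(1)
token = tokens[-1]
# follow the chain built above back to the first token of the book
while token.force_prev_token is not None:
    token = token.force_prev_token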
Example #2
def compare_lexeme_morpheme_capitals(book_id):
    # Group tokens whose surface form is lowercase but whose lexeme is
    # tagged as capitalized, keyed by the capitalization grammeme.
    sorting = {
        'Name': list(),
        'Surn': list(),
        'Abbr': list(),
        'Geox': list(),
        'Orgn': list(),
    }
    keys = list()
    for sentence in get_sentences(book_id):
        for token in sentence.get_tokens():
            if token.morpheme.is_small and token.lexeme.is_capital:
                # if token.lexeme.capital in ['Name', 'Surn']:
                # if token.lexeme.capital in ['Abbr']:
                # if token.lexeme.capital in ['Geox']:
                # if token.lexeme.capital not in ['Name', 'Surn', 'Abbr', 'Geox']:
                #     keys.append("%s %s %s" % (token.morpheme, token.lexeme.value, token.lexeme.capital))
                sorting[token.lexeme.capital].append(
                    u"%s — %s" % (token.morpheme.value, token.lexeme.value)
                )
                # if token.lexeme.capital in ['Name']:
                #     print token.morpheme, token.lexeme.value, token.lexeme.capital
                #     for lexeme in token.lexemes:
                #         print '-', lexeme['word'], lexeme['type'], Token.check_lexeme_capital(lexeme['params'])

                # if token.lexeme.capital in ['Name', 'Surn']:
                # if token.lexeme.capital in ['Geox']:
                # if token.lexeme.capital in ['Orgn', 'Abbr']:
                # if token.lexeme.capital in ['Orgn']:
                #     # if token.lexeme.type in ['ADJF', 'NOUN']:
                #     # if token.lexeme.type in ['ADJF']:
                #     #     continue
                #     ok = False
                #     for lexeme in token.lexemes:
                #         capital = Token.check_lexeme_capital(lexeme['params'])
                #         if not capital:
                #             ok = True
                #     if not ok:
                #         print '-',
                #     else:
                #         # print '+',
                #         continue
                #     print token.morpheme.value, token.lexeme.value, token.lexeme.capital
                #     for lexeme in token.lexemes:
                #         print '  -', lexeme['word'], lexeme['type'], Token.check_lexeme_capital(lexeme['params'])

    # for index, key in enumerate(set(keys)):
    #     print index, key

    for capital, tokens in sorting.items():
        tokens = set(tokens)
        print
        print capital, len(tokens)
        for token in tokens:
            print "-", token
Example #3
def fill_words(book_id):
    # Collect (type, value) pairs for every token: 1 = surface form,
    # 2 = lexeme base, 3 = lexeme value; deduplicate, then bulk-insert.
    chains = list()
    for sentence in get_sentences(book_id):
        for word in sentence.get_tokens():
            chains += [(1, word.morpheme.value),
                       (2, word.lexeme.base),
                       (3, word.lexeme.value)]
    chains = [Chain(value=value, length=1, type=chain_type)
              for chain_type, value in set(chains)]
    Chain.objects.bulk_create(chains, 10000)
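fill_words() assumes a Django model named Chain and passes a batch size of 10000 to bulk_create to keep the number of INSERT statements down. A hypothetical model matching the fields used above (only the field names come from the snippet; the types are guesses):

from django.db import models

class Chain(models.Model):
    # field names taken from fill_words(); field types are assumptions
    value = models.TextField()
    length = models.IntegerField()
    type = models.IntegerField()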
Example #4
def check_sentences():
    import re

    # Find sentences containing a standalone capital letter (a one-letter
    # Russian preposition) that does not open the sentence.
    for sentence in get_sentences(1):
        s = sentence.source
        # if re.search(u'[^А-Яа-я]И[^А-Яа-я]', s, re.UNICODE) and not s.startswith(u'И') and not u'СМИ' in s:
        # if re.search(u'[^А-Яа-я]И[^А-Яа-я]', s, re.UNICODE):
        # if re.search(u'[^А-Яа-я0-9«]В[^А-Яа-я0-9]', s, re.UNICODE):
        if re.search(u'[^А-Яа-я0-9«]С[^А-Яа-я0-9]', s, re.UNICODE) and not s.startswith(u'• С'):
        # if u'С ' in s and not s.startswith(u'С') and not s.startswith(u'• С') and not s.startswith(u'«С') and \
        #         not u'ЕС' in s and not u'АЭС' in s and not u'ХДС' in s and not u'МКС' in s and not u'МЧС' in s and not u'ЧС' in s:
        # if u'Он ' in s and not s.startswith(u'Он'): # and not u'СМИ' in s:
            print s
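An illustrative check of the active pattern (the sample sentence is made up): it flags a standalone capital 'С' that is neither sentence-initial nor part of a longer word.

import re

s = u'Он пришёл С опозданием.'
# 'С' is preceded and followed by characters outside the Cyrillic/digit classes
print bool(re.search(u'[^А-Яа-я0-9«]С[^А-Яа-я0-9]', s, re.UNICODE))  # True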
Example #5
def fill_mystem():
    # Run Yandex Mystem over every sentence and store the raw analysis,
    # one "surface-form   lemma grammar-tags" line per token.
    import re
    from pymystem3 import Mystem

    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                text = text.strip()
                if not len(text):
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    for item in re.split('\s+', text):
                        items.append("%s   %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                print 'delimiter = "%s"' % text
                items.append("%s   %s ?" % (text, text))
                continue

            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')

            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s   %s" % (text, '  '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
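For reference, fill_mystem() leans on the shape of Mystem.analyze() output: a list of dicts in which word tokens carry an 'analysis' list ('lex' is the lemma, 'gr' the grammar tags) while whitespace and punctuation carry only 'text'. A minimal sketch (the sample sentence is illustrative):

from pymystem3 import Mystem

m = Mystem()
for item in m.analyze(u'Мама мыла раму.'):
    # word tokens have an 'analysis' list; separators and punctuation do not
    print repr(item.get('text')), item.get('analysis')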