import re


def get_tokens(book_id):
    """Collect every token of a book in order, linking each token to its
    predecessor via force_prev_token."""
    prev_token = None
    tokens = list()
    for sentence in get_sentences(book_id):
        for token in sentence.get_tokens():
            token.force_prev_token = prev_token
            tokens.append(token)
            prev_token = token
    return tokens
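

# Hypothetical usage sketch (not part of the original module): with
# force_prev_token set by get_tokens(), a consumer can walk the whole
# book's token stream backwards without re-querying sentences.
def iter_prev_tokens(token):
    """Yield `token`, then each predecessor, following force_prev_token."""
    while token is not None:
        yield token
        token = token.force_prev_token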


def compare_lexeme_morpheme_capitals(book_id):
    """Report tokens whose surface form is lower-case while the dictionary
    entry is tagged as a proper noun, grouped by capitalisation tag."""
    sorting = {
        'Name': list(),
        'Surn': list(),
        'Abbr': list(),
        'Geox': list(),
        'Orgn': list(),
    }
    for sentence in get_sentences(book_id):
        for token in sentence.get_tokens():
            if token.morpheme.is_small and token.lexeme.is_capital:
                # Earlier experiments filtered on individual tags here
                # ('Name'/'Surn', 'Abbr', 'Geox', 'Orgn', ...) and dumped
                # raw lexemes via Token.check_lexeme_capital(); the grouped
                # report below replaced them.
                sorting[token.lexeme.capital].append(
                    u"%s — %s" % (token.morpheme.value, token.lexeme.value)
                )
    for capital, tokens in sorting.items():
        tokens = set(tokens)  # deduplicate repeated pairs
        print
        print capital, len(tokens)
        for token in tokens:
            print "-", token
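

# Hypothetical equivalents of the flags used above, for readers of this
# excerpt (assumptions; the real properties live on the project's Token
# and Lexeme classes):
def _is_small(surface):
    """True when the surface form starts with a lower-case letter."""
    return surface[:1].islower()


def _is_capital(capital_tag):
    """True when the lexeme carries one of the proper-noun tags."""
    return capital_tag in ('Name', 'Surn', 'Abbr', 'Geox', 'Orgn')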


def fill_words(book_id):
    """Store every distinct morpheme value, lexeme base and lexeme value of
    a book as a Chain row; the numeric type encodes which of the three."""
    chains = list()
    for sentence in get_sentences(book_id):
        for word in sentence.get_tokens():
            chains += [
                (1, word.morpheme.value),
                (2, word.lexeme.base),
                (3, word.lexeme.value),
            ]
    chains = set(chains)  # deduplicate before inserting
    chains = [Chain(value=value, length=1, type=chain_type)
              for chain_type, value in chains]
    Chain.objects.bulk_create(chains, 10000)  # batch_size=10000
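

# Assumed shape of the Chain model referenced above (an assumption for
# this excerpt; the real definition lives elsewhere in the project):
#
# class Chain(models.Model):
#     value = models.TextField()
#     length = models.IntegerField()
#     type = models.IntegerField()  # 1 = morpheme, 2 = base, 3 = lexeme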


def check_sentences():
    """Print sentences where a bare capital 'С' occurs mid-sentence, a
    likely tokenisation or sentence-splitting artefact; bullet-prefixed
    lines ('• С') are excluded."""
    for sentence in get_sentences(1):
        s = sentence.source
        # Earlier passes ran the same check for mid-sentence 'И', 'В' and
        # 'Он', with ad-hoc exclusions for abbreviations such as 'СМИ',
        # 'ЕС', 'АЭС', 'ХДС', 'МКС' and 'МЧС'.
        if re.search(u'[^А-Яа-я0-9«]С[^А-Яа-я0-9]', s, re.UNICODE) and not s.startswith(u'• С'):
            print s
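

# Illustration of the filter above on two made-up sentences:
#   u'Пришёл С опозданием.'  -> printed: 'С' sits between two
#                               non-Cyrillic, non-digit characters (spaces)
#   u'С утра шёл дождь.'     -> skipped: a sentence-initial 'С' has no
#                               preceding character for the left class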


def fill_mystem():
    """Run pymystem3 over every sentence and cache the analysis in
    sentence.mystem, one '<text> <lex> <gr> ...' line per token."""
    from pymystem3 import Mystem

    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                text = text.strip()
                if not len(text):
                    # Whitespace-only chunk: nothing to store.
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    # Several unanalyzed words in one chunk: store each
                    # with an unknown ('?') tag.
                    for item in re.split('\s+', text):
                        items.append("%s %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                # Punctuation or another delimiter: unknown tag as well.
                print 'delimiter = "%s"' % text
                items.append("%s %s ?" % (text, text))
                continue
            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')
            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s %s" % (text, ' '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
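

# For reference, Mystem().analyze() returns a list of dicts in which
# whitespace/punctuation chunks carry only 'text' and analyzed words add
# an 'analysis' list, e.g. (illustrative output):
#
# [{'text': u'мама',
#   'analysis': [{'lex': u'мама', 'gr': u'S,жен,од=им,ед'}]},
#  {'text': u' '},
#  ...]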