def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) based on an OpenCorpora XML dump.

    The probability is estimated from counts of disambiguated
    ambiguous words, using simple Laplace smoothing.
    """
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        This ConditionalProbDist subclass passes the 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition],
                                                   condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    disambig_words = list(
        with_progress(
            _disambiguated_words(reader),
            "Reading disambiguated words from corpus"
        )
    )

    disambig_words = with_progress(disambig_words,
                                   "Filtering out non-ambiguous words")
    ambiguous_words = [
        (w, gr) for (w, gr) in (
            (w.lower(), tag2grammemes(t))
            for (w, t) in disambig_words
            if len(morph.tag(w)) > 1
        )
        if gr != set(['UNKN'])
    ]

    logger.info("Computing P(t|w)")

    def probdist_factory(fd, condition):
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
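
# A minimal usage sketch, assuming a pymorphy2-style analyzer and a local
# OpenCorpora annotated dump ("annot.opcorpora.xml" is a hypothetical path):
#
#     import pymorphy2
#     morph = pymorphy2.MorphAnalyzer()
#     cpd, cfd = estimate_conditional_tag_probability(morph,
#                                                     "annot.opcorpora.xml")
#
#     # For a word seen in the corpus, cpd[word] is a LaplaceProbDist over
#     # grammeme sets: .max() is the most probable set, .prob() returns its
#     # smoothed probability.
#     dist = cpd["стали"]
#     print(dist.max(), dist.prob(dist.max()))
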
def _suffixes_prediction_data(words, paradigm_popularity, gramtab, paradigms,
                              suffixes, min_ending_freq,
                              min_paradigm_popularity, max_suffix_length,
                              paradigm_prefixes):

    logger.debug('calculating prediction data: removing non-productive paradigms..')
    productive_paradigms = _popular_keys(paradigm_popularity,
                                         min_paradigm_popularity)

    # ["suffix"] => number of occurrences
    # this is for removing non-productive suffixes
    ending_counts = collections.defaultdict(int)

    # [form_prefix_id]["suffix"]["POS"][(para_id, idx)] => number of occurrences
    # this is for selecting the most popular parses
    prefix_endings = {}
    for form_prefix_id in range(len(paradigm_prefixes)):
        prefix_endings[form_prefix_id] = collections.defaultdict(
            lambda: collections.defaultdict(
                lambda: collections.defaultdict(int)
            )
        )

    logger.debug('calculating prediction data: checking word endings..')
    for word, (para_id, idx) in with_progress(words, "Checking word endings"):

        if para_id not in productive_paradigms:
            continue

        paradigm = paradigms[para_id]
        form_count = len(paradigm) // 3

        tag = gramtab[paradigm[form_count + idx]]
        form_prefix_id = paradigm[2 * form_count + idx]
        form_prefix = paradigm_prefixes[form_prefix_id]
        form_suffix = suffixes[paradigm[idx]]

        assert len(word) >= len(form_prefix + form_suffix), word
        assert word.startswith(form_prefix), word
        assert word.endswith(form_suffix), word

        if len(word) == len(form_prefix) + len(form_suffix):
            # pseudo-paradigms are useless for prediction
            continue

        POS = tuple(tag.replace(' ', ',', 1).split(','))[0]

        for i in range(max(len(form_suffix), 1), max_suffix_length + 1):  # was: 1,2,3,4,5
            word_end = word[-i:]
            ending_counts[word_end] += 1
            prefix_endings[form_prefix_id][word_end][POS][(para_id, idx)] += 1

    dawgs_data = []

    for form_prefix_id in sorted(prefix_endings.keys()):
        logger.debug('calculating prediction data: preparing DAFSA #%d..' % form_prefix_id)
        endings = prefix_endings[form_prefix_id]
        dawgs_data.append(
            _get_suffixes_dawg_data(endings, ending_counts, min_ending_freq)
        )

    return dawgs_data
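
# A toy illustration of the ending-counting step above (standalone sketch;
# the words are made up, and the real code starts from the paradigm suffix
# length rather than 1):
#
#     import collections
#
#     ending_counts = collections.defaultdict(int)
#     for word in ["столами", "стенами"]:
#         for i in range(1, 5 + 1):      # endings of length 1..5
#             ending_counts[word[-i:]] += 1
#
#     # Shared endings such as "ами" now have count 2 and survive a
#     # min_ending_freq >= 2 cut-off; endings seen only once get filtered out.
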
def parse_opencorpora_xml(filename):
    """
    Parse OpenCorpora dict XML and return a ``ParsedDictionary`` namedtuple.
    """
    links = []
    lexemes = {}
    grammemes = []

    version, revision = get_dictionary_info(filename)
    logger.info("dictionary v%s, rev%s", version, revision)

    interesting_tags = set(['grammeme', 'lemma', 'link'])

    def _parse(filename):
        for ev, elem in iterparse(filename):
            if elem.tag not in interesting_tags:
                continue
            yield ev, elem

    logger.info("parsing XML dictionary")
    for ev, elem in with_progress(_parse(filename), "XML parsing"):
        if elem.tag == 'grammeme':
            name = elem.find('name').text
            parent = elem.get('parent')
            alias = elem.find('alias').text
            description = elem.find('description').text

            grammeme = (name, parent, alias, description)
            grammemes.append(grammeme)
            xml_clear_elem(elem)

        elif elem.tag == 'lemma':
            lex_id, word_forms = _word_forms_from_xml_elem(elem)
            lexemes[lex_id] = word_forms
            xml_clear_elem(elem)

        elif elem.tag == 'link':
            link_tuple = (
                elem.get('from'),
                elem.get('to'),
                elem.get('type'),
            )
            links.append(link_tuple)
            xml_clear_elem(elem)

    return ParsedDictionary(
        lexemes=lexemes,
        links=links,
        grammemes=grammemes,
        version=version,
        revision=revision,
    )
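
# Usage sketch ("dict.opcorpora.xml" is a hypothetical local path to the
# OpenCorpora dictionary dump):
#
#     parsed = parse_opencorpora_xml("dict.opcorpora.xml")
#     print(parsed.version, parsed.revision)
#     print(len(parsed.lexemes), "lexemes,", len(parsed.links), "links")
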
def simplify_tags(parsed_dict, skip_space_ambiguity=True):
    """
    Simplify tags in ``parsed_dict``. ``parsed_dict`` is modified in place.
    """
    logger.info("simplifying tags: looking for tag spellings")
    spellings = _get_tag_spellings(parsed_dict)

    logger.info("simplifying tags: looking for spelling duplicates "
                "(skip_space_ambiguity: %s)", skip_space_ambiguity)
    tag_replaces = _get_duplicate_tag_replaces(spellings, skip_space_ambiguity)
    logger.debug("%d duplicate tags will be removed", len(tag_replaces))

    logger.info("simplifying tags: fixing")
    for lex_id in with_progress(parsed_dict.lexemes, "Simplifying tags"):
        new_lexeme = [
            (word, _simplify_tag(tag, tag_replaces))
            for word, tag in parsed_dict.lexemes[lex_id]
        ]
        parsed_dict.lexemes[lex_id] = new_lexeme
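
# Usage sketch (assumes `parsed` is a ParsedDictionary produced by
# parse_opencorpora_xml above):
#
#     simplify_tags(parsed)  # merges duplicate tag spellings in place
#
# Passing skip_space_ambiguity=False additionally merges spellings that
# differ only in space vs. comma placement, as the flag name suggests.
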
def _itertags(parsed_dict):
    for lex_id in with_progress(parsed_dict.lexemes, "Looking for tag spellings"):
        for word, tag in parsed_dict.lexemes[lex_id]:
            yield tag
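
# Usage sketch: _itertags streams every tag occurrence, so tag spellings can
# be counted without materializing the full list:
#
#     import collections
#     spelling_counts = collections.Counter(_itertags(parsed_dict))
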