def insert_defns(
    session, lemma_name: str, defns: DictTree2L[List[Dict]]
) -> Tuple[int, List[Tuple[int, Optional[Dict]]]]:
    """Insert the headword ``lemma_name`` plus one word-sense row per sense.

    Returns the new headword id together with (word_sense_id, morph) pairs
    for every sense whose morphology marks it as an inflected "form" —
    callers can later link those senses back to their lemmas.
    """
    headword_id = insert_get_id(session, tables.headword, name=lemma_name)
    form_morphs = []  # type: List[Tuple[int, Optional[Dict]]]
    for full_id, ety, pos, sense in flatten_senses(defns):  # type: Tuple[str, int, str, Dict]
        defn_text = sense["stripped_defn"]
        # Examples are dropped before the remaining dict is stored as `extra`.
        sense.pop("bi_examples", {})
        sense.pop("fi_examples", {})
        sense_row_id = insert_get_id(
            session,
            tables.word_sense,
            inflection_of_id=None,
            headword_id=headword_id,
            etymology_index=ety,
            pos=pos,
            sense=defn_text,
            sense_id=full_id,
            extra=sense,
        )
        morph = sense.get("morph")
        if morph and morph.get("type") == "form":
            form_morphs.append((sense_row_id, morph))
    return headword_id, form_morphs
def insert_ety_head(session, lemma: str, ety_head, headword_id_map):
    """Insert one etymology entry for ``lemma`` with its derivations.

    ``ety_head`` is consumed: "ety_idx", "poses" and "etys" are popped,
    and within each ety so are "type", "raw_frag" and "bits".
    ``headword_id_map`` is the shared lemma→id cache used by ensure_lemma.
    """
    lemma_id = ensure_lemma(session, lemma, headword_id_map)
    ety_head_id = insert_get_id(
        session,
        tables.etymology,
        etymology_index=ety_head.pop("ety_idx"),
        headword_id=lemma_id,
        poses=ety_head.pop("poses"),
    )
    for ety in ety_head.pop("etys"):
        derivation_id = insert_get_id(
            session,
            tables.derivation,
            etymology_id=ety_head_id,
            type=DerivationType(ety.pop("type")),
            extra={"raw_frag": ety.pop("raw_frag")},
        )
        # Each bit is a segment of the derivation, pointing at (and if
        # needed creating) the headword it derives from.
        for bit in ety.pop("bits"):
            seg_lemma_id = ensure_lemma(session, bit["headword"], headword_id_map)
            insert(
                session,
                tables.derivation_seg,
                derivation_id=derivation_id,
                derived_seg_id=seg_lemma_id,
                alt=bit["alt"],
            )
def ensure_lemma(session, lemma, headword_id_map, *, redlink=False):
    """Return the headword id for ``lemma``, inserting a row if unseen.

    ``headword_id_map`` is an in-place lemma -> id cache; a hit skips the
    database entirely (``redlink`` only applies on first insertion).
    """
    try:
        return headword_id_map[lemma]
    except KeyError:
        new_id = insert_get_id(session, tables.headword, name=lemma, redlink=redlink)
        headword_id_map[lemma] = new_id
        return new_id
def insert_morph(session, word_sense_id, morph, headword_id_map):
    """Attach an inflection_of record to an existing word sense.

    ``morph`` is consumed: "type" and "lemma" are popped and whatever
    remains is stored as the inflection description.
    """
    morph.pop("type")
    lemma_id = ensure_lemma(session, morph.pop("lemma"), headword_id_map)
    inflection_of_id = insert_get_id(
        session, tables.inflection_of, lemma_id=lemma_id, inflection=morph
    )
    update_stmt = (
        tables.word_sense.update()
        .where(tables.word_sense.c.id == word_sense_id)
        .values(inflection_of_id=inflection_of_id)
    )
    session.execute(update_stmt)
def insert_mwe(session, mwe: UdMwe, hw_cnts_cache, freqs=False, materialize=False):
    """Insert a UD multi-word expression with its tokens and links.

    Optionally also inserts frequency data (``freqs``) and a
    pre-rendered/materialized representation (``materialize``).
    """
    gap_mwe = gapped_mwe(mwe)
    logger.info("Inserting %s", gap_mwe)
    mwe_id = insert_get_id(
        session,
        tables["ud_mwe"],
        typ=mwe.typ,
        poses=listify_poses(mwe.poses),
        headword_idx=mwe.headword_idx,
    )
    # One row per token, ordered by position within the MWE.
    for subword_idx, token in enumerate(mwe.tokens):
        insert(
            session,
            tables["ud_mwe_token"],
            mwe_id=mwe_id,
            subword_idx=subword_idx,
            payload=token.payload,
            payload_is_lemma=token.payload_is_lemma,
            poses=listify_poses(token.poses),
            feats=token.feats,
        )
    for link in mwe.links:
        insert(
            session,
            tables["link"],
            mwe_id=mwe_id,
            name=link.link_name,
            payload=link.get_cols(),
        )
        # Wiktionary headword links get an extra denormalised row.
        if isinstance(link, WiktionaryHeadwordLink):
            insert(
                session,
                tables["wiktionary_hw_link"],
                mwe_id=mwe_id,
                page_exists=link.page_exists,
                has_senses=link.has_senses,
            )
    if freqs:
        insert_freqs(session, mwe_id, mwe, hw_cnts_cache)
    if materialize:
        insert(
            session,
            tables["mwe_fmt"],
            mwe_id=mwe_id,
            gapped_mwe=gapped_mwe(mwe),
            pos_info=pos_template(mwe),
            turkudepsearch=tds(mwe),
        )
def insert_headword_freqs(session, mwe, lemma_query, lemma):
    """Insert the wordfreq-based frequency row for an MWE headword.

    Returns the new headword_freq id. headword_freq(mwe) must not be None
    here — callers are expected to check availability first.
    """
    freqs_res = headword_freq(mwe)
    assert freqs_res is not None
    wordfreq_val, wordfreq_zipf_val = freqs_res[0], freqs_res[1]
    headword_freq_id = insert_get_id(
        session,
        tables["headword_freq"],
        lemma_query=lemma_query,
        lemma=lemma,
        wordfreq=wordfreq_val,
        wordfreq_zipf=wordfreq_zipf_val,
        # Disabled: turkudepsearch counts were dropped from this row.
        # internet_parsebank_cnt=turkudepsearch_headword_freq(lemma_query),
    )
    # Disabled: per-propbank-frame counts; kept for reference.
    # hw_cnts = turkudepsearch_propbank_headword_freqs(lemma_query)
    # for prop, cnt in hw_cnts.items():
    #     insert(
    #         session,
    #         tables["headword_propbank_freqs"],
    #         headword_freq_id=headword_freq_id,
    #         prop=prop,
    #         cnt=cnt,
    #     )
    return headword_freq_id  # , hw_cnts
def insert_indexed(
    session,
    subwords_list: List[Tuple[Optional[str], bool, List[str], Dict[str, str]]],
    ud_mwe_headword_idx,
    ud_mwe_id,
    *,
    ignore_bare_lemma=True,
    lemmatise=fi_lemmatise,
    dry_run=False,
    add_surf=True,
) -> IndexingResult:
    """Index an MWE's subwords and insert the lookup rows for it.

    Each entry of ``subwords_list`` is (payload, payload_is_lemma, poses,
    feats).  For every subword a mapping lemma -> set of feature-tuples is
    built; one subword is chosen as the key (the headword if it produced
    any keys, otherwise whatever ``get_key_idx`` picks) and its lemmas
    become the ``key_lemma`` rows.  With ``dry_run`` no rows are written
    but the same IndexingResult is computed.

    Returns IndexingResult.FAIL when the key subword yields no lemmas,
    HEAD_INDEXED when keyed on the headword, RAREST_INDEXED otherwise.
    """
    subword_keys = []
    for (payload, payload_is_lemma, poses, feats) in subwords_list:
        # NOTE(review): with the default ignore_bare_lemma=True this is
        # always False, so featureless lemmas fall through to lemmatise().
        bare_lemma = bare_lemma = payload_is_lemma and len(
            feats) == 0 and not ignore_bare_lemma if False else payload_is_lemma and len(
            feats) == 0 and not ignore_bare_lemma
        if bare_lemma or len(feats) > 0:
            # Explicit features: the payload (if any) must itself be a lemma.
            assert payload is None or payload_is_lemma
            lemma = payload if payload is not None else WILDCARD
            keyed_feats = {lemma: {tuple(feats.items())}}
        elif payload is not None:
            # Surface form (or bare lemma being ignored): derive candidate
            # lemmas; optionally also key the lowercased surface itself.
            keyed_feats = lemmatise(payload)
            if add_surf:
                keyed_feats.setdefault(payload.lower(), set()).add(
                    ((SURF, SURF), ))
        else:
            # TODO: Might like to blacklist anything too simple composed with this open wildcard
            # e.g. we should probably just forget about headword + ___
            keyed_feats = {WILDCARD: set()}
        subword_keys.append(keyed_feats)
    # Prefer keying on the declared headword when it produced any lemmas.
    if ud_mwe_headword_idx is not None and subword_keys[ud_mwe_headword_idx]:
        key_idx = ud_mwe_headword_idx
        key_is_head = True
    else:
        key_idx = get_key_idx(subwords_list)
        key_is_head = False
    assert key_idx is not None
    key_lemmas = list(subword_keys[key_idx].keys())
    if not len(key_lemmas):
        return IndexingResult.FAIL
    # word_id is only bound when not dry_run; the inserts below are guarded
    # the same way, so it is never read while unbound.
    if not dry_run:
        word_id = insert_get_id(
            session,
            tables["word"],
            key_idx=key_idx,
            key_is_head=key_is_head,
            ud_mwe_id=ud_mwe_id,
        )
    for lemma in key_lemmas:
        if not dry_run:
            insert(
                session,
                tables["key_lemma"],
                key_lemma=lemma,
                word_id=word_id,
            )
    # Store every subword's constraint set (sets become lists for storage).
    for subword_idx, constrained_lemmas in enumerate(subword_keys):
        lemma_feats = {k: list(v) for k, v in constrained_lemmas.items()}
        if not dry_run:
            insert(
                session,
                tables["subword"],
                word_id=word_id,
                subword_idx=subword_idx,
                lemma_feats=lemma_feats,
            )
    if key_is_head:
        return IndexingResult.HEAD_INDEXED
    else:
        return IndexingResult.RAREST_INDEXED