Code Example #1
def insert_defns(
    session, lemma_name: str, defns: DictTree2L[List[Dict]]
) -> Tuple[int, List[Tuple[int, Optional[Dict]]]]:
    morphs: List[Tuple[int, Optional[Dict]]] = []
    headword_id = insert_get_id(session, tables.headword, name=lemma_name)
    for full_id, ety, pos, sense in flatten_senses(
            defns):  # type: Tuple[str, int, str, Dict]
        stripped_defn = sense["stripped_defn"]
        sense.pop("bi_examples", {})
        sense.pop("fi_examples", {})

        word_sense_id = insert_get_id(
            session,
            tables.word_sense,
            inflection_of_id=None,
            headword_id=headword_id,
            etymology_index=ety,
            pos=pos,
            sense=stripped_defn,
            sense_id=full_id,
            extra=sense,
        )

        morph = sense.get("morph")
        if morph and morph.get("type") == "form":
            morphs.append((word_sense_id, morph))

    return headword_id, morphs
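
A minimal usage sketch, assuming a live SQLAlchemy-style session and a defns tree already produced by the upstream parser; make_session() and the lemma are placeholders, not part of the code above:

# Hypothetical driver for insert_defns; make_session() is a placeholder.
session = make_session()
headword_id, morphs = insert_defns(session, "kissa", defns)
for word_sense_id, morph in morphs:
    # Each collected morph is a form-of analysis that still needs
    # linking back to its lemma, e.g. with insert_morph (Code Example #4).
    insert_morph(session, word_sense_id, morph, headword_id_map={})
session.commit()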
Code Example #2
def insert_ety_head(session, lemma: str, ety_head, headword_id_map):
    lemma_id = ensure_lemma(session, lemma, headword_id_map)
    ety_head_id = insert_get_id(
        session,
        tables.etymology,
        etymology_index=ety_head.pop("ety_idx"),
        headword_id=lemma_id,
        poses=ety_head.pop("poses"),
    )
    etys = ety_head.pop("etys")
    for ety in etys:
        derivation_id = insert_get_id(
            session,
            tables.derivation,
            etymology_id=ety_head_id,
            type=DerivationType(ety.pop("type")),
            extra={"raw_frag": ety.pop("raw_frag")},
        )
        for bit in ety.pop("bits"):
            child_lemma_id = ensure_lemma(session, bit["headword"],
                                          headword_id_map)
            insert(
                session,
                tables.derivation_seg,
                derivation_id=derivation_id,
                derived_seg_id=child_lemma_id,
                alt=bit["alt"],
            )
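
Reading the pop() and indexing calls back, the ety_head payload appears to have the shape sketched below; the concrete values are illustrative only, not taken from real data:

# Shape implied by insert_ety_head; every value here is made up.
ety_head = {
    "ety_idx": 1,                        # stored as etymology_index
    "poses": ["noun"],                   # stored on the etymology row
    "etys": [
        {
            "type": "compound",          # coerced to DerivationType
            "raw_frag": "kirja + -sto",  # kept in the extra JSON blob
            "bits": [
                {"headword": "kirja", "alt": None},
                {"headword": "-sto", "alt": None},
            ],
        },
    ],
}
insert_ety_head(session, "kirjasto", ety_head, headword_id_map={})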
Code Example #3
def ensure_lemma(session, lemma, headword_id_map, *, redlink=False):
    # Memoise headword ids so each lemma row is inserted at most once.
    if lemma in headword_id_map:
        lemma_id = headword_id_map[lemma]
    else:
        lemma_id = insert_get_id(session,
                                 tables.headword,
                                 name=lemma,
                                 redlink=redlink)
        headword_id_map[lemma] = lemma_id
    return lemma_id
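
The id map doubles as a memo table, so within one loading run repeated lookups for the same lemma hit the cache rather than the database. A quick sketch of that behaviour:

# ensure_lemma caches ids in the dict the caller passes in.
headword_id_map = {}
first = ensure_lemma(session, "talo", headword_id_map)
second = ensure_lemma(session, "talo", headword_id_map)  # cache hit, no insert
assert first == second and headword_id_map == {"talo": first}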
Code Example #4
def insert_morph(session, word_sense_id, morph, headword_id_map):
    morph.pop("type")
    lemma = morph.pop("lemma")
    lemma_id = ensure_lemma(session, lemma, headword_id_map)
    inflection_of_id = insert_get_id(session,
                                     tables.inflection_of,
                                     lemma_id=lemma_id,
                                     inflection=morph)
    session.execute(
        tables.word_sense.update()
        .where(tables.word_sense.c.id == word_sense_id)
        .values(inflection_of_id=inflection_of_id)
    )
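
The function consumes its morph argument destructively: "type" and "lemma" are popped off, and whatever remains is stored verbatim as the inflection. A hypothetical call, with made-up feature names:

# morph as produced upstream; "case"/"number" are illustrative features.
morph = {"type": "form", "lemma": "kissa",
         "case": "inessive", "number": "singular"}
insert_morph(session, word_sense_id, morph, headword_id_map)
# Afterwards only the case/number features remain in the stored
# inflection, keyed against the lemma's headword id.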
Code Example #5
def insert_mwe(session, mwe: UdMwe, hw_cnts_cache, freqs=False, materialize=False):
    gap_mwe = gapped_mwe(mwe)
    logger.info("Inserting %s", gap_mwe)
    mwe_id = insert_get_id(
        session,
        tables["ud_mwe"],
        typ=mwe.typ,
        poses=listify_poses(mwe.poses),
        headword_idx=mwe.headword_idx,
    )
    for subword_idx, token in enumerate(mwe.tokens):
        insert(
            session,
            tables["ud_mwe_token"],
            mwe_id=mwe_id,
            subword_idx=subword_idx,
            payload=token.payload,
            payload_is_lemma=token.payload_is_lemma,
            poses=listify_poses(token.poses),
            feats=token.feats,
        )
    for link in mwe.links:
        insert(
            session,
            tables["link"],
            mwe_id=mwe_id,
            name=link.link_name,
            payload=link.get_cols(),
        )
        if isinstance(link, WiktionaryHeadwordLink):
            insert(
                session,
                tables["wiktionary_hw_link"],
                mwe_id=mwe_id,
                page_exists=link.page_exists,
                has_senses=link.has_senses,
            )
    if freqs:
        insert_freqs(session, mwe_id, mwe, hw_cnts_cache)
    if materialize:
        insert(
            session,
            tables["mwe_fmt"],
            mwe_id=mwe_id,
            gapped_mwe=gapped_mwe(mwe),
            pos_info=pos_template(mwe),
            turkudepsearch=tds(mwe),
        )
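
A hedged driver sketch; mwe is assumed to be a fully populated UdMwe coming out of the parsing stage, and the empty cache dict is a placeholder:

# freqs=True additionally records frequency rows via insert_freqs;
# materialize=True precomputes the formatted strings into mwe_fmt.
insert_mwe(session, mwe, hw_cnts_cache={}, freqs=True, materialize=True)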
Code Example #6
def insert_headword_freqs(session, mwe, lemma_query, lemma):
    freqs_res = headword_freq(mwe)
    assert freqs_res is not None
    headword_freq_id = insert_get_id(
        session,
        tables["headword_freq"],
        lemma_query=lemma_query,
        lemma=lemma,
        wordfreq=freqs_res[0],
        wordfreq_zipf=freqs_res[1],
        # internet_parsebank_cnt=turkudepsearch_headword_freq(lemma_query),
    )
    # hw_cnts = turkudepsearch_propbank_headword_freqs(lemma_query)
    # for prop, cnt in hw_cnts.items():
    #     insert(
    #         session,
    #         tables["headword_propbank_freqs"],
    #         headword_freq_id=headword_freq_id,
    #         prop=prop,
    #         cnt=cnt,
    #     )
    return headword_freq_id  # , hw_cnts
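
The commented-out TurkuDepSearch counts suggest the frequency sources were being switched around; as written, only the wordfreq values are stored. A minimal call sketch, with an illustrative lemma:

# headword_freq(mwe) is assumed to return a (wordfreq, wordfreq_zipf)
# pair, per the tuple indexing above; the lemma strings are illustrative.
freq_id = insert_headword_freqs(session, mwe,
                                lemma_query="pitää", lemma="pitää")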
Code Example #7
def insert_indexed(
    session,
    subwords_list: List[Tuple[Optional[str], bool, List[str], Dict[str, str]]],
    ud_mwe_headword_idx,
    ud_mwe_id,
    *,
    ignore_bare_lemma=True,
    lemmatise=fi_lemmatise,
    dry_run=False,
    add_surf=True,
) -> IndexingResult:
    subword_keys = []
    for (payload, payload_is_lemma, poses, feats) in subwords_list:
        bare_lemma = (payload_is_lemma and len(feats) == 0
                      and not ignore_bare_lemma)
        if bare_lemma or len(feats) > 0:
            assert payload is None or payload_is_lemma
            lemma = payload if payload is not None else WILDCARD
            keyed_feats = {lemma: {tuple(feats.items())}}
        elif payload is not None:
            keyed_feats = lemmatise(payload)
            if add_surf:
                # Also index the lowercased surface form under a
                # (SURF, SURF) pseudo-feature so exact-form lookups work.
                keyed_feats.setdefault(payload.lower(), set()).add(
                    ((SURF, SURF),))
        else:
            # TODO: Might like to blacklist anything too simple composed with this open wildcard
            # e.g. we should probably just forget about headword + ___
            keyed_feats = {WILDCARD: set()}
        subword_keys.append(keyed_feats)
    # Prefer indexing on the designated headword; otherwise fall back to
    # the key subword chosen by get_key_idx.
    if ud_mwe_headword_idx is not None and subword_keys[ud_mwe_headword_idx]:
        key_idx = ud_mwe_headword_idx
        key_is_head = True
    else:
        key_idx = get_key_idx(subwords_list)
        key_is_head = False
    assert key_idx is not None
    key_lemmas = list(subword_keys[key_idx].keys())
    if not key_lemmas:
        return IndexingResult.FAIL
    if not dry_run:
        word_id = insert_get_id(
            session,
            tables["word"],
            key_idx=key_idx,
            key_is_head=key_is_head,
            ud_mwe_id=ud_mwe_id,
        )
    for lemma in key_lemmas:
        if not dry_run:
            insert(
                session,
                tables["key_lemma"],
                key_lemma=lemma,
                word_id=word_id,
            )
    for subword_idx, constrained_lemmas in enumerate(subword_keys):
        lemma_feats = {k: list(v) for k, v in constrained_lemmas.items()}
        if not dry_run:
            insert(
                session,
                tables["subword"],
                word_id=word_id,
                subword_idx=subword_idx,
                lemma_feats=lemma_feats,
            )
    if key_is_head:
        return IndexingResult.HEAD_INDEXED
    else:
        return IndexingResult.RAREST_INDEXED
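
A dry-run sketch showing the subword tuple shape. Each element is (payload, payload_is_lemma, poses, feats), matching the annotation on subwords_list; the tokens below are illustrative, and fi_lemmatise is assumed to be available from the surrounding module:

# Two-token expression: a lemma-keyed verb plus an open wildcard slot.
subwords_list = [
    ("pitää", True, ["VERB"], {}),  # lemmatised via fi_lemmatise
    (None, False, [], {}),          # open wildcard subword
]
result = insert_indexed(session, subwords_list,
                        ud_mwe_headword_idx=0, ud_mwe_id=42,
                        dry_run=True)  # no rows written
assert result == IndexingResult.HEAD_INDEXED  # headword slot was usable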