Example #1
def update():
    cwn = CwnBase()
    url_templ = "https://docs.google.com/spreadsheets/d/1vzDlokmrsXMdGBaoSFR9lC1F9BlN8qHR6b5YDMMvv7Y/export?format=csv&gid={gid}"
    # Fetch the latest annotation sheets and run the consistency checks
    cwn_data, check_results = update_cwn(url_templ, cwn)

    if check_results:
        # Checks passed: persist the updated graph in both formats
        export_to_pickle(cwn_data)
        export_to_json(cwn_data)
    else:
        logger.error("Update failed, please check the log file")

    return check_results
Example #2
import re

from CwnGraph import CwnBase
from googletrans import Translator
from nltk.corpus import wordnet as wn


def search_word(word):
    cwn = CwnBase()  # Initialize cwn
    cwn_senses = cwn.find_lemma(word)[0].senses

    word1 = re.sub(r'\^|\$', '', word)  # Remove the anchors

    translator = Translator()  # Initialize the translator
    tran_result = translator.translate(word1).text  # Translate CH to EN
    synset = wn.synsets(tran_result)

    num = []  # Id in wordnet
    df = []  # Definition in wordnet

    for i in range(0, len(synset)):
        df.append(synset[i].definition())  # Find the definition of the word
        num.append('{}-{}'.format(
            str(wn.synset(synset[i].name()).offset()).zfill(8),
            wn.synset(synset[i].name()).pos()))
        # Find the id num of the synset. It has to be 8-digit with a pos tag.

    return cwn_senses, {id_: def_ for id_, def_ in zip(num, df)}
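
A minimal usage sketch for search_word above, assuming CwnGraph, googletrans, and nltk (with the wordnet corpus downloaded) are installed; the anchored query "^朋友$" is only an illustrative input:

cwn_senses, wn_defs = search_word("^朋友$")
for sense in cwn_senses:
    print(sense.id, sense.definition)  # CWN sense id and definition
for wn_id, gloss in wn_defs.items():
    print(wn_id, gloss)  # WordNet "offset-pos" id and English gloss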
Example #3
def test_graph_merge():
    cwn = CwnBase()
    annot1 = CwnAnnotator(cwn, "test_a")
    annot2 = CwnAnnotator(cwn, "test_b")

    am = annot_merger.AnnotationMerger(annot1, annot2)

    merged = am.merge()

    assert len(merged.V) > 0, "merged graph has non-empty vertices"
    assert len(merged.E) > 0, "merged graph has non-empty edges"

    merged.save()
Example #4
def test_consistency(caplog):
    caplog.set_level(logging.INFO)
    cwn = CwnBase()
    basepath = Path(__file__).parent / "../data"
    annot_dfs = {
        "lemma":
        pd.read_csv(basepath / "sense_no_relation - lemma.csv"),
        "sense":
        pd.read_csv(basepath / "sense_no_relation - sense.csv"),
        "lex_rel":
        pd.read_csv(basepath / "sense_no_relation - lexical relation.csv"),
    }
    check_consistency(cwn, annot_dfs)
    # check_consistency reports problems through logging; finishing without an exception passes the test
    assert True
Example #5
# %%
# from pprint import pprint
# from CwnGraph import CwnBase
# CwnBase.install_cwn("cwn_graph.pyobj")  # one-time installation of the CWN data
# %%
from CwnGraph import CwnBase
cwn = CwnBase()
# %%
# Exact lemma lookup; ^ and $ anchor the regex pattern
lemmas = cwn.find_lemma("^朋友$")
lemmas
# %%
senses = lemmas[0].senses
senses
# %%
friend = senses[0]
friend.relations
# %%
friend.synonym
# %%
# Find senses whose definition contains '縮寫' ("abbreviation")
cwn.find_senses(definition='縮寫')
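
The call above returns a list of CwnSense objects; a minimal sketch of inspecting them, reusing the sense fields (id, head_word, definition) that appear elsewhere in these examples:

# List every sense whose definition matched '縮寫'
for sense in cwn.find_senses(definition='縮寫'):
    print(sense.id, sense.head_word, sense.definition)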
Example #6
# %%
import re

from pprint import pprint
from CwnGraph import CwnBase

cwn = CwnBase()
# %%
cwn.find_lemma("^朋友$")[0].senses[0]
# %%
all_lemma = cwn.find_lemma(".+")
# %%
# Collect the senses of a slice of lemmas, then keep those whose
# definition mentions '簡省' ("abbreviated")
all_senses = []
short = []
for lemma in all_lemma[1:500]:
    all_senses.extend(lemma.senses)
for sense in all_senses:
    if re.search("簡省", sense.definition):
        short.append(sense)
# %%
Example #7
import json
import re

with open('ASBC_unigrams.json', encoding='utf-8') as f:
    word_freq = json.load(f)

cjk = re.compile(r'[\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]+')
sorted_wordfreq = sorted(((k, v) for k, v in word_freq.items() if cjk.match(k)), key=lambda x: x[1], reverse=True)

# %% Take the 8,000 most frequent words
sorted_wordfreq[:8000]
# %%
# Extract words only
def Extract(lst): 
    return [item[0] for item in lst] 
#%%
# Initialize cwn
from CwnGraph import CwnBase
cwn = CwnBase()
# %%
# Find ASBC top-8000 words that are not in CWN
words = Extract(sorted_wordfreq[:8000])
asbc_not_in_cwn = []
for word in words:
    if len(cwn.find_lemma(f"^{word}$")) == 0:  # no exact lemma match
        asbc_not_in_cwn.append(word)

# %%
# Write .csv
import pandas as pd
df = pd.DataFrame(asbc_not_in_cwn, columns=["word"])
df.to_csv('asbc8k_not_in_cwn.csv', index=False)
# %%
# Import huayu8000
Example #8
def search_word(word):
    cwn_senses = cwn.find_lemma(word)[0].senses

    translator = Translator()
    # Initialize the translator
    tran_result = translator.translate(word).text
    # translate CH to EN
    synset = wn.synsets(tran_result)
    num = []
    df = []
    result = []

    for i in range(0, len(synset)):
        df.append(synset[i].definition())  # find the definition of the word
        num.append('{}-{}'.format(
            str(wn.synset(synset[i].name()).offset()).zfill(8),
            wn.synset(synset[i].name()).pos()))
        # Find the id num of the synset. It has to be 8-digit with a pos tag.

    return cwn_senses, {id_: def_ for id_, def_ in zip(num, df)}


# %%
from CwnGraph import CwnBase
cwn = CwnBase()


def search_word(words):
    # Collect the sense list of each word; assumes `words` is an iterable of lemma queries
    senses = []
    for word in words:
        senses.append(cwn.find_lemma(word)[0].senses)
    return senses
Example #9
def __init__(self):
    self.cwn = CwnBase()
Example #10
import numpy as np
from collections import namedtuple

from gensim.models import KeyedVectors
from CwnGraph import CwnBase, CwnSense  # CwnSense's import path may differ across CwnGraph versions

# SenseData is not shown in the source; a namedtuple with the fields used below is assumed
SenseData = namedtuple("SenseData", ["sense_ids", "sense_labels", "sense_freqs", "vectors"])


class SenseKeyedVectors(KeyedVectors):
    def __init__(self):
        self.cwn = CwnBase()

    @classmethod
    def load_from_kv(cls, fpath):
        kv = KeyedVectors.load(fpath)
        skv = SenseKeyedVectors()
        skv.__dict__.update(kv.__dict__)
        return skv

    def query_sense(self, term):
        if "-" not in term:
            return term
        cwn_id = term[term.index("-") + 1:]
        try:
            sense = CwnSense(cwn_id, self.cwn)
            return sense.head_word + ": " + sense.definition
        except Exception:
            return term

    def get_token_idx(self, sense):
        tok = f"{sense.head_word}-{sense.id}"
        tok_idx = self.key_to_index.get(tok, -1)
        return tok_idx

    def query_sense_freq(self, sense):
        tok_idx = self.get_token_idx(sense)
        if tok_idx < 0:
            return 0
        else:
            return self.expandos["count"][tok_idx]

    def query_vector(self, sense):
        tok_idx = self.get_token_idx(sense)
        if tok_idx < 0:
            return None
        else:
            return self.get_vector(tok_idx, norm=True)

    def make_sense_vectors(self, lemma):
        senses = self.cwn.find_all_senses(lemma)
        sense_ids = []
        sense_labels = []
        sense_freqs = []
        vecs = []

        for sense_x in senses:
            vec_x = self.query_vector(sense_x)
            if vec_x is None: continue
            vecs.append(vec_x)
            sense_ids.append(sense_x.id)
            try:
                # Show a short window of context around the <target> token in the first example
                ex0 = sense_x.all_examples()[0]
                ex = ex0[ex0.index("<") - 3:ex0.index(">") + 4]
            except Exception:
                ex = ""
            sense_labels.append(f"[{sense_x.id}]{sense_x.definition}: {ex}")
            sense_freqs.append(self.query_sense_freq(sense_x))

        if len(vecs):
            stack_vecs = np.vstack(vecs)
        else:
            stack_vecs = np.array([])

        return SenseData(sense_ids, sense_labels, sense_freqs, stack_vecs)
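
A hedged usage sketch for SenseKeyedVectors; "sense_vectors.kv" is a placeholder path for a gensim KeyedVectors file whose keys follow the "headword-senseid" convention that get_token_idx assumes:

skv = SenseKeyedVectors.load_from_kv("sense_vectors.kv")  # placeholder path
sense_data = skv.make_sense_vectors("朋友")
for label, freq in zip(sense_data.sense_labels, sense_data.sense_freqs):
    print(freq, label)  # corpus frequency and "[id]definition: example" label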
Example #11
# %%
from CwnGraph import CwnBase
cwn = CwnBase()

# %%
lemmas = cwn.find_lemma("^朋友$")
senses = lemmas[0].senses

# %%
import re
from googletrans import Translator
from nltk.corpus import wordnet as wn


def search_word(word):
    cwn_senses = cwn.find_lemma(word)[0].senses

    word1 = re.sub(r'\^|\$', '', word)
    translator = Translator()
    # Initialize the translator
    tran_result = translator.translate(word1).text
    # translate CH to EN
    synset = wn.synsets(tran_result)
    num = []
    df = []
    result = []

    for i in range(0, len(synset)):
        df.append(synset[i].definition())  # find the definition of the word
        num.append('{}-{}'.format(
            str(wn.synset(synset[i].name()).offset()).zfill(8),
            wn.synset(synset[i].name()).pos()))
        # Find the id num of the synset. It has to be 8-digit with a pos tag.

    return cwn_senses, {id_: def_ for id_, def_ in zip(num, df)}
Example #12
from CwnGraph import CwnBase

__cwn = None  # module-level cache (assumed; its initialization is not shown in the snippet)

def get_cwn_inst():
    # Lazily construct and reuse a single shared CwnBase instance
    global __cwn
    if not __cwn:
        __cwn = CwnBase()
    return __cwn
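
A short sketch of the lazy-singleton behavior this helper provides: repeated calls share one CwnBase instance, so the graph data is only loaded once.

cwn_a = get_cwn_inst()
cwn_b = get_cwn_inst()
assert cwn_a is cwn_b  # only the first call constructs CwnBase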