def update():
    cwn = CwnBase()
    url_templ = "https://docs.google.com/spreadsheets/d/1vzDlokmrsXMdGBaoSFR9lC1F9BlN8qHR6b5YDMMvv7Y/export?format=csv&gid={gid}"
    cwn_data, check_results = update_cwn(url_templ, cwn)
    if check_results:
        export_to_pickle(cwn_data)
        export_to_json(cwn_data)
    else:
        logger.error("Update failed, please check the log file")
    return check_results
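# Hedged usage sketch (not part of the original module): run the update and
# report the outcome. `update_cwn`, `export_to_pickle`, `export_to_json`, and
# `logger` are assumed to be defined elsewhere in this module.
if __name__ == "__main__":
    if update():
        print("CWN data refreshed and exported to pickle and JSON.")
    else:
        print("Update failed; check the log file for details.")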
def search_word(word):
    cwn = CwnBase()  # Initialize CWN
    cwn_senses = cwn.find_lemma(word)[0].senses
    word1 = re.sub(r'\^|\$', '', word)  # Remove the regex anchors
    translator = Translator()  # Initialize the translator
    tran_result = translator.translate(word1).text  # Translate Chinese to English
    synset = wn.synsets(tran_result)
    num = []  # Synset ids in WordNet
    df = []   # Definitions in WordNet
    for i in range(0, len(synset)):
        df.append(synset[i].definition())  # Find the definition of the word
        num.append('{}-{}'.format(
            str(wn.synset(synset[i].name()).offset()).zfill(8),
            wn.synset(synset[i].name()).pos()))
        # The id has to be an 8-digit offset followed by a POS tag.
    return cwn_senses, {id_: def_ for id_, def_ in zip(num, df)}
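# Hedged usage sketch: look up a lemma and pair its CWN senses with the
# WordNet glosses of its English translation. Assumes googletrans and NLTK's
# WordNet corpus are available; the attributes printed below mirror those
# used elsewhere in these snippets, and the output is illustrative only.
cwn_senses, wn_defs = search_word("^朋友$")
for sense in cwn_senses:
    print(sense.id, sense.definition)
for offset_pos, gloss in wn_defs.items():
    print(offset_pos, gloss)  # an "NNNNNNNN-pos" key with its English gloss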
def test_graph_merge():
    cwn = CwnBase()
    annot1 = CwnAnnotator(cwn, "test_a")
    annot2 = CwnAnnotator(cwn, "test_b")
    am = annot_merger.AnnotationMerger(annot1, annot2)
    merged = am.merge()
    assert len(merged.V) > 0, "merged graph has non-empty vertices"
    assert len(merged.E) > 0, "merged graph has non-empty edges"
    merged.save()
def test_consistency(caplog):
    caplog.set_level(logging.INFO)
    cwn = CwnBase()
    basepath = Path(__file__).parent / "../data"
    annot_dfs = {
        "lemma": pd.read_csv(basepath / "sense_no_relation - lemma.csv"),
        "sense": pd.read_csv(basepath / "sense_no_relation - sense.csv"),
        "lex_rel": pd.read_csv(basepath / "sense_no_relation - lexical relation.csv"),
    }
    check_consistency(cwn, annot_dfs)
    assert True
# %%
# from pprint import pprint
# from CwnGraph import CwnBase
# CwnBase.install_cwn("cwn_graph.pyobj")

# %%
from CwnGraph import CwnBase
cwn = CwnBase()

# %%
lemmas = cwn.find_lemma("^朋友$")  # "朋友" means "friend"
lemmas

# %%
senses = lemmas[0].senses
senses

# %%
friend = senses[0]
friend.relations

# %%
friend.synonym

# %%
cwn.find_senses(definition='縮寫')  # senses whose definition mentions "縮寫" ("abbreviation")
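# %%
# Hedged follow-up cell: enumerate the senses found above. The `id` and
# `definition` attributes follow the usage in the other snippets here.
for s in senses:
    print(s.id, s.definition)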
#%%
from pprint import pprint
from CwnGraph import CwnBase
import re

cwn = CwnBase()

#%%
cwn.find_lemma("^朋友$")[0].senses[0]

# %%
all_lemma = cwn.find_lemma(".+")

# %%
all_senses = []
short = []
for lemma in all_lemma[1:500]:
    all_senses.extend(lemma.senses)  # collect the senses of each lemma
for sense in all_senses:
    # Match against the definition text; "簡省" means "abbreviated/elided"
    if re.match("簡省", sense.definition):
        short.append(sense)

# %%
import json
import re

with open('ASBC_unigrams.json', encoding='utf-8') as f:
    word_freq = json.load(f)

cjk = re.compile(r'[\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]+')
sorted_wordfreq = sorted(
    ((k, v) for k, v in word_freq.items() if cjk.match(k)),
    key=lambda x: x[1], reverse=True)

# %% Take the 8,000 words with the highest frequency
sorted_wordfreq[:8000]

# %%
# Extract words only
def Extract(lst):
    return [item[0] for item in lst]

#%%
# Initialize cwn
from CwnGraph import CwnBase
cwn = CwnBase()

# %%
# Find the ASBC top-8000 words that are not in CWN
words = Extract(sorted_wordfreq[:8000])
asbc_not_in_cwn = []  # words from the ASBC top-8000 with no CWN lemma
for word in words:
    if len(cwn.find_lemma(f"^{word}$")) == 0:
        asbc_not_in_cwn.append(word)

# %%
# Write .csv
import pandas as pd
df = pd.DataFrame(asbc_not_in_cwn, columns=["column"])
df.to_csv('asbc8k_not_in_cwn.csv', index=False)

# %%
# Import huayu8000
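# %%
# Hedged sanity-check cell (reuses only the variables defined above):
# how many of the ASBC top-8000 words are missing from CWN?
print(f"{len(asbc_not_in_cwn)} of {len(words)} words not found in CWN")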
def search_word(word):
    cwn_senses = cwn.find_lemma(word)[0].senses
    translator = Translator()  # Initialize the translator
    tran_result = translator.translate(word).text  # Translate Chinese to English
    synset = wn.synsets(tran_result)
    num = []
    df = []
    for i in range(0, len(synset)):
        df.append(synset[i].definition())  # Find the definition of the word
        num.append('{}-{}'.format(
            str(wn.synset(synset[i].name()).offset()).zfill(8),
            wn.synset(synset[i].name()).pos()))
        # The id has to be an 8-digit offset followed by a POS tag.
    return cwn_senses, {id_: def_ for id_, def_ in zip(num, df)}

# %%
from CwnGraph import CwnBase
cwn = CwnBase()

def search_word(words):
    senses = []
    for i in range(0, len(words)):
        # Look up each word in the list individually
        senses.append(cwn.find_lemma(words[i])[0].senses)
    return senses
def __init__(self):
    self.cwn = CwnBase()
class SenseKeyedVectors(KeyedVectors):
    def __init__(self):
        # Vector attributes are populated later via load_from_kv().
        self.cwn = CwnBase()

    @classmethod
    def load_from_kv(cls, fpath):
        kv = KeyedVectors.load(fpath)
        skv = SenseKeyedVectors()
        skv.__dict__.update(kv.__dict__)
        return skv

    def query_sense(self, term):
        if "-" not in term:
            return term
        cwn_id = term[term.index("-") + 1:]
        try:
            sense = CwnSense(cwn_id, self.cwn)
            return sense.head_word + ": " + sense.definition
        except Exception:
            return term

    def get_token_idx(self, sense):
        tok = f"{sense.head_word}-{sense.id}"
        tok_idx = self.key_to_index.get(tok, -1)
        return tok_idx

    def query_sense_freq(self, sense):
        tok_idx = self.get_token_idx(sense)
        if tok_idx < 0:
            return 0
        else:
            return self.expandos["count"][tok_idx]

    def query_vector(self, sense):
        tok_idx = self.get_token_idx(sense)
        if tok_idx < 0:
            return None
        else:
            return self.get_vector(tok_idx, norm=True)

    def make_sense_vectors(self, lemma):
        senses = self.cwn.find_all_senses(lemma)
        sense_ids = []
        sense_labels = []
        sense_freqs = []
        vecs = []
        for sense_x in senses:
            vec_x = self.query_vector(sense_x)
            if vec_x is None:
                continue
            vecs.append(vec_x)
            sense_ids.append(sense_x.id)
            try:
                # Take a short window around the <target-word> markup in the example
                ex0 = sense_x.all_examples()[0]
                ex = ex0[ex0.index("<") - 3:ex0.index(">") + 4]
            except Exception:
                ex = ""
            sense_labels.append(f"[{sense_x.id}]{sense_x.definition}: {ex}")
            sense_freqs.append(self.query_sense_freq(sense_x))
        if len(vecs):
            stack_vecs = np.vstack(vecs)
        else:
            stack_vecs = np.array([])
        return SenseData(sense_ids, sense_labels, sense_freqs, stack_vecs)
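# Hedged usage sketch: the file name and lemma are placeholders. It assumes a
# sense-tagged word2vec model saved with gensim's KeyedVectors.save(), and
# that SenseData (defined elsewhere) exposes its constructor arguments as
# attributes, e.g. as a namedtuple.
skv = SenseKeyedVectors.load_from_kv("sense_vectors.kv")
sense_data = skv.make_sense_vectors("朋友")
for sid, label, freq in zip(sense_data.sense_ids,
                            sense_data.sense_labels,
                            sense_data.sense_freqs):
    print(sid, freq, label)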
# %%
from CwnGraph import CwnBase
cwn = CwnBase()

# %%
lemmas = cwn.find_lemma("^朋友$")
senses = lemmas[0].senses

# %%
import re
from googletrans import Translator
from nltk.corpus import wordnet as wn

def search_word(word):
    cwn_senses = cwn.find_lemma(word)[0].senses
    word1 = re.sub(r'\^|\$', '', word)
    translator = Translator()  # Initialize the translator
    tran_result = translator.translate(word1).text  # Translate Chinese to English
    synset = wn.synsets(tran_result)
    num = []
    df = []
    for i in range(0, len(synset)):
        df.append(synset[i].definition())  # Find the definition of the word
        num.append('{}-{}'.format(
            str(wn.synset(synset[i].name()).offset()).zfill(8),
            wn.synset(synset[i].name()).pos()))
        # The id has to be an 8-digit offset followed by a POS tag.
    return cwn_senses, {id_: def_ for id_, def_ in zip(num, df)}
__cwn = None  # module-level cache; needed so the `global` lookup below works

def get_cwn_inst():
    global __cwn
    if __cwn is None:
        __cwn = CwnBase()
    return __cwn
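# Hedged usage sketch: repeated calls return the same cached instance, so the
# CWN graph is only loaded once per process.
cwn_a = get_cwn_inst()
cwn_b = get_cwn_inst()
assert cwn_a is cwn_b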