def operation_pref_conjugaison(lang):
    """Compute and store ``LabelNamePreference`` for every conjugation row.

    For each row of ``conjugations`` the raw score is
    ``sqrt(len(AlternativeFormsOther)) + sqrt(len(ExplainationTxt))``.
    The score is divided by the reference value returned by
    ``calculate_they_read()`` and by 2, clamped to the range [0, 1], and
    written back together with ``Operation_Pref = 1``.

    Args:
        lang: Not used by this function.
    """
    log.info("Doing operation Set Property: LabelNamePreference: Conjugaison")

    preference_base = calculate_they_read()

    update_sql = (
        " UPDATE conjugations"
        " SET LabelNamePreference = ?, Operation_Pref = 1"
        " WHERE PK = ? "
    )

    rows = DBRead(
        DBConjugations,
        sql=" SELECT * FROM conjugations ",
        cls=ConjugationsItem,
    )
    for c in rows:
        log.info(" set Property: LabelNamePreference: %s", c)

        preference = (
            math.sqrt(len(c.AlternativeFormsOther))
            + math.sqrt(len(c.ExplainationTxt))
        )
        preference = preference / preference_base / 2

        # Clamp into [0, 1].  Storing 1 for every positive score (the former
        # behaviour) made the normalisation above pointless, because only the
        # sign of the score was ever used.
        LabelNamePreference = min(max(preference, 0), 1)

        DBExecute(DBConjugations, update_sql, LabelNamePreference, c.PK)
def operation_pref_wiktionary(lang):
    """Compute and store ``LabelNamePreference`` for every wiktionary row.

    The raw score of each row comes from ``calculate_preference_wiktionary``.
    It is divided by the reference value returned by
    ``calculate_cat_felidae()`` and by 2, clamped to the range [0, 1], and
    written back together with ``Operation_Pref = 1``.

    Args:
        lang: Not used by this function.
    """
    log.info("Doing operation Set Property: LabelNamePreference: Wiktionary")

    preference_base = calculate_cat_felidae()

    update_sql = (
        " UPDATE wiktionary"
        " SET LabelNamePreference = ?, Operation_Pref = 1"
        " WHERE PrimaryKey = ? "
    )

    rows = DBRead(
        DBWiktionary,
        sql=" SELECT * FROM wiktionary ",
        cls=WiktionaryItem,
    )
    for wt in rows:
        log.info(" set Property: LabelNamePreference: %s", wt)

        preference = calculate_preference_wiktionary(wt)
        preference = preference / preference_base / 2

        # Clamp into [0, 1].  Storing 1 for every positive score (the former
        # behaviour) made the normalisation above pointless, because only the
        # sign of the score was ever used.
        LabelNamePreference = min(max(preference, 0), 1)

        DBExecute(DBWiktionary, update_sql, LabelNamePreference, wt.PrimaryKey)
def operation_pref_wikipedia(lang):
    """Compute and store ``LabelNamePreference`` for every wikipedia row.

    For each row of ``wikipedia`` the raw score is
    ``sqrt(len(SeeAlsoWikipediaLinks)) + sqrt(len(ExplainationWPTxt))
    + sqrt(len(ExplainationExamplesTxt))``.  The score is divided by the
    reference value returned by ``calculate_cat_felidae()`` and by 2,
    clamped to the range [0, 1], and written back together with
    ``Operation_Pref = 1``.

    Args:
        lang: Not used by this function.
    """
    log.info("Doing operation Set Property: LabelNamePreference: Wikipedia")

    preference_base = calculate_cat_felidae()

    update_sql = (
        " UPDATE wikipedia"
        " SET LabelNamePreference = ?, Operation_Pref = 1"
        " WHERE PK = ? "
    )

    rows = DBRead(
        DBWikipedia,
        sql=" SELECT * FROM wikipedia ",
        cls=WikipediaItem,
    )
    for wp in rows:
        log.info(" set Property: LabelNamePreference: %s", wp)

        preference = (
            math.sqrt(len(wp.SeeAlsoWikipediaLinks))
            + math.sqrt(len(wp.ExplainationWPTxt))
            + math.sqrt(len(wp.ExplainationExamplesTxt))
        )

        #
        # then divide by value of ( CAT-FELIDAE ) and divide by 2
        # If <0 then : =0   elif >1 then : =1
        #
        preference = preference / preference_base / 2

        # Apply the rule above as a real clamp.  The former code stored 1 for
        # every positive score, so the normalised value was thrown away.
        LabelNamePreference = min(max(preference, 0), 1)

        DBExecute(DBWikipedia, update_sql, LabelNamePreference, wp.PK)
def calculate_they_read():
    """Return the reference preference score of the "they read" conjugation.

    Opens ``conjugations-they-read.sqlite3``, reads the row whose ``PK`` is
    ``en§read§Verb_To_read_They_Indicative_Present§5`` and returns
    ``sqrt(len(AlternativeFormsOther)) + sqrt(len(ExplainationTxt))`` for it.
    The original comments record the value 7.244997998398398.

    Returns:
        The score of the reference row as a float.

    Raises:
        Exception: If the reference row is not present ("No THEY READ").
    """
    connection = sqlite3.connect("conjugations-they-read.sqlite3")
    try:
        rows = DBRead(
            connection,
            sql="SELECT * FROM conjugations WHERE PK = 'en§read§Verb_To_read_They_Indicative_Present§5'",
            cls=ConjugationsItem)
        try:
            c = next(rows)
        except StopIteration:
            raise Exception("No THEY READ") from None
    finally:
        # Close on every path; previously the connection leaked whenever the
        # row was missing or the read itself failed.
        connection.close()

    preference = (
        math.sqrt(len(c.AlternativeFormsOther))
        + math.sqrt(len(c.ExplainationTxt))
    )
    return preference  # 7.244997998398398
def vectorize_properties_wikipedia():
    """Vectorize every ``wikipedia`` row and store the resulting vectors.

    Each row is read as a ``dict``, passed to ``Vectorize_PKS`` with the
    row's ``LanguageCode`` as default language, and five of the returned
    entries are JSON-encoded and written by an UPDATE statement that also
    sets ``Operation_Vectorizer = 1``.

    NOTE(review): the rows are read from ``DBWikipedia`` / ``wikipedia`` but
    the UPDATE targets ``DBWiktionary`` / ``wiktionary`` and is keyed by
    ``wp["PrimaryKey"]``.  This looks like a copy/paste leftover - confirm
    the intended database, table, key and column set before relying on it.
    """
    log.info("Vectorizing wikipedia")

    # Order matches the "?" placeholders of the SET clause.
    vector_fields = (
        "Description",
        "AlsoKnownAs",
        "Instance_of",
        "Subclass_of",
        "Part_of",
    )
    update_sql = (
        " UPDATE wiktionary"
        " SET Description_Vect = ?, AlsoKnownAs_Vect = ?, Instance_of_Vect= ?,"
        " Subclass_of_Vect = ?, Part_of_Vect = ?, Operation_Vectorizer = 1"
        " WHERE PrimaryKey = ? "
    )

    all_rows = DBRead(DBWikipedia, sql=" SELECT * FROM wikipedia ", cls=dict)
    for wp in all_rows:
        log.info(" vectorize: %s", wp["LabelName"])

        vectors = Vectorize_PKS(wp, default_language=wp["LanguageCode"])

        params = [to_json(vectors[field]) for field in vector_fields]
        params.append(wp["PrimaryKey"])
        DBExecute(DBWiktionary, update_sql, *params)
def merge(wp: WikipediaItem) -> Iterable[WordItem]:
    """Yield the word items that result from merging ``wp`` into ``words``.

    Existing words are looked up case-insensitively by ``LanguageCode``,
    ``LabelName`` and ``Ext_Wikipedia_URL``.  When candidates exist, their
    ``Description`` values are matched against ``wp.ExplainationWPTxt`` with
    ``Match_List_PKS_With_Lists_Of_PKS``; each result is unpacked as a pair
    and a candidate is kept when the second element equals
    ``wp.ExplainationWPTxt``.

    * Every kept candidate is merged with ``wp`` via ``merge_words``, gets
      ``wp.PK`` appended to ``MergedWith`` and is yielded.
    * When there is no candidate, or none is kept, a fresh ``WordItem`` is
      filled from ``wp`` via ``merge_words`` and yielded instead.

    Nothing is written to the database here; the caller persists the
    yielded items.
    """
    log.info(wp)

    # Search by URL and name (case-insensitive).
    lookup_sql = (
        " SELECT * FROM words"
        " WHERE LanguageCode = ? COLLATE NOCASE"
        " AND LabelName = ? COLLATE NOCASE"
        " AND Ext_Wikipedia_URL = ? COLLATE NOCASE "
    )
    candidates = list(
        DBRead(
            DBWord,
            sql=lookup_sql,
            params=[wp.LanguageCode, wp.LabelName, wp.SelfUrlWikipedia],
            cls=WordItem,
        )
    )

    matched_words = []
    if candidates:
        descriptions = tuple(item.Description for item in candidates)
        explanations = (wp.ExplainationWPTxt,)
        matches = Match_List_PKS_With_Lists_Of_PKS(descriptions, explanations)
        for item, (_, matched_sentence) in zip(candidates, matches):
            if matched_sentence == wp.ExplainationWPTxt:
                matched_words.append(item)

    if not matched_words:
        # Nothing to merge into: append as a new word.
        log.debug("new: %s", wp)
        fresh = WordItem()
        merge_words(fresh, wp)
        yield fresh
        return

    for word in matched_words:
        log.debug("[ OK ] matched: %s == %s", word, wp)
        merge_words(word, wp)
        word.MergedWith.append(wp.PK)
        yield word
def load_wiktionary_one(DBWord, lang, label):
    """Merge the wiktionary rows of one (language, label) pair into ``words``.

    Rows are selected from ``wiktionary`` with a case-insensitive match on
    ``LanguageCode`` and ``LabelName``.  Every item yielded by ``merge`` is
    written to the ``words`` table of ``DBWord`` with ``if_exists="replace"``,
    then the source row is flagged with ``Operation_Merging = 1``.

    Args:
        DBWord: Database handle passed to ``DBWrite`` for the ``words`` table.
        lang: Value bound to the ``LanguageCode`` filter.
        label: Value bound to the ``LabelName`` filter.
    """
    row_filter = "LanguageCode=? COLLATE NOCASE AND LabelName=? COLLATE NOCASE"
    mark_merged_sql = "UPDATE wiktionary SET Operation_Merging = 1 WHERE PrimaryKey = ?"

    selected = DBRead(
        DBWiktionary,
        table="wiktionary",
        cls=WiktionaryItem,
        where=row_filter,
        params=[lang, label],
    )
    for wd in selected:
        log.info("%s", wd)

        for merged_word in merge(wd):
            DBWrite(DBWord, merged_word, table="words", if_exists="replace")

        DBExecute(DBWiktionary, mark_merged_sql, wd.PrimaryKey)
def find_word_by_label(label):
    """Placeholder lookup: always returns an empty string.

    The function issues a ``DBRead`` over all of ``words`` but never uses
    the result, and ``label`` is not consulted.

    NOTE(review): this looks unfinished - confirm what callers expect
    before implementing the actual lookup.
    """
    DBRead(DBWord, sql=" SELECT * FROM words ")
    return ''
def load_wiktionary(DBWord):
    """Merge every row of ``wiktionary`` into the ``words`` table.

    Every item yielded by ``merge`` for a row is written to the ``words``
    table of ``DBWord`` with ``if_exists="replace"``; afterwards the source
    row is flagged with ``Operation_Merging = 1``.

    Args:
        DBWord: Database handle passed to ``DBWrite`` for the ``words`` table.
    """
    log.info("loading wiktionary")

    mark_merged_sql = "UPDATE wiktionary SET Operation_Merging = 1 WHERE PrimaryKey = ?"

    source_rows = DBRead(DBWiktionary, table="wiktionary", cls=WiktionaryItem)
    for wd in source_rows:
        log.info("%s", wd)

        for merged_word in merge(wd):
            DBWrite(DBWord, merged_word, table="words", if_exists="replace")

        DBExecute(DBWiktionary, mark_merged_sql, wd.PrimaryKey)
def load_wikipedia(DBWord):
    """Merge every row of ``wikipedia`` into the ``words`` table.

    Every item yielded by ``merge`` for a row is written to the ``words``
    table of ``DBWord`` with ``if_exists="replace"``; afterwards the source
    row is flagged with ``Operation_Merging = 1``.

    Args:
        DBWord: Database handle passed to ``DBWrite`` for the ``words`` table.
    """
    log.info("loading wikipedia")

    mark_merged_sql = "UPDATE wikipedia SET Operation_Merging = 1 WHERE PK = ?"

    source_rows = DBRead(DBWikipedia, table="wikipedia", cls=WikipediaItem)
    for wd in source_rows:
        log.info("%s", wd)

        for merged_word in merge(wd):
            DBWrite(DBWord, merged_word, table="words", if_exists="replace")

        DBExecute(DBWikipedia, mark_merged_sql, wd.PK)
def merge_verbs(wt: WiktionaryItem) -> Iterator[WordItem]:
    """Yield the word items that result from merging verb ``wt`` into ``words``.

    Steps:
      1. Find words with the same ``LanguageCode`` and ``LabelName``
         (case-insensitive), ``Type = 'verb'`` and ``FromWT`` still NULL.
      2. Match their ``Description`` values against ``wt.ExplainationTxt``
         with ``Match_List_PKS_With_Lists_Of_PKS``; each result is unpacked
         as a pair and a candidate is kept when the second element equals
         ``wt.ExplainationTxt``.
      3. Merge ``wt`` into every kept candidate (``merge_words``), append
         ``wt.PrimaryKey`` to its ``MergedWith`` and yield it.

    When there is no candidate, or none is kept, a fresh ``WordItem`` is
    filled from ``wt`` via ``merge_words`` and yielded instead.  Nothing is
    written to the database here.
    """
    # Find the same verbs.
    lookup_sql = (
        " SELECT * FROM words"
        " WHERE LanguageCode = ? COLLATE NOCASE"
        " AND LabelName = ? COLLATE NOCASE"
        " AND Type = 'verb' COLLATE NOCASE"
        " AND FromWT is NULL "
    )
    candidates = list(
        DBRead(
            DBWord,
            sql=lookup_sql,
            params=[wt.LanguageCode, wt.LabelName],
            cls=WordItem,
        )
    )

    matched_words = []
    if candidates:
        log.debug("found items: %s", candidates)

        # Kept as lists so the debug output looks the same as before.
        descriptions = [item.Description for item in candidates]
        explanations = [wt.ExplainationTxt]

        log.debug("matching:")
        log.debug(" sentences1: %s", descriptions)
        log.debug(" sentences2: %s", explanations)

        matches = Match_List_PKS_With_Lists_Of_PKS(
            tuple(descriptions), tuple(explanations)
        )
        for item, (_, matched_sentence) in zip(candidates, matches):
            if matched_sentence == wt.ExplainationTxt:
                matched_words.append(item)

    if not matched_words:
        # Nothing to merge into: append as a new word.
        log.debug("new: %s", wt)
        fresh = WordItem()
        merge_words(fresh, wt)
        yield fresh
        return

    for word in matched_words:
        log.debug("[ OK ] matched: %s == %s", word, wt)
        merge_words(word, wt)
        word.MergedWith.append(wt.PrimaryKey)
        yield word
def operation_pref_wikidata(lang):
    """Compute and store ``LabelNamePreference`` for every wikidata row.

    The raw score of a row is the sum of the lengths of ``AlsoKnownAs``,
    ``Instance_of``, ``Subclass_of``, ``Part_of`` and the seven
    ``Translation_*`` fields, plus the square roots of
    ``WikipediaLinkCountTotal``, of the number of example sentences returned
    by ``get_sentences_with_label`` and of ``len(Description)``.  The score
    is divided by the reference value returned by ``calculate_cat_felidae()``
    and by 2, clamped to the range [0, 1], and written back together with
    ``Operation_Pref = 1``.

    Args:
        lang: Passed through to ``get_sentences_with_label``.
    """
    log.info("Doing operation Set Property: LabelNamePreference: Wikidata")

    preference_base = calculate_cat_felidae()

    update_sql = (
        " UPDATE wikidata"
        " SET LabelNamePreference = ?, Operation_Pref = 1"
        " WHERE PrimaryKey = ? "
    )

    rows = DBRead(
        DBWikidata,
        sql=" SELECT * FROM wikidata ",
        cls=WikidataItem,
    )
    for wd in rows:
        log.info(" set Property: LabelNamePreference: %s", wd)

        ExplainationExamplesTxt = get_sentences_with_label(
            lang, wd.Description, wd.LabelName)
        ExplainationTxt = wd.Description

        preference = (
            len(wd.AlsoKnownAs)
            + len(wd.Instance_of)
            + len(wd.Subclass_of)
            + len(wd.Part_of)
            + len(wd.Translation_EN)
            + len(wd.Translation_PT)
            + len(wd.Translation_DE)
            + len(wd.Translation_ES)
            + len(wd.Translation_FR)
            + len(wd.Translation_IT)
            + len(wd.Translation_RU)
            + math.sqrt(wd.WikipediaLinkCountTotal)
            + math.sqrt(len(ExplainationExamplesTxt))
            + math.sqrt(len(ExplainationTxt))
        )

        #
        # then divide by value of ( CAT-FELIDAE ) and divide by 2
        # If <0 then : =0   elif >1 then : =1
        #
        preference = preference / preference_base / 2

        # Apply the rule above as a real clamp.  The former code stored 1 for
        # every positive score, so the normalised value was thrown away.
        LabelNamePreference = min(max(preference, 0), 1)

        DBExecute(DBWikidata, update_sql, LabelNamePreference, wd.PrimaryKey)
def load_wikipedia_one(DBWord, lang, label):
    """Merge the wikipedia rows of one (language, label) pair into ``words``.

    Rows are selected from ``wikipedia`` with a case-insensitive match on
    ``LanguageCode`` and ``LabelName``.  Every item yielded by ``merge`` is
    written to the ``words`` table of ``DBWord``, then the source row is
    flagged with ``Operation_Merging = 1``.

    Args:
        DBWord: Database handle passed to ``DBWrite`` for the ``words`` table.
        lang: Value bound to the ``LanguageCode`` filter.
        label: Value bound to the ``LabelName`` filter.
    """
    row_filter = "LanguageCode=? COLLATE NOCASE AND LabelName=? COLLATE NOCASE"
    mark_merged_sql = "UPDATE wikipedia SET Operation_Merging = 1 WHERE PK = ?"

    selected = DBRead(
        DBWikipedia,
        table="wikipedia",
        cls=WikipediaItem,
        where=row_filter,
        params=[lang, label],
    )
    for wd in selected:
        log.info("%s", wd)

        for merged_word in merge(wd):
            # "replace", as in load_wikipedia / load_wiktionary /
            # load_wiktionary_one: merge() yields words that were just read
            # from the words table, so they must be allowed to overwrite the
            # stored version.  This loader alone used if_exists="fail".
            DBWrite(DBWord, merged_word, table="words", if_exists="replace")

        DBExecute(DBWikipedia, mark_merged_sql, wd.PK)
def invert_vector_properties(lang):
    """Call ``invert_property`` for every ``*_Vect`` property of every word.

    Idea, as recorded in the original comments:
      1. PK  -> Description[ PK1, PK2, ... ]
      2. PK1 -> Description_Inv[ ..., PK ]
         PK2 -> Description_Inv[ ..., PK ]

    The property names are taken from the instance attributes of a fresh
    ``WordItem`` whose name ends in ``_Vect`` and whose value is not
    callable.

    NOTE(review): the value returned by ``invert_property`` is not used
    here; presumably it persists its result itself - confirm.

    Args:
        lang: Passed through to ``invert_property``.
    """
    log.info("Invert vectors words")

    template = WordItem()
    # Test the attribute *value* for callability.  The former check applied
    # callable() to the attribute name, a str, so it never excluded anything.
    properties_to_invert = [
        name
        for name, value in vars(template).items()
        if not callable(value) and name.endswith("_Vect")
    ]

    for w in DBRead(DBWord, sql=" SELECT * FROM words ", cls=dict):
        log.info(" invert vector: %s", w["LabelName"])

        for prop in properties_to_invert:
            invert_property(lang, w, prop)
def invert_properties_words(lang):
    """Vectorize every row of ``words`` and store the ``*_Vect`` columns.

    Despite its name, this function vectorizes: it logs "Vectorizing words",
    calls ``Vectorize_database_record`` for every row (read as a ``dict``),
    JSON-encodes the returned entries and writes them with an UPDATE that
    also sets ``Operation_Vectorizer = 1``.

    Args:
        lang: Passed through to ``Vectorize_database_record``.
    """
    log.info("Vectorizing words")

    # One entry per "?" of the SET clause, in the same order.  "Otherwise"
    # was missing before, so 15 values were bound to 16 placeholders.
    # Assumes Vectorize_database_record returns an "Otherwise" entry like the
    # other fields - TODO confirm.
    vector_fields = (
        "ExplainationTxt",
        "AlternativeFormsOther",
        "Synonymy",
        "Antonymy",
        "Hypernymy",
        "Hyponymy",
        "Meronymy",
        "RelatedTerms",
        "Coordinate",
        "Otherwise",
        "Description",
        "AlsoKnownAs",
        "Instance_of",
        "Subclass_of",
        "Part_of",
    )
    update_sql = (
        " UPDATE words"
        " SET ExplainationTxt_Vect = ?, AlternativeFormsOther_Vect = ?,"
        " Synonymy_Vect = ?, Antonymy_Vect = ?, Hypernymy_Vect = ?,"
        " Hyponymy_Vect = ?, Meronymy_Vect = ?, RelatedTerms_Vect = ?,"
        " Coordinate_Vect = ?, Otherwise_Vect = ?, Description_Vect = ?,"
        " AlsoKnownAs_Vect = ?, Instance_of_Vect= ?, Subclass_of_Vect = ?,"
        " Part_of_Vect = ?, Operation_Vectorizer = 1"
        " WHERE PrimaryKey = ? "
    )

    for w in DBRead(DBWord, sql=" SELECT * FROM words ", cls=dict):
        log.info(" vectorize: %s", w["LabelName"])

        vetorized = Vectorize_database_record(lang, w)

        params = [to_json(vetorized[field]) for field in vector_fields]
        params.append(w["PrimaryKey"])
        DBExecute(DBWord, update_sql, *params)
def calculate_cat_felidae():
    """Return the reference preference score of the CAT-FELIDAE entry.

    Opens ``wiktionary-cat.sqlite3``, reads the row whose ``PrimaryKey`` is
    ``en-dictionary§Noun_Reference_Word_Alphabetical_with-1`` and returns
    ``calculate_preference_wiktionary`` of it.  The original comment records
    the value 27.517909943958315.

    Returns:
        Whatever ``calculate_preference_wiktionary`` returns for the
        reference row.

    Raises:
        Exception: If the reference row is not present ("No CAT-FELIDAE").
    """
    connection = sqlite3.connect("wiktionary-cat.sqlite3")
    try:
        rows = DBRead(
            connection,
            sql="SELECT * FROM wiktionary WHERE PrimaryKey = 'en-dictionary§Noun_Reference_Word_Alphabetical_with-1'",
            cls=WiktionaryItem)
        try:
            wt = next(rows)
        except StopIteration:
            raise Exception("No CAT-FELIDAE") from None
    finally:
        # Close on every path; previously the connection leaked whenever the
        # row was missing or the read itself failed.
        connection.close()

    return calculate_preference_wiktionary(wt)