# Imports assumed by the functions in this section (stdlib plus tqdm);
# the project-internal helpers (get_experimental_contents, load_model, ...)
# are imported elsewhere in the package.
import datetime
import operator
import os
from typing import Dict

import tqdm


def compute_resource_continuouswikifier(*, resource_ids, **kwargs):
    """Run continuous wikification on the raw content of each resource."""
    recovered = get_experimental_contents(resource_ids, return_content_raw=True)
    return [{
        "resource_id": res["id"],
        "value": continuous_wikification(res["content_raw"], **kwargs)
    } for res in recovered]

def preprocess_res(resource_ids, **kwargs):
    """Preprocess the raw content of each resource."""
    recovered = get_experimental_contents(resource_ids, return_content_raw=True)
    return [{
        "resource_id": res["id"],
        "value": __preprocess(res["content_raw"], **kwargs)
    } for res in recovered]

def get_resource_difficulty(resource_ids):
    """Estimate resource difficulty as wikified concepts per second."""
    res_lens = get_experimental_contents(resource_ids)
    res_wks = get_experimental_features(resource_ids,
                                        [__DEFAULT_EXPID_SETTING["SIMPLE"]])
    res_valid = get_valid_resources(res_lens, res_wks)
    return [{
        "resource_id": res[0],
        "value": wikification2con_per_sec(res[1], len(res[2]["concepts"]))
    } for res in res_valid]

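# Illustrative sketch only. The real wikification2con_per_sec helper lives
# elsewhere in the codebase; this hypothetical version shows the intended
# "concepts per second" density score, under our assumption that the first
# argument carries the resource duration in seconds.
def _con_per_sec_sketch(duration_seconds: float, n_concepts: int) -> float:
    """Difficulty proxy: number of wikified concepts per second of content."""
    if duration_seconds <= 0:
        return 0.0
    return n_concepts / duration_seconds
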
def continuous_doc2vec_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING,
                                       exp_id: int = __DEFAULT_EXPID_SETTING):
    """(Re)compute continuous-doc2vec vectors for all resources and store them."""
    model = load_model('ccmllt')
    lids = list(get_all_resource_ids())
    if resume:
        # Skip resources whose vectors are already in the DB.
        lids_computed = list(get_all_computed_resource_ids(exp_id))
        print(f"Global number of resources: {len(lids)}")
        print(f"Number of already computed resources: {len(lids_computed)}")
        lids = list(set(lids) - set(lids_computed))
        print(f"Number of resources still to compute: {len(lids)}")
    print("Sample of ids from the DB that will be computed:")
    print(lids[0:100])
    # lids = lids[0:1002]
    chunk = 0
    records = {}
    batch_size = 1000
    for text, rid in ((t["content_raw"], t["id"])
                      for t in tqdm.tqdm(get_experimental_contents(
                          lids, order_needed=False, return_content_raw=True),
                                         total=len(lids),
                                         desc="continuousdoc2vec done")):
        try:
            if rid in model[0]:
                # Resource was part of the training set: reuse its vectors.
                records[rid] = {
                    'value': recover_vectors(rid, model),
                    'interpolate': False
                }
            else:
                # Unseen resource: infer (interpolate) vectors from the text.
                records[rid] = {
                    'value': recover_vectors(text, model),
                    'interpolate': True
                }
        except Exception as error:
            print("ErrorFATAL:", rid)
            print(error)
            records[rid] = {"value": {"error": str(error)}}
            # A failure here is treated as fatal: re-raise and stop the run.
            raise error
        chunk += 1
        if chunk == batch_size:
            print("One batch submitted to DB:")
            print(records.keys())
            insert_experiment_result(exp_id, records.items(), update=not resume)
            chunk = 0
            records = {}
    if 0 < chunk < batch_size:
        # Flush the remaining partial batch.
        print("Last batch submitted to DB:")
        print(records.keys())
        insert_experiment_result(exp_id, records.items(), update=not resume)

def doc2vec_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING,
                            exp_id: int = __DEFAULT_EXPID_SETTING):
    """(Re)compute doc2vec vectors for all resources and store them."""
    model = load_model('dcmllt')
    lids = list(get_all_resource_ids())
    if resume:
        # Skip resources whose vectors are already in the DB.
        lids_computed = list(get_all_computed_resource_ids(exp_id))
        print("Global number of resources:", len(lids))
        print("Number of already computed resources:", len(lids_computed))
        lids = list(set(lids) - set(lids_computed))
        print("Number of resources still to compute:", len(lids))
    print("Sample of ids from the DB that will be computed:")
    print(lids[0:100])
    # lids = lids[-100:]
    chunk = 0
    records = {}
    batch_size = 1000
    for r, t in tqdm.tqdm(
            ((res["id"], res["content_raw"])
             for res in get_experimental_contents(
                 lids, order_needed=False, return_content_raw=True)),
            total=len(lids),
            desc="doc2vec done"):
        try:
            try:
                # Resource was in the training set: look its vector up by id.
                records[r] = {
                    'value': recover_vector(r, model).tolist(),
                    'interpolate': False
                }
            except KeyError:
                # Unseen resource: infer (interpolate) a vector from the text.
                records[r] = {
                    'value': recover_vector(t, model).tolist(),
                    'interpolate': True
                }
        except Exception as e:
            print("ErrorFATAL:", r)
            print(e)
            records[r] = {'value': {"error": str(e)}}
            # raise e
        chunk += 1
        if chunk == batch_size:
            print("One batch submitted to DB:")
            print(records.keys())
            insert_experiment_result(exp_id, records.items(), update=not resume)
            chunk = 0
            records = {}
    if 0 < chunk < batch_size:
        # Flush the remaining partial batch.
        print("Last batch submitted to DB:")
        print(records.keys())
        insert_experiment_result(exp_id, records.items(), update=not resume)

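# Illustrative sketch only. The lookup-then-infer fallback used by
# doc2vec_model_update_DB mirrors gensim's Doc2Vec API; this hypothetical
# recover_vector_sketch shows the same logic with gensim 4.x, assuming the
# model was trained with resource ids as document tags.
def recover_vector_sketch(doc_id, text, model):
    """Return the vector learned at training time for doc_id, or infer one
    from the raw text if the model has never seen this resource."""
    from gensim.utils import simple_preprocess  # local import: sketch only
    try:
        return model.dv[doc_id]  # raises KeyError for unknown ids
    except KeyError:
        return model.infer_vector(simple_preprocess(text))
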
def continuous_doc2vec_createmodel():
    """Train a continuous-doc2vec model from all resources in the DB."""
    lids = list(get_all_resource_ids())
    print("Sample of ids from the DB:")
    print(lids[0:100])
    ltexts = tqdm.tqdm(((res["id"], res["content_raw"])
                        for res in get_experimental_contents(
                            lids, order_needed=True, return_content_raw=True)),
                       total=len(lids),
                       desc="continuousdoc2vec_createmodel done")
    train_a_part_model_fromdb(
        ltexts,
        f"x5gonwp3models/models/continuousdoc2vec/model/{datetime.date.today()}/{datetime.date.today()}",
        vector_size=300,
        window=5,
        min_count=1)

def doc2vec_createmodel():
    """Train a doc2vec model from all resources in the DB."""
    lids = list(get_all_resource_ids())
    print("Sample of ids from the DB:")
    print(lids[0:100])
    ltexts = tqdm.tqdm(
        ((res["id"], res["content_raw"])
         for res in get_experimental_contents(lids, return_content_raw=True)),
        total=len(lids),
        desc="doc2vec_createmodel done")
    train_a_model_fromdb(ltexts,
                         "x5gonwp3models/models/doc2vec/model/" +
                         str(datetime.date.today()) + "/" +
                         str(datetime.date.today()),
                         vector_size=300,
                         window=5,
                         min_count=1)

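# Illustrative sketch only. train_a_model_fromdb is a project helper; this
# hypothetical version shows equivalent gensim 4.x training over the same
# (id, text) pairs and hyperparameters (workers and epochs are assumptions).
def train_doc2vec_sketch(id_text_pairs, model_path):
    """Train a Doc2Vec model tagged by resource id and save it to disk."""
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from gensim.utils import simple_preprocess
    corpus = [TaggedDocument(simple_preprocess(text), [str(rid)])
              for rid, text in id_text_pairs]
    model = Doc2Vec(corpus, vector_size=300, window=5, min_count=1,
                    workers=4, epochs=20)
    model.save(model_path)
    return model
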
def tfidf_model_update_DB(min_n: int = 1,
                          max_n: int = 2,
                          exp_id: Dict[str, int] = __DEFAULT_EXPID_SETTING,
                          batch_size: int = 1000):
    """Train per-n-gram-range TF-IDF models and store per-resource scores.

    NOTE: the insert calls below hard-code the "[1-1]-grams" and
    "[1-2]-grams" keys, so they assume the default min_n=1, max_n=2.
    """
    lids = list(get_all_resource_ids())
    print("Sample of ids from the DB that will be computed:")
    print(lids[0:100])
    tfidf = {
        **{f"[{min_n}-{n}]-grams": {} for n in range(min_n, max_n + 1)},
        "SIMPLE": {}
    }
    records = {
        **{f"[{min_n}-{n}]-grams": {} for n in range(min_n, max_n + 1)},
        "SIMPLE": {}
    }
    chunk = 0
    # lids = lids[0:3]
    # print(lids)
    for n in range(min_n, max_n + 1):
        path = os.path.join("x5gonwp3models", "models", "tfidf", "model",
                            str(datetime.date.today()), f"[{min_n}-{n}]-grams")
        ltexts, rlids = zip(
            *((preprocess(t["content_raw"]), t["id"])
              for t in tqdm.tqdm(get_experimental_contents(
                  lids, order_needed=False, return_content_raw=True),
                                 total=len(lids),
                                 desc="tfidf done")))
        tfidf[f"[{min_n}-{n}]-grams"] = tfidf_ngrams(
            ltexts,
            min_n=min_n,
            max_n=n,
            return_format="dict",
            sort_keywords=(min_n == 1 and max_n == 2))
        save_model(path=path, model=tfidf[f"[{min_n}-{n}]-grams"]["model"])
        for i, r in enumerate(rlids):
            try:
                for vname, res in tfidf.items():
                    # Skip the derived SIMPLE variant and any n-gram range
                    # not computed yet on this pass (its dict is still empty).
                    if vname == "SIMPLE" or not res:
                        continue
                    records[vname][r] = {"value": tfidf[vname]['X'][i]}
                    if vname == "[1-2]-grams":
                        # SIMPLE keeps only the 50 highest-scoring keywords,
                        # plus their scores normalised by the document total.
                        sum_all_scores = sum(tfidf[vname]['X'][i].values())
                        records["SIMPLE"][r] = {
                            "value": dict(
                                sorted(tfidf[vname]['X'][i].items(),
                                       key=operator.itemgetter(1))[-50:])
                        }
                        records["SIMPLE"][r]["value_norm"] = {
                            k: (v / sum_all_scores)
                            for (k, v) in records["SIMPLE"][r]["value"].items()
                        }
            except Exception as e:
                print(i, r)
                raise e
            chunk += 1
            if chunk == batch_size:
                insert_experiment_result(exp_id["[1-2]-grams"],
                                         records["[1-2]-grams"].items())
                insert_experiment_result(exp_id["[1-1]-grams"],
                                         records["[1-1]-grams"].items())
                insert_experiment_result(exp_id["SIMPLE"],
                                         records["SIMPLE"].items())
                records = {
                    **{f"[{min_n}-{n}]-grams": {}
                       for n in range(min_n, max_n + 1)},
                    "SIMPLE": {}
                }
                chunk = 0
        if 0 < chunk < batch_size:
            # Flush the remaining partial batch before the next n-gram pass.
            insert_experiment_result(exp_id["[1-2]-grams"],
                                     records["[1-2]-grams"].items())
            insert_experiment_result(exp_id["[1-1]-grams"],
                                     records["[1-1]-grams"].items())
            insert_experiment_result(exp_id["SIMPLE"],
                                     records["SIMPLE"].items())
            records = {
                **{f"[{min_n}-{n}]-grams": {}
                   for n in range(min_n, max_n + 1)},
                "SIMPLE": {}
            }
            chunk = 0

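# Illustrative sketch only. tfidf_ngrams is a project helper; this
# hypothetical version shows the {"model": ..., "X": [{term: score}, ...]}
# shape consumed above, built on scikit-learn's TfidfVectorizer (the exact
# return shape is our assumption).
def tfidf_ngrams_sketch(texts, min_n=1, max_n=2):
    """Fit a TF-IDF model and return one {term: score} dict per document."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=(min_n, max_n))
    matrix = vectorizer.fit_transform(texts)
    terms = vectorizer.get_feature_names_out()
    rows = []
    for row in matrix:  # one sparse row per input document
        _, cols = row.nonzero()
        rows.append({terms[j]: row[0, j] for j in cols})
    return {"model": vectorizer, "X": rows}
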
# NOTE: this redefinition shadows the concepts-per-second variant of
# get_resource_difficulty defined above; only this characters-per-second
# version is visible at import time.
def get_resource_difficulty(resource_ids):
    """Estimate resource difficulty as characters per second."""
    recovered = get_experimental_contents(resource_ids)
    return [{
        "resource_id": res["id"],
        "value": char_per_sec(res["value"])
    } for res in recovered]

def wikifier_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING,
                             exp_id: dict = __DEFAULT_EXPID_SETTING,
                             batch_size: int = 1000):
    """Wikify every resource and store the FULL, CLASSIC and SIMPLE variants."""
    lids = list(get_all_resource_ids())
    if resume:
        # Skip resources whose annotations are already in the DB.
        lids_computed = list(get_all_computed_resource_ids(exp_id["CLASSIC"]))
        print("Global number of resources:", len(lids))
        print("Number of already computed resources:", len(lids_computed))
        lids = list(set(lids) - set(lids_computed))
        print("Number of resources still to compute:", len(lids))
    print("Sample of ids from the DB that will be computed:")
    print(lids[0:100])
    wikifier = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
    records = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
    chunk = 0
    # lids = lids[:3]
    # print(lids)
    for r, t in tqdm.tqdm(((res["id"], res["content_raw"])
                           for res in get_experimental_contents(
                               lids, order_needed=False,
                               return_content_raw=True)),
                          total=len(lids),
                          desc="wikifier done"):
        try:
            # Compute the FULL annotation once, then derive CLASSIC and
            # SIMPLE by successive filtering.
            wikifier_full_tmp = wikification(t, subprocess=4,
                                             wikification_type="FULL")
            wikifier["FULL"][r] = wikifier_full_tmp
            wikifier["CLASSIC"][r] = wikification_filter(
                wikifier_full_tmp, wikification_type_needed="CLASSIC")
            wikifier["SIMPLE"][r] = wikification_filter(
                wikifier["CLASSIC"][r], wikification_type_needed="SIMPLE")
        except Exception as e:
            print("ErrorFATAL:", r)
            wikifier["FULL"][r] = {"error": str(e)}
            wikifier["CLASSIC"][r] = {"error": str(e)}
            wikifier["SIMPLE"][r] = {"error": str(e)}
            # print(e)
        records["FULL"][r] = {'value': wikifier["FULL"][r]}
        records["CLASSIC"][r] = {'value': wikifier["CLASSIC"][r]}
        records["SIMPLE"][r] = {'value': wikifier["SIMPLE"][r]}
        chunk += 1
        if chunk == batch_size:
            print("One batch submitted to DB:")
            insert_experiment_result(exp_id["FULL"], records["FULL"].items(),
                                     update=not resume)
            insert_experiment_result(exp_id["CLASSIC"],
                                     records["CLASSIC"].items(),
                                     update=not resume)
            insert_experiment_result(exp_id["SIMPLE"],
                                     records["SIMPLE"].items(),
                                     update=not resume)
            wikifier = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
            records = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
            chunk = 0
    if 0 < chunk < batch_size:
        # Flush the remaining partial batch.
        print("Last batch submitted to DB:")
        insert_experiment_result(exp_id["FULL"], records["FULL"].items(),
                                 update=not resume)
        insert_experiment_result(exp_id["CLASSIC"],
                                 records["CLASSIC"].items(),
                                 update=not resume)
        insert_experiment_result(exp_id["SIMPLE"],
                                 records["SIMPLE"].items(),
                                 update=not resume)

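# Illustrative sketch only. The three *_update_DB functions above share the
# same accumulate-and-flush pattern; this hypothetical helper factors it out
# (the name and the record_stream shape are ours, not part of the codebase).
def batched_insert_sketch(exp_id, record_stream, batch_size=1000, update=False):
    """Accumulate (resource_id, record) pairs and flush them to the DB in
    fixed-size batches via insert_experiment_result."""
    records = {}
    for rid, record in record_stream:
        records[rid] = record
        if len(records) == batch_size:
            insert_experiment_result(exp_id, records.items(), update=update)
            records = {}
    if records:
        # Flush the remaining partial batch.
        insert_experiment_result(exp_id, records.items(), update=update)
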
def get_resource_oermetainfos(resource_ids):
    """Return the stored meta-information for each resource."""
    recovered = get_experimental_contents(resource_ids)
    return [{
        "resource_id": res["id"],
        "value": res["value"]
    } for res in recovered]