Example #1
File: core.py Project: X5GON/lamapi
def compute_resource_continuouswikifier(*, resource_ids, **kwargs):
    recovered = get_experimental_contents(resource_ids,
                                          return_content_raw=True)
    return [{
        "resource_id": res["id"],
        "value": continuous_wikification(res["content_raw"], **kwargs)
    } for res in recovered]
Example #2
def preprocess_res(resource_ids, **kwargs):
    recovered = get_experimental_contents(resource_ids,
                                          return_content_raw=True)
    return [{
        "resource_id": res["id"],
        "value": __preprocess(res["content_raw"], **kwargs)
    } for res in recovered]
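Examples #1 and #2 lean on the same contract: get_experimental_contents(ids, return_content_raw=True) yields one dict per resource carrying "id" and "content_raw" (or "value" when raw content is not requested, as in later examples). A minimal stub of that shape, purely hypothetical and for illustration; the real helper queries the project's database:

def get_experimental_contents_stub(resource_ids, return_content_raw=False):
    # Fake in-memory "DB"; the returned key mirrors the flag,
    # as the snippets in this listing suggest.
    fake_db = {1: "first raw transcript ...", 2: "second raw transcript ..."}
    key = "content_raw" if return_content_raw else "value"
    return [{"id": rid, key: fake_db[rid]} for rid in resource_ids]

print(get_experimental_contents_stub([1, 2], return_content_raw=True))
# [{'id': 1, 'content_raw': 'first raw transcript ...'},
#  {'id': 2, 'content_raw': 'second raw transcript ...'}]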
Example #3
File: core.py Project: X5GON/lamapi
def get_resource_difficulty(resource_ids):
    res_lens = get_experimental_contents(resource_ids)
    res_wks = get_experimental_features(resource_ids,
                                        [__DEFAULT_EXPID_SETTING["SIMPLE"]])
    res_valid = get_valid_resources(res_lens, res_wks)
    return [{
        "resource_id": res[0],
        "value": wikification2con_per_sec(res[1], len(res[2]['concepts']))
    } for res in res_valid]
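Difficulty here is a rate: how many Wikipedia concepts a resource packs per unit of material. A toy sketch of that idea, assuming wikification2con_per_sec divides a concept count by a duration in seconds; its real signature lives elsewhere in the project and is not shown in these snippets:

def concepts_per_second(duration_sec, n_concepts):
    # Hypothetical: concept density as a crude difficulty proxy.
    return n_concepts / duration_sec if duration_sec else 0.0

print(concepts_per_second(600, 42))  # 0.07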
Example #4
File: core.py Project: X5GON/lamapi
def continuous_doc2vec_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING,
                                       exp_id: int = __DEFAULT_EXPID_SETTING):
    model = load_model('ccmllt')
    lids = list(get_all_resource_ids())
    if resume:
        lids_computed = list(get_all_computed_resource_ids(exp_id))
        print(f"We are talking about global nbr of resources: {len(lids)}")
        print(
            f"We are talking about nbr of computed resources: {len(lids_computed)}"
        )
        lids = list(set(lids) - set(lids_computed))
        print(
            f"We are talking about nbr of tobe_computed resources: {len(lids)}"
        )
    print("Some ids samples from DB that will be computed:")
    print(lids[0:100])
    # lids = lids[0:1002]
    chunk = 0
    records = {}
    batch_size = 1000
    for text, rid in ((t["content_raw"], t['id'])
                      for t in tqdm.tqdm(get_experimental_contents(
                          lids, order_needed=False, return_content_raw=True),
                                         total=len(lids),
                                         desc="continuousdoc2vec done")):
        try:
            if rid in model[0]:
                records[rid] = {
                    'value': recover_vectors(rid, model),
                    'interpolate': False
                }
            else:
                records[rid] = {
                    'value': recover_vectors(text, model),
                    'interpolate': True
                }
        except Exception as error:
            print("ErrorFATAL:", rid)
            print(error)
            records[rid] = {"value": {"error": str(error)}}
            # Fail fast: re-raising aborts the run, so the error record
            # above is never flushed to the DB (contrast with
            # doc2vec_model_update_DB below, which logs and keeps going).
            raise error
        chunk += 1
        if chunk == batch_size:
            print("One part submitted to DB:")
            print(records.keys())
            insert_experiment_result(exp_id,
                                     records.items(),
                                     update=not resume)
            chunk = 0
            records = {}
    if chunk > 0 and chunk < batch_size:
        print("Last part submitted to DB:")
        print(records.keys())
        insert_experiment_result(exp_id, records.items(), update=not resume)
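Example #4 above (and #5 below) share one flush idiom: accumulate records in a dict, write every batch_size items via insert_experiment_result, then flush the short remainder after the loop. A standalone sketch of just that idiom, where sink is a stand-in for the DB call:

def flush_in_batches(items, batch_size, sink):
    # Accumulate (key, value) pairs and hand them to `sink` every
    # `batch_size` items; flush whatever is left at the end.
    records, chunk = {}, 0
    for key, value in items:
        records[key] = value
        chunk += 1
        if chunk == batch_size:
            sink(records.items())
            records, chunk = {}, 0
    if records:  # the last, short batch
        sink(records.items())

flush_in_batches(((i, i * i) for i in range(7)), 3,
                 lambda batch: print(list(batch)))
# [(0, 0), (1, 1), (2, 4)]
# [(3, 9), (4, 16), (5, 25)]
# [(6, 36)]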
Example #5
File: core.py Project: X5GON/lamapi
def doc2vec_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING,
                            exp_id: int = __DEFAULT_EXPID_SETTING):
    model = load_model('dcmllt')
    lids = list(get_all_resource_ids())
    if resume:
        lids_computed = list(get_all_computed_resource_ids(exp_id))
        print("We are talking about global nbr of resources: ", len(lids))
        print("We are talking about nbr of computed resources: ",
              len(lids_computed))
        lids = list(set(lids) - set(lids_computed))
        print("We are talking about nbr of tobe_computed resources: ",
              len(lids))
    print("Some ids samples from DB that will be computed:")
    print(lids[0:100])
    # lids = lids[-100:]
    chunk = 0
    records = {}
    batch_size = 1000
    for r, t in tqdm.tqdm(
        ((res["id"], res["content_raw"]) for res in get_experimental_contents(
            lids, order_needed=False, return_content_raw=True)),
            total=len(lids),
            desc="doc2vec done"):
        try:
            try:
                records[r] = {
                    'value': recover_vector(r, model).tolist(),
                    'interpolate': False
                }
            except KeyError:
                records[r] = {
                    'value': recover_vector(t, model).tolist(),
                    'interpolate': True
                }
        except Exception as e:
            print("ErrorFATAL:", r)
            print(e)
            records[r] = {'value': {"error": str(e)}}
            # raise e
        chunk += 1
        if chunk == batch_size:
            # todo record in db
            print("One part submitted to DB:")
            print(records.keys())
            insert_experiment_result(exp_id,
                                     records.items(),
                                     update=not resume)
            chunk = 0
            records = {}
    if chunk > 0 and chunk < batch_size:
        print("Last part submitted to DB:")
        print(records.keys())
        insert_experiment_result(exp_id, records.items(), update=not resume)
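The interpolate flag records provenance: a resource id seen at training time gets its vector read back from the model, while an unseen one gets a vector inferred from the raw text. A hedged sketch with gensim, assuming the loaded model wraps a gensim Doc2Vec, which the recover_vector/KeyError pattern suggests but these snippets do not confirm:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [TaggedDocument(words=text.split(), tags=[rid])
          for rid, text in [("10", "machine learning basics"),
                            ("11", "open course video transcript")]]
model = Doc2Vec(corpus, vector_size=16, min_count=1, epochs=5)

def vector_for(rid, text):
    if rid in model.dv:                     # seen at training time:
        return model.dv[rid], False         # stored vector, no interpolation
    return model.infer_vector(text.split()), True  # unseen: infer from text

vec, interpolated = vector_for("99", "a brand new transcript")
print(interpolated)  # True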
Example #6
File: core.py Project: X5GON/lamapi
def continuous_doc2vec_createmodel():
    lids = list(get_all_resource_ids())
    print("Some ids samples from DB:")
    print(lids[0:100])
    batch_size = 1000
    ltexts = tqdm.tqdm(((res["id"], res["content_raw"])
                        for res in get_experimental_contents(
                            lids, order_needed=True, return_content_raw=True)),
                       total=len(lids),
                       desc="continuousdoc2vec_createmodel done")
    model = train_a_part_model_fromdb(
        ltexts,
        f"x5gonwp3models/models/continuousdoc2vec/model/{datetime.date.today()}/{datetime.date.today()}",
        vector_size=300,
        window=5,
        min_count=1)
Example #7
File: core.py Project: X5GON/lamapi
def doc2vec_createmodel():
    lids = list(get_all_resource_ids())
    print("Some ids samples from DB:")
    print(lids[0:100])
    batch_size = 1000
    ltexts = tqdm.tqdm(
        ((res["id"], res["content_raw"])
         for res in get_experimental_contents(lids, return_content_raw=True)),
        total=len(lids),
        desc="doc2vec_createmodel done")
    train_a_model_fromdb(ltexts,
                         "x5gonwp3models/models/doc2vec/model/" +
                         str(datetime.date.today()) + "/" +
                         str(datetime.date.today()),
                         vector_size=300,
                         window=5,
                         min_count=1)
Example #8
File: core.py Project: X5GON/lamapi
def tfidf_model_update_DB(min_n: int = 1,
                          max_n: int = 2,
                          exp_id: Dict[str, int] = __DEFAULT_EXPID_SETTING,
                          batch_size: int = 1000):
    lids = list(get_all_resource_ids())
    print("Some ids samples from DB that will be computed:")
    print(lids[0:100])
    tfidf = {
        **{f"[{min_n}-{n}]-grams": {}
           for n in range(min_n, max_n + 1)}, "SIMPLE": {}
    }
    records = {
        **{f"[{min_n}-{n}]-grams": {}
           for n in range(min_n, max_n + 1)}, "SIMPLE": {}
    }
    chunk = 0
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # lids = lids[0:3]
    # print(lids)
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    for n in range(min_n, max_n + 1):

        path = os.path.join("x5gonwp3models", "models", "tfidf", "model",
                            str(datetime.date.today()), f"[{min_n}-{n}]-grams")
        ltexts, rlids = zip(
            *((preprocess(t["content_raw"]), t['id'])
              for t in tqdm.tqdm(get_experimental_contents(
                  lids, order_needed=False, return_content_raw=True),
                                 total=len(lids),
                                 desc="tfidf done")))
        tfidf[f"[{min_n}-{n}]-grams"] = tfidf_ngrams(
            ltexts,
            min_n=min_n,
            max_n=n,
            return_format="dict",
            sort_keywords=(min_n == 1 and max_n == 2))
        save_model(path=path, model=tfidf[f"[{min_n}-{n}]-grams"]["model"])
    for i, r in enumerate(rlids):
        try:
            for vname, res in tfidf.items():
                if vname == "SIMPLE":
                    continue
                records[vname][r] = {"value": tfidf[vname]['X'][i]}
                if vname == "[1-2]-grams":
                    sum_all_scores = sum(list(tfidf[vname]['X'][i].values()))
                    records["SIMPLE"][r] = {
                        "value":
                        dict(
                            sorted(tfidf[vname]['X'][i].items(),
                                   key=operator.itemgetter(1))[-50:])
                    }
                    records["SIMPLE"][r]["value_norm"] = {
                        k: (v / sum_all_scores)
                        for (k, v) in records["SIMPLE"][r]["value"].items()
                    }
        except Exception as e:
            print(i, r)
            raise e
        chunk += 1
        if chunk == batch_size:
            # todo record in db
            insert_experiment_result(exp_id["[1-2]-grams"],
                                     records["[1-2]-grams"].items())
            insert_experiment_result(exp_id["[1-1]-grams"],
                                     records["[1-1]-grams"].items())
            insert_experiment_result(exp_id["SIMPLE"],
                                     records["SIMPLE"].items())
            records = {
                **{
                    f"[{min_n}-{n}]-grams": {}
                    for n in range(min_n, max_n + 1)
                }, "SIMPLE": {}
            }
            chunk = 0
    if chunk > 0 and chunk < batch_size:
        insert_experiment_result(exp_id["[1-2]-grams"],
                                 records["[1-2]-grams"].items())
        insert_experiment_result(exp_id["[1-1]-grams"],
                                 records["[1-1]-grams"].items())
        insert_experiment_result(exp_id["SIMPLE"], records["SIMPLE"].items())
        records = {
            **{f"[{min_n}-{n}]-grams": {}
               for n in range(min_n, max_n + 1)}, "SIMPLE": {}
        }
        chunk = 0
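The SIMPLE variant above keeps only the 50 highest-scoring [1-2]-gram keywords and adds a value_norm view, dividing each kept score by the total score mass of the whole document, so the normalised shares sum to at most 1. The same post-processing on a toy score dict, keeping the top 2 instead of the top 50:

import operator

scores = {"neural": 4.0, "network": 3.0, "the": 0.5, "data": 2.5}
total = sum(scores.values())                         # 10.0, over *all* terms
top = dict(sorted(scores.items(),                    # ascending sort; keep
                  key=operator.itemgetter(1))[-2:])  # the last (largest) two
top_norm = {k: v / total for k, v in top.items()}
print(top)       # {'network': 3.0, 'neural': 4.0}
print(top_norm)  # {'network': 0.3, 'neural': 0.4}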
Example #9
File: core.py Project: X5GON/lamapi
def get_resource_difficulty(resource_ids):
    recovered = get_experimental_contents(resource_ids)
    return [{"resource_id": res["id"],
             "value": char_per_sec(res["value"])} for res in recovered]
Example #10
def wikifier_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING,
                             exp_id: dict = __DEFAULT_EXPID_SETTING,
                             batch_size: int = 1000
                             ):
    lids = list(get_all_resource_ids())
    if resume:
        lids_computed = list(get_all_computed_resource_ids(exp_id["CLASSIC"]))
        print("We are talking about global nbr of resources: ", len(lids))
        print("We are talking about nbr of computed resources: ",
              len(lids_computed))
        lids = list(set(lids) - set(lids_computed))
        print("We are talking about nbr of tobe_computed resources: ",
              len(lids))
    print("Some ids samples from DB that will be computed:")
    print(lids[0:100])
    wikifier = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
    records = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
    chunk = 0
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # lids = lids[:3]
    # print(lids)
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    for r, t in tqdm.tqdm(((res["id"], res["content_raw"])
                           for res in get_experimental_contents(
                               lids, order_needed=False,
                               return_content_raw=True)),
                          total=len(lids),
                          desc="wikifier done"):
        try:
            wikifier_full_tmp = wikification(t,
                                             subprocess=4,
                                             wikification_type="FULL")
            wikifier["FULL"][r] = wikifier_full_tmp
            wikifier["CLASSIC"][r] = wikification_filter(wikifier_full_tmp,
                                                         wikification_type_needed="CLASSIC")
            wikifier["SIMPLE"][r] = wikification_filter(wikifier["CLASSIC"][r],
                                                        wikification_type_needed="SIMPLE")
        except Exception as e:
            print("ErrorFATAL:", r)
            wikifier["FULL"][r] = {"error": str(e)}
            wikifier["CLASSIC"][r] = {"error": str(e)}
            wikifier["SIMPLE"][r] = {"error": str(e)}
            # print(e)
        records["FULL"][r] = {'value': wikifier["FULL"][r]}
        records["CLASSIC"][r] = {'value': wikifier["CLASSIC"][r]}
        records["SIMPLE"][r] = {'value': wikifier["SIMPLE"][r]}
        chunk += 1
        if chunk == batch_size:
            # todo record in db
            print("One part submitted to DB:")
            insert_experiment_result(exp_id["FULL"],
                                     records["FULL"].items(), update=not resume)
            insert_experiment_result(exp_id["CLASSIC"],
                                     records["CLASSIC"].items(), update=not resume)
            insert_experiment_result(exp_id["SIMPLE"],
                                     records["SIMPLE"].items(), update=not resume)
            wikifier = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
            records = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
            chunk = 0
    if chunk > 0 and chunk < batch_size:
        print("Last part submitted to DB:")
        insert_experiment_result(exp_id["FULL"],
                                 records["FULL"].items(), update=not resume)
        insert_experiment_result(exp_id["CLASSIC"],
                                 records["CLASSIC"].items(), update=not resume)
        insert_experiment_result(exp_id["SIMPLE"],
                                 records["SIMPLE"].items(), update=not resume)
        wikifier = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
        records = {"FULL": {}, "CLASSIC": {}, "SIMPLE": {}}
        chunk = 0
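The three wikifier variants form a cascade: one FULL annotation per resource, then progressively lighter views derived locally, CLASSIC filtered from FULL and SIMPLE filtered from CLASSIC, so the expensive wikification call runs only once. A hedged sketch of such a filter; wikification_filter is the project's own function, and the concept field names below are illustrative assumptions:

def wikification_filter_sketch(annotation, wikification_type_needed):
    # Hypothetical: keep fewer fields per concept as the view gets simpler.
    keep = {"CLASSIC": ("title", "url", "pageRank", "cosine"),
            "SIMPLE": ("title", "url")}[wikification_type_needed]
    return {"concepts": [{k: c[k] for k in keep if k in c}
                         for c in annotation.get("concepts", [])]}

full = {"concepts": [{"title": "Entropy",
                      "url": "https://en.wikipedia.org/wiki/Entropy",
                      "pageRank": 0.8, "cosine": 0.6}]}
classic = wikification_filter_sketch(full, "CLASSIC")
simple = wikification_filter_sketch(classic, "SIMPLE")
print(simple)
# {'concepts': [{'title': 'Entropy',
#                'url': 'https://en.wikipedia.org/wiki/Entropy'}]}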
Example #11
def get_resource_oermetainfos(resource_ids):
    recovered = get_experimental_contents(resource_ids)
    return [{
        "resource_id": res["id"],
        "value": res["value"]
    } for res in recovered]