def init():
    """Load one fastText model per PubMed field into the ``models`` mapping.

    For each field, the model file is expected under ``/src/models/``; when it
    is not present it is first fetched with ``download_file`` from the object
    store (the remote name carries a ``_stratTrue`` suffix). The loaded model
    is stored in ``models`` under the field name and its labels are logged at
    debug level.
    """
    field_names = ('journal_title', 'title', 'abstract', 'keywords',
                   'mesh_headings')
    for field in field_names:
        local_path = f"/src/models/pubmed_model_{field}.model"

        # Fetch the model only when no local copy exists yet.
        if not os.path.exists(local_path):
            remote_url = (
                f"https://storage.gra.cloud.ovh.net/v1/AUTH_{project_id}"
                f"/models/pubmed_model_{field}_stratTrue.model")
            download_file(remote_url, local_path)

        logger.debug(f"loading model {local_path}")
        models[field] = fasttext.load_model(local_path)
        logger.debug("nb labels : {}".format(models[field].get_labels()))
def init():
    """Ensure the PF fastText files are present locally and load the model.

    Both ``model_pf.bin`` and ``model_pf.vec`` are downloaded into
    ``/src/models/`` when they are missing. The ``.bin`` file is then loaded
    with fastText and registered in the ``model`` mapping under ``"pf"``.
    """
    # Same check-then-download step for each artefact, .bin first, then .vec.
    for filename in ("model_pf.bin", "model_pf.vec"):
        local_path = f"/src/models/{filename}"
        if not os.path.exists(local_path):
            download_file(
                f"https://storage.gra.cloud.ovh.net/v1/AUTH_{project_id}/models/{filename}",
                local_path)

    model["pf"] = fasttext.load_model('/src/models/model_pf.bin')
import fasttext
from collections import Counter
from project.server.main.bso_category import get_bso_category
from project.server.main.pf_classifier import get_pf_label
from project.server.main.utils import download_file
import pickle
import os

# Module-level bootstrap: make sure the model directory and the journal
# category table exist locally before the table is loaded.
os.system("mkdir -p /src/models/")
if not os.path.exists("/src/models/all_categ_revue.pkl"):
    download_file(
        "https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/models/all_categ_revue.pkl",
        "/src/models/all_categ_revue.pkl")

# NOTE(review): this pickle is fetched from a remote object store; unpickling
# executes arbitrary code, so the bucket must be trusted.
# The context manager guarantees the file handle is closed after loading.
with open('/src/models/all_categ_revue.pkl', 'rb') as categ_file:
    all_categ_revue = pickle.load(categ_file)


def get_categ_from_source(source, top=1):
    """Return the most frequent categories recorded for ``source``.

    Args:
        source: key looked up in ``all_categ_revue``.
        top: maximum number of categories to return (most frequent first).

    Returns:
        The selected categories joined with ``";"``. ``'unknown'`` is returned
        when the source is not in the table, when no usable category remains
        after dropping empty and ``'unknown'`` entries, or when the lookup
        fails for any other reason.
    """
    try:
        ranked = Counter(all_categ_revue[source]).most_common()
        labels = [
            label for label, _count in ranked
            if label and label not in ('unknown', '')
        ]
        ans = ";".join(labels[:top])
    except Exception:
        # Deliberate best-effort: any lookup/format problem maps to 'unknown'.
        # ``Exception`` (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return 'unknown'
    return ans or 'unknown'
def _pad_for_code(code):
    """Return ``code`` with one leading zero added when it has 1 or 3 characters."""
    if len(code) in (1, 3):
        return "0" + code
    return code


def set_FoR():
    """Build ISSN -> field-of-research lookups from FoR.xlsx and pickle them.

    The workbook is downloaded to ``{PV_MOUNT}FoR.xlsx`` and two sheets are
    read: "FoR Codes" (code -> description) and "ERA 2018 Journal List"
    (one journal per row, columns ``ISSN 1..7`` and ``FoR 1..3``).

    Two pickles are written under ``PV_MOUNT``:

    * ``issn_dict_health.pkl``: ISSN -> list of health-oriented labels, only
      for journals that have at least one such label.
    * ``issn_dict.pkl``: ISSN -> list of FoR descriptions of the journal.

    Health-oriented labels are chosen per FoR code as follows: the code's own
    description when the code is in ``for_code_health``; otherwise the
    description of its 2-character prefix when that prefix is in
    ``for_code_health``; otherwise ``"Other " + prefix description`` when the
    prefix is "06" or "11". When a journal has any label from the first two
    cases, every label containing ``"Other "`` is dropped.
    """
    # Workbook provenance (from the original comment):
    # curl https://www.arc.gov.au/file/10549/download?token=Sbfb2a9n -O FoR.xlsx
    FoR_file = download_file(
        "https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/models/FoR.xlsx",
        f"{PV_MOUNT}FoR.xlsx")
    xl = pd.ExcelFile(FoR_file)
    df_issn = xl.parse("ERA 2018 Journal List")
    df_for = xl.parse("FoR Codes")

    # FoR code (2 or 4 characters, zero-padded) -> description.
    for_dict = {}
    for _, row in df_for.iterrows():
        for_code = _pad_for_code(str(row['FoR Code']).strip())
        if len(for_code) not in [2, 4]:
            continue
        for_dict[for_code] = row['FoR Description'].strip()
    for_dict['MD'] = "Multidisciplinary"

    issn_dict = {}
    issn_dict_health = {}
    for _, row in df_issn.iterrows():
        issns = []
        for x in range(1, 8):
            issn = row['ISSN {}'.format(x)]
            if not pd.isnull(issn):
                issns.append(issn)

        fors = []
        fors_health = []
        has_health = False
        for x in range(1, 4):
            for_code = row['FoR {}'.format(x)]
            if pd.isnull(for_code):
                continue
            # Codes may have been read as floats (e.g. 101.0); drop the ".0".
            for_code = _pad_for_code(str(for_code).replace('.0', ''))
            if for_code not in for_dict:
                continue
            if for_dict[for_code] not in fors:
                fors.append(for_dict[for_code])

            prefix = for_code[0:2]
            if for_code in for_code_health:
                candidate = for_dict[for_code]
                if candidate not in fors_health:
                    fors_health.append(candidate)
                # NOTE(review): flag is set whether or not the label was a
                # duplicate - confirm against the original nesting.
                has_health = True
            elif prefix in for_code_health:
                candidate = for_dict[prefix]
                if candidate not in fors_health:
                    fors_health.append(candidate)
                has_health = True
            elif prefix in ["06", "11"]:
                candidate = "Other " + for_dict[prefix]
                if candidate not in fors_health:
                    fors_health.append(candidate)

        if has_health:
            # A specific health label supersedes the generic "Other ..." ones.
            fors_health = [k for k in fors_health if "Other " not in k]

        for issn in issns:
            issn_dict[issn] = fors
            if len(fors_health) > 0:
                issn_dict_health[issn] = fors_health

    # Context managers make sure both pickles are flushed and closed.
    with open(f"{PV_MOUNT}issn_dict_health.pkl", "wb") as health_file:
        pickle.dump(issn_dict_health, health_file)
    with open(f"{PV_MOUNT}issn_dict.pkl", "wb") as issn_file:
        pickle.dump(issn_dict, issn_file)
from project.server.main.utils import download_file import json import os import re os.system("mkdir -p /src/models/") if os.path.exists("/src/models/asjc.json") is False: download_file( "https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/models/asjc.json", "/src/models/asjc.json") asjc_data = json.load(open('/src/models/asjc.json', 'r')) asjc_dict = {} for e in asjc_data: if e.get('issn'): asjc_dict[e.get('issn')] = e def asjc_classify(elems, details=False): for e in elems: if 'issn_list' not in e: if 'journal_issns' in e: issns = e['journal_issns'] if not isinstance(issns, str): continue issn_list = [k.strip() for k in re.split(",|;", issns)] e['issn_list'] = [k for k in issn_list if len(k) > 0] else: continue res = [] for issn in e['issn_list']: