Example #1
0
def init():
    """Make each per-field PubMed fastText model available and load it.

    For every field name, the model file is fetched with ``download_file``
    when it is not already on disk, then loaded with ``fasttext.load_model``
    and stored in the ``models`` mapping under the field name.
    """
    field_names = ('journal_title', 'title', 'abstract', 'keywords', 'mesh_headings')
    for f in field_names:
        model_name = f"/src/models/pubmed_model_{f}.model"
        already_cached = os.path.exists(model_name)
        if not already_cached:
            # Only build the remote URL when a download is really needed.
            remote_url = f"https://storage.gra.cloud.ovh.net/v1/AUTH_{project_id}/models/pubmed_model_{f}_stratTrue.model"
            download_file(remote_url, model_name)
        logger.debug(f"loading model {model_name}")
        models[f] = fasttext.load_model(model_name)
        # Logs the label list returned by the freshly loaded model.
        logger.debug("nb labels : {}".format(models[f].get_labels()))
def init():
    """Make the "pf" fastText model files available and load the model.

    The ``.bin`` and ``.vec`` files are each fetched with ``download_file``
    when missing from ``/src/models/``; the ``.bin`` file is then loaded and
    stored in the ``model`` mapping under the key ``"pf"``.
    """
    bin_path = "/src/models/model_pf.bin"
    vec_path = "/src/models/model_pf.vec"

    bin_missing = not os.path.exists(bin_path)
    if bin_missing:
        download_file(
            f"https://storage.gra.cloud.ovh.net/v1/AUTH_{project_id}/models/model_pf.bin",
            bin_path)

    vec_missing = not os.path.exists(vec_path)
    if vec_missing:
        download_file(
            f"https://storage.gra.cloud.ovh.net/v1/AUTH_{project_id}/models/model_pf.vec",
            vec_path)

    # Only the binary file is passed to the loader.
    model["pf"] = fasttext.load_model(bin_path)
Example #3
0
import fasttext
from collections import Counter
from project.server.main.bso_category import get_bso_category
from project.server.main.pf_classifier import get_pf_label
from project.server.main.utils import download_file
import pickle
import os

# Make sure the local model directory exists (stdlib call, no shell spawned).
os.makedirs("/src/models/", exist_ok=True)
if not os.path.exists("/src/models/all_categ_revue.pkl"):
    download_file(
        "https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/models/all_categ_revue.pkl",
        "/src/models/all_categ_revue.pkl")
# NOTE(security): unpickling can execute arbitrary code; this is only safe as
# long as the storage bucket the file is downloaded from is trusted.
# The context manager closes the file handle once the data is loaded.
with open('/src/models/all_categ_revue.pkl', 'rb') as pkl_file:
    all_categ_revue = pickle.load(pkl_file)


def get_categ_from_source(source, top=1):
    """Return the most frequent categories recorded for *source*.

    The entries stored under ``all_categ_revue[source]`` are counted, empty
    values and the literal ``'unknown'`` are dropped, and the ``top`` most
    common remaining categories are joined with ``";"``.

    Args:
        source: key looked up in ``all_categ_revue``.
        top: maximum number of categories to keep (default 1).

    Returns:
        A ``";"``-separated string of categories, or ``'unknown'`` when the
        source is absent, nothing usable is left, or the lookup fails.
    """
    try:
        ranked = [
            categ
            for categ, _count in Counter(all_categ_revue[source]).most_common()
            if categ and categ != 'unknown'
        ]
        ans = ";".join(ranked[:top])
    except Exception:
        # Best-effort lookup: any data problem maps to 'unknown', but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return 'unknown'
    return ans or 'unknown'

Example #4
0
def _pad_for_code(code):
    """Left-pad a 1- or 3-character FoR code with one zero (giving 2 or 4 characters)."""
    if len(code) in (1, 3):
        return "0" + code
    return code


def set_FoR():
    """Build ISSN -> Field-of-Research (FoR) label lookups and pickle them.

    Reads the "FoR Codes" and "ERA 2018 Journal List" sheets of the FoR.xlsx
    workbook and writes two pickle files under ``PV_MOUNT``:

    * ``issn_dict.pkl``: ISSN -> list of FoR descriptions of the journal.
    * ``issn_dict_health.pkl``: ISSN -> list of health-oriented labels, built
      from ``for_code_health``; only journals with at least one such label
      are present.
    """
    # Upstream origin of the workbook:
    # https://www.arc.gov.au/file/10549/download?token=Sbfb2a9n
    # NOTE(review): presumably download_file returns the local path it wrote
    # to, since its result is handed to pd.ExcelFile -- confirm.
    FoR_file = download_file(
        "https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/models/FoR.xlsx",
        f"{PV_MOUNT}FoR.xlsx")

    xl = pd.ExcelFile(FoR_file)
    df_issn = xl.parse("ERA 2018 Journal List")
    df_for = xl.parse("FoR Codes")

    # FoR code (2 or 4 characters after padding) -> description.
    for_dict = {}
    for _, row in df_for.iterrows():
        for_code = _pad_for_code(str(row['FoR Code']).strip())
        if len(for_code) not in (2, 4):
            continue
        for_dict[for_code] = row['FoR Description'].strip()

    for_dict['MD'] = "Multidisciplinary"

    issn_dict = {}
    issn_dict_health = {}

    for _, row in df_issn.iterrows():
        # Columns "ISSN 1" .. "ISSN 7"; empty cells are skipped.
        issn_cells = (row['ISSN {}'.format(x)] for x in range(1, 8))
        issns = [issn for issn in issn_cells if not pd.isnull(issn)]

        fors = []
        fors_health = []
        has_health = False
        # Columns "FoR 1" .. "FoR 3".
        for x in range(1, 4):
            for_code = row['FoR {}'.format(x)]
            if pd.isnull(for_code):
                continue
            # Drop the ".0" left by the float -> str conversion.
            for_code = _pad_for_code(str(for_code).replace('.0', ''))

            if for_code not in for_dict:
                continue

            if for_dict[for_code] not in fors:
                fors.append(for_dict[for_code])

            # Exact health code first, then its 2-character prefix, then the
            # generic "Other ..." fallback for prefixes 06 and 11.
            prefix = for_code[0:2]
            if for_code in for_code_health:
                candidate, is_health = for_dict[for_code], True
            elif prefix in for_code_health:
                candidate, is_health = for_dict[prefix], True
            elif prefix in ["06", "11"]:
                candidate, is_health = "Other " + for_dict[prefix], False
            else:
                continue

            if candidate not in fors_health:
                fors_health.append(candidate)
                if is_health:
                    has_health = True

        if has_health:
            # A specific health label exists, so entries containing
            # "Other " are dropped.
            fors_health = [k for k in fors_health if "Other " not in k]

        for issn in issns:
            issn_dict[issn] = fors

            if len(fors_health) > 0:
                issn_dict_health[issn] = fors_health

    # Context managers make sure both files are flushed and closed.
    with open(f"{PV_MOUNT}issn_dict_health.pkl", "wb") as health_file:
        pickle.dump(issn_dict_health, health_file)
    with open(f"{PV_MOUNT}issn_dict.pkl", "wb") as issn_file:
        pickle.dump(issn_dict, issn_file)
Example #5
0
from project.server.main.utils import download_file
import json
import os
import re

# Make sure the local model directory exists (stdlib call, no shell spawned).
os.makedirs("/src/models/", exist_ok=True)
if not os.path.exists("/src/models/asjc.json"):
    download_file(
        "https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/models/asjc.json",
        "/src/models/asjc.json")
# The context manager closes the file handle once the JSON is parsed.
with open('/src/models/asjc.json', 'r') as asjc_file:
    asjc_data = json.load(asjc_file)

# Index the records by ISSN; records with a missing or empty 'issn' are
# skipped, and a later record with the same ISSN replaces an earlier one.
asjc_dict = {e['issn']: e for e in asjc_data if e.get('issn')}


def asjc_classify(elems, details=False):
    for e in elems:
        if 'issn_list' not in e:
            if 'journal_issns' in e:
                issns = e['journal_issns']
                if not isinstance(issns, str):
                    continue
                issn_list = [k.strip() for k in re.split(",|;", issns)]
                e['issn_list'] = [k for k in issn_list if len(k) > 0]
            else:
                continue
        res = []
        for issn in e['issn_list']: