Esempio n. 1
0
def get_xml_files(PMClist_file, xml_folder):
    """
    Download fulltext XML files for all PMCs in the file PMClist_file

    The list file is read line by line; surrounding whitespace is stripped and
    blank lines are ignored. One GET request is sent per identifier and every
    successful response is saved as ``<PMC>.xml`` inside xml_folder. Failed
    requests are reported and skipped.

    :param PMClist_file: Path to text file with full list of PMCs for dataset
        (one identifier per line)
    :param xml_folder: Path to folder where the fulltext XML files will be saved
        (created, together with any missing parent folders, if necessary)
    """

    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/PMCXXXXX/fullTextXML'

    # parents/exist_ok: nested output folders work and there is no
    # check-then-create race if the folder appears in the meantime
    xml_folder.mkdir(parents=True, exist_ok=True)

    with PMClist_file.open() as fin:
        pmcs = [line.strip() for line in fin]
    pmcs = [el for el in pmcs if el]

    for el in pmcs:
        path2file = xml_folder.joinpath(el + '.xml')
        response = requests.get(url=url.replace('PMCXXXXX', el))

        if response.ok:
            # Explicit encoding: do not depend on the platform default when
            # the article contains non-ASCII characters
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)
Esempio n. 2
0
def get_pdf_files(PMClist_file, pdf_folder):
    """
    Download PDF files for all PMCs in the file PMClist_file

    The list file is read line by line; surrounding whitespace is stripped and
    blank lines are ignored. One GET request is sent per identifier and the
    raw bytes of every successful response are saved as ``<PMC>.pdf`` inside
    pdf_folder. Failed requests are reported and skipped.

    :param PMClist_file: Path to text file with full list of PMCs for dataset
        (one identifier per line)
    :param pdf_folder: Path to folder where the pdf files will be saved
        (created, together with any missing parent folders, if necessary)
    """

    url = 'https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMCXXXXX&blobtype=pdf'

    # parents/exist_ok: nested output folders work and there is no
    # check-then-create race if the folder appears in the meantime
    pdf_folder.mkdir(parents=True, exist_ok=True)

    with PMClist_file.open() as fin:
        pmcs = [line.strip() for line in fin]
    pmcs = [el for el in pmcs if el]

    for el in pmcs:
        path2file = pdf_folder.joinpath(el + '.pdf')
        response = requests.get(url=url.replace('PMCXXXXX', el))

        if response.ok:
            # Binary mode: the payload is written exactly as received
            with path2file.open('wb') as f:
                f.write(response.content)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)
Esempio n. 3
0
def get_annotations(PMClist_file, annotations_folder):
    """
    Download available EuropePMC annotations for all PMCs in the file PMClist_file

    The list file is read line by line; surrounding whitespace is stripped and
    blank lines are ignored. The request is built from the part of each
    identifier that follows the ``PMC`` prefix, while the output file keeps the
    identifier as listed (``<identifier>.json``). Failed requests are reported
    and skipped.

    :param PMClist_file: Path to text file with full list of PMCs for dataset
        (one identifier per line)
    :param annotations_folder: Path to folder where the annotation files will be saved
        (created, together with any missing parent folders, if necessary)
    """

    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=PMC%3AXXXXX&format=JSON'

    # parents/exist_ok: nested output folders work and there is no
    # check-then-create race if the folder appears in the meantime
    annotations_folder.mkdir(parents=True, exist_ok=True)

    with PMClist_file.open() as fin:
        pmcs = [line.strip() for line in fin]
    pmcs = [el for el in pmcs if el]

    for el in pmcs:
        path2file = annotations_folder.joinpath(el + '.json')

        # The URL template already carries the 'PMC:' prefix, so only the
        # remainder of the identifier is inserted. Identifiers listed without
        # the prefix are used as they are instead of raising IndexError.
        parts = el.split('PMC')
        article_number = parts[1] if len(parts) > 1 else el
        response = requests.get(url=url.replace('XXXXX', article_number))

        if response.ok:
            # Explicit encoding: do not depend on the platform default
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)
def get_S2_data(data_folder, csv_file, S2_folder):
    """
    Download Semantic Scholar description of all papers and authors in base dataset

    For every ``*.json`` file in data_folder a DOI is built from the file name
    (the text between 'Bio-protocol' and '.json') and looked up in the paper
    endpoint. Each paper that is found is saved as ``<paperId>.json`` and each
    of its authors as ``Author<authorId>.json`` in S2_folder. The function
    sleeps 5 seconds before every request.

    :param data_folder: Path to folder with the Bio-protocol .json files
    :param csv_file: Path to csv file where the ProtocolID to S2ID mapping of
        the retrieved papers will be written (overwritten if it exists)
    :param S2_folder: Path to folder where the downloaded papers will be saved
        (created, together with any missing parent folders, if necessary)
    """

    paperUrl = 'https://api.semanticscholar.org/v1/paper/XXXXX'
    authorUrl = 'https://api.semanticscholar.org/v1/author/XXXXX'

    # parents/exist_ok: nested output folders work and there is no
    # check-then-create race if the folder appears in the meantime
    S2_folder.mkdir(parents=True, exist_ok=True)

    papers = [f for f in data_folder.iterdir() if f.name.endswith('.json')]

    protocol_ids = []
    s2_ids = []

    for idx, el in enumerate(papers):
        time.sleep(5)
        doi = '10.21769/BIOPROTOC.' + el.name.split('Bio-protocol')[1].split('.json')[0]
        printgr('Processing paper ' + el.name + ' (' + str(idx) + '). DOI: ' + doi)
        response = requests.get(url=paperUrl.replace('XXXXX', doi))
        if not response.ok:
            print(response.status_code)
        if response.status_code == 404:
            # Second attempt with the lower-cased DOI
            time.sleep(5)
            response = requests.get(url=paperUrl.replace('XXXXX', doi.lower()))
        if not response.ok:
            # Paper could not be retrieved: it is left out of the csv mapping
            continue

        paperdata = json.loads(response.text)
        paper_id = paperdata['paperId']
        with S2_folder.joinpath(paper_id + '.json').open('w') as fout:
            json.dump(paperdata, fout)

        for author in paperdata['authors']:
            authorId = author['authorId']
            if authorId is None:
                # No author record to ask for; avoids a useless wait and a
                # request to '.../author/None'
                continue
            time.sleep(5)
            response = requests.get(url=authorUrl.replace('XXXXX', str(authorId)))
            if not response.ok:
                print(response)
                print(authorId)
            else:
                authordata = json.loads(response.text)
                # Save author data
                with S2_folder.joinpath('Author' + str(authorId) + '.json').open('w') as fout:
                    json.dump(authordata, fout)

        protocol_ids.append(el.name.split('.json')[0])
        s2_ids.append(paper_id)

    with csv_file.open('w') as fout:
        fout.write('ProtocolID,S2ID\n')
        for protocol_id, s2_id in zip(protocol_ids, s2_ids):
            fout.write(protocol_id + ',' + s2_id + '\n')
Esempio n. 5
0
def get_S2_data(csv_file, S2_folder):
    """
    Download Semantic Scholar description of all papers and authors in base dataset

    Each value of the ``S2ID`` column of csv_file is looked up in the paper
    endpoint and saved as ``<S2ID>.json``; each author of the paper is saved
    as ``Author<authorId>.json``. Failed paper requests are retried every 10
    seconds, except for 404 responses, which are reported and skipped.

    :param csv_file: Path to csv file with an ``S2ID`` column holding the
        Semantic Scholar identifiers of the papers to download
    :param S2_folder: Path to folder where the downloaded papers will be saved
        (created, together with any missing parent folders, if necessary)
    """

    paperUrl = 'https://api.semanticscholar.org/v1/paper/XXXXX'
    authorUrl = 'https://api.semanticscholar.org/v1/author/XXXXX'

    # parents/exist_ok: nested output folders work and there is no
    # check-then-create race if the folder appears in the meantime
    S2_folder.mkdir(parents=True, exist_ok=True)

    # dtype=str keeps identifiers verbatim; rows without an S2ID are dropped
    # because they cannot be turned into a request URL
    df = pd.read_csv(csv_file, dtype=str)
    paper_ids = df['S2ID'].dropna().tolist()

    for idx, el in enumerate(paper_ids):
        printgr('Processing paper ' + el + ' (' + str(idx) + ')')
        time.sleep(5)
        response = requests.get(url=paperUrl.replace('XXXXX', el))
        # Keep retrying other failures, but a 404 will not go away by
        # waiting, so it must not keep the loop spinning forever
        while not response.ok and response.status_code != 404:
            print('Sleep')
            time.sleep(10)
            response = requests.get(url=paperUrl.replace('XXXXX', el))
        if not response.ok:
            print(response)
            print(el)
            continue

        paperdata = json.loads(response.text)
        with S2_folder.joinpath(el + '.json').open('w') as fout:
            json.dump(paperdata, fout)

        for author in paperdata['authors']:
            authorId = author['authorId']
            time.sleep(5)
            response = requests.get(
                url=authorUrl.replace('XXXXX', str(authorId)))
            if not response.ok:
                print(response)
                print(authorId)
            else:
                authordata = json.loads(response.text)
                # Save author data
                with S2_folder.joinpath('Author' + str(authorId) +
                                        '.json').open('w') as fout:
                    json.dump(authordata, fout)
Esempio n. 6
0
def get_annotationsMED(PMIDs, annotations_folder):
    """
    Download available EuropePMC annotations for all files in the list PMIDs

    One GET request is sent per identifier and every successful response is
    saved as ``PMID<identifier>.json`` inside annotations_folder. Failed
    requests are reported and skipped.

    :param PMIDs: list of Pubmed identifiers (strings or integers)
    :param annotations_folder: Path to folder where the annotation files will be saved
        (created, together with any missing parent folders, if necessary)
    """

    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=MED%3AXXXXX&format=JSON'

    # parents/exist_ok: nested output folders work and there is no
    # check-then-create race if the folder appears in the meantime
    annotations_folder.mkdir(parents=True, exist_ok=True)

    for el in PMIDs:
        # Identifiers may arrive as integers (e.g. read from a table);
        # normalise once so the string concatenations below cannot fail
        pmid = str(el)
        path2file = annotations_folder.joinpath('PMID' + pmid + '.json')
        response = requests.get(url=url.replace('XXXXX', pmid))

        if response.ok:
            # Explicit encoding: do not depend on the platform default
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + pmid)
        else:
            printred('Could not retrieve ' + pmid)
============================================================
"""
# Script fragment, executed only when `lemmatization` is truthy: reads the
# base and extended csv files and prepares an empty dataframe for the
# Bio-protocol corpus.
if lemmatization:

    # Connect to the Semantic Scholar database
    # NOTE(review): `cf` is presumably a ConfigParser-like object - confirm.
    # The connector construction below is commented out, so in the visible
    # code these settings are read but not used.
    dbCONNECTOR = cf.get('DB', 'dbCONNECTOR')
    dbNAME = cf.get('DB', 'dbNAME')
    dbUSER = cf.get('DB', 'dbUSER')
    dbPASS = cf.get('DB', 'dbPASS')
    dbSERVER = cf.get('DB', 'dbSERVER')
    dbSOCKET = cf.get('DB', 'dbSOCKET')
    # DM = BaseDMsql(db_name=dbNAME, db_connector=dbCONNECTOR, path2db=None,
    #            db_server=dbSERVER, db_user=dbUSER, db_password=dbPASS,
    #            unix_socket=dbSOCKET)
    
    printgr('Reading Bio-Protocols from database')
    # All columns are read as strings
    base_df = pd.read_csv(csv_file, low_memory=False, dtype=str)
    # Maps the first column of the base csv to its second column
    # (presumably ProtocolID -> S2 paper id - confirm against the csv header)
    Protocol_S2 = {el[0]:el[1] for el in base_df.values.tolist()}
    extended_df = pd.read_csv(csv_file_extended, low_memory=False, dtype=str)
    # Second-column values of the base csv
    base_S2 = [el[1] for el in base_df.values.tolist()]
    # First-column values of the extended csv that do not appear in base_S2
    extended_S2 = [el[0] for el in extended_df.values.tolist() if el[0] not in base_S2]
    
    # Empty dataframe with the target columns; nothing in the visible code fills it
    BIO_df = pd.DataFrame(columns=['ProtocolID', 'S2paperID', 'title', 'paperAbstract', 'procedure', 'keywords'])
    # We start by reading all the articles that are not in the base dataset
    # for S2id in extended_S2:
    #     dfaux = DM.readDBtable('S2papers', limit=None, selectOptions='S2paperID, title, paperAbstract',
    #                             filterOptions='S2paperID="'+S2id+'"')
    #     BIO_df = BIO_df.append(dfaux, ignore_index = True, sort=False)
    # #Now, we move to the protocols in the base dataset
    # protocols = [f for f in protocols_folder.iterdir() if f.name.endswith('.json')]
    # all_prot_data = []
============================================================
"""
# Script fragment, executed only when `lemmatization` is truthy: reads the
# extended csv file and extracts the list of S2 paper identifiers of the
# Agriculture corpus.
if lemmatization:

    # Connect to the Semantic Scholar database
    # NOTE(review): `cf` is presumably a ConfigParser-like object - confirm.
    # The connector construction below is commented out, so in the visible
    # code these settings are read but not used.
    dbCONNECTOR = cf.get('DB', 'dbCONNECTOR')
    dbNAME = cf.get('DB', 'dbNAME')
    dbUSER = cf.get('DB', 'dbUSER')
    dbPASS = cf.get('DB', 'dbPASS')
    dbSERVER = cf.get('DB', 'dbSERVER')
    dbSOCKET = cf.get('DB', 'dbSOCKET')
    # DM = BaseDMsql(db_name=dbNAME, db_connector=dbCONNECTOR, path2db=None,
    #            db_server=dbSERVER, db_user=dbUSER, db_password=dbPASS,
    #            unix_socket=dbSOCKET)

    printgr('Reading Agriculture data from database')
    # All columns are read as strings
    AGR_df = pd.read_csv(csv_file_extended, low_memory=False, dtype=str)
    # Identifiers taken from the 'S2paperID' column of the csv
    AGR_S2 = AGR_df['S2paperID'].values.tolist()
    # The csv contents are discarded here: AGR_df is rebound to an empty
    # dataframe, and the loop that would refill it is commented out below
    AGR_df = pd.DataFrame()
    # for S2id in AGR_S2:
        # dfaux = DM.readDBtable('S2papers', limit=None, selectOptions='S2paperID, title, paperAbstract',
        #                         filterOptions='S2paperID="'+S2id+'"')
        # AGR_df = AGR_df.append(dfaux, ignore_index = True)
    # print('Agriculture data loaded, #papers:', len(AGR_df))

    # from lemmatizer.ENlemmatizer import ENLemmatizer
    # lemmas_server = cf.get('Lemmatizer', 'server')
    # stw_file = Path(cf.get('Lemmatizer', 'default_stw_file'))
    # dict_eq_file = Path(cf.get('Lemmatizer', 'default_dict_eq_file'))
    # POS = cf.get('Lemmatizer', 'POS')
    # concurrent_posts = int(cf.get('Lemmatizer', 'concurrent_posts'))