Exemple #1
0
def main():
    """Interactive entry point: preview tweet data or post PubMed articles.

    Each pass logs in, asks for a mode and then either prints the first
    15 rows of the tweet frame (mode 1) or queries PubMed for a topic and
    tweets up to five distinct, randomly chosen articles (mode 2).  The
    user is then asked whether to run another pass.

    Relies on the module-level ``articles`` and ``selected_articles``
    lists, which keep accumulating across passes.
    """
    # A loop (rather than main() calling itself) keeps the call stack
    # flat no matter how many times the user chooses to run again.
    while True:
        # Re-authenticates on every pass, as the recursive version did.
        login(load_account_data())

        try:
            mode = int(input("Select a mode (1 - Data test | 2 - Post): "))
        except ValueError:
            # Non-numeric input is handled by the "invalid input" branch.
            mode = None

        if mode == 1:
            data = pre_frame_all_tweets()
            tweet_frame = pd.DataFrame(data)

            print(tweet_frame.head(15))
        elif mode == 2:
            topic = input("What would you like to post about?: ")
            pubmed = PubMed(tool=str(os.getenv("APP_NAME")), email=str(os.getenv("APP_EMAIL")))
            results = pubmed.query(topic, max_results=100)

            articles.extend(results)

            if articles:
                # Five draws with replacement; duplicates are skipped, so
                # fewer than five articles may end up selected.
                for _ in range(5):
                    chosen_article = random.choice(articles)

                    if chosen_article not in selected_articles:
                        selected_articles.append(chosen_article)
            else:
                print("No articles found for that topic.")

            for art in selected_articles:
                tweet = build_tweet(art)
                send_tweets(tweet)
        else:
            print("Invalid input! Use 1 or 2!")

        if input("Run again? (Y/N): ").capitalize() != "Y":
            print("Exiting!")
            return
def get_abstract_from_pubmed(query):
    """Return a text summary of the first PubMed article matching *query*.

    The summary has the form ``"<id> - <date> - <title>\\n \\n<abstract>\\n"``.
    Only the first article yielded by the search is used; when the search
    yields nothing the function returns ``None``.
    """
    # tool/email are optional but requested by PubMed Central:
    # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
    client = PubMed(tool="MyTool", email="*****@*****.**")

    # Short pauses around the search call.
    time.sleep(0.1)
    hits = client.query(query, max_results=500)
    time.sleep(0.1)

    for hit in hits:
        # Extra pause meant to avoid "too many requests" errors.
        time.sleep(0.2)

        pmid = hit.pubmed_id
        heading = hit.title

        if hit.keywords:
            if None in hit.keywords:
                hit.keywords.remove(None)
            # Joined keyword string is built but not part of the summary.
            keywords = '", "'.join(hit.keywords)

        published = hit.publication_date
        summary_text = hit.abstract

        # The return sits inside the loop, so only the first hit is reported.
        return "{} - {} - {}\n \n{}\n".format(pmid, published, heading,
                                               summary_text)
Exemple #3
0
def pubmed(title_keywords, n=500, docs=False):
    '''Get articles with meta-data from PubMed

    pubs = pubmed('nutrition')

    title_keywords : str
        The string to be searched for in the title of the
        articles.
    n : int
        Number of articles to return.
    docs : bool
        Instead of dataframe with multiple columns,
        just return abstracts as a list of lists.

    Returns a DataFrame with the columns title, journal,
    publication_date, keywords and abstract (empty, but with those
    columns, when nothing matches), or a list of one-element lists of
    abstracts when ``docs`` is true.

    '''

    from pymed import PubMed
    import json
    import pandas as pd

    columns = ['title', 'journal', 'publication_date', 'keywords', 'abstract']

    client = PubMed(tool="literview", email="*****@*****.**")
    query = title_keywords + "[Title]"
    results = client.query(query, max_results=n)

    rows = []
    for article in results:
        # Round-trip through toJSON() so every value is JSON-native.
        record = json.loads(article.toJSON())
        rows.append([
            record['title'],
            # journal/keywords are optional in the record.
            record.get('journal', ''),
            record['publication_date'],
            record.get('keywords', []),
            record['abstract'],
        ])

    # Passing the names to the constructor also works for zero rows;
    # assigning five names to an empty frame afterwards raises ValueError.
    out = pd.DataFrame(rows, columns=columns)

    if docs:
        out = [[doc] for doc in out.abstract.values]

    return out
Exemple #4
0
def querysave(search_term, max_records, save_json, inputfile):
    """Query PubMed inside an MLflow run and dump the records to JSON.

    When ``inputfile`` equals 1, ``search_term``, ``max_records`` and
    ``save_json`` are replaced by the values of the dict literal stored
    in ``input.txt``.  Each article is stored under its ``pubmed_id``;
    fields whose access raises AttributeError or TypeError are skipped.
    """
    if inputfile == 1:
        with open("input.txt", "r") as handle:
            para = ast.literal_eval(handle.read())
        search_term = para['search_term']
        max_records = para['max_records']
        save_json = para['save_json']

    skippable = (AttributeError, TypeError)

    with mlflow.start_run():
        client = PubMed(tool="AlphabetH", email="*****@*****.**")
        hits = client.query(search_term, max_results=max_records)
        pp = defaultdict(lambda: defaultdict(dict))

        for art in hits:
            pmed = art.pubmed_id

            # Plain copies of title and abstract.
            for field in ('title', 'abstract'):
                try:
                    pp[pmed][field] = getattr(art, field)
                except skippable:
                    pass

            # Structured-abstract sections are appended to the abstract.
            for section in ('conclusions', 'methods', 'results'):
                try:
                    pp[pmed]['abstract'] = (pp[pmed]['abstract'] +
                                            getattr(art, section))
                except skippable:
                    pass

            for field in ('keywords', 'authors', 'journal'):
                try:
                    pp[pmed][field] = getattr(art, field)
                except skippable:
                    pass

            # Only the publication year is kept, as a string.
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except skippable:
                pass

            try:
                pp[pmed]['conclusions'] = art.conclusions
            except skippable:
                pass

        with open(save_json, 'w') as fp:
            json.dump(pp, fp)
def PubMedQuery(Inputfile, Outputfile, AdditionalKeyWords, verbose = False, max_retries = 5):
    '''
    Query PubMed once per line of the input file and export the hits.

    parameters
    ---------------
    Inputfile: str, input file, like './input.txt'; one drug name per line
    Outputfile: str, output file, like './test.xlsx'; '.xlsx' is appended
        when the name does not already end with it
    AdditionalKeyWords: str, keywords, like ' "pharmacy chemistry biology" '
    verbose: bool, echo every query through the progress bar
    max_retries: int, attempts per drug before giving up on it (at least
        one attempt is always made)

    Besides the Excel file, the raw frame is pickled to '.temp.pkl'.
    '''

    with open(Inputfile, 'r') as f:
        drugs = [line.strip() for line in f]

    pubmed = PubMed(tool="Query-Pubmed-Toolbox", email="*****@*****.**")
    attempts = max(1, max_retries)
    al = []
    with tqdm(total = len(drugs), ascii=True) as pbar:
        for drug in drugs:
            rows = None
            for attempt in range(1, attempts + 1):
                time.sleep(0.5)
                try:
                    query = drug + ' ' + smart_strip(AdditionalKeyWords)
                    results = list(pubmed.query(query, max_results=5))
                    # Collect into a local list first so that a failure
                    # halfway through does not leave partial rows behind
                    # that a retry would then duplicate.
                    found = []
                    for res in results:
                        D = res.toDict()
                        raw_id = D.get('pubmed_id')
                        if isinstance(raw_id, str):
                            pubmedid = ';'.join(smart_strip(i) for i in raw_id.split('\n'))
                        else:
                            pubmedid = None

                        found.append({'drug': drug,
                                      'pubmid': pubmedid,
                                      'title': smart_strip(D.get('title')),
                                      'journal': smart_strip(D.get('journal')),
                                      'abstract': smart_strip(D.get('abstract')),
                                      'doi': D.get('doi'),
                                      'year': D.get('publication_date')})
                    rows = found
                    break
                except Exception:
                    # Exception (not a bare except) so Ctrl-C still stops
                    # the run; the retry count keeps a persistent failure
                    # from looping forever.
                    logging.exception('query failed for %s (attempt %d of %d)',
                                      drug, attempt, attempts)

            if rows is None:
                logging.error('giving up on %s after %d attempts', drug, attempts)
                al.append({'drug': drug})
            elif rows:
                al.extend(rows)
            else:
                logging.warning('not found for %s %s', drug, AdditionalKeyWords)
                al.append({'drug': drug})

            pbar.update(1)
            if verbose:
                pbar.write('Query: %s' % (drug + ' ' + smart_strip(AdditionalKeyWords)))

    df = pd.DataFrame(al)
    # Suffix check: a '.xlsx' elsewhere in the path must not count.
    if not Outputfile.endswith('.xlsx'):
        Outputfile = Outputfile + '.xlsx'
    sdf = df.style.apply(hightlight_null, axis=1)
    sdf.to_excel(Outputfile)
    df.to_pickle('.temp.pkl')
def query_pubmed(search_term, max_results=5000):
    ''' Query PubMed through pymed and return the hits as a list of dicts. '''

    client = PubMed(tool='MyTool', email='')
    hits = client.query(search_term, max_results=max_results)

    # One dictionary per article, in the order the API yields them.
    return [hit.toDict() for hit in hits]
Exemple #7
0
def main():
    """Download coronavirus-related PubMed articles as one JSON file each.

    The output folder ``<cwd>/../../corpus/pubmed/json`` is wiped and
    recreated on every run.  Articles without an abstract are skipped;
    every other article is written to ``<pubmed_id>.json`` and a running
    count is printed.
    """
    # Setup output folder
    output_folder = Path.cwd().parent.parent / 'corpus' / 'pubmed' / 'json'
    if output_folder.exists():
        shutil.rmtree(output_folder)
    # parents=True: the intermediate corpus/pubmed folders may not exist yet.
    output_folder.mkdir(parents=True)

    # Create the PubMed client used to run the search
    pubmed = PubMed(tool="DavidCampos", email="*****@*****.**")

    # Build the PubMed search expression as plain text
    query = "(\"2000\"[Date - Publication] : \"3000\"[Date - Publication]) AND " \
            "((COVID-19) OR (Coronavirus) OR (Corona virus) OR (2019-nCoV) OR " \
            "(SARS-CoV) OR (MERS-CoV) OR (Severe Acute Respiratory Syndrome) OR " \
            "(Middle East Respiratory Syndrome) OR " \
            "(2019 novel coronavirus disease[MeSH Terms]) OR (2019 novel coronavirus infection[MeSH Terms]) OR " \
            "(2019-nCoV disease[MeSH Terms]) OR (2019-nCoV infection[MeSH Terms]) OR " \
            "(coronavirus disease 2019[MeSH Terms]) OR (coronavirus disease-19[MeSH Terms]))"

    # Execute the query against the API
    results = pubmed.query(query, max_results=1000000)

    # Loop over the retrieved articles
    counter = 0
    for article in results:
        # Discard if abstract empty
        if article.abstract is None or article.abstract == "":
            continue

        # The id field may hold several newline-separated ids; keep the first.
        pubmed_id = article.pubmed_id.split('\n', 1)[0]
        article.pubmed_id = pubmed_id

        # Get article as dict
        article_dict = article.toDict()

        # Write article to JSON
        with open(output_folder / (pubmed_id + ".json"), 'w') as outfile:
            json.dump(article_dict, outfile, default=date_converter)
        counter += 1
        print(counter)
Exemple #8
0
def get_corpus(output_dir='.'):
    """Fetch 'glycan' abstracts from PubMed and write a JSON and a text corpus.

    Articles accepted by ``_is_relevant`` are stored as
    ``{pubmed_id: abstract}`` in ``glyco_corpus.json``.  The abstracts are
    then split into sentences, combined with ``_join_lines`` and written
    to ``glyco_corpus.txt`` (an empty file when nothing was fetched).

    ``output_dir`` must already exist.
    """
    from itertools import chain

    assert os.path.exists(output_dir)
    pmed = PubMed()
    articles = filter(_is_relevant, pmed.query('glycan', max_results=100000))
    # Read id and abstract from the SAME article.  Two separate map()
    # calls over the one-shot filter iterator would consume it in turns,
    # pairing each id with the next article's abstract and dropping half
    # of the records.
    results = {article.pubmed_id: article.abstract for article in articles}
    print('Fetched {} results'.format(len(results)))
    print('Writing .json file')
    with open(os.path.join(output_dir, 'glyco_corpus.json'), 'w+') as outfile:
        json.dump(results, outfile)
    print('Tokenizing sentences')
    # Flatten the per-abstract sentence lists in a single linear pass.
    sentences = list(chain.from_iterable(map(sent_tokenize, results.values())))
    # reduce() without an initial value raises on an empty sequence.
    results_txt = reduce(_join_lines, sentences) if sentences else ''
    print('Writing .txt file')
    with open(os.path.join(output_dir, 'glyco_corpus.txt'), 'w+') as outfile:
        outfile.write(results_txt)
Exemple #9
0
from pymed import PubMed
from os import path

# PubMed client; tool name and e-mail identify this script to the service.
pubmed = PubMed(tool="paperList", email="*****@*****.**")

# Search term restricted to the author field.
query = 'Correia BE[author]'

# Run the search, asking for at most 500 results.
publications = pubmed.query(query, max_results=500)


## Defining functions
def get_filename(article):
    """Return a Markdown file name built from the article's date and title.

    The name is ``<year>-<month>-<day>-<first three title words>.md``;
    date parts are not zero-padded and title words are split on single
    spaces.
    """
    leading_words = article.title.split(' ')[:3]
    published = article.publication_date
    stamp = f'{published.year}-{published.month}-{published.day}'
    return '-'.join([stamp, *leading_words]) + '.md'


def get_authors(article):
    """Return ``(all_authors, first_author)`` for an article.

    Each author is rendered as ``"<lastname> <initials>"``; missing or
    empty name parts are left out, and authors with no usable name part
    are skipped.  ``all_authors`` is the comma-separated list of those
    names.  When there is no usable author at all, ``('', '')`` is
    returned.
    """
    author_list = []
    for author in article.authors or []:
        parts = (author.get('lastname'), author.get('initials'))
        # Name parts can be None; joining only the present ones avoids
        # a TypeError from str + None.
        name = ' '.join(part for part in parts if part)
        if name:
            author_list.append(name)

    if not author_list:
        return ('', '')

    # Join once, after the loop, instead of on every iteration.
    return (', '.join(author_list), author_list[0])

Exemple #10
0
#Pubmed search for articles on HIV in African American women
import numpy as np
import pandas as pd
import pymed
from pymed import PubMed

pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")

search_term = "HIV Viral Load African American"  #place search terms in quotes
results = pubmed.query(search_term, max_results=500)
articleList = []
articleInfo = []

for article in results:
    # Print the object type
    # Convert to dictionary
    articleDict = article.toDict()
    articleList.append(articleDict)

# Create a dict list of articles from PUBMED API
for article in articleList:
    pubmedId = article['pubmed_id'].partition('\n')[0]
    # Append article info to dictionary with fields you wish to collect
    articleInfo.append({
        u'pubmed_id': pubmedId,
        u'title': article['title'],
        u'keywords': article['keywords'],
        u'journal': article['journal'],
        u'abstract': article['abstract'],
        # u'conclusions':article['conclusions'],
        # u'methods':article['methods'],
Exemple #11
0
#https://icd.who.int/browse10/2016/en

disease_dict = []
with open("./disease_dict.tsv", "r") as f:
    lines = f.read().split("\n")
    for line in lines:
        if line == "":
            continue
        buf = line.split("\t")
        disease_dict.append(buf[0])

#word = "kynurenine"
word = "AMINOACYL-TRNA BIOSYNTHESIS"

pubmed = PubMed(tool="MyTool", email="*****@*****.**")
results = pubmed.query(word, max_results=10000)

debug = open("./debug.txt", "w")

result_count = {}
result_file = open("./result_count.txt", "w")


for item in results:
    #print (item.doi)
    #print (item.keywords)
    #print (item.abstract)
    abst = item.abstract
    
    if abst is None:
        continue
# https://stackoverflow.com/questions/57053378/
# https://www.kaggle.com/summerkrankin/pubmed-download-als

import pandas as pd
from pymed import PubMed
import time


# User inputs:
query = input("Provide a query for PubMed (can include field tags): ")
my_email = input("Provide your e-mail address (optional): ")
max_results = input("Maximum number of results: ")

# Consult PubMed:
pubmed = PubMed(tool="PubMedSearcher", email = my_email)
results = pubmed.query(query, max_results = int(max_results))

# Create an empty Dataframe with just the column names:
articles_df = pd.DataFrame(columns = ['PMID',
                                      'Publication_date',
                                      'Title',
                                      'Authors',
                                      'Journal',
                                      'DOI',
                                      'Keywords',
                                      'Abstract'])

# Now, for each article, fill the dataframe with the info collected:
for article in results:

    # Handle some exceptions in case of missing information:
def perform_query(keywords, amount):
    """Run *keywords* as a PubMed search and return up to *amount* results."""
    return PubMed(tool="Vigor", email="*****@*****.**").query(
        keywords, max_results=amount)
 def _load_query_result(self):
     if not self._query_result:
         pubmed = PubMed(tool='Collabovid', email='*****@*****.**')
         self._query_result = list(
             pubmed.query(query=self._PUBMED_SEARCH_QUERY,
                          max_results=30000))
@author: Wei Zhao @ Metis, 02/12/2021
"""
#%%
from pymed import PubMed
from util import save_as_pickle
from collections import defaultdict

#%%
pubmed = PubMed(tool="MyTool", email="")
# The search term is a single string (not a list holding one string),
# with a space before every boolean operator.
query = (
    '((traumatic brain injury) OR (concussion) OR (brain biomechanics)) '
    'AND ("1991/01/01"[Date - Create] : "3000"[Date - Create]) '
    'AND (english[Language])'
)
results = pubmed.query(query, max_results=12000000)


#%%
def download_data(results):
    """
    Download the data
    """
    # Loop over the retrieved articles
    data_dict = defaultdict(list)
    c = 0
    for article in results:
        c += 1
        if c % 1200 == 0:
            print(c)
        # Extract and format information from the article
Exemple #16
0
import csv
import itertools

from pymed import PubMed

# Create the PubMed client object used to run queries
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="Author co-occurence analysis", email="*****@*****.**")

# Build the PubMed search query as plain text
query = "occupational health[Title]"

# Execute the query against the API
results = list(pubmed.query(query, max_results=1344))

# Create a node for each unique author
nodes = {
    author: index
    for index, author in enumerate(
        set(
            itertools.chain.from_iterable([[
                f'{author["lastname"]} {author["firstname"]}'
                for author in article.authors
            ] for article in results])))
}

# Create an edge for each combination of authors (co-authorship)
edges = list(
    itertools.chain.from_iterable([[
        combination
Exemple #17
0
import time
import json

pubmed = PubMed(tool="toolname", email="your email")

# Timer
start_time = time.time()

industryList = []  # Array of companys

for comp in industryList:
    comp_time = time.time()
    articleList = []
    articleInfo = []
    query = f"your test query here with {comp}, same as how pubmed takes the queries"
    results = pubmed.query(query, max_results=999999)
    for article in results:
        articleDict = article.toDict()
        articleList.append(articleDict)
    for article in articleList:
        articleInfo.append({'pubmed_id': article['pubmed_id'],
                            'title': article['title'],
                            'keywords': article['keywords'],
                            'mesh': article['mesh'],
                            'journal': article['journal'],
                            'abstract': article['abstract'],
                            'conclusions': article['conclusions'],
                            'methods': article['methods'],
                            'results': article['results'],
                            'copyrights': article['copyrights'],
                            'doi': article['doi'],
Exemple #18
0
from pymed import PubMed
"""In MEDLINE/PubMed, every journal article is indexed with about 10–15 subject headings, 
subheadings and supplementary concept records, with some of them designated as major and marked 
with an asterisk, indicating the article's major topics. When performing a MEDLINE search via PubMed, 
entry terms are automatically translated into (i.e. mapped to) the corresponding descriptors with a 
good degree of reliability; it is recommended to check the 'Details tab' in PubMed to see how a search 
formulation was translated. By default, a search for a descriptor will include all the descriptors in 
the hierarchy below the given one. PubMed does not apply automatic mapping of the term in the following 
circumstances: by writing the quoted phrase (e.g., "kidney allograft"), when truncated on the asterisk 
(e.g., kidney allograft *), and when looking with field labels (e.g., Cancer [ti]).

Campos-Asensio, C. (2018). "Cómo elaborar una estrategia de búsqueda bibliográfica". 
Enfermería Intensiva (in Spanish). 29 (4): 182–186. """

# PubMed client; tool name and e-mail identify this script to the service.
pubmed = PubMed(tool="MyTool", email="*****@*****.**")
# Search using the [Language] field tag, asking for at most 500 results.
results = pubmed.query("spanish[Language]", max_results=500)
# Print each returned article object.
for res in results:
    print(res)
Exemple #19
0
def fetch_journal_articles_data(journal_abbr,
                                start_year=0,
                                end_year=None,
                                max_results=10000,
                                verbosity='full',
                                logger=None):
    """
    Uses PubMed to find the articles of a journal, scrapes their dates and
    stores every new article on the journal document.

    Parameters
    ----------
    journal_abbr: (str) journal abbreviation according to NLM catalog
    max_results: (int) upper bound passed to the PubMed query.
        NOTE(review): earlier documentation said 0 fetches all articles;
        that depends on pymed and is not handled here - confirm.
    start_year: (int)
    end_year: (int) defaults to the current year plus two
    verbosity: (str or None) 'full' logs every DOI, 'summary' logs the
        counter every 5 articles, None logs only the unconditional messages
    logger: (Logger or None) when None, a module logger is used

    Returns
    ----------
    None. New articles are pushed onto the journal document and its
    ``last_failed`` / ``last_checked`` fields are updated.
    """
    import logging

    # The default used to be dereferenced directly and crashed on the
    # first log call; fall back to a module logger instead.
    if logger is None:
        logger = logging.getLogger(__name__)

    #> Check if journal/PMC is supported by scraper
    journal = Journal.objects.get(abbr_name=journal_abbr)
    publisher_Q = Publisher.objects.filter(journals__contains=journal)
    if publisher_Q.count() == 0:
        logger.info("Journal has no publisher")
        return
    publisher = publisher_Q[0]
    if not publisher.supported:
        logger.info("Journal not supported")
        return
    #> Search in pubmed
    pubmed = PubMed()
    if not end_year:
        end_year = datetime.date.today().year + 2
    query = f"{journal_abbr}[jour] {start_year}:{end_year}[DP]"
    entries = None
    for _ in range(10):
        try:
            entries = list(pubmed.query(query, max_results=max_results))
        except Exception:
            # Exception rather than a bare except, so Ctrl-C is not
            # swallowed by the retry loop.
            logger.debug("Pubmed search attempt failed", exc_info=True)
            time.sleep(.2)
        else:
            break
    if entries is None:
        if verbosity == 'full':
            logger.info("Pubmed search failed after 10 retries")
        return
    counter = 0
    total_count = len(entries)
    any_success = False
    for entry in entries:
        if entry.doi:
            # > a quick fix for a bug in pymed (0.8.9), which sometimes returns a multiline list of dois
            # for a entry. And the first one is the real one
            doi = entry.doi.split('\n')[0]
        else:
            logger.info("No DOI")
            continue
        if Journal.objects.filter(
                articles__doi=doi).count() == 0:  # article does not exist
            dates = scraper.get_dates(doi, publisher.domain, logger=logger)
            if any(v is not None
                   for v in dates.values()):  #> the operation has succeeded
                article = Article(doi=doi,
                                  title=entry.title,
                                  authors=[
                                      f"{a['lastname']} {a['initials']}"
                                      for a in entry.authors
                                  ],
                                  received=dates['Received'],
                                  accepted=dates['Accepted'],
                                  published=dates['Published'])
                journal.update(push__articles=article)
                any_success = True
            else:
                if verbosity == 'full':
                    logger.info('Scraper failed')
                # counter has not been incremented for this entry yet,
                # hence the + 1.
                if (counter + 1 > GIVE_UP_LIMIT) and (not any_success):
                    if verbosity == 'full':
                        logger.info(
                            f"No success for any of the {GIVE_UP_LIMIT} articles searched"
                        )
                    journal.update(set__last_failed=True)
                    return
        else:
            if verbosity == 'full':
                logger.info("Already in database")
            any_success = True
        counter += 1
        if verbosity == 'full':
            logger.info(
                f'[{journal.abbr_name}] ({counter} of {total_count}): {doi}')
        if (counter % 5 == 0) and (verbosity == 'summary'):
            logger.info(counter)
    journal.update(set__last_failed=False)
    journal.update(set__last_checked=datetime.datetime.now())
Exemple #20
0
except:
    pass
for brevet in DataBrevet["brevets"]:

    AffilAuteur = dict()

    if brevet['label'] not in DejaVus:

        for Auteur in brevet['inventor']:
            SavBrevet = False  # Commutateur pour éviter de requêter 15 fois pour un brevet
            LigneCsv = """"""  # the csv file for mathching articles and patent at CIB level
            NbAut += 1
            Auteur = Auteur.title()
            NumAut += 1
            query = "%s[Author - Full]" % (Auteur)
            DocsAuteur = pubmed.query(query, max_results=500)
            IramFull = """"""  # le contenu du fichier IRAMUTEQ complet
            Num = 0  #le numéro de doc pour sauvegarde
            auteurDejaVu = False
            NbArt = 0  # Articles retrouvés

            for article in DocsAuteur:
                NbArt += 1
                SAV = False  # switch pour savegarder dans le csv
                #    print(type(article))
                #    print(article.toJSON())
                Num += 1
                Affi = PubMedCheckNameAndGetAffiliation(
                    article.pubmed_id.split('\n')[0], Auteur
                )  # the first pubmed_id is the article. Others are citations
                if Affi is not None:
Exemple #21
0
from pymed import PubMed
import json

# Ask for the contact e-mail and the search term.
email = input("Please enter your email:")
user_input = input("I want to search for...")

# Run the search, asking for at most five results.
pubmed = PubMed(tool="MyTool", email=email)
results = pubmed.query(user_input, max_results=5)

# Convert every article object to a plain dictionary.
results_list = [article.toDict() for article in results]

# Keep id, title and abstract; the id field may hold several
# newline-separated ids, of which only the first is kept.
output = [
    {
        'pubmed_id': entry['pubmed_id'].split('\n', 1)[0],
        'title': entry['title'],
        'abstract': entry['abstract'],
    }
    for entry in results_list
]

# Save the selection as indented JSON.
with open('output_results.json', 'w') as outfile:
    json.dump(output, outfile, indent=4)
    
Exemple #22
0
def mainpipe(inputfile, search_term, max_records, json_out, embvec, embvecache,
             val_ratio, rnnsize, batchsize, lr, weight_decay, n_epochs,
             model_save, es):
    """Query PubMed, train a keyphrase tagger and export augmented records.

    Everything runs inside a single MLflow run:

    1. Download the articles matching ``search_term`` from PubMed.
    2. Tag every title+abstract token as B/I/O using the author keywords.
    3. Train an ``RNNCRFTagger`` on the articles with at least three
       matched keywords, evaluate on a validation split and log the
       extraction rate.
    4. Predict keywords for the articles that have none, write the
       combined records to JSON and, when ``es == 1``, bulk-index them
       into Elasticsearch at 127.0.0.1:9200.

    Parameters
    ----------
    inputfile : when 1, most other arguments are replaced by the values
        of the dict literal stored in ``input.txt``.
    search_term, max_records : PubMed query and result limit.
    json_out : not used by this function.
        NOTE(review): the JSON output path is hard-coded below - confirm
        whether this argument was meant to replace it.
    embvec, embvecache : when ``embvec == 1`` GloVe 840B/300d vectors are
        loaded (cached in ``embvecache``); otherwise embeddings are
        trained from scratch.
    val_ratio : fraction of the training articles used for validation.
    rnnsize, batchsize, lr, weight_decay, n_epochs : model and optimiser
        settings.
    model_save : path handed to ``mlflow.pytorch.save_model``.
    es : when 1, index the output into Elasticsearch.
    """
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        embvec = para['embvec']
        embvecache = para['embvecache']
        val_ratio = para['val_ratio']
        rnnsize = para['rnnsize']
        batchsize = para['batchsize']
        lr = para['lr']
        weight_decay = para['weight_decay']
        n_epochs = para['n_epochs']
        model_save = para['model_save']
    # Must be bound on every path: it is read further down even when no
    # pre-trained vectors were requested.
    use_pretrained = False
    if embvec == 1:
        embvec = torchtext.vocab.GloVe(name='840B', dim=300, cache=embvecache)
        use_pretrained = True
    skippable = (AttributeError, TypeError)
    with mlflow.start_run():
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        results = pubmed.query(search_term, max_results=max_records)
        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            # Fields whose access raises are skipped.
            for field in ('title', 'abstract'):
                try:
                    pp[pmed][field] = getattr(art, field)
                except skippable:
                    pass
            # Structured-abstract sections are appended to the abstract.
            for section in ('conclusions', 'methods', 'results'):
                try:
                    pp[pmed]['abstract'] = (pp[pmed]['abstract'] +
                                            getattr(art, section))
                except skippable:
                    pass
            for field in ('keywords', 'authors', 'journal'):
                try:
                    pp[pmed][field] = getattr(art, field)
                except skippable:
                    pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except skippable:
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except skippable:
                pass
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
        artpd = pd.DataFrame.from_dict(pp, orient='index')
        artpda = artpd[artpd.abstract.notnull()].copy()
        # Mask built from artpda itself so its index matches the frame.
        artpda = artpda[artpda.title.notnull()]
        #        artpda.index = pd.Series(artpda.index).apply(lambda x: x[0:8])
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()
        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords
        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)
            # One tag per token: B starts a keyword match, I continues it.
            t1 = ['O'] * (len(doc))
            matched = []
            matn = 0
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = 'I' * (end - start - 1)
                if str(doc[start:end]).lower() not in matched:
                    matn = matn + 1
                    matched.append(str(doc[start:end]).lower())
            # Keywords that never matched the text.
            abskw = []
            for x in t0.keywords:
                if x.lower() not in matched:
                    abskw.append(x)
            dataf.loc[pmid, 'TRG'] = ' '.join(t1)
            dataf.loc[pmid, 'Extracted'] = matn
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")
        datatrain = dataf[dataf['Extracted'] >= 3].copy()
        # separate train and validate
        dtrain = datatrain.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        seed = 250
        idx = np.arange(datatrain.shape[0])
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.shuffle(idx)
        val_size = int(len(idx) * val_ratio)
        df_train = dtrain.iloc[idx[val_size:], :]
        df_val = dtrain.iloc[idx[:val_size], :]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        # Load original dataset
        datai = artpda.copy()
        datai = datai[datai.abstract.notnull()]
        datai = datai[datai.title.notnull()]
        datai = datai.replace('\n', ' ', regex=True)
        datai = datai.replace('\t', ' ', regex=True)
        # Explicit copies: a column is added to dataiu just below.
        dataiu = datai.loc[datai.keywords.str.len() == 0].copy()
        dataik = datai.loc[datai.keywords.str.len() > 0].copy()
        dataiu['SRC'] = dataiu.title + ' ' + dataiu.abstract
        tokenizertrg = lambda x: x.split()

        def tokenizersrc(text):  # create a tokenizer function
            return [tok.text for tok in svoc.tokenizer(text)]

        def safe_value(field_val):
            return field_val if not pd.isna(field_val) else "Other"

        def safe_year(field_val):
            return field_val if not pd.isna(field_val) else 1900

        TEXT = torchtext.data.Field(init_token='<bos>',
                                    eos_token='<eos>',
                                    sequential=True,
                                    lower=False)
        LABEL = torchtext.data.Field(init_token='<bos>',
                                     eos_token='<eos>',
                                     sequential=True,
                                     unk_token=None)
        fields = [('text', TEXT), ('label', LABEL)]
        device = 'cuda'
        train_examples = read_data(df_train, fields, tokenizersrc,
                                   tokenizertrg)
        valid_examples = read_data(df_val, fields, tokenizersrc, tokenizertrg)
        # Load the pre-trained embeddings that come with the torchtext library.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            TEXT.build_vocab(train_examples, vectors=embvec)
        else:
            print('We are training word embeddings from scratch.')
            TEXT.build_vocab(train_examples, max_size=5000)
        LABEL.build_vocab(train_examples)
        # Create one of the models defined above.
        #self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
        model0 = RNNCRFTagger(TEXT,
                              LABEL,
                              rnnsize,
                              emb_dim=300,
                              update_pretrained=False)

        model0.to(device)
        optimizer = torch.optim.Adam(model0.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
        train(train_examples, valid_examples, embvec, TEXT, LABEL, device,
              model0, batchsize, optimizer, n_epochs)
        out2 = evaltest2(df_val, df_val_k, model0, tokenizersrc, fields,
                         device)
        ttp3 = kphperct(df_val_k, out2, svoc)
        mlflow.log_param("epochs", n_epochs)
        mlflow.pytorch.save_model(model0, model_save)
        mlflow.log_metric("extraction_rate", ttp3.mean())
        # NOTE(review): evaltest2 gets six arguments above and five here -
        # confirm against its definition.
        augout = evaltest2(dataiu, model0, tokenizersrc, fields, device)
        klist = kphext2(dataiu.SRC, augout, svoc)
        for i in range(len(dataiu.index)):
            # presumably column 2 is 'keywords' - verify the column order
            dataiu.iloc[i, 2].extend(list(set(klist[i])))
        output = pd.concat([dataik, dataiu], join="inner")
        output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')
        if es == 1:
            output['journal'] = output['journal'].apply(safe_value)
            output['conclusions'] = output['conclusions'].apply(safe_value)
            output['pubdate'] = output['pubdate'].apply(safe_year)
            output['PMID'] = output.index
            test_server = [{'host': '127.0.0.1', 'port': 9200}]
            # Separate name so the ``es`` flag is not overwritten.
            es_client = Elasticsearch(test_server, http_compress=True)
            use_these_keys = [
                'PMID', 'title', 'abstract', 'keywords', 'authors', 'pubdate'
            ]

            def filterKeys(document):
                return {key: document[key] for key in use_these_keys}

            def doc_generator(df):
                for _, document in df.iterrows():
                    yield {
                        "_index": 'ms',
                        "_source": filterKeys(document),
                    }

            helpers.bulk(es_client, doc_generator(output))
        print(ttp3.mean())
Exemple #23
0
import pprint
import pandas as pd
import numpy as np
import xmltodict
from xml.etree import ElementTree
from pymed import PubMed
from Bio import Entrez
import plotly
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

# https://stackoverflow.com/questions/57053378/query-pubmed-with-python-how-to-get-all-article-details-from-query-to-pandas-d
pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")
# max_results might need to be raised in future; kept low for now.
results = pubmed.query("nhsx[affiliation]", max_results=500)
articleInfo = []

# Convert every returned record into a plain dictionary so the fields can be
# read by key further down.
articleList = [article.toDict() for article in results]

# Generate a list of dict records holding all the article details that could be fetched from the PubMed API
for article in articleList:
    # Sometimes article['pubmed_id'] holds several ids separated by newlines - keep only the first one
    pubmedId = article["pubmed_id"].partition("\n")[0]  # keep only pubmed id
    # Append article info to dictionary #
    articleInfo.append({
Exemple #24
0
from pymed import PubMed

# Create the PubMed client used to run the search
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="MyTool", email="*****@*****.**")

# Search expression, written in plain text
query = '(("2018/05/01"[Date - Create] : "3000"[Date - Create])) AND (Xiaoying Xian[Author] OR diabetes)'

# Execute the query against the API
results = pubmed.query(query, max_results=500)

# Loop over the retrieved articles
for article in results:

    # Extract and format information from the article
    article_id = article.pubmed_id
    title = article.title
    # Recompute the keyword string for every article: a result without
    # keywords must print an empty list rather than fail or reuse the
    # keywords of the previous article. None entries are dropped.
    raw_keywords = getattr(article, "keywords", None) or []
    keywords = '", "'.join(kw for kw in raw_keywords if kw is not None)
    publication_date = article.publication_date
    abstract = article.abstract

    # Show information about the article
    print(
        f'{article_id} - {publication_date} - {title}\nKeywords: "{keywords}"\n{abstract}\n'
    )
    def get_pubmed_data(self,
                        query,
                        searched_zipcode,
                        date,
                        maximum_number_of_value=3):
        """Query PubMed and dump author/affiliation details of the hits to a CSV file.

        Args:
            query: Search expression passed to ``PubMed.query``.
            searched_zipcode: String; when it consists of decimal digits only,
                it is converted to ``int`` and articles are matched against it
                through ``self.has_match_zipcode_of_authprs``.
            date: String in ``%Y/%m/%d`` format, used to build the CSV name.
            maximum_number_of_value: ``max_results`` given to the query.

        Per-author values of one article are joined with ``"||"``. A row is
        stored only while ``self.is_us`` is truthy, and the flag is reset to
        ``False`` after each stored row.
        NOTE(review): ``self.is_us`` is presumably set by the address/zipcode
        helpers called here - confirm against the rest of the class.

        Nothing is returned; when at least one row was collected the data is
        written to ``PubMedData_From_<YYYY_MM_DD>.csv``.
        """
        csv_data = {
            "affiliation": [],
            "number_of_authors": [],
            "authors_name": [],
            "authors_institute": [],
            "authors_address": [],
            "authors_zipcode": [],
            "paper_title": [],
            "publication_date": [],
            "journal": []
        }
        pubmed = PubMed(tool="MyTool", email="*****@*****.**")
        parser = Parser()

        results = pubmed.query(query, max_results=maximum_number_of_value)
        is_queried_by_zipcode = searched_zipcode.isdecimal()

        if is_queried_by_zipcode:
            searched_zipcode = int(searched_zipcode)

        for article in results:
            jsonData = json.loads(article.toJSON())
            authors_list = jsonData['authors']
            num_authors = len(authors_list)

            if is_queried_by_zipcode:
                counted_matched = self.has_match_zipcode_of_authprs(
                    authors_list, searched_zipcode)
                if not counted_matched > 0:
                    # NOTE(review): this stops at the first article without a
                    # zipcode match instead of skipping it; kept as in the
                    # original - confirm whether `continue` was intended.
                    break

            names = []
            institutes = []
            affiliations = []
            addresses = []
            zipcodes = []
            for author in authors_list:
                affiliation = author["affiliation"] or "<NOT_AVAILABLE>"
                zipcodes.append(
                    str(self.get_address_with_zipcode(affiliation)))
                # Join only the name parts that are present so a missing
                # first or last name no longer raises; fall back to the
                # placeholder when both are missing.
                name_parts = [
                    part for part in (author['firstname'], author["lastname"])
                    if part
                ]
                names.append(" ".join(name_parts) or "<NOT_AVAILABLE>")
                institutes.append(
                    self.get_organization(affiliation=affiliation) + " ")
                affiliations.append(affiliation)
                addresses.append(str(parser.parse(affiliation)))

            paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
            publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
            journal = jsonData['journal'] or "<NOT_AVAILABLE>"

            if self.is_us:
                csv_data["authors_name"].append("||".join(names))
                csv_data["affiliation"].append("||".join(affiliations))
                csv_data["authors_institute"].append("||".join(institutes))
                csv_data["paper_title"].append(paper_title)
                csv_data["publication_date"].append(publication_date)
                csv_data["journal"].append(journal)
                csv_data["authors_address"].append("||".join(addresses))
                csv_data["number_of_authors"].append(num_authors)
                csv_data["authors_zipcode"].append("||".join(zipcodes))
                self.is_us = False

        print("Size of csv ", len(csv_data["paper_title"]))
        if len(csv_data["paper_title"]) > 0:
            df = pd.DataFrame(csv_data)
            print(df.head())
            datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
            csv_file_name = "PubMedData_From_" + datetimeobject.strftime(
                '%Y_%m_%d') + ".csv"
            print(csv_file_name)
            df.to_csv(csv_file_name, index=False)
Exemple #26
0
class query(object):
    """Build a PubMed search from command-line flags, run it and export the hits.

    The settings are read from the module-level ``args`` namespace.
    ``buildQuery`` assembles the search expression, ``runQuery`` executes it
    and writes the articles to the Excel file named by ``args.oFile``.
    """

    # * Store Flags
    def __init__(self):
        """Copy the relevant flags from ``args`` onto the instance."""
        # Positional Arguments
        self.oFile = args.oFile
        # Flags for information requested by Pubmed API
        self.email = args.email
        self.tool = args.tool
        # All other flags used to build the query
        self.author1 = args.author1
        self.authors = args.authors
        self.title = args.title
        self.terms = args.terms
        self.userquery = args.userquery
        self.psYear = args.pubSinceYear
        self.psLast = args.pubSinceLast
        self.maxResults = args.maxResults

    # * Build Object
    def buildQuery(self):
        """Create the PubMed client and assemble ``self.query``.

        Every supplied criterion becomes one clause; the clauses and the
        mandatory creation-date range are combined with ' AND '. In author
        names '#' stands for a space.
        """
        # Build Object and send some info to PubMed by their request
        # Note that the parameters below are not required but kindly requested by PubMed Central
        # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
        self.pubmed = PubMed(tool=self.tool, email=self.email)

        # * Create query to feed into Pubmed
        clauses = []
        # First author
        if self.author1 is not None:
            if '#' in str(self.author1):
                self.author1 = str(self.author1).replace('#', ' ')
            # [2:-2] strips the two leading and two trailing characters of
            # the string form (presumably the "['" / "']" of a one-element
            # list - verify against the argument parser).
            clauses.append(str(self.author1)[2:-2] + ' [1au]')
        # Authors
        if self.authors is not None:
            for author in self.authors.split(' '):
                clauses.append(author.replace('#', ' ') + ' [auth]')
        # Title
        if self.title is not None:
            for tword in self.title:
                clauses.append(tword + ' [ti]')
        # Terms
        if self.terms is not None:
            clauses.extend(self.terms.split(' '))
        # User query
        if self.userquery is not None:
            clauses.append(str(self.userquery)[2:-2])

        # Calculate what the start date is for articles to be included based on user settings
        if self.psLast is not None:
            # Only include articles published in the last <x> years
            self.dYa = datetime.now() - relativedelta(years=int(self.psLast))
            self.dayYearsAgo = self.dYa.date().isoformat().replace('-', '/')
            self.dYaQuery = '(' + self.dayYearsAgo + '[Date - Create] : "3000"[Date - Create])'
        else:
            self.dYaQuery = '("' + self.psYear + '/01/01"[Date - Create] : "3000"[Date - Create])'
        clauses.append(self.dYaQuery)
        self.query = ' AND '.join(clauses)

        # Announce created query for verification:
        print(f'''
        This is your query:
        {self.query}
        ''')

    @staticmethod
    def _formatAuthors(authors):
        """Render author dicts as ' Last, First;' chunks, using 'NA' for missing names."""
        chunks = []
        for author in authors:
            last = author['lastname']
            first = author['firstname']
            if last is None:
                last = 'NA'
            if first is None:
                first = 'NA'
            chunks.append(' ' + last + ', ' + first + ';')
        return ''.join(chunks)

    def runQuery(self):
        """Execute ``self.query`` and export the articles to ``self.oFile``.

        One record more than ``maxResults`` is requested so that an overflow
        can be detected. On overflow or on an empty result only a message is
        printed. Otherwise the articles are stored in ``self.output`` (keyed
        by PMID) and ``self.DF`` and written to the 'PMquery' sheet; a single
        hit is additionally echoed to the shell.
        """
        limit = int(self.maxResults)

        # Fetch once and keep the records: the same list serves both the
        # count and the extraction, so PubMed is not queried a second time.
        self.results = list(
            self.pubmed.query(self.query, max_results=limit + 1))

        # Make dictionary to store data
        self.output = {}
        self.nResults = len(self.results)

        # Check if there are more than <n> results
        if self.nResults > limit:
            print('More than ' + str(self.maxResults) + ' results found')
            return
        if self.nResults == 0:
            print('No results found')
            return

        # Print number of results
        print(str(self.nResults) + ' result(s) obtained.')

        # Loop over the retrieved articles
        for article in self.results:
            # Extract and format information from the article
            article_id = article.pubmed_id.split()[0]
            # Not every result object has a journal attribute.
            journal = article.journal if hasattr(article, 'journal') else 'NA'

            # Add results to the dictionary
            self.output[article_id] = [
                article_id, article.title,
                self._formatAuthors(article.authors), journal,
                article.publication_date, article.abstract
            ]

        # Put data in a dataframe after extraction
        self.DF = pd.DataFrame.from_dict(self.output)
        self.DF = self.DF.T
        self.DF = self.DF.reset_index(drop=True)  # Remove row names
        self.DF.columns = [
            "PMID", "Title", "Authors", "Journal", "PubDate", "Abstract"
        ]

        # Save to Excel
        self.writer = pd.ExcelWriter(self.oFile, engine='xlsxwriter')
        self.DF.to_excel(self.writer, sheet_name='PMquery', index=False)
        self.workbook = self.writer.book
        self.worksheet = self.writer.sheets['PMquery']

        # Formatting
        self.format = self.workbook.add_format({
            'text_wrap': True,
            'align': 'top'
        })
        self.worksheet.set_column('A:A', 9, self.format)
        self.worksheet.set_column('B:C', 22, self.format)
        self.worksheet.set_column('D:E', 11, self.format)
        self.worksheet.set_column('F:F', 58, self.format)
        # close() writes the file; ExcelWriter.save() no longer exists in
        # pandas 2.x.
        self.writer.close()

        # If there is only one result, also return information to the shell
        if self.nResults == 1:
            key = next(iter(self.output))
            print('\n ----------------------------------\n', 'PMID:      ',
                  self.output[key][0], '\n', 'Title:     ',
                  self.output[key][1], '\n', 'Authors:   ',
                  self.output[key][2], '\n', 'Journal:   ',
                  self.output[key][3], '\n', 'Published: ',
                  self.output[key][4], '\n',
                  '----------------------------------\n')
from tqdm import tqdm
from pymed import PubMed
import pickle

# change the query with your own ! [CHANGE QUERY]
pubmed = PubMed(tool="OPLR", email="*****@*****.**")
results = pubmed.query(
    '"trans women" OR "trans woman" OR "trans man" OR "trans men" OR "transwoman" OR "transwomen" OR "transmen" OR "transman" OR "transgender" OR "transsexual" OR "transgenderism" OR "transsexuality" OR "transsexualism"',
    max_results=20000)

mydict = {"pubs": {}, "labels": {}}

# Here we construct an OPLR dictionary file based on the pubmed API :
# here are the fields; only fields present in both articles and books are used. (copyrights is not used)
# BOTH : "pubmed_id" "title" "abstract" "publication_date" "authors" "copyrights" "doi"
# ARTI : "keywords" "journal" "methods" "conclusions" "results" "xml"
# BOOK : "doi" "isbn" "language" "publication_type" "sections" "publisher" "publisher_location"
i = 0
for pub in tqdm(results):
    i += 1
    authors = []
    for a in pub.authors:
        try:
            name = a['initials'] + " " + a['lastname']
        except TypeError:
            name = ""
        authors.append(name)

    try:
        date = pub.publication_date.strftime("%Y-%m-%d")
    except AttributeError: