Exemple #1
0
def main():
    """Run the interactive tweet bot until the user chooses to stop.

    Every round logs in, asks for a mode and then either

    1. prints the first 15 rows of the tweet data frame, or
    2. queries PubMed for a user-supplied topic and tweets up to five
       distinct, randomly chosen articles from the results.

    Any other input (including non-numeric input) is reported as invalid.
    """
    while True:
        login(load_account_data())

        try:
            mode = int(input("Select a mode (1 - Data test | 2 - Post): "))
        except ValueError:
            # Non-numeric input is handled like any other invalid mode.
            mode = None

        if mode == 1:
            data = pre_frame_all_tweets()
            tweet_frame = pd.DataFrame(data)

            print(tweet_frame.head(15))
        elif mode == 2:
            topic = input("What would you like to post about?: ")
            pubmed = PubMed(tool=str(os.getenv("APP_NAME")), email=str(os.getenv("APP_EMAIL")))
            # The article lists are local so that a second round never
            # re-posts articles collected during an earlier round.
            articles = list(pubmed.query(topic, max_results=100))

            if articles:
                # sample() picks without replacement, so the five (or fewer)
                # chosen articles are always distinct.
                selected_articles = random.sample(articles, min(5, len(articles)))
                for art in selected_articles:
                    tweet = build_tweet(art)
                    send_tweets(tweet)
            else:
                print("No articles found for that topic.")
        else:
            print("Invalid input! Use 1 or 2!")

        if input("Run again? (Y/N): ").capitalize() != "Y":
            print("Exiting!")
            break
def get_abstract_from_pubmed(query):
    """Return a formatted text summary of the first PubMed hit for *query*.

    The summary has the form
    ``"<pubmed_id> - <publication_date> - <title>\\n \\n<abstract>\\n"``.

    Only the first article of the result set is used; ``None`` is returned
    when the query yields no article at all.
    """
    # The tool/email parameters are not required but kindly requested by
    # PubMed Central: https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
    pubmed = PubMed(tool="MyTool", email="*****@*****.**")

    # Short pauses around the request avoid "too many requests" errors.
    time.sleep(0.1)
    results = pubmed.query(query, max_results=500)
    time.sleep(0.1)

    first_article = next(iter(results), None)
    if first_article is None:
        return None

    time.sleep(0.2)

    # NOTE(review): only the first hit is ever formatted although up to 500
    # are requested - confirm whether callers expect every article instead.
    return (
        f'{first_article.pubmed_id} - {first_article.publication_date} - '
        f'{first_article.title}\n \n{first_article.abstract}\n')
Exemple #3
0
def pubmed(title_keywords, n=500, docs=False):
    '''Get articles with meta-data from PubMed

    pubs = pubmed('nutrition')

    title_keywords : str
        The string to be searched for in the title of the
        articles.
    n : int
        Maximum number of articles to request.
    docs : bool
        Instead of dataframe with multiple columns,
        just return abstracts as a list of lists.

    Returns a DataFrame with the columns title, journal,
    publication_date, keywords and abstract (empty, but with those
    columns, when nothing matches), or a list of one-element lists of
    abstracts when ``docs`` is true.

    '''

    from pymed import PubMed
    import json
    import pandas as pd

    columns = ['title', 'journal', 'publication_date', 'keywords', 'abstract']

    client = PubMed(tool="literview", email="*****@*****.**")
    results = client.query(title_keywords + "[Title]", max_results=n)

    rows = []
    for article in results:
        record = json.loads(article.toJSON())
        rows.append([
            record['title'],
            # Not every record carries a journal or keywords entry.
            record.get('journal', ''),
            record['publication_date'],
            record.get('keywords', []),
            record['abstract'],
        ])

    # Passing the column names to the constructor also works for zero rows,
    # where assigning ``.columns`` afterwards raises a length mismatch.
    out = pd.DataFrame(rows, columns=columns)

    if docs:
        out = [[doc] for doc in out.abstract.values]

    return out
Exemple #4
0
def querysave(search_term, max_records, save_json, inputfile):
    """Query PubMed inside an MLflow run and dump the article fields to JSON.

    When ``inputfile`` equals 1, ``search_term``, ``max_records`` and
    ``save_json`` are replaced by the values of the dict literal stored in
    ``input.txt``. The result, keyed by PubMed id, is written to the path
    given by ``save_json``.
    """
    if inputfile == 1:
        with open("input.txt", "r") as handle:
            settings = ast.literal_eval(handle.read())
        search_term = settings['search_term']
        max_records = settings['max_records']
        save_json = settings['save_json']

    with mlflow.start_run():
        client = PubMed(tool="AlphabetH", email="*****@*****.**")
        collected = defaultdict(lambda: defaultdict(dict))

        for art in client.query(search_term, max_results=max_records):
            entry = collected[art.pubmed_id]

            # Attributes that are missing on an article are simply skipped.
            for field in ('title', 'abstract'):
                try:
                    entry[field] = getattr(art, field)
                except (AttributeError, TypeError):
                    pass

            # Append the structured sections to the abstract where both
            # sides are present and can be concatenated.
            for section in ('conclusions', 'methods', 'results'):
                try:
                    entry['abstract'] = entry['abstract'] + getattr(art, section)
                except (AttributeError, TypeError):
                    pass

            for field in ('keywords', 'authors', 'journal'):
                try:
                    entry[field] = getattr(art, field)
                except (AttributeError, TypeError):
                    pass

            try:
                entry['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass

            try:
                entry['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass

        with open(save_json, 'w') as sink:
            json.dump(collected, sink)
def _pubmed_rows_for_drug(pubmed, drug, query):
    '''Run one PubMed query and return its result rows for *drug* (may be empty).'''
    rows = []
    for res in pubmed.query(query, max_results=5):
        D = res.toDict()
        raw_id = D.get('pubmed_id')
        if isinstance(raw_id, str):
            # The id field may hold several newline-separated ids.
            pubmedid = ';'.join(smart_strip(i) for i in raw_id.split('\n'))
        else:
            pubmedid = None

        rows.append({'drug': drug,
                     'pubmid': pubmedid,
                     'title': smart_strip(D.get('title')),
                     'journal': smart_strip(D.get('journal')),
                     'abstract': smart_strip(D.get('abstract')),
                     'doi': D.get('doi'),
                     'year': D.get('publication_date')})
    return rows


def PubMedQuery(Inputfile, Outputfile, AdditionalKeyWords, verbose = False):
    '''
    Query PubMed for every drug listed in a text file and save the hits.

    parameters
    ---------------
    Inputfile: str, input file with one drug per line, like './input.txt';
    Outputfile: str, output file, like './test.xlsx' ('.xlsx' is appended
        when the name does not end with it)
    AdditionalKeyWords: str, keywords, like ' "pharmacy chemistry biology" '
    verbose: bool, write every executed query next to the progress bar

    Up to five hits per drug are collected. A drug without hits, or whose
    query keeps failing, still gets a row containing only its name. The
    table is written to the Excel file and pickled to '.temp.pkl'.
    '''
    # A failing query is retried a few times instead of forever.
    max_attempts = 3

    with open(Inputfile, 'r') as f:
        # Blank lines would otherwise be sent as keyword-only queries.
        drugs = [line.strip() for line in f if line.strip()]

    keywords = smart_strip(AdditionalKeyWords)
    pubmed = PubMed(tool="Query-Pubmed-Toolbox", email="*****@*****.**")
    al = []
    with tqdm(total = len(drugs), ascii=True) as pbar:
        for drug in drugs:
            query = drug + ' ' + keywords
            rows = None
            for attempt in range(1, max_attempts + 1):
                # Pause before every request to stay below the rate limit.
                time.sleep(0.5)
                try:
                    rows = _pubmed_rows_for_drug(pubmed, drug, query)
                    break
                except Exception:
                    logging.warning('query failed for %s (attempt %d of %d)',
                                    query, attempt, max_attempts,
                                    exc_info=True)

            if rows:
                al.extend(rows)
            else:
                if rows is not None:
                    logging.warning('not found for %s %s', drug, AdditionalKeyWords)
                # Keep a placeholder row so the drug still shows up.
                al.append({'drug': drug})

            pbar.update(1)
            if verbose:
                pbar.write('Query: %s' % query)

    df = pd.DataFrame(al)
    if not Outputfile.endswith('.xlsx'):
        Outputfile = Outputfile + '.xlsx'
    sdf = df.style.apply(hightlight_null, axis=1)
    sdf.to_excel(Outputfile)
    df.to_pickle('.temp.pkl')
def query_pubmed(search_term, max_results=5000):
    ''' Query PubMed through pymed and return each hit as a dictionary. '''
    client = PubMed(tool='MyTool', email='')
    hits = client.query(search_term, max_results=max_results)
    return [hit.toDict() for hit in hits]
Exemple #7
0
    def buildQuery(self):
        """Create the PubMed client and assemble the search query string.

        Sets ``self.pubmed``, ``self.query`` and ``self.dYaQuery`` (plus
        ``self.dYa`` and ``self.dayYearsAgo`` when ``psLast`` is given) and
        prints the final query. All clauses are combined with ``AND``. When
        neither ``psLast`` nor ``psYear`` is set, no date restriction is
        added and ``self.dYaQuery`` is the empty string.
        """
        # The tool/email parameters are not required but kindly requested by
        # PubMed Central: https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
        self.pubmed = PubMed(tool=self.tool, email=self.email)

        # Every entry is one AND-ed clause of the final query.
        clauses = []

        # First author; '#' stands for a space inside a name.
        if self.author1 is not None:
            if '#' in str(self.author1):
                self.author1 = str(self.author1).replace('#', ' ')
            # [2:-2] drops the two leading and two trailing characters of
            # the string form - presumably the "['" / "']" of a one-element
            # list; verify against the argument parser.
            clauses.append(str(self.author1)[2:-2] + ' [1au]')
        # Authors
        if self.authors is not None:
            for author in self.authors.split(' '):
                clauses.append(author.replace('#', ' ') + ' [auth]')
        # Title
        if self.title is not None:
            for tword in self.title:
                clauses.append(tword + ' [ti]')
        # Terms
        if self.terms is not None:
            clauses.extend(self.terms.split(' '))
        # User query
        if self.userquery is not None:
            clauses.append(str(self.userquery)[2:-2])

        # Work out the earliest creation date of the articles to include.
        if self.psLast is not None:
            # Only include articles published in the last <x> years
            self.dYa = datetime.now() - relativedelta(years=int(self.psLast))
            self.dayYearsAgo = self.dYa.strftime('%Y/%m/%d')
            self.dYaQuery = '(' + self.dayYearsAgo + '[Date - Create] : "3000"[Date - Create])'
        elif self.psYear is not None:
            self.dYaQuery = '("' + str(self.psYear) + '/01/01"[Date - Create] : "3000"[Date - Create])'
        else:
            self.dYaQuery = ''

        if self.dYaQuery:
            clauses.append(self.dYaQuery)
        self.query = ' AND '.join(clauses)

        # Announce created query for verification:
        print(f'''
        This is your query:
        {self.query}
        ''')
Exemple #8
0
def main():
    """Download coronavirus-related PubMed articles, one JSON file each.

    The output folder ``<cwd>/../../corpus/pubmed/json`` is recreated from
    scratch on every run. Articles without an abstract are skipped; every
    other article is written to ``<pubmed_id>.json`` and a running count is
    printed.
    """
    # Setup output folder
    output_folder = Path.cwd().parent.parent / 'corpus' / 'pubmed' / 'json'
    if output_folder.exists():
        shutil.rmtree(output_folder)
    # parents=True: the intermediate 'corpus/pubmed' folders may not exist.
    output_folder.mkdir(parents=True)

    # Create the PubMed client used to run the query
    pubmed = PubMed(tool="DavidCampos", email="*****@*****.**")

    # PubMed search expression in plain text
    query = ("(\"2000\"[Date - Publication] : \"3000\"[Date - Publication]) AND "
             "((COVID-19) OR (Coronavirus) OR (Corona virus) OR (2019-nCoV) OR "
             "(SARS-CoV) OR (MERS-CoV) OR (Severe Acute Respiratory Syndrome) OR "
             "(Middle East Respiratory Syndrome) OR "
             "(2019 novel coronavirus disease[MeSH Terms]) OR (2019 novel coronavirus infection[MeSH Terms]) OR "
             "(2019-nCoV disease[MeSH Terms]) OR (2019-nCoV infection[MeSH Terms]) OR "
             "(coronavirus disease 2019[MeSH Terms]) OR (coronavirus disease-19[MeSH Terms]))")

    # Execute the query against the API
    results = pubmed.query(query, max_results=1000000)

    # Loop over the retrieved articles
    counter = 0
    for article in results:
        # Discard if abstract empty
        if article.abstract is None or article.abstract == "":
            continue

        # Keep only the text before the first newline of the id field
        pubmed_id = article.pubmed_id.partition('\n')[0]
        article.pubmed_id = pubmed_id

        # Get article as dict
        article_dict = article.toDict()

        # Write article to JSON
        with open(output_folder / (pubmed_id + ".json"), 'w') as outfile:
            json.dump(article_dict, outfile, default=date_converter)
        counter += 1
        print(counter)
Exemple #9
0
def get_corpus(output_dir='.'):
    """Fetch relevant 'glycan' abstracts from PubMed and write two corpus files.

    ``glyco_corpus.json`` maps each PubMed id to its abstract;
    ``glyco_corpus.txt`` holds the sentence-tokenized abstracts combined by
    ``_join_lines``. Both files are created inside ``output_dir``, which
    must already exist.
    """
    assert os.path.exists(output_dir)
    pmed = PubMed()
    # Read id and abstract from the same article in a single pass. Pulling
    # them through two separate map() objects over one shared iterator
    # would pair each id with the abstract of the *next* article.
    results = {
        article.pubmed_id: article.abstract
        for article in pmed.query('glycan', max_results=100000)
        if _is_relevant(article)
    }
    print('Fetched {} results'.format(len(results)))
    print('Writing .json file')
    with open(os.path.join(output_dir, 'glyco_corpus.json'), 'w+') as outfile:
        json.dump(results, outfile)
    print('Tokenizing sentences')
    sentences = [
        sentence
        for abstract in results.values()
        for sentence in sent_tokenize(abstract)
    ]
    # reduce() without an initial value raises on an empty sequence.
    results_txt = reduce(_join_lines, sentences) if sentences else ''
    print('Writing .txt file')
    with open(os.path.join(output_dir, 'glyco_corpus.txt'), 'w+') as outfile:
        outfile.write(results_txt)
Exemple #10
0
#Pubmed search for articles on HIV in African American women
import numpy as np
import pandas as pd
import pymed
from pymed import PubMed

# Client used for all PubMed requests of this script.
pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")

# Search terms go between the quotes.
search_term = "HIV Viral Load African American"
results = pubmed.query(search_term, max_results=500)

# Every retrieved article converted to a plain dictionary.
articleList = [article.toDict() for article in results]
articleInfo = []

# Create a dict list of articles from PUBMED API
for article in articleList:
    pubmedId = article['pubmed_id'].partition('\n')[0]
    # Append article info to dictionary with fields you wish to collect
    articleInfo.append({
        u'pubmed_id': pubmedId,
        u'title': article['title'],
        u'keywords': article['keywords'],
        u'journal': article['journal'],
        u'abstract': article['abstract'],
        # u'conclusions':article['conclusions'],
        # u'methods':article['methods'],
Exemple #11
0
import pandas as pd
#the pymed library is used to query the PubMed database and acquire article info
from pymed import PubMed
# Client used for all PubMed requests of this script.
pubmed = PubMed(tool="PubMedRetriever", email="*****@*****.**")

# The term to search PubMed for; the query is capped at 100000 results.
search_term = "covid-19"
search_results = pubmed.query(search_term, max_results=100000)

# Marker tokens used to identify and separate the fields of interest.
startdoi, enddoi = "|start_doi|", "|end_doi|"
startpubmedid, endpubmedid = "|start_pid|", "|end_pid|"
startpdate, endpdate = "|start_pd|", "|end_pd|"

# Lists used to save the acquired data.
article_details = []
abstracts = []
qualitydata = []

# Convert every article returned for the search term to a dictionary.
article_list = [article.toDict() for article in search_results]

# Generate list of dictionary records which will hold all article details that could be fetched from PUBMED API
for article in article_list:
    #get article pubmed ID
Exemple #12
0
def mainpipe(inputfile, search_term, max_records, json_out, embvec, embvecache,
             val_ratio, rnnsize, batchsize, lr, weight_decay, n_epochs,
             model_save, es):
    """Fetch PubMed articles, train a keyphrase tagger and augment keywords.

    Everything runs inside one MLflow run:

    1. Query PubMed for ``search_term`` and collect the article fields.
    2. Tag title + abstract of every article that has keywords with B/I/O
       labels by phrase-matching its keywords.
    3. Train an ``RNNCRFTagger`` on the articles with at least three matched
       keywords, holding out ``val_ratio`` of them for validation.
    4. Predict keyphrases for the articles without keywords, add them to
       those articles and write the merged table to JSON.
    5. When ``es == 1``, bulk-index the result into a local Elasticsearch.

    Parameters
    ----------
    inputfile : int
        When 1, most settings below are replaced by the values of the dict
        literal stored in ``input.txt``.
    search_term, max_records
        PubMed query and maximum number of records to fetch.
    json_out
        NOTE(review): not used in this function - the result is written to
        a hard-coded path; confirm whether it should be used instead.
    embvec, embvecache
        ``embvec == 1`` loads the GloVe 840B/300d vectors (cached in
        ``embvecache``); any other value trains embeddings from scratch.
    val_ratio, rnnsize, batchsize, lr, weight_decay, n_epochs
        Validation share and training hyper-parameters.
    model_save
        Path handed to ``mlflow.pytorch.save_model``.
    es : int
        1 enables the Elasticsearch indexing step.
    """
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        embvec = para['embvec']
        embvecache = para['embvecache']
        val_ratio = para['val_ratio']
        rnnsize = para['rnnsize']
        batchsize = para['batchsize']
        lr = para['lr']
        weight_decay = para['weight_decay']
        n_epochs = para['n_epochs']
        model_save = para['model_save']
    # Must be bound on every path: it is read further down even when no
    # pre-trained vectors were requested.
    use_pretrained = False
    if embvec == 1:
        embvec = torchtext.vocab.GloVe(name='840B', dim=300, cache=embvecache)
        use_pretrained = True
    with mlflow.start_run() as mlrun:
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        query = search_term
        results = pubmed.query(query, max_results=max_records)
        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            rec = pp[art.pubmed_id]
            # Attributes that are missing on an article are simply skipped.
            for name in ('title', 'abstract'):
                try:
                    rec[name] = getattr(art, name)
                except (AttributeError, TypeError):
                    pass
            # Append the structured sections to the abstract where possible.
            for name in ('conclusions', 'methods', 'results'):
                try:
                    rec['abstract'] = rec['abstract'] + getattr(art, name)
                except (AttributeError, TypeError):
                    pass
            for name in ('keywords', 'authors', 'journal'):
                try:
                    rec[name] = getattr(art, name)
                except (AttributeError, TypeError):
                    pass
            try:
                rec['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                rec['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
        artpd = pd.DataFrame.from_dict(pp, orient='index')
        artpda = artpd[artpd.abstract.notnull()].copy()
        # Build the mask from the frame that is being filtered.
        artpda = artpda[artpda.title.notnull()]
        #        artpda.index = pd.Series(artpda.index).apply(lambda x: x[0:8])
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()
        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords
        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)
            # One tag per token: 'B' starts a keyword match, 'I' continues
            # it, 'O' marks everything else.
            t1 = ['O'] * (len(doc))
            matched = []
            matn = 0
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = 'I' * (end - start - 1)
                if str(doc[start:end]).lower() not in matched:
                    matn = matn + 1
                    matched.append(str(doc[start:end]).lower())
            # Keywords that never occur in title + abstract.
            abskw = []
            for x in t0.keywords:
                if x.lower() not in matched:
                    abskw.append(x)
            dataf.loc[pmid, 'TRG'] = ' '.join(t1)
            dataf.loc[pmid, 'Extracted'] = matn
            dataf.loc[pmid, 'abskey'] = abskw
            # The patterns are per article, so reset the matcher each time.
            matcher.remove("Names")
        datatrain = dataf[dataf['Extracted'] >= 3].copy()
        datatest = dataf[dataf['Extracted'] < 3].copy()
        # separate train and validate
        dtrain = datatrain.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        seed = 250
        idx = np.arange(datatrain.shape[0])
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.shuffle(idx)
        val_size = int(len(idx) * val_ratio)
        df_train = dtrain.iloc[idx[val_size:], :]
        df_val = dtrain.iloc[idx[:val_size], :]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        df_test = datatest.loc[:, ['SRC', 'TRG']]
        # Load original dataset
        datai = artpda.copy()
        datai = datai[datai.abstract.notnull()]
        datai = datai[datai.title.notnull()]
        datai = datai.replace('\n', ' ', regex=True)
        datai = datai.replace('\t', ' ', regex=True)
        # Copy: a column is added to this subset below.
        dataiu = datai.loc[datai.keywords.str.len() == 0].copy()
        dataik = datai.loc[datai.keywords.str.len() > 0]
        dataiu['SRC'] = dataiu.title + ' ' + dataiu.abstract
        tokenizertrg = lambda x: x.split()

        def tokenizersrc(text):  # create a tokenizer function
            return [tok.text for tok in svoc.tokenizer(text)]

        def safe_value(field_val):
            return field_val if not pd.isna(field_val) else "Other"

        def safe_year(field_val):
            return field_val if not pd.isna(field_val) else 1900

        TEXT = torchtext.data.Field(init_token='<bos>',
                                    eos_token='<eos>',
                                    sequential=True,
                                    lower=False)
        LABEL = torchtext.data.Field(init_token='<bos>',
                                     eos_token='<eos>',
                                     sequential=True,
                                     unk_token=None)
        fields = [('text', TEXT), ('label', LABEL)]
        # Fall back to the CPU when no CUDA device is available.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        train_examples = read_data(df_train, fields, tokenizersrc,
                                   tokenizertrg)
        valid_examples = read_data(df_val, fields, tokenizersrc, tokenizertrg)
        # Load the pre-trained embeddings that come with the torchtext library.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            TEXT.build_vocab(train_examples, vectors=embvec)
        else:
            print('We are training word embeddings from scratch.')
            TEXT.build_vocab(train_examples, max_size=5000)
        LABEL.build_vocab(train_examples)
        # Create one of the models defined above.
        #self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
        model0 = RNNCRFTagger(TEXT,
                              LABEL,
                              rnnsize,
                              emb_dim=300,
                              update_pretrained=False)

        model0.to(device)
        optimizer = torch.optim.Adam(model0.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
        train(train_examples, valid_examples, embvec, TEXT, LABEL, device,
              model0, batchsize, optimizer, n_epochs)
        out2 = evaltest2(df_val, df_val_k, model0, tokenizersrc, fields,
                         device)
        ttp3 = kphperct(df_val_k, out2, svoc)
        mlflow.log_param("epochs", n_epochs)
        mlflow.pytorch.save_model(model0, model_save)
        mlflow.log_metric("extraction_rate", ttp3.mean())
        # NOTE(review): this call passes one positional argument fewer than
        # the evaltest2 call above - verify against evaltest2's signature.
        augout = evaltest2(dataiu, model0, tokenizersrc, fields, device)
        klist = kphext2(dataiu.SRC, augout, svoc)
        for i in range(len(dataiu.index)):
            # NOTE(review): column position 2 is presumably 'keywords' -
            # confirm, the order depends on the first fetched record.
            dataiu.iloc[i, 2].extend(list(set(klist[i])))
        output = pd.concat([dataik, dataiu], join="inner")
        output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')
        if es == 1:
            output['journal'] = output['journal'].apply(safe_value)
            output['conclusions'] = output['conclusions'].apply(safe_value)
            output['pubdate'] = output['pubdate'].apply(safe_year)
            output['PMID'] = output.index
            test_server = [{'host': '127.0.0.1', 'port': 9200}]
            # Separate name so the ``es`` flag parameter is not overwritten.
            es_client = Elasticsearch(test_server, http_compress=True)
            use_these_keys = [
                'PMID', 'title', 'abstract', 'keywords', 'authors', 'pubdate'
            ]

            def filterKeys(document):
                return {key: document[key] for key in use_these_keys}

            def doc_generator(df):
                for index, document in df.iterrows():
                    yield {
                        "_index": 'ms',
                        "_source": filterKeys(document),
                    }

            helpers.bulk(es_client, doc_generator(output))
        print(ttp3.mean())
Exemple #13
0
    'root': {
        'handlers': ['console'],
        'level': 'INFO'
    }
}

# Apply the logging configuration defined in the LOGGING dict above.
logging.config.dictConfig(LOGGING)

from pymed import PubMed

# Contact address passed to the PubMed client below.
my_email = "*****@*****.**"

# Create the module-level PubMed client used to run queries.
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="Protein Interaction Text Miner", email=my_email)


class Publication:
    """Metadata of a single PubMed publication."""

    def __init__(self,
                 pubmed_id,
                 title,
                 publication_date,
                 abstract,
                 keywords=""):
        """Store the id, the derived PubMed URL, the title and the date.

        NOTE(review): ``abstract`` and ``keywords`` are accepted but not
        stored by the statements visible here - confirm whether the rest of
        the initializer is missing.
        """

        self.pubmed_id = pubmed_id
        # Only the text before the first newline of the id goes into the URL.
        self.url = "https://www.ncbi.nlm.nih.gov/pubmed/" + pubmed_id.split(
            "\n")[0]
        self.title = title
        self.publication_date = publication_date
import datetime
from typing import List, Union

from pymed import PubMed

from .utils import get_query_from_keywords_and_date, get_emails
from ..utils import dump_papers

# Module-level PubMed client, created once at import time.
PUBMED = PubMed(tool="MyTool", email="*****@*****.**")

# Maps a pymed field name to the name used in the output.
pubmed_field_mapper = {"publication_date": "date"}


def _author_name(author):
    """Return first name and last name of one author dict, concatenated."""
    first = str(author.get("firstname", ""))
    last = str(author.get("lastname", ""))
    return first + last


def _author_names(authors):
    """Return the concatenated name of every author dict in *authors*."""
    return [_author_name(author) for author in authors]


def _date_as_text(date):
    """Format date objects as YYYY-MM-DD; pass every other value through."""
    if isinstance(date, datetime.date):
        return date.strftime("%Y-%m-%d")
    return date


# Fields that need specific processing, keyed by output field name.
process_fields = {
    "authors": _author_names,
    "date": _date_as_text,
}


def get_pubmed_papers(query: str,
                      fields: List = [
                          "title", "authors", "date", "abstract", "journal",
                          "doi"
# NOTE(review): presumably the same 2010-2020 range that is hard-coded in the
# time_ml query string below - confirm where these two values are used.
min_year = 2010
max_year = 2020
# Result accumulators; they are not filled within this section.
total = []
slope = []
# Topic labels.
# NOTE(review): 'renal hypotension' does not match the corresponding query
# below, which searches for renal *hypertension* - confirm the intended label.
label = [
    'Renal Pathology', 'Kidney Transplantation', 'Chronic kidney disease',
    'Acute Kidney Injury', 'Renal Insufficiency', 'renal hypotension',
    'Drug Discovery', 'Immunology', 'Genetic', 'Geriatric',
    'Cardiovascular disease'
]

# Create the PubMed client used to run the queries.
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="MyTool", email="*****@*****.**")

# Query terms.
# time_ml: publication window 2010-2020 combined with the machine-learning /
# AI vocabulary. kidney: the nephrology vocabulary. Every topic query below
# is the topic term AND-ed with one or both of these building blocks.
time_ml = '((2010/1/1[Date - Publication]: 2020/12/31[Date - Publication]) AND ("Artificial General Intelligence" OR "Artificial Intelligence" OR "Autoencoder" OR "auto encoder" OR "Reinforcement learning" OR "AI Governance" OR "Augmented Intelligence" OR "Decision Intelligence" OR "neural network" OR "Data Labeling" OR "Annotation Services" OR "Edge AI" OR "Smart Robotics" OR "Quantum Computing" OR "Digital Ethics" OR "AutoML" OR "Deep Neural" OR "Deep Learning" OR "Deep Network" OR "Convolutional Neural" OR "Graph Neural" OR "Generative Adversarial" OR "Adversarial Learning" OR "Natural Language Processing" OR "Recurrent Neural" OR "Computer Vision" OR "Cognitive Computing" OR "machine learning" OR "random forest" OR "support vector" OR "regression tree" OR "regression splines" OR "artificial neural" OR "Lasso" OR "decision tree" OR "linear regression" OR "bayesian" OR "regression model" OR "regression" OR "Supervised-learning" OR "clustering" OR "Dimensionality reduction" OR "Unsupervised-learning" OR "big-data" OR "data-mining" OR "semi-supervised" OR "self-learning" OR "sparse learning" OR "dictionary learning" OR "Feature learning" OR "Anomaly detection" OR "Robot learning" OR "algorithms" OR "Federated learning" OR "linear model" OR "pattern recognition" OR "information retrieval" OR "game theory" OR "information theory" OR "swarm intelligence" OR "Markov Decision" OR "Markov Random" OR "dynamic programming" OR "multilayer perceptrons" OR "component analysis" OR "Sparse coding" OR "subspace learning" OR "matrix factorization" OR "matrix decomposition" OR "NLP algorithm" OR "K means" OR "computer vision" OR "speech recognition" OR "predictive model" OR "machine learning"))'
# The closing quote after "Kidney Glomerulus/pathology" must be a plain
# ASCII double quote; a typographic quote leaves the phrase unterminated.
kidney = '(("Glomeruli"[All Fields] OR "glomerular"[All Fields] OR "glomerulus"[All Fields] OR "glomerulosclerosis"[All Fields] OR "nephropathology"[All Fields] OR "renal pathology"[All Fields] OR "kidney pathology"[All Fields] OR "renal whole slide"[All Fields] OR "kidney whole slide"[All Fields] OR "renal wholeslide"[All Fields] OR "kidney wholeslide"[All Fields] OR "renal biopsy"[All Fields] OR "kidney biopsy"[All Fields] OR "Kidney/diagnostic imaging"[MAJR] OR "Kidney Glomerulus/pathology"[MAJR] OR "Kidney Diseases/pathology"[MAJR] OR "Kidney/pathology"[MAJR] OR ("Kidney"[MeSH] AND "Biopsy"[MeSH]) OR "Renal Dialysis"[MeSH] OR "Kidney Diseases"[MeSH] OR "Nephrology"[MeSH] OR "Nephrology" OR "Nephrologists"[MeSH] OR "Kidney"[MeSH] OR "Kidney Function Tests"[MeSH] OR "Kidney Function Tests"[MeSH] OR "Kidney Transplantation"[MeSH] OR "Hypertension, Renal"[MeSH] OR "Renal Insufficiency"[MeSH] OR "renal survival" OR "Acute kidney injury" OR "kidney transplantation" OR "kidney disease" OR "CKD" OR "AKI" OR "chronic kidney disease"))'

query_Renal_Pathology = '("Glomeruli"[All Fields] OR "glomerular"[All Fields] OR "glomerulus"[All Fields] OR "glomerulosclerosis"[All Fields] OR "nephropathology"[All Fields] OR "renal pathology"[All Fields] OR "kidney pathology"[All Fields] OR "renal whole slide"[All Fields] OR "kidney whole slide"[All Fields] OR "renal wholeslide"[All Fields] OR "kidney wholeslide"[All Fields] OR "renal biopsy"[All Fields] OR "kidney biopsy"[All Fields] OR "Kidney/diagnostic imaging"[MAJR] OR "Kidney Glomerulus/pathology"[MAJR] OR "Kidney Diseases/pathology"[MAJR] OR "Kidney/pathology"[MAJR] OR ("Kidney"[MeSH] AND "Biopsy"[MeSH]))' + ' AND ' + time_ml
# Every operator is written as ' AND ' (with spaces) so that all topic
# queries are assembled the same way.
query_Kidney_Transplantation = '(Kidney Transplantation)' + ' AND ' + time_ml + ' AND ' + kidney
query_CKD = '("chronic kidney disease" OR "CKD")' + ' AND ' + time_ml + ' AND ' + kidney
query_Acute_Kidney_Injury = '("acute kidney injury" OR "AKI")' + ' AND ' + time_ml + ' AND ' + kidney
query_Renal_Insufficiency = '("Renal Insufficiency")' + ' AND ' + time_ml + ' AND ' + kidney
query_Renal_Hypotension = '("Hypertension, Renal" OR "renal hypertension")' + ' AND ' + time_ml + ' AND ' + kidney
query_Drug_Discovery = '("Drug Discovery")' + ' AND ' + kidney + ' AND ' + time_ml
query_Immunology = '("Immunology")' + ' AND ' + kidney + ' AND ' + time_ml
query_Genetic = '("Genetic")' + ' AND ' + kidney + ' AND ' + time_ml
query_Geriatric = '("Geriatric")' + ' AND ' + kidney + ' AND ' + time_ml
 def _load_query_result(self):
     """Run the PubMed search once and keep its results on the instance.

     Nothing is fetched when ``self._query_result`` already holds a truthy
     value.
     """
     if not self._query_result:
         client = PubMed(tool='Collabovid', email='*****@*****.**')
         hits = client.query(query=self._PUBMED_SEARCH_QUERY,
                             max_results=30000)
         self._query_result = list(hits)
Exemple #17
0
from pymed import PubMed
import json

# Ask for the contact e-mail passed to the PubMed client and for the search text.
email = input("Please enter your email:")
user_input = input("I want to search for...")

pubmed = PubMed(tool="MyTool", email=email)
results = pubmed.query(user_input, max_results=5)

# Convert every returned article into a plain dictionary.
results_list = [found.toDict() for found in results]

# Keep only id, title and abstract. For the id, only the text before the
# first newline is retained.
output = [
    {
        'pubmed_id': record['pubmed_id'].partition('\n')[0],
        'title': record['title'],
        'abstract': record['abstract'],
    }
    for record in results_list
]

# Write the trimmed records as indented JSON.
with open('output_results.json', 'w') as outfile:
    json.dump(output, outfile, indent=4)
    
Exemple #18
0
class query(object):
    """Build a PubMed search from command-line flags, run it and export it.

    Settings are copied from the module-level ``args`` object in
    ``__init__``. ``buildQuery`` assembles the query string and creates the
    PubMed client; ``runQuery`` executes the search, writes the hits to an
    Excel file and, for a single hit, also prints it to the shell.
    """

    # * Store Flags
    def __init__(self):
        """Copy the settings this class needs from the module-level ``args``."""
        # Positional Arguments
        self.oFile = args.oFile
        # Flags for information requested by Pubmed API
        self.email = args.email
        self.tool = args.tool
        # All other flags used to build the query
        self.author1 = args.author1
        self.authors = args.authors
        self.title = args.title
        self.terms = args.terms
        self.userquery = args.userquery
        self.psYear = args.pubSinceYear
        self.psLast = args.pubSinceLast
        self.maxResults = args.maxResults

    # * Build Object
    def buildQuery(self):
        """Create the PubMed client and assemble ``self.query``.

        The individual clauses (first author, authors, title words, terms,
        free user query, creation-date range) are joined with ``' AND '``.
        A ``#`` inside an author name stands for a space.

        Raises:
            ValueError: if neither ``psLast`` nor ``psYear`` is set, because
                no start date can be derived then.
        """
        # Build Object and send some info to PubMed by their request
        # Note that the parameters below are not required but kindly requested by PubMed Central
        # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
        self.pubmed = PubMed(tool=self.tool, email=self.email)

        # * Create query to feed into Pubmed
        clauses = []
        # First author
        if self.author1 is not None:
            if '#' in str(self.author1):
                self.author1 = str(self.author1).replace('#', ' ')
            # [2:-2] strips the first and last two characters of the string
            # form (presumably the "['" / "']" of a one-element list - TODO confirm).
            clauses.append(str(self.author1)[2:-2] + ' [1au]')
        # Authors
        if self.authors is not None:
            for author in self.authors.split(' '):
                clauses.append(author.replace('#', ' ') + ' [auth]')
        # Title
        if self.title is not None:
            clauses.extend(tword + ' [ti]' for tword in self.title)
        # Terms
        if self.terms is not None:
            clauses.extend(self.terms.split(' '))
        # User query
        if self.userquery is not None:
            clauses.append(str(self.userquery)[2:-2])

        # Calculate what the start date is for articles to be included based on user settings
        if self.psLast is not None:
            # Only include articles published in the last <x> years
            self.dYa = datetime.now() - relativedelta(years=int(self.psLast))
            self.dayYearsAgo = self.dYa.strftime('%Y/%m/%d')
            startDate = self.dayYearsAgo
        elif self.psYear is not None:
            # str() so that an integer year works as well as a string
            startDate = str(self.psYear) + '/01/01'
        else:
            raise ValueError(
                'Either pubSinceLast or pubSinceYear is required to build '
                'the date range of the query')
        # Both branches now quote the start date the same way.
        self.dYaQuery = ('("' + startDate +
                         '"[Date - Create] : "3000"[Date - Create])')
        clauses.append(self.dYaQuery)
        self.query = ' AND '.join(clauses)

        # Announce created query for verification:
        print(f'''
        This is your query:
        {self.query}
        ''')

    def runQuery(self):
        """Run ``self.query`` and export the hits.

        One more than ``maxResults`` records are requested so that an
        overflow can be detected. If there are too many or no results, only a
        warning is printed. Otherwise the hits are stored in ``self.output``
        (keyed by PMID), converted to ``self.DF`` and written to the Excel
        file ``self.oFile``; a single hit is additionally printed.
        """
        limit = int(self.maxResults)

        # Execute the query against the API exactly once and keep the records,
        # instead of querying once to count and a second time to read.
        self.results = list(
            self.pubmed.query(self.query, max_results=limit + 1))

        # Make dictionary to store data
        self.output = {}
        self.nResults = len(self.results)

        # Check if there are more than <n> results
        if self.nResults > limit:
            # Show warning
            print('More than ' + str(self.maxResults) + ' results found')
            return
        if self.nResults == 0:
            # Show warning
            print('No results found')
            return

        # Print number of results
        print(str(self.nResults) + ' result(s) obtained.')

        # Loop over the retrieved articles
        for article in self.results:
            # Extract and format information from the article
            article_id = article.pubmed_id.split()[0]
            title = article.title
            authors = article.authors
            publication_date = article.publication_date
            abstract = article.abstract
            journal = getattr(article, 'journal', 'NA')

            # Reshape author list; missing name parts become 'NA'
            authorParts = []
            for author in authors or []:
                last = author['lastname']
                first = author['firstname']
                if last is None:
                    last = 'NA'
                if first is None:
                    first = 'NA'
                authorParts.append(' ' + last + ', ' + first + ';')
            authorString = ''.join(authorParts)

            # Add results to the dictionary
            self.output[article_id] = [
                article_id, title, authorString, journal, publication_date,
                abstract
            ]

        # Put data in a dataframe after extraction
        self.DF = pd.DataFrame.from_dict(self.output)
        self.DF = self.DF.T
        self.DF = self.DF.reset_index(drop=True)  # Remove row names
        self.DF.columns = [
            "PMID", "Title", "Authors", "Journal", "PubDate", "Abstract"
        ]

        # Save to Excel
        self.writer = pd.ExcelWriter(self.oFile, engine='xlsxwriter')
        self.DF.to_excel(self.writer, sheet_name='PMquery', index=False)
        self.workbook = self.writer.book
        self.worksheet = self.writer.sheets['PMquery']

        # Formatting
        self.format = self.workbook.add_format({
            'text_wrap': True,
            'align': 'top'
        })
        self.worksheet.set_column('A:A', 9, self.format)
        self.worksheet.set_column('B:C', 22, self.format)
        self.worksheet.set_column('D:E', 11, self.format)
        self.worksheet.set_column('F:F', 58, self.format)
        # close() writes the file; ExcelWriter.save() no longer exists in
        # pandas 2.x.
        self.writer.close()

        # If there is only one result, also return information to the shell
        if self.nResults == 1:
            key = list(self.output.keys())[0]
            print('\n ----------------------------------\n', 'PMID:      ',
                  self.output[key][0], '\n', 'Title:     ',
                  self.output[key][1], '\n', 'Authors:   ',
                  self.output[key][2], '\n', 'Journal:   ',
                  self.output[key][3], '\n', 'Published: ',
                  self.output[key][4], '\n',
                  '----------------------------------\n')
    def get_pubmed_data(self,
                        query,
                        searched_zipcode,
                        date,
                        maximum_number_of_value=3):
        """Query PubMed and write author/affiliation details to a CSV file.

        Args:
            query: PubMed search string.
            searched_zipcode: string; when it consists of decimal digits only,
                it is converted to ``int`` and used to filter articles via
                ``self.has_match_zipcode_of_authprs``.
            date: string in ``%Y/%m/%d`` format, used only to build the name
                of the CSV file.
            maximum_number_of_value: maximum number of results requested.

        Per-author values of one article are joined with ``"||"``. A row is
        appended only while ``self.is_us`` is truthy, and ``self.is_us`` is
        reset to ``False`` after each appended row (presumably one of the
        helper methods sets it again - TODO confirm). The CSV file is written
        only when at least one row was collected. Returns ``None``.
        """
        csv_data = {
            "affiliation": [],
            "number_of_authors": [],
            "authors_name": [],
            "authors_institute": [],
            "authors_address": [],
            "authors_zipcode": [],
            "paper_title": [],
            "publication_date": [],
            "journal": []
        }
        pubmed = PubMed(tool="MyTool", email="*****@*****.**")
        parser = Parser()

        results = pubmed.query(query, max_results=maximum_number_of_value)
        is_queried_by_zipcode = searched_zipcode.isdecimal()

        if is_queried_by_zipcode:
            searched_zipcode = int(searched_zipcode)

        for article in results:
            jsonData = json.loads(article.toJSON())
            authors_list = jsonData['authors']
            num_authors = len(authors_list)
            counted_matched = 0
            if is_queried_by_zipcode:
                counted_matched = self.has_match_zipcode_of_authprs(
                    authors_list, searched_zipcode)
                if not counted_matched > 0:
                    # NOTE(review): this stops at the first article without a
                    # zipcode match (original behaviour, kept as-is); confirm
                    # whether `continue` was intended.
                    break

            names = []
            institutes = []
            affiliations = []
            addresses = []
            zipcodes = []
            for author in authors_list:
                affiliation = author["affiliation"] or "<NOT_AVAILABLE>"
                zipcodes.append(
                    str(self.get_address_with_zipcode(affiliation)))
                # Previously `first + " " + last or "<NOT_AVAILABLE>"`: the
                # fallback could never trigger and a None part raised
                # TypeError. Missing parts are now skipped.
                name_parts = [
                    part
                    for part in (author['firstname'], author["lastname"])
                    if part
                ]
                names.append(" ".join(name_parts) or "<NOT_AVAILABLE>")
                institutes.append(
                    self.get_organization(affiliation=affiliation) + " ")
                affiliations.append(affiliation)
                addresses.append(str(parser.parse(affiliation)))

            paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
            publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
            journal = jsonData['journal'] or "<NOT_AVAILABLE>"

            # The zipcode condition needs no second check here: articles that
            # fail it never get past the guard above.
            if self.is_us:
                csv_data["authors_name"].append("||".join(names))
                csv_data["affiliation"].append("||".join(affiliations))
                csv_data["authors_institute"].append("||".join(institutes))
                csv_data["paper_title"].append(paper_title)
                csv_data["publication_date"].append(publication_date)
                csv_data["journal"].append(journal)
                csv_data["authors_address"].append("||".join(addresses))
                csv_data["number_of_authors"].append(num_authors)
                csv_data["authors_zipcode"].append("||".join(zipcodes))
                self.is_us = False

        print("Size of csv ", len(csv_data["paper_title"]))
        if len(csv_data["paper_title"]) > 0:
            df = pd.DataFrame(csv_data)
            print(df.head())
            datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
            csv_file_name = "PubMedData_From_" + datetimeobject.strftime(
                '%Y_%m_%d') + ".csv"
            print(csv_file_name)
            df.to_csv(csv_file_name, index=False)
        if current_link is not None and "/chembldb/" in current_link:
            common_names.append(common_name)
            chembl_links.append(current_link)
            chembl_names.append(d.string)

# Print the collected ChEMBL names, then the ChEMBL links, then the common
# names - one entry per line.
for collected in (chembl_names, chembl_links, common_names):
    for entry in collected:
        print(entry)

pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")

# Final drug lists: only those found to have relevant publications on PubMed make the final dataset
# so final drugs must have: chemblID + pubmed results
final_drug_common_names, final_drug_chembl_names, final_drug_chembl_links = (
    set(), set(), set())

#test1 = []
#test2 = []

# Loop through drug names
for i, name in enumerate(common_names, 0):
    # Create a GraphQL query in plain text
    query = '\"alzheimers\"' + "+" + '\"' + name + '\"'
Exemple #21
0
from pymed import PubMed
from os import path

# PubMed client used for the publication search below.
pubmed = PubMed(tool="paperList", email="*****@*****.**")

# Search restricted to the author field.
query = 'Correia BE[author]'

# Request at most 500 matching publications.
publications = pubmed.query(query, max_results=500)


## Defining functions
def get_filename(article):
    """Return a Markdown file name of the form ``<year>-<month>-<day>-<w1>-<w2>-<w3>.md``.

    The words are the first three space-separated words of ``article.title``;
    the date parts come from ``article.publication_date`` and are not
    zero-padded.
    """
    leading_words = '-'.join(article.title.split(' ')[:3])
    published = article.publication_date
    stamp = f'{published.year}-{published.month}-{published.day}'
    return f'{stamp}-{leading_words}.md'


def get_authors(article):
    """Return ``(all_authors, first_author)`` for *article*.

    Each author is rendered as ``"<lastname> <initials>"``; ``all_authors``
    is the comma-separated list of these names and ``first_author`` is the
    first of them.

    A missing (``None`` or empty) name part is skipped instead of raising
    ``TypeError``, and an article without any usable author yields
    ``('', '')`` instead of failing on an unbound variable.
    """
    author_list = []
    for author in article.authors:
        name_parts = (author['lastname'], author['initials'])
        name = ' '.join(part for part in name_parts if part)
        if name:
            author_list.append(name)

    if not author_list:
        return ('', '')
    # Join once, after the loop, rather than on every iteration.
    return (', '.join(author_list), author_list[0])

def perform_query(keywords, amount):
    """Search PubMed for *keywords* and return at most *amount* results.

    The object returned by the PubMed client's ``query`` call is handed back
    unchanged.
    """
    client = PubMed(tool="Vigor", email="*****@*****.**")
    return client.query(keywords, max_results=amount)
# I used the following links as references:
# https://stackoverflow.com/questions/57053378/
# https://www.kaggle.com/summerkrankin/pubmed-download-als

import pandas as pd
from pymed import PubMed
import time


# User inputs:
# All three values are read as raw strings from the console.
query = input("Provide a query for PubMed (can include field tags): ")
my_email = input("Provide your e-mail address (optional): ")
max_results = input("Maximum number of results: ")

# Consult PubMed (max_results is converted to int only for the call;
# a non-numeric entry raises ValueError here):
pubmed = PubMed(tool="PubMedSearcher", email = my_email)
results = pubmed.query(query, max_results = int(max_results))

# Create an empty DataFrame with just the column names:
articles_df = pd.DataFrame(columns = ['PMID',
                                      'Publication_date',
                                      'Title',
                                      'Authors',
                                      'Journal',
                                      'DOI',
                                      'Keywords',
                                      'Abstract'])

# Now, for each article, fill the dataframe with the info collected:
for article in results:
Exemple #24
0
import requests
import json
import pprint
import pandas as pd
import numpy as np
import xmltodict
from xml.etree import ElementTree
from pymed import PubMed
from Bio import Entrez
import plotly
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

# https://stackoverflow.com/questions/57053378/query-pubmed-with-python-how-to-get-all-article-details-from-query-to-pandas-d
pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")

# number might need to be updated in future, for now low
results = pubmed.query("nhsx[affiliation]", max_results=500)

# Each result can be either a PubMedBookArticle or a PubMedArticle; keep the
# dictionary form of every one of them.
articleList = [found.toDict() for found in results]
articleInfo = []

# Generate list of dict records which will hold all article details that could be fetch from PUBMED API
for article in articleList:
    # Sometimes article['pubmed_id'] contains list separated with comma - take first pubmedId in that list - thats article pubmedId
    pubmedId = article["pubmed_id"].partition("\n")[0]  # keep only pubmed id
Exemple #25
0
from pymed import PubMed

# Create a PubMed object that GraphQL can use to query
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="MyTool", email="*****@*****.**")

# Create a GraphQL query in plain text
query = '(("2018/05/01"[Date - Create] : "3000"[Date - Create])) AND (Xiaoying Xian[Author] OR diabetes)'

# Execute the query against the API
results = pubmed.query(query, max_results=500)

# Loop over the retrieved articles
for article in results:

    # Extract and format information from the article.
    # The identifier is read from `pubmed_id`, the attribute the other
    # snippets in this file use (was `article_id`).
    article_id = article.pubmed_id
    title = article.title
    # Recomputed for every article so that an article without keywords prints
    # an empty list instead of raising NameError or showing the keywords of
    # the previous article. All None entries are dropped, and the article's
    # own keyword list is no longer mutated.
    raw_keywords = getattr(article, 'keywords', None) or []
    keywords = '", "'.join(kw for kw in raw_keywords if kw is not None)
    publication_date = article.publication_date
    abstract = article.abstract

    # Show information about the article
    print(
        f'{article_id} - {publication_date} - {title}\nKeywords: "{keywords}"\n{abstract}\n'
    )
Exemple #26
0
#/usr/bin/python

from pymed import PubMed
from pprint import pprint as pp
import lxml.etree as etree
from bs4 import BeautifulSoup

# Query Help
#
# https://pubmed.ncbi.nlm.nih.gov/advanced/
#
# PubMed client and an author-field search for a single person.
pubmed = PubMed(tool="Acadex", email="*****@*****.**")
query = '(David Adlam[Author])'

# Only one result is requested.
results = pubmed.query(query, max_results=1)

print('Hello')
# The loop only iterates over the results; the commented-out lines are
# alternative ways of dumping each record (XML via BeautifulSoup or lxml,
# or the dictionary form).
for r in results:
    # bs = BeautifulSoup(r.xml, 'xml')
    # print(bs.prettify())
    # pp(etree.tostring(r.xml.getroot(), pretty_print=True))
    # pp(r.toDict())
    pass
Exemple #27
0
from pymed import PubMed
import pandas as pd
import datetime
import time
import json

# PubMed client; "toolname" and "your email" are placeholders to be replaced.
pubmed = PubMed(tool="toolname", email="your email")

# Timer: start of the whole run
start_time = time.time()

industryList = []  # List of companies to query (empty by default)

for comp in industryList:
    comp_time = time.time()
    articleList = []
    articleInfo = []
    query = f"your test query here with {comp}, same as how pubmed takes the queries"
    results = pubmed.query(query, max_results=999999)
    for article in results:
        articleDict = article.toDict()
        articleList.append(articleDict)
    for article in articleList:
        articleInfo.append({'pubmed_id': article['pubmed_id'],
                            'title': article['title'],
                            'keywords': article['keywords'],
                            'mesh': article['mesh'],
                            'journal': article['journal'],
                            'abstract': article['abstract'],
                            'conclusions': article['conclusions'],
                            'methods': article['methods'],
Exemple #28
0
from pymed import PubMed
import json

pubmed = PubMed(tool="PubmedToolkit", email="*****@*****.**")

# Publication-date window of the search (inclusive bounds, YYYY/MM/DD).
start = '2013/01/01'
end = '2017/01/01'

# English-language human case reports published inside the window and
# available in the "pubmed pmc local" subset.
# The previous version ended with one closing parenthesis too many; the
# expression is now balanced.
query = (
    '(("english"[Language]) AND "case reports"[Publication Type]) '
    f'AND ("{start}"[Date - Publication] : "{end}"[Date - Publication]) '
    'AND ("humans"[MeSH Terms]) AND ("Case Reports"[ptyp]) AND ("English"[lang]) '
    'AND ("pubmed pmc local"[sb])'
)

results = pubmed.query(query, max_results=5000)

count = 0

def save(force=False, every=100):
    """Write the module-level ``data`` list to ``data.json``.

    Without ``force`` the call only bumps the module-level ``count``; the
    file is written (and ``count`` reset to 0) once ``count`` reaches
    ``every``. With ``force=True`` the file is written immediately and
    ``count`` is left untouched.
    """
    global count
    if not force:
        count += 1
        if count < every:
            # Not enough calls since the last write yet.
            return
        count = 0
    print('Save data, fetched', len(data))
    with open('data.json', 'w') as f:
        json.dump(data, f)

data = []

for article in results:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This file is to query and download data from PubMed

@author: Wei Zhao @ Metis, 02/12/2021
"""
#%%
from pymed import PubMed
from util import save_as_pickle
from collections import defaultdict

#%%
pubmed = PubMed(tool="MyTool", email="")
# Search expression: traumatic brain injury, concussion or brain biomechanics,
# created from 1991 onwards, English only.
# A space was missing before the last AND, which fused it with the closing
# parenthesis of the date range.
# NOTE(review): the expression is wrapped in a one-element list (kept for
# compatibility); confirm that the PubMed client is meant to receive a list
# rather than a plain string.
query = [
    '((traumatic brain injury) OR (concussion) OR (brain biomechanics)) '
    'AND ("1991/01/01"[Date - Create] : "3000"[Date - Create]) '
    'AND (english[Language])'
]
results = pubmed.query(query, max_results=12000000)


#%%
def download_data(results):
    """
    Download the data
    """
    # Loop over the retrieved articles
    data_dict = defaultdict(list)
    c = 0
Exemple #30
0
#                        UrlInventorBuild,UrlIPCRBuild#, cmap_discretize
#import pickle
#from urllib.parse import urlparse

#
# =============================================================================
# Settings
# =============================================================================
# For IPCCat
SeuilScorePrediction = 600  # IPCs returned by the API categorisation whose
# score is > SeuilScorePrediction are kept. (The recommended value in the
# original note is garbled; presumably 600 - TODO confirm.)

# put your credential from epo client in this file...
# loading of the client keys, used to retrieve the abstract of the patent of
# the person that was found

pubmed = PubMed(tool="P2N-Acad", email="*****@*****.**")
PotentielAuteurs = []

configFile = LoadConfig()

requete, projectName = configFile.requete, configFile.ndf

# The list of suitable structures relies on a file in AcadRessources,
# encoded in UTF-8 with one affiliation per line
#BonneAffiliation= LoadAffiliation('BonnesAffiliations.csv') #['laboratoire', 'institut', "centre de recherche", "université"] # to be completed
# The fields required for each patent.
NeededInfo = ['label', 'date', 'inventor', 'title', 'abstract']
# Settings for saving the results: the directories depend on the
# requete.cql file
ndf = projectName