def main():
    login(load_account_data())
    mode = int(input("Select a mode (1 - Data test | 2 - Post): "))
    if mode == 1:
        data = pre_frame_all_tweets()
        tweet_frame = pd.DataFrame(data)
        print(tweet_frame.head(15))
    elif mode == 2:
        topic = input("What would you like to post about?: ")
        pubmed = PubMed(tool=str(os.getenv("APP_NAME")), email=str(os.getenv("APP_EMAIL")))
        results = pubmed.query(topic, max_results=100)
        for article in results:
            articles.append(article)
        for count in range(0, 5):
            chosen_article = articles[random.randrange(0, len(articles))]
            if chosen_article not in selected_articles:
                selected_articles.append(chosen_article)
        for art in selected_articles:
            tweet = build_tweet(art)
            send_tweets(tweet)
    else:
        print("Invalid input! Use 1 or 2!")
    if input("Run again? (Y/N): ").capitalize() == "Y":
        main()
    else:
        print("Exiting!")

def get_abstract_from_pubmed(query):
    # Create a PubMed object that GraphQL can use to query
    # Note that the parameters are not required but kindly requested by PubMed Central
    # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
    pubmed = PubMed(tool="MyTool", email="*****@*****.**")

    # Execute the query against the API
    time.sleep(0.1)
    results = pubmed.query(query, max_results=500)
    time.sleep(0.1)

    # Loop over the retrieved articles
    for article in results:
        # This is to avoid some errors (too many requests)
        time.sleep(0.2)

        # Extract and format information from the article
        article_id = article.pubmed_id
        title = article.title
        if article.keywords:
            if None in article.keywords:
                article.keywords.remove(None)
            keywords = '", "'.join(article.keywords)
        publication_date = article.publication_date
        abstract = article.abstract

        # Make a file for the next step
        result_of_search = (
            f'{article_id} - {publication_date} - {title}\n \n{abstract}\n')
        return result_of_search

def pubmed(title_keywords, n=500, docs=False):
    '''Get articles with meta-data from PubMed

    pubs = pubmed_query('nutrition')

    title_keywords : str
        The string to be searched for in the title of the articles.
    n : int
        Number of articles to return.
    docs : bool
        Instead of dataframe with multiple columns, just return
        abstracts as a list of lists.
    '''

    from pymed import PubMed
    import json
    import pandas as pd

    out = []
    pubmed = PubMed(tool="literview", email="*****@*****.**")
    query = title_keywords + "[Title]"
    results = pubmed.query(query, max_results=n)

    for article in results:
        out.append(article.toJSON())

    out2 = []
    for i in range(len(out)):
        j = json.loads(out[i])
        try:
            journal = j['journal']
        except KeyError:
            journal = ''
        try:
            keywords = j['keywords']
        except KeyError:
            keywords = []
        out2.append([
            j['title'], journal, j['publication_date'], keywords, j['abstract']
        ])

    out = pd.DataFrame(out2)
    out.columns = [
        'title', 'journal', 'publication_date', 'keywords', 'abstract'
    ]

    if docs:
        out = [[doc] for doc in out.abstract.values]

    return out

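A minimal usage sketch for the function above; the 'nutrition' query, result count, and column access are illustrative assumptions, not part of the original snippet.

# Hypothetical example: fetch up to 50 articles with "nutrition" in the title
# and inspect a couple of fields from the returned DataFrame.
df = pubmed('nutrition', n=50)
print(df[['title', 'journal']].head())

# With docs=True the same call returns the abstracts as a list of single-item lists.
abstract_docs = pubmed('nutrition', n=50, docs=True)
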
def querysave(search_term, max_records, save_json, inputfile):
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        save_json = para['save_json']

    with mlflow.start_run() as mlrun:
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        query = search_term
        results = pubmed.query(query, max_results=max_records)

        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            try:
                pp[pmed]['title'] = art.title
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = art.abstract
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.conclusions
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.methods
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.results
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['keywords'] = art.keywords
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['authors'] = art.authors
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['journal'] = art.journal
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass

        with open(save_json, 'w') as fp:
            json.dump(pp, fp)

def PubMedQuery(Inputfile, Outputfile, AdditionalKeyWords, verbose=False):
    '''
    parameters
    ---------------
    Inputfile: str, input file, like './input.txt'
    Outputfile: str, output file, like './test.xlsx'
    AdditionalKeyWords: str, keywords, like ' "pharmacy chemistry biology" '
    '''
    with open(Inputfile, 'r') as f:
        drugs = f.readlines()
    drugs = [i.strip() for i in drugs]

    pubmed = PubMed(tool="Query-Pubmed-Toolbox", email="*****@*****.**")
    al = []
    with tqdm(total=len(drugs), ascii=True) as pbar:
        while drugs:
            time.sleep(0.5)
            drug = drugs[0]
            try:
                results = pubmed.query(drug + ' ' + smart_strip(AdditionalKeyWords), max_results=5)
                results = list(results)
                if results:
                    for res in results:
                        D = res.toDict()
                        if type(D.get('pubmed_id')) == str:
                            pubmedid = ';'.join([smart_strip(i) for i in D.get('pubmed_id').split('\n')])
                        else:
                            pubmedid = None
                        mydict = {'drug': drug,
                                  'pubmid': pubmedid,
                                  'title': smart_strip(D.get('title')),
                                  'journal': smart_strip(D.get('journal')),
                                  'abstract': smart_strip(D.get('abstract')),
                                  'doi': D.get('doi'),
                                  'year': D.get('publication_date')}
                        al.append(mydict)
                else:
                    logging.warning('not found for %s' % drug + ' ' + AdditionalKeyWords)
                    al.append({'drug': drug})
                drugs.pop(0)
                pbar.update(1)
                if verbose:
                    pbar.write('Query: %s' % (drug + ' ' + smart_strip(AdditionalKeyWords)))
            except:
                pass

    df = pd.DataFrame(al)
    if '.xlsx' not in Outputfile:
        Outputfile = Outputfile + '.xlsx'
    sdf = df.style.apply(hightlight_null, axis=1)
    sdf.to_excel(Outputfile)
    df.to_pickle('.temp.pkl')

def query_pubmed(search_term, max_results=5000):
    '''
    Uses pymed API to query PubMed database.
    '''
    pubmed = PubMed(tool='MyTool', email='')
    results = pubmed.query(search_term, max_results=max_results)
    article_list = []
    for article in results:
        article_dict = article.toDict()
        article_list.append(article_dict)
    return article_list

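A short, hedged example of how query_pubmed() might be used; the search term and the DataFrame conversion are illustrative assumptions, not part of the original snippet.

import pandas as pd

# Hypothetical call: fetch a handful of records and tabulate a few fields
# that pymed's toDict() exposes.
articles = query_pubmed("traumatic brain injury", max_results=10)
df = pd.DataFrame(articles)
print(df[['pubmed_id', 'title', 'publication_date']].head())
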
def buildQuery(self):
    # Build Object and send some info to PubMed by their request
    # Note that the parameters below are not required but kindly requested by PubMed Central
    # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
    self.pubmed = PubMed(tool=self.tool, email=self.email)

    # * Create query to feed into Pubmed
    self.query = ""

    # First author
    if self.author1 is not None:
        if '#' in str(self.author1):
            self.author1 = str(self.author1).replace('#', ' ')
        self.query = self.query + str(self.author1)[2:-2] + ' [1au] AND '

    # Authors
    if self.authors is not None:
        for author in self.authors.split(' '):
            if '#' in author:
                author = author.replace('#', ' ')
            self.query = self.query + author + ' [auth] AND '

    # Title
    if self.title is not None:
        for tword in self.title:
            self.query = self.query + tword + ' [ti] AND '

    # Terms
    if self.terms is not None:
        for item in self.terms.split(' '):
            self.query = self.query + item + ' AND '

    # User query
    if self.userquery is not None:
        userquery = str(self.userquery)[2:-2]
        self.query = self.query + userquery + ' AND '

    # Calculate what the start date is for articles to be included based on user settings
    if self.psLast is not None:
        # Only include articles published in the last <x> years
        self.dYa = datetime.now() - relativedelta(years=int(self.psLast))
        self.dayYearsAgo = str(self.dYa).split(' ')[0].replace('-', '/')
        self.dYaQuery = '(' + self.dayYearsAgo + '[Date - Create] : "3000"[Date - Create])'
    else:
        self.dYaQuery = '("' + self.psYear + '/01/01"[Date - Create] : "3000"[Date - Create])'
    self.query = self.query + self.dYaQuery

    # Announce created query for verification:
    print(f'''
    This is your query:
    {self.query}
    ''')

def main():
    # Setup output folder
    output_folder = Path.cwd().parent.parent / 'corpus' / 'pubmed' / 'json'
    if Path.exists(output_folder):
        shutil.rmtree(output_folder)
    Path.mkdir(output_folder)

    # Create a PubMed object that GraphQL can use to query
    pubmed = PubMed(tool="DavidCampos", email="*****@*****.**")

    # Create a GraphQL query in plain text
    query = "(\"2000\"[Date - Publication] : \"3000\"[Date - Publication]) AND " \
            "((COVID-19) OR (Coronavirus) OR (Corona virus) OR (2019-nCoV) OR " \
            "(SARS-CoV) OR (MERS-CoV) OR (Severe Acute Respiratory Syndrome) OR " \
            "(Middle East Respiratory Syndrome) OR " \
            "(2019 novel coronavirus disease[MeSH Terms]) OR (2019 novel coronavirus infection[MeSH Terms]) OR " \
            "(2019-nCoV disease[MeSH Terms]) OR (2019-nCoV infection[MeSH Terms]) OR " \
            "(coronavirus disease 2019[MeSH Terms]) OR (coronavirus disease-19[MeSH Terms]))"

    # Execute the query against the API
    results = pubmed.query(query, max_results=1000000)

    # Loop over the retrieved articles
    counter = 0
    for article in results:
        # Discard if abstract empty
        if article.abstract is None or article.abstract == "":
            continue

        # Get PubmedID
        pubmed_id = article.pubmed_id
        if '\n' in pubmed_id:
            rest = pubmed_id.split('\n', 1)
            pubmed_id = rest[0]
            article.pubmed_id = pubmed_id

        # Get article as dict
        article_dict = article.toDict()

        # Write article to JSON
        with open(output_folder / (pubmed_id + ".json"), 'w') as outfile:
            json.dump(article_dict, outfile, default=date_converter)

        counter += 1
    print(counter)

def get_corpus(output_dir='.'):
    assert os.path.exists(output_dir)
    pmed = PubMed()
    results = pmed.query('glycan', max_results=100000)
    # Materialise the filtered results so that both maps below see the full set
    # (two lazy maps over a single filter iterator would each only get half the items)
    results = list(filter(_is_relevant, results))
    ids = map(attrgetter('pubmed_id'), results)
    abstracts = map(attrgetter('abstract'), results)
    del results
    results = dict(zip(ids, abstracts))
    print('Fetched {} results'.format(len(results)))

    print('Writing .json file')
    with open(os.path.join(output_dir, 'glyco_corpus.json'), 'w+') as outfile:
        json.dump(results, outfile)

    print('Tokenizing sentences')
    results_txt = map(sent_tokenize, results.values())
    results_txt = reduce(operator.concat, results_txt)
    results_txt = reduce(_join_lines, results_txt)

    print('Writing .txt file')
    with open(os.path.join(output_dir, 'glyco_corpus.txt'), 'w+') as outfile:
        outfile.write(results_txt)

# Pubmed search for articles on HIV in African American women
import numpy as np
import pandas as pd
import pymed
from pymed import PubMed

pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")
search_term = "HIV Viral Load African American"  # place search terms in quotes
results = pubmed.query(search_term, max_results=500)
articleList = []
articleInfo = []

for article in results:
    # Print the object type
    # Convert to dictionary
    articleDict = article.toDict()
    articleList.append(articleDict)

# Create a dict list of articles from PUBMED API
for article in articleList:
    pubmedId = article['pubmed_id'].partition('\n')[0]
    # Append article info to dictionary with fields you wish to collect
    articleInfo.append({
        u'pubmed_id': pubmedId,
        u'title': article['title'],
        u'keywords': article['keywords'],
        u'journal': article['journal'],
        u'abstract': article['abstract'],
        # u'conclusions': article['conclusions'],
        # u'methods': article['methods'],

import pandas as pd
# The pymed library is used to query the PubMed database and acquire article info
from pymed import PubMed

pubmed = PubMed(tool="PubMedRetriever", email="*****@*****.**")

# Enter search term here; this acquires all the articles that appear when you search for the term in PubMed
search_term = "covid-19"
# Enter the max number of results
search_results = pubmed.query(search_term, max_results=100000)

# Create the lists that are used to save acquired data
article_list = []
article_details = []
abstracts = []
qualitydata = []

# Create the identifiers that are used to identify and separate the fields of interest
startdoi = "|start_doi|"
enddoi = "|end_doi|"
startpubmedid = "|start_pid|"
endpubmedid = "|end_pid|"
startpdate = "|start_pd|"
endpdate = "|end_pd|"

# Go through all the articles that are retrieved by using the search term from pymed and save the information
for article in search_results:
    # Convert each retrieved article to a dictionary
    article_dictionary = article.toDict()
    article_list.append(article_dictionary)

# Generate list of dictionary records which will hold all article details that could be fetched from PUBMED API
for article in article_list:
    # Get article pubmed ID

def mainpipe(inputfile, search_term, max_records, json_out, embvec, embvecache,
             val_ratio, rnnsize, batchsize, lr, weight_decay, n_epochs,
             model_save, es):
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        embvec = para['embvec']
        embvecache = para['embvecache']
        val_ratio = para['val_ratio']
        rnnsize = para['rnnsize']
        batchsize = para['batchsize']
        lr = para['lr']
        weight_decay = para['weight_decay']
        n_epochs = para['n_epochs']
        model_save = para['model_save']

    if embvec == 1:
        embvec = torchtext.vocab.GloVe(name='840B', dim=300, cache=embvecache)
        use_pretrained = True

    with mlflow.start_run() as mlrun:
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        query = search_term
        results = pubmed.query(query, max_results=max_records)

        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            try:
                pp[pmed]['title'] = art.title
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = art.abstract
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.conclusions
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.methods
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.results
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['keywords'] = art.keywords
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['authors'] = art.authors
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['journal'] = art.journal
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass

        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

        artpd = pd.DataFrame.from_dict(pp, orient='index')
        artpda = artpd[artpd.abstract.notnull()].copy()
        artpda = artpda[artpd.title.notnull()]
        # artpda.index = pd.Series(artpda.index).apply(lambda x: x[0:8])
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()

        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords

        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)
            t1 = ['O'] * (len(doc))
            matched = []
            matn = 0
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = 'I' * (end - start - 1)
                if str(doc[start:end]).lower() not in matched:
                    matn = matn + 1
                    matched.append(str(doc[start:end]).lower())
            abskw = []
            for x in t0.keywords:
                if x.lower() not in matched:
                    abskw.append(x)
            dataf.loc[pmid, 'TRG'] = ' '.join([t for t in t1])
            dataf.loc[pmid, 'Extracted'] = matn
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")

        datatrain = dataf[dataf['Extracted'] >= 3].copy()
        datatest = dataf[dataf['Extracted'] < 3].copy()

        # separate train and validate
        dtrain = datatrain.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        seed = 250
        idx = np.arange(datatrain.shape[0])
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.shuffle(idx)
        val_size = int(len(idx) * val_ratio)
        df_train = dtrain.iloc[idx[val_size:], :]
        df_val = dtrain.iloc[idx[:val_size], :]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        df_test = datatest.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        df_val_k = dtraink.iloc[idx[:val_size], :]

        # Load original dataset
        datai = artpda.copy()
        datai = datai[datai.abstract.notnull()]
        datai = datai[datai.title.notnull()]
        datai = datai.replace('\n', ' ', regex=True)
        datai = datai.replace('\t', ' ', regex=True)
        dataiu = datai.loc[datai.keywords.str.len() == 0]
        dataik = datai.loc[datai.keywords.str.len() > 0]
        dataiu['SRC'] = dataiu.title + ' ' + dataiu.abstract

        tokenizertrg = lambda x: x.split()

        def tokenizersrc(text):  # create a tokenizer function
            return [tok.text for tok in svoc.tokenizer(text)]

        def safe_value(field_val):
            return field_val if not pd.isna(field_val) else "Other"

        def safe_year(field_val):
            return field_val if not pd.isna(field_val) else 1900

        TEXT = torchtext.data.Field(init_token='<bos>', eos_token='<eos>',
                                    sequential=True, lower=False)
        LABEL = torchtext.data.Field(init_token='<bos>', eos_token='<eos>',
                                     sequential=True, unk_token=None)
        fields = [('text', TEXT), ('label', LABEL)]
        device = 'cuda'
        train_examples = read_data(df_train, fields, tokenizersrc, tokenizertrg)
        valid_examples = read_data(df_val, fields, tokenizersrc, tokenizertrg)

        # Load the pre-trained embeddings that come with the torchtext library.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            TEXT.build_vocab(train_examples, vectors=embvec)
        else:
            print('We are training word embeddings from scratch.')
            TEXT.build_vocab(train_examples, max_size=5000)
        LABEL.build_vocab(train_examples)

        # Create one of the models defined above.
        # self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
        model0 = RNNCRFTagger(TEXT, LABEL, rnnsize, emb_dim=300, update_pretrained=False)
        model0.to(device)
        optimizer = torch.optim.Adam(model0.parameters(), lr=lr, weight_decay=weight_decay)
        train(train_examples, valid_examples, embvec, TEXT, LABEL, device,
              model0, batchsize, optimizer, n_epochs)

        out2 = evaltest2(df_val, df_val_k, model0, tokenizersrc, fields, device)
        ttp3 = kphperct(df_val_k, out2, svoc)
        mlflow.log_param("epochs", n_epochs)
        mlflow.pytorch.save_model(model0, model_save)
        mlflow.log_metric("extraction_rate", ttp3.mean())

        augout = evaltest2(dataiu, model0, tokenizersrc, fields, device)
        klist = kphext2(dataiu.SRC, augout, svoc)
        for i in range(len(dataiu.index)):
            dataiu.iloc[i, 2].extend(list(set(klist[i])))
        output = pd.concat([dataik, dataiu], join="inner")
        output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')

        if es == 1:
            output['journal'] = output['journal'].apply(safe_value)
            output['conclusions'] = output['conclusions'].apply(safe_value)
            output['pubdate'] = output['pubdate'].apply(safe_year)
            output['PMID'] = output.index
            test_server = [{'host': '127.0.0.1', 'port': 9200}]
            es = Elasticsearch(test_server, http_compress=True)
            use_these_keys = [
                'PMID', 'title', 'abstract', 'keywords', 'authors', 'pubdate'
            ]

            def filterKeys(document):
                return {key: document[key] for key in use_these_keys}

            def doc_generator(df):
                df_iter = df.iterrows()
                for index, document in df_iter:
                    try:
                        yield {
                            "_index": 'ms',
                            "_source": filterKeys(document),
                        }
                    except StopIteration:
                        return

            helpers.bulk(es, doc_generator(output))

        print(ttp3.mean())

    'root': {
        'handlers': ['console'],
        'level': 'INFO'
    }
}

logging.config.dictConfig(LOGGING)

from pymed import PubMed

my_email = "*****@*****.**"

# Create a PubMed object that GraphQL can use to query
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="Protein Interaction Text Miner", email=my_email)


class Publication:
    def __init__(self, pubmed_id, title, publication_date, abstract, keywords=""):
        self.pubmed_id = pubmed_id
        self.url = "https://www.ncbi.nlm.nih.gov/pubmed/" + pubmed_id.split("\n")[0]
        self.title = title
        self.publication_date = publication_date

import datetime
from typing import List, Union

from pymed import PubMed

from .utils import get_query_from_keywords_and_date, get_emails
from ..utils import dump_papers

PUBMED = PubMed(tool="MyTool", email="*****@*****.**")
pubmed_field_mapper = {"publication_date": "date"}

# Authors field needs specific processing
process_fields = {
    "authors": lambda authors: list(
        map(
            lambda a: str(a.get("firstname", "")) + " " + str(a.get("lastname", "")),
            authors,
        )),
    "date": lambda date: (date.strftime("%Y-%m-%d")
                          if isinstance(date, datetime.date) else date),
}


def get_pubmed_papers(query: str,
                      fields: List = [
                          "title", "authors", "date", "abstract", "journal", "doi"

min_year = 2010
max_year = 2020
total = []
slope = []
label = [
    'Renal Pathology', 'Kidney Transplantation', 'Chronic kidney disease',
    'Acute Kidney Injury', 'Renal Insufficiency', 'renal hypotension',
    'Drug Discovery', 'Immunology', 'Genetic', 'Geriatric',
    'Cardiovascular disease'
]

# Create a PubMed object that GraphQL can use to query
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="MyTool", email="*****@*****.**")

# query terms
time_ml = '((2010/1/1[Date - Publication]: 2020/12/31[Date - Publication]) AND ("Artificial General Intelligence" OR "Artificial Intelligence" OR "Autoencoder" OR "auto encoder" OR "Reinforcement learning" OR "AI Governance" OR "Augmented Intelligence" OR "Decision Intelligence" OR "neural network" OR "Data Labeling" OR "Annotation Services" OR "Edge AI" OR "Smart Robotics" OR "Quantum Computing" OR "Digital Ethics" OR "AutoML" OR "Deep Neural" OR "Deep Learning" OR "Deep Network" OR "Convolutional Neural" OR "Graph Neural" OR "Generative Adversarial" OR "Adversarial Learning" OR "Natural Language Processing" OR "Recurrent Neural" OR "Computer Vision" OR "Cognitive Computing" OR "machine learning" OR "random forest" OR "support vector" OR "regression tree" OR "regression splines" OR "artificial neural" OR "Lasso" OR "decision tree" OR "linear regression" OR "bayesian" OR "regression model" OR "regression" OR "Supervised-learning" OR "clustering" OR "Dimensionality reduction" OR "Unsupervised-learning" OR "big-data" OR "data-mining" OR "semi-supervised" OR "self-learning" OR "sparse learning" OR "dictionary learning" OR "Feature learning" OR "Anomaly detection" OR "Robot learning" OR "algorithms" OR "Federated learning" OR "linear model" OR "pattern recognition" OR "information retrieval" OR "game theory" OR "information theory" OR "swarm intelligence" OR "Markov Decision" OR "Markov Random" OR "dynamic programming" OR "multilayer perceptrons" OR "component analysis" OR "Sparse coding" OR "subspace learning" OR "matrix factorization" OR "matrix decomposition" OR "NLP algorithm" OR "K means" OR "computer vision" OR "speech recognition" OR "predictive model" OR "machine learning"))'
kidney = '(("Glomeruli"[All Fields] OR "glomerular"[All Fields] OR "glomerulus"[All Fields] OR "glomerulosclerosis"[All Fields] OR "nephropathology"[All Fields] OR "renal pathology"[All Fields] OR "kidney pathology"[All Fields] OR "renal whole slide"[All Fields] OR "kidney whole slide"[All Fields] OR "renal wholeslide"[All Fields] OR "kidney wholeslide"[All Fields] OR "renal biopsy"[All Fields] OR "kidney biopsy"[All Fields] OR "Kidney/diagnostic imaging"[MAJR] OR "Kidney Glomerulus/pathology"[MAJR] OR "Kidney Diseases/pathology"[MAJR] OR "Kidney/pathology"[MAJR] OR ("Kidney"[MeSH] AND "Biopsy"[MeSH]) OR "Renal Dialysis"[MeSH] OR "Kidney Diseases"[MeSH] OR "Nephrology"[MeSH] OR "Nephrology" OR "Nephrologists"[MeSH] OR "Kidney"[MeSH] OR "Kidney Function Tests"[MeSH] OR "Kidney Function Tests"[MeSH] OR "Kidney Transplantation"[MeSH] OR "Hypertension, Renal"[MeSH] OR "Renal Insufficiency"[MeSH] OR "renal survival" OR "Acute kidney injury" OR "kidney transplantation" OR "kidney disease" OR "CKD" OR "AKI" OR "chronic kidney disease"))'

query_Renal_Pathology = '("Glomeruli"[All Fields] OR "glomerular"[All Fields] OR "glomerulus"[All Fields] OR "glomerulosclerosis"[All Fields] OR "nephropathology"[All Fields] OR "renal pathology"[All Fields] OR "kidney pathology"[All Fields] OR "renal whole slide"[All Fields] OR "kidney whole slide"[All Fields] OR "renal wholeslide"[All Fields] OR "kidney wholeslide"[All Fields] OR "renal biopsy"[All Fields] OR "kidney biopsy"[All Fields] OR "Kidney/diagnostic imaging"[MAJR] OR "Kidney Glomerulus/pathology"[MAJR] OR "Kidney Diseases/pathology"[MAJR] OR "Kidney/pathology"[MAJR] OR ("Kidney"[MeSH] AND "Biopsy"[MeSH]))' + ' AND ' + time_ml
query_Kidney_Transplantation = '(Kidney Transplantation)' + ' AND ' + time_ml + ' AND ' + kidney
query_CKD = '("chronic kidney disease" OR "CKD")' + ' AND ' + time_ml + 'AND' + kidney
query_Acute_Kidney_Injury = '("acute kidney injury" OR "AKI")' + ' AND ' + time_ml + 'AND' + kidney
query_Renal_Insufficiency = '("Renal Insufficiency")' + ' AND ' + time_ml + 'AND' + kidney
query_Renal_Hypotension = '("Hypertension, Renal" OR "renal hypertension")' + ' AND ' + time_ml + 'AND' + kidney
query_Drug_Discovery = '("Drug Discovery")' + ' AND ' + kidney + ' AND ' + time_ml
query_Immunology = '("Immunology")' + ' AND ' + kidney + ' AND ' + time_ml
query_Genetic = '("Genetic")' + ' AND ' + kidney + ' AND ' + time_ml
query_Geriatric = '("Geriatric")' + ' AND ' + kidney + ' AND ' + time_ml

def _load_query_result(self):
    if not self._query_result:
        pubmed = PubMed(tool='Collabovid', email='*****@*****.**')
        self._query_result = list(
            pubmed.query(query=self._PUBMED_SEARCH_QUERY, max_results=30000))

from pymed import PubMed
import json

email = input("Please enter your email:")
user_input = input("I want to search for...")

pubmed = PubMed(tool="MyTool", email=email)
results = pubmed.query(user_input, max_results=5)

results_list = []
output = []

for article in results:
    results_as_dict = article.toDict()
    results_list.append(results_as_dict)

for article in results_list:
    pubmed_id = article['pubmed_id'].partition('\n')[0]
    output.append({u'pubmed_id': pubmed_id,
                   u'title': article['title'],
                   u'abstract': article['abstract']})

with open('output_results.json', 'w') as outfile:
    json.dump(output, outfile, indent=4)

class query(object):
    # * Store Flags
    def __init__(self):
        # Positional Arguments
        self.oFile = args.oFile
        # Flags for information requested by Pubmed API
        self.email = args.email
        self.tool = args.tool
        # All other flags used to build the query
        self.author1 = args.author1
        self.authors = args.authors
        self.title = args.title
        self.terms = args.terms
        self.userquery = args.userquery
        self.psYear = args.pubSinceYear
        self.psLast = args.pubSinceLast
        self.maxResults = args.maxResults

    # * Build Object
    # Create a PubMed object that GraphQL can use to query
    def buildQuery(self):
        # Build Object and send some info to PubMed by their request
        # Note that the parameters below are not required but kindly requested by PubMed Central
        # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
        self.pubmed = PubMed(tool=self.tool, email=self.email)

        # * Create query to feed into Pubmed
        self.query = ""

        # First author
        if self.author1 is not None:
            if '#' in str(self.author1):
                self.author1 = str(self.author1).replace('#', ' ')
            self.query = self.query + str(self.author1)[2:-2] + ' [1au] AND '

        # Authors
        if self.authors is not None:
            for author in self.authors.split(' '):
                if '#' in author:
                    author = author.replace('#', ' ')
                self.query = self.query + author + ' [auth] AND '

        # Title
        if self.title is not None:
            for tword in self.title:
                self.query = self.query + tword + ' [ti] AND '

        # Terms
        if self.terms is not None:
            for item in self.terms.split(' '):
                self.query = self.query + item + ' AND '

        # User query
        if self.userquery is not None:
            userquery = str(self.userquery)[2:-2]
            self.query = self.query + userquery + ' AND '

        # Calculate what the start date is for articles to be included based on user settings
        if self.psLast is not None:
            # Only include articles published in the last <x> years
            self.dYa = datetime.now() - relativedelta(years=int(self.psLast))
            self.dayYearsAgo = str(self.dYa).split(' ')[0].replace('-', '/')
            self.dYaQuery = '(' + self.dayYearsAgo + '[Date - Create] : "3000"[Date - Create])'
        else:
            self.dYaQuery = '("' + self.psYear + '/01/01"[Date - Create] : "3000"[Date - Create])'
        self.query = self.query + self.dYaQuery

        # Announce created query for verification:
        print(f'''
        This is your query:
        {self.query}
        ''')

    def runQuery(self):
        # Execute the query against the API
        self.results = self.pubmed.query(self.query, max_results=int(self.maxResults) + 1)

        # Make dictionary to store data
        self.output = {}

        # Loop over the retrieved articles
        self.nResults = 0
        for result in self.results:
            self.nResults = self.nResults + 1

        # Check if there are more than <n> results
        if self.nResults > int(self.maxResults):
            # Show warning
            print('More than ' + str(self.maxResults) + ' results found')
        elif self.nResults == 0:
            # Show warning
            print('No results found')
        else:
            # Print number of results
            print(str(self.nResults) + ' result(s) obtained.')

        # Loop over the retrieved articles
        self.results = self.pubmed.query(self.query, max_results=int(self.maxResults))
        for article in self.results:
            # Extract and format information from the article
            article_id = article.pubmed_id.split()[0]
            title = article.title
            authors = article.authors
            # if article.keywords:
            #     if None in article.keywords:
            #         article.keywords.remove(None)
            #     keywords = '", "'.join(article.keywords)
            publication_date = article.publication_date
            abstract = article.abstract
            if hasattr(article, 'journal'):
                journal = article.journal
            else:
                journal = 'NA'

            # Reshape author list
            authorString = ''
            for author in authors:
                last = author['lastname']
                first = author['firstname']
                if last is None:
                    last = 'NA'
                if first is None:
                    first = 'NA'
                authorString = authorString + ' ' + last + ', ' + first + ';'

            # Add results to the dictionary
            self.output[article_id] = [
                article_id, title, authorString, journal, publication_date, abstract
            ]

        # Put data in a dataframe after extraction
        self.DF = pd.DataFrame.from_dict(self.output)
        self.DF = self.DF.T
        self.DF = self.DF.reset_index(drop=True)  # Remove row names
        self.DF.columns = [
            "PMID", "Title", "Authors", "Journal", "PubDate", "Abstract"
        ]

        # Save to Excel
        self.writer = pd.ExcelWriter(self.oFile, engine='xlsxwriter')
        self.DF.to_excel(self.writer, sheet_name='PMquery', index=False)
        self.workbook = self.writer.book
        self.worksheet = self.writer.sheets['PMquery']

        # Formatting
        self.format = self.workbook.add_format({
            'text_wrap': True,
            'align': 'top'
        })
        self.worksheet.set_column('A:A', 9, self.format)
        self.worksheet.set_column('B:C', 22, self.format)
        self.worksheet.set_column('D:E', 11, self.format)
        self.worksheet.set_column('F:F', 58, self.format)
        self.writer.save()

        # If there is only one result, also return information to the shell
        if self.nResults == 1:
            # print(json.dumps(self.output.items, indent=4))
            # print(self.output.items())
            key = list(self.output.keys())[0]
            print('\n ----------------------------------\n',
                  'PMID: ', self.output[key][0], '\n',
                  'Title: ', self.output[key][1], '\n',
                  'Authors: ', self.output[key][2], '\n',
                  'Journal: ', self.output[key][3], '\n',
                  'Published: ', self.output[key][4], '\n',
                  '----------------------------------\n')

def get_pubmed_data(self, query, searched_zipcode, date, maximum_number_of_value=3):
    csv_data = {
        "affiliation": [],
        "number_of_authors": [],
        "authors_name": [],
        "authors_institute": [],
        "authors_address": [],
        "authors_zipcode": [],
        "paper_title": [],
        "publication_date": [],
        "journal": []
    }
    pubmed = PubMed(tool="MyTool", email="*****@*****.**")
    parser = Parser()
    results = pubmed.query(query, max_results=maximum_number_of_value)

    is_queried_by_zipcode = searched_zipcode.isdecimal()
    if is_queried_by_zipcode:
        searched_zipcode = int(searched_zipcode)

    for article in results:
        jsonData = json.loads(article.toJSON())
        authors_list = jsonData['authors']
        authors_name = ""
        authors_institute = ""
        authors_affiliation = ""
        authors_address = ""
        authors_zipcode = ""
        num_authors = len(authors_list) or 0
        counted_matched = 0
        if is_queried_by_zipcode:
            counted_matched = self.has_match_zipcode_of_authprs(
                authors_list, searched_zipcode)

        if (not is_queried_by_zipcode) or (is_queried_by_zipcode and counted_matched > 0):
            for index in range(0, num_authors):
                affiliation = authors_list[index]["affiliation"] or "<NOT_AVAILABLE>"
                zipcode = str(self.get_address_with_zipcode(affiliation))
                # print(type(zipcode))
                # print(zipcode)
                author_name = authors_list[index]['firstname'] + " " + authors_list[index]["lastname"] or "<NOT_AVAILABLE>"
                author_institute = ""
                author_institute += self.get_organization(affiliation=affiliation) + " "
                authors_affiliation += affiliation
                authors_name += author_name
                authors_institute += author_institute
                authors_address += str(parser.parse(affiliation))
                authors_zipcode += zipcode
                if num_authors != index + 1:
                    authors_name += "||"
                    authors_institute += "||"
                    authors_affiliation += "||"
                    authors_address += "||"
                    authors_zipcode += "||"
                else:
                    break

        paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
        publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
        journal = jsonData['journal'] or "<NOT_AVAILABLE>"

        if self.is_us:
            if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
                csv_data["authors_name"].append(authors_name)
                csv_data["affiliation"].append(authors_affiliation)
                csv_data["authors_institute"].append(authors_institute)
                csv_data["paper_title"].append(paper_title)
                csv_data["publication_date"].append(publication_date)
                csv_data["journal"].append(journal)
                csv_data["authors_address"].append(authors_address)
                csv_data["number_of_authors"].append(num_authors)
                csv_data["authors_zipcode"].append(authors_zipcode)
            self.is_us = False

    # if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
    #     df = pd.DataFrame(csv_data)
    #     print(df.head())
    #     df.to_csv("PubMedData_from.csv", index=False)

    print("Size of csv ", len(csv_data["paper_title"]))
    if len(csv_data["paper_title"]) > 0:
        df = pd.DataFrame(csv_data)
        print(df.head())
        datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
        csv_file_name = "PubMedData_From_" + datetimeobject.strftime('%Y_%m_%d') + ".csv"
        print(csv_file_name)
        df.to_csv(csv_file_name, index=False)

if current_link is not None and "/chembldb/" in current_link:
    common_names.append(common_name)
    chembl_links.append(current_link)
    chembl_names.append(d.string)

for i in chembl_names:
    print(i)
for j in chembl_links:
    print(j)
for k in common_names:
    print(k)

pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")

# Final drug lists: only those found to have relevant publications on PubMed make the final dataset,
# so final drugs must have: chemblID + pubmed results
final_drug_common_names = set()
final_drug_chembl_names = set()
final_drug_chembl_links = set()
# test1 = []
# test2 = []

# Loop through drug names
for i, name in enumerate(common_names, 0):
    # Create a GraphQL query in plain text
    query = '\"alzheimers\"' + "+" + '\"' + name + '\"'

from pymed import PubMed
from os import path

pubmed = PubMed(tool="paperList", email="*****@*****.**")
query = 'Correia BE[author]'
publications = pubmed.query(query, max_results=500)


## Defining functions
def get_filename(article):
    words = '-'.join(article.title.split(' ')[:3])
    date = '-'.join([
        str(article.publication_date.year),
        str(article.publication_date.month),
        str(article.publication_date.day)
    ])
    title = '-'.join([date, words]) + '.md'
    return (title)


def get_authors(article):
    author_list = []
    for author in article.authors:
        name = author['lastname'] + ' ' + author['initials']
        author_list.append(name)
    authors = ', '.join(author_list)
    return (authors, author_list[0])

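A short, assumed usage sketch combining the two helpers above; the loop and print format are illustrative and not part of the original snippet.

# Hypothetical loop: derive a markdown filename and a formatted author string
# for each publication returned by the query above.
for article in publications:
    filename = get_filename(article)
    authors, first_author = get_authors(article)
    print(filename, '|', first_author)
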
def perform_query(keywords, amount):
    database = PubMed(tool="Vigor", email="*****@*****.**")
    query = keywords
    database_results = database.query(query, max_results=amount)
    return database_results

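A brief usage sketch for perform_query(); the keyword string and the printed fields are assumptions for illustration only.

# Hypothetical example: iterate over the article generator returned by perform_query().
for article in perform_query("influenza vaccination", amount=5):
    print(article.pubmed_id.partition('\n')[0], '-', article.title)
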
# I used the following links as references:
# https://stackoverflow.com/questions/57053378/
# https://www.kaggle.com/summerkrankin/pubmed-download-als

import pandas as pd
from pymed import PubMed
import time

# User inputs:
query = input("Provide a query for PubMed (can include field tags): ")
my_email = input("Provide your e-mail address (optional): ")
max_results = input("Maximum number of results: ")

# Consult PubMed:
pubmed = PubMed(tool="PubMedSearcher", email=my_email)
results = pubmed.query(query, max_results=int(max_results))

# Create an empty Dataframe with just the column names:
articles_df = pd.DataFrame(columns=['PMID', 'Publication_date', 'Title', 'Authors',
                                    'Journal', 'DOI', 'Keywords', 'Abstract'])

# Now, for each article, fill the dataframe with the info collected:
for article in results:

import requests
import json
import pprint
import pandas as pd
import numpy as np
import xmltodict
from xml.etree import ElementTree
from pymed import PubMed
from Bio import Entrez
import plotly
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

# https://stackoverflow.com/questions/57053378/query-pubmed-with-python-how-to-get-all-article-details-from-query-to-pandas-d
pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")
results = pubmed.query(
    "nhsx[affiliation]",
    max_results=500)  # number might need to be updated in future, for now low

articleList = []
articleInfo = []

for article in results:
    # Print the type of object we've found (can be either PubMedBookArticle or PubMedArticle).
    articleDict = article.toDict()  # convert to dictionary
    articleList.append(articleDict)

# Generate list of dict records which will hold all article details that could be fetched from PUBMED API
for article in articleList:
    # Sometimes article['pubmed_id'] contains a list separated with commas - take the first pubmedId in that list - that's the article's pubmedId
    pubmedId = article["pubmed_id"].partition("\n")[0]  # keep only pubmed id

from pymed import PubMed

# Create a PubMed object that GraphQL can use to query
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="MyTool", email="*****@*****.**")

# Create a GraphQL query in plain text
query = '(("2018/05/01"[Date - Create] : "3000"[Date - Create])) AND (Xiaoying Xian[Author] OR diabetes)'

# Execute the query against the API
results = pubmed.query(query, max_results=500)

# Loop over the retrieved articles
for article in results:
    # Extract and format information from the article
    article_id = article.pubmed_id  # pymed exposes the identifier as pubmed_id
    title = article.title
    if article.keywords:
        if None in article.keywords:
            article.keywords.remove(None)
        keywords = '", "'.join(article.keywords)
    publication_date = article.publication_date
    abstract = article.abstract

    # Show information about the article
    print(
        f'{article_id} - {publication_date} - {title}\nKeywords: "{keywords}"\n{abstract}\n'
    )

#!/usr/bin/python
from pymed import PubMed
from pprint import pprint as pp
import lxml.etree as etree
from bs4 import BeautifulSoup

# Query Help
#
# https://pubmed.ncbi.nlm.nih.gov/advanced/
#

pubmed = PubMed(tool="Acadex", email="*****@*****.**")

query = '(David Adlam[Author])'
results = pubmed.query(query, max_results=1)

print('Hello')

for r in results:
    # bs = BeautifulSoup(r.xml, 'xml')
    # print(bs.prettify())
    # pp(etree.tostring(r.xml.getroot(), pretty_print=True))
    # pp(r.toDict())
    pass

from pymed import PubMed
import pandas as pd
import datetime
import time
import json

pubmed = PubMed(tool="toolname", email="your email")

# Timer
start_time = time.time()

industryList = []  # Array of companies

for comp in industryList:
    comp_time = time.time()
    articleList = []
    articleInfo = []
    query = f"your test query here with {comp}, same as how pubmed takes the queries"
    results = pubmed.query(query, max_results=999999)

    for article in results:
        articleDict = article.toDict()
        articleList.append(articleDict)

    for article in articleList:
        articleInfo.append({'pubmed_id': article['pubmed_id'],
                            'title': article['title'],
                            'keywords': article['keywords'],
                            'mesh': article['mesh'],
                            'journal': article['journal'],
                            'abstract': article['abstract'],
                            'conclusions': article['conclusions'],
                            'methods': article['methods'],

from pymed import PubMed
import json

pubmed = PubMed(tool="PubmedToolkit", email="*****@*****.**")

start = '2013/01/01'
end = '2017/01/01'
query = '(("english"[Language]) AND "case reports"[Publication Type]) ' \
    + f'AND ("{start}"[Date - Publication] : "{end}"[Date - Publication]) ' \
    + 'AND ("humans"[MeSH Terms]) AND ("Case Reports"[ptyp]) AND ("English"[lang]) ' \
    + 'AND ("pubmed pmc local"[sb]))'

results = pubmed.query(query, max_results=5000)

count = 0


def save(force=False, every=100):
    global count
    if not force:
        count += 1
        if count >= every:
            count = 0
        else:
            return
    print('Save data, fetched', len(data))
    with open('data.json', 'w') as f:
        json.dump(data, f)


data = []
for article in results:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This file is to query and download data from PubMed
@author: Wei Zhao @ Metis, 02/12/2021
"""
#%%
from pymed import PubMed
from util import save_as_pickle
from collections import defaultdict
#%%
pubmed = PubMed(tool="MyTool", email="")

query = [
    '((traumatic brain injury) '
    + 'OR (concussion) '
    + 'OR (brain biomechanics)) '
    + 'AND ("1991/01/01"[Date - Create] : "3000"[Date - Create])'
    + 'AND (english[Language])'
]

results = pubmed.query(query, max_results=12000000)
#%%
def download_data(results):
    """
    Download the data
    """
    # Loop over the retrieved articles
    data_dict = defaultdict(list)
    c = 0

# UrlInventorBuild, UrlIPCRBuild#, cmap_discretize
# import pickle
# from urllib.parse import urlparse
#
# =============================================================================
# Configuration
# =============================================================================
# For IPCCat
SeuilScorePrediction = 600  # IPC codes from the API categorisation whose
# score is > SeuilScorePrediction are kept

# put your credential from epo client in this file...
# load the client keys, used to retrieve the abstract of the patent that was found
pubmed = PubMed(tool="P2N-Acad", email="*****@*****.**")

PotentielAuteurs = list()
configFile = LoadConfig()
requete = configFile.requete
projectName = configFile.ndf

# The list of suitable institutions is based on a file in AcadRessources,
# encoded in UTF-8 with one affiliation per line
# BonneAffiliation = LoadAffiliation('BonnesAffiliations.csv')
# ['laboratoire', 'institut', "centre de recherche", "université"]  # to be completed

# The fields needed for each patent.
NeededInfo = ['label', 'date', 'inventor', 'title', 'abstract']

# Settings for saving the results: the directories depend on the requete.cql file
ndf = projectName