def main():
    login(load_account_data())
    mode = int(input("Select a mode (1 - Data test | 2 - Post): "))
    if mode == 1:
        data = pre_frame_all_tweets()
        tweet_frame = pd.DataFrame(data)
        print(tweet_frame.head(15))
    elif mode == 2:
        topic = input("What would you like to post about?: ")
        pubmed = PubMed(tool=str(os.getenv("APP_NAME")), email=str(os.getenv("APP_EMAIL")))
        results = pubmed.query(topic, max_results=100)
        # Collect results locally (the original relied on lists defined
        # elsewhere; defining them here keeps the function self-contained)
        articles = []
        selected_articles = []
        for article in results:
            articles.append(article)
        # Pick up to 5 distinct articles at random
        for count in range(0, 5):
            chosen_article = articles[random.randrange(0, len(articles))]
            if chosen_article not in selected_articles:
                selected_articles.append(chosen_article)
        for art in selected_articles:
            tweet = build_tweet(art)
            send_tweets(tweet)
    else:
        print("Invalid input! Use 1 or 2!")
    if input("Run again? (Y/N): ").capitalize() == "Y":
        main()
    else:
        print("Exiting!")
def get_abstract_from_pubmed(query):
    # Create a PubMed object that GraphQL can use to query
    # Note that the parameters are not required but kindly requested by PubMed Central
    # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
    pubmed = PubMed(tool="MyTool", email="*****@*****.**")

    # Execute the query against the API
    time.sleep(0.1)
    results = pubmed.query(query, max_results=500)
    time.sleep(0.1)

    # Loop over the retrieved articles
    for article in results:
        # Pause between articles to avoid sending too many requests
        time.sleep(0.2)
        # Extract and format information from the article
        article_id = article.pubmed_id
        title = article.title
        if article.keywords:
            if None in article.keywords:
                article.keywords.remove(None)
            keywords = '", "'.join(article.keywords)
        publication_date = article.publication_date
        abstract = article.abstract
        # Make a file for the next step
        result_of_search = f'{article_id} - {publication_date} - {title}\n \n{abstract}\n'
    return result_of_search
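# A minimal usage sketch for get_abstract_from_pubmed() above; the query
# string and the output filename are illustrative assumptions, not part of
# the original snippet.
summary = get_abstract_from_pubmed("traumatic brain injury")
with open("search_result.txt", "w") as f:  # "make a file for the next step"
    f.write(summary)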
def pubmed(title_keywords, n=500, docs=False):
    '''Get articles with meta-data from PubMed

    pubs = pubmed('nutrition')

    title_keywords : str
        The string to be searched for in the title of the articles.
    n : int
        Number of articles to return.
    docs : bool
        Instead of a dataframe with multiple columns, just return
        abstracts as a list of lists.
    '''
    from pymed import PubMed
    import json
    import pandas as pd

    out = []
    pubmed = PubMed(tool="literview", email="*****@*****.**")
    query = title_keywords + "[Title]"
    results = pubmed.query(query, max_results=n)

    for article in results:
        out.append(article.toJSON())

    out2 = []
    for i in range(len(out)):
        j = json.loads(out[i])
        try:
            journal = j['journal']
        except KeyError:
            journal = ''
        try:
            keywords = j['keywords']
        except KeyError:
            keywords = []
        out2.append([
            j['title'], journal, j['publication_date'], keywords, j['abstract']
        ])

    out = pd.DataFrame(out2)
    out.columns = [
        'title', 'journal', 'publication_date', 'keywords', 'abstract'
    ]

    if docs:
        out = [[doc] for doc in out.abstract.values]

    return out
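# Usage sketch for the pubmed() helper above; the search term is an
# illustrative assumption.
pubs = pubmed('nutrition', n=50)
print(pubs[['title', 'journal']].head())
docs = pubmed('nutrition', n=50, docs=True)  # [[abstract], [abstract], ...]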
def querysave(search_term, max_records, save_json, inputfile):
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        save_json = para['save_json']

    with mlflow.start_run() as mlrun:
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        query = search_term
        results = pubmed.query(query, max_results=max_records)
        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            try:
                pp[pmed]['title'] = art.title
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = art.abstract
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.conclusions
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.methods
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.results
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['keywords'] = art.keywords
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['authors'] = art.authors
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['journal'] = art.journal
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass
        with open(save_json, 'w') as fp:
            json.dump(pp, fp)
def PubMedQuery(Inputfile, Outputfile, AdditionalKeyWords, verbose=False):
    '''
    parameters
    ---------------
    Inputfile: str, input file, like './input.txt'
    Outputfile: str, output file, like './test.xlsx'
    AdditionalKeyWords: str, keywords, like '"pharmacy chemistry biology"'
    '''
    with open(Inputfile, 'r') as f:
        drugs = f.readlines()
    drugs = [i.strip() for i in drugs]

    pubmed = PubMed(tool="Query-Pubmed-Toolbox", email="*****@*****.**")
    al = []
    with tqdm(total=len(drugs), ascii=True) as pbar:
        while drugs:
            time.sleep(0.5)
            drug = drugs[0]
            try:
                results = pubmed.query(drug + ' ' + smart_strip(AdditionalKeyWords), max_results=5)
                results = list(results)
                if results:
                    for res in results:
                        D = res.toDict()
                        # pubmed_id can be a newline-separated string of ids
                        if type(D.get('pubmed_id')) == str:
                            pubmedid = ';'.join([smart_strip(i) for i in D.get('pubmed_id').split('\n')])
                        else:
                            pubmedid = None
                        mydict = {'drug': drug,
                                  'pubmid': pubmedid,
                                  'title': smart_strip(D.get('title')),
                                  'journal': smart_strip(D.get('journal')),
                                  'abstract': smart_strip(D.get('abstract')),
                                  'doi': D.get('doi'),
                                  'year': D.get('publication_date')}
                        al.append(mydict)
                else:
                    logging.warning('not found for %s' % (drug + ' ' + AdditionalKeyWords))
                    al.append({'drug': drug})
                drugs.pop(0)
                pbar.update(1)
                if verbose:
                    pbar.write('Query: %s' % (drug + ' ' + smart_strip(AdditionalKeyWords)))
            except Exception:
                # On a transient error the drug stays in the queue and is retried
                pass

    df = pd.DataFrame(al)
    if '.xlsx' not in Outputfile:
        Outputfile = Outputfile + '.xlsx'
    sdf = df.style.apply(hightlight_null, axis=1)
    sdf.to_excel(Outputfile)
    df.to_pickle('.temp.pkl')
def query_pubmed(search_term, max_results=5000):
    '''
    Uses pymed API to query PubMed database.
    '''
    pubmed = PubMed(tool='MyTool', email='')
    results = pubmed.query(search_term, max_results=max_results)
    article_list = []
    for article in results:
        article_dict = article.toDict()
        article_list.append(article_dict)
    return article_list
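# Usage sketch for query_pubmed(); the search term is an illustrative
# assumption. Each element is the dict produced by article.toDict().
articles = query_pubmed('machine learning radiology', max_results=20)
print(len(articles), 'articles retrieved')
if articles:
    print(articles[0]['title'])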
def main():
    # Setup output folder
    output_folder = Path.cwd().parent.parent / 'corpus' / 'pubmed' / 'json'
    if Path.exists(output_folder):
        shutil.rmtree(output_folder)
    Path.mkdir(output_folder)

    # Create a PubMed object that GraphQL can use to query
    pubmed = PubMed(tool="DavidCampos", email="*****@*****.**")

    # Create a GraphQL query in plain text
    query = "(\"2000\"[Date - Publication] : \"3000\"[Date - Publication]) AND " \
            "((COVID-19) OR (Coronavirus) OR (Corona virus) OR (2019-nCoV) OR " \
            "(SARS-CoV) OR (MERS-CoV) OR (Severe Acute Respiratory Syndrome) OR " \
            "(Middle East Respiratory Syndrome) OR " \
            "(2019 novel coronavirus disease[MeSH Terms]) OR (2019 novel coronavirus infection[MeSH Terms]) OR " \
            "(2019-nCoV disease[MeSH Terms]) OR (2019-nCoV infection[MeSH Terms]) OR " \
            "(coronavirus disease 2019[MeSH Terms]) OR (coronavirus disease-19[MeSH Terms]))"

    # Execute the query against the API
    results = pubmed.query(query, max_results=1000000)

    # Loop over the retrieved articles
    counter = 0
    for article in results:
        # Discard if abstract empty
        if article.abstract is None or article.abstract == "":
            continue

        # Get PubmedID (keep only the first line when several ids are returned)
        pubmed_id = article.pubmed_id
        if '\n' in pubmed_id:
            rest = pubmed_id.split('\n', 1)
            pubmed_id = rest[0]
            article.pubmed_id = pubmed_id

        # Get article as dict
        article_dict = article.toDict()

        # Write article to JSON
        with open(output_folder / (pubmed_id + ".json"), 'w') as outfile:
            json.dump(article_dict, outfile, default=date_converter)
        counter += 1
    print(counter)
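# json.dump above relies on a date_converter helper defined elsewhere; a
# minimal sketch of what such a helper presumably does (an assumption, not
# part of the original snippet):
import datetime

def date_converter(obj):
    # Serialise date/datetime objects that json.dump cannot handle natively
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} is not JSON serializable")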
def get_corpus(output_dir='.'):
    assert os.path.exists(output_dir)
    pmed = PubMed()
    results = pmed.query('glycan', max_results=100000)
    # Materialise the filtered generator first: lazily mapping the same
    # iterator twice (ids and abstracts) would consume it out of step and
    # pair each id with the following article's abstract.
    results = list(filter(_is_relevant, results))
    ids = map(attrgetter('pubmed_id'), results)
    abstracts = map(attrgetter('abstract'), results)
    results = dict(zip(ids, abstracts))
    print('Fetched {} results'.format(len(results)))
    print('Writing .json file')
    with open(os.path.join(output_dir, 'glyco_corpus.json'), 'w+') as outfile:
        json.dump(results, outfile)
    print('Tokenizing sentences')
    results_txt = map(sent_tokenize, results.values())
    results_txt = reduce(operator.concat, results_txt)
    results_txt = reduce(_join_lines, results_txt)
    print('Writing .txt file')
    with open(os.path.join(output_dir, 'glyco_corpus.txt'), 'w+') as outfile:
        outfile.write(results_txt)
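# get_corpus() relies on helpers defined elsewhere; minimal sketches of
# plausible implementations (assumptions, not from the original snippet):
def _is_relevant(article):
    # keep only results that actually have an abstract to contribute
    return article.abstract is not None

def _join_lines(acc, line):
    # fold tokenized sentences into one newline-separated string
    return acc + '\n' + line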
from pymed import PubMed
from os import path

pubmed = PubMed(tool="paperList", email="*****@*****.**")
query = 'Correia BE[author]'
publications = pubmed.query(query, max_results=500)


## Defining functions
def get_filename(article):
    words = '-'.join(article.title.split(' ')[:3])
    date = '-'.join([
        str(article.publication_date.year),
        str(article.publication_date.month),
        str(article.publication_date.day)
    ])
    title = '-'.join([date, words]) + '.md'
    return title


def get_authors(article):
    author_list = []
    for author in article.authors:
        name = author['lastname'] + ' ' + author['initials']
        author_list.append(name)
    authors = ', '.join(author_list)
    return authors, author_list[0]
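# Usage sketch tying the two helpers together: one markdown filename and
# one author line per retrieved publication.
for article in publications:
    filename = get_filename(article)
    authors, first_author = get_authors(article)
    print(filename, '-', first_author)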
# PubMed search for articles on HIV in African American women
import numpy as np
import pandas as pd
import pymed
from pymed import PubMed

pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")
search_term = "HIV Viral Load African American"  # place search terms in quotes
results = pubmed.query(search_term, max_results=500)
articleList = []
articleInfo = []

for article in results:
    # Convert each PubMedArticle object to a dictionary
    articleDict = article.toDict()
    articleList.append(articleDict)

# Create a dict list of articles from the PubMed API
for article in articleList:
    # pubmed_id may contain several newline-separated ids; the first is the article's
    pubmedId = article['pubmed_id'].partition('\n')[0]
    # Append article info to dictionary with fields you wish to collect
    articleInfo.append({
        u'pubmed_id': pubmedId,
        u'title': article['title'],
        u'keywords': article['keywords'],
        u'journal': article['journal'],
        u'abstract': article['abstract'],
        # u'conclusions': article['conclusions'],
        # u'methods': article['methods'],
    })
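# A common next step (an assumption, not in the original snippet): turn the
# collected records into a dataframe for analysis.
articles_df = pd.DataFrame(articleInfo)
print(articles_df.head())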
# https://icd.who.int/browse10/2016/en
disease_dict = []
with open("./disease_dict.tsv", "r") as f:
    lines = f.read().split("\n")
    for line in lines:
        if line == "":
            continue
        buf = line.split("\t")
        disease_dict.append(buf[0])

#word = "kynurenine"
word = "AMINOACYL-TRNA BIOSYNTHESIS"
pubmed = PubMed(tool="MyTool", email="*****@*****.**")
results = pubmed.query(word, max_results=10000)

debug = open("./debug.txt", "w")
result_count = {}
result_file = open("./result_count.txt", "w")

for item in results:
    #print(item.doi)
    #print(item.keywords)
    #print(item.abstract)
    abst = item.abstract
    if abst is None:
        continue
# https://stackoverflow.com/questions/57053378/
# https://www.kaggle.com/summerkrankin/pubmed-download-als

import pandas as pd
from pymed import PubMed
import time

# User inputs:
query = input("Provide a query for PubMed (can include field tags): ")
my_email = input("Provide your e-mail address (optional): ")
max_results = input("Maximum number of results: ")

# Consult PubMed:
pubmed = PubMed(tool="PubMedSearcher", email=my_email)
results = pubmed.query(query, max_results=int(max_results))

# Create an empty Dataframe with just the column names:
articles_df = pd.DataFrame(columns=['PMID', 'Publication_date', 'Title', 'Authors',
                                    'Journal', 'DOI', 'Keywords', 'Abstract'])

# Now, for each article, fill the dataframe with the info collected:
for article in results:
    # Handle some exceptions in case of missing information:
    # (the original snippet is cut off here; a minimal completion follows,
    # guarding each field with getattr so missing attributes don't raise)
    row = {
        'PMID': article.pubmed_id.partition('\n')[0],
        'Publication_date': getattr(article, 'publication_date', None),
        'Title': getattr(article, 'title', None),
        'Authors': getattr(article, 'authors', None),
        'Journal': getattr(article, 'journal', None),
        'DOI': getattr(article, 'doi', None),
        'Keywords': getattr(article, 'keywords', None),
        'Abstract': getattr(article, 'abstract', None),
    }
    articles_df = pd.concat([articles_df, pd.DataFrame([row])], ignore_index=True)
def perform_query(keywords, amount):
    database = PubMed(tool="Vigor", email="*****@*****.**")
    query = keywords
    database_results = database.query(query, max_results=amount)
    return database_results
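# Usage sketch for perform_query(); pymed returns a lazy generator, so wrap
# it in list() to iterate more than once. The keywords are an illustrative
# assumption.
articles = list(perform_query('influenza vaccination', 25))
for article in articles:
    print(article.title)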
def _load_query_result(self):
    if not self._query_result:
        pubmed = PubMed(tool='Collabovid', email='*****@*****.**')
        self._query_result = list(
            pubmed.query(query=self._PUBMED_SEARCH_QUERY, max_results=30000))
"""
@author: Wei Zhao @ Metis, 02/12/2021
"""
#%%
from pymed import PubMed
from util import save_as_pickle
from collections import defaultdict
#%%
pubmed = PubMed(tool="MyTool", email="")
# pymed expects the query as a single string (the original wrapped it in a
# one-element list)
query = ('((traumatic brain injury) '
         'OR (concussion) '
         'OR (brain biomechanics)) '
         'AND ("1991/01/01"[Date - Create] : "3000"[Date - Create]) '
         'AND (english[Language])')
results = pubmed.query(query, max_results=12000000)
#%%
def download_data(results):
    """
    Download the data
    """
    # Loop over the retrieved articles
    data_dict = defaultdict(list)
    c = 0
    for article in results:
        c += 1
        if c % 1200 == 0:
            print(c)
        # Extract and format information from the article
import csv
import itertools

from pymed import PubMed

# Create a PubMed object that GraphQL can use to query
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="Author co-occurence analysis", email="*****@*****.**")

# Create a GraphQL query in plain text
query = "occupational health[Title]"

# Execute the query against the API
results = list(pubmed.query(query, max_results=1344))

# Create a node for each unique author
nodes = {
    author: index
    for index, author in enumerate(
        set(
            itertools.chain.from_iterable([[
                f'{author["lastname"]} {author["firstname"]}'
                for author in article.authors
            ] for article in results])))
}

# Create an edge for each combination of authors (co-authorship)
# (the original snippet is cut off mid-comprehension; a minimal completion:
# every 2-author combination within an article becomes a co-authorship edge)
edges = list(
    itertools.chain.from_iterable([[
        combination
        for combination in itertools.combinations(
            [f'{author["lastname"]} {author["firstname"]}'
             for author in article.authors], 2)
    ] for article in results]))
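# The unused csv import above suggests the graph is written out; a minimal
# sketch of that step (an assumption, not in the original snippet):
with open('nodes.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'label'])
    for author, index in nodes.items():
        writer.writerow([index, author])

with open('edges.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['source', 'target'])
    for a, b in edges:
        writer.writerow([nodes[a], nodes[b]])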
import time
import json

from pymed import PubMed  # needed for the PubMed object below

pubmed = PubMed(tool="toolname", email="your email")

# Timer
start_time = time.time()

industryList = []  # Array of companies

for comp in industryList:
    comp_time = time.time()
    articleList = []
    articleInfo = []
    query = f"your test query here with {comp}, same as how pubmed takes the queries"
    results = pubmed.query(query, max_results=999999)
    for article in results:
        articleDict = article.toDict()
        articleList.append(articleDict)
    for article in articleList:
        articleInfo.append({'pubmed_id': article['pubmed_id'],
                            'title': article['title'],
                            'keywords': article['keywords'],
                            'mesh': article['mesh'],
                            'journal': article['journal'],
                            'abstract': article['abstract'],
                            'conclusions': article['conclusions'],
                            'methods': article['methods'],
                            'results': article['results'],
                            'copyrights': article['copyrights'],
                            'doi': article['doi'],
                            # (the original snippet is cut off here)
                            })
from pymed import PubMed

"""In MEDLINE/PubMed, every journal article is indexed with about 10–15
subject headings, subheadings and supplementary concept records, some of
them designated as major and marked with an asterisk to indicate the
article's major topics. When performing a MEDLINE search via PubMed, entry
terms are automatically translated into (i.e. mapped to) the corresponding
descriptors with a good degree of reliability; it is recommended to check
the 'Details' tab in PubMed to see how a search formulation was translated.
By default, a search for a descriptor includes all the descriptors in the
hierarchy below it. PubMed does not apply automatic term mapping in the
following circumstances: for a quoted phrase (e.g., "kidney allograft"),
for a term truncated with an asterisk (e.g., kidney allograft*), and for a
search with field labels (e.g., cancer[ti]).

Campos-Asensio, C. (2018). "Cómo elaborar una estrategia de búsqueda
bibliográfica". Enfermería Intensiva (in Spanish). 29 (4): 182–186.
"""

pubmed = PubMed(tool="MyTool", email="*****@*****.**")
results = pubmed.query("spanish[Language]", max_results=500)
for res in results:
    print(res)
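# Illustrative follow-up queries (terms are examples, not from the original
# snippet) showing the three cases above where PubMed skips automatic term
# mapping:
exact = pubmed.query('"kidney allograft"', max_results=10)  # quoted phrase
trunc = pubmed.query('kidney allograft*', max_results=10)   # asterisk truncation
tagged = pubmed.query('cancer[ti]', max_results=10)         # field label
for res in tagged:
    print(res.title)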
def fetch_journal_articles_data(journal_abbr, start_year=0, end_year=None,
                                max_results=10000, verbosity='full', logger=None):
    """
    Uses PubMed to get the latest articles of a journal based on its name

    Parameters
    ----------
    journal_abbr: (str) journal abbreviation according to NLM catalog
    max_results: (int) number of recent articles to retrieve, 0 will get all the articles
    start_year: (int)
    end_year: (int)
    verbosity: (str or None) 'full' will print all dois, 'summary' prints the counter
        every 5 articles, None prints nothing
    logger: (Logger or None)

    Returns
    ----------
    articles: (list) a list of entities.Article items
    """
    #> Check if journal/PMC is supported by scraper
    journal = Journal.objects.get(abbr_name=journal_abbr)
    publisher_Q = Publisher.objects.filter(journals__contains=journal)
    if publisher_Q.count() == 0:
        logger.info("Journal has no publisher")
        return
    elif not publisher_Q[0].supported:
        logger.info("Journal not supported")
        return
    else:
        publisher = publisher_Q[0]
    #> Search in pubmed
    pubmed = PubMed()
    if not end_year:
        end_year = datetime.date.today().year + 2
    query = f"{journal_abbr}[jour] {start_year}:{end_year}[DP]"
    search_succeeded = False
    retries = 0
    while (retries < 10) and (not search_succeeded):
        try:
            entries = list(pubmed.query(query, max_results=max_results))
        except Exception:
            retries += 1
            time.sleep(.2)
        else:
            search_succeeded = True
    if not search_succeeded:
        if verbosity == 'full':
            logger.info("Pubmed search failed after 10 retries")
        return
    articles = []
    counter = 0
    total_count = len(entries)
    any_success = False
    for entry in entries:
        if entry.doi:
            #> a quick fix for a bug in pymed (0.8.9), which sometimes returns a
            #  multiline list of dois for an entry; the first one is the real one
            doi = entry.doi.split('\n')[0]
        else:
            logger.info("No DOI")
            continue
        if Journal.objects.filter(articles__doi=doi).count() == 0:  # article does not exist
            dates = scraper.get_dates(doi, publisher.domain, logger=logger)
            if any([v is not None for v in dates.values()]):
                #> the operation has succeeded
                article = Article(doi=doi,
                                  title=entry.title,
                                  authors=[f"{a['lastname']} {a['initials']}" for a in entry.authors],
                                  received=dates['Received'],
                                  accepted=dates['Accepted'],
                                  published=dates['Published'])
                journal.update(push__articles=article)
                any_success = True
            else:
                if verbosity == 'full':
                    logger.info('Scraper failed')
                if (counter + 1 > GIVE_UP_LIMIT) and (not any_success):
                    if verbosity == 'full':
                        logger.info(f"No success for any of the {GIVE_UP_LIMIT} articles searched")
                    journal.update(set__last_failed=True)
                    return
        else:
            if verbosity == 'full':
                logger.info("Already in database")
            any_success = True
        counter += 1
        if verbosity == 'full':
            logger.info(f'[{journal.abbr_name}] ({counter} of {total_count}): {doi}')
        if (counter % 5 == 0) and (verbosity == 'summary'):
            logger.info(counter)
    journal.update(set__last_failed=False)
    journal.update(set__last_checked=datetime.datetime.now())
    except Exception:
        pass

for brevet in DataBrevet["brevets"]:
    AffilAuteur = dict()
    if brevet['label'] not in DejaVus:
        for Auteur in brevet['inventor']:
            SavBrevet = False  # Switch to avoid querying 15 times for one patent
            LigneCsv = """"""  # the csv file for matching articles and patents at CIB level
            NbAut += 1
            Auteur = Auteur.title()
            NumAut += 1
            query = "%s[Author - Full]" % (Auteur)
            DocsAuteur = pubmed.query(query, max_results=500)
            IramFull = """"""  # the content of the full IRAMUTEQ file
            Num = 0  # the doc number used when saving
            auteurDejaVu = False
            NbArt = 0  # number of articles found
            for article in DocsAuteur:
                NbArt += 1
                SAV = False  # switch for saving into the csv
                # print(type(article))
                # print(article.toJSON())
                Num += 1
                Affi = PubMedCheckNameAndGetAffiliation(
                    article.pubmed_id.split('\n')[0], Auteur
                )  # the first pubmed_id is the article's; the others are citations
                if Affi is not None:
from pymed import PubMed
import json

email = input("Please enter your email:")
user_input = input("I want to search for...")

pubmed = PubMed(tool="MyTool", email=email)
results = pubmed.query(user_input, max_results=5)
results_list = []
output = []

for article in results:
    results_as_dict = article.toDict()
    results_list.append(results_as_dict)

for article in results_list:
    pubmed_id = article['pubmed_id'].partition('\n')[0]
    output.append({u'pubmed_id': pubmed_id,
                   u'title': article['title'],
                   u'abstract': article['abstract']})

with open('output_results.json', 'w') as outfile:
    json.dump(output, outfile, indent=4)
def mainpipe(inputfile, search_term, max_records, json_out, embvec, embvecache,
             val_ratio, rnnsize, batchsize, lr, weight_decay, n_epochs,
             model_save, es):
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        embvec = para['embvec']
        embvecache = para['embvecache']
        val_ratio = para['val_ratio']
        rnnsize = para['rnnsize']
        batchsize = para['batchsize']
        lr = para['lr']
        weight_decay = para['weight_decay']
        n_epochs = para['n_epochs']
        model_save = para['model_save']
    # Default when no pre-trained vectors are requested (the original left
    # this unset, which would raise a NameError below)
    use_pretrained = False
    if embvec == 1:
        embvec = torchtext.vocab.GloVe(name='840B', dim=300, cache=embvecache)
        use_pretrained = True

    with mlflow.start_run() as mlrun:
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        query = search_term
        results = pubmed.query(query, max_results=max_records)
        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            try:
                pp[pmed]['title'] = art.title
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = art.abstract
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.conclusions
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.methods
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.results
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['keywords'] = art.keywords
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['authors'] = art.authors
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['journal'] = art.journal
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass

        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

        artpd = pd.DataFrame.from_dict(pp, orient='index')
        artpda = artpd[artpd.abstract.notnull()].copy()
        artpda = artpda[artpd.title.notnull()]
        # artpda.index = pd.Series(artpda.index).apply(lambda x: x[0:8])
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()
        dataf = pd.DataFrame(index=artpdak.index,
                             columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords

        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)
            # BIO-tag every token span that matches one of the article's keywords
            t1 = ['O'] * (len(doc))
            matched = []
            matn = 0
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = 'I' * (end - start - 1)
                if str(doc[start:end]).lower() not in matched:
                    matn = matn + 1
                    matched.append(str(doc[start:end]).lower())
            abskw = []
            for x in t0.keywords:
                if x.lower() not in matched:
                    abskw.append(x)
            dataf.loc[pmid, 'TRG'] = ' '.join([t for t in t1])
            dataf.loc[pmid, 'Extracted'] = matn
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")

        datatrain = dataf[dataf['Extracted'] >= 3].copy()
        datatest = dataf[dataf['Extracted'] < 3].copy()

        # separate train and validate
        dtrain = datatrain.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        seed = 250
        idx = np.arange(datatrain.shape[0])
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.shuffle(idx)
        val_size = int(len(idx) * val_ratio)
        df_train = dtrain.iloc[idx[val_size:], :]
        df_val = dtrain.iloc[idx[:val_size], :]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        df_test = datatest.loc[:, ['SRC', 'TRG']]

        # Load original dataset
        datai = artpda.copy()
        datai = datai[datai.abstract.notnull()]
        datai = datai[datai.title.notnull()]
        datai = datai.replace('\n', ' ', regex=True)
        datai = datai.replace('\t', ' ', regex=True)
        dataiu = datai.loc[datai.keywords.str.len() == 0]
        dataik = datai.loc[datai.keywords.str.len() > 0]
        dataiu['SRC'] = dataiu.title + ' ' + dataiu.abstract

        tokenizertrg = lambda x: x.split()

        def tokenizersrc(text):  # create a tokenizer function
            return [tok.text for tok in svoc.tokenizer(text)]

        def safe_value(field_val):
            return field_val if not pd.isna(field_val) else "Other"

        def safe_year(field_val):
            return field_val if not pd.isna(field_val) else 1900

        TEXT = torchtext.data.Field(init_token='<bos>', eos_token='<eos>',
                                    sequential=True, lower=False)
        LABEL = torchtext.data.Field(init_token='<bos>', eos_token='<eos>',
                                     sequential=True, unk_token=None)
        fields = [('text', TEXT), ('label', LABEL)]
        device = 'cuda'
        train_examples = read_data(df_train, fields, tokenizersrc, tokenizertrg)
        valid_examples = read_data(df_val, fields, tokenizersrc, tokenizertrg)

        # Load the pre-trained embeddings that come with the torchtext library.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            TEXT.build_vocab(train_examples, vectors=embvec)
        else:
            print('We are training word embeddings from scratch.')
            TEXT.build_vocab(train_examples, max_size=5000)
        LABEL.build_vocab(train_examples)

        # Create one of the models defined above.
        #self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
        model0 = RNNCRFTagger(TEXT, LABEL, rnnsize, emb_dim=300, update_pretrained=False)
        model0.to(device)
        optimizer = torch.optim.Adam(model0.parameters(), lr=lr, weight_decay=weight_decay)
        train(train_examples, valid_examples, embvec, TEXT, LABEL, device,
              model0, batchsize, optimizer, n_epochs)

        out2 = evaltest2(df_val, df_val_k, model0, tokenizersrc, fields, device)
        ttp3 = kphperct(df_val_k, out2, svoc)
        mlflow.log_param("epochs", n_epochs)
        mlflow.pytorch.save_model(model0, model_save)
        mlflow.log_metric("extraction_rate", ttp3.mean())

        augout = evaltest2(dataiu, model0, tokenizersrc, fields, device)
        klist = kphext2(dataiu.SRC, augout, svoc)
        for i in range(len(dataiu.index)):
            dataiu.iloc[i, 2].extend(list(set(klist[i])))
        output = pd.concat([dataik, dataiu], join="inner")
        output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')

        if es == 1:
            output['journal'] = output['journal'].apply(safe_value)
            output['conclusions'] = output['conclusions'].apply(safe_value)
            output['pubdate'] = output['pubdate'].apply(safe_year)
            output['PMID'] = output.index
            test_server = [{'host': '127.0.0.1', 'port': 9200}]
            es = Elasticsearch(test_server, http_compress=True)
            use_these_keys = [
                'PMID', 'title', 'abstract', 'keywords', 'authors', 'pubdate'
            ]

            def filterKeys(document):
                return {key: document[key] for key in use_these_keys}

            def doc_generator(df):
                df_iter = df.iterrows()
                for index, document in df_iter:
                    try:
                        yield {
                            "_index": 'ms',
                            "_source": filterKeys(document),
                        }
                    except StopIteration:
                        return

            helpers.bulk(es, doc_generator(output))
        print(ttp3.mean())
import pprint
import pandas as pd
import numpy as np
import xmltodict
from xml.etree import ElementTree
from pymed import PubMed
from Bio import Entrez
import plotly
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

# https://stackoverflow.com/questions/57053378/query-pubmed-with-python-how-to-get-all-article-details-from-query-to-pandas-d
pubmed = PubMed(tool="PubMedSearcher", email="*****@*****.**")
results = pubmed.query(
    "nhsx[affiliation]",
    max_results=500)  # number might need to be updated in future, for now low

articleList = []
articleInfo = []

for article in results:
    # Each result can be either a PubMedBookArticle or a PubMedArticle;
    # convert it to a dictionary
    articleDict = article.toDict()
    articleList.append(articleDict)

# Generate a list of dict records holding all article details fetched from the PubMed API
for article in articleList:
    # Sometimes article['pubmed_id'] contains a newline-separated list of ids;
    # the first one is the article's own pubmedId
    pubmedId = article["pubmed_id"].partition("\n")[0]  # keep only pubmed id
    # Append article info to dictionary
    # articleInfo.append({
from pymed import PubMed

# Create a PubMed object that GraphQL can use to query
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="MyTool", email="*****@*****.**")

# Create a GraphQL query in plain text
query = '(("2018/05/01"[Date - Create] : "3000"[Date - Create])) AND (Xiaoying Xian[Author] OR diabetes)'

# Execute the query against the API
results = pubmed.query(query, max_results=500)

# Loop over the retrieved articles
for article in results:
    # Extract and format information from the article
    # (pymed article objects expose pubmed_id, not article_id)
    article_id = article.pubmed_id
    title = article.title
    keywords = ''  # default so the print below works for keyword-less articles
    if article.keywords:
        if None in article.keywords:
            article.keywords.remove(None)
        keywords = '", "'.join(article.keywords)
    publication_date = article.publication_date
    abstract = article.abstract

    # Show information about the article
    print(
        f'{article_id} - {publication_date} - {title}\nKeywords: "{keywords}"\n{abstract}\n'
    )
def get_pubmed_data(self, query, searched_zipcode, date, maximum_number_of_value=3):
    csv_data = {
        "affiliation": [],
        "number_of_authors": [],
        "authors_name": [],
        "authors_institute": [],
        "authors_address": [],
        "authors_zipcode": [],
        "paper_title": [],
        "publication_date": [],
        "journal": []
    }
    pubmed = PubMed(tool="MyTool", email="*****@*****.**")
    parser = Parser()
    results = pubmed.query(query, max_results=maximum_number_of_value)
    is_queried_by_zipcode = searched_zipcode.isdecimal()
    if is_queried_by_zipcode:
        searched_zipcode = int(searched_zipcode)
    for article in results:
        jsonData = json.loads(article.toJSON())
        authors_list = jsonData['authors']
        authors_name = ""
        authors_institute = ""
        authors_affiliation = ""
        authors_address = ""
        authors_zipcode = ""
        num_authors = len(authors_list) or 0
        counted_matched = 0
        if is_queried_by_zipcode:
            counted_matched = self.has_match_zipcode_of_authprs(
                authors_list, searched_zipcode)
        if (not is_queried_by_zipcode) or (is_queried_by_zipcode and counted_matched > 0):
            for index in range(0, num_authors):
                affiliation = authors_list[index]["affiliation"] or "<NOT_AVAILABLE>"
                zipcode = str(self.get_address_with_zipcode(affiliation))
                # print(type(zipcode))
                # print(zipcode)
                author_name = authors_list[index]['firstname'] + " " + \
                    authors_list[index]["lastname"] or "<NOT_AVAILABLE>"
                author_institute = ""
                author_institute += self.get_organization(affiliation=affiliation) + " "
                authors_affiliation += affiliation
                authors_name += author_name
                authors_institute += author_institute
                authors_address += str(parser.parse(affiliation))
                authors_zipcode += zipcode
                # "||" separates the values for each author
                if num_authors != index + 1:
                    authors_name += "||"
                    authors_institute += "||"
                    authors_affiliation += "||"
                    authors_address += "||"
                    authors_zipcode += "||"
                else:
                    break
        paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
        publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
        journal = jsonData['journal'] or "<NOT_AVAILABLE>"
        if self.is_us:
            if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
                csv_data["authors_name"].append(authors_name)
                csv_data["affiliation"].append(authors_affiliation)
                csv_data["authors_institute"].append(authors_institute)
                csv_data["paper_title"].append(paper_title)
                csv_data["publication_date"].append(publication_date)
                csv_data["journal"].append(journal)
                csv_data["authors_address"].append(authors_address)
                csv_data["number_of_authors"].append(num_authors)
                csv_data["authors_zipcode"].append(authors_zipcode)
            self.is_us = False

    # if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
    #     df = pd.DataFrame(csv_data)
    #     print(df.head())
    #     df.to_csv("PubMedData_from.csv", index=False)

    print("Size of csv ", len(csv_data["paper_title"]))
    if len(csv_data["paper_title"]) > 0:
        df = pd.DataFrame(csv_data)
        print(df.head())
        datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
        csv_file_name = "PubMedData_From_" + datetimeobject.strftime('%Y_%m_%d') + ".csv"
        print(csv_file_name)
        df.to_csv(csv_file_name, index=False)
class query(object):
    # * Store Flags
    def __init__(self):
        # Positional Arguments
        self.oFile = args.oFile
        # Flags for information requested by Pubmed API
        self.email = args.email
        self.tool = args.tool
        # All other flags used to build the query
        self.author1 = args.author1
        self.authors = args.authors
        self.title = args.title
        self.terms = args.terms
        self.userquery = args.userquery
        self.psYear = args.pubSinceYear
        self.psLast = args.pubSinceLast
        self.maxResults = args.maxResults

    # * Build Object
    # Create a PubMed object that GraphQL can use to query
    def buildQuery(self):
        # Build Object and send some info to PubMed by their request
        # Note that the parameters below are not required but kindly requested by PubMed Central
        # https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
        self.pubmed = PubMed(tool=self.tool, email=self.email)

        # * Create query to feed into Pubmed
        self.query = ""
        # First author
        if self.author1 is not None:
            if '#' in str(self.author1):
                self.author1 = str(self.author1).replace('#', ' ')
            self.query = self.query + str(self.author1)[2:-2] + ' [1au] AND '
        # Authors
        if self.authors is not None:
            for author in self.authors.split(' '):
                if '#' in author:
                    author = author.replace('#', ' ')
                self.query = self.query + author + ' [auth] AND '
        # Title
        if self.title is not None:
            for tword in self.title:
                self.query = self.query + tword + ' [ti] AND '
        # Terms
        if self.terms is not None:
            for item in self.terms.split(' '):
                self.query = self.query + item + ' AND '
        # User query
        if self.userquery is not None:
            userquery = str(self.userquery)[2:-2]
            self.query = self.query + userquery + ' AND '
        # Calculate what the start date is for articles to be included based on user settings
        if self.psLast is not None:
            # Only include articles published in the last <x> years
            self.dYa = datetime.now() - relativedelta(years=int(self.psLast))
            self.dayYearsAgo = str(self.dYa).split(' ')[0].replace('-', '/')
            self.dYaQuery = '(' + self.dayYearsAgo + '[Date - Create] : "3000"[Date - Create])'
        else:
            self.dYaQuery = '("' + self.psYear + '/01/01"[Date - Create] : "3000"[Date - Create])'
        self.query = self.query + self.dYaQuery
        # Announce created query for verification:
        print(f'''
        This is your query:
        {self.query}
        ''')

    def runQuery(self):
        # Execute the query against the API
        self.results = self.pubmed.query(self.query, max_results=int(self.maxResults) + 1)
        # Make dictionary to store data
        self.output = {}
        # Count the retrieved articles
        self.nResults = 0
        for result in self.results:
            self.nResults = self.nResults + 1
        # Check if there are more than <n> results
        if self.nResults > int(self.maxResults):
            # Show warning
            print('More than ' + str(self.maxResults) + ' results found')
        elif self.nResults == 0:
            # Show warning
            print('No results found')
        else:
            # Print number of results
            print(str(self.nResults) + ' result(s) obtained.')
        # Loop over the retrieved articles (the generator was consumed while
        # counting, so run the query again)
        self.results = self.pubmed.query(self.query, max_results=int(self.maxResults))
        for article in self.results:
            # Extract and format information from the article
            article_id = article.pubmed_id.split()[0]
            title = article.title
            authors = article.authors
            # if article.keywords:
            #     if None in article.keywords:
            #         article.keywords.remove(None)
            #     keywords = '", "'.join(article.keywords)
            publication_date = article.publication_date
            abstract = article.abstract
            if hasattr(article, 'journal'):
                journal = article.journal
            else:
                journal = 'NA'
            # Reshape author list
            authorString = ''
            for author in authors:
                last = author['lastname']
                first = author['firstname']
                if last is None:
                    last = 'NA'
                if first is None:
                    first = 'NA'
                authorString = authorString + ' ' + last + ', ' + first + ';'
            # Add results to the dictionary
            self.output[article_id] = [
                article_id, title, authorString, journal, publication_date, abstract
            ]

        # Put data in a dataframe after extraction
        self.DF = pd.DataFrame.from_dict(self.output)
        self.DF = self.DF.T
        self.DF = self.DF.reset_index(drop=True)  # Remove row names
        self.DF.columns = [
            "PMID", "Title", "Authors", "Journal", "PubDate", "Abstract"
        ]

        # Save to Excel
        self.writer = pd.ExcelWriter(self.oFile, engine='xlsxwriter')
        self.DF.to_excel(self.writer, sheet_name='PMquery', index=False)
        self.workbook = self.writer.book
        self.worksheet = self.writer.sheets['PMquery']
        # Formatting
        self.format = self.workbook.add_format({
            'text_wrap': True,
            'align': 'top'
        })
        self.worksheet.set_column('A:A', 9, self.format)
        self.worksheet.set_column('B:C', 22, self.format)
        self.worksheet.set_column('D:E', 11, self.format)
        self.worksheet.set_column('F:F', 58, self.format)
        self.writer.save()

        # If there is only one result, also return information to the shell
        if self.nResults == 1:
            #print(json.dumps(self.output.items, indent=4))
            #print(self.output.items())
            key = list(self.output.keys())[0]
            print('\n ----------------------------------\n',
                  'PMID:      ', self.output[key][0], '\n',
                  'Title:     ', self.output[key][1], '\n',
                  'Authors:   ', self.output[key][2], '\n',
                  'Journal:   ', self.output[key][3], '\n',
                  'Published: ', self.output[key][4], '\n',
                  '----------------------------------\n')
from tqdm import tqdm
from pymed import PubMed
import pickle

# change the query with your own! [CHANGE QUERY]
pubmed = PubMed(tool="OPLR", email="*****@*****.**")
results = pubmed.query(
    '"trans women" OR "trans woman" OR "trans man" OR "trans men" OR "transwoman" OR "transwomen" OR "transmen" OR "transman" OR "transgender" OR "transsexual" OR "transgenderism" OR "transsexuality" OR "transsexualism"',
    max_results=20000)

mydict = {"pubs": {}, "labels": {}}

# Here we construct an OPLR dictionary file based on the pubmed API:
# these are the fields; only fields present in both articles and books are used (copyrights is not used)
# BOTH : "pubmed_id" "title" "abstract" "publication_date" "authors" "copyrights" "doi"
# ARTI : "keywords" "journal" "methods" "conclusions" "results" "xml"
# BOOK : "doi" "isbn" "language" "publication_type" "sections" "publisher" "publisher_location"
i = 0
for pub in tqdm(results):
    i += 1
    authors = []
    for a in pub.authors:
        try:
            name = a['initials'] + " " + a['lastname']
        except TypeError:
            name = ""
        authors.append(name)
    try:
        date = pub.publication_date.strftime("%Y-%m-%d")
    except AttributeError:
        # (the original snippet is cut off here; an empty-string fallback is
        # assumed for publication dates that are not date objects)
        date = ""