#!/usr/bin/env python2 # We want to know, of the files we have, what is the breakdown? import os import pandas from brainbehavior.pubmed import Pubmed # First we need to download full article text # Create a pubmed object email = "*****@*****.**" pm = Pubmed(email) # Get pubmed ids for all articles in database pc_ids = pm.get_pubmed_central_ids() # We are going to download them here download_folder = "/scratch/PI/russpold/data/PUBMED/articles" # Submit scripts to download in batches of 100 iters = len(pc_ids) / 100 # Subset matrix to files we have downloaded subset = pandas.DataFrame(columns=pm.ftp.columns) for i in range(0, iters): print "%s of %s" % (i, iters) download_subfolder = "%s/%s" % (download_folder, i) start = i * 100 if i != iters: end = start + 100 else:
#!/usr/bin/env python2

# Download the PubMed Central articles whose ids fall in the slice
# [start:end] of the id list returned by Pubmed.get_pubmed_central_ids().
#
# Usage: download_pubmed_muhaha.py start end download_folder email
#
#   start, end       integer slice bounds into the list of ids
#   download_folder  handed unchanged to Pubmed.download_pubmed
#   email            handed to the Pubmed constructor

import sys

import pandas
from brainbehavior.pubmed import Pubmed

# Four positional arguments are required. Exit with a usage message rather
# than a bare IndexError when some are missing; extra arguments are still
# ignored, exactly as before.
if len(sys.argv) < 5:
    sys.exit("Usage: %s start end download_folder email" % sys.argv[0])

# Get the start and end index of ids from the command line
start = int(sys.argv[1])
end = int(sys.argv[2])
download_folder = sys.argv[3]
email = sys.argv[4]

# Create a pubmed object
pm = Pubmed(email)

# Get pubmed ids for articles in database
pc_ids = pm.get_pubmed_central_ids()

# Filter down to the indices that we want
pc_ids = pc_ids[start:end]

# Download the articles!
pm.download_pubmed(pc_ids, download_folder)
#!/usr/bin/env python2

# Download a single pubmed article, identified by the id given on the
# command line, into a download folder.
#
# Usage: <this script> pmid download_folder email
#
#   pmid             id of the article to download (passed through as a string)
#   download_folder  handed unchanged to Pubmed.download_pubmed
#   email            handed to the Pubmed constructor

import sys

import pandas
from brainbehavior.pubmed import Pubmed

# Three positional arguments are required. Exit with a usage message rather
# than a bare IndexError when some are missing; extra arguments are still
# ignored, exactly as before.
if len(sys.argv) < 4:
    sys.exit("Usage: %s pmid download_folder email" % sys.argv[0])

# Get the article id, target folder and email from the command line
pmid = sys.argv[1]
download_folder = sys.argv[2]
email = sys.argv[3]

# Create a pubmed object
pm = Pubmed(email)

# Download the article! The id is passed as a one-element list.
pm.download_pubmed([pmid], download_folder)
#!/usr/bin/env python2 # We want to know, of the files we have, what is the breakdown? import os import pandas from brainbehavior.pubmed import Pubmed # First we need to download full article text # Create a pubmed object email = "*****@*****.**" pm = Pubmed(email) # Get pubmed ids for all articles in database pc_ids = pm.get_pubmed_central_ids() # We are going to download them here download_folder = "/scratch/PI/russpold/data/PUBMED/articles" # Submit scripts to download in batches of 100 iters = len(pc_ids)/100 # Subset matrix to files we have downloaded subset = pandas.DataFrame(columns=pm.ftp.columns) for i in range(0,iters): print "%s of %s" %(i,iters) download_subfolder = "%s/%s" %(download_folder,i) start = i*100 if i != iters: end = start + 100
#!/usr/bin/env python2 # This script will launch instances of download_pubmed_muhaha.py import os import time from brainbehavior.pubmed import Pubmed from glob import glob # First we need to download full article text # Create a pubmed object email = "*****@*****.**" pm = Pubmed(email) # Get pubmed ids for all articles in database pc_ids = pm.get_pubmed_central_ids() # Download folder download_folder = "/scratch/PI/russpold/data/PUBMED/articles" # Submit scripts to download in batches of 100 start = 0 iters = len(pc_ids)/100 # Function to submit a single iteration of a missing job def submit_single_missing(pmid,download_folder,email): jobname = "pm_%s" %(pmid) filey = open(".job/%s.job" % (jobname),"w") filey.writelines("#!/bin/bash\n") filey.writelines("#SBATCH --job-name=%s\n" %(jobname)) filey.writelines("#SBATCH --output=.out/%s.out\n" %(jobname))