def firsttime_run(filedir='recount-methylation-files', run_timestamp=None):
    """ firsttime_run

    On first setup, run new equeries and the GSE query filter.

    Arguments:
        * filedir (str): Dir name for db files.
        * run_timestamp (str): NTP timestamp for versioning. If None, a
            fresh timestamp is retrieved per call. (The previous signature
            default of `gettime_ntp()` was evaluated once at import time,
            freezing the timestamp for every call.)

    Returns:
        * gseidlist (list): List of valid GSE IDs, or None on error.
    """
    # Resolve the timestamp lazily so each run gets a fresh value.
    if run_timestamp is None:
        run_timestamp = gettime_ntp()
    print("Beginning first time server run...")
    equery_dest = settings.equerypath
    # Run fresh GSE and GSM equeries, then filter GSE IDs on GSM membership.
    # (The original also bound getlatest_filepath() lookups for the raw
    # query files to unused locals; those pure lookups were dropped.)
    gse_query()
    gsm_query()
    gsequery_filter()
    gsefiltpath = getlatest_filepath(equery_dest, 'gsequery_filt')
    if not gsefiltpath:
        print("Error retrieving gse query filtered file. Returning...")
        return None
    # Parse the filtered query file into {gseid: [gsmids]} and return keys.
    gsefiltd = querydict(querypath=gsefiltpath, splitdelim=' ')
    gseidlist = list(gsefiltd.keys())
    print("GSE id list of len " + str(len(gseidlist)) + " found. Returning...")
    return gseidlist
def scheduled_run(eqfilt_path=False, run_timestamp=None):
    """ scheduled_run

    Tasks performed on regular schedule, after first setup. For the job
    queue, a list of GSE IDs is returned. The id list is filtered on
    existing GSE soft files to prioritize unrepresented experiments for
    download.

    Arguments:
        * eqfilt_path (str) : Filepath to edirect query filter file.
        * run_timestamp (str) : NTP timestamp. If None, a fresh timestamp
            is retrieved per call (the old `gettime_ntp()` signature
            default was evaluated once at import time).

    Returns:
        * gse_list (list) : list of valid GSE IDs, or None if error occurs
    """
    if run_timestamp is None:
        run_timestamp = gettime_ntp()
    # Fix: the original referenced an undefined name `eqpath` in the
    # fallback branch below; bind it to the equery directory here.
    eqpath = settings.equerypath
    try:
        gsefiltd = get_queryfilt_dict()
    except Exception:
        # No filter file yet: ensure the GSE and GSM queries exist, then
        # build the filter and retry.
        print("No gse query filt file found, checking for GSE and GSM " +
              "queries...")
        gsequery_latest = getlatest_filepath(filepath=eqpath,
                                             filestr='gse_edirectquery')
        if not gsequery_latest:
            gse_query()
        gsmquery_latest = getlatest_filepath(eqpath, 'gsm_edirectquery')
        if not gsmquery_latest:
            gsm_query()
        print("Running filter on GSE query...")
        gsequery_filter()
        gsefiltd = get_queryfilt_dict()
    # get list of GSE IDs from existing SOFT files
    gsesoftfiles = os.listdir(settings.gsesoftpath)
    print("GSE SOFT files: " + str(gsesoftfiles))
    rxgse = re.compile('GSE[0-9]*')
    gseid_softexists = [str(rxgse.findall(softfn)[0])
                        for softfn in gsesoftfiles if rxgse.findall(softfn)]
    if not gsefiltd:
        print("Error forming equery filt dictionary. Returning...")
        return None
    gseid_listall = list(gsefiltd.keys())
    print("GSE ID list of len " + str(len(gseid_listall)) +
          " found. Filtering..")
    # Prefer experiments without an existing SOFT file.
    if gseid_softexists:
        gseid_filt = [gseid for gseid in gseid_listall
                      if not gseid in gseid_softexists]
    else:
        gseid_filt = gseid_listall
    print("After filtering existing SOFT files, N = " + str(len(gseid_filt)) +
          " GSE IDs remain. Returning ID list...")
    # if all GSE IDs represented, return all GSE IDs for brand new run
    if len(gseid_filt) == len(gseid_listall):
        gseid_filt = gseid_listall
    return gseid_filt
def gsequery_filter(splitdelim='\t', timestamp=None):
    """ gsequery_filter

    Prepare an edirect query file. Filter a GSE query file on its GSM
    membership.

    Arguments:
        * splitdelim (str) : Delimiter to split ids in querydict() call.
        * timestamp (str) : NTP timestamp. If None, a fresh timestamp is
            retrieved per call (the old `gettime_ntp()` signature default
            was evaluated once at import time).

    Returns:
        * gsefiltl (list): Filtered GSE query object (list); writes the
            filtered query file as a side effect.
    """
    if timestamp is None:
        timestamp = gettime_ntp()
    eqpath = settings.equerypath
    gsequerystr = settings.gsequerystr
    gsmquerystr = settings.gsmquerystr
    # get GSM list from gsm query file
    gsmqueryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsmquerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if gsmqueryf_latestpath:
        print("Latest gsmquery file detected: " + str(gsmqueryf_latestpath))
    else:
        print("Error detecting latest gsmquery file! Returning...")
        return
    # Read GSM ids; use a context manager so the handle is closed
    # (the original leaked the open file object).
    with open(gsmqueryf_latestpath[0]) as gsmqf:
        gsmlines = [line.rstrip('\n') for line in gsmqf]
    gsmlist = [line.split('\t')[1::][0] for line in gsmlines]
    gsmset = set(gsmlist)  # O(1) membership tests in the filter loop below
    # get GSE dictionary object
    gsequeryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsequerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if gsequeryf_latestpath:
        print("Latest gsequery file detected: " + str(gsequeryf_latestpath))
    else:
        print("Error detecting latest gsequery file! Returning...")
        return
    gsed_obj = querydict(querypath=gsequeryf_latestpath[0], splitdelim='\t')
    # Keep only GSEs that retain at least one known GSM.
    gsefiltl = []
    for gsekey in list(gsed_obj.keys()):
        samplelist_original = gsed_obj[gsekey]
        samplelist_filt = [sample for sample in samplelist_original
                           if sample in gsmset]
        if samplelist_filt:
            gsefiltl.append(' '.join([gsekey, ' '.join(samplelist_filt)]))
    print('writing filt file...')
    if eqpath:
        filtfn = ".".join(["gsequery_filt", timestamp])
        with open(os.path.join(eqpath, filtfn), 'w') as filtfile:
            for item in gsefiltl:
                filtfile.write("%s\n" % item)
    return gsefiltl
def eqd_gsm_exclude(equery_dest=settings.equerypath,
                    filesdir=settings.filesdir, gsmv_fname="gsmv.txt",
                    exclude_dpath=os.path.join("inst", "freeze_gsmv")):
    """ eqd_gsm_exclude

    Exclude GSM IDs from edirect query objects.

    Arguments:
        * equery_dest: Path to the equery files directory.
        * filesdir: Root files directory (unused here; retained for
            interface compatibility).
        * gsmv_fname: Name of the file to load. Should include only
            space-separated sample/GSM IDs in a single line.
        * exclude_dpath: Path to directory containing the file gsmv_fname.

    Returns:
        * Path to the new filtered file at settings.equerypath, or None if
            the exclude file or latest filter file could not be found.
    """
    gsmv_fpath = os.path.join(exclude_dpath, gsmv_fname)
    if not os.path.exists(gsmv_fpath):
        # Fix: the original fell through after this message and crashed on
        # open(); bail out instead.
        print("Couldn't find sample ID file")
        return None
    # Single line of space-separated GSM IDs to exclude.
    with open(gsmv_fpath) as gsmvfile:
        gsmv_exclude = [line.rstrip('\n').split(" ")
                        for line in gsmvfile][0]
    eqpath = settings.equerypath
    gsefilt_latest = getlatest_filepath(eqpath, 'gsequery_filt',
                                        embeddedpattern=True, tslocindex=1,
                                        returntype='returnlist')
    if not gsefilt_latest:
        # Guard the unchecked [0] index of the original.
        print("Couldn't find latest gsequery_filt file")
        return None
    gsefilt_latest = gsefilt_latest[0]
    print("Starting with latest detected filter file: "+gsefilt_latest)
    with open(gsefilt_latest) as filtfile:
        querylines = [line.rstrip('\n') for line in filtfile]
    qlnew = []
    print("Applying filter...")
    gsmv_excludeset = set(gsmv_exclude)  # O(1) membership tests
    for line in querylines:
        tokens = line.split(" ")
        ldat = [gid for gid in tokens if not gid in gsmv_excludeset]
        # Keep only studies that retain at least one GSM after the GSE ID.
        if len(ldat) > 1:
            qlnew.append(ldat)
    print("After filter, retained " + str(len(qlnew)) + " studies.")
    nts = gettime_ntp()
    newfpath = os.path.join(eqpath, ".".join(["gsequery_filt", nts]))
    print("Writing new filter file: ", newfpath)
    with open(newfpath, "w") as wf:
        for line in qlnew:
            wf.write(" ".join(line) + "\n")
    return newfpath
def compile_rsheet(eqfiltd=None, sheetfn_ext='rsheet', msrapfn_ext='msrapout',
                   msrapfn='msrapout', idatsfn_ext='idat', timestamp=None):
    """ compile_rsheet

    Knits poised file data together into a sheet to be read into R using
    minfi. Steps taken include:
        1. Grab msrap file list
        2. Grab idats file list
        3. Intersect files lists
        4. Subset eqfilt dict on gse
        5. Form and write new sheet files, one per gse

    Arguments:
        * eqfiltd (dict) : Equery filter dictionary object. If None, the
            latest filter dictionary is loaded via get_queryfilt_dict().
            (The previous signature default called get_queryfilt_dict() at
            import time, doing I/O before the function was ever used.)
        * sheetfn_ext (str) : Filename extension for new sheet files.
        * msrapfn_ext (str) : Filename extension of valid MetaSRA-pipeline
            datafiles.
        * msrapfn (str) : File name stem for MetaSRA-pipeline files
            (currently unused; retained for interface compatibility).
        * idatsfn_ext (str) : Filename extension of valid idat files.
        * timestamp (str) : NTP timestamp for file versioning. If None, a
            fresh timestamp is retrieved per call.

    Returns:
        * lsheet (list) of sheet rows, 0 if no valid GSM IDs were found;
            writes sheet files as a side effect.
    """
    # Resolve lazy defaults.
    if eqfiltd is None:
        eqfiltd = get_queryfilt_dict()
    if timestamp is None:
        timestamp = gettime_ntp()
    # form the sheet path and make dir as needed
    sheetspath = settings.sheetspath
    os.makedirs(sheetspath, exist_ok=True)
    sheets_fpath = os.path.join(sheetspath,
                                ".".join([timestamp, sheetfn_ext]))
    # form msrap and idat paths and get filenames
    msrap_path = settings.gsmmsrapoutpath
    rxmsrap = re.compile(".*" + msrapfn_ext + "$")
    msrap_fnlist = list(filter(rxmsrap.match, os.listdir(msrap_path)))
    print("msrap_fnlist : " + str(msrap_fnlist))
    # idats fn
    idats_path = settings.idatspath
    rxidat = re.compile(".*" + idatsfn_ext + "$")
    idats_fnlist = list(filter(rxidat.match, os.listdir(idats_path)))
    # extract gsm ids; idat fns carry the GSM id in the first dot token,
    # msrap fns in the second dot token.
    rxgsm = re.compile(".*GSM[0-9]")
    idats_splitlist = [fn.split(".")[0] for fn in idats_fnlist
                       if len(fn.split(".")) > 1]
    idats_gsmlist_filt = list(set(filter(rxgsm.match, idats_splitlist)))
    # NOTE: loop var renamed from `msrapfn`, which shadowed the parameter.
    msrap_splitlist = [fn.split(".")[1] for fn in msrap_fnlist
                       if len(fn.split(".")) > 1]
    msrap_gsmlist_filt = list(set(filter(rxgsm.match, msrap_splitlist)))
    print("idats_gsmlist_filt : " + str(idats_gsmlist_filt))
    print("msrap_gsmlist_filt : " + str(msrap_gsmlist_filt))
    # GSMs with both msrap metadata and idats available
    gsmvalid = [gsmid for gsmid in msrap_gsmlist_filt
                if gsmid in idats_gsmlist_filt]
    if len(gsmvalid) > 0:
        rxgrn = re.compile(".*Grn.idat$")
        rxred = re.compile(".*Red.idat$")
        lsheet = []  # list object to write rsheet, one row per gsmid
        # append colnames
        lsheet.append(" ".join([
            "gsmid", "gseid", "idats_fn", "msrapmd_fn", "msrapmd_flatjson",
            "SENTRIX_ID", "ARRAY_ID", "Basename"
        ]))
        lsheet[0] = lsheet[0] + "\n"
        for gsmid in gsmvalid:
            # compile the file info for this gsm
            rxgsmi = re.compile(".*" + gsmid + ".*")
            gsmi_idats = list(filter(rxgsmi.match, idats_fnlist))
            gsmi_red_idats = list(filter(rxred.match, gsmi_idats))
            gsmi_grn_idats = list(filter(rxgrn.match, gsmi_idats))
            # get the latest file versions (third dot token is the
            # version-bearing filename pattern)
            gsmi_red_pattern = gsmi_red_idats[0].split(".")[2]
            gsmi_grn_pattern = gsmi_grn_idats[0].split(".")[2]
            gsmi_red_latest = getlatest_filepath(filepath=idats_path,
                                                 filestr=gsmi_red_pattern,
                                                 embeddedpattern=True)
            gsmi_grn_latest = getlatest_filepath(filepath=idats_path,
                                                 filestr=gsmi_grn_pattern,
                                                 embeddedpattern=True)
            # get the latest msrap file
            gsmi_msrap_latest = getlatest_filepath(filepath=msrap_path,
                                                   filestr=gsmid,
                                                   embeddedpattern=True)
            print(gsmi_msrap_latest)
            if (gsmi_red_latest and not gsmi_red_latest == 0
                    and gsmi_grn_latest and not gsmi_grn_latest == 0
                    and gsmi_msrap_latest and not gsmi_msrap_latest == 0):
                # form the rsheets with valid gsm ids
                with open(gsmi_msrap_latest, 'r') as msrapmd:
                    gsmi_metadata_dict = json.load(msrapmd)
                gsmi_md = gsmi_metadata_dict[0]  # weird dictionary
                # flatten the metadata dict into "key:value" tokens
                grows = []
                for key in list(gsmi_md.keys()):
                    kval = gsmi_md[key]
                    if type(kval) is list:
                        grows.append(";".join(kval))
                    else:
                        grows.append(":".join([str(key), str(gsmi_md[key])]))
                gsmi_mdvar = "'" + ";".join(grows) + "'"
                # grab the gse id for this gsm
                gseid = str([gsek for gsek in list(eqfiltd.keys())
                             if gsmid in eqfiltd[gsek]][0])
                # make the gsm arrays path Basename for minfi
                gsmi_bn = "_".join(gsmi_red_latest.split("_")[0:3])
                # one entry per gsm
                lgsmi = " ".join([
                    gsmid,  # gsm id
                    gseid,  # gse id
                    ";".join([
                        os.path.basename(gsmi_red_latest),
                        os.path.basename(gsmi_grn_latest)
                    ]),  # idat filenames
                    os.path.basename(gsmi_msrap_latest),  # metadata filename
                    gsmi_mdvar,  # flattened json file
                    os.path.basename(gsmi_red_latest).split("_")[-2],  # sentrix id
                    os.path.basename(gsmi_red_latest).split("_")[-3],  # array id
                    gsmi_bn  # minfi path Basename, for arrays
                ])
                lgsmi = lgsmi + "\n"
                lsheet.append(lgsmi)
    else:
        print("No valid GSM IDs detected. Check idats and MetaSRA-pipeline GSM "
              + "files directories.")
        return 0
    # write the final sheet files
    with open(sheets_fpath, 'w') as fsheet:
        for item in lsheet:
            fsheet.write(item)
    return lsheet
def rmdb_fpaths_old(rmhlinks=False):
    """ rmdb_fpaths_old

    Get filepaths for existant sample idats and msrap outfiles, consulting
    the RMDB MongoDB instance for the latest idat records per GSM and
    managing hardlinks to the expanded idat files.

    Arguments:
        * rmhlinks : Whether to remove old hardlinks and form new ones,
            regardless of whether current hlinks exist (boolean).

    Returns:
        * gsm_fpaths_dd (dict.) : Dictionary of validated filepaths
            (per GSM: [grn hlink, red hlink] or [None, None] or [False],
            optionally followed by the latest msrap outfile path or False).
    """
    timestamp = gettime_ntp()
    # connect to RMDB mongodb
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dbcon = client.recount_methylation; idatscon = dbcon.gsm.idats
    softcon = dbcon.gse.soft; idatslist = list(idatscon.find())
    # grab unique gsm ids
    idatslist = [record for record in idatslist if 'gsmid' in record.keys()]
    gsmindex = list(set([record['gsmid'] for record in idatslist]))
    print("from idats db, found n = "+str(len(gsmindex))+" gsm ids")
    # fname catch patterns for re
    # NOTE(review): these three pattern bindings are never used below —
    # presumably left over from an earlier revision.
    grnidatcatch = settings.grnidat_expcatch
    redidatcatch = settings.redidat_expcatch
    msrapoutcatch = settings.msrapoutfnpattern
    # filter all records for gsm on most recent update datetime
    gsm_fpaths_dd = {}
    # list all previously expanded idat files directy from idats dir
    allidatslist = os.listdir(settings.idatspath)
    allidatslist = list(filter(re.compile('.*\.idat$').match, allidatslist))
    print("found n = "+str((len(allidatslist)))+" expanded idat filenames...")
    # grab and filter idats and msrap outfiles lists
    if rmhlinks:
        print("Beginning sample iterations with hlink removal.")
    else:
        print("Beginning sample iterations without hlink removal.")
    for gi, gsmid in enumerate(gsmindex, 1):
        print("Getting fpaths for gsm: "+str(gsmid)+", num: "+str(gi),
              end="\r")
        gsm_fpaths_dd[gsmid] = []
        # all idat records for the GSM id
        recordsgsm = [record for record in idatslist
                      if record['gsmid']==gsmid]
        # filter records by channel type,
        # note most records are for compressed files
        idatsrec_gsmgrn = [record for record in recordsgsm
                           if isinstance(record['date'],datetime.datetime)
                           and re.search('.*Grn\.idat.*',
                                         os.path.basename(record['filepath']))]
        idatsrec_gsmred = [record for record in recordsgsm
                           if isinstance(record['date'],datetime.datetime)
                           and re.search('.*Red\.idat.*',
                                         os.path.basename(record['filepath']))]
        if idatsrec_gsmgrn and idatsrec_gsmred:
            # get latest records for each channel (sorted by record date)
            irec_filtgrn = sorted(idatsrec_gsmgrn, key=lambda k: k['date'])[-1]
            irec_filtred = sorted(idatsrec_gsmred, key=lambda k: k['date'])[-1]
            # valid record file basenames
            igrnrec_bn = os.path.basename(irec_filtgrn['filepath'])
            iredrec_bn = os.path.basename(irec_filtred['filepath'])
            # check for expanded versions of compressed files
            # ([:-3] drops a compression suffix such as '.gz' — TODO confirm)
            igrn_fn = [fn for fn in allidatslist if igrnrec_bn[:-3] in fn]
            ired_fn = [fn for fn in allidatslist if iredrec_bn[:-3] in fn]
            if igrn_fn and ired_fn:
                igrn_fn = igrn_fn[0]
                ired_fn = ired_fn[0]
                hllist = []
                if rmhlinks:
                    # remove old hard links to sample idats
                    grnhl_torm = [fn for fn in allidatslist if "hlink" in fn
                                  and '.'.join(igrn_fn.split('.')[2:]) in fn]
                    redhl_torm = [fn for fn in allidatslist if "hlink" in fn
                                  and '.'.join(ired_fn.split('.')[2:]) in fn]
                    if grnhl_torm:
                        for hlfn in grnhl_torm:
                            os.remove(os.path.join(settings.idatspath, hlfn))
                    if redhl_torm:
                        for hlfn in redhl_torm:
                            os.remove(os.path.join(settings.idatspath, hlfn))
                    # new hlinks
                    hllist = new_idat_hlinks(gsmid, ts=timestamp,
                                             igrn_fn=igrn_fn,
                                             ired_fn=ired_fn)
                else:
                    # check if hlinks exist, create new ones otherwise
                    grnhllist = [fn for fn in allidatslist if "hlink" in fn
                                 and '.'.join(igrn_fn.split('.')[2:]) in fn]
                    redhllist = [fn for fn in allidatslist if "hlink" in fn
                                 and '.'.join(ired_fn.split('.')[2:]) in fn]
                    # get matching grn and red hlink fn's if they exist
                    status_hlink = None  # NOTE(review): never read afterwards
                    grnfnpass = None
                    redfnpass = None
                    if grnhllist and redhllist:
                        grnhllistfilt = list(set(grnhllist))
                        redhllistfilt = []
                        # pair red hlinks to grn hlinks by shared prefix;
                        # [:-9] strips the channel-specific filename tail
                        # ('Grn.idat'/'Red.idat') — TODO confirm
                        for ghl in grnhllistfilt:
                            for rhl in redhllist:
                                # check that base array ids identical
                                if ghl[:-9]==rhl[:-9]:
                                    redhllistfilt.append(rhl)
                                else:
                                    redhllistfilt.append("")
                        rhlfiltsub = [rhl[:-9] for rhl in redhllistfilt]
                        grnhllistfilt = [ghl for ghl in grnhllistfilt
                                         if ghl[:-9] in rhlfiltsub]
                        redhllistfilt = [rhl for rhl in redhllistfilt
                                         if not rhl==""]
                        if grnhllistfilt and redhllistfilt:
                            grnfnpass = grnhllistfilt[0]
                            redfnpass = redhllistfilt[0]
                            # pass hlinks to return dictionary
                            hllist.append(os.path.join(settings.idatspath,
                                                       grnfnpass))
                            hllist.append(os.path.join(settings.idatspath,
                                                       redfnpass))
                        else:
                            # make new hlinks
                            hllist = new_idat_hlinks(gsmid, ts=timestamp,
                                                     igrn_fn=igrn_fn,
                                                     ired_fn=ired_fn)
                    else:
                        # make new hlinks
                        hllist = new_idat_hlinks(gsmid, ts=timestamp,
                                                 igrn_fn=igrn_fn,
                                                 ired_fn=ired_fn)
                # finally, pass listed hlinks to return dictionary
                gsm_fpaths_dd[gsmid].append(hllist[0])
                gsm_fpaths_dd[gsmid].append(hllist[1])
            else:
                # no expanded idat pair found for this sample
                gsm_fpaths_dd[gsmid].append(None)
                gsm_fpaths_dd[gsmid].append(None)
        else:
            # no dated grn/red record pair in the db for this sample
            gsm_fpaths_dd[gsmid].append(False)
        # check for valid MetaSRA-pipeline filepaths
        try:
            msraplatest = getlatest_filepath(filepath=settings.gsmmsrapoutpath,
                                             filestr=gsmid,
                                             embeddedpattern=True,
                                             tslocindex=0,
                                             returntype='returnlist')
            if msraplatest and len(msraplatest)==1:
                gsm_fpaths_dd[gsmid].append(msraplatest[0])
        except:
            # NOTE(review): bare except silently records failure as False —
            # consider narrowing to the expected exception type.
            gsm_fpaths_dd[gsmid].append(False)
        print("Finished with sample num "+str(gi), end="\r")
    print("Finished sample iterations. Returning...")
    # return gsmid dictionary with lists of filtered results or valid fpaths
    return gsm_fpaths_dd
def scan_gsmstatdict(usersheet=True, maxbn=40000,
                     gsmstatdictpath=settings.gsmstatpicklepath):
    """ scan_gsmstatdict

    Make a new GSM status dictionary, or update an existing dictionary with
    latest sample data from compilations files.

    Arguments:
        * usersheet (Bool.): Whether to load sample basenames from latest
            detected rsheet. If 'False', detect basenames de novo with
            "getbn()".
        * maxbn (int): Max basenames allowed when forming new status
            dictionary.
        * gsmstatdictpath (path/str): Path from which to read status
            dictionary.

    Returns:
        * None on error; otherwise updates and pickles the status
            dictionary as a side effect.
    """
    # Phase 1: if no pickled dictionary exists yet, build one keyed on
    # sample basenames.
    if not os.path.exists(gsmstatdictpath):
        basenames = []
        if usersheet:
            rslatest = getlatest_filepath(filepath=settings.sheetspath,
                                          filestr="rsheet",
                                          embeddedpattern=True, tslocindex=0,
                                          returntype='returnlist')
            if rslatest:
                rslpath = rslatest[0]
                print("Detected latest rsheet. Reading sample ids...")
                with open(rslpath, "r") as rso:
                    for linect, line in enumerate(rso, 1):
                        if line[0:3] == 'GSM':
                            # column 8 of the rsheet holds the Basename
                            basenames.append(
                                line.split(' ')[7].replace('\n', ''))
                        print("Finished reading line num " + str(linect),
                              end="\r")
                print("Finished reading rsheet. Continuing...")
        else:
            # form the new status dictionary
            print("Getting basenames with 'getbn()'...")
            basenames = getbn(maxbn=maxbn)
        if not basenames:
            print("Error obtaining basenames. Returning...")
            return None
        else:
            print("Finished retrieving n = " + str(len(basenames)) +
                  " basenames. Forming dictionary...")
            # one empty status list per basename
            gsmstatdict = {bn: [] for bn in basenames}
            # NOTE(review): plain open/close; a `with` block would be safer
            pickle_out = open(gsmstatdictpath, "wb")
            pickle.dump(gsmstatdict, pickle_out)
            pickle_out.close()
    # Phase 2: update the (now existing) dictionary from compilation files.
    # check path for existing file
    if os.path.exists(gsmstatdictpath):
        print("Detected sample status dictionary. Updating...")
        tasktype = "update dictionary"  # NOTE(review): never read afterwards
        gsmstatdict = pickle.load(open(gsmstatdictpath, "rb"))
        cflist = os.listdir(settings.compilationspath)
        cflist = [cfn for cfn in cflist if 'compilation' in cfn]
        for cfn in cflist:
            print("Starting on cfn " + str(cfn))
            # second dot token of the compilation fname encodes its type
            cftype = cfn.split('.')[1]
            print("Detected compilation type " + str(cftype))
            cfnpath = os.path.join(settings.compilationspath, cfn)
            with open(cfnpath, "r") as opencfn:
                for li, line in enumerate(opencfn, 1):
                    # data rows start with a quoted GSM filename token
                    if line.split(' ')[0][0:4] == '"GSM':
                        gsmfname = line.split(' ')[0].replace('"', '')
                        if gsmfname in list(gsmstatdict.keys()):
                            if not cftype in gsmstatdict[gsmfname]:
                                gsmstatdict[gsmfname].append(cftype)
                        else:
                            gsmstatdict[gsmfname] = [cftype]
                    print("Finished reading line num " + str(li), end="\r")
            print("Finished reading lines from cfn. " +
                  "Saving updated sample status dictionary.")
            # persist after each compilation file so progress survives crashes
            pickle_out = open(gsmstatdictpath, "wb")
            pickle.dump(gsmstatdict, pickle_out)
            pickle_out.close()
            print("Finished saving updated dictionary. Continuing...")
    else:
        print("Error, could not detect gsm status dictionary at settings path. "
              + "Returning...")
        return None
def dl_idat(input_list, retries_connection=3, retries_files=3,
            interval_con=.1, interval_file=.01, validate=True,
            timestamp=gettime_ntp()):
    """ dl_idat

    Download idats, reading in either list of GSM IDs or ftp addresses.

    NOTE(review): the `timestamp=gettime_ntp()` default is evaluated once
    at import time, so all calls that omit `timestamp` share a single
    stale value; consider `timestamp=None` with a lazy default.

    Arguments:
        * input_list (list, required) : A list of valid GSM IDs.
        * retries_connection (int) : Number of ftp connection retries
            allowed.
        * retries_files (int) : Number of retry attempts allowed for sample
            file downloads.
        * interval_con (float) : Time (in seconds) to sleep before retrying
            a database connection.
        * interval_file (float) : Time (in seconds) to sleep before
            retrying a file connection.
        * validate (Bool.): Validate new files against existing idats?
        * timestamp (str) : An NTP timestamp for versioning.

    Returns:
        * dldict (dictionary) : Records, dates, and exit statuses of ftp
            calls, OR error string over connection issues. Downloads and
            moves new and validated files as side effect.
    """
    idatspath = settings.idatspath
    temppath = settings.temppath
    os.makedirs(idatspath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    # downloads land in a scratch dir; validated files are moved out later
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    item = input_list[0]
    if not item.startswith('GSM'):
        raise RuntimeError("GSM IDs must begin with \"GSM\".")
    ftptoken_login = '******'
    # connect to the GEO ftp server, with bounded retries
    retries_left_connection = retries_connection
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            loginstat = ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            if retries_left_connection:
                retries_left_connection -= 1
                print('continuing with connection retries left = ' +
                      str(retries_left_connection))
                time.sleep(interval_con)
                continue
            else:
                print('connection retries exhausted, returning...')
                return str(e)
    # mongodb connection
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    files_written = []
    for gsm_id in input_list:
        print('Starting GSM: ' + gsm_id)
        dldict[gsm_id] = []
        # GEO supplementary-file path: samples/GSMnnn/<gsm>/suppl
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'samples', gsm_id[:-3] + 'nnn',
            gsm_id, 'suppl'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        filenames = []
        retries_left_files = retries_files
        try:
            filenames = ftp.nlst(id_ftpadd)
            if len(filenames) > 0:
                filestr = '; '.join(str(e) for e in filenames)
                print("files found: " + filestr)
                dldict[gsm_id].append([
                    gsm_id, id_ftpadd,
                    "connection success, valid num idats found"
                ])
                print("Idat filenames detected for " + gsm_id +
                      ", continuing...")
                for file in filenames:
                    print("Beginning iteration for file: " + file)
                    filedate = ""
                    filedate_estat = ""
                    filedl_estat = ""
                    file_tokens = file.split('/')
                    try:
                        # remote modification time, to compare vs. db record
                        filedate = ftp.sendcmd("MDTM /" +
                                               '/'.join(file_tokens))
                        filedate = datetime.datetime.strptime(
                            filedate[4:], "%Y%m%d%H%M%S")
                        mongo_date = idat_mongo_date(gsm_id, file, client)
                        if filedate in mongo_date:
                            # already have this version locally
                            filedate_estat = "same_as_local_date"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            print('Online date same as local date. Breaking..')
                            break
                        else:
                            filedate_estat = "new_date"
                            to_write = os.path.join(
                                temp_dir_make, '.'.join(
                                    [gsm_id, str(timestamp),
                                     file_tokens[-1]]))
                            file_ftpadd = '/'.join(file_tokens[:-1])
                            file_ftpadd = file_ftpadd + '/' + \
                                file_tokens[-1:][0]
                            print('Attempting file download, for file: ' +
                                  file)
                            try:
                                with open(to_write, 'wb') as output_stream:
                                    filedl_estat = ftp.retrbinary(
                                        "RETR /" + file_ftpadd,
                                        output_stream.write)
                                dldict[gsm_id].append([
                                    gsm_id, file_ftpadd, to_write,
                                    filedl_estat, filedate, filedate_estat
                                ])
                                if '226 Transfer complete' in filedl_estat:
                                    files_written.append(
                                        (gsm_id, to_write,
                                         len(dldict[gsm_id]) - 1))
                                    print("File successfully downloaded. " +
                                          "Continuing...")
                                    continue
                            except ftplib.all_errors as efiledl:
                                if retries_left_files:
                                    retries_left_files -= 1
                                    print('ftp file dl error, retries left = '
                                          + str(retries_left_files))
                                    time.sleep(interval_file)
                                    continue
                                else:
                                    print('File retries exhausted. Breaking...')
                                    filedl_estat = str(efiledl)
                                    dldict[gsm_id].append([
                                        gsm_id, file_ftpadd, to_write,
                                        filedl_estat, filedate, filedate_estat
                                    ])
                                    break
                            # NOTE(review): this break stops after the first
                            # incomplete transfer — confirm intent
                            break
                        # NOTE(review): unreachable — both branches above
                        # break or continue
                        break
                    except ftplib.all_errors as efiledate:
                        if retries_left_files:
                            retries_left_files -= 1
                            print('ftplib file date error, retries left = ' +
                                  str(retries_left_files))
                            time.sleep(interval_file)
                            continue
                        else:
                            print('File retries exhausted. Breaking...')
                            filedate_estat = str(efiledate)
                            filedate = "not_available"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            break
                    # redundant: the loop continues anyway
                    continue
            else:
                dldict[gsm_id].append([gsm_id, "no files at ftp address"])
                # NOTE(review): this exits the whole GSM loop, not just this
                # sample — confirm intent
                break
        except ftplib.error_temp as eid:
            if retries_left_files:
                retries_left_files -= 1
                print('ftplib filenames error, retries left = ' +
                      str(retries_left_files))
                time.sleep(interval_file)
                continue
            else:
                print('File retries exhausted. Breaking...')
                dldict[gsm_id].append([gsm_id, id_ftpadd, str(eid)])
                break
    if validate:
        # compare each new temp file against the latest local version;
        # keep it only when it differs
        print("Validating downloaded files...")
        for gsm_id, file_written, index in files_written:
            print("file written is " + file_written)
            filestr = os.path.basename(file_written).split('.')[2::]
            filestr = str('.'.join(filestr))
            print('filestr written : ' + filestr)
            print('dir to search latest: ' + idatspath)
            gsmidat_latest = getlatest_filepath(idatspath, filestr,
                                                embeddedpattern=True,
                                                returntype='returnlist',
                                                tslocindex=1)
            print('gsm latest: ' + str(gsmidat_latest))
            if gsmidat_latest:
                gsmidat_latest = gsmidat_latest[0]
                print('cmp result: ' +
                      str(filecmp.cmp(gsmidat_latest, file_written)))
                if filecmp.cmp(gsmidat_latest, file_written):
                    print("Downloaded file is same as recent file. Removing...")
                    os.remove(file_written)
                    # If filename is false, we found it was the same
                    dldict[gsm_id][index].append(False)
                else:
                    print("Downloaded file is new, moving to idatspath...")
                    shutil.move(
                        file_written,
                        os.path.join(idatspath,
                                     os.path.basename(file_written)))
                    dldict[gsm_id][index].append(True)
                    dldict[gsm_id][index][2] = os.path.join(
                        idatspath, os.path.basename(file_written))
            else:
                print("Downloaded file is new, moving...")
                shutil.move(
                    file_written,
                    os.path.join(idatspath, os.path.basename(file_written)))
                dldict[gsm_id][index].append(True)
                dldict[gsm_id][index][2] = os.path.join(
                    idatspath, os.path.basename(file_written))
    shutil.rmtree(temp_dir_make)
    return dldict
def dl_soft(gse_list=None, retries_connection=3, retries_files=3,
            interval_con=.1, interval_file=.01, validate=True,
            timestamp=None):
    """ dl_soft

    Download GSE soft file(s). Accepts a list of valid GSE IDs.

    Arguments:
        * gse_list (list, required) : A list of valid GSE id(s). (Signature
            default changed from a mutable `[]` to None; behavior for
            callers is unchanged.)
        * retries_connection (int) : Number of ftp connection retries
            allowed.
        * retries_files : Number of retry attempts allowed for sample file
            downloads.
        * interval_con (float) : Time (in seconds) to sleep before retrying
            a database connection.
        * interval_file (float) : Time (in seconds) to sleep before
            retrying a file connection.
        * validate (Bool.): Validate new files against existing soft files?
        * timestamp (str) : An NTP timestamp for versioning. If None, a
            fresh timestamp is retrieved per call (the old `gettime_ntp()`
            signature default was evaluated once at import time).

    Returns:
        * Dictionary showing records, dates, and exit statuses of ftp calls
            OR error string over connection issues
    """
    if gse_list is None:
        gse_list = []
    if timestamp is None:
        timestamp = gettime_ntp()
    gsesoftpath = settings.gsesoftpath
    temppath = settings.temppath
    os.makedirs(gsesoftpath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    # downloads land in a scratch dir; validated files are moved out later
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    item = gse_list[0]
    if not item.startswith('GSE'):
        raise RuntimeError("GSE IDs must begin with \"GSE\".")
    ftptoken_login = '******'
    # connect to the GEO ftp server, with bounded retries
    retries_left_connection = retries_connection
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            loginstat = ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            if retries_left_connection:
                retries_left_connection -= 1
                print('continuing with connection retries left = ' +
                      str(retries_left_connection))
                time.sleep(interval_con)
                continue
            else:
                print('connection retries exhausted, returning...')
                return str(e)
    # mongodb connection
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    print('beginning iterations over gse list...')
    for gse in gse_list:
        print('beginning download for gse: ' + gse)
        retries_left_files = retries_files
        dldict[gse] = []
        files_written = []
        filenames = []
        # tokens for soft file ftp address
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'series', gse[:-3] + 'nnn', gse,
            'soft'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        while retries_left_files:
            try:
                filenames = ftp.nlst(id_ftpadd)
                # filter for only soft file names
                file = list(filter(lambda x: 'family.soft' in x,
                                   filenames))[0]
                dldict[gse].append([gse, id_ftpadd, "success"])
                filedate = ""
                filedate_estat = ""
                filedl_estat = ""
                file_tokens = file.split('/')
                try:
                    print('getting date from ' + '/'.join(file_tokens))
                    filedate = ftp.sendcmd("MDTM /" + '/'.join(file_tokens))
                    filedate = datetime.datetime.strptime(
                        filedate[4:], "%Y%m%d%H%M%S")
                    mongo_date = soft_mongo_date(gse, file, client)
                    if filedate in mongo_date:
                        # already have this version locally
                        print('online date same as local date,' +
                              'breaking...')
                        filedate_estat = "same_as_local_date"
                        dldict[gse].append(
                            [gse, file, filedate, filedate_estat])
                        break
                    else:
                        print('new online date found, continuing...')
                        filedate_estat = "new_date"
                        to_write = os.path.join(
                            temp_dir_make,
                            '.'.join([gse, timestamp, file_tokens[-1]]))
                        file_ftpadd = '/'.join(file_tokens[:-1])
                        file_ftpadd = file_ftpadd + '/' + file_tokens[-1:][0]
                        try:
                            print('downloading soft from ' + file_ftpadd)
                            with open(to_write, 'wb') as output_stream:
                                filedl_estat = ftp.retrbinary(
                                    "RETR /" + file_ftpadd,
                                    output_stream.write)
                            dldict[gse].append([
                                gse, file_ftpadd, to_write, filedl_estat,
                                filedate, filedate_estat
                            ])
                            if '226 Transfer complete' in filedl_estat:
                                files_written.append(
                                    (gse, to_write, len(dldict[gse]) - 1))
                                print('total files written = ' +
                                      str(len(files_written)))
                                print('soft transfer successful for ' +
                                      to_write + ', breaking...')
                                break
                        except ftplib.all_errors as efiledl:
                            print('file download error from ' + file_ftpadd)
                            if retries_left_files:
                                retries_left_files -= 1
                                print('continuing with file retries left =' +
                                      str(retries_left_files))
                                time.sleep(interval_file)
                                continue
                            else:
                                print('file retries exhausted, breaking..')
                                filedl_estat = str(efiledl)
                                dldict[gse].append([
                                    gse, file_ftpadd, to_write, filedl_estat,
                                    filedate, filedate_estat
                                ])
                                break
                except ftplib.all_errors as efiledate:
                    print('error getting date from ' + '/'.join(file_tokens))
                    if retries_left_files:
                        retries_left_files -= 1
                        print('continuing with file retries left = ' +
                              str(retries_left_files))
                        time.sleep(interval_file)
                        continue
                    else:
                        print('file retries exhausted, breaking..')
                        filedate_estat = str(efiledate)
                        filedate = "not_available"
                        dldict[gse].append(
                            [gse, file, filedate, filedate_estat])
                        break
            except ftplib.error_temp as eid:
                print('error making ftp connection to ' + id_ftpadd)
                if retries_left_files:
                    # Fix: the original decremented retries_left_connection
                    # here while testing retries_left_files, so this branch
                    # could loop forever.
                    retries_left_files -= 1
                    print('ftplib error encountered, file retries left = ' +
                          str(retries_left_files))
                    time.sleep(interval_file)
                    continue
                else:
                    print('file retries exhausted, breaking..')
                    dldict[gse].append([gse, id_ftpadd, str(eid)])
                    break
        if validate:
            # compare each new temp file against the latest local version;
            # keep it only when it differs
            print('commencing file validation...')
            for gse, new_filepath, index in files_written:
                filestr = os.path.basename(new_filepath).split('.')[0]
                gsesoft_latest = getlatest_filepath(gsesoftpath, filestr)
                if gsesoft_latest and not gsesoft_latest == 0:
                    if filecmp.cmp(gsesoft_latest, new_filepath):
                        print('identical file found in dest_dir, removing...')
                        dldict[gse].append(False)
                        os.remove(new_filepath)
                    else:
                        print('new file detected in temp_dir, moving to ' +
                              'dest_dir...')
                        dldict[gse].append(True)
                        dldict[gse][index][2] = os.path.join(
                            gsesoftpath, os.path.basename(new_filepath))
                        # Fix: the original moved to an undefined name
                        # `dest_dir` (NameError); the destination is the
                        # soft files directory, matching the path recorded
                        # just above and the else branch below.
                        shutil.move(
                            new_filepath,
                            os.path.join(gsesoftpath,
                                         os.path.basename(new_filepath)))
                else:
                    print('new file detected in temp_dir, moving to dest_dir..')
                    dldict[gse].append(True)
                    dldict[gse][index][2] = os.path.join(
                        gsesoftpath, os.path.basename(new_filepath))
                    shutil.move(
                        new_filepath,
                        os.path.join(gsesoftpath,
                                     os.path.basename(new_filepath)))
    shutil.rmtree(temp_dir_make)
    return dldict
def msrap_getsamples(json_flist=None, fnpatt=".*json.filt$",
                     gsmjsonpath=os.path.join("recount-methylation-files",
                                              "gsm_json_filt"),
                     nprocsamp=50, nmaxproc=20):
    """ msrap_getsamples

    Get the validated samples file list, grouped into sublists for screen
    deployment.

    Arguments:
        * json_flist (list) : List of GSM JSON filenames to process. If not
            provided, function automatically detects any new GSM JSON files
            without available MetaSRA-pipeline outfiles. (Signature default
            changed from a mutable `[]` to None; behavior for callers is
            unchanged.)
        * fnpatt (str): Filename pattern of valid json files to identify.
        * gsmjsonpath (path): Path to JSON formatted sample SOFT data.
        * nprocsamp (int) : Number of samples to process per screen
            deployed.
        * nmaxproc (int) : Maximum processes to launch.

    Returns:
        * ll (list of lists) of filenames grouped per process, or None if
            there is nothing to process.
    """
    if json_flist is None:
        json_flist = []
    print("Checking dirs for msrapout and msrap logs...")
    os.makedirs(settings.gsmmsrapoutpath, exist_ok=True)
    os.makedirs(settings.msraplogspath, exist_ok=True)
    # detect gsm soft files
    psoftpath = settings.psoftscriptpath
    if os.path.exists(psoftpath):
        print("Process soft script found at: " + str(psoftpath))
    gsmsoftpath = settings.gsmsoftpath
    gsmmsrapoutpath = settings.gsmmsrapoutpath
    jsonfnpattern = fnpatt
    rjson = re.compile(jsonfnpattern)
    msrapoutfnpattern = settings.msrapoutfnpattern
    rmsrapout = re.compile(msrapoutfnpattern)
    # generate fl list of valid json files that haven't been processed yet
    fl = []
    if json_flist:
        jsonfnlist = list(filter(rjson.match, json_flist))
    else:
        json_flist = os.listdir(gsmjsonpath)
        jsonfnlist = list(filter(rjson.match, json_flist))
    # second dot token of the json fname is the GSM id
    jsongsmlist = [x.split('.')[1] for x in jsonfnlist]
    msrapoutfnlist = os.listdir(gsmmsrapoutpath)
    msrapoutfnlist = list(filter(rmsrapout.match, msrapoutfnlist))
    print("Found " + str(len(msrapoutfnlist)) + " files with pattern " +
          msrapoutfnpattern + ". Continuing...")
    msrapgsmlist = [x.split('.')[2] for x in msrapoutfnlist]
    # GSMs with json data but no MetaSRA-pipeline outfile yet
    gsmprocess = [g for g in jsongsmlist
                  if not g in msrapgsmlist and g[0:3] == 'GSM']
    for index, gsmid in enumerate(gsmprocess):
        gjsonfpath = getlatest_filepath(filepath=gsmjsonpath, filestr=gsmid,
                                        embeddedpattern=True, tslocindex=0,
                                        returntype='returnlist')
        if not gjsonfpath:
            # Fix: the original iterated a None result here (TypeError);
            # skip samples whose json file cannot be located.
            print("No json file found for " + gsmid + ". Continuing...")
            continue
        # use the first (latest) match whether one or several are returned
        gjsonfn = os.path.basename(gjsonfpath[0])
        fl.append(gjsonfn)
        numi = 100 * (index / len(gsmprocess))
        perci = str(round(numi, 2))
        print("Appended file " + gjsonfn + " to files list to process. " +
              "Progress: " + str(index) + "/" + str(len(gsmprocess)) + "=" +
              perci + "%. Continuing...")
    # form list of fn lists based on nscreensi and indices/slices
    if fl:
        print("Forming list of fn lists for screen deployment...")
        ll = []
        rangelist = [i for i in range(0, len(fl), nprocsamp)]
        for enum, i in enumerate(rangelist[:-1]):
            ll.append(fl[i:rangelist[enum + 1]])
        # trailing partial chunk
        if len(fl[rangelist[-1]::]) > 0:
            ll.append(fl[rangelist[-1]::])
    else:
        print("Error, no files list object to process. Returning...")
        return None
    print('screens ll list, len = ' + str(len(ll)))
    print('nmax screens = ' + str(nmaxproc))
    return ll
print(list(newfilesd.keys())) for gse_softfn in list(newfilesd.keys()): gsmfilelist = list(filter(rxgsmfile.match, newfilesd[gse_softfn])) if gsmfilelist and len(gsmfilelist)>0: print(str(gsmfilelist)) for gsmfile in gsmfilelist: gsm_oldfile_path = "" gsm_newfile_path = "" gsm_softfn = gsmfile gsmstr = gsm_softfn.split(".")[1] print("gsmfile: "+str(gsmfile)) print("gsmstr : "+gsmstr) gsm_newfile_path = os.path.join(temp_dir_make, gsm_softfn) gsm_oldfile_path = getlatest_filepath( filepath=gsmsoft_destpath, filestr=gsmstr, embeddedpattern=True, tslocindex=0 ) print("gsm_oldfile_path : "+str(gsm_oldfile_path)) print("gsm_newfile_path : "+str(gsm_newfile_path)) if os.path.exists(gsm_newfile_path): if gsm_oldfile_path: if filecmp.cmp(gsm_oldfile_path, gsm_newfile_path): print("Identical GSM soft file detected, removing...") os.remove(gsm_newfile_path) newfilesd[gsmfile] = False else: print("New GSM soft file detected, moving from temp...") shutil.move(gsm_newfile_path, os.path.join( gsmsoft_destpath, os.path.basename(gsm_newfile_path)) )