Example 1
def firsttime_run(filedir='recount-methylation-files',
                  run_timestamp=gettime_ntp()):
    """ firsttime_run

        On first setup, run new edirect queries and filter the query results.
    
        Arguments:
        * filedir (str): Dir name for db files. 
        * run_timestamp (str) : NTP timestamp or function to retrieve it.
    
        Returns:
        * gseidlist (list): List of valid GSE IDs.
    """
    print("Beginning first time server run...")
    equery_dest = settings.equerypath
    temppath = settings.temppath
    gse_query()
    gsm_query()
    gseqfile = getlatest_filepath(equery_dest, 'gse_edirectquery')
    gsmqfile = getlatest_filepath(equery_dest, 'gsm_edirectquery')
    gsequery_filter()
    gsefiltpath = getlatest_filepath(equery_dest, 'gsequery_filt')
    if gsefiltpath:
        gsefiltd = querydict(querypath=gsefiltpath, splitdelim=' ')
        gseidlist = list(gsefiltd.keys())
        print("GSE id list of len " + str(len(gseidlist)) +
              " found. Returning...")
        return gseidlist
    else:
        print("Error retrieving gse query filtered file. Returning...")
        return None
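A minimal usage sketch (illustrative; assumes the server module defining firsttime_run and its settings is importable):

# hypothetical first-time setup call, using the default files directory
gseidlist = firsttime_run()
if gseidlist:
    print("Found " + str(len(gseidlist)) + " valid GSE IDs to queue.")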
Example 2
def scheduled_run(eqfilt_path=False, run_timestamp=gettime_ntp()):
    """ scheduled_run

        Tasks performed on regular schedule, after first setup. For the job 
        queue, a list of GSE IDs is returned. The id list is filtered on 
        existing GSE soft files to prioritize unrepresented experiments for 
        download. 

        Arguments:
        * eqfilt_path (str) : Filepath to edirect query filter file.
        * run_timestamp (str) : NTP timestamp or function to retrieve it.
        
        Returns:
        * gse_list (list) : List of valid GSE IDs, or None if an error occurs.
    """
    try:
        gsefiltd = get_queryfilt_dict()
    except Exception:
        print("No gse query filt file found, checking for GSE and GSM " +
              "queries...")
        eqpath = settings.equerypath
        gsequery_latest = getlatest_filepath(filepath=eqpath,
                                             filestr='gse_edirectquery')
        if not gsequery_latest:
            gse_query()
        gsmquery_latest = getlatest_filepath(eqpath, 'gsm_edirectquery')
        if not gsmquery_latest:
            gsm_query()
        print("Running filter on GSE query...")
        gsequery_filter()
        gsefiltd = get_queryfilt_dict()
    # get list of GSE IDs from existing SOFT files
    gsesoftfiles = os.listdir(settings.gsesoftpath)
    print("GSE SOFT files: " + str(gsesoftfiles))
    rxgse = re.compile('GSE[0-9]*')
    gseid_softexists = [
        str(rxgse.findall(softfn)[0]) for softfn in gsesoftfiles
        if rxgse.findall(softfn)
    ]
    if gsefiltd:
        gseid_listall = list(gsefiltd.keys())
        print("GSE ID list of len " + str(len(gseid_listall)) +
              " found. Filtering..")
        if gseid_softexists:
            gseid_filt = [
                gseid for gseid in gseid_listall
                if gseid not in gseid_softexists
            ]
        else:
            gseid_filt = gseid_listall
        print("After filtering existing SOFT files, N = " +
              str(len(gseid_filt)) + " GSE IDs remain. Returning ID list...")
        # if all GSE IDs are already represented by local SOFT files, return
        # the full ID list for a brand new run
        if len(gseid_filt) == 0:
            gseid_filt = gseid_listall
        return gseid_filt
    else:
        print("Error forming equery filt dictionary. Returning...")
        return None
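A usage sketch for a recurring job (illustrative; only scheduled_run itself is defined above):

# hypothetical scheduled call; prioritizes GSE IDs lacking local SOFT files
gse_list = scheduled_run()
if gse_list:
    print("Queueing " + str(len(gse_list)) + " GSE IDs for download.")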
Example 3
def gsequery_filter(splitdelim='\t', timestamp=gettime_ntp()):
    """ gsequery_filter
        
        Prepare an edirect query file. Filter a GSE query file on its GSM 
            membership. 
        
        Arguments:
            * splitdelim (str) : Delimiter to split ids in querydict() call.
            * timestamp (str) : NTP timestamp or function to retrieve it.
        
        Returns:
            * gsequeryfiltered (list): Filtered GSE query object (list), writes
                filtered query file as side effect.
    """
    eqpath = settings.equerypath
    gsequerystr = settings.gsequerystr
    gsmquerystr = settings.gsmquerystr
    # get GSM list from gsm query file
    gsmqueryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsmquerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if gsmqueryf_latestpath:
        print("Latest gsmquery file detected: " + str(gsmqueryf_latestpath))
    else:
        print("Error detecting latest gsmquery file! Returning...")
        return
    gsmlines = [line.rstrip('\n') for line in open(gsmqueryf_latestpath[0])]
    gsmlist = [line.split('\t')[1] for line in gsmlines]
    # get GSE dictionary object
    gsequeryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsequerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if gsequeryf_latestpath:
        print("Latest gsequery file detected: " + str(gsequeryf_latestpath))
    else:
        print("Error detecting latest gsequery file! Returning...")
        return
    gsed_obj = querydict(querypath=gsequeryf_latestpath[0],
                         splitdelim=splitdelim)
    gsefiltl = []
    for gsekey in list(gsed_obj.keys()):
        samplelist_original = gsed_obj[gsekey]
        samplelist_filt = [
            sample for sample in samplelist_original if sample in gsmlist
        ]
        if samplelist_filt:
            gsefiltl.append(' '.join([gsekey, ' '.join(samplelist_filt)]))
    print("Writing filtered query file...")
    if eqpath:
        filtfn = ".".join(["gsequery_filt", timestamp])
        with open(os.path.join(eqpath, filtfn), 'w') as filtfile:
            for item in gsefiltl:
                filtfile.write("%s\n" % item)
    return gsefiltl
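A usage sketch (illustrative); the returned list mirrors the written gsequery_filt file, one 'GSE GSM GSM ...' record per entry:

# hypothetical call; writes gsequery_filt.<timestamp> under settings.equerypath
gsefiltl = gsequery_filter(splitdelim='\t')
if gsefiltl:
    print("Retained " + str(len(gsefiltl)) + " GSE records after GSM filtering.")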
Example 4
def eqd_gsm_exclude(equery_dest=settings.equerypath, filesdir=settings.filesdir,
    gsmv_fname="gsmv.txt", exclude_dpath=os.path.join("inst", "freeze_gsmv")):
    """ eqd_gsm_exclude

        Exclude GSM IDs from edirect query objects.

        Arguments:
        * equery_dest: Path to the directory containing edirect query files.
        * filesdir: Root name of the files directory.
        * gsmv_fname: Name of the file to load. Should include only 
            space-separated sample/GSM IDs in a single line.
        * exclude_dpath: Path to directory containing the file gsmv_fname.

        Returns:
        * Returns the path to the new filtered file at settings.equerypath.

    """
    gsmv_fpath = os.path.join(exclude_dpath, gsmv_fname)
    if not os.path.exists(gsmv_fpath):
        print("Couldn't find sample ID file at " + gsmv_fpath + ". Returning...")
        return None
    gsmv_exclude = [line.rstrip('\n').split(" ") 
                        for line in open(gsmv_fpath)][0]
    # gsmv_exclude = [i for sublist in gsmv_exclude for i in sublist]
    eqpath = settings.equerypath
    gsefilt_latest = getlatest_filepath(eqpath,'gsequery_filt', 
            embeddedpattern=True, tslocindex=1, returntype='returnlist'
        )[0]
    print("Starting with latest detected filter file: "+gsefilt_latest)
    querylines = [line.rstrip('\n') for line in open(gsefilt_latest)]
    qlnew = []
    print("Applying filter...")
    numgsm_old = len(querylines)
    for line in querylines:
        line = line.split(" ")
        ldat = [gid for gid in line if gid not in gsmv_exclude]
        numgsm_new = len(ldat)
        if len(ldat) > 1:
            qlnew.append(ldat)
    print("After filter, retained " + str(len(qlnew)) + " studies.")
    nts = gettime_ntp()
    newfpath = os.path.join(eqpath, ".".join(["gsequery_filt",nts]))
    print("Writing new filter file: ", newfpath)
    with open(newfpath, "w") as wf:
        for line in qlnew:
            wf.write(" ".join(line) + "\n")
    return newfpath
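A usage sketch (illustrative; assumes inst/freeze_gsmv/gsmv.txt exists and holds a single line of space-separated GSM IDs):

# hypothetical call to drop frozen GSM IDs from the latest filter file
newfpath = eqd_gsm_exclude(gsmv_fname="gsmv.txt")
if newfpath:
    print("New filter file written to: " + newfpath)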
Example 5
def compile_rsheet(eqfiltd=get_queryfilt_dict(),
                   sheetfn_ext='rsheet',
                   msrapfn_ext='msrapout',
                   msrapfn='msrapout',
                   idatsfn_ext='idat',
                   timestamp=gettime_ntp()):
    """ compile_rsheet

        Knits poised file data together into a sheet to be read into R using 
        minfi. Steps taken include: 
            1. Grab msrap file list
            2. Grab idats file list
            3. Intersect files lists
            4. Subset eqfilt dict on gse
            5. Form and write new sheet files, one per gse
        
        Arguments:
        * eqfiltd (function or dictionary) : Equery filter dictionary object.
        * sheetfn_ext (str) : Filename extension for new sheet files.
        * msrapfn_ext (str) : Filename extension of valid MetaSRA-pipeline
            datafiles.
        * msrapfn (str) : Filename stem for MetaSRA-pipeline files.
        * idatsfn_ext (str) : Filename extension of valid idat files.
        * timestamp (str) : NTP timestamp for file versioning.
        
        Returns:
        * lsheet (list) : The sheet rows written, or 0 if no valid GSM IDs
            were detected; writes sheet files as a side effect.
    """
    # form the sheet path and make dir as needed
    sheetspath = settings.sheetspath
    os.makedirs(sheetspath, exist_ok=True)
    sheets_fpath = os.path.join(sheetspath, ".".join([timestamp, sheetfn_ext]))
    # form msrap and idat paths and get filenames
    msrap_path = settings.gsmmsrapoutpath
    rxmsrap = re.compile(".*" + msrapfn_ext + "$")
    msrap_fnlist = list(filter(rxmsrap.match, os.listdir(msrap_path)))
    print("msrap_fnlist : " + str(msrap_fnlist))
    # idats fn
    idats_path = settings.idatspath
    rxidat = re.compile(".*" + idatsfn_ext + "$")
    idats_fnlist = list(filter(rxidat.match, os.listdir(idats_path)))
    # extract gsm ids
    rxgsm = re.compile(".*GSM[0-9]")
    idats_splitlist = [
        idatfn.split(".")[0] for idatfn in idats_fnlist
        if len(idatfn.split(".")) > 1
    ]
    idats_gsmlist_filt = list(set(filter(rxgsm.match,
                                         idats_splitlist)))  # unique gsm ids
    msrap_splitlist = [
        msrapfn.split(".")[1] for msrapfn in msrap_fnlist
        if len(msrapfn.split(".")) > 1
    ]
    msrap_gsmlist_filt = list(set(filter(rxgsm.match,
                                         msrap_splitlist)))  # unique gsm ids
    print("idats_gsmlist_filt : " + str(idats_gsmlist_filt))
    print("msrap_gsmlist_filt : " + str(msrap_gsmlist_filt))
    gsmvalid = [
        gsmid for gsmid in msrap_gsmlist_filt if gsmid in idats_gsmlist_filt
    ]
    if len(gsmvalid) > 0:
        rxgrn = re.compile(".*Grn.idat$")
        rxred = re.compile(".*Red.idat$")
        lsheet = []  # list object to write rsheet, one row per gsmid
        # append colnames
        lsheet.append(" ".join([
            "gsmid", "gseid", "idats_fn", "msrapmd_fn", "msrapmd_flatjson",
            "SENTRIX_ID", "ARRAY_ID", "Basename"
        ]))
        lsheet[0] = lsheet[0] + "\n"
        for gsmid in gsmvalid:
            # compile the file info for this gsm
            rxgsmi = re.compile(".*" + gsmid + ".*")
            gsmi_idats = list(filter(rxgsmi.match, idats_fnlist))
            gsmi_red_idats = list(filter(rxred.match, gsmi_idats))
            gsmi_grn_idats = list(filter(rxgrn.match, gsmi_idats))
            # get the latest file versions
            gsmi_red_pattern = gsmi_red_idats[0].split(".")[2]
            gsmi_grn_pattern = gsmi_grn_idats[0].split(".")[2]
            gsmi_red_latest = getlatest_filepath(filepath=idats_path,
                                                 filestr=gsmi_red_pattern,
                                                 embeddedpattern=True)
            gsmi_grn_latest = getlatest_filepath(filepath=idats_path,
                                                 filestr=gsmi_grn_pattern,
                                                 embeddedpattern=True)
            # get the latest msrap file
            gsmi_msrap_latest = getlatest_filepath(filepath=msrap_path,
                                                   filestr=gsmid,
                                                   embeddedpattern=True)
            print(gsmi_msrap_latest)
            if gsmi_red_latest and gsmi_grn_latest and gsmi_msrap_latest:
                # form the rsheets with valid gsm ids
                with open(gsmi_msrap_latest, 'r') as msrapmd:
                    gsmi_metadata_dict = json.load(msrapmd)
                gsmi_md = gsmi_metadata_dict[0]  # msrap output is a 1-item list
                grows = []
                for key in list(gsmi_md.keys()):
                    kval = gsmi_md[key]
                    if type(kval) is list:
                        grows.append(";".join(kval))
                    else:
                        grows.append(":".join([str(key), str(gsmi_md[key])]))
                gsmi_mdvar = "'" + ";".join(grows) + "'"
                # grab the gse id for this gsm
                gseid = str([
                    gsek for gsek in list(eqfiltd.keys())
                    if gsmid in eqfiltd[gsek]
                ][0])
                # make the gsm arrays path Basename for minfi
                gsmi_bn = "_".join(gsmi_red_latest.split("_")[0:3])
                # one entry per gsm
                lgsmi = " ".join([
                    gsmid,  # gsm id
                    gseid,  # gse id
                    ";".join([
                        os.path.basename(gsmi_red_latest),
                        os.path.basename(gsmi_grn_latest)
                    ]),  # idat filenames
                    os.path.basename(gsmi_msrap_latest),  # metadata filename
                    gsmi_mdvar,  # flattened json file
                    os.path.basename(gsmi_red_latest).split(
                        "_")[-2],  # sentrix id
                    os.path.basename(gsmi_red_latest).split("_")
                    [-3],  # array id
                    gsmi_bn  # minfi path Basename, for arrays
                ])
                lgsmi = lgsmi + "\n"
                lsheet.append(lgsmi)
    else:
        print(
            "No valid GSM IDs detected. Check idats and MetaSRA-pipeline GSM "
            + "files directories.")
        return 0
    # write the final sheet files
    with open(sheets_fpath, 'w') as fsheet:
        for item in lsheet:
            fsheet.write(item)

    return lsheet
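A usage sketch (illustrative; sheet layout follows the function above, header row first):

# hypothetical call; writes one rsheet under settings.sheetspath
lsheet = compile_rsheet(timestamp=gettime_ntp())
if lsheet:
    print("Wrote rsheet with " + str(len(lsheet) - 1) + " sample rows.")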
Example 6
def rmdb_fpaths_old(rmhlinks=False):
    """ rmdb_fpaths
        Get filepaths for existant sample idats and msrap outfiles.
        Arguments:
        * rmhlinks : Whether to remove old hardlinks and form new ones, 
                regardless of whether current hlinks exist (boolean).
        Returns:
        * gsmdocdict (dict.) : Dictionary of validated filepaths.
    """
    timestamp = gettime_ntp()
    # connect to RMDB mongodb
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dbcon = client.recount_methylation
    idatscon = dbcon.gsm.idats
    softcon = dbcon.gse.soft
    idatslist = list(idatscon.find())
    # grab unique gsm ids
    idatslist = [record for record in idatslist if 'gsmid' in record.keys()]
    gsmindex = list(set([record['gsmid'] for record in idatslist]))
    print("from idats db, found n = " + str(len(gsmindex)) + " gsm ids")
    # fname catch patterns for re
    grnidatcatch = settings.grnidat_expcatch
    redidatcatch = settings.redidat_expcatch
    msrapoutcatch = settings.msrapoutfnpattern
    # filter all records for gsm on most recent update datetime
    gsm_fpaths_dd = {}
    # list all previously expanded idat files directy from idats dir
    allidatslist = os.listdir(settings.idatspath)
    allidatslist = list(filter(re.compile(r'.*\.idat$').match, allidatslist))
    print("found n = "+str((len(allidatslist)))+" expanded idat filenames...")
    # grab and filter idats and msrap outfiles lists
    if rmhlinks:
        print("Beginning sample iterations with hlink removal.")
    else:
        print("Beginning sample iterations without hlink removal.")
    for gi, gsmid in enumerate(gsmindex, 1):
        print("Getting fpaths for gsm: "+str(gsmid)+", num: "+str(gi), end="\r")
        gsm_fpaths_dd[gsmid] = []
        # all idat records for the GSM id
        recordsgsm = [record for record in idatslist if record['gsmid']==gsmid]
        # filter records by channel type,
        # note most records are for compressed files
        idatsrec_gsmgrn = [record for record in recordsgsm
            if isinstance(record['date'], datetime.datetime)
            and re.search(r'.*Grn\.idat.*', os.path.basename(record['filepath']))
        ]
        idatsrec_gsmred = [record for record in recordsgsm
            if isinstance(record['date'], datetime.datetime)
            and re.search(r'.*Red\.idat.*', os.path.basename(record['filepath']))
        ]
        if idatsrec_gsmgrn and idatsrec_gsmred:
            # get latest records for each channel
            irec_filtgrn = sorted(idatsrec_gsmgrn, key=lambda k: k['date'])[-1]
            irec_filtred = sorted(idatsrec_gsmred, key=lambda k: k['date'])[-1]
            # valid record file basenames
            igrnrec_bn = os.path.basename(irec_filtgrn['filepath'])
            iredrec_bn = os.path.basename(irec_filtred['filepath'])
            # check for expanded versions of compressed files
            igrn_fn = [fn for fn in allidatslist 
                if igrnrec_bn[:-3] in fn
            ]
            ired_fn = [fn for fn in allidatslist 
                if iredrec_bn[:-3] in fn
            ]
            if igrn_fn and ired_fn:
                igrn_fn = igrn_fn[0]
                ired_fn = ired_fn[0]
                hllist = []
                if rmhlinks:
                    # remove old hard links to sample idats
                    grnhl_torm = [fn for fn in allidatslist
                        if "hlink" in fn 
                        and '.'.join(igrn_fn.split('.')[2:]) in fn
                    ]
                    redhl_torm = [fn for fn in allidatslist
                        if "hlink" in fn 
                        and '.'.join(ired_fn.split('.')[2:]) in fn
                    ]
                    if grnhl_torm:
                        for hlfn in grnhl_torm:
                            os.remove(os.path.join(settings.idatspath, hlfn))
                    if redhl_torm:
                        for hlfn in redhl_torm:
                            os.remove(os.path.join(settings.idatspath, hlfn))
                    # new hlinks
                    hllist = new_idat_hlinks(gsmid, ts=timestamp, 
                            igrn_fn=igrn_fn, ired_fn=ired_fn
                        )
                else:
                    # check if hlinks exist, create new ones otherwise
                    grnhllist = [fn for fn in allidatslist
                        if "hlink" in fn 
                        and '.'.join(igrn_fn.split('.')[2:]) in fn
                    ]
                    redhllist = [fn for fn in allidatslist
                        if "hlink" in fn 
                        and '.'.join(ired_fn.split('.')[2:]) in fn
                    ]
                    # get matching grn and red hlink fn's if they exist
                    status_hlink = None
                    grnfnpass = None
                    redfnpass = None
                    if grnhllist and redhllist:
                        grnhllistfilt = list(set(grnhllist))
                        redhllistfilt = []
                        for ghl in grnhllistfilt:
                            for rhl in redhllist:
                                # check that base array ids identical
                                if ghl[:-9]==rhl[:-9]:
                                    redhllistfilt.append(rhl)
                                else:
                                    redhllistfilt.append("")
                        rhlfiltsub = [rhl[:-9] for rhl in redhllistfilt]
                        grnhllistfilt = [ghl for ghl in grnhllistfilt 
                            if ghl[:-9] in rhlfiltsub]
                        redhllistfilt = [rhl for rhl in redhllistfilt
                            if not rhl==""]
                        if grnhllistfilt and redhllistfilt:
                            grnfnpass = grnhllistfilt[0]
                            redfnpass = redhllistfilt[0]
                            # pass hlinks to return dictionary
                            hllist.append(os.path.join(settings.idatspath, grnfnpass))
                            hllist.append(os.path.join(settings.idatspath, redfnpass))
                        else:
                            # make new hlinks
                            hllist = new_idat_hlinks(gsmid, ts=timestamp, 
                                igrn_fn=igrn_fn, ired_fn=ired_fn)
                    else:
                        # make new hlinks
                        hllist = new_idat_hlinks(gsmid, ts=timestamp, 
                            igrn_fn=igrn_fn, ired_fn=ired_fn)
                # finally, pass listed hlinks to return dictionary
                gsm_fpaths_dd[gsmid].append(hllist[0])  
                gsm_fpaths_dd[gsmid].append(hllist[1])    
            else:
                gsm_fpaths_dd[gsmid].append(None)
                gsm_fpaths_dd[gsmid].append(None)
        else:
            gsm_fpaths_dd[gsmid].append(False)
        # check for valid MetaSRA-pipeline filepaths
        try:
            msraplatest = getlatest_filepath(filepath=settings.gsmmsrapoutpath,
                filestr=gsmid, embeddedpattern=True, tslocindex=0, 
                returntype='returnlist'
            )
            if msraplatest and len(msraplatest)==1:
                gsm_fpaths_dd[gsmid].append(msraplatest[0])
        except Exception:
            gsm_fpaths_dd[gsmid].append(False)
        print("Finished with sample num "+str(gi), end="\r")
    print("Finished sample iterations. Returning...")
    # return gsmid dictionary with lists of filtered results or valid fpaths
    return gsm_fpaths_dd
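A usage sketch (illustrative; assumes a running RMDB mongodb at settings.rmdbhost; the 3-entry check reflects two idat hlinks plus one msrap path):

# hypothetical call mapping GSM ids to validated filepaths
gsm_fpaths_dd = rmdb_fpaths_old(rmhlinks=False)
ncomplete = len([g for g in gsm_fpaths_dd if len(gsm_fpaths_dd[g]) == 3])
print("Samples with both idat channels and metadata: " + str(ncomplete))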
Example 7
def scan_gsmstatdict(usersheet=True,
                     maxbn=40000,
                     gsmstatdictpath=settings.gsmstatpicklepath):
    """ scan_gsmstatdict
        
        Make a new GSM status dictionary, or update an existing dictionary with
        latest sample data from compilations files.
        
        Arguments:
        * usersheet (Bool.): Whether to load sample basenames from latest 
            detected rsheet. If 'False', detect basenames de novo with 
            "getbn()".
        * maxbn (int): Max basenames allowed when forming new status dictionary.
        * gsmstatdictpath (path/str): Path from which to read status dictionary.
        
        Returns: 
        * None; creates or updates the saved status dictionary as a side effect.
    """
    if not os.path.exists(gsmstatdictpath):
        basenames = []
        if usersheet:
            rslatest = getlatest_filepath(filepath=settings.sheetspath,
                                          filestr="rsheet",
                                          embeddedpattern=True,
                                          tslocindex=0,
                                          returntype='returnlist')
            if rslatest:
                rslpath = rslatest[0]
                print("Detected latest rsheet. Reading sample ids...")
                with open(rslpath, "r") as rso:
                    for linect, line in enumerate(rso, 1):
                        if line[0:3] == 'GSM':
                            basenames.append(
                                line.split(' ')[7].replace('\n', ''))
                        print("Finished reading line num " + str(linect),
                              end="\r")
                print("Finished reading rsheet. Continuing...")
        else:
            # form the new status dictionary
            print("Getting basenames with 'getbn()'...")
            basenames = getbn(maxbn=maxbn)
        if not basenames:
            print("Error obtaining basenames. Returning...")
            return None
        else:
            print("Finished retrieving n = " + str(len(basenames)) +
                  " basenames. Forming dictionary...")
            gsmstatdict = {bn: [] for bn in basenames}
            with open(gsmstatdictpath, "wb") as pickle_out:
                pickle.dump(gsmstatdict, pickle_out)
    # check path for existing file
    if os.path.exists(gsmstatdictpath):
        print("Detected sample status dictionary. Updating...")
        tasktype = "update dictionary"
        gsmstatdict = pickle.load(open(gsmstatdictpath, "rb"))
        cflist = os.listdir(settings.compilationspath)
        cflist = [cfn for cfn in cflist if 'compilation' in cfn]
        for cfn in cflist:
            print("Starting on cfn " + str(cfn))
            cftype = cfn.split('.')[1]
            print("Detected compilation type " + str(cftype))
            cfnpath = os.path.join(settings.compilationspath, cfn)
            with open(cfnpath, "r") as opencfn:
                for li, line in enumerate(opencfn, 1):
                    if line.split(' ')[0][0:4] == '"GSM':
                        gsmfname = line.split(' ')[0].replace('"', '')
                        if gsmfname in gsmstatdict:
                            if cftype not in gsmstatdict[gsmfname]:
                                gsmstatdict[gsmfname].append(cftype)
                        else:
                            gsmstatdict[gsmfname] = [cftype]
                        print("Finished reading line num " + str(li), end="\r")
            print("Finished reading lines from cfn. " +
                  "Saving updated sample status dictionary.")
            with open(gsmstatdictpath, "wb") as pickle_out:
                pickle.dump(gsmstatdict, pickle_out)
            print("Finished saving updated dictionary. Continuing...")
    else:
        print(
            "Error, could not detect gsm status dictionary at settings path. "
            + "Returning...")
    return None
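A usage sketch (illustrative; the pickle path comes from settings, as in the signature above):

# hypothetical call; builds or refreshes the sample status dictionary
scan_gsmstatdict(usersheet=True)
with open(settings.gsmstatpicklepath, "rb") as pickle_in:
    gsmstatdict = pickle.load(pickle_in)
print("Tracking " + str(len(gsmstatdict)) + " sample basenames.")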
Example 8
def dl_idat(input_list,
            retries_connection=3,
            retries_files=3,
            interval_con=.1,
            interval_file=.01,
            validate=True,
            timestamp=gettime_ntp()):
    """ dl_idat
        
        Download idats, reading in either list of GSM IDs or ftp addresses.
        
        Arguments:
            * input_list (list, required) : A list of valid GSM IDs.
            * retries_connection (int) : Number of ftp connection retries 
                allowed.
            * retries_files (int) : Number of retry attempts allowed for sample 
                file downloads.
            * interval_con (float) : Time (in seconds) to sleep before retrying 
                a database connection.
            * interval_file (float) : Time (in seconds) to sleep before retrying 
                a file connection. 
            * validate (Bool.): Validate new files against existing idats?
            * timestamp (str) : An NTP timestamp for versioning.
        
        Returns:
            * dldict (dictionary) : Records, dates, and exit statuses of ftp 
                calls, OR error string over connection issues. Downloads and 
                moves new and validated files as side effect. 
    """
    idatspath = settings.idatspath
    temppath = settings.temppath
    os.makedirs(idatspath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    item = input_list[0]
    if not item.startswith('GSM'):
        raise RuntimeError("GSM IDs must begin with \"GSM\".")
    # ncbi geo ftp host, inferred from the ftp address tokens used below
    ftptoken_login = 'ftp.ncbi.nlm.nih.gov'
    retries_left_connection = retries_connection
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            loginstat = ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            if retries_left_connection:
                retries_left_connection -= 1
                print('continuing with connection retries left = ' +
                      str(retries_left_connection))
                time.sleep(interval_con)
                continue
            else:
                print('connection retries exhausted, returning...')
                return str(e)
    # mongodb connection
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    files_written = []
    for gsm_id in input_list:
        print('Starting GSM: ' + gsm_id)
        dldict[gsm_id] = []
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'samples', gsm_id[:-3] + 'nnn',
            gsm_id, 'suppl'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        filenames = []
        retries_left_files = retries_files
        try:
            filenames = ftp.nlst(id_ftpadd)
            if len(filenames) > 0:
                filestr = '; '.join(str(e) for e in filenames)
                print("files found: " + filestr)
                dldict[gsm_id].append([
                    gsm_id, id_ftpadd,
                    "connection success, valid num idats found"
                ])
                print("Idat filenames detected for " + gsm_id +
                      ", continuing...")
                for file in filenames:
                    print("Beginning iteration for file: " + file)
                    filedate = ""
                    filedate_estat = ""
                    filedl_estat = ""
                    file_tokens = file.split('/')
                    try:
                        filedate = ftp.sendcmd("MDTM /" +
                                               '/'.join(file_tokens))
                        filedate = datetime.datetime.strptime(
                            filedate[4:], "%Y%m%d%H%M%S")
                        mongo_date = idat_mongo_date(gsm_id, file, client)
                        if filedate in mongo_date:
                            filedate_estat = "same_as_local_date"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            print('Online date same as local date. Breaking..')
                            break
                        else:
                            filedate_estat = "new_date"
                            to_write = os.path.join(
                                temp_dir_make, '.'.join(
                                    [gsm_id,
                                     str(timestamp), file_tokens[-1]]))
                            file_ftpadd = '/'.join(file_tokens[:-1])
                            file_ftpadd = file_ftpadd + '/' + file_tokens[-1]
                            print('Attempting file download, for file: ' +
                                  file)
                            try:
                                with open(to_write, 'wb') as output_stream:
                                    filedl_estat = ftp.retrbinary(
                                        "RETR /" + file_ftpadd,
                                        output_stream.write)
                                dldict[gsm_id].append([
                                    gsm_id, file_ftpadd, to_write,
                                    filedl_estat, filedate, filedate_estat
                                ])
                                if '226 Transfer complete' in filedl_estat:
                                    files_written.append(
                                        (gsm_id, to_write,
                                         len(dldict[gsm_id]) - 1))
                                print("File successfully downloaded. " +
                                      "Continuing...")
                                continue
                            except ftplib.all_errors as efiledl:
                                if retries_left_files:
                                    retries_left_files -= 1
                                    print(
                                        'ftp file dl error, retries left = ' +
                                        str(retries_left_files))
                                    time.sleep(interval_file)
                                    continue
                                else:
                                    print(
                                        'File retries exhausted. Breaking...')
                                    filedl_estat = str(efiledl)
                                    dldict[gsm_id].append([
                                        gsm_id, file_ftpadd, to_write,
                                        filedl_estat, filedate, filedate_estat
                                    ])
                                    break
                    except ftplib.all_errors as efiledate:
                        if retries_left_files:
                            retries_left_files -= 1
                            print('ftplib file date error, retries left = ' +
                                  str(retries_left_files))
                            time.sleep(interval_file)
                            continue
                        else:
                            print('File retries exhausted. Breaking...')
                            filedate_estat = str(efiledate)
                            filedate = "not_available"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            break
            else:
                dldict[gsm_id].append([gsm_id, "no files at ftp address"])
                continue
        except ftplib.error_temp as eid:
            if retries_left_files:
                retries_left_files -= 1
                print('ftplib filenames error, retries left = ' +
                      str(retries_left_files))
                time.sleep(interval_file)
                continue
            else:
                print('File retries exhausted. Breaking...')
                dldict[gsm_id].append([gsm_id, id_ftpadd, str(eid)])
                break
    if validate:
        print("Validating downloaded files...")
        for gsm_id, file_written, index in files_written:
            print("file written is " + file_written)
            filestr = '.'.join(os.path.basename(file_written).split('.')[2:])
            print('filestr written : ' + filestr)
            print('dir to search latest: ' + idatspath)
            gsmidat_latest = getlatest_filepath(idatspath,
                                                filestr,
                                                embeddedpattern=True,
                                                returntype='returnlist',
                                                tslocindex=1)
            print('gsm latest: ' + str(gsmidat_latest))
            if gsmidat_latest:
                gsmidat_latest = gsmidat_latest[0]
                print('cmp result: ' +
                      str(filecmp.cmp(gsmidat_latest, file_written)))
                if filecmp.cmp(gsmidat_latest, file_written):
                    print(
                        "Downloaded file is same as recent file. Removing...")
                    os.remove(file_written)
                    # flag the download as identical to the existing local file
                    dldict[gsm_id][index].append(False)
                else:
                    print("Downloaded file is new, moving to idatspath...")
                    shutil.move(
                        file_written,
                        os.path.join(idatspath,
                                     os.path.basename(file_written)))
                    dldict[gsm_id][index].append(True)
                    dldict[gsm_id][index][2] = os.path.join(
                        idatspath, os.path.basename(file_written))
            else:
                print("Downloaded file is new, moving...")
                shutil.move(
                    file_written,
                    os.path.join(idatspath, os.path.basename(file_written)))
                dldict[gsm_id][index].append(True)
                dldict[gsm_id][index][2] = os.path.join(
                    idatspath, os.path.basename(file_written))
        shutil.rmtree(temp_dir_make)
    return dldict
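A usage sketch (illustrative; the GSM ID shown is hypothetical):

# download and validate idats for a small batch of GSM ids
dldict = dl_idat(input_list=['GSM1505330'], validate=True)
for gsm_id in dldict:
    print(gsm_id + " : " + str(len(dldict[gsm_id])) + " ftp call records")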
Example 9
def dl_soft(gse_list=[],
            retries_connection=3,
            retries_files=3,
            interval_con=.1,
            interval_file=.01,
            validate=True,
            timestamp=gettime_ntp()):
    """ dl_soft
        
        Download GSE soft file(s) for a list of valid GSE IDs.
        
        Arguments:
            * gse_list (list, required) : A list of valid GSE id(s).
            * retries_connection (int) : Number of ftp connection retries 
                allowed. 
            * retries_files (int) : Number of retry attempts allowed for sample 
                file downloads. 
            * interval_con (float) : Time (in seconds) to sleep before retrying 
                a database connection. 
            * interval_file (float) : Time (in seconds) to sleep before retrying 
                a file connection. 
            * validate (Bool.): Validate new files against existing soft files?
            * timestamp (str) : An NTP timestamp for versioning.     
        
        Returns: 
            * Dictionary showing records, dates, and exit statuses of ftp calls
                OR error string over connection issues
    """
    gsesoftpath = settings.gsesoftpath
    temppath = settings.temppath
    os.makedirs(gsesoftpath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    item = gse_list[0]
    if not item.startswith('GSE'):
        raise RuntimeError("GSE IDs must begin with \"GSE\".")
    # ncbi geo ftp host, inferred from the ftp address tokens used below
    ftptoken_login = 'ftp.ncbi.nlm.nih.gov'
    retries_left_connection = retries_connection
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            loginstat = ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            if retries_left_connection:
                retries_left_connection -= 1
                print('continuing with connection retries left = ' +
                      str(retries_left_connection))
                time.sleep(interval_con)
                continue
            else:
                print('connection retries exhausted, returning...')
                return str(e)
    # mongodb connection
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    print('beginning iterations over gse list...')
    for gse in gse_list:
        print('beginning download for gse: ' + gse)
        retries_left_files = retries_files
        dldict[gse] = []
        files_written = []
        filenames = []
        # tokens for soft file ftp address
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'series', gse[:-3] + 'nnn', gse,
            'soft'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        while retries_left_files:
            try:
                filenames = ftp.nlst(id_ftpadd)
                # filter for only soft file names
                file = list(filter(lambda x: 'family.soft' in x, filenames))[0]
                dldict[gse].append([gse, id_ftpadd, "success"])
                filedate = ""
                filedate_estat = ""
                filedl_estat = ""
                file_tokens = file.split('/')
                try:
                    print('getting date from ' + '/'.join(file_tokens))
                    filedate = ftp.sendcmd("MDTM /" + '/'.join(file_tokens))
                    filedate = datetime.datetime.strptime(
                        filedate[4:], "%Y%m%d%H%M%S")
                    mongo_date = soft_mongo_date(gse, file, client)
                    if filedate in mongo_date:
                        print('online date same as local date, breaking...')
                        filedate_estat = "same_as_local_date"
                        dldict[gse].append(
                            [gse, file, filedate, filedate_estat])
                        break
                    else:
                        print('new online date found, continuing...')
                        filedate_estat = "new_date"
                        to_write = os.path.join(
                            temp_dir_make,
                            '.'.join([gse, timestamp, file_tokens[-1]]))
                        file_ftpadd = '/'.join(file_tokens[:-1])
                        file_ftpadd = file_ftpadd + '/' + file_tokens[-1]
                        try:
                            print('downloading soft from ' + file_ftpadd)
                            with open(to_write, 'wb') as output_stream:
                                filedl_estat = ftp.retrbinary(
                                    "RETR /" + file_ftpadd,
                                    output_stream.write)
                            dldict[gse].append([
                                gse, file_ftpadd, to_write, filedl_estat,
                                filedate, filedate_estat
                            ])
                            if '226 Transfer complete' in filedl_estat:
                                files_written.append(
                                    (gse, to_write, len(dldict[gse]) - 1))
                            print('total files written = ' +
                                  str(len(files_written)))
                            print('soft transfer successful for ' + to_write +
                                  ', breaking...')
                            break
                        except ftplib.all_errors as efiledl:
                            print('file download error from ' + file_ftpadd)
                            if retries_left_files:
                                retries_left_files -= 1
                                print('continuing with file retries left =' +
                                      str(retries_left_files))
                                time.sleep(interval_file)
                                continue
                            else:
                                print('file retries exhausted, breaking..')
                                filedl_estat = str(efiledl)
                                dldict[gse].append([
                                    gse, file_ftpadd, to_write, filedl_estat,
                                    filedate, filedate_estat
                                ])
                                break
                except ftplib.all_errors as efiledate:
                    print('error getting date from ' + '/'.join(file_tokens))
                    if retries_left_files:
                        retries_left_files -= 1
                        print('continuing with file retries left = ' +
                              str(retries_left_files))
                        time.sleep(interval_file)
                        continue
                    else:
                        print('file retries exhausted, breaking..')
                        filedate_estat = str(efiledate)
                        filedate = "not_available"
                        dldict[gse].append(
                            [gse, file, filedate, filedate_estat])
                        break
            except ftplib.error_temp as eid:
                print('error making ftp connection to ' + id_ftpadd)
                if retries_left_files:
                    retries_left_files -= 1
                    print('ftplib error encountered, file retries left = ' +
                          str(retries_left_files))
                    time.sleep(interval_file)
                    continue
                else:
                    print('file retries exhausted, breaking..')
                    dldict[gse].append([gse, id_ftpadd, str(eid)])
                    break
    if validate:
        print('commencing file validation...')
        for gse, new_filepath, index in files_written:
            filestr = os.path.basename(new_filepath).split('.')[0]
            gsesoft_latest = getlatest_filepath(gsesoftpath, filestr)
            if gsesoft_latest:
                if filecmp.cmp(gsesoft_latest, new_filepath):
                    print('identical file found in dest_dir, removing...')
                    dldict[gse].append(False)
                    os.remove(new_filepath)
                else:
                    print('new file detected in temp_dir, moving to ' +
                          'dest_dir...')
                    dldict[gse].append(True)
                    dldict[gse][index][2] = os.path.join(
                        gsesoftpath, os.path.basename(new_filepath))
                    shutil.move(
                        new_filepath,
                        os.path.join(gsesoftpath, os.path.basename(new_filepath)))
            else:
                print('new file detected in temp_dir, moving to dest_dir..')
                dldict[gse].append(True)
                dldict[gse][index][2] = os.path.join(
                    gsesoftpath, os.path.basename(new_filepath))
                shutil.move(
                    new_filepath,
                    os.path.join(gsesoftpath, os.path.basename(new_filepath)))
        shutil.rmtree(temp_dir_make)
    return dldict
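A usage sketch (illustrative; the GSE ID shown is hypothetical):

# download and validate the family soft file for one GSE id
dldict = dl_soft(gse_list=['GSE68777'], validate=True)
print(dldict)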
Example 10
def msrap_getsamples(json_flist=[], fnpatt=".*json.filt$", 
    gsmjsonpath=os.path.join("recount-methylation-files", "gsm_json_filt"), 
    nprocsamp=50, nmaxproc=20):
    """ msrap_getsamples
        
        Get the validated samples file list.

        Arguments:
            * json_flist (list) : List of GSM JSON filenames to process. If not 
                provided, function automatically detects any new GSM JSON files
                without available MetaSRA-pipeline outfiles.
            * fnpatt (str): Filename pattern of valid json files to identify.
            * gsmjsonpath (path): Path to JSON formatted sample SOFT data.
            * nprocsamp (int) : Number of samples to process per screen deployed.
            * nmaxproc (int) : Maximum processes to launch.
        Returns:
            * ll (list) : List of JSON filename sublists, one per process to
                deploy, or None if no valid files were found.

    """
    print("Checking dirs for msrapout and msrap logs...")
    os.makedirs(settings.gsmmsrapoutpath, exist_ok=True)
    os.makedirs(settings.msraplogspath, exist_ok=True)
    # detect gsm soft files
    psoftpath = settings.psoftscriptpath
    if os.path.exists(psoftpath):
        print("Process soft script found at: "+str(psoftpath))
    gsmsoftpath = settings.gsmsoftpath
    gsmmsrapoutpath = settings.gsmmsrapoutpath
    jsonfnpattern = fnpatt
    rjson = re.compile(jsonfnpattern)
    msrapoutfnpattern = settings.msrapoutfnpattern
    rmsrapout = re.compile(msrapoutfnpattern)
    # generate fl list of valid json files that haven't been processed yet
    fl = []
    if json_flist:
        jsonfnlist = list(filter(rjson.match, json_flist))
    else:
        json_flist = os.listdir(gsmjsonpath)
        jsonfnlist = list(filter(rjson.match, json_flist))
    jsongsmlist = [x.split('.')[1] for x in jsonfnlist]
    msrapoutfnlist = os.listdir(gsmmsrapoutpath) 
    msrapoutfnlist = list(filter(rmsrapout.match, msrapoutfnlist))
    print("Found "+str(len(msrapoutfnlist))+" files with pattern "
        +msrapoutfnpattern+". Continuing...")
    msrapgsmlist = [x.split('.')[2] for x in msrapoutfnlist]
    gsmprocess = [g for g in jsongsmlist
                  if g not in msrapgsmlist and g[0:3] == 'GSM']
    for index, gsmid in enumerate(gsmprocess):
        gjsonfpath = getlatest_filepath(filepath=gsmjsonpath,
                filestr=gsmid, embeddedpattern=True, tslocindex=0,
                returntype='returnlist'
            )
        if not gjsonfpath:
            # skip GSM ids with no detected JSON file
            continue
        gjsonfn = os.path.basename(gjsonfpath[0])
        fl.append(gjsonfn)
        numi = 100*(index/len(gsmprocess))
        perci = str(round(numi,2))
        print("Appended file "+gjsonfn+" to files list to process. "
            +"Progress: "+str(index)+"/"+str(len(gsmprocess))+"="
            +perci+"%. Continuing...")
    # form list of fn lists based on nprocsamp and slice indices
    if fl:
        print("Forming list of fn lists for screen deployment...")
        ll = []
        rangelist = [i for i in range(0, len(fl), nprocsamp)]
        for enum, i in enumerate(rangelist[:-1]):
            ll.append(fl[i:rangelist[enum+1]])
        if len(fl[rangelist[-1]::]) > 0:
            ll.append(fl[rangelist[-1]::])
    else:
        print("Error, no files list object to process. Returning...")
        return None
    print('screens ll list, len = ' + str(len(ll)))
    print('nmax screens = '+str(nmaxproc))
    return ll
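A usage sketch (illustrative):

# hypothetical call; returns sublists of new GSM JSON files, 50 per process
ll = msrap_getsamples(nprocsamp=50, nmaxproc=20)
if ll:
    print("Prepared " + str(len(ll)) + " filename sublists for deployment.")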
Example 11
print(list(newfilesd.keys()))
for gse_softfn in list(newfilesd.keys()):
    gsmfilelist = list(filter(rxgsmfile.match, newfilesd[gse_softfn]))
    if gsmfilelist:
        print(str(gsmfilelist))
        for gsmfile in gsmfilelist:
            gsm_oldfile_path = ""
            gsm_newfile_path = ""
            gsm_softfn = gsmfile
            gsmstr = gsm_softfn.split(".")[1]
            print("gsmfile: "+str(gsmfile))
            print("gsmstr : "+gsmstr)
            gsm_newfile_path = os.path.join(temp_dir_make, gsm_softfn)
            gsm_oldfile_path = getlatest_filepath(
                    filepath=gsmsoft_destpath, filestr=gsmstr, 
                    embeddedpattern=True, tslocindex=0
                )
            print("gsm_oldfile_path : "+str(gsm_oldfile_path))
            print("gsm_newfile_path : "+str(gsm_newfile_path))
            if os.path.exists(gsm_newfile_path):
                if gsm_oldfile_path:
                    if filecmp.cmp(gsm_oldfile_path, gsm_newfile_path):
                        print("Identical GSM soft file detected, removing...")
                        os.remove(gsm_newfile_path)
                        newfilesd[gsmfile] = False
                    else:
                        print("New GSM soft file detected, moving from temp...")
                        shutil.move(gsm_newfile_path, os.path.join(
                                gsmsoft_destpath, 
                                os.path.basename(gsm_newfile_path))
                            )
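                else:
                    # assumed completion: no prior GSM soft file was detected,
                    # so move the new file out of temp (mirrors the validation
                    # branches in dl_idat and dl_soft above)
                    print("No prior GSM soft file found, moving from temp...")
                    shutil.move(gsm_newfile_path, os.path.join(
                            gsmsoft_destpath,
                            os.path.basename(gsm_newfile_path))
                        )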