def rename_file(job_id, fn_id, newname):
    """Only renames file inside same path/server. Does not move cross
    directories. This job checks if there is a RawFile entry with the same
    name in the same folder to avoid possible renaming collisions.
    Updates RawFile in the job instead of the view since jobs are processed
    in a single queue. Since it only expects raw files, it will also rename
    all attached converted mzML files.
    newname should NOT contain the file extension, only the name.
    FIXME: make impossible to overwrite using move jobs at all (also moving etc)
    """
    fn = models.StoredFile.objects.select_related('rawfile', 'servershare').get(
        pk=fn_id)
    # FIXME task checks if src == dst, but we could do that already here?
    fn_ext = os.path.splitext(fn.filename)[1]
    if models.StoredFile.objects.exclude(pk=fn_id).filter(
            rawfile__name=newname + fn_ext, path=fn.path,
            servershare_id=fn.servershare_id).exists():
        raise RuntimeError('A file in path {} with name {} already exists or '
                           'will soon be created. Please choose another '
                           'name'.format(fn.path, newname))
    fn.rawfile.name = newname + fn_ext
    fn.rawfile.save()
    for changefn in fn.rawfile.storedfile_set.all():
        oldname, ext = os.path.splitext(changefn.filename)
        special_type = ('_refined' if changefn.filetype_id ==
                        settings.REFINEDMZML_SFGROUP_ID else '')
        changed_name = '{}{}{}'.format(newname, special_type, ext)
        tid = dstasks.move_file_storage.delay(
            changefn.filename, changefn.servershare.name, changefn.path,
            changefn.path, changefn.id, newname=changed_name).id
        create_db_task(tid, job_id, changefn.filename,
                       changefn.servershare.name, changefn.path,
                       changefn.path, changefn.id, newname=changed_name)
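# Illustrative sketch (not part of the job API): how the rename targets above
# are built. Extensions are kept from the existing filenames, and refined mzML
# files get a '_refined' marker between the new name and the extension. The
# _build_renamed helper below is hypothetical and only demonstrates the string
# handling; it assumes the module-level `os` import used by the job code.
def _build_renamed(existing_fname, newname, refined=False):
    """Return e.g. 'sample1_refined.mzML' for ('old.mzML', 'sample1', True)."""
    _, ext = os.path.splitext(existing_fname)
    special_type = '_refined' if refined else ''
    return '{}{}{}'.format(newname, special_type, ext)


# _build_renamed('qc_run.raw', 'sample1')         -> 'sample1.raw'
# _build_renamed('qc_run.mzML', 'sample1', True)  -> 'sample1_refined.mzML'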
def move_files_dataset_storage(job_id, dset_id, dst_path, rawfn_ids, *sf_ids):
    print('Moving dataset files to storage')
    new_sf_ids = StoredFile.objects.filter(
        rawfile__datasetrawfile__dataset_id=dset_id,
        rawfile__source_md5=F('md5'),
        rawfile_id__in=rawfn_ids)
    if new_sf_ids.count() != len(sf_ids):
        print('Original job submission had {} stored files, but now there are '
              '{} stored files'.format(len(sf_ids), new_sf_ids.count()))
    dset_files = StoredFile.objects.filter(pk__in=new_sf_ids, checked=True)
    # What if only half of the files have arrived via SCP yet? Try more later:
    dset_registered_files = DatasetRawFile.objects.filter(
        dataset_id=dset_id, rawfile_id__in=rawfn_ids)
    if dset_files.count() != dset_registered_files.count():
        raise RuntimeError(
            'Not all files to move have been transferred or '
            'registered as transferred yet, or have non-matching MD5 sums '
            'between their registration and after transfer from input source. '
            'Holding this job, you may retry it when files have arrived')
    for fn in dset_files:
        # TODO check for diff os.path.join(servershare, dst_path), not just
        # path?
        if fn.path != dst_path:
            tid = tasks.move_file_storage.delay(
                fn.rawfile.name, fn.servershare.name, fn.path, dst_path,
                fn.id).id
            create_db_task(tid, job_id, fn.rawfile.name, fn.servershare.name,
                           fn.path, dst_path, fn.id)
def move_files_dataset_storage(job_id, dset_id, dst_path, rawfn_ids, *sf_ids):
    print('Moving dataset files to storage')
    new_sf_ids = StoredFile.objects.filter(
        rawfile__datasetrawfile__dataset_id=dset_id,
        rawfile__source_md5=F('md5'),
        rawfile_id__in=rawfn_ids)
    if new_sf_ids.count() != len(sf_ids):
        print('Original job submission had {} stored files, but now there are '
              '{} stored files'.format(len(sf_ids), new_sf_ids.count()))
    dset_files = StoredFile.objects.filter(pk__in=new_sf_ids, checked=True)
    # What if only half of the files have arrived via SCP yet? Try more later:
    dset_registered_files = DatasetRawFile.objects.filter(
        dataset_id=dset_id, rawfile_id__in=rawfn_ids)
    if dset_files.count() != dset_registered_files.count():
        raise RuntimeError(
            'Not all files to move have been transferred or '
            'registered as transferred yet, or have non-matching MD5 sums '
            'between their registration and after transfer from input source. '
            'Holding this job and temporarily retrying it')
    for fn in dset_files:
        # TODO check for diff os.path.join(servershare, dst_path), not just
        # path?
        if fn.path != dst_path:
            tid = tasks.move_file_storage.delay(
                fn.rawfile.name, fn.servershare.name, fn.path, dst_path,
                fn.id).id
            create_db_task(tid, job_id, fn.rawfile.name, fn.servershare.name,
                           fn.path, dst_path, fn.id)
def auto_run_qc_workflow(job_id, sf_id, analysis_id, wfv_id, dbfn_id):
    """Assumes one file, one analysis"""
    analysis = models.Analysis.objects.get(pk=analysis_id)
    nfwf = models.NextflowWfVersion.objects.get(pk=wfv_id)
    dbfn = models.LibraryFile.objects.get(pk=dbfn_id).sfile
    mzml = filemodels.StoredFile.objects.select_related(
        'rawfile__producer', 'servershare').get(
            rawfile__storedfile__id=sf_id, filetype='mzml')
    wf = models.Workflow.objects.filter(shortname__name='QC').last()
    params = ['--mods', 'data/labelfreemods.txt', '--instrument']
    params.append('velos' if 'elos' in mzml.rawfile.producer.name else 'qe')
    stagefiles = {'--mzml': (mzml.servershare.name, mzml.path, mzml.filename),
                  '--db': (dbfn.servershare.name, dbfn.path, dbfn.filename)}
    run = {'timestamp': datetime.strftime(analysis.date, '%Y%m%d_%H.%M'),
           'analysis_id': analysis.id,
           'rf_id': mzml.rawfile_id,
           'wf_commit': nfwf.commit,
           'nxf_wf_fn': nfwf.filename,
           'repo': nfwf.nfworkflow.repo,
           }
    views.create_nf_search_entries(analysis, wf.id, nfwf.id, job_id)
    res = tasks.run_nextflow_longitude_qc.delay(run, params, stagefiles)
    create_db_task(res.id, job_id, run, params, stagefiles)
def refine_mzmls(job_id, dset_id, analysis_id, wfv_id, dbfn_id, qtype,
                 *dset_mzmls):
    analysis = models.Analysis.objects.get(pk=analysis_id)
    nfwf = models.NextflowWfVersion.objects.get(pk=wfv_id)
    dbfn = models.LibraryFile.objects.get(pk=dbfn_id).sfile
    stagefiles = {'--tdb': (dbfn.servershare.name, dbfn.path, dbfn.filename)}
    mzmlfiles = rm.StoredFile.objects.select_related('rawfile').filter(
        pk__in=dset_mzmls, rawfile__datasetrawfile__dataset_id=dset_id)
    analysisshare = rm.ServerShare.objects.get(
        name=settings.ANALYSISSHARENAME).id
    mzmls = [(x.servershare.name, x.path, x.filename,
              get_or_create_mzmlentry(x, settings.REFINEDMZML_SFGROUP_ID,
                                      analysisshare).id, analysisshare)
             for x in mzmlfiles]
    allinstr = [x['rawfile__producer__name'] for x in
                mzmlfiles.distinct('rawfile__producer').values(
                    'rawfile__producer__name')]
    if len(allinstr) > 1:
        raise RuntimeError('Trying to run a refiner job on a dataset '
                           'containing more than one instrument is not '
                           'possible')
    params = ['--instrument']
    params.append('velos' if 'elos' in allinstr[0] else 'qe')
    if qtype != 'labelfree':
        params.extend(['--isobaric', qtype])
    run = {'timestamp': datetime.strftime(analysis.date, '%Y%m%d_%H.%M'),
           'analysis_id': analysis.id,
           'wf_commit': nfwf.commit,
           'nxf_wf_fn': nfwf.filename,
           'repo': nfwf.nfworkflow.repo,
           'name': analysis.name,
           'outdir': analysis.user.username,
           }
    res = tasks.refine_mzmls.delay(run, params, mzmls, stagefiles)
    analysis.log = json.dumps(['[{}] Job queued'.format(
        datetime.strftime(timezone.now(), '%Y-%m-%d %H:%M:%S'))])
    analysis.save()
    create_db_task(res.id, job_id, run, params, mzmls, stagefiles)
def move_single_file(job_id, fn_id, dst_path, newname=False):
    fn = models.StoredFile.objects.select_related('rawfile', 'servershare').get(
        pk=fn_id)
    tid = dstasks.move_file_storage.delay(fn.rawfile.name, fn.servershare.name,
                                          fn.path, dst_path, fn.id, newname).id
    create_db_task(tid, job_id, fn.rawfile.name, fn.servershare.name, fn.path,
                   dst_path, fn.id, newname)
def auto_run_qc_workflow(job_id, sf_id, analysis_id, wfv_id, dbfn_id):
    """Assumes one file, one analysis"""
    analysis = models.Analysis.objects.get(pk=analysis_id)
    nfwf = models.NextflowWfVersion.objects.get(pk=wfv_id)
    dbfn = models.LibraryFile.objects.get(pk=dbfn_id).sfile
    mzml = rm.StoredFile.objects.select_related(
        'rawfile__producer', 'servershare', 'filetype').get(
            rawfile__storedfile__id=sf_id, filetype__filetype='mzml')
    wf = models.Workflow.objects.filter(shortname__name='QC').last()
    params = ['--mods', 'data/labelfreemods.txt', '--instrument']
    params.append('velos' if 'elos' in mzml.rawfile.producer.name else 'qe')
    stagefiles = {'--mzml': (mzml.servershare.name, mzml.path, mzml.filename),
                  '--db': (dbfn.servershare.name, dbfn.path, dbfn.filename)}
    run = {'timestamp': datetime.strftime(analysis.date, '%Y%m%d_%H.%M'),
           'analysis_id': analysis.id,
           'rf_id': mzml.rawfile_id,
           'wf_commit': nfwf.commit,
           'nxf_wf_fn': nfwf.filename,
           'repo': nfwf.nfworkflow.repo,
           'name': 'longqc',
           'outdir': 'internal_results',
           }
    create_nf_search_entries(analysis, wf.id, nfwf.id, job_id)
    res = tasks.run_nextflow_longitude_qc.delay(run, params, stagefiles)
    analysis.log = json.dumps(['[{}] Job queued'.format(
        datetime.strftime(timezone.now(), '%Y-%m-%d %H:%M:%S'))])
    analysis.save()
    create_db_task(res.id, job_id, run, params, stagefiles)
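# Illustrative note on the instrument choice above: the producer name is only
# checked for the substring 'elos' (matching e.g. a Velos instrument name);
# everything else is treated as a Q Exactive ('qe'). A minimal sketch of that
# heuristic, with hypothetical producer names:
def _guess_instrument(producer_name):
    return 'velos' if 'elos' in producer_name else 'qe'


# _guess_instrument('LTQ Velos 1')   -> 'velos'
# _guess_instrument('QE Classic 2')  -> 'qe'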
def move_single_file(job_id, fn_id, dst_path, oldname=False, dstshare=False,
                     newname=False):
    fn = models.StoredFile.objects.select_related('rawfile', 'servershare').get(
        pk=fn_id)
    oldname = fn.filename if not oldname else oldname
    tid = dstasks.move_file_storage.delay(oldname, fn.servershare.name,
                                          fn.path, dst_path, fn.id,
                                          dstshare=dstshare,
                                          newname=newname).id
    create_db_task(tid, job_id, oldname, fn.servershare.name, fn.path,
                   dst_path, fn.id, dstshare=dstshare, newname=newname)
def purge_analysis(job_id, analysis_id, *sf_ids):
    """Queues tasks for deleting an analysis' files from disk, then queues a
    job for directory removal"""
    for fn in rm.StoredFile.objects.filter(pk__in=sf_ids):
        fullpath = os.path.join(fn.path, fn.filename)
        print('Purging {} from analysis {}'.format(fullpath, analysis_id))
        tid = filetasks.delete_file.delay(fn.servershare.name, fullpath,
                                          fn.id).id
        create_db_task(tid, job_id, fn.servershare.name, fullpath, fn.id)
def get_md5(job_id, sf_id):
    print('Running MD5 job')
    sfile = models.StoredFile.objects.filter(pk=sf_id).select_related(
        'servershare', 'rawfile').get()
    fnpath = os.path.join(sfile.path, sfile.filename)
    res = tasks.get_md5.delay(sfile.rawfile.source_md5, sfile.id, fnpath,
                              sfile.servershare.name)
    create_db_task(res.id, job_id, sfile.rawfile.source_md5, sfile.id, fnpath,
                   sfile.servershare.name)
    print('MD5 task queued')
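# The hashing itself happens in tasks.get_md5 on the storage worker and its
# implementation is not shown here. As a sketch only, a chunked hashlib digest
# of a file typically looks like the hypothetical helper below (not used by
# the job code):
import hashlib


def _calc_md5(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at `path`, read in chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()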
def delete_empty_directory(job_id, analysis_id, *dependent_sfids):
    """Check first if all the sfids are set to purged, indicating the dir is
    actually empty. Then queue a task. The sfids also make this job dependent
    on other jobs on those files, such as the file-purging tasks that run
    before this directory deletion."""
    sfiles = models.StoredFile.objects.filter(pk__in=dependent_sfids)
    if sfiles.count() == sfiles.filter(purged=True).count():
        fn = sfiles.select_related('servershare').last()
        tid = tasks.delete_empty_dir.delay(fn.servershare.name, fn.path).id
        create_db_task(tid, job_id, fn.servershare.name, fn.path)
    else:
        raise RuntimeError('Cannot delete dir: according to the DB, there are '
                           'still stored files in the directory which have '
                           'not been purged yet')
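# Design note for the purged check above: comparing two counts issues two
# queries. An equivalent single-query formulation (assuming `purged` is a
# plain boolean field) would be:
#
#     if not sfiles.filter(purged=False).exists():
#         ...
#
# The count comparison is kept in the job to stay close to the original code.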
def remove_files_from_dataset_storagepath(job_id, dset_id, fn_ids, *sf_ids):
    print('Moving files with ids {} from dataset storage to tmp, '
          'if not already there. Deleting if mzML'.format(fn_ids))
    for fn in StoredFile.objects.select_related('filetype').filter(
            pk__in=sf_ids).exclude(servershare__name=settings.TMPSHARENAME):
        if fn.filetype.filetype == 'mzml':
            fullpath = os.path.join(fn.path, fn.filename)
            tid = filetasks.delete_file.delay(fn.servershare.name, fullpath,
                                              fn.id).id
            create_db_task(tid, job_id, fn.servershare.name, fullpath, fn.id)
        else:
            tid = tasks.move_stored_file_tmp.delay(fn.filename, fn.path,
                                                   fn.id).id
            create_db_task(tid, job_id, fn.filename, fn.path, fn.id)
def remove_files_from_dataset_storagepath(job_id, dset_id, fn_ids, *sf_ids):
    print('Moving files with ids {} from dataset storage to tmp, '
          'if not already there. Deleting if mzML'.format(fn_ids))
    for fn in StoredFile.objects.filter(pk__in=sf_ids).exclude(
            servershare__name=settings.TMPSHARENAME):
        if fn.filetype == 'mzml':
            fullpath = os.path.join(fn.path, fn.filename)
            tid = filetasks.delete_file.delay(fn.servershare.name, fullpath,
                                              fn.id).id
            create_db_task(tid, job_id, fn.servershare.name, fullpath, fn.id)
        else:
            tid = tasks.move_stored_file_tmp.delay(fn.rawfile.name, fn.path,
                                                   fn.id).id
            create_db_task(tid, job_id, fn.rawfile.name, fn.path, fn.id)
def create_pdc_archive(job_id, sf_id, md5):
    print('Running PDC archive job')
    sfile = models.StoredFile.objects.filter(pk=sf_id).select_related(
        'servershare').get()
    yearmonth = datetime.strftime(sfile.regdate, '%Y%m')
    # Only create the entry when it does not already exist; no side effects
    try:
        models.PDCBackedupFile.objects.get(storedfile=sfile)
    except models.PDCBackedupFile.DoesNotExist:
        models.PDCBackedupFile.objects.create(storedfile=sfile, pdcpath='',
                                              success=False)
    fnpath = os.path.join(sfile.path, sfile.filename)
    res = tasks.pdc_archive.delay(md5, yearmonth, sfile.servershare.name,
                                  fnpath, sfile.id)
    create_db_task(res.id, job_id, md5, yearmonth, sfile.servershare.name,
                   fnpath, sfile.id)
    print('PDC archival task queued')
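# The try/except above is a manual get-or-create. Django's built-in
# get_or_create would express the same intent in one call (sketch only; the
# explicit form is kept in the job to avoid changing behaviour):
#
#     models.PDCBackedupFile.objects.get_or_create(
#         storedfile=sfile, defaults={'pdcpath': '', 'success': False})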
def create_swestore_backup(job_id, sf_id, md5):
    print('Running swestore backup job')
    sfile = models.StoredFile.objects.filter(pk=sf_id).select_related(
        'servershare').get()
    # Only create the entry when it does not already exist; no side effects
    try:
        models.SwestoreBackedupFile.objects.get(storedfile=sfile)
    except models.SwestoreBackedupFile.DoesNotExist:
        models.SwestoreBackedupFile.objects.create(storedfile=sfile,
                                                   swestore_path='',
                                                   success=False)
    fnpath = os.path.join(sfile.path, sfile.filename)
    res = tasks.swestore_upload.delay(md5, sfile.servershare.name, fnpath,
                                      sfile.id)
    create_db_task(res.id, job_id, md5, sfile.servershare.name, fnpath,
                   sfile.id)
    print('Swestore task queued')
def download_px_project(job_id, dset_id, pxacc, rawfnids, sharename, *sf_ids):
    """Gets sf_ids of non-checked, non-downloaded PX files, checks PRIDE, and
    fires tasks for files not yet downloaded.
    """
    px_stored = {x.filename: x for x in models.StoredFile.objects.filter(
        pk__in=sf_ids, checked=False).select_related('rawfile')}
    t_ids = []
    for fn in call_proteomexchange(pxacc):
        ftpurl = urlsplit(fn['downloadLink'])
        filename = os.path.split(ftpurl.path)[1]
        if (filename in px_stored and
                fn['fileSize'] == px_stored[filename].rawfile.size):
            pxsf = px_stored[filename]
            t_ids.append(tasks.download_px_file_raw.delay(
                ftpurl.path, ftpurl.netloc, pxsf.id, pxsf.rawfile_id,
                fn['fileSize'], sharename, dset_id).id)
            create_db_task(t_ids[-1], job_id, ftpurl.path, ftpurl.netloc,
                           pxsf.id, pxsf.rawfile_id, fn['fileSize'],
                           sharename, dset_id)
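# Sketch of how the ProteomeXchange download link is decomposed above
# (hypothetical URL, shown for illustration only):
#
#     from urllib.parse import urlsplit
#     parts = urlsplit('ftp://ftp.example.org/2019/01/PXD000000/file01.raw')
#     parts.netloc                  # 'ftp.example.org'
#     parts.path                    # '/2019/01/PXD000000/file01.raw'
#     os.path.split(parts.path)[1]  # 'file01.raw'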
def run_nextflow(job_id, dset_ids, platenames, fractions, setnames,
                 analysis_id, wf_id, wfv_id, inputs, *dset_mzmls):
    """
    inputs is {'params': ['--isobaric', 'tmt10plex'],
               'singlefiles': {'--tdb': tdb_sf_id, ...},}
    or should inputs be DB things with fields flag,sf_id (how for mzmls
    though?)

    {'params': ['--isobaric', 'tmt10plex', '--instrument', 'qe', '--nfcore',
                '--hirief', 'SAMPLETABLE',
                "126::set1::treat1::treat::::127::set1::treat2::treat..."],
     'mzml': ('--mzmls', '{sdir}/*.mzML'),
     'singlefiles': {'--tdb': 42659, '--dbsnp': 42665, '--genome': 42666,
                     '--snpfa': 42662, '--cosmic': 42663, '--ddb': 42664,
                     '--blastdb': 42661, '--knownproteins': 42408,
                     '--gtf': 42658, '--mods': 42667}}
    """
    analysis = models.Analysis.objects.select_related('user').get(
        pk=analysis_id)
    nfwf = models.NextflowWfVersion.objects.select_related('nfworkflow').get(
        pk=wfv_id)
    stagefiles = {}
    for flag, sf_id in inputs['singlefiles'].items():
        sf = rm.StoredFile.objects.get(pk=sf_id)
        stagefiles[flag] = (sf.servershare.name, sf.path, sf.filename)
    mzmls = [(x.servershare.name, x.path, x.filename, setnames[str(x.id)],
              platenames[str(x.rawfile.datasetrawfile.dataset_id)],
              fractions.get(str(x.id), False))
             for x in rm.StoredFile.objects.filter(pk__in=dset_mzmls)]
    run = {'timestamp': datetime.strftime(analysis.date, '%Y%m%d_%H.%M'),
           'analysis_id': analysis.id,
           'wf_commit': nfwf.commit,
           'nxf_wf_fn': nfwf.filename,
           'repo': nfwf.nfworkflow.repo,
           'name': analysis.name,
           'outdir': analysis.user.username,
           }
    profiles = ['standard']
    if '--nfcore' in inputs['params']:
        inputs['params'] = [x for x in inputs['params'] if x != '--nfcore']
        profiles.extend(['docker', 'lehtio'])
        inputs['params'].extend(['--name', 'RUNNAME__PLACEHOLDER'])
    else:
        inputs['params'].extend(['--searchname', 'RUNNAME__PLACEHOLDER'])
    if 'sampletable' in inputs:
        inputs['params'].extend(['SAMPLETABLE', inputs['sampletable']])
    res = tasks.run_nextflow_workflow.delay(run, inputs['params'], mzmls,
                                            stagefiles, ','.join(profiles))
    analysis.log = json.dumps(['[{}] Job queued'.format(
        datetime.strftime(timezone.now(), '%Y-%m-%d %H:%M:%S'))])
    analysis.save()
    create_db_task(res.id, job_id, run, inputs['params'], mzmls, stagefiles)
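# Worked example of the profile/params handling above, assuming a submission
# whose params contain the '--nfcore' marker (values are hypothetical):
#
#     inputs['params'] = ['--isobaric', 'tmt10plex', '--nfcore']
#     # after the branch:
#     #   profiles          -> ['standard', 'docker', 'lehtio']
#     #   inputs['params']  -> ['--isobaric', 'tmt10plex',
#     #                         '--name', 'RUNNAME__PLACEHOLDER']
#     #   profile argument  -> ','.join(profiles) == 'standard,docker,lehtio'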
def run_nextflow(job_id, dset_ids, platenames, fractions, setnames,
                 analysis_id, wf_id, wfv_id, inputs, *dset_mzmls):
    """
    inputs is {'params': ['--isobaric', 'tmt10plex'],
               'singlefiles': {'--tdb': tdb_sf_id, ...},}
    or should inputs be DB things with fields flag,sf_id (how for mzmls
    though?)

    {'params': ['--isobaric', 'tmt10plex', '--instrument', 'qe',
                '-profile', 'slurm'],
     'mzml': ('--mzmls', '{sdir}/*.mzML'),
     'singlefiles': {'--tdb': 42659, '--dbsnp': 42665, '--genome': 42666,
                     '--snpfa': 42662, '--cosmic': 42663, '--ddb': 42664,
                     '--blastdb': 42661, '--knownproteins': 42408,
                     '--gtf': 42658, '--mods': 42667}}
    """
    analysis = models.Analysis.objects.select_related('user').get(
        pk=analysis_id)
    nfwf = models.NextflowWfVersion.objects.select_related('nfworkflow').get(
        pk=wfv_id)
    stagefiles = {}
    for flag, sf_id in inputs['singlefiles'].items():
        sf = filemodels.StoredFile.objects.get(pk=sf_id)
        stagefiles[flag] = (sf.servershare.name, sf.path, sf.filename)
    mzmls = [(x.servershare.name, x.path, x.filename, setnames[str(x.id)],
              platenames[str(x.rawfile.datasetrawfile.dataset_id)],
              fractions.get(str(x.id), False))
             for x in filemodels.StoredFile.objects.filter(pk__in=dset_mzmls)]
    run = {'timestamp': datetime.strftime(analysis.date, '%Y%m%d_%H.%M'),
           'analysis_id': analysis.id,
           'wf_commit': nfwf.commit,
           'nxf_wf_fn': nfwf.filename,
           'repo': nfwf.nfworkflow.repo,
           'name': analysis.name,
           'outdir': analysis.user.username,
           }
    res = tasks.run_nextflow_workflow.delay(run, inputs['params'], mzmls,
                                            stagefiles)
    analysis.log = json.dumps(['[{}] Job queued'.format(
        datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S'))])
    analysis.save()
    create_db_task(res.id, job_id, run, inputs['params'], mzmls, stagefiles)
def move_dataset_storage_loc(job_id, dset_id, src_path, dst_path, *sf_ids):
    # Within a server share
    print('Renaming dataset storage location job')
    t = tasks.rename_storage_location.delay(src_path, dst_path, sf_ids)
    create_db_task(t.id, job_id, src_path, dst_path, sf_ids)