def _check_and_copy_remote_file(
      remote_user, remote_host, source_file, remote_file):
  '''
  An internal static method for copying files to a remote path

  :param remote_user: Username for the remote server
  :param remote_host: Hostname for the remote server
  :param source_file: Source filepath
  :param remote_file: Remote filepath
  '''
  try:
    if not os.path.exists(source_file):
      raise IOError('Source file {0} not found for copy'.\
                    format(source_file))
    os.chmod(source_file, mode=0o754)  # change source file permission before copy
    remote_address = \
      '{0}@{1}'.format(remote_user, remote_host)
    copy_remote_file(\
      source_path=source_file,
      destinationa_path=remote_file,
      destination_address=remote_address)  # create dir and copy file to remote
  except:
    raise
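# copy_remote_file() is imported from the pipeline's file utilities and its
# implementation is not shown in this module. Below is a minimal,
# self-contained sketch of an rsync-based equivalent, for illustration only;
# it assumes rsync and ssh are available on PATH and is not the pipeline's
# actual implementation.
import subprocess

def _rsync_copy_sketch(source_path, destination_path, destination_address=None):
  '''
  Copy a file to a local or remote destination path using rsync over ssh
  '''
  if destination_address is not None:
    destination_path = \
      '{0}:{1}'.format(destination_address, destination_path)  # user@host:/path form for remote copy
  subprocess.check_call([
    'rsync',
    '-a',
    '-e', 'ssh',
    source_path,
    destination_path])  # raises CalledProcessError if the copy fails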
def run_sync(self):
  '''
  A method for running the sequencing run sync
  '''
  try:
    check_file_path(self.output_dir)
    all_seqrun_dir = \
      list_remote_file_or_dirs(\
        remote_server=self.seqrun_server,
        remote_path=self.seqrun_path,
        only_dirs=True)
    all_seqrun_dir = \
      list(map(os.path.basename, all_seqrun_dir))  # convert paths to dirnames
    new_seqrun_dirs = \
      check_seqrun_dir_in_db(\
        all_seqrun_dir=all_seqrun_dir,
        dbconfig=self.database_config_file)  # filter existing seqruns
    for seqrun in new_seqrun_dirs:
      try:
        new_seqruns = \
          check_seqrun_dir_in_db(\
            all_seqrun_dir=[seqrun],
            dbconfig=self.database_config_file)  # filter existing seqrun again
        if len(new_seqruns) > 0:
          copy_remote_file(\
            source_path=os.path.join(self.seqrun_path, seqrun),
            destinationa_path=self.output_dir,
            source_address=self.seqrun_server)  # sync dir if it's still new
      except Exception as e:
        raise ValueError('Failed to sync seqrun {0}, got error {1}'.\
                         format(seqrun, e))
  except Exception as e:
    raise ValueError('Stopped syncing seqrun data, got error: {0}'.\
                     format(e))
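# list_remote_file_or_dirs() is imported from the pipeline's utilities; a
# hedged sketch of one way to list remote directories over ssh is shown
# below. The helper name and the use of find are assumptions for
# illustration, not the library's real implementation.
import subprocess

def _list_remote_dirs_sketch(remote_server, remote_path):
  '''
  List first-level directories under remote_path on remote_server via ssh
  '''
  output = \
    subprocess.check_output([
      'ssh', remote_server,
      'find', remote_path,
      '-mindepth', '1',
      '-maxdepth', '1',
      '-type', 'd'])  # one absolute path per line
  return [p for p in output.decode('utf-8').split('\n') if p.strip() != '']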
disk_path = args.disk_path
copy_to_remoter = args.copy_to_remoter
remote_server = args.remote_server
output_path = args.output_path

try:
  if copy_to_remoter and not remote_server:
    parser.print_help()
    raise ValueError(
            'Remote server address is required for copying files.')
  storage_stats = \
    get_storage_stats_in_gb(disk_path)  # calculate disk usage stats
  temp_dir = get_temp_dir()
  temp_file = os.path.join(temp_dir, 'disk_usage.json')  # get temp file path
  with open(temp_file, 'w') as j_data:
    json.dump(storage_stats, j_data, indent=4)  # write disk usage to temp json file
  if copy_to_remoter:
    copy_remote_file(
      source_path=temp_file,
      destinationa_path=output_path,
      destination_address=remote_server)  # copy json file to remote server
  else:
    shutil.copy2(temp_file, output_path)  # copy json file to local path
  remove_dir(temp_dir)  # remove temp dir
except Exception as e:
  print('Error: {0}'.format(e))
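# get_storage_stats_in_gb() comes from the pipeline's disk usage utilities;
# a minimal sketch of what it might compute is shown below, using only the
# standard library. The exact keys of the real function's output are an
# assumption here.
import shutil

def _storage_stats_in_gb_sketch(disk_path):
  '''
  Return total, used and free space for disk_path, in GB
  '''
  usage = shutil.disk_usage(disk_path)  # byte counts for the filesystem
  gb = 1024 ** 3
  return {
    'disk_path': disk_path,
    'total_gb': round(usage.total / gb, 2),
    'used_gb': round(usage.used / gb, 2),
    'free_gb': round(usage.free / gb, 2)}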
def run(self):
  try:
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    seqrun_source = self.param_required('seqrun_source')
    seqrun_server = self.param_required('seqrun_server')
    seqrun_user = self.param_required('seqrun_user')
    seqrun_local_dir = self.param_required('seqrun_local_dir')
    checksum_type = self.param_required('checksum_type')
    seqrun_file_name = self.param_required('seqrun_file_name')
    file_md5_value = self.param_required('file_md5')
    transfer_remote_file = True  # transfer file from remote server by default
    source_file_path = \
      os.path.join(\
        seqrun_source,
        seqrun_igf_id,
        seqrun_file_name)  # get new seqrun path
    dir_name = \
      os.path.dirname(seqrun_file_name)  # returns dir name or an empty string
    destination_dir = \
      os.path.join(\
        seqrun_local_dir,
        seqrun_igf_id,
        dir_name)  # get file copy dir
    destination_path = \
      os.path.join(\
        destination_dir,
        os.path.basename(seqrun_file_name))  # get destination path
    if os.path.exists(destination_path) and \
       os.path.isfile(destination_path):
      existing_checksum = \
        calculate_file_checksum(\
          destination_path,
          hasher=checksum_type)  # calculate checksum of the existing file
      if existing_checksum == file_md5_value:
        transfer_remote_file = False  # skip file transfer if it's up to date
      else:
        os.remove(destination_path)  # remove existing file
    if transfer_remote_file:
      if seqrun_user is None or seqrun_server is None:
        raise ValueError('seqrun: {0}, missing required value for seqrun_user or seqrun_server'.\
                         format(seqrun_igf_id))
      source_address = \
        '{0}@{1}'.format(seqrun_user, seqrun_server)  # get host username and address
      copy_remote_file(\
        source_path=source_file_path,
        destinationa_path=destination_path,
        source_address=source_address,
        check_file=False)  # copy remote file
      if not os.path.exists(destination_path):
        raise IOError('failed to copy file {0} for seqrun {1}'.\
                      format(seqrun_file_name, seqrun_igf_id))  # check destination file after copy
      new_checksum = \
        calculate_file_checksum(\
          destination_path,
          hasher=checksum_type)  # calculate checksum of the transferred file
      if new_checksum != file_md5_value:
        raise ValueError('seqrun: {3}, checksum not matching for file {0}, expected: {1}, got {2}'.\
                         format(seqrun_file_name, file_md5_value, new_checksum, seqrun_igf_id))  # raise error if checksum doesn't match
    self.param('dataflow_params', {'seqrun_file_name': seqrun_file_name})
  except Exception as e:
    message = \
      'seqrun: {2}, Error in {0}: {1}'.\
      format(\
        self.__class__.__name__, e, seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise
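# calculate_file_checksum() is part of the pipeline's file utilities; the
# sketch below shows the standard hashlib pattern it presumably follows.
# The hasher values accepted by the real function are an assumption.
import hashlib

def _file_checksum_sketch(file_path, hasher='md5'):
  '''
  Calculate a checksum for file_path; hasher should be a hashlib algorithm name
  '''
  digest = hashlib.new(hasher)  # e.g. 'md5' or 'sha256'
  with open(file_path, 'rb') as fp:
    for chunk in iter(lambda: fp.read(1024 * 1024), b''):  # read in 1 MB chunks
      digest.update(chunk)
  return digest.hexdigest()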
def run(self):
  try:
    file = self.param_required('file')
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    remote_project_path = self.param_required('remote_project_path')
    project_name = self.param_required('project_name')
    seqrun_date = self.param_required('seqrun_date')
    flowcell_id = self.param_required('flowcell_id')
    dir_label = self.param_required('dir_label')
    sample_label = self.param('sample_label')
    tag = self.param_required('tag')
    use_ephemeral_space = self.param('use_ephemeral_space')
    analysis_label = self.param_required('analysis_label')
    force_overwrite = self.param('force_overwrite')
    if not os.path.exists(file):
      raise IOError('file {0} not found'.format(file))
    if dir_label is None:
      dir_label = \
        os.path.basename(os.path.dirname(file))  # get the lane and index length info, FIXME
    file_suffix = None
    file_name = os.path.basename(file)
    file_name_list = file_name.split('.')
    if len(file_name_list) > 1:
      (file_label, file_suffix) = \
        (file_name_list[0], file_name_list[-1])  # get file_label and suffix
    else:
      file_label = file_name_list[0]
    remote_file_name = \
      '{0}.{1}'.format(analysis_label, file_suffix)  # simplify remote filename for report page
    destination_output_path = \
      os.path.join(
        remote_project_path,
        project_name,
        seqrun_date,
        flowcell_id,
        dir_label,
        tag)  # result dir path is generic
    if sample_label is not None:
      destination_output_path = \
        os.path.join(
          destination_output_path,
          sample_label)  # add sample label only if it's present
    destination_output_path = \
      os.path.join(
        destination_output_path,
        analysis_label,
        file_label)  # add file label to the destination path
    if os.path.isfile(file):
      destination_output_path = \
        os.path.join(\
          destination_output_path,
          remote_file_name)  # add destination file name
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp work dir
    copy2(
      file,
      os.path.join(
        temp_work_dir,
        remote_file_name))  # copy file to a temp dir and rename it
    os.chmod(
      os.path.join(
        temp_work_dir,
        remote_file_name),
      mode=0o754)  # set file permission
    copy_remote_file(
      source_path=os.path.join(temp_work_dir, remote_file_name),
      destinationa_path=destination_output_path,
      destination_address='{0}@{1}'.format(remote_user, remote_host),
      force_update=force_overwrite)  # copy file to remote
    if os.path.isdir(file):
      destination_output_path = \
        os.path.join(
          destination_output_path,
          remote_file_name)  # add destination dir name
    self.param(
      'dataflow_params',
      {'file': file,
       'status': 'done',
       'remote_file': destination_output_path})  # add dataflow params
  except Exception as e:
    message = \
      'seqrun: {2}, Error in {0}: {1}'.\
      format(
        self.__class__.__name__, e, seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise
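# A worked example of the renaming logic in run() above, with illustrative
# values only: given analysis_label 'multiqc' and
# file '/path/IGF001/multiqc_report.html', file_label becomes
# 'multiqc_report', file_suffix becomes 'html', and the file is shipped to
# the remote path as remote_file_name 'multiqc.html'.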
def run(self):
  try:
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    seqrun_source = self.param_required('seqrun_source')
    seqrun_server = self.param_required('seqrun_server')
    seqrun_user = self.param_required('seqrun_user')
    igf_session_class = self.param_required('igf_session_class')
    seqrun_md5_type = self.param_required('seqrun_md5_type')
    hpc_location = self.param_required('hpc_location')
    db_file_location_label = self.param_required('db_file_location_label')
    db_file_path_label = self.param_required('db_file_path_label')
    seqrun_path = os.path.join(seqrun_source, seqrun_igf_id)  # get new seqrun path
    seqrun_server_login = \
      '{0}@{1}'.format(seqrun_user, seqrun_server)  # get remote login address
    subprocess.check_call([
      'ssh',
      seqrun_server_login,
      'ls',
      seqrun_path])  # check remote seqrun path
    ca = CollectionAdaptor(**{'session_class': igf_session_class})  # get the md5 list from db
    ca.start_session()
    files = \
      ca.get_collection_files(
        collection_name=seqrun_igf_id,
        collection_type=seqrun_md5_type)  # fetch file collection
    files = files.to_dict(orient='records')
    ca.close_session()
    if len(files) > 1:
      raise ValueError('sequencing run {0} has more than one md5 json file'.\
                       format(seqrun_igf_id))
    if len(files) == 0:
      raise ValueError('sequencing run {0} does not have any md5 json file'.\
                       format(seqrun_igf_id))
    md5_json_location = files[0][db_file_location_label]
    md5_json_path = files[0][db_file_path_label]
    temp_dir = None
    if md5_json_location != hpc_location:
      temp_dir = get_temp_dir(work_dir=os.getcwd())  # create a temp directory
      destination_path = \
        os.path.join(temp_dir, os.path.basename(md5_json_path))  # get destination path for md5 file
      copy_remote_file(
        source_path=md5_json_path,
        destinationa_path=destination_path,
        source_address=seqrun_server_login)  # copy remote file to local disk
      md5_json_path = destination_path  # set md5 json filepath
    with open(md5_json_path) as json_data:
      md5_json = json.load(json_data)  # read json data, get all files and md5 from json file
    self.param('sub_tasks', md5_json)  # seed dataflow
    if temp_dir is not None:
      remove_dir(temp_dir)  # remove temp dir once it's no longer required
    message = \
      'seqrun: {0}, seeded {1} files for copy'.\
      format(seqrun_igf_id, len(md5_json))
    self.warning(message)
    self.post_message_to_slack(message, reaction='pass')
    self.comment_asana_task(
      task_name=seqrun_igf_id,
      comment=message)
  except Exception as e:
    message = \
      'Error in {0}: {1}, seqrun: {2}'.\
      format(self.__class__.__name__, e, seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')
    self.comment_asana_task(
      task_name=seqrun_igf_id,
      comment=message)
    raise
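# The md5 json file loaded above seeds one downstream task per entry. Its
# exact layout is not visible in this module; judging by the
# seqrun_file_name and file_md5 parameters consumed by the file transfer
# runnable, a plausible (assumed) structure is a list of per-file records,
# for example:
#
#   [
#     {"seqrun_file_name": "Data/Intensities/s_1_1101.bcl", "file_md5": "d41d8..."},
#     {"seqrun_file_name": "RunInfo.xml", "file_md5": "9e107..."}
#   ]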
def run(self):
  try:
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    project_name = self.param_required('project_name')
    seqrun_date = self.param_required('seqrun_date')
    flowcell_id = self.param_required('flowcell_id')
    remote_project_path = self.param_required('remote_project_path')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    template_dir = self.param_required('template_dir')
    page_type = self.param_required('page_type')
    fastq_dir = self.param('fastq_dir')
    multiqc_remote_file = self.param('multiqc_remote_file')
    lane_index_info = self.param('lane_index_info')
    qc_template_path = self.param('qc_template_path')
    project_template = self.param('project_template')
    undetermined_template = self.param('undetermined_template')
    sample_template = self.param('sample_template')
    project_filename = self.param('project_filename')
    sample_filename = self.param('sample_filename')
    undetermined_filename = self.param('undetermined_filename')
    report_html = self.param('report_html')
    remote_ftp_base = self.param('remote_ftp_base')
    use_ephemeral_space = self.param('use_ephemeral_space')
    if page_type not in ['project', 'sample', 'undetermined']:
      raise ValueError(
              'Page type {0} is not defined yet'.format(page_type))
    qc_template_path = \
      os.path.join(template_dir, qc_template_path)
    remote_file_path = \
      os.path.join(\
        remote_project_path,
        project_name,
        seqrun_date,
        flowcell_id)
    if lane_index_info is not None:
      remote_file_path = \
        os.path.join(\
          remote_file_path,
          lane_index_info)  # generic remote path, lane info is none for project
    template_env = \
      Environment(
        loader=FileSystemLoader(searchpath=qc_template_path),
        autoescape=select_autoescape(['xml']))  # set template env
    #remote_chk_cmd = \
    #  ['ssh',
    #   '{0}@{1}'.format(remote_user, remote_host),
    #   'ls']
    #remote_rm_cmd = \
    #  ['ssh',
    #   '{0}@{1}'.format(remote_user, remote_host),
    #   'rm', '-f']
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp dir
    report_output_file = None
    qc_file_info = dict()
    qc_file_info.\
      update({
        'project_name': project_name,
        'flowcell': flowcell_id})
    if page_type == 'project':  # prepare project page
      (headerdata, qcmain) = \
        self._process_projects_data()  # get required data for project qc page
      template_file = \
        template_env.get_template(project_template)
      report_output_file = \
        os.path.join(\
          temp_work_dir,
          project_filename)
      template_file.\
        stream(\
          ProjectName=project_name,
          SeqrunDate=seqrun_date,
          FlowcellId=flowcell_id,
          headerdata=headerdata,
          qcmain=qcmain).\
        dump(report_output_file)
      os.chmod(report_output_file, mode=0o754)
      #remote_chk_cmd.append(os.path.join(remote_file_path, project_filename))
      #remote_rm_cmd.append(os.path.join(remote_file_path, project_filename))
    elif page_type == 'undetermined':  # prepare undetermined fastq page
      (headerdata, qcmain) = \
        self._process_undetermined_data(remote_file_path)  # get required data for undetermined qc page
      template_file = \
        template_env.get_template(undetermined_template)
      report_output_file = \
        os.path.join(\
          temp_work_dir,
          undetermined_filename)
      template_file.\
        stream(
          ProjectName=project_name,
          SeqrunDate=seqrun_date,
          FlowcellId=flowcell_id,
          headerdata=headerdata,
          qcmain=qcmain).\
        dump(report_output_file)
      os.chmod(report_output_file, mode=0o754)
      #remote_chk_cmd.append(os.path.join(remote_file_path, undetermined_filename))
      #remote_rm_cmd.append(os.path.join(remote_file_path, undetermined_filename))
    elif page_type == 'sample':  # prepare sample page
      if lane_index_info is None:
        raise ValueError('Missing lane and index information')
      if fastq_dir is None:
        raise ValueError('Missing required fastq_dir')
      (headerdata, qcmain) = \
        self._process_samples_data()  # get required data for sample qc page
      (lane_id, index_length) = \
        lane_index_info.split('_', 1)  # get lane and index info
      template_file = \
        template_env.get_template(sample_template)  # get template file
      report_output_file = \
        os.path.join(\
          temp_work_dir,
          sample_filename)
      template_file.\
        stream(
          ProjectName=project_name,
          SeqrunDate=seqrun_date,
          FlowcellId=flowcell_id,
          Lane=lane_id,
          IndexBarcodeLength=index_length,
          headerdata=headerdata,
          qcmain=qcmain).\
        dump(report_output_file)  # dump data to template file
      os.chmod(report_output_file, mode=0o754)
      #remote_chk_cmd.append(os.path.join(remote_file_path, sample_filename))
      #remote_rm_cmd.append(os.path.join(remote_file_path, sample_filename))
      remote_sample_qc_path = \
        os.path.join(\
          remote_file_path,
          os.path.basename(report_output_file))
      if multiqc_remote_file is None:
        raise ValueError(
                'Required a valid path for remote multiqc')
      remote_path = \
        os.path.join(\
          remote_project_path,
          project_name,
          seqrun_date,
          flowcell_id)  # get remote base path
      remote_sample_qc_path = \
        os.path.relpath(\
          remote_sample_qc_path,
          start=remote_path)  # relative path for sample qc
      multiqc_remote_file = \
        os.path.relpath(\
          multiqc_remote_file,
          start=remote_path)  # relative path for multiqc
      report_htmlname = os.path.basename(report_html)
      reports = list()
      for root, _, files in os.walk(top=fastq_dir):
        if report_htmlname in files:
          reports.\
            extend([os.path.join(os.path.abspath(root), file) \
                      for file in files \
                        if fnmatch.fnmatch(os.path.join(root, file), report_html)])  # get all html reports
      if len(reports) == 0:
        raise ValueError('No demultiplexing report found for fastq dir {0}'.\
                         format(fastq_dir))
      os.chmod(reports[0], mode=0o774)  # add read permission for report html
      copy_remote_file(
        source_path=reports[0],
        destinationa_path=remote_file_path,
        destination_address='{0}@{1}'.format(remote_user, remote_host))  # copy file to remote
      remote_report_file = \
        os.path.join(\
          remote_file_path,
          os.path.basename(reports[0]))  # get remote path for report file
      remote_report_file = \
        os.path.relpath(\
          remote_report_file,
          start=remote_path)  # get relative path for demultiplexing report
      qc_file_info = \
        {'lane_id': lane_id,
         'index_length': index_length,
         'sample_qc_page': remote_sample_qc_path,
         'multiqc_page': multiqc_remote_file,
         'demultiplexing_report': remote_report_file,
         'fastq_dir': fastq_dir,
         'project_name': project_name}
    #response = subprocess.call(remote_chk_cmd)
    #if response != 0:
    #  subprocess.check_call(remote_rm_cmd)  # remove existing remote file
    if not os.path.exists(report_output_file):
      raise IOError('file {0} not found'.format(report_output_file))
    copy_remote_file(\
      source_path=report_output_file,
      destinationa_path=remote_file_path,
      destination_address='{0}@{1}'.format(remote_user, remote_host))  # copy file to remote
    remote_qc_page = \
      os.path.join(\
        remote_file_path,
        os.path.basename(report_output_file))
    qc_file_info.\
      update({'remote_qc_page': remote_qc_page})
    self.param('dataflow_params', {'qc_file_info': qc_file_info})
    remote_url_path = \
      'http://{0}/{1}'.\
      format(remote_host,
             os.path.relpath(\
               remote_qc_page,
               start=remote_ftp_base))
    message = \
      'QC page {0}, {1}, {2}: {3}'.\
      format(
        seqrun_igf_id,
        project_name,
        page_type,
        remote_url_path)
    self.post_message_to_slack(message, reaction='pass')  # send msg to slack
    self.comment_asana_task(\
      task_name=seqrun_igf_id,
      comment=message)  # send msg to asana
  except Exception as e:
    message = \
      'seqrun: {2}, Error in {0}: {1}'.\
      format(\
        self.__class__.__name__, e, seqrun_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise
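# The three page branches above share one jinja2 pattern: build an
# Environment, load a template, then stream() the context into dump(). A
# self-contained sketch of that pattern is given below; it uses an
# in-memory DictLoader and a trivial template instead of the pipeline's
# template directory, purely for illustration.
from jinja2 import Environment, DictLoader

def _render_template_sketch(output_file):
  '''
  Render a trivial template to output_file using the stream/dump pattern
  '''
  template_env = \
    Environment(
      loader=DictLoader({'page.html': '<h1>{{ ProjectName }}</h1>'}))  # in-memory template
  template_file = template_env.get_template('page.html')
  template_file.\
    stream(ProjectName='demo_project').\
    dump(output_file)  # lazily writes rendered chunks to the output file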
                    '--source_address',
                    default=None,
                    help='Source address with user name')
parser.add_argument('-d',
                    '--dest_address',
                    default=None,
                    help='Destination address with user name')
parser.add_argument('-f',
                    '--force_update',
                    default=False,
                    action='store_true',
                    help='Force update existing file')
args = parser.parse_args()

source_path = args.source_path
dest_path = args.dest_path
source_address = args.source_address
dest_address = args.dest_address
force_update = args.force_update

if __name__ == '__main__':
  try:
    copy_remote_file(
      source_path=source_path,
      destinationa_path=dest_path,
      source_address=source_address,
      destination_address=dest_address,
      copy_method='rsync',
      check_file=True,
      force_update=force_update)
  except Exception as e:
    raise ValueError("Failed to copy remote file, error: {0}".format(e))
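# Example invocation of this script. The script name, paths and addresses
# are placeholders, and the --source_path/--dest_path flag names are
# assumptions, since those add_argument() calls are defined earlier in the
# parser and not shown here:
#
#   python copy_remote_file.py \
#     --source_path /path/to/local/file \
#     --dest_path /path/on/remote/file \
#     --dest_address user@remote.host \
#     --force_update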
def run(self):
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    file_list = self.param_required('file_list')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    remote_project_path = self.param_required('remote_project_path')
    dir_labels = self.param_required('dir_labels')
    igf_session_class = self.param_required('igf_session_class')
    force_overwrite = self.param('force_overwrite')
    collect_remote_file = self.param('collect_remote_file')
    collection_name = self.param('collection_name')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    file_location = self.param('file_location')
    use_ephemeral_space = self.param('use_ephemeral_space')
    destination_output_path = \
      os.path.join(
        remote_project_path,
        project_igf_id)  # get base destination path
    if isinstance(dir_labels, list) and \
       len(dir_labels) > 0:
      destination_output_path = \
        os.path.join(destination_output_path, *dir_labels)
    if collect_remote_file:
      if collection_name is None or \
         collection_type is None:
        raise ValueError('Name and type are required for db collection')
    output_file_list = list()
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get temp dir
    for file in file_list:
      if not os.path.exists(file):
        raise IOError('file {0} not found'.\
                      format(file))
      if os.path.isfile(file):
        copy2(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))  # copy file to a temp dir
        dest_file_path = \
          os.path.join(
            destination_output_path,
            os.path.basename(file))  # get destination file path
        os.chmod(
          os.path.join(
            temp_work_dir,
            os.path.basename(file)),
          mode=0o764)  # set file permission
      elif os.path.isdir(file):
        copytree(\
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))  # copy dir to a temp dir
        dest_file_path = destination_output_path
        for root, dirs, files in os.walk(temp_work_dir):
          for dir_name in dirs:
            os.chmod(
              os.path.join(root, dir_name),
              mode=0o775)
          for file_name in files:
            os.chmod(
              os.path.join(root, file_name),
              mode=0o764)  # change file and dir permissions for remote files
      else:
        raise ValueError('Unknown source file type: {0}'.\
                         format(file))
      #os.chmod(
      #  os.path.join(
      #    temp_work_dir,
      #    os.path.basename(file)),
      #  mode=0o754)  # set file permission
      copy_remote_file(\
        source_path=os.path.join(temp_work_dir, os.path.basename(file)),
        destinationa_path=dest_file_path,
        destination_address='{0}@{1}'.format(remote_user, remote_host),
        force_update=force_overwrite)  # copy file or dir to remote
      if os.path.isdir(file):
        dest_file_path = \
          os.path.join(\
            dest_file_path,
            os.path.basename(file))  # fix for dir input
      output_file_list.append(dest_file_path)
    remove_dir(dir_path=temp_work_dir)  # remove temp dir
    self.param(
      'dataflow_params',
      {'status': 'done',
       'output_list': output_file_list})  # add dataflow params
    if collect_remote_file:
      data = list()
      remove_data_list = [{
        'name': collection_name,
        'type': collection_type}]
      for file in output_file_list:
        data.append({
          'name': collection_name,
          'type': collection_type,
          'table': collection_table,
          'file_path': file,
          'location': file_location})
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      ca.start_session()
      try:
        ca.remove_collection_group_info(
          data=remove_data_list,
          autosave=False)  # remove existing data before loading new collection
        ca.load_file_and_create_collection(
          data=data,
          autosave=False,
          calculate_file_size_and_md5=False)  # load remote files to db
        ca.commit_session()  # commit changes
        ca.close_session()
      except Exception:
        ca.rollback_session()  # rollback changes
        ca.close_session()
        raise
  except Exception as e:
    message = \
      'project: {2}, sample: {3}, Error in {0}: {1}'.\
      format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
    raise