def _check_and_copy_remote_file(remote_user, remote_host, source_file,
                                    remote_file):
        '''
    An internal static method for copying files to remote path
    
    :param remote_user: Username for the remote server
    :param remote_host: Hostname for the remote server
    :param source_file: Source filepath
    :param remote_file: Remote filepath
    '''
        try:
            if not os.path.exists(source_file):
                raise IOError('Source file {0} not found for copy'.\
                              format(source_file))

            os.chmod(source_file,
                     mode=0o754)  # change source file permission before copy
            remote_address = \
              '{0}@{1}'.format(\
                remote_user,
                remote_host)
            copy_remote_file(\
              source_path=source_file,
              destinationa_path=remote_file,
              destination_address=remote_address)                                     # create dir and copy file to remote
        except:
            raise
  def run_sync(self):
    '''
    A method for syncing sequencing run dirs from the remote server

    The dirs listed under self.seqrun_path on self.seqrun_server are reduced
    to their base names and passed through check_seqrun_dir_in_db; every name
    it returns is checked once more and, if still returned, copied to
    self.output_dir.

    :raises ValueError: if any step of the sync fails
    '''
    try:
      check_file_path(self.output_dir)
      remote_entries = \
        list_remote_file_or_dirs(
          remote_server=self.seqrun_server,
          remote_path=self.seqrun_path,
          only_dirs=True)
      seqrun_names = \
        [os.path.basename(entry) for entry in remote_entries]                   # keep only the dir names
      candidate_seqruns = \
        check_seqrun_dir_in_db(
          all_seqrun_dir=seqrun_names,
          dbconfig=self.database_config_file)                                   # first db filter
      for seqrun in candidate_seqruns:
        try:
          still_new = \
            check_seqrun_dir_in_db(
              all_seqrun_dir=[seqrun],
              dbconfig=self.database_config_file)                               # repeat the db filter just before the copy
          if len(still_new) == 0:
            continue                                                            # nothing to copy for this seqrun

          copy_remote_file(
            source_path=os.path.join(self.seqrun_path, seqrun),
            destinationa_path=self.output_dir,
            source_address=self.seqrun_server)
        except Exception as e:
          raise ValueError('Failed to sync seqrun {0}, got error {1}'.\
                           format(seqrun,e))

    except Exception as e:
      raise ValueError('Stopped syncing seqrun data, got error: {0}'.\
                       format(e))
disk_path = args.disk_path
copy_to_remoter = args.copy_to_remoter
remote_server = args.remote_server
output_path = args.output_path

# Dump the disk usage stats of disk_path to a temp json file and place that
# file either on the remote server or on a local path. Any failure is only
# reported on stdout.
try:
    if copy_to_remoter and not remote_server:
        parser.print_help()
        raise ValueError(
            'Remote server address is required for copying files.')

    storage_stats = get_storage_stats_in_gb(disk_path)  # calculate disk usage stats
    temp_dir = get_temp_dir()
    try:
        temp_file = os.path.join(temp_dir, 'disk_usage.json')  # get temp file path
        with open(temp_file, 'w') as j_data:
            json.dump(storage_stats, j_data, indent=4)  # write disk usage to temp json file

        if copy_to_remoter:
            copy_remote_file(
                source_path=temp_file,
                destinationa_path=output_path,
                destination_address=remote_server)  # copy json file to remote server
        else:
            shutil.copy2(temp_file, output_path)  # copy json file to local path
    finally:
        remove_dir(temp_dir)  # remove temp dir, also when the dump or copy failed
except Exception as e:
    print('Error: {0}'.format(e))
    def run(self):
        '''
        Copy one file of a sequencing run from the remote server and verify it

        Required params: seqrun_igf_id, seqrun_source, seqrun_server,
        seqrun_user, seqrun_local_dir, checksum_type, seqrun_file_name and
        file_md5. A local copy with a matching checksum is kept as it is; a
        local copy with a different checksum is removed and the file is
        fetched again. On success 'dataflow_params' is set to
        {'seqrun_file_name': seqrun_file_name}.

        :raises ValueError: if seqrun_user or seqrun_server is missing, or if
                            the checksum of the copied file does not match
        :raises IOError: if the file is not present locally after the copy
        '''
        seqrun_igf_id = None  # bound up front because the error handler formats it
        try:
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            seqrun_source = self.param_required('seqrun_source')
            seqrun_server = self.param_required('seqrun_server')
            seqrun_user = self.param_required('seqrun_user')
            seqrun_local_dir = self.param_required('seqrun_local_dir')
            checksum_type = self.param_required('checksum_type')
            seqrun_file_name = self.param_required('seqrun_file_name')
            file_md5_value = self.param_required('file_md5')
            transfer_remote_file = True  # transfer file from remote server
            source_file_path = \
                os.path.join(
                    seqrun_source,
                    seqrun_igf_id,
                    seqrun_file_name)  # get new seqrun path
            dir_name = os.path.dirname(
                seqrun_file_name)  # returns dir name or empty string
            destination_dir = \
                os.path.join(
                    seqrun_local_dir,
                    seqrun_igf_id,
                    dir_name)  # get file copy path
            destination_path = \
                os.path.join(
                    destination_dir,
                    os.path.basename(seqrun_file_name))  # get destination path
            if os.path.isfile(destination_path):
                existing_checksum = \
                    calculate_file_checksum(
                        destination_path,
                        hasher=checksum_type)  # checksum of the existing file
                if existing_checksum == file_md5_value:
                    transfer_remote_file = False  # skip file transfer if its up to date
                else:
                    os.remove(destination_path)  # remove outdated file

            if transfer_remote_file:
                # either missing value would produce a broken user@host address
                if seqrun_user is None or seqrun_server is None:
                    raise ValueError('seqrun: {0}, missing required value for seqrun_user or seqrun_server'.\
                                     format(seqrun_igf_id))

                source_address = \
                    '{0}@{1}'.format(seqrun_user, seqrun_server)  # get host username and address
                copy_remote_file(
                    source_path=source_file_path,
                    destinationa_path=destination_path,
                    source_address=source_address,
                    check_file=False)  # copy remote file
                if not os.path.exists(destination_path):
                    raise IOError('failed to copy file {0} for seqrun {1}'.\
                                  format(seqrun_file_name, seqrun_igf_id))  # check destination file after copy

                new_checksum = \
                    calculate_file_checksum(
                        destination_path,
                        hasher=checksum_type)  # checksum of the transferred file
                if new_checksum != file_md5_value:
                    raise ValueError('seqrun:{3}, checksum not matching for file {0}, expected: {1}, got {2}'.\
                                     format(seqrun_file_name,
                                            file_md5_value,
                                            new_checksum,
                                            seqrun_igf_id))  # raise error if checksum doesn't match

            self.param('dataflow_params',
                       {'seqrun_file_name': seqrun_file_name})
        except Exception as e:
            message = \
                'seqrun: {2}, Error in {0}: {1}'.\
                format(
                    self.__class__.__name__,
                    e,
                    seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Beispiel #5
0
    def run(self):
        '''
        Copy an analysis result file to the remote project dir

        The file is copied to a temp dir under the name
        <analysis_label>.<suffix> (the suffix is left out when the input
        name has none), its permission is set to 0o754 and it is sent to
        remote_user@remote_host below
        remote_project_path/project_name/seqrun_date/flowcell_id/dir_label/tag
        [/sample_label]/analysis_label/<file label>. On success
        'dataflow_params' holds the input file, the status 'done' and the
        remote file path.

        :raises IOError: if the input file does not exist
        '''
        seqrun_igf_id = None  # bound up front because the error handler formats it
        try:
            file = self.param_required('file')
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            remote_user = self.param_required('remote_user')
            remote_host = self.param_required('remote_host')
            remote_project_path = self.param_required('remote_project_path')
            project_name = self.param_required('project_name')
            seqrun_date = self.param_required('seqrun_date')
            flowcell_id = self.param_required('flowcell_id')
            dir_label = self.param_required('dir_label')
            sample_label = self.param('sample_label')
            tag = self.param_required('tag')
            use_ephemeral_space = self.param('use_ephemeral_space')
            analysis_label = self.param_required('analysis_label')
            force_overwrite = self.param('force_overwrite')

            if not os.path.exists(file):
                raise IOError('file {0} not found'.format(file))

            if dir_label is None:
                dir_label = \
                    os.path.basename(os.path.dirname(file))  # get the lane and index length info, FIXIT

            file_name_list = os.path.basename(file).split('.')
            file_label = file_name_list[0]  # name part before the first dot
            if len(file_name_list) > 1:
                remote_file_name = \
                    '{0}.{1}'.format(analysis_label, file_name_list[-1])  # simplify remote filename for report page
            else:
                remote_file_name = \
                    '{0}'.format(analysis_label)  # no suffix in the input name, so none in the remote name

            destination_outout_path = \
                os.path.join(
                    remote_project_path,
                    project_name,
                    seqrun_date,
                    flowcell_id,
                    dir_label,
                    tag)  # result dir path is generic
            if sample_label is not None:
                destination_outout_path = \
                    os.path.join(
                        destination_outout_path,
                        sample_label)  # adding sample label only if its present

            destination_outout_path = \
                os.path.join(
                    destination_outout_path,
                    analysis_label,
                    file_label)  # adding file label to the destination path
            if os.path.isfile(file):
                destination_outout_path = \
                    os.path.join(
                        destination_outout_path,
                        remote_file_name)  # add destination file name

            temp_work_dir = \
                get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp work dir
            temp_file_path = os.path.join(temp_work_dir, remote_file_name)
            copy2(file, temp_file_path)  # copy file to a temp dir and rename it
            os.chmod(temp_file_path, mode=0o754)  # set file permission
            copy_remote_file(
                source_path=temp_file_path,
                destinationa_path=destination_outout_path,
                destination_address='{0}@{1}'.format(remote_user, remote_host),
                force_update=force_overwrite)  # copy file to remote
            if os.path.isdir(file):
                destination_outout_path = \
                    os.path.join(
                        destination_outout_path,
                        remote_file_name)  # add destination dir name

            self.param(
                'dataflow_params', {
                    'file': file,
                    'status': 'done',
                    'remote_file': destination_outout_path
                })  # add dataflow params
        except Exception as e:
            message = \
                'seqrun: {2}, Error in {0}: {1}'.\
                format(
                    self.__class__.__name__,
                    e,
                    seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
  def run(self):
    '''
    Seed the file copy sub-tasks of a sequencing run from its md5 json file

    The seqrun dir is checked on the remote server with 'ssh ... ls', the
    single md5 json file of the run is looked up in the db collection
    (collection_name=seqrun_igf_id, collection_type=seqrun_md5_type) and,
    when its location differs from hpc_location, copied to a temp dir first.
    The loaded json content is stored as the 'sub_tasks' param and a status
    message is sent to slack and asana.

    :raises ValueError: if the run has no md5 json file or more than one
    '''
    seqrun_igf_id = None                                                        # bound up front because the error handler formats it
    temp_dir = None                                                             # only set when the md5 file has to be fetched
    try:
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      seqrun_source = self.param_required('seqrun_source')
      seqrun_server = self.param_required('seqrun_server')
      seqrun_user = self.param_required('seqrun_user')
      igf_session_class = self.param_required('igf_session_class')
      seqrun_md5_type = self.param_required('seqrun_md5_type')
      hpc_location = self.param_required('hpc_location')
      db_file_location_label = self.param_required('db_file_location_label')
      db_file_path_label = self.param_required('db_file_path_label')

      seqrun_path = os.path.join(seqrun_source, seqrun_igf_id)                  # get new seqrun path
      seqrun_server_login = \
        '{0}@{1}'.format(seqrun_user, seqrun_server)                            # get remote login, user@host
      subprocess.check_call(
        ['ssh',
         seqrun_server_login,
         'ls',
         seqrun_path])                                                          # check remote path
      ca = CollectionAdaptor(**{'session_class': igf_session_class})            # get the md5 list from db
      ca.start_session()
      try:
        files = \
          ca.get_collection_files(
            collection_name=seqrun_igf_id,
            collection_type=seqrun_md5_type)                                    # fetch file collection
        files = files.to_dict(orient='records')
      finally:
        ca.close_session()                                                      # close the session even if the query fails

      if len(files) > 1:
        raise ValueError('sequencing run {0} has more than one md5 json file'.\
                         format(seqrun_igf_id))

      if len(files) == 0:
        raise ValueError('sequencing run {0} does not have any md5 json file'.\
                         format(seqrun_igf_id))

      md5_json_location = files[0][db_file_location_label]
      md5_json_path = files[0][db_file_path_label]
      try:
        if md5_json_location != hpc_location:
          temp_dir = get_temp_dir(work_dir=os.getcwd())                         # create a temp directory
          destination_path = \
            os.path.join(temp_dir, os.path.basename(md5_json_path))             # get destination path for md5 file
          copy_remote_file(
            source_path=md5_json_path,
            destinationa_path=destination_path,
            source_address=seqrun_server_login)                                 # copy remote file to local disk
          md5_json_path = destination_path                                      # set md5 json filepath

        with open(md5_json_path) as json_data:
          md5_json = json.load(json_data)                                       # read json data, get all file and md5 from json file
      finally:
        if temp_dir is not None:
          remove_dir(temp_dir)                                                  # remove temp dir only if one was created

      self.param('sub_tasks', md5_json)                                         # seed dataflow
      message = \
        'seqrun: {0}, seeded {1} files for copy'.\
        format(seqrun_igf_id, len(md5_json))
      self.warning(message)
      self.post_message_to_slack(message, reaction='pass')
      self.comment_asana_task(
        task_name=seqrun_igf_id,
        comment=message)

    except Exception as e:
      message = \
        'Error in {0}: {1}, seqrun: {2}'.\
        format(
          self.__class__.__name__,
          e,
          seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(message, reaction='fail')
      self.comment_asana_task(
        task_name=seqrun_igf_id,
        comment=message)
      raise
    def run(self):
        '''
        Render a QC html page from a template and copy it to the remote server

        page_type selects the page to build: 'project', 'undetermined' or
        'sample'. The page is written to a temp dir and copied to
        remote_project_path/project_name/seqrun_date/flowcell_id (plus
        lane_index_info when it is set) on remote_user@remote_host. For
        sample pages the first demultiplexing report found under fastq_dir
        is copied to the same remote dir. The collected paths are stored as
        'dataflow_params' ({'qc_file_info': ...}) and the page url is sent
        to slack and asana.

        :raises ValueError: for an unknown page_type, for missing sample page
                            inputs or when no demultiplexing report is found
        :raises IOError: if the rendered page is missing before the copy
        '''
        seqrun_igf_id = None  # bound up front because the error handler formats it
        try:
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            project_name = self.param_required('project_name')
            seqrun_date = self.param_required('seqrun_date')
            flowcell_id = self.param_required('flowcell_id')
            remote_project_path = self.param_required('remote_project_path')
            remote_user = self.param_required('remote_user')
            remote_host = self.param_required('remote_host')
            template_dir = self.param_required('template_dir')
            page_type = self.param_required('page_type')
            fastq_dir = self.param('fastq_dir')
            multiqc_remote_file = self.param('multiqc_remote_file')
            lane_index_info = self.param('lane_index_info')
            qc_template_path = self.param('qc_template_path')
            project_template = self.param('project_template')
            undetermined_template = self.param('undetermined_template')
            sample_template = self.param('sample_template')
            project_filename = self.param('project_filename')
            sample_filename = self.param('sample_filename')
            undetermined_filename = self.param('undetermined_filename')
            report_html = self.param('report_html')
            remote_ftp_base = self.param('remote_ftp_base')
            use_ephemeral_space = self.param('use_ephemeral_space')

            if page_type not in ['project', 'sample', 'undetermined']:
                raise ValueError(
                    'Project type {0} is not defined yet'.format(page_type))

            qc_template_path = \
                os.path.join(template_dir, qc_template_path)
            remote_file_path = \
                os.path.join(
                    remote_project_path,
                    project_name,
                    seqrun_date,
                    flowcell_id)
            if lane_index_info is not None:
                remote_file_path = \
                    os.path.join(
                        remote_file_path,
                        lane_index_info)  # generic remote path, lane info is none for project

            template_env = \
                Environment(
                    loader=FileSystemLoader(searchpath=qc_template_path),
                    autoescape=select_autoescape(['xml']))  # set template env

            temp_work_dir = \
                get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # get a temp dir
            report_output_file = None
            qc_file_info = {
                'project_name': project_name,
                'flowcell': flowcell_id,
            }
            if page_type == 'project':  # prepare project page
                (headerdata, qcmain) = \
                    self._process_projects_data()  # get required data for project qc page
                template_file = \
                    template_env.get_template(project_template)
                report_output_file = \
                    os.path.join(
                        temp_work_dir,
                        project_filename)
                template_file.\
                    stream(
                        ProjectName=project_name,
                        SeqrunDate=seqrun_date,
                        FlowcellId=flowcell_id,
                        headerdata=headerdata,
                        qcmain=qcmain).\
                    dump(report_output_file)
                os.chmod(report_output_file, mode=0o754)

            elif page_type == 'undetermined':  # prepare undetermined fastq page
                (headerdata, qcmain) = \
                    self._process_undetermined_data(remote_file_path)  # get required data for undetermined qc page
                template_file = \
                    template_env.get_template(undetermined_template)
                report_output_file = \
                    os.path.join(
                        temp_work_dir,
                        undetermined_filename)
                template_file.\
                    stream(
                        ProjectName=project_name,
                        SeqrunDate=seqrun_date,
                        FlowcellId=flowcell_id,
                        headerdata=headerdata,
                        qcmain=qcmain).\
                    dump(report_output_file)
                os.chmod(report_output_file, mode=0o754)

            elif page_type == 'sample':  # prepare sample page
                if lane_index_info is None:
                    raise ValueError('Missing lane and index information')

                if fastq_dir is None:
                    raise ValueError('Missing required fastq_dir')

                if report_html is None:
                    # checked here so the failure names the param instead of
                    # surfacing as a TypeError from os.path.basename
                    raise ValueError('Missing required report_html')

                (headerdata, qcmain) = \
                    self._process_samples_data()  # get required data for sample qc page
                (lane_id, index_length) = \
                    lane_index_info.split('_', 1)  # get lane and index info
                template_file = \
                    template_env.get_template(sample_template)  # get template file
                report_output_file = \
                    os.path.join(
                        temp_work_dir,
                        sample_filename)
                template_file.\
                    stream(
                        ProjectName=project_name,
                        SeqrunDate=seqrun_date,
                        FlowcellId=flowcell_id,
                        Lane=lane_id,
                        IndexBarcodeLength=index_length,
                        headerdata=headerdata,
                        qcmain=qcmain).\
                    dump(report_output_file)  # dump data to template file
                os.chmod(report_output_file, mode=0o754)

                remote_sample_qc_path = \
                    os.path.join(
                        remote_file_path,
                        os.path.basename(report_output_file))
                if multiqc_remote_file is None:
                    raise ValueError(
                        'required a valid path for remote multiqc')

                remote_path = \
                    os.path.join(
                        remote_project_path,
                        project_name,
                        seqrun_date,
                        flowcell_id)  # get remote base path
                remote_sample_qc_path = \
                    os.path.relpath(
                        remote_sample_qc_path,
                        start=remote_path)  # relative path for sample qc
                multiqc_remote_file = \
                    os.path.relpath(
                        multiqc_remote_file,
                        start=remote_path)  # relative path for multiqc

                report_htmlname = os.path.basename(report_html)
                reports = list()
                for root, _, files in os.walk(top=fastq_dir):
                    if report_htmlname in files:
                        reports.\
                            extend([os.path.join(os.path.abspath(root), file)
                                    for file in files
                                    if fnmatch.fnmatch(os.path.join(root, file), report_html)])  # get all html reports

                if len(reports) == 0:
                    raise ValueError('No demultiplexing report found for fastq dir {0}'.\
                                     format(fastq_dir))

                os.chmod(reports[0], mode=0o774)  # set permission of the report html before copy
                copy_remote_file(
                    source_path=reports[0],
                    destinationa_path=remote_file_path,
                    destination_address='{0}@{1}'.format(
                        remote_user,
                        remote_host))  # copy file to remote
                remote_report_file = \
                    os.path.join(
                        remote_file_path,
                        os.path.basename(reports[0]))  # get remote path for report file
                remote_report_file = \
                    os.path.relpath(
                        remote_report_file,
                        start=remote_path)  # get relative path for demultiplexing report

                # the sample page info replaces the generic info built above
                qc_file_info = {
                    'lane_id': lane_id,
                    'index_length': index_length,
                    'sample_qc_page': remote_sample_qc_path,
                    'multiqc_page': multiqc_remote_file,
                    'demultiplexing_report': remote_report_file,
                    'fastq_dir': fastq_dir,
                    'project_name': project_name,
                }

            if not os.path.exists(report_output_file):
                raise IOError('file {0} not found'.format(report_output_file))

            copy_remote_file(
                source_path=report_output_file,
                destinationa_path=remote_file_path,
                destination_address='{0}@{1}'.format(remote_user, remote_host))  # copy file to remote
            remote_qc_page = \
                os.path.join(
                    remote_file_path,
                    os.path.basename(report_output_file))
            qc_file_info.\
                update({'remote_qc_page': remote_qc_page})
            self.param('dataflow_params', {'qc_file_info': qc_file_info})

            remote_url_path = \
                'http://{0}/{1}'.\
                format(
                    remote_host,
                    os.path.relpath(
                        remote_qc_page,
                        start=remote_ftp_base))
            message = \
                'QC page {0}, {1},{2}: {3}'.\
                format(
                    seqrun_igf_id,
                    project_name,
                    page_type,
                    remote_url_path)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send msg to slack
            self.comment_asana_task(
                task_name=seqrun_igf_id,
                comment=message)  # send msg to asana
        except Exception as e:
            message = \
                'seqrun: {2}, Error in {0}: {1}'.\
                format(
                    self.__class__.__name__,
                    e,
                    seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Beispiel #8
0
                    '--source_address',
                    default=None,
                    help='Source address with user name')
parser.add_argument(
    '-d', '--dest_address',
    default=None,
    help='Destination address with user name')
parser.add_argument(
    '-f', '--force_update',
    action='store_true',
    default=False,
    help='Force update existing file')

# command line values are read at module level, as before
args = parser.parse_args()
source_path = args.source_path
dest_path = args.dest_path
source_address = args.source_address
dest_address = args.dest_address
force_update = args.force_update

if __name__ == '__main__':
    # rsync based copy with file check; any failure is reported as ValueError
    copy_options = {
        'source_path': source_path,
        'destinationa_path': dest_path,
        'source_address': source_address,
        'destination_address': dest_address,
        'copy_method': 'rsync',
        'check_file': True,
        'force_update': force_update,
    }
    try:
        copy_remote_file(**copy_options)
    except Exception as e:
        raise ValueError("Failed to copy remote file, error:{0}".format(e))
Beispiel #9
0
  def run(self):
    '''
    Copy a list of files or dirs to the remote project dir

    Every entry of file_list is copied to a temp dir, its permissions are
    changed (files 0o764, dirs 0o775) and it is sent to
    remote_user@remote_host below remote_project_path/project_igf_id
    (plus dir_labels when a non-empty list is given). The remote paths are
    stored as 'dataflow_params' ({'status': 'done', 'output_list': ...}).
    When collect_remote_file is set, the existing db collection of the given
    name and type is replaced by the copied remote files.

    :raises ValueError: if collect_remote_file is set without collection
                        name and type, or for an unknown source file type
    :raises IOError: if an entry of file_list does not exist
    '''
    project_igf_id = None                                                       # both ids are bound up front because
    sample_igf_id = None                                                        # the error handler formats them
    try:
      project_igf_id = self.param_required('project_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      file_list = self.param_required('file_list')
      remote_user = self.param_required('remote_user')
      remote_host = self.param_required('remote_host')
      remote_project_path = self.param_required('remote_project_path')
      dir_labels = self.param_required('dir_labels')
      igf_session_class = self.param_required('igf_session_class')
      force_overwrite = self.param('force_overwrite')
      collect_remote_file = self.param('collect_remote_file')
      collection_name = self.param('collection_name')
      collection_type = self.param('collection_type')
      collection_table = self.param('collection_table')
      file_location = self.param('file_location')
      use_ephemeral_space = self.param('use_ephemeral_space')
      destination_output_path = \
        os.path.join(
          remote_project_path,
          project_igf_id)                                                       # get base destination path
      if isinstance(dir_labels, list) and \
         len(dir_labels) > 0:
        destination_output_path = \
          os.path.join(
            destination_output_path,
            *dir_labels)

      if collect_remote_file:
        if collection_name is None or \
           collection_type is None:
          raise ValueError('Name and type are required for db collection')

      output_file_list = list()
      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp dir
      try:
        for file in file_list:
          if not os.path.exists(file):
            raise IOError('file {0} not found'.\
                          format(file))

          temp_copy_path = \
            os.path.join(
              temp_work_dir,
              os.path.basename(file))                                           # path of the local temp copy
          if os.path.isfile(file):
            copy2(file, temp_copy_path)                                         # copy file to a temp dir
            dest_file_path = \
              os.path.join(
                destination_output_path,
                os.path.basename(file))                                         # get destination file path
            os.chmod(temp_copy_path, mode=0o764)                                # set file permission
          elif os.path.isdir(file):
            copytree(file, temp_copy_path)                                      # copy dir to a temp dir
            dest_file_path = destination_output_path
            for root, dirs, files in os.walk(temp_work_dir):
              for dir_name in dirs:
                os.chmod(
                  os.path.join(root, dir_name),
                  mode=0o775)
              for file_name in files:
                os.chmod(
                  os.path.join(root, file_name),
                  mode=0o764)                                                   # changing file and dir permissions for remote files
          else:
            raise ValueError('Unknown source file type: {0}'.\
                             format(file))

          copy_remote_file(
            source_path=temp_copy_path,
            destinationa_path=dest_file_path,
            destination_address='{0}@{1}'.format(remote_user, remote_host),
            force_update=force_overwrite)                                       # copy file to remote
          if os.path.isdir(file):
            dest_file_path = \
              os.path.join(
                dest_file_path,
                os.path.basename(file))                                         # fix for dir input

          output_file_list.append(dest_file_path)
      finally:
        remove_dir(dir_path=temp_work_dir)                                      # remove temp dir, also when a copy failed

      self.param(
        'dataflow_params',
        {'status': 'done',
         'output_list': output_file_list})                                      # add dataflow params
      if collect_remote_file:
        remove_data_list = \
          [{'name': collection_name,
            'type': collection_type}]
        data = \
          [{'name': collection_name,
            'type': collection_type,
            'table': collection_table,
            'file_path': remote_file,
            'location': file_location}
           for remote_file in output_file_list]

        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        ca.start_session()
        try:
          ca.remove_collection_group_info(
            data=remove_data_list,
            autosave=False)                                                     # remove existing data before loading new collection
          ca.load_file_and_create_collection(
            data=data,
            autosave=False,
            calculate_file_size_and_md5=False)                                  # load remote files to db
          ca.commit_session()                                                   # commit changes
        except BaseException:
          ca.rollback_session()                                                 # rollback changes on any failure
          raise
        finally:
          ca.close_session()

    except Exception as e:
      message = \
        'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message, reaction='fail')                      # post msg to slack for failed jobs
      raise