Example #1
def config_file_setup(logger, cf_label, cf_from_cli=None):
    """
    Create a config file if it does not exist, copying it from the package
    default into the user_config_dir.
    Return the configuration file path from the cli args if present, otherwise
    return the path from the user_config_dir.
    :param logger: logger
    :param cf_label: label of the configuration file (required)
    :param cf_from_cli: path to the configuration file from the cli arg
    :return: Path
    """
    presta_config_dir = os.path.join(user_config_dir(__appname__))
    config_file_from_home = os.path.join(presta_config_dir, cf_label)

    if not path_exists(config_file_from_home, logger, force=False):
        logger.info('Creating config path {}'.format(presta_config_dir))
        ensure_dir(presta_config_dir)
        config_file_path = '/'.join(['config', cf_label])
        config_file_from_package = resource_filename(__appname__,
                                                     config_file_path)
        copyfile(config_file_from_package, config_file_from_home)

    config_file_paths = []
    if cf_from_cli and path_exists(cf_from_cli, logger, force=False):
        config_file_paths.append(WeightedPath(cf_from_cli, 0))
    if path_exists(config_file_from_home, logger, force=False):
        config_file_paths.append(WeightedPath(config_file_from_home, 1))

    logger.debug("config file paths: {}".format(config_file_paths))

    config_file_path = sorted(config_file_paths)[0].path
    logger.info('Reading configuration from {}'.format(config_file_path))
    return config_file_path
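
Note that sorted(config_file_paths)[0].path only prefers the CLI path if the WeightedPath ordering is driven by the weight; a minimal sketch of such a helper, with the class name taken from the example and the implementation assumed so that weight 0 (CLI) beats weight 1 (user_config_dir):

from functools import total_ordering


@total_ordering
class WeightedPath(object):
    """Pair a path with a priority; lower weight sorts first (assumed)."""

    def __init__(self, path, weight):
        self.path = path
        self.weight = weight

    def __eq__(self, other):
        return self.weight == other.weight

    def __lt__(self, other):
        return self.weight < other.weight

    def __repr__(self):
        return 'WeightedPath({!r}, {})'.format(self.path, self.weight)
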
Example #2
    def get_object(self, src_path, dest_path=None, prefix='irods://'):
        """
        Retrieves an object from an existing path.
        If dest_path is set, data will be copied from iRODS to the filesystem.

        :type src_path: str
        :param src_path: irods path

        :type dest_path: str
        :param dest_path: destination path

        :type prefix: str
        :param prefix: path's prefix (if any)

        :return: an irods.data_object.iRODSDataObject,
        irods.collection.iRODSCollection or None
        """

        if src_path.startswith(prefix):
            # strip the scheme prefix so the remainder is handled as an
            # absolute iRODS path
            src_path = os.path.join(src_path.replace(prefix, '/'))
        exists, obj = self.exists(src_path, delivery=True)

        if exists and dest_path:
            ensure_dir(os.path.dirname(dest_path))
            with open(dest_path, 'w') as df:
                with obj.open('r') as sf:
                    for line in sf:
                        df.write(line)

        return obj
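
These examples lean on two small filesystem helpers, ensure_dir() and path_exists(), whose implementations are not shown; the following is only a plausible sketch, with signatures inferred from the call sites (e.g. ensure_dir(repo_dir, force=True), path_exists(cf_from_cli, logger, force=False)):

import errno
import os
import shutil
import sys


def ensure_dir(path, force=False):
    """Create path (and parents) if missing; with force=True, recreate it."""
    if force and os.path.isdir(path):
        shutil.rmtree(path)
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise


def path_exists(path, logger=None, force=True):
    """Return whether path exists; with force=True, log and abort if missing."""
    found = os.path.exists(path)
    if not found and force:
        if logger:
            logger.error('{} not found'.format(path))
        sys.exit(1)
    return found
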
Example #3
def main():
    app = App()
    parser = app.make_parser()
    args = parser.parse_args()
    ensure_dir(os.path.dirname(args.logfile))
    logger = a_logger('Main', level=args.loglevel, filename=args.logfile)
    logger.info('{} started'.format(__appname__.capitalize()))

    args.func(logger, args) if hasattr(args, 'func') else parser.print_help()
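
The args.func dispatch at the end of main() follows the standard argparse sub-command idiom, where each sub-parser registers its handler via set_defaults(func=...); a minimal sketch of how App.make_parser() might be wired (the sub-command name and handler below are assumptions, not taken from the project):

import argparse


def qc_handler(logger, args):
    # hypothetical handler: main() calls it as args.func(logger, args)
    logger.info('qc requested for {}'.format(args.rd_label))


def make_parser():
    parser = argparse.ArgumentParser(prog='presta')
    parser.add_argument('--logfile', default='/tmp/presta.log')
    parser.add_argument('--loglevel', default='INFO')
    subparsers = parser.add_subparsers()
    qc = subparsers.add_parser('qc', help='run quality checks on a rundir')
    qc.add_argument('--rd_label', required=True)
    qc.set_defaults(func=qc_handler)
    return parser
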
Example #4
    def update(self, label):
        pipeline = Pipeline(self.conf[label],
                            loglevel=self.loglevel,
                            logfile=self.logfile)
        skip = ''
        if self.ask_before_to_refresh:
            msg = ("Updating {} - {}".format(pipeline.label.capitalize(),
                                             pipeline.description))
            skip = self.user_input(msg)
        if skip != ord('s'):
            repo_dir = os.path.join(self.cache_dir, label)
            ensure_dir(repo_dir, force=True)
            self.clone(label)
Example #5
    def __init__(self, args=None):
        self.loglevel = args.loglevel
        self.logfile = args.logfile
        self.logger = a_logger(self.__class__.__name__,
                               level=self.loglevel,
                               filename=self.logfile)
        path_from_cli = args.config_file if 'config_file' in vars(
            args) else None
        cm = ConfigurationManager(args=args, path_from_cli=path_from_cli)
        self.conf = cm.get_pipelines_config
        self.core_environment_file = cm.get_default_config[
            'core_environment_file']
        self.environment_file = cm.get_default_config[
            'project_environment_file']
        self.cache_dir = cache_dir
        ensure_dir(self.cache_dir)

        self.ask_before_to_refresh = args.ask if 'ask' in vars(args) else False
Example #6
def copy_qc_dirs(trigger=None,  **kwargs):

    if trigger is False:
        return trigger

    src = kwargs.get('src')
    dest = kwargs.get('dest')

    dirs = ['Stats', 'Reports', 'fastqc']
    ensure_dir(dest)
    task0 = copy.si(os.path.join(src, dirs[0]), os.path.join(dest, dirs[0]))
    task1 = copy.si(os.path.join(src, dirs[1]), os.path.join(dest, dirs[1]))
    task2 = copy.si(os.path.join(src, dirs[2]), os.path.join(dest, dirs[2]))

    job = group([task0, task1, task2])

    result = job.apply_async()

    return result
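
copy.si(...) builds immutable Celery signatures that are fanned out with group(); the copy task itself is not part of these examples, so here is only a plausible sketch based on standard Celery and shutil usage (task name and broker settings are assumptions):

import os
import shutil

from celery import Celery

app = Celery('tasks')  # broker/backend configuration omitted


@app.task(name='copy')
def copy(src, dest):
    """Copy a file or a directory tree from src to dest (assumed behaviour)."""
    if os.path.isdir(src):
        shutil.copytree(src, dest)
    else:
        shutil.copy2(src, dest)
    return dest
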
Example #7
    def __init__(self,
                 args=None,
                 path_from_cli=None,
                 path_from_package='config/config.yaml',
                 config_filename='config.yaml'):
        def copy_config_file_from_package(appname, src, dst):
            _from_package = resource_filename(appname, src)
            copyfile(_from_package, dst)

        self.loglevel = args.loglevel
        self.logfile = args.logfile
        logger = a_logger(self.__class__.__name__,
                          level=self.loglevel,
                          filename=self.logfile)

        cfg_dir = os.path.join(config_dir)
        config_file_path = os.path.join(cfg_dir, config_filename)

        # Create configuration file from default if needed
        if not path_exists(cfg_dir, logger, force=False):
            logger.info('Creating config dir {}'.format(cfg_dir))
            ensure_dir(cfg_dir)
        if not path_exists(config_file_path, logger, force=False):
            logger.info('Copying default config file from {} package '
                        'resource'.format(__appname__))
            copy_config_file_from_package(__appname__, path_from_package,
                                          config_file_path)

        config_file_paths = []
        if path_from_cli and path_exists(path_from_cli, logger, force=False):
            config_file_paths.append(WeightedPath(path_from_cli, 0))
        if path_exists(config_file_path, logger, force=False):
            config_file_paths.append(WeightedPath(config_file_path, 1))

        logger.debug("config file paths: {}".format(config_file_paths))

        config_file_path = sorted(config_file_paths)[0].path
        logger.info('Reading configuration from {}'.format(config_file_path))

        c = load_config(config_file_path)
        self.pipes_conf = c['pipelines'] if 'pipelines' in c else None
        self.default_conf = c['default_vars'] if 'default_vars' in c else None
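
The manager only looks for two top-level sections, 'pipelines' and 'default_vars'; an illustrative sketch of the dict that load_config() is expected to return (the section names come from the code above, the nested values are purely assumed):

example_config = {
    'pipelines': {
        # each entry feeds Pipeline(...); 'url' and 'description' are the
        # attributes read elsewhere in these examples
        'rnaseq': {
            'url': 'https://github.com/example/rnaseq-pipeline.git',
            'description': 'RNA-Seq quantification pipeline',
        },
    },
    'default_vars': {
        # keys read via get_default_config in Example #5
        'core_environment_file': 'environment-core.yaml',
        'project_environment_file': 'environment.yaml',
    },
}
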
Example #8
    def clone(self, label):
        pipeline = Pipeline(self.conf[label],
                            loglevel=self.loglevel,
                            logfile=self.logfile)
        repo_dir = os.path.join(self.cache_dir, label)
        ensure_dir(repo_dir)
        if path_is_empty(repo_dir):
            print("Cloning {}".format(pipeline.url))
            Repo.clone_from(pipeline.url, repo_dir)
            repo = Repo(repo_dir)
            heads = repo.heads
            master = heads.master
            with open(os.path.join(repo_dir, ".git_repo_last_commit"),
                      'w') as filename:
                filename.write(pipeline.url)
                filename.write("\ncommit id: {}".format(master.commit))

            requirements_path = os.path.join(repo_dir,
                                             self.core_environment_file)
            if not path_exists(requirements_path):
                data = {
                    'channels': ['bioconda', 'conda-forge', 'defaults'],
                    'dependencies': ['python==3.6.1', 'pip']
                }
                dump(data, requirements_path)

            requirements_path = os.path.join(repo_dir, self.environment_file)
            if not path_exists(requirements_path):
                data = {
                    'channels': ['bioconda', 'conda-forge', 'defaults'],
                    'dependencies': ['snakemake', 'drmaa==0.7.8']
                }
                dump(data, requirements_path)

            print("commit id: {}".format(master.commit))
            print("Done.\n")
            self.logger.info('Cloned git repo at {} into {} '
                             'directory'.format(pipeline.url, repo_dir))
        else:
            self.logger.warning("Can't clone git repo {} "
                                "into {}".format(pipeline.url, repo_dir))
Example #9
    def run(self):
        msgs = [
            "Generating Fastqc reports",
            "Coping qc dirs from {} to {}".format(self.input_path,
                                                  self.output_path)
        ]

        if path_exists(self.qc_path, self.logger, force=False) and len(os.listdir(self.qc_path)) > 0 \
                and not self.rerun:

            self.logger.info(msgs[1])
            copy_task = dispatch_event.si(event='copy_qc_folders',
                                          params=dict(src=self.input_path,
                                                      dest=self.output_path))
            copy_task.delay()

        else:
            self.logger.info("{} and {}".format(msgs[0], msgs[1]))
            ensure_dir(self.qc_path, force=True)

            qc_task = chain(
                dispatch_event.si(
                    event='qc_started',
                    params=dict(progress_status_file=self.started)),
                rd_collect_fastq.si(ds_path=self.input_path),
                qc_runner.s(outdir=self.qc_path,
                            batch_queuing=self.batch_queuing,
                            queue_spec=self.queues_conf.get('q_fastqc')),
            ).apply_async()

            copy_task = trigger_event.si(event='copy_qc_folders',
                                         params=dict(src=self.input_path,
                                                     dest=self.output_path),
                                         tasks=qc_task.get())
            copy_task.apply_async()

            trigger_event.si(event='qc_completed',
                             params=dict(progress_status_file=self.completed),
                             tasks=qc_task.get()).apply_async()
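
The qc pipeline is built from a Celery chain of immutable signatures: .si() freezes all arguments, while qc_runner.s(...) lets the result of the previous task (the collected fastq files) flow in as its first positional argument. A self-contained sketch of the same pattern with placeholder tasks (names and paths are illustrative, not from presta):

from celery import Celery, chain

app = Celery('demo')  # broker/backend configuration omitted


@app.task
def collect_fastq(ds_path):
    return ['{}/sample_R1.fastq.gz'.format(ds_path)]


@app.task
def run_fastqc(fastq_files, outdir):
    return 'fastqc on {} files, reports in {}'.format(len(fastq_files), outdir)


workflow = chain(collect_fastq.si('/archive/run1/datasets'),
                 run_fastqc.s(outdir='/archive/run1/datasets/qc'))
result = workflow.apply_async()
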
Example #10
File: qc.py Project: gmauro/presta
    def run(self):
        msgs = ["Generating Fastqc reports",
                "Coping qc dirs from {} to {}".format(self.input_path,
                                                      self.output_path)]

        if path_exists(self.qc_path, self.logger, force=False) and len(os.listdir(self.qc_path)) > 0 \
                and not self.rerun:

            self.logger.info(msgs[1])
            copy_task = dispatch_event.si(event='copy_qc_folders',
                                          params=dict(src=self.input_path,
                                                      dest=self.output_path)
                                          )
            copy_task.delay()

        else:
            self.logger.info("{} and {}".format(msgs[0], msgs[1]))
            ensure_dir(self.qc_path, force=True)

            qc_task = chain(dispatch_event.si(event='qc_started',
                                              params=dict(progress_status_file=self.started)),
                            rd_collect_fastq.si(ds_path=self.input_path),
                            qc_runner.s(outdir=self.qc_path,
                                        batch_queuing=self.batch_queuing,
                                        queue_spec=self.queues_conf.get('q_fastqc')),
                            ).apply_async()

            copy_task = trigger_event.si(event='copy_qc_folders',
                                         params=dict(src=self.input_path,
                                                     dest=self.output_path),
                                         tasks=qc_task.get())
            copy_task.apply_async()

            trigger_event.si(event='qc_completed',
                             params=dict(progress_status_file=self.completed),
                             tasks=qc_task.get()).apply_async()
Example #11
File: qc.py Project: gmauro/presta
    def __init__(self, args=None, logger=None):
        self.logger = logger
        self.conf = get_conf(logger, args.config_file)
        self.io_conf = self.conf.get_io_section()
        self.batch_queuing = args.batch_queuing
        self.queues_conf = self.conf.get_section('queues')

        rd_label = args.rd_label
        ds_path = args.ds_path if args.ds_path \
            else os.path.join(self.io_conf.get('archive_root_path'),
                              rd_label,
                              self.io_conf.get('ds_folder_name'))

        qc_path = args.qc_path if args.qc_path \
            else os.path.join(ds_path,
                              self.io_conf.get('qc_folder_name'))

        qc_export_path = args.qc_export_path if args.qc_export_path \
            else os.path.join(self.io_conf.get('qc_export_basepath'),
                              rd_label)

        # FIXME: this is a local path, must be checked that run on right node
        if not path_exists(qc_export_path, logger, force=False):
            ensure_dir(qc_export_path)

        path_exists(ds_path, logger)
        path_exists(qc_export_path, logger)

        self.input_path = ds_path
        self.output_path = qc_export_path
        self.qc_path = qc_path
        self.rerun = args.rerun
        self.started = os.path.join(self.qc_path,
                                    self.io_conf.get('quality_check_started_file'))
        self.completed = os.path.join(self.qc_path,
                                      self.io_conf.get('quality_check_completed_file'))
Example #12
    def __init__(self, args=None, logger=None):
        self.logger = logger
        self.conf = get_conf(logger, args.config_file)
        self.io_conf = self.conf.get_io_section()
        self.batch_queuing = args.batch_queuing
        self.queues_conf = self.conf.get_section('queues')

        rd_label = args.rd_label
        ds_path = args.ds_path if args.ds_path \
            else os.path.join(self.io_conf.get('archive_root_path'),
                              rd_label,
                              self.io_conf.get('ds_folder_name'))

        qc_path = args.qc_path if args.qc_path \
            else os.path.join(ds_path,
                              self.io_conf.get('qc_folder_name'))

        qc_export_path = args.qc_export_path if args.qc_export_path \
            else os.path.join(self.io_conf.get('qc_export_basepath'),
                              rd_label)

        # FIXME: this is a local path, must be checked that run on right node
        if not path_exists(qc_export_path, logger, force=False):
            ensure_dir(qc_export_path)

        path_exists(ds_path, logger)
        path_exists(qc_export_path, logger)

        self.input_path = ds_path
        self.output_path = qc_export_path
        self.qc_path = qc_path
        self.rerun = args.rerun
        self.started = os.path.join(
            self.qc_path, self.io_conf.get('quality_check_started_file'))
        self.completed = os.path.join(
            self.qc_path, self.io_conf.get('quality_check_completed_file'))
Example #13
    def __fs2fs_carrier(self, input_paths, opath):

        self.delivery_started = os.path.join(
            opath, self.io_conf.get('delivery_started_file'))
        self.delivery_completed = os.path.join(
            opath, self.io_conf.get('delivery_completed_file'))

        self.merge_started = os.path.join(
            opath, self.io_conf.get('merge_started_file'))
        self.merge_completed = os.path.join(
            opath, self.io_conf.get('merge_completed_file'))

        bids = [
            _ for _ in self.delivery['samples_info'].keys()
            if self.delivery['samples_info'][_].get('type') not in
            SAMPLE_TYPES_TOSKIP
        ]

        if len(bids) > 0:
            for id, info in self.delivery['samples_info'].iteritems():
                batch_id = info.get('batch_id')
                path = os.path.join(opath, batch_id)
                if not self.dry_run and not os.path.exists(path):
                    ensure_dir(path)

        self.logger.info('Looking for files related to {} Bika ids'.format(
            len(bids)))

        dm = DatasetsManager(self.logger, bids)
        for path in input_paths:
            if self.runs and isinstance(self.runs,
                                        list) and len(self.runs) > 0:
                for run in self.runs:
                    ipath = os.path.join(path, run)
                    if os.path.exists(ipath):
                        self.logger.info('Searching in {}'.format(ipath))
                        datasets_info, count = dm.collect_fastq_from_fs(ipath)
                        self.logger.info("found {} files in {}".format(
                            count, ipath))
            else:
                ipath = path
                if os.path.exists(ipath):
                    self.logger.info('Searching in {}'.format(ipath))
                    datasets_info, count = dm.collect_fastq_from_fs(ipath)
                    self.logger.info("found {} files in {}".format(
                        count, ipath))

        datasets_info = dm.fastq_collection
        count = dm.fastq_counter

        self.logger.info("found {} files".format(count))

        to_be_merged = dict()

        if not self.dry_run:
            dispatch_event.si(event='delivery_started',
                              params=dict(
                                  progress_status_file=self.delivery_started,
                                  delivery_id=self.delivery_id)).delay()

        for bid in bids:
            sample_label = self.samples_info[bid].get('client_sample_id')

            if bid not in to_be_merged:
                to_be_merged[bid] = dict()

            if bid in datasets_info:
                for f in datasets_info[bid]:
                    src = f.get('filepath')
                    read = f.get('read_label')
                    lane = f.get('lane')
                    ext = f.get('file_ext')

                    batch_id = self.samples_info[bid].get('batch_id')

                    filename = format_dataset_filename(
                        sample_label=sample_label,
                        lane=lane,
                        read=read,
                        ext=ext,
                        uid=True)

                    dst = os.path.join(opath, batch_id, filename)

                    self.logger.info("Coping {} into {}".format(src, dst))

                    if os.path.isfile(dst):
                        self.logger.info('{} skipped'.format(
                            os.path.basename(dst)))
                    else:
                        if not self.dry_run:
                            tsk = copy.si(src, dst).delay()
                            self.logger.info('{} copied'.format(
                                os.path.basename(dst)))

                        if self.merge:
                            to_be_merged[bid][ext] = dict(
                            ) if ext not in to_be_merged[
                                bid] else to_be_merged[bid][ext]

                            if read not in to_be_merged[bid][ext]:
                                to_be_merged[bid][ext][read] = dict(src=list(),
                                                                    dst=list(),
                                                                    tsk=list())

                            to_be_merged[bid][ext][read]['src'].append(src)
                            to_be_merged[bid][ext][read]['dst'].append(dst)

                            if not self.dry_run and tsk:
                                to_be_merged[bid][ext][read]['tsk'].append(
                                    tsk.task_id)
                        else:
                            if self.md5_check:
                                # MD5 CHECKSUM
                                self.logger.info(
                                    "Getting MD5 hash of {}".format(dst))
                                if not self.dry_run:
                                    md5_task = trigger_event.si(
                                        event='get_md5_checksum',
                                        params=dict(src=dst,
                                                    dst=".".join([dst,
                                                                  'md5'])),
                                        tasks=[tsk.task_id]).delay()
                                    task_id = md5_task.get()

            else:
                msg = 'I have not found any file related to this ' \
                      'Bika id: {}'.format(bid)

                self.logger.warning(msg)
                self.logger.info('{} skipped'.format(bid))
                del to_be_merged[bid]

        if self.merge:
            if not self.dry_run:
                dispatch_event.si(
                    event='merge_started',
                    params=dict(
                        progress_status_file=self.merge_started)).delay()

            for bid, file_ext in to_be_merged.iteritems():
                sample_label = self.samples_info[bid].get('client_sample_id')
                for ext, reads in file_ext.iteritems():
                    for read, datasets in reads.iteritems():

                        filename = format_dataset_filename(
                            sample_label=sample_label, read=read, ext=ext)
                        src = datasets['dst']
                        dst = os.path.join(opath, batch_id, filename)
                        tsk = datasets['tsk']

                        self.logger.info("Merging {} into {}".format(
                            " ".join(src), dst))
                        if not self.dry_run:
                            merge_task = trigger_event.si(
                                event='merge_datasets',
                                params=dict(src=src, dst=dst, remove_src=True),
                                tasks=tsk).delay()
                            task_id = merge_task.get()
                            if self.md5_check:
                                # MD5 CHECKSUM
                                self.logger.info(
                                    "Getting MD5 hash of {}".format(dst))
                                md5_task = trigger_event.si(
                                    event='get_md5_checksum',
                                    params=dict(src=dst,
                                                dst=".".join([dst, 'md5'])),
                                    tasks=[task_id]).delay()
                                task_id = md5_task.get()

                            to_be_merged[bid][ext][read]['tsk'] = [task_id]

        if not self.dry_run:
            task_ids = list()
            for bid, file_ext in to_be_merged.iteritems():
                for ext, reads in file_ext.iteritems():
                    for read, datasets in reads.iteritems():
                        task_ids.extend(datasets['tsk'])

            trigger_event.si(event='delivery_completed',
                             params=dict(
                                 progress_status_file=self.delivery_completed,
                                 delivery_id=self.delivery_id),
                             tasks=task_ids).delay()

            if self.merge:
                trigger_event.si(event='merge_completed',
                                 params=dict(
                                     progress_status_file=self.merge_completed,
                                     delivery_id=self.delivery_id),
                                 tasks=task_ids).delay()
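
format_dataset_filename() builds the destination file names, both per-lane (with uid=True) and merged (lane omitted); its implementation is not part of these examples, so this is only a hedged sketch consistent with the keyword arguments used above:

import uuid


def format_dataset_filename(sample_label, lane=None, read=None, ext='fastq.gz',
                            uid=False):
    # assumed layout: sample[_lane][_read][_uid].ext
    parts = [sample_label]
    if lane:
        parts.append(str(lane))
    if read:
        parts.append(str(read))
    if uid:
        parts.append(uuid.uuid4().hex[:8])
    return '{}.{}'.format('_'.join(parts), ext)
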
Example #14
    def run(self):
        path_exists(self.rd['path'], self.logger)

        rd_status_checks = rd_ready_to_be_preprocessed(
            user=self.user,
            group=self.group,
            path=self.rd['path'],
            rd_label=self.rd['label'],
            ssht_filename=self.samplesheet['filename'],
            ir_conf=self.ir_conf,
            io_conf=self.io_conf)

        check = rd_status_checks[0] and rd_status_checks[1] and \
                rd_status_checks[2][0] and rd_status_checks[2][1]

        check_sanitize_metadata = not rd_status_checks[3]

        if not check:
            self.logger.error("{} is not ready to be preprocessed".format(
                self.rd['label']))
            sys.exit()

        self.logger.info('Processing {}'.format(self.rd['label']))
        self.logger.info('running path {}'.format(self.rd['path']))
        self.logger.info('datasets path {}'.format(self.ds['path']))
        self.logger.info('samplesheet path {}'.format(self.samplesheet['path']))

        if self.emit_events:
            self.logger.info('quality check output path {}'.format(self.qc['path']))
            self.logger.info('quality check export path {}'.format(self.qc['export_path']))

        ensure_dir(self.ds['path'])

        irods_task = chain(
            sanitize_metadata.si(conf=self.ir_conf,
                                 ssht_filename=self.samplesheet['filename'],
                                 rd_label=self.rd['label'],
                                 sanitize=check_sanitize_metadata,
                                 logbook_path=self.logbook['path']
                                 ),

            copy_run_info_to_irods.si(conf=self.ir_conf,
                                      run_info_path=self.run_info['path'],
                                      rd_label=self.rd['label'],
                                      logbook_path=self.logbook['path']
                                      ),

            copy_run_parameters_to_irods.si(conf=self.ir_conf,
                                            run_parameters_path=self.run_parameters['path'],
                                            rd_label=self.rd['label'],
                                            logbook_path=self.logbook['path']
                                            ),
        )

        samplesheet_task = chain(

            copy_samplesheet_from_irods.si(conf=self.ir_conf,
                                           ssht_path=self.samplesheet['path'],
                                           rd_label=self.rd['label'],
                                           overwrite_samplesheet=self.overwrite_samplesheet,
                                           logbook_path=self.logbook['path']
                                           ),

            replace_values_into_samplesheet.si(conf=self.ir_conf,
                                               ssht_path=self.samplesheet['path'],
                                               rd_label=self.rd['label'],
                                               overwrite_samplesheet=self.overwrite_samplesheet,
                                               logbook_path=self.logbook['path']
                                               ),

        )

        # full pre-processing sequencing rundir pipeline
        pipeline = chain(
            dispatch_event.si(event='preprocessing_started',
                              params=dict(ds_path=self.ds['path'],
                                          rd_label=self.rd['label'],
                                          progress_status_file=self.started_file,
                                          emit_events=self.emit_events)),

            irods_task,
            samplesheet_task,

            replace_index_cycles_into_run_info.si(conf=self.ir_conf,
                                                  ssht_path=self.samplesheet['path'],
                                                  run_info_path=self.run_info['path'],
                                                  rd_label=self.rd['label'],
                                                  logbook_path=self.logbook['path']),

            bcl2fastq.si(rd_path=self.rd['path'],
                         ds_path=self.ds['path'],
                         ssht_path=self.samplesheet['path'],
                         run_info_path=self.run_info['path'],
                         no_lane_splitting=self.no_lane_splitting,
                         barcode_mismatches=self.barcode_mismatches,
                         with_failed_reads=self.with_failed_reads,
                         batch_queuing=self.batch_queuing,
                         queue_spec=self.queues_conf.get('q_bcl2fastq'),
                         logbook_path=self.logbook['path']),

            replace_index_cycles_into_run_info.si(conf=self.ir_conf,
                                                  ssht_path=self.samplesheet['path'],
                                                  run_info_path=self.run_info['path'],
                                                  rd_label=self.rd['label'],
                                                  logbook_path=self.logbook['path']),

            dispatch_event.si(event='fastq_ready',
                              params=dict(ds_path=self.ds['path'],
                                          qc_path=self.qc['path'],
                                          qc_export_path=self.qc['export_path'],
                                          force=True,
                                          rd_label=self.rd['label'],
                                          progress_status_file=self.completed_file,
                                          emit_events=self.emit_events)),
        )
        pipeline.delay()
Example #15
    def __fs2fs_carrier(self, input_paths, opath):

        self.delivery_started = os.path.join(opath,
                                             self.io_conf.get('delivery_started_file'))
        self.delivery_completed = os.path.join(opath,
                                               self.io_conf.get('delivery_completed_file'))

        self.merge_started = os.path.join(opath,
                                          self.io_conf.get('merge_started_file'))
        self.merge_completed = os.path.join(opath,
                                            self.io_conf.get('merge_completed_file'))

        bids = [_ for _ in self.delivery['samples_info'].keys() if self.delivery['samples_info'][_].get(
            'type') not in SAMPLE_TYPES_TOSKIP]

        if len(bids) > 0:
            for id, info in self.delivery['samples_info'].iteritems():
                batch_id = info.get('batch_id')
                path = os.path.join(opath, batch_id)
                if not self.dry_run and not os.path.exists(path):
                    ensure_dir(path)

        self.logger.info('Looking for files related to {} Bika ids'.format(len(bids)))

        dm = DatasetsManager(self.logger, bids)
        for path in input_paths:
            if self.runs and isinstance(self.runs, list) and len(self.runs) > 0:
                for run in self.runs:
                    ipath = os.path.join(path, run)
                    if os.path.exists(ipath):
                        self.logger.info('Searching in {}'.format(ipath))
                        datasets_info, count = dm.collect_fastq_from_fs(ipath)
                        self.logger.info("found {} files in {}".format(count, ipath))
            else:
                ipath = path
                if os.path.exists(ipath):
                    self.logger.info('Searching in {}'.format(ipath))
                    datasets_info, count = dm.collect_fastq_from_fs(ipath)
                    self.logger.info("found {} files in {}".format(count, ipath))

        datasets_info = dm.fastq_collection
        count = dm.fastq_counter

        self.logger.info("found {} files".format(count))

        to_be_merged = dict()

        if not self.dry_run:
            dispatch_event.si(event='delivery_started',
                              params=dict(progress_status_file=self.delivery_started, delivery_id=self.delivery_id)
                              ).delay()

        for bid in bids:
            sample_label = self.samples_info[bid].get('client_sample_id')

            if bid not in to_be_merged:
                to_be_merged[bid] = dict()

            if bid in datasets_info:
                for f in datasets_info[bid]:
                    src = f.get('filepath')
                    read = f.get('read_label')
                    lane = f.get('lane')
                    ext = f.get('file_ext')
                    
                    batch_id = self.samples_info[bid].get('batch_id')

                    filename = format_dataset_filename(sample_label=sample_label,
                                                       lane=lane,
                                                       read=read,
                                                       ext=ext,
                                                       uid=True)

                    dst = os.path.join(opath, batch_id, filename)

                    self.logger.info("Coping {} into {}".format(src, dst))

                    if os.path.isfile(dst):
                        self.logger.info('{} skipped'.format(os.path.basename(
                            dst)))
                    else:
                        if not self.dry_run:
                            tsk = copy.si(src, dst).delay()
                            self.logger.info(
                                '{} copied'.format(os.path.basename(dst)))

                        if self.merge:
                            to_be_merged[bid][ext] = dict() if ext not in to_be_merged[bid] else to_be_merged[bid][ext]

                            if read not in to_be_merged[bid][ext]:
                                to_be_merged[bid][ext][read] = dict(src=list(), dst=list(), tsk=list())

                            to_be_merged[bid][ext][read]['src'].append(src)
                            to_be_merged[bid][ext][read]['dst'].append(dst)

                            if not self.dry_run and tsk:
                                to_be_merged[bid][ext][read]['tsk'].append(tsk.task_id)
                        else:
                            if self.md5_check:
                                # MD5 CHECKSUM
                                self.logger.info("Getting MD5 hash of {}".format(dst))
                                if not self.dry_run:
                                    md5_task = trigger_event.si(event='get_md5_checksum',
                                                                params=dict(src=dst,
                                                                            dst=".".join([dst, 'md5'])),
                                                                tasks=[tsk.task_id]).delay()
                                    task_id = md5_task.get()

            else:
                msg = 'I have not found any file related to this ' \
                      'Bika id: {}'.format(bid)

                self.logger.warning(msg)
                self.logger.info('{} skipped'.format(bid))
                del to_be_merged[bid]

        if self.merge:
            if not self.dry_run:
                dispatch_event.si(event='merge_started',
                                  params=dict(progress_status_file=self.merge_started)
                                  ).delay()

            for bid, file_ext in to_be_merged.iteritems():
                sample_label = self.samples_info[bid].get('client_sample_id')
                for ext, reads in file_ext.iteritems():
                    for read, datasets in reads.iteritems():

                        filename = format_dataset_filename(sample_label=sample_label,
                                                           read=read,
                                                           ext=ext)
                        src = datasets['dst']
                        dst = os.path.join(opath, batch_id, filename)
                        tsk = datasets['tsk']

                        self.logger.info("Merging {} into {}".format(" ".join(src), dst))
                        if not self.dry_run:
                            merge_task = trigger_event.si(event='merge_datasets',
                                                          params=dict(src=src,
                                                                      dst=dst,
                                                                      remove_src=True),
                                                          tasks=tsk).delay()
                            task_id = merge_task.get()
                            if self.md5_check:
                                # MD5 CHECKSUM
                                self.logger.info("Getting MD5 hash of {}".format(dst))
                                md5_task = trigger_event.si(event='get_md5_checksum',
                                                            params=dict(src=dst,
                                                                        dst=".".join([dst, 'md5'])),
                                                            tasks=[task_id]).delay()
                                task_id = md5_task.get()

                            to_be_merged[bid][ext][read]['tsk'] = [task_id]

        if not self.dry_run:
            task_ids = list()
            for bid, file_ext in to_be_merged.iteritems():
                for ext, reads in file_ext.iteritems():
                    for read, datasets in reads.iteritems():
                        task_ids.extend(datasets['tsk'])

            trigger_event.si(event='delivery_completed',
                             params=dict(progress_status_file=self.delivery_completed, delivery_id=self.delivery_id),
                             tasks=task_ids).delay()

            if self.merge:
                trigger_event.si(event='merge_completed',
                                 params=dict(progress_status_file=self.merge_completed, delivery_id=self.delivery_id),
                                 tasks=task_ids).delay()
Example #16
def implementation(logger, args):
    def get_profile(profile_label, profile_path, logger_):
        file_path = os.path.join(profile_path, '{}.yaml'.format(profile_label))

        if path_exists(file_path, logger_, force=False):
            msg = "Profile found at {}".format(file_path)
            print(msg)
            logger.info(msg)
            profile = load(file_path)
            return profile
        logger.info("Profile not found at {}".format(file_path))
        return None

    def write_profile(default_config, pl_, profile_label, profile_path,
                      logger_):
        def merge_two_dicts(x, y):
            z = x.copy()  # start with x's keys and values
            z.update(y)  # modifies z with y's keys and values & returns None
            return z

        file_path = os.path.join(profile_path, '{}.yaml'.format(profile_label))
        if path_exists(file_path, logger_, force=False) and not args.force:
            msg = "{} profile already exists".format(file_path)
            print(msg)
            logger.error(msg)
            # sys.exit()
        else:
            to_dump = merge_two_dicts(
                default_config,
                pl_.playbook_vars_template(project_name=profile_label))
            dump(to_dump, file_path)
            logger.info("Created {} profile".format(file_path))
            print("Edit variables value into the {} file".format(file_path))
        return

    profile_label, ext = os.path.splitext(args.profile)
    profile_path = os.path.join(profile_dir, args.label)
    ensure_dir(profile_path)

    plm = PipelinesManager(args)
    pl = plm.get_pipeline(args.label)
    profile = get_profile(profile_label, profile_path, logger)

    path_from_cli = args.config_file if 'config_file' in vars(args) else None
    cm = ConfigurationManager(args=args, path_from_cli=path_from_cli)
    default_config = cm.get_default_config

    if args.create_profile and not args.deployment:
        write_profile(default_config, pl, profile_label, profile_path, logger)
        return

    if args.deployment and not args.create_profile:
        if profile:
            host = args.host
            remote_user = args.remote_user
            connection = args.connection
            pl.instantiate(host, remote_user, connection, profile)
            return
    if not profile:
        msg = 'Profile "{}" not found. Have you created it? \n' \
              'Digit "solida setup --help" for more details'.format(profile_label)
        print(msg)
        logger.error(msg)