Example #1
    def test_fetch_db_data_and_prepare_gviz_json(self):
        pp = Project_pooling_info(dbconfig_file=self.dbconfig)
        temp_dir = get_temp_dir()
        temp_file = os.path.join(temp_dir, 'test.json')
        pp.fetch_db_data_and_prepare_gviz_json(output_file_path=temp_file)
        self.assertTrue(os.path.exists(temp_file))
        remove_dir(temp_dir)
    def tearDown(self):
        remove_dir(dir_path=self.results_dir)
        if os.path.exists(self.output_tar_file):
            os.remove(self.output_tar_file)

        if os.path.exists(self.output_targz_file):
            os.remove(self.output_targz_file)
Example #3
def extract_cellranger_count_metrics_summary(
        cellranger_tar,
        collection_name=None,
        collection_type=None,
        attribute_name='attribute_name',
        attribute_value='attribute_value',
        attribute_prefix=None,
        target_filename='metrics_summary.csv'):
    '''
  A function for extracting the metrics summary file from a cellranger output tar and parsing it.
  Optionally it can add the collection name and type info to the output dictionary.

  :param cellranger_tar: A cellranger output tar file
  :param target_filename: A filename for the metrics summary file lookup, default metrics_summary.csv
  :param collection_name: Optional collection name, default None
  :param collection_type: Optional collection type, default None
  :param attribute_name: Column label for the attribute names, default attribute_name
  :param attribute_value: Column label for the attribute values, default attribute_value
  :param attribute_prefix: An optional string to add as prefix to the attribute names, default None
  :returns: A list of dictionaries containing the metrics values
  '''
    try:
        check_file_path(cellranger_tar)
        temp_work_dir = get_temp_dir(use_ephemeral_space=False)
        metrics_file = None
        with tarfile.open(cellranger_tar, mode='r') as tar:
            for file_name in tar.getnames():
                if os.path.basename(file_name) == target_filename:
                    tar.extract(file_name, path=temp_work_dir)
                    metrics_file = os.path.join(temp_work_dir, file_name)

        if metrics_file is None:
            raise IOError('Required file {0} not found in tar {1}'.\
                          format(target_filename,cellranger_tar))

        attribute_data = pd.read_csv(metrics_file).T.\
                         reset_index()
        attribute_data.columns = [attribute_name, attribute_value]
        if attribute_prefix is None:
            attribute_data[attribute_name] = \
              attribute_data[attribute_name].\
                map(lambda x: x.replace(' ','_'))
        else:
            attribute_data[attribute_name] = \
              attribute_data[attribute_name].\
                map(lambda x: \
                    '{0}_{1}'.format(\
                      attribute_prefix,
                      x.replace(' ','_')))

        if collection_name is not None:
            attribute_data['name'] = collection_name
        if collection_type is not None:
            attribute_data['type'] = collection_type

        attribute_data = attribute_data.\
                         to_dict(orient='records')
        remove_dir(temp_work_dir)
        return attribute_data
    except:
        raise
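A minimal usage sketch for the function above; the tar path and collection values are hypothetical, and the function is assumed to be in scope:

metrics = \
  extract_cellranger_count_metrics_summary(
    cellranger_tar='/path/to/sample_cellranger_count.tar',  # hypothetical cellranger output tar
    collection_name='IGFQ0001_sample1',                     # hypothetical collection name
    collection_type='CELLRANGER_METRICS',                   # hypothetical collection type
    attribute_prefix=None)                                  # keep original metric names, spaces replaced by '_'
for row in metrics:
    print(row.get('attribute_name'), row.get('attribute_value'))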
Example #4
def generate_ipynb_from_template(template_ipynb_path,
                                 output_dir,
                                 param_dictionary,
                                 date_tag='date_tag',
                                 use_ephemeral_space=False):
    '''
  A function for generating a notebook (ipynb) file from a template file with param substitution

  :param template_ipynb_path: A template IPYNB file path
  :param output_dir: Output path
  :param param_dictionary: A dictionary containing the params for final notebook
  :param date_tag: A text for date tag name, default date_tag
  :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False
  :returns: None
  '''
    try:
        check_file_path(template_ipynb_path)
        check_file_path(output_dir)
        if not isinstance(param_dictionary, dict):
            raise TypeError(
                    "Expecting a dictionary, got {0}".\
                      format(type(param_dictionary)))
        date_tag_value = \
          datetime.\
            strftime(
              datetime.now(),
              '%Y-%b-%d %H:%M')                                                     # date tag values
        param_dictionary.\
          update(dict(date_tag=date_tag_value))                                     # adding date tag values to params
        temp_dir = \
          get_temp_dir(
            use_ephemeral_space=use_ephemeral_space)
        temp_output = \
          os.path.join(
            temp_dir,
            os.path.basename(template_ipynb_path))
        final_output = \
          os.path.join(
            output_dir,
            os.path.basename(template_ipynb_path))
        template_env = \
          Environment(
            loader=\
              FileSystemLoader(
                searchpath=os.path.dirname(template_ipynb_path)),
            autoescape=select_autoescape(['html', 'xml']))
        notebook = \
          template_env.\
            get_template(
              os.path.basename(template_ipynb_path))
        notebook.\
          stream(**param_dictionary).\
          dump(temp_output)                                                         # write temp ipynb file with param substitution
        copy_local_file(temp_output, final_output)
        remove_dir(temp_dir)
    except Exception as e:
        raise ValueError(
                "Failed to generate ipynb file from template {1}, error: {0}".\
                  format(e,template_ipynb_path))
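A short usage sketch for the function above; the template path, output directory and template params are hypothetical:

generate_ipynb_from_template(
    template_ipynb_path='/templates/report_template.ipynb',  # hypothetical Jinja2 notebook template
    output_dir='/results/project_reports',                   # hypothetical existing output directory
    param_dictionary={'project_name': 'IGFQ0001'})           # keys must match placeholders in the template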
Example #5
    def run_HaplotypeCaller(self,
                            input_bam,
                            output_vcf_path,
                            dbsnp_vcf,
                            emit_gvcf=True,
                            force=False,
                            dry_run=False,
                            gatk_param_list=None):
        '''
    A method for running GATK HaplotypeCaller
    
    :param input_bam: An input bam file
    :param output_vcf_path: An output vcf filepath
    :param dbsnp_vcf: A dbsnp vcf file
    :param emit_gvcf: A toggle for GVCF generation, default True
    :param force: Overwrite the output file if force is True
    :param dry_run: Return the GATK command without running it, default False
    :param gatk_param_list: List of additional params for GATK HaplotypeCaller, default None
    :returns: GATK commandline
    '''
        try:
            self._run_gatk_checks()  # run initial checks
            check_file_path(input_bam)
            check_file_path(dbsnp_vcf)
            temp_dir = \
              get_temp_dir(use_ephemeral_space=self.use_ephemeral_space)              # get temp dir
            temp_output = \
              os.path.join(
                temp_dir,
                os.path.basename(output_vcf_path))
            gatk_cmd = [
                quote(self.gatk_exe), "HaplotypeCaller", "-I",
                quote(input_bam), "-O",
                quote(temp_output), "--reference",
                quote(self.ref_fasta), "--dbsnp",
                quote(dbsnp_vcf), "--java-options",
                quote(self.java_param)
            ]
            if emit_gvcf:
                gatk_cmd.extend(["--emit-ref-confidence", "GVCF"])
            if gatk_param_list is not None and \
               isinstance(gatk_param_list,list) and \
               len(gatk_param_list) > 0:
                gatk_cmd.extend(gatk_param_list)  # additional params
            gatk_cmd = ' '.join(gatk_cmd)
            if dry_run:
                return gatk_cmd

            subprocess.check_call(gatk_cmd, shell=True)
            copy_local_file(source_path=temp_output,
                            destinationa_path=output_vcf_path,
                            force=force)
            remove_dir(temp_dir)
            return gatk_cmd
        except Exception as e:
            raise ValueError(
                    "Failed to run GATK HaplotypeCaller, error: {0}".\
                      format(e))
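A dry-run sketch for the method above; the wrapper class name (GATK_tools) and its constructor arguments are assumptions, not confirmed by this example:

gatk = \
  GATK_tools(                                              # hypothetical wrapper class exposing run_HaplotypeCaller
    gatk_exe='/software/gatk/gatk',                        # hypothetical GATK executable
    ref_fasta='/ref/genome.fa',                            # hypothetical reference fasta
    java_param='-Xmx8g',
    use_ephemeral_space=False)
print(
  gatk.run_HaplotypeCaller(
    input_bam='/data/sample.bam',                          # hypothetical input bam
    output_vcf_path='/results/sample.g.vcf.gz',            # hypothetical output path
    dbsnp_vcf='/ref/dbsnp.vcf.gz',                         # hypothetical dbsnp vcf
    dry_run=True))                                         # return the command line without running GATK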
def singularity_run(image_path,
                    path_bind,
                    args_list,
                    container_dir='/tmp',
                    return_results=True,
                    use_ephemeral_space=False,
                    dry_run=False):
    '''
  A wrapper function for running singularity based containers

  :param image_path: Singularity image path
  :param path_bind: Path to bind to the singularity container dir
  :param args_list: List of args for the singularity run
  :param container_dir: Container directory to bind path_bind to, default /tmp
  :param return_results: Return singularity run results, default True
  :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False
  :param dry_run: Return the singularity command without run, default False
  :returns: A response from container run and a string containing singularity command line
  '''
    try:
        check_file_path(image_path)
        check_file_path(path_bind)
        temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
        res = None
        temp_image_path = \
          os.path.join(
            temp_dir,
            os.path.basename(image_path))
        copy_local_file(image_path, temp_image_path)  # copy image to tmp dir
        if not isinstance(args_list, list) or \
           len(args_list) == 0:
            raise ValueError(
                'No args provided for singularity run')  # safemode
        args = ' '.join(args_list)  # flatten args
        singularity_run_cmd = \
          'singularity run {0} --bind {1}:{2} {3}'.\
            format(
              temp_image_path,
              path_bind,
              container_dir,
              args)
        if dry_run:
            return res, singularity_run_cmd
        else:
            res = \
              Client.run(
                image=temp_image_path,
                bind='{0}:{1}'.format(path_bind,container_dir),
                args=args,
                return_result=return_results)
            remove_dir(temp_dir)  # remove copied image after run
            return res, singularity_run_cmd
    except Exception as e:
        raise ValueError(
                'Failed to run image {0}, error: {1}'.\
                  format(image_path,e))
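A dry-run sketch for the wrapper above; the image and bind paths are hypothetical, and even a dry run expects them to exist since they are checked and the image is copied to a temp dir first:

_, run_cmd = \
  singularity_run(
    image_path='/images/fastqc.sif',                 # hypothetical singularity image
    path_bind='/data/run1',                          # hypothetical host dir bound to /tmp in the container
    args_list=['fastqc', '/tmp/sample.fastq.gz'],    # hypothetical command and args
    dry_run=True)                                    # return the command line without running the container
print(run_cmd)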
Example #7
    def run_AnalyzeCovariates(self,
                              before_report_file,
                              after_report_file,
                              output_pdf_path,
                              force=False,
                              dry_run=False,
                              gatk_param_list=None):
        '''
    A method for running GATK AnalyzeCovariates tool
    
    :param before_report_file: A file containing bqsr output before recalibration
    :param after_report_file: A file containing bqsr output after recalibration
    :param output_pdf_path: An output pdf filepath
    :param force: Overwrite the output file if force is True
    :param dry_run: Return the GATK command without running it, default False
    :param gatk_param_list: List of additional params for GATK, default None
    :returns: GATK commandline
    '''
        try:
            self._run_gatk_checks()  # run initial checks
            check_file_path(before_report_file)
            check_file_path(after_report_file)
            temp_dir = \
              get_temp_dir(use_ephemeral_space=self.use_ephemeral_space)              # get temp dir
            temp_output = \
              os.path.join(
                temp_dir,
                os.path.basename(output_pdf_path))
            gatk_cmd = [
                quote(self.gatk_exe), "AnalyzeCovariates",
                "--before-report-file",
                quote(before_report_file), "--after-report-file",
                quote(after_report_file), "--plots-report-file",
                quote(temp_output), "--java-options",
                quote(self.java_param)
            ]
            if gatk_param_list is not None and \
               isinstance(gatk_param_list,list) and \
               len(gatk_param_list) > 0:
                gatk_cmd.extend(gatk_param_list)  # additional params
            gatk_cmd = ' '.join(gatk_cmd)
            if dry_run:
                return gatk_cmd

            subprocess.check_call(gatk_cmd, shell=True)
            copy_local_file(source_path=temp_output,
                            destinationa_path=output_pdf_path,
                            force=force)
            remove_dir(temp_dir)
            return gatk_cmd
        except Exception as e:
            raise ValueError(
                    "Failed to run GATK AnalyzeCovariates, error: {0}".\
                      format(e))
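A dry-run sketch for the method above, reusing the hypothetical gatk wrapper instance from the HaplotypeCaller sketch; all file paths are placeholders:

print(
  gatk.run_AnalyzeCovariates(
    before_report_file='/results/sample_before_bqsr.table',  # hypothetical BQSR report before recalibration
    after_report_file='/results/sample_after_bqsr.table',    # hypothetical BQSR report after recalibration
    output_pdf_path='/results/sample_bqsr_covariates.pdf',   # hypothetical output pdf
    dry_run=True))                                           # return the command line without running GATK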
  def run(self):
    try:
      project_igf_id = self.param_required('project_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      remote_project_path = self.param_required('remote_project_path')
      igf_session_class = self.param_required('igf_session_class')
      remote_user = self.param_required('remote_user')
      remote_host = self.param_required('remote_host')
      status_data_json = self.param('status_data_json')
      demultiplexing_pipeline_name = self.param_required('demultiplexing_pipeline_name')
      analysis_pipeline_name = self.param_required('analysis_pipeline_name')
      use_ephemeral_space = self.param('use_ephemeral_space')

      temp_work_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)     # get a temp dir
      ps = \
        Project_status(\
          igf_session_class=igf_session_class,
          project_igf_id=project_igf_id)
      temp_status_output = \
        os.path.join(\
          temp_work_dir,
          status_data_json)                                                     # get path for temp status file
      remote_project_dir = \
        os.path.join(\
          remote_project_path,
          project_igf_id)                                                       # get remote project directory path
      ps.generate_gviz_json_file(\
        output_file=temp_status_output,
        demultiplexing_pipeline=demultiplexing_pipeline_name,
        analysis_pipeline=analysis_pipeline_name)                               # write data to output json file
      remote_file_path = \
        os.path.join(\
          remote_project_dir,
          status_data_json)
      self._check_and_copy_remote_file(\
        remote_user=remote_user,
        remote_host=remote_host,
        source_file=temp_status_output,
        remote_file=remote_file_path)                                           # copy file to remote
      self.param('dataflow_params',
                 {'remote_project_info':'done'})
      remove_dir(temp_work_dir)                                                 # remove temp dir
    except Exception as e:
      message = \
        'project: {2}, sample:{3}, Error in {0}: {1}'.\
          format(\
            self.__class__.__name__,
            e,
            project_igf_id,
            sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
Example #9
    def _notify_about_new_user_account(self,data,user_col='username',\
                             password_col='password',hpc_user_col='hpc_username',\
                             name_col='name',email_id_col='email_id'):
        '''
    An internal method for sending an email to a new user with their password
    
    :param data: A pandas series containing user data
    :param user_col: Column name for username, default username
    :param password_col: Column name for password, default password
    :param hpc_user_col: Column name for hpc_username, default hpc_username
    :param name_col: Column name for name, default name
    :param email_id_col: Column name for email id, default email_id
    '''
        try:
            if not isinstance(data, pd.Series):
                raise ValueError('Expecting a pandas series and got {0}'.\
                                 format(type(data)))
            username = data[user_col]
            fullname = data[name_col]
            password = data[password_col]
            email_id = data[email_id_col]

            if hpc_user_col not in data or pd.isnull(
                    data[hpc_user_col]):  # send email only to non-hpc users
                template_dir = os.path.dirname(self.user_account_template)
                template_env=Environment(loader=FileSystemLoader(searchpath=template_dir), \
                                         autoescape=select_autoescape(['html','xml']))  # set template env
                template_file=template_env.\
                              get_template(os.path.basename(self.user_account_template))
                temp_work_dir = get_temp_dir()  # get a temp dir
                report_output_file = os.path.join(temp_work_dir,
                                                  'email_template.txt')
                template_file.\
                  stream(userEmail=email_id, \
                         fullName=fullname,\
                         userName=username,\
                         userPass=password,\
                        ).\
                  dump(report_output_file)
                read_cmd = ['cat', quote(report_output_file)]
                proc = subprocess.Popen(read_cmd, stdout=subprocess.PIPE)
                sendmail_cmd = [self.sendmail_exe, '-t']
                subprocess.check_call(sendmail_cmd, stdin=proc.stdout)
                proc.stdout.close()
                proc.wait()  # wait for the cat process to finish
                if proc.returncode != 0:
                    raise ValueError('Failed running command {0}:{1}'.format(read_cmd,\
                                                                             proc.returncode))
                remove_dir(temp_work_dir)
        except:
            raise
    def run(self):
        try:
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            path = self.param_required('path')
            cleanup_status = self.param_required('cleanup_status')

            message = None
            if cleanup_status:
                if not os.path.exists(path):
                    raise IOError('path {0} is not accessible'.format(path))

                if os.path.isdir(path):
                    remove_dir(path)
                    message = 'removed dir {0}'.format(path)
                elif os.path.isfile(path):
                    os.remove(path)
                    message = 'removed file {0}'.format(path)

                else:
                    message = 'path {0} is not a file or directory, skipped removing'.\
                              format(path)

            else:
                message = 'Not removing path {0} as cleanup_status is not True'.\
                          format(path)

            self.param('dataflow_params', {
                'path': path,
                'cleanup_status': cleanup_status
            })  # set dataflow params
            if message:
                self.post_message_to_slack(
                    message, reaction='pass')  # send msg to slack
                self.comment_asana_task(task_name=seqrun_igf_id,
                                        comment=message)  # send msg to asana
        except Exception as e:
            message = \
              'seqrun: {2}, Error in {0}: {1}'.\
                format(\
                  self.__class__.__name__,
                  e,
                  seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Example #11
def validation_home():
    form = ValidationForm()
    if form.validate_on_submit():
        temp_dir = get_temp_dir()
        new_metadata_list = list()
        counter = 0
        for file in form.metadata_file.data:
            counter += 1
            filename = secure_filename(file.filename)
            file.save(\
              os.path.join(\
                temp_dir,
                '{0}_{1}'.format(counter,filename)))
            new_metadata_list.\
              append(\
                os.path.join(\
                  temp_dir,
                  '{0}_{1}'.format(counter,filename)))
        samplesheet_filename = \
          secure_filename(form.samplesheet_file.data.filename)
        form.samplesheet_file.\
          data.save(\
            os.path.join(\
              temp_dir,
              samplesheet_filename))
        new_samplesheet = \
          os.path.join(\
            temp_dir,
            samplesheet_filename)
        logging.warning(form.recaptcha.errors)
        vp = \
          Validate_project_and_samplesheet_metadata(\
            samplesheet_file=new_samplesheet,
            metadata_files=new_metadata_list,
            samplesheet_schema=app.config.get('SAMPLESHEET_SCHEMA'),
            metadata_schema=app.config.get('METADATA_SCHEMA'))
        json_data = vp.convert_errors_to_gviz()
        remove_dir(temp_dir)
        return render_template('validation/results.html', jsonData=json_data)
    else:
        if request.method == 'POST':
            flash('Failed input validation check')
    return render_template('validation/validate_metadata.html', form=form)
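The Flask route decorator is not shown in the example above; a minimal sketch of one way such a view could be registered, with an assumed URL path:

# hypothetical registration; the original app more likely uses an @app.route decorator on the view
app.add_url_rule(
    '/validation',                   # assumed URL path
    view_func=validation_home,
    methods=['GET', 'POST'])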
def metadata_home():
    try:
        csv_data = ''
        form = MetadataForm()
        if form.validate_on_submit():
            temp_dir = get_temp_dir()
            metadata_filename = \
              secure_filename(form.metadata_file.data.filename)
            form.metadata_file.\
              data.save(\
                os.path.join(\
                  temp_dir,
                  metadata_filename))
            new_metadata_file = \
              os.path.join(\
                temp_dir,
                metadata_filename)
            try:
                csv_data = \
                  run_metadata_reformatting(\
                    metadata_file=new_metadata_file,\
                    output_dir=temp_dir)
            except Exception as e:
                flash('Failed metadata file reformatting')
                logging.warning(e)
            remove_dir(temp_dir)
            if csv_data != '':
                return \
                  Response(\
                    csv_data,
                    mimetype="text/csv",
                    headers={"Content-disposition":
                             "attachment; filename=reformatted_metadata.csv"})
        else:
            if request.method == 'POST':
                flash('Failed file type validation check')
    except Exception as e:
        logging.warning('Failed metadata reformatting, error: {0}'.format(e))

    return render_template('metadata/metadata_reformat.html', form=form)
    def run_ppqt(self, input_bam, output_dir, output_spp_name,
                 output_pdf_name):
        '''
    A method for running PPQT on input bam

    :param input_bam: Input bam file
    :param output_spp_name: Output spp out file
    :param output_pdf_name: Output pdf plot
    :param output_dir: Destination output dir
    :returns: PPQT run command as list, spp and pdf output paths, and a list or dictionary of spp.out metrics
    '''
        try:
            temp_dir = \
              get_temp_dir(use_ephemeral_space=self.use_ephemeral_space)
            run_cmd = \
              self._pre_process(\
                input_bam=input_bam,
                output_spp_name=output_spp_name,
                output_pdf_name=output_pdf_name,
                output_dir=temp_dir,
                temp_dir=temp_dir)                                                    # preprocess and fetch run cmd

            subprocess.check_call(\
              ' '.join(run_cmd),
              shell=True)                                                             # run ppqt and capture stdout

            spp_output, pdf_output = \
              self._post_process(\
                output_spp_name=output_spp_name,
                output_pdf_name=output_pdf_name,
                output_dir=output_dir,
                temp_dir=temp_dir)                                                    # copy files from temp dir
            remove_dir(temp_dir)  # clean up temp dir
            spp_data = self._parse_spp_output(spp_file=spp_output)
            return run_cmd, spp_output, pdf_output, spp_data
        except:
            raise
Example #14
def merge_multiple_bam(samtools_exe,
                       input_bam_list,
                       output_bam_path,
                       sorted_by_name=False,
                       use_ephemeral_space=0,
                       threads=1,
                       force=False,
                       dry_run=False,
                       index_output=True):
    '''
  A function for merging multiple input bams into a single output bam
  
  :param samtools_exe: samtools executable path
  :param input_bam_list: A file containing list of bam filepath
  :param output_bam_path: A bam output filepath
  :param sorted_by_name: Sort bam file by read_name, default False (for coordinate sorted bams)
  :param threads: Number of threads to use for merging, default 1
  :param force: Output bam file will be overwritten if force is True, default False
  :param index_output: Index output bam, default True
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :return: samtools command
  '''
    try:
        check_file_path(samtools_exe)
        check_file_path(input_bam_list)
        with open(input_bam_list, 'r') as fp:
            for bam in fp:
                check_file_path(bam.strip())

        temp_dir = \
          get_temp_dir(use_ephemeral_space=use_ephemeral_space)
        temp_bam = \
          os.path.join(\
            temp_dir,
            os.path.basename(output_bam_path))
        merge_cmd = \
          [quote(samtools_exe),
           'merge',
           '--output-fmt','BAM',
           '--threads',quote(str(threads)),
           '-b',quote(input_bam_list)
          ]
        if sorted_by_name:
            merge_cmd.append('-n')  # Input files are sorted by read name

        merge_cmd.append(temp_bam)
        if dry_run:
            return merge_cmd

        subprocess.check_call(merge_cmd)  # run samtools merge
        copy_local_file(\
          source_path=temp_bam,
          destinationa_path=output_bam_path,
          force=force)                                                              # copy bamfile
        remove_dir(temp_dir)  # remove temp dir
        _check_bam_file(output_bam_path)
        if index_output and \
           not sorted_by_name:
            index_bam_or_cram(\
              samtools_exe=samtools_exe,
              input_path=output_bam_path,
              threads=threads)
        return merge_cmd
    except:
        raise
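A dry-run usage sketch; the paths are hypothetical and the bam list file is expected to contain one bam path per line:

print(
  merge_multiple_bam(
    samtools_exe='/software/samtools/samtools',      # hypothetical samtools executable
    input_bam_list='/data/sample_bam_list.txt',      # hypothetical file listing input bam paths
    output_bam_path='/results/sample_merged.bam',    # hypothetical merged output path
    threads=4,
    dry_run=True))                                   # return the merge command without running samtools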
Example #15
def run_sort_bam(samtools_exe,
                 input_bam_path,
                 output_bam_path,
                 sort_by_name=False,
                 use_ephemeral_space=0,
                 threads=1,
                 force=False,
                 dry_run=False,
                 cram_out=False,
                 index_output=True):
    '''
  A function for sorting an input bam file and generating an output bam
  
  :param samtools_exe: samtools executable path
  :param input_bam_path: A bam filepath
  :param output_bam_path: A bam output filepath
  :param sort_by_name: Sort bam file by read_name, default False (for coordinate sorting)
  :param threads: Number of threads to use for sorting, default 1
  :param force: Output bam file will be overwritten if force is True, default False
  :param cram_out: Output cram file, default False
  :param index_output: Index output bam, default True
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :return: None (or the samtools sort command as list, if dry_run is True)
  '''
    try:
        check_file_path(samtools_exe)
        _check_bam_file(bam_file=input_bam_path)
        sort_cmd = \
          [quote(samtools_exe),
           'sort',
           '-@{0}'.format(quote(str(threads)))
          ]
        if sort_by_name:
            sort_cmd.append('-n')  # sorting by read name

        if cram_out:
            sort_cmd.append('--output-fmt CRAM')
        else:
            sort_cmd.append('--output-fmt BAM')

        temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
        temp_bam = \
          os.path.join(\
            temp_dir,
            os.path.basename(output_bam_path))

        sort_cmd.extend(['-o', quote(temp_bam)])
        sort_cmd.append(quote(input_bam_path))
        if dry_run:
            return sort_cmd

        subprocess.check_call(' '.join(sort_cmd), shell=True)                      # run samtools sort

        copy_local_file(\
          source_path=temp_bam,
          destinationa_path=output_bam_path,
          force=force)                                                              # copy output bam
        remove_dir(temp_dir)  # remove temp dir
        if cram_out:
            _check_cram_file(output_bam_path)
        else:
            _check_bam_file(output_bam_path)

        if index_output:
            index_bam_or_cram(\
              samtools_exe=samtools_exe,
              input_path=output_bam_path,
              threads=threads)
    except:
        raise
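A dry-run sketch mirroring the merge example above; the paths are hypothetical:

print(
  run_sort_bam(
    samtools_exe='/software/samtools/samtools',      # hypothetical samtools executable
    input_bam_path='/data/sample.bam',               # hypothetical unsorted input bam
    output_bam_path='/results/sample_sorted.bam',    # hypothetical sorted output path
    threads=4,
    dry_run=True))                                   # return the sort command without running samtools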
                  time_tuple.tm_mday,
                  time_tuple.tm_hour,
                  time_tuple.tm_min,
                  time_tuple.tm_sec)
            file_name = \
              'samplesheet_metadata_check_failed_{0}.txt'.\
                format(time_stamp)
            file_name = os.path.join(msg_tmp_dir, file_name)
            with open(file_name, 'w') as fp:
                fp.write(message)  # write message file for slack
            message = 'samplesheet metadata check message : {0}'.format(
                time_stamp)
            slack_obj.post_file_to_channel(
                filepath=file_name, message=message
            )  # post samplesheet metadata check results to slack
            remove_dir(msg_tmp_dir)  # remove temp dir

        if len(new_seqruns.keys()) > 0:
            temp_dir = get_temp_dir()  # create temp dir
            new_seqruns,error_files = \
              validate_samplesheet_for_seqrun(
                seqrun_info=new_seqruns,
                schema_json=samplesheet_json_schema,
                output_dir=temp_dir)# validate samplesheet for seqruns
            if len(error_files.keys()) > 0:
                for seqrun_name, error_file_path in error_files.items():
                    message = \
                      'Samplesheet validation failed for run {0}'.\
                        format(seqrun_name)
                    slack_obj.post_file_to_channel(
                        filepath=error_file_path,
  def run(self):
    try:
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      demultiplexing_stats_file = self.param_required('demultiplexing_stats_file')
      qc_files = self.param_required('qc_files')
      fastq_dir = self.param_required('fastq_dir')
      multiqc_exe = self.param('multiqc_exe')
      multiqc_options = self.param('multiqc_options')
      multiqc_dir_label = self.param('multiqc_dir_label')
      force_overwrite = self.param('force_overwrite')
      base_results_dir = self.param_required('base_results_dir')
      project_name = self.param_required('project_name')
      seqrun_date = self.param_required('seqrun_date')
      flowcell_id = self.param_required('flowcell_id')
      tag = self.param_required('tag')
      multiqc_template_file = self.param_required('multiqc_template_file')
      tool_order_list = self.param('tool_order_list')
      model_name = self.param('model_name')
      use_ephemeral_space = self.param('use_ephemeral_space')
      if tag not in ['known','undetermined']:
        raise ValueError('unknown status tag {0}'.format(tag))                  # check valid status tags

      lane_index_info = os.path.basename(fastq_dir)                             # get lane and index info
      fastqc_files = list()
      fastqscreen_files = list()
      fastqc_files.\
        extend([fqc_dir
                  for fqc_dir in qc_files['fastqc']])
      fastqscreen_files.\
        extend([fsr_dir
                  for fsr_dir in qc_files['fastqscreen']])
      multiqc_result_dir = \
        os.path.join(\
          base_results_dir,
          project_name,
          seqrun_date,
          flowcell_id,
          lane_index_info,
          tag,
          multiqc_dir_label)                                                    # get multiqc final output path
      if os.path.exists(multiqc_result_dir) and \
         force_overwrite:
        remove_dir(multiqc_result_dir)                                          # remove existing output dir if force_overwrite is true

      if not os.path.exists(multiqc_result_dir):
        os.makedirs(multiqc_result_dir,mode=0o775)                              # create output dir if its not present

      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp work dir
      multiqc_input_list = \
        os.path.join(\
          temp_work_dir,
          'multiqc_input_file.txt')                                             # get name of multiqc input file
      demultiplexing_stats_file = \
        os.path.join(\
          fastq_dir,
          demultiplexing_stats_file)
      with open(multiqc_input_list,'w') as multiqc_input_file:                  # writing multiqc input
        if not os.path.exists(demultiplexing_stats_file):
          raise IOError('demultiplexing stats file {0} not found'.\
                        format(demultiplexing_stats_file))                      # check demultiplexing stats file

        multiqc_input_file.write('{}\n'.format(demultiplexing_stats_file))      # add demultiplexing stat to list
        for fastqc_file in fastqc_files:
          fastqc_zip = fastqc_file['fastqc_zip']
          if not os.path.exists(fastqc_zip):
            raise IOError('fastqc file {0} not found'.\
                        format(fastqc_zip))                                     # check fastqc file
          multiqc_input_file.write('{}\n'.format(fastqc_zip))                   # add fastqc file to list

        for fastqscreen_file in fastqscreen_files:
          fastqscreen_stat = fastqscreen_file['fastqscreen_stat']
          if not os.path.exists(fastqscreen_stat):
            raise IOError('fastqscreen file {0} not found'.\
                        format(fastqscreen_stat))                               # check fastqscreen file
          multiqc_input_file.write('{}\n'.format(fastqscreen_stat))             # add fastqscreen file to list

      multiqc_report_title = \
        'Project:{0},Sequencing_date:{1},Flowcell_lane:{2},status:{3}'.\
        format(
          project_name,
          seqrun_date,
          lane_index_info,
          tag)                                                                  # get multiqc report title and filename
      multiqc_param = self.format_tool_options(multiqc_options)                 # format multiqc params
      date_stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S')
      check_file_path(multiqc_template_file)
      multiqc_conf_file = \
        os.path.join(
          temp_work_dir,
          os.path.basename(multiqc_template_file))
      template_env = \
        Environment(\
          loader=\
            FileSystemLoader(
              searchpath=os.path.dirname(multiqc_template_file)),
          autoescape=select_autoescape(['html', 'xml']))
      multiqc_conf = \
        template_env.\
          get_template(os.path.basename(multiqc_template_file))
      multiqc_conf.\
        stream(\
          project_igf_id=project_name,
          flowcell_id=flowcell_id,
          platform_name=model_name,
          tag_name='{0} {1}'.format(lane_index_info,tag),
          date_stamp=date_stamp,
          tool_order_list=tool_order_list).\
        dump(multiqc_conf_file)
      multiqc_cmd = \
        [multiqc_exe,
         '--file-list',quote(multiqc_input_list),
         '--outdir',temp_work_dir,
         '--title',quote(multiqc_report_title),
         '--config',quote(multiqc_conf_file)
        ]                                                                       # multiqc base parameters
      multiqc_cmd.extend(multiqc_param)                                         # add additional parameters
      subprocess.check_call(' '.join(multiqc_cmd),shell=True)                   # run multiqc
      multiqc_html = None
      multiqc_data = None
      for root, _,files in os.walk(top=temp_work_dir):
        for file in files:
          if fnmatch.fnmatch(file, '*.html'):
            copy2(os.path.join(root,file),multiqc_result_dir)
            multiqc_html = os.path.join(multiqc_result_dir,file)                # get multiqc html path
          elif fnmatch.fnmatch(file, '*.zip'):
            copy2(os.path.join(root,file),multiqc_result_dir)
            multiqc_data = os.path.join(multiqc_result_dir,file)                # get multiqc data path

      self.param('dataflow_params',
                 {'multiqc_html':multiqc_html,
                  'multiqc_data':multiqc_data,
                  'lane_index_info':lane_index_info})
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
          format(
            self.__class__.__name__,
            e,
            seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
Example #18
    def run(self):
        '''
    A method for running picard commands
    
    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param reference_type: Reference genome collection type, default GENOME_FASTA
    :param reference_refFlat: Reference refFlat collection type, default GENE_REFFLAT
    :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL
    :param species_name: species_name
    :param java_exe: Java path
    :param java_param: Java run parameters
    :param picard_jar: Picard jar path
    :param picard_command: Picard command
    :param base_work_dir: Base work directory
    :param copy_input: A toggle for copying input file to temp dir, 1 for True and 0 for False, default 0
    :param use_ephemeral_space: A toggle for temp dir setting, default 0
    :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000','NEXTSEQ']
    '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            java_exe = self.param_required('java_exe')
            java_param = self.param_required('java_param')
            picard_jar = self.param_required('picard_jar')
            input_files = self.param_required('input_files')
            picard_command = self.param_required('picard_command')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            reference_refFlat = self.param('reference_refFlat')
            ribosomal_interval_type = self.param('ribosomal_interval_type')
            base_work_dir = self.param_required('base_work_dir')
            analysis_files = self.param_required('analysis_files')
            picard_option = self.param('picard_option')
            patterned_flowcell_list = self.param('patterned_flowcell_list')
            platform_name = self.param_required('platform_name')
            output_prefix = self.param('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                                  # adding seed datestamp to output prefix

            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            temp_output_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                genome_fasta_type=reference_type,
                gene_reflat_type=reference_refFlat,
                ribosomal_interval_type=ribosomal_interval_type)                      # setup ref genome utils
            genome_fasta = ref_genome.get_genome_fasta()  # get genome fasta
            ref_flat_file = ref_genome.get_gene_reflat()  # get refFlat file
            ribosomal_interval_file = ref_genome.get_ribosomal_interval(
            )  # get ribosomal interval file
            patterned_flowcell = False
            if platform_name in patterned_flowcell_list:  # check for patterned flowcell
                patterned_flowcell = True

            if load_metrics_to_cram and \
               not cram_collection_type:
                raise ValueError(
                    'Cram file collection type is required for loading picard metrics to db'
                )

            picard=\
              Picard_tools(\
                java_exe=java_exe,
                java_param=java_param,
                picard_jar=picard_jar,
                input_files=input_files,
                output_dir=temp_output_dir,
                ref_fasta=genome_fasta,
                patterned_flowcell=patterned_flowcell,
                ref_flat_file=ref_flat_file,
                picard_option=picard_option,
                output_prefix=output_prefix,
                use_ephemeral_space=use_ephemeral_space,
                ribisomal_interval=ribosomal_interval_file)                           # setup picard tool
            temp_output_files,picard_command_line,picard_metrics = \
              picard.run_picard_command(command_name=picard_command)                  # run picard command
            output_file_list = list()
            for source_path in temp_output_files:
                dest_path=\
                  os.path.join(
                    work_dir,
                    os.path.basename(source_path))                                      # get destination filepath
                move_file(source_path=source_path,
                          destinationa_path=dest_path,
                          force=True)  # move files to work dir
                output_file_list.append(dest_path)
            remove_dir(temp_output_dir)
            analysis_files.extend(output_file_list)
            bam_files = list()
            for file in output_file_list:
                if file.endswith('.bam'):
                    bam_files.append(file)

            if load_metrics_to_cram and \
               len(picard_metrics)>0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=picard_metrics)                                           # format data for collection attribute table
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(\
                      data=attribute_data,
                      autosave=False
                    )                                                                     # load data to collection attribute table
                    ca.commit_session()
                    ca.close_session()
                except:
                    ca.rollback_session()
                    ca.close_session()
                    raise

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'bam_files': bam_files,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on picard output list
            message = \
              'finished picard {0} for {1} {2}'.\
                format(
                  picard_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'Picard {0} command: {1}'.\
                format(
                  picard_command,
                  picard_command_line)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
        except Exception as e:
            if temp_output_dir and \
               os.path.exists(temp_output_dir):
                remove_dir(temp_output_dir)

            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run(self):
        try:
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            seqrun_date = self.param_required('seqrun_date')
            flowcell_id = self.param_required('flowcell_id')
            flowcell_lane = self.param_required('flowcell_lane')
            project_name = self.param_required('project_name')
            index_length = self.param_required('index_length')
            seqrun_local_dir = self.param_required('seqrun_local_dir')
            bases_mask = self.param_required('basesmask')
            base_work_dir = self.param_required('base_work_dir')
            base_fastq_dir = self.param_required('base_fastq_dir')
            samplesheet_file = self.param_required('samplesheet')
            bcl2fastq_exe = self.param_required('bcl2fastq_exe')
            runinfo_filename = self.param('runinfo_filename')
            bcl2fastq_options = self.param('bcl2fastq_options')
            singlecell_options = self.param_required('singlecell_options')
            singlecell_tag = self.param('singlecell_tag')
            force_overwrite = self.param('force_overwrite')
            fastq_dir_label = self.param('fastq_dir_label')
            samplesheet_filename = self.param('samplesheet_filename')
            use_ephemeral_space = self.param('use_ephemeral_space')
            model_name = self.param('model_name')
            reset_mask_short_adapter_reads = self.param(
                'reset_mask_short_adapter_reads')

            project_type = ''  # default single cell status is empty
            seqrun_dir = os.path.join(seqrun_local_dir,
                                      seqrun_igf_id)  # local seqrun dir
            runinfo_file = os.path.join(
                seqrun_dir, runinfo_filename)  # seqrun runinfo file
            if not os.path.exists(samplesheet_file):
                raise IOError('samplesheet file {0} not found'.\
                              format(samplesheet_file))

            samplesheet_sc = SampleSheet(
                infile=samplesheet_file
            )  # read samplesheet for single cell check
            samplesheet_sc.\
              filter_sample_data(\
                condition_key='Description',
                condition_value=singlecell_tag,
                method='include')
            if len(samplesheet_sc._data) > 0:
                project_type = singlecell_tag  # set single cell status as true if its present in samplesheet

            if not os.path.exists(runinfo_file):
                raise IOError('Runinfo file {0} not found'.\
                              format(runinfo_file))

            lane_index = '{0}_{1}'.format(
                flowcell_lane,
                index_length)  # get label for lane and index length
            output_dir_label = \
              os.path.join(
                project_name,
                fastq_dir_label,
                seqrun_date,
                flowcell_id,
                lane_index)                                                           # output dir label
            output_fastq_dir = \
              os.path.join(base_fastq_dir,output_dir_label)                           # output fastq dir

            if os.path.exists(output_fastq_dir) and force_overwrite:
                remove_dir(output_fastq_dir)  # remove fastq directory if it's already present

            message = \
              'started fastq conversion for {0}, {1} : {2}_{3}'.\
                format(
                  seqrun_igf_id,
                  project_name,
                  flowcell_lane,
                  index_length)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            seqrun_temp_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # create a new input directory in TMPDIR
            move_file = \
              moveBclFilesForDemultiplexing(\
                input_dir=seqrun_dir,
                output_dir=seqrun_temp_dir,
                samplesheet=samplesheet_file,
                run_info_xml=runinfo_file,
                platform_model=model_name)                                            # get lists of files to move to TMPDIR
            move_file.copy_bcl_files()  # move files to TMPDIR
            job_name = self.job_name()
            output_temp_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # create tmp directory in TMPDIR for cluster
            report_dir = \
              os.path.join(\
                base_work_dir,
                seqrun_igf_id,
                job_name,
                'Reports')                                                            # creating report directory in main storage
            if not os.path.exists(report_dir):
                os.makedirs(report_dir, mode=0o770)

            stats_dir = \
              os.path.join(\
                base_work_dir,
                seqrun_igf_id,
                job_name,
                'Stats')                                                              # create stats directory in main storage
            if not os.path.exists(stats_dir):
                os.makedirs(stats_dir, mode=0o770)

            bcl2fastq_cmd = \
              [quote(bcl2fastq_exe),
               '--runfolder-dir',quote(seqrun_temp_dir),
               '--sample-sheet',quote(samplesheet_file),
               '--output-dir',quote(output_temp_dir),
               '--reports-dir',quote(report_dir),
               '--use-bases-mask',quote(bases_mask),
               '--stats-dir',quote(stats_dir)]                                        # bcl2fastq base parameters

            bcl2fastq_param = \
              self.format_tool_options(bcl2fastq_options)                             # format bcl2fastq params
            bcl2fastq_cmd.extend(bcl2fastq_param)  # add additional parameters
            if reset_mask_short_adapter_reads and \
               '--mask-short-adapter-reads' not in bcl2fastq_options:
                read_pattern = re.compile(r'^y(\d+)n?\d?')
                read_values = [
                    int(re.match(read_pattern, i).group(1))
                    for i in bases_mask.split(',')
                    if i.startswith('y') and re.match(read_pattern, i)
                    if int(re.match(read_pattern, i).group(1)) < 22
                ]  # hack for checking if reads are lower than the Illumina thresholds
                if len(read_values) > 0 and \
                    min(read_values) > 5:
                    bcl2fastq_cmd.\
                      append("--mask-short-adapter-reads={0}".\
                             format(quote(str(min(read_values)))))
                    message = \
                      'Setting masked bases length for {0},{1}:{2}_{3}, value: {4}'.\
                        format(
                          seqrun_igf_id,
                          project_name,
                          flowcell_lane,
                          index_length,
                          min(read_values))
                    self.post_message_to_slack(
                        message, reaction='pass')  # send log to slack
                    self.comment_asana_task(\
                      task_name=seqrun_igf_id,
                      comment=message)                                                    # send log to asana

            if project_type == singlecell_tag:
                sc_bcl2fastq_param = self.format_tool_options(
                    singlecell_options)  # format singlecell bcl2fastq params
                bcl2fastq_cmd.extend(
                    sc_bcl2fastq_param)  # add additional parameters

            message = ' '.join(bcl2fastq_cmd)
            self.post_message_to_slack(
                message, reaction='pass')  # send bcl2fastq command to Slack
            self.comment_asana_task(
                task_name=seqrun_igf_id,
                comment=message)  # send bcl2fastq command to Asana

            subprocess.check_call(' '.join(bcl2fastq_cmd),
                                  shell=True)  # run bcl2fastq

            copytree(output_temp_dir,
                     output_fastq_dir)  # copy output from TMPDIR
            copy2(\
              samplesheet_file,
              os.path.join(\
                output_fastq_dir,
                samplesheet_filename))                                                # add samplesheet to output dir
            move(report_dir,
                 output_fastq_dir)  # move report directory to project dir
            move(stats_dir,
                 output_fastq_dir)  # move stats directory to project dir
            self.param('dataflow_params', {
                'fastq_dir': output_fastq_dir,
                'bcl2fq_project_type': project_type
            })  # set dataflow params
            message = \
              'Fastq conversion done for {0},{1}:{2}_{3}, fastq: {4}'.\
                format(
                  seqrun_igf_id,
                  project_name,
                  flowcell_lane,
                  index_length,
                  output_fastq_dir)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            self.comment_asana_task(\
              task_name=seqrun_igf_id,
              comment=message)                                                        # send log to asana
            remove_dir(seqrun_temp_dir)
            remove_dir(output_temp_dir)  # remove temp dirs
        except Exception as e:
            message = \
              'seqrun: {2}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Example #20
def run_samtools_view(samtools_exe,
                      input_file,
                      output_file,
                      reference_file=None,
                      force=True,
                      cram_out=False,
                      threads=1,
                      samtools_params=None,
                      index_output=True,
                      dry_run=False,
                      use_ephemeral_space=0):
    '''
  A function for running samtools view command

  :param samtools_exe: samtools executable path
  :param input_file: An input bam filepath with / without index. Index file will be created if it's missing
  :param output_file: An output file path
  :param reference_file: Reference genome fasta filepath, default None
  :param force: Output file will be overwritten if force is True, default True
  :param threads: Number of threads to use for conversion, default 1
  :param samtools_params: List of samtools param, default None
  :param index_output: Index output file, default True
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :returns: Samtools command as list
  '''
    try:
        check_file_path(samtools_exe)
        _check_bam_file(bam_file=input_file)  # check bam file
        if not dry_run:
            _check_bam_index(\
              samtools_exe=samtools_exe,
              bam_file=input_file)                                                    # check bam index

        temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
        temp_file = \
          os.path.join(\
            temp_dir,
            os.path.basename(output_file))                                          # get temp output file path
        view_cmd = \
          [quote(samtools_exe),
           'view',
           '-o',quote(temp_file)
          ]                                                                         # base samtools view command, output format is set below
        if reference_file is not None:
            check_file_path(reference_file)
            view_cmd.extend(['-T', quote(reference_file)])

        if threads is not None:
            view_cmd.append('-@{0}'.format(quote(str(threads))))

        if cram_out:
            view_cmd.append('-C')
            if reference_file is None:
                raise ValueError('Reference file is required for cram output')
        else:
            view_cmd.append('-b')

        if samtools_params is not None and \
           isinstance(samtools_params, list) and \
           len(samtools_params) > 0:
            view_cmd.extend(\
              [quote(i) for i in samtools_params])                                    # add additional params

        view_cmd.append(quote(input_file))
        if dry_run:
            return view_cmd

        subprocess.check_call(\
          ' '.join(view_cmd),
          shell=True)
        if cram_out:
            _check_cram_file(cram_path=temp_file)  # check cram output

        copy_local_file(\
          source_path=temp_file,
          destinationa_path=output_file,
          force=force)                                                              # copy temp output file to the final path
        remove_dir(temp_dir)  # remove temp directory
        if index_output:
            index_bam_or_cram(\
              samtools_exe=samtools_exe,
              input_path=output_file,
              threads=threads)

        return view_cmd
    except:
        raise
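A usage sketch for run_samtools_view as defined above; the import path and file locations are assumptions, and with dry_run=True the composed command is returned without samtools being executed (the input paths still need to exist for the initial file checks):

# import path is an assumption about the package layout, adjust as needed
from igf_data.utils.tools.samtools_utils import run_samtools_view

view_cmd = \
  run_samtools_view(
    samtools_exe='/usr/local/bin/samtools',          # hypothetical samtools path
    input_file='/path/to/sample.bam',                # hypothetical input bam
    output_file='/path/to/sample.cram',              # hypothetical output cram
    reference_file='/path/to/genome.fa',             # reference is required for cram output
    cram_out=True,
    threads=4,
    dry_run=True)                                    # return the command list without running it
print(' '.join(view_cmd))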
    def tearDown(self):
        Base.metadata.drop_all(self.engine)
        os.remove(self.dbname)
        if os.path.exists(self.temp_dir):
            remove_dir(dir_path=self.temp_dir)
Example #22
0
def nbconvert_execute_in_singularity(image_path,
                                     ipynb_path,
                                     input_list,
                                     output_dir,
                                     output_format='html',
                                     output_file_map=None,
                                     timeout=600,
                                     kernel='python3',
                                     use_ephemeral_space=False,
                                     allow_errors=False,
                                     dry_run=False):
    '''
  A function for running jupyter nbconvert within singularity containers

  :param image_path: A singularity image path
  :param ipynb_path: A notebook file path to run in the singularity container
  :param input_list: A list of input files for the notebook run
  :param output_dir: Path to copy output files
  :param output_format: Notebook output format, default html
  :param output_file_map: A dictionary of output file tag and name as key and value, to copy to output_path from the tmp dir, default None
  :param timeout: Timeout setting for notebook execution, default 600s
  :param kernel: Kernel name for notebook execution, default python3
  :param allow_errors: A toggle for running notebook with errors, default False
  :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False
  :param dry_run: Return the notebook command without run, default False
  :returns: A dictionary of output file paths (including the executed notebook) and the notebook run command
  '''
    try:
        check_file_path(image_path)
        check_file_path(ipynb_path)
        if output_file_map is None:
            output_file_map = dict(
            )  # default output map is an empty dictionary
        if not isinstance(input_list, list) or \
           len(input_list) == 0:
            raise ValueError("Missing input files for notebook run")
        tmp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space
                               )  # this will be mounted on container on /tmp
        tmp_input_list = list()
        for f in input_list:
            check_file_path(f)
            temp_path = \
              os.path.join(
                tmp_dir,
                os.path.basename(f))
            copy_local_file(f, temp_path)  # copy input files to temp dir
            tmp_input_list.append(temp_path)
        temp_ipynb_path = \
          os.path.join(
            tmp_dir,
            os.path.basename(ipynb_path))
        copy_local_file(ipynb_path,
                        temp_ipynb_path)  # copy ipynb file to tmp dir
        args_list = [
            'jupyter', 'nbconvert', '{0}'.format(quote(temp_ipynb_path)),
            '--to={0}'.format(quote(output_format)), '--execute',
            '--ExecutePreprocessor.enabled=True',
            '--ExecutePreprocessor.timeout={0}'.format(quote(str(timeout))),
            '--ExecutePreprocessor.kernel_name={0}'.format(quote(kernel))
        ]  # prepare notebook cmd for run
        if allow_errors:
            args_list.append('--allow-errors')  # run notebooks with errors
        try:
            res = None
            res, run_cmd = \
              singularity_run(
                image_path=image_path,
                path_bind=tmp_dir,
                use_ephemeral_space=use_ephemeral_space,
                args_list=args_list,
                dry_run=dry_run)                                                      # run notebook in singularity container
        except Exception as e:
            raise ValueError("Failed to run jupyter command in singularity, error {0}, response: {1}".\
                               format(e,res))
        if output_file_map is not None and \
           isinstance(output_file_map,dict):
            for tag, output in output_file_map.items():
                output_path = output_dir
                temp_output = \
                  os.path.join(
                    tmp_dir,
                    os.path.basename(output))                                           # just get base name
                if not dry_run:
                    check_file_path(
                        temp_output)  # skip output file check for dry run
                if os.path.isfile(temp_output):
                    output_path = \
                      os.path.join(
                        output_path,
                        os.path.basename(output))                                         # need file name when copying files
                if not dry_run:
                    copy_local_file(
                        temp_output,
                        output_path)  # copy file or dir to output path
                if os.path.isdir(temp_output):
                    output_path = \
                      os.path.join(
                        output_path,
                        os.path.basename(output))                                         # adding dir name to output path, once copy is over
                output_file_map.\
                  update({tag:output_path})
        if output_format == 'html':
            temp_ipynb_path = \
              temp_ipynb_path.replace('.ipynb','.html')
        elif output_format == 'markdown':
            temp_ipynb_path = \
              temp_ipynb_path.replace('.ipynb','.md')
        elif output_format == 'notebook':
            temp_ipynb_path = temp_ipynb_path
        elif output_format == 'pdf':
            temp_ipynb_path = \
              temp_ipynb_path.replace('.ipynb','.pdf')
        elif output_format == 'python':
            temp_ipynb_path = \
              temp_ipynb_path.replace('.ipynb','.py')
        elif output_format == 'slide':
            temp_ipynb_path = \
              temp_ipynb_path.replace('.ipynb','.html')
        if not dry_run:
            check_file_path(temp_ipynb_path)  # check output file path
        output_ipynb_path = \
          os.path.join(
            output_dir,
            os.path.basename(temp_ipynb_path))
        if not dry_run:
            copy_local_file(temp_ipynb_path,
                            output_ipynb_path)  # copy output notebook
        output_file_map.\
          update({'notebook':output_ipynb_path})                                    # add notebook output to dataflow
        remove_dir(tmp_dir)
        return output_file_map, run_cmd
    except Exception as e:
        raise ValueError(
                "Failed to run nbconvert in singularity, error: {0}".\
                  format(e))
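A usage sketch for nbconvert_execute_in_singularity; the import path, image and notebook locations are assumptions, and dry_run=True skips the container execution and the output file checks while still returning the composed command:

# import path is an assumption, adjust to the actual module location
from igf_data.utils.jupyter_nbconvert_wrapper import nbconvert_execute_in_singularity

output_file_map, run_cmd = \
  nbconvert_execute_in_singularity(
    image_path='/path/to/notebook_image.sif',        # hypothetical singularity image
    ipynb_path='/path/to/analysis_template.ipynb',   # hypothetical notebook
    input_list=['/path/to/counts_matrix.csv'],       # inputs get copied to the container /tmp
    output_dir='/path/to/report_dir',
    output_file_map={'counts': 'filtered_counts.csv'},
    output_format='html',
    timeout=1200,
    dry_run=True)
print(run_cmd)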
  def run(self):
    try:
      fastq_dir = self.param_required('fastq_dir')
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      project_name = self.param_required('project_name')
      igf_session_class = self.param_required('igf_session_class')
      irods_exe_dir = self.param_required('irods_exe_dir')
      flowcell_id = self.param_required('flowcell_id')
      samplesheet_filename = self.param('samplesheet_filename')
      manifest_name = self.param_required('manifest_name')
      report_html = self.param('report_html')
      use_ephemeral_space = self.param('use_ephemeral_space')

      pa = ProjectAdaptor(**{'session_class':igf_session_class})
      pa.start_session()
      user = \
        pa.fetch_data_authority_for_project(\
          project_igf_id=project_name)                                          # fetch user info from db
      pa.close_session()

      if user is None:
        raise ValueError('No user found for project {0}'.\
                         format(project_name))

      username = user.username                                                  # get username for irods

      report_htmlname = os.path.basename(report_html)
      seqrun_date = seqrun_igf_id.split('_')[0]                                 # collect seqrun date from igf id
      seqrun_date = datetime.datetime.strptime(seqrun_date,'%y%m%d').date()     # identify actual date
      seqrun_date = str(seqrun_date)                                            # convert object to string
      irods_upload = IGF_irods_uploader(irods_exe_dir)                          # create instance for irods upload
      base_seq_dir = os.path.basename(fastq_dir)                                # get base name for the source dir
      tarfile_name = \
        '{0}_{1}_{2}.tar'.\
          format(\
            project_name,
            base_seq_dir,
            seqrun_date)                                                        # construct name of the tarfile
      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp dir
      tarfile_name = \
        os.path.join(
          temp_work_dir,
          tarfile_name)                                                         # create tarfile in the temp dir

      with tarfile.open(tarfile_name, "w") as tar:
        for root,_, files in os.walk(top=fastq_dir):
          if samplesheet_filename in files:
            samplesheet_file = \
              os.path.join(os.path.abspath(root),
                           samplesheet_filename)                                # get samplesheet filepath
            tmp_samplesheet_file = \
              os.path.join(
                temp_work_dir,
                '{0}_{1}_{2}_{3}'.\
                  format(
                    project_name,
                    base_seq_dir,
                    seqrun_date,
                    samplesheet_filename))
            copy2(
              samplesheet_file,
              tmp_samplesheet_file)                                             # change samplesheet filename
            tar.add(
              tmp_samplesheet_file,
              arcname=\
                os.path.relpath(
                  tmp_samplesheet_file,
                  start=temp_work_dir))                                         # add samplesheet file to tar

          if report_htmlname in files:
            for file in files:
              if fnmatch.fnmatch(os.path.join(root,file),report_html):
                report_file = os.path.join(os.path.abspath(root),file)          # get filepath for the report
                tmp_report_file = \
                  os.path.join(\
                    temp_work_dir,
                    '{0}_{1}_{2}_{3}'.\
                    format(\
                      project_name,
                      base_seq_dir,
                      seqrun_date,
                      os.path.basename(report_file)))                           # change report name
                copy2(report_file, tmp_report_file)                             # copy report file to temp
                tar.add(tmp_report_file,
                        arcname=os.path.relpath(tmp_report_file,
                                                start=temp_work_dir))           # add demultiplexing report to tar

          if manifest_name in files:
            manifest_file = \
              os.path.join(os.path.abspath(root),
                           manifest_name)                                       # get manifest filepath
            tmp_manifest_file = \
              os.path.join(\
                temp_work_dir,
                '{0}_{1}_{2}_{3}'.\
                format(\
                  project_name,
                  base_seq_dir,
                  seqrun_date,
                  manifest_name))                                               # change manifest name
            copy2(manifest_file,tmp_manifest_file)                              # copy manifest to temp
            tar.add(tmp_manifest_file,
                    arcname=os.path.relpath(tmp_manifest_file,
                                            start=temp_work_dir))               # add manifest file to tar

          for file in files:
            if fnmatch.fnmatch(file, '*.fastq.gz') and \
              not fnmatch.fnmatch(file, 'Undetermined_*'):
              fastq_file_path = os.path.join(os.path.abspath(root),file)        # get filepath for the fastq files
              tar.add(fastq_file_path,
                      arcname=os.path.relpath(fastq_file_path,
                                              start=fastq_dir))                 # add fastq file to tar

      irods_upload.\
      upload_fastqfile_and_create_collection(\
        filepath=tarfile_name,
        irods_user=username,
        project_name=project_name,
        run_igf_id=seqrun_igf_id,
        flowcell_id=flowcell_id,
        run_date=seqrun_date)                                                   # upload fastq data to irods
      remove_dir(temp_work_dir)                                                 # remove temp dir once data upload is done
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
        format(\
          self.__class__.__name__,
          e,
          seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
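The tar-building loop above relies on arcname=os.path.relpath(...) so that archive members are stored relative to the temp or fastq directory instead of carrying absolute paths into the archive. A minimal standalone sketch of that pattern with hypothetical paths:

import os
import tarfile

fastq_dir = '/path/to/fastq_dir'                      # hypothetical source directory
with tarfile.open('/tmp/project_fastq.tar', 'w') as tar:
    for root, _, files in os.walk(fastq_dir):
        for file_name in files:
            if file_name.endswith('.fastq.gz') and \
               not file_name.startswith('Undetermined_'):
                file_path = os.path.join(os.path.abspath(root), file_name)
                tar.add(
                    file_path,
                    arcname=os.path.relpath(file_path, start=fastq_dir))  # store member relative to fastq_dir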
Example #24
0
    def run(self):
        '''
    A method for running cellranger count for a given sample using the ehive pipeline
    
    :param project_igf_id: A project igf id
    :param experiment_igf_id: An experiment igf id
    :param sample_igf_id: A sample igf id
    :param biomaterial_type: Biomaterial type for samples, required for nuclei samples
    :param nuclei_biomaterial_type: Keyword used to identify nuclei samples, default 'SINGLE_NUCLEI'
    :param igf_session_class: A database session class
    :param cellranger_exe: Cellranger executable path
    :param cellranger_options: Cellranger parameters
                               
                               List of default parameters
                                 --jobmode=pbspro
                                 --localcores=1
                                 --localmem=4
                                 --mempercore=4
                                 --maxjobs=20
    
    :param base_work_dir: Base work directory path
    :param fastq_collection_type: Collection type name for input fastq files, default demultiplexed_fastq
    :param species_name: Reference genome collection name
    :param reference_type: Reference genome collection type, default TRANSCRIPTOME_TENX
    :param nuclei_reference_type: Reference genome collection type for pre-mRNA samples, default TRANSCRIPTOME_TENX_NUCLEI
    :param job_timeout: Timeout for cellranger job, default 24hrs
    :returns: Adding cellranger_output to the dataflow_params
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            cellranger_exe = self.param_required('cellranger_exe')
            cellranger_options = self.param_required('cellranger_options')
            base_work_dir = self.param_required('base_work_dir')
            fastq_collection_type = self.param_required(
                'fastq_collection_type')
            biomaterial_type = self.param_required('biomaterial_type')
            job_timeout = self.param_required('job_timeout')
            nuclei_biomaterial_type = self.param('nuclei_biomaterial_type')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            nuclei_reference_type = self.param('nuclei_reference_type')

            # setup work dir for run
            work_dir = False
            work_dir_prefix = \
              os.path.join(\
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = self.get_job_work_dir(
                work_dir=work_dir_prefix
            )  # replace this with temp dir while running in queue
            # setup env for run
            os.chdir(work_dir)  # move to work dir
            os.environ['PATH'] += '{0}{1}'.format(
                os.pathsep, os.path.dirname(
                    cellranger_exe))  # add cellranger location to env PATH
            # collect reference genome for run
            if biomaterial_type == nuclei_biomaterial_type:
                ref_genome = \
                  Reference_genome_utils(\
                    genome_tag=species_name,
                    dbsession_class=igf_session_class,
                    tenx_ref_type=nuclei_reference_type)                                # fetch ref genome for pre-mRNA samples
            else:
                ref_genome = \
                  Reference_genome_utils(\
                    genome_tag=species_name,
                    dbsession_class=igf_session_class,
                    tenx_ref_type=reference_type)

            # collect fastq input for run
            cellranger_ref_transcriptome = ref_genome.get_transcriptome_tenx(
            )  # fetch tenx ref transcriptome from db
            input_fastq_dirs = \
              get_cellranger_count_input_list(\
                db_session_class=igf_session_class,
                experiment_igf_id=experiment_igf_id,
                fastq_collection_type=fastq_collection_type)                          # fetch fastq dir paths as list for run
            # configure cellranger count command for run
            cellranger_options = \
              self.format_tool_options(\
                cellranger_options,
                separator='=')
            cellranger_cmd = \
              [cellranger_exe,
               'count',
               '{0}={1}'.format('--fastqs',
                                quote(','.join(input_fastq_dirs))),
               '{0}={1}'.format('--id',
                                quote(experiment_igf_id)),
               '{0}={1}'.format('--transcriptome',
                                quote(cellranger_ref_transcriptome)),
              ]                                                                       # set initial parameters
            cellranger_cmd.extend(
                cellranger_options)  # add optional parameters
            # log before job submission
            message = \
              'started cellranger count for {0}, {1} {2}'.\
              format(\
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            self.comment_asana_task(task_name=project_igf_id,
                                    comment=message)  # send comment to Asana
            message = ' '.join(cellranger_cmd)
            self.comment_asana_task(
                task_name=project_igf_id,
                comment=message)  # send cellranger command to Asana
            # start job execution
            cellranger_cmd = ' '.join(
                cellranger_cmd)  # create shell command string
            subprocess.\
              check_call(\
                cellranger_cmd,
                shell=True,
                timeout=job_timeout)                                                  # run cellranger count using shell
            # prepare output after cellranger run
            cellranger_output = \
              os.path.join(\
                work_dir,
                experiment_igf_id,
                'outs')                                                               # get cellranger output path
            message = \
              'finished cellranger count for {0}, {1} {2} : {3}'.\
              format(\
                project_igf_id,
                sample_igf_id,
                experiment_igf_id,
                cellranger_output)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            self.comment_asana_task(task_name=project_igf_id,
                                    comment=message)  # send comment to Asana
            # validate output files after cellranger run
            check_cellranger_count_output(
                output_path=cellranger_output)  # check output file
            cellranger_report = \
              os.path.join(\
                cellranger_output,
                'web_summary.html')
            check_file_path(cellranger_report)

            self.param('dataflow_params',\
                       {'cellranger_output':cellranger_output,
                        'cellranger_report':cellranger_report})                       # pass on cellranger output path
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
              format(\
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            if work_dir:
                remove_dir(work_dir)
            raise
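The command assembly above can be previewed outside of the eHive runnable. A sketch that builds the same kind of cellranger count string from hypothetical inputs and prints it without submitting the job:

from shlex import quote

# hypothetical inputs, mirroring the command assembly in the run method above
cellranger_exe = '/path/to/cellranger'
experiment_igf_id = 'EXP0001'
cellranger_ref_transcriptome = '/path/to/refdata-gex'
input_fastq_dirs = ['/path/to/fastq/lane1', '/path/to/fastq/lane2']
cellranger_options = ['--jobmode=pbspro', '--localcores=1', '--localmem=4']

cellranger_cmd = [
    cellranger_exe, 'count',
    '--fastqs={0}'.format(quote(','.join(input_fastq_dirs))),
    '--id={0}'.format(quote(experiment_igf_id)),
    '--transcriptome={0}'.format(quote(cellranger_ref_transcriptome))]
cellranger_cmd.extend(cellranger_options)
print(' '.join(cellranger_cmd))                       # inspect the command before running it via subprocess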
    def merge_fastq_per_lane_per_sample(self):
        '''
    A method for merging single cell fastq files present in input fastq_dir
    per lane per sample basis
    '''
        try:
            sample_data = \
              self._fetch_lane_and_sample_info_from_samplesheet()                     # get sample and lane information from samplesheet
            sample_files, samples_info = \
              self._group_singlecell_fastq(
                sample_data,
                self.fastq_dir)                                                       # get file groups
            all_intermediate_files = list(
            )  # empty list for intermediate files
            s_count = 0  # initial count for fastq S value
            for lane_id in sorted(sample_files.keys()):
                if self.platform_name == 'NEXTSEQ':
                    s_count = 0  # nextseq is weird, reset counter for each lane
                for sample_id in sorted(sample_files[lane_id].keys()):
                    s_count += 1  # assign new S value for fastq files
                    sample_name = samples_info.get(sample_id)['sample_name']
                    project_id = samples_info.get(sample_id)[
                        'project_id']  # get sample and project info
                    output_path = \
                      os.path.join(
                        self.fastq_dir,
                        project_id,
                        sample_id)                                                        # output location is under input fastq_dir
                    if not os.path.exists(output_path):
                        os.makedirs(output_path,
                                    mode=0o770)  # create output directory

                    for read_type in sample_files[lane_id][sample_id].keys(
                    ):  # merge per read type
                        output_filename = \
                          '{0}_S{1}_L00{2}_{3}_001.fastq.gz'.\
                            format(
                              sample_name,
                              s_count,
                              lane_id,
                              read_type)                                                    # assign new output filename
                        final_path = os.path.join(
                            output_path,
                            output_filename)  # assign final output path
                        if not self.force_overwrite and os.path.exists(
                                final_path):
                            raise ValueError('Failed to overwrite existing file {0}'.\
                                             format(final_path))

                        input_list = list()
                        for sc_fragment, file_path in \
                          sorted(sample_files[lane_id][sample_id][read_type].items()):
                            input_list.extend(
                                file_path
                            )  # create list of input fastqs for merge
                        if len(input_list) != 4:
                            raise ValueError(\
                              'expecting 4 files, got {0} for sample {1}, lane {2}, read type {3}'.\
                                format(
                                  len(input_list),
                                  sample_id,
                                  lane_id,
                                  read_type))                                                 # checking input files list
                        temp_dir = \
                          get_temp_dir(use_ephemeral_space=self.use_ephemeral_space)        # get a temp dir
                        temp_file = os.path.join(
                            temp_dir, output_filename)  # assign temp filename
                        cmd = ["cat"] + input_list + [
                            ">", temp_file
                        ]  # shell command for merging fastq.gz files
                        subprocess.check_call(
                            " ".join(cmd), shell=True
                        )  # exact same command for fastq merge as 10x pipeline
                        shutil.copy(temp_file,
                                    final_path)  # copy file to final location
                        remove_dir(temp_dir)  # remove temp dir
                        for file_path in input_list:
                            all_intermediate_files.append(
                                file_path)  # add fastq to intermediate list
            for file_path in all_intermediate_files:
                os.remove(
                    file_path
                )  # remove intermediate files once merging is complete
        except:
            raise
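Concatenating the fragments with cat works because gzip streams can be joined end to end, which is why the merge above matches the 10x pipeline behaviour. A minimal sketch of the merge step for one sample, lane and read type, with hypothetical file names:

import os
import shutil
import subprocess
import tempfile

# hypothetical fastq fragments for one sample, lane and read type
input_list = [
    '/path/to/Sample1_S1_L001_R1_001.fastq.gz',
    '/path/to/Sample1_S2_L001_R1_001.fastq.gz',
    '/path/to/Sample1_S3_L001_R1_001.fastq.gz',
    '/path/to/Sample1_S4_L001_R1_001.fastq.gz']
output_filename = 'Sample1_S1_L001_R1_001.fastq.gz'    # merged file keeps the standard fastq naming scheme
final_dir = '/path/to/output'                          # hypothetical output location
temp_dir = tempfile.mkdtemp()
temp_file = os.path.join(temp_dir, output_filename)
cmd = ['cat'] + input_list + ['>', temp_file]
subprocess.check_call(' '.join(cmd), shell=True)       # gzip members can be concatenated directly
shutil.copy(temp_file, os.path.join(final_dir, output_filename))
shutil.rmtree(temp_dir)                                # clean up the temp dir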
Example #26
0
    def generate_report(self):
        '''
    A method for generating an html report from the scanpy analysis

    :param generate_cb_data: A toggle for generating cellbrowser data, default False
    :param cb_data_path: An output path for cellbrowser data, default None
    '''
        try:
            os.chdir(self.work_dir)
            if os.path.exists(os.path.join(self.work_dir, 'cache')):
                remove_dir(os.path.join(self.work_dir, 'cache'))

            date_stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S')

            # step 1: read input files
            temp_input_dir = \
              get_temp_dir(use_ephemeral_space=self.use_ephemeral_space)              # fix for hpc
            local_matrix_file = \
              os.path.join(\
                temp_input_dir,
                os.path.basename(self.matrix_file))
            local_barcode_tsv = \
              os.path.join(\
                temp_input_dir,
                os.path.basename(self.barcode_tsv))
            local_features_tsv = \
              os.path.join(\
                temp_input_dir,
                os.path.basename(self.features_tsv))
            copy_local_file(\
              source_path=self.matrix_file,
              destinationa_path=local_matrix_file)
            copy_local_file(\
              source_path=self.barcode_tsv,
              destinationa_path=local_barcode_tsv)
            copy_local_file(\
              source_path=self.features_tsv,
              destinationa_path=local_features_tsv)
            adata = sc.read_10x_mtx(\
                      temp_input_dir,
                      var_names='gene_symbols',
                      cache=True)                                                     # read input files
            adata.var_names_make_unique()
            sc.pl.highest_expr_genes(\
              adata,
              n_top=30,
              save='.png')                                                            # genes that yield the highest fraction of counts in each single cell, across all cells
            highest_gene_expr = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/highest_expr_genes.png'))                                # encode highest gene expr data
            # step 2: filter data based on cell and genes
            sc.pp.filter_cells(\
              adata,
              min_genes=self.min_gene_count)
            sc.pp.filter_genes(\
              adata,
              min_cells=self.min_cell_count)
            # step 3: fetch mitochondrial genes
            mt_genes = self._fetch_mitochondrial_genes(species_name='hsapiens')
            mt_genes = [name for name in adata.var_names if name in mt_genes
                        ]  # filter mito genes which are not present in data
            # step 4: calculate mitochondrial read percentage
            adata.obs['percent_mito'] = \
              np.sum(adata[:, mt_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
            adata.obs['n_counts'] = adata.X.sum(
                axis=1
            ).A1  # add the total counts per cell as observations-annotation to adata
            sc.pl.violin(\
              adata,
              ['n_genes', 'n_counts', 'percent_mito'],
              jitter=0.4,
              multi_panel=True,
              show=True,
              save='.png')                                                            # violin plot of the computed quality measures /figures/violin.png
            mito_plot_data = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,\
                      'figures/violin.png'))
            sc.pl.scatter(\
              adata,
              x='n_counts',
              y='percent_mito',
              show=True,
              save='.png')                                                            # scatter plots for data quality 1
            mito_plot_scatter1 = \
              self._encode_png_image(\
                png_file=os.path.join(\
                  self.work_dir,
                  'figures/scatter.png'))
            sc.pl.scatter(\
              adata,
              x='n_counts',
              y='n_genes',
              save='.png')                                                            # scatter plots for data quality 2
            mito_plot_scatter2 = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/scatter.png'))
            # step 5: Filtering data bases on percent mito
            adata = adata[adata.obs['n_genes'] < 2500, :]
            adata = adata[adata.obs['percent_mito'] < 0.05, :]
            # step 6: Normalise and filter data
            sc.pp.normalize_per_cell(
                adata
            )  # Total-count normalize (library-size correct) the data matrix to 10,000 reads per cell, so that counts become comparable among cells.
            sc.pp.log1p(adata)
            adata.raw = adata
            sc.pp.highly_variable_genes(\
              adata,
              min_mean=0.0125,
              max_mean=3,
              min_disp=0.5)                                                           # Identify highly-variable genes
            sc.pl.highly_variable_genes(adata, save='.png')
            genes_dispersion_data = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/filter_genes_dispersion.png'))                           # plot highly-variable genes
            adata = adata[:, adata.var[
                'highly_variable']]  # filter highly-variable genes
            # step 7: Analyze data
            sc.pp.regress_out(\
              adata,
              ['n_counts', 'percent_mito'])                                           # regress out effects of total counts per cell and the percentage of mitochondrial genes expressed
            sc.pp.scale(\
              adata,
              max_value=10)                                                           # scale the data to unit variance
            sc.tl.pca(\
              adata,
              svd_solver='arpack')                                                    # run pca
            sc.pl.pca_loadings(\
              adata,
              show=True,
              save='.png')                                                            # plot pca loading graph
            pca_data = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/pca_loadings.png'))                                      # load pca loading graph
            sc.pl.pca_variance_ratio(\
              adata,
              log=True,save='.png')                                                   # save pca variation ratio
            pca_var_data = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/pca_variance_ratio.png'))                                # load pca variation graph
            sc.tl.tsne(\
              adata,
              random_state=2,
              n_pcs=10)                                                               # legacy tsne
            sc.pp.neighbors(\
              adata,
              n_neighbors=10,
              n_pcs=40)                                                               # neighborhood graph
            # step 7.5 Plot 3D UMAP
            sc.tl.umap(\
              adata,
              n_components=3)                                                         # generate UMAP with 3 components
            sc.tl.louvain(adata)  # louvain graph clustering
            dict_map = { \
              '0':'#4682B4',
              '1':'#A233A2',
              '2':'#FF7F50',
              '3':'#6787E7',
              '4':'#B75555',
              '5':'#2E8B57',
              '6':'#191970',
              '7':'#DB7093',
              '8':'#90EE90',
              '9':'#00FFFF',
              '10':'#FFD700',
              '11':'#DC143C',
              '12':'#B0C4DE',
              '13':'#00FA9A',
              '14':'#FA8072',
              '15':'#FFF0F5',
              '16':'#DB7093'
            }
            louvain_series = deepcopy(adata.obs['louvain'])
            color_map = louvain_series.map(dict_map).values
            labels = list(adata.obs.index)
            hovertext = \
              ['cluster: {0}, barcode: {1}'.\
               format(grp,labels[index])
                 for index,grp in enumerate(louvain_series.values)]
            threeDUmapDiv = \
              plot([go.Scatter3d( \
                      x=adata.obsm['X_umap'][:, 0],
                      y=adata.obsm['X_umap'][:, 1],
                      z=adata.obsm['X_umap'][:, 2],
                      mode = 'markers',
                      marker = dict(color = color_map,
                                    size = 5),
                      opacity=0.6,
                      text=labels,
                      hovertext=hovertext,
                   )],
                   output_type='div',
                   include_plotlyjs='cdn')                                            # capture 3d div for umap plot
            sc.tl.umap(adata,
                       n_components=2)  # recompute UMAP with 2 components for the report plots
            sc.pl.tsne(\
              adata,
              color='louvain',
              show=True,
              save='.png')                                                            # plot tSNE data
            tsne_data = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/tsne.png'))                                              # load t-SNE
            # step 8: Finding marker genes
            sc.pl.umap(\
              adata,
              color=['louvain'],
              save='.png')                                                            # plot umap
            umap_data = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/umap.png'))                                              # load umap
            sc.tl.rank_genes_groups(\
              adata,
              'louvain',
              method='t-test')                                                        # compute a ranking for the highly differential genes in each cluster
            sc.pl.rank_genes_groups(\
              adata,
              n_genes=20,
              show=True,
              sharey=False,
              save='.png')                                                            # plot diff genes in each clusters
            rank_genes_groups_data = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/rank_genes_groups_louvain.png'))                         # load ranking plot
            sc.pl.rank_genes_groups_stacked_violin(\
              adata,
              n_genes=10,
              save='.png')                                                            # ranked genes group stacked violin plot
            rank_genes_groups_stacked_violin = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/stacked_violin.png'))                                    # load stacked violin plot data
            sc.pl.rank_genes_groups_dotplot(\
              adata,
              n_genes=10,
              color_map='bwr',
              dendrogram='dendrogram_louvain',
              save='.png')                                                            # ranked genes group dot plot
            rank_genes_groups_dotplot = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/dotplot.png'))                                           # load dotplot
            sc.pl.rank_genes_groups_matrixplot(\
              adata,
              n_genes=10,
              save='.png')                                                            # ranked genes group matrix plot
            rank_genes_groups_matrixplot = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/matrixplot.png'))                                        # load matrix plot
            sc.pl.rank_genes_groups_heatmap(\
              adata,
              n_genes=10,
              show_gene_labels=True,
              save='.png')                                                            # ranked gene heatmap plot
            rank_genes_groups_heatmap = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/heatmap.png'))                                           # load heatmap plot
            sc.pl.rank_genes_groups_tracksplot(\
              adata,
              n_genes=10,
              cmap='bwr',
              save='.png')                                                            # ranked gene tracks plot
            rank_genes_groups_tracksplot = \
              self._encode_png_image(\
                png_file=\
                  os.path.join(\
                    self.work_dir,
                    'figures/tracksplot.png'))                                        # load tracks plot

            project_name = self.project_name
            project_name = \
              project_name[0] \
                if isinstance(project_name, tuple) \
                  else project_name                                                   # check for project_name object
            template_env = \
              Environment(\
                loader=FileSystemLoader(\
                  searchpath=os.path.dirname(self.html_template_file)),
                  autoescape=select_autoescape(['xml']))
            template_file = \
              template_env.\
                get_template(\
                  os.path.basename(self.html_template_file))
            template_file.\
              stream(\
                ProjectName=project_name,
                SampleName=self.sample_name,
                Date_stamp=date_stamp,
                Highest_gene_expr=highest_gene_expr,
                MitoPlot=mito_plot_data,
                MitoScatter1=mito_plot_scatter1,
                MitoScatter2=mito_plot_scatter2,
                GenesDispersion=genes_dispersion_data,
                Pca=pca_data,
                Pca_var_data=pca_var_data,
                Tsne=tsne_data,
                Umap3DDiv=threeDUmapDiv,
                Umap_data=umap_data,
                RankGenesGroups=rank_genes_groups_data,
                Rank_genes_groups_stacked_violin=rank_genes_groups_stacked_violin,
                Rank_genes_groups_dotplot=rank_genes_groups_dotplot,
                Rank_genes_groups_matrixplot=rank_genes_groups_matrixplot,
                Rank_genes_groups_heatmap=rank_genes_groups_heatmap,
                Rank_genes_groups_tracksplot=rank_genes_groups_tracksplot).\
              dump(os.path.join(self.work_dir,'test.html'))
            copy_local_file(\
              os.path.join(\
                self.work_dir,'test.html'),
                self.output_file,
                force=self.force_overwrite)
            if self.cellbrowser_h5ad is not None:
                try:
                    if not os.path.exists(
                            os.path.dirname(self.cellbrowser_h5ad)):
                        os.makedirs(os.path.dirname(self.cellbrowser_h5ad))

                    temp_h5ad = \
                      os.path.join(\
                        self.work_dir,
                        os.path.basename(self.cellbrowser_h5ad))
                    adata.write_h5ad(filename=temp_h5ad)
                    copy_local_file(\
                      source_path=temp_h5ad,
                      destinationa_path=self.cellbrowser_h5ad,
                      force=True)
                except Exception as e:
                    raise ValueError('Failed to export Scanpy h5ad, error: {0}'.\
                                     format(e))

            remove_dir(temp_input_dir)
            remove_dir(self.work_dir)
        except:
            raise
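The report generation above strings together a full Scanpy workflow. A condensed sketch of the same core steps (QC filtering, normalisation, highly variable gene selection, PCA, neighbours, Louvain clustering and marker ranking), assuming a hypothetical 10x matrix directory; all plotting and template handling from the method above is omitted:

import scanpy as sc

adata = sc.read_10x_mtx('/path/to/filtered_feature_bc_matrix',  # hypothetical 10x matrix dir
                        var_names='gene_symbols', cache=True)
adata.var_names_make_unique()
sc.pp.filter_cells(adata, min_genes=200)          # basic QC thresholds, tune per dataset
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_per_cell(adata)                   # library-size normalisation, as in the method above
sc.pp.log1p(adata)
adata.raw = adata
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var['highly_variable']]
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.tl.louvain(adata)                              # requires the python-louvain package
sc.tl.rank_genes_groups(adata, 'louvain', method='t-test')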
Example #27
0
    def tearDown(self):
        remove_dir(self.tmp_dir)
Example #28
0
    def tearDown(self):
        if os.path.exists(self.fastq_dir):
            remove_dir(self.fastq_dir)
disk_path = args.disk_path
copy_to_remoter = args.copy_to_remoter
remote_server = args.remote_server
output_path = args.output_path

try:
    if copy_to_remoter and not remote_server:
        parser.print_help()
        raise ValueError(
            'Remote server address is required for copying files.')

    storage_stats = get_storage_stats_in_gb(
        disk_path)  # calculate disk usage stats
    temp_dir = get_temp_dir()
    temp_file = os.path.join(temp_dir, 'disk_usage.json')  # get temp file path
    with open(temp_file, 'w') as j_data:
        json.dump(storage_stats, j_data,
                  indent=4)  # write disk usage to a temp json file

    if copy_to_remoter:
        copy_remote_file(source_path=temp_file,
                         destinationa_path=output_path,
                         destination_address=remote_server
                         )  # copy json file to remote server
    else:
        shutil.copy2(temp_file, output_path)  # copy json file to local server

    remove_dir(temp_dir)  # remove temp dir
except Exception as e:
    print('Error: {0}'.format(e))
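get_storage_stats_in_gb is a library helper that is not shown here; the following is a rough standalone equivalent using shutil.disk_usage, purely as an illustration of the kind of dictionary that gets dumped to JSON above, not the library's actual implementation:

import json
import shutil

def storage_stats_in_gb_sketch(disk_paths):
    # illustration only, not the implementation of get_storage_stats_in_gb
    stats = list()
    for path in disk_paths:
        usage = shutil.disk_usage(path)
        stats.append({
            'disk_path': path,
            'total_gb': round(usage.total / 1024 ** 3, 2),
            'used_gb': round(usage.used / 1024 ** 3, 2),
            'free_gb': round(usage.free / 1024 ** 3, 2)})
    return stats

print(json.dumps(storage_stats_in_gb_sketch(['/']), indent=4))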
Example #30
0
    def nbconvert_singularity(self, singularity_image_path, dry_run=False):
        '''
    A method for generating notebook from template and executing in singularity container

    :param singularity_image_path: A singularity image path
    :param dry_run: A toggle for dry run, default False
    :returns: A response str from singularity, run command and a dictionary of output params for dataflow
    '''
        try:
            output_params = dict()
            new_input_map = \
              self._substitute_input_path_and_copy_files_to_tempdir()                 # get modified input map and copy files to the mount dir
            if not isinstance(new_input_map, dict):
                raise TypeError("Expecting a dictionary and got {0}".\
                                  format(type(new_input_map)))
            date_stamp = self._get_date_stamp()  # get date stamp
            new_input_map.\
              update({self.date_tag:date_stamp})                                      # update input map with datestamp
            temp_notebook = \
              self._generate_ipynb_from_template(param_map=new_input_map)             # generate new notebook after param substitution
            container_notebook_path = \
              os.path.join(
                self.container_dir_prefix,
                os.path.basename(temp_notebook))
            args_list = [
                'jupyter', 'nbconvert',
                '{0}'.format(quote(container_notebook_path)),
                '--to={0}'.format(quote(self.output_format)), '--execute',
                '--ExecutePreprocessor.enabled=True',
                '--ExecutePreprocessor.timeout={0}'.format(
                    quote(str(self.timeout))),
                '--ExecutePreprocessor.kernel_name={0}'.format(
                    quote(self.kernel))
            ]  # prepare notebook cmd for run
            if self.allow_errors:
                args_list.append('--allow-errors')  # run notebooks with errors
            try:
                res = None
                res, run_cmd = \
                  singularity_run(
                    image_path=singularity_image_path,
                    path_bind=self.temp_dir,
                    use_ephemeral_space=self.use_ephemeral_space,
                    args_list=args_list,
                    dry_run=dry_run)                                                    # run notebook in singularity container
            except Exception as e:
                raise ValueError(
                        "Failed to run jupyter command in singularity, error {0}, response: {1}".\
                          format(e,res))
            if dry_run:
                return res, run_cmd, output_params  # test singularity cmd
            else:
                output_params = \
                  self._copy_container_output_and_update_map(
                    temp_notebook_path=temp_notebook)                                   # move files to output dir
                remove_dir(self.temp_dir)  # clean up temp dir
                return res, run_cmd, output_params

        except Exception as e:
            raise ValueError("Failed to execute notebook in singularity container, error: {0}".\
                               format(e))
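Both notebook runners above hand the same nbconvert invocation to singularity_run. A sketch of just that argument list, with a hypothetical notebook path inside the container bind mount, which can be printed for inspection or reused outside the container wrapper:

from shlex import quote

container_notebook_path = '/tmp/analysis_template.ipynb'   # hypothetical path inside the bind mount
output_format = 'html'
timeout = 600
kernel = 'python3'
allow_errors = False

args_list = [
    'jupyter', 'nbconvert', quote(container_notebook_path),
    '--to={0}'.format(quote(output_format)), '--execute',
    '--ExecutePreprocessor.enabled=True',
    '--ExecutePreprocessor.timeout={0}'.format(quote(str(timeout))),
    '--ExecutePreprocessor.kernel_name={0}'.format(quote(kernel))]
if allow_errors:
    args_list.append('--allow-errors')
print(' '.join(args_list))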