Example #1
def _generate_fastq_with_casava(fc_dir, config, r1=False):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowecell using CASAVA (>1.8).
    """
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    unaligned_dir = os.path.join(fc_dir, "Unaligned")
    samplesheet_file = samplesheet.run_has_samplesheet(fc_dir, config)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)
    
    # Write to log files
    configure_out = os.path.join(fc_dir, "configureBclToFastq.out")
    configure_err = os.path.join(fc_dir, "configureBclToFastq.err")
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])
    if im_stats:
        cl.append("--ignore-missing-stats")
    if im_bcl:
        cl.append("--ignore-missing-bcl")
    if im_control:
        cl.append("--ignore-missing-control")
    
    bm = _get_bases_mask(fc_dir)
    if bm is not None:
        cl.extend(["--use-bases-mask", bm])

    if r1:
        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        
        with open(configure_out, 'w') as co, open(configure_err, 'w') as ce:
            try:
                subprocess.check_call(cl, stdout=co, stderr=ce)
            except subprocess.CalledProcessError as e:
                logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED "
                              "(exit code {}), please check log files {:s}, {:s}".format(fc_dir,
                                                                                         e.returncode,
                                                                                         configure_out,
                                                                                         configure_err))
                raise
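The configure step above follows a common pattern: build the command as an argument list, then run it with stdout/stderr redirected to per-step log files. A minimal standalone sketch of that pattern (the helper name and paths are illustrative, not part of the pipeline):

import subprocess

def run_logged(cl, out_path, err_path):
    """Run a command list, sending stdout/stderr to log files."""
    with open(out_path, "w") as out, open(err_path, "w") as err:
        # Record the exact command line at the top of the log
        out.write(" ".join(cl) + "\n")
        out.flush()
        subprocess.check_call(cl, stdout=out, stderr=err)

# Hypothetical usage:
# run_logged(["echo", "hello"], "step.out", "step.err")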
Example #2
def _log_messages(self, log_handler, subject="Test email"):
    try:
        with log_handler.applicationbound():
            with logbook.Processor(lambda record: record.extra.__setitem__('run', subject)):
                logger2.debug("DEBUG record test generated @ %s" % time.strftime("%x - %X"))
                logger2.info("INFO record test generated @ %s" % time.strftime("%x - %X"))
                logger2.notice("NOTICE record test generated @ %s" % time.strftime("%x - %X"))
                logger2.warning("WARNING record test generated @ %s" % time.strftime("%x - %X"))
                logger2.error("ERROR record test generated @ %s" % time.strftime("%x - %X"))
                logger2.critical("CRITICAL record test generated @ %s" % time.strftime("%x - %X"))
    except Exception as e:
        return e
    return None
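logbook's Processor mutates each record as it is emitted; here it tags records with a 'run' field taken from the email subject so downstream handlers can group them. A self-contained sketch of the same technique, assuming only the logbook package (the logger name and inject_run callback are made up for illustration):

import sys
import logbook

logger = logbook.Logger("demo")

def inject_run(record):
    # Attach an extra attribute to every record emitted in this scope;
    # handlers can reference it via record.extra["run"]
    record.extra["run"] = "demo-run"

with logbook.StreamHandler(sys.stdout).applicationbound():
    with logbook.Processor(inject_run):
        logger.info("tagged message")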
Example #3
def simple_upload(remote_info, data):
    """Upload generated files to specified host using rsync
    """
    include = ['--include=*/']
    for fcopy in data['to_copy']:
        include.extend(["--include", "{}**/*".format(fcopy)])
        include.append("--include={}".format(fcopy))
    # By including both these patterns we get the entire directory
    # if a directory is given, or a single file if a single file is
    # given.

    cl = ["rsync", \
          "--checksum", \
          "--recursive", \
          "--links", \
          "-D", \
          "--partial", \
          "--progress", \
          "--prune-empty-dirs"
          ]

    # file / dir inclusion specification
    cl.extend(include)
    cl.append("--exclude=*")

    # source and target
    cl.extend([
        # source
        data["directory"],
        # target
        "{store_user}@{store_host}:{store_dir}".format(**remote_info)
    ])

    logdir = remote_info.get("log_dir", os.getcwd())
    rsync_out = os.path.join(logdir, "rsync_transfer.out")
    rsync_err = os.path.join(logdir, "rsync_transfer.err")
    with open(rsync_out, 'a') as ro, open(rsync_err, 'a') as re_fh:
        try:
            # Log the exact command line before running it
            ro.write("-----------\n{}\n".format(" ".join(cl)))
            re_fh.write("-----------\n{}\n".format(" ".join(cl)))
            ro.flush()
            re_fh.flush()
            subprocess.check_call(cl, stdout=ro, stderr=re_fh)
        except subprocess.CalledProcessError as e:
            logger2.error("rsync transfer of {} FAILED (exit code {}). "
                          "Please check log files {:s} and {:s}".format(data["directory"],
                                                                        e.returncode,
                                                                        rsync_out,
                                                                        rsync_err))
            raise
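A hypothetical invocation showing the dictionary shapes simple_upload expects; every host and path below is a placeholder:

remote_info = {
    "store_user": "deploy",                 # rsync target account
    "store_host": "storage.example.org",    # rsync target host
    "store_dir": "/archive/runs",           # rsync target directory
    "log_dir": "/var/log/transfers",        # optional; defaults to os.getcwd()
}
data = {
    "directory": "/data/runs/130401_FC001",   # rsync source
    "to_copy": ["Unaligned", "RunInfo.xml"],  # dirs/files to include
}
simple_upload(remote_info, data)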
Example #4
def _write_to_worksheet(client, ssheet, wsheet_title, rows, header, append, keys=None):
    """Generic method to write a set of rows to a worksheet on google docs.
    """
    # Avoid a mutable default argument
    keys = keys or []

    # Convert the worksheet title to unicode
    wsheet_title = _to_unicode(wsheet_title)

    # Add a new worksheet, possibly appending or replacing a pre-existing
    # worksheet according to the append-flag.
    wsheet = g_spreadsheet.add_worksheet(client,
                                         ssheet,
                                         wsheet_title,
                                         len(rows) + 1,
                                         len(header),
                                         append)
    if wsheet is None:
        logger2.error("ERROR: Could not add a worksheet {!r} to " \
            "spreadsheet {!r}".format(wsheet_title, ssheet.title.text))
        return False
    
    # If keys are specified (they correspond to indexes in the header),
    # delete pre-existing rows with matching keys
    if append and len(keys) > 0:
        wsheet_data = g_spreadsheet.get_cell_content(client, ssheet, wsheet, '2')
        wsheet_header = g_spreadsheet.get_header(client, ssheet, wsheet)
        try:
            wsheet_indexes = [wsheet_header.index(key) for key in keys]
            header_indexes = [header.index(key) for key in keys]
        except ValueError:
            logger2.warn("WARNING: Could not identify correct header for duplicate detection")
        else:
            for row in rows:
                try:
                    key = "#".join([row[i] for i in header_indexes])        
                    for i, wrow in enumerate(wsheet_data):
                        wkey = "#".join([wrow[j] for j in wsheet_indexes])
                        if wkey == key:
                            g_spreadsheet.delete_row(client, ssheet, wsheet, i+1)
                            wsheet_data.pop(i)
                            break
                except Exception:
                    logger2.warn("WARNING: Could not identify/replace duplicate rows")

    # Write the data to the worksheet
    success = g_spreadsheet.write_rows(client, ssheet, wsheet, header, rows)
    if success:
        logger2.info("Wrote data to the {!r}:{!r} " \
                     "worksheet".format(ssheet.title.text, wsheet_title))
    else:
        logger2.error("ERROR: Could not write data to the {!r}:{!r} " \
                      "worksheet".format(ssheet.title.text, wsheet_title))
    return success
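The duplicate detection above compares composite keys built by joining selected columns with "#". The same idea as a self-contained sketch with made-up data:

def composite_key(row, indexes, sep="#"):
    """Join the values at the given column indexes into one comparable key."""
    return sep.join(row[i] for i in indexes)

header = ["sample", "lane", "reads"]
existing = [["S1", "1", "1000"], ["S2", "2", "2000"]]
incoming = [["S1", "1", "1500"]]

key_cols = [header.index(k) for k in ["sample", "lane"]]
new_keys = {composite_key(r, key_cols) for r in incoming}
# Keep only rows whose key is not being replaced by an incoming row
existing = [r for r in existing if composite_key(r, key_cols) not in new_keys]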
Example #5
def snpeff_effects(vcf_in, genome, config):
    """Prepare tab-delimited file for variant effects using snpEff.
    """
    interval_file = config["algorithm"].get("hybrid_target", None)
    if _vcf_has_items(vcf_in) and config["algorithm"].get(
            "variation_effects", True):
        se_interval = (_convert_to_snpeff_interval(interval_file, vcf_in)
                       if interval_file else None)
        try:
            snpeff_data_dir = os.path.join(config["program"]["snpEff"], "data")
            snpeff_genome_remap = config.get("resources", {}).get(
                "snpEff", {}).get("genome_remap", SNPEFF_GENOME_REMAP)
            if genome not in snpeff_genome_remap:
                msg = ("The genome {} is not present in the SnpEff "
                       "genome dictionary.".format(genome))
                log.error(msg)
                raise AssertionError(msg)
            for snpeff_genome in snpeff_genome_remap[genome]:
                if os.path.exists(os.path.join(snpeff_data_dir,
                                               snpeff_genome)):
                    break
            vcf_file = _run_snpeff(vcf_in, snpeff_genome, se_interval, "vcf",
                                   config)
            effects_file = _run_snpeff(vcf_in, snpeff_genome, se_interval,
                                       "txt", config)
        finally:
            for fname in [se_interval]:
                if fname and os.path.exists(fname):
                    os.remove(fname)
        return vcf_file, effects_file
    else:
        return None, None
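SNPEFF_GENOME_REMAP maps pipeline genome names to candidate snpEff database names, and the loop selects the first candidate whose data directory has been downloaded. A sketch with hypothetical entries (the real mapping is defined elsewhere in the module):

import os

# Hypothetical entries; the real table ships with the pipeline.
SNPEFF_GENOME_REMAP = {
    "hg19": ["GRCh37.75", "hg19"],
    "mm9": ["NCBIM37.66"],
}

def pick_snpeff_genome(genome, snpeff_data_dir):
    for candidate in SNPEFF_GENOME_REMAP[genome]:
        if os.path.exists(os.path.join(snpeff_data_dir, candidate)):
            return candidate
    # Fall back to the last candidate, mirroring the loop above
    return SNPEFF_GENOME_REMAP[genome][-1]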
Example #6
def process_second_read(*args, **kwargs):
    """Processing to be performed after all reads have been sequenced
    """
    dname, config = args[0:2]
    logger2.info("The instrument has finished dumping on directory %s" % dname)

    utils.touch_indicator_file(os.path.join(dname, "second_read_processing_started.txt"))
    _update_reported(config["msg_db"], dname)
    fastq_dir = None

    # Do bcl -> fastq conversion and demultiplexing using Casava1.8+
    if kwargs.get("casava", False):
        if not kwargs.get("no_casava_processing", False):
            logger2.info("Generating fastq.gz files for {:s}".format(dname))
            _generate_fastq_with_casava(dname, config)
            # Merge demultiplexing results into a single Unaligned folder
            utils.merge_demux_results(dname)
            # Move the demultiplexing results
            if 'mfs_dir' in config:
                fc_id = os.path.basename(dname)
                cl = ["rsync",
                      "--checksum",
                      "--recursive",
                      "--links",
                      "-D",
                      "--partial",
                      "--progress",
                      "--prune-empty-dirs",
                      os.path.join(dname, 'Unaligned'),
                      os.path.join(config.get('mfs_dir'), fc_id)]
                logger2.info("Synching Unaligned folder to MooseFS for run {}".format(fc_id))
                logdir = os.path.join(config.get('log_dir'), os.getcwd())
                rsync_out = os.path.join(logdir,"rsync_transfer.out")
                rsync_err = os.path.join(logdir,"rsync_transfer.err")

                with open(rsync_out, 'a') as ro, open(rsync_err, 'a') as re_fh:
                    try:
                        ro.write("-----------\n{}\n".format(" ".join(cl)))
                        re_fh.write("-----------\n{}\n".format(" ".join(cl)))
                        subprocess.check_call(cl, stdout=ro, stderr=re_fh)
                    except subprocess.CalledProcessError:
                        logger2.error("rsync transfer of Unaligned results FAILED")
Example #7
def initial_processing(*args, **kwargs):
    """Initial processing to be performed after the first base report
    """
    dname, config = args[0:2]
    # Touch the indicator flag that processing of read1 has been started
    utils.touch_indicator_file(os.path.join(dname, "initial_processing_started.txt"))

    # Copy the samplesheet to the run folder
    ss_file = samplesheet.run_has_samplesheet(dname, config)
    if ss_file:
        dst = os.path.join(dname, os.path.basename(ss_file))
        try:
            copyfile(ss_file, dst)
        except IOError as e:
            logger2.error("Error copying samplesheet {} from {} to {}: {}".format(
                os.path.basename(ss_file),
                os.path.dirname(ss_file),
                os.path.dirname(dst),
                e))
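samplesheet.run_has_samplesheet locates the CSV samplesheet for a run. A hypothetical sketch of such a lookup, assuming the flowcell ID appears in the run-folder name; the real helper's search rules may differ:

import glob
import os

def find_samplesheet(run_dir, search_dirs=()):
    """Return the first CSV samplesheet found for a run, or None."""
    # Assumes standard run-folder naming, with the flowcell ID last
    fc_id = os.path.basename(run_dir).split("_")[-1]
    for d in (run_dir,) + tuple(search_dirs):
        hits = glob.glob(os.path.join(d, "*{}*.csv".format(fc_id)))
        if hits:
            return sorted(hits)[0]
    return None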
Example #8
        if r1:
            cl.append("r1")

        logger2.info("Demultiplexing and converting bcl to fastq.gz")
        logger2.debug(cl)
        
        with open(casava_out, 'w') as co, open(casava_err, 'w') as ce:
            try:
                subprocess.check_call(cl, stdout=co, stderr=ce)
            except subprocess.CalledProcessError as e:
                logger2.error("BCL to Fastq conversion for {:s} FAILED "
                              "(exit code {}), please check log files {:s}, "
                              "{:s}".format(fc_dir,
                                            e.returncode,
                                            casava_out,
                                            casava_err))
                raise
            
    logger2.debug("Done")
    return unaligned_dir

def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
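get_flowcell_info splits the flowcell name and date out of the run directory. A hypothetical sketch assuming the standard Illumina naming convention YYMMDD_<machine>_<run>_<flowcell>; the pipeline's own parser may handle more cases:

import os

def get_flowcell_info(fc_dir):
    """Parse (fc_name, fc_date) from a run folder named YYMMDD_MACHINE_RUN_FLOWCELL."""
    parts = os.path.basename(fc_dir.rstrip("/")).split("_")
    fc_date, fc_name = parts[0], parts[-1]
    return fc_name, fc_date

# get_flowcell_info("/data/130401_SN1001_0123_AC1234ACXX") -> ("AC1234ACXX", "130401")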
Example #9
def _generate_fastq_with_casava_task(args):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowecell using CASAVA (>1.8).
    """
    bp = args.get('bp')
    samples_group = args.get('samples')
    base_mask = samples_group['base_mask']
    samples = samples_group['samples']
    fc_dir = args.get('fc_dir')
    config = args.get('config')
    r1 = args.get('r1', False)
    idx_only = args.get('idx_only', False)
    ss = 'SampleSheet_{bp}bp.csv'.format(bp=bp)
    unaligned_folder = 'Unaligned_{bp}bp'.format(bp=bp)
    out_file = 'configureBclToFastq_{bp}bp.out'.format(bp=bp)
    err_file = 'configureBclToFastq_{bp}bp.err'.format(bp=bp)

    # Prepare CL arguments and call configureBclToFastq
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    out_dir = config.get("out_directory", fc_dir)
    # Append the flowcell dir to the output directory if different from the run dir
    if out_dir != fc_dir:
        out_dir = os.path.join(out_dir, os.path.basename(fc_dir))
    unaligned_dir = os.path.join(out_dir, unaligned_folder)
    samplesheet_file = os.path.join(fc_dir, ss)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)

    # Write to log files
    configure_out = os.path.join(fc_dir, out_file)
    configure_err = os.path.join(fc_dir, err_file)
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])

    if im_stats:
        cl.append("--ignore-missing-stats")

    if im_bcl:
        cl.append("--ignore-missing-bcl")

    if im_control:
        cl.append("--ignore-missing-control")

    if base_mask is not None:
        cl.extend(["--use-bases-mask", ','.join(base_mask)])
    if r1:
        cl.append("--force")

    if r1 or idx_only:
        # Create a separate samplesheet and folder for this base mask
        with open(os.path.join(fc_dir, ss), 'w') as fh:
            writer = csv.DictWriter(fh, fieldnames=samples['fieldnames'], dialect='excel')
            writer.writeheader()
            writer.writerows(samples['samples'])

        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)

        co = open(configure_out, 'w')
        ce = open(configure_err, 'w')
        try:
            co.write("{}\n".format(" ".join(cl)))
            ce.write("{}\n".format(" ".join(cl)))
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED " \
                          "(exit code {}), please check log files {:s}, {:s}".format(fc_dir,
                                                                                     e.returncode,
                                                                                     configure_out,
                                                                                     configure_err))
            raise
        finally:
            co.close()
            ce.close()
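The per-base-mask samplesheet above is written with csv.DictWriter. A self-contained sketch of that step, with made-up field names and sample records:

import csv

fieldnames = ["FCID", "Lane", "SampleID", "Index"]
samples = [
    {"FCID": "AC1234ACXX", "Lane": "1", "SampleID": "S1", "Index": "ACGTAC"},
    {"FCID": "AC1234ACXX", "Lane": "2", "SampleID": "S2", "Index": "TGCATG"},
]

with open("SampleSheet_6bp.csv", "w") as fh:
    writer = csv.DictWriter(fh, fieldnames=fieldnames, dialect="excel")
    writer.writeheader()
    writer.writerows(samples)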
Example #10
        except IOError as e:
            logger2.error("Error copying samplesheet {} from {} to {}: {}".format(
                os.path.basename(ss_file),
                os.path.dirname(ss_file),
                os.path.dirname(dst),
                e))
    # If this is a MiSeq run and we have the scilifelab modules loaded,
    # convert the MiSeq samplesheet into a format compatible with casava
    elif _is_miseq_run(dname):
        if 'scilifelab.illumina.miseq' in sys.modules:
            mrun = MiSeqRun(dname)
            hiseq_ssheet = os.path.join(dname, '{}.csv'.format(_get_flowcell_id(dname)))
            mrun.write_hiseq_samplesheet(hiseq_ssheet)
        # If the module wasn't loaded, there's nothing we can do, so warn
        else:
            logger2.error("The necessary dependencies for processing MiSeq runs with CASAVA could not be loaded")

    # Upload the necessary files
    loc_args = args + (None, )
    _post_process_run(*loc_args, **{"fetch_msg": kwargs.get("fetch_msg", False),
                                    "process_msg": False,
                                    "store_msg": kwargs.get("store_msg", False),
                                    "backup_msg": kwargs.get("backup_msg", False),
                                    "push_data": kwargs.get("push_data", False)})

    # Touch the indicator flag that processing of read1 has been completed
    utils.touch_indicator_file(os.path.join(dname, "initial_processing_completed.txt"))


def process_first_read(*args, **kwargs):
    """Processing to be performed after the first read and the index reads