Example #1
def process_first_read(*args, **kwargs):
    """Processing to be performed after the first read and the index reads
    have been sequenced
    """
    
    dname, config = args[0:2]
    # Do bcl -> fastq conversion and demultiplexing using Casava1.8+
    if kwargs.get("casava",False):
        logger2.info("Generating fastq.gz files for read 1 of {:s}".format(dname))
        
        # Touch the indicator flag that processing of read1 has been started
        utils.touch_indicator_file(os.path.join(dname,"first_read_processing_started.txt"))
        unaligned_dir = _generate_fastq_with_casava(dname, config, r1=True)
        logger2.info("Done generating fastq.gz files for read 1 of {:s}".format(dname))
        
        # Extract the top barcodes from the undemultiplexed fraction
        if config["program"].get("extract_barcodes",None):
            extract_top_undetermined_indexes(dname,
                                             unaligned_dir,
                                             config)
            
        loc_args = args + (unaligned_dir,)
        _post_process_run(*loc_args, **{"fetch_msg": True,
                                        "process_msg": False,
                                        "store_msg": kwargs.get("store_msg",False),
                                        "backup_msg": False})
        
        # Touch the indicator flag that processing of read1 has been completed
        utils.touch_indicator_file(os.path.join(dname,"first_read_processing_completed.txt"))
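
The run state above is tracked with empty indicator files. A minimal sketch of what a helper like utils.touch_indicator_file could look like (assumed behavior, not necessarily the pipeline's actual implementation):

import os


def touch_indicator_file(path):
    """Create an empty flag file, and its directory, if it does not already exist.

    Hypothetical stand-in for utils.touch_indicator_file; the real helper may differ.
    """
    dname = os.path.dirname(path)
    if dname and not os.path.exists(dname):
        os.makedirs(dname)
    if not os.path.exists(path):
        open(path, "w").close()
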
Example #2
def align(fastq_file,
          pair_file,
          ref_file,
          out_base,
          align_dir,
          config,
          extra_args=None,
          rg_name=None):
    """Align with novoalign.
    """
    out_file = os.path.join(align_dir, "{0}.sam".format(out_base))
    if not file_exists(out_file):
        cl = [config["program"].get("novoalign", "novoalign")]
        cl += _novoalign_args_from_config(config)
        cl += extra_args if extra_args is not None else []
        cl += ["-o", "SAM"]
        if rg_name:
            cl.append(r"@RG\tID:{0}".format(rg_name))
        cl += ["-d", ref_file, "-f", fastq_file]
        if pair_file:
            cl.append(pair_file)
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                logger.info(" ".join([str(x) for x in cl]))
                subprocess.check_call([str(x) for x in cl], stdout=out_handle)
    return out_file
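
Several of these examples write their output through file_transaction. A simplified, single-file sketch of that transactional pattern (the real bcbio helper also supports multiple output files and configurable temporary directories):

import contextlib
import os
import shutil
import tempfile


@contextlib.contextmanager
def file_transaction(out_file):
    """Yield a temporary path and move it to the final location only on success.

    Work is done against a temp file, so an interrupted run does not leave a
    partial output behind that would be mistaken for a finished file.
    """
    tmp_dir = tempfile.mkdtemp()
    tx_file = os.path.join(tmp_dir, os.path.basename(out_file))
    try:
        yield tx_file
        if os.path.exists(tx_file):
            shutil.move(tx_file, out_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
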
Example #3
def remove_contaminants(fastq_file, pair_file, ref_file, out_base, fastq_dir, config,
                        extra_args=None, rg_name=None):
    """Remove reads aligning to the contaminating reference genome
    """

    out_root = os.path.join(fastq_dir, out_base)
    out_files = ["%s_1.ext" % out_root,
                 "%s_2.ext" % out_root,
                 "%s.filter_metrics" % out_root]
    suffix = "_fastq.txt"

    if not len(glob.glob("%s_[12]%s" % (out_root, suffix))) > 0:
        with file_transaction(out_files) as (tx_out_file1, tx_out_file2, tx_metrics_file):
            out = tx_out_file1
            if pair_file:
                out = out.replace("_1.ext", ".ext")

            cl = [config["program"]["bowtie"]]
            cl += _bowtie_args_from_config(config)
            cl += extra_args if extra_args is not None else []
            # Allow for read pairs mapping at opposite ends of e.g. the phiX genome
            # Trim 7bp from 3'end corresponding to the barcode
            cl += ["--best", "-X", "6000", "-3", "7"]
            cl += ["--un", out, ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]

            else:
                cl += [fastq_file]

            cl += ["/dev/null"]
            cl = [str(i) for i in cl]

            # Get the output, echo it as well as write it to the metrics file
            output = subprocess.check_output(cl, stderr=subprocess.STDOUT)
            log.info(output)

            with open(tx_metrics_file, "w") as fh:
                fh.write("%s\n" % str(output))

        dest_files = []
        for i, out_file in enumerate(out_files):
            if not out_file.endswith(".ext"):
                continue

            if not os.path.exists(out_file):
                if i == 1 and not pair_file:
                    dest_files.append(pair_file)
                    continue

                open(out_file, "w").close()

            dest_file = out_file.replace(".ext", suffix)
            os.rename(out_file, dest_file)
            dest_files.append(dest_file)
    else:
        dest_files = sorted(glob.glob("%s_[12]%s" % (out_root,suffix)))
    dest_files.append(out_base)

    return dest_files
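
_bowtie_args_from_config is not shown in this listing. A hypothetical sketch of such a helper; -p and --phred64-quals are real bowtie options, but the config keys read here are assumptions:

def _bowtie_args_from_config(config):
    """Collect extra bowtie options from the config (illustrative sketch only)."""
    algorithm = config.get("algorithm", {})
    opts = ["-p", str(algorithm.get("num_cores", 1))]
    if algorithm.get("quality_format", "").lower() == "illumina":
        # Older Illumina runs use phred+64 encoded qualities
        opts.append("--phred64-quals")
    return opts
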
Example #4
def remove_contaminants(fastq1, fastq2, info, lane_name, lane_desc, dirs,
                        config):
    """Remove reads mapping to the specified contaminating reference.
    """

    base_name = None
    genome_build = info.get("genomes_filter_out", None)
    # Skip filtering of phix in case we have already done that for the lane
    # FIXME: This logic is way too complicated..
    #   - If filter_phix is true, phix has been filtered lane-wise and need not be run again
    #   - If demultiplexed is true, lane-wise filtering has been skipped and we need to do it here
    if genome_build is not None and os.path.exists(fastq1) and \
    (genome_build != "phix" or not config["algorithm"].get("filter_phix", False) \
     or config["algorithm"].get("demultiplexed", False)):
        if genome_build == "spiked_phix":
            genome_build = "phix"

        program = config["algorithm"].get("remove_contaminants", "bowtie")
        logger.info("Removing %s contaminants on %s, using %s" \
            % (genome_build, info["description"], program))
        fastq1, fastq2, base_name = rc(fastq1, fastq2, genome_build, program,
                                       lane_name, dirs, config)

    return [[
        fastq1, fastq2, info, (base_name or lane_name), lane_desc, dirs, config
    ]]
Example #5
def test():
    seq = """>lcl||YPD4_1219|ftsK|128205128 putative cell division protein
MSQEYTEDKEVTLKKLSNGRRLLEAVLIVVTILAAYLMVALVSFNPSDPSWSQTAWHEPI
HNLGGSIGAWMADTLFSTFGVLAYAIPPIMVIFCWTAFRQRDASEYLDYFALSLRLIGTL
ALILTSCGLAALNIDDLYYFASGGVIGSLFSNAMLPWFNGVGATLTLLCIWVVGLTLFTG
WSWLVIAEKIGAAVLGSLTFITNRSRREERYDDEDSYHDDDHADGRDITGQEKGVVSNKG
VVSNNAVVGAGVAASSALAHGDDDVLFSAPSVTDSIVEHGSVVATGTETTDTKATDTNDE
YDPLLSPLRATDYSVQDATSSPIADVAVEPVLNHDAAAIYGTTPVMTNTATPPLYSFELP
EESLPIQTHAAPTERPEPKLGAWDMSPTPVSHSPFDFSAIQRPVGQLESRQPGSNQSGSH
QIHSAQSSHISVGNTPYMNPGLDAQIDGLSTTSLTNKPVLASGTVAAATAAAAFMPAFTA
TSDSSSQIKQGIGPELPRPNPVRIPTRRELASFGIKLPSQRMAEQELRERDGDETQNPQM
AASSYGTEITSDEDAALQQAILRKAFADQQSERYALSTLAEQSSITERSPAAEMPTTPSQ
VSDLEDEQALQEAELRQAFAAQQQHRYGATGDTDNAVDNIRSVDTSTAFTFSPIADLVDD
SPREPLFTLSPYVDETDVDEPVQLEGKEESLLQDYPEQVPTYQPPVQQAHLGQSAPTQPS
HTQSTYGQSTYGQSTYGQSTPAPVSQPVVTSASAISTSVTPTSIASLNTAPVSAAPVAPS
PQPPAFSQPTAAMDSLIHPFLMRNDQPLQKPTTPLPTLDLLSSPPAEEEPVDMFALEQTA
RLVEARLGDYRVKAEVVGISPGPVITRFELDLAPGVKASRISNLSRDLARSLSAIAVRVV
EVIPGKPYVGLELPNKHRQTVYLREVLDCAKFRENPSPLAIVLGKDIAGQPVVADLAKMP
HLLVAGTTGSGKSVGVNAMILSILYKATPDDVRFIMIDPKMLELSVYEGIPHLLTGVVTD
MKDAANALRWCVGEMERRYKLMSALGVRNLAGYNERVAQAEAMGRPIPDPFWKPSDSMDI
SPPMLVKLPYIVVMVDEFADLMMTVGKKVEELIARLAQKARAAGIHLVLATQRPSVDVIT
GLIKANIPTRIAFTVSSKIDSRTILDQGGAESLLGMGDMLYMAPNSSIPVRVHGAFVRDQ
EVHAVVNDWKARGRPQYIDSILSGGEEGEGGGLGLDSDEELDPLFDQAVNFVLEKRRASI
SGVQRQFRIGYNRAARIIEQMEAQQIVSTPGHNGNREVLAPPPHE"""
    handle = hmmscan(hmmdb='pfam', seq=seq)
    import json
    j = json.loads(handle.read())
    logging.info(json.dumps(j, sort_keys=True, indent=4))
Example #6
def backup_data(remote_info, config_file):
    """Main entry point for fetching data from sequencer or pre-processing machine.
    """
    config = load_config(config_file)
    logger.info("Backing up run data over to remote storage: %s" %
                config["store_host"])
    _copy_from_sequencer(remote_info, config)
Example #7
def _remote_copy(remote_info, config):
    """Securely copy files from remote directory to the processing server.

    This requires ssh public keys to be setup so that no password entry
    is necessary.
    """
    fc_dir = os.path.join(config["analysis"]["store_dir"],
                          os.path.basename(remote_info['directory']))
    logger.info("Copying analysis files to %s" % fc_dir)
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)

    for fcopy in remote_info['to_copy']:
        target_loc = os.path.join(fc_dir, fcopy)
        if not fabric_files.exists(target_loc):
            target_dir = os.path.dirname(target_loc)
            if not fabric_files.exists(target_dir):
                fabric.run("mkdir -p %s" % target_dir)

            cl = ["scp", "-r", "%s@%s:%s/%s" %
                  (remote_info["user"], remote_info["hostname"],
                   remote_info["directory"], fcopy),
                  target_loc]
            fabric.run(" ".join(cl))

    logger.info("Analysis files copied")

    return fc_dir
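
For reference, the remote_info mapping consumed here needs at least the fields accessed above (user, hostname, directory, to_copy). An illustrative example with made-up values:

# Illustrative only; values are invented, the keys mirror what _remote_copy reads.
remote_info = {
    "user": "sequencer",
    "hostname": "sequencer.example.org",
    "directory": "/data/runs/111007_SN0001_0123_AB0CDEACXX",
    "to_copy": ["RunInfo.xml", "Data/Intensities/BaseCalls"],
}
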
Example #8
def run_freebayes(align_bam,
                  ref_file,
                  config,
                  dbsnp=None,
                  region=None,
                  out_file=None):
    """Detect small polymorphisms with FreeBayes.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]

    if not file_exists(out_file):
        logger.info("Genotyping with FreeBayes: {region} {fname}".format(
            region=region, fname=os.path.basename(align_bam)))
        with file_transaction(out_file) as tx_out_file:
            cl = [
                config["program"].get("freebayes",
                                      "freebayes"), "-b", align_bam, "-v",
                tx_out_file, "-f", ref_file, "--left-align-indels"
            ]
            cl += _freebayes_options_from_config(config["algorithm"])
            if region:
                cl.extend(["-r", region])

            subprocess.check_call(cl)

    return out_file
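
_freebayes_options_from_config is likewise not part of this listing. A hypothetical sketch; --ploidy and --targets are real FreeBayes options, but the config keys and defaults are assumptions:

def _freebayes_options_from_config(aconfig):
    """Map algorithm config onto FreeBayes command line options (illustrative sketch)."""
    opts = ["--ploidy", str(aconfig.get("ploidy", 2))]
    regions = aconfig.get("variant_regions")
    if regions:
        # Restrict calling to a BED file of target regions, if configured
        opts += ["--targets", regions]
    return opts
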
Example #9
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          rg_name=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not file_exists(sam_file):
        if not file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)

        if sai2_file and not file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)

        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config["program"]["bwa"], align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)

        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)

        with file_transaction(sam_file) as tx_sam_file:
            with open(tx_sam_file, "w") as out_handle:
                logger.info(" ".join(sam_cl))
                subprocess.check_call(sam_cl, stdout=out_handle)

    return sam_file
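
The file_exists guard used by the alignment examples is usually stricter than os.path.exists. A sketch of the assumed behavior, where an empty file counts as missing so that interrupted steps are rerun:

import os


def file_exists(fname):
    """Return True only for files that are present and non-empty (assumed behavior)."""
    return os.path.exists(fname) and os.path.getsize(fname) > 0
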
Example #10
def gatk_realigner_targets(runner, align_bam, ref_file, dbsnp=None, region=None, out_file=None, deep_coverage=False):
    """Generate a list of interval regions for realignment around indels.
    """
    if out_file:
        out_file = "%s.intervals" % os.path.splitext(out_file)[0]
    else:
        out_file = "%s-realign.intervals" % os.path.splitext(align_bam)[0]
    # check only for file existence; interval files can be empty after running
    # on small chromosomes, so don't rerun in those cases
    if not os.path.exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            logger.info("GATK RealignerTargetCreator: %s %s" % (os.path.basename(align_bam), region))
            params = ["-T", "RealignerTargetCreator", "-I", align_bam, "-R", ref_file, "-o", tx_out_file, "-l", "INFO"]
            if region:
                params += ["-L", region]

            if dbsnp:
                params += ["--known", dbsnp]

            if deep_coverage:
                params += ["--mismatchFraction", "0.30", "--maxIntervalSize", "650"]

            runner.run_gatk(params)

    return out_file
Example #11
def process_first_read(*args, **kwargs):
    """Processing to be performed after the first read and the index reads
    have been sequenced
    """
    dname, config = args[0:2]
    # Do bcl -> fastq conversion and demultiplexing using Casava1.8+
    if kwargs.get("casava", False):
        if not kwargs.get("no_casava_processing", False):
            logger2.info("Generating fastq.gz files for read 1 of {:s}".format(dname))

            # Touch the indicator flag that processing of read1 has been started
            utils.touch_indicator_file(os.path.join(dname, "first_read_processing_started.txt"))
            unaligned_dirs = _generate_fastq_with_casava(dname, config, r1=True)
            logger2.info("Done generating fastq.gz files for read 1 of {:s}".format(dname))

            # Extract the top barcodes from the undemultiplexed fraction
            for unaligned_dir in unaligned_dirs:
                if config["program"].get("extract_barcodes", None):
                    extract_top_undetermined_indexes(dname, unaligned_dir, config)

        for unaligned_dir in unaligned_dirs:
            unaligned_dir = os.path.join(dname, "Unaligned")
            loc_args = args + (unaligned_dir,)
            _post_process_run(*loc_args, **{"fetch_msg": kwargs.get("fetch_msg", False),
                                            "process_msg": False,
                                            "store_msg": kwargs.get("store_msg", False),
                                            "backup_msg": kwargs.get("backup_msg", False),
                                            "push_data": kwargs.get("push_data", False)})

        # Touch the indicator flag that processing of read1 has been completed
        utils.touch_indicator_file(os.path.join(dname, "first_read_processing_completed.txt"))
Example #12
def _remote_copy(remote_info, config):
    """Securely copy files from remote directory to the processing server.

    This requires ssh public keys to be setup so that no password entry
    is necessary.
    """
    fc_dir = os.path.join(config["analysis"]["store_dir"],
                          os.path.basename(remote_info['directory']))
    logger.info("Copying analysis files to %s" % fc_dir)
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)

    for fcopy in remote_info['to_copy']:
        target_loc = os.path.join(fc_dir, fcopy)
        if not fabric_files.exists(target_loc):
            target_dir = os.path.dirname(target_loc)
            if not fabric_files.exists(target_dir):
                fabric.run("mkdir -p %s" % target_dir)

            cl = [
                "scp", "-r",
                "%s@%s:%s/%s" % (remote_info["user"], remote_info["hostname"],
                                 remote_info["directory"], fcopy), target_loc
            ]
            fabric.run(" ".join(cl))

    logger.info("Analysis files copied")

    return fc_dir
Example #13
def screen_sample_contaminants(data):
    """Screen the sample fastq files for contaminants
    """
    if data["config"]["algorithm"]["screen_contaminants"]:
        logger.info("Screening for contaminants on sample %s with genome %s" \
        % (str(data["name"]), str(data["genome_build"])))
        screen_for_contamination(data["fastq1"], data["fastq2"],
                                 data["config"])
Example #14
def _process_samplesheets(dname, config):
    """Process Illumina samplesheets into YAML files for post-processing.
    """
    ss_file = samplesheet.run_has_samplesheet(dname, config)
    if ss_file:
        out_file = os.path.join(dname, "run_info.yaml")
        logger2.info("CSV Samplesheet %s found, converting to %s" % (ss_file, out_file))
        samplesheet.csv2yaml(ss_file, out_file)
Example #15
def _process_samplesheets(dname, config):
    """Process Illumina samplesheets into YAML files for post-processing.
    """
    ss_file = samplesheet.run_has_samplesheet(dname, config)
    if ss_file:
        out_file = os.path.join(dname, "run_info.yaml")
        logger2.info("CSV Samplesheet %s found, converting to %s" %
                     (ss_file, out_file))
        samplesheet.csv2yaml(ss_file, out_file)
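
A very rough, hypothetical sketch of a CSV-to-YAML conversion in the spirit of samplesheet.csv2yaml; the real function applies Illumina-specific column mapping and per-lane grouping rather than a plain dump:

import csv

import yaml


def csv2yaml_sketch(csv_file, out_file):
    """Dump the samplesheet rows as YAML (illustrative sketch, not the real converter)."""
    with open(csv_file) as in_handle:
        rows = [dict(row) for row in csv.DictReader(in_handle)]
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(rows, out_handle, default_flow_style=False)
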
Example #16
def _run_bwa_align(fastq_file, ref_file, out_file, config):
    aln_cl = [config["program"]["bwa"], "aln",
              "-n %s" % config["algorithm"]["max_errors"],
              "-k %s" % config["algorithm"]["max_errors"]]
    aln_cl += _bwa_args_from_config(config)
    aln_cl += [ref_file, fastq_file]
    with open(out_file, "w") as out_handle:
        logger.info(" ".join(aln_cl))
        subprocess.check_call(aln_cl, stdout=out_handle)
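
A hypothetical sketch of the _bwa_args_from_config helper referenced above; -t and -I are real bwa aln options, but the config keys read here are assumptions:

def _bwa_args_from_config(config):
    """Derive extra bwa aln arguments from the config (illustrative sketch only)."""
    algorithm = config.get("algorithm", {})
    extra = []
    if algorithm.get("num_cores", 1) > 1:
        extra += ["-t", str(algorithm["num_cores"])]
    if algorithm.get("quality_format", "").lower() == "illumina":
        # Input qualities are in the Illumina 1.3+ phred+64 encoding
        extra.append("-I")
    return extra
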
Example #17
 def __init__(self, name, fname, picard, quick=False):
     self.name = name
     self._bam = pysam.Samfile(fname, "rb")
     picard.run_fn("picard_index", fname)
     if quick:
         self._total = 1e6
     else:
         self._total = sum(1 for r in self._bam.fetch() if not r.is_unmapped)
         log.info("{}{}".format(name, self._total))
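
Counting mapped reads by iterating over the whole BAM is slow for large files. Since the Picard indexing step has already run, the BAM index statistics give roughly the same number much faster; a sketch (not part of the original class):

import pysam


def count_mapped_reads(fname):
    """Return the number of mapped alignments from the BAM index statistics.

    Requires the BAM file to be indexed (e.g. via the picard_index step above).
    """
    bam = pysam.Samfile(fname, "rb")
    try:
        return bam.mapped
    finally:
        bam.close()
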
Example #18
def screen_sample_contaminants(data):
    """Screen the sample fastq files for contaminants
    """
    if data["config"]["algorithm"]["screen_contaminants"]:
        logger.info("Screening for contaminants on sample %s with genome %s" \
        % (str(data["name"]), str(data["genome_build"])))
        screen_for_contamination(data["fastq1"],
                                 data["fastq2"],
                                 data["config"])
Example #19
 def read_picard_metrics(self):
     log.info("read_picard_metrics for sample {}, project {}, lane {} in run {}".format(self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
     picard_parser = ExtendedPicardMetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-.*.(align|hs|insert|dup)_metrics".format(self["lane"], self["barcode_id"])
     try:
         files = self.filter_files(pattern)
         metrics = picard_parser.extract_metrics(files)
         self["picard_metrics"] = metrics
     except:
         log.warn("no picard metrics for sample {}".format(self["barcode_name"]))
Example #20
 def __init__(self, name, fname, picard, quick=False):
     self.name = name
     self._bam = pysam.Samfile(fname, "rb")
     picard.run_fn("picard_index", fname)
     if quick:
         self._total = 1e6
     else:
         self._total = sum(1 for r in self._bam.fetch()
                           if not r.is_unmapped)
         log.info("{}{}".format(name, self._total))
Example #21
 def parse_illumina_metrics(self, fullRTA):
     log.info("parse_illumina_metrics")
     fn = []
     for root, dirs, files in os.walk(os.path.abspath(self.path)):
         for file in files:
             if file.endswith(".xml") and ".AppleDouble" not in file:
                 fn.append(os.path.join(root, file))
     parser = IlluminaXMLParser()
     metrics = parser.parse(fn, fullRTA)
     self["illumina"] = metrics
Example #22
 def parse_run_info_yaml(self, run_info_yaml="run_info.yaml"):
     log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
     infile = os.path.join(os.path.abspath(self.path), run_info_yaml)
     try:
         fp = open(infile)
         runinfo = yaml.load(fp)
         fp.close()
         self["run_info_yaml"] = runinfo
     except:
         log.warn("No such file {}".format(infile))
Example #23
 def parse_samplesheet_csv(self):
     log.info("parse_samplesheet_csv: going to read {}.csv in directory {}".format(self["RunInfo"]["Flowcell"][1:], self.path))
     infile = os.path.join(os.path.abspath(self.path), "{}.csv".format(self["RunInfo"]["Flowcell"][1:]))
     try:
         fp = open(infile)
         runinfo = json.dumps([x for x in csv.reader(fp)])
         fp.close()
         self["run_info_csv"] = runinfo
     except:
         log.warn("No such file {}".format(infile))
Example #24
 def _parseRunInfo(self, fn="RunInfo.xml"):
     log.info("_parseRunInfo: going to read RunInfo.xml in directory {}".format(self.path))
     try:
         fp = open(os.path.join(os.path.abspath(self.path), fn))
         parser = RunInfoParser()
         data = parser.parse(fp)
         fp.close()
         self["RunInfo"] = data
     except:
         log.warn("No such file %s" % os.path.join(os.path.abspath(self.path), fn))
Example #25
 def _parseRunInfo(self, fn="RunInfo.xml"):
     log.info("_parseRunInfo: going to read RunInfo.xml in directory {}".format(self.path))
     try:
         fp = open(os.path.join(os.path.abspath(self.path), fn))
         parser = RunInfoParser()
         data = parser.parse(fp)
         fp.close()
         self["RunInfo"] = data
     except:
         log.warn("No such file %s" % os.path.join(os.path.abspath(self.path), fn))
Example #26
 def read_picard_metrics(self):
     log.info("read_picard_metrics for sample {}, project {}, lane {} in run {}".format(self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
     picard_parser = ExtendedPicardMetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-.*.(align|hs|insert|dup)_metrics".format(self["lane"], self["barcode_id"])
     try:
         files = self.filter_files(pattern)
         metrics = picard_parser.extract_metrics(files)
         self["picard_metrics"] = metrics
     except:
         log.warn("no picard metrics for sample {}".format(self["barcode_name"]))
Example #27
 def parse_samplesheet_csv(self):
     log.info("parse_samplesheet_csv: going to read {}.csv in directory {}".format(self["RunInfo"]["Flowcell"][1:], self.path))
     infile = os.path.join(os.path.abspath(self.path), "{}.csv".format(self["RunInfo"]["Flowcell"][1:]))
     try:
         fp = open(infile)
         runinfo = json.dumps([x for x in csv.reader(fp)])
         fp.close()
         self["run_info_csv"] = runinfo
     except:
         log.warn("No such file {}".format(infile))
Example #28
 def parse_run_info_yaml(self, run_info_yaml="run_info.yaml"):
     log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
     infile = os.path.join(os.path.abspath(self.path), run_info_yaml)
     try:
         fp = open(infile)
         runinfo = yaml.load(fp)
         fp.close()
         self["run_info_yaml"] = runinfo
     except:
         log.warn("No such file {}".format(infile))
Example #29
 def parse_illumina_metrics(self, fullRTA):
     log.info("parse_illumina_metrics")
     fn = []
     for root, dirs, files in os.walk(os.path.abspath(self.path)):
         for file in files:
             if file.endswith(".xml") and ".AppleDouble" not in file:
                 fn.append(os.path.join(root, file))
     parser = IlluminaXMLParser()
     metrics = parser.parse(fn, fullRTA)
     self["illumina"] = metrics
Example #30
def _run_bwa_align(fastq_file, ref_file, out_file, config):
    aln_cl = [
        config["program"]["bwa"], "aln",
        "-n %s" % config["algorithm"]["max_errors"],
        "-k %s" % config["algorithm"]["max_errors"]
    ]
    aln_cl += _bwa_args_from_config(config)
    aln_cl += [ref_file, fastq_file]
    with open(out_file, "w") as out_handle:
        logger.info(" ".join(aln_cl))
        subprocess.check_call(aln_cl, stdout=out_handle)
Example #31
def long_term_storage(remote_info, config_file):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be setup so that no password entry
    is necessary. Fabric is used to manage setting up copies on the remote
    storage server.
    """
    config = load_config(config_file)
    logger.info("Copying run data over to remote storage: %s" % config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    _copy_for_storage(remote_info, config)
Example #32
 def parse_bc_metrics(self):
     log.info("parse_bc_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "{}*barcode/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.bc_metrics".format(self["lane"], self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         fp = open(files[0])
         data = parser.parse_bc_metrics(fp)
         fp.close()
         self["bc_metrics"] = data
     except:
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
Example #33
 def parse_fastq_screen(self):
     log.info("parse_fastq_screen for sample {}, lane {} in run {}".format(self["barcode_name"], self["lane"], self["flowcell"]))
     parser = MetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}_[12]_fastq_screen.txt".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     try:
         fp = open(files[0])
         data = parser.parse_fastq_screen_metrics(fp)
         fp.close()
         self["metrics"]["fastq_scr"] = data
     except:
         log.warn("no fastq screen metrics for sample {}".format(self["barcode_name"]))
Example #34
 def parse_fastq_screen(self):
     log.info("parse_fastq_screen for sample {}, lane {} in run {}".format(self["barcode_name"], self["lane"], self["flowcell"]))
     parser = MetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}_[12]_fastq_screen.txt".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     try:
         fp = open(files[0])
         data = parser.parse_fastq_screen_metrics(fp)
         fp.close()
         self["metrics"]["fastq_scr"] = data
     except:
         log.warn("no fastq screen metrics for sample {}".format(self["barcode_name"]))
Example #35
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """

    lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lane_items[0], fc_name, config=config)

    # Filter phiX
    custom_config = _update_config_w_custom(config, lane_items[0])
    if custom_config["algorithm"].get("filter_phix", False):
        # If we are starting from demultiplexed material, we will skip a lane-wise screening
        # Screening will be performed on a sample basis
        if custom_config["algorithm"].get("demultiplexed", False):
            logger.warn("Will not filter phix lane-wise on already demultiplexed files. " \
                "You will have to specify genomes_filter_out option for each sample")

        else:
            logger.info("Filtering phiX from %s" % lane_name)
            info = {"genomes_filter_out": "spiked_phix", "description": lane_name}
            processed = remove_contaminants(full_fastq1, full_fastq2, info, lane_name, info["description"], dirs, custom_config)
            (full_fastq1, full_fastq2, _, lane_name) = processed[0][0:4]

    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)

    out = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        if item["barcode_id"] in bc_files:
            fastq1, fastq2 = bc_files[item["barcode_id"]]
            cur_lane_name = lane_name
            cur_lane_desc = item["description"]
            if item.get("name", "") and config["algorithm"].get("include_short_name", True):
                cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)

            if item["barcode_id"] is not None:
                cur_lane_name += "_%s" % (item["barcode_id"])

            if config["algorithm"].get("trim_reads", False):
                trim_info = brun_trim_fastq([x for x in [fastq1, fastq2] if x is not None],
                                            dirs, config)
                fastq1 = trim_info[0]
                if fastq2 is not None:
                    fastq2 = trim_info[1]

            out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                        dirs, config))

    return out
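
A simplified sketch of what _update_config_w_custom might do, merging per-lane overrides into a copy of the global config; the custom_algorithm key used here is an assumption for illustration only:

import copy


def _update_config_w_custom(config, lane_item):
    """Return a deep copy of config with lane-specific algorithm overrides applied.

    Illustrative sketch; the pipeline's own helper may merge different sections.
    """
    config = copy.deepcopy(config)
    for key, val in lane_item.get("custom_algorithm", {}).items():
        config.setdefault("algorithm", {})[key] = val
    return config
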
Example #36
 def parse_bc_metrics(self):
     log.info("parse_bc_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "{}*barcode/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.bc_metrics".format(self["lane"], self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         fp = open(files[0])
         data = parser.parse_bc_metrics(fp)
         fp.close()
         self["bc_metrics"] = data
     except:
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
Example #37
def _generate_fastq_with_casava(fc_dir, config, r1=False):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    unaligned_dir = os.path.join(fc_dir, "Unaligned")
    samplesheet_file = samplesheet.run_has_samplesheet(fc_dir, config)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats",False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl",False)
    im_control = config["algorithm"].get("ignore-missing-control",False)
    
    # Write to log files
    configure_out = os.path.join(fc_dir,"configureBclToFastq.out")
    configure_err = os.path.join(fc_dir,"configureBclToFastq.err")
    casava_out = os.path.join(fc_dir,"bclToFastq_R{:d}.out".format(2-int(r1)))
    casava_err = os.path.join(fc_dir,"bclToFastq_R{:d}.err".format(2-int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None: cl.extend(["--sample-sheet", samplesheet_file])
    if im_stats: cl.append("--ignore-missing-stats")
    if im_bcl: cl.append("--ignore-missing-bcl")
    if im_control: cl.append("--ignore-missing-control")
    
    bm = _get_bases_mask(fc_dir)
    if bm is not None:
        cl.extend(["--use-bases-mask", bm])

    if r1:
        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        
        co = open(configure_out,'w')
        ce = open(configure_err,'w')
        try:
            subprocess.check_call(cl,stdout=co,stderr=ce)
            co.close()
            ce.close()
        except subprocess.CalledProcessError, e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED " \
                          "(exit code {}), please check log files {:s}, {:s}".format(fc_dir,
                                                                                     str(e.returncode),
                                                                                     configure_out,
                                                                                     configure_err))
            raise e
Example #38
def long_term_storage(remote_info, config_file):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be setup so that no password entry
    is necessary. Fabric is used to manage setting up copies on the remote
    storage server.
    """
    config = load_config(config_file)
    logger.info("Copying run data over to remote storage: %s" %
                config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s" %
                 remote_info)
    _copy_for_storage(remote_info, config)
Example #39
 def parse_bc_metrics(self):
     """Parse bc metrics at sample level"""
     log.info("parse_bc_metrics for sample {}, project {} in flowcell {}".format(self["barcode_name"], self["sample_prj"], self["flowcell"]))
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         fp = open(files[0])
         data = parser.parse_bc_metrics(fp)
         fp.close()
         self["bc_count"] = data[str(self["barcode_id"])]
     except:
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
Example #40
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                      dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, info["genome_build"], \
                                aligner, lane_name, lane_desc, dirs, config)

    return [{"fastq": [fastq1, fastq2], "out_bam": out_bam, "info": info,
             "config": config}]
Example #41
def _record_sw_versions(config, sw_version_file):
    """Get the versions of software used in the pipeline and output to
       log and text file in working directory
    """
    sw_versions = version.get_versions(config)
    sw_versions['bcbb'] = version._get_git_commit()

    logger.info("bcbb pipeline is running with software versions: %s" % sw_versions)

    with open(sw_version_file, 'w') as fh:
        fh.write("%s\n" % datetime.datetime.now().isoformat())
        for sw, ver in sw_versions.items():
            fh.write("%s\t%s\n" % (sw, ver))
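
A hypothetical sketch of a version._get_git_commit style helper; the actual function in the pipeline may locate the repository and format the commit differently:

import subprocess


def _get_git_commit(repo_dir="."):
    """Return the short hash of the current git commit, or 'unknown' on failure."""
    try:
        return subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=repo_dir).decode().strip()
    except (OSError, subprocess.CalledProcessError):
        return "unknown"
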
Example #42
def get_run_info(fc_dir, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        fc_name, fc_date, run_info = _run_info_from_yaml(fc_dir, run_info_yaml)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(fc_dir)
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)

    return fc_name, fc_date, _organize_runs_by_lane(run_info)
Example #43
def recalibrate_sample(data):
    """Recalibrate quality values from aligned sample BAM file.
    """
    logger.info("Recalibrating {} with GATK".format(str(data["name"])))
    if data["config"]["algorithm"]["recalibrate"]:
        recal_bam = recalibrate_quality(data["work_bam"], data["fastq1"],
                                        data["fastq2"], data["sam_ref"],
                                        data["dirs"], data["config"])
        save_diskspace(data["work_bam"], \
                       "Recalibrated to {}".format(recal_bam), data["config"])
        data["work_bam"] = recal_bam

    return [[data]]
Example #44
 def parse_filter_metrics(self, re_str="*filter[_.]metrics"):
     log.info("parse_filter_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "nophix/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(self["lane"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         fp = open(files[0])
         parser = MetricsParser()
         data = parser.parse_filter_metrics(fp)
         fp.close()
         self["filter_metrics"] = data
     except:
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
Example #45
 def parse_filter_metrics(self, re_str="*filter[_.]metrics"):
     log.info("parse_filter_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "nophix/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(self["lane"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         fp = open(files[0])
         parser = MetricsParser()
         data = parser.parse_filter_metrics(fp)
         fp.close()
         self["filter_metrics"] = data
     except:
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
Example #46
 def parse_bc_metrics(self):
     """Parse bc metrics at sample level"""
     log.info("parse_bc_metrics for sample {}, project {} in flowcell {}".format(self["barcode_name"], self["sample_prj"], self["flowcell"]))
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         fp = open(files[0])
         data = parser.parse_bc_metrics(fp)
         fp.close()
         self["bc_count"] = data[str(self["barcode_id"])]
     except:
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
Example #47
def recalibrate_sample(data):
    """Recalibrate quality values from aligned sample BAM file.
    """
    logger.info("Recalibrating {} with GATK".format(str(data["name"])))
    if data["config"]["algorithm"]["recalibrate"]:
        recal_bam = recalibrate_quality(data["work_bam"], data["fastq1"],
                                        data["fastq2"], data["sam_ref"],
                                        data["dirs"], data["config"])
        save_diskspace(data["work_bam"], \
                       "Recalibrated to {}".format(recal_bam), data["config"])
        data["work_bam"] = recal_bam

    return [[data]]
Example #48
 def _log_messages(self, log_handler, subject="Test email"):
     try:
         with log_handler.applicationbound():
             with logbook.Processor(lambda record: record.extra.__setitem__('run', subject)):
                 logger2.debug("DEBUG record test generated @ %s" % time.strftime("%x - %X"))
                 logger2.info("INFO record test generated @ %s" % time.strftime("%x - %X"))
                 logger2.notice("NOTICE record test generated @ %s" % time.strftime("%x - %X"))
                 logger2.warning("WARNING record test generated @ %s" % time.strftime("%x - %X"))
                 logger2.error("ERROR record test generated @ %s" % time.strftime("%x - %X"))
                 logger2.critical("CRITICAL record test generated @ %s" % time.strftime("%x - %X"))
     except Exception as e:
         return e
     return None
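
The same logbook pattern can be exercised outside the test class. A self-contained sketch using logbook's TestHandler, assuming logger2 is a logbook.Logger:

import logbook

logger2 = logbook.Logger("demo")


def _inject_run(record):
    """Attach a 'run' field to each record, mirroring the Processor usage above."""
    record.extra["run"] = "Demo run"


def demo_log_messages():
    # Capture records in memory instead of sending them to a real handler
    handler = logbook.TestHandler()
    with handler.applicationbound():
        with logbook.Processor(_inject_run):
            logger2.info("INFO record test")
            logger2.warning("WARNING record test")
    return handler.records
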
Example #49
def run_and_monitor(config, config_file, args, workers_needed=None,
                    task_module=None, queues=None):
    """Run a distributed analysis in s cluster environment, monitoring outputs.
    """
    cp = config["distributed"]["cluster_platform"]
    cluster = __import__("bcbio.distributed.{0}".format(cp), fromlist=[cp])
    jobids = []
    try:
        # If the manager is going to be run on a cluster, submit the job
        manager_id = None
        local_manager = config["distributed"].get("run_process_program_locally", False)
        if not local_manager:
            log.info("Starting manager")
            manager_id = start_analysis_manager(cluster, args, config)
            jobids.append(manager_id)

        log.info("Starting cluster workers")
        jobids.extend(start_workers(cluster, config, config_file, workers_needed,
                                    task_module, queues))
        while not(cluster.are_running(jobids)):
            time.sleep(5)

        # If manager should run locally, run it as a regular subprocess rather than monitor its status on the cluster
        log.info("Running analysis")
        if not local_manager:
            monitor_analysis(cluster, manager_id)

        else:
            start_analysis_manager(cluster, args, config)

    finally:
        log.info("Cleaning up cluster workers")
        stop_workers(cluster, jobids)
Example #50
 def parse_filter_metrics(self):
     """CASAVA: Parse filter metrics at sample level"""
     log.info("parse_filter_metrics for lane {}, project {} in flowcell {}".format(self["lane"], self["sample_prj"], self["flowcell"]))
     pattern = "{}_[0-9]+_[0-9A-Za-z]+_{}(_nophix)?.filter_metrics".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         fp = open(files[0])
         parser = MetricsParser()
         data = parser.parse_filter_metrics(fp)
         fp.close()
         self["filter_metrics"] = data
     except:
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
Example #51
 def read_fastqc_metrics(self):
     log.info("read_fastq_metrics for sample {}, project {}, lane {} in run {}".format(self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
     if self["barcode_name"] == "unmatched":
         return
     self["fastqc"] = {'stats':None}
     pattern = "fastqc/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-*".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     try:
         fastqc_dir = os.path.dirname(files[0])
         fqparser = ExtendedFastQCParser(fastqc_dir)
         stats = fqparser.get_fastqc_summary()
         self["fastqc"] = {'stats':stats}
     except:
         log.warn("no fastq screen metrics for sample {}".format(self["barcode_name"]))
Example #52
 def parse_filter_metrics(self):
     """CASAVA: Parse filter metrics at sample level"""
     log.info("parse_filter_metrics for lane {}, project {} in flowcell {}".format(self["lane"], self["sample_prj"], self["flowcell"]))
     pattern = "{}_[0-9]+_[0-9A-Za-z]+_{}(_nophix)?.filter_metrics".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         fp = open(files[0])
         parser = MetricsParser()
         data = parser.parse_filter_metrics(fp)
         fp.close()
         self["filter_metrics"] = data
     except:
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
Example #53
def _write_to_worksheet(client, ssheet, wsheet_title, rows, header, append, keys=[]):
    """Generic method to write a set of rows to a worksheet on google docs.
    """
    # Convert the worksheet title to unicode
    wsheet_title = _to_unicode(wsheet_title)

    # Add a new worksheet, possibly appending or replacing a pre-existing
    # worksheet according to the append-flag.
    wsheet = g_spreadsheet.add_worksheet(client, \
                                         ssheet, \
                                         wsheet_title, \
                                         len(rows) + 1, \
                                         len(header), \
                                         append)
    if wsheet is None:
        logger2.error("ERROR: Could not add a worksheet {!r} to " \
            "spreadsheet {!r}".format(wsheet_title, ssheet.title.text))
        return False
    
    # If keys are specified (will correspond to indexes in the header), delete pre-existing rows with matching keys
    if append and len(keys) > 0:
        wsheet_data = g_spreadsheet.get_cell_content(client, ssheet, wsheet, '2')
        wsheet_header = g_spreadsheet.get_header(client, ssheet, wsheet)
        try:
            wsheet_indexes = [wsheet_header.index(key) for key in keys]
            header_indexes = [header.index(key) for key in keys]
        except ValueError:
            logger2.warn("WARNING: Could not identify correct header for duplicate detection")
        else:
            for row in rows:
                try:
                    key = "#".join([row[i] for i in header_indexes])        
                    for i, wrow in enumerate(wsheet_data):
                        wkey = "#".join([wrow[j] for j in wsheet_indexes])
                        if wkey == key:
                            g_spreadsheet.delete_row(client, ssheet, wsheet, i+1)
                            wsheet_data.pop(i)
                            break
                except:
                    logger2.warn("WARNING: Could not identify/replace duplicate rows")

    # Write the data to the worksheet
    success = g_spreadsheet.write_rows(client, ssheet, wsheet, header, rows)
    if success:
        logger2.info("Wrote data to the {!r}:{!r} " \
                     "worksheet".format(ssheet.title.text, wsheet_title))
    else:
        logger2.error("ERROR: Could not write data to the {!r}:{!r} " \
                      "worksheet".format(ssheet.title.text, wsheet_title))
    return success
Example #54
def _get_run_info(fc_name, fc_date, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)

        return dict(details=run_details, run_id="")

    else:
        logger.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])

        return galaxy_api.run_details(fc_name, fc_date)
Example #55
 def parse_bc_metrics(self):
     """Parse bc metrics at sample level"""
     log.info("parse_bc_metrics for flowcell {}".format(self["RunInfo"]["Flowcell"]))
     for lane in self._lanes:
         pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(lane)
         self["lanes"][str(lane)]["bc_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
         files = self.filter_files(pattern)
         try:
             parser = MetricsParser()
             fp = open(files[0])
             data = parser.parse_bc_metrics(fp)
             fp.close()
             self["lanes"][str(lane)]["bc_metrics"] = data
         except:
             log.warn("No bc_metrics info for lane {}".format(lane))
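
The metrics parsers above all rely on a filter_files method that selects run files by regular expression. A standalone sketch of that idea (the real method matches against files gathered from the flowcell directory):

import re


def filter_files(files, pattern):
    """Return the sorted subset of file names matching a regular expression pattern."""
    return sorted(f for f in files if re.search(pattern, f))
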