Example #1
def _filter_out_genomes(data):
    """ Filters out genomes found in run_info.yaml
    """
    print "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    print data
    # genome_build, sam_ref = ref_genome_info(data["info"], config, data["dirs"])
    sam_ref = data["sam_ref"]

    log.info("Removing genome from sample %s" % str(data["name"]))
    try:
        # TODO: the directive should be read from run_info.yaml rather than from the data dict
        if data["filter_out_genomes"]:
            for genome in data["filter_out_genomes"].split(","):
                (out_file, ext) = os.path.splitext(os.path.basename(data["fastq1"]))
                out_file = out_file + "-stripped-" + genome + ext
                cl = [
                    "bowtie",
                    "--solexa1.3-quals",
                    "--un",
                    out_file,
                    sam_ref,
                    "-1",
                    data["fastq1"],
                    "-2",
                    data["fastq2"],
                    "/dev/null",
                ]
                log.info("Running %s" % cl)
                subprocess.check_call(cl)
    except KeyError:
        log.error("Not removing genomes: the filter_out_genomes directive is undefined in run_info.yaml")
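The function assumes a data dictionary prepared upstream in the pipeline. A minimal sketch of that structure, with key names inferred from the code above (an assumption, not the pipeline's documented schema):

# Hypothetical input for _filter_out_genomes; all keys and values are illustrative.
data = {
    "name": ("lane1", "sample_A"),
    "sam_ref": "/path/to/bowtie/index/phix",   # index prefix handed to bowtie
    "fastq1": "1_110101_FC123_1_fastq.txt",
    "fastq2": "1_110101_FC123_2_fastq.txt",
    "filter_out_genomes": "phix,ecoli",        # comma-separated directive from run_info.yaml
}
_filter_out_genomes(data)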
Example #2
File: lane.py Project: kdaily/bcbb
def process_lane(info, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """
    config = _update_config_w_custom(config, info)

    sample_name = info.get("description", "")
    if (config["algorithm"].get("include_short_name", True) and
            info.get("name", "")):
        sample_name = "%s---%s" % (info.get("name", ""), sample_name)
    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", None)

    log.info("Processing sample: %s; lane %s; reference genome %s; " \
             "researcher %s; analysis method %s" %
             (sample_name, info["lane"], genome_build,
              info.get("researcher", ""), info.get("analysis", "")))
    if multiplex:
        log.debug("Sample %s multiplexed as: %s" % (sample_name, multiplex))

    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], info, fc_name)
    lane_name = "%s_%s_%s" % (info['lane'], fc_date, fc_name)
    lane_items = []
    for mname, msample, fastq1, fastq2 in split_by_barcode(full_fastq1,
            full_fastq2, multiplex, lane_name, dirs, config):
        mlane_name = "%s_%s" % (lane_name, mname) if mname else lane_name
        if msample is None:
            msample = "%s---%s" % (sample_name, mname)
        lane_items.append((fastq1, fastq2, genome_build, mlane_name, msample,
                           dirs, config))
    return lane_items
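Several examples in this listing index into the same per-lane info dictionary parsed from run_info.yaml. A hedged sketch of one such entry, with field names inferred from how the snippets access them (not an authoritative schema):

# Illustrative lane entry as yaml.load would return it; keys are inferred, values are placeholders.
info = {
    "lane": "1",
    "description": "Example lane, J.Doe_11_01",
    "name": "first_run",
    "genome_build": "hg19",
    "analysis": "Standard",
    "researcher": "J. Doe",
    "multiplex": [
        {"barcode_id": 1, "name": "sample_A", "sequence": "ATCACG"},
        {"barcode_id": 2, "name": "sample_B", "sequence": "CGATGT"},
    ],
}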
Example #3
def process_sample(sample_name, fastq_files, info, bam_files, dirs,
                   config, config_file):
    """Finalize processing for a sample, potentially multiplexed.
    """
    config = _update_config_w_custom(config, info)

    genome_build = info.get("genome_build", None)
    (_, sam_ref) = get_genome_ref(genome_build, config["algorithm"]["aligner"],
                                  dirs["galaxy"])
    fastq1, fastq2 = combine_fastq_files(fastq_files, dirs["work"])
    log.info("Combining and preparing wig file %s" % str(sample_name))
    sort_bam = merge_bam_files(bam_files, dirs["work"], config)
    (gatk_bam, vrn_file, effects_file) = ("", "", "")
    if config["algorithm"]["recalibrate"]:
        log.info("Recalibrating %s with GATK" % str(sample_name))
        gatk_bam = recalibrate_quality(sort_bam, fastq1, fastq2, sam_ref,
                                       dirs, config)
        if config["algorithm"]["snpcall"]:
            log.info("SNP genotyping %s with GATK" % str(sample_name))
            vrn_file = run_genotyper(gatk_bam, sam_ref, config)
            log.info("Calculating variation effects for %s" % str(sample_name))
            effects_file = variation_effects(vrn_file, genome_build, config)
    if config["algorithm"].get("transcript_assemble", False):
        tx_file = assemble_transcripts(sort_bam, sam_ref, config)
    if sam_ref is not None:
        log.info("Generating summary files: %s" % str(sample_name))
        generate_align_summary(sort_bam, fastq2 is not None, sam_ref,
                               sample_name, config, dirs)
    bam_to_wig(sort_bam, config, config_file)
    return [sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file,
            effects_file]
Example #4
def main(run_name, gdocs_spreadsheet, encoded_credentials_file, run_info_yaml,
         analysis_dir, archive_dir, gdocs_worksheet, gdocs_projects_folder,
         append, split_on_project):

    log.info("Processing run: %s" % run_name)
    
    # If not supplied, assume that the configuration file is named run_info.yaml and resides in the archive dir
    if not run_info_yaml:
        run_info_yaml = os.path.join(archive_dir, "run_info.yaml")
        log.info("No configuration file supplied, assuming it is '%s'" % run_info_yaml)
        
    if not os.path.exists(run_info_yaml):
        log.warn("Could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return
    with open(run_info_yaml) as in_handle:
        run_info = {'details': yaml.load(in_handle)}

    # Get the google docs credentials
    gdocs_credentials = ""
    if not os.path.exists(encoded_credentials_file):
        log.warn("The Google Docs credentials file could not be found. No demultiplex data was written")
        return
    with open(encoded_credentials_file) as fh:
        gdocs_credentials = fh.read().strip()
    

    fc_name, fc_date = get_flowcell_info(run_name)
    
    # Get the barcode statistics
    bc_metrics = get_bc_stats(fc_date, fc_name, analysis_dir, run_info)

    # Write the report
    write_run_report_to_gdocs(fc_date, fc_name, bc_metrics, gdocs_spreadsheet,
                              gdocs_credentials, gdocs_worksheet, append, split_on_project)

    # Write the bc project summary report
    if gdocs_projects_folder:
        write_project_report_to_gdocs(fc_date, fc_name, bc_metrics,
                                      gdocs_credentials, gdocs_projects_folder)
Example #5
def process_alignment(fastq1, fastq2, genome_build, lane_name, sample, dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    if os.path.exists(fastq1) and aligner:
        log.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        align_to_sort_bam(fastq1, fastq2, genome_build, aligner,
                          lane_name, sample, dirs, config)
Example #6
def long_term_storage(remote_info, config_file):
    config = load_config(config_file)
    log_handler = create_log_handler(config, log.name)
    with log_handler.applicationbound():
        log.info("Copying run data over to remote storage: %s" %
        config["store_host"])
        log.debug("The contents from AMQP for this dataset are:\n %s" %
        remote_info)
        _copy_for_storage(remote_info, config)
Example #7
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        log.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, info["genome_build"], aligner, lane_name, lane_desc, dirs, config)
    return [{"fastq": [fastq1, fastq2], "out_bam": out_bam, "info": info, "config": config}]
Example #8
def main(flowcell_id, archive_dir, analysis_dir):
    print " ".join([flowcell_id, archive_dir, analysis_dir])
    fp = os.path.join(archive_dir, flowcell_id, "run_info.yaml")
    with open(fp) as in_handle:
        run_info = yaml.load(in_handle)
    project_ids = dict()
    for lane in run_info:
        (l, id) = [x.strip() for x in lane['description'].split(",")]
        if project_ids.has_key(id):
            project_ids[id].append(lane)
        else:
            project_ids[id] = [lane]

    sphinx_defs = []
    for k in project_ids.keys():
        lanes = [x['lane'] for x in project_ids[k]]
        log.info("saw project %s in lanes %s" %( k, ", ".join(lanes)))
        sphinx_defs.append("('%s', '%s_delivery.tex', 'Delivery note', u'Scilife', 'manual'),\n"  % (k, k))
        projectfile = "%s.mako" % (k)
        fp = open(projectfile, "w")
        fp.write(TEMPLATE)
        fp.close()
        mylookup = TemplateLookup(directories=['./'])
        tmpl = Template(filename=projectfile, lookup=mylookup)
        proj_conf = {
            'id' : k,
            'lanes' : project_ids[k],
            'archive_dir' : archive_dir, 
            'analysis_dir' : analysis_dir,
            'flowcell' : flowcell_id,
            }
        d = generate_report(proj_conf)
        rstfile = "%s.rst" % (k)
        fp = open(rstfile, "w")
        fp.write(tmpl.render(**d))
        fp.close()

    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        log.warn("no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
    else:
        fp = open(sphinxconf)
        lines = fp.readlines()
        fp.close()
        sdout = []
        modify_conf = False
        for sd in sphinx_defs:
            if not sd in lines:
                sdout.append(sd)
                modify_conf = True
        if modify_conf:
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i+3] + sdout + lines[i+3:]
            fp = open("conf.py", "w")
            fp.write("".join(newconf))
            fp.close()
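The conf.py patching above finds the line "latex_documents = [\n" and splices the new delivery-note entries in three lines further down, which presupposes a sphinx-quickstart-style layout roughly like the sketch below (an assumption about the project's conf.py, not something shown in this listing):

# Assumed shape of the latex_documents block in conf.py; the generated
# ('<project>', '<project>_delivery.tex', ...) tuples land after the default three-line entry.
latex_documents = [
  ('index', 'delivery_notes.tex', u'Delivery notes',
   u'Scilife', 'manual'),
]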
Example #9
def write_run_report_to_gdocs(
    fc_date,
    fc_name,
    bc_metrics,
    ssheet_title,
    encoded_credentials,
    wsheet_title=None,
    append=False,
    split_project=False,
):
    """Upload the barcode read distribution for a run to google docs"""

    # Connect to google and get the spreadsheet
    client, ssheet = get_spreadsheet(ssheet_title, encoded_credentials)
    if not client or not ssheet:
        return False

    # Convert the bc_metrics data structure into a flat list
    rows = _structure_to_list(bc_metrics)

    # Get the projects in the run
    projects = _get_unique_project_names(rows)
    log.info("The run contains data from: '%s'" % "', '".join(projects))

    # Calculate the number of million reads for convenience
    brci = -1
    brcmi = -1
    for i, head in enumerate(BARCODE_STATS_HEADER):
        if head[1] == "barcode_read_count":
            brci = i
        elif head[1] == "barcode_read_count_millions":
            brcmi = i
    if brci >= 0 and brcmi >= 0:
        for row in rows:
            try:
                row[brcmi] = unicode(round(int(row[brci]) / 1000000.0, 2))
            except ValueError:
                pass

    # If we will split the worksheet by project, use the project names as worksheet titles
    success = True
    if split_project:
        # Filter away the irrelevant project entries and write the remaining to the appropriate worksheet
        for wsheet_title in projects:
            success &= _write_to_worksheet(
                client, ssheet, wsheet_title, _apply_filter(rows, [wsheet_title]), BARCODE_STATS_HEADER, append
            )

    # Else, set the default title of the worksheet to be a string of concatenated date and flowcell id
    else:
        if wsheet_title is None:
            wsheet_title = "%s_%s" % (fc_date, fc_name)
        success &= _write_to_worksheet(client, ssheet, wsheet_title, rows, BARCODE_STATS_HEADER, append)

    return success
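The index lookups above rely on BARCODE_STATS_HEADER being a sequence of (column label, field name) pairs: element [0] is what _write_to_worksheet uses as the column header and element [1] is the internal key. A purely illustrative shape (the real constant in bcbb may list different columns):

# Hypothetical header definition; only the (label, key) pair structure is inferred from the code.
BARCODE_STATS_HEADER = [
    ("Project", "project_name"),
    ("Lane", "lane"),
    ("Sample", "sample_name"),
    ("Read count", "barcode_read_count"),
    ("Read count (millions)", "barcode_read_count_millions"),
]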
Example #10
def _handle_data(src, tgt, f=shutil.copyfile):
    if src is None:
        return
    if os.path.exists(tgt):
        log.warn("%s already exists: not doing anything!" %(tgt))
        return
    if options.dry_run:
        print "DRY_RUN: %s file %s to %s" % (f.__name__, src, tgt)
    else:
        log.info("%s file %s to %s" % (f.__name__, src, tgt))
        f(src, tgt)
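Because the file operation is injected as f, the same helper can copy or move depending on the caller; a small usage sketch with placeholder paths (and assuming the module-level options object carries dry_run):

import shutil

# Copy with the default shutil.copyfile ...
_handle_data("analysis/1_110101_FC123.bam", "delivery/1_110101_FC123.bam")
# ... or pass in another two-argument file operation, e.g. a move.
_handle_data("analysis/1_110101_FC123_fastq.txt", "delivery/1_110101_FC123_fastq.txt",
             f=shutil.move)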
Example #11
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.
    """
    log.info("Realigning %s with GATK" % str(data["name"]))
    if data["config"]["algorithm"]["snpcall"]:
        sam_ref = data["sam_ref"]
        config = data["config"]
        data["work_bam"] = gatk_realigner(data["work_bam"], sam_ref, config,
                                          configured_ref_file("dbsnp", config, sam_ref),
                                          region, out_file)
    return [data]
Example #12
def _get_run_info(fc_name, fc_date, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        return dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        return galaxy_api.run_details(fc_name, fc_date)
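For orientation, a minimal sketch of the kind of YAML samplesheet this function loads; the field names follow the other snippets in this listing, but the exact schema is an assumption:

import yaml

# Illustrative run_info.yaml content (not the pipeline's authoritative schema).
RUN_INFO_YAML = """
- lane: '1'
  description: Control lane, J.Doe_11_01
  genome_build: hg19
  analysis: Standard
- lane: '2'
  description: Tumor samples, J.Doe_11_01
  genome_build: hg19
  multiplex:
    - {barcode_id: 1, name: sample_A, sequence: ATCACG}
    - {barcode_id: 2, name: sample_B, sequence: CGATGT}
"""
run_details = yaml.safe_load(RUN_INFO_YAML)
run_info = dict(details=run_details, run_id="")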
Example #13
def get_run_info(fc_dir, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        fc_name, fc_date, run_info = _run_info_from_yaml(fc_dir, run_info_yaml)
    else:
        log.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(fc_dir)
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    return fc_name, fc_date, _organize_runs_by_lane(run_info)
Example #14
def prune_run_info_by_description(run_info, desc, lanes):
    """Prune a run_info file by lane description"""
    run_info_ret = list()
    for info in run_info:
        if desc is not None:
            if info['description'].find(desc) > -1 or desc == "ALL":
                log.info("Found %s in run_info for lane %s, description: %s" %
                         (desc, info['lane'], info['description']))
                run_info_ret.append(info)
        elif lanes is not None:
            if info['lane'] in lanes.split(","):
                log.info("Appending lane %s to output" % info['lane'])
                run_info_ret.append(info)

    return run_info_ret
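A small usage sketch against a toy run_info list (descriptions and lane numbers are made up):

run_info = [
    {"lane": "1", "description": "Control lane, J.Doe_11_01"},
    {"lane": "2", "description": "Tumor samples, J.Doe_11_01"},
    {"lane": "3", "description": "Unrelated samples, A.Smith_11_02"},
]
# Keep only lanes whose description mentions the project ...
doe_lanes = prune_run_info_by_description(run_info, "J.Doe_11_01", None)
# ... or pick lanes explicitly by a comma-separated list of lane numbers.
selected_lanes = prune_run_info_by_description(run_info, None, "1,3")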
Example #15
def _write_to_worksheet(client, ssheet, wsheet_title, rows, header, append):
    """Generic method to write a set of rows to a worksheet on google docs"""

    # Convert the worksheet title to unicode
    wsheet_title = _to_unicode(wsheet_title)

    # Add a new worksheet, possibly appending or replacing a pre-existing worksheet according to the append-flag
    wsheet = bcbio.google.spreadsheet.add_worksheet(client, ssheet, wsheet_title, len(rows) + 1, len(header), append)
    if wsheet is None:
        log.info("Could not add a worksheet '%s' to spreadsheet '%s'" % (wsheet_title, ssheet.title.text))
        return False

    # Write the data to the worksheet
    log.info("Adding data to the '%s' worksheet" % (wsheet_title))
    return bcbio.google.spreadsheet.write_rows(client, ssheet, wsheet, [col_header[0] for col_header in header], rows)
Example #16
def create_bc_report_on_gdocs(fc_date, fc_name, work_dir, run_info, config):
    """Get the barcode read distribution for a run and upload to google docs"""

    # Get the required parameters from the post_process.yaml configuration file
    gdocs = config.get("gdocs_upload", None)
    if not gdocs:
        log.info("No GDocs upload section specified in config file, will not upload demultiplex data")
        return

    # Get the GDocs demultiplex result file title
    gdocs_spreadsheet = gdocs.get("gdocs_dmplx_file", None)
    if not gdocs_spreadsheet:
        log.warn(
            "Could not find Google Docs demultiplex results file title in config. No demultiplex counts were written to Google Docs"
        )
        return

    # Get the account credentials
    encoded_credentials = ""
    encoded_credentials_file = gdocs.get("gdocs_credentials", None)
    if not encoded_credentials_file:
        log.warn("Could not find Google Docs account credentials. No demultiplex report was written")
        return
    # Check if the credentials file exists
    if not os.path.exists(encoded_credentials_file):
        log.warn("The Google Docs credentials file could not be found. No demultiplex data was written")
        return
    with open(encoded_credentials_file) as fh:
        encoded_credentials = fh.read().strip()

    # Get the barcode statistics. Get a deep copy of the run_info since we will modify it
    bc_metrics = get_bc_stats(fc_date, fc_name, work_dir, copy.deepcopy(run_info))

    # Upload the data
    write_run_report_to_gdocs(fc_date, fc_name, bc_metrics, gdocs_spreadsheet, encoded_credentials)

    # Get the projects parent folder
    projects_folder = gdocs.get("gdocs_projects_folder", None)

    # Write the bc project summary report
    if projects_folder:
        write_project_report_to_gdocs(fc_date, fc_name, bc_metrics, encoded_credentials, projects_folder)
Example #17
def process_lane(info, config, dirs):
    """Models bcbio process lane"""
    sample_name = info.get("description", "")
    genome_build = info.get("genome_build", None)
    multiplex = info.get('multiplex', None)
    log.info("Processing sample: %s; lane %s; reference genome %s" %
             (sample_name, info["lane"], genome_build))
    if multiplex:
        log.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
    fq = get_barcoded_fastq_files(multiplex, info, dirs['fc_dir'], config['fc_name'], config['fc_date'])
    
    ## Move data along with fastq files
    fc_bc_dir = os.path.join(config['data_delivery_dir'], "%s_%s_%s_barcode" % (info['lane'], config['fc_date'], config['fc_name']))
    _make_dir(fc_bc_dir, "fastq.txt barcode directory")
    if not options.only_fastq:
        data, fastqc = _get_analysis_results(config, dirs, info['lane'])
        _deliver_data(data, fastqc, config['data_delivery_dir'])

    for fqpair in fq:
        for fq_src in fqpair:
            _deliver_fastq_file(fq_src, os.path.basename(fq_src), fc_bc_dir)
Example #18
def write_project_report_to_gdocs(fc_date, fc_name, project_bc_metrics, encoded_credentials, gdocs_folder=""):
    """Upload the sample read distribution for a project to google docs"""

    # Create a client class which will make HTTP requests with Google Docs server.
    client = bcbio.google.spreadsheet.get_client(encoded_credentials)
    doc_client = bcbio.google.document.get_client(encoded_credentials)

    # Get a reference to the parent folder
    parent_folder = bcbio.google.document.get_folder(doc_client, gdocs_folder)

    # Group the barcode data by project
    grouped = group_bc_stats(project_bc_metrics)

    # Loop over the projects and write the project summary for each
    for pdata in grouped:

        project_name = pdata.get("project_name", "")
        ssheet_title = project_name + "_sequencing_results"
        ssheet = bcbio.google.spreadsheet.get_spreadsheet(client, ssheet_title)
        if not ssheet:
            bcbio.google.document.add_spreadsheet(doc_client, ssheet_title)
            ssheet = bcbio.google.spreadsheet.get_spreadsheet(client, ssheet_title)

        _write_project_report_to_gdocs(client, ssheet, fc_date, fc_name, pdata)
        _write_project_report_summary_to_gdocs(client, ssheet)

        # Just to make it look a bit nicer, remove the default 'Sheet1' worksheet
        wsheet = bcbio.google.spreadsheet.get_worksheet(client, ssheet, "Sheet 1")
        if wsheet:
            client.DeleteWorksheet(wsheet)

        folder_name = project_name
        folder = bcbio.google.document.get_folder(doc_client, folder_name)
        if not folder:
            log.info("creating folder '%s'" % _from_unicode(folder_name))
            folder = bcbio.google.document.add_folder(doc_client, folder_name, parent_folder)

        ssheet = bcbio.google.document.move_to_folder(doc_client, ssheet, folder)
        log.info(
            "'%s' spreadsheet written to folder '%s'" % (_from_unicode(ssheet.title.text), _from_unicode(folder_name))
        )
Example #19
File: lane.py Project: hussius/bcbb
def make_lane_items(info, fc_date, fc_name, dirs, config):
    sample_name = info.get("description", "")
    if (config["algorithm"].get("include_short_name", True) and
            info.get("name", "")):
        sample_name = "%s---%s" % (info.get("name", ""), sample_name)
    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", "")
    log.info("Processing sample: %s; lane %s; reference genome %s; " \
             "researcher %s; analysis method %s" %
             (sample_name, info["lane"], genome_build,
              info.get("researcher", ""), info.get("analysis", "")))
    lane_items = []
    if multiplex:
        log.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
        mitems = get_multiplex_items(multiplex, info['lane'], dirs['fc_dir'], fc_name, fc_date)
        for fastq1, fastq2, mlane_name, msample in mitems:
            lane_items.append((fastq1, fastq2, genome_build, mlane_name, msample, dirs, config))
    else:
        # TODO: Not multiplex: what to do?
        pass
    return lane_items
Example #20
def get_spreadsheet(ssheet_title, encoded_credentials):
    """Connect to Google docs and get a spreadsheet"""

    # Convert the spreadsheet title to unicode
    ssheet_title = _to_unicode(ssheet_title)

    # Create a client class which will make HTTP requests with Google Docs server.
    client = bcbio.google.spreadsheet.get_client()
    bcbio.google.connection.authenticate(client, encoded_credentials)

    # Locate the spreadsheet
    ssheet = bcbio.google.spreadsheet.get_spreadsheet(client, ssheet_title)

    # Check that we got a result back
    if not ssheet:
        log.warn("No document with specified title '%s' found in GoogleDocs repository" % ssheet_title)
        return (None, None)

    log.info("Found spreadsheet matching the supplied title: '%s'" % (ssheet.title.text))

    return (client, ssheet)
Example #21
def _remote_copy(remote_info, config):
    """Securely copy files from remote directory to the processing server.

    This requires ssh public keys to be setup so that no password entry
    is necessary.
    """
    fc_dir = os.path.join(config["analysis"]["store_dir"],
                          os.path.basename(remote_info['directory']))
    log.info("Copying analysis files to %s" % fc_dir)
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)
    for fcopy in remote_info['to_copy']:
        target_loc = os.path.join(fc_dir, fcopy)
        if not fabric_files.exists(target_loc):
            target_dir = os.path.dirname(target_loc)
            if not fabric_files.exists(target_dir):
                fabric.run("mkdir -p %s" % target_dir)
            cl = ["scp", "-r", "%s@%s:%s/%s" %
                  (remote_info["user"], remote_info["hostname"],
                   remote_info["directory"], fcopy),
                  target_loc]
            fabric.run(" ".join(cl))
    log.info("Analysis files copied")
    return fc_dir
Example #22
def _copy_for_storage(remote_info, config):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be setup so that no password entry
    is necessary, Fabric is used to manage setting up copies on the remote
    storage server.
    """
    log.info("Copying run data over to remote storage: %s" % config["store_host"])
    log.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    base_dir = config["store_dir"]
    fabric.env.host_string = "%s@%s" % (config["store_user"], config["store_host"])
    fc_dir = os.path.join(base_dir, os.path.basename(remote_info['directory']))
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)
    for fcopy in remote_info['to_copy']:
        target_loc = os.path.join(fc_dir, fcopy)
        if not fabric_files.exists(target_loc):
            target_dir = os.path.dirname(target_loc)
            if not fabric_files.exists(target_dir):
                fabric.run("mkdir -p %s" % target_dir)
            cl = ["scp", "-r", "%s@%s:%s/%s" % (
                  remote_info["user"], remote_info["hostname"], remote_info["directory"],
                  fcopy), target_loc]
            fabric.run(" ".join(cl))
Example #23
def _make_dir(dir, label):
    if not os.path.exists(dir):
        os.makedirs(dir)
        log.info("Creating %s directory %s" % (label, dir))
    else:
        log.warn("%s already exists: not creating new directory" % (dir))
def main(flowcell_id, archive_dir, analysis_dir, config_file):
    print " ".join([flowcell_id, archive_dir, analysis_dir])
    fp = os.path.join(archive_dir, flowcell_id, "run_info.yaml")
    with open(fp) as in_handle:
        run_info = yaml.load(in_handle)
    if config_file:
        with open(config_file) as in_handle:
            config = yaml.load(in_handle)
    else:
        config = {}
    project_ids = dict()
    for lane in run_info:
        (l, id) = [x.strip() for x in lane['description'].split(",")]
        if project_ids.has_key(id):
            if not lane in project_ids[id]: project_ids[id].append(lane)
        else:
            project_ids[id] = [lane]
        # Check here if project is a "sub project" of the lane
        if not lane.has_key('multiplex'): continue
        for s in lane['multiplex']:
            if s.has_key('description'):
                if project_ids.has_key(s['description']):
                    if lane not in project_ids[s['description']]: project_ids[s['description']].append(lane)
                else:
                    project_ids[s['description']] = [lane]
                                                                                             
    sphinx_defs = []
    for k in project_ids.keys():
        lanes = [x['lane'] for x in project_ids[k]]
        proj_file_tag = k + "_" + get_flowcell_info(flowcell_id)[1] + get_flowcell_info(flowcell_id)[0][0]
        log.info("saw project %s in lanes %s" %( k, ", ".join(lanes)))
        sphinx_defs.append("('%s', '%s_delivery.tex', 'Raw data delivery note', u'SciLifeLab Stockholm', 'howto'),\n"  % (proj_file_tag, proj_file_tag))
        projectfile = "%s.mako" % (proj_file_tag) 
        fp = open(projectfile, "w")
        fp.write(TEMPLATE)
        fp.close()
        mylookup = TemplateLookup(directories=['./'])
        tmpl = Template(filename=projectfile, lookup=mylookup)
        proj_conf = {
            'id' : k,
            'lanes' : project_ids[k],
            'archive_dir' : archive_dir, 
            'analysis_dir' : analysis_dir,
            'flowcell' : flowcell_id,
            'config' : config,
            }
        d = generate_report(proj_conf)
        rstfile = "%s.rst" % (proj_file_tag)
        fp = open(rstfile, "w")
        fp.write(tmpl.render(**d))
        fp.close()

    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        log.warn("no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
    else:
        fp = open(sphinxconf)
        lines = fp.readlines()
        fp.close()
        sdout = []
        modify_conf = False
        for sd in sphinx_defs:
            if not sd in lines:
                sdout.append(sd)
                modify_conf = True
        if modify_conf:
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i+3] + sdout + lines[i+3:]
            fp = open("conf.py", "w")
            fp.write("".join(newconf))
            fp.close()


def generate_report(proj_conf):
    
    #######
    ### Metadata fetched from the 'Genomics project list' on Google Docs
    ###
    proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
    uppnex_proj = proj_data.uppnex_id
    project_id = proj_data.project_id
    queue_date = proj_data.queue_date
    no_samples = proj_data.no_samples
    lanes_plates = proj_data.lanes_plates
    min_reads_per_sample = proj_data.min_reads_per_sample
    customer_reference = proj_data.customer_reference
    application = proj_data.application
    no_finished_samples = proj_data.no_finished_samples
    
    d = { 
        'project_id' : proj_conf['id'],
        'latex_opt' : "",
        'summary' : "",
        'infotable' : "",
        'lanetable' : "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n   \setcounter{totalnumber}{8}'
    d.update(latex_opt = floats_per_page)

    ## General info table
    tab = Texttable()
    if not uppnex_proj or len(uppnex_proj) < 4 or uppnex_proj[0:4] != 'b201':
        uppnex_proj = "b201YXXX"
    
    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + run_name_comp[3][0]
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    tab.add_rows([["Project id:", proj_conf['id']], 
                  ["Date:", fc_date],
                  ["Instrument ID:", instr_id],
                  ["Flow cell ID:", fc_name],
                  ["Uppnex project:", uppnex_proj],
                  ["Delivery directory:", "/bubo/proj/" + uppnex_proj + "/INBOX/20" + simple_run_name + "_hiseq2000"]])
    d.update(infotable=tab.draw())
    
    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        main_proj = l['description'].split(',')[1].strip()
        if main_proj == proj_conf['id']: is_main_proj = True
        else: is_main_proj = False
        samples = []
        if l.has_key('multiplex'):
            for mp in l['multiplex']:
                if mp.has_key('description'):
                    if mp['description'] == proj_conf['id']:
                        samples.append(mp['name'])
                elif is_main_proj:
                    samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())
    
    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2,12,12,12,12,12,12,30])
    tab_r2.set_cols_width([2,12,12,12,12,12,12,30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2","% PF clusters","Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2","% PF clusters","Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])

    # These should be moved to a cfg file. ( + perhaps provide an alternative for v1.5 FC )
    if (options.v1_5_fc): min_clupf = 300 
    else: min_clupf = 475
    max_phas = 0.4
    max_prephas = 1.0 # 0.5
    max_mean_err = 2

    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)

    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_phasing_r1 = True
    ok_phasing_r2 = True
    ok_prephasing_r1 = True
    ok_prephasing_r2 = True
    ok_err_rate = True 
    ok_err_r1 = True
    ok_err_r2 = True

    for l in proj_conf['lanes']:

        # Cluster densities
        clu_dens_r1 =  stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 =  stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 =  stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 =  stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1) 
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2) 

        # Cluster PF densities
        clu_dens_pf_r1 =  stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 =  stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 =  stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 =  stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)

        # % PF clusters
        prc_pf_r1 =  stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 =  stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 =  stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 =  stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)

        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)

        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)

        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)
        
        comm_r1 = ""
        comm_r2 = ""

        # check criteria
        if float(clu_dens_pf_r1[:-1]) < min_clupf: 
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf: 
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        if float(phas_r1) > max_phas: 
            ok_r1 = False
            ok_phasing_r1 = False
            comm_r1 += "High phasing. "
        if float(phas_r2) > max_phas: 
            ok_r2 = False
            ok_phasing_r2 = False
            comm_r2 += "High phasing. "
        if float(prephas_r1) > max_prephas: 
            ok_r1 = False
            ok_prephasing_r1 = False
            comm_r1 += "High prephasing. "
        if float(prephas_r2) > max_prephas: 
            ok_r2 = False
            ok_prephasing_r2 = False
            comm_r2 += "High prephasing. "
        avg_error_rate = (float(err_r1) + float(err_r2))/2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        if float(err_r1) > max_mean_err:
            #ok_r1 = False
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            #ok_r2 = False
            comm_r2 += "High error rate. "
            ok_err_r2 = False

        if comm_r1 == "": comm_r1 = "OK"        
        if comm_r2 == "": comm_r2 = "OK"

        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1, clu_dens_pf_string_r1, phas_string_r1, aln_string_r1, err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2, clu_dens_pf_string_r2, phas_string_r2, aln_string_r2, err_str_r2, comm_r2])

    # Reinitialize comments for the summary. (Which will be for several lanes, potentially)
    comm_r1 = ""
    comm_r2 = ""
 
    if not ok_cludens_r1: comm_r1 += "Low cluster density. " 
    if not ok_cludens_r2: comm_r2 += "Low cluster density. " 
    if not ok_phasing_r1: comm_r1 += "High phasing. " 
    if not ok_phasing_r2: comm_r2 += "High phasing. " 
    if not ok_prephasing_r1: comm_r1 += "High prephasing. " 
    if not ok_prephasing_r2: comm_r2 += "High prephasing. " 
    if not ok_err_rate:
        if not ok_err_r1: 
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2: 
            ok_r2 = False
            comm_r2 += "High error rate. "

    if (ok_r1 and ok_r2): 
        comm_r1 = comm_r2 = "OK"
        d.update(summary = "Successful run according to QC criteria. ")
    else:  
        if (ok_r1): 
            comm_r1 = "OK"
            d.update (summary = "Read 2 did not pass quality criteria: " + comm_r2)
        elif (ok_r2):
            comm_r2 = "OK"
            d.update (summary = "Read 1 did not pass quality criteria: " + comm_r1)
        else:
            d.update (summary = "Did not pass quality criteria. Read 1: " + comm_r1 + " Read 2: " + comm_r2)


    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())
        
    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots= "\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots= "\n".join(res))

    ## qcplots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate= "\n".join(res))

    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):  target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane','Sample','Number of sequences','Comment'])
    
    run_info_yaml = os.path.join(proj_conf['archive_dir'],proj_conf['flowcell'],"run_info.yaml")

    if not os.path.exists(run_info_yaml):
        log.warn("Could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return

    #with open(run_info_yaml) as in_handle:
    #    run_info = {'details': yaml.load(in_handle)}

    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)

    # fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    # bc_yield = bc_metrics.get_bc_stats(fc_date,fc_name,proj_conf['analysis_dir'], run_info)
   
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False
    
    bc_multiplier = 0.75 # Should move to cfg file

    ok_samples = []
    low_samples = []

    for l in proj_conf['lanes']:
        bc_file_name = os.path.join(proj_conf['analysis_dir'], proj_conf['flowcell'], '_'.join([l['lane'], fc_date, fc_name, "barcode"]), '_'.join([l['lane'], fc_date, fc_name, "bc.metrics"]))
        try:
            bc_file = open(bc_file_name)
        except:
            sys.exit("Could not find bc metrics file " + bc_file_name)
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
        no_samples = len(bc_count)
        if no_samples == 0:
            log.warn("Did not find a BC metrics file... Skipping lane %s for %s" %(l['lane'], proj_conf['id']))
            continue
        target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples
        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                is_main_proj = False       
                if entry['description'].split(',')[1].strip() == proj_conf['id']:
                    is_main_proj = True
                if entry.has_key('multiplex'):
                    for sample in entry['multiplex']:
                        if sample.has_key('description'):
                            if is_main_proj: 
                                log.info('Rerun lane: skipping sample ' + sample['name'] + ' in lane ' + l['lane'] + ' which does not belong to the current project')
                                is_rerun=True
                            else:
                                if sample['description'].strip() == proj_conf['id']:
                                    sample_name[sample['barcode_id']]=sample['name']
                                is_rerun = True
                        elif is_main_proj: 
                            sample_name[sample['barcode_id']]=sample['name']
                else: is_multiplexed = False
        samp_count = {}

        for k in bc_count.keys():
            if k.isdigit() and sample_name.has_key(int(k)):
                samp_count[sample_name[int(k)]] = bc_count[k]

        for k in sorted(samp_count.keys()):
            comment = ''
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample: 
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else: ok_samples.append(k)
            if is_rerun: comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], comment])
        
        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample: comment = 'High.'
                if is_rerun: comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'], comment])
            except:
                log.warning('Insufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane: comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k], comment])

    # if low_yield:
    #    comm = d['summary'] +  " Some samples had low yields."
    #    d.update(summary = comm)
    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = "Samples " + ", ".join(low_samples) + " yielded fewer sequences than expected. These will be re-run unless this was already a re-run and the total yield is now sufficient. "
    else: fail_comm = ""

    if low_yield: 
        if len(ok_samples)>0: ok_comm = "Samples " + ", ".join(ok_samples) + " yielded the expected number of sequences or more. "
        else: ok_comm = ""
    else: ok_comm = "All samples yielded the expected number of sequences or more. "

    #comm = delivery_type + d['summary'] + fail_comm + ok_comm
    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary = comm)

    d.update(yieldtable=tab.draw())
    return d