Ejemplo n.º 1
0
def setUpModule():
    """Create and populate the test databases on the local CouchDB server.

    Skipped entirely when the couchdb package is unavailable. For each
    database name in DATABASES this (1) creates the database if missing,
    (2) syncs the design-document views defined in VIEWS and (3) uploads
    the example project summary documents from the test data directory.
    """
    if not has_couchdb:
        return
    server = couchdb.Server()
    ## Create databases
    for x in DATABASES:
        # Use the `in` operator instead of calling __contains__ directly
        if x not in server:
            LOG.info("Creating database {}".format(x))
            server.create(x)
    ## Create views for flowcells and samples
    for dbname in DATABASES:
        dblab = dbname.replace("-test", "")
        db = server[dbname]
        for k, v in VIEWS[dblab].items():
            for title, view in v.items():
                viewdef = ViewDefinition(k, title, view)
                viewdef.sync(db)

    ## Create and upload project summary
    with open(os.path.join(filedir, "data", "config", "project_summary.yaml")) as fh:
        # safe_load avoids arbitrary python object construction from YAML
        prj_sum = yaml.safe_load(fh)
    db = server["samples-test"]
    p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
    for p in prj_sum:
        prj = ProjectSummaryDocument(**p)
        p_con.save(prj, key="project_name")
Ejemplo n.º 2
0
 def test_dbcon(self):
     """Test database connection and that we get expected values."""
     s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     samples_d = {doc["name"]: doc for doc in (s_con.get_entry(n) for n in s_con.name_view)}
     # Expected (sample name, field, value) triples for the sample db
     for name, field, value in [
         ("1_120924_AC003CCCXX_TGACCA", "date", "120924"),
         ("1_121015_BB002BBBXX_TGACCA", "flowcell", "BB002BBBXX"),
         ("2_120924_AC003CCCXX_ACAGTG", "entity_type", "sample_run_metrics"),
         ("3_120924_AC003CCCXX_ACAGTG", "lane", "3"),
         ("4_120924_AC003CCCXX_CGTTAA", "sequence", "CGTTAA"),
         ("2_121015_BB002BBBXX_TGACCA", "project_id", "P002"),
     ]:
         self.assertEqual(samples_d[name][field], value)
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
     flowcells_d = {doc["name"]: doc for doc in (fc_con.get_entry(n) for n in fc_con.name_view)}
     # Expected (flowcell name, field, value) triples for the flowcell db
     for name, field, value in [
         ("120924_AC003CCCXX", "name", "120924_AC003CCCXX"),
         ("121015_BB002BBBXX", "name", "121015_BB002BBBXX"),
         ("120924_AC003CCCXX", "entity_type", "flowcell_run_metrics"),
     ]:
         self.assertEqual(flowcells_d[name][field], value)
     p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     projects_d = {doc["project_name"]: doc for doc in (p_con.get_entry(n) for n in p_con.name_view)}
     self.assertEqual(projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
     self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
     self.assertEqual(
         set(projects_d["J.Doe_00_01"]["samples"].keys()), set(["P001_101_index3", "P001_102", "P001_103"])
     )
     self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"], "GnuGenome")
     self.assertEqual(projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
     self.assertEqual(projects_d["J.Doe_00_03"]["samples"].keys(), ["3_index6"])
     self.assertIn("A", projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"])
Ejemplo n.º 3
0
    def test_2_make_note(self):
        """Make a note subset by example flowcell and project.

        For every sample run of the example project on the example
        flowcell, assemble the note parameters and render a per-sample
        pdf note.
        """
        s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
        fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
        p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
        paragraphs = sample_note_paragraphs()
        headers = sample_note_headers()
        samples = s_con.get_samples(self.examples["flowcell"], self.examples["project"])
        project = p_con.get_entry(self.examples["project"])
        samples = p_con.map_srm_to_name(self.examples["project"], fc_id=self.examples["flowcell"], use_bc_map=True)
        for k, v in samples.items():
            # Work on a copy so values set for one sample do not leak into
            # the next iteration (the original aliased the shared
            # `parameters` dict and mutated it in place on every loop).
            s_param = dict(parameters)
            s = s_con.get_entry(k)
            s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
            fc = "{}_{}".format(s["date"], s["flowcell"])
            s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
            s_param['avg_quality_score'] = s_con.calc_avg_qv(s["name"])
            # Read count is reported in millions, rounded to one decimal
            s_param['rounded_read_count'] = round(float(s_param['rounded_read_count'])/1e6, 1) if s_param['rounded_read_count'] else None
            s_param['customer_name'] = project['samples'][v["sample"]].get('customer_name', None)

            if project:
                s_param['ordered_amount'] = p_con.get_ordered_amount(self.examples["project"])
                s_param['customer_reference'] = s_param.get('customer_reference', project['customer_reference'])
                s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project['uppnex_id'])
            s_param['success'] = sequencing_success(s_param, cutoffs)
            # Render missing values as "N/A" in the note; renamed the
            # comprehension variable so it no longer shadows the loop's k
            s_param.update({field: "N/A" for field in s_param.keys() if s_param[field] is None})
            make_note("{}.pdf".format(s["barcode_name"]), headers, paragraphs, **s_param)
Ejemplo n.º 4
0
def setUpModule():
    """Create and populate the test databases on the local CouchDB server.

    Skipped entirely when the couchdb package is unavailable. For each
    database name in DATABASES this (1) creates the database if missing,
    (2) syncs the design-document views defined in VIEWS and (3) uploads
    the example project summary documents from the test data directory.
    """
    if not has_couchdb:
        return
    server = couchdb.Server()

    ## Create databases
    for x in DATABASES:
        # Use the `in` operator instead of calling __contains__ directly
        if x not in server:
            LOG.info("Creating database {}".format(x))
            server.create(x)
    ## Create views for flowcells and samples
    for dbname in DATABASES:
        dblab = dbname.replace("-test", "")
        db = server[dbname]
        for k, v in VIEWS[dblab].items():
            for title, view in v.items():
                viewdef = ViewDefinition(k, title, view)
                viewdef.sync(db)

    ## Create and upload project summary
    with open(os.path.join(filedir, "data", "config",
                           "project_summary.yaml")) as fh:
        # safe_load avoids arbitrary python object construction from YAML
        prj_sum = yaml.safe_load(fh)
    db = server["samples-test"]
    p_con = ProjectSummaryConnection(dbname="projects-test",
                                     username="******",
                                     password="******")
    for p in prj_sum:
        prj = ProjectSummaryDocument(**p)
        p_con.save(prj, key="project_name")
Ejemplo n.º 5
0
    def list_projects(self):
        """List the projects on a flowcell together with their application.

        Looks up the flowcell metric document for the given flowcell,
        extracts csv samplesheet and runParameter data from it, and writes
        one tab-separated row per project to stdout.
        """
        if not self._check_pargs(["flowcell"]):
            return

        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
            )
            return

        # Flowcell id is the date and position/barcode parts of the run
        # directory name, e.g. 120924_SN0_0001_AC003CCCXX -> 120924_AC003CCCXX
        s = self.pargs.flowcell.split("_")
        fcid = "_".join([s[0], s[-1]])

        self.log.debug("Establishing FlowcellRunMetricsConnection")
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
        self.log.debug("Establishing ProjectSummaryConnection")
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))

        self.log.debug("Fetching flowcell metric document for flowcell {}".format(fcid))
        fc = fc_con.get_entry(fcid)
        if fc is None:
            self.log.warn("No flowcell metric document for flowcell {}".format(fcid))
            return

        self.log.debug("Fetching csv samplesheet data for flowcell {}".format(fcid))
        ssheet_data = self._get_samplesheet_sample_data(fc)
        if len(ssheet_data) == 0:
            self.log.warn("No csv samplesheet data for flowcell {}".format(fcid))
            return

        self.log.debug("Fetch runParameter data for flowcell {}".format(fcid))
        run_data = self._get_run_parameter_data(fc)
        if len(run_data) == 0:
            self.log.warn("No runParameter data for flowcell {}".format(fcid))

        # Header row (a dead `out_data = [[self.pargs.flowcell]]`
        # assignment earlier in the original was removed -- it was
        # unconditionally overwritten here)
        out_data = [
            [self.pargs.flowcell, run_data.get("InstrumentType", "HiSeq2000"), run_data.get("RunMode", "High Output")]
        ]

        # Extract the project names
        projects = set([proj[0].replace("__", ".") for data in ssheet_data.values() for proj in data.values()])

        # Extract application for each project
        for project in projects:
            self.log.debug("Fetching project data document for project {}".format(project))
            pdoc = p_con.get_entry(project)
            if pdoc is None:
                self.log.warn("No project data document for project {}".format(project))
                pdoc = {}

            application = pdoc.get("application", "N/A")
            out_data.append([project, application])

        self.app._output_data["stdout"].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
Ejemplo n.º 6
0
    def upload_qc(self):
        """Upload flowcell and sample run QC metric documents to statusdb.

        Validates the flowcell directory, collects qc objects using the
        pre-casava or casava file layout (chosen by run date), then saves
        flowcell/sample documents through the respective database
        connections, resolving the project sample name where possible.
        """
        if not self._check_pargs(['flowcell']):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get(
            "db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting"
                .format(self.pargs.flowcell))
            return

        runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell),
                                   "{}.csv".format(fc_id(self.pargs.flowcell)))
        runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell),
                                    "run_info.yaml")
        (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
        # Runs dated before 120815 (2012-08-15) use the pre-casava layout
        if int(fc_date) < 120815:
            self.log.info(
                "Assuming pre-casava based file structure for {}".format(
                    fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(
                fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if len(qc_objects) == 0:
            self.log.info("No out-of-date qc objects for {}".format(
                fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(
                len(qc_objects)))

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get(
            "db", "samples"),
                                           **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get(
            "db", "flowcells"),
                                              **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get(
            "db", "projects"),
                                         **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            # Sample documents additionally get their project sample name
            # resolved from the project summary database when a match exists
            if isinstance(obj, SampleRunMetricsDocument):
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None),
                    self.pargs.extensive_matching)
                if project_sample:
                    obj["project_sample_name"] = project_sample['sample_name']
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Ejemplo n.º 7
0
def bcbb_configuration_from_samplesheet(csv_samplesheet, couch_credentials):
    """Parse an illumina csv-samplesheet and return a dictionary suitable for the bcbb-pipeline.

    :param csv_samplesheet: path to the illumina csv samplesheet
    :param couch_credentials: keyword arguments for ProjectSummaryConnection
    :returns: the parsed lane configuration, with each multiplexed entry's
        analysis settings chosen from the project's application
    """
    tfh, yaml_file = tempfile.mkstemp('.yaml', 'samplesheet')
    os.close(tfh)
    yaml_file = bcbio.solexa.samplesheet.csv2yaml(csv_samplesheet, yaml_file)
    with open(yaml_file) as fh:
        config = yaml.load(fh)

    # Mapping from project application to pipeline analysis settings
    application_setup = {
                         'Amplicon': {'analysis': 'Align_standard'},
                         'ChIP-seq': {'analysis': 'RNA-seq'},
                         'Custom capture': {'analysis': 'Align_standard_seqcap'},
                         'de novo': {'analysis': 'Align_standard',
                                     'genome_build': 'unknown'},
                         'Exome capture': {'analysis': 'Align_standard_seqcap'},
                         'Finished library': {'analysis': 'Align_standard',
                                              'genome_build': 'unknown'},
                         'Mate-pair': {'analysis': 'Align_standard',
                                       'genome_build': 'unknown'},
                         'Metagenome': {'analysis': 'Align_standard',
                                        'genome_build': 'unknown'},
                         'miRNA-seq': {'analysis': 'Align_standard',
                                       'genome_build': 'unknown'},
                         'RNA-seq (mRNA)': {'analysis': 'RNA-seq'},
                         'RNA-seq (total RNA)': {'analysis': 'RNA-seq'},
                         'WG re-seq': {'analysis': 'Align_standard'},
                         'default': {'analysis': 'Align_standard'},
                         }

    # Connect to maggie to get project application
    try:
        p_con = ProjectSummaryConnection(**couch_credentials)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed; the pipeline still degrades gracefully
        print("Can't connect to maggie to get application")
        p_con = None

    # Replace the default analysis
    ## TODO: This is an ugly hack, should be replaced by a custom config
    for lane in config:
        for plex in lane.get('multiplex', []):
            application = 'default'
            if p_con is not None:
                try:
                    project = p_con.get_entry(plex.get('sample_prj', ''))
                    if project is not None:
                        application = project.get("application", 'default').strip()
                except Exception:
                    # Fall back to the default analysis on any lookup error
                    application = 'default'
            setup = application_setup.get(application, application_setup['default'])
            for key, val in setup.items():
                plex[key] = val

    # Remove the yaml file, we will write a new one later
    os.remove(yaml_file)

    return config
Ejemplo n.º 8
0
 def list_projects(self):
     """List the projects on a flowcell with their application and type.

     Looks up the flowcell metric document for the given flowcell,
     extracts csv samplesheet and runParameter data from it, and writes
     one tab-separated row per project to stdout.
     """
     if not self._check_pargs(["flowcell"]):
         return

     url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
     if not url:
         self.app.log.warn("Please provide a valid url: got {}".format(url))
         return
     if not validate_fc_directory_format(self.pargs.flowcell):
         self.app.log.warn("Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell))
         return

     # Flowcell id is the date and position/barcode parts of the run
     # directory name, e.g. 120924_SN0_0001_AC003CCCXX -> 120924_AC003CCCXX
     s = self.pargs.flowcell.split("_")
     fcid = "_".join([s[0], s[-1]])

     self.log.debug("Establishing FlowcellRunMetricsConnection")
     fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
     self.log.debug("Establishing ProjectSummaryConnection")
     p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))

     self.log.debug("Fetching flowcell metric document for flowcell {}".format(fcid))
     fc = fc_con.get_entry(fcid)
     if fc is None:
         self.log.warn("No flowcell metric document for flowcell {}".format(fcid))
         return

     self.log.debug("Fetching csv samplesheet data for flowcell {}".format(fcid))
     ssheet_data = self._get_samplesheet_sample_data(fc)
     if len(ssheet_data) == 0:
         self.log.warn("No csv samplesheet data for flowcell {}".format(fcid))
         return

     self.log.debug("Fetch runParameter data for flowcell {}".format(fcid))
     run_data = self._get_run_parameter_data(fc)
     if len(run_data) == 0:
         self.log.warn("No runParameter data for flowcell {}".format(fcid))

     # Header row (the original's earlier `out_data = [[self.pargs.flowcell]]`
     # was dead code, unconditionally overwritten here; removed)
     out_data = [[self.pargs.flowcell,
                  run_data.get("InstrumentType", "HiSeq2000"),
                  run_data.get("RunMode", "High Output")]]

     # Extract the project names
     projects = set([proj[0].replace("__", ".") for data in ssheet_data.values() for proj in data.values()])

     # Extract application for each project
     for project in projects:
         self.log.debug("Fetching project data document for project {}".format(project))
         pdoc = p_con.get_entry(project)
         if pdoc is None:
             self.log.warn("No project data document for project {}".format(project))
             pdoc = {}

         application = pdoc.get("application", "N/A")
         # Renamed from `type`, which shadowed the builtin
         project_type = pdoc.get("type", "Check GPL")
         out_data.append([project, application, project_type])

     self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
Ejemplo n.º 9
0
 def test_3_sample_map(self):
     """Test getting a sample mapping"""
     con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
     # Map project sample names to sample run metrics documents
     sample_map = con.map_name_to_srm(self.examples["project"], use_ps_map=False, check_consistency=True)
     print(sample_map)
Ejemplo n.º 10
0
 def test_4_srm_map(self):
     """Test getting a sample mapping from srm to project samples"""
     con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
     # Map sample run metrics documents back to project sample names
     mapping = con.map_srm_to_name(self.examples["project"], fc_id=self.examples["flowcell"], use_ps_map=False, check_consistency=True)
     print(mapping)
Ejemplo n.º 11
0
    def update(self):
        """Update sample run metrics documents for a project in statusdb.

        With --project_id set, writes 'project_id' onto every sample run
        of the project. With --names (a JSON file path or a python dict
        literal mapping barcode_name -> project sample name), writes
        'project_sample_name' from that mapping; otherwise falls back to
        extensive matching against the project summary database.
        Overwrites of existing values prompt for confirmation unless
        --force is given.
        """
        if not self._check_pargs(["sample_prj"]):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)

        if self.pargs.project_id:
            self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
            for s in samples:
                # Ask before overwriting an already-set project_id
                if not s.get("project_id", None) is None:
                    if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                        continue
                s["project_id"] = self.pargs.project_id
                s_con.save(s)
        if self.pargs.names:
            self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
            # --names is either a JSON file on disk or an inline dict literal
            if os.path.exists(self.pargs.names):
                with open(self.pargs.names) as fh:
                    names_d = json.load(fh)
            else:
                names_d= ast.literal_eval(self.pargs.names)
            # Group sample runs by barcode_name (groupby requires sorted input)
            samples_sort = sorted(samples, key=lambda s:s["barcode_name"])
            groups = {}
            for k, g in itertools.groupby(samples_sort, key=lambda x:x["barcode_name"]):
                groups[k] = list(g)
            for barcode_name in names_d:
                sample_list = groups.get(barcode_name, None)
                if not sample_list:
                    continue
                for s in sample_list:
                    # Ask before overwriting an already-set project_sample_name
                    if not s.get("project_sample_name", None) is None:
                        if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                            continue
                    s["project_sample_name"] = names_d[barcode_name]
                    s_con.save(s)
        else:
            # No explicit mapping: resolve names via the project summary db
            self.app.log.info("Trying to use extensive matching...")
            p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
            project_name = self.pargs.sample_prj
            if self.pargs.project_alias:
                project_name = self.pargs.project_alias
            for s in samples:
                project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
                if project_sample:
                    self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                    s["project_sample_name"] = project_sample["sample_name"]
                    s_con.save(s)
Ejemplo n.º 12
0
    def update(self):
        """Update sample run metrics documents for a project in statusdb.

        With --project_id set, writes 'project_id' onto every sample run
        of the project. With --names (a JSON file path or a python dict
        literal mapping barcode_name -> project sample name), writes
        'project_sample_name' from that mapping; otherwise falls back to
        extensive matching against the project summary database.
        Overwrites of existing values prompt for confirmation unless
        --force is given.
        """
        if not self._check_pargs(["sample_prj"]):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)

        if self.pargs.project_id:
            self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
            for s in samples:
                # Ask before overwriting an already-set project_id
                if not s.get("project_id", None) is None:
                    if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                        continue
                s["project_id"] = self.pargs.project_id
                s_con.save(s)
        if self.pargs.names:
            self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
            # --names is either a JSON file on disk or an inline dict literal
            if os.path.exists(self.pargs.names):
                with open(self.pargs.names) as fh:
                    names_d = json.load(fh)
            else:
                names_d= ast.literal_eval(self.pargs.names)
            # Group sample runs by barcode_name (groupby requires sorted input)
            samples_sort = sorted(samples, key=lambda s:s["barcode_name"])
            groups = {}
            for k, g in itertools.groupby(samples_sort, key=lambda x:x["barcode_name"]):
                groups[k] = list(g)
            for barcode_name in names_d:
                sample_list = groups.get(barcode_name, None)
                if not sample_list:
                    continue
                for s in sample_list:
                    # Ask before overwriting an already-set project_sample_name
                    if not s.get("project_sample_name", None) is None:
                        if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                            continue
                    s["project_sample_name"] = names_d[barcode_name]
                    s_con.save(s)
        else:
            # No explicit mapping: resolve names via the project summary db
            self.app.log.info("Trying to use extensive matching...")
            p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
            project_name = self.pargs.sample_prj
            if self.pargs.project_alias:
                project_name = self.pargs.project_alias
            for s in samples:
                project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
                if project_sample:
                    self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                    s["project_sample_name"] = project_sample["sample_name"]
                    s_con.save(s)
Ejemplo n.º 13
0
 def setUp(self):
     """Prepare credentials, example identifiers and a project connection."""
     self.user, self.pw, self.url = "******", "pw", "localhost"
     self.examples = dict(
         sample="1_120924_AC003CCCXX_TGACCA",
         flowcell="AC003CCCXX",
         project="J.Doe_00_01",
     )
     self.p_con = ProjectSummaryConnection(
         dbname="projects-test", username=self.user, password=self.pw, url=self.url)
Ejemplo n.º 14
0
 def test_dbcon(self):
     """Test database connection and that we get expected values."""
     s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     sample_docs = [s_con.get_entry(name) for name in s_con.name_view]
     samples_d = {doc["name"]: doc for doc in sample_docs}
     self.assertEqual(samples_d["1_120924_AC003CCCXX_TGACCA"]["date"], "120924")
     self.assertEqual(samples_d["1_121015_BB002BBBXX_TGACCA"]["flowcell"], "BB002BBBXX")
     self.assertEqual(samples_d["2_120924_AC003CCCXX_ACAGTG"]["entity_type"], "sample_run_metrics")
     self.assertEqual(samples_d["3_120924_AC003CCCXX_ACAGTG"]["lane"], "3")
     self.assertEqual(samples_d["4_120924_AC003CCCXX_CGTTAA"]["sequence"], "CGTTAA")
     self.assertEqual(samples_d["2_121015_BB002BBBXX_TGACCA"]["project_id"], "P002")
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
     flowcell_docs = [fc_con.get_entry(name) for name in fc_con.name_view]
     flowcells_d = {doc["name"]: doc for doc in flowcell_docs}
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["name"], "120924_AC003CCCXX")
     self.assertEqual(flowcells_d["121015_BB002BBBXX"]["name"], "121015_BB002BBBXX")
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["entity_type"], "flowcell_run_metrics")
     p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     project_docs = [p_con.get_entry(name) for name in p_con.name_view]
     projects_d = {doc["project_name"]: doc for doc in project_docs}
     self.assertEqual(projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
     self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
     self.assertEqual(set(projects_d["J.Doe_00_01"]["samples"].keys()),
                      set(["P001_101_index3", "P001_102", "P001_103"]))
     self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"], "GnuGenome")
     self.assertEqual(projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
     self.assertEqual(projects_d["J.Doe_00_03"]["samples"].keys(), ["3_index6"])
     self.assertIn("A", projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"])
Ejemplo n.º 15
0
 def bpreport(self):
     """Write a best practice report for a project.

     Locates sample configuration files under the project directory and,
     unless --no-statusdb is given, builds a scilife-to-customer sample
     name map from the project/sample databases; then renders the report
     files into the project directory.
     """
     if not self._check_pargs(["project"]):
         return
     kw = vars(self.pargs)
     basedir = os.path.abspath(
         os.path.join(self.app.controller._meta.root_path,
                      self.app.controller._meta.path_id))
     flist = find_samples(basedir, **vars(self.pargs))
     if not len(flist) > 0:
         self.log.info("No samples/sample configuration files found")
         return
     if self.pargs.no_statusdb:
         # Skip the database lookup; report with scilife names only
         sample_name_map = None
     else:
         if not self._check_pargs(["statusdb_project_name"]):
             return
         p_con = ProjectSummaryConnection(dbname=self.app.config.get(
             "db", "projects"),
                                          **vars(self.app.pargs))
         s_con = SampleRunMetricsConnection(dbname=self.app.config.get(
             "db", "samples"),
                                            **vars(self.app.pargs))
         sample_name_map = get_scilife_to_customer_name(
             self.pargs.statusdb_project_name, p_con, s_con)
     kw.update(project_name=self.pargs.project,
               flist=flist,
               basedir=basedir,
               sample_name_map=sample_name_map)
     out_data = best_practice_note(**kw)
     self.log.info(
         "Wrote report to directory {}; use Makefile to generate pdf report"
         .format(basedir))
     # Forward captured streams from the report run to the app's output
     self.app._output_data['stdout'].write(out_data['stdout'].getvalue())
     self.app._output_data['stderr'].write(out_data['stderr'].getvalue())
     self.app._output_data['debug'].write(out_data['debug'].getvalue())
Ejemplo n.º 16
0
 def __init__(self, config):
     """Connect to the Google Docs QC checklist and to statusdb.

     Opens the spreadsheet named by 'qc_checklist' in the 'gdocs'
     section of *config* and looks up its 'Ongoing', 'Coming' and
     'Finished' worksheets; then opens a project summary connection
     using the 'statusdb' section (self.pcon is None if that fails).

     NOTE(review): the assert statements are input validation and are
     stripped when running under ``python -O``; consider raising
     explicit exceptions instead.
     """
     super(GDocsUpdater, self).__init__(config)

     # Connect to the Google Docs api
     gdconf = self.config.get("gdocs",{})
     creds = os.path.expanduser(gdconf.get("credentials_file",""))
     assert os.path.exists(creds), "Supplied GDocs credentials file does not exist"
     self.gdcon = SpreadSheet(get_credentials(creds))
     assert self.gdcon, "Could not get a SpreadSheet object, please verify gdocs credentials"
     doc = gdconf.get("qc_checklist",None)
     assert doc, "No QC checklist specified in configuration, please specify"
     ssheet = self.gdcon.get_spreadsheet(doc)
     assert ssheet, "Could not locate QC checklist '{}' on Google Docs. Please make sure it exists".format(doc)
     self.gdcon.ssheet = ssheet

     # Get the Ongoing, Finished and Coming worksheets
     self.ongoing = self.gdcon.get_worksheet("Ongoing")
     self.coming = self.gdcon.get_worksheet("Coming")
     self.finished = self.gdcon.get_worksheet("Finished")
     assert self.ongoing and self.coming and self.finished, "Could not get 'Ongoing', 'Finished' and 'Coming' worksheets from '{}'. Please make sure that they exist".format(doc)

     # Get a connection to the StatusDB project database
     dbconf = self.config.get("statusdb",{})
     try:
         self.pcon = ProjectSummaryConnection(url=dbconf.get("url","localhost"), 
                                              username=dbconf.get("user","user"), 
                                              password=dbconf.get("password","pass"))
     except ConnectionError:
         self.pcon = None
Ejemplo n.º 17
0
 def setUp(self):
     """FIXME: All other tests depend on data being uploaded, so
     these are not real unit tests. The setup to TestQCUpload has to
     be run prior to other tests, else unexpected failures will
     occur."""
     qc_extensions = [
         'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
     ]
     # Upload qc data for both test flowcells before any assertions run
     for flowcell in (flowcells[0], flowcells[1]):
         self.app = self.make_app(
             argv=['qc', 'upload-qc', flowcell, '--mtime', '10000'],
             extensions=qc_extensions)
         self._run_app()
     self.s_con = SampleRunMetricsConnection(
         dbname="samples-test", username="******", password="******")
     self.p_con = ProjectSummaryConnection(
         dbname="projects-test", username="******", password="******")
     self.fc_con = FlowcellRunMetricsConnection(
         dbname="flowcells-test", username="******", password="******")
Ejemplo n.º 18
0
def closed_projects(report, from_date=None, to_date=None, **kw):
    """Return projects whose close date falls within the given range.

    :param report: object whose ``log`` attribute is used for error logging
    :param from_date: earliest close date (datetime) to include, or None
    :param to_date: latest close date (datetime) to include, or None
    :param kw: keyword arguments passed on to ProjectSummaryConnection

    :returns: list of dicts with 'name' and 'closed' ("%Y-%m-%d") keys,
        or False if no database connection could be obtained
    """
    finished = []

    # Get a connection to the database
    pcon = ProjectSummaryConnection(**kw)
    if not pcon:
        # BUGFIX: the original message called .format(project) on a name
        # that is not defined at this point, raising NameError instead of
        # logging the error.
        report.log.error("Could not get connection to database")
        return False

    # Loop over the entries in the project_dates view
    for item in pcon.db.view("project/project_dates", reduce=False):
        try:
            project = item.key
            # Entries without a close_date fall back to "0000-00-00",
            # which strptime rejects (year 0000), so they are skipped by
            # the ValueError handler below.
            closed = datetime.datetime.strptime(
                item.value.get("close_date", "0000-00-00"), "%Y-%m-%d")
            # Skip the entry if the date is outside the range we're interested in
            if from_date is not None and closed < from_date:
                continue
            if to_date is not None and closed > to_date:
                continue
            finished.append({
                'name': project,
                'closed': datetime.datetime.strftime(closed, "%Y-%m-%d")
            })
        except ValueError:
            continue
    return finished
Ejemplo n.º 19
0
 def setUp(self):
     """Set up test credentials, example identifiers and a project DB connection."""
     self.user = "******"
     self.pw = "pw"
     self.url = "localhost"
     self.examples = {
         "sample": "1_120924_AC003CCCXX_TGACCA",
         "flowcell": "AC003CCCXX",
         "project": "J.Doe_00_01",
     }
     self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                           username=self.user,
                                           password=self.pw,
                                           url=self.url)
Ejemplo n.º 20
0
    def upload_qc(self):
        """Collect QC objects for the flowcell given on the command line and
        save them to the flowcells/samples databases, linking each sample run
        to its project sample where possible."""
        if not self._check_pargs(["flowcell"]):
            return
        # Database url: the command-line option wins over the app config.
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
            )
            return

        # NOTE(review): runinfo_csv and runinfo_yaml are computed but never
        # used in this method — confirm whether they can be removed.
        runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell), "{}.csv".format(fc_id(self.pargs.flowcell)))
        runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
        (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
        # Flowcells dated before 120815 (YYMMDD) use the pre-casava layout.
        if int(fc_date) < 120815:
            self.log.info("Assuming pre-casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if len(qc_objects) == 0:
            self.log.info("No out-of-date qc objects for {}".format(fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            # Route each object to the matching database; dry() presumably
            # guards the save for dry-run mode — confirm against its definition.
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                # Attach the project sample name so the sample run can be
                # traced back to its project entry.
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching
                )
                if project_sample:
                    obj["project_sample_name"] = project_sample["sample_name"]
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Ejemplo n.º 21
0
def application_qc(project_name=None, flowcell=None, application=None,
                   username=None, password=None, url=None,
                   sampledb="samples", projectdb="projects", **kw):
    """Perform application specific qc on a project.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param application: application for which to perform qc
    :param username: database username
    :param password: database password
    :param url: database url
    :param sampledb: samples database name
    :param projectdb: project database name

    :returns: dict with 'stdout' and 'stderr' StringIO buffers holding the
        qc report (stdout is left empty when qc could not be performed)
    """
    LOG.debug("Doing application qc for project {}, flowcell {}".format(project_name, flowcell))

    output_data = {'stdout':StringIO(), 'stderr':StringIO()}
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)
    s_con = SampleRunMetricsConnection(dbname=sampledb, username=username, password=password, url=url)
    prj_summary = p_con.get_entry(project_name)
    # NOTE: qc_data is computed lazily in each branch below; the original
    # unconditionally called get_qc_data here and then discarded the result
    # on every path.

    if prj_summary is not None:
        qc_data = get_qc_data(project_name, p_con, s_con, flowcell)
        # Resolve the application from the project summary; fall back to the
        # explicitly supplied application when the summary value is unknown.
        if prj_summary.get("application") not in APPLICATION_MAP.keys():
            if not application:
                LOG.warn("No such application {}. Please use the application option (available choices {})".format(application, ",".join(QC_CUTOFF.keys())))
                return output_data
        else:
            application = APPLICATION_MAP[prj_summary.get("application")]
    else:
        LOG.info("No such project {} in project summary. Trying to get qc data anyway.".format(project_name))
        if not application:
            LOG.warn("No application provided. Please use the application option (available choices {})".format(",".join(QC_CUTOFF.keys())))
            return output_data
        qc_data = _get_sample_qc_data(project_name, application, s_con, flowcell)

    output_data = _qc_info_header(project_name, application, output_data)
    for k,v in sorted(qc_data.iteritems()):
        y = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(y) + "\n")
    return output_data
Ejemplo n.º 22
0
 def test_2_make_project_note(self):
     """Make a project note subset by flowcell and project"""
     s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
     fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
     p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
     paragraphs = project_note_paragraphs()
     headers = project_note_headers()
     param = parameters
     project = p_con.get_entry(self.examples["project"])
     if not project:
         print "No project named {}".format(self.examples["project"])
         return
     if project:
         ordered_amount = p_con.get_ordered_amount(self.examples["project"])
     else:
         return
         ordered_amount = self.pargs.ordered_million_reads
         
     ## Start collecting the data
     sample_table = []
     sample_list = project['samples']
     param.update({key:project.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
     samples = p_con.map_name_to_srm(self.examples["project"], check_consistency=True, use_bc_map=True)
     all_passed = True
     for k,v in samples.items():
         if k=="Unexpected":
             continue
         project_sample = sample_list[k]
         vals = {x:project_sample.get(prjs_to_table[x], None) for x in prjs_to_table.keys()}
         vals['MOrdered'] = ordered_amount
         vals['BarcodeSeq'] = s_con.get_entry(v.keys()[0], "sequence")
         
         ## Set status
         vals['Status'] = set_status(vals) if vals['Status'] is None else vals['Status']
         vals.update({k:"N/A" for k in vals.keys() if vals[k] is None})
         if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
         sample_table.append([vals[k] for k in table_keys])
     if all_passed: param["finished"] = 'Project finished.'
     sample_table.sort()
     sample_table = list(sample_table for sample_table,_ in itertools.groupby(sample_table))
     sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
     paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
     make_note("{}.pdf".format(self.examples["project"]), headers, paragraphs, **param)
Ejemplo n.º 23
0
def bcbb_configuration_from_samplesheet(csv_samplesheet, couch_credentials):
    """Parse an illumina csv-samplesheet and return a dictionary suitable for the bcbb-pipeline
    """
    tfh, yaml_file = tempfile.mkstemp('.yaml', 'samplesheet')
    os.close(tfh)
    yaml_file = bcbio.solexa.samplesheet.csv2yaml(csv_samplesheet, yaml_file)
    with open(yaml_file) as fh:
        config = yaml.load(fh)

    application_setup = {
        'Amplicon': {
            'analysis': 'Align_standard'
        },
        'ChIP-seq': {
            'analysis': 'Align_standard',
            'genome_build': 'phix'
        },
        'Custom capture': {
            'analysis': 'Align_standard_seqcap'
        },
        'de novo': {
            'analysis': 'Align_standard',
            'genome_build': 'unknown'
        },
        'Exome capture': {
            'analysis': 'Align_standard_seqcap'
        },
        'Finished library': {
            'analysis': 'Align_standard',
            'genome_build': 'phix'
        },
        'Mate-pair': {
            'analysis': 'Align_standard',
            'genome_build': 'unknown'
        },
        'Metagenome': {
            'analysis': 'Align_standard',
            'genome_build': 'unknown'
        },
        'miRNA-seq': {
            'analysis': 'Align_standard',
            'genome_build': 'unknown'
        },
        'RNA-seq (mRNA)': {
            'analysis': 'Align_standard',
            'genome_build': 'phix'
        },
        'RNA-seq (total RNA)': {
            'analysis': 'Align_standard',
            'genome_build': 'phix'
        },
        'WG re-seq': {
            'analysis': 'Align_standard'
        },
        'default': {
            'analysis': 'Align_standard'
        },
    }

    #Connect to maggie to get project application
    try:
        p_con = ProjectSummaryConnection(**couch_credentials)
    except:
        print "Can't connect to maggie to get application"
        p_con = None

# Replace the default analysis
## TODO: This is an ugly hack, should be replaced by a custom config
    for lane in config:
        for plex in lane.get('multiplex', []):
            application = 'default'
            if p_con is not None:
                try:
                    Proj = plex.get('sample_prj', '')
                    project = p_con.get_entry(Proj)
                    if project is not None:
                        application = project.get("application",
                                                  'default').strip()
                except:
                    application = 'default'
            setup = application_setup.get(application,
                                          application_setup['default'])
            for key, val in setup.items():
                plex[key] = val

    # Remove the yaml file, we will write a new one later
    os.remove(yaml_file)

    return config
Ejemplo n.º 24
0
    def raw_data(self):
        """Deliver raw fastq data for a project to its Uppnex INBOX.

        Transfers the files with rsync, verifies the transfer with md5
        checksums, fixes permissions, and logs the delivery both to a
        json file next to the data and to the corresponding flowcell
        document in StatusDB.
        """
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        # NOTE(review): uid/gid are computed but not used later in this
        # method — the getgrnam call still validates that the group exists.
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            # BUGFIX: was grp.getgrnam(group) with an undefined name,
            # raising NameError whenever --group was supplied.
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project database"
        self.log.debug("Connecting to flowcell database")
        f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
        assert f_con, "Could not get connection to flowcell database"
        self.log.debug("Connecting to x_flowcell database")
        x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
        assert x_con, "Could not get connection to x_flowcell database"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Setup paths and verify parameters
        self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get(
            "production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir=proj_base_dir,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)
        if len(uncompressed) > 0:
            self.log.error(
                "There are uncompressed fastq file for project, kindly check all files are compressed properly before delivery"
            )
            return

        # Extract the list of samples and runs associated with the project and sort them
        samples = self.samples_to_copy(
            pid=p_con.get_entry(self.pargs.project, "project_id"),
            pod=p_con.get_entry(self.pargs.project, "open_date"),
            fc_dict={
                'HiSeq2500': f_con.proj_list,
                'HiSeqX': x_con.proj_list
            },
            proj_base_dir=proj_base_dir,
            destination_root=destination_root,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = {}
            for sample in samples:
                if query_yes_no("Deliver sample {} ?".format(sample),
                                default="no"):
                    to_process[sample] = samples[sample]
            samples = to_process

        if self.pargs.sample:
            sample = samples.get(self.pargs.sample)
            if not sample:
                self.log.error(
                    "There is no such sample {} for project {}".format(
                        self.pargs.sample, self.pargs.project))
                return
            samples = {self.pargs.sample: sample}

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample
        for sample, flowcells in samples.iteritems():
            for fc, files in flowcells.iteritems():
                self.log.info("Processing sample {} and flowcell {}".format(
                    sample, fc))

                # transfer files
                self.log.debug("Transferring {} fastq files".format(
                    len(files['src'])))
                self._transfer_files(sources=files['src'],
                                     targets=files['dst'])

                passed = True
                if self.pargs.link or self.pargs.dry_run:
                    passed = False
                else:
                    # calculate md5sums on the source side and write it on the destination
                    md5 = []
                    for s, d in zip(files['src'], files['dst']):
                        m = md5sum(s)
                        mfile = "{}.md5".format(d)
                        md5.append([m, mfile, s])
                        self.log.debug("md5sum for source file {}: {}".format(
                            s, m))

                    # write the md5sum to a file at the destination and verify the transfer
                    for m, mfile, srcpath in md5:
                        dstfile = os.path.splitext(mfile)[0]
                        self.log.debug(
                            "Writing md5sum to file {}".format(mfile))
                        self.app.cmd.write(
                            mfile, "{}  {}".format(m,
                                                   os.path.basename(dstfile)),
                            True)
                        self.log.debug(
                            "Verifying md5sum for file {}".format(dstfile))
                        dm = md5sum(dstfile)
                        self.log.debug(
                            "md5sum for destination file {}: {}".format(
                                dstfile, dm))
                        if m != dm:
                            # Checksum mismatch: remove the corrupt copy so a
                            # retry can re-transfer it, and mark the flowcell
                            # as not fully delivered.
                            self.log.warn(
                                "md5sum verification FAILED for {}. Source: {}, Target: {}"
                                .format(dstfile, m, dm))
                            self.log.warn(
                                "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                                .format(dstfile))
                            self.app.cmd.safe_unlink(dstfile)
                            self.app.cmd.safe_unlink(mfile)
                            passed = False
                            continue

                        # Modify the permissions to ug+rw
                        for f in [dstfile, mfile]:
                            self.app.cmd.chmod(
                                f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                                | stat.S_IWGRP)

                # touch the flag to trigger uppmax inbox permission fix
                self.app.cmd.safe_touchfile(
                    os.path.join("/sw", "uppmax", "var", "inboxfix",
                                 "schedule", self.pargs.uppmax_project))

                # log the transfer to statusdb if verification passed
                if passed:
                    data = {
                        'raw_data_delivery': {
                            'timestamp': utc_time(),
                            'files': {
                                os.path.splitext(
                                    (os.path.basename(srcpath)))[0]:
                                {
                                    'md5':
                                    m,
                                    'path':
                                    os.path.splitext(mfile)[0],
                                    'size_in_bytes':
                                    self._getsize(os.path.splitext(mfile)[0]),
                                    'source_location':
                                    srcpath
                                }
                                for m, mfile, srcpath in md5
                            }
                        }
                    }
                    jsonstr = json.dumps(data)
                    jsonfile = os.path.join(
                        proj_base_dir, sample, fc,
                        "{}_{}_raw_data_delivery.json".format(sample, fc))
                    self.log.debug(
                        "Writing delivery to json file {}".format(jsonfile))
                    self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                    # BUGFIX: was .format(id), formatting the builtin 'id'
                    # function instead of the flowcell being processed.
                    self.log.debug(
                        "Saving delivery in StatusDB document {}".format(fc))
                    if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                        fc_con = x_con
                    else:
                        fc_con = f_con
                    fc_obj = fc_con.get_entry(fc)
                    self.log.info(
                        "Logging delivery to StatusDB document {}".format(
                            fc_obj.get('_id')))
                    fc_raw_data = fc_obj.get('raw_data_delivery', {})
                    fc_raw_data.update(data['raw_data_delivery'])
                    fc_obj['raw_data_delivery'] = fc_raw_data
                    self._save(fc_con, fc_obj)
                    self.log.debug(jsonstr)
Ejemplo n.º 25
0
    def test_2_make_project_note(self):
        """Make a project note subset by flowcell and project"""
        s_con = SampleRunMetricsConnection(username=self.user,
                                           password=self.pw,
                                           url=self.url)
        fc_con = FlowcellRunMetricsConnection(username=self.user,
                                              password=self.pw,
                                              url=self.url)
        p_con = ProjectSummaryConnection(username=self.user,
                                         password=self.pw,
                                         url=self.url)
        paragraphs = project_note_paragraphs()
        headers = project_note_headers()
        param = parameters
        project = p_con.get_entry(self.examples["project"])
        if not project:
            print "No project named {}".format(self.examples["project"])
            return
        if project:
            ordered_amount = p_con.get_ordered_amount(self.examples["project"])
        else:
            return
            ordered_amount = self.pargs.ordered_million_reads

        ## Start collecting the data
        sample_table = []
        sample_list = project['samples']
        param.update({
            key: project.get(ps_to_parameter[key], None)
            for key in ps_to_parameter.keys()
        })
        samples = p_con.map_name_to_srm(self.examples["project"],
                                        check_consistency=True,
                                        use_bc_map=True)
        all_passed = True
        for k, v in samples.items():
            if k == "Unexpected":
                continue
            project_sample = sample_list[k]
            vals = {
                x: project_sample.get(prjs_to_table[x], None)
                for x in prjs_to_table.keys()
            }
            vals['MOrdered'] = ordered_amount
            vals['BarcodeSeq'] = s_con.get_entry(v.keys()[0], "sequence")

            ## Set status
            vals['Status'] = set_status(
                vals) if vals['Status'] is None else vals['Status']
            vals.update({k: "N/A" for k in vals.keys() if vals[k] is None})
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[k] for k in table_keys])
        if all_passed: param["finished"] = 'Project finished.'
        sample_table.sort()
        sample_table = list(
            sample_table
            for sample_table, _ in itertools.groupby(sample_table))
        sample_table.insert(0, [
            'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
            'Status'
        ])
        paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
        make_note("{}.pdf".format(self.examples["project"]), headers,
                  paragraphs, **param)
Ejemplo n.º 26
0
class TestDbConnection(unittest.TestCase):
    """Integration tests for the sample and project database connections."""

    def setUp(self):
        self.user = "******"
        self.pw = "pw"
        self.url = "localhost"
        self.examples = {
            "sample": "1_120924_AC003CCCXX_TGACCA",
            "flowcell": "AC003CCCXX",
            "project": "J.Doe_00_01",
        }
        self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                              username=self.user,
                                              password=self.pw,
                                              url=self.url)

    def _sample_con(self):
        """Return a fresh connection to the test samples database."""
        return SampleRunMetricsConnection(dbname="samples-test",
                                          username=self.user,
                                          password=self.pw,
                                          url=self.url)

    def test_connection(self):
        """Test database connection"""
        conn = self._sample_con()
        self.assertEqual(conn.url_string, "http://{}:5984".format(self.url))

    def test_get_flowcell(self):
        """Test getting a flowcell for a given sample"""
        conn = self._sample_con()
        flowcell = conn.get_entry(self.examples["sample"], "flowcell")
        self.assertEqual(str(flowcell), self.examples["flowcell"])

    def test_get_sample_ids(self):
        """Test getting sample ids given flowcell and sample_prj"""
        conn = self._sample_con()
        ids = conn.get_sample_ids(fc_id=self.examples["flowcell"])
        LOG.info("Number of samples before subsetting: " + str(len(ids)))
        self.assertEqual(len(ids), 4)
        ids = conn.get_sample_ids(fc_id=self.examples["flowcell"],
                                  sample_prj=self.examples["project"])
        LOG.info("Number of samples after subsetting: " + str(len(ids)))
        self.assertEqual(len(ids), 2)

    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        conn = self._sample_con()
        hits = conn.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(hits)))
        self.assertEqual(len(hits), 4)
        hits = conn.get_samples(fc_id=self.examples["flowcell"],
                                sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " + str(len(hits)))
        self.assertEqual(len(hits), 2)
        hits = conn.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(hits)))
        self.assertEqual(len(hits), 3)
        hits = conn.get_samples(sample_prj=self.examples["project"],
                                fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " + str(len(hits)))
        self.assertEqual(len(hits), 2)

    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        conn = self._sample_con()
        hits = conn.get_samples(sample_prj="bogusproject",
                                fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(len(hits)))
        self.assertEqual(len(hits), 0)

    def test_get_project_sample_ids(self):
        """Test getting project sample ids"""
        conn = self._sample_con()
        ids = conn.get_sample_ids(sample_prj=self.examples["project"])
        names = [conn.db.get(x)["name"] for x in ids]
        self.assertEqual(
            set(names),
            set([
                "1_120924_AC003CCCXX_TGACCA",
                "2_120924_AC003CCCXX_ACAGTG",
                "1_121015_BB002BBBXX_TGACCA",
            ]),
        )

    def test_get_latest_library_prep(self):
        """Test getting latest library prep"""
        prj = self.p_con.get_entry("J.Doe_00_01")
        prj["samples"]["P001_102"]["library_prep"]["B"] = {
            "sample_run_metrics": {"2_120924_AC003CCCXX_TTGGAA": None}
        }
        self.p_con.save(prj)
        preps = self.p_con.get_latest_library_prep(
            project_name=self.examples["project"])
        srm = [x for runs in preps.values() for x in runs]
        # Make sure A prep not in list
        self.assertNotIn("2_120924_AC003CCCXX_ACAGTG", srm)
        # Make sure B prep in list
        self.assertIn("2_120924_AC003CCCXX_TTGGAA", srm)
        # Reset data
        prj = self.p_con.get_entry("J.Doe_00_01")
        del prj["samples"]["P001_102"]["library_prep"]["B"]
        self.p_con.save(prj)
Ejemplo n.º 27
0
def _project_status_note_table(project_name=None, username=None, password=None, url=None,
                               use_ps_map=True, use_bc_map=False, check_consistency=False,
                               ordered_million_reads=None, uppnex_id=None, customer_reference=None,
                               exclude_sample_ids=None, project_alias=None, sample_aliases=None,
                               projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                               include_all_samples=False, param=None, **kw):
    """Collect the sample table data for a project status note.

    :param project_name: project name
    :param username: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report
    :param param: parameter mapping, augmented in place with project information

    :returns: (output_data, sample_table, param) tuple, or None if the project
              does not exist in the project database
    """
    # FIX: mutable default arguments ({}) replaced with None; 'param' in
    # particular is mutated below and would leak state between calls.
    if exclude_sample_ids is None:
        exclude_sample_ids = {}
    if sample_aliases is None:
        sample_aliases = {}
    if param is None:
        param = {}

    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name":"scilife_name", "customer_name":"customer_name", "project_name":"project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered']

    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary']['all_samples_sequenced']
    except (TypeError,KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None, project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"] : {'sample':sample_name, 'id':s["_id"]}}
            samples.update(s_d)
        else:
            # Fall back on the sample alias mapping passed by the caller
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]] : {'sample':sample_aliases[s["barcode_name"]], 'id':s["_id"]}}
                samples.update(s_d)
            else:
                s_d = {s["name"]:{'sample':s["name"], 'id':s["_id"], 'barcode_name':s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key:prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount", p_con.get_ordered_amount(project_name, samples=sample_dict))

    if not param.get('customer_reference') :
        try:
            param['customer_reference'] = prj_summary['details']['customer_project_reference']
        except (TypeError,KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [x for l in last_library_preps.values() for x in l]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k,v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(k, v["id"], ",".join(list(set(last_library_preps[v['sample']].values()))), v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq, ordered_million_reads, param)
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            # FIX: .items() instead of Py2-only .iteritems(); identical iteration
            for k,v in project_sample_d.items():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
            sample_table.append([vals[k] for k in table_keys])
    if all_samples_sequenced: param["finished"] = 'All samples for this project have been sequenced.'
    sample_table.sort()
    # Deduplicate consecutive identical rows (table is sorted above)
    sample_table = list(sample_table for sample_table,_ in itertools.groupby(sample_table))
    sample_table.insert(0, ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])

    return output_data, sample_table, param
Ejemplo n.º 28
0
 def test_4_srm_map(self):
     """Test getting a sample mapping from srm to project samples"""
     # Build the mapping directly from the samples database
     # (use_ps_map=False) and ask for a consistency check between mappings
     p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
     samples = p_con.map_srm_to_name(self.examples["project"], fc_id=self.examples["flowcell"], use_ps_map=False, check_consistency=True)
     # Python 2 print statement; result inspected manually, no assertion made
     print samples
Ejemplo n.º 29
0
def initiate_survey(report, project, **kw):
    """Send out a user survey for a closed project.

    Looks the project up in the status database and in LIMS, verifies that
    it is closed, not aborted and has not already received a survey, then
    emails the survey to every validly formatted project email address and
    records the send date in the LIMS project UDFs.

    :param report: controller object carrying log, pargs and _meta settings
    :param project: project name
    :param kw: keyword arguments forwarded to ProjectSummaryConnection and
        send_survey (e.g. sender, smtphost, smtpport)

    :returns: True if the survey was sent, False otherwise
    """
    # Get a connection to the database
    pcon = ProjectSummaryConnection(**kw)
    if not pcon:
        # FIX: removed a pointless .format(project) call — the message
        # contains no placeholder, so the call was a no-op
        report.log.error("Could not get connection to database")
        return False

    # Get the document for the project
    pdoc = pcon.get_entry(project)
    if not pdoc:
        report.log.error("No such project: {} in database".format(project))
        return False

    # get a project instance from lims
    lproj = lims_project(report, pdoc.get("project_id"))
    if not lproj:
        report.log.error(
            "Could not initiate LIMS object for project {}".format(project))
        return False

    # Check if project is aborted
    if project_aborted(lproj):
        report.log.warn("Project {} has been aborted".format(project))
        return False

    # check if project is closed (closed is None means still open)
    closed = project_closed(lproj)
    if closed is None:
        report.log.warn("Project {} is not closed".format(project))
        return False
    report.log.debug("Project {} closed on {}".format(
        project, datetime.datetime.strptime(closed, report._meta.date_format)))

    # check if a user survey has already been sent
    if survey_sent(lproj):
        report.log.info("Survey already sent for project {}".format(project))
        return False
    report.log.debug("No previous survey sent for {}".format(project))

    # get email addresses for persons connected to the project
    emails = project_email(report, lproj)
    if len(emails) == 0:
        report.log.warn(
            "No email addresses found associated with project {}".format(
                project))
        return False

    # keep only addresses that look like valid emails
    recipients = []
    for email in emails:
        if email is None or not re.match(r'[^@]+@[^@]+\.[^@]+', email):
            report.log.warn("Illegal email format: {}".format(email))
            continue
        recipients.append(email)

    # send the survey email to each recipient
    sent = send_survey(report,
                       project,
                       recipients,
                       sender=kw.get("sender"),
                       smtphost=kw.get("smtphost"),
                       smtpport=kw.get("smtpport"),
                       dryrun=report.pargs.dry_run)

    # update the project udf to indicate that we have sent out the survey
    if sent:
        report.log.info("Survey sent to recipients {} successfully".format(
            ",".join(recipients)))
        lproj.udf['Survey sent'] = datetime.datetime.now().date()
        if not report.pargs.dry_run:
            lproj.put()
    else:
        # FIX: 'elif not sent' was redundant — this branch is simply the
        # else of 'if sent'
        report.log.warn("Sending survey to recipients {} failed".format(
            ",".join(recipients)))

    return sent
Ejemplo n.º 30
0
 def test_3_sample_map(self):
     """Test getting a sample mapping"""
     # Map project sample names to sample run metrics, built directly from
     # the samples database (use_ps_map=False), with a consistency check
     p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
     sample_map = p_con.map_name_to_srm(self.examples["project"], use_ps_map=False, check_consistency=True)
     # Python 2 print statement; result inspected manually, no assertion made
     print sample_map
Ejemplo n.º 31
0
def initiate_survey(report, project, **kw):
    """Send out a user survey for a closed project.

    Looks the project up in the status database and in LIMS, verifies that
    it is closed and has not already received a survey, then emails the
    survey to every validly formatted project email address and records the
    send date in the LIMS project UDFs.

    :param report: controller object carrying log, pargs and _meta settings
    :param project: project name
    :param kw: keyword arguments forwarded to ProjectSummaryConnection and
        send_survey (e.g. sender, smtphost, smtpport)

    :returns: True if the survey was sent, False otherwise
    """
    # Get a connection to the database
    pcon = ProjectSummaryConnection(**kw)
    if not pcon:
        # FIX: removed a pointless .format(project) call — the message
        # contains no placeholder, so the call was a no-op
        report.log.error("Could not get connection to database")
        return False

    # Get the document for the project
    pdoc = pcon.get_entry(project)
    if not pdoc:
        report.log.error("No such project: {} in database".format(project))
        return False

    # get a project instance from lims
    lproj = lims_project(report, pdoc.get("project_id"))
    if not lproj:
        report.log.error("Could not initiate LIMS object for project {}".format(project))
        return False

    # check if project is closed (closed is None means still open)
    closed = project_closed(lproj)
    if closed is None:
        report.log.warn("Project {} is not closed".format(project))
        return False
    report.log.debug("Project {} closed on {}".format(project,datetime.datetime.strptime(closed,report._meta.date_format)))

    # check if a user survey has already been sent
    if survey_sent(lproj):
        report.log.info("Survey already sent for project {}".format(project))
        return False
    report.log.debug("No previous survey sent for {}".format(project))

    # get email addresses for persons connected to the project
    emails = project_email(report,lproj)
    if len(emails) == 0:
        report.log.warn("No email addresses found associated with project {}".format(project))
        return False

    # keep only addresses that look like valid emails
    recipients = []
    for email in emails:
        if email is None or not re.match(r'^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,3})$',email):
            report.log.warn("Illegal email format: {}".format(email))
            continue
        recipients.append(email)

    # send the survey email to each recipient
    sent = send_survey(report,
                       project,
                       recipients,
                       sender = kw.get("sender"),
                       smtphost=kw.get("smtphost"),
                       smtpport=kw.get("smtpport"),
                       dryrun=report.pargs.dry_run)

    # update the project udf to indicate that we have sent out the survey
    if sent:
        report.log.info("Survey sent to recipients {} successfully".format(",".join(recipients)))
        lproj.udf['Survey sent'] = datetime.datetime.now().date()
        if not report.pargs.dry_run:
            lproj.put()
    else:
        # FIX: 'elif not sent' was redundant — this branch is simply the
        # else of 'if sent'
        report.log.warn("Sending survey to recipients {} failed".format(",".join(recipients)))

    return sent
Ejemplo n.º 32
0
def project_status_note(project_name=None,
                        username=None,
                        password=None,
                        url=None,
                        use_ps_map=True,
                        use_bc_map=False,
                        check_consistency=False,
                        ordered_million_reads=None,
                        uppnex_id=None,
                        customer_reference=None,
                        exclude_sample_ids=None,
                        project_alias=None,
                        sample_aliases=None,
                        projectdb="projects",
                        samplesdb="samples",
                        flowcelldb="flowcells",
                        include_all_samples=False,
                        **kw):
    """Make a project status note. Used keywords:

    :param project_name: project name
    :param user: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report

    :returns: output_data mapping with stdout/stderr/debug streams, or None
              if the project does not exist in the project database
    """
    # FIX: mutable default arguments ({}) replaced with None to avoid
    # sharing state between calls
    if exclude_sample_ids is None:
        exclude_sample_ids = {}
    if sample_aliases is None:
        sample_aliases = {}
    # parameters
    parameters = {
        "project_name": project_name,
        "finished": "Not finished, or cannot yet assess if finished.",
    }
    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable
    param = parameters

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            # Fall back on the sample alias mapping passed by the caller
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    param["ordered_amount"] = param.get("ordered_amount",
                                        p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get(
        'customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    # FIX: .values()[0] is Py2-only (dict views aren't
                    # indexable) and showed only one prep; join all prep
                    # names as _project_status_note_table does
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(k, v["id"],
                                ",".join(list(set(last_library_preps[v['sample']].values()))),
                                v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table
                                             ])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            # FIX: .items() instead of Py2-only .iteritems(); identical iteration
            for k, v in project_sample_d.items():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                if vals['Status'] == "N/A" or vals['Status'] == "NP":
                    all_passed = False
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq, ordered_million_reads,
                                            param)
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[k] for k in table_keys])
    if all_passed: param["finished"] = 'Project finished.'
    sample_table.sort()
    # Deduplicate consecutive identical rows (table is sorted above)
    sample_table = list(sample_table
                        for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(0, [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers,
              paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name),
                   sample_table=sample_table,
                   report="project_report",
                   **param)
    param.update(
        {k: "N/A"
         for k in param.keys() if param[k] is None or param[k] == ""})
    output_data["debug"].write(
        json.dumps({
            'param': param,
            'table': sample_table
        }))
    return output_data
# Example script: print the scilife -> customer sample name mapping for one project
from scilifelab.db.statusdb import ProjectSummaryConnection
# Credentials have been scrubbed from this example
pcon = ProjectSummaryConnection(url="tools.scilifelab.se", username="******", password="******")
# Fetch the project document from the projects database
project = pcon.get_entry('C.Dixelius_13_01')
# Emit one tab-separated line per sample
# NOTE(review): assumes both names are present — join() would fail on None; verify
for sample in project.get("samples",{}).values():
       print("\t".join([sample.get('scilife_name'),sample.get('customer_name')]))
Ejemplo n.º 34
0
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about the data delivery

    :param kw: keyword arguments; 'project_name' and optionally 'flowcell'
        are read here, and all of kw is forwarded to the database connections

    :returns: output_data mapping with the delivery table written to stdout
    """
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }

    project_name = kw.get('project_name', None)
    flowcell = kw.get('flowcell', None)
    LOG.debug("Generating data delivery note for project {}{}.".format(
        project_name, ' and flowcell {}'.format(flowcell if flowcell else '')))

    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"

    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))

    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    # FIX: guard against a missing project entry (get_entry returning None)
    # so that the note degrades to 'N/A' customer names instead of crashing
    project_samples = p_con.get_entry(project_name, "samples") or {}
    customer_names = {
        sample_name: sample.get('customer_name', 'N/A')
        for sample_name, sample in project_samples.items()
    }

    data = [[
        'SciLifeLab ID', 'Submitted ID', 'Flowcell', 'Lane', 'Barcode', 'Read',
        'Path', 'MD5', 'Size (bytes)', 'Timestamp'
    ]]
    for sample in samples:
        sname = sample.get('project_sample_name', 'N/A')
        cname = customer_names.get(sname, 'N/A')
        fc = sample.get('flowcell', 'N/A')
        lane = sample.get('lane', 'N/A')
        barcode = sample.get('sequence', 'N/A')
        # Samples without delivery information get an empty row
        if 'raw_data_delivery' not in sample:
            data.append([sname, cname, '', '', '', '', '', '', '', ''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp', 'N/A')
        # FIX: renamed loop variable 'file' -> 'delivered_file' to avoid
        # shadowing the builtin
        for read, delivered_file in delivery.get('files', {}).items():
            data.append([
                sname,
                cname,
                fc,
                lane,
                barcode,
                read,
                delivered_file.get('path', 'N/A'),
                delivered_file.get('md5', 'N/A'),
                delivered_file.get('size_in_bytes', 'N/A'),
                tstamp,
            ])

    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(
        project_name, '_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile, "w") as outh:
        # FIX: writerows replaces the manual per-row loop
        csv.writer(outh).writerows(data)

    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())

    return output_data
Ejemplo n.º 35
0
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error("Uppmax project was not specified and could not be fetched from project database")
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project), key=lambda k: (k.get('project_sample_name','NA'), k.get('flowcell','NA'), k.get('lane','NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
        assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project,self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,self.pargs.uppmax_project,self._meta.uppnex_delivery_dir)
        assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
        destination_root = os.path.join(destination_root,self.pargs.project)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname,
                                                                                                           index,
                                                                                                           fcid,
                                                                                                           lane,
                                                                                                           date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(proj_base_dir,samples)
        if len(uncompressed) > 0:
            self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples),self.pargs.project,destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir,
                                          destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for id, files in to_copy.items():
            # get the sample database object
            [sample] = [s for s in samples if s.get('_id') == id]
            self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name","NA"),sample.get("flowcell","NA")))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                passed = False
            else:
                # calculate md5sums on the source side and write it on the destination
                md5 = []
                for f in files:
                    m = md5sum(f[0])
                    mfile = "{}.md5".format(f[1])
                    md5.append([m,mfile,f[2],f[0]])
                    self.log.debug("md5sum for source file {}: {}".format(f[0],m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, read, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile,"{}  {}".format(m,os.path.basename(dstfile)),True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile,dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile,m,dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(f,stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw","uppmax","var","inboxfix","schedule",self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info("Logging delivery to StatusDB document {}".format(id))
                data = {'raw_data_delivery': {'timestamp': utc_time(),
                                              'files': {'R{}'.format(read):{'md5': m,
                                                                            'path': os.path.splitext(mfile)[0],
                                                                            'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                                                            'source_location': srcpath} for m, mfile, read, srcpath in md5},
                                              }
                        }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                        "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                       sample.get("flowcell"),
                                                                                       sample.get("project_sample_name"),
                                                                                       sample.get("sequence"),
                                                                                       sample.get("lane")))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile,data=jsonstr,overwrite=True)
                self.log.debug("Saving delivery in StatusDB document {}".format(id))
                sample.update(data)
                self._save(s_con,sample)
                self.log.debug(jsonstr)
Ejemplo n.º 36
0
    def raw_data(self):
        """Deliver raw fastq data for a project to its Uppnex INBOX.

        Looks up the project and its sample runs in StatusDB, optionally
        lets the operator select samples interactively, transfers the
        fastq files with rsync, writes and verifies md5 checksums on the
        destination side and, on success, logs the delivery both to a
        json file next to the source data and to the sample run document
        in StatusDB.
        """
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        # NOTE(review): uid/gid are computed but never referenced later in
        # this method -- confirm whether they were meant to be applied to
        # the transferred files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            # BUGFIX: 'group' was an undefined name here (NameError when a
            # --group argument was supplied); use the parsed argument
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell,
                                           sample_prj=self.pargs.project),
                         key=lambda k:
                         (k.get('project_sample_name', 'NA'),
                          k.get('flowcell', 'NA'), k.get('lane', 'NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info(
                    "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                    .format(sname, index, fcid, lane, date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir, samples)
        if len(uncompressed) > 0:
            self.log.warn(
                "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
                .format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for sample_id, files in to_copy.items():
            # get the sample database object; exactly one document is
            # expected to match the id
            [sample] = [s for s in samples if s.get('_id') == sample_id]
            self.log.info("Processing sample {} and flowcell {}".format(
                sample.get("project_sample_name", "NA"),
                sample.get("flowcell", "NA")))

            # calculate md5sums on the source side and write it on the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            # write the md5sum to a file at the destination and verify the transfer
            passed = True
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(
                    mfile, "{}  {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))

                # if dry-run, make sure verification pass
                if self.pargs.dry_run:
                    dm = m
                else:
                    dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(
                    dstfile, dm))
                if m != dm:
                    # remove the corrupt destination files so a retry
                    # starts from a clean state
                    self.log.warn(
                        "md5sum verification FAILED for {}. Source: {}, Target: {}"
                        .format(dstfile, m, dm))
                    self.log.warn(
                        "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                        .format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue

                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(
                        f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                        | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(
                os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                             self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info(
                    "Logging delivery to StatusDB document {}".format(sample_id))
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            'R{}'.format(read): {
                                'md5':
                                m,
                                'path':
                                os.path.splitext(mfile)[0],
                                'size_in_bytes':
                                self._getsize(os.path.splitext(mfile)[0]),
                                'source_location':
                                srcpath
                            }
                            for m, mfile, read, srcpath in md5
                        },
                    }
                }
                jsonstr = json.dumps(data)
                # json receipt is written next to the first source file
                jsonfile = os.path.join(
                    os.path.dirname(md5[0][3]),
                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                        sample.get("date"), sample.get("flowcell"),
                        sample.get("project_sample_name"),
                        sample.get("sequence"), sample.get("lane")))
                self.log.debug(
                    "Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug(
                    "Saving delivery in StatusDB document {}".format(sample_id))
                sample.update(data)
                self._save(s_con, sample)
                self.log.debug(jsonstr)
Ejemplo n.º 37
0
class GDocsUpdater(rm.RunMonitor):
    """Synchronize sequencing-run QC status between a Trello board and a
    Google Docs checklist spreadsheet, optionally back-filling project
    application/type columns from the StatusDB project database.
    """

    def __init__(self, config):
        """Connect to Google Docs, locate the QC checklist worksheets and
        open a connection to the StatusDB project database.

        :param config: configuration dictionary with 'gdocs' and
            'statusdb' sections
        """
        super(GDocsUpdater, self).__init__(config)

        # Connect to the Google Docs api
        gdconf = self.config.get("gdocs",{})
        creds = os.path.expanduser(gdconf.get("credentials_file",""))
        assert os.path.exists(creds), "Supplied GDocs credentials file does not exist"
        self.gdcon = SpreadSheet(get_credentials(creds))
        assert self.gdcon, "Could not get a SpreadSheet object, please verify gdocs credentials"
        doc = gdconf.get("qc_checklist",None)
        assert doc, "No QC checklist specified in configuration, please specify"
        ssheet = self.gdcon.get_spreadsheet(doc)
        assert ssheet, "Could not locate QC checklist '{}' on Google Docs. Please make sure it exists".format(doc)
        self.gdcon.ssheet = ssheet

        # Get the Ongoing, Finished and Coming worksheets
        self.ongoing = self.gdcon.get_worksheet("Ongoing")
        self.coming = self.gdcon.get_worksheet("Coming")
        self.finished = self.gdcon.get_worksheet("Finished")
        assert self.ongoing and self.coming and self.finished, "Could not get 'Ongoing', 'Finished' and 'Coming' worksheets from '{}'. Please make sure that they exist".format(doc)

        # Get a connection to the StatusDB project database
        # NOTE(review): a failed connection degrades gracefully --
        # lookup_project then returns empty strings instead of raising
        dbconf = self.config.get("statusdb",{})
        try:
            self.pcon = ProjectSummaryConnection(url=dbconf.get("url","localhost"),
                                                 username=dbconf.get("user","user"),
                                                 password=dbconf.get("password","pass"))
        except ConnectionError:
            self.pcon = None

    def _list_runs(self, lists):
        """Fetch the Trello cards in the given lists and return a dict
        mapping card name (run id) to the card description parsed into a
        metadata dict."""
        # Loop over the lists and fetch the cards
        runs = {}
        for tlist in lists:
            list_obj = self.trello.get_list(self.trello_board,tlist,True)
            if not list_obj:
                continue

            # Loop over the cards in the list
            for card in list_obj.list_cards():
                # Get the description and convert it to a dictionary
                runs[card.name] = self.description_to_dict(card.description)

        return runs

    def coming_runs(self):
        """Return a dictionary with runs that are currently in process, i.e. not handed over to 
        the processing pipeline on Uppmax. The key in the dictionary is the run id and the values
        is a metadata dictionary
        """

        # Runs in these lists are to be considered "coming"
        lists = [rm.FIRSTREAD,
                 rm.INDEXREAD,
                 rm.SECONDREAD,
                 rm.PROCESSING,
                 rm.UPPMAX,
                 rm.STALLED]
        return self._list_runs(lists)


    def ongoing_runs(self):
        """Return a dictionary with runs that have finished and have been handed over to 
        the processing pipeline on Uppmax. The key in the dictionary is the run id and the values
        is a metadata dictionary
        """

        # Runs in these lists are to be considered "ongoing"
        lists = [rm.COMPLETED]
        return self._list_runs(lists)

    def reshape_run_info(self, runs, skiplist=[]):
        """Take the dictionary of runs and convert to a sorted list of lists with elements 
        corresponding to the columns in the checklist"""
        # NOTE(review): mutable default argument for skiplist; harmless as
        # it is never mutated here, but fragile if that ever changes

        run_projects = []
        for id,data in runs.items():
            p = data.get('Projects',[''])
            if type(p) is not list:
                p = [p]
            for project in p:
                if len(project) == 0:
                    project = 'Unknown, please check!'
                if "{}_{}".format(id,project) not in skiplist:
                    # NOTE(review): the lookup_project call is deliberately(?)
                    # disabled; application/type columns are left blank here
                    # and back-filled later in update_gdocs -- confirm
                    application, tp = '',''#self.lookup_project(project)
                    run_projects.append([id,project,application,tp,'',data.get('Run mode',[''])[0]])

        return run_projects

    def lookup_project(self, project):
        """Lookup project application and type in StatusDB"""

        application = ""
        type = ""  # NOTE(review): shadows the builtin 'type' inside this method
        if self.pcon:
            pdoc = self.pcon.get_entry(project)
            if pdoc:
                application = str(pdoc.get("application",""))
                type = str(pdoc.get("type",pdoc.get("details",{}).get("type","")))

        return application, type

    def get_skiplist(self):
        """Get the runs and projects already listed in the GDocs spreadsheet
        """

        skiplist = []
        # Get the contents from the finished worksheet
        for run_project in self.gdocs_finished_runs():
            skiplist.append("{}_{}".format(run_project[0],run_project[1]))

        return skiplist

    # Convenience accessors returning the unique (run, project) rows
    # currently present in each worksheet
    def gdocs_coming_runs(self):
        return self._get_gdocs_run_projects(self.coming,COMING_HEADER_OFFSET)
    def gdocs_ongoing_runs(self):
        return self._get_gdocs_run_projects(self.ongoing,ONGOING_HEADER_OFFSET)
    def gdocs_finished_runs(self):
        return self._get_gdocs_run_projects(self.finished,FINISHED_HEADER_OFFSET)

    def _get_gdocs_run_projects(self, wsheet, header_offset):
        """Return the unique rows (as lists of strings) in the worksheet
        below the given header offset, de-duplicated on run id + project."""

        # Get the cell data
        run_projects = {}
        rows = self.gdcon.get_cell_content(wsheet,header_offset,1,0,6)
        for row in rows:
            if len(str(row[0])) == 0:
                continue
            data = [str(r) for r in row]
            key = "{}{}".format(data[0],data[1])
            if key in run_projects:
                continue
            run_projects[key] = data

        # Only return unique rows
        return run_projects.values()

    def update_gdocs(self):
        """Push run information from Trello into the GDocs checklist:
        append new coming runs, move runs that have completed to the
        ongoing tab, and back-fill empty application/type columns from
        StatusDB, printing a human-readable summary to stdout."""

        # Get the coming runs from Trello but Exclude runs that are already in gdocs
        gdocs_finished = self.gdocs_finished_runs()
        gdocs_ongoing = self.gdocs_ongoing_runs()
        gdocs_coming = self.gdocs_coming_runs()
        trello_coming = self.reshape_run_info(self.coming_runs(), ["{}_{}".format(r[0],r[1]) for r in gdocs_finished + gdocs_ongoing + gdocs_coming])
        # Get the ongoing runs from Trello but exclude runs that are already in the finished or ongoing tab
        trello_ongoing = self.reshape_run_info(self.ongoing_runs(), ["{}_{}".format(r[0],r[1]) for r in gdocs_finished + gdocs_ongoing])

        # Add each coming run to the next empty row
        for run in trello_coming:
            self.update_empty_row(self.coming,run,COMING_HEADER_OFFSET)

        # Move each run from coming if it exists there to the ongoing tab or just add it
        for run in trello_ongoing:
            status = self.run_project_match(run,gdocs_coming)
            if status == 0:
                self.update_empty_row(self.ongoing,run,ONGOING_HEADER_OFFSET)
                continue
            # Find the row index of the run in the coming tab
            row_index = self.gdcon.get_row_index(self.coming,run[0:2],COMING_HEADER_OFFSET)
            # Get the data from the coming tab, add it to an empty row in the ongoing tab and replace it with empty values
            row_data = self.gdcon.get_cell_content(self.coming,row_index,0,row_index,0)
            self.update_empty_row(self.ongoing,row_data[0],ONGOING_HEADER_OFFSET)
            self.gdcon.update_row(self.coming,row_index,["" for i in xrange(len(row_data[0]))])

        # Sort key: the project surname, i.e. everything after the first '.'
        def last_name(data):
            pcs = data[1].split('.')
            if len(pcs) == 1:
                return pcs[0]
            return "".join(pcs[1:])

        # Lastly, update the application and type fields in gdocs if they are empty
        for wsheet, offset in [(self.coming, COMING_HEADER_OFFSET), (self.ongoing, ONGOING_HEADER_OFFSET)]:
            # Print a reader-friendly text to stdout
            print("{}\n{}\n".format(wsheet.title.text,"".join(['-' for i in xrange(len(wsheet.title.text))])))
            for run in sorted(self._get_gdocs_run_projects(wsheet,offset), key=last_name):
                if len(run) < 4:
                    continue
                if run[2] == "" or run[3] == "":
                    app, tp = self.lookup_project(run[1])
                    if run[2] == "":
                        run[2] = app
                    if run[3] == "":
                        run[3] = tp
                row_index = self.gdcon.get_row_index(wsheet,run[0:2],offset)
                self.gdcon.update_row(wsheet,row_index,run[0:4])

                print("{} - {}{}".format(run[1],"{} - ".format(run[3]) if len(run[3]) > 0 else "",run[4]))
                print("{}{}\n".format("{}\n".format(run[2]) if len(run[2]) > 0 else "",run[0]))

    def update_empty_row(self, wsheet, data, offset, merged=False):
        """Update the next empty row after the specified offset with the supplied data
        """
        updated = False
        # Require two empty rows in succession
        row_index = offset
        r2 = row_index
        while r2-row_index != 1:
            row_index = self.gdcon.get_row_index(wsheet,["" for i in xrange(len(data))],r2)
            # If we're writing a merged row, we need two consecutive empty rows
            if merged:
                r2 = self.gdcon.get_row_index(wsheet,["" for i in xrange(len(data))],row_index+1)
            else:
                r2 = row_index+1

        assert row_index > 0, "***ERROR*** No more rows left in spreadsheet"
        updated = self.gdcon.update_row(wsheet,row_index,data)
        # FIXME: do this better.. if the row is merged, write the same data to the second "hidden" row
        if merged:
            self.gdcon.update_row(wsheet,row_index+1,data)

        return updated

    def run_project_match(self, needle, haystack):
        """Checks if a run and project exist in a list of lists. Determines identity by the two first 
        columns in each list, the third and fourth are checked to determine if they need updating.
        Return 0 for no match, 1 for match that needs updating and 2 for a match that does not need updating
        """
        if len(needle) < 4:
            return 0

        for straw in haystack:
            if len(straw) < 4:
                continue
            if needle[0] != straw[0] or needle[1] != straw[1]:
                continue
            if needle[2] != straw[2] or needle[3] != straw[3]:
                return 1
            return 2

        return 0
Ejemplo n.º 38
0
class TestDbConnection(unittest.TestCase):
    """Integration tests for the StatusDB connection wrappers, run
    against the locally populated *-test databases."""

    def setUp(self):
        self.user = "******"
        self.pw = "pw"
        self.url = "localhost"
        self.examples = {
            "sample": "1_120924_AC003CCCXX_TGACCA",
            "flowcell": "AC003CCCXX",
            "project": "J.Doe_00_01"
        }
        self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                              username=self.user,
                                              password=self.pw,
                                              url=self.url)

    def _sample_connection(self):
        # Every sample-database test needs an identically configured
        # connection; build it in a single place.
        return SampleRunMetricsConnection(dbname="samples-test",
                                          username=self.user,
                                          password=self.pw,
                                          url=self.url)

    def test_connection(self):
        """Test database connection"""
        con = self._sample_connection()
        expected_url = "http://{}:5984".format(self.url)
        self.assertEqual(con.url_string, expected_url)

    def test_get_flowcell(self):
        """Test getting a flowcell for a given sample"""
        con = self._sample_connection()
        flowcell = con.get_entry(self.examples["sample"], "flowcell")
        self.assertEqual(str(flowcell), self.examples["flowcell"])

    def test_get_sample_ids(self):
        """Test getting sample ids given flowcell and sample_prj"""
        con = self._sample_connection()
        ids = con.get_sample_ids(fc_id=self.examples["flowcell"])
        LOG.info("Number of samples before subsetting: " + str(len(ids)))
        self.assertEqual(len(ids), 5)
        ids = con.get_sample_ids(fc_id=self.examples["flowcell"],
                                 sample_prj=self.examples["project"])
        LOG.info("Number of samples after subsetting: " + str(len(ids)))
        self.assertEqual(len(ids), 2)

    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        con = self._sample_connection()

        hits = con.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(hits)))
        self.assertEqual(len(hits), 5)
        hits = con.get_samples(fc_id=self.examples["flowcell"],
                               sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " +
                 str(len(hits)))
        self.assertEqual(len(hits), 2)

        hits = con.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(hits)))
        self.assertEqual(len(hits), 3)
        hits = con.get_samples(sample_prj=self.examples["project"],
                               fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " +
                 str(len(hits)))
        self.assertEqual(len(hits), 2)

    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        con = self._sample_connection()

        hits = con.get_samples(sample_prj="bogusproject",
                               fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " +
                 str(len(hits)))
        self.assertEqual(len(hits), 0)

    def test_get_project_sample_ids(self):
        """Test getting project sample ids"""
        con = self._sample_connection()
        ids = con.get_sample_ids(sample_prj=self.examples["project"])
        names = {con.db.get(x)["name"] for x in ids}
        expected = {
            '1_120924_AC003CCCXX_TGACCA',
            '2_120924_AC003CCCXX_ACAGTG',
            '1_121015_BB002BBBXX_TGACCA',
        }
        self.assertEqual(names, expected)

    def test_get_latest_library_prep(self):
        """Test getting latest library prep"""
        # Inject a newer 'B' prep for sample P001_102 ...
        project_doc = self.p_con.get_entry("J.Doe_00_01")
        project_doc['samples']['P001_102']['library_prep']['B'] = {
            'sample_run_metrics': {
                '2_120924_AC003CCCXX_TTGGAA': None
            }
        }
        self.p_con.save(project_doc)
        preps = self.p_con.get_latest_library_prep(
            project_name=self.examples["project"])
        srm = [x for l in preps.values() for x in l]
        # ... so only the B prep's run metrics should be reported
        self.assertNotIn('2_120924_AC003CCCXX_ACAGTG', srm)
        self.assertIn('2_120924_AC003CCCXX_TTGGAA', srm)
        # Reset data
        project_doc = self.p_con.get_entry("J.Doe_00_01")
        del project_doc['samples']['P001_102']['library_prep']['B']
        self.p_con.save(project_doc)

    def test_get_barcode_lane_statistics(self):
        """Test getting barcode lane statistics from flowcell database"""
        fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                              username="******",
                                              password="******")
        # A non-existent sample name should yield (None, None)
        stats = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                                   "P001_101_index6",
                                                   "120924_AC003CCCXX", "1")
        self.assertEqual(stats, (None, None))
        stats = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                                   "P001_101_index3",
                                                   "120924_AC003CCCXX", "1")
        self.assertEqual(stats, (u'35.22', u'90.05'))
Ejemplo n.º 39
0
def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=None, projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                       phix=None, is_paired=True, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode count override (literal-eval'd option)
    :param project_alias: project alias name(s); ``None`` means no aliases
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end

    :returns: dict with 'stdout', 'stderr' and 'debug' StringIO buffers
    """
    # The default used to be a shared mutable list ([]); use a None
    # sentinel and build a fresh list per call to avoid cross-call sharing.
    if project_alias is None:
        project_alias = []
    # Cutoffs used for the OK/HIGH/LOW quality flags further down
    cutoffs = {
        "phix_err_cutoff" : 2.0,
        "qv_cutoff" : 30,
        }

    # Map instrument id -> instrument/software version info from config file
    instrument = _parse_instrument_config(os.path.expanduser(kw.get("instrument_config","")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # Template of per-sample parameters; copied into s_param for each sample run
    parameters = {
        "project_name" : None,
        "start_date" : None,
        "FC_id" : None,
        "scilifelab_name" : None,
        "rounded_read_count" : None,
        "phix_error_rate" : None,
        "avg_quality_score" : None,
        "pct_q30_bases" : None,
        "success" : None,
        "run_mode":None,
        "is_paired":True
        }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {"project_name":"sample_prj", "FC_id":"flowcell",
                        "scilifelab_name":"barcode_name", "start_date":"date",
                        "rounded_read_count":"bc_count", "lane": "lane"}

    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")".format(flowcell) )
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options (string options may encode python literals)
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key:s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument; fall back on the 'default' config entry when the
        # flowcell document lacks RunInfo -> Instrument.
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        # Was a bare 'except:'; narrowed so SystemExit/KeyboardInterrupt propagate.
        except Exception:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode; cache the flowcell document across consecutive samples on the same flowcell
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters",{})
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get("ClusteringChoice","") == "OnBoardClustering" or s_param["sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode","High Output")
        s_param["sequencing_software"] = "RTA {}".format(runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(runp.get("MCSVersion"),s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(runp.get("ApplicationName"),runp.get("ApplicationVersion"),s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn("Could not determine run setup for flowcell {}. Will assume paired-end.".format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score, pct_q30_bases) = fc_con.get_barcode_lane_statistics(project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Setting average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn("Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount',
                                                p_con.get_ordered_amount(project_name,
                                                                         samples=p_con.get_entry(project_name,'samples')))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item, source)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]) )
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)

            # Always normalize submitted id, since module textttable does not support unicode
            if type(s_param['customer_name']) is unicode:
                s_param['customer_name'] = unicodedata.normalize('NFKD', s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k:"N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] ==  "" or s_param[k] == -1.0})
        # Lane-specific report file name when the sample ran on several lanes
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs':{s["name"]:s["barcode_name"] for s in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    rest_notes = make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
Ejemplo n.º 40
0
def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=None, projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                       phix=None, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode count override (literal-eval'd option)
    :param project_alias: project alias name(s); ``None`` means no aliases
    :param phix: phix error rate

    :returns: dict with 'stdout', 'stderr' and 'debug' StringIO buffers
    """
    # The default used to be a shared mutable list ([]); use a None
    # sentinel and build a fresh list per call to avoid cross-call sharing.
    if project_alias is None:
        project_alias = []
    # Cutoffs used for the OK/HIGH/LOW quality flags further down
    cutoffs = {
        "phix_err_cutoff" : 2.0,
        "qv_cutoff" : 30,
        }

    # Template of per-sample parameters; copied into s_param for each sample run
    parameters = {
        "project_name" : None,
        "start_date" : None,
        "FC_id" : None,
        "scilifelab_name" : None,
        "rounded_read_count" : None,
        "phix_error_rate" : None,
        "avg_quality_score" : None,
        "success" : None,
        "run_mode":None,
        }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {"project_name":"sample_prj", "FC_id":"flowcell",
                        "scilifelab_name":"barcode_name", "start_date":"date", "rounded_read_count":"bc_count"}

    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")".format(flowcell) )
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options (string options may encode python literals)
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key:s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument.
        # NOTE(review): 'instrument' is not defined in this function -- it is
        # presumably a module-level mapping of instrument id to version info;
        # TODO confirm it exists at module scope.
        try:
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        # Was a bare 'except:'; narrowed so SystemExit/KeyboardInterrupt propagate.
        except Exception:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument['default'])
        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Calculation of average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]) )
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)

        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k:"N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] ==  "" or s_param[k] == -1.0})
        # Lane-specific report file name when the sample ran on several lanes
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs':{s["name"]:s["barcode_name"] for s in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    rest_notes = make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
Ejemplo n.º 41
0
def project_status_note(project_name=None, username=None, password=None, url=None,
                        use_ps_map=True, use_bc_map=False, check_consistency=False,
                        ordered_million_reads=None, uppnex_id=None, customer_reference=None,
                        exclude_sample_ids=None, project_alias=None, sample_aliases=None,
                        projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                        include_all_samples=False, **kw):
    """Make a project status note. Used keywords:

    :param project_name: project name
    :param user: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report

    :returns: dict with 'stdout', 'stderr' and 'debug' StringIO buffers,
              or None if the project does not exist
    """
    # The defaults used to be shared mutable dicts ({}); normalize the None
    # sentinels here so downstream behavior is identical to the old defaults.
    if exclude_sample_ids is None:
        exclude_sample_ids = {}
    if sample_aliases is None:
        sample_aliases = {}
    # parameters
    parameters = {
        "project_name" : project_name,
        "finished" : "Not finished, or cannot yet assess if finished.",
        }
    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name":"scilife_name", "customer_name":"customer_name", "project_name":"project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status']

    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable
    param = parameters

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None, project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"] : {'sample':sample_name, 'id':s["_id"]}}
            samples.update(s_d)
        else:
            # No project sample mapping; fall back on explicit aliases if provided
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]] : {'sample':sample_aliases[s["barcode_name"]], 'id':s["_id"]}}
                samples.update(s_d)
            else:
                s_d = {s["name"]:{'sample':s["name"], 'id':s["_id"], 'barcode_name':s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key:prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount", p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get('customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options (string options may encode python literals)
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [x for l in last_library_preps.values() for x in l]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k,v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            else:
                # Drop sample runs that belong to an older library prep
                if k not in last_library_preps_srm:
                    LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(k, v["id"], last_library_preps[v['sample']].values()[0], v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq, ordered_million_reads, param)
        if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            for k,v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
                if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
            if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
            sample_table.append([vals[k] for k in table_keys])
    if all_passed: param["finished"] = 'Project finished.'
    # Sort and deduplicate consecutive identical rows before adding the header
    sample_table.sort()
    sample_table = list(sample_table for sample_table,_ in itertools.groupby(sample_table))
    sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers, paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name), sample_table=sample_table, report="project_report", **param)
    param.update({k:"N/A" for k in param.keys() if param[k] is None or param[k] ==  ""})
    output_data["debug"].write(json.dumps({'param':param, 'table':sample_table}))
    return output_data
Ejemplo n.º 42
0
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about the data delivery

    Used keywords: 'project_name' and 'flowcell'; all keywords are also
    forwarded to the database connection classes.

    :returns: dict with 'stdout', 'stderr' and 'debug' StringIO buffers;
              the delivery table is also written to a csv file
    """
    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}

    project_name = kw.get('project_name',None)
    flowcell = kw.get('flowcell',None)
    LOG.debug("Generating data delivery note for project {}{}.".format(project_name,' and flowcell {}'.format(flowcell if flowcell else '')))

    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"

    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))

    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    project_samples = p_con.get_entry(project_name, "samples")
    customer_names = {sample_name:sample.get('customer_name','N/A') for sample_name, sample in project_samples.items()}

    data = [['SciLifeLab ID','Submitted ID','Flowcell','Lane','Barcode','Read','Path','MD5','Size (bytes)','Timestamp']]
    for sample in samples:
        sname = sample.get('project_sample_name','N/A')
        cname = customer_names.get(sname,'N/A')
        fc = sample.get('flowcell','N/A')
        lane = sample.get('lane','N/A')
        barcode = sample.get('sequence','N/A')
        # Samples without delivery information get a mostly-empty row
        if 'raw_data_delivery' not in sample:
            data.append([sname,cname,'','','','','','','',''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp','N/A')
        # One row per delivered file; 'finfo' renamed from 'file' to avoid
        # shadowing the builtin
        for read, finfo in delivery.get('files',{}).items():
            data.append([sname,
                         cname,
                         fc,
                         lane,
                         barcode,
                         read,
                         finfo.get('path','N/A'),
                         finfo.get('md5','N/A'),
                         finfo.get('size_in_bytes','N/A'),
                         tstamp,])

    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(project_name,'_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile,"w") as outh:
        csvw = csv.writer(outh)
        for row in data:
            csvw.writerow(row)

    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())

    return output_data
Ejemplo n.º 43
0
def application_qc(project_name=None,
                   flowcell=None,
                   application=None,
                   username=None,
                   password=None,
                   url=None,
                   sampledb="samples",
                   projectdb="projects",
                   **kw):
    """Perform application specific qc on a project.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param application: application for which to perform qc; used as a
      fallback when the project summary has no recognized application
    :param username: database username
    :param password: database password
    :param url: database url
    :param sampledb: samples database name
    :param projectdb: project database name

    :returns: dict with 'stdout'/'stderr' StringIO buffers holding the
      formatted qc report (possibly empty if qc could not be performed)
    """
    LOG.debug("Doing application qc for project {}, flowcell {}".format(
        project_name, flowcell))

    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)
    s_con = SampleRunMetricsConnection(dbname=sampledb,
                                       username=username,
                                       password=password,
                                       url=url)
    prj_summary = p_con.get_entry(project_name)

    if prj_summary is not None:
        # Only fetch qc data once we know which code path we are on; the
        # original fetched it unconditionally and then again here.
        qc_data = get_qc_data(project_name, p_con, s_con, flowcell)
        if prj_summary.get("application") not in APPLICATION_MAP:
            # The project's application is not recognized; fall back on the
            # application passed at the command line, if any.
            if not application:
                LOG.warn(
                    "No such application {}. Please use the application option (available choices {})"
                    .format(prj_summary.get("application"),
                            ",".join(QC_CUTOFF.keys())))
                return output_data
        else:
            application = APPLICATION_MAP[prj_summary.get("application")]
    else:
        LOG.info(
            "No such project {} in project summary. Trying to get qc data anyway."
            .format(project_name))
        if not application:
            LOG.warn(
                "No application provided. Please use the application option (available choices {})"
                .format(",".join(QC_CUTOFF.keys())))
            return output_data
        qc_data = _get_sample_qc_data(project_name, application, s_con,
                                      flowcell)

    output_data = _qc_info_header(project_name, application, output_data)
    # Emit one line of qc assessment per sample, sorted by sample key.
    for k, v in sorted(qc_data.items()):
        y = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(y) + "\n")
    return output_data
Ejemplo n.º 44
0
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=None,
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       is_paired=True,
                       **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode count override (string evaluated via literal_eval)
    :param project_alias: project alias name(s); defaults to an empty list
    :param projectdb: project database name
    :param samplesdb: samples database name
    :param flowcelldb: flowcells database name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end

    :returns: dict of StringIO buffers ('stdout', 'stderr', 'debug')
    """
    # NOTE(review): the is_paired argument is currently never read; the
    # paired-end status is always looked up from the flowcell database
    # below -- confirm whether the option was meant to override that.

    # Normalize here instead of using a mutable default argument: a
    # module-level [] default would be shared between calls.
    if project_alias is None:
        project_alias = []

    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    # Instrument/software version lookup table keyed by instrument id,
    # parsed from an optional instrument_config keyword.
    instrument = _parse_instrument_config(
        os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters: template of per-sample report fields; copied for each
    # sample run below and filled in from the databases.
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "pct_q30_bases": None,
        "success": None,
        "run_mode": None,
        "is_paired": True
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
        "lane": "lane"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options: command-line overrides arrive as strings and are
    # literal_eval'ed into python values.
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    # Cache the last fetched flowcell document; consecutive samples from
    # the same flowcell then avoid a database round trip.
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'".
            format(s.get("barcode_name", None), s.get("name", None),
                   s.get("_id", None)))
        s_param.update(parameters)
        s_param.update(
            {key: s[srm_to_parameter[key]]
             for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        # Catch Exception rather than a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        except Exception:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        # Guard against a missing flowcell document so a single absent
        # entry does not abort the whole report.
        runp = fcdoc.get("RunParameters", {}) if fcdoc else {}
        # MCSVersion is only present in MiSeq run parameters.
        s_param[
            "sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get(
            "ClusteringChoice", "") == "OnBoardClustering" or s_param[
                "sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup") if fcdoc else None
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(
            runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(
                runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(
                runp.get("ApplicationName"), runp.get("ApplicationVersion"),
                s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn(
                "Could not determine run setup for flowcell {}. Will assume paired-end."
                .format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score,
         pct_q30_bases) = fc_con.get_barcode_lane_statistics(
             project_name, s.get("barcode_name"), fc, s["lane"])
        s_param[
            'avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(
                s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Setting average quality failed for sample {}, id {}".format(
                    s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn(
                "Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".
                format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount',
            p_con.get_ordered_amount(project_name,
                                     samples=p_con.get_entry(
                                         project_name, 'samples')))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id',
                                                   project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(
                project_sample_item, source)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)

            # Always normalize submitted id, since module textttable does not support unicode
            if isinstance(s_param['customer_name'], unicode):
                s_param['customer_name'] = unicodedata.normalize(
                    'NFKD',
                    s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({
            k: "N/A"
            for k in s_param.keys()
            if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0
        })
        # Several runs of the same sample on one flowcell get lane-specific
        # report file names.
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs':
            {s["name"]: s["barcode_name"]
             for s in sample_run_list}
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    rest_notes = make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes, "{}_{}_{}_sample_summary.pdf".format(project_name,
                                                    s.get("date", None),
                                                    s.get("flowcell", None)))
    return output_data
Ejemplo n.º 45
0
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=[],
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode count override (string evaluated via literal_eval)
    :param project_alias: project alias name
    :param projectdb: project database name
    :param samplesdb: samples database name
    :param flowcelldb: flowcells database name
    :param phix: phix error rate

    :returns: dict of StringIO buffers ('stdout', 'stderr', 'debug')
    """
    # NOTE(review): project_alias=[] is a mutable default argument; harmless
    # only as long as no caller mutates it -- consider a None sentinel.
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    # parameters: template of per-sample report fields, copied for each
    # sample run below and filled in from the databases.
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "success": None,
        "run_mode": None,
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Bail out early on malformed flowcell ids.
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options: command-line overrides arrive as strings and are
    # literal_eval'ed into python values.
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'".
            format(s.get("barcode_name", None), s.get("name", None),
                   s.get("_id", None)))
        s_param.update(parameters)
        s_param.update(
            {key: s[srm_to_parameter[key]]
             for key in srm_to_parameter.keys()})
        # Flowcell documents are keyed by "<date>_<flowcell id>".
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            # NOTE(review): 'instrument' is not defined in this function;
            # presumably a module-level mapping from instrument id to
            # version info -- confirm (the sibling variant builds a local
            # instrument_dict from _parse_instrument_config instead).
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument['default'])
        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        # Command-line phix option overrides the database value.
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Calculation of average quality failed for sample {}, id {}".
                format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            # -1 is the sentinel for "no phix data available".
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id',
                                                   project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)

        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        # Replace missing/sentinel values with "N/A" for report rendering.
        s_param.update({
            k: "N/A"
            for k in s_param.keys()
            if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0
        })
        # Several runs of the same sample on one flowcell get lane-specific
        # report file names.
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs':
            {s["name"]: s["barcode_name"]
             for s in sample_run_list}
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    # NOTE(review): the file names below use 's' from the loop above, i.e.
    # the date/flowcell of the *last* sample run -- confirm this is intended.
    rest_notes = make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes, "{}_{}_{}_sample_summary.pdf".format(project_name,
                                                    s.get("date", None),
                                                    s.get("flowcell", None)))
    return output_data
Ejemplo n.º 46
0
def _project_status_note_table(project_name=None,
                               username=None,
                               password=None,
                               url=None,
                               use_ps_map=True,
                               use_bc_map=False,
                               check_consistency=False,
                               ordered_million_reads=None,
                               uppnex_id=None,
                               customer_reference=None,
                               exclude_sample_ids=None,
                               project_alias=None,
                               sample_aliases=None,
                               projectdb="projects",
                               samplesdb="samples",
                               flowcelldb="flowcells",
                               include_all_samples=False,
                               param=None,
                               **kw):
    """Collect the data for a project status note sample table.

    Connects to the project, sample and flowcell databases, maps sample
    runs to project samples, and assembles one table row per sample
    (ScilifeID, SubmittedID, BarcodeSeq, MSequenced, MOrdered) together
    with the report parameter dictionary.

    :param project_name: project name as stored in the project database
    :param username: database username
    :param password: database password
    :param url: database server url
    :param use_ps_map: accepted for interface compatibility; not used here
    :param use_bc_map: accepted for interface compatibility; not used here
    :param check_consistency: accepted for interface compatibility; not used here
    :param ordered_million_reads: ordered amount override (literal-eval'd)
    :param uppnex_id: command-line override for the uppnex project id
    :param customer_reference: command-line override for the customer reference
    :param exclude_sample_ids: mapping of sample ids (optionally to barcode
        lists) to exclude from the table
    :param project_alias: alternative project name(s) for sample run lookup
    :param sample_aliases: mapping from barcode name to desired sample name
    :param projectdb: project database name
    :param samplesdb: sample database name
    :param flowcelldb: flowcell database name
    :param include_all_samples: if True, keep sample runs from older library preps
    :param param: seed dictionary of report parameters; updated in place and returned

    :returns: tuple (output_data, sample_table, param), or None when the
        project cannot be found
    """
    # Use None sentinels instead of mutable default arguments: `param` is
    # mutated via update() and returned, so a shared default dict would
    # leak state between calls.
    if exclude_sample_ids is None:
        exclude_sample_ids = {}
    if sample_aliases is None:
        sample_aliases = {}
    if param is None:
        param = {}

    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    #Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary'][
            'all_samples_sequenced']
    except (TypeError, KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                # NOTE(review): this unmapped entry is only logged, never
                # added to `samples` -- presumably intentional (unmapped
                # runs are excluded from the report); confirm.
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    param["ordered_amount"] = param.get(
        "ordered_amount",
        p_con.get_ordered_amount(project_name, samples=sample_dict))

    if not param.get('customer_reference'):
        try:
            param['customer_reference'] = prj_summary['details'][
                'customer_project_reference']
        except (TypeError, KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(
                            k, v["id"], ",".join(
                                list(
                                    set(last_library_preps[
                                        v['sample']].values()))), v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table
                                             ])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            # BUGFIX: was dict.iteritems() (Python 2 only) -- iterate keys
            # with items()-free syntax since the values were never used.
            for k in project_sample_d:
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq, ordered_million_reads,
                                            param)
            sample_table.append([vals[k] for k in table_keys])
    if all_samples_sequenced:
        param["finished"] = 'All samples for this project have been sequenced.'
    # Sort rows, then drop consecutive duplicates (groupby on sorted input
    # acts as a de-duplication over unhashable list rows)
    sample_table.sort()
    sample_table = list(sample_table
                        for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(
        0,
        ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])

    return output_data, sample_table, param
Ejemplo n.º 47
0
class TestDbConnection(unittest.TestCase):
    def setUp(self):
        """Prepare credentials, example identifiers and a project connection."""
        self.user = "******"
        self.pw = "pw"
        self.url = "localhost"
        self.examples = {
            "sample": "1_120924_AC003CCCXX_TGACCA",
            "flowcell": "AC003CCCXX",
            "project": "J.Doe_00_01",
        }
        self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                              username=self.user,
                                              password=self.pw,
                                              url=self.url)

    def test_connection(self):
        """Test database connection"""
        con = SampleRunMetricsConnection(dbname="samples-test",
                                         username=self.user,
                                         password=self.pw,
                                         url=self.url)
        expected = "http://{}:5984".format(self.url)
        self.assertEqual(con.url_string, expected)

    def test_get_flowcell(self):
        """Test getting a flowcell for a given sample"""
        con = SampleRunMetricsConnection(dbname="samples-test",
                                         username=self.user,
                                         password=self.pw,
                                         url=self.url)
        flowcell = con.get_entry(self.examples["sample"], "flowcell")
        self.assertEqual(str(flowcell), self.examples["flowcell"])

    def test_get_sample_ids(self):
        """Test getting sample ids given flowcell and sample_prj"""
        con = SampleRunMetricsConnection(dbname="samples-test",
                                         username=self.user,
                                         password=self.pw,
                                         url=self.url)
        # All samples on the flowcell first...
        ids = con.get_sample_ids(fc_id=self.examples["flowcell"])
        LOG.info("Number of samples before subsetting: " + str(len(ids)))
        self.assertEqual(len(ids), 5)
        # ...then narrowed down to a single project
        ids = con.get_sample_ids(fc_id=self.examples["flowcell"],
                                 sample_prj=self.examples["project"])
        LOG.info("Number of samples after subsetting: " + str(len(ids)))
        self.assertEqual(len(ids), 2)

    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        con = SampleRunMetricsConnection(dbname="samples-test",
                                         username=self.user,
                                         password=self.pw,
                                         url=self.url)

        # Select by flowcell, optionally subset by project
        hits = con.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(hits)))
        self.assertEqual(len(hits), 5)
        hits = con.get_samples(fc_id=self.examples["flowcell"],
                               sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " + str(len(hits)))
        self.assertEqual(len(hits), 2)

        # Select by project, optionally subset by flowcell
        hits = con.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(hits)))
        self.assertEqual(len(hits), 3)
        hits = con.get_samples(sample_prj=self.examples["project"],
                               fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " + str(len(hits)))
        self.assertEqual(len(hits), 2)

    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        con = SampleRunMetricsConnection(dbname="samples-test",
                                         username=self.user,
                                         password=self.pw,
                                         url=self.url)

        # A nonexistent project must yield no samples, even on a valid flowcell
        hits = con.get_samples(sample_prj="bogusproject",
                               fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(len(hits)))
        self.assertEqual(len(hits), 0)
        
                
    def test_get_project_sample_ids(self):
        """Test getting project sample ids"""
        con = SampleRunMetricsConnection(dbname="samples-test",
                                         username=self.user,
                                         password=self.pw,
                                         url=self.url)
        ids = con.get_sample_ids(sample_prj=self.examples["project"])
        # Resolve each id to its document name and compare as a set
        names = {con.db.get(doc_id)["name"] for doc_id in ids}
        expected = {'1_120924_AC003CCCXX_TGACCA',
                    '2_120924_AC003CCCXX_ACAGTG',
                    '1_121015_BB002BBBXX_TGACCA'}
        self.assertEqual(names, expected)
        
    def test_get_latest_library_prep(self):
        """Test getting latest library prep"""
        # Add a newer "B" prep for one sample so it supersedes the "A" prep
        prj = self.p_con.get_entry("J.Doe_00_01")
        prj['samples']['P001_102']['library_prep']['B'] = {'sample_run_metrics': {'2_120924_AC003CCCXX_TTGGAA': None}}
        self.p_con.save(prj)
        preps = self.p_con.get_latest_library_prep(project_name=self.examples["project"])
        run_names = [name for runs in preps.values() for name in runs]
        # The superseded A prep run must be absent...
        self.assertNotIn('2_120924_AC003CCCXX_ACAGTG', run_names)
        # ...and the newly added B prep run present
        self.assertIn('2_120924_AC003CCCXX_TTGGAA', run_names)
        # Restore the document to its original state
        prj = self.p_con.get_entry("J.Doe_00_01")
        del prj['samples']['P001_102']['library_prep']['B']
        self.p_con.save(prj)

    def test_get_barcode_lane_statistics(self):
        """Test getting barcode lane statistics from flowcell database"""
        fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
        # An unknown sample name should return the empty tuple (None, None)
        stats = fc_con.get_barcode_lane_statistics("J.Doe_00_01", "P001_101_index6", "120924_AC003CCCXX", "1")
        self.assertEqual(stats, (None, None))
        # A known sample returns the expected statistics strings
        stats = fc_con.get_barcode_lane_statistics("J.Doe_00_01", "P001_101_index3", "120924_AC003CCCXX", "1")
        self.assertEqual(stats, (u'35.22', u'90.05'))