Exemple #1
0
 def test_get_flowcell(self):
     """Test getting a flowcell for a given sample"""
     sample_con = SampleRunMetricsConnection(
         dbname="samples-test", username=self.user, password=self.pw, url=self.url
     )
     fc = sample_con.get_entry(self.examples["sample"], "flowcell")
     self.assertEqual(str(fc), self.examples["flowcell"])
Exemple #2
0
 def test_3_get_sample_ids(self):
     """Test getting sample ids given flowcell and sample_prj"""
     sample_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
     sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"])
     print "Number of samples before subsetting: " + str(len(sample_ids))
     sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"])
     print "Number of samples after subsetting: " + str(len(sample_ids))
Exemple #3
0
    def setUpClass(cls):
        # Temporarily add new sample for use in exclusion tests
        s_con = SampleRunMetricsConnection(dbname="samples-test",
                                           username="******",
                                           password="******")
        s = s_con.get_entry("1_121015_BB002BBBXX_TGACCA")
        kw = copy.deepcopy(s)
        del kw["_id"]
        new_s = SampleRunMetricsDocument(**kw)
        new_s["sequence"] = "AGTTGA"
        new_s["name"] = "1_121015_BB002BBBXX_AGTTGA"
        s_con.save(new_s)

        kw = copy.deepcopy(s)
        del kw["_id"]
        new_s = SampleRunMetricsDocument(**kw)
        new_s["sample_prj"] = "j-doe_00_01"
        new_s["sequence"] = "CGAACG"
        new_s["name"] = "1_121015_BB002BBBXX_CGAACG"
        s_con.save(new_s)

        s = s_con.get_entry("3_120924_AC003CCCXX_ACAGTG")
        kw = copy.deepcopy(s)
        del kw["_id"]
        new_s = SampleRunMetricsDocument(**kw)
        new_s["sample_prj"] = "j-doe_00_02"
        new_s["sequence"] = "GGAAGG"
        new_s["name"] = "3_120924_AC003CCCXX_GGAAGG"
        s_con.save(new_s)
Exemple #4
0
 def test_4_get_samples(self):
     """Test getting samples given flowcell and sample_prj."""
     sample_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
     samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
     print "Number of samples before subsetting: " + str(len(samples))
     samples = sample_con.get_samples(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"])
     print "Number of samples after subsetting: " + str(len(samples))
Exemple #5
0
    def test_2_make_note(self):
        """Make a note subset by example flowcell and project"""
        s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
        fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
        p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
        paragraphs = sample_note_paragraphs()
        headers = sample_note_headers()
        samples = s_con.get_samples(self.examples["flowcell"], self.examples["project"])
        project = p_con.get_entry(self.examples["project"])
        samples = p_con.map_srm_to_name(self.examples["project"], fc_id=self.examples["flowcell"], use_bc_map=True)
        for k,v  in samples.items():
            s_param = parameters
            s = s_con.get_entry(k)
            s_param.update({key:s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
            fc = "{}_{}".format(s["date"], s["flowcell"])
            s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
            s_param['avg_quality_score'] = s_con.calc_avg_qv(s["name"])
            s_param['rounded_read_count'] = round(float(s_param['rounded_read_count'])/1e6,1) if s_param['rounded_read_count'] else None
            s_param['customer_name'] = project['samples'][v["sample"]].get('customer_name', None)

            if project:
                s_param['ordered_amount'] = p_con.get_ordered_amount(self.examples["project"])
                s_param['customer_reference'] = s_param.get('customer_reference', project['customer_reference'])
                s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project['uppnex_id'])
            s_param['success'] = sequencing_success(s_param, cutoffs)
            s_param.update({k:"N/A" for k in s_param.keys() if s_param[k] is None})
            make_note("{}.pdf".format(s["barcode_name"]), headers, paragraphs, **s_param)
Exemple #6
0
def fastq_screen(project_name=None, flowcell=None,
                 username=None, password=None, url=None, dbname="samples", **kw):
    """Perform application specific qc on a project.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param username: database username
    :param password: database password
    :param url: database url
    :param dbname: samples database name
    """
    LOG.debug("Running fastq screen summary on project {}, flowcell ".format(project_name, flowcell))
    output_data = {'stdout':StringIO(), 'stderr':StringIO()}
    s_con = SampleRunMetricsConnection(dbname=dbname, username=username, password=password, url=url)
    samples = s_con.get_samples(fc_id=flowcell, sample_prj=project_name)
    for s in samples:
        LOG.debug("Checking fastq_screen data for sample {}, id {}, project {}".format(s.get("name", None), s.get("_id", None), s.get("sample_prj", None)))
        fqscreen_data = s.get("fastq_scr", {})
        output_data["stdout"].write(s["barcode_name"] + "\n")
        if fqscreen_data:
            header = [[x for x in v.keys()] for k, v in fqscreen_data.iteritems()]
            output_data["stdout"].write("\t\t" + "".join("{:>27}".format(x) for x in header[0]) + "\n")
            vals = ["{:>12}\t{}\n".format(k, "".join(["{:>27}".format(x) for x in v.values()])) for k, v in fqscreen_data.iteritems()]
            for v in vals:
                output_data["stdout"].write(v)
    return output_data
Exemple #7
0
 def setUp(self):
     """FIXME: All other tests depend on data being uploaded, so
     these are not real unit tests. The setup to TestQCUpload has to
     be run prior to other tests, else unexpected failures will
     occur."""
     self.app = self.make_app(
         argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
         extensions=[
             'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
         ])
     self._run_app()
     self.app = self.make_app(
         argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
         extensions=[
             'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
         ])
     self._run_app()
     self.s_con = SampleRunMetricsConnection(dbname="samples-test",
                                             username="******",
                                             password="******")
     self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                           username="******",
                                           password="******")
     self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                                username="******",
                                                password="******")
Exemple #8
0
    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        sample_con = SampleRunMetricsConnection(dbname="samples-test", username=self.user, password=self.pw, url=self.url)

        samples = sample_con.get_samples(sample_prj="bogusproject", fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 0)
Exemple #9
0
 def test_dbcon(self):
     """Test database connection and that we get expected values."""
     s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     samples = [s_con.get_entry(x) for x in s_con.name_view]
     samples_d = {x["name"]: x for x in samples}
     self.assertEqual(samples_d["1_120924_AC003CCCXX_TGACCA"]["date"], "120924")
     self.assertEqual(samples_d["1_121015_BB002BBBXX_TGACCA"]["flowcell"], "BB002BBBXX")
     self.assertEqual(samples_d["2_120924_AC003CCCXX_ACAGTG"]["entity_type"], "sample_run_metrics")
     self.assertEqual(samples_d["3_120924_AC003CCCXX_ACAGTG"]["lane"], "3")
     self.assertEqual(samples_d["4_120924_AC003CCCXX_CGTTAA"]["sequence"], "CGTTAA")
     self.assertEqual(samples_d["2_121015_BB002BBBXX_TGACCA"]["project_id"], "P002")
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
     flowcells = [fc_con.get_entry(x) for x in fc_con.name_view]
     flowcells_d = {x["name"]: x for x in flowcells}
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["name"], "120924_AC003CCCXX")
     self.assertEqual(flowcells_d["121015_BB002BBBXX"]["name"], "121015_BB002BBBXX")
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["entity_type"], "flowcell_run_metrics")
     p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     projects = [p_con.get_entry(x) for x in p_con.name_view]
     projects_d = {x["project_name"]: x for x in projects}
     self.assertEqual(projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
     self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
     self.assertEqual(
         set(projects_d["J.Doe_00_01"]["samples"].keys()), set(["P001_101_index3", "P001_102", "P001_103"])
     )
     self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"], "GnuGenome")
     self.assertEqual(projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
     self.assertEqual(projects_d["J.Doe_00_03"]["samples"].keys(), ["3_index6"])
     self.assertIn("A", projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"])
 def test_4_srm_sample_map(self):
     """Make sample map from sample run metrics"""
     s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
     samples = s_con.get_project_samples(self.examples["project"])
     sample_names = {x["name"]:x["_id"] for x in samples}
     print len(sample_names.keys())
     print len(list(set(sample_names.keys())))
     print sample_names
Exemple #11
0
 def test_get_flowcell(self):
     """Test getting a flowcell for a given sample"""
     sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                             username=self.user,
                                             password=self.pw,
                                             url=self.url)
     fc = sample_con.get_entry(self.examples["sample"], "flowcell")
     self.assertEqual(str(fc), self.examples["flowcell"])
Exemple #12
0
    def upload_qc(self):
        if not self._check_pargs(['flowcell']):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get(
            "db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting"
                .format(self.pargs.flowcell))
            return

        runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell),
                                   "{}.csv".format(fc_id(self.pargs.flowcell)))
        runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell),
                                    "run_info.yaml")
        (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info(
                "Assuming pre-casava based file structure for {}".format(
                    fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(
                fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if len(qc_objects) == 0:
            self.log.info("No out-of-date qc objects for {}".format(
                fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(
                len(qc_objects)))

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get(
            "db", "samples"),
                                           **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get(
            "db", "flowcells"),
                                              **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get(
            "db", "projects"),
                                         **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None),
                    self.pargs.extensive_matching)
                if project_sample:
                    obj["project_sample_name"] = project_sample['sample_name']
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Exemple #13
0
 def test_get_sample_ids(self):
     """Test getting sample ids given flowcell and sample_prj"""
     sample_con = SampleRunMetricsConnection(dbname="samples-test", username=self.user, password=self.pw, url=self.url)
     sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"])
     LOG.info("Number of samples before subsetting: " + str(len(sample_ids)))
     self.assertEqual(len(sample_ids), 5)
     sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"])
     LOG.info( "Number of samples after subsetting: " + str(len(sample_ids)))
     self.assertEqual(len(sample_ids), 2)
Exemple #14
0
 def test_4_srm_sample_map(self):
     """Make sample map from sample run metrics"""
     s_con = SampleRunMetricsConnection(username=self.user,
                                        password=self.pw,
                                        url=self.url)
     samples = s_con.get_project_samples(self.examples["project"])
     sample_names = {x["name"]: x["_id"] for x in samples}
     print len(sample_names.keys())
     print len(list(set(sample_names.keys())))
     print sample_names
Exemple #15
0
 def test_get_project_sample_ids(self):
     """Test getting project sample ids"""
     sample_con = SampleRunMetricsConnection(
         dbname="samples-test", username=self.user, password=self.pw, url=self.url
     )
     sample_ids = sample_con.get_sample_ids(sample_prj=self.examples["project"])
     sample_names = [sample_con.db.get(x)["name"] for x in sample_ids]
     self.assertEqual(
         set(sample_names),
         set(["1_120924_AC003CCCXX_TGACCA", "2_120924_AC003CCCXX_ACAGTG", "1_121015_BB002BBBXX_TGACCA"]),
     )
Exemple #16
0
    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)

        samples = sample_con.get_samples(sample_prj="bogusproject",
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " +
                 str(len(samples)))
        self.assertEqual(len(samples), 0)
Exemple #17
0
 def tearDownClass(cls):
     s_con = SampleRunMetricsConnection(dbname="samples-test",
                                        username="******",
                                        password="******")
     s = s_con.get_entry("1_121015_BB002BBBXX_AGTTGA")
     doc = s_con.db.get(s["_id"])
     s_con.db.delete(doc)
     s = s_con.get_entry("1_121015_BB002BBBXX_CGAACG")
     doc = s_con.db.get(s["_id"])
     s_con.db.delete(doc)
     s = s_con.get_entry("3_120924_AC003CCCXX_GGAAGG")
     doc = s_con.db.get(s["_id"])
     s_con.db.delete(doc)
Exemple #18
0
 def test_dbcon(self):
     """Test database connection and that we get expected values."""
     s_con = SampleRunMetricsConnection(dbname="samples-test",
                                        username="******",
                                        password="******")
     samples = [s_con.get_entry(x) for x in s_con.name_view]
     samples_d = {x["name"]: x for x in samples}
     self.assertEqual(samples_d["1_120924_AC003CCCXX_TGACCA"]["date"],
                      "120924")
     self.assertEqual(samples_d["1_121015_BB002BBBXX_TGACCA"]["flowcell"],
                      "BB002BBBXX")
     self.assertEqual(
         samples_d["2_120924_AC003CCCXX_ACAGTG"]["entity_type"],
         "sample_run_metrics")
     self.assertEqual(samples_d["3_120924_AC003CCCXX_ACAGTG"]["lane"], "3")
     self.assertEqual(samples_d["4_120924_AC003CCCXX_CGTTAA"]["sequence"],
                      "CGTTAA")
     self.assertEqual(samples_d["2_121015_BB002BBBXX_TGACCA"]["project_id"],
                      "P002")
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                           username="******",
                                           password="******")
     flowcells = [fc_con.get_entry(x) for x in fc_con.name_view]
     flowcells_d = {x["name"]: x for x in flowcells}
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["name"],
                      "120924_AC003CCCXX")
     self.assertEqual(flowcells_d["121015_BB002BBBXX"]["name"],
                      "121015_BB002BBBXX")
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["entity_type"],
                      "flowcell_run_metrics")
     p_con = ProjectSummaryConnection(dbname="projects-test",
                                      username="******",
                                      password="******")
     projects = [p_con.get_entry(x) for x in p_con.name_view]
     projects_d = {x["project_name"]: x for x in projects}
     self.assertEqual(
         projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
     self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
     self.assertEqual(set(projects_d["J.Doe_00_01"]["samples"].keys()),
                      set(["P001_101_index3", "P001_102", "P001_103"]))
     self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"],
                      "GnuGenome")
     self.assertEqual(
         projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
     self.assertEqual(projects_d["J.Doe_00_03"]["samples"].keys(),
                      ["3_index6"])
     self.assertIn(
         "A",
         projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"])
Exemple #19
0
 def test_get_project_sample_ids(self):
     """Test getting project sample ids"""
     sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                             username=self.user,
                                             password=self.pw,
                                             url=self.url)
     sample_ids = sample_con.get_sample_ids(
         sample_prj=self.examples["project"])
     sample_names = [sample_con.db.get(x)["name"] for x in sample_ids]
     self.assertEqual(
         set(sample_names),
         set([
             '1_120924_AC003CCCXX_TGACCA', '2_120924_AC003CCCXX_ACAGTG',
             '1_121015_BB002BBBXX_TGACCA'
         ]))
Exemple #20
0
 def test_get_sample_ids(self):
     """Test getting sample ids given flowcell and sample_prj"""
     sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                             username=self.user,
                                             password=self.pw,
                                             url=self.url)
     sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"])
     LOG.info("Number of samples before subsetting: " +
              str(len(sample_ids)))
     self.assertEqual(len(sample_ids), 5)
     sample_ids = sample_con.get_sample_ids(
         fc_id=self.examples["flowcell"],
         sample_prj=self.examples["project"])
     LOG.info("Number of samples after subsetting: " + str(len(sample_ids)))
     self.assertEqual(len(sample_ids), 2)
Exemple #21
0
 def bpreport(self):
     if not self._check_pargs(["project"]):
         return
     kw = vars(self.pargs)
     basedir = os.path.abspath(
         os.path.join(self.app.controller._meta.root_path,
                      self.app.controller._meta.path_id))
     flist = find_samples(basedir, **vars(self.pargs))
     if not len(flist) > 0:
         self.log.info("No samples/sample configuration files found")
         return
     if self.pargs.no_statusdb:
         sample_name_map = None
     else:
         if not self._check_pargs(["statusdb_project_name"]):
             return
         p_con = ProjectSummaryConnection(dbname=self.app.config.get(
             "db", "projects"),
                                          **vars(self.app.pargs))
         s_con = SampleRunMetricsConnection(dbname=self.app.config.get(
             "db", "samples"),
                                            **vars(self.app.pargs))
         sample_name_map = get_scilife_to_customer_name(
             self.pargs.statusdb_project_name, p_con, s_con)
     kw.update(project_name=self.pargs.project,
               flist=flist,
               basedir=basedir,
               sample_name_map=sample_name_map)
     out_data = best_practice_note(**kw)
     self.log.info(
         "Wrote report to directory {}; use Makefile to generate pdf report"
         .format(basedir))
     self.app._output_data['stdout'].write(out_data['stdout'].getvalue())
     self.app._output_data['stderr'].write(out_data['stderr'].getvalue())
     self.app._output_data['debug'].write(out_data['debug'].getvalue())
Exemple #22
0
    def update(self):
        if not self._check_pargs(["sample_prj"]):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)

        if self.pargs.project_id:
            self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
            for s in samples:
                if not s.get("project_id", None) is None:
                    if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                        continue
                s["project_id"] = self.pargs.project_id
                s_con.save(s)
        if self.pargs.names:
            self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
            if os.path.exists(self.pargs.names):
                with open(self.pargs.names) as fh:
                    names_d = json.load(fh)
            else:
                names_d= ast.literal_eval(self.pargs.names)
            samples_sort = sorted(samples, key=lambda s:s["barcode_name"])
            groups = {}
            for k, g in itertools.groupby(samples_sort, key=lambda x:x["barcode_name"]):
                groups[k] = list(g)
            for barcode_name in names_d:
                sample_list = groups.get(barcode_name, None)
                if not sample_list:
                    continue
                for s in sample_list:
                    if not s.get("project_sample_name", None) is None:
                        if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                            continue
                    s["project_sample_name"] = names_d[barcode_name]
                    s_con.save(s)
        else:
            self.app.log.info("Trying to use extensive matching...")
            p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
            project_name = self.pargs.sample_prj
            if self.pargs.project_alias:
                project_name = self.pargs.project_alias
            for s in samples:
                project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
                if project_sample:
                    self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                    s["project_sample_name"] = project_sample["sample_name"]
                    s_con.save(s)
Exemple #23
0
    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        sample_con = SampleRunMetricsConnection(dbname="samples-test", username=self.user, password=self.pw, url=self.url)

        samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 5)
        samples = sample_con.get_samples(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 2)

        samples = sample_con.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 3)
        samples = sample_con.get_samples(sample_prj=self.examples["project"], fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 2)
Exemple #24
0
 def test_connection(self):
     """Test database connection"""
     sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                             username=self.user,
                                             password=self.pw,
                                             url=self.url)
     self.assertEqual(sample_con.url_string,
                      "http://{}:5984".format(self.url))
Exemple #25
0
def fastq_screen(project_name=None,
                 flowcell=None,
                 username=None,
                 password=None,
                 url=None,
                 dbname="samples",
                 **kw):
    """Perform application specific qc on a project.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param username: database username
    :param password: database password
    :param url: database url
    :param dbname: samples database name
    """
    LOG.debug("Running fastq screen summary on project {}, flowcell ".format(
        project_name, flowcell))
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    s_con = SampleRunMetricsConnection(dbname=dbname,
                                       username=username,
                                       password=password,
                                       url=url)
    samples = s_con.get_samples(fc_id=flowcell, sample_prj=project_name)
    for s in samples:
        LOG.debug(
            "Checking fastq_screen data for sample {}, id {}, project {}".
            format(s.get("name", None), s.get("_id", None),
                   s.get("sample_prj", None)))
        fqscreen_data = s.get("fastq_scr", {})
        output_data["stdout"].write(s["barcode_name"] + "\n")
        if fqscreen_data:
            header = [[x for x in v.keys()]
                      for k, v in fqscreen_data.iteritems()]
            output_data["stdout"].write("\t\t" + "".join("{:>27}".format(x)
                                                         for x in header[0]) +
                                        "\n")
            vals = [
                "{:>12}\t{}\n".format(
                    k, "".join(["{:>27}".format(x) for x in v.values()]))
                for k, v in fqscreen_data.iteritems()
            ]
            for v in vals:
                output_data["stdout"].write(v)
    return output_data
Exemple #26
0
 def setUp(self):
     self.app = self.make_app(
         argv=["qc", "upload-qc", flowcells[0], "--mtime", "10000"],
         extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
     )
     self._run_app()
     self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
Exemple #27
0
    def upload_qc(self):
        if not self._check_pargs(["flowcell"]):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
            )
            return

        runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell), "{}.csv".format(fc_id(self.pargs.flowcell)))
        runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
        (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info("Assuming pre-casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if len(qc_objects) == 0:
            self.log.info("No out-of-date qc objects for {}".format(fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching
                )
                if project_sample:
                    obj["project_sample_name"] = project_sample["sample_name"]
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
 def test_2_make_project_note(self):
     """Make a project note subset by flowcell and project"""
     s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
     fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
     p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
     paragraphs = project_note_paragraphs()
     headers = project_note_headers()
     param = parameters
     project = p_con.get_entry(self.examples["project"])
     if not project:
         print "No project named {}".format(self.examples["project"])
         return
     if project:
         ordered_amount = p_con.get_ordered_amount(self.examples["project"])
     else:
         return
         ordered_amount = self.pargs.ordered_million_reads
         
     ## Start collecting the data
     sample_table = []
     sample_list = project['samples']
     param.update({key:project.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
     samples = p_con.map_name_to_srm(self.examples["project"], check_consistency=True, use_bc_map=True)
     all_passed = True
     for k,v in samples.items():
         if k=="Unexpected":
             continue
         project_sample = sample_list[k]
         vals = {x:project_sample.get(prjs_to_table[x], None) for x in prjs_to_table.keys()}
         vals['MOrdered'] = ordered_amount
         vals['BarcodeSeq'] = s_con.get_entry(v.keys()[0], "sequence")
         
         ## Set status
         vals['Status'] = set_status(vals) if vals['Status'] is None else vals['Status']
         vals.update({k:"N/A" for k in vals.keys() if vals[k] is None})
         if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
         sample_table.append([vals[k] for k in table_keys])
     if all_passed: param["finished"] = 'Project finished.'
     sample_table.sort()
     sample_table = list(sample_table for sample_table,_ in itertools.groupby(sample_table))
     sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
     paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
     make_note("{}.pdf".format(self.examples["project"]), headers, paragraphs, **param)
Exemple #29
0
 def test_5_name_view(self):
     """Test unique ids in name view"""
     s_con = SampleRunMetricsConnection(username=self.user,
                                        password=self.pw,
                                        url=self.url)
     name_view = s_con.db.view("names/name", reduce=False)
     print s_con.name_view.keys()[0:10]
     print len(s_con.name_view.keys())
     print len(list(set(s_con.name_view.keys())))
     print len(name_view)
Exemple #30
0
    def update(self):
        if not self._check_pargs(["sample_prj"]):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)

        if self.pargs.project_id:
            self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
            for s in samples:
                if not s.get("project_id", None) is None:
                    if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                        continue
                s["project_id"] = self.pargs.project_id
                s_con.save(s)
        if self.pargs.names:
            self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
            if os.path.exists(self.pargs.names):
                with open(self.pargs.names) as fh:
                    names_d = json.load(fh)
            else:
                names_d= ast.literal_eval(self.pargs.names)
            samples_sort = sorted(samples, key=lambda s:s["barcode_name"])
            groups = {}
            for k, g in itertools.groupby(samples_sort, key=lambda x:x["barcode_name"]):
                groups[k] = list(g)
            for barcode_name in names_d:
                sample_list = groups.get(barcode_name, None)
                if not sample_list:
                    continue
                for s in sample_list:
                    if not s.get("project_sample_name", None) is None:
                        if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                            continue
                    s["project_sample_name"] = names_d[barcode_name]
                    s_con.save(s)
        else:
            self.app.log.info("Trying to use extensive matching...")
            p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
            project_name = self.pargs.sample_prj
            if self.pargs.project_alias:
                project_name = self.pargs.project_alias
            for s in samples:
                project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
                if project_sample:
                    self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                    s["project_sample_name"] = project_sample["sample_name"]
                    s_con.save(s)
Exemple #31
0
 def setUp(self):
     """FIXME: All other tests depend on data being uploaded, so
     these are not real unit tests. The setup to TestQCUpload has to
     be run prior to other tests, else unexpected failures will
     occur."""
     self.app = self.make_app(argv = ['qc', 'upload-qc', flowcells[0], '--mtime',  '10000'], extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
     self._run_app()
     self.app = self.make_app(argv = ['qc', 'upload-qc', flowcells[1], '--mtime',  '10000'], extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
     self._run_app()
     self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
Exemple #32
0
 def test_2_map_srmseqid_to_srmid(self):
     """Map srm seq id names to srm ids"""
     sample_con = SampleRunMetricsConnection(username=self.user,
                                             password=self.pw,
                                             url=self.url)
     sample_map = {}
     for k in sample_con.db:
         obj = sample_con.db.get(k)
         sample_seq_id = "{}_{}_{}_{}".format(
             obj.get("lane"), obj.get("date"), obj.get("flowcell"),
             obj.get("sequence", "NoIndex"))
         if not sample_seq_id in sample_map.keys():
             sample_map[sample_seq_id] = [k]
         else:
             print "WARNING: duplicate for {}".format(sample_seq_id)
             sample_map[sample_seq_id].append(k)
     for k, v in sample_map.items():
         if len(v) > 1:
             print k, v
Exemple #33
0
    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)

        samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 5)
        samples = sample_con.get_samples(fc_id=self.examples["flowcell"],
                                         sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " +
                 str(len(samples)))
        self.assertEqual(len(samples), 2)

        samples = sample_con.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 3)
        samples = sample_con.get_samples(sample_prj=self.examples["project"],
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " +
                 str(len(samples)))
        self.assertEqual(len(samples), 2)
Exemple #34
0
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=[],
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       is_paired=True,
                       **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param project_alias: project alias name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    instrument = _parse_instrument_config(
        os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "pct_q30_bases": None,
        "success": None,
        "run_mode": None,
        "is_paired": True
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
        "lane": "lane"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'".
            format(s.get("barcode_name", None), s.get("name", None),
                   s.get("_id", None)))
        s_param.update(parameters)
        s_param.update(
            {key: s[srm_to_parameter[key]]
             for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters", {})
        s_param[
            "sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get(
            "ClusteringChoice", "") == "OnBoardClustering" or s_param[
                "sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(
            runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(
                runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(
                runp.get("ApplicationName"), runp.get("ApplicationVersion"),
                s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn(
                "Could not determine run setup for flowcell {}. Will assume paired-end."
                .format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score,
         pct_q30_bases) = fc_con.get_barcode_lane_statistics(
             project_name, s.get("barcode_name"), fc, s["lane"])
        s_param[
            'avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(
                s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Setting average quality failed for sample {}, id {}".format(
                    s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn(
                "Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".
                format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount',
            p_con.get_ordered_amount(project_name,
                                     samples=p_con.get_entry(
                                         project_name, 'samples')))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id',
                                                   project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(
                project_sample_item, source)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)

            # Always normalize submitted id, since module textttable does not support unicode
            if type(s_param['customer_name']) is unicode:
                s_param['customer_name'] = unicodedata.normalize(
                    'NFKD',
                    s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({
            k: "N/A"
            for k in s_param.keys()
            if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0
        })
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs':
            {s["name"]: s["barcode_name"]
             for s in sample_run_list}
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    rest_notes = make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes, "{}_{}_{}_sample_summary.pdf".format(project_name,
                                                    s.get("date", None),
                                                    s.get("flowcell", None)))
    return output_data
def _project_status_note_table(project_name=None, username=None, password=None, url=None,
                               use_ps_map=True, use_bc_map=False, check_consistency=False,
                               ordered_million_reads=None, uppnex_id=None, customer_reference=None,
                               exclude_sample_ids={}, project_alias=None, sample_aliases={},
                               projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                               include_all_samples=False, param={}, **kw):

    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name":"scilife_name", "customer_name":"customer_name", "project_name":"project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered']

    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    #Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary']['all_samples_sequenced']
    except (TypeError,KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None, project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"] : {'sample':sample_name, 'id':s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]] : {'sample':sample_aliases[s["barcode_name"]], 'id':s["_id"]}}
                samples.update(s_d)
            else:
                s_d = {s["name"]:{'sample':s["name"], 'id':s["_id"], 'barcode_name':s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key:prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount", p_con.get_ordered_amount(project_name, samples=sample_dict))

    if not param.get('customer_reference') :
        try:
            param['customer_reference'] = prj_summary['details']['customer_project_reference']
        except (TypeError,KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [x for l in last_library_preps.values() for x in l]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k,v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(k, v["id"], ",".join(list(set(last_library_preps[v['sample']].values()))), v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq, ordered_million_reads, param)
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            for k,v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
            sample_table.append([vals[k] for k in table_keys])
    if all_samples_sequenced: param["finished"] = 'All samples for this project have been sequenced.'
    sample_table.sort()
    sample_table = list(sample_table for sample_table,_ in itertools.groupby(sample_table))
    sample_table.insert(0, ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])

    return output_data, sample_table, param
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about the data delivery
    """
    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}

    project_name = kw.get('project_name',None)
    flowcell = kw.get('flowcell',None)
    LOG.debug("Generating data delivery note for project {}{}.".format(project_name,' and flowcell {}'.format(flowcell if flowcell else '')))

    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"

    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))

    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    project_samples = p_con.get_entry(project_name, "samples")
    customer_names = {sample_name:sample.get('customer_name','N/A') for sample_name, sample in project_samples.items()}

    data = [['SciLifeLab ID','Submitted ID','Flowcell','Lane','Barcode','Read','Path','MD5','Size (bytes)','Timestamp']]
    for sample in samples:
        sname = sample.get('project_sample_name','N/A')
        cname = customer_names.get(sname,'N/A')
        fc = sample.get('flowcell','N/A')
        lane = sample.get('lane','N/A')
        barcode = sample.get('sequence','N/A')
        if 'raw_data_delivery' not in sample:
            data.append([sname,cname,'','','','','','','',''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp','N/A')
        for read, file in delivery.get('files',{}).items():
            data.append([sname,
                         cname,
                         fc,
                         lane,
                         barcode,
                         read,
                         file.get('path','N/A'),
                         file.get('md5','N/A'),
                         file.get('size_in_bytes','N/A'),
                         tstamp,])

    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(project_name,'_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile,"w") as outh:
        csvw = csv.writer(outh)
        for row in data:
            csvw.writerow(row)

    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())

    return output_data
Exemple #37
0
class TestQCUpload(PmFullTest):
    def setUp(self):
        """FIXME: All other tests depend on data being uploaded, so
        these are not real unit tests. The setup to TestQCUpload has to
        be run prior to other tests, else unexpected failures will
        occur."""
        self.app = self.make_app(argv = ['qc', 'upload-qc', flowcells[0], '--mtime',  '10000'], extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.app = self.make_app(argv = ['qc', 'upload-qc', flowcells[1], '--mtime',  '10000'], extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"], "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"], "P002_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server. Slightly circular testing
        here - I setup the module with qc update so by definition the
        test must 'work'"""
        self.app = self.make_app(argv = ['qc', 'upload-qc', flowcells[1], '--mtime',  '100'], extensions=['scilifelab.pm.ext.ext_qc',  'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")
        
    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"]= None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(argv = ['qc', 'update', '--sample_prj', projects[2], '--project_id', 'P003', '--debug', '--force'], extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {'P001_101_index3': 'P001_101_index3', 'P001_102_index6':'P001_102'}
        self.app = self.make_app(argv = ['qc', 'update', '--sample_prj', projects[0], '--names', "{}".format(sample_map), '--debug', '--force'], extensions=['scilifelab.pm.ext.ext_qc',  'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
Exemple #38
0
    def test_2_make_project_note(self):
        """Make a project note subset by flowcell and project"""
        s_con = SampleRunMetricsConnection(username=self.user,
                                           password=self.pw,
                                           url=self.url)
        fc_con = FlowcellRunMetricsConnection(username=self.user,
                                              password=self.pw,
                                              url=self.url)
        p_con = ProjectSummaryConnection(username=self.user,
                                         password=self.pw,
                                         url=self.url)
        paragraphs = project_note_paragraphs()
        headers = project_note_headers()
        param = parameters
        project = p_con.get_entry(self.examples["project"])
        if not project:
            print "No project named {}".format(self.examples["project"])
            return
        if project:
            ordered_amount = p_con.get_ordered_amount(self.examples["project"])
        else:
            return
            ordered_amount = self.pargs.ordered_million_reads

        ## Start collecting the data
        sample_table = []
        sample_list = project['samples']
        param.update({
            key: project.get(ps_to_parameter[key], None)
            for key in ps_to_parameter.keys()
        })
        samples = p_con.map_name_to_srm(self.examples["project"],
                                        check_consistency=True,
                                        use_bc_map=True)
        all_passed = True
        for k, v in samples.items():
            if k == "Unexpected":
                continue
            project_sample = sample_list[k]
            vals = {
                x: project_sample.get(prjs_to_table[x], None)
                for x in prjs_to_table.keys()
            }
            vals['MOrdered'] = ordered_amount
            vals['BarcodeSeq'] = s_con.get_entry(v.keys()[0], "sequence")

            ## Set status
            vals['Status'] = set_status(
                vals) if vals['Status'] is None else vals['Status']
            vals.update({k: "N/A" for k in vals.keys() if vals[k] is None})
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[k] for k in table_keys])
        if all_passed: param["finished"] = 'Project finished.'
        sample_table.sort()
        sample_table = list(
            sample_table
            for sample_table, _ in itertools.groupby(sample_table))
        sample_table.insert(0, [
            'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
            'Status'
        ])
        paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
        make_note("{}.pdf".format(self.examples["project"]), headers,
                  paragraphs, **param)
Exemple #39
0
def project_status_note(project_name=None, username=None, password=None, url=None,
                        use_ps_map=True, use_bc_map=False, check_consistency=False,
                        ordered_million_reads=None, uppnex_id=None, customer_reference=None,
                        exclude_sample_ids={}, project_alias=None, sample_aliases={},
                        projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                        include_all_samples=False, **kw):
    """Make a project status note. Used keywords:

    :param project_name: project name
    :param user: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report
    """
    # parameters
    parameters = {
        "project_name" : project_name,
        "finished" : "Not finished, or cannot yet assess if finished.",
        }
    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name":"scilife_name", "customer_name":"customer_name", "project_name":"project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status']

    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable
    param = parameters
    
    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None, project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"] : {'sample':sample_name, 'id':s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]] : {'sample':sample_aliases[s["barcode_name"]], 'id':s["_id"]}}
                samples.update(s_d)
            else:
                s_d = {s["name"]:{'sample':s["name"], 'id':s["_id"], 'barcode_name':s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key:prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount", p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get('customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [x for l in last_library_preps.values() for x in l] 
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k,v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(k, v["id"], last_library_preps[v['sample']].values()[0], v['sample']))
                    continue
        else:
            pass
                    
        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq, ordered_million_reads, param)
        if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            for k,v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
                if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
            if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
            sample_table.append([vals[k] for k in table_keys])
    if all_passed: param["finished"] = 'Project finished.'
    sample_table.sort()
    sample_table = list(sample_table for sample_table,_ in itertools.groupby(sample_table))
    sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers, paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name), sample_table=sample_table, report="project_report", **param)
    param.update({k:"N/A" for k in param.keys() if param[k] is None or param[k] ==  ""})
    output_data["debug"].write(json.dumps({'param':param, 'table':sample_table}))
    return output_data
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error("Uppmax project was not specified and could not be fetched from project database")
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project), key=lambda k: (k.get('project_sample_name','NA'), k.get('flowcell','NA'), k.get('lane','NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
        assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project,self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,self.pargs.uppmax_project,self._meta.uppnex_delivery_dir)
        assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
        destination_root = os.path.join(destination_root,self.pargs.project)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname,
                                                                                                           index,
                                                                                                           fcid,
                                                                                                           lane,
                                                                                                           date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(proj_base_dir,samples)
        if len(uncompressed) > 0:
            self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples),self.pargs.project,destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir,
                                          destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for id, files in to_copy.items():
            # get the sample database object
            [sample] = [s for s in samples if s.get('_id') == id]
            self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name","NA"),sample.get("flowcell","NA")))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                passed = False
            else:
                # calculate md5sums on the source side and write it on the destination
                md5 = []
                for f in files:
                    m = md5sum(f[0])
                    mfile = "{}.md5".format(f[1])
                    md5.append([m,mfile,f[2],f[0]])
                    self.log.debug("md5sum for source file {}: {}".format(f[0],m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, read, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile,"{}  {}".format(m,os.path.basename(dstfile)),True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile,dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile,m,dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(f,stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw","uppmax","var","inboxfix","schedule",self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info("Logging delivery to StatusDB document {}".format(id))
                data = {'raw_data_delivery': {'timestamp': utc_time(),
                                              'files': {'R{}'.format(read):{'md5': m,
                                                                            'path': os.path.splitext(mfile)[0],
                                                                            'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                                                            'source_location': srcpath} for m, mfile, read, srcpath in md5},
                                              }
                        }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                        "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                       sample.get("flowcell"),
                                                                                       sample.get("project_sample_name"),
                                                                                       sample.get("sequence"),
                                                                                       sample.get("lane")))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile,data=jsonstr,overwrite=True)
                self.log.debug("Saving delivery in StatusDB document {}".format(id))
                sample.update(data)
                self._save(s_con,sample)
                self.log.debug(jsonstr)
Exemple #41
0
class TestQCUpload(PmFullTest):
    def setUp(self):
        self.app = self.make_app(
            argv=["qc", "upload-qc", flowcells[0], "--mtime", "10000"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"], "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"], "P001_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server"""
        self.app = self.make_app(
            argv=["qc", "upload-qc", flowcells[1], "--mtime", "100"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"] = None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(
            argv=["qc", "update", "--sample_prj", projects[2], "--project_id", "P003", "--debug", "--force"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {"P001_101_index3": "P001_101_index3", "P001_102_index6": "P001_102"}
        self.app = self.make_app(
            argv=[
                "qc",
                "update",
                "--sample_prj",
                projects[0],
                "--names",
                "{}".format(sample_map),
                "--debug",
                "--force",
            ],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
Exemple #42
0
class TestQCUpload(PmFullTest):
    def setUp(self):
        """FIXME: All other tests depend on data being uploaded, so
        these are not real unit tests. The setup to TestQCUpload has to
        be run prior to other tests, else unexpected failures will
        occur."""
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username="******",
                                                password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                              username="******",
                                              password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                                   username="******",
                                                   password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"],
                         "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"],
                         "P002_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server. Slightly circular testing
        here - I setup the module with qc update so by definition the
        test must 'work'"""
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[1], '--mtime', '100'],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"] = None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(argv=[
            'qc', 'update', '--sample_prj', projects[2], '--project_id',
            'P003', '--debug', '--force'
        ],
                                 extensions=[
                                     'scilifelab.pm.ext.ext_qc',
                                     'scilifelab.pm.ext.ext_couchdb'
                                 ])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {
            'P001_101_index3': 'P001_101_index3',
            'P001_102_index6': 'P001_102'
        }
        self.app = self.make_app(argv=[
            'qc', 'update', '--sample_prj', projects[0], '--names',
            "{}".format(sample_map), '--debug', '--force'
        ],
                                 extensions=[
                                     'scilifelab.pm.ext.ext_qc',
                                     'scilifelab.pm.ext.ext_couchdb'
                                 ])
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
Exemple #43
0
def project_status_note(project_name=None,
                        username=None,
                        password=None,
                        url=None,
                        use_ps_map=True,
                        use_bc_map=False,
                        check_consistency=False,
                        ordered_million_reads=None,
                        uppnex_id=None,
                        customer_reference=None,
                        exclude_sample_ids={},
                        project_alias=None,
                        sample_aliases={},
                        projectdb="projects",
                        samplesdb="samples",
                        flowcelldb="flowcells",
                        include_all_samples=False,
                        **kw):
    """Make a project status note. Used keywords:

    :param project_name: project name
    :param user: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report
    """
    # parameters
    parameters = {
        "project_name": project_name,
        "finished": "Not finished, or cannot yet assess if finished.",
    }
    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable
    param = parameters

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    param["ordered_amount"] = param.get("ordered_amount",
                                        p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get(
        'customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(k, v["id"],
                                last_library_preps[v['sample']].values()[0],
                                v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table
                                             ])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                if vals['Status'] == "N/A" or vals['Status'] == "NP":
                    all_passed = False
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq, ordered_million_reads,
                                            param)
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[k] for k in table_keys])
    if all_passed: param["finished"] = 'Project finished.'
    sample_table.sort()
    sample_table = list(sample_table
                        for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(0, [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers,
              paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name),
                   sample_table=sample_table,
                   report="project_report",
                   **param)
    param.update(
        {k: "N/A"
         for k in param.keys() if param[k] is None or param[k] == ""})
    output_data["debug"].write(
        json.dumps({
            'param': param,
            'table': sample_table
        }))
    return output_data
Exemple #44
0
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=[],
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param project_alias: project alias name
    :param phix: phix error rate
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "success": None,
        "run_mode": None,
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'".
            format(s.get("barcode_name", None), s.get("name", None),
                   s.get("_id", None)))
        s_param.update(parameters)
        s_param.update(
            {key: s[srm_to_parameter[key]]
             for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument['default'])
        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Calculation of average quality failed for sample {}, id {}".
                format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id',
                                                   project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)

        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({
            k: "N/A"
            for k in s_param.keys()
            if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0
        })
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs':
            {s["name"]: s["barcode_name"]
             for s in sample_run_list}
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    rest_notes = make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes, "{}_{}_{}_sample_summary.pdf".format(project_name,
                                                    s.get("date", None),
                                                    s.get("flowcell", None)))
    return output_data
Exemple #45
0
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about the data delivery
    """
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }

    project_name = kw.get('project_name', None)
    flowcell = kw.get('flowcell', None)
    LOG.debug("Generating data delivery note for project {}{}.".format(
        project_name, ' and flowcell {}'.format(flowcell if flowcell else '')))

    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"

    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))

    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    project_samples = p_con.get_entry(project_name, "samples")
    customer_names = {
        sample_name: sample.get('customer_name', 'N/A')
        for sample_name, sample in project_samples.items()
    }

    data = [[
        'SciLifeLab ID', 'Submitted ID', 'Flowcell', 'Lane', 'Barcode', 'Read',
        'Path', 'MD5', 'Size (bytes)', 'Timestamp'
    ]]
    for sample in samples:
        sname = sample.get('project_sample_name', 'N/A')
        cname = customer_names.get(sname, 'N/A')
        fc = sample.get('flowcell', 'N/A')
        lane = sample.get('lane', 'N/A')
        barcode = sample.get('sequence', 'N/A')
        if 'raw_data_delivery' not in sample:
            data.append([sname, cname, '', '', '', '', '', '', '', ''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp', 'N/A')
        for read, file in delivery.get('files', {}).items():
            data.append([
                sname,
                cname,
                fc,
                lane,
                barcode,
                read,
                file.get('path', 'N/A'),
                file.get('md5', 'N/A'),
                file.get('size_in_bytes', 'N/A'),
                tstamp,
            ])

    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(
        project_name, '_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile, "w") as outh:
        csvw = csv.writer(outh)
        for row in data:
            csvw.writerow(row)

    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())

    return output_data
Exemple #46
0
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell,
                                           sample_prj=self.pargs.project),
                         key=lambda k:
                         (k.get('project_sample_name', 'NA'),
                          k.get('flowcell', 'NA'), k.get('lane', 'NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info(
                    "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                    .format(sname, index, fcid, lane, date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir, samples)
        if len(uncompressed) > 0:
            self.log.warn(
                "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
                .format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for id, files in to_copy.items():
            # get the sample database object
            [sample] = [s for s in samples if s.get('_id') == id]
            self.log.info("Processing sample {} and flowcell {}".format(
                sample.get("project_sample_name", "NA"),
                sample.get("flowcell", "NA")))

            # calculate md5sums on the source side and write it on the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            # write the md5sum to a file at the destination and verify the transfer
            passed = True
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(
                    mfile, "{}  {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))

                # if dry-run, make sure verification pass
                if self.pargs.dry_run:
                    dm = m
                else:
                    dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(
                    dstfile, dm))
                if m != dm:
                    self.log.warn(
                        "md5sum verification FAILED for {}. Source: {}, Target: {}"
                        .format(dstfile, m, dm))
                    self.log.warn(
                        "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                        .format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue

                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(
                        f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                        | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(
                os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                             self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info(
                    "Logging delivery to StatusDB document {}".format(id))
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            'R{}'.format(read): {
                                'md5':
                                m,
                                'path':
                                os.path.splitext(mfile)[0],
                                'size_in_bytes':
                                self._getsize(os.path.splitext(mfile)[0]),
                                'source_location':
                                srcpath
                            }
                            for m, mfile, read, srcpath in md5
                        },
                    }
                }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(
                    os.path.dirname(md5[0][3]),
                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                        sample.get("date"), sample.get("flowcell"),
                        sample.get("project_sample_name"),
                        sample.get("sequence"), sample.get("lane")))
                self.log.debug(
                    "Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug(
                    "Saving delivery in StatusDB document {}".format(id))
                sample.update(data)
                self._save(s_con, sample)
                self.log.debug(jsonstr)
Exemple #47
0
def _project_status_note_table(project_name=None,
                               username=None,
                               password=None,
                               url=None,
                               use_ps_map=True,
                               use_bc_map=False,
                               check_consistency=False,
                               ordered_million_reads=None,
                               uppnex_id=None,
                               customer_reference=None,
                               exclude_sample_ids={},
                               project_alias=None,
                               sample_aliases={},
                               projectdb="projects",
                               samplesdb="samples",
                               flowcelldb="flowcells",
                               include_all_samples=False,
                               param={},
                               **kw):

    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    #Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary'][
            'all_samples_sequenced']
    except (TypeError, KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    param["ordered_amount"] = param.get(
        "ordered_amount",
        p_con.get_ordered_amount(project_name, samples=sample_dict))

    if not param.get('customer_reference'):
        try:
            param['customer_reference'] = prj_summary['details'][
                'customer_project_reference']
        except (TypeError, KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(
                            k, v["id"], ",".join(
                                list(
                                    set(last_library_preps[
                                        v['sample']].values()))), v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table
                                             ])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq, ordered_million_reads,
                                            param)
            sample_table.append([vals[k] for k in table_keys])
    if all_samples_sequenced:
        param["finished"] = 'All samples for this project have been sequenced.'
    sample_table.sort()
    sample_table = list(sample_table
                        for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(
        0,
        ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])

    return output_data, sample_table, param
Exemple #48
0
def application_qc(project_name=None,
                   flowcell=None,
                   application=None,
                   username=None,
                   password=None,
                   url=None,
                   sampledb="samples",
                   projectdb="projects",
                   **kw):
    """Perform application specific qc on a project.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param application: application for which to perform qc
    :param username: database username
    :param password: database password
    :param url: database url
    :param sampledb: samples database name
    :param projectdb: project database name
    """
    LOG.debug("Doing application qc for project {}, flowcell {}".format(
        project_name, flowcell))

    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)
    s_con = SampleRunMetricsConnection(dbname=sampledb,
                                       username=username,
                                       password=password,
                                       url=url)
    prj_summary = p_con.get_entry(project_name)
    qc_data = get_qc_data(project_name, p_con, s_con, flowcell)

    if not prj_summary is None:
        qc_data = get_qc_data(project_name, p_con, s_con, flowcell)
        if prj_summary.get("application") not in APPLICATION_MAP.keys():
            if not application:
                LOG.warn(
                    "No such application {}. Please use the application option (available choices {})"
                    .format(application, ",".join(QC_CUTOFF.keys())))
                return output_data
            application = application
        else:
            application = APPLICATION_MAP[prj_summary.get("application")]
    else:
        LOG.info(
            "No such project {} in project summary. Trying to get qc data anyway."
            .format(project_name))
        if not application:
            LOG.warn(
                "No application provided. Please use the application option (available choices {})"
                .format(",".join(QC_CUTOFF.keys())))
            return output_data
        qc_data = _get_sample_qc_data(project_name, application, s_con,
                                      flowcell)

    output_data = _qc_info_header(project_name, application, output_data)
    for k, v in sorted(qc_data.iteritems()):
        y = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(y) + "\n")
    return output_data