def test_4_get_samples(self): """Test getting samples given flowcell and sample_prj.""" sample_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url) samples = sample_con.get_samples(fc_id=self.examples["flowcell"]) print "Number of samples before subsetting: " + str(len(samples)) samples = sample_con.get_samples(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"]) print "Number of samples after subsetting: " + str(len(samples))
def test_4_get_samples(self):
    """Test getting samples given flowcell and sample_prj."""
    sample_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    # All sample runs on the example flowcell
    samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
    print "Number of samples before subsetting: " + str(len(samples))
    # Same flowcell query, additionally restricted to the example project
    samples = sample_con.get_samples(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"])
    print "Number of samples after subsetting: " + str(len(samples))
def fastq_screen(project_name=None, flowcell=None, username=None, password=None, url=None, dbname="samples", **kw):
    """Perform application specific qc on a project.

    Collects the 'fastq_scr' entry of every sample run matching the given
    project/flowcell and renders it as a text table on the returned stdout
    buffer.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param username: database username
    :param password: database password
    :param url: database url
    :param dbname: samples database name

    :returns: dict with 'stdout' and 'stderr' StringIO buffers
    """
    # FIX: the format string lacked a second placeholder, so the flowcell
    # argument was silently dropped from the log message.
    LOG.debug("Running fastq screen summary on project {}, flowcell {}".format(project_name, flowcell))
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    s_con = SampleRunMetricsConnection(dbname=dbname, username=username, password=password, url=url)
    samples = s_con.get_samples(fc_id=flowcell, sample_prj=project_name)
    for s in samples:
        LOG.debug("Checking fastq_screen data for sample {}, id {}, project {}".format(s.get("name", None), s.get("_id", None), s.get("sample_prj", None)))
        fqscreen_data = s.get("fastq_scr", {})
        output_data["stdout"].write(s["barcode_name"] + "\n")
        if fqscreen_data:
            # All entries share the same columns, so only the first
            # header list is actually printed.
            header = [[x for x in v.keys()] for k, v in fqscreen_data.iteritems()]
            output_data["stdout"].write("\t\t" + "".join("{:>27}".format(x) for x in header[0]) + "\n")
            vals = ["{:>12}\t{}\n".format(k, "".join(["{:>27}".format(x) for x in v.values()])) for k, v in fqscreen_data.iteritems()]
            for v in vals:
                output_data["stdout"].write(v)
    return output_data
def test_2_make_note(self):
    """Make a note subset by example flowcell and project"""
    # Connections to the three databases used to assemble a sample note
    s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()
    samples = s_con.get_samples(self.examples["flowcell"], self.examples["project"])
    project = p_con.get_entry(self.examples["project"])
    # NOTE(review): 'samples' is immediately rebound here, discarding the
    # get_samples() result above -- presumably intentional; confirm.
    samples = p_con.map_srm_to_name(self.examples["project"], fc_id=self.examples["flowcell"], use_bc_map=True)
    for k, v in samples.items():
        # NOTE(review): this aliases the module-level 'parameters' dict
        # rather than copying it, so updates leak across iterations.
        s_param = parameters
        s = s_con.get_entry(k)
        # Map sample-run fields into note parameters
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s["date"], s["flowcell"])
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        s_param['avg_quality_score'] = s_con.calc_avg_qv(s["name"])
        # Read count is reported in millions, rounded to one decimal
        s_param['rounded_read_count'] = round(float(s_param['rounded_read_count'])/1e6, 1) if s_param['rounded_read_count'] else None
        s_param['customer_name'] = project['samples'][v["sample"]].get('customer_name', None)
        if project:
            s_param['ordered_amount'] = p_con.get_ordered_amount(self.examples["project"])
            s_param['customer_reference'] = s_param.get('customer_reference', project['customer_reference'])
            s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project['uppnex_id'])
        s_param['success'] = sequencing_success(s_param, cutoffs)
        # Replace any remaining None values with "N/A" before rendering
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None})
        make_note("{}.pdf".format(s["barcode_name"]), headers, paragraphs, **s_param)
def test_get_samples_wrong_info(self):
    """Test getting samples when either flowcell or project id information is wrong"""
    conn = SampleRunMetricsConnection(dbname="samples-test", username=self.user, password=self.pw, url=self.url)
    # A non-existent project combined with a valid flowcell must match nothing.
    hits = conn.get_samples(sample_prj="bogusproject", fc_id=self.examples["flowcell"])
    LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(len(hits)))
    self.assertEqual(len(hits), 0)
def test_get_samples(self):
    """Test getting samples given flowcell and sample_prj."""
    conn = SampleRunMetricsConnection(dbname="samples-test", username=self.user, password=self.pw, url=self.url)
    # Table of (query kwargs, expected hit count, log label), exercised in order.
    cases = [
        (dict(fc_id=self.examples["flowcell"]), 5,
         "Selecting on flowcell: "),
        (dict(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"]), 2,
         "Selecting on flowcell, subsetting on project: "),
        (dict(sample_prj=self.examples["project"]), 3,
         "Selecting on project: "),
        (dict(sample_prj=self.examples["project"], fc_id=self.examples["flowcell"]), 2,
         "Selecting on project, subsetting on flowcell: "),
    ]
    for kwargs, expected, label in cases:
        hits = conn.get_samples(**kwargs)
        LOG.info(label + str(len(hits)))
        self.assertEqual(len(hits), expected)
def test_get_samples_wrong_info(self):
    """Test getting samples when either flowcell or project id information is wrong"""
    sample_con = SampleRunMetricsConnection(dbname="samples-test", username=self.user, password=self.pw, url=self.url)
    # A bogus project name combined with a valid flowcell should match no samples
    samples = sample_con.get_samples(sample_prj="bogusproject", fc_id=self.examples["flowcell"])
    LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(len(samples)))
    self.assertEqual(len(samples), 0)
def update(self):
    """Update sample run documents in the samples database.

    Depending on the parsed arguments this either sets 'project_id'
    and/or 'project_sample_name' for every sample run belonging to
    --sample_prj, or (when no name mapping is given) falls back to
    extensive name matching against the project database.
    """
    if not self._check_pargs(["sample_prj"]):
        return
    # Prefer an explicitly supplied url; fall back to the config file
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)
    if self.pargs.project_id:
        self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
        for s in samples:
            # Ask before overwriting an existing project_id (unless forced)
            if not s.get("project_id", None) is None:
                if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                    continue
            s["project_id"] = self.pargs.project_id
            s_con.save(s)
    if self.pargs.names:
        self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
        # --names is either a path to a json file or a literal dict string
        if os.path.exists(self.pargs.names):
            with open(self.pargs.names) as fh:
                names_d = json.load(fh)
        else:
            names_d = ast.literal_eval(self.pargs.names)
        # Group the sample runs by barcode name; groupby requires the
        # input to be sorted on the same key
        samples_sort = sorted(samples, key=lambda s: s["barcode_name"])
        groups = {}
        for k, g in itertools.groupby(samples_sort, key=lambda x: x["barcode_name"]):
            groups[k] = list(g)
        for barcode_name in names_d:
            sample_list = groups.get(barcode_name, None)
            if not sample_list:
                continue
            for s in sample_list:
                # Ask before overwriting an existing name (unless forced)
                if not s.get("project_sample_name", None) is None:
                    if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                        continue
                s["project_sample_name"] = names_d[barcode_name]
                s_con.save(s)
    else:
        # No explicit mapping: match barcode names against project samples
        # in the project database instead
        self.app.log.info("Trying to use extensive matching...")
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        project_name = self.pargs.sample_prj
        if self.pargs.project_alias:
            project_name = self.pargs.project_alias
        for s in samples:
            project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
            if project_sample:
                self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                s["project_sample_name"] = project_sample["sample_name"]
                s_con.save(s)
def update(self):
    """Update sample run documents in the samples database.

    Depending on the parsed arguments this either sets 'project_id'
    and/or 'project_sample_name' for every sample run belonging to
    --sample_prj, or (when no name mapping is given) falls back to
    extensive name matching against the project database.
    """
    if not self._check_pargs(["sample_prj"]):
        return
    # Prefer an explicitly supplied url; fall back to the config file
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)
    if self.pargs.project_id:
        self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
        for s in samples:
            # Ask before overwriting an existing project_id (unless forced)
            if not s.get("project_id", None) is None:
                if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                    continue
            s["project_id"] = self.pargs.project_id
            s_con.save(s)
    if self.pargs.names:
        self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
        # --names is either a path to a json file or a literal dict string
        if os.path.exists(self.pargs.names):
            with open(self.pargs.names) as fh:
                names_d = json.load(fh)
        else:
            names_d = ast.literal_eval(self.pargs.names)
        # Group the sample runs by barcode name; groupby requires the
        # input to be sorted on the same key
        samples_sort = sorted(samples, key=lambda s: s["barcode_name"])
        groups = {}
        for k, g in itertools.groupby(samples_sort, key=lambda x: x["barcode_name"]):
            groups[k] = list(g)
        for barcode_name in names_d:
            sample_list = groups.get(barcode_name, None)
            if not sample_list:
                continue
            for s in sample_list:
                # Ask before overwriting an existing name (unless forced)
                if not s.get("project_sample_name", None) is None:
                    if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                        continue
                s["project_sample_name"] = names_d[barcode_name]
                s_con.save(s)
    else:
        # No explicit mapping: match barcode names against project samples
        # in the project database instead
        self.app.log.info("Trying to use extensive matching...")
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        project_name = self.pargs.sample_prj
        if self.pargs.project_alias:
            project_name = self.pargs.project_alias
        for s in samples:
            project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
            if project_sample:
                self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                s["project_sample_name"] = project_sample["sample_name"]
                s_con.save(s)
def test_get_samples(self):
    """Test getting samples given flowcell and sample_prj."""
    sample_con = SampleRunMetricsConnection(dbname="samples-test", username=self.user, password=self.pw, url=self.url)
    # Select on flowcell only
    samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
    LOG.info("Selecting on flowcell: " + str(len(samples)))
    self.assertEqual(len(samples), 5)
    # Select on flowcell, subset on project
    samples = sample_con.get_samples(fc_id=self.examples["flowcell"], sample_prj=self.examples["project"])
    LOG.info("Selecting on flowcell, subsetting on project: " + str(len(samples)))
    self.assertEqual(len(samples), 2)
    # Select on project only
    samples = sample_con.get_samples(sample_prj=self.examples["project"])
    LOG.info("Selecting on project: " + str(len(samples)))
    self.assertEqual(len(samples), 3)
    # Select on project, subset on flowcell (same result as the
    # flowcell-then-project query above)
    samples = sample_con.get_samples(sample_prj=self.examples["project"], fc_id=self.examples["flowcell"])
    LOG.info("Selecting on project, subsetting on flowcell: " + str(len(samples)))
    self.assertEqual(len(samples), 2)
def fastq_screen(project_name=None, flowcell=None, username=None, password=None, url=None, dbname="samples", **kw):
    """Perform application specific qc on a project.

    Collects the 'fastq_scr' entry of every sample run matching the given
    project/flowcell and renders it as a text table on the returned stdout
    buffer.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param username: database username
    :param password: database password
    :param url: database url
    :param dbname: samples database name

    :returns: dict with 'stdout' and 'stderr' StringIO buffers
    """
    # FIX: the format string lacked a second placeholder, so the flowcell
    # argument was silently dropped from the log message.
    LOG.debug("Running fastq screen summary on project {}, flowcell {}".format(
        project_name, flowcell))
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    s_con = SampleRunMetricsConnection(dbname=dbname, username=username,
                                       password=password, url=url)
    samples = s_con.get_samples(fc_id=flowcell, sample_prj=project_name)
    for s in samples:
        LOG.debug(
            "Checking fastq_screen data for sample {}, id {}, project {}".
            format(s.get("name", None), s.get("_id", None),
                   s.get("sample_prj", None)))
        fqscreen_data = s.get("fastq_scr", {})
        output_data["stdout"].write(s["barcode_name"] + "\n")
        if fqscreen_data:
            # All entries share the same columns, so only the first
            # header list is actually printed.
            header = [[x for x in v.keys()]
                      for k, v in fqscreen_data.iteritems()]
            output_data["stdout"].write("\t\t" + "".join("{:>27}".format(x)
                                                         for x in header[0]) +
                                        "\n")
            vals = [
                "{:>12}\t{}\n".format(
                    k, "".join(["{:>27}".format(x) for x in v.values()]))
                for k, v in fqscreen_data.iteritems()
            ]
            for v in vals:
                output_data["stdout"].write(v)
    return output_data
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about
    the data delivery.

    Builds one row per delivered file (per sample run) and writes the
    result both to a csv file and, as a text table, to the returned
    stdout buffer.

    :param kw: keyword arguments passed through to the database
        connections; 'project_name' and (optionally) 'flowcell' select
        the samples to report on

    :returns: dict with 'stdout', 'stderr' and 'debug' StringIO buffers
    """
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    project_name = kw.get('project_name', None)
    flowcell = kw.get('flowcell', None)
    # FIX: the conditional previously only guarded the placeholder value,
    # so a missing flowcell still produced "... and flowcell ." in the log;
    # guard the whole phrase instead.
    LOG.debug("Generating data delivery note for project {}{}.".format(
        project_name, ' and flowcell {}'.format(flowcell) if flowcell else ''))
    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"
    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))
    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    project_samples = p_con.get_entry(project_name, "samples")
    customer_names = {
        sample_name: sample.get('customer_name', 'N/A')
        for sample_name, sample in project_samples.items()
    }
    data = [[
        'SciLifeLab ID', 'Submitted ID', 'Flowcell', 'Lane', 'Barcode',
        'Read', 'Path', 'MD5', 'Size (bytes)', 'Timestamp'
    ]]
    for sample in samples:
        sname = sample.get('project_sample_name', 'N/A')
        cname = customer_names.get(sname, 'N/A')
        fc = sample.get('flowcell', 'N/A')
        lane = sample.get('lane', 'N/A')
        barcode = sample.get('sequence', 'N/A')
        # Samples without delivery information get an empty placeholder row
        if 'raw_data_delivery' not in sample:
            data.append([sname, cname, '', '', '', '', '', '', '', ''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp', 'N/A')
        # FIX: renamed loop variable 'file' -> 'finfo' to stop shadowing
        # the builtin.
        for read, finfo in delivery.get('files', {}).items():
            data.append([
                sname,
                cname,
                fc,
                lane,
                barcode,
                read,
                finfo.get('path', 'N/A'),
                finfo.get('md5', 'N/A'),
                finfo.get('size_in_bytes', 'N/A'),
                tstamp,
            ])
    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(
        project_name, '_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile, "w") as outh:
        csvw = csv.writer(outh)
        for row in data:
            csvw.writerow(row)
    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())
    return output_data
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about the data delivery
    """
    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    project_name = kw.get('project_name',None)
    flowcell = kw.get('flowcell',None)
    # NOTE(review): when flowcell is falsy the message still ends with
    # " and flowcell ." -- the conditional only guards the placeholder,
    # not the phrase; confirm whether that is intended.
    LOG.debug("Generating data delivery note for project {}{}.".format(project_name,' and flowcell {}'.format(flowcell if flowcell else '')))
    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"
    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))
    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    project_samples = p_con.get_entry(project_name, "samples")
    customer_names = {sample_name:sample.get('customer_name','N/A') for sample_name, sample in project_samples.items()}
    # One header row plus one row per delivered file
    data = [['SciLifeLab ID','Submitted ID','Flowcell','Lane','Barcode','Read','Path','MD5','Size (bytes)','Timestamp']]
    for sample in samples:
        sname = sample.get('project_sample_name','N/A')
        cname = customer_names.get(sname,'N/A')
        fc = sample.get('flowcell','N/A')
        lane = sample.get('lane','N/A')
        barcode = sample.get('sequence','N/A')
        # Samples without delivery information get an empty placeholder row
        if 'raw_data_delivery' not in sample:
            data.append([sname,cname,'','','','','','','',''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp','N/A')
        # NOTE(review): loop variable 'file' shadows the builtin
        for read, file in delivery.get('files',{}).items():
            data.append([sname,
                         cname,
                         fc,
                         lane,
                         barcode,
                         read,
                         file.get('path','N/A'),
                         file.get('md5','N/A'),
                         file.get('size_in_bytes','N/A'),
                         tstamp,])
    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(project_name,'_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile,"w") as outh:
        csvw = csv.writer(outh)
        for row in data:
            csvw.writerow(row)
    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())
    return output_data
def raw_data(self):
    """Deliver raw data (fastq files) for a project to an Uppnex inbox.

    Fetches the sample runs for --project (optionally subset by
    --flowcell) from the samples database, copies the files with rsync,
    verifies md5sums on the destination side and, on success, logs the
    delivery back to the StatusDB sample document.
    """
    if not self._check_pargs(["project"]):
        return
    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]
    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        # FIX: 'group' was an undefined name (NameError whenever --group
        # was supplied); use the parsed argument instead.
        gid = grp.getgrnam(self.pargs.group).gr_gid
    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project databse"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples databse"
    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(
            self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error(
                "Uppmax project was not specified and could not be fetched from project database"
            )
            return
    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell,
                                       sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name', 'NA'),
                                    k.get('flowcell', 'NA'),
                                    k.get('lane', 'NA')))
    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(
        self._meta.production_root
    ), "No such directory {}; check your production config".format(
        self._meta.production_root)
    assert os.path.exists(
        proj_base_dir), "No project {} in production path {}".format(
            self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get(
            "deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn(
            "{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get(
            "deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn(
            "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(
        destination_root
    ), "Delivery destination folder {} does not exist".format(
        destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)
    # If interactively select, build a list of samples to skip
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info(
                "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                .format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process
    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(
        proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn(
            "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
            .format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return
    self.log.info(
        "Will deliver data for {} samples from project {} to {}".format(
            len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return
    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                      samples)
    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no(
                "Do you wish to continue delivering using rsync?",
                default="yes"):
            return
        self.pargs.rsync = True
    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(
            sample.get("project_sample_name", "NA"),
            sample.get("flowcell", "NA")))
        # calculate md5sums on the source side and write it on the destination
        md5 = []
        for f in files:
            m = md5sum(f[0])
            mfile = "{}.md5".format(f[1])
            md5.append([m, mfile, f[2], f[0]])
            self.log.debug("md5sum for source file {}: {}".format(f[0], m))
        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])
        # write the md5sum to a file at the destination and verify the transfer
        passed = True
        for m, mfile, read, srcpath in md5:
            dstfile = os.path.splitext(mfile)[0]
            self.log.debug("Writing md5sum to file {}".format(mfile))
            self.app.cmd.write(
                mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
            self.log.debug("Verifying md5sum for file {}".format(dstfile))
            # if dry-run, make sure verification pass
            if self.pargs.dry_run:
                dm = m
            else:
                dm = md5sum(dstfile)
            self.log.debug("md5sum for destination file {}: {}".format(
                dstfile, dm))
            if m != dm:
                # Remove the corrupt destination files so a retry is safe
                self.log.warn(
                    "md5sum verification FAILED for {}. Source: {}, Target: {}"
                    .format(dstfile, m, dm))
                self.log.warn(
                    "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                    .format(dstfile))
                self.app.cmd.safe_unlink(dstfile)
                self.app.cmd.safe_unlink(mfile)
                passed = False
                continue
            # Modify the permissions to ug+rw
            for f in [dstfile, mfile]:
                self.app.cmd.chmod(
                    f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                    | stat.S_IWGRP)
        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(
            os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                         self.pargs.uppmax_project))
        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info(
                "Logging delivery to StatusDB document {}".format(id))
            data = {
                'raw_data_delivery': {
                    'timestamp': utc_time(),
                    'files': {
                        'R{}'.format(read): {
                            'md5':
                            m,
                            'path':
                            os.path.splitext(mfile)[0],
                            'size_in_bytes':
                            self._getsize(os.path.splitext(mfile)[0]),
                            'source_location': srcpath
                        }
                        for m, mfile, read, srcpath in md5
                    },
                }
            }
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(
                os.path.dirname(md5[0][3]),
                "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                    sample.get("date"), sample.get("flowcell"),
                    sample.get("project_sample_name"),
                    sample.get("sequence"), sample.get("lane")))
            self.log.debug(
                "Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug(
                "Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
def raw_data(self):
    """Deliver raw data (fastq files) for a project to an Uppnex inbox.

    Fetches the sample runs for --project (optionally subset by
    --flowcell) from the samples database, copies the files with rsync,
    verifies md5sums on the destination side (skipped for --link and
    dry runs) and, on success, logs the delivery back to the StatusDB
    sample document.
    """
    if not self._check_pargs(["project"]):
        return
    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]
    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        # FIX: 'group' was an undefined name (NameError whenever --group
        # was supplied); use the parsed argument instead.
        gid = grp.getgrnam(self.pargs.group).gr_gid
    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project databse"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples databse"
    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return
    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name','NA'), k.get('flowcell','NA'), k.get('lane','NA')))
    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project,self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,self.pargs.uppmax_project,self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root,self.pargs.project)
    # If interactively select, build a list of samples to skip
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process
    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir,samples)
    if len(uncompressed) > 0:
        self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return
    self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples),self.pargs.project,destination_root))
    if not query_yes_no("Continue?"):
        return
    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root, samples)
    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True
    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name","NA"),sample.get("flowcell","NA")))
        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])
        # md5 verification is skipped (and delivery not logged) for linked
        # files and dry runs
        passed = True
        if self.pargs.link or self.pargs.dry_run:
            passed = False
        else:
            # calculate md5sums on the source side and write it on the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m,mfile,f[2],f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0],m))
            # write the md5sum to a file at the destination and verify the transfer
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(mfile,"{} {}".format(m,os.path.basename(dstfile)),True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))
                dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(dstfile,dm))
                if m != dm:
                    # Remove the corrupt destination files so a retry is safe
                    self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile,m,dm))
                    self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue
                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(f,stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(os.path.join("/sw","uppmax","var","inboxfix","schedule",self.pargs.uppmax_project))
        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info("Logging delivery to StatusDB document {}".format(id))
            data = {'raw_data_delivery': {'timestamp': utc_time(),
                                          'files': {'R{}'.format(read):{'md5': m,
                                                                        'path': os.path.splitext(mfile)[0],
                                                                        'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                                                        'source_location': srcpath}
                                                    for m, mfile, read, srcpath in md5},
                                          }
                    }
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                    sample.get("flowcell"),
                                                                                    sample.get("project_sample_name"),
                                                                                    sample.get("sequence"),
                                                                                    sample.get("lane")))
            self.log.debug("Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile,data=jsonstr,overwrite=True)
            self.log.debug("Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con,sample)
            self.log.debug(jsonstr)