def test_sra_dump_submission_xml_biis7(self):
    """Convert BII-S-7 ISA-JSON to SRA and compare submission.xml with the expected XML.

    Uses the default SRA settings held on the test case and writes the
    converter output into the test's temporary directory.
    """
    sra_settings = self._sra_default_config
    json_path = os.path.join(self._json_data_dir, 'BII-S-7', 'BII-S-7.json')
    # Context managers release the file handles even if the conversion raises.
    with open(json_path) as json_fp:
        json2sra.convert2(json_fp, self._tmp_dir, sra_settings=sra_settings, validate_first=False)
    # Now try load the SRA output in test and compare against the expected output in test data directory
    with open(os.path.join(self._tmp_dir, 'submission.xml'), 'rb') as xml_fp:
        submission_xml = xml_fp.read()
    actual_submission_xml_biis7 = etree.fromstring(submission_xml)
    self.assertTrue(utils.assert_xml_equal(self._expected_submission_xml_biis7, actual_submission_xml_biis7))
def test_sra_dump_file_set(self):
    """Check that converting BII-S-3 writes exactly the expected set of SRA XML files.

    Fails when the output directory contains a file outside the expected set
    or when one of the expected files is absent.
    """
    json_path = os.path.join(self._json_data_dir, 'BII-S-3', 'BII-S-3.json')
    # Context manager releases the input handle even if the conversion raises.
    with open(json_path) as json_fp:
        json2sra.convert2(json_fp, self._tmp_dir, validate_first=False)
    # SRA should always produce experiment_set.xml, run_set.xml, sample_set.xml,
    # project_set.xml and submission.xml
    expected_sra_path = os.path.join(self._tmp_dir)
    expected_file_set = {'experiment_set.xml', 'run_set.xml', 'sample_set.xml', 'project_set.xml',
                         'submission.xml'}
    if os.path.exists(expected_sra_path):
        actual_file_set = set(os.listdir(expected_sra_path))
        extra_files_found = actual_file_set - expected_file_set
        if extra_files_found:
            self.fail("Unexpected file found in SRA output: " + str(extra_files_found))
        expected_files_missing = expected_file_set - actual_file_set
        if expected_files_missing:
            # Distinct message so a missing file is not reported as an unexpected one.
            self.fail("Expected file missing from SRA output: " + str(expected_files_missing))
def convert(source_path, dest_path, sra_settings=None, validate_first=True):
    """Convert ISA-Tab at ``source_path`` to SRA XML and return the output zipped in memory.

    The ISA-Tab is first converted to ISA-JSON, which is then handed to
    ``json2sra.convert2`` to write the SRA files under ``dest_path``.

    :param source_path: location of the ISA-Tab input, passed to ``isatab2json.convert``
    :param dest_path: directory that receives the SRA output
    :param sra_settings: optional settings forwarded to ``json2sra.convert2``
    :param validate_first: forwarded to ``isatab2json.convert``; the JSON-to-SRA
        step is always run with ``validate_first=False``
    :return: a ``BytesIO`` positioned at offset 0; it holds a ZIP archive built
        by ``zipdir`` when ``dest_path`` is a directory and is empty otherwise
    """
    from isatools.convert import isatab2json, json2sra

    isa_json = isatab2json.convert(source_path, validate_first=validate_first)
    # json2sra.convert2 takes a file-like object, so wrap the serialised JSON in memory.
    isa_json_fp = StringIO(json.dumps(isa_json))
    isa_json_fp.name = "BII-S-3.json"
    json2sra.convert2(isa_json_fp, dest_path, sra_settings=sra_settings, validate_first=False)
    logging.info("Conversion complete...")

    buffer = BytesIO()
    if os.path.isdir(dest_path):
        with ZipFile(buffer, 'w') as zip_file:
            # use relative dir_name to avoid absolute path on file names
            zipdir(dest_path, zip_file)
            # Report the archive members through logging, like the rest of this function.
            logging.info("SRA archive contents: %s", zip_file.namelist())
    buffer.seek(0)
    return buffer
def convert(source_path, dest_path, sra_settings=None, validate_first=True):
    """Convert ISA-Tab at ``source_path`` to SRA XML and return the output zipped in memory.

    The ISA-Tab is first converted to ISA-JSON, which is then handed to
    ``json2sra.convert2`` to write the SRA files under ``dest_path``.

    :param source_path: location of the ISA-Tab input, passed to ``isatab2json.convert``
    :param dest_path: directory that receives the SRA output
    :param sra_settings: optional settings forwarded to ``json2sra.convert2``
    :param validate_first: forwarded to ``isatab2json.convert``; the JSON-to-SRA
        step is always run with ``validate_first=False``
    :return: a ``BytesIO`` positioned at offset 0; it holds a ZIP archive built
        by ``zipdir`` when ``dest_path`` is a directory and is empty otherwise
    """
    from isatools.convert import isatab2json, json2sra

    isa_json = isatab2json.convert(source_path, validate_first=validate_first)
    # json2sra.convert2 takes a file-like object, so wrap the serialised JSON in memory.
    isa_json_fp = StringIO(json.dumps(isa_json))
    isa_json_fp.name = "BII-S-3.json"
    json2sra.convert2(isa_json_fp, dest_path, sra_settings=sra_settings, validate_first=False)
    logging.info("Conversion complete...")

    buffer = BytesIO()
    if os.path.isdir(dest_path):
        with ZipFile(buffer, 'w') as zip_file:
            # use relative dir_name to avoid absolute path on file names
            zipdir(dest_path, zip_file)
            # Report the archive members through logging, like the rest of this function.
            logging.info("SRA archive contents: %s", zip_file.namelist())
    buffer.seek(0)
    return buffer
def _do_aspera_transfer(self, transfer_token=None, user_name=None, password=None, remote_path=None, file_path=None,
                        path2library=None, sub_id=None):
    """Upload data files with Aspera, convert the metadata to SRA XML and submit it to ENA.

    Steps, in order:

    1. If the submission is not yet marked complete, run ``ascp`` through
       pexpect, parse its progress output and push the parsed fields to
       ``RemoteDataFile().update_transfer``.
    2. Dump the ISA-JSON for the submission to ``<self._dir>/<sub_id>/json``,
       validate it and convert it to SRA XML with ``json2sra.convert2``.
    3. POST the generated XML files to the ENA test drop-box with ``curl``.
    4. Extract accessions from the receipt and store them on the submission
       record, marking it complete.

    :param transfer_token: identifier passed to ``RemoteDataFile().update_transfer``
    :param user_name: remote account used in the ``ascp`` destination
    :param password: sent to ``ascp`` after its password prompt
    :param remote_path: remote destination path; also joined with the XML paths for curl
    :param file_path: iterable of local file paths to upload
    :param path2library: directory containing the ``ascp`` executable (chdir target)
    :param sub_id: submission identifier
    :return: ``None`` normally; the result of ``redirect(...)`` if spawning or
        driving ``ascp`` raises ``OSError``
    """
    # check submission status
    submission_status = Submission().isComplete(sub_id)

    if not submission_status or submission_status == 'false':
        lg.log('Starting aspera transfer', level=Loglvl.INFO, type=Logtype.FILE)

        kwargs = dict(target_id=sub_id, commenced_on=str(datetime.now()))
        Submission().save_record(dict(), **kwargs)

        # k is a loop counter which keeps track of the number of files transfered
        k = -1
        # Remembered across output lines so k only advances when the reported file changes.
        prev_file = ''
        # Each path keeps its leading space so the command string is laid out as before.
        f_str = ''.join(' ' + f for f in file_path)
        cmd = "./ascp -d -QT -l300M -L- {f_str!s} {user_name!s}:{remote_path!s}".format(
            f_str=f_str, user_name=user_name, remote_path=remote_path)
        lg.log(cmd, level=Loglvl.INFO, type=Logtype.FILE)
        os.chdir(path2library)

        try:
            thread = pexpect.spawn(cmd, timeout=None)
            thread.expect(["assword:", pexpect.EOF])
            thread.sendline(password)

            cpl = thread.compile_pattern_list([pexpect.EOF, '(.+)'])

            # Loop-invariant lookup tables for parsing the progress output.
            tokens_to_match = ["Mb/s"]
            units_to_match = ["KB", "MB"]
            time_units = ['d', 'h', 'm', 's']

            while True:
                i = thread.expect_list(cpl, timeout=None)
                if i == 0:
                    # EOF! Possible error point if encountered before transfer completion
                    print("Process termination - check exit status!")
                    break
                elif i == 1:
                    pexp_match = thread.match.group(1)
                    end_of_transfer = False
                    # Decode once; pexpect hands back bytes.
                    line = pexp_match.decode("utf-8")

                    if all(tm in line for tm in tokens_to_match):
                        fields = {
                            "transfer_status": "transferring",
                            "current_time": datetime.now().strftime("%d-%m-%Y %H:%M:%S")
                        }

                        tokens = line.split(" ")

                        for token in tokens:
                            if not token == '':
                                if "file" in token:
                                    fields['file_path'] = token.split('=')[-1]
                                    if prev_file != fields['file_path']:
                                        k = k + 1
                                    # Assignment (the original used '==' here, which did nothing).
                                    prev_file = fields['file_path']
                                elif '%' in token:
                                    pct = float((token.rstrip("%")))
                                    # pct = (1/len(file_path) * pct) + (k * 1/len(file_path) * 100)
                                    fields['pct_completed'] = pct
                                    # flag end of transfer
                                    print(str(transfer_token) + ": " + str(pct) + '% transfered')
                                    # Compare the parsed number; the raw token is a string and never equals 100.
                                    if pct == 100:
                                        end_of_transfer = True
                                elif any(um in token for um in units_to_match):
                                    fields['amt_transferred'] = token
                                elif "Mb/s" in token or "Mbps" in token:
                                    t = token[:-4]
                                    if '=' in t:
                                        fields['transfer_rate'] = t[t.find('=') + 1:]
                                    else:
                                        fields['transfer_rate'] = t
                                elif "status" in token:
                                    fields['transfer_status'] = token.split('=')[-1]
                                elif "rate" in token:
                                    fields['transfer_rate'] = token.split('=')[-1]
                                elif "elapsed" in token:
                                    fields['elapsed_time'] = token.split('=')[-1]
                                elif "loss" in token:
                                    fields['bytes_lost'] = token.split('=')[-1]
                                elif "size" in token:
                                    fields['file_size_bytes'] = token.split('=')[-1]
                                elif "ETA" in token:
                                    # The ETA value is read from the second-to-last token of the line.
                                    eta = tokens[-2]
                                    estimated_completion = ""
                                    eta_split = eta.split(":")
                                    # Align the colon-separated parts with the smallest time units.
                                    t_u = time_units[-len(eta_split):]
                                    for indx, eta_token in enumerate(eta_split):
                                        if eta_token == "00":
                                            continue
                                        estimated_completion += eta_token + t_u[indx] + " "
                                    fields['estimated_completion'] = estimated_completion

                        RemoteDataFile().update_transfer(transfer_token, fields)

            kwargs = dict(target_id=sub_id, completed_on=datetime.now())
            Submission().save_record(dict(), **kwargs)
            # close thread
            thread.close()
            lg.log('Aspera Transfer completed', level=Loglvl.INFO, type=Logtype.FILE)

        except OSError:
            return redirect('web.apps.web_copo.views.goto_error', request=HttpRequest(),
                            message='There appears to be an issue with EBI.')

    # NOTE(review): the original indentation was lost; the steps below are assumed to run
    # whether or not the transfer branch above executed - confirm against the source file.

    # setup paths for conversion directories
    conv_dir = os.path.join(self._dir, sub_id)
    os.makedirs(os.path.join(conv_dir, 'json'), exist_ok=True)
    json_file_path = os.path.join(conv_dir, 'json', 'isa_json.json')
    xml_dir = conv_dir

    # Convert COPO JSON to ISA JSON
    lg.log('Obtaining ISA-JSON', level=Loglvl.INFO, type=Logtype.FILE)
    conv = cnv.Investigation(submission_token=sub_id)
    meta = conv.get_schema()
    # dump metadata to output file
    with open(json_file_path, '+w') as json_file:
        json_file.write(dumps(meta))

    # Validate ISA_JSON
    lg.log('Validating ISA-JSON', level=Loglvl.INFO, type=Logtype.FILE)
    with open(json_file_path) as json_file:
        v = isajson.validate(json_file)
    lg.log(v, level=Loglvl.INFO, type=Logtype.FILE)

    # convert to SRA with isatools converter
    lg.log('Converting to SRA', level=Loglvl.INFO, type=Logtype.FILE)
    sra_settings = d_utils.json_to_pytype(SRA_SETTINGS).get("properties", dict())
    datafilehashes = conv.get_datafilehashes()
    # Context manager so the JSON handle is closed once the converter returns.
    with open(json_file_path) as json_fp:
        json2sra.convert2(json_fp=json_fp, path=conv_dir, sra_settings=sra_settings,
                          datafilehashes=datafilehashes, validate_first=False)

    # finally submit to SRA
    lg.log('Submitting XMLS to ENA via CURL', level=Loglvl.INFO, type=Logtype.FILE)
    submission_file = os.path.join(xml_dir, 'submission.xml')
    project_file = os.path.join(xml_dir, 'project_set.xml')
    sample_file = os.path.join(xml_dir, 'sample_set.xml')
    experiment_file = os.path.join(xml_dir, 'experiment_set.xml')
    run_file = os.path.join(xml_dir, 'run_set.xml')

    # Argument list instead of a shell string: paths are passed to curl verbatim, so
    # spaces or shell metacharacters in them cannot alter the command.
    # SECURITY(review): the URL embeds account credentials and '-k' disables TLS
    # certificate verification; both should come from configuration.
    curl_cmd = [
        'curl', '-k',
        '-F', 'SUBMISSION=@' + submission_file,
        '-F', 'PROJECT=@' + os.path.join(remote_path, project_file),
        '-F', 'SAMPLE=@' + os.path.join(remote_path, sample_file),
        '-F', 'EXPERIMENT=@' + os.path.join(remote_path, experiment_file),
        '-F', 'RUN=@' + os.path.join(remote_path, run_file),
        'https://www-test.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA%20Webin-39233%20Apple123',
    ]

    output = subprocess.check_output(curl_cmd)
    lg.log(output, level=Loglvl.INFO, type=Logtype.FILE)
    lg.log("Extracting fields from receipt", level=Loglvl.INFO, type=Logtype.FILE)

    xml = ET.fromstring(output)

    accessions = dict()

    # get project, experiment, submission and run accessions (one element of each is read)
    for tag in ('PROJECT', 'EXPERIMENT', 'SUBMISSION', 'RUN'):
        element = xml.find('./' + tag)
        accessions[tag.lower()] = {
            'accession': element.get('accession', default='undefined'),
            'alias': element.get('alias', default='undefined'),
        }

    # get sample accessions
    sample_accessions = list()
    for sample in xml.findall('./SAMPLE'):
        sample_entry = {
            'sample_accession': sample.get('accession', default='undefined'),
            'sample_alias': sample.get('alias', default='undefined'),
        }
        # The accession of the last child element is kept as the biosample accession.
        for bio_s in sample:
            sample_entry['biosample_accession'] = bio_s.get('accession', default='undefined')
        sample_accessions.append(sample_entry)

    accessions['sample'] = sample_accessions

    # save accessions to mongo profile record
    record = Submission().get_record(sub_id)
    record['accessions'] = accessions
    record['complete'] = True
    record['target_id'] = str(record.pop('_id'))
    Submission().save_record(dict(), **record)
def test_sra_dump_run_set_xml_biis3(self):
    """Convert BII-S-3 ISA-JSON to SRA and compare run_set.xml with the expected XML.

    The converter output is written into the test's temporary directory.
    """
    json_path = os.path.join(self._json_data_dir, 'BII-S-3', 'BII-S-3.json')
    # Context managers release the file handles even if the conversion raises.
    with open(json_path) as json_fp:
        json2sra.convert2(json_fp, self._tmp_dir, validate_first=False)
    # Now try load the SRA output in test and compare against the expected output in test data directory
    with open(os.path.join(self._tmp_dir, 'run_set.xml'), 'rb') as xml_fp:
        run_set_xml = xml_fp.read()
    actual_run_set_xml_biis3 = etree.fromstring(run_set_xml)
    self.assertTrue(utils.assert_xml_equal(self._expected_run_set_xml_biis3, actual_run_set_xml_biis3))