def _do_aspera_transfer(self, transfer_token=None, user_name=None, password=None, remote_path=None,
                        file_path=None, path2library=None, sub_id=None):

    # check submission status
    submission_status = Submission().isComplete(sub_id)

    if not submission_status or submission_status == 'false':
        lg.log('Starting aspera transfer', level=Loglvl.INFO, type=Logtype.FILE)

        kwargs = dict(target_id=sub_id, commenced_on=str(datetime.now()))
        Submission().save_record(dict(), **kwargs)

        # k counts the files transferred; prev_file tracks the file named in the
        # previous progress line so k only advances when a new file appears
        k = -1
        prev_file = ''

        f_str = str()
        for f in file_path:
            f_str = f_str + ' ' + f

        cmd = "./ascp -d -QT -l300M -L- {f_str!s} {user_name!s}:{remote_path!s}".format(**locals())
        lg.log(cmd, level=Loglvl.INFO, type=Logtype.FILE)
        os.chdir(path2library)

        try:
            thread = pexpect.spawn(cmd, timeout=None)
            thread.expect(["assword:", pexpect.EOF])
            thread.sendline(password)

            cpl = thread.compile_pattern_list([pexpect.EOF, '(.+)'])

            while True:
                i = thread.expect_list(cpl, timeout=None)
                if i == 0:  # EOF! Possible error point if encountered before transfer completion
                    print("Process termination - check exit status!")
                    break
                elif i == 1:
                    pexp_match = thread.match.group(1)
                    tokens_to_match = ["Mb/s"]
                    units_to_match = ["KB", "MB"]
                    time_units = ['d', 'h', 'm', 's']
                    end_of_transfer = False

                    if all(tm in pexp_match.decode("utf-8") for tm in tokens_to_match):
                        fields = {
                            "transfer_status": "transferring",
                            "current_time": datetime.now().strftime("%d-%m-%Y %H:%M:%S")
                        }

                        tokens = pexp_match.decode("utf-8").split(" ")

                        for token in tokens:
                            if not token == '':
                                if "file" in token:
                                    fields['file_path'] = token.split('=')[-1]
                                    if prev_file != fields['file_path']:
                                        k = k + 1
                                    prev_file = fields['file_path']
                                elif '%' in token:
                                    pct = float(token.rstrip("%"))
                                    # pct = (1/len(file_path) * pct) + (k * 1/len(file_path) * 100)
                                    fields['pct_completed'] = pct

                                    # flag end of transfer
                                    print(str(transfer_token) + ": " + str(pct) + '% transferred')
                                    if pct == 100:
                                        end_of_transfer = True
                                elif any(um in token for um in units_to_match):
                                    fields['amt_transferred'] = token
                                elif "Mb/s" in token or "Mbps" in token:
                                    t = token[:-4]
                                    if '=' in t:
                                        fields['transfer_rate'] = t[t.find('=') + 1:]
                                    else:
                                        fields['transfer_rate'] = t
                                elif "status" in token:
                                    fields['transfer_status'] = token.split('=')[-1]
                                elif "rate" in token:
                                    fields['transfer_rate'] = token.split('=')[-1]
                                elif "elapsed" in token:
                                    fields['elapsed_time'] = token.split('=')[-1]
                                elif "loss" in token:
                                    fields['bytes_lost'] = token.split('=')[-1]
                                elif "size" in token:
                                    fields['file_size_bytes'] = token.split('=')[-1]
                                elif "ETA" in token:
                                    eta = tokens[-2]
                                    estimated_completion = ""
                                    eta_split = eta.split(":")
                                    t_u = time_units[-len(eta_split):]
                                    for indx, eta_token in enumerate(eta.split(":")):
                                        if eta_token == "00":
                                            continue
                                        estimated_completion += eta_token + t_u[indx] + " "
                                    fields['estimated_completion'] = estimated_completion

                        RemoteDataFile().update_transfer(transfer_token, fields)

            kwargs = dict(target_id=sub_id, completed_on=datetime.now())
            Submission().save_record(dict(), **kwargs)

            # close thread
            thread.close()
            lg.log('Aspera Transfer completed', level=Loglvl.INFO, type=Logtype.FILE)

        except OSError:
            return redirect('web.apps.web_copo.views.goto_error', request=HttpRequest(),
                            message='There appears to be an issue with EBI.')

        # setup paths for conversion directories
        conv_dir = os.path.join(self._dir, sub_id)
        if not os.path.exists(os.path.join(conv_dir, 'json')):
            os.makedirs(os.path.join(conv_dir, 'json'))
        json_file_path = os.path.join(conv_dir, 'json', 'isa_json.json')
        xml_dir = conv_dir
        xml_path = os.path.join(xml_dir, 'run_set.xml')

        # Convert COPO JSON to ISA JSON
        lg.log('Obtaining ISA-JSON', level=Loglvl.INFO, type=Logtype.FILE)
        conv = cnv.Investigation(submission_token=sub_id)
        meta = conv.get_schema()

        # dump metadata to output file
        json_file = open(json_file_path, 'w')
        json_file.write(dumps(meta))
        json_file.close()

        # Validate ISA-JSON
        lg.log('Validating ISA-JSON', level=Loglvl.INFO, type=Logtype.FILE)
        with open(json_file_path) as json_file:
            v = isajson.validate(json_file)
        lg.log(v, level=Loglvl.INFO, type=Logtype.FILE)

        # convert to SRA with isatools converter
        lg.log('Converting to SRA', level=Loglvl.INFO, type=Logtype.FILE)
        sra_settings = d_utils.json_to_pytype(SRA_SETTINGS).get("properties", dict())
        datafilehashes = conv.get_datafilehashes()
        json2sra.convert2(json_fp=open(json_file_path), path=conv_dir, sra_settings=sra_settings,
                          datafilehashes=datafilehashes, validate_first=False)

        # finally submit to SRA
        lg.log('Submitting XMLs to ENA via CURL', level=Loglvl.INFO, type=Logtype.FILE)
        submission_file = os.path.join(xml_dir, 'submission.xml')
        project_file = os.path.join(xml_dir, 'project_set.xml')
        sample_file = os.path.join(xml_dir, 'sample_set.xml')
        experiment_file = os.path.join(xml_dir, 'experiment_set.xml')
        run_file = os.path.join(xml_dir, 'run_set.xml')

        curl_cmd = ('curl -k -F "SUBMISSION=@' + submission_file + '"'
                    ' -F "PROJECT=@' + os.path.join(remote_path, project_file) + '"'
                    ' -F "SAMPLE=@' + os.path.join(remote_path, sample_file) + '"'
                    ' -F "EXPERIMENT=@' + os.path.join(remote_path, experiment_file) + '"'
                    ' -F "RUN=@' + os.path.join(remote_path, run_file) + '"'
                    ' "https://www-test.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA%20Webin-39233%20Apple123"')

        output = subprocess.check_output(curl_cmd, shell=True)
        lg.log(output, level=Loglvl.INFO, type=Logtype.FILE)
        lg.log("Extracting fields from receipt", level=Loglvl.INFO, type=Logtype.FILE)

        xml = ET.fromstring(output)

        accessions = dict()

        # get project accessions
        project = xml.find('./PROJECT')
        project_accession = project.get('accession', default='undefined')
        project_alias = project.get('alias', default='undefined')
        accessions['project'] = {'accession': project_accession, 'alias': project_alias}

        # get experiment accessions
        experiment = xml.find('./EXPERIMENT')
        experiment_accession = experiment.get('accession', default='undefined')
        experiment_alias = experiment.get('alias', default='undefined')
        accessions['experiment'] = {'accession': experiment_accession, 'alias': experiment_alias}

        # get submission accessions
        submission = xml.find('./SUBMISSION')
        submission_accession = submission.get('accession', default='undefined')
        submission_alias = submission.get('alias', default='undefined')
        accessions['submission'] = {'accession': submission_accession, 'alias': submission_alias}

        # get run accessions
        run = xml.find('./RUN')
        run_accession = run.get('accession', default='undefined')
        run_alias = run.get('alias', default='undefined')
        accessions['run'] = {'accession': run_accession, 'alias': run_alias}

        # get sample accessions
        samples = xml.findall('./SAMPLE')
        sample_accessions = list()
        for sample in samples:
            sample_accession = sample.get('accession', default='undefined')
            sample_alias = sample.get('alias', default='undefined')
            s = {'sample_accession': sample_accession, 'sample_alias': sample_alias}
            # the BioSamples accession sits on the SAMPLE element's child
            for bio_s in sample:
                s['biosample_accession'] = bio_s.get('accession', default='undefined')
            sample_accessions.append(s)
        accessions['sample'] = sample_accessions

        # save accessions to mongo profile record
        s = Submission().get_record(sub_id)
        s['accessions'] = accessions
        s['complete'] = True
        s['target_id'] = str(s.pop('_id'))
        Submission().save_record(dict(), **s)
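
# A minimal, self-contained sketch of the ascp progress parsing used above, handy
# for exercising the token handling in isolation. The helper name and the sample
# line are illustrative only, not captured ascp output; real lines vary between
# Aspera versions.
def _parse_ascp_progress_line(line):
    """Extract progress fields from a single 'key=value'-style ascp status line."""
    fields = {}
    for token in line.split(" "):
        if not token:
            continue
        if "file" in token:
            fields['file_path'] = token.split('=')[-1]
        elif '%' in token:
            fields['pct_completed'] = float(token.rstrip('%'))
        elif "rate" in token:
            fields['transfer_rate'] = token.split('=')[-1]
    return fields

# Example (hypothetical line):
# _parse_ascp_progress_line("file=reads.fastq.gz 42% rate=280Mb/s")
# -> {'file_path': 'reads.fastq.gz', 'pct_completed': 42.0, 'transfer_rate': '280Mb/s'}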
def _submit(self, sub_id, dataFile_ids):
    for f_id in dataFile_ids:
        mongo_file = DataFile().get_record(f_id)
        c = ChunkedUpload.objects.get(pk=int(mongo_file["file_id"]))
        file_path = os.path.join(self.MEDIA_ROOT, str(c.file))
        orig_name = c.filename

        sub = mongo_file['description']['attributes']
        data = dict()
        data['defined_type'] = sub.get('type_category', dict()).get('type')
        data['title'] = sub.get('title_author_description', dict()).get('title')

        authors = sub.get('title_author_description', dict()).get('author').split(',')
        lst = list()
        for x in authors:
            lst.append({'name': x})
        data['authors'] = lst

        data['description'] = sub.get('title_author_description', dict()).get('description')

        cat = sub.get('type_category', dict()).get('categories')
        if cat:
            cat = cat.split(',')
            cat = list(map(int, cat))
            data['categories'] = cat
        else:
            data['categories'] = list()

        # pad very short tags to at least three characters
        data['tags'] = sub.get('tags', dict()).get('keywords').split(',')
        for idx, t in enumerate(data['tags']):
            if len(t) < 3:
                if len(t) == 1:
                    t = t + (2 * t)
                elif len(t) == 2:
                    t = t + t
                data['tags'][idx] = t

        # normalise references into absolute URLs
        data['references'] = sub.get('tags', dict()).get('references').split(',')
        for idx, x in enumerate(data['references']):
            if x != '':
                if not x.startswith('http'):
                    if not x.startswith('www'):
                        data['references'][idx] = 'http://www.' + x
                    else:
                        data['references'][idx] = 'http://' + x
        if len(data['references']) == 1 and data['references'][0] == '':
            # if blank ref, pop
            data.pop('references')

        data['funding'] = sub.get('tags', dict()).get('funding')
        data['licenses'] = sub.get('tags', dict()).get('licenses')
        data['publish'] = sub.get('figshare_publish', dict()).get('should_publish')

        # Create article
        # data = json.dumps({'title': orig_name, 'defined_type': 'figure'})
        endpoint = 'account/articles'
        resp = requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS,
                             data=json.dumps(data))
        article_id = json.loads(resp.content.decode('utf8'))['location'].rsplit('/', 1)[1]

        # Get file info
        # with open(file_path, 'rb') as fin:
        #     fin.seek(0, 2)  # Go to end of file
        #     size = fin.tell()
        size = c.offset
        info = json.dumps({'name': orig_name, 'size': size})

        # Initiate upload
        endpoint = 'account/articles/{}/files'.format(article_id)
        resp = requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS, data=info)
        file_id = json.loads(resp.content.decode('utf-8'))['location'].rsplit('/', 1)[1]

        # Get upload/parts info
        endpoint = 'account/articles/{}/files/{}'.format(article_id, file_id)
        resp = requests.get(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)
        url = '{upload_url}'.format(**json.loads(resp.content.decode('utf-8')))
        parts = json.loads(requests.get(url).content.decode('utf-8'))['parts']

        # start upload timer
        t = datetime.datetime.now()

        # Upload parts
        with open(file_path, 'rb') as fin:
            for idx, part in enumerate(parts):
                percent_done = idx / len(parts) * 100
                size = part['endOffset'] - part['startOffset'] + 1
                address = '{}/{}'.format(url, part['partNo'])
                x = datetime.datetime.now()
                requests.put(address, data=fin.read(size))
                delta = datetime.datetime.now() - x
                # calculate current upload rate in MB per second
                bw = (size / delta.total_seconds()) / 1000 / 1000
                fields = {
                    'transfer_rate': bw,
                    'pct_completed': percent_done
                }
                RemoteDataFile().update_transfer(self.transfer_token, fields)

        # Mark file upload as completed
        upload_time = datetime.datetime.now() - t
        requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)

        fields = {
            'pct_completed': 100,
            'transfer_status': 'success',
            'completed_on': str(datetime.datetime.now()),
            'article_id': article_id
        }
        RemoteDataFile().update_transfer(self.transfer_token, fields)

        if data['publish'] == 'True':
            # publish api
            endpoint = 'account/articles/{}/publish'.format(article_id)
            resp = requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)
            location = json.loads(resp.content.decode('utf8'))['location']

            # get accession data
            endpoint = 'articles/{}'.format(article_id)
            resp = requests.get(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)

            # save accessions to mongo profile record
            s = Submission().get_record(sub_id)
            s['article_id'] = json.loads(resp.content.decode('utf8'))['figshare_url']
            s['complete'] = True
            s['status'] = 'published'
            s['target_id'] = str(s.pop('_id'))
            Submission().save_record(dict(), **s)
        else:
            # save accessions to mongo profile record
            s = Submission().get_record(sub_id)
            s['article_id'] = article_id
            s['complete'] = True
            s['status'] = 'not published'
            s['target_id'] = str(s.pop('_id'))
            Submission().save_record(dict(), **s)

    # mark submission as complete
    Submission().mark_submission_complete(sub_id)
    Submission().mark_figshare_article_id(sub_id=sub_id, article_id=article_id)
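
# The figshare upload service hands back the file's part list as JSON; the loop
# above reads exactly (endOffset - startOffset + 1) bytes per part and PUTs each
# chunk to '<upload_url>/<partNo>'. A minimal sketch of that arithmetic, using an
# illustrative two-part layout (real part boundaries come from the service):
def _part_sizes(parts):
    """Return the byte size of each upload part, per its start/end offsets."""
    return [p['endOffset'] - p['startOffset'] + 1 for p in parts]

# Example:
# _part_sizes([{'partNo': 1, 'startOffset': 0, 'endOffset': 1048575},
#              {'partNo': 2, 'startOffset': 1048576, 'endOffset': 1500000}])
# -> [1048576, 451425]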
def _do_save_accessions(self, xml):
    lg.log('Retrieving and saving accessions to database', level=Loglvl.INFO, type=Logtype.FILE)
    accessions = dict()

    # get project accessions
    projects = xml.findall('./PROJECT')
    project_accessions = list()
    for project in projects:
        project_accession = project.get('accession', default='undefined')
        project_alias = project.get('alias', default='undefined')
        project_accessions.append(dict(accession=project_accession, alias=project_alias))
    accessions['project'] = project_accessions

    # get experiment accessions
    experiments = xml.findall('./EXPERIMENT')
    experiment_accessions = list()
    for experiment in experiments:
        experiment_accession = experiment.get('accession', default='undefined')
        experiment_alias = experiment.get('alias', default='undefined')
        experiment_accessions.append(dict(accession=experiment_accession, alias=experiment_alias))
    accessions['experiment'] = experiment_accessions

    # get submission accessions
    submissions = xml.findall('./SUBMISSION')
    submission_accessions = list()
    for submission in submissions:
        submission_accession = submission.get('accession', default='undefined')
        submission_alias = submission.get('alias', default='undefined')
        submission_accessions.append(dict(accession=submission_accession, alias=submission_alias))
    accessions['submission'] = submission_accessions

    # get run accessions
    runs = xml.findall('./RUN')
    run_accessions = list()
    for run in runs:
        run_accession = run.get('accession', default='undefined')
        run_alias = run.get('alias', default='undefined')
        run_accessions.append(dict(accession=run_accession, alias=run_alias))
    accessions['run'] = run_accessions

    # get sample accessions
    samples = xml.findall('./SAMPLE')
    sample_accessions = list()
    for sample in samples:
        sample_accession = sample.get('accession', default='undefined')
        sample_alias = sample.get('alias', default='undefined')
        s = {'sample_accession': sample_accession, 'sample_alias': sample_alias}
        # the BioSamples accession sits on the SAMPLE element's child
        for bio_s in sample:
            s['biosample_accession'] = bio_s.get('accession', default='undefined')
        sample_accessions.append(s)
    accessions['sample'] = sample_accessions

    # save accessions to mongo record
    submission_record = Submission().get_record(self.submission_id)
    submission_record['accessions'] = accessions
    submission_record['complete'] = True
    submission_record['completed_on'] = datetime.now()
    submission_record['target_id'] = str(submission_record.pop('_id'))

    # collated records are transient; drop them if present
    try:
        del submission_record["transcript"]["collated_records"]
    except KeyError:
        pass

    Submission().save_record(dict(), **submission_record)
    RemoteDataFile().delete_transfer(self.transfer_token)
    self.context["ena_status"] = "completed"
    return
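
# For reference, _do_save_accessions (and get_accessions below) expect an ENA
# receipt shaped roughly like the fragment below. This is an illustrative sketch
# with placeholder accessions; element order and extra attributes on real
# receipts may differ.
#
# <RECEIPT success="true">
#     <PROJECT accession="PRJEBxxxx" alias="copo-project"/>
#     <EXPERIMENT accession="ERXxxxxx" alias="copo-experiment"/>
#     <RUN accession="ERRxxxxx" alias="copo-run"/>
#     <SAMPLE accession="ERSxxxxx" alias="copo-sample">
#         <EXT_ID accession="SAMEAxxxxx" type="biosample"/>
#     </SAMPLE>
#     <SUBMISSION accession="ERAxxxxx" alias="copo-submission"/>
# </RECEIPT>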
def get_accessions(self, receipt, sub_id, transfer_token=None):
    xml = ET.fromstring(receipt)

    accessions = dict()

    # first check for errors
    errors = xml.findall('*/ERROR')
    if errors:
        error_text = str()
        for e in errors:
            error_text = error_text + e.text

        transfer_fields = dict()
        transfer_fields["error"] = error_text
        transfer_fields["current_time"] = datetime.now().strftime("%d-%m-%Y %H:%M:%S")

        # save error to transfer record
        RemoteDataFile().update_transfer(transfer_token, transfer_fields)
        return False

    # get project accessions
    projects = xml.findall('./PROJECT')
    project_accessions = list()
    for project in projects:
        project_accession = project.get('accession', default='undefined')
        project_alias = project.get('alias', default='undefined')
        project_accessions.append(dict(accession=project_accession, alias=project_alias))
    accessions['project'] = project_accessions

    # get experiment accessions
    experiments = xml.findall('./EXPERIMENT')
    if experiments:
        experiment_accessions = list()
        for experiment in experiments:
            experiment_accession = experiment.get('accession', default='undefined')
            experiment_alias = experiment.get('alias', default='undefined')
            experiment_accessions.append(dict(accession=experiment_accession, alias=experiment_alias))
        accessions['experiment'] = experiment_accessions

    # get submission accessions
    submissions = xml.findall('./SUBMISSION')
    if submissions:
        submission_accessions = list()
        for submission in submissions:
            submission_accession = submission.get('accession', default='undefined')
            submission_alias = submission.get('alias', default='undefined')
            submission_accessions.append(dict(accession=submission_accession, alias=submission_alias))
        accessions['submission'] = submission_accessions

    # get run accessions
    runs = xml.findall('./RUN')
    if runs:
        run_accessions = list()
        for run in runs:
            run_accession = run.get('accession', default='undefined')
            run_alias = run.get('alias', default='undefined')
            run_accessions.append(dict(accession=run_accession, alias=run_alias))
        accessions['run'] = run_accessions

    # get sample accessions
    samples = xml.findall('./SAMPLE')
    if samples:
        sample_accessions = list()
        for sample in samples:
            sample_accession = sample.get('accession', default='undefined')
            sample_alias = sample.get('alias', default='undefined')
            s = {'sample_accession': sample_accession, 'sample_alias': sample_alias}
            # the BioSamples accession sits on the SAMPLE element's child
            for bio_s in sample:
                s['biosample_accession'] = bio_s.get('accession', default='undefined')
            sample_accessions.append(s)
        accessions['sample'] = sample_accessions

    # save accessions to mongo record
    s = Submission().get_record(sub_id)
    s['accessions'] = accessions
    s['complete'] = True
    s['completed_on'] = datetime.now()
    s['target_id'] = str(s.pop('_id'))
    Submission().save_record(dict(), **s)

    RemoteDataFile().delete_transfer(transfer_token)
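
# Typical call site (a sketch; the names mirror the surrounding code): feed the
# raw receipt bytes returned by the curl submission straight to get_accessions.
# It returns False when the receipt carries ERROR elements, in which case the
# error text has already been written to the transfer record.
#
# output = subprocess.check_output(curl_cmd, shell=True)
# if self.get_accessions(output, sub_id, transfer_token=transfer_token) is False:
#     lg.log('ENA receipt reported errors', level=Loglvl.INFO, type=Logtype.FILE)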