def addIGVSamples(fields, results_samp, annot_samples=None):
    """Creates the phenotype file for IGV.

    :param fields: Solr field names to include in the sample file
    :type fields: array.
    :param results_samp: Solr results for samples to be included
    :type results_samp: array.
    :param annot_samples: annotation files included with the Solr results
    :type annot_samples: array.
    """
    # create human readable indexes of fields to iterate over
    fields_dict = {}
    for i in fields:
        find_index = i.find("_Characteristics_")
        if find_index > -1:
            new_key = i.split("_Characteristics_")[0]
            fields_dict[i] = new_key
    # create temp file to enter into file_store
    tempsampname = tempfile.NamedTemporaryFile(delete=False)
    # write header to sample file
    tempsampname.write("#sampleTable" + "\n")
    # write column names to sample file
    col_names = "Linking_id"
    for k, v in fields_dict.iteritems():
        col_names = col_names + '\t' + v
    tempsampname.write(col_names + "\n")
    # iterate over sample files
    pheno_results = get_sample_lines(fields_dict, results_samp)
    tempsampname.write(pheno_results)
    # if annotations are not null
    if annot_samples:
        pheno_annot = get_sample_lines(fields_dict, annot_samples)
        tempsampname.write(pheno_annot)
    # close temp file
    tempsampname.close()
    # get file_store uuid
    filestore_uuid = create(tempsampname.name, permanent=True, filetype="txt")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)
    # rename file by way of file_store
    temp_file = filestore_item.datafile.name.split('/')
    temp_file = temp_file[len(temp_file) - 1] + '.txt'
    filestore_item = rename(filestore_uuid, temp_file)
    # get file information based on the file uuid
    curr_fs = FileStoreItem.objects.get(uuid=filestore_uuid)
    # full URL to the file for the selected UUID
    curr_url = get_full_url(curr_fs.get_datafile_url())
    # delete temp file
    os.unlink(tempsampname.name)
    return curr_url

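# For reference, a minimal standalone sketch of the fields_dict construction
# used above: each Solr field name containing the "_Characteristics_" marker
# is mapped to its human-readable prefix (the field names below are made up).
def _example_fields_dict():
    fields = ["Sample_Name_Characteristics_s",
              "organism_Characteristics_s",
              "uuid"]  # hypothetical Solr field names
    fields_dict = {}
    for field in fields:
        if field.find("_Characteristics_") > -1:
            # keep everything before the marker as the display name
            fields_dict[field] = field.split("_Characteristics_")[0]
    # e.g. {'Sample_Name_Characteristics_s': 'Sample_Name',
    #       'organism_Characteristics_s': 'organism'}
    return fields_dict
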
def copy_file(orig_uuid):
    """Helper function that copies a file if given the original file's UUID.

    :param orig_uuid: UUID of file to copy.
    :type orig_uuid: str.
    :returns: UUID of newly copied file.
    """
    orig_fsi = read(orig_uuid)
    newfile_uuid = None
    try:
        newfile_uuid = create(
            orig_fsi.source, orig_fsi.sharename, orig_fsi.filetype,
            permanent=is_permanent(orig_uuid))
        import_file(newfile_uuid, refresh=True)
    except AttributeError:
        pass
    return newfile_uuid

def copy_file(orig_uuid):
    """Helper function that copies a file if given the original file's UUID.

    :param orig_uuid: UUID of file to copy.
    :type orig_uuid: str.
    :returns: UUID of newly copied file.
    """
    newfile_uuid = None
    try:
        orig_fsi = FileStoreItem.objects.get(uuid=orig_uuid)
    except (FileStoreItem.DoesNotExist,
            FileStoreItem.MultipleObjectsReturned) as e:
        logger.error("Couldn't properly fetch FileStoreItem: %s", e)
    else:
        try:
            newfile_uuid = create(
                orig_fsi.source, orig_fsi.sharename, orig_fsi.filetype,
                permanent=is_permanent(orig_uuid))
            import_file(newfile_uuid, refresh=True)
        except AttributeError:
            pass
    return newfile_uuid

def _get_galaxy_download_tasks(analysis):
    """Get file import tasks for Galaxy analysis results."""
    logger.debug("Preparing to download analysis results from Galaxy")
    task_list = []
    # retrieve list of files to download for workflow
    dl_files = analysis.workflow_dl_files
    # create dictionary based on files to download predetermined by workflow
    # w/ keep operators
    dl_dict = {}
    for dl in dl_files.all():
        temp_dict = {}
        temp_dict['filename'] = dl.filename
        temp_dict['pair_id'] = dl.pair_id
        dl_dict[str(dl.step_id)] = temp_dict
    galaxy_instance = analysis.workflow.workflow_engine.instance
    try:
        download_list = galaxy_instance.get_history_file_list(
            analysis.history_id)
    except galaxy.client.ConnectionError as exc:
        error_msg = (
            "Error downloading Galaxy history files for analysis '%s': %s")
        logger.error(error_msg, analysis.name, exc.message)
        analysis.set_status(Analysis.FAILURE_STATUS,
                            error_msg % (analysis.name, exc.message))
        analysis.galaxy_cleanup()
        return task_list
    # iterate through files in the current Galaxy history
    for results in download_list:
        # download file if result state is "ok"
        if results['state'] == 'ok':
            file_type = results["type"]
            curr_file_id = results['name']
            if curr_file_id in dl_dict:
                curr_dl_dict = dl_dict[curr_file_id]
                result_name = curr_dl_dict['filename'] + '.' + file_type
                # size of file defined by Galaxy
                file_size = results['file_size']
                # determine whether Galaxy results should be downloaded
                # through HTTP or by copying files directly; HTML files are
                # retrieved as zip archives via the dataset URL
                if galaxy_instance.local_download and file_type != 'html':
                    download_url = results['file_name']
                else:
                    download_url = urlparse.urljoin(
                        galaxy_instance.base_url, '/'.join([
                            'datasets', str(results['dataset_id']),
                            'display?to_ext=txt'
                        ]))
                # workaround to set the correct file type for zip archives of
                # FastQC HTML reports produced by Galaxy dynamically
                if file_type == 'html':
                    file_type = 'zip'
                # TODO: when changing permanent=True, fix update of % download
                # of file
                filestore_uuid = create(source=download_url,
                                        filetype=file_type)
                # add history files to Django model
                temp_file = AnalysisResult(analysis_uuid=analysis.uuid,
                                           file_store_uuid=filestore_uuid,
                                           file_name=result_name,
                                           file_type=file_type)
                temp_file.save()
                analysis.results.add(temp_file)
                analysis.save()
                # download analysis results into file_store
                # only download files if size is greater than zero
                if file_size > 0:
                    task_id = import_file.subtask(
                        (filestore_uuid, False, file_size))
                    task_list.append(task_id)
    return task_list

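# A minimal sketch of how the dataset download URL above is assembled with
# urlparse.urljoin (Python 2 stdlib; base URL and dataset id are made up).
# Note that urljoin drops the last path segment of the base unless the base
# ends with "/", which is why the relative part is joined as one string.
def _example_galaxy_download_url():
    import urlparse  # urllib.parse in Python 3
    base_url = "http://galaxy.example.org/"  # hypothetical Galaxy base URL
    dataset_id = "f2db41e1fa331b3e"          # hypothetical dataset id
    return urlparse.urljoin(
        base_url, '/'.join(['datasets', dataset_id, 'display?to_ext=txt']))
    # -> http://galaxy.example.org/datasets/f2db41e1fa331b3e/display?to_ext=txt
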
def create_igv_session(genome, uuids, is_file_uuid=False):
    """Creates session file for selected file UUIDs, returns newly created
    filestore UUID.

    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    :param is_file_uuid: whether the given UUIDs are FileStoreItem UUIDs
        rather than node UUIDs
    :type is_file_uuid: bool.
    """
    # Create IGV session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/

    <?xml version="1.0" encoding="UTF-8"?>
    <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    logger.debug("visualization_manager.create_igv_session called")
    # create the minidom document
    doc = Document()
    # create the <Global> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    # add resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)
    # get paths to URL
    for samp in uuids:
        # get filestore item
        curr_name, curr_url = get_file_name(samp, is_file_uuid=is_file_uuid)
        logger.debug('New resource: ' + curr_name + ' - ' + curr_url)
        # TODO: what to do if the filestore item does not exist?
        if curr_name:
            # create Resource element
            res = doc.createElement("Resource")
            res.setAttribute("name", curr_name)
            res.setAttribute("path", curr_url)
            xml_resources.appendChild(res)
    # create temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()
    # get file_store uuid
    filestore_uuid = create(tempfilename.name, filetype="xml")
    filestore_item = import_file(filestore_uuid, refresh=True)
    # rename file by way of file_store
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name) - 1] + '.xml'
    filestore_item = rename(filestore_uuid, temp_name)
    # delete temp file
    os.unlink(tempfilename.name)
    # URL for session file
    fs_url = get_full_url(filestore_item.get_datafile_url())
    # IGV URL for automatic launch of Java Web Start
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php" \
              "?sessionURL=" + fs_url
    return igv_url

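# A standalone illustration of the minidom calls used above: build the same
# Global/Resources skeleton and serialize it. The resource name and path are
# made up; only the element and attribute names follow the session format
# shown in the sample XML.
def _example_igv_session_xml():
    from xml.dom.minidom import Document
    doc = Document()
    root = doc.createElement("Global")
    root.setAttribute("genome", "hg18")
    root.setAttribute("locus", "All")
    root.setAttribute("version", "4")
    doc.appendChild(root)
    resources = doc.createElement("Resources")
    root.appendChild(resources)
    res = doc.createElement("Resource")
    res.setAttribute("name", "example.bed")  # hypothetical resource
    res.setAttribute("path", "http://example.org/example.bed")
    resources.appendChild(res)
    return doc.toprettyxml(indent="  ")
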
def add_igv_samples(fields, results_samp, annot_samples=None):
    """Creates the phenotype file for IGV.

    :param fields: Solr field names to include in the sample file
    :type fields: array.
    :param results_samp: Solr results for samples to be included
    :type results_samp: array.
    :param annot_samples: annotation files included with the Solr results
    :type annot_samples: array.
    """
    # create human readable indexes of fields to iterate over
    fields_dict = {}
    for i in fields:
        find_index = i.find("_Characteristics_")
        if find_index > -1:
            new_key = i.split("_Characteristics_")[0]
            fields_dict[i] = new_key
    # create temp file to enter into file_store
    temp_sample_name = tempfile.NamedTemporaryFile(delete=False)
    # write header to sample file
    temp_sample_name.write("#sampleTable" + "\n")
    # write column names to sample file
    col_names = "Linking_id"
    for k, v in fields_dict.iteritems():
        col_names = col_names + '\t' + v
    temp_sample_name.write(col_names + "\n")
    # iterate over sample files
    pheno_results = get_sample_lines(fields_dict, results_samp)
    try:
        temp_sample_name.write(pheno_results)
    except UnicodeEncodeError as e:
        logger.error("Could not write results to file: %s. "
                     "Trying again with the content encoded properly.", e)
        temp_sample_name.write(pheno_results.encode("utf-8"))
    # if annotations are not null
    if annot_samples:
        pheno_annot = get_sample_lines(fields_dict, annot_samples)
        temp_sample_name.write(pheno_annot)
    # close temp file
    temp_sample_name.close()
    # get file_store uuid
    filestore_uuid = create(temp_sample_name.name, filetype="txt")
    filestore_item = import_file(filestore_uuid, refresh=True)
    # rename file by way of file_store
    temp_file = filestore_item.datafile.name.split('/')
    temp_file = temp_file[len(temp_file) - 1] + '.txt'
    filestore_item = rename(filestore_uuid, temp_file)
    # get file information based on the file uuid
    curr_fs = FileStoreItem.objects.get(uuid=filestore_uuid)
    # full URL to the file for the selected UUID
    curr_url = get_full_url(curr_fs.get_datafile_url())
    # delete temp file
    os.unlink(temp_sample_name.name)
    return curr_url

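# The UnicodeEncodeError fallback above exists because Python 2 file objects
# expect byte strings: writing a unicode string containing non-ASCII
# characters triggers an implicit ASCII encode. A standalone Python 2 sketch
# of the same pattern (the content string is made up):
def _example_unicode_write():
    import tempfile
    text = u"r\u00e9sum\u00e9\n"  # unicode content with non-ASCII characters
    tmp = tempfile.NamedTemporaryFile()
    try:
        tmp.write(text)  # Python 2 implicitly encodes as ASCII here
    except UnicodeEncodeError:
        tmp.write(text.encode("utf-8"))  # fall back to explicit UTF-8 bytes
    tmp.close()
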
def create_igv_session_annot(genome, uuids, annot_uuids=None, samp_file=None):
    """Creates session file for selected file UUIDs, returns newly created
    filestore UUID.

    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    :param annot_uuids: Array of annotation file UUIDs to be included
    :type annot_uuids: array.
    :param samp_file: URL of the sample information file to be included
    :type samp_file: string.
    """
    # Create IGV session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/

    <?xml version="1.0" encoding="UTF-8"?>
    <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    # create the minidom document
    doc = Document()
    # create the <Global> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    # add resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)
    # add selected samples to xml file
    add_igv_resource(uuids["node_uuid"], xml_resources, doc)
    if annot_uuids:
        # add selected annotations to xml file
        add_igv_resource(annot_uuids["node_uuid"], xml_resources, doc)
    # add sample information file to IGV session file
    if samp_file:
        # <Resource name="Sample Information"
        #   path="http://igv.broadinstitute.org/data/hg18/tcga/gbm/gbmsubtypes/sampleTable.txt.gz"/>
        res = doc.createElement("Resource")
        res.setAttribute("name", "Sample Information")
        res.setAttribute("path", samp_file)
        xml_resources.appendChild(res)
    # <HiddenAttributes>
    #     <Attribute name="DATA FILE"/>
    #     <Attribute name="Linking_id"/>
    #     <Attribute name="DATA TYPE"/>
    # </HiddenAttributes>
    # add parameters to hide basic unnecessary sample info
    hidden_attr = doc.createElement("HiddenAttributes")
    xml.appendChild(hidden_attr)
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA FILE")
    hidden_attr.appendChild(attr)
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "Linking_id")
    hidden_attr.appendChild(attr)
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA TYPE")
    hidden_attr.appendChild(attr)
    # create temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()
    # get file_store uuid
    filestore_uuid = create(tempfilename.name, filetype="xml")
    filestore_item = import_file(filestore_uuid, refresh=True)
    # rename file by way of file_store
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name) - 1] + '.xml'
    filestore_item = rename(filestore_uuid, temp_name)
    # delete temp file
    os.unlink(tempfilename.name)
    # URL for session file
    sessionfile_url = get_full_url(filestore_item.get_datafile_url())
    # IGV URL for automatic launch of Java Web Start
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php" \
              "?sessionURL=" + sessionfile_url
    return igv_url

def _parse_node(self, headers, row):
    """row is a deque, column header is at position
    len(headers) - len(row)
    """
    # TODO: test if this is really a node
    header_components = self._split_header(headers[-len(row)])
    # TODO: for a node the number of header components must be 1
    # assert(len(header_components)) == 1
    # try to retrieve this node from the database (unless it is a
    # normalization or data transformation)
    is_new = True
    # name of the node
    node_name = row[0].strip()
    # TODO: remove this once it has been implemented in the preprocessing
    if (header_components[0] == Node.RAW_DATA_FILE and
            self.additional_raw_data_file_extension is not None and
            len(node_name) > 0):
        if not re.search(r'%s$' % self.additional_raw_data_file_extension,
                         node_name):
            node_name += self.additional_raw_data_file_extension
    if (header_components[0] in Node.ASSAYS | {
            Node.SAMPLE, Node.SOURCE, Node.EXTRACT, Node.LABELED_EXTRACT,
            Node.DATA_TRANSFORMATION, Node.NORMALIZATION} and
            len(node_name) > 0) or \
            (header_components[0] in Node.FILES and len(node_name) > 0):
        if header_components[0] in {Node.SAMPLE, Node.SOURCE}:
            # look up this type in the study only
            node, is_new = Node.objects.get_or_create(
                study=self._current_study,
                type=header_components[0],
                name=node_name)
        else:
            # look up this type in the study AND assay
            node, is_new = Node.objects.get_or_create(
                study=self._current_study,
                assay=self._current_assay,
                type=header_components[0],
                name=node_name)
        # this node represents a file - add the file to the file store and
        # store the file UUID in the node
        if (is_new and header_components[0] in Node.FILES and
                node_name != ""):
            # create the nodes for the data file in this row
            if self.file_base_path is None:
                file_path = node_name
            else:
                # test if this node is referring to a remote url
                components = urlparse(node_name)
                if components.scheme == "" or components.netloc == "":
                    # not a remote url
                    file_path = os.path.join(self.file_base_path, node_name)
                else:
                    file_path = node_name
            uuid = create(source=file_path)
            if uuid is not None:
                node.file_uuid = uuid
                node.save()
            else:
                logger.exception("Unable to add " + file_path +
                                 " to file store as a temporary file.")
        if is_new:
            logger.info("New node " + str(node) + " created.")
        else:
            logger.info("Node " + str(node) + " retrieved.")
    else:
        if len(node_name) > 0:
            node = Node.objects.create(
                study=self._current_study,
                assay=self._current_assay,
                type=header_components[0],
                name=node_name)
        else:
            # do not create empty nodes!
            node = None
    self._current_node = node
    if self._previous_node is not None and self._current_node is not None:
        try:
            # test if the parent-child link has already been created
            # (TODO: consider an if statement instead of try/except)
            node.parents.get(to_node_id=self._previous_node.id)
        except:
            self._previous_node.children.add(node)
            node.parents.add(self._previous_node)
            node.save()
            self._previous_node.save()
    else:
        # TODO: look up parent nodes in DB
        pass
    # remove the node from the row
    row.popleft()
    # read until we hit the next node
    while not self.is_node(headers[-len(row)]):
        if self._current_node is not None:
            if self.is_attribute(headers[-len(row)]):
                self._parse_attribute(headers, row)
            elif self.is_protocol_reference(headers[-len(row)]):
                self._parse_protocol_reference(headers, row)
            else:
                logger.error("Unexpected element " + headers[-len(row)] +
                             " when parsing node in line " +
                             str(self._current_reader.line_num) +
                             ", column " + str(len(headers) - len(row)) + ".")
                row.popleft()
        else:
            # node is None, pop until the next node because attributes
            # can't be attached to anything
            row.popleft()
    if self._current_node is not None:
        node.save()
        self._previous_node = node
        self._current_node = None
    return node

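# The headers[-len(row)] indexing used throughout the parser relies on the
# row being a deque consumed from the left: the current column header always
# sits at position len(headers) - len(row). A standalone sketch (headers and
# row contents are made up):
def _example_header_indexing():
    from collections import deque
    headers = ["Sample Name", "Characteristics[organism]", "Raw Data File"]
    row = deque(["sample-1", "Homo sapiens", "sample-1.fastq"])
    pairs = []
    while row:
        # header of the leftmost cell that has not been consumed yet
        pairs.append((headers[-len(row)], row.popleft()))
    # [('Sample Name', 'sample-1'),
    #  ('Characteristics[organism]', 'Homo sapiens'),
    #  ('Raw Data File', 'sample-1.fastq')]
    return pairs
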
def createIGVsession(genome, uuids, is_file_uuid=False):
    """Creates session file for selected file UUIDs, returns newly created
    filestore UUID.

    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    :param is_file_uuid: whether the given UUIDs are FileStoreItem UUIDs
        rather than node UUIDs
    :type is_file_uuid: bool.
    """
    # Create IGV session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/

    <?xml version="1.0" encoding="UTF-8"?>
    <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    logger.debug("visualization_manager.createIGVsession called")
    # create the minidom document
    doc = Document()
    # create the <Global> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    # add resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)
    # get paths to URL
    for samp in uuids:
        # get filestore item
        curr_name, curr_url = get_file_name(samp, is_file_uuid=is_file_uuid)
        logger.debug('New resource: ' + curr_name + ' - ' + curr_url)
        # TODO: what to do if the filestore item does not exist?
        if curr_name:
            # create Resource element
            res = doc.createElement("Resource")
            res.setAttribute("name", curr_name)
            res.setAttribute("path", curr_url)
            xml_resources.appendChild(res)
    # create temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()
    # get file_store uuid
    filestore_uuid = create(tempfilename.name, permanent=True, filetype="xml")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)
    # rename file by way of file_store
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name) - 1] + '.xml'
    filestore_item = rename(filestore_uuid, temp_name)
    # delete temp file
    os.unlink(tempfilename.name)
    # URL for session file
    fs_url = filestore_item.get_full_url()
    # IGV URL for automatic launch of Java Web Start
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php" \
              "?sessionURL=" + fs_url
    return igv_url

def addIGVSamples(fields, results_samp, annot_samples=None):
    """Creates the phenotype file for IGV.

    :param fields: Solr field names to include in the sample file
    :type fields: array.
    :param results_samp: Solr results for samples to be included
    :type results_samp: array.
    :param annot_samples: annotation files included with the Solr results
    :type annot_samples: array.
    """
    # create human readable indexes of fields to iterate over
    fields_dict = {}
    for i in fields:
        find_index = i.find("_Characteristics_")
        if find_index > -1:
            new_key = i.split("_Characteristics_")[0]
            fields_dict[i] = new_key
    # create temp file to enter into file_store
    tempsampname = tempfile.NamedTemporaryFile(delete=False)
    # write header to sample file
    tempsampname.write("#sampleTable" + "\n")
    # write column names to sample file
    col_names = "Linking_id"
    for k, v in fields_dict.iteritems():
        col_names = col_names + '\t' + v
    tempsampname.write(col_names + "\n")
    # iterate over sample files
    pheno_results = get_sample_lines(fields_dict, results_samp)
    tempsampname.write(pheno_results)
    # if annotations are not null
    if annot_samples:
        pheno_annot = get_sample_lines(fields_dict, annot_samples)
        tempsampname.write(pheno_annot)
    # close temp file
    tempsampname.close()
    # get file_store uuid
    filestore_uuid = create(tempsampname.name, permanent=True, filetype="txt")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)
    # rename file by way of file_store
    temp_file = filestore_item.datafile.name.split('/')
    temp_file = temp_file[len(temp_file) - 1] + '.txt'
    filestore_item = rename(filestore_uuid, temp_file)
    # get file information based on the file uuid
    curr_fs = FileStoreItem.objects.get(uuid=filestore_uuid)
    curr_name = curr_fs.datafile.name
    # full URL to the file for the selected UUID
    curr_url = curr_fs.get_full_url()
    # delete temp file
    os.unlink(tempsampname.name)
    return curr_url

def createIGVsessionAnnot(genome, uuids, annot_uuids=None, samp_file=None):
    """Creates session file for selected file UUIDs, returns newly created
    filestore UUID.

    :param genome: Genome to be used in session file i.e. hg18, dm3
    :type genome: string.
    :param uuids: Array of UUIDs to be used
    :type uuids: array.
    :param annot_uuids: Array of annotation file UUIDs to be included
    :type annot_uuids: array.
    :param samp_file: URL of the sample information file to be included
    :type samp_file: string.
    """
    # Create IGV session file and put into Filestore
    """
    http://www.postneo.com/projects/pyxml/

    <?xml version="1.0" encoding="UTF-8"?>
    <Global genome="hg18" locus="EGFR" version="3">
        <Resources>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/GBM_batch1-8_level3_exp.txt.recentered.080820.gct.tdf"/>
            <Resource name="RNA Genes"
                path="http://www.broadinstitute.org/igvdata/annotations/hg18/rna_genes.bed"/>
            <Resource name="sno/miRNA"
                path="http://www.broadinstitute.org/igvdata/tcga/gbm/Sample_info.txt"/>
        </Resources>
    </Global>
    """
    # create the minidom document
    doc = Document()
    # create the <Global> base element
    xml = doc.createElement("Global")
    xml.setAttribute("genome", genome)
    xml.setAttribute("locus", "All")
    xml.setAttribute("version", "4")
    doc.appendChild(xml)
    # add resources
    xml_resources = doc.createElement("Resources")
    xml.appendChild(xml_resources)
    # add selected samples to xml file
    addIGVResource(uuids["node_uuid"], xml_resources, doc)
    if annot_uuids:
        # add selected annotations to xml file
        addIGVResource(annot_uuids["node_uuid"], xml_resources, doc)
    # add sample information file to IGV session file
    if samp_file:
        # <Resource name="Sample Information"
        #   path="http://igv.broadinstitute.org/data/hg18/tcga/gbm/gbmsubtypes/sampleTable.txt.gz"/>
        res = doc.createElement("Resource")
        res.setAttribute("name", "Sample Information")
        res.setAttribute("path", samp_file)
        xml_resources.appendChild(res)
    # <HiddenAttributes>
    #     <Attribute name="DATA FILE"/>
    #     <Attribute name="Linking_id"/>
    #     <Attribute name="DATA TYPE"/>
    # </HiddenAttributes>
    # add parameters to hide basic unnecessary sample info
    hidden_attr = doc.createElement("HiddenAttributes")
    xml.appendChild(hidden_attr)
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA FILE")
    hidden_attr.appendChild(attr)
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "Linking_id")
    hidden_attr.appendChild(attr)
    attr = doc.createElement("Attribute")
    attr.setAttribute("name", "DATA TYPE")
    hidden_attr.appendChild(attr)
    # create temp file to enter into file_store
    tempfilename = tempfile.NamedTemporaryFile(delete=False)
    tempfilename.write(doc.toprettyxml(indent="  "))
    tempfilename.close()
    # get file_store uuid
    filestore_uuid = create(tempfilename.name, permanent=True, filetype="xml")
    filestore_item = import_file(filestore_uuid, permanent=True, refresh=True)
    # rename file by way of file_store
    temp_name = filestore_item.datafile.name.split('/')
    temp_name = temp_name[len(temp_name) - 1] + '.xml'
    filestore_item = rename(filestore_uuid, temp_name)
    # delete temp file
    os.unlink(tempfilename.name)
    # URL for session file
    fs_url = filestore_item.get_full_url()
    # IGV URL for automatic launch of Java Web Start
    igv_url = "http://www.broadinstitute.org/igv/projects/current/igv.php" \
              "?sessionURL=" + fs_url
    return igv_url

def download_history_files(analysis):
    """Download entire histories from Galaxy: get files out of the history
    and into the file store.
    """
    logger.debug("analysis_manager.download_history_files called")
    # retrieve list of files to download for workflow
    # TODO: handle Django exceptions
    analysis = Analysis.objects.get(uuid=analysis.uuid)
    dl_files = analysis.workflow_dl_files
    # create dictionary based on files to download predetermined by workflow
    # w/ keep operators
    dl_dict = {}
    for dl in dl_files.all():
        temp_dict = {}
        temp_dict['filename'] = dl.filename
        temp_dict['pair_id'] = dl.pair_id
        dl_dict[str(dl.step_id)] = temp_dict
    task_list = []
    # get current Galaxy connection
    connection = analysis.get_galaxy_connection()
    try:
        download_list = connection.get_history_file_list(analysis.history_id)
    except RuntimeError as exc:
        error_msg = "Post-processing failed: " + \
                    "error downloading Galaxy history files for analysis " \
                    "'{}': {}".format(analysis.name, exc.message)
        logger.error(error_msg)
        if not isinstance(exc, (ConnectionError, TimeoutError, AuthError)):
            analysis.set_status(Analysis.FAILURE_STATUS, error_msg)
            try:
                analysis.delete_galaxy_library()
                analysis.delete_galaxy_workflow()
                analysis.delete_galaxy_history()
            except RuntimeError:
                logger.error("Cleanup failed for analysis '{}'".format(
                    analysis.name))
        return task_list
    # iterate through files in the current Galaxy history
    for results in download_list:
        # download file if result state is "ok"
        if results['state'] == 'ok':
            file_type = results["type"]
            curr_file_id = results['name']
            if curr_file_id in dl_dict:
                curr_dl_dict = dl_dict[curr_file_id]
                result_name = curr_dl_dict['filename'] + '.' + file_type
                # size of file defined by Galaxy
                file_size = results['file_size']
                # determine whether Galaxy results should be downloaded
                # through HTTP or by copying files directly
                local_download = \
                    analysis.workflow.workflow_engine.instance.local_download
                # HTML files are retrieved as zip archives via the dataset URL
                if local_download and file_type != 'html':
                    download_url = results['file_name']
                else:
                    download_url = connection.make_url(
                        str(results['dataset_id']), is_data=True, key=False)
                # workaround to set the correct file type for zip archives of
                # reports produced by FastQC
                if file_type == 'html':
                    file_type = 'zip'
                # get file_store uuid
                # TODO: when changing permanent=True, fix update of % download
                # of file
                filestore_uuid = create(source=download_url,
                                        filetype=file_type,
                                        permanent=False)
                # add history files to Django model
                temp_file = AnalysisResult(analysis_uuid=analysis.uuid,
                                           file_store_uuid=filestore_uuid,
                                           file_name=result_name,
                                           file_type=file_type)
                temp_file.save()
                analysis.results.add(temp_file)
                analysis.save()
                # download analysis results into file_store
                # only download files if size is greater than zero
                if file_size > 0:
                    # for local downloads, force copying into the file_store
                    # instead of symlinking
                    if local_download:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, True, file_size,))
                    else:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, False, file_size,))
                    task_list.append(task_id)
    return task_list

def _parse_node(self, headers, row):
    """row is a deque, column header is at position
    len(headers) - len(row)
    """
    # TODO: test if this is really a node
    header_components = self._split_header(headers[-len(row)])
    # TODO: for a node the number of header components must be 1
    # assert(len(header_components)) == 1
    # try to retrieve this node from the database (unless it is a
    # normalization or data transformation)
    is_new = True
    # name of the node
    node_name = row[0].strip()
    # TODO: remove this once it has been implemented in the preprocessing
    if (header_components[0] == Node.RAW_DATA_FILE and
            self.additional_raw_data_file_extension is not None and
            len(node_name) > 0):
        if not re.search(
                r'%s$' % self.additional_raw_data_file_extension, node_name):
            node_name += self.additional_raw_data_file_extension
    if (header_components[0] in Node.ASSAYS |
            {Node.SAMPLE, Node.SOURCE, Node.EXTRACT, Node.LABELED_EXTRACT,
             Node.DATA_TRANSFORMATION, Node.NORMALIZATION} and
            len(node_name) > 0) or \
            (header_components[0] in Node.FILES and len(node_name) > 0):
        if header_components[0] in {Node.SAMPLE, Node.SOURCE}:
            node, is_new = Node.objects.get_or_create(
                study=self._current_study,
                type=header_components[0],
                name=node_name)
        else:
            node, is_new = Node.objects.get_or_create(
                study=self._current_study,
                assay=self._current_assay,
                type=header_components[0],
                name=node_name)
        # this node represents a file - add the file to the file store and
        # store the file UUID in the node
        if (is_new and header_components[0] in Node.FILES and
                node_name != ""):
            # create the nodes for the data file in this row
            if self.file_base_path is None:
                file_path = node_name
            else:
                # test if this node is referring to a remote url
                components = urlparse(node_name)
                if components.scheme == "" or components.netloc == "":
                    # not a remote url
                    file_path = os.path.join(self.file_base_path, node_name)
                else:
                    file_path = node_name
            uuid = create(source=file_path)
            if uuid is not None:
                node.file_uuid = uuid
                node.save()
            else:
                logger.exception(
                    "Unable to add " + file_path + " to file store as a "
                    "temporary file.")
        if is_new:
            logger.info("New node " + str(node) + " created.")
        else:
            logger.info("Node " + str(node) + " retrieved.")
    else:
        if len(node_name) > 0:
            node = Node.objects.create(
                study=self._current_study,
                assay=self._current_assay,
                type=header_components[0],
                name=node_name)
        else:
            # do not create empty nodes!
            node = None
    self._current_node = node
    if self._previous_node is not None and self._current_node is not None:
        try:
            # test if the parent-child link has already been created
            # (TODO: consider an if statement instead of try/except)
            node.parents.get(to_node_id=self._previous_node.id)
        except:
            self._previous_node.children.add(node)
            node.parents.add(self._previous_node)
            node.save()
            self._previous_node.save()
    else:
        # TODO: look up parent nodes in DB
        pass
    # remove the node from the row
    row.popleft()
    # read until we hit the next node
    while not self.is_node(headers[-len(row)]):
        if self._current_node is not None:
            if self.is_attribute(headers[-len(row)]):
                self._parse_attribute(headers, row)
            elif self.is_protocol_reference(headers[-len(row)]):
                self._parse_protocol_reference(headers, row)
            else:
                logger.error(
                    "Unexpected element " + headers[-len(row)] + " when "
                    "parsing node in line " +
                    str(self._current_reader.line_num) + ", column " +
                    str(len(headers) - len(row)) + ".")
                row.popleft()
        else:
            # node is None, pop until the next node because attributes
            # can't be attached to anything
            row.popleft()
    if self._current_node is not None:
        node.save()
        self._previous_node = node
        self._current_node = None
    return node

def get_galaxy_download_tasks(analysis):
    """Get file import tasks for Galaxy analysis results."""
    logger.debug("Preparing to download analysis results from Galaxy")
    # retrieve list of files to download for workflow
    dl_files = analysis.workflow_dl_files
    # create dictionary based on files to download predetermined by workflow
    # w/ keep operators
    dl_dict = {}
    for dl in dl_files.all():
        temp_dict = {}
        temp_dict['filename'] = dl.filename
        temp_dict['pair_id'] = dl.pair_id
        dl_dict[str(dl.step_id)] = temp_dict
    task_list = []
    galaxy_instance = analysis.workflow.workflow_engine.instance
    try:
        download_list = galaxy_instance.get_history_file_list(
            analysis.history_id)
    except galaxy.client.ConnectionError as exc:
        error_msg = "Error downloading Galaxy history files for analysis " \
                    "'%s': %s"
        logger.error(error_msg, analysis.name, exc.message)
        analysis.set_status(Analysis.FAILURE_STATUS,
                            error_msg % (analysis.name, exc.message))
        analysis.galaxy_cleanup()
        return task_list
    # iterate through files in the current Galaxy history
    for results in download_list:
        # download file if result state is "ok"
        if results['state'] == 'ok':
            file_type = results["type"]
            curr_file_id = results['name']
            if curr_file_id in dl_dict:
                curr_dl_dict = dl_dict[curr_file_id]
                result_name = curr_dl_dict['filename'] + '.' + file_type
                # size of file defined by Galaxy
                file_size = results['file_size']
                # determine whether Galaxy results should be downloaded
                # through HTTP or by copying files directly; HTML files are
                # retrieved as zip archives via the dataset URL
                if galaxy_instance.local_download and file_type != 'html':
                    download_url = results['file_name']
                else:
                    download_url = urlparse.urljoin(
                        galaxy_instance.base_url, '/'.join(
                            ['datasets', str(results['dataset_id']),
                             'display?to_ext=txt']))
                # workaround to set the correct file type for zip archives of
                # FastQC HTML reports produced by Galaxy dynamically
                if file_type == 'html':
                    file_type = 'zip'
                # TODO: when changing permanent=True, fix update of % download
                # of file
                filestore_uuid = create(
                    source=download_url, filetype=file_type, permanent=False)
                # add history files to Django model
                temp_file = AnalysisResult(
                    analysis_uuid=analysis.uuid,
                    file_store_uuid=filestore_uuid,
                    file_name=result_name,
                    file_type=file_type)
                temp_file.save()
                analysis.results.add(temp_file)
                analysis.save()
                # download analysis results into file_store
                # only download files if size is greater than zero
                if file_size > 0:
                    # for local downloads, force copying into the file_store
                    # instead of symlinking
                    if galaxy_instance.local_download:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, True, file_size,))
                    else:
                        task_id = import_file.subtask(
                            (filestore_uuid, False, False, file_size,))
                    task_list.append(task_id)
    return task_list

def run(self, path, isa_archive=None, preisa_archive=None):
    """If path is a file it will be treated as an ISArchive, if it is a
    directory it will be treated as an extracted ISArchive. Assumes that
    the archive extracts into a subdirectory named <archive> if the
    ISArchive is called <archive>.zip.
    """
    # reset all variables
    self._current_investigation = None
    self._current_study = None
    self._current_assay = None
    self._current_node = None
    self._previous_node = None
    self._current_attribute = None
    self._current_protocol_reference = None
    self._current_reader = None
    self._current_file = None
    self._current_file_name = None
    # 1. test if archive needs to be extracted and extract if necessary
    if not os.path.isdir(path):
        # assign to isa_archive if it's an archive anyway
        isa_archive = path
        logger.info(
            "Supplied path \"" + path + "\" is not a directory. Assuming "
            "ISArchive file.")
        try:
            # TODO: do we need a random subdirectory here?
            extract_path = tempfile.mkdtemp()
            with ZipFile(path, 'r') as zip:
                # test if any paths are relative or absolute and outside
                # the extract path
                for name in zip.namelist():
                    if name.startswith("..") or name.startswith("/"):
                        logger.exception(
                            "Unable to extract assumed ISArchive file \"" +
                            path + "\" due to illegal file path: " + name)
                # extract archive
                zip.extractall(extract_path)
                first_file = zip.namelist()[0]
                # test if first entry in zip file is a path
                if first_file.endswith("/"):
                    # add archive subdirectory to path
                    extract_path = os.path.join(extract_path, first_file)
                elif re.search(r'/', first_file):
                    ind = string.find(first_file, '/')
                    extract_path = os.path.join(
                        extract_path, first_file[:ind])
                logger.info(
                    "ISArchive extracted to \"" + extract_path + "\".")
                path = extract_path
        except:
            logger.exception(
                "Unable to extract assumed ISArchive file \"" + path + "\".")
    # 2. identify investigation file
    try:
        investigation_file_name = glob.glob("%s/i*.txt" % path).pop()
    except IndexError as exception:
        logger.exception(
            "Unable to identify ISArchive file in \"" + path + "\".")
        raise exception
    # 3. parse investigation file and identify study files and
    # corresponding assay files
    self._parse_investigation_file(investigation_file_name)
    # 4. parse all study files and corresponding assay files
    if self._current_investigation is not None:
        # identify studies associated with this investigation
        for study in self._current_investigation.study_set.all():
            # parse study file
            self._current_assay = None
            study_file_name = os.path.join(path, study.file_name)
            if data_set_manager.tasks.fix_last_col(study_file_name):
                self._parse_study_file(study, study_file_name)
                for assay in study.assay_set.all():
                    # parse assay file
                    self._previous_node = None
                    assay_file_name = os.path.join(path, assay.file_name)
                    if data_set_manager.tasks.fix_last_col(assay_file_name):
                        self._parse_assay_file(study, assay, assay_file_name)
    else:
        logger.exception(
            "No investigation was identified when parsing investigation "
            "file \"" + investigation_file_name + "\"")
        raise Exception()
    # 5. assign ISA-Tab archive and pre-ISA-Tab archive if present
    try:
        self._current_investigation.isarchive_file = create(isa_archive)
        import_file(self._current_investigation.isarchive_file, refresh=True)
    except:
        pass
    if preisa_archive:
        self._current_investigation.pre_isarchive_file = \
            create(preisa_archive)
        import_file(self._current_investigation.pre_isarchive_file,
                    refresh=True)
    self._current_investigation.save()
    return self._current_investigation

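# The namelist() check above guards against path traversal ("zip slip")
# before extracting. A standalone sketch of the same check, using a
# throwaway archive created in a temp directory (file names are made up):
def _example_safe_zip_extract():
    import os
    import tempfile
    from zipfile import ZipFile
    tmp_dir = tempfile.mkdtemp()
    archive_path = os.path.join(tmp_dir, "example.zip")
    with ZipFile(archive_path, 'w') as archive:
        archive.writestr("data/readme.txt", "hello")
    with ZipFile(archive_path, 'r') as archive:
        # reject entries that would escape the extraction directory
        for name in archive.namelist():
            if name.startswith("..") or name.startswith("/"):
                raise RuntimeError("illegal file path in archive: " + name)
        archive.extractall(tmp_dir)
    return os.path.join(tmp_dir, "data", "readme.txt")
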
def _parse_file(self, file_name):
    try:
        self._current_file = open(file_name, "rU")
        self._current_reader = csv.reader(self._current_file,
                                          dialect="excel-tab",
                                          delimiter=self.delimiter)
    except:
        logger.exception(
            "Unable to read file " + str(self._current_file) + ".")
    # create investigation, study and assay objects
    investigation = self._create_investigation()
    study = self._create_study(investigation=investigation,
                               file_name=file_name)
    assay = self._create_assay(study=study, file_name=file_name)
    # import the file as a "pre-isa" file
    logger.info('trying to add pre-isa archive file %s' % file_name)
    investigation.pre_isarchive_file = create(file_name, permanent=True)
    import_file(investigation.pre_isarchive_file, refresh=True,
                permanent=True)
    investigation.save()
    # read column headers
    headers = self._current_reader.next()
    # compute absolute file_column_index (in case a negative value was
    # provided)
    if self.file_column_index >= 0:
        internal_file_column_index = self.file_column_index
    else:
        internal_file_column_index = len(headers) + self.file_column_index
    # compute absolute auxiliary_file_column_index (in case a negative value
    # was provided)
    if self.auxiliary_file_column_index is not None:
        if self.auxiliary_file_column_index >= 0:
            internal_auxiliary_file_column_index = \
                self.auxiliary_file_column_index
        else:
            internal_auxiliary_file_column_index = \
                len(headers) + self.auxiliary_file_column_index
    else:
        internal_auxiliary_file_column_index = None
    # TODO: test if there are fewer columns than required
    logger.debug("Parsing with file column %s and auxiliary file column %s."
                 % (internal_file_column_index,
                    internal_auxiliary_file_column_index))
    # iterate over non-header rows in file
    for row in self._current_reader:
        # TODO: resolve relative indices
        internal_source_column_index = self.source_column_index
        internal_sample_column_index = self.sample_column_index
        internal_assay_column_index = self.assay_column_index
        # add data file to file store
        if self.file_base_path is None:
            file_path = row[internal_file_column_index].strip()
        else:
            file_path = os.path.join(
                self.file_base_path,
                row[internal_file_column_index].strip())
        file_uuid = create(source=file_path, permanent=self.file_permanent)
        if file_uuid is not None:
            logger.debug("Added data file " + file_path + " to file store.")
        else:
            logger.exception("Unable to add data file " + file_path +
                             " to file store.")
        # add auxiliary file to file store
        auxiliary_file_uuid = None
        if internal_auxiliary_file_column_index is not None:
            if self.file_base_path is None:
                auxiliary_file_path = \
                    row[internal_auxiliary_file_column_index].strip()
            else:
                auxiliary_file_path = os.path.join(
                    self.file_base_path,
                    row[internal_auxiliary_file_column_index].strip())
            auxiliary_file_uuid = create(source=auxiliary_file_path,
                                         permanent=self.file_permanent)
            if auxiliary_file_uuid is not None:
                logger.debug("Added auxiliary file " + auxiliary_file_path +
                             " to file store.")
            else:
                logger.exception("Unable to add auxiliary file " +
                                 auxiliary_file_path + " to file store.")
        # add files to file server
        file_server.models.add(file_uuid, auxiliary_file_uuid)
        # create nodes if file was successfully created
        # source node
        source_name = self._create_name(row, internal_source_column_index,
                                        internal_file_column_index)
        source_node, is_source_new = Node.objects.get_or_create(
            study=study, name=source_name, type=Node.SOURCE)
        # sample node
        sample_name = self._create_name(row, internal_sample_column_index,
                                        internal_file_column_index)
        sample_node, is_sample_new = Node.objects.get_or_create(
            study=study, name=sample_name, type=Node.SAMPLE)
        source_node.add_child(sample_node)
        # assay node
        assay_name = self._create_name(row, internal_assay_column_index,
                                       internal_file_column_index)
        assay_node, is_assay_new = Node.objects.get_or_create(
            study=study, assay=assay, name=assay_name, type=Node.ASSAY)
        sample_node.add_child(assay_node)
        file_node = Node.objects.create(
            study=study,
            assay=assay,
            name=row[internal_file_column_index].strip(),
            file_uuid=file_uuid,
            type=Node.RAW_DATA_FILE,
            species=self._get_species(row),
            genome_build=self._get_genome_build(row),
            is_annotation=self._is_annotation(row))
        assay_node.add_child(file_node)
        # iterate over columns to create attributes to attach to the
        # sample node
        for column_index in range(0, len(row)):
            # skip data file columns
            if (internal_file_column_index == column_index or
                    internal_auxiliary_file_column_index == column_index or
                    self.annotation_column_index == column_index):
                continue
            # create attribute as characteristic and attach to sample node
            # if the sample node was newly created
            if is_sample_new:
                attribute = Attribute.objects.create(
                    node=sample_node,
                    type=Attribute.CHARACTERISTICS,
                    subtype=headers[column_index].strip().lower(),
                    value=row[column_index].strip())
    return investigation

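# The index arithmetic above converts a negative column index (counted from
# the right, as for Python sequences) into an absolute position within the
# header row. A standalone sketch with a made-up header row:
def _example_resolve_column_index():
    headers = ["source", "sample", "assay", "file"]  # hypothetical columns

    def resolve(index):
        # negative indices count from the right-hand end of the header row
        return index if index >= 0 else len(headers) + index

    return resolve(-1), resolve(1)  # (3, 1): the "file" and "sample" columns
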
def run(self, path, isa_archive=None, preisa_archive=None):
    """If path is a file it will be treated as an ISArchive, if it is a
    directory it will be treated as an extracted ISArchive. Assumes that
    the archive extracts into a subdirectory named <archive> if the
    ISArchive is called <archive>.zip.
    """
    # reset all variables
    self._current_investigation = None
    self._current_study = None
    self._current_assay = None
    self._current_node = None
    self._previous_node = None
    self._current_attribute = None
    self._current_protocol_reference = None
    self._current_reader = None
    self._current_file = None
    self._current_file_name = None
    # 1. test if archive needs to be extracted and extract if necessary
    if not os.path.isdir(path):
        # assign to isa_archive if it's an archive anyway
        isa_archive = path
        logger.info("Supplied path \"" + path +
                    "\" is not a directory. Assuming ISArchive file.")
        try:
            # TODO: do we need a random subdirectory here?
            extract_path = tempfile.mkdtemp()
            with ZipFile(path, 'r') as zip:
                # test if any paths are relative or absolute and outside
                # the extract path
                for name in zip.namelist():
                    if name.startswith("..") or name.startswith("/"):
                        logger.exception(
                            "Unable to extract assumed ISArchive file \"" +
                            path + "\" due to illegal file path: " + name)
                # extract archive
                zip.extractall(extract_path)
                first_file = zip.namelist()[0]
                # test if first entry in zip file is a path
                if first_file.endswith("/"):
                    # add archive subdirectory to path
                    extract_path = os.path.join(extract_path, first_file)
                elif re.search(r'/', first_file):
                    ind = string.find(first_file, '/')
                    extract_path = os.path.join(extract_path,
                                                first_file[:ind])
                logger.info(
                    "ISArchive extracted to \"" + extract_path + "\".")
                path = extract_path
        except:
            logger.exception(
                "Unable to extract assumed ISArchive file \"" + path + "\".")
    # 2. identify investigation file
    try:
        investigation_file_name = glob.glob("%s/i*.txt" % path).pop()
    except IndexError as exception:
        logger.exception(
            "Unable to identify ISArchive file in \"" + path + "\".")
        raise exception
    # 3. parse investigation file and identify study files and corresponding
    # assay files
    self._parse_investigation_file(investigation_file_name)
    # 4. parse all study files and corresponding assay files
    if self._current_investigation is not None:
        # identify studies associated with this investigation
        for study in self._current_investigation.study_set.all():
            # parse study file
            self._current_assay = None
            study_file_name = os.path.join(path, study.file_name)
            if data_set_manager.tasks.fix_last_col(study_file_name):
                self._parse_study_file(study, study_file_name)
                for assay in study.assay_set.all():
                    # parse assay file
                    self._previous_node = None
                    assay_file_name = os.path.join(path, assay.file_name)
                    if data_set_manager.tasks.fix_last_col(assay_file_name):
                        self._parse_assay_file(study, assay, assay_file_name)
    else:
        logger.exception(
            "No investigation was identified when parsing investigation "
            "file \"" + investigation_file_name + "\"")
        raise Exception()
    # assign ISA-Tab archive and pre-ISA-Tab archive if present
    try:
        self._current_investigation.isarchive_file = create(isa_archive,
                                                            permanent=True)
    except:
        pass
    if preisa_archive:
        self._current_investigation.pre_isarchive_file = create(
            preisa_archive, permanent=True)
    self._current_investigation.save()
    return self._current_investigation

def run(self):
    # create investigation, study and assay objects
    investigation = self._create_investigation()
    # FIXME: self.metadata_file.name may not be informative, especially in
    # case of temp files that don't exist on disk
    study = self._create_study(investigation=investigation,
                               file_name=self.metadata_file.name)
    assay = self._create_assay(study=study,
                               file_name=self.metadata_file.name)
    # import the file as a "pre-isa" file
    logger.info("trying to add pre-isa archive file %s",
                self.metadata_file.name)
    # FIXME: this will not create a FileStoreItem if self.metadata_file
    # does not exist on disk (e.g., a file object like TemporaryFile)
    investigation.pre_isarchive_file = create(
        self.metadata_file.name, permanent=True)
    import_file(investigation.pre_isarchive_file, refresh=True)
    investigation.save()
    # TODO: test if there are fewer columns than required
    logger.debug("Parsing with file column %s and "
                 "auxiliary file column %s",
                 self.file_column_index, self.auxiliary_file_column_index)
    # UUIDs of data files to postpone importing until parsing is finished
    data_files = []
    # iterate over non-header rows in file
    for row in self.metadata_reader:
        # TODO: resolve relative indices
        internal_source_column_index = self.source_column_index
        internal_sample_column_index = self.sample_column_index
        internal_assay_column_index = self.assay_column_index
        # add data file to file store
        data_file_path = self.file_source_translator(
            row[self.file_column_index])
        data_file_uuid = create(
            source=data_file_path, permanent=self.file_permanent)
        data_files.append(data_file_uuid)
        # add auxiliary file to file store
        if self.auxiliary_file_column_index:
            auxiliary_file_path = self.file_source_translator(
                row[self.auxiliary_file_column_index])
            auxiliary_file_uuid = create(
                source=auxiliary_file_path, permanent=self.file_permanent)
            data_files.append(auxiliary_file_uuid)
        else:
            auxiliary_file_uuid = None
        # add files to file server
        # TODO: add error handling in case of None values for UUIDs
        file_server.models.add(data_file_uuid, auxiliary_file_uuid)
        # create nodes if file was successfully created
        # source node
        source_name = self._create_name(
            row, internal_source_column_index, self.file_column_index)
        source_node, is_source_new = Node.objects.get_or_create(
            study=study, name=source_name, type=Node.SOURCE)
        # sample node
        sample_name = self._create_name(
            row, internal_sample_column_index, self.file_column_index)
        sample_node, is_sample_new = Node.objects.get_or_create(
            study=study, name=sample_name, type=Node.SAMPLE)
        source_node.add_child(sample_node)
        # assay node
        assay_name = self._create_name(
            row, internal_assay_column_index, self.file_column_index)
        assay_node, is_assay_new = Node.objects.get_or_create(
            study=study, assay=assay, name=assay_name, type=Node.ASSAY)
        sample_node.add_child(assay_node)
        file_node = Node.objects.create(
            study=study,
            assay=assay,
            name=row[self.file_column_index].strip(),
            file_uuid=data_file_uuid,
            type=Node.RAW_DATA_FILE,
            species=self._get_species(row),
            genome_build=self._get_genome_build(row),
            is_annotation=self._is_annotation(row))
        assay_node.add_child(file_node)
        # iterate over columns to create attributes to attach to the
        # sample node
        for column_index in range(0, len(row)):
            # skip data file columns
            if (self.file_column_index == column_index or
                    self.auxiliary_file_column_index == column_index or
                    self.annotation_column_index == column_index):
                continue
            # create attribute as characteristic and attach to sample node
            # if the sample node was newly created
            if is_sample_new:
                Attribute.objects.create(
                    node=sample_node,
                    type=Attribute.CHARACTERISTICS,
                    subtype=self.headers[column_index].strip().lower(),
                    value=row[column_index].strip())
    # kick off data file importing tasks
    for uuid in data_files:
        import_file.delay(uuid)
    return investigation

def run(self):
    # create investigation, study and assay objects
    investigation = self._create_investigation()
    # FIXME: self.metadata_file.name may not be informative, especially in
    # case of temp files that don't exist on disk
    study = self._create_study(investigation=investigation,
                               file_name=self.metadata_file.name)
    assay = self._create_assay(study=study,
                               file_name=self.metadata_file.name)
    # import the file as a "pre-isa" file
    logger.info("trying to add pre-isa archive file %s",
                self.metadata_file.name)
    # FIXME: this will not create a FileStoreItem if self.metadata_file
    # does not exist on disk (e.g., a file object like TemporaryFile)
    investigation.pre_isarchive_file = create(self.metadata_file.name)
    import_file(investigation.pre_isarchive_file, refresh=True)
    investigation.save()
    # TODO: test if there are fewer columns than required
    logger.debug("Parsing with file column %s and "
                 "auxiliary file column %s",
                 self.file_column_index, self.auxiliary_file_column_index)
    # UUIDs of data files to postpone importing until parsing is finished
    data_files = []
    # iterate over non-header rows in file
    for row in self.metadata_reader:
        # TODO: resolve relative indices
        internal_source_column_index = self.source_column_index
        internal_sample_column_index = self.sample_column_index
        internal_assay_column_index = self.assay_column_index
        # add data file to file store
        data_file_path = self.file_source_translator(
            row[self.file_column_index])
        data_file_uuid = create(source=data_file_path)
        data_files.append(data_file_uuid)
        # add auxiliary file to file store
        if self.auxiliary_file_column_index:
            auxiliary_file_path = self.file_source_translator(
                row[self.auxiliary_file_column_index])
            auxiliary_file_uuid = create(source=auxiliary_file_path)
            data_files.append(auxiliary_file_uuid)
        else:
            auxiliary_file_uuid = None
        # add files to file server
        # TODO: add error handling in case of None values for UUIDs
        file_server.models.add(data_file_uuid, auxiliary_file_uuid)
        # create nodes if file was successfully created
        # source node
        source_name = self._create_name(
            row, internal_source_column_index, self.file_column_index)
        source_node, is_source_new = Node.objects.get_or_create(
            study=study, name=source_name, type=Node.SOURCE)
        # sample node
        sample_name = self._create_name(
            row, internal_sample_column_index, self.file_column_index)
        sample_node, is_sample_new = Node.objects.get_or_create(
            study=study, name=sample_name, type=Node.SAMPLE)
        source_node.add_child(sample_node)
        # assay node
        assay_name = self._create_name(
            row, internal_assay_column_index, self.file_column_index)
        assay_node, is_assay_new = Node.objects.get_or_create(
            study=study, assay=assay, name=assay_name, type=Node.ASSAY)
        sample_node.add_child(assay_node)
        file_node = Node.objects.create(
            study=study,
            assay=assay,
            name=row[self.file_column_index].strip(),
            file_uuid=data_file_uuid,
            type=Node.RAW_DATA_FILE,
            species=self._get_species(row),
            genome_build=self._get_genome_build(row),
            is_annotation=self._is_annotation(row))
        assay_node.add_child(file_node)
        # iterate over columns to create attributes to attach to the
        # sample node
        for column_index in range(0, len(row)):
            # skip data file columns
            if (self.file_column_index == column_index or
                    self.auxiliary_file_column_index == column_index or
                    self.annotation_column_index == column_index):
                continue
            # create attribute as characteristic and attach to sample node
            # if the sample node was newly created
            if is_sample_new:
                Attribute.objects.create(
                    node=sample_node,
                    type=Attribute.CHARACTERISTICS,
                    subtype=self.headers[column_index].strip().lower(),
                    value=row[column_index].strip())
    # Start remote file import tasks if the `Make Import Permanent:` flag is
    # set by the user. Likewise, try to import these files if their source
    # begins with the REFINERY_DATA_IMPORT_DIR setting (this is the case
    # when users upload data files associated with their metadata).
    for uuid in data_files:
        try:
            file_store_item = FileStoreItem.objects.get(uuid=uuid)
        except (FileStoreItem.DoesNotExist,
                FileStoreItem.MultipleObjectsReturned) as e:
            logger.error("Couldn't properly fetch FileStoreItem %s", e)
        else:
            if (self.file_permanent or file_store_item.source.startswith(
                    (settings.REFINERY_DATA_IMPORT_DIR, 's3://'))):
                import_file.delay(uuid)
    return investigation

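# The final loop above defers imports until parsing is done, then only
# starts a task when the permanent flag is set or the file source matches
# known importable prefixes; note that str.startswith() accepts a tuple of
# prefixes. A standalone sketch of that filter (the directory and source
# paths below are made up):
def _example_import_filter():
    import_dir = "/data/import/"  # stand-in for REFINERY_DATA_IMPORT_DIR
    file_permanent = False
    sources = ["/data/import/user1/metadata.csv",
               "s3://example-bucket/sample.fastq",
               "http://example.org/remote.bam"]
    # keeps the first two sources: one local upload, one S3 object
    return [source for source in sources
            if file_permanent or source.startswith((import_dir, 's3://'))]
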