class BatchFilesCreator(object):
    """
    Create one output file per source with a CLA wrapper and link every
    created file into a dedicated subfolder of a base folder.
    """

    def __init__(self, cla, base_folder, friendly_name, custom_args=None):
        """
        Constructor of the general batch files creator, to create multiple files from a CLA.

        :param cla: a ``CLApplication`` object, wrapper for the corresponding CLA
        :param base_folder: accession of the base folder where the pipeline files will be organised into subfolders
        :param friendly_name: user-friendly name of the files produced by the app ; used in the on-screen statements
            and in the name of the project subfolders
        :param custom_args: list of custom command-line argument strings for the files. Default is ``None``
        """
        self._cla = cla
        self._files_util = FilesUtil(cla.connection)
        self._base_folder = base_folder
        self._friendly_name = friendly_name
        self._custom_args = custom_args

    def create_files(self, sources):
        """
        Create one output file per source and link each of them into a new folder.

        The folder is named after ``friendly_name`` and created inside the base folder.
        A progress line is printed after each file.

        :param sources: iterable of sources; each item is passed to ``_create_output_file``
        :return: the values returned by ``_create_output_file``, in source order
        :rtype: list
        """
        print('Creating %s files...' % self._friendly_name)
        output_folder = self._files_util.create_folder(self._friendly_name, parent=self._base_folder)
        # Materialise the sources so that the total is known even for iterators.
        sources = list(sources)
        total = len(sources)
        output_files = []
        for i, source in enumerate(sources, 1):
            output = self._create_output_file(source)
            self._files_util.link_file(output, output_folder)
            # The denominator is the number of sources, not the length of the output value.
            print('Created %s file %s (%d/%d)' % (self._friendly_name, output, i, total))
            output_files.append(output)
        return output_files

    # this method can be overridden in child classes to allow for more complex file creation logic
    def _create_output_file(self, source):
        """
        Create a single output file from ``source`` with the wrapped CLA.

        When custom arguments were supplied to the constructor, they are applied
        to the created file through ``change_command_line_arguments``.

        :param source: value passed to ``create_file`` of the CLA wrapper
        :return: value returned by ``create_file`` of the CLA wrapper
        """
        output = self._cla.create_file(source)
        if self._custom_args:
            self._cla.change_command_line_arguments(output, self._custom_args)
        return output
def test_en_rna_seq(conn, keep_files):
    """
    Create an expression navigator file for the groups in ``RNA_SEQ_GROUPS``.

    The file is created with the ``PKG_DESEQ`` R package constant and the
    organism ``"new organism"``. Unless ``keep_files`` is truthy, a created
    file is unlinked from the ``CREATED`` special folder afterwards.

    :param conn: connection passed to ``FilesUtil`` and the navigator wrapper
    :param keep_files: when truthy, the created file is left in place
    """
    files_util = FilesUtil(conn)
    navigator = ExpressionNavigatorforGenes(conn)
    created_file = None
    try:
        groups = []
        for accessions in RNA_SEQ_GROUPS:
            groups.append({'accessions': accessions})
        created_file = navigator.create_file(groups, r_package=navigator.PKG_DESEQ, organism="new organism")
    finally:
        # Clean up only when something was actually created and the caller did not ask to keep it.
        if not (keep_files or created_file is None):
            created_folder = files_util.get_special_folder(SpecialFolders.CREATED)
            files_util.unlink_file(created_file, created_folder)
def test_en_isoforms(conn, keep_files):
    """
    Create an isoform expression navigator file for the groups in ``ISOFORM_GROUPS``.

    The file is created with ``multi_mapping_corr=True``. Unless ``keep_files``
    is truthy, a created file is unlinked from the ``CREATED`` special folder
    afterwards.

    :param conn: connection passed to ``FilesUtil`` and the navigator wrapper
    :param keep_files: when truthy, the created file is left in place
    """
    files_util = FilesUtil(conn)
    navigator = ExpressionNavigatorforIsoforms(conn)
    created_file = None
    try:
        groups = []
        for accessions in ISOFORM_GROUPS:
            groups.append({'accessions': accessions})
        created_file = navigator.create_file(groups, multi_mapping_corr=True)
    finally:
        # Clean up only when something was actually created and the caller did not ask to keep it.
        if not (keep_files or created_file is None):
            created_folder = files_util.get_special_folder(SpecialFolders.CREATED)
            files_util.unlink_file(created_file, created_folder)
def upload_files(connection, files, folder_name, folder_accession):
    """
    Load raw files and gather them in a single folder.

    When ``folder_accession`` is truthy that folder is used and its name is
    looked up with ``get_infos``. Otherwise a new folder is created inside the
    ``UPLOADED`` special folder, named ``folder_name`` or, if that is falsy,
    a timestamp of the form ``Upload dd.mm.yy HH:MM:SS``.

    Every loaded file is linked to the target folder and unlinked from the
    ``UPLOADED`` special folder.

    :param genestack_client.Connection connection: connection used for the importer and files util
    :param list[str] files: items passed one by one to ``DataImporter.load_raw``
    :param str folder_name: name for a newly created folder; ignored when ``folder_accession`` is given
    :param str folder_accession: accession of an existing target folder, or a falsy value
    :return: tuple ``(folder_accession, folder_name, accession_file_map)`` where the map
        goes from each loaded accession to the corresponding item of ``files``
    """
    importer = DataImporter(connection)
    files_util = FilesUtil(connection)
    uploaded_folder = files_util.get_special_folder(SpecialFolders.UPLOADED)

    if folder_accession:
        # Existing folder: report its actual name rather than the requested one.
        folder_infos = files_util.get_infos([folder_accession])
        folder_name = folder_infos[0]['name']
    else:
        if not folder_name:
            folder_name = datetime.now().strftime('Upload %d.%m.%y %H:%M:%S')
        folder_accession = files_util.create_folder(
            folder_name,
            parent=uploaded_folder,
            description='Files uploaded by genestack-uploader')

    accession_file_map = {}
    for path in files:
        loaded = importer.load_raw(path)
        # Move the loaded file: link to the target first, then drop it from the upload folder.
        files_util.link_file(loaded, folder_accession)
        files_util.unlink_file(loaded, uploaded_folder)
        accession_file_map[loaded] = path
    return folder_accession, folder_name, accession_file_map
def __init__(self, cla, base_folder, friendly_name, custom_args=None):
    """
    Set up a batch files creator that produces multiple files with one CLA.

    :param cla: a ``CLApplication`` object wrapping the corresponding CLA; its
        ``connection`` attribute is used to build the ``FilesUtil`` helper
    :param base_folder: accession of the base folder where the pipeline files
        will be organised into subfolders
    :param friendly_name: user-friendly name of the files produced by the app;
        used in the on-screen statements and in the name of the project subfolders
    :param custom_args: list of custom command-line argument strings for the
        files. Default is ``None``
    """
    self._cla = cla
    connection = cla.connection
    self._files_util = FilesUtil(connection)
    self._base_folder, self._friendly_name = base_folder, friendly_name
    self._custom_args = custom_args
def create_dataflow(self, accession, name=None):
    """
    Creates a data flow based on the file provenance of the specified file.
    The nodes of the data flow can be accessed by the accession of the
    corresponding files in the file provenance.

    :param accession: file accession
    :type accession: str
    :param name: data flow name; when given, the name metainfo value of the
        data flow file is replaced with it
    :type name: str
    :return: accession of the created data flow file
    :rtype: str
    :raise GenestackException: if the response type is neither ``newPage``
        nor ``existingPages``
    """
    response = self.invoke('initializeApplicationState', 'createFromSources', accession)
    response_type = response['type']

    if response_type not in ('newPage', 'existingPages'):
        raise GenestackException("Unknown response type: %s" % response['type'])

    if response_type == 'newPage':
        dataflow_accession = response['fileInfo']['accession']
    else:
        # If file already exists we expect to get the last created file.
        # Existing page contains files from first to last (or MAX QUERY)
        # TODO: in case there are more files than MAX QUERY (100 ATM),
        #   the last file in response will not be really last
        #   (it is almost impossible use case, though)
        dataflow_accession = response['fileInfos'][-1]['accession']

    if name:
        files_util = FilesUtil(self.connection)
        files_util.replace_metainfo_string_value([dataflow_accession], Metainfo.NAME, name)
    return dataflow_accession
def test_en_microarrays(conn, keep_files):
    """
    Create a normalisation file and a microarray expression navigator file.

    All accessions of ``MICROARRAY_GROUPS`` are normalised into one file, which
    is then passed with the groups (the first one flagged as control) and
    ``RAT_AFFY_ANNOTATION`` to the expression navigator. Unless ``keep_files``
    is truthy, every file that was created is unlinked from the ``CREATED``
    special folder afterwards.

    :param conn: connection passed to ``FilesUtil`` and the application wrappers
    :param keep_files: when truthy, the created files are left in place
    """
    files_util = FilesUtil(conn)
    navigator = ExpressionNavigatorforMicroarrays(conn)
    normalizer = AffymetrixMicroarraysNormalizationApplication(conn)
    navigator_file = None
    normalized_file = None
    try:
        groups = []
        for accessions in MICROARRAY_GROUPS:
            groups.append({'accessions': accessions})
        groups[0]['is_control'] = True

        # Flatten the groups into one list of accessions for the normalisation step.
        all_accessions = []
        for group in MICROARRAY_GROUPS:
            for accession in group:
                all_accessions.append(accession)
        normalized_file = normalizer.create_file(all_accessions)
        navigator_file = navigator.create_file(groups, normalized_file, RAT_AFFY_ANNOTATION)
    finally:
        if not keep_files:
            created_folder = files_util.get_special_folder(SpecialFolders.CREATED)
            # Only unlink what was actually created before any failure.
            leftovers = [f for f in (normalized_file, navigator_file) if f is not None]
            for leftover in leftovers:
                files_util.unlink_file(leftover, created_folder)
def __get_mydatasets_folder(self):
    """
    Look up the ``MY_DATASETS`` special folder, used as the default folder for datasets.

    :return: default dataset folder accession
    :rtype: str
    """
    files_util = FilesUtil(self.connection)
    return files_util.get_special_folder(SpecialFolders.MY_DATASETS)
def recognize_files(connection, accession_file_map, new_folder):
    """
    Run file recognition on loaded raw files and report the outcome.

    The ``genestack/upload`` application is asked to recognise groups among the
    accessions of ``accession_file_map`` and then to create files from them.
    The created files are printed grouped by their ``kind``. Raw files that
    were not part of any recognised group are listed and moved from
    ``new_folder`` into a new ``Unrecognized files`` subfolder.

    :param connection: connection providing ``application()`` and used for ``FilesUtil``
    :param accession_file_map: dict mapping raw file accessions to their local file names
    :param new_folder: accession of the folder that currently contains the raw files
    """
    # Files Recognition
    fu = FilesUtil(connection)
    application = connection.application('genestack/upload')
    # Pass a plain list: on Python 3 ``dict.keys()`` is a view object, not a list.
    recognised_files = application.invoke('recognizeGroupsByAccession', list(accession_file_map))

    recognized_accessions = set()
    for x in recognised_files:
        for sources in x['sourceFileInfos'].values():
            for info in sources:
                recognized_accessions.add(info['accession'])

    created_files = application.invoke('createFiles', recognised_files, [], None)
    # groupby only merges adjacent items, so sort by the same key first.
    groups = sorted(created_files['files'], key=itemgetter('kind'))
    for name, group in groupby(groups, key=itemgetter('kind')):
        print(name)
        # maybe sort by filename before printing a group?
        for f in group:
            print('\t%s / %s' % (f['accession'], f['name']))

    unrecognized_file_infos = set(accession_file_map) - recognized_accessions

    if unrecognized_file_infos:
        print('Unrecognized Raw Files')
        for accession in unrecognized_file_infos:
            file_name = accession_file_map[accession]
            # Only byte strings need decoding; text strings have no ``decode`` on Python 3.
            if isinstance(file_name, bytes):
                file_name = file_name.decode('utf-8')
            print('\t%s / %s' % (accession, file_name))
        # move unrecognized files to new folder
        unrecognized_folder = fu.create_folder("Unrecognized files", parent=new_folder)
        for accession in unrecognized_file_infos:
            fu.link_file(accession, unrecognized_folder)
            fu.unlink_file(accession, new_folder)
        print('Unrecognized files moved to %s / %s' % (unrecognized_folder, 'Unrecognized files'))
def test_metainfo_io(conn):
    """
    Round-trip a ``Metainfo`` object through a report file.

    A metainfo with one value of several kinds (two decimals under the same
    key) is attached to a new report file in the ``CREATED`` special folder;
    the metainfo collected back from that file is then compared value by value.
    The report file is unlinked from the folder afterwards if it was created.

    :param conn: connection passed to ``DataImporter`` and ``FilesUtil``
    """
    data_importer = DataImporter(conn)
    fu = FilesUtil(conn)
    created = fu.get_special_folder(SpecialFolders.CREATED)

    info = Metainfo()
    info.add_boolean("a", True)
    info.add_file_reference("b", created)
    info.add_date_time("c", "2015-12-13")
    info.add_integer("d", 239)
    info.add_decimal("e", 238.583)
    info.add_decimal("e", -13.4)
    info.add_string("f", "hello")
    info.add_memory_size("g", 2847633)
    info.add_person("i", "Rosalind Franklin", "+1-202-555-0123", "*****@*****.**")
    info.add_publication("j", "My Publication", "Myself", "Journal of Me", "23/12/2014", pages="12-23")
    info.add_value(Metainfo.NAME, StringValue("Test report file"))

    expected_person = {
        'name': 'Rosalind Franklin',
        'phone': '+1-202-555-0123',
        'email': '*****@*****.**',
    }
    expected_publication = {
        'title': 'My Publication',
        'authors': 'Myself',
        'journalName': 'Journal of Me',
        'issueDate': '23/12/2014',
        'pages': '12-23',
        'issueNumber': None,
        'identifiers': {},
    }

    report_file = None
    try:
        report_file = data_importer.create_report_file(metainfo=info, urls=[TEST_URL], parent=created)
        collected = fu.collect_metainfos([report_file])
        metainfo = next(iter(collected))

        assert metainfo.get('a')[0].get_boolean()
        assert isinstance(metainfo.get('b')[0].get_accession(), str)
        assert metainfo.get('c')[0].get_date() == _strptime_local('2015-12-13', '%Y-%m-%d')
        assert metainfo.get('d')[0].get_int() == 239
        # Both decimals live under the same key, in insertion order.
        assert metainfo.get('e')[0].get_decimal() == 238.583
        assert metainfo.get('e')[1].get_decimal() == -13.4
        assert metainfo.get('f')[0].get_string() == "hello"
        assert metainfo.get('g')[0].get_int() == 2847633
        assert metainfo.get('i')[0].get_person() == expected_person
        assert metainfo.get('j')[0].get_publication() == expected_publication
        assert metainfo.get(Metainfo.NAME)[0].get_string() == "Test report file"
        assert metainfo.get(BioMetaKeys.DATA_LINK)[0].get_url() == TEST_URL
    finally:
        if report_file is not None:
            fu.unlink_file(report_file, created)
def test_metainfo_io(conn):
    """
    Check that metainfo values written to a report file can be read back.

    Builds a ``Metainfo`` holding several kinds of values, creates a report
    file with it inside the ``CREATED`` special folder, collects the metainfo
    of that file and asserts each value. A created report file is always
    unlinked from the folder at the end.

    :param conn: connection passed to ``DataImporter`` and ``FilesUtil``
    """
    importer = DataImporter(conn)
    files_util = FilesUtil(conn)
    created_folder = files_util.get_special_folder(SpecialFolders.CREATED)

    source_info = Metainfo()
    source_info.add_boolean("a", True)
    source_info.add_file_reference("b", created_folder)
    source_info.add_date_time("c", "2015-12-13")
    source_info.add_integer("d", 239)
    source_info.add_decimal("e", 238.583)
    source_info.add_decimal("e", -13.4)
    source_info.add_string("f", "hello")
    source_info.add_memory_size("g", 2847633)
    source_info.add_person("i", "Rosalind Franklin", "+1-202-555-0123", "*****@*****.**")
    source_info.add_publication("j", "My Publication", "Myself", "Journal of Me", "23/12/2014", pages="12-23")
    source_info.add_value(Metainfo.NAME, StringValue("Test report file"))

    report_accession = None
    try:
        report_accession = importer.create_report_file(
            metainfo=source_info, urls=[TEST_URL], parent=created_folder)
        metainfo = next(iter(files_util.collect_metainfos([report_accession])))

        def first(key):
            # First value stored under ``key`` in the collected metainfo.
            return metainfo.get(key)[0]

        assert first('a').get_boolean()
        assert isinstance(first('b').get_accession(), str)
        assert first('c').get_date() == _strptime_local('2015-12-13', '%Y-%m-%d')
        assert first('d').get_int() == 239
        assert first('e').get_decimal() == 238.583
        # The second decimal was added under the same key.
        assert metainfo.get('e')[1].get_decimal() == -13.4
        assert first('f').get_string() == "hello"
        assert first('g').get_int() == 2847633
        assert first('i').get_person() == {
            'name': 'Rosalind Franklin',
            'phone': '+1-202-555-0123',
            'email': '*****@*****.**',
        }
        assert first('j').get_publication() == {
            'title': 'My Publication',
            'authors': 'Myself',
            'journalName': 'Journal of Me',
            'issueDate': '23/12/2014',
            'pages': '12-23',
            'issueNumber': None,
            'identifiers': {},
        }
        assert first(Metainfo.NAME).get_string() == "Test report file"
        assert first(BioMetaKeys.DATA_LINK).get_url() == TEST_URL
    finally:
        if report_accession is not None:
            files_util.unlink_file(report_accession, created_folder)
class BatchFilesCreator(object):
    """
    Create one output file per source with a CLA wrapper and link every
    created file into a dedicated subfolder of a base folder.
    """

    def __init__(self, cla, base_folder, friendly_name, custom_args=None):
        """
        Constructor of the general batch files creator, to create multiple files from a CLA.

        :param cla: a ``CLApplication`` object, wrapper for the corresponding CLA
        :param base_folder: accession of the base folder where the pipeline files will be organised into subfolders
        :param friendly_name: user-friendly name of the files produced by the app ; used in the on-screen statements
            and in the name of the project subfolders
        :param custom_args: list of custom command-line argument strings for the files. Default is ``None``
        """
        self._cla = cla
        self._files_util = FilesUtil(cla.connection)
        self._base_folder = base_folder
        self._friendly_name = friendly_name
        self._custom_args = custom_args

    def create_files(self, sources):
        """
        Create one output file per source and link each of them into a new folder.

        The folder is named after ``friendly_name`` and created inside the base folder.
        A progress line is printed after each file.

        :param sources: iterable of sources; each item is passed to ``_create_output_file``
        :return: the values returned by ``_create_output_file``, in source order
        :rtype: list
        """
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print("Creating %s files..." % self._friendly_name)
        output_folder = self._files_util.create_folder(
            self._friendly_name, parent=self._base_folder)
        # Materialise the sources so that the total is known even for iterators.
        sources = list(sources)
        source_count = len(sources)
        output_files = []
        for i, source in enumerate(sources, 1):
            output = self._create_output_file(source)
            self._files_util.link_file(output, output_folder)
            # The denominator is the number of sources, not the length of the output value.
            print("Created %s file %s (%d/%d)" % (self._friendly_name, output, i, source_count))
            output_files.append(output)
        return output_files

    # this method can be overridden in child classes to allow for more complex file creation logic
    def _create_output_file(self, source):
        """
        Create a single output file from ``source`` with the wrapped CLA.

        When custom arguments were supplied to the constructor, they are applied
        to the created file through ``change_command_line_arguments``.

        :param source: value passed to ``create_file`` of the CLA wrapper
        :return: value returned by ``create_file`` of the CLA wrapper
        """
        output = self._cla.create_file(source)
        if self._custom_args:
            self._cla.change_command_line_arguments(output, self._custom_args)
        return output
def upload_files(connection, files, folder_name):
    """
    Load raw files and gather them in a newly created folder.

    The folder is created inside the ``UPLOADED`` special folder and is named
    ``folder_name`` or, if that is falsy, a timestamp of the form
    ``Upload dd.mm.yy HH:MM:SS``. Every loaded file is linked to the new folder
    and unlinked from the ``UPLOADED`` special folder.

    :param connection: connection used for ``DataImporter`` and ``FilesUtil``
    :param files: items passed one by one to ``DataImporter.load_raw``
    :param folder_name: name of the folder to create, or a falsy value for a timestamped name
    :return: tuple ``(new_folder, folder_name, accession_file_map)`` where the map
        goes from each loaded accession to the corresponding item of ``files``
    """
    importer = DataImporter(connection)
    files_util = FilesUtil(connection)
    uploaded_folder = files_util.get_special_folder(SpecialFolders.UPLOADED)

    if not folder_name:
        folder_name = datetime.now().strftime('Upload %d.%m.%y %H:%M:%S')
    new_folder = files_util.create_folder(
        folder_name,
        parent=uploaded_folder,
        description='Files uploaded by genestack-uploader')

    accession_file_map = {}
    for path in files:
        loaded = importer.load_raw(path)
        # Move the loaded file: link to the new folder first, then drop it from the upload folder.
        files_util.link_file(loaded, new_folder)
        files_util.unlink_file(loaded, uploaded_folder)
        accession_file_map[loaded] = path
    return new_folder, folder_name, accession_file_map
def create_file(self, source_files, name=None, params=None, calculate_checksums=False, expected_checksums=None,
                initialize=False):
    """
    Create a native Genestack file with the application and return its accession.
    If a source file is not found or is not of the expected type, an exception will be thrown.

    :param source_files: list of source files accessions
    :type source_files: list
    :param name: if a name is provided, the created file will be renamed
    :type name: str
    :param params: custom command-line arguments strings; if None,
        the application defaults will be used.
    :type params: list
    :param calculate_checksums: a flag used in the initialization script
        to compute checksums for the created files
    :type calculate_checksums: bool
    :param expected_checksums: Dict of expected checksums (``{metainfo_key: expected_checksum}``)
    :type expected_checksums: dict
    :param initialize: should initialization be started immediately
        after the file is created?
    :return: accession of created file
    :rtype: str
    """
    created = self.__create_file(source_files, params)
    files_util = FilesUtil(self.connection)

    # Optional post-processing steps, each enabled by its own argument.
    if name:
        files_util.rename_file(created, name)
    if calculate_checksums:
        files_util.mark_for_tests(created)
    if expected_checksums:
        files_util.add_checksums(created, expected_checksums)
    if initialize:
        self.start(created)

    return created
'Accession of the Genestack folder storing the files to group by application' ) parser.add_argument( '--move-files', action='store_true', help= 'If present, the original files will be unlinked from the source folder') args = parser.parse_args() source_folder = args.folder move_files = args.move_files print "Connecting to Genestack..." # get connection and application handlers connection = get_connection(args) files_util = FilesUtil(connection) print "Collecting files..." files = files_util.get_file_children(source_folder) files_count = len(files) print "Found %d files to organise. Retrieving infos..." % files_count infos = files_util.get_complete_infos(files) output_folder = files_util.create_folder("Organized files", parent=source_folder) grouping_folders = {} for i, entry in enumerate(infos, 1): accession = entry['accession'] print "Processing file %d of %d (%s)..." % (i, files_count, accession)
from genestack_client import FilesUtil, make_connection_parser, get_connection

# parse script arguments
parser = make_connection_parser()
parser.add_argument('folder',
                    help='Accession of the Genestack folder storing the files to group by application')
parser.add_argument('--move-files', action='store_true',
                    help='If present, the original files will be unlinked from the source folder')
args = parser.parse_args()
source_folder = args.folder
move_files = args.move_files

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print("Connecting to Genestack...")

# get connection and application handlers
connection = get_connection(args)
files_util = FilesUtil(connection)

print("Collecting files...")
files = files_util.get_file_children(source_folder)
files_count = len(files)
print("Found %d files to organise. Retrieving infos..." % files_count)
infos = files_util.get_complete_infos(files)

# folder that receives the grouped files, created inside the source folder
output_folder = files_util.create_folder("Organized files", parent=source_folder)
grouping_folders = {}

for i, entry in enumerate(infos, 1):
    accession = entry['accession']
    print("Processing file %d of %d (%s)..." % (i, files_count, accession))

    # use either application name, application ID or "Unknown application" (in this order of preference)
def rename_file(self, accession, name):
    """
    Deprecated wrapper around ``FilesUtil.rename_file``.

    Writes a deprecation notice to standard error, then delegates the call.

    :param accession: passed through to ``FilesUtil.rename_file``
    :param name: passed through to ``FilesUtil.rename_file``
    """
    sys.stderr.write('Deprecated: use FilesUtil.rename_file instead\n')
    files_util = FilesUtil(self.connection)
    files_util.rename_file(accession, name)
parser.add_argument( '--name', default="New Project", help='Name of the Genestack folder where to put the output files') parser.add_argument( '--ref-genome', help='Accession of the reference genome to use for the mapping step') args = parser.parse_args() project_name = args.name print "Connecting to Genestack..." # get connection and create output folder connection = get_connection(args) files_util = FilesUtil(connection) created_files_folder = files_util.get_special_folder( SpecialFolders.CREATED) project_folder = files_util.create_folder(project_name, parent=created_files_folder) # create application wrappers and batch files creators bowtie_app = BowtieApplication(connection) mapped_qc_app = AlignedReadsQC(connection) variant_calling_app = VariationCaller2Application(connection) bowtie_creator = BowtieBatchFilesCreator(bowtie_app, project_folder, "Mapped Reads", ref_genome=args.ref_genome) mapped_qc_creator = BatchFilesCreator(mapped_qc_app, project_folder,
if __name__ == "__main__":
    # parse script arguments
    parser = make_connection_parser()
    parser.add_argument('csv_file',
                        help='Path to the local comma-delimited CSV file containing the data')
    parser.add_argument('local_key',
                        help='Name of the local key to match CSV records and Genestack files names')
    parser.add_argument('folder',
                        help='Accession of the Genestack folder containing the files')

    args = parser.parse_args()
    csv_input = args.csv_file
    local_key = args.local_key

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Connecting to Genestack...")

    # get connection and application handlers
    connection = get_connection(args)
    files_util = FilesUtil(connection)

    print("Collecting files...")
    files = files_util.get_file_children(args.folder)
    print("Found %d files. Collecting metadata..." % len(files))
    infos = files_util.get_infos(files)

    # map each Genestack file name to its accession
    identifier_map = {info['name']: info['accession'] for info in infos}

    # parse the CSV file
    with open(csv_input, 'r') as the_file:
        reader = csv.DictReader(the_file, delimiter=",")
        field_names = reader.fieldnames

        # the local key must be one of the CSV columns
        if args.local_key not in field_names:
            raise GenestackException("Error: the local key %s is not present in the supplied CSV file" % args.local_key)
# parse script arguments
parser = make_connection_parser()
parser.add_argument(
    'raw_reads_folder',
    help='Genestack accession of the folder containing the raw reads files to process')
parser.add_argument(
    '--name',
    default="New Project",
    help='Name of the Genestack folder where to put the output files')
parser.add_argument(
    '--ref-genome',
    help='Accession of the reference genome to use for the mapping step')

args = parser.parse_args()
project_name = args.name

print('Connecting to Genestack...')

# get connection and create output folder
connection = get_connection(args)
files_util = FilesUtil(connection)
# the project folder is created inside the CREATED special folder
created_files_folder = files_util.get_special_folder(SpecialFolders.CREATED)
project_folder = files_util.create_folder(project_name, parent=created_files_folder)

# create application wrappers and batch files creators
bowtie_app = BowtieApplication(connection)
mapped_qc_app = AlignedReadsQC(connection)
variant_calling_app = VariationCaller2Application(connection)

# each creator gets the project folder and the friendly name used for its files
bowtie_creator = BowtieBatchFilesCreator(
    bowtie_app, project_folder, "Mapped Reads", ref_genome=args.ref_genome)
mapped_qc_creator = BatchFilesCreator(
    mapped_qc_app, project_folder, "Mapped Reads QC")
vc_creator = BatchFilesCreator(
    variant_calling_app, project_folder, "Variants", custom_args=VC_ARGUMENTS_NO_INDELS)

# collect files
print('Collecting raw reads...')
raw_reads = files_util.get_file_children(args.raw_reads_folder)
'local_key', help= 'Name of the local key to match CSV records and Genestack files names') parser.add_argument( 'folder', help='Accession of the Genestack folder containing the files') args = parser.parse_args() csv_input = args.csv_file local_key = args.local_key print('Connecting to Genestack...') # get connection and application handlers connection = get_connection(args) files_util = FilesUtil(connection) print('Collecting files...') files = files_util.get_file_children(args.folder) print('Found %d files. Collecting metadata...' % len(files)) infos = files_util.get_infos(files) identifier_map = {info['name']: info['accession'] for info in infos} # parse the CSV file with open(csv_input, 'r') as the_file: reader = csv.DictReader(the_file, delimiter=",") field_names = reader.fieldnames if args.local_key not in field_names: raise GenestackException(
def files_utils():
    """
    Build a ``FilesUtil`` on a connection made from an empty argument list.

    The connection parser is run on ``[]``, so only its default values are
    used to obtain the connection.

    :return: ``FilesUtil`` instance bound to that connection
    """
    parser = make_connection_parser()
    default_args = parser.parse_args([])
    return FilesUtil(get_connection(default_args))