def create_dataflow(self, accession, name=None):
    """
    Create a data flow based on the file provenance of the specified file.

    The nodes of the data flow can be accessed by the accessions of the
    corresponding files in the file provenance.

    :param accession: file accession
    :type accession: str
    :param name: data flow name
    :type name: str
    :return: accession of the created data flow file
    :rtype: str
    :raise GenestackException: if the server returns an unknown response type
    """
    response = self.invoke('initializeApplicationState', 'createFromSources', accession)
    response_type = response['type']

    if response_type == 'newPage':
        dataflow_accession = response['fileInfo']['accession']
    elif response_type == 'existingPages':
        # If the file already exists we expect to get the last created file.
        # The existing page contains files from first to last (or MAX QUERY).
        # TODO: in case there are more files then MAX QUERY (100 ATM),
        # the last file in response will not be really last
        # (it is almost impossible use case, though)
        dataflow_accession = response['fileInfos'][-1]['accession']
    else:
        raise GenestackException("Unknown response type: %s" % response_type)

    if name:
        # Rename the freshly created/retrieved data flow file.
        FilesUtil(self.connection).replace_metainfo_string_value(
            [dataflow_accession], Metainfo.NAME, name)
    return dataflow_accession
def upload_files(connection, files, folder_name, folder_accession):
    """
    Upload raw files into a target folder, creating the folder if needed.

    :param genestack_client.Connection connection:
    :param list[str] files:
    :param str folder_name:
    :param str folder_accession:
    :return: tuple of (folder accession, folder name, accession->file map)
    """
    importer = DataImporter(connection)
    files_util = FilesUtil(connection)
    uploaded_folder = files_util.get_special_folder(SpecialFolders.UPLOADED)

    if folder_accession:
        # Reuse the existing folder; report its actual name back to the caller.
        folder_name = files_util.get_infos([folder_accession])[0]['name']
    else:
        if not folder_name:
            folder_name = datetime.now().strftime('Upload %d.%m.%y %H:%M:%S')
        folder_accession = files_util.create_folder(
            folder_name,
            parent=uploaded_folder,
            description='Files uploaded by genestack-uploader')

    accession_file_map = {}
    for local_path in files:
        accession = importer.load_raw(local_path)
        # Move each uploaded file from the special "Uploaded" folder
        # into the destination folder.
        files_util.link_file(accession, folder_accession)
        files_util.unlink_file(accession, uploaded_folder)
        accession_file_map[accession] = local_path

    return folder_accession, folder_name, accession_file_map
def test_metainfo_io(conn):
    # Round-trip test: write one value of every supported Metainfo type into a
    # new report file, then read them back and check each value is intact.
    data_importer = DataImporter(conn)
    fu = FilesUtil(conn)
    created = fu.get_special_folder(SpecialFolders.CREATED)
    info = Metainfo()
    info.add_boolean("a", True)
    info.add_file_reference("b", created)
    info.add_date_time("c", "2015-12-13")
    info.add_integer("d", 239)
    # Two values under the same key: metainfo keys are multi-valued.
    info.add_decimal("e", 238.583)
    info.add_decimal("e", -13.4)
    info.add_string("f", "hello")
    info.add_memory_size("g", 2847633)
    info.add_person("i", "Rosalind Franklin", "+1-202-555-0123",
                    "*****@*****.**")
    info.add_publication("j", "My Publication", "Myself", "Journal of Me",
                         "23/12/2014", pages="12-23")
    info.add_value(Metainfo.NAME, StringValue("Test report file"))
    report_file = None
    try:
        report_file = data_importer.create_report_file(metainfo=info,
                                                       urls=[TEST_URL],
                                                       parent=created)
        # collect_metainfos returns a collection; we created one file, so
        # take the first (and only) metainfo.
        metainfo = next(iter(fu.collect_metainfos([report_file])))
        assert metainfo.get('a')[0].get_boolean()
        assert isinstance(metainfo.get('b')[0].get_accession(), str)
        assert metainfo.get('c')[0].get_date() == _strptime_local(
            '2015-12-13', '%Y-%m-%d')
        assert metainfo.get('d')[0].get_int() == 239
        # Both decimals stored under 'e' come back in insertion order.
        assert metainfo.get('e')[0].get_decimal() == 238.583
        assert metainfo.get('e')[1].get_decimal() == -13.4
        assert metainfo.get('f')[0].get_string() == "hello"
        assert metainfo.get('g')[0].get_int() == 2847633
        assert metainfo.get('i')[0].get_person() == {
            'name': 'Rosalind Franklin',
            'phone': '+1-202-555-0123',
            'email': '*****@*****.**'
        }
        assert metainfo.get('j')[0].get_publication() == {
            'title': 'My Publication',
            'authors': 'Myself',
            'journalName': 'Journal of Me',
            'issueDate': '23/12/2014',
            'pages': '12-23',
            'issueNumber': None,
            'identifiers': {}
        }
        assert metainfo.get(
            Metainfo.NAME)[0].get_string() == "Test report file"
        # The importer records the source URL under DATA_LINK automatically.
        assert metainfo.get(BioMetaKeys.DATA_LINK)[0].get_url() == TEST_URL
    finally:
        # Clean up the created file even if an assertion failed.
        if report_file is not None:
            fu.unlink_file(report_file, created)
def __get_mydatasets_folder(self):
    """
    Get the default folder for datasets.

    :return: default dataset folder accession
    :rtype: str
    """
    files_util = FilesUtil(self.connection)
    return files_util.get_special_folder(SpecialFolders.MY_DATASETS)
def test_en_isoforms(conn, keep_files):
    # Create an Expression Navigator file for isoform groups and, unless the
    # keep_files flag is set, unlink it from the "Created files" folder.
    files_util = FilesUtil(conn)
    navigator = ExpressionNavigatorforIsoforms(conn)
    en_file = None
    try:
        groups = [{'accessions': group} for group in ISOFORM_GROUPS]
        en_file = navigator.create_file(groups, multi_mapping_corr=True)
    finally:
        if en_file is not None and not keep_files:
            created_folder = files_util.get_special_folder(SpecialFolders.CREATED)
            files_util.unlink_file(en_file, created_folder)
def test_en_rna_seq(conn, keep_files):
    # Create an Expression Navigator file for RNA-seq gene groups using the
    # DESeq package; clean it up afterwards unless keep_files is set.
    files_util = FilesUtil(conn)
    navigator = ExpressionNavigatorforGenes(conn)
    en_file = None
    try:
        groups = [{'accessions': group} for group in RNA_SEQ_GROUPS]
        en_file = navigator.create_file(groups,
                                        r_package=navigator.PKG_DESEQ,
                                        organism="new organism")
    finally:
        if en_file is not None and not keep_files:
            created_folder = files_util.get_special_folder(SpecialFolders.CREATED)
            files_util.unlink_file(en_file, created_folder)
def __init__(self, cla, base_folder, friendly_name, custom_args=None):
    """
    Construct a general batch files creator that builds multiple files
    from a single CLA.

    :param cla: a ``CLApplication`` object, wrapper for the corresponding CLA
    :param base_folder: accession of the base folder where the pipeline files
        will be organised into subfolders
    :param friendly_name: user-friendly name of the files produced by the app;
        used in the on-screen statements and in the name of the project
        subfolders
    :param custom_args: list of custom command-line argument strings for the
        files. Default is ``None``
    """
    self._base_folder = base_folder
    self._friendly_name = friendly_name
    self._custom_args = custom_args
    self._cla = cla
    # A FilesUtil bound to the same connection as the CLA wrapper.
    self._files_util = FilesUtil(cla.connection)
def test_en_microarrays(conn, keep_files):
    # Normalize the microarray assays, build an Expression Navigator file on
    # top of the normalization, then unlink both unless keep_files is set.
    files_util = FilesUtil(conn)
    navigator = ExpressionNavigatorforMicroarrays(conn)
    normalization_app = AffymetrixMicroarraysNormalizationApplication(conn)
    en_file = None
    norm_file = None
    try:
        groups = [{'accessions': accessions} for accessions in MICROARRAY_GROUPS]
        # The first group acts as the control group.
        groups[0]['is_control'] = True
        all_assays = [assay for group in MICROARRAY_GROUPS for assay in group]
        norm_file = normalization_app.create_file(all_assays)
        en_file = navigator.create_file(groups, norm_file, RAT_AFFY_ANNOTATION)
    finally:
        if not keep_files:
            created_folder = files_util.get_special_folder(SpecialFolders.CREATED)
            for produced_file in (norm_file, en_file):
                if produced_file is not None:
                    files_util.unlink_file(produced_file, created_folder)
def create_file(self, source_files, name=None, params=None, calculate_checksums=False,
                expected_checksums=None, initialize=False):
    """
    Create a native Genestack file with the application and return its accession.
    If a source file is not found or is not of the expected type,
    an exception will be thrown.

    :param source_files: list of source files accessions
    :type source_files: list
    :param name: if a name is provided, the created file will be renamed
    :type name: str
    :param params: custom command-line arguments strings; if None,
        the application defaults will be used.
    :type params: list
    :param calculate_checksums: a flag used in the initialization script
        to compute checksums for the created files
    :type calculate_checksums: bool
    :param expected_checksums: Dict of expected checksums
        (``{metainfo_key: expected_checksum}``)
    :type expected_checksums: dict
    :param initialize: should initialization be started immediately
        after the file is created?
    :return: accession of created file
    :rtype: str
    """
    app_file = self.__create_file(source_files, params)
    fu = FilesUtil(self.connection)
    if name:
        fu.rename_file(app_file, name)
    if calculate_checksums:
        fu.mark_for_tests(app_file)
    if expected_checksums:
        fu.add_checksums(app_file, expected_checksums)
    if initialize:
        self.start(app_file)
    return app_file
def recognize_files(connection, accession_file_map, new_folder):
    """
    Run server-side recognition on uploaded raw files, print the recognized
    groups, and move any unrecognized files into an "Unrecognized files"
    subfolder of *new_folder*.

    :param connection: an open Genestack connection
    :param accession_file_map: mapping of file accession -> local file path,
        as returned by ``upload_files``
    :param new_folder: accession of the folder holding the uploaded files
    """
    fu = FilesUtil(connection)
    application = connection.application('genestack/upload')
    # Pass a plain list: dict views are not JSON-serializable, so sending
    # accession_file_map.keys() directly would fail on Python 3.
    recognised_files = application.invoke('recognizeGroupsByAccession',
                                          list(accession_file_map))
    recognized_accessions = set()
    for recognised in recognised_files:
        for sources in recognised['sourceFileInfos'].values():
            for info in sources:
                recognized_accessions.add(info['accession'])

    created_files = application.invoke('createFiles', recognised_files, [], None)
    # groupby requires its input sorted by the same key.
    groups = sorted(created_files['files'], key=itemgetter('kind'))
    for name, group in groupby(groups, key=itemgetter('kind')):
        print(name)
        # maybe sort by filename before printing a group?
        for f in group:
            print('\t%s / %s' % (f['accession'], f['name']))

    unrecognized_file_infos = set(accession_file_map) - recognized_accessions
    if unrecognized_file_infos:
        print('Unrecognized Raw Files')
        for accession in unrecognized_file_infos:
            # File paths are stored as str (see upload_files); the former
            # .decode('utf-8') call was a Python 2 leftover and would raise
            # AttributeError on Python 3.
            print('\t%s / %s' % (accession, accession_file_map[accession]))

        # Move unrecognized files to a dedicated subfolder.
        unrecognized_folder = fu.create_folder("Unrecognized files",
                                               parent=new_folder)
        for accession in unrecognized_file_infos:
            fu.link_file(accession, unrecognized_folder)
            fu.unlink_file(accession, new_folder)
        print('Unrecognized files moved to %s / %s' % (unrecognized_folder,
                                                       'Unrecognized files'))
'local_key', help= 'Name of the local key to match CSV records and Genestack files names') parser.add_argument( 'folder', help='Accession of the Genestack folder containing the files') args = parser.parse_args() csv_input = args.csv_file local_key = args.local_key print('Connecting to Genestack...') # get connection and application handlers connection = get_connection(args) files_util = FilesUtil(connection) print('Collecting files...') files = files_util.get_file_children(args.folder) print('Found %d files. Collecting metadata...' % len(files)) infos = files_util.get_infos(files) identifier_map = {info['name']: info['accession'] for info in infos} # parse the CSV file with open(csv_input, 'r') as the_file: reader = csv.DictReader(the_file, delimiter=",") field_names = reader.fieldnames if args.local_key not in field_names: raise GenestackException(
def files_utils():
    """Build a FilesUtil instance using a connection parsed from default CLI args."""
    parser = make_connection_parser()
    connection = get_connection(parser.parse_args([]))
    return FilesUtil(connection)
def rename_file(self, accession, name):
    """
    Deprecated wrapper: delegates to :meth:`FilesUtil.rename_file` after
    printing a deprecation notice to stderr.

    :param accession: accession of the file to rename
    :param name: new file name
    """
    sys.stderr.write('Deprecated: use FilesUtil.rename_file instead\n')
    files_util = FilesUtil(self.connection)
    files_util.rename_file(accession, name)