def recognize_files(connection, accession_file_map, new_folder):
    """
    Run file recognition on uploaded raw files and report the outcome.

    The ``genestack/upload`` application is asked to recognize the given raw
    files and to create files from the recognized groups; the created files
    are printed grouped by their ``kind``. Raw files that were not part of any
    recognized group are listed and moved from ``new_folder`` into a newly
    created "Unrecognized files" subfolder.

    :param connection: connection providing ``application()``; also used to build a ``FilesUtil``
    :param accession_file_map: mapping of raw file accession to the name of the uploaded file
    :param new_folder: folder that currently contains the raw files
    """
    fu = FilesUtil(connection)

    application = connection.application('genestack/upload')
    # Pass a real list: on Python 3 ``dict.keys()`` is a view object, which is
    # not a sequence (presumably ``invoke`` serializes its arguments -- a view
    # would not survive JSON encoding).
    recognised_files = application.invoke('recognizeGroupsByAccession', list(accession_file_map))

    recognized_accessions = {
        info['accession']
        for recognised_group in recognised_files
        for sources in recognised_group['sourceFileInfos'].values()
        for info in sources
    }

    created_files = application.invoke('createFiles', recognised_files, [], None)
    # groupby() only merges adjacent items, hence the sort on the same key first
    groups = sorted(created_files['files'], key=itemgetter('kind'))
    for name, group in groupby(groups, key=itemgetter('kind')):
        print(name)
        # maybe sort by filename before printing a group?
        for f in group:
            print('\t%s / %s' % (f['accession'], f['name']))

    # sorted() keeps the report and the move order deterministic between runs
    unrecognized_file_infos = sorted(set(accession_file_map) - recognized_accessions)
    if not unrecognized_file_infos:
        return

    print('Unrecognized Raw Files')
    for accession in unrecognized_file_infos:
        file_name = accession_file_map[accession]
        # Only byte strings need decoding; a text string has no ``decode`` on Python 3.
        if isinstance(file_name, bytes):
            file_name = file_name.decode('utf-8')
        print('\t%s / %s' % (accession, file_name))

    # move unrecognized files to new folder
    unrecognized_folder = fu.create_folder("Unrecognized files", parent=new_folder)
    for accession in unrecognized_file_infos:
        # link first, then unlink, so the file is never left without a parent folder
        fu.link_file(accession, unrecognized_folder)
        fu.unlink_file(accession, new_folder)
    print('Unrecognized files moved to %s / %s' % (unrecognized_folder, 'Unrecognized files'))
class BatchFilesCreator(object):
    """
    Create one output file per source file through a CLA and collect the
    results in a dedicated subfolder of a base folder.
    """

    def __init__(self, cla, base_folder, friendly_name, custom_args=None):
        """
        Constructor of the general batch files creator, to create multiple files from a CLA.

        :param cla: a ``CLApplication`` object, wrapper for the corresponding CLA
        :param base_folder: accession of the base folder where the pipeline files will be organised into subfolders
        :param friendly_name: user-friendly name of the files produced by the app ; used in the on-screen statements
        and in the name of the project subfolders
        :param custom_args: list of custom command-line argument strings for the files. Default is ``None``
        """

        self._cla = cla
        self._files_util = FilesUtil(cla.connection)
        self._base_folder = base_folder
        self._friendly_name = friendly_name
        self._custom_args = custom_args

    def create_files(self, sources):
        """
        Create an output file for every source and link it into a new output folder.

        :param sources: iterable of source files, each passed to ``_create_output_file``
        :return: list of the created output files, in the order of ``sources``
        """
        # Materialise once so the total is known even when an iterator is passed.
        sources = list(sources)
        total = len(sources)

        print('Creating %s files...' % self._friendly_name)
        output_folder = self._files_util.create_folder(self._friendly_name, parent=self._base_folder)
        output_files = []
        for i, source in enumerate(sources, 1):
            output = self._create_output_file(source)
            self._files_util.link_file(output, output_folder)
            # progress is "current / number of sources" (not the length of the output value)
            print('Created %s file %s (%d/%d)' % (self._friendly_name, output, i, total))
            output_files.append(output)
        return output_files

    # this method can be overridden in child classes to allow for more complex file creation logic
    def _create_output_file(self, source):
        """
        Create a single output file from ``source`` and apply the custom arguments, if any.

        :param source: source file handed to the CLA's ``create_file``
        :return: the value returned by the CLA's ``create_file``
        """
        output = self._cla.create_file(source)
        if self._custom_args:
            self._cla.change_command_line_arguments(output, self._custom_args)
        return output
def upload_files(connection, files, folder_name, folder_accession):
    """
    Load the given files as raw files and gather them in one folder.

    When ``folder_accession`` is given, the files go into that folder and the
    returned name is the one reported for it by ``get_infos``. Otherwise a
    folder is created under the special ``UPLOADED`` folder, named
    ``folder_name`` or, if that is empty, after the current date and time.

    :param genestack_client.Connection connection:
    :param list[str] files:
    :param str folder_name:
    :param str folder_accession:
    :return: tuple of (folder accession, folder name, dict of raw file accession -> file)
    """
    data_importer = DataImporter(connection)
    files_util = FilesUtil(connection)
    uploaded_folder = files_util.get_special_folder(SpecialFolders.UPLOADED)

    if folder_accession:
        # existing target folder: report its actual name back to the caller
        folder_name = files_util.get_infos([folder_accession])[0]['name']
    else:
        if not folder_name:
            folder_name = datetime.now().strftime('Upload %d.%m.%y %H:%M:%S')
        folder_accession = files_util.create_folder(
            folder_name,
            parent=uploaded_folder,
            description='Files uploaded by genestack-uploader')

    accession_file_map = {}
    for path in files:
        raw_accession = data_importer.load_raw(path)
        # relocate the raw file: attach to the target folder, detach from the uploaded folder
        files_util.link_file(raw_accession, folder_accession)
        files_util.unlink_file(raw_accession, uploaded_folder)
        accession_file_map[raw_accession] = path
    return folder_accession, folder_name, accession_file_map
def upload_files(connection, files, folder_name):
    """
    Load the given files as raw files and gather them in a newly created folder.

    The folder is created under the special ``UPLOADED`` folder and is named
    ``folder_name`` or, if that is empty, after the current date and time.

    :param connection: connection used to build the ``DataImporter`` and ``FilesUtil`` helpers
    :param files: iterable of files, each passed to ``DataImporter.load_raw``
    :param folder_name: name of the folder to create; may be empty
    :return: tuple of (new folder, folder name, dict of raw file accession -> file)
    """
    data_importer = DataImporter(connection)
    files_util = FilesUtil(connection)
    uploaded_folder = files_util.get_special_folder(SpecialFolders.UPLOADED)

    if not folder_name:
        folder_name = datetime.now().strftime('Upload %d.%m.%y %H:%M:%S')
    target_folder = files_util.create_folder(
        folder_name,
        parent=uploaded_folder,
        description='Files uploaded by genestack-uploader',
    )

    accession_file_map = {}
    for path in files:
        raw_accession = data_importer.load_raw(path)
        # relocate the raw file: attach to the new folder, detach from the uploaded folder
        files_util.link_file(raw_accession, target_folder)
        files_util.unlink_file(raw_accession, uploaded_folder)
        accession_file_map[raw_accession] = path
    return target_folder, folder_name, accession_file_map
def recognize_files(connection, accession_file_map, new_folder):
    """
    Run file recognition on uploaded raw files and report the outcome.

    The ``genestack/upload`` application is asked to recognize the given raw
    files and to create files from the recognized groups; the created files
    are printed grouped by their ``kind``. Raw files that were not part of any
    recognized group are listed and moved from ``new_folder`` into a newly
    created "Unrecognized files" subfolder.

    :param connection: connection providing ``application()``; also used to build a ``FilesUtil``
    :param accession_file_map: mapping of raw file accession to the name of the uploaded file
    :param new_folder: folder that currently contains the raw files
    """
    fu = FilesUtil(connection)

    application = connection.application('genestack/upload')
    # Pass a real list: on Python 3 ``dict.keys()`` is a view object, which is
    # not a sequence (presumably ``invoke`` serializes its arguments -- a view
    # would not survive JSON encoding).
    recognised_files = application.invoke('recognizeGroupsByAccession',
                                          list(accession_file_map))

    recognized_accessions = {
        info['accession']
        for recognised_group in recognised_files
        for sources in recognised_group['sourceFileInfos'].values()
        for info in sources
    }

    created_files = application.invoke('createFiles', recognised_files, [],
                                       None)
    # groupby() only merges adjacent items, hence the sort on the same key first
    groups = sorted(created_files['files'], key=itemgetter('kind'))
    for name, group in groupby(groups, key=itemgetter('kind')):
        print(name)
        # maybe sort by filename before printing a group?
        for f in group:
            print('\t%s / %s' % (f['accession'], f['name']))

    # sorted() keeps the report and the move order deterministic between runs
    unrecognized_file_infos = sorted(
        set(accession_file_map) - recognized_accessions)
    if not unrecognized_file_infos:
        return

    print('Unrecognized Raw Files')
    for accession in unrecognized_file_infos:
        file_name = accession_file_map[accession]
        # Only byte strings need decoding; a text string has no ``decode`` on Python 3.
        if isinstance(file_name, bytes):
            file_name = file_name.decode('utf-8')
        print('\t%s / %s' % (accession, file_name))

    # move unrecognized files to new folder
    unrecognized_folder = fu.create_folder("Unrecognized files",
                                           parent=new_folder)
    for accession in unrecognized_file_infos:
        # link first, then unlink, so the file is never left without a parent folder
        fu.link_file(accession, unrecognized_folder)
        fu.unlink_file(accession, new_folder)
    print('Unrecognized files moved to %s / %s' %
          (unrecognized_folder, 'Unrecognized files'))
Beispiel #6
0
class BatchFilesCreator(object):
    """
    Create one output file per source file through a CLA and collect the
    results in a dedicated subfolder of a base folder.
    """

    def __init__(self, cla, base_folder, friendly_name, custom_args=None):
        """
        Constructor of the general batch files creator, to create multiple files from a CLA.

        :param cla: a ``CLApplication`` object, wrapper for the corresponding CLA
        :param base_folder: accession of the base folder where the pipeline files will be organised into subfolders
        :param friendly_name: user-friendly name of the files produced by the app ; used in the on-screen statements
        and in the name of the project subfolders
        :param custom_args: list of custom command-line argument strings for the files. Default is ``None``
        """

        self._cla = cla
        self._files_util = FilesUtil(cla.connection)
        self._base_folder = base_folder
        self._friendly_name = friendly_name
        self._custom_args = custom_args

    def create_files(self, sources):
        """
        Create an output file for every source and link it into a new output folder.

        :param sources: iterable of source files, each passed to ``_create_output_file``
        :return: list of the created output files, in the order of ``sources``
        """
        # Materialise once so the total is known even when an iterator is passed.
        sources = list(sources)
        total = len(sources)

        # Single-argument print(...) behaves the same as a print statement on
        # Python 2 and is valid on Python 3.
        print("Creating %s files..." % self._friendly_name)
        output_folder = self._files_util.create_folder(
            self._friendly_name, parent=self._base_folder)
        output_files = []
        for i, source in enumerate(sources, 1):
            output = self._create_output_file(source)
            self._files_util.link_file(output, output_folder)
            # progress is "current / number of sources" (not the length of the output value)
            print("Created %s file %s (%d/%d)" % (self._friendly_name, output,
                                                  i, total))
            output_files.append(output)
        return output_files

    # this method can be overridden in child classes to allow for more complex file creation logic
    def _create_output_file(self, source):
        """
        Create a single output file from ``source`` and apply the custom arguments, if any.

        :param source: source file handed to the CLA's ``create_file``
        :return: the value returned by the CLA's ``create_file``
        """
        output = self._cla.create_file(source)
        if self._custom_args:
            self._cla.change_command_line_arguments(output, self._custom_args)
        return output
Beispiel #7
0
# Script section: collect the files found in a source folder and prepare
# per-application grouping inside a new "Organized files" subfolder.
# NOTE: print(...) with a single argument behaves the same as a print
# statement on Python 2 and is valid on Python 3.
source_folder = args.folder
move_files = args.move_files

print("Connecting to Genestack...")

# get connection and application handlers
connection = get_connection(args)
files_util = FilesUtil(connection)

print("Collecting files...")
files = files_util.get_file_children(source_folder)
files_count = len(files)
print("Found %d files to organise. Retrieving infos..." % files_count)
infos = files_util.get_complete_infos(files)

output_folder = files_util.create_folder("Organized files",
                                         parent=source_folder)
# presumably maps a group (application) name to its folder -- filled further down
grouping_folders = {}

for i, entry in enumerate(infos, 1):
    accession = entry['accession']
    print("Processing file %d of %d (%s)..." % (i, files_count, accession))

    # use either application name, application ID or "Unknown application" (in this order of preference)
    app_entry = entry.get('application')
    if app_entry:
        application = app_entry.get('name') or app_entry.get(
            'id', "Unknown application")
    else:
        application = "Unknown application"

    # if there is a folder for this group, we add the file to it ;
Beispiel #8
0
        help='Name of the Genestack folder where to put the output files')
    parser.add_argument(
        '--ref-genome',
        help='Accession of the reference genome to use for the mapping step')

    args = parser.parse_args()
    project_name = args.name

    print "Connecting to Genestack..."

    # get connection and create output folder
    connection = get_connection(args)
    files_util = FilesUtil(connection)
    created_files_folder = files_util.get_special_folder(
        SpecialFolders.CREATED)
    project_folder = files_util.create_folder(project_name,
                                              parent=created_files_folder)

    # create application wrappers and batch files creators
    bowtie_app = BowtieApplication(connection)
    mapped_qc_app = AlignedReadsQC(connection)
    variant_calling_app = VariationCaller2Application(connection)

    bowtie_creator = BowtieBatchFilesCreator(bowtie_app,
                                             project_folder,
                                             "Mapped Reads",
                                             ref_genome=args.ref_genome)
    mapped_qc_creator = BatchFilesCreator(mapped_qc_app, project_folder,
                                          "Mapped Reads QC")
    vc_creator = BatchFilesCreator(variant_calling_app,
                                   project_folder,
                                   "Variants",
    parser.add_argument('raw_reads_folder',
                        help='Genestack accession of the folder containing the raw reads files to process')
    parser.add_argument('--name', default="New Project",
                        help='Name of the Genestack folder where to put the output files')
    parser.add_argument('--ref-genome', help='Accession of the reference genome to use for the mapping step')

    args = parser.parse_args()
    project_name = args.name

    print('Connecting to Genestack...')

    # get connection and create output folder
    connection = get_connection(args)
    files_util = FilesUtil(connection)
    created_files_folder = files_util.get_special_folder(SpecialFolders.CREATED)
    project_folder = files_util.create_folder(project_name, parent=created_files_folder)

    # create application wrappers and batch files creators
    bowtie_app = BowtieApplication(connection)
    mapped_qc_app = AlignedReadsQC(connection)
    variant_calling_app = VariationCaller2Application(connection)

    bowtie_creator = BowtieBatchFilesCreator(bowtie_app, project_folder, "Mapped Reads", ref_genome=args.ref_genome)
    mapped_qc_creator = BatchFilesCreator(mapped_qc_app, project_folder, "Mapped Reads QC")
    vc_creator = BatchFilesCreator(variant_calling_app, project_folder, "Variants", custom_args=VC_ARGUMENTS_NO_INDELS)

    # collect files
    print('Collecting raw reads...')
    raw_reads = files_util.get_file_children(args.raw_reads_folder)
    files_count = len(raw_reads)
    print('Found %d files to process' % files_count)
# Script section: collect the files found in a source folder and create the
# "Organized files" subfolder they will be grouped into.
# NOTE: print(...) with a single argument behaves the same as a print
# statement on Python 2 and is valid on Python 3.
source_folder = args.folder
move_files = args.move_files

print("Connecting to Genestack...")

# get connection and application handlers
connection = get_connection(args)
files_util = FilesUtil(connection)

print("Collecting files...")
files = files_util.get_file_children(source_folder)
files_count = len(files)
print("Found %d files to organise. Retrieving infos..." % files_count)
infos = files_util.get_complete_infos(files)

output_folder = files_util.create_folder("Organized files", parent=source_folder)
# presumably maps a group (application) name to its folder -- filled further down
grouping_folders = {}

for i, entry in enumerate(infos, 1):
    accession = entry['accession']
    print "Processing file %d of %d (%s)..." % (i, files_count, accession)

    # use either application name, application ID or "Unknown application" (in this order of preference)
    app_entry = entry.get('application')
    if app_entry:
        application = app_entry.get('name') or app_entry.get('id', "Unknown application")
    else:
        application = "Unknown application"

    # if there is a folder for this group, we add the file to it ;
    # otherwise, we create one, add it to our dictionary of folders and add the file to it