Example #1
def assembly_data_to_rows(data):
    """Converts assembly data dictionary to text rows"""
    rows = []
    data_key  = "assembly_data"
    kbase_key = "kbase_assembly_input"
    lib_key   = "file_sets"
    info_key  = "file_infos"

    if data_key in data:
        data = data[data_key]
    elif kbase_key in data:
        data = kb_to_asm(data[kbase_key])

    for lib in data.get(lib_key, []):
        libtype = lib.get("type", "unknown")
        files = []
        for info in lib.get(info_key, []):
            filename = info.get("filename", None)
            if not filename:
                filename = info.get("direct_url", "")
                filename = re.sub(r'.*/', '', filename)
            filesize = info.get("filesize", None)
            filesize = " (%s)" % sizeof_fmt(filesize) if filesize else ""
            files.append("%s%s" % (filename, filesize))
        rows.append([libtype, " ".join(files)])

    return rows
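
This snippet also depends on the re module and two module-level helpers, sizeof_fmt and kb_to_asm, that are not shown here. A minimal, hypothetical invocation could look like the sketch below; the sizeof_fmt given is just the common human-readable-bytes recipe, an assumption about what the real helper does, and the sample values are placeholders.

import re

def sizeof_fmt(num, suffix="B"):
    # assumed helper: render a byte count as a human-readable string
    for unit in ["", "K", "M", "G", "T"]:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, "P", suffix)

sample = {
    "assembly_data": {
        "file_sets": [
            {"type": "paired",
             "file_infos": [
                 {"filename": "reads_1.fastq", "filesize": 2048000},
                 {"direct_url": "http://example.org/data/reads_2.fastq"},
             ]},
        ]
    }
}

print(assembly_data_to_rows(sample))
# e.g. [['paired', 'reads_1.fastq (2.0MB) reads_2.fastq']]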
Example #2
    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        token = params['oauth_token']
        uid = params['_id']

        ##### Get data from ID #####
        data_doc = self.metadata.get_data_docs(params['ARASTUSER'], params['data_id'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))

        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        try:
            os.makedirs(filepath)
        except:
            pass

        ### TODO Garbage collect ###
        download_url = 'http://{}'.format(self.shockurl)
        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            if file_set['type'] == 'paired_url':
                file_set['type'] = 'paired'
            elif file_set['type'] == 'single_url':
                file_set['type'] = 'single'
            elif file_set['type'] == 'reference_url':
                file_set['type'] = 'reference'
            file_set['files'] = [] #legacy
            for file_info in file_set['file_infos']:
                #### File is stored on Shock
                if file_info['filename']:
                    local_file = os.path.join(filepath, file_info['filename'])
                    if os.path.exists(local_file):
                        logging.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_shock(download_url, user, token, 
                                                   file_info['shock_id'], filepath)
                elif file_info['direct_url']:
                    local_file = os.path.join(filepath, os.path.basename(file_info['direct_url']))
                    if os.path.exists(local_file):
                        logging.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_url(file_info['direct_url'], filepath)
                file_info['local_file'] = local_file
                file_set['files'].append(local_file) #legacy
            all_files.append(file_set)
        return datapath, all_files                    
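
The message body this method parses is not shown on this page; judging from the fields read above, it would look roughly like the sketch below (every concrete value is a placeholder, not taken from the snippet).

import json

body = json.dumps({
    "ARASTUSER": "someuser",    # user name; also the per-user directory under self.datapath
    "oauth_token": "<token>",   # forwarded to the Shock download
    "_id": "<job uid>",         # job id passed to metadata.update_job
    "data_id": 42,              # looked up via metadata.get_data_docs
})

# The data document fetched for data_id is expected to carry either
# 'kbase_assembly_input' or 'assembly_data', for example:
# {"file_sets": [{"type": "paired_url",
#                 "file_infos": [{"filename": "reads.fq",
#                                 "shock_id": "<node id>",
#                                 "direct_url": ""}]}]}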
Example #3
    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        token = params['oauth_token']
        uid = params['_id']

        ##### Get data from ID #####
        data_doc = self.metadata.get_doc_by_data_id(params['data_id'],
                                                    params['ARASTUSER'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))

        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(
                data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        try:
            os.makedirs(filepath)
        except:
            pass

        ### TODO Garbage collect ###
        download_url = 'http://{}'.format(self.shockurl)
        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            file_set['files'] = []  #legacy
            for file_info in file_set['file_infos']:
                local_file = os.path.join(filepath, file_info['filename'])
                if os.path.exists(local_file):
                    logging.info(
                        "Requested data exists on node: {}".format(local_file))
                else:
                    local_file = self.download(download_url, user, token,
                                               file_info['shock_id'], filepath)
                file_info['local_file'] = local_file
                file_set['files'].append(local_file)  #legacy
            all_files.append(file_set)
        return datapath, all_files
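
The self.download(...) call above fetches the file from a Shock node, but that method is not part of this snippet. Below is a rough, self-contained sketch of what such a download typically involves; download_shock_sketch is hypothetical, and the /node/<id>?download route plus the OAuth authorization header are assumptions about the Shock service, not taken from this page.

import os
import requests

def download_shock_sketch(shock_url, user, token, node_id, outdir):
    # Hypothetical stand-in for self.download(); 'user' is accepted only to
    # mirror the call above and is unused in this sketch.
    url = "{}/node/{}?download".format(shock_url, node_id)
    headers = {"Authorization": "OAuth {}".format(token)}
    resp = requests.get(url, headers=headers, stream=True)
    resp.raise_for_status()
    local_file = os.path.join(outdir, node_id)
    with open(local_file, "wb") as fh:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            fh.write(chunk)
    return local_file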
Example #4
def assembly_data_to_rows(data):
    rows = []
    data_key  = "assembly_data"
    kbase_key = "kbase_assembly_input"
    lib_key   = "file_sets"
    info_key  = "file_infos"
    
    if data_key in data:
        data = data[data_key]
    elif kbase_key in data:
        data = kb_to_asm(data[kbase_key])

    for lib in data.get(lib_key, []):
        libtype = lib.get("type", "unknown")
        files = []
        for info in lib.get(info_key, []):
            filename = info.get("filename", "")
            filesize = info.get("filesize", None)
            filesize = " (%s)" % sizeof_fmt(filesize) if filesize else ""
            files.append("%s%s" % (filename, filesize))
        rows.append([libtype, " ".join(files)])
    
    return rows
Example #5
    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        job_id = params['job_id']
        data_id = params['data_id']
        token = params['oauth_token']
        uid = params['_id']

        self.gc_lock.acquire()
        try:
            self.garbage_collect(self.datapath, self.min_free_space, user, job_id, data_id)
        except:
            logger.error('Unexpected error in GC.')
            raise
        finally:
            self.gc_lock.release()

        ##### Get data from ID #####
        data_doc = self.metadata.get_data_docs(params['ARASTUSER'], params['data_id'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))
        logger.debug('data_doc = {}'.format(data_doc))
        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        with ignored(OSError):
            os.makedirs(filepath)
            touch(filepath)

        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            if file_set['type'] == 'paired_url':
                file_set['type'] = 'paired'
            elif file_set['type'] == 'single_url':
                file_set['type'] = 'single'
            elif file_set['type'] == 'reference_url':
                file_set['type'] = 'reference'
            file_set['files'] = [] #legacy
            for file_info in file_set['file_infos']:
                #### File is stored on Shock
                if file_info['filename']:
                    local_file = os.path.join(filepath, file_info['filename'])
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_shock(file_info['shock_url'], user, token,
                                                   file_info['shock_id'], filepath)

                elif file_info['direct_url']:
                    local_file = os.path.join(filepath, os.path.basename(file_info['direct_url']))
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_url(file_info['direct_url'], filepath, token=token)
                file_info['local_file'] = local_file
                if file_set['type'] == 'single' and asm.is_long_read_file(local_file):
                    if 'tags' not in file_set:
                        file_set['tags'] = []
                    if 'long_read' not in file_set['tags']:
                        file_set['tags'].append('long_read') # pacbio or nanopore reads
                file_set['files'].append(local_file) #legacy
            all_files.append(file_set)
        return datapath, all_files
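
The ignored context manager and the touch helper used above are not defined on this page. They are commonly written along the lines sketched below; this is an assumption made only so the snippet reads on its own (on Python 3.4+, contextlib.suppress covers the same ground as ignored).

import os
from contextlib import contextmanager

@contextmanager
def ignored(*exceptions):
    # assumed helper: silently swallow the listed exception types
    try:
        yield
    except exceptions:
        pass

def touch(path, times=None):
    # assumed helper: refresh the access/modification times of an existing path
    os.utime(path, times)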
Example #6
    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        job_id = params['job_id']
        data_id = params['data_id']
        token = params['oauth_token']
        uid = params['_id']

        self.gc_lock.acquire()
        try:
            self.garbage_collect(self.datapath, self.min_free_space, user,
                                 job_id, data_id)
        except:
            logger.error('Unexpected error in GC.')
            raise
        finally:
            self.gc_lock.release()

        ##### Get data from ID #####
        data_doc = self.metadata.get_data_docs(params['ARASTUSER'],
                                               params['data_id'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))
        logger.debug('data_doc = {}'.format(data_doc))
        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(
                data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        with ignored(OSError):
            os.makedirs(filepath)
            touch(filepath)

        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            if file_set['type'] == 'paired_url':
                file_set['type'] = 'paired'
            elif file_set['type'] == 'single_url':
                file_set['type'] = 'single'
            elif file_set['type'] == 'reference_url':
                file_set['type'] = 'reference'
            file_set['files'] = []  #legacy
            for file_info in file_set['file_infos']:
                #### File is stored on Shock
                if file_info['filename']:
                    local_file = os.path.join(filepath, file_info['filename'])
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(
                            local_file))
                    else:
                        local_file = self.download_shock(
                            file_info['shock_url'], user, token,
                            file_info['shock_id'], filepath)

                elif file_info['direct_url']:
                    local_file = os.path.join(
                        filepath, os.path.basename(file_info['direct_url']))
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(
                            local_file))
                    else:
                        local_file = self.download_url(file_info['direct_url'],
                                                       filepath,
                                                       token=token)
                file_info['local_file'] = local_file
                if file_set['type'] == 'single' and asm.is_long_read_file(
                        local_file):
                    if 'tags' not in file_set:
                        file_set['tags'] = []
                    if 'long_read' not in file_set['tags']:
                        file_set['tags'].append(
                            'long_read')  # pacbio or nanopore reads
                file_set['files'].append(local_file)  #legacy
            all_files.append(file_set)
        return datapath, all_files