Esempio n. 1
0
def _extend_array_with_iter(array, matrices):
    """Extend ``array`` with the matrices from ``matrices``, batch by batch.

    The first matrix is used to estimate how many matrices fit in
    ``AVAILABLE_MEM``.  That number (never less than one) is the group size
    given to ``group_items``, and every resulting group is handed to
    ``_extend_array``.

    Args:
        array: destination, passed through unchanged to ``_extend_array``.
        matrices: iterable yielding the matrices to append.

    Returns:
        None.  Nothing is done when ``first`` raises ``ValueError``
        (presumably its way of signalling an empty iterable).
    """
    try:
        matrix = first(matrices)
    except ValueError:
        return

    # Put the peeked matrix back in front of the remaining ones.
    matrices = chain([matrix], matrices)

    # sys.getsizeof() is shallow: for an array that does not own its buffer
    # (e.g. a view) it reports little more than the object header, which
    # would make the group far larger than the memory budget allows.  When
    # the matrix exposes ``nbytes`` use the larger of the two estimates.
    matrix_size = max(sys.getsizeof(matrix), getattr(matrix, 'nbytes', 0))

    # Always process at least one matrix per group, even when a single
    # matrix is bigger than the budget.
    mats_in_group = max(1, math.floor(AVAILABLE_MEM / matrix_size))
    for mats_in_mem in group_items(matrices, mats_in_group):
        _extend_array(array, mats_in_mem)
Esempio n. 2
0
def _extend_array_with_iter(array, matrices):
    """Extend ``array`` with the matrices from ``matrices``, batch by batch.

    The first matrix is used to estimate how many matrices fit in
    ``AVAILABLE_MEM``.  That number (never less than one) is the group size
    given to ``group_items``, and every resulting group is handed to
    ``_extend_array``.

    Args:
        array: destination, passed through unchanged to ``_extend_array``.
        matrices: iterable yielding the matrices to append.

    Returns:
        None.  Nothing is done when ``first`` raises ``ValueError``
        (presumably its way of signalling an empty iterable).
    """
    try:
        matrix = first(matrices)
    except ValueError:
        return

    # Put the peeked matrix back in front of the remaining ones.
    matrices = chain([matrix], matrices)

    # sys.getsizeof() is shallow: for an array that does not own its buffer
    # (e.g. a view) it reports little more than the object header, which
    # would make the group far larger than the memory budget allows.  When
    # the matrix exposes ``nbytes`` use the larger of the two estimates.
    matrix_size = max(sys.getsizeof(matrix), getattr(matrix, 'nbytes', 0))

    # Always process at least one matrix per group, even when a single
    # matrix is bigger than the budget.
    mats_in_group = max(1, math.floor(AVAILABLE_MEM / matrix_size))
    for mats_in_mem in group_items(matrices, mats_in_group):
        _extend_array(array, mats_in_mem)
Esempio n. 3
0
    def chunks(self):
        """Yield the parser's variations as successive ``VariationsArrays``.

        The variations of ``self.vars_parser`` are grouped with
        ``group_items`` into groups of ``self.vars_in_chunk``.  Every SNP
        that is not ``None`` is converted to a dict and written into the
        chunk matrices; the matrices are then trimmed to the number of SNPs
        actually stored and wrapped in a ``VariationsArrays`` together with
        the decoded sample names and, when the parser provides it, the
        prepared metadata.

        Yields:
            VariationsArrays: one object per group of variations.
        """
        vars_parser = self.vars_parser
        vars_in_chunk = self.vars_in_chunk
        snps = vars_parser.variations

        # State shared by all the chunks; it is passed to the helpers, which
        # presumably fill it in as new fields are found.
        field_paths = {'filter': {}, 'calls': {}, 'info': {}}
        missing_values = {}
        filter_field_names = set(getattr(vars_parser, 'metadata', {}).get('FILTER', {}).keys())

        exemplar_matrices_for_metadata = {}
        for chunk in group_items(snps, vars_in_chunk):
            chunk = list(chunk)
            n_snps_in_chunk = len(chunk)
            matrices = {}
            n_non_none_snps = 0
            for snp in chunk:
                # None entries are skipped (presumably the padding that
                # group_items adds to an incomplete last group).
                if snp is None:
                    continue
                snp_dict = self._snp_tuple_to_dict(snp, field_paths,
                                                   filter_field_names)
                self._put_snp_in_matrices(matrices, snp_dict, n_non_none_snps,
                                          n_snps_in_chunk, missing_values,
                                          exemplar_matrices_for_metadata)
                n_non_none_snps += 1

            # cut the empty snps from the end
            if n_non_none_snps < n_snps_in_chunk:
                matrices = {path: mat[:n_non_none_snps, ...]
                            for path, mat in matrices.items()}

            varis = VariationsArrays()
            for path, mat in matrices.items():
                varis[path] = mat

            varis.samples = [sample.decode() for sample in vars_parser.samples]

            # A parser without a ``metadata`` attribute is tolerated, but the
            # try block covers only the attribute access so that an
            # AttributeError raised while preparing or storing the metadata
            # is not silently swallowed.
            try:
                raw_metadata = vars_parser.metadata
            except AttributeError:
                pass
            else:
                varis._set_metadata(_prepare_metadata(raw_metadata))

            yield varis
Esempio n. 4
0
    def chunks(self):
        """Yield the parser's variations as successive ``VariationsArrays``.

        The matrix layout (shape, dtype, missing value, base path and field
        of every path) is obtained once from ``_build_matrix_structures``.
        The variations are then grouped with ``group_items`` into groups of
        ``self.vars_in_chunk``; for every group a fresh set of matrices
        filled with the missing values is created and each SNP is written
        into row ``idx`` of every matrix.  Only the rows of the SNPs that
        were stored completely are kept in the yielded object.

        ``self.log`` is updated along the way: ``'variations_processed'``
        and ``'variations_stored'`` are counters and ``'data_no_fit'`` maps
        a path to the number of values that did not fit in its matrix.

        Yields:
            VariationsArrays: one object per group, with the samples and
            metadata of the parser set.

        Raises:
            RuntimeError: when a value overflows a 2-D or 3-D matrix and
                ``self.hdf5.ignore_overflows`` is false, or when a matrix
                has an unsupported number of dimensions.
            ValueError: when a value cannot be stored in its matrix for any
                other reason.
        """
        vars_parser = self.vars_parser
        hdf5 = self.hdf5
        vars_in_chunk = self.vars_in_chunk
        kept_fields = self.kept_fields
        ignored_fields = self.ignored_fields
        max_field_lens = self.max_field_lens
        max_field_str_lens = self.max_field_str_lens
        log = self.log

        ignore_overflows = hdf5.ignore_overflows
        snps = vars_parser.variations

        mat_structure = _build_matrix_structures(vars_parser, vars_in_chunk,
                                                 kept_fields, ignored_fields,
                                                 hdf5.ignore_undefined_fields,
                                                 log, max_field_lens,
                                                 max_field_str_lens)
        for chunk in group_items(snps, vars_in_chunk):
            # Fresh matrices, pre-filled with the missing value of each path.
            mats = {}
            for path, struct in mat_structure.items():
                mat = numpy.full(struct['shape'], struct['missing_value'],
                                 struct['dtype'])
                mats[path] = mat
            good_snp_idxs = []
            for idx, snp in enumerate(chunk):
                # A None marks the end of the real SNPs in the group
                # (presumably the padding added by group_items).
                if snp is None:
                    break
                log['variations_processed'] += 1

                # snp is indexed positionally: 0 chrom, 1 pos, 2 id, 3 ref,
                # 4 alt, 5 qual, 6 filters, 7 info, 8 calls.
                filters = snp[6]
                info = snp[7]
                calls = snp[8]
                info = dict(info) if info else {}
                calls = dict(calls) if calls else {}
                ignore_snp = False
                for path, struct in mat_structure.items():
                    basepath = struct['basepath']
                    if path == '/variations/chrom':
                        item = snp[0]
                    elif path == '/variations/pos':
                        item = snp[1]
                    elif path == '/variations/id':
                        item = snp[2]
                    elif path == '/variations/ref':
                        item = snp[3]
                    elif path == '/variations/alt':
                        item = snp[4]
                    elif path == '/variations/qual':
                        item = snp[5]
                    elif basepath == 'FILTER':
                        if struct['field'] == b'PASS':
                            item = True if filters == [] else False
                        else:
                            item = struct['field'] in filters
                    elif basepath == 'INFO':
                        item = info.get(struct['field'], None)
                    elif basepath == 'CALLS':
                        item = calls.get(struct['field'], None)
                    else:
                        # Unknown kind of path: store nothing instead of
                        # reusing the item left over from the previous path.
                        item = None
                    shape = struct['shape']

                    if item is not None:
                        n_dims = len(shape)
                        mat = mats[path]
                        if n_dims == 1:
                            try:
                                mat[idx] = item
                            except ValueError:
                                if hasattr(item, '__len__'):
                                    if len(item) == 1:
                                        # One-element sequence: store its
                                        # only value.
                                        mat[idx] = item[0]
                                    else:
                                        # Like in the 2-D and 3-D cases the
                                        # SNP is discarded; otherwise it
                                        # would be stored with every field
                                        # after this one left at its missing
                                        # value because of the break.
                                        ignore_snp = True
                                        log['data_no_fit'][path] += 1
                                        break
                                else:
                                    raise
                        elif n_dims == 2:
                            if len(item) > mat.shape[1]:
                                if ignore_overflows:
                                    ignore_snp = True
                                    log['data_no_fit'][path] += 1
                                    break
                                else:
                                    msg = 'Data no fit in field:'
                                    msg += path
                                    msg += '\n'
                                    msg += str(item)
                                    raise RuntimeError(msg)
                            try:
                                mat[idx, 0:len(item)] = item
                            except (ValueError, TypeError):
                                # Retry taking the first element of every
                                # value and replacing the None values with
                                # the missing value.
                                missing_val = struct['missing_value']
                                item = [missing_val if val is None else val[0]
                                        for val in item]
                                mat[idx, 0:len(item)] = item

                        elif n_dims == 3:
                            # The length of the first entry is taken as the
                            # length of all of them.
                            if len(item[0]) > mat.shape[2]:
                                if ignore_overflows:
                                    ignore_snp = True
                                    log['data_no_fit'][path] += 1
                                    break
                                else:
                                    msg = 'Data no fit in field:'
                                    msg += path
                                    msg += '\n'
                                    msg += str(item)
                                    raise RuntimeError(msg)
                            try:
                                mat[idx, :, 0:len(item[0])] = item
                            except ValueError:
                                print(path, item)
                                raise

                        else:
                            raise RuntimeError('Fixme, we should not be here.')
                if not ignore_snp:
                    good_snp_idxs.append(idx)
                    log['variations_stored'] += 1

            varis = VariationsArrays()
            # Keep only the rows of the SNPs that were stored completely.
            for path, mat in mats.items():
                varis[path] = mat[good_snp_idxs]
            samples = [sample.decode() for sample in vars_parser.samples]
            varis.samples = samples

            metadata = _prepare_metadata(vars_parser.metadata)
            varis._set_metadata(metadata)

            yield varis