Esempio n. 1
0
 def __init__(self, args, unknown_args):
     """Prepare logging, input files, result storage and progress reporting.

     Args:
         args: parsed command-line options; this method reads ``verbosity``,
             ``output``, ``cmd``, ``data``, ``missing_unlisted``, ``limit``,
             ``append`` and ``replicates``.
         unknown_args: additional arguments, stored unchanged on the instance.
     """
     self.args = args
     self.unknown_args = unknown_args
     # 1 selects the SFSFile reader below; any other value selects GFile
     self.option = checkInput(args)
     # First argument is verbosity - 1 clamped to [0, 2]; ``fn`` is the first
     # output name without its extension.
     # Verbosity is compared by value: ``is not 0`` tests object identity,
     # which is implementation-dependent for ints (SyntaxWarning on 3.8+).
     env.logger = getLogger(max(min(args.verbosity - 1, 2), 0),
                            fn=os.path.splitext(args.output[0])[0],
                            fv=2 if args.verbosity != 0 else 0)
     # Record the command line, framed by '=' rulers of at most 100 chars
     env.logger.debug('\n{0}\n{1}\n{0}'.format(
         "=" * min(len(args.cmd), 100), args.cmd))
     # At verbosity 1 messages go through printinfo instead of the logger
     self.logger = env.logger.info if args.verbosity != 1 else printinfo
     #
     self.logger('Loading data from [{}] ...'.format(args.data))
     if self.option == 1:
         self.file = SFSFile(args.data)
     else:
         self.file = GFile(args.data)
     self.groups = self.file.getnames()
     self.logger('{:,d} units found'.format(len(self.groups)))
     # load non-missing data
     # to annotate to each variant position whether or not it is missing from association analysis
     # name it chip_file because it mimics the behavior of exome chip design
     if args.missing_unlisted:
         self.chip_file = SFSFile(args.missing_unlisted)
     else:
         self.chip_file = None
     # set limit: a truthy --limit is clamped to [1, number of units];
     # otherwise every unit is analyzed
     if self.args.limit:
         self.limit = min(max(1, args.limit), len(self.groups))
         self.logger('{:,d} units will be analyzed'.format(self.limit))
     else:
         self.limit = len(self.groups)
     # append to existing results only when requested, otherwise overwrite
     self.result = ResultManager(args.output,
                                 action='w' if not args.append else 'a')
     if self.args.verbosity == 1:
         widgets = [
             FormatLabel('scanning: unit %(value)d - '),
             Percentage(), ' ',
             Bar('>'), ' ',
             ETA()
         ]
         self.pbar = ProgressBar(widgets=widgets,
                                 maxval=self.limit,
                                 term_width=get_terminal_size()[0] -
                                 5).start()
     else:
         # use each group's progress bar or no progress bar at all
         self.pbar = ProgressBarNull()
     # buffer collecting every input dict in a list; only used when
     # replicates is negative, otherwise None
     self.data_buffer = [] if self.args.replicates < 0 else None
Esempio n. 2
0
 def __init__(self, args, unknown_args):
     """Configure logging, open the input files and set up result/progress objects.

     Args:
         args: parsed command-line options. Attributes used here:
             ``verbosity``, ``output``, ``cmd``, ``data``,
             ``missing_unlisted``, ``limit``, ``append``, ``replicates``.
         unknown_args: extra arguments kept as-is in ``self.unknown_args``.
     """
     self.args = args
     self.unknown_args = unknown_args
     # option 1 -> data is read with SFSFile, anything else -> GFile
     self.option = checkInput(args)
     #
     # ``fv`` must be chosen by comparing the value of verbosity; the former
     # ``is not 0`` was an identity test on an int literal, which only works
     # by accident of integer caching and warns on Python 3.8+.
     env.logger = getLogger(
         max(min(args.verbosity - 1, 2), 0),
         fn=os.path.splitext(args.output[0])[0],
         fv=2 if args.verbosity != 0 else 0,
     )
     # log the command line between two '=' rulers (capped at 100 characters)
     env.logger.debug("\n{0}\n{1}\n{0}".format("=" * min(len(args.cmd), 100), args.cmd))
     # verbosity 1 routes user messages through printinfo rather than the logger
     self.logger = env.logger.info if args.verbosity != 1 else printinfo
     #
     self.logger("Loading data from [{}] ...".format(args.data))
     if self.option == 1:
         self.file = SFSFile(args.data)
     else:
         self.file = GFile(args.data)
     self.groups = self.file.getnames()
     self.logger("{:,d} units found".format(len(self.groups)))
     # load non-missing data
     # to annotate to each variant position whether or not it is missing from association analysis
     # name it chip_file because it mimics the behavior of exome chip design
     if args.missing_unlisted:
         self.chip_file = SFSFile(args.missing_unlisted)
     else:
         self.chip_file = None
     # set limit: keep a requested limit within [1, number of units]
     if self.args.limit:
         self.limit = min(max(1, args.limit), len(self.groups))
         self.logger("{:,d} units will be analyzed".format(self.limit))
     else:
         self.limit = len(self.groups)
     # "a" appends to existing output, "w" starts over
     self.result = ResultManager(args.output, action="w" if not args.append else "a")
     if self.args.verbosity == 1:
         widgets = [FormatLabel("scanning: unit %(value)d - "), Percentage(), " ", Bar(">"), " ", ETA()]
         self.pbar = ProgressBar(widgets=widgets, maxval=self.limit, term_width=get_terminal_size()[0] - 5).start()
     else:
         # use each group's progress bar or no progress bar at all
         self.pbar = ProgressBarNull()
     # buffer that accumulates all input dicts in a list (None unless
     # replicates is negative)
     self.data_buffer = [] if self.args.replicates < 0 else None
Esempio n. 3
0
class Executor:
    """Drive a scan over the units of an input data file.

    Each unit is either passed straight to ``Calculator`` or, when
    ``args.replicates`` is negative, collected in ``data_buffer`` and handed
    to a single ``Calculator`` once the scan is over.
    """

    def __init__(self, args, unknown_args):
        """Prepare logging, input files, result storage and progress reporting.

        Args:
            args: parsed command-line options; this method reads
                ``verbosity``, ``output``, ``cmd``, ``data``,
                ``missing_unlisted``, ``limit``, ``append`` and ``replicates``.
            unknown_args: additional arguments, stored unchanged.
        """
        self.args = args
        self.unknown_args = unknown_args
        # 1 selects the SFSFile reader below; any other value selects GFile
        self.option = checkInput(args)
        # Compare verbosity by value: "is not 0" is an identity test on an
        # int literal (implementation-dependent, SyntaxWarning on 3.8+).
        env.logger = getLogger(max(min(args.verbosity - 1, 2), 0),
                               fn=os.path.splitext(args.output[0])[0],
                               fv=2 if args.verbosity != 0 else 0)
        env.logger.debug('\n{0}\n{1}\n{0}'.format("=" * min(len(args.cmd), 100), args.cmd))
        # At verbosity 1 messages go through printinfo instead of the logger
        self.logger = env.logger.info if args.verbosity != 1 else printinfo
        #
        self.logger('Loading data from [{}] ...'.format(args.data))
        if self.option == 1:
            self.file = SFSFile(args.data)
        else:
            self.file = GFile(args.data)
        self.groups = self.file.getnames()
        self.logger('{:,d} units found'.format(len(self.groups)))
        # load non-missing data
        # to annotate to each variant position whether or not it is missing from association analysis
        # name it chip_file because it mimics the behavior of exome chip design
        if args.missing_unlisted:
            self.chip_file = SFSFile(args.missing_unlisted)
        else:
            self.chip_file = None
        # set limit: a truthy limit is clamped to [1, number of units]
        if self.args.limit:
            self.limit = min(max(1, args.limit), len(self.groups))
            self.logger('{:,d} units will be analyzed'.format(self.limit))
        else:
            self.limit = len(self.groups)
        self.result = ResultManager(args.output, action='w' if not args.append else 'a')
        if self.args.verbosity == 1:
            widgets = [FormatLabel('scanning: unit %(value)d - '), Percentage(), ' ',
                       Bar('>'), ' ', ETA()]
            self.pbar = ProgressBar(widgets=widgets, maxval=self.limit,
                                    term_width=get_terminal_size()[0] - 5).start()
        else:
            # use each group's progress bar or no progress bar at all
            self.pbar = ProgressBarNull()
        # buffer collecting every input dict in a list; None unless
        # replicates is negative
        self.data_buffer = [] if self.args.replicates < 0 else None

    def run(self):
        """Scan the input, then close files and results and finish the progress bar.

        Raises:
            Whatever the scan raises; the result manager is closed quietly
            before the exception is propagated.
        """
        if self.data_buffer is not None and self.option == 0:
            if self.args.resampling:
                self.logger('[WARNING] Loading all genotype data to memory. May fail if there is not enough memory!')
            else:
                self.logger('Converting data attributes ...')
        try:
            if self.option == 1:
                self.__scan_sfs()
            else:
                self.__scan_gdat()
        except BaseException:
            # Deliberately catches everything (including KeyboardInterrupt):
            # the result is closed quietly and the exception is re-raised.
            self.result.close(quiet=True)
            raise
        self.file.close()
        if self.chip_file is not None:
            self.chip_file.close()
        if self.data_buffer is not None:
            # buffered mode: one Calculator run over all collected units
            self.result.append(Calculator(self.args, self.unknown_args, self.data_buffer).run())
        self.result.close()
        self.pbar.finish()

    def __scan_gdat(self):
        '''scan gdat file'''
        # default column names, keyed by the name used in the .key file
        columns = {'maf': 'maf', 'position': 'position', 'annotation': 'annotation'}
        # Allow for customized key names in gdat file. Reading the key file
        # is best effort, so ordinary errors are ignored -- but not
        # KeyboardInterrupt / SystemExit, which a bare except would swallow.
        try:
            # strips the last five characters of the data path
            # (presumably a ".gdat" suffix -- TODO confirm)
            key_file = self.args.data[:-5] + '.key'
            for x, y in zip(getColumn(key_file, 1), getColumn(key_file, 2)):
                if x in columns:
                    columns[x] = y
        except Exception:
            pass
        #
        for group, item in enumerate(self.groups):
            if group >= self.limit:
                break
            data = self.file.getdata(item)
            if self.args.resampling:
                data.decompress()
            else:
                # no resampling: use an empty haplotype pool
                data['haplotype'] = [[]]
            try:
                loci_input = {'pool': data['haplotype'], 'name': item,
                              'maf': list(data[columns['maf']]),
                              'pos': list(data[columns['position']]),
                              'function_score': list(data[columns['annotation']])}
            except KeyError as e:
                env.logger.error('Column name {} not found. Please provide [{}.key] file to overwrite column naming conventions.'.\
                                 format(e, self.args.data[:-5]))
                continue
            loci_input['num_variants'] = len(loci_input['maf'])
            # skipped units do not advance the progress bar (as before)
            if not self.__mark_missing(item, loci_input):
                continue
            self.__submit(loci_input)
            self.pbar.update(group + 1)

    def __scan_sfs(self):
        '''scan sfs file'''
        for group, loci_input in enumerate(self.file.data):
            if group >= self.limit:
                break
            # text sfs file does not have any haplotype pools
            loci_input['pool'] = [[]]
            # skipped units do not advance the progress bar (as before)
            if not self.__mark_missing(loci_input['name'], loci_input):
                continue
            if loci_input['missing'] is not None:
                # sanity check: one missing flag per variant
                assert len(loci_input['missing']) == len(loci_input['maf'])
            self.__submit(loci_input)
            self.pbar.update(group + 1)

    def __mark_missing(self, name, loci_input):
        '''Set loci_input['missing'] from the chip file; return False if the unit must be skipped.

        Without a chip file 'missing' is None. With one, the unit is skipped
        (and loci_input left without a 'missing' entry) when the chip file
        has no data for `name` or its variant count is outside
        args.def_valid_locus; otherwise each position gets True when it is
        absent from the chip data.
        '''
        if not self.chip_file:
            loci_input['missing'] = None
            return True
        cdata = self.chip_file.getdata(name)
        if cdata is None or not is_within(cdata['num_variants'], self.args.def_valid_locus):
            return False
        loci_input['missing'] = [x not in cdata['pos'] for x in loci_input['pos']]
        return True

    def __submit(self, loci_input):
        '''Analyze or buffer a unit whose variant count lies within args.def_valid_locus.'''
        if not is_within(loci_input['num_variants'], self.args.def_valid_locus):
            return
        if self.data_buffer is None:
            self.result.append(Calculator(self.args, self.unknown_args, loci_input).run())
        else:
            self.data_buffer.append(loci_input)
Esempio n. 4
0
class Executor:
    """Scan the units of an input file and feed them to ``Calculator``.

    Units are analyzed one at a time, unless ``args.replicates`` is negative,
    in which case they are gathered in ``data_buffer`` and analyzed together
    by one ``Calculator`` at the end of ``run``.
    """

    def __init__(self, args, unknown_args):
        """Configure logging, open the input files and set up result/progress objects.

        Args:
            args: parsed command-line options. Attributes used here:
                ``verbosity``, ``output``, ``cmd``, ``data``,
                ``missing_unlisted``, ``limit``, ``append``, ``replicates``.
            unknown_args: extra arguments kept as-is in ``self.unknown_args``.
        """
        self.args = args
        self.unknown_args = unknown_args
        # option 1 -> data is read with SFSFile, anything else -> GFile
        self.option = checkInput(args)
        #
        # ``fv`` is chosen by comparing the value of verbosity; the former
        # ``is not 0`` was an identity test on an int literal, which only
        # works by accident of integer caching and warns on Python 3.8+.
        env.logger = getLogger(max(min(args.verbosity - 1, 2), 0),
                               fn=os.path.splitext(args.output[0])[0],
                               fv=2 if args.verbosity != 0 else 0)
        env.logger.debug('\n{0}\n{1}\n{0}'.format(
            "=" * min(len(args.cmd), 100), args.cmd))
        # verbosity 1 routes user messages through printinfo, not the logger
        self.logger = env.logger.info if args.verbosity != 1 else printinfo
        #
        self.logger('Loading data from [{}] ...'.format(args.data))
        if self.option == 1:
            self.file = SFSFile(args.data)
        else:
            self.file = GFile(args.data)
        self.groups = self.file.getnames()
        self.logger('{:,d} units found'.format(len(self.groups)))
        # load non-missing data
        # to annotate to each variant position whether or not it is missing from association analysis
        # name it chip_file because it mimics the behavior of exome chip design
        if args.missing_unlisted:
            self.chip_file = SFSFile(args.missing_unlisted)
        else:
            self.chip_file = None
        # set limit: keep a requested limit within [1, number of units]
        if self.args.limit:
            self.limit = min(max(1, args.limit), len(self.groups))
            self.logger('{:,d} units will be analyzed'.format(self.limit))
        else:
            self.limit = len(self.groups)
        self.result = ResultManager(args.output,
                                    action='w' if not args.append else 'a')
        if self.args.verbosity == 1:
            widgets = [
                FormatLabel('scanning: unit %(value)d - '),
                Percentage(), ' ',
                Bar('>'), ' ',
                ETA()
            ]
            self.pbar = ProgressBar(widgets=widgets,
                                    maxval=self.limit,
                                    term_width=get_terminal_size()[0] -
                                    5).start()
        else:
            # use each group's progress bar or no progress bar at all
            self.pbar = ProgressBarNull()
        # buffer that accumulates all input dicts in a list (None unless
        # replicates is negative)
        self.data_buffer = [] if self.args.replicates < 0 else None

    def run(self):
        """Run the scan matching ``self.option`` and finalize all outputs.

        Raises:
            Any exception raised while scanning, re-raised after the result
            manager has been closed with ``quiet=True``.
        """
        if self.data_buffer is not None and self.option == 0:
            if self.args.resampling:
                self.logger(
                    '[WARNING] Loading all genotype data to memory. May fail if there is not enough memory!'
                )
            else:
                self.logger('Converting data attributes ...')
        try:
            if self.option == 1:
                self.__scan_sfs()
            else:
                self.__scan_gdat()
        except BaseException:
            # intentionally broad: close the result quietly on any failure
            # or interruption, then let the exception propagate
            self.result.close(quiet=True)
            raise
        self.file.close()
        if self.chip_file is not None:
            self.chip_file.close()
        if self.data_buffer is not None:
            # buffered mode: analyze all collected units in one go
            self.result.append(
                Calculator(self.args, self.unknown_args,
                           self.data_buffer).run())
        self.result.close()
        self.pbar.finish()

    def __scan_gdat(self):
        '''scan gdat file'''
        maf = 'maf'
        pos = 'position'
        function_score = 'annotation'
        # Allow for customized key names in gdat file. The lookup is best
        # effort: ordinary failures keep the defaults above, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            # data path minus its last five characters, plus '.key'
            # (presumably replaces a ".gdat" suffix -- TODO confirm)
            key_file = self.args.data[:-5] + '.key'
            for x, y in zip(getColumn(key_file, 1), getColumn(key_file, 2)):
                if x == 'maf':
                    maf = y
                if x == 'position':
                    pos = y
                if x == 'annotation':
                    function_score = y
        except Exception:
            pass
        #
        for group, item in enumerate(self.groups):
            if group >= self.limit:
                break
            data = self.file.getdata(item)
            if self.args.resampling:
                data.decompress()
            else:
                # no resampling: use an empty haplotype pool
                data['haplotype'] = [[]]
            try:
                loci_input = {
                    'pool': data['haplotype'],
                    'name': item,
                    'maf': list(data[maf]),
                    'pos': list(data[pos]),
                    'function_score': list(data[function_score])
                }
            except KeyError as e:
                env.logger.error('Column name {} not found. Please provide [{}.key] file to overwrite column naming conventions.'.\
                                 format(e, self.args.data[:-5]))
                continue
            loci_input['num_variants'] = len(loci_input['maf'])
            if self.chip_file:
                cdata = self.chip_file.getdata(item)
                # skip units absent from the chip file or whose chip variant
                # count is outside the valid range
                if cdata is None or (not is_within(cdata['num_variants'],
                                                   self.args.def_valid_locus)):
                    continue
                # True for every position that the chip data does not list
                loci_input['missing'] = [
                    x not in cdata['pos'] for x in loci_input['pos']
                ]
            else:
                loci_input['missing'] = None
            if is_within(loci_input['num_variants'],
                         self.args.def_valid_locus):
                if self.data_buffer is None:
                    self.result.append(
                        Calculator(self.args, self.unknown_args,
                                   loci_input).run())
                else:
                    self.data_buffer.append(loci_input)
            self.pbar.update(group + 1)

    def __scan_sfs(self):
        '''scan sfs file'''
        for group, loci_input in enumerate(self.file.data):
            if group >= self.limit:
                break
            # text sfs file does not have any haplotype pools
            loci_input['pool'] = [[]]
            if self.chip_file:
                cdata = self.chip_file.getdata(loci_input['name'])
                # skip units absent from the chip file or whose chip variant
                # count is outside the valid range
                if cdata is None or (not is_within(cdata['num_variants'],
                                                   self.args.def_valid_locus)):
                    continue
                # True for every position that the chip data does not list
                loci_input['missing'] = [
                    x not in cdata['pos'] for x in loci_input['pos']
                ]
                # sanity check: one missing flag per variant
                assert len(loci_input['missing']) == len(loci_input['maf'])
            else:
                loci_input['missing'] = None
            if is_within(loci_input['num_variants'],
                         self.args.def_valid_locus):
                if self.data_buffer is None:
                    self.result.append(
                        Calculator(self.args, self.unknown_args,
                                   loci_input).run())
                else:
                    self.data_buffer.append(loci_input)
            self.pbar.update(group + 1)