Esempio n. 1
0
 def toks_to_data_dict(self, toks):
     """Turn a list of string tokens into a ``{column name: value}`` dict.

     ``self.columns`` maps a token index to a column definition that
     carries a ``'name'`` and a ``'type'``. Empty tokens become ``None``;
     ``'int'`` and ``'float'`` columns are converted with the matching
     builtin; any other type (``'string'`` included) keeps the raw token.

     Raises:
         BadFormatError: when ``toks`` holds fewer items than
             ``self.columns`` has entries.
     """
     received = len(toks)
     expected = len(self.columns)
     if received < expected:
         raise BadFormatError(
             'Too few columns. Received %s. Expected %s'
             % (received, expected))
     record = {}
     for position, definition in self.columns.items():
         field = definition['name']
         kind = definition['type']
         raw = toks[position]
         if raw == '':
             # An empty token means "no value" regardless of column type.
             value = None
         elif kind == 'int':
             value = int(raw)
         elif kind == 'float':
             value = float(raw)
         else:
             # 'string' and unrecognised types pass through untouched.
             value = raw
         record[field] = value
     return record
Esempio n. 2
0
    def run(self):
        """Convert the input file to a .crv file using the primary converter.

        Every line of ``self.f`` is handed to the primary converter. Each
        resulting variant dict is optionally lifted over, registered with
        ``self.vtracker`` and written through the crv/crl/crm/crs writers.
        Per-line failures are logged and counted; any other exception is
        passed to ``self.__handle_exception``.
        """
        try:
            self.setup()
            started = time.time()
            self.logger.info(
                'Conversion start: %s' % time.asctime(time.localtime(started)))
            self.primary_converter.setup(self.f)
            self.f.seek(0)
            read_lnum = write_lnum = num_errors = 0
            for line in self.f:
                read_lnum += 1
                try:
                    # One input line may expand into several variant dicts.
                    converted = self.primary_converter.convert_line(line)
                except Exception as conv_err:
                    num_errors += 1
                    self._log_conversion_error(read_lnum, conv_err)
                    continue
                # None and an empty result both mean there is nothing to write.
                if not converted:
                    continue
                mapped_uids = []
                for wdict in converted:
                    if wdict['ref_base'] == '' \
                            and wdict['alt_base'] not in ['A', 'T', 'C', 'G']:
                        num_errors += 1
                        self._log_conversion_error(
                            read_lnum,
                            BadFormatError(
                                'Reference base required for non SNV'))
                        continue
                    if self.do_liftover:
                        # Keep the original coordinates for the .crl output.
                        prelift_wdict = copy.copy(wdict)
                        try:
                            wdict['chrom'], wdict['pos'] = self.liftover(
                                wdict['chrom'], wdict['pos'])
                        except LiftoverFailure as lift_err:
                            num_errors += 1
                            self._log_conversion_error(read_lnum, lift_err)
                            continue
                    unique, uid = self.vtracker.addVar(
                        wdict['chrom'], int(wdict['pos']),
                        wdict['ref_base'], wdict['alt_base'])
                    wdict['uid'] = uid
                    if unique:
                        write_lnum += 1
                        self.crv_writer.write_data(wdict)
                        if self.do_liftover:
                            prelift_wdict['uid'] = uid
                            self.crl_writer.write_data(prelift_wdict)
                    if uid not in mapped_uids:
                        # Within one input line, each UID goes to the .crm
                        # mapping file only once.
                        self.crm_writer.write_data({
                            'original_line': read_lnum,
                            'tags': wdict['tags'],
                            'uid': uid
                        })
                        mapped_uids.append(uid)
                    self.crs_writer.write_data(wdict)
            finished = time.time()
            self.logger.info(
                'Conversion end: %s' % time.asctime(time.localtime(finished)))
            self.logger.info('Read lines: %d' % read_lnum)
            self.logger.info('Error lines: %d' % num_errors)
            self.logger.info('Wrote lines: %d' % write_lnum)
            self.logger.info(
                'Conversion runtime: %s' % round(finished - started, 3))

            self._close_files()

        except Exception as e:
            self.__handle_exception(e)
Esempio n. 3
0
 def run(self):
     """Convert every input file using a per-file copy of the primary converter.

     For each path in ``self.input_paths`` (or stdin when ``self.pipeinput``
     is set) a new converter instance is created and fed the file line by
     line. Each variant dict returned by ``convert_line`` has its chromosome
     normalised, its reference base filled in from ``self.wgsreader`` when
     missing, is optionally lifted over, validated, standardised, registered
     with ``self.vtracker`` and written through the crv/crl/crm/crs writers.
     Any exception raised while handling a line is logged, counted in
     ``num_errors`` and the rest of that line is skipped.

     Returns:
         tuple: ``(total_lnum, self.primary_converter.format_name)`` where
         ``total_lnum`` counts the lines, across all files, for which
         ``convert_line`` returned something other than
         ``BaseConverter.IGNORE``.
     """
     self.setup()
     start_time = time.time()
     self.status_writer.queue_status_update(
         "status",
         "Started {} ({})".format("Converter",
                                  self.primary_converter.format_name),
     )
     last_status_update_time = time.time()
     multiple_files = len(self.input_paths) > 1
     total_lnum = 0
     base_re = re.compile("^[ATGC]+|[-]+$")
     write_lnum = 0
     # Run-wide error counter. It used to be reset for every file, so the
     # totals reported after the loop covered only the last file and the
     # name was unbound when there were no input paths at all.
     num_errors = 0
     for fn in self.input_paths:
         if self.pipeinput:
             f = sys.stdin
         else:
             f = self.open_input_file(fn)
         if self.pipeinput == True:
             fname = STDIN
         else:
             fname = f.name
         # Each input gets its own converter instance of the primary
         # converter's class.
         converter = self.primary_converter.__class__()
         self._set_converter_properties(converter)
         converter.setup(f)
         if self.pipeinput == False:
             f.seek(0)
         # Line number within the current file; stored in the .crm output.
         read_lnum = 0
         if self.pipeinput:
             cur_fname = STDIN
         else:
             cur_fname = os.path.basename(f.name)
         for l in f:
             samp_prefix = cur_fname
             read_lnum += 1
             try:
                 # all_wdicts is a list, since one input line can become
                 # multiple output lines. BaseConverter.IGNORE is returned
                 # when the converter decides the line is not an input line.
                 all_wdicts = converter.convert_line(l)
                 if all_wdicts is BaseConverter.IGNORE:
                     continue
                 total_lnum += 1
                 if all_wdicts:
                     UIDMap = []
                     no_unique_var = 0
                     for wdict in all_wdicts:
                         chrom = wdict["chrom"]
                         pos = wdict["pos"]
                         if chrom is not None:
                             if not chrom.startswith("chr"):
                                 chrom = "chr" + chrom
                             wdict["chrom"] = self.chromdict.get(
                                 chrom, chrom)
                             if multiple_files:
                                 # Prefix sample ids with the file name so
                                 # samples from different files stay apart.
                                 if wdict["sample_id"]:
                                     wdict["sample_id"] = "__".join(
                                         [samp_prefix, wdict["sample_id"]])
                                 else:
                                     wdict["sample_id"] = samp_prefix
                             # A missing, None or empty reference base is
                             # looked up from the reference reader. The old
                             # "Reference base required for non SNV" check
                             # sat in a branch that could never see an
                             # empty ref_base, so it has been dropped.
                             ref_base = wdict.get("ref_base")
                             if ref_base is None or ref_base == "":
                                 wdict[
                                     "ref_base"] = self.wgsreader.get_bases(
                                         chrom, int(pos))
                             # Snapshot taken before liftover/standardising;
                             # this is what goes to the .crl writer.
                             prelift_wdict = copy.copy(wdict)
                             if self.do_liftover:
                                 (
                                     wdict["chrom"],
                                     wdict["pos"],
                                     wdict["ref_base"],
                                     wdict["alt_base"],
                                 ) = self.liftover(
                                     wdict["chrom"],
                                     int(wdict["pos"]),
                                     wdict["ref_base"],
                                     wdict["alt_base"],
                                 )
                             if base_re.fullmatch(
                                     wdict["ref_base"]) is None:
                                 raise BadFormatError(
                                     "Invalid reference base")
                             if base_re.fullmatch(
                                     wdict["alt_base"]) is None:
                                 raise BadFormatError(
                                     "Invalid alternate base")
                             p, r, a = (
                                 int(wdict["pos"]),
                                 wdict["ref_base"],
                                 wdict["alt_base"],
                             )
                             (
                                 new_pos,
                                 new_ref,
                                 new_alt,
                             ) = self.standardize_pos_ref_alt("+", p, r, a)
                             wdict["pos"] = new_pos
                             wdict["ref_base"] = new_ref
                             wdict["alt_base"] = new_alt
                             unique, UID = self.vtracker.addVar(
                                 wdict["chrom"], new_pos, new_ref, new_alt)
                             wdict["uid"] = UID
                             if wdict["ref_base"] == wdict["alt_base"]:
                                 raise NoVariantError()
                             if unique:
                                 write_lnum += 1
                                 self.crv_writer.write_data(wdict)
                                 prelift_wdict["uid"] = UID
                                 self.crl_writer.write_data(prelift_wdict)
                                 # addl_operation errors shouldn't prevent
                                 # the variant from being written.
                                 try:
                                     converter.addl_operation_for_unique_variant(
                                         wdict, no_unique_var)
                                 except Exception as e:
                                     self._log_conversion_error(
                                         read_lnum, l, e)
                                 no_unique_var += 1
                             if UID not in UIDMap:
                                 # For this input line, only write to the
                                 # .crm if the UID has not yet been written
                                 # to the map file.
                                 self.crm_writer.write_data({
                                     "original_line":
                                     read_lnum,
                                     "tags":
                                     wdict["tags"],
                                     "uid":
                                     UID,
                                     "fileno":
                                     self.input_path_dict2[fname],
                                 })
                                 UIDMap.append(UID)
                         # Written for every variant dict, including those
                         # whose chrom is None.
                         self.crs_writer.write_data(wdict)
                 else:
                     raise ExpectedException(
                         "No valid alternate allele was found in any samples."
                     )
             except Exception as e:
                 num_errors += 1
                 self._log_conversion_error(read_lnum, l, e)
                 continue
         f.close()
         # NOTE(review): this progress update runs once per file, not once
         # per line - confirm that is the intended cadence.
         cur_time = time.time()
         if total_lnum % 10000 == 0 or cur_time - last_status_update_time > 3:
             self.status_writer.queue_status_update(
                 "status",
                 "Running {} ({}): line {}".format("Converter", cur_fname,
                                                   read_lnum),
             )
             last_status_update_time = cur_time
     self.logger.info("error lines: %d" % num_errors)
     self._close_files()
     self.end()
     # NOTE(review): status_writer is dereferenced unconditionally above,
     # so a None status_writer would already have failed - confirm whether
     # None is meant to be supported.
     if self.status_writer is not None:
         self.status_writer.queue_status_update("num_input_var", total_lnum)
         self.status_writer.queue_status_update("num_unique_var",
                                                write_lnum)
         self.status_writer.queue_status_update("num_error_input",
                                                num_errors)
     end_time = time.time()
     self.logger.info("finished: %s" %
                      time.asctime(time.localtime(end_time)))
     runtime = round(end_time - start_time, 3)
     self.logger.info("num input lines: {}".format(total_lnum))
     self.logger.info("runtime: %s" % runtime)
     self.status_writer.queue_status_update(
         "status",
         "Finished {} ({})".format("Converter",
                                   self.primary_converter.format_name),
     )
     return total_lnum, self.primary_converter.format_name
Esempio n. 4
0
 def run(self):
     """Convert every input file to .crv output using the primary converter.

     Each file in ``self.input_files`` is set up on the primary converter,
     rewound and read line by line. Variant dicts returned by
     ``convert_line`` get a normalised chromosome, are optionally lifted
     over, standardised for the variant tracker, and written through the
     crv/crl/crm/crs writers. Conversion and liftover failures are logged
     and counted instead of aborting the run.

     Returns:
         tuple: ``(total_lnum, self.primary_converter.format_name)`` where
         ``total_lnum`` counts the lines, across all files, for which
         ``convert_line`` returned something other than
         ``BaseConverter.IGNORE`` without raising.
     """
     self.setup()
     start_time = time.time()
     self.status_writer.queue_status_update(
         'status',
         'Started {} ({})'.format('Converter',
                                  self.primary_converter.format_name))
     last_status_update_time = time.time()
     multiple_files = len(self.input_files) > 1
     total_lnum = 0
     # Run-wide counters. They used to be reset for every file although
     # they are only reported after the file loop, so the totals covered
     # just the last file and were unbound when there were no input files.
     write_lnum = 0
     num_errors = 0
     for f in self.input_files:
         self.primary_converter.setup(f)
         f.seek(0)
         # Line number within the current file; stored in the .crm output.
         read_lnum = 0
         # Bound before the line loop so the status update below also works
         # for a file that yields no lines.
         cur_fname = os.path.basename(f.name)
         samp_prefix = cur_fname
         for l in f:
             read_lnum += 1
             try:
                 # all_wdicts is a list, since one input line can become
                 # multiple output lines. BaseConverter.IGNORE is returned
                 # when the converter decides the line is not an input line.
                 all_wdicts = self.primary_converter.convert_line(l)
                 if all_wdicts is BaseConverter.IGNORE:
                     continue
                 total_lnum += 1
             except Exception as e:
                 num_errors += 1
                 self._log_conversion_error(read_lnum, l, e)
                 continue
             if all_wdicts:
                 UIDMap = []
                 no_unique_var = 0
                 for wdict in all_wdicts:
                     chrom = wdict['chrom']
                     if chrom is not None:
                         if not chrom.startswith('chr'):
                             chrom = 'chr' + chrom
                         wdict['chrom'] = self.chromdict.get(chrom, chrom)
                         if multiple_files:
                             # Prefix sample ids with the file name so
                             # samples from different files stay apart.
                             if wdict['sample_id']:
                                 wdict['sample_id'] = '__'.join(
                                     [samp_prefix, wdict['sample_id']])
                             else:
                                 wdict['sample_id'] = samp_prefix
                         if wdict['ref_base'] == '' and wdict[
                                 'alt_base'] not in ['A', 'T', 'C', 'G']:
                             num_errors += 1
                             e = BadFormatError(
                                 'Reference base required for non SNV')
                             self._log_conversion_error(read_lnum, l, e)
                             continue
                         if self.do_liftover:
                             # Keep the original coordinates for the .crl
                             # output.
                             prelift_wdict = copy.copy(wdict)
                             try:
                                 wdict['chrom'], wdict[
                                     'pos'] = self.liftover(
                                         wdict['chrom'], wdict['pos'])
                             except LiftoverFailure as e:
                                 num_errors += 1
                                 self._log_conversion_error(read_lnum, l, e)
                                 continue
                         p, r, a = int(
                             wdict['pos']
                         ), wdict['ref_base'], wdict['alt_base']
                         # The standardised values are used only to
                         # register the variant; wdict itself keeps its
                         # pos/ref_base/alt_base.
                         new_pos, new_ref, new_alt = self.standardize_pos_ref_alt(
                             '+', p, r, a)
                         unique, UID = self.vtracker.addVar(
                             wdict['chrom'], new_pos, new_ref, new_alt)
                         wdict['uid'] = UID
                         if unique:
                             write_lnum += 1
                             self.crv_writer.write_data(wdict)
                             if self.do_liftover:
                                 prelift_wdict['uid'] = UID
                                 self.crl_writer.write_data(prelift_wdict)
                             self.primary_converter.addl_operation_for_unique_variant(
                                 wdict, no_unique_var)
                             no_unique_var += 1
                         if UID not in UIDMap:
                             # For this input line, only write to the .crm
                             # if the UID has not yet been written to the
                             # map file.
                             self.crm_writer.write_data({
                                 'original_line':
                                 read_lnum,
                                 'tags':
                                 wdict['tags'],
                                 'uid':
                                 UID,
                                 'fileno':
                                 self.input_path_dict2[f.name]
                             })
                             UIDMap.append(UID)
                     # Reached for every variant dict that was not skipped
                     # by a `continue` above, including chrom-is-None ones.
                     self.crs_writer.write_data(wdict)
             else:
                 # Logged but not counted in num_errors.
                 e = ExpectedException('No conversion result')
                 self._log_conversion_error(read_lnum, l, e)
         # NOTE(review): this progress update runs once per file, not once
         # per line - confirm that is the intended cadence.
         cur_time = time.time()
         if total_lnum % 10000 == 0 or cur_time - last_status_update_time > 3:
             self.status_writer.queue_status_update(
                 'status',
                 'Running {} ({}): line {}'.format('Converter', cur_fname,
                                                   read_lnum))
             last_status_update_time = cur_time
     self.logger.info('error lines: %d' % num_errors)
     self._close_files()
     self.end()
     if self.status_writer is not None:
         self.status_writer.queue_status_update('num_input_var', total_lnum)
         self.status_writer.queue_status_update('num_unique_var',
                                                write_lnum)
         self.status_writer.queue_status_update('num_error_input',
                                                num_errors)
     end_time = time.time()
     self.logger.info('finished: %s' %\
         time.asctime(time.localtime(end_time)))
     runtime = round(end_time - start_time, 3)
     self.logger.info('num input lines: {}'.format(total_lnum))
     self.logger.info('runtime: %s' % runtime)
     self.status_writer.queue_status_update(
         'status',
         'Finished {} ({})'.format('Converter',
                                   self.primary_converter.format_name))
     return total_lnum, self.primary_converter.format_name
Esempio n. 5
0
 def run(self):
     """Convert every input file to .crv output using the primary converter.

     Each file in ``self.input_files`` is set up on the primary converter,
     rewound and read line by line. Variant dicts returned by
     ``convert_line`` get a normalised chromosome, are optionally lifted
     over, registered with ``self.vtracker`` and written through the
     crv/crl/crm/crs writers. Conversion and liftover failures are logged
     and counted instead of aborting the run. The totals logged and sent
     to ``self.status_writer`` at the end cover all input files.
     """
     self.setup()
     start_time = time.time()
     multiple_files = len(self.input_files) > 1
     # Run-wide counters. The per-file counters used to be reported after
     # the file loop, so the totals covered only the last file and were
     # unbound when there were no input files.
     total_read_lnum = 0
     write_lnum = 0
     num_errors = 0
     for f in self.input_files:
         self.primary_converter.setup(f)
         f.seek(0)
         # Line number within the current file; stored in the .crm output.
         read_lnum = 0
         # File name without its last extension, used to prefix sample ids
         # when several files are converted together.
         cur_fname = os.path.basename(f.name)
         samp_prefix = '.'.join(cur_fname.split('.')[:-1])
         for l in f:
             read_lnum += 1
             total_read_lnum += 1
             try:
                 # all_wdicts is a list, since one input line can become
                 # multiple output lines
                 all_wdicts = self.primary_converter.convert_line(l)
                 if all_wdicts is None:
                     continue
             except Exception as e:
                 num_errors += 1
                 self._log_conversion_error(read_lnum, l, e)
                 continue
             if all_wdicts:
                 UIDMap = []
                 for wdict in all_wdicts:
                     chrom = wdict['chrom']
                     if not chrom.startswith('chr'):
                         chrom = 'chr' + chrom
                     wdict['chrom'] = self.chromdict.get(chrom, chrom)
                     if multiple_files:
                         wdict['sample_id'] = '_'.join(
                             [samp_prefix, wdict['sample_id']])
                     if wdict['ref_base'] == '' \
                             and wdict['alt_base'] not in ['A', 'T', 'C', 'G']:
                         num_errors += 1
                         e = BadFormatError(
                             'Reference base required for non SNV')
                         self._log_conversion_error(read_lnum, l, e)
                         continue
                     if self.do_liftover:
                         # Keep the original coordinates for the .crl
                         # output.
                         prelift_wdict = copy.copy(wdict)
                         try:
                             wdict['chrom'], wdict['pos'] = self.liftover(
                                 wdict['chrom'], wdict['pos'])
                         except LiftoverFailure as e:
                             num_errors += 1
                             self._log_conversion_error(read_lnum, l, e)
                             continue
                     unique, UID = self.vtracker.addVar(
                         wdict['chrom'], int(wdict['pos']),
                         wdict['ref_base'], wdict['alt_base'])
                     wdict['uid'] = UID
                     if unique:
                         write_lnum += 1
                         self.crv_writer.write_data(wdict)
                         if self.do_liftover:
                             prelift_wdict['uid'] = UID
                             self.crl_writer.write_data(prelift_wdict)
                     if UID not in UIDMap:
                         # For this input line, only write to the .crm if
                         # the UID has not yet been written to the map file.
                         self.crm_writer.write_data({
                             'original_line': read_lnum,
                             'tags': wdict['tags'],
                             'uid': UID
                         })
                         UIDMap.append(UID)
                     self.crs_writer.write_data(wdict)
     self.logger.info('error lines: %d' % num_errors)
     self._close_files()
     if self.status_writer is not None:
         self.status_writer.queue_status_update('num_input_var',
                                                total_read_lnum)
         self.status_writer.queue_status_update('num_unique_var', write_lnum)
         self.status_writer.queue_status_update('num_error_input', num_errors)
     end_time = time.time()
     self.logger.info('finished: %s' %\
         time.asctime(time.localtime(end_time)))
     runtime = round(end_time - start_time, 3)
     self.logger.info('num input lines: {}'.format(total_read_lnum))
     self.logger.info('runtime: %s' % runtime)