def open_input_file(self, input_path):
    """Open ``input_path`` for reading as text and return the file object.

    The encoding is whatever ``detect_encoding`` reports for the path.
    Paths ending in ".gz" are opened through ``gzip`` in text mode; any
    other path is opened with the built-in ``open``.

    The caller owns the returned handle and is responsible for closing it.
    """
    detected = detect_encoding(input_path)
    if not input_path.endswith(".gz"):
        return open(input_path, encoding=detected)
    return gzip.open(input_path, mode="rt", encoding=detected)
Exemple #2
0
 def _loop_definition(self):
     """Yield the leading '#'-prefixed lines of the file at ``self.path``.

     Each line is stripped of surrounding whitespace before it is tested
     and yielded. Iteration stops at the first line that does not start
     with '#' (a blank line therefore also ends the definition section).

     The file is opened with the encoding reported by ``detect_encoding``
     and is managed by a ``with`` block, so the handle is released even
     when the consumer stops iterating early or an exception is raised.

     Yields:
         str: One stripped definition line at a time.
     """
     encoding = detect_encoding(self.path)
     with open(self.path, encoding=encoding) as f:
         for line in f:
             line = line.strip()
             if not line.startswith('#'):
                 break
             yield line
 def first_input_file(self):
     """Return a readable text handle for the first input.

     When ``self.pipeinput`` does not compare equal to ``False`` the input
     is taken from ``sys.stdin``. Otherwise ``self.input_paths[0]`` is
     opened with the encoding reported by ``detect_encoding``, going
     through ``gzip`` in text mode if the path ends in ".gz".
     """
     # The equality test (rather than plain truthiness) is kept on purpose:
     # values such as None do not compare equal to False and select stdin.
     if not (self.pipeinput == False):
         return sys.stdin
     first_path = self.input_paths[0]
     detected = detect_encoding(first_path)
     if not first_path.endswith(".gz"):
         return open(first_path, encoding=detected)
     return gzip.open(first_path, mode="rt", encoding=detected)
Exemple #4
0
 def __init__(self, path, seekpos=None, chunksize=None):
     """Set up the reader state for ``path`` and load its definition.

     Args:
         path: Forwarded to the parent initialiser; ``self.path`` is read
             right afterwards, so the parent presumably stores it there.
         seekpos: Stored unchanged on the instance (default ``None``).
         chunksize: Stored unchanged on the instance (default ``None``).
     """
     super().__init__(path)
     self.seekpos, self.chunksize = seekpos, chunksize
     self.encoding = detect_encoding(self.path)
     # Annotator metadata starts out empty.
     self.annotator_name = self.annotator_displayname = self.annotator_version = ''
     # Two independent empty lists, one per attribute.
     self.no_aggregate_cols, self.index_columns = [], []
     self.report_substitution = None
     # Runs last, once every default above is in place.
     self._setup_definition()
Exemple #5
0
 def _loop_data(self):
     """Yield ``(line_number, line)`` for every non-comment line of ``self.path``.

     The file is read in binary mode and each raw line is decoded with the
     encoding reported by ``detect_encoding``; trailing CR/LF characters
     are removed. Lines starting with '#' are skipped but still counted,
     so the yielded numbers are 1-based positions in the file.

     The file is managed by a ``with`` block, so the handle is released
     even when the consumer stops iterating early or decoding fails.

     Yields:
         tuple[int, str]: The 1-based line number and the decoded line.
     """
     encoding = detect_encoding(self.path)
     # NOTE(review): binary iteration splits on b'\n', then decodes each
     # piece on its own; confirm this is intended for multi-byte encodings.
     with open(self.path, 'rb') as f:
         for lnum, raw in enumerate(f, start=1):
             line = raw.decode(encoding).rstrip('\r\n')
             if line.startswith('#'):
                 continue
             yield lnum, line
    def setup(self):
        """ Do necesarry pre-run tasks """
        if self.ready_to_convert: return
        # Open file handle to input path
        for input_path in self.input_paths:
            encoding = detect_encoding(input_path)
            if input_path.endswith('.gz'):
                f = gzip.open(input_path, mode='rt', encoding=encoding)
            else:
                f = open(input_path, encoding=encoding)
            self.input_files.append(f)
        # Read in the available converters
        self._initialize_converters()
        # Select the converter that matches the input format
        self._select_primary_converter()

        # Open the output files
        self._open_output_files()
        self.ready_to_convert = True
 def setup (self):
     """ Do necesarry pre-run tasks """
     if self.ready_to_convert: return
     # Open file handle to input path
     for input_path in self.input_paths:
         encoding = detect_encoding(input_path)
         self.input_files.append(open(input_path, encoding=encoding))
     # Read in the available converters
     self._initialize_converters()
     # Select the converter that matches the input format
     self._select_primary_converter()
     
     # A correct .crv file is not processed.
     #todo handle this for multiple inputs. have to convert them so they can be merged 
     # if self.input_format == 'crv' and \
     #     self.input_paths[0].split('.')[-1] == 'crv':
     #     self.logger.info('Input file is already a crv file. Exiting converter.')
     #     exit(0)
     
     # Open the output files
     self._open_output_files()
     self.ready_to_convert = True
Exemple #8
0
 def write_preface(self, level):
     """Start the output for ``level``; for 'variant', write the VCF preface.

     A previously opened output handle (``self.wf``) is closed first. For
     any level other than 'variant' nothing else happens. For 'variant'
     the output file ``self.filename`` is (re)created, the fixed header
     lines are passed to ``write_preface_lines`` and ``self.samples`` is
     populated:

     * input format 'vcf': the '##' meta lines of every input file are
       copied to the output once (a '##fileformat=' line is never copied),
       sample names are taken from each '#CHROM' line (prefixed with the
       part of the input file's base name before the first '.', plus '_',
       unless there is exactly one input), and every non-'#' line is
       cached in ``self.vcflines[input_index][line_number]`` without its
       line ending. When ``self.args.inputfiles`` is None, nothing is
       read and ``self.samples`` is left untouched.
     * any other input format: the distinct ``base__sample_id`` values of
       the ``sample`` table are used, with 'NOSAMPLEID' standing in for a
       NULL id or for an empty result.

     Args:
         level: Name of the level being written; stored on ``self.level``.
     """
     self.level = level
     if self.wf is not None:
         self.wf.close()
     if level != 'variant':
         return
     self.wf = open(self.filename, 'w', encoding='utf-8', newline='')
     self.write_preface_lines([
         '#fileformat=VCFv4.2',
         '#OpenCRAVATFileDate=' +
         datetime.datetime.now().strftime('%Y%m%d'),
     ])
     self.vcflines = {}
     self.input_path_dict = {}

     if self.input_format != 'vcf':
         # Sample ids come from the database.
         self.cursor2.execute(
             'select distinct(base__sample_id) from sample')
         rows = self.cursor2.fetchall()
         if not rows:
             self.samples = ['NOSAMPLEID']
         else:
             self.samples = [
                 'NOSAMPLEID' if row[0] is None else row[0]
                 for row in rows
             ]
         return

     if self.args.inputfiles is None:
         return
     # A single path given as a bare string is normalised to a list.
     if isinstance(self.args.inputfiles, str):
         self.args.inputfiles = [self.args.inputfiles]
     inputfiles = self.args.inputfiles
     for index, inputfile in enumerate(inputfiles):
         self.input_path_dict[inputfile] = index

     # Set membership keeps the de-duplication cheap; output order is
     # unaffected because each new header is written immediately.
     written_headers = set()
     self.samples = []
     single_input = len(inputfiles) == 1
     for inputfile in inputfiles:
         inputfile_prefix = os.path.basename(inputfile).split('.')[0]
         input_path_no = self.input_path_dict[inputfile]
         encoding = detect_encoding(inputfile)
         if inputfile.endswith('.gz'):
             import gzip
             f = gzip.open(inputfile, 'rt', encoding=encoding)
         else:
             # Use the detected encoding here too, not the locale default.
             f = open(inputfile, encoding=encoding)
         data_lines = {}
         self.vcflines[input_path_no] = data_lines
         # The with block closes the input even if reading fails.
         with f:
             for lineno, line in enumerate(f, start=1):
                 if line.startswith('##fileformat='):
                     continue
                 if line.startswith('##'):
                     if line not in written_headers:
                         self.wf.write(line)
                         written_headers.add(line)
                 elif line.startswith('#CHROM'):
                     # Strip only the line ending, so the last sample name
                     # survives when the line has no trailing newline.
                     toks = line.rstrip('\n').rstrip('\r').split('\t')
                     # Sample columns start at index 9; the slice is empty
                     # when there are none.
                     names = toks[9:]
                     if not single_input:
                         names = [
                             inputfile_prefix + '_' + v for v in names
                         ]
                     self.samples.extend(names)
                 elif not line.startswith('#'):
                     data_lines[lineno] = line.rstrip('\n').rstrip('\r')