Beispiel #1
0
def preview_bam(filename):
    """Return a short text preview of the header of a BAM file.

    For each of the header sections ``RG``, ``PG`` and ``SQ`` that is
    present, the section name is written followed by its first few records.
    String records are shown through ``short_repr``; mapping records show
    their first four ``key: value`` pairs followed by ``...`` when more
    fields exist.  Listing of a section stops with a ``...`` line once the
    record at index 5 has been written.

    Args:
        filename: path of the BAM file, passed to ``pysam.AlignmentFile``.

    Returns:
        The preview as a single string (empty if none of the sections exist).
    """
    import pysam
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    with pysam.AlignmentFile(filename, 'rb') as bam:
        headers = bam.header
        for record_type in ('RG', 'PG', 'SQ'):
            if record_type not in headers:
                continue
            records = headers[record_type]
            parts.append(record_type + ':\n')
            for i, record in enumerate(records):
                # isinstance (rather than an exact type comparison) so that
                # str/dict subclasses such as OrderedDict are previewed too.
                if isinstance(record, str):
                    parts.append('  ' + short_repr(record) + '\n')
                elif isinstance(record, dict):
                    parts.append('  ')
                    for idx, (k, v) in enumerate(record.items()):
                        if idx >= 4:
                            # only the first four fields are displayed
                            parts.append('...')
                            break
                        parts.append('{}: {}    '.format(k, short_repr(v)))
                if i > 4:
                    parts.append('\n  ...\n')
                    break
                parts.append('\n')
    return ''.join(parts)
Beispiel #2
0
 def log(self, stage=None, msg=None):
     """Write a debug log entry for the given stage of the step.

     ``stage`` may be ``'start'`` (logs the step name and comment, prefixed
     with ``Checking`` in dryrun mode and ``Executing`` otherwise),
     ``'input'`` or ``'output'`` (logs the corresponding entry of
     ``env.sos_dict`` unless it is ``None``).  Any other stage is ignored.
     ``msg`` is accepted but not used.
     """
     if stage == 'start':
         verb = 'Checking' if self.run_mode == 'dryrun' else 'Executing'
         step_name = self.step.step_name()
         comment = self.step.comment.strip()
         env.logger.debug('{} ``{}``: {}'.format(verb, step_name, comment))
         return
     if stage == 'input':
         step_input = env.sos_dict['input']
         if step_input is not None:
             env.logger.debug('input:    ``{}``'.format(short_repr(step_input)))
         return
     if stage == 'output':
         step_output = env.sos_dict['output']
         if step_output is not None:
             env.logger.debug('output:   ``{}``'.format(short_repr(step_output)))
Beispiel #3
0
def _R_repr(obj):
    if isinstance(obj, bool):
        return 'TRUE' if obj else 'FALSE'
    elif isinstance(obj, (int, float, str)):
        return repr(obj)
    elif isinstance(obj, Sequence):
        if len(obj) == 0:
            return 'c()'
        # if the data is of homogeneous type, let us use c()
        # otherwise use list()
        # this can be confusion but list can be difficult to handle
        if homogeneous_type(obj):
            return 'c(' + ','.join(_R_repr(x) for x in obj) + ')'
        else:
            return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    elif obj is None:
        return 'NULL'
    elif isinstance(obj, dict):
        return 'list(' + ','.join('{}={}'.format(x, _R_repr(y)) for x,y in obj.items()) + ')'
    elif isinstance(obj, set):
        return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    else:
        import numpy
        import pandas
        if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\
                numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, \
                numpy.float64)):
            return repr(obj)
        elif isinstance(obj, numpy.matrixlib.defmatrix.matrix):
            try:
                import feather
            except ImportError:
                raise UsageError('The feather-format module is required to pass numpy matrix as R matrix'
                    'See https://github.com/wesm/feather/tree/master/python for details.')
            feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name
            feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_)
            return 'data.matrix(read_feather("{}"))'.format(feather_tmp_)
        elif isinstance(obj, numpy.ndarray):
            return 'c(' + ','.join(_R_repr(x) for x in obj) + ')'
        elif isinstance(obj, pandas.DataFrame):
            try:
                import feather
            except ImportError:
                raise UsageError('The feather-format module is required to pass pandas DataFrame as R data.frame'
                    'See https://github.com/wesm/feather/tree/master/python for details.')
            feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name
            try:
                data = obj.copy()
                feather.write_dataframe(data, feather_tmp_)
            except:
                # if data cannot be written, we try to manipulate data
                # frame to have consistent types and try again
                for c in data.columns:
                    if not homogeneous_type(data[c]):
                        data[c] = [str(x) for x in data[c]]
                feather.write_dataframe(data, feather_tmp_)
            return 'read_feather("{}")'.format(feather_tmp_)
        else:
            return repr('Unsupported datatype {}'.format(short_repr(obj)))
Beispiel #4
0
 def load_pickled(self, item):
     """Restore an object from the output of ``pickle.dumps``.

     ``item`` may be the pickled ``bytes`` or a ``str`` that is encoded as
     UTF-8 before unpickling.  For any other type a warning is sent through
     ``self.sos_kernel.warn`` and an empty dict is returned.

     NOTE: ``pickle.loads`` can execute arbitrary code; ``item`` must come
     from a trusted source.
     """
     if not isinstance(item, (bytes, str)):
         self.sos_kernel.warn(
             'Cannot restore from result of pickle.dumps: {}'.format(
                 short_repr(item)))
         return {}
     payload = item if isinstance(item, bytes) else item.encode('utf-8')
     return pickle.loads(payload)
Beispiel #5
0
    def get_vars(self, var_names):
        """
        Functionality to transfer CAS objects and TypeSystem from SoS (python) kernel to the IRuta kernel.
        This function is called when a user invokes the line magic %get or %with.

        The variable is looked up in ``env.sos_dict``, written to files in a
        temporary directory (XMI plus type system XML for a ``cassis.Cas``,
        type system XML only for a ``cassis.TypeSystem``) and loaded in the
        IRuta kernel through ``%loadCas`` / ``%loadTypeSystem``.  The
        temporary directory is removed afterwards, also when a step fails.

        Args:
            var_names: list holding exactly one variable name.

        Raises:
            Exception: if ``var_names`` does not hold exactly one name, or if
                the variable is neither a ``cassis.Cas`` nor a
                ``cassis.TypeSystem``.
        """
        if len(var_names) != 1:
            raise Exception(
                "%get takes exactly one variable name as argument. "
                "If you want to transfer multiple CAS, then please write them to a directory and use `%inputDir` in IRuta kernel."
            )
        var_name = var_names[0]
        var_content = env.sos_dict[var_name]

        # The context manager removes the directory and everything in it even
        # if serialization or the kernel call raises.
        with tempfile.TemporaryDirectory() as temp_directory:
            # Step 1: Writing Cas and TypeSystem to disk using dkpro-cassis.
            # Only the paths are needed here; the files are created by the
            # to_xmi / to_xml calls below. Forward slashes are used in the
            # paths handed to the magics.
            temp_typesystem_file_path = os.path.normpath(
                os.path.join(temp_directory, "typesystem.xml")).replace('\\', "/")
            temp_xmi_file_path = os.path.normpath(
                os.path.join(temp_directory, "cas.xmi")).replace('\\', "/")

            if isinstance(var_content, cassis.Cas):
                var_content.to_xmi(temp_xmi_file_path)
                var_content.typesystem.to_xml(temp_typesystem_file_path)
                cmd_transfer_var = "%displayMode NONE\n" \
                                   f"%loadCas {temp_xmi_file_path}\n" \
                                   f"%loadTypeSystem {temp_typesystem_file_path}"

            elif isinstance(var_content, cassis.TypeSystem):
                var_content.to_xml(temp_typesystem_file_path)
                cmd_transfer_var = "%displayMode NONE\n" \
                                   f"%loadTypeSystem {temp_typesystem_file_path}"

            else:
                raise Exception(
                    '%get only support transfering UIMA CAS objects or TypeSystem objects. '
                    'Use %expand for transfering string variables. Received datatype {}'
                    .format(short_repr(var_content)))

            # Step 2: Loading files
            env.log_to_file('KERNEL', f'Executing "{cmd_transfer_var}"')
            self.ruta_kernel.run_cell(
                cmd_transfer_var,
                silent=True,
                store_history=False,
                on_error=f'Failed to get variable {var_name}')
        # Step 3: temp files are cleaned up when the with-block exits
Beispiel #6
0
 def _Ruby_repr(self, obj):
     if isinstance(obj, bool):
         return 'true' if obj else 'false'
     elif isinstance(obj, float) and numpy.isnan(obj):
         return "Float::NAN"
     elif isinstance(obj, (int, float)):
         return repr(obj)
     elif isinstance(obj, str):
         return '%(' + obj + ')'
     elif isinstance(obj, complex):
         return 'Complex(' + str(obj.real) + ',' + str(obj.imag) + ')'
     elif isinstance(obj, range):
         return '(' + repr(min(obj)) + '...' + repr(max(obj)) + ')'
     elif isinstance(obj, Sequence):
         if len(obj) == 0:
             return '[]'
         else:
             return '[' + ','.join(self._Ruby_repr(x) for x in obj) + ']'
     elif obj is None:
         return 'nil'
     elif isinstance(obj, dict):
         return '{' + ','.join('"{}" => {}'.format(x, self._Ruby_repr(y))
                               for x, y in obj.items()) + '}'
     elif isinstance(obj, set):
         return 'Set[' + ','.join(self._Ruby_repr(x) for x in obj) + ']'
     else:
         if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\
                 numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, numpy.float64)):
             return repr(obj)
         elif isinstance(obj, numpy.matrixlib.defmatrix.matrix):
             return 'N' + repr(obj.tolist())
         elif isinstance(obj, numpy.ndarray):
             return repr(obj.tolist())
         elif isinstance(obj, pandas.DataFrame):
             _beginning_result_string_dataframe_to_ruby = "Daru::DataFrame.new({"
             _context_string_dataframe_to_ruby = str([
                 '"' + str(x).replace("'", '"') + '"' + "=>" + "[" +
                 str(",".join(
                     list(map(lambda y: self._Ruby_repr(y),
                              obj[x].tolist())))).replace("'", '"') + "]"
                 for x in obj.keys().tolist()
             ])[2:-2].replace("\', \'", ", ") + "},"
             _indexing_result_string_dataframe_to_ruby = "index:" + str(
                 obj.index.values.tolist()).replace("'", '"') + ")"
             _result_string_dataframe_to_ruby = _beginning_result_string_dataframe_to_ruby + _context_string_dataframe_to_ruby + _indexing_result_string_dataframe_to_ruby
             return _result_string_dataframe_to_ruby
         elif isinstance(obj, pandas.Series):
             dat = list(obj.values)
             ind = list(obj.index.values)
             ans = "{" + ",".join(
                 [repr(x) + "=>" + repr(y) for x, y in zip(ind, dat)]) + "}"
             return ans
         else:
             return repr('Unsupported datatype {}'.format(short_repr(obj)))
Beispiel #7
0
def _R_repr(obj, processed=None):
    if isinstance(obj, bool):
        return 'TRUE' if obj else 'FALSE'
    elif isinstance(obj, (int, str)):
        return repr(obj)
    elif isinstance(obj, float):
        if numpy.isnan(obj):
            return 'NaN'
        else:
            return repr(obj)
    elif isinstance(obj, complex):
        return 'complex(real = ' + str(obj.real) + ', imaginary = ' + str(obj.imag) + ')'
    elif isinstance(obj, Sequence):
        if len(obj) == 0:
            return 'c()'
        # if the data is of homogeneous type, let us use c()
        # otherwise use list()
        # this can be confusion but list can be difficult to handle
        if homogeneous_type(obj):
            return 'c(' + ','.join(_R_repr(x) for x in obj) + ')'
        else:
            return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    elif obj is None:
        return 'NULL'
    elif isinstance(obj, dict):
        if processed:
            if id(obj) in processed:
                return 'NULL'
        else:
            processed = set()
        processed.add(id(obj))
        return 'list(' + ','.join('{}={}'.format(make_name(str(x)), _R_repr(y, processed)) for x, y in obj.items()) + ')'
    elif isinstance(obj, set):
        return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    else:
        if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,
                            numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32,
                            numpy.float64)):
            return repr(obj)
        elif isinstance(obj, numpy.matrixlib.defmatrix.matrix):
            try:
                import feather
            except ImportError:
                raise UsageError('The feather-format module is required to pass numpy matrix as R matrix'
                                 'See https://github.com/wesm/feather/tree/master/python for details.')
            feather_tmp_ = tempfile.NamedTemporaryFile(
                suffix='.feather', delete=False).name
            feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_)
            return 'data.matrix(..read.feather({!r}))'.format(feather_tmp_)
        elif isinstance(obj, numpy.ndarray):
            if obj.ndim == 1:
                return 'array(c(' + ','.join(_R_repr(x) for x in obj) + '))'
            else:
                return 'array(' + 'c(' + ','.join(repr(x) for x in obj.swapaxes(obj.ndim - 2, obj.ndim - 1).flatten(order='C')) + ')' + ', dim=(' + 'rev(c' + repr(obj.swapaxes(obj.ndim - 2, obj.ndim - 1).shape) + ')))'
        elif isinstance(obj, pandas.DataFrame):
            try:
                import feather
            except ImportError:
                raise UsageError('The feather-format module is required to pass pandas DataFrame as R data.frame'
                                 'See https://github.com/wesm/feather/tree/master/python for details.')
            feather_tmp_ = tempfile.NamedTemporaryFile(
                suffix='.feather', delete=False).name
            try:
                data = obj.copy()
                # if the dataframe has index, it would not be transferred due to limitations
                # of feather. We will have to do something to save the index separately and
                # recreate it. (#397)
                if isinstance(data.index, pandas.Index):
                    df_index = list(data.index)
                elif not isinstance(data.index, pandas.RangeIndex):
                    # we should give a warning here
                    df_index = None
                feather.write_dataframe(data, feather_tmp_)
            except Exception:
                # if data cannot be written, we try to manipulate data
                # frame to have consistent types and try again
                for c in data.columns:
                    if not homogeneous_type(data[c]):
                        data[c] = [str(x) for x in data[c]]
                feather.write_dataframe(data, feather_tmp_)
                # use {!r} for path because the string might contain c:\ which needs to be
                # double quoted.
            return '..read.feather({!r}, index={})'.format(feather_tmp_, _R_repr(df_index))
        elif isinstance(obj, pandas.Series):
            dat = list(obj.values)
            ind = list(obj.index.values)
            return 'setNames(' + 'c(' + ','.join(_R_repr(x) for x in dat) + ')' + ',c(' + ','.join(_R_repr(y) for y in ind) + '))'
        else:
            return repr('Unsupported datatype {}'.format(short_repr(obj)))
Beispiel #8
0
def _R_repr(obj):
    if isinstance(obj, bool):
        return 'TRUE' if obj else 'FALSE'
    elif isinstance(obj, (int, float, str)):
        return repr(obj)
    elif isinstance(obj, Sequence):
        if len(obj) == 0:
            return 'c()'
        # if the data is of homogeneous type, let us use c()
        # otherwise use list()
        # this can be confusion but list can be difficult to handle
        if homogeneous_type(obj):
            return 'c(' + ','.join(_R_repr(x) for x in obj) + ')'
        else:
            return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    elif obj is None:
        return 'NULL'
    elif isinstance(obj, dict):
        return 'list(' + ','.join('{}={}'.format(x, _R_repr(y))
                                  for x, y in obj.items()) + ')'
    elif isinstance(obj, set):
        return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    else:
        import numpy
        import pandas
        if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\
                numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, \
                numpy.float64)):
            return repr(obj)
        elif isinstance(obj, numpy.matrixlib.defmatrix.matrix):
            try:
                import feather
            except ImportError:
                raise UsageError(
                    'The feather-format module is required to pass numpy matrix as R matrix'
                    'See https://github.com/wesm/feather/tree/master/python for details.'
                )
            feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather',
                                                       delete=False).name
            feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_)
            return 'data.matrix(read_feather("{}"))'.format(feather_tmp_)
        elif isinstance(obj, numpy.ndarray):
            return 'c(' + ','.join(_R_repr(x) for x in obj) + ')'
        elif isinstance(obj, pandas.DataFrame):
            try:
                import feather
            except ImportError:
                raise UsageError(
                    'The feather-format module is required to pass pandas DataFrame as R data.frame'
                    'See https://github.com/wesm/feather/tree/master/python for details.'
                )
            feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather',
                                                       delete=False).name
            try:
                data = obj.copy()
                feather.write_dataframe(data, feather_tmp_)
            except:
                # if data cannot be written, we try to manipulate data
                # frame to have consistent types and try again
                for c in data.columns:
                    if not homogeneous_type(data[c]):
                        data[c] = [str(x) for x in data[c]]
                feather.write_dataframe(data, feather_tmp_)
            return 'read_feather("{}")'.format(feather_tmp_)
        else:
            return repr('Unsupported datatype {}'.format(short_repr(obj)))
Beispiel #9
0
 def _julia_repr(self, obj):
     if isinstance(obj, bool):
         return 'true' if obj else 'false'
     elif isinstance(obj, (int, float)):
         return repr(obj)
     elif isinstance(obj, str):
         # Not using repr() here becasue of the problem of qoutes in Julia.
         return '"""' + obj + '"""'
     elif isinstance(obj, complex):
         return 'complex(' + str(obj.real) + ',' + str(obj.imag) + ')'
     elif isinstance(obj, Sequence):
         if len(obj) == 0:
             return '[]'
         else:
             return '[' + ','.join(self._julia_repr(x) for x in obj) + ']'
     elif obj is None:
         return 'NaN'
     elif isinstance(obj, dict):
         return 'Dict(' + ','.join(
             '"{}" => {}'.format(x, self._julia_repr(y))
             for x, y in obj.items()) + ')'
     elif isinstance(obj, set):
         return 'Set([' + ','.join(self._julia_repr(x) for x in obj) + '])'
     else:
         if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\
                 numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32)):
             return repr(obj)
         # need to specify Float64() as the return to Julia in order to avoid losing precision
         elif isinstance(obj, numpy.float64):
             return 'Float64(' + obj + ')'
         elif isinstance(obj, numpy.matrixlib.defmatrix.matrix):
             try:
                 import feather
             except ImportError:
                 raise UsageError(
                     'The feather-format module is required to pass numpy matrix as julia matrix(array)'
                     'See https://github.com/wesm/feather/tree/master/python for details.'
                 )
             feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather',
                                                        delete=False).name
             feather.write_dataframe(
                 pandas.DataFrame(obj).copy(), feather_tmp_)
             return 'convert(Matrix, Feather.read("' + feather_tmp_ + '"))'
         elif isinstance(obj, numpy.ndarray):
             return '[' + ','.join(self._julia_repr(x) for x in obj) + ']'
         elif isinstance(obj, pandas.DataFrame):
             try:
                 import feather
             except ImportError:
                 raise UsageError(
                     'The feather-format module is required to pass pandas DataFrame as julia.DataFrames'
                     'See https://github.com/wesm/feather/tree/master/python for details.'
                 )
             feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather',
                                                        delete=False).name
             try:
                 data = obj.copy()
                 # Julia DataFrame does not have index
                 if not isinstance(data.index, pandas.RangeIndex):
                     self.sos_kernel.warn(
                         'Raw index is ignored because Julia DataFrame does not support raw index.'
                     )
                 feather.write_dataframe(data, feather_tmp_)
             except Exception:
                 # if data cannot be written, we try to manipulate data
                 # frame to have consistent types and try again
                 for c in data.columns:
                     if not homogeneous_type(data[c]):
                         data[c] = [str(x) for x in data[c]]
                 feather.write_dataframe(data, feather_tmp_)
                 # use {!r} for path because the string might contain c:\ which needs to be
                 # double quoted.
             return 'Feather.read("' + feather_tmp_ + '")'
         elif isinstance(obj, pandas.Series):
             dat = list(obj.values)
             ind = list(obj.index.values)
             ans = 'NamedArray(' + '[' + ','.join(
                 self._julia_repr(x) for x in dat) + ']' + ',([' + ','.join(
                     self._julia_repr(y) for y in ind) + '],))'
             return ans.replace("'", '"')
         else:
             return repr('Unsupported datatype {}'.format(short_repr(obj)))