Example #1
def guess_data_type(orig_values, namask=None):
    """
    Use heuristics to guess data type.
    """
    valuemap, values = None, orig_values
    is_discrete = is_discrete_values(orig_values)
    if is_discrete:
        valuemap = sorted(is_discrete)
        coltype = DiscreteVariable
    else:
        # try to parse as float
        orig_values = np.asarray(orig_values)
        if namask is None:
            namask = isnastr(orig_values)
        values = np.empty_like(orig_values, dtype=float)
        values[namask] = np.nan
        try:
            np.copyto(values, orig_values, where=~namask, casting="unsafe")
        except ValueError:
            tvar = TimeVariable('_')
            try:
                values[~namask] = [tvar.parse(i) for i in orig_values[~namask]]
            except ValueError:
                coltype = StringVariable
                # return original_values
                values = orig_values
            else:
                coltype = TimeVariable
        else:
            coltype = ContinuousVariable
    return valuemap, values, coltype
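A minimal usage sketch for the heuristic above; it assumes guess_data_type and
the Orange helpers it references (is_discrete_values, TimeVariable, ...) are
importable, and the sample values are made up:

# Five distinct date strings: too many uniques for the discrete heuristic,
# the float cast fails, but TimeVariable.parse succeeds, so coltype comes
# back as TimeVariable, values hold epoch seconds, and valuemap stays None.
valuemap, values, coltype = guess_data_type(
    ["1970-01-02", "1970-01-03", "1970-01-04", "1970-01-05", "1970-01-06"])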
Example #2
 def test_parse_repr(self):
     for datestr, timestamp, outstr in self.TESTS:
         var = TimeVariable('time')
         ts = var.to_val(datestr)  # calls parse for strings
         if not np.isnan(ts):
             self.assertEqual(ts, timestamp, msg=datestr)
         self.assertEqual(var.repr_val(ts), outstr, msg=datestr)
Example #3
 def test_have_date(self):
     var = TimeVariable('time')
     ts = var.parse('1937-08-02')  # parse date
     self.assertEqual(var.repr_val(ts), '1937-08-02')
     ts = var.parse('16:20')  # parse time
     # have_date was set above, so the repr now includes a date part
     self.assertEqual(var.repr_val(ts), '1970-01-01 16:20:00')
Example #4
 def test_parse_utc(self):
     var = TimeVariable('time')
     datestr, offset = '2015-10-18 22:48:20', '+0200'
     ts1 = var.parse(datestr + offset)
     self.assertEqual(var.repr_val(ts1), datestr + offset)
     # Once a value is without a TZ, all the values lose it
     ts2 = var.parse(datestr)
     self.assertEqual(var.repr_val(ts2), datestr)
     self.assertEqual(var.repr_val(ts1), '2015-10-18 20:48:20')
Example #5
    def _guess_variable(self, field_name, field_metadata, inspect_table):
        from pymssql import STRING, NUMBER, DATETIME, DECIMAL

        type_code, *_ = field_metadata

        if type_code in (NUMBER, DECIMAL):
            return ContinuousVariable(field_name)

        if type_code == DATETIME:
            tv = TimeVariable(field_name)
            tv.have_date = True
            tv.have_time = True
            return tv

        if type_code == STRING:
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                if values:
                    return DiscreteVariable(field_name, values)

        return StringVariable(field_name)
Example #6
def guess_data_type(orig_values):
    """
    Use heuristics to guess data type.
    """
    valuemap, values = [], orig_values
    is_discrete = is_discrete_values(orig_values)
    if is_discrete:
        valuemap = sorted(is_discrete)
        coltype = DiscreteVariable
    else:
        try:
            values = [float(i) for i in orig_values]
        except ValueError:
            tvar = TimeVariable('_')
            try:
                values = [tvar.parse(i) for i in orig_values]
            except ValueError:
                coltype = StringVariable
            else:
                coltype = TimeVariable
        else:
            coltype = ContinuousVariable
    return valuemap, values, coltype
Example #7
def _date_to_iso(date):
    possible_date_formats = [
        '%Y %b %d',
        '%Y %b',
        '%Y',
    ]

    season_mapping = {
        'fall': 'Sep',
        'autumn': 'Sep',
        'winter': 'Dec',
        'spring': 'Mar',
        'summer': 'Jun',
    }

    date = date.lower()
    # Map seasons to their respective months.
    for season, month in season_mapping.items():
        date = date.replace(season, month)
    date = date.split('-')[0]  # 2015 Sep-Dec --> 2015 Sep

    time_var = TimeVariable()
    for date_format in possible_date_formats:
        try:
            date_string = datetime.strptime(
                    date, date_format
            ).date().isoformat()
            return time_var.parse(date_string)
        except ValueError:
            continue  # Try the next format.

    warnings.warn(
            'Could not parse "{}" into a date.'.format(date),
            RuntimeWarning
    )
    return time_var.parse(np.nan)
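A hedged usage sketch of the season handling above, relying only on the
imports the function itself uses (datetime, warnings, numpy as np, and
Orange's TimeVariable):

# 'fall' is replaced with 'Sep', the '-dec' half is dropped by the split
# on '-', and '2015 Sep' then matches the '%Y %b' format, so this returns
# TimeVariable's parse of '2015-09-01'.
ts = _date_to_iso('2015 Fall-Dec')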
Example #8
def transpose_table(table):
    """
    Transpose the rows and columns of the table.

    Args:
        table: Data in :obj:`Orange.data.Table`

    Returns:
         Transposed :obj:`Orange.data.Table`. (Genes as columns)
    """
    attrs = table.domain.attributes
    attr = [ContinuousVariable.make(ex['Gene'].value) for ex in table]
    # Set metas
    new_metas = [StringVariable.make(name) if name != 'Time' else TimeVariable.make(name)
                 for name in sorted(table.domain.variables[0].attributes.keys())]
    domain = Domain(attr, metas=new_metas)
    meta_values = [[exp.attributes[var.name] for var in domain.metas] for exp in attrs]

    return Table(domain, table.X.transpose(), metas=meta_values)
Example #9
    def _guess_variable(self, field_name, field_metadata, inspect_table):
        type_code = field_metadata[0]

        FLOATISH_TYPES = (700, 701, 1700)  # real, float8, numeric
        INT_TYPES = (20, 21, 23)  # bigint, int, smallint
        CHAR_TYPES = (25, 1042, 1043,)  # text, char, varchar
        BOOLEAN_TYPES = (16,)  # bool
        DATE_TYPES = (1082, 1114, 1184, )  # date, timestamp, timestamptz
        # time, timestamp, timestamptz, timetz
        TIME_TYPES = (1083, 1114, 1184, 1266,)

        if type_code in FLOATISH_TYPES:
            return ContinuousVariable.make(field_name)

        if type_code in TIME_TYPES + DATE_TYPES:
            tv = TimeVariable.make(field_name)
            tv.have_date |= type_code in DATE_TYPES
            tv.have_time |= type_code in TIME_TYPES
            return tv

        if type_code in INT_TYPES:  # bigint, int, smallint
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                if values:
                    return DiscreteVariable.make(field_name, values)
            return ContinuousVariable.make(field_name)

        if type_code in BOOLEAN_TYPES:
            return DiscreteVariable.make(field_name, ['false', 'true'])

        if type_code in CHAR_TYPES:
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                # remove trailing spaces
                values = [v.rstrip() for v in values]
                if values:
                    return DiscreteVariable.make(field_name, values)

        return StringVariable.make(field_name)
Example #10
    def read(self):
        try:
            import opusFC
        except ImportError:
            raise RuntimeError(self._OPUS_WARNING)

        if self.sheet:
            db = self.sheet
        else:
            db = self.sheets[0]

        db = tuple(db.split(" "))
        dim = db[1]

        try:
            data = opusFC.getOpusData(self.filename, db)
        except Exception:
            raise IOError("Couldn't load spectrum from " + self.filename)

        attrs, clses, metas = [], [], []

        attrs = [ContinuousVariable.make(repr(data.x[i]))
                 for i in range(data.x.shape[0])]

        y_data = None
        meta_data = None

        if type(data) == opusFC.MultiRegionDataReturn:
            y_data = []
            meta_data = []
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y'),
                          StringVariable.make('map_region'),
                          TimeVariable.make('start_time')])
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                start_time = region.start_time
                meta_region = np.column_stack((mapX, mapY,
                                               map_region, start_time))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.MultiRegionTRCDataReturn:
            y_data = []
            meta_data = []
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y'),
                          StringVariable.make('map_region')])
            attrs = [ContinuousVariable.make(repr(data.labels[i]))
                     for i in range(len(data.labels))]
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                meta_region = np.column_stack((mapX, mapY, map_region))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.ImageDataReturn:
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y')])

            data_3D = data.spectra

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.ImageTRCDataReturn:
            metas.extend([ContinuousVariable.make('map_x'),
                          ContinuousVariable.make('map_y')])

            attrs = [ContinuousVariable.make(repr(data.labels[i]))
                     for i in range(len(data.labels))]
            data_3D = data.traces

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.TimeResolvedTRCDataReturn:
            y_data = data.traces

        elif type(data) == opusFC.TimeResolvedDataReturn:
            metas.extend([ContinuousVariable.make('z')])

            y_data = data.spectra
            meta_data = data.z

        elif type(data) == opusFC.SingleDataReturn:
            y_data = data.y[None, :]

        else:
            raise ValueError("Empty or unsupported opusFC DataReturn object: " + type(data))

        import_params = ['SRT', 'SNM']

        for param_key in import_params:
            try:
                param = data.parameters[param_key]
            except KeyError:
                pass  # TODO should notify user?
            else:
                try:
                    param_name = opusFC.paramDict[param_key]
                except KeyError:
                    param_name = param_key
                if param_key == 'SRT':
                    var = TimeVariable.make(param_name)
                elif type(param) is float:
                    var = ContinuousVariable.make(param_name)
                elif type(param) is str:
                    var = StringVariable.make(param_name)
                else:
                    raise ValueError  # found a parameter type that still needs handling
                metas.extend([var])
                params = np.full((y_data.shape[0],), param, np.array(param).dtype)
                if meta_data is not None:
                    # NB dtype default will be np.array(fill_value).dtype in future
                    meta_data = np.column_stack((meta_data, params.astype(object)))
                else:
                    meta_data = params

        domain = Orange.data.Domain(attrs, clses, metas)

        meta_data = np.atleast_2d(meta_data)

        table = Orange.data.Table.from_numpy(domain,
                                             y_data.astype(float, order='C'),
                                             metas=meta_data)

        return table
Example #11
    def data_table(self, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
        as well **have it sorted column-major**, e.g. ``order='F'``).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.
        """
        if not headers:
            headers, data = self.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if 3 == len(headers):
            names, types, flags = map(list, headers)
        else:
            if 1 == len(headers):
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[
                    i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else
                    ('', i) for i in headers[0]
                ])
                names = list(names)
            elif 2 == len(headers):
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            types = [
                ''.join(filter(str.isupper, flag)).lower() for flag in _flags
            ]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        def _equal_length(lst):
            lst.extend([''] * (rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array
        data = np.array([_equal_length(list(row)) for row in data if any(row)],
                        copy=False,
                        dtype=object,
                        order='F')

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i: continue

            type_flag = types and types[col].strip()
            try:
                orig_values = [
                    np.nan if i in MISSING_VALUES else i
                    for i in (i.strip() for i in data[:, col])
                ]
            except IndexError:
                # Having no data instances leads here
                orig_values = []
                # In this case, coltype could be anything. It's set as-is
                # only to satisfy test_table.TableTestCase.test_append
                coltype = DiscreteVariable

            coltype_kwargs = {}
            valuemap = []
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    for row, num in enumerate(orig_values):
                        try:
                            float(num)
                        except ValueError:
                            break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(
                                         row + len(headers) + 1, col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable

            elif (type_flag in DiscreteVariable.TYPE_HEADERS
                  or _RE_DISCRETE_LIST.match(type_flag)):
                if _RE_DISCRETE_LIST.match(type_flag):
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {np.nan})

            else:
                # No known type specified, use heuristics
                is_discrete = is_discrete_values(orig_values)
                if is_discrete:
                    valuemap = sorted(is_discrete)
                else:
                    try:
                        values = [float(i) for i in orig_values]
                    except ValueError:
                        tvar = TimeVariable('_')
                        try:
                            values = [tvar.parse(i) for i in orig_values]
                        except ValueError:
                            coltype = StringVariable
                        else:
                            coltype = TimeVariable
                    else:
                        coltype = ContinuousVariable

            if valuemap:
                # Map discrete data to ints
                def valuemap_index(val):
                    try:
                        return valuemap.index(val)
                    except ValueError:
                        return np.nan

                values = np.vectorize(valuemap_index,
                                      otypes=[float])(orig_values)
                coltype = DiscreteVariable
                coltype_kwargs.update(values=valuemap)

            if coltype is StringVariable:
                values = ['' if i is np.nan else i for i in orig_values]

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to
            cols.append(col)
            if domain_vars is not None:
                if names and names[col]:
                    # Use existing variable if available
                    var = coltype.make(names[col].strip(), **coltype_kwargs)
                else:
                    # Never use existing for un-named variables
                    var = coltype(next(NAMEGEN), **coltype_kwargs)
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

                # Reorder discrete values to match existing variable
                if var.is_discrete and not var.ordered:
                    new_order, old_order = var.values, coltype_kwargs.get(
                        'values', var.values)
                    if new_order != old_order:
                        offset = len(new_order)
                        column = values if data.ndim > 1 else data
                        column += offset
                        for i, val in enumerate(var.values):
                            try:
                                oldval = old_order.index(val)
                            except ValueError:
                                continue
                            bn.replace(column, offset + oldval,
                                       new_order.index(val))

            if coltype is TimeVariable:
                # Re-parse the values: only now, after the coltype.make call
                # above, is var the correct variable
                values = [var.parse(i) for i in orig_values]

            # Write back the changed data. This is needed to pass the
            # correct, converted values into Table.from_numpy below
            try:
                data[:, col] = values
            except IndexError:
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        table = Table.from_numpy(domain, data[:, Xcols].astype(float,
                                                               order='C'),
                                 data[:, Ycols].astype(float, order='C'),
                                 data[:, Mcols].astype(object, order='C'),
                                 data[:, Wcols].astype(float, order='C'))
        return table
Example #12
 def test_parse_invalid(self):
     var = TimeVariable('var')
     with self.assertRaises(ValueError):
         var.parse('123')
Example #13
 def test_parse_timestamp(self):
     var = TimeVariable("time")
     datestr = str(datetime(2016, 6, 14, 23, 8, tzinfo=timezone.utc).timestamp())
     ts1 = var.parse(datestr)
     self.assertEqual(var.repr_val(ts1), '2016-06-14 23:08:00')
Example #14
]

discrete = list(chain(rgb, ints))


def _to_timestamps(years):
    return [
        datetime.datetime(year, 1, 1).timestamp()
        if not np.isnan(year) else np.nan for year in years
    ]


# Time variable variations. On Windows, timestamps need to be valid, so we
# just fill them in with arbitrary years
time_full = VarDataPair(
    TimeVariable('time_full'),
    np.array(_to_timestamps([2000, 2001, 2002, 2003, 2004]), dtype=float),
)
time_missing = VarDataPair(
    TimeVariable('time_missing'),
    np.array(_to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float),
)
time_all_missing = VarDataPair(
    TimeVariable('time_all_missing'),
    np.array(_to_timestamps([np.nan] * 5), dtype=float),
)
time_same = VarDataPair(
    TimeVariable('time_same'),
    np.array(_to_timestamps([2004] * 5), dtype=float),
)
time = [time_full, time_missing, time_all_missing, time_same]
Example #15
)
ints_missing = VarDataPair(
    DiscreteVariable('ints_missing', values=('2', '3', '4'), ordered=True),
    np.array([0, 1, 1, np.nan, 2], dtype=float),
)


def _to_timestamps(years):
    return [
        datetime.datetime(year, 1, 1).timestamp()
        if not np.isnan(year) else np.nan for year in years
    ]


time_full = VarDataPair(
    TimeVariable('time_full'),
    np.array(_to_timestamps([2000, 2001, 2002, 2003, 2004]), dtype=float),
)
time_missing = VarDataPair(
    TimeVariable('time_missing'),
    np.array(_to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float),
)

# String variable variations
string_full = VarDataPair(
    StringVariable('string_full'),
    np.array(['a', 'b', 'c', 'd', 'e'], dtype=object),
)
string_missing = VarDataPair(
    StringVariable('string_missing'),
    np.array(['a', 'b', 'c', StringVariable.Unknown, 'e'], dtype=object),
Example #16
 def _new_var(self):
     name = self._new_var_name()
     if self.operation in self.TimePreserving \
             and all(isinstance(var, TimeVariable) for var in self.variables):
         return TimeVariable(name)
     return ContinuousVariable(name)
Example #17
 def test_repr_value(self):
     # https://github.com/biolab/orange3/pull/1760
     var = TimeVariable('time')
     self.assertEqual(var.repr_val(Value(var, 416.3)), '416.3')
Example #18
 def test_no_date_no_time(self):
     self.assertEqual(TimeVariable('relative time').repr_val(1.6), '1.6')
Example #19
def table_from_frame(df, class_name, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Parameters
    ----------
    df : pandas.DataFrame
    class_name : str
        Name of the column to use as the class variable.
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """
    def _is_discrete(s):
        return (is_categorical_dtype(s) or is_object_dtype(s) and
                (force_nominal or s.nunique() < s.size**.666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:  # pylint: disable=broad-except
            pass
        return False

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not (df.index.is_integer() and (df.index.is_monotonic_increasing
                                       or df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    attrs, metas, class_vars = [], [], []
    X, M = [], []

    # Iterate over columns
    for name, s in df.items():
        name = str(name)
        if name == class_name:
            discrete = s.astype('category').cat
            class_vars.append(
                DiscreteVariable(name,
                                 discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(
                DiscreteVariable(name,
                                 discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            X.append(
                s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(
        Domain(attrs, class_vars, metas),
        np.column_stack(X) if X else np.empty((df.shape[0], 0)), None,
        np.column_stack(M) if M else None)
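A small usage sketch for table_from_frame; the column names are made up, and
pandas as pd plus the Orange classes used above are assumed to be imported:

df = pd.DataFrame({
    'when': ['2020-01-01', '2020-02-01', '2020-03-01'],
    'value': [1.0, 2.5, 3.0],
    'label': ['a', 'b', 'a'],
})
# 'when' parses as datetimes and becomes a TimeVariable, 'value' is numeric
# and becomes a ContinuousVariable, and 'label' matches class_name and
# becomes the discrete class variable.
table = table_from_frame(df, class_name='label')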
Example #20
    def data_table(self, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
        as well **have it sorted column-major**, e.g. ``order='F'``).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.
        """
        if not headers:
            headers, data = self.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if 3 == len(headers):
            names, types, flags = map(list, headers)
        else:
            if 1 == len(headers):
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else ('', i)
                                      for i in headers[0]])
                names = list(names)
            elif 2 == len(headers):
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            types = [''.join(filter(str.isupper, flag)).lower() for flag in _flags]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        def _equal_length(lst):
            lst.extend(['']*(rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array
        data = np.array([_equal_length(list(row)) for row in data if any(row)],
                        copy=False, dtype=object, order='F')

        # Data may actually be longer than headers were
        try: rowlen = data.shape[1]
        except IndexError: pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i: continue

            type_flag = types and types[col].strip()
            try:
                orig_values = [np.nan if i in MISSING_VALUES else i
                               for i in (i.strip() for i in data[:, col])]
            except IndexError:
                # Having no data instances leads here
                orig_values = []
                # In this case, coltype could be anything. It's set as-is
                # only to satisfy test_table.TableTestCase.test_append
                coltype = DiscreteVariable

            coltype_kwargs = {}
            valuemap = []
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    for row, num in enumerate(orig_values):
                        try: float(num)
                        except ValueError: break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(row + len(headers) + 1,
                                                                 col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable

            elif (type_flag in DiscreteVariable.TYPE_HEADERS or
                  _RE_DISCRETE_LIST.match(type_flag)):
                if _RE_DISCRETE_LIST.match(type_flag):
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {np.nan})

            else:
                # No known type specified, use heuristics
                is_discrete = is_discrete_values(orig_values)
                if is_discrete:
                    valuemap = sorted(is_discrete)
                else:
                    try: values = [float(i) for i in orig_values]
                    except ValueError:
                        tvar = TimeVariable('_')
                        try: values = [tvar.parse(i) for i in orig_values]
                        except ValueError:
                            coltype = StringVariable
                        else:
                            coltype = TimeVariable
                    else:
                        coltype = ContinuousVariable

            if valuemap:
                # Map discrete data to ints
                def valuemap_index(val):
                    try: return valuemap.index(val)
                    except ValueError: return np.nan

                values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
                coltype = DiscreteVariable
                coltype_kwargs.update(values=valuemap)

            if coltype is StringVariable:
                values = ['' if i is np.nan else i
                          for i in orig_values]

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to
            cols.append(col)
            if domain_vars is not None:
                if names and names[col]:
                    # Use existing variable if available
                    var = coltype.make(names[col].strip(), **coltype_kwargs)
                else:
                    # Never use existing for un-named variables
                    var = coltype(next(NAMEGEN), **coltype_kwargs)
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

                # Reorder discrete values to match existing variable
                if var.is_discrete and not var.ordered:
                    new_order, old_order = var.values, coltype_kwargs.get('values', var.values)
                    if new_order != old_order:
                        offset = len(new_order)
                        column = values if data.ndim > 1 else data
                        column += offset
                        for i, val in enumerate(var.values):
                            try: oldval = old_order.index(val)
                            except ValueError: continue
                            bn.replace(column, offset + oldval, new_order.index(val))

            if coltype is TimeVariable:
                # Re-parse the values: only now, after the coltype.make call
                # above, is var the correct variable
                values = [var.parse(i) for i in orig_values]

            # Write back the changed data. This is needed to pass the
            # correct, converted values into Table.from_numpy below
            try: data[:, col] = values
            except IndexError: pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        table = Table.from_numpy(domain,
                                 data[:, Xcols].astype(float, order='C'),
                                 data[:, Ycols].astype(float, order='C'),
                                 data[:, Mcols].astype(object, order='C'),
                                 data[:, Wcols].astype(float, order='C'))
        return table
Example #21
    def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs):
        def map_values(index, _X):
            values = np.unique(_X[:, index])
            values = np.delete(values, np.where(values == "nan")[0])
            for j, value in enumerate(values):
                _X[:, index][_X[:, index] == value] = j
            return values

        create_time_var = \
            isinstance(val_var, TimeVariable) and \
            all(fun in self.TimeVarFunctions for fun in agg_funs)
        create_cont_var = \
            not val_var or val_var.is_continuous and \
            (not isinstance(val_var, TimeVariable) or
             all(fun in self.FloatFunctions for fun in agg_funs))

        vals = np.array(self._col_var.values)[self._col_var_groups.astype(int)]
        if create_time_var:
            kwargs = {
                "have_date": val_var.have_date,
                "have_time": val_var.have_time
            }
            attrs = [[TimeVariable(f"{v}", **kwargs) for v in vals]] * 2
            attrs.extend([[TimeVariable("Total", **kwargs)]] * 2)
        elif create_cont_var:
            attrs = [[ContinuousVariable(f"{v}", 1) for v in vals]] * 2
            attrs.extend([[ContinuousVariable("Total", 1)]] * 2)
        else:
            attrs = []
            for x in (X, X_h):
                attrs.append([
                    DiscreteVariable(f"{v}", map_values(i, x))
                    for i, v in enumerate(vals, 2)
                ])
            for x in (X_v, X_t):
                attrs.append([DiscreteVariable("Total", map_values(0, x))])
        row_var_h = DiscreteVariable(self._row_var.name, values=["Total"])
        aggr_attr = DiscreteVariable('Aggregate', [str(f) for f in agg_funs])

        same_row_col = self._col_var is self._row_var

        extra_vars = [self._row_var, aggr_attr]
        uniq_a = get_unique_names_duplicates([v.name for v in extra_vars] +
                                             [atr.name for atr in attrs[0]])
        for (idx, var), u in zip(enumerate(chain(extra_vars, attrs[0])),
                                 uniq_a):
            if var.name == u:
                continue
            if idx == 0:
                self.renamed.append(self._row_var.name)
                self._row_var = self._row_var.copy(name=u)
                if same_row_col:
                    self._col_var = self._row_var
                row_var_h = row_var_h.copy(name=u)
            elif idx == 1:
                self.renamed.append(aggr_attr.name)
                aggr_attr = aggr_attr.copy(name=u)
            else:
                self.renamed.append(var.name)
                attrs[0][idx - 2] = var.copy(name=u)
                attrs[1][idx - 2] = var.copy(name=u)

        if same_row_col:
            vals = tuple(v.name for v in attrs[0])
            self._row_var.make(self._row_var.name, values=vals)
            vals = tuple(v.name for v in attrs[2])
            row_var_h.make(row_var_h.name, vals)

        return (Domain([self._row_var, aggr_attr] + attrs[0]),
                Domain([row_var_h, aggr_attr] + attrs[1]), Domain(attrs[2]),
                Domain(attrs[3]))
Example #22
 def test_have_date_have_time_in_construct(self):
     """Test if have_time and have_date is correctly set"""
     var = TimeVariable('time', have_date=1)
     self.assertTrue(var.have_date)
     self.assertFalse(var.have_time)
Example #23
 def setUp(self):
     self.callback = Mock()
     self.editor = TimeVariableEditor(self.parent,
                                      TimeVariable("var", have_date=1),
                                      self.callback)
Example #24
 def test_parse_repr(self):
     for datestr, timestamp, outstr in self.TESTS:
         var = TimeVariable('time')
         ts = var.parse(datestr)
         self.assertEqual(ts, timestamp, msg=datestr)
         self.assertEqual(var.repr_val(ts), outstr, msg=datestr)
Example #25
 def test_time(self):
     X = TimeVariable("X")
     self._test_common(X)
Example #26
    def read(self):
        import opusFC

        if self.sheet:
            db = self.sheet
        else:
            db = self.sheets[0]

        db = tuple(db.split(" "))
        dim = db[1]

        try:
            data = opusFC.getOpusData(self.filename, db)
        except Exception:
            raise IOError("Couldn't load spectrum from " + self.filename)

        attrs, clses, metas = [], [], []

        attrs = [
            ContinuousVariable.make(repr(data.x[i]))
            for i in range(data.x.shape[0])
        ]

        y_data = None
        meta_data = None

        if type(data) == opusFC.MultiRegionDataReturn:
            y_data = []
            meta_data = []
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y'),
                StringVariable.make('map_region'),
                TimeVariable.make('start_time')
            ])
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                start_time = region.start_time
                meta_region = np.column_stack(
                    (mapX, mapY, map_region, start_time))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.MultiRegionTRCDataReturn:
            y_data = []
            meta_data = []
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y'),
                StringVariable.make('map_region')
            ])
            attrs = [
                ContinuousVariable.make(repr(data.labels[i]))
                for i in range(len(data.labels))
            ]
            for region in data.regions:
                y_data.append(region.spectra)
                mapX = region.mapX
                mapY = region.mapY
                map_region = np.full_like(mapX, region.title, dtype=object)
                meta_region = np.column_stack((mapX, mapY, map_region))
                meta_data.append(meta_region.astype(object))
            y_data = np.vstack(y_data)
            meta_data = np.vstack(meta_data)

        elif type(data) == opusFC.ImageDataReturn:
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y')
            ])

            data_3D = data.spectra

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.ImageTRCDataReturn:
            metas.extend([
                ContinuousVariable.make('map_x'),
                ContinuousVariable.make('map_y')
            ])

            attrs = [
                ContinuousVariable.make(repr(data.labels[i]))
                for i in range(len(data.labels))
            ]
            data_3D = data.traces

            for i in np.ndindex(data_3D.shape[:1]):
                map_y = np.full_like(data.mapX, data.mapY[i])
                coord = np.column_stack((data.mapX, map_y))
                if y_data is None:
                    y_data = data_3D[i]
                    meta_data = coord.astype(object)
                else:
                    y_data = np.vstack((y_data, data_3D[i]))
                    meta_data = np.vstack((meta_data, coord))

        elif type(data) == opusFC.TimeResolvedTRCDataReturn:
            y_data = data.traces

        elif type(data) == opusFC.TimeResolvedDataReturn:
            metas.extend([ContinuousVariable.make('z')])

            y_data = data.spectra
            meta_data = data.z

        elif type(data) == opusFC.SingleDataReturn:
            y_data = data.y[None, :]

        else:
            raise ValueError(
                "Empty or unsupported opusFC DataReturn object: " +
                str(type(data)))

        import_params = ['SRT', 'SNM']

        for param_key in import_params:
            try:
                param = data.parameters[param_key]
            except KeyError:
                pass  # TODO should notify user?
            else:
                try:
                    param_name = opusFC.paramDict[param_key]
                except KeyError:
                    param_name = param_key
                if param_key == 'SRT':
                    var = TimeVariable.make(param_name)
                elif type(param) is float:
                    var = ContinuousVariable.make(param_name)
                elif type(param) is str:
                    var = StringVariable.make(param_name)
                else:
                    raise ValueError  # found a parameter type that still needs handling
                metas.extend([var])
                params = np.full((y_data.shape[0], ), param,
                                 np.array(param).dtype)
                if meta_data is not None:
                    # NB dtype default will be np.array(fill_value).dtype in future
                    meta_data = np.column_stack(
                        (meta_data, params.astype(object)))
                else:
                    meta_data = params

        domain = Orange.data.Domain(attrs, clses, metas)

        meta_data = np.atleast_2d(meta_data)

        table = Orange.data.Table.from_numpy(domain,
                                             y_data.astype(float, order='C'),
                                             metas=meta_data)

        return table
Example #27
def vars_from_df(df, role=None, force_nominal=False):
    if role is None and hasattr(df, 'orange_role'):
        role = df.orange_role
    df = _reset_index(df)

    cols = [], [], []
    exprs = [], [], []
    vars_ = [], [], []

    for column in df.columns:
        s = df[column]
        _role = Role.Attribute if role is None else role
        if hasattr(df, 'orange_variables') and column in df.orange_variables:
            original_var = df.orange_variables[column]
            var = original_var.copy(compute_value=None)
            expr = None
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            expr = _convert_datetime
        elif _is_discrete(s, force_nominal):
            discrete = s.astype("category").cat
            var = DiscreteVariable(str(column),
                                   discrete.categories.astype(str).tolist())
            expr = to_categorical
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 for ints, else keep the default behaviour
                str(column),
                number_of_decimals=(0 if is_integer_dtype(s) else None))
            expr = None
        else:
            if role is not None and role != Role.Meta:
                raise ValueError("String variable must be in metas.")
            _role = Role.Meta
            var = StringVariable(str(column))
            expr = lambda s, _: np.asarray(s, dtype=object)

        cols[_role].append(column)
        exprs[_role].append(expr)
        vars_[_role].append(var)

    xym = []
    for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
        if not a_cols:
            arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
        elif not any(a_expr):
            # if all the columns are used, the table will share memory with the dataframe
            a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
            if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
                arr = csr_matrix(a_df.sparse.to_coo())
            else:
                arr = np.asarray(a_df)
        else:
            # we'll have to copy the table to resolve any expressions
            arr = np.array([
                expr(df[col], var) if expr else np.asarray(df[col])
                for var, col, expr in zip(a_vars, a_cols, a_expr)
            ]).T
        xym.append(arr)

    # Let the tables share memory with pandas frame
    if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
        xym[1] = xym[1][:, 0]

    return xym, Domain(*vars_)
Example #28
 def test_parse_invalid(self):
     var = TimeVariable('var')
     with self.assertRaises(ValueError):
         var.parse('123')
Example #29
 def test_parse_timestamp(self):
     var = TimeVariable("time")
     datestr = str(
         datetime(2016, 6, 14, 23, 8, tzinfo=timezone.utc).timestamp())
     ts1 = var.parse(datestr)
     self.assertEqual(var.repr_val(ts1), '2016-06-14 23:08:00')
Example #30
 def test_repr_value(self):
     # https://github.com/biolab/orange3/pull/1760
     var = TimeVariable('time')
     self.assertEqual(var.repr_val(Value(var, 416.3)), '416.3')
Example #31
class TwitterAPI:
    """ Fetch tweets from the Tweeter API.

    Notes:
        Results across multiple searches are aggregated. To remove tweets from
        previous searches and only return results from the last search, either
        call the `reset` method before searching or pass `collecting=False`
        to the search method.
    """

    attributes = []
    class_vars = []

    tv = TimeVariable("Date")
    authors = [
        (DiscreteVariable("Author"), lambda doc: "@" + doc.author.screen_name,),
    ]
    metas = [
        (
            StringVariable("Content"),
            lambda doc: doc.full_text if not doc.retweeted else doc.text,
        ),
        # temporary fix until Orange>3.30.1 then change back to
        # (tv, lambda doc: TwitterAPI.tv.parse(doc.created_at.isoformat())),
        (tv, lambda doc: TwitterAPI.tv.parse(
                    TwitterAPI.tv._tzre_sub(doc.created_at.isoformat()))),
        (DiscreteVariable("Language"), lambda doc: doc.lang),
        (
            DiscreteVariable("Location"),
            lambda doc: getattr(doc.place, "country_code", None),
        ),
        (
            ContinuousVariable("Number of Likes", number_of_decimals=0),
            lambda doc: doc.favorite_count,
        ),
        (
            ContinuousVariable("Number of Retweets", number_of_decimals=0),
            lambda doc: doc.retweet_count,
        ),
        (
            DiscreteVariable("In Reply To"),
            lambda doc: "@" + doc.in_reply_to_screen_name
            if doc.in_reply_to_screen_name
            else "",
        ),
        (DiscreteVariable("Author Name"), lambda doc: doc.author.name),
        (
            StringVariable("Author Description"),
            lambda doc: doc.author.description,
        ),
        (
            ContinuousVariable("Author Statuses Count", number_of_decimals=0),
            lambda doc: doc.author.statuses_count,
        ),
        (
            ContinuousVariable("Author Favourites Count", number_of_decimals=0),
            lambda doc: doc.author.favourites_count,
        ),
        (
            ContinuousVariable("Author Friends Count", number_of_decimals=0),
            lambda doc: doc.author.friends_count,
        ),
        (
            ContinuousVariable("Author Followers Count", number_of_decimals=0),
            lambda doc: doc.author.followers_count,
        ),
        (
            ContinuousVariable("Author Listed Count", number_of_decimals=0),
            lambda doc: doc.author.listed_count,
        ),
        (
            DiscreteVariable("Author Verified"),
            lambda doc: str(doc.author.verified),
        ),
        (
            ContinuousVariable("Longitude"),
            lambda doc: coordinates_geoJSON(doc.coordinates)[0],
        ),
        (
            ContinuousVariable("Latitude"),
            lambda doc: coordinates_geoJSON(doc.coordinates)[1],
        ),
    ]

    text_features = [metas[0][0]]  # Content
    string_attributes = [m for m, _ in metas if isinstance(m, StringVariable)]

    def __init__(self, credentials):
        self.key = credentials
        self.api = tweepy.API(credentials.auth)
        self.container = OrderedDict()
        self.search_history = []

    @property
    def tweets(self):
        return self.container.values()

    def search_content(
        self,
        content,
        *,
        max_tweets=0,
        lang=None,
        allow_retweets=True,
        collecting=False,
        callback=None
    ):
        """ Search by content.

        Args:
            content (list of str): A list of key words to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            lang (str): A language's code (either ISO 639-1 or ISO 639-3
                formats).
            allow_retweets (bool): Whether to download retweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:
            max_tweets = float("Inf")

        def build_query():
            nonlocal content
            if not content:
                q = "from: "
            else:
                if not isinstance(content, list):
                    content = [content]
                q = " OR ".join(['"{}"'.format(q) for q in content])
            if not allow_retweets:
                q += " -filter:retweets"
            return q

        query = build_query()
        cursor = tweepy.Cursor(
            self.api.search_tweets, q=query, lang=lang, tweet_mode="extended"
        )

        corpus, count = self.fetch(
            cursor, max_tweets, search_author=False, callback=callback
        )
        self.append_history(
            "Content",
            content,
            lang if lang else "Any",
            str(allow_retweets),
            count,
        )
        return corpus

    def search_authors(
        self, authors, *, max_tweets=0, collecting=False, callback=None
    ):
        """ Search by authors.

        Args:
            authors (list of str): A list of authors to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:  # set to max allowed for progress
            max_tweets = 3200

        if not isinstance(authors, list):
            authors = [authors]

        cursors = [
            tweepy.Cursor(
                self.api.user_timeline, screen_name=a, tweet_mode="extended"
            )
            for a in authors
        ]
        corpus, count = self.fetch(
            cursors, max_tweets, search_author=True, callback=callback
        )
        self.append_history("Author", authors, None, None, count)
        return corpus

    def fetch(self, cursors, max_tweets, search_author, callback):
        if not isinstance(cursors, list):
            cursors = [cursors]

        count = 0
        for i, cursor in enumerate(cursors):
            for j, tweet in enumerate(cursor.items(max_tweets), start=1):
                if tweet.id not in self.container:
                    count += 1
                self.container[tweet.id] = tweet
                if j % 20 == 0:
                    if callback is not None:
                        callback(
                            (i * max_tweets + j) / (len(cursors) * max_tweets)
                        )

        return self.create_corpus(search_author), count

    def create_corpus(self, search_author):
        if search_author:
            class_vars = self.authors
            metas = self.metas
        else:
            class_vars = []
            metas = self.metas + self.authors
        return Corpus.from_documents(
            self.tweets,
            "Twitter",
            self.attributes,
            class_vars,
            metas,
            title_indices=[-1],
        )

    def reset(self):
        """ Removes all downloaded tweets. """
        self.search_history = []
        self.container = OrderedDict()

    def append_history(self, mode, query, lang, allow_retweets, n_tweets):
        query = ", ".join(query) if isinstance(query, Iterable) else query
        if lang in code2lang.keys():
            lang = code2lang[lang]
        self.search_history.append(
            (
                ("Query", query),
                ("Search by", mode),
                ("Language", lang),
                ("Allow retweets", allow_retweets),
                ("Tweets count", n_tweets),
            )
        )

    def report(self):
        return self.search_history
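A hedged instantiation sketch, left commented out because it needs live
tweepy credentials; the credentials object shown is only a placeholder:

# credentials = ...  # any object tweepy accepts that exposes an `auth` attribute
# api = TwitterAPI(credentials)
# corpus = api.search_content(['data mining'], max_tweets=100, lang='en')
# by_author = api.search_authors(['biolab'], max_tweets=50)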