def reshape_dot(column_names, data, measure_vars, output, id_vars=None):
     "reshape if '.value' is present in the column names."
     boolean = [True if ent == '.value' else False for ent in column_names]
     dot_value = [[*compress(extract, boolean)] for extract in output]
     if len(dot_value[0]) > 1:
         dot_value = ["".join(extract) for extract in dot_value]
     else:
         dot_value = [*chain.from_iterable(dot_value)]
     checks = set(dot_value)
     if id_vars and checks.intersection(id_vars):
         raise ValueError(
             f"The new column names associated with .value -> {checks} "
             "are duplicated in id_vars."
         )
     boolean = [not true for true in boolean]
     others = [tuple(compress(extract, boolean)) for extract in output]
     headers_for_others = [extract for extract in column_names if extract != '.value']
     measure_vars = [frame for frame in data[:, measure_vars]]
     out = defaultdict(list)
     for key, value_column, frame in zip(others, dot_value, measure_vars):
         frame.names = [value_column]
         out[key].append(frame)
     headers_for_others = [dt.Frame([key], names = headers_for_others) for key, _ in out.items()]
     out = [dt.cbind(frame, force = True) for _, frame in out.items()]
     out = [dt.cbind(dt.repeat(left, right.nrows), right, force = True) for left, right in zip(headers_for_others, out)]
     out = dt.rbind(out, force = True)
     if id_vars:
         id_vars = dt.repeat(data[:, id_vars], out.nrows//data.nrows)
         return dt.cbind([id_vars, out], force = True)
     return out
 def reshape_no_dot(measure_vars, output, data, id_vars=None):
     values = []
     for frame in data[:, measure_vars]:
         frame.names = [value_name]
         values.append(frame)
     values = dt.rbind(values, force=True)
     if id_vars:
         id_vars = dt.repeat(data[:, id_vars], len(measure_vars))
         return dt.cbind([id_vars, output, values], force = True)
     return dt.cbind([output, values], force = True)
Esempio n. 3
0
def test_repeat():
    DT = dt.Frame(A=[d(2001, 10, 12, 0, 0, 0)])
    DT = dt.repeat(DT, 5)
    assert_equals(DT, dt.Frame(A=[d(2001, 10, 12, 0, 0, 0)] * 5))
Esempio n. 4
0
def test_date32_repeat():
    DT = dt.Frame([11], stype='date32')
    RES = dt.repeat(DT, 10)
    assert_equals(RES, dt.Frame([11]*10, stype='date32'))
    RES2 = dt.repeat(RES, 5)
    assert_equals(RES2, dt.Frame([11]*50, stype='date32'))
def melt(data, id_vars=None, measure_vars=None, variable_name = 'variable', value_name = 'value'):
    "Turns Frame from wide to long form."
    if id_vars:
        if not isinstance(id_vars, (str, list, tuple)):
            raise TypeError('id_vars should be one of str, list, tuple.')
        if isinstance(id_vars, str):
            id_vars = [id_vars]
        checks = set(id_vars).difference(data.names)
        if checks:
            raise ValueError(f'Labels {checks} in id_vars do not exist in the column names.')
        if not set(data.names).difference(id_vars):
            return data
        checks = [key for key,value in Counter(id_vars).items() if value > 1]
        if checks:
            raise ValueError(f"Labels {checks} are duplicated in id_vars.")
        if not measure_vars:
            measure_vars = [name for name in data.names if name not in id_vars]
    if measure_vars:
        if not isinstance(measure_vars, (str, list, tuple)):
            raise TypeError('measure_vars should be one of str, list, tuple.')
        if isinstance(measure_vars, str):
            measure_vars = [measure_vars]
        checks = set(measure_vars).difference(data.names)
        if checks:
            raise ValueError(f'Labels {checks} in measure_vars do not exist in the column names.')
        checks = [key for key,value in Counter(measure_vars).items() if value > 1]
        if checks:
            raise ValueError(f"Labels {checks} are duplicated in measure_vars.")
        if (not id_vars) and (len(measure_vars) < data.ncols):
            id_vars = [name for name in data.names if name not in measure_vars]
    else:
        measure_vars = data.names

    def reshape_no_dot(measure_vars, output, data, id_vars=None):
        values = []
        for frame in data[:, measure_vars]:
            frame.names = [value_name]
            values.append(frame)
        values = dt.rbind(values, force=True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], len(measure_vars))
            return dt.cbind([id_vars, output, values], force = True)
        return dt.cbind([output, values], force = True)

    def reshape_dot(column_names, data, measure_vars, output, id_vars=None):
        "reshape if '.value' is present in the column names."
        boolean = [True if ent == '.value' else False for ent in column_names]
        dot_value = [[*compress(extract, boolean)] for extract in output]
        if len(dot_value[0]) > 1:
            dot_value = ["".join(extract) for extract in dot_value]
        else:
            dot_value = [*chain.from_iterable(dot_value)]
        checks = set(dot_value)
        if id_vars and checks.intersection(id_vars):
            raise ValueError(
                f"The new column names associated with .value -> {checks} "
                "are duplicated in id_vars."
            )
        boolean = [not true for true in boolean]
        others = [tuple(compress(extract, boolean)) for extract in output]
        headers_for_others = [extract for extract in column_names if extract != '.value']
        measure_vars = [frame for frame in data[:, measure_vars]]
        out = defaultdict(list)
        for key, value_column, frame in zip(others, dot_value, measure_vars):
            frame.names = [value_column]
            out[key].append(frame)
        headers_for_others = [dt.Frame([key], names = headers_for_others) for key, _ in out.items()]
        out = [dt.cbind(frame, force = True) for _, frame in out.items()]
        out = [dt.cbind(dt.repeat(left, right.nrows), right, force = True) for left, right in zip(headers_for_others, out)]
        out = dt.rbind(out, force = True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], out.nrows//data.nrows)
            return dt.cbind([id_vars, out], force = True)
        return out

    if not isinstance(variable_name, (str, tuple, dict, Pattern)):
        raise TypeError('variable_name should be one of string, tuple, dictionary, regular expression.')

    if isinstance(variable_name, str):
        if not isinstance(value_name, str):
            raise TypeError('value_name should be a string.')
        if value_name == variable_name:
            raise ValueError(
                f"{value_name} is duplicated as variable_name. "
                f"Kindly provide a unique argument for {value_name}.")
        if id_vars: 
            if variable_name in id_vars:
                raise ValueError(
                    f"{variable_name} already exists as a label "
                    "in id_vars. Kindly provide a unique argument.")
            if value_name in id_vars:
                raise ValueError(
                    f"{value_name} already exists as a label "
                    "in id_vars. Kindly provide a unique argument.")

        output = dt.Frame({variable_name:measure_vars})
        output = output[np.repeat(range(output.nrows), data.nrows),:]
        return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)


    if isinstance(variable_name, Pattern):
        if not re.compile(variable_name).groups:
            raise ValueError("The regex should have at least one group.")
        output = [re.search(variable_name, word) for word in measure_vars]
        no_matches = [word for word, match in zip(measure_vars, output) if not match]
        if no_matches:
            raise ValueError(
                f"There was no match for labels {no_matches} "
                "for the provided regular expression.")
        output = [entry.groupdict() for entry in output]
        checks = output[0].keys()
        if id_vars and set(checks).intersection(id_vars):
            raise ValueError(
                f"Labels {checks} already exist in id_vars. "
                "Kindly provide unique names for the named groups " 
                "in the regular expression."
                )
        output = dt.Frame(output)
        output = output[np.repeat(range(output.nrows), data.nrows),:]        
        return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)

    if isinstance(variable_name, dict) :
        checks = set(variable_name).intersection(id_vars)
        if id_vars and checks:
            raise ValueError(
                f"Labels {checks} already exist in id_vars. "
                "Kindly provide keys for the dictionary "
                "that do not exist in id_vars."
                )
        for key, regex in variable_name.items():
            if not isinstance(key, str):
                raise TypeError(f"{key} should be a string.")
            if not isinstance(regex, (str, Pattern)):
                raise TypeError(
                    f"The value for {key} should be a regular expression, "
                    "or can be compiled into one."
                    )
            if re.compile(regex).groups:
                raise ValueError("The regex should not have any groups.")
        output = []
        for key, regex in variable_name.items():
            out = [word for word in measure_vars if re.search(regex, word)]
            if not out:
                raise ValueError(
                    f"There was no match for {key} for regex => {regex}"
                )            
            
            measure_vars = [word for word in measure_vars if word not in out]
            if len(out) == 1:
                frame.names = [key]
                output.append(frame)
            else:
                values = []
                for frame in data[:, out]:
                    frame.names = [key]
                    values.append(frame)
                output.append(dt.rbind(values, force = True))
        output = dt.cbind(output, force=True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], output.nrows//data.nrows)
            return dt.cbind([id_vars, output])
        return output
          
    if isinstance(variable_name, tuple):
        variable_name = measure(*variable_name)
        column_names, sep, pattern = variable_name
        if not column_names:
            raise ValueError("Kindly provide argument for column_names, in the variable_name tuple.")
        if not isinstance(column_names, (str, list)):
            raise TypeError('column_names should be one of string, list.')
        if isinstance(column_names, str):
            column_names = [column_names]
        if id_vars:
            checks = set(column_names)
            checks.discard(".value")
            checks = checks.intersection(id_vars)
            if checks:
                raise ValueError(
                    f"Labels {checks} already exist in id_vars. "
                    "Kindly provide unique column_names "
                    "that do not exist in id_vars."
                    )
        if not any((sep, pattern)):
            raise ValueError("Kindly provide one of sep or pattern.")
        if sep and pattern:
            raise ValueError("only one of sep or pattern should be provided.")
        if sep:
            if not isinstance(sep, (str, Pattern)):
                raise TypeError(
                    "sep should be a regular expression, "
                    "or can be compiled into one.")
            output = [re.split(sep, word) for word in measure_vars]
            checks = max(map(len, output))
            if len(column_names) != checks:
                raise ValueError(
                    f"The maximum number of splits for sep -> {sep} is {checks} "
                    f"while the number of labels in {column_names} "
                    f"is {len(column_names)}"
                )
            if '.value' not in column_names:
                output = [*map(tuple, output)]
                output = dt.Frame(output, names=column_names)
                output = output[np.repeat(range(output.nrows), data.nrows),:]        
                return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)

            return reshape_dot(column_names, data, measure_vars, output, id_vars=id_vars)

        if pattern:
            if not isinstance(pattern, (str, Pattern)):
                raise TypeError(
                    "pattern should be a regular expression, "
                    "or can be compiled into one.")
            checks = re.compile(pattern).groups
            if not checks:
                raise ValueError("The regex should have at least one group.")
            if checks != len(column_names):
                raise ValueError(
                    "The number of groups in the regex "
                    "should match the number of labels in column_names. "
                    f"The number of groups in the regex is {len(checks)}, "
                    f"while the length of column_names is {len(column_names)}")
            output = [re.findall(pattern, word) for word in measure_vars]
            no_matches = [word for word, match in zip(measure_vars, output) if not match]
            if no_matches:
                raise ValueError(
                    f"There was no match for labels {no_matches} "
                    "for the provided regular expression.")
            output = [*chain.from_iterable(output)]
            if '.value' not in column_names:
                output = [*map(tuple, output)]
                output = dt.Frame(output, names=column_names)
                output = output[np.repeat(range(output.nrows), data.nrows),:]        
                return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)

            return reshape_dot(column_names, data, measure_vars, output, id_vars=id_vars)