Exemple #1
0
def test_rbind2():
    """Row-bind an integer-built time64 frame with itself and compare
    against a frame built from the doubled values."""
    values = [5, 7, 9]
    frame = dt.Frame(values)
    # Assigning a type to a column converts that column to the given type.
    frame[0] = dt.Type.time64
    combined = dt.rbind(frame, frame)
    expected = dt.Frame(values + values)
    expected[0] = dt.Type.time64
    assert_equals(combined, expected)
 def reshape_dot(column_names, data, measure_vars, output, id_vars=None):
     """
     Reshape when '.value' is present in the column names.

     Each entry of `output` holds the pieces of one measured column name,
     positionally aligned with `column_names`. Pieces under '.value' become
     the name of the resulting value column; the remaining pieces become the
     values of new header columns.

     :param column_names: labels for the pieces, including '.value'
     :param data: the Frame being reshaped
     :param measure_vars: columns of `data` to reshape
     :param output: one sequence of name pieces per measured column
     :param id_vars: optional identifier columns repeated alongside the result
     :raises ValueError: if a name derived from '.value' is also in `id_vars`
     :return: the reshaped Frame
     """
     is_dot_value = [label == '.value' for label in column_names]
     value_names = [[*compress(pieces, is_dot_value)] for pieces in output]
     if len(value_names[0]) > 1:
         # several '.value' slots: glue their pieces into a single name
         value_names = ["".join(pieces) for pieces in value_names]
     else:
         value_names = list(chain.from_iterable(value_names))
     unique_value_names = set(value_names)
     if id_vars and unique_value_names.intersection(id_vars):
         raise ValueError(
             f"The new column names associated with .value -> {unique_value_names} "
             "are duplicated in id_vars."
         )
     is_other = [not flag for flag in is_dot_value]
     other_keys = [tuple(compress(pieces, is_other)) for pieces in output]
     other_labels = [label for label in column_names if label != '.value']
     columns = [column for column in data[:, measure_vars]]
     # group the measured columns by their non-'.value' pieces
     grouped = defaultdict(list)
     for key, new_name, column in zip(other_keys, value_names, columns):
         column.names = [new_name]
         grouped[key].append(column)
     header_frames = [dt.Frame([key], names=other_labels) for key in grouped]
     value_frames = [dt.cbind(members, force=True) for members in grouped.values()]
     # stretch each one-row header to the height of its value block
     blocks = [
         dt.cbind(dt.repeat(header, values.nrows), values, force=True)
         for header, values in zip(header_frames, value_frames)
     ]
     result = dt.rbind(blocks, force=True)
     if not id_vars:
         return result
     repeated_ids = dt.repeat(data[:, id_vars], result.nrows // data.nrows)
     return dt.cbind([repeated_ids, result], force=True)
 def reshape_no_dot(measure_vars, output, data, id_vars=None):
     """
     Stack the measured columns of `data` into one value column and bind it
     next to `output` (and the repeated `id_vars`, when given).

     :param measure_vars: columns of `data` to stack
     :param output: Frame placed to the left of the stacked values
     :param data: the Frame being reshaped
     :param id_vars: optional identifier columns, repeated once per measured column
     :return: the column-bound Frame
     """
     # NOTE(review): `value_name` is not a parameter; it is presumably taken
     # from an enclosing scope - confirm against the surrounding code.
     stacked = []
     for column in data[:, measure_vars]:
         column.names = [value_name]
         stacked.append(column)
     stacked = dt.rbind(stacked, force=True)
     pieces = [output, stacked]
     if id_vars:
         pieces.insert(0, dt.repeat(data[:, id_vars], len(measure_vars)))
     return dt.cbind(pieces, force=True)
def build_target_table(chembl_df, drugbank_df, output_dir):
    """
    Build the target table from the 'name' columns of the ChEMBL and
    DrugBank drug target tables.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param output_dir: [`string`] The file path to write the final target table
    @return: [`dt.Frame`] The target table
    """
    # Stack the names from both sources, then drop repeats via pandas
    stacked_names = dt.rbind([chembl_df['name'], drugbank_df['name']])
    unique_names = stacked_names.to_pandas().drop_duplicates()
    # Hand the de-duplicated names to the project helpers for writing/keying
    written = write_table(dt.Frame(unique_names), 'target', output_dir)
    return rename_and_key(written, 'target_id')
Exemple #5
0
def load_table(name, data_dir):
    """
    Load all PSet tables with name into a datatable, dropping any duplicate
    rows.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables
    @return: [`datatable.Frame`] A datatable containing all rows from all PSets
    """
    logger.info(f"Loading PSet-specific {name} tables from {data_dir}...")
    # Get all files
    files = glob.glob(os.path.join(data_dir, "**", f"*{name}.jay"))
    # Filter so that file path are "{data_dir}/{pset}/{pset}_{name}.jay".
    # `data_dir` and `name` are escaped so that regex metacharacters in a
    # path (e.g. "+", "(", ".") are matched literally instead of being
    # interpreted as part of the pattern.
    pset_file = re.compile(
        re.escape(data_dir) + r"/(\w+)/\1_" + re.escape(name) + r"\.jay$")
    files = [file_name for file_name in files if pset_file.search(file_name)]
    # Read and concatenate tables
    df = dt.rbind(*dt.iread(files))
    # Drop duplicates (groups by all columns and
    # selects only the first row from each group)
    df = df[0, :, by(df.names)]
    return df
Exemple #6
0
def fread_table_for_all_psets(table_name: str,
                              data_dir: str,
                              column_dict: dict,
                              rename_dict: dict = None,
                              key_columns: list = None) -> dt.Frame:
    """
    Reads all tables named `table_name` from `data_dir`, passing `column_dict`
    to `datatable.iread` as its `columns` argument. The resulting table
    iterator is concatenated using `datatable.rbind`, exact duplicate rows
    are dropped, and the columns are renamed according to `rename_dict`.

    :param table_name: name of the table; files are expected at
        "{data_dir}/{pset}/{pset}_{table_name}.jay"
    :param data_dir: directory containing one sub-directory per PSet
    :param column_dict: column specification forwarded to `datatable.iread`
    :param rename_dict: optional value assigned to `df.names` after loading
    :param key_columns: optional columns to group by; only the first row of
        each group is kept (applied after renaming)
    :return: the combined, de-duplicated table
    """
    logger.info(
        f"Loading PSet-specific {table_name} tables from {data_dir}...")
    # Get all files
    files = glob.glob(os.path.join(data_dir, "**", f"*{table_name}.jay"))
    # Filter so that file path are "{data_dir}/{pset}/{pset}_{name}.jay".
    # `data_dir` and `table_name` are escaped so regex metacharacters in a
    # path are matched literally rather than interpreted as pattern syntax.
    pset_file = re.compile(
        re.escape(data_dir) + r"/(\w+)/\1_" + re.escape(table_name)
        + r"\.jay$")
    files = [file_name for file_name in files if pset_file.search(file_name)]
    # Read and concatenate tables
    df = dt.rbind(*dt.iread(files, columns=column_dict), force=True)
    # Drop duplicates (groups by all columns and
    # selects only the first row from each group)
    df = df[0, :, by(df.names)]
    if rename_dict is not None:
        df.names = rename_dict
    if key_columns is not None:
        # Only by()/sort()/join() may follow `i, j` in a Frame selector; the
        # previous extra ":" in that position was not a valid item.
        df = df[0, :, by(key_columns)]
    return df
Exemple #7
0
def test_rbind():
    """Row-bind a single-value time64 frame with itself and compare against
    a frame built from the doubled source list."""
    # NOTE(review): `d` is presumably a datetime constructor alias - confirm.
    timestamps = [d(2030, 12, 1, 13, 43, 17)]
    frame = dt.Frame(timestamps)
    assert frame.type == dt.Type.time64
    combined = dt.rbind(frame, frame)
    assert_equals(combined, dt.Frame(timestamps + timestamps))
Exemple #8
0
def test_rbind2():
    """Row-bind a frame created directly as time64 with itself and compare
    against a time64 frame built from the doubled values."""
    values = [5, 7, 9]
    frame = dt.Frame(values, type=dt.Type.time64)
    combined = dt.rbind(frame, frame)
    expected = dt.Frame(values + values, type=dt.Type.time64)
    assert_equals(combined, expected)
    ['VS' + d + '_original' for d in release['samples_id'].to_list()[0]])

# ~ 1a ~
# Validate ERNs
# Make sure all ERN values are correct. If there are any unknown ERN values,
# add them to the mappings (defined in step 0) and rerun the script up to this
# section. Repeat the process until all ERN name variations have been corrected.
# There shouldn't be any new ERNs, only name variations.

# Find ERN name variations that do not exist in RD3. If the following code
# throws any error, add the name variation to the object `ernMappings` defined
# in step 0b. Repeat until no more mapping errors are thrown. Once everything
# is mapped, proceed to the next step.
# Dry run: recode every distinct ERN value found in either column. The
# resulting frame is discarded; the call is made only so that recodeValue
# can raise on values missing from `ernMappings`.
dt.Frame([
    recodeValue(mappings=ernMappings, value=ernValue, label="ERN")
    for ernValue in dt.unique(
        dt.rbind(
            release['samples_ERN'],
            release['subject_ERN'],
            force=True
        )
    ).to_list()[0]
])

# Replace known ERN name variations in the samples column
release['samples_ERN'] = dt.Frame([
    recodeValue(mappings=ernMappings, value=sampleErn, label='ERN')
    for sampleErn in release['samples_ERN'].to_list()[0]
])

# Replace known ERN name variations in the subject column
release['subject_ERN'] = dt.Frame([
    recodeValue(mappings=ernMappings, value=subjectErn, label='ERN')
    for subjectErn in release['subject_ERN'].to_list()[0]
])

# combine both ERNs
# rawErnData = dt.unique(
    statusMsg('Pulling subjects from', novelOmicsReleases[release])
    tmpSubjectData = rd3.get(entity=f'{novelOmicsReleases[release]}_subject',
                             attributes='id,subjectID,patch',
                             batch_size=10000)
    for row in tmpSubjectData:
        if 'patch' in row:
            row['patch'] = ','.join([patch['id'] for patch in row['patch']])

    tmpSubjectData = dt.Frame(tmpSubjectData)[:, {
        'id': f.id,
        'subjectID': f.subjectID,
        'patch': f.patch,
        'release': release
    }]

    existingSubjects = dt.rbind(existingSubjects, tmpSubjectData)

# get existing sample metadata
existingSamples = dt.Frame()
for release in novelOmicsReleases:
    statusMsg('Pulling samples from', novelOmicsReleases[release])
    tmpSampleData = rd3.get(entity=f'{novelOmicsReleases[release]}_sample',
                            attributes='id,sampleID,patch',
                            batch_size=10000)

    for row in tmpSampleData:
        if 'patch' in row:
            row['patch'] = ','.join([patch['id'] for patch in row['patch']])

    tmpSampleData = dt.Frame(tmpSampleData)[:, {
        'id': f.id,
        
        # spread data by subjectID and release; the previous step collapses
        # multiple samples for a release, so we can drop duplicate values here
        subjectSamplesSummarized=dt.Frame(
            tmpSamplesBySubjects
            .to_pandas()
            .drop_duplicates(subset=['subjectID','release'],keep='first')
            .pivot(index='subjectID', columns='release', values='idsCollapsed')
            .reset_index()
        )
        
        # bind to parent object
        subjectSamplesSummarized['numberOfSamples']=tmpSamplesBySubjects.nrows
        samplesSummarized=dt.rbind(
            samplesSummarized,
            subjectSamplesSummarized,
            force=True
        )
        
        # store processed ids
        processedSubjectIDs.append(id)

del subjectSamplesSummarized
del tmpSamplesBySubjects
del processedSubjectIDs
del sampleSubjectIDs

# rename columns
samplesSummarized.names={
    'freeze1': 'df1Samples',
    'freeze2': 'df2Samples',
def melt(data, id_vars=None, measure_vars=None, variable_name='variable', value_name='value'):
    """
    Turns Frame from wide to long form.

    :param data: the Frame to reshape.
    :param id_vars: column label(s) (str, list or tuple) kept as identifiers
        and repeated alongside the reshaped values. When only `measure_vars`
        is given, every other column becomes an id column.
    :param measure_vars: column label(s) (str, list or tuple) to reshape.
        Defaults to every column not listed in `id_vars`.
    :param variable_name: controls how the measured column labels are turned
        into new columns:

        * str - name of a single column holding the measured column labels;
        * compiled regex - searched against each measured label; its named
          groups become the new columns;
        * dict - maps a new column name to a regex (without groups); all
          measured columns matching the regex are stacked into that column;
        * tuple - unpacked into `measure(*variable_name)`, which must yield
          `(column_names, sep, pattern)`; labels are split on `sep` or
          matched against `pattern`, and a '.value' entry in `column_names`
          marks the piece used as the output value column name.
          NOTE(review): `measure` is defined outside this function.
    :param value_name: name of the value column; must be a str when
        `variable_name` is a str.
    :raises TypeError: if an argument has an unsupported type.
    :raises ValueError: if labels are missing, duplicated, clash with
        `id_vars`, or a regex/separator does not fit the measured labels.
    :return: the reshaped Frame (`data` itself when every column is an id
        column).
    """
    if id_vars:
        if not isinstance(id_vars, (str, list, tuple)):
            raise TypeError('id_vars should be one of str, list, tuple.')
        if isinstance(id_vars, str):
            id_vars = [id_vars]
        checks = set(id_vars).difference(data.names)
        if checks:
            raise ValueError(f'Labels {checks} in id_vars do not exist in the column names.')
        # nothing left to melt when every column is an identifier
        if not set(data.names).difference(id_vars):
            return data
        checks = [key for key, value in Counter(id_vars).items() if value > 1]
        if checks:
            raise ValueError(f"Labels {checks} are duplicated in id_vars.")
        if not measure_vars:
            measure_vars = [name for name in data.names if name not in id_vars]
    if measure_vars:
        if not isinstance(measure_vars, (str, list, tuple)):
            raise TypeError('measure_vars should be one of str, list, tuple.')
        if isinstance(measure_vars, str):
            measure_vars = [measure_vars]
        checks = set(measure_vars).difference(data.names)
        if checks:
            raise ValueError(f'Labels {checks} in measure_vars do not exist in the column names.')
        checks = [key for key, value in Counter(measure_vars).items() if value > 1]
        if checks:
            raise ValueError(f"Labels {checks} are duplicated in measure_vars.")
        if (not id_vars) and (len(measure_vars) < data.ncols):
            id_vars = [name for name in data.names if name not in measure_vars]
    else:
        measure_vars = data.names

    def reshape_no_dot(measure_vars, output, data, id_vars=None):
        "stack the measured columns under value_name and bind them to output."
        values = []
        for frame in data[:, measure_vars]:
            frame.names = [value_name]
            values.append(frame)
        values = dt.rbind(values, force=True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], len(measure_vars))
            return dt.cbind([id_vars, output, values], force=True)
        return dt.cbind([output, values], force=True)

    def reshape_dot(column_names, data, measure_vars, output, id_vars=None):
        "reshape if '.value' is present in the column names."
        boolean = [ent == '.value' for ent in column_names]
        dot_value = [[*compress(extract, boolean)] for extract in output]
        if len(dot_value[0]) > 1:
            # several '.value' slots: join their pieces into a single name
            dot_value = ["".join(extract) for extract in dot_value]
        else:
            dot_value = [*chain.from_iterable(dot_value)]
        checks = set(dot_value)
        if id_vars and checks.intersection(id_vars):
            raise ValueError(
                f"The new column names associated with .value -> {checks} "
                "are duplicated in id_vars."
            )
        boolean = [not true for true in boolean]
        others = [tuple(compress(extract, boolean)) for extract in output]
        headers_for_others = [extract for extract in column_names if extract != '.value']
        measure_vars = [frame for frame in data[:, measure_vars]]
        # group the measured columns by their non-'.value' pieces
        out = defaultdict(list)
        for key, value_column, frame in zip(others, dot_value, measure_vars):
            frame.names = [value_column]
            out[key].append(frame)
        headers_for_others = [dt.Frame([key], names=headers_for_others) for key in out]
        out = [dt.cbind(frame, force=True) for frame in out.values()]
        # stretch each one-row header to the height of its value block
        out = [dt.cbind(dt.repeat(left, right.nrows), right, force=True)
               for left, right in zip(headers_for_others, out)]
        out = dt.rbind(out, force=True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], out.nrows//data.nrows)
            return dt.cbind([id_vars, out], force=True)
        return out

    if not isinstance(variable_name, (str, tuple, dict, Pattern)):
        raise TypeError('variable_name should be one of string, tuple, dictionary, regular expression.')

    if isinstance(variable_name, str):
        if not isinstance(value_name, str):
            raise TypeError('value_name should be a string.')
        if value_name == variable_name:
            raise ValueError(
                f"{value_name} is duplicated as variable_name. "
                f"Kindly provide a unique argument for {value_name}.")
        if id_vars:
            if variable_name in id_vars:
                raise ValueError(
                    f"{variable_name} already exists as a label "
                    "in id_vars. Kindly provide a unique argument.")
            if value_name in id_vars:
                raise ValueError(
                    f"{value_name} already exists as a label "
                    "in id_vars. Kindly provide a unique argument.")

        output = dt.Frame({variable_name: measure_vars})
        # repeat each label row once per row of data
        output = output[np.repeat(range(output.nrows), data.nrows), :]
        return reshape_no_dot(measure_vars=measure_vars, output=output, data=data, id_vars=id_vars)

    if isinstance(variable_name, Pattern):
        if not re.compile(variable_name).groups:
            raise ValueError("The regex should have at least one group.")
        output = [re.search(variable_name, word) for word in measure_vars]
        no_matches = [word for word, match in zip(measure_vars, output) if not match]
        if no_matches:
            raise ValueError(
                f"There was no match for labels {no_matches} "
                "for the provided regular expression.")
        output = [entry.groupdict() for entry in output]
        checks = output[0].keys()
        if id_vars and set(checks).intersection(id_vars):
            raise ValueError(
                f"Labels {checks} already exist in id_vars. "
                "Kindly provide unique names for the named groups "
                "in the regular expression."
                )
        output = dt.Frame(output)
        output = output[np.repeat(range(output.nrows), data.nrows), :]
        return reshape_no_dot(measure_vars=measure_vars, output=output, data=data, id_vars=id_vars)

    if isinstance(variable_name, dict):
        # Only compare against id_vars when there are any; intersecting with
        # None would raise a TypeError before the check could run.
        if id_vars:
            checks = set(variable_name).intersection(id_vars)
            if checks:
                raise ValueError(
                    f"Labels {checks} already exist in id_vars. "
                    "Kindly provide keys for the dictionary "
                    "that do not exist in id_vars."
                    )
        for key, regex in variable_name.items():
            if not isinstance(key, str):
                raise TypeError(f"{key} should be a string.")
            if not isinstance(regex, (str, Pattern)):
                raise TypeError(
                    f"The value for {key} should be a regular expression, "
                    "or can be compiled into one."
                    )
            if re.compile(regex).groups:
                raise ValueError("The regex should not have any groups.")
        output = []
        for key, regex in variable_name.items():
            out = [word for word in measure_vars if re.search(regex, word)]
            if not out:
                raise ValueError(
                    f"There was no match for {key} for regex => {regex}"
                )
            # a column claimed by one key is not offered to later keys
            measure_vars = [word for word in measure_vars if word not in out]
            # One code path for one or many matches: rename every matched
            # column to `key` and stack them. (The former single-match
            # shortcut referenced `frame` before it was assigned.)
            values = []
            for frame in data[:, out]:
                frame.names = [key]
                values.append(frame)
            output.append(dt.rbind(values, force=True))
        output = dt.cbind(output, force=True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], output.nrows//data.nrows)
            return dt.cbind([id_vars, output])
        return output

    if isinstance(variable_name, tuple):
        variable_name = measure(*variable_name)
        column_names, sep, pattern = variable_name
        if not column_names:
            raise ValueError("Kindly provide argument for column_names, in the variable_name tuple.")
        if not isinstance(column_names, (str, list)):
            raise TypeError('column_names should be one of string, list.')
        if isinstance(column_names, str):
            column_names = [column_names]
        if id_vars:
            checks = set(column_names)
            checks.discard(".value")
            checks = checks.intersection(id_vars)
            if checks:
                raise ValueError(
                    f"Labels {checks} already exist in id_vars. "
                    "Kindly provide unique column_names "
                    "that do not exist in id_vars."
                    )
        if not any((sep, pattern)):
            raise ValueError("Kindly provide one of sep or pattern.")
        if sep and pattern:
            raise ValueError("only one of sep or pattern should be provided.")
        if sep:
            if not isinstance(sep, (str, Pattern)):
                raise TypeError(
                    "sep should be a regular expression, "
                    "or can be compiled into one.")
            output = [re.split(sep, word) for word in measure_vars]
            checks = max(map(len, output))
            if len(column_names) != checks:
                raise ValueError(
                    f"The maximum number of splits for sep -> {sep} is {checks} "
                    f"while the number of labels in {column_names} "
                    f"is {len(column_names)}"
                )
            if '.value' not in column_names:
                output = [*map(tuple, output)]
                output = dt.Frame(output, names=column_names)
                output = output[np.repeat(range(output.nrows), data.nrows), :]
                return reshape_no_dot(measure_vars=measure_vars, output=output, data=data, id_vars=id_vars)

            return reshape_dot(column_names, data, measure_vars, output, id_vars=id_vars)

        if pattern:
            if not isinstance(pattern, (str, Pattern)):
                raise TypeError(
                    "pattern should be a regular expression, "
                    "or can be compiled into one.")
            checks = re.compile(pattern).groups
            if not checks:
                raise ValueError("The regex should have at least one group.")
            if checks != len(column_names):
                # `checks` is already the group count (an int); calling len()
                # on it would raise TypeError instead of this ValueError.
                raise ValueError(
                    "The number of groups in the regex "
                    "should match the number of labels in column_names. "
                    f"The number of groups in the regex is {checks}, "
                    f"while the length of column_names is {len(column_names)}")
            output = [re.findall(pattern, word) for word in measure_vars]
            no_matches = [word for word, match in zip(measure_vars, output) if not match]
            if no_matches:
                raise ValueError(
                    f"There was no match for labels {no_matches} "
                    "for the provided regular expression.")
            output = [*chain.from_iterable(output)]
            if '.value' not in column_names:
                output = [*map(tuple, output)]
                output = dt.Frame(output, names=column_names)
                output = output[np.repeat(range(output.nrows), data.nrows), :]
                return reshape_no_dot(measure_vars=measure_vars, output=output, data=data, id_vars=id_vars)

            return reshape_dot(column_names, data, measure_vars, output, id_vars=id_vars)