Example 1
import os
import shutil

import pandas as pd


def copy_attributes(mapfile, newdir, filenames, key_lstrip=None, key_rstrip=None):

    # start from a clean output directory
    if os.path.exists(newdir):
        shutil.rmtree(newdir)
    os.makedirs(newdir)

    # derive a dataset key from each filename so the files can be joined to the metadata
    metadata = pd.read_csv(mapfile, sep='\t')
    filenames = pd.DataFrame(filenames, columns=['filename'])
    filenames['dataset_key'] = filenames['filename'].apply(lambda x: strip_key(x, key_lstrip, key_rstrip))

    joined = pd.merge(metadata, filenames, on='dataset_key', how='inner')

    # the join should be one-to-one: every metadata row matches exactly one file
    assert len(joined) == len(metadata) == len(filenames)

    for index, row in joined.iterrows():
        dataset_id = row['id']
        filename = row['filename']

        # copy each file into the new directory, renamed to its metadata id
        # (os.symlink(filename, link_name) would link instead of copying)
        link_name = os.path.join(newdir, '%s.txt' % dataset_id)
        shutil.copy(filename, link_name)
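
This example (and the next) calls a strip_key helper that is not defined on this page. A minimal sketch of one plausible implementation, assuming it derives the dataset key by trimming an optional prefix and suffix from the filename's basename (the real helper may behave differently):

import os


def strip_key(filename, key_lstrip=None, key_rstrip=None):
    # hypothetical reconstruction: drop the directory part, then an optional
    # leading and trailing fragment, leaving the dataset key
    key = os.path.basename(filename)
    if key_lstrip and key.startswith(key_lstrip):
        key = key[len(key_lstrip):]
    if key_rstrip and key.endswith(key_rstrip):
        key = key[:-len(key_rstrip)]
    return key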
Example 2
import numpy as np
import pandas as pd


def main(edit_file, metadata_in, metadata_out, key_rstrip=None):

    # load the table of edits
    edits = pd.read_csv(edit_file, sep='\t', names=['filename', 'var', 'value'], encoding='UTF8')

    # load metadata
    metadata = pd.read_csv(metadata_in, sep='\t', encoding='UTF8', dtype=str)

    # metadata has the vars as columns; if any of the vars we are editing don't
    # already exist, add them as new columns initialized to missing

    edit_vars = pd.Series(edits['var'].unique())
    new_vars = edit_vars[~edit_vars.isin(metadata.columns)]

    for new_var in new_vars:
        metadata[new_var] = np.nan

    # cfg file name should match the dataset key in the metadata file
    edits['dataset_key'] = edits['filename'].apply(lambda x: strip_key(x, None, key_rstrip))

    # log all edit files we don't have in the metadata table
    missing_files = edits[~edits['dataset_key'].isin(metadata['dataset_key'])]['dataset_key']
    if len(missing_files) > 0:
        print("metadata edits for these missing files will be skipped: ",
              ' '.join(list(missing_files)))

    # apply the remaining edits. This could probably be vectorized into some kind
    # of join, but it's not much data, and a loop is simpler to reason about.
    edits = edits[edits['dataset_key'].isin(metadata['dataset_key'])]

    for index, row in edits.iterrows():
        dataset_key, var, value = row['dataset_key'], row['var'], row['value']
        metadata.loc[metadata['dataset_key'] == dataset_key, var] = value

    # write the edited metadata to a new file
    metadata.to_csv(metadata_out, sep='\t', header=True, index=True, encoding='UTF8')
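
The edit file is read as a headerless tab-separated table of (filename, var, value) rows. A hypothetical invocation, assuming config-style filenames and the strip_key sketch above (the file names and values here are illustrative, not taken from the original project):

# edits.tsv, tab-separated, no header row:
#   sample1.cfg    species    mouse
#   sample2.cfg    tissue     liver

main('edits.tsv', 'metadata_in.tsv', 'metadata_out.tsv', key_rstrip='.cfg')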