def test_trim_sheet_fail(sample_spreadsheet, trimmed_sample_spreadsheet, onset, offset):
    # Trimming only a subset of columns should not match the fully trimmed reference sheet.
    columns = ["MomSpeech", "MomObject"]
    sheet = pv.load_opf(sample_spreadsheet)
    sheet = pv.trim_sheet(onset, offset, sheet, True, False, *columns)
    sheet_trimmed = pv.load_opf(trimmed_sample_spreadsheet)
    assert sheet != sheet_trimmed
def test_df_to_csv(sample_spreadsheet):
    sheet = pv.load_opf(sample_spreadsheet)
    df = sheet.to_df()
    # df.to_csv('./DatavyuSampleSpreadsheet.csv')
    df = sheet.to_df("MomSpeech")
    log.info(df)
def test_json(sample_spreadsheet):
    sheet = pv.load_opf(sample_spreadsheet)
    pv.save_json(sheet, "test.json")
    json_sheet = pv.load_json("test.json")
    momspeech = json_sheet.get_column("MomSpeech")
    assert len(momspeech.sorted_cells()) == 20
    o_momspeech = sheet.get_column("MomSpeech")
    for oc, c in zip(o_momspeech.cells, momspeech.cells):
        assert all([x == y for x, y in zip(oc.get_values(), c.get_values())])
    # Cleanup
    os.remove("test.json")
def test_spreadsheet_to_df(sample_spreadsheet):
    sheet = pv.load_opf(sample_spreadsheet)
    df = sheet.to_df()
    pd.set_option("display.max_columns", None)
    log.info(df)
    assert 156 == len(df)
    ms = sheet.to_df("MomSpeech")
    log.info(ms)
    assert 20 == len(ms)
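# A possible follow-on check (a sketch, not from the source): if the exported
# frame carries per-cell onset/offset columns in milliseconds, durations can be
# derived directly in pandas. The "onset"/"offset" column names below are
# assumptions about pyvyu's DataFrame layout, not confirmed API.
#
# def test_momspeech_durations(sample_spreadsheet):
#     sheet = pv.load_opf(sample_spreadsheet)
#     ms = sheet.to_df("MomSpeech")
#     assert (ms["offset"] >= ms["onset"]).all()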
def parseAndTrimOpf(opf_path, columns_list, onset, offset):
    """
    Parse and trim an OPF file according to an onset and offset passed via
    arguments or found in the ingest JSON file.
    """
    logger.info("Trim opf %s from %d to %d", opf_path, onset, offset)
    opf_file_orig = os.path.realpath(opf_path)
    opf_path_cut = os.path.splitext(opf_file_orig)[0] + '_cut.opf'
    copyfile(opf_file_orig, opf_path_cut)

    sheet = pv.load_opf(opf_path_cut)
    if len(sheet.get_column_list()) < 1:
        logger.error('OPF file is empty')
        return None

    # Keep only the requested columns, if a subset was given.
    if columns_list is not None:
        sheet.columns = {
            colname: col
            for (colname, col) in sheet.columns.items()
            if colname in columns_list
        }

    # Drop cells outside the [onset, offset] window; keep cells fully inside
    # it or straddling exactly one of its edges. Columns listed in the
    # module-level _exception collection are skipped.
    _columns_to_trim = [col for col in sheet.columns if col not in _exception]
    for col in [sheet.columns[c] for c in _columns_to_trim]:
        col.cells = [
            cell for cell in col.cells
            if (cell.onset >= onset and cell.offset <= offset)
            or (cell.onset < onset < cell.offset <= offset)
            or (onset < cell.onset < offset < cell.offset)
        ]

    # Re-reference the remaining cells to the new zero point, clamping to the window.
    for colname, col in sheet.columns.items():
        for cell in col.cells:
            cell.onset = max(max(cell.onset, onset) - onset, 0)
            cell.offset = max(min(cell.offset, offset) - onset, 0)

    # Discard columns left with no cells after trimming.
    sheet.columns = {
        colname: col
        for (colname, col) in sheet.columns.items()
        if len(col.cells) > 0
    }

    pv.save_opf(sheet, opf_path_cut, *sheet.columns.keys())
    return opf_path_cut
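# A minimal usage sketch for parseAndTrimOpf (the file name and millisecond
# window are hypothetical). The function reads the module-level globals
# `logger` and `_exception` (column names exempt from trimming), so both are
# set up here; adjust to taste.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    _exception = {"id"}  # assumed: columns to leave untrimmed

    trimmed = parseAndTrimOpf("session.opf", ["MomSpeech", "MomObject"], 5000, 60000)
    if trimmed is not None:
        print("Trimmed copy written to", trimmed)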
# This is just to keep a record of all OPF files processed; no need to change it.
fields = [
    "opf_original", "opf_trimmed",
    "onset_timestamp", "offset_timestamp",
    "onset_millis", "offset_millis",
]
rows = []

# Filename template for the created OPF files (change the suffix here if needed).
file_cut = "{}_cut.opf"

for root, dirs, files in os.walk(source_folder):
    for file in files:
        if file.endswith(".opf"):
            print("Loading file: {}".format(file))
            original_file = os.path.join(root, file)
            sheet = pv.load_opf(original_file)
            col = sheet.get_column(column_ref)
            if col is None:
                continue

            # Preserve the exception columns so they survive trimming untouched.
            exception = {}
            for colname in columns_exception:
                if colname in sheet.columns.keys():
                    exception[colname] = sheet.get_column(colname)

            onset = col.cells[cell_ref].onset
            offset = col.cells[cell_ref].offset
            print("Found onset: {} offset: {} in {}".format(
                pv.to_timestamp(onset), pv.to_timestamp(offset), column_ref))
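            # A sketch of how the loop body might continue (assumed, not shown
            # in the source): trim the sheet to the window found above, restore
            # the exception columns, save the cut copy, and record a row
            # matching `fields` for the processing log.
            #
            # sheet = pv.trim_sheet(onset, offset, sheet, True, False)
            # sheet.columns.update(exception)
            # cut_file = os.path.join(root, file_cut.format(os.path.splitext(file)[0]))
            # pv.save_opf(sheet, cut_file, *sheet.columns.keys())
            # rows.append([original_file, cut_file,
            #              pv.to_timestamp(onset), pv.to_timestamp(offset),
            #              onset, offset])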
def test_trim_sheet(sample_spreadsheet, trimmed_sample_spreadsheet, onset, offset):
    sheet = pv.load_opf(sample_spreadsheet)
    sheet = pv.trim_sheet(onset, offset, sheet, True, False)
    sheet_trimmed = pv.load_opf(trimmed_sample_spreadsheet)
    assert sheet == sheet_trimmed
def test_load_sample(sample_spreadsheet):
    sheet = pv.load_opf(sample_spreadsheet)
    assert len(sheet.get_column_list()) == 6
    momspeech = sheet.get_column("MomSpeech")
    assert len(momspeech.sorted_cells()) == 20