def generate_filepaths(metadata_file, output_dir):
    """Attach an output directory to every row of a metadata CSV.

    For each row read from *metadata_file*, the path
    ``output_dir + "/" + row['filename']`` is built, created as a
    directory when nothing exists at that path yet, and stored in the
    row under the ``'output_path'`` key.

    Returns the rows produced by ``metadata.read_csv``, updated in place.
    """
    rows = metadata.read_csv(metadata_file)
    for entry in rows:
        target = "/".join((output_dir, entry['filename']))
        already_there = os.path.exists(target)
        if not already_there:
            os.makedirs(target)
        # Recorded whether or not the directory had to be created.
        entry['output_path'] = target
    return rows
def read_all_metadata(columns, path):
    """Collect the requested columns from every metadata file under *path*.

    Each file reported by ``find_metadata_files(path)`` is read with
    ``metadata.read_csv``; every row is narrowed to the keys listed in
    *columns*. A row lacking one of those keys is reported on stdout and
    left out of the result.

    Returns a list of dicts, one per row that had all requested columns.
    """
    collected = []
    for name in find_metadata_files(path):
        rows = metadata.read_csv(os.path.join(path, name))
        for row in rows:
            try:
                selected = {col: row[col] for col in columns}
            except KeyError as err:
                print('Missing column: {}- {}'.format(row, err))
            else:
                collected.append(selected)
    return collected
def get_common_columns(path):
    """Return the column names shared by every metadata file under *path*.

    The first element of what ``metadata.read_csv`` returns for each file
    is taken as that file's columns (presumably the header row or the
    first record -- confirm against ``metadata.read_csv``). A name is
    "common" when it was counted once for each file in
    ``find_metadata_files(path)``.

    Names are compared verbatim: no case folding or whitespace stripping
    is applied.
    """
    metadata_files = find_metadata_files(path)

    # First element of each file, keyed by file name.
    first_rows = {}
    for name in metadata_files:
        rows = metadata.read_csv(os.path.join(path, name))
        first_rows[name] = rows[0]

    # How many files each column name was seen in.
    seen_in = defaultdict(int)
    for columns in first_rows.values():
        for col in columns:
            seen_in[col] += 1

    return [
        name
        for name, count in seen_in.items()
        if count == len(metadata_files)
    ]
def main():
    """Write a part-of-speech-reduced copy of every text in the metadata.

    Reads the metadata CSV named by ``settings.input``. For each row, the
    text of ``row['filename']`` is loaded, tagged with ``wf.tag_pos`` for
    ``settings.language``, reduced with ``reduce`` against
    ``settings.pos_list`` and written to the path returned by
    ``get_new_path(filename, settings.output_dir)``. The row's
    ``'filename'`` is then pointed at the new file.

    Rows whose file cannot be found are reported on stdout and skipped;
    rows that point at a directory are skipped silently. Skipped rows stay
    in the metadata with their original ``'filename'``. Finally the whole
    metadata is written to ``get_new_path(settings.input,
    settings.output_dir)``.
    """
    settings = get_settings()
    metadata = m.read_csv(settings.input)

    if not os.path.exists(settings.output_dir):
        os.makedirs(settings.output_dir)

    for row in metadata:
        filename = row['filename']
        try:
            text = wf.get_text_from_file(filename)
        except FileNotFoundError as err:
            print('Skipping: {}'.format(err))
            continue
        except IsADirectoryError:
            continue  # bad metadata, ignore

        # NOTE(review): the result of this call was never used; the call is
        # kept in case wf.output_filename has side effects -- confirm and
        # remove if it is pure.
        wf.output_filename(settings.output_dir, filename)

        pos_tagged_text = wf.tag_pos(text, settings.language)
        # NOTE(review): `reduce` is called as (tagged_text, pos_list), so it
        # is presumably a helper from this module rather than
        # functools.reduce -- confirm.
        reduced_text = reduce(pos_tagged_text, settings.pos_list)

        new_filename = get_new_path(filename, settings.output_dir)
        row['filename'] = new_filename
        # Context manager so the handle is closed even if write() raises.
        with open(new_filename, 'w') as fh:
            fh.write(reduced_text)

    new_filename = get_new_path(settings.input, settings.output_dir)
    m.write_csv(new_filename, metadata)
def main():
    """Filter the input metadata by date range and write the result.

    Stores the parsed settings in the module-level ``settings`` global,
    reads ``settings.input``, passes the rows through ``filter_dates``
    with ``settings.start`` and ``settings.end``, and writes whatever it
    returns to ``settings.output``.
    """
    global settings
    settings = get_settings()

    rows = metadata.read_csv(settings.input)
    in_range = filter_dates(rows, settings.start, settings.end)
    metadata.write_csv(settings.output, in_range)
def main():
    """Run ``delete_files`` over the rows of the input metadata.

    Stores the parsed settings in the module-level ``settings`` global,
    then reads ``settings.input`` with ``metadata.read_csv`` and hands the
    rows to ``delete_files``.
    """
    global settings
    settings = get_settings()
    delete_files(metadata.read_csv(settings.input))