print(f'Using config file {config_file}') else: config_file = pathlib.Path(args.config) if not config_file.exists(): logging.error('Config file does not exist') exit(1) paths, muids_mapping = load_paths_file(config_file) if args.verbose: print(f'paths: {paths}') print('muid mapping: {muids_mapping}') rows = load_sheet(file) files = {file.stem: file for file in iter_json_files()} segment_uid_to_file_mapping = {} if not paths: file_uid = pathlib.Path(args.file).stem def get_file(uid, file_uid, muids): filestem = f'{uid}_{muids}' if filestem in files: return files[filestem] alt_filestem = f'{file_uid}_{muids}' if alt_filestem in files: return files[alt_filestem]
def get_data(repo_dir: pathlib.Path,
             uids: set[str],
             include_filter: 'Optional[set[frozenset[str] | str]]' = None,
             exclude_filter: Optional[set[str]] = None) -> Generator[list[str], None, None]:
    """
    repo_dir is a path to the bilara-data repository or structurally equivalent data

    uids is a set of the uids to get the data for, this can be a single text uid such
    as {dn2}, a single folder uid such as {dn} or multiple, such as
    {dn1,dn2,dn3,dn4,dn5,dn6,dn7,dn8,dn9,dn10}

    include_filter is a set of muids or frozensets of muids
    {frozenset({'translation','en','sujato'}),'root','reference'}, if None everything
    is included

    exclude_filter is a set of muids, anything matching will be excluded, e.g.
    {'comment'}. If None nothing is excluded.

    Returns a generator that yields rows of data suitable for feeding to csv_writer.
    The first result is the fields which will always start with the segment_id and be
    followed by root, translation and markup fields (provided they are included by
    filters), remaining included fields are sorted in simple alphabetical order.
    Subsequent results are rows of data. When multiple texts are processed each text
    is separated by a list of empty strings.
    """
    # Maps text uid -> {muids_string: file path} for every file that survives
    # the uid and include/exclude filters.
    file_uid_mapping = {}
    for file in iter_json_files(repo_dir):
        try:
            uid, muids_string = file.stem.split('_')
        except ValueError:
            # BUG FIX: was a bare `except:` (which also swallows
            # KeyboardInterrupt/SystemExit); only ValueError can arise from
            # the two-way unpack. Report the offending file, then propagate.
            print(file)
            raise
        # A file matches when its own uid, or any of its ancestor folder
        # names, appears in the requested uids.
        if not (uid in uids or any(part in uids for part in file.parent.parts)):
            continue
        print('Reading {}'.format(str(file.relative_to(repo_dir))))
        muids = frozenset(muids_string.split('-'))
        if include_filter:
            for muid in include_filter:
                if isinstance(muid, frozenset):
                    # A frozenset entry matches only when ALL of its muids
                    # are present (subset test).
                    if muid.issubset(muids):
                        break
                else:
                    if muid in muids:
                        break
            else:
                # No include entry matched: skip this file.
                continue
        if exclude_filter and exclude_filter.intersection(muids):
            continue
        if uid not in file_uid_mapping:
            file_uid_mapping[uid] = {}
        file_uid_mapping[uid][muids_string] = file
    if not file_uid_mapping:
        # NOTE(review): `args` is a module-level global, not a parameter —
        # confirm it is always defined before get_data is called.
        print('No matches for {}'.format(",".join(args.uid)), file=sys.stderr)
        exit(1)
    # Collect every muids_string seen across all texts so that each row has
    # a consistent set of columns, ordered by muid_sort_key.
    muid_strings = set()
    for keys in file_uid_mapping.values():
        muid_strings.update(keys)
    muid_strings = sorted(muid_strings, key=muid_sort_key)
    return yield_rows(muid_strings, file_uid_mapping)
help='Spreadsheet file to import. CSV, TSV, ODS, XLS') parser.add_argument('--original', help='Old spreadsheet, used for updates') parser.add_argument('-q', '--quiet', help='Do not display changes to files') args = parser.parse_args() original_rows = pyexcel.iget_records(file_name=args.original) new_rows = pyexcel.iget_records(file_name=args.file) segment_id_mapping = {} for old, new in zip(original_rows, new_rows): segment_id_mapping[old['segment_id']] = new['segment_id'] for file in iter_json_files(): data = json_load(file) new_data = {} changed = False for k, v in data.items(): if k in segment_id_mapping: k = segment_id_mapping[k] changed = True new_data[k] = v if changed: print(f'Updated {file}') with file.open('w') as f: json.dump(new_data, f, ensure_ascii=False, indent=2)
print(f'Using config file {config_file}') else: config_file = pathlib.Path(args.config) if not config_file.exists(): logging.error('Config file does not exist') exit(1) paths, muids_mapping = load_paths_file(config_file) if args.verbose: print(f'paths: {paths}') print('muid mapping: {muids_mapping}') rows = load_sheet(file) files = {file.stem: file for file in iter_json_files(repo_dir)} segment_uid_to_file_mapping = {} if not paths: file_uid = pathlib.Path(args.file).stem def get_file(uid, file_uid, muids): filestem = f'{uid}_{muids}' if filestem in files: return files[filestem] alt_filestem = f'{file_uid}_{muids}' if alt_filestem in files: return files[alt_filestem]