Example #1
            print(f'Using config file {config_file}')
        else:
            config_file = pathlib.Path(args.config)
            if not config_file.exists():
                logging.error('Config file does not exist')
                sys.exit(1)

        paths, muids_mapping = load_paths_file(config_file)

    if args.verbose:
        print(f'paths: {paths}')
        print(f'muid mapping: {muids_mapping}')

    rows = load_sheet(file)

    # Map each JSON file's stem (uid_muids) to its path for quick lookup.
    files = {file.stem: file for file in iter_json_files()}

    segment_uid_to_file_mapping = {}

    if not paths:

        file_uid = pathlib.Path(args.file).stem

        def get_file(uid, file_uid, muids):
            # Try the segment's own uid first, then fall back to the uid of
            # the spreadsheet file it came from.
            filestem = f'{uid}_{muids}'
            if filestem in files:
                return files[filestem]

            alt_filestem = f'{file_uid}_{muids}'
            if alt_filestem in files:
                return files[alt_filestem]
            return None
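
To make the fallback concrete, here is a self-contained sketch of the same lookup logic; the file stems and uids are invented for illustration and are not part of the script above.

# Hypothetical illustration of the get_file fallback; stems and uids are invented.
files = {
    'dn2_translation-en-sujato': 'dn2.json',
    'an1.1-10_translation-en-sujato': 'an1.json',
}

def lookup(uid, file_uid, muids):
    # Same shape as get_file: segment uid first, spreadsheet file uid second.
    return files.get(f'{uid}_{muids}') or files.get(f'{file_uid}_{muids}')

assert lookup('dn2', 'an1.1-10', 'translation-en-sujato') == 'dn2.json'
assert lookup('an1.5', 'an1.1-10', 'translation-en-sujato') == 'an1.json'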
Example #2
import pathlib
import sys
from typing import Generator, Optional, Union

# iter_json_files, muid_sort_key and yield_rows are helpers assumed to be
# defined elsewhere in this module.


def get_data(
    repo_dir: pathlib.Path,
    uids: set[str],
    include_filter: Optional[set[Union[str, frozenset[str]]]] = None,
    exclude_filter: Optional[set[str]] = None,
) -> Generator[list[str], None, None]:
    """
    repo_dir is a path to the bilara-data repository or structurally equivilant data

    uids is a set of the uids to get the data for, this can be a single text uid such as {dn2}, a single folder uid such as {dn}
    or multiple, such as {dn1,dn2,dn3,dn4,dn5,dn6,dn7,dn8,dn9,dn10}

    include_filter is a set of muids or frozensets of muids {frozenset({'translation','en','sujato'}),'root','reference'}, if None everything is included
    
    exclude_filter is a set of muids, anything matching will be excluded, e.g. {'comment'}. If None nothing is excluded.

    Returns a generator that yields rows of data suitable for feeding to csv_writer. The first result is the fields
    which will always start with the segment_id and be followed by root, translation and markup fields (provided they
    are included by filters), remaining included fields are sorted in simple alphabetical order.
    Subsequent results are rows of data.
    When multiple texts are processed each text is seperated by a list of empty strings.
    """

    # uid -> {muids_string -> file path}, built from files that survive the filters.
    file_uid_mapping = {}
    for file in iter_json_files(repo_dir):
        
        try:
            uid, muids_string = file.stem.split('_')
        except ValueError:
            # File names are expected to follow the uid_muids pattern.
            print(f'Unexpected file name: {file}', file=sys.stderr)
            raise

        if not (uid in uids or any(part in uids for part in file.parent.parts)):
            continue
            
        print(f'Reading {file.relative_to(repo_dir)}')

        muids = frozenset(muids_string.split('-'))
        if include_filter:
            for muid in include_filter:
                if isinstance(muid, frozenset):
                    # A frozenset entry matches only if all of its muids are present.
                    if muid <= muids:
                        break
                else:
                    if muid in muids:
                        break
            else:
                # No include_filter entry matched this file, skip it.
                continue
        
        if exclude_filter and exclude_filter.intersection(muids):
            continue
        
        file_uid_mapping.setdefault(uid, {})[muids_string] = file
    
    if not file_uid_mapping:
        print(f"No matches for {','.join(sorted(uids))}", file=sys.stderr)
        sys.exit(1)
        
    # Collect every muids string seen across the matched files.
    muid_strings = set()
    for keys in file_uid_mapping.values():
        muid_strings.update(keys)
    
    muid_strings = sorted(muid_strings, key=muid_sort_key)
    
    return yield_rows(muid_strings, file_uid_mapping)
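
A minimal usage sketch for get_data follows; the repository path, uids and filter values are illustrative assumptions, not taken from the surrounding code.

if __name__ == '__main__':
    import csv

    # Hypothetical invocation: dump the root text and Sujato's English
    # translation of DN 1 and DN 2 to stdout as CSV.
    rows = get_data(
        pathlib.Path('bilara-data'),
        uids={'dn1', 'dn2'},
        include_filter={frozenset({'translation', 'en', 'sujato'}), 'root'},
        exclude_filter={'comment'},
    )
    writer = csv.writer(sys.stdout)
    for row in rows:
        writer.writerow(row)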
Example #3
                        help='Spreadsheet file to import. CSV, TSV, ODS, XLS')
    parser.add_argument('--original', help='Old spreadsheet, used for updates')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Do not display changes to files')
    args = parser.parse_args()

    original_rows = pyexcel.iget_records(file_name=args.original)
    new_rows = pyexcel.iget_records(file_name=args.file)

    # Map each old segment id to its replacement by walking both sheets in parallel.
    segment_id_mapping = {}

    for old, new in zip(original_rows, new_rows):
        segment_id_mapping[old['segment_id']] = new['segment_id']

    for file in iter_json_files():
        data = json_load(file)

        new_data = {}
        changed = False

        for k, v in data.items():
            if k in segment_id_mapping:
                k = segment_id_mapping[k]
                changed = True
            new_data[k] = v
        if changed:
            if not args.quiet:
                print(f'Updated {file}')
            with file.open('w') as f:
                json.dump(new_data, f, ensure_ascii=False, indent=2)
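
To see the rewrite in isolation: the loop above is equivalent to a dict comprehension over the mapping, as this small sketch with invented segment ids shows.

# Hypothetical demonstration of the segment id remapping; ids are invented.
data = {'dn2:1.1': 'Evaṁ me sutaṁ.'}
segment_id_mapping = {'dn2:1.1': 'dn2:1.2'}

new_data = {segment_id_mapping.get(k, k): v for k, v in data.items()}
assert new_data == {'dn2:1.2': 'Evaṁ me sutaṁ.'}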