Example #1
def add_site_to_history():
    db = get_db()
    site_dirs = [WORKING_DIR / 'root/en/site'] + list(WORKING_DIR.glob('**/translation/*/site'))

    docs = {}

    for folder in site_dirs:
        lang = folder.parts[-2]  # '.../root/en/site' -> 'en', '.../translation/<lang>/site' -> '<lang>'
        for file in folder.glob('**/*.json'):
            file_uid = file.stem.split('_')[0]  # '<uid>_<muids>' -> '<uid>'
            
            with file.open() as f:
                entries = json.load(f)
            
            for k, v in entries.items():
                context = f'{file_uid}_{k}'

                if context in docs:
                    doc = docs[context]
                else:
                    doc = {
                        '_key': f'bilara_{context}',
                        'context': context,
                        'origin': 'bilara',
                        'strings': {}
                    }
                    docs[context] = doc
                doc['strings'][lang] = v
            
    errors = db['historic'].import_bulk(docs.values(), on_duplicate='replace', halt_on_error=False)
    return errors, docs
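
All of these snippets appear to come from one module and lean on shared module-level state: WORKING_DIR, REPO_DIR, get_db(), and the usual stdlib imports. The database calls (import_bulk, insert_many, truncate) match the python-arango driver. A minimal sketch of that assumed context, for orientation only; the paths and credentials are placeholders, not the project's real configuration, and project helpers such as humansortkey, json_load, get_uid_and_muids, load_state and save_state are omitted:

import json
import logging
import pathlib
import time
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
from time import monotonic

from arango import ArangoClient  # assumption: python-arango

REPO_DIR = pathlib.Path('/path/to/repo')  # placeholder checkout root
WORKING_DIR = REPO_DIR / 'working'        # placeholder; holds root/, translation/, comment/

def get_db():
    # assumption: returns a python-arango StandardDatabase;
    # db['historic'] and db['strings'] index into its collections
    return ArangoClient().db('bilara', username='root', password='passwd')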
Example #2
def iter_all_files(self):
    for folder in WORKING_DIR.iterdir():
        if not folder.is_dir() or folder.name.startswith('.'):
            continue
        for file in folder.glob('**/*.json'):
            # data files are named '<uid>_<muids>.json'; skip any JSON
            # whose stem lacks the underscore separator
            if "_" not in file.stem:
                continue
            yield file
Example #3
def make_special_uid_mapping():
    uid_mapping = {}
    for file in WORKING_DIR.glob("root/**/*.json"):
        if "blurbs" in str(file):
            continue
        with file.open() as f:
            data = json.load(f)
        for k in data:
            uid = k.split(":")[0]
            if uid not in _uid_index and uid not in uid_mapping:
                uid_mapping[uid] = file.name.split("_")[0]
    return uid_mapping
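
The "special uid" mapping seems aimed at range files, where one root JSON file carries segments for several uids that have no file of their own. A worked illustration (the file name here is hypothetical):

# 'root/.../an1.1-10_root-pli-ms.json' might contain keys such as
# 'an1.1:1.1', 'an1.2:1.1', ... If 'an1.1' never appears in _uid_index
# (it has no file of its own), it is mapped to the containing file's uid:
#
#     make_special_uid_mapping()  ->  {'an1.1': 'an1.1-10', ...}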
Example #4
def load_data():
    # On my PC, using a ThreadPoolExecutor cuts the import time to one-third
    executor = ThreadPoolExecutor(max_workers=4)
    limit = 4
    futures = set()

    db = get_db()
    strings_coll = db['strings']
    strings_coll.truncate()
    start = monotonic()
    for folder in sorted(WORKING_DIR.glob('*')):
        if folder.name not in {'root', 'translation', 'comment'}:
            continue
        print(f'\nProcessing: {folder.name}')
        files = list(folder.glob('**/*.json'))
        docs = []
        for i, file in enumerate(files):
            # trailing spaces plus '\r' overwrite the previous, possibly longer, count
            print(f'{i} of {len(files)}', end='    \r')

            with file.open() as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    logging.error(
                        f'Could not parse JSON, skipping: {file.relative_to(REPO_DIR)}'
                    )
                    continue
            uid, muids = file.stem.split('_')
            for segment_id, string in data.items():
                doc = {
                    '_key': f'{muids}:{segment_id}',
                    'muids': muids,
                    'segment_id': segment_id,
                    'string': string
                }
                docs.append(doc)
            if len(docs) > 10000:
                if len(futures) > limit:
                    completed, futures = wait(futures,
                                              return_when=FIRST_COMPLETED)
                futures.add(
                    executor.submit(strings_coll.insert_many, docs.copy()))
                docs.clear()
        if docs:
            futures.add(executor.submit(strings_coll.insert_many, docs.copy()))

    completed, futures = wait(futures)
    executor.shutdown()
    print(f'\nComplete in {monotonic() - start:.1f} seconds')
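
The batch-and-wait dance above is a generic backpressure idiom: keep at most limit insert batches in flight so memory stays bounded while the database catches up. Stripped of the ArangoDB specifics it reduces to something like this (submit_bounded, fn and batches are illustrative placeholders, not names from the source):

from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED

def submit_bounded(executor, fn, batches, limit=4):
    futures = set()
    for batch in batches:
        if len(futures) > limit:
            # block until at least one in-flight batch completes
            done, futures = wait(futures, return_when=FIRST_COMPLETED)
        futures.add(executor.submit(fn, batch))
    wait(futures)  # drain the stragglers

with ThreadPoolExecutor(max_workers=4) as executor:
    submit_bounded(executor, print, [[1, 2], [3, 4]])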
Example #5
def validate_permissions(rules=None):
    if not rules:
        rules = get_rules()
    files = [
        str(file.relative_to(WORKING_DIR))
        for file in WORKING_DIR.glob('**/*.json')
        if not any(part.startswith('.') for part in file.parts)
    ]

    for user, user_permissions in rules.items():
        if user.startswith('_'):
            continue  # not a valid GitHub ID: '_'-prefixed keys are reserved for bilara
        for paths in user_permissions.values():
            for path in paths:

                if path == '*':
                    continue
                for file in files:
                    if file.startswith(path):
                        break
                else:
                    problemsLog.add(file=publications_file_name,
                                    msg=f"No files match path: {path}")
Example #6
def make_file_index(force=False):
    global _tree_index
    global _uid_index
    global _muid_index
    global _file_index
    global _meta_definitions
    global _special_uid_mapping
    global _legal_ids

    if state_build_lock_file.exists():
        # Another process started the build; let that process do the work,
        # then pick up its result once the lock file disappears.
        for _ in range(100):
            time.sleep(1)
            if not state_build_lock_file.exists():
                if load_state():
                    _build_complete.set()
                    return
        # We should not normally get here, but if we do, fall through and
        # run the build ourselves after 100 seconds of waiting.
    try:
        state_build_lock_file.touch()
        _muid_index = muid_index = {}
        _uid_index = uid_index = {}
        _file_index = file_index = {}
        _legal_ids = set()

        for file in sorted(WORKING_DIR.glob('root/**/*.json')):
            with file.open() as f:
                data = json.load(f)
                _legal_ids.update(data.keys())

        def recurse(folder, meta_definitions=None, depth=0):
            subtree = {}
            meta_definitions = meta_definitions.copy()

            metafiles = set(folder.glob("_*.json"))
            if metafiles:
                for metafile in sorted(metafiles, key=humansortkey):
                    file_data = json_load(metafile)
                    if isinstance(file_data, dict):
                        meta_definitions.update(file_data)

                        for k, v in file_data.items():
                            if k not in _meta_definitions:
                                _meta_definitions[k] = v

            for file in sorted(folder.glob("*"), key=humansortkey):

                if file.name.startswith("."):
                    continue
                if file in metafiles:
                    continue
                long_id = file.stem
                meta = {}
                for part in file.parts:
                    if part.endswith(".json"):
                        part = part[:-5]
                    if part in meta_definitions:
                        meta[part] = meta_definitions[part]
                if file.is_dir():
                    subtree[file.name] = recurse(
                        file,
                        meta_definitions=meta_definitions,
                        depth=depth + 1)
                    subtree[file.name]["_meta"] = meta
                elif file.suffix == ".json":
                    mtime = file.stat().st_mtime_ns
                    path = str(file.relative_to(WORKING_DIR))
                    obj = subtree[long_id] = {
                        "path": path,
                        "mtime": mtime,
                        "_meta": meta
                    }
                    if "_" in long_id:
                        uid, muids = get_uid_and_muids(file)
                    else:
                        uid = file.stem  # only .json files reach this branch, never directories
                        muids = None
                    obj["uid"] = uid
                    if uid not in uid_index:
                        uid_index[uid] = set()
                    uid_index[uid].add(long_id)
                    if long_id in file_index:
                        logging.error(f"{str(file)} not unique")
                    file_index[long_id] = obj
                    if muids:
                        for muid in muids:
                            if muid not in muid_index:
                                muid_index[muid] = set()
                            muid_index[muid].add(long_id)

                        # Create Virtual Files
                        if 'translation' in muids:
                            uid, muids = long_id.split('_')
                            _add_virtual_comment_file(uid, muids, file,
                                                      uid_index, muid_index,
                                                      file_index,
                                                      meta_definitions)

            if depth == 0:
                _add_virtual_project_files(uid_index, muid_index, file_index,
                                           subtree, _meta_definitions)
            return subtree

        _meta_definitions = {}
        _tree_index = recurse(WORKING_DIR, {})
        _uid_index = uid_index
        _muid_index = muid_index
        _file_index = file_index
        _special_uid_mapping = make_special_uid_mapping()

        for v in file_index.values():
            v["_meta"] = invert_meta(v["_meta"])
        print("File Index Built")
        save_state()
        _build_complete.set()
    finally:
        state_build_lock_file.unlink()
    stats_calculator.reset()
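
_build_complete behaves like a threading.Event: the builder sets it, readers wait on it, so a request arriving mid-rebuild blocks instead of reading a half-built index. A hypothetical reader (get_file_entry is not in the source):

def get_file_entry(long_id):
    # block until make_file_index() in this process has finished;
    # cross-process coordination is handled by the lock file above
    _build_complete.wait()
    return _file_index.get(long_id)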
Example #7
def make_file_index(force=False):
    _build_started.set()
    global _tree_index
    global _uid_index
    global _muid_index
    global _file_index
    global _meta_definitions
    global _special_uid_mapping
    global _legal_ids

    if not force:
        load_state()

    print("Building file index")

    _muid_index = muid_index = {}
    _uid_index = uid_index = {}
    _file_index = file_index = {}
    _legal_ids = set()

    for file in sorted(WORKING_DIR.glob('root/**/*.json')):
        with file.open() as f:
            data = json.load(f)
            _legal_ids.update(data.keys())

    def recurse(folder, meta_definitions=None):
        subtree = {}
        meta_definitions = meta_definitions.copy()

        metafiles = set(folder.glob("_*.json"))
        if metafiles:
            for metafile in sorted(metafiles, key=humansortkey):
                file_data = json_load(metafile)
                meta_definitions.update(file_data)

                for k, v in file_data.items():
                    if k not in _meta_definitions:
                        _meta_definitions[k] = v

        for file in sorted(folder.glob("*"), key=humansortkey):

            if file.name.startswith("."):
                continue
            if file in metafiles:
                continue
            long_id = file.stem
            meta = {}
            for part in file.parts:
                if part.endswith(".json"):
                    part = part[:-5]
                if part in meta_definitions:
                    meta[part] = meta_definitions[part]
            if file.is_dir():
                subtree[file.name] = recurse(file,
                                             meta_definitions=meta_definitions)
                subtree[file.name]["_meta"] = meta
            elif file.suffix == ".json":
                mtime = file.stat().st_mtime_ns
                path = str(file.relative_to(WORKING_DIR))
                obj = subtree[long_id] = {
                    "path": path,
                    "mtime": mtime,
                    "_meta": meta
                }
                if "_" in long_id:
                    uid, muids = get_uid_and_muids(file)
                else:
                    uid = file.stem  # only .json files reach this branch, never directories
                    muids = None
                obj["uid"] = uid
                if uid not in uid_index:
                    uid_index[uid] = set()
                uid_index[uid].add(long_id)
                if long_id in file_index:
                    logging.error(f"{str(file)} not unique")
                file_index[long_id] = obj
                if muids:
                    for muid in muids:
                        if muid not in muid_index:
                            muid_index[muid] = set()
                        muid_index[muid].add(long_id)

                    # Create Virtual Files
                    if 'translation' in muids:
                        uid, muids = long_id.split('_')
                        muids = muids.replace('translation', 'comment')
                        comment_stem = f"{uid}_{muids}"
                        if comment_stem in file_index:
                            # a real comment file already exists for this translation
                            continue
                        parent = pathlib.Path('comment') / file.relative_to(
                            WORKING_DIR / 'translation').parent
                        virtual_file = parent / (comment_stem + '.json')
                        meta = {
                            part: meta_definitions[part]
                            for part in muids.split('-')
                            if part in meta_definitions
                        }
                        obj = {
                            "uid": uid,
                            "path": str(virtual_file),
                            "mtime": None,
                            "_meta": meta
                        }
                        uid_index[uid].add(comment_stem)
                        file_index[comment_stem] = obj
                        for muid in muids.split('-'):
                            # 'comment' may not be in muid_index yet, so create the set on demand
                            muid_index.setdefault(muid, set()).add(comment_stem)

        return subtree

    _meta_definitions = {}
    _tree_index = recurse(WORKING_DIR, {})
    _uid_index = uid_index
    _muid_index = muid_index
    _file_index = file_index
    _special_uid_mapping = make_special_uid_mapping()

    for v in file_index.values():
        v["_meta"] = invert_meta(v["_meta"])
    print("File Index Built")
    save_state()
    _build_complete.set()
    stats_calculator.reset()