Beispiel #1
0
def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Install every corpus described by the configuration files in conf_list.

    For each file the configuration is loaded into an InstallJson object.
    If `replace` is truthy, the existing backend record is removed first.
    The configuration is saved unless the backend already contains the
    corpus. Finally, if a registry file for the corpus exists under
    `reg_dir` (or `reg_dir/variant` when `variant` is set), it is parsed
    via parse_registry.

    arguments:
    conf_list -- an iterable of paths to corpus configuration files
    backend -- an object providing remove_corpus, contains_corpus and
               save_corpus_config
    reg_dir -- a directory containing registry files
    variant -- an optional subdirectory of reg_dir (falsy = not used)
    replace -- if truthy, remove an existing record before installing
    """
    log = logging.getLogger(__name__)
    for conf_file in conf_list:
        log.info('Processing {0}'.format(conf_file))
        with open(conf_file) as conf_stream:
            conf = InstallJson()
            conf.update(conf_stream)

            if replace:
                log.info(
                    'Removing existing record (including registry) for {0}.'.format(conf.ident))
                backend.remove_corpus(conf.ident)

            if not backend.contains_corpus(conf.ident):
                corpus_size = get_corpus_size(conf.ident, reg_dir)
                backend.save_corpus_config(conf, reg_dir, corpus_size)
                log.info('Saved config for {0}.'.format(conf.ident))
            else:
                log.info('Corpus {0} already present - skipping.'.format(conf.ident))

            # the registry lives either directly in reg_dir or in its variant subdirectory
            path_parts = [reg_dir, variant, conf.ident] if variant else [reg_dir, conf.ident]
            reg_path = os.path.join(*path_parts)
            if not os.path.isfile(reg_path):
                continue

            enc = infer_encoding(reg_path)
            with open(reg_path) as reg_stream:
                parse_registry(reg_stream, variant=variant, backend=backend, encoding=enc)
Beispiel #2
0
def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Install every corpus described by the configuration files in conf_list.

    For each file the configuration is loaded into an InstallJson object.
    If `replace` is truthy, the existing backend record is removed first.
    The configuration is saved unless the backend already contains the
    corpus. Finally, if a registry file for the corpus exists under
    `reg_dir` (or `reg_dir/variant` when `variant` is set), it is parsed
    via parse_registry.

    arguments:
    conf_list -- an iterable of paths to corpus configuration files
    backend -- an object providing remove_corpus, contains_corpus and
               save_corpus_config
    reg_dir -- a directory containing registry files
    variant -- an optional subdirectory of reg_dir (falsy = not used)
    replace -- if truthy, remove an existing record before installing
    """
    log = logging.getLogger(__name__)
    for conf_file in conf_list:
        log.info('Processing {0}'.format(conf_file))
        with open(conf_file) as conf_stream:
            conf = InstallJson()
            conf.update(conf_stream)

            if replace:
                log.info(
                    'Removing existing record (including registry) for {0}.'.format(
                        conf.ident))
                backend.remove_corpus(conf.ident)

            if not backend.contains_corpus(conf.ident):
                corpus_size = get_corpus_size(conf.ident, reg_dir)
                backend.save_corpus_config(conf, reg_dir, corpus_size)
                log.info('Saved config for {0}.'.format(conf.ident))
            else:
                log.info(
                    'Corpus {0} already present - skipping.'.format(conf.ident))

            # the registry lives either directly in reg_dir or in its
            # variant subdirectory
            if variant:
                path_parts = [reg_dir, variant, conf.ident]
            else:
                path_parts = [reg_dir, conf.ident]
            reg_path = os.path.join(*path_parts)
            if not os.path.isfile(reg_path):
                continue

            enc = infer_encoding(reg_path)
            with open(reg_path) as reg_stream:
                parse_registry(
                    reg_stream, variant=variant, backend=backend, encoding=enc)
Beispiel #3
0
def process_directory(dir_path, variant, backend, auto_align, verbose):
    """
    Import all the registry files found directly in a directory and
    store corpus alignments for the imported corpora.

    Each regular file in the directory is passed to parse_registry. A file
    which fails to parse is logged and skipped so that one broken registry
    does not stop the whole import.

    Alignments are saved only for corpora where the parsed result carries
    a truthy 'created_rt' value (presumably "a new record was created" -
    TODO confirm against parse_registry).

    arguments:
    dir_path -- a directory containing registry files
    variant -- an optional subdirectory of dir_path to process instead of
               dir_path itself (falsy = not used)
    backend -- an object passed to parse_registry and providing
               save_corpus_alignments
    auto_align -- if truthy, every successfully parsed corpus is aligned
                  with all the others; otherwise the 'aligned' list
                  returned by parse_registry is used
    verbose -- if truthy, a traceback is printed for each failed file
    """
    log = logging.getLogger(__name__)
    if variant:
        dir_path = os.path.join(dir_path, variant)
    aligned = {}
    id_map = {}
    created_rt = {}
    for item in os.listdir(dir_path):
        fpath = os.path.join(dir_path, item)
        if not os.path.isfile(fpath):
            continue
        enc = infer_encoding(fpath)
        with open(fpath) as fr:
            try:
                ans = parse_registry(fr,
                                     variant=variant,
                                     backend=backend,
                                     encoding=enc)
                created_rt[ans['corpus_id']] = ans['created_rt']
                if not auto_align:
                    aligned[ans['corpus_id']] = ans['aligned']
                id_map[ans['corpus_id']] = ans['corpus_id']
            except Exception as ex:
                # deliberate best-effort: log and continue with other files
                log.error(ex)
                if verbose:
                    import traceback
                    # print_exc() takes (limit, file, ...); passing the
                    # exception as 'limit' raises TypeError on Python 3
                    traceback.print_exc()

    aligned_ids_map = defaultdict(list)
    if auto_align:
        ids = set(id_map.values())
        for k in ids:
            aligned_ids_map[k] = list(ids - {k})
    else:
        for corp_id, alig in aligned.items():
            for a in alig:
                try:
                    # an alignment target is accepted only if it has been
                    # successfully parsed within this directory
                    aligned_ids_map[corp_id].append(id_map[a])
                except KeyError:
                    log.warning(
                        'Ignored alignment {0} --> {1}'.format(corp_id, a))

    for corpus_id, aligned_ids in aligned_ids_map.items():
        if created_rt.get(corpus_id, False):
            backend.save_corpus_alignments(corpus_id, aligned_ids)
Beispiel #4
0
def process_directory(dir_path, variant, backend, auto_align, verbose):
    """
    Import all the registry files found directly in a directory and
    store corpus alignments for the imported corpora.

    Each regular file in the directory is passed to parse_registry. A file
    which fails to parse is logged and skipped so that one broken registry
    does not stop the whole import.

    Alignments are saved only for corpora where the parsed result carries
    a truthy 'created_rt' value (presumably "a new record was created" -
    TODO confirm against parse_registry).

    arguments:
    dir_path -- a directory containing registry files
    variant -- an optional subdirectory of dir_path to process instead of
               dir_path itself (falsy = not used)
    backend -- an object passed to parse_registry and providing
               save_corpus_alignments
    auto_align -- if truthy, every successfully parsed corpus is aligned
                  with all the others; otherwise the 'aligned' list
                  returned by parse_registry is used
    verbose -- if truthy, a traceback is printed for each failed file
    """
    log = logging.getLogger(__name__)
    if variant:
        dir_path = os.path.join(dir_path, variant)
    aligned = {}
    id_map = {}
    created_rt = {}
    for item in os.listdir(dir_path):
        fpath = os.path.join(dir_path, item)
        if not os.path.isfile(fpath):
            continue
        enc = infer_encoding(fpath)
        with open(fpath) as fr:
            try:
                ans = parse_registry(fr, variant=variant, backend=backend, encoding=enc)
                created_rt[ans['corpus_id']] = ans['created_rt']
                if not auto_align:
                    aligned[ans['corpus_id']] = ans['aligned']
                id_map[ans['corpus_id']] = ans['corpus_id']
            except Exception as ex:
                # deliberate best-effort: log and continue with other files
                log.error(ex)
                if verbose:
                    import traceback
                    # print_exc() takes (limit, file, ...); passing the
                    # exception as 'limit' raises TypeError on Python 3
                    traceback.print_exc()

    aligned_ids_map = defaultdict(list)
    if auto_align:
        ids = set(id_map.values())
        for k in ids:
            aligned_ids_map[k] = list(ids - {k})
    else:
        for corp_id, alig in aligned.items():
            for a in alig:
                try:
                    # an alignment target is accepted only if it has been
                    # successfully parsed within this directory
                    aligned_ids_map[corp_id].append(id_map[a])
                except KeyError:
                    log.warning('Ignored alignment {0} --> {1}'.format(corp_id, a))

    for corpus_id, aligned_ids in aligned_ids_map.items():
        if created_rt.get(corpus_id, False):
            backend.save_corpus_alignments(corpus_id, aligned_ids)
Beispiel #5
0
        '-l',
        '--auto-align',
        metavar='AUTO_ALIGN',
        action='store_const',
        const=True,
        help='Align all the corpus in a directory automatically')
    # -v/--verbose stores True when present; the value is forwarded to
    # process_directory below
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_const',
        const=True,
        help='Provide more information during processing (especially errors)')
    args = parser.parse_args()
    # settings is imported here rather than at the top of the file and is
    # loaded from the path given on the command line
    import settings
    settings.load(args.conf_path)
    backend = WritableBackend(MySQLConf(settings))

    if os.path.isdir(args.rpath):
        # a directory: first process the files located directly in it ...
        process_directory(args.rpath, None, backend, args.auto_align,
                          args.verbose)
        # ... and then, if requested, also its variant subdirectory
        if args.variant:
            process_directory(args.rpath, args.variant, backend,
                              args.auto_align, args.verbose)
    else:
        # anything else is treated as a single registry file; an explicit
        # encoding argument takes precedence over the inferred one
        with open(args.rpath) as fr:
            parse_registry(fr,
                           backend=backend,
                           variant=args.variant,
                           encoding=args.encoding
                           if args.encoding else infer_encoding(args.rpath))
Beispiel #6
0
            backend.save_corpus_alignments(corpus_id, aligned_ids)


if __name__ == '__main__':
    # Command-line entry point: imports either a single registry file or
    # all the registry files in a directory (plus, optionally, in its
    # variant subdirectory).
    parser = argparse.ArgumentParser(description='Import a Manatee registry file(s)')
    parser.add_argument(
        'rpath',
        metavar='REGISTRY_PATH',
        type=str)
    parser.add_argument(
        'conf_path',
        metavar='CONFPATH',
        type=str)
    parser.add_argument(
        '-e',
        '--encoding',
        metavar='ENCODING',
        type=str,
        default=None)
    parser.add_argument(
        '-a',
        '--variant',
        metavar='VARIANT',
        type=str,
        help='A subdirectory containing (restricted) variants of corpora')
    parser.add_argument(
        '-l',
        '--auto-align',
        metavar='AUTO_ALIGN',
        action='store_const',
        const=True,
        help='Align all the corpus in a directory automatically')
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_const',
        const=True,
        help='Provide more information during processing (especially errors)')
    args = parser.parse_args()

    # settings is imported here rather than at the top of the file and is
    # loaded from the path given on the command line
    import settings
    settings.load(args.conf_path)
    backend = WritableBackend(MySQLConf(settings))

    if not os.path.isdir(args.rpath):
        # a single registry file; an explicit encoding wins over the
        # inferred one
        with open(args.rpath) as fr:
            parse_registry(
                fr,
                backend=backend,
                variant=args.variant,
                encoding=args.encoding or infer_encoding(args.rpath))
    else:
        # the directory itself is always processed; the variant
        # subdirectory follows only if it was requested
        variants = [None]
        if args.variant:
            variants.append(args.variant)
        for current_variant in variants:
            process_directory(
                args.rpath, current_variant, backend, args.auto_align, args.verbose)