Example #1
0
def main():
    """Command-line entry point: parse options and run grammar extraction."""
    global online
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='Extract grammars from a compiled corpus.')
    parser.add_argument('-c', '--config', required=True,
                        help='extractor configuration')
    parser.add_argument('-g', '--grammars', help='grammar output path')
    parser.add_argument('-j', '--jobs', type=int, default=1,
                        help='number of parallel extractors')
    parser.add_argument('-s', '--chunksize', type=int, default=10,
                        help='number of sentences / chunk')
    parser.add_argument('-f', '--features', nargs='*', default=[],
                        help='additional feature definitions')
    parser.add_argument('-o', '--online', action='store_true',
                        help='online grammar extraction')
    parser.add_argument('-z', '--compress', action='store_true',
                        help='compress grammars with gzip')
    parser.add_argument('-t', '--stream', action='store_true',
                        help='stream mode (see README.md)')
    opts = parser.parse_args()

    # A grammar output path is required unless running in stream mode.
    if not (opts.grammars or opts.stream):
        sys.stderr.write('Error: either -g/--grammars or -t/--stream required\n')
        sys.exit(1)

    if opts.grammars and not os.path.exists(opts.grammars):
        os.mkdir(opts.grammars)
    # Additional feature definitions must be python modules.
    for feat_file in opts.features:
        if not feat_file.endswith('.py'):
            sys.stderr.write('Error: feature definition file <{}>'
                    ' should be a python module\n'.format(feat_file))
            sys.exit(1)

    online = opts.online
    stream_mode = opts.stream

    t_begin = monitor_cpu()
    if opts.jobs > 1:
        # Parallel extraction: incompatible with stream mode.
        if stream_mode:
            sys.stderr.write('Error: stream mode incompatible with multiple jobs\n')
            sys.exit(1)
        logging.info('Starting %d workers; chunk size: %d', opts.jobs, opts.chunksize)
        worker_pool = mp.Pool(opts.jobs, make_extractor, (opts,))
        try:
            for line_out in worker_pool.imap(extract, enumerate(sys.stdin), opts.chunksize):
                print(line_out)
        except KeyboardInterrupt:
            worker_pool.terminate()
    else:
        # Single-process extraction, optionally in stream mode.
        make_extractor(opts)
        if stream_mode:
            stream_extract()
        else:
            for line_out in map(extract, enumerate(sys.stdin)):
                print(line_out)

    t_end = monitor_cpu()
    logging.info("Overall extraction step took %f seconds", t_end - t_begin)
Example #2
0
def main():
    """Command-line entry point: parse options and run grammar extraction."""
    global online
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='Extract grammars from a compiled corpus.')
    parser.add_argument('-c', '--config', required=True,
                        help='extractor configuration')
    parser.add_argument('-g', '--grammars', required=True,
                        help='grammar output path')
    parser.add_argument('-j', '--jobs', type=int, default=1,
                        help='number of parallel extractors')
    parser.add_argument('-s', '--chunksize', type=int, default=10,
                        help='number of sentences / chunk')
    parser.add_argument('-f', '--features', nargs='*', default=[],
                        help='additional feature definitions')
    parser.add_argument('-o', '--online', action='store_true',
                        help='online grammar extraction')
    parser.add_argument('-z', '--compress', action='store_true',
                        help='compress grammars with gzip')
    opts = parser.parse_args()

    # Make sure the grammar output directory exists.
    if not os.path.exists(opts.grammars):
        os.mkdir(opts.grammars)
    # Additional feature definitions must be python modules.
    for feat_file in opts.features:
        if not feat_file.endswith('.py'):
            sys.stderr.write('Error: feature definition file <{}>'
                             ' should be a python module\n'.format(feat_file))
            sys.exit(1)

    online = opts.online

    t_begin = monitor_cpu()
    if opts.jobs > 1:
        # Parallel extraction over a worker pool, one chunk of stdin per task.
        logging.info('Starting %d workers; chunk size: %d', opts.jobs,
                     opts.chunksize)
        worker_pool = mp.Pool(opts.jobs, make_extractor, (opts, ))
        try:
            for line_out in worker_pool.imap(extract, enumerate(sys.stdin),
                                             opts.chunksize):
                print(line_out)
        except KeyboardInterrupt:
            worker_pool.terminate()
    else:
        # Single-process extraction.
        make_extractor(opts)
        for line_out in map(extract, enumerate(sys.stdin)):
            print(line_out)

    t_end = monitor_cpu()
    logging.info("Overall extraction step took %f seconds", t_end - t_begin)
Example #3
0
def main():
    """Compile a parallel corpus and its word alignment into the binary
    data files needed for suffix-array grammar extraction, then write the
    extractor configuration.

    Fixes: stray ')' removed from the --rank2 help string; the duplicated
    Bilex construction is hoisted out of the if/else branches.
    """
    preprocess_start_time = monitor_cpu()
    # Suffix-array construction recurses deeply on long corpora.
    sys.setrecursionlimit(sys.getrecursionlimit() * 100)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('cdec.sa.compile')
    parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.')
    parser.add_argument('--maxnt', '-n', type=int, default=2,
                        help='Maximum number of non-terminal symbols')
    parser.add_argument('--maxlen', '-l', type=int, default=5,
                        help='Maximum number of terminals')
    parser.add_argument('--maxsize', '-s', type=int, default=15,
                        help='Maximum rule span')
    parser.add_argument('--mingap', '-g', type=int, default=1,
                        help='Minimum gap size')
    parser.add_argument('--rank1', '-r1', type=int, default=100,
                        help='Number of pre-computed frequent patterns')
    parser.add_argument('--rank2', '-r2', type=int, default=10,
                        help='Number of pre-computed super-frequent patterns')
    parser.add_argument('--loose', action='store_true',
                        help='Enable loose phrase extraction (default: tight)')
    parser.add_argument('-c', '--config', default='/dev/stdout',
                        help='Output configuration')
    parser.add_argument('-f', '--source',
                        help='Source language corpus')
    parser.add_argument('-e', '--target',
                        help='Target language corpus')
    parser.add_argument('-b', '--bitext',
                        help='Parallel text (source ||| target)')
    parser.add_argument('-a', '--alignment', required=True,
                        help='Bitext word alignment')
    parser.add_argument('-o', '--output', required=True,
                        help='Output path')
    parser.add_argument('--online', action='store_true',
                        help='Compile data for online grammar extraction')
    args = parser.parse_args()

    # A parallel corpus may be given either as one bitext file or as a
    # separate source/target pair.
    if not ((args.source and args.target) or args.bitext):
        parser.error('a parallel corpus is required\n'
        '\tuse -f (source) with -e (target) or -b (bitext)')

    param_names = ('max_len', 'max_nt', 'max_size', 'min_gap',
            'rank1', 'rank2', 'tight_phrases')
    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap,
            args.rank1, args.rank2, not args.loose)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Paths of the compiled binary artifacts inside the output directory.
    f_sa_bin = os.path.join(args.output, 'f.sa.bin')
    e_bin = os.path.join(args.output, 'e.bin')
    # The precomputation file name encodes the extraction parameters.
    precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params)
    precomp_bin = os.path.join(args.output, precomp_file)
    a_bin = os.path.join(args.output, 'a.bin')
    lex_bin = os.path.join(args.output, 'lex.bin')
    # online only
    bilex_file = os.path.join(args.output, 'bilex.gz')

    # Record absolute paths in the extractor configuration.
    config = cdec.configobj.ConfigObj(args.config, unrepr=True)
    config['f_sa_file'] = os.path.abspath(f_sa_bin)
    config['e_file'] = os.path.abspath(e_bin)
    config['a_file'] = os.path.abspath(a_bin)
    config['lex_file'] = os.path.abspath(lex_bin)
    config['precompute_file'] = os.path.abspath(precomp_bin)
    if args.online:
        config['bilex_file'] = os.path.abspath(bilex_file)

    start_time = monitor_cpu()
    logger.info('Compiling source suffix array')
    if args.bitext:
        f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
    else:
        f_sa = cdec.sa.SuffixArray(from_text=args.source)
    f_sa.write_binary(f_sa_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling source suffix array took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling target data array')
    if args.bitext:
        e = cdec.sa.DataArray(from_text=args.bitext, side='target')
    else:
        e = cdec.sa.DataArray(from_text=args.target)
    e.write_binary(e_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling target data array took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Precomputing frequent phrases')
    precompute(f_sa, *params).write_binary(precomp_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling precomputations took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling alignment')
    a = cdec.sa.Alignment(from_text=args.alignment)
    a.write_binary(a_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling alignment took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary')
    lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
    lex.write_binary(lex_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time)

    if args.online:
        start_time = monitor_cpu()
        logger.info('Compiling online bilexical dictionary')
        # Construct once; only the add_bitext arguments differ per input form.
        bilex = cdec.sa.online.Bilex()
        if args.bitext:
            bilex.add_bitext(args.alignment, args.bitext)
        else:
            bilex.add_bitext(args.alignment, args.source, args.target)
        bilex.write(bilex_file)
        stop_time = monitor_cpu()
        logger.info('Compiling online bilexical dictionary took %f seconds', stop_time - start_time)

    # Write configuration
    for name, value in zip(param_names, params):
        config[name] = value
    config.write()
    preprocess_stop_time = monitor_cpu()
    logger.info('Overall preprocessing step took %f seconds', preprocess_stop_time - preprocess_start_time)
Example #4
0
def main():
    """Compile a parallel corpus and its word alignment into the binary
    data files needed for suffix-array grammar extraction, then write the
    extractor configuration.

    Fixes: 'seonds' -> 'seconds' in the alignment timing log message;
    stray ')' removed from the --rank2 help string; the duplicated Bilex
    construction is hoisted out of the if/else branches.
    """
    preprocess_start_time = monitor_cpu()
    # Suffix-array construction recurses deeply on long corpora.
    sys.setrecursionlimit(sys.getrecursionlimit() * 100)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('cdec.sa.compile')
    parser = argparse.ArgumentParser(
        description='Compile a corpus into a suffix array.')
    parser.add_argument('--maxnt',
                        '-n',
                        type=int,
                        default=2,
                        help='Maximum number of non-terminal symbols')
    parser.add_argument('--maxlen',
                        '-l',
                        type=int,
                        default=5,
                        help='Maximum number of terminals')
    parser.add_argument('--maxsize',
                        '-s',
                        type=int,
                        default=15,
                        help='Maximum rule span')
    parser.add_argument('--mingap',
                        '-g',
                        type=int,
                        default=1,
                        help='Minimum gap size')
    parser.add_argument('--rank1',
                        '-r1',
                        type=int,
                        default=100,
                        help='Number of pre-computed frequent patterns')
    parser.add_argument('--rank2',
                        '-r2',
                        type=int,
                        default=10,
                        help='Number of pre-computed super-frequent patterns')
    parser.add_argument('--loose',
                        action='store_true',
                        help='Enable loose phrase extraction (default: tight)')
    parser.add_argument('-c',
                        '--config',
                        default='/dev/stdout',
                        help='Output configuration')
    parser.add_argument('-f', '--source', help='Source language corpus')
    parser.add_argument('-e', '--target', help='Target language corpus')
    parser.add_argument('-b',
                        '--bitext',
                        help='Parallel text (source ||| target)')
    parser.add_argument('-a',
                        '--alignment',
                        required=True,
                        help='Bitext word alignment')
    parser.add_argument('-o', '--output', required=True, help='Output path')
    parser.add_argument('--online',
                        action='store_true',
                        help='Compile data for online grammar extraction')
    args = parser.parse_args()

    # A parallel corpus may be given either as one bitext file or as a
    # separate source/target pair.
    if not ((args.source and args.target) or args.bitext):
        parser.error('a parallel corpus is required\n'
                     '\tuse -f (source) with -e (target) or -b (bitext)')

    param_names = ('max_len', 'max_nt', 'max_size', 'min_gap', 'rank1',
                   'rank2', 'tight_phrases')
    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1,
              args.rank2, not args.loose)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Paths of the compiled binary artifacts inside the output directory.
    f_sa_bin = os.path.join(args.output, 'f.sa.bin')
    e_bin = os.path.join(args.output, 'e.bin')
    # The precomputation file name encodes the extraction parameters.
    precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params)
    precomp_bin = os.path.join(args.output, precomp_file)
    a_bin = os.path.join(args.output, 'a.bin')
    lex_bin = os.path.join(args.output, 'lex.bin')
    # online only
    bilex_file = os.path.join(args.output, 'bilex.gz')

    # Record absolute paths in the extractor configuration.
    config = cdec.configobj.ConfigObj(args.config, unrepr=True)
    config['f_sa_file'] = os.path.abspath(f_sa_bin)
    config['e_file'] = os.path.abspath(e_bin)
    config['a_file'] = os.path.abspath(a_bin)
    config['lex_file'] = os.path.abspath(lex_bin)
    config['precompute_file'] = os.path.abspath(precomp_bin)
    if args.online:
        config['bilex_file'] = os.path.abspath(bilex_file)

    start_time = monitor_cpu()
    logger.info('Compiling source suffix array')
    if args.bitext:
        f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
    else:
        f_sa = cdec.sa.SuffixArray(from_text=args.source)
    f_sa.write_binary(f_sa_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling source suffix array took %f seconds',
                stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling target data array')
    if args.bitext:
        e = cdec.sa.DataArray(from_text=args.bitext, side='target')
    else:
        e = cdec.sa.DataArray(from_text=args.target)
    e.write_binary(e_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling target data array took %f seconds',
                stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Precomputing frequent phrases')
    precompute(f_sa, *params).write_binary(precomp_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling precomputations took %f seconds',
                stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling alignment')
    a = cdec.sa.Alignment(from_text=args.alignment)
    a.write_binary(a_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling alignment took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary')
    lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
    lex.write_binary(lex_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary took %f seconds',
                stop_time - start_time)

    if args.online:
        start_time = monitor_cpu()
        logger.info('Compiling online bilexical dictionary')
        # Construct once; only the add_bitext arguments differ per input form.
        bilex = cdec.sa.online.Bilex()
        if args.bitext:
            bilex.add_bitext(args.alignment, args.bitext)
        else:
            bilex.add_bitext(args.alignment, args.source, args.target)
        bilex.write(bilex_file)
        stop_time = monitor_cpu()
        logger.info('Compiling online bilexical dictionary took %f seconds',
                    stop_time - start_time)

    # Write configuration
    for name, value in zip(param_names, params):
        config[name] = value
    config.write()
    preprocess_stop_time = monitor_cpu()
    logger.info('Overall preprocessing step took %f seconds',
                preprocess_stop_time - preprocess_start_time)