Example #1
# Stdlib imports implied by the snippet; Dictionary is presumably
# gensim.corpora.Dictionary, and the remaining names (newsgroups, NDTDataset,
# preprocess_ng, preprocess_ndt, download_file, default_dataset_path) are
# project-local helpers.
from argparse import ArgumentParser
import logging
import sys

from gensim.corpora import Dictionary


def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    # build a lazy stream of preprocessed documents for the chosen dataset
    if dataset_name == 'newsgroups':
        corpus = (preprocess_ng(doc) for doc in newsgroups.iterator(
            download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path)))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()

        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    # build the vocabulary from the token stream and write it to disk
    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
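
The function streams documents through a generator, so the corpus is never held in memory at once. A minimal sketch of the same pattern, assuming Dictionary is gensim.corpora.Dictionary and substituting inline sample documents for the project's preprocessing helpers:

from gensim.corpora import Dictionary

# stand-in for preprocess_ng/preprocess_ndt output: one token list per document
docs = [['the', 'quick', 'fox'], ['the', 'lazy', 'dog']]

# Dictionary accepts any iterable of token lists, including a generator
d = Dictionary(doc for doc in docs)

# writes the vocabulary as tab-separated id/token/document-frequency lines
d.save_as_text('vocab.txt', sort_by_word=False)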

Example #3
# Stdlib imports implied by the snippet; FEATURES_MAP, FIELDS, NDTDataset and
# train_hunpos_model are project-local names.
from argparse import ArgumentParser
from datetime import datetime
import logging
import os
import sys


def main():
    parser = ArgumentParser()
    parser.add_argument('-f', '--features')
    parser.add_argument('-m', '--model-file')
    parser.add_argument('-d', '--dataset-file')
    parser.add_argument('-l', '--language', default='nob')

    args = parser.parse_args()

    features = args.features
    model_fn = args.model_file
    dataset_fn = args.dataset_file
    lang = args.language

    if features not in FEATURES_MAP:
        logging.error('Unknown feature identifier %s (one of <%s>) ...'
                      % (features, '|'.join(FEATURES_MAP.keys())))
        sys.exit(1)

    if dataset_fn and not os.path.exists(dataset_fn):
        logging.error('Could not find NDT dataset archive %s ...' % dataset_fn)
        sys.exit(1)

    if not model_fn:
        # default model name: feature set plus a timestamp
        model_fn = 'no-ndt-hunpos-%s-%s' % (features,
                                            datetime.now().strftime('%Y-%m-%d-%H-%M'))

    if lang not in ['nob', 'nno', 'both']:
        logging.error('Unknown language %s (one of <%s>) ...'
                      % (lang, '|'.join(['nob', 'nno', 'both'])))
        sys.exit(1)

    if lang == 'both':
        lang = None

    if dataset_fn:
        dataset = NDTDataset(dataset_fn=dataset_fn, normalize_func=None, fields=FIELDS, lang=lang)
    else:
        dataset = NDTDataset(normalize_func=None, fields=FIELDS, lang=lang)
        dataset.install()

    # map each (form, pos, feats) token to a (form, normalized-tag) pair
    pos_norm_func = FEATURES_MAP[features]
    seq_gen = ([(form, pos_norm_func(form, pos, feats))
                for form, pos, feats in sent] for sent in dataset)

    stats = train_hunpos_model(seq_gen, model_fn)

    # print the stats from the hunpos output
    for k, v in stats.items():
        print('%s:\t%s' % (k, v))
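
train_hunpos_model is project-local, but the shape of seq_gen is clear from the comprehension: one list of (form, tag) pairs per sentence, with the tag produced by the selected feature function. A small self-contained sketch of that contract, using a made-up feature function in place of the real FEATURES_MAP entries:

# hypothetical feature function: keep only the coarse POS, dropping feats
def pos_only(form, pos, feats):
    return pos

FEATURES_MAP = {'pos': pos_only}  # stand-in for the real mapping

# one sentence as NDTDataset would yield it: (form, pos, feats) triples
dataset = [[('Hunden', 'subst', 'appell|mask'), ('bjeffer', 'verb', 'pres')]]

pos_norm_func = FEATURES_MAP['pos']
seq_gen = ([(form, pos_norm_func(form, pos, feats))
            for form, pos, feats in sent] for sent in dataset)

for sent in seq_gen:
    print(sent)  # [('Hunden', 'subst'), ('bjeffer', 'verb')]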
Example #4
# Stdlib imports implied by the snippet; Elasticsearch is presumably the
# elasticsearch-py client, and the *Dataset classes are project-local.
from argparse import ArgumentParser
import logging
import sys

from elasticsearch import Elasticsearch


def main():
    parser = ArgumentParser()
    parser.add_argument('-e',
                        '--elasticsearch-server',
                        default='localhost:9200')
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-s', '--sections')
    opts = parser.parse_args()

    es_hosts = [opts.elasticsearch_server]
    dataset_name = opts.dataset
    dataset_sections = opts.sections

    es = Elasticsearch(hosts=es_hosts, timeout=120)

    if dataset_name == 'newsgroups':
        dataset = NewsgroupsDataset()
    elif dataset_name == 'aviskorpus':
        sections = None
        sources = None

        if dataset_sections:
            # spec format: '<sec>|<sec>-<source>|<source>'
            try:
                sections, sources = dataset_sections.split('-')
                sections = [int(s) for s in sections.split('|')]
                sources = sources.split('|')
            except ValueError:
                logging.error('Malformed section specification "%s" ...' %
                              dataset_sections)
                sys.exit(1)

        dataset = AviskorpusDataset(sections=sections, sources=sources)
    elif dataset_name == 'ndt':
        sections = None
        lang = None

        if dataset_sections:
            # spec format: '<sec>|<sec>-<lang>|<lang>'
            try:
                sections, lang = dataset_sections.split('-')
                sections = [int(s) for s in sections.split('|')]
                lang = lang.split('|')
            except ValueError:
                logging.error('Malformed section specification "%s" ...' %
                              dataset_sections)
                sys.exit(1)

        dataset = NDTDataset(lang=lang, sections=sections)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    dataset.install(es)
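
The aviskorpus and ndt branches parse the same '<sections>-<parts>' spec, so the parsing could be hoisted into a shared helper. A sketch under that assumption (parse_sections_spec is a hypothetical name, not part of the original code):

def parse_sections_spec(spec):
    """Split a '<sec>|<sec>-<part>|<part>' spec, e.g. '1|2-a|b', into
    ([1, 2], ['a', 'b']). Raises ValueError on malformed input."""
    sections, parts = spec.split('-')
    return [int(s) for s in sections.split('|')], parts.split('|')

# usage mirroring the branches above:
#   sections, sources = parse_sections_spec(dataset_sections)  # aviskorpus
#   sections, lang = parse_sections_spec(dataset_sections)     # ndt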