Ejemplo n.º 1
0
def main():
    """
    Command-line entry point: build a dictionary from a dataset and save it.

    Parses ``--dataset``, ``--dataset-path`` and ``--output``, builds a lazy
    corpus of preprocessed documents for the selected dataset, feeds it to
    ``Dictionary`` and writes the result as text to the output file.

    Exits with status 1 (after logging an error) when ``--output`` or
    ``--dataset`` is missing, or when the dataset name is not recognised.

    :rtype : None
    """
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    # The branches must form a single if/elif/else chain: with two separate
    # ``if`` statements the ``else`` only pairs with the 'ndt' test, so a
    # valid 'newsgroups' run would be reported as an unknown dataset.
    if dataset_name == 'newsgroups':
        archive_fn = download_file(
            newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path)
        corpus = (preprocess_ng(doc) for doc in newsgroups.iterator(archive_fn))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()

        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...', dataset_name)
        sys.exit(1)

    # The corpus is a generator, so documents are preprocessed one at a time
    # as Dictionary consumes them.
    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
def main():
    """
    Build a dictionary from the selected dataset and save it as text.

    Command-line options:

    * ``-d/--dataset``: dataset name, either ``newsgroups`` or ``ndt``.
    * ``-p/--dataset-path``: dataset location; defaults to
      ``default_dataset_path()``.
    * ``-o/--output``: file the dictionary is written to.

    The process exits with status 1 if ``--output`` or ``--dataset`` is
    missing, or if the dataset name is unknown.

    :rtype : None
    """
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    if dataset_name == 'newsgroups':
        corpus = (preprocess_ng(doc) for doc
                  in newsgroups.iterator(download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path)))
    # ``elif`` (not a second ``if``) so the ``else`` below is reached only
    # when neither dataset name matched; otherwise 'newsgroups' would fall
    # through to the "Unknown dataset" exit.
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()

        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...', dataset_name)
        sys.exit(1)

    # Documents are preprocessed lazily while the dictionary is being built.
    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
def install_hunpos():
    """
    Download and install the HunPos binaries for the current platform.

    The archive registered in ``HUNPOS_URL_MAP`` for ``sys.platform`` is
    downloaded into the ``models`` directory under the project path,
    unpacked there, and the downloaded archive is then deleted.

    :raises KeyError: if ``HUNPOS_URL_MAP`` has no entry for ``sys.platform``.
    :rtype : None
    """
    models_dir = os.path.join(project_path(), 'models')

    hunpos_archive_fn = download_file(HUNPOS_URL_MAP[sys.platform], models_dir)

    if sys.platform == 'win32':
        with ZipFile(hunpos_archive_fn) as f:
            f.extractall(models_dir)
    else:
        # TarFile.open() detects gzip/bz2/xz compression transparently; the
        # bare TarFile(...) constructor can only read uncompressed archives.
        with TarFile.open(hunpos_archive_fn) as f:
            f.extractall(models_dir)

    os.remove(hunpos_archive_fn)
Ejemplo n.º 4
0
def install_hunpos():
    """
    Downloads and installs system appropriate HunPos binaries in the default location.

    The platform-specific archive is looked up in ``HUNPOS_URL_MAP`` by
    ``sys.platform``, downloaded into ``<project path>/models``, extracted
    into that same directory and finally removed.

    :raises KeyError: if ``sys.platform`` is not a key of ``HUNPOS_URL_MAP``.
    :rtype : None
    """
    models_dir = os.path.join(project_path(), 'models')

    hunpos_archive_fn = download_file(HUNPOS_URL_MAP[sys.platform], models_dir)

    if sys.platform == 'win32':
        with ZipFile(hunpos_archive_fn) as archive:
            archive.extractall(models_dir)
    else:
        # Use the TarFile.open() alternative constructor rather than
        # TarFile(...): only open() handles compressed tarballs, and it still
        # accepts plain uncompressed tar files.
        with TarFile.open(hunpos_archive_fn) as archive:
            archive.extractall(models_dir)

    # The archive is no longer needed once its contents are unpacked.
    os.remove(hunpos_archive_fn)