def get_all_document_features(include_unigrams=False, remove_pos=False):
    """
    Finds all noun-noun and adj-noun compounds (and optionally adjs and nouns)
    in all labelled corpora mentioned in the conf files.

    :param include_unigrams: if False, only NPs will be returned
    :param remove_pos: whether to remove PoS tags if present, result will be
        either "cat/N" or "cat"
    :rtype: set of DocumentFeature (or set of str when remove_pos is True)
    """
    result = set()
    # 1-GRAM features are only accepted when unigrams were explicitly requested
    accepted_df_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'} if include_unigrams \
        else {'AN', 'NN', 'VO', 'SVO'}
    for corpus_name, _ in get_all_corpora():
        # feature lists live next to this module, one file per corpus
        path = os.path.abspath(os.path.join(__file__, '..', '..', '..', ROOT,
                                            '%s_all_features.txt' % corpus_name))
        with open(path) as infile:
            for line in infile:
                df = DocumentFeature.from_string(line.strip())
                if df.type in accepted_df_types:
                    if remove_pos:
                        # todo these are of type str, in the other branch it's
                        # DocumentFeature. things will likely break
                        result.add(df.ngram_separator.join(t.text for t in df.tokens))
                    else:
                        result.add(df)

    logging.info('Found a total of %d features in all corpora', len(result))
    if not remove_pos:
        # Counter over df.type only works while result holds DocumentFeature
        # objects, i.e. when PoS tags were kept
        logging.info('Their types are %r', Counter(df.type for df in result))
    if include_unigrams:
        logging.info('PoS tags of unigrams are %r',
                     Counter(df.tokens[0].pos for df in result if df.type == '1-GRAM'))
    else:
        logging.info('Unigram features not included!')
    return result
def jsonify_all_labelled_corpora(n_jobs, *args, **kwargs):
    """
    Convert every labelled corpus returned by get_all_corpora() to JSON,
    processing corpora in parallel.

    :param n_jobs: number of concurrent joblib workers
    :param args: extra positional arguments forwarded to
        jsonify_single_labelled_corpus after the corpus path
    :param kwargs: keyword arguments forwarded verbatim to
        jsonify_single_labelled_corpus
    """
    corpora = get_all_corpora()
    logging.info('Converting the following corpora to JSON: %r', [c[0] for c in corpora])
    # get_all_corpora() yields (name, path) pairs and jsonify_single_labelled_corpus
    # takes the corpus *path* as its first argument (see the --id branch of the CLI).
    # Splatting the whole pair — the previous *(path + args) — passed the corpus
    # name where the path belongs and shifted every later argument by one.
    Parallel(n_jobs=n_jobs)(delayed(jsonify_single_labelled_corpus)(path, *args, **kwargs)
                            for _, path in corpora)
# NOTE(review): this line is truncated — it opens with the keyword argument
# format="..." whose enclosing call (presumably logging.basicConfig(...)) begins
# before the visible source. Left byte-identical; confirm the missing call head
# against the full file before editing.
# The remainder is the CLI entry point: it parses --conf/--jobs/--write-features
# plus a mutually exclusive --all / --id, then either JSON-ifies every labelled
# corpus in parallel or just the corpus at position --id in get_all_corpora().
format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s") parser = argparse.ArgumentParser() parser.add_argument('--conf', type=is_valid_file, required=True, help='Conf file that contains the parameters of the tokenizer') parser.add_argument('--jobs', type=int, default=4, help='Number of concurrent jobs') parser.add_argument('--write-features', action='store_true', default=False, help='Whether to store a set of all features in a range of formats') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--all', action='store_true', default=False, help='Whether to compress ALL available labelled data sets or just one at a time') group.add_argument('--id', type=int, help='If labelled data, compress just the labelled corpus at this position ' 'in the predefined list. If unlabelled compress just ' 'this thesaurus id in the database (must have been populated)') parameters = parser.parse_args() if parameters.all: jsonify_all_labelled_corpora(parameters.jobs, parameters.conf, write_feature_set=parameters.write_features) else: jsonify_single_labelled_corpus(get_all_corpora()[parameters.id][1], parameters.conf, write_feature_set=parameters.write_features)