def get_all_document_features(include_unigrams=False, remove_pos=False):
    """
    Finds all noun-noun and adj-noun compounds (and optionally adjs and nouns)
    in all labelled corpora mentioned in the conf files.

    :param include_unigrams: if False, only NPs will be returned
    :param remove_pos: whether to remove PoS tags if present, result will be
        either "cat/N" or "cat"
    :rtype: set of DocumentFeature (or set of str when remove_pos is True)
    """
    result = set()
    # 1-GRAM features are only accepted when unigrams were explicitly requested
    accepted_df_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'} if include_unigrams \
        else {'AN', 'NN', 'VO', 'SVO'}
    for corpus_name, _ in get_all_corpora():
        # feature lists live next to this module, one file per corpus
        path = os.path.abspath(os.path.join(__file__, '..', '..', '..', ROOT,
                                            '%s_all_features.txt' % corpus_name))
        with open(path) as infile:
            for line in infile:
                df = DocumentFeature.from_string(line.strip())
                if df.type in accepted_df_types:
                    if remove_pos:
                        # todo these are of type str, in the other branch it's
                        # DocumentFeature. things will likely break
                        result.add(df.ngram_separator.join(t.text for t in df.tokens))
                    else:
                        result.add(df)

    logging.info('Found a total of %d features in all corpora', len(result))
    if not remove_pos:
        # Counter over df.type only works while result holds DocumentFeature
        # objects, i.e. when PoS tags were kept
        logging.info('Their types are %r', Counter(df.type for df in result))
    if include_unigrams:
        logging.info('PoS tags of unigrams are %r',
                     Counter(df.tokens[0].pos for df in result if df.type == '1-GRAM'))
    else:
        logging.info('Unigram features not included!')
    return result
def jsonify_all_labelled_corpora(n_jobs, *args, **kwargs):
    """
    Convert every labelled corpus returned by get_all_corpora() to JSON,
    processing corpora in parallel.

    :param n_jobs: number of concurrent joblib workers
    :param args: extra positional arguments forwarded to
        jsonify_single_labelled_corpus after the corpus path
    :param kwargs: keyword arguments forwarded verbatim to
        jsonify_single_labelled_corpus
    """
    corpora = get_all_corpora()
    logging.info('Converting the following corpora to JSON: %r', [c[0] for c in corpora])
    # get_all_corpora() yields (name, path) pairs and jsonify_single_labelled_corpus
    # takes the corpus *path* as its first argument (see the --id branch of the CLI).
    # Splatting the whole pair — the previous *(path + args) — passed the corpus
    # name where the path belongs and shifted every later argument by one.
    Parallel(n_jobs=n_jobs)(delayed(jsonify_single_labelled_corpus)(path, *args, **kwargs)
                            for _, path in corpora)
# NOTE(review): this line is truncated — it opens with the keyword argument
# format="..." whose enclosing call (presumably logging.basicConfig(...)) begins
# before the visible source. Left byte-identical; confirm the missing call head
# against the full file before editing.
# The remainder is the CLI entry point: it parses --conf/--jobs/--write-features
# plus a mutually exclusive --all / --id, then either JSON-ifies every labelled
# corpus in parallel or just the corpus at position --id in get_all_corpora().
format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s") parser = argparse.ArgumentParser() parser.add_argument('--conf', type=is_valid_file, required=True, help='Conf file that contains the parameters of the tokenizer') parser.add_argument('--jobs', type=int, default=4, help='Number of concurrent jobs') parser.add_argument('--write-features', action='store_true', default=False, help='Whether to store a set of all features in a range of formats') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--all', action='store_true', default=False, help='Whether to compress ALL available labelled data sets or just one at a time') group.add_argument('--id', type=int, help='If labelled data, compress just the labelled corpus at this position ' 'in the predefined list. If unlabelled compress just ' 'this thesaurus id in the database (must have been populated)') parameters = parser.parse_args() if parameters.all: jsonify_all_labelled_corpora(parameters.jobs, parameters.conf, write_feature_set=parameters.write_features) else: jsonify_single_labelled_corpus(get_all_corpora()[parameters.id][1], parameters.conf, write_feature_set=parameters.write_features)