import pickle as pkl
from pathlib import Path

# `TfIdf` is assumed to live in the project's `text` module (the commented-out
# code below already calls `text.fit_tfidf`).
from text import TfIdf


def main(preprocessed_node_path, argument_path, dictionary_path, tfidf_path):
    # The original body read these from a module-level `args` object instead of
    # the function's own parameters; use the parameters directly.
    preprocessed_node_path = Path(preprocessed_node_path)
    argument_path = Path(argument_path)
    dictionary_path = Path(dictionary_path)
    tfidf_path = Path(tfidf_path)

    # Earlier approach (kept for reference): use the set of argument node ids
    # to select only the relevant nodes, and not train NLP models on all
    # documents.
    #argument_generator_getter = lambda: utils.load(argument_path)
    #argument_nodes_ids = set(
    #    node_id
    #    for argument in argument_generator_getter()
    #    for node_id in argument[0].values())
    #preprocessed_node_generator_getter = lambda: filter(
    #    lambda node: node['id'] in argument_nodes_ids,
    #    utils.load(preprocessed_node_path))

    # Load the previously fitted dictionary.
    with dictionary_path.open('rb') as f:
        dictionary = pkl.load(f)

    #tfidf = text.fit_tfidf(preprocessed_node_generator_getter,
    #                       dictionary,
    #                       verbose=True)

    # Fit the TF-IDF model on the dictionary's term statistics and persist it.
    tfidf = TfIdf()
    tfidf.fit(dictionary.dictionary)
    tfidf.save(tfidf_path)
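
# A minimal CLI entry point, sketched under the assumption that this script is
# invoked with the four paths as command-line arguments (the original body
# referenced an `args` object, suggesting an argparse parser existed elsewhere).
# The positional argument names below are hypothetical.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Fit a TF-IDF model from a pickled dictionary.')
    parser.add_argument('preprocessed_node_path')
    parser.add_argument('argument_path')
    parser.add_argument('dictionary_path')
    parser.add_argument('tfidf_path')
    args = parser.parse_args()

    main(args.preprocessed_node_path,
         args.argument_path,
         args.dictionary_path,
         args.tfidf_path)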