from sklearn import metrics
import numpy as np
import multiprocessing as mp
from threading import Thread

# Demo script: load the collection found at unsec.SMALL_DATASET_PATH, cluster
# it, and print every email of every resulting cluster to stdout.

# Disabled: turn on INFO-level logging for the run.
# logging.basicConfig(level=logging.INFO)

collection = EmailCollection(unsec.SMALL_DATASET_PATH)

# Disabled: language filter on the collection.
# collection.keep_lang("fr")

# Build the two strategy objects first (algorithm, then vectorizer) and hand
# them to the clusterizer together with the collection.
clustering_algo = HierarchicalAlgo()
text_vectorizer = TfidfVectorizer()
engine = Clusterizer(
    collection,
    target="both",
    algorithm=clustering_algo,
    vectorizer=text_vectorizer,
)
engine.compute()

# One banner per cluster, then subject / body / cleaned text for each email.
for cluster in engine.clusters:
    print("==cluster==")
    for message in cluster:
        print("---email---")
        print("subject :", message.get_subject())
        print("body :", message.get_body())
        print("clean :", message.clean)
log = logging.getLogger(__name__) log.info(args.config+" has been load as configuration") else: logging.disable(logging.NOTSET) collection = EmailCollection() collection.add_from_directory(cfg.PATH) if hasattr(cfg,"LANG"): collection.keep_lang("fr") engine = Clusterizer(collection) engine.vectorizer = getattr(cfg,"VECTORIZER", LogicVectorizer()) engine.algorithm = getattr(cfg,"ALGORITHM", HierarchicalAlgo()) engine.algorithm.n_clusters = getattr(cfg,"N_CLUSTERS", 3) engine.target = getattr(cfg,"TARGET", "both") if getattr(cfg,"ENABLE_TEST", False): assert hasattr(cfg,"TEST_CLUSTERING_RANGE"), "TEST_CLUSTERING_RANGE has not been defined" test_folder = getattr(cfg,"TEST_FOLDER", "results") if not os.path.exists(test_folder): os.makedirs(test_folder)