def main():
    """Train a classifier on one SQLite database and classify another.

    Command-line entry point. Exactly three positional arguments are
    required, in this order: the training SQLite database, the test SQLite
    database, and the name handed to ``spacy.load``. The parser's
    ``temporary_dir`` option selects where intermediate files are written;
    when it is not given, the directory of the training database is used,
    falling back to the current directory for a bare file name.

    Invalid arguments are reported through ``parser.error``.
    """
    parser = create_parser()
    opts, args = parser.parse_args()
    if len(args) != 3:
        parser.error("invalid number of arguments")
    training_sqlite3, test_sqlite3, process_language = args
    print("Arguments: %s %s %s"
          % (training_sqlite3, test_sqlite3, process_language))

    # P.dirname() yields "" for a bare file name, and P.isdir("") is False,
    # so without the final fallback such an invocation was rejected with a
    # confusing "directory  does not exist" error.
    temporary_dir = (opts.temporary_dir
                     or P.dirname(training_sqlite3)
                     or ".")
    if not P.isdir(temporary_dir):
        parser.error("error: temporary directory %s does not exist"
                     % temporary_dir)

    def log(message):
        """Print *message* prefixed with the current ISO-8601 timestamp."""
        print("[%s] %s" % (datetime.datetime.now().isoformat(), message))

    nlp = spacy.load(process_language)

    #
    # Training section
    #
    log("Preparing SQLite training database")
    # NOTE(review): the database arguments are joined onto temporary_dir.
    # When temporary_dir was derived from a relative training path that
    # already contains a directory, that directory component is repeated;
    # confirm whether arguments are meant to be absolute or relative to
    # temporary_dir.
    training_sqlite = P.join(temporary_dir, training_sqlite3)
    prep_sqlite(training_sqlite)

    log("Calculating dimensions")
    calc_dim(nlp, training_sqlite, 0, False)

    log("Excluding dimensions")
    # The spaCy-based stop-word exclusion is applied for every language;
    # an earlier English-only special case is no longer used.
    exclude_stopwords_spacy(nlp, training_sqlite, process_language)
    exclude_non_alpha_partial(training_sqlite)
    exclude_unigrams_shorter_than(training_sqlite, 3)
    exclude_ngrams_shorter_than(training_sqlite, 1)

    log("Pruning excluded dimensions")
    prune(training_sqlite)

    log("Indexing training database")
    index(training_sqlite, nlp)

    log("Running mRMR algorithm to select features")
    mrmr(training_sqlite, temporary_dir)

    # Second prune pass, run after the mRMR step.
    log("Pruning excluded dimensions (again)")
    prune(training_sqlite)

    log("Outputting training samples to temporary data file")
    training_samples = P.join(temporary_dir, "training-samples.dat")
    svmvec(training_sqlite, training_samples)

    log("Training classifier")
    classifier = P.join(temporary_dir, "classifier.svm")
    learn(training_sqlite, training_samples, classifier)

    #
    # Test section
    #
    log("Preparing SQLite test database")
    test_sqlite = P.join(temporary_dir, test_sqlite3)
    prep_sqlite(test_sqlite)

    # The test database reuses the dimensions of the training database.
    log("Copying dimensions from training database to test database")
    copy_dim(training_sqlite, test_sqlite)

    log("Indexing test database")
    index(test_sqlite, nlp)

    log("Outputting test samples to temporary data file")
    test_samples = P.join(temporary_dir, "test-samples.dat")
    svmvec(test_sqlite, test_samples)

    log("Classifying test samples")
    classify(test_sqlite, test_samples, classifier, False, temporary_dir)
def main():
    """Train on one MS Access database, then rank or classify another.

    Command-line entry point. Exactly three positional arguments are
    required, in this order: the mode (``"rank"`` or ``"classify"``), the
    training MS Access database and the test MS Access database. Both
    databases are converted to SQLite files inside the temporary directory,
    which is the parser's ``temporary_dir`` option or, when that is not
    given, the directory of the training database (the current directory
    for a bare file name).

    Invalid arguments are reported through ``parser.error``.
    """
    parser = create_parser()
    opts, args = parser.parse_args()
    if len(args) != 3:
        parser.error("invalid number of arguments")
    mode, training_mdb, test_mdb = args
    if mode not in ("rank", "classify"):
        parser.error("invalid mode: %s" % mode)

    # P.dirname() yields "" for a bare file name, and P.isdir("") is False,
    # so without the final fallback such an invocation was rejected.
    temporary_dir = (opts.temporary_dir
                     or P.dirname(training_mdb)
                     or ".")
    if not P.isdir(temporary_dir):
        # repr() replaces the Python 2-only backtick syntax; same output.
        parser.error("error: temporary directory %s does not exist"
                     % repr(temporary_dir))

    def log(message):
        """Print *message* prefixed with the current ISO-8601 timestamp."""
        # Single pre-formatted argument: valid and identical on Python 2
        # (print statement) and Python 3 (print function).
        print("[%s] %s" % (datetime.datetime.now().isoformat(), message))

    #
    # Training section
    #
    log("Converting training database from MS Access to SQLite")
    training_sqlite = P.join(
        temporary_dir,
        P.splitext(P.basename(training_mdb))[0] + ".sqlite3")
    mdb2sqlite(training_mdb, training_sqlite, zero_score=False)

    log("Calculating dimensions")
    calc_dim(training_sqlite)

    log("Excluding dimensions")
    exclude_stopwords(training_sqlite)
    exclude_non_alpha_partial(training_sqlite)
    exclude_shorter_than(training_sqlite, 3)

    log("Pruning excluded dimensions")
    prune(training_sqlite)

    log("Indexing training database")
    index(training_sqlite, IndexingOptions())

    log("Running mRMR algorithm to select features")
    mrmr(training_sqlite, temporary_dir)

    # Second prune pass, run after the mRMR step.
    log("Pruning excluded dimensions (again)")
    prune(training_sqlite)

    log("Outputting training samples to temporary data file")
    training_samples = P.join(temporary_dir, "training-samples.dat")
    svmvec(training_sqlite, training_samples)

    log("Selecting best value for C parameter")
    best_c = parameter_selection(training_samples)

    log("Training classifier")
    classifier = P.join(temporary_dir, "classifier.svm.dlib")
    trainer(training_samples, classifier, best_c)

    #
    # Ranking section
    #
    log("Converting test database from MS Access to SQLite")
    test_sqlite = P.join(
        temporary_dir,
        P.splitext(P.basename(test_mdb))[0] + ".sqlite3")
    mdb2sqlite(test_mdb, test_sqlite, zero_score=True)

    # The test database reuses the dimensions of the training database.
    log("Copying dimensions from training database to test database")
    copy_dim(training_sqlite, test_sqlite)

    log("Indexing test database")
    index(test_sqlite, IndexingOptions())

    log("Outputting test samples to temporary data file")
    test_samples = P.join(temporary_dir, "test-samples.dat")
    svmvec(test_sqlite, test_samples)

    if mode == "rank":
        log("Ranking test samples")
        ranker(training_samples, test_samples, best_c)
    elif mode == "classify":
        log("Classifying test samples")
        dlib_classifier(test_sqlite, test_samples, classifier)
        #
        # This won't work on OS/X since the ODBC driver is read-only :(
        #
        log("Copying scores to test database (MS Access)")
        copy_scores(test_sqlite, test_mdb)
    else:
        # mode was validated right after argument parsing, so this branch
        # is unreachable; raise explicitly so the guard survives ``-O``.
        raise AssertionError("unreachable: unexpected mode %r" % (mode,))