Ejemplo n.º 1
0
def main():
    """Run the train-then-score pipeline from the command line.

    Exactly three positional arguments are expected: the mode ("rank" or
    "classify"), the training database path and the test database path.
    Intermediate files (the SQLite copies, the sample files and the trained
    classifier) are written to ``opts.temporary_dir`` or, when that option is
    not set, to the directory that contains the training database.

    Invalid arguments are reported through ``parser.error``, which is assumed
    to terminate the program (the parser is built by ``create_parser``,
    defined elsewhere).
    """
    parser = create_parser()
    opts, args = parser.parse_args()
    if len(args) != 3:
        parser.error("invalid number of arguments")

    mode, training_mdb, test_mdb = args
    if mode not in ("rank", "classify"):
        parser.error("invalid mode: %s" % mode)

    # P.dirname() yields "" for a bare file name; fall back to "." so that a
    # training database in the current directory passes the isdir() check.
    temporary_dir = opts.temporary_dir or P.dirname(training_mdb) or "."
    if not P.isdir(temporary_dir):
        parser.error("error: temporary directory %s does not exist" % repr(temporary_dir))

    def log(message):
        """Print *message* prefixed with the current ISO-8601 timestamp."""
        # A single parenthesised argument prints identically on Python 2 and 3.
        print("[%s] %s" % (datetime.datetime.now().isoformat(), message))

    #
    # Training section
    #
    log("Converting training database from MS Access to SQLite")
    training_sqlite = P.join(temporary_dir, P.splitext(P.basename(training_mdb))[0] + ".sqlite3")
    mdb2sqlite(training_mdb, training_sqlite, zero_score=False)

    log("Calculating dimensions")
    calc_dim(training_sqlite)
    log("Excluding dimensions")
    exclude_stopwords(training_sqlite)
    exclude_non_alpha_partial(training_sqlite)
    exclude_shorter_than(training_sqlite, 3)
    log("Pruning excluded dimensions")
    prune(training_sqlite)
    log("Indexing training database")
    index(training_sqlite, IndexingOptions())
    log("Running mRMR algorithm to select features")
    mrmr(training_sqlite, temporary_dir)
    # mRMR runs between the two prune() calls, hence the second pass.
    log("Pruning excluded dimensions (again)")
    prune(training_sqlite)

    log("Outputting training samples to temporary data file")
    training_samples = P.join(temporary_dir, "training-samples.dat")
    svmvec(training_sqlite, training_samples)

    log("Selecting best value for C parameter")
    best_c = parameter_selection(training_samples)

    log("Training classifier")
    classifier = P.join(temporary_dir, "classifier.svm.dlib")
    trainer(training_samples, classifier, best_c)

    #
    # Ranking section
    #
    log("Converting test database from MS Access to SQLite")
    test_sqlite = P.join(temporary_dir, P.splitext(P.basename(test_mdb))[0] + ".sqlite3")
    mdb2sqlite(test_mdb, test_sqlite, zero_score=True)

    log("Copying dimensions from training database to test database")
    copy_dim(training_sqlite, test_sqlite)

    log("Indexing test database")
    index(test_sqlite, IndexingOptions())

    log("Outputting test samples to temporary data file")
    test_samples = P.join(temporary_dir, "test-samples.dat")
    svmvec(test_sqlite, test_samples)

    if mode == "rank":
        log("Ranking test samples")
        ranker(training_samples, test_samples, best_c)
    elif mode == "classify":
        log("Classifying test samples")
        dlib_classifier(test_sqlite, test_samples, classifier)
        #
        # This won't work on OS/X since the ODBC driver is read-only :(
        #
        log("Copying scores to test database (MS Access)")
        copy_scores(test_sqlite, test_mdb)
    else:
        # Unreachable: mode was validated above.  Raised explicitly rather
        # than via ``assert`` so the guard survives ``python -O``.
        raise AssertionError("unexpected mode: %r" % mode)
Ejemplo n.º 2
0
def main():
    """Train a classifier on one SQLite database and classify another.

    Exactly three positional arguments are expected: the training SQLite
    database, the test SQLite database and the name passed to ``spacy.load``.
    Intermediate files (the sample files and the trained classifier) are
    written to ``opts.temporary_dir`` or, when that option is not set, to the
    directory that contains the training database.

    Invalid arguments are reported through ``parser.error``, which is assumed
    to terminate the program (the parser is built by ``create_parser``,
    defined elsewhere).
    """
    parser = create_parser()
    opts, args = parser.parse_args()
    if len(args) != 3:
        parser.error("invalid number of arguments")

    training_sqlite3, test_sqlite3, process_language = args

    print("Arguments: %s %s %s" %
          (training_sqlite3, test_sqlite3, process_language))

    # P.dirname() yields "" for a bare file name; fall back to "." so that a
    # training database in the current directory passes the isdir() check.
    temporary_dir = opts.temporary_dir or P.dirname(training_sqlite3) or "."
    if not P.isdir(temporary_dir):
        parser.error("error: temporary directory %s does not exist" %
                     temporary_dir)

    def log(message):
        """Print *message* prefixed with the current ISO-8601 timestamp."""
        print("[%s] %s" % (datetime.datetime.now().isoformat(), message))

    nlp = spacy.load(process_language)

    #
    # Training section
    #
    log("Preparing SQLite training database")
    # NOTE(review): assuming P is os.path, a relative argument with a directory
    # part (e.g. "data/train.db") combined with the dirname() default above
    # yields "data/data/train.db" here -- confirm the intended usage.
    training_sqlite = P.join(temporary_dir, training_sqlite3)
    prep_sqlite(training_sqlite)

    log("Calculating dimensions")
    calc_dim(nlp, training_sqlite, 0, False)
    log("Excluding dimensions")

    exclude_stopwords_spacy(nlp, training_sqlite, process_language)

    exclude_non_alpha_partial(training_sqlite)
    exclude_unigrams_shorter_than(training_sqlite, 3)
    exclude_ngrams_shorter_than(training_sqlite, 1)
    log("Pruning excluded dimensions")
    prune(training_sqlite)
    log("Indexing training database")

    index(training_sqlite, nlp)

    log("Running mRMR algorithm to select features")
    mrmr(training_sqlite, temporary_dir)
    # mRMR runs between the two prune() calls, hence the second pass.
    log("Pruning excluded dimensions (again)")
    prune(training_sqlite)

    log("Outputting training samples to temporary data file")
    training_samples = P.join(temporary_dir, "training-samples.dat")
    svmvec(training_sqlite, training_samples)

    log("Training classifier")
    classifier = P.join(temporary_dir, "classifier.svm")
    learn(training_sqlite, training_samples, classifier)

    #
    # Test section
    #
    log("Preparing SQLite test database")
    test_sqlite = P.join(temporary_dir, test_sqlite3)
    prep_sqlite(test_sqlite)

    log("Copying dimensions from training database to test database")
    copy_dim(training_sqlite, test_sqlite)

    log("Indexing test database")

    index(test_sqlite, nlp)

    log("Outputting test samples to temporary data file")
    test_samples = P.join(temporary_dir, "test-samples.dat")
    svmvec(test_sqlite, test_samples)

    log("Classifying test samples")
    classify(test_sqlite, test_samples, classifier, False, temporary_dir)
Ejemplo n.º 3
0
def main():
    """Classify a test SQLite database using an already-prepared training one.

    Exactly three positional arguments are expected: the training SQLite
    database, the test SQLite database and the name passed to ``spacy.load``.
    Before any processing, the training database is checked for ``Dimensions``
    and ``Parameters`` tables that exist, have the queried columns and return
    at least one row; if any check fails a message is printed and the function
    returns without classifying.

    The classifier is read from ``classifier.svm`` in ``opts.temporary_dir``
    or, when that option is not set, in the directory that contains the
    training database.  Invalid arguments are reported through
    ``parser.error``, which is assumed to terminate the program (the parser is
    built by ``create_parser``, defined elsewhere).
    """
    parser = create_parser()
    opts, args = parser.parse_args()
    if len(args) != 3:
        parser.error("invalid number of arguments")

    training_sqlite3, test_sqlite3, process_language = args

    print("Arguments: %s %s %s" % (training_sqlite3, test_sqlite3, process_language))

    # P.dirname() yields "" for a bare file name; fall back to "." so that a
    # training database in the current directory passes the isdir() check.
    temporary_dir = opts.temporary_dir or P.dirname(training_sqlite3) or "."
    if not P.isdir(temporary_dir):
        parser.error("error: temporary directory %s does not exist" % temporary_dir)

    def log(message):
        """Print *message* prefixed with the current ISO-8601 timestamp."""
        print("[%s] %s" % (datetime.datetime.now().isoformat(), message))

    #
    # Check all tables are valid
    #
    for database in (training_sqlite3, test_sqlite3):
        if not P.isfile(database):
            print("File not exist, ", database)
            return

    conn = sqlite3.connect(training_sqlite3)
    # The outer try/finally guarantees the connection is closed on every exit
    # path, including the early returns and any unexpected exception.
    try:
        c = conn.cursor()

        for table in ("Dimensions", "Parameters"):
            c.execute(
                "SELECT count(name) FROM sqlite_master WHERE type='table' AND name=?",
                (table,),
            )
            if c.fetchone()[0] != 1:
                print("%s table does not exist." % table)
                return

        # Selecting the named columns doubles as a schema check: a missing
        # column raises sqlite3.Error, which is reported below.
        try:
            c.execute("""SELECT DimensionId, Term, PartOfSpeech, Exclude, IDF, MRMR
                FROM Dimensions WHERE Exclude = 0""")
            if c.fetchone() is None:
                print("Dimensions table is empty")
                return

            c.execute('SELECT Name, Value FROM Parameters')
            if c.fetchone() is None:
                print("Parameters table is empty")
                return
        except sqlite3.Error as error:
            print("Table schema error:", error)
            return
    finally:
        conn.close()

    nlp = spacy.load(process_language)

    #
    # Test section
    #
    log("Preparing SQLite test database")
    prep_sqlite(test_sqlite3)

    log("Copying dimensions from training database to test database")
    copy_dim(training_sqlite3, test_sqlite3)

    log("Indexing test database")

    index(test_sqlite3, nlp)

    log("Outputting test samples to temporary data file")
    test_samples = P.join(temporary_dir, "test-samples.dat")
    svmvec(test_sqlite3, test_samples)

    log("Classifying test samples")
    classifier = P.join(temporary_dir, "classifier.svm")
    classify(test_sqlite3, test_samples, classifier, False, temporary_dir)