    # cluster per-keystroke energies into two groups, starting from the loudest keystroke
    f = get_features_energy(data, starts, ends)
    clusts, means, _ = clusterize_inner(f, [np.argmax(f)], num_clusters=2)
    # spaces are taken to be the cluster with the higher mean energy
    space_clust = 0
    if means[0] < means[1]:
        space_clust = 1
    spaces = [i for i in range(len(starts)) if clusts[i] == space_clust]
    return spaces
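
# A minimal, self-contained illustration of the two-way clustering used above
# (this is NOT the project's clusterize_inner from mlalgs): per-keystroke
# energies, assumed here to be a 1-D array, are split into a low-energy and a
# high-energy cluster with plain 1-D k-means, seeded at the loudest keystroke.
def _two_cluster_energy_sketch(f, seed_idx, num_iters=20):
    import numpy as np
    f = np.asarray(f, dtype=float)
    centers = np.array([f.min(), f[seed_idx]])
    for _ in range(num_iters):
        # assign each keystroke to the nearest of the two centers
        labels = np.argmin(np.abs(f[:, None] - centers[None, :]), axis=1)
        # move each center to the mean of its assigned energies
        for k in range(2):
            if np.any(labels == k):
                centers[k] = f[labels == k].mean()
    return labels, centers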


###################################################################
# Constants and debugging control
###################################################################

DATA_FILES = {
    "sound": "data/sound7.wav",  # sound1.wav
    "text": "data/text7.txt",  # text1.txt
}

rate, data, text = load_data(DATA_FILES["sound"], DATA_FILES["text"])

# fraction of the recording to use, as (start, end) proportions of its length
file_range = (0, 1.0)

SOFT_CLUSTER = True
USE_PCA = True
DEBUG = False
# Names of the plots to actually draw (uncomment entries to enable them).
PLOT_SET = set(
    [
        #'Segmentation Plot',
        #'Features for Space',
        #'Features for %s' % text[0],
        #'Feature vector principal components',
    ]
)
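
# Hypothetical helper (not present in the original code), illustrating how
# PLOT_SET is meant to be used: a named plot is only drawn when its name has
# been added to PLOT_SET above. The name `maybe_plot` is an assumption.
def maybe_plot(name, plot_fn):
    if name in PLOT_SET:
        plot_fn()
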
'''Aggregate several sound/text recording pairs into a single combined pair.

The first argument is the suffix of the combined output files
(soundN.wav / textN.txt); the remaining arguments are the input sound files.

Example: ./aggregate 7 sound7.*.wav
will aggregate sound files
sound7.1.wav, sound7.2.wav, etc.
with corresponding text files
text7.1.txt, text7.2.txt, etc.
The output aggregation is guaranteed to have perfect segmentation, because any
input pair whose detected keystroke count does not match its transcript is skipped.
'''
import sys
from mlalgs import (load_data, get_chunk_starts)
import scipy.io.wavfile
import numpy as np

tot = []
tot_txt = []
tot_chars = 0
for f in sys.argv[2:]:
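    # derive the matching text file name from the sound file name,
    # e.g. 'sound7.1.wav' -> 'text7.1.txt'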
    text_file = 'text' + f[5:-3] + 'txt'
    rate, data, text = load_data(f, text_file)
    starts, _, _ = get_chunk_starts(data)
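    # reject pairs where the number of detected keystroke chunks does not match
    # the transcript length; this is what guarantees perfect segmentation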
    if len(starts) != len(text):
        print '%s rejected: %d != %d' % (f, len(starts), len(text))
        continue
    tot_chars += len(text)
    tot.append(data)
    tot_txt.append(text)

print 'Created data file with %d characters' % tot_chars
scipy.io.wavfile.write('sound%s.wav' % sys.argv[1], rate, np.concatenate(tot))
with open('text%s.txt' % sys.argv[1], "w") as f:
    f.write(''.join(tot_txt))
        pred.append(letters[minj])
        real.append(c)
        if minj == letters.index(c):
            score += 1
    print ''.join(pred)
    print ''.join(real)
    return means, stds, score/float(len(test))
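
# The fragment above appears to be the tail of naive_bayes (its return matches
# the call in __main__ below). A minimal, self-contained sketch of the scoring
# loop it implies is given here; the variance-normalized distance used to pick
# `minj` and the shape of `test` (pairs of true character and feature vector)
# are assumptions, not the original implementation.
def _naive_bayes_scoring_sketch(test, letters, means, stds):
    import numpy as np
    pred, real, score = [], [], 0
    for c, feat in test:
        # pick the letter whose per-feature mean is closest in units of its std
        dists = [np.sum(((feat - means[j]) / stds[j]) ** 2)
                 for j in range(len(letters))]
        minj = int(np.argmin(dists))
        pred.append(letters[minj])
        real.append(c)
        if minj == letters.index(c):
            score += 1
    return pred, real, score / float(len(test))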

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print 'Usage: %s training|test soundf textf' % sys.argv[0]
        sys.exit(1)

    soundf = sys.argv[2]
    textf = sys.argv[3]

    rate, data, text = load_data(soundf, textf)
    starts, ends, chunks = get_chunk_starts(data)
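    # build one feature vector per detected keystroke (FFT and cepstrum features enabled)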
    f = get_features(data, starts, ends, include_fft=True, include_cepstrum=True)

    if sys.argv[1] == 'training':
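        # training mode: fit and score Naive Bayes, logistic regression, and an SVM
        # on this labeled recording, caching the logistic model for test mode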
        means, stds, score = naive_bayes(text, f)
        print 'Naive Bayes', score

        logreg_score, logreg = logistic_test(text, f)
        svm_score, svm = svm_test(text, f)
        joblib.dump(logreg, 'cache/logistic.pkl')
        print 'Logistic test', logreg_score
        print 'SVM test', svm_score
    else:
        try:
            logreg = joblib.load('cache/logistic.pkl')