def createTrainMatrices(voc):
    '''
    Build paired image / sentence training data from the 'flickr30k' data provider.

    Consecutive pairs that share an image filename are merged into a single token
    list, which getFullSentence turns into one vector for that image. Vectors whose
    norm is not positive are dropped together with their image.
    :param voc: vocabulary; its length sizes the per-term document counter and it is
                forwarded to getFullSentence
    :return: (trainingimages, trainingsentences) as numpy arrays; entry i of both
             belongs to the same image
    '''
    stopwords = getStopwords()
    idf = np.zeros(len(voc))
    trainingimages = []
    trainingsentences = []

    def storeDocument(img_feat, tokens):
        # Vectorise the merged tokens of ONE image and keep the (image, sentence)
        # pair only when the vector is non-empty.
        sentence = getFullSentence(tokens, voc, stopwords)
        if not np.linalg.norm(sentence) > 0:
            return
        # Count, per vocabulary entry, the number of stored documents containing it.
        idf[np.asarray(sentence) > 0] += 1
        trainingimages.append(img_feat)
        trainingsentences.append(sentence)
        if len(trainingsentences) % 1000 == 0:
            print("Current pair : " + str(len(trainingsentences)))

    dp = getDataProvider('flickr30k')
    current_image = None
    current_feat = None
    current_tokens = None  # stays None until the first pair has been read
    for pair in dp.iterImageSentencePair():
        img_name = pair['image']['filename']
        new_tokens = pair['sentence']['tokens']
        # Filenames are compared by value: 'is' only tests object identity and is
        # not guaranteed to hold for two equal strings.
        if current_tokens is not None and img_name == current_image:
            current_tokens.extend(new_tokens)
            continue
        # A new image starts: flush the finished one with ITS OWN feature vector,
        # not the feature of the image that has just been read.
        if current_tokens is not None:
            storeDocument(current_feat, current_tokens)
        current_image = img_name
        current_feat = pair['image']['feat']
        current_tokens = list(new_tokens)  # copy so the provider's list is not mutated
    # The final image has no successor to trigger its flush inside the loop.
    if current_tokens is not None:
        storeDocument(current_feat, current_tokens)

    trainingimages = np.array(trainingimages)
    trainingsentences = np.array(trainingsentences)
    # NOTE(review): despite its name, 'idf' holds raw document counts, so frequent
    # terms are weighted up rather than down - confirm this is intended.
    if len(trainingsentences) > 0:
        trainingsentences = trainingsentences * idf
    return trainingimages, trainingsentences
def _saveModel(model, path):
    '''
    Pickle a model to disk.
    :param model: object to serialise
    :param path: name of the file to write
    '''
    # Pickle output is a byte stream, so the file is opened in binary mode; the
    # with-block closes the handle even when dumping fails.
    with open(path, 'wb') as handle:
        pickle.dump(model, handle)


def _augmentVectors(vectors, projections, neighbours):
    '''
    Append phi(3000, neighbours, projection) to each vector.
    :param vectors: original feature vectors
    :param projections: CCA projections, one per vector, in the same order
    :param neighbours: value passed through to phi unchanged
    :return: list with one concatenated 1-D numpy array per vector
    '''
    # assumes both the vector and phi's result can be flattened to 1-D - TODO confirm
    return [np.concatenate((np.ravel(vector),
                            np.ravel(phi(3000, neighbours, projection))))
            for vector, projection in zip(vectors, projections)]


def mainExec(name_file, features):
    '''
    Based on a list of image names and image features, learn a CCA model based on Stacked Auxiliary Embedding and
    save this model to disk.

    Two models are written to the working directory: the first CCA model to
    "ccasnippetmodel.p" and the CCA model fitted on the augmented vectors to
    "augmentedcca.p".
    :param name_file: forwarded to getImage to look up the image of a document
    :param features: forwarded to getImage together with name_file
    :return: None
    '''
    print("Creating vocabulary")
    voc = readVocabulary()
    print("Generating document vectors")
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print("Weighing vectors")
    weightedVectors = weight_tfidf(occurrenceVectors, idf)

    print("Creating matrices")
    # Rows are collected in lists and stacked once; growing the matrices with
    # np.concatenate per row copies everything on each step.
    sentenceRows = []
    imageRows = []
    for key in weightedVectors:
        if not isLargeEnough(key):
            continue
        print("current Sentence: " + str(len(sentenceRows) + 1))
        sentenceRows.append([float(value) for value in weightedVectors[key]])
        imageRows.append(getImage(key, name_file, features))
    sentenceMatrix = np.array(sentenceRows)
    imagematrix = np.array(imageRows)

    print("Modelling cca")
    cca = CCA(n_components=128)
    cca.fit(sentenceMatrix, imagematrix)
    _saveModel(cca, "ccasnippetmodel.p")

    stopwords = getStopwords()
    idf = np.zeros(len(voc))
    trainingimages = []
    trainingsentences = []
    dp = getDataProvider('flickr30k')
    currentPair = 0
    # NOTE(review): createTrainMatrices iterates dp.iterImageSentencePair(); confirm
    # that sampleImageSentencePair() really yields a sequence of pairs here.
    for pair in dp.sampleImageSentencePair():
        currentPair += 1
        if currentPair % 100 == 0:
            print("Current pair: " + str(currentPair))
        trainingimages.append(pair['image']['feat'])
        # Same call form as createTrainMatrices uses (tokens, vocabulary, stopwords).
        sentence = getFullSentence(pair['sentence']['tokens'], voc, stopwords)
        # Count, per vocabulary entry, the number of sentences containing it.
        idf[np.asarray(sentence) > 0] += 1
        trainingsentences.append(sentence)
    trainingsentences = [sentence * idf for sentence in trainingsentences]

    # The model was fitted as fit(sentences, images), so sentences must be passed
    # first here too; the projections come back in that same order.
    trans_sent, trans_img = cca.transform(trainingsentences, trainingimages)
    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)

    # list.extend() returns None, so the vectors are concatenated explicitly.
    augmented_imgs = _augmentVectors(trainingimages, trans_img, nn_img)
    augmented_sentences = _augmentVectors(trainingsentences, trans_sent, nn_sent)

    augmentedcca = CCA(n_components=96)
    augmentedcca.fit(augmented_sentences, augmented_imgs)

    # Persist the augmented model itself, not the first-stage one.
    _saveModel(augmentedcca, "augmentedcca.p")