class Preprocesser(object):
    def __init__(self):
        self.questionDict = SymbolDict()
        self.answerDict = SymbolDict(empty = True)
        self.qaDict = SymbolDict()

        self.specificDatasetDicts = None

        self.programDict = SymbolDict()
        self.programTranslator = ProgramTranslator(self.programDict, 2)
    '''
    Tokenizes a string into a list of symbols.

    Args:
        text: raw string to tokenize.
        ignoredPuncts: punctuation to remove entirely.
        keptPuncts: punctuation to keep (as separate symbols).
        endPunct: punctuation to strip if it appears at the end of the string.
        delim: delimiter between symbols.
        clean: True to apply the replacement lists to the text.
        replacelistPre: dictionary of replacements to perform on the text before tokenization.
        replacelistPost: dictionary of replacements to perform on the symbols after tokenization.
    '''
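    # Illustrative example with the default arguments:
    #   tokenize("Is there a red cube?") -> ["is", "there", "a", "red", "cube"]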
    # sentence tokenizer
    allPunct = ["?", "!", "\\", "/", ")", "(", ".", ",", ";", ":"]
    def tokenize(self, text, ignoredPuncts = ["?", "!", "\\", "/", ")", "("], 
        keptPuncts = [".", ",", ";", ":"], endPunct = [">", "<", ":"], delim = " ", 
        clean = False, replacelistPre = dict(), replacelistPost = dict()):

        if clean:
            for word in replacelistPre:
                origText = text
                text = text.replace(word, replacelistPre[word])
                if (origText != text):
                    print(origText)
                    print(text)
                    print("")

            for punct in endPunct:
                if text and text[-1] == punct:
                    print(text)
                    text = text[:-1]
                    print(text)
                    print("")

        for punct in keptPuncts:
            text = text.replace(punct, delim + punct + delim)           
        
        for punct in ignoredPuncts:
            text = text.replace(punct, "")

        ret = text.lower().split(delim)

        if clean:
            origRet = ret
            ret = [replacelistPost.get(word, word) for word in ret]
            if origRet != ret:
                print(origRet)
                print(ret)

        ret = [t for t in ret if t != ""]
        return ret


    # files interface: reads the class's generated files (instances json and pickled symbol dictionaries).
    def readFiles(self, instancesFilename):
        with open(instancesFilename, "r") as inFile:
            instances = json.load(inFile)
            print("read instances file")
        with open(config.questionDictFile(), "rb") as inFile:
            print("Opening question dict file")
            self.questionDict = pickle.load(inFile)
            print("read question dict file")
        print("reading answer dict")
        with open(config.answerDictFile(), "rb") as inFile:
            self.answerDict = pickle.load(inFile)
        print("read answer dict")
        with open(config.qaDictFile(), "rb") as inFile:
            self.qaDict = pickle.load(inFile)

        return instances 
    
    '''
    Generates the class's files: saves a json representation of the instances and
    pickles of the symbol-to-integer dictionaries.
    '''
    def writeFiles(self, instances, instancesFilename):
        with open(instancesFilename, "w") as outFile:
            json.dump(instances, outFile)

        with open(config.questionDictFile(), "wb") as outFile:
            pickle.dump(self.questionDict, outFile)

        with open(config.answerDictFile(), "wb") as outFile:
            pickle.dump(self.answerDict, outFile)

        with open(config.qaDictFile(), "wb") as outFile:
            pickle.dump(self.qaDict, outFile)

    # Writes the predictions json to file, along with a one-answer-per-line answers file.
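    # Prediction records are sorted by instance index before being written.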
    def writePreds(self, res, tier, suffix = ""):
        if res is None:
            return
        preds = res["preds"]
        sortedPreds = sorted(preds, key = lambda instance: instance["index"]) 
        with open(config.predsFile(tier + suffix), "w") as outFile:
            outFile.write(json.dumps(sortedPreds))
        with open(config.answersFile(tier + suffix), "w") as outFile:
            for instance in sortedPreds:
                writeline(outFile, instance["prediction"])
    
    # Reads NLVR data entries and creates a json dictionary.
    def readNLVR(self, datasetFilename, instancesFilename, train):
        instances = []
        i = 0 

        if os.path.exists(instancesFilename):
            instances = self.readFiles(instancesFilename)
        else:
            with open(datasetFilename, "r") as datasetFile:               
                for line in datasetFile:
                    instance = json.loads(line)
                    question = instance["sentence"]
                    questionSeq = self.tokenize(question, 
                        ignoredPuncts = Preprocesser.allPunct, keptPuncts = [])

                    if train or (not config.wrdEmbUnknown):
                        self.questionDict.addSeq(questionSeq)
                        self.qaDict.addSeq(questionSeq)

                    answer = instance["label"]
                    self.answerDict.addSeq([answer])
                    self.qaDict.addSeq([answer])

                    for k in range(6):
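                        # each NLVR sentence is paired with 6 images (identifier-0 .. identifier-5)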
                        instances.append({
                            "question": question,
                            "questionSeq": questionSeq,
                            "answer": answer,
                            "imageId": instance["identifier"] + "-" + str(k),
                            "index": i
                            })
                        i += 1

                random.shuffle(instances)

                self.questionDict.createVocab()
                self.answerDict.createVocab()
                self.qaDict.createVocab()

                self.writeFiles(instances, instancesFilename)

        return instances

    # Reads CLEVR data entries and creates a json dictionary.
    def readCLEVR(self, datasetFilename, instancesFilename, train):
        instances = []
        print(instancesFilename)
        if os.path.exists(instancesFilename):
            print("path found")
            instances = self.readFiles(instancesFilename)

        else:
            with open(datasetFilename, "r") as datasetFile:
                data = json.load(datasetFile)["questions"]            
            for i in tqdm(range(len(data)), desc = "Preprocessing"):
                instance = data[i]

                question = instance["question"]
                questionSeq = self.tokenize(question)

                if train or (not config.wrdEmbUnknown):
                    self.questionDict.addSeq(questionSeq)
                    self.qaDict.addSeq(questionSeq)

                answer = instance.get("answer", "yes") # DUMMY_ANSWER
                self.answerDict.addSeq([answer])
                self.qaDict.addSeq([answer])

                dummyProgram = [{"function": "FUNC", "value_inputs": [], "inputs": []}]
                program = instance.get("program", dummyProgram)
                postfixProgram = self.programTranslator.programToPostfixProgram(program)
                programSeq = self.programTranslator.programToSeq(postfixProgram)
                programInputs = self.programTranslator.programToInputs(postfixProgram, 
                    offset = 2)
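                # programSeq is the postfix-ordered list of program function tokens;
                # programInputs holds each function's argument indices, offset by 2
                # (presumably to reserve ids for special tokens).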

                # pass other fields to instance?
                instances.append({
                        "question": question,
                        "questionSeq": questionSeq,
                        "answer": answer,
                        "imageId": instance["image_index"],
                        "program": program,
                        "programSeq": programSeq,
                        "programInputs": programInputs,
                        "index": i
                        })

            random.shuffle(instances)

            self.questionDict.createVocab()
            self.answerDict.createVocab()
            self.qaDict.createVocab()

            self.writeFiles(instances, instancesFilename)

        return instances

    '''
    Reads data in datasetFilename, and creates json dictionary.
    If instancesFilename exists, restore dictionary from this file.
    Otherwise, save created dictionary to instancesFilename.
    '''
    def readData(self, datasetFilename, instancesFilename, train):
        # data extraction
        datasetReader = {
            "CLEVR": self.readCLEVR,
            "NLVR": self.readNLVR
        }
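        # Note: the reader is currently hardcoded to CLEVR; presumably this should
        # dispatch on the configured dataset name so the NLVR reader can be selected too.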

        return datasetReader["CLEVR"](datasetFilename, instancesFilename, train)

    # Reads a dataset tier (train, val, test), returns the loaded instances,
    # and attaches the relevant image path to each instance.
    def readTier(self, tier, train):
        print(tier)
        #imagesFilename = config.imagesFile(tier)
        datasetFilename = config.datasetFile(tier)
        instancesFilename = config.instancesFile(tier)
        instances = self.readData(datasetFilename, instancesFilename, train)
        #images = {"imagesFilename": imagesFilename}
        print(instances[0][u'imageId'])
        print(datasetFilename)
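        # Note: image paths below use the "CLEVR_val_" naming scheme regardless of tier.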
        for i in range(len(instances)):
            imageId = str(instances[i][u'imageId'])
            instances[i][u'imagePath'] = config.clevrPath + "CLEVR_val_" + imageId.zfill(6) + ".png"
        print(instances[0][u'imagePath'])
        return {"instances": instances, "train": train}

    '''
    Reads all tiers of a dataset (train if it exists, val, test).
    Also creates an evalTrain tier, which can optionally be used for evaluation.
    '''
    def readDataset(self, suffix = "", hasTrain = True):
        dataset = {"train": None, "evalTrain": None, "val": None, "test": None}        
        if hasTrain:
            dataset["train"] = self.readTier("train" + suffix, train = True)
        dataset["val"] = self.readTier("val" + suffix, train = False)
        #dataset["test"] = self.readTier("test" + suffix, train = False)
        
        if hasTrain:
            dataset["evalTrain"] = {}
            for k in dataset["train"]:
                dataset["evalTrain"][k] = dataset["train"][k]
            dataset["evalTrain"]["train"] = False

        return dataset

    # Transforms symbols to their corresponding integers and vectorizes the data into numpy arrays.
    def vectorizeData(self, data):
        # if "SHARED" tie symbol representations in questions and answers 
        if config.ansEmbMod == "SHARED":
            qDict = self.qaDict
        else:
            qDict = self.questionDict

        encodedQuestions = [qDict.encodeSequence(d["questionSeq"]) for d in data]
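        # vectorize2DList presumably pads the encoded questions into a single
        # [batch, maxLength] array and returns the original question lengths alongside it.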
        questions, questionsL = vectorize2DList(encodedQuestions)

        answers = np.array([self.answerDict.encodeSym(d["answer"]) for d in data])
        
        # passing the whole instances along may be heavy; revisit if it becomes a bottleneck
        imageIds = [d["imageId"] for d in data]
        indices = [d["index"] for d in data]
        imagePaths = [d["imagePath"] for d in data]
        instances = data

        return {    "questions": questions,
                    "questionLengths": questionsL,
                    "answers": answers,
                    "imageIds": imageIds,
                    "imagePaths": imagePaths,
                    "indices": indices,
                    "instances": instances
                }

    # Separates data based on a field length
    def lseparator(self, key, lims):
        maxI = len(lims)
        def separatorFn(x):
            v = x[key]
            for i, lim in enumerate(lims):
                if len(v) < lim:
                    return i
            return maxI
        return {"separate": separatorFn, "groupsNum": maxI + 1}

    # # separate data based on a field type
    # def tseparator(self, key, types):
    #     typesNum = len(types) + 1
    #     def separatorFn(x):
    #         v = str(x[key][-1])
    #         return types.get(v, len(types))
    #     return {"separate": separatorFn, "groupsNum": typesNum}

    # # separate data based on field arity
    # def bseparator(self, key):
    #     def separatorFn(x):
    #         cond = (len(x[key][-1]) == 2)
    #         return (1 if cond else 0)
    #     return {"separate": separatorFn, "groupsNum": 2}

    # Buckets data to groups using a separator
    def bucket(self, instances, separator):
        buckets = [[] for i in range(separator["groupsNum"])]
        for instance in instances:
            bucketI = separator["separate"](instance)
            buckets[bucketI].append(instance)
        return [bucket for bucket in buckets if len(bucket) > 0]

    # Re-buckets a bucket list given a separator
    def rebucket(self, buckets, separator):
        res = []
        for bucket in buckets:
            res += self.bucket(bucket, separator)
        return res

    # Buckets data based on question / program length 
    def bucketData(self, data, noBucket = False):
        if noBucket:
            print("No Bucket")
            buckets = [data]
        else:
            if config.noBucket:
                buckets = [data]
            elif config.noRebucket:
                questionSep = self.lseparator("questionSeq", config.questionLims)
                buckets = self.bucket(data, questionSep)
            else:
                programSep = self.lseparator("programSeq", config.programLims)
                questionSep = self.lseparator("questionSeq", config.questionLims)
                buckets = self.bucket(data, programSep)
                buckets = self.rebucket(buckets, questionSep)
        return buckets

    '''
    Prepares data:
    1. Filters the data according to the tier-specific filters defined below.
    2. Takes only a subset of the data based on config.trainedNum / config.testedNum.
    3. Buckets the data according to question / program length.
    4. Vectorizes the data into numpy arrays.
    '''
    def prepareData(self, data, train, filterKey = None, noBucket = False):
        filterDefault = {"maxQLength": 0, "maxPLength": 0, "onlyChain": False, "filterOp": 0}
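        # zero / False values disable the corresponding filters below
        # (filterOp = 0 presumably selects an empty entry in config.typeFilters)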

        filterTrain = {"maxQLength": config.tMaxQ, "maxPLength": config.tMaxP,
                       "onlyChain": config.tOnlyChain, "filterOp": config.tFilterOp}

        filterVal = {"maxQLength": config.vMaxQ, "maxPLength": config.vMaxP,
                     "onlyChain": config.vOnlyChain, "filterOp": config.vFilterOp}

        filters = {"train": filterTrain, "evalTrain": filterTrain, 
                   "val": filterVal, "test": filterDefault}

        if filterKey is None:
            fltr = filterDefault
        else:
            fltr = filters[filterKey]

        # split data when finetuning on validation set 
        if config.trainExtra and config.extraVal and (config.finetuneNum > 0):
            if train:
                data = data[:config.finetuneNum]
            else: 
                data = data[config.finetuneNum:]

        typeFilter = config.typeFilters[fltr["filterOp"]]
        # filter specific settings
        if fltr["onlyChain"]:
            data = [d for d in data if all((len(inputNum) < 2) for inputNum in d["programInputs"])]
        if fltr["maxQLength"] > 0:
            data = [d for d in data if len(d["questionSeq"]) <= fltr["maxQLength"]]
        if fltr["maxPLength"] > 0:
            data = [d for d in data if len(d["programSeq"]) <= fltr["maxPLength"]]
        if len(typeFilter) > 0:
            data = [d for d in data if d["programSeq"][-1] not in typeFilter]

        # run on subset of the data. If 0 then use all data 
        num = config.trainedNum if train else config.testedNum
        # retainVal = True to retain same sample of validation across runs  
        #if (not train) and (not config.retainVal):
        #    random.shuffle(data)
        if num > 0:
            data = data[:num]
        # set number to match dataset size 
        if train:
            config.trainedNum = len(data)
        else:
            config.testedNum = len(data)

        # bucket
        buckets = self.bucketData(data, noBucket = noBucket)
        
        # vectorize
        return [self.vectorizeData(bucket) for bucket in buckets]

    # Prepares all the tiers of a dataset. See prepareData method for further details.
    def prepareDataset(self, dataset, noBucket = True):
        if dataset is None:
            return None

        for tier in dataset:
            if dataset[tier] is not None:
                print("tier=%s"%tier)
                dataset[tier]["data"] = self.prepareData(dataset[tier]["instances"], 
                    train = dataset[tier]["train"], filterKey = tier, noBucket = noBucket)
        
        for tier in dataset:
            if dataset[tier] is not None:
                del dataset[tier]["instances"]

        return dataset

    # Initializes word embeddings to random uniform / random normal / GloVe. 
    def initializeWordEmbeddings(self, wordsDict = None, noPadding = False):
        # default dictionary to use for embeddings
        if wordsDict is None:
            wordsDict = self.questionDict

        # uniform initialization
        if config.wrdEmbUniform:
            lowInit = -1.0 * config.wrdEmbScale
            highInit = 1.0 * config.wrdEmbScale
            embeddings = np.random.uniform(low = lowInit, high = highInit, 
                size = (wordsDict.getNumSymbols(), config.wrdEmbDim))
        # normal initialization
        else:
            embeddings = config.wrdEmbScale * np.random.randn(wordsDict.getNumSymbols(), 
                config.wrdEmbDim)

        # if wrdEmbRandom is False, use GloVe vectors
        counter = 0
        if (not config.wrdEmbRandom): 
            with open(config.wordVectorsFile, 'r') as inFile:
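                # each line follows the standard GloVe text format: "<word> v1 v2 ... vD"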
                for line in inFile:
                    line = line.strip().split()
                    word = line[0].lower()
                    vector = [float(x) for x in line[1:]]
                    index = wordsDict.sym2id.get(word)
                    if index is not None:
                        embeddings[index] = vector
                        counter += 1
        
        print(counter)
        print(self.questionDict.sym2id)
        print(len(self.questionDict.sym2id))
        print(self.answerDict.sym2id)      
        print(len(self.answerDict.sym2id))
        print(self.qaDict.sym2id)      
        print(len(self.qaDict.sym2id))

        if noPadding:
            return embeddings # the dictionary has no padding symbol
        else:
            return embeddings[1:] # drop index 0 (the padding symbol's row)

    '''
    Initializes word embeddings for question words and optionally for answer words
    (when config.ansEmbMod == "BOTH"). If config.ansEmbMod == "SHARED", ties the
    embeddings of symbols shared between questions and answers.
    '''
    def initializeQAEmbeddings(self):
        # use same embeddings for questions and answers
        if config.ansEmbMod == "SHARED":
            qaEmbeddings = self.initializeWordEmbeddings(self.qaDict)
            ansMap = np.array([self.qaDict.sym2id[sym] for sym in self.answerDict.id2sym])
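            # ansMap maps each answer id to the row of that symbol in the shared qa embedding table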
            embeddings = {"qa": qaEmbeddings, "ansMap": ansMap}
        # use different embeddings for questions and answers
        else:
            print("="*50)
            print("Question Dict:",self.questionDict)
            print("="*50)
            qEmbeddings = self.initializeWordEmbeddings(self.questionDict)
            aEmbeddings = None
            if config.ansEmbMod == "BOTH":
                aEmbeddings = self.initializeWordEmbeddings(self.answerDict, noPadding = True)
            embeddings = {"q": qEmbeddings, "a": aEmbeddings}
        return embeddings

    '''
    Preprocesses a given dataset into numpy arrays:
    1. Reads the input data files into a dictionary.
    2. Saves the resulting jsons to files, and loads them instead of re-parsing the input if they already exist.
    3. Initializes word embeddings to random / GloVe.
    4. Optionally filters the data according to the given filters.
    5. Encodes and vectorizes the data into numpy arrays.
    6. Buckets the data according to the instances' lengths.
    '''
    def preprocessData(self, debug = False):
        # Read data into json and symbols' dictionaries
        print(bold("Loading data from adversarial..."))
        start = time.time()
        mainDataset = self.readDataset(hasTrain = False)

        extraDataset = None
        if config.extra:
            # compositionalClevr doesn't have training dataset
            extraDataset = self.readDataset(suffix = "H", hasTrain = (not config.extraVal))          
            # extra dataset uses the same images
            if not config.extraVal:
                for tier in extraDataset:
                    extraDataset[tier]["images"] = mainDataset[tier]["images"]

        print("took {:.2f} seconds".format(time.time() - start))

        # Initialize word embeddings (random / glove)
        print(bold("Loading word vectors..."))
        start = time.time()
        embeddings = self.initializeQAEmbeddings()
        print("took {:.2f} seconds".format(time.time() - start))

        # Prepare data: filter, bucket, and vectorize into numpy arrays
        print(bold("Vectorizing data..."))
        start = time.time()       

        mainDataset = self.prepareDataset(mainDataset)
        # don't bucket for alternated data and also for humans data (small dataset)
        #extraDataset = self.prepareDataset(extraDataset, 
        #    noBucket = (not config.extraVal) or (not config.alterExtra))

        data = {"main": mainDataset}
        print("took {:.2f} seconds".format(time.time() - start))

        #config.questionWordsNum = self.questionDict.getNumSymbols()
        config.answerWordsNum = self.answerDict.getNumSymbols()
        return data, embeddings, self.answerDict
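

# Minimal usage sketch (illustrative; assumes the config paths and options are set up elsewhere):
#   preprocesser = Preprocesser()
#   data, embeddings, answerDict = preprocesser.preprocessData()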