Beispiel #1
0
    def setDict_Indices(self):
        '''Assign 1-based indices to raw clusters and labels, and build the reverse maps.

        Intended to run once the raw cluster/label dicts
        (``dict_ClusterRaw_DataSamples`` and ``dict_Label_DataSamples``) exist.
        Keys are visited in the order produced by ``H.sortDictLen_Rev``
        (described by the original author as "inverse length", then
        "key ascending"; the helper itself is defined outside this block).
        Each key not seen before receives the next free index, i.e. the
        current size of the index dict plus one.

        Side effects:
            * fills ``dict_Cluster_Index`` and ``dict_Label_Index`` in place;
            * rebinds ``dict_Index_Cluster`` and ``dict_Index_Label`` to the
              inverse (index -> key) mappings.
        '''
        # setdefault keeps an index that is already assigned, so calling this
        # method a second time does not overwrite every key with the same
        # "len + 1" value (which would collapse the inverse maps below).
        for clusterRaw, _ in H.sortDictLen_Rev(self.dict_ClusterRaw_DataSamples):
            self.dict_Cluster_Index.setdefault(clusterRaw, len(self.dict_Cluster_Index) + 1)

        # Same scheme for labels.
        for label, _ in H.sortDictLen_Rev(self.dict_Label_DataSamples):
            self.dict_Label_Index.setdefault(label, len(self.dict_Label_Index) + 1)

        # .items() instead of the Python-2-only .iteritems(): same result here,
        # and it also runs on Python 3.
        self.dict_Index_Cluster = {v: k for k, v in self.dict_Cluster_Index.items()}
        self.dict_Index_Label = {v: k for k, v in self.dict_Label_Index.items()}
Beispiel #2
0
    def splitTrainTest(self):
        '''Split the samples of every raw cluster into train / validation / test sets.

        For each cluster (visited in ``H.sortDictLen_Rev`` order) the first
        ``ceil(CF.TRAIN_SPLIT * n)`` samples go to train, the next
        ``floor(CF.VALIDATION_SPLIT * n)`` to validation and the remainder to
        test, where ``n`` is the number of samples in the cluster. Samples are
        taken in their stored order; no shuffling happens here.

        Side effects:
            * sets ``num_classes`` and ``num_labels`` (each is the number of
              distinct keys plus one, because indices are 1-based);
            * extends ``X_{train,valid,test}_DataSample``,
              ``y_{train,valid,test}_cluster`` and ``y_{train,valid,test}_label``;
            * rebinds ``X_{train,valid,test}_rawText`` to the ``rawText`` of
              the collected samples;
            * prints progress information to stdout.

        Raises:
            KeyError: if a cluster or label has no entry in
                ``dict_Cluster_Index`` / ``dict_Label_Index``.
        '''
        # Single-argument print(...) emits exactly the same text as the former
        # Python 2 print statements and also parses on Python 3.
        print(colored('\tSplitting Train+Test ...', 'magenta'))

        # Cluster and label indices both start at 1 (see setDict_Indices), so
        # both sizes reserve slot 0. num_labels previously lacked the "+ 1",
        # which also made the "NumLabels" log line below report one too few.
        self.num_classes = len(self.dict_ClusterRaw_DataSamples) + 1
        self.num_labels = len(self.dict_Label_DataSamples) + 1
        print('NumClasses= %s' % (self.num_classes - 1))
        print('NumLabels= %s' % (self.num_labels - 1))

        for clusterRaw, dataSamples in H.sortDictLen_Rev(self.dict_ClusterRaw_DataSamples):
            total = len(dataSamples)
            if total == 0:
                # Nothing to split, and dataSamples[0] below would raise IndexError.
                continue

            clusterIndex = self.dict_Cluster_Index[clusterRaw]
            # Any sample's labelList will do: per the original author it is the
            # same for every sample sharing this clusterRaw.
            labelIndices = [self.dict_Label_Index[label] for label in dataSamples[0].labelList]

            # Clamp the counts so that train + valid never exceeds the cluster
            # size (float rounding inside ceil, or splits summing to more than
            # 1, could otherwise do that). Without the clamp the X slices and
            # the y counts would disagree and numTest could become negative.
            numTrain = min(total, int(math.ceil(CF.TRAIN_SPLIT * total)))
            numValid = min(total - numTrain, int(math.floor(CF.VALIDATION_SPLIT * total)))
            numTest = total - numTrain - numValid
            print('Class- %s NumTrain= %s NumValid= %s NumTest= %s'
                  % (clusterIndex, numTrain, numValid, numTest))

            validEnd = numTrain + numValid
            self.X_train_DataSample.extend(dataSamples[:numTrain])
            self.X_valid_DataSample.extend(dataSamples[numTrain:validEnd])
            self.X_test_DataSample.extend(dataSamples[validEnd:])

            self.y_train_cluster.extend([clusterIndex] * numTrain)
            self.y_valid_cluster.extend([clusterIndex] * numValid)
            self.y_test_cluster.extend([clusterIndex] * numTest)

            # NOTE: every sample of the cluster references the same
            # labelIndices list object (unchanged from the original behavior).
            self.y_train_label.extend([labelIndices] * numTrain)
            self.y_valid_label.extend([labelIndices] * numValid)
            self.y_test_label.extend([labelIndices] * numTest)

        self.X_train_rawText = [sample.rawText for sample in self.X_train_DataSample]
        self.X_valid_rawText = [sample.rawText for sample in self.X_valid_DataSample]
        self.X_test_rawText = [sample.rawText for sample in self.X_test_DataSample]