Exemple #1
0
    def constructFromFile(self, fileName, **args):
        """Build the pair container from a labels file of ``id1_id2`` pattern IDs.

        ``fileName`` is handed to ``Labels``; every pattern ID read from it is
        split on '_' into two member IDs.  A pair is kept only when both
        members are keys of the dictionary that ``misc.list2dict`` builds from
        the pattern IDs of the underlying data object; other pairs are
        reported on stdout and dropped from the labels.

        Keyword arguments:
          data -- (required) the underlying data object; stored as
                  ``self._data``.

        Raises ValueError when ``data`` is not supplied.
        """
        if 'data' not in args:
            raise ValueError('missing data object')
        self._data = args['data']
        patternIDdict = misc.list2dict(self._data.labels.patternID,
                                       range(len(self._data)))
        labels = Labels(fileName)
        patterns = []
        pairs = []
        for i in range(len(labels)):
            p1, p2 = labels.patternID[i].split('_')
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict:
                pairs.append((patternIDdict[p1], patternIDdict[p2]))
                patterns.append(i)
            else:
                # one pre-formatted argument, so the output is the same
                # whether print is a statement or a function
                print('%s  or  %s not found' % (p1, p2))
        # restrict the labels to the pairs that were kept
        labels = labels.__class__(labels, patterns=patterns)

        self.pairs = pairs

        firstVector = arrayWrap.intVector([pair[0] for pair in pairs])
        secondVector = arrayWrap.intVector([pair[1] for pair in pairs])
        self.callConstructor(firstVector, secondVector)

        WrapperDataSet.attachLabels(self, labels)
    def constructFromFile(self, fileName):
        """Build the pair container from a labels file of ``id1_id2`` pattern IDs.

        ``self._data`` must already be set.  ``fileName`` is handed to
        ``Labels``; every pattern ID read from it is split on '_' into two
        member IDs.  A pair is kept only when both members are keys of the
        dictionary that ``misc.list2dict`` builds from the pattern IDs of
        ``self._data``; other pairs are reported on stdout and dropped from
        the labels.
        """
        patternIDdict = misc.list2dict(self._data.labels.patternID,
                                       range(len(self._data)))

        labels = Labels(fileName)
        patterns = []
        pairs = []
        for i in range(len(labels)):
            p1, p2 = labels.patternID[i].split('_')
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict:
                pairs.append((patternIDdict[p1], patternIDdict[p2]))
                patterns.append(i)
            else:
                # one pre-formatted argument, so the output is the same
                # whether print is a statement or a function
                print('%s  or  %s not found' % (p1, p2))
        # restrict the labels to the pairs that were kept
        labels = labels.__class__(labels, patterns=patterns)

        self.pairs = pairs

        firstVector = arrayWrap.intVector([pair[0] for pair in pairs])
        secondVector = arrayWrap.intVector([pair[1] for pair in pairs])
        self.callConstructor(firstVector, secondVector)

        WrapperDataSet.attachLabels(self, labels)
    def fromArray(self, X, **args):
        """Populate the container from the array-like ``X`` and attach labels.

        Keyword arguments read here:
          labels     -- object whose ``L`` and ``patternID`` are copied
          L          -- label list; overrides the one taken from ``labels``
          patternID  -- pattern IDs (copied); overrides those from ``labels``
          featureID  -- feature IDs (copied)
          labelsFile -- when given, the labels are built from this file
                        instead of from ``L``

        Missing feature IDs default to '0'..'d-1' (d = ``len(X[0])``) and
        missing pattern IDs to '1'..'n' (n = ``len(X)``).

        Raises ValueError when ``featureID`` is given and the class of
        ``self`` is named 'SparseDataSet'.
        """
        L = None
        patternID = None
        self.featureID = None
        if 'labels' in args:
            L = args['labels'].L[:]
            patternID = args['labels'].patternID[:]
        if 'L' in args:
            L = args['L']
        if 'patternID' in args:
            patternID = args['patternID'][:]
        if 'featureID' in args:
            if self.__class__.__name__ == 'SparseDataSet':
                raise ValueError('cannot set feature ID for SparseDataSet')
            self.featureID = args['featureID'][:]

        if L is not None: assert len(X) == len(L)
        if self.featureID is None:
            self.featureID = [str(i) for i in range(len(X[0]))]
        if patternID is None:
            patternID = [str(i) for i in range(1, len(X) + 1)]

        self.fromArrayAdd(X)
        self.updateFeatureDict()
        self.featureIDcompute()

        if 'labelsFile' in args:
            self.attachLabels(Labels(args['labelsFile'], **args))
        else:
            # pass the resolved IDs on through the keyword dictionary
            args['patternID'] = patternID
            self.attachLabels(Labels(L, **args))
Exemple #4
0
    def constructFromFile(self, fileName):
        """Build the pair container from a labels file of ``id1_id2`` pattern IDs.

        ``self._data`` must already be set.  ``fileName`` is handed to
        ``Labels``; every pattern ID read from it is split on "_" into two
        member IDs.  A pair is kept only when both members are keys of the
        dictionary that ``misc.list2dict`` builds from the pattern IDs of
        ``self._data``; other pairs are reported on stdout and dropped from
        the labels.
        """
        patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data)))

        labels = Labels(fileName)
        patterns = []
        pairs = []
        for i in range(len(labels)):
            p1, p2 = labels.patternID[i].split("_")
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict:
                pairs.append((patternIDdict[p1], patternIDdict[p2]))
                patterns.append(i)
            else:
                # one pre-formatted argument, so the output is the same
                # whether print is a statement or a function
                print("%s  or  %s not found" % (p1, p2))
        # restrict the labels to the pairs that were kept
        labels = labels.__class__(labels, patterns=patterns)

        self.pairs = pairs

        firstVector = arrayWrap.intVector([pair[0] for pair in pairs])
        secondVector = arrayWrap.intVector([pair[1] for pair in pairs])
        self.callConstructor(firstVector, secondVector)

        WrapperDataSet.attachLabels(self, labels)
    def constructFromFile(self, fileName):
        """Read ``id1_id2,label`` lines from ``fileName`` into pairs and labels.

        Each non-blank line is split on ','; the first field is split on '_'
        into two member IDs, which are stored in sorted order, and the second
        field is the label.  When ``self.data`` is not None, only pairs whose
        members are both keys of the dictionary built by ``misc.list2dict``
        from its pattern IDs are kept; the others are reported on stdout.
        When ``self.data`` is None every pair is kept.

        Sets ``self.pairs`` and ``self.labels``.
        """
        delim = ','
        haveData = self.data is not None
        if haveData:
            patternIDdict = misc.list2dict(self.data.labels.patternID,
                                           range(len(self.data)))
        else:
            patternIDdict = {}

        L = []
        patternID = []
        pairs = []
        handle = open(fileName)
        try:
            for line in handle:
                # strip only the line terminator; slicing off the last
                # character would damage a final line without a newline
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                tokens = line.split(delim)
                p1, p2 = tokens[0].split('_')
                if p1 > p2: p1, p2 = p2, p1
                # add only pairs for which we have kernel data:
                if not haveData or (p1 in patternIDdict and
                                    p2 in patternIDdict):
                    pairs.append((p1, p2))
                    L.append(tokens[1])
                    patternID.append('_'.join([p1, p2]))
                else:
                    # one pre-formatted argument, so the output is the same
                    # whether print is a statement or a function
                    print('%s  or  %s not found' % (p1, p2))
        finally:
            handle.close()
        self.pairs = pairs
        self.labels = Labels(L, patternID=patternID)
Exemple #6
0
def pick(event):

    global data
    global X
    global Y
    global numpy_container
    if event.key == 'q':
        if len(X) == 0: return
        if not numpy_container:
            data = VectorDataSet(X)
        else:
            data = PyVectorDataSet(numpy.array(X))
        data.attachLabels(Labels(Y))
        X = []
        Y = []
        print 'done creating data.  close this window and use the decisionSurface function'
        pylab.disconnect(binding_id)
    if event.key == '1' or event.key == '2':
        if event.inaxes is not None:
            print 'data coords', event.xdata, event.ydata
            X.append([event.xdata, event.ydata])
            Y.append(event.key)
            pylab.plot([event.xdata], [event.ydata],
                       plotStr[int(event.key) - 1])
            pylab.draw()
Exemple #7
0
    def copyConstruct(self, other, **args):
        """Initialise ``self`` as a copy of (a subset of) ``other``.

        The patterns to copy are chosen by the first of these keyword
        arguments that is present:
          patterns -- indices, or pattern IDs (strings) that are matched
                      against ``other.labels.patternID``
          classes  -- keep patterns whose ``other.labels.L`` entry is listed
          classID  -- keep patterns whose ``other.labels.Y`` entry is listed
        With none of them, every pattern is copied.  For ``classes`` and
        ``classID`` the new labels are built with ``forgetClassLabels=True``.

        Training/testing functions, the kernel, the labels and any
        registered attributes of ``other`` are carried over.

        Raises ValueError when a registered list attribute does not have
        ``len(other)`` entries.
        """
        forgetClassLabels = False
        if "patterns" in args:
            patterns = args['patterns']
            # if the patterns are ids (strings) convert them to indices;
            # an empty selection has no first element to inspect and is
            # used as-is
            if len(patterns) > 0 and isinstance(patterns[0], str):
                idDict = misc.list2dict(patterns)
                patternsToCopy = [
                    i for i in range(len(other))
                    if other.labels.patternID[i] in idDict
                ]
            else:
                patternsToCopy = patterns
        elif "classes" in args:
            patternsToCopy = [
                i for i in range(len(other))
                if other.labels.L[i] in args["classes"]
            ]
            forgetClassLabels = True
        elif "classID" in args:
            patternsToCopy = [
                i for i in range(len(other))
                if other.labels.Y[i] in args["classID"]
            ]
            forgetClassLabels = True
        else:
            patternsToCopy = range(len(other))

        self.setTrainingFunc(other.trainingFunc)
        self.setTestingFunc(other.testingFunc)

        # class dependent copying of data:
        self.copy(other, patternsToCopy)

        self.attachKernel(other)
        self.attachLabels(
            Labels(other.labels,
                   patterns=patternsToCopy,
                   forgetClassLabels=forgetClassLabels))

        # copy the registered attribute:
        if hasattr(other, '_registeredAttributes'):
            self._registeredAttributes = other._registeredAttributes[:]
            self._actions = copy.deepcopy(other._actions)
            for attr in self._registeredAttributes:
                a = getattr(other, attr)
                # exact list type only (list subclasses take the branches
                # below, as before)
                if type(a) is list:
                    if len(a) != len(other):
                        raise ValueError('attribute has bad length')
                    setattr(self, attr, [a[i] for i in patternsToCopy])
                # NOTE(review): the length is compared with len(self), not
                # len(other) -- confirm this is intended when only a subset
                # of the patterns is copied.
                elif hasattr(a, 'type') and a.type == 'dataset' and len(
                        a) == len(self):
                    acopy = a.__class__(a, patterns=patternsToCopy)
                    setattr(self, attr, acopy)
                else:
                    setattr(self, attr, a)
Exemple #8
0
def load_libsvm_format(file_name, **args):
    """
    Load a dataset from a file in libsvm format
    returns an instance of PyVectorDataSet
    If you want to use the data with a SparseDataSet, you can directly
    do it using the SparseDataSet constructor.

    Each non-blank line is ``label id:value id:value ...`` with 1-based
    feature ids.  With the keyword argument ``regression`` set to a true
    value the labels are kept as floats and attached with
    ``numericLabels=True``; otherwise they are converted to integer strings.

    Raises ValueError when ``file_name`` does not exist.
    """

    regression = args.get('regression', False)
    if not os.path.exists(file_name):
        raise ValueError("file doesn't exist at %s" % file_name)

    # The file is read once: labels and the (column, value) entries of every
    # row are collected together, so the matrix is filled from exactly what
    # myio.myopen delivered instead of re-reading through a plain open().
    labels = []
    rows = []
    num_features = 0
    file_handle = myio.myopen(file_name)
    try:
        for line in file_handle:
            tokens = line.split()
            if not tokens:
                continue
            if regression:
                labels.append(float(tokens[0]))
            else:
                labels.append(str(int(float(tokens[0]))))
            entries = []
            for token in tokens[1:]:
                feature_id, value = token.split(':')
                feature_id = int(feature_id)
                num_features = max(num_features, feature_id)
                # feature ids in the file are 1-based
                entries.append((feature_id - 1, float(value)))
            rows.append(entries)
    finally:
        # myio.myopen is defined elsewhere; close the handle if it offers
        # a close method
        close = getattr(file_handle, 'close', None)
        if close is not None:
            close()

    # the builtin float is the dtype the removed numpy.float alias stood for
    X = numpy.zeros((len(labels), num_features), float)
    # fill in the array:
    for i, entries in enumerate(rows):
        for column, value in entries:
            X[i][column] = value
    data = PyVectorDataSet(X)
    if regression:
        labels = Labels(labels, numericLabels=True)
    else:
        labels = Labels(labels)
    data.attachLabels(labels)
    return data
    def attachLabels(self, labels):
        """Store ``labels`` as ``self.labels``.

        ``labels`` is either an object whose class is named 'Labels' or a
        string, in which case it is passed to the ``Labels`` constructor.

        Raises ValueError for any other type, or when the number of labels
        differs from ``len(self)``.
        """
        if labels.__class__.__name__ == 'Labels':
            pass
        elif isinstance(labels, str):
            labels = Labels(labels)
        else:
            raise ValueError('wrong type of labels object')
        if len(self) != len(labels):
            raise ValueError('length of labels not equal length of self')
        self.labels = labels
    def constructFromFile(self, fileName, **args):
        """Fill the container from ``fileName`` using a dispatched parser.

        ``parsers.parserDispatcher(fileName, **args)`` supplies the parser;
        after ``parser.scan()`` the data matrix is sized from the parser and
        every pattern the parser yields is added in order.  Labels come from
        the file named by the ``labelsFile`` keyword argument when given,
        otherwise from the parser; missing pattern IDs default to '1'..'n'.

        Raises ValueError when a parser whose class is named 'SparseParser'
        is used with a container whose class is named 'DataSet'.
        """
        parser = parsers.parserDispatcher(fileName, **args)
        # the DataSet container can only be used with a csv type file:
        if (parser.__class__.__name__ == 'SparseParser' and
                self.__class__.__name__ == 'DataSet'):
            raise ValueError(
                'cannot use a DataSet container with a sparse file')
        parser.scan()

        self.initializeDataMatrix(len(parser), len(parser._featureID))

        # read the patterns:
        for i, x in enumerate(parser):
            self.addPattern(x, i)

        # postprocessing:
        L = parser._labels
        patternID = parser._patternID
        if patternID is None or len(patternID) == 0:
            patternID = [str(i) for i in range(1, len(self) + 1)]
        self.featureID, featureKey, featureKeyDict = parser.postProcess()
        # only PySparseDataSet keeps the feature keys
        if self.__class__.__name__ == 'PySparseDataSet':
            self.featureKey = featureKey
            self.featureKeyDict = featureKeyDict

        self.updateFeatureDict()

        self.featureIDcompute()

        if 'labelsFile' in args:
            self.attachLabels(Labels(args['labelsFile'], **args))
        else:
            self.attachLabels(Labels(L, patternID=patternID, **args))
    def makeEmpty(self, size, **args):
        """Initialise a container of ``size`` patterns with zero features.

        Keyword arguments read here:
          labels     -- object whose ``L`` and ``patternID`` are copied
          L          -- label list; overrides the one taken from ``labels``
          patternID  -- pattern IDs (copied); overrides those from ``labels``
          labelsFile -- when given, the labels are built from this file
                        instead of from ``L``

        Pattern IDs default to '1'..'size' when none are supplied.
        """
        labelValues = None
        ids = None
        if 'labels' in args:
            given = args['labels']
            labelValues = given.L[:]
            ids = given.patternID[:]
        if 'L' in args:
            labelValues = args['L']
        if 'patternID' in args:
            ids = args['patternID'][:]

        # a supplied label list has to match the requested size
        assert labelValues is None or size == len(labelValues)
        if ids is None:
            ids = [str(k) for k in range(1, size + 1)]

        self.initializeDataMatrix(size, 0)

        if 'labelsFile' not in args:
            # pass the resolved IDs on through the keyword dictionary
            args['patternID'] = ids
            self.attachLabels(Labels(labelValues, **args))
        else:
            self.attachLabels(Labels(args['labelsFile'], **args))
    def constructFromFile(self, fileName, **args):
        print 'reading from', fileName
        headerHandler = fastaHeaderHandler
        if 'headerHandler' in args:
            headerHandler = args['headerHandler']
        numPatterns = fasta.fasta_count(fileName)
        self.container.__init__(self, numPatterns)

        patternIDs = []
        L = []
        for record in fasta.fasta_itr(fileName):
            self.addPattern(record.sequence)
            patternID, label = headerHandler(record.header)
            patternIDs.append(patternID)
            if label is not None:
                L.append(label)

        self.attachLabels(Labels(L, patternID=patternIDs, **args))
    def constructFromFile(self, file_name, **args):
        """Build a container of pattern sets from ``file_name``.

        Every non-blank line holds whitespace-separated pattern IDs followed
        by a label as the last token.  The IDs are translated through the
        dictionary that ``misc.list2dict`` builds from the pattern IDs of the
        underlying data object, and each resulting set is added as a tuple.

        Keyword arguments:
          data -- (required) the underlying data object; stored as
                  ``self._data``.

        Raises ValueError when ``data`` is not supplied.
        """
        if 'data' not in args:
            raise ValueError('missing data object')
        self._data = args['data']

        id_dict = misc.list2dict(self._data.labels.patternID,
                                 range(len(self._data)))
        L = []
        sets = []
        file_handle = open(file_name)
        try:
            for line in file_handle:
                tokens = line.split()
                if not tokens:
                    # a blank line carries neither members nor a label
                    continue
                sets.append([id_dict[token] for token in tokens[:-1]])
                L.append(tokens[-1])
        finally:
            file_handle.close()
        self.n = len(sets)
        self.callConstructor(len(sets))
        for s in sets:
            self.add(tuple(s))
        labels = Labels(L)
        WrapperDataSet.attachLabels(self, labels)
    def copyConstruct(self, other, patterns):
        """Copy the pairs and labels selected by ``patterns`` from ``other``.

        ``self.data`` refers to the same object as ``other.data``; it is not
        copied.
        """
        selected = []
        for index in patterns:
            selected.append(other.pairs[index])
        self.pairs = selected
        self.data = other.data
        self.labels = Labels(other.labels, patterns=patterns)