Esempio n. 1
0
class BinarySource(Source):
    """Source for binary classification data.

    Text input is expected in the following format: one example per line,
    where the first whitespace-delimited token is the class label (POS_LAB
    or NEG_LAB) and every following token is a feature-value pair joined by
    the separator symbol (':' by default). E.g.:

    1	f1:1.0 f2:1.0 f3:1.0
    -1	f2:1.0 f3:1.0 f8:1.0
    -1	f1:1.0 f2:1.0
    1	f8:1.0 f9:1.0 f10:1.0

    When the underlying stream type is "generator", the stream is instead
    expected to yield instance objects exposing get_featvals() and
    get_label(); no text parsing is done in that case.
    """

    def __init__(self,
                 data,
                 encoding="utf-8",
                 feature_alphabet=None,
                 alphabet_pop=True,
                 alphabet_lock=True,
                 sep=":",
                 bias=False,
                 bias_prefix="@@BIAS@@"):
        """Set up the source and, optionally, its feature alphabet.

        data, encoding   -- forwarded unchanged to Source.__init__.
        feature_alphabet -- alphabet to use; a new unlocked Alphabet is
                            created when None.
        alphabet_pop     -- if true, read the whole stream once to fill
                            the alphabet (the stream is rewound afterwards).
        alphabet_lock    -- if true the alphabet is locked at the end of
                            construction, otherwise it is unlocked.
        sep              -- symbol separating a feature name from its value.
        bias             -- if true, every parsed text example gets an extra
                            (bias_prefix, 1.0) feature in first position.
        bias_prefix      -- name of that implicit bias feature.
        """
        Source.__init__(self, data, encoding=encoding)
        self._Instance = BinaryClassificationInstance
        # Identity test: an alphabet object must never be mistaken for
        # "missing" because of a custom __eq__/__ne__.
        if feature_alphabet is not None:
            self._feature_alphabet = feature_alphabet
        else:
            self._feature_alphabet = Alphabet(locked=False)
        self._sep = sep
        self._bias = bias
        self._bias_prefix = bias_prefix
        if alphabet_pop:
            self._populate_alphabet()
        if alphabet_lock:
            self.lock_alphabet()
        else:
            self.unlock_alphabet()

    def _parse(self):
        """Yield a (label, feats) pair for every non-blank line of the stream.

        feats is a list of (feature_name, float_value) tuples. The process
        exits with a "Datasource error" message when a feature token cannot
        be split on the separator or its value is not a valid float.
        """
        sep = self._sep
        for line in self._stream:
            items = line.split()
            if not items:
                # Blank (or whitespace-only) line: there is no label to read.
                continue
            cl = items[0]
            assert cl in [POS_LAB, NEG_LAB], "Unknown class label: %r" % (cl,)
            feats = []
            if self._bias:
                feats.append((self._bias_prefix, 1.0))  # implicit bias
            for token in items[1:]:
                try:
                    # Split on the LAST separator so that feature names may
                    # themselves contain the separator symbol.
                    name, value = token.rsplit(sep, 1)
                    feats.append((name, float(value)))
                except ValueError:
                    sys.exit(
                        "Datasource error: make sure you use the right datasource format."
                    )
            yield (cl, feats)

    def _populate_alphabet(self):
        """Add every feature found in the stream to the feature alphabet.

        The alphabet is unlocked first, the stream is consumed once and then
        rewound. Progress messages are written to stderr. The process exits
        if the data cannot be parsed or if rewind() raises TypeError.
        """
        sys.stderr.write("Populating feature alphabet...             ")
        self.unlock_alphabet()
        alphabet = self._feature_alphabet
        if self._stream_type == "generator":
            for i, gen_inst in enumerate(self._stream):  # read stream directly
                # Backspace over as many characters as the counter is wide,
                # then write the counter in place.
                count = str(i)
                sys.stderr.write("\b" * len(count) + count)
                for (f, _) in gen_inst.get_featvals():
                    alphabet.add(f)
        else:
            try:
                for _label, feats in self._parse():
                    for f, _ in feats:
                        alphabet.add(f)
            except ValueError:
                sys.exit(
                    "Datasource error: make sure you use the right data format."
                )
        # rewind stream
        try:
            self.rewind()
        except TypeError:
            sys.exit("TypeError: make sure rewind() is used only on files.")
        sys.stderr.write(" done.\n")
        sys.stderr.write("Number of features: %s\n" % alphabet.size())

    def unlock_alphabet(self):
        """Unlock the feature alphabet."""
        self._feature_alphabet.unlock()

    def lock_alphabet(self):
        """Lock the feature alphabet."""
        self._feature_alphabet.lock()

    def set_alphabet(self, feature_alphabet):
        """Replace the feature alphabet used by this source."""
        self._feature_alphabet = feature_alphabet

    def get_alphabet(self):
        """Return the feature alphabet used by this source."""
        return self._feature_alphabet

    def get_input(self):
        """Yield the raw (label, feats) pairs produced by _parse()."""
        for label, feats in self._parse():
            yield label, feats

    def __iter__(self):
        """Instance generator.

        Yields one self._Instance per example. While the feature alphabet is
        unlocked, features seen on the way are added to it. Nothing is
        yielded for a stream type other than "file", "list" or "generator".
        """
        feature_alphabet = self._feature_alphabet
        assert not (feature_alphabet.empty() and
                    feature_alphabet.locked()), "Feature alphabet is empty!"
        if self._stream_type in ["file", "list"]:
            for idx, (label, feats) in enumerate(self._parse()):
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in feats:
                        feature_alphabet.add(f)
                yield self._Instance(idx, label, feats, feature_alphabet)
        elif self._stream_type == "generator":
            for idx, gen_inst in enumerate(
                    self._stream):  # read stream directly
                featvals = gen_inst.get_featvals()
                label = gen_inst.get_label()
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in featvals:
                        feature_alphabet.add(f)
                # Same constructor call as the file/list branch; the former
                # extra `label_alphabet` argument was an undefined name and
                # raised NameError on the first instance.
                yield self._Instance(idx, label, featvals, feature_alphabet)

    def size(self):
        """Return the number of items in the stream, then rewind it."""
        # Count lazily instead of materialising the whole stream in a list.
        s = sum(1 for _ in self._stream)
        self.rewind()
        return s
Esempio n. 2
0
class BinarySource( Source ):
	""" Source for binary classification data.

	Text input is expected in the following format: one example per line,
	where the first whitespace-delimited token is the class label (POS_LAB
	or NEG_LAB) and every following token is a feature-value pair joined by
	the separator symbol (':' by default). E.g.:

	1	f1:1.0 f2:1.0 f3:1.0
	-1	f2:1.0 f3:1.0 f8:1.0
	-1	f1:1.0 f2:1.0
	1	f8:1.0 f9:1.0 f10:1.0

	When the underlying stream type is "generator", the stream is instead
	expected to yield instance objects exposing get_featvals() and
	get_label(); no text parsing is done in that case.
	"""
	def __init__( self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True, alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@" ):
		""" Set up the source and, optionally, its feature alphabet.

		data, encoding   -- forwarded unchanged to Source.__init__.
		feature_alphabet -- alphabet to use; a new unlocked Alphabet is
		                    created when None.
		alphabet_pop     -- if true, read the whole stream once to fill
		                    the alphabet (the stream is rewound afterwards).
		alphabet_lock    -- if true the alphabet is locked at the end of
		                    construction, otherwise it is unlocked.
		sep              -- symbol separating a feature name from its value.
		bias             -- if true, every parsed text example gets an extra
		                    (bias_prefix, 1.0) feature in first position.
		bias_prefix      -- name of that implicit bias feature.
		"""
		Source.__init__(self, data, encoding=encoding)
		self._Instance = BinaryClassificationInstance
		# Identity test: an alphabet object must never be mistaken for
		# "missing" because of a custom __eq__/__ne__.
		if feature_alphabet is not None:
			self._feature_alphabet = feature_alphabet
		else:
			self._feature_alphabet = Alphabet(locked=False)
		self._sep = sep
		self._bias = bias
		self._bias_prefix = bias_prefix
		if alphabet_pop:
			self._populate_alphabet()
		if alphabet_lock:
			self.lock_alphabet()
		else:
			self.unlock_alphabet()

	def _parse( self ):
		""" Yield a (label, feats) pair for every non-blank line of the stream.

		feats is a list of (feature_name, float_value) tuples. The process
		exits with a "Datasource error" message when a feature token cannot
		be split on the separator or its value is not a valid float.
		"""
		sep = self._sep
		for line in self._stream:
			items = line.split()
			if not items:
				# Blank (or whitespace-only) line: there is no label to read.
				continue
			cl = items[0]
			assert cl in [POS_LAB, NEG_LAB], "Unknown class label: %r" % (cl,)
			feats = []
			if self._bias:
				feats.append( (self._bias_prefix, 1.0) ) # implicit bias
			for token in items[1:]:
				try:
					# Split on the LAST separator so that feature names may
					# themselves contain the separator symbol.
					name, value = token.rsplit(sep, 1)
					feats.append( (name, float(value)) )
				except ValueError:
					sys.exit("Datasource error: make sure you use the right datasource format.")
			yield ( cl, feats )

	def _populate_alphabet( self ):
		""" Add every feature found in the stream to the feature alphabet.

		The alphabet is unlocked first, the stream is consumed once and then
		rewound. Progress messages are written to stderr. The process exits
		if the data cannot be parsed or if rewind() raises TypeError.
		"""
		sys.stderr.write("Populating feature alphabet...             ")
		self.unlock_alphabet()
		alphabet = self._feature_alphabet
		if self._stream_type == "generator":
			for i, gen_inst in enumerate(self._stream): # read stream directly
				# Backspace over as many characters as the counter is wide,
				# then write the counter in place.
				count = str(i)
				sys.stderr.write("\b" * len(count) + count)
				for (f,_) in gen_inst.get_featvals():
					alphabet.add(f)
		else:
			try:
				for _label,feats in self._parse():
					for f,_ in feats:
						alphabet.add( f )
			except ValueError:
				sys.exit("Datasource error: make sure you use the right data format.")
		# rewind stream
		try:
			self.rewind()
		except TypeError:
			sys.exit("TypeError: make sure rewind() is used only on files.")
		sys.stderr.write(" done.\n")
		sys.stderr.write("Number of features: %s\n" % alphabet.size())

	def unlock_alphabet( self ):
		""" Unlock the feature alphabet. """
		self._feature_alphabet.unlock()

	def lock_alphabet( self ):
		""" Lock the feature alphabet. """
		self._feature_alphabet.lock()

	def set_alphabet( self, feature_alphabet ):
		""" Replace the feature alphabet used by this source. """
		self._feature_alphabet = feature_alphabet

	def get_alphabet( self ):
		""" Return the feature alphabet used by this source. """
		return self._feature_alphabet

	def get_input( self ):
		""" Yield the raw (label, feats) pairs produced by _parse(). """
		for label,feats in self._parse():
			yield label, feats

	def __iter__( self ):
		""" Instance generator.

		Yields one self._Instance per example. While the feature alphabet is
		unlocked, features seen on the way are added to it. Nothing is
		yielded for a stream type other than "file", "list" or "generator".
		"""
		feature_alphabet = self._feature_alphabet
		assert not (feature_alphabet.empty() and feature_alphabet.locked()), "Feature alphabet is empty!"
		if self._stream_type in ["file","list"]:
			for idx,(label,feats) in enumerate(self._parse()):
				if not feature_alphabet.locked(): # dynamic feature alphabet
					for (f,_) in feats:
						feature_alphabet.add(f)
				yield self._Instance(idx, label, feats, feature_alphabet)
		elif self._stream_type == "generator":
			for idx, gen_inst in enumerate(self._stream): # read stream directly
				featvals = gen_inst.get_featvals()
				label = gen_inst.get_label()
				if not feature_alphabet.locked(): # dynamic feature alphabet
					for (f,_) in featvals:
						feature_alphabet.add(f)
				# Same constructor call as the file/list branch; the former
				# extra `label_alphabet` argument was an undefined name and
				# raised NameError on the first instance.
				yield self._Instance(idx, label, featvals, feature_alphabet)

	def size( self ):
		""" Return the number of items in the stream, then rewind it. """
		# Count lazily instead of materialising the whole stream in a list.
		s = sum(1 for _ in self._stream)
		self.rewind()
		return s