Ejemplo n.º 1
0
    def test_extra_encoding(self):
        """Check that a second, different string encoded after a first one yields 1."""
        codec = WordCodec()
        first_input = "this is the string the"
        second_input = "more to the encoding"

        # The result of the first call is discarded; only the second is checked.
        codec.encode(first_input)

        assert codec.encode(second_input) == 1
Ejemplo n.º 2
0
    def __init__(self, file_path: str = None):
        """Create the codecs and run every (text, label) row through them.

        Args:
            file_path: Value stored as ``self.file_path`` and handed to
                ``text_label_from_file`` to obtain the rows.
        """
        self.textCodec = TextCodec()
        self.labelCodec = WordCodec()
        self.file_path = file_path

        # build the word space by encoding everything
        for row in self.text_label_from_file(file_path):
            row_text, row_label = row
            self.textCodec.encode(row_text)
            self.labelCodec.encode(row_label)

        # One identity-matrix row per entry in the label codec's encode_map.
        label_count = len(self.labelCodec.encode_map)
        self.matrix_iden = numpy.identity(label_count)
Ejemplo n.º 3
0
class AirbnbExample(DataSource):
    """Data source reading (text, label) pairs from a two-column CSV file.

    Texts are turned into fixed-length sequences of word codes and labels
    into rows of an identity matrix (one row per known label).
    """

    # Every text representation is truncated or zero-padded to this length.
    MAX_LEN = 128

    def __init__(self, file_path: str = None):
        """Create the codecs and pre-encode every row of the CSV file.

        Args:
            file_path: Path of the CSV file; stored as ``self.file_path``.
        """
        self.file_path = file_path
        self.textCodec = TextCodec()
        self.labelCodec = WordCodec()

        # build the word space by encoding everything
        for text, label in self.text_label_from_file(file_path):
            self.textCodec.encode(text)
            self.labelCodec.encode(label)

        # One identity row per entry in the label codec's encode_map.
        self.matrix_iden = numpy.identity(len(self.labelCodec.encode_map))

    def labels(self):
        """Yield the representation of each label, in file order."""
        for _, label in self.text_label_from_file(self.file_path):
            yield self.label_representation(label)

    def texts(self):
        """Yield the representation of each text, in file order."""
        for text, _ in self.text_label_from_file(self.file_path):
            yield self.text_representation(text)

    def label_representation(self, label: str) -> numpy.ndarray:
        """Return the identity-matrix row selected by the label's code.

        NOTE(review): presumably raises IndexError for a label that was not
        seen while building ``matrix_iden`` -- confirm against WordCodec.
        """
        encoded = self.labelCodec.encode(label)
        return self.matrix_iden[encoded]

    def text_representation(self, text: str) -> numpy.ndarray:
        """Return the encoded text as an array of exactly ``MAX_LEN`` codes.

        Longer encodings are truncated; shorter ones are right-padded with
        zeros. An ndarray is returned in both cases (the truncation branch
        used to hand back whatever sequence type the codec produced).
        """
        codes = numpy.asarray(self.textCodec.encode(text)[:self.MAX_LEN])
        # The pad width is 0 when the text already fills MAX_LEN.
        return numpy.pad(codes, (0, self.MAX_LEN - len(codes)))

    def text_label_from_file(self, file_path):
        """Yield ``(text, label)`` tuples from a two-column CSV file.

        Args:
            file_path: CSV file to read. ``None`` falls back to
                ``self.file_path``.

        Raises:
            ValueError: If a non-empty row does not have exactly two columns.
        """
        path = self.file_path if file_path is None else file_path
        # newline='' lets the csv module handle line endings itself, so
        # quoted fields containing newlines are parsed correctly.
        with open(path, newline='') as csv_file:
            for row in csv.reader(csv_file):
                if not row:
                    # csv.reader yields [] for blank lines; nothing to emit.
                    continue
                text, label = row
                yield (text, label)

    def read_prediction(self, single_prediction: numpy.ndarray):
        """Return ``(decoded_label, score)`` for the highest-scoring entry.

        Args:
            single_prediction: Array of scores; the index of its maximum is
                decoded through the label codec.
        """
        max_val = single_prediction.max()
        max_index = numpy.argmax(single_prediction)

        return (self.labelCodec.decode(max_index), max_val)
Ejemplo n.º 4
0
class TextCodec(codecs.Codec):
    """
    Encodes a string of distinct words

    Encodes a string by splitting it on single spaces and mapping every
    word to the code returned by an internal ``WordCodec``.
    """
    PADDING = "<PAD>"

    def __init__(self):
        self.word_codec = WordCodec()
        # Encode the padding token first so that it takes the first code
        # (0 is reserved for padding).
        self.word_codec.encode(self.PADDING)

    def encode(self, input, errors='strict') -> list:
        """Return the list of word codes for ``input``.

        Args:
            input: Text to encode. It is split on single spaces, so
                consecutive spaces produce empty-string words.
            errors: Accepted for signature compatibility; not used.

        Returns:
            One code per word, in order of appearance.
        """
        return [self.word_codec.encode(word) for word in input.split(' ')]

    def decode(self, input, errors='strict') -> str:
        """Return the space-joined words for an iterable of codes.

        Args:
            input: Iterable of codes previously produced by ``encode``.
            errors: Accepted for signature compatibility; not used.
        """
        return ' '.join(self.word_codec.decode(number) for number in input)
Ejemplo n.º 5
0
 def test_initial_encoding(self):
     """Check that the first string given to a new WordCodec encodes to 0."""
     codec = WordCodec()
     first_code = codec.encode("this is the string the")
     assert first_code == 0
Ejemplo n.º 6
0
 def test_initial_decoding(self):
     """Check that decoding an encoded string gives back the original string."""
     original = "this is the string the"
     codec = WordCodec()
     # Round trip: encode, then immediately decode the resulting code.
     assert codec.decode(codec.encode(original)) == original
Ejemplo n.º 7
0
 def __init__(self):
     """Create the word codec and encode the padding token before anything else."""
     codec = WordCodec()
     codec.encode(self.PADDING)  # 0 is reserved for padding
     self.word_codec = codec