Example #1
    def encode_advertise(self, advertise):
        x_char, x_word, y_tag = [], [], []
        char2idx = self.maps["char2idx"]
        word2idx = self.maps["word2idx"]
        tag2idx = self.maps["tag2idx"]

        terms = advertise["NER"]
        terms_words = [token[0] for token in terms]
        terms_tags = [token[1] for token in terms]

        tmp_seq_words = self.pad_term_sequence(terms_words, max_len=self.seq_max_len)
        tmp_seq_tags = self.pad_term_sequence(terms_tags, max_len=self.seq_max_len)

        w_rep, word2idx = self.build_word_representations(tmp_seq_words, word2idx)
        t_rep, tag2idx = self.build_word_representations(tmp_seq_tags, tag2idx)

        x_word.append(pad_sequences(maxlen=self.seq_max_len, sequences=[w_rep], value=word2idx["__PAD__"],
                                    padding='post', truncating='post').tolist())
        y_tag.append(pad_sequences(maxlen=self.seq_max_len, sequences=[t_rep], value=tag2idx["__PAD__"],
                                   padding='post', truncating='post').tolist())

        representation, char2idx = self.build_char_representations(tmp_seq_words, char2idx)
        x_char.append(representation)

        self.save_encoded_data(x_word, x_char, y_tag)

        # Update maps
        if self.update_maps:
            self.maps["char2idx"] = char2idx
            self.maps["word2idx"] = word2idx
            self.maps["tag2idx"] = tag2idx

        self.advertise_counter += 1
        sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
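
The examples on this page call helper methods (pad_term_sequence, build_word_representations, build_char_representations) that are not shown here. Below is a minimal sketch of what pad_term_sequence and build_word_representations could look like, inferred only from how they are called above; the "__PAD__" sentinel comes from the pad_sequences calls, everything else is an assumption.

    def pad_term_sequence(self, terms, max_len):
        # Assumed behaviour: truncate to max_len and right-pad with the
        # "__PAD__" sentinel so every sequence has the same length.
        terms = list(terms)[:max_len]
        return terms + ["__PAD__"] * (max_len - len(terms))

    def build_word_representations(self, terms, word2idx):
        # Assumed behaviour: map each term to its integer index, growing the
        # vocabulary on the fly, and return the indices plus the updated map.
        indices = []
        for term in terms:
            if term not in word2idx:
                word2idx[term] = len(word2idx)
            indices.append(word2idx[term])
        return indices, word2idx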
Example #2
    def encode_advertise(self, advertise):
        x_char, x_word, y_price = [], [], []
        char2idx = self.maps["char2idx"]
        word2idx = self.maps["word2idx"]

        terms = advertise["clean_text"]
        tmp_seq = self.pad_term_sequence(self.tokenize_sentence(terms), max_len=self.seq_max_len)

        w_rep, word2idx = self.build_word_representations(tmp_seq, word2idx)
        x_word.append(pad_sequences(maxlen=self.seq_max_len, sequences=[w_rep], value=word2idx["__PAD__"],
                                    padding='post', truncating='post').tolist())
        representation, char2idx = self.build_char_representations(tmp_seq, char2idx)
        x_char.append(representation)
        y_price.append(np.log(float(advertise["price"])))

        # Encode the same ad again with title and detail in swapped (inverted) order
        terms = advertise["clean_text_invert"]
        tmp_seq = self.pad_term_sequence(self.tokenize_sentence(terms), max_len=self.seq_max_len)

        w_rep, word2idx = self.build_word_representations(tmp_seq, word2idx)
        x_word.append(pad_sequences(maxlen=self.seq_max_len, sequences=[w_rep], value=word2idx["__PAD__"],
                                    padding='post', truncating='post').tolist())
        representation, char2idx = self.build_char_representations(tmp_seq, char2idx)
        x_char.append(representation)
        y_price.append(np.log(float(advertise["price"])))

        self.save_encoded_data(x_word, x_char, y_price)

        # Update maps
        if self.update_maps:
            self.maps["char2idx"] = char2idx
            self.maps["word2idx"] = word2idx

        self.advertise_counter += 1
        sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
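
build_char_representations is used the same way in Examples #1 and #2. A sketch under the assumption that every word is encoded as a fixed-length list of character indices; the word_max_len attribute is hypothetical and not part of the original code.

    def build_char_representations(self, terms, char2idx):
        # Assumed behaviour: encode each term as a list of character indices,
        # growing char2idx on the fly; word_max_len is a hypothetical attribute.
        pad_idx = char2idx.setdefault("__PAD__", len(char2idx))
        encoded = []
        for term in terms:
            chars = [char2idx.setdefault(ch, len(char2idx))
                     for ch in str(term)[:self.word_max_len]]
            chars += [pad_idx] * (self.word_max_len - len(chars))
            encoded.append(chars)
        return encoded, char2idx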
Example #3
    def save_encoded_data(self, *data):
        if self.model_folder:
            dataset_name = "dataset.jsonl"
            dataset_path = os.path.join(self.model_folder, dataset_name)

            with open(dataset_path, "a", encoding="utf-8") as js:
                x_word, x_char, y_price = data
                for word, char, price in zip(x_word, x_char, y_price):
                    # The target value (tags or log price) is stored under the generic "y_tag" key.
                    js.write(json.dumps({"x_word": word, "x_char": char, "y_tag": price}) + "\n")
                    self.processed_counter += 1
                    sc.get_notice(self.processed_counter, msg_text="training obs processed!")
        else:
            # TODO: implement saving in DataStorage
            raise NotImplementedError("Saving outside a local folder path is not implemented yet!")
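
save_encoded_data appends one JSON object per training observation to dataset.jsonl. A minimal sketch of reading such a file back, assuming the same keys used when writing; load_encoded_data is a hypothetical helper, not part of the original class.

import json

def load_encoded_data(dataset_path):
    # Read the JSON-lines file written by save_encoded_data above.
    x_word, x_char, y = [], [], []
    with open(dataset_path, encoding="utf-8") as js:
        for line in js:
            obs = json.loads(line)
            x_word.append(obs["x_word"])
            x_char.append(obs["x_char"])
            y.append(obs["y_tag"])
    return x_word, x_char, y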
Example #4
    def encode_advertise(self, advertise):
        char2idx = self.maps["char2idx"]
        word2idx = self.maps["word2idx"]

        terms = advertise["clean_text"]
        tmp_seq = self.pad_term_sequence(self.tokenize_sentence(terms),
                                         max_len=self.seq_max_len)

        word2idx = self.build_word_representations(tmp_seq, word2idx)
        char2idx = self.build_char_representations(tmp_seq, char2idx)

        # Update maps
        self.maps["char2idx"] = char2idx
        self.maps["word2idx"] = word2idx
        self.advertise_counter += 1
        sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
Example #5
    def encode_advertise(self, advertise):
        char2idx = self.maps["char2idx"]
        word2idx = self.maps["word2idx"]
        tag2idx = self.maps["tag2idx"]

        terms = advertise["NER"]
        terms_words = [token[0] for token in terms]
        terms_tags = [token[1] for token in terms]

        if self.debug:
            print(terms_words)
            print(terms_tags)

        word2idx = self.build_word_representations(terms_words, word2idx)
        char2idx = self.build_char_representations(terms_words, char2idx)
        tag2idx = self.build_tag_representations(terms_tags, tag2idx)

        # Update maps
        self.maps["char2idx"] = char2idx
        self.maps["word2idx"] = word2idx
        self.maps["tag2idx"] = tag2idx
        self.advertise_counter += 1
        sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
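
In Examples #4 and #5 the build_*_representations helpers return only the updated index map, so they apparently serve as a vocabulary-building first pass (in Examples #1 and #2 they also return the encoded sequence). A minimal sketch of such a vocabulary-only build_tag_representations, under that assumption:

    def build_tag_representations(self, tags, tag2idx):
        # Assumed behaviour for the vocabulary pass: register every unseen
        # tag and return only the updated map.
        for tag in tags:
            if tag not in tag2idx:
                tag2idx[tag] = len(tag2idx)
        return tag2idx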
Example #6
    def encode_advertise(self, advertise):
        self.update_schema_dist(advertise)
        self.advertise_counter += 1
        sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")