Code example #1
    def process(self, filename_read, filename_save, header_string, footer_string, sentence_max_len = 15):
        """Read a raw text file, clean and tokenize it, build a Vocabulary,
        and write the padded token stream to *filename_save*.

        Parameters
        ----------
        filename_read : str
            Path of the raw input file.
        filename_save : str
            Path the processed, PAD-aligned token stream is written to.
        header_string : str
            Stored on the instance (presumably consumed by ``dropheader``
            — TODO confirm).
        footer_string : str
            Marker; processing stops at the first row containing it.
        sentence_max_len : int, optional
            Soft cap: a "conditional" sentence end only terminates the
            sentence once it is longer than this many words.
        """
        self.header_string = header_string

        rows = self.readfile(filename_read)
        rows = self.dropheader(rows)

        # Concatenate all rows (lower-cased, with self.translation applied)
        # into a single one-element string array.
        # FIX: np.str was removed in NumPy 1.24 — the equivalent dtype is
        # the builtin str.
        result = np.asarray([' '], dtype=str)
        for row in rows:
            # Stop at the footer marker.
            if row.find(footer_string) != -1:
                break
            row = row.lower().translate(self.translation)
            result = np.char.add(result, np.asarray(row, dtype=str))

        # Split the concatenated text into words and drop stop words.
        self.text = np.char.split(result).tolist()[0]
        self.text = [w for w in self.text if not self.is_stopword(w)]

        # Replace each word with its numeric form.  The length guard is kept
        # in case convert2number shrinks self.text — TODO confirm whether it
        # mutates the list.
        for x in range(len(self.text)):
            if x >= len(self.text):
                break
            self.text[x] = self.convert2number(x, 0)

        result = np.asarray(self.text)

        # Lemmatize via the stem_words mapping (falling back to the word
        # itself) and expand contractions, re-splitting into tokens.
        text_lem = np.asarray([' '], dtype=str)
        for item in result:
            try:
                y = self.stem_words[item]
            except KeyError:  # no stem known for this word — keep it as-is
                y = item
            y = np.char.split(self.expand_contractions(y))
            text_lem = np.append(text_lem, y.tolist())

        voc = Vocabulary('vocabulary')

        sentence = []
        text_out = [voc.to_token(self.SOS_token)]
        sen_length = 0

        for word in text_lem:
            if word == " ":
                continue
            word_cleaned = word.translate(self.trans_punctuation)
            sentence.append(word_cleaned)
            sen_length += 1
            # End the sentence on a hard terminator, or on a conditional one
            # once the sentence has grown past sentence_max_len.
            if self.is_sentence_end(word) \
                    or (self.is_sentence_conditional_end(word, word_cleaned) and sen_length > sentence_max_len):
                if word_cleaned != "and":
                    text_out.append(word_cleaned)

                sentence_str = ' '.join(sentence).replace(" and ","")
                voc.add_chunk(sentence_str)
                text_out.append(voc.to_token(self.EOS_token))
                text_out.append(voc.to_token(self.SOS_token))
                sentence.clear()
                sen_length = 0
            elif word_cleaned != "and":  # "and" tokens are dropped everywhere
                text_out.append(word_cleaned)

        # The trailing SOS token (appended after the final EOS) becomes the
        # file's newline terminator.
        text_out[-1] = "\n"

        # NOTE(review): save_data writes to the same path that is reopened
        # with "w" (truncating) just below — confirm this is intended.
        voc.save_data(filename_save)

        # Write the token stream; before each EOS marker, pad the sentence
        # with PAD tokens up to the longest sentence length.
        # FIX: use a context manager so the handle is closed even on error.
        x = 0
        with open(filename_save, "w") as file_proc:
            for token in text_out:
                if token == voc.to_token(self.EOS_token):
                    for _ in range(voc.count_longest_sentence() - x):
                        file_proc.write(voc.to_token(self.PAD_token) + " ")
                    x = 0
                file_proc.write(token + " ")
                x += 1