Python Pipeline.preprocess_str Examples

Programming Language: Python

Namespace/Package Name: pangeamt_toolkit

Class/Type: Pipeline

Method/Function: preprocess_str

Examples at hotexamples.com: 3

Python Pipeline.preprocess_str - 3 examples found. These are the top-rated real-world Python examples of pangeamt_toolkit.Pipeline.preprocess_str, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Pipeline(4)

postprocess_str(3)

preprocess_str(3)

Example #1

Show file

class Engine:
    """Drive training and translation of a model through two pipelines.

    The configuration is read from ``config.json`` in the current working
    directory.  Two pipelines are built from it: a source pipeline (created
    with both the source and target language) and a target pipeline (created
    with the target language only).
    """

    # Number of segments handed to the model in a single ``translate`` call.
    _BATCH_SIZE = 30

    def __init__(self):
        with open('config.json', 'r') as file:
            self._config = json.load(file)

        self._src_pipeline = Pipeline(self._config['pipeline_config'],
                                      self._config['src_lang'],
                                      self._config['tgt_lang'])

        self._tgt_pipeline = Pipeline(self._config['pipeline_config_tgt'],
                                      self._config['tgt_lang'])

    def train(self, p, src_text, tgt_text):
        """Train ``p`` on every aligned source/target pair and return it.

        Args:
            p: Model object exposing ``train(src, tgt)``.
            src_text: Indexable, sized collection of source segments.
            tgt_text: Indexable collection of target segments, aligned with
                ``src_text`` by position.

        Returns:
            The same ``p`` object, after training.
        """
        # Index-based access is kept on purpose so that any container that
        # supports ``len()`` and ``[]`` keeps working exactly as before.
        for i in range(len(src_text)):
            if (i + 1) % 10 == 0:
                print(f'Trained with {i+1} segments.')

            src_prep = self._src_pipeline.preprocess_str(src_text[i])
            tgt_prep = self._tgt_pipeline.preprocess_str(tgt_text[i])

            p.train(src_prep, tgt_prep)

        return p

    def translate(self, p, src_text, tgt_text):
        """Translate ``src_text`` in batches and return the post-processed output.

        Args:
            p: Model object exposing ``translate(list_of_segments)``; each
                returned item must have a ``tgt`` attribute holding an
                iterable of tokens.
            src_text: Sliceable, sized collection of source segments.
            tgt_text: Unused; kept so existing callers do not break.

        Returns:
            A list with one post-processed translation per source segment,
            in the original order.
        """
        translations_post = []
        total = len(src_text)
        for start in range(0, total, self._BATCH_SIZE):
            # Slicing already clamps at the end of the sequence, so the last
            # (possibly shorter) batch needs no special case.
            segs = src_text[start:start + self._BATCH_SIZE]
            segs_prep = [self._src_pipeline.preprocess_str(seg)
                         for seg in segs]

            for trans in p.translate(segs_prep):
                tgt = ' '.join(trans.tgt)
                # Post-processing goes through the source pipeline, which is
                # the one that was built with both languages.
                translations_post.append(
                    self._src_pipeline.postprocess_str(tgt))

            # Report progress once per batch.  The previous check,
            # ``(i + 1) % 30 == 0`` with ``i`` stepping by 30, was never true.
            done = min(start + self._BATCH_SIZE, total)
            print(f'Translated {done} segments.')
        return translations_post

Example #2

Show file

File: pangeamt_find_best_learning.py Project: Pangeamt/pangeamt_toolkit

class Engine:
    """Train and run a model on files under ``data/`` using two pipelines.

    ``config.json`` (in the current working directory) supplies the pipeline
    settings and languages.  ``in_file`` and ``ref_file`` are file names that
    are resolved relative to the ``data`` directory.
    """

    def __init__(self, in_file, ref_file):
        with open('config.json', 'r') as config_handle:
            self._config = json.load(config_handle)

        config = self._config
        self._src_pipeline = Pipeline(
            config['pipeline_config'],
            config['src_lang'],
            config['tgt_lang'],
        )
        self._tgt_pipeline = Pipeline(
            config['pipeline_config_tgt'],
            config['tgt_lang'],
        )

        self._in_file = os.path.join('data', in_file)
        self._ref_file = os.path.join('data', ref_file)

    def train(self):
        """Create a ``Pangeanmt`` model, train it line by line and return it.

        Each line of the input file is paired with the next line read from
        the reference file; both are pre-processed before being passed to
        the model's ``train`` method.
        """
        model = Pangeanmt('.')

        with open(self._in_file, 'r') as sources, \
                open(self._ref_file, 'r') as references:
            for source_line in sources:
                src = self._src_pipeline.preprocess_str(source_line)

                # The reference file is advanced in lockstep with the input.
                reference_line = references.readline()
                tgt = self._tgt_pipeline.preprocess_str(reference_line)

                model.train(src, tgt)
        return model

    def translate_file(self, p, output_file):
        """Translate the input file one line at a time into ``output_file``.

        Every input line is pre-processed, translated on its own by ``p``,
        joined with spaces, post-processed with the source pipeline, and
        written followed by a newline.
        """
        with open(self._in_file, 'r') as reader, \
                open(output_file, 'w+') as writer:
            for line in reader:
                prepared = self._src_pipeline.preprocess_str(line)
                first_result = p.translate([prepared])[0]
                joined = ' '.join(first_result.tgt)
                tgt = self._src_pipeline.postprocess_str(joined)
                writer.write(f'{tgt}\n')

    def gen_config(self, alpha):
        """Archive the current ``config.json`` and write one using ``alpha``.

        The existing file is renamed to ``<old learning rate>_config.json``;
        the in-memory configuration then gets ``alpha`` as its learning rate
        and is written out as the new ``config.json``.
        """
        opts = self._config['opts']
        previous_lr = opts['learning_rate']

        os.rename('config.json', f'{previous_lr}_config.json')

        opts['learning_rate'] = alpha

        with open('config.json', 'w+') as config_file:
            config_file.write(json.dumps(self._config))

Example #3

Show file

class Engine:
    """Train and query a model from a two-column table of segments.

    The configuration is read from ``config.json`` in the current working
    directory.  Tables passed to the methods must expose ``shape`` and
    ``iat`` (as a pandas ``DataFrame`` does); column 0 is read as the source
    segment and column 1 as the target segment.
    """

    def __init__(self):
        with open("config.json", "r") as file:
            self._config = json.load(file)

        self._src_pipeline = Pipeline(
            self._config["pipeline_config"],
            self._config["src_lang"],
            self._config["tgt_lang"],
        )

        self._tgt_pipeline = Pipeline(self._config["pipeline_config_tgt"],
                                      self._config["tgt_lang"])

    def train_from_table(self, p, table):
        """Train ``p`` on every row of ``table``.

        Args:
            p: Model object exposing ``train(src, tgt)``.
            table: Table whose column 0 holds source segments and column 1
                holds target segments.

        Returns:
            A tuple ``(p, res)`` where ``res`` maps ``"src_prep"`` and
            ``"tgt_prep"`` to the lists of pre-processed segments, in row
            order.
        """
        res = {"src_prep": [], "tgt_prep": []}
        # shape[0] is the number of rows of the table.
        for i in range(table.shape[0]):
            if (i + 1) % 10 == 0:
                print(f"Trained with {i+1} segments.")
            # iat gives positional access to the cell at [row, column].
            src = table.iat[i, 0]
            tgt = table.iat[i, 1]

            src_prep = self._src_pipeline.preprocess_str(src)
            tgt_prep = self._tgt_pipeline.preprocess_str(tgt)

            res["src_prep"].append(src_prep)
            res["tgt_prep"].append(tgt_prep)

            p.train(src_prep, tgt_prep)

        return p, res

    def no_train_translate_from_table(self, p, table):
        """Translate column 0 of ``table`` and return ``{"original": [...]}``."""
        return {"original": self._translate_table(p, table)}

    def translate_from_table(self, p, table, j):
        """Translate column 0 of ``table`` and return ``{f"tgts_{j}": [...]}``."""
        return {f"tgts_{j}": self._translate_table(p, table)}

    def _translate_table(self, p, table):
        """Return the post-processed translation of each row's source segment.

        Shared by both public translate methods, which previously carried
        identical copies of this loop.  Each segment is pre-processed,
        translated on its own by ``p``, joined with spaces and post-processed
        with the source pipeline.
        """
        translated = []
        # shape[0] is the number of rows of the table.
        for i in range(table.shape[0]):
            if (i + 1) % 10 == 0:
                print(f"Translated {i+1} segments.")
            # iat gives positional access to the cell at [row, column].
            seg = table.iat[i, 0]

            seg_prep = self._src_pipeline.preprocess_str(seg)

            translation = p.translate([seg_prep])

            tgt = " ".join(translation[0].tgt)
            translated.append(self._src_pipeline.postprocess_str(tgt))
        return translated