def __init__(self,
                 source_with_start=False, source_with_end=False, source_with_unk=False,
                 target_with_start=False, target_with_end=False, target_with_unk=False,
                 same_length=False):
        """Record the per-side flags and create one SequenceCorpus per side.

        The three ``source_*`` flags are stored on the instance and forwarded
        positionally to the source ``SequenceCorpus``; the three ``target_*``
        flags are handled the same way for the target side.  ``same_length``
        is only stored here.  The list of pairs starts out empty.

        NOTE(review): the flags presumably toggle start/end/unknown markers
        inside SequenceCorpus -- confirm against that class.
        """
        # Pair-level state.
        self.corpus = []
        self.same_length = same_length

        # Source side: remember the flags, then build the corpus from them.
        self.source_with_start = source_with_start
        self.source_with_end = source_with_end
        self.source_with_unk = source_with_unk
        self.source_corpus = SequenceCorpus(
            source_with_start, source_with_end, source_with_unk)

        # Target side: same treatment.
        self.target_with_start = target_with_start
        self.target_with_end = target_with_end
        self.target_with_unk = target_with_unk
        self.target_corpus = SequenceCorpus(
            target_with_start, target_with_end, target_with_unk)
Beispiel #2
0
    def __init__(self,
                 source_with_start=False, source_with_end=False,
                 source_with_unk=False,
                 target_with_start=False, target_with_end=False,
                 target_with_unk=False,
                 same_length=False):
        """Store the configuration flags and set up both per-side corpora.

        Each group of ``*_with_start`` / ``*_with_end`` / ``*_with_unk``
        flags is kept as instance attributes and passed, in that order, to a
        new ``SequenceCorpus`` for the matching side.  ``same_length`` is
        stored unchanged and ``corpus`` begins as an empty list.

        NOTE(review): the meaning of the individual flags is defined by
        SequenceCorpus, which is not visible here -- confirm there.
        """
        # State that belongs to the pair corpus itself.
        self.same_length = same_length
        self.corpus = []

        # Source-side flags and corpus.
        self.source_with_start = source_with_start
        self.source_with_end = source_with_end
        self.source_with_unk = source_with_unk
        source_flags = (source_with_start, source_with_end, source_with_unk)
        self.source_corpus = SequenceCorpus(*source_flags)

        # Target-side flags and corpus.
        self.target_with_start = target_with_start
        self.target_with_end = target_with_end
        self.target_with_unk = target_with_unk
        target_flags = (target_with_start, target_with_end, target_with_unk)
        self.target_corpus = SequenceCorpus(*target_flags)
from neural_machine.tasks.language.common.corpus.segmentor import *
from neural_machine.tasks.language.common.corpus.sequence_corpus import SequenceCorpus
from neural_machine.tasks.language.common.data_reader.bucket_iter import *

import sys

import logging

# Script entry point: builds a training corpus from the file named by
# sys.argv[1] and a validation corpus from sys.argv[2], then wraps each in a
# LanguageModelProblem and a BucketIter.
if __name__ == '__main__':

    # Log everything from DEBUG upwards, prefixed with a timestamp.
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    segmenter = SpaceSegmenter()
    corpus = SequenceCorpus()

    # NOTE(review): the file handle opened here is never closed explicitly.
    corpus.build(open(sys.argv[1], 'r'), segmenter)
    cell_num = corpus.cell_num()

    problem = LanguageModelProblem(corpus)

    batch_size = 32

    data_train = BucketIter(problem, batch_size)

    # The validation corpus is derived from the training corpus via make().
    # NOTE(review): this file handle is not closed explicitly either.
    val_corpus = corpus.make(open(sys.argv[2], 'r'), segmenter)
    val_problem = LanguageModelProblem(val_corpus)
    data_val = BucketIter(val_problem, batch_size)

    # NOTE(review): the call below is cut off in this excerpt; its remaining
    # arguments and closing parenthesis are not visible.
    arch_param = LanguageModelArchParam(num_hidden=200,
from neural_machine.tasks.language.common.corpus.segmentor import *
from neural_machine.tasks.language.common.corpus.sequence_corpus import SequenceCorpus
from neural_machine.tasks.language.common.data_reader.bucket_iter import *

import sys

import logging

# Script entry point: builds a training corpus from the file named by
# sys.argv[1] and a validation corpus from sys.argv[2], then wraps each in a
# LanguageModelProblem and a BucketIter.
if __name__ == '__main__':

    # Log everything from DEBUG upwards, prefixed with a timestamp.
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    segmenter = SpaceSegmenter()
    corpus = SequenceCorpus()

    # Open the input inside a context manager so the handle is closed even
    # if build() raises.
    # NOTE(review): assumes build() reads the file fully before returning --
    # confirm against SequenceCorpus.build.
    with open(sys.argv[1], 'r') as train_file:
        corpus.build(train_file, segmenter)
    cell_num = corpus.cell_num()

    problem = LanguageModelProblem(corpus)

    batch_size = 32

    data_train = BucketIter(problem, batch_size)

    # The validation corpus is derived from the training corpus via make().
    # NOTE(review): same assumption as above, here for make().
    with open(sys.argv[2], 'r') as val_file:
        val_corpus = corpus.make(val_file, segmenter)
    val_problem = LanguageModelProblem(val_corpus)
    data_val = BucketIter(val_problem, batch_size)

class SequencePairCorpus(object):
    """Corpus of (source, target) sequence pairs read from tab-separated lines.

    Each side is delegated to its own ``SequenceCorpus``.  The values those
    corpora return for every accepted line are collected, as 2-tuples, in
    ``self.corpus``.
    """

    def __init__(self,
                 source_with_start=False, source_with_end=False, source_with_unk=False,
                 target_with_start=False, target_with_end=False, target_with_unk=False,
                 same_length=False
                 ):
        """Store the per-side flags and create one SequenceCorpus per side.

        The ``source_*`` / ``target_*`` flags are kept on the instance and
        forwarded positionally to the matching ``SequenceCorpus``.  When
        ``same_length`` is true, lines whose two raw fields differ in length
        are rejected while reading (see ``_iter_pairs``).
        """
        self.source_with_start = source_with_start
        self.source_with_end = source_with_end
        self.source_with_unk = source_with_unk

        self.target_with_start = target_with_start
        self.target_with_end = target_with_end
        self.target_with_unk = target_with_unk

        self.source_corpus = SequenceCorpus(source_with_start, source_with_end, source_with_unk)
        self.target_corpus = SequenceCorpus(target_with_start, target_with_end, target_with_unk)
        self.same_length = same_length

        self.corpus = []

    def _iter_pairs(self, data_file):
        """Yield ``(src_seq, tgt_seq)`` raw fields for every usable line.

        Lines are stripped first; empty lines are skipped silently.  A line
        that does not split into exactly two tab-separated fields, or (with
        ``same_length`` set) whose two raw fields differ in ``len()``, is
        logged as an error and skipped.
        """
        for line in data_file:
            line = line.strip()
            if not line:
                continue

            try:
                src_seq, tgt_seq = line.split('\t')
            except ValueError:
                # Unpacking fails with ValueError when the line has fewer or
                # more than two fields; anything else should propagate.
                logging.error("no sequence pair found in sentence : {0} ".format(json.dumps(line)))
                continue

            # The comparison is on the raw, unsegmented field strings.
            if self.same_length and len(src_seq) != len(tgt_seq):
                logging.error("src and tgt seq not in same length {0} {1} {2}".format(len(src_seq), len(tgt_seq), json.dumps(line)))
                continue

            yield src_seq, tgt_seq

    def build(self, data_file, source_segmenter, target_segmenter):
        """Read pairs from ``data_file`` and add them to this corpus.

        Each accepted field is passed to ``update`` on the corresponding
        per-side corpus together with its segmenter; the two results are
        appended to ``self.corpus`` as a tuple.

        Args:
            data_file: iterable of text lines, one tab-separated pair each.
            source_segmenter: segmenter handed to the source corpus.
            target_segmenter: segmenter handed to the target corpus.
        """
        for src_seq, tgt_seq in self._iter_pairs(data_file):
            src = self.source_corpus.update(src_seq, source_segmenter)
            target = self.target_corpus.update(tgt_seq, target_segmenter)

            self.corpus.append((src, target))

    def make(self, data_file, source_segmenter, target_segmenter):
        """Return a new corpus holding the pairs read from ``data_file``.

        The new corpus is created with this corpus's flags and receives
        clones of both per-side corpora.  Each accepted field is passed to
        ``predict`` on *this* instance's per-side corpus, and the results are
        appended to the new corpus's ``corpus`` list.

        Args:
            data_file: iterable of text lines, one tab-separated pair each.
            source_segmenter: segmenter handed to the source corpus.
            target_segmenter: segmenter handed to the target corpus.

        Returns:
            The newly created ``SequencePairCorpus``.
        """
        corpus = SequencePairCorpus(
            self.source_with_start, self.source_with_end, self.source_with_unk,
            self.target_with_start, self.target_with_end, self.target_with_unk,
            self.same_length
        )

        corpus.source_corpus = self.source_corpus.clone()
        corpus.target_corpus = self.target_corpus.clone()

        for src_seq, tgt_seq in self._iter_pairs(data_file):
            src = self.source_corpus.predict(src_seq, source_segmenter)
            target = self.target_corpus.predict(tgt_seq, target_segmenter)

            corpus.corpus.append((src, target))

        return corpus

    def source_cell_num(self):
        """Return ``cell_num()`` of the source-side corpus."""
        return self.source_corpus.cell_num()

    def target_cell_num(self):
        """Return ``cell_num()`` of the target-side corpus."""
        return self.target_corpus.cell_num()

    def corpus_size(self):
        """Return the number of stored pairs."""
        return len(self.corpus)
Beispiel #6
0
class SequencePairCorpus(object):
    """Corpus of (source, target) sequence pairs read from tab-separated lines.

    Each side is delegated to its own ``SequenceCorpus``.  The values those
    corpora return for every accepted line are collected, as 2-tuples, in
    ``self.corpus``.
    """

    def __init__(self,
                 source_with_start=False,
                 source_with_end=False,
                 source_with_unk=False,
                 target_with_start=False,
                 target_with_end=False,
                 target_with_unk=False,
                 same_length=False):
        """Store the per-side flags and create one SequenceCorpus per side.

        The ``source_*`` / ``target_*`` flags are kept on the instance and
        forwarded positionally to the matching ``SequenceCorpus``.  When
        ``same_length`` is true, lines whose two raw fields differ in length
        are rejected while reading (see ``_iter_pairs``).
        """
        self.source_with_start = source_with_start
        self.source_with_end = source_with_end
        self.source_with_unk = source_with_unk

        self.target_with_start = target_with_start
        self.target_with_end = target_with_end
        self.target_with_unk = target_with_unk

        self.source_corpus = SequenceCorpus(source_with_start, source_with_end,
                                            source_with_unk)
        self.target_corpus = SequenceCorpus(target_with_start, target_with_end,
                                            target_with_unk)
        self.same_length = same_length

        self.corpus = []

    def _iter_pairs(self, data_file):
        """Yield ``(src_seq, tgt_seq)`` raw fields for every usable line.

        Lines are stripped first; empty lines are skipped silently.  A line
        that does not split into exactly two tab-separated fields, or (with
        ``same_length`` set) whose two raw fields differ in ``len()``, is
        logged as an error and skipped.
        """
        for line in data_file:
            line = line.strip()
            if not line:
                continue

            try:
                src_seq, tgt_seq = line.split('\t')
            except ValueError:
                # Unpacking fails with ValueError when the line has fewer or
                # more than two fields; anything else should propagate.
                logging.error(
                    "no sequence pair found in sentence : {0} ".format(
                        json.dumps(line)))
                continue

            # The comparison is on the raw, unsegmented field strings.
            if self.same_length and len(src_seq) != len(tgt_seq):
                logging.error(
                    "src and tgt seq not in same length {0} {1} {2}".format(
                        len(src_seq), len(tgt_seq), json.dumps(line)))
                continue

            yield src_seq, tgt_seq

    def build(self, data_file, source_segmenter, target_segmenter):
        """Read pairs from ``data_file`` and add them to this corpus.

        Each accepted field is passed to ``update`` on the corresponding
        per-side corpus together with its segmenter; the two results are
        appended to ``self.corpus`` as a tuple.

        Args:
            data_file: iterable of text lines, one tab-separated pair each.
            source_segmenter: segmenter handed to the source corpus.
            target_segmenter: segmenter handed to the target corpus.
        """
        for src_seq, tgt_seq in self._iter_pairs(data_file):
            src = self.source_corpus.update(src_seq, source_segmenter)
            target = self.target_corpus.update(tgt_seq, target_segmenter)

            self.corpus.append((src, target))

    def make(self, data_file, source_segmenter, target_segmenter):
        """Return a new corpus holding the pairs read from ``data_file``.

        The new corpus is created with this corpus's flags and receives
        clones of both per-side corpora.  Each accepted field is passed to
        ``predict`` on *this* instance's per-side corpus, and the results are
        appended to the new corpus's ``corpus`` list.

        Args:
            data_file: iterable of text lines, one tab-separated pair each.
            source_segmenter: segmenter handed to the source corpus.
            target_segmenter: segmenter handed to the target corpus.

        Returns:
            The newly created ``SequencePairCorpus``.
        """
        corpus = SequencePairCorpus(self.source_with_start,
                                    self.source_with_end, self.source_with_unk,
                                    self.target_with_start,
                                    self.target_with_end, self.target_with_unk,
                                    self.same_length)

        corpus.source_corpus = self.source_corpus.clone()
        corpus.target_corpus = self.target_corpus.clone()

        for src_seq, tgt_seq in self._iter_pairs(data_file):
            src = self.source_corpus.predict(src_seq, source_segmenter)
            target = self.target_corpus.predict(tgt_seq, target_segmenter)

            corpus.corpus.append((src, target))

        return corpus

    def source_cell_num(self):
        """Return ``cell_num()`` of the source-side corpus."""
        return self.source_corpus.cell_num()

    def target_cell_num(self):
        """Return ``cell_num()`` of the target-side corpus."""
        return self.target_corpus.cell_num()

    def corpus_size(self):
        """Return the number of stored pairs."""
        return len(self.corpus)