Code Example #1
    def __init__(self, unsuper_file, semisuper_file, acw):
        self.io = morfessor.MorfessorIO()
        self.acw = acw  # weight given to the annotated corpus

        # build models
        self.unsupervised = self.io.read_binary_model_file(unsuper_file)

        # the semi-supervised model starts from the same unsupervised
        # binary model and is then refined with the annotations below
        self.semisupervised = self.io.read_binary_model_file(unsuper_file)
        a = self.io.read_annotations_file(semisuper_file)
        annotations = {}
        for word, word_hypotheses in a.items():
            # each hypothesis is a sequence of morphemes; join with spaces
            annotations[word] = [" ".join(h) for h in word_hypotheses]
        self.semisupervised.set_annotations(annotations, acw)

        # write segmentation
        with open('models/bible-segmentation', 'w') as f:
            for hypotheses in annotations.values():
                f.write(", ".join(hypotheses) + "\n")
        self.segs = self.io.read_segmentation_file('models/bible-segmentation',
                                                   has_counts=False)
Code Example #2
import math

import morfessor


def da_trainer(datapath):
    io = morfessor.MorfessorIO()

    train_data = list(io.read_corpus_file(datapath))

    # three models, one per frequency-dampening scheme
    model_types = morfessor.BaselineModel()
    model_logtokens = morfessor.BaselineModel()
    model_tokens = morfessor.BaselineModel()

    # types: every distinct word counts once
    model_types.load_data(train_data, count_modifier=lambda x: 1)

    # logtokens: dampen raw counts logarithmically
    def log_func(x):
        return int(round(math.log(x + 1, 2)))

    model_logtokens.load_data(train_data, count_modifier=log_func)
    # tokens: use raw token counts (the default)
    model_tokens.load_data(train_data)

    models = [model_types, model_logtokens, model_tokens]

    for i, model in enumerate(models):
        model.train_batch()
        io.write_binary_model_file("model" + str(i), model)
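
The three dampening schemes often produce different segmentations for the same word. Below is a minimal comparison sketch, assuming da_trainer has been run; the corpus path and the test word are hypothetical placeholders:

io = morfessor.MorfessorIO()
da_trainer("corpus.txt")  # hypothetical corpus file
for i, name in enumerate(["types", "logtokens", "tokens"]):
    # reload each model written by da_trainer and segment the same word
    model = io.read_binary_model_file("model" + str(i))
    print(name, model.viterbi_segment("uncopyrightable")[0])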
Code Example #3
File: train_morphseg.py Project: psmit/char-fin-2017
import collections
import lzma
import os
import sys

import morfessor


def main(d):
    parse_name(d)  # project-specific helper defined elsewhere in the repo

    # count word frequencies over every .xz corpus file in the parent dir
    word_count = collections.Counter()
    parent_dir = os.path.dirname(d)
    for f in os.listdir(parent_dir):
        if f.endswith(".xz"):
            for line in lzma.open(os.path.join(parent_dir, f), 'rt', encoding='utf-8'):
                for word in line.strip().split():
                    word_count[word] += 1
    print("Corpora read", file=sys.stderr)
    allowed_chars = {
        line.strip()
        for line in open(os.path.join(parent_dir, 'allowed_chars'), encoding='utf-8')
        if len(line.strip()) == 1
    }

    model = morfessor.MorfessorIO().read_any_model(os.path.join(d, 'model.bin'))

    # segment every word; morphs containing disallowed characters become <UNK>
    s = set()
    with open(os.path.join(d, 'wordmap'), 'w', encoding='utf-8') as outf:
        for k in word_count.keys():
            parts = model.viterbi_segment(k)[0]
            rparts = []
            for p in parts:
                if not all(c in allowed_chars for c in p):
                    p = '<UNK>'
                s.add(p)
                rparts.append(p)
            print("{}\t{}".format(k, " ".join(rparts)), file=outf)

    with open(os.path.join(d, 'vocab2'), 'w', encoding='utf-8') as outf:
        for morph in s:
            print(morph, file=outf)
Code Example #4
File: segadv.py Project: Hugo291/wixnlp-master
    def __init__(self, infile, outfile, modelfile, dicfile, wixlm="wixgrams.pickle", eslm="esgrams.pickle"):
        # Collect data for the classification
        dicwix = open(dicfile, "r").read()
        dic = set(dicwix.split(" \n"))
        self.dicw = list(dic)

        self.F = open(infile, "r")
        self.corp = []

        # character n-gram language models for Wixarika and Spanish
        with open(wixlm, 'rb') as f:
            self.wixngrams = pickle.load(f)

        with open(eslm, 'rb') as f:
            self.esngrams = pickle.load(f)

        self.punct = ".,;:\"{}[]()$%&/¿?¡!-"

        self.io = morfessor.MorfessorIO()

        self.model = self.io.read_binary_model_file(modelfile)
        self.inF = open(infile, "r")
        self.outF = open(outfile, "w")

        # Statistics
        self.nonsegwords = 0
        self.eswords = 0
        self.segwords = 0
Code Example #5
File: flatcat_test.py Project: sjmielke/flatcat
def _load_baseline():
    baseline = morfessor.BaselineModel()
    io = morfessor.MorfessorIO(encoding='latin-1')

    baseline.load_segmentations(
        io.read_segmentation_file(REFERENCE_BASELINE_SEGMENTATION))
    return baseline
Code Example #6
import collections
import lzma
import os
import sys

import morfessor


def main(d):
    parse_name(d)  # project-specific helper defined elsewhere in the repo

    # count word frequencies over every .xz corpus file next to d
    word_count = collections.Counter()
    print(d)
    seg_dir = os.path.dirname(d)
    print("seg_dir {}".format(seg_dir))
    for f in os.listdir(seg_dir):
        if f.endswith(".xz"):
            print(f)
            for line in lzma.open(os.path.join(seg_dir, f), 'rt', encoding='utf-8'):
                for word in line.strip().split():
                    word_count[word] += 1
    print("Corpora read", file=sys.stderr)

    model = morfessor.MorfessorIO().read_any_model(os.path.join(d, 'model.bin'))

    # segment every word, collecting the morph vocabulary as we go
    s = set()
    with open(os.path.join(d, 'wordmap_all'), 'w', encoding='utf-8') as outf:
        for k in word_count.keys():
            parts = model.viterbi_segment(k)[0]
            rparts = []
            for p in parts:
                s.add(p)
                rparts.append(p)
            print("{}\t{}".format(k, " ".join(rparts)), file=outf)

    with open(os.path.join(d, 'vocab_all'), 'w', encoding='utf-8') as outf:
        for morph in s:
            print(morph, file=outf)
Code Example #7
File: flatcat_test.py Project: sjmielke/flatcat
    def setUp(self):
        self.baseline = _load_baseline()
        self.model = _load_flatcat(self.baseline.get_segmentations(),
                                   init='no_emissions')

        io = morfessor.MorfessorIO(encoding='latin-1')
        line_re = re.compile(r'^[0-9]* (.*)')
        separator_re = re.compile(r' \+ ')
        tag_re = re.compile(r'([^/]*)/(.*)')

        self.detagged = []
        self.references = []
        for line in io._read_text_file(REFERENCE_BASELINE_TAGGED):
            m = line_re.match(line)
            if not m:
                continue
            segments = separator_re.split(m.group(1))
            detagged_tmp = []
            ref_tmp = []
            for segment in segments:
                m = tag_re.match(segment)
                assert m, 'Could not parse "%s" in "%s"' % (segment, line)
                ref_tmp.append(flatcat.CategorizedMorph(
                    m.group(1), m.group(2)))
                detagged_tmp.append(m.group(1))
            self.references.append(ref_tmp)
            self.detagged.append(detagged_tmp)
Code Example #8
    def __init__(self, lang, add_marker=False):
        self.lang = lang
        self.add_marker = add_marker

        io = morfessor.MorfessorIO()
        self._morfessor_model = io.read_any_model(
            common.INDIC_RESOURCES_PATH + '/morph/morfessor/{}.model'.format(lang))

        # matches tokens written entirely in the script range of `lang`
        self._script_range_pat = r'^[{}-{}]+$'.format(
            chr(langinfo.SCRIPT_RANGES[lang][0]),
            chr(langinfo.SCRIPT_RANGES[lang][1]))
        self._script_check_re = re.compile(self._script_range_pat)
Code Example #9
import morfessor


def test_data(modelpath, testpath):
    io = morfessor.MorfessorIO()
    model = io.read_binary_model_file(modelpath)
    test_data = list(io.read_corpus_file(testpath))
    # read_corpus_file yields (count, atoms) pairs; the slice below is
    # specific to this corpus layout
    words = [entry[1] for entry in test_data[1:-1:6]]
    for word in words:
        print(model.viterbi_segment(word))
Code Example #10
File: flatcat_test.py Project: sjmielke/flatcat
    def _config(self):
        self.reference_file = REFERENCE_REESTIMATE_PROBS
        self.baseline = _load_baseline()
        self.model = _load_flatcat(self.baseline.get_segmentations(),
                                   init='first')
        self.retagged = []

        io = morfessor.MorfessorIO(encoding='latin-1')
        segmentations = io.read_segmentation_file(
            REFERENCE_BASELINE_SEGMENTATION)
Code Example #11
import morfessor


def Base_SegModel(data, average_morph_length):
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(data))
    baseline_model = morfessor.BaselineModel(corpusweight=1.0)
    # tune the corpus weight during training toward the target average
    # morph length
    updater = morfessor.baseline.MorphLengthCorpusWeight(average_morph_length)
    baseline_model.set_corpus_weight_updater(updater)
    baseline_model.load_data(train_data, count_modifier=lambda x: 1)
    baseline_model.train_batch()

    return baseline_model
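
MorphLengthCorpusWeight adjusts the corpus weight during training so that the average morph length moves toward the given target. A minimal usage sketch; the corpus path and target length are hypothetical:

# aim for morphs of roughly 5 characters on average
model = Base_SegModel("corpus.txt", 5.0)
print(model.viterbi_segment("unbelievably")[0])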
Code Example #12
import collections
import lzma
import math
import os
import sys

import morfessor


def main(d):
    # parse_name is a project helper that reads the frequency threshold,
    # corpus weight (alpha) and dampening scheme from the directory name
    freq, alpha, damp = parse_name(d)

    word_count = collections.Counter()
    parent_dir = os.path.dirname(d)
    for f in os.listdir(parent_dir):
        if f.endswith(".xz") and not f.startswith("dev") and not f.startswith(
                "eval") and not f.startswith("test"):
            print("Read {}".format(f), file=sys.stderr)
            for line in lzma.open(os.path.join(parent_dir, f),
                                  'rt',
                                  encoding='utf-8'):
                for word in line.strip().split():
                    word_count[word] += 1
    print("Corpora read", file=sys.stderr)
    allowed_chars = {
        line.strip()
        for line in open(os.path.join(parent_dir, 'allowed_chars'),
                         encoding='utf-8') if len(line.strip()) == 1
    }

    model = morfessor.BaselineModel(corpusweight=alpha)
    assert damp in {'types', 'tokens', 'logtokens'}
    damp_func = None
    if damp == 'types':
        damp_func = lambda x: 1
    elif damp == 'logtokens':
        damp_func = lambda x: int(round(math.log(x + 1, 2)))

    data = [(v, k) for k, v in word_count.items()
            if all(c in allowed_chars for c in k)]
    model.load_data(data, freq, damp_func)
    model.train_batch()

    io = morfessor.MorfessorIO()
    io.write_binary_model_file(os.path.join(d, 'model.bin'), model)

    io.write_segmentation_file(os.path.join(d, 'model.txt'),
                               model.get_segmentations())

    s = set()
    with open(os.path.join(d, 'wordmap'), 'w', encoding='utf-8') as outf:
        for k in word_count.keys():
            parts = model.viterbi_segment(k)[0]
            rparts = []
            for p in parts:
                if not all(c in allowed_chars for c in p):
                    p = '<UNK>'
                s.add(p)
                rparts.append(p)
            print("{}\t{}".format(k, " ".join(rparts)), file=outf)

    with open(os.path.join(d, 'vocab'), 'w', encoding='utf-8') as outf:
        for morph in s:
            print(morph, file=outf)
Code Example #13
File: load.py Project: muximuxi/textflint
import os
import tarfile
from tempfile import NamedTemporaryFile


def load_morfessor_model(path):
    import morfessor
    # the model ships inside a tar archive; extract the first member to a
    # temporary file because read_any_model expects a file path
    s = tarfile.open(path)
    file_handler = s.extractfile(s.next())
    tmp_file_ = NamedTemporaryFile(delete=False)
    tmp_file_.write(file_handler.read())
    tmp_file_.close()
    io = morfessor.MorfessorIO()
    model = io.read_any_model(tmp_file_.name)
    os.remove(tmp_file_.name)
    return model
Code Example #14
import morfessor


def Base_SegModel(data, corpusweight):
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(data))
    # type-based model: every distinct word counts once
    model_types = morfessor.BaselineModel(corpusweight=corpusweight)
    model_types.load_data(train_data, count_modifier=lambda x: 1)
    model_types.train_batch()
    # token-based model: raw token counts (the default)
    model_tokens = morfessor.BaselineModel()
    model_tokens.load_data(train_data)
    model_tokens.train_batch()

    return model_types, model_tokens
Code Example #15
    def __init__(self):
        DIR = dirname(__file__)
        morfessor_file = join(DIR, 'data/finnsyll-morfessor.bin')
        ngram_file = join(DIR, 'data/finnsyll-ngrams.pickle')

        io = morfessor.MorfessorIO()
        self.model = io.read_binary_model_file(morfessor_file)
        self.constraints = CONSTRAINTS
        self.constraint_count = len(CONSTRAINTS)

        with open(ngram_file, 'rb') as f:
            self.ngrams, self.vocab, self.total = pickle.load(f)
Code Example #16
import morfessor


def test_model(model, gold_standard_file):

    # load IO object
    morf_io = morfessor.MorfessorIO()

    # load gold standard annotations file
    gold_standard = morf_io.read_annotations_file(gold_standard_file)

    # build evaluator object and run evaluation against gold standard
    evaluator = morfessor.MorfessorEvaluation(gold_standard)
    results = evaluator.evaluate_model(model)

    return results
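
A minimal sketch of calling test_model on a previously trained model; the file names are hypothetical placeholders:

io = morfessor.MorfessorIO()
model = io.read_binary_model_file("model.bin")    # hypothetical model file
results = test_model(model, "gold_standard.txt")  # hypothetical gold file
print(results)  # aggregate precision, recall and F-score over the samples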
Code Example #17
import morfessor


def _morfessor_iterator_from_list(sentences):
    """Turn the list into the kind of iterator that morfessor expects.

    :param sentences: A list of sentences, where each sentence is a list of words
    :return: An iterator of (count, atoms) pairs, with (0, ()) marking
        sentence boundaries, as MorfessorIO.read_corpus_file produces
    """
    io = morfessor.MorfessorIO()
    for sentence in sentences:
        sentence_string = ' '.join(sentence)
        for compound in io.compound_sep_re.split(sentence_string):
            if len(compound) > 0:
                yield 1, io._split_atoms(compound)
        yield 0, ()
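
Because the (count, atoms) pairs with (0, ()) sentence boundaries mirror what MorfessorIO.read_corpus_file yields, the iterator can be fed directly to a model. A minimal sketch with made-up sentences:

sentences = [["the", "cats", "sat"], ["cats", "purred"]]
model = morfessor.BaselineModel()
model.load_data(list(_morfessor_iterator_from_list(sentences)))
model.train_batch()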
Code Example #18
import sys

import morfessor


def main(allowed_chars_file, model):

    allowed_chars = {
        line.strip()
        for line in open(allowed_chars_file, encoding='utf-8')
        if len(line.strip()) == 1
    }

    model = morfessor.MorfessorIO().read_any_model(model)

    for line in sys.stdin:
        word = line.strip()
        parts = model.viterbi_segment(word)[0]
        print(word, end=' ')
        print(" ".join(parts).replace("<unk>", "<UNK>"))
Code Example #19
File: load.py Project: WarungData/European-NLP-Test
import os
from tempfile import NamedTemporaryFile


def load_morfessor_model(lang="en", version="2"):
    """Return a morfessor model for `lang` and of version `version`.

    Args:
        lang (string): language code.
        version (string): version of the parameters to be used.
    """
    # locate_resource and _open are package-level helpers for fetching
    # the packaged model data
    src_dir = "morph{}".format(version)
    p = locate_resource(src_dir, lang)
    file_handler = _open(p)
    # read_any_model needs a real file path, so spool to a temp file
    tmp_file_ = NamedTemporaryFile(delete=False)
    tmp_file_.write(file_handler.read())
    tmp_file_.close()
    io = morfessor.MorfessorIO()
    model = io.read_any_model(tmp_file_.name)
    os.remove(tmp_file_.name)
    return model
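
A hypothetical call, assuming the packaged English resources are available:

model = load_morfessor_model(lang="en")
print(model.viterbi_segment("unsupervised")[0])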
Code Example #20
import morfessor


def train_seg(infile, outfile):
    io = morfessor.MorfessorIO()

    print("Open corpus file")
    train_data = list(io.read_corpus_file(infile))

    model_types = morfessor.BaselineModel()

    # type-based dampening: every distinct word counts once
    model_types.load_data(train_data, count_modifier=lambda x: 1)

    print("Training data...")
    model_types.train_batch()

    print("Write bin file")
    io.write_binary_model_file(outfile, model_types)
Code Example #21
import morfessor


def evaluate(gold_data, morf, acw, segs):
    print("annotated corpus weight: " + str(acw))
    io = morfessor.MorfessorIO()
    gold = io.read_annotations_file(gold_data)
    ev = morfessor.MorfessorEvaluation(gold)

    models = [morf.unsupervised, morf.semisupervised]

    # evaluate models on 10 samples of 25 gold-standard words each
    config = morfessor.evaluation.EvaluationConfig(10, 25)
    results = [ev.evaluate_model(m, config) for m in models]
    print("model evaluation")
    print(results[0])
    print(results[1])

    wsr = morfessor.evaluation.WilcoxonSignedRank()
    r = wsr.significance_test(results)
    wsr.print_table(r)
Code Example #22
    def load_varembed_format(cls, vectors, morfessor_model=None):
        """
        Load the word vectors into matrix from the varembed output vector files.
        Using morphemes requires Python 2.7 version or above.

        'vectors' is the pickle file containing the word vectors.
        'morfessor_model' is the path to the trained morfessor model.
        'use_morphemes' False(default) use of morpheme embeddings in output.
        """
        result = cls()
        if vectors is None:
            raise Exception(
                "Please provide vectors binary to load varembed model")
        D = utils.unpickle(vectors)
        word_to_ix = D['word_to_ix']
        morpho_to_ix = D['morpho_to_ix']
        word_embeddings = D['word_embeddings']
        morpho_embeddings = D['morpheme_embeddings']
        result.load_word_embeddings(word_embeddings, word_to_ix)
        if morfessor_model:
            if sys.version_info >= (
                    2,
                    7):  #Morfessor is only supported for Python 2.7 and above.
                try:
                    import morfessor
                    morfessor_model = morfessor.MorfessorIO(
                    ).read_binary_model_file(morfessor_model)
                    result.add_morphemes_to_embeddings(morfessor_model,
                                                       morpho_embeddings,
                                                       morpho_to_ix)
                except ImportError:
                    # Morfessor Package not found.
                    logger.error(
                        'Could not import morfessor. Not using morpheme embeddings'
                    )
                    raise ImportError('Could not import morfessor.')
            else:
                # Raise exception in Python 2.6 or earlier.
                raise Exception(
                    'Using Morphemes requires Python 2.7 and above. Morfessor is not supported in python 2.6'
                )

        logger.info('Loaded varembed model vectors from %s', vectors)
        return result
Code Example #23
    def load_varembed_format(cls, vectors, morfessor_model=None):
        """Load the word vectors into matrix from the varembed output vector files.

        Parameters
        ----------
        vectors : str
            Path to the pickle file containing the word vectors.
        morfessor_model : str, optional
            Path to the trained morfessor model.

        Returns
        -------
        :class:`~gensim.models.wrappers.varembed.VarEmbed`
            Ready to use instance.

        """
        result = cls()
        if vectors is None:
            raise Exception(
                "Please provide vectors binary to load varembed model")
        d = utils.unpickle(vectors)
        word_to_ix = d['word_to_ix']
        morpho_to_ix = d['morpho_to_ix']
        word_embeddings = d['word_embeddings']
        morpho_embeddings = d['morpheme_embeddings']
        result.load_word_embeddings(word_embeddings, word_to_ix)
        if morfessor_model:
            try:
                import morfessor
                morfessor_model = morfessor.MorfessorIO(
                ).read_binary_model_file(morfessor_model)
                result.add_morphemes_to_embeddings(morfessor_model,
                                                   morpho_embeddings,
                                                   morpho_to_ix)
            except ImportError:
                # Morfessor Package not found.
                logger.error(
                    'Could not import morfessor. Not using morpheme embeddings'
                )
                raise ImportError('Could not import morfessor.')

        logger.info('Loaded varembed model vectors from %s', vectors)
        return result
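
A minimal usage sketch for the loader above, assuming a gensim version that still ships the varembed wrapper; both paths are hypothetical placeholders:

from gensim.models.wrappers import varembed

model = varembed.VarEmbed.load_varembed_format(
    vectors="varembed_vectors.pkl",             # hypothetical vectors pickle
    morfessor_model="varembed_morfessor.bin")   # hypothetical morfessor model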
Code Example #24
import morfessor


def train_model(input_file, output_file=None):

    # setup input and model objects
    morf_io = morfessor.MorfessorIO()
    morf_model = morfessor.BaselineModel()

    # build a corpus from input file
    train_data = morf_io.read_corpus_file(input_file)

    # load data into model
    # optional param "count_modifier" can set frequency dampening;
    # default is each token counts
    morf_model.load_data(train_data)

    # train the model in batch form (online training also available)
    morf_model.train_batch()

    # optionally pickle model
    if output_file is not None:
        morf_io.write_binary_model_file(output_file, morf_model)

    return morf_model
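
A minimal sketch of how train_model might be used; the file names and the test word are hypothetical:

model = train_model("corpus.txt", output_file="model.bin")
print(model.viterbi_segment("reorganization")[0])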
Code Example #25
import os

import morfessor


def train_morfessor(corpus, split_prob):
    """
    Train a Morfessor Baseline model.

    Lowercase the input text; use random skips for frequently seen compounds
    to speed up training; initialize new words by random splitting using the
    split probability split_prob.
    """

    io = morfessor.MorfessorIO(compound_separator=r"[^-\w]+", lowercase=True)

    train_data = list(
        io.read_corpus_file(os.path.join('data', 'corpora', corpus)))

    model_tokens = morfessor.BaselineModel(use_skips=True)

    model_tokens.load_data(train_data, init_rand_split=split_prob)

    model_tokens.train_batch()

    # corpus[:-4] strips a four-character extension such as '.txt'
    io.write_binary_model_file(
        os.path.join('data', 'models', corpus[:-4] + '_morph'), model_tokens)

    return model_tokens
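
A hypothetical call with a corpus file under data/corpora/:

model = train_morfessor("bible.txt", 0.5)  # random-split new words with p=0.5
print(model.viterbi_segment("jerusalem")[0])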
Code Example #26
import os

import morfessor


def main(oov_file, btype, model):
    parent_dir = os.path.dirname(oov_file)

    allowed_chars = {
        line.strip()
        for line in open(os.path.join(parent_dir, 'allowed_chars'),
                         encoding='utf-8') if len(line.strip()) == 1
    }

    model = morfessor.MorfessorIO().read_any_model(model)

    between = " "
    prefix = ""
    suffix = ""

    # btype selects how morph boundaries are marked: wma puts a word-boundary
    # marker between morphs, aff marks both sides with '+', pre and suf mark
    # one side only
    assert btype in {"aff", "wma", "suf", "pre"}
    if btype == "wma":
        between = " <w> "
    if btype == "pre" or btype == "aff":
        prefix = "+"
    if btype == "suf" or btype == "aff":
        suffix = "+"

    for line in open(oov_file, encoding='utf-8'):
        word = line.strip()
        parts = model.viterbi_segment(word)[0]
        rparts = []
        for p in parts:
            if not all(c in allowed_chars for c in p):
                p = '<UNK>'
            rparts.append(p)

        print("{} {}".format(suffix, prefix).join(rparts).replace(
            "+<unk>", "<unk>").replace("<unk>+", "<unk>").replace(
                "+<UNK>", "<UNK>").replace("<UNK>+",
                                           "<UNK>").replace("<unk>", "<UNK>"))
Code Example #27
import itertools

import morfessor


def main(model, toplist, origlex, jointlex, wordmap, morphsep):
    io = morfessor.MorfessorIO()
    model = io.read_binary_model_file(model)
    # read_lex is a project-specific lexicon loader
    jointlex = read_lex(jointlex)
    origlex = read_lex(origlex)

    new_lex = {}

    tof = open('tmpout', 'w', encoding='utf-8')
    counts = [0] * 100
    for k, v in origlex.items():
        counts[len(v)] += 1

        if len(v) == 2:
            print(k, file=tof)
    tof.close()

    for i, c in enumerate(counts):
        print("{} times {} transcriptions".format(c, i))

    jcounts = [0] * 100
    for k, v in jointlex.items():
        jcounts[len(v)] += 1

    for i, c in enumerate(jcounts):
        print("{} times {} morphtranscriptions".format(c, i))

    words_todo = []

    for word in toplist:
        word = word.strip().split()[0]

        segm = model.viterbi_segment(word)[0]
        if any(p not in jointlex for p in segm):
            print("{}\tUNK".format(word), file=wordmap)
            continue

        if word not in origlex:
            words_todo.append(word)
            continue

        target_trans = origlex[word]

        if len(target_trans) > 1:
            words_todo.append(word)
            continue

        target_idx = None
        for idx in itertools.product(
                *[list(range(len(jointlex[p]))) for p in segm]):
            trans = []
            for p, i in zip(segm, idx):
                trans.extend(jointlex[p][i])

            if tuple(trans) == target_trans[0]:
                target_idx = idx
                break

        if target_idx is not None:
            nsegm = ["{}#{}".format(p, i) for p, i in zip(segm, target_idx)]
            print("{}\t{}".format(word, morphsep.join(nsegm)), file=wordmap)
        else:
            words_todo.append(word)

    print("Still to do {} words".format(len(words_todo)))

    words = words_todo
    words_todo = []
    for word in words:
        word_done = False
        if word not in origlex:
            words_todo.append(word)
            continue

        for segme in model.viterbi_nbest(word, 10):
            segm = segme[0]
            if any(p not in jointlex for p in segm):
                continue
            target_trans = origlex[word]

            trans_left = list(target_trans)

            target_idxs = []
            for idx in itertools.product(
                    *[list(range(len(jointlex[p]))) for p in segm]):
                trans = []
                for p, i in zip(segm, idx):
                    trans.extend(jointlex[p][i])

                for ti in range(len(trans_left)):
                    if tuple(trans) == trans_left[ti]:
                        target_idxs.append(idx)
                        del trans_left[ti]
                        break

            if len(target_idxs) > 0:
                if len(target_idxs) != len(target_trans):
                    print(
                        "WARNING: {} has fewer phone transcriptions than expected"
                        .format(word))
                    continue
                target_idx = [{*k} for k in zip(*target_idxs)]
                nsegm = [
                    "{}#{}".format(p, ",".join(str(a) for a in sorted(i)))
                    for p, i in zip(segm, target_idx)
                ]
                print("{}\t{}".format(word, morphsep.join(nsegm)),
                      file=wordmap)
                word_done = True
                break
        if not word_done:
            words_todo.append(word)

    print("Still to do {} words".format(len(words_todo)))

    for word in words_todo:
        segm = model.viterbi_segment(word)[0]
        print("{}\t{}".format(word, morphsep.join(segm)), file=wordmap)
Code Example #28
        3: 'I-ORG',
        4: 'B-PRO',
        5: 'B-PER',
        6: 'I-PER',
        7: 'I-PRO',
        8: 'B-LOC',
        9: 'B-DATE',
        10: 'B-EVENT',
        11: 'I-LOC',
        12: 'I-EVENT',
        13: 'I-DATE'
    }
    num_tags = len(idx2tag) + 1
    whole_data_path = document_path
    target_data = load_data(document_path)
    io = morfessor.MorfessorIO()

    print('Loading embeddings...')
    #embeddings = gensim.models.fasttext.load_facebook_vectors('data/embeddings/cc.fi.300.bin')
    embeddings = fasttext.load_model('data/embeddings/cc.fi.300.bin')

    print('Finished loading embeddings')

    # load the morfessor model
    morfessor_model = io.read_binary_model_file(
        'utils/subword_segmentation/output/model/morfessor_0.1.bin')

    whole_data = load_data(whole_data_path)

    # segment data into morphs
    whole_data_morphs = []
Code Example #29
File: flatcat_test.py Project: sjmielke/flatcat
    def setUp(self):
        self.perplexities = dict()
        self.condprobs = dict()
        self.posteriors = dict()
        self.transitions = dict()
        catpriors_tmp = dict()

        self._config()

        self.comments_io = morfessor.MorfessorIO(encoding='latin-1',
                                                 comment_start='++++++++++')

        pattern_float = r'([0-9.]+)'
        pattern_int = r'([0-9]+)'
        pattern_quoted = r'"([^"]*)"'
        ppl_re = re.compile(r'^#Features\(' + pattern_quoted + r'\)\s+' +
                            pattern_float + r'\s+' + pattern_float + r'\s+' +
                            pattern_int)
        condprobs_re = re.compile(r'^#P\(Tag\|' + pattern_quoted + r'\)\s+' +
                                  pattern_float + r'\s+' + pattern_float +
                                  r'\s+' + pattern_float + r'\s+' +
                                  pattern_float)
        catpriors_re = re.compile(r'^#PTag\(' + pattern_quoted + r'\)\s+' +
                                  pattern_float)
        posteriors_re = re.compile(r'^(\S*)\s+' + pattern_float + r'\s+' +
                                   pattern_float + r'\s+' + pattern_float +
                                   r'\s+' + pattern_float)
        transitions_re = re.compile(r'^P\((\S+) .. ([^\)]+)\) = ' +
                                    pattern_float + r' \(N = ' + pattern_int +
                                    r'\)')

        for line in self.comments_io._read_text_file(self.reference_file):
            m = ppl_re.match(line)
            if m:
                self.perplexities[m.group(1)] = (float(m.group(2)),
                                                 float(m.group(3)),
                                                 int(m.group(4)))
                continue

            m = condprobs_re.match(line)
            if m:
                self.condprobs[m.group(1)] = (float(m.group(2)),
                                              float(m.group(3)),
                                              float(m.group(4)),
                                              float(m.group(5)))
                continue

            m = catpriors_re.match(line)
            if m:
                catpriors_tmp[m.group(1)] = float(m.group(2))
                continue

            m = posteriors_re.match(line)
            if m:
                self.posteriors[m.group(1)] = flatcat.ByCategory(
                    float(m.group(2)), float(m.group(3)), float(m.group(4)),
                    float(m.group(5)))
                continue

            m = transitions_re.match(line)
            if m:

                def _tr_wb(x):
                    if x == '#':
                        return flatcat.FlatcatModel.word_boundary
                    return x

                cats = tuple([_tr_wb(x) for x in (m.group(1), m.group(2))])
                self.transitions[cats] = (float(m.group(3)), int(m.group(4)))

        self.catpriors = flatcat.ByCategory(
            *(catpriors_tmp[x] for x in self.model.get_categories()))
Code Example #30
parser.add_argument('--coarse_atoms', action='store_true')
args = parser.parse_args()


numKnownUnambigous = 0
numKnownAmbigous = 0
numUnknown = 0

freqKnownUnambigous = 0
freqKnownAmbigous = 0
freqUnknown = 0

morfessorModel = None
if args.morfessor_model:
    if args.coarse_atoms:
        io = morfessor.MorfessorIO(atom_separator="■")
    else:
        io = morfessor.MorfessorIO()
    morfessorModel = io.read_binary_model_file(args.morfessor_model)

# Load vocabulary: we need frequencies for statistics
freqs = defaultdict(int)
with open(args.vocabulary) as freqs_f:
    for line in freqs_f:
        line = line.rstrip("\n")
        parts = line.split(" ")
        freqs[parts[1]] = int(parts[0])


stopwords = set()