Esempio n. 1
0
def _suffix_morphs(parts):
    """Return the suffix morphs of one segmented word form.

    *parts* is a segmentation string in which boundaries are written as
    brace markers.  Everything up to and including the last ``{DB}``,
    ``{WB}`` or ``{hyph?}`` marker (tried in that priority order) is
    dropped, then the text before the first remaining brace marker is
    dropped as well, and every ``{MB}`` is rendered as ``.``.

    Returns ``"0"`` when *parts* has none of the three markers or when no
    brace marker follows the last one.
    """
    for marker in ('{DB}', '{WB}', '{hyph?}'):
        if marker in parts:
            # len(marker) instead of a hard-coded width: '{hyph?}' is seven
            # characters long, not six.
            suffixes = parts[parts.rfind(marker) + len(marker):]
            break
    else:
        return "0"
    first_marker = suffixes.find("{")
    if first_marker < 0:
        # str.find returned -1; slicing from -1 would yield the last
        # character of the stem instead of "no suffix morphs".
        return "0"
    return suffixes[first_marker:].replace("{MB}", ".")


def main():
    """Preprocess text for moses factored modeling.

    Reads whitespace-separated word forms line by line from ``--input``
    (stdin by default), analyses and segments every word with omorfi and
    prints one ``surface|lemmas|pos|tags|morphs`` group per word, space
    separated, to ``--output`` (stdout by default).  Each non-empty input
    line produces one output line; empty input lines are skipped.

    Only the first analysis and the first segmentation of a word are used.
    Words without any analysis get the pos ``UNK`` and empty lemma and tag
    factors; words without any segmentation get the morph factor ``0``.

    Exits with status 1 when a model path is empty, 0 on success.
    """
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="load analyser model from AFILE",
                   required=True)
    a.add_argument('-s',
                   '--segmenter',
                   metavar='SFILE',
                   help="load segmenter model from SFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    # required=True still lets an empty string through, hence the checks.
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    infile = options.infile if options.infile else stdin
    outfile = open(options.output, 'w') if options.output else stdout
    if options.verbose:
        print("reading from", infile.name)
        print("writign to", outfile.name)
    # Raw strings: "\[" in a plain literal is an invalid escape sequence.
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[UPOS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)]")
    try:
        for linen, line in enumerate(infile, start=1):
            line = line.strip()
            if options.verbose and linen % 10000 == 0:
                print(linen, '...')
            if not line:
                continue
            for surf in line.split():
                anals = omorfi.analyse(surf)
                segments = omorfi.segment(surf)
                pos = "UNK"
                mrds = []
                lemmas = []
                if anals:
                    analysis = anals[0]['anal']
                    # The last UPOS tag wins when there are several.
                    for pm in re_pos.finditer(analysis):
                        pos = pm.group(1)
                    lemmas = [lm.group(1)
                              for lm in re_lemma.finditer(analysis)]
                    for mm in re_mrd.finditer(analysis):
                        if mm.group(1) == 'WORD_ID':
                            # Keep only the tags that follow the last
                            # WORD_ID of the analysis.
                            mrds = []
                        elif mm.group(1) == 'WEIGHT':
                            pass
                        else:
                            mrds.append(mm.group(2))
                if segments:
                    morphs = _suffix_morphs(segments[0]['segments'])
                else:
                    morphs = "0"
                print(surf,
                      '+'.join(lemmas),
                      pos,
                      '.'.join(mrds),
                      morphs,
                      sep='|',
                      end=' ',
                      file=outfile)
            print(file=outfile)
    finally:
        # Close only what this program opened; leave the std streams alone.
        if outfile is not stdout:
            outfile.close()
        if infile is not stdin:
            infile.close()
    exit(0)
Esempio n. 2
0
def main():
    """Segment text in some formats.

    Reads text line by line from ``--input`` (stdin by default), tokenises
    each line with omorfi, computes the segmentation and the labelled
    segmentation of every token and passes them to
    ``print_moses_factor_segments`` or ``print_segments`` according to
    ``--output-format``.  Output goes to ``--output`` (stdout by default);
    every input line, empty or not, ends with one newline in the output.

    The parsed ``options`` object is handed to the print functions, so
    ``options.infile`` and ``options.output`` are filled in with the
    defaults before use.

    Exits with status 2 when the segmenter path is empty, 1 when omorfi
    reports that it cannot segment, 0 on success.
    """
    a = ArgumentParser()
    a.add_argument('-s',
                   '--segmenter',
                   metavar='SFILE',
                   help="load segmenter from SFILE",
                   required=True)
    a.add_argument('-S',
                   '--labeller',
                   metavar='LSFILE',
                   help="load labelsegmenter from LSFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O',
                   '--output-format',
                   metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   required=True,
                   choices=["moses-factors", "segments"])
    # The --no-* switches turn a default-on behaviour off, so their help
    # texts say so.
    a.add_argument('--no-split-words',
                   action="store_false",
                   default=True,
                   dest="split_words",
                   help="do not split on word boundaries")
    a.add_argument(
        '--no-split-new-words',
        action="store_false",
        default=True,
        dest="split_new_words",
        help="do not split on new word boundaries "
             "(prev. unattested compounds)")
    a.add_argument('--no-split-morphs',
                   action="store_false",
                   default=True,
                   dest="split_morphs",
                   help="do not split on morph boundaries")
    a.add_argument('--split-derivs',
                   action="store_true",
                   default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords',
                   action="store_true",
                   default=False,
                   help="split on other boundaries")
    # argparse supplies the default marker, so no later None check is
    # needed.
    a.add_argument('--segment-marker',
                   default='→ ←',
                   metavar='SEG',
                   help="mark segment boundaries with SEG")
    a.add_argument('--show-ambiguous',
                   default=False,
                   metavar='ASEP',
                   help="separate ambiguous segmentations with ASEP")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    # required=True still lets an empty string through, hence the check.
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("segmenter is needed for segmenting", file=stderr)
        exit(2)
    if options.labeller:
        if options.verbose:
            print("Reading labelsegmenter", options.labeller)
        omorfi.load_labelsegmenter(options.labeller)
    if not omorfi.can_segment:
        # Diagnostics belong on stderr, like the error above, so they never
        # end up mixed into the segmented output.
        print("Could not load segmenter(s), re-compile them or use -f option",
              file=stderr)
        print(file=stderr)
        print("To compile segmenter, use --enable-segmenter, and/or",
              "--enable-labeled-segments",
              file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.verbose:
        print("reading from", options.infile.name)
        print("writign to", options.output)

    try:
        for linen, line in enumerate(infile, start=1):
            line = line.strip()
            if options.verbose and linen % 10000 == 0:
                print(linen, '...')
            if not line:
                # Keep empty lines so output stays aligned with input.
                print(file=outfile)
                continue
            for token in omorfi.tokenise(line):
                segments = omorfi.segment(token)
                labelsegments = omorfi.labelsegment(token)
                if options.output_format == 'moses-factors':
                    print_moses_factor_segments(segments, labelsegments,
                                                token, outfile, options)
                elif options.output_format == 'segments':
                    print_segments(segments, labelsegments, token, outfile,
                                   options)
            print(file=outfile)
    finally:
        # Close only what this program opened; leave the std streams alone.
        if outfile is not stdout:
            outfile.close()
        if infile is not stdin:
            infile.close()
    exit(0)
Esempio n. 3
0
def main():
    """Preprocess text for moses factored modeling.

    Each non-empty input line is tokenised with omorfi; for every token
    that has a surface form one ``surface|lemmas|pos|feats|morphs`` group
    is printed, space separated, followed by a newline per input line.
    Input comes from ``--input`` or stdin, output goes to ``--output`` or
    stdout.  Exits with status 1 when a model path is empty, else 0.
    """
    parser = ArgumentParser()
    parser.add_argument('-a', '--analyser', required=True, metavar='AFILE',
                        help="load analyser model from AFILE")
    parser.add_argument('-s', '--segmenter', required=True, metavar='SFILE',
                        help="load segmenter model from SFILE")
    parser.add_argument('-i', '--input', dest="infile", type=open,
                        metavar="INFILE", help="source of analysis data")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="print verbosely while processing")
    parser.add_argument('-o', '--output', metavar="OUTFILE",
                        help="print factors into OUTFILE")
    options = parser.parse_args()
    verbose = options.verbose
    omorfi = Omorfi(verbose)

    # Both models are mandatory; bail out before doing any work.
    if not options.analyser:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if verbose:
        print("Reading analyser model", options.analyser)
    omorfi.load_analyser(options.analyser)

    if not options.segmenter:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if verbose:
        print("Reading segmenter model", options.segmenter)
    omorfi.load_segmenter(options.segmenter)

    source = options.infile if options.infile else stdin
    sink = open(options.output, 'w') if options.output else stdout
    if verbose:
        print("reading from", source.name)
        print("writign to", sink.name)

    for count, raw in enumerate(source, start=1):
        text = raw.strip()
        if verbose and count % 10000 == 0:
            print(count, '...')
        if not text:
            continue
        for token in omorfi.tokenise_sentence(text):
            if not token.surf:
                continue
            # Analysis factors, with placeholders for unanalysed tokens.
            if omorfi.analyse(token):
                best = token.get_best()
                upos = best.get_upos()
                feats = best.get_ufeats()
                lemma_list = best.get_lemmas()
            else:
                upos = "X"
                feats = ["?"]
                lemma_list = [token.surf]
            # Morph factor: "0" without segmentations, the surface form
            # when there are some but no best one.
            morph_factor = "0"
            if omorfi.segment(token):
                best_segments = token.get_best_segments()
                if best_segments:
                    morph_factor = ".".join(best_segments.get_segments())
                else:
                    morph_factor = token.surf
            factors = (token.surf, '+'.join(lemma_list), upos,
                       '.'.join(feats), morph_factor)
            print(*factors, sep='|', end=' ', file=sink)
        print(file=sink)
    exit(0)