def _analyze(db, fin, fout, backoff, cache):
    if cache:
        analyzer = Analyzer(db, backoff, cache_size=1024)
    else:
        analyzer = Analyzer(db, backoff)

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            analyses = analyzer.analyze(token)
            serialized = _serialize_analyses(fout, token, analyses, db.order)

            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
def _reinflect(db, fin, fout):
    reinflector = Reinflector(db)

    line = force_unicode(fin.readline())
    line_num = 1

    while line:
        line = line.strip()

        if len(line) == 0:
            line = force_unicode(fin.readline())
            line_num += 1
            continue

        parsed = _parse_reinflector_line(line)

        if parsed is None:
            if fin is sys.stdin:
                sys.stderr.write('Error: Invalid input line.\n')
            else:
                sys.stderr.write(
                    'Error: Invalid input line. [{}]\n'.format(line_num))
        else:
            word = parsed[0]
            feats = parsed[1]

            try:
                analyses = reinflector.reinflect(word, feats)

                serialized = _serialize_analyses(fout, word, analyses,
                                                 db.order)

                if six.PY3:
                    fout.write(serialized)
                else:
                    fout.write(force_encoding(serialized))

                fout.write('\n\n')
            except MorphologyError as error:
                # This could be thrown by the analyzer, generator, or
                # reinflector.
                if fin is sys.stdin:
                    sys.stderr.write('Error: {}.\n'.format(error.msg))
                else:
                    sys.stderr.write('Error: {}. [{}]\n'.format(
                        error.msg, line_num))

        line = force_unicode(fin.readline())
        line_num += 1
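# Illustrative input only (a sketch; the exact line syntax is defined by the
# _parse_reinflector_line helper, which is not shown here). Each input line
# is assumed to hold a word followed by feature:value pairs, for example:
#
#     ktb pos:verb asp:p
#
# which _parse_reinflector_line would turn into ('ktb', {'pos': 'verb',
# 'asp': 'p'}) before being handed to reinflector.reinflect(). The word,
# feature names, and values above are hypothetical.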
def clean(self, ar_raw_text: str, normalize_teh_marbuta: bool = False,
          normalize_alef: bool = True, is_punct: bool = False,
          is_de_diacritize: bool = True, is_to_lower: bool = True) -> str:
    clean_text = ar_raw_text.strip("\n").strip()

    if is_to_lower:
        clean_text = clean_text.lower()

    clean_text = force_unicode(clean_text)

    if six.PY3:
        clean_text = self.clean_mapper.map_string(clean_text)
    else:
        clean_text = force_encoding(clean_text)
        clean_text = self.clean_mapper.map_string(clean_text)

    clean_text = self._get_sec_cleaner(
        is_punct=is_punct).clean_text(clean_text)
    clean_text = self._normalize(
        arb_text=clean_text,
        normalize_teh_marbuta=normalize_teh_marbuta,
        normalize_alef=normalize_alef)

    if is_de_diacritize:
        clean_text = self._de_diacritize(arb_text=clean_text)

    # Collapse runs of repeated spaces into single spaces.
    for _ in range(5):
        clean_text = clean_text.replace("  ", " ")

    return clean_text.strip()
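# Usage sketch (hypothetical: assumes this method belongs to a cleaner class,
# here called ArabicCleaner, that wires up clean_mapper, _get_sec_cleaner,
# _normalize, and _de_diacritize):
#
#     cleaner = ArabicCleaner()
#     text = cleaner.clean("الكِتَابُ  جَمِيلٌ\n",
#                          normalize_teh_marbuta=True,
#                          is_de_diacritize=True)
#
# With is_de_diacritize=True diacritics are stripped, and repeated spaces are
# collapsed before the cleaned string is returned.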
def _arclean(mapper, fin, fout):
    for line in fin:
        line = force_unicode(line)

        if six.PY3:
            fout.write(mapper.map_string(line))
        else:
            fout.write(force_encoding(mapper.map_string(line)))

        fout.flush()
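# Minimal usage sketch (assumes 'arclean' is an available builtin CharMapper
# scheme and that fin/fout are text streams; the file names are illustrative):
#
#     mapper = CharMapper.builtin_mapper('arclean')
#     with open('in.txt', 'r') as fin, open('out.txt', 'w') as fout:
#         _arclean(mapper, fin, fout)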
def _serialize_analyses(fout, word, analyses, order, generation=False):
    buff = collections.deque()
    buff.append(u'#{}: {}'.format(u'LEMMA' if generation else u'WORD',
                                  force_unicode(word)))

    if len(analyses) == 0:
        buff.append(u'NO_ANALYSIS')
    else:
        sub_buff = set()
        for a in analyses:
            output = u' '.join([
                u'{}:{}'.format(force_unicode(f), force_unicode(a[f]))
                for f in order if f in a
            ])
            sub_buff.add(output)
        buff.extend(sub_buff)

    return u'\n'.join(buff)
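# For illustration, given word='wlm', order=['diac', 'pos'], and two analyses
# (the feature values below are hypothetical), the returned string looks like:
#
#     #WORD: wlm
#     diac:walam pos:part
#     diac:walima pos:verb
#
# A word with an empty analysis list instead yields:
#
#     #WORD: wlm
#     NO_ANALYSIS
#
# Note that analyses are deduplicated through a set, so their relative order
# in the output is not guaranteed.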
def _analyze(db, fin, fout, backoff, cache, num_disambig=None):
    if cache:
        analyzer = CalimaStarAnalyzer(db, backoff, cache_size=1024)
    else:
        analyzer = CalimaStarAnalyzer(db, backoff)

    disambig = None
    if num_disambig is not None:
        disambig = MLEDisambiguator(analyzer)

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            if num_disambig is not None:
                # Keep only the top num_disambig analyses ranked by the
                # disambiguator.
                dambg = disambig.disambiguate([token], num_disambig)
                analyses = [a.analysis for a in dambg[0].analyses]
            else:
                analyses = analyzer.analyze(token)

            serialized = _serialize_analyses(fout, token, analyses, db.order)

            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
def _analyze(db, fin, fout, backoff, cache):
    analyzer = CalimaStarAnalyzer(db, backoff)

    # Maps previously analyzed tokens to their serialized analyses so that
    # repeated tokens are not re-analyzed.
    memoize_table = {} if cache else None

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            if cache and token in memoize_table:
                if six.PY3:
                    fout.write(memoize_table[token])
                else:
                    fout.write(force_encoding(memoize_table[token]))

                fout.write('\n\n')
            else:
                analyses = analyzer.analyze(token)
                serialized = _serialize_analyses(fout, token, analyses,
                                                 db.order)

                if cache:
                    memoize_table[token] = serialized

                if six.PY3:
                    fout.write(serialized)
                else:
                    fout.write(force_encoding(serialized))

                fout.write('\n\n')

        line = force_unicode(fin.readline())
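# Usage sketch (the database name and backoff mode below are assumptions,
# not fixed by this function):
#
#     db = CalimaStarDB.builtin_db('calima-msa-r13')
#     with open('in.txt', 'r') as fin, open('out.txt', 'w') as fout:
#         _analyze(db, fin, fout, backoff='NOAN_ALL', cache=True)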
def _parse_dbfile(self, fpath):
    with open(fpath, 'r') as dbfile:
        # Process DEFINES
        for line in dbfile:
            line = force_unicode(line).strip()

            if line == '###DEFINES###':
                continue

            if line == '###DEFAULTS###':
                break

            toks = line.split(u' ')

            # Check if line has the minimum viable format
            if len(toks) < 3 or toks[0] != 'DEFINE':
                raise DatabaseParseError(
                    'invalid DEFINES line {}'.format(repr(line)))

            new_define = toks[1]
            val_set = set()

            # Parse values for defined keyword
            for tok in toks[2:]:
                subtoks = tok.split(':')

                # Raise an error on malformed key-value entries
                if len(subtoks) != 2 or subtoks[0] != toks[1]:
                    raise DatabaseParseError(
                        'invalid key value pair {} in DEFINES'.format(
                            repr(tok)))

                # If it's an open class, we use None instead of a set
                if len(toks) == 3 and subtoks[1] == '*open*':
                    val_set = None
                    break

                val_set.add(subtoks[1])

            self.defines[new_define] = (
                list(val_set) if val_set is not None else None)

        # Process DEFAULTS
        for line in dbfile:
            line = force_unicode(line).strip()

            if line == '###ORDER###':
                break

            toks = line.split(u' ')

            if len(toks) < 2 or toks[0] != 'DEFAULT':
                raise DatabaseParseError(
                    'invalid DEFAULTS line {}'.format(repr(line)))

            parsed_default = self._parse_defaults_line_toks(toks[1:])

            if self._defaultKey not in parsed_default:
                raise DatabaseParseError(
                    'DEFAULTS line {} missing {} value'.format(
                        repr(line), self._defaultKey))

            dkey = parsed_default[self._defaultKey]
            self.defaults[dkey] = parsed_default

        # Process ORDER
        for line in dbfile:
            line = force_unicode(line).strip()

            if line == '###STEMBACKOFF###':
                self.compute_feats.update(self.order)
                break

            toks = line.split(u' ')

            if (self.order is not None and len(toks) < 2 and
                    toks[0] != 'ORDER'):
                raise DatabaseParseError(
                    'invalid ORDER line {}'.format(repr(line)))

            if toks[1] not in self.defines:
                raise DatabaseParseError(
                    'invalid feature {} in ORDER line.'.format(
                        repr(toks[1])))

            self.order = toks[1:]

        # Process STEMBACKOFFS
        for line in dbfile:
            line = force_unicode(line).strip()

            if line == '###PREFIXES###':
                break

            toks = line.split(u' ')

            if len(toks) < 3 or toks[0] != 'STEMBACKOFF':
                raise DatabaseParseError(
                    'invalid STEMBACKOFFS line {}'.format(repr(line)))

            self.stem_backoffs[toks[1]] = toks[2:]

        # Process PREFIXES
        for line in dbfile:
            line = force_unicode(line)
            parts = line.split(u'\t')

            if len(parts) != 3:
                if line.strip() == '###SUFFIXES###':
                    break
                raise DatabaseParseError(
                    'invalid PREFIXES line {}'.format(repr(line)))

            prefix = parts[0].strip()
            category = parts[1]
            analysis = self._parse_analysis_line_toks(
                parts[2].strip().split(u' '))

            if self._withAnalysis:
                if prefix not in self.prefix_hash:
                    self.prefix_hash[prefix] = []
                self.prefix_hash[prefix].append((category, analysis))

            if self._withGeneration:
                # FIXME: Make sure analyses for category are unique?
                if category not in self.prefix_cat_hash:
                    self.prefix_cat_hash[category] = []
                self.prefix_cat_hash[category].append(analysis)

        # Process SUFFIXES
        for line in dbfile:
            line = force_unicode(line)
            parts = line.split(u'\t')

            if len(parts) != 3:
                if line.strip() == '###STEMS###':
                    break
                raise DatabaseParseError(
                    'invalid SUFFIXES line {}'.format(repr(line)))

            suffix = parts[0].strip()
            category = parts[1]
            analysis = self._parse_analysis_line_toks(
                parts[2].strip().split(u' '))

            if self._withAnalysis:
                if suffix not in self.suffix_hash:
                    self.suffix_hash[suffix] = []
                self.suffix_hash[suffix].append((category, analysis))

            if self._withGeneration:
                # FIXME: Make sure analyses for category are unique?
                if category not in self.suffix_cat_hash:
                    self.suffix_cat_hash[category] = []
                self.suffix_cat_hash[category].append(analysis)

        # Process STEMS
        for line in dbfile:
            line = force_unicode(line).strip()

            if line == '###TABLE AB###':
                break

            parts = line.split(u'\t')

            if len(parts) != 3:
                raise DatabaseParseError(
                    'invalid STEMS line {}'.format(repr(line)))

            stem = parts[0]
            category = parts[1]
            analysis = self._parse_analysis_line_toks(parts[2].split(u' '))

            if self._withAnalysis:
                if stem not in self.stem_hash:
                    self.stem_hash[stem] = []
                self.stem_hash[stem].append((category, analysis))

            if self._withGeneration:
                # FIXME: Make sure analyses for category are unique?
                lemma = analysis['lex']
                lemma_key = _LEMMA_SPLIT_RE.split(lemma)[0]
                analysis['stemcat'] = category
                if lemma_key not in self.lemma_hash:
                    self.lemma_hash[lemma_key] = []
                self.lemma_hash[lemma_key].append(analysis)

        # Process prefix_stem compatibility table
        for line in dbfile:
            line = force_unicode(line).strip()

            if line == '###TABLE BC###':
                break

            toks = line.split()

            if len(toks) != 2:
                raise DatabaseParseError(
                    'invalid TABLE AB line {}'.format(repr(line)))

            prefix_cat = toks[0]
            stem_cat = toks[1]

            if self._withAnalysis:
                if prefix_cat not in self.prefix_stem_compat:
                    self.prefix_stem_compat[prefix_cat] = set()
                self.prefix_stem_compat[prefix_cat].add(stem_cat)

            if self._withGeneration:
                if stem_cat not in self.stem_prefix_compat:
                    self.stem_prefix_compat[stem_cat] = set()
                self.stem_prefix_compat[stem_cat].add(prefix_cat)

        # Process stem_suffix compatibility table
        for line in dbfile:
            line = force_unicode(line).strip()

            if line == '###TABLE AC###':
                break

            toks = line.split()

            if len(toks) != 2:
                raise DatabaseParseError(
                    'invalid TABLE BC line {}'.format(repr(line)))

            stem_cat = toks[0]
            suffix_cat = toks[1]

            if stem_cat not in self.stem_suffix_compat:
                self.stem_suffix_compat[stem_cat] = set()
            self.stem_suffix_compat[stem_cat].add(suffix_cat)

        # Process prefix_suffix compatibility table
        for line in dbfile:
            line = force_unicode(line).strip()

            toks = line.split()

            if len(toks) != 2:
                raise DatabaseParseError(
                    'invalid TABLE AC line {}'.format(repr(line)))

            prefix_cat = toks[0]
            suffix_cat = toks[1]

            if prefix_cat not in self.prefix_suffix_compat:
                self.prefix_suffix_compat[prefix_cat] = set()
            self.prefix_suffix_compat[prefix_cat].add(suffix_cat)

    if self._withAnalysis:
        for prefix in self.prefix_hash.keys():
            self.max_prefix_size = max(self.max_prefix_size, len(prefix))
        for suffix in self.suffix_hash.keys():
            self.max_suffix_size = max(self.max_suffix_size, len(suffix))
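# For reference, the section layout this parser expects in the database file
# (as implied by the markers handled above) is roughly:
#
#     ###DEFINES###       DEFINE <feat> <feat>:<val> ...  (or <feat>:*open*)
#     ###DEFAULTS###      DEFAULT <key:value pairs, incl. the default key>
#     ###ORDER###         ORDER <feat> <feat> ...
#     ###STEMBACKOFF###   STEMBACKOFF <mode> <category> ...
#     ###PREFIXES###      <prefix>\t<category>\t<analysis tokens>
#     ###SUFFIXES###      <suffix>\t<category>\t<analysis tokens>
#     ###STEMS###         <stem>\t<category>\t<analysis tokens>
#     ###TABLE AB###      <prefix_cat> <stem_cat>
#     ###TABLE BC###      <stem_cat> <suffix_cat>
#     ###TABLE AC###      <prefix_cat> <suffix_cat>
#
# The placeholder fields are schematic; the actual values come from the
# database file being loaded.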
def main():  # pragma: no cover
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for scheme in _BUILTIN_SCHEMES:
                print("{} {}".format(scheme[0].ljust(20), scheme[1]))
            sys.exit(0)

        if arguments['--scheme'] is not None:
            if arguments['--scheme'] not in [s[0] for s in _BUILTIN_SCHEMES]:
                sys.stderr.write('Error: {} is not a valid scheme.\n'
                                 'Run `camel_transliterate -l` to see the list'
                                 ' of available schemes.'
                                 '\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            if arguments['--marker'] is None:
                marker = '@@IGNORE@@'
            else:
                marker = arguments['--marker']

            ignore_markers = arguments['--ignore-markers']
            strip_markers = arguments['--strip-markers']

            # Open files (or just use stdin and stdout)
            fin, fout = _open_files(arguments['FILE'], arguments['--output'])

            # Load the CharMapper and initialize a Transliterator with it
            try:
                mapper = CharMapper.builtin_mapper(arguments['--scheme'])
                trans = Transliterator(mapper, marker)
            except Exception:  # pylint: disable=W0703
                sys.stderr.write('Error: Could not load builtin scheme'
                                 ' {}.\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            # Transliterate lines
            try:
                for line in fin:
                    line = force_unicode(line)

                    if six.PY3:
                        fout.write(
                            trans.transliterate(line, strip_markers,
                                                ignore_markers))
                    else:
                        fout.write(
                            force_encoding(
                                trans.transliterate(line, strip_markers,
                                                    ignore_markers)))
                fout.flush()

            # If everything worked so far, this shouldn't happen
            except Exception:  # pylint: disable=W0703
                sys.stderr.write('Error: An unknown error occurred during '
                                 'transliteration.\n')
                sys.exit(1)

            # Cleanup
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)

    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
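# Command-line usage sketch (the scheme name 'ar2bw' and the argument order
# are assumptions; run with --list to see the schemes actually installed):
#
#     camel_transliterate --list
#     camel_transliterate --scheme ar2bw --output out.txt in.txt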
def _generate(db, fin, fout, backoff):
    generator = Generator(db)
    reinflector = Reinflector(db) if backoff == 'REINFLECT' else None

    line = force_unicode(fin.readline())
    line_num = 1

    while line:
        line = line.strip()

        if len(line) == 0:
            line = force_unicode(fin.readline())
            line_num += 1
            continue

        parsed = _parse_generator_line(line)

        if parsed is None:
            if fin is sys.stdin:
                sys.stderr.write('Error: Invalid input line.\n')
            else:
                sys.stderr.write(
                    'Error: Invalid input line. [{}]\n'.format(line_num))
        else:
            lemma = parsed[0]
            feats = parsed[1]

            # Make sure lemma and pos are specified first
            if lemma is None:
                if fin is sys.stdin:
                    sys.stderr.write('Error: Missing lex/lemma feature.\n')
                else:
                    sys.stderr.write(
                        'Error: Missing lex/lemma feature. [{}]\n'.format(
                            line_num))
            elif 'pos' not in feats:
                if fin is sys.stdin:
                    sys.stderr.write('Error: Missing pos feature.\n')
                else:
                    sys.stderr.write(
                        'Error: Missing pos feature. [{}]\n'.format(line_num))
            else:
                try:
                    analyses = generator.generate(lemma, feats)

                    if len(analyses) == 0 and backoff == 'REINFLECT':
                        word = _dediac(lemma)
                        analyses = reinflector.reinflect(word, feats)

                    serialized = _serialize_analyses(fout, lemma, analyses,
                                                     db.order, True)

                    if six.PY3:
                        fout.write(serialized)
                    else:
                        fout.write(force_encoding(serialized))

                    fout.write('\n\n')
                except GeneratorError as error:
                    if fin is sys.stdin:
                        sys.stderr.write('Error: {}.\n'.format(error.msg))
                    else:
                        sys.stderr.write('Error: {}. [{}]\n'.format(
                            error.msg, line_num))

        line = force_unicode(fin.readline())
        line_num += 1
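# Illustrative input only (the exact line syntax is defined by the
# _parse_generator_line helper, which is not shown here). Each line is
# assumed to provide a lemma plus feature:value pairs that include pos,
# e.g. (hypothetical values):
#
#     lex:katab pos:verb asp:p per:3 gen:m num:s
#
# Lines missing the lex/lemma or pos features are reported as errors, and
# with backoff='REINFLECT' a lemma that yields no generated analyses is
# dediacritized and passed to the reinflector instead.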