def add_arguments(title):
    """Build the command-line parser for a processor and register extra options.

    The base parser comes from ``fill_arg_for_processor(title)`` (defined
    outside this view; presumably an ``argparse.ArgumentParser`` -- confirm).

    Options added here:
      --ignore_mess, --show_mess, --show_files, --scheme
          value options, each defaulting to ``None``
      --media
          boolean flag, ``False`` unless given on the command line

    Returns the parser with the options registered.
    """
    arg_parser = fill_arg_for_processor(title)

    # These four options share the same shape: an optional value, None by default.
    for option in ('--ignore_mess', '--show_mess', '--show_files', '--scheme'):
        arg_parser.add_argument(option, default=None)

    arg_parser.add_argument('--media', action='store_true', default=False)
    return arg_parser
fout.write('"{0}" - {1} \n'.format(val_k,val_v[2])) # self.put_counter('changes_zh.txt', self.changes_zh) # self.put_counter('changes_zh1.txt', self.changes_zh1) ''' if self.feats_loader.wrong: if '' in self.feats_loader.wrong: print("Empty!") print("Wrong feats: {0}".format(','.join(sorted(self.feats_loader.wrong)))) with open(Path('~/Documents/china_gr.txt').expanduser(), 'w') as fout: fout.write("china feats:\n\n") fout.write('\n'.join(sorted(self.feats))+'\n') fout.write("china feat sets:\n\n") fout.write('\n'.join(sorted(self.feat_grs))+'\n') ''' if __name__ == '__main__': parser = fill_arg_for_processor('feat checker') parser_args = parser.parse_args() feats_checker = FeatsChecker(parser_args) feats_checker.process() feats_checker.put_info()
class VersesCutter(ProcessorBasic):
    """Processor that pipes each lxml tree through ``MyContentHandler``."""

    def process_lxml_tree(self, tree):
        """Rebuild *tree* through the SAX handler and return the handler's tree.

        When the handler reports a positive ``max_br``, the input name and
        that number are printed (the label "строф" is Russian for "stanzas").
        """
        handler = MyContentHandler()
        sax.saxify(tree, handler)
        if handler.max_br > 0:
            print(self.inpname, handler.max_br, "строф")
        return handler.etree


class VersesCutter2(ProcessorBasic):
    """Processor that reports verses containing an excessive number of ``<br>``.

    Names of the offending input files are written to the file given by
    ``args.found``; the caller is responsible for closing ``self.res``.
    """

    def __init__(self, args):
        """Initialise the base processor and open the ``args.found`` report file."""
        super().__init__(args)
        self.res = open(args.found, "w")

    def process_lxml_tree(self, tree):
        """Check every ``<p class="verse">`` of *tree* for more than 1000 ``<br>``.

        Returns ``None`` in all cases, exactly as before.
        """
        verses = tree.xpath('//p[@class="verse"]')
        for verse in verses:
            # ".//br" is relative to this verse.  The former "//br" is an
            # absolute path and counted every <br> in the whole document,
            # giving the same number for each verse.
            # XPath count() yields a float; convert so the report prints
            # "1200" rather than "1200.0".
            lines = int(verse.xpath("count(.//br)"))
            if lines > 1000:
                # "строк в строфе" is Russian for "lines in the stanza".
                print(self.inpname, lines, "строк в строфе")
                # One name per line; without the separator consecutive
                # names ran together in the report file.
                self.res.write(self.inpname + "\n")
        return None


# Script entry point: parse the arguments (including the mandatory --found
# report path) and run the verse checker.
if __name__ == '__main__':
    parser = fill_arg_for_processor('verses cutter')
    parser.add_argument("--found", required=True)
    parser_args = parser.parse_args()
    cutter = VersesCutter2(parser_args)
    try:
        cutter.process()
    finally:
        # The report file is opened in __init__ and was never closed.
        cutter.res.close()
self.outfile_write(attr) start = False self.outfile_write('}') # self.outfile_write('['+to_str[info[TEXT_POS]]+', '+to_str[info[TAIL_POS]]+']') self.outfile_write( '\t' + os.path.relpath(info[FIRST_DOC_POS], self.common_part) + '\t\n') for value in info[ELEM_POS].values(): self.put_info(value, shift + info[TAG_POS] + '/') def process(self): inp_paths = self.inppath.split('|') for self.inppath in inp_paths: if not super(Schema, self).process(): return self.common_part = os.sep.join(self.common_part) with open(self.schema, 'w') as self.outfile: self.outfile_write('tags sequence\texample\tcomment\n') for value in self.glob_info.values(): self.put_info(value, '') self.outfile_write('\n(common path of examples - ' + self.common_part + ')') if __name__ == '__main__': parser = fill_arg_for_processor('schema processing') parser.add_argument('--schema', required=True) parser_args = parser.parse_args() processor = Schema(parser_args) processor.process()
for lang, l_counter in self.counters.items(): with open(root_dir/Path(lang+".txt"), 'w') as f_count: for stat in l_counter.most_common(): sym = stat[0] if len(sym) > 1: raise Exception('logic error') cat = unicodedata.category(sym) try: sym_name = unicodedata.name(sym) except ValueError: sym = '#{0}'.format(ord(sym)) sym_name = 'UNKNOWN NAME' cat_gr = cat[0] count = stat[1] if cat_gr == 'L' or cat == 'Cc' or cat == 'Nd': continue f_count.write('"{0}" {1}, {2}: {3} time(s)\n'.format(sym, cat, sym_name, count)) except (OSError, IOError) as e: self.fatal_error("can't write statistics into {0}: {1}".format(self.sym_stat, e.message)) if __name__ == '__main__': parser = fill_arg_for_processor('symbol counter') parser.add_argument('--sym_stat', required=True) parser_args = parser.parse_args() counter = SymbolCounter(parser_args) counter.process() counter.report()
self.line = -1 """ def change_accent(self, text): if text is None: return text n = text.count(COMBINING_GRAVE_ACCENT) if n == 0: return text self.count_mess( "COMBINING_GRAVE_ACCENT changed to COMBINING_ACUTE_ACCENT", n) return text.replace(COMBINING_GRAVE_ACCENT, COMBINING_ACUTE_ACCENT) def process_lxml_tree(self, tree): root = tree.getroot() body = root.find('body') if body is None: return None for elem in body.iter(): if self.nostructured(elem): continue elem.text = self.change_accent(elem.text) elem.tail = self.change_accent(elem.tail) return tree if __name__ == '__main__': parser = fill_arg_for_processor('speech converter', True) parser_args = parser.parse_args() converter = ConverterStihi(parser_args) converter.process()