def main(lang):
    """Print the latest <s> text for each <text> element whose langid is *lang*.

    Walks the elements yielded by ``iterparse(sys.stdin, 'file')``.  The text
    of every <s> element is remembered; each time a <text> element whose
    ``langid`` attribute equals *lang* is reached, the remembered text is
    printed, UTF-8 encoded, on a line of its own.
    """
    for _event, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            # Remember the most recent sentence text.
            sentence = node.text
        elif tag == 'text' and node.get('langid') == lang:
            # NOTE(review): relies on an <s> with non-empty text having been
            # seen before this point -- confirm against the input format.
            print(sentence.encode('utf8'))
def main(lang):
    """Print, one line per <file>, the sentences matched for language *lang*.

    Walks the elements yielded by ``iterparse(sys.stdin, 'file')``.  The text
    of every <s> element is remembered; each <text> element whose ``langid``
    attribute equals *lang* adds the remembered text to a pending list.  When
    a <file> element is reached, the pending texts are UTF-8 encoded, joined
    with single spaces, printed, and the list is emptied.
    """
    collected = []
    for _event, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            # Remember the most recent sentence text.
            sentence = node.text
        elif tag == 'text' and node.get('langid') == lang:
            collected.append(sentence)
        elif tag == 'file':
            # A line is printed for every <file>, even when nothing matched.
            encoded = [text.encode('utf8') for text in collected]
            print(' '.join(encoded))
            collected = []
def main(n):
    """Write to stdout the <file> elements whose id is a multiple of *n*.

    *n* is converted with ``int()`` before anything is printed.  The output
    is an XML declaration, an opening ``<dataset>`` line, the serialized form
    of each selected <file> element, and a closing ``</dataset>`` line.
    Elements come from ``iterparse(sys.stdin, 'file')``.
    """
    step = int(n)
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _event, node in iterparse(sys.stdin, 'file'):
        if node.tag != 'file':
            continue
        if int(node.get('id')) % step != 0:
            continue
        # Drop trailing text so it is not serialized after the element.
        node.tail = ''
        sys.stdout.write(tostring(node, pretty_print=True, encoding='utf8'))
    print('</dataset>')
def main(n):
    """Copy every <file> element whose integer id is divisible by *n*.

    After converting *n* with ``int()``, prints an XML declaration and an
    opening ``<dataset>`` line, then serializes each <file> element yielded
    by ``iterparse(sys.stdin, 'file')`` whose ``id`` attribute, read as an
    integer, leaves no remainder when divided by *n*.  A closing
    ``</dataset>`` line is printed last.
    """
    divisor = int(n)
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _event, file_node in iterparse(sys.stdin, 'file'):
        if file_node.tag == 'file' and int(file_node.get('id')) % divisor == 0:
            # Clear the tail before serializing so it is not emitted.
            file_node.tail = ''
            serialized = tostring(
                file_node, pretty_print=True, encoding='utf8')
            sys.stdout.write(serialized)
    print('</dataset>')
def main(from_day, to_day):
    """Write to stdout the <file> elements dated between two days, inclusive.

    Prints an XML declaration and an opening ``<dataset>`` line, then walks
    the elements yielded by ``iterparse(sys.stdin, 'file')``.  Each
    <metadata> element updates a ``keep`` flag from the last ``-``-separated
    field of its ``date`` attribute; each <file> element reached while the
    flag is set is serialized to stdout.  A closing ``</dataset>`` line is
    printed last.

    Args:
        from_day: Lower bound for the day field (inclusive).
        to_day: Upper bound for the day field (inclusive).

    The bounds and the day field are compared numerically whenever all three
    parse as integers, so ``'9'`` sorts before ``'10'`` and ``'1'`` matches
    ``'01'``.  If any of them is not an integer, the comparison falls back to
    comparing the raw values as given.
    """
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    keep = False
    for _, element in iterparse(sys.stdin, 'file'):
        if element.tag == 'file' and keep:
            # Clear the tail before serializing so it is not emitted.
            element.tail = ''
            sys.stdout.write(tostring(element, pretty_print=True, encoding='utf8'))
        elif element.tag == 'metadata':
            day = element.get('date').split('-')[-1]
            try:
                # Textual comparison mis-orders unpadded days ('9' > '10').
                keep = int(from_day) <= int(day) <= int(to_day)
            except (TypeError, ValueError):
                # Non-numeric values: keep the original raw comparison.
                keep = from_day <= day <= to_day
    print('</dataset>')
def main():
    """Write to stdout each <file> after removing units that fail should_keep.

    Prints an XML declaration and an opening ``<dataset>`` line, then walks
    the elements yielded by ``iterparse(sys.stdin, 'file')``:

    * <s>: its text, with U+00A0 replaced by a plain space, is passed to
      ``should_keep`` and the result is remembered.
    * <unit>: removed from its parent when the remembered result is false.
    * <file>: serialized to stdout.

    A closing ``</dataset>`` line is printed last.
    """
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _event, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            # NOTE(review): assumes the <s> element has text -- confirm that
            # empty sentences cannot occur in the input.
            normalized = node.text.replace(u'\xa0', u' ')
            keep = should_keep(normalized)
        elif tag == 'unit' and not keep:
            node.getparent().remove(node)
        elif tag == 'file':
            # Clear the tail before serializing so it is not emitted.
            node.tail = ''
            sys.stdout.write(tostring(node, pretty_print=True, encoding='utf8'))
    print('</dataset>')
def main(from_day, to_day):
    """Copy to stdout the <file> elements whose day lies in [from_day, to_day].

    Prints an XML declaration and an opening ``<dataset>`` line, then walks
    the elements yielded by ``iterparse(sys.stdin, 'file')``.  Every
    <metadata> element recomputes a ``keep`` flag from the last
    ``-``-separated field of its ``date`` attribute; every <file> element
    reached while the flag is set is serialized to stdout.  A closing
    ``</dataset>`` line is printed last.

    Args:
        from_day: Lower bound for the day field (inclusive).
        to_day: Upper bound for the day field (inclusive).

    When the bounds and the day field all parse as integers they are compared
    numerically, so unpadded values such as ``'9'`` and ``'10'`` order
    correctly.  Otherwise the raw values are compared as given.
    """
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    keep = False
    for _, element in iterparse(sys.stdin, 'file'):
        if element.tag == 'file' and keep:
            # Clear the tail before serializing so it is not emitted.
            element.tail = ''
            sys.stdout.write(
                tostring(element, pretty_print=True, encoding='utf8'))
        elif element.tag == 'metadata':
            day = element.get('date').split('-')[-1]
            try:
                # Textual comparison mis-orders unpadded days ('9' > '10').
                keep = int(from_day) <= int(day) <= int(to_day)
            except (TypeError, ValueError):
                # Non-numeric values: keep the original raw comparison.
                keep = from_day <= day <= to_day
    print('</dataset>')
def main():
    """Filter <unit> elements with should_keep and echo each <file> to stdout.

    Output starts with an XML declaration and an opening ``<dataset>`` line
    and ends with a closing ``</dataset>`` line.  In between, for each
    element yielded by ``iterparse(sys.stdin, 'file')``:

    * an <s> element has U+00A0 in its text replaced by a plain space, and
      the verdict of ``should_keep`` on that text is remembered;
    * a <unit> element is detached from its parent if the remembered verdict
      is false;
    * a <file> element is serialized to stdout.
    """
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _, element in iterparse(sys.stdin, 'file'):
        if element.tag == 's':
            # NOTE(review): assumes the <s> element has text -- confirm that
            # empty sentences cannot occur in the input.
            sentence_text = element.text.replace(u'\xa0', u' ')
            keep = should_keep(sentence_text)
            continue
        if element.tag == 'unit':
            if not keep:
                parent = element.getparent()
                parent.remove(element)
            continue
        if element.tag != 'file':
            continue
        # Clear the tail before serializing so it is not emitted.
        element.tail = ''
        serialized = tostring(element, pretty_print=True, encoding='utf8')
        sys.stdout.write(serialized)
    print('</dataset>')