Beispiel #1
0
def main(lang):
    """Print the sentences that are followed by a matching language tag.

    Iterates over the elements produced by ``iterparse`` on standard
    input. The text of the most recent ``s`` element is remembered; each
    time a ``text`` element whose ``langid`` attribute equals *lang* is
    seen, that remembered text is printed UTF-8 encoded.
    """
    for _, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            # Hold on to the latest sentence text for a later 'text' element.
            sentence = node.text
            continue
        if tag == 'text' and node.get('langid') == lang:
            print(sentence.encode('utf8'))
Beispiel #2
0
def main(lang):
    """Print one line per ``file`` element with its sentences in *lang*.

    The text of the most recent ``s`` element is collected whenever a
    ``text`` element with ``langid`` equal to *lang* is seen. When a
    ``file`` element is reached, the collected texts are UTF-8 encoded,
    joined with single spaces, printed, and the collection starts over.
    """
    collected = []
    for _, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            sentence = node.text
        elif tag == 'text' and node.get('langid') == lang:
            collected.append(sentence)
        if tag == 'file':
            # One output line per file; start a fresh batch afterwards.
            encoded = [text.encode('utf8') for text in collected]
            print(' '.join(encoded))
            collected = []
Beispiel #3
0
def main(lang):
    """Emit, for every ``file`` element, its *lang* sentences on one line.

    While iterating over ``iterparse`` on standard input, the text of the
    latest ``s`` element is kept. A ``text`` element whose ``langid``
    attribute equals *lang* adds that text to the current batch. At each
    ``file`` element the batch is printed as space-separated UTF-8 and
    then emptied.
    """
    batch = []
    for _, elem in iterparse(sys.stdin, 'file'):
        if elem.tag == 's':
            sentence = elem.text
        elif elem.tag == 'text':
            matches = elem.get('langid') == lang
            if matches:
                batch.append(sentence)
        if elem.tag != 'file':
            continue
        # End of a file: flush what was gathered and reset the batch.
        line = ' '.join(item.encode('utf8') for item in batch)
        print(line)
        batch = []
Beispiel #4
0
def main(n):
    """Write the ``file`` elements whose ``id`` is a multiple of *n*.

    Elements come from ``iterparse`` over standard input. The selected
    ones are serialized to standard output between an XML declaration
    with an opening ``<dataset>`` tag and a closing ``</dataset>`` tag.

    Args:
        n: Sampling divisor; converted with ``int`` before use.
    """
    step = int(n)
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _, node in iterparse(sys.stdin, 'file'):
        if node.tag != 'file':
            continue
        if int(node.get('id')) % step != 0:
            continue
        # Blank the tail, presumably so text following the element is not
        # serialized along with it.
        node.tail = ''
        serialized = tostring(node, pretty_print=True, encoding='utf8')
        sys.stdout.write(serialized)
    print('</dataset>')
Beispiel #5
0
def main(n):
    """Keep every ``file`` element whose numeric ``id`` is divisible by *n*.

    Reads elements from ``iterparse`` on standard input and writes the
    selected ones to standard output, preceded by an XML declaration and
    ``<dataset>`` and followed by ``</dataset>``.

    Args:
        n: Divisor used for the selection; passed through ``int`` first.
    """
    divisor = int(n)
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    emit = sys.stdout.write
    for _, elem in iterparse(sys.stdin, 'file'):
        if elem.tag == 'file' and int(elem.get('id')) % divisor == 0:
            # Clear the tail before serializing the element on its own.
            elem.tail = ''
            emit(tostring(elem, pretty_print=True, encoding='utf8'))
    print('</dataset>')
Beispiel #6
0
def main(from_day, to_day):
    """Copy the ``file`` elements whose date falls in a day range.

    Reads elements from ``iterparse`` over standard input and writes a
    ``<dataset>`` document to standard output. Each ``metadata`` element
    decides whether the next ``file`` element reached is written: the
    part of its ``date`` attribute after the last ``-`` must lie between
    *from_day* and *to_day*, inclusive.

    Args:
        from_day: First day to keep, as a string or an integer. ``'5'``
            and ``'05'`` are treated the same.
        to_day: Last day to keep, in the same format as *from_day*.
    """
    # Days are compared as text, so pad every value to two characters;
    # without padding '9' would sort after '10' and '1' after '02'.
    first_day = str(from_day).zfill(2)
    last_day = str(to_day).zfill(2)

    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    keep = False
    for _, element in iterparse(sys.stdin, 'file'):
        if element.tag == 'metadata':
            day = element.get('date').split('-')[-1].zfill(2)
            keep = first_day <= day <= last_day
        elif element.tag == 'file':
            if keep:
                element.tail = ''
                sys.stdout.write(
                    tostring(element, pretty_print=True, encoding='utf8'))
            # Reset so a file without its own metadata does not inherit
            # the decision made for the previous file.
            keep = False
    print('</dataset>')
Beispiel #7
0
def main():
    """Filter ``unit`` elements out of each ``file`` and write the result.

    For every ``s`` element, its text (with non-breaking spaces replaced
    by ordinary spaces) is passed to ``should_keep``. A ``unit`` element
    is detached from its parent when the most recent verdict was false.
    Every ``file`` element is then serialized to standard output inside a
    ``<dataset>`` wrapper.
    """
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            normalized = node.text.replace(u'\xa0', u' ')
            keep = should_keep(normalized)
        elif tag == 'unit' and not keep:
            # Drop the whole unit from the tree before the file is written.
            node.getparent().remove(node)
        if tag == 'file':
            node.tail = ''
            serialized = tostring(node, pretty_print=True, encoding='utf8')
            sys.stdout.write(serialized)
    print('</dataset>')
Beispiel #8
0
def main(from_day, to_day):
    """Write the ``file`` elements dated between two days, inclusive.

    Elements are read from ``iterparse`` on standard input. A
    ``metadata`` element sets whether the next ``file`` element reached
    is written to standard output: the last ``-``-separated part of its
    ``date`` attribute is compared against *from_day* and *to_day*. The
    output is wrapped in an XML declaration and a ``<dataset>`` element.

    Args:
        from_day: Lower bound of the day range, as a string or integer;
            one-digit values need not be zero-padded.
        to_day: Upper bound of the day range, same format.
    """
    # The comparison is textual, so all values are zero-padded to two
    # characters first; otherwise '9' would compare greater than '10'.
    lower = str(from_day).zfill(2)
    upper = str(to_day).zfill(2)

    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    keep = False
    for _, element in iterparse(sys.stdin, 'file'):
        tag = element.tag
        if tag == 'metadata':
            date = element.get('date')
            keep = lower <= date.split('-')[-1].zfill(2) <= upper
        elif tag == 'file':
            if keep:
                element.tail = ''
                sys.stdout.write(
                    tostring(element, pretty_print=True, encoding='utf8'))
            # Each file needs its own metadata verdict; do not carry the
            # previous one over to a file that has none.
            keep = False
    print('</dataset>')
Beispiel #9
0
def main():
    """Write each ``file`` element after pruning rejected ``unit`` elements.

    The text of every ``s`` element has its non-breaking spaces turned
    into plain spaces and is handed to ``should_keep``. When a ``unit``
    element is reached and the latest result was false, the unit is
    removed from its parent. ``file`` elements are serialized to standard
    output between ``<dataset>`` and ``</dataset>``.
    """
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    emit = sys.stdout.write
    for _, elem in iterparse(sys.stdin, 'file'):
        if elem.tag == 's':
            keep = should_keep(elem.text.replace(u'\xa0', u' '))
        elif elem.tag == 'unit':
            if keep:
                continue
            parent = elem.getparent()
            parent.remove(elem)
        elif elem.tag == 'file':
            # Clear the tail before serializing the element on its own.
            elem.tail = ''
            emit(tostring(elem, pretty_print=True, encoding='utf8'))
    print('</dataset>')