Esempio n. 1
0
def main(lang):
    """Print the remembered ``s`` text for every ``text`` element in *lang*.

    Elements are taken from ``iterparse(sys.stdin, 'file')``.  The text of
    each ``s`` element is remembered; whenever a ``text`` element whose
    ``langid`` attribute equals *lang* is seen, the most recently remembered
    text is printed, UTF-8 encoded, on its own line.
    """
    for _event, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            # Hold on to this text until a matching ``text`` element arrives.
            last_sentence = node.text
            continue
        if tag != 'text' or node.get('langid') != lang:
            continue
        print(last_sentence.encode('utf8'))
Esempio n. 2
0
def main(lang):
    """Print one line per ``file`` element with its sentences tagged *lang*.

    Elements are taken from ``iterparse(sys.stdin, 'file')``.  The text of
    each ``s`` element is remembered, and it is collected whenever a ``text``
    element whose ``langid`` attribute equals *lang* is seen.  When a
    ``file`` element is reached, the collected texts are UTF-8 encoded,
    joined with single spaces, printed, and the collection is emptied.
    """
    collected = []
    for _event, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            current = node.text
        elif tag == 'text':
            if node.get('langid') == lang:
                collected.append(current)
        elif tag == 'file':
            encoded = [text.encode('utf8') for text in collected]
            print(' '.join(encoded))
            # Start afresh for the next ``file`` element.
            collected = []
Esempio n. 3
0
def main(lang):
    """Emit the *lang* sentences of each ``file`` element as one line.

    Reads elements from ``iterparse(sys.stdin, 'file')``.  ``s`` elements
    update the remembered sentence text; ``text`` elements whose ``langid``
    attribute equals *lang* add the remembered text to the current batch;
    ``file`` elements print the batch (UTF-8 encoded, space separated) and
    reset it.
    """
    batch = []
    for _, elem in iterparse(sys.stdin, 'file'):
        if elem.tag == 's':
            latest = elem.text
            continue
        if elem.tag == 'text':
            if elem.get('langid') == lang:
                batch.append(latest)
            continue
        if elem.tag != 'file':
            continue
        line = ' '.join(item.encode('utf8') for item in batch)
        print(line)
        batch = []
Esempio n. 4
0
def main(n):
    """Write every ``file`` element whose integer ``id`` is a multiple of *n*.

    Output is an XML declaration, an opening ``<dataset>`` tag, the
    serialized form of each selected element, and a closing ``</dataset>``
    tag, all on standard output.  Elements are taken from
    ``iterparse(sys.stdin, 'file')``.

    Args:
        n: the divisor; converted with ``int`` before use.
    """
    modulus = int(n)
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _event, node in iterparse(sys.stdin, 'file'):
        if node.tag != 'file':
            continue
        if int(node.get('id')) % modulus != 0:
            continue
        # Blank the tail before serializing -- presumably so text that
        # follows the element is not emitted along with it.
        node.tail = ''
        serialized = tostring(node, pretty_print=True, encoding='utf8')
        sys.stdout.write(serialized)
    print('</dataset>')
Esempio n. 5
0
def main(n):
    """Keep one ``file`` element in every *n*, selected by its ``id``.

    Prints an XML declaration and ``<dataset>``, then serializes to standard
    output each ``file`` element (from ``iterparse(sys.stdin, 'file')``)
    whose ``id`` attribute, read as an integer, is divisible by *n*, and
    finally prints ``</dataset>``.

    Args:
        n: the divisor; converted with ``int`` before use.
    """
    divisor = int(n)
    write = sys.stdout.write
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _, elem in iterparse(sys.stdin, 'file'):
        selected = elem.tag == 'file' and int(elem.get('id')) % divisor == 0
        if selected:
            # The tail is blanked before the element is serialized.
            elem.tail = ''
            write(tostring(elem, pretty_print=True, encoding='utf8'))
    print('</dataset>')
Esempio n. 6
0
def main(from_day, to_day):
    """Write the ``file`` elements whose metadata day is in [from_day, to_day].

    Elements are taken from ``iterparse(sys.stdin, 'file')``.  Each
    ``metadata`` element updates a keep/drop flag from the last
    ``-``-separated component of its ``date`` attribute; every ``file``
    element seen while the flag is set is serialized to standard output
    between ``<dataset>`` and ``</dataset>``.

    Args:
        from_day: first day to keep (inclusive); an int or numeric string.
        to_day: last day to keep (inclusive); an int or numeric string.

    Raises:
        ValueError: if a bound or the day component of a date is not numeric.
    """
    # Compare days as numbers: as strings, '9' would sort after '10', so
    # bounds or dates that are not zero-padded would be filtered wrongly.
    first_day = int(from_day)
    last_day = int(to_day)
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    keep = False
    for _, element in iterparse(sys.stdin, 'file'):
        if element.tag == 'file' and keep:
            # The tail is blanked before the element is serialized.
            element.tail = ''
            sys.stdout.write(tostring(element, pretty_print=True, encoding='utf8'))
        elif element.tag == 'metadata':
            date = element.get('date')
            # Assumes the date ends in '-DD' (e.g. YYYY-MM-DD) -- TODO confirm.
            keep = first_day <= int(date.split('-')[-1]) <= last_day
    print('</dataset>')
Esempio n. 7
0
def main():
    """Filter ``unit`` elements and write each ``file`` element to stdout.

    Elements are taken from ``iterparse(sys.stdin, 'file')``.  For every
    ``s`` element, ``should_keep`` is called on its text with U+00A0
    replaced by a plain space, and the result is remembered.  A ``unit``
    element is removed from its parent when the remembered result is false.
    Each ``file`` element is then serialized to standard output, wrapped in
    an XML declaration and ``<dataset>`` ... ``</dataset>``.
    """
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _event, node in iterparse(sys.stdin, 'file'):
        tag = node.tag
        if tag == 's':
            normalized = node.text.replace(u'\xa0', u' ')
            keep = should_keep(normalized)
        elif tag == 'unit':
            if not keep:
                node.getparent().remove(node)
        elif tag == 'file':
            # The tail is blanked before the element is serialized.
            node.tail = ''
            sys.stdout.write(tostring(node, pretty_print=True, encoding='utf8'))
    print('</dataset>')
Esempio n. 8
0
File: split.py Progetto: afcarl/teny
def main(from_day, to_day):
    """Write the ``file`` elements dated between *from_day* and *to_day*.

    Elements are taken from ``iterparse(sys.stdin, 'file')``.  Each
    ``metadata`` element sets a keep/drop flag from the last
    ``-``-separated component of its ``date`` attribute; every ``file``
    element seen while the flag is set is serialized to standard output
    between ``<dataset>`` and ``</dataset>``.

    Args:
        from_day: first day to keep (inclusive); an int or numeric string.
        to_day: last day to keep (inclusive); an int or numeric string.

    Raises:
        ValueError: if a bound or the day component of a date is not numeric.
    """
    # Days are compared numerically; a string comparison would order '9'
    # after '10' and mis-filter input that is not zero-padded.
    first_day = int(from_day)
    last_day = int(to_day)
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    keep = False
    for _, element in iterparse(sys.stdin, 'file'):
        if element.tag == 'file' and keep:
            # The tail is blanked before the element is serialized.
            element.tail = ''
            sys.stdout.write(
                tostring(element, pretty_print=True, encoding='utf8'))
        elif element.tag == 'metadata':
            date = element.get('date')
            # Assumes the date ends in '-DD' (e.g. YYYY-MM-DD) -- TODO confirm.
            keep = first_day <= int(date.split('-')[-1]) <= last_day
    print('</dataset>')
Esempio n. 9
0
def main():
    """Drop rejected ``unit`` elements and print each ``file`` element.

    Reads elements from ``iterparse(sys.stdin, 'file')``.  The verdict of
    ``should_keep`` on the latest ``s`` text (with U+00A0 replaced by a
    plain space) decides whether a following ``unit`` element is removed
    from its parent.  Every ``file`` element is serialized to standard
    output inside an XML declaration and ``<dataset>`` ... ``</dataset>``.
    """
    out = sys.stdout
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<dataset>')
    for _, elem in iterparse(sys.stdin, 'file'):
        if elem.tag == 's':
            keep = should_keep(elem.text.replace(u'\xa0', u' '))
        elif elem.tag == 'unit' and not keep:
            parent = elem.getparent()
            parent.remove(elem)
        if elem.tag == 'file':
            # The tail is blanked before the element is serialized.
            elem.tail = ''
            xml_bytes = tostring(elem, pretty_print=True, encoding='utf8')
            out.write(xml_bytes)
    print('</dataset>')