class TestStackExchangeFileReader(unittest.TestCase):
    """Exercise StackExchangeFileReader against a small in-memory XML dump."""

    def setUp(self):
        """Create a reader over four fake XML lines, extracting Id and ParentId."""
        xml_lines = [
            '<?xml version="1.0" encoding="utf-8"?>',
            '<posts>',
            '<row Id="1" PostTypeId="1" AcceptedAnswerId="7" />',
            '<row Id="2" PostTypeId="2" ParentId="1" />',
        ]
        self.reader = StackExchangeFileReader(
            lines=xml_lines,
            attrib_names=['Id', 'ParentId'],
        )

    def test_extract(self):
        """Walk the generator line by line and check every yielded value."""
        extracted = self.reader.next_values()

        # The first two input lines (XML declaration and the opening <posts>
        # tag) are not <row> elements; the reader is expected to yield None
        # for each of them.
        for _ in range(2):
            self.assertIsNone(next(extracted))

        # Each <row> line must yield a mapping holding exactly the two
        # requested attributes; an attribute absent from the row maps to None.
        expected_rows = (
            ('1', None),
            ('2', '1'),
        )
        for expected_id, expected_parent in expected_rows:
            row = next(extracted)
            self.assertEqual(2, len(row))
            self.assertEqual(expected_id, row['Id'])
            self.assertEqual(expected_parent, row['ParentId'])

        # Once the input lines are used up the generator must be exhausted.
        with self.assertRaises(StopIteration):
            next(extracted)
# Command-line script: reads a StackExchange XML dump from STDIN, extracts the
# attributes named on the command line and writes them as CSV, either to the
# file given with --out or to STDOUT.
import argparse
import csv
import sys

from stack_io.extract import StackExchangeFileReader

parser = argparse.ArgumentParser(
    description='Reads specific values from a XML StackExchange dump file '
                'from STDIN and writes them in a CSV format')
parser.add_argument('--out', help='output CSV file name')
parser.add_argument('names', nargs='+',
                    help='one or more attributes to be extracted from the XML.')
args = parser.parse_args()

reader = StackExchangeFileReader(lines=sys.stdin, attrib_names=args.names)

if args.out:
    # The csv module requires newline='' on files it writes to; without it,
    # platforms that translate line endings produce an extra blank row after
    # every record.
    csvout = open(args.out, 'w', newline='')
else:
    csvout = sys.stdout

try:
    writer = csv.DictWriter(csvout, fieldnames=args.names)
    writer.writeheader()
    for value in reader.next_values():
        # Falsy values (e.g. None) are skipped so that only extracted rows
        # reach the CSV output.
        if value:
            writer.writerow(value)
finally:
    # Close (and thereby flush) only a file this script opened itself;
    # sys.stdout belongs to the interpreter and must stay open.
    if csvout is not sys.stdout:
        csvout.close()