def test_hexdump():
    """
    Routine hexdump() reads one file and hexdumps it to another.
    """
    pytest.debug_func()
    exp = "\n".join([
        " 54 77 61 73 20 62 72 69 6c 6c 69 67 20 61 6e 64 " "Twas bri llig and",
        " 20 74 68 65 20 73 6c 69 74 68 65 20 74 6f 76 65 " " the sli the tove",
        " 73 0a 44 69 64 20 67 79 72 65 20 61 6e 64 20 67 " "s.Did gy re and g",
        " 69 6d 62 6c 65 20 72 6f 75 6e 64 20 74 68 65 20 " "imble ro und the ",
        " 77 61 62 65 0a 41 6c 6c 20 6d 69 6d 73 79 20 77 " "wabe.All  mimsy w",
        " 65 72 65 20 74 68 65 20 62 6f 72 6f 67 72 6f 76 " "ere the  borogrov",
        " 65 73 0a 41 6e 64 20 74 68 65 20 6d 6f 6d 65 20 " "es.And t he mome ",
        " 72 61 74 68 73 20 6f 75 74 67 72 61 62 65 0a " "raths ou tgrabe. ",
        ""
    ])
    q = sio("\n".join(["Twas brillig and the slithe toves",
                       "Did gyre and gimble round the wabe",
                       "All mimsy were the borogroves",
                       "And the mome raths outgrabe\n"]))
    z = sio()
    hd.hexdump(q, z)
    result = z.getvalue()
    z.close()
    q.close()
    assert exp == result
def test_readGraph_kthlist_non_dag(self):
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_dag),
                      graph_type="digraph")
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_dag),
                      graph_type="dag", file_format="kthlist")
    G = readGraph(sio(kthlist_non_dag), graph_type="digraph",
                  file_format="kthlist")
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 3)
def test_readGraph_kthlist_non_bipartite(self):
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_bipartite),
                      graph_type="bipartite")
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_bipartite),
                      graph_type="bipartite", file_format="kthlist")
    G = readGraph(sio(kthlist_non_bipartite), graph_type="simple",
                  file_format="kthlist")
    self.assertEqual(G.order(), 5)
    self.assertEqual(len(G.edges()), 5)
def test_readGraph_dot_path2(self):
    if "dot" not in supported_formats()["simple"]:
        self.skipTest("No support for Dot file I/O.")
    self.assertRaises(ValueError, readGraph, sio(dot_path2),
                      graph_type="simple")
    G = readGraph(sio(dot_path2), graph_type="simple", file_format="dot")
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 2)
def test_readGraph_gml_path2(self):
    self.assertRaises(ValueError, readGraph, sio(gml_path2),
                      graph_type='simple')
    G = readGraph(sio(gml_path2), graph_type='simple', file_format='gml')
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 2)
def csv_rows(partitionNumber, rows):
    inputStr = "\n".join(rows)
    if partitionNumber == 0:
        return iter([pandas.read_csv(sio(inputStr), *args, header=None,
                                     names=mynames, skiprows=_skiprows,
                                     **kwargs)])
    else:
        # could use .iterrows() instead?
        return iter([pandas.read_csv(sio(inputStr), *args, header=None,
                                     names=mynames, **kwargs)])
def test_readGraph_dot_path2(self):
    if 'dot' not in supported_formats()['simple']:
        self.skipTest("No support for Dot file I/O.")
    self.assertRaises(ValueError, readGraph, sio(dot_path2),
                      graph_type='simple')
    G = readGraph(sio(dot_path2), graph_type='simple', file_format='dot')
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 2)
def test_non_symmetric_input_wrong(self):
    """Symmetric encoding on a non-symmetric graph

    The formula in this test uses the symmetric encoding for a
    non-symmetric graph. This causes the formula to be unsatisfiable,
    even though it should be SAT.
    """
    G = readGraph(sio(example1), "simple", file_format="kthlist")
    T = readGraph(sio(example1alt), "simple", file_format="kthlist")
    F = SubgraphFormula(G, [T],
                        symmetric=True)  # This should cause the wrong answer
    self.assertUNSAT(F)
def test_non_symmetric_input_right(self):
    """NON-symmetric encoding on a non-symmetric graph

    The formula in this test uses the NON-symmetric encoding for a
    non-symmetric graph. This causes the formula to be satisfiable,
    as it should be.
    """
    G = readGraph(sio(example1), "simple", file_format="kthlist")
    T = readGraph(sio(example1alt), "simple", file_format="kthlist")
    F = SubgraphFormula(G, [T])
    self.assertSAT(F)
def try_compress_decompress(self, data):
    source = sio(data)
    dest = sio()
    encode_file(source, dest)
    dest.seek(0)
    decoded = sio()
    decode_file(dest, decoded)
    decoded_data = decoded.getvalue()
    self.assertEqual(len(data), len(decoded_data))
    self.assertEqual(data, decoded_data)
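# A hypothetical pair of tests built on the round-trip helper above; the
# payloads are illustrative, and encode_file/decode_file are the codec
# functions exercised by try_compress_decompress.
def test_roundtrip_ascii(self):
    self.try_compress_decompress("The quick brown fox jumps over the lazy dog")

def test_roundtrip_binary(self):
    self.try_compress_decompress("".join(chr(b) for b in xrange(256)))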
def test_non_symmetric_input_wrong(self):
    """Symmetric encoding on a non-symmetric graph

    The formula in this test uses the symmetric encoding for a
    non-symmetric graph. This causes the formula to be unsatisfiable,
    even though it should be SAT.
    """
    G = readGraph(sio(example1), "simple", file_format="kthlist")
    T = readGraph(sio(example1alt), "simple", file_format="kthlist")
    F = SubgraphFormula(
        G, [T], symmetric=True)  # This should cause the wrong answer
    self.assertUNSAT(F)
def csv_rows(partition_number, rows):
    # pylint: disable=unexpected-keyword-arg
    in_str = "\n".join(rows)
    if partition_number == 0:
        return iter([
            pandas.read_csv(
                sio(in_str), *args, header=None, names=mynames,
                skiprows=_skiprows, **kwargs)])
    else:
        # could use .iterrows() instead?
        return iter([pandas.read_csv(sio(in_str), *args, header=None,
                                     names=mynames, **kwargs)])
def csv_file(partitionNumber, files):
    file_count = 0
    for filename, contents in files:
        # Only skip lines on the first file
        if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
            yield pandas.read_csv(sio(contents), *args, header=None,
                                  names=mynames, skiprows=_skiprows,
                                  **kwargs)
        else:
            yield pandas.read_csv(sio(contents), *args, header=None,
                                  names=mynames, **kwargs)
        file_count += 1
def test_readGraph_kthlist_non_bipartite(self):
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_bipartite),
                      graph_type='bipartite')
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_bipartite),
                      graph_type='bipartite', file_format='kthlist')
    G = readGraph(sio(kthlist_non_bipartite), graph_type='simple',
                  file_format='kthlist')
    self.assertEqual(G.order(), 5)
    self.assertEqual(len(G.edges()), 5)
def test_readGraph_kthlist_bipartite(self):
    G = readGraph(sio(kthlist_bipartite), graph_type="bipartite",
                  file_format="kthlist")
    self.assertEqual(G.order(), 5)
    L, R = bipartite_sets(G)
    self.assertEqual(len(L), 2)
    self.assertEqual(len(R), 3)
def _hostmaskPatternEqual(pattern, hostmask):
    try:
        return _patternCache[pattern](hostmask) is not None
    except KeyError:
        # We make our own regexps, rather than use fnmatch, because fnmatch's
        # case-insensitivity is not IRC's case-insensitivity.
        fd = sio()
        for c in pattern:
            if c == '*':
                fd.write('.*')
            elif c == '?':
                fd.write('.')
            elif c in '[{':
                fd.write('[[{]')
            elif c in '}]':
                fd.write(r'[}\]]')
            elif c in '|\\':
                fd.write(r'[|\\]')
            elif c in '^~':
                fd.write('[~^]')
            else:
                fd.write(re.escape(c))
        fd.write('$')
        f = re.compile(fd.getvalue(), re.I).match
        _patternCache[pattern] = f
        return f(hostmask) is not None
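# A minimal usage sketch of the matcher above, assuming _patternCache is a
# plain dict and sio is StringIO.StringIO; the hostmasks are illustrative.
# '*' and '?' act as wildcards, while '[{', '}]', '|\\' and '^~' are each
# treated as equivalent pairs, matching IRC's case-mapping rules.
print _hostmaskPatternEqual('*!*@*.example.com',
                            'nick!user@host.example.com')       # True
print _hostmaskPatternEqual('nick[a]!*@*', 'nick{a}!user@host')  # True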
def test_readGraph_kthlist_non_dag(self):
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_dag),
                      graph_type='digraph')
    self.assertRaises(ValueError, readGraph, sio(kthlist_non_dag),
                      graph_type='dag', file_format='kthlist')
    G = readGraph(sio(kthlist_non_dag), graph_type='digraph',
                  file_format='kthlist')
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 3)
def send_xml(self, xml_content):
    """ Method used to send a given xml file to the switches """
    req = pycurl.Curl()
    # ignore ssl certificate verification
    if self.method == 'https':
        req.setopt(req.SSL_VERIFYPEER, 0)
        req.setopt(req.SSL_VERIFYHOST, 0)
    # set url being used
    req.setopt(req.URL, self._create_url())
    zipped = sio()
    with gzip.GzipFile(fileobj=zipped, mode='w') as gzip_file:
        gzip_file.write(xml_content)
    run_data = zipped.getvalue()
    # sets necessary multipart fields and adds the zip from the buffer
    data = [('page', 'file_upload'),
            ('running_part', '1'),
            ('file_to_upload', (req.FORM_BUFFER, 'update_config',
                                req.FORM_BUFFERPTR, run_data))]
    # sets POST method and the multipart packet
    req.setopt(req.HTTPPOST, data)
    # executes curl and exits
    req.perform()
    req.close()
def get():
    output = sio()
    output.write('GET\n')
    output.write('args: {}\n'.format(request.args))
    content = output.getvalue()
    output.close()
    return Response(content, mimetype='text/plain')
def serialize(self, format='csv'):
    """ Returns the CSV representation of the whole aggregation. """
    # utf-8 encoding helper (csv_writer does not write unicode)
    utf8_recoder = lambda s: s.encode('utf-8') if isinstance(s, unicode) \
        else s

    def getter(obj, atr):
        if '/' in atr:
            d, k = atr.split('/')
            if getattr(obj, d, None):
                return getattr(obj, d, {}).get(k, None)
            else:
                return None
        else:
            return getattr(obj, atr, None)

    if format == 'csv':
        s = sio()  # string IO buffer
        w = csv_writer(s)
        # column headers
        cols = ['id', 'uri']
        cols.extend(sorted(self.cols))
        w.writerow(cols)
        # column values
        for obj in self.aggregator:
            w.writerow(map(utf8_recoder,
                           (getter(obj, atr) for atr in cols)))
        r = s.getvalue()
        s.close()
        return r
    # any other format: return the raw aggregation
    return self.aggregator
def send_xml(self, xml_content):
    """ Method used to send a given xml file to the switches """
    # set url being used
    url = self._create_url()
    zipped = sio()
    with gzip.GzipFile(fileobj=zipped, mode='w') as gzip_file:
        gzip_file.write(xml_content)
    run_data = zipped.getvalue()
    zipped.close()
    fields = (('page', 'file_upload'),
              ('running_part', '1'),
              ('file_to_upload', ('file_to_upload', run_data,
                                  'application/octet-stream')))
    m = MultipartEncoder(fields=fields, boundary='-----boundary-----')
    r = requests.post(url=url, data=m, auth=self.auth,
                      headers={'Content-type': m.content_type},
                      verify=False)
    print r.text
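# A runnable round-trip sketch of the gzip-in-memory pattern used by
# send_xml above (and verified by the tests further below); the payload is
# illustrative.
import gzip
from StringIO import StringIO as sio

buf = sio()
with gzip.GzipFile(fileobj=buf, mode='w') as gzip_file:
    gzip_file.write('<cfg_data/>')
compressed = buf.getvalue()
buf.close()

with gzip.GzipFile(fileobj=sio(compressed), mode='r') as gzip_file:
    assert gzip_file.read() == '<cfg_data/>'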
def test_rowset_as_schema(self):
    from StringIO import StringIO as sio
    ts = CSVTableSet.from_fileobj(sio('''name,dob\nmk,2012-01-02\n'''))
    rs = ts.tables[0]
    jts = rowset_as_jts(rs).as_dict()
    assert_equal(jts['fields'], [
        {'type': 'string', 'id': u'name', 'label': u'name'},
        {'type': 'date', 'id': u'dob', 'label': u'dob'}])
def test_rowset_as_schema(self):
    from StringIO import StringIO as sio
    ts = CSVTableSet(sio('''name,dob\nmk,2012-01-02\n'''))
    rs = ts.tables[0]
    jts = rowset_as_jts(rs).as_dict()
    assert_equal(jts['fields'], [
        {'type': 'string', 'id': u'name', 'label': u'name'},
        {'type': 'date', 'id': u'dob', 'label': u'dob'}])
def test_readGraph_kthlist_bipartite(self):
    G = readGraph(sio(kthlist_bipartite), graph_type='bipartite',
                  file_format='kthlist')
    self.assertEqual(G.order(), 5)
    L, R = bipartite_sets(G)
    self.assertEqual(len(L), 2)
    self.assertEqual(len(R), 3)
def test_low_level_gml_read_path2(self):
    G = nx.read_gml(sio(gml_path2))
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 2)
    self.assertTrue(G.has_edge(0, 1))
    self.assertTrue(G.has_edge(1, 2))
    self.assertFalse(G.has_edge(0, 2))
def test_low_level_dimacs_read_path2(self):
    G = cnfformula.graphs._read_graph_dimacs_format(sio(dimacs_path2))
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 2)
    self.assertTrue(G.has_edge(1, 2))
    self.assertTrue(G.has_edge(2, 3))
    self.assertFalse(G.has_edge(1, 3))
def post():
    output = sio()
    output.write('POST\n')
    output.write('form: {}\n'.format(request.form))
    output.write('data: {}\n'.format(request.data))
    output.write('files: {}\n'.format(request.files))
    content = output.getvalue()
    output.close()
    return Response(content, mimetype='text/plain')
def csv_file(partition_number, files):
    # pylint: disable=unexpected-keyword-arg
    file_count = 0
    for _, contents in files:
        # Only skip lines on the first file
        if partition_number == 0 and file_count == 0 and _skiprows > 0:
            yield pandas.read_csv(
                sio(contents), *args, header=None, names=mynames,
                skiprows=_skiprows, **kwargs)
        else:
            yield pandas.read_csv(
                sio(contents), *args, header=None, names=mynames, **kwargs)
        file_count += 1
def test_rowset_as_schema(self):
    from StringIO import StringIO as sio
    ts = CSVTableSet(sio("""name,dob\nmk,2012-01-02\n"""))
    rs = ts.tables[0]
    jts = rowset_as_jts(rs).as_dict()
    assert_equal(
        jts["fields"],
        [{"type": "string", "id": u"name", "label": u"name"},
         {"type": "date", "id": u"dob", "label": u"dob"}],
    )
def decode_hex_string(st, n):
    """Takes a hexadecimal string, containing a numerator and denominator
    separated by a single space, and returns the first n decoded
    hexadecimal digits
    """
    num, den = [int(s, 16) for s in st.split(" ")]
    s = sio()
    for i in xrange(n):
        num = num * 16
        d = num / den
        num = num % den
        s.write("%x" % d)
    return s.getvalue()
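# Worked examples for decode_hex_string: the fraction 1/3 is 0x0.5555... in
# base 16, and 1/2 is 0x0.8, so:
print decode_hex_string("1 3", 4)   # -> 5555
print decode_hex_string("1 2", 3)   # -> 800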
def long_to_bytes(n, nbytes=None):
    """Convert long value to string of bytes, either minimal or of
    given length
    """
    odata = sio()
    written = 0
    while n > 0:
        odata.write(chr(n & 0xff))
        written += 1
        n >>= 8
    if nbytes is not None:
        for i in xrange(nbytes - written):
            odata.write('\x00')
    return odata.getvalue()[::-1]
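# Worked examples for long_to_bytes (Python 2 byte strings):
print repr(long_to_bytes(0x41424344))        # -> 'ABCD'
print repr(long_to_bytes(0x4142, nbytes=4))  # -> '\x00\x00AB'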
def csv_rows(partitionNumber, rows):
    inputStr = "\n".join(rows)
    if partitionNumber == 0:
        return iter([
            pandas.read_csv(sio(inputStr), *args, header=None,
                            names=mynames, skiprows=_skiprows, **kwargs)
        ])
    else:
        # could use .iterrows() instead?
        return iter([
            pandas.read_csv(sio(inputStr), *args, header=None,
                            names=mynames, **kwargs)
        ])
def test_send_xml(self):
    test_url = '1.1.1.1'
    test_auth = ('user', 'pass')
    test_method = 'https'
    test_rpc = rpc.RPC(test_auth[0], test_auth[1], test_url, test_method)
    rpc.get = mock.Mock()
    cfg = ds.Cfg_data()
    vlan = ds.Vlan_global(42, name="vlan_test", ports=ds.Pbits([1, 3, 4]))
    cfg.vlans.append(vlan)
    test_rpc.send_xml(cfg.as_xml_text())
    # mock_calls returns the calls executed to this method, the 0 means we
    # are getting the first (and only) call, and the 2 means we are getting
    # the keyword parameters.
    parameters = requests.post.mock_calls[0][2]
    # retrieve url
    received_url = parameters['url']
    expected_url = 'https://1.1.1.1/System/File/file_config.html'
    self.assertEquals(expected_url, received_url)
    # retrieve data from the parameters.
    data = parameters['data']
    expected_xml = '<cfg_data><vlan_global id0="42"><vid>42</vid>' + \
                   '<active>1</active><name>vlan_test</name>' + \
                   '<pbmp_untagged id0="0"><pbits id0="0">13</pbits>' + \
                   '</pbmp_untagged></vlan_global></cfg_data>'
    # Since the XML has to be the last parameter to be passed, we have to
    # get the last field.
    zippedXML = data.fields[-1][-1][-2]
    # decompress to do the comparison:
    # get the file with the zipped xml as content
    zipFileObject = sio(zippedXML)
    # get the actual file (from which we read)
    with gzip.GzipFile(fileobj=zipFileObject, mode='r') as zipFile:
        received_xml = zipFile.read()
    zipFileObject.close()
    self.assertEquals(expected_xml, received_xml)
def test_send_xml(self):
    test_url = '1.1.1.1'
    test_auth = ('user', 'pass')
    test_method = 'https'
    test_rpc = rpc.RPC(test_auth[0], test_auth[1], test_url, test_method)
    rpc.get = mock.Mock()
    cfg = ds.CfgData()
    vlan = ds.VlanGlobal(42, name="vlan_test", ports=ds.Pbits([1, 3, 4]))
    cfg.vlans.append(vlan)
    test_rpc.send_xml(cfg.as_xml_text())
    # mock_calls returns the calls executed to this method, the 0 means we
    # are getting the first (and only) call, and the 2 means we are getting
    # the keyword parameters.
    parameters = requests.post.mock_calls[0][2]
    # retrieve url
    received_url = parameters['url']
    expected_url = 'https://1.1.1.1/System/File/file_config.html'
    self.assertEquals(expected_url, received_url)
    # retrieve data from the parameters.
    data = parameters['data']
    expected_xml = '<cfg_data><vlan_global id0="42"><vid>42</vid>' + \
                   '<active>1</active><name>vlan_test</name>' + \
                   '<pbmp_untagged id0="0"><pbits id0="0">13</pbits>' + \
                   '</pbmp_untagged></vlan_global></cfg_data>'
    # Since the XML has to be the last parameter to be passed, we have to
    # get the last field.
    zippedXML = data.fields[-1][-1][-2]
    # decompress to do the comparison:
    # get the file with the zipped xml as content
    zipFileObject = sio(zippedXML)
    # get the actual file (from which we read)
    with gzip.GzipFile(fileobj=zipFileObject, mode='r') as zipFile:
        received_xml = zipFile.read()
    zipFileObject.close()
    self.assertEquals(expected_xml, received_xml)
def close(self):
    super(XMLAggregator, self).close()
    str_buffer = sio()
    next_url = self.dataset_split.get('next_url', '')
    # aggregating structure
    writer = structwriter(stream=str_buffer, indent=True)
    feed = writer.feed(
        ROOT(
            E(self.name,
              ({'total_registros': self.total_registros}
               if self.total_registros else {}),
              (
                  E(
                      self.element_name(obj),
                      self.element_atrs(obj),
                      (
                          self.element(obj, atr)
                          for atr in getattr(obj, self.atributo_serializar)
                          if (getattr(obj, atr) or
                              isinstance(getattr(obj, atr), int))
                      )
                  )
                  for obj in self.aggregator
              ),
              E('proximos', {'href': next_url}) if next_url else tuple(),
              )
        )
    )
    r = str_buffer.getvalue()
    str_buffer.close()
    self.serialization = r
#!/usr/bin/env python
import messytables_jts
import messytables
from StringIO import StringIO as sio

ts = messytables.CSVTableSet.from_fileobj(
    sio('''name,dob\nmk,2012-01-02\n'''))
rs = ts.tables[0]
print messytables_jts.rowset_as_schema(rs).as_json()
def object_to_xml(obj):
    # (fragment: do_list is defined earlier in the original and not shown
    # in this excerpt)
    def do_dict(node, di):
        for i, j in di.iteritems():
            item = dom.createElement(str(i))
            if isinstance(j, (list, tuple)):
                do_list(item, j)
            elif isinstance(j, dict):
                do_dict(item, j)
            else:
                do_else(item, j)
            node.appendChild(item)

    def do_else(node, obj):
        node.appendChild(dom.createTextNode(unicode(obj)))
        attr = ''
        if isinstance(obj, (unicode, str)):
            attr = 'str'
        elif isinstance(obj, bool):
            attr = 'bool'
        elif isinstance(obj, (int, long)):
            attr = 'int'
        node.setAttribute('type', attr)

    impl = minidom.getDOMImplementation()
    dom = impl.createDocument(None, "_BASE", None)
    result = dom.documentElement
    do_dict(result, obj)
    return result


json_to_object = lambda j: json.load(sio(j))
object_to_json = lambda obj: json.dumps(obj)
json_to_xml = lambda j: object_to_xml(json_to_object(j))
xml_to_json = lambda xml: object_to_json(xml_to_object(xml))
def read_csv(self, file_path, use_whole_file=False, names=None, skiprows=0,
             *args, **kwargs):
    """Read a CSV file in and parse it into Pandas DataFrames.

    By default, the first row from the first partition of that data is
    parsed and used as the column names for the data frame. If no 'names'
    param is provided we parse the first row of the first partition of
    data and use it for column names.

    Parameters
    ----------
    file_path: string
        Path to input. Any valid file path in Spark works here, e.g.:
        'file:///my/path/in/local/file/system' or 'hdfs:/user/juliet/'
    use_whole_file: boolean
        Whether or not to use the whole file.
    names: list of strings, optional
    skiprows: integer, optional
        Indicates how many rows of input to skip. This will only be
        applied to the first partition of the data (so if
        #skiprows > #rows in first partition this will not work). Generally
        this shouldn't be an issue for small values of skiprows.
        No other value of header is supported.

    All additional parameters available in pandas.read_csv() are usable
    here.

    Returns
    -------
    A SparklingPandas DataFrame that contains the data from the
    specified file.
    """
    def csv_file(partition_number, files):
        # pylint: disable=unexpected-keyword-arg
        file_count = 0
        for _, contents in files:
            # Only skip lines on the first file
            if partition_number == 0 and file_count == 0 and _skiprows > 0:
                yield pandas.read_csv(
                    sio(contents), *args, header=None, names=mynames,
                    skiprows=_skiprows, **kwargs)
            else:
                yield pandas.read_csv(
                    sio(contents), *args, header=None, names=mynames,
                    **kwargs)
            file_count += 1

    def csv_rows(partition_number, rows):
        # pylint: disable=unexpected-keyword-arg
        in_str = "\n".join(rows)
        if partition_number == 0:
            return iter([
                pandas.read_csv(
                    sio(in_str), *args, header=None, names=mynames,
                    skiprows=_skiprows, **kwargs)])
        else:
            # could use .iterrows() instead?
            return iter([pandas.read_csv(sio(in_str), *args, header=None,
                                         names=mynames, **kwargs)])

    # If we need to peek at the first partition and determine the column
    # names
    mynames = None
    _skiprows = skiprows
    if names:
        mynames = names
    else:
        # In the future we could avoid this expensive call.
        first_line = self.spark_ctx.textFile(file_path).first()
        frame = pandas.read_csv(sio(first_line), **kwargs)
        # pylint sees frame as a tuple despite it being a DataFrame
        mynames = list(frame.columns)
        _skiprows += 1

    # Do the actual load
    if use_whole_file:
        return self.from_pandas_rdd(
            self.spark_ctx.wholeTextFiles(file_path)
            .mapPartitionsWithIndex(csv_file))
    else:
        return self.from_pandas_rdd(
            self.spark_ctx.textFile(file_path)
            .mapPartitionsWithIndex(csv_rows))
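# A runnable sketch of the header handling inside read_csv above: parse the
# first line alone to learn the column names (as read_csv does with the
# first record of the RDD), then re-read the chunk with header=None,
# names=..., skiprows=1 so the header line is not treated as data. The
# sample chunk is illustrative.
import pandas
from StringIO import StringIO as sio

chunk = "name,dob\nmk,2012-01-02\n"
first_line = chunk.split("\n")[0]
mynames = list(pandas.read_csv(sio(first_line + "\n")).columns)
frame = pandas.read_csv(sio(chunk), header=None, names=mynames, skiprows=1)
print frame  # one row: mk, 2012-01-02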
def json_file_to_df(files):
    """Transforms each JSON file into a pandas DataFrame"""
    for _, contents in files:
        yield pandas.read_json(sio(contents), *args, **kwargs)
def test_low_level_dot_read_path2(self):
    if not has_dot_library():
        self.skipTest("DOT library not installed. Can't test DOT I/O")
    G = nx.Graph(find_read_dot()(sio(dot_path2)))
def read_csv(self, name, use_whole_file=False, names=None, skiprows=0,
             *args, **kwargs):
    """Read a CSV file in and parse it into Pandas DataFrames.

    If no names is provided we use the first row for the names.
    header=0 is the default unless names is provided, in which case
    header=None is the default. skiprows indicates how many rows of input
    to skip. This will only be applied to the first partition of the data
    (so if #skiprows > #rows in first partition this will not work).
    Generally this shouldn't be an issue for small values of skiprows.
    No other value of header is supported.
    All additional parameters are passed to the read_csv function.
    """
    def csv_file(partitionNumber, files):
        file_count = 0
        for filename, contents in files:
            # Only skip lines on the first file
            if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                yield pandas.read_csv(sio(contents), *args, header=None,
                                      names=mynames, skiprows=_skiprows,
                                      **kwargs)
            else:
                yield pandas.read_csv(sio(contents), *args, header=None,
                                      names=mynames, **kwargs)
            file_count += 1

    def csv_rows(partitionNumber, rows):
        inputStr = "\n".join(rows)
        if partitionNumber == 0:
            return iter([pandas.read_csv(sio(inputStr), *args, header=None,
                                         names=mynames, skiprows=_skiprows,
                                         **kwargs)])
        else:
            # could use .iterrows() instead?
            return iter([pandas.read_csv(sio(inputStr), *args, header=None,
                                         names=mynames, **kwargs)])

    # If we need to peek at the first partition and determine the column
    # names
    mynames = None
    _skiprows = skiprows
    if names:
        mynames = names
    else:
        # In the future we could avoid this expensive call.
        first_line = self.sc.textFile(name).first()
        frame = pandas.read_csv(sio(first_line), **kwargs)
        mynames = list(frame.columns.values)
        _skiprows += 1

    # Do the actual load
    if use_whole_file:
        return PRDD.fromRDD(
            self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
    else:
        return PRDD.fromRDD(
            self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))
def test_readGraph_dimacs_path2(self):
    self.assertRaises(ValueError, readGraph, sio(dimacs_path2),
                      graph_type="simple")
    G = readGraph(sio(dimacs_path2), graph_type="simple",
                  file_format="dimacs")
    self.assertEqual(G.order(), 3)
    self.assertEqual(len(G.edges()), 2)
def object_to_xml(obj):
    # (fragment: do_list is defined earlier in the original and not shown
    # in this excerpt)
    def do_dict(node, di):
        for i, j in di.iteritems():
            item = dom.createElement(str(i))
            if isinstance(j, (list, tuple)):
                do_list(item, j)
            elif isinstance(j, dict):
                do_dict(item, j)
            else:
                do_else(item, j)
            node.appendChild(item)

    def do_else(node, obj):
        node.appendChild(dom.createTextNode(unicode(obj)))
        attr = ''
        if isinstance(obj, (unicode, str)):
            attr = 'str'
        elif isinstance(obj, bool):
            attr = 'bool'
        elif isinstance(obj, (int, long)):
            attr = 'int'
        node.setAttribute('type', attr)

    impl = minidom.getDOMImplementation()
    dom = impl.createDocument(None, "_BASE", None)
    result = dom.documentElement
    do_dict(result, obj)
    return result


json_to_object = lambda j: json.load(sio(j))
object_to_json = lambda obj: json.dumps(obj)
json_to_xml = lambda j: object_to_xml(json_to_object(j))
xml_to_json = lambda xml: object_to_json(xml_to_object(xml))
def read_csv(self, name, use_whole_file=False, names=None, skiprows=0,
             *args, **kwargs):
    """Read a CSV file in and parse it into Pandas DataFrames.

    If no names is provided we use the first row for the names.
    header=0 is the default unless names is provided, in which case
    header=None is the default. skiprows indicates how many rows of input
    to skip. This will only be applied to the first partition of the data
    (so if #skiprows > #rows in first partition this will not work).
    Generally this shouldn't be an issue for small values of skiprows.
    No other value of header is supported.
    All additional parameters are passed to the read_csv function.
    """
    def csv_file(partitionNumber, files):
        file_count = 0
        for filename, contents in files:
            # Only skip lines on the first file
            if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                yield pandas.read_csv(sio(contents), *args, header=None,
                                      names=mynames, skiprows=_skiprows,
                                      **kwargs)
            else:
                yield pandas.read_csv(sio(contents), *args, header=None,
                                      names=mynames, **kwargs)
            file_count += 1

    def csv_rows(partitionNumber, rows):
        inputStr = "\n".join(rows)
        if partitionNumber == 0:
            return iter([
                pandas.read_csv(sio(inputStr), *args, header=None,
                                names=mynames, skiprows=_skiprows, **kwargs)
            ])
        else:
            # could use .iterrows() instead?
            return iter([
                pandas.read_csv(sio(inputStr), *args, header=None,
                                names=mynames, **kwargs)
            ])

    # If we need to peek at the first partition and determine the column
    # names
    mynames = None
    _skiprows = skiprows
    if names:
        mynames = names
    else:
        # In the future we could avoid this expensive call.
        first_line = self.sc.textFile(name).first()
        frame = pandas.read_csv(sio(first_line), **kwargs)
        mynames = list(frame.columns.values)
        _skiprows += 1

    # Do the actual load
    if use_whole_file:
        return PRDD.fromRDD(
            self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
    else:
        return PRDD.fromRDD(
            self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))
def json_file(partitionNumber, files):
    for filename, contents in files:
        yield pandas.read_json(sio(contents), *args, **kwargs)
def __init__(self, s):
    self.fd = sio(s)
    self.last = None
def parse_state_xml(state_xml_files):
    # TODO: Make this work for both primary and general elections.
    # Primaries have two files, presidential and district:
    # urls = ('X12DP', 'X12PP')
    """Parse the state XML and insert it into the database"""
    errmsg = "Not all file paths are available. Did you pass in a list?"
    assert all(os.path.exists(path) for path in state_xml_files), errmsg

    affiliations = {'Democratic': 'DEM',
                    'Republican': 'REP',
                    'American Independent': 'AIP',
                    'Green': 'GRN',
                    'Libertarian': 'LIB',
                    'Peace and Freedom': 'P-F',
                    'Independent': 'IND',
                    'Non-Partisan': '',
                    'No Party Preference': ''}

    for xml_file in state_xml_files:
        # I was having encoding problems, so I introduced the chunk below.
        # Mileage may vary and this may not be necessary in all cases.
        with open(xml_file) as f:
            raw_text = f.read()
        xml_file = sio(raw_text.decode('cp1252').encode('utf8'))
        tree = ET.parse(xml_file)
        root = tree.getroot()
        if root is not None:
            contests = root.find('Count').find('Election').find(
                'Contests').getiterator('Contest')
            for contest in contests:
                contest_params = {}
                candidate_params = {}
                # Get the CountMetric data
                cm_list = contest.find('TotalVotes').findall('CountMetric')
                count_metrics = {cm.attrib['Id']: cm.text for cm in cm_list}
                # See if the contest already exists
                try:
                    c = StateContest.objects.get(
                        contest_identifier=contest.find(
                            'ContestIdentifier').attrib['Id'])
                    # print "Updating existing contest %s" % contest.find('ContestIdentifier').attrib['Id']
                    c.name = contest.find('ContestIdentifier').find(
                        'ContestName').text
                    c.precincts_reporting = count_metrics.get('PR', 0)
                    c.total_precincts = count_metrics.get('TP', 0)
                    c.pct_yes_votes = count_metrics.get('PYV', 0)
                    c.pct_no_votes = count_metrics.get('PNV', 0)
                    c.save()
                except StateContest.DoesNotExist:
                    # print "Creating new contest %s" % contest.find('ContestIdentifier').attrib['Id']
                    contest_params['contest_identifier'] = contest.find(
                        'ContestIdentifier').attrib['Id']
                    contest_params['contest_name'] = contest.find(
                        'ContestIdentifier').find('ContestName').text
                    contest_params['precincts_reporting'] = \
                        count_metrics.get('PR', 0)
                    contest_params['total_precincts'] = \
                        count_metrics.get('TP', 0)
                    contest_params['pct_yes_votes'] = \
                        count_metrics.get('PYV', 0)
                    contest_params['pct_no_votes'] = \
                        count_metrics.get('PNV', 0)
                    c = StateContest(**contest_params)
                    c.save()
                contest_id = c.id
                candidate_params['state_contest_id'] = contest_id

                for selection in contest.find('TotalVotes').findall(
                        'Selection'):
                    sel_count_metrics = {}
                    sel_cm_list = selection.findall('CountMetric')
                    for sel_cm in sel_cm_list:
                        sel_count_metrics[sel_cm.attrib['Id']] = sel_cm.text
                    try:
                        candidate_identifier = selection.find(
                            'Candidate').find(
                            'CandidateIdentifier').attrib['Id']
                    except (AttributeError, KeyError):
                        candidate_identifier = 0
                    try:
                        proposal_identifier = selection.find(
                            'Candidate').find(
                            'ProposalItem').attrib['ProposalIdentifier']
                    except (AttributeError, KeyError):
                        proposal_identifier = ''
                    try:
                        referendum_option_identifier = selection.find(
                            'Candidate').find('ProposalItem').attrib[
                            'ReferendumOptionIdentifier']
                    except (AttributeError, KeyError):
                        referendum_option_identifier = ''
                    try:
                        sel = StateCandidate.objects.get(
                            (Q(candidate_identifier=candidate_identifier) &
                             Q(state_contest__contest_identifier=contest.find(
                                 'ContestIdentifier').attrib['Id'])) |
                            (Q(proposal_identifier=proposal_identifier) &
                             Q(referendum_option_identifier=
                               referendum_option_identifier)))
                        if candidate_identifier != 0:
                            # print "Updating existing candidate %s" % candidate_identifier
                            # This is a candidate
                            sel.candidate_name = selection.find(
                                'Candidate').find('CandidateIdentifier').find(
                                'CandidateName').text
                            sel.candidate_identifier = selection.find(
                                'Candidate').find(
                                'CandidateIdentifier').attrib['Id']
                            affiliation = selection.find('Candidate').find(
                                'Affiliation').find('Type').text
                            sel.affiliation = affiliations[affiliation]
                            sel.valid_votes = selection.find(
                                'ValidVotes').text
                            sel.pct_votes_party = \
                                sel_count_metrics.get('PVP', 0)
                            sel.pct_votes_race = \
                                sel_count_metrics.get('PVR', 0)
                        elif proposal_identifier != 0:
                            # This is a proposal or judge
                            if selection.find('Candidate').find(
                                    'ProposalItem').attrib[
                                    'ReferendumOptionIdentifier'] == 'Yes':
                                sel.referendum_option_identifier = 'Yes'
                                sel.valid_votes = selection.find(
                                    'ValidVotes').text
                                sel.pct_votes_race = \
                                    count_metrics.get('PYV', 0)
                            else:
                                sel.referendum_option_identifier = 'No'
                                sel.valid_votes = selection.find(
                                    'ValidVotes').text
                                sel.pct_votes_race = \
                                    count_metrics.get('PNV', 0)
                        sel.save()
                    except StateCandidate.DoesNotExist:
                        # print "Adding new candidate %s" % candidate_identifier
                        if candidate_identifier != 0:
                            candidate_params['candidate_name'] = \
                                selection.find('Candidate').find(
                                    'CandidateIdentifier').find(
                                    'CandidateName').text
                            candidate_params['candidate_identifier'] = \
                                selection.find('Candidate').find(
                                    'CandidateIdentifier').attrib['Id']
                            affiliation = selection.find('Candidate').find(
                                'Affiliation').find('Type').text
                            candidate_params['affiliation'] = \
                                affiliations[affiliation]
                            candidate_params['valid_votes'] = \
                                selection.find('ValidVotes').text
                            candidate_params['pct_votes_party'] = \
                                sel_count_metrics.get('PVP', 0)
                            candidate_params['pct_votes_race'] = \
                                sel_count_metrics.get('PVR', 0)
                        else:
                            candidate_params['proposal_identifier'] = \
                                selection.find('Candidate').find(
                                    'ProposalItem').attrib[
                                    'ProposalIdentifier']
                            if selection.find('Candidate').find(
                                    'ProposalItem').attrib[
                                    'ReferendumOptionIdentifier'] == 'Yes':
                                candidate_params[
                                    'referendum_option_identifier'] = 'Yes'
                                candidate_params['valid_votes'] = \
                                    selection.find('ValidVotes').text
                            else:
                                candidate_params[
                                    'referendum_option_identifier'] = 'No'
                                candidate_params['valid_votes'] = \
                                    selection.find('ValidVotes').text
                        sel = StateCandidate(**candidate_params)
                        # sel.StateContest = c
                        sel.save()
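# A standalone sketch of the cp1252 -> utf-8 re-encoding workaround used in
# parse_state_xml above; the byte string is illustrative.
from StringIO import StringIO as sio

raw_text = 'Pe\xf1a'  # cp1252 bytes for "Pena" with an n-tilde
utf8_file = sio(raw_text.decode('cp1252').encode('utf8'))
print utf8_file.read()  # the same text, now utf-8 encoded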