def test_flush(tmpdir):
    fp = str(tmpdir.mkdir('test').join('data.json'))
    with nlj.open(fp, 'w') as dst:
        dst.write({'field1': None})
        dst.flush()
        # Read while the writer is still open - flush() must have pushed
        # the line to disk
        with nlj.open(fp) as src:
            assert next(src) == {'field1': None}

def test_encode_json_strings(tmpdir):
    """Ensure that JSON values are preserved between NLJ and CSV."""
    infile = str(tmpdir.mkdir('test-in').join('in.json'))
    outfile = str(tmpdir.mkdir('test-out').join('out.json'))
    roundtrip_file = str(tmpdir.mkdir('test-roundtrip').join('roundtrip.json'))

    # Write NLJ where a value is a dictionary to a file and convert to a CSV
    expected = {
        'field1': 'value',
        'field2': {'key': 'val'}
    }
    with nlj.open(infile, 'w') as dst:
        dst.write(expected)
    result = CliRunner().invoke(main, [
        'nlj2csv', infile, outfile
    ])
    assert result.exit_code == 0

    # Convert the CSV from the previous step back to NLJ
    result = CliRunner().invoke(main, [
        'csv2nlj', outfile, roundtrip_file
    ])
    assert result.exit_code == 0
    with nlj.open(roundtrip_file) as src:
        actual = next(src)

    # Compare JSON -> JSON
    assert expected == actual

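# The round trip above can only work if the converter serializes non-scalar
# values as JSON strings inside a CSV cell and decodes them on the way back.
# The sketch below shows that idea only; it is not newlinejson's actual
# converter code, and encode_row/decode_field are hypothetical helpers.
import json

def encode_row(row):
    # Dicts and lists cannot live in a CSV cell directly, so dump them to
    # JSON strings; scalar values pass through untouched
    return {k: json.dumps(v) if isinstance(v, (dict, list)) else v
            for k, v in row.items()}

def decode_field(value):
    # Try to decode a cell back into a JSON value; plain text like 'value'
    # is not valid JSON, so fall back to the raw string
    try:
        return json.loads(value)
    except ValueError:
        return value
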
def test_open_bad_mode(dicts_path):
    # These trigger errors in slightly different but very related lines
    with pytest.raises(ValueError):
        with nlj.open(dicts_path, 'bad-mode') as src:
            pass
    with pytest.raises(ValueError):
        with nlj.open(dicts_path, 'rb') as src:
            pass

def test_write():
    expected = {'line': 'val'}
    with tempfile.NamedTemporaryFile(mode='r+') as f:
        with nlj.open(f.name, 'w') as dst:
            dst.write(expected)
        f.seek(0)
        with nlj.open(f.name) as src:
            assert next(src) == expected

def test_csv2nlj(tmpdir, compare_iter, dicts_csv_path, dicts_path):
    outfile = str(tmpdir.mkdir('test').join('out.json'))
    result = CliRunner().invoke(main, [
        'csv2nlj', dicts_csv_path, outfile
    ])
    assert result.exit_code == 0
    with nlj.open(dicts_path) as expected:
        with nlj.open(outfile) as actual:
            compare_iter(expected, actual)

def test_skip_failures_write(dicts_path):
    with nlj.open(dicts_path) as src:
        with nlj.open(tempfile.NamedTemporaryFile(mode='w'), 'w',
                      skip_failures=True) as dst:
            dst.write(next(src))
            dst.write(next(src))
            # A module is not JSON serializable, but skip_failures=True
            # swallows the error
            dst.write(nlj)
            for line in src:
                dst.write(line)

def test_read_write_exception():
    # Write a non-JSON serializable object
    with nlj.open(tempfile.NamedTemporaryFile(mode='w'), 'w') as src:
        with pytest.raises(TypeError):
            src.write(tuple)

    # Read malformed JSON
    with nlj.open(tempfile.NamedTemporaryFile(mode='r+')) as src:
        src._stream.write('{')
        src._stream.seek(0)
        with pytest.raises((TypeError, ValueError)):
            next(src)

def test_csv2nlj_nulls(tmpdir, compare_iter, dicts_csv_with_null_path,
                       dicts_with_null_path):
    """Empty CSV fields should be None when converted to JSON to avoid
    empty strings.
    """
    outfile = str(tmpdir.mkdir('test').join('out.json'))
    result = CliRunner().invoke(main, [
        'csv2nlj', dicts_csv_with_null_path, outfile
    ])
    assert result.exit_code == 0
    with nlj.open(dicts_with_null_path) as expected:
        with nlj.open(outfile) as actual:
            compare_iter(expected, actual)

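# One plausible implementation of the rule this test describes: CSV has no
# null, so csv.DictReader yields '' for empty cells, and the converter needs
# to map those back to None.  nullify() is a hypothetical helper, not the
# CLI's actual code.
def nullify(row):
    return dict((k, None if v == '' else v) for k, v in row.items())
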
def find_accounts_to_create():
    with nlj.open('data/outputs/accounts.json') as src:
        account_ids = []
        for line in src:
            account_ids.append(line['account_name'])
    with nlj.open('data/outputs/contacts.json') as src:
        contact_account_ids = []
        for line in src:
            if line['account_name'] not in account_ids:
                contact_account_ids.append(line['account_name'])
    # accounts_needed = [x for x in contact_account_ids if x not in account_ids]
    return contact_account_ids

def test_write_num_failures():
    with tempfile.NamedTemporaryFile(mode='r+') as f:
        with nlj.open(f.name, 'w', skip_failures=True) as src:
            assert src.num_failures == 0
            # Modules and streams are not JSON serializable, so each
            # write counts as one failure
            src.write(json)
            src.write(src)
            assert src.num_failures == 2

def main():
    args = docopt.docopt(__doc__)
    with nlj.open(sys.stdin) as src:
        cols = next(src)
        print(tabulate(src, tablefmt=args['--fmt'], headers=cols))

def test_attributes(dicts_path):
    with nlj.open(dicts_path) as src:
        assert src.num_failures == 0
        assert src.mode == 'r'
        assert not src.closed
        assert src.name == dicts_path
        assert 'open' in repr(src) and 'r' in repr(src)
    # Once the context manager exits, the stream reports itself as closed
    assert 'closed' in repr(src)

def find_not_converted():
    with nlj.open('data/outputs/contacts.json') as src:
        lead_id_from_contact = []
        for line in src:
            if 'converted_lead_id' in line:
                lead_id_from_contact.append(line['converted_lead_id'])
    with nlj.open('data/outputs/leads.json') as src:
        lead_ids = []
        for line in src:
            if line['id'] not in lead_id_from_contact:
                lead_ids.append(line['id'])
    # not_converted = [x for x in lead_ids if x not in lead_id_from_contact]
    return lead_ids

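# find_accounts_to_create() and find_not_converted() above both test
# membership against a plain list, which is a linear scan per row.  A minimal
# alternative sketch under the same data layout: collect the known ids into a
# set for O(1) lookups.  ids_missing_from() is a hypothetical helper, not
# part of the original scripts.
def ids_missing_from(candidates_path, candidate_key, known_path, known_key):
    with nlj.open(known_path) as src:
        known = set(line[known_key] for line in src if known_key in line)
    with nlj.open(candidates_path) as src:
        return [line[candidate_key] for line in src
                if line[candidate_key] not in known]
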
def test_nlj2csv(tmpdir, dicts_path, compare_iter):
    outfile = str(tmpdir.mkdir('test').join('out.csv'))
    result = CliRunner().invoke(main, [
        'nlj2csv', dicts_path, outfile
    ])
    assert result.exit_code == 0
    with nlj.open(dicts_path) as expected:
        with open(outfile) as actual:
            compare_iter(expected, csv.DictReader(actual))

def test_read_num_failures():
    with tempfile.NamedTemporaryFile(mode='r+') as f:
        # Two lines, each containing malformed JSON
        f.write('{' + os.linesep + ']')
        f.seek(0)
        with nlj.open(f.name, skip_failures=True) as src:
            assert src.num_failures == 0
            for row in src:
                pass
            assert src.num_failures == 2

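# A minimal usage sketch of the counter tested above, assuming a file of
# possibly malformed NLJ ('dirty.json' is a hypothetical path):
# skip_failures=True lets the reader survive bad lines, and num_failures
# reports how many were dropped.
with nlj.open('dirty.json', skip_failures=True) as src:
    rows = list(src)
print('kept %d rows, dropped %d bad lines' % (len(rows), src.num_failures))
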
def test_Pipeline_parts(self, test_data_dir, temp_dir):
    source = pp.join(test_data_dir, 'input.json')
    messages_sink = pp.join(temp_dir, 'messages')
    segments_sink = pp.join(temp_dir, 'segments')
    expected_messages = pp.join(test_data_dir, 'expected_messages.json')
    expected_segments = pp.join(test_data_dir, 'expected_segments.json')

    with _TestPipeline() as p:
        messages = (
            p
            | beam.io.ReadFromText(file_pattern=source, coder=JSONDictCoder())
            | "MessagesAddKey" >> beam.Map(SegmentPipeline.groupby_fn)
            | "MessagesGroupByKey" >> beam.GroupByKey())
        segments = p | beam.Create([])
        segmented = messages | Segment(segments)

        messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
        (messages
         | "WriteToMessagesSink" >> beam.io.WriteToText(
             file_path_prefix=messages_sink,
             num_shards=1,
             coder=JSONDictCoder()))

        segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
        (segments
         | "WriteToSegmentsSink" >> beam.io.WriteToText(
             file_path_prefix=segments_sink,
             num_shards=1,
             coder=JSONDictCoder()))

        p.run()

    with nlj.open(expected_messages) as expected:
        with open_shards('%s*' % messages_sink) as output:
            assert sorted(expected) == sorted(nlj.load(output))

    with nlj.open(expected_segments) as expected_output:
        with open_shards('%s*' % segments_sink) as actual_output:
            for expected, actual in zip(
                    sorted(expected_output, key=lambda x: x['seg_id']),
                    sorted(nlj.load(actual_output),
                           key=lambda x: x['seg_id'])):
                assert set(expected.items()).issubset(set(actual.items()))

def main():
    stdin = sys.stdin
    stdout = sys.stdout
    with nlj.open(stdin) as src:
        with nlj.open(stdout, 'w') as dst:
            dst.write(['M', 'B', 'mean', 'median', 'stdev'])
            next(src)  # skip the input header row
            for line in src:
                M, B, R = line
                R.sort()
                mu = mean(R)
                dst.write([M, B, mu, median(R), pstdev(R, mu=mu)])

def main():
    args = docopt.docopt(__doc__)
    start = int(args['--start'])
    times = int(args['--times'])
    with nlj.open(sys.stdout, 'w') as dst:
        dst.write(['N', 'sum', 'prob'])
        for row in simulate(times):
            n, x, p = row
            if n < start:
                continue
            # Fractions are not JSON serializable, so write the
            # numerator/denominator pair instead
            dst.write([n, x, [p.numerator, p.denominator]])

def test_nlj2csv_failure(tmpdir):
    infile = str(tmpdir.mkdir('test-in').join('in.json'))
    outfile = str(tmpdir.mkdir('test-out').join('out.json'))
    with nlj.open(infile, 'w') as dst:
        dst.write({'field1': 'value'})
        # 'field2' is not in the CSV header derived from the first record
        dst.write({'field2': 'uh-oh'})
    result = CliRunner().invoke(main, [
        'nlj2csv', infile, outfile
    ])
    assert result.exit_code != 0

def test_dumps(dicts_path, compare_iter):
    with open(dicts_path) as f:
        expected = f.read()
    with nlj.open(dicts_path) as src:
        actual = nlj.dumps(src)
    for obj in (expected, actual):
        assert isinstance(obj, six.string_types)
    compare_iter(nlj.loads(expected), nlj.loads(actual))

def main():
    args = docopt.docopt(__doc__)
    times = int(args['--times'])
    state = None
    with nlj.open(sys.stdout, 'w') as dst:
        dst.write(['N', 'sum', 'prob'])
        for n in range(1, times + 1):
            state = dice(state)
            for num, prob in state.items():
                dst.write([n, num, (prob.numerator, prob.denominator)])

def test_io_clash(dicts_path):
    # Trying to read from a stream that is opened in write mode
    with pytest.raises(TypeError):
        with nlj.open(tempfile.NamedTemporaryFile(mode='w'), 'w') as src:
            next(src)

    # Trying to read from a closed stream
    with nlj.open(dicts_path) as src:
        pass
    with pytest.raises(ValueError):
        next(src)

    # Trying to write to a stream opened in read mode
    with nlj.open(tempfile.NamedTemporaryFile(mode='w')) as dst:
        with pytest.raises(AttributeError):
            dst.write([])

    # Trying to write to a closed stream
    with nlj.open(tempfile.NamedTemporaryFile(mode='w'), 'w') as dst:
        pass
    with pytest.raises(ValueError):
        dst.write([])

def main():
    args = docopt(__doc__, version='0.1')
    mrange = parse_step(args['-M'])
    brange = parse_step(args['-B'])
    times = int(args['-t'])
    debug = args['--debug']
    stderr = sys.stderr
    with nlj.open(sys.stdout, 'w') as dst:
        dst.write(['M', 'B', 'results'])
        with ProcessPoolExecutor() as exe:
            for M, B, R, dt in run(mrange, brange, exe, times):
                if debug:
                    stderr.write('M=%d, B=%d, t=%f\n' % (M, B, dt))
                    stderr.flush()
                dst.write([M, B, R])

def _run_pipeline(self, source, messages_sink, segments_sink, expected,
                  args=None):
    # Use None instead of a mutable default: 'args += [...]' extends the
    # list in place, so a shared default list would grow across calls
    args = list(args or [])
    args += [
        '--source=%s' % source,
        '--source_schema={"fields": []}',
        '--dest=%s' % messages_sink,
        '--segments=%s' % segments_sink,
        '--wait',
    ]
    pipe_segment_run(args)
    with nlj.open(expected) as expected:
        with open_shards('%s*' % messages_sink) as output:
            assert sorted(expected) == sorted(nlj.load(output))

def load_inferred(inference_path, extractors, whitelist):
    """Load inferred data and generate comparison data."""
    with gzip.GzipFile(inference_path) as f:
        with nlj.open(f, json_lib='ujson') as src:
            for row in src:
                if whitelist is not None and row['mmsi'] not in whitelist:
                    continue
                # Parsing dates is expensive and all extractors use dates,
                # so parse them once up front
                row['start_time'] = _parse(row['start_time'])  # dateutil.parser.parse(row['start_time'])
                for ext in extractors:
                    ext.extract(row)
    for ext in extractors:
        ext.finalize()

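# load_inferred() assumes each extractor exposes extract(row), called once
# per kept row, and finalize(), called once after the whole file is read.
# A minimal hypothetical extractor matching that interface; only the 'mmsi'
# field is guaranteed by load_inferred() itself.
class RowCountExtractor(object):

    def __init__(self):
        self.counts = {}
        self.total = 0

    def extract(self, row):
        # Tally rows per vessel id
        self.counts[row['mmsi']] = self.counts.get(row['mmsi'], 0) + 1

    def finalize(self):
        # Runs once, after every row has been seen
        self.total = sum(self.counts.values())
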
def main():
    args = docopt.docopt(__doc__)
    title = args['--title']
    title = '(%s)' % title if title else ''
    with nlj.open(sys.stdin) as src:
        headers = next(src)[1:]
        data = defaultdict(lambda: tablib.Dataset(headers=headers))
        for row in src:
            N, num, P = row
            data[N].append([num, format_frac(Fraction(*P))])
        content = ''.join(
            TABLE.format(N=N, table=D.html) for N, D in data.items())
        print(TEMPLATE.format(
            title='dice rolls ' + title,
            content=content,
        ))

def convert_leads_by_id(find_id=None):
    if not find_id:
        return
    remaining = len(find_id)
    with nlj.open('data/outputs/leads.json') as src:
        for line in src:
            if line['id'] in find_id:
                convert = sf_lead(
                    account_name=line['account_name'],
                    first_name=line['first_name'],
                    last_name=line['last_name'],
                    timestamp=datetime.datetime.strptime(
                        line['created_at'], "%Y-%m-%d %H:%M:%S"))
                convert.convert_to_contact()
                convert.write_contact()
                remaining -= 1
                find_id.remove(line['id'])
                if remaining == 0:
                    break

def open(self, f, mode='r', **kwargs):
    """
    See `newlinejson.open()`.  The `kwargs` here override the arguments
    from `__init__`.
    """
    if mode not in ('r', 'w'):
        raise ValueError("Mode {} is not supported".format(mode))
    try:
        import newlinejson as nlj
    except ImportError:
        raise ImportError(
            "Please 'pip install newlinejson'. This is an optional "
            "dependency, but is required for the NewlineJSON() serializer.")
    kw = dict(self._kwargs.items(), **kwargs)
    return nlj.open(f, mode=mode, **kw)

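# A minimal usage sketch, assuming the enclosing class is the NewlineJSON()
# serializer named in the error message above and that its __init__ stashes
# default keyword arguments in self._kwargs ('records.json' is a
# hypothetical path):
serializer = NewlineJSON(skip_failures=True)
# Keyword arguments passed here override the defaults from __init__
with serializer.open('records.json', mode='r', skip_failures=False) as src:
    for record in src:
        print(record)
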
def main(filename):
    tbl = defaultdict(dict)
    sys.stdin.readline()  # skip the header line
    for M, B, ticks in nlj.open(sys.stdin, 'r'):
        ticks.sort()
        tbl[B][M] = median(ticks), (max(ticks) - min(ticks)) / 2.0
    with PdfPages(filename) as pdf:
        for i, B in enumerate(sorted(tbl)):
            row = tbl[B]
            plt.title('Bandwidth (B): %d messages' % (B,))
            plt.xlabel('messages per node (M)')
            plt.ylabel('ticks')
            plt.xlim([0, max(row) + 1])
            plt.errorbar(
                list(row.keys()),
                [y for y, _ in row.values()],
                fmt='ko',
                yerr=[err for _, err in row.values()])
            plt.grid(True)
            pdf.savefig()
            plt.close()

def test_nlj2csv_nulls(tmpdir, dicts_with_null_path):
    """Null JSON fields should become empty CSV fields."""
    outfile = str(tmpdir.mkdir('test').join('out.csv'))
    result = CliRunner().invoke(main, [
        'nlj2csv', dicts_with_null_path, outfile
    ])
    assert result.exit_code == 0
    with nlj.open(dicts_with_null_path) as expected:
        with open(outfile) as actual:
            for e, a in zip(expected, csv.DictReader(actual)):
                # Each CSV row should match the JSON row with None -> ""
                assert a == dict(
                    (k, v if v else "") for k, v in six.iteritems(e))

    # Double check that None was not written to a CSV field
    with open(outfile) as f:
        data = f.read()
    assert len(data) > 0
    assert 'None' not in data

def on_data(self, data):
    all_data = json.loads(data)
    tweet = all_data["text"].encode('ascii', errors='ignore')

    # Drop the trailing token and remove stop words before analysis
    tut = tweet.split()[:-1]
    newtut = stopwords.stop(tut)
    words = len(newtut)
    inti = " ".join(newtut)
    senti = "+".join(newtut)
    intent = intention.intention(inti)

    # Classify sentiment through the Sentiment140 HTTP API
    ux = urllib.urlopen(
        'http://www.sentiment140.com/api/classify?text=' + senti +
        '&callback=myJsFunction')
    js = json.loads(ux.read())
    polarity = js['results']['polarity']
    sentimnt = sentiment.sentim(polarity)

    x = {
        'coordinates': all_data['coordinates'],
        'WordCount': words,
        'tweet': tweet,
        'Sentiment': sentimnt,
        'Intent': intent,
    }
    # Write the record and close the stream; note that mode 'w' truncates
    # the file on every call
    with nlj.open("twitterJson.json", "w") as dst:
        dst.write(x)
    print(x)
    exit()  # stop after handling a single tweet

def test_open_no_with_statement(dicts_path):
    s = nlj.open(dicts_path)
    next(s)
    s.close()

def test_stream_invalid_mode(dicts_path):
    with pytest.raises(ValueError):
        with nlj.open(dicts_path, mode='_') as src:
            pass