Example #1
0
def test_flush(tmpdir):
    """A record written before an explicit flush() must be readable back."""
    path = str(tmpdir.mkdir('test').join('data.json'))
    record = {'field1': None}
    with nlj.open(path, 'w') as dst:
        dst.write(record)
        dst.flush()
    with nlj.open(path) as src:
        assert next(src) == record
Example #2
0
def test_encode_json_strings(tmpdir):
    """Ensure that JSON values are preserved between NLJ and CSV."""
    infile = str(tmpdir.mkdir('test-in').join('in.json'))
    outfile = str(tmpdir.mkdir('test-out').join('out.json'))
    roundtrip_file = str(tmpdir.mkdir('test-roundtrip').join('roundtrip.json'))

    # Write NLJ where a value is a dictionary to a file and convert to a CSV
    expected = {
        'field1': 'value',
        'field2': {'key': 'val'}
    }
    with nlj.open(infile, 'w') as dst:
        dst.write(expected)
    result = CliRunner().invoke(main, [
        'nlj2csv', infile, outfile
    ])
    assert result.exit_code == 0

    # Convert the CSV from the previous step back to NLJ
    result = CliRunner().invoke(main, [
        'csv2nlj', outfile, roundtrip_file
    ])
    assert result.exit_code == 0
    with nlj.open(roundtrip_file) as src:
        actual = next(src)

    # Compare JSON -> JSON: the nested dict must survive the round trip
    assert expected == actual
Example #3
0
def test_open_bad_mode(dicts_path):
    """Unsupported mode strings must raise ValueError."""
    # Each mode fails in a slightly different but very related code path.
    for bad_mode in ('bad-mode', 'rb'):
        with pytest.raises(ValueError):
            with nlj.open(dicts_path, bad_mode) as src:
                pass
Example #4
0
def test_write():
    """A record written through nlj should read back unchanged."""
    record = {'line': 'val'}
    with tempfile.NamedTemporaryFile(mode='r+') as f:
        with nlj.open(f.name, 'w') as dst:
            dst.write(record)
        f.seek(0)
        with nlj.open(f.name) as src:
            assert next(src) == record
Example #5
0
def test_csv2nlj(tmpdir, compare_iter, dicts_csv_path, dicts_path):
    """csv2nlj must produce NLJ equivalent to the reference file."""
    converted = str(tmpdir.mkdir('test').join('out.json'))
    result = CliRunner().invoke(main, ['csv2nlj', dicts_csv_path, converted])
    assert result.exit_code == 0
    with nlj.open(dicts_path) as expected:
        with nlj.open(converted) as actual:
            compare_iter(expected, actual)
Example #6
0
def test_skip_failures_write(dicts_path):
    """With skip_failures=True a non-serializable write must not raise."""
    with nlj.open(dicts_path) as src:
        sink = tempfile.NamedTemporaryFile(mode='w')
        with nlj.open(sink, 'w', skip_failures=True) as dst:
            dst.write(next(src))
            dst.write(next(src))
            # A module object is not JSON serializable; it must be skipped.
            dst.write(nlj)
            for record in src:
                dst.write(record)
Example #7
0
def test_read_write_exception():
    """Bad writes raise TypeError; malformed JSON raises on read."""
    # Attempt to serialize something JSON cannot handle.
    with nlj.open(tempfile.NamedTemporaryFile(mode='w'), 'w') as writer:
        with pytest.raises(TypeError):
            writer.write(tuple)
    # Feed the reader a truncated JSON document.
    with nlj.open(tempfile.NamedTemporaryFile(mode='r+')) as reader:
        reader._stream.write('{')
        reader._stream.seek(0)
        with pytest.raises((TypeError, ValueError)):
            next(reader)
Example #8
0
def test_csv2nlj_nulls(tmpdir, compare_iter, dicts_csv_with_null_path, dicts_with_null_path):

    """
    Empty CSV fields should be None when converted to JSON to avoid empty
    strings.
    """

    converted = str(tmpdir.mkdir('test').join('out.json'))
    result = CliRunner().invoke(
        main, ['csv2nlj', dicts_csv_with_null_path, converted])
    assert result.exit_code == 0
    with nlj.open(dicts_with_null_path) as expected:
        with nlj.open(converted) as actual:
            compare_iter(expected, actual)
Example #9
0
def find_accounts_to_create():
    """Return account names referenced by contacts but absent from accounts.

    Reads ``data/outputs/accounts.json`` and ``data/outputs/contacts.json``
    (newline-delimited JSON) and returns the list of ``account_name`` values
    that appear on a contact but not on any account, in file order
    (duplicates preserved, as before).
    """
    # Use a set for O(1) membership tests; the original scanned a list for
    # every contact, which is O(accounts * contacts).
    with nlj.open('data/outputs/accounts.json') as src:
        account_ids = {line['account_name'] for line in src}

    with nlj.open('data/outputs/contacts.json') as src:
        return [line['account_name'] for line in src
                if line['account_name'] not in account_ids]
Example #10
0
def test_write_num_failures():
    """num_failures increments once per failed write when skipping failures."""
    with tempfile.NamedTemporaryFile(mode='r+') as f:
        with nlj.open(f.name, 'w', skip_failures=True) as dst:
            # Use '==' rather than 'is': identity comparison with int
            # literals relies on CPython's small-int cache and emits a
            # SyntaxWarning on Python 3.8+.
            assert dst.num_failures == 0
            # Neither a module nor the stream itself is JSON serializable.
            dst.write(json)
            dst.write(dst)
            assert dst.num_failures == 2
Example #11
0
def main():
    """Render NLJ from stdin as a table, using the first record as headers."""
    args = docopt.docopt(__doc__)
    with nlj.open(sys.stdin) as src:
        header_row = next(src)
        print(tabulate(src, tablefmt=args['--fmt'], headers=header_row))
Example #12
0
def test_attributes(dicts_path):
    """Reader streams expose mode, name, closed state, and failure count."""
    with nlj.open(dicts_path) as src:
        # '==' rather than 'is': integer identity is a CPython caching
        # detail and a SyntaxWarning on Python 3.8+.
        assert src.num_failures == 0
        assert src.mode == 'r'
        assert not src.closed
        assert src.name == dicts_path
        assert 'open' in repr(src) and 'r' in repr(src)
    # After the with-block exits, repr() should reflect the closed state.
    assert 'closed' in repr(src)
Example #13
0
def find_not_converted():
    """Return lead ids that have no corresponding converted contact.

    Reads ``data/outputs/contacts.json`` and ``data/outputs/leads.json``
    (newline-delimited JSON) and returns the ids of leads whose id is not
    recorded as any contact's ``converted_lead_id``, in file order.
    """
    # Collect converted lead ids into a set for O(1) membership tests;
    # the original scanned a list for every lead (O(contacts * leads)).
    with nlj.open('data/outputs/contacts.json') as src:
        converted = {line['converted_lead_id'] for line in src
                     if 'converted_lead_id' in line}

    with nlj.open('data/outputs/leads.json') as src:
        return [line['id'] for line in src if line['id'] not in converted]
Example #14
0
def test_nlj2csv(tmpdir, dicts_path, compare_iter):
    """nlj2csv output should match the source NLJ when read back as CSV."""
    converted = str(tmpdir.mkdir('test').join('out.csv'))
    result = CliRunner().invoke(main, ['nlj2csv', dicts_path, converted])
    assert result.exit_code == 0
    with nlj.open(dicts_path) as expected:
        with open(converted) as actual:
            compare_iter(expected, csv.DictReader(actual))
Example #15
0
def test_read_num_failures():
    """Each malformed input line increments num_failures by one."""
    with tempfile.NamedTemporaryFile(mode='r+') as f:
        # Two lines, both invalid JSON.
        f.write('{' + os.linesep + ']')
        f.seek(0)
        with nlj.open(f.name, skip_failures=True) as src:
            # '==' not 'is': int identity is an implementation detail and
            # a SyntaxWarning on Python 3.8+.
            assert src.num_failures == 0
            for _ in src:
                pass
            assert src.num_failures == 2
Example #16
0
    def test_Pipeline_parts(self, test_data_dir, temp_dir):
        """Run the Segment transform end-to-end and diff against fixtures.

        Messages are keyed, grouped, and segmented, then both output tags
        are written to single-shard text sinks. Message output must equal
        the expected fixture exactly; each expected segment must be a
        subset of the corresponding actual segment (matched by 'seg_id').
        """
        source = pp.join(test_data_dir, 'input.json')
        messages_sink = pp.join(temp_dir, 'messages')
        segments_sink = pp.join(temp_dir, 'segments')
        expected_messages = pp.join(test_data_dir, 'expected_messages.json')
        expected_segments = pp.join(test_data_dir, 'expected_segments.json')

        with _TestPipeline() as p:
            messages = (
                p
                | beam.io.ReadFromText(file_pattern=source,
                                       coder=JSONDictCoder())
                | "MessagesAddKey" >> beam.Map(SegmentPipeline.groupby_fn)
                | "MessagesGroupByKey" >> beam.GroupByKey())
            # Segment receives an empty PCollection of prior segments.
            segments = p | beam.Create([])
            segmented = messages | Segment(segments)

            messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
            (messages
             | "WriteToMessagesSink" >> beam.io.WriteToText(
                 file_path_prefix=messages_sink,
                 num_shards=1,
                 coder=JSONDictCoder()))

            segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
            (segments
             | "WriteToSegmentsSink" >> beam.io.WriteToText(
                 file_path_prefix=segments_sink,
                 num_shards=1,
                 coder=JSONDictCoder()))

            p.run()
            # Messages must match the fixture exactly (order-insensitive).
            with nlj.open(expected_messages) as expected:
                with open_shards('%s*' % messages_sink) as output:
                    assert sorted(expected) == sorted(nlj.load(output))

            # Segments are compared pairwise after sorting by seg_id; every
            # expected key/value must appear in the actual segment.
            with nlj.open(expected_segments) as expected_output:
                with open_shards('%s*' % segments_sink) as actual_output:
                    for expected, actual in zip(
                            sorted(expected_output, key=lambda x: x['seg_id']),
                            sorted(nlj.load(actual_output),
                                   key=lambda x: x['seg_id'])):
                        assert set(expected.items()).issubset(
                            set(actual.items()))
Example #17
0
def main():
    """Read (M, B, results) rows from stdin and emit summary statistics."""
    with nlj.open(sys.stdin) as src:
        with nlj.open(sys.stdout, 'w') as dst:
            dst.write(['M', 'B', 'mean', 'median', 'stdev'])
            next(src)  # discard the header row
            for M, B, samples in src:
                samples.sort()
                avg = mean(samples)
                dst.write([M, B, avg, median(samples),
                           pstdev(samples, mu=avg)])
Example #18
0
def main():
    """Summarize (M, B, results) rows from stdin as mean/median/stdev NLJ."""
    with nlj.open(sys.stdin) as reader, nlj.open(sys.stdout, 'w') as writer:
        writer.write(['M', 'B', 'mean', 'median', 'stdev'])
        next(reader)  # skip header
        for row in reader:
            m_val, b_val, samples = row
            samples.sort()
            center = mean(samples)
            writer.write([
                m_val,
                b_val,
                center,
                median(samples),
                pstdev(samples, mu=center),
            ])
Example #19
0
def main():
    """Write simulated dice-roll probabilities as NLJ to stdout."""
    args = docopt.docopt(__doc__)
    start = int(args['--start'])
    times = int(args['--times'])

    with nlj.open(sys.stdout, 'w') as dst:
        dst.write(['N', 'sum', 'prob'])
        for n, total, prob in simulate(times):
            # Rows below the requested starting N are suppressed.
            if n >= start:
                dst.write([n, total, [prob.numerator, prob.denominator]])
Example #20
0
def main():
    """Emit (N, sum, prob) simulation rows to stdout, skipping N < --start."""
    opts = docopt.docopt(__doc__)
    first = int(opts['--start'])
    rounds = int(opts['--times'])

    with nlj.open(sys.stdout, 'w') as sink:
        sink.write(['N', 'sum', 'prob'])
        for row in simulate(rounds):
            n, total, p = row
            if n < first:
                continue
            sink.write([n, total, [p.numerator, p.denominator]])
Example #21
0
def test_csv2nlj_failure(tmpdir):
    """Records with mismatched fields cannot be converted to CSV."""
    infile = str(tmpdir.mkdir('test-in').join('in.json'))
    outfile = str(tmpdir.mkdir('test-out').join('out.json'))

    # The two records have different keys, which nlj2csv cannot handle.
    with nlj.open(infile, 'w') as dst:
        for record in ({'field1': 'value'}, {'field2': 'uh-oh'}):
            dst.write(record)

    result = CliRunner().invoke(main, ['nlj2csv', infile, outfile])
    assert result.exit_code != 0
Example #22
0
def test_dumps(dicts_path, compare_iter):
    """nlj.dumps() output should round-trip against the raw file contents."""
    with open(dicts_path) as f:
        raw = f.read()

    with nlj.open(dicts_path) as src:
        dumped = nlj.dumps(src)

    # Both are plain strings on either Python major version.
    assert isinstance(raw, six.string_types)
    assert isinstance(dumped, six.string_types)

    compare_iter(nlj.loads(raw), nlj.loads(dumped))
Example #23
0
def main():
    """Stream cumulative dice-roll probabilities to stdout as NLJ."""
    args = docopt.docopt(__doc__)
    times = int(args['--times'])
    state = None

    with nlj.open(sys.stdout, 'w') as dst:
        dst.write(['N', 'sum', 'prob'])
        for n in range(1, times + 1):
            # Advance the distribution by one die and emit every outcome.
            state = dice(state)
            for total, prob in state.items():
                dst.write([n, total, (prob.numerator, prob.denominator)])
Example #24
0
def test_io_clash(dicts_path):

    """
    Reads and writes against streams in the wrong state (wrong mode or
    already closed) must raise the appropriate exception type.
    """

    # Trying to read from a stream that is opened in write mode
    with pytest.raises(TypeError):
        with nlj.open(tempfile.NamedTemporaryFile(mode='w'), 'w') as src:
            next(src)

    # Trying to read from a closed stream (closed by the with-block exit)
    with nlj.open(dicts_path) as src:
        pass
    with pytest.raises(ValueError):
        next(src)

    # Trying to write to a stream opened in read mode
    with nlj.open(tempfile.NamedTemporaryFile(mode='w')) as dst:
        with pytest.raises(AttributeError):
            dst.write([])

    # Trying to write to a closed stream
    with nlj.open(tempfile.NamedTemporaryFile(mode='w'), 'w') as dst:
        pass
    with pytest.raises(ValueError):
        dst.write([])
Example #25
0
def main():
    """Run the (M, B) benchmark grid in parallel, writing NLJ to stdout."""
    opts = docopt(__doc__, version='0.1')
    m_steps = parse_step(opts['-M'])
    b_steps = parse_step(opts['-B'])
    repeats = int(opts['-t'])
    verbose = opts['--debug']
    err = sys.stderr

    with nlj.open(sys.stdout, 'w') as sink:
        sink.write(['M', 'B', 'results'])
        with ProcessPoolExecutor() as pool:
            for m, b, results, elapsed in run(m_steps, b_steps, pool, repeats):
                if verbose:
                    # Diagnostics go to stderr so stdout stays valid NLJ.
                    err.write('M=%d, B=%d, t=%f\n' % (m, b, elapsed))
                    err.flush()
                sink.write([m, b, results])
Example #26
0
def main():
    """Benchmark every (M, B) combination in parallel and emit NLJ rows."""
    cli = docopt(__doc__, version='0.1')
    m_range = parse_step(cli['-M'])
    b_range = parse_step(cli['-B'])
    trial_count = int(cli['-t'])
    show_timing = cli['--debug']
    log = sys.stderr

    with nlj.open(sys.stdout, 'w') as out:
        out.write(['M', 'B', 'results'])
        with ProcessPoolExecutor() as workers:
            for M, B, R, dt in run(m_range, b_range, workers, trial_count):
                if show_timing:
                    # Timing chatter is kept off stdout (the NLJ channel).
                    log.write('M=%d, B=%d, t=%f\n' % (M, B, dt))
                    log.flush()
                out.write([M, B, R])
Example #27
0
    def _run_pipeline(self,
                      source,
                      messages_sink,
                      segments_sink,
                      expected,
                      args=None):
        """Run the segment pipeline with standard args and verify its output.

        Appends the source/dest/segments flags to *args*, runs the
        pipeline, and asserts the messages sink matches *expected*
        (order-insensitive).
        """
        # Bug fix: the original 'args=[]' default was a shared mutable
        # default, and 'args +=' also mutated the caller's list in place.
        # Copy into a fresh list instead.
        args = list(args) if args is not None else []
        args += [
            '--source=%s' % source, '--source_schema={"fields": []}',
            '--dest=%s' % messages_sink,
            '--segments=%s' % segments_sink, '--wait'
        ]

        pipe_segment_run(args)

        with nlj.open(expected) as expected:
            with open_shards('%s*' % messages_sink) as output:
                assert sorted(expected) == sorted(nlj.load(output))
def load_inferred(inference_path, extractors, whitelist):
    """Feed rows from a gzipped NLJ inference file through *extractors*.

    Rows whose 'mmsi' is absent from *whitelist* (when one is given) are
    skipped. After all rows are processed, each extractor is finalized.
    """
    with gzip.GzipFile(inference_path) as fobj:
        with nlj.open(fobj, json_lib='ujson') as reader:
            for record in reader:
                if whitelist is not None and record['mmsi'] not in whitelist:
                    continue
                # Parsing dates is expensive and all extractors use dates,
                # so parse them once up front.
                record['start_time'] = _parse(record['start_time'])
                #dateutil.parser.parse(row['start_time'])
                for extractor in extractors:
                    extractor.extract(record)
    for extractor in extractors:
        extractor.finalize()
Example #29
0
def main():
    """Render dice-roll NLJ from stdin as HTML tables, one per N."""
    args = docopt.docopt(__doc__)
    raw_title = args['--title']
    suffix = '(%s)' % raw_title if raw_title else ''

    with nlj.open(sys.stdin) as src:
        headers = next(src)[1:]
        tables = defaultdict(lambda: tablib.Dataset(headers=headers))
        for n, num, frac_parts in src:
            tables[n].append([num, format_frac(Fraction(*frac_parts))])

    content = ''.join(
        TABLE.format(N=n, table=ds.html) for n, ds in tables.items())
    print(TEMPLATE.format(
        title='dice rolls ' + suffix,
        content=content,
    ))
Example #30
0
def convert_leads_by_id(find_id=None):
    """Convert the leads whose ids appear in *find_id* into contacts.

    Scans ``data/outputs/leads.json`` and, for each lead whose id is in
    *find_id*, creates an sf_lead, converts it to a contact, and writes
    the contact. Matched ids are removed from *find_id*; scanning stops
    once all requested ids are handled.
    """
    # Bug fix: the original evaluated the bare name 'exit' without calling
    # it, so an empty/None id list fell through to len(None)/the loop.
    # Return early instead.
    if not find_id:
        return

    remaining = len(find_id)
    with nlj.open('data/outputs/leads.json') as src:
        for line in src:
            if line['id'] in find_id:
                convert = sf_lead(account_name=line['account_name'],
                                  first_name=line['first_name'],
                                  last_name=line['last_name'],
                                  timestamp=datetime.datetime.strptime(
                                      line['created_at'], "%Y-%m-%d %H:%M:%S"))
                convert.convert_to_contact()
                convert.write_contact()
                remaining -= 1
                find_id.remove(line['id'])
                if remaining == 0:
                    break
Example #31
0
    def open(self, f, mode='r', **kwargs):

        """
        See `newlinejson.open()`.

        The `kwargs` here override the arguments from `__init__`.

        Raises
        ------
        ValueError
            If `mode` is not 'r' or 'w'.
        ImportError
            If the optional `newlinejson` dependency is not installed.
        """

        if mode not in ('r', 'w'):
            raise ValueError("Mode {} is not supported".format(mode))

        try:
            import newlinejson as nlj
        except ImportError:
            # Fixed typo in the install hint ('newlinejosn' -> 'newlinejson').
            raise ImportError(
                "Please 'pip install newlinejson'.  This is an optional "
                "dependency, but is required for the NewlineJSON() serializer.")
        # kwargs passed here take precedence over the stored defaults.
        kw = dict(self._kwargs.items(), **kwargs)
        return nlj.open(f, mode=mode, **kw)
Example #32
0
def main(filename):
    """Plot median tick counts per message count, one PDF page per bandwidth."""
    table = defaultdict(dict)
    sys.stdin.readline()  # discard the header row
    for msgs, bandwidth, ticks in nlj.open(sys.stdin, 'r'):
        ticks.sort()
        # Store (median, half-range) per (bandwidth, message-count) cell.
        spread = (max(ticks) - min(ticks)) / 2.0
        table[bandwidth][msgs] = median(ticks), spread

    with PdfPages(filename) as pdf:
        for i, bandwidth in enumerate(sorted(table)):
            row = table[bandwidth]
            plt.title('Bandwidth (B): %d messages' % (bandwidth, ))
            plt.xlabel('messages per node (M)')
            plt.ylabel('ticks')
            plt.xlim([0, max(row) + 1])
            plt.errorbar(list(row.keys()), [y for y, _ in row.values()],
                         fmt='ko',
                         yerr=[err for _, err in row.values()])
            plt.grid(True)
            pdf.savefig()
            plt.close()
Example #33
0
def test_nlj2csv_nulls(tmpdir, dicts_with_null_path):

    """
    Null JSON fields should become empty CSV fields
    """

    outfile = str(tmpdir.mkdir('test').join('out.csv'))
    result = CliRunner().invoke(main, [
        'nlj2csv', dicts_with_null_path, outfile
    ])
    assert result.exit_code == 0
    with nlj.open(dicts_with_null_path) as expected:
        with open(outfile) as actual:
            for e, a in zip(expected, csv.DictReader(actual)):
                # NOTE(review): this compares 'a' against a transformed copy
                # of itself, so it can never fail; the intent was probably
                # to build the right-hand side from 'e' -- confirm and fix.
                assert a == dict((k, v if v else "") for k, v in six.iteritems(a))

    # Double check that None was not written to a CSV field
    with open(outfile) as f:
        data = f.read()
        assert len(data) > 0
        assert 'None' not in data
Example #34
0
	def on_data(self, data):
		"""Handle one raw tweet payload from the stream (Python 2 code).

		Strips stopwords, classifies intent locally and polarity via the
		sentiment140 web API, then appends a summary record to
		twitterJson.json and exits.
		"""
		# NOTE(review): the NLJ handle is never closed and is opened in 'w'
		# mode on every call (truncating the file), and exit() stops after
		# a single tweet -- presumably intentional for a one-shot run;
		# verify before reuse.
		dst=nlj.open("twitterJson.json", "w")
		all_data = json.loads(data)
		tweet = all_data["text"]
		# Drop non-ASCII characters instead of raising.
		tweet=tweet.encode('ascii',errors='ignore')
		tut=tweet.split()
		# The last token is discarded -- TODO confirm why (trailing URL?).
		tut=tut[ :-1]
		newtut=stopwords.stop(tut)
		words=len(newtut)
		inti=" ".join(newtut)
		senti="+".join(newtut)
		intent=intention.intention(inti)
		# Remote polarity classification; '+' joins words for the URL query.
		ux=urllib.urlopen('http://www.sentiment140.com/api/classify?text='+senti+'&callback=myJsFunction')
		ru=ux.read()
		js=json.loads(ru)
		polarity=js['results']['polarity']
		sentimnt=sentiment.sentim(polarity)
		x={'coordinates':all_data['coordinates'],'WordCount':words,'tweet':tweet,'Sentiment':sentimnt,'Intent':intent}
		dst.write(x)
		print x
		exit()
Example #35
0
def main():
    """Build one HTML table per N from dice-roll NLJ on stdin and print it."""
    cli = docopt.docopt(__doc__)
    given_title = cli['--title']
    given_title = '(%s)' % given_title if given_title else ''

    with nlj.open(sys.stdin) as reader:
        headers = next(reader)[1:]
        datasets = defaultdict(lambda: tablib.Dataset(headers=headers))
        for record in reader:
            n_val, num, frac_pair = record
            datasets[n_val].append(
                [num, format_frac(Fraction(*frac_pair))],
            )

    chunks = [TABLE.format(N=n_val, table=ds.html)
              for n_val, ds in datasets.items()]
    print(TEMPLATE.format(
        title='dice rolls ' + given_title,
        content=''.join(chunks),
        ))
Example #36
0
def main(filename):
    """Plot median ticks (with spread error bars) per bandwidth into a PDF."""
    cells = defaultdict(dict)
    sys.stdin.readline()  # skip header line
    for m_count, b_width, ticks in nlj.open(sys.stdin, 'r'):
        ticks.sort()
        half_range = (max(ticks) - min(ticks)) / 2.0
        cells[b_width][m_count] = median(ticks), half_range

    with PdfPages(filename) as pdf:
        for i, b_width in enumerate(sorted(cells)):
            series = cells[b_width]
            plt.title('Bandwidth (B): %d messages' % (b_width,))
            plt.xlabel('messages per node (M)')
            plt.ylabel('ticks')
            plt.xlim([0, max(series) + 1])
            plt.errorbar(
                list(series.keys()),
                [mid for mid, _ in series.values()],
                fmt='ko',
                yerr=[err for _, err in series.values()])
            plt.grid(True)
            pdf.savefig()
            plt.close()
Example #37
0
def test_open_no_with_statement(dicts_path):
    """nlj.open() must also be usable without a 'with' block."""
    stream = nlj.open(dicts_path)
    next(stream)
    stream.close()
Example #38
0
def test_stream_invalid_mode(dicts_path):
    """Opening a stream with an unknown mode must raise ValueError."""
    with pytest.raises(ValueError):
        with nlj.open(dicts_path, mode='_'):
            pass