Code Example #1
import pyarrow as pa
from json2parquet import convert_json


def Json2Parq(args=None):
    # Explicit schema: both columns are stored as UTF-8 strings.
    schema = pa.schema([
        pa.field('QueryID', pa.string()),
        pa.field('QueryText', pa.string()),
    ])
    # Format of the timestamp strings in the source data (unused in this snippet).
    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    input_filename = "/data/query_logs.json"
    output_filename = "/Users/ka/2020fa-final-project-kumar-anish/data/query_logs.parquet"
    # Pass the schema explicitly so the column types are not inferred from the data.
    convert_json(input_filename, output_filename, schema)
    print("done...")
Code Example #2
    def invertedIndex(self, keyColumn, valueColumn, fileName, parquet=True):
        """Map each key to the list of values that occur with it, save the
        index as JSON, and optionally convert it to Parquet.

        Requires: import json; from json2parquet import convert_json
        """
        if self.df is None:
            print("No Data Available")
        else:
            inverted_index = dict()
            # Collect every value seen for a given key into one list.
            for index, row in self.df.iterrows():
                if row[keyColumn] in inverted_index:
                    inverted_index[row[keyColumn]].append(row[valueColumn])
                else:
                    inverted_index[row[keyColumn]] = [row[valueColumn]]

            json_filename = fileName + ".json"
            with open(json_filename, 'w') as outfile:
                json.dump(inverted_index, outfile)

            if parquet:
                convert_json(json_filename, fileName + ".parquet")
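
As a design note, the iterrows() loop above can be collapsed into a single pandas groupby. A small self-contained sketch; the column names and data are made up for illustration:

import pandas as pd

df = pd.DataFrame({
    "city":  ["Tokyo", "Osaka", "Tokyo"],
    "query": ["ramen", "takoyaki", "sushi"],
})

# groupby collects every value seen for a key into one list,
# which is exactly what the iterrows() loop builds by hand.
inverted_index = df.groupby("city")["query"].apply(list).to_dict()
print(inverted_index)  # {'Osaka': ['takoyaki'], 'Tokyo': ['ramen', 'sushi']}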
Code Example #3
from json2parquet import convert_json

convert_json("../files/worldcities.json", "../files/worldcities.parquet")
Code Example #4
def JSONtoParquet(jsonFile, fileName):
    # Thin wrapper: fileName is the output path without the .parquet extension.
    convert_json(jsonFile, fileName + ".parquet")
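
A hypothetical call, reusing the worldcities file from Code Example #3 (the filenames are illustrative):

# Writes ../files/worldcities.parquet next to the input.
JSONtoParquet("../files/worldcities.json", "../files/worldcities")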
Code Example #5
File: ndj2par.py  Project: autumnli11/data_model-1
import os
import shutil
import gzip

from json2parquet import convert_json

# This script converts gzipped ndjson files from ndjson_dir_name to parquet
# files stored in '../../parquet'.

base_dir = os.path.dirname(__file__)
ndjson_dir_name = "../../../zq-sample-data/zeek-ndjson"
unzipped_dir = '../../unzipped_ndj'
parquet_dir = '../../parquet'

os.makedirs(os.path.join(base_dir, unzipped_dir), exist_ok=True)
os.makedirs(os.path.join(base_dir, parquet_dir), exist_ok=True)

for root, dirs, files in os.walk(ndjson_dir_name, topdown=False):
    for name in files:
        src_filename = os.path.join(base_dir, unzipped_dir,
                                    name.split('.')[0] + '.ndjson')
        dest_filename = os.path.join(base_dir, parquet_dir,
                                     name.split('.')[0] + '.parquet')
        zipped_ndjson_file = os.path.join(root, name)
        print("processing " + zipped_ndjson_file)

        # Decompress the gzipped log without reading it all into memory.
        with gzip.open(zipped_ndjson_file, 'rb') as f_in:
            with open(src_filename, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        try:
            convert_json(src_filename, dest_filename)
        except Exception as e:
            print("Failed to process the file: " + name)
            print(e)
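
pyarrow can also read newline-delimited JSON directly, so a variant of this pipeline could skip json2parquet entirely. A minimal sketch, assuming a single already-unzipped Zeek log (the filename is illustrative):

import pyarrow.json as pj
import pyarrow.parquet as pq

# pyarrow's JSON reader expects newline-delimited JSON,
# which is what the unzipped Zeek logs are.
table = pj.read_json("conn.ndjson")
pq.write_table(table, "conn.parquet")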
Code Example #6
# Use the Go IEX pcap2json examples below.
# pcap2csv parses out just open, high, low, close, and volume by symbol, with a nanosecond timestamp.
pcap2csv < data%2Ffeeds%2F20180913%2F20180913_IEXTP1_DEEP1.0.pcap > 20180913_IEXTP1_DEEP1.0.csv
# pcap2json parses out the TCP headers and leaves all of the other message data.
pcap2json < data%2Ffeeds%2F20180913%2F20180913_IEXTP1_DEEP1.0.pcap > 20180913_IEXTP1_DEEP1.0.json

# The json2parquet Python library converts the JSON to Parquet, which pandas and pyarrow handle more efficiently.
from json2parquet import convert_json
# Infer the schema (requires reading the dataset for column names).
convert_json('20180913_IEXTP1_DEEP1.0.json', '20180913_IEXTP1_DEEP1.0.parquet')


# -*- coding: utf-8 -*-
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')


if __name__ == '__main__':
    main()


from json2parquet import convert_json
columns = [
    "method", "path", "format", "controller", "action", "status", "duration",
    "view", "db", "ip", "route", "request_id", "req_params", "user_id",
    "realname", "nickname", "email", "source", "tags", "@timestamp", "@version"
]

convert_json('logstasher.log', 'logstasher_current.parquet', columns)
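
To spot-check the result, the Parquet file can be loaded back with pandas; a sketch assuming the output path above:

import pandas as pd

# Confirm the requested columns survived the conversion.
df = pd.read_parquet('logstasher_current.parquet')
print(df.columns.tolist())
print(df.head())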