def getType(self):
    '''
    Return the SQL type of this column.

    :return: SQL type of column, in upper case
    :rtype: String
    '''
    # Delegate the type-code-to-name mapping to ColDataType,
    # then normalize to upper case for SQL output.
    typeName = ColDataType().toString(self.colDataType)
    return typeName.upper()
def __init__(self,
             jsonSource,
             destination,
             schemaHints=None,
             jsonParserInstance=None,
             loggingLevel=logging.INFO,
             logFile=None,
             mainTableName='Main',
             progressEvery=1000):
    '''
    Create a JSON-to-Relation converter. The JSON source can be
    a file with JSON objects, a StringIO.StringIO string pseudo file,
    stdin, or a MongoDB.

    The destination can be a file, where CSV is written in Excel-readable
    form, stdout, or a MySQL table specification, where the output rows
    will be inserted.

    SchemaHints optionally specify the SQL types of particular columns.
    By default the processJSONObs() method will be conservative, and
    specify numeric columns as DOUBLE. Even though all encountered values
    for one column could be examined, and a more appropriate type chosen,
    such as INT when only 4-byte integers are ever seen, future additions
    to the table might exceed the INT capacity for that column.

    If schemaHints is provided, it is a Dict mapping column names to
    ColDataType. The column names in schemaHints must match the
    corresponding (fully nested) key names in the JSON objects::

        schemaHints dict: {'msg.length' : ColDataType.INT,
                           'chunkSize' : ColDataType.INT}

    For unit testing isolated methods in this class, set jsonSource and
    destination to None.

    This constructor can be thought of as creating the main relational
    table that will hold all results from the JSON parsers in relational
    form. However, parsers may call startNewTable() to build any new
    tables they wish.

    :param jsonSource: subclass of InputSource that wraps containing JSON structures, or a URL to such a source
    :type jsonSource: {InPipe | InString | InURI | InMongoDB} (InMongoDB not implemented)
    :param destination: instruction to where resulting rows are to be directed
    :type destination: {OutputPipe | OutputFile }
    :param schemaHints: Dict mapping col names to data types (optional). Affects the default (main) table.
        Default is None, treated as an empty OrderedDict.
    :type schemaHints: OrderedDict<String,ColDataType>
    :param jsonParserInstance: a parser that takes one JSON string, and returns a CSV row, or other
        desired output, like SQL dump statements. Parser also must inform this parent object
        of any generated column names.
    :type jsonParserInstance: {GenericJSONParser | EdXTrackLogJSONParser | CourseraTrackLogJSONParser}
    :param loggingLevel: level at which logging output is shown.
    :type loggingLevel: {logging.DEBUG | logging.WARN | logging.INFO | logging.ERROR | logging.CRITICAL}
    :param logFile: path to file where log is to be written. Default is None: log to stdout.
        A warning is logged if logFile is None and the destination is OutputPipe. In this
        case logging messages will be mixed in with the data output.
    :type logFile: String
    :param mainTableName: name of the default (main) output table.
    :type mainTableName: String
    :param progressEvery: number of JSON objects to process before reporting the number in a log info msg. If None, no reporting.
    :type progressEvery: {int | None}
    @raise ValueError: when value of jsonParserInstance is neither None, nor an instance of GenericJSONParser,
        nor one of its subclasses.
    @raise ValueError: when jsonSource is not an instance of InPipe, InString, InURI, or InMongoDB
    '''
    # If jsonSource and destination are both None,
    # the caller is just unit testing some of the methods
    # below:
    if jsonSource is None and destination is None:
        return
    # BUGFIX: default used to be a shared module-level OrderedDict()
    # instance (mutable default argument); normalize None to a fresh
    # OrderedDict here instead:
    if schemaHints is None:
        schemaHints = OrderedDict()
    if not isinstance(jsonSource, InputSource):
        raise ValueError(
            "JSON source must be an instance of InPipe, InString, InURI, or InMongoDB; is %s" % type(jsonSource))
    if not isinstance(schemaHints, OrderedDict):
        raise ValueError(
            "The schemaHints, if provided, must be an OrderedDict.")
    self.jsonSource = jsonSource
    self.destination = destination
    self.mainTableName = mainTableName
    # Timestamp of this load as an ISO-8601 string.
    # NOTE(review): this is local time, not UTC, despite the retired
    # strftime line below mentioning Greenwich Mean Time:
    # self.loadDateTime = time.strftime('%Y%m%d%H%M%s', time.localtime())
    self.loadDateTime = datetime.datetime.now().isoformat()
    self.loadFile = jsonSource.getSourceName()

    # Check schemaHints correctness:
    for typeHint in schemaHints.values():
        if not ColDataType.isinstance(typeHint):
            raise ValueError(
                "Schema hints must be of type ColDataType")
    self.userDefinedHints = schemaHints

    # The following three instance vars are used for accumulating INSERT
    # values when output is a MySQL dump.
    # Current table for which insert values are being collected:
    self.currOutTable = None
    # Insert values so far (array of value arrays):
    self.currValsArray = []
    # Column names for which INSERT values are being collected.
    # Ex.: 'col1,col2':
    self.currInsertSig = None
    # Current approximate len of INSERT statement
    # for cached values:
    self.valsCacheSize = 0
    # Count JSON objects (i.e. JSON file lines) as they are passed
    # to us for parsing. Used for logging malformed entries:
    self.lineCounter = -1

    self.setupLogging(loggingLevel, logFile)
    # Check whether log output would interleave with data output:
    if logFile is None and isinstance(destination, OutputPipe):
        # logger.warning instead of the deprecated logger.warn alias:
        JSONToRelation.logger.warning(
            "If output is to a Unix pipe and no log file name is provided, log output will be mixed with data output."
        )

    if jsonParserInstance is None:
        self.jsonParserInstance = GenericJSONParser(self)
    elif isinstance(jsonParserInstance, GenericJSONParser):
        self.jsonParserInstance = jsonParserInstance
    else:
        raise ValueError(
            "Parameter jsonParserInstance needs to be of class GenericJSONParser, or one of its subclasses."
        )

    #************ Unimplemented Options **************
    #if self.outputFormat == OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS:
    #    raise NotImplementedError("Output as MySQL statements not yet implemented")
    #*************************************************

    # Dict col name to ColumnSpec object:
    self.cols = OrderedDict()
    # Position of column for next col name that is
    # newly encountered:
    self.nextNewColPos = 0
def __init__(self,
             jsonSource,
             destination,
             schemaHints=None,
             jsonParserInstance=None,
             loggingLevel=logging.INFO,
             logFile=None,
             mainTableName='Main',
             progressEvery=1000):
    '''
    Create a JSON-to-Relation converter. The JSON source can be
    a file with JSON objects, a StringIO.StringIO string pseudo file,
    stdin, or a MongoDB.

    NOTE(review): this method is a near-verbatim duplicate of an earlier
    __init__ definition in this file; if both live in the same class the
    later one silently wins. Confirm which copy is intended and delete
    the other.

    The destination can be a file, where CSV is written in Excel-readable
    form, stdout, or a MySQL table specification, where the output rows
    will be inserted.

    SchemaHints optionally specify the SQL types of particular columns.
    By default the processJSONObs() method will be conservative, and
    specify numeric columns as DOUBLE. Even though all encountered values
    for one column could be examined, and a more appropriate type chosen,
    such as INT when only 4-byte integers are ever seen, future additions
    to the table might exceed the INT capacity for that column.

    If schemaHints is provided, it is a Dict mapping column names to
    ColDataType. The column names in schemaHints must match the
    corresponding (fully nested) key names in the JSON objects::

        schemaHints dict: {'msg.length' : ColDataType.INT,
                           'chunkSize' : ColDataType.INT}

    For unit testing isolated methods in this class, set jsonSource and
    destination to None.

    This constructor can be thought of as creating the main relational
    table that will hold all results from the JSON parsers in relational
    form. However, parsers may call startNewTable() to build any new
    tables they wish.

    :param jsonSource: subclass of InputSource that wraps containing JSON structures, or a URL to such a source
    :type jsonSource: {InPipe | InString | InURI | InMongoDB} (InMongoDB not implemented)
    :param destination: instruction to where resulting rows are to be directed
    :type destination: {OutputPipe | OutputFile }
    :param schemaHints: Dict mapping col names to data types (optional). Affects the default (main) table.
        Default is None, treated as an empty OrderedDict.
    :type schemaHints: OrderedDict<String,ColDataType>
    :param jsonParserInstance: a parser that takes one JSON string, and returns a CSV row, or other
        desired output, like SQL dump statements. Parser also must inform this parent object
        of any generated column names.
    :type jsonParserInstance: {GenericJSONParser | EdXTrackLogJSONParser | CourseraTrackLogJSONParser}
    :param loggingLevel: level at which logging output is shown.
    :type loggingLevel: {logging.DEBUG | logging.WARN | logging.INFO | logging.ERROR | logging.CRITICAL}
    :param logFile: path to file where log is to be written. Default is None: log to stdout.
        A warning is logged if logFile is None and the destination is OutputPipe. In this
        case logging messages will be mixed in with the data output.
    :type logFile: String
    :param mainTableName: name of the default (main) output table.
    :type mainTableName: String
    :param progressEvery: number of JSON objects to process before reporting the number in a log info msg. If None, no reporting.
    :type progressEvery: {int | None}
    @raise ValueError: when value of jsonParserInstance is neither None, nor an instance of GenericJSONParser,
        nor one of its subclasses. (Docstring typo "ValueErrer" fixed.)
    @raise ValueError: when jsonSource is not an instance of InPipe, InString, InURI, or InMongoDB
    '''
    # If jsonSource and destination are both None,
    # the caller is just unit testing some of the methods
    # below:
    if jsonSource is None and destination is None:
        return
    # BUGFIX: default used to be a shared module-level OrderedDict()
    # instance (mutable default argument); normalize None to a fresh
    # OrderedDict here instead:
    if schemaHints is None:
        schemaHints = OrderedDict()
    if not isinstance(jsonSource, InputSource):
        raise ValueError("JSON source must be an instance of InPipe, InString, InURI, or InMongoDB; is %s" % type(jsonSource))
    if not isinstance(schemaHints, OrderedDict):
        raise ValueError("The schemaHints, if provided, must be an OrderedDict.")
    self.jsonSource = jsonSource
    self.destination = destination
    self.mainTableName = mainTableName
    # Timestamp of this load as an ISO-8601 string.
    # NOTE(review): this is local time, not UTC, despite the retired
    # strftime line below mentioning Greenwich Mean Time:
    # self.loadDateTime = time.strftime('%Y%m%d%H%M%s', time.localtime())
    self.loadDateTime = datetime.datetime.now().isoformat()
    self.loadFile = jsonSource.getSourceName()

    # Check schemaHints correctness:
    for typeHint in schemaHints.values():
        if not ColDataType.isinstance(typeHint):
            raise ValueError("Schema hints must be of type ColDataType")
    self.userDefinedHints = schemaHints

    # The following three instance vars are used for accumulating INSERT
    # values when output is a MySQL dump.
    # Current table for which insert values are being collected:
    self.currOutTable = None
    # Insert values so far (array of value arrays):
    self.currValsArray = []
    # Column names for which INSERT values are being collected.
    # Ex.: 'col1,col2':
    self.currInsertSig = None
    # Current approximate len of INSERT statement
    # for cached values (stray semicolon removed):
    self.valsCacheSize = 0
    # Count JSON objects (i.e. JSON file lines) as they are passed
    # to us for parsing. Used for logging malformed entries:
    self.lineCounter = -1

    self.setupLogging(loggingLevel, logFile)
    # Check whether log output would interleave with data output:
    if logFile is None and isinstance(destination, OutputPipe):
        # logger.warning instead of the deprecated logger.warn alias:
        JSONToRelation.logger.warning("If output is to a Unix pipe and no log file name is provided, log output will be mixed with data output.")

    if jsonParserInstance is None:
        self.jsonParserInstance = GenericJSONParser(self)
    elif isinstance(jsonParserInstance, GenericJSONParser):
        self.jsonParserInstance = jsonParserInstance
    else:
        raise ValueError("Parameter jsonParserInstance needs to be of class GenericJSONParser, or one of its subclasses.")

    #************ Unimplemented Options **************
    #if self.outputFormat == OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS:
    #    raise NotImplementedError("Output as MySQL statements not yet implemented")
    #*************************************************

    # Dict col name to ColumnSpec object:
    self.cols = OrderedDict()
    # Position of column for next col name that is
    # newly encountered (stray semicolon removed):
    self.nextNewColPos = 0
def getDefaultValue(self):
    '''
    Return the default value associated with this column's SQL data type.

    :return: the default value registered for self.colDataType
    '''
    # Look the value up in ColDataType's type-to-default mapping:
    defaults = ColDataType().defaultValues
    return defaults[self.colDataType]