Beispiel #1
0
    def test_schema_hints(self):
        """Check the schema column types produced with and without schema hints.

        The conversion is run twice: first on self.stringSource with an empty
        hints dict, then on data/twoJSONRecords.json with INT hints for the
        'chunkSize' and 'length' columns. Both runs are expected to yield the
        same sequence of column types.
        """
        # Column types expected from both runs:
        expectedColTypes = [
            'VARCHAR(40)', 'VARCHAR(40)', 'TEXT', 'VARCHAR(255)', 'TEXT', 'VARCHAR(255)', 'TEXT', 'TEXT',
            'DATETIME', 'TEXT', 'DATETIME', 'TEXT', 'TEXT',
            'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)',
            'INT', 'INT', 'VARCHAR(255)', 'TEXT', 'TEXT', 'TEXT', 'INT', 'TEXT', 'TEXT',
            'VARCHAR(255)', 'TEXT', 'TINYINT', 'TEXT', 'VARCHAR(255)', 'INT', 'INT', 'VARCHAR(255)', 'TEXT',
            'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)',
            'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)',
            'TEXT', 'TEXT', 'VARCHAR(255)', 'TEXT', 'TINYINT', 'TEXT',
            'INT', 'INT', 'INT', 'INT',
            'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'INT', 'TEXT',
            'VARCHAR(40)', 'VARCHAR(40)', 'VARCHAR(40)', 'VARCHAR(40)',
        ]

        # Run 1: no schema hints at all.
        self.fileConverter = JSONToRelation(self.stringSource,
                                            OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV),
                                            mainTableName='EdxTrackEvent',
                                            schemaHints = OrderedDict()
                                            )
        edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter, "EdxTrackEvent", useDisplayNameCache=True)
        self.fileConverter.jsonParserInstance = edxJsonToRelParser
        self.fileConverter.convert()
        schema = self.fileConverter.getSchema()
        # list(...) keeps the comparison valid where map() returns an iterator:
        self.assertEqual(expectedColTypes, list(map(ColumnSpec.getType, schema)))

        # Run 2: explicit INT hints for two columns. The OrderedDict is built
        # from a list of pairs so the hint order is the order written here; a
        # dict literal would hand the pairs over in arbitrary order on
        # interpreters whose dicts are unordered.
        self.stringSource = InURI(os.path.join(os.path.dirname(__file__),"data/twoJSONRecords.json"))
        self.fileConverter = JSONToRelation(self.stringSource,
                                            OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV),
                                            mainTableName='EdxTrackEvent',
                                            schemaHints = OrderedDict([('chunkSize', ColDataType.INT),
                                                                       ('length', ColDataType.INT)])
                                            )
        edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter, "EdxTrackEvent", useDisplayNameCache=True)
        self.fileConverter.jsonParserInstance = edxJsonToRelParser
        self.fileConverter.convert()
        schema = self.fileConverter.getSchema()
        self.assertEqual(expectedColTypes, list(map(ColumnSpec.getType, schema)))
def convert(inFilePath, destDir, targetFormat, dropTables=False):
    """Convert one JSON tracking log file into relational output.

    The output path is obtained from buildOutputFileName(inFilePath, destDir).
    A log file named j2s_<inputFileName>.log is placed in
    <destDir>/../TransformLogs; the directory is created if it is not
    writable/existing.

    :param inFilePath: path of the JSON input file
    :param destDir: destination directory handed to buildOutputFileName();
        also the anchor for the TransformLogs directory
    :param targetFormat: 'csv' selects CSV output, 'sql_dump' selects SQL
        insert statements, any other value selects both SQL inserts and CSV
    :param dropTables: forwarded to EdXTrackLogJSONParser as replaceTables

    If the EdXTrackLogJSONParser cannot be created, the reason is written to
    the log file, removal of the output file is attempted, and the process
    exits with status 1.
    """
    # Output file is name of input file with the
    # .json extension replaced by .sql
    outFullPath = buildOutputFileName(inFilePath, destDir)

    # Log file will go to <destDir>/../TransformLogs, the file being named j2s_<inputFileName>.log:
    logDir = os.path.join(destDir, '..', 'TransformLogs')
    if not os.access(logDir, os.W_OK):
        try:
            os.makedirs(logDir)
        except OSError:
            # Log dir already exists:
            pass

    logFile = os.path.join(logDir, 'j2s_%s.log' % os.path.basename(inFilePath))

    # Create an instance of JSONToRelation, taking input from the given file:
    # and pumping output to the given output path:
    if targetFormat == 'csv':
        outputFormat = OutputDisposition.OutputFormat.CSV
    elif targetFormat == 'sql_dump':
        outputFormat = OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS
    else:
        outputFormat = OutputDisposition.OutputFormat.SQL_INSERTS_AND_CSV

    outSQLFile = OutputFile(outFullPath, outputFormat,
                            options='wb')  # overwrite any existing sql file
    jsonConverter = JSONToRelation(InURI(inFilePath),
                                   outSQLFile,
                                   mainTableName='EdxTrackEvent',
                                   logFile=logFile,
                                   progressEvery=10000)
    try:
        jsonConverter.setParser(
            EdXTrackLogJSONParser(jsonConverter,
                                  'EdxTrackEvent',
                                  replaceTables=dropTables,
                                  dbName='Edx',
                                  progressEvery=10000))
    except Exception as e:
        with open(logFile, 'w') as fd:
            # repr() replaces the backtick shorthand, which is a syntax
            # error on Python 3; the logged text is unchanged.
            fd.write(
                "In json2sql: could not create EdXTrackLogJSONParser: %s" %
                repr(e))
        # Try to delete the .sql file that was created when
        # the OutputFile instance was made in the JSONToRelation
        # instantiation statement above:
        try:
            outSQLFile.remove()
        except Exception:
            # Best-effort cleanup: we are exiting with an error anyway.
            pass
        sys.exit(1)

    jsonConverter.convert()
Beispiel #3
0
 def test_edX_tracking_import(self):
     """Convert data/edxTrackLogSample.json to testEdXImport.csv with a header row.

     Contains no assertions: it passes when the conversion finishes
     without raising.
     """
     samplePath = os.path.join(os.path.dirname(__file__), "data/edxTrackLogSample.json")
     trackLogSource = InURI(samplePath)
     csvDestination = OutputFile("testEdXImport.csv", OutputDisposition.OutputFormat.CSV)
     self.fileConverter = JSONToRelation(trackLogSource,
                                         csvDestination,
                                         mainTableName='EdxTrackEvent')
     # Attach the edX-specific parser to the converter before running it:
     self.fileConverter.jsonParserInstance = EdXTrackLogJSONParser(self.fileConverter,
                                                                   "EdxTrackEvent",
                                                                   useDisplayNameCache=True)
     self.fileConverter.convert(prependColHeader=True)
Beispiel #4
0
 def test_embedded_json_strings_comma_escaping(self):
     """Convert data/tinyEdXTrackLog.json to testTinyEdXImport.csv with a header row.

     Contains no assertions: it passes when the conversion finishes
     without raising.
     """
     tinyLogPath = os.path.join(os.path.dirname(__file__), "data/tinyEdXTrackLog.json")
     trackLogSource = InURI(tinyLogPath)
     csvDestination = OutputFile("testTinyEdXImport.csv", OutputDisposition.OutputFormat.CSV)
     self.fileConverter = JSONToRelation(trackLogSource,
                                         csvDestination,
                                         mainTableName='EdxTrackEvent')
     # Attach the edX-specific parser to the converter before running it:
     self.fileConverter.jsonParserInstance = EdXTrackLogJSONParser(self.fileConverter,
                                                                   "EdxTrackEvent",
                                                                   useDisplayNameCache=True)
     self.fileConverter.convert(prependColHeader=True)
Beispiel #5
0
 def test_arrays(self):
     """Convert data/jsonArray.json to testArrays.csv with a header row.

     The parser's unittesting flag is switched on before it is attached
     to the converter. Contains no assertions: it passes when the
     conversion finishes without raising.
     """
     arrayJsonPath = os.path.join(os.path.dirname(__file__), "data/jsonArray.json")
     arraySource = InURI(arrayJsonPath)
     csvDestination = OutputFile("testArrays.csv", OutputDisposition.OutputFormat.CSV)
     self.fileConverter = JSONToRelation(arraySource,
                                         csvDestination,
                                         mainTableName='EdxTrackEvent')
     trackLogParser = EdXTrackLogJSONParser(self.fileConverter,
                                            "EdxTrackEvent",
                                            useDisplayNameCache=True)
     trackLogParser.unittesting = True
     self.fileConverter.jsonParserInstance = trackLogParser
     self.fileConverter.convert(prependColHeader=True)
Beispiel #6
0
    def test_edX_stress_import(self):
        """Convert the gzipped log data/tracking.log-20130609.gz to testEdXStressImport.csv.

        The converter is created with progressEvery=10. Start and end of
        the run are announced on stdout. Contains no assertions: it passes
        when the conversion finishes without raising.
        """
        bigLogPath = os.path.join(os.path.dirname(__file__), "data/tracking.log-20130609.gz")
        bigLogSource = InURI(bigLogPath)

        print("Stress test: importing lots...")
        csvDestination = OutputFile("testEdXStressImport.csv", OutputDisposition.OutputFormat.CSV)
        self.fileConverter = JSONToRelation(bigLogSource,
                                            csvDestination,
                                            mainTableName='EdxTrackEvent',
                                            progressEvery=10)
        # Attach the edX-specific parser to the converter before running it:
        self.fileConverter.jsonParserInstance = EdXTrackLogJSONParser(self.fileConverter,
                                                                      "EdxTrackEvent",
                                                                      useDisplayNameCache=True)
        self.fileConverter.convert(prependColHeader=True)
        print("Stress test done")
Beispiel #7
0
    def test_simple_json_to_file(self):
        """Convert self.stringSource to testOutput.csv and compare it with the stored truth file.

        The truth file is data/simpleJsonToFileTruth.txt under self.currDir.
        When the module-level UPDATE_TRUTH flag is set, the truth file is
        refreshed from the new output instead of being compared against.
        """
        self.fileConverter = JSONToRelation(self.stringSource, 
                                            OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV)
                                            )
        edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter, "EdxTrackEvent", useDisplayNameCache=True)
        self.fileConverter.jsonParserInstance = edxJsonToRelParser
        self.fileConverter.convert()

        # One attribute (self.currDir) and one truth file for both branches:
        # the update branch previously read self.curDir and rewrote
        # data/rescueJSONTruth1.txt, which is not the file compared below.
        # NOTE(review): assumes self.currDir is the attribute set up by the
        # fixture, since it is the one used on the always-executed path.
        truthPath = os.path.join(self.currDir, 'data/simpleJsonToFileTruth.txt')
        if UPDATE_TRUTH:
            self.updateTruthFromFile(os.path.join(self.currDir, 'testOutput.csv'),
                                     truthPath)
        else:
            # Only read the truth file when it is actually compared, so that
            # update mode also works when the truth file does not exist yet.
            with open(truthPath, 'r') as fd:
                expected = fd.read()
            self.assertFileContentEquals(expected, "testOutput.csv")