Ejemplo n.º 1
0
    def test_schema_hints(self):
        """Check the generated schema's column types without and with schema hints.

        The conversion is run twice, both times writing CSV to
        ``testOutput.csv``:

        1. on the source already held in ``self.stringSource``, with an
           empty ``schemaHints`` mapping;
        2. on ``data/twoJSONRecords.json``, with INT hints for the
           ``chunkSize`` and ``length`` columns.

        Both runs are expected to yield the same sequence of column types.
        """
        # Column types expected from both runs, in schema order.
        expectedColTypes = ['VARCHAR(40)', 'VARCHAR(40)', 'TEXT', 'VARCHAR(255)', 'TEXT', 'VARCHAR(255)', 'TEXT', 'TEXT', 'DATETIME', 'TEXT', 'DATETIME', 'TEXT', 'TEXT', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'INT', 'INT', 'VARCHAR(255)', 'TEXT', 'TEXT', 'TEXT', 'INT', 'TEXT', 'TEXT', 'VARCHAR(255)', 'TEXT', 'TINYINT', 'TEXT', 'VARCHAR(255)', 'INT', 'INT', 'VARCHAR(255)', 'TEXT', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'TEXT', 'TEXT', 'VARCHAR(255)', 'TEXT', 'TINYINT', 'TEXT', 'INT', 'INT', 'INT', 'INT', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'INT', 'TEXT', 'VARCHAR(40)', 'VARCHAR(40)', 'VARCHAR(40)', 'VARCHAR(40)']

        # --- Run 1: no schema hints.
        self.fileConverter = JSONToRelation(self.stringSource,
                                            OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV),
                                            mainTableName='EdxTrackEvent',
                                            schemaHints = OrderedDict()
                                            )
        edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter, "EdxTrackEvent", useDisplayNameCache=True)
        self.fileConverter.jsonParserInstance = edxJsonToRelParser
        self.fileConverter.convert()
        schema = self.fileConverter.getSchema()
        # list(...) so the comparison also holds where map() returns an
        # iterator instead of a list (Python 3).
        self.assertEqual(expectedColTypes, list(map(ColumnSpec.getType, schema)))

        # --- Run 2: INT hints for two columns.
        self.stringSource = InURI(os.path.join(os.path.dirname(__file__),"data/twoJSONRecords.json"))
        # Build the hints from a pair list: an OrderedDict initialized from a
        # plain dict literal only inherits whatever order that dict had.
        colTypeHints = OrderedDict([('chunkSize', ColDataType.INT),
                                    ('length', ColDataType.INT)])
        self.fileConverter = JSONToRelation(self.stringSource,
                                            OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV),
                                            mainTableName='EdxTrackEvent',
                                            schemaHints = colTypeHints
                                            )
        edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter, "EdxTrackEvent", useDisplayNameCache=True)
        self.fileConverter.jsonParserInstance = edxJsonToRelParser
        self.fileConverter.convert()
        schema = self.fileConverter.getSchema()
        self.assertEqual(expectedColTypes, list(map(ColumnSpec.getType, schema)))
Ejemplo n.º 2
0
 def test_edX_tracking_import(self):
     """Smoke test: convert the edX tracking-log sample to CSV with a header row.

     Reads ``data/edxTrackLogSample.json`` (relative to this file) and
     writes ``testEdXImport.csv``. There are no assertions; the test
     passes as long as the conversion raises nothing.
     """
     sampleLogPath = os.path.join(os.path.dirname(__file__),"data/edxTrackLogSample.json")
     inputSource = InURI(sampleLogPath)
     csvDestination = OutputFile("testEdXImport.csv", OutputDisposition.OutputFormat.CSV)
     self.fileConverter = JSONToRelation(inputSource, csvDestination, mainTableName='EdxTrackEvent')
     # The parser takes the converter as an argument, so it is created
     # after the converter and attached to it afterwards.
     trackLogParser = EdXTrackLogJSONParser(self.fileConverter,
                                            "EdxTrackEvent",
                                            useDisplayNameCache=True)
     self.fileConverter.jsonParserInstance = trackLogParser
     self.fileConverter.convert(prependColHeader=True)
Ejemplo n.º 3
0
 def test_embedded_json_strings_comma_escaping(self):
     """Smoke test: convert the tiny edX tracking log to CSV with a header row.

     Reads ``data/tinyEdXTrackLog.json`` (relative to this file) and
     writes ``testTinyEdXImport.csv``. The output is not inspected here;
     the test passes as long as the conversion raises nothing.
     """
     tinyLogPath = os.path.join(os.path.dirname(__file__),"data/tinyEdXTrackLog.json")
     inputSource = InURI(tinyLogPath)
     csvDestination = OutputFile("testTinyEdXImport.csv", OutputDisposition.OutputFormat.CSV)
     self.fileConverter = JSONToRelation(inputSource, csvDestination, mainTableName='EdxTrackEvent')
     # The parser takes the converter as an argument, so it is created
     # after the converter and attached to it afterwards.
     trackLogParser = EdXTrackLogJSONParser(self.fileConverter,
                                            "EdxTrackEvent",
                                            useDisplayNameCache=True)
     self.fileConverter.jsonParserInstance = trackLogParser
     self.fileConverter.convert(prependColHeader=True)
Ejemplo n.º 4
0
 def test_arrays(self):
     """Smoke test: convert a JSON-array sample to CSV with a header row.

     Reads ``data/jsonArray.json`` (relative to this file) and writes
     ``testArrays.csv``. The parser's ``unittesting`` attribute is set to
     True before it is attached to the converter. There are no
     assertions; the test passes as long as the conversion raises nothing.
     """
     arraySamplePath = os.path.join(os.path.dirname(__file__),"data/jsonArray.json")
     inputSource = InURI(arraySamplePath)
     csvDestination = OutputFile("testArrays.csv", OutputDisposition.OutputFormat.CSV)
     self.fileConverter = JSONToRelation(inputSource, csvDestination, mainTableName='EdxTrackEvent')
     trackLogParser = EdXTrackLogJSONParser(self.fileConverter,
                                            "EdxTrackEvent",
                                            useDisplayNameCache=True)
     # Flag the parser as running under unit test before handing it over.
     trackLogParser.unittesting = True
     self.fileConverter.jsonParserInstance = trackLogParser
     self.fileConverter.convert(prependColHeader=True)
Ejemplo n.º 5
0
def convert(inFilePath, destDir, targetFormat, dropTables=False):
    """Convert one edX tracking-log JSON file into relational output.

    The output path is derived from ``inFilePath`` and ``destDir`` by
    ``buildOutputFileName``. A log file named ``j2s_<input basename>.log``
    is placed in ``<destDir>/../TransformLogs``, which is created if it is
    not writable/present.

    :param inFilePath: path of the JSON input file; wrapped in an ``InURI``.
    :param destDir: directory handed to ``buildOutputFileName``; also the
        anchor for the log directory.
    :param targetFormat: ``'csv'`` selects CSV output, ``'sql_dump'`` selects
        SQL insert statements; any other value selects both SQL inserts
        and CSV.
    :param dropTables: passed to the parser as ``replaceTables``.

    If the ``EdXTrackLogJSONParser`` cannot be created, the error is written
    to the log file, removal of the freshly created output file is
    attempted, and the process exits with status 1.
    """
    # NOTE(review): the original comment says the output name is the input
    # name with .json replaced by .sql -- confirm against buildOutputFileName.
    outFullPath = buildOutputFileName(inFilePath, destDir)

    # Log file will go to <destDir>/../TransformLogs, the file being named j2s_<inputFileName>.log:
    logDir = os.path.join(destDir, '..', 'TransformLogs')
    if not os.access(logDir, os.W_OK):
        try:
            os.makedirs(logDir)
        except OSError:
            # Log dir already exists:
            pass

    logFile = os.path.join(logDir, 'j2s_%s.log' % os.path.basename(inFilePath))

    # Create an instance of JSONToRelation, taking input from the given file:
    # and pumping output to the given output path:
    if targetFormat == 'csv':
        outputFormat = OutputDisposition.OutputFormat.CSV
    elif targetFormat == 'sql_dump':
        outputFormat = OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS
    else:
        outputFormat = OutputDisposition.OutputFormat.SQL_INSERTS_AND_CSV

    outSQLFile = OutputFile(outFullPath, outputFormat,
                            options='wb')  # overwrite any existing sql file
    jsonConverter = JSONToRelation(InURI(inFilePath),
                                   outSQLFile,
                                   mainTableName='EdxTrackEvent',
                                   logFile=logFile,
                                   progressEvery=10000)
    try:
        jsonConverter.setParser(
            EdXTrackLogJSONParser(jsonConverter,
                                  'EdxTrackEvent',
                                  replaceTables=dropTables,
                                  dbName='Edx',
                                  progressEvery=10000))
    except Exception as e:
        # repr() instead of the backtick shorthand: same text, but valid
        # syntax on Python 3 as well as Python 2.
        with open(logFile, 'w') as fd:
            fd.write(
                "In json2sql: could not create EdXTrackLogJSONParser: %s" %
                repr(e))
        # Try to delete the .sql file that was created when
        # the OutputFile instance was made in the JSONToRelation
        # instantiation statement above:
        try:
            outSQLFile.remove()
        except Exception:
            # Best-effort cleanup; we are about to exit with an error anyway.
            pass
        sys.exit(1)

    jsonConverter.convert()
Ejemplo n.º 6
0
    def test_edX_stress_import(self):
        """Stress test: convert a gzipped tracking log to CSV with a header row.

        Reads ``data/tracking.log-20130609.gz`` (relative to this file) and
        writes ``testEdXStressImport.csv``, with ``progressEvery=10``.
        A message is printed before and after the run. There are no
        assertions; the test passes as long as the conversion raises nothing.
        """
        bigLogPath = os.path.join(os.path.dirname(__file__),"data/tracking.log-20130609.gz")
        inputSource = InURI(bigLogPath)

        print("Stress test: importing lots...")
        csvDestination = OutputFile("testEdXStressImport.csv", OutputDisposition.OutputFormat.CSV)
        self.fileConverter = JSONToRelation(inputSource,
                                            csvDestination,
                                            mainTableName='EdxTrackEvent',
                                            progressEvery=10)
        # The parser takes the converter as an argument, so it is created
        # after the converter and attached to it afterwards.
        trackLogParser = EdXTrackLogJSONParser(self.fileConverter,
                                               "EdxTrackEvent",
                                               useDisplayNameCache=True)
        self.fileConverter.jsonParserInstance = trackLogParser
        self.fileConverter.convert(prependColHeader=True)
        print("Stress test done")
Ejemplo n.º 7
0
 def setUp(self):
     """Prepare a default converter and clear output left by earlier runs.

     Sets ``self.currDir``, a temporary log file (``self.tmpLogFile``),
     ``self.stringSource`` pointing at ``data/twoJSONRecords.json``, and
     ``self.fileConverter``, which writes CSV to an ``OutputPipe`` and has
     an ``EdXTrackLogJSONParser`` attached. It then deletes, from the
     current working directory, any CSV files that the tests produce.
     """
     super(TestJSONToRelation, self).setUp()
     self.currDir = os.path.dirname(__file__)
     self.tmpLogFile = tempfile.NamedTemporaryFile()
     self.stringSource = InURI(os.path.join(os.path.dirname(__file__),"data/twoJSONRecords.json"))
     self.fileConverter = JSONToRelation(self.stringSource,
                                         OutputPipe(OutputDisposition.OutputFormat.CSV),
                                         mainTableName='EdxTrackEvent',
                                         logFile=self.tmpLogFile.name
                                         )
     edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter, "EdxTrackEvent", useDisplayNameCache=True)
     self.fileConverter.jsonParserInstance = edxJsonToRelParser

     # Remove the various test output files if they exist:
     staleOutputFiles = ("testOutput.csv",
                         "testOutputWithHeader.csv",
                         "testArrays.csv",
                         "testTinyEdXImport.csv",
                         "testEdXImport.csv",
                         "testEdXStressImport.csv")
     for staleOutputFile in staleOutputFiles:
         try:
             os.remove(staleOutputFile)
         except OSError:
             # Usually just "no such file". Catching OSError rather than
             # everything lets KeyboardInterrupt/SystemExit propagate.
             pass