def parse(out, infile, modulestore):
    '''
    Convert one JSON tracking-log file into relational form: builds a
    JSONToRelation converter reading from infile, writing to
    <out>/data, and runs the conversion with an EdXTrackLogJSONParser.

    @param out: directory path receiving transform.log and the data output
    @param infile: path/URI of the JSON tracking-log input
    @param modulestore: modulestore handle passed to the track-log parser
    '''
    # NOTE(review): 'format' below resolves to the Python *builtin*
    # function, not an OutputDisposition format constant -- this looks
    # like a bug; confirm the intended output format value.
    # NOTE(review): logfile is opened but never handed to the converter
    # (presumably meant as a logFile=... argument) -- verify.
    logfile = open(out + '/transform.log', 'w')
    try:
        outfile = OutputFile(out + '/data', format, options='wb')
        parser = JSONToRelation(InURI(infile),
                                outfile,
                                mainTableName='EdxTrackEvent')
        parser.setParser(EdXTrackLogJSONParser(parser,
                                               'EdxTrackEvent',
                                               dbName='Edx',
                                               moduleStore=modulestore))
        parser.convert()
    finally:
        # Fix: the log file handle used to leak; close it even on error.
        logfile.close()
def computeAnonFromScreenNames(self, extIntNameFileName):
    '''
    Read a CSV file of (ext_id, int_id, screen_name) triplets and print
    to stdout a CSV mapping of ext_id to the anonymizing hash of the
    screen name. The input file's first line is treated as a header and
    skipped. A screen name of \\N (MySQL NULL export) maps to 'NULL'.

    @param extIntNameFileName: path to the ext-id/int-id/screen-name CSV
    '''
    with open(extIntNameFileName, 'r') as inFd:
        print('ext_id,anon_screen_name')
        # Discard the header row before processing data lines:
        next(inFd, None)
        for line in inFd:
            # Unpacking enforces exactly three comma-separated fields,
            # as the original row format requires:
            (extId, intId, screenName) = line.split(',')  #@UnusedVariable
            screenName = screenName.strip().strip('"')
            if screenName == '\\N':
                print('%s,%s' % (extId.strip('"'), 'NULL'))
            else:
                print('%s,%s' % (extId.strip('"'), EdXTrackLogJSONParser.makeHash(screenName)))
def computeAndAdd(self):
    '''
    The heavy lifting: reads all TSV rows from the
    certificates_generatedcertificate table into memory. The
    screenNamePos passed into the __init__() method is used to find
    each row's user screen name. That name is hashed and appended to
    the row as a new anon_screen_name column. Ill-formed rows are
    logged to self.logFile and dropped from processing (but kept in
    the rewritten file unmodified).
    '''
    with open(self.logFile, 'a') as logFd:
        with open(self.tsvFileName, 'r') as tsvFd:
            allRows = tsvFd.readlines()
        # allRows[0] is the header; data rows start at allRows[1]:
        for (i, row) in enumerate(allRows[1:]):
            colVals = row.split('\t')
            # Each line's last TSV value element has
            # a \n glued to it. Get rid of that:
            colVals[-1] = colVals[-1].strip()
            # Pick the screen name out of the row:
            try:
                screenName = colVals[self.screenNamePos]
            except IndexError:
                logMsg = "Ill formatted row number %d in user grade file %s: '%s' (dropping this row)\n" % \
                    (i, self.tsvFileName, str(row).strip())
                logFd.write(logMsg)
                logFd.flush()
                continue
            # Add the new last element, including
            # the trailing \n:
            colVals.append(EdXTrackLogJSONParser.makeHash(screenName) + '\n')
            # Write the array back into allRows. The '+1'
            # is b/c the enumeration above starts i at 0,
            # while allRows[1:] starts with the 2nd row,
            # the one after the header.
            # Fix: string.join() exists only on Python 2; str.join()
            # is its exact equivalent there and also works on Python 3:
            allRows[i + 1] = '\t'.join(colVals)
        # The first (header column names) row needs to
        # have the new column appended to it after
        # again stripping the newline off the last
        # column name, and tagging it onto the
        # new last col name:
        colNames = allRows[0].split('\t')
        colNames[-1] = colNames[-1].strip()
        colNames.append('anon_screen_name\n')
        allRows[0] = '\t'.join(colNames)
        # Write the new TSV back into the file:
        with open(self.tsvFileName, 'w') as tsvFd:
            tsvFd.writelines(allRows)
def computeAndAdd(self):
    '''
    The heavy lifting: reads all TSV rows from the
    certificates_generatedcertificate table into memory. The
    screenNamePos passed into the __init__() method is used to find
    each row's user screen name. That name is hashed and appended to
    the row as a new anon_screen_name column. Ill-formed rows are
    logged to self.logFile and dropped from processing (but kept in
    the rewritten file unmodified).
    '''
    with open(self.logFile, 'a') as logFd:
        with open(self.tsvFileName, 'r') as tsvFd:
            allRows = tsvFd.readlines()
        # allRows[0] is the header; data rows start at allRows[1]:
        for (i, row) in enumerate(allRows[1:]):
            colVals = row.split('\t')
            # Each line's last TSV value element has
            # a \n glued to it. Get rid of that:
            colVals[-1] = colVals[-1].strip()
            # Pick the screen name out of the row:
            try:
                screenName = colVals[self.screenNamePos]
            except IndexError:
                logMsg = "Ill formatted row number %d in user grade file %s: '%s' (dropping this row)\n" % \
                    (i, self.tsvFileName, str(row).strip())
                logFd.write(logMsg)
                logFd.flush()
                continue
            # Add the new last element, including
            # the trailing \n:
            colVals.append(EdXTrackLogJSONParser.makeHash(screenName) + '\n')
            # Write the array back into allRows. The '+1'
            # is b/c the enumeration above starts i at 0,
            # while allRows[1:] starts with the 2nd row,
            # the one after the header.
            # Fix: string.join() exists only on Python 2; str.join()
            # is its exact equivalent there and also works on Python 3:
            allRows[i + 1] = '\t'.join(colVals)
        # The first (header column names) row needs to
        # have the new column appended to it after
        # again stripping the newline off the last
        # column name, and tagging it onto the
        # new last col name:
        colNames = allRows[0].split('\t')
        colNames[-1] = colNames[-1].strip()
        colNames.append('anon_screen_name\n')
        allRows[0] = '\t'.join(colNames)
        # Write the new TSV back into the file:
        with open(self.tsvFileName, 'w') as tsvFd:
            tsvFd.writelines(allRows)
def computeAnonFromScreenNames(self, extIntNameFileName):
    '''
    Emit to stdout a CSV mapping of ext_id to the anonymizing hash of
    the corresponding screen name, given a CSV input file of
    (ext_id, int_id, screen_name) triplets. The first input line is a
    header and is skipped; a screen name of \\N (MySQL NULL export)
    maps to the literal 'NULL'.

    @param extIntNameFileName: path to the ext-id/int-id/screen-name CSV
    '''
    with open(extIntNameFileName, 'r') as inFd:
        print('ext_id,anon_screen_name')
        # Consume the header row up front instead of flag-tracking it:
        next(inFd, None)
        for line in inFd:
            # Tuple unpacking enforces exactly three CSV fields per row;
            # the int_id field is intentionally unused:
            (extId, intId, screenName) = line.split(',')  #@UnusedVariable
            cleanName = screenName.strip().strip('"')
            if cleanName == '\\N':
                print('%s,%s' % (extId.strip('"'), 'NULL'))
            else:
                print('%s,%s' % (extId.strip('"'), EdXTrackLogJSONParser.makeHash(cleanName)))
outFullPath, outputFormat, options='wb') # overwrite any sql file that's there jsonConverter = JSONToRelation(InURI(args.inFilePath), outSQLFile, mainTableName='EdxTrackEvent', logFile=logFile) try: # Setting useDisplayNameCache to True prevents guaranteed # pulling of Modulestore from the backup---and expensive # operation. Note that cronRefreshModulestore.sh will # cause the cache to be refreshed: jsonConverter.setParser( EdXTrackLogJSONParser(jsonConverter, 'EdxTrackEvent', replaceTables=args.dropTables, dbName='Edx', useDisplayNameCache=True)) except Exception as e: with open(logFile, 'w') as fd: fd.write( "In json2sql: could not create EdXTrackLogJSONParser; infile: %s; outfile: %s; logfile: %s (%s)" % (InURI(args.inFilePath), outSQLFile, logFile, ` e `)) # Try to delete the .sql file that was created when # the OutputFile instance was made in the JSONToRelation # instantiation statement above: try: outSQLFile.remove() except Exception as e: pass sys.exit(1)
'''
Created on Dec 18, 2013

Given any number of user screen names---simple strings---
to stdin, emit corresponding hashes as used to generate
column anon_screen_name.

@author: paepcke
'''
# Add json_to_relation source dir to $PATH
# for duration of this execution:
import os
import sys

source_dir = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "../json_to_relation/")]
source_dir.extend(sys.path)
sys.path = source_dir

from edxTrackLogJSONParser import EdXTrackLogJSONParser

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: makeAnonScreenName {str1 str2 ... | -} # Use dash to read input strings from stdin, e.g. from a pipe")
        sys.exit(1)
    if sys.argv[1] == '-':
        for screenName in sys.stdin:
            # Fix: lines read from stdin keep their trailing newline, so
            # the hash would differ from the same name passed on the
            # command line; strip the line terminator before hashing
            # (computeAnonFromScreenNames likewise strips names before
            # calling makeHash):
            print(EdXTrackLogJSONParser.makeHash(screenName.rstrip('\n')))
    else:
        for screenName in sys.argv[1:]:
            print(EdXTrackLogJSONParser.makeHash(screenName))
# Make sure the log directory is usable before the run.
# NOTE(review): os.access() returns False both when logDir does not
# exist and when it exists but is unwritable; in the latter case
# makedirs() will raise because the directory already exists. An
# os.path.exists() check may be what was intended -- confirm.
if not os.access(logDir, os.W_OK):
    os.makedirs(logDir)
# Per-run log file: j2s_<input file base name>_<timestamp>.log
logFile = os.path.join(logDir, 'j2s_%s_%s.log' % (os.path.basename(args.inFilePath), fileStamp))
# print('xpunge: %s' % args.dropTables)
# print('verbose: %s' % args.verbose)
# print('destDir: %s' % args.destDir)
# print('in=FilePath: %s' % args.inFilePath)
# print('outFullPath: %s' % outFullPath)
# print('logFile: %s' % logFile)
# Create an instance of JSONToRelation, taking input from the given file:
# and pumping output to the given output path:
jsonConverter = JSONToRelation(InURI(args.inFilePath),
                               # options='wb' overwrites any sql file that's there:
                               OutputFile(outFullPath,
                                          OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS,
                                          options='wb'),
                               mainTableName='EdxTrackEvent',
                               logFile=logFile)
jsonConverter.setParser(EdXTrackLogJSONParser(jsonConverter,
                                              'EdxTrackEvent',
                                              replaceTables=args.dropTables,
                                              dbName='Edx'))
jsonConverter.convert()
import os
import sys

# Put ../json_to_relation/ ahead of everything else on the module
# search path for the duration of this run:
sys.path = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "../json_to_relation/")] + sys.path

from json_to_relation import JSONToRelation
from output_disposition import OutputPipe, OutputDisposition
from input_source import InPipe
from edxTrackLogJSONParser import EdXTrackLogJSONParser

if __name__ == "__main__":
    # Wire up a converter that reads JSON tracking events from stdin
    # and emits SQL INSERT statements on stdout:
    converter = JSONToRelation(InPipe(),
                               OutputPipe(OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS),
                               mainTableName='EdxTrackEvent',
                               logFile='/tmp/j2s.log')
    converter.setParser(EdXTrackLogJSONParser(converter,
                                              'EdxTrackEvent',
                                              replaceTables=True,
                                              dbName='test'))
    converter.convert()
anon_screen_name. @author: paepcke ''' # Add json_to_relation source dir to $PATH # for duration of this execution: import os import sys source_dir = [ os.path.join(os.path.dirname(os.path.abspath(__file__)), "../json_to_relation/") ] source_dir.extend(sys.path) sys.path = source_dir from edxTrackLogJSONParser import EdXTrackLogJSONParser if __name__ == '__main__': if len(sys.argv) < 2: print( "Usage: makeAnonScreenName {str1 str2 ... | -} # Use dash to read input strings from stdin, e.g. from a pipe" ) sys.exit(1) if sys.argv[1] == '-': for screenName in sys.stdin: print(EdXTrackLogJSONParser.makeHash(screenName)) else: for screenName in sys.argv[1:]: print(EdXTrackLogJSONParser.makeHash(screenName))