Example #1
def parse(out, infile, modulestore):
    logfile = open(out + '/transform.log', 'w')
    # 'format' is assumed to be an OutputDisposition.OutputFormat constant
    # available from the enclosing scope (see the later examples):
    outfile = OutputFile(out + '/data', format, options='wb')

    parser = JSONToRelation(InURI(infile),
                            outfile,
                            mainTableName='EdxTrackEvent')
    parser.setParser(
        EdXTrackLogJSONParser(parser,
                              'EdxTrackEvent',
                              dbName='Edx',
                              moduleStore=modulestore))
    parser.convert()
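A minimal sketch of how the fragment above might be driven, assuming the enclosing module provides OutputFile, InURI, JSONToRelation, EdXTrackLogJSONParser, and an output-format constant bound to the name format; all paths are hypothetical:

if __name__ == '__main__':
    # Transform one tracking-log file, writing the result and a transform.log
    # into /tmp/xform_out (hypothetical paths):
    parse('/tmp/xform_out',
          '/tmp/tracking.log-sample.json',
          '/tmp/modulestore_dump.json')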
    def computeAnonFromScreenNames(self, extIntNameFileName):
        '''
        Read <ext_id,int_id,screen_name> CSV rows from extIntNameFileName
        and print corresponding <ext_id,anon_screen_name> rows to stdout,
        hashing each screen name via EdXTrackLogJSONParser.makeHash().
        '''
        with open(extIntNameFileName, 'r') as inFd:
            print('ext_id,anon_screen_name')
            firstLineDiscarded = False
            for line in inFd:
                (extId, intId, screenName) = line.split(',')  #@UnusedVariable
                #********
                #print('ScreenName.strip(\'"\'): \'%s\'' % screenName.strip().strip('"'))
                #********
                if firstLineDiscarded:
                    screenName = screenName.strip().strip('"')
                    if screenName == '\\N':
                        # MySQL exports NULL values as \N:
                        print('%s,%s' % (extId.strip('"'), 'NULL'))
                    else:
                        print('%s,%s' % (extId.strip('"'),
                                         EdXTrackLogJSONParser.makeHash(screenName)))
                else:
                    # The first line is the header row; skip it:
                    firstLineDiscarded = True
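For a concrete picture of the CSV flow, the following self-contained sketch repeats the same steps over an in-memory list; hashlib.sha1 is only a stand-in for EdXTrackLogJSONParser.makeHash(), whose actual algorithm is not shown in these examples, and the input rows are invented:

import hashlib

def demoAnonFromScreenNames(lines):
    # Same flow as computeAnonFromScreenNames(), but over an in-memory list
    # and with a placeholder hash instead of makeHash():
    print('ext_id,anon_screen_name')
    for line in lines[1:]:                    # skip the header row
        (extId, intId, screenName) = line.split(',')
        screenName = screenName.strip().strip('"')
        if screenName == '\\N':               # MySQL's NULL marker
            print('%s,NULL' % extId.strip('"'))
        else:
            print('%s,%s' % (extId.strip('"'),
                             hashlib.sha1(screenName.encode('utf8')).hexdigest()))

demoAnonFromScreenNames(['ext_id,int_id,screen_name',
                         '"ext0001","11","jdoe"',
                         '"ext0002","12",\\N'])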
Example #3
    def computeAndAdd(self):
        '''
        The heavy lifting: reads all TSV rows from the certificates_generatedcertificate
        table into memory. The screenNamePos passed into the __init__() method
        is used to find the row's user screen name. That name is hashed
        and appended to the row as a new anon_screen_name column.
        '''
        with open(self.logFile, 'a') as logFd:
            with open(self.tsvFileName, 'r') as tsvFd:
                allRows = tsvFd.readlines()
            for (i, row) in enumerate(allRows[1:]):
                colVals = row.split('\t')
                # Each line's last TSV value element has
                # a \n glued to it. Get rid of that:
                colVals[-1] = colVals[-1].strip()
                # Pick the screen name out of the row:
                try:
                    screenName = colVals[self.screenNamePos]
                except IndexError:
                    logMsg = "Ill-formatted row number %d in user grade file %s: '%s' (dropping this row)\n" % \
                                (int(i), self.tsvFileName, str(row).strip())
                    logFd.write(logMsg)
                    logFd.flush()
                    continue
                # Add the new last element, including
                # the trailing \n:
                colVals.append(
                    EdXTrackLogJSONParser.makeHash(screenName) + '\n')
                # Write the array back into allRows. The '+1'
                # is because the enumeration above starts i at 0,
                # while allRows[1:] starts with the 2nd row,
                # the one after the header:
                allRows[i + 1] = '\t'.join(colVals)

            # The first (header column names) row needs to
            # have the new column appended to it after
            # again stripping the newline off the last
            # column name, and tagging it onto the
            # new last col name:
            colNames = allRows[0].split('\t')
            colNames[-1] = colNames[-1].strip()
            colNames.append('anon_screen_name\n')
            allRows[0] = '\t'.join(colNames)
            # Write the new TSV back into the file:
            with open(self.tsvFileName, 'w') as tsvFd:
                tsvFd.writelines(allRows)
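To see the header and row bookkeeping in isolation, here is a toy run over a two-line, in-memory TSV; the column names and the fakeHash() placeholder are invented and are not the real certificates_generatedcertificate schema or makeHash():

rows = ['user_id\tgrade\tname\n',
        '42\tPass\tjdoe\n']
screenNamePos = 2                      # position of the screen-name column

# Append the new column name to the header row:
colNames = rows[0].split('\t')
colNames[-1] = colNames[-1].strip()
colNames.append('anon_screen_name\n')
rows[0] = '\t'.join(colNames)

# Append the (placeholder) hash to each data row:
for (i, row) in enumerate(rows[1:]):
    colVals = row.split('\t')
    colVals[-1] = colVals[-1].strip()
    colVals.append('fakeHash(%s)\n' % colVals[screenNamePos])
    rows[i + 1] = '\t'.join(colVals)

print(''.join(rows))   # both rows now end with the new anon_screen_name column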
Example #6
    outSQLFile = OutputFile(outFullPath,
                            outputFormat,
                            options='wb')  # overwrite any sql file that's there
    jsonConverter = JSONToRelation(InURI(args.inFilePath),
                                   outSQLFile,
                                   mainTableName='EdxTrackEvent',
                                   logFile=logFile)
    try:
        # Setting useDisplayNameCache to True prevents the otherwise
        # guaranteed pulling of the modulestore from the backup, an
        # expensive operation. Note that cronRefreshModulestore.sh will
        # cause the cache to be refreshed:

        jsonConverter.setParser(
            EdXTrackLogJSONParser(jsonConverter,
                                  'EdxTrackEvent',
                                  replaceTables=args.dropTables,
                                  dbName='Edx',
                                  useDisplayNameCache=True))
    except Exception as e:
        with open(logFile, 'w') as fd:
            fd.write(
                "In json2sql: could not create EdXTrackLogJSONParser; infile: %s; outfile: %s; logfile: %s (%s)"
                % (InURI(args.inFilePath), outSQLFile, logFile, str(e)))
        # Try to delete the .sql file that was created when
        # the OutputFile instance was made in the JSONToRelation
        # instantiation statement above:
        try:
            outSQLFile.remove()
        except Exception:
            pass
        sys.exit(1)
Example #8
    # Make sure the log directory exists and is writable:
    if not os.access(logDir, os.W_OK):
        os.makedirs(logDir)

    logFile = os.path.join(
        logDir,
        'j2s_%s_%s.log' % (os.path.basename(args.inFilePath), fileStamp))

    #    print('xpunge: %s' % args.dropTables)
    #    print('verbose: %s' % args.verbose)
    #    print('destDir: %s' % args.destDir)
    #    print('in=FilePath: %s' % args.inFilePath)
    #    print('outFullPath: %s' % outFullPath)
    #    print('logFile: %s' % logFile)

    # Create an instance of JSONToRelation, taking input from the given file
    # and pumping output to the given output path:

    jsonConverter = JSONToRelation(
        InURI(args.inFilePath),
        OutputFile(outFullPath,
                   OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS,
                   options='wb'),  # overwrite any sql file that's there
        mainTableName='EdxTrackEvent',
        logFile=logFile)
    jsonConverter.setParser(
        EdXTrackLogJSONParser(jsonConverter,
                              'EdxTrackEvent',
                              replaceTables=args.dropTables,
                              dbName='Edx'))
    jsonConverter.convert()
Example #9
import sys
import os

source_dir = [
    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                 "../json_to_relation/")
]
source_dir.extend(sys.path)
sys.path = source_dir

from json_to_relation import JSONToRelation
from output_disposition import OutputPipe, OutputDisposition
from input_source import InPipe
from edxTrackLogJSONParser import EdXTrackLogJSONParser

if __name__ == "__main__":

    # Create an instance of JSONToRelation, taking input from stdin,
    # and pumping output to stdout. Format output as SQL dump statements.
    jsonConverter = JSONToRelation(
        InPipe(),
        OutputPipe(OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS),
        mainTableName='EdxTrackEvent',
        logFile='/tmp/j2s.log')
    jsonConverter.setParser(
        EdXTrackLogJSONParser(jsonConverter,
                              'EdxTrackEvent',
                              replaceTables=True,
                              dbName='test'))
    jsonConverter.convert()
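Because this converter reads from stdin and writes SQL to stdout, it is meant to sit in a pipeline. A minimal sketch of wiring it up from Python, assuming the module above were saved as json2sql_pipe.py (the script and file names are hypothetical):

import subprocess

# Feed a tracking-log JSON file into the pipe-based converter and capture
# the emitted SQL INSERT statements in a file (names are hypothetical):
with open('tracking.log-sample.json', 'rb') as inFd, \
     open('tracking_sample.sql', 'wb') as outFd:
    subprocess.check_call(['python', 'json2sql_pipe.py'],
                          stdin=inFd, stdout=outFd)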
Example #10
'''
Created on Dec 18, 2013
Given any number of user screen names (simple strings) on stdin,
emit the corresponding hashes used to generate column
anon_screen_name.

@author: paepcke
'''
# Add json_to_relation source dir to sys.path
# for the duration of this execution:
import os
import sys

source_dir = [
    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                 "../json_to_relation/")
]
source_dir.extend(sys.path)
sys.path = source_dir

from edxTrackLogJSONParser import EdXTrackLogJSONParser

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(
            "Usage: makeAnonScreenName {str1 str2 ... | -} # Use dash to read input strings from stdin, e.g. from a pipe"
        )
        sys.exit(1)
    if sys.argv[1] == '-':
        for screenName in sys.stdin:
            print(EdXTrackLogJSONParser.makeHash(screenName))
    else:
        for screenName in sys.argv[1:]:
            print(EdXTrackLogJSONParser.makeHash(screenName))
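A hypothetical way to drive this script from another Python program, using its '-' stdin mode; the file name makeAnonScreenName.py and the screen names are assumptions:

from subprocess import Popen, PIPE

# Pipe two (invented) screen names into the script's '-' mode and read the
# resulting hashes, one per line, from its stdout:
proc = Popen(['python', 'makeAnonScreenName.py', '-'], stdin=PIPE, stdout=PIPE)
hashes, _ = proc.communicate(b'jdoe\nmsmith\n')
print(hashes.decode('utf8'))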