Esempio n. 1
0
def create_frame_from_file(file_name):
    """Load a file of JSON lines into an SFrame, save it and return it.

    Every line of ``file_name`` is parsed with ``json.loads``. The values
    stored under ``'dt'``, ``'ip'``, ``'py'`` and ``'user_script'`` are
    spliced into the matching column lists (so they are presumably
    lists themselves -- verify against the data file), and the
    zero-based line number becomes the ``id`` column. The finished
    frame is written to ``'python_tutor'``.

    A progress fraction, relative to a hard-coded total of 220000
    lines, is printed every 100 lines.
    """
    n_total_lines = 220000
    sf = SFrame()

    record_ids = []
    timestamps = []
    addresses = []
    versions = []
    scripts = []

    with open(file_name) as data:
        for line_no, raw_line in enumerate(data):
            record = json.loads(raw_line)
            timestamps.extend(record['dt'])
            addresses.extend(record['ip'])
            versions.extend(record['py'])
            record_ids.append(line_no)
            scripts.extend(record['user_script'])

            # Progress report on every hundredth line.
            if not line_no % 100:
                print(float(line_no) / n_total_lines)

        sf = sf.add_column(SArray(record_ids), name='id')
        sf.add_column(SArray(timestamps), name='dt')
        sf.add_column(SArray(addresses), name='ip')
        # The interpreter version is stored as text.
        sf.add_column(SArray(versions, dtype=str), name='py')
        sf.add_column(SArray(scripts), name='user_script')

        sf.save('python_tutor')
    return sf
Esempio n. 2
0
 def get_rating_sf(self, samples, save_to=None):
     sf = SFrame(self.ratings.ix[samples])
     sf['userId'] = sf['userId'].apply(lambda uid: self.user_dict[uid])
     sf['movieId'] = sf['movieId'].apply(lambda mid: self.movie_dict[mid])
     if save_to is not None:
         print "saving sframe to", save_to
         sf.save(save_to)
     return sf
Esempio n. 3
0
 def get_rating_sf(self, samples, save_to=None):
     sf = SFrame(self.ratings.ix[samples])
     sf['userId'] = sf['userId'].apply(lambda uid: self.user_dict[uid])
     sf['movieId'] = sf['movieId'].apply(lambda mid: self.movie_dict[mid])
     if save_to is not None:
         print "saving sframe to", save_to
         sf.save(save_to)
     return sf
Esempio n. 4
0
def process_frame(filename):
    """Summarise the sessions of a saved SFrame, one output row per session.

    The frame at ``filename`` is loaded and, for each candidate session id,
    the rows with that ``session_id`` are selected. The output frame gets
    the session id (``id``), the ``ip`` of the session's first row and the
    number of rows in the session (``sub_count``). The result is saved to
    ``'py2_session_analysis'``; nothing is returned.

    Candidate ids are the integers ``1 .. n_unique - 1``, which assumes the
    session ids are consecutive integers starting at 1 -- TODO confirm
    against the data. Ids that match no rows are skipped.
    """
    sf = gl.load_sframe(filename)

    session_ids = []
    ips = []
    sub_counts = []

    # How many distinct session ids do we have?
    limit = len(sf['session_id'].unique())

    for i in range(1, limit):
        # Test-run cap kept from the original: stop at the first multiple
        # of 100, i.e. only sessions 1..99 are summarised.
        if i % 100 == 0:
            break

        session_frame = sf.filter_by(i, "session_id")
        if len(session_frame) == 0:
            # No rows carry this id; there is nothing to summarise.
            continue

        # Read from the session's own rows. The previous code indexed the
        # full frame (sf[0]), so every output row repeated the first record
        # and sub_count was the number of columns, not of submissions.
        row = session_frame[0]

        session_ids.append(i)
        ips.append(row['ip'])
        sub_counts.append(len(session_frame))

    print(len(session_ids))
    print(len(ips))
    print(len(sub_counts))

    output_frame = SFrame()
    output_frame = output_frame.add_column(SArray(session_ids), name='id')
    output_frame = output_frame.add_column(SArray(ips), name='ip')
    output_frame = output_frame.add_column(SArray(sub_counts),
                                           name='sub_count')

    output_frame.save('py2_session_analysis')
Esempio n. 5
0
def resize_images(filename):
    """Load images from ``filename``, resize them to 32x32 and save them.

    Images are loaded non-recursively, without their paths, in random
    order, ignoring files that fail to load. The ``image`` column is
    resized to 32x32 with 4 channels (decoded), wrapped in an SFrame and
    saved to ``'mini'``. Nothing is returned.
    """
    load_options = dict(format='auto',
                        with_path=False,
                        recursive=False,
                        ignore_failure=True,
                        random_order=True)
    loaded = graphlab.image_analysis.load_images(filename, **load_options)

    resized = graphlab.image_analysis.resize(
        loaded['image'], 32, 32, channels=4, decode=True)

    # The resized column is handed to SFrame inside a one-element list.
    frame = SFrame([resized])
    frame.save('mini')
Esempio n. 6
0
def append_images(json_file):
    """Expand each metadata record into one row per image and save the result.

    ``json_file`` is read as a records-oriented JSON file. For every record,
    each file name listed in its ``images`` entry produces one output row:
    a copy of the record plus an ``image`` column (the image loaded from
    ``images_path + file name``) and an ``image_filename`` column. The
    combined frame is saved to ``'prepared_data/'``; nothing is returned.

    ``images_path`` is a module-level name defined outside this function.
    """
    # All the metadata supplied for the dogs.
    meta = SFrame.read_json(json_file, orient='records')
    # Accumulates the metadata-plus-image rows written to the final file.
    image_list = SFrame(data=None)

    # Visit every record: the previous bound of len(meta) - 1 silently
    # dropped the last one.
    for i in range(len(meta)):
        dogo = meta[i:i + 1]
        # A record without an image list contributes no rows.
        for image in dogo['images'][0] or []:
            dogo_clone = dogo.copy()
            dogo_clone = dogo_clone.add_column(
                SArray([graphlab.Image(images_path + image)]), name='image')
            dogo_clone = dogo_clone.add_column(
                SArray([image]), name='image_filename')
            image_list = image_list.append(dogo_clone)

    image_list.save(filename='prepared_data/')
Esempio n. 7
0
def process_frame(frame_name, max_records=1):
    """Group a saved submission frame into per-IP sessions and save them.

    The frame at ``frame_name`` is loaded and sorted by ``ip`` then ``dt``.
    Each run of consecutive records sharing an ``ip`` becomes one session.
    A session keeps the ``ip`` and ``py`` of its first record and a dict of
    its submissions keyed ``'1'``, ``'2'``, ... in sorted order. Every
    submission is a dict with the keys ``'date-time'``, ``'code_segment'``,
    ``'error_message'`` and ``'error_flag'``.

    The result is saved to ``'test_frame'`` with the string columns
    ``session_id``, ``ip_address`` and ``python_version`` and the dict
    column ``submissions``. Nothing is returned.

    Args:
        frame_name: location handed to ``gl.load_sframe``.
        max_records: number of records to consume before stopping. The
            default of 1 reproduces the debugging stop that used to be
            hard-coded; pass ``None`` to process the whole frame.
    """
    # Load the frame and sort it by IP, then by date-time ascending.
    frame = gl.load_sframe(frame_name)
    sorted_frame = frame.sort(['ip', 'dt'])

    # One entry per session, in order of first appearance. A session is
    # registered as soon as it starts, so the last one cannot be lost (the
    # previous flush-on-IP-change logic never emitted the final session).
    sessions = []
    current = None

    # Iterate rows once instead of indexing each column cell by position.
    for i, row in enumerate(sorted_frame):
        if max_records is not None and i >= max_records:
            print(row['ip'])
            break
        if i % 100 == 0:
            print("processing record:" + str(i))

        if current is None or row['ip'] != current['ip']:
            current = {'ip': row['ip'], 'py': row['py'], 'submissions': {}}
            sessions.append(current)

        submissions_collection = current['submissions']
        submission_key = str(len(submissions_collection) + 1)
        submissions_collection[submission_key] = {
            'date-time': row['dt'],
            'code_segment': row['user_script'],
            'error_message': row['err_msg'],
            'error_flag': row['compile_err'],
        }

    # Build the columns with one element per session. The previous
    # ``column += str(value)`` extended each list character by character,
    # and ``submissions += dict`` added the dict's keys, not the dict.
    session_id = [str(number) for number in range(1, len(sessions) + 1)]
    ip_address = [str(session['ip']) for session in sessions]
    python_version = [str(session['py']) for session in sessions]
    submissions = [session['submissions'] for session in sessions]
    # NOTE(review): ``interest`` is computed, as before, but is not written
    # to the saved frame -- confirm whether a column was intended.
    interest = [str(is_interesting(session['submissions']))
                for session in sessions]

    print(ip_address)
    print(len(session_id))
    print(len(ip_address))
    print(len(python_version))
    print(len(submissions))

    # Finally, create the frame and save it.
    rst = SFrame()
    rst = rst.add_column(SArray(session_id, dtype=str), name='session_id')
    rst = rst.add_column(SArray(ip_address, dtype=str), name='ip_address')
    rst = rst.add_column(SArray(python_version, dtype=str),
                         name='python_version')
    rst = rst.add_column(SArray(submissions, dtype=dict), name='submissions')

    rst.save("test_frame")
Esempio n. 8
0
def get_sf_from_coo(coo, save_to):
    sf = SFrame({'userId': coo.row, 'movieId': coo.col, 'rating': coo.data})
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf
Esempio n. 9
0
def main():
    """Compile every Python 3 submission in the data file and record errors.

    Each line of ``'../../Data/data_file_modified.txt'`` is parsed as JSON.
    Records whose ``py`` entry starts with 3 are kept. Their script is
    passed to ``compile`` and any ``SyntaxError`` is recorded as a flag
    (1/0) plus a message. The resulting frame is saved to
    ``'py3_error_frame_clean'``.

    Data model:
        id | dt | ip | py | user_script | compile_err | err_msg
    """
    # Marker text of the "local and global" SyntaxError, which is replaced
    # by one fixed message.
    pattern = "is local and global"

    with open('../../Data/data_file_modified.txt') as data:
        sf = SFrame()

        record_ids = []
        timestamps = []
        addresses = []
        versions = []
        scripts = []
        compile_flags = []
        compile_messages = []

        for line_no, raw_line in enumerate(data):
            record = json.loads(raw_line)

            # Only Python 3 submissions are handled here; other versions
            # need to be compiled on a different interpreter.
            if record['py'][0] != 3:
                continue

            record_ids.append(line_no)
            timestamps.extend(record['dt'])
            addresses.extend(record['ip'])
            versions.extend(record['py'])
            scripts.extend(record['user_script'])

            # Compile the script and capture any syntax error message.
            try:
                compile(record['user_script'][0], '<string>', 'exec')
            except SyntaxError as err:
                failed = True
                if re.search(pattern, str(err)):
                    message = "Variable is Local and Global"
                else:
                    message = str(err)
            else:
                failed = False
                message = ""

            compile_flags.append(1 if failed else 0)

            # Keep only the text before the first '(' so the trailing
            # "(filename, line number)" part is dropped.
            trimmed = message.partition('(')[0]
            compile_messages.append(trimmed.strip())

        sf = sf.add_column(SArray(record_ids), name='id')
        sf.add_column(SArray(timestamps), name='dt')
        sf.add_column(SArray(addresses), name='ip')
        sf.add_column(SArray(versions, dtype=str), name='py')
        sf.add_column(SArray(scripts), name='user_script')
        sf.add_column(SArray(compile_flags), name='compile_err')
        sf.add_column(SArray(compile_messages), name='err_msg')

        sf.save('py3_error_frame_clean')
Esempio n. 10
0
def get_sf_from_coo(coo, save_to):
    sf = SFrame({'userId': coo.row, 'movieId': coo.col, 'rating': coo.data})
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf