def create_frame_from_file(file_name, n_total_lines=220000):
    """Build an SFrame from a JSON-lines file and save it as 'python_tutor'.

    Every line of the file is parsed with ``json.loads``.  The values stored
    under the 'dt', 'ip', 'py' and 'user_script' keys are concatenated onto
    the matching column lists (presumably each value is a one-element list --
    confirm against the data file), and the zero-based line number becomes
    the 'id' column.

    Args:
        file_name: Path of the JSON-lines file to read.
        n_total_lines: Expected number of lines in the file.  It is only used
            as the denominator of the progress fraction printed every 100
            lines; the default keeps the previously hard-coded 220000.

    Returns:
        The SFrame that was saved to 'python_tutor'.
    """
    record_ids = []
    dt = []
    ip = []
    py = []
    script = []

    with open(file_name) as data:
        for i, line in enumerate(data):
            jo = json.loads(line)
            record_ids.append(i)
            # extend() matches the original ``+=``: the JSON values are
            # spliced into the column, not appended as nested lists.
            dt.extend(jo['dt'])
            ip.extend(jo['ip'])
            py.extend(jo['py'])
            script.extend(jo['user_script'])
            if i % 100 == 0:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(float(i) / n_total_lines)

    # Columns are added one by one so their order in the saved frame is fixed.
    # The result of add_column is always re-bound so the code does not rely
    # on add_column mutating the frame in place.
    sf = SFrame()
    sf = sf.add_column(SArray(record_ids), name='id')
    sf = sf.add_column(SArray(dt), name='dt')
    sf = sf.add_column(SArray(ip), name='ip')
    sf = sf.add_column(SArray(py, dtype=str), name='py')
    sf = sf.add_column(SArray(script), name='user_script')
    sf.save('python_tutor')
    return sf
def get_rating_sf(self, samples, save_to=None):
    """Return the selected ratings as an SFrame with remapped ids.

    Args:
        samples: Indexer handed to ``self.ratings.ix[...]`` to select rows.
        save_to: Optional destination; when not None the SFrame is also
            saved there.

    Returns:
        An SFrame built from the selected ratings in which every 'userId'
        and 'movieId' value has been replaced by its entry in
        ``self.user_dict`` / ``self.movie_dict``.

    Raises:
        KeyError: If an id has no entry in the corresponding dict
            (presumably surfaced through ``SArray.apply`` -- confirm).
    """
    # NOTE(review): ``DataFrame.ix`` was removed in pandas 1.0.  It is kept
    # here because switching to ``.loc`` / ``.iloc`` could change which rows
    # are selected; confirm the index type of ``self.ratings`` first.
    sf = SFrame(self.ratings.ix[samples])

    # Bind the lookup tables locally so the lambdas close over the dicts
    # only, not over ``self``.
    user_dict = self.user_dict
    movie_dict = self.movie_dict
    sf['userId'] = sf['userId'].apply(lambda uid: user_dict[uid])
    sf['movieId'] = sf['movieId'].apply(lambda mid: movie_dict[mid])

    if save_to is not None:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("saving sframe to %s" % (save_to,))
        sf.save(save_to)
    return sf
def process_frame(filename):
    """Summarise sessions of a saved SFrame into 'py2_session_analysis'.

    For each candidate session id the rows with that 'session_id' are
    selected, and the id, the 'ip' of the first selected row and the number
    of selected rows are recorded.  The three lists are printed by length
    and written out as the columns 'id', 'ip' and 'sub_count'.

    Args:
        filename: Path of the SFrame to load with ``gl.load_sframe``.

    Returns:
        None.  The result is saved to 'py2_session_analysis'.
    """
    sf = gl.load_sframe(filename)

    session_ids = []
    ips = []
    sub_counts = []

    # Number of distinct session ids; candidate ids are 1 .. limit - 1.
    # NOTE(review): this assumes session ids are consecutive integers
    # starting at 1 -- confirm, otherwise the last id may be missed.
    limit = len(sf['session_id'].unique())

    for i in range(1, limit):
        # Test cap: stop once session id 100 is reached.
        if i % 100 == 0:
            break

        session_frame = sf.filter_by(i, "session_id")
        if len(session_frame) == 0:
            # No rows carry this id; there is nothing to summarise.
            continue

        # Read from the filtered session, not from the whole frame:
        # ``sf[0]`` would give every session the first row's ip, and
        # ``len(row)`` would count columns instead of submissions.
        row = session_frame[0]
        session_ids.append(i)
        ips.append(row['ip'])
        sub_counts.append(len(session_frame))

    # TODO: per-session time, error-count and error-sequence metrics are not
    # computed yet (fn_time_count, fn_error_count, fn_error_sequence_raw).

    print(len(session_ids))
    print(len(ips))
    print(len(sub_counts))

    output_frame = SFrame()
    output_frame = output_frame.add_column(SArray(session_ids), name='id')
    output_frame = output_frame.add_column(SArray(ips), name='ip')
    output_frame = output_frame.add_column(SArray(sub_counts), name='sub_count')
    output_frame.save('py2_session_analysis')
def resize_images(filename):
    """Load images from ``filename``, resize them and save the result as 'mini'.

    The images are loaded with ``graphlab.image_analysis.load_images``
    (non-recursive, without paths, failures ignored, random order), the
    'image' column is resized to 32 x 32 with 4 channels and decoding
    enabled, and the resized column is stored in a new SFrame saved as
    'mini'.

    Args:
        filename: Location handed to ``load_images``.

    Returns:
        None.
    """
    loaded = graphlab.image_analysis.load_images(
        filename,
        format='auto',
        with_path=False,
        recursive=False,
        ignore_failure=True,
        random_order=True,
    )
    resized = graphlab.image_analysis.resize(
        loaded['image'], 32, 32, channels=4, decode=True)

    # The SFrame is built from a one-element list holding the resized column.
    SFrame([resized]).save('mini')
def append_images(json_file):
    """Expand a metadata JSON file into one row per image and save it.

    The metadata is read with ``SFrame.read_json(..., orient='records')``.
    For every metadata row, each entry of its 'images' list produces one
    output row: a copy of the metadata row plus an 'image' column (a
    ``graphlab.Image`` loaded from ``images_path + <entry>``) and an
    'image_filename' column holding the entry itself.  The combined frame is
    saved to 'prepared_data/'.

    Args:
        json_file: Path of the JSON metadata file.

    Returns:
        None.
    """
    meta = SFrame.read_json(json_file, orient='records')

    # Accumulates the expanded rows; starts out empty.
    image_list = SFrame(data=None)

    # Visit every metadata row, including the last one (the previous
    # ``len(meta) - 1`` bound silently dropped it).
    for i in range(len(meta)):
        # A one-row slice keeps the row as an SFrame so columns can be added.
        dogo = meta[i:i + 1]
        for image in dogo['images'][0]:
            dogo_clone = dogo.copy()
            # ``images_path`` is a module-level name defined outside this
            # function.
            dogo_clone.add_column(
                SArray([graphlab.Image(images_path + image)]), name='image')
            dogo_clone.add_column(SArray([image]), name='image_filename')
            image_list = image_list.append(dogo_clone)

    image_list.save(filename='prepared_data/')
def process_frame(frame_name):
    """Group a saved submission frame into per-IP sessions and save 'test_frame'.

    The frame is loaded, sorted by 'ip' and then 'dt', and walked row by row.
    Consecutive rows sharing an 'ip' form one session.  Each session yields
    one output row with:

    * 'session_id'     -- 1-based running number, as a string
    * 'ip_address'     -- the session's ip, as a string
    * 'python_version' -- the 'py' value of the session's first row, as a
                          string
    * 'submissions'    -- dict keyed "1", "2", ... whose values are dicts
                          with the keys 'date-time', 'code_segment',
                          'error_message' and 'error_flag'

    Args:
        frame_name: Path of the SFrame to load with ``gl.load_sframe``.

    Returns:
        None.  The result is saved to "test_frame".
    """
    frame = gl.load_sframe(frame_name)
    sorted_frame = frame.sort(['ip', 'dt'])

    session_id = []
    ip_address = []
    python_version = []
    interest = []
    submissions = []

    def close_session(ip, py, collected):
        # Record one finished session.  append() is required here: ``+=``
        # with a string would add one list element per character, and
        # ``+=`` with a dict would add only its keys.
        session_id.append(str(len(session_id) + 1))
        ip_address.append(str(ip))
        python_version.append(str(py))
        # NOTE(review): ``interest`` is collected but never written to the
        # output frame; the saved schema is left as it was.
        interest.append(str(is_interesting(collected)))
        submissions.append(collected)

    current_ip = None
    current_py = None
    collected = {}

    # No early exit after the first record: every row is processed.
    for i, row in enumerate(sorted_frame):
        if i % 100 == 0:
            print("processing record:" + str(i))

        # ``i == 0`` marks the first session explicitly instead of relying
        # on a sentinel ip value.
        if i == 0 or row['ip'] != current_ip:
            if i > 0:
                close_session(current_ip, current_py, collected)
            current_ip = row['ip']
            current_py = row['py']
            collected = {}

        # Submissions are keyed by their 1-based position in the session.
        collected[str(len(collected) + 1)] = {
            'date-time': row['dt'],
            'code_segment': row['user_script'],
            'error_message': row['err_msg'],
            'error_flag': row['compile_err'],
        }

    # The last session has no following ip change to trigger its flush.
    if collected:
        close_session(current_ip, current_py, collected)

    # Finally, create the frame and save it.
    print(ip_address)
    print(len(session_id))
    print(len(ip_address))
    print(len(python_version))
    print(len(submissions))

    rst = SFrame()
    rst = rst.add_column(SArray(session_id, dtype=str), name='session_id')
    rst = rst.add_column(SArray(ip_address, dtype=str), name='ip_address')
    rst = rst.add_column(
        SArray(python_version, dtype=str), name='python_version')
    rst = rst.add_column(SArray(submissions, dtype=dict), name='submissions')
    rst.save("test_frame")
def get_sf_from_coo(coo, save_to=None):
    """Build a (userId, movieId, rating) SFrame from a COO-style matrix.

    Args:
        coo: Object exposing ``row``, ``col`` and ``data`` attributes
            (presumably a ``scipy.sparse`` COO matrix -- confirm with the
            caller).  They become the 'userId', 'movieId' and 'rating'
            columns respectively.
        save_to: Optional destination; when not None the SFrame is also
            saved there.  Defaults to None, matching ``get_rating_sf``.

    Returns:
        The constructed SFrame.
    """
    sf = SFrame({
        'userId': coo.row,
        'movieId': coo.col,
        'rating': coo.data,
    })
    if save_to is not None:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("saving sframe to %s" % (save_to,))
        sf.save(save_to)
    return sf
def _compile_error(source):
    """Try to compile ``source`` and return ``(error_flag, message)``.

    ``error_flag`` is 1 when ``compile`` raised a SyntaxError and 0
    otherwise.  ``message`` is "" on success; on failure it is the exception
    text cut at the first '(' and stripped, which removes the trailing
    "(filename, line N)" part.  Any text containing "is local and global" is
    normalised to "Variable is Local and Global".
    """
    try:
        compile(source, '<string>', 'exec')
    except SyntaxError as e:
        text = str(e)
        if "is local and global" in text:
            text = "Variable is Local and Global"
        return 1, text.partition('(')[0].strip()
    return 0, ""


def main():
    """Compile-check the Python 3 records and save 'py3_error_frame_clean'.

    Reads '../../Data/data_file_modified.txt' as JSON lines.  Only records
    whose ``jo['py'][0]`` equals 3 are kept.  For each kept record the
    first 'user_script' entry is passed to ``compile`` and the outcome is
    stored alongside the record.

    Output columns:
        id | dt | ip | py | user_script | compile_err | err_msg

    'id' is the zero-based line number in the input file, 'compile_err' is
    1 or 0 and 'err_msg' is the cleaned error text ("" when compilation
    succeeded).
    """
    record_ids = []
    dt = []
    ip = []
    py = []
    script = []
    error = []
    error_msg = []

    with open('../../Data/data_file_modified.txt') as data:
        for i, line in enumerate(data):
            jo = json.loads(line)

            # Only version-3 records are handled here; other versions need
            # to be compiled on a different interpreter.
            if jo['py'][0] != 3:
                continue

            record_ids.append(i)
            # extend() matches the original ``+=``: the JSON values are
            # spliced into the column, not appended as nested lists.
            dt.extend(jo['dt'])
            ip.extend(jo['ip'])
            py.extend(jo['py'])
            script.extend(jo['user_script'])

            flag, msg = _compile_error(jo['user_script'][0])
            error.append(flag)
            error_msg.append(msg)

    # Columns are added one by one so their order in the saved frame is fixed.
    sf = SFrame()
    sf = sf.add_column(SArray(record_ids), name='id')
    sf = sf.add_column(SArray(dt), name='dt')
    sf = sf.add_column(SArray(ip), name='ip')
    sf = sf.add_column(SArray(py, dtype=str), name='py')
    sf = sf.add_column(SArray(script), name='user_script')
    sf = sf.add_column(SArray(error), name='compile_err')
    sf = sf.add_column(SArray(error_msg), name='err_msg')
    sf.save('py3_error_frame_clean')