def __init__(self, file):
    fileRead = open(file, 'r')
    self.delta = dict()
    definitions = fileRead.read().replace(' ', '').replace('\n', '').replace('\t', '').split('.')
    del definitions[-1]
    for definition in definitions:
        if (definition.split(':')[0][0] == 'T'):
            transitionsString = definition.replace(':=', '=').split('T:')[1].replace('},', '*').replace('}', '*').split('*')
            for transition in transitionsString:
                if (transition != ''):
                    transition = transition.split('={')
                    state = transition[0][0]
                    simbol = transition[0][2]
                    targets = set(transition[1].split(','))
                    deltaAux = dict([(simbol, targets)])
                    if self.delta.has_key(state):
                        self.delta[state][simbol] = targets
                    else:
                        self.delta[state] = deltaAux
        elif (definition.split(':')[0][0] == 'I'):
            self.q0 = definition.split(':')[1].split(',')[0]
        elif (definition.split(':')[0][0] == 'F'):
            self.F = definition.split(':')[1].split(',')
    fileRead.close()
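# --- Hedged usage sketch (assumption, not from the original project) ---
# The file format below is reconstructed from the parser above: definitions end with '.',
# 'T:' introduces transitions of the form "<state>,<symbol>={targets}", 'I:' names the
# initial state and 'F:' the final states. The owning class is unknown here, so the
# construction step is shown as a comment only.
if __name__ == '__main__':
    with open('example_nfa.txt', 'w') as handle:
        handle.write("T:p,a={p,q},q,b={r}.\nI:p.\nF:r.\n")
    # nfa = Automaton('example_nfa.txt')   # 'Automaton' is a hypothetical class name
    # nfa.delta -> {'p': {'a': set(['p', 'q'])}, 'q': {'b': set(['r'])}}
    # nfa.q0    -> 'p'
    # nfa.F     -> ['r']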
def post(self, bool, bytes, dict, float, int, list, null, unicode):
    return __builtin__.dict(
        zip(
            ("bool", "bytes", "dict", "float", "int", "list", "null", "unicode"),
            (bool, bytes, dict, float, int, list, null, unicode),
        )
    )
def __init__(self, config, configAbsPath):
    # A dictionary containing the ioHub configuration file settings
    self.ioHubConfig = config
    # the path to the ioHub configuration file itself.
    self.ioHubConfigAbsPath = configAbsPath
    # udp port setup
    self.udp_client = UDPClientConnection(coder=self.ioHubConfig['ipcCoder'])
    # the dynamically generated object that contains an attribute for each device registered
    # for monitoring with the ioHub server, so that devices can be accessed on the experiment
    # process side by device name.
    self.devices = ioHubDevices(self)
    # a dictionary that holds the same devices represented in .devices, but stored
    # using the device name as the dictionary key
    self.deviceByLabel = dict()
    # attribute to hold the current experiment ID that has been created by the ioHub
    # ioDataStore if saving data to the ioHub hdf5 file type.
    self.experimentID = None
    # attribute to hold the current experiment session ID that has been created by the ioHub
    # ioDataStore if saving data to the ioHub hdf5 file type.
    self.experimentSessionID = None
def get_project_info(manager, project):
    project_id = project['value']['_id']
    questionnaire = Project.get(manager, project_id)
    questionnaire_code = questionnaire.form_code
    analysis, disabled, log = get_project_analysis_and_log_link(project_id, questionnaire_code)

    web_submission_link = reverse("web_questionnaire", args=[project_id])
    web_submission_link_disabled = 'disable_link'
    if 'web' in project['value']['devices']:
        web_submission_link_disabled = ""

    create_subjects_links = {}
    for entity_type in questionnaire.entity_type:
        create_subjects_links.update({entity_type: append_query_strings_to_url(
            reverse("create_subject", args=[entity_type]), web_view=True)})

    project_info = dict(project_id=project_id,
                        name=project['value']['name'],
                        qid=questionnaire_code,
                        created=project['value']['created'],
                        link=(reverse('project-overview', args=[project_id])),
                        log=log,
                        analysis=analysis,
                        disabled=disabled,
                        web_submission_link=web_submission_link,
                        web_submission_link_disabled=web_submission_link_disabled,
                        create_subjects_link=create_subjects_links,
                        entity_type=questionnaire.entity_type,
                        encoded_name=urlquote(project['value']['name']),
                        import_template_file_name=slugify(project['value']['name']))
    return project_info
def dict(*a, **k):
    import warnings
    import __builtin__
    warnings.warn('twisted.python.util.dict is deprecated. Use __builtin__.dict instead')
    return __builtin__.dict(*a, **k)
def as_dict(self):
    photo = __builtin__.dict(Id=self.Id,
                             HouseId=self.HouseId,
                             RelativePath=self.RelativePath,
                             CreatedAt=self.CreatedAt,
                             UpdatedAt=self.UpdatedAt)
    return photo
def get_project_info(manager, raw_project):
    project_id = raw_project['value']['_id']
    project = Project.load(manager.database, project_id)
    questionnaire = manager.get(project.qid, FormModel)
    questionnaire_code = questionnaire.form_code
    analysis, disabled, log = get_project_analysis_and_log_link(project, project_id, questionnaire_code)

    web_submission_link = reverse("web_questionnaire", args=[project_id])
    web_submission_link_disabled = 'disable_link'
    if 'web' in raw_project['value']['devices']:
        web_submission_link_disabled = ""

    create_subjects_link = ''
    if 'no' in raw_project['value']['activity_report']:
        create_subjects_link = reverse(create_subject, args=[project.entity_type])

    project_info = dict(name=raw_project['value']['name'],
                        qid=questionnaire_code,
                        created=raw_project['value']['created'],
                        type=raw_project['value']['project_type'],
                        link=(reverse(project_overview, args=[project_id])),
                        log=log,
                        analysis=analysis,
                        disabled=disabled,
                        web_submission_link=web_submission_link,
                        web_submission_link_disabled=web_submission_link_disabled,
                        create_subjects_link=create_subjects_link,
                        entity_type=project.entity_type)
    return project_info
def as_dict_JSON(self):
    landlord = __builtin__.dict(Id=self.Id,
                                FirstName=self.FirstName,
                                LastName=self.LastName,
                                Email=self.Email,
                                Phone=self.Phone)
    return landlord
def as_dict_JSON(self):
    student = __builtin__.dict(Id=self.Id,
                               FirstName=self.FirstName,
                               LastName=self.LastName,
                               Email=self.Email,
                               Phone=self.Phone)
    return student
def create_global_topic_list(articleList):
    e = re.compile(r"\s(de)\s")
    u = re.compile(r"\s(du)\s")
    globalTopicList = []
    i = 0
    for commList in articleList.values():
        # Article body + all comments
        art = commList[0].artBody
        for comm in commList:
            art += comm.body
        # Global list of named entities
        art = u.sub(" Du ", art)
        art = e.sub(" De ", art)
        entities = extract_entities(wordpunct_tokenize(art))
        globalTopicList += entities
        i += 1
        if i % 100 == 0:
            print i, "comments processed for global vector"

    globalTopicList = nltk.FreqDist(globalTopicList)
    tempVector = dict()
    for k in globalTopicList.items()[:100]:
        tempVector[k[0]] = 0

    f = open("globalTopics" + '.pkl', 'wb')
    pickle.dump(tempVector, f, pickle.HIGHEST_PROTOCOL)
    f.close()
def read_user_data(filename):
    f = open(filename, 'r')  # To process all the comments
    userList = dict()
    commentCount = 0
    for line in f:
        temp = line.split('&')
        if len(temp) < 9:
            continue
        userid = temp[0]
        inDeg = temp[1]
        outDeg = temp[2]
        age = temp[3]
        postCount = temp[4]
        postRate = temp[5]
        pageRank = temp[6]
        hub = temp[7]
        auth = temp[8]
        comm = [inDeg, outDeg, age, postCount, postRate, pageRank, hub, auth]
        userList[userid] = comm
        commentCount += 1
        if commentCount % 10000 == 0:
            print "Read", commentCount, "user comments"
    print "done reading"
    return userList, len(userList)
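# --- Hedged usage sketch for read_user_data (file name and values are made up) ---
# The parser above expects at least nine '&'-separated fields per line:
# userid&inDeg&outDeg&age&postCount&postRate&pageRank&hub&auth
if __name__ == '__main__':
    with open('users_example.dat', 'w') as handle:
        handle.write('alice&12&7&345&88&0.25&0.0031&0.01&0.02\n')
    userList, nr_users = read_user_data('users_example.dat')
    print userList['alice']   # ['12', '7', '345', '88', '0.25', '0.0031', '0.01', '0.02']
    print nr_users            # 1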
def get_project_info(manager, raw_project):
    project_id = raw_project['value']['_id']
    project = Project.load(manager.database, project_id)
    questionnaire = manager.get(project.qid, FormModel)
    questionnaire_code = questionnaire.form_code
    analysis, disabled, log = get_project_analysis_and_log_link(project, project_id, questionnaire_code)

    web_submission_link = reverse("web_questionnaire", args=[project_id])
    web_submission_link_disabled = 'disable_link'
    if 'web' in raw_project['value']['devices']:
        web_submission_link_disabled = ""

    create_subjects_link = ''
    if 'no' in raw_project['value']['activity_report']:
        create_subjects_link = append_query_strings_to_url(
            reverse("create_subject", args=[project.entity_type]), web_view=True)

    project_info = dict(project_id=project_id,
                        name=raw_project['value']['name'],
                        qid=questionnaire_code,
                        created=raw_project['value']['created'],
                        type=raw_project['value']['project_type'],
                        link=(reverse('project-overview', args=[project_id])),
                        log=log,
                        analysis=analysis,
                        disabled=disabled,
                        web_submission_link=web_submission_link,
                        web_submission_link_disabled=web_submission_link_disabled,
                        create_subjects_link=create_subjects_link,
                        entity_type=project.entity_type,
                        encoded_name=urlquote(raw_project['value']['name']),
                        import_template_file_name=slugify(raw_project['value']['name']))
    return project_info
def _construct_project_dict(user, local_time_delta, project):
    project_id = project['project_id']
    delete_links = reverse('delete_project', args=[project_id])
    disable_link_class, hide_link_class = _get_visibility_settings_for(user, project)
    return dict(delete_links=delete_links,
                name=project['name'],
                created=convert_utc_to_localized(local_time_delta, project['created']),
                qid=project['qid'],
                link=project['link'],
                web_submission_link_disabled=project['web_submission_link_disabled'],
                web_submission_link=project['web_submission_link'],
                analysis=project['analysis'],
                disabled=project['disabled'],
                log=project['log'],
                create_subjects_link=project['create_subjects_link'],
                entity_type=project['entity_type'],
                encoded_name=project['encoded_name'],
                import_template_file_name=project['import_template_file_name'],
                is_advanced_questionnaire=bool(project['is_advanced_questionnaire']),
                is_poll=project['is_poll'],
                disable_link_class=disable_link_class,
                hide_link_class=hide_link_class)
def displayExperimentSessionSettingsDialog(self):
    """
    Display an editable dialog showing the experiment session settings retrieved from the
    configuration file. This includes the few mandatory ioHub experiment session attributes,
    as well as any user-defined experiment session attributes declared in the experiment
    configuration file. If OK is selected in the dialog, the experiment logic continues;
    otherwise the experiment session is terminated.
    """
    allSessionDialogVariables = dict(self.experimentSessionDefaults, **self.sessionUserVariables)
    sessionVariableOrder = self.configuration['session_variable_order']
    if 'user_variables' in allSessionDialogVariables:
        del allSessionDialogVariables['user_variables']

    sessionDlg = psychopy.gui.DlgFromDict(allSessionDialogVariables,
                                          'Experiment Session Settings',
                                          [], sessionVariableOrder)

    if sessionDlg.OK:
        for key, value in allSessionDialogVariables.iteritems():
            if key in self.experimentSessionDefaults:
                self.experimentSessionDefaults[key] = str(value)
            elif key in self.sessionUserVariables:
                self.sessionUserVariables[key] = str(value)
        return False
    return True
def _eventListToDict(eventValueList):
    """
    Convert an ioHub event that is currently represented as an ordered list of values,
    and return the event as a dictionary of attribute name -> attribute value for the object.
    """
    eclass = EventConstants.EVENT_CLASSES[eventValueList[3]]
    combo = zip(eclass.attributeNames, eventValueList)
    return dict(combo)
def as_dict(self):
    dev = __builtin__.dict(Id=self.Id,
                           ProjectName=self.ProjectName,
                           Email=self.Email,
                           Key=self.Key,
                           CreatedAt=self.CreatedAt,
                           UpdatedAt=self.UpdatedAt)
    return dev
def as_dict_JSON(self):
    review = __builtin__.dict(Id=self.Id,
                              HouseId=self.HouseId,
                              StudentId=self.StudentId,
                              Stars=self.Stars,
                              Comment=self.Comment,
                              CreatedAt=str(self.CreatedAt),
                              UpdatedAt=str(self.UpdatedAt))
    return review
def _eventListToObject(eventValueList):
    """
    Convert an ioHub event that is currently represented as an ordered list of values,
    and return the correct ioHub.devices.DeviceEvent subclass for the given event type.
    """
    eclass = EventConstants.EVENT_CLASSES[eventValueList[3]]
    combo = zip(eclass.attributeNames, eventValueList)
    kwargs = dict(combo)
    return eclass(**kwargs)
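# --- Minimal sketch of the list -> kwargs pattern used by the two helpers above ---
# EventConstants and the real event classes live in ioHub; the toy class below is a
# stand-in (an assumption) that only shows how zip() + dict() + **kwargs rebuilds an
# object from an ordered value list whose layout matches attributeNames.
class _ToyEvent(object):
    attributeNames = ('experiment_id', 'session_id', 'event_id', 'event_type')

    def __init__(self, experiment_id, session_id, event_id, event_type):
        self.experiment_id = experiment_id
        self.session_id = session_id
        self.event_id = event_id
        self.event_type = event_type

_values = [1, 2, 1001, 27]
_kwargs = dict(zip(_ToyEvent.attributeNames, _values))
_toy = _ToyEvent(**_kwargs)   # mirrors the reconstruction done in _eventListToObject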
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount, 100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")

    tempVector = dict()
    # Create your bigrams
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)
    for k in fdist.keys()[:100]:
        tempVector[k] = 0

    theKeys = tempVector.keys()

    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)
                featureMatrix[index][keyInd] += 1
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
            if index >= commentCount:
                break

    print "non-zero", np.count_nonzero(featureMatrix)
    print "Percentage filled: %.2f" % (float(np.count_nonzero(featureMatrix)) / (featureMatrix.shape[0] * featureMatrix.shape[1]))
    return featureMatrix
def as_dict(self):
    student = __builtin__.dict(Id=self.Id,
                               FirstName=self.FirstName,
                               LastName=self.LastName,
                               Email=self.Email,
                               Phone=self.Phone,
                               IsActive=self.IsActive,
                               CreatedAt=self.CreatedAt,
                               UpdatedAt=self.UpdatedAt)
    return student
def extract_social_features(df_comments):
    socialVector = np.empty([df_comments.shape[0], 8])
    index = 0

    graph = networkx.DiGraph()
    userdict = dict()

    for _, row in df_comments.iterrows():
        userdict[row['comment_id']] = row['author']

    for user in set(userdict.values()):
        graph.add_node(user)

    for _, row in df_comments.iterrows():
        if not userdict.has_key(row['thread_root_id']):
            continue
        source = userdict[row['comment_id']]
        dest = userdict[row['thread_root_id']]
        if source == dest:
            continue
        graph.add_edge(source, dest)

    pageranker = networkx.pagerank(graph, alpha=0.85)
    hubs, auths = networkx.hits(graph)

    author_groupby = df_comments.groupby('author')
    user_age_dict = {}
    user_nr_posts_dict = {}
    for _, group in author_groupby:
        first_date = datetime.fromtimestamp(mktime(group.date.values[0]))
        last_date = datetime.fromtimestamp(mktime(group.date.values[-1]))
        diff = last_date - first_date
        days = diff.days
        user_age_dict[group.author.values[0]] = days + 1
        user_nr_posts_dict[group.author.values[0]] = len(group)

    for ix, row in df_comments.iterrows():
        user = userdict[row['comment_id']]
        socialVector[ix][0] = graph.in_degree(user)                                  # In degree
        socialVector[ix][1] = graph.out_degree(user)                                 # Out degree
        socialVector[ix][2] = user_age_dict[user]                                    # User age
        socialVector[ix][3] = user_nr_posts_dict[user]                               # Nr of posts
        socialVector[ix][4] = user_nr_posts_dict[user] / float(user_age_dict[user])  # Post rate
        socialVector[ix][5] = pageranker[user]                                       # Pagerank
        socialVector[ix][6] = hubs[user]                                             # Hub score
        socialVector[ix][7] = auths[user]                                            # Authority score
        index += 1
        if index % 1000 == 0:
            print "extracted", index, "values"

    return socialVector
def __init__(self, header=None, li=(), idName=None, RowClass=util.Row, dict=None):
    if dict is not None:
        self.items = dict
    elif header is not None and li is not None:
        idfield = header.index(idName)
        self.items = __builtin__.dict([(i[idfield], i) for i in li])
    else:
        self.items = {}
    self.header = header
    self.RowClass = RowClass
    self.idName = idName
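# --- Hedged usage sketch for the keyed-rows container above (names are assumptions) ---
# 'Table' is a hypothetical name for the class that owns this __init__; the point is
# that rows end up in self.items keyed by the value found in the idName column.
#
#   header = ['id', 'name', 'qty']
#   rows = [(1, 'bolt', 40), (2, 'nut', 55)]
#   table = Table(header=header, li=rows, idName='id')
#   table.items -> {1: (1, 'bolt', 40), 2: (2, 'nut', 55)}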
def index(request):
    disable_link_class, hide_link_class, page_heading = projects_index(request)
    rows = get_project_list(request)
    project_list = []
    project_list.sort(key=itemgetter('name'))
    smart_phone_instruction_link = reverse("smart_phone_instruction")
    for project in rows:
        project_id = project['project_id']
        delete_links = reverse('delete_projects', args=[project_id])
        project = dict(delete_links=delete_links,
                       name=project['name'],
                       created=project['created'],
                       qid=project['qid'],
                       link=project['link'],
                       web_submission_link_disabled=project['web_submission_link_disabled'],
                       web_submission_link=project['web_submission_link'],
                       analysis=project['analysis'],
                       disabled=project['disabled'],
                       log=project['log'],
                       create_subjects_link=project['create_subjects_link'],
                       entity_type=project['entity_type'],
                       encoded_name=project['encoded_name'],
                       import_template_file_name=project['import_template_file_name'])
        project_list.append(project)

    activation_success = request.GET.get('activation', False)
    error_messages = []
    if "associate" in request.GET.keys():
        error_messages = [_('You may have been dissociated from the project. Please contact your administrator for more details.')]

    if is_crs_admin(request):
        return render_to_response('alldata/index.html',
                                  {'projects': project_list,
                                   'page_heading': page_heading,
                                   'disable_link_class': disable_link_class,
                                   'hide_link_class': hide_link_class,
                                   'is_crs_admin': True,
                                   'project_links': get_alldata_project_links(),
                                   'is_quota_reached': is_quota_reached(request),
                                   'error_messages': error_messages,
                                   'activation_success': activation_success},
                                  context_instance=RequestContext(request))
    else:
        return render_to_response('alldata/index.html',
                                  {'projects': project_list,
                                   'page_heading': page_heading,
                                   'disable_link_class': disable_link_class,
                                   'hide_link_class': hide_link_class,
                                   'is_crs_admin': False,
                                   "smart_phone_instruction_link": smart_phone_instruction_link,
                                   'project_links': get_alldata_project_links(),
                                   'is_quota_reached': is_quota_reached(request),
                                   'error_messages': error_messages,
                                   'activation_success': activation_success},
                                  context_instance=RequestContext(request))
def get_project_info(manager, project):
    project_id = project['_id']
    questionnaire = Project.new_from_doc(manager, ProjectDocument.wrap(project))
    questionnaire_code = questionnaire.form_code
    analysis, disabled, log = get_project_analysis_and_log_link(project_id, questionnaire_code)

    web_submission_link = reverse("web_questionnaire", args=[project_id])
    web_submission_link_disabled = 'disable_link'
    if 'web' in project['devices']:
        web_submission_link_disabled = ""

    create_subjects_links = {}
    for entity_type in questionnaire.entity_type:
        create_subjects_links.update({
            entity_type: append_query_strings_to_url(
                reverse("subject_questionnaire", args=[project_id, entity_type]), web_view=True)
        })

    if questionnaire.is_poll:
        project_link = reverse("submissions", args=[project_id, questionnaire_code])
    else:
        project_link = reverse('project-overview', args=[project_id])

    project_info = dict(project_id=project_id,
                        _id=project_id,
                        name=project['name'],
                        qid=questionnaire_code,
                        created=project['created'],
                        is_advanced_questionnaire=bool(project.get('xform')),
                        link=project_link,
                        log=log,
                        analysis=analysis,
                        disabled=disabled,
                        web_submission_link=web_submission_link,
                        web_submission_link_disabled=web_submission_link_disabled,
                        create_subjects_link=create_subjects_links,
                        entity_type=questionnaire.entity_type,
                        encoded_name=urlquote(project['name']),
                        import_template_file_name=slugify(project['name']),
                        is_poll=bool(questionnaire.is_poll),
                        is_project_manager=project.get('is_project_manager', False))
    return project_info
def __init__(self, filename):
    '''
    filename: inits the UBRR data from the input file
    '''
    ub_map = dict()
    ub_ratings = dict()
    cnt = 0
    # read the file
    if filename.endswith('.gz'):
        f = gzip.open(filename, 'r')
    else:
        f = open(filename, 'r')
    for line in f:
        vals = line.split("\t")
        if len(vals) == 0:
            continue
        u = vals[0]
        b = vals[1]
        r = float(vals[2])
        d = vals[3].strip()
        ub_map[(u, b)] = self._int_list(d)
        ub_ratings[(u, b)] = r
        cnt += 1
    self.user_item_map = ub_map
    self.user_item_rating = ub_ratings
    f.close()
    print 'Data Pair Manager Initialized with ', cnt, ' reviews'
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets) ** 2
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {}
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out, syn_in),
                                                       wn.path_similarity(syn_in, syn_out))
            if i % 10000 == 0:
                print i, 'synsets processed out of', len(global_synsets) ** 2, '(', float(i) / (t), '%)'
            i += 1

    tuples = [(i[0], i[1].values()) for i in similarity_dict.items()]
    vectors = [np.array(tup[1]) for tup in tuples]

    # Rule of thumb
    n = sqrt(len(global_synsets) / 2)
    print "Number of clusters", n

    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)

    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])

    pprint.pprint(dict(clustering), width=1)

    feature_vector = np.zeros([len(corpus), n])
    for i, comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print i, 'comments processed'

    print feature_vector
def dict_processor(data):
    if not isinstance(data, __builtin__.dict) and not coerce_:
        raise DataTypeError('dict')
    else:
        try:
            data = __builtin__.dict(data)
        except (TypeError, ValueError):
            raise DataTypeError('dict')

    cleandata = {}
    errors = {}
    seen = set()
    for name, value in data.items():
        try:
            if name in procs:
                cleandata[name] = procs[name].process(value)
                seen.add(name)
            elif ignore_extra:
                pass
            elif pass_extra:
                cleandata[name] = value
            else:
                raise ExtraDataError()
        except CheckerError as ex:
            if capture_all_errors:
                errors[name] = unicode(ex)
            else:
                ex.field = name
                raise ex

    if not ignore_missing:
        for name in (set(procs.keys()) - seen):
            try:
                cleandata[name] = procs[name].process(None)
            except CheckerError as ex:
                if capture_all_errors:
                    errors[name] = unicode(ex)
                else:
                    ex.field = name
                    raise ex

    if errors:
        raise DictionaryError(errors)
    return cleandata
def as_dict(self):
    house = __builtin__.dict(Id=self.Id,
                             LandlordId=self.LandlordId,
                             Address1=self.Address1,
                             Address2=self.Address2,
                             City=self.City,
                             State=self.State,
                             Zipcode=self.Zipcode,
                             Rooms=self.Rooms,
                             ParkingSpots=self.ParkingSpots,
                             MonthlyRent=self.MonthlyRent,
                             UtilitiesIncluded=self.UtilitiesIncluded,
                             Laundry=self.Laundry,
                             Pets=self.Pets,
                             Latitude=self.Latitude,
                             Longitude=self.Longitude,
                             DistFromCC=self.DistFromCC,
                             DateAvailable=str(self.DateAvailable),
                             LeaseTerm=self.LeaseTerm)
    return house
def create_record_categorical(model, y, cv, names, class_names, conf_matrix_list, oob_estimates, baseline=None): txt_time = str(datetime.datetime.now()) saved_params = {} try: params = model.get_params() for k in params: if type(params[k]) in (str,int,np.array,list,dict): try: json.dumps(params[k]) saved_params[k] = params[k] except TypeError: saved_params[k] = repr(params[k]) except: pass cv_full = [[train,test] for (train,test) in cv] y_label = one_hot_to_label(y) samples = [{'name': names[i], 'prob_label' : float(p[y_label[i]]), 'prob_pred' : float(p[np.argmax(p)]), 'pred' : float(np.argmax(p)), 'label' : float(y_label[i]), 'class_label' : class_names[y_label[i]], 'class_pred' : class_names[np.argmax(p)]} for i,p in oob_estimates.iteritems()]; samples = sorted(samples, key=lambda v: v['prob_label']) samples_predicted = sorted(samples, key=lambda v: -v['prob_label']) #get the classification report y_pred = [] y_label = [] for sample in samples: y_pred.append(sample['pred']) y_label.append(sample['label']) y_pred = np.array(y_pred) y_label = np.array(y_label) weighted_f1 = f1_score(y_label, y_pred, average='weighted') conf_matrix = confusion_matrix(y_label, y_pred, labels=range(len(class_names))) conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis] cmap_viridis = matplotlib.cm.get_cmap('viridis') fig_size = plt.gcf().get_size_inches() plt.figure(figsize=fig_size*3) plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap_viridis) plt.title('CM, Train: %d, Test: %d, cv: %d, F1: %0.4f' % (len(cv_full[0][0]), len(cv_full[0][1]), len(cv_full), weighted_f1)) plt.clim(0,1) plt.colorbar() if len(class_names)<200: tick_marks = np.arange(len(class_names)) plt.xticks(tick_marks, class_names, rotation=90) plt.yticks(tick_marks, class_names) plt.grid(True) plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #store the output store = dict() store['params'] = saved_params store['weighted_f1'] = weighted_f1 store['size'] = len(cv_full[0][0])+len(cv_full[0][1]) store['cv_size'] = len(cv_full) store['cv_train'] = len(cv_full[0][0]) store['cv_test'] = len(cv_full[0][1]) store['report'] = classification_report(y_label, y_pred, labels=range(len(class_names)), target_names=class_names, digits=3) store['top_missed'] = samples[0:200]; store['top_predicted'] = samples_predicted[0:200]; #print the report print store['report'] val_results = {} val_results['oob_estimates'] = samples val_results['conf_matrix'] = conf_matrix.tolist() img_results = {} buf = io.BytesIO() plt.savefig(buf) img_results['confusion_matrix'] = buf results = {} results['time'] = txt_time results['output'] = store results['validation'] = val_results results['images'] = img_results plt.close() return results
def create_record_regression(model, y, cv, names, loss_array, oob_estimates):
    txt_time = str(datetime.datetime.now())

    loss_values = oob_estimates.values()
    loss = np.mean(loss_values)
    plt.hist(loss_values, 200, weights=np.ones(len(loss_values)) / len(loss_values),
             label='Loss Distribution: loss={}'.format(loss))

    cv_full = [[train, test] for (train, test) in cv]

    plt.xlabel('Loss')
    plt.ylabel('Fraction')
    plt.title('Loss, Training: %d, Testing: %d, cv: %d' % (len(cv_full[0][0]), len(cv_full[0][1]), len(cv_full)))
    plt.legend(loc="upper center")
    plt.tight_layout()
    # plt.show()

    saved_params = {}
    try:
        params = model.get_params()
        for k in params:
            if type(params[k]) in (str, int, np.array, list, dict):
                try:
                    json.dumps(params[k])
                    saved_params[k] = params[k]
                except TypeError:
                    saved_params[k] = repr(params[k])
    except:
        pass

    # store the output
    store = dict()
    store['params'] = saved_params
    store['loss'] = float(loss)
    store['cv_size'] = len(cv_full)
    store['size'] = len(cv_full[0][0]) + len(cv_full[0][1])
    store['benign_size'] = float((y == 0).sum())
    store['malware_size'] = float((y == 1).sum())
    store['cv_train'] = len(cv_full[0][0])
    store['cv_test'] = len(cv_full[0][1])

    loss_store = [{'name': names[i], 'loss': float(p)} for i, p in oob_estimates.iteritems()]
    loss_store = sorted(loss_store, key=lambda v: -v['loss'])
    store['loss_top'] = loss_store[0:500]
    store['loss_bottom'] = loss_store[-500:-1]

    val_results = {}
    val_results['oob_estimates'] = loss_store

    # save the model
    img_results = {}
    buf = io.BytesIO()
    plt.savefig(buf)
    img_results['img_loss'] = buf

    results = {}
    results['time'] = txt_time
    results['output'] = store
    results['validation'] = val_results
    results['images'] = img_results

    plt.close()
    return results
def closure(*ap, **kp):
    A, K = a + ap, sortedtuple(k.items() + kp.items())
    return state[(A, K)] if (A, K) in state else state.setdefault((A, K), f(*A, **__builtin__.dict(k.items() + kp.items())))
def generate_and_push_new_documentation_page( temporary_documentation_folder, distribution_bundle_file, has_api_documentation, temporary_documentation_node_modules_directory ): # # ''' Renders a new index.html file and copies new assets to generate a new \ documentation homepage. ''' global BUILD_DOCUMENTATION_PAGE_COMMAND __logger__.info('Update documentation design.') if distribution_bundle_file: new_distribution_bundle_file = FileHandler(location='%s%s%s' % ( temporary_documentation_folder.path, DOCUMENTATION_BUILD_PATH, DISTRIBUTION_BUNDLE_FILE_PATH)) new_distribution_bundle_file.directory.make_directories() distribution_bundle_file.path = new_distribution_bundle_file new_distribution_bundle_directory = FileHandler(location='%s%s%s' % ( temporary_documentation_folder.path, DOCUMENTATION_BUILD_PATH, DISTRIBUTION_BUNDLE_DIRECTORY_PATH)) new_distribution_bundle_directory.make_directories() zipfile.ZipFile(distribution_bundle_file.path).extractall( new_distribution_bundle_directory.path) favicon = FileHandler(location='favicon.png') if favicon: favicon.copy(target='%s/source/image/favicon.ico' % temporary_documentation_folder.path) parameter = builtins.dict(builtins.map(lambda item: ( String(item[0]).camel_case_to_delimited.content.upper(), item[1] ), SCOPE.get('documentationWebsite', {}).items())) if 'TAGLINE' not in parameter and 'description' in SCOPE: parameter['TAGLINE'] = SCOPE['description'] if 'NAME' not in parameter and 'name' in SCOPE: parameter['NAME'] = SCOPE['name'] __logger__.debug('Found parameter "%s".', json.dumps(parameter)) api_documentation_path = None if has_api_documentation: api_documentation_path = '%s%s' % ( API_DOCUMENTATION_PATH[1], API_DOCUMENTATION_PATH_SUFFIX) if not FileHandler(location='%s%s' % ( FileHandler().path, api_documentation_path )).is_directory(): api_documentation_path = API_DOCUMENTATION_PATH[1] parameter.update({ 'CONTENT': CONTENT, 'CONTENT_FILE_PATH': None, 'RENDER_CONTENT': False, 'API_DOCUMENTATION_PATH': api_documentation_path, 'DISTRIBUTION_BUNDLE_FILE_PATH': DISTRIBUTION_BUNDLE_FILE_PATH if ( distribution_bundle_file and distribution_bundle_file.is_file() ) else None }) # # python3.5 # # parameter = Dictionary(parameter).convert( # # value_wrapper=lambda key, value: value.replace( # # '!', '#%%%#' # # ) if builtins.isinstance(value, builtins.str) else value # # ).content parameter = Dictionary(parameter).convert( value_wrapper=lambda key, value: value.replace( '!', '#%%%#' ) if builtins.isinstance(value, builtins.unicode) else value ).content # # if __logger__.isEnabledFor(logging.DEBUG): BUILD_DOCUMENTATION_PAGE_COMMAND = \ BUILD_DOCUMENTATION_PAGE_COMMAND[:-1] + [ '-debug' ] + BUILD_DOCUMENTATION_PAGE_COMMAND[-1:] serialized_parameter = json.dumps(parameter) parameter_file = FileHandler(location=make_secure_temporary_file('.json')[ 1]) parameter_file.content = \ BUILD_DOCUMENTATION_PAGE_PARAMETER_TEMPLATE.format( serializedParameter=serialized_parameter, **SCOPE) for index, command in builtins.enumerate(BUILD_DOCUMENTATION_PAGE_COMMAND): BUILD_DOCUMENTATION_PAGE_COMMAND[index] = \ BUILD_DOCUMENTATION_PAGE_COMMAND[index].format( serializedParameter=serialized_parameter, parameterFilePath=parameter_file._path, **SCOPE) __logger__.debug('Use parameter "%s".', serialized_parameter) __logger__.info('Run "%s".', ' '.join(BUILD_DOCUMENTATION_PAGE_COMMAND)) current_working_directory_backup = FileHandler() temporary_documentation_folder.change_working_directory() Platform.run( command=BUILD_DOCUMENTATION_PAGE_COMMAND[0], 
command_arguments=BUILD_DOCUMENTATION_PAGE_COMMAND[1:], error=False, log=True) current_working_directory_backup.change_working_directory() parameter_file.remove_file() for file in FileHandler(): if not (file in (temporary_documentation_folder, FileHandler( location='.%s' % API_DOCUMENTATION_PATH[1] )) or is_file_ignored(file)): file.remove_deep() documentation_build_folder = FileHandler(location='%s%s' % ( temporary_documentation_folder.path, DOCUMENTATION_BUILD_PATH ), must_exist=True) documentation_build_folder.iterate_directory( function=copy_repository_file, recursive=True, source=documentation_build_folder, target=FileHandler()) if (Platform.run( "/usr/bin/env sudo umount '%s'" % temporary_documentation_node_modules_directory.path, native_shell=True, error=False, log=True )['return_code'] == 0): temporary_documentation_folder.remove_deep() Platform.run( ( '/usr/bin/env git add --all', '/usr/bin/env git commit --message "%s" --all' % PROJECT_PAGE_COMMIT_MESSAGE, '/usr/bin/env git push', '/usr/bin/env git checkout master' ), native_shell=True, error=False, log=True )
def dict(*a, **k):
    import __builtin__
    warnings.warn('twisted.python.util.dict is deprecated. Use __builtin__.dict instead')
    return __builtin__.dict(*a, **k)
if sys.platform == 'win32':
    __import__('msvcrt').setmode(sys.stdout.fileno(), os.O_BINARY) if hasattr(sys.stdout, 'fileno') else None
    __import__('msvcrt').setmode(sys.stderr.fileno(), os.O_BINARY) if hasattr(sys.stderr, 'fileno') else None

# use the current virtualenv if it exists
builtins._ = os.path.join(user.home.replace('\\', os.sep).replace('/', os.sep), '.python-virtualenv', 'Scripts' if __import__('platform').system() == 'Windows' else 'bin', 'activate_this.py')
if os.path.exists(builtins._):
    execfile(builtins._, {'__file__': builtins._})

# add ~/.python/* to python module search path
map(sys.path.append, __import__('glob').iglob(os.path.join(user.home.replace('\\', os.sep).replace('/', os.sep), '.python', '*')))

## some functional primitives in the default namespace
# box any specified arguments
fbox = fboxed = lambda *a: a
# return a closure that executes ``f`` with the arguments unboxed.
funbox = lambda f, *a, **k: lambda *ap, **kp: f(*(a + builtins.reduce(operator.add, builtins.map(builtins.tuple, ap), ())), **builtins.dict(k.items() + kp.items()))
# return a closure that will check that its argument is an instance of ``type``.
finstance = lambda *type: frpartial(builtins.isinstance, type)
# return a closure that will check if its argument has an item ``key``.
fhasitem = fitemQ = lambda key: fcompose(fcatch(frpartial(operator.getitem, key)), builtins.iter, builtins.next, fpartial(operator.eq, builtins.None))
# return a closure that will get a particular element from an object
fgetitem = fitem = lambda item, *default: lambda object: default[0] if default and item not in object else object[item]
# return a closure that will check if its argument has an ``attribute``.
fhasattr = fattributeQ = lambda attribute: frpartial(builtins.hasattr, attribute)
# return a closure that will get a particular attribute from an object
fgetattr = fattribute = lambda attribute, *default: lambda object: getattr(object, attribute, *default)
# return a closure that always returns ``object``.
fconstant = fconst = falways = lambda object: lambda *a, **k: object
# a closure that returns its argument always
fpassthru = fpass = fidentity = fid = lambda object: object
# a closure that returns a default value if its object is false-y
def __init__(self, id, name, methods):
    self.id = id
    self.name = name
    self.methods = __builtin__.dict([(m.id, m) for m in methods])
    registry.current_registry.register_class(self)
def __init__(self, configFilePath, configFile):
    """
    Initialize the SimpleIOHubRuntime object, loading the experiment configuration file,
    initializing and launching the ioHub server process, and creating the client-side device
    interface to the ioHub devices that have been created.

    Currently the ioHub timer uses a ctypes implementation of direct access to the Windows QPC
    functions in win32 (so no python interpreter start time offset is applied between processes),
    and timeit.default_timer is used for all other platforms at this time. The advantage of not
    having a first-read offset applied per python interpreter is that both the psychopy process
    and the ioHub process use the exact same timebase, without a different offset that is hard
    to determine exactly due to the variability in IPC request-responses. Because the two
    processes share the exact same time space, including offset, getTime() for the ioHub client
    in psychopy == the current time of the ioHub server process, greatly simplifying some
    aspects of synchronization. This only holds as long as both processes are running on the
    same PC, of course.

    Note on timeit.default_timer: as of 2.7, timeit.default_timer correctly selects the best
    clock based on OS for high precision timing. < 2.7, you need to check the OS version
    yourself and select; or use the psychopy clocks since it does the work for you. ;)

    Args:
        configFilePath (str): The absolute path to the experiment configuration .yaml file,
            which is automatically assigned to the path the experiment script is running
            from by default.
        configFile (str): The name of the experiment configuration .yaml file, which has a
            default value of 'experiment_config.yaml'.

    Return: None
    """
    self.currentTime = computer.currentSec

    self.configFilePath = configFilePath
    self.configFileName = configFile
    self.fullPath = os.path.join(self.configFilePath, self.configFileName)

    # load the experiment config settings from the experiment_config.yaml file.
    # The file must be in the same directory as the experiment script.
    self.configuration = load(file(self.fullPath, u'r'), Loader=Loader)

    self.experimentConfig = dict()
    self._experimentConfigKeys = ['title', 'code', 'version', 'description', 'total_sessions_to_run']
    for key in self._experimentConfigKeys:
        if key in self.configuration:
            self.experimentConfig[key] = self.configuration[key]

    self.experimentSessionDefaults = self.configuration['session_defaults']
    self.sessionUserVariables = self.experimentSessionDefaults['user_variables']
    del self.experimentSessionDefaults['user_variables']

    # self.hub will hold the reference to the ioHubClient object, used to access the
    # ioHubServer process and devices.
    self.hub = None
    # holds events collected from the ioHub during periods like msecWait()
    self.allEvents = None
    # indicates if the experiment is in high priority mode or not. Do not set directly.
    # See enableHighPriority() and disableHighPriority()
    self._inHighPriorityMode = False
    self.sysutil = ioHub.devices.computer

    # initialize the experiment object based on the configuration settings.
    self._initalizeConfiguration()
import sys, os, itertools, operator, functools, user, __builtin__

# use the current virtualenv if it exists
__builtin__._ = os.path.join(user.home.replace('\\', os.sep).replace('/', os.sep), '.python-virtualenv', 'Scripts' if __import__('platform').system() == 'Windows' else 'bin', 'activate_this.py')
if os.path.exists(__builtin__._):
    execfile(__builtin__._, {'__file__': __builtin__._})

# add ~/.python/* to python module search path
map(sys.path.append, __import__('glob').iglob(os.path.join(user.home.replace('\\', os.sep).replace('/', os.sep), '.python', '*')))

## include some functional primitives in the default namespace
# box any specified arguments
box = lambda *a: a
# return a closure that executes ``f`` with the arguments unboxed.
unbox = lambda f, *a, **k: lambda *ap, **kp: f(*(a + __builtin__.reduce(operator.add, __builtin__.map(__builtin__.tuple, ap), ())), **__builtin__.dict(k.items() + kp.items()))
# return a closure that always returns ``n``.
identity = lambda n: lambda *a, **k: n
# return the first, second, or third item of a box.
first, second, third = operator.itemgetter(0), operator.itemgetter(1), operator.itemgetter(2)
# return a closure that executes a list of functions one after another from left-to-right
fcompose = compose = lambda *f: __builtin__.reduce(lambda f1, f2: lambda *a: f1(f2(*a)), __builtin__.reversed(f))
# return a closure that executes function ``f`` whilst discarding any extra arguments
fdiscard = lambda f: lambda *a, **k: f()
# return a closure that executes function ``crit`` and then executes ``f`` or ``t`` based on whether or not it's successful.
fcondition = lambda f, t: lambda crit: lambda *a, **k: t(*a, **k) if crit(*a, **k) else f(*a, **k)
# return a closure that takes a list of functions to execute with the provided arguments
fmaplist = fap = lambda *fa: lambda *a, **k: (f(*a, **k) for f in fa)
# lazy = lambda f, state={}: lambda *a, **k: state[(f, a, __builtin__.tuple(__builtin__.sorted(k.items())))] if (f, a, __builtin__.tuple(__builtin__.sorted(k.items()))) in state else state.setdefault((f, a, __builtin__.tuple(__builtin__.sorted(k.items()))), f(*a, **k))
# lazy = lambda f, *a, **k: lambda *ap, **kp: f(*(a+ap), **dict(k.items() + kp.items()))
# return a memoized closure that's lazy and only executes when evaluated
def lazy(f, *a, **k):
def decoder(stream):
    return model(**__builtin__.dict([(name, type(stream)) for name, type in parts]))
import cherrypy
import signal
from model.template import Template
from __builtin__ import dict
import os
import controller.RootController


def shutdown(signum, frame):
    print "try shutdown"
    cherrypy.server.stop()

signal.signal(signal.SIGINT, shutdown)

config = dict()
site_config = dict()

config['log.error_file'] = 'err.log'  # error log file
cherrypy.config.update(config)
config.clear()

conf = {
    '/': {
        'tools.staticdir.root': os.getcwd()
    },
    '/static': {
        'tools.staticdir.on': True,
        'tools.staticdir.dir': 'static',
        # we don't need to initialize the database for static files served by CherryPy
        # 'tools.db.on': False
    }
def decoder(stream):
    return __builtin__.dict([(key(stream), value(stream)) for i in range(int(stream))])
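# --- Hedged illustration of how decoder combinators like the two above might compose ---
# In these snippets 'key', 'value' and 'int' are themselves decoders that consume a
# stream; the toy stream and factory below are assumptions for illustration only.
def _toy_int_decoder(stream):
    return int(next(stream))

def _toy_str_decoder(stream):
    return str(next(stream))

def make_dict_decoder(key, value, int):
    # same shape as the closure above: read a count, then that many (key, value) pairs
    def decoder(stream):
        return dict([(key(stream), value(stream)) for i in range(int(stream))])
    return decoder

_toy_stream = iter(['2', 'a', '1', 'b', '2'])
_decode_map = make_dict_decoder(_toy_str_decoder, _toy_int_decoder, _toy_int_decoder)
print _decode_map(_toy_stream)   # {'a': 1, 'b': 2}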
def create_record(model, y, cv, names, fpr_array, tpr_array, thresh_array, oob_estimates, baseline=None): txt_time = str(datetime.datetime.now()) final_prediction = np.matrix([[float(y[i]), float(p)] for i,p in oob_estimates.iteritems()]); fpr, tpr, thresh = roc_curve(final_prediction[:,0], final_prediction[:,1], 1) curr_auc = auc(fpr, tpr) #compute the shading over a large number of points sp = 1000; fpr_points = np.concatenate([fpr, np.logspace(-6,-5, sp), np.linspace(1e-5, 1e-4, sp), np.linspace(1e-4, 1e-3, sp), np.linspace(1e-3, 1e-2, sp), np.linspace(1e-2, 1, sp)]) fpr_points = np.sort(fpr_points) mean_fpr, mean_tpr, std_tpr = compute_stat_cv(fpr_array, tpr_array, fpr_points) #get the index idx_1e2 = (np.abs(fpr-1e-2)).argmin() idx_1e3 = (np.abs(fpr-1e-3)).argmin() idx_1e4 = (np.abs(fpr-1e-4)).argmin() #get the values auc_1e2 = integrate.trapz(tpr[:idx_1e2], fpr[:idx_1e2])*1e2 auc_1e3 = integrate.trapz(tpr[:idx_1e3], fpr[:idx_1e3])*1e3 auc_1e4 = integrate.trapz(tpr[:idx_1e4], fpr[:idx_1e4])*1e4 #plt.semilogx(mean_fpr, mean_tpr, 'k-', label='Mean ROC (area = %0.3f, tpr = %0.3f)' % (mean_auc, mean_tpr[idx_1e3])) #plt.xlim([1.0e-4, 1.0]) if baseline is None: plt.plot(np.logspace(-10,0, 1000), np.logspace(-10,0, 1000), 'k--') else: plt.plot(baseline[0], baseline[1],'k--') plt.fill_between(mean_fpr, mean_tpr - std_tpr, mean_tpr + std_tpr, alpha=.4, label='95% Confidence Interval') plt.step(fpr, tpr, 'k-', label='ROC (AUC = %0.6f, AUC_1e-3, = %0.6f, TPR_1e-4 = %0.6f, TPR_1e-3 = %0.6f, )' % (curr_auc, auc_1e3, tpr[idx_1e4], tpr[idx_1e3])) cv_full = [[train,test] for (train,test) in cv] plt.xlim([0, 1.0]) plt.ylim([0, 1.0]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC, Training: %d, Testing: %d, cv: %d' % (len(cv_full[0][0]), len(cv_full[0][1]), len(cv_full))) plt.legend(loc="lower right", prop={'size':8}) plt.tight_layout() #plt.show() saved_params = {} try: params = model.get_params() for k in params: if type(params[k]) in (str,int,np.array,list,dict): try: json.dumps(params[k]) saved_params[k] = params[k] except TypeError: saved_params[k] = repr(params[k]) except: pass #store the output store = dict() store['params'] = saved_params store['roc'] = np.column_stack((fpr, tpr, thresh)).tolist() #store['std_tpr'] = std_tpr.tolist() store['auc'] = float(curr_auc) store['tpr_1e2'] = float(mean_tpr[idx_1e2]) store['auc_1e2'] = float(auc_1e2) store['tpr_1e3'] = float(mean_tpr[idx_1e3]) store['auc_1e3'] = float(auc_1e3) store['tpr_1e4'] = float(mean_tpr[idx_1e4]) store['auc_1e4'] = float(auc_1e4) store['cv_size'] = len(cv_full) store['size'] = len(cv_full[0][0])+len(cv_full[0][1]) store['benign_size'] = int((y==0).sum()) store['malware_size'] = int((y==1).sum()) store['cv_train'] = len(cv_full[0][0]) store['cv_test'] = len(cv_full[0][1]) pos = [{'name': names[i], 'p' : float(p), 'label' : float(y[i])} for i,p in oob_estimates.iteritems() if y[i]==1]; neg = [{'name': names[i], 'p' : float(p), 'label' : float(y[i])} for i,p in oob_estimates.iteritems() if y[i]==0]; pos = sorted(pos, key=lambda v: v['p']) neg = sorted(neg, key=lambda v: -v['p']) store['top_fp'] = neg[0:500]; store['top_fn'] = pos[0:500]; val_results = {} val_results['oob_estimates'] = list(np.concatenate((neg, pos))) #save the model img_results = {} buf = io.BytesIO() plt.xlim([0, 0.0001]) plt.savefig(buf) img_results['img_0_0001'] = buf buf = io.BytesIO() plt.xlim([0, 0.001]) plt.savefig(buf) img_results['img_0_001'] = buf buf = io.BytesIO() plt.xlim([0, 0.01]) plt.savefig(buf) 
img_results['img_0_01'] = buf buf = io.BytesIO() plt.xlim([0, 0.1]) plt.savefig(buf) img_results['img_0_1'] = buf buf = io.BytesIO() plt.xlim([0, 1]) plt.savefig(buf) img_results['img_1'] = buf buf = io.BytesIO() plt.xlim([1e-6, 1]) plt.xscale('log') plt.savefig(buf) img_results['img_log'] = buf results = {} results['time'] = txt_time results['output'] = store results['validation'] = val_results results['images'] = img_results plt.close() return results
def lazy(*ap, **kp):
    A, K = a + ap, sortedtuple(k.items() + kp.items())
    return state[(A, K)] if (A, K) in state else state.setdefault((A, K), f(*A, **builtins.dict(k.items() + kp.items())))
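# --- Hedged sketch (assumption): one way a memoizing 'lazy' wrapper around closures like
# the ones above could be put together. 'state' and 'sortedtuple' are reconstructed guesses
# based on the closure bodies, not the original implementation.
import __builtin__ as builtins

def lazy(f, *a, **k):
    sortedtuple, state = (lambda items: builtins.tuple(builtins.sorted(items))), {}
    def lazy(*ap, **kp):
        A, K = a + ap, sortedtuple(k.items() + kp.items())
        return state[(A, K)] if (A, K) in state else state.setdefault((A, K), f(*A, **builtins.dict(k.items() + kp.items())))
    return lazy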
def extract_feature_matrix(df_comments, df_thread_groupby): print "START" # Sentence Tokenizer sentencer = SentenceTokenizer() clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle') featureMatrix = np.empty([df_comments.shape[0],25]) feature_dict = dict() for ix, row in df_comments.iterrows(): feature_dict[row['comment_id']] = ix feature_count = 0 for _,row in df_comments.iterrows(): index = feature_dict[row['comment_id']] comm = row['comment_content'].decode('ASCII', 'ignore') tokens = words(comm) unique_tokens = set(tokens) sentences = sentencer.tokenize(comm) featureMatrix[index][3] = len(comm) verb_fr, noun_fr, pronoun_fr = pos_freq(tokens) featureMatrix[index][4] = verb_fr featureMatrix[index][5] = noun_fr featureMatrix[index][6] = pronoun_fr featureMatrix[index][7] = capital_frequency(tokens) featureMatrix[index][8] = sent_frequency(sentences, '?') featureMatrix[index][9] = sent_frequency(sentences, '!') featureMatrix[index][10] = sentence_capital_frequency(sentences) featureMatrix[index][11] = entropy(comm) featureMatrix[index][12] = lexical_diversity(tokens) if len(tokens) == 0: featureMatrix[index][13] = 0 featureMatrix[index][14] = 0 featureMatrix[index][15] = 0 featureMatrix[index][16] = 0 else: spelt_wrong = missing_words(unique_tokens) bad_words_list = swears(unique_tokens) featureMatrix[index][13] = len(spelt_wrong) featureMatrix[index][14] = len(spelt_wrong)/float(len(unique_tokens)) featureMatrix[index][15] = len(bad_words_list) featureMatrix[index][16] = len(bad_words_list)/float(len(unique_tokens)) featureMatrix[index][19] = F_K_score(sentences, tokens) testSet = dict() refWords = make_full_dict(tokens) testSet.update(refWords) probDist = clf.prob_classify(testSet) sentiment = probDist.prob('pos') subj_obj = get_subjectivity(probDist) polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf) featureMatrix[index][22] = sentiment featureMatrix[index][23] = subj_obj featureMatrix[index][24] = polarity_overlap feature_count += 1 if feature_count % 1000 == 0: print feature_count print "DONE" feature_count = 0 # Grouped for _,group in df_thread_groupby: thread_comments = [row['comment_content'] for _,row in group.iterrows()] # Get average time sumTime = 0 count = 0 previous = mktime(group.iloc[0]['date']) first = mktime(group.iloc[0]['date']) # Average length sumLen = 0 thread_tokens = [] # Within Thread for _, row in group.iterrows(): index = feature_dict[row['comment_id']] comm = row['comment_content'].decode('ascii','ignore') tokens = words(comm) sentences = sentencer.tokenize(comm) # Ongoing average time sumTime += mktime(row['date']) - previous count += 1 avgTime = sumTime/float(count) # Ongoing average length sumLen += len(words(row['comment_content'])) avgLen = sumLen/float(count) ###################################################################### # Get chunked sentences for sent in sentences: sent_tokens = words(sent) sent_tokens_tagged = nltk.pos_tag(sent_tokens) chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True) doc = [] for chunk in chunks: if type(chunk) == nltk.Tree: doc.append(' '.join(c[0] for c in chunk.leaves())) else: doc.append(chunk[0]) doc = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1] # The cumulative tokens up to this point thread_tokens += doc ###################################################################### article_tokens = [] article_sentences = sentencer.tokenize(row['article_body']) for sent in article_sentences: sent_tokens = words(sent) 
sent_tokens_tagged = nltk.pos_tag(sent_tokens) chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True) doc = [] for chunk in chunks: if type(chunk) == nltk.Tree: doc.append(' '.join(c[0] for c in chunk.leaves())) else: doc.append(chunk[0]) article_tokens = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1] ###################################################################### featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1)) previous = mktime(row['date']) featureMatrix[index][1] = mktime(row['date']) - first featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1)) featureMatrix[index][17] = np.mean([termf(comm.count(w), tokens) for w in set(tokens)]) featureMatrix[index][18] = tf_idf(comm, thread_comments) featureMatrix[index][20] = onSubForumTopic(tokens, thread_tokens) featureMatrix[index][21] = onSubForumTopic(tokens, article_tokens) feature_count += 1 if feature_count % 1000 == 0: print feature_count return featureMatrix
def __init__(self, filename, empty_user=set()):
    '''
    filename: inits the UBRR data from the input file
    empty_user: skip the reviews by this user (keeps the ratings)
    '''
    self.empty_user = empty_user
    ur_map = dict()
    br_map = dict()
    cnt = 0
    skipped = 0
    # read the file
    if filename.endswith('.gz'):
        f = gzip.open(filename, 'r')
    else:
        f = open(filename, 'r')
    for line in f:
        vals = line.split("\t")
        if len(vals) == 0:
            continue
        u = vals[0]
        b = vals[1]
        r = float(vals[2])
        d = vals[3].strip()
        if u in self.empty_user:
            # we are skipping this review
            d = ''
            skipped += 1
        rev = Review(u, b, r, d)  # review obj
        # store biz -> list of reviews
        if not br_map.has_key(b):
            br_map[b] = []
        br_map[b].append(rev)
        # store user -> list of reviews
        if not ur_map.has_key(u):
            ur_map[u] = []
        ur_map[u].append(rev)
        cnt += 1
    self.biz_map = br_map
    self.user_map = ur_map
    f.close()
    print 'Review Data Manager Initialized with ', cnt, ' reviews'
    print 'Number of skipped users = ', len(self.empty_user)
    print 'Number of skipped reviews = ', skipped
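# --- Hedged illustration of the tab-separated input both review loaders above expect ---
# Four columns per line: user id, item/business id, numeric rating, review text.
# In the (user, item) pair loader further up, the fourth column is passed through
# self._int_list, so there it is presumably a whitespace-separated list of integer ids
# rather than raw text. File name, rows and the manager class name are illustrative only.
if __name__ == '__main__':
    with open('reviews_example.tsv', 'w') as handle:
        handle.write('u1\tb9\t4.0\tgreat coffee and friendly staff\n')
        handle.write('u2\tb9\t2.5\ttoo crowded on weekends\n')
    # manager = ReviewDataManager('reviews_example.tsv')   # hypothetical class name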