def __init__(self, section='imsto'):
    """engine: mongodb(default), s3"""
    self.section = section
    self._config = Config()
    self.engine = self.get_config('engine')
    self.fs_prefix = self.get_config('fs_prefix')
    print 'init section: {self.section}, engine: {self.engine}, fs_prefix: {self.fs_prefix}'.format(self=self)

def load_imsto(section='imsto'):
    config = Config()
    engine = config.get('engine', section)
    print 'loading {} engine: {}'.format(section, engine)
    if engine == 'mongodb':
        return StoreEngineGridFs(section)
    if engine == 's3':
        return StoreEngineS3(section)
    if engine == 'weedfs':
        return StoreEngineWeedFs(section)
    raise ValueError('bad engine_code')

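# Hedged usage sketch (not part of the original source): load_imsto is a
# factory that dispatches on the configured engine and returns one of the
# StoreEngine* backends, all of which share the StoreBase interface shown
# later in this file. The section name and limit below are illustrative only.
def _demo_load_imsto():
    store = load_imsto('imsto')
    # browse() is defined on StoreBase; only_items=True skips the paging dict
    return store.browse(limit=10, only_items=True)
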
def compute_subgraph_metrics(dataset, n_jobs, limit):
    print("--- Subgraph Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("---------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/subgraph_metrics.csv"
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [compute_metrics(json_fpath) for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(compute_metrics)(json_fpath) for json_fpath in json_fpaths
        )
    print("Output:", len(metrics))

    # output to csv
    print("Outputting tree metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)
    print("Done!")

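# Hedged sketch (helper not shown in this section): write_dicts_to_csv is
# called here and in the user/toxicity metrics functions below with a list of
# metric dicts and an output path. A minimal implementation consistent with
# those call sites might look like this; the exact behavior of the real
# helper is an assumption.
import csv

def write_dicts_to_csv(dicts, fpath):
    """Write a list of dicts to CSV, using the first dict's keys as header."""
    if not dicts:
        return
    with open(fpath, "w", newline="") as fout:
        writer = csv.DictWriter(fout, fieldnames=list(dicts[0].keys()))
        writer.writeheader()
        writer.writerows(dicts)
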
def load_data(ds_name, prefix, outcome, selected_feature_sets):
    conf = Config()
    ds_x_path = f"{conf.modeling_dir}/prefix/datasets/{ds_name}_p{prefix}.pkl.gz"
    ds_y_path = f"{conf.modeling_dir}/prefix/datasets/{ds_name}_labels.pkl.gz"
    ds_x = pickle.load(gzip.open(ds_x_path))
    ds_y = pickle.load(gzip.open(ds_y_path))

    col_idxs = []
    row_idxs = []
    ys = []
    meta = []
    feature_names = []

    # select columns
    for idx, feature_pair in enumerate(ds_x["feature_set_name_pairs"]):
        if feature_pair[0] in selected_feature_sets:
            col_idxs.append(idx)
            feature_names.append(feature_pair)

    # fetch ys & metadata
    y_key = f"p{prefix}__{outcome}"
    for idx, root_tweet_id in enumerate(ds_x["root_tweet_ids"]):
        # NB: this can happen only for prefix=10,
        # as some convs may have < 2*p tweets
        if root_tweet_id not in ds_y:
            continue
        if y_key in ds_y[root_tweet_id]:
            conv_dict = ds_y[root_tweet_id]
            row_idxs.append(idx)
            y = conv_dict[y_key]
            ys.append(float(y))
            meta.append({
                "root_tweet_id": conv_dict["root_tweet_id"],
                "root_tweet_type": conv_dict["root_tweet_type"],
                "n": conv_dict["n"],
                "pre_n_tox": conv_dict[f"p{prefix}_pre_n_tox"],
                "suf_n": conv_dict[f"p{prefix}_suf_n"],
                "suf_i_tox": conv_dict[f"p{prefix}_suf_i_tox"],
                "suf_f_tox": conv_dict[f"p{prefix}_suf_f_tox"],
            })

    # prepare numpy objs
    X = ds_x["X"]
    X = X[:, col_idxs]
    X = X[row_idxs, :]
    ys = np.array(ys)
    assert X.shape[0] == ys.shape[0]
    return X, ys, meta, feature_names

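# Hedged usage sketch (all argument values are hypothetical, not from the
# source): given the y_key construction above, `outcome` composes with the
# prefix as f"p{prefix}__{outcome}", e.g. "p20__suf_f_tox". The dataset name
# and feature-set names below are made up for illustration.
def _demo_load_data():
    return load_data(
        ds_name="sample",                         # hypothetical dataset name
        prefix=20,                                # first 20 tweets per conversation
        outcome="suf_f_tox",                      # assumed label suffix (see meta keys)
        selected_feature_sets={"graph", "user"},  # hypothetical feature set names
    )
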
def compute_dyad_metrics(dataset, n_jobs, limit):
    # hard-coding some settings
    toxicity_threshold = 0.531
    splits_only = False
    skip_root = True

    print("--- Dyad Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Toxicity threshold: {toxicity_threshold}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/dyad_metrics.csv"
    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [
            process_conversation(json_fpath, toxicity_threshold, splits_only, skip_root)
            for json_fpath in tqdm(json_fpaths)
        ]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(process_conversation)(
                json_fpath, toxicity_threshold, splits_only, skip_root
            )
            for json_fpath in json_fpaths
        )

    # flatten the results
    metrics = list(itertools.chain.from_iterable(metrics))
    print(len(metrics))

    # output to CSV
    fields = [
        "root_tweet_id",
        "parent_tox", "parent_n_friends", "parent_n_followers",
        "child_tox", "child_n_friends", "child_n_followers",
        "dyad_type", "dyad_n_common_friends",
    ]
    with open(output_fpath, "w") as fout:
        writer = csv.writer(fout)
        writer.writerow(fields)
        writer.writerows(metrics)
    print("Done!")

def compute_prefix_metrics(dataset, n_jobs=1, limit=None):
    prefixes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    print("--- Prefix Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print(f"Prefixes: {prefixes}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.json.gz"
    output_pickle_fpath = f"{conf.data_root}/prefix_metrics/{dataset}.pkl.gz"
    json_fpaths = json_paths_iter(conf.conversations_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [conversation_prefix_metrics(json_fpath, prefixes)
                   for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(conversation_prefix_metrics)(json_fpath, prefixes)
            for json_fpath in json_fpaths
        )
    print(f"Metrics total: {len(metrics)}")

    # skip empty results
    metrics = [m for m in metrics if len(m) > 0]
    print(f"Metrics non-zero: {len(metrics)}")

    # pickle
    with gzip.open(output_pickle_fpath, "wb") as fout:
        pickle.dump(metrics, fout, protocol=4)

    # uJSON complains: cast numpy ints/floats to python ints/floats
    # (the sanitized values must be written back into the dict,
    # otherwise the cast is silently lost)
    for conv_metrics in metrics:
        for prefix_n, prefix_metrics in conv_metrics.items():
            if prefix_n != "root_tweet_id":
                for group_name, group_values in prefix_metrics.items():
                    if group_values is not None:
                        prefix_metrics[group_name] = sanitize_numpy_types(group_values)

    # output metrics to JSON
    print("Outputting results to JSON ...")
    with gzip.open(output_fpath, "wt") as fout:
        json.dump(metrics, fout)
    print("Done!")

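# Hedged sketch (helper not defined in this section): judging from the
# comment above, sanitize_numpy_types most likely casts numpy scalars to
# native Python types so ujson can serialize them. One plausible
# implementation, assuming the group values are flat dicts:
import numpy as np

def sanitize_numpy_types(group_values):
    out = {}
    for key, val in group_values.items():
        if isinstance(val, np.integer):
            out[key] = int(val)
        elif isinstance(val, np.floating):
            out[key] = float(val)
        else:
            out[key] = val
    return out
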
def __init__(self, device, user, debug=False):
    from _config import Config
    from _group import Group
    from _light import Light
    from _schedule import Schedule

    self.config = Config(device, user, debug)
    self.group = Group(device, user, debug)
    self.light = Light(device, user, debug)
    self.schedule = Schedule(device, user, debug)

def compute_user_metrics(dataset, n_jobs=1, limit=None):
    print("--- User Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    toxicity_threshold = 0.531
    conf = Config(dataset)
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir, limit=limit)

    # all_user_conv_stats = [
    #     compute_user_conversation_stats(json_fpath, toxicity_threshold)
    #     for json_fpath in json_fpaths]
    parallel = Parallel(n_jobs=n_jobs, verbose=10)
    all_user_conv_stats = parallel(
        delayed(compute_user_conversation_stats)(json_fpath, toxicity_threshold)
        for json_fpath in json_fpaths
    )

    print("Aggregating user metrics ...")
    user_stats = agg_user_stats(all_user_conv_stats)
    user_stats_csv = [{"user_id": u_id, **u_stats}
                      for u_id, u_stats in user_stats.items()]

    # out_json_fpath = f"{conf.data_root}/user_metrics.json.gz"
    # json.dump(user_stats, gzip.open(out_json_fpath, "wt"), indent=2)
    out_csv_fpath = f"{conf.data_root}/user_metrics.csv"
    write_dicts_to_csv(user_stats_csv, out_csv_fpath)
    print("Done!")

def repos_to_csv(repos_by_lang, page_num):
    repo_issue_content_list = []
    for index, repo in enumerate(repos_by_lang):
        # keep only the basic numerical fields of the repo
        repos_by_lang[index] = py_.pick(
            repo, 'full_name', 'forks_count', 'open_issues_count', 'watchers_count')
        # split full name into ['owner', 'repository name']
        repo_name = repo['full_name']
        repo_owner_name_list = repo_name.split('/')
        issue_list = GetIssueContent(
            repo_owner_name_list[0], repo_owner_name_list[1]).get_issue_content()[0:2]
        clean_issue_list = '[[[[[Next]]]]]'.join(map(str, issue_list))
        repo_issue_content_list.append(clean_issue_list)
        # add star count and merge into the existing dictionary
        star_count = {"star_count": GetStarCountsByRepo(repo['full_name']).get()}
        repos_by_lang[index] = py_.merge(repos_by_lang[index], star_count)

    # NB: 'comment_count' is never set above, so this column stays empty
    # unless the key already exists on the incoming repo dicts
    pd_format_dic = {
        'full_name': py_.pluck(repos_by_lang, 'full_name'),
        'forks_count': py_.pluck(repos_by_lang, 'forks_count'),
        'open_issues_count': py_.pluck(repos_by_lang, 'open_issues_count'),
        'watchers_count': py_.pluck(repos_by_lang, 'watchers_count'),
        'comment_count': py_.pluck(repos_by_lang, 'comment_count'),
        'star_count': py_.pluck(repos_by_lang, 'star_count'),
        'issue_content': repo_issue_content_list,
    }
    # print(pd_format_dic)
    df = pd.DataFrame.from_dict(pd_format_dic)
    file_name = Config().get_search_setting()['lang'].split(':')[1]
    df.to_csv(f'../data/{file_name}_github_{page_num}.csv')
    print(f'Saving {file_name}_github_{page_num} to csv finished!!')

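# Hedged illustration (not from the source): the py_ helpers used above are
# pydash functions. The import style of the original is not shown; this demo
# assumes `import pydash as py_`, and note that pluck is an older pydash API
# (newer versions replace it with map_).
def _demo_pydash():
    import pydash as py_
    repo = {'full_name': 'octocat/hello', 'forks_count': 1, 'extra': True}
    picked = py_.pick(repo, 'full_name', 'forks_count')  # subset of keys
    merged = py_.merge(picked, {'star_count': 42})       # add a key
    names = py_.pluck([merged], 'full_name')             # ['octocat/hello']
    return picked, merged, names
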
def compute_toxicity_metrics(dataset, n_jobs=1, limit=None):
    print("--- Toxicity Metrics ---")
    print(f"Dataset: {dataset}")
    print(f"Num Jobs: {n_jobs}")
    print(f"Limit: {limit}")
    print("----------------------------")

    # paths
    conf = Config(dataset)
    output_fpath = f"{conf.data_root}/toxicity.csv"

    # iterator
    json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir, limit=limit)

    # compute metrics
    print("Computing metrics ...")
    if n_jobs == 1:
        metrics = [toxicity_metrics(json_fpath) for json_fpath in tqdm(json_fpaths)]
    else:
        parallel = Parallel(n_jobs=n_jobs, verbose=10)
        metrics = parallel(
            delayed(toxicity_metrics)(json_fpath) for json_fpath in json_fpaths
        )
    print("Metrics computed:", len(metrics))

    print("Outputting metrics to CSV ...")
    write_dicts_to_csv(metrics, output_fpath)
    print("Done!")

def __init__(self, spectrum):
    self.config = Config()
    self.spectrum = spectrum
    self.spectrum.setCalibration(self.config.calibration)

class Control(object):

    def __init__(self, app):
        # Data objects
        self.app = app
        self.stories = []
        self.index = 0
        self.historyEnabled = False
        self.printCount = 0
        self.category = 'sequence'
        self.saveDir = os.path.expanduser("~") + '/Stories/'
        self.verticalPanePosition = 150
        self.startingUpApp = True
        self.sequenceVisible = False
        self.testingTags = True
        self.updateNamesGlobally = True
        self.eventManager = _event.EventManager(self)
        self.windowPosition = (0, 0)
        self.addWordPath = os.path.expanduser('~') + '/.config/diego/addedWords'
        self.removeWordPath = os.path.expanduser('~') + '/.config/diego/removeWords'
        self.historyDir = None
        self.config = Config(self)
        self.preferences = Preferences(self)
        self.state = State(self)
        self.copyClipboard = _clipboard.Clipboard(self)
        self.selectionClipboard = _clipboard.Clipboard(self)

        # View objects
        self.app.window = _appWindow.Window(self)
        self.appBox = AppBox(self)
        self.panelLabel = Gtk.Label()
        self.indexView = IndexView(self)
        self.pageItemBox = PageItemBox(self)
        self.sceneItemBox = SceneItemBox(self)
        self.sequenceItemBox = SequenceItemBox(self)
        self.storyItemBox = StoryItemBox(self)
        self.scriptView = ScriptView(self)
        self.appHeaderBar = Gtk.HeaderBar()
        self.headerBar = Gtk.HeaderBar()
        self.pathLabel = Gtk.Label()
        self.searchView = _search.View(self)
        self.doMarkSetIndexUpdate = True
        self.trie = None
        self.auxTries = []
        self.mispelledLine = None
        self.addTrie = None
        self.removeTrie = None
        self.clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
        self.scriptViewPanedPosition = 0
        self.settingPanedWithEsc = False

    def mispelledTimer(self):
        GObject.timeout_add(1000, self.removeMispelledTags)

    def removeMispelledTags(self):
        if self.mispelledLine:
            try:
                index = self.scriptView.lines.index(self.mispelledLine)
            except ValueError:
                self.mispelledLine = None
            else:
                if self.scriptView.textView.completion is None:
                    self.scriptView.textView.updateLineTag(index)
                    self.mispelledLine = None

    def wordMispelled(self, word):
        word = unicode(word)

        # If it's a name, it's not misspelled.
        if word in self.currentStory().names:
            return False

        # If a word comes in all lowercase, it must be in the dict as all
        # lowercase or it's misspelled.
        allLower = True
        for c in word:
            if not c.islower():
                allLower = False
        if allLower:
            if word not in self.trie and word not in self.addTrie:
                return True
            if word in self.removeTrie:
                return True
            return False

        # The dict does not contain uppercase versions of words, so the
        # capitalized version is checked. All-caps words are checked as well,
        # for screenplay character names, locations and times.
        lower = word.lower()
        capitalized = word[0].upper()
        if len(word) > 1:
            capitalized += word[1:].lower()
        lower = unicode(lower)
        capitalized = unicode(capitalized)
        notInTrie = word not in self.trie and lower not in self.trie and capitalized not in self.trie
        notInAddTrie = word not in self.addTrie and lower not in self.addTrie and capitalized not in self.addTrie
        if notInTrie and notInAddTrie:
            return True
        if word in self.removeTrie or lower in self.removeTrie or capitalized in self.removeTrie:
            return True
        return False

    def notImplemented(self):
        print "not implemented"

    def p(self, *args):
        self.printCount += 1
        print self.printCount, args
        raiseException = 0
        if raiseException == self.printCount:
            raise Exception()
        return self.printCount

    def load(self, data=True):
        self.historyEnabled = False
        if data:
            self.config.load()
            self.preferences.load()
            # self.state.load()  # moved to app init
            for story in self.stories:
                story.loadId()
            for story in self.stories:
                story.load()
            self.scriptView.updateTitles()
        self.appBox.load()
        self.indexView.load()
        self.storyItemBox.load()
        self.sequenceItemBox.load()
        self.sceneItemBox.load()
        self.pageItemBox.load()
        self.scriptView.load()
        self.app.updateWindowTitle()
        for story in self.stories:
            self.scriptView.updateTitles(story)
        self.app.window.show_all()
        self.startingUpApp = False
        self.appBox.paned.set_position(self.verticalPanePosition)
        self.historyEnabled = True

    def reset(self, data=True):
        if data:
            self.config.reset()
            self.preferences.reset()
            self.stories = []
            self.index = 0
            self.historyEnabled = False
            self.state.reset()
        self.appBox.reset()
        self.indexView.reset()
        self.pageItemBox.reset()
        self.sceneItemBox.reset()
        self.sequenceItemBox.reset()
        self.storyItemBox.reset()
        self.scriptView.reset()

    def currentStory(self):
        if len(self.stories):
            return self.stories[self.index]
        else:
            return None

    def currentIssue(self):
        pass

    def currentSequence(self):
        return self.currentStory().currentSequence()

    def currentScene(self):
        return self.currentStory().currentScene()

    def currentPage(self):
        return self.currentStory().currentPage()

    def currentLine(self):
        return self.currentStory().currentLine()

    def currentPanel(self):
        cp = self.currentPage()
        cl = self.currentLine()
        panelNumber = 0
        for line in cp.lines:
            if line.tag == 'description':
                panelNumber += 1
            if line == cl:
                break
        return panelNumber

    def storyPaths(self):
        return [story.path for story in self.stories]

    def newStory(self):
        for s in self.stories:
            if not s.saved:
                s.save(pdf=False, rtf=False)
        story = Story(self)
        self.index += 1
        story.createId()
        story.makeHistoryDir()
        story.load()
        self.stories.insert(self.index, story)
        self.reset(data=False)
        self.scriptView.updateTitles()
        self.load(data=False)
        self.storyItemBox.listbox.get_row_at_index(self.index).grab_focus()
        self.storyItemBox.listbox.select_row(self.storyItemBox.listbox.get_row_at_index(self.index))

    def uniqueStoryId(self):
        chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
        id = [list(chars), list(chars), list(chars), list(chars)]
        for item in id:
            random.shuffle(item)
        id = ["".join(item)[:6] for item in id]
        id = "-".join(id)
        return id

    def newSequence(self, prepend=False):
        self.currentStory().newSequence(prepend)
        self.reset(data=False)
        self.scriptView.updateTitles()
        self.load(data=False)
        index = self.currentStory().index.sequence
        self.sequenceItemBox.listbox.get_row_at_index(index).grab_focus()
        self.sequenceItemBox.listbox.select_row(self.sequenceItemBox.listbox.get_row_at_index(index))

    def newScene(self, prepend=False):
        self.currentStory().newScene(prepend)
        self.reset(data=False)
        self.scriptView.updateTitles()
        self.load(data=False)
        index = self.currentStory().index.scene
        self.sceneItemBox.listbox.get_row_at_index(index).grab_focus()
        self.sceneItemBox.listbox.select_row(self.sceneItemBox.listbox.get_row_at_index(index))

    def newPage(self, prepend=False):
        self.currentStory().newPage(prepend)
        self.reset(data=False)
        self.scriptView.updateTitles()
        self.load(data=False)
        index = self.currentStory().index.page
        self.pageItemBox.listbox.get_row_at_index(index).grab_focus()
        self.pageItemBox.listbox.select_row(self.pageItemBox.listbox.get_row_at_index(index))

    def newPanel(self):
        self.currentStory().newPanel()
        self.reset(data=False)
        self.load(data=False)

    def newDialog(self):
        self.currentStory().newDialog()
        self.reset(data=False)
        self.load(data=False)

    def moveCurrentSequenceUp(self):
        self.p('mcsu')

    def moveCurrentSequenceDown(self):
        self.p('mcsd')

    def moveCurrentSceneUp(self):
        item = self.sceneItemBox.getSelectedItem()
        itemIndex = self.sceneItemBox.scenes.index(item)
        if itemIndex == 0:
            return
        scene = self.currentStory().sequences[0].scenes.pop(itemIndex)
        self.currentStory().sequences[0].scenes.insert(itemIndex - 1, scene)
        self.sceneItemBox.reset()
        self.sceneItemBox.load()
        self.sceneItemBox.loadSceneAtIndex(itemIndex - 1)

    def moveCurrentSceneDown(self):
        item = self.sceneItemBox.getSelectedItem()
        itemIndex = self.sceneItemBox.scenes.index(item)
        if itemIndex == len(self.sceneItemBox.scenes) - 1:
            return
        scene = self.currentStory().sequences[0].scenes.pop(itemIndex)
        self.currentStory().sequences[0].scenes.insert(itemIndex + 1, scene)
        self.sceneItemBox.reset()
        self.sceneItemBox.load()
        self.sceneItemBox.loadSceneAtIndex(itemIndex + 1)

    def moveCurrentPageUp(self):
        item = self.pageItemBox.getSelectedItem()
        itemIndex = self.pageItemBox.pages.index(item)
        if itemIndex == 0:
            return
        page = self.currentScene().pages.pop(itemIndex)
        self.currentScene().pages.insert(itemIndex - 1, page)
        self.pageItemBox.updateNumberated()
        self.pageItemBox.loadPageAtIndex(itemIndex - 1)

    def moveCurrentPageDown(self):
        item = self.pageItemBox.getSelectedItem()
        itemIndex = self.pageItemBox.pages.index(item)
        if itemIndex == len(self.pageItemBox.pages) - 1:
            return
        page = self.currentScene().pages.pop(itemIndex)
        self.currentScene().pages.insert(itemIndex + 1, page)
        self.pageItemBox.updateNumberated()
        self.pageItemBox.loadPageAtIndex(itemIndex + 1)

    def updateHistoryColor(self):
        val = 0.94
        selectColor = Gdk.RGBA(0.75, 0.75, 0.85, 1.0)
        forground = Gdk.RGBA(0.0, 0.0, 0.0, 1.0)
        currentScene = self.currentScene()
        if currentScene.undoIndex > 0:
            if currentScene.saveIndex <= 0:
                color = Gdk.RGBA(0.90, 0.90, 1.0, 1.0)
            else:
                color = Gdk.RGBA(val, val, val, 1.0)
            self.scriptView.textView.modify_bg(Gtk.StateType.NORMAL, color.to_color())
            self.scriptView.textView.modify_bg(Gtk.StateType.SELECTED, selectColor.to_color())
            self.scriptView.textView.modify_fg(Gtk.StateType.SELECTED, forground.to_color())
        # New events.
        else:
            color = Gdk.RGBA(1.0, 1.0, 1.0, 1.0)
            self.scriptView.textView.modify_bg(Gtk.StateType.NORMAL, color.to_color())
            self.scriptView.textView.modify_bg(Gtk.StateType.SELECTED, selectColor.to_color())
            self.scriptView.textView.modify_fg(Gtk.StateType.SELECTED, forground.to_color())
        self.scriptView.textView.descriptionTag.props.background_rgba = color
        self.scriptView.textView.characterTag.props.background_rgba = color
        self.scriptView.textView.dialogTag.props.background_rgba = color
        self.scriptView.textView.parentheticTag.props.background_rgba = color
        self.scriptView.textView.sceneHeadingTag.props.background_rgba = color
        for he in self.scriptView.headingEntries:
            he.modify_bg(Gtk.StateType.NORMAL, color.to_color())

    def scroll(self, line, offset=0):
        if len(self.scriptView.lines) - 1 < line:
            return
        lineIndex = self.scriptView.lines.index(line)
        lineIter = self.scriptView.textView.buffer.get_iter_at_line(lineIndex)
        lineIter.forward_chars(offset)
        self.scriptView.textView.scroll_to_iter(lineIter, 0.1, False, 0.0, 0.0)
        self.scriptView.textView.buffer.place_cursor(lineIter)
        self.scriptView.textView.grab_focus()

    def timedScroll(self, line, offset, time=250):
        GObject.timeout_add(time, self.scroll, line, offset)

    def selectionOffsets(self):
        bounds = self.scriptView.textView.buffer.get_selection_bounds()
        if len(bounds):
            startIter, endIter = bounds
            # Do not allow selection of the zero-space char at end of buffer.
            if self.scriptView.textView.endIter().get_offset() == endIter.get_offset():
                endIter.backward_char()
            return startIter.get_offset(), endIter.get_offset()
        return None, None

    def select(self, startOffset, endOffset):
        selectionStartIter = self.scriptView.textView.buffer.get_iter_at_offset(startOffset)
        selectionEndIter = self.scriptView.textView.buffer.get_iter_at_offset(endOffset)
        self.scriptView.textView.buffer.select_range(selectionStartIter, selectionEndIter)

    def timedSelect(self, startOffset, endOffset, time=250):
        GObject.timeout_add(time, self.select, startOffset, endOffset)

class StoreBase:
    engine = None
    _db = None
    _fs = None
    _coll = None

    def __init__(self, section='imsto'):
        """engine: mongodb(default), s3"""
        self.section = section
        self._config = Config()
        self.engine = self.get_config('engine')
        self.fs_prefix = self.get_config('fs_prefix')
        print 'init section: {self.section}, engine: {self.engine}, fs_prefix: {self.fs_prefix}'.format(self=self)

    def get_config(self, key):
        return self._config.get(key, self.section)

    def browse(self, limit=20, start=0, sort=None, only_items=False):
        """retrieve files from mongodb for gallery"""
        # return fs().list()
        if sort is None or not isinstance(sort, list):
            sort = [('uploadDate', DESCENDING)]
        cursor = self.collection.find(limit=limit, skip=start, sort=sort)
        items = [StoreItem(self, item) for item in cursor]
        if only_items:
            return items
        url_prefix = urljoin(self.get_config('url_prefix'),
                             self.get_config('thumb_path'))
        return {
            'items': items,
            'total': cursor.count(),
            'url_prefix': url_prefix + '/'
        }

    def count(self):
        return self.collection.count()

    # def __iter__(self):
    #     self.__cursor = self.collection.find(limit=0, skip=0, sort=[('uploadDate', DESCENDING)])
    #     return self

    # def next(self):
    #     if self.__cursor:
    #         return StoreItem(self, self.__cursor.next())
    #     raise StopIteration

    def store(self, file=None, content=None, ctype=None, **kwd):
        """save a file-like item"""
        if content is None and not hasattr(file, 'read'):
            raise TypeError('invalid file-like object')
        data = content if content is not None else file.read()
        size = len(data)
        ext = guessImageType(data[:32])
        if ext is None:
            raise ValueError('invalid image file')

        hashes = [md5(data).hexdigest()]
        _exists_id = self.exists(hashed=hashes[0])
        if _exists_id:
            id = _exists_id
            filename = _make_filename(id, ext)
            print('id {} or hash {} exists!!'.format(id, hashes[0]))
            # raise DuplicateError('already exists')
            return [True, id, filename]

        ids = [_make_id(hashes[0])]
        if 'id' in kwd and kwd['id'] and kwd['id'] not in ids:
            ids += [kwd['id']]

        from image import SimpImage, MIN_QUALITY

        max_file_size = int(self.get_config('max_file_size'))
        max_jpeg_quality = int(self.get_config('max_jpeg_quality'))
        max_width = int(self.get_config('max_width'))
        max_height = int(self.get_config('max_height'))

        if size > max_file_size:
            max_jpeg_quality -= 1
        if max_jpeg_quality < MIN_QUALITY:
            max_jpeg_quality = MIN_QUALITY

        im = SimpImage(blob=data)
        meta = im.meta
        if meta['width'] > max_width or meta['height'] > max_height:
            if self.get_config('auto_scale') and im.thumbnail(max_width, max_height):
                if im.format == 'JPEG' and im.quality > max_jpeg_quality:
                    im.quality = max_jpeg_quality
                data = im.get_blob()
                size = len(data)
                print im.meta
                print 'new scaled size {}'.format(size)
                hashes += [md5(data).hexdigest()]
            else:
                raise ValueError(
                    'file: {} dimension {}x{} is too big, max is {}x{}'.format(
                        kwd['name'] if 'name' in kwd else '',
                        meta['width'], meta['height'], max_width, max_height))

        if im.format == 'JPEG':
            if im.quality > max_jpeg_quality:
                print 'quality {} is too high, hash {}'.format(im.quality, hashes[0])
                from tempfile import NamedTemporaryFile
                _tmp = NamedTemporaryFile('w+b', dir=self.get_config('temp_root'),
                                          delete=False)
                _tmp.file.close()
                save_file(_tmp.name, blob=data)
                if jpegoptim(_tmp.name):
                    fp = open(_tmp.name)
                    data = fp.read()
                    size = len(data)
                    # print 'new optimized size {}'.format(size)
                    fp.close()
                    os.unlink(_tmp.name)
                    del im
                    im = SimpImage(blob=data)
                    meta = im.meta
                    hashes += [md5(data).hexdigest()]
                else:
                    raise EnvironmentError('jpeg quality is too high, or need jpegoptim')
        elif im.format == 'PNG' and self.get_config('force_jpeg'):
            im.format = 'JPEG'
            im.quality = max_jpeg_quality
            data = im.get_blob()
            size = len(data)
            hashes += [md5(data).hexdigest()]
            ext = 'jpg'
            meta = im.meta

        del im

        if size > max_file_size:
            raise ValueError('file: {} size {} is too big, max is {}'.format(
                kwd['name'] if 'name' in kwd else '', size, max_file_size))

        hashed = hashes[-1]
        # TODO: add support for a (md5 + size) id
        id = _make_id(hashed)
        # TODO: fix to support s3 front browse
        _exists_id = self.exists(id) or self.exists(hashed=hashed)
        if _exists_id:
            id = _exists_id
            filename = _make_filename(id, ext)
            print('id {} or hash {} exists!!'.format(id, hashed))
            # raise DuplicateError('already exists')
            return [True, id, filename]

        filename = _make_filename(id, ext)
        # if ctype is None or ctype == '':
        from _util import guess_mimetype
        ctype = guess_mimetype(filename)

        # save to mongodb
        spec = {
            '_id': id,
            'filename': filename,
            'hash': hashes,
            'mime': ctype,
            'size': size,
            'meta': meta,
            'ids': ids
        }
        if 'name' in kwd and isinstance(kwd['name'], (str, unicode)):
            spec['name'] = kwd['name']
        for k in ['created', 'app_id']:
            if k in kwd and kwd[k]:
                spec[k] = kwd[k]
        if self._store_exists(id, filename=filename):
            self._save_meta(id, spec)
            return [True, id, filename]
        rr = self._put(data, **spec)
        if rr:
            return [True, rr, filename]

    def get_meta(self, id=None, filename=None, ids=None):
        spec = None
        if id:
            spec = id
        elif filename:
            spec = {'filename': filename}
        elif ids and isinstance(ids, list):
            spec = {'ids': {'$in': ids}}
        if spec:
            print 'spec %s' % spec
            item = self.collection.find_one(spec)
            if item:
                return StoreItem(self, item)

    def _save_meta(self, id, spec):
        '''mongo special meta data'''
        if 'created' not in spec:
            spec['created'] = datetime.datetime.utcnow()
        if 'filename' not in spec:
            print spec
            raise ValueError('need filename')
        return self.collection.update({'_id': id}, spec, upsert=True)

    def delete(self, id):
        raise NotImplementedError()

    def _get(self, id):
        raise NotImplementedError()

    def _put(self, data, **spec):
        raise NotImplementedError()

    def _store_exists(self, id=None, *args, **kwargs):
        raise NotImplementedError()

    def exists(self, id=None, hashed=None, filename=None, *args, **kwargs):
        """check special hash value

        TODO: more args"""
        if id and self.collection.find_one({"_id": id}):
            return id
        if hashed:
            doc = self.collection.find_one({'md5': hashed})
            if doc:
                return doc['_id']
            doc = self.collection.find_one({'hash': {'$in': [hashed]}})
            if doc:
                return doc['_id']
        if filename:
            doc = self.collection.find_one({'filename': filename})
            if doc:
                return doc['_id']
        if self._store_exists(id, hashed=hashed, filename=filename, *args, **kwargs):
            return id

    @property
    def db(self):
        if self._db is None:
            self._db = get_mongo_db(self.get_config('servers'),
                                    self.get_config('db_name'),
                                    self.get_config('replica_set'))
        return self._db

    @property
    def collection(self):
        if self._coll is None:
            cn = '{0}.files'.format(self.fs_prefix)
            self._coll = self.db[cn]
        return self._coll

    def close(self):
        """close db connection"""
        if self.db is not None:
            self.db.connection.disconnect()

    def load(self, path):
        """load from url path"""
        image_url_regex = (r'(?P<size>[scwh]\d{2,4}(?P<x>x\d{2,4})?|orig)(?P<mop>[a-z])?'
                           r'/(?P<t1>[a-z0-9]{2})/(?P<t2>[a-z0-9]{2})'
                           r'/(?P<t3>[a-z0-9]{19,36})\.(?P<ext>gif|jpg|jpeg|png)$')
        match = re.search(image_url_regex, path)
        if match is None:
            raise UrlError('invalid path')
        ids = match.groupdict()
        id = '{t1}{t2}{t3}'.format(**ids)
        THUMB_ROOT = self.get_config('thumb_root').rstrip('/')
        SUPPORTED_SIZE = self.get_config('support_size').split(',')
        org_path = '{t1}/{t2}/{t3}.{ext}'.format(**ids)
        org_file = '{0}/orig/{1}'.format(THUMB_ROOT, org_path)

        if not os.path.exists(org_file):
            # check old id for redirect
            doc = self.get_meta(ids=[id])
            if doc and doc['id'] != id and 'filename' in doc:
                print 'found %s' % doc['filename']
                thumb_path = self.get_config('thumb_path')
                new_path = '{}/{}/{}'.format(thumb_path, ids['size'], doc['filename'])
                raise HttpFound('found', path=new_path)
            print('fetching file: {}'.format(org_path))
            file = self.fetch(id, path=org_path)
            if file is None:
                print('fetch failed')
                raise UrlError('id {} not found'.format(id))
            save_file(org_file, file)

        if not os.path.exists(org_file):
            raise UrlError('file not found')

        # start thumbnail image
        if ids['size'] == 'orig':
            dst_path = 'orig/{}'.format(org_path)
            dst_file = org_file
        else:
            dst_path = '{0}/{1}'.format(ids['size'], org_path)
            dst_file = '{0}/{1}'.format(THUMB_ROOT, dst_path)
            mode = ids['size'][0]
            dimension = ids['size'][1:]
            if dimension not in SUPPORTED_SIZE:
                raise UrlError('unsupported size')
            if ids['x'] is None:
                size = int(dimension)
                width, height = size, size
            else:
                width, height = map(int, dimension.split('x'))
            if not os.path.exists(dst_file):
                print('start thumbnail image {} {} => {}x{}'.format(
                    mode, dimension, width, height))
                thumb_image(org_file, width, height, dst_file, mode)
            if ids['mop'] == 'w' and width < 100:
                raise UrlError('bad size')

        if ids['mop'] is not None:
            if ids['mop'] == 'w':  # watermark modifier
                org_file = '{}/{}/{}'.format(THUMB_ROOT, ids['size'], org_path)
                dst_file = '{}/{}{}/{}'.format(THUMB_ROOT, ids['size'], ids['mop'], org_path)
                if watermark_image(org_file, dst_file):
                    dst_path = '{}{}/{}'.format(ids['size'], ids['mop'], org_path)
            else:
                raise UrlError('bad modifier')

        return (dst_file, dst_path)

    def fetch(self, id, path):
        key = path if self.engine == 's3' else id
        return self._get(key)

    def url(self, path, size='orig'):
        url_prefix = self.get_config('url_prefix')
        thumb_path = self.get_config('thumb_path')
        return '{}/{}/{}/{}'.format(url_prefix.rstrip('/'),
                                    thumb_path.strip('/'), size, path)

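# Hedged illustration (not from the original source): the image_url_regex in
# StoreBase.load() parses thumbnail paths of the form
# "<size><mop?>/<t1>/<t2>/<t3>.<ext>". A quick check with a made-up path:
def _demo_image_url_regex():
    import re
    pattern = (r'(?P<size>[scwh]\d{2,4}(?P<x>x\d{2,4})?|orig)(?P<mop>[a-z])?'
               r'/(?P<t1>[a-z0-9]{2})/(?P<t2>[a-z0-9]{2})'
               r'/(?P<t3>[a-z0-9]{19,36})\.(?P<ext>gif|jpg|jpeg|png)$')
    m = re.search(pattern, 's120/ab/cd/0123456789abcdefghi.jpg')
    # groups: size='s120', t1='ab', t2='cd', t3='0123456789abcdefghi',
    # ext='jpg'; x and mop are None, and the id becomes t1 + t2 + t3
    return m.groupdict()
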
def __init__(self):
    self.conf = Config().get()

""" import time from datetime import datetime as dt import os import tensorflow as tf import matplotlib.pyplot as plt import numpy as np import pandas as pd from numpy import arange, sin, pi, random from sklearn.preprocessing import MinMaxScaler, StandardScaler from sklearn.model_selection import train_test_split from math import sqrt from _config import Config # Global hyper-parameters config = Config("config.yaml") sequence_length = 100 random_data_dup = 10 # each sample randomly duplicated between 0 and 9 times, see dropin function mse_threshold = 0.1 # anomaly MSE threshold def read_data(input_file): '''Read the input data file into a pandas dataframe Arguments --------- input_file : str Name of input csv file (ensure header is first row) Returns