def main_pkl(f_name, out_fname):
    """ Converts the twitter data to pickle with each element being a
    dictionary with keys handle, text, and time """
    # Read in the file
    fid = open(f_name, 'r')
    out_arr = fid.read().split('\n')
    process_data = []
    # Loop over all the data
    for ele in out_arr:
        twit_split = ele.split('||')
        # Check if the data has the correct format (3 ||)
        if len(twit_split) != 4:
            logging.info('Twitter sample: {}'.format(ele))
            continue
        assert (len(twit_split[-1]) == 0)
        # Convert timestamp and add to process_data
        time_stamp = convert_timestamp(twit_split[-2])
        if time_stamp:
            process_data.append({
                'handle': twit_split[0],
                'text': twit_split[1],
                'time': time_stamp
            })
        else:
            logging.debug('Time Stamp Not Detected: {}'.format(ele))
    save_pickle({'dat': process_data}, out_fname)
    logging.info(
        'Length of raw data: {} process data: {} pickle name: {}'.format(
            len(out_arr), len(process_data), out_fname))

def get_accuracy_set(self, data_set, decoder, prefix):
    data_loader = DataLoader(data_set, batch_size=self.batch_size)
    main_arr = np.array([])
    counter = 0
    self.ac_arr, self.pred_arr = [], []
    for data in data_loader:
        # Finding the predicted label and getting the loss function
        img, label = data
        if self.CUDA_val is True:
            img = Variable(img).cuda()
        else:
            img = Variable(img)
        main_arr = np.concatenate(
            (main_arr, np.array([self.accuracy_func(self.main_model(img), label)])))
        counter += 1
        if counter % 2 == 0:
            logging.debug('Current Accuracy: {}'.format(np.mean(main_arr)))
    logging.info('Total Samples: {} Accuracy: {}'.format(
        data_set.__len__(), np.mean(main_arr)))
    return_dict = {'ac': self.ac_arr, 'pred': self.pred_arr}
    save_pickle(return_dict, prefix + '_stats.pkl')
    if decoder is False:
        return

    # Reconstruct Image
    base_fname = self.dataset_name + "_e" + str(self.epochs) + "_b" + str(self.batch_size) + ".png"
    recons_img_fname = "recon_" + base_fname
    truth_img_fname = "truth_" + base_fname
    decoder = self.main_model.get_decoder()
    output = self.main_model(img)
    recon = decoder(output, label)
    recon_img = recon.view(-1, self.main_model.img_channel,
                           self.main_model.img_width,
                           self.main_model.img_height)  # _, channel, width, height

    # Save Reconstruction and Ground Truth
    torchvision.utils.save_image(recon_img.cpu().data, recons_img_fname)
    torchvision.utils.save_image(img.cpu().data, truth_img_fname)

def disc_info(media, force=False):
    """ return kaa metadata disc information for the media """
    discinfo = mmpython.parse(media.devicename)
    if not discinfo or not discinfo.id:
        # bad disc, e.g. blank disc
        return {}

    cachedir = os.path.join(config.OVERLAY_DIR, 'disc/metadata')
    cachefile = os.path.join(cachedir, discinfo.id + '.freevo')
    if os.path.isfile(cachefile):
        metainfo = util.read_pickle(cachefile)
    else:
        metainfo = {}

    if force or discinfo.mime == 'unknown/unknown' and not metainfo.has_key('disc_num_video'):
        media.mount()
        for type in ('video', 'audio', 'image'):
            items = getattr(config, '%s_SUFFIX' % type.upper())
            files = util.match_files_recursively(media.mountdir, items)
            metainfo['disc_num_%s' % type] = len(files)
        media.umount()
        util.save_pickle(metainfo, cachefile)

    info = Info(cachefile, discinfo, metainfo)
    info.disc = True
    return info

def build_graph(ids, vocabs, pmi, tfidf):
    if util.is_exist("graph.pkl"):
        print("loaded")
        return util.load_pickle("graph.pkl")

    G = networkx.Graph()
    G.add_nodes_from(ids)
    G.add_nodes_from(vocabs)
    cn2 = lambda x: x * (x - 1) // 2

    print("Calculating word_word edges")
    for (i, w1), (j, w2) in tqdm(combinations(enumerate(vocabs), 2), total=cn2(len(vocabs))):
        if pmi[i][j] > 0:
            G.add_edge(w1, w2, weight=pmi[i][j])

    print("Calculating doc_word edges")
    for i, review_id in tqdm(enumerate(ids), total=len(ids)):
        for j, word in enumerate(vocabs):
            G.add_edge(review_id, word, weight=tfidf[i][j])

    print("Calculating doc_doc edges")
    for review_id in tqdm(ids, total=len(ids)):
        G.add_edge(review_id, review_id, weight=1)

    util.save_pickle(G, "graph.pkl")
    return G

def load_data(file, max_num=None):
    if util.is_exist("data.pkl"):
        print("loaded")
        return util.load_pickle("data.pkl")

    labels = []
    reviews = []
    ids = []
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        max_line = len(lines) if max_num is None else max_num + 1
        # Ignore Header Line
        for line in lines[1:max_line]:
            id, label, review = line.split("\t")
            labels.append(label)
            # remove quotation marks around review
            review = review[1:len(review) - 1]
            reviews.append(review)
            # remove quotation marks around id
            id = id[1:len(id) - 1]
            ids.append(id)

    reviews = [cleanText(r) for r in reviews]
    util.save_pickle((ids, labels, reviews), "data.pkl")
    return ids, labels, reviews

def toggle_display_style(self, menu):
    """ Toggle display style """
    if isinstance(menu, str):
        if not self.display_style.has_key(menu):
            self.display_style[menu] = 0
        self.display_style[menu] = (self.display_style[menu] + 1) % len(self.settings.sets[menu].style)
        return 1

    if menu.force_skin_layout != -1:
        return 0

    if menu and menu.skin_settings:
        settings = menu.skin_settings
    else:
        settings = self.settings

    if settings.special_menu.has_key(menu.item_types):
        area = settings.special_menu[menu.item_types]
    else:
        area = settings.default_menu['default']

    if self.display_style['menu'] >= len(area.style):
        self.display_style['menu'] = 0
    self.display_style['menu'] = (self.display_style['menu'] + 1) % len(area.style)

    self.storage[config.SKIN_XML_FILE] = self.display_style['menu']
    util.save_pickle(self.storage, self.storage_file)
    return 1

def set_base_fxd(self, name):
    """ set the basic skin fxd file """
    config.SKIN_XML_FILE = os.path.splitext(os.path.basename(name))[0]
    logger.debug('load basic skin settings: %s', config.SKIN_XML_FILE)

    # try to find the skin xml file
    if not self.settings.load(name, clear=True):
        print "skin not found, using fallback skin"
        self.settings.load('basic.fxd', clear=True)

    for dir in config.cfgfilepath:
        local_skin = '%s/local_skin.fxd' % dir
        if os.path.isfile(local_skin):
            logger.log(9, 'Skin: Add local config %s to skin', local_skin)
            self.settings.load(local_skin)
            break

    self.storage['SKIN_XML_FILE'] = config.SKIN_XML_FILE
    util.save_pickle(self.storage, self.storage_file)

    if self.storage.has_key(config.SKIN_XML_FILE):
        self.display_style['menu'] = self.storage[config.SKIN_XML_FILE]
    else:
        self.display_style['menu'] = 0

def setup_DUC_sentences(task, parser=None, reload=False):

    ## load problems quickly from pickle file
    if (not reload) and os.path.isfile(task.data_pickle):
        sys.stderr.write('Loading [%s] problem data from [%s]\n' % (task.name, task.data_pickle))
        task.problems = util.load_pickle(task.data_pickle)
        return

    ## only parse sentences if needed
    for problem in task.problems:
        print problem.id
        problem.load_documents()
        if parser:
            for doc in problem.new_docs:
                doc.parse_sentences(parser)
    if parser:
        parser.run()
        for sentence, parsetree in parser.parsed.items():
            sentence.parsed = parsetree

    ## save pickled version for faster loading later
    sys.stderr.write('Saving [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)

def fetchheadlinesfromurl(self):
    headlines = []
    popup = PopupBox(text=_('Fetching headlines...'))
    popup.show()
    try:
        # parse the document
        doc = util.feedparser.parse(self.url)
        if doc.status < 400:
            for entry in doc['entries']:
                try:
                    title = Unicode(entry.title)
                    link = Unicode(entry.link)
                    if entry.has_key('content') and len(entry['content']) >= 1:
                        description = Unicode(entry['content'][0].value)
                    else:
                        description = Unicode(entry['summary_detail'].value)
                    headlines.append((title, link, description))
                except AttributeError:
                    pass
        else:
            logger.debug('Error %s, getting %r', doc.status, self.url)

        # write the file
        if len(headlines) > 0:
            pfile = os.path.join(self.cachedir, 'headlines-%i' % self.location_index)
            util.save_pickle(headlines, pfile)
    finally:
        popup.destroy()
    return headlines

def setup_DUC_sentences(task, parser=None, reload=False, options=None):

    ## load problems quickly from pickle file
    if (not reload) and os.path.isfile(task.data_pickle):
        sys.stderr.write('Loading [%s] problem data from [%s]\n' % (task.name, task.data_pickle))
        task.problems = util.load_pickle(task.data_pickle)
        return

    ## parse sentences
    if options:
        text.text_processor.load_splitta_model(options.splitta_model)
    else:
        text.text_processor.load_splitta_model('/u/dgillick/sbd/splitta/model_nb/')
    for problem in task.problems:
        sys.stderr.write('%s\n' % problem.id)
        problem.load_documents()
        if parser:
            for doc in problem.new_docs:
                doc.parse_sentences(parser)
            problem.parsed = True
    if parser:
        parser.run()
        for sentence, parsetree in parser.parsed.items():
            sentence.parsed = parsetree

    ## save pickled version for faster loading later
    sys.stderr.write('Saving [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)

def make_wordvec_dict(f_name, out_fname, threads):
    """ Loads the pickle containing the word set, gets a word vector for each
    word via parallel processing, and saves the resulting dict to out_fname """
    # Make list of unique words
    word_list = list(load_pickle(f_name))

    # Send job to workers
    per_f = int(len(word_list) / threads) + 1
    logging.info('Per Thread {}'.format(per_f))
    pool = mp.Pool(processes=threads)
    processes = [
        pool.apply_async(parallel_word_dict,
                         args=(word_list, per_f * (x - 1), per_f * x))
        for x in range(1, threads + 1)
    ]

    # Get data and merge it into one dict
    output = [process.get() for process in processes]
    out_dict = {}
    for ele in output:
        out_dict = {**out_dict, **ele}
    pool.close()
    save_pickle(out_dict, out_fname)
    logging.info('Made Dictionary Using Spacy')

def __init__(self, dataPath, indexfile, spectype='stft', n_fft=1024, window='hann',
             win_length=1024, hop_length=512, shape='disk', neighbourhood=10,
             uniform=True, show=False, gap=50, targetsize=(200, 200)):
    # Parameters for spectrogram
    self.spectype = spectype
    self.n_fft = n_fft
    self.window = window
    self.win_length = win_length
    self.hop_length = hop_length

    # Parameters for peak picking
    self.shape = shape
    self.neighbourhood = neighbourhood
    self.uniform = uniform
    self.show = show

    # Parameters for combinatorial hashing
    self.gap = gap
    self.targetsize = targetsize

    # Stores hashed data for matching
    self.data = {}
    self.identity2title = []

    # Loop through all data and save (assumes the whole directory is full of wavs to index)
    for identity, filename in enumerate(tqdm(os.listdir(dataPath))):
        self.identity2title.append(filename)  # keep track of id and title with .wav at end
        hash_dict = self.fingerprint(dataPath + '/' + filename, identity)
        self.data.update(hash_dict)

    save_pickle(self, indexfile)  # saves itself as an object with data and parameter info

def prepare_location(loc):
    print " Processing %s..." % loc

    data_i, xs_i, ys_i = read_location_images(loc, 'three_band')
    data_m, xs_m, ys_m = read_location_images(loc, 'sixteen_band', 'M')
    #data_p, xs_p, ys_p = read_location_images(loc, 'sixteen_band', 'P')

    # Prepare images
    for i in xrange(n_location_images):
        for j in xrange(n_location_images):
            meta = {
                'shape': (0, ys_i[i + 1] - ys_i[i], xs_i[j + 1] - xs_i[j]),
                'shape_i': (data_i.shape[0], ys_i[i + 1] - ys_i[i], xs_i[j + 1] - xs_i[j]),
                'shape_m': (data_m.shape[0], ys_m[i + 1] - ys_m[i], xs_m[j + 1] - xs_m[j]),
                #'shape_p': (data_p.shape[0], ys_p[i+1] - ys_p[i], xs_p[j+1] - xs_p[j])
            }
            save_pickle('cache/meta/%s_%d_%d.pickle' % (loc, i, j), meta)

    write_location_images(loc, data_i, xs_i, ys_i, 'I')
    write_location_images(loc, data_m, xs_m, ys_m, 'M')
    #write_location_images(loc, data_p, xs_p, ys_p, 'P')

    write_location_images(loc, normalize(data_m), xs_m, ys_m, 'MN')  # Write location-normalized M channels
    write_location_images(loc, compute_filters(data_i), xs_i, ys_i, 'IF')
    write_location_images(loc, compute_indices(data_m), xs_m, ys_m, 'MI')

def preprocess():
    if util.is_exist("preprocessed.pkl"):
        print("loading")
        return util.load_pickle("preprocessed.pkl")

    _, labels, _ = util.load_pickle("data.pkl")
    labels = np.array(labels, dtype=np.int32)
    G = util.load_pickle("graph.pkl")

    print("calc adjacent matrix")
    A = networkx.to_numpy_matrix(G, weight="weight")

    print("calc degree matrix")
    degrees = [d ** -0.5 if d != 0 else 0 for _, d in G.degree]

    print("normalize adjacent matrix")
    '''
    degrees = np.diag(degrees)
    A_hat = degrees @ A @ degrees
    '''
    # decrease memory allocation
    A_hat = A
    for i in tqdm(range(A.shape[0]), total=A.shape[0]):
        for j in range(A.shape[1]):
            A_hat[i, j] *= degrees[i] * degrees[j]

    print("calc feature matrix")
    X = np.eye(G.number_of_nodes())  # Features are just identity matrix

    util.save_pickle((X, A_hat, labels), "preprocessed.pkl")
    return X, A_hat, labels

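# Note on preprocess() above: the element-wise loop and the commented-out
# `degrees @ A @ degrees` block compute the same symmetric normalization
# D^-1/2 A D^-1/2. A hedged alternative sketch, assuming scipy is available
# (scipy is not imported by the original code), keeps the computation
# vectorized without allocating a dense N x N diagonal matrix:
import scipy.sparse as sp

def normalize_adjacency(A, degrees):
    """Return D^-1/2 A D^-1/2; `degrees` already holds d ** -0.5 per node (sketch, not the original code)."""
    D_inv_sqrt = sp.diags(degrees)                     # sparse diagonal, no dense N x N matrix
    return D_inv_sqrt @ sp.csr_matrix(A) @ D_inv_sqrt  # same values as the element-wise loop
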
def setup_DUC_sentences(task, parser=None, reload=False):

    ## load problems quickly from pickle file
    if (not reload) and os.path.isfile(task.data_pickle):
        sys.stderr.write('Loading [%s] problem data from [%s]\n' % (task.name, task.data_pickle))
        task.problems = util.load_pickle(task.data_pickle)
        return

    ## parse sentences
    text.text_processor.load_splitta_model('/u/dgillick/sbd/splitta/model_nb/')
    for problem in task.problems:
        sys.stderr.write('%s\n' % problem.id)
        problem.load_documents()
        if parser:
            for doc in problem.new_docs:
                doc.parse_sentences(parser)
            problem.parsed = True
    if parser:
        parser.run()
        for sentence, parsetree in parser.parsed.items():
            sentence.parsed = parsetree

    ## save pickled version for faster loading later
    sys.stderr.write('Saving [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)

def delete_old_files_2():
    """
    delete cache files/entries for files which don't exist anymore
    """
    print checking('deleting old web-server thumbnails'),
    sys.__stdout__.flush()
    num = 0
    for file in util.match_files_recursively(vfs.www_image_cachedir(), config.IMAGE_SUFFIX):
        if not vfs.isfile(file[len(vfs.www_image_cachedir()):file.rindex('.')]):
            os.unlink(file)
            num += 1
    print 'deleted %s file%s' % (num, num != 1 and 's' or '')

    print checking('deleting old cache files'),
    sys.__stdout__.flush()
    num = 0
    for file in util.match_files_recursively(config.OVERLAY_DIR, ['raw']):
        if file.startswith(os.path.join(config.OVERLAY_DIR, 'disc')):
            continue
        if not vfs.isfile(file[len(config.OVERLAY_DIR):-4]):
            os.unlink(file)
            num += 1
    print 'deleted %s file%s' % (num, num != 1 and 's' or '')

    print checking('deleting cache for directories not existing anymore'),
    subdirs = util.get_subdirs_recursively(config.OVERLAY_DIR)
    subdirs.reverse()
    for file in subdirs:
        if not os.path.isdir(file[len(config.OVERLAY_DIR):]) and not \
               file.startswith(os.path.join(config.OVERLAY_DIR, 'disc')):
            for metafile in ('cover.png', 'cover.png.raw', 'cover.jpg', 'cover.jpg.raw',
                             'mmpython.cache', 'freevo.cache'):
                if os.path.isfile(os.path.join(file, metafile)):
                    os.unlink(os.path.join(file, metafile))
            if not os.listdir(file):
                os.rmdir(file)
    print 'done'

    print checking('deleting old entries in meta-info'),
    sys.__stdout__.flush()
    for filename in util.recursefolders(config.OVERLAY_DIR, 1, 'freevo.cache', 1):
        if filename.startswith(os.path.join(config.OVERLAY_DIR, 'disc')):
            continue
        sinfo = os.stat(filename)
        if not sinfo[ST_SIZE]:
            #print '%s is empty' % filename
            continue
        dirname = os.path.dirname(filename)[len(config.OVERLAY_DIR):]
        data = util.read_pickle(filename)
        for key in copy.copy(data):
            if not os.path.exists(os.path.join(dirname, str(key))):
                del data[key]
        util.save_pickle(data, filename)
    print 'done'

def fetchheadlinesfromurl(self):
    """
    this fetches the headlines (title, link and description) from the url.
    Here the parsing of the xml is done
    """
    headlines = []
    # create Reader object
    reader = Sax2.Reader()

    popup = dialog.show_working_indicator(_('Fetching headlines...'))

    # parse the document
    try:
        myfile = urllib.urlopen(self.url)
        doc = reader.fromStream(myfile)
        items = doc.getElementsByTagName('item')
        for item in items:
            title = ''
            link = ''
            description = ''
            if item.hasChildNodes():
                for c in item.childNodes:
                    if c.localName == 'title':
                        title = c.firstChild.data
                    if c.localName == 'description':
                        description = c.firstChild.data
                    #################################
                    # Addition to identify the video link
                    if self.mode == 'youtube':
                        if c.localName == 'link':
                            link = 'youtube:' + c.firstChild.data
                    else:
                        if c.localName == 'enclosure':
                            attrs = c.attributes
                            for attrName in attrs.keys():
                                attrNode = attrs.get(attrName)
                                attrValue = attrNode.nodeValue
                                if 'url' in attrName:
                                    link = attrValue
            if title:
                headlines.append((title, link, description))
    except:
        # unreachable or url error
        logger.error('could not open %s', self.url)

    # write the file
    if len(headlines) > 0:
        pfile = os.path.join(self.cachedir, 'itv-%i' % self.location_index)
        util.save_pickle(headlines, pfile)

    popup.hide()
    return headlines

def del_cache():
    """
    delete all cache files because kaa metadata got updated
    """
    for f in util.recursefolders(config.OVERLAY_DIR, 1, 'mmpython.cache', 1):
        os.unlink(f)
    for f in util.match_files(config.OVERLAY_DIR + '/disc/metadata', ['mmpython']):
        os.unlink(f)
    cachefile = os.path.join(config.FREEVO_CACHEDIR, 'mediainfo')
    util.save_pickle((mmpython.version.VERSION, 0, 0, 0), cachefile)

def save_cache(self):
    """
    save a modified cache file
    """
    if self.cache_modified:
        logger.log(9, 'save cache %s', self.current_cachefile)
        util.save_pickle(self.current_objects, self.current_cachefile)
        self.cache_modified = False
        if config.MEDIAINFO_USE_MEMORY:
            self.all_directories[self.current_cachefile] = self.current_objects

def saveToCache(self):
    util.save_pickle(self.weatherData, self.cacheFile)

    # attempt to save weathermap
    try:
        if self.weatherMapData is not None:
            imgfd = os.open(self.mapFile, os.O_CREAT | os.O_WRONLY)
            os.write(imgfd, self.weatherMapData)
            os.close(imgfd)
    except:
        print "failed while saving weather map to cache '%s'" % (self.mapFile,)

def get_docs(task, num_docs, reload=False):
    """
    returns a new task, where each problem in task.problems has:
    problem.ir_docs = [ ... ]
    """
    ## check state
    if not reload and framework.check_state(task.problems)['ir']:
        sys.stderr.write('already have ir documents loaded\n')
        return task

    max_files = 0

    ## get all query tokens; use tfidf.get_tokens because this matches the index's tokenization
    queries_by_problem_id = {}
    for problem in task.problems:
        #curr_query = ' '.join(tfidf.get_tokens(problem.query.original))
        curr_query = ' '.join(make_query(problem))
        queries_by_problem_id[problem.id] = curr_query

    ## do the search
    all_queries = queries_by_problem_id.values()
    docs_by_query = tfidf.search(tfidf.file_index_pickle_path, all_queries,
                                 tfidf.search_cmd, max_files, num_docs)

    ## for debugging
    docfh = open('irdoc_debug', 'w')

    ## allocate docs to problems
    for problem in task.problems:
        query = queries_by_problem_id[problem.id]
        docs_with_values = docs_by_query[query]

        ## inspect values for debugging
        docfh.write('# problem [%s]\n' % problem.id)
        for doc, val in docs_with_values:
            docfh.write('## doc_id [%s] value [%1.4f]\n' % (doc.id, float(val)))
            for par in doc.paragraphs:
                docfh.write('%s\n' % par)

        ## sentence segmentation
        docs = [doc for doc, val in docs_with_values]
        for doc in docs:
            doc.get_sentences()

        problem.ir_docs = docs
        problem.loaded_ir_docs = True

    ## pickle it up
    sys.stderr.write('Saving [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)

    return task

def save(self):
    dict = {'datasets': self.datasets, 'version': 0.1}

    # for now: make a backup first:
    database_filename = self.path + '/' + self.filename
    backup_filename = self.path + '/' + self.filename + '_backup_' + ut.formatted_time()
    print 'Backing up old database to ' + backup_filename
    shutil.copy(database_filename, backup_filename)

    print "Saving: " + database_filename
    ut.save_pickle(dict, database_filename)

def save(self, save_dir):
    model_params = (self.ob_size, self.actions, self.device, self.gamma,
                    self.memory_size, self.layer_sizes, self.adv_sizes,
                    self.val_sizes, self.double, self.dueling)
    util.save_pickle(os.path.join(save_dir, 'params.pickle'), model_params)
    torch.save(self.policy_net.state_dict(), os.path.join(save_dir, 'policy_net.pt'))
    util.save_pickle(os.path.join(save_dir, 'memory.pickle'), self.memory)
    logger.info(f'Model saved to {save_dir}')

def read_and_process(args, tokenizer, dataset_dict, dir_name, dataset_name, split):
    # TODO: cache this if possible
    cache_path = f'{dir_name}/{dataset_name}_encodings.pt'
    if os.path.exists(cache_path) and not args.recompute_features:
        tokenized_examples = util.load_pickle(cache_path)
    else:
        if split == 'train':
            tokenized_examples = prepare_train_data(dataset_dict, tokenizer)
        else:
            tokenized_examples = prepare_eval_data(dataset_dict, tokenizer)
        util.save_pickle(tokenized_examples, cache_path)
    return tokenized_examples

def calc_tf_idf(reviews, min_df=0.01):
    if util.is_exist("tf-idf.pkl"):
        print("loaded")
        return util.load_pickle("tf-idf.pkl")

    vectorizer = TfidfVectorizer(input="content",
                                 stop_words=stopwords.words("english"),
                                 min_df=min_df, max_df=0.5)
    vectorizer.fit(reviews)
    tfidf = vectorizer.transform(reviews).toarray()
    vocab = vectorizer.get_feature_names()
    util.save_pickle((tfidf, vocab), "tf-idf.pkl")
    return tfidf, vocab

def make_dict_pickle(f_name, out_fname):
    """ Cleans the tweets and makes a set of all the words """
    logging.info('Making pickle for the dictionary')
    word_set = set()
    for tweet in load_pickle(f_name)['dat']:
        words, _ = clean_tweet(tweet['text'])
        for word in words:
            word_set.add(word)
    logging.info('Number unique words: {}'.format(len(word_set)))
    save_pickle(word_set, out_fname)
    logging.info('Saved dictionary to: {}'.format(out_fname))

def get_dat():
    try:
        config = configparser.ConfigParser()
        config.read("tweepy_config")
        auth = tweepy.OAuthHandler(config['s1']['key'], config['s1']['secret'])
        auth.set_access_token(config['s1']['token'], config['s1']['token_secret'])
        api = tweepy.API(auth)
        tweets = [(tweet.author.screen_name, tweet.text)
                  for tweet in api.search(q='bitcoin', count=300, lang='en')]
        save_pickle({'dat': tweets}, "new.pkl")
        return tweets
    except KeyError:
        print('Extraction of tweets did not work')

def save_cache(self, settings, filename):
    """
    cache the fxd skin settings in 'settings' to the OVERLAY_DIR
    cache file for filename and this resolution
    """
    cache = self.cachename(filename)
    if cache:
        # delete font object, because it can't be pickled
        for f in settings.font:
            del settings.font[f].font
        # save object and version information
        util.save_pickle((xml_skin.FXD_FORMAT_VERSION, settings), cache)
        # restore font object
        for f in settings.font:
            settings.font[f].font = osd.getfont(settings.font[f].name,
                                                settings.font[f].size)

def store(self, key, value):
    """
    store the key/value in metadata and save the cache
    """
    self.metadata[key] = value
    if self.disc:
        self.metadata[key] = value
        util.save_pickle(self.metadata, self.filename)
        return True
    elif not self.filename:
        return False
    else:
        meta_cache.set(os.path.basename(self.filename),
                       os.path.dirname(self.filename), self.filename, self.metadata)
        return True

def delete(self, key):
    """
    delete the key in metadata and save the cache
    """
    if self.disc:
        if self.metadata.has_key(key):
            del self.metadata[key]
            util.save_pickle(self.metadata, self.filename)
        return True
    elif not self.filename:
        return False
    if self.metadata.has_key(key):
        del self.metadata[key]
        meta_cache.set(os.path.basename(self.filename),
                       os.path.dirname(self.filename), self.filename, self.metadata)
    return True

def train_punkt_model(self, text, save_path=None):
    """
    unsupervised training given some text
    optional save_path for future use
    """
    ## train tokenizer
    sys.stderr.write('Training...\n')
    t = nltk.tokenize.punkt.PunktSentenceTokenizer()
    t.ABBREV = 0.1  # threshold for identifying abbrevs (lower is more aggressive)
    t.train(text)
    self._sent_tokenizer = t

    ## pickle it
    if save_path:
        util.save_pickle(t, save_path)
        sys.stderr.write('Saved model as [%s]\n' % save_path)

def build_model(files, options):

    ## create a Doc object from some labeled data
    train_corpus = get_data(files, tokenize=options.tokenize)

    ## create a new model
    if options.svm:
        model = SVM_Model(train_corpus, options.model_path)
    else:
        model = NB_Model(train_corpus, options.model_path)

    ## featurize the training corpus
    train_corpus.featurize(model)

    ## run the model's training routine
    model.train(train_corpus)

    ## save the model
    util.save_pickle(model, options.model_path + 'model.pkl')

    return model

def save(self, filename=None):
    """
    Save the tree
    """
    if not filename:
        filename = self.filename
    if vfs.isfile(filename):
        vfs.unlink(filename)
    f = vfs.codecs_open(filename, 'wb', encoding='utf-8')
    f.write('<?xml version="1.0" encoding="utf-8" ?>\n')
    self._dump_recurse(f, self.tree)
    f.write('\n')
    f.close()

    f = vfs.open(filename)
    self.tree = self.parse(f)
    f.close()
    if self.tree:
        util.save_pickle(self.tree, vfs.getoverlay(filename + '.raw'))

def __init__(self, filename):
    """
    Load the file and parse it. If the file does not exist, create
    an empty <freevo> node.
    """
    Parser.__init__(self)
    self.filename = filename
    if not vfs.isfile(filename):
        self.tree = XMLnode('freevo')
    else:
        self.tree = None
        cache = vfs.getoverlay(filename + '.raw')
        if os.path.isfile(filename) and os.path.isfile(cache) and \
               os.stat(cache)[stat.ST_MTIME] >= os.stat(filename)[stat.ST_MTIME]:
            self.tree = util.read_pickle(cache)
        if not self.tree:
            f = vfs.open(filename)
            self.tree = self.parse(f)
            f.close()
            if self.tree:
                util.save_pickle(self.tree, cache)

def read_test_data(file):
    # assume if one is saved they all are
    if util.check_file_exists(CONST.DATASET_PATH + CONST.TEST_PATH):
        T_Data = util.load(CONST.DATASET_PATH + CONST.TEST_PATH)
        T_Labels = util.load(CONST.DATASET_PATH + CONST.TEST_PATH_LABELS)
        T_Queries = util.load(CONST.DATASET_PATH + CONST.TEST_PATH_Q)
        T_Docs = util.load(CONST.DATASET_PATH + CONST.TEST_PATH_DOCS)
    else:
        T_Data, T_Labels, T_Queries, T_Docs = read_train_data(file)
        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH, T_Data)
        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH_LABELS, T_Labels)
        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH_Q, T_Queries)
        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH_DOCS, T_Docs)
    return T_Data, T_Labels, T_Queries, T_Docs

def prepare(self, features_k_nearest_neighbors, nonzero_indices=None, all_save_load=False,
            regenerate_neightborhood_indices=False):
    #print np.shape(self.processor.pts3d_bound), 'shape pts3d_bound'
    imgTmp = cv.cvCloneImage(self.processor.img)
    self.imNP = ut.cv2np(imgTmp, format='BGR')
    ###self.processor.map2d = np.asarray(self.processor.camPts_bound)  # copied from laser to image mapping

    if features_k_nearest_neighbors == None or features_k_nearest_neighbors == False:  # use range
        self.kdtree2d = kdtree.KDTree(self.processor.pts3d_bound.T)

        #print len(nonzero_indices)
        #print np.shape(np.asarray((self.processor.pts3d_bound.T)[nonzero_indices]))
        if nonzero_indices != None:
            print ut.getTime(), 'query ball tree for ', len(nonzero_indices), 'points'
            kdtree_query = kdtree.KDTree((self.processor.pts3d_bound.T)[nonzero_indices])
        else:
            print ut.getTime(), 'query ball tree'
            kdtree_query = kdtree.KDTree(self.processor.pts3d_bound.T)

        filename = self.processor.config.path + '/data/' + self.processor.scan_dataset.id + \
                   '_sphere_neighborhood_indices_' + str(self.processor.feature_radius) + '.pkl'
        if all_save_load == True and os.path.exists(filename) and regenerate_neightborhood_indices == False:
            # if its already there, load it:
            print ut.getTime(), 'loading', filename
            self.kdtree_queried_indices = ut.load_pickle(filename)
        else:
            self.kdtree_queried_indices = kdtree_query.query_ball_tree(
                self.kdtree2d, self.processor.feature_radius, 2.0, 0.2)  # approximate
            print ut.getTime(), 'queried kdtree: ', len(self.kdtree_queried_indices), \
                  'points, radius:', self.processor.feature_radius
            if all_save_load == True:
                ut.save_pickle(self.kdtree_queried_indices, filename)

        # make dict out of list for faster operations? (doesn't seem to change speed significantly):
        #self.kdtree_queried_indices = dict(zip(xrange(len(self.kdtree_queried_indices)), self.kdtree_queried_indices))

    else:  # experiemental: use_20_nearest_neighbors == True
        # TODO: exclude invalid values in get_featurevector (uncomment code there)
        self.kdtree2d = kdtree.KDTree(self.processor.pts3d_bound.T)
        self.kdtree_queried_indices = []
        print ut.getTime(), 'kdtree single queries for kNN start, k=', features_k_nearest_neighbors
        count = 0
        for point in ((self.processor.pts3d_bound.T)[nonzero_indices]):
            count = count + 1
            result = self.kdtree2d.query(point, features_k_nearest_neighbors,
                                         0.2, 2, self.processor.feature_radius)
            #existing = result[0][0] != np.Inf
            #print existing
            #print result[1]
            self.kdtree_queried_indices += [result[1]]  # [existing]
            if count % 4096 == 0:
                print ut.getTime(), count
        print ut.getTime(), 'kdtree singe queries end'

        # convert to numpy array -> faster access
        self.kdtree_queried_indices = np.asarray(self.kdtree_queried_indices)

    #print self.kdtree_queried_indices
    # takes long to compute:
    #avg_len = 0
    #minlen = 999999
    #maxlen = 0
    #for x in self.kdtree_queried_indices:
    #    avg_len += len(x)
    #    minlen = min(minlen, len(x))
    #    maxlen = max(maxlen, len(x))
    #avg_len = avg_len / len(self.kdtree_queried_indices)
    #print ut.getTime(), "range neighbors: avg_len", avg_len, 'minlen', minlen, 'maxlen', maxlen

    # create HSV numpy images:
    # compute the hsv version of the image
    image_size = cv.cvGetSize(self.processor.img)
    img_h = cv.cvCreateImage(image_size, 8, 1)
    img_s = cv.cvCreateImage(image_size, 8, 1)
    img_v = cv.cvCreateImage(image_size, 8, 1)
    img_hsv = cv.cvCreateImage(image_size, 8, 3)
    cv.cvCvtColor(self.processor.img, img_hsv, cv.CV_BGR2HSV)
    cv.cvSplit(img_hsv, img_h, img_s, img_v, None)
    self.imNP_h = ut.cv2np(img_h)
    self.imNP_s = ut.cv2np(img_s)
    self.imNP_v = ut.cv2np(img_v)

    textures = texture_features.eigen_texture(self.processor.img)
    self.imNP_tex1 = textures[:, :, 0]
    self.imNP_tex2 = textures[:, :, 1]

    self.debug_before_first_featurevector = True

    self.generate_voi_histogram(self.processor.point_of_interest, self.processor.voi_width)

def saveToCache(self):
    logger.log(9, 'saveToCache()')
    util.save_pickle(self.elocationData, self.cacheElocation)
    util.save_pickle(self.currentData, self.cacheCurrent)
    util.save_pickle(self.forecastData, self.cacheForecast)

def cacheData(self, name, data):
    if self.filename == "remote":
        return None
    save_pickle(self.filename + "." + name, data)

def write(self, filename):
    util.save_pickle(self.d, filename)

def save(self):
    util.save_pickle(self.featdict, self.path + 'feats')

text.text_processor.load_splitta_model('lib/splitta/model_nb/')

# Skip update data
if task_name[:3] == 'tac':
    framework.setup_TAC08(task, True)
elif task_name[:3] == 'duc':
    framework.setup_DUC_basic(task, True)
elif task_name[:3] == 'new':
    framework.setup_news(task)
else:
    raise Exception('Unknown task %s' % task)

if task_name[:3] != 'new':
    for problem in task.problems:
        problem.load_documents()

## save pickled version for faster loading later
sys.stderr.write('Saving [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
util.save_pickle(task.problems, task.data_pickle)

# Tokenize for parser
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
for problem in task.problems:
    num_sents = sum([len(doc.sentences) for doc in problem.new_docs])
    sys.stderr.write("%s %d %d\n" % (problem.id, len(problem.new_docs), num_sents))
    if args.boundary:
        with open('%s/%s.sentid' % (sent_path, problem.id), 'r') as fin, \
             open('%s/%s.boundary' % (sent_path, problem.id), 'w') as fout:
            sent_ids = [int(s.strip()) for s in fin.readlines()]
            j = 0  # sent_ids index
            for doc in problem.new_docs:
                new_doc = True

if d == os.path.sep:
    print 'ERROR: %s_ITEMS contains root directory, skipped.' % type
    setattr(config, '%s_ITEMS' % type, [])

if os.path.isdir(os.path.join(config.FREEVO_CACHEDIR, 'playlists')):
    config.AUDIO_ITEMS.append(('Playlists', os.path.join(config.FREEVO_CACHEDIR, 'playlists')))

delete_old_files_1()
delete_old_files_2()

# we have time here, don't use exif thumbnails
config.IMAGE_USE_EXIF_THUMBNAIL = 0

cache_directories(rebuild)
if config.CACHE_IMAGES:
    cache_thumbnails()
create_metadata()
create_tv_pickle()

# close db
util.mediainfo.sync()

# save cache info
try:
    import kaa.metadata.version
    util.save_pickle((kaa.metadata.version.VERSION, VERSION, int(time.time()),
                      complete_update), cachefile)
except ImportError:
    print 'WARNING: please update kaa.metadata'

print
print 'caching complete after %s seconds' % (time.clock() - start)

def get_guide(popup=False, XMLTV_FILE=None):
    """
    Get a TV guide from memory cache, file cache or raw XMLTV file.
    Tries to return at least the channels from the config file if there
    is no other data
    """
    global cached_guide

    if not XMLTV_FILE:
        XMLTV_FILE = config.XMLTV_FILE

    if popup:
        import dialog.dialogs
        popup_dialog = dialog.dialogs.ProgressDialog(_('Preparing the program guide'),
                                                     indeterminate=True)

    # Can we use the cached version (if same as the file)?
    if (cached_guide == None or
        (os.path.isfile(XMLTV_FILE) and cached_guide.timestamp != os.path.getmtime(XMLTV_FILE))):

        # No, is there a pickled version ("file cache") in a file?
        pname = '%s/TV.xml.pickled' % config.FREEVO_CACHEDIR

        got_cached_guide = False
        if (os.path.isfile(XMLTV_FILE) and os.path.isfile(pname) and
            (os.path.getmtime(pname) > os.path.getmtime(XMLTV_FILE))):
            logger.debug('XMLTV, reading cached file (%s)', pname)
            if popup:
                popup_dialog.show()
                inprogress = kaa.ThreadCallable(util.read_pickle, pname)()
                inprogress.wait()
                cached_guide = inprogress.result
            else:
                cached_guide = util.read_pickle(pname)

            epg_ver = None
            try:
                epg_ver = cached_guide.EPG_VERSION
            except AttributeError:
                logger.debug('EPG does not have a version number, must be reloaded')

            if epg_ver != EPG_VERSION:
                logger.debug('EPG version missmatch, must be reloaded')
            elif cached_guide.timestamp != os.path.getmtime(XMLTV_FILE):
                # Hmmm, weird, there is a pickled file newer than the TV.xml
                # file, but the timestamp in it does not match the TV.xml
                # timestamp. We need to reload!
                logger.debug('EPG: Pickled file timestamp mismatch, reloading!')
            else:
                logger.info('XMLTV, got cached guide (version %s).', epg_ver)
                got_cached_guide = True

        if not got_cached_guide:
            # Need to reload the guide
            logger.debug('XMLTV, trying to read raw file (%s)', XMLTV_FILE)
            try:
                if popup:
                    popup_dialog.set_indeterminate(False)
                    popup_dialog.show()
                    inprogress = kaa.ThreadCallable(load_guide, XMLTV_FILE, popup_dialog)()
                    inprogress.wait()
                    cached_guide = inprogress.result
                    popup_dialog.set_indeterminate(True)
                else:
                    cached_guide = load_guide(XMLTV_FILE)
            except:
                # Don't violently crash on a incomplete or empty TV.xml please.
                cached_guide = None
                print
                print String(_("Couldn't load the TV Guide, got an exception!"))
                print
                traceback.print_exc()
            else:
                # Replace config.XMLTV_FILE before we save the pickle in order
                # to avoid timestamp confision.
                if XMLTV_FILE != config.XMLTV_FILE:
                    logger.info('copying %r -> %r', XMLTV_FILE, config.XMLTV_FILE)
                    shutil.copyfile(XMLTV_FILE, config.XMLTV_FILE)
                    os.unlink(XMLTV_FILE)
                    cached_guide.timestamp = os.path.getmtime(config.XMLTV_FILE)

                # Dump a pickled version for later reads
                if popup:
                    kaa.ThreadCallable(util.save_pickle, cached_guide, pname)().wait()
                else:
                    util.save_pickle(cached_guide, pname)

    if not cached_guide:
        # An error occurred, return an empty guide
        cached_guide = TvGuide()

    if popup:
        popup_dialog.hide()

    return cached_guide

def saveMameRomList(mameRomList):
    if not mameRomList or mameRomList == None:
        mameRomList = mame_types.MameRomList()
    util.save_pickle(mameRomList, config.GAMES_MAME_CACHE)

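# Every snippet above relies on a save_pickle / load_pickle (or read_pickle) pair
# from a project-local util module that is not shown here, and the argument order
# is not consistent across projects: some callers pass (data, path), others
# (path, data). The following is only a minimal sketch of such helpers, assuming a
# (data, path) signature; it is not the implementation used by any project above.
import os
import pickle

def save_pickle(data, path):
    """Pickle `data` to `path`, creating the parent directory if needed (assumed helper)."""
    dirname = os.path.dirname(path)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)
    with open(path, 'wb') as fh:
        pickle.dump(data, fh, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(path):
    """Return the object pickled at `path` (assumed helper)."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)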