def get_pdos(out_file='dos.msg'):
    """Calculate and save the projected DOS.

    Requires a previously relaxed calculation.

    Parameters
    ----------
    out_file : str
        Name of the output file to save the results to. A falsy value
        skips writing to disk.

    Returns
    -------
    str
        JSON encoding of the projected DOS.
    """
    atoms = get_relaxed_calculation()
    calc = atoms.get_calculator()

    # Calculate the pdos and write to disk
    dos = calc.calc_pdos(
        nscf=True,
        kpts=atoms.info['kpts'] * [2, 2, 1],
        DeltaE=0.01,
        slab=True,
        Emin=-40,
        Emax=40,
        tetrahedra=False,
        sigma=0.2)
    dos = list(dos)
    array_to_list(dos)

    # If outfile, write a MessagePack encoded version to disk
    if out_file:
        # MessagePack is a binary format: open in binary mode so the bytes
        # are written untranslated on every platform and Python version.
        with open(out_file, 'wb') as f:
            msgpack.dump(dos, f)

    # Return a BSON friendly version. UTF-8 is already the default encoding
    # on Python 2 and the `encoding` keyword does not exist on Python 3.
    return json.dumps(dos)
def get_or_build(path, build_fn, *args, **kwargs):
    """
    Return the object cached at `path`, or build it and cache the result.

    When `path` names an existing file the object is read from it with
    msgpack. Otherwise (or when the cached value is None) the object is
    produced by `build_fn(*args, **kwargs)`; a freshly built object is
    written to `path` only when no file existed there and `path` is not None.
    """
    cache_hit = path is not None and os.path.isfile(path)
    result = None
    if cache_hit:
        with open(path, 'rb') as cache_file:
            result = msgpack.load(cache_file, use_list=False, encoding='utf-8')

    if result is not None:
        return result

    result = build_fn(*args, **kwargs)
    if not cache_hit and path is not None:
        with open(path, 'wb') as cache_file:
            msgpack.dump(result, cache_file)
    return result
def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    """Run the wedge step on `cluster_set` using a temporary h5py cache file.

    Sets the module globals `edge_cut_prob`, `wedge_thrsh` and `h5file`.

    :param cluster_set: object exposing `last_name`, `clusters` and
        `num_all_bibs`; it is converted, processed by `do_wedge` and restored.
    :param report_cluster_status: when true, pairwise cluster comparisons are
        logged and a status report is dumped under /tmp/baistats.
    :param force_wedge_thrsh: when truthy, used as the threshold instead of
        `bconfig.WEDGE_THRESHOLD`.
    """
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh
    if not force_wedge_thrsh:
        edge_cut_prob = bconfig.WEDGE_THRESHOLD / 4.
        wedge_thrsh = bconfig.WEDGE_THRESHOLD
    else:
        edge_cut_prob = force_wedge_thrsh / 4.
        wedge_thrsh = force_wedge_thrsh
    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load()
    global h5file
    # The cache file name is keyed on the process id; it is removed at the end.
    h5filepath = bconfig.TORTOISE_FILES_PATH + 'wedge_cache_' + str(PID())
    h5file = h5py.File(h5filepath)
    convert_cluster_set(cluster_set, matr)
    del matr  # be sure that this is the last reference!
    do_wedge(cluster_set)
    report = []
    if report_cluster_status:
        msg = []
        # Compare every ordered pair once (cl2 > cl1) in both directions.
        for cl1 in cluster_set.clusters:
            for cl2 in cluster_set.clusters:
                if cl2 > cl1:
                    id1 = cluster_set.clusters.index(cl1)
                    id2 = cluster_set.clusters.index(cl2)
                    c12 = _compare_to(cl1, cl2)
                    c21 = _compare_to(cl2, cl1)
                    report.append((id1, id2, c12 + c21))
                    msg.append(' %s vs %s : %s + %s = %s -- %s' % (id1, id2, c12, c21, c12 + c21, cl1.hates(cl2)))
        msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(
            msg)
        logger.log(msg)
    restore_cluster_set(cluster_set)
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()
    if report_cluster_status:
        # NOTE(review): assumes /tmp/baistats already exists -- confirm.
        destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (
            str(PID()), str(cluster_set.last_name), str(wedge_thrsh))
        f = filehandler.open(destfile, 'w')
        SER.dump([
            wedge_thrsh, cluster_set.last_name, report,
            cluster_set.num_all_bibs
        ], f)
        f.close()
    gc.collect()
    h5file.close()
    os.remove(h5filepath)
def main(args):
    """Load a JSON model, optionally transform its weights, and write it out.

    The model is read from `args.ifile`. When `fn_from_args(args)` yields a
    function object, the first parameter of every layer whose `layerName` is
    in `weight_first_list` is replaced by `transform(weights, fn)`; two-step
    functions first consume all weights and the output is then wrapped
    together with the serialized codebook. The result is written to
    `args.ofile` as UBJSON, JSON or MessagePack depending on `args`.

    Both `args.ifile` and `args.ofile` are closed before returning, even when
    an error occurs.
    """
    logging.basicConfig(level=logging.INFO)
    try:
        model = json.load(args.ifile)
        fn = fn_from_args(args)
        if fn:
            weight_layers = [layer for layer in model
                             if layer['layerName'] in weight_first_list]
            if fn.needs_two_step:
                # First pass: let the function see every weight set.
                for layer in weight_layers:
                    fn.consume(layer['parameters'][0])
                fn.done()
            for layer in weight_layers:
                layer['parameters'][0] = transform(layer['parameters'][0], fn)
            if fn.needs_two_step:
                model = {'codebook': fn.serialize_codebook(), 'model': model}

        if args.ubjson_format:
            args.ofile.write(simpleubjson.encode(model))
        elif args.json:
            args.ofile.write(json.dumps(model).encode('utf8'))
        else:
            msgpack.dump(model, args.ofile, use_bin_type=True,
                         use_single_float=args.single_precision_float)
    finally:
        args.ofile.close()
        args.ifile.close()
def to_local_file(obj, filename, to_save_path=None):
    """
    Write an object to a file in MessagePack form.

    Params
    ------
    obj : object
        Serialized object or any object made of built-in Python types.
    filename : str
        Object filename.
    to_save_path : str
        Directory where the object will be saved. When None, the default
        data directory is initialised and used.
    """
    target_dir = to_save_path
    if target_dir is None:
        _init_data_dir()
        target_dir = DEFAULT_DATA_DIR_NAME

    destination = path.join(target_dir, filename)
    try:
        with open(destination, mode='wb') as sink:
            mpk.dump(obj, sink)
    except TypeError:
        raise Exception('`obj` could not be saved because it ' +
                        'is not serialized.')
def load_embeddings(self):
    """Return embeddings for the current vocab, using an on-disk cache.

    The cache lives at `<output_dir>/embedding.msgpack`. When it is missing,
    embeddings are generated with the loader matching `self.args.language`
    and written to the cache before being returned.
    """
    cache_path = os.path.join(self.args.output_dir, 'embedding.msgpack')
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as cached:
            return msgpack.load(cached)

    if self.args.language == "chinese":
        loader = load_embeddings_Chinese
    else:
        loader = load_embeddings_English
    embeddings = loader(
        self.args.pretrained_embeddings,
        self.vocab,
        self.args.embedding_dim,
        mode=self.args.embedding_mode,
        lower=self.args.lower_case)

    with open(cache_path, 'wb') as cache:
        msgpack.dump(embeddings, cache)
    return embeddings
def setUp(self): self.dir = tempfile.mkdtemp() self.runner = click.testing.CliRunner() if self.keep_tree: print 'KmlTrackTest is running in %s' % self.dir self.infile_msgpack = os.path.join(self.dir, 'in.msgpack') with open(self.infile_msgpack, 'w') as f: for row in self.test_track: msgpack.dump(row, f) self.infile_json = os.path.join(self.dir, 'in.json') with open(self.infile_json, 'w') as f: for row in self.test_track: json.dump(row, f) f.write('\n') self.infile_csv = os.path.join(self.dir, 'in.csv') with open(self.infile_csv, 'w') as f: f = csv.DictWriter( f, fieldnames=['lat', 'lon', 'timestamp', 'course', 'color']) f.writeheader() for row in self.test_track: f.writerow(row) self.outfile = os.path.join(self.dir, 'out.kml')
def dump(data, filepath):
    '''
    Serialize data to filepath as JSON or MessagePack.

    The format is chosen from the file extension ('.json' or '.msgpack').
    Raises raeting.KeepError when filepath contains a space, has another
    extension, or names a '.msgpack' file while msgpack is unavailable.
    '''
    if ' ' in filepath:
        raise raeting.KeepError("Invalid filepath '{0}' "
                                "contains space".format(filepath))

    extension = os.path.splitext(filepath)[1]
    if extension not in ('.json', '.msgpack'):
        raise raeting.KeepError("Invalid filepath ext '{0}' "
                                "not '.json' or '.msgpack'".format(filepath))

    if extension == '.json':
        with aiding.ocfn(filepath, "w+") as handle:
            json.dump(data, handle, indent=2, encoding='utf-8')
            # Push the data to disk before the file is closed.
            handle.flush()
            os.fsync(handle.fileno())
        return

    if not msgpack:
        raise raeting.KeepError("Invalid filepath ext '{0}' "
                                "needs msgpack installed".format(filepath))
    with aiding.ocfn(filepath, "w+b", binary=True) as handle:
        msgpack.dump(data, handle, encoding='utf-8')
        handle.flush()
        os.fsync(handle.fileno())
def put_password(policy_pubkey, username, password, save_as_file: bool = False):
    """Encrypt a username/password pair for the given policy public key.

    Returns a dict with the data source's public key (`'data_source'`) and a
    one-element list holding the encrypted message kit bytes (`'kits'`).
    When `save_as_file` is true the dict is also written, MessagePack
    encoded, to HEART_DATA_FILENAME.
    """
    enrico = Enrico(policy_encrypting_key=policy_pubkey)
    source_key = bytes(enrico.stamp)

    credentials = {'username': username, 'password': password}
    packed = msgpack.dumps(credentials, use_bin_type=True)
    message_kit, _signature = enrico.encrypt_message(packed)

    data = {
        'data_source': source_key,
        'kits': [message_kit.to_bytes()],
    }
    if save_as_file:
        with open(HEART_DATA_FILENAME, "wb") as sink:
            msgpack.dump(data, sink, use_bin_type=True)
    return data
def crawler_fans(): all_person_set = set() f_fans = open('fans.txt', 'w') for page_index in range(1, 345, 1): for class_index in [1, 2, 4, 5, 6]: start = time() try: url = 'https://123fans.cn/results.php?qi=%d&c=%d'\ % (page_index, class_index) req = urllib2.Request(url, None) response = urllib2.urlopen(req) html_doc = response.read() soup = BeautifulSoup(html_doc) children = [k for k in soup.children][1] lis = children.select('.odd') for k in lis: try: name = k.select('.name')[0].string all_person_set.add(name) f_fans.write(name + '\n') except: traceback.print_exc() continue print page_index, class_index, \ len(all_person_set), time()-start except: print 'error', page_index, class_index sleep(5) all_person_set = list(all_person_set) msgpack.dump(all_person_set, open('fans.p', 'w'))
def msg_pack(file_name, contents):
    """Write `contents`, MessagePack encoded, to `path + file_name`.

    Returns False when writing fails (the exception is logged); returns
    None on success.
    """
    try:
        with open(path + file_name, 'wb') as f:
            msgpack.dump(contents, f)
    # `except ... as` is valid on Python 2.6+ and Python 3, unlike the
    # comma form.
    except Exception as e:
        logging.info(e)
        return False
def main(): dialect_name, working_dir = sys.argv[1:] executor = sqlexecutor.executor(dialect_name, working_dir) print "Ready" sys.stdout.flush() try: for message in msgpack.Unpacker(sys.stdin, read_size=1): command = message[0] args = message[1:] if command == "execute": (creation_sql, query, ) = args result = executor.execute(creation_sql, query) if result.table is None: column_names = None rows = None else: column_names = result.table.column_names rows = result.table.rows msgpack.dump((result.error, column_names, rows), sys.stdout) sys.stdout.flush() else: return finally: executor.close()
def prepare():
    """Vectorize the train/test data and save it together with embeddings.

    Writes the embedding matrix to 'meta.msgpack' and the train/valid
    examples to 'data.msgpack'.
    """
    args, _ = setup()
    train_data = load_data("train.txt")
    dev_data = load_data("test.txt")
    word_dict = build_dict(train_data[0] + dev_data[0])

    def as_examples(dataset):
        # Vectorize one dataset and zip it into rows with the label last.
        x, y, e1, e2, dist1, dist2, e1_pos, e2_pos = vectorize(
            dataset, word_dict)
        return list(zip(x, e1, e2, dist1, dist2, e1_pos, e2_pos, y))

    train = as_examples(train_data)
    valid = as_examples(dev_data)

    embedding = load_embedding('embeddings.txt', 'words.lst', word_dict)

    # save
    outputs = (
        ("meta.msgpack", {"embeddings": embedding.tolist()}),
        ("data.msgpack", {"train": train, "valid": valid}),
    )
    for target, payload in outputs:
        with open(target, "wb") as f:
            msgpack.dump(payload, f)
def dump(self, directory):
    """
    Utility function to dump/save model

    Scalar and mapping state goes into `<iteration>_sent_basic.mpack`
    (MessagePack); the array attributes are saved with `np.save`.

    :param directory: directory to output to
    :return:
    """
    # NOTE(review): the "n_users " and "n_identity_sent_values " keys carry a
    # trailing space; kept as-is because readers of existing files may rely
    # on them -- confirm before normalising.
    data = {"identity_to_values_small": self.identity_to_values_small,
            "n_users ": self.n_users,
            "n_identity_sent_values ": self.n_identity_sent_values,
            "nu": self.nu,
            "kappa": self.kappa,
            "beta": self.beta,
            "index_to_ids": self.index_to_ids,
            "ids_to_index": self.ids_to_index,
            "iteration": self.iteration,
            "train_perplexity": self.train_perplexity,
            "test_perplexity": self.test_perplexity
            }
    iter_str = str(self.iteration)
    # Close the file deterministically instead of leaking the handle.
    with open(os.path.join(directory, iter_str + "_sent_basic.mpack"), "wb") as f:
        msgpack.dump(data, f)
    np.save(os.path.join(directory, "sent_mu_0"), self.mu_0)
    np.save(os.path.join(directory, "sent_sigma_0"), self.sigma_0)
    np.save(os.path.join(directory, iter_str + "_sent_sigma"), self.sigma)
    np.save(os.path.join(directory, iter_str + "_sent_mu"), self.mu)
    np.save(os.path.join(directory, iter_str + "_sent_precision_matrix"), self.precision_matrix)
    np.save(os.path.join(directory, iter_str + "_sent_phi"), self.phi)
def create_bond_dict(components_pdbx_file_path, msgpack_file_path):
    """Build a per-component bond dictionary and write it as MessagePack.

    For every block of the PDBx file that has a 'chem_comp_bond' category,
    maps `(atom_id_1, atom_id_2)` to the bond order looked up in BOND_ORDERS.
    Blocks without bond information are skipped.
    """
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)

    bond_dict = {}
    for component in pdbx_file.get_block_names():
        print(component)
        cif_bonds = pdbx_file.get_category("chem_comp_bond", block=component)
        if cif_bonds is None:
            # No bond info for this compound
            continue

        first_atoms = cif_bonds["atom_id_1"]
        second_atoms = cif_bonds["atom_id_2"]
        orders = cif_bonds["value_order"]
        if isinstance(cif_bonds["comp_id"], str):
            # Single string -> single bond: wrap so both cases share one path
            first_atoms, second_atoms, orders = (
                [first_atoms], [second_atoms], [orders])

        group_bonds = {}
        for atom1, atom2, order in zip(first_atoms, second_atoms, orders):
            group_bonds[(atom1, atom2)] = BOND_ORDERS[order]
        bond_dict[component] = group_bonds

    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(bond_dict, msgpack_file)
def data_to_token_ids(source_data, id_data_path, vocab_to_id, cache=True):
    """Turn tokenized sources into token ids, caching the result on disk.

    Each source becomes a list of id lines framed by `[START_LINE_ID]` and
    `[END_LINE_ID]`. The result is pickled to `id_data_path`; when that file
    already exists and `cache` is true it is loaded instead of rebuilt.

    Args:
      source_data: iterable of sources; each source is an iterable of lines
        and each line an iterable of items whose element at index 1 is the
        key looked up in `vocab_to_id`.
      id_data_path: path of the pickled id data.
      vocab_to_id: mapping from word to id; unknown words map to UNK_ID.
      cache: Boolean; when False the data is rebuilt even if the file exists.

    Returns:
      The list of id-converted sources.
    """
    if not gfile.Exists(id_data_path) or not cache:
        print("Creating id tokenized data %s" % id_data_path)
        id_data = []
        for source in source_data:
            id_source = [[START_LINE_ID]]
            id_source.extend(
                [vocab_to_id.get(word[1], UNK_ID) for word in line]
                for line in source)
            id_source.append([END_LINE_ID])
            id_data.append(id_source)
        # Pickle data is bytes: the file must be opened in binary mode.
        with gfile.GFile(id_data_path, mode="wb") as id_data_file:
            pickle.dump(id_data, id_data_file)
    else:
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # cache files this program wrote itself.
        with gfile.GFile(id_data_path, mode="rb") as token_file:
            id_data = pickle.load(token_file)
    return id_data
def save_to_file(self, filename):
    """Save only the bare minimum needed to reconstruct this CoverageDB.

    This serializes the data to a single file and can reduce the disk
    footprint of block coverage significantly (depending on overlap and
    number of files).

    Sets `self.filename` to `filename` after a successful write. Raises
    Exception when file backing is disabled (msgpack not available).
    """
    if file_backing_disabled:
        raise Exception(
            "[!] Can't save/load coverage db files without msgpack. Try `pip install msgpack`"
        )
    save_dict = dict()
    save_dict["version"] = 1  # serialized covdb version
    save_dict["module_name"] = self.module_name
    save_dict["module_base"] = self.module_base
    save_dict["coverage_files"] = self.coverage_files

    # save tighter version of block dict {int: int} vice {int: str}
    # Single pass instead of list.index() per entry; setdefault keeps the
    # first index for a repeated path, exactly as list.index() would.
    file_index_map = {}
    for index, filepath in enumerate(self.coverage_files):
        file_index_map.setdefault(filepath, index)

    block_dict_to_save = {
        block: [file_index_map[name] for name in trace_list]
        for block, trace_list in self.block_dict.items()
    }
    save_dict["block_dict"] = block_dict_to_save

    # write packed version to file
    with open(filename, "wb") as f:
        msgpack.dump(save_dict, f)
    self.filename = filename
def main(): lfw_folder = '/data/liubo/face/lfw_face' pair_file = '/data/liubo/face/lfw_pair.txt' same_dist_list = [] no_same_dist_list = [] for line in open(pair_file): tmp = line.rstrip().split() if len(tmp) == 3: person = tmp[0] person_path = os.path.join(lfw_folder, person) pic_list = os.listdir(person_path) if len(pic_list) == 1: print 'error person :', person continue else: np.random.shuffle(pic_list) pic_path1 = os.path.join(person_path, pic_list[0]) pic_path2 = os.path.join(person_path, pic_list[1]) dist = cal_two_pic_distance(pic_path1, pic_path2) same_dist_list.append(dist) elif len(tmp) == 4: person1 = tmp[0] person1_path = os.path.join(lfw_folder, person1) pic1_list = os.listdir(person1_path) person2 = tmp[2] person2_path = os.path.join(lfw_folder, person2) pic2_list = os.listdir(person2_path) if len(pic1_list) > 0 and len(pic2_list) > 0: np.random.shuffle(pic1_list) np.random.shuffle(pic2_list) pic_path1 = os.path.join(person1_path, pic1_list[0]) pic_path2 = os.path.join(person2_path, pic2_list[0]) dist = cal_two_pic_distance(pic_path1, pic_path2) no_same_dist_list.append(dist) msgpack.dump((same_dist_list, no_same_dist_list), open('dist.p', 'wb'))
def select_and_dump_wizard_findings(wizard_findings, target_file):
    """Mark one wizard finding as selected and dump the findings to disk.

    The index comes from `config["function_number"]`; when it is not a valid
    index every finding is listed (with a hexdump of its buffer) and the
    user is asked to choose. All findings are written to the
    'all_targets.msg' sibling of `target_file`, the selected ones to
    `target_file` itself. Exits the process when there are no findings.
    """
    cfg = config
    count = len(wizard_findings)
    if count == 0:
        print_l("[!] No wizard findings!")
        sys.exit()

    index = cfg["function_number"]
    if index not in range(0, count):
        print_l("Functions found:")
        for i, finding in enumerate(wizard_findings):
            if "source" in finding:
                template = "{}) {func_name} from {source}:{start}-{end}"
            else:
                template = "{}) {func_name}"
            print_l(template.format(i, **finding))
            hexdump(bytearray(finding["buffer"]))
        # Let the user select a finding, add it to the config
        index = select_from_range(count, "Choose a function to fuzz> ")

    wizard_findings[index]["selected"] = True

    all_targets_file = target_file.replace("targets.msg", "all_targets.msg")
    with open(all_targets_file, "wb") as msg_file:
        msgpack.dump(wizard_findings, msg_file)

    chosen = [entry for entry in wizard_findings if entry["selected"]]
    with open(target_file, "wb") as msg_file:
        msgpack.dump(chosen, msg_file)
    return wizard_findings
def create_dict(components_pdbx_file_path, msgpack_file_path, subcategory, expected_type):
    """Extract one 'chem_comp' field per component and write it as MessagePack.

    Each block name maps to `expected_type(chem_comp[subcategory])`, or to
    None when the category is missing, cannot be parsed, or the value cannot
    be converted. Progress is printed as a percentage.
    """
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)
    components = pdbx_file.get_block_names()
    total = len(components)

    data_dict = {}
    for position, component in enumerate(components, start=1):
        print(f"{(position / total * 100):4.1f} %", end="\r")
        try:
            cif_dict = pdbx_file.get_category("chem_comp", block=component)
        except ValueError:
            # The 'chem_comp' category may contain unparsable names
            # with wrong quote escaping
            # In this case the PDBx file parser raises an Exception
            cif_dict = None

        # None covers both missing/erroneous info and unparsable data
        data = None
        if cif_dict is not None:
            try:
                data = expected_type(cif_dict[subcategory])
            except ValueError:
                # Unparsable data, e.g. '?' as float
                pass
        data_dict[component] = data
    print()

    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(data_dict, msgpack_file)
def save_msgpack(cls, filename, data):
    """Save data into MSGPACK file

    Parameters
    ----------
    filename : str
        Filename path

    data :
        Data to be stored

    Returns
    -------
    None

    Raises
    ------
    ImportError
        When the msgpack module cannot be imported.

    """
    try:
        import msgpack
    except ImportError:
        # For a class, `cls.__class__.__name__` is the metaclass name
        # ('type'); prefer the class's own name and fall back for instances.
        message = '{name}: Unable to import msgpack module. You can install it with `pip install msgpack-python`.'.format(
            name=getattr(cls, '__name__', cls.__class__.__name__))
        cls.logger().exception(message)
        raise ImportError(message)

    # Close the file deterministically instead of leaking the handle.
    with open(filename, 'wb') as f:
        msgpack.dump(data, f, use_bin_type=True)
def main(args):
    """Load a JSON model, optionally transform its weights, and write it out.

    The model is read from `args.ifile`. When `fn_from_args(args)` yields a
    function object, the first parameter of every layer whose `layerName` is
    in `weight_first_list` is replaced by `transform(weights, fn)`; two-step
    functions first consume all weights and the output is then wrapped
    together with the serialized codebook. The result is written to
    `args.ofile` as UBJSON, JSON or MessagePack depending on `args`.

    Both `args.ifile` and `args.ofile` are closed before returning, even when
    an error occurs.
    """
    logging.basicConfig(level=logging.INFO)
    try:
        model = json.load(args.ifile)
        fn = fn_from_args(args)
        if fn:
            weight_layers = [
                layer for layer in model
                if layer['layerName'] in weight_first_list
            ]
            if fn.needs_two_step:
                # First pass: let the function see every weight set.
                for layer in weight_layers:
                    fn.consume(layer['parameters'][0])
                fn.done()
            for layer in weight_layers:
                layer['parameters'][0] = transform(layer['parameters'][0], fn)
            if fn.needs_two_step:
                model = {'codebook': fn.serialize_codebook(), 'model': model}

        if args.ubjson_format:
            args.ofile.write(simpleubjson.encode(model))
        elif args.json:
            args.ofile.write(json.dumps(model).encode('utf8'))
        else:
            msgpack.dump(model,
                         args.ofile,
                         use_bin_type=True,
                         use_single_float=args.single_precision_float)
    finally:
        args.ofile.close()
        args.ifile.close()
def encrypt_patient_data(policy_pubkey, data_fields, label: bytes = DEFAULT_LABEL, save_as_file: bool = False):
    """Encrypt selected patient fields and add the detail file to IPFS.

    The fields named in `data_fields` are read from the first record of
    'Merkle_json.json' (their 'Value' and 'Hash' entries), MessagePack
    encoded and encrypted for `policy_pubkey`. When `save_as_file` is true
    the encrypted payload is written to PATIENT_DETAIL. Returns the result
    of adding PATIENT_DETAIL to IPFS.
    """
    data_source = DataSource(policy_pubkey_enc=policy_pubkey, label=label)
    data_source_public_key = bytes(data_source.stamp)
    ipfs_api = ipfsapi.connect()

    with open("Merkle_json.json", "r") as read_file:
        records = json.load(read_file)

    first_record = records[0]
    share_data = {}
    for field in data_fields:
        share_data[field] = {
            'Value': first_record[field]['Value'],
            'Hash': first_record[field]['Hash'],
        }

    plaintext = msgpack.dumps(share_data, use_bin_type=True)
    message_kit, _signature = data_source.encrypt_message(plaintext)

    payload = {
        'data_source': data_source_public_key,
        'kits': [message_kit.to_bytes()],
    }
    if save_as_file:
        with open(PATIENT_DETAIL, "wb") as file:
            msgpack.dump(payload, file, use_bin_type=True)

    # NOTE(review): the upload is assumed to run regardless of save_as_file
    # (it then publishes whatever PATIENT_DETAIL already holds) -- confirm.
    res = ipfs_api.add(PATIENT_DETAIL)
    return res
def dump(data, filepath):
    '''
    Serialize data to filepath as JSON or MessagePack.

    The format is chosen from the file extension ('.json' or '.msgpack').
    When data is dict-like, bytes/bytearray values are decoded to text in
    place (in the caller's object) before writing. Raises raeting.KeepError
    when filepath contains a space, has another extension, or names a
    '.msgpack' file while msgpack is unavailable.
    '''
    if ' ' in filepath:
        raise raeting.KeepError("Invalid filepath '{0}' "
                                "contains space".format(filepath))

    if hasattr(data, 'get'):
        # P3 json.dump no encoding parameter
        for key, val in data.items():
            if not isinstance(val, (bytes, bytearray)):
                continue
            data[key] = val.decode('utf-8')

    extension = os.path.splitext(filepath)[1]
    if extension not in ('.json', '.msgpack'):
        raise raeting.KeepError("Invalid filepath ext '{0}' "
                                "not '.json' or '.msgpack'".format(filepath))

    if extension == '.json':
        with ocfn(filepath, "w+") as handle:
            json.dump(data, handle, indent=2)
            # Push the data to disk before the file is closed.
            handle.flush()
            os.fsync(handle.fileno())
        return

    if not msgpack:
        raise raeting.KeepError("Invalid filepath ext '{0}' "
                                "needs msgpack installed".format(filepath))
    with ocfn(filepath, "w+b", binary=True) as handle:
        msgpack.dump(data, handle, encoding='utf-8')
        handle.flush()
        os.fsync(handle.fileno())
def dump(data, filepath):
    '''
    Write data to filepath as JSON or MessagePack.

    The format is chosen from the file extension ('.json' or '.msgpack').
    Raises raeting.KeepError when filepath contains a space, has another
    extension, or names a '.msgpack' file while msgpack is unavailable.
    All validation happens before the file is opened, so a rejected call
    does not leave an empty file behind.
    '''
    if ' ' in filepath:
        raise raeting.KeepError("Invalid filepath '{0}' "
                                "contains space".format(filepath))

    root, ext = os.path.splitext(filepath)
    if ext not in ('.json', '.msgpack'):
        raise raeting.KeepError(
            "Invalid filepath ext '{0}' "
            "not '.json' or '.msgpack'".format(filepath))
    if ext == '.msgpack' and not msgpack:
        raise raeting.KeepError(
            "Invalid filepath ext '{0}' "
            "needs msgpack installed".format(filepath))

    # NOTE(review): MessagePack output is binary; "w+" is kept for
    # compatibility with this version of ocfn -- consider a binary mode.
    with aiding.ocfn(filepath, "w+") as f:
        if ext == '.json':
            json.dump(data, f, indent=2)
        else:
            msgpack.dump(data, f)
        # Push the data to disk before the file is closed.
        f.flush()
        os.fsync(f.fileno())
def dump(data: object, path: str):
    """Write `data`, MessagePack encoded, to `path`.

    `path` may be a filesystem path (str), which is opened in binary write
    mode, or an already open writable binary stream, which is written to
    directly and left open.
    """
    if not isinstance(path, str):
        msgpack.dump(data, path)
        return
    # The former `kwargs['ensure_ascii'] = False` referenced an undefined
    # name (NameError on every string path) and is not a msgpack option.
    with open(path, "wb") as f:
        msgpack.dump(data, f)
def sort_out(filename, from_year, heading_size_for_statistics, top_k, min_n_events=2000, sheet_name="Sheet1"):
    """Split an event spreadsheet by country and export each country's events.

    Reads `input/<filename>`, keeps rows with YEAR >= `from_year`, and for
    every country with at least `min_n_events` parsed events writes
    `[event_types, event_times]` to `output/<name>.msgpack` and the labels to
    `output/<name>-labels.json`.
    """
    # why is xlrd being deprecated in favor of openpyxl when the latter cannot even open certain .xlsx files??
    df = pd.read_excel(f"input/{filename}", sheet_name=sheet_name, engine="xlrd")

    for country, country_df in tqdm(df.groupby("COUNTRY")):
        recent_rows = country_df["YEAR"] >= int(from_year)
        reference_date = dt.datetime(year=int(from_year), month=1, day=1)
        event_types, event_times, labels = parse_events(
            country_df[recent_rows],
            int(heading_size_for_statistics),
            int(top_k),
            reference_date)

        prefix = filename.split('.')[0]
        n_types = len(labels)
        n_events = len(event_types)
        if n_events < min_n_events:
            continue

        label_summary = '; '.join(', '.join(label) for label in labels)
        tqdm.write(' '.join([colored(country, "red"), label_summary]))

        name = f"{prefix}-{country}-since{from_year}-top{top_k}_{n_types}-{n_events//1000}k-events"
        with open(f"output/{name}.msgpack", "wb") as f:
            msgpack.dump([event_types, event_times], f)
        with open(f"output/{name}-labels.json", "w") as f:
            json.dump(labels, f)
def load_check_result_url(dic_file, check_url_file): person_result_dic = {} # {person:([](right_set),[](wrong_set))} # 肯定正确和肯定错的的图片 right_url_count = wrong_url_count = error_format_count = no_baike_count = no_meaning_count = 0 if os.path.exists(dic_file): person_result_dic = msgpack.load(open(dic_file, 'rb')) for line in open(check_url_file): tmp = line.rstrip().split('\t') # [person_name, pic_index, pic_url, baike_name, baike_sim, newbaike_sim, guess_info] person_name = tmp[0] right_list, wrong_list = person_result_dic.get(person_name, ([], [])) if len(tmp) == 7: if tmp[3] not in no_meaning_list: if tmp[3] == no_find_baike: no_baike_count += 1 continue else: if get_newbaike_sim(tmp[4]) > sim_threshold: if tmp[0] == tmp[3]: right_list.append(tmp[1]) right_url_count += 1 else: wrong_url_count += 1 wrong_list.append(tmp[1]) else: # 小于某概率时结果不可信,需要标注 no_baike_count += 1 continue else: no_meaning_count += 1 continue else: error_format_count += 1 continue person_result_dic[person_name] = (right_list, wrong_list) print right_url_count, wrong_url_count, no_baike_count, no_meaning_count, error_format_count msgpack.dump(person_result_dic, open('person_result_dic.p', 'w'))
def crawler_baidu_person_list(): # 从百度人气榜上获取人名 all_person_list = set() for index in range(50): url = 'http://baike.baidu.com/operation/api/starflowerstarlist?' \ 'rankType=thisWeek&pg=%d' % index req = urllib2.Request(url, None) response = urllib2.urlopen(req) html_doc = response.read() content = json.loads(html_doc) this_page_list = content.get('data').get('thisWeek') for person_content in this_page_list: all_person_list.add(person_content.get('name')) print len(all_person_list) for index in range(50): url = 'http://baike.baidu.com/operation/api/starflowerstarlist?' \ 'rankType=lastWeek&pg=%d' % index req = urllib2.Request(url, None) response = urllib2.urlopen(req) html_doc = response.read() content = json.loads(html_doc) this_page_list = content.get('data').get('lastWeek') for person_content in this_page_list: all_person_list.add(person_content.get('name')) print len(all_person_list) all_person_set = list(all_person_list) msgpack.dump(all_person_set, open('baidu_fans.p', 'w'))
def get_total_potential(out_file='potential.msg'):
    """Calculate and save the total potential.

    Requires a previously relaxed calculation.

    Parameters
    ----------
    out_file : str
        Name of the output file to save the results to. A falsy value
        skips writing to disk.

    Returns
    -------
    str
        JSON encoding of the total potential.
    """
    atoms = get_relaxed_calculation()
    calc = atoms.get_calculator()

    # Collect the total potential and write to disk
    potential = calc.extract_total_potential()
    potential = list(potential)
    array_to_list(potential)

    # If outfile, write a MessagePack encoded version to disk
    if out_file:
        # MessagePack is a binary format: open in binary mode so the bytes
        # are written untranslated on every platform and Python version.
        with open(out_file, 'wb') as f:
            msgpack.dump(potential, f)

    # Return a BSON friendly version. UTF-8 is already the default encoding
    # on Python 2 and the `encoding` keyword does not exist on Python 3.
    return json.dumps(potential)
def freqs_to_cBpack(input_file, output_file, cutoff=600):
    """
    Convert a frequency list into the idiosyncratic 'cBpack' format that
    will be loaded by wordfreq: a list in msgpack format of frequency tiers,
    each tier being one centibel (a factor of 10^(1/100)) less frequent than
    the previous tier.

    Reading stops at the first word whose negated centibel value reaches
    `cutoff`. Raises ValueError when the input contains a '__total__' row.
    """
    tiers = []
    for line in input_file:
        word, strfreq = line.rstrip().split('\t', 1)
        if word == '__total__':
            raise ValueError("This is a count file, not a frequency file")

        neg_cB = -(round(math.log10(float(strfreq)) * 100))
        if neg_cB >= cutoff:
            break

        # Grow the tier list until index neg_cB exists.
        tiers.extend([] for _ in range(neg_cB + 1 - len(tiers)))
        tiers[neg_cB].append(word)

    for tier in tiers:
        tier.sort()

    header = {'format': 'cB', 'version': 1}
    msgpack.dump([header] + tiers, output_file)
def crawler_fans(): all_person_set = set() f_fans = open('fans.txt', 'w') for page_index in range(1, 345, 1): for class_index in [1, 2, 4, 5, 6]: start = time() try: url = 'https://123fans.cn/results.php?qi=%d&c=%d'\ % (page_index, class_index) req = urllib2.Request(url, None) response = urllib2.urlopen(req) html_doc = response.read() soup = BeautifulSoup(html_doc) children = [k for k in soup.children][1] lis = children.select('.odd') for k in lis: try: name = k.select('.name')[0].string all_person_set.add(name) f_fans.write(name+'\n') except: traceback.print_exc() continue print page_index, class_index, \ len(all_person_set), time()-start except: print 'error', page_index, class_index sleep(5) all_person_set = list(all_person_set) msgpack.dump(all_person_set, open('fans.p', 'w'))
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
    """
    Convert a csv file of words and their frequencies to a file in the
    idiosyncratic 'cBpack' format.

    Only words with a frequency greater than `cutoff` centibels will be
    written to the new file.

    This cutoff should not be stacked with a cutoff in `read_freqs`; doing
    so would skew the resulting frequencies.
    """
    freqs = read_freqs(in_filename, cutoff=0, lang=None)

    tiers = []
    for token, freq in freqs.items():
        cB = round(math.log10(freq) * 100)
        if cB > cutoff:
            neg_cB = -cB
            # Grow the tier list until index neg_cB exists.
            tiers.extend([] for _ in range(neg_cB + 1 - len(tiers)))
            tiers[neg_cB].append(token)

    for tier in tiers:
        tier.sort()

    # Write a "header" consisting of a dictionary at the start of the file
    header = {'format': 'cB', 'version': 1}
    with gzip.open(out_filename, 'wb') as outfile:
        msgpack.dump([header] + tiers, outfile)
def dump(data, path):
    """
    Serialize data dict and write to file given by path where serialization
    is given by path's extension of either JSON, MsgPack, or CBOR for
    extension .json, .mgpk, or .cbor respectively

    Raises IOError when path contains a space or has another extension.
    """
    if ' ' in path:
        raise IOError(f"Invalid file path '{path}' contains space.")

    root, ext = os.path.splitext(path)
    if ext not in ('.json', '.mgpk', '.cbor'):
        raise IOError(f"Invalid file path ext '{path}' "
                      f"not '.json', '.mgpk', or 'cbor'.")

    # json.dump writes str and therefore needs a text-mode file; the
    # MessagePack and CBOR encoders write bytes.
    mode = "w+" if ext == '.json' else "w+b"
    with ocfn(path, mode) as f:
        if ext == '.json':
            json.dump(data, f, indent=2)
        elif ext == '.mgpk':
            msgpack.dump(data, f)
        else:
            cbor.dump(data, f)
        # Push the data to disk before the file is closed.
        f.flush()
        os.fsync(f.fileno())
def encrypt_track_segments(policy_pubkey, dir_path):
    """Encrypt every file in `dir_path` into a sibling 'segments_encrypted' dir.

    Each output file keeps the input's name and holds a MessagePack dict with
    the encrypted bytes (`'track_segment_data'`) and the data source's public
    key (`'data_source'`). Returns True.
    """
    enrico = Enrico(policy_encrypting_key=policy_pubkey)
    source_key = bytes(enrico.stamp)
    print(dir_path)

    parent_dir = "/".join(dir_path.split('/')[:-1])
    target_path = os.path.join(parent_dir, 'segments_encrypted')
    if not os.path.exists(target_path):
        try:
            os.makedirs(target_path)
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    for track_segment in os.scandir(dir_path):
        with open(track_segment, "rb") as f:
            plaintext = f.read()

        ciphertext, signature = enrico.encrypt_message(plaintext)
        print("Signature", signature)

        data = {
            'track_segment_data': ciphertext.to_bytes(),
            'data_source': source_key
        }
        encrypted_file = os.path.join(target_path, track_segment.name)
        with open(encrypted_file, "wb") as f:
            msgpack.dump(data, f, use_bin_type=True)
    return True
def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    """Run the wedge step on `cluster_set` using a temporary h5py cache file.

    Sets the module globals `edge_cut_prob`, `wedge_thrsh` and `h5file`.

    :param cluster_set: object exposing `last_name`, `clusters` and
        `num_all_bibs`; it is converted, processed by `do_wedge` and restored.
    :param report_cluster_status: when true, pairwise cluster comparisons are
        collected and a status report is dumped under /tmp/baistats.
    :param force_wedge_thrsh: when truthy, used as the threshold instead of
        `bconfig.WEDGE_THRESHOLD`.
    """
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh
    if not force_wedge_thrsh:
        edge_cut_prob = bconfig.WEDGE_THRESHOLD / 4.
        wedge_thrsh = bconfig.WEDGE_THRESHOLD
    else:
        edge_cut_prob = force_wedge_thrsh / 4.
        wedge_thrsh = force_wedge_thrsh
    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load()
    global h5file
    # The cache file name is keyed on the process id; it is removed at the end.
    h5filepath = bconfig.TORTOISE_FILES_PATH+'wedge_cache_'+str(PID())
    h5file = h5py.File(h5filepath)
    convert_cluster_set(cluster_set, matr)
    del matr  # be sure that this is the last reference!
    do_wedge(cluster_set)
    report = []
    if bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES or report_cluster_status:
        msg = []
        # Compare every ordered pair once (cl2 > cl1) in both directions.
        for cl1 in cluster_set.clusters:
            for cl2 in cluster_set.clusters:
                if cl2 > cl1:
                    id1 = cluster_set.clusters.index(cl1)
                    id2 = cluster_set.clusters.index(cl2)
                    c12 = _compare_to(cl1,cl2)
                    c21 = _compare_to(cl2,cl1)
                    report.append((id1,id2,c12+c21))
                    msg.append( ' %s vs %s : %s + %s = %s -- %s' % (id1, id2, c12, c21, c12+c21, cl1.hates(cl2)))
        msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(msg)
        # Print directly only when the wedge debug output is off; the
        # message is handed to wedge_print in either case.
        if not bconfig.DEBUG_WEDGE_OUTPUT and bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES:
            print
            print msg
            print
        wedge_print(msg)
    restore_cluster_set(cluster_set)
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()
    if report_cluster_status:
        # NOTE(review): assumes /tmp/baistats already exists -- confirm.
        destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (str(PID()),str(cluster_set.last_name),str(wedge_thrsh))
        f = filehandler.open(destfile, 'w')
        SER.dump([wedge_thrsh,cluster_set.last_name,report,cluster_set.num_all_bibs],f)
        f.close()
    gc.collect()
    h5file.close()
    os.remove(h5filepath)
def dump(cls, records, filepath=None):
    """Dump in the same fashion as load.

    Writes `records` as MessagePack through `utils.FaultTolerantFile`, using
    `cls._dumper` for objects msgpack cannot encode itself. When `filepath`
    is None, 'repos.msgpack' inside the current snapshot directory is used.
    """
    target = filepath
    if target is None:
        target = os.path.join(config['current_snapshot'], 'repos.msgpack')

    with utils.FaultTolerantFile(target) as sink:
        msgpack.dump(records, sink, default=cls._dumper)
def __call__(self):
    """Return the wrapped function's result, memoized in a msgpack file.

    When no filename was given, one is derived from the wrapped function's
    name under `msgpackmemoized_basedir`. An existing file is loaded and
    returned; otherwise the function is called and its result is stored.
    """
    if self.filename is None:
        self.filename = msgpackmemoized_basedir + '/' + self.f.__name__ + '.msgpack'
    if path.exists(self.filename):
        # MessagePack is binary: read in binary mode and close the file.
        with open(self.filename, 'rb') as cache_file:
            return msgpack.load(cache_file)
    result = self.f()
    with open(self.filename, 'wb') as cache_file:
        msgpack.dump(result, cache_file)
    return result
def serializeToFile(self, fname, annotations):
    """
    Overwritten to write Msgpack files.

    Writes `annotations`, MessagePack encoded, to the file `fname`.
    """
    # TODO make all image filenames relative to the label file
    import msgpack
    # The file was previously never closed, so buffered data could be lost;
    # binary mode is required for MessagePack output.
    with open(fname, "wb") as f:
        msgpack.dump(annotations, f)
def dump(self, outfile):
    """Serialize the database as MessagePack into the given filehandle.

    The written dict holds `meta_prints`, `content_prints` and `series_id`.
    """
    db_dict = dict(
        meta_prints=self.meta_prints,
        content_prints=self.content_prints,
        series_id=self.series_id,
    )
    msgpack.dump(db_dict, outfile)
def dump_file(self, obj, fp):
    """Write `obj` as MessagePack to the open file `fp`.

    Returns True on success. Any exception is logged as a warning (with
    traceback) and reported by returning False.
    """
    try:
        msgpack.dump(obj, fp)
    except Exception as ex:
        log.warn('Unable to dump object to file: %s', ex, exc_info=True)
        return False
    else:
        return True
def match_one_file(midi_filename, embed_fn, hash_fn, msd_embeddings,
                   msd_sequences, msd_feature_paths, msd_ids,
                   output_filename):
    """
    Match one MIDI file to the million song dataset by computing its CQT,
    pruning by matching its embedding, re-pruning by matching its downsampled
    hash sequence, and finally doing DTW on CQTs on the remaining entries.

    Returns early, writing nothing, when the MIDI file cannot be parsed, its
    CQT cannot be computed, or the CQT has more than MAX_FRAMES frames.

    Parameters
    ----------
    midi_filename : str
        Path to a MIDI file to match to the MSD
    embed_fn : function
        Function which takes in a CQT and produces a fixed-length embedding
    hash_fn : function
        Function which takes in a CQT and produces a sequence of binary vectors
    msd_embeddings : np.ndarray
        (# MSD entries x embedding dimension) matrix of all embeddings for all
        entries from the MSD
    msd_sequences : list of np.ndarray
        List of binary vector sequences (represented as ints) for all MSD
        entries
    msd_feature_paths : list of str
        Path to feature files (containing CQT) for each MSD entry
    msd_ids : list of str
        MSD ID of each corresponding entry in the above lists
    output_filename : str
        Where to write the results file, which includes the DTW scores for all
        of the non-pruned MSD entries
    """
    # Try to compute a CQT for the MIDI file
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        # format_exc() takes an optional frame *limit*, not the exception;
        # called with no argument it formats the exception being handled.
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception:
        print("Error creating CQT for {}: {}".format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return
    # Skip this file if the MIDI gram is very long, to avoid memory issues
    if midi_gram.shape[0] > MAX_FRAMES:
        return
    # Both functions receive the CQT with two leading singleton axes
    network_input = midi_gram.reshape(1, 1, *midi_gram.shape)
    # Compute the embedding of the CQT
    midi_embedding = embed_fn(network_input)
    # Compute the hash sequence
    midi_hash_sequence = hash_fn(network_input)
    # Convert to sequence of integers
    midi_hash_sequence = dhs.vectors_to_ints(midi_hash_sequence > 0)
    midi_hash_sequence = midi_hash_sequence.astype(np.uint32)
    matches = match_one_midi(
        midi_gram, midi_embedding, midi_hash_sequence,
        msd_embeddings, msd_sequences, msd_feature_paths, msd_ids)
    # Write out the result
    with open(output_filename, 'wb') as f:
        msgpack.dump(matches, f)
def SavePermResults(path, name, method, *args):
    """Store the extra positional arguments as (index, value) pairs.

    With method 'msgpack' the pairs are written to '<name>.mpac' inside
    `path` using msgpack; with any other method they are pickled to
    '<name>.pickle' inside `path`.
    """
    indexed_values = list(enumerate(args))

    if method == 'msgpack':
        serializer, extension = msgpack, '.mpac'
    else:
        serializer, extension = pickle, '.pickle'

    target = os.path.join(path, name + extension)
    with open(target, 'wb') as handle:
        serializer.dump(indexed_values, handle)
def make_hanzi_converter(table_in, msgpack_out):
    """Build a code point -> character table and write it as gzipped msgpack.

    `table_in` is read as UTF-8 text, one tab-separated `hexcode<TAB>char`
    pair per line. An entry is kept only when the character differs from the
    one the hex code point itself denotes. The resulting dict (int keys,
    single-character string values) is packed into `msgpack_out`.
    """
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with gzip.open(msgpack_out, 'wb') as outfile:
        # `raw` is an option of the msgpack Unpacker; the Packer behind
        # msgpack.dump does not accept it, so it is not passed here.
        msgpack.dump(table, outfile)
def make_hanzi_converter(table_in, msgpack_out):
    """Build a code point -> character table and write it as gzipped msgpack.

    `table_in` is read as UTF-8 text, one tab-separated `hexcode<TAB>char`
    pair per line. An entry is kept only when the character differs from the
    one the hex code point itself denotes. The resulting dict (int keys,
    single-character string values) is packed into `msgpack_out`.
    """
    table = {}
    with open(table_in, encoding="utf-8") as infile:
        for line in infile:
            hexcode, char = line.rstrip("\n").split("\t")
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with gzip.open(msgpack_out, "wb") as outfile:
        # The Packer `encoding` keyword was deprecated and later removed from
        # msgpack; strings are packed as UTF-8 without it.
        msgpack.dump(table, outfile)
def write_http_response_to_temp_file(http_response):
    """
    Write an HTTPResponse instance to a temp file using msgpack

    The temp file is closed even when serialization fails.

    :param http_response: The HTTP response
    :return: The name of the file
    """
    temp = get_temp_file('http')
    try:
        data = http_response.to_dict()
        msgpack.dump(data, temp, use_bin_type=True)
    finally:
        # Do not leak the handle if to_dict() or msgpack.dump() raises
        temp.close()
    return temp.name
def write_tags_to_temp_file(tag_list):
    """
    Write a Tag list to a temp file using msgpack

    The temp file is closed even when serialization fails.

    :param tag_list: The Tag list
    :return: The name of the file
    """
    temp = get_temp_file('tags')
    try:
        data = [t.to_dict() for t in tag_list]
        msgpack.dump(data, temp, use_bin_type=True)
    finally:
        # Do not leak the handle if to_dict() or msgpack.dump() raises
        temp.close()
    return temp.name
def pickle_object(path, targetobject, json_pickle=False):
    """
    Serialize targetobject to path with msgpack; when json_pickle is true,
    json_object(path, targetobject) is called afterwards as well.

    @type path: str or unicode
    @type targetobject: object
    @type json_pickle: bool
    """
    # msgpack has no HIGHEST_PROTOCOL and msgpack.dump takes no protocol
    # argument (both belong to pickle), so only object and stream are passed.
    with open(path, "wb") as fh:
        msgpack.dump(targetobject, fh)

    if json_pickle:
        # Dict and non-dict objects were handled by the same call.
        json_object(path, targetobject)
def save(self, filename):
    """Write `self.todict()` to `filename` as msgpack via a temporary file.

    The data is first written to '<filename>.inprog-<random int>'. An
    existing `filename` is renamed to '<filename>.bak', then the temporary
    file is renamed to `filename`.
    """
    tfn = '%s.inprog-%d' % (filename, random.randint(1, 10000000))
    # msgpack output is bytes, so the file must be opened in binary mode
    with open(tfn, 'wb') as fh:
        msgpack.dump(self.todict(), fh)
    if os.path.exists(filename):
        os.rename(filename, '%s.bak' % filename)
    os.rename(tfn, filename)
def create_train_valid_data(folder='/data/liubo/face/research_feature_self'):
    """Collect every stored feature under `folder` into 'research_feature.p'.

    `folder` is expected to contain one sub-directory per person, each
    holding feature files readable by `msgpack_numpy.load`. The result is a
    dict mapping each feature file path to its loaded content, dumped with
    msgpack to 'research_feature.p' in the current working directory.
    """
    # Original note: train the face verification model from already
    # existing data.
    person_list = os.listdir(folder)
    path_feature_dic = {}
    for person in person_list:
        person_path = os.path.join(folder, person)
        for feature_name in os.listdir(person_path):
            pic_feature_path = os.path.join(person_path, feature_name)
            # Close each feature file right after reading it; one handle per
            # file would otherwise stay open.
            with open(pic_feature_path, 'rb') as feature_file:
                path_feature_dic[pic_feature_path] = msgpack_numpy.load(feature_file)
    with open('research_feature.p', 'wb') as out_file:
        msgpack.dump(path_feature_dic, out_file)
def save(self):
    """Save object into DB.

    Inserts a new row into `self._DATA_TABLE` when `self.id` is not set,
    otherwise updates the row with that id. Afterwards the request and
    response dicts are written, together with `self._MSGPACK_CANARY`, as
    msgpack to the file returned by `self._get_fname_for_id(self.id)`.

    :return: True
    """
    resp = self.response

    values = [
        resp.get_id(),
        self.request.get_uri().url_string,
        resp.get_code(),
        self.tag,
        int(self.mark),
        str(resp.info()),
        resp.get_wait_time(),
        resp.get_msg(),
        resp.content_type,
        resp.charset,
        self.request.get_method(),
        len(resp.body),
        # Hundreds digit of the status code; floor division keeps it an
        # integer on Python 3 as well
        int(resp.get_code()) // 100,
        resp.get_alias(),
        int(self.request.get_uri().has_query_string()),
    ]

    if not self.id:
        sql = ('INSERT INTO %s '
               '(id, url, code, tag, mark, info, time, msg, content_type, '
               'charset, method, response_size, codef, alias, has_qs) '
               'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)' % self._DATA_TABLE)
        self._db.execute(sql, values)
        self.id = self.response.get_id()
    else:
        values.append(self.id)
        sql = ('UPDATE %s'
               ' SET id = ?, url = ?, code = ?, tag = ?, mark = ?, info = ?, '
               'time = ?, msg = ?, content_type = ?, charset = ?, '
               'method = ?, response_size = ?, codef = ?, alias = ?, has_qs = ? '
               ' WHERE id = ?' % self._DATA_TABLE)
        self._db.execute(sql, values)

    #
    # Save raw data to file
    #
    fname = self._get_fname_for_id(self.id)
    data = (self.request.to_dict(),
            self.response.to_dict(),
            self._MSGPACK_CANARY)

    # The context manager closes the file even if msgpack.dump raises
    with open(fname, 'wb') as req_res:
        msgpack.dump(data, req_res)

    return True
def _write(self):
    '''
    Write out to disk

    Does nothing when HAS_MSGPACK is false. Otherwise the cache dict and the
    key cache times are packed with msgpack into ``self._path``.
    '''
    if not HAS_MSGPACK:
        return
    # TODO Add check into preflight to ensure dir exists
    # TODO Dir hashing?
    # msgpack emits bytes, so the file is opened in binary mode
    with salt.utils.fopen(self._path, 'wb+') as fp_:
        cache = {
            "CacheDisk_data": self._dict,
            "CacheDisk_cachetime": self._key_cache_time
        }
        msgpack.dump(cache, fp_)
def find_url_list():
    """Attach a URL to every index stored in 'pic_face_index_dic.p'.

    The input file maps a person to a list of indices. For each person the
    file '/data/url/<person>.txt' is read and split into lines; each index
    is replaced by the pair (index, last tab-separated field of the line at
    that index). The updated mapping is dumped with msgpack to
    'pic_face_index_url_dic.p'.
    """
    with open('pic_face_index_dic.p', 'rb') as index_file:
        pic_face_index_dic = msgpack.load(index_file)
    url_folder = '/data/url'
    for person in pic_face_index_dic:
        print(person)
        # Close the per-person URL file as soon as it has been read
        with open(os.path.join(url_folder, person+'.txt'), 'r') as url_file:
            url_list = url_file.read().split('\n')
        need_check_url_index_list = pic_face_index_dic.get(person)
        for position, url_index in enumerate(need_check_url_index_list):
            fields = url_list[int(url_index)].split('\t')
            need_check_url_index_list[position] = (url_index, fields[-1])
        pic_face_index_dic[person] = need_check_url_index_list
    with open('pic_face_index_url_dic.p', 'wb') as out_file:
        msgpack.dump(pic_face_index_dic, out_file)
def convert(infile, outfile):
    """Convert the JSON file `infile` into a msgpack file.

    When `outfile` is falsy it is derived from `infile` by replacing the
    file extension with `EXT`; an input without an extension simply gets
    `EXT` appended.
    """
    if not outfile:
        # splitext only looks at the last path component, so dots in
        # directory names and extension-less inputs are handled correctly.
        import os.path
        outfile = '%s%s' % (os.path.splitext(infile)[0], EXT)

    print('%s > %s' % (infile, outfile))

    print('reading in JSON')
    with open(infile) as op:
        data = json.load(op)

    print('writing to msgpack')
    with open(outfile, 'wb') as op:
        msgpack.dump(data, op)
def start(interval=3600, expire=604800):
    """Loop forever, deleting minion keys that have not been seen recently.

    Every `interval` seconds the last-seen timestamps stored in the presence
    file under the master cachedir are refreshed: keys missing from the file
    and keys reported as connected are stamped with the current time. Keys
    whose timestamp is more than `expire` seconds old are deleted through
    the wheel client and dropped from the presence data, which is then
    written back to the presence file.
    """
    ck = salt.utils.minions.CkMinions(__opts__)
    presence_file = "{0}/minions/presence.p".format(__opts__["cachedir"])
    wheel = salt.wheel.WheelClient(__opts__)

    while True:
        log.debug("Checking for present minions")
        minions = {}
        if os.path.exists(presence_file):
            try:
                # msgpack data is binary
                with salt.utils.fopen(presence_file, "rb") as f:
                    minions = msgpack.load(f)
            except IOError as e:
                log.error("Could not open presence file {0}: {1}".format(presence_file, e))
                time.sleep(interval)
                continue

        minion_keys = _get_keys()
        now = time.time()
        present = ck.connected_ids()

        # For our existing keys, check which are present
        for m in minion_keys:
            # If we have a key that's not in the presence file, it may be a new minion
            # It could also mean this is the first time this engine is running and no
            # presence file was found
            if m not in minions:
                minions[m] = now
            elif m in present:
                minions[m] = now

        log.debug("Finished checking for present minions")

        # Delete old keys
        stale_keys = [m for m, seen in minions.items() if now - expire > seen]

        for k in stale_keys:
            log.info("Removing stale key for {0}".format(k))
            # The second argument of wheel.cmd is the positional-argument
            # list of the wheel function, so each key is deleted with its
            # own call and removed from the presence data.
            wheel.cmd("key.delete", [k])
            del minions[k]

        try:
            with salt.utils.fopen(presence_file, "wb") as f:
                msgpack.dump(minions, f)
        except IOError as e:
            log.error("Could not write to presence file {0}: {1}".format(presence_file, e))
        time.sleep(interval)
def get_textunits(filename):
    """Build TextUnit records for `filename` and store them as msgpack.

    The output goes to OUTPUT_DIRECTORY/<basename of filename>.mpack; when
    that file already exists nothing is done. Each newline-delimited group
    read from `filename` is turned into a TextUnit; groups for which
    construction raises are reported on stdout and skipped, and only units
    with at least one identity are kept.

    Returns the string 'done'.
    """
    out_fn = os.path.join(OUTPUT_DIRECTORY, os.path.basename(filename))
    out_path = out_fn + ".mpack"
    print(out_path)
    if os.path.exists(out_path):
        return 'done'

    ##### GET THE DATA
    dep_parse = read_grouped_by_newline_file(filename)
    print(filename)

    to_write = []
    for i, x in enumerate(dep_parse):
        if i % 5000 == 0:
            print(i)
        spl = x[0].split("\t")
        uid = spl[11]
        tweet_id = spl[10]
        date = spl[-2]
        try:
            s = TextUnit(uid + "\t" + tweet_id, date,
                         sent_to_id, identity_to_id, gram_list,
                         emoji_info=[emoji_data, emoji_regex],
                         emoticon_to_eval_dim=False,
                         dependency_parsed_conll=x,
                         sent_values=sent_values,
                         hashtag_epa_data=False,
                         vader_dict=False,
                         do_negation_on_full_sentence=True,
                         use_events=True,
                         use_behaviors=True,
                         use_isa=False,
                         use_clause_level=True,
                         use_parent_child=False,
                         use_own_full_sentence=False)
            if len(s.identities):
                to_write.append(s)
        except Exception:
            # Best effort: report and skip this group, but let
            # KeyboardInterrupt / SystemExit propagate.
            print('failed %s %s' % (i, filename))

    dat = [[unit.unit_id, unit.date, unit.identities, unit.raw_text,
            unit.identities_to_constraint_string_map,
            unit.constraint_string_list,
            unit.full_deflection_string] for unit in to_write]
    with open(out_path, "wb") as out_file:
        msgpack.dump(dat, out_file)

    return 'done'