Example #1
def get_pdos(out_file='dos.msg'):
    """Calculate and save the projected DOS. Requires a previously relaxed
    calculation.

    Parameters
    ----------
    out_file : str
        Name of the output file to save the results to.
    """
    atoms = get_relaxed_calculation()
    calc = atoms.get_calculator()

    # Calculate the pdos and write to disk
    dos = calc.calc_pdos(nscf=True,
                         kpts=atoms.info['kpts'] * [2, 2, 1],
                         DeltaE=0.01,
                         slab=True,
                         Emin=-40,
                         Emax=40,
                         tetrahedra=False,
                         sigma=0.2)

    dos = list(dos)
    array_to_list(dos)

    # If outfile, write a MessagePack encoded version to disk
    if out_file:
        with open(out_file, 'w') as f:
            msgpack.dump(dos, f)

    # Return a BSON friendly version
    return json.dumps(dos, encoding='utf-8')
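A minimal read-back sketch, not part of the example: the helper name is hypothetical, it assumes the msgpack package is available, and it opens the file in binary mode (the example above writes in text mode, a Python 2 idiom).
import msgpack

def read_pdos(in_file='dos.msg'):
    """Load the projected DOS previously written by get_pdos."""
    with open(in_file, 'rb') as f:
        return msgpack.load(f, raw=False)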
Example #2
def get_or_build(path, build_fn, *args, **kwargs):
    """
    Load from serialized form or build an object, saving the built
    object.

    Remaining arguments are provided to `build_fn`.
    """

    save = False
    obj = None

    if path is not None and os.path.isfile(path):
        with open(path, 'rb') as obj_f:
            obj = msgpack.load(obj_f, use_list=False, encoding='utf-8')
    else:
        save = True

    if obj is None:
        obj = build_fn(*args, **kwargs)

        if save and path is not None:
            with open(path, 'wb') as obj_f:
                msgpack.dump(obj, obj_f)

    return obj
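A short usage sketch under stated assumptions: build_squares is a hypothetical build function, and the os/msgpack imports used by get_or_build are in scope (the example itself targets the pre-1.0 msgpack API, which still accepted the encoding= argument). The expensive computation runs only when the cache file does not exist yet; later calls load the saved result.
def build_squares(n):
    # stand-in for an expensive computation
    return [i * i for i in range(n)]

squares = get_or_build('squares.msgpack', build_squares, 1000)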
Example #3
def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh

    if not force_wedge_thrsh:
        edge_cut_prob = bconfig.WEDGE_THRESHOLD / 4.
        wedge_thrsh = bconfig.WEDGE_THRESHOLD
    else:
        edge_cut_prob = force_wedge_thrsh / 4.
        wedge_thrsh = force_wedge_thrsh

    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load()

    global h5file
    h5filepath = bconfig.TORTOISE_FILES_PATH + 'wedge_cache_' + str(PID())
    h5file = h5py.File(h5filepath)

    convert_cluster_set(cluster_set, matr)
    del matr  # be sure that this is the last reference!

    do_wedge(cluster_set)

    report = []
    if report_cluster_status:
        msg = []
        for cl1 in cluster_set.clusters:
            for cl2 in cluster_set.clusters:
                if cl2 > cl1:
                    id1 = cluster_set.clusters.index(cl1)
                    id2 = cluster_set.clusters.index(cl2)
                    c12 = _compare_to(cl1, cl2)
                    c21 = _compare_to(cl2, cl1)
                    report.append((id1, id2, c12 + c21))
                    msg.append(' %s vs %s : %s + %s = %s -- %s' %
                               (id1, id2, c12, c21, c12 + c21, cl1.hates(cl2)))
        msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(
            msg)
        logger.log(msg)

    restore_cluster_set(cluster_set)

    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    if report_cluster_status:
        destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (
            str(PID()), str(cluster_set.last_name), str(wedge_thrsh))
        f = filehandler.open(destfile, 'w')
        SER.dump([
            wedge_thrsh, cluster_set.last_name, report,
            cluster_set.num_all_bibs
        ], f)
        f.close()
    gc.collect()

    h5file.close()
    os.remove(h5filepath)
Example #4
def main(args):
    logging.basicConfig(level=logging.INFO)
    model = json.load(args.ifile)
    fn = fn_from_args(args)
    if fn:
        weight_layers = [layer for layer in model
                         if layer['layerName'] in weight_first_list]
        if fn.needs_two_step:
            for weights in [layer['parameters'][0] for layer in weight_layers]:
                fn.consume(weights)
            fn.done()
        for i, layer in enumerate(weight_layers):
            layer['parameters'][0] = transform(layer['parameters'][0], fn)
        if fn.needs_two_step:
            model = {
                'codebook' : fn.serialize_codebook(),
                'model' : model
            }
    if args.ubjson_format:
        args.ofile.write(simpleubjson.encode(model))
    elif args.json:
        args.ofile.write(json.dumps(model).encode('utf8'))
    else:
        msgpack.dump(model, args.ofile, use_bin_type=True,
                     use_single_float=args.single_precision_float)
    args.ofile.close()
    args.ifile.close()
Example #5
def to_local_file(obj, filename, to_save_path=None):
    """
    Save an object to a file.

    Params
    ------
    obj : object
        A serializable object composed of built-in Python types.

    filename : str
        Object filename.

    to_save_path : str, optional
        Directory where the object will be saved; defaults to
        DEFAULT_DATA_DIR_NAME.

    """
    if to_save_path is None:
        _init_data_dir()
        to_save_path = DEFAULT_DATA_DIR_NAME

    fpath = path.join(to_save_path, filename)

    try:
        with open(fpath, mode='wb') as file:
            mpk.dump(obj, file)
    except TypeError:
        raise Exception('`obj` could not be saved because it ' +
                        'is not serializable.')
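A brief usage sketch with hypothetical values; it assumes the module-level imports (path, mpk) and the DEFAULT_DATA_DIR_NAME/_init_data_dir helpers implied above are available.
to_local_file({'alpha': 1, 'beta': [2, 3]}, 'params.msgpack')
to_local_file([0.1, 0.2, 0.3], 'weights.msgpack', to_save_path='/tmp')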
Example #6
 def load_embeddings(self):
     """generate embeddings suited for the current vocab or load previously cached ones."""
     embedding_file = os.path.join(self.args.output_dir,
                                   'embedding.msgpack')
     if not os.path.exists(embedding_file):
         if self.args.language == "chinese":
             embeddings = load_embeddings_Chinese(
                 self.args.pretrained_embeddings,
                 self.vocab,
                 self.args.embedding_dim,
                 mode=self.args.embedding_mode,
                 lower=self.args.lower_case)
         else:
             embeddings = load_embeddings_English(
                 self.args.pretrained_embeddings,
                 self.vocab,
                 self.args.embedding_dim,
                 mode=self.args.embedding_mode,
                 lower=self.args.lower_case)
         with open(embedding_file, 'wb') as f:
             msgpack.dump(embeddings, f)
     else:
         with open(embedding_file, 'rb') as f:
             embeddings = msgpack.load(f)
     return embeddings
Example #7
    def setUp(self):
        self.dir = tempfile.mkdtemp()
        self.runner = click.testing.CliRunner()
        if self.keep_tree:
            print 'KmlTrackTest is running in %s' % self.dir
        self.infile_msgpack = os.path.join(self.dir, 'in.msgpack')
        with open(self.infile_msgpack, 'w') as f:
            for row in self.test_track:
                msgpack.dump(row, f)

        self.infile_json = os.path.join(self.dir, 'in.json')
        with open(self.infile_json, 'w') as f:
            for row in self.test_track:
                json.dump(row, f)
                f.write('\n')

        self.infile_csv = os.path.join(self.dir, 'in.csv')
        with open(self.infile_csv, 'w') as f:
            f = csv.DictWriter(
                f, fieldnames=['lat', 'lon', 'timestamp', 'course', 'color'])
            f.writeheader()
            for row in self.test_track:
                f.writerow(row)

        self.outfile = os.path.join(self.dir, 'out.kml')
Example #8
    def dump(data, filepath):
        '''
        Write data as type self.ext to filepath. json or msgpack
        '''
        if ' ' in filepath:
            raise raeting.KeepError("Invalid filepath '{0}' "
                                    "contains space".format(filepath))

        root, ext = os.path.splitext(filepath)
        if ext == '.json':
            with aiding.ocfn(filepath, "w+") as f:
                json.dump(data, f, indent=2, encoding='utf-8')
                f.flush()
                os.fsync(f.fileno())
        elif ext == '.msgpack':
            if not msgpack:
                raise raeting.KeepError("Invalid filepath ext '{0}' "
                            "needs msgpack installed".format(filepath))
            with aiding.ocfn(filepath, "w+b", binary=True) as f:
                msgpack.dump(data, f, encoding='utf-8')
                f.flush()
                os.fsync(f.fileno())
        else:
            raise raeting.KeepError("Invalid filepath ext '{0}' "
                        "not '.json' or '.msgpack'".format(filepath))
Example #9
def put_password(policy_pubkey,
                 username,
                 password,
                 save_as_file: bool = False):
    data_source = Enrico(policy_encrypting_key=policy_pubkey)

    data_source_public_key = bytes(data_source.stamp)

    kits = list()

    actual_data = {
        'username': username,
        'password': password,
    }
    plaintext = msgpack.dumps(actual_data, use_bin_type=True)

    message_kit, _signature = data_source.encrypt_message(plaintext)
    kit_bytes = message_kit.to_bytes()
    kits.append(kit_bytes)

    data = {
        'data_source': data_source_public_key,
        'kits': kits,
    }

    if save_as_file:
        with open(HEART_DATA_FILENAME, "wb") as file:
            msgpack.dump(data, file, use_bin_type=True)

    return data
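A hedged read-back sketch, not from the source: it assumes HEART_DATA_FILENAME and the msgpack import used above, and that the file was written with use_bin_type=True, so raw=False restores the string keys while the kits stay as bytes.
def load_password_data(filename=HEART_DATA_FILENAME):
    with open(filename, 'rb') as f:
        return msgpack.load(f, raw=False)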
Example #10
def crawler_fans():
    all_person_set = set()
    f_fans = open('fans.txt', 'w')
    for page_index in range(1, 345, 1):
        for class_index in [1, 2, 4, 5, 6]:
            start = time()
            try:
                url = 'https://123fans.cn/results.php?qi=%d&c=%d'\
                      % (page_index, class_index)
                req = urllib2.Request(url, None)
                response = urllib2.urlopen(req)
                html_doc = response.read()
                soup = BeautifulSoup(html_doc)
                children = [k for k in soup.children][1]
                lis = children.select('.odd')
                for k in lis:
                    try:
                        name = k.select('.name')[0].string
                        all_person_set.add(name)
                        f_fans.write(name + '\n')
                    except:
                        traceback.print_exc()
                        continue
                print page_index, class_index, \
                    len(all_person_set), time()-start
            except:
                print 'error', page_index, class_index
        sleep(5)
    all_person_set = list(all_person_set)
    msgpack.dump(all_person_set, open('fans.p', 'w'))
Example #11
def msg_pack(file_name, contents):
    try:
        with open(path + file_name, 'wb') as f:
            msgpack.dump(contents, f)
    except Exception as e:
        logging.info(e)
        return False
Example #12
def main():
    dialect_name, working_dir = sys.argv[1:]
    
    executor = sqlexecutor.executor(dialect_name, working_dir)
    
    print "Ready"
    sys.stdout.flush()
    try:
        for message in msgpack.Unpacker(sys.stdin, read_size=1):
            command = message[0]
            args = message[1:]
            
            if command == "execute":
                (creation_sql, query, ) = args
                result = executor.execute(creation_sql, query)
                if result.table is None:
                    column_names = None
                    rows = None
                else:
                    column_names = result.table.column_names
                    rows = result.table.rows
                
                
                msgpack.dump((result.error, column_names, rows), sys.stdout)
                sys.stdout.flush()
            else:
                return
            
    finally:
        executor.close()
Example #13
def prepare():
    args, _ = setup()
    train_data = load_data("train.txt")
    dev_data = load_data("test.txt")
    word_dict = build_dict(train_data[0] + dev_data[0])

    x, y, e1, e2, dist1, dist2, e1_pos, e2_pos = vectorize(
        train_data, word_dict)
    train = list(zip(x, e1, e2, dist1, dist2, e1_pos, e2_pos, y))

    e_x, e_y, e_e1, e_e2, e_dist1, e_dist2, e_e1_pos, e_e2_pos = vectorize(
        dev_data, word_dict)
    valid = list(
        zip(e_x, e_e1, e_e2, e_dist1, e_dist2, e_e1_pos, e_e2_pos, e_y))

    embed_file = 'embeddings.txt'
    vac_file = 'words.lst'
    embedding = load_embedding(embed_file, vac_file, word_dict)

    # save
    meta = {"embeddings": embedding.tolist()}

    result = {"train": train, "valid": valid}

    with open("meta.msgpack", "wb") as f:
        msgpack.dump(meta, f)

    with open("data.msgpack", "wb") as f:
        msgpack.dump(result, f)
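A hedged loading sketch for the two files written above, assuming the same msgpack import; raw=False restores the string keys.
with open("meta.msgpack", "rb") as f:
    meta = msgpack.load(f, raw=False)
embeddings = meta["embeddings"]

with open("data.msgpack", "rb") as f:
    data = msgpack.load(f, raw=False)
train, valid = data["train"], data["valid"]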
Example #14
    def dump(self, directory):
        """
        Utility function to dump/save model
        :param directory: directory to output to
        :return:
        """
        data = {"identity_to_values_small": self.identity_to_values_small,
                "n_users ": self.n_users,
                "n_identity_sent_values ": self.n_identity_sent_values,
                "nu": self.nu,
                "kappa": self.kappa,
                "beta": self.beta,
                "index_to_ids": self.index_to_ids,
                "ids_to_index": self.ids_to_index,
                "iteration": self.iteration,
                "train_perplexity": self.train_perplexity,
                "test_perplexity": self.test_perplexity
                }
        iter_str = str(self.iteration)
        msgpack.dump(data, open(os.path.join(directory, iter_str + "_sent_basic.mpack"), "wb"))

        np.save(os.path.join(directory, "sent_mu_0"), self.mu_0)
        np.save(os.path.join(directory, "sent_sigma_0"), self.sigma_0)
        np.save(os.path.join(directory, iter_str + "_sent_sigma"), self.sigma)
        np.save(os.path.join(directory, iter_str + "_sent_mu"), self.mu)
        np.save(os.path.join(directory, iter_str + "_sent_precision_matrix"), self.precision_matrix)
        np.save(os.path.join(directory, iter_str + "_sent_phi"), self.phi)
Example #15
def create_bond_dict(components_pdbx_file_path, msgpack_file_path):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)
    components = pdbx_file.get_block_names()
    bond_dict = {}
    for component in components:
        print(component)
        cif_bonds = pdbx_file.get_category("chem_comp_bond", block=component)
        if cif_bonds is None:
            # No bond info for this compound
            continue
        if isinstance(cif_bonds["comp_id"], str):
            # Single string -> single bond
            group_bonds = {
                (cif_bonds["atom_id_1"], cif_bonds["atom_id_2"]):
                BOND_ORDERS[cif_bonds["value_order"]]
            }
        else:
            # Looped values -> multiple bonds
            group_bonds = {(atom1, atom2): BOND_ORDERS[order]
                           for atom1, atom2, order in zip(
                               cif_bonds["atom_id_1"], cif_bonds["atom_id_2"],
                               cif_bonds["value_order"])}
        bond_dict[component] = group_bonds
    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(bond_dict, msgpack_file)
Example #16
def data_to_token_ids(source_data, id_data_path, vocab_to_id, cache=True):
    """Tokenize data file and turn into token-ids using given vocabulary file.

    This function loads data line-by-line from data_path, calls the above
    sentence_to_token_ids, and saves the result to target_path. See comment
    for sentence_to_token_ids on the details of token-ids format.

    Args:
      source_data:
      id_data_path:
      vocab_to_id:
      cache: Boolean;
    """
    if not gfile.Exists(id_data_path) or not cache:
        print("Creating id tokenized data %s" % id_data_path)

        id_data = []
        for source in source_data:
            id_source = [[START_LINE_ID]]
            for line in source:
                id_line = [vocab_to_id.get(word[1], UNK_ID) for word in line]
                id_source.append(id_line)
            id_source.append([END_LINE_ID])
            id_data.append(id_source)

        with gfile.GFile(id_data_path, mode="w") as id_data_file:
            pickle.dump(id_data, id_data_file)

    else:
        with gfile.GFile(id_data_path, mode="r") as token_file:
            id_data = pickle.load(token_file)

    return id_data
Example #17
    def save_to_file(self, filename):
        """Save only the bare minimum needed to reconstruct this CoverageDB.

        This serializes the data to a single file and can reduce the disk footprint of
        block coverage significantly (depending on overlap and number of files)."""
        if file_backing_disabled:
            raise Exception(
                "[!] Can't save/load coverage db files without msgpack. Try `pip install msgpack`"
            )
        save_dict = dict()
        save_dict["version"] = 1  # serialized covdb version
        save_dict["module_name"] = self.module_name
        save_dict["module_base"] = self.module_base
        save_dict["coverage_files"] = self.coverage_files
        # save tighter version of block dict {int: int} vice {int: str}
        block_dict_to_save = {}
        file_index_map = {
            filepath: self.coverage_files.index(filepath)
            for filepath in self.coverage_files
        }
        for block, trace_list in self.block_dict.items():
            trace_id_list = [file_index_map[name] for name in trace_list]
            block_dict_to_save[block] = trace_id_list
        save_dict["block_dict"] = block_dict_to_save
        # write packed version to file
        with open(filename, "wb") as f:
            msgpack.dump(save_dict, f)
            self.filename = filename
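A hedged counterpart sketch (a hypothetical helper, not part of the original class) that reverses the {block: [file_index]} compaction performed above; strict_map_key=False assumes msgpack >= 1.0, since the block addresses are integer keys.
import msgpack

def load_coverage_db(filename):
    with open(filename, "rb") as f:
        save_dict = msgpack.load(f, raw=False, strict_map_key=False)
    coverage_files = save_dict["coverage_files"]
    # expand the file indices back into file paths
    block_dict = {
        block: [coverage_files[i] for i in trace_ids]
        for block, trace_ids in save_dict["block_dict"].items()
    }
    return save_dict, block_dict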
Example #18
def main():
    lfw_folder = '/data/liubo/face/lfw_face'
    pair_file = '/data/liubo/face/lfw_pair.txt'
    same_dist_list = []
    no_same_dist_list = []
    for line in open(pair_file):
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            person = tmp[0]
            person_path = os.path.join(lfw_folder, person)
            pic_list = os.listdir(person_path)
            if len(pic_list) == 1:
                print 'error person :', person
                continue
            else:
                np.random.shuffle(pic_list)
                pic_path1 = os.path.join(person_path, pic_list[0])
                pic_path2 = os.path.join(person_path, pic_list[1])
                dist = cal_two_pic_distance(pic_path1, pic_path2)
                same_dist_list.append(dist)
        elif len(tmp) == 4:
            person1 = tmp[0]
            person1_path = os.path.join(lfw_folder, person1)
            pic1_list = os.listdir(person1_path)
            person2 = tmp[2]
            person2_path = os.path.join(lfw_folder, person2)
            pic2_list = os.listdir(person2_path)
            if len(pic1_list) > 0 and len(pic2_list) > 0:
                np.random.shuffle(pic1_list)
                np.random.shuffle(pic2_list)
                pic_path1 = os.path.join(person1_path, pic1_list[0])
                pic_path2 = os.path.join(person2_path, pic2_list[0])
                dist = cal_two_pic_distance(pic_path1, pic_path2)
                no_same_dist_list.append(dist)
    msgpack.dump((same_dist_list, no_same_dist_list), open('dist.p', 'wb'))
Example #19
def select_and_dump_wizard_findings(wizard_findings, target_file):
    cfg = config

    if len(wizard_findings) == 0:
        print_l("[!] No wizard findings!")
        sys.exit()

    index = cfg["function_number"]
    if index in range(0, len(wizard_findings)):
        pass
    else:
        print_l("Functions found:")
        for i, finding in enumerate(wizard_findings):
            if "source" in finding:
                print_l("{}) {func_name} from {source}:{start}-{end}".format(
                    i, **finding))
            else:
                print_l("{}) {func_name}".format(i, **finding))
            buffer = bytearray(finding["buffer"])
            hexdump(buffer)

        # Let the user select a finding, add it to the config
        index = select_from_range(len(wizard_findings),
                                  "Choose a function to fuzz> ")

    wizard_findings[index]["selected"] = True
    with open(target_file.replace("targets.msg", "all_targets.msg"),
              "wb") as msg_file:
        msgpack.dump(wizard_findings, msg_file)
    with open(target_file, "wb") as msg_file:
        msgpack.dump(list(filter(lambda k: k["selected"], wizard_findings)),
                     msg_file)

    return wizard_findings
Example #20
def create_dict(components_pdbx_file_path, msgpack_file_path,
                subcategory, expected_type):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)
    components = pdbx_file.get_block_names()
    data_dict = {}
    for i, component in enumerate(components):
        print(f"{((i+1) / len(components) * 100):4.1f} %", end="\r")
        try:
            cif_dict = pdbx_file.get_category("chem_comp", block=component)
        except ValueError:
            # The 'chem_comp' category may contain unparsable names
            # with wrong quote escaping
            # In this case the PDBx file parser raises an Exception
            cif_dict = None
        if cif_dict is None:
            # No or erroneous info for this compound
            data_dict[component] = None
        else:
            try:
                data = expected_type(cif_dict[subcategory])
            except ValueError:
                # Unparsable data, e.g. '?' as float
                data = None
            data_dict[component] = data
    print()
    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(data_dict, msgpack_file)
Example #21
    def save_msgpack(cls, filename, data):
        """Save data into MSGPACK file

        Parameters
        ----------
        filename : str
            Filename path

        data :
            Data to be stored

        Returns
        -------
        None

        """

        try:
            import msgpack

        except ImportError:
            message = '{name}: Unable to import msgpack module. You can install it with `pip install msgpack-python`.'.format(
                name=cls.__class__.__name__)

            cls.logger().exception(message)
            raise ImportError(message)

        msgpack.dump(data, open(filename, 'wb'), use_bin_type=True)
Example #22
def main(args):
    logging.basicConfig(level=logging.INFO)
    model = json.load(args.ifile)
    fn = fn_from_args(args)
    if fn:
        weight_layers = [
            layer for layer in model if layer['layerName'] in weight_first_list
        ]
        if fn.needs_two_step:
            for weights in [layer['parameters'][0] for layer in weight_layers]:
                fn.consume(weights)
            fn.done()
        for i, layer in enumerate(weight_layers):
            layer['parameters'][0] = transform(layer['parameters'][0], fn)
        if fn.needs_two_step:
            model = {'codebook': fn.serialize_codebook(), 'model': model}
    if args.ubjson_format:
        args.ofile.write(simpleubjson.encode(model))
    elif args.json:
        args.ofile.write(json.dumps(model).encode('utf8'))
    else:
        msgpack.dump(model,
                     args.ofile,
                     use_bin_type=True,
                     use_single_float=args.single_precision_float)
    args.ofile.close()
    args.ifile.close()
Example #23
def encrypt_patient_data(policy_pubkey, data_fields,
                                label: bytes = DEFAULT_LABEL,
                                save_as_file: bool = False):
    data_source = DataSource(policy_pubkey_enc=policy_pubkey,
                             label=label)

    data_source_public_key = bytes(data_source.stamp)
    ipfs_api = ipfsapi.connect()

    kits = list()
    with open("Merkle_json.json", "r") as read_file:
            data = json.load(read_file)
    share=data_fields
    share_data = {}
    for i in share:
        share_data[i] = {'Value' : data[0][i]['Value'], 'Hash' : data[0][i]['Hash']}

    plaintext = msgpack.dumps(share_data, use_bin_type=True)
    message_kit, _signature = data_source.encrypt_message(plaintext)

    kit_bytes = message_kit.to_bytes()
    kits.append(kit_bytes)

    data = {
        'data_source': data_source_public_key,
        'kits': kits,
    }

    if save_as_file:
        with open(PATIENT_DETAIL, "wb") as file:
            msgpack.dump(data, file, use_bin_type=True)
    res = ipfs_api.add(PATIENT_DETAIL)
    return res
Example #24
    def dump(data, filepath):
        '''
        Write data as type self.ext to filepath. json or msgpack
        '''
        if ' ' in filepath:
            raise raeting.KeepError("Invalid filepath '{0}' "
                                    "contains space".format(filepath))

        if hasattr(data, 'get'):
            for key, val in data.items():  # P3 json.dump no encoding parameter
                if isinstance(val, (bytes, bytearray)):
                    data[key] = val.decode('utf-8')

        root, ext = os.path.splitext(filepath)
        if ext == '.json':
            with ocfn(filepath, "w+") as f:
                json.dump(data, f, indent=2)
                f.flush()
                os.fsync(f.fileno())
        elif ext == '.msgpack':
            if not msgpack:
                raise raeting.KeepError("Invalid filepath ext '{0}' "
                            "needs msgpack installed".format(filepath))
            with ocfn(filepath, "w+b", binary=True) as f:
                msgpack.dump(data, f, encoding='utf-8')
                f.flush()
                os.fsync(f.fileno())
        else:
            raise raeting.KeepError("Invalid filepath ext '{0}' "
                        "not '.json' or '.msgpack'".format(filepath))
Example #25
    def dump(data, filepath):
        '''
        Write data as type self.ext to filepath. json or msgpack
        '''
        if ' ' in filepath:
            raise raeting.KeepError("Invalid filepath '{0}' "
                                    "contains space".format(filepath))

        with aiding.ocfn(filepath, "w+") as f:
            root, ext = os.path.splitext(filepath)
            if ext == '.json':
                json.dump(data, f, indent=2)
            elif ext == '.msgpack':
                if not msgpack:
                    raise raeting.KeepError(
                        "Invalid filepath ext '{0}' "
                        "needs msgpack installed".format(filepath))
                msgpack.dump(data, f)
            else:
                raise raeting.KeepError(
                    "Invalid filepath ext '{0}' "
                    "not '.json' or '.msgpack'".format(filepath))

            f.flush()
            os.fsync(f.fileno())
Example #26
def dump(data: object, path: str):
    # `path` may also be an already-open, writable binary stream
    if not isinstance(path, str):
        msgpack.dump(data, path)
        return
    with open(path, "wb") as f:
        msgpack.dump(data, f)
Example #27
def sort_out(filename,
             from_year,
             heading_size_for_statistics,
             top_k,
             min_n_events=2000,
             sheet_name="Sheet1"):
    # why is xlrd being deprecated in favor of openpyxl when the latter cannot even open certain .xlsx files??
    df = pd.read_excel(f"input/{filename}",
                       sheet_name=sheet_name,
                       engine="xlrd")
    for country, country_df in tqdm(df.groupby("COUNTRY")):
        current_entries = country_df["YEAR"] >= int(from_year)
        reference_date = dt.datetime(year=int(from_year), month=1, day=1)
        event_types, event_times, labels = parse_events(
            country_df[current_entries], int(heading_size_for_statistics),
            int(top_k), reference_date)
        prefix = filename.split('.')[0]
        n_types = len(labels)
        n_events = len(event_types)
        if n_events < min_n_events:
            continue
        tqdm.write(' '.join([
            colored(country, "red"),
            ';  '.join(', '.join(label) for label in labels)
        ]))
        name = f"{prefix}-{country}-since{from_year}-top{top_k}_{n_types}-{n_events//1000}k-events"
        with open(f"output/{name}.msgpack", "wb") as f:
            msgpack.dump([event_types, event_times], f)
        with open(f"output/{name}-labels.json", "w") as f:
            json.dump(labels, f)
Example #28
def load_check_result_url(dic_file, check_url_file):
    person_result_dic = {}   # {person: ([right_list], [wrong_list])}  # images judged definitely right and definitely wrong
    right_url_count = wrong_url_count = error_format_count = no_baike_count = no_meaning_count = 0
    if os.path.exists(dic_file):
        person_result_dic = msgpack.load(open(dic_file, 'rb'))
    for line in open(check_url_file):
        tmp = line.rstrip().split('\t')
        # [person_name, pic_index, pic_url, baike_name, baike_sim, newbaike_sim, guess_info]
        person_name = tmp[0]
        right_list, wrong_list = person_result_dic.get(person_name, ([], []))
        if len(tmp) == 7:
            if tmp[3] not in no_meaning_list:
                if tmp[3] == no_find_baike:
                    no_baike_count += 1
                    continue
                else:
                    if get_newbaike_sim(tmp[4]) > sim_threshold:
                        if tmp[0] == tmp[3]:
                            right_list.append(tmp[1])
                            right_url_count += 1
                        else:
                            wrong_url_count += 1
                            wrong_list.append(tmp[1])
                    else:   # below the similarity threshold the result is not trustworthy and needs manual labeling
                        no_baike_count += 1
                        continue
            else:
                no_meaning_count += 1
                continue
        else:
            error_format_count += 1
            continue
        person_result_dic[person_name] = (right_list, wrong_list)
    print right_url_count, wrong_url_count, no_baike_count, no_meaning_count, error_format_count
    msgpack.dump(person_result_dic, open('person_result_dic.p', 'w'))
Example #29
def crawler_baidu_person_list():
    # Collect person names from the Baidu popularity ranking
    all_person_list = set()
    for index in range(50):
        url = 'http://baike.baidu.com/operation/api/starflowerstarlist?' \
              'rankType=thisWeek&pg=%d' % index
        req = urllib2.Request(url, None)
        response = urllib2.urlopen(req)
        html_doc = response.read()
        content = json.loads(html_doc)
        this_page_list = content.get('data').get('thisWeek')
        for person_content in this_page_list:
            all_person_list.add(person_content.get('name'))
    print len(all_person_list)

    for index in range(50):
        url = 'http://baike.baidu.com/operation/api/starflowerstarlist?' \
              'rankType=lastWeek&pg=%d' % index
        req = urllib2.Request(url, None)
        response = urllib2.urlopen(req)
        html_doc = response.read()
        content = json.loads(html_doc)
        this_page_list = content.get('data').get('lastWeek')
        for person_content in this_page_list:
            all_person_list.add(person_content.get('name'))
    print len(all_person_list)
    all_person_set = list(all_person_list)
    msgpack.dump(all_person_set, open('baidu_fans.p', 'w'))
Example #30
def get_total_potential(out_file='potential.msg'):
    """Calculate and save the total potential. Requires a previously relaxed
    calculation.

    Parameters
    ----------
    out_file : str
        Name of the output file to save the results to.
    """
    atoms = get_relaxed_calculation()
    calc = atoms.get_calculator()

    # Collect the total potential and write to disk
    potential = calc.extract_total_potential()

    potential = list(potential)
    array_to_list(potential)

    # If outfile, write a MessagePack encoded version to disk
    if out_file:
        with open(out_file, 'w') as f:
            msgpack.dump(potential, f)

    # Return a BSON friendly version
    return json.dumps(potential, encoding='utf-8')
Example #31
def freqs_to_cBpack(input_file, output_file, cutoff=600):
    """
    Convert a frequency list into the idiosyncratic 'cBpack' format that
    will be loaded by wordfreq: a list in msgpack format of frequency
    tiers, each tier being one centibel (a factor of 10^(1/100))
    less frequent than the previous tier.
    """
    cBpack = []
    for line in input_file:
        word, strfreq = line.rstrip().split('\t', 1)
        if word == '__total__':
            raise ValueError("This is a count file, not a frequency file")
        freq = float(strfreq)
        neg_cB = -(round(math.log10(freq) * 100))
        if neg_cB >= cutoff:
            break
        while neg_cB >= len(cBpack):
            cBpack.append([])
        cBpack[neg_cB].append(word)

    for sublist in cBpack:
        sublist.sort()

    cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack

    msgpack.dump(cBpack_data, output_file)
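A hedged reader sketch for the 'cBpack' layout described in the docstring (the helper is hypothetical and assumes an uncompressed msgpack file opened by path): entry 0 is the header dict, and the tier at index i holds the words whose frequency is roughly 10 ** (-i / 100).
import msgpack

def read_cBpack(filename):
    """Rebuild an approximate word -> frequency mapping from a cBpack file."""
    with open(filename, 'rb') as f:
        data = msgpack.load(f, raw=False)
    header, tiers = data[0], data[1:]
    assert header.get('format') == 'cB'
    return {
        word: 10 ** (-neg_cB / 100)
        for neg_cB, tier in enumerate(tiers)
        for word in tier
    }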
Example #33
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
    """
    Convert a csv file of words and their frequencies to a file in the
    idiosyncratic 'cBpack' format.

    Only words with a frequency greater than `cutoff` centibels will be
    written to the new file.

    This cutoff should not be stacked with a cutoff in `read_freqs`; doing
    so would skew the resulting frequencies.
    """
    freqs = read_freqs(in_filename, cutoff=0, lang=None)
    cBpack = []
    for token, freq in freqs.items():
        cB = round(math.log10(freq) * 100)
        if cB <= cutoff:
            continue
        neg_cB = -cB
        while neg_cB >= len(cBpack):
            cBpack.append([])
        cBpack[neg_cB].append(token)

    for sublist in cBpack:
        sublist.sort()

    # Write a "header" consisting of a dictionary at the start of the file
    cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack

    with gzip.open(out_filename, 'wb') as outfile:
        msgpack.dump(cBpack_data, outfile)
Example #37
def dump(data, path):
    """
    Serialize data dict and write to file given by path where serialization is
    given by path's extension of either JSON, MsgPack, or CBOR for extension
    .json, .mgpk, or .cbor respectively
    """

    if ' ' in path:
        raise IOError(f"Invalid file path '{path}' contains space.")

    root, ext = os.path.splitext(path)
    if ext == '.json':
        with ocfn(path, "w+b") as f:
            json.dump(data, f, indent=2)
            f.flush()
            os.fsync(f.fileno())
    elif ext == '.mgpk':
        with ocfn(path, "w+b") as f:
            msgpack.dump(data, f)
            f.flush()
            os.fsync(f.fileno())
    elif ext == '.cbor':
        with ocfn(path, "w+b") as f:
            cbor.dump(data, f)
            f.flush()
            os.fsync(f.fileno())
    else:
        raise IOError(f"Invalid file path ext '{path}' "
                      f"not '.json', '.mgpk', or 'cbor'.")
Example #38
def encrypt_track_segments(policy_pubkey, dir_path):
    data_source = Enrico(policy_encrypting_key=policy_pubkey)
    data_source_public_key = bytes(data_source.stamp)
    print(dir_path)
    target_path = "/".join(dir_path.split('/')[:-1])
    target_path = os.path.join(target_path, 'segments_encrypted')

    if not os.path.exists(target_path):
        try:
            os.makedirs(target_path)
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    track_files = os.scandir(dir_path)

    for track_segment in track_files:
        with open(track_segment, "rb") as f:
            plaintext = f.read()

        ciphertext, signature = data_source.encrypt_message(plaintext)

        print("Signature", signature)
        data = {
            'track_segment_data': ciphertext.to_bytes(),
            'data_source': data_source_public_key
        }

        with open(os.path.join(target_path, track_segment.name), "wb") as f:
            msgpack.dump(data, f, use_bin_type=True)

    return True
Example #39
def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh

    if not force_wedge_thrsh:
        edge_cut_prob = bconfig.WEDGE_THRESHOLD / 4.
        wedge_thrsh = bconfig.WEDGE_THRESHOLD
    else:
        edge_cut_prob = force_wedge_thrsh / 4.
        wedge_thrsh = force_wedge_thrsh

    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load()

    global h5file
    h5filepath = bconfig.TORTOISE_FILES_PATH+'wedge_cache_'+str(PID())
    h5file = h5py.File(h5filepath)

    convert_cluster_set(cluster_set, matr)
    del matr # be sure that this is the last reference!

    do_wedge(cluster_set)

    report = []
    if bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES or report_cluster_status:
        msg = []
        for cl1 in cluster_set.clusters:
            for cl2 in cluster_set.clusters:
                if cl2 > cl1:
                    id1 = cluster_set.clusters.index(cl1)
                    id2 = cluster_set.clusters.index(cl2)
                    c12 = _compare_to(cl1,cl2)
                    c21 = _compare_to(cl2,cl1)
                    report.append((id1,id2,c12+c21))
                    msg.append( ' %s vs %s : %s + %s = %s -- %s' %  (id1, id2, c12, c21, c12+c21, cl1.hates(cl2)))
        msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(msg)
        if not bconfig.DEBUG_WEDGE_OUTPUT and bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES:
            print
            print msg
            print
        wedge_print(msg)


    restore_cluster_set(cluster_set)

    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    if report_cluster_status:
        destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (str(PID()),str(cluster_set.last_name),str(wedge_thrsh))
        f = filehandler.open(destfile, 'w')
        SER.dump([wedge_thrsh,cluster_set.last_name,report,cluster_set.num_all_bibs],f)
        f.close()
    gc.collect()

    h5file.close()
    os.remove(h5filepath)
Example #40
    def dump(cls, records, filepath=None):
        """Dump in the same fashion as load."""

        if filepath is None:
            filepath = os.path.join(config['current_snapshot'], 'repos.msgpack')

        with utils.FaultTolerantFile(filepath) as f:
            msgpack.dump(records, f, default=cls._dumper)
Example #41
    def __call__(self):
        if self.filename is None:
            self.filename = msgpackmemoized_basedir + '/' + self.f.__name__ + '.msgpack'
        if path.exists(self.filename):
            with open(self.filename, 'rb') as f:
                return msgpack.load(f)
        result = self.f()
        with open(self.filename, 'wb') as f:
            msgpack.dump(result, f)
        return result
Example #42
 def serializeToFile(self, fname, annotations):
     """
     Overwritten to write Msgpack files.
     """
     # TODO make all image filenames relative to the label file
     import msgpack
     f = open(fname, "w")
     msgpack.dump(annotations, f)
Example #43
 def dump(self, outfile):
     """Write a serialized version of the database to filehandle."""
     db_dict = {
         'meta_prints': self.meta_prints,
         'content_prints': self.content_prints,
         'series_id': self.series_id,
     }
     msgpack.dump(db_dict, outfile)
Example #44
    def dump_file(self, obj, fp):
        try:
            msgpack.dump(obj, fp)
            return True
        except Exception as ex:
            log.warn('Unable to dump object to file: %s', ex, exc_info=True)

        return False
Example #45
def match_one_file(midi_filename, embed_fn, hash_fn, msd_embeddings,
                   msd_sequences, msd_feature_paths, msd_ids, output_filename):
    """
    Match one MIDI file to the million song dataset by computing its CQT,
    pruning by matching its embedding, re-pruning by matching its downsampled
    hash sequence, and finally doing DTW on CQTs on the remaining entries.

    Parameters
    ----------
    midi_filename : str
        Path to a MIDI file to match to the MSD
    embed_fn : function
        Function which takes in a CQT and produces a fixed-length embedding
    hash_fn : function
        Function which takes in a CQT and produces a sequence of binary vectors
    msd_embeddings : np.ndarray
        (# MSD entries x embedding dimension) matrix of all embeddings for all
        entries from the MSD
    msd_sequences : list of np.ndarray
        List of binary vector sequences (represented as ints) for all MSD
        entries
    msd_feature_paths : list of str
        Path to feature files (containing CQT) for each MSD entry
    msd_ids : list of str
        MSD ID of each corresponding entry in the above lists
    output_filename : str
        Where to write the results file, which includes the DTW scores for all
        of the non-pruned MSD entries
    """
    # Try to compute a CQT for the MIDI file
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception as e:
        print 'Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc(e))
        return
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception as e:
        print "Error creating CQT for {}: {}".format(
            os.path.split(midi_filename)[1], traceback.format_exc(e))
        return
    # Skip this file if the MIDI gram is very long, to avoid memory issues
    if midi_gram.shape[0] > MAX_FRAMES:
        return
    # Compute the embedding of the CQT
    midi_embedding = embed_fn(midi_gram.reshape(1, 1, *midi_gram.shape))
    # Compute the hash sequence
    midi_hash_sequence = hash_fn(midi_gram.reshape(1, 1, *midi_gram.shape))
    # Convert to sequence of integers
    midi_hash_sequence = dhs.vectors_to_ints(midi_hash_sequence > 0)
    midi_hash_sequence = midi_hash_sequence.astype(np.uint32)
    matches = match_one_midi(
        midi_gram, midi_embedding, midi_hash_sequence, msd_embeddings,
        msd_sequences, msd_feature_paths, msd_ids)
    # Write out the result
    with open(output_filename, 'wb') as f:
        msgpack.dump(matches, f)
Example #46
def SavePermResults(path,name,method,*args):
    data = []
    for thing in enumerate(args):
        data.append(thing)
    if method=='msgpack':
        with open(os.path.join(path,name+'.mpac'),'wb') as f:
            msgpack.dump(data,f)
    else:
        with open(os.path.join(path,name+'.pickle'),'wb') as f:
            pickle.dump(data,f)
Example #47
def make_hanzi_converter(table_in, msgpack_out):
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with gzip.open(msgpack_out, 'wb') as outfile:
        # note: `raw` is an Unpacker option; the Packer writes str values as UTF-8 by default
        msgpack.dump(table, outfile)
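A hedged usage sketch with a hypothetical filename and helper: load the gzipped table written above and apply it with str.translate, which accepts exactly this kind of {code point: replacement} mapping; strict_map_key=False assumes msgpack >= 1.0, because the keys are integers.
import gzip
import msgpack

def simplify(text, table_path='hanzi_table.msgpack.gz'):
    with gzip.open(table_path, 'rb') as infile:
        table = msgpack.load(infile, raw=False, strict_map_key=False)
    return text.translate(table)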
Example #48
def make_hanzi_converter(table_in, msgpack_out):
    table = {}
    with open(table_in, encoding="utf-8") as infile:
        for line in infile:
            hexcode, char = line.rstrip("\n").split("\t")
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with gzip.open(msgpack_out, "wb") as outfile:
        msgpack.dump(table, outfile, encoding="utf-8")
Example #49
def write_http_response_to_temp_file(http_response):
    """
    Write an HTTPResponse instance to a temp file using msgpack

    :param http_response: The HTTP response
    :return: The name of the file
    """
    temp = get_temp_file('http')
    data = http_response.to_dict()
    msgpack.dump(data, temp, use_bin_type=True)
    temp.close()
    return temp.name
Example #50
def write_tags_to_temp_file(tag_list):
    """
    Write an Tag list to a temp file using msgpack

    :param tag_list: The Tag list
    :return: The name of the file
    """
    temp = get_temp_file('tags')
    data = [t.to_dict() for t in tag_list]
    msgpack.dump(data, temp, use_bin_type=True)
    temp.close()
    return temp.name
Example #51
def pickle_object(path, targetobject, json_pickle=False):
    """
    @type path: str or unicode
    @type targetobject: object
    @type json_pickle: bool
    """
    # msgpack has no pickle-style protocol argument
    with open(path, "wb") as f:
        msgpack.dump(targetobject, f)
    if json_pickle:
        json_object(path, targetobject)
Example #52
    def save(self, filename):
        tfn = '%s.inprog-%d' % (filename, random.randint(1, 10000000))
        fh = open(tfn, 'w')

        try:
            msgpack.dump(self.todict(), fh)
        finally:
            fh.close()

            if os.path.exists(filename):
                os.rename(filename, '%s.bak' % filename)
            os.rename(tfn, filename)
Example #53
def create_train_valid_data(folder='/data/liubo/face/research_feature_self'):
    # Train the face verification model from the data already collected
    person_list = os.listdir(folder)
    path_feature_dic = {}  #
    for person in person_list:
        person_path = os.path.join(folder, person)
        pic_feature_list = os.listdir(person_path)
        for pic_feature_path in pic_feature_list:
            pic_feature_path = os.path.join(person_path, pic_feature_path)
            pic_feature = msgpack_numpy.load(open(pic_feature_path, 'rb'))
            path_feature_dic[pic_feature_path] = pic_feature
    msgpack.dump(path_feature_dic, open('research_feature.p', 'wb'))
Example #54
    def save(self):
        """Save object into DB."""
        resp = self.response
        values = []
        values.append(resp.get_id())
        values.append(self.request.get_uri().url_string)
        values.append(resp.get_code())
        values.append(self.tag)
        values.append(int(self.mark))
        values.append(str(resp.info()))
        values.append(resp.get_wait_time())
        values.append(resp.get_msg())
        values.append(resp.content_type)
        ch = resp.charset
        values.append(ch)
        values.append(self.request.get_method())
        values.append(len(resp.body))
        code = int(resp.get_code()) / 100
        values.append(code)
        values.append(resp.get_alias())
        values.append(int(self.request.get_uri().has_query_string()))

        if not self.id:
            sql = ('INSERT INTO %s '
                   '(id, url, code, tag, mark, info, time, msg, content_type, '
                   'charset, method, response_size, codef, alias, has_qs) '
                   'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)' % self._DATA_TABLE)
            self._db.execute(sql, values)
            self.id = self.response.get_id()
        else:
            values.append(self.id)
            sql = ('UPDATE %s'
                   ' SET id = ?, url = ?, code = ?, tag = ?, mark = ?, info = ?, '
                   'time = ?, msg = ?, content_type = ?, charset = ?, '
                   'method = ?, response_size = ?, codef = ?, alias = ?, has_qs = ? '
                   ' WHERE id = ?' % self._DATA_TABLE)
            self._db.execute(sql, values)

        #
        # Save raw data to file
        #
        fname = self._get_fname_for_id(self.id)
        
        req_res = open(fname, 'wb')
        data = (self.request.to_dict(),
                self.response.to_dict(),
                self._MSGPACK_CANARY)
        msgpack.dump(data, req_res)
        req_res.close()
        
        return True
Example #55
File: cache.py Project: bryson/salt
 def _write(self):
     '''
     Write out to disk
     '''
     if not HAS_MSGPACK:
         return
     # TODO Add check into preflight to ensure dir exists
     # TODO Dir hashing?
     with salt.utils.fopen(self._path, 'w+') as fp_:
         cache = {
             "CacheDisk_data": self._dict,
             "CacheDisk_cachetime": self._key_cache_time
         }
         msgpack.dump(cache, fp_)
Example #56
def find_url_list():
    pic_face_index_dic = msgpack.load(open('pic_face_index_dic.p', 'rb'))
    url_folder = '/data/url'
    for person in pic_face_index_dic:
        print person
        url_list = open(os.path.join(url_folder, person+'.txt'),
                        'r').read().split('\n')
        need_check_url_index_list = pic_face_index_dic.get(person)
        for index in range(len(need_check_url_index_list)):
            tmp = url_list[int(need_check_url_index_list[index])].split('\t')
            need_check_url_index_list[index] = \
                (need_check_url_index_list[index], tmp[-1])
        pic_face_index_dic[person] = need_check_url_index_list
    msgpack.dump(pic_face_index_dic, open('pic_face_index_url_dic.p', 'wb'))
Example #57
def convert(infile, outfile):
    if not outfile:
        ext = infile.split('.')[-1]
        outfile = '%s%s' % (infile[:-len(ext)-1], EXT)

    print('%s > %s' % (infile, outfile))

    print('reading in JSON')
    with open(infile) as op:
        data = json.load(op)

    print('writing to msgpack')
    with open(outfile, 'wb') as op:
        msgpack.dump(data, op)
Example #58
def start(interval=3600, expire=604800):
    ck = salt.utils.minions.CkMinions(__opts__)
    presence_file = "{0}/minions/presence.p".format(__opts__["cachedir"])
    wheel = salt.wheel.WheelClient(__opts__)

    while True:
        log.debug("Checking for present minions")
        minions = {}
        if os.path.exists(presence_file):
            try:
                with salt.utils.fopen(presence_file, "r") as f:
                    minions = msgpack.load(f)
            except IOError as e:
                log.error("Could not open presence file {0}: {1}".format(presence_file, e))
                time.sleep(interval)
                continue

        minion_keys = _get_keys()
        now = time.time()
        present = ck.connected_ids()

        # For our existing keys, check which are present
        for m in minion_keys:
            # If we have a key that's not in the presence file, it may be a new minion
            # It could also mean this is the first time this engine is running and no
            # presence file was found
            if m not in minions:
                minions[m] = now
            elif m in present:
                minions[m] = now

        log.debug("Finished checking for present minions")
        # Delete old keys
        stale_keys = []
        for m, seen in minions.iteritems():
            if now - expire > seen:
                stale_keys.append(m)

        if len(stale_keys):
            for k in stale_keys:
                log.info("Removing stale key for {0}".format(k))
                # drop each stale minion from the tracking dict, not just the last one
                del minions[k]
            wheel.cmd("key.delete", stale_keys)

        try:
            with salt.utils.fopen(presence_file, "w") as f:
                msgpack.dump(minions, f)
        except IOError as e:
            log.error("Could not write to presence file {0}: {1}".format(presence_file, e))
        time.sleep(interval)
Example #59
def get_textunits(filename):
    out_fn = os.path.join(OUTPUT_DIRECTORY,os.path.basename(filename))
    print out_fn+".mpack"
    if os.path.exists(out_fn+".mpack"):
        return 'done'

    ##### GET THE DATA
    dep_parse = read_grouped_by_newline_file(filename)
    print filename
    to_write = []
    for i,x in enumerate(dep_parse):
        if i % 5000 == 0:
            print i

        spl = x[0].split("\t")
        uid = spl[11]
        tweet_id =  spl[10]
        date = x[0].split("\t")[-2]
        try:
            s = TextUnit(uid+ "\t" + tweet_id, date,
                     sent_to_id,identity_to_id,gram_list,
                     emoji_info=[emoji_data,emoji_regex],
                     emoticon_to_eval_dim=False,
                     dependency_parsed_conll=x,
                     sent_values=sent_values,
                     hashtag_epa_data=False,
                     vader_dict=False,
                     do_negation_on_full_sentence=True,
                     use_events=True,
                     use_behaviors=True,
                     use_isa=False,
                     use_clause_level= True,
                     use_parent_child=False,
                     use_own_full_sentence=False)
            if len(s.identities):
                to_write.append(s)
        except:
            print 'failed', i, filename

    #pickle.dump(to_write, open(out_fn,"wb"),-1)
    dat = [ [x.unit_id,x.date,x.identities,x.raw_text,
             x.identities_to_constraint_string_map,
             x.constraint_string_list,
             x.full_deflection_string]
           for x in to_write]
    msgpack.dump(dat, open(out_fn+".mpack","wb"))

    return 'done'