def generate_inverse_strategy_data(strategy_lists, ef_input_keys,
                                   ef_output_keys, techno_keys_waste,
                                   techno_keys_product,
                                   unit_scaling_techno_product,
                                   unit_scaling_techno_waste, sacrificial_lca,
                                   water_dir):
    initial_ratios_inverse = {}
    print("Calculate initial in/out ratios for inverse strategy activities")
    for act in pyprind.prog_bar(strategy_lists['inverse']):
        initial_ratios_inverse[act] = 1 / initial_in_over_out(
            act, ef_input_keys, ef_output_keys, techno_keys_waste,
            techno_keys_product, unit_scaling_techno_product,
            unit_scaling_techno_waste)

    print("getting row incides for inverse strategy")
    rows_of_interest_inverse = {}
    for act in pyprind.prog_bar(strategy_lists['inverse']):
        rows_of_interest_inverse[act] = identify_rows_of_interest_inverse(
            sacrificial_lca, act, ef_input_keys, ef_output_keys,
            techno_keys_waste, techno_keys_product)

    with open(os.path.join(water_dir, "initial_ratios_inverse.pickle"),
              "wb") as f:
        pickle.dump(initial_ratios_inverse, f)
    with open(os.path.join(water_dir, "rows_of_interest_inverse.pickle"),
              "wb") as f:
        pickle.dump(rows_of_interest_inverse, f)
Example 2
def generate_default_strategy_data(strategy_lists, transformation_from,
                                   transformation_to, sacrificial_lca,
                                   land_use_dir):
    if strategy_lists['default']:
        initial_ratios_default = {}
        print(
            "Calculate initial in/out ratios for default strategy activities")
        for act in pyprind.prog_bar(strategy_lists['default']):
            initial_ratios_default[act] = initial_in_over_out(
                act,
                transformation_from,
                transformation_to,
            )

        rows_of_interest_default = {}
        print("getting rows of interest for default strategy")
        for act in pyprind.prog_bar(strategy_lists['default']):
            rows_of_interest_default[act] = identify_rows_of_interest_default(
                sacrificial_lca, act, transformation_from, transformation_to)

        with open(os.path.join(land_use_dir, "initial_ratios_default.pickle"),
                  "wb") as f:
            pickle.dump(initial_ratios_default, f)
        with open(
                os.path.join(land_use_dir, "rows_of_interest_default.pickle"),
                "wb") as f:
            pickle.dump(rows_of_interest_default, f)
Example 3
def generate_inverse_strategy_data(strategy_lists, transformation_from,
                                   transformation_to, sacrificial_lca,
                                   land_use_dir):

    if strategy_lists['inverse']:
        initial_ratios_inverse = {}
        print(
            "Calculate initial in/out ratios for inverse strategy activities")
        for act in pyprind.prog_bar(strategy_lists['inverse']):
            initial_ratios_inverse[act] = 1 / initial_in_over_out(
                act,
                transformation_from,
                transformation_to,
            )

        print("getting keys for inverse strategy")
        rows_of_interest_inverse = {}
        for act in pyprind.prog_bar(strategy_lists['inverse']):
            rows_of_interest_inverse[act] = identify_rows_of_interest_inverse(
                sacrificial_lca,
                act,
                transformation_from,
                transformation_to,
            )

        with open(os.path.join(land_use_dir, "initial_ratios_inverse.pickle"),
                  "wb") as f:
            pickle.dump(initial_ratios_inverse, f)
        with open(
                os.path.join(land_use_dir, "rows_of_interest_inverse.pickle"),
                "wb") as f:
            pickle.dump(rows_of_interest_inverse, f)
def generate_default_strategy_data(strategy_lists, ef_input_keys,
                                   ef_output_keys, techno_keys_waste,
                                   techno_keys_product,
                                   unit_scaling_techno_product,
                                   unit_scaling_techno_waste, sacrificial_lca,
                                   water_dir):
    initial_ratios_default = {}
    print("Calculate initial in/out ratios for default strategy activities")
    for act in pyprind.prog_bar(strategy_lists['default']):
        initial_ratios_default[act] = initial_in_over_out(
            act, ef_input_keys, ef_output_keys, techno_keys_waste,
            techno_keys_product, unit_scaling_techno_product,
            unit_scaling_techno_waste)
    rows_of_interest_default = {}

    print("getting rows of interest for default strategy")
    for act in pyprind.prog_bar(strategy_lists['default']):
        rows_of_interest_default[act] = identify_rows_of_interest_default(
            sacrificial_lca, act, ef_input_keys, ef_output_keys,
            techno_keys_waste, techno_keys_product)

    with open(os.path.join(water_dir, "initial_ratios_default.pickle"),
              "wb") as f:
        pickle.dump(initial_ratios_default, f)
    with open(os.path.join(water_dir, "rows_of_interest_default.pickle"),
              "wb") as f:
        pickle.dump(rows_of_interest_default, f)
Example 5
def main(args):
    path = args.path

    filenames = os.listdir(path)
    filenames = [n for n in filenames if n.endswith(".edus.arcs")]
    filenames.sort()

    for filename in pyprind.prog_bar(filenames):
        edus_arcs = utils.read_lines(os.path.join(path, filename),
                                     process=lambda line: line.split())

        edus_deprels = []
        for arcs in edus_arcs:
            arcs = treetk.hyphens2arcs(arcs)
            deprels = [l for h, d, l in arcs]
            edus_deprels.append(deprels)

        # Write
        with open(
                os.path.join(path,
                             filename.replace(".edus.arcs", ".edus.deprels")),
                "w") as f:
            for deprels in edus_deprels:
                deprels = " ".join(deprels)
                f.write("%s\n" % deprels)
Example 6
    def read_examples_from_file(fields, format: str, path):
        make_example = {
            'json': Example.fromJSON,
            'dict': Example.fromdict,
            'tsv': Example.fromCSV,
            'csv': Example.fromCSV
        }[format.lower()]
        lines = 0
        with open(os.path.expanduser(path), encoding="utf8") as f:
            for line in f:
                lines += 1
        with open(os.path.expanduser(path), encoding="utf8") as f:
            if format == 'csv':
                reader = unicode_csv_reader(f)
            elif format == 'tsv':
                reader = unicode_csv_reader(f, delimiter='\t')
            else:
                reader = f

            next(reader)

            examples = [
                make_example(line, fields) for line in pyprind.prog_bar(
                    reader,
                    iterations=lines,
                    title='\nReading and processing data from "' + path + '"')
            ]
        return examples
Example 7
def media_jobs(cfg, dry_run, is_video):
    """Generate either all image or all video jobs for a given config."""
    if is_video:
        media_lc = 'video'
        media_uc = 'Video'
        src_media = src_videos
        media_targets = vid_targets
    else:
        media_lc = 'image'
        media_uc = 'Image'
        src_media = src_images
        media_targets = img_targets

    l.info('Generating {} jobs...'.format(media_lc))
    jobs = []
    skipped = 0

    si = src_media(cfg)
    if not si:
        l.debug('No source {}s'.format(media_lc))
        return

    for src in pyprind.prog_bar(si):
        j, s = media_targets(cfg, src, dry_run)
        jobs.extend(j)
        skipped += s

    l.info('{} jobs: running {}, skipped {}, total {}'.format(
        media_uc, len(jobs), skipped,
        len(jobs) + skipped))

    return jobs
Example 8
def find_optimal_gamma(horizon=15, n_traj=1000, map_name="5x5"):
    w_env = FrozenLakeEnv(map_name="9x9",
                          horizon=horizon,
                          theta_dist="hypercube")
    for gamma in candidate_gammas:
        test_pi_H = EpsOptimalMDPPolicy(w_env, discount=gamma)
        logger.log("-------------------")
        logger.log("Evaluating gamma={} for {} timesteps".format(
            gamma, horizon))
        logger.log("-------------------")
        test_env = HumanCRLWrapper(w_env, test_pi_H, 0)
        logger.log("Obtaining Samples...")
        # Alas, the rllab samplers don't support hot swapping envs and batch sizes
        # TODO: write a new parallel sampler, instead of sampling manually
        rewards = []
        regrets = []
        for i in pyprind.prog_bar(range(n_traj)):
            observation = test_env.reset()
            for t in range(horizon):
                action = test_env.nA - 1
                observation, reward, done, info = test_env.step(action)
                if done:
                    rewards.append(info["accumulated rewards"])
                    regrets.append(info["accumulated regret"])
                    break
        #feel free to add more data
        logger.log("NumTrajs {}".format(n_traj))
        logger.log("AverageReturn {}".format(np.mean(rewards)))
        logger.log("StdReturn {}".format(np.std(rewards)))
        logger.log("MaxReturn {}".format(np.max(rewards)))
        logger.log("MinReturn {}".format(np.min(rewards)))
        logger.log("AverageRegret {}".format(np.mean(regrets)))
        logger.log("MaxRegret {}".format(np.max(regrets)))
        logger.log("MinRegret {}".format(np.min(regrets)))
Example 9
def eval_mdp_policies(horizon=15, n_traj=100000, log_dir=None):
    text_output_file = None if log_dir is None else osp.join(log_dir, "text")
    w_env = FrozenLakeEnv(horizon=horizon)
    if text_output_file is not None:
        logger.add_text_output(text_output_file)
    for human_policy in human_mdp_policies.values():
        logger.log("-------------------")
        logger.log("Evaluating {} for {} timesteps".format(
            human_policy.__name__, horizon))
        logger.log("-------------------")

        test_pi_H = human_policy(w_env)
        test_env = HumanCRLWrapper(w_env, test_pi_H)
        logger.log("Obtaining Samples...")
        rewards = []
        for i in pyprind.prog_bar(range(n_traj)):
            observation = test_env.reset()
            for t in range(horizon):
                # _, action = observation
                # if action == test_env.nA:
                action = test_env.nA - 1
                observation, reward, done, info = test_env.step(action)
                if done:
                    rewards.append(info["accumulated rewards"])
                    break
        #feel free to add more data
        logger.log("NumTrajs {}".format(n_traj))
        logger.log("AverageReturn {}".format(np.mean(rewards)))
        logger.log("StdReturn {}".format(np.std(rewards)))
        logger.log("MaxReturn {}".format(np.max(rewards)))
        logger.log("MinReturn {}".format(np.min(rewards)))
Example 10
def crawl_songs(area_list, save_path):
    singer_id_done = []
    for root, dirs, files in os.walk(save_path):
        for file_name in files:
            singer_id = re.search("song_list_.*_(.*).json", file_name).group(1)
            singer_id_done.append(int(singer_id))

    area_2_singers = json.load(
        open("../Sources/qq_music_yield/area_2_singers.json",
             "r",
             encoding="utf-8"))

    for area in area_list:
        singer_list = area_2_singers[area]
        bar = pyprind.ProgBar(
            len(singer_list),
            title="process of crawling songs of singers of {}".format(area))
        for singer in pyprind.prog_bar(singer_list):
            singer_name = singer[settings.KEY_SINGER_NAME]
            singer_id = singer[settings.KEY_SINGER_ID]
            if singer_id in singer_id_done:
                continue
            song_list = crawl_song_list(singer)
            json.dump(
                song_list,
                open("%s/song_list_%s_%s.json" %
                     (save_path, singer_name, singer_id),
                     "w",
                     encoding="utf-8"))
            bar.update()
Example 11
    def train(self, sess=None):

        if sess is None:
            sess = tf.Session()

        sess.run(tf.global_variables_initializer())

        replay_buffer = SimpleReplayBuffer(env_spec=self._env.spec, max_replay_buffer_size=self._max_pool_size)

        path_length = 0
        episode_rewards = 0
        observation = self._env.reset()

        with sess.as_default():
            self._update_target()

            for ep in range(self._n_epochs):
                mean_loss = 0
                trained_iter = 0
                epoch_rewards = list()
                episode_lengths = list()
                with logger.prefix('Epoch #%d | ' % ep):
                    for ep_iter in pyprind.prog_bar(range(self._epoch_length)):
                        self._env.render()
                        action, _ = self._es.get_action(observation)
                        next_observation, reward, terminal, _ = self._env.step(action)

                        replay_buffer.add_sample(
                            observation=observation,
                            next_observation=next_observation,
                            action=action,
                            terminal=terminal,
                            reward=reward,
                        )

                        episode_rewards += reward
                        path_length += 1

                        observation = next_observation

                        if terminal or path_length >= self._max_path_length:
                            observation = self._env.reset()
                            epoch_rewards.append(episode_rewards)
                            episode_lengths.append(path_length)
                            path_length = 0
                            episode_rewards = 0

                        iter = ep * self._epoch_length + ep_iter
                        if replay_buffer.size > self._min_pool_size:
                            batch = replay_buffer.random_batch(self._batch_size)
                            loss = self._do_training(iter, batch)
                            mean_loss += loss
                            trained_iter += 1

                        if iter % self._target_update_period == 0 and replay_buffer.size > self._min_pool_size:
                            self._update_target()
                    logger.record_tabular('mean-td-error', (mean_loss/self._epoch_length))
                    logger.record_tabular('mean-episode-reward', np.mean(epoch_rewards))
                    logger.record_tabular('mean-epsiode-length', np.mean(episode_lengths))
                    logger.dump_tabular()
Example 12
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "segmented"))
    filenames = [n for n in filenames if n.endswith(".txt")]
    filenames.sort()

    utils.mkdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))

    for filename in pyprind.prog_bar(filenames):
        path_seg = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "segmented", filename)
        path_raw = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "raw", filename)
        path_dst = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "preprocessed",
                                filename.replace(".txt", ".edus"))
        # Input
        edus = utils.read_lines(path_seg, process=lambda line: line)
        edus = remove_empty_lines(filename, edus)
        raw_lines = utils.read_lines(path_raw, process=lambda line: line)
        raw_lines = remove_empty_lines(filename, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)
        # Processing
        edus = convert_edus(edus, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)
        # Output
        utils.write_lines(path_dst, edus)
Example 13
    def _identify_techno_keys(self):
        """Identify keys of activities with water production exchanges

         These should be considered in balancing. Keys are grouped by activities
         associated with input exchanges (e.g. wastewater treatment) and
         output exchanges (e.g. potable water)
         """
        names_file = Path(__file__).parents[0]/'data'/'water_intermediary_exchange_names.json'
        if not names_file.is_file():
            raise FileNotFoundError("Could not find file water_intermediary_exchange_names.json in expected location")
        with open(names_file, "rb") as f:
            techno_product_names_dict = json.load(f)
        techno_product_names = techno_product_names_dict[self.ecoinvent_version]
        techno_treat_keys = []
        techno_transfo_keys = []
        db_loaded = Database(self.database_name).load()
        for act_key, act in pyprind.prog_bar(db_loaded.items()):
            if act['reference product'] in techno_product_names:
                if act['production amount']<0:
                    techno_treat_keys.append(act_key)
                elif act['production amount']>0:
                    techno_transfo_keys.append(act_key)
                else:
                    warnings.warn("Activity {} has a product exchange {} with "
                                  "an amount of 0: skipped".format(
                        act_key,
                        act['reference product']
                    ))
        return techno_transfo_keys, techno_treat_keys
Example 14
def subject_verify(new_arxiv):
    if new_arxiv.count > 0:
        subject_list = copy.copy(new_arxiv.subject)
        remove_list = []
        new_ver = arxiv(new_arxiv.author)
        new_ver.parse()
        for count in pyprind.prog_bar(range(len(new_ver.title))):
            if len(set(subject_list) & set(new_ver.category[count])) == 0:
                remove_list.append(count)
        new_ver.arxiv_id = (np.delete(np.array(new_ver.arxiv_id),
                                      remove_list,
                                      axis=0)).tolist()
        new_ver.time = (np.delete(np.array(new_ver.time), remove_list,
                                  axis=0)).tolist()
        new_ver.title = (np.delete(np.array(new_ver.title),
                                   remove_list,
                                   axis=0)).tolist()
        new_ver.category = (np.delete(np.array(new_ver.category),
                                      remove_list,
                                      axis=0)).tolist()
        new_ver.pdf = (np.delete(np.array(new_ver.pdf), remove_list,
                                 axis=0)).tolist()
        new_ver.contributor = (np.delete(np.array(new_ver.contributor),
                                         remove_list,
                                         axis=0)).tolist()
        new_ver.count = len(new_ver.title)
        new_ver.subject = combine_subject(new_ver.category)
        print('Remove %d articles' % len(remove_list))
        return new_ver
    else:
        return new_arxiv
def main():
    dataset_path = "/path/to/Caltech-101"
    modelzoo_path = "/path/to/VGG16"
    
    # create an instance
    convnet = FeatureExtractor(
            prototxt_path=os.path.join(modelzoo_path, "vgg16_deploy.prototxt"),
            caffemodel_path=os.path.join(modelzoo_path, "vgg16.caffemodel"),
            target_layer_name="fc7",
            image_size=224,
            mean_values=[103.939, 116.779, 123.68])
    
    # header
    f = open("caltech101_vggnet_fc7_features.csv", "w")
    header = ["filepath"]
    for i in xrange(4096):
        header.append("feat%d" % (i+1))
    header = ",".join(header) + "\n"
    f.write(header)
    
    # extract features
    categories = os.listdir(dataset_path)
    for category in pyprind.prog_bar(categories):
        file_names = os.listdir(os.path.join(dataset_path, category))
        for file_name in file_names:
            img = cv2.imread(os.path.join(dataset_path, category, file_name))
            feat = convnet.transform(img)
            feat_str = [os.path.join(category, file_name)]
            for value in feat:
                feat_str.append(str(value))
            row = ",".join(feat_str)
            f.write("%s\n" % row)
            f.flush()

    f.close()
Example 16
def split_by_id(beatdf, id_field='ptid', frac_train=.6, frac_val=.15):
    """ Deterministically splits the beatdf by _patient_ """
    empis = np.sort(beatdf[id_field].unique())
    print("Splitting %d unique patients" % len(empis))

    # deterministic split
    rs = np.random.RandomState(0)
    perm_idx = rs.permutation(len(empis))
    num_train = int(frac_train * len(empis))
    num_val = int(frac_val * len(empis))
    train_idx = perm_idx[:num_train]
    val_idx = perm_idx[num_train:(num_train + num_val)]
    test_idx = perm_idx[(num_train + num_val):]
    empis_train = empis[train_idx]
    empis_val = empis[val_idx]
    empis_test = empis[test_idx]
    print(" ... patient splits: %d train, %d val, %d test " %
          (len(empis_train), len(empis_val), len(empis_test)))

    # make dictionaries
    train_dict = {e: "train" for e in empis_train}
    val_dict = {e: "val" for e in empis_val}
    test_dict = {e: "test" for e in empis_test}
    split_dict = {**train_dict, **val_dict, **test_dict}

    # add train/val test split to each
    split = []
    for e in pyprind.prog_bar(beatdf[id_field]):
        split.append(split_dict[e])

    beatdf['split'] = split
    return beatdf
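A minimal, hypothetical call of the split above (the toy DataFrame, column values and patient ids are illustrative only):

import numpy as np
import pandas as pd

# toy beat-level table: several rows per patient id
beatdf = pd.DataFrame({
    "ptid": ["a", "a", "b", "b", "c", "d", "e", "e"],
    "beat": np.arange(8),
})
beatdf = split_by_id(beatdf, id_field="ptid", frac_train=.6, frac_val=.15)
print(beatdf["split"].value_counts())  # every row of a patient shares one split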
Example 17
def media_jobs(cfg, dry_run, is_video):
    if is_video:
        media_lc = 'video'
        media_uc = 'Video'
        src_media = src_videos
        media_targets = vid_targets
    else:
        media_lc = 'image'
        media_uc = 'Image'
        src_media = src_images
        media_targets = img_targets

    l.info('Generating {} jobs...'.format(media_lc))
    jobs = []
    skipped = 0

    si = src_media(cfg)
    if not si:
        l.debug('No source {}s'.format(media_lc))
        return

    for src in pyprind.prog_bar(si):
        j, s = media_targets(cfg, src, dry_run)
        jobs.extend(j)
        skipped += s

    l.info('{} jobs: running {}, skipped {}, total {}'
           .format(media_uc, len(jobs), skipped, len(jobs) + skipped))

    return jobs
def Train_Eval_Process_Layer_v2(train_X,train_Y,test_X,test_Y):
    # LSTM
    epoch_num = 10
    #model = LSTM_model(input_dim=8,hidden_dim=8)
    model = One_Sent2Other_Sent()
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCELoss()
    for epoch_  in pyprind.prog_bar(range(epoch_num)):
        model.train()
        for i in range(len(train_X)):
            X = torch.tensor(train_X[i])#.cuda()
            pred_train_Y = model(X)
            Y = torch.tensor([train_Y[i]])#.cuda()
            true_train_Y = Y.squeeze(dim=-1)
            loss = criterion(pred_train_Y, true_train_Y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print('loss:',loss)
        model.eval()
        pred_test_Y = list()
        for i in range(len(test_X)):
            X = torch.tensor(test_X[i])#.cuda()
            pred_test_Y_i = model(X).cpu().data.numpy().reshape(1,1)
            pred_test_Y.append(pred_test_Y_i)
        test_Y_hat = np.concatenate(pred_test_Y,0)
        test_Y_hat_list = list()
        for i in range(test_Y_hat.shape[0]):
            if test_Y_hat[i,0] >= 0.5:
                test_Y_hat_list.append(1)
            else:
                test_Y_hat_list.append(0)
        Evaluation(test_Y_hat_list,test_Y)
Example 19
    def write_db_to_brightway(self):
        for s in pyprind.prog_bar(self.scenarios.items()):
            scenario, year = s

            print('Write new database to Brightway2.')
            wurst.write_brightway2_database(
                self.db, "ecoinvent_" + scenario + "_" + str(year))
Example 20
def extract_ecospold2_directory(dirpath, use_mp=True):
    """Extract all the ``.spold`` files in the directory ``dirpath``.

    Use a multiprocessing pool if ``use_mp``, which is the default."""
    if os.name == 'nt':
        use_mp = False

    assert os.path.isdir(dirpath), "Can't find directory {}".format(dirpath)
    filelist = [os.path.join(dirpath, filename)
                for filename in os.listdir(dirpath)
                if filename.lower().endswith(".spold")
                ]

    print("Extracting {} undefined datasets".format(len(filelist)))

    if use_mp:
        start = time()
        # With code from
        # http://jtushman.github.io/blog/2014/01/14/python-%7C-multiprocessing-and-interrupts/
        with multiprocessing.Pool(
                processes=multiprocessing.cpu_count(),
                initializer=lambda : signal.signal(signal.SIGINT, signal.SIG_IGN)
            ) as pool:
            try:
                data = pool.map(generic_extractor, filelist)
            except KeyboardInterrupt:
                pool.terminate()
                raise KeyboardInterrupt
        print("Extracted {} undefined datasets in {:.1f} seconds".format(len(data), time() - start))
    else:
        data = [generic_extractor(fp)
                for fp in pyprind.prog_bar(filelist)]

    # Unroll lists of lists
    return [y for x in data for y in x]
Example 21
def validate_directory_against_xsd(dirpath, schema):
    """Extract all the ``.spold`` files in the directory ``dirpath``.

    Use a multiprocessing pool if ``use_mp``, which is the default."""
    assert os.path.isdir(dirpath), "Can't find data directory {}".format(
        dirpath)
    assert os.path.isfile(schema), "Can't find schema file {}".format(schema)

    filelist = [
        os.path.join(dirpath, filename) for filename in os.listdir(dirpath)
        if filename.lower().endswith(".spold")
    ]

    print(("Validating {} undefined datasets".format(len(filelist))))

    errors = []
    ecospold2_schema = etree.XMLSchema(etree.parse(open(schema)))

    for fp in pyprind.prog_bar(filelist):
        file = etree.parse(open(fp))
        if not ecospold2_schema.validate(file):
            errors.append(os.path.basename(fp))

    if errors:
        print("The following files did not validate:")
        pprint.pprint(errors)
    else:
        print("All files valid")
Example 22
def main():
    config = utils.Config()

    path_out = os.path.join(config.getpath("data"), "aarc_abst")
    utils.mkdir(path_out)

    filenames = os.listdir(config.getpath("aarc"))
    filenames = [n for n in filenames if n.endswith(".txt.utf8")]
    filenames.sort()

    nlp = spacy.load("en_core_web_sm",
                     disable=["tagger", "parser", "ner", "textcat"])

    cnt = 0
    for filename in pyprind.prog_bar(filenames):
        text = extract_abstract(os.path.join(config.getpath("aarc"), filename))
        if text == "":
            # print("No Abstract!: %s" % filename)
            continue
        with open(
                os.path.join(path_out,
                             filename.replace(".txt.utf8", ".doc.tokens")),
                "w") as f:
            doc = nlp(text)
            tokens = [token.text for token in doc]
            assert len(tokens) > 0
            tokens = " ".join(tokens)
            f.write("%s\n" % tokens)
        cnt += 1

    print("Processed %d/%d files" % (cnt, len(filenames)))
Example 23
    def bulk_upload(self):
        items_to_upload = []
        append = items_to_upload.append

        credentials = get_db_credentials(self.settings)
        if 'sqlite3' in credentials['ENGINE']:
            db = dataset.connect("sqlite:///" + os.path.basename(credentials['NAME']))
        if 'postgresql' in credentials['ENGINE']:
            db = dataset.connect('postgresql://' +
                                 credentials['USER'] + ':' +
                                 credentials['PASSWORD'] + '@' +
                                 credentials['HOST'] + ':' +
                                 credentials['PORT'] + '/' +
                                 credentials['NAME'])
        table = db['visitors_visitor']

        print("Starting checks to see if we have this item in our database.")
        if len(self.items) == 0:
            print("Nothing to upload")
        else:
            for i in pyprind.prog_bar(range(len(self.items))):
                item = self.items[i]
                try:
                    item['date'] = datetime.datetime.strptime(
                        item['date'],
                        '%Y-%m-%d',
                        )
                except ValueError:
                    item['date'] = None

                append(item)

            print("uploading %i records for table %s" % (len(items_to_upload), self.mytable))

            table.insert_many(items_to_upload)
def Format_csv2XY(path):
    X, Y, title, self_contradictory_template, revision_id_list = \
        list(), list(), list(), list(), list()
    df = pd.read_csv(path)
    page_title = list(df['page_title'])
    revision_text = list(df['revision_text'])
    revision_id = list(df['revision_id'])
    for i in pyprind.prog_bar(range(len(revision_text))):
        self_contradictory_template_i = list()
        text = revision_text[i]
        title_i = page_title[i]
        revision_id_i = revision_id[i]
        if isinstance(text, str) is True and len(text.split()) != 0:
            wikicode = mwparserfromhell.parse(text)
            templates = wikicode.filter_templates()
            is_pos = False
            for j in range(len(templates)):
                if 'Self-contradictory' in templates[j]:
                    is_pos = True
                    self_contradictory_template_i.append(templates[j])
            if is_pos:
                X.append(str(text))
                title.append(title_i)
                Y.append(1)
            else:
                X.append(str(text))
                title.append(title_i)
                Y.append(0)
            self_contradictory_template.append(self_contradictory_template_i)
            revision_id_list.append(revision_id_i)
    return X, Y, title, self_contradictory_template, revision_id_list
Example 25
def Train_Eval_Process_Layer(train_X, train_Y, test_X, test_Y):
    # RetaGNN + Self Attention
    import pyprind
    import pickle
    epoch_num = 10
    input_dim = 8
    hidden_dim = 8
    model = double_LSTM_model().cuda()
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCELoss()
    for epoch_ in range(epoch_num):
        model.train()
        for i in pyprind.prog_bar(range(len(train_X))):
            batch_X, batch_Y = train_X[i], train_Y[i]  #(b,l,d) ,(b,)
            batch_Y_hat = model(batch_X).squeeze(dim=-1)
            loss = criterion(batch_Y_hat, batch_Y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print('loss:',loss)
        model.eval()
        pred_Y = list()
        for i in range(len(test_X)):
            pred_Y.append(model(test_X[i]).view(1, -1))
        test_Y_hat = torch.cat(pred_Y, 0).cpu().data.numpy()
        test_Y_hat_list = list()
        for i in range(test_Y_hat.shape[0]):
            if test_Y_hat[i, 0] >= 0.5:
                test_Y_hat_list.append(1)
            else:
                test_Y_hat_list.append(0)
        Evaluation(test_Y_hat_list, test_Y)
Example 26
def extract_ecospold2_directory(dirpath, use_mp=True):
    """Extract all the ``.spold`` files in the directory ``dirpath``.

    Use a multiprocessing pool if ``use_mp``, which is the default."""
    if os.name == 'nt':
        use_mp = False

    assert os.path.isdir(dirpath), "Can't find directory {}".format(dirpath)
    filelist = [
        os.path.join(dirpath, filename) for filename in os.listdir(dirpath)
        if filename.lower().endswith(".spold")
    ]

    print(("Extracting {} undefined datasets".format(len(filelist))))

    if use_mp:
        start = time()
        # With code from
        # http://jtushman.github.io/blog/2014/01/14/python-%7C-multiprocessing-and-interrupts/
        with multiprocessing.Pool(processes=multiprocessing.cpu_count(),
                                  initializer=lambda: signal.signal(
                                      signal.SIGINT, signal.SIG_IGN)) as pool:
            try:
                data = pool.map(generic_extractor, filelist)
            except KeyboardInterrupt:
                pool.terminate()
                raise KeyboardInterrupt
        print(("Extracted {} undefined datasets in {:.1f} seconds".format(
            len(data),
            time() - start)))
    else:
        data = [generic_extractor(fp) for fp in pyprind.prog_bar(filelist)]

    # Unroll lists of lists
    return [y for x in data for y in x]
Example 27
    def getJob(self):
        job = []
        for i in range(1, 1000):
            if requests.get(
                    'https://www.yourator.co/api/v2/jobs?page={}'.format(
                        i)).json()['jobs'] == []:
                break
            job += requests.get(
                'https://www.yourator.co/api/v2/jobs?page={}'.format(
                    i)).json()['jobs']

        for i in pyprind.prog_bar(job):
            res = requests.get('https://www.yourator.co/' + i['path']).text
            soup = BeautifulSoup(res, "html.parser")
            i['inside'] = {}
            i['inside']['description'] = soup.select(
                '.description')[0].text.strip() if len(
                    soup.select('.description')) else ''
            for j in soup.select('.basic-info'):
                key, value = j.text.strip().replace(' ',
                                                    '').replace('\n',
                                                                '').split(':')
                i['inside'][key] = value

            if i['has_salary_info']:
                for j in soup.select('h2'):
                    if j.text == '薪資範圍':  # '薪資範圍' means 'salary range'
                        i['salary'] = j.findNext('article').text
        with open('job.json', 'w') as f:
            json.dump(self.testData(job), f)
def calc_features(net, n_images, blobs):
    n_images = int(0.6 * n_images)
    batchsize = net.blobs['data'].data.shape[0]
    feats = dict()
    for blob in blobs:
        out_shape = list(net.blobs[blob].data.shape)
        out_shape[0] = n_images
        print('Will allocate {:.2f} GiB of memory'.format(
            np.prod(out_shape) * 2 / 1024 / 1024 / 1024))
        feats[blob] = np.zeros(
            tuple(out_shape),
            dtype=np.float16 if not blob == 'label' else np.int32)
    print('Need %.3f GiB' %
          (np.sum([x.nbytes for x in feats.values()]) / 1024 / 1024 / 1024))

    for it in pyprind.prog_bar(range(0, n_images, batchsize),
                               update_interval=10,
                               stream=sys.stderr):
        net.forward()
        for blob in blobs:
            feats[blob][it:it + batchsize,
                        ...] = net.blobs[blob].data[:feats[blob][it:it +
                                                                 batchsize,
                                                                 ...].shape[0],
                                                    ...]

    return [feats[blob] for blob in blobs]
Example 29
    def __new__(cls, iterable=None, desc=None, total=None, leave=True,
                backend=None, verbose=True):
        if backend is None:
            backend = Progressbar.backend

        if not verbose:
            backend = "hide"

        if backend == "tqdm":
            from tqdm import tqdm
            return tqdm(iterable=iterable, desc=desc, total=total, leave=leave,
                        ascii=True, ncols=80, file=sys.stdout,
                        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed"
                                   "}<{remaining}{postfix}]") # remove rate_fmt
        elif backend == "tqdm_notebook":
            from tqdm import tqdm_notebook
            return tqdm_notebook(iterable=iterable, desc=desc, total=total,
                                 leave=leave)
        elif backend == "pyprind":
            from pyprind import ProgBar, prog_bar
            ProgBar._adjust_width = lambda self: None  # keep constant width
            if iterable is None:
                return ProgBar(total, title=desc, stream=1)
            else:
                return prog_bar(iterable, title=desc, stream=1,
                                iterations=total)
        elif backend == "hide":
            return NoProgressbar(iterable=iterable)
        else:
            raise NotImplementedError("unknown backend")
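A short usage sketch for the wrapper above, assuming the surrounding Progressbar class (with its class-level backend default) and its NoProgressbar fallback are importable; the work inside the loop is a placeholder:

import time

Progressbar.backend = "pyprind"  # or "tqdm", "tqdm_notebook", "hide"
for item in Progressbar(range(50), desc="processing", total=50):
    time.sleep(0.01)  # placeholder for real work

Passing verbose=False routes the call to the "hide" backend and silences the bar entirely.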
Example 30
def evaluate(model, model_name, sents, ivocab):
    train = False
    loss = 0.0
    acc = 0.0
    count = 0
    vocab_size = model.vocab_size
    for data_i in pyprind.prog_bar(xrange(len(sents))):
        words = sents[data_i:data_i + 1]

        if model_name == "bd_lstm":
            xs, ms = utils.make_batch(words,
                                      train=train,
                                      tail=False,
                                      mask=True)
            ys = model.forward(xs=xs, ms=ms, train=train)
        else:
            xs = utils.make_batch(words, train=train, tail=False)
            ys = model.forward(ts=xs, train=train)

        ys = F.concat(ys, axis=0)
        ts = F.concat(xs, axis=0)
        ys = F.reshape(ys, (-1, vocab_size))
        ts = F.reshape(ts, (-1, ))

        loss += F.softmax_cross_entropy(ys, ts) * len(words[0])
        acc += F.accuracy(ys, ts, ignore_label=-1) * len(words[0])
        count += len(words[0])

    loss_data = float(cuda.to_cpu(loss.data)) / count
    acc_data = float(cuda.to_cpu(acc.data)) / count

    return loss_data, acc_data
    def handle(self, *args, **options):
        if options['tsvfile'] is None or options['sheet'] is None:
            error_msg = 'Enter name of tsv file and sheet number as argument.' \
                        ' "python manage.py import_hojas_de_vida --tsvfile=hoja0.tsv --sheet=0 --settings=ventanita.settings.local'
            raise CommandError(error_msg)

        tsv_file = options['tsvfile']
        sheet = options['sheet']
        self.sheet = sheet

        with codecs.open(tsv_file, "r") as file_handle:
            dump = file_handle.readlines()

        if sheet == '0':
            items = []
            for line in pyprind.prog_bar(dump):
                item = self.parse_line(line)
                if item is not None:
                    items.append(Candidato(**item))
            Candidato.objects.bulk_create(items)
        elif sheet == '1':
            self.import_institucion_educativa(dump)
            self.import_education_for_candidate(dump)
        elif sheet == '2':
            self.import_institucion_educativa_superior(dump)
            self.import_education_for_candidate(dump)
Example 32
def validate_directory_against_xsd(dirpath, schema):
    """Extract all the ``.spold`` files in the directory ``dirpath``.

    Use a multiprocessing pool if ``use_mp``, which is the default."""
    assert os.path.isdir(dirpath), "Can't find data directory {}".format(dirpath)
    assert os.path.isfile(schema), "Can't find schema file {}".format(schema)

    filelist = [os.path.join(dirpath, filename)
                for filename in os.listdir(dirpath)
                if filename.lower().endswith(".spold")
                ]

    print("Validating {} undefined datasets".format(len(filelist)))

    errors = []
    ecospold2_schema = etree.XMLSchema(etree.parse(open(schema)))

    for fp in pyprind.prog_bar(filelist):
        file = etree.parse(open(fp))
        if not ecospold2_schema.validate(file):
            errors.append(os.path.basename(fp))

    if errors:
        print("The following files did not validate:")
        pprint.pprint(errors)
    else:
        print("All files valid")
def count_sentence_length(corpus, count):
    for s in pyprind.prog_bar(corpus):
        length = len(s)
        if length >= len(count):
            continue
        count[length] += 1
    return count
Example 34
    def track_progress(self, noisy_grad, filtered_grad):

        # if function passed in --- save values
        if self.fun is not None:
            self.fun_vals.append(self.fun(self.params, self.t))

        # report on gradient
        if self.callback is not None:
            self.callback(self.params, self.t, noisy_grad)

        # update object attributes
        if self.save_params:
            self.param_trace.append(self.params.copy())

        if self.save_grads:
            self.grad_trace.append(noisy_grad)

        if self.save_filtered_grads:
            self.filtered_grad_trace.append(filtered_grad)

        if self.true_grad_fun is not None:
            true_grad = self.true_grad_fun(self.params, self.t)
            self.true_grad_trace.append(true_grad)

        if (self.num_marginal_samples_to_save > 0) and \
           (self.t % self.marginal_sample_skip == 0):
            nms = self.num_marginal_samples_to_save
            print "  ... saving %d marginal samples (iter %d)" % (nms, self.t)
            msamps = np.array([
                self.grad_fun(self.params, self.t)
                for _ in pyprind.prog_bar(xrange(nms))
            ])
            self.marginal_samples[self.t] = msamps
Example 36
def parse(model, decoder, dataset, path_pred):
    """
    :type model: SpanBasedModel
    :type decoder: IncrementalCKYDecoder
    :type dataset: numpy.ndarray
    :type path_pred: str
    :rtype: None
    """
    with open(path_pred, "w") as f:

        for data in pyprind.prog_bar(dataset):
            edu_ids = data.edu_ids
            edus = data.edus
            edus_postag = data.edus_postag
            edus_head = data.edus_head
            sbnds = data.sbnds
            pbnds = data.pbnds

            # Feature extraction
            edu_vectors = model.forward_edus(edus, edus_postag,
                                             edus_head)  # (n_edus, bilstm_dim)
            padded_edu_vectors = model.pad_edu_vectors(
                edu_vectors)  # (n_edus+2, bilstm_dim)
            mask_bwd, mask_fwd = model.make_masks(
            )  # (1, bilstm_dim), (1, bilstm_dim)

            # Parsing (bracketing)
            span_scores = precompute_all_span_scores(
                model=model,
                edus=edus,
                edus_postag=edus_postag,
                sbnds=sbnds,
                pbnds=pbnds,
                padded_edu_vectors=padded_edu_vectors,
                mask_bwd=mask_bwd,
                mask_fwd=mask_fwd)
            unlabeled_sexp = decoder.decode(span_scores=span_scores,
                                            inputs=edu_ids,
                                            sbnds=sbnds,
                                            pbnds=pbnds,
                                            use_sbnds=True,
                                            use_pbnds=True)  # list of str
            unlabeled_tree = treetk.sexp2tree(unlabeled_sexp,
                                              with_nonterminal_labels=False,
                                              with_terminal_labels=False)
            unlabeled_tree.calc_spans()
            unlabeled_spans = treetk.aggregate_spans(
                unlabeled_tree, include_terminal=False,
                order="pre-order")  # list of (int, int)

            # Parsing (assigning majority labels to the unlabeled tree)
            span2label = {(b, e): "<ELABORATION,N/S>"
                          for (b, e) in unlabeled_spans}
            labeled_tree = treetk.assign_labels(unlabeled_tree,
                                                span2label,
                                                with_terminal_labels=False)
            labeled_sexp = treetk.tree2sexp(labeled_tree)

            f.write("%s\n" % " ".join(labeled_sexp))
Example 37
def crawler(url, start_page, end_page):
    with open("output.json", "w") as f:
        # open the browser
        browser = webdriver.Firefox()
        # load the target URL
        browser.get(url)
        # get the 'product overview' (產品總覽) link
        res = browser.find_element_by_id('ContentPlaceHolder1_LinkButton11')
        # click it
        res.click()
        #crawl from start_page to end_page
        for i in pyprind.prog_bar(range(start_page, end_page + 1)):
            # no need to click for the first page, it is already shown
            if (i != 1):
                # find the link to page i
                res = browser.find_element_by_link_text(str(i))
                # click it
                res.click()
            #get the source of page
            pagesource = browser.page_source
            # parse the page content
            soup = BeautifulSoup(pagesource, "lxml")
            #get the table
            table = soup.find('table',
                              attrs={'id': 'ContentPlaceHolder1_GVTABPRO'})
            #get the rows of table
            rows = table.find_all('tr')
            index = 0
            for row in rows:
                # index == 0 means this is the header row
                if (index == 0):
                    cols = row.find_all('th')
                    colname = [element.text.strip() for element in cols]
                    index = index + 1
                else:
                    #get the cols from rows
                    cols = row.find_all('td')
                    # the cell texts of the row are now stored in cols
                    cols = [element.text.strip() for element in cols]
                    # this is the pagination row, so stop here
                    if (cols[0] == '12345678910'):
                        break
                    # store the cols into data
                    # data is a dict keyed by column name
                    data = {
                        str(colname[0]): cols[0],
                        str(colname[1]): cols[1],
                        str(colname[2]): cols[2],
                        str(colname[3]): cols[3],
                        str(colname[4]): cols[4],
                        str(colname[5]): cols[5],
                        str(colname[6]): cols[6],
                        str(colname[7]): cols[7],
                        str(colname[8]): cols[8]
                    }
                    #store into dataout
                    dataout.append(data)
        browser.close()
        f.write(json.dumps(dataout))
Example 38
def crawl(i):
	info = graph.get_object(i)
	print(info)
	posts = graph.get_connections(i, 'posts')
	for p in pyprind.prog_bar(posts['data']):
		p['reactions'] = graph.get_connections(p['id'], 'reactions')
		p['comments'] = graph.get_connections(p['id'], 'comments')
	json.dump(posts, open('facebook.json', 'w'))
Example 39
File: misc.py Project: andim/mise
def progressbar(iterator):
    # if available add progress indicator
    try:
        import pyprind
        iterator = pyprind.prog_bar(iterator)
    except:
        pass
    return iterator
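Because the wrapper falls back to the plain iterator when pyprind is missing, call sites can use it unconditionally; a hypothetical caller:

squares = []
for n in progressbar(range(10000)):
    squares.append(n * n)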
    def import_institucion_educativa_superior(self, dump):
        instituciones = []
        lines = self.convert_to_lines(dump)
        for line in pyprind.prog_bar(
                lines, monitor=True,
                title="Importing high studies for candidate"):
            this_inst_edu = get_institucion_superior(line)
            if this_inst_edu not in instituciones:
                instituciones.append(this_inst_edu)
        upload_instituciones(instituciones)
Example 41
def inspect_parks(parks, output_dir):
    """Request data for each park, process it, and write it to disk."""
    bar = pyprind.ProgBar(len(parks))
    for park in pyprind.prog_bar(parks):
        data = inspect_park(park)
        fn = join(output_dir, '{}.json'.format(park.id))
        with open(fn, 'w') as f:
            json.dump(data, f)
        bar.update(item_id=park.name[:20])
Example 42
    def train(self):

        memory = ReplayMem(
            obs_dim=self.env.observation_space.flat_dim,
            act_dim=self.env.action_space.flat_dim,
            memory_size=self.memory_size)

        itr = 0
        path_length = 0
        path_return = 0
        end = False
        obs = self.env.reset()

        for epoch in xrange(self.n_epochs):
            logger.push_prefix("epoch #%d | " % epoch)
            logger.log("Training started")
            for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                # run the policy
                if end:
                    # reset the environment and strategy when an episode ends
                    obs = self.env.reset()
                    self.strategy.reset()
                    # self.policy.reset()
                    self.strategy_path_returns.append(path_return)
                    path_length = 0
                    path_return = 0
                # note action is sampled from the policy not the target policy
                act = self.strategy.get_action(obs, self.policy)
                nxt, rwd, end, _ = self.env.step(act)

                path_length += 1
                path_return += rwd

                if not end and path_length >= self.max_path_length:
                    end = True
                    if self.include_horizon_terminal:
                        memory.add_sample(obs, act, rwd, end)
                else:
                    memory.add_sample(obs, act, rwd, end)

                obs = nxt

                if memory.size >= self.memory_start_size:
                    for update_time in xrange(self.n_updates_per_sample):
                        batch = memory.get_batch(self.batch_size)
                        self.do_update(itr, batch)

                itr += 1

            logger.log("Training finished")
            if memory.size >= self.memory_start_size:
                self.evaluate(epoch, memory)
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
def create_final_image_barcode(pieces_width, final_width, height, fname, images):
    bc = Image.new('RGB', (pieces_width, height))
    
    posx = 0
    for img in pyprind.prog_bar(images):
        bc.paste(img[0], (posx, 0))
        posx += img[1]

    os.chdir('..')
    bc = bc.resize((final_width, height), Image.ANTIALIAS)
    bc.save(fname, 'PNG')
    def optimize_gen(self, inputs, extra_inputs=None, callback=None, yield_itr=None):

        if len(inputs) == 0:
            # Assumes that we should always sample mini-batches
            raise NotImplementedError

        f_opt = self._opt_fun["f_opt"]
        f_loss = self._opt_fun["f_loss"]

        if extra_inputs is None:
            extra_inputs = tuple()

        last_loss = f_loss(*(tuple(inputs) + extra_inputs))

        start_time = time.time()

        dataset = BatchDataset(
            inputs, self._batch_size,
            extra_inputs=extra_inputs
            #, randomized=self._randomized
        )

        itr = 0
        for epoch in pyprind.prog_bar(list(range(self._max_epochs))):
            for batch in dataset.iterate(update=True):
                f_opt(*batch)
                if yield_itr is not None and (itr % (yield_itr+1)) == 0:
                    yield
                itr += 1

            new_loss = f_loss(*(tuple(inputs) + extra_inputs))
            if self._verbose:
                logger.log("Epoch %d, loss %s" % (epoch, new_loss))

            if self._callback or callback:
                elapsed = time.time() - start_time
                callback_args = dict(
                    loss=new_loss,
                    params=self._target.get_param_values(trainable=True) if self._target else None,
                    itr=epoch,
                    elapsed=elapsed,
                )
                if self._callback:
                    self._callback(callback_args)
                if callback:
                    callback(**callback_args)

            if abs(last_loss - new_loss) < self._tolerance:
                break
            last_loss = new_loss
Example 45
    def genCharVideo(self, filepath):
        self.charVideo = []
        cap = cv2.VideoCapture(filepath)
        self.timeInterval = round(1 / cap.get(5), 3)  # cap.get(5): FPS
        nf = int(cap.get(7))  # cap.get(7): total frame count
        print("Generate char video, please wait...")
        if cap.isOpened():
            for i in pyprind.prog_bar(range(nf)):
                ret, vframe = cap.read()
                if ret:
                    rawFrame = cv2.cvtColor(vframe, cv2.COLOR_BGR2GRAY)
                    frame = self.convert(rawFrame, os.get_terminal_size(),
                                         fill=True)
                    self.charVideo.append(frame)
            cap.release()
Example 46
def epic_ixs(primers, interval=80, search_range=30):
    """ Find triplets of indices among primer candidates that are on the average 80 bases apart
        with flexibility of 30 bases.
    """
    starts = list(map(list, zip(*primers)))[0]
    for start1 in pyprind.prog_bar(starts):
        start2 = start1 + interval
        start3 = start2 + interval
        for ix1 in range(-search_range, search_range):
            str2 = start2 + ix1
            for ix2 in range(-search_range, search_range):
                str3 = start3 + ix2
                if str2 in starts and str3 in starts:
                    yield(start1, str2, str3)
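A hypothetical call, assuming primers is a sequence of (start, sequence) pairs as the zip(*primers) unpacking implies (positions and sequences below are made up):

primers = [(0, "ACGTACGT"), (78, "TTGCAATG"), (161, "GGATCCAA"), (300, "CCAATTGG")]
for triplet in epic_ixs(primers, interval=80, search_range=30):
    print(triplet)  # -> (0, 78, 161): starts roughly 80 bases apart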
    def import_institucion_educativa(self, dump):
        instituciones = []
        for line in pyprind.prog_bar(dump):
            fields = line.strip().split('\t')

            this_inst_edu = get_institucion_primaria(fields)
            if this_inst_edu not in instituciones:
                instituciones.append(this_inst_edu)

            this_inst_edu = get_institucion_secundaria(fields)
            if this_inst_edu not in instituciones:
                instituciones.append(this_inst_edu)

        upload_instituciones(instituciones)
Example 48
    def render_model_image(self, fimg, xlim=None, ylim=None, exclude=None):
        # create model image, and add each patch in - init with sky noise
        mod_img     = np.ones(fimg.nelec.shape) * fimg.epsilon
        source_list = [s for s in self.srcs if s is not exclude]

        if not len(source_list) == 0:
            # add each source's model patch
            for s in pyprind.prog_bar(source_list):
                patch, ylim, xlim = s.compute_model_patch(fits_image=fimg, xlim=xlim, ylim=ylim)
                mod_img[ylim[0]:ylim[1], xlim[0]:xlim[1]] += patch

        if xlim is not None and ylim is not None:
            mod_img = mod_img[ylim[0]:ylim[1], xlim[0]:xlim[1]]

        return mod_img
def create_color_barcode(colors, bar_width, height, width, fname):
    barcode_width = len(colors) * bar_width
    bc = Image.new('RGB', (barcode_width, height))
    draw = ImageDraw.Draw(bc)

    # draw the new barcode
    posx = 0
    print('Generating barcode...')
    for color in pyprind.prog_bar(colors):
        draw.rectangle([posx, 0, posx + bar_width, height], fill=color)
        posx += bar_width

    del draw

    bc = bc.resize((width, height), Image.ANTIALIAS)
    bc.save(fname, 'PNG')
    def import_education_for_candidate(self, dump):
        estudios = []
        lines = self.convert_to_lines(dump)
        for line in pyprind.prog_bar(
                lines, monitor=True, title="Importing studies for candidate"):
            if self.sheet == '2':
                e = self.construct_education_obj(line, 'superior')
                estudios.append(e)
            elif self.sheet == '1':
                e = self.construct_education_obj(line, 'primaria')
                if e.inicio != '0':
                    estudios.append(e)

                e = self.construct_education_obj(line, 'secundaria')
                if e.inicio != '0':
                    estudios.append(e)

        Estudio.objects.bulk_create(estudios)
def spawn_image_threads(num_threads, fname, bar_width, height, width):
    # change directories if it already isn't in frames
    if 'frames' not in os.getcwd():
        os.chdir('frames')

    q = queue.Queue()

    # get a distributed list of images for the threads
    images = helpers.distribute_frame_lists(num_threads)
    
    threads = []
    for i in range(num_threads):
        t_fname = 'thread_{}_barcode.png'.format(i)
        thread = threading.Thread(target=create_thread_barcode, 
                                  args=(bar_width, height, t_fname, images[i], i, q))
        threads.append(thread)


    # stitch together several smaller barcodes on separate threads
    # to speed up the process
    print('{} threads creating barcodes with {} frames each...'.format(num_threads, len(images[0])))
    print('Progress bar may take a while to start moving if there are a lot of frames.')
    for thread in threads:
        # thread.daemon = True
        thread.start()

    pieces_width = 0
    # a list to put the thread results in the correct order
    thread_results = [None] * num_threads 
    for i in pyprind.prog_bar(range(num_threads)):
        result = q.get()
        thread_results[result[0]] = [result[1], result[2]]
        pieces_width += result[2]

    # then finally stitch together all the pieces that the threads
    # generated
    print('Generating final barcode...')
    create_final_image_barcode(pieces_width, width, height, fname, thread_results)

    # delete thread pieces
    for i in range(num_threads):
        os.remove('frames/thread_{}_barcode.png'.format(i))

    return
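create_thread_barcode and create_final_image_barcode are not shown in this example. Judging from how the queue results are consumed above, each worker is expected to put a (thread_index, filename, piece_width) tuple on the queue; below is a hedged sketch of such a worker, with every name and the colour source assumed rather than taken from the original project.

from PIL import Image, ImageDraw

def create_thread_barcode(bar_width, height, fname, frame_files, index, q):
    # Hypothetical worker: one bar per assigned frame. The real project would
    # derive each bar's colour from the frame; a flat grey keeps this sketch
    # self-contained.
    piece_width = len(frame_files) * bar_width
    piece = Image.new('RGB', (piece_width, height))
    draw = ImageDraw.Draw(piece)
    for i, _frame in enumerate(frame_files):
        draw.rectangle([i * bar_width, 0, (i + 1) * bar_width, height],
                       fill=(128, 128, 128))
    piece.save(fname, 'PNG')
    # Report (index, filename, width) so the caller can stitch pieces in order.
    q.put((index, fname, piece_width))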
Example n. 52
def validate_directory(dirpath):
    data, errors = extract_directory(dirpath, False), {}
    print("Validating datasets:")
    for ds in pyprind.prog_bar(data):
        try:
            dataset_schema(ds)
        except Invalid as err:
            errors[err.msg] = {"path": err.path, "dataset": ds}
    if errors:
        logfile = "ocelot-validation-errors.log"
        errors = [(k, v['path'], v['dataset']) for k, v in errors.items()]
        print("{} errors found.\nSee error logfile {} for details.".format(
            len(errors), logfile)
        )
        with open(logfile, "w", encoding='utf-8') as f:
            f.write("Internal validation errors for extracted directory:\n{}\n".format(dirpath))
            f.write(pprint.pformat(errors, width=120, compact=True))
    else:
        print("No errors found")
Example n. 53
    def fit(num_epochs, minibatch_size, L, optimizer, sess):
        # Nested helper: N, xdim, X / X_all, load_data, vlb and callback are
        # closed over from the enclosing scope. The snippet targets Python 2
        # and pre-1.0 TensorFlow (tf.pack, tf.initialize_variables, xrange).
        num_batches = N // minibatch_size

        # set up cost function and updates
        if load_data:
            idx      = tf.placeholder(tf.int32, name='idx')
            mbsize   = tf.constant(minibatch_size)
            xdimsize = tf.constant(xdim)
            x_batch  = tf.slice(X_all, tf.pack([idx*mbsize, 0]),
                                       tf.pack([mbsize,xdimsize]), name='x_batch')
        else:
            x_batch  = tf.placeholder(tf.float32, shape=[minibatch_size, xdim],
                                      name='X')
        cost = -tf.reduce_mean(vlb(x_batch, L)) * N
        train_step = optimizer.minimize(cost)

        sess.run(tf.initialize_variables(ut.nontrainable_variables()))

        def train(bidx):
            if load_data:
                train_step.run(feed_dict={idx:bidx}, session=sess)
                return cost.eval(feed_dict={idx:bidx}, session=sess)
            else:
                xb = X[bidx*minibatch_size:(bidx+1)*minibatch_size]
                train_step.run(feed_dict={x_batch: xb}, session=sess)
                return cost.eval(feed_dict={x_batch: xb}, session=sess)

        start = time()
        for i in xrange(num_epochs):
            bidxs = npr.permutation(num_batches)
            vals = [train(bidx) for bidx in pyprind.prog_bar(bidxs)]
            print 'epoch {:>4} of {:>4}: {:> .6}'.format(
                i+1, num_epochs, np.median(vals[-10:]))
            if callback:
                callback(i)

            # will tell you what nodes are being added
            #tf.get_default_graph().finalize()

        stop = time()
        print 'cost {}, {:>5} sec per update, {:>5} sec total\n'.format(
            np.median(vals[-10:]), (stop - start) / N, stop - start)
def main(args):
    path = args.path
    dim = args.dim
    topk = args.topk
    output = args.output

    word2vec = word_evaluation.load_word2vec(path=path, dim=dim)
    vocab = word2vec.keys()
    wrapper = word_evaluation.Wrapper(word2vec)

    with open(output, "w") as f:
        word_i = 0
        vocab_size = len(vocab)
        for word in pyprind.prog_bar(vocab):
            retrieved = wrapper.most_similar(positives=[word], negatives=[], K=topk)
            res = [w for w, s in retrieved]
            res = " ".join(res)
            f.write("[%d/%d: %s]: %s\n" % (word_i+1, vocab_size, word, res))
            f.flush()
            word_i += 1
Example n. 55
def subject_verify(new_arxiv):
    if new_arxiv.count > 0:
        subject_list = copy.copy(new_arxiv.subject)
        remove_list = []
        new_ver = arxiv(new_arxiv.author)
        new_ver.parse()
        for count in pyprind.prog_bar(range(len(new_ver.title))):
            if len(set(subject_list) & set(new_ver.category[count])) == 0:
                remove_list.append(count)
        new_ver.arxiv_id = (np.delete(np.array(new_ver.arxiv_id), remove_list, axis=0)).tolist()
        new_ver.time = (np.delete(np.array(new_ver.time), remove_list, axis=0)).tolist()
        new_ver.title = (np.delete(np.array(new_ver.title), remove_list, axis=0)).tolist()
        new_ver.category = (np.delete(np.array(new_ver.category), remove_list, axis=0)).tolist()
        new_ver.pdf = (np.delete(np.array(new_ver.pdf), remove_list, axis=0)).tolist()
        new_ver.contributor = (np.delete(np.array(new_ver.contributor), remove_list, axis=0)).tolist()
        new_ver.count = len(new_ver.title)
        new_ver.subject = combine_subject(new_ver.category)
        print('Removed %d articles' % len(remove_list))
        return new_ver
    else:
        return new_arxiv
Example n. 56
    def institution_verify(self, save=False, institution=['nyu', 'new york university']):
        if self.count != 0:
            remove_list = []
            if save and not os.path.exists('./paper/%s/' % self.author):
                os.makedirs('./paper/%s/' % self.author)
            for count in pyprind.prog_bar(range(len(self.pdf))):
                os.system('wget -q -U "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 '
                          'Firefox/3.6.3" -O ./check.pdf %s' % self.pdf[count])
                if save:
                    #os.system('cp ./check.pdf ./paper/%s/%s.pdf' %(self.author, self.arxiv_id[count]))
                    if len(self.arxiv_id[count].split('/')) > 1:
                        temp_dir = self.arxiv_id[count].split('/')[0]
                        if not os.path.exists('./paper/%s/%s/' % (self.author, temp_dir)):
                            os.makedirs('./paper/%s/%s/' % (self.author, temp_dir))
                    shutil.copy('./check.pdf', './paper/%s/%s.pdf' % (self.author, self.arxiv_id[count]))
                try:
                    text = convert('./check.pdf', pages=[0, 1, 2]).lower()
                    match_flag = False
                    for match_text in institution:
                        if text.find(match_text) != -1:
                            match_flag = True
                            break
                    if match_flag:
                        continue
                    else:
                        remove_list.append(count)
                except Exception:
                    print("Cannot read file %s" % self.arxiv_id[count])
                    remove_list.append(count)
                    continue
            os.system("rm ./check.pdf")
            self.arxiv_id = (np.delete(np.array(self.arxiv_id), remove_list, axis=0)).tolist()
            self.time = (np.delete(np.array(self.time), remove_list, axis=0)).tolist()
            self.title = (np.delete(np.array(self.title), remove_list, axis=0)).tolist()
            self.category = (np.delete(np.array(self.category), remove_list, axis=0)).tolist()
            self.pdf = (np.delete(np.array(self.pdf), remove_list, axis=0)).tolist()
            self.contributor = (np.delete(np.array(self.contributor), remove_list, axis=0)).tolist()
            self.count = len(self.title)
            self.subject = combine_subject(self.category)
            print('Removed %d articles' % len(remove_list))
Example n. 57
def complete_me(content_as_list, output_filename, email):
    """
    Add metadata to the blast output file. Metadata is obtained by querying the
    NCBI database.

    :param content_as_list: blast output content (CSV file) as list of lines.
    :param output_filename: write line by line.
    """
    Entrez.email = email

    for i in pyprind.prog_bar(range(len(content_as_list))):
        line = content_as_list[i]
        line = line.strip()
        if line.startswith('query'):
            with open(output_filename, 'w') as handle:
                handle.write(line + '\tGeneLength\tTitle\n')
            continue

        line_complement = _get_metadata_as_string(line)

        with open(output_filename, 'a') as handle:
            handle.write(line + '\t' + line_complement + '\n')
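A usage sketch for complete_me; the file names are placeholders and _get_metadata_as_string is assumed to be defined elsewhere in the original module.

with open("blast_results.csv") as handle:
    lines = handle.readlines()

complete_me(lines, "blast_results_annotated.tsv", "you@example.org")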
Example n. 58
def spawn_threads(threads, kmeans):
    # change directories if it already isn't in frames
    if 'frames' not in os.getcwd():
        os.chdir('frames')

    q = queue.Queue()
    num_threads = threads

    # get a distributed list of images for the threads
    images = helpers.distribute_frame_lists(num_threads)

    threads = []
    for i in range(num_threads):
        if kmeans:
            thread = threading.Thread(target=kc.get_image_colors,
                                      args=(i, q, images[i]))
        else:
            thread = threading.Thread(target=pc.get_image_colors,
                                      args=(i, q, images[i]))

        threads.append(thread)

    print('{} threads generating frame colors with {} frames each...'.format(num_threads, len(images[0])))
    for thread in threads:
        thread.daemon = True
        thread.start()

    thread_results = [None] * num_threads
    for i in pyprind.prog_bar(range(num_threads)):
        result = q.get()
        thread_results[result[0]] = result[1]

    # return to the original directory
    os.chdir('..')

    return [item for sublist in thread_results for item in sublist]
Example n. 59
    bsrcs = ssrcs[38:39] + gsrcs[38:39]
    bidx  = np.concatenate([sidx[38:39], gidx[38:39]])

    # breadcrumbs - make sure we can examine which source corresponds to
    # which catalog entry
    blocs = np.array([s.params.u for s in bsrcs])
    plocs = primary_field_df[['ra', 'dec']].values[bidx,:]
    assert np.allclose(blocs, plocs), "not the same location! noooo"

    ######################################
    # gibbs step on a handful of sources #
    ######################################
    print "======= running celeste sampler ========"
    # do some resampling, each source keeps each sample
    Nsamps = 10
    for i in pyprind.prog_bar(xrange(Nsamps)):
        # resample photon images
        model.field_list[0].resample_photons(bsrcs, verbose=True)
        # resample source params
        for s in pyprind.prog_bar(bsrcs):
            s.resample()
            s.store_sample()
            s.store_loglike()
        # global/local update
        #for s in bsrcs:
        #    s.sample_type()
        # global updates
        #model.sample_birth()
        #model.sample_death()

    ########################################
Example n. 60
    def __init__(self, items):
        from pyprind import prog_bar
        self.bar = prog_bar(items)
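The rest of this wrapper class is not shown. Since pyprind.prog_bar wraps an iterable and advances the bar as items are consumed, a hedged sketch of how such a wrapper might be driven (the class name and the __iter__ method are assumptions):

from pyprind import prog_bar

class ProgressWrapper:
    def __init__(self, items):
        self.bar = prog_bar(items)

    def __iter__(self):
        # Iterating the stored generator yields the original items while
        # pyprind renders the progress bar.
        return iter(self.bar)

for item in ProgressWrapper(range(1000)):
    pass  # per-item work goes here; the bar advances automatically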