Example #1
def main():
    raw_list = csv_to_list(csv_file)[:100]
    total_len = len(raw_list)
    counter = 0
    result_dict = dict()
    print "Commencing Web Scraping..."
    start_time = time.time()
    for raw_link in raw_list:
        try:
            raw_link = raw_link[0]
            whois_link = "http://www.whois.com/whois/" + raw_link
            ipaddress_link = "http://" + raw_link + ".ipaddress.com/"
            whois_soup = link_to_lxmlsoup(whois_link)
            ipaddress_soup = link_to_lxmlsoup(ipaddress_link)
            result_dict.setdefault('Raw Link', []).append(str(raw_link))
            result_dict = whois_parser(whois_soup, result_dict)
            result_dict = ipaddress_parser(ipaddress_soup, result_dict)
            counter, total_len = print_counter(counter, total_len)
            if counter % 400 == 0:
                print "Commencing 30 Second Sleep after 400 iterations"
                time.sleep(30)
            time_elapsed = time.time() - start_time
            print_progress(time_elapsed, counter, total_len)
        except:
            dict_to_json(result_dict, 'output.json')
            dict_to_csv(result_dict, 'output.csv')
            print "Unexpected Error", sys.exc_info()[0]
            raise
    dict_to_json(result_dict, 'output.json')
    dict_to_csv(result_dict, 'output.csv')
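
Every example on this page calls a project-local dict_to_json helper (usually via a utils module) whose implementation is not shown, and its signature varies from project to project: some versions return a JSON string (Examples #2, #3, #5), others take an output path and write the file themselves (Examples #17, #18), and Example #12 passes a pretty_print flag. Purely for orientation, a string-returning variant is typically a thin wrapper over the standard json module, along the lines of the sketch below; this is an assumption, not code from any of these repositories.

import json


def dict_to_json(d, pretty_print=False):
    # Minimal sketch of a string-returning dict_to_json helper; the
    # pretty_print keyword is modeled on Example #12 and is an assumption.
    return json.dumps(d, indent=4 if pretty_print else None)


def json_to_dict(s):
    # Inverse helper used by several of the examples (e.g. #4, #7, #11).
    return json.loads(s)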
Example #2
 def get_json(self):
     d = {
         "name": self.name.title(),
         "weight": self.weight,
         "primarytype": self.primarytype,
         "secondarytype": self.secondarytype,
         "typemods": [{"type": k, "value": v}
                      for k, v in sorted(self.typeprofile.type_modifiers.items(),
                                         key=lambda x: x[1]) if v != 1],
         "hp": self.hp,
         "defense": self.defense,
         "attack": self.attack,
         "spattack": self.spattack,
         "spdefense": self.spdefense,
         "speed": self.speed,
     }
     return dict_to_json(d)
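
For orientation, the "typemods" comprehension in Example #2 sorts the type modifiers by value and drops entries equal to 1. The type_modifiers mapping below is hypothetical, used only to illustrate the resulting structure.

# Hypothetical type_modifiers mapping, for illustration only.
type_modifiers = {"water": 2.0, "grass": 0.5, "normal": 1.0, "fire": 0.25}

typemods = [{"type": k, "value": v}
            for k, v in sorted(type_modifiers.items(), key=lambda x: x[1])
            if v != 1]
# -> [{'type': 'fire', 'value': 0.25}, {'type': 'grass', 'value': 0.5},
#     {'type': 'water', 'value': 2.0}]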
Example #3
def for_user_model():
    U = 4
    Nu = [10, 21, 14, 32]
    uids = [20000, 40000, 10000, 30000]
    sport = "Running"
    
    randomState = np.random.RandomState(12345)
    f = gzip.open("synth_user_model.gz", "w")

    theta = randomState.rand(U + 3)    
    alpha_all = theta[-3]
    theta_0 = theta[-2]
    theta_1 = theta[-1]
    data = []
    for u in xrange(0, U):
        nu = Nu[u]
        alpha = theta[u]
        uid = uids[u]
        for i in xrange(0, nu):
            d = randomState.randint(low = 1, high = 10)
            t = (alpha + alpha_all) * (theta_0 + theta_1 * d) * 3600.0
            w = {"Distance" : d, "Duration" : t, "user_id" : str(uid), "sport" : sport}
            w_json = utils.dict_to_json(w)
            f.write(w_json + "\n")
    print theta
    
    f.close()
Example #4
    def get(self):
        increment_hit_counter(datastore_key_hits_streams_json)

        # Get database
        db = memcache.get(memcache_key_database)
        if db is None:
            logger.warn(
                'memcache failed on key: {}'.format(memcache_key_database))
            db_json = ndb_get_entity(JsonDatabase,
                                     datastore_key_database).value
            db = utils.json_to_dict(db_json)
            memcache.set(memcache_key_database, db)

        # Get last update time
        last_update_time = memcache.get(memcache_key_last_update)
        if last_update_time is None:
            logger.warn(
                'memcache failed on key: {}'.format(memcache_key_last_update))
            last_update_time = ndb_get_entity(Time,
                                              datastore_key_last_update).value
            memcache.set(memcache_key_last_update, last_update_time)

        json_obj = {'streams': db, 'last_update': last_update_time}
        json_str = utils.dict_to_json(json_obj)
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json_str)
Example #5
 def dump(self):
     """
     Dump the output to json.
     """
     report_as_json_string = utils.dict_to_json(self.report)
     if self.out_file:
         utils.string_to_file(self.out_file, report_as_json_string)
     else:
         print report_as_json_string
Example #7
def backup_database(backup_key):
    db = memcache.get(memcache_key_database)
    if db is None:
        logger.warn('memcache failed on key: {}'.format(memcache_key_database))
        db_json = ndb_get_entity(JsonDatabase, datastore_key_database).value
        db = utils.json_to_dict(db_json)
        memcache.set(memcache_key_database, db)
    db_json = utils.dict_to_json(db)
    logger.info('Backup database to key: {}'.format(backup_key))
    ndb_set_value(JsonDatabase, backup_key, db_json)
Example #8
def update_database():
    db_json = ndb_get_entity(JsonDatabase, datastore_key_database).value
    db = utils.json_to_dict(db_json)
    current_streams = streams.get_current_streams()

    updated_db = streams.update_database(db, current_streams)
    updated_db_json = utils.dict_to_json(updated_db)

    ndb_set_value(JsonDatabase, datastore_key_database, updated_db_json)
    memcache.delete(memcache_key_database)
    memcache.set(memcache_key_database, updated_db)
Example #9
def books(data, arr):
    if request.json:
        body = dict_to_json(request.json)
        if body['type'] == 'date':
            return books_by_published_date(body, arr)
        elif body['type'] == 'title':
            return books_by_title(body, arr)
        else:
            return 'The type of the request is not recognised', 400
    else:
        return json.dumps(data, indent=4), 201
Example #10
def for_evolving_user_model():
    U = 4
    E = 3
    Nu = [10, 21, 14, 32]
    uids = sorted([20000, 40000, 10000, 30000])
    print uids
    sport = "Running"

    sigma = np.zeros((U, max(Nu)))
    
    randomState = np.random.RandomState(12345)
    f = gzip.open("synth_evolving_user_model.gz", "w")

    #theta = randomState.rand(U * E + E + 2)
    theta_0 = randomState.rand()
    theta_1 = randomState.rand()
    alpha = np.array(sorted(list(randomState.rand(E)), reverse = True))   # global alpha for each experience level (shared across users)
    data = []
    theta = []
    for u in xrange(0, U):
        nu = Nu[u]
        #alpha_u = np.sort(randomState.rand(E))   # per user alpha for each experience
        alpha_u = np.array(sorted(list(randomState.rand(E)), reverse = True))   # per user alpha for each experience
        #print alpha_u
        theta = theta + list(alpha_u)
        uid = uids[u]
        dts = np.sort(randomState.randint(low = 1000000000, high = 1100000000, size = (nu)))
        exp_levels = np.sort(randomState.randint(low = 0, high = E, size = (nu)))
        for i in xrange(0, nu):
            d = randomState.randint(low = 1, high = 10)
            e = exp_levels[i]
            sigma[u, i] = e
            a_e = alpha[e]
            a_ue = alpha_u[e]
            print "u = %d, e = %d, alpha_ue = %f" % (u, e, a_ue)
            t = (a_e + a_ue) * (theta_0 + theta_1 * d) * 3600.0
            #print "a term = " + str((a_e + a_ue)) + " theta term = " + str(theta_0 + theta_1 * d)
            dt = datetime.fromtimestamp(dts[i]).strftime('%b %d, %Y %I:%M %p')
            w = {"Distance" : d, "Duration" : t, "user_id" : str(uid), "sport" : sport, "date-time" : dt, "experience" : e}
            w_json = utils.dict_to_json(w)
            f.write(w_json + "\n")
            data.append([u, uid, d, t, dts[i]])
    
    f.close()
    data = np.matrix(data)

    sigma_list = [list(sigma[i, :]) for i in range(0, U)]

    theta += list(alpha)
    theta += [theta_0]
    theta += [theta_1]
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    print "Actual theta : ", theta
Example #11
def condense_and_clean_data(infile, outfile):
    """
    infile must be a .gz file generated by the sql_to_json_parser.py
    condense_and_clean_data will do the following:
        - replace trace data by averages
        - replace strings like '2.35 mi' to 2.35
    """
    t1 = time.time()
    fo = gzip.open(outfile, "w")
    fi = gzip.open(infile)
    precision = 6   # 6 digits after decimal
    param_formatter = ParamFormatter(precision = precision)
    n = 0
    n_params_ignored = 0
    n_values_ignored = 0
    ignored_params = set()
    ignored_values = set()
    for line in fi:
        d = {}
        w = utils.json_to_dict(line.strip())
        for k, v in w.items():
            if (isinstance(v, list)):
                # replace trace data by averages
                v = round(numpy.mean(utils.remove_null_values_single(v)), precision)
                k = k + "(avg)"
                d[k] = v
            else:
                # convert and replace units - for example, convert '2.35 mi' to 2.35
                try:
                    v = param_formatter.to_number(k, v)
                    d[k] = v
                except InvalidValueException as e:
                    n_values_ignored += 1
                    ignored_values.add(e.value)
                except InvalidParamException as e:
                    n_params_ignored += 1
                    ignored_params.add(e.param)
        w_str = utils.dict_to_json(d)
        fo.write(w_str + "\n")
        n += 1
        if (n % 10000 == 0):
            print "Written %d workouts.." % (n)
    fi.close()
    fo.close()
    t2 = time.time()
    print "Time taken = " + str(t2 - t1) + " seconds"
    print "%d params ignored" % (n_params_ignored)
    print "List of ignored parameters : " + str(ignored_params)
    print "%d values ignored" % (n_values_ignored)
    print "List of ignored values : " + str(ignored_values)
    print "Total %d workouts written" % (n)
Example #12
 def dump(self):
     """
     Dump the output to json.
     """
     if self.options.role is not None:
         role = "roles/" + self.options.role
         if role not in self.report["roles"]:
             print("Role does not exist: %s" % self.options.role)
             sys.exit(2)
         data = self.report["roles"][role]
     else:
         data = self.report
     report_as_json_string = utils.dict_to_json(data, pretty_print=True)
     if self.out_file:
         utils.string_to_file(self.out_file, report_as_json_string)
     else:
         print report_as_json_string
Example #13
 def get(self):
     logger.info('Database init started')
     afreeca_json = streams.afreeca_init_db_json
     init_db = streams.get_initial_database(afreeca_json)
     init_db_json = utils.dict_to_json(init_db)
     try:
         ndb_set_value(JsonDatabase, datastore_key_database, init_db_json,
                       True)
         ndb_set_value(Time, datastore_key_last_update,
                       datetime.datetime.utcnow())
     except KeyError:
         pass
     try:
         ndb_set_value(HitCounter, datastore_key_hits_streams_json, 0, True)
     except KeyError:
         pass
     logger.info('Database init finished')
Example #14
def for_linear_model():
    theta = np.matrix([[0.1, 0.2]]).T
    X = np.matrix(np.random.rand(1000, 1))
    X = add_offset_feature(X)
    print X.shape
    noise =  np.matrix(np.random.normal(0, 0.1, 1000)).T
    Y = X.dot(theta) + noise
    print Y.shape
    np.save("X_synthetic_direct.npy", X)
    np.save("Y_synthetic_direct.npy", Y)

    with gzip.open("synthetic_workouts.gz", "w") as f:
        for i in range(0, 1000):
            d = {}
            d["Distance"] = 100 * X[i, 1]
            d["Duration"] = 100 * Y[i, 0]
            s = utils.dict_to_json(d)
            f.write(s + "\n")
Example #15
def for_baseline_model():
    U = 4
    Nu = [10, 21, 4, 32]
    uids = [20000, 40000, 10000, 30000]
    sport = "Running"
    
    randomState = np.random.RandomState(12345)
    f = gzip.open("synth_baseline_model.gz", "w")

    theta = randomState.randint(low = 100, high = 10000, size = U)
    print theta
    data = []
    for u in xrange(0, U):
        nu = Nu[u]
        v = theta[u]
        uid = uids[u]
        for i in xrange(0, nu):
            d = randomState.randint(low = 1, high = 10)
            t = v * d
            w = {"Distance" : d, "Duration" : t, "user_id" : str(uid), "sport" : sport}
            w_json = utils.dict_to_json(w)
            f.write(w_json + "\n")
    
    f.close()
Example #16
def test_set_database(db, out_name):
    json_str = utils.dict_to_json(db)
    with open(out_name, 'w') as f:
        f.write(json_str)
Example #17
def run(config_file, fold=0, device_id=0):

    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)

    config = load_config(config_file)

    if '_fold' not in config.work_dir:
        config.work_dir = config.work_dir + '_fold{}'.format(fold)

    validloader = make_loader(
        data_dir=config.data.train_dir,
        df_path=config.data.train_df_path,
        features=config.data.features,
        phase='valid',
        img_size=(config.data.height, config.data.width),
        batch_size=config.test.batch_size,
        num_workers=config.num_workers,
        idx_fold=fold,
        transforms=get_transforms(config.transforms.test),
        model_scale=config.data.model_scale,
        return_fnames=True,
    )

    # load model
    checkpoint_path = config.work_dir + '/checkpoints/best.pth'
    model = load_model(config_file, checkpoint_path)

    folds = pd.read_csv('data/folds.csv')

    predictions = []
    targets = []
    image_ids = []
    z_pos = config.data.z_pos[0]
    with torch.no_grad():
        for i, (batch_images, batch_mask_regr,
                batch_image_ids) in enumerate(tqdm(validloader)):
            batch_preds = model(batch_images.to(config.device))
            batch_preds[:, 0] = torch.sigmoid(batch_preds[:, 0])
            batch_preds[:, z_pos] = depth_transform(batch_preds[:, z_pos])

            batch_preds = batch_preds.data.cpu().numpy()
            batch_mask_regr = batch_mask_regr.data.cpu().numpy()
            image_ids.extend(batch_image_ids)

            for preds, mask_regr, image_id in zip(batch_preds, batch_mask_regr,
                                                  batch_image_ids):
                coords = extract_coords(
                    preds,
                    features=config.data.features,
                    img_size=(config.data.height, config.data.width),
                    confidence_threshold=config.test.confidence_threshold,
                    distance_threshold=config.test.distance_threshold,
                )
                predictions.append(coords)

                s = folds.loc[folds.ImageId == image_id.split('.jpg')[0],
                              'PredictionString'].values[0]
                true_coords = str2coords(
                    s, names=['id', 'yaw', 'pitch', 'roll', 'x', 'y', 'z'])
                targets.append(true_coords)

    with open(config.work_dir + '/predictions.pkl', 'wb') as f:
        pickle.dump(predictions, f)
    with open(config.work_dir + '/targets.pkl', 'wb') as f:
        pickle.dump(targets, f)

    rows = []
    for p, i in zip(predictions, image_ids):
        rows.append({'ImageId': i, 'PredictionString': coords2str(p)})
    pred_df = pd.DataFrame(rows)
    pred_df.to_csv(config.work_dir + '/val_pred.csv', index=False)

    all_result, result = calc_map_score(targets, predictions)
    result['confidence_threshold'] = config.test.confidence_threshold
    result['distance_threshold'] = config.test.distance_threshold

    dict_to_json(
        all_result, config.work_dir +
        '/all_result_th{}.json'.format(config.test.distance_threshold))
    dict_to_json(
        result, config.work_dir +
        '/result_th{}.json'.format(config.test.distance_threshold))

    for k in sorted(result.keys()):
        print(k, result[k])
Example #18
    for div in divs:
        date = div.find("div", {"class": "meta"}).text
        picture = div.find("img")["src"]
        description = div.find("p").text
        title = div.find("h2").text
        tip = div.find("div", {"class": "img_desc"}).text
        url = div.find("a")["href"]
        content = scrap_content(url)
        news_data.append({
            "title": title,
            "dateInfo": date,
            "picture": picture,
            "description": description,
            "type": tip,
            "url": url,
            "content": content
        })
    return news_data


def fcs_crawler():
    return crawl_articles(FCS_URL)


if __name__ == '__main__':

    from utils import dict_to_json
    data = fcs_crawler()
    dict_to_json("JSON_IGNORE.json", data)
Example #19
    model = stock_rnn_model.model
    callback = EarlyStopping(monitor="loss",
                             patience=10,
                             verbose=1,
                             mode="auto")
    training_history = model.fit(X_train,
                                 Y_train,
                                 epochs=1000,
                                 batch_size=128,
                                 validation_data=(X_val, Y_val),
                                 callbacks=[callback])

    model.save(model_type + "_stock_" + input_output_type + "_inference.h5")
    training_history_dict = training_history.history
    utils.dict_to_json(
        training_history_dict,
        "./" + model_type + "_" + input_output_type + "_training_history")
    plot_loss(training_history_dict)
    training_score, validation_score = model_score(model, X_train, Y_train,
                                                   X_val, Y_val)
    info("=================================")
    info("Train MSE: %.5f%%" % training_score)
    info("Validation MSE: %.5f%%" % validation_score)
    info("=================================")

    predicted_close = model.predict(final_test_for_all)
    #print(predicted_close)
    #print(predicted_close.shape)
    predicted_close = stock_rnn_model.get_avereage_predicted_close(
        predicted_close, "mean")
    #print(predicted_close)