Example #1
import math

def euclidean_distance(vector_1, vector_2):
    '''
    Calculates the Euclidean distance between two vectors.
    '''
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    return math.sqrt(
        sum(math.pow(v1 - v2, 2) for v1, v2 in zip(vector_1, vector_2)))
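A quick sanity check, assuming prepare_data simply passes plain numeric lists through unchanged:

print(euclidean_distance([0, 0], [3, 4]))  # a 3-4-5 right triangle: prints 5.0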
Example #2
def exec_ppjoin():
    def parse_query(query):
        pattern = re.compile(r'-sim (0\.\d+) -dist (\d+)(.*)')
        m = pattern.match(query)

        if m:
            sim = float(m.group(1))
            dist = int(m.group(2))
            text = m.group(3)
            return sim, dist, text  # similarity first, to match the caller's unpacking
        else:
            return None, None, query

    query = request.args.get('q')  # the search query arrives in the 'q' URL parameter

    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)

    if query:
        theta, epsilon, text = parse_query(query)

        if not theta:
            theta = 0.1
            epsilon = 100

        if text:
            res = stTextSearch(df, text, theta)
        else:
            res = ppj_c(df, theta, epsilon)
    else:
        theta = 0.5
        epsilon = 100
        res = ppj_c(df, theta, epsilon)

    return res
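The regex implies queries of the form "-sim <0.x> -dist <n> <free text>". A hypothetical request against this view (the route name is an assumption) would be parsed like this:

# e.g. GET /search?q=-sim 0.8 -dist 200 coffee shops
# parse_query('-sim 0.8 -dist 200 coffee shops') -> (0.8, 200, ' coffee shops')
# the leading space in the text comes from the (.*) capture group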
Example #3
def get_test_data(path, normalize=True, means=None, stds=None, train_data=None):
    """
    Convenience function for extracting and optionally normalizing test data.

    Args:
        path: str, path to the data file
        normalize: boolean, whether to normalize numeric columns
        means: dict, containing means of columns to be normalized
        stds: dict, containing stds of columns to be normalized
        train_data: DataFrame, if not None, reference DataFrame for adding missing columns to test_data

    Returns:
        test_data: DataFrame
    """

    if normalize:
        assert means is not None
        assert stds is not None

    test_data = prepare_data(path)

    if train_data is not None:
        test_data = add_missing_cols(train_data, test_data)
        # test_data = test_data[train_data.columns]

    if normalize:
        test_data = normalize_test_data(test_data, means, stds)

    return test_data
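A hedged sketch of the guard above: calling with normalize=True but no training statistics trips the assertions (the path is a placeholder):

try:
    get_test_data('data/test.csv', normalize=True)
except AssertionError:
    print('means and stds are required when normalize=True')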
Example #4
import math

def minkowski_distance(vector_1, vector_2, n_root):
    '''
    Calculates the Minkowski distance between two vectors.
    '''
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    if isinstance(n_root, int) and n_root >= 1:
        return sum(
            math.pow(abs(v1 - v2), n_root)
            for v1, v2 in zip(vector_1, vector_2))**(1 / n_root)
    elif isinstance(n_root, str):
        try:
            n_root = int(n_root)
            return minkowski_distance(vector_1, vector_2, n_root)
        except ValueError:
            raise ValueError("n_root must be an integer greater than or equal to 1")
    else:
        raise ArithmeticError("n_root must be an integer greater than or equal to 1")
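A sketch of how n_root generalizes the other metrics in this set, assuming prepare_data passes plain lists through:

v1, v2 = [1, 2, 3], [4, 6, 8]
print(minkowski_distance(v1, v2, 1))  # 12.0 -- matches the Manhattan distance
print(minkowski_distance(v1, v2, 2))  # ~7.07 -- matches the Euclidean distance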
Example #5
import math

def cosine_similarity(vector_1, vector_2, distance=False):
    '''
    Calculates the cosine similarity between two vectors
    (or the cosine distance, if distance=True).
    '''
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    numerator = sum(v1 * v2 for v1, v2 in zip(vector_1, vector_2))
    denominator = (math.sqrt(sum(v1 ** 2 for v1 in vector_1)) *
                   math.sqrt(sum(v2 ** 2 for v2 in vector_2)))
    if distance:
        return 1 - (numerator / denominator)
    else:
        return numerator / denominator
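Parallel vectors score ~1.0, and distance=True returns one minus that (again assuming prepare_data is a pass-through for lists):

print(cosine_similarity([1, 2, 3], [2, 4, 6]))                 # ~1.0
print(cosine_similarity([1, 2, 3], [2, 4, 6], distance=True))  # ~0.0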
Example #6
def run(num_classes, learning_rate, width, depth, mini_batch_size):

    precision = accuracy = recall = f_score = np.array([])

    X_train, X_test, y_train, y_test, unknown_data = dp.load_data()
    X_train, X_test, y_train, y_test, unknown_data, dtype = dp.prepare_data(
        X_train, X_test, y_train, y_test, unknown_data)

    # Single run; increase the range to average metrics over multiple runs
    for _ in range(1):

        model = NN.Net1(num_classes, depth=depth, width=width).type(dtype)
        opt = optim.SGD(params=model.parameters(), lr=learning_rate,
                        momentum=rp.m, nesterov=True)
        train_losses, test_losses = model.train_validate(
            X_train, y_train, X_test, y_test, opt, mini_batch_size, dtype)

        # Reload the best checkpoint saved during training
        model = torch.load("Models/Best_Model.pkl")

        y_pred, _ = model.test(X_test)

        # Calculate metrics
        y_true = y_test.data.cpu().numpy()
        y_pred = y_pred.data.cpu().numpy()
        a, p, r, f = m.compute_metrics(y_true, y_pred)

        accuracy = np.append(accuracy, a)
        precision = np.append(precision, p)
        recall = np.append(recall, r)
        f_score = np.append(f_score, f)

    accuracy = np.mean(accuracy)
    precision = np.mean(precision)
    recall = np.mean(recall)
    f_score = np.mean(f_score)

    m.show_results(accuracy, precision, recall, f_score, num_classes,
                   train_losses, test_losses)

    # g.generate_graph(model, X_train)

    fw.create_data_csv(learning_rate, depth, width, mini_batch_size, rp.m,
                       len(test_losses) - 10, accuracy)

    # Store the prediction for unknown_data
    y_pred, _ = model.test(unknown_data)
    fw.store_prediction(y_pred.data.cpu().numpy())
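A hypothetical invocation; every hyperparameter below is illustrative rather than a value from the original project:

run(num_classes=10, learning_rate=0.01, width=64, depth=3, mini_batch_size=128)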
Example #7
def main():
    hist = crawl()

    # split data
    train, test = train_test_split(hist, test_size=0.2)

    pd.plotting.register_matplotlib_converters()

    target_col = 'close'
    line_plot(train[target_col], test[target_col], 'training', 'test', title='')

    # model and training hyperparameters
    np.random.seed(42)
    window_len = 5
    test_size = 0.2
    zero_base = True
    lstm_neurons = 100
    epochs = 20
    batch_size = 32
    loss = 'mse'
    dropout = 0.2
    optimizer = 'adam'

    # train model
    train, test, X_train, X_test, y_train, y_test = prepare_data(
        hist, target_col, window_len=window_len, zero_base=zero_base, test_size=test_size)
    model = build_lstm_model(
        X_train, output_size=1, neurons=lstm_neurons, dropout=dropout, loss=loss,
        optimizer=optimizer)
    history = model.fit(
        X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True)

    # mean absolute error on the held-out test windows
    targets = test[target_col][window_len:]
    preds = model.predict(X_test).squeeze()
    mae = mean_absolute_error(y_test, preds)
    print('MAE:', mae)

    # rebuild absolute prices from the predicted relative changes and plot them
    preds = test[target_col].values[:-window_len] * (preds + 1)
    preds = pd.Series(index=targets.index, data=preds)
    line_plot(targets, preds, 'actual', 'prediction', lw=3)
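With zero_base=True the network predicts relative changes, which is why the final lines multiply the window's starting price by (preds + 1); a minimal illustration with made-up numbers:

window_start_price = 100.0  # close at the start of the window (made up)
predicted_change = 0.02     # hypothetical model output: +2% relative change
print(window_start_price * (predicted_change + 1))  # 102.0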
Example #8
def get_train_data(path, normalize=True, num_cols=NUMERIC_COLUMNS):
    """
    Convenience function for extracting and optionally normalizing data.

    Args:
        path: str, path to the data file
        normalize: boolean, whether to normalize numeric columns
        num_cols: list-like, if normalize is True, list of columns to normalize

    Returns:
        train_data: DataFrame
        means: dict, containing means of normalized columns (None if normalize is False)
        stds: dict, containing stds of normalized columns (None if normalize is False)
    """

    train_data = prepare_data(path)
    means = stds = None

    if normalize:
        train_data, means, stds = normalize_multiple_columns(train_data, num_cols)

    return train_data, means, stds
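The two helpers are designed to be chained, with the training statistics feeding the test-side normalization; a sketch using placeholder paths:

train_df, means, stds = get_train_data('data/train.csv')
test_df = get_test_data('data/test.csv', normalize=True,
                        means=means, stds=stds, train_data=train_df)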
Example #9
        for id_ in pair:
            obj = df.loc[id_]
            json_ = {
                "id": id_,
                "long": str(obj.lat),
                "lat": str(obj.lng),
                "text": obj.raw_text
            }
            cell.append(json_)
        result.append(cell)
    return json.dumps(result)

# -----------------------------------------------------------------------------------------------------------------------

if __name__ == "__main__":
    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)
    theta = 0.8

    start_time = time.time()
    pairs = ppjoin(df, inverted_file, theta)
    print "Time elapsed:", time.time() - start_time
    print pairs[0]
    print 'Total: ', len(pairs)
    for pair in pairs:
        id1 = pair[0]["id"]
        id2 = pair[1]["id"]
        print(jaccard_similarity(df.loc[id1].text, df.loc[id2].text))

    group_dict = group_objects(df, theta)
    start_time = time.time()
Example #10
def manhattan_distance(vector_1, vector_2):
    '''
    Calculates the Manhattan distance between two vectors.
    '''
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    return sum(abs(v1 - v2) for v1, v2 in zip(vector_1, vector_2))
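A quick check, again assuming prepare_data passes plain lists through unchanged:

print(manhattan_distance([0, 0], [3, 4]))  # 7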