Example no. 1
def save(ptosave):
    f = open('data.csv', 'a+')
    # print(ptosave)
    for pt in ptosave:
        ps = pt[0]
        if not is_necessary(ps):
            continue
        psts = pt[1]
        # Build all time-based fields from a single datetime conversion.
        dt = datetime.datetime.fromtimestamp(psts)
        month = dt.strftime('%m')
        dayofmonth = dt.strftime('%d')
        dayofweek = dt.strftime('%w')
        hour = dt.strftime('%H')
        minute = dt.strftime('%M')
        data = ps + "," + getprocess_file_name(ps) + "," + month + "," + dayofmonth + "," + dayofweek + \
               "," + hour + "," + minute + "," + str(psts)
        # print(data)
        f.write(data + "\n")
        print("Logged for : '" + getprocess_file_name(ps) + "' at " +
              dt.strftime('%d-%m-%y %H:%M'))
    f.close()
    # Copy the log next to the original as odata.csv, preserving file metadata.
    src = path.realpath('data.csv')
    head, tail = path.split(src)
    dst = path.join(head, "odata.csv")
    shutil.copy(src, dst)
    shutil.copystat(src, dst)
    data_parser.parse_data()
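For readability, a hedged usage sketch of save(): it expects an iterable of (process_name, timestamp) pairs, with is_necessary and getprocess_file_name provided by the surrounding module; the process names below are made up for illustration.

import time

points = [("chrome.exe", time.time()), ("explorer.exe", time.time() - 60)]
save(points)  # appends one CSV row per necessary process, mirrors the file to odata.csv, then re-parses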
Example no. 2
def main():
    # 1. Read the data.
    matches = dp.parse_data("sc_data/wc_group_matches.csv")
    teams = dp.parse_data("sc_data/team_info.csv")
    history_matches = dp.parse_data("sc_data/history_matches.csv")
    squads = dp.parse_data("sc_data/squads.csv")

    # 2. Initialise the structure holding the results.
    match_details = dict()

    # 3. Compute a recommendation score for every match.
    for match in matches:
        # 3.1 Basic match information.
        match_id = match["No."]
        team_a, team_b = match["TeamA"], match["TeamB"]
        rank_a, code_a = get_team_info(teams, team_a)
        rank_b, code_b = get_team_info(teams, team_b)

        # 3.2 How uncertain the outcome is.
        uncertainty_score = get_match_uncertainty(match, rank_a, rank_b)

        # 3.3 Expected number of goals.
        goal_score = get_match_goals(history_matches, team_a, team_b)

        # 3.4 Fame of the players involved.
        fame_score = get_match_fame(squads, code_a, code_b)

        # 3.5 Overall recommendation score.
        recommendation_score = (uncertainty_score + goal_score + fame_score) / 3

        print(match_id + " " + team_a + " vs. " + team_b + ":")
        print("unc_score: %.2f goal_score: %.2f fame_score: %.2f" %
              (uncertainty_score, goal_score, fame_score))
        print("recommendation score: %.2f" % recommendation_score)
Example no. 3
def run_configs(data_dir, reviews_filename):
    # base path of the raw data, e.g. {root}/data/preprocessed_files/electronics/reviews_Electronics_5
    filename = data_dir + reviews_filename

    # file endings
    raw = filename + ".json.gz"
    reviews = filename + "_reviews.txt"
    ratings = filename + "_ratings.npy"

    # if the reviews or ratings file doesn't exist, parse the raw data to create them
    if not os.path.isfile(reviews) or not os.path.isfile(ratings):
        data_parser.parse_data(raw)
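A hedged usage example; the directory below follows the layout hinted at in the comment above and is only illustrative.

run_configs("data/preprocessed_files/electronics/", "reviews_Electronics_5")
# On the first run this presumably writes ..._reviews.txt and ..._ratings.npy next to the .json.gz file.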
Example no. 4
def run_pipeline():
    """
    Runs all functions in the pipeline. Parses tracking and events data from 52 xml files. Preprocesses the DataFrame
    to conform to Metrica Sports format. Then calculates EPV values to get the optimal passes using the
    Friends Of Tracking code, which can be found in the EPV_code folder. Then creates multiple features based on
    tracking and events data. Followed by the analysis, using a Linear Regression and Decision Tree.

    After each step files are saved to the /data folder.
    """
    data_parser.parse_data()
    preprocessing.preprocess()
    generate_EPV_values.generate_epv_files()
    feature_engineering.engineer_features()
    analysis.run_analysis()
Example no. 5
def main():
    #logReg()

    in_size = 32  # length * width of image, but 1 for now
    num_Of_Input = 1000  # number of images
    out_size = 2  # size of output
    step_Size = 0.0001  # learning rate
    iter = 100  # iterations of gradient descent
    norm = 0.99/255
    normBias = 0.01
    X = np.random.randint(0, 2, (in_size, num_Of_Input))  # input layer, tested with random int 0 to 1 for grayscale
    Y = np.random.rand(out_size, num_Of_Input)  # output_layer_truths where rows are outputs and col are examples


    train_images_stack, train_labels_stack, test_images_stack, test_labels_stack = data_parser.parse_data(norm, normBias)
    layers = [784, 16, 10]
    trainSize = 50000 # max 60000
    testSize = 10000 # max 10000
    model = Model(layers, trainSize)
    #[:, 0:trainSize]
    model.generateLayers(train_labels_stack[:, 0:trainSize])
    # model.train(100, train_images_stack)  # Todo: the 10 vs 100 caused an error
    model.batch_train(512, train_images_stack[:, 0:trainSize], 100, trainSize)
    model.print_cost()

    model.update_num_Of_Input(testSize)
    model.test(test_images_stack[:, 0:testSize], test_labels_stack[:, 0:testSize])


    model.feedForward(test_images_stack)
    model.print_cost()

    """
Example no. 6
    def process(self):
        """Read the input file, parse its contents and wrap the parsed result.

        :return: ResultObj
        """
        with open(self.file_path, "r") as f:
            data = f.read()

        result = parse_data(data)
        return ResultObj(self.file, **result)
Example no. 7
def create_training_array(train_data):
    x_train_arr = []
    y_train_arr = []
    count = 0
    for value in dp.parse_data(train_data):
        if value[1] == 0.0:
            # Down-sample the zero-labelled class: keep only about 10% of these examples.
            if random.uniform(0, 1) > 0.9:
                x_train_arr.append(value[0])
                y_train_arr.append(value[1])
                count += 1
        else:
            x_train_arr.append(value[0])
            y_train_arr.append(value[1])
            count += 1
    x_train = x_train_arr
    y_train = y_train_arr

    return (x_train, y_train)
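The function returns plain Python lists. A hedged follow-up, assuming value[0] is a fixed-length numeric feature vector, converts them to NumPy arrays for use with most training APIs; "train.csv" is a placeholder path.

import numpy as np

x_train, y_train = create_training_array("train.csv")
X = np.asarray(x_train, dtype=np.float32)
y = np.asarray(y_train, dtype=np.float32)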
Example no. 8
import numpy as np
import matplotlib.pyplot as plt
from utils import *
import statistics

if __name__ == '__main__':
    file_name = 'dataset_1'
    sample_num = 200  # 100
    # k = 5
    min_k = 2
    max_k = 50
    k_step = 4  # 2
    # print('Working on %s, Randomly select %d samples, %d Centers' % (file_name, sample_num, k))
    print('Working on %s, Randomly select %d samples, k from %d to %d' %
          (file_name, sample_num, min_k, max_k))
    parsed_data = parse_data(file_name)
    print('Succeed in Parsing Data')
    tot_exp = 10  # 20
    criteria = [9, 8, 3]  # Gender, race, highest degree
    criteria_text = ['Gender', 'Race', 'Educational Status']

    #for alpha in [0.8, 1.0, 1.2]:
    for alpha in [1.0]:
        #for beta in [0.8, 1.0, 1.2]:
        #for beta in [0.8, 1.0]:
        for beta in [1.0]:
            k_values = range(min_k, max_k + 1, k_step)
            control_kcenter_avg, control_kcenter_betas_avg = [], []
            control_kcenter_grouped_betas_avg = []
            control_kmedian_avg, control_kmedian_betas_avg = [], []
Example no. 9
    option = parser.parse_args()
    file_name = "./data/" + option.file_name
    output_file = "./result/" + option.file_name + "_result.txt"
    rho_value = option.rho

    min_k = 2
    max_k = 10
    k_step = 1
    sample_num = option.sample_num
    center_num = option.center_num
    sample_type = "Random" if option.sample_type else "Full"  # Choose from Full, Random
    tot_exp = 1  # Number of Experiments ran for each k

    print('Working on %s, Randomly select %d samples, k from %d to %d' %
          (file_name, sample_num, min_k, max_k))
    parsed_data, kmeans_parsed_data = parse_data(file_name)
    dim = len(kmeans_parsed_data[0])

    # Assertion makes sure we get identical copy of data in two formats
    assert (len(kmeans_parsed_data[0]) == parsed_data[0].dim)
    assert (len(parsed_data) == len(kmeans_parsed_data))
    print('Succeed in Parsing Data')

    if sample_type == "Random":
        all_clients, reverse_map, _ = random_sample(parsed_data, sample_num)
        print('Succeed in Sampling %d Clients' % len(all_clients))
        all_centers, original_centers = kmeansinitialization(
            kmeans_parsed_data, center_num)
        print('Succeed in Sampling %d Centers' % len(all_centers))
    else:
        all_centers = parsed_data
Example no. 10
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import data_parser as dp
import database_connector as connector
import text_processing as text

select_reviews = "SELECT id, text FROM reviews WHERE positive IS NULL LIMIT 10000"
set_sentiment = "UPDATE reviews SET positive = %s WHERE id = %s"

if __name__ == "__main__":
    generator = dp.parse_data("train_reviews.json.gz")
    stop_words = nltk.corpus.stopwords.words('english')
    dbcursor = connector.get_cursor()

    # Setting output labels.
    reviews = list()
    for review in generator:
        overall = float(review.get("overall"))
        if overall > 3:
            review["label"] = 1
        else:
            review["label"] = 0
        reviews.append(review)

    # Splitting the data.
    train_data, test_data = train_test_split(reviews,
                                             train_size=0.8,
                                             shuffle=True)
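The excerpt stops right after the split. A hedged sketch of how the imported TfidfVectorizer and LogisticRegression could be wired up, continuing inside the __main__ block; the "reviewText" field name is an assumption about the review dicts, not confirmed by the excerpt.

    # Vectorise the review texts and fit a simple sentiment classifier.
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=20000)
    X_train = vectorizer.fit_transform(r.get("reviewText", "") for r in train_data)
    X_test = vectorizer.transform(r.get("reviewText", "") for r in test_data)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, [r["label"] for r in train_data])
    print("test accuracy: %.3f" % clf.score(X_test, [r["label"] for r in test_data]))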
Example no. 11
# Adding argument parser.
parser = argparse.ArgumentParser()
parser.add_argument('--data', '-d', help='add path to data file')

args = parser.parse_args()

# Global variables.
DATA_FNAME = args.data

if __name__ == '__main__':
    if DEBUG_MODE:
        data_dict = mock_data.DATA_DICT
    elif path.isfile('../data/retail_data_dict.pkl'):
        data_dict = pkl.load(open('../data/retail_data_dict.pkl', 'rb'))
    else:
        data_dict = parse_data(DATA_FNAME)
    print('Data dictionary formed...')

    print('storing data dict for retail.txt in pickled dictionary')
    pkl.dump(data_dict, open('../data/retail_data_dict.pkl', 'wb'))

    if DEBUG_MODE:
        kUI_idx = mock_data.KUI_IDX
        arc = mock_data.ARC
    else:
        kUI_idx = get_kui_index(data_dict)
    print('Done kUI Index and ARC...')

    num_slots = sum([len(k) for k in data_dict.keys()])
    type_slots = 24
    zipf = 0.7
Example no. 12
    user="******",
    password="******",
    database="recommender",
    auth_plugin="mysql_native_password"
)
dbcursor = db.cursor()

insert_book = "INSERT INTO books (asin, title, description) VALUES (%s, %s, %s)"
select_book = "SELECT id FROM books WHERE asin = "
insert_review = "INSERT INTO reviews (book_id, weight, text) VALUES (%s, %s, %s)"
delete_book = "DELETE FROM books WHERE id = "
delete_reviews = "DELETE FROM reviews WHERE book_id = "


if __name__ == "__main__":
    generator = dp.parse_data("meta_Books.json.gz")
    counter = 0
    train_counter = 0

    # Adding books to database.
    for book in generator:
        if counter < 103443:
            if "description" in book and "title" in book:
                desc = book.get("description")
                if len(desc.split()) > 100:
                    counter += 1

                    asin = book["asin"]
                    title = book["title"]

                    new_book = (asin, title, desc)
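The excerpt ends just after new_book is built; a hedged completion of the innermost branch, using only the cursor and statement defined above (shown dedented for readability).

dbcursor.execute(insert_book, new_book)  # runs inside the innermost if of the loop above
db.commit()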
Example no. 13
                        action='store_true')
    parser.add_argument('--plot_overhead',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--no_norm',
                        dest='norm',
                        action='store_false',
                        required=False,
                        default=True)
    opts = vars(parser.parse_args())

    script_dir = os.path.dirname(os.path.normpath(__file__))
    data_clean = None
    try:
        with open(os.path.join(script_dir, 'data.pickle'), 'rb') as file:
            data_clean = pickle.load(file)
    except Exception:
        # The cache is missing or unreadable; it will be rebuilt below.
        pass
    finally:
        if data_clean is None or opts['rebuild']:
            data_clean = data_parser.parse_data(rebuild=opts['rebuild'])
            with open(os.path.join(script_dir, 'data.pickle'), 'wb') as file:
                pickle.dump(data_clean, file)

    options = {
        k: opts[k]
        for k in opts if opts[k] is not None and k != 'rebuild'
    }
    plotter(data_clean, **options)