import numpy as np

# os and sys are needed by the path handling below; they were not imported
# before their first use in this script.
import os
import sys

# Resolve the project root: three directory levels above this script.
this_file_path = os.path.realpath(__file__)  # this file's path
home_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file_path)))

# <home_dir>/lib must be on the module search path *before* the import on
# the next line, so the order of these two statements matters.
sys.path.insert(0, home_dir + "/lib")  # for importing functions
import session_parser as sp

# For testing: uncomment exactly one of these instead of the real path below.
#train_path = home_dir + '/data/train_head_10k'
#train_path = home_dir + '/data/train_head_million'
#train_path = home_dir + '/data/train_sample_10k'

# For real
train_path = home_dir + '/data/train'
# Object returned by the project parser; the loop below is expected to pull
# sessions from it one at a time -- TODO confirm against session_parser.
session_generator = sp.parse_from_file(train_path)

# Number of sessions processed so far (drives the progress message).
session_count = 0

# Skips greater than or equal to this value will be aggregated
limit = 3
# List of 10 integer vectors, each of length limit + 1 (indices 0..limit).
# Stores the sums for each position for each 'number of skips'; the 10 rows
# are presumably the result positions -- TODO confirm.
counts = [np.zeros(limit + 1, dtype=int) for i in range(10)]
# Same layout as `counts`; stores the corresponding lengths.
lengths = [np.zeros(limit + 1, dtype=int) for i in range(10)]

while True:
    try:
        # Print at every millionth session
        if session_count % (10**6) == 0:
            print "...reading the {0}th session".format(session_count)
        for key in dict:
            if key in findict:
                findict[key] += dict[key]
            else:
                findict[key] = dict[key]



# Resolve the project root: two directory levels above this script.
this_file_path = os.path.realpath(__file__) # this file's path
home_dir = os.path.dirname(os.path.dirname(this_file_path))

# <home_dir>/script must be on the module search path before the import on
# the next line, so the order of these two statements matters.
sys.path.insert(0, home_dir + "/script") # for importing functions
import session_parser as sp

# Input file for this run (a sample of the training data, judging by its name).
train_path = home_dir + '/data/train_sample'
# Object returned by the project parser; the loop below calls .next() on it,
# so it is expected to behave like a Python 2 generator.
session_generator = sp.parse_from_file(train_path)

# Number of sessions processed so far (drives the progress message).
session_count = 0

while True:
    try:
        # Print at every millionth session
        if session_count % (10 ** 6) == 0:
            print "...reading the {0}th session".format(session_count)

        # next() raises the StopIteration exception when hitting the end
        session = session_generator.next()

        queryParse(session.queries)

        session_count += 1
# Example no. 3
# 0
# Runs in ~122 seconds for the entire test data.

import os
import sys
import pandas as pd

# Resolve the project root: three directory levels above this script.
this_file_path = os.path.realpath(__file__)  # this file's path
home_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file_path)))

# <home_dir>/lib must be on the module search path before the import on
# the next line, so the order of these two statements matters.
sys.path.insert(0, home_dir + "/lib")  # for importing functions
import session_parser as sp

# test_path = home_dir + '/data/test_head' # a small header file for testing
# print "WARNING, this script is using a header file, not the real file."
test_path = home_dir + '/data/test'  # for real
# Object returned by the project parser for the test file; presumably a
# generator yielding one session at a time -- TODO confirm in session_parser.
session_generator = sp.parse_from_file(test_path)

# Read results for skipped and global versions
# Ignore the first 3 lines in CSV as they are comments.
# Column names are inferred from the first line after the skipped ones, and
# only the column named after each file (skipped_means / global_means) is kept.
skipped_means = pd.read_csv(home_dir + '/data/results/skipped_means.csv',\
    sep=",", skipinitialspace=True, header='infer', skiprows=3).skipped_means
global_means = pd.read_csv(home_dir + '/data/results/global_means.csv',\
    sep=",", skipinitialspace=True, header='infer', skiprows=3).global_means

# File for writing our predictions
# Strategy 2 - Goal 1, with a bug.  We fixed a bug, so we should try running this
#  with the fixed algorithm again.
# Opened in 'w' mode, so an existing file at this path is overwritten.
# NOTE(review): no close() is visible in this part of the script -- confirm the
# handle is closed once the predictions have been written.
results = open(home_dir + '/data/prediction/s2_goal1_with_bug', 'w')
# CSV header line of the prediction file.
results.write("SessionID,URLID\n")

# Number of sessions processed so far.
session_count = 0
# Runs in ~122 seconds for the entire test data.

import os
import sys
import pandas as pd

# Resolve the project root: three directory levels above this script.
this_file_path = os.path.realpath(__file__) # this file's path
home_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file_path)))

# <home_dir>/lib must be on the module search path before the import on
# the next line, so the order of these two statements matters.
sys.path.insert(0, home_dir + "/lib") # for importing functions
import session_parser as sp

# test_path = home_dir + '/data/test_head' # a small header file for testing
# print "WARNING, this script is using a header file, not the real file."
test_path = home_dir + '/data/test' # for real
# Object returned by the project parser for the test file; presumably a
# generator yielding one session at a time -- TODO confirm in session_parser.
session_generator = sp.parse_from_file(test_path)

# Read results for skipped and global versions
# Ignore the first 3 lines in CSV as they are comments.
# Column names are inferred from the first line after the skipped ones, and
# only the column named after each file (skipped_means / global_means) is kept.
skipped_means = pd.read_csv(home_dir + '/data/results/skipped_means.csv',\
    sep=",", skipinitialspace=True, header='infer', skiprows=3).skipped_means
global_means = pd.read_csv(home_dir + '/data/results/global_means.csv',\
    sep=",", skipinitialspace=True, header='infer', skiprows=3).global_means

# File for writing our predictions
# Strategy 2 - Goal 1, with a bug.  We fixed a bug, so we should try running this
#  with the fixed algorithm again.
# Opened in 'w' mode, so an existing file at this path is overwritten.
# NOTE(review): no close() is visible in this part of the script -- confirm the
# handle is closed once the predictions have been written.
results = open(home_dir + '/data/prediction/s2_goal1_with_bug','w')
# CSV header line of the prediction file.
results.write("SessionID,URLID\n")

# Number of sessions processed so far.
session_count = 0