Example #1
def make_model(filename, verbose=False):
	x = load_obj('x-50000-samples-14:00')
	y = load_obj('y-50000-samples-14:00')

	no_samples = x.shape[0]

	mask = np.arange(0, no_samples)
	np.random.shuffle(mask)

	split_percent = 0.75
	split_ind = int(no_samples * split_percent)

	x_train = x[mask[0:split_ind]]
	y_train = y[mask[0:split_ind]]

	x_test = x[mask[split_ind:]]
	y_test = y[mask[split_ind:]]

	model = RegressionModel(no_hidden=300)
	model.fit(x_train, y_train)

	joblib.dump(model, filename)

	if verbose:
		print('Dumped model to disk: {}'.format(filename))
		print('Training score: {}'.format(model.score(x_train, y_train)))
		print('Testing score: {}'.format(model.score(x_test, y_test)))
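Every snippet on this page leans on a load_obj/save_obj helper pair that is never shown. A minimal sketch, assuming plain pickle wrappers keyed by object name (the obj/ directory and .pkl suffix are assumptions; Examples #3, #6 and #15 instead call a Wavefront-OBJ mesh loader that happens to share the name):

import os
import pickle

OBJ_DIR = 'obj'  # assumed storage directory

def save_obj(obj, name):
    # serialize an arbitrary Python object to obj/<name>.pkl
    with open(os.path.join(OBJ_DIR, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    # inverse of save_obj: unpickle obj/<name>.pkl
    with open(os.path.join(OBJ_DIR, name + '.pkl'), 'rb') as f:
        return pickle.load(f)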
Example #2
def plot():
    x = load_obj('x-50000-samples-14:00')[0:1000, 0:11]
    y = load_obj('y-50000-samples-14:00')[0:1000, None]

    # Plot pairwise correlations between the features and the target.
    names = [
        'queue', 'doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7',
        'doc8', 'doc9', 'doc10', 'time'
    ]

    fm = np.concatenate((x, y), axis=1)

    data = pandas.DataFrame(fm, columns=names)
    scatter_matrix(data)
    plt.show()
Example #3
def process_sequence(obj):
    # Get the list of files for this object category in AtlasNet.
    pth_files_an = jn(pth_root_an, obj, 'ply')
    files = ls(pth_files_an, exts='txt')

    # Process files.
    for f in files:
        # Extract file name.
        fn_base = f.split('.')[0]

        # Load .obj mesh from ShapeNet.
        pth_f_sn = jn(
            pth_root_sn, obj, fn_base, 'models', 'model_normalized.obj')
        assert os.path.exists(pth_f_sn)
        verts, faces = load_obj(pth_f_sn)

        # Load tf and apply.
        pth_f_an = jn(pth_files_an, f)
        T, s = load_tf(pth_f_an)
        verts = (verts - T) / s

        # Compute area.
        area = mesh_area(verts, faces)

        # Write area to the file.
        with open(pth_f_an, 'r') as fobj:
            txt = fobj.read()
            assert len(txt.splitlines()) == 2
            has_nl = txt.endswith('\n')

        # Append the area on a new line; prepend '\n' only if the file
        # does not already end with one.
        with open(pth_f_an, 'a') as fobj:
            fobj.write('{}{:.6f}'.format(('\n', '')[has_nl], area))

        with num_samples_done.get_lock():
            num_samples_done.value += 1

    with finished_seqs.get_lock():
        finished_seqs.value += 1
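mesh_area is not shown above; a minimal sketch, assuming verts is an (N, 3) float array and faces an (M, 3) integer index array (the usual cross-product formula for triangle-mesh surface area):

import numpy as np

def mesh_area(verts, faces):
    # Each triangle contributes half the norm of the cross product
    # of two of its edge vectors; sum over all faces.
    tris = verts[faces]               # (M, 3, 3): corner coords per face
    e1 = tris[:, 1] - tris[:, 0]      # first edge vector
    e2 = tris[:, 2] - tris[:, 0]      # second edge vector
    return 0.5 * np.linalg.norm(np.cross(e1, e2), axis=1).sum()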
Example #4
from matplotlib import image  # routines for displaying orca image
from scipy import ndimage
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

os.chdir("/home/val/Documents/NFWF_Files/2020_Analysis/")
print("Working directnp.arctanory is ", os.getcwd())

import WhaleBoatObj  # NOTA BENE: these Python files have to be in the same directory as this file
import whalePlot
import helpers

import globalParameters as gp  ##  gp stands for Global Parameters

########################################################################################
anonBoatsDict = helpers.load_obj("anonimizer")
boatsDict = helpers.load_obj("boats")
codeCountDict = helpers.load_obj("counts")
activityCodeDict = helpers.load_obj("activityCode")
jascoCodesDict = helpers.load_obj("jascoCodes")
echoSL_Dict = helpers.load_obj("echoSL")

#anonBoatsDict['CSMINF_168']              #  here NFWF's id 'pow' is anonymized as CSMINF_168
#     where 'pow' is a Commercial Small Inflatable with JASCO code JRHIB
#('pow_CSMINF', 'Commercial Small Inflatable', 'JRHIB', 'Prince of Whales')
#given the anonymized name, the rest of this vehicle's details can be found via boatsDict
#
#boatsDict['CSMINF']  # pull the vessel code off of the numbered code and use it to get the rest of the boat info
# ('Commercial Small Inflatable', 'JRHIB')

#boatType = boatsDict[boatID.split('_')[0]][1]
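A toy illustration of the lookup chain those comments describe; the dictionary entries here are made up, only the access pattern is taken from the comments above:

# hypothetical entries mirroring the structure described above
anonBoatsDict = {'CSMINF_168': ('pow_CSMINF', 'Commercial Small Inflatable',
                                'JRHIB', 'Prince of Whales')}
boatsDict = {'CSMINF': ('Commercial Small Inflatable', 'JRHIB')}

boatID = 'CSMINF_168'
# strip the trailing counter to recover the vessel code, then look up its JASCO code
boatType = boatsDict[boatID.split('_')[0]][1]
print(boatType)  # -> 'JRHIB'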
Example #5
from helpers import load_obj, save_obj
from cross_validation import cross_val

folders = ["1 day"] + ["{} days".format(x) for x in [7, 14, 30, 90, 180, 365]]
classifiers = ["DT", "RF", "LR", "kNN", "FFT-Dist2Heaven"]
# classifiers = ["FFT-Dist2Heaven"]
target = "timeOpen"

cwd = os.getcwd()
data_path = os.path.join(cwd, "data", "issue_close_time")
details_path = os.path.join(data_path,
                            'issue_close_time_details_5x10_mdlp_365.pkl')
if os.path.exists(details_path):
    performances = load_obj(details_path)
else:
    performances = {}

# for folder in folders:
folder = folders[6]
if folder not in performances:
    performances[folder] = collections.defaultdict(dict)
    folder_path = os.path.join(data_path, folder)
    for file in os.listdir(folder_path):
        if file.endswith(".csv"):
            print(file + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)
            df[target] = df[target].apply(lambda x: 1 if x else 0)
            for i, clf in enumerate(classifiers):
Example #6
                            img[y, x] = shade
            #polys.append((np.mean(xyz[:, 2]), xyz))
        """
    polys = sorted(polys, reverse=True, key=lambda x: x[0])
    for _, xyz in polys:
      rr, cc = polygon(xyz[:, 1], xyz[:, 0])

      rr[np.logical_or(rr < 0, rr >= Y)] = 0
      cc[np.logical_or(cc < 0, cc >= X)] = 0
      img[rr, cc] = random.uniform(0.3, 1.0)
    """
    return img
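The triple-quoted block above is a disabled painter's-algorithm fill. A standalone sketch of that idea, assuming scikit-image's polygon rasterizer; unlike the disabled block, it clips out-of-bounds pixels via the shape argument instead of clamping their indices to 0:

import random
import numpy as np
from skimage.draw import polygon

def render_polys(polys, Y=200, X=200):
    # Painter's algorithm: rasterize triangles from farthest to nearest
    # so nearer surfaces overwrite farther ones.
    img = np.zeros((Y, X))
    for _, xyz in sorted(polys, reverse=True, key=lambda p: p[0]):
        rr, cc = polygon(xyz[:, 1], xyz[:, 0], shape=img.shape)
        img[rr, cc] = random.uniform(0.3, 1.0)
    return img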


# *** main rendering script
tris = load_obj("objs/cube.obj")
#tris = load_obj("objs/teapot.obj")

# add ground plane
tris.append(([-3, -0.5, -3], [3, -0.5, -3], [3, -0.5, 3]))
tris.append(([-3, -0.5, -3], [-3, -0.5, 3], [3, -0.5, 3]))

SCALE = 1 / 10.0
LSCALE = 1 / 100.0

origin = -10 * K + I + 2 * J
#origin = -500*K + I + J
look = K

import pygame
pygame.init()
Example #7
##########################
#####     INIT        ####
##########################

### Command line arguments
args = vars(parser.parse_args())
print(args)
results_path = args['results_path']
substring = args['substring']
del_substring = args['del_substring']
out_name = args['out_name']

### Get the result files
files = [join(results_path, f) for f in listdir(results_path) \
        if isfile(join(results_path, f)) and (substring in f) and (del_substring not in f)]
print(files)

with open(out_name,"w") as outfile:
    for filename in files:
        result = load_obj(filename)
        line = []
        line.append(filename)
        line.append('%.02f'%result['train_accuracy'])
        line.append('%.02f'%result['test_accuracy'])
        f1=f1_score(result['test_true_classes'], result['test_pred_classes'], average='weighted')
        line.append('%.02f'%f1)
        # join the columns with the LaTeX separator instead of writing
        # a trailing " & " after the last one
        outfile.write(" & ".join(line) + "\n")
Example #8
def get_submission(X_train,
                   X_valid,
                   y_train,
                   y_valid,
                   X_test,
                   train_params={},
                   eval_metric='auc',
                   save=False,
                   load=False,
                   mdl_name='xgb_class'):

    start_time = time.time()
    end_time = start_time
    if load:
        classifier = load_obj(mdl_name)
    else:
        classifier = XGBClassifier(**train_params)
        classifier.fit(X_train.values,
                       y_train.values.ravel(),
                       eval_metric=eval_metric)
        end_time = time.time()

        if save:
            save_obj(classifier, mdl_name)
            print('model saved')

    train_pred = classifier.predict(X_train.values)
    valid_pred = classifier.predict(X_valid.values)
    test_pred = classifier.predict(X_test.values)

    fpr, tpr, _ = roc_curve(y_train.values, train_pred, pos_label=1)
    train_loss = auc(fpr, tpr)

    fpr, tpr, _ = roc_curve(y_valid.values, valid_pred, pos_label=1)
    valid_loss = auc(fpr, tpr)

    feature_importances = classifier.feature_importances_

    feature_names = X_train.columns.values
    sorted_idx = np.argsort(feature_importances * -1)  # descending order

    summary = '====== XGBClassifier Training Summary ======\n'
    for idx in sorted_idx:
        summary += '[{:<25s}] | {:<10.4f}\n'.format(feature_names[idx],
                                                    feature_importances[idx])
    summary += '>>> training_time={:10.2f}min\n'.format(
        (end_time - start_time) / 60)
    summary += '>>> Final AUC: {:10.4f}(Training), {:10.4f}(Validation)\n'.format(
        train_loss, valid_loss)

    # Generate submission
    submission = pd.DataFrame(data=test_pred,
                              index=X_test.index,
                              columns=['Next_Premium'])

    submission_train = pd.DataFrame(data=train_pred,
                                    index=X_train.index,
                                    columns=['Next_Premium'])

    submission_valid = pd.DataFrame(data=valid_pred,
                                    index=X_valid.index,
                                    columns=['Next_Premium'])

    return {
        'model': classifier,
        'submission': submission,
        'submission_train': submission_train,
        'submission_valid': submission_valid,
        'valid_loss': valid_loss,
        'summary': summary
    }
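A hedged usage sketch; the parameter values are made up, and only the call signature and returned keys come from the function above:

params = {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}  # hypothetical
result = get_submission(X_train, X_valid, y_train, y_valid, X_test,
                        train_params=params, eval_metric='auc',
                        save=True, mdl_name='xgb_class')
print(result['summary'])
result['submission'].to_csv('submission.csv')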
Example #9
def get_submission(X_train,
                   y_train,
                   X_valid,
                   y_valid,
                   X_test,
                   params,
                   save=False,
                   load=False,
                   mdl_name='catb'):
    categorical_features_indices = np.where(X_train.dtypes != np.float64)[0]
    X_train.fillna(-999, inplace=True)
    X_valid.fillna(-999, inplace=True)
    X_test.fillna(-999, inplace=True)

    PATH = './saved_models'
    os.makedirs(PATH, exist_ok=True)

    start_time = time.time()
    end_time = start_time
    if load:
        regressor = load_obj(mdl_name)
    else:
        regressor = CatBoostRegressor(**params)

        regressor.fit(X_train,
                      y_train,
                      cat_features=categorical_features_indices,
                      eval_set=(X_valid, y_valid),
                      plot=False,
                      early_stopping_rounds=None)
        end_time = time.time()

        if save:
            save_obj(regressor, mdl_name)

    train_pred = regressor.predict(X_train.values)
    valid_pred = regressor.predict(X_valid.values)
    test_pred = regressor.predict(X_test.values)

    train_loss = mean_absolute_error(y_train.values, train_pred)
    valid_loss = mean_absolute_error(y_valid.values, valid_pred)

    feature_importances = np.array(regressor.feature_importances_)

    feature_names = X_train.columns.values
    sorted_idx = np.argsort(feature_importances * -1)  # descending order

    summary = '====== CatBoost Training Summary ======\n'
    for idx in sorted_idx:
        summary += '[{:<25s}] | {:<10.4f}\n'.format(feature_names[idx],
                                                    feature_importances[idx])
    summary += '>>> training_time={:10.2f}min\n'.format(
        (end_time - start_time) / 60)
    summary += '>>> Final MAE: {:10.4f}(Training), {:10.4f}(Validation)\n'.format(
        train_loss, valid_loss)

    # Generate submission
    submission = pd.DataFrame(data=test_pred,
                              index=X_test.index,
                              columns=['Next_Premium'])

    submission_train = pd.DataFrame(data=train_pred,
                                    index=X_train.index,
                                    columns=['Next_Premium'])

    submission_valid = pd.DataFrame(data=valid_pred,
                                    index=X_valid.index,
                                    columns=['Next_Premium'])

    return {
        'model': regressor,
        'submission': submission,
        'submission_train': submission_train,
        'submission_valid': submission_valid,
        'valid_loss': valid_loss,
        'summary': summary
    }
Example #10
        "@xerces": ["xerces-1.2.csv", "xerces-1.3.csv", "xerces-1.4.csv"]
        }

rank_csv = os.path.join(data_path, 'top_changes.csv')
feature_rankings = pd.read_csv(rank_csv, index_col=0)

criterias = ["Accuracy", "Dist2Heaven", "LOC_AUC"]  # "Gini", "InfoGain"]

for percent in [25, 50, 75, 100]:
    p_opt_stat = []
    cnts = [collections.defaultdict(int) for _ in range(len(criterias))]
    print(str(percent) + ' percent of features selected')
    f_cnt = int(percent / 100.0 * 20)
    all_data_filepath = os.path.join(
        data_path, "_reduced_" + str(percent) + "_Data_16.pkl")
    all_data = load_obj(all_data_filepath) if os.path.exists(
        all_data_filepath) else {}
    for name, files in data.items():
        if name not in all_data:
            print('\n' + name)
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            f_rankings = feature_rankings.loc[feature_rankings["Name"] ==
                                              name[1:]].values[0]
            f_selected = [
                t for t in list(f_rankings[1:f_cnt]) + ['bug'] if t != "name.1"
            ]
            print "selected features are: " + ", ".join(f_selected)
            paths = [os.path.join(data_path, file_name) for file_name in files]
            train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]],
                                 ignore_index=True)
            train_df = train_df[f_selected]
Example #11
import Learners
import os
from helpers import load_obj, save_obj

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

print(
    "================================== Loading models =================================="
)
translator = Translator()
word2vecTrainer = Learners.FastTextTrainer()
br_model = word2vecTrainer.load_google_model(
    "/home/mattyws/Downloads/Wikipedia/wiki.pt/wiki.pt")
word_freq = load_obj('word_count')

print(
    "================================== Creating pairs =================================="
)
i = 0
not_in_vocab = set()
infer_words = set()
while i < len(word_freq):
    word = word_freq[i][0]
    try:
        if word not in br_model.wv.vocab:
            not_in_vocab.add(word)
        if word in br_model and word not in br_model.wv.vocab:
            infer_words.add(word)
        i += 1
    except KeyError:
        # assumed handler: skip words the model rejects and keep scanning
        i += 1
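The two membership tests above separate words trained directly from words whose vectors fastText can still compose from character n-grams. A minimal sketch of that distinction, assuming the pre-4.0 gensim API the snippet itself uses:

def classify_word(model, word):
    # 'trained': the word has its own vector in the model's vocabulary
    # 'inferable': out of vocabulary, but a vector can be built from n-grams
    # 'unknown': the model cannot produce a vector at all
    if word in model.wv.vocab:
        return 'trained'
    try:
        _ = model.wv[word]  # fastText composes a vector from char n-grams
        return 'inferable'
    except KeyError:
        return 'unknown'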
os.chdir("/home/val/Documents/NFWF_Files/2020_Analysis/")
print("Working directory is ",os.getcwd())

import WhaleBoatObj   # NOTA BENE: these Python files have to be in the same directory as this file
import whalePlot
import helpers
import globalParameters as gp

#################################################################################

allPassbys = helpers.load_obj("tracksModel_2003_2005")  # this is list of passbys where each passby is one whale object 
                                          #      and objects for accompanying boats



def plotRangeToWhale(Ipassby):
  whale = allPassbys[Ipassby][0]
  print(whale)
  if whale.nBoats == 0:
    return
  boats = allPassbys[Ipassby][1]
  rWhale = []
  N100 = N400 = N1000 = N5000 = 0  
  for boat in boats:
    print(boat)
    print("len(rWhale)",len(boat.rWhale))
"""
    Sept 15, 2020
    Unpacked pickled intermediate data files and save as csv for public access

"""

import os.path
from typing import List

import helpers

os.chdir("/home/val/Documents/NFWF_Files/2020_Analysis/")
print("Working directory is ", os.getcwd())

allPassbys = helpers.load_obj("tracksModel_RLs_2003_2005")
# each passby in allPassbys is a (whale, boats) pair: whale = allPassbys[i][0], boats = allPassbys[i][1]


def buildRangeToWhale(Ipassby):
    whale = allPassbys[Ipassby][0]
    print(whale)
    if whale.nBoats == 0:
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    boats = allPassbys[Ipassby][1]

    rWhale = []
    N100 = N400 = N1000 = N5000 = 0
    for boat in boats:
        #        print(boat)
        #        print("len(rWhale)", len(boat.rWhale))
        for R in boat.rWhale:
Example #14
      b.tauMod.append(0)      
      b.vxMod.append(0)
      b.vyMod.append(0)
      b.vMod.append(0)
      b.aMod.append(0)   
    dx = (b.xMod[i] - w.xMod[i])
    dy = (b.yMod[i] - w.yMod[i])
    R = np.sqrt(dx**2 + dy**2)
    # bearing in degrees, measured from the +y axis toward +x
    theta = 180 * math.atan2(dx, dy) / np.pi
    b.rWhale.append(R)
    b.bearingWhale.append(theta)

##  print("leaving predictBpositions")
######################################  Executable code starts here
    
tracksList = helpers.load_obj("tracksList_2003_2005")

for track in tracksList:
  whale = track[0]
  print(whale)
  predictWpositions(whale) 
  boats = track[1]
  for boat in boats:
    print("trackIDs", whale.trackID ,boat.trackID, "predicting boat",boat.boatID, "# boat obs",boat.Nobs)
    predictBpositions(whale, boat)  
    print("check boat xMod len=",len(boat.xMod))
    
  # whalePlot.plotPassby(whale, boats, 2400, False, False)#  Don't plot RLs and no DEBUG
  # whalePlot.plotPassby(whale, boats, 1200, False, False)
  # whalePlot.plotPassby(whale, boats, 600, False, False)
  # whalePlot.plotPassby(whale, boats, 300, False, False)  
Example #15
              img[y,x] = shade
      #polys.append((np.mean(xyz[:, 2]), xyz))
     
    """
    polys = sorted(polys, reverse=True, key=lambda x: x[0])
    for _, xyz in polys:
      rr, cc = polygon(xyz[:, 1], xyz[:, 0])

      rr[np.logical_or(rr < 0, rr >= Y)] = 0
      cc[np.logical_or(cc < 0, cc >= X)] = 0
      img[rr, cc] = random.uniform(0.3, 1.0)
    """
  return img

# *** main rendering script
tris = load_obj("objs/cube.obj")
#tris = load_obj("objs/teapot.obj")

# add ground plane
tris.append(([-3, -0.5, -3], [3, -0.5, -3], [3, -0.5, 3]))
tris.append(([-3, -0.5, -3], [-3, -0.5, 3], [3, -0.5, 3]))

SCALE = 1/10.0
LSCALE = 1/100.0

origin = -10*K + I + 2*J
#origin = -500*K + I + J
look = K

import pygame
pygame.init()
Example #16
def get_submission(X_train,
                   X_valid,
                   y_train,
                   y_valid,
                   X_test,
                   train_params={},
                   save=False,
                   load=False,
                   mdl_name='xgb'):

    PATH = './saved_model'
    os.makedirs(PATH, exist_ok=True)

    start_time = time.time()
    end_time = start_time
    if load:
        regressor = load_obj(mdl_name)
    else:
        regressor = xgb.XGBRegressor(**train_params)
        regressor.fit(X_train.values, y_train.values, eval_metric='mae')
        end_time = time.time()

        if save:
            save_obj(regressor, mdl_name)

    train_pred = regressor.predict(X_train.values)
    valid_pred = regressor.predict(X_valid.values)
    test_pred = regressor.predict(X_test.values)

    train_loss = mean_absolute_error(y_train.values, train_pred)
    valid_loss = mean_absolute_error(y_valid.values, valid_pred)

    feature_importances = regressor.feature_importances_

    feature_names = X_train.columns.values
    sorted_idx = np.argsort(feature_importances * -1)  # descending order

    summary = '====== XGBoost Training Summary ======\n'
    for idx in sorted_idx:
        summary += '[{:<25s}] | {:<10.4f}\n'.format(feature_names[idx],
                                                    feature_importances[idx])
    summary += '>>> training_time={:10.2f}min\n'.format(
        (end_time - start_time) / 60)
    summary += '>>> Final MAE: {:10.4f}(Training), {:10.4f}(Validation)\n'.format(
        train_loss, valid_loss)

    # Generate submission
    submission = pd.DataFrame(data=test_pred,
                              index=X_test.index,
                              columns=['Next_Premium'])

    submission_train = pd.DataFrame(data=train_pred,
                                    index=X_train.index,
                                    columns=['Next_Premium'])

    submission_valid = pd.DataFrame(data=valid_pred,
                                    index=X_valid.index,
                                    columns=['Next_Premium'])

    return {
        'model': regressor,
        'submission': submission,
        'submission_train': submission_train,
        'submission_valid': submission_valid,
        'valid_loss': valid_loss,
        'summary': summary
    }
Example #17
import os
from helpers import load_obj, save_obj

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

print(
    "================================== Loading models =================================="
)
translator = Translator()
word2vecTrainer = Learners.FastTextTrainer()
br_model = word2vecTrainer.load_google_model(
    "/home/mattyws/Downloads/Wikipedia/br/wiki.pt/wiki.pt")
en_model = word2vecTrainer.load_google_model(
    "/home/mattyws/Downloads/Wikipedia/wiki.en/wiki.en")
word_freq = load_obj('word_count')

if os.path.exists(
        '/home/mattyws/Downloads/Wikipedia/br/word_pairs_fasttext_inference.pkl'
):
    print(
        "================================== Loading pairs =================================="
    )
    word_pairs = load_obj('word_pairs_fasttext_inference')
else:
    print(
        "================================== Creating pairs =================================="
    )
    word_pairs = []
    i = 0
    while len(word_pairs) < 5000 and i < len(word_freq):
Example #18
#########################################################################################  Program execution starts here
#######################################################################################
boatsJdays = []
# boatsJdays is a 1-D array with the Julian time for each line in the boat file
allBoatLines = loadAllBoats(boatsJdays)
logFile = open(parserLogFileName, 'w')
if BUILD_DICTs:
    buildDictionaries(allBoatLines)
    helpers.save_obj(anonBoatsDict, "anonimizer")
    helpers.save_obj(boatsDict, "boats")
    helpers.save_obj(codeCountDict, "counts")
    helpers.save_obj(activityCodeDict, "activityCode")

else:
    anonBoatsDict = helpers.load_obj("anonimizer")
    boatsDict = helpers.load_obj("boats")
    codeCountDict = helpers.load_obj("counts")
    activityCodeDict = helpers.load_obj("activityCode")

passbyLinesLists = ['init']
passbyCnt = 0
whalePassbyList = []
boatsPassbyList = []
tracksList = []
gapList = []  # list of gaps identified between passby (in minutes)
lineCnt = 0
while len(passbyLinesLists) > 0:

    passbyLinesLists = scanForNextTimeGap(
        gp.maxObsGapMins, gapList