def train(file):
    y_data=makeRawDataset(file) # raw operator labels for each question
    X=pd.read_csv(file).Question
    X_without=removePunc(X) # questions with punctuation removed
    X_correct=fuzzy(X_without) # fuzzy correction applied
    X_enc=encode(X_correct)
    labels=[">","<","<=",">=","==","NULL","LIKE"]
    encoder=LabelEncoder()
    codes=encoder.fit_transform(labels)
    codeMap={labels[i]:codes[i] for i in range(len(labels))}
    inverseMap={codes[i]:labels[i] for i in range(len(labels))}
    maps={"codeMap":codeMap,"inverseMap":inverseMap}
    np.save("Map.npy",maps)
    y=[]
    for i in y_data:
        y.append(codeMap[i])
    X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size = 0.2, random_state = 42)
    model = XGBClassifier()
    model.fit(X_train,y_train)
    model.save_model("WhereCond.model")
    y_hat=model.predict(X_test)
    print(y_hat[:10])
    y_pred=[]
    for i in y_hat:
        y_pred.append(inverseMap[i])
    y_true=[]
    for i in y_test:
        y_true.append(inverseMap[i])
    sk_report = classification_report(y_true=y_true, y_pred=y_pred, digits=6)
    print(sk_report)
def test(x,modelFile):
    model = Booster() #init model
    model.load_model(modelFile) # load the trained model
    maps=np.load("Map.npy",allow_pickle=True)
    x_enc=encode([x])
    y_enc=model.predict(DMatrix(x_enc))
    y_pred=np.argmax(y_enc)
    inverseMap=maps.item().get("inverseMap")
    y_hat=inverseMap[y_pred]
    print(y_hat)
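
# A minimal usage sketch for the two functions above, assuming the CSV has a
# "Question" column and that makeRawDataset, removePunc, fuzzy and encode are
# available from the surrounding project; the file name and query are placeholders.
if __name__ == "__main__":
    train("questions.csv")                                       # writes WhereCond.model and Map.npy
    test("show products with price greater than 100", "WhereCond.model")  # prints the predicted operator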
Example #3
def main(args):

    data = json.load(open(args.input_refexps_json, 'r'))
    max_length = 0
    all_refexps = []
    for keys in data:
        for ref_id in data[keys]:
            all_refexps.append(data[keys][ref_id])

    for r in all_refexps:
        t = tokenize(
            r,
            punct_to_keep=[',', ';'],
            punct_to_remove=['?', '.']
        )
        if len(t) > max_length:
            max_length = len(t)

    refexp_token_to_idx = build_vocab(
        all_refexps,
        punct_to_keep=[',', ';'],
        punct_to_remove=['?', '.']
    )

    with open(args.output_vocab_json, 'w') as f:
        json.dump(refexp_token_to_idx, f)

    with h5py.File(args.output_refexps_h5df, 'w') as f:
        for keys in data:
            one_image_refexps = []
            # img_name = keys.split('.')[0]
            one_image_refexps_to_idx = []
            img_all_refexps = data[keys]

            for ref_id in img_all_refexps:
                refexp = img_all_refexps[ref_id]
                one_image_refexps.append(refexp)

            for refexps in one_image_refexps:
                tokens = tokenize(refexps, punct_to_remove=['?', '.'], punct_to_keep=[';', ','])
                refexps_idx = encode(tokens, refexp_token_to_idx)
                one_image_refexps_to_idx.append(refexps_idx)

            for refexp_ in one_image_refexps_to_idx:
                num_null = max_length - len(refexp_)
                if num_null > 0:
                    refexp_ += [refexp_token_to_idx['<NULL>']]*num_null

            one_image_refexps_to_idx_numpy = np.asarray(one_image_refexps_to_idx, dtype=np.int32)

            f.create_dataset(keys, data=one_image_refexps_to_idx_numpy)
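
# The tokenize, build_vocab and encode helpers used above are not shown in this
# listing. A minimal sketch of what such helpers typically look like (the special
# tokens and exact signatures here are assumptions inferred from how they are called):
def tokenize(s, delim=' ', punct_to_keep=None, punct_to_remove=None):
    # Split a string into tokens, keeping selected punctuation as separate tokens
    # and dropping the rest.
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    return s.split(delim)


def build_vocab(sequences, min_token_count=1, punct_to_keep=None, punct_to_remove=None):
    # Count tokens across all sequences and give an index to every frequent one.
    counts = {}
    for seq in sequences:
        for token in tokenize(seq, punct_to_keep=punct_to_keep,
                              punct_to_remove=punct_to_remove):
            counts[token] = counts.get(token, 0) + 1
    token_to_idx = {'<NULL>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3}
    for token, count in sorted(counts.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx


def encode(tokens, token_to_idx, allow_unk=False):
    # Map a token list to index values, optionally falling back to <UNK>.
    idxs = []
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('Token "%s" is not in the vocab' % token)
            token = '<UNK>'
        idxs.append(token_to_idx[token])
    return idxs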
Example #4
def predict():
    if request.method == "GET":
        return render_template("index.html")
    else:
        document = request.form["document"]
        document = document.strip()
        inp = []
        inp.append(preprocess.encode(document))
        inp = np.array(inp)
        y_pred = model.predict(inp)
        out = preprocess.decode(y_pred[-1])
        return render_template("index.html", message=out, document=document)
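
# Hedged sketch of the Flask wiring this view assumes (a global model and a
# project-local preprocess module); the route, file name and pickle format are
# illustrative placeholders, not taken from the source.
import pickle

import numpy as np
from flask import Flask, request, render_template

import preprocess                                  # project-local encode/decode helpers (assumed)

app = Flask(__name__)
with open("model.pkl", "rb") as f:                 # placeholder for however the model is loaded
    model = pickle.load(f)

# Register the predict() view defined above: GET renders the form, POST classifies.
app.add_url_rule("/", view_func=predict, methods=["GET", "POST"])

if __name__ == "__main__":
    app.run(debug=True)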
Example #5
def predict():
    # get the car details from the form
    car_details = list(request.form.values())
    # preprocess the car details
    car_details_processed = preprocess.encode(car_details)
    # apply scaler to the car details
    final_car_details = preprocess.scale(car_details_processed)
    # predict the car price 
    car_price = model.predict(final_car_details)
    # round the price value
    car_price = round(car_price[0])
    
    return render_template(
        'index.html',
        prediction_text=(
            f'The price of the {car_details[0].capitalize()} {car_details[1]} '
            f'{car_details[3]} model grade {car_details[5]} {car_details[4]}, '
            f'{car_details[6]} car with mileage value of {car_details[2]}km '
            f'should be {car_price:,} naira'
        )
    )
Example #6
		for ind in range(len(res)):
			if str(res[ind]) == 'nan':
				res[ind] = 0.025641


		result.write(str(index)+',')
		result.write(",".join(map(str,res)))
		result.write('\n')
	result.close()



dir = os.getcwd()

train = pd.read_csv(dir+'/train.csv',sep = ',',header = 0)
train = train.sample(n = 50000)
train = pre.preprocess(train)
train = pre.encode(train)

print "Preprocessing done"

test = pd.read_csv(dir+'/test.csv',sep = ',',header = 0)

test = pre.preprocess(test)
test = pre.encode(test,True)

input_cols= train.columns[range(2,3)+range(4,7)+range(8,21)]
print input_cols

train_inp = train[input_cols]
train_op = train['Category']
test = test[input_cols]
Example #7
        for ind in range(len(res)):
            if str(res[ind]) == 'nan':
                res[ind] = 0.025641

        result.write(str(index) + ',')
        result.write(",".join(map(str, res)))
        result.write('\n')
    result.close()


dir = os.getcwd()

train = pd.read_csv(dir + '/train.csv', sep=',', header=0)
train = train.sample(n=50000)
train = pre.preprocess(train)
train = pre.encode(train)

print "Preprocessing done"

test = pd.read_csv(dir + '/test.csv', sep=',', header=0)

test = pre.preprocess(test)
test = pre.encode(test, True)

input_cols = train.columns[range(2, 3) + range(4, 7) + range(8, 21)]
print input_cols

train_inp = train[input_cols]
train_op = train['Category']
test = test[input_cols]
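
# Note: this example (like the similar crime-classification ones in this listing)
# is Python 2 code: print statements and range objects concatenated with +.
# Under Python 3 the column selection would have to be written roughly as:
input_cols = train.columns[list(range(2, 3)) + list(range(4, 7)) + list(range(8, 21))]
print(input_cols)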
Example #8
                        df_airport_test]).reset_index(drop=True)
print('Data Imported')

# Calculates the taxi out time
df_airport = preprocess.calc_TO_time(df_airport)
# Adds distance between stand and runway
df_airport = merging.merge_distance(df_airport)
# Calculates traffic feature
df_airport = merging.merge_traffic(df_airport)
# Calculates Q (queue size)
df_airport = merging.get_Q(df_airport)
print('Feature engineering in process')
# Merges weather features
df_airport = merging.get_weather_data(df_airport)
# Calculates the last known taxi times for the aircraft
df_airport = preprocess.get_previous_taxi_times(df_airport, 3)
print('Feature engineering still in process')
# Calculates the last known taxi times by runway
df_airport = preprocess.get_previous_taxi_times_by_runway(df_airport, 3)
# Separates the datetime columns into their attributes
df_airport = preprocess.get_date_attributes(df_airport)
# Converts datetime variables into cyclic features
df_airport = preprocess.encode(df_airport, 'aobt_hour', 23)
df_airport = preprocess.encode(df_airport, 'aobt_month', 12)
df_airport = preprocess.encode(df_airport, 'aobt_day', 365)
# Merge Aircraft characteristic features
df_airport = merging.merge_tech(df_airport)

# Outputs the full dataset with all features to be used for modelling
df_airport.to_csv('df_preprocessed_2015-2019.csv', index=False)
print('File created')
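
# The three preprocess.encode(df, column, max_value) calls above map cyclic time
# attributes onto sin/cos pairs. A minimal sketch of such an encoder, assuming the
# _sin/_cos column naming (the project's actual implementation may differ):
import numpy as np

def encode(df, col, max_val):
    # Project a cyclic variable (hour, month, day of year) onto the unit circle,
    # so that e.g. hour 23 and hour 0 end up close together.
    df[col + '_sin'] = np.sin(2 * np.pi * df[col] / max_val)
    df[col + '_cos'] = np.cos(2 * np.pi * df[col] / max_val)
    return df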
		
Example #9
		result.write(str(index)+',')
		result.write(",".join(map(str,res)))
		result.write('\n')
	result.close()



train = pd.read_csv(dir+'/train.csv',sep = ',',header = 0)
train = train.sample(n = 450000)
validate = pd.read_csv(dir+'/train.csv',sep = ',',header = 0,nrows = 10000,skiprows = range(1,500000))

test = pd.read_csv(dir+'/test.csv',sep = ',',header = 0)

train = pre.preprocess(train)
train = pre.encode(train)
print "Training data   preprocessed"

validate = pre.preprocess(validate)
validate = pre.encode(validate)


test= pre.preprocess(test)
test = pre.encode(test,True)
print "Test data preprocessed"
input_cols= train.columns[range(2,3)+range(4,7)+range(8,21)]
print input_cols



train_inp = train[input_cols]
Example #10
                      na_values=missing_values)
predData = pd.read_csv("./Data/prediction-test/test.csv",
                       na_values=missing_values)

#import pdb; pdb.set_trace()

pd.set_option('display.max_rows', len(dataset))
pd.set_option('display.max_columns', None)

#Clean and encode variables
dataset = pp.cleanData(dataset)

predData = pp.cleanData(predData)
encoder = preprocessing.LabelEncoder()
#dataset = pp.removeOutliers(dataset)
dataset, predData = pp.encode(dataset, predData, encoder)

#SPLITTING
train, test = train_test_split(dataset, test_size=0.2, random_state=42)

train = pp.removeOutliers(train)

cols = [
    col for col in train.columns if col not in ['Instance', 'Income in EUR']
]
X_train = train[cols]

cols = [
    col for col in test.columns if col not in ['Instance', 'Income in EUR']
]
X_test = test[cols]
Example #11
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from preprocess import fillValues,encode,encode_test
import pickle as pkl

N_DATA = 8339

def diff(li1, li2): 
    return (list(set(li1) - set(li2)))

df = pd.read_csv('newdata.csv')

params_to_use = ['Binding', 'Brand', 'ListPrice', 'ProductGroup', 'location', 'govSchemeSize', 'companiesMoving', 'supplyDelhi', 'supplyMumbai', 'supplyKolkata', 'supplyChennai', 'demandDelhi', 'demandMumbai', 'demandChennai', 'demandKolkata', 'label']

data = df[params_to_use]
#print(data.info())

#fill Nan values
data = fillValues(data)

#encode strings
data = encode(data)

print(data.loc[8])

X = data[diff(params_to_use, ['label'])].to_numpy()  # .as_matrix() was removed in newer pandas
y = data['label'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=7)
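
# The listing stops after the split. Given the imports above (mean_squared_error,
# cross-validation utilities, pickle), a plausible continuation might look like the
# sketch below; the model choice and output file name are purely illustrative.
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(n_estimators=200, random_state=7)
reg.fit(X_train, y_train)

rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))
print('Test RMSE: %.3f' % rmse)

with open('model.pkl', 'wb') as f:          # placeholder output path
    pkl.dump(reg, f)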
Example #12
dir = os.getcwd()

'''
https://github.com/tqchen/xgboost/blob/master/demo/multiclass_classification/train.py
'''

categories = [
    'WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT', 'VANDALISM',
    'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS', 'BURGLARY',
    'SUSPICIOUS OCC', 'DRUNKENNESS', 'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC',
    'STOLEN PROPERTY', 'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
    'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE', 'SEX OFFENSES FORCIBLE',
    'PROSTITUTION', 'DISORDERLY CONDUCT', 'ARSON', 'FAMILY OFFENSES', 'LIQUOR LAWS',
    'BRIBERY', 'EMBEZZLEMENT', 'SUICIDE', 'LOITERING', 'SEX OFFENSES NON FORCIBLE',
    'EXTORTION', 'GAMBLING', 'BAD CHECKS', 'TREA', 'RECOVERED VEHICLE',
    'PORNOGRAPHY/OBSCENE MAT']

train = pd.read_csv(dir+'/train.csv',sep = ',',header = 0,nrows = 100000)
#train = train.sample(n = 450000)
validate = pd.read_csv(dir+'/train.csv',sep = ',',header = 0,nrows = 10000,skiprows = range(1,300000))

test = pd.read_csv(dir+'/test.csv',sep = ',',header = 0,nrows = 5000)

train = pre.preprocess(train)
train = pre.encode(train)
print "Training data   preprocessed"

validate = pre.preprocess(validate)
validate = pre.encode(validate)


test= pre.preprocess(test)
test = pre.encode(test,True)
print "Test data preprocessed"
input_cols= train.columns[range(2,3)+range(4,7)+range(8,21)]
print input_cols

train_inp = train[input_cols].values
train_op = train['Category'].values
validate_inp = validate[input_cols].values
Example #13
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print("Must give one of --input_vocab_json or --output_vocab_json")
        return

    print("Loading questions...")
    with open(args.input_questions, 'r') as f:
        questions = f.read()
    questions = questions.split("\n")
    questions = questions[:-1]

    print("Loading answers...")
    with open(args.input_answers, 'r') as f:
        answers = f.read()
    answers = answers.split("\n")
    answers = answers[:-1]

    answer_token_to_idx = None
    # Either create the vocab or load it from disk
    if args.input_vocab_json == "" or args.expand_vocab == 1:
        print("Building vocab...")

        # Convert the answer tokens to unique id
        answer_token_to_idx = build_vocab([answer for answer in answers])

        # convert the tokens in all questions to unique id
        question_token_to_idx = build_vocab(
            [question for question in questions],
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])

        answer_idx_to_token = {}
        question_idx_to_token = {}

        # create a reverse dictionary for answer idx to token mapping
        for key, value in answer_token_to_idx.items():
            answer_idx_to_token[value] = key

        # create a reverse dictionary for question idx to token mapping
        for key, value in question_token_to_idx.items():
            question_idx_to_token[value] = key

        # dump all the dictionaries as a single JSON file
        vocab = {
            "question_token_to_idx": question_token_to_idx,
            "answer_token_to_idx": answer_token_to_idx,
            "question_idx_to_token": question_idx_to_token,
            "answer_idx_to_token": answer_idx_to_token
        }

    if args.input_vocab_json != "":
        print("Loading vocab...")
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)

        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab["question_token_to_idx"]:
                if word not in vocab["question_token_to_idx"]:
                    print("Found new word %s" % word)
                    idx = len(vocab["question_token_to_idx"])
                    vocab["question_token_to_idx"][word] = idx
                    num_new_words += 1
            print("Found %d new words" % num_new_words)

    if args.output_vocab_json != "":
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    # This converts question strings to integers
    print("Encoding data")
    questions_encoded = []
    _answers = []

    for question, answer in zip(questions, answers):
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab["question_token_to_idx"],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        _answers.append(vocab["answer_token_to_idx"][answer])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab["question_token_to_idx"]["<NULL>"])

    # Create h5 dataset file
    print("Writing output")
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    print("Questions encoded shape is {}".format(questions_encoded.shape))
    print("Length of answer tokens is {}".format(len(vocab["answer_token_to_idx"])))

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset("questions", data=questions_encoded)

        if len(_answers) > 0:
            f.create_dataset("answers", data=np.asarray(_answers))
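
# Hedged sketch of reading the files written above back in; the dataset names follow
# the create_dataset calls in main, while the file names here are placeholders.
import json

import h5py
import numpy as np

with open('vocab.json', 'r') as f:
    vocab = json.load(f)
idx_to_token = vocab['question_idx_to_token']        # JSON keys are strings

with h5py.File('questions.h5', 'r') as f:
    questions = np.asarray(f['questions'])
    answers = np.asarray(f['answers']) if 'answers' in f else None

# Decode the first padded question back into tokens.
first = [idx_to_token[str(i)] for i in questions[0] if idx_to_token[str(i)] != '<NULL>']
print(' '.join(first))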
Example #14
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data...')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    '''Either create the vocab or load it from disk'''
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab...')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((str(q['answer'])
                                               for q in questions))
        question_token_to_idx = build_vocab(
            [q['question'] for q in questions],
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.']
        )

        all_program_strs = []
        for q in questions:
            if 'program' not in q.keys():
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab...')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)

        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    '''Encode all questions and programs'''
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []

    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question, punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][str(q['answer'])])

    '''Pad encoded questions and programs'''
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    '''Create h5 file'''
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print('Questions encoded shape is {}'.format(questions_encoded.shape))
    print('Programs encoded shape is {}'.format(programs_encoded.shape))

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
Example #15
dataInfo.general(X)
dataInfo.missing_value_per_column(X)
categorical, numerical = dataInfo.colType(X)

print("categorical columns: ",categorical)

print(X.sex.unique())              # nominal data --> one hot encode
print(X.smoker.unique())           # ordinal data --> ordinal encode
print(X.region.unique())           # nominal data --> one hot encode



X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

preprocessor = preprocess.encode()

trans=preprocessor.fit_transform(X_train) #####################################new dataframe
plot.regplot(X_train, y_train)
'''
###for plotting

print(trans)
df = pd.DataFrame(trans, # with the one-hot encoded columns...
                  columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])    ##recreate dataframe
plot.pairplot(df,y)
#####
'''

#linear Regression
###################################################################
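
# The listing breaks off at the "linear Regression" banner above, so the concrete
# model code is not shown. A minimal continuation sketch, assuming the fitted
# preprocessor follows the scikit-learn fit/transform API and reusing the
# train/validation split created earlier:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

reg = LinearRegression()
reg.fit(trans, y_train)                           # trans = preprocessor.fit_transform(X_train)

valid_trans = preprocessor.transform(X_valid)     # reuse the fitted encoder on the validation split
print("Validation MAE:", mean_absolute_error(y_valid, reg.predict(valid_trans)))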