Example #1
def main():
    # The file contains information about the features
    # Format -> Feature name:Values it can take (separated by commas)
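    # A hypothetical line in that format (illustrative, not from the dataset):
    #   Outlook:Sunny,Overcast,Rain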
    with open("feature.info") as f:
        data_info = f.readlines()
    # Create feature nodes
    features = feature_info(data_info)

    # Cross Validation
    print("Running Cross Validation For depths...")
    for depth in depths:
        cross_validation(depth, features)

    # Transform the data
    with open(f_train) as f:
        data = [line.rstrip() for line in f]
    data_train = featurization.featurize(data)

    with open(f_test) as f:
        tdata = [line.rstrip() for line in f]
    data_test = featurization.featurize(tdata)

    tree = build_tree(data_train, features, -1)
    print("Accuracy on Train ", test(tree, data_train, False))
    print("Accuracy on Test ", test(tree, data_test, False))

    with open(f_eval) as f:
        tdata = [line.rstrip() for line in f]
    eval_data = featurization.featurize(tdata)
    test(tree, eval_data, True)
Example #2
def cross_validation(depth, features):
    with open(f_train) as f:
        examples = f.readlines()
    total = len(examples)
    fifth = total // 5
    cv1 = examples[:fifth]
    cv2 = examples[fifth:2 * fifth]
    cv3 = examples[2 * fifth:3 * fifth]
    cv4 = examples[3 * fifth:4 * fifth]
    cv5 = examples[4 * fifth:]
    CROSS_VALIDATION_FILES = [cv1, cv2, cv3, cv4, cv5]
    # Hold out each fold in turn and average the accuracies
    accs = []
    for i in range(5):
        test_examples = CROSS_VALIDATION_FILES[i]
        train_examples = []
        for j, fold in enumerate(CROSS_VALIDATION_FILES):
            if j != i:
                train_examples.extend(fold)

        train_data = featurization.featurize(train_examples)
        test_data = featurization.featurize(test_examples)
        tree = build_tree(train_data, features, depth)
        accs.append(test(tree, test_data, False))
    print("Depth ", depth, "Avg. Accuracy ", np.mean(accs))
Example #3
def cross_validation(depth, features):
    accs = []
    for i in range(4):
        indices = [j for j in range(4) if j != i]
        # Fold files are named by inserting the fold number before the
        # extension (assumes f_cross contains a single '.'),
        # e.g. "data.txt" -> "data00.txt" ... "data03.txt"
        f_test = f_cross.replace(".", '0' + str(i) + '.')
        data_t = featurization.featurize(f_test)
        data = []
        for index in indices:
            data += featurization.featurize(
                f_cross.replace(".", '0' + str(index) + '.'))
        accs.append(test(data, data_t, features, depth))
    print("Depth ", depth, "Avg. Accuracy ", np.mean(accs), "Std. Deviation ",
          np.std(accs))
Example #4
def main():

    # The file contains information about the features
    # Format -> Feature name:Values it can take (separated by commas)
    with open("info.txt") as f:
        data_info = f.readlines()

    # Transform the data
    data_train = featurization.featurize(f_train)
    data_test = featurization.featurize(f_test)

    # Create feature nodes
    features = feature_info(data_info)

    print("Accuracy on Train ", test(data_train, data_train, features, -1))
    print("Accuracy on Test ", test(data_train, data_test, features, -1))

    for depth in depths:
        cross_validation(depth, features)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
        default=None,
        type=str,
        required=True,
        help="Path to dataframe",
    )
    parser.add_argument(
        "--glove_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained glove embeddings",
    )
    parser.add_argument(
        "--word_dir",
        default=None,
        type=str,
        required=True,
        help="Path to directory containing all special hand-crafted features",
    )
    parser.add_argument(
        "--featurization_type",
        default=0,
        type=int,
        required=False,
        help="Type of featurization 0-->Average Glove, 1-->TF-IDF Glove, 2-->USE",
    )
    parser.add_argument(
        "--run_implementation",
        default=0,
        type=int,
        required=True,
        help="run implementation number? (1/2/3)",
    )
    
    args = parser.parse_args()
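    # Example invocation (script name and paths are hypothetical):
    #   python run_models.py --dataset final_df.pkl --glove_path glove.magnitude \
    #       --word_dir ./hand_crafted_features --run_implementation 1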
    # load dataset
    final_df = pd.read_pickle(args.dataset)

    le = LabelEncoder()
    final_df.labels = le.fit_transform(final_df.labels)

    traindf, testdf, xTrain, xTest, yTrain, yTest = split_dataframe(
        final_df, 'Requirements', 'Requirements_clean', 'Requirements_clean')

    glove = Magnitude(args.glove_path)
    tfidf = TfidfVectorizer()
    tfidf.fit(xTrain)
    idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
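    # idf_dict maps each vocabulary token to its IDF weight; implementation 1
    # presumably uses it to weight GloVe vectors in the tfidf_glove featurizer.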

    if args.run_implementation == 1:
        if args.featurization_type == 1:
            train_features, test_features, feature_names = featurization.featurize(
                args, traindf, testdf, featurization.tfidf_glove, glove, idf_dict)
        elif args.featurization_type == 2:
            train_features, test_features, feature_names = featurization.featurize_USE(
                args, traindf, testdf, featurization.USE_Vec)
        else:
            train_features, test_features, feature_names = featurization.featurize(
                args, traindf, testdf, featurization.Glove_Vec, glove, None)
        
        y_train = np.array(traindf.labels.values)
        y_test = np.array(testdf.labels.values)

        file_names = [('rf.sav', 'RF'), ('lr.sav', 'LR'), ('bdt.sav', 'BDT'),
                      ('svc.sav', 'SVC'), ('xgb.sav', 'XGB'), ('ann.sav', 'ANN'),
                      ('knn.sav', 'KNN')]
        for fname, model_name in file_names:
            est = tuning(model_name, train_features, y_train)
            print(model_name + ": ")
            print_model_metrics(y_test=y_test,
                                y_test_pred=np.array(est.predict(test_features)),
                                verbose=False, return_metrics=True)
            print("\n")
            pickle.dump(est, open(fname, 'wb'))

    if args.run_implementation == 2:
        xTrain_pad, xTest_pad, yTrain_enc, yTest_enc, embed_matrix, tok = preprocessing_for_lstm(
            xTrain, xTest, yTrain, yTest, glove, 20000, 100, 128)
        y_train = np.array(traindf.labels.values)
        y_test = np.array(testdf.labels.values)

        input_lstm = {
            'vocabulary_size': len(tok.word_index) + 1,
            'embed_size': 100,
            'embedding_matrix': embed_matrix,
            'hidden_dim': 16,
            'output_size': 5,
            'seq_length': xTrain_pad.shape[1]
        }

        model_lstm = LSTM_tf(**input_lstm)

        num_epochs = 5
        history = model_lstm.fit(xTrain_pad, yTrain_enc, epochs=num_epochs, validation_data=(xTest_pad, yTest_enc))
        print_model_metrics(y_test=np.array(yTest),y_test_pred=np.array(model_lstm.predict_classes(xTest_pad)),verbose=False,return_metrics=True)

        input_CNN = {
            'vocabulary_size': len(tok.word_index) + 1,
            'embed_size': 100,
            'embedding_matrix': None,
            'filters': 8,
            'output_size': 5,
            'pool_size': 3,
            'kernel_size': 3,
            'seq_length': xTrain_pad.shape[1]
        }
        model_cnn = CNN_tf(**input_CNN)

        history = model_cnn.fit(xTrain_pad, yTrain_enc, epochs=num_epochs)
        print(print_model_metrics(y_test=np.array(yTest),y_test_pred=np.array(model_cnn.predict_classes(xTest_pad)),verbose=False,return_metrics=True))
Example #6
def extract_features(all_inputs):
    """ Yields (label, feature) tuples. 
    Images labeled by directory name, featurized by featurization.py. """
    shuffle(all_inputs)
    for png in all_inputs:
        # The label is the parent directory name, e.g. ".../cat/img.png" -> "cat"
        yield png.split('/')[-2], featurize(png)
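
A hypothetical way to drive this generator (the glob pattern and directory layout are illustrative assumptions, not from the original project):

import glob

# Assumes images are grouped into one directory per label,
# e.g. images/cat/001.png -> label "cat"
pngs = glob.glob('images/*/*.png')
for label, feature in extract_features(pngs):
    print(label, feature)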
Example #7
    return v+v2

'''

fp = open('../input/train.v02.csv')
h1 = fp.readline().replace("\"", "").rstrip().split(",")
header_1 = {x[0]: x[1] for x in zip(h1, range(len(h1)))}
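# header_1 maps each CSV column name to its column index, so fields can be
# looked up by name, e.g. line[header_1['Date']].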
train_raw = []
train = []
train_label = []
for line in csv.reader(fp):
    train_raw.append(line)
    my_dict = {}
    my_dict['month'] = line[header_1['Date']].split("-")[1]
    train.append(featurize(line, header_1, my_dict))
    train_label.append(int(line[header_1['WnvPresent']]))
fp.close()

fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}
test_raw = []
test = []
test_label = []
for line in csv.reader(fp):
    test_raw.append(line)
    my_dict = {}
    my_dict['month'] = line[header_2['Date']].split("-")[1]
    test.append(featurize(line, header_2, my_dict))
    test_label.append(int(line[header_2['WnvPresent']]))
fp.close()
Example #8
    for line in f_matrix:
        if l[h[line[0]]]==line[1]:
            v[line[2]]=1.0
    return v

'''

fp = open('../input/train.v02.csv')
h1 = fp.readline().replace("\"", "").rstrip().split(",")
header_1 = {x[0]: x[1] for x in zip(h1, range(len(h1)))}
train_raw = []
train = []
train_label = []
for line in csv.reader(fp):
    train_raw.append(line)
    train.append(featurize(line, header_1))
    train_label.append(int(line[header_1['WnvPresent']]))
fp.close()

fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}
test_raw = []
test = []
test_label = []
for line in csv.reader(fp):
    test_raw.append(line)
    test.append(featurize(line, header_2))
    test_label.append(int(line[header_2['WnvPresent']]))
fp.close()
Example #9
tree = spatial.KDTree(coords)

for line in train_raw:
    v = tree.query_ball_point([
        float(line[header_1['Latitude']]) / 0.9335,
        float(line[header_1['Longitude']])
    ], 0.0058)
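    # v holds the indices of training points within radius 0.0058 of this
    # (scaled) location; dividing latitude by 0.9335 presumably makes the
    # radius roughly isotropic in ground distance.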
    if len(v) > 0:
        density = float(sum([train_label[x] for x in v])) / len(v)
    else:
        density = 0.0
    my_dict = {}
    my_dict['month'] = line[header_1['Date']].split("-")[1]

    train.append(featurize(line, header_1, my_dict) + [density])

fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}
test_raw = []
test = []
test_label = []

for line in csv.reader(fp):
    test_raw.append(line)
    v = tree.query_ball_point([
        float(line[header_2['Latitude']]) / 0.9335,
        float(line[header_2['Longitude']])
    ], 0.0058)
    if len(v) > 0:
        density = float(sum([train_label[x] for x in v])) / len(v)
    else:
        density = 0.0
    my_dict = {}
    my_dict['month'] = line[header_2['Date']].split("-")[1]
    # Continuation assumed, mirroring the train loop above
    test.append(featurize(line, header_2, my_dict) + [density])
Example #10
fp.close()

# Construct the KD tree to help us find neighbors within a radius

tree = spatial.KDTree(coords)
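# 'coords' (defined earlier in the original script) presumably holds the
# scaled [latitude / 0.9335, longitude] pairs of the training rows, so the
# indices returned by query_ball_point line up with train_label.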

for line in train_raw:
    v = tree.query_ball_point([
        float(line[header_1['Latitude']]) / 0.9335,
        float(line[header_1['Longitude']])
    ], 0.0058)
    if len(v) > 0:
        density = float(sum([train_label[x] for x in v])) / len(v)
    else:
        density = 0.0
    train.append(featurize(line, header_1, {}) + [density])

fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}
test_raw = []
test = []
test_label = []

for line in csv.reader(fp):
    test_raw.append(line)
    v = tree.query_ball_point([
        float(line[header_2['Latitude']]) / 0.9335,
        float(line[header_2['Longitude']])
    ], 0.0058)
    if len(v) > 0:
        density = float(sum([train_label[x] for x in v])) / len(v)
    else:
        density = 0.0
    # Continuation assumed, mirroring the train loop above
    test.append(featurize(line, header_2, {}) + [density])