Example No. 1
def main():
    detector = get_detector('yolo')
    while True:
        time.sleep(0.01)
        _queue = REDIS_DB.lrange(REDIS_QUEUE, 0, BATCH_SIZE - 1)
        for _q in _queue:
            all_features = {
                'primary': [],
                'object_bitmap': [0 for _ in range(len(detector.classes))]
            }
            _q = json.loads(_q.decode("utf-8"))
            img = utils.base64_decode(_q['image'], _q['shape'])

            all_features['secondary'] = extract_features(img.copy())

            response = detector.predict(img)
            for obj in response:
                box = obj['box']
                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
                if x2 - x1 >= 75 and y2 - y1 >= 75:
                    features = extract_features(img[y1:y2, x1:x2])
                    all_features['primary'].append({
                        'features': features,
                        'label': obj['label'],
                        'name': obj['name'],
                        'box': obj['box']
                    })
                    all_features['object_bitmap'][obj['label']] = 1

            REDIS_DB.set(_q['id'], json.dumps(all_features))
        REDIS_DB.ltrim(REDIS_QUEUE, len(_queue), -1)
Example No. 2
def generic_features(infile, outfile):
    '''Extracts features from any lyric dataset in the format of (title, artist, lyrics, label)'''
    lyric_dict = {}
    lyrics = []
    labels = []
    cfig = ['lem', 'plex']

    f = open(infile, 'rt', encoding="utf8")
    reader = csv.reader(f)
    for r in reader:
        lyric_dict[(r[0], r[1], r[3])] = r[2]  #title, artist, label:lyrics
    for key in lyric_dict:
        lyrics.append(lyric_dict[key])
        labels.append(key[2])

    features, header = fe.extract_features(lyrics, cfig)
    f2 = open(outfile, 'w', encoding="utf8")
    f2.write('title,' + 'artist,' + ','.join(header) + ',' + 'popular' + '\n')
    for song, row in zip(lyric_dict.keys(), features):
        str_row = [str(i) for i in row]
        f2.write(song[0] + ',' + song[1] + ',' + ','.join(str_row) + ',' +
                 song[2] + '\n')

    f.close()
    f2.close()

    return features, labels
Example No. 3
def decade_features():
    '''Extracts features from the 2006-2015 lyrics dataset'''
    infile = "datasets/lyrics_10Year.csv"
    lyric_dict = {}
    lyrics = []
    cfig = ['lem', 'plex']

    f = open(infile, 'rt')
    reader = csv.reader(f)
    for r in reader:
        lyric_dict[(r[0], r[1])] = r[3]  #title, artist:lyrics
    for key in lyric_dict:
        lyrics.append(lyric_dict[key])

    features, header = fe.extract_features(lyrics, cfig)
    f2 = open("datasets/nltk_10Year.csv", 'w')
    f2.write('title,' + 'artist,' + ','.join(header) + '\n')
    for song, row in zip(lyric_dict.keys(), features):
        str_row = [str(i) for i in row]
        f2.write(song[0] + ',' + song[1] + ',' + ','.join(str_row) + '\n')

    f.close()
    f2.close()

    return features
Example No. 4
def main():
    """ main function, called if module is run as main program"""

    # load raw csv file and extract relevant lines (marked by 'GEN' for general)
    with open('data/sarcasm_v2.csv') as datafile:
        raw_data = list(csv.reader(datafile))
        data = [line[-1] for line in raw_data if line[0] == 'GEN']
        labels = [line[1] for line in raw_data if line[0] == 'GEN']

    # load config file
    with open('conf.txt') as conffile:
        conf_all = set(line.strip() for line in conffile)

    # compute score, for each line in the config individually and all together
    confs = [line for line in conf_all]
    confs.append([line for line in conf_all])
    for conf in confs:
        print('computing score for: %s... ' % conf, end='')
        features = fe.extract_features(data, conf)
        score = cross_val_score(svm.SVC(),
                                features,
                                labels,
                                scoring='accuracy',
                                cv=10).mean()
        score = round(score, 3)
        print(score)
        with open('experiments2.csv', 'a') as f:
            f.write(";".join(
                ('{:%Y-%m-%d;%H:%M:%S}'.format(datetime.datetime.now()),
                 str(conf), str(score), '\n')))
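The scoring call above only needs a numeric feature matrix and a list of labels; a minimal, self-contained sketch of the same pattern with placeholder data (array shapes and labels are illustrative, not from the sarcasm dataset):

import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score

X = np.random.rand(100, 5)        # placeholder feature matrix: 100 samples, 5 features
y = np.array([0, 1] * 50)         # placeholder binary labels
score = cross_val_score(svm.SVC(), X, y, scoring='accuracy', cv=10).mean()
print(round(score, 3))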
Example No. 5
def extract_features_from_rawdata(chunk, header, period, features):
    with open(
            os.path.join(os.path.dirname(__file__),
                         "resources/channel_info.json")) as channel_info_file:
        channel_info = json.loads(channel_info_file.read())
    data = [convert_to_dict(X, header, channel_info) for X in chunk]
    return extract_features(data, period, features)
Example No. 6
def search_windows(img,
                   windows,
                   clf,
                   scaler,
                   color_space='RGB',
                   patch_size=(64, 64),
                   feature_extractor_params=None):
    if color_space != 'RGB':
        img = convert_color(img, color_space)

    # 1) Create an empty list to receive positive detection windows
    on_windows = []
    # 2) Iterate over all windows in the list
    for window in windows:  # each window is ((x_start, y_start), (x_end, y_end))
        # 3) Extract the test window from original image
        test_img = cv2.resize(
            img[window[0][1]:window[1][1], window[0][0]:window[1][0]],
            patch_size)
        # 4) Extract features for that window using single_img_features()
        features = extract_features(test_img, **feature_extractor_params)
        # 5) Scale extracted features to be fed to classifier
        test_features = scaler.transform(np.array(features).reshape(1, -1))
        # 6) Predict using your classifier
        prediction = clf.predict(test_features)
        # 7) If positive (prediction == 1) then save the window
        if prediction == 1:
            on_windows.append(window)
    # 8) Return windows for positive detections
    return on_windows
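Example No. 7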
def classify_sentences(list_of_sentences, model_folder):
    #Create feature file
    index_features_filename = os.path.join(model_folder, __name_index)
    model_filename = os.path.join(model_folder, __name_model)

    feature_index = Cfeature_index()
    feature_index.load_from_file(index_features_filename)

    ################################################
    #Create the feature file for classification
    ################################################
    my_feat_file = NamedTemporaryFile(delete=False)
    for list_of_tokens in list_of_sentences:
        these_feats = extract_features(list_of_tokens)
        feature_index.encode_example_for_classification(
            these_feats, my_feat_file)
    my_feat_file.close()
    ################################################

    #Run the classifier svm_classify
    #SVM values will be a list of floats, one per sentence, with the value associated by SVM
    svm_values = run_svm_classify(my_feat_file.name, model_filename)

    os.remove(my_feat_file.name)
    return svm_values
Example No. 8
 def _classify_patch(self, patch):
     features = extract_features(patch, **self.feature_extractor_args)
     X = features.reshape(1, -1).astype(np.float64)
     if self.feature_scaler is not None:
         X = self.feature_scaler.transform(X)
     probabilities = self.classifier.predict_proba(X)[0]
     prediction = probabilities[1] > self.low_threshold
     return prediction, probabilities[1]
Example No. 9
def final_train():
    train_kds, train_dss, train_labels = extract_features()
    test_kds, test_dss, test_labels = extract_features('testing')
    accuracies = []
    for _ in range(3):
        model = WindowBasedEnsembleClassifier(False, 40, 'gini', 8, 8, 192,
                                              192, 1.0 / 7, 1.0 / 7, 20)
        model.fit(train_kds, train_dss, train_labels)
        predicted = model.predict(test_kds, test_dss)
        acc = accuracy_score(test_labels, predicted)
        accuracies.append(acc)
        print("total accuracy: {}".format(acc))
        for i in range(10):
            t_ids = [m for m, t in enumerate(test_labels) if t == i]
            t_acc = accuracy_score(test_labels[t_ids], predicted[t_ids])
            print("number {} accuracy: {}".format(i, t_acc))
    print("mean accuracy: {}".format(np.mean(accuracies)))
Example No. 10
def extract_features_from_text(text):
    num_hashtag = len(text.split('#')) - 1
    feature_dict = extract_features(("", text, 0, 0, num_hashtag, ""))
    return [
        feature_dict['num_propagation_words'], feature_dict['text_length'],
        feature_dict['num_hashtag'], feature_dict['num_powerful_words'],
        feature_dict['num_personal_pronoun']
    ]
Example No. 11
 def post(self):
     args = parser.parse_args()
     decodeit = open('./server_files/saveimg.jpeg', 'wb')
     decodeit.write(base64.b64decode(bytes(args["image"], 'utf-8')))
     decodeit.close()
     fe.extract_features()
     model = pickle.load(open("./xgboost.pkl", 'rb'))
     test = pd.read_csv('test.csv', header=None)
     x_test = np.array(test.iloc[1:, 1:10])
     y_test = np.array(test.iloc[1:, 11])
     res = model.predict(x_test)
     print(res)
     class_ = ""
     if res == 0:
         class_ = "benign"
     else:
         class_ = "malicious"
     return {"class": class_}
Example No. 12
def model_with_only_features():
    print('## Download and load the data')
    dataset = get_dataset()
    print('## Only separate using spaces')
    cleaned_X = [val.split(' ') for val in dataset['X']]
    print('## Feature Extraction')
    all_features_X = extract_features(cleaned_X, dataset['X'])
    print('## Training and Evaluation')
    train_the_model(all_features_X, dataset['Y'])
Example No. 13
def model_with_all_preprocessing_and_features(no_bag_of_words=False):
    print('## Download and load the data')
    dataset = get_dataset()
    print('## Clean the data')
    cleaned_X = clean_data(dataset['X'])
    print('## Feature Extraction')
    all_features_X = extract_features(cleaned_X, dataset['X'], no_bag_of_words=no_bag_of_words)
    print('## Training and Evaluation')
    train_the_model(all_features_X, dataset['Y'])
Example No. 14
def parameter_tuner():
    train_kds, train_dss, train_labels = extract_features()
    # print(np.bincount(train_labels.flatten()))

    # new_x = []
    # new_y = []
    # for i in range(train_kds.shape[0]):
    #     for j in range(len(train_kds[i])):
    #         t = train_kds[i][j]
    #         # new_x.append(train_dss[i][j])
    #         f = [t.pt[0], t.pt[1], t.angle, t.size, t.response]
    #         f.extend(train_dss[i][j])
    #         new_x.append(f)
    #         new_y.append(train_labels[i])
    # k = 50
    # new_x = np.array(new_x)
    # new_y = np.array(new_y)
    #
    # model = RandomForestClassifier()
    # model = KNeighborsClassifier(k, n_jobs=2)
    # model = KNN()
    # model = svm.LinearSVC(verbose=True, max_iter=10000)
    # model.fit(new_x, new_y)
    kf = KFold(n_splits=3, shuffle=True)
    accuracies = []
    pars = ['entropy', 'gini']
    for par in pars:
        print(par)
        r = []
        for train_index, test_index in kf.split(train_kds):
            print("TRAIN:", train_index, "TEST:", test_index)
            kds_temp_train, dss_temp_train, kds_temp_test, dss_temp_test = \
                train_kds[train_index], train_dss[train_index], train_kds[test_index], train_dss[test_index]
            y_temp_train, y_temp_test = train_labels[
                train_index], train_labels[test_index]
            model = WindowBasedEnsembleClassifier(False, 40, par, 8, 8, 192,
                                                  192, 1.0 / 7, 1.0 / 7, 20)
            model.fit(kds_temp_train, dss_temp_train, y_temp_train)
            predicted = model.predict(kds_temp_test, dss_temp_test)
            acc = accuracy_score(y_temp_test, predicted)
            r.append(acc)
            print("total accuracy: {}".format(acc))
            for i in range(10):
                t_ids = [m for m, t in enumerate(y_temp_test) if t == i]
                t_acc = accuracy_score(y_temp_test[t_ids], predicted[t_ids])
                print("number {} accuracy: {}".format(i, t_acc))
        r = np.array(r)
        accuracies.append(r.mean())
        print("after")
    acc_max_arg = np.argmax(accuracies)
    print("best {}: {}".format(pars[acc_max_arg], accuracies[acc_max_arg]))
    for a, c in zip(accuracies, pars):
        print("{}: {}".format(c, a))
Example No. 15
def sort_samples(samples_array, sorter):
    prot = feature_extractor.protocol_attr()
    for sample in samples_array:
        if sample.sampler_type not in sorted_samples:
            sorted_samples[sample.sampler_type] = {}
        brand_prod = sample.brand + "_" + sample.product
        if brand_prod not in sorted_samples[sample.sampler_type]:
            sorted_samples[sample.sampler_type][brand_prod] = {}
        for channel in sample.values.columns[1:]:
            if (sample.values[channel] == 0).all():
                continue
            card_channel = sample.card + "_" + channel
            if card_channel not in sorted_samples[
                    sample.sampler_type][brand_prod]:
                sorted_samples[
                    sample.sampler_type][brand_prod][card_channel] = []
            ch_data = channel_data()
            ch_data.sample_id = sample.ID
            ch_data.note = sample.note
            ch_data.tags = sample.tags
            ch_data.values = sample.values[["time", channel]]
            ch_data.values[channel] -= ch_data.values[channel][30]
            ch_data.values[channel] = signal_process.smooth(
                ch_data.values[channel])
            ch_data.derviate_1 = signal_process.get_derivative_1(
                ch_data.values[channel])
            ch_data.derviate_2 = signal_process.get_derivative_2(
                ch_data.values[channel])
            ch_data.picks_list = feature_extractor.get_picks_indexes(
                ch_data, 0, ch_data.values.size)
            ch_data.protocol = prot
            feature_extractor.extract_features(ch_data, prot)
            sorted_samples[
                sample.sampler_type][brand_prod][card_channel].append(ch_data)
    datestr = constants.get_date_str()
    features_results_dir = constants.path_result_dir + datestr + constants.path_features_dir
    features_file_name = features_results_dir + "features_" + "_" + datestr + ".csv"
    if not os.path.exists(features_results_dir):
        os.makedirs(features_results_dir)
    feature_extractor.flush_features_data_frame(features_file_name, sorter)
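Example No. 16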
def main():
    """ main function, called if module is run as main program"""

    if len(sys.argv) == 1:
        print('usage: bawe_gender_classifier.py <CORPUS_DIR> <CONF_FILE_NAME>')
        exit(0)
    data_dir = sys.argv[1]

    # read file list into dictionary mapping file id to gender
    with open(data_dir + '/BAWE_balanced_subset.csv', 'r') as gender_file:
        meta_lines = [line.rstrip().split(',') for line in gender_file]
        gender_dict = {row[0]: row[1] for row in meta_lines[1:]}

    # read essay contents and gender labels into lists
    essays = []
    gender_labels = []
    for student, gender in gender_dict.items():
        with open('%s/%s.txt' % (data_dir, student)) as f:
            text = f.read()
            # remove vestigial xml
            text = re.sub('<[^<]+?>', '', text)
            essays.append(text)
            gender_labels.append(gender)

    # read conf file
    if len(sys.argv) > 2:
        with open(sys.argv[2]) as conf_file:
            conf_all = set(line.strip() for line in conf_file)
    else:
        conf_all = []

    # compute score, for each line in the config individually and all together
    # note: preprocessing every time is very wasteful but i did not want to
    #       change the feature_extractor interfaces from what was given
    # each line individually
    confs = [line for line in conf_all]
    if len(conf_all) > 1:
        # all lines together
        confs.append([line for line in conf_all])
    for conf in confs:
        print('computing score for: %s... '
              % (conf if conf else 'all features'), end='')
        features = fe.extract_features(essays, conf)
        score = cross_val_score(GaussianNB(), features, gender_labels,
                                scoring='accuracy', cv=10).mean()
        score = round(score, 3)
        print(score)
        with open('experiments.csv', 'a') as f:
            f.write(";".join(
                ('{:%Y-%m-%d;%H:%M:%S}'.format(datetime.datetime.now()),
                 str(conf), str(score), '\n')))
Example No. 17
    def query(self,
              model_path,
              n_samples_query,
              n_results,
              custom=False,
              weights=False):
        vertices, element_dict, info = read_model(model_path)
        shape = Shape(vertices, element_dict, info)
        shape = process(shape, n_vertices_target=self.n_vertices_target)
        feature_dict = extract_features(shape,
                                        self.n_bins,
                                        n_samples=n_samples_query)
        feature_df = data_dict_parser(feature_dict)
        feature_df, _ = sample_normalizer(
            feature_df,
            *self.sample_normalization_parameters,
            divide_distributions=self.divide_distributions)
        feature_df_numeric = feature_df.select_dtypes(np.number)
        #Make sure columns identical and ordered
        assert list(feature_df_numeric.columns) == list(
            self.df_numeric.columns), "Column mismatch!"
        query_vector = feature_df_numeric.iloc[0, :].values.astype(np.float32)

        if not custom:

            distances, indices = self.faiss_knn.query(query_vector, n_results)
        else:
            distances, indices = self.custom_knn.query(query_vector,
                                                       n_results,
                                                       weights=weights)

        distances = distances.flatten().tolist()  #Flatten batch dimension
        indices = indices.flatten().tolist()
        df_slice = self.df[self.df.index.isin(indices)]
        df_slice['distance'] = df_slice.index.map(
            lambda x: distances[indices.index(x)])

        #Add missing data to query df
        feature_df['file_name'] = str(model_path)
        feature_df['classification'] = 'query_input'
        feature_df['distance'] = 0
        # Put it at top of slice
        df_slice = pd.concat([df_slice, feature_df])
        df_slice = df_slice.sort_values('distance')

        return distances, indices, df_slice
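Example No. 18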
    def process_subset(self, file_list, apply_processing, n_vertices_target,
                       n_bins, process_index):
        print(f' {process_index} : Starting subset processor!')
        data_subset = {k: [] for k in self.columns + self.col_array}
        for index, file in enumerate(file_list):

            if index % 50 == 0:
                print(f' {process_index} : Is at {index}/{len(file_list)}!')

            vertices, element_dict, info = read_model(Path(file))
            shape = Shape(vertices, element_dict, info)

            if apply_processing:

                shape = process(shape, n_vertices_target=n_vertices_target)

            else:
                shape.make_pyvista_mesh()

            id = os.path.basename(file).split(".")[0].replace("m", "")
            if id in self.classification_dict.keys():
                classification = self.classification_dict[id]

            else:

                classification = None

            data_subset["classification"].append(classification)
            data_subset["file_name"].append(file)

            #Get features
            feature_dict = extract_features(shape,
                                            n_bins=n_bins,
                                            n_samples=self.n_samples)

            #Add them to total data

            for key, val in feature_dict.items():
                data_subset[key].append(val)
        print(f'{process_index} : Finished!')
        return data_subset
Example No. 19
def classify_sentences(list_of_sentences, model_folder):
    # Create feature file
    index_features_filename = os.path.join(model_folder, __name_index)
    model_filename = os.path.join(model_folder, __name_model)

    feature_index = Cfeature_index()
    feature_index.load_from_file(index_features_filename)

    ################################################
    # Create the feature file for classification
    ################################################
    my_feat_file = NamedTemporaryFile(delete=False)
    for list_of_tokens in list_of_sentences:
        these_feats = extract_features(list_of_tokens)
        feature_index.encode_example_for_classification(these_feats, my_feat_file)
    my_feat_file.close()
    ################################################

    # Run the classifier svm_classify
    # SVM values will be a list of floats, one per sentence, with the value associated by SVM
    svm_values = run_svm_classify(my_feat_file.name, model_filename)

    os.remove(my_feat_file.name)
    return svm_values
Example No. 20
    "education": [0],
    "marital-status": [0],
    "occupation": [0],
    "relationship": [0],
    "race": [0],
    "sex": [0],
    "native-country": [0],
}


def sigmoid(z):
    sig = 1 / (1.0 + np.exp(-z))
    return np.clip(sig, 0.00000000000001, 0.99999999999999)
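
# Note on the clipping above (added commentary, not part of the original script):
# for extreme logits the raw sigmoid saturates to exactly 0.0 or 1.0 in float64
# (e.g. z = +/-1000), so downstream log(sig) / log(1 - sig) terms in a log-loss
# would be -inf; clipping to [1e-14, 1 - 1e-14] keeps those logarithms finite.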


training_x, training_y, testing_x = feature_extractor.extract_features(
    sys.argv[1], sys.argv[2], feature_config, True)

num_training_data = training_x.shape[0]
num_features = training_x.shape[1]

true_training_x = []
false_training_x = []
for i, row in enumerate(training_x.tolist()):
    if training_y[i, 0] == 1.0:
        true_training_x.append(row)
    else:
        false_training_x.append(row)

true_training_x = np.matrix(true_training_x, dtype=np.float64)
false_training_x = np.matrix(false_training_x, dtype=np.float64)

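Example No. 21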
def test_bounding_box_volume(bounding_box_volume):
    # Bounding box volume should be same as volume for cube
    assert np.abs(bounding_box_volume - 8) < 10e-3, "Bounding box volume test failed"


def test_bounding_box_ratio(bounding_box_ratio):
    # Ratio should be 1 since the bounding box is a cube
    assert np.abs(bounding_box_ratio - 1) < 10e-3, "Bounding box ratio test failed"


def test_diameter(diameter):
    # The cube is already convex, so the diameter is just the distance between opposing corners
    dist = np.linalg.norm(np.array([-1, -1, -1]) - np.array([1, 1, 1]))
    assert np.abs(dist - diameter) < 10e-3, "Diameter test failed"


if __name__ == '__main__':
    # cube.off is a 2x2x2 cube
    path = Path(r"data/cube.off")
    reader = FileReader()
    vertices, element_dict, info = reader.read(path)
    shape = Shape(vertices, element_dict, info)
    feature_dict = extract_features(shape, n_bins=10, n_samples=1e+6)
    test_volume(feature_dict['volume'])
    test_surface_area(feature_dict['surface_area'])
    test_bounding_box_volume(feature_dict['bounding_box_volume'])
    test_bounding_box_ratio(feature_dict['bounding_box_ratio'])
    test_diameter(feature_dict['diameter'])
Example No. 22
def load_dataset(label, path, shuffle=False):
    messages = load_messages(path)
    if shuffle:
        random.shuffle(messages)
    return [(extract_features(msg), label) for msg in messages]
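Assuming extract_features yields a feature dict, the (feature_dict, label) pairs returned here match the labeled-featureset format that NLTK-style classifiers consume; a tiny sketch with hypothetical inline feature dicts standing in for that output:

import nltk

train_set = [({'contains_link': True}, 'spam'),
             ({'contains_link': False}, 'ham')]   # stand-ins for extract_features(msg) output
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify({'contains_link': True}))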
Example No. 23
        for example in examples:
            if not (example == "count.txt"):
                examples_path = classes_path + "/" + example

                images_names = os.listdir(examples_path)

                ratio_step = int((images_names.__len__() - 1) / 35)

                for w in range(0, 35):
                    h = w * ratio_step
                    if not (images_names[h].__len__()
                            == 14):  # in the case of non image pick
                        h += 1

                    images_path = examples_path + "/" + images_names[h]
                    features = extractor.extract_features(images_path)

                    for attribute in attributes:
                        data_temp.append(features[attribute])

                    print(images_path)

                data_temp.append(class_id)

                data.append(list(data_temp))

                del data_temp[:]  # clear

        if (class_name != "count.txt"):
            class_id = class_id + 1
Example No. 24
# -*- coding: UTF-8 -*-

import numpy
from sklearn import preprocessing

def reform(datasets):
    new_datasets = []
    scaler = None
    for dataset in datasets:
        new_dataset_x = []
        new_dataset_y = []
        for x, y in zip(dataset[0],dataset[1]):
            for i in range(0, len(x)/10*10, 10):
                new_dataset_x.append(x[i:i+10,:].flatten())
                new_dataset_y.append(y)
        new_dataset_x = numpy.asarray(new_dataset_x)
        new_dataset_y = numpy.asarray(new_dataset_y)

        new_datasets.append((new_dataset_x, new_dataset_y))

    return tuple(new_datasets)

if __name__ == '__main__':
    from loader import load_data
    from feature_extractor import extract_features

    datasets = extract_features(load_data()[0])

    new_datasets = reform(datasets)

    print new_datasets[0][0][0].shape
Example No. 25
        clustered_junction_file = sys.argv[sys.argv.index('-exf') + 3]
        intersection_db = sys.argv[sys.argv.index('-exf') + 4]
        out_file = sys.argv[sys.argv.index('-exf') + 5]

        file = open(clustered_junction_file, 'r')
        clustered_junctions = json.loads(file.read())
        file.close()

        from feature_extractor import extract_features
        pickle_in = open(grid_file, "rb")
        l = pickle.load(pickle_in)
        pickle_in.close()
        file = open(trail_file)
        data = json.loads(file.read())
        file.close()
        extract_features(l, data, os.path.join(out_folder, out_file),
                         clustered_junctions, intersection_db, way_folder)
    except Exception as e:
        raise e

# Creating a grid of the clustered junctions. This is used to efficiently check whether, at a given
# point in the trail, we are in the proximity of a junction or not. This section creates a model and
# saves it in a file named after the second command line parameter. The model, when loaded, provides
# a method that takes the current lat and long as input, along with the distance in meters that
# defines the notion of proximity. The method returns False if the given point is not near any of
# the junctions, and otherwise a tuple (True, lat, lng), where lat and lng are the latitude and
# longitude of the junction point the current point is nearest to.
if '-l' in sys.argv:
    try:
        inp_file = open(sys.argv[sys.argv.index('-l') + 1])
        out_file_name = sys.argv[sys.argv.index('-l') + 2]
        junction_info = json.loads(inp_file.read())
Example No. 26

def err(loss):
    return np.sum(np.abs(loss))


def cross_entropy(y_hat, y):
    return np.sum(-(y_hat.transpose().dot(np.log(y)) +
                    (1 - y_hat).transpose().dot(np.log(1 - y))))


model_file_name = "./model"
is_model_existed = os.path.isfile(model_file_name)

if not is_model_existed:
    training_x, training_y, testing_x = feature_extractor.extract_features(
        sys.argv[1], sys.argv[2], feature_config, is_normalized)

    num_validating_data = training_x.shape[0] // 10
    validating_x = training_x[:num_validating_data]
    validating_y = training_y[:num_validating_data]
    training_x = training_x[num_validating_data:]
    training_y = training_y[num_validating_data:]
    num_training_data = training_x.shape[0]

    num_features = training_x.shape[1]
    weights = [0.0 for _ in range(num_features)]

    # Training parameters
    num_iterations = 5e5
    learning_rate = 1e1
    is_regularized = True
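The training loop itself is truncated above; purely as an illustration of the kind of update this setup implies (logistic regression with optional L2 regularization), here is a hedged sketch — the helper below is not from the original source:

import numpy as np

def gradient_step(weights, X, y, learning_rate, l2_lambda=0.0):
    # One batch gradient-descent step for a sigmoid model with cross-entropy loss
    # (illustrative only; the original script's loop and regularization details may differ).
    y_hat = 1 / (1.0 + np.exp(-X.dot(weights)))        # predictions in (0, 1)
    grad = X.T.dot(y_hat - y) / len(y) + l2_lambda * weights
    return weights - learning_rate * grad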
Example No. 27
 def axis_features(idx):
   axisname = data.dtype.names[idx]
   col = data[axisname]
   return extract_features(None, col, self.config, self.section)
Example No. 28
        '--load',
        action='store_true',
        help='include to load models from <expdir> instead of training new')
    return vars(parser.parse_args())


if __name__ == "__main__":
    args = handle_args()
    print(args)
    header, data = load_data(args['datafile'])
    labels = numpy.asarray([[line[i] for i in class_idx]
                            for line in data]).T.tolist()

    conf = load_conf_file(args['conffile'])
    features = fe.extract_features(
        [line[1] for line in data], conf
    )  # this will only pass the status update text to the feature extractor

    if not args['load']:
        # train new models, evaluate, store
        for i in range(len(class_idx)):
            trait = header[class_idx[i]]
            clf = svm.SVC(class_weight='balanced').fit(features, labels[i])
            predicted = cross_val_predict(clf,
                                          features,
                                          labels[i],
                                          cv=10,
                                          n_jobs=-1)
            print("%s: %.2f" % (header[class_idx[i]],
                                metrics.accuracy_score(labels[i], predicted)))
            with open("%s/%s.pkl" % (args['expdir'], trait), 'wb') as f:
Example No. 29
def main():

    area = sys.argv[1]  # 'rome' 'tuscany' 'london'
    type_user = sys.argv[2]  # 'crash' 'nocrash'
    overwrite = int(sys.argv[3])
    country = 'uk' if area == 'london' else 'italy'

    min_length = 1.0
    min_duration = 60.0

    print(datetime.datetime.now(), 'Crash Prediction - Train Test Partitioner')
    if not overwrite:
        print(datetime.datetime.now(), '(restart)')

    path = './'
    path_imn = path + 'imn_new/'
    path_dataset = path + 'dataset/'
    path_traintest = path + 'traintest/'
    path_quadtree = path + 'quadtree/'

    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country

    if area == 'london' and type_user == 'nocrash':
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area, 'all')
        users_filename_crash = path_dataset + '%s_%s_users_list.csv' % (
            area, 'crash')
    else:
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area,
                                                                  type_user)
        users_filename_crash = None

    users_list = pd.read_csv(users_filename).values[:, 0].tolist()
    users_list = sorted(users_list)

    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(
            users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]

    nbr_users = len(users_list)

    print(datetime.datetime.now(), 'Reading quadtree')
    quadtree_poi_filename = path_quadtree + '%s_personal_osm_poi_lv17.json.gz' % area
    fout = gzip.GzipFile(quadtree_poi_filename, 'r')
    quadtree = json.loads(fout.readline())
    fout.close()

    print(datetime.datetime.now(), 'Reading quadtree features')
    quadtree_features_filename = path_quadtree + '%s_quadtree_features.json.gz' % area
    fout = gzip.GzipFile(quadtree_features_filename, 'r')
    quadtrees_features_str = json.loads(fout.readline())
    quadtrees_features = {int(k): v for k, v in quadtrees_features_str.items()}
    fout.close()

    processed_users = set()
    if overwrite:
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.exists(output_filename):
                os.remove(output_filename)
    else:
        processed_users = set()
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.isfile(output_filename):
                fout = gzip.GzipFile(output_filename, 'r')
                for row in fout:
                    customer_obj = json.loads(row)
                    processed_users.add(customer_obj['uid'])
                fout.close()

    window = 4
    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00',
                                               '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00',
                                             '%Y-%m-%d %H:%M:%S')

    print(datetime.datetime.now(), 'Generating month boundaries')
    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um]
                  for lm, um in zip(months[:-window], months[window:])]
    training_months = list()
    test_months = list()
    for i in range(len(boundaries) - 1):
        training_months.append(boundaries[i])
        test_months.append(boundaries[i + 1])

    index = 0
    tr_data_map = dict()
    ts_data_map = dict()
    for tr_months, ts_months in zip(training_months, test_months):
        tr_data_map[tuple(tr_months)] = index
        ts_data_map[tuple(ts_months)] = index
        index += 1

    print(datetime.datetime.now(), 'Initializing quadtree features')
    tr_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in tr_data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in tr_quadtree_features:
                    tr_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in tr_quadtree_features[index]:
                        tr_quadtree_features[index][path] = {
                            'nbr_traj_start': 0,
                            'nbr_traj_stop': 0,
                            'nbr_traj_move': 0,
                            'traj_speed_sum': 0,
                            'traj_speed_count': 0,
                            'nbr_evnt_A': 0,
                            'nbr_evnt_B': 0,
                            'nbr_evnt_C': 0,
                            'nbr_evnt_Q': 0,
                            'nbr_evnt_start': 0,
                            'nbr_evnt_stop': 0,
                            'speed_A_sum': 0,
                            'max_acc_A_sum': 0,
                            'avg_acc_A_sum': 0,
                            'speed_B_sum': 0,
                            'max_acc_B_sum': 0,
                            'avg_acc_B_sum': 0,
                            'speed_C_sum': 0,
                            'max_acc_C_sum': 0,
                            'avg_acc_C_sum': 0,
                            'speed_Q_sum': 0,
                            'max_acc_Q_sum': 0,
                            'avg_acc_Q_sum': 0,
                            'nbr_crash': 0,
                        }
                    for k, v in quadtrees_features[m][path].items():
                        tr_quadtree_features[index][path][k] += v

    ts_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in ts_data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in ts_quadtree_features:
                    ts_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in ts_quadtree_features[index]:
                        ts_quadtree_features[index][path] = {
                            'nbr_traj_start': 0,
                            'nbr_traj_stop': 0,
                            'nbr_traj_move': 0,
                            'traj_speed_sum': 0,
                            'traj_speed_count': 0,
                            'nbr_evnt_A': 0,
                            'nbr_evnt_B': 0,
                            'nbr_evnt_C': 0,
                            'nbr_evnt_Q': 0,
                            'nbr_evnt_start': 0,
                            'nbr_evnt_stop': 0,
                            'speed_A_sum': 0,
                            'max_acc_A_sum': 0,
                            'avg_acc_A_sum': 0,
                            'speed_B_sum': 0,
                            'max_acc_B_sum': 0,
                            'avg_acc_B_sum': 0,
                            'speed_C_sum': 0,
                            'max_acc_C_sum': 0,
                            'avg_acc_C_sum': 0,
                            'speed_Q_sum': 0,
                            'max_acc_Q_sum': 0,
                            'avg_acc_Q_sum': 0,
                            'nbr_crash': 0,
                        }
                    for k, v in quadtrees_features[m][path].items():
                        ts_quadtree_features[index][path][k] += v

    print(datetime.datetime.now(), 'Connecting to database')
    con = database_io.get_connection()
    cur = con.cursor()

    count = 0
    imn_filedata = gzip.GzipFile(
        path_imn + '%s_imn_%s.json.gz' % (area, type_user), 'r')

    print(datetime.datetime.now(),
          'Calculating features and partitioning dataset')
    for row in imn_filedata:
        if len(row) <= 1:
            print('new file started ;-)')
            continue

        user_obj = json.loads(row)
        uid = user_obj['uid']
        count += 1
        if uid in processed_users:
            continue

        if count % 10 == 0:
            print(
                datetime.datetime.now(),
                'train test partition %s %s [%s/%s] - %.2f' %
                (area, type_user, count, nbr_users, 100 * count / nbr_users))

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        trajectories = imh['trajectories']

        tr_data = dict()
        ts_data = dict()

        # partitioning imn for train and test
        for imn_months in user_obj:
            if imn_months == 'uid':
                continue

            # print(imn_months)
            m0 = int(imn_months.split('-')[0])
            m1 = int(imn_months.split('-')[1])
            for lu, index in tr_data_map.items():
                if lu[0].month <= m0 < m1 < lu[1].month:
                    if index not in tr_data:
                        tr_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    tr_data[index]['imns'][imn_months] = user_obj[imn_months]

            for lu, index in ts_data_map.items():
                if lu[0].month <= m0 < lu[1].month:
                    if index not in ts_data:
                        ts_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    ts_data[index]['imns'][imn_months] = user_obj[imn_months]

        # partitioning trajectories for train and test
        for tid, traj in trajectories.items():
            for lu, index in tr_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in tr_data:
                    tr_data[index]['trajectories'][tid] = traj
            for lu, index in ts_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in ts_data:
                    ts_data[index]['trajectories'][tid] = traj

        # partitioning events for train and test
        for eid, evnt in events.items():
            # print(evnt)
            for lu, index in tr_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in tr_data:
                    tr_data[index]['events'][eid] = evnt[0]
            for lu, index in ts_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in ts_data:
                    ts_data[index]['events'][eid] = evnt[0]

        # get has crash next month
        for lu, index in tr_data_map.items():
            if index not in tr_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s' 
                        AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') 
                        AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(
                    lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            tr_data[index]['crash'] = has_crash_next_month

        for lu, index in ts_data_map.items():
            if index not in ts_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s' 
                        AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') 
                        AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(
                    lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            ts_data[index]['crash'] = has_crash_next_month

        tr_features, ts_features = feature_extractor.extract_features(
            uid, tr_data, ts_data, quadtree, tr_quadtree_features,
            ts_quadtree_features)

        for index in tr_features:
            if index in ts_features:
                output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                    area, type_user, index)
                store_obj = {
                    'uid': uid,
                    'train': tr_features[index],
                    'test': ts_features[index]
                }
                feature_extractor.store_features(output_filename, store_obj)

    imn_filedata.close()
Example No. 30
[timestamps, action_history, annotations, feedbacks, durations, rows] = \
	load_path(sys.argv[1])

records = []
valid_rows = []
for i in range(len(action_history)):
	print i
	record = {}
	actions = convert_paths(action_history[i], timestamps[i])
	if len(actions) == 0:
		print "Reply " + str(i) + " corrupted."
		continue
	record["actions"] = actions
	record["strokes"] = annotations[i]
	record["feedback"] = feedbacks[i]
	record["duration"] = durations[i]
	records.append(record)

	valid_rows.append(rows[i])

	if len(records) == 60:
		break

print "Successfully converted " + str(len(records)) + " replays."

with open((sys.argv[1].split(".")[0] + "_records.txt").format(i), 'w') as f:
	f.write(json.dumps(records))

feature_extractor.extract_features(valid_rows, sys.argv[2])
Example No. 31
def is_spam(tweet, lang):
    if lang != 'en':
        return False
    return 'spam' == CLASSIFIER_EN.classify(extract_features(tweet['text']))
Example No. 32
    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mse', optimizer=sgd)

    return model

from loader import load_data
from feature_extractor import extract_features
from utils import reform

if __name__ == '__main__':
    print "... loading data"
    datasets, n_classes = load_data()

    print "... extracting features"
    datasets = extract_features(datasets)

    print "... reforming data"
    train_set, test_set = reform(datasets)

    print "*********************"
    print "    Model 1 - MLP"
    print "*********************"

    print "... building MLP"
    mlp = MLP(n_classes)

    print "... training MLP"
    mlp.fit(train_set[0], np_utils.to_categorical(train_set[1], n_classes), batch_size=10, nb_epoch=5, show_accuracy=True)

    print "... evaluating MLP"
Example No. 33
    
    for example in examples:
      if not( example == "count.txt" ):
        examples_path = classes_path + "/" + example
        
        images_names = os.listdir(examples_path)        

        ratio_step = int((images_names.__len__()-1)/35)

        for w in range(0, 35):
          h = w * ratio_step
          if not(images_names[h].__len__() == 14):  # in the case of non image pick
            h += 1

          images_path = examples_path + "/" + images_names[h]
          features = extractor.extract_features( images_path )

          for attribute in attributes:
            data_temp.append( features[ attribute ] )        

          print( images_path )
        
        data_temp.append( class_id )

        data.append(list(data_temp))

        del data_temp[:]  # clear
    
    if( class_name != "count.txt" ):
      class_id = class_id + 1
Example No. 34
def train_classifier_from_folders(list_folders,output_folder):
    #Create the training feature file with format:
    #CLASS label FEAT FEAT FEAT
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    os.mkdir(output_folder)
    
    features_folder = os.path.join(output_folder,'features')
    os.mkdir(features_folder)
    
    feature_filename = os.path.join(output_folder,__name_feature)
    fd_feats = open(feature_filename,'w')
    for f in list_folders:    
        this_pid = os.fork()
        if this_pid == 0:
            print 'Extracting features from',f,' '
            while True:
                sys.stdout.write('.')
                sys.stdout.flush()
                time.sleep(1)
        else:
            #For each sent file
            for sent_file in glob.glob(os.path.join(f,'*.sents')):
                fin = open(sent_file,'r')
                for line in fin:
                    fields = line.decode('utf-8').strip().split(' ')
                    class_label = fields[0]
                    ##Convert + or - to +1 or -1
                    if class_label[0]=='+': class_label='+1'
                    elif class_label[0]=='-': class_label='-1'
                    tokens = fields[1:]
                    these_feats = extract_features(tokens)
                    write_to_output(class_label, these_feats, fd_feats)
                fin.close()
            ##Done
            os.kill(this_pid,signal.SIGTERM)
            print
    fd_feats.close()
    print 'Feature file:',feature_filename
    
    ##Convert this features file in the index file
    training_filename = os.path.join(output_folder,__name_training)
    fd_train = open(training_filename,'w')

    
    feature_file_obj = Cfeature_file(feature_filename)
    index_features = Cfeature_index()
    index_features.encode_feature_file_to_svm(feature_file_obj,out_fic=fd_train)
    print 'Training instances saved to ',training_filename
    fd_train.close()
    
    #Save the index of features that will be used for the classification
    index_filename = os.path.join(output_folder,__name_index )
    index_features.save_to_file(index_filename)
    print 'Index of features saved to',index_filename
    
    #Train the model using the file training_filename
    model_filename = os.path.join(output_folder,__name_model)
    params = '-c 0.5 -x 1'
    run_svmlight_learn(training_filename,model_filename,params)
    print 'Model trained and saved to',model_filename
Example No. 35
def train_classifier_from_folders(list_folders, output_folder):
    #Create the training feature file with format:
    #CLASS label FEAT FEAT FEAT
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    os.mkdir(output_folder)

    features_folder = os.path.join(output_folder, 'features')
    os.mkdir(features_folder)

    feature_filename = os.path.join(output_folder, __name_feature)
    fd_feats = open(feature_filename, 'w')
    for f in list_folders:
        this_pid = os.fork()
        if this_pid == 0:
            print 'Extracting features from', f, ' '
            while True:
                sys.stdout.write('.')
                sys.stdout.flush()
                time.sleep(1)
        else:
            #For each sent file
            for sent_file in glob.glob(os.path.join(f, '*.sents')):
                fin = open(sent_file, 'r')
                for line in fin:
                    fields = line.decode('utf-8').strip().split(' ')
                    class_label = fields[0]
                    ##Convert + or - to +1 or -1
                    if class_label[0] == '+': class_label = '+1'
                    elif class_label[0] == '-': class_label = '-1'
                    tokens = fields[1:]
                    these_feats = extract_features(tokens)
                    write_to_output(class_label, these_feats, fd_feats)
                fin.close()
            ##Done
            os.kill(this_pid, signal.SIGTERM)
            print
    fd_feats.close()
    print 'Feature file:', feature_filename

    ##Convert this features file in the index file
    training_filename = os.path.join(output_folder, __name_training)
    fd_train = open(training_filename, 'w')

    feature_file_obj = Cfeature_file(feature_filename)
    index_features = Cfeature_index()
    index_features.encode_feature_file_to_svm(feature_file_obj,
                                              out_fic=fd_train)
    print 'Training instances saved to ', training_filename
    fd_train.close()

    #Save the index of features that will be used for the classification
    index_filename = os.path.join(output_folder, __name_index)
    index_features.save_to_file(index_filename)
    print 'Index of features saved to', index_filename

    #Train the model using the file training_filename
    model_filename = os.path.join(output_folder, __name_model)
    params = '-c 0.5 -x 1'
    run_svmlight_learn(training_filename, model_filename, params)
    print 'Model trained and saved to', model_filename
Example No. 36
def main(_):
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
    if FLAGS.cfg_file is None:
        raise ValueError('You must supply the cfg file !')

    cfg = _cfg_from_file(FLAGS.cfg_file)
    train_cfg = cfg['train']

    # print all configs
    print('############################ cfg ############################')
    for k in cfg:
        print('%s: %s' % (k, cfg[k]))

    tf.logging.set_verbosity(tf.logging.INFO)
    #######################################################################
    ##############             single GPU version            ##############
    #######################################################################

    #### get dataset ####
    cls_dataset = dataset.get_dataset(dataset_folder=cfg['dataset_folder'],
                                      split=train_cfg['train_split'],
                                      cfg=train_cfg['dataset_opt'])

    #### build training dataset pipeline #####
    im_batch, label_batch = dataset.build_input_pipline(
        phase='train',
        dataset=cls_dataset,
        min_resize_value=cfg.get('min_resize_value', None),
        max_resize_value=cfg.get('max_resize_value', None),
        # train cfgs:
        batch_size=train_cfg['batch_size'],
        num_epoch=int(
            math.ceil(
                float(train_cfg['iters']) * train_cfg['batch_size'] /
                cls_dataset.num_examples)),
        shuffle=True,
        aug_opt=train_cfg.get('aug_opt', None),
        crop_size=cfg['corp_size'],
    )

    ##### get logits ####
    logits, endpoints = feature_extractor.extract_features(
        images=im_batch,
        num_classes=cls_dataset.num_classes,
        output_stride=cfg['output_stride'],
        global_pool=True,
        model_variant=cfg['model_variant'],
        weight_decay=train_cfg.get('weight_decy', 0),
        dropout_keep_prob=train_cfg.get('dropout_keep_prob', 1.0),
        regularize_depthwise=train_cfg.get('regularize_depthwise', False),
        reuse=tf.AUTO_REUSE,
        is_training=True,
        fine_tune_batch_norm=train_cfg.get('fine_turn_batch_norm', False),
        cfg=cfg)

    ##### build loss ####
    total_loss = build_loss(logits=logits,
                            labels=label_batch,
                            endpoints=endpoints,
                            loss_opt=train_cfg['loss_opt'])

    #### build optimizer ####
    global_step = slim.create_global_step()
    learning_rate = _configure_learning_rate(
        num_samples_per_epoch=cls_dataset.num_examples,
        global_step=global_step,
        train_cfg=train_cfg)
    optimizer = _configure_optimizer(
        learning_rate=learning_rate,
        train_cfg=train_cfg,
    )

    #### build train tensor ####
    grads_and_vars = optimizer.compute_gradients(
        loss=total_loss,
        var_list=_get_variables_to_train(train_cfg=train_cfg),
    )
    grad_updates = optimizer.apply_gradients(grads_and_vars=grads_and_vars,
                                             global_step=global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # batch norm
    update_ops.append(grad_updates)
    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
        train_tensor = tf.identity(total_loss, name='train_op')

    #### add summaries ####
    # Add summaries for model variables.
    for model_var in slim.get_model_variables():
        tf.summary.histogram(model_var.op.name, model_var)
    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        tf.summary.scalar('losses/%s' % loss.op.name, loss)
    if train_cfg['loss_opt'].get('use_reg_loss', False):
        tf.summary.scalar(
            'losses/reg_loss',
            tf.get_default_graph().get_tensor_by_name(
                'make_total_loss/reg_loss:0'))
    if train_cfg['loss_opt'].get('use_aux_loss', False):
        tf.summary.scalar(
            'losses/aux_loss',
            tf.get_default_graph().get_tensor_by_name(
                'make_total_loss/aux_loss/value:0'))
    tf.summary.scalar(
        'total_loss',
        tf.get_default_graph().get_tensor_by_name(
            'make_total_loss/total_loss:0'))
    # merge all summaries
    merged_summaries = tf.summary.merge_all()
    summaries_writer = tf.summary.FileWriter(logdir=FLAGS.output_dir,
                                             graph=tf.get_default_graph())

    #### set up session config ####
    # savers:
    model_variables = slim.get_model_variables()
    model_variables.append(tf.train.get_or_create_global_step())
    for mv in model_variables:
        print(mv.op.name)
    ckpt_saver = tf.train.Saver(var_list=model_variables, max_to_keep=10)
    new_ckpt_path = os.path.join(FLAGS.output_dir,
                                 cfg['model_variant'] + '.ckpt')
    save_ckpt_every = train_cfg.get('save_ckpt_every', 5000)
    # session config:
    sess_cfg = tf.ConfigProto(allow_soft_placement=True,
                              log_device_placement=False)
    sess_cfg.gpu_options.allow_growth = True

    #### train the model ####
    with tf.Session(config=sess_cfg) as sess:
        # init
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # restore vars from pretrained ckpt:
        if train_cfg.get('pretrian_ckpt_file', None) is not None:
            pretrain_ckpt = train_cfg['pretrian_ckpt_file']
            tf.logging.info('restore ckpt from: %s', pretrain_ckpt)
            restor_saver = tf.train.Saver(var_list=_var_to_restore(
                train_cfg.get('exclude_scopes', None)))
            restor_saver.restore(sess, pretrain_ckpt)

        # train
        for i in range(train_cfg['iters']):
            if (i % save_ckpt_every == 0):
                all_summaries, loss_now = sess.run(
                    [merged_summaries, train_tensor])
                # write summaries
                summaries_writer.add_summary(all_summaries, i)
                # save ckpt
                ckpt_saver.save(sess, new_ckpt_path, global_step=i)
            else:
                loss_now = sess.run(train_tensor)
            if i % 20 == 0:
                tf.logging.info('global step: %d, loss= %f', i, loss_now)
        # Final run
        all_summaries, loss_now = sess.run([merged_summaries, train_tensor])
        # write summaries
        summaries_writer.add_summary(all_summaries, train_cfg['iters'])
        # save ckpt
        ckpt_saver.save(sess, new_ckpt_path, global_step=train_cfg['iters'])

    print("End of Train !!!")
    essays = []
    genderlabels = []
    students = []
    for student, gender in gender_dict.items():
        with open('%s/%s.txt' % (datadir, student)) as f:
            text = f.read()
            text = re.sub('<[^<]+?>', '', text)  # remove vestigial xml
            essays.append(text)
            genderlabels.append(gender)
            students.append(student)
    return essays, genderlabels, students


def load_conf_file():
    conf = set(line.strip() for line in open(conffile))
    return conf


def predict_gender(X, Y):
    scores = cross_val_score(GaussianNB(), X, Y, scoring='accuracy', cv=10)
    return scores.mean()


if __name__ == "__main__":
    gender_dict = load_balanced_gender_labels()
    essays, genderlabels, students = load_essays(gender_dict)
    conf = load_conf_file()
    features = fe.extract_features(essays, conf)

    print(predict_gender(features, genderlabels))
Example No. 38
def _extract_features(images,
                      model_options,
                      weight_decay=0.0001,
                      reuse=None,
                      is_training=False,
                      fine_tune_batch_norm=False):
  """Extracts features by the particular model_variant.
  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    weight_decay: The weight decay for model variables.
    reuse: Reuse the model variables or not.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
  Returns:
    concat_logits: A tensor of size [batch, feature_height, feature_width,
      feature_channels], where feature_height/feature_width are determined by
      the images height/width and output_stride.
    end_points: A dictionary from components of the network to the corresponding
      activation.
  """
  features, end_points = feature_extractor.extract_features(
      images,
      output_stride=model_options.output_stride,
      multi_grid=model_options.multi_grid,
      model_variant=model_options.model_variant,
      weight_decay=weight_decay,
      reuse=reuse,
      is_training=is_training,
      fine_tune_batch_norm=fine_tune_batch_norm)

  if not model_options.aspp_with_batch_norm:
    return features, end_points
  else:
    batch_norm_params = {
        'is_training': is_training and fine_tune_batch_norm,
        'decay': 0.9997,
        'epsilon': 1e-5,
        'scale': True,
    }

    with slim.arg_scope(
        [slim.conv2d, slim.separable_conv2d],
        weights_regularizer=slim.l2_regularizer(weight_decay),
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        padding='SAME',
        stride=1,
        reuse=reuse):
      with slim.arg_scope([slim.batch_norm], **batch_norm_params):
        depth = 512
        branch_logits = []

        if model_options.add_image_level_feature:
          pool_height = scale_dimension(model_options.crop_size[0],
                                        1. / model_options.output_stride)
          pool_width = scale_dimension(model_options.crop_size[1],
                                       1. / model_options.output_stride)
          image_feature = slim.avg_pool2d(
              features, [pool_height, pool_width], [pool_height, pool_width],
              padding='VALID')
          image_feature = slim.conv2d(
              image_feature, depth, 1, scope=_IMAGE_POOLING_SCOPE)
          image_feature = tf.image.resize_bilinear(
              image_feature, [pool_height, pool_width], align_corners=True)
          image_feature.set_shape([None, pool_height, pool_width, depth])
          branch_logits.append(image_feature)

        # Employ a 1x1 convolution.
        branch_logits.append(slim.conv2d(features, depth, 1,
                                         scope=_ASPP_SCOPE + str(0)))

        if model_options.atrous_rates:
          # Employ 3x3 convolutions with different atrous rates.
          for i, rate in enumerate(model_options.atrous_rates, 1):
            scope = _ASPP_SCOPE + str(i)
            if model_options.aspp_with_separable_conv:
              aspp_features = _split_separable_conv2d(
                  features,
                  filters=depth,
                  rate=rate,
                  weight_decay=weight_decay,
                  scope=scope)
            else:
              aspp_features = slim.conv2d(
                  features, depth, [3, 1], rate=rate, scope=scope)
            branch_logits.append(aspp_features)

        # Merge branch logits.
        concat_logits = tf.concat(branch_logits, 3)
        concat_logits = slim.conv2d(
            concat_logits, depth, 1, scope=_CONCAT_PROJECTION_SCOPE)
        concat_logits = slim.dropout(
            concat_logits,
            keep_prob=0.5,
            is_training=is_training,
            scope=_CONCAT_PROJECTION_SCOPE + '_dropout')

        return concat_logits, end_points
Example No. 39
 def gen_axis_features(idx):
   axisname = vis.data.dtype.names[idx]
   col = vis.data[axisname]
   return extract_features(None, col, config, section)