Example #1
def train(modelnames=[], features=[], limit=0, stemmer_type=None, predict=False, standardized=False, plot=False):
	"""
	----------------------------------------
	train - cross validate - predict - plot
	----------------------------------------
	"""
	X, y, z = get_features(limit=limit, features=features, stemmer_type=stemmer_type, db_name="yelp_train", standardized=standardized)
	del z #! not used when training
	
	for name in modelnames:
		model = filter(lambda x: x['name'] == name, model_config)[0]
		
		#! ---------------------
		module_ = __import__(model['module'], fromlist=model['from'])
		class_ = getattr(module_, model['name'])
		clf = class_(**model['kwargs'])
		model_name = str(clf.__class__).split(".")[-1].split("'")[0]
		
		print clf
		cross_validate(X,y,clf,folds=5,model_name=model_name,plot=plot)
								
		if model['feature_imp']:
			print 'Feature Importances =======', list(clf.feature_importances_)
		gc.collect()
	
		if predict:
			print '====== predicting ......'
			#! grab the complete test set for prediction
			Xtest, ytest, ztest = get_features(limit=0, features=features, stemmer_type=stemmer_type, db_name="yelp_test", standardized=standardized)
			predict_and_save(X, y, ztest, Xtest, clf, features)
			print '====== predicting done ......'
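
The loop above instantiates each classifier dynamically from a model_config list via __import__ and getattr. A minimal sketch of one entry it could consume, assuming scikit-learn estimators (the real config is not shown in this snippet):

# Hypothetical model_config entry matching the keys read by the loop above.
model_config = [
    {
        'name': 'RandomForestClassifier',    # class name resolved with getattr
        'module': 'sklearn.ensemble',        # module path passed to __import__
        'from': ['RandomForestClassifier'],  # fromlist argument for __import__
        'kwargs': {'n_estimators': 100},     # constructor keyword arguments
        'feature_imp': True,                 # whether to print feature_importances_
    },
]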
Example #2
def main():
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    for author_id, row in train.iterrows():
        for paper_id in row["DeletedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                target.append(1)
                features.append(s)
        for paper_id in row["ConfirmedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                target.append(0)
                features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))

    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    try:
        classifier.fit(feature_matrix, target)
    except:
        import pdb
        pdb.set_trace()

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #3
def test(dataset_size, model_type):
    """ opens fit dataset and trains SVM/LogReg/Forest model with it, then tests it"""
    print "MODEL TEST", dataset_size, model_type

    dset = dataset.read('contradictions', dataset_size)
    data, targets = [], []
    for case in dset['content']:
        data.append(case)
        targets.append(case['contradiction'])

    fit_data, test_data = [], []
    fit_cases, test_cases, fit_target, test_target = train_test_split(
        data, targets, test_size=0.25, shuffle=True, random_state=0)
    for fit_case in fit_cases:
        fit_data.append(
            get_features(fit_case['sentence'], fit_case['hypothesis']))

    for test_case in test_cases:
        test_data.append(
            get_features(test_case['sentence'], test_case['hypothesis']))

    model = ClassificationModel(model_type)
    start_time = time.time()
    model.train(fit_data, fit_target, dataset_size)
    elapsed_time = time.time() - start_time
    test_results = model.test(test_data)

    with open(
            config.CONTRADICTIONS_RESULTS_PATH.format(dataset_size,
                                                      model_type),
            'wb') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow([
            'hypothesis', 'sentence', 'type', 'contradiction', 'prediction',
            'features'
        ])
        for (test_case, result, features) in zip(test_cases, test_results,
                                                 test_data):
            csv_writer.writerow([
                test_case['hypothesis'], test_case['sentence'],
                test_case['type'], test_case['contradiction'], result, features
            ])

    precision = metrics.precision_score(test_target, test_results)
    recall = metrics.recall_score(test_target, test_results)
    f1_score = metrics.f1_score(test_target, test_results)

    print "FIT TIME", elapsed_time
    print "PRECISION", precision
    print "RECALL", recall
    print "F1 SCORE", f1_score
    model.save(dataset_size)
Example #4
def main():

    input_data, labels = input_parser.parse_input()
    X, Y = ft.get_features(input_data, labels)

    print("X.columns:")
    print(X.columns)

    folds = 5
    print("Selecting rows for " + str(folds) + "-fold validation")
    kf = KFold(n_splits=folds)
    kf.get_n_splits(X)

    summed_accuracy = 0

    fold_cnt = 1
    for train_index, test_index in kf.split(X):

        print('Fold: ' + str(fold_cnt))
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]  # positional indexing matches the KFold indices
        Y_train, Y_test = Y[train_index], Y[test_index]
        print(X_train)
        model = KNeighborsClassifier(n_neighbors=3)
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)

        summed_accuracy += accuracy_score(Y_test, predictions)
        print(confusion_matrix(Y_test, predictions))
        print(classification_report(Y_test, predictions))
        fold_cnt += 1

    print("Total accuracy: " + str(summed_accuracy / folds))
Example #5
    def _original_o(self):

        img2 = self.image.copy()

        imgd = img2.copy()

        cv2.imwrite(dir + "input.jpg", imgd)

        image, m, orientations = preprocess(imgd)
        for i in range(image.shape[0]):
            for j in range(image.shape[1]):
                if image[i][j] > 50: image[i][j] = 1
                else: image[i][j] = 0

        image, xmax, xmin, ymax, ymin = cropfingerprint(image)
        orientations = orientations[xmin:xmax + 1, ymin:ymax + 1]

        cv2.imwrite(dir + "imagen_mejorada.jpg", image * 255)
        z = ZhangSuen(image)
        img = z.performThinning()
        thinned = img.copy()
        cv2.imwrite(dir + "salida_adelgazado.jpg", (1 - img) * 255)

        coords, mask = z.extractminutiae(thinned)
        cv2.imwrite(dir + "salida_minucias.jpg", mask * 255)
        fincoords = z.remove_minutiae(
            coords,
            cv2.imread(dir + "input.jpg", 0)[xmin:xmax + 1, ymin:ymax + 1])

        vector = z.get_ridge_count(fincoords, image)
        feature_vectors = features.get_features(fincoords, vector,
                                                orientations)

        return feature_vectors
Example #6
def extract_by_mask(maskshp, raster, out, nodata=0):
    """Same as the 'extractByMask_ds' function except that the input
    is raster file.

    """
    with rasterio.open(raster) as src:
        extract_by_mask_rio_ds(get_features(maskshp), src, out, nodata=nodata)
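
A brief usage sketch with hypothetical file paths (maskshp is whatever get_features accepts as a mask geometry source in this module):

# Hypothetical call: clip raster.tif to the mask geometry and write clipped.tif.
extract_by_mask("mask.shp", "raster.tif", "clipped.tif", nodata=0)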
Example #7
def main(corpus_file_name, annotations_file_name):
    vectors_features_list = list()
    labels = list()
    annotations = parse_annotation(annotations_file_name)
    for sent_id, sent_str in read_lines(corpus_file_name):
        sent = nlp(sent_str)
        print("#id:", sent_id)
        print("#text:", sent.text)
        print()
        entities = sent.ents
        for i, first_ent in enumerate(entities):
            for second_ent in entities[:i] + entities[i + 1:]:
                pair_ent = (str(first_ent), str(second_ent))
                if pair_ent in annotations[sent_id].keys():
                    rel = annotations[sent_id][pair_ent]
                    vectors_features_list.append(
                        get_features(first_ent, second_ent))
                    labels.append(rel)

    transform_of_features, features_map, model = create_model(
        vectors_features_list, labels)
    write_feature_map(
        'C:/Users/DELL/PycharmProjects/NLP/Assignment_4/feature_map.txt',
        features_map)
    write_logistic_regression_model(
        'C:/Users/DELL/PycharmProjects/NLP/Assignment_4/model_file', model)
 def load_file(self, path, verbs):
     """
     Open RNC XML and get all unique tokens
     """
     print(path)
     tree = ET.parse(path)
     for elem in tree.iter('w'):
         word = ''.join(elem.itertext()).lower().replace('`', '') # remove stress
         for item in elem.iter('ana'):
             info = item
             print(info)
             try:
                 info_prev = [t for t in info.getparent().getprevious() if t.tag == 'ana'][0]
             except TypeError:
                 info_prev = None
             except IndexError:
                 info_prev = None
                 #print(ET.tostring(info.getparent().getprevious(), encoding='utf-8'))
             break
         #lemma = [item.get("lex") for item in elem.iter('ana')] # todo: deal with homonymy?
         lemma = info.get('lex')
         # get POS tag
         tag = info.get("gr").split('=')[0].split(',')[0]
         if lemma in verbs and tag == 'V':
             features = get_features(info, info_prev)
             verb = Verb(lemma, word, *features)
             if verb.form == 'partcp':
                 self.partcp.add(verb)
             elif verb.form == 'ger':
                 self.gerund.add(verb)
             self.verbs.add(verb)
Example #9
def log_feature_importances(model, importance_plot_file):
    final_features = get_features(model.steps[0][1])
    features = {f"f{ii}": feature for ii, feature in enumerate(final_features)}
    importances = (
        model.steps[-1][1].get_booster().get_score(importance_type="gain")
    )

    feature_importances = (
        pd.DataFrame(
            [
                {
                    "feature": feature,
                    "importance": get(coded_feature, importances, 0.0),
                }
                for coded_feature, feature in features.items()
            ]
        )
        .sort_values("importance", ascending=True)
        .reset_index(drop=True)
    )

    ax = feature_importances.plot(y="importance", x="feature", kind="barh")
    ax.get_figure().subplots_adjust(left=0.25)
    ax.get_figure().savefig(importance_plot_file)
    mlflow.log_artifact(importance_plot_file)
Example #10
def classify_base(data, y, tests, debug):
    results = []

    for test, test_name in tqdm(tests, file=sys.stdout, total=len(tests)):
        X = get_features(data, test)
        test_results = TestResults(test_name, len(X.columns), [])
        result_records = get_classifiers_results_records(
            X, y, test_name, debug)

        for record in result_records:
            test_results.results_list.append(record)

        results += [test_results]

    dfs_res = []
    headers = ['Features (#features)', 'Acc.', 'Prec.', 'Recall', 'F']

    for i in results:
        df_res = pd.DataFrame(
            [('{} ({}) {}'.format(i.tested_features, i.num_features,
                                  item.classifier_name), item.accuracy,
              item.precision, item.recall, item.f1)
             for item in i.results_list],
            columns=headers)
        dfs_res += [df_res]

    dfs_res = pd.concat(dfs_res, axis=0)
    dfs_res.to_csv(os.path.join(OUTPUTS_DIR, "classifier_base.csv"),
                   index=False)
Example #11
def classify_base_best_classifier(data, y, tests):
    cont_results = []
    pat_results = []

    for test, test_name in tqdm(tests, file=sys.stdout, total=len(tests)):
        X = get_features(data, test)
        cont_test_results = TestResults(test_name, len(X.columns), [])
        pat_test_results = TestResults(test_name, len(X.columns), [])
        cont_result_record, pat_result_record = get_best_classifier_results_records(
            X, y)
        cont_test_results.results_list.append(cont_result_record)
        pat_test_results.results_list.append(pat_result_record)
        cont_results += [cont_test_results]
        pat_results += [pat_test_results]

    cont_cls_results = []
    pat_cls_results = []

    for i in cont_results:
        res = i.results_list[0]  # only 1 item
        res = [res.precision, res.recall, res.f1]
        cont_cls_results += [res]

    for i in pat_results:
        res = i.results_list[0]  # only 1 item
        res = [res.precision, res.recall, res.f1]
        pat_cls_results += [res]

    tstatistic, pvalue = stats.ttest_ind(cont_cls_results, pat_cls_results)
    headers = ['t-statistic', 'p-value']
    df = pd.DataFrame([(t, p) for t, p in zip(tstatistic, pvalue)],
                      columns=headers)
    df.insert(0, 'scorer', ['precision', 'recall', 'f1-score'])
    df.to_csv(os.path.join(OUTPUTS_DIR, "t-test_best_classifier.csv"),
              index=False)
Example #12
	def rank(self):
		tokenizer = RegexpTokenizer(r'\w+')
		ps = nltk.stem.PorterStemmer()
		weights = get_features(self.verbose)
		ranks = PriorityQueue()
		for filename in os.listdir("./dump-texts"):
			file = os.path.join("dump-texts",filename)
			url = filename[10:-4].replace('--','/')
			if self.verbose:
				print "\nSCORING " + url
			with open(file, 'r') as f:
				text = f.read().lower().decode('utf-8')
				text = re.sub(r'\d+', '', text)
				filtered_text = [w for w in tokenizer.tokenize(text) if w not in stopwords.words('english')]
				cur_tokens = [ps.stem(x) for x in filtered_text]
				score = 0
				text_bonus = 0
				for t in cur_tokens:
					if 'buy' in t:
						text_bonus += 2
					elif 'price' in t:
						text_bonus += 2
					if t in weights:
						score += weights[t]
					score += text_bonus
				score = score/len(cur_tokens)
				score += self._calculate_url_bonus(url)
				if self.verbose:
					print "SCORE = " + str(score) + "\n"
				ranks.put((-1*score, url))
		self._dump_ranks(ranks)
		print "************* Ranks computed!"
def main():
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)
        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        paper_ids_sorted = sorted(zip(preds, row["PaperIds"]), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Example #14
def create_training_dataset(image_list, label_list):
    print('[INFO] Creating training dataset on %d image(s).' % len(image_list))

    X = []
    y = []

    for i, (image, label) in enumerate(zip(image_list, label_list)):
        image_file, image = image
        label_file, label = label
        image_name = os.path.basename(image_file)
        print(f'Now on {image_name}')
        if int(image_name.split('.')[0]) < 22:
            p = 800
        else:
            p = 100
        regions = ft.get_regions(image, p)
        features = ft.get_features(image, regions)
        labels = ft.get_labels(label, regions)
        X.append(features)
        y.append(labels)

    X = np.vstack(X)
    y = np.concatenate(y)

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print('[INFO] Feature vector size:', X.shape)

    return X, y
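
A short usage sketch; the split mirrors the commented-out train_test_split line above (the variable names and the import are illustrative):

from sklearn.model_selection import train_test_split

# Hypothetical usage: build the feature matrix, then hold out 20% for evaluation.
X, y = create_training_dataset(image_list, label_list)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)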
def main():
    print("Reading the test data") 
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)
        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:,1]
        paper_ids_sorted = sorted(zip(preds,row["PaperIds"]), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Example #16
def all_sat(constraint):
    s = []
    f = features.get_features()
    for k in f:
        if constraint.constrain(f[k]):
            s.append(k)
    return s
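
all_sat only relies on the constraint exposing a constrain(value) method that returns a boolean; a minimal hypothetical constraint object could look like this:

# Hypothetical constraint: keeps feature keys whose value clears a threshold.
class MinValueConstraint(object):
    def __init__(self, threshold):
        self.threshold = threshold

    def constrain(self, value):
        return value >= self.threshold

matching_keys = all_sat(MinValueConstraint(0.5))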
Example #17
 def prob_classify(self, source):
     """
     wrapper for `prob_classify` of the nltk classifier
     """
     source = dumb_strip(source)
     featureset = get_features(source)
     return self.classifier.prob_classify(featureset)
def analyse_file(input_file, save_counts=False):
    """
    Calculates readability formulae for a single file
    :param input_file: input file
    :param save_counts: saves lists of what was counted if set to true
    :return: dictionary containing formulae, features and counts for the document
    """

    # get file content
    with open(input_file, 'r') as content_file:
        text = content_file.read()

    # get counts
    tokenized_sentences = get_tokenized_sentences(text)
    if save_counts:
        rval = cnt.get_and_save_counts(tokenized_sentences, input_file)
    else:
        rval = cnt.get_counts(tokenized_sentences)

    # get features
    rval.update(feat.get_features(rval))

    # get formulae
    rval.update(rf.get_formulae(rval))

    return rval
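
A short usage sketch with a hypothetical input path:

# Returns a dict of counts, features, and readability formulae for one document.
scores = analyse_file('essay.txt', save_counts=False)
print(scores)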
def main():
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    for author_id, row in train.iterrows():
        for paper_id in row["DeletedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                target.append(1)
                features.append(s)
        for paper_id in row["ConfirmedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                target.append(0)
                features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))

    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    try:
        classifier.fit(feature_matrix, target)
    except:
        import pdb;pdb.set_trace()

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #20
def add_user():
    hands = [get_hand(), get_hand(), get_hand()]

    feature_vectors = [get_features(h) for h in hands]

    hands[0].plot()

    return feature_vectors
    def load(self):
        
        self.features = []
        for i in xrange(self.repeat):
            self.features.append(get_features(self.path + "/" + str(i+1) + ".wav"))

        self.mean = np.mean([len(x) for x in self.features])
        self.std = np.std([len(x) for x in self.features])
Example #22
def information(filepath):
    '''
  for every InkML file, read with BeautifulSoup
  obtain information:
  uid <- annotation
  traces <- trace
  traceGroups <- traceGroup
  
  preprocess & features
   - interpolate
   - remove hooks
   - remove duplicates
   - normalize
   - smoothen
   
  features:
   - parallelity
  '''
    try:
        ######################################################
        # get information from file
        ######################################################
        result = get_information(filepath)
        if not result:
            print(f'No traces found for: {filepath}')
            return
        if len(result) == 3:
            uid, traces, traces_map = result
        if len(result) == 4:
            uid, traces, traces_map, traceGroup = result

        ######################################################
        # preprocess & get features
        ######################################################
        traces = preprocess(traces, filepath)
        if len(result) == 3:
            rows = get_features(filepath, uid, traces, traces_map)
        else:
            rows = get_features(filepath, uid, traces, traces_map, traceGroup)
        ######################################################
        if not rows:
            # some files have only one trace id, hence no result rows
            return
        return rows
    except Exception as e:
        print('Error for %s: %s' % (filepath, str(e)))
Example #23
def classify_question_types(data, y, tests, debug):
    answer_ranges = [('([1-9]|1[0-4])', '1-14'), ('(1[5-8])', '15-18')]
    results = []

    for test, test_name in tqdm(tests, file=sys.stdout, total=len(tests)):
        results_types = {
            '1-14': TestResults(test_name, len(test), []),
            '15-18': TestResults(test_name, len(test), [])
        }

        for regex, ans_range in answer_ranges:
            X = get_features(data, test, regex)
            results_types[ans_range].num_features = len(X.columns)
            answer_results = AnswersResults(ans_range, [])
            result_records = get_classifiers_results_records(
                X, y, test_name + ' q{}'.format(ans_range), debug)

            for record in result_records:
                answer_results.results_list.append(record)

            results_types[ans_range].results_list.append(answer_results)
        results += [results_types]

    headers = ['Acc. q{}', 'Prec. q{}', 'Recall q{}', 'F q{}']

    for _, ans_range in answer_ranges:
        dfs_res = []
        features_list = []

        for result in results:
            results_types = result[ans_range]
            df_tests = []
            # iterate classifier names; all answer ranges use the same classifiers, so index 0 is representative
            for cls in results_types.results_list[0].results_list:
                feat_head = '{} ({}) {}'.format(results_types.tested_features,
                                                results_types.num_features,
                                                cls.classifier_name)
                features_list.append(feat_head)

            for item in results_types.results_list:
                ans_headers = [
                    head.format(item.answer_number) for head in headers
                ]
                df_test = pd.DataFrame(
                    [(i.accuracy, i.precision, i.recall, i.f1)
                     for i in item.results_list],
                    columns=ans_headers)
                df_tests += [df_test]

            df_tests = pd.concat(df_tests, axis=1)
            dfs_res += [df_tests]

        dfs_res = pd.concat(dfs_res, axis=0)
        dfs_res.insert(0, 'Features (#features)', features_list)
        dfs_res.to_csv(os.path.join(
            OUTPUTS_DIR, "classifier_answers_{}.csv".format(ans_range)),
                       index=False)
Example #24
    def test(self, remain_time_budget=None):
        super(EnhancementStage, self).train(remain_time_budget)

        if self._stage_test_loop_num == 0 or self._spec_len_status == 2:
            self._spec_len_status = 0
            self._feature_params['mode'] = 'test'
            x = get_features(self.ctx.raw_test_data, self._feature_params)
            x = np.array(x)
            self._pre_test_x = x[:, :, :, np.newaxis]
            log(f"stage_loop_num={self._stage_loop_num}, preprocess {len(self._pre_test_x)} test data, shape {self._pre_test_x.shape}"
                )

        while self._stage_loop_num <= self._decide_warmup_loops():
            self.train(remain_time_budget=remain_time_budget)

        score = 0
        if self._decide_use_all_data() is False:
            score = balanced_acc_metric(self._pre_val_y,
                                        self._model.predict(self._pre_val_x))
            if (score - 0.01 > self.ctx.max_score <
                    0.90) or (score >= self.ctx.max_score >= 0.90):
                self._better_score_cnt += 1
            log("resnet score {} max_score {}".format(score,
                                                      self.ctx.max_score))

        if self._is_predict() and self._decide_use_all_data():
            if self._is_ensenmble():
                if self._is_good_train():
                    preds = self._model.predict(self._pre_test_x, batch_size=8)
                    # normalize logits
                    # preds = (preds - np.min(preds)) / (np.max(preds) - np.min(preds))
                    self._all_preds[self._stage_loop_num] = preds
                    log("this round is good train")
                else:
                    log("this round is bad train")
                preds = self._ensemble_preds()
            else:
                preds = self._model.predict(self._pre_test_x, batch_size=8)
            self._last_pred = preds
            self._last_pred_loop = self._stage_loop_num
        elif self._decide_use_all_data() is False:
            if len(self.ctx.lr_last_preds) > 0:
                preds = self.ctx.lr_last_preds
            else:
                preds = self.ctx.ensemble_predicts()
        else:
            preds = self._last_pred

        self._stage_test_loop_num += 1
        if self._stage_loop_num >= self._stage_end_loop_num \
                or (
                self._stage_test_loop_num >= 8 and self._decide_use_all_data() is False and self.ctx.max_score > 0.4):
            self._need_transition = True
        if self._need_transition:
            self._transition()

        return preds
    def load(self):

        self.features = []
        for i in xrange(self.repeat):
            self.features.append(
                get_features(self.path + "/" + str(i + 1) + ".wav"))

        self.mean = np.mean([len(x) for x in self.features])
        self.std = np.std([len(x) for x in self.features])
Example #26
 def predict(self, text: str) -> int:
     """
     Predict review score based on review title.
     :param text: Review title
     :return: Predicted score, in [0, 100] range
     """
     doc = self.nlp(text)
     X = get_features([doc], max_length)
     y = self.model.predict(X)
     return int(round(y[0][0] * 100))
Example #27
def get_trade():
    """
    This function gets the predicted trade for today and the relative capital amount to trade
    returns the prediction (what to go long/short in for today) and how much capital to spend
    """
    df = pd.read_csv("Data/database.csv", index_col='Date')
    df = train_model.compute_label(df=df)
    features = get_features()
    prediction, weight = train_model.fit_model(df=df, data=features)
    return prediction, weight
def process(img):
    img = crop.crop(img)
    t = classify.classify(img)
    if t == Type.BAND:
        return t, None, img, img, (None, None, None, None)
    mask, _, _ = extract.extract(img)
    masked = cv2.bitwise_and(img, img, mask=mask)
    x, y, w, h = watch_features.bounding_box(mask)
    #img = img[y:y+h, x:x+w]
    f = features.get_features(img)
    return t, f, img, masked, (x, y, w, h)
def get_features(record):
    d = {
        'OGid': record.OGid,
        'start': record.start,
        'stop': record.stop,
        'ppid': record.ppid
    }
    if not (len(record.segment) == 0 or 'X' in record.segment
            or 'U' in record.segment):
        d.update(features.get_features(record.segment))
    return d
Example #30
 def predict(self, token, tokens, weights=None):
     """Gready head prediction used for training."""
     scores = []
     features = []
     for head in tokens:
         feats = get_features(head, token, tokens, **self.feature_opts)
         score = self.score(feats)
         features.append(feats)
         scores.append(score)
     guess = np.argmax(scores)
     return guess, features
    def original_stuff(self):

        img2 = self.image.copy()
        #img2 = shiftcorrection(img2).copy()
        # cv2.imwrite("shifted1.jpg", img2)
        #angle,xc,yc = correctrotation(img2)
        #img2 = 255 - img2
        #self.checker1 = img2.copy()
        #rows, cols = img2.shape
        #M = cv2.getRotationMatrix2D((cols/2,rows/2),angle,1)
        #dst = cv2.warpAffine(img2,M,(cols,rows))
        #dst = 255 - dst
        imgd = img2.copy()
        #print("original angle")
        #print angle

        cv2.imwrite(dir + "input.jpg", imgd)

        image, m, orientations = preprocess(imgd)
        for i in range(image.shape[0]):
            for j in range(image.shape[1]):
                if image[i][j] > 50: image[i][j] = 1
                else: image[i][j] = 0

        # print("done")
        # image = scipy.ndimage.binary_closing(image, structure=np.ones((3,3))).astype(np.int)
        # image = scipy.ndimage.binary_opening(image, structure=np.ones((3,3))).astype(np.int)

        image, xmax, xmin, ymax, ymin = cropfingerprint(image)
        orientations = orientations[xmin:xmax + 1, ymin:ymax + 1]

        # orientations, xmax, xmin, ymax, ymin = helper.find_roi(image,orientations)
        # image = image[xmin:xmax+1, ymin:ymax+1]

        cv2.imwrite(dir + "intermediate-input.jpg", image * 255)
        z = ZhangSuen(image)
        img = z.performThinning()
        thinned = img.copy()
        cv2.imwrite(dir + "thinnedimage-input.jpg", (1 - img) * 255)
        # print "dome"
        coords, mask = z.extractminutiae(thinned)
        cv2.imwrite(dir + "minu-input.jpg", mask * 255)
        fincoords = z.remove_minutiae(
            coords,
            cv2.imread(dir + "input.jpg", 0)[xmin:xmax + 1, ymin:ymax + 1])
        # rotatecoords, angle, maskedimage = z.rotate_minutiae(fincoords, cv2.imread("1.jpg", 0))
        # cv2.imwrite("minutiaeextracted.jpg", (maskedimage)*255)
        vector = z.get_ridge_count(fincoords, image)
        feature_vectors = features.get_features(fincoords, vector,
                                                orientations)

        return feature_vectors
Example #32
def ts_forecasting():

    args = input_cmd()

    # get energy consumption data
    load = args.load
    f_steps = args.steps

    data = get_dataset(load_to_predict=load)

    c_target = data["energy"]
    t_target, f_target, fcast_range = forecast_split(c_target, n_steps=f_steps)

    # ML methods
    features, target = get_features(t_target)
    lags = [int(f.split("_")[1]) for f in features if "lag" in f]
    forecaster = Forecaster(f_steps, lags=lags)

    print("Forecast with Linear Regression model")
    model, cv_score, test_score = linear_model(features, target)

    if args.fcast == "direct":
        fcast_linear = forecaster.direct(t_target, linear_model)
    elif args.fcast == "recursive":
        fcast_linear = forecaster.recursive(t_target, model)

    fcast_score = mape(f_target, fcast_linear)
    print(f"""
Linear Regression scores
--------------
Cross-validation MAPE: {round(cv_score, 2)}%
Test MAPE: {round(test_score, 2)}%
{args.fcast.capitalize()} Forecast MAPE: {round(fcast_score, 2)}%
    """)

    print("Forecast with XGBoost model")
    model, cv_score, test_score = xgboost_model(features, target, max_evals=25)

    if args.fcast == "direct":
        fcast_xgb = forecaster.direct(t_target, xgboost_model)
    elif args.fcast == "recursive":
        fcast_xgb = forecaster.recursive(t_target, model)

    fcast_score = mape(f_target, fcast_xgb)
    print(f"""
XGBoost scores
--------------
Cross-validation MAPE: {round(cv_score, 2)}%
Test MAPE: {round(test_score, 2)}%
{args.fcast.capitalize()} Forecast MAPE: {round(fcast_score, 2)}%
    """)
Example #33
def predict():
    i = 5
    X_test = feat.get_features(f_test)
    scaler = joblib.load("models/scaler" + str(i) + ".save")
    model = joblib.load("models/model" + str(i) + ".save")
    x_test_scaled = scaler.transform(X_test)
    joblib.dump(x_test_scaled, "models/x_to_pred_scaled" + str(i) + ".save")
    y_predicted = model.predict(x_test_scaled)
    output = pd.DataFrame({
        "index": f_test["index_absolute"][:],
        "sleep_stage": y_predicted
    })
    output.to_csv("output" + str(i) + ".csv", index=False)
    print("over")
Example #34
def __gen_features():
    # Get train and test data
    print("Loading train and test data")
    x_train, y_train, x_test, y_test = get_data(DATA_DIRS,
                                                SETTINGS['train_test_split'])

    if DEBUG_VARS['trim_data'] is not None:
        x_train = x_train[:len(x_train) // DEBUG_VARS['trim_data']]
        y_train = y_train[:len(y_train) // DEBUG_VARS['trim_data']]
        x_test = x_test[:len(x_test) // DEBUG_VARS['trim_data']]
        y_test = y_test[:len(y_test) // DEBUG_VARS['trim_data']]

    # Get features
    t1 = time.time()
    print('Extracting features from the train data')
    x_train = get_features(x_train, SETTINGS['feature'])
    print('Extracting features from the test data')
    x_test = get_features(x_test, SETTINGS['feature'])
    t2 = time.time()
    print('Feature extraction took', round(t2 - t1, 4), 'Seconds')

    if SETTINGS['feature']['save']:
        with open(
                path.join(SETTINGS['feature']['path'],
                          SETTINGS['feature']['name']), 'wb') as f:
            print("Saving features.")
            features = {
                'x_train': x_train,
                'y_train': y_train,
                'x_test': x_test,
                'y_test': y_test
            }
            pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)
        with open(path.join(SETTINGS['feature']['path'], 'settings.p'),
                  'wb') as f:
            pickle.dump(SETTINGS['feature'], f, pickle.HIGHEST_PROTOCOL)
    return x_train, y_train, x_test, y_test
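
Since __gen_features optionally pickles the feature dict, a matching reload step could look like this sketch (it reuses the same SETTINGS keys; not part of the original snippet):

import pickle
from os import path

# Reload the features saved by __gen_features (assumes SETTINGS is in scope).
with open(path.join(SETTINGS['feature']['path'], SETTINGS['feature']['name']), 'rb') as f:
    cached = pickle.load(f)
x_train, y_train = cached['x_train'], cached['y_train']
x_test, y_test = cached['x_test'], cached['y_test']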
Example #35
def train():
    n_pts = None  #change to test on a subset of the data
    X_features = feat.get_features(f_train, n_pts)
    joblib.dump(X_features, "models/X_features.save")
    if n_pts is not None:
        y_train_vals = y_train_raw["sleep_stage"][:n_pts].values
    else:
        y_train_vals = y_train_raw["sleep_stage"][:].values

    # convert the output to strings
    y_train_vals = [str(y) for y in y_train_vals]

    #train model and save results
    print("training")
    train_model.train_full_model(X_features, y_train_vals)
def train_nn(save_model=False):
    '''
    Train neural network model
    :param save_model: True if model should be saved to file
    '''

    ### TRAINING ###
    # Create model

    # TODO: Here you can modify the architecture of the neural network model and experiment with different parameters
    model = Sequential()
    model.add(
        Dense(
            1,  # TODO: Number of hidden layer neurons
            input_dim=len(get_features()),
            activation='relu'))

    # TODO: Possible to add additional neural network layers here
    # TODO: Use model.add(Dense(number of hidden layer neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # TODO: Optional; add early stopping as callback
    history = model.fit(
        x=x_train,
        y=y_train,
        validation_data=[x_val, y_val],
        batch_size=50,  # Number of data samples to run through network before parameter update
        epochs=1,  # TODO: Number of times to run entire training set through network
        shuffle=True,
        callbacks=[]).history

    score = model.evaluate(x_test, y_test,
                           batch_size=50)  # Evaluate model on test set
    print('Test loss:%f' % (score[0]))
    print('Test accuracy:%f' % (score[1]))

    if save_model:
        model.save('./nn_model.h5')
        print("Model saved")

    plot_training_history(history)
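
The TODO above suggests early stopping; a minimal sketch of a Keras callback that could be passed through callbacks=[...] (assuming a standard Keras installation):

from keras.callbacks import EarlyStopping

# Stop training once validation loss has not improved for 3 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=3)
# then: model.fit(..., callbacks=[early_stop])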
Example #37
def arguments(argv=sys.argv[1:]):
    parser = argparse.ArgumentParser()

    names = ', '.join(tests.__all__)

    parser.add_argument(
        'tests', nargs='*',
        help='The list of tests to run.  Tests are: ' + names)

    features = ', '.join(FEATURES)
    parser.add_argument(
        '--features', default=[], action='append',
        help='A list of features separated by colons.  Features are: ' +
        features)

    parser.add_argument(
        '--force', '-f', action='store_true',
        help='Do not wait for a prompt')

    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='More verbose output')

    args = parser.parse_args(argv)

    test_list = args.tests or tests.__all__
    all_tests = [(t, getattr(tests, t, None)) for t in test_list]
    bad_tests = [t for (t, a) in all_tests if a is None]
    if bad_tests:
        common.printer(test_list, all_tests, bad_tests)
        raise ValueError('Bad test names: ' + ', '.join(bad_tests))
    all_tests = tuple(a for (t, a) in all_tests)

    if args.features:
        features = set(':'.join(args.features).split(':'))
        check_features(features)

    else:
        features = get_features()

    return all_tests, features, args.verbose, args.force
Example #38
def test_classify():
    feats = features.get_features()
    print feats
    good = 0
    wrong = 0
    spam_files = toolkit.get_files('spam/train')

    for sf in spam_files:
        if classify_wrap(sf, feats, 0):
            good += 1
        else:
            wrong += 1
    
    print "After SPAM: good: %d, wrong: %d" % (good, wrong)
    ham_files = toolkit.get_files('ham/train')
    for hf in ham_files:
        if not(classify_wrap(hf, feats, 0)):
            good += 1
        else:
            wrong += 1
    print "good: %d, wrong: %d" % (good, wrong)
 def next_track(self,sleep_time=5.0):
     """
     Get the next song features
     Take it from the queue (waits infinitely if needed...!)
     Sleep time between iterations when waiting is sleep_time (seconds)
     """
     # get data
     while True:
         data = _get_data()
         if data is not None:
             break
         time.sleep(sleep_time)
     self._nTracksGiven += 1
     # get features
     return features.get_features(data,pSize=self._pSize,
                                  usebars=self._usebars,
                                  keyInv=self._keyInv,
                                  songKeyInv=self._songKeyInv,
                                  positive=self._positive,
                                  do_resample=self._do_resample,
                                  partialbar=self._partialbar,
                                  btchroma_barbts=None)
 def load(self):
     self.features = get_features(self.audio_path)
     self.frame_cnt = len(self.features)
import numpy as np, math

import world_cup
import features
import match_stats
history_size = 4

game_summaries = features.get_game_summaries()
data = features.get_features(history_size)

club_data = data[data['competitionid'] != 4]
# Show the features latest game in competition id 4, which is the world cup.
print data[data['competitionid'] == 4].iloc[0]

import power
reload(power)
reload(world_cup)
def points_to_sgn(p):
  if p > 0.1: return 1.0
  elif p < -0.1: return -1.0
  else: return 0.0
power_cols = [
  ('points', points_to_sgn, 'points'),
]

power_data = power.add_power(club_data, game_summaries, power_cols)
power_data.to_csv('/tmp/out.csv',sep=';')
Example #42
 def consume(self, lang, source):
     source = strip_gubbins(source, lang)
     featureset = get_features(source)
     self.featuresets.append((featureset, lang))
Example #43
import numpy as np
import matplotlib.pyplot as plt
import data_io as dl
import metrics as m
import features as f
from definitions import target_fields
from sklearn import svm, cross_validation
from sklearn.ensemble import GradientBoostingRegressor
import pywt
import time

data = dl.get_data('train')
spectra = data['spectra']
targets = data['targets']
x_train_all = f.get_features(data)

clfs = {
    # 'Ca':   svm.SVR(C=10000.0),
    # 'P':    svm.SVR(C=5000.0),
    # 'pH':   svm.SVR(C=10000.0),
    # 'SOC':  svm.SVR(C=10000.0),
    # 'Sand': svm.SVR(C=10000.0),
    'Ca':   GradientBoostingRegressor(n_estimators=200),
    'P':    GradientBoostingRegressor(n_estimators=200),
    'pH':   GradientBoostingRegressor(n_estimators=200),
    'SOC':  GradientBoostingRegressor(n_estimators=200),
    'Sand': GradientBoostingRegressor(n_estimators=200),
}

mode = 'cv'
import numpy as np, math
import world_cup, features, match_stats, power
import pandas as pd
history_size = 3

game_summaries = features.get_game_summaries()
data = features.get_features(history_size)

club_data = data[data['competitionid'] != 4]

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Don't train on games that ended in a draw, since they have less signal.
train = club_data.loc[club_data['points'] != 1]
train = club_data  # NOTE: this reassignment overrides the draw filter above

(model, test) = world_cup.train_model(
     train, match_stats.get_non_feature_columns())
print "\nRsquared: %0.03g" % model.prsquared

def print_params(model, limit=None):    
    params = model.params.copy()
    params.sort(ascending=False)
    del params['intercept']
    
    if not limit:
        limit = len(params)

    print("Positive features")
models = []
means = []
std_devs = []

for i in range(len(spoken)):
	#print "fitting to HMM and decoding ..."

	n_components = 3
	arr = []

	# make an HMM instance and execute fit
	model = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)

	for j in range(n_samples):
		(rate,sig) = wav.read(fpaths[i][j])
		features = get_features(sig)
		arr.append(len(features))
		model.fit([features])

	models.append(model)
	means.append(np.mean(arr))
	std_devs.append(np.std(arr))
	#print("done\n")

correct_answers = []
with open('Test/'+test_folder+'/answer.txt') as answers:
    for entry in answers:
        correct_answers.append(entry.split())

tot_words = len(correct_answers)
right = 0.0
Example #46
def kernel_generate_fromcsv(
    input_path,
    input_csv_fname,
    output_fname,
    # --
    featfunc,
    # -- 
    simfunc = DEFAULT_SIMFUNC,
    kernel_type = DEFAULT_KERNEL_TYPE,
    nosphere = DEFAULT_NOSPHERE,
    # --
    variable_name = DEFAULT_VARIABLE_NAME,
    #input_path = DEFAULT_INPUT_PATH,
    # --
    overwrite = DEFAULT_OVERWRITE,
    noverify = DEFAULT_NOVERIFY,
    ):
    
    assert(kernel_type in VALID_KERNEL_TYPES)

    # add matlab's extension to the output filename if needed
    if path.splitext(output_fname)[-1] != ".mat":
        output_fname += ".mat"        

    # can we overwrite ?
    if path.exists(output_fname) and not overwrite:
        warnings.warn("not allowed to overwrite %s"  % output_fname)
        return
        
    # --------------------------------------------------------------------------
    # -- get training and testing filenames from csv 
    print "Processing %s ..." % input_csv_fname
    
    (train_fnames, train_labels,
     test_fnames, test_labels) = csv2tt(input_csv_fname, input_path=input_path)
    
    ntrain = len(train_fnames)
    ntest = len(test_fnames)

    assert(ntrain>0)
    assert(ntest>0)

    if not noverify:
        all_fnames = sp.array(train_fnames+test_fnames).ravel()        
        verify_fnames(all_fnames)

    # --------------------------------------------------------------------------
    # -- train x train
    train_features = get_features(train_fnames,
                                  featfunc,
                                  kernel_type,
                                  simfunc,
                                  info_str = 'training')
    if nosphere:
        sphere_vectors = None
    else:
        print "Sphering train features ..."
        sphere_vectors = get_sphere_vectors(train_features)    
        train_features = sphere_features(train_features, sphere_vectors)

    # XXX: this should probably be refactored in kernel.py
    print "Computing '%s' kernel_traintrain ..." % (kernel_type)    
    if kernel_type == "dot":
        kernel_traintrain = dot_kernel(train_features)
    elif kernel_type == "ndot":
        kernel_traintrain = ndot_kernel(train_features)
    elif kernel_type == "exp_mu_chi2":
        chi2_matrix = chi2_kernel(train_features)
        chi2_mu_train = chi2_matrix.mean()
        kernel_traintrain = ne.evaluate("exp(-chi2_matrix/chi2_mu_train)")        
    elif kernel_type == "exp_mu_da":
        da_matrix = da_kernel(train_features)
        da_mu_train = da_matrix.mean()
        kernel_traintrain = ne.evaluate("exp(-da_matrix/da_mu_train)")        
    assert(not (kernel_traintrain==0).all())
    

    # --------------------------------------------------------------------------
    # -- train x test
    test_features = get_features(test_fnames,
                                 featfunc,
                                 kernel_type,
                                 simfunc,
                                 info_str = 'testing')
  
    if not nosphere:
        print "Sphering test features ..."
        test_features = sphere_features(test_features, sphere_vectors)
                
    # XXX: this should probably be refactored in kernel.py
    print "Computing '%s' kernel_traintest ..."  % (kernel_type)
    if kernel_type == "dot":
        kernel_traintest = dot_kernel(train_features, test_features)
    elif kernel_type == "ndot":
        kernel_traintest = ndot_kernel(train_features, test_features)
    elif kernel_type == "exp_mu_chi2":
        chi2_matrix = chi2_kernel(train_features, test_features)
        kernel_traintest = ne.evaluate("exp(-chi2_matrix/chi2_mu_train)")        
    elif kernel_type == "exp_mu_da":
        da_matrix = da_kernel(train_features, test_features)
        kernel_traintest = ne.evaluate("exp(-da_matrix/da_mu_train)")        

    assert(not (kernel_traintest==0).all())
    
    # --------------------------------------------------------------------------
    # -- write output file

    # first make sure we don't record the original input_path
    # since this one could change
    train_fnames, _, test_fnames, _ = csv2tt(input_csv_fname)
    
    print
    print "Writing %s ..." % (output_fname)
    
    data = {"kernel_traintrain": kernel_traintrain,
            "kernel_traintest": kernel_traintest,
            "train_labels": train_labels,
            "test_labels": test_labels,
            "train_fnames": train_fnames,
            "test_fnames": test_fnames,
            }

    try:
        io.savemat(output_fname, data, format="4")
    except IOError, err:
        print "ERROR!:", err
def processing(frame, wframe):
    temp_feature = features.get_features(wframe)
    temp_descriptor = features.get_descriptor(wframe, temp_feature)
    
    points_list, patternimage_size = features.verify(temp_descriptor, temp_feature,
                                                     #'/home/max/Pictures/logotipos/QR_Maxkalavera.png'
                                                     '/home/max/Pictures/logotipos/fime.jpg'
                                                     #'/home/max/Pictures/logotipos/logo006.jpg'
                                                     )

    wframe_size = wframe.shape 
    frame_size = frame.shape 
    ratio = [float(frame_size[0])/wframe_size[0], float(frame_size[1])/wframe_size[1]]

    print "Number of points:", len(points_list[0])
    if len(points_list[0]) >= 10: 

        (h, m) = cv2.findHomography( numpy.array(points_list[1]), numpy.array(points_list[0]), cv2.RANSAC,  ransacReprojThreshold = 3.0)
        matches = m.ravel().tolist()

        if True:
            for i in range(len(points_list[0])):
                pt = points_list[0][i]

                if matches[i] > 0:
                    cv2.circle(frame, (int(pt[0]*ratio[0]), int(pt[1]*ratio[1])), 3, (255,0,255), -1)
                else:
                    cv2.circle(frame, (int(pt[0]*ratio[0]), int(pt[1]*ratio[1])), 3, (0, 255,255), -1)

        patternimage_rectsize = numpy.float32(
            [
                [0, 0],
                [0, patternimage_size[0] - 1],
                [patternimage_size[1] - 1, patternimage_size[0] - 1],
                [patternimage_size[1] - 1, 0]
                ]
            ).reshape(-1,1,2)

        wframe_rounding_box = cv2.perspectiveTransform(patternimage_rectsize, h)

        frame_rounding_box = list()
        for cord in wframe_rounding_box:
            cord = cord[0]

            frame_rounding_box.append((
                    int(cord[0]*ratio[0]), int(cord[1]*ratio[1])
                    ))

        if False:
            cv2.line(frame, 
                     frame_rounding_box[0],
                     frame_rounding_box[1],
                     (255, 0, 0), 1, 8 , 0);
            cv2.line(frame,
                     frame_rounding_box[1],
                     frame_rounding_box[2],
                     (255, 0, 0), 1, 8 , 0);
            cv2.line(frame, 
                     frame_rounding_box[2],
                     frame_rounding_box[3],
                     (255, 0, 0), 1, 8 , 0);
            cv2.line(frame, 
                     frame_rounding_box[3],
                     frame_rounding_box[0],
                     (255, 0, 0), 1, 8 , 0);

        x_pts = [x for x, y in frame_rounding_box]
        y_pts = [y for x, y in frame_rounding_box]
        try:
            frame_roi = frame[min(y_pts):max(y_pts), min(x_pts):max(x_pts)]

            gray = cv2.cvtColor(frame_roi, cv2.COLOR_BGR2GRAY)

            ret, thresh = cv2.threshold(gray,0,255,cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) 

            cv2.imshow("thresh", thresh)

            contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)                

            #temp, classified_points, means = cv2.kmeans(data=numpy.concatenate(contours), K=2, bestLabels=None,
            #                                            criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER, 1, 10), attempts=1, 
            #                                            flags=cv2.KMEANS_RANDOM_CENTERS)

            if True:
                cv2.drawContours(frame_roi, contours, -1,(0,255,0),3)

            all_contour = numpy.concatenate(contours)

            hull = cv2.convexHull(all_contour)
            if False:
                cv2.drawContours(frame_roi, hull, -1,(0,0,255), 4)

            if False:
                approx = cv2.approxPolyDP(numpy.concatenate(all_contour), 0.1*cv2.arcLength(all_contour, True),True)
                cv2.drawContours(frame_roi, approx, -1,(255,0,0), 4)

            for cnt in contours: 
                hull = cv2.convexHull(cnt)
                cv2.drawContours(frame_roi, hull, -1,(0,0,255), 2)

            cv2.imshow("ROI", frame_roi)
        except Exception,e: print str(e)
from sklearn.metrics import roc_curve, auc

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier



#getting frames of all input sentences
fram=frames.get_frames()
print fram

#getting top 2 frames for each process
#import features
feat=features.get_features()
print feat



def features_extract(df,df2):
    #creating new data frame to include features
    df6=pd.DataFrame(columns=['Process','Feature','Label'])
    df6['Process']=df['Process']
    df6['Label']=df['Label']
    df6['Sentence']=df['Sentence']
    #df6=df6[df6['Process']!='accumulation'].reset_index()
    #print df6


    #setting feature=1 if any of the top 2 frames are present in each sentence
Example #49
def _features():
    processed = json.loads(request.args.get('processed'))
    options = json.loads(request.args.get('options'))
    features = get_features(processed, options)
    return jsonify(features=features)
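
The request.args and jsonify calls suggest a Flask view; a hedged sketch of how it might be wired to a URL (the route path and app object are assumptions):

from flask import Flask

app = Flask(__name__)
# Hypothetical registration of the view above under /features.
app.add_url_rule('/features', view_func=_features)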
Example #50
args = parser.parse_args()
dirname = args.dirname
outfilename = args.filename
total_files = 0
total_parsed = 0
with open(outfilename,"a") as of:
    for root,dirs,filenames in os.walk(dirname):
        for f in filenames:
           full_path = os.path.join(root,f)
           l = len(f)
           if l > 4 and f[l-4:l] == '.csv':
               total_files = total_files + 1
               print "analyzing",os.path.join(root,f)
               try:
                   raw_data = loader.loadFile(full_path,delim=',',skip=1)
                   results = features.get_features(raw_data)
                   of.write('\t'.join([str(full_path),str(results['density']),str(results['density_minus_one']),str(results['density_all_nums']),str(results['density_strict']),str(results['fnumcols']),str(results['hasdate']),str(results['sum_covariance']),str(results['sum_abs_covariance']),str(results['max_abs_covariance']),str(results['total_unique_labels']),str(results['first_unique_labels'])])+'\n')
                   #print results
                   total_parsed = total_parsed + 1
               except Exception as e:
                   print "error occured while trying to parse",f,":"
                   print traceback.format_exc()
           elif (l > 4 and f[l-4:l] == '.tsv') or f == 'data.txt':
               total_files = total_files + 1
               print "analyzing",os.path.join(root,f)
               try:
                   raw_data = loader.loadFile(full_path,delim='\t',skip=1)
                   results = features.get_features(raw_data)
                   of.write('\t'.join([str(full_path),str(results['density']),str(results['density_minus_one']),str(results['density_all_nums']),str(results['density_strict']),str(results['fnumcols']),str(results['hasdate']),str(results['sum_covariance']),str(results['sum_abs_covariance']),str(results['max_abs_covariance']),str(results['total_unique_labels']),str(results['first_unique_labels'])])+'\n')
                   #print results
                   total_parsed = total_parsed + 1
               except Exception as e:
                   print "error occurred while trying to parse",f,":"
                   print traceback.format_exc()
    print 'EN identifier =',identifier
    a,b,c,d,e = EXTRAS.get_our_analysis(identifier)
    segstart, chromas, beatstart, barstart, duration = a,b,c,d,e
    if segstart == None:
        print 'EN gave us None, must start again'
        sys.exit(0)
    analysis_dict = {'segstart':segstart,'chromas':chromas,
                     'beatstart':beatstart,'barstart':barstart,
                     'duration':duration}
    del a,b,c,d,e,segstart,chromas,beatstart,barstart,duration
    print 'analysis retrieved from Echo Nest'
    

    # features from online (positive=False to compare with old school method)
    online_feats = features.get_features(analysis_dict,pSize=8,usebars=2,
                                         keyInv=True,songKeyInv=False,
                                         positive=False,do_resample=True,
                                         btchroma_barbts=None)
    online_feats = online_feats[np.nonzero(np.sum(online_feats,axis=1))]
    print 'features from online computed, shape =',online_feats.shape

    # retrieve feature using TZAN and compare to what we got
    print 'comparing features from upload and online'
    print 'reuploading songfile =',songfile
    a,b,c,d,e =  TZAN.get_en_feats(songfile)
    pitches, seg_start, beat_start, bar_start, duration = a,b,c,d,e
    print 'number of segments (upload/online):',np.array(seg_start).shape,',',np.array(analysis_dict['segstart']).shape
    a = np.array(seg_start)
    b = np.array(analysis_dict['segstart'])
    assert a.shape == b.shape
    a - b
    assert np.abs(np.array(seg_start).flatten()-np.array(analysis_dict['segstart']).flatten()).max() < .001
        if minj == letters.index(c):
            score += 1
    print ''.join(pred)
    print ''.join(real)
    return means, stds, score/float(len(test))

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print 'Usage: %s training|test soundf textf' % sys.argv[0]

    soundf = sys.argv[2]
    textf = sys.argv[3]

    rate, data, text = load_data(soundf, textf)
    starts, ends, chunks = get_chunk_starts(data)
    f = get_features(data, starts, ends, include_fft=True, include_cepstrum=True)

    if sys.argv[1] == 'training':
        means, stds, score = naive_bayes(text, f)
        print 'Naive Bayes', score

        logreg_score, logreg = logistic_test(text, f)
        svm_score, svm = svm_test(text, f)
        joblib.dump(logreg, 'cache/logistic.pkl')
        print 'Logistic test', logreg_score
        print 'SVM test', svm_score
    else:
        try:
            logreg = joblib.load('cache/logistic.pkl')
        except:
            print 'Run `%s training 7` to train your model first' % sys.argv[0]
Example #53
def load_features():
    global PC_3s
    PC_3s = get_features(S_BEFORE,S_AFTER,SAMPLE_RATE,FPC)