Example #1
def run_pipeline(file):
    if isinstance(file, str):  # One file
        # Read raw data from yfinance api
        df = pd.read_csv(file)

        # preprocessing on raw data
        df = preprocess(df)

        # removing nan values
        df = fill_nans(df)
    else:  # Multiple files
        # Read raw data from yfinance api for first file in list
        df = pd.read_csv(file[0])

        # preprocessing on raw data
        df = preprocess(df)

        # removing nan values
        df = fill_nans(df)

        # looping over the rest of the files in the list
        for i in range(1, len(file)):
            # Read raw data
            next_df = pd.read_csv(file[i])
            # preprocess
            next_df = preprocess(next_df)
            # fill nans
            next_df = fill_nans(next_df)
            # concatenate to df
            # inner join: only shared columns are kept
            df = pd.concat([df, next_df], join="inner")

    # the data is now ready for modelling
    return df
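
A more compact sketch of the same pipeline, assuming `preprocess` and `fill_nans` are the helper functions used above: treating the single-path case as a one-element list lets the two branches collapse into a single loop.

import pandas as pd

def run_pipeline_compact(file):
    # Treat a single path the same way as a list of paths.
    paths = [file] if isinstance(file, str) else list(file)
    # Read, preprocess and fill NaNs for every file, then concatenate.
    frames = [fill_nans(preprocess(pd.read_csv(p))) for p in paths]
    # inner join: only columns shared by all frames are kept
    return pd.concat(frames, join="inner")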
Example #2
def do_all(subfile="EEGbears.csv"):
    cols = ["HandStart", "FirstDigitTouch", "BothStartLoadPhase", "LiftOff", "Replace", "BothReleased"]
    ids_tot = []
    pred_tot = []
    for subject in range(1, 13):
        features_train, labels_train, nevents, _, ntrtimes, ica, FTtstep, _ = preprocessing.preprocess(subject=subject)

        # train classifiers. Note we can't use just one classifier object
        # because some events overlap so we want to be able to predict combinations of classes
        classifiers = [SKLearnClf() for event in range(nevents)]
        for event in range(nevents):
            classifiers[event].fit(features_train, labels_train[:, event])

        # read and prepare test data
        features_test, _, _, _, ntesttimes, _, _, ids = preprocessing.preprocess(subject=subject, train=False, ica=ica)
        ids_tot.append(ids)
        # get predictions for individual time steps
        ntimebins = features_test.shape[0]
        predlabels = np.zeros((ntimebins, nevents))
        for event in range(nevents):
            predlabels[:, event] = classifiers[event].predict_proba(features_test)[:, 1]
        predevents = preprocessing.labels_to_events(predlabels, FTtstep, ntesttimes)
        pred_tot.append(predevents)
        print("Finished subject " + str(subject) + ".")
    # create pandas DataFrame for the submission and write it to file
    submission = pd.DataFrame(index=np.concatenate(ids_tot), columns=cols, data=np.concatenate(pred_tot))
    submission.to_csv(subfile, index_label="id", float_format="%.3f")
    return submission
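
The per-event loop above trains one binary classifier per (possibly overlapping) event, which is a standard one-vs-rest multilabel setup. A minimal sketch of the same idea with scikit-learn's MultiOutputClassifier, assuming the feature matrix and binary label matrix have the shapes used in the example; LogisticRegression is swapped in here for the unspecified SKLearnClf.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# labels_train has shape (n_samples, nevents); each column is one binary event.
multi_clf = MultiOutputClassifier(LogisticRegression(max_iter=1000))
multi_clf.fit(features_train, labels_train)

# predict_proba returns one (n_samples, 2) array per event; keep the positive-class column.
predlabels = np.column_stack(
    [proba[:, 1] for proba in multi_clf.predict_proba(features_test)])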
Example #3
def main():
    '''
    Builds the stemmed and unstemmed vocabularies for 3 corpora.
    '''
    print('Building Vocabulary...', end=' ')
    for fileName in fileNames[:-1]:  # we don't need recipe_links
        vocab, autocompleteVocab = set(), set()  # one stemmed, one unstemmed
        with open("corpus/{}.json".format(fileName), 'r') as f:
            stringList = json.load(f)
            docList, autocompleteDocList = [], []
            for i in range(len(stringList)):
                docList.append(preprocess(stringList[i]))
                autocompleteDocList.append(
                    preprocess(stringList[i], stem=False))
                for word in docList[i]:
                    vocab.add(word)
                for word in autocompleteDocList[i]:
                    autocompleteVocab.add(word)
            with open('vocabulary/{}.json'.format(fileName), 'w') as vocabFile:
                json.dump(list(vocab), vocabFile)  # set isn't serializable
            with open('vocabulary/{}_autocomplete.json'.format(fileName),
                      'w') as vocabFile:
                json.dump(list(autocompleteVocab),
                          vocabFile)  # set isn't serializable
    print('Done.')
Example #4
def run(command, blocks, preprocessed):
    if command.parms:
        # Looping Array
        if command.parms[0].startswith("[") and command.parms[0].endswith("]"):
            arr = command.parms[0].replace("[", "").replace("]", "").split("|")
            arr = [x.lstrip(" ").strip(" ") for x in arr]
            var = command.parms[1] if len(command.parms) > 1 else None
            blocks = "\n".join([" "*block.spaces + block.text for block in blocks])
            total = []

            for x in arr:
                copy_blocks = blocks.replace(var, x) if var else blocks
                total.append(process(preprocess(copy_blocks, False)))

            return "\n".join(total)
        # Looping Number (Range)
        else:
            times = 0
            try:
                times = int(command.parms[0])
            except:
                return None

            var = command.parms[1] if len(command.parms) > 1 else None
            blocks = "\n".join([" "*block.spaces + block.text for block in blocks])
            total = []

            for i in range(times):
                copy_blocks = blocks.replace(var, str(i+1)) if var else blocks
                total.append(process(preprocess(copy_blocks, False)))

            return "\n".join(total)
def main(settings, metrics):
    #Begin processing validation images
    #	troubled_ones = [3, 14, 22, 43, 66, 83, 97, 114, 161]
    #	troubled_ones = [137]
    for i in range(0, len(settings['validation_files'])):
        #	for i in troubled_ones:
        name = settings['validation_files'][i]
        if any(glacier in name for glacier in
               ('Rink-Isbrae', 'Upernavik', 'Umiammakku', 'Inngia')):
            #		if 'Inngia' in settings['validation_files'][i]:
            #			if i == 62:
            preprocess(i, settings, metrics)
            process(settings, metrics)
            postprocess(settings, metrics)


#			break

#Print statistics
#	print_calfin_domain_metrics(settings, metrics)
#	print_calfin_all_metrics(settings, metrics)

#	plt.show()

    return settings, metrics
    def __load_tempfile(self, doc_id, sentence, movie_id):
        preprocessed = preprocessing.preprocess(
            sentence, stemming=self.activate_stemming, stop=self.activate_stop)
        preprocessed = list(filter(None, preprocessed))

        word_count = len(
            preprocessing.preprocess(sentence, stemming=False, stop=False))

        for term in set(preprocessed):
            positions = [
                n for n, item in enumerate(preprocessed) if item == term
            ]
            self.temp[term] = self.temp.get(term, {
                'term': term,
                'doc_count': 0,
                'movies': dict()
            })
            self.temp[term]['doc_count'] += 1
            self.temp[term]['movies'][movie_id] = self.temp[term]['movies'].get(
                movie_id, {
                    '_id': movie_id,
                    'doc_count': 0,
                    'sentences': list()
                })
            self.temp[term]['movies'][movie_id]['doc_count'] += 1
            self.temp[term]['movies'][movie_id]['sentences'].append({
                '_id': doc_id,
                'len': word_count,
                'pos': positions
            })
Example #7
def main():
    logger.info("Execution Started!!!")
    if DOWNLOAD_RAW_DATA:
        fetch_and_save_raw_data()
    if CREATE_RAW_DATASET:
        if not read_all_data():
            logger.error(
                "Execution abruptly stopped while creating raw dataset!!!")
            return
    try:
        train_data = pd.read_csv(os.path.join(RAW_DATA_DIR,
                                              "raw_train_data.csv"),
                                 encoding="utf-8")
        if SIMPLE_PROCESSING_TYPE:
            logger.info("Performing simple text processing.")
            train_data_simple = preprocess(train_data)
            if isinstance(train_data_simple, pd.DataFrame):
                train_data_simple.to_csv(
                    os.path.join(SIMPLE_PROCESSED_DATA_DIR,
                                 "train_data_simple.csv"),
                    index=False,
                    encoding="utf-8",
                )
            else:
                logger.error("Unable to write simple processed data!!!")
                return
        if COMPLEX_PROCESSING_TYPE:
            logger.info("Performing complex text processing.")
            train_data_complex = preprocess(train_data,
                                            preprocess_type="complex")
            if isinstance(train_data_complex, pd.DataFrame):
                train_data_complex.to_csv(
                    os.path.join(COMPLEX_PROCESSED_DATA_DIR,
                                 "train_data_complex.csv"),
                    index=False,
                    encoding="utf-8",
                )
            else:
                logger.error("Unable to write complex processed data!!!")
                return
        if VECTORIZE_DATA_SIMPLE:
            logger.info("Vectorizing simple processed data.")
        if VECTORIZE_DATA_COMPLEX:
            logger.info("Vectorizing complex processed data.")
            if not vectorize_data(
                    os.path.join(COMPLEX_PROCESSED_DATA_DIR,
                                 "train_data_complex.csv"),
                    "complex",
            ):
                logger.error(
                    "Execution abruptly stopped while vectorizing complex data!!!"
                )
                return
    except Exception as e:
        logger.error("Exception in main method : {}".format(str(e)))
        return
    logger.info("Execution successfully completed.")
Example #8
def crossvalidation(subject=1):
    time.clock()

    features_train, labels_train, nevents, _, ntrtimes, ica, FTtstep, _ = preprocessing.preprocess(
        subject=subject, series=range(1, 7)
    )

    preptime = time.clock()
    print("Preprocessing took " + str(preptime) + " seconds.")

    # train classifiers. Note we can't use just one classifier object
    # because some events overlap so we want to be able to predict combinations of classes
    classifiers = [SKLearnClf() for event in range(nevents)]
    for event in range(nevents):
        classifiers[event].fit(features_train, labels_train[:, event])

    traintime = time.clock() - preptime
    print("Trained the classifiers in " + str(traintime) + " seconds.")

    # read and prepare test data
    features_cv, labels_cv, _, events_cv, ncvtimes, _, _, _ = preprocessing.preprocess(
        subject=subject, train=True, series=range(7, 9), ica=ica
    )
    events_cv = events_cv.astype(int)  # I don't know why but it's an object array before this

    # separate some data for cross-validation
    # features_train, features_cv, labels_train, labels_cv = cross_validation.train_test_split(
    # features, labels, test_size = 0.3)

    # naively score classifiers on training set
    trscores = np.zeros((nevents))
    for event in range(nevents):
        trscores[event] = classifiers[event].score(features_train, labels_train[:, event])
    print("Scores on training set in binned time: " + str(trscores))

    # naively score classifiers on CV set
    testscores = np.zeros((nevents))
    for event in range(nevents):
        testscores[event] = classifiers[event].score(features_cv, labels_cv[:, event])
    print("Scores on CV set in binned time: " + str(testscores))

    # generate ROC curves for CV set in binned time
    predlabels_cv = np.transpose([classifiers[e].predict_proba(features_cv)[:, 1] for e in range(nevents)])
    rocscoresbinned = ROCcurve(predlabels_cv, labels_cv)
    print("For binned time...")
    print("Areas under ROC curves:")
    print(rocscoresbinned)
    print("Average ROC score:" + str(np.mean(rocscoresbinned)))

    # generate ROC curves for CV set in real time
    predevents_cv = preprocessing.labels_to_events(predlabels_cv, FTtstep, ncvtimes)
    rocscoresreal = ROCcurve(predevents_cv, events_cv)
    print("For real time...")
    print("Areas under ROC curves:")
    print(rocscoresreal)
    print("Average ROC score:" + str(np.mean(rocscoresreal)))
    return predevents_cv, events_cv, features_cv, classifiers, np.mean(rocscoresbinned), np.mean(rocscoresreal)
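
Note that time.clock() was removed in Python 3.8; on current interpreters the timing used throughout this example could be done with time.perf_counter() instead, e.g.:

import time

start = time.perf_counter()
# ... preprocessing / training step ...
print("Step took " + str(time.perf_counter() - start) + " seconds.")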
Example #9
def load_preprocessed_data(path, training=True):
    if not training:
        id, text = load_data(path, training)
        t = text.apply(lambda x: preprocess(x))
        return id.values.tolist(), t.values.tolist()
    else:
        id, text, label = load_data(path, training)
        t = text.apply(lambda x: preprocess(x))
        l = label.apply(lambda x: emotion2label[x])
        return id.values.tolist(), t.values.tolist(), l.values.tolist()
Example #10
def getData(filePath):
    """
    @brief Function to get the training data.
    @param filePath path to the csv file from the Unity project.
    @returns augmented training data and labels.
    """
    lines = []
    with open(filePath) as csvfile:
        reader = csv.reader(csvfile)
        for line in reader:
            lines.append(line)
    X_train = []
    measurements = []
    count = 0
    for line in lines:
        source_path = line[0:3]
        # filename = source_path.split('/')[-1]
        measurement = float(line[3])
        for path in source_path:
            image = imread(path)
            # if steering angle is 0
            if 0.00001 > measurement > -0.00001:
                # replace with a small random value between -0.005 and +0.005
                randomSteer = np.random.random() * 0.01 - 0.005
                # take every 15th value with 0.0 as steer angle
                if count % 15 == 0:
                    measurements.append(measurement + randomSteer)
                    X_train.append(preprocessing.preprocess(image))
                count = count + 1
            else:
                # Limit model from applying full steering.
                if measurement > 0.9:
                    measurement = 0.9
                if measurement < -0.9:
                    measurement = -0.9
                measurements.append(measurement)
                # transform the image and augment
                # augmentation is done only for track images with curves.
                proc = preprocessing.preprocess(image)
                X_train.append(proc)
                aug = []
                if -0.4 > measurement or measurement > 0.30:
                    aug = transforms.augmentData(image, 1)
                if -0.9 >= measurement or measurement > 0.50:
                    aug += transforms.augmentData(image, 1)
                if -0.6 > measurement > -0.9:
                    aug += transforms.augmentData(image, 1)
                # append augmented data into training set.
                for im in aug:
                    proc = preprocessing.preprocess(im)
                    X_train.append(proc)
                    measurements.append(measurement)
    X_train = np.array(X_train)
    y_Train = np.array(measurements)
    return X_train, y_Train
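
A small aside: the pair of if-statements above that caps the steering angle at ±0.9 is equivalent to a single np.clip call; a sketch of such a helper (the name is illustrative):

import numpy as np

def clamp_steering(measurement, limit=0.9):
    # Limit the model from applying full steering: clamp to [-limit, limit].
    return float(np.clip(measurement, -limit, limit))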
Example #11
def main():
    '''Main function to use from the command line: preprocesses the input to generate embeddings, detects aggression
    clauses using the provided approach, extracts features and labels from the training data and features from the
    input data, trains a model, classifies the test data with the trained model, and evaluates predictions against
    the gold labels from the input'''
    inputfile = 'sample_input.xls'
    preprocess(inputfile)
    get_predictions_rulebased()
    get_predictions_ml()

    ### only keep clusters with enough data; everything else goes into the outlier cluster
    cluster_precursors()
Example #12
def clf_predict(clf, image):
    X = preprocess(image)
    prediction = clf.predict(X)
    if isinstance(clf, keras.Model):
        prediction += clf.predict(preprocess(-image))
        prediction += clf.predict(preprocess(np.rot90(image, axes=(1, 2))))
        prediction += clf.predict(preprocess(np.rot90(image, axes=(2, 1))))
        prediction = np.argmax(prediction)
    else:
        prediction = prediction[0]
    return prediction
def load_everything():
    # load the database schema
    connection_string = " user = '******' password = '******'  host = '127.0.0.1'  port = '5432' dbname = 'project' "
    conn = psycopg2.connect(connection_string)
    with conn.cursor() as cursor:
        setup_queries = open('schema.sql', 'r').read()
        cursor.execute(setup_queries)
        conn.commit()
    # preprocess the data
    preprocessing.preprocess()
    # loading csv into database
    load.load_into_database()
Example #14
def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('start preprocessing data from raw:')
    if FLAGS.tweets:
        preprocessing.preprocess_tweets(save=True)
    else:
        preprocessing.preprocess()

    logger.info('files have been created in data/processed')
Example #15
def test(categories):
    use_stem = True
    test_data = []
    test_labels = []
    tsv_out1 = open(".\\src\\test.tsv", "wb")
    tsv_out = csv.writer(tsv_out1, delimiter='\t')

    test_json = open(".\\src\\test.json")
    count_all = Counter()
    for r in test_json:
        tweet = json.loads(r)
        if (tweet["lang"] != "ru"):
            continue
        # Create a list with all the terms
        terms_all = [term for term in preprocess(tweet['text'], True)]
        # Update the counter
        count_all.update(terms_all)
        # tokens = preprocess(tweet['text'], True)
        # for token in tokens:
        #     print token

        # print tweet["text"].encode(sys.stdout.encoding, errors='replace')
        # tsv_out.writerow(["hz", tweet["text"].encode("utf-8")])
    for token in count_all.most_common(5):
        print(token[0] + ":" + str(token[1]))
    exit()
    tsv_out1.close()
    # exit(0)

    # test_in = open(".\\data\\parsed\\ttk_train.tsv")
    test_in = open(".\\src\\test.tsv")
    test_in = csv.reader(test_in, delimiter='\t')

    fin1 = open('vectorizer.pk', 'r')
    vectorizer = pickle.load(fin1)

    fin2 = open('classifier_linear.pk', 'r')
    classifier_linear = pickle.load(fin2)

    test_data, test_labels = preprocess(test_in, use_stem)
    test_vectors = vectorizer.transform(test_data)

    prediction_linear = classifier_linear.predict(test_vectors)

    print("Results for SVC(kernel=linear)")
    print(classification_report(test_labels, prediction_linear))
    with open("result_linear_test.txt", "wb") as result_out:
        i = 0
        for s in prediction_linear:
            if (test_labels[i] != prediction_linear[i]):
                result_out.write(
                    test_labels[i] + " : " + prediction_linear[i] + '\t' + test_data[i].encode("utf-8") + '\n')
            i += 1
Example #16
def all_to_filtered(doc_not_filtered_path, out_path, line_map_path):
    """map after filtering to before filtering"""
    assert os.path.isfile(doc_not_filtered_path), 'invalid doc_not_filtered_path: %s' % doc_not_filtered_path
    assert os.path.isdir(out_path), 'invalid out_path: %s' % out_path
    line_map_folder, name = os.path.split(line_map_path)
    assert os.path.isdir(line_map_folder), 'invalid line_map_folder: %s' % line_map_folder
    assert name.strip() != '', 'empty line_map_path name'
    filters = [docfilters.remove_doctests,
               docfilters.keep_first_description,
               docfilters.remove_wx_wrappers,
               docfilters.remove_parameter_descriptions,
               docfilters.replace_vertical_bars]
    preprocessing.preprocess(doc_not_filtered_path[:-4], out_path, False, filters, line_map_path)
Example #17
    def on_data(self, data):
        try:
            all_data = json.loads(data)                                 # tweets arrive from the stream in JSON format
            tweet_text = all_data["text"]                               # only the text part of the tweet is fetched
            tweet = clean_tweet(tweet_text)                             # further cleaning of the tweet

            preprocess(tweet)
            print(all_data)
            
            #----------------------------------------------

        except:
            return True
Example #18
def main():

    t = time()
    text = "Agar shi chala toh ye nhi dikhega Bidu"
    base_path = 'temp_images/'

    for file in os.listdir(base_path):
        try:
            os.remove(base_path + file)
        except FileNotFoundError:
            print('{} not deleted. \n'.format(file))
            continue

    ip_image_path = 'braille_scan.jpg'
    # ip_image_path = '/Users/ayushi/Desktop/uhack/braille_scan.jpg'

    try:
        os.remove('results.txt')
    except FileNotFoundError:
        print('results file not found.')

    preprocessed_image = base_path + 'preprocessed.jpg'

    try:
        preprocess(base_path, ip_image_path, preprocessed_image)
    except Exception as e:
        print(e)

    # preprocessing(ip_image_path, preprocessed_image)
    try:
        n_lines = horizontal_segmentation(base_path, preprocessed_image)
        vertical_segmentation(base_path, n_lines)
    except Exception as e:
        print(e)

    try:
        file = open('results.txt', 'r')
        text = file.read()
        file.close()
        print(text)

        error_removal()

        file = open('results.txt', 'r')
        text = file.read()
        file.close()
        print(text)
    except Exception as e:
        print(e)
    print(time() - t)
    return text
Example #19
def run_pipeline():
    """
    Runs all functions in the pipeline. Parses tracking and events data from 52 xml files. Preprocesses the DataFrame
    to conform to Metrica Sports format. Then calculates EPV values to get the optimal passes using the
    Friends Of Tracking code, which can be found in the EPV_code folder. Then creates multiple features based on
    tracking and events data. Followed by the analysis, using a Linear Regression and Decision Tree.

    After each step files are saved to the /data folder.
    """
    data_parser.parse_data()
    preprocessing.preprocess()
    generate_EPV_values.generate_epv_files()
    feature_engineering.engineer_features()
    analysis.run_analysis()
Example #20
def detect_plagiarism(path1, path2, categories):
    '''The main function initialized to detect plagiarism.'''

    preprocess(path1)
    preprocess(path2)

    path1 = os.path.join(os.getcwd(), 'temp',
                         remove_suffix(os.path.basename(path1)))
    path2 = os.path.join(os.getcwd(), 'temp',
                         remove_suffix(os.path.basename(path2)))

    details, summary = get_summary(categories, path1, path2)

    return details, summary
Example #21
def main(settings, metrics):
	#Begin processing validation images
#	troubled_ones = [3, 14, 22, 43, 66, 83, 97, 114, 161]
#	troubled_ones = [161]
	for i in range(0, len(settings['validation_files'])):
#	for i in troubled_ones:
		preprocess(i, settings, metrics)
		process(settings, metrics)
		postprocess(settings, metrics)
	
	#Print statistics
	print_calfin_domain_metrics(settings, metrics)
	print_calfin_all_metrics(settings, metrics)
	
	return settings, metrics
Example #22
def bm25_classifier(query, descriptions, labels):
    """
    Computes BM25 scores of a given query in relation to all selected and preprocessed datasets and
    selects all datasets that exeed the threshold mean+3*sd.
    input: query and list of lables,
    output: list of labels that fit the query
    """
    preprocessed_descriptions = []
    for description in descriptions:
        preprocessed_descriptions.append(
            preprocessing.preprocess(str(description)))
    tokenized_corpus = [doc.split(" ") for doc in preprocessed_descriptions]
    bm25_modell = BM25Plus(tokenized_corpus)
    tokenized_query = query.split(" ")
    scores = bm25_modell.get_scores(tokenized_query)
    mean_scores = mean(scores)
    standard_deviation_scores = stdev(scores)
    selected = []
    for i in range(0, len(descriptions)):
        label = labels[i]
        description = descriptions[i]
        score = scores[i]
        if score > (mean_scores + 4 * standard_deviation_scores):
            selected.append(label)
    return selected
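
A minimal usage sketch with hypothetical descriptions and labels; bm25_classifier is the function above, and the returned labels are those whose score exceeds mean + 4 * standard deviation:

descriptions = [
    "Daily temperature and rainfall measurements for Berlin",
    "Stock prices of technology companies, 2010-2020",
    "Air quality sensor readings for several European cities",
]
labels = ["berlin-weather", "tech-stocks", "eu-air-quality"]

selected_labels = bm25_classifier("air pollution in cities", descriptions, labels)
print(selected_labels)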
Example #23
def uploaded_file(filename):

    import preprocessing
    img = preprocessing.preprocess()
    import pytess
    ans = pytess.test('pre.tif')
    return render_template("result.html", org_img=filename, ans=ans)
def telemetry(sid, data):
    if data:
        # The current steering angle of the car
        steering_angle = data["steering_angle"]
        # The current throttle of the car
        throttle = data["throttle"]
        # The current speed of the car
        speed = data["speed"]
        # The current image from the center camera of the car
        imgString = data["image"]
        image = Image.open(BytesIO(base64.b64decode(imgString)))
        image_array = np.asarray(image)

        # Preprocess image before prediction
        image_array = preprocessing.preprocess(image_array)
        steering_angle = float(
            model.predict(image_array[None, :, :, :], batch_size=1))

        throttle = controller.update(float(speed))
        print(steering_angle, throttle)
        send_control(steering_angle, throttle)

        # save frame
        if args.image_folder != '':
            timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
            image_filename = os.path.join(args.image_folder, timestamp)
            image.save('{}.jpg'.format(image_filename))
    else:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)
Example #25
def get_batch(X_train, y_train):
    """
    Generate a batch for training from the data (X_train) and the corresponding ground truth.
    X_train (names): center_img.strip(), left_img.strip(), right_img.strip()
    y_train: angle, angle+steer_offset, angle-steer_offset
    Returns: a list of images and steering angles -> only for one batch
    """
    imgList = np.zeros((BATCH_SIZE, 66, 200, 3), dtype=np.float32)
    steeringAngleList = np.zeros((BATCH_SIZE, ), dtype=np.float32)

    while True:
        for i in range(BATCH_SIZE):
            lowAngle_counter = 0

            # Get a valid angle (not low angle when the percentage of lowangle in the batch is already exceeded)
            while True:
                imgFileName, angle = getRandImgAndAngle(X_train, y_train)
                if not validAngle(angle, lowAngle_counter, BATCH_SIZE):
                    # This sample does not qualify -> draw a new one
                    continue
                else:
                    # Fine. Increase and you shall pass ;)
                    lowAngle_counter += 1
                    break

            # Read image
            image = cv2.imread(imgFileName)
            # Preprocess
            image = preprocess(image)
            # Flip the image (sometimes)
            imgList[i], steeringAngleList[i] = flipImg(image, angle)

        yield imgList, steeringAngleList
    def generate_feature_matrix(X_train, X_dev, X_test, preprocessing=False, remove_stopwords=False, min_df=1):
        if preprocessing:
            X_train = preprocess(X_train, remove_stopwords)
            X_dev = preprocess(X_dev, remove_stopwords)
            X_test = preprocess(X_test, remove_stopwords)

        X_train = create_dataframe_for_training(X_train)
        X_dev = create_dataframe_for_training(X_dev)
        X_test = create_dataframe_for_training(X_test)

        vectorizer = CountVectorizer(min_df=min_df)
        X_train_fe = vectorizer.fit_transform(X_train)
        X_valid_fe = vectorizer.transform(X_dev)
        X_test_fe = vectorizer.transform(X_test)

        return X_train_fe.toarray(), X_valid_fe.toarray(), X_test_fe.toarray()
def get_contours(image, median_size=5):
    """
    :param image: the original image in which you want to reduce the noise
    :param median_size: the kernel size of the median filter
    :return: numpy array includes the contours in this image
    :draw: the original image with the contours detected and drawn on it
    """
    image_with_noise = image
    image = preprocess(image, median_size)

    blurred = cv2.pyrMeanShiftFiltering(image, 31, 91)

    gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)

    ret, threshold = cv2.threshold(gray, 0, 255,
                                   cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    #ret, threshold = cv2.threshold(gray, 127, 255, 1)

    _, contours, _ = cv2.findContours(threshold, cv2.RETR_LIST,
                                      cv2.CHAIN_APPROX_NONE)
    #_, contours, _ = cv2.findContours(threshold,1,2)

    cv2.drawContours(image_with_noise, contours, -1, (0, 0, 255), 6)

    cv2.namedWindow("Contours Detection", cv2.WINDOW_NORMAL)
    cv2.imshow("Contours Detection", image_with_noise)
    cv2.waitKey()
    return contours
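
The three-value unpacking of cv2.findContours above assumes OpenCV 3; OpenCV 4 returns only (contours, hierarchy). A small version-agnostic sketch:

import cv2

def find_contours_compat(binary_image):
    # OpenCV 3 returns (image, contours, hierarchy); OpenCV 4 returns (contours, hierarchy).
    # In both cases the contour list is the second-to-last element.
    result = cv2.findContours(binary_image, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
    return result[-2]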
Example #28
def run_predictions(dataframe, test_size, selected_model, parameters, metrics,
                    cross_val, cv_k):
    """Puts together preprocessing, training and testing."""

    st.markdown(":chart_with_upwards_trend: Hyperparameters used: ")
    st.write(parameters)

    if cross_val:
        st.warning(
            "Warning, only the first metric is selected when using Cross Validation."
        )

    # Preprocessing data
    x, y = preprocessing.preprocess(dataframe)
    st.success("Preprocessing completed!")

    model = get_model(selected_model, parameters)

    if cross_val:
        # model = get_model(selected_model, parameters)
        cross_validation(model, x, y, cv_k, metrics[0])

    else:
        # Training the model
        train_status = st.warning("Training model..")
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size)
        # model = get_model(selected_model, parameters)
        model.fit(X_train, y_train)
        train_status.success("Training completed!")

        # Testing the model
        test_status = st.warning("Testing model..")
        test_model(model, X_train, y_train, X_test, y_test, metrics)
        test_status.success("Testing completed!")
Example #29
def main():
    df_data = preprocessing.get_data_db('../../sample_3days.sqlite')
    preprocessing.fill_missing_values(df_data)
    df_data = preprocessing.preprocess(df_data)

    #df_data.drop(preprocessing.dict_category_columns['text'], axis=1, inplace=True)

    target_columns = preprocessing.target_columns
    feature_columns = [
        column for column in df_data.columns if column not in target_columns
    ]
    target_columns = ['ups']

    print(target_columns, '\n')
    print(feature_columns)

    indices = np.arange(len(df_data))
    splits = {}
    splits['train'], splits['test'] = train_test_split(indices, test_size=0.1)
    splits['train'], splits['validation'] = train_test_split(splits['train'],
                                                             test_size=0.1)

    model = models.get_model()
    model = training.train(model, df_data[feature_columns],
                           df_data[target_columns], splits)
    print(model.score(df_data[feature_columns], df_data[target_columns]))
Example #30
def Main():
    fp = csv.reader(open('../../DataSet/Tweets_sample2.csv', 'rb'),
                    delimiter=',',
                    quotechar='"')

    #fp = open('../../Data_Set/kaggle.txt','rb')

    data_list = processing_data_set.data_list(fp)

    random.shuffle(data_list)

    feature_list = []

    cnt = 0

    print len(data_list)

    for i in data_list:

        i[0] = preprocessing.preprocess(i[0])

        i[0] = all_words.to_all_words(i[0])

        i[0] = feature_words.to_feature_words(i[0])
        #print i[1]
        feature_list.extend(i[0])

    #print len(feature_list)

    #Preparing a dictionary of words in feature_list and maintaining their count

    dic = {}

    for i in feature_list:
        if i in dic:
            dic[i] = dic[i] + 1
        else:
            dic[i] = 1

    print len(dic)

    #Reverse sorting the dictionary to get most frequently used words

    #print dic

    feature_list = sorted(dic, key=dic.__getitem__, reverse=True)[:3000]

    print "Length of feature list ", len(feature_list)

    #print feature_list[:15]
    '''
	data_list2 = []
	
	for i in data_list:
		data_list2.append((i[0],i[1]))
	'''

    #classifier_Self_NB.call_NB(data_list,feature_list)

    classifier2.NB_classifier(data_list, feature_list)
Example #31
def process_all(files):
    count = 0
    geven_images = []
    geven_char = []
    for img_path, txt_path in tqdm(files):
        txt = get_text(txt_path)
        # if len(re.sub(r'[\u061F-\u066A|\s]', "", txt) ) != 0: continue
        img = cv2.imread(img_path)
        linesOfWords, numWords, linesImages = preprocess(img)
        if numWords != len(txt.split(' ')): continue
        words = txt.split(' ')
        words_ind = 0
        for l, line_img in enumerate(linesOfWords):
            for w, word_img in enumerate(line_img):
                word = words[words_ind]
                count += len(word)
                words_ind += 1
                img_indx = len(word_img[0])
                geven_images += [word_img]
                geven_char += [word[0]]
                # for i, char in enumerate(word):
                #     if i == 0:
                #         img_indx -= characterDict[char][0]
                #         char_img = word_img[:,img_indx:img_indx+characterDict[char][0]]
                #     elif i == len(word) - 1:
                #         img_indx -= characterDict[char][2]
                #         char_img = word_img[:,img_indx:img_indx+characterDict[char][2]]
                #     else:
                #         img_indx -= characterDict[char][1]
                #         char_img = word_img[:,img_indx:img_indx+characterDict[char][1]]
                #     if char_img.shape[1] == 0: continue
                #     # cv2.imwrite(os.path.join('dataset','chars', f"{txt_path.split('/')[-1].split('.')[0]}_{words_ind-1}_{l}_{w}_{i}_{ord(char)}.png"), char_img)
    print(len(geven_images))
    return geven_images, geven_char
def semantics(doc):
  prep = preprocess(doc)
  return (
    flatten( prep.pos_tags() ),
    prep.noun_phrases(),
    flatten( prep.get_entities() )
  )
Example #33
def train_neural_network(path):
    '''Didn't have this working for the hackathon'''
    X_train, X_test, X_val, y_val, y_train, y_test = preprocess(path)

    def change(x):
        if x == -1:
            return 2
        if x == 0:
            return 0
        if x == 1:
            return 1

    print(type(y_train))
    # sparse_categorical_crossentropy doesn't allow negative labels, so all -1 labels are mapped to 2
    y_train = np.asarray(list(map(change, y_train)))
    y_test = np.asarray(list(map(change, y_test)))
    y_val = np.asarray(list(map(change, y_val)))

    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(35400,)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(3, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=6)
    evaluation = model.evaluate(X_test, y_test)
    accuracy = evaluation[1]
    model.save('sequential_32_Dropout.h5')
    return accuracy
def run_eda(df_bank, df_fb, df_retail):
    '''
    a function to run the components of eda
    '''
    data = p.preprocess(df_bank, df_fb, df_retail)
    data.merge()
    data.clean()
    data.add_count()
    df_final = data.df_final

    # plot the count of each category
    count_plot(df_final)

    # plot the distribution of the categories
    dist_plot(df_final)

    # plot the distribution of the word counts
    word_count_plot(df_final)

    # plot the top n words for each category
    complaint_count = data.top_n_words(
        df_final[(df_final["Complaint"] == 1)]["Text"])
    plot_top_n_words(complaint_count, "Complaint")

    compliment_count = data.top_n_words(
        df_final[(df_final["Compliment"] == 1)]["Text"])
    plot_top_n_words(compliment_count, "Compliment")

    suggestion_count = data.top_n_words(
        df_final[(df_final["Suggestion"] == 1)]["Text"])
    plot_top_n_words(suggestion_count, "Suggestion")

    plt.show()
Example #35
 def _build_bin_dict(self):
     """
     Build a binary dictionary containing all terms and the value 1.0 to
     indicate their presence in the data set.
     """
     if self.data:
         return dict((preprocess(term), 1.0) for term in self.data)
     return {}
Example #36
 def _build_tf_dict(self):
     """
     Build a term-frequency dictionary containing terms and their counts.
     """
     if self.data:
         d = defaultdict(int)
         for term in self.data:
             normalized = preprocess(term)
             d[normalized] += 1
         if len(d.values()) > 0:
             self._max_score = max(d.values())
         return d
     return {}
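
Assuming preprocess(term) returns a single normalized token as in the two methods above, the same term-frequency dictionary (and its maximum score) could also be built with collections.Counter; a sketch:

from collections import Counter

def build_tf_dict(data):
    # Count each normalized term; the highest count doubles as the maximum score.
    counts = Counter(preprocess(term) for term in data)
    max_score = max(counts.values()) if counts else 0
    return dict(counts), max_score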
def corpus_generation():

    filelist = glob.glob('../../web_spider/bilingual_article/正常语料/[0-9]*-*')
    chinese_corpus = []
    chinese_corpus_article_tokens = []
    chinese_corpus_tokens = []
    for file in filelist:
        text = fileload(file)
        chinese,english = preprocess(text)
        chinese_corpus.extend([chinese[1:]])
    for article in chinese_corpus:
        for sentence in article:
            chinese_corpus_article_tokens.extend(jieba.lcut(sentence,cut_all=False))
        chinese_corpus_tokens.append(chinese_corpus_article_tokens)
        chinese_corpus_article_tokens=[]
    return chinese_corpus_tokens
Example #38
def main(argv):
   malice_file = open(argv[1])
   malice_text = malice_file.read()
   alice_path = ""
   if(argv[1].find("/") != -1):
      alice_path = argv[1][:(argv[1].rindex("/")+1)]
   malice_text = PS.preprocess(malice_text, alice_path)


   parsingTree = parsing.getTree(malice_text).asList()
   lexer = Lexer()
   lexer.addMaliceTokens()
   parsingTree = lexer.replaceInTree(parsingTree)
   assemblyFile = argv[1][:(argv[1].rindex('.'))] + ".asm"
   if(semantics.check(parsingTree)):
      CG.generate(parsingTree, assemblyFile)
def main(run = 1, force_run = False):
	mkdir(_model_folder)
	if not force_run and len(os.listdir(_model_folder)) > 0:
		ans = input("Found something in '%s', which may be overwitten.\nProceed? [y/n]: "%_model_folder)
		if ans.lower() == 'n':
			exit(-1)

	for k in range(run):
		samples = preprocessing.tp_sample.get_samples(_sample_folder)
		if _name_filter is not None:
			samples = [s for s in samples if s.batch_name in _name_filter]
		print(np.var([get_label(s) for s in samples]))
		random.shuffle(samples)
		batches = preprocessing.batch_data(samples, cross_valid)
		for i in range(cross_valid):
			valid_samples = batches[i]
			train_samples = []

			savedir = "%s/%d/"%(_model_folder, i+1)
			mkdir(savedir)
			
			for j in range(cross_valid):
				if j != i:
					train_samples.extend(batches[j])
			
			if _filter_samples:
				train_samples = preprocessing.score_portion(train_samples, get_label, _high_portion, _low_portion)
			train_texts = [sample.text for sample in train_samples]
			valid_texts = [sample.text for sample in valid_samples]
			train_matrix, valid_matrix, words = preprocessing.preprocess(train_texts, valid_texts, savedir = savedir, **_strategy_parameters)
			train_labels = np.asarray([get_label(sample) for sample in train_samples])
			valid_labels = np.asarray([get_label(sample) for sample in valid_samples])
			model, valid_mse = None, None
			if _model_type == "NN":
				model = Neural_Network(_attributes, _hidden_nodes = hidden_nodes, _learning_rate = learning_rate)
				valid_mse = model.train(train_matrix, train_labels, valid_matrix, valid_labels, max_iter = 15000)
			else:
				model = SVR(**_svm_parameters)
				valid_mse = model.train(train_matrix, train_labels, valid_matrix, valid_labels)
			model.save(savedir)
			model.destroy()

			print("Fold %2d: %.4f"%(i+1, valid_mse))
Example #40
def prepare_data(f, categories, lowercase=True, stemming=False):
    data = []
    labels = []
    for row in f:
        if row[0] in categories:
            # if (ctg == "negative"):
            #     tokens = preprocess(row[1].decode("utf-8"), lowercase, False)
            # else:
            tokens = preprocess(row[1].decode("utf-8"), lowercase, stemming)
            new_str = " ".join([token for token in tokens])
            data.append(new_str)
            if (row[0] == 'negative'):
                labels.append(-1)
            elif (row[0] == 'neutral'):
                labels.append(0)
            elif (row[0] == 'positive'):
                labels.append(1)

    return data, labels
def corpus_generation():
    '''Convert the raw crawler data into a document-level corpus.
        Returns a list in which each element is the tokenized content of one
        article without its title; no further processing is applied.
    '''
    
    filelist = glob.glob('../../web_spider/bilingual_article/正常语料/[0-9]*-*')
    english_corpus = []
    english_corpus_article_tokens = []
    english_corpus_tokens = []
    for file in filelist:
        text = fileload(file)
        chinese,english = preprocess(text)
        english_corpus.extend([english[1:]])
    for article in english_corpus:
        for sentence in article:
            english_corpus_article_tokens.extend(nltk.word_tokenize(sentence))
        english_corpus_tokens.append(english_corpus_article_tokens)
        english_corpus_article_tokens = []
    return english_corpus_tokens
def main(run = 1, force_run = False):
	mkdir(_model_folder)
	if not force_run and len(os.listdir(_model_folder)) > 0:
		ans = input("Found something in '%s', which may be overwitten.\nProceed? [y/n]: "%_model_folder)
		if ans.lower() == 'n':
			exit(-1)

	for k in range(run):
		samples = preprocessing.tp_sample.get_samples(_sample_folder)
		if _name_filter is not None:
			samples = [s for s in samples if s.batch_name in _name_filter]
		print("Variance: %.3f"%np.var([get_label(s) for s in samples]))
		random.shuffle(samples)
		batches = preprocessing.batch_data(samples, _cross_valid)
		for i in range(_cross_valid):
			valid_samples = batches[i]
			train_samples = []

			savedir = "%s/%d/"%(_model_folder, i+1)
			mkdir(savedir)
			
			for j in range(_cross_valid):
				if j != i:
					train_samples.extend(batches[j])
			
			train_texts = [sample.comment for sample in train_samples]
			valid_texts = [sample.comment for sample in valid_samples]
			train_matrix, valid_matrix, words = preprocessing.preprocess(train_texts, valid_texts, savedir = savedir, **_strategy_parameters)
			
			#print("\tBag of words: %d"%len(words))

			train_labels = np.asarray([get_label(sample) for sample in train_samples])
			valid_labels = np.asarray([get_label(sample) for sample in valid_samples])
			model, valid_mse = None, None
			
			model = SVR(**_svm_parameters)
			valid_mse = model.train(train_matrix, train_labels, valid_matrix, valid_labels)
			model.save(savedir)
			model.destroy()

			print("Fold %2d: %.4f"%(i+1, valid_mse))
Example #43
def hidden_route():
    '''
    Used to receive calls from the server giving new data.
    '''
    # Receiving the raw data:
    #       string json output
    text = json.dumps(request.json,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))
    print 'hello'
    # with open('data/example.json') as f:
    #     json.dump(f)

    # print text

    # Cleaning the data:
    #       string json input -> pandas dataframe output
    input_data = json.loads(text)
    clean_data = preprocess(input_data)

    # Predicting on the data:
    #       pandas dataframe input -> boolean output
    prediction = model.predict_proba(clean_data)

    # Appending the prediction and time received to the cleaned data:
    #       pandas dataframe input -> dict output
    # return_data = clean_data.to_dict(orient='list')

    input_data['prediction'] = str(prediction[0][0])

    # return_data = dict((k, str(v[0])) for k, v in return_data.iteritems())

    input_data['time_received'] = time.time()

    # Writing the full data to the database:
    #       dict input

    tab.insert(input_data)

    return ''
Example #44
def main():

    par_dir = 'recordings/smartphone/'
    rec_num = '1430177546499'
    wavfile = par_dir + rec_num + '/' + rec_num + '.wav'
    magfile = par_dir + rec_num + '/' + rec_num + 'Mag.csv'
    accelfile = par_dir + rec_num + '/' + rec_num + 'Accel.csv'
    truthfile = par_dir + rec_num + '/' + rec_num + 'Truth.csv'

    data, truth = preprocessing.preprocess(wavfile, magfile, accelfile, truthfile)
    print "Begin Training..."
    if new_model:
        pipe = linear_model.LogisticRegression(solver='lbfgs', verbose=1)
    else:
        with open("pipe.model", "rb") as f:
            pipe = pickle.load(f)

    pipe.fit(data, truth)

    f = open("pipe.model", "wb")
    pickle.dump(pipe.sparsify(), f)

    print "Training Score:"
    print pipe.score(data, truth)
                    5.0 / rng, 1.0]  # All colors are scaled between 0 and 1
        clr_indx = zip(thresh, colors)
        cmap = m.colors.LinearSegmentedColormap.from_list('custom', clr_indx, 256)
        if True:  # Gordon requested we examine the fronts in gray scale mapping
            cmap = cm.get_cmap('gray')  # overwrite the previous color mapping
            minv = 0.01
            maxv = 0.2
        cmap.set_bad('black')  # Set the land to black (from masked)
        cmap.set_under('white')  # Set the clouds to white. set_under
                                 # means set all the values under minval to
                                 # white. (clouds are -1 and the minval is > 0.
                                 # Roughly 0.011)

        cmap.set_over('white')  # set_over does the opposite of set_under

        edgePts = pp.preprocess(data, cutoff)  # This is the actual routine which
                                          # finds the 'front'. The edge pixels
                                          # around each voronoi region are the
                                          # approximated front

        masked.data[edgePts.tolist()] = 255  # Here we set all the edge pixels
                                             # to 255, or white thanks to the
                                             # cmap.set_over routine.

        plt.imshow(masked, cmap=cmap, vmin=minv, vmax=maxv)  # create plot
        fig = plt.gcf()  # get current figure

        fig.set_size_inches((2 * cols) / 100.0, (2 * rows) / 100.0)  # set save
                                                                     # parameters
        plt.savefig(rsltsDir+f[:-3]+"_%f.png" % (c), dpi=100)
        # By combining set_size and the dpi in savefig, you have complete
print("Preprocessing.. ")
news_samples = [sample for sample in news_samples if sample.word_count >= _min_word_count and sample.section in _section_filter]

random.shuffle(news_samples)
n_samples = len(news_samples)
train_samples = news_samples[0:int(n_samples*_train_ratio)]
test_samples = news_samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(news_samples, _section_filter, get_section))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _section_filter, get_section))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _section_filter, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(train_texts, test_texts, words_src = "samples", normalize_flag = False, reduction = _reduction, reduce_n_attr = _reduce_n_attr, stem_words = _stem_words)

print("Generating labels..")
train_labels = preprocessing.samples_to_label(train_samples, _section_filter, get_section)
test_labels = preprocessing.samples_to_label(test_samples, _section_filter, get_section)

print("Training..")
kmeans = KMeans(n_clusters = len(_section_filter))
reference_output = kmeans.fit_predict(train_matrix)

# count[c, j]: for the cth cluster, how many texts belong to the jth section
count = np.zeros((len(_section_filter), len(_section_filter)))
for i in range(reference_output.shape[0]):
	c = reference_output[i]
	j = _section_filter.index(get_section(train_samples[i]))
	count[c, j] += 1 
"""
Created on Fri Feb 19 15:16:33 2016

@author: charles
"""

#LinearRegression

import pandas as pd
import preprocessing as pp
import numpy as np
import sklearn.linear_model

#%%
# this takes roughly 1 minute
X_train, Y_train, Date_train, Assignment_train = pp.preprocess(nrows=4000000)  

#%%
X_train, X_valid, Y_train, Y_valid = crossvalidate(X_train,Y_train,0.2)

#%%

model = sklearn.linear_model.LinearRegression()

model.fit(X_train,Y_train)

#%%

Y_test = model.predict(X_valid)
Y_train_pred = model.predict(X_train)
Example #48
def get_question(sample):
	return sample.question

samples = preprocessing.tp_sample.get_samples(_sample_folder)
samples = [s for s in samples if s.batch_name == _batch_name and s.question is not None]
random.shuffle(samples)
n_samples = len(samples)
train_samples = samples[0:int(n_samples*_train_ratio)]
test_samples = samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(train_texts, test_texts, words_src = "samples", normalize_flag = False)

if _model == "SVM":
	train_labels = preprocessing.samples_to_label(train_samples, _classes, get_question)
	test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)

	model = SVM()
	model.train(train_matrix, train_labels)
	predict = model.predict(test_matrix)

elif _model == "NN":
	train_dists = preprocessing.samples_to_dists(train_samples, _classes, get_question)
	test_dists = preprocessing.samples_to_dists(test_samples, _classes, get_question)
	model = Neural_Network(_n_factors = train_matrix.shape[1], _learning_rate = _learning_rate, _hidden_nodes = _hidden_nodes, _last_layer = len(_classes))
	model.train(train_matrix, train_dists, test_matrix, test_dists)
	predict = model.predict(test_matrix)
Example #49
def evaluate_classifier (numTrainR, numTrainN, numTestR, numTestN, model, verbose):
    '''
        I used code from
        http://www.nltk.org/book/ch06.html
        for this

    '''

    #load raw tweets:
    rawRacistTweets = loadRacistTweets(numTweets = numTrainR + numTestR, excludeJokes=True)
    rawNormalTweets = loadNonRacistTweets(numTweets = numTrainN + numTestN)
    #rawTweets = rawRacistTweets + rawNormalTweets

    print("Number of racist tweets: {}.".format(len(rawRacistTweets)));
    print("Number of normal tweets: {}.".format(len(rawNormalTweets)));

    #split into train/test sets
    trainR = rawRacistTweets[0:numTrainR];
    print(len(trainR))
    testR = rawRacistTweets[numTrainR:numTrainR + numTestR];
    print(len(testR))

    trainN = rawNormalTweets[0:numTrainN];
    print(len(trainN))
    testN = rawNormalTweets[numTrainN:numTrainN + numTestN];
    print(len(testN))

    #combine racist/non-racist tweets into single train/test datasets
    trainTweets = trainR + trainN;
    testTweets = testR + testN;

    #pre-process tweets (i.e. remove certain words):
    preprocessedTrainTweets = [(preprocess(d), c) for (d, c) in trainTweets];
    preprocessedTestTweets = [(preprocess(d), c) for (d, c) in testTweets];

    featureExtractor = FeatureExtractor([FeatureExtractor.UNIGRAM, FeatureExtractor.BIGRAM])
    #featureExtractor.train_TF_IDF(trainTweets)

    #compute training & testing features
    trainFeats = [(featureExtractor.get_feature_vector(d), c) for (d,c) in preprocessedTrainTweets];
    testFeats = [(featureExtractor.get_feature_vector(d), c) for (d,c) in preprocessedTestTweets];

    if model == 'SVM':
        classifier = nltk.classify.SklearnClassifier(LinearSVC());
        classifier.train(trainFeats);

        #evaluate SVM classifier
        print("----------------------");
        print("SVM Classifier");
    elif model == 'RF':
        rf = RF(n_estimators=75, max_features='sqrt', class_weight='auto',
                criterion="entropy", min_samples_split=9, random_state=0)
        classifier = nltk.classify.SklearnClassifier(rf);
        classifier.train(trainFeats);
        #evaluate RF classifier

        print("----------------------");
        print("RF Classifier");
    #note that TF-IDF cannot be set when model=NB
    elif model == 'NB':
        # Bayes
        classifier = nltk.NaiveBayesClassifier.train(trainFeats);
        print("----------------------");
        print("NB Classifier");

    print("accuracy: %.3f" %nltk.classify.accuracy(classifier, testFeats));
    Y_test = [testFeat[1] for testFeat in testFeats]
    Y_pred = classifier.classify_many([testFeat[0] for testFeat in testFeats])
    conf=metrics.confusion_matrix(Y_test, Y_pred, [0,1])
    precision, recall, fscore = precision_recall_fscore(conf, 1)

    print("precision: %.3f" %precision)
    print("recall: %.3f" %recall)
    print("f1 score: %.3f" %fscore)
    print("%.1f\%% & %.1f\%% & %.1f\%%" %(100*precision,100*recall,100*fscore))

    print("confusion matrix:")
    print(conf)

    if verbose:
        FP_indeces = np.where(np.subtract(Y_pred, Y_test)==1)[0]
        FN_indeces = np.where(np.subtract(Y_pred, Y_test)==-1)[0]
        for FP_index in FP_indeces:
            print("False positive: {}".format(' '.join(testTweets[FP_index][0])))
        for FN_index in FN_indeces:
            print("False negative: {}".format(' '.join(testTweets[FN_index][0])))
Example #50
def main():
  # Read the data from the text files
  begin = time.time()
  vocab, train_raw, test_raw = read.read_tweets("../training_set_tweets.txt", "../test_set_tweets.txt")
  print "Num of Train users:", len(train_raw), "Num of Test users:", len(test_raw)
  print "Read data:", time.time() - begin

  # Preprocess the data
  begin = time.time()
  vocab, bigrams, train_word, test_word, train_char, test_char = preprocessing.preprocess(train_raw, test_raw)
  print "Preprocessed the data", time.time() - begin

  return
  # Assign ids to words
  vocab_list = list(vocab)
  vocab_list.sort()
  begin = time.time()
  vocab_dict = {}
  for i in range(len(vocab_list)):
      vocab_dict[vocab_list[i]] = i
  print "Assigned ids to words:", time.time() - begin

  # Build train and test set
  num_full_feats = len(vocab_list) + 10
  num_train_tweets = 0
  num_test_tweets = 0
  # num_train_tweets = np.count_nonzero(~np.isnan(train))
  # num_test_tweets = np.count_nonzero(~np.isnan(test))
  for author_id in train:
      num_train_tweets += len(train[author_id])
  for author_id in test:
      num_test_tweets += len(test[author_id])
  X_train = np.zeros((num_train_tweets, num_full_feats))
  y_train = np.zeros(num_train_tweets)
  X_test = np.zeros((num_test_tweets, num_full_feats))
  y_test = np.zeros(num_test_tweets)

  # Build train and test set
  num_full_feats = len(vocab_list) + 10
  num_train_tweets = 0
  num_test_tweets = 0
  # num_train_tweets = np.count_nonzero(~np.isnan(train))
  # num_test_tweets = np.count_nonzero(~np.isnan(test))
  for author_id in train_word:
      num_train_tweets += len(train_word[author_id])
  for author_id in test_word:
      num_test_tweets += len(test_word[author_id])
  X_train = np.zeros((num_train_tweets, num_full_feats))
  y_train = np.zeros(num_train_tweets)
  X_test = np.zeros((num_test_tweets, num_full_feats))
  y_test = np.zeros(num_test_tweets)

  count = 0

  for author_id in train_word:
      for tweet in train_word[author_id]:
          X_train[count, :] = features.get_full_feats(tweet, vocab_dict)
          y_train[count] = author_id
          count += 1
  print count

  count = 0
  for author_id in test_word:
      for tweet in test_word[author_id]:
          X_test[count, :] = features.get_full_feats(tweet, vocab_dict)
          y_test[count] = author_id
          count += 1
  print count

  begin = time.time()
  feats = feature_selection.select_features(X_train, y_train, np.zeros(num_full_feats), 100, "dia")
  X_train = X_train[:, feats]
  X_test = X_test[:, feats]
  print "Features selected:", time.time() - begin

  begin = time.time()
  clf = model.train(X_train, y_train)
  acc, my_acc, preds, scores = model.test(clf, X_test, y_test)
  print 'time:', time.time()-begin, 'acc:', acc, 'my_acc:', my_acc
  print 'preds:', preds
  print 'scores:', scores

  print (preds == y_test)[:100]
  print np.count_nonzero(scores > 0)
  print np.count_nonzero(scores < 0)
	_sections = list(_sections.keys())
	print("Grouped sections:", _sections)
	for sample in news_samples:
		sample.section = _section_group_map[sample.section]
	
train_samples = news_samples[0:int(n_samples*_train_ratio)]
test_samples = news_samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(news_samples, _sections, get_section))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _sections, get_section))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _sections, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(train_texts, test_texts, selection = "tfidf", select_top = _textbook_words, savedir = _save_dir, words_src = "textbook", normalize_flag = False, reduction = _reduction, reduce_n_attr = _reduce_n_attr, stem_words = _stem_words)

for section in _sections:
	train_labels = preprocessing.samples_to_binary(train_samples, [section], get_section)
	test_labels = preprocessing.samples_to_binary(test_samples, [section], get_section)

	model = SVR()
	print("Training for %s section.. "%section)

	model.train(train_matrix, train_labels)
	predict = model.predict(test_matrix)
	accuracy = 0
	for i in range(predict.shape[0]):
		if predict[i] >= 0.5:
			predict[i] = 1
		else:
			predict[i] = 0
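	# (The original example is truncated here.  A plausible continuation --
	#  an editor's assumption, not from the source -- would score the
	#  thresholded predictions, e.g. accuracy = np.mean(predict == test_labels).)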
_model = "./output/5"
_name_filter = ["KK201617T1", "KK201617T2"]

_words = []
_norm_dict = None
pca_components = None
model = None

with open(_model+"/preprocess.json", "r") as f:
	preprocess_dict = json.load(f)
	_words = preprocess_dict["words"]
	if "norm_info" in preprocess_dict:
		_norm_dict = preprocess_dict["norm_info"]
	if preprocess_dict["pca"]:
		pca_components = np.load(_model+'/pca.npy')

def get_label(sample):
	#return sample.think + sample.understand + sample.lang + sample.pres
	return sample.think + sample.understand

samples = preprocessing.tp_sample.get_samples(_sample_folder)
texts = [sample.comment for sample in samples if sample.batch_name in _name_filter]
test_matrix, _, _ = preprocessing.preprocess(texts, words_src = _words)
if pca_components is not None:
	test_matrix = np.matmul(test_matrix, pca_components.T)
if _norm_dict is not None:
	test_matrix, _, _ = preprocessing.normalize(test_matrix, norm_info = _norm_dict)
model = models.SVR.load(_model)
result = model.predict(test_matrix)
print([get_label(sample) for sample in samples if sample.batch_name in _name_filter])
print(result)
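# Editor's note (assumption, not in the original): with the label list
# restricted to the same filtered samples, a quick sanity check could be
#   labels = np.array([get_label(s) for s in samples if s.batch_name in _name_filter])
#   print("Pearson r:", np.corrcoef(labels, result)[0, 1])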
Beispiel #53
0
 def runSyllableClassification(self, SyllPath = None, nTrain = 50, nTest = 20, cType = 2, useStoredPatts = True, 
                               useRawOutput = True, pattTimesteps = None, maxPauseLength = 3, dataPrepParams = {}, cLearningParams = {}):
     """
     :Description: Function that learns conceptors for each syllable in self.Songs and
                   tries to classify the sequence of syllable generated from repeating the songs several times
     
     :Parameters:
         1. SyllPath:            If useSyllRecog = True, this needs to be the full directory to the folder including the syllable data (default = None)
         2. nTrain:              number of training samples to be used for each syllable (default = 50)
         3. nTest:               number of test samples to be used for each syllable (default = 10)
         4. cType:               index that indicates from which conceptor to use the recognition results {0 = pos, 1 = neg, 2 = combined} (default = 2)
         5. useStoredPatts:      if True, run syllable classification on self.patterns, else create new sequence according to repetition times in pattTimesteps (default = True)              
         6. useRawOutput:        if True, store evidences from chosen conceptore as patterns. If False, apply winner-takes-it-all classification on evidences (default = True)       
         7. pattTimesteps:       list of scalars representing the lengths each song in self.Songs should be presented at test time (only necessary if useStoredPatts is False)
         8. maxPauseLength:      Maximal length of pauses to be added randomly after each song (default = 3)            
         8. dataPrepParams:      dictionary of keyword arguments for data preprocessing if syllable recognition is to be used (defaults of preprocessing function will be used if not specified)
         9. cLearningParams:     dictionary of keyword arguments to learn a conceptor for each syllable if syllable recognition is to be used (defaults of syllable classifier will be used if not specified)
     
     :Returns: 
         1. newPatts:            List of patterns with recognition evidences for each syllable played
     """
     
     if self.verbose: print('Running syllable recognition...')
     path = os.path.dirname(os.path.abspath(__file__)) if SyllPath is None else os.path.abspath(SyllPath)
     self.path = path
     
     # generate sequence of syllables from patterns to use syllableClassifier on
     syllClassPatts = np.zeros((1,self.nSylls))
     # either use stored patterns
     if useStoredPatts:
         pattTimesteps = []
         for p in self.patterns:
             syllClassPatts = np.append(syllClassPatts, p, axis = 0)
             pattTimesteps.append(len(p))
     # or create sequences of lengths according to entries in pattTimesteps
     else:
         for i,t in enumerate(pattTimesteps):
             patt = self.patterns[i][0:len(self.Songs[i]),:]
             syllClassPatts = np.append(syllClassPatts, np.tile(patt, [round(t/len(self.Songs[i])),1]), axis = 0)
     syllClassPatts = syllClassPatts[1:,:]
     
     
     
     # if conceptors for syllables have not been learned already, learn them 
     if not self.syllableConceptorsLearned:
         # get list with unique syllables and create preprocessed  training and test data
         songs = []
         for s in self.Songs:
             songs += s
         songs = set(songs)
         self.SyllClassData = preprocess(path, self.nSylls, nTrain, np.ones(self.nSylls) * nTest, syll_names = self.Sylls, **dataPrepParams)
         # initialize syllableClassifier and train it on training data
         self.SyllClass = syllableClassifier(
             cLearningParams['neurons'],
             cLearningParams['spectral_radius'],
             cLearningParams['bias_scale'],
             cLearningParams['inp_scale'],
             cLearningParams['conn'])
         self.SyllClass.cLearning(nTrain, self.SyllClassData['train_data'], cLearningParams['gammaPos'], cLearningParams['gammaNeg'])
         self.syllableConceptorsLearned = True
         
     
     # run classification on syllClassPatts and store the evidences for each presented syllable
     sampleIdx = [0,round(nTest/2)]
     results = self.SyllClass.cTest(self.SyllClassData['test_data'], pattern = syllClassPatts, sampleIdx = sampleIdx)
     evidences = results['evidences'][cType]
     if not useRawOutput:
         evidences_tmp = np.zeros_like(evidences)
         for syll in range(evidences.shape[0]):
             evidences_tmp[syll,np.argmax(evidences[syll,:])] = 1
         evidences = evidences_tmp
     sampleIdx = [round(nTest/2),nTest-1]
     
     # create list with entries for each pattern and store the respective evidences in those entries
     t_all = 0
     newPatts = []
     for i,t in enumerate(pattTimesteps):
         patt = np.zeros((1,self.nSylls))
         for j in range(round(t/len(self.Songs[i]))):
             pause_length = np.random.randint(maxPauseLength)
             patt_tmp = np.concatenate((evidences[t_all + j*len(self.Songs[i]) : t_all + (j+1)*len(self.Songs[i]),:], np.zeros((pause_length,self.nSylls))), axis = 0)
             patt = np.vstack((patt, patt_tmp))
             pattTimesteps[i] += pause_length
         patt = patt[1:,:]
         newPatts.append(patt)
         t_all += (j+1)*len(self.Songs[i])
     
     return newPatts
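 # --- Editor's usage sketch (assumption, not in the original) ---------------
 # songModel = ...  # hypothetical instance of the class this method belongs to
 # newPatts = songModel.runSyllableClassification(
 #     SyllPath='path/to/syllable/data', nTrain=50, nTest=20, cType=2,
 #     cLearningParams={'neurons': 400, 'spectral_radius': 1.2,
 #                      'bias_scale': 0.2, 'inp_scale': 1.0, 'conn': 1.0,
 #                      'gammaPos': 25, 'gammaNeg': 27})
 # All values above are hypothetical placeholders; the cLearningParams keys are
 # the ones read by this method.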
Beispiel #54
0
def main():	
	parser = argparse.ArgumentParser(description="Run QA-CLEF-System")
	parser.add_argument('--preprocess',action="store_true")
	parser.add_argument('--train',action="store_true")
	parser.add_argument('--answeronly',action='store_true')
	parser.add_argument('--selftest',action='store_true')
	parser.add_argument('--data',nargs = '+',default=[2011],type=int)
	parser.add_argument('--test',nargs = '+',default=[2012],type=int)
	parser.add_argument('--forcedownload',action='store_true')
	parser.add_argument('--preprocessonly',action='store_true')
	parser.add_argument('--ngram', type=int, default=3)
	parser.add_argument('--threshold', type=float, default=0.5)
	parser.add_argument('--report',action='store_true')
	args = parser.parse_args()
	process_args(args)

	data = []
	for edition in args.data + args.test:
		_data = qacache.find_data(edition)

		if args.preprocess or _data is None:
			input_check([edition],args.forcedownload)

			_data = input_parse([edition])

			print >> sys.stderr, 'preprocessing ' + str(edition) + '-data'
			_data = preprocessing.preprocess(_data)

			qacache.store_preprocessed_data(edition,_data[0])
		else:
			print >> sys.stderr, str(edition) + '-data is found on cache/' + str(edition) + '-preprocessed.txt'
		data.append(_data)

	if args.preprocessonly:
		print >> sys.stderr, 'Preprocess-only task is done.'
		sys.exit(0)

	# build-model
	print >> sys.stderr, 'Building model...'
	training_model = model_builder.build_model(data[:len(args.data)])
	test_model = model_builder.build_model(data[-len(args.test):]) if len(args.test) != 0 and not args.selftest else []

	# scoring
	print >> sys.stderr, 'Unweighted Feature Scoring...'
	training_model and scoring.score(training_model)
	test_model and scoring.score(test_model)

	# training
	weight = qacache.stored_weight()
	if args.train or weight is None:
		print >> sys.stderr, 'Training...'
		weight = train(training_model)
	else:
		print >> sys.stderr, 'Weight is found on cache/weight.txt'

	# weighted_scoring
	print >> sys.stderr, 'Weighted Feature Scoring...'
	final = scoring.weighted_scoring(training_model if args.selftest else test_model, weight)

	# answer selection
	select_answer(final,args.threshold)

	# evaluation
	result = evaluate(final)

	qacache.write_json(final,'final.txt',indent=True)

	if args.report:
		report(final, args.test if not args.selftest else args.data,weight)

	print "Result: %f" % result
random.shuffle(news_samples)
n_samples = len(news_samples)

train_samples = news_samples[0:int(n_samples*_train_ratio)]
test_samples = news_samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(news_samples, _sections, get_section))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _sections, get_section))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _sections, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

tfidf_vectorizer = get_tfidfVectorizer_of_essay_top_tf_words()
print("Vectorizer built..")
train_matrix, test_matrix, words = preprocessing.preprocess(train_texts, test_texts, savedir = _save_dir, words_src = tfidf_vectorizer, normalize_flag = False, reduction = _reduction, reduce_n_attr = _reduce_n_attr,  stem_words = _stem_words)
model = None
print("Generating labels..")
if _model == "SVM":
	train_labels = preprocessing.samples_to_label(train_samples, _sections, get_section)
	test_labels = preprocessing.samples_to_label(test_samples, _sections, get_section)

	model = SVM()
	print("Training.. ")
	model.train(train_matrix, train_labels)
	predict = model.predict(test_matrix)

elif _model == "NN":
	train_dists = preprocessing.samples_to_dists(train_samples, _sections, get_section)
	test_dists = preprocessing.samples_to_dists(test_samples, _sections, get_section)
	model = Neural_Network(_n_factors = train_matrix.shape[1], _learning_rate = _learning_rate, _hidden_nodes = _hidden_nodes, _last_layer = len(_sections))
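	# (The original example is truncated here.  Presumably -- an editor's
	#  assumption, not from the source -- the NN branch continues analogously
	#  to the SVM branch above, e.g. model.train(train_matrix, train_dists)
	#  followed by predict = model.predict(test_matrix).)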
def main(parameters):
    config = SafeConfigParser()
    config.read(parameters)

    ROOTDIR = config.get('filepaths', 'corpus')

    if len(os.listdir(ROOTDIR)) < 2:
        documents = []
        with codecs.open(config.get('filepaths', 'basefile'), encoding='utf-8') as f:
            for (source, labels, text) in unicode_csv_reader(f):
                labels = clean_labels(labels)
                documents.append((source, labels, text))
        for fold in k_fold_cross_validation(documents, 10):
            print fold
            fold, training, validation = filter_motifs(fold)
            with open(ROOTDIR + 'fold-%s.training.txt' % fold, 'w') as out:
                writer = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
                for (source, motifs, text) in training:
                    #motifs = [motif for motif in motifs if motif != 'DUMMY']
                    writer.writerow([source, ' '.join(motifs).encode('utf-8'), text.encode('utf-8')])
            with open(ROOTDIR + 'fold-%s.validation.txt' % fold, 'w') as out:
                writer = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
                for (source, motifs, text) in validation:
                    writer.writerow([source, ' '.join(motifs).encode('utf-8'), text.encode('utf-8')])

    training_preprocessor = lambda t: preprocess(
        t, encoding=config.get('preprocessing', 'encoding'),
        strip_accents = 'unicode' if config.getboolean('preprocessing', 'strip-accents') else None,
        strip_punct = config.getboolean('preprocessing', 'strip-punctuation'),
        lowercase = config.getboolean('preprocessing', 'lowercase'),
        max_df = config.getfloat('preprocessing', 'maximum-document-frequency'),
        min_df = config.getint('preprocessing', 'minimum-document-frequency'),
        min_word_len = config.getint('preprocessing', 'minimum-word-length'),
        join=False)

    validation_preprocessor = lambda t: preprocess(
        t, encoding=config.get('preprocessing', 'encoding'),
        strip_accents = 'unicode' if config.getboolean('preprocessing', 'strip-accents') else None,
        strip_punct = config.getboolean('preprocessing', 'strip-punctuation'),
        lowercase = config.getboolean('preprocessing', 'lowercase'),
        max_df = 1.0,
        min_df = 1.0,
        min_word_len = config.getint('preprocessing', 'minimum-word-length'),
        join=False)

    documents = defaultdict(list)
    for document in os.listdir(ROOTDIR):
        if not document.startswith('.') and document.startswith('fold'):
            documents[document.split('.')[0]].append(document)

    globalAP = []
    globalMargin = []
    globalOneError = []
    globalIsError = []

    system = config.get('system', 'system')
    if system == 'llda':
        system = llda
    elif system.upper() in ('SGD', 'SVC', 'KNN', 'NB'):
        system = classifier
    elif system == 'BM25':
        system = MR
    elif system == 'lldaTfidf':
        system = lldaTfidf
    else:
        raise ValueError("Unsupported system choice: %s" % system)

    for k, (fold, docs) in enumerate(documents.iteritems()):
        # sort the two filenames so 'fold-N.training.txt' comes before 'fold-N.validation.txt'
        training_docs, test_docs = sorted(docs)
        assert 'training' in training_docs and 'validation' in test_docs
        training = list(cleanfile(ROOTDIR + training_docs, training_preprocessor, label_df=1))
        validation = list(cleanfile(ROOTDIR + test_docs, validation_preprocessor, label_df=1))
        isError, oneError, nDocs, margins, AP = system.run(training, validation, k, config)
        isError = isError / float(nDocs)
        oneError = oneError / float(nDocs)
        margins = sum(margins) / float(nDocs)
        AP = sum(AP) / float(nDocs)
        globalIsError.append(isError)
        globalOneError.append(oneError)
        globalMargin.append(margins)
        globalAP.append(AP)

        print 'Fold:', k
        print '-' * 80
        print 'Num training docs:', len(training)
        print 'Num validation docs:', len(validation)
        print 'Average Precision:', AP
        print 'Is Error:', isError
        print 'One Error:', oneError
        print 'Margin:', margins
        print '-' * 80

    print 'AVERAGE AP:', sum(globalAP) / len(globalAP)
    print 'AVERAGE ONE ERROR:', sum(globalOneError) / len(globalOneError)
    print 'AVERAGE IS ERROR:', sum(globalIsError) / len(globalIsError)
    print 'AVERAGE MARGIN:', sum(globalMargin) / len(globalMargin)

    output_dir = os.path.join('Data', sys.argv[-1])
    with open(os.path.join(output_dir, 'output.txt'), 'w') as out:
        out.write('Average Precision: %f\n' % (sum(globalAP) / len(globalAP)))
        out.write('Average One Error: %f\n' % (sum(globalOneError) / len(globalOneError)))
        out.write('Average Is Error: %f\n' % (sum(globalIsError) / len(globalIsError)))
        out.write('Average Margin: %f\n' % (sum(globalMargin) / len(globalMargin)))
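# --- Editor's sketch (assumption, not in the original) ----------------------
# A parameter file read by SafeConfigParser above would contain sections and
# keys along these lines; every path and value is a hypothetical placeholder:
#
#   [filepaths]
#   corpus = /path/to/corpus/
#   basefile = /path/to/base.csv
#
#   [preprocessing]
#   encoding = utf-8
#   strip-accents = true
#   strip-punctuation = true
#   lowercase = true
#   maximum-document-frequency = 0.5
#   minimum-document-frequency = 2
#   minimum-word-length = 3
#
#   [system]
#   system = llda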
Beispiel #57
0
    for item in tupleList:
        for word in item[0]:
            res.append(word)
    return res



'''
    I used code from
    http://www.nltk.org/book/ch06.html
    for this

'''
if __name__ == "__main__" :
    print("NB start");
    racistTweets = [(preprocess(d), c) for (d, c) in loadRacistTweets(excludeJokes=True)];
    normalTweets = [(preprocess(d), c) for (d, c) in loadNonRacistTweets(numTweets=len(racistTweets))];

    print("Number of racist tweets: {}.".format(len(racistTweets)));
    print("Number of normal tweets: {}.".format(len(normalTweets)));

    numTrain = 1500;
    numTest = 500;

    trainR = racistTweets[0:numTrain];
    testR = racistTweets[numTrain:numTrain + numTest];

    trainN = normalTweets[0:numTrain];
    testN = normalTweets[numTrain:numTrain + numTest];
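    # Editor's note: the example is cut off here.  A plausible continuation
    # (an assumption, not from the source) would map each (tokens, label) pair
    # to an NLTK feature dict and train a Naive Bayes classifier, e.g.
    #   trainFeats = [({w: True for w in toks}, c) for (toks, c) in trainR + trainN]
    #   testFeats = [({w: True for w in toks}, c) for (toks, c) in testR + testN]
    #   classifier = nltk.NaiveBayesClassifier.train(trainFeats)
    #   print("accuracy: %.3f" % nltk.classify.accuracy(classifier, testFeats))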

def runDNN(path, syllN, trainN, cvalRuns, sampRate, interpolType, mfccN, invCoeffOrder, winsize, melFramesN,
        smoothL, polyOrder, incDer, snr = 0.0, syllNames = None, layerSizes = [60,10], activationFcts = 'tanh', 
        dropouts = [], normalizations = [], optimizer = 'Adam', learningRate = 0.0005, batchSize = 10, 
        nEpochs = 10, loss = 'CrossEntropyExclusiveSparse', validate_per_step = 100, samplingSDs = 0.05):
    """
    Function that runs syllable classification in a supervised manner using positive, negative and combined
    conceptors.
    """
    
    path = os.path.abspath(path)

    """ assign parameters """
    
    prepParams = {
        'syll_names': syllNames,
        'sample_rate': sampRate,
        'ds_type': interpolType,
        'mel_channels': mfccN,
        'inv_coefforder': invCoeffOrder,
        'winsize': winsize,
        'frames': melFramesN,
        'smooth_length': smoothL,
        'inc_der': incDer,
        'poly_order': polyOrder,
        'snr': snr
    }

    performances = []
    evidences = []

    for i in range(cvalRuns):

        n_test = np.ones(syllN, dtype = int)*20
        
        Samples = []
        if cvalRuns > 1:
            for j in range(syllN):
    
                indices = np.arange(0, trainN + n_test[j], 1)
                ind_tmp = indices.copy().tolist()
                random.shuffle(ind_tmp)
                ind_tmp = np.array(ind_tmp)
                
                Samples.append(ind_tmp)

        """ Get and preprocess data """
        
        data = preprocessing.preprocess(path, syllN, trainN, n_test, samples = Samples, **prepParams)
        trainData = data['train_data']
        testData = data['test_data']
        
        inpDim = mfccN*(1+sum(incDer))*smoothL
        testL = int(n_test[0]/2)
        data_train = np.zeros((len(trainData)*trainN, inpDim))
        labels_train = np.zeros(len(trainData)*trainN)
        data_test = np.zeros((len(testData)*testL, inpDim))
        labels_test = np.zeros(len(testData)*testL)
        data_validate = np.zeros_like(data_test)
        labels_validate = np.zeros_like(labels_test)
        
        for t,syll in enumerate(trainData):
            
            for s,sample in enumerate(syll):
            
                data_train[t*trainN+s,:] = sample.flatten()
                labels_train[t*trainN+s] = t
        
        for t,syll in enumerate(testData):
            
            for s,sample in enumerate(syll):
                
                if s < n_test[0]/2:
                    
                    data_test[t*testL+s,:] = sample.flatten()
                    labels_test[t*testL+s] = t
                
                else:
                    
                    data_validate[t*testL+s-testL,:] = sample.flatten()
                    labels_validate[t*testL+s-testL] = t
        
        """ create DNN """
        
        syllClassifier = DeepNN(inpDim, 1)
        
        if not any(dropouts):
            
            dropouts = np.zeros(len(layerSizes))
            
        if not any(normalizations):
            
            normalizations = np.zeros(len(layerSizes))
        
        if not isinstance(samplingSDs, np.ndarray):
            
            samplingSDs = np.zeros(len(layerSizes)) + samplingSDs
            
        for n,l in enumerate(layerSizes):
            
            if n == 0:
                
                if type(activationFcts) is not list:
                    
                    syllClassifier.addLayer(l, activationFcts, include_bias = True, normalization = normalizations[n], dropout = dropouts[n], sd = samplingSDs[n])  
                    
                else:
                    
                    syllClassifier.addLayer(l, activationFcts[n], include_bias = True, normalization = normalizations[n], dropout = dropouts[n], sd = samplingSDs[n])
                    
            else:
            
                if type(activationFcts) is not list:
                    
                    syllClassifier.addLayer(l, activationFcts, include_bias = False, normalization = normalizations[n], dropout = dropouts[n], sd = samplingSDs[n])  
                    
                else:
                    
                    syllClassifier.addLayer(l, activationFcts[n], include_bias = False, normalization = normalizations[n], dropout = dropouts[n], sd = samplingSDs[n])
        
        """ train DNN and classify test data """
        
        data = [data_train,data_validate]
        labels = [labels_train, labels_validate]
        syllClassifier.train(data, labels, loss_type = loss, optimizer_type = optimizer, learning_rate = learningRate, n_epochs = nEpochs, 
                             batch_size = batchSize, validate_per_step = validate_per_step, verbose = 20)
        syllClassifier.test(data_test, labels_test, normalize = True)
        results = syllClassifier.test_predictions
        performance = np.mean(np.argmax(results, axis = 1) == labels_test)
        evidences.append(results)
        performances.append(performance)

    cval_results = {'Evidences': evidences, 'Labels': labels_test, 'Performances': performances}
    
    return cval_results
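# --- Editor's usage sketch (assumption, not in the original) ----------------
# cval_results = runDNN('path/to/syllables', syllN=30, trainN=50, cvalRuns=5,
#                       sampRate=20000, interpolType='mean', mfccN=12,
#                       invCoeffOrder=True, winsize=20, melFramesN=64,
#                       smoothL=4, polyOrder=3, incDer=[True, True],
#                       layerSizes=[60, 10], nEpochs=10)
# All values are hypothetical placeholders; cval_results['Performances'] holds
# one test accuracy per cross-validation run.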
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 18 11:13:56 2016

@author: charles
"""

import pandas as pd
import numpy as np
import preprocessing as pp

#%%
# this takes roughly 1 minute
X_train2D, Y_train2D, Date_train2D, Assignment_train2D = pp.preprocess()

#%%
X_train, Y_train, Category_train,nInputNumber, nSequenceLength, nInputDim, scalerY = pp.preprocessDeep()

#%%

from __future__ import print_function
from keras.preprocessing import sequence
from keras.models import Graph
from keras.layers.core import TimeDistributedDense, Dropout, Masking
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM



def buildLSTM(nSequenceLength, nInputDim,  nLSTMoutputDim = 100):