def get_data(data_file, data_type="TRAIN"):
    file_path = data_file
    train_json = dict()
    xmlp = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(file_path, parser=xmlp)
    root = tree.getroot()
    pbar = tqdm(total=len(root.findall('pair')))

    print('\nLoading premise and hypothesis sentences from disk...')
    print('-' * 55)
    for id, pair in enumerate(root.findall('pair')):
        # pair_ID = pair.find('pair id').text
        text1 = pair.find('t1').text
        text2 = pair.find('t2').text

        preprocessed_premise = pre.process(text1)
        preprocessed_hyp = pre.process(text2)

        temp = dict()
        temp['text1'] = preprocessed_premise.lstrip()
        temp['text2'] = preprocessed_hyp.lstrip()

        if data_type == "TRAIN":
            label = pair.find('Label').text
            if label == 'Y': label = 1
            if label == 'N': label = 0
            temp['label'] = label

        train_json[id] = temp
        pbar.update(1)

    pbar.close()
    return train_json

def run_classification():
    if not path.exists(PROCESSED_CORPUS_PATH):
        preprocessing.process(SRC_DIR)

    if not path.exists(FEATURES_FILE):
        x, y, le = extract_books_features_from_corpus()
        save_book_features_to_file(x, y, le)
    else:
        x, y = load_features_from_file()

    hybrid_classification(x, y)
Example #3
File: main.py Project: rajreet/ravi
def main():

    for image_count in range(1, 6):

        process_dict = {}
        image = cv2.imread(f"input/{image_count}.jpg", cv2.IMREAD_GRAYSCALE)
        # cv2.imwrite('ocr.png',image)

        # if (image.shape[0] < image.shape[1]):
        #     image =

        process_dict["image"] = image
        process_dict["image_height"] = image.shape[0]
        process_dict["image_width"] = image.shape[1]
        # retval, thresh = cv2.threshold(image,200,255,cv2.THRESH_BINARY)

        print(image.shape)
        # cv2.imwrite("thresh_200.png",thresh)

        threshold = cv2.adaptiveThreshold(image, 255,
                                          cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                          cv2.THRESH_BINARY, 201, 15)
        # blur = cv2.GaussianBlur(image,(7,7),0)
        # retval, threshold = cv2.threshold(blur,0,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU)

        process_dict["binary"] = threshold

        process_dict["count"] = image_count

        process_dict = process(process_dict)
        print(f"Image preprocessing done for {image_count}.")
Example #4
    def __getitem__(self, case_num, size=(128, 128, 128)):
        if self.is_validation:
            case_num += 200
        try:
            img = np.load(self.path + '/' + "{}_i.npy".format(case_num))
            mask = np.load(self.path + '/' + "{}_m.npy".format(case_num))
            return img, mask

        except (IOError, FileNotFoundError):
            img = self.df.loc[self.df['case_id'] == case_num]['image']
            #print("THIS IS WHAT WE ON")
            img = nib.load(img.values[0])
            affine = img.affine
            img = img.get_fdata()
            img = process(img, size)
            img = np.expand_dims(img, axis=3)
            img = np.expand_dims(img, axis=0)
            mask = self.df.loc[self.df['case_id'] == case_num]['mask']
            mask = nib.load(mask.values[0])
            mask = mask.get_data()
            #mask = resize_image(mask, size, is_mask=True)
            mask_off = mask > 1.5
            mask[mask_off] = 1  # turn all tumor pixels into kidney pixels
            mask = np.expand_dims(mask, axis=3)
            mask = np.expand_dims(mask, axis=0)

            np.save(self.path + '/' + "{}_i".format(case_num), img)
            np.save(self.path + '/' + "{}_m".format(case_num), mask)
            return img, mask
Example #5
def process(X, size):
    a, b = size
    X_new = []
    for i in range(len(X)):
        X_new.append(preprocessing.process(X[i], [a, b]))
        #preprocessing.make_image(preprocessing.process(X[i],[12,12]),str(i))
    return np.array(X_new)
Example #6
def kfold_cv(x, y, model):
    nested_train_scores = list()
    nested_test_scores = list()

    outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in tqdm(outer_cv.split(x)):
        # split data
        xTrain = x.iloc[train_index]
        xTest = x.iloc[test_index]
        yTrain = y.iloc[train_index]
        yTest = y.iloc[test_index]
        xTrain, yTrain, xTest, yTest = preprocessing.process(
            xTrain, yTrain, xTest, yTest)

        # PCA (number of components chosen such that the amount of variance
        # that needs to be explained is greater than the percentage specified by n_components)
        sklearn_PCA = PCA(n_components=0.95, svd_solver='full')
        xTrain = sklearn_PCA.fit_transform(xTrain)
        xTest = sklearn_PCA.transform(xTest)
        # xTrain, yTrain, xTest, yTest = df_to_numpy(xTrain, yTrain, xTest, yTest)

        md = model.fit(xTrain, yTrain)

        # Train Score
        yHat1 = md.predict(xTrain)
        r2 = r2_score(yTrain, yHat1)
        nested_train_scores.append(r2)

        # Test Score
        yHat2 = md.predict(xTest)
        r2 = r2_score(yTest, yHat2)
        nested_test_scores.append(r2)
    return nested_train_scores, nested_test_scores
Example #7
    def predict(self, datafile):
        """Predicts class labels for the input instances in file 'datafile'
        Returns the list of predicted labels
        """
        lines = self.retrieveData(datafile)
        tokens, data = proc.process(lines)
        return self.clf.predict(data.iloc[:, 1:data.shape[1]])
Example #8
    def train(self, trainfile):
        """Trains the classifier model on the training set stored in file trainfile"""
        lines = self.retrieveData(trainfile)
        data, y_train = proc.process(lines)

        vocab = []
        for sent in data['words_in_window']:
            for w in sent:
                if w not in vocab:
                    vocab.append(w)
        vocab_size = len(vocab)

        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.tokenizer.fit_on_texts(data.words_in_window)

        sentiment_tokenized = pd.DataFrame(
            self.tokenizer.texts_to_matrix(data.words_in_window))

        self.clf_tok = Sequential()
        self.clf_tok.add(
            Dense(128, input_shape=(vocab_size, ), activation='softmax'))
        self.clf_tok.add(
            Dense(64, input_shape=(vocab_size, ), activation='relu'))
        self.clf_tok.add(Dense(3, activation='softmax'))
        self.clf_tok.compile(loss='categorical_crossentropy',
                             optimizer='adam',
                             metrics=['accuracy'])

        self.clf_tok.fit(sentiment_tokenized,
                         y_train,
                         epochs=10,
                         batch_size=32)
Example #9
def process_file(filepath):
    # tesseract read text
    tiff_image_path = next(preprocessing.process(filepath))
    tessract_results = ocr.process(tiff_image_path)
    tess_text = compile_text(tessract_results)
    yield tess_text
    # ground truth
    yield from clean_ground_truth.process(filepath)
Example #10
    def train(self, trainfile):
        """Trains the classifier model on the training set stored in file trainfile"""
        lines = self.retrieveData(trainfile)
        tokens, data = proc.process(lines)

        parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
        svc = svm.SVC(gamma="scale")
        self.clf = GridSearchCV(svc, parameters, cv=5)
        self.clf.fit(data.iloc[:, 1:data.shape[1]], data.iloc[:, 0])
Example #11
def process(X, size):
    print('processing....')
    t = time.time()
    a, b = size
    X_new = []
    for i in range(len(X)):
        X_new.append(preprocessing.process(X[i], [a, b]))
        #preprocessing.make_image(preprocessing.process(X[i],[12,12]),str(i))
    print('processing done in', time.time() - t, 'seconds')
    return np.array(X_new)
Example #12
def get_Xy(path):
    X, y, file = [], [], []
    d = np.load('/'.join(sys.argv[0].split('/')[:-1]) +
                '/acid_data/pka.npy', allow_pickle=True).item()  # allow_pickle needed to .item() a pickled dict on recent numpy
    for i in os.listdir(path):
        if i[-8:] == '.g16.out':
            X.append(preprocessing.process(coord(i), [100, 100]))
            #preprocessing.make_image(preprocessing.process(coord(i),[100,100]),i.split('.')[0])
            y.append([float(d[i.split('.')[0]])])
            file.append(i.strip().split()[0])
    return np.array(X), np.array(y), file
Example #13
def get_data(input, flag):
    data, labels = [], []
    if flag:
        train_data = preprocessing.process(test_file)
        data, labels = train_data['text'], train_data['labels']
    else:
        with open(train_file) as f:
            gen = chunks.read_chunk(f, "\n")
            # will adjust the 40k limit once a better value comes to mind
            for i in range(40000):
                s = next(gen).split('\t')
                data.append(s[-1])
                labels.append(s[-2])
    return data, labels
Example #14
def nested_cv(x, y, model, p_grid):
    # the nested cv below can be condensed into a Pipeline, roughly:
    """
    pipeline = Pipeline([('transformer', scaler), ('estimator', clf)])
    cv = KFold(n_splits=4)
    scores = cross_val_score(pipeline, X, y, cv=cv)
    """
    # (a self-contained version of this sketch follows the function)
    nested_train_scores = list()
    nested_test_scores = list()
    nested_params = list()

    # nested cv
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in tqdm(outer_cv.split(x)):
        # split data
        xTrain = x.iloc[train_index]
        xTest = x.iloc[test_index]
        yTrain = y.iloc[train_index]
        yTest = y.iloc[test_index]
        inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)
        xTrain, yTrain, xTest, yTest = preprocessing.process(
            xTrain, yTrain, xTest, yTest)

        # PCA (number of components chosen such that the amount of variance
        # that needs to be explained is greater than the percentage specified by n_components)
        sklearn_PCA = PCA(n_components=0.95, svd_solver='full')
        xTrain = sklearn_PCA.fit_transform(xTrain)
        xTest = sklearn_PCA.transform(xTest)
        # xTrain, yTrain, xTest, yTest = df_to_numpy(xTrain, yTrain, xTest, yTest)

        # Scoring metric is R^2 (matches the r2_score used below)
        clf = GridSearchCV(estimator=model,
                           param_grid=p_grid,
                           scoring='r2',
                           cv=inner_cv,
                           refit=True)
        fitter = clf.fit(xTrain, yTrain)
        best_model = fitter.best_estimator_
        nested_params.append(fitter.best_params_)

        # Train Score
        yHat1 = best_model.predict(xTrain)
        r2 = r2_score(yTrain, yHat1)
        nested_train_scores.append(r2)

        # Test Score
        yHat2 = best_model.predict(xTest)
        r2 = r2_score(yTest, yHat2)
        nested_test_scores.append(r2)
    return nested_train_scores, nested_test_scores, nested_params
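A minimal, self-contained sketch of the condensed form referenced in the docstring above, with StandardScaler standing in for the undefined scaler and Ridge as a placeholder estimator (both assumptions, not from the original project); x and y are the same frames passed to nested_cv:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score

# Scaling, PCA and the estimator chained in one Pipeline and scored with plain
# 5-fold CV; unlike nested_cv there is no inner hyperparameter search here.
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('pca', PCA(n_components=0.95, svd_solver='full')),
                     ('estimator', Ridge())])
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(pipeline, x, y, scoring='r2', cv=outer_cv)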
Example #15
    def predict(self, datafile):
        """Predicts class labels for the input instances in file 'datafile'
        Returns the list of predicted labels
        """
        lines = self.retrieveData(datafile)
        data_eval, y_eval = proc.process(lines)

        x_eval = pd.DataFrame(
            self.tokenizer.texts_to_matrix(data_eval.words_in_window))

        dic = {0: 'negative', 1: 'neutral', 2: 'positive'}
        pred = [
            dic.get(n, n) for n in np.argmax(self.clf_tok.predict(x_eval), 1)
        ]

        return pred
Example #16
def main():

    args = parse_args()
    text = args.i.readlines()

    processed_lines = []
    for line in text:
        preprocessed_line = preprocessing.process(line.strip())

        if len(preprocessed_line) != 0:
            processed_line = map_replacement.replace_from_maps(
                preprocessed_line)
            processed_lines.append(processed_line)

    for line in processed_lines:
        args.o.write(line + '\n')
Example #17
    def query(self,
              model_path,
              n_samples_query,
              n_results,
              custom=False,
              weights=False):
        vertices, element_dict, info = read_model(model_path)
        shape = Shape(vertices, element_dict, info)
        shape = process(shape, n_vertices_target=self.n_vertices_target)
        feature_dict = extract_features(shape,
                                        self.n_bins,
                                        n_samples=n_samples_query)
        feature_df = data_dict_parser(feature_dict)
        feature_df, _ = sample_normalizer(
            feature_df,
            *self.sample_normalization_parameters,
            divide_distributions=self.divide_distributions)
        feature_df_numeric = feature_df.select_dtypes(np.number)
        #Make sure columns identical and ordered
        assert list(feature_df_numeric.columns) == list(
            self.df_numeric.columns), "Column mismatch!"
        query_vector = feature_df_numeric.iloc[0, :].values.astype(np.float32)

        if not custom:

            distances, indices = self.faiss_knn.query(query_vector, n_results)
        else:
            distances, indices = self.custom_knn.query(query_vector,
                                                       n_results,
                                                       weights=weights)

        distances = distances.flatten().tolist()  #Flatten batch dimension
        indices = indices.flatten().tolist()
        df_slice = self.df[self.df.index.isin(indices)]
        df_slice['distance'] = df_slice.index.map(
            lambda x: distances[indices.index(x)])

        #Add missing data to query df
        feature_df['file_name'] = str(model_path)
        feature_df['classification'] = 'query_input'
        feature_df['distance'] = 0
        # Put it at top of slice
        df_slice = pd.concat([df_slice, feature_df])
        df_slice = df_slice.sort_values('distance')

        return distances, indices, df_slice
    def process_subset(self, file_list, apply_processing, n_vertices_target,
                       n_bins, process_index):
        print(f' {process_index} : Starting subset processor!')
        data_subset = {k: [] for k in self.columns + self.col_array}
        for index, file in enumerate(file_list):

            if index % 50 == 0:
                print(f' {process_index} : Is at {index}/{len(file_list)}!')

            vertices, element_dict, info = read_model(Path(file))
            shape = Shape(vertices, element_dict, info)

            if apply_processing:

                shape = process(shape, n_vertices_target=n_vertices_target)

            else:
                shape.make_pyvista_mesh()

            id = os.path.basename(file).split(".")[0].replace("m", "")
            if id in self.classification_dict.keys():
                classification = self.classification_dict[id]

            else:

                classification = None

            data_subset["classification"].append(classification)
            data_subset["file_name"].append(file)

            #Get features
            feature_dict = extract_features(shape,
                                            n_bins=n_bins,
                                            n_samples=self.n_samples)

            #Add them to total data

            for key, val in feature_dict.items():
                data_subset[key].append(val)
        print(f'{process_index} : Finished!')
        return data_subset
import sys
sys.path.append("../")

from gensim import models

import pandas as pd
import numpy as np

import preprocessing as pp


filename = "../../../data/sample.csv"
data = pd.read_csv(filename, sep=',')


data['header_features'] = data.Headline.apply(lambda x : pp.process(x))
data['content_features'] = data.articleBody.apply(lambda x : pp.process(x))


model = models.Word2Vec.load_word2vec_format('/media/sree/venus/code/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

def sent2vec(words):
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:  # word not in the embedding vocabulary
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
Example #20
def preprocess_callback():
    """
    Starts preprocessing
    :return:
    """
    preprocessing.process()
Example #21
y_train = Training_dataframe.iloc[:, 1]

x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]]
y_test = Testing_dataframe.iloc[:, 1]

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# merge the 25 news columns into a single string per row
merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1)
merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1)

# ===============
# pre-process
# ===============
merged_x_train = merged_x_train.apply(lambda x: pp.process(x))
merged_x_test = merged_x_test.apply(lambda x: pp.process(x))

# remove stopwords in the training and testing set
train_without_sw = []
test_without_sw = []
train_temporary = list(merged_x_train)
test_temporary = list(merged_x_test)
s = pp.stop_words
for i in train_temporary:
    f = i.split(' ')
    for j in f[:]:  # iterate over a copy; removing from f directly would skip words
        if j in s:
            f.remove(j)
    s1 = ""
    for k in f:
Example #22
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np
import preprocessing
import pandas as pd
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from numpy import argmax
import re

# Model Template
x_train, y_train, x_val, y_val, x_test, y_test = preprocessing.process(
    "images.npy", "labels.npy")
# val = int(raw_input("Max Depth: "))
model = tree.DecisionTreeClassifier(max_depth=10)

#4 different types of feature extraction


#1 avg pixel values for each number
def getAvgPixelIntensity(x_set):
    pixelIntensity = 0
    pics = []
    #print(x_set.shape)
    for picture in x_set:
        sum = 0
        for pixel in picture:
            sum += pixel
Example #23
def data():
    start_train = '2008-08-08'
    end_train = '2014-12-31'
    start_val = '2015-01-02'
    end_val = '2016-07-01'
    max_sequence_length = 110
    vocab_size = 3000
    # read csv file
    DJIA = pd.read_csv("Combined_News_DJIA.csv",
                       usecols=[
                           'Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4',
                           'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10',
                           'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
                           'Top16', 'Top17', 'Top18', 'Top19', 'Top20',
                           'Top21', 'Top22', 'Top23', 'Top24', 'Top25'
                       ])

    # create training and testing dataframe on 80 % and 20 % respectively
    Training_dataframe = DJIA[(DJIA['Date'] >= start_train)
                              & (DJIA['Date'] <= end_train)]
    Testing_dataframe = DJIA[(DJIA['Date'] >= start_val)
                             & (DJIA['Date'] <= end_val)]

    attrib = DJIA.columns.values

    x_train = Training_dataframe.loc[:, attrib[2:len(attrib)]]
    y_train = Training_dataframe.iloc[:, 1]

    x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]]
    y_test = Testing_dataframe.iloc[:, 1]

    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # merge the 25 news columns into a single string per row
    merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1)
    merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1)

    # ===============
    # pre-process
    # ===============
    merged_x_train = merged_x_train.apply(lambda x: pp.process(x))
    merged_x_test = merged_x_test.apply(lambda x: pp.process(x))

    #merged_x_train = merged_x_train.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))
    #merged_x_test = merged_x_test.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))

    #merged_x_train = merged_x_train.apply(lambda x: pp.stemmer(x))
    #merged_x_test = merged_x_test.apply(lambda x: pp.stemmer(x))

    # remove stopwords in the training and testing set
    train_without_sw = []
    test_without_sw = []
    train_temporary = list(merged_x_train)
    test_temporary = list(merged_x_test)
    s = pp.stop_words
    for i in train_temporary:
        f = i.split(' ')
        for j in f[:]:  # iterate over a copy; removing from f directly would skip words
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        train_without_sw.append(s1)
    merged_x_train = train_without_sw

    for i in test_temporary:
        f = i.split(' ')
        for j in f[:]:  # iterate over a copy; removing from f directly would skip words
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        test_without_sw.append(s1)
    merged_x_test = test_without_sw

    # tokenize and create sequences
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(merged_x_train)
    x_train_sequence = tokenizer.texts_to_sequences(merged_x_train)
    x_test_sequence = tokenizer.texts_to_sequences(merged_x_test)

    word_index = tokenizer.word_index
    input_dim = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    x_train_sequence = pad_sequences(x_train_sequence,
                                     maxlen=max_sequence_length)
    x_test_sequence = pad_sequences(x_test_sequence,
                                    maxlen=max_sequence_length)

    print('Shape of training tensor:', x_train_sequence.shape)
    print(x_train_sequence)
    print('Shape of testing tensor:', x_test_sequence.shape)
    print(x_test_sequence)
    """
    Data providing function:

    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.
    """
    return x_train_sequence, y_train, x_test_sequence, y_test
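The docstring above is the data()/create_model() split used by hyperopt's Keras wrapper hyperas; the wrapper itself is an assumption here (it is not shown in the source), and create_model below is a hypothetical minimal companion, not the project's actual model:

from hyperas import optim
from hyperas.distributions import choice
from hyperopt import Trials, STATUS_OK, tpe
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten

def create_model(x_train_sequence, y_train, x_test_sequence, y_test):
    # Hypothetical companion model; hyperas substitutes {{choice(...)}} per trial.
    model = Sequential()
    model.add(Embedding(3000, {{choice([32, 64])}}, input_length=110))
    model.add(Flatten())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train_sequence, y_train, epochs=1, batch_size=32, verbose=0)
    loss, acc = model.evaluate(x_test_sequence, y_test, verbose=0)
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

best_run, best_model = optim.minimize(model=create_model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=5,
                                      trials=Trials())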
Example #24
File: main.py Project: salceson/PJN
                precision, recall, f1 = calculate_quality(_OUTPUT_PATTERN % metric, _REFERENCE_FILENAME)
                print("Metric %s:" % metric)
                print("\tPrecision: %f, recall: %f, f1: %f" % (precision, recall, f1))
            except:
                pass
        time2 = time()
        print("Run for %f s." % (time2 - time1))
    else:
        metric_txt = action
        metric = dice_metric if action == 'dice' else cosine_metric if action == 'cosine' else lcs_metric
        print("Preprocessing data...")
        print("Input: %s" % _INPUT_FILENAME)
        counter = 0
        preprocessed = {}
        result = {}
        with open(_INPUT_FILENAME) as input:
            for line in input:
                preprocessed_line = process(line)
                preprocessed[line] = preprocessed_line
                if _DEBUG and counter % 50 == 0:
                    print("%s => %s" % (line, preprocessed_line))
                counter += 1
        print("Clustering...")
        clusters = cluster(preprocessed, metric, _THRESHOLDS[metric_txt], _DEBUG)
        for line, preprocessed_line in preprocessed.items():
            result[line] = clusters[preprocessed_line]
        print("Writing result...")
        write_result(result, _OUTPUT_PATTERN % metric_txt)
        time2 = time()
        print("Run for %f s." % (time2 - time1))
Example #25
import matplotlib
matplotlib.use('Agg')
import mpld3
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import pylab as pl
import preprocessing
import numpy as np
import plots.pie as pie

print("Starting")

imgs = []

data = preprocessing.process()

dispXSpending = data.groupby(
    [data['SO_DISPOSITIVO']])['VALOR_PRODUTOS']

info = dispXSpending.sum()

imgs.append(pie.plot(info))

########################################
imgs.append(plt.figure())
single = [[0 for _ in range(7)] for _ in range(24)]
for index, row in data.iterrows():
    single[row['HORA_PEDIDO']][row['DIA_PEDIDO']] += 1

df = DataFrame(single, index=range(0, 24, 1), columns=range(0, 7, 1))
Example #26
#data
from data_downloader import downloader
from preprocessing import process

#Model
from sklearn_crfsuite import CRF

#Evaluation
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_accuracy_score

data = downloader()

X_train, Y_train = process(data)

crf4 = CRF(algorithm='lbfgs',
           max_iterations=20,
           c1=0.1,
           c2=0.2,
           all_possible_transitions=False)

#training model
crf4.fit(X=X_train, y=Y_train)

#generate predictions
pred = crf4.predict(X_train)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=Y_train)
print(report)
Example #27
	summary_hist_b = tf.summary.histogram('W', weights_of_model[1])
	summary_loss = tf.summary.scalar('Loss', loss)
	#summary_train_acc = tf.summary.scalar('Training Accuracy', train_accuracy)
	summary_op = tf.summary.merge_all() 



	init = tf.global_variables_initializer()


with tf.Session(graph=linear) as sess:

	sess.run(init)
	train_writer = tf.summary.FileWriter('summary_directory', sess.graph)

	training_data, training_labels, validation_data, validation_labels, testing_data, testing_labels = process()

	
	
	step = 0
	for epoch in range(1, 2):

		for i in range(0, len(training_data) - mini_batch, mini_batch):
			x_feed = []
			y_feed = []
			for j in range(i, i + mini_batch):
				x_feed.append(training_data[j])
				y_feed.append(training_labels[j])

			loss_, _, summary_full = sess.run([loss, train_opt, summary_op], feed_dict={train_data: x_feed, label_data: y_feed, batch_size: 100.0/float(len(x_feed))}) #, keep_prob: 0.5})
			step += 1
Example #28
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
import preprocessing as pp  # pp.process is used below; alias matches the other examples

bodies = "../../data/train_bodies.csv"
stances = "../../data/train_stances.csv"

content = pd.read_csv(bodies, sep=",")
headlines = pd.read_csv(stances, sep=",")

## generate necessary token features for dnews heading and news body
content['content_tokens'] = content.articleBody.apply(lambda x: pp.process(x))
headlines['headline_tokens'] = headlines.Headline.apply(lambda x: pp.process(x))

# ## Begin sentence embedding
header_vectors = np.zeros((headlines.shape[0], 300))
for i, q in enumerate(headlines.headline_tokens.values):
    header_vectors[i, :] = encoding.tovector(q)

# ## create the content vector
content_vectors = np.zeros((content.shape[0], 300))
for i, q in enumerate(content.content_tokens.values):
    content_vectors[i, :] = encoding.tovector(q)

header_series = pd.Series(header_vectors.tolist())
headlines['headline_vector'] = header_series.values
Example #29
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output  # for callbacks

import preprocessing
from plots import Plot

df = preprocessing.process()
plot = Plot(df)

# Launch the application:
app = dash.Dash(__name__)

app.layout = html.Div(
    children=[
        # search and table
        html.Div(children=[
            dcc.Input(id="search_input",
                      placeholder='Enter a value...',
                      type='text',
                      value=''),
            html.Div(dcc.Graph(id="table")),
        ]),

        # row of 2 barcharts
        html.Div(children=[
            html.Div(children=[
                html.Div(dcc.Graph(id="overall_bc")),
                dcc.Slider(id="overall_slider",
                           marks={i: str(i)
Example #30
# -*- coding: utf8 -*-
import pandas as pd
import numpy as np
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model

import preprocessing


train_file, test_file = sys.argv[1], sys.argv[2]

data = preprocessing.process(train_file)

vectorizer = CountVectorizer(analyzer = "word", max_features = 2000)
train_data_features = vectorizer.fit_transform(data['text']).toarray()

def show_word_frequencies(out_file, print_data):
    vectorizer = CountVectorizer(analyzer = "word", max_features = 2000)
    data_features = vectorizer.fit_transform(print_data['text']).toarray()
    words = vectorizer.get_feature_names()
    frequencies = np.sum(data_features, axis=0)
    with open(out_file, "w+") as f:
        for fr, word in sorted(zip(frequencies, words), reverse=True):
            f.write(str(fr) + word + '\n')

data[data['label'] == '1'].to_csv('bad_vocab.txt', sep='\t', encoding='utf-8')
data[data['label'] == '0'].to_csv('good_vocab.txt', sep='\t', encoding='utf-8')
show_word_frequencies("bad_features.txt", data[data['label'] == '1'])
show_word_frequencies("good_features.txt", data[data['label'] == '0'])
Example #31
import numpy as np  # np.sum is used below
from sklearn import svm
from preprocessing import process

X, Y, v_x, v_y, t_x, t_y = process()

total = len(v_y)

clf = svm.SVC()
clf.fit(X, Y)
v_y_ = clf.predict(v_x)
p = np.sum(v_y_ == np.array(v_y))
accuracy = p / float(total)
print(accuracy)