def get_data(data_file, data_type="TRAIN"):
    file_path = data_file
    train_json = dict()
    xmlp = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(file_path, parser=xmlp)
    root = tree.getroot()
    pbar = tqdm(total=len(root.findall('pair')))
    print('\nLoading premise and hypothesis sentences from disk...')
    print('-' * 55)
    for id, pair in enumerate(root.findall('pair')):
        # pair_ID = pair.find('pair id').text
        text1 = pair.find('t1').text
        text2 = pair.find('t2').text
        preprocessed_premise = pre.process(text1)
        preprocessed_hyp = pre.process(text2)
        temp = dict()
        temp['text1'] = preprocessed_premise.lstrip()
        temp['text2'] = preprocessed_hyp.lstrip()
        if data_type == "TRAIN":
            label = pair.find('Label').text
            if label == 'Y':
                label = 1
            if label == 'N':
                label = 0
            temp['label'] = label
        train_json[id] = temp
        pbar.update(1)
    return train_json
def run_classification():
    if not path.exists(PROCESSED_CORPUS_PATH):
        preprocessing.process(SRC_DIR)
    if not path.exists(FEATURES_FILE):
        x, y, le = extract_books_features_from_corpus()
        save_book_features_to_file(x, y, le)
    else:
        x, y = load_features_from_file()
    hybrid_classification(x, y)
def main():
    for image_count in range(1, 6):
        process_dict = {}
        image = cv2.imread(f"input/{image_count}.jpg", cv2.IMREAD_GRAYSCALE)
        # cv2.imwrite('ocr.png', image)
        # if (image.shape[0] < image.shape[1]):
        #     image =
        process_dict["image"] = image
        process_dict["image_height"] = image.shape[0]
        process_dict["image_width"] = image.shape[1]
        # retval, thresh = cv2.threshold(image, 200, 255, cv2.THRESH_BINARY)
        print(image.shape)
        # cv2.imwrite("thresh_200.png", thresh)
        threshold = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                          cv2.THRESH_BINARY, 201, 15)
        # blur = cv2.GaussianBlur(image, (7, 7), 0)
        # retval, threshold = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        process_dict["binary"] = threshold
        process_dict["count"] = image_count
        process_dict = process(process_dict)
        print(f"Image preprocessing done for {image_count}.")
def __getitem__(self, case_num, size=(128, 128, 128)):
    if self.is_validation:
        case_num += 200
    try:
        img = np.load(self.path + '/' + "{}_i.npy".format(case_num))
        mask = np.load(self.path + '/' + "{}_m.npy".format(case_num))
        return img, mask
    except (IOError, FileNotFoundError):
        img = self.df.loc[self.df['case_id'] == case_num]['image']
        # print("THIS IS WHAT WE ON")
        img = nib.load(img.values[0])
        affine = img.affine
        img = img.get_fdata()
        img = process(img, size)
        img = np.expand_dims(img, axis=3)
        img = np.expand_dims(img, axis=0)
        mask = self.df.loc[self.df['case_id'] == case_num]['mask']
        mask = nib.load(mask.values[0])
        # get_data() was deprecated and later removed from nibabel; use get_fdata()
        mask = mask.get_fdata()
        # mask = resize_image(mask, size, is_mask=True)
        mask_off = mask > 1.5
        mask[mask_off] = 1  # This will turn all tumor pixels into kidney pixels
        mask = np.expand_dims(mask, axis=3)
        mask = np.expand_dims(mask, axis=0)
        np.save(self.path + '/' + "{}_i".format(case_num), img)
        np.save(self.path + '/' + "{}_m".format(case_num), mask)
        return img, mask
def process(X, size):
    a, b = size
    X_new = []
    for i in range(len(X)):
        X_new.append(preprocessing.process(X[i], [a, b]))
        # preprocessing.make_image(preprocessing.process(X[i], [12, 12]), str(i))
    return np.array(X_new)
def kfold_cv(x, y, model):
    nested_train_scores = list()
    nested_test_scores = list()
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in tqdm(outer_cv.split(x)):
        # split data
        xTrain = x.iloc[train_index]
        xTest = x.iloc[test_index]
        yTrain = y.iloc[train_index]
        yTest = y.iloc[test_index]
        xTrain, yTrain, xTest, yTest = preprocessing.process(
            xTrain, yTrain, xTest, yTest)
        # PCA (number of components chosen such that the amount of variance
        # explained is greater than the percentage specified by n_components)
        sklearn_PCA = PCA(n_components=0.95, svd_solver='full')
        xTrain = sklearn_PCA.fit_transform(xTrain)
        xTest = sklearn_PCA.transform(xTest)
        # xTrain, yTrain, xTest, yTest = df_to_numpy(xTrain, yTrain, xTest, yTest)
        md = model.fit(xTrain, yTrain)
        # Train score
        yHat1 = md.predict(xTrain)
        r2 = r2_score(yTrain, yHat1)
        nested_train_scores.append(r2)
        # Test score
        yHat2 = md.predict(xTest)
        r2 = r2_score(yTest, yHat2)
        nested_test_scores.append(r2)
    return nested_train_scores, nested_test_scores
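# Hypothetical usage sketch for kfold_cv (not part of the original source): it assumes
# x is a pandas DataFrame of features, y a pandas Series of targets, and that
# preprocessing.process accepts and returns the four splits as used above. Any
# scikit-learn regressor with fit/predict should work as `model`.
import numpy as np
from sklearn.linear_model import Ridge

train_r2, test_r2 = kfold_cv(x, y, Ridge(alpha=1.0))
print("mean train R^2:", np.mean(train_r2))
print("mean test R^2:", np.mean(test_r2))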
def predict(self, datafile):
    """Predicts class labels for the input instances in file 'datafile'

    Returns the list of predicted labels
    """
    lines = self.retrieveData(datafile)
    tokens, data = proc.process(lines)
    # return the predictions, as promised by the docstring
    return self.clf.predict(data.iloc[:, 1:data.shape[1]])
def train(self, trainfile):
    """Trains the classifier model on the training set stored in file trainfile"""
    lines = self.retrieveData(trainfile)
    data, y_train = proc.process(lines)
    vocab = []
    for sent in data['words_in_window']:
        for w in sent:
            if w not in vocab:
                vocab.append(w)
    vocab_size = len(vocab)
    self.tokenizer = Tokenizer(num_words=vocab_size)
    self.tokenizer.fit_on_texts(data.words_in_window)
    sentiment_tokenized = pd.DataFrame(
        self.tokenizer.texts_to_matrix(data.words_in_window))
    self.clf_tok = Sequential()
    self.clf_tok.add(Dense(128, input_shape=(vocab_size,), activation='softmax'))
    # input_shape on a non-first layer is ignored by Keras, so it is dropped here
    self.clf_tok.add(Dense(64, activation='relu'))
    self.clf_tok.add(Dense(3, activation='softmax'))
    self.clf_tok.compile(loss='categorical_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])
    self.clf_tok.fit(sentiment_tokenized, y_train, epochs=10, batch_size=32)
def process_file(filepath):
    # tesseract read text
    tiff_image_path = next(preprocessing.process(filepath))
    tesseract_results = ocr.process(tiff_image_path)
    tess_text = compile_text(tesseract_results)
    yield tess_text
    # ground truth
    yield from clean_ground_truth.process(filepath)
def train(self, trainfile):
    """Trains the classifier model on the training set stored in file trainfile"""
    lines = self.retrieveData(trainfile)
    tokens, data = proc.process(lines)
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svc = svm.SVC(gamma="scale")
    self.clf = GridSearchCV(svc, parameters, cv=5)
    self.clf.fit(data.iloc[:, 1:data.shape[1]], data.iloc[:, 0])
def process(X, size):
    print('processing....')
    t = time.time()
    a, b = size
    X_new = []
    for i in range(len(X)):
        X_new.append(preprocessing.process(X[i], [a, b]))
        # preprocessing.make_image(preprocessing.process(X[i], [12, 12]), str(i))
    print('processing done in', time.time() - t, 'seconds')
    return np.array(X_new)
def get_Xy(path):
    X, y, file = [], [], []
    # allow_pickle=True is required on newer NumPy versions to load a pickled dict
    d = np.load('/'.join(sys.argv[0].split('/')[:-1]) + '/acid_data/pka.npy',
                allow_pickle=True).item()
    for i in os.listdir(path):
        if i[-8:] == '.g16.out':
            X.append(preprocessing.process(coord(i), [100, 100]))
            # preprocessing.make_image(preprocessing.process(coord(i), [100, 100]), i.split('.')[0])
            y.append([float(d[i.split('.')[0]])])
            file.append(i.strip().split()[0])
    return np.array(X), np.array(y), file
def get_data(input, flag):
    data, labels = [], []
    if flag:
        train_data = preprocessing.process(test_file)
        data, labels = train_data['text'], train_data['labels']
    else:
        with open(train_file) as f:
            gen = chunks.read_chunk(f, "\n")
            # will adjust the 40k limit once a better value is chosen
            for i in range(40000):
                s = next(gen).split('\t')
                data.append(s[-1])
                labels.append(s[-2])
    return data, labels
def nested_cv(x, y, model, p_grid):
    # the nested CV below could be condensed with the following code:
    """
    pipeline = Pipeline([('transformer', scalar), ('estimator', clf)])
    cv = KFold(n_splits=4)
    scores = cross_val_score(pipeline, X, y, cv=cv)
    """
    nested_train_scores = list()
    nested_test_scores = list()
    nested_params = list()
    # nested cv
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in tqdm(outer_cv.split(x)):
        # split data
        xTrain = x.iloc[train_index]
        xTest = x.iloc[test_index]
        yTrain = y.iloc[train_index]
        yTest = y.iloc[test_index]
        inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)
        xTrain, yTrain, xTest, yTest = preprocessing.process(
            xTrain, yTrain, xTest, yTest)
        # PCA (number of components chosen such that the amount of variance
        # explained is greater than the percentage specified by n_components)
        sklearn_PCA = PCA(n_components=0.95, svd_solver='full')
        xTrain = sklearn_PCA.fit_transform(xTrain)
        xTest = sklearn_PCA.transform(xTest)
        # xTrain, yTrain, xTest, yTest = df_to_numpy(xTrain, yTrain, xTest, yTest)
        # Inner grid search, scored with R^2
        clf = GridSearchCV(estimator=model,
                           param_grid=p_grid,
                           scoring='r2',
                           cv=inner_cv,
                           refit=True)
        fitter = clf.fit(xTrain, yTrain)
        best_model = fitter.best_estimator_
        nested_params.append(fitter.best_params_)
        # Train score
        yHat1 = best_model.predict(xTrain)
        r2 = r2_score(yTrain, yHat1)
        nested_train_scores.append(r2)
        # Test score
        yHat2 = best_model.predict(xTest)
        r2 = r2_score(yTest, yHat2)
        nested_test_scores.append(r2)
    return nested_train_scores, nested_test_scores, nested_params
def predict(self, datafile):
    """Predicts class labels for the input instances in file 'datafile'

    Returns the list of predicted labels
    """
    lines = self.retrieveData(datafile)
    data_eval, y_eval = proc.process(lines)
    x_eval = pd.DataFrame(
        self.tokenizer.texts_to_matrix(data_eval.words_in_window))
    dic = {0: 'negative', 1: 'neutral', 2: 'positive'}
    pred = [dic.get(n, n) for n in np.argmax(self.clf_tok.predict(x_eval), 1)]
    return pred
def main():
    args = parse_args()
    text = args.i.readlines()
    processed_lines = []
    for line in text:
        preprocessed_line = preprocessing.process(line.strip())
        if len(preprocessed_line) != 0:
            processed_line = map_replacement.replace_from_maps(preprocessed_line)
            processed_lines.append(processed_line)
    for line in processed_lines:
        args.o.write(line + '\n')
def query(self, model_path, n_samples_query, n_results, custom=False, weights=False):
    vertices, element_dict, info = read_model(model_path)
    shape = Shape(vertices, element_dict, info)
    shape = process(shape, n_vertices_target=self.n_vertices_target)
    feature_dict = extract_features(shape, self.n_bins, n_samples=n_samples_query)
    feature_df = data_dict_parser(feature_dict)
    feature_df, _ = sample_normalizer(
        feature_df, *self.sample_normalization_parameters,
        divide_distributions=self.divide_distributions)
    feature_df_numeric = feature_df.select_dtypes(np.number)
    # Make sure columns are identical and ordered
    assert list(feature_df_numeric.columns) == list(
        self.df_numeric.columns), "Column mismatch!"
    query_vector = feature_df_numeric.iloc[0, :].values.astype(np.float32)
    if not custom:
        distances, indices = self.faiss_knn.query(query_vector, n_results)
    else:
        distances, indices = self.custom_knn.query(query_vector, n_results, weights=weights)
    distances = distances.flatten().tolist()  # Flatten batch dimension
    indices = indices.flatten().tolist()
    df_slice = self.df[self.df.index.isin(indices)]
    df_slice['distance'] = df_slice.index.map(lambda x: distances[indices.index(x)])
    # Add missing data to query df
    feature_df['file_name'] = str(model_path)
    feature_df['classification'] = 'query_input'
    feature_df['distance'] = 0  # Put it at the top of the slice
    df_slice = pd.concat([df_slice, feature_df])
    df_slice = df_slice.sort_values('distance')
    return distances, indices, df_slice
def process_subset(self, file_list, apply_processing, n_vertices_target, n_bins, process_index):
    print(f' {process_index} : Starting subset processor!')
    data_subset = {k: [] for k in self.columns + self.col_array}
    for index, file in enumerate(file_list):
        if index % 50 == 0:
            print(f' {process_index} : Is at {index}/{len(file_list)}!')
        vertices, element_dict, info = read_model(Path(file))
        shape = Shape(vertices, element_dict, info)
        if apply_processing:
            shape = process(shape, n_vertices_target=n_vertices_target)
        else:
            shape.make_pyvista_mesh()
        id = os.path.basename(file).split(".")[0].replace("m", "")
        if id in self.classification_dict.keys():
            classification = self.classification_dict[id]
        else:
            classification = None
        data_subset["classification"].append(classification)
        data_subset["file_name"].append(file)
        # Get features
        feature_dict = extract_features(shape, n_bins=n_bins, n_samples=self.n_samples)
        # Add them to the total data
        for key, val in feature_dict.items():
            data_subset[key].append(val)
    print(f'{process_index} : Finished!')
    return data_subset
import sys
sys.path.append("../")

from gensim import models
import pandas as pd
import numpy as np

import preprocessing as pp

filename = "../../../data/sample.csv"
data = pd.read_csv(filename, sep=',')
data['header_features'] = data.Headline.apply(lambda x: pp.process(x))
data['content_features'] = data.articleBody.apply(lambda x: pp.process(x))

# Word2Vec.load_word2vec_format was removed in gensim 1.0; KeyedVectors is the current loader
model = models.KeyedVectors.load_word2vec_format(
    '/media/sree/venus/code/word2vec/GoogleNews-vectors-negative300.bin', binary=True)


def sent2vec(words):
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
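# Hypothetical usage sketch (not part of the original source): it assumes pp.process
# returns an iterable of tokens per document, so each row can be collapsed into a
# single 300-dimensional vector with sent2vec defined above.
data['header_vector'] = data['header_features'].apply(sent2vec)
data['content_vector'] = data['content_features'].apply(sent2vec)
print(data[['header_vector', 'content_vector']].head())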
def preprocess_callback():
    """
    Starts preprocessing
    :return:
    """
    preprocessing.process()
y_train = Training_dataframe.iloc[:, 1]
x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]]
y_test = Testing_dataframe.iloc[:, 1]

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# merge the 25 news items together to form a single signal
merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1)
merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1)

# ===============
# pre-process
# ===============
merged_x_train = merged_x_train.apply(lambda x: pp.process(x))
merged_x_test = merged_x_test.apply(lambda x: pp.process(x))

# remove stopwords in the training and testing set
train_without_sw = []
test_without_sw = []
train_temporary = list(merged_x_train)
test_temporary = list(merged_x_test)
s = pp.stop_words
for i in train_temporary:
    f = i.split(' ')
    for j in f:
        if j in s:
            f.remove(j)
    s1 = ""
    for k in f:
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np
import preprocessing
import pandas as pd
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from numpy import argmax
import re

# Model Template
x_train, y_train, x_val, y_val, x_test, y_test = preprocessing.process(
    "images.npy", "labels.npy")

# val = int(raw_input("Max Depth: "))
model = tree.DecisionTreeClassifier(max_depth=10)


# 4 different types of feature extraction
# 1. avg pixel values for each number
def getAvgPixelIntensity(x_set):
    pixelIntensity = 0
    pics = []
    # print(x_set.shape)
    for picture in x_set:
        sum = 0
        for pixel in picture:
            sum += pixel
def data():
    start_train = '2008-08-08'
    end_train = '2014-12-31'
    start_val = '2015-01-02'
    end_val = '2016-07-01'
    max_sequence_length = 110
    vocab_size = 3000

    # read csv file
    DJIA = pd.read_csv("Combined_News_DJIA.csv",
                       usecols=['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5',
                                'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12',
                                'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18',
                                'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25'])

    # create training and testing dataframes of 80% and 20% respectively
    Training_dataframe = DJIA[(DJIA['Date'] >= start_train) & (DJIA['Date'] <= end_train)]
    Testing_dataframe = DJIA[(DJIA['Date'] >= start_val) & (DJIA['Date'] <= end_val)]

    attrib = DJIA.columns.values
    x_train = Training_dataframe.loc[:, attrib[2:len(attrib)]]
    y_train = Training_dataframe.iloc[:, 1]
    x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]]
    y_test = Testing_dataframe.iloc[:, 1]

    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # merge the 25 news items together to form a single signal
    merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1)
    merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1)

    # ===============
    # pre-process
    # ===============
    merged_x_train = merged_x_train.apply(lambda x: pp.process(x))
    merged_x_test = merged_x_test.apply(lambda x: pp.process(x))
    # merged_x_train = merged_x_train.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))
    # merged_x_test = merged_x_test.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))
    # merged_x_train = merged_x_train.apply(lambda x: pp.stemmer(x))
    # merged_x_test = merged_x_test.apply(lambda x: pp.stemmer(x))

    # remove stopwords in the training and testing set
    train_without_sw = []
    test_without_sw = []
    train_temporary = list(merged_x_train)
    test_temporary = list(merged_x_test)
    s = pp.stop_words
    for i in train_temporary:
        f = i.split(' ')
        for j in f:
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        train_without_sw.append(s1)
    merged_x_train = train_without_sw
    for i in test_temporary:
        f = i.split(' ')
        for j in f:
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        test_without_sw.append(s1)
    merged_x_test = test_without_sw

    # tokenize and create sequences
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(merged_x_train)
    x_train_sequence = tokenizer.texts_to_sequences(merged_x_train)
    x_test_sequence = tokenizer.texts_to_sequences(merged_x_test)
    word_index = tokenizer.word_index
    input_dim = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    x_train_sequence = pad_sequences(x_train_sequence, maxlen=max_sequence_length)
    x_test_sequence = pad_sequences(x_test_sequence, maxlen=max_sequence_length)
    print('Shape of training tensor:', x_train_sequence.shape)
    print(x_train_sequence)
    print('Shape of testing tensor:', x_test_sequence.shape)
    print(x_test_sequence)

    """
    Data providing function:
    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.
    """
    return x_train_sequence, y_train, x_test_sequence, y_test
        precision, recall, f1 = calculate_quality(_OUTPUT_PATTERN % metric,
                                                  _REFERENCE_FILENAME)
        print("Metric %s:" % metric)
        print("\tPrecision: %f, recall: %f, f1: %f" % (precision, recall, f1))
    except:
        pass
    time2 = time()
    print("Run for %f s." % (time2 - time1))
else:
    metric_txt = action
    metric = dice_metric if action == 'dice' else cosine_metric if action == 'cosine' else lcs_metric
    print("Preprocessing data...")
    print("Input: %s" % _INPUT_FILENAME)
    counter = 0
    preprocessed = {}
    result = {}
    with open(_INPUT_FILENAME) as input:
        for line in input:
            preprocessed_line = process(line)
            preprocessed[line] = preprocessed_line
            if _DEBUG and counter % 50 == 0:
                print("%s => %s" % (line, preprocessed_line))
            counter += 1
    print("Clustering...")
    clusters = cluster(preprocessed, metric, _THRESHOLDS[metric_txt], _DEBUG)
    for line, preprocessed_line in preprocessed.items():
        result[line] = clusters[preprocessed_line]
    print("Writing result...")
    write_result(result, _OUTPUT_PATTERN % metric_txt)
    time2 = time()
    print("Run for %f s." % (time2 - time1))
import matplotlib
matplotlib.use('Agg')

import mpld3
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import pylab as pl
import preprocessing
import numpy as np
import plots.pie as pie

print("Starting")
imgs = []
data = preprocessing.process()

dispXSpending = data.groupby([data['SO_DISPOSITIVO']])['VALOR_PRODUTOS']
info = dispXSpending.sum()
imgs.append(pie.plot(info))

########################################

imgs.append(plt.figure())
single = [[0 for _ in range(7)] for _ in range(24)]
for index, row in data.iterrows():
    single[row['HORA_PEDIDO']][row['DIA_PEDIDO']] += 1
df = DataFrame(single, index=range(0, 24, 1), columns=range(0, 7, 1))
# data
from data_downloader import downloader
from preprocessing import process

# model
from sklearn_crfsuite import CRF

# evaluation
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_accuracy_score

data = downloader()
X_train, Y_train = process(data)

crf4 = CRF(algorithm='lbfgs',
           max_iterations=20,
           c1=0.1,
           c2=0.2,
           all_possible_transitions=False)

# training model
crf4.fit(X=X_train, y=Y_train)

# generate predictions
pred = crf4.predict(X_train)

# generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=Y_train)
print(report)
summary_hist_b = tf.summary.histogram('W', weights_of_model[1])
summary_loss = tf.summary.scalar('Loss', loss)
# summary_train_acc = tf.summary.scalar('Training Accuracy', train_accuracy)
summary_op = tf.summary.merge_all()

init = tf.global_variables_initializer()

with tf.Session(graph=linear) as sess:
    sess.run(init)
    train_writer = tf.summary.FileWriter('summary_directory', sess.graph)
    training_data, training_labels, validation_data, validation_labels, testing_data, testing_labels = process()
    step = 0
    for epoch in range(1, 2):
        for i in range(0, len(training_data) - mini_batch, mini_batch):
            x_feed = []
            y_feed = []
            for j in range(i, i + mini_batch):
                x_feed.append(training_data[j])
                y_feed.append(training_labels[j])
            loss_, _, summary_full = sess.run(
                [loss, train_opt, summary_op],
                feed_dict={train_data: x_feed,
                           label_data: y_feed,
                           batch_size: 100.0 / float(len(x_feed))})
            # , keep_prob: 0.5})
            step += 1
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale

# local project modules (assumed): tokenizer and sentence-embedding helpers
import preprocessing as pp
import encoding

bodies = "../../data/train_bodies.csv"
stances = "../../data/train_stances.csv"
content = pd.read_csv(bodies, sep=",")
headlines = pd.read_csv(stances, sep=",")

# generate the necessary token features for the news heading and news body
content['content_tokens'] = content.articleBody.apply(lambda x: pp.process(x))
headlines['headline_tokens'] = headlines.Headline.apply(lambda x: pp.process(x))

# begin sentence embedding
header_vectors = np.zeros((headlines.shape[0], 300))
for i, q in enumerate(headlines.headline_tokens.values):
    header_vectors[i, :] = encoding.tovector(q)

# create the content vectors
content_vectors = np.zeros((content.shape[0], 300))
for i, q in enumerate(content.content_tokens.values):
    content_vectors[i, :] = encoding.tovector(q)

header_series = pd.Series(header_vectors.tolist())
headlines['headline_vector'] = header_series.values
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output  # for callbacks

import preprocessing
from plots import Plot

df = preprocessing.process()
plot = Plot(df)

# Launch the application:
app = dash.Dash(__name__)

app.layout = html.Div(
    children=[
        # search and table
        html.Div(children=[
            dcc.Input(id="search_input",
                      placeholder='Enter a value...',
                      type='text',
                      value=''),
            html.Div(dcc.Graph(id="table")),
        ]),
        # row of 2 barcharts
        html.Div(children=[
            html.Div(children=[
                html.Div(dcc.Graph(id="overall_bc")),
                dcc.Slider(id="overall_slider",
                           marks={i: str(i)
# -*- coding: utf8 -*-
import pandas as pd
import numpy as np
import sys

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model

import preprocessing

train_file, test_file = sys.argv[1], sys.argv[2]
data = preprocessing.process(train_file)

vectorizer = CountVectorizer(analyzer="word", max_features=2000)
train_data_features = vectorizer.fit_transform(data['text']).toarray()


def show_word_frequencies(out_file, print_data):
    vectorizer = CountVectorizer(analyzer="word", max_features=2000)
    data_features = vectorizer.fit_transform(print_data['text']).toarray()
    words = vectorizer.get_feature_names()
    frequencies = np.sum(data_features, axis=0)
    with open(out_file, "w+") as f:
        for fr, word in sorted(zip(frequencies, words), reverse=True):
            f.write(str(fr) + word + '\n')


data[data['label'] == '1'].to_csv('bad_vocab.txt', sep='\t', encoding='utf-8')
data[data['label'] == '0'].to_csv('good_vocab.txt', sep='\t', encoding='utf-8')
show_word_frequencies("bad_features.txt", data[data['label'] == '1'])
show_word_frequencies("good_features.txt", data[data['label'] == '0'])
import numpy as np
from sklearn import svm

from preprocessing import process

X, Y, v_x, v_y, t_x, t_y = process()
total = len(v_y)

clf = svm.SVC()
clf.fit(X, Y)

v_y_ = clf.predict(v_x)
p = np.sum(v_y_ == np.array(v_y))
accuracy = p / float(total)
print(accuracy)