def main(fileName):
    trainFileName = './Sentiment_training_data/sentiment_training_data.csv'
    preprocessedTrainFileName = Sentimental_Data_Preprocessing.main(trainFileName)
    outputFileName = Sentiment_Analysis.main(preprocessedTrainFileName)
    episodeFileName = './Prediction_data/simpsons_episodes.csv'
    Feature_Extraction.main(outputFileName, episodeFileName)
def train_svm(models, features, kernel):
    for model in models:
        for feature in features:
            if kernel == 'rbf':
                filename = model + '_' + feature
            else:
                filename = model + '_' + feature + '_' + kernel
            svm_path = 'svms/' + filename + '.pkl'
            # train the SVM and save it to disk
            Fe.train_and_save_svm(svm_path, model, feature, kernel, True)
def model(models, mapp):
    groundtruth_path = '../groundtruth/'
    # read the ground truth from the csv/txt files
    conceptsList = ccFileIO.readConceptTxt(groundtruth_path + 'concepts.txt')
    conceptsList_all = ccFileIO.readConceptTxt(groundtruth_path + 'concepts_all.txt')
    videofiles = ccFileIO.read_videofiles(groundtruth_path + 'needed_videos.txt')
    needed_shots = ccFileIO.read_selected_shots_from_file(
        groundtruth_path + 'shots.csv', conceptsList_all)
    shot_paths = ccFileIO.read_shot_paths(groundtruth_path + 'shot_paths.txt')
    all_infos = (conceptsList, conceptsList_all, videofiles, needed_shots, shot_paths)

    # Mapping between model concept ids and ground-truth concept ids:
    # 1267 forest      <-> 75 tree
    # 1015 boat / ship <-> 13 boat
    # 1261 flags       <-> 24 flag
    # 1031 computers   <-> 21 computer
    # 1010 beach       <-> 64 beach
    # 1006 animal      <->  0 animal
    for model in models:
        # convert the model mean from .binaryproto to .npy (only needs to be done once per model)
        # Fe.convert_binaryproto_to_npy(model)
        acc_values_for_all_concepts = Fe.load_and_use_model(model, all_infos, mapp)
        FileIO.write_accuracies(acc_values_for_all_concepts, model)
def use_svm(models, features, mapp, kernel):
    groundtruth_path = '../groundtruth/'
    conceptsList = ccFileIO.readConceptTxt(groundtruth_path + 'concepts.txt')
    conceptsList_all = ccFileIO.readConceptTxt(groundtruth_path + 'concepts_all.txt')
    videofiles = ccFileIO.read_videofiles(groundtruth_path + 'needed_videos.txt')
    needed_shots = ccFileIO.read_selected_shots_from_file(
        groundtruth_path + 'shots.csv', conceptsList_all)
    shot_paths = ccFileIO.read_shot_paths(groundtruth_path + 'shot_paths.txt')
    all_infos = (conceptsList, conceptsList_all, videofiles, needed_shots, shot_paths)

    for model in models:
        for feature in features:
            acc_values_for_all_concepts = Fe.load_and_use_svm(
                model, feature, all_infos, mapp, kernel, True)
            if kernel == 'rbf':
                filename = model + '_' + feature
            else:
                filename = model + '_' + feature + '_' + kernel
            FileIO.write_accuracies(acc_values_for_all_concepts, filename)
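# Hypothetical usage sketch for train_svm / use_svm (not part of the original
# project): the model names, feature-layer names and the orientation of the
# concept-id mapping below are illustrative placeholders only.
if __name__ == '__main__':
    models = ['alexnet', 'googlenet']        # assumed CNN model identifiers
    features = ['fc7', 'pool5']              # assumed feature-layer names
    mapp = {1267: 75, 1015: 13, 1261: 24,    # assumed dict form of the
            1031: 21, 1010: 64, 1006: 0}     # model-id <-> ground-truth-id mapping
    kernel = 'rbf'

    train_svm(models, features, kernel)      # train and persist one SVM per combination
    use_svm(models, features, mapp, kernel)  # evaluate the saved SVMs and write accuracies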
def extract_features(image):
    # Connected-component and enclosed-region features, currently unused:
    # avg_dist, avgwidth, avgheight, standard_dev, medwidth = fe.ConnectedComponent(image)
    # area = fe.EnclosedRegion(image)
    slope1, slope2, slope3 = fe.Fractal_Features(image)
    take, f1, f2, f3, f4, f5, f6 = seg.get_mid_height(image)
    f7 = seg.get_transitions(image)
    f8 = f2 / f7
    # return avg_dist, avgwidth, avgheight, medwidth, area
    return f1, f2, f4, f5, f6, f8, slope1, slope2
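# Hypothetical usage sketch: load a binarised character/word image with OpenCV
# and compute its feature vector. The file name is a placeholder.
if __name__ == '__main__':
    import cv2
    image = cv2.imread('sample_word.png', cv2.IMREAD_GRAYSCALE)
    features = extract_features(image)
    print(features)  # (f1, f2, f4, f5, f6, f8, slope1, slope2)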
def ExtractDataFeatures(train_dir=None, test_dir=None):
    final_train_df = None
    final_test_df = None
    if train_dir:
        train_data = CreateDataset(path, train_dir)
        train_data = ShuffleData(train_data)
        train_df = CleanData(train_data)
        final_train_df = Feature_Extraction.Feature_Extraction(train_df)
        # train_df.to_csv(os.path.join(path, 'CombinedData.csv'), index=False, encoding='utf-8')
    if test_dir:
        test_data = CreateDataset(path, test_dir, test=True)
        test_data = ShuffleData(test_data)
        test_df = CleanData(test_data, test=True)
        final_test_df = Feature_Extraction.Feature_Extraction(test_df, test=True)
        # df = pd.read_csv(os.path.join(path, 'CombinedData.csv'), encoding='utf-8')
        # final_df.to_csv(os.path.join(path, 'Feature_Extracted_Data.csv'), index=False, encoding='utf-8')
    return final_train_df, final_test_df
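# Hypothetical usage sketch: the directory names are placeholders relative to
# the module-level `path` assumed by CreateDataset.
if __name__ == '__main__':
    train_features, test_features = ExtractDataFeatures(train_dir='train', test_dir='test')
    if train_features is not None:
        print(train_features.shape)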
def pre_processing(self):
    # Feature extraction
    data = Feature_Extraction.TwitterData_ExtraFeatures()
    data.build_features(self.train_A)
    self.extra_features = data.processed_data

    # Clean the training dataset before integer encoding
    # Delete URLs
    self.train_A['tweet'] = self.train_A['tweet'].str.replace(
        r'http\S+|www.\S+', '', case=False, regex=True)
    # Delete usernames
    self.train_A['tweet'] = self.train_A['tweet'].str.replace(
        r'@\S+', '', case=False, regex=True)
    # Replace hashtags with a space, so that a tweet which looks like one word
    # but is really several words glued together by hashtags gets split up
    self.train_A['tweet'] = self.train_A['tweet'].str.replace(
        r'#', ' ', case=False, regex=True)
    # print('Average number of words per sentence: ',
    #       np.mean([len(s.split(" ")) for s in self.train_A.tweet]))

    for sentence in self.train_A['tweet']:
        # Substitute contractions with full words
        words = self.replace_contractions(sentence)
        # Tokenize tweets
        words = word_tokenize(words)
        # Remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        words = [w.translate(table) for w in words]
        # Remove all tokens that are not alphabetic
        words = [word for word in words if word.isalpha()]
        # Stem the words
        porter = PorterStemmer()
        words = [porter.stem(word) for word in words]
        # Delete stop words, but keep the negations "n't", "not", "nor" and "nt"
        whitelist = ["n't", "not", "nor", "nt"]
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words or w in whitelist]
        # Keep the tokenized tweets
        self.words_of_tweets.append(words)
def main(sc, X_train_path, y_train_path, X_test_path, y_test_path=None):
    # File processing:
    # train_df is a dataframe containing 2 columns, text content and label
    train_raw_rdd = get_train_data_rdd(sc, X_train_path, y_train_path)
    test_raw_rdd = get_test_data_rdd(sc, X_test_path)

    # Feature extraction
    feature_extraction = fe.Feature_Extraction()
    train_df, test_df = feature_extraction.extract_featrues(
        train_rdd=train_raw_rdd, test_rdd=test_raw_rdd)
    # print(train_df.show(n=5, truncate=100))
    # print(test_df.show(n=5, truncate=100))

    print('****************************')
    print('Train Model with NaiveBayes\n')
    nb = NaiveBayes(smoothing=1)
    model = nb.fit(train_df)

    print('****************************')
    print('Testing Unseen Data\n')
    predictions = model.transform(test_df)
    pred_list = [
        int(row.prediction) + 1
        for row in predictions.sort('doc_id').select('prediction').collect()
    ]
    with open('prediction.txt', 'w') as f:
        for pred_label in pred_list:
            f.write('%d\n' % pred_label)

    # If ground-truth labels are supplied, report accuracy
    if y_test_path:
        y_test_data = sc.textFile(y_test_path).collect()
        cnt = 0
        for i in range(len(y_test_data)):
            if int(y_test_data[i]) == pred_list[i]:
                cnt += 1
        print('Accuracy: %f, %d/%d' %
              (cnt * 1.0 / len(y_test_data), cnt, len(y_test_data)))
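# Hypothetical driver sketch: build a local SparkContext and run the pipeline.
# The application name and file paths are placeholders.
if __name__ == '__main__':
    from pyspark import SparkContext
    sc = SparkContext(appName='NaiveBayesTextClassification')
    main(sc,
         X_train_path='data/X_train.txt',
         y_train_path='data/y_train.txt',
         X_test_path='data/X_test.txt',
         y_test_path='data/y_test.txt')
    sc.stop()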
import numpy as np
import xlrd
from sklearn import svm

import code  # project module (assumed) that provides Total_data and Gesture()

# Read the feature matrix from the spreadsheet; the sheet stores one sample
# per column with 135 feature values each.
X = np.empty([code.Total_data, 135], dtype=float)
book = xlrd.open_workbook('Features.xlsx')
sheet = book.sheet_by_name('Sheet1')
for c in range(sheet.ncols):
    for r in range(sheet.nrows):
        X[c, r] = sheet.cell_value(r, c)

# Label vector: the first 151 samples are "UP", the remaining ones are "DOWN"
y = np.empty(code.Total_data, dtype='object')
classes = ["UP", "DOWN"]
for i in range(0, 151):
    y[i] = classes[0]
for i in range(151, code.Total_data):
    y[i] = classes[1]

# Train an RBF-kernel SVM and classify a newly captured gesture
clf = svm.SVC(C=70, kernel='rbf')
clf.fit(X, y)
data = code.Gesture()
Result = clf.predict(data)
print(Result)
        bins.append(3)
    elif y_meal_values[i] < large_max:
        large_bin.append(meal_data.iloc[i])
        bins.append(4)
    else:
        vlarge_bin.append(meal_data.iloc[i])
        bins.append(5)

# In[14]:

# Total number of meals across all bins
sum = len(vsmall_bin) + len(small_bin) + len(mods_bin) + len(modl_bin) + len(
    large_bin) + len(vlarge_bin)

# In[15]:

Feature_Extraction.main()

# In[16]:

feature_matrix = pd.read_csv("mealDataFeatures.csv")
feature_matrix = feature_matrix.to_numpy()

# In[17]:

kmeans = KMeans(n_clusters=6, init='k-means++').fit(feature_matrix)
k_labels = kmeans.labels_

# In[18]:

kmeans_sse = kmeans.inertia_
kmeans_entropy, kmeans_purity = entropy_calc(bins, k_labels)
y = numpy.load('y_Label.npy')
print(X.shape)

for i in range(0, X.shape[0], 100):
    # STFT power spectrum
    # D = librosa.amplitude_to_db(numpy.abs(librosa.stft(X[i])), ref=numpy.max)
    # librosa.display.specshow(D, y_axis='log')
    # plt.show()

    # Waveplot of the signal
    # librosa.display.waveplot(X[i])
    # plt.show()

    # STFT plot
    Stft_dis = []
    D = Feature_Extraction.to_stft(X[i])
    for j in range(D.shape[0]):
        Stft_dis.extend(numpy.reshape(D[j], [-1, 513, 129]))
    for k in range(len(Stft_dis)):
        librosa.display.specshow(librosa.amplitude_to_db(Stft_dis[k]), y_axis='log')
        plt.show()
    break  # only visualise the first sampled signal

    # MelSpectrogram
    # Mel_dis = []
    # D = Feature_Extraction.to_melspectrogram(X[i])
    # for j in range(D.shape[0]):
    #     Mel_dis.extend(numpy.reshape(D[j], [-1, 128, 129]))
    # for k in range(len(Mel_dis)):
    #     librosa.display.specshow(Mel_dis[k], y_axis='log')
def mine_conversations(idf, csv_file_path, stop_datetime, chunksize, conversation_duration):
    datetime_object = datetime.strptime("2015-01-01T00:00:00.000Z", '%Y-%m-%dT%H:%M:%S.%fZ')
    break_loop = False
    open_conversations = []
    log = pmlog.EventLog()
    score_stats = []

    message_classifier = Message_Classifier.MessageClassifier()
    message_classifier.load_models('synth')
    dataprocess = Feature_Extraction.DataProcessing()

    columns = ['id', 'text', 'sent', 'fromUser.username']
    for chunk in pd.read_csv(csv_file_path, chunksize=chunksize, usecols=columns, sep=','):
        # Terminate after the stop date, to trim the dataset.
        if datetime_object > stop_datetime:
            break_loop = True
        if break_loop:
            break
        for index, row in chunk.iterrows():
            try:
                # Start by getting the variables we need
                text = row["text"]
                if str(text) == "nan":
                    continue
                if len(text.split(" ")) <= 2:
                    # Filter out messages with fewer than n words
                    continue
                datetime_object = datetime.strptime(row["sent"], '%Y-%m-%dT%H:%M:%S.%fZ')

                event_dict = {}
                event_dict["User ID"] = row["fromUser.username"]
                event_dict["Date"] = datetime_object
                event_dict["Content"] = text
                event_dict["Class"] = None

                """
                for conversation in open_conversations:
                    time_diff = (datetime_object - conversation.open_time).total_seconds() / 60.0
                    if time_diff > conversation_duration:
                        if len(conversation.message_texts) > 1:
                            try:
                                log.append(conversation.add_to_trace())
                                conversation.write_to_txt()
                            except ValueError:
                                open_conversations.remove(conversation)
                                continue
                        open_conversations.remove(conversation)
                """

                # Now we find our text body and compute its tf-idf weights
                tf_idf_message = {}
                text_list = TP.preprocess_text(text)
                for word in set(text_list):
                    if word in idf:
                        tf_idf_message[word] = (text_list.count(word) / len(text_list)) * idf[word]

                # If the message mentions someone, try to attach it to a
                # conversation that person is already part of
                mention = TP.get_mentions(text)
                added = False
                if len(mention) > 0:
                    for conversation in open_conversations:
                        if conversation.is_person_in_conversation(mention[0][1:]) and not added:
                            conversation.add_message(
                                event_dict,
                                message_text=row['text'],
                                person=row['fromUser.username'],
                                idf=idf)
                            added = True
                            break

                if not added:
                    # Find the best matching conversation by similarity score
                    score = 0
                    best_matching_conversation = None
                    for conversation in open_conversations:
                        conversation_score = conversation.similarity_score(tf_idf_message)
                        if conversation_score > score:
                            score = conversation_score
                            best_matching_conversation = conversation
                    if best_matching_conversation is not None and score > 0.15:
                        score_stats.append(score)
                        best_matching_conversation.add_message(
                            event_dict,
                            message_text=row['text'],
                            person=row['fromUser.username'],
                            idf=idf)
                    else:
                        # No sufficiently similar conversation: open a new one
                        convo = Conversation(
                            open_time=datetime_object,
                            event_dict=event_dict,
                            message_text=row['text'],
                            person=row['fromUser.username'],
                            idf=idf,
                            classifier=message_classifier,
                            dataprocessing=dataprocess)
                        open_conversations.append(convo)
            except AttributeError as e:
                print(e)
                continue
        print("Conversation mining. Date: " + row["sent"])

    # Flush the remaining open conversations into the event log
    for conversation in open_conversations:
        # if len(conversation.message_texts) > 1:
        log.append(conversation.add_to_trace())
        conversation.write_to_txt()
        # open_conversations.remove(conversation)
    return log