Code Example #1
import Sentimental_Data_Preprocessing
import Sentiment_Analysis
import Feature_Extraction


def main(fileName):
    # NOTE: fileName is currently unused; the training file path is hard-coded.
    trainFileName = './Sentiment_training_data/sentiment_training_data.csv'

    # Preprocess the raw training data, then run sentiment analysis on it.
    preprocessedTrainFileName = Sentimental_Data_Preprocessing.main(
        trainFileName)
    outputFileName = Sentiment_Analysis.main(preprocessedTrainFileName)

    # Extract features from the sentiment output and the episode metadata.
    episodeFileName = './Prediction_data/simpsons_episodes.csv'
    Feature_Extraction.main(outputFileName, episodeFileName)
Code Example #2
import Feature_Extraction as Fe  # assumed alias for the project's Feature_Extraction module


def train_svm(models, features, kernel):
    for model in models:
        for feature in features:
            # The RBF kernel is treated as the default, so its filename omits the kernel suffix.
            if kernel == 'rbf':
                filename = model + '_' + feature
            else:
                filename = model + '_' + feature + '_' + kernel

            svm_path = 'svms/' + filename + '.pkl'

            # Train the SVM for this (model, feature) pair and save it to disk.
            Fe.train_and_save_svm(svm_path, model, feature, kernel, True)
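A minimal usage sketch for the function above; the model and feature names are placeholders, not values taken from the project.

# Hypothetical invocation: train one SVM per (model, feature) pair with an RBF kernel.
models = ['alexnet', 'googlenet']
features = ['fc7', 'pool5']
train_svm(models, features, 'rbf')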
Code Example #3
import ccFileIO
import FileIO
import Feature_Extraction as Fe  # assumed alias for the project's Feature_Extraction module


def model(models, mapp):
    groundtruth_path = '../groundtruth/'

    # Read the ground-truth concept lists, video list, selected shots and shot paths.
    conceptsList = ccFileIO.readConceptTxt(groundtruth_path + 'concepts.txt')
    conceptsList_all = ccFileIO.readConceptTxt(groundtruth_path +
                                               'concepts_all.txt')
    videofiles = ccFileIO.read_videofiles(groundtruth_path +
                                          'needed_videos.txt')
    needed_shots = ccFileIO.read_selected_shots_from_file(
        groundtruth_path + 'shots.csv', conceptsList_all)
    shot_paths = ccFileIO.read_shot_paths(groundtruth_path + 'shot_paths.txt')

    all_infos = (conceptsList, conceptsList_all, videofiles, needed_shots,
                 shot_paths)

    # Concept-id mapping between the two concept lists:
    # 1267 forest <-> 75 tree
    # 1015 boat / ship <-> 13 boat
    # 1261 flags <-> 24 flag
    # 1031 computers <-> 21 computer
    # 1010 beach <-> 64 beach
    # 1006 animal <-> 0 animal

    for model in models:
        # Convert the model mean from .binaryproto to .npy
        # (only needs to be done once per model):
        # Fe.convert_binaryproto_to_npy(model)

        acc_values_for_all_concepts = Fe.load_and_use_model(
            model, all_infos, mapp)
        FileIO.write_accuracies(acc_values_for_all_concepts, model)
Code Example #4
def use_svm(models, features, mapp, kernel):
    groundtruth_path = '../groundtruth/'

    # Read the ground-truth concept lists, video list, selected shots and shot paths.
    conceptsList = ccFileIO.readConceptTxt(groundtruth_path + 'concepts.txt')
    conceptsList_all = ccFileIO.readConceptTxt(groundtruth_path +
                                               'concepts_all.txt')
    videofiles = ccFileIO.read_videofiles(groundtruth_path +
                                          'needed_videos.txt')
    needed_shots = ccFileIO.read_selected_shots_from_file(
        groundtruth_path + 'shots.csv', conceptsList_all)
    shot_paths = ccFileIO.read_shot_paths(groundtruth_path + 'shot_paths.txt')

    all_infos = (conceptsList, conceptsList_all, videofiles, needed_shots,
                 shot_paths)

    for model in models:
        for feature in features:
            # Classify the shots with the pre-trained SVM for this (model, feature) pair.
            acc_values_for_all_concepts = Fe.load_and_use_svm(
                model, feature, all_infos, mapp, kernel, True)

            # The RBF kernel is treated as the default, so its filename omits the kernel suffix.
            if kernel == 'rbf':
                filename = model + '_' + feature
            else:
                filename = model + '_' + feature + '_' + kernel
            FileIO.write_accuracies(acc_values_for_all_concepts, filename)
Code Example #5
def extract_features(image):
    # Alternative features, currently unused:
    # avg_dist, avgwidth, avgheight, standard_dev, medwidth = fe.ConnectedComponent(image)
    # area = fe.EnclosedRegion(image)

    # Fractal features (three slopes) plus mid-height statistics and the
    # transition count from the segmentation module.
    slope1, slope2, slope3 = fe.Fractal_Features(image)
    take, f1, f2, f3, f4, f5, f6 = seg.get_mid_height(image)
    f7 = seg.get_transitions(image)
    f8 = f2 / f7
    # return avg_dist, avgwidth, avgheight, medwidth, area
    return f1, f2, f4, f5, f6, f8, slope1, slope2
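For context, a minimal sketch of how these per-image feature vectors might be stacked into a matrix for a classifier; the images variable and the helper name are illustrative, not part of the project.

import numpy as np

def build_feature_matrix(images):
    # Each row holds the eight values returned by extract_features for one image.
    return np.array([extract_features(img) for img in images])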
Code Example #6
def ExtractDataFeatures(train_dir=None, test_dir=None):
    # `path` is a module-level base directory defined outside this snippet.
    final_test_df = None
    final_train_df = None

    if train_dir:
        # Build, shuffle and clean the training set, then extract its features.
        train_data = CreateDataset(path, train_dir)
        train_data = ShuffleData(train_data)
        train_df = CleanData(train_data)
        final_train_df = Feature_Extraction.Feature_Extraction(train_df)

    #train_df.to_csv(os.path.join(path, 'CombinedData.csv'), index=False, encoding='utf-8')
    if test_dir:
        # Same pipeline for the test set, with the test=True flags.
        test_data = CreateDataset(path, test_dir, test=True)
        test_data = ShuffleData(test_data)
        test_df = CleanData(test_data, test=True)
        final_test_df = Feature_Extraction.Feature_Extraction(test_df,
                                                              test=True)

    #df = pd.read_csv(os.path.join(path, 'CombinedData.csv'), encoding='utf-8')
    #final_df.to_csv(os.path.join(path, 'Feature_Extracted_Data.csv'), index=False, encoding='utf-8')
    return final_train_df, final_test_df
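A possible call pattern, purely illustrative; the directory names are placeholders and `path` must already be defined at module level.

# Hypothetical usage: extract features for both splits, or for the training split only.
train_features, test_features = ExtractDataFeatures(train_dir='train', test_dir='test')
train_only, _ = ExtractDataFeatures(train_dir='train')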
Code Example #7
    def pre_processing(self):
        # Feature extraction
        data = Feature_Extraction.TwitterData_ExtraFeatures()
        data.build_features(self.train_A)
        self.extra_features = data.processed_data

        # Cleaning the training dataset and integer encoding

        # Delete URLs
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            r'http\S+|www\.\S+', '', case=False, regex=True)
        # Delete usernames
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            r'@\S+', '', case=False, regex=True)
        # Replace hashtags with a space, to handle tweets that look like one word
        # but are actually several words joined by hashtags
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            r'#', ' ', case=False, regex=True)

        #        print('Average number of words per sentence: ', np.mean([len(s.split(" ")) for s in self.train_A.tweet]))

        for sentence in self.train_A['tweet']:
            # Substitute contractions with full words
            words = self.replace_contractions(sentence)

            # Tokenize tweets
            words = word_tokenize(words)

            # Remove punctuation from each word
            table = str.maketrans('', '', string.punctuation)
            words = [w.translate(table) for w in words]

            # Remove all tokens that are not alphabetic
            words = [word for word in words if word.isalpha()]

            # Stem the words
            porter = PorterStemmer()
            words = [porter.stem(word) for word in words]

            # Delete stop words, but keep the negations "n't", "not", "nor" and "nt"
            whitelist = ["n't", "not", 'nor', "nt"]
            stop_words = set(stopwords.words('english'))
            words = [w for w in words if w not in stop_words or w in whitelist]

            # Keep the tokenized tweets
            self.words_of_tweets.append(words)
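A small illustration of the stop-word whitelist above, assuming NLTK's English stop-word corpus has been downloaded (nltk.download('stopwords')):

from nltk.corpus import stopwords

whitelist = ["n't", "not", 'nor', "nt"]
stop_words = set(stopwords.words('english'))
tokens = ['i', 'do', 'not', 'like', 'it']
# 'not' is an NLTK stop word but survives the filter thanks to the whitelist.
print([w for w in tokens if w not in stop_words or w in whitelist])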
Code Example #8
File: Engine.py  Project: dsp-uga/team-void-p1
from pyspark.ml.classification import NaiveBayes

import Feature_Extraction as fe  # assumed alias for the project's Feature_Extraction module


def main(sc, X_train_path, y_train_path, X_test_path, y_test_path=None):
    # File processing:
    # train_df is a DataFrame with two columns, the text content and the label.
    train_raw_rdd = get_train_data_rdd(sc, X_train_path, y_train_path)
    test_raw_rdd = get_test_data_rdd(sc, X_test_path)

    # Feature extraction
    feature_extraction = fe.Feature_Extraction()
    train_df, test_df = feature_extraction.extract_featrues(
        train_rdd=train_raw_rdd, test_rdd=test_raw_rdd)
    # print(train_df.show(n=5, truncate=100))
    # print(test_df.show(n=5, truncate=100))

    print('****************************')
    print('Train Model with NaiveBayes\n')
    nb = NaiveBayes(smoothing=1)
    model = nb.fit(train_df)

    print('****************************')
    print('Testing Unseen Data\n')
    predictions = model.transform(test_df)

    # Predictions are shifted by one to match the original label indexing.
    pred_list = [
        int(row.prediction) + 1
        for row in predictions.sort('doc_id').select('prediction').collect()
    ]
    with open('prediction.txt', 'w') as f:
        for pred_label in pred_list:
            f.write('%d\n' % pred_label)

    # If ground-truth test labels are provided, report the accuracy.
    if y_test_path:
        y_test_data = sc.textFile(y_test_path).collect()
        cnt = 0
        for i in range(len(y_test_data)):
            if int(y_test_data[i]) == pred_list[i]:
                cnt += 1
        print('Accuracy: %f, %d/%d' %
              (cnt * 1.0 / len(y_test_data), cnt, len(y_test_data)))
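As a hypothetical alternative to the manual accuracy loop, Spark's built-in evaluator could be used if the test DataFrame carried a 'label' column matching the model's label indexing; this is a sketch, not part of the project code.

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction',
                                              metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print('Accuracy: %f' % accuracy)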
Code Example #9
import numpy as np
import xlrd
from sklearn import svm

import code  # project module (shadows the standard-library `code`) providing Total_data and Gesture()

# Feature matrix: one row per sample, 135 feature values per sample.
X = np.empty([code.Total_data, 135], dtype=float)

# The sheet is read transposed (X[c, r] = cell(r, c)), i.e. each spreadsheet
# column is treated as one sample and each row as one feature.
book = xlrd.open_workbook('Features.xlsx')
sheet = book.sheet_by_name('Sheet1')
for c in range(sheet.ncols):
    for r in range(sheet.nrows):
        X[c, r] = sheet.cell_value(r, c)

# The first 151 samples are labelled "UP", the rest "DOWN".
y = np.empty(code.Total_data, dtype='object')
classes = ["UP", "DOWN"]

for i in range(0, 151):
    y[i] = classes[0]

for i in range(151, code.Total_data):
    y[i] = classes[1]

# Train an RBF-kernel SVM and classify a freshly captured gesture.
clf = svm.SVC(C=70, kernel='rbf')
clf.fit(X, y)

data = code.Gesture()
Result = clf.predict(data)

print(Result)
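Note that xlrd 2.0 and later can no longer read .xlsx workbooks, only legacy .xls files. A possible alternative is to load the same data with pandas and the openpyxl engine, assuming (as the loop above does) that each spreadsheet column holds one sample:

import pandas as pd

# Read all cells (no header row) and transpose so each row of X is one sample.
X = pd.read_excel('Features.xlsx', sheet_name='Sheet1',
                  header=None, engine='openpyxl').to_numpy(dtype=float).T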
Code Example #10
        bins.append(3)
    elif y_meal_values[i] < large_max:
        large_bin.append(meal_data.iloc[i])
        bins.append(4)
    else:
        vlarge_bin.append(meal_data.iloc[i])
        bins.append(5)

# In[14]:

# Total number of binned meal samples (note: the name shadows the built-in sum).
sum = len(vsmall_bin) + len(small_bin) + len(mods_bin) + len(modl_bin) + len(
    large_bin) + len(vlarge_bin)

# In[15]:

Feature_Extraction.main()

# In[16]:

feature_matrix = pd.read_csv("mealDataFeatures.csv")
feature_matrix = feature_matrix.to_numpy()

# In[17]:

kmeans = KMeans(n_clusters=6, init='k-means++').fit(feature_matrix)
k_labels = kmeans.labels_

# In[18]:

kmeans_sse = kmeans.inertia_
kmeans_entropy, kmeans_purity = entropy_calc(bins, k_labels)
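For reference, one common definition of cluster purity computed from the ground-truth bins and the k-means labels; this is a standalone sketch, not the project's entropy_calc implementation.

import numpy as np

def cluster_purity(true_bins, cluster_labels):
    # Fraction of samples that fall into their cluster's majority ground-truth bin.
    true_bins = np.asarray(true_bins)
    cluster_labels = np.asarray(cluster_labels)
    matched = 0
    for c in np.unique(cluster_labels):
        members = true_bins[cluster_labels == c]
        matched += np.bincount(members).max()
    return matched / len(true_bins)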
Code Example #11
# X is assumed to be loaded earlier in the script; only the labels are loaded here.
y = numpy.load('y_Label.npy')

print(X.shape)
for i in range(0, X.shape[0], 100):
    # STFT Power Spectrum
    # D = librosa.amplitude_to_db(numpy.abs(librosa.stft(X[i])), ref=numpy.max)
    # librosa.display.specshow(D, y_axis='log')
    # plt.show()

    # Waveplot Of Signal
    # librosa.display.waveplot(X[i])
    # plt.show()

    # STFT Plot
    Stft_dis = []
    D = Feature_Extraction.to_stft(X[i])
    for j in range(D.shape[0]):
        Stft_dis.extend(numpy.reshape(D[j], [-1, 513, 129]))
    for k in range(len(Stft_dis)):
        librosa.display.specshow(librosa.amplitude_to_db(Stft_dis[k]),
                                 y_axis='log')
        plt.show()
    # Only the first sample is visualised; remove this break to step through X.
    break

    # MelSpectrogram
    # Mel_dis = []
    # D = Feature_Extraction.to_melspectrogram(X[i])
    # for j in range(D.shape[0]):
    #     Mel_dis.extend(numpy.reshape(D[j], [-1, 128, 129]))
    # for k in range(len(Mel_dis)):
    #     librosa.display.specshow(Mel_dis[k], y_axis='log')
Code Example #12
def mine_conversations(idf, csv_file_path, stop_datetime, chunksize,
                       conversation_duration):
    datetime_object = datetime.strptime("2015-01-01T00:00:00.000Z",
                                        '%Y-%m-%dT%H:%M:%S.%fZ')
    break_loop = False
    open_conversations = []
    log = pmlog.EventLog()
    score_stats = []
    message_classifier = Message_Classifier.MessageClassifier()
    message_classifier.load_models('synth')
    dataprocess = Feature_Extraction.DataProcessing()

    columns = ['id', 'text', 'sent', 'fromUser.username']
    for chunk in pd.read_csv(csv_file_path,
                             chunksize=chunksize,
                             usecols=columns,
                             sep=','):
        # Stop once the most recently parsed timestamp passes stop_datetime,
        # so the dataset can be trimmed to the wanted date range.
        if datetime_object > stop_datetime:
            break_loop = True

        if break_loop:
            break

        for index, row in chunk.iterrows():

            try:
                # Start by getting the variables we need
                text = row["text"]

                if str(text) == "nan":
                    continue

                # Skip messages with two or fewer words.
                if len(text.split(" ")) <= 2:
                    continue

                datetime_object = datetime.strptime(row["sent"],
                                                    '%Y-%m-%dT%H:%M:%S.%fZ')

                event_dict = {}
                event_dict["User ID"] = row["fromUser.username"]
                event_dict["Date"] = datetime_object
                event_dict["Content"] = text
                event_dict["Class"] = None
                """
                for conversation in open_conversations:
                    time_diff = (datetime_object - conversation.open_time).total_seconds() / 60.0
                    if time_diff > conversation_duration:
                        if len(conversation.message_texts) > 1:
                            try:
                                log.append(conversation.add_to_trace())
                                conversation.write_to_txt()
                            except ValueError:
                                open_conversations.remove(conversation)
                                continue
                        open_conversations.remove(conversation)
                """

                # Now we find our text body
                tf_idf_message = {}
                text_list = TP.preprocess_text(text)
                for word in set(text_list):
                    if word in idf:
                        tf_idf_message[word] = (text_list.count(word) /
                                                len(text_list)) * idf[word]

                mention = TP.get_mentions(text)

                added = False
                if len(mention) > 0:
                    for conversation in open_conversations:
                        if conversation.is_person_in_conversation(
                                mention[0][1:]) and not added:
                            conversation.add_message(
                                event_dict,
                                message_text=row['text'],
                                person=row['fromUser.username'],
                                idf=idf)
                            added = True
                            break

                if not added:
                    # Find the best matching conversation
                    score = 0
                    best_matching_conversation = None
                    for conversation in open_conversations:
                        conversation_score = conversation.similarity_score(
                            tf_idf_message)
                        if conversation_score > score:
                            score = conversation_score
                            best_matching_conversation = conversation

                    if best_matching_conversation is not None and score > 0.15:
                        score_stats.append(score)
                        best_matching_conversation.add_message(
                            event_dict,
                            message_text=row['text'],
                            person=row['fromUser.username'],
                            idf=idf)

                    else:
                        convo = Conversation(open_time=datetime_object,
                                             event_dict=event_dict,
                                             message_text=row['text'],
                                             person=row['fromUser.username'],
                                             idf=idf,
                                             classifier=message_classifier,
                                             dataprocessing=dataprocess)

                        open_conversations.append(convo)

            except AttributeError as e:
                print(e)
                continue

        # Progress log: the timestamp of the last message in the processed chunk.
        print("Conversation mining. Date: " + row["sent"])

    for conversation in open_conversations:
        # if len(conversation.message_texts) > 1:
        log.append(conversation.add_to_trace())
        conversation.write_to_txt()
        # open_conversations.remove(conversation)

    return log