コード例 #1
0
def get_strings(FILE_NAMES, b_dep):
    """Read and pre-process each user's tweet files into a padded token list.

    Parameters
    ----------
    FILE_NAMES : iterable of iterables of str
        One inner collection of file paths per user.
    b_dep : int
        1 if these users belong to the depression group; any other value
        selects the control group (chooses which length-log list is updated).

    Returns
    -------
    list of list of str
        One token list per accepted user, padded with '' up to
        ``MAX_TENSOR_LENGTH``; at most 1000 users are returned.

    Notes
    -----
    Relies on module-level names defined elsewhere in this file:
    ``ppc`` (pre-processing module), ``MAX_TENSOR_LENGTH``,
    ``LENGTHS_DEPRESSION`` and ``LENGTHS_CONTROL``.
    """
    count = 0
    FILE_STRINGS = []
    for files in FILE_NAMES:
        string_user = []
        for file in files:
            # Read the whole file in one call; the with-statement closes it,
            # so no explicit f.close() is needed (the original also built the
            # string line-by-line with +=, which is quadratic).
            with open(file, 'r', encoding="utf-8") as f:
                all_lines = f.read()
            # Tokenize/clean the raw text and collect all tokens for the user.
            string_user.extend(ppc.pre_process_data(all_lines))
        # Skip users with too little data to be useful.
        if len(string_user) < 1000:
            print("User Omitted: Too Few Records")
            continue
        # Pad shorter users up to the fixed tensor length, recording the
        # pre-padding length for later statistics.
        if len(string_user) < MAX_TENSOR_LENGTH:
            if b_dep == 1:
                LENGTHS_DEPRESSION.append(len(string_user))
            else:
                LENGTHS_CONTROL.append(len(string_user))
            string_user += [''] * (MAX_TENSOR_LENGTH - len(string_user))
        count += 1
        FILE_STRINGS.append(string_user)
        if count > 999:  # hard cap: at most 1000 users per call
            break
    return FILE_STRINGS
コード例 #2
0
ファイル: cnn_toxic.py プロジェクト: dimoynwa/NLPTasks
    # Collapse the sequence dimension: keep the max activation per filter.
    x = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    # One sigmoid unit per target column — per-label probabilities,
    # matching the binary_crossentropy loss below.
    x = tf.keras.layers.Dense(conf['targets'].shape[1],
                              activation='sigmoid')(x)

    # NOTE(review): `input` shadows the builtin; presumably it is the model's
    # input tensor defined earlier in this function — confirm upstream.
    model = tf.keras.Model(input, x)

    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model


if __name__ == "__main__":
    # Build the training configuration (data, targets, ...) and the model.
    config = pre_process_data()
    model = build_model(config)
    print('Training the model...')

    history = model.fit(x=config['data'],
                        y=config['targets'],
                        epochs=EPOCHS,
                        validation_split=VALIDATION_SPLIT,
                        batch_size=BATCH_SIZE)
    print('Training done.')

    # Persist the trained model in HDF5 format.
    model.save('./saved_models/cnn_toxic.h5')

    # Plot training vs. validation loss curves (plt.show() presumably
    # follows outside this excerpt).
    plt.plot(history.history['loss'], label='Loss')
    plt.plot(history.history['val_loss'], label='Validation loss')
    plt.legend()
コード例 #3
0
#all tweets are stored in a single vector (string at the moment)
# Accumulators: per-user pre-padding token counts, and the padded token
# lists themselves (one entry per accepted user).
depression_lengths = []
control_lengths = []
FILE_STRINGS = []
FILE_STRINGS_CONTROL = []
count = 0
for files in FILE_NAMES_DEPRESSION:
    string_user = []
    for file in files:
        tweet = ""  # NOTE(review): never used below
        all_lines = ""
        with open(file, 'r', encoding="utf-8") as f:
            # Concatenate the whole file into one string.
            for line in f:
                all_lines += line
            f.close()  # redundant: the with-statement closes the file
        pre_processed_tokens = ppc.pre_process_data(all_lines)
        #print(pre_processed_tokens) #debugging line
        for ppt in pre_processed_tokens:
            string_user.append(ppt)
        #string_user.append(pre_processed_tokens)
    #string_user = ''.join(string_user)
    # Drop users with too few tokens to be useful.
    if len(string_user) < 1000:
        print("User Omitted: Too Few Records")
        continue
    # Pad short users up to the fixed tensor length, remembering the
    # original length.
    if len(string_user) < MAX_TENSOR_LENGTH:
        depression_lengths.append(len(string_user))
        string_user += [''] * (MAX_TENSOR_LENGTH - len(string_user))
    #print(len(string_user))
    count = count + 1
    FILE_STRINGS.append(string_user)
    if count > 999:  # cap at 1000 users (loop body truncated in this excerpt)
コード例 #4
0
# Per-group output containers; `count` counts accepted tweets.
FILE_STRINGS = []
FILE_STRINGS_CONTROL = []
count = 0
for files in FILE_NAMES_DEPRESSION:
    string_user = []
    for file in files:
        #print("Loaded tweets from depression: ", count, end='\r')
        tweet = ""       # NOTE(review): never used below
        all_lines = ""   # NOTE(review): never used below
        all_tweets = []  # NOTE(review): never used below
        with open(file, 'r', encoding="utf-8") as f:
            for line in f:
                # Keep only lines longer than 5 and shorter than 280
                # characters (presumably the tweet length limit — the
                # trailing newline is included in len(line)).
                if len(line) > 5:
                    if len(line) < 280:
                        #pre process the lines as they come in
                        pps = ppc.pre_process_data(line)
                        if len(pps) > 1:
                            #pad lines
                            pps += [''] * (280 - len(pps))
                            TWEETS_DEPRESSION.append(pps)
                            count = count + 1
            # In debug mode, stop early once enough tweets are loaded.
            # NOTE(review): this `break` exits the `for file in files`
            # loop (the nearest enclosing loop), and both close() calls
            # are redundant inside the with-statement.
            if DEBUG_MODE == 1 and count > 10000:
                f.close()
                break
            f.close()
count = 0
print('\n')
print("Importing Control Users")
# Same import pass for the control group (truncated in this excerpt).
for files in FILE_NAMES_CONTROL:
    string_user = []
    for file in files: