def run(clf, output_csv_path):
    """Train or evaluate a text classifier per product category.

    Args:
        clf: a classifier *class* (not an instance); instantiated here and
            reused across the three categories.
        output_csv_path: CSV file that test-mode predictions are appended to
            as ``itemid,category`` rows.

    Relies on module-level names: data_path, seed, feature_length, stopwords,
    text_utils, pd, os, argparse, TfidfVectorizer, accuracy_score.
    """
    parser = argparse.ArgumentParser(description='NDSC Text Classifier')
    parser.add_argument(
        '--mode', type=str, default='train', metavar='N',
        help='train or test (for submission mode) (default: train)')
    args = parser.parse_args()

    print("\nRunning {}...".format(clf.__name__))
    clf = clf()

    for cat in ['beauty_image', 'fashion_image', 'mobile_image']:
        df = pd.read_csv(os.path.join(data_path, 'train_' + cat + '.csv'))

        # Text cleaning
        df['title'] = text_utils.clean_text(df['title'], stopwords)

        # Word-level TF-IDF features, capped at `feature_length` terms.
        vectorizer = TfidfVectorizer(strip_accents='unicode',
                                     analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     lowercase=True,
                                     max_features=feature_length)

        if args.mode == 'train':
            _, _, X_train, y_train, X_val, y_val = text_utils.data_split(
                df, seed)
            train_vectors = vectorizer.fit_transform(X_train)
            val_vectors = vectorizer.transform(X_val)
            print("Feature size:", train_vectors.shape)

            # Train on the split, evaluate on the held-out set.
            clf.fit(train_vectors, y_train)
            predicted = clf.predict(val_vectors)
            # accuracy_score returns a fraction in [0, 1]; scale to percent
            # so the '%' in the message is truthful (was printing "0.85%").
            print("Accuracy for {}: {:.2f}%".format(
                cat, accuracy_score(y_val, predicted) * 100))
        elif args.mode == 'test':
            # Train on ALL labelled data for this category, then predict the
            # submission test set.
            X_train, y_train = df['title'].values, df['Category'].values
            train_vectors = vectorizer.fit_transform(X_train)
            # Fix: the original never fitted the classifier in test mode,
            # so predict() ran on an untrained model.
            clf.fit(train_vectors, y_train)

            test_df = pd.read_csv('./test_' + cat + '.csv')
            X_test = test_df['title'].values
            test_vectors = vectorizer.transform(X_test)
            predicted = clf.predict(test_vectors)
            print(predicted)
            print(test_df['itemid'].values)

            # Append mode on purpose: the three categories share one file.
            with open(output_csv_path, 'a') as f:
                for i in range(len(predicted)):
                    row = '{},{}\n'.format(test_df['itemid'][i], predicted[i])
                    f.write(row)
        else:
            raise Exception("Please enter mode as 'train' or 'test'")
def text2seq(text, type='char'):
    """Convert raw text into a sequence of symbol ids.

    With type='char', cleaned characters are mapped directly through the
    symbol table. For any other value the cleaned text is run through g2p
    and each phoneme is looked up with an '@' prefix when the prefixed form
    exists in the symbol set; otherwise the token passes through unchanged.
    """
    lookup = {symbol: index for index, symbol in enumerate(symbols)}
    cleaned = clean_text(text.rstrip())

    if type == 'char':
        return seq2id(cleaned, lookup)

    phones = []
    for token in g2p(cleaned.lower()):
        prefixed = '@' + token
        phones.append(prefixed if prefixed in lookup else token)
    return seq2id(phones, lookup)
def process_song(row):
    """Clean a song's lyrics in-place and record before/after word counts.

    Intended for DataFrame.apply: any failure (e.g. lyrics in another
    language that the cleaner rejects) marks the row by setting its 'text'
    to np.nan so it can be dropped later; the row is always returned.
    """
    try:
        raw_lyrics = row['text']
        cleaned = text_utils.clean_text(raw_lyrics)
        row['cleaned_lyrics'] = cleaned
        row['old_word_count'] = len(raw_lyrics.strip().split())
        row['new_word_count'] = len(cleaned.strip().split())
    except Exception as err:
        # Deliberate best-effort: report and flag the row rather than abort
        # the whole apply().
        print(err)
        row['text'] = np.nan
    return row
def precompute_char_phone(path):
    """Precompute character and phoneme id sequences for a dataset.

    Reads ``<path>/metadata.csv`` (utf-8, pipe-separated rows of
    ``id|<ignored>|text``) and writes one pickle per utterance:
    ``<path>/chars/<id>.pkl``  -> {'char': str, 'char_seq': ids}
    ``<path>/phones/<id>.pkl`` -> {'phone': list, 'phone_seq': ids}

    Relies on module-level names: symbols, clean_text, seq2id, G2p,
    codecs, os, pkl.
    """
    metadata_file = os.path.join(path, 'metadata.csv')
    char_folder = os.path.join(path, 'chars')
    phone_folder = os.path.join(path, 'phones')
    for folder in (char_folder, phone_folder):
        if not os.path.isdir(folder):
            os.makedirs(folder)

    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    g2p = G2p()

    with codecs.open(metadata_file, 'r', 'utf-8') as metadata:
        # Iterate the file lazily instead of readlines(); also renamed the
        # loop variable so it no longer shadows the builtin `id`.
        for line in metadata:
            utt_id, _, text = line.split("|")
            utt_id = utt_id.replace('"', '')  # drop stray quotes around ids

            clean_char = clean_text(text.rstrip())
            char_seq = seq2id(clean_char, symbol_to_id)

            # Known phonemes get the '@' prefix used by the symbol table;
            # unknown tokens pass through as-is.
            clean_phone = []
            for s in g2p(clean_char.lower()):
                if '@' + s in symbol_to_id:
                    clean_phone.append('@' + s)
                else:
                    clean_phone.append(s)
            phone_seq = seq2id(clean_phone, symbol_to_id)

            char = {'char': clean_char, 'char_seq': char_seq}
            char_file = os.path.join(char_folder, utt_id + '.pkl')
            with open(char_file, 'wb') as f:
                pkl.dump(char, f)

            phone = {'phone': clean_phone, 'phone_seq': phone_seq}
            phone_file = os.path.join(phone_folder, utt_id + '.pkl')
            with open(phone_file, 'wb') as f:
                pkl.dump(phone, f)
import text_utils
import tensorflow as tf
import emoji

tf.flags.DEFINE_string("input_file", "../Data/text8.txt",
                       "input file to pre-process")
tf.flags.DEFINE_string("output_file", "../Data/text8.txt.clean",
                       "Output file after pre-processing")
FLAGS = tf.flags.FLAGS

# Fix: the input file was opened without a context manager (never closed)
# and without an explicit encoding; read it the same way we write (utf-8).
with open(FLAGS.input_file, "r", encoding="utf-8") as input_file:
    data_samples = [emoji.demojize(s.strip()) for s in input_file]

x_text = [text_utils.clean_text(sent) for sent in data_samples]

# Write one cleaned line per input line.
with open(FLAGS.output_file, "w", encoding="utf-8") as file_writer:
    file_writer.writelines("%s\n" % line for line in x_text)