def run(clf, output_csv_path):
    """Train or evaluate a text classifier per product category.

    Args:
        clf: a classifier *class* (not an instance); instantiated here and
            reused across the three categories.
        output_csv_path: CSV file that test-mode predictions are appended to
            as ``itemid,category`` rows.

    Relies on module-level names: data_path, seed, feature_length, stopwords,
    text_utils, pd, os, argparse, TfidfVectorizer, accuracy_score.
    """
    parser = argparse.ArgumentParser(description='NDSC Text Classifier')
    parser.add_argument(
        '--mode', type=str, default='train', metavar='N',
        help='train or test (for submission mode) (default: train)')
    args = parser.parse_args()

    print("\nRunning {}...".format(clf.__name__))
    clf = clf()

    for cat in ['beauty_image', 'fashion_image', 'mobile_image']:
        df = pd.read_csv(os.path.join(data_path, 'train_' + cat + '.csv'))

        # Text cleaning
        df['title'] = text_utils.clean_text(df['title'], stopwords)

        # Word-level TF-IDF features, capped at `feature_length` terms.
        vectorizer = TfidfVectorizer(strip_accents='unicode',
                                     analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     lowercase=True,
                                     max_features=feature_length)

        if args.mode == 'train':
            _, _, X_train, y_train, X_val, y_val = text_utils.data_split(
                df, seed)
            train_vectors = vectorizer.fit_transform(X_train)
            val_vectors = vectorizer.transform(X_val)
            print("Feature size:", train_vectors.shape)

            # Train on the split, evaluate on the held-out set.
            clf.fit(train_vectors, y_train)
            predicted = clf.predict(val_vectors)
            # accuracy_score returns a fraction in [0, 1]; scale to percent
            # so the '%' in the message is truthful (was printing "0.85%").
            print("Accuracy for {}: {:.2f}%".format(
                cat, accuracy_score(y_val, predicted) * 100))
        elif args.mode == 'test':
            # Train on ALL labelled data for this category, then predict the
            # submission test set.
            X_train, y_train = df['title'].values, df['Category'].values
            train_vectors = vectorizer.fit_transform(X_train)
            # Fix: the original never fitted the classifier in test mode,
            # so predict() ran on an untrained model.
            clf.fit(train_vectors, y_train)

            test_df = pd.read_csv('./test_' + cat + '.csv')
            X_test = test_df['title'].values
            test_vectors = vectorizer.transform(X_test)
            predicted = clf.predict(test_vectors)
            print(predicted)
            print(test_df['itemid'].values)

            # Append mode on purpose: the three categories share one file.
            with open(output_csv_path, 'a') as f:
                for i in range(len(predicted)):
                    row = '{},{}\n'.format(test_df['itemid'][i], predicted[i])
                    f.write(row)
        else:
            raise Exception("Please enter mode as 'train' or 'test'")
def text2seq(text, type='char'):
    """Convert raw text into a sequence of symbol ids.

    With type='char', cleaned characters are mapped directly through the
    symbol table. For any other value the cleaned text is run through g2p
    and each phoneme is looked up with an '@' prefix when the prefixed form
    exists in the symbol set; otherwise the token passes through unchanged.
    """
    lookup = {symbol: index for index, symbol in enumerate(symbols)}
    cleaned = clean_text(text.rstrip())

    if type == 'char':
        return seq2id(cleaned, lookup)

    phones = []
    for token in g2p(cleaned.lower()):
        prefixed = '@' + token
        phones.append(prefixed if prefixed in lookup else token)
    return seq2id(phones, lookup)
def process_song(row):
    """Clean a song's lyrics in-place and record before/after word counts.

    Intended for DataFrame.apply: any failure (e.g. lyrics in another
    language that the cleaner rejects) marks the row by setting its 'text'
    to np.nan so it can be dropped later; the row is always returned.
    """
    try:
        raw_lyrics = row['text']
        cleaned = text_utils.clean_text(raw_lyrics)
        row['cleaned_lyrics'] = cleaned
        row['old_word_count'] = len(raw_lyrics.strip().split())
        row['new_word_count'] = len(cleaned.strip().split())
    except Exception as err:
        # Deliberate best-effort: report and flag the row rather than abort
        # the whole apply().
        print(err)
        row['text'] = np.nan
    return row
def precompute_char_phone(path):
    """Precompute character and phoneme id sequences for a dataset.

    Reads ``<path>/metadata.csv`` (utf-8, pipe-separated rows of
    ``id|<ignored>|text``) and writes one pickle per utterance:
    ``<path>/chars/<id>.pkl``  -> {'char': str, 'char_seq': ids}
    ``<path>/phones/<id>.pkl`` -> {'phone': list, 'phone_seq': ids}

    Relies on module-level names: symbols, clean_text, seq2id, G2p,
    codecs, os, pkl.
    """
    metadata_file = os.path.join(path, 'metadata.csv')
    char_folder = os.path.join(path, 'chars')
    phone_folder = os.path.join(path, 'phones')
    for folder in (char_folder, phone_folder):
        if not os.path.isdir(folder):
            os.makedirs(folder)

    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    g2p = G2p()

    with codecs.open(metadata_file, 'r', 'utf-8') as metadata:
        # Iterate the file lazily instead of readlines(); also renamed the
        # loop variable so it no longer shadows the builtin `id`.
        for line in metadata:
            utt_id, _, text = line.split("|")
            utt_id = utt_id.replace('"', '')  # drop stray quotes around ids

            clean_char = clean_text(text.rstrip())
            char_seq = seq2id(clean_char, symbol_to_id)

            # Known phonemes get the '@' prefix used by the symbol table;
            # unknown tokens pass through as-is.
            clean_phone = []
            for s in g2p(clean_char.lower()):
                if '@' + s in symbol_to_id:
                    clean_phone.append('@' + s)
                else:
                    clean_phone.append(s)
            phone_seq = seq2id(clean_phone, symbol_to_id)

            char = {'char': clean_char, 'char_seq': char_seq}
            char_file = os.path.join(char_folder, utt_id + '.pkl')
            with open(char_file, 'wb') as f:
                pkl.dump(char, f)

            phone = {'phone': clean_phone, 'phone_seq': phone_seq}
            phone_file = os.path.join(phone_folder, utt_id + '.pkl')
            with open(phone_file, 'wb') as f:
                pkl.dump(phone, f)
import text_utils
import tensorflow as tf
import emoji

tf.flags.DEFINE_string("input_file", "../Data/text8.txt",
                       "input file to pre-process")
tf.flags.DEFINE_string("output_file", "../Data/text8.txt.clean",
                       "Output file after pre-processing")
FLAGS = tf.flags.FLAGS

# Fix: the input file was opened without a context manager (never closed)
# and without an explicit encoding; read it the same way we write (utf-8).
with open(FLAGS.input_file, "r", encoding="utf-8") as input_file:
    data_samples = [emoji.demojize(s.strip()) for s in input_file]

x_text = [text_utils.clean_text(sent) for sent in data_samples]

# Write one cleaned line per input line.
with open(FLAGS.output_file, "w", encoding="utf-8") as file_writer:
    file_writer.writelines("%s\n" % line for line in x_text)