def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                        feats.append([float(v) for v in features])
                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))
                        row = 0
                        for pred in prediction:
                            fileout.write("{}\n".format(str(pred[1])))
                            row += 1
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                if ojob:
                    output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
def worker_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as tokl, \
         MosesTokenizer(args.target_lang) as tokr:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {}".format(job.__repr__()))
                nblock, filein_name, label = job
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False) as fileout:
                    logging.debug("Filtering: creating temporary file {}".format(fileout.name))
                    for i in filein:
                        features = feature_extract(i, tokl, tokr, args)
                        for j in features:
                            fileout.write("{}".format(j))
                            fileout.write("\t")
                        fileout.write("{}".format(label))
                        fileout.write("\n")
                    ojob = (nblock, fileout.name)
                    fileout.close()
                    filein.close()
                output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
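# Both queue-driven workers above follow the same protocol: jobs arrive on
# jobs_queue as (nblock, filein_name[, label]) tuples, results leave on
# output_queue, and a falsy sentinel (None) makes each worker log
# "Exiting worker" and break. A minimal sketch of a driver for that protocol
# follows; run_pool, the process count, and the job layout are illustrative
# assumptions, not part of the original code.
from multiprocessing import Process, Queue

def run_pool(worker, args, jobs, nprocs=4):
    jobs_queue = Queue(maxsize=nprocs)   # (nblock, filein_name[, label]) tuples
    output_queue = Queue()               # workers put (nblock, tempfile_name)
    procs = [Process(target=worker, args=(n, jobs_queue, output_queue, args))
             for n in range(nprocs)]
    for p in procs:
        p.start()
    for job in jobs:
        jobs_queue.put(job)
    for _ in procs:
        jobs_queue.put(None)             # one sentinel per worker to shut it down
    for p in procs:
        p.join()
    return output_queue                  # a reducer can drain and reorder by nblock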
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    temp_lines = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        parts = i.strip().split("\t")
                        line = ""
                        temp_lines.append(i)
                        if len(parts) == 7:
                            # Last two columns are the language pair
                            if parts[-2] == args.source_lang and parts[-1] == args.target_lang:
                                line = "{}\t{}\n".format(parts[1], parts[3])
                            elif parts[-1] == args.source_lang and parts[-2] == args.target_lang:
                                # Swapped pair: emit in source-target order
                                line = "{}\t{}\n".format(parts[3], parts[1])
                            features = feature_extract(line, source_tokenizer, target_tokenizer, args)
                            feats.append([float(v) for v in features])
                        else:
                            logging.debug("Line not included in process: {}".format(i))
                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))
                        row = 0
                        for pred in prediction:
                            while not temp_lines[row].startswith("<tu "):
                                fileout.write(temp_lines[row])
                                row += 1
                            fileout.write("{}\t{}\n".format(temp_lines[row].strip("\n"), str(pred[1])))
                            row += 1
                    else:
                        for l in temp_lines:
                            fileout.write(l)
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
def ev_feature_and_label(entry):
    label = entry['label']
    document_words = set(split_words(entry['fulltext']))
    features_extracted = features.feature_extract(entry)
    for word in word_features:
        features_extracted['contains(%s)' % word] = word in document_words
    return (features_extracted, label)
def feature_and_label(entry):
    label = entry['label']
    document_words = set(split_words(entry['fulltext']))
    features_extracted = features.feature_extract(entry)
    for word in word_features:
        features_extracted['contains(%s)' % word] = word in document_words
    entries_by_label[label].append(features_extracted)
    return (label, features_extracted)
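# ev_feature_and_label and feature_and_label above produce NLTK-style boolean
# feature dicts keyed 'contains(word)'. Assuming `entries` is an iterable of
# dicts with 'label' and 'fulltext' keys (defined elsewhere, as are split_words,
# word_features, and entries_by_label), the (featureset, label) pairs returned
# by ev_feature_and_label match what nltk.NaiveBayesClassifier.train() expects.
# A hypothetical training sketch under those assumptions:
import nltk

def train_bayes(entries):
    train_set = [ev_feature_and_label(e) for e in entries]  # [(features, label), ...]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    classifier.show_most_informative_features(10)            # inspect top word cues
    return classifier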
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                            # print("SENTENCE PAIR: %%{}%%".format(i))
                            # print(Features(features))  # debug
                            feats.append([float(v) for v in features])
                    predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                    filein.seek(0)
                    piter = iter(predictions)
                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            p = next(piter)
                            fileout.write(i.strip())
                            fileout.write("\t")
                            fileout.write(str(p[1]))
                            fileout.write("\n")
                        else:
                            fileout.write(i.strip("\n"))
                            fileout.write("\t0\n")
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                if ojob:
                    output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
def worker_process(i, jobs_queue, output_queue, args):
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {}".format(job.__repr__()))
            nblock, filein_name, label = job
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False) as fileout:
                logging.debug("Filtering: creating temporary file {}".format(fileout.name))
                for i in filein:
                    srcsen, trgsen = i.split("\t")[:2]
                    trgsen = trgsen.strip()
                    # print(str(srcsen) + " --- " + str(trgsen))
                    features = feature_extract(srcsen, trgsen, source_tokeniser, target_tokeniser, args)
                    for j in features:
                        fileout.write("{}".format(j))
                        fileout.write("\t")
                    fileout.write("{}".format(label))
                    fileout.write("\n")
                ojob = (nblock, fileout.name)
                fileout.close()
                filein.close()
            output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            source_tokeniser.close()
            target_tokeniser.close()
            break
def classify(args):
    global nline
    batch_size = 10000
    buf_sent = []
    buf_feat = []
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    for i in args.input:
        nline += 1
        parts = i.split("\t")
        sl_sentence = None
        tl_sentence = None
        if len(parts) >= max(args.scol, args.tcol):
            sl_sentence = parts[args.scol - 1]
            tl_sentence = parts[args.tcol - 1]
        else:
            logging.error("ERROR: scol ({}) or tcol ({}) indexes above column number ({}) on line {}".format(args.scol, args.tcol, len(parts), nline))
        if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and (args.disable_hardrules or wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args) == False):
            lmScore = None
            if args.lm_filter:
                lmScore = args.lm_filter.score(sl_sentence, tl_sentence)
            if lmScore is not None and lmScore < args.lm_threshold and not args.keep_lm_result:
                buf_sent.append((0, i, lmScore))
            else:
                buf_sent.append((1, i, lmScore))
                features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                buf_feat.append([float(v) for v in features])
        else:
            lmScore = None
            if args.lm_filter:
                lmScore = 0
            buf_sent.append((0, i, lmScore))
        if (nline % batch_size) == 0:
            args.clf.set_params(n_jobs=1)
            predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
            p = iter(predictions)
            for k, l, lmScore in buf_sent:
                if k == 1:
                    if args.score_only:
                        args.output.write("{0:.3f}".format(next(p)[1]))
                    else:
                        args.output.write(l.strip())
                        args.output.write("\t")
                        args.output.write("{0:.3f}".format(next(p)[1]))
                        if lmScore is not None and args.keep_lm_result:
                            args.output.write("\t")
                            args.output.write("{0:.3f}".format(lmScore))
                    args.output.write("\n")
                else:
                    if args.score_only:
                        args.output.write("0")
                    else:
                        args.output.write(l.strip("\n"))
                        args.output.write("\t0")
                        if lmScore is not None and args.keep_lm_result:
                            args.output.write("\t0")
                    args.output.write("\n")
            buf_feat = []
            buf_sent = []
    if len(buf_sent) > 0:
        predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
        p = iter(predictions)
        for k, l, lmScore in buf_sent:
            if k == 1:
                if args.score_only:
                    args.output.write("{0:.3f}".format(next(p)[1]))
                else:
                    args.output.write(l.strip())
                    args.output.write("\t")
                    args.output.write("{0:.3f}".format(next(p)[1]))
                    if lmScore is not None and args.keep_lm_result:
                        args.output.write("\t")
                        args.output.write("{0:.3f}".format(lmScore))
                args.output.write("\n")
            else:
                if args.score_only:
                    args.output.write("0")
                else:
                    args.output.write(l.strip("\n"))
                    args.output.write("\t0")
                    if lmScore is not None and args.keep_lm_result:
                        args.output.write("\t0")
                args.output.write("\n")
def perform_training(args):
    global nline
    time_start = default_timer()
    logging.info("Starting process")

    # Read input to a named temporary file
    # We may need to read it multiple times and that would be problematic if it is sys.stdin
    input = NamedTemporaryFile(mode="w", delete=False)
    for line in args.input:
        input.write(line)
    input.close()

    stats = None
    with open(input.name) as input_f:
        args.input = input_f
        stats = train_fluency_filter(args)
        args.input.seek(0)

        # Shuffle and get length ratio
        total_size, length_ratio, good_sentences, wrong_sentences = shuffle(
            args.input,
            args.good_examples + args.good_test_examples,
            args.wrong_examples + args.wrong_test_examples,
            args.wrong_examples_file)
    os.remove(input.name)

    args.length_ratio = length_ratio

    # Load dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)

    features_file = NamedTemporaryFile(delete=False)
    if args.source_tokeniser_path:
        tokl = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        tokl = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        tokr = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        tokr = MosesTokenizer(args.target_lang)
    with open(good_sentences.name, 'r') as gsf, \
         open(wrong_sentences.name, 'r') as wsf, \
         open(features_file.name, 'w+') as fileout:
        for i in gsf:
            srcsen, trgsen = i.split("\t")[:2]
            # print(str(i) + " ---" + str(srcsen) + " --- " + str(trgsen))
            features = feature_extract(srcsen, trgsen, tokl, tokr, args)
            for j in features:
                fileout.write("{}".format(j))
                fileout.write("\t")
            fileout.write("{}".format(1))
            fileout.write("\n")
        fileout.flush()
        for i in wsf:
            srcsen, trgsen = i.split("\t")[:2]
            # print(str(i) + " ---" + str(srcsen) + " --- " + str(trgsen))
            features = feature_extract(srcsen, trgsen, tokl, tokr, args)
            for j in features:
                fileout.write("{}".format(j))
                fileout.write("\t")
            fileout.write("{}".format(0))
            fileout.write("\n")
        fileout.flush()
    tokl.close()
    tokr.close()
    features_file.seek(0)

    if args.dump_features:
        logging.info("Dumping features to " + os.path.abspath(args.dump_features.name))
        for i in features_file:
            args.dump_features.write(i)
        args.dump_features.close()
        features_file.seek(0)

    logging.info("Start training")
    features_file.close()

    hgood = []
    hwrong = []
    with TemporaryFile("w+") as features_train, \
         TemporaryFile("w+") as features_test, \
         open(features_file.name, 'r') as ff:
        nline = 0
        for line in ff:
            # print(line)
            if nline < args.good_examples:
                features_train.write(line)
            elif nline < args.good_examples + args.good_test_examples:
                features_test.write(line)
            elif nline < args.good_examples + args.good_test_examples + args.wrong_examples:
                features_train.write(line)
            else:
                features_test.write(line)
            nline += 1
        features_train.flush()
        features_test.flush()
        features_train.seek(0)
        features_test.seek(0)
        hgood, hwrong = train_classifier(features_train, features_test, args.classifier_type, args.classifier)
        features_train.close()
        features_test.close()
    logging.info("End training")

    write_metadata(args, length_ratio, hgood, hwrong, stats)
    args.metadata.close()

    # Stats
    logging.info("Finished")
    elapsed_time = default_timer() - time_start
    logging.info("Elapsed time {:.2f} s".format(elapsed_time))
def classifier_process(i, jobs_queue, output_queue, args):
    source_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
    target_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
    if not args.disable_lm_filter:
        lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command)
    else:
        lm_filter = None
    if not args.disable_porn_removal:
        porn_removal = args.porn_removal
        if args.metadata_yaml['porn_removal_side'] == 'tl':
            porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
        else:
            porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
    else:
        porn_removal = None
        porn_tokenizer = None
    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                feats = []
                lm_scores = []
                # Create the following arrays:
                # valid_sentences: boolean, length of input. States whether each sentence passed
                #   hard rules and lm fluency filtering
                # feats: vector of tuples, input features to the classifier, length equals number
                #   of sentences in the input that passed hard rules + lm fluency filtering
                valid_sentences = []
                for i in filein:
                    parts = i.split("\t")
                    sl_sentence = None
                    tl_sentence = None
                    if len(parts) >= max(args.scol, args.tcol):
                        sl_sentence = parts[args.scol - 1]
                        tl_sentence = parts[args.tcol - 1]
                    else:
                        logging.error("ERROR: scol ({}) or tcol ({}) indexes above column number ({})".format(args.scol, args.tcol, len(parts)))
                    # When disable_hardrules is set, the `or` short-circuits and wrong_tu() is never called
                    if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and (args.disable_hardrules or wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args, lm_filter, porn_removal, porn_tokenizer) == False):
                        features = feature_extract(sl_sentence, tl_sentence, source_tokenizer, target_tokenizer, args)
                        feats.append([float(v) for v in features])
                        valid_sentences.append(True)
                    else:
                        valid_sentences.append(False)
                predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                filein.seek(0)
                piter = iter(predictions)
                for i, valid_sentence in zip(filein, valid_sentences):
                    if valid_sentence:
                        p = next(piter)
                        if args.score_only:
                            fileout.write("{0:.3f}".format(p[1]))
                        else:
                            fileout.write(i.strip())
                            fileout.write("\t{0:.3f}".format(p[1]))
                        fileout.write("\n")
                    else:
                        if args.score_only:
                            fileout.write("0")
                        else:
                            fileout.write(i.strip("\n"))
                            fileout.write("\t0")
                        fileout.write("\n")
                ojob = (nblock, fileout.name)
                filein.close()
                fileout.close()
            if ojob:
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
def classify(args):
    global nline
    batch_size = 10000
    buf_sent = []
    buf_feat = []
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    for i in args.input:
        nline += 1
        parts = i.split("\t")
        sl_sentence = None
        tl_sentence = None
        if len(parts) >= 4:
            sl_sentence = parts[2]
            tl_sentence = parts[3]
        if len(parts) == 2:
            sl_sentence = parts[0]
            tl_sentence = parts[1]
        if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args) == False:
            lmScore = None
            if args.lm_filter:
                lmScore = args.lm_filter.score(sl_sentence, tl_sentence)
            if lmScore is not None and lmScore < args.lm_threshold and not args.keep_lm_result:
                buf_sent.append((0, i, lmScore))
            else:
                buf_sent.append((1, i, lmScore))
                features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                buf_feat.append([float(v) for v in features])
        else:
            lmScore = None
            if args.lm_filter:
                lmScore = 0
            buf_sent.append((0, i, lmScore))
        if (nline % batch_size) == 0:
            args.clf.set_params(n_jobs=1)
            predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
            p = iter(predictions)
            for k, l, lmScore in buf_sent:
                if k == 1:
                    if args.score_only:
                        args.output.write(str(next(p)[1]))
                    else:
                        args.output.write(l.strip())
                        args.output.write("\t")
                        args.output.write(str(next(p)[1]))
                        if lmScore is not None and args.keep_lm_result:
                            args.output.write("\t")
                            args.output.write(str(lmScore))
                    args.output.write("\n")
                else:
                    if args.score_only:
                        args.output.write("0")
                    else:
                        args.output.write(l.strip("\n"))
                        args.output.write("\t0")
                        if lmScore is not None and args.keep_lm_result:
                            args.output.write("\t0")
                    args.output.write("\n")
            buf_feat = []
            buf_sent = []
    if len(buf_sent) > 0:
        predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
        p = iter(predictions)
        for k, l, lmScore in buf_sent:
            if k == 1:
                if args.score_only:
                    args.output.write(str(next(p)[1]))
                else:
                    args.output.write(l.strip())
                    args.output.write("\t")
                    args.output.write(str(next(p)[1]))
                    if lmScore is not None and args.keep_lm_result:
                        args.output.write("\t")
                        args.output.write(str(lmScore))
                args.output.write("\n")
            else:
                if args.score_only:
                    args.output.write("0")
                else:
                    args.output.write(l.strip("\n"))
                    args.output.write("\t0")
                    if lmScore is not None and args.keep_lm_result:
                        args.output.write("\t0")
                args.output.write("\n")
def classify(args):
    global nline
    batch_size = 10000
    buf_sent = []
    buf_feat = []
    source_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
    target_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
    if not args.disable_lm_filter:
        lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command)
    else:
        lm_filter = None
    if not args.disable_porn_removal:
        porn_removal = args.porn_removal
        if args.metadata_yaml['porn_removal_side'] == 'tl':
            porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
        else:
            porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
    else:
        porn_removal = None
        porn_tokenizer = None
    for i in args.input:
        nline += 1
        parts = i.split("\t")
        sl_sentence = None
        tl_sentence = None
        if len(parts) >= max(args.scol, args.tcol):
            sl_sentence = parts[args.scol - 1]
            tl_sentence = parts[args.tcol - 1]
        else:
            logging.error("ERROR: scol ({}) or tcol ({}) indexes above column number ({}) on line {}".format(args.scol, args.tcol, len(parts), nline))
        if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and (args.disable_hardrules or wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args, lm_filter, porn_removal, porn_tokenizer) == False):
            buf_sent.append((1, i))
            features = feature_extract(sl_sentence, tl_sentence, source_tokenizer, target_tokenizer, args)
            buf_feat.append([float(v) for v in features])
        else:
            buf_sent.append((0, i))
        if (nline % batch_size) == 0:
            args.clf.set_params(n_jobs=1)
            predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
            p = iter(predictions)
            for k, l in buf_sent:
                if k == 1:
                    if args.score_only:
                        args.output.write("{0:.3f}".format(next(p)[1]))
                    else:
                        args.output.write(l.strip())
                        args.output.write("\t{0:.3f}".format(next(p)[1]))
                    args.output.write("\n")
                else:
                    if args.score_only:
                        args.output.write("0")
                    else:
                        args.output.write(l.strip("\n"))
                        args.output.write("\t0")
                    args.output.write("\n")
            buf_feat = []
            buf_sent = []
    if len(buf_sent) > 0:
        predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
        p = iter(predictions)
        for k, l in buf_sent:
            if k == 1:
                if args.score_only:
                    args.output.write("{0:.3f}".format(next(p)[1]))
                else:
                    args.output.write(l.strip())
                    args.output.write("\t")
                    args.output.write("{0:.3f}".format(next(p)[1]))
                args.output.write("\n")
            else:
                if args.score_only:
                    args.output.write("0")
                else:
                    args.output.write(l.strip("\n"))
                    args.output.write("\t0")
                args.output.write("\n")
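# classify() above assumes `args` bundles a fitted sklearn-style classifier
# (`args.clf`, providing predict_proba and set_params), open text streams, and
# 1-based column indexes. The Namespace below is a minimal, hypothetical
# configuration sketched under those assumptions; field values are examples
# only and the Tokenizer is assumed to accept None as the command.
import sys
from argparse import Namespace

def make_classify_args(clf, metadata_yaml):
    return Namespace(
        clf=clf,                             # fitted classifier with predict_proba
        input=sys.stdin, output=sys.stdout,  # TSV in, TSV + score column out
        scol=1, tcol=2,                      # source/target sentence columns (1-based)
        score_only=False,
        disable_hardrules=True,              # skip wrong_tu() checks in this sketch
        disable_lm_filter=True,              # lm_filter stays None
        disable_porn_removal=True,           # porn_removal/porn_tokenizer stay None
        source_lang="en", target_lang="fr",
        source_tokenizer_command=None,
        target_tokenizer_command=None,
        metadata_yaml=metadata_yaml,
        porn_removal=None)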
def classifier_process(i, jobs_queue, output_queue, args):
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)

    # Load LM for fluency scoring
    lm_filter = None
    if args.source_lm and args.target_lm:
        lm_filter = DualLMFluencyFilter(args.lm_type, args.source_lang, args.target_lang)
        lm_filter.load(args.source_lm, args.target_lm, args.lm_filter_stats)

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                feats = []
                lm_scores = []
                # Create the following arrays:
                # valid_sentences: boolean, length of input. States whether each sentence passed
                #   hard rules and lm fluency filtering
                # feats: vector of tuples, input features to the classifier, length equals number
                #   of sentences in the input that passed hard rules + lm fluency filtering
                valid_sentences = []
                for i in filein:
                    parts = i.split("\t")
                    sl_sentence = None
                    tl_sentence = None
                    if len(parts) >= 4:
                        sl_sentence = parts[2]
                        tl_sentence = parts[3]
                    if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args) == False:
                        lm_score = None
                        if lm_filter:
                            lm_score = lm_filter.score(sl_sentence, tl_sentence)
                        if lm_filter and lm_score < args.lm_threshold and not args.keep_lm_result:
                            valid_sentences.append(False)
                        else:
                            features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                            feats.append([float(v) for v in features])
                            lm_scores.append(lm_score)
                            valid_sentences.append(True)
                    else:
                        valid_sentences.append(False)
                predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                filein.seek(0)
                piter = iter(predictions)
                if lm_filter:
                    lmiter = iter(lm_scores)
                for i, valid_sentence in zip(filein, valid_sentences):
                    if valid_sentence:
                        p = next(piter)
                        fileout.write(i.strip())
                        fileout.write("\t")
                        fileout.write(str(p[1]))
                        if lm_filter and args.keep_lm_result:
                            lm_score = next(lmiter)
                            fileout.write("\t")
                            fileout.write(str(lm_score))
                        fileout.write("\n")
                    else:
                        fileout.write(i.strip("\n"))
                        fileout.write("\t0")
                        if lm_filter and args.keep_lm_result:
                            fileout.write("\t0")
                        fileout.write("\n")
                ojob = (nblock, fileout.name)
                filein.close()
                fileout.close()
            if ojob:
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
import sys
import random

from features import feature_extract

if __name__ == '__main__':
    tot = len(sys.argv[1:])
    nrows = 750
    song_list = sys.argv[1:]
    random.shuffle(song_list)
    feature_extract(song_list[0:nrows])
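# Hypothetical invocation, assuming this features module's feature_extract
# accepts a list of audio file paths (the script name is illustrative): the
# songs are passed on the command line, shuffled, and at most `nrows` (750)
# of them are processed.
#
#   python extract_features.py path/to/songs/*.mp3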