def __20ng_classification():
    """Run the split / SVM / score / save pipeline on the 20ng_bydate vectors.

    Every path is hard-coded below ``e:/data/emadr/20ng_bydate``.  The full
    vector file and the split-label file are handed to
    ``dataarange.split_vecs`` together with the train/test vector paths
    (split labels 0 and 2).  The predictions returned by
    ``doc_classification_svm`` are then scored against the test label file
    and written out with ``save_labels``.
    """
    root = 'e:/data/emadr/20ng_bydate'

    # Label files.
    split_path = os.path.join(root, 'bindata/dataset-split-labels.bin')
    train_labels_path = os.path.join(root, 'bindata/train-labels.bin')
    test_labels_path = os.path.join(root, 'bindata/test-labels.bin')

    # Vector files.
    # Alternative full-vector file: 'e:/data/emadr/20ng_bydate/vecs/dedw-vecs.bin'
    full_vecs_path = os.path.join(root, 'vecs/dew-vecs-0_8-50.bin')
    train_vecs_path = os.path.join(root, 'bindata/train-dedw-vecs.bin')
    test_vecs_path = os.path.join(root, 'bindata/test-dedw-vecs.bin')

    # Where the predicted labels end up.
    pred_path = os.path.join(root, 'bindata/ypred-emadr.bin')

    dataarange.split_vecs(
        full_vecs_path, split_path, train_vecs_path, test_vecs_path,
        train_label=0, test_label=2)

    # Alternative classifier call:
    #   doc_classification_lr(train_vecs_path, train_labels_path,
    #                         test_vecs_path, test_labels_path, 0, -1)
    # The trailing (0, -1) arguments are forwarded as-is; their meaning is
    # defined by doc_classification_svm, which is not shown here.
    predictions = doc_classification_svm(
        train_vecs_path, train_labels_path, test_vecs_path, 0, -1)
    get_scores_label_file(test_labels_path, predictions)
    save_labels(predictions, pred_path)
def __job_text_vecs_to_bin_classification():
    """Convert text-format RSM vectors to binary, then split, classify, score.

    Works on hard-coded paths below ``e:/data/emadr/nyt-less-docs/world``.
    ``text_vecs_to_bin`` is called first on the ``rsm-hidden-<minoc>.txt``
    file; the resulting ``.bin`` file is passed to ``dataarange.split_vecs``
    (split labels 0 and 2), and the predictions returned by
    ``doc_classification_svm`` are scored against the test label file.
    """
    root = 'e:/data/emadr/nyt-less-docs/world'
    minoc = 100  # only used to build the two RSM file names below

    # Alternative source/target pair:
    #   os.path.join(root, 'drbm-vecs-100-30.txt')
    #   os.path.join(root, 'drbm-vecs.bin')
    txt_vecs_path = os.path.join(root, 'rsm/rsm-hidden-%d.txt' % minoc)
    bin_vecs_path = os.path.join(root, 'rsm/rsm-vecs-%d.bin' % minoc)
    text_vecs_to_bin(txt_vecs_path, bin_vecs_path)

    # Alternatives previously used here:
    #   all vectors:  os.path.join(root, 'rsm/drbm-vecs.bin')
    #   split labels: os.path.join(root, 'bindata/data-split-labels.bin')
    split_path = os.path.join(root, 'bindata/dataset-split-labels.bin')
    train_vecs_path = os.path.join(root, 'rsm/train-rsm-vecs.bin')
    test_vecs_path = os.path.join(root, 'rsm/test-rsm-vecs.bin')
    dataarange.split_vecs(
        bin_vecs_path, split_path, train_vecs_path, test_vecs_path,
        train_label=0, test_label=2)

    train_labels_path = os.path.join(root, 'bindata/train-labels.bin')
    test_labels_path = os.path.join(root, 'bindata/test-labels.bin')

    # Alternative classifier call:
    #   doc_classification_lr(train_vecs_path, train_labels_path,
    #                         test_vecs_path, test_labels_path, 0, -1)
    predictions = doc_classification_svm(
        train_vecs_path, train_labels_path, test_vecs_path, 0, -1)
    get_scores_label_file(test_labels_path, predictions)
def __job_split_vecs():
    """Call ``dataarange.split_vecs`` for the nyt-less-docs/business vectors.

    Passes the full ``dew`` vector file, the split-label file and the
    train/test vector paths, using split labels 0 (train) and 2 (test).
    All paths are hard-coded.
    """
    root = 'e:/data/emadr/nyt-less-docs/business/'
    dataarange.split_vecs(
        os.path.join(root, 'vecs/dew-vecs-100-0_15-40.bin'),
        os.path.join(root, 'bindata/dataset-split-labels.bin'),
        os.path.join(root, 'vecs/train-dedw-vecs.bin'),
        os.path.join(root, 'vecs/test-dedw-vecs.bin'),
        train_label=0,
        test_label=2)
def __job_train_classification():
    """Grid-run doc-vector training plus SVM classification on 20ng_bydate.

    For every (min_count, def_alpha, ns) combination in the lists below:
    train vectors with ``train_doc_vectors``, save them, hand them to
    ``dataarange.split_vecs`` (split labels 0 and 2), classify with
    ``doc_classification_svm`` and append the four scores returned by
    ``get_scores_label_file`` to the results file.

    The results file is opened in a ``with`` block so that it is closed even
    when one of the training/classification steps raises.

    NOTE: the results file name is fixed to ``pvdm-results.txt`` regardless
    of ``method``; this is kept as in the original.
    """
    docs_file = 'e:/data/emadr/20ng_bydate/tokenizedlc/docs-tokenized-lc-2.txt'

    # method = 'pvdm'
    method = 'pvdbow'
    dm = 1 if method == 'pvdm' else 0

    dst_vecs_file = 'e:/data/emadr/20ng_bydate/bindata/%s-vecs.bin' % method
    dst_result_file = 'e:/data/emadr/20ng_bydate/pvdm-results.txt'

    split_labels_file_name = 'e:/data/emadr/20ng_bydate/bindata/dataset-split-labels.bin'
    train_vecs_file_name = 'e:/data/emadr/20ng_bydate/bindata/train-%s-vecs.bin' % method
    test_vecs_file_name = 'e:/data/emadr/20ng_bydate/bindata/test-%s-vecs.bin' % method

    data_dir = 'e:/data/emadr/20ng_bydate/bindata/'
    train_label_file = os.path.join(data_dir, 'train-labels.bin')
    test_label_file = os.path.join(data_dir, 'test-labels.bin')

    # Wider grids used previously:
    #   min_counts = [2, 5, 10, 20]
    #   def_alphas = [0.1, 0.01, 0.001]
    #   nss = [0, 5, 10, 15]
    min_counts = [5]
    def_alphas = [0.01]
    nss = [0]
    niters = 100

    # 'wb' is kept from the original so the bytes written are unchanged.
    with open(dst_result_file, 'wb') as fout:
        for min_count in min_counts:
            for def_alpha in def_alphas:
                for ns in nss:
                    model = train_doc_vectors(
                        docs_file, min_count=min_count, def_alpha=def_alpha,
                        ns=ns, dm=dm, niter=niters)
                    save_doc2vec_vectors(model, dst_vecs_file)

                    dataarange.split_vecs(
                        dst_vecs_file, split_labels_file_name,
                        train_vecs_file_name, test_vecs_file_name,
                        train_label=0, test_label=2)
                    y_pred_test = doc_classification_svm(
                        train_vecs_file_name, train_label_file,
                        test_vecs_file_name, 0, -1)
                    acc, prec, recall, f1 = get_scores_label_file(
                        test_label_file, y_pred_test)

                    # Single parenthesised argument: prints the same text
                    # whether print is a statement or a function.
                    print('%d\t%f\t%d\t%d' % (min_count, def_alpha, ns, niters))
                    fout.write('%d\t%f\t%d\n' % (min_count, def_alpha, ns))
                    fout.write('%f\t%f\t%f\t%f\n' % (acc, prec, recall, f1))
                    # Flush per configuration so partial results survive a
                    # failure in a later run.
                    fout.flush()
def __train_pv_20ng():
    """Train paragraph vectors for 20ng_bydate, save them, and split them.

    Calls ``train_doc_vectors`` on the tokenized docs file with fixed
    settings (min_count=5, def_alpha=0.01, ns=0, niter=40), writes the model
    with ``save_doc2vec_vectors`` and passes the result to
    ``dataarange.split_vecs`` using split labels 0 and 2.
    """
    method = 'pvdm'
    bindata_prefix = 'e:/data/emadr/20ng_bydate/bindata/'  # documentation only; paths below are spelled out in full

    corpus_path = 'e:/data/emadr/20ng_bydate/tokenizedlc/docs-tokenized-lc-2.txt'
    # Alternative target: 'e:/data/emadr/20ng_bydate/bindata/pvdbow-vecs.bin'
    vecs_path = 'e:/data/emadr/20ng_bydate/bindata/%s-vecs.bin' % method

    # dm is 1 only for the 'pvdm' method, otherwise 0.
    model = train_doc_vectors(
        corpus_path,
        min_count=5,
        def_alpha=0.01,
        ns=0,
        dm=int(method == 'pvdm'),
        niter=40)
    save_doc2vec_vectors(model, vecs_path)

    dataarange.split_vecs(
        vecs_path,
        'e:/data/emadr/20ng_bydate/bindata/dataset-split-labels.bin',
        'e:/data/emadr/20ng_bydate/bindata/train-%s-vecs.bin' % method,
        'e:/data/emadr/20ng_bydate/bindata/test-%s-vecs.bin' % method,
        train_label=0,
        test_label=2)
def __nyt_classification():
    """Split, SVM-classify and score the nyt-less-docs/world ``dew`` vectors.

    All paths are hard-coded below ``f:/data/emadr/nyt-less-docs/world/``.

    NOTE(review): this file defines ``__nyt_classification`` a second time
    further down; at import time the later definition rebinds the name, so
    this version is not reachable by name afterwards.
    """
    # Alternative root: 'e:/data/emadr/nyt-world-full/processed/'
    root = 'f:/data/emadr/nyt-less-docs/world/'

    full_vecs_path = os.path.join(root, 'vecs/dew-vecs-0_9-40.bin')
    train_vecs_path = os.path.join(root, 'vecs/train-dedw-vecs.bin')
    test_vecs_path = os.path.join(root, 'vecs/test-dedw-vecs.bin')

    split_path = os.path.join(root, 'bindata/dataset-split-labels.bin')
    train_labels_path = os.path.join(root, 'bindata/train-labels.bin')
    test_labels_path = os.path.join(root, 'bindata/test-labels.bin')

    dataarange.split_vecs(
        full_vecs_path, split_path, train_vecs_path, test_vecs_path,
        train_label=0, test_label=2)

    predictions = doc_classification_svm(
        train_vecs_path, train_labels_path, test_vecs_path, 0, -1)
    get_scores_label_file(test_labels_path, predictions)
def __train_pv_nyt():
    """Train paragraph vectors for nyt-less-docs/business, save and split.

    Calls ``train_doc_vectors`` on the tokenized docs file with fixed
    settings (min_count=5, def_alpha=0.01, ns=0, niter=40), writes the model
    with ``save_doc2vec_vectors`` and passes the result to
    ``dataarange.split_vecs`` using split labels 0 and 2.
    """
    method = 'pvdm'
    # Alternative root: 'e:/data/emadr/nyt-world-full/processed/'
    root = 'e:/data/emadr/nyt-less-docs/business/'

    corpus_path = os.path.join(root, 'tokenizedlc/docs-tokenized-lc-2.txt')
    vecs_path = os.path.join(root, 'bindata/%s-vecs.bin' % method)

    # dm is 1 only for the 'pvdm' method, otherwise 0.
    model = train_doc_vectors(
        corpus_path,
        min_count=5,
        def_alpha=0.01,
        ns=0,
        dm=int(method == 'pvdm'),
        niter=40)
    save_doc2vec_vectors(model, vecs_path)

    dataarange.split_vecs(
        vecs_path,
        os.path.join(root, 'bindata/dataset-split-labels.bin'),
        os.path.join(root, 'bindata/train-%s-vecs.bin' % method),
        os.path.join(root, 'bindata/test-%s-vecs.bin' % method),
        train_label=0,
        test_label=2)
def __nyt_classification():
    """Split the nyt-less-docs/arts RSM vectors and run the LR classifier.

    All paths are hard-coded below ``e:/data/emadr/nyt-less-docs/arts``.
    Unlike the SVM-based jobs in this file, ``doc_classification_lr`` is
    given the test label file directly and its return value is not used.

    NOTE(review): an earlier function in this file carries the same name;
    this later definition is the one bound to ``__nyt_classification`` once
    the module has been loaded.
    """
    # Alternative roots:
    #   'e:/data/emadr/20ng_bydate'
    #   'e:/data/emadr/nyt-all/world'
    root = 'e:/data/emadr/nyt-less-docs/arts'

    # Alternative full-vector file: os.path.join(root, 'rsm/drbm-vecs.bin')
    full_vecs_path = os.path.join(root, 'rsm/rsm-vecs-70.bin')
    train_vecs_path = os.path.join(root, 'rsm/train-rsm-vecs.bin')
    test_vecs_path = os.path.join(root, 'rsm/test-rsm-vecs.bin')

    # Alternative split file: os.path.join(root, 'bindata/data-split-labels.bin')
    split_path = os.path.join(root, 'bindata/dataset-split-labels.bin')
    train_labels_path = os.path.join(root, 'bindata/train-labels.bin')
    test_labels_path = os.path.join(root, 'bindata/test-labels.bin')

    dataarange.split_vecs(
        full_vecs_path, split_path, train_vecs_path, test_vecs_path,
        train_label=0, test_label=2)
    doc_classification_lr(
        train_vecs_path, train_labels_path,
        test_vecs_path, test_labels_path,
        0, -1)
def __classification():
    """Split, SVM-classify, score and save predictions for one PV method.

    File names are derived from ``method`` inside the hard-coded ``bindata``
    directory.  The ``<method>-vecs.bin`` file is passed to
    ``dataarange.split_vecs`` (split labels 0 and 2); the predictions
    returned by ``doc_classification_svm`` are scored against the test label
    file and written out with ``ioutils.save_labels``.
    """
    # Alternative directories:
    #   'e:/data/emadr/nyt-world-full/processed/'
    #   'e:/data/emadr/nyt-all/arts/'
    #   'e:/data/emadr/nyt-all/business/'
    #   'e:/data/emadr/nyt-less-docs/business/bindata/'
    root = 'e:/data/emadr/20ng_bydate/bindata/'

    # method = 'pvdm'
    method = 'pvdbow'

    # Method-specific files.
    full_vecs_path = os.path.join(root, '%s-vecs.bin' % method)
    train_vecs_path = os.path.join(root, 'train-%s-vecs.bin' % method)
    test_vecs_path = os.path.join(root, 'test-%s-vecs.bin' % method)
    pred_path = os.path.join(root, 'ypred-%s.bin' % method)

    # Files shared by all methods.
    split_path = os.path.join(root, 'dataset-split-labels.bin')
    train_labels_path = os.path.join(root, 'train-labels.bin')
    test_labels_path = os.path.join(root, 'test-labels.bin')

    dataarange.split_vecs(
        full_vecs_path, split_path, train_vecs_path, test_vecs_path,
        train_label=0, test_label=2)

    # Alternative classifier call:
    #   doc_classification_lr(train_vecs_path, train_labels_path,
    #                         test_vecs_path, test_labels_path, 0, -1)
    predictions = doc_classification_svm(
        train_vecs_path, train_labels_path, test_vecs_path, 0, -1)
    get_scores_label_file(test_labels_path, predictions)
    ioutils.save_labels(predictions, pred_path)