import os


def cluster_all_tables(data_path):
    # Cluster each table directory under data_path; `prep`, `kmeans`, `get_centers`,
    # `load_means` and `classify_data_kmeans` are assumed to be imported/defined
    # elsewhere in this module.
    for d in os.listdir(data_path):
        if not os.path.isdir(data_path + '/' + d):
            continue
        if d != 'lineitem':
            continue
        print('processing %s' % d)
        full_path = data_path.rstrip('/') + '/' + d.rstrip('/') + '/'
        sample_ratio = int(open(full_path + '.ratio').read())
        data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)
        k = int(open(full_path + '.k').read())
        if k > 1:
            feat_cols = prep.get_feature_columns(full_path + '.columns')
            table = prep.load_file(data_file, feat_cols)
            seeds = load_means(full_path + '/.means')
            # output_weka(table, 'weka.arff')
            # return
            feat_doms = prep.read_domains(feat_cols, full_path + '.domains')
            header = prep.get_header(full_path + '.header')
            print('start clustering %s' % data_file)
            # model = clustering(k, feat_cols, feat_doms, header, table, seeds, data_file + '.res')
            labels = kmeans(k, table)
            centers = get_centers(table, labels)
            classify_data_kmeans(k, feat_cols, full_path, centers)
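# --- Hedged sketch (not part of the original source) ---
# `kmeans` and `get_centers` above are assumed project helpers; a minimal
# scikit-learn-based version of what they might look like is sketched below.
# The actual project may implement them differently.
import numpy as np
from sklearn.cluster import KMeans


def kmeans_sketch(k, table):
    # Fit k-means on the sampled table and return one cluster label per row.
    model = KMeans(n_clusters=k, n_init=10, random_state=0)
    return model.fit_predict(np.asarray(table, dtype=float))


def get_centers_sketch(table, labels):
    # Recompute cluster centers as the per-cluster column means.
    table = np.asarray(table, dtype=float)
    return [table[labels == c].mean(axis=0) for c in np.unique(labels)]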
def work(max_count):
    import logging
    from preprocess import load_file, seq2str, str2seq
    from driver_amount import addh
    import config

    logging.info("Loading origin data...")
    char_seqs, tag_seqs = load_file(addh + config.DATA_PATH)

    equal_generator = EqualGenerator()

    def save(filepath, obj, count):
        import json
        import os
        dirname = os.path.dirname(filepath)
        count_path = os.path.join(dirname, "count.txt")
        with open(filepath, "w") as fd:
            json.dump(obj, fd)
        with open(count_path, "w") as fd:
            fd.write(str(count))

    from tqdm import tqdm

    logging.info("Start generating equal data.")
    equal_seqs = []
    start = 0
    count = 0
    for char_seq in tqdm(char_seqs, total=max_count):
        if count < start:
            count += 1
            continue
        origin_str = seq2str(char_seq)
        equal_str = equal_generator.generate(origin_str)
        equal_seq = str2seq(equal_str)
        equal_seqs.append(equal_seq)
        count += 1
        if count % 50 == 0:
            logging.info("Save " + str(int(count / 1000)) + "th")
            save(addh + config.EQUAL_DATA_PATH, equal_seqs, count)
            logging.info("Save Done.")
        if count >= max_count:
            break

    logging.info("Save the rest")
    save(addh + config.EQUAL_DATA_PATH, equal_seqs, count)
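# --- Hedged usage sketch (not part of the original source) ---
# `work` reports progress through the standard `logging` module, so a caller
# would normally configure logging first. `EqualGenerator`, `addh` and the
# config paths come from the project's own modules.
if __name__ == "__main__":
    import logging
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")
    work(max_count=1000)  # generate and periodically save 1000 "equal" sequences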
import sys


def main():
    # `prep` and `calc_divergence` are assumed to be imported/defined elsewhere
    # in this module.
    data = prep.load_file(sys.argv[1])
    # calc_all_marginals(data)
    # cols = [16, 25, 31, 50, 66]
    cols = [11, 24, 28, 37, 33, 51, 45, 49, 53, 32, 86, 103, 101, 104, 114, 118, 135]
    # cols = [0, 2]
    # calculate_histogram(data, cols)
    # calc_histogram(data, cols)
    # calculate_divergence(data, [3, 7])
    # calculate_divergence(data, range(data.shape[1]))

    # Score every unordered column pair and print the pairs sorted by divergence.
    buf = []
    for i in cols:
        for j in cols:
            if j > i:
                buf.append((calc_divergence(data[:, [i, j]]), i, j))
    for x in sorted(buf):
        print(x)
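# --- Hedged sketch (not part of the original source) ---
# `calc_divergence` is assumed above to score how far a column pair is from
# independence. One common realisation is the KL divergence between the joint
# 2-D histogram and the product of its marginals (i.e. mutual information);
# the project's actual definition may differ.
import numpy as np


def calc_divergence_sketch(pair, bins=20):
    # pair: (n, 2) array holding the two selected columns.
    joint, _, _ = np.histogram2d(pair[:, 0], pair[:, 1], bins=bins)
    joint = joint / joint.sum()
    px = joint.sum(axis=1, keepdims=True)
    py = joint.sum(axis=0, keepdims=True)
    indep = px * py
    mask = joint > 0
    return float(np.sum(joint[mask] * np.log(joint[mask] / indep[mask])))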
def load_testdata(self, test_datapath, test_vocab):
    """
    Loads a text file representing sequences. This is called in `translate.py`.

    params:
        test_datapath: path to a plain-text file of test sequences.
        test_vocab: path to the pickled PyTorch training dataset, which also
            stores the vocabulary and preprocessing settings.
    """
    # load vocabulary
    data = torch.load(test_vocab)
    settings = data['settings']
    # print(settings)

    # load test sequences
    token_instances = load_file(test_datapath, settings.max_word_seq_len,
                                settings.format, settings.case_sensitive)

    is_bpe = settings.format.lower() == "bpe"
    SOS, EOS = constants.SOS, constants.EOS
    decoder = None

    if self.opt.override_max_token_seq_len:
        if self.opt.override_max_token_seq_len > 0:
            settings.max_token_seq_len = self.opt.override_max_token_seq_len
            if self.model == "transformer":
                self.change_max_seq_len(self.opt.override_max_token_seq_len + 5)

    print("settings.max_token_seq_len", settings.max_token_seq_len)

    if is_bpe:
        # load test data
        # TODO: fix preprocessing method for BPE when loading test data.
        # TODO: this is a quick fix; we'll need to clean up the code debt later.
        bpe_src = BPE.from_dict(data['dict']['src'])
        decoder = BPE.from_dict(data['dict']['tgt'])
        # convert test sequences into IDx
        test_src_insts = bpe_src.transform(token_instances)
        test_src_insts = [i for i in test_src_insts]
        # some of the encoded sequences may be too long, so reclip them to fit.
        for i in tqdm(range(len(token_instances)), desc="Reclipping Test Sequences"):
            raw = token_instances[i]
            encoded = test_src_insts[i]
            test_src_insts[i] = reclip(raw, encoded, bpe_src,
                                       settings.max_token_seq_len - 2)
            test_src_insts[i] = [SOS] + test_src_insts[i] + [EOS]
        # setup data loader
        src_word2idx = data['dict']['src']
        tgt_word2idx = data['dict']['tgt']
        src_byte_pairs = {x + "_": y for x, y in src_word2idx['byte_pairs'].items()}
        tgt_byte_pairs = {x + "_": y for x, y in tgt_word2idx['byte_pairs'].items()}
        src_word2idx = {**src_byte_pairs, **src_word2idx['words']}
        tgt_word2idx = {**tgt_byte_pairs, **tgt_word2idx['words']}
        test_loader = torch.utils.data.DataLoader(
            TranslationDataset(src_word2idx=src_word2idx,
                               tgt_word2idx=tgt_word2idx,
                               src_insts=test_src_insts),
            num_workers=0,
            batch_size=self.opt.batch_size,
            collate_fn=collate_fn)
    else:
        # convert test sequences into IDx
        test_src_insts = seq2idx(token_instances, data['dict']['src'])
        # trim sequence lengths
        test_src_insts = [seq[:settings.max_word_seq_len] for seq in test_src_insts]
        # add SOS and EOS
        test_src_insts = [[SOS] + x + [EOS] for x in test_src_insts]
        # setup data loaders.
        test_loader = torch.utils.data.DataLoader(
            TranslationDataset(src_word2idx=data['dict']['src'],
                               tgt_word2idx=data['dict']['tgt'],
                               src_insts=test_src_insts),
            num_workers=0,
            batch_size=self.opt.batch_size,
            collate_fn=collate_fn)
        decoder = data['dict']['tgt']

    return test_loader, settings.max_token_seq_len, is_bpe, decoder
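# --- Hedged usage sketch (not part of the original source) ---
# Rough shape of how translate.py is expected to consume the return values;
# `translator`, the file names and the decode step are illustrative only:
# test_loader, max_len, is_bpe, decoder = translator.load_testdata(
#     "test.txt", "dataset.train.pt")
# for batch in test_loader:
#     ...  # run the trained model on the batch, then map ids back to text with
#          # `decoder` (a BPE codec for BPE data, else the tgt word2idx dict)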
    # ... remainder of precision_recall(): compute scores from the matched and
    # unmatched name positions built earlier in the function.
    count_wrong = df_wrong.count()[0]
    count_predict_correct = len(predict_name_position)
    count_all_correct = len(name_position)
    precision = float(count_predict_correct) / (count_predict_correct + count_wrong)
    recall = float(count_predict_correct) / count_all_correct
    return precision, recall, df_wrong, df_not_found


factor = 4.0
length = 500
debug = False

train_df, train_list = load_file('I')
test_df, test_list = load_file('J')
train_df = sample_balance(train_df, factor)
X_train, Y_train, bag = feat(train_df, None, length)

# train
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
clf.fit(X_train, Y_train)

# test
X_test, Y_test, bag = feat(test_df, bag, length)
Y_predict = clf.predict(X_test)
Y_predict = post_process(test_df, Y_predict)
precision, recall, falsePositive, falseNegative = precision_recall(
def main():
    opt = load_args()
    bpe_enabled = True
    torch.manual_seed(opt.seed)

    # setup the max token sequence length to include <s> and </s>
    opt.max_token_seq_len = opt.max_word_seq_len

    # restructure code for readability
    dataset = {
        'train': {
            'src': opt.train_src,
            'tgt': []
        },
        'valid': {
            'src': opt.valid_src,
            'tgt': []
        }
    }
    label0, label1 = [torch.LongTensor([0])], [torch.LongTensor([1])]

    raw = copy(dataset)
    # load dataset
    for g in dataset:
        src = load_file(dataset[g]['src'], None, False)
        # split src and tgt
        src = [x.split() for x in src]
        tgt = [x[0] for x in src]
        src = [" ".join(x[1:]) for x in src]
        # convert tgt tokens.
        tgt = [label0 if i == opt.label0 else label1 for i in tgt]
        raw[g]['src'] = src
        dataset[g]['tgt'] = tgt

    if opt.src_vocab:
        # load bpe vocabulary
        print("[Info] Loading BPE vocabulary from", opt.src_vocab)
        src_bpe = bpe_encoder.from_dict(torch.load(opt.src_vocab)['dict']['tgt'])
    else:
        # building bpe vocabulary
        print("[Info] Building BPE vocabulary.")
        # build and train encoder
        src_bpe = bpe_encoder(vocab_size=opt.vocab_size,
                              pct_bpe=opt.pct_bpe,
                              ngram_min=1,
                              UNK=Constants.UNK_WORD,
                              PAD=Constants.PAD_WORD,
                              word_tokenizer=bpe_parse)
        src_bpe.fit(raw['train']['src'])

    # convert sequences
    for g in tqdm(dataset, desc="Converting tokens into IDs"):
        src_bpe.unmute()
        dataset[g]['src'] = [f for f in src_bpe.transform(tqdm(raw[g]['src']))]

    for g in tqdm(dataset, desc="Trimming Sequences"):
        sequences = dataset[g]['src']
        # it's much easier to just refer back to the original sentence and
        # trim tokens from there.
        for i in range(len(sequences)):
            ref_seq = raw[g]['src'][i]
            bpe_seq = sequences[i]
            dataset[g]['src'][i] = reclip(ref_seq, bpe_seq, src_bpe,
                                          opt.max_word_seq_len - 2)

    # add <s>, </s>
    # (At this stage all of the sequences are tokenised, so the ID values of
    # SOS and EOS are inserted instead of the raw tokens.)
    SOS, EOS = Constants.SOS, Constants.EOS
    for g in tqdm(dataset, desc="Adding SOS, EOS tokens"):
        dataset[g]['src'] = [[SOS] + x + [EOS] for x in dataset[g]['src']]

    # shuffle dataset, then sort by sequence size
    for g in tqdm(dataset, desc="Shuffling and Sorting"):
        src, tgt = dataset[g]['src'], dataset[g]['tgt']
        sizes = [len(x) for x in src]
        if opt.shuffle == 1:
            perm = torch.randperm(len(src))
            src = [src[idx] for idx in perm]
            tgt = [tgt[idx] for idx in perm]
            sizes = [sizes[idx] for idx in perm]
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        # pad each sequence up to max_token_seq_len with PAD tokens
        src_sizes = [sizes[idx] for idx in perm]
        blanks = [[Constants.PAD for _ in range(opt.max_token_seq_len - src_sizes[i])]
                  for i in range(len(src))]
        src = [src[i] + blanks[i] for i in range(len(src))]
        dataset[g]['src'] = src
        dataset[g]['tgt'] = tgt
        k = np.sum([len(x) for x in dataset[g]['src']]) / len(dataset[g]['src'])
        print("len:", k)

    # setup data to save.
    data = {
        'settings': opt,
        'dicts': {
            'src': src_bpe.vocabs_to_dict(False)
        },
        'train': {
            'src': dataset['train']['src'],
            'tgt': dataset['train']['tgt']
        },
        'valid': {
            'src': dataset['valid']['src'],
            'tgt': dataset['valid']['tgt']
        }
    }

    # dump information.
    filename = opt.save_data + ".train.pt"
    print('[Info] Dumping the processed data to pickle file', filename)
    torch.save(data, filename)
    print('[Info] Done.')
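# --- Hedged usage sketch (not part of the original source) ---
# The pickle written above can be read back with torch.load; the keys mirror
# the `data` dict assembled in main(). The file name is illustrative only.
# data = torch.load("classifier.train.pt")
# opt       = data['settings']       # the options used at preprocessing time
# src_vocab = data['dicts']['src']   # serialised BPE vocabulary
# train_src = data['train']['src']   # padded ID sequences (with SOS/EOS)
# train_tgt = data['train']['tgt']   # [LongTensor([0])] / [LongTensor([1])] labels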
def is_not_consecutive(token):
    # Return False if any word in the token contains more than one
    # uppercase letter.
    words = token.split()
    for word in words:
        count = 0
        for letter in word:
            if letter.isupper():
                count += 1
        if count > 1:
            return False
    return True


debug = False
train_df_all, train_list = load_file('I')
random.seed(100)
random.shuffle(train_list)

for length in [500]:
    for factor in [4]:
        precision_mean = 0
        recall_mean = 0
        # five cross-validation folds of 40 files each
        CV_list = [
            train_list[:40], train_list[40:80], train_list[80:120],
            train_list[120:160], train_list[160:200]
        ]
        for i in range(5):
            train_index = train_df_all['file'].apply(
                lambda a: a not in CV_list[i])
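# --- Hedged usage note (not part of the original source) ---
# is_not_consecutive() returns False as soon as any word in the token contains
# more than one uppercase letter:
# >>> is_not_consecutive("John Smith")    # one capital per word
# True
# >>> is_not_consecutive("McDonald ate")  # "McDonald" has two capitals
# False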
    # ... remainder of analyze(): compare predicted tags against the gold tags
    # and report the character-level error rate.
    char_count = 0
    wrong_count = 0
    for char_seq, tag_seq, output_tag_id_seq in zip(char_seqs, tag_seqs,
                                                    output_tag_id_seqs):
        if verbose >= 1:
            print("-" * 15)
            print("Sentence " + str(sent_count))
        output_seq = output_tag_id_seq[1:-1]  # Remove [CLS] and [SEP]'s id
        for c, expect_tag, output_tag_id_one_hot in zip(char_seq, tag_seq,
                                                        output_seq):
            output_tag_id = np.argmax(output_tag_id_one_hot)
            output_tag = reversed_tag_vocab[output_tag_id]
            if verbose >= 2:
                print(c + "\t" + expect_tag + "\t" + output_tag)
            char_count += 1
            if expect_tag != output_tag:
                if verbose == 1:
                    print(c + "\t" + expect_tag + "\t" + output_tag)
                wrong_count += 1
        sent_count += 1

    print("-" * 15)
    print("All: " + str(char_count))
    print("Wrong: " + str(wrong_count))
    print("Wrong Rate: " + str(int(wrong_count / char_count * 100)) + "%")


if __name__ == "__main__":
    from preprocess import load_file, preprocess
    char_seqs, tag_seqs = load_file("test.txt")
    token_id_seqs, one_hot_tag_id_seqs, tag_vocab = preprocess(char_seqs, tag_seqs)
    analyze(char_seqs, tag_seqs, one_hot_tag_id_seqs, tag_vocab, 2)
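# --- Hedged note (not part of the original source) ---
# analyze() indexes `reversed_tag_vocab` with the argmax tag id, so the reverse
# vocabulary is presumably built earlier in the function as the inverse of the
# tag -> id mapping returned by preprocess(), e.g.:
# reversed_tag_vocab = {idx: tag for tag, idx in tag_vocab.items()}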