Example #1
def cluster_all_tables(data_path):
  # For each table directory under data_path (here only 'lineitem'), run k-means
  # on the sampled training file and classify the data with the resulting centers.
  for d in os.listdir(data_path):
    if not os.path.isdir(data_path + '/' + d):
      continue

    if d != 'lineitem':
      continue

    print('processing %s' % d)
    full_path = data_path.rstrip('/') + '/' + d.rstrip('/') + '/'
    sample_ratio = int(open(full_path + '.ratio').read())
    data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)

    k = int(open(full_path + '.k').read())
    if k > 1:
      feat_cols = prep.get_feature_columns(full_path + '.columns')
      table = prep.load_file(data_file, feat_cols)
      seeds = load_means(full_path + '/.means')
      # output_weka(table, 'weka.arff')
      # return
      feat_doms = prep.read_domains(feat_cols, full_path + '.domains')
      header = prep.get_header(full_path + '.header')

      print('start clustering %s' % data_file)
      # model = clustering(k, feat_cols, feat_doms, header, table, seeds, data_file + '.res')

      labels = kmeans(k, table)
      centers = get_centers(table, labels)
      classify_data_kmeans(k, feat_cols, full_path, centers)
Example #2
def cluster_all_tables(data_path):
    for d in os.listdir(data_path):
        if not os.path.isdir(data_path + '/' + d):
            continue

        if d != 'lineitem':
            continue

        print('processing %s' % d)
        full_path = data_path.rstrip('/') + '/' + d.rstrip('/') + '/'
        sample_ratio = int(open(full_path + '.ratio').read())
        data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)

        k = int(open(full_path + '.k').read())
        if k > 1:
            feat_cols = prep.get_feature_columns(full_path + '.columns')
            table = prep.load_file(data_file, feat_cols)
            seeds = load_means(full_path + '/.means')
            # output_weka(table, 'weka.arff')
            # return
            feat_doms = prep.read_domains(feat_cols, full_path + '.domains')
            header = prep.get_header(full_path + '.header')

            print('start clustering %s' % data_file)
            # model = clustering(k, feat_cols, feat_doms, header, table, seeds, data_file + '.res')

            labels = kmeans(k, table)
            centers = get_centers(table, labels)
            classify_data_kmeans(k, feat_cols, full_path, centers)
Example #3
def work(max_count):
    import logging

    from preprocess import load_file, seq2str, str2seq
    from driver_amount import addh
    import config
    logging.info("Loading origin data...")
    char_seqs, tag_seqs = load_file(addh + config.DATA_PATH)
    equal_generator = EqualGenerator()

    def save(filepath, obj, count):
        import json
        import os
        dirname = os.path.dirname(filepath)
        count_path = os.path.join(dirname, "count.txt")
        with open(filepath, "w") as fd:
            json.dump(obj, fd)
        with open(count_path, "w") as fd:
            fd.write(str(count))

    from tqdm import tqdm
    logging.info("Start generating equal data.")
    equal_seqs = []
    start = 0
    count = 0
    for char_seq in tqdm(char_seqs, total=max_count):
        if count < start:
            count += 1
            continue
        origin_str = seq2str(char_seq)
        equal_str = equal_generator.generate(origin_str)
        equal_seq = str2seq(equal_str)
        equal_seqs.append(equal_seq)

        count += 1
        if count % 50 == 0:
            logging.info("Save " + str(int(count/1000)) + "th")
            save(
                addh + config.EQUAL_DATA_PATH,
                equal_seqs,
                count
            )
            logging.info("Save Done.")

        if count >= max_count:
            break

    logging.info("Save the rest")
    save(
        addh + config.EQUAL_DATA_PATH,
        equal_seqs,
        count
    )
Example #4
def main():
  # Rank every pair of the selected columns by its pairwise divergence.
  data = prep.load_file(sys.argv[1])
  # calc_all_marginals(data)

  # cols = [16,25,31,50,66]
  cols = [11, 24, 28, 37, 33, 51, 45, 49, 53, 32, 86, 103, 101, 104, 114, 118, 135]
  # cols = [0,2]
  # calculate_histogram(data, cols)
  # calc_histogram(data, cols)

  # calculate_divergence(data, [3,7])
  # calculate_divergence(data, range(data.shape[1]))
  buf = []
  for i in cols:
    for j in cols:
      if j > i:
        buf.append((calc_divergence(data[:, [i, j]]), i, j))

  for x in sorted(buf):
    print(x)
Example #5
def main():
    data = prep.load_file(sys.argv[1])
    #calc_all_marginals(data)

    #  cols = [16,25,31,50,66]
    cols = [
        11, 24, 28, 37, 33, 51, 45, 49, 53, 32, 86, 103, 101, 104, 114, 118,
        135
    ]
    #  cols = [0,2]
    #  calculate_histogram(data, cols)
    #  calc_histogram(data, cols)

    #  calculate_divergence(data, [3,7])
    #  calculate_divergence(data, range(data.shape[1]))
    buf = []
    for i in cols:
        for j in cols:
            if j > i:
                buf.append((calc_divergence(data[:, [i, j]]), i, j))

    for x in sorted(buf):
        print(x)
Example #6
    def load_testdata(self, test_datapath, test_vocab):
        """
        Loads a text file representing sequences. This is called in
        `translate.py`.

        params:
        test_datapath: some text file.
        test_vocab: it's the same PyTorch pickled training dataset.
        """

        # load vocabulary
        data = torch.load(test_vocab)
        settings = data['settings']
        # print(settings)
        # load test sequences
        token_instances = load_file(test_datapath, settings.max_word_seq_len,
                                    settings.format, settings.case_sensitive)
        is_bpe = settings.format.lower() == "bpe"

        SOS, EOS = constants.SOS, constants.EOS

        decoder = None

        if self.opt.override_max_token_seq_len:
            if self.opt.override_max_token_seq_len > 0:
                settings.max_token_seq_len = self.opt.override_max_token_seq_len
                if self.model == "transformer":
                    self.change_max_seq_len(
                        self.opt.override_max_token_seq_len + 5)

        print("settings.max_token_seq_len", settings.max_token_seq_len)
        if is_bpe:
            # load test data
            # TODO: fix preprocessing method for BPE when loading test data.
            # TODO: this is a quick fix; the code debt needs to be cleaned up later.
            bpe_src = BPE.from_dict(data['dict']['src'])
            decoder = BPE.from_dict(data['dict']['tgt'])
            # convert test sequences into IDx
            test_src_insts = bpe_src.transform(token_instances)
            test_src_insts = [i for i in test_src_insts]
            # some of the sequences made may be too long, so we'll need to fix that.
            for i in tqdm(range(len(token_instances)),
                          desc="Reclipping Test Sequences"):
                raw = token_instances[i]
                encoded = test_src_insts[i]
                test_src_insts[i] = reclip(raw, encoded, bpe_src,
                                           settings.max_token_seq_len - 2)
                test_src_insts[i] = [SOS] + test_src_insts[i] + [EOS]

            # setup data loader
            src_word2idx = data['dict']['src']
            tgt_word2idx = data['dict']['tgt']

            src_byte_pairs = {
                x + "_": y
                for x, y in src_word2idx['byte_pairs'].items()
            }
            tgt_byte_pairs = {
                x + "_": y
                for x, y in tgt_word2idx['byte_pairs'].items()
            }
            src_word2idx = {**src_byte_pairs, **src_word2idx['words']}
            tgt_word2idx = {**tgt_byte_pairs, **tgt_word2idx['words']}

            test_loader = torch.utils.data.DataLoader(
                TranslationDataset(src_word2idx=src_word2idx,
                                   tgt_word2idx=tgt_word2idx,
                                   src_insts=test_src_insts),
                num_workers=0,
                batch_size=self.opt.batch_size,
                collate_fn=collate_fn)

        else:
            # convert test sequences into IDx
            test_src_insts = seq2idx(token_instances, data['dict']['src'])
            # trim sequence lengths
            test_src_insts = [
                seq[:settings.max_word_seq_len] for seq in test_src_insts
            ]
            # add SOS and EOS
            test_src_insts = [[SOS] + x + [EOS] for x in test_src_insts]

            # setup data loaders.
            test_loader = torch.utils.data.DataLoader(
                TranslationDataset(src_word2idx=data['dict']['src'],
                                   tgt_word2idx=data['dict']['tgt'],
                                   src_insts=test_src_insts),
                num_workers=0,
                batch_size=self.opt.batch_size,
                collate_fn=collate_fn)

            decoder = data['dict']['tgt']

        return test_loader, settings.max_token_seq_len, is_bpe, decoder
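
A minimal call-site sketch for load_testdata (the Translator class name, the option object, and the file paths below are assumptions made for illustration; the method only relies on an object exposing `opt`, `model`, and `change_max_seq_len`):

# hypothetical driver object, e.g. constructed in translate.py
translator = Translator(opt)
test_loader, max_token_seq_len, is_bpe, decoder = translator.load_testdata(
    "data/test.txt",          # plain-text file of test sequences
    "data/dataset.train.pt")  # pickled training dataset (vocab + settings)
for batch in test_loader:     # batches built by TranslationDataset and collate_fn
    pass                      # feed each batch to the trained model here
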
Example #7
    count_wrong = df_wrong.count()[0]
    count_predict_correct = len(predict_name_position)
    count_all_correct = len(name_position)

    precision = float(count_predict_correct) / (count_predict_correct +
                                                count_wrong)
    recall = float(count_predict_correct) / count_all_correct
    return precision, recall, df_wrong, df_not_found


factor = 4.0
length = 500
debug = False

train_df, train_list = load_file('I')
test_df, test_list = load_file('J')

train_df = sample_balance(train_df, factor)
X_train, Y_train, bag = feat(train_df, None, length)

# train
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
clf.fit(X_train, Y_train)

# test
X_test, Y_test, bag = feat(test_df, bag, length)
Y_predict = clf.predict(X_test)
Y_predict = post_process(test_df, Y_predict)

precision, recall, falsePositive, falseNegative = precision_recall(
Example #8
def main():
    opt = load_args()
    bpe_enabled = True

    torch.manual_seed(opt.seed)

    # setup the max token sequence length to include <s> and </s>
    opt.max_token_seq_len = opt.max_word_seq_len

    # restructure code for readability
    dataset = {
        'train': {
            'src': opt.train_src,
            'tgt': []
        },
        'valid': {
            'src': opt.valid_src,
            'tgt': []
        }
    }

    label0, label1 = [torch.LongTensor([0])], [torch.LongTensor([1])]
    raw = copy(dataset)
    # load dataset
    for g in dataset:
        src = load_file(dataset[g]['src'], None, False)

        # split src and tgt
        src = [x.split() for x in src]
        tgt = [x[0] for x in src]
        src = [" ".join(x[1:]) for x in src]

        # convert tgt tokens.
        tgt = [label0 if i == opt.label0 else label1 for i in tgt]

        raw[g]['src'] = src
        dataset[g]['tgt'] = tgt

    if opt.src_vocab:
        # build bpe vocabulary
        print("[Info] Loading BPE vocabulary from", opt.src_vocab)
        src_bpe = bpe_encoder.from_dict(
            torch.load(opt.src_vocab)['dict']['tgt'])
    else:
        # building bpe vocabulary
        print("[Info] Building BPE vocabulary.")
        # build and train encoder
        src_bpe = bpe_encoder(vocab_size=opt.vocab_size,
                              pct_bpe=opt.pct_bpe,
                              ngram_min=1,
                              UNK=Constants.UNK_WORD,
                              PAD=Constants.PAD_WORD,
                              word_tokenizer=bpe_parse)
        src_bpe.fit(raw['train']['src'])

    # convert sequences
    for g in tqdm(dataset, desc="Converting tokens into IDs"):
        src_bpe.unmute()
        dataset[g]['src'] = [f for f in src_bpe.transform(tqdm(raw[g]['src']))]

    for g in tqdm(dataset, desc="Trimming Sequences"):
        sequences = dataset[g]['src']
        # it's much easier to just refer back to the original sentence and
        # trim tokens from there.
        for i in range(len(sequences)):
            ref_seq = raw[g]['src'][i]
            bpe_seq = sequences[i]
            dataset[g]['src'][i] = reclip(ref_seq, bpe_seq, src_bpe,
                                          opt.max_word_seq_len - 2)

    # add <s>, </s>
    # (At this stage, all of the sequences are tokenised, so you'll need to input
    #  the ID values of SOS and EOS instead.)
    SOS, EOS = Constants.SOS, Constants.EOS
    for g in tqdm(dataset, desc="Adding SOS, EOS tokens"):
        dataset[g]['src'] = [[SOS] + x + [EOS] for x in dataset[g]['src']]

    # shuffle dataset by sizes
    for g in tqdm(dataset, desc="Shuffling and Sorting"):
        src, tgt = dataset[g]['src'], dataset[g]['tgt']
        sizes = [len(x) for x in src]

        if opt.shuffle == 1:
            perm = torch.randperm(len(src))
            src = [src[idx] for idx in perm]
            tgt = [tgt[idx] for idx in perm]
            sizes = [sizes[idx] for idx in perm]

        _, perm = torch.sort(torch.Tensor(sizes))

        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]

        # add space to seq
        src_sizes = [sizes[idx] for idx in perm]

        blanks = [[
            Constants.PAD for _ in range(opt.max_token_seq_len - src_sizes[i])
        ] for i in range(len(src))]
        src = [src[i] + blanks[i] for i in range(len(src))]

        dataset[g]['src'] = src
        dataset[g]['tgt'] = tgt

    k = np.sum([len(x) for x in dataset[g]['src']]) / len(dataset[g]['src'])
    print("len:", k)

    # setup data to save.
    data = {
        'settings': opt,
        'dicts': {
            'src': src_bpe.vocabs_to_dict(False)
        },
        'train': {
            'src': dataset['train']['src'],
            'tgt': dataset['train']['tgt']
        },
        'valid': {
            'src': dataset['valid']['src'],
            'tgt': dataset['valid']['tgt']
        }
    }

    # dump information.
    filename = opt.save_data + ".train.pt"
    print('[Info] Dumping the processed data to pickle file', filename)
    torch.save(data, filename)
    print('[Info] Done.')
Example #9

def is_not_consecutive(token):
    words = token.split()
    for word in words:
        count = 0
        for letter in word:
            if letter.isupper():
                count += 1
        if count > 1:
            return False
    return True


debug = False
train_df_all, train_list = load_file('I')
random.seed(100)
random.shuffle(train_list)

for length in [500]:
    for factor in [4]:
        precision_mean = 0
        recall_mean = 0

        CV_list = [
            train_list[:40], train_list[40:80], train_list[80:120],
            train_list[120:160], train_list[160:200]
        ]
        for i in range(5):
            train_index = train_df_all['file'].apply(
                lambda a: a not in CV_list[i])
Example #10
    char_count = 0
    wrong_count = 0
    for char_seq, tag_seq, output_tag_id_seq in zip(char_seqs, tag_seqs, output_tag_id_seqs):
        if verbose >= 1:
            print("-"*15)
            print("Sentence " + str(sent_count))
        output_seq = output_tag_id_seq[1:-1] # Remove [CLS] and [SEP]'s id
        for c, expect_tag, output_tag_id_one_hot in zip(char_seq, tag_seq, output_seq):
            output_tag_id = np.argmax(output_tag_id_one_hot)
            output_tag = reversed_tag_vocab[output_tag_id]

            if verbose >= 2:
                print(c + "\t" + expect_tag + "\t" + output_tag)

            char_count += 1
            if expect_tag != output_tag:
                if verbose == 1:
                    print(c + "\t" + expect_tag + "\t" + output_tag)
                wrong_count += 1
        sent_count += 1

    print("-"*15)
    print("All: " + str(char_count))
    print("Wrong: " + str(wrong_count))
    print("Wrong Rate: " + str(int(wrong_count/char_count * 100)) + "%")

if __name__ == "__main__":
    from preprocess import load_file, preprocess
    char_seqs, tag_seqs = load_file("test.txt")
    token_id_seqs, one_hot_tag_id_seqs, tag_vocab = preprocess(char_seqs, tag_seqs)
    analyze(char_seqs, tag_seqs, one_hot_tag_id_seqs, tag_vocab, 2)