Example #1
import json
from hashlib import md5

from flask import jsonify, request


def getSentences(document_id):
    # Serialize every sentence of the document; get_document() is an
    # application-specific helper assumed to be defined elsewhere.
    document = get_document(document_id)

    sentences = []
    for sentence in document.sentences:
        sentences.append({
            "id": str(sentence.id),
            "source": sentence.source,
            "translation": sentence.translation,
            "beam": sentence.beam,
            "score": sentence.score,
            "attention": sentence.attention,
            "corrected": sentence.corrected,
            "flagged": sentence.flagged,
            "diff": sentence.diff if hasattr(sentence, "diff") else "",
        })

    # Answer conditional requests: if the client's ETag still matches the
    # current payload, return 304 Not Modified instead of the full body.
    old_etag = request.headers.get('If-None-Match', '')
    data = json.dumps(sentences)
    new_etag = md5(data.encode("utf-8")).hexdigest()

    if old_etag == new_etag:
        return "", 304
    else:
        res = jsonify(sentences)
        res.headers["ETag"] = new_etag
        return res
Example #2
def split_and_write(file1, data1, file2, data2):
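    """Append cleaned, encoded copies of data1 and data2 to file1 and file2.

    clean() and encode() (and the commented-out match()/clean_and_split()
    helpers) are assumed to be defined elsewhere in the original module.
    """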
    with open(file1, "at") as f1:
        with open(file2, "at") as f2:
            # z = match(data1, data2)
            # if z > 0.05:
            #     d1_count, d1_cleaned = clean_and_split(data1)
            #     d2_count, d2_cleaned = clean_and_split(data2)
            #     if (d1_count == d2_count):
            #         f1.write(encode(d1_cleaned))
            #         f2.write(encode(d2_cleaned))
            #     else:
            #         f1.write(encode(clean(data1)))
            #         f2.write(encode(clean(data2)))
            # else:
            # print("'{}' unlike '{}'\n".format(data1, data2))
            f1.write(encode(clean(data1)))
            f2.write(encode(clean(data2)))
	def write_saves_dat(self):
		import time
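		# Encode self.root, verify the encoded buffer, back up the existing
		# saves.dat, then write the new file; if the write fails, attempt to
		# restore the backup.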
		try:
			buf = data.encode(self.root)
		except Exception as e:
			QtGui.QMessageBox.warning(self, 'MiasMod',
				'%s while encoding data\n\n%s\n\nRefusing to write saves.dat!\n\nThis means there is a bug in MiasMod, please report this to DarkStarSword!' \
				% (e.__class__.__name__, str(e)))
			return

		if not self.verify(buf):
			QtGui.QMessageBox.warning(self, 'MiasMod',
				'Verification pass failed, refusing to write saves.dat!\n\nThis means there is a bug in MiasMod, please report this to DarkStarSword!')
			return

		try:
			timestamp_str = time.strftime('%Y%m%d%H%M%S')
			backup = '%s~%s' % (self.save_path, timestamp_str)
			os.rename(self.save_path, backup)
		except Exception as e:
			QtGui.QMessageBox.warning(self, 'MiasMod',
				'%s while backing up saves.dat\n\n%s\n\nRefusing to write saves.dat!' \
				% (e.__class__.__name__, str(e)))
			return

		try:
			open(self.save_path, 'wb').write(buf)
		except Exception as e:
			QtGui.QMessageBox.warning(self, 'MiasMod',
				'%s while writing saves.dat\n\n%s\n\nWill attempt to restore backup %s...' \
				% (e.__class__.__name__, str(e), backup))
			try:
				os.remove(self.save_path)
			except:
				pass # May just not have been created yet
			try:
				os.rename(backup, self.save_path)
				QtGui.QMessageBox.information(self, 'MiasMod',
					'Successfully restored backup')
			except Exception as e:
				QtGui.QMessageBox.warning(self, 'MiasMod',
					'%s while restoring %s\n\n%s' \
					% (e.__class__.__name__, backup, str(e)))
			return
		return True
Example #4
def main(argv):
    hlog.flags()
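
    # Seed Python's random, NumPy and PyTorch RNGs so runs are reproducible.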

    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    torch.manual_seed(FLAGS.seed)

    #input_symbols_list   = set(['red', 'yellow', 'green', 'blue', 'purple', 'pink', 'around', 'thrice', 'after'])
    input_symbols_list = set([
        'dax', 'lug', 'wif', 'zup', 'fep', 'blicket', 'kiki', 'tufa', 'gazzer'
    ])
    output_symbols_list = set(
        ['RED', 'YELLOW', 'GREEN', 'BLUE', 'PURPLE', 'PINK'])

    study, test = generate_fig2_exp(input_symbols_list, output_symbols_list)

    vocab_x = Vocab()
    vocab_y = Vocab()

    if FLAGS.full_data:
        for sym in input_symbols_list:
            vocab_x.add(sym)
        for sym in output_symbols_list:
            vocab_y.add(sym)
        max_len_x = 7
        max_len_y = 9
    else:
        test, study = study[3:4], study[0:3]
        for (x, y) in test + study:
            for sym in x:
                vocab_x.add(sym)
            for sym in y:
                vocab_y.add(sym)
        max_len_x = 2
        max_len_y = 2

    hlog.value("vocab_x", vocab_x)
    hlog.value("vocab_y", vocab_y)
    hlog.value("study", study)
    hlog.value("test", test)

    train_items = encode(study, vocab_x, vocab_y)
    test_items = encode(test, vocab_x, vocab_y)

    #   outlist = list(output_symbols_list)

    oracle_py = Oracle(train_items,
                       test_items,
                       DEVICE,
                       dist="py",
                       vocab_x=vocab_x,
                       vocab_y=vocab_y)
    oracle_px = Oracle(train_items,
                       test_items,
                       DEVICE,
                       dist="px",
                       vocab_x=vocab_x,
                       vocab_y=vocab_y)
    oracle_qxy = Oracle(train_items,
                        test_items,
                        DEVICE,
                        dist="qxy",
                        vocab_x=vocab_x,
                        vocab_y=vocab_y)
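
    # Only the p(y) oracle is passed into the Mutex model constructed below.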

    model = Mutex(
        vocab_x,
        vocab_y,
        FLAGS.dim,
        FLAGS.dim,
        oracle_py,
        max_len_x=max_len_x,
        max_len_y=max_len_y,
        copy=False,
        n_layers=FLAGS.n_layers,
        self_att=False,
        dropout=FLAGS.dropout,
        lamda=FLAGS.lamda,
        kl_lamda=FLAGS.kl_lamda,
        Nsample=FLAGS.Nsample,
        temp=FLAGS.temp,
        regularize=FLAGS.regularize,
        ent=FLAGS.ent,
    ).to(DEVICE)

    if FLAGS.regularize and not isinstance(model.px, Oracle):
        with hlog.task("pretrain px"):
            pretrain(model.px, train_items + test_items, test_items)
            for p in model.px.parameters():
                p.requires_grad = False

    with hlog.task("Initial Samples"):
        hlog.value("px samples", "\n".join(model.sample_px(20)))
        hlog.value("py samples", "\n".join(model.sample_py(20)))
        hlog.value("qxy debug samples",
                   "\n".join(model.sample_qxy_debug(N=20)))
        hlog.value(
            "qxy debug data",
            "\n".join(model.sample_qxy_debug_data(train_items + test_items)))


#         hlog.value("qxy samples", "\n".join(model.sample_qxy(model.py.sample(20,max_len),temp=model.temp)))
#         hlog.value("qxy samples (gumbel)", "\n".join(model.sample_qxy_gumbel(model.py.sample(20,max_len),temp=model.temp)))

#     if not isinstance(model.qxy,Oracle):
#         train(model.qxy, swap_io(train_items) + swap_io(test_items), swap_io(test_items))
#     if not isinstance(model.pyx,Oracle):
#         train(model.pyx, train_items + test_items, test_items)
#         for param in model.pyx.parameters():
#             param.requires_grad = False

    with hlog.task("train model"):
        acc, f1 = train(model, train_items, test_items)

    with hlog.task("Final Samples"):
        hlog.value("px samples", "\n".join(model.sample_px(20)))
        hlog.value("py samples", "\n".join(model.sample_py(20)))
        hlog.value("qxy debug samples",
                   "\n".join(model.sample_qxy_debug(N=20)))
        hlog.value(
            "qxy debug data",
            "\n".join(model.sample_qxy_debug_data(train_items + test_items)))
        hlog.value(
            "qxy samples (gumbel)", "\n".join(
                model.sample_qxy_gumbel(model.py.sample(20, max_len_y),
                                        temp=model.temp)))
        #hlog.value("qxy samples", "\n".join(model.sample_qxy(model.py.sample(20,max_len),temp=model.temp)))

    if FLAGS.regularize:
        losses = pd.DataFrame(model.loss_container)
        figure = sns.lineplot(data=losses, dashes=False).figure
        figure.savefig(f"{FLAGS.seed}_plot.png")

    with hlog.task("train evaluation"):
        validate(model, train_items, vis=True)

    with hlog.task("test evaluation"):
        validate(model, test_items, vis=True)

    # Augment training data
    train_df = data.augment_data(train_df,
                                 test_df,
                                 use_xnli=args.load_xnli,
                                 use_mnli=args.load_mnli,
                                 use_bt=args.back_translate,
                                 bt_filepath=args.bt_file)

    # Define the tokenizer to preprocess the input data
    tokenizer = data.define_tokenizer(args.model_name)

    # Batch encode input training data
    train_input = data.encode(train_df,
                              tokenizer,
                              max_len=args.max_sequence_length)
    input_word_ids = train_input['input_word_ids']
    input_mask = train_input['input_mask']
    labels = train_input['labels']
    print(
        "Training input shape: input_word_ids=>{}, input_mask=>{}, labels=>{}".
        format(input_word_ids.shape, input_mask.shape, labels.shape))

    # Batch encode input validation data
    validation_input = data.encode(validation_df,
                                   tokenizer,
                                   max_len=args.max_sequence_length)
    validation_word_ids = validation_input['input_word_ids']
    validation_mask = validation_input['input_mask']
    validation_labels = validation_input['labels']
Example #6
    def send_data(self, data):
        # Encode the outgoing string, send it over the socket and return the decoded reply.
        self.soc.send(data.encode())
        resp = self.soc.recv(self._data_.get_buffer_len())
        print("[NETWORK]\tResponse received")
        return resp.decode()
import itertools

import torch

# checkpoint = './checkpoints/WT2.pt'
checkpoint = '../models/BNC.18hr.QRNN.pt'
data = 'data/bnc'
torch.cuda.set_device(3)
device = torch.device(3)

torch.manual_seed(1234)

with open(checkpoint, 'rb') as f:
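    # map_location loads the checkpoint's tensors directly onto the chosen GPU device.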
    model, criterion, _ = torch.load(f, map_location=device)

import os
import hashlib
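# Cache the tokenized corpus under a filename derived from the md5 hash of the data path.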
fn = 'corpus.{}.data'.format(hashlib.md5(data.encode()).hexdigest())
if os.path.exists(fn):
    print('loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('producing dataset...')
    corpus = data.corpus(args.data)
    torch.save(corpus, fn)

dictionary = corpus.dictionary


def tokenize_sent(sent):
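    """Map a sentence (a list of word tokens) to a LongTensor of vocabulary ids on the GPU."""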
    return torch.LongTensor([dictionary.word2idx[x] for x in sent]).cuda()

import os

import pandas as pd

from data import setup, encode, onehotencode
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=[0, 1])

__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

# Data set, now combined into one file
df = pd.read_csv(os.path.join(__location__, 'adult.data'), header=None)

# we have no header so our columns are indexes
cols_to_encode = [1, 3, 5, 6, 7, 8, 9, 13]  # & 14 as label encode
cols_to_scale = [0, 2, 4, 10, 11, 12]
df = pd.get_dummies(df, columns=cols_to_encode)
df, labels = encode(df, [14])

# Scale the continuous-valued columns into the [0, 1] range
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# Code to force balance among target class
# print('Balancing Adult data set... please wait')
# greater_than_50k = np.where(df[14] == 1)[0] # 11K
# less_than_50k = np.where(df[14] == 0)[0]
# lt_samples = np.random.choice(less_than_50k, greater_than_50k.shape[0]) # match the less than 50k to greater than 50k in size
# balanced = []
# for index in greater_than_50k:
#     balanced.append(df.iloc[index])
# for index in lt_samples:
#     balanced.append(df.iloc[index])
# balanced = pd.DataFrame(data=balanced, columns=df.columns)
Example #9
max_no = 10
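
# Encoder-decoder sorter. Assumes a Keras 1.x-style setup where Sequential, Dropout,
# RepeatVector, Activation and TimeDistributedDense are available, RNN is an alias for a
# recurrent layer (e.g. LSTM), and seq_len, batch_size, batch_gen() and encode() are
# defined earlier in the original script.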

model = Sequential()
model.add(RNN(100, input_shape=(seq_len, max_no)))
model.add(Dropout(0.25))
model.add(RepeatVector(seq_len))
model.add(RNN(100, return_sequences=True))

model.add(TimeDistributedDense(max_no))
model.add(Dropout(0.5))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

for ind, (X, Y) in enumerate(batch_gen(batch_size, seq_len, max_no)):
    loss, acc = model.train_on_batch(X, Y)
    if ind % 250 == 0:
        testX = np.random.randint(max_no, size=(1, seq_len))
        test = encode(testX, seq_len, max_no)
        print(testX)
        #pdb.set_trace()
        y = model.predict(test, batch_size=1)
        print "actual sorted output is"
        print np.sort(testX)
        print "sorting done by RNN is"
        print np.argmax(y, axis=2)
        print "\n"
        # print loss, acc
Example #10
## Transform the documents using the vocabulary.
en_x = np.array(list(en_vocab_processor.fit_transform(en_text)))
sh_x = np.array(list(sh_vocab_processor.fit_transform(sh_text)))

## Extract word:id mapping from the object.
en_vocab_dict = en_vocab_processor.vocabulary_._mapping
sh_vocab_dict = sh_vocab_processor.vocabulary_._mapping

## Sort the vocabulary dictionary on the basis of values (ids).
## Both statements below perform the same task.
#sorted_vocab = sorted(vocab_dict.items(), key=operator.itemgetter(1))
en_sorted_vocab = sorted(en_vocab_dict.items(), key=lambda x: x[1])
sh_sorted_vocab = sorted(sh_vocab_dict.items(), key=lambda x: x[1])

## Treat the ids as indexes into a list and create a list of words in ascending order of id:
## the word with id i goes at index i of the list.
en_vocabulary = list(list(zip(*en_sorted_vocab))[0])
sh_vocabulary = list(list(zip(*sh_sorted_vocab))[0])

print("Vocabulary : ")
print(en_vocabulary)
with open("vocab2.en", 'wt') as o:
    for w in en_vocabulary:
        o.write(encode(w))

print(sh_vocabulary)
print("Transformed documents : ")
print(en_x)
print(sh_x)