def main(file, interactive=False, debug=False):
    if file is not None:
        env = Environment()
        with open(file, "r") as f:
            program_str = f.read()
        l = Lexer(program_str)
        p = Parser(l)
        program = p.parse_program()
        if debug:
            print("PARSED PROGRAM")
            print("-" * 50)
            print(program)
            print("-" * 50)
        if len(p.errors) != 0:
            repl.print_parser_errors(p.errors)
            exit(0)
        evaluated = evaluator.eval(program, env)
        # Print the result when running non-interactively, or whenever
        # evaluation produced an error object; guard against None first.
        if evaluated is not None and (
                (not interactive and evaluated != NULL)
                or evaluated.typ == mobject.ERROR_OBJ):
            print(evaluated.inspect)
        if interactive:
            repl.start(env=env)
    else:
        repl.start()
def eval_test(input: str) -> MonkeyObject:
    l = lexer.Lexer(input)
    p = parser.Parser(l)
    program = p.parse_program()
    env = mobject.Environment()
    return evaluator.eval(program, env)
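# Hedged usage sketch (assumption, not in the source): a test built on
# eval_test, assuming evaluated Monkey integers expose a `.value` attribute
# as the tests below do.
def test_eval_integer_literal_example():
    evaluated = eval_test("5")
    assert evaluated.value == 5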
def test_bang_operator(input_data, expected_val):
    lexer = Lexer(input_data)
    parser = Parser.new(lexer)
    program = parser.parse()
    check_parse_errors(parser)
    output = eval(program)
    assert output.value == expected_val
def test_eval_integer_expression(input_data, expected_val):
    lexer = Lexer(input_data)
    parser = Parser.new(lexer)
    program = parser.parse()
    check_parse_errors(parser)
    output = eval(program)
    assert output.value == expected_val
def main(input):
    tokens = lexer.get_tokens(input)
    parser_tree = parser.parse(tokens)
    # The parser returns a string (e.g. an error message) instead of a tree
    # on failure; pass it straight through.
    if isinstance(parser_tree, str):
        return parser_tree
    return evaluator.eval(parser_tree)
def test_if_else_expression(input_data, expected_output):
    lexer = Lexer(input_data)
    parser = Parser.new(lexer)
    program = parser.parse()
    check_parse_errors(parser)
    output = eval(program)
    if hasattr(output, "value"):
        assert output.value == expected_output
    else:
        assert str(output) == expected_output
def load(filename):
    """Load scheme source file `filename`.

    The parameter `filename` is of the form '"path"' (a quoted string),
    so the surrounding quotes are stripped first.
    """
    filename = filename[1:-1]
    with open(filename) as file:
        input = file.read()
    result = []
    parseInput(input, 0, result)
    output = None
    for exp in result:
        output = eval(exp, the_global_environment)
    return output
def excute(self, env, name, value):
    if isinstance(name, t.List):
        # Function definition
        fname = name[0]
        args = name[1:]
        func = t.Function(args, value, env, fname)
        env[fname] = func
        return t.Null()
    elif isinstance(name, t.Symbol):
        # Value definition
        env[name] = ev.eval(value, env)
        return t.Null()
    else:
        raise SyntaxError('define')
def __main__():
    sys.stdout.write("-> ")
    sys.stdout.flush()
    for line in sys.stdin:
        try:
            ast = parser.parse(line)
            result = evaluator.eval(ast, global_env)
            print("# " + str(result))
        except Exception as e:
            print(e)
        finally:
            # re-print the prompt for the next input line
            sys.stdout.write("-> ")
            sys.stdout.flush()
def repl(prompt='lispy> ', inport=p.InPort(sys.stdin), out=sys.stdout):
    """A prompt-read-eval-print loop."""
    sys.stderr.write("Lispy version 2.0\n")
    while True:
        try:
            if prompt:
                sys.stderr.write(prompt)
            x = p.parse(inport)
            if x is p.eof_object:
                return
            val = e.eval(x)
            if val is not None and out:
                print(to_string(val), file=out)
        except Exception as ex:
            print('{0}: {1}'.format(type(ex).__name__, ex))
            break
def repl(prompt="> "): while True: program = '' try: program = input(prompt) except (KeyboardInterrupt, EOFError): print() return if len(program) > 0: val = None try: val = eval(parse(program)) except NameError as e: print(e) continue if val is not None: print(sexpr(val))
def start(env=None):
    if env is None:
        env = mobject.Environment()
    while True:
        print(PROMPT, end=" ")
        line = input()
        if not line:
            return
        l = lexer.Lexer(line)
        p = parser.Parser(l)
        program = p.parse_program()
        if len(p.errors) != 0:
            print_parser_errors(p.errors)
            continue
        evaluated = evaluator.eval(program, env)
        if evaluated is not None:
            print(evaluated.inspect)
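# Hedged usage sketch (assumption, not in the source): evaluate a program
# into an environment first, then start the REPL with that same environment
# so its definitions stay visible, mirroring main(file, interactive=True).
def start_with_program(source):
    env = mobject.Environment()
    program = parser.Parser(lexer.Lexer(source)).parse_program()
    evaluator.eval(program, env)
    start(env=env)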
def build_cnn(lang, odir):
    '''Train, validate and test the CNN.

    lang: The language name
    odir: output directory of prediction results
    '''
    doc_idx = 2
    max_len = 40  # sequence length
    epochs = 10
    encode_dir = './data/encode/' + lang + '/'
    indices_dir = './data/indices/' + lang + '/'
    wt_dir = './resources/weight/'
    res_dir = './resources/classifier/cnn/'
    clf_path = res_dir + lang + '.clf'  # don't reload classifier for debug usage

    # load embedding weights
    weights = np.load(wt_dir + lang + '.npy')

    # build model architecture
    text_input = Input(shape=(max_len,), dtype='int32', name='input')
    embeds = Embedding(weights.shape[0], weights.shape[1],
                       weights=[weights], input_length=max_len,
                       trainable=True, name='embedding')(text_input)
    # convolution
    conv3 = Conv1D(kernel_size=3, filters=100, padding='same', name='conv3')(embeds)
    maxp3 = MaxPool1D()(conv3)
    conv4 = Conv1D(kernel_size=4, filters=100, padding='same', name='conv4')(embeds)
    maxp4 = MaxPool1D()(conv4)
    conv5 = Conv1D(kernel_size=5, filters=100, padding='same', name='conv5')(embeds)
    maxp5 = MaxPool1D()(conv5)
    # merge
    merge_convs = keras.layers.concatenate([maxp3, maxp4, maxp5], axis=-1)
    # flatten
    flat_l = Flatten()(merge_convs)
    # dense; following Kim (2014), the regularizer applies to both the
    # kernel and the bias
    dense_l = Dense(
        100, activation='softplus', name='dense',
        kernel_regularizer=keras.regularizers.l1_l2(0, 0.03),
        bias_regularizer=keras.regularizers.l1_l2(0, 0.03),
    )(flat_l)
    dp_l = Dropout(0.3, name='dropout')(dense_l)
    # binary prediction
    predicts = Dense(1, activation='sigmoid', name='predict')(dp_l)
    # model
    model = Model(inputs=text_input, outputs=predicts)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())

    best_valid_f1 = 0.0
    best_model = None

    for e in range(epochs):
        accuracy = 0.0
        loss = 0.0
        step = 1
        print('--------------Epoch: {}--------------'.format(e))

        # load training and batch dataset
        train_iter = evaluator.data_iter(indices_dir + 'train.tsv',
                                         batch_size=64)
        # train model
        for class_wt, x_train, y_train in train_iter:
            if len(np.unique(y_train)) == 1:
                continue
            tmp = model.train_on_batch([x_train], y_train,
                                       class_weight=class_wt)
            loss += tmp[0]
            loss_avg = loss / step
            accuracy += tmp[1]
            accuracy_avg = accuracy / step
            if step % 30 == 0:
                print('Step: {}'.format(step))
                print('\tLoss: {}. Accuracy: {}'.format(loss_avg, accuracy_avg))
                print('--------------------------------------')
            step += 1

        # validate model to find the best model
        print('---------------Validation------------')
        valid_iter = evaluator.data_iter(indices_dir + 'valid.tsv',
                                         batch_size=64, if_shuffle=False)
        y_preds = []
        y_valids = []
        for _, x_valid, y_valid in valid_iter:
            tmp_preds = model.predict([x_valid])
            for item_tmp in tmp_preds:
                y_preds.append(round(item_tmp[0]))
            y_valids.extend(y_valid)
        valid_f1 = f1_score(y_true=y_valids, y_pred=y_preds, average='weighted')
        print('Validating f1-macro score: ' + str(valid_f1))
        if best_valid_f1 < valid_f1:
            best_valid_f1 = valid_f1
            best_model = model
            pickle.dump(best_model, open(clf_path, 'wb'))

    # test model
    print('--------------Test--------------------')
    y_preds = []
    y_probs = []
    test_iter = evaluator.data_iter(indices_dir + 'test.tsv',
                                    batch_size=64, if_shuffle=False)
    for _, x_test, y_test in test_iter:
        tmp_preds = best_model.predict([x_test])
        for item_tmp in tmp_preds:
            y_probs.append(item_tmp[0])
            y_preds.append(int(round(item_tmp[0])))

    # save the predicted results
    with open(odir + lang + '.tsv', 'w') as wfile:
        with open(indices_dir + 'test.tsv') as dfile:
            wfile.write(dfile.readline().strip() + '\tpred\tpred_prob\n')
            for idx, line in enumerate(dfile):
                wfile.write(line.strip() + '\t' + str(y_preds[idx]) +
                            '\t' + str(y_probs[idx]) + '\n')

    # evaluate the saved predictions and write the score file
    evaluator.eval(odir + lang + '.tsv', odir + lang + '.score')
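# Hedged usage sketch (assumption, not in the source): train and evaluate
# the CNN for one language. 'English' appears elsewhere in this project;
# the output directory below is hypothetical.
if __name__ == '__main__':
    build_cnn('English', './results/cnn/')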
def build_bert(lang, odir, params=None):
    '''Google BERT classifier.

    lang: The language name
    odir: output directory of prediction results
    '''
    if not params:
        params = dict()
        params['balance_ratio'] = 0.9
        params['freeze'] = False
        params['decay_rate'] = .001
        params['lr'] = 2e-5
        params['warm_steps'] = 100
        params['train_steps'] = 1000
        params['batch_size'] = 16
        params['balance'] = True

    split_dir = './data/split/' + lang + '/'

    if torch.cuda.is_available():
        device = str(get_freer_gpu())
        torch.cuda.set_device(device)
    else:
        device = torch.device('cpu')
    print(device)
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name())
    print('Number of GPUs: ', n_gpu)

    print('Loading datasets and oversampling training data...')
    train_df = pd.read_csv(split_dir + 'train.tsv', sep='\t', na_values='x')

    # oversample the minority class
    if params['balance']:
        label_count = Counter(train_df.label)
        for label_tmp in label_count:
            sample_num = label_count.most_common(1)[0][1] - label_count[label_tmp]
            if sample_num == 0:
                continue
            train_df = pd.concat([
                train_df,
                train_df[train_df.label == label_tmp].sample(
                    int(sample_num * params['balance_ratio']), replace=True)
            ])
        train_df = train_df.reset_index()  # to prevent index key error

    valid_df = pd.read_csv(split_dir + 'valid.tsv', sep='\t', na_values='x')
    test_df = pd.read_csv(split_dir + 'test.tsv', sep='\t', na_values='x')
    data_df = [train_df, valid_df, test_df]

    # We need to add special tokens at the beginning and end of each
    # sentence for BERT to work properly
    for doc_df in data_df:
        doc_df.text = doc_df.text.apply(lambda x: '[CLS] ' + x + ' [SEP]')

    if lang == 'English':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)
    elif lang == 'Chinese':
        tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                  do_lower_case=True)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-uncased', do_lower_case=True)

    print('Padding Datasets...')
    for doc_df in data_df:
        doc_df.text = doc_df.text.apply(lambda x: tokenizer.tokenize(x))

    # convert to indices and pad the sequences
    max_len = 25
    for doc_df in data_df:
        doc_df.text = doc_df.text.apply(lambda x: pad_sequences(
            [tokenizer.convert_tokens_to_ids(x)],
            maxlen=max_len, dtype="long")[0])

    # create attention masks
    for doc_df in data_df:
        attention_masks = []
        for seq in doc_df.text:
            seq_mask = [float(idx > 0) for idx in seq]
            attention_masks.append(seq_mask)
        doc_df['masks'] = attention_masks

    # format train, valid, test; convert the pandas columns to plain Python
    # lists before building tensors
    train_inputs = torch.tensor(data_df[0].text.tolist())
    train_labels = torch.tensor(data_df[0].label.tolist())
    train_masks = torch.tensor(data_df[0].masks.tolist())
    valid_inputs = torch.tensor(data_df[1].text.tolist())
    valid_labels = torch.tensor(data_df[1].label.tolist())
    valid_masks = torch.tensor(data_df[1].masks.tolist())
    test_inputs = torch.tensor(data_df[2].text.tolist())
    test_labels = torch.tensor(data_df[2].label.tolist())
    test_masks = torch.tensor(data_df[2].masks.tolist())

    batch_size = params['batch_size']
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=batch_size)
    valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data, sampler=valid_sampler,
                                  batch_size=batch_size)
    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=batch_size)

    # load the pretrained model
    print('Loading Pretrained Model...')
    if lang == 'English':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=2)
    elif lang == 'Chinese':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-chinese', num_labels=2)
    else:  # for Spanish, Italian, Portuguese and Polish
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-multilingual-uncased', num_labels=2)
    model.to(device)

    # organize parameters
    param_optimizer = list(model.named_parameters())
    if params['freeze']:
        # matching 'bert' puts all BERT parameters in the zero-decay group
        no_decay = ['bias', 'bert']
    else:
        no_decay = ['bias']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': params['decay_rate']
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=params['lr'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=params['warm_steps'],
        num_training_steps=params['train_steps'])

    # Number of training epochs (authors recommend between 2 and 4)
    epochs = 10
    # track the best validation macro-F1 across epochs
    best_valid_f1 = 0.0

    # Training
    print('Training the model...')
    for _ in trange(epochs, desc='Epoch'):
        model.train()
        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # train batch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
            # Backward pass on the loss (first element of the outputs)
            outputs[0].backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            # Update tracking variables
            tr_loss += outputs[0].item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        '''Validation'''
        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()
        # tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        # batch eval
        y_preds = []
        for batch in valid_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # Telling the model not to compute or store gradients,
            # saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask)
            # Move logits and labels to CPU
            logits = outputs[0].detach().cpu().numpy()
            # record the prediction
            pred_flat = np.argmax(logits, axis=1).flatten()
            y_preds.extend(pred_flat)
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

        # evaluate the validation f1 score
        f1_m_valid, f1_w_valid = flat_f1(y_preds, valid_df.label)
        if f1_m_valid > best_valid_f1:
            print('Test....')
            best_valid_f1 = f1_m_valid
            y_preds = []
            y_probs = []

            # test if valid gets better results
            for batch in test_dataloader:
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch
                with torch.no_grad():
                    outputs = model(b_input_ids, token_type_ids=None,
                                    attention_mask=b_input_mask)
                    probs = F.softmax(outputs[0], dim=1)
                probs = probs.detach().cpu().numpy()
                pred_flat = np.argmax(probs, axis=1).flatten()
                y_preds.extend(pred_flat)
                y_probs.extend([item[1] for item in probs])

            # save the predicted results
            with open(odir + lang + '.tsv', 'w') as wfile:
                with open(split_dir + 'test.tsv') as dfile:
                    wfile.write(dfile.readline().strip() + '\tpred\tpred_prob\n')
                    for idx, line in enumerate(dfile):
                        wfile.write(line.strip() + '\t' + str(y_preds[idx]) +
                                    '\t' + str(y_probs[idx]) + '\n')

            # evaluate the saved predictions and write the score file
            evaluator.eval(odir + lang + '.tsv', odir + lang + '.score')
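# Hedged usage sketch (assumption, not in the source). Note that build_bert
# only fills in defaults when `params` is empty, so a caller-supplied dict
# must define every key the function reads. The output directory is
# hypothetical; the keys mirror the defaults above.
custom_params = {
    'balance_ratio': 0.9,
    'freeze': False,
    'decay_rate': .001,
    'lr': 2e-5,
    'warm_steps': 100,
    'train_steps': 1000,
    'batch_size': 16,
    'balance': True,
}
build_bert('English', './results/bert/', params=custom_params)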
def excute(self, env, first, *args):
    # Evaluate each expression in order and return the value of the last one.
    last = t.Null()
    for item in itertools.chain([first], args):
        last = ev.eval(item, env)
    return last
def excute(self, env, state, true, false):
    # Evaluate the condition, then only the branch that is selected.
    if ev.eval(state, env):
        return ev.eval(true, env)
    else:
        return ev.eval(false, env)
def build_rnn(lang, odir):
    '''Train, validate and test the RNN.

    lang: The language name
    odir: output directory of prediction results
    '''
    doc_idx = 2
    rnn_size = 200
    max_len = 40  # sequence length
    epochs = 10
    encode_dir = './data/encode/' + lang + '/'
    indices_dir = './data/indices/' + lang + '/'
    wt_dir = './resources/weight/'
    res_dir = './resources/classifier/rnn/'
    clf_path = res_dir + lang + '.clf'  # don't reload classifier for debug usage

    # load embedding weights
    weights = np.load(wt_dir + lang + '.npy')

    # build model architecture
    text_input = Input(shape=(max_len,), dtype='int32', name='input')
    embeds = Embedding(weights.shape[0], weights.shape[1],
                       weights=[weights], input_length=max_len,
                       trainable=True, name='embedding')(text_input)
    bigru = Bidirectional(GRU(rnn_size, kernel_initializer="glorot_uniform"))(embeds)
    dp = Dropout(rate=.2)(bigru)
    # binary prediction
    predicts = Dense(1, activation='sigmoid', name='predict')(dp)

    model = Model(inputs=text_input, outputs=predicts)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop',
                  metrics=['accuracy'])
    print(model.summary())

    best_valid_f1 = 0.0
    best_model = None

    for e in range(epochs):
        accuracy = 0.0
        loss = 0.0
        step = 1
        print('--------------Epoch: {}--------------'.format(e))

        # load training and batch dataset
        train_iter = evaluator.data_iter(indices_dir + 'train.tsv',
                                         batch_size=64)
        # train model
        for class_wt, x_train, y_train in train_iter:
            if len(np.unique(y_train)) == 1:
                continue
            tmp = model.train_on_batch([x_train], y_train,
                                       class_weight=class_wt)
            loss += tmp[0]
            loss_avg = loss / step
            accuracy += tmp[1]
            accuracy_avg = accuracy / step
            if step % 30 == 0:
                print('Step: {}'.format(step))
                print('\tLoss: {}. Accuracy: {}'.format(loss_avg, accuracy_avg))
                print('--------------------------------------')
            step += 1

        # validate model to find the best model
        print('---------------Validation------------')
        valid_iter = evaluator.data_iter(indices_dir + 'valid.tsv',
                                         batch_size=64, if_shuffle=False)
        y_preds = []
        y_valids = []
        for _, x_valid, y_valid in valid_iter:
            tmp_preds = model.predict([x_valid])
            for item_tmp in tmp_preds:
                y_preds.append(round(item_tmp[0]))
            y_valids.extend(y_valid)
        valid_f1 = f1_score(y_true=y_valids, y_pred=y_preds, average='weighted')
        print('Validating f1-macro score: ' + str(valid_f1))
        if best_valid_f1 < valid_f1:
            best_valid_f1 = valid_f1
            best_model = model
            pickle.dump(best_model, open(clf_path, 'wb'))

    # test model
    print('--------------Test--------------------')
    y_preds = []
    y_probs = []
    test_iter = evaluator.data_iter(indices_dir + 'test.tsv',
                                    batch_size=64, if_shuffle=False)
    for _, x_test, y_test in test_iter:
        tmp_preds = model.predict([x_test])
        for item_tmp in tmp_preds:
            y_probs.append(item_tmp[0])
            y_preds.append(int(round(item_tmp[0])))

    # save the predicted results
    with open(odir + lang + '.tsv', 'w') as wfile:
        with open(indices_dir + 'test.tsv') as dfile:
            wfile.write(dfile.readline().strip() + '\tpred\tpred_prob\n')
            for idx, line in enumerate(dfile):
                wfile.write(line.strip() + '\t' + str(y_preds[idx]) +
                            '\t' + str(y_probs[idx]) + '\n')

    # evaluate the saved predictions and write the score file
    evaluator.eval(odir + lang + '.tsv', odir + lang + '.score')
primitive_procedure_names.append('load')
primitive_procedure_objects.append(['primitive', load])
the_global_environment = Environment(primitive_procedure_names,
                                     primitive_procedure_objects)

# pre-load the standard definitions
load('"./init.scm"')

if __name__ == "__main__":
    if len(sys.argv) == 1:
        # interactive mode: read, parse and evaluate lines from stdin
        while True:
            source = input("Input: ")
            result = []
            parseInput(source, 0, result)
            for exp in result:
                print(formatOutput(eval(exp, the_global_environment)))
    else:
        # batch mode: evaluate the file given on the command line
        with open(sys.argv[1]) as file:
            source = file.read()
        result = []
        parseInput(source, 0, result)
        for exp in result:
            print(formatOutput(eval(exp, the_global_environment)))
def build_lr(lang, odir):
    '''Train, validate and test the logistic regression classifier.

    lang: The language name
    odir: output directory of prediction results
    '''
    doc_idx = 2
    encode_dir = './data/encode/' + lang + '/'
    split_dir = './data/split/' + lang + '/'
    res_dir = './resources/classifier/lr/'
    vec_path = res_dir + lang + '.vect'
    clf_path = res_dir + lang + '.clf'  # don't load classifier for debug usage

    print('Building vectorizer...')
    if os.path.exists(vec_path):
        vect = pickle.load(open(vec_path, 'rb'))
    else:
        corpus = []
        with open(encode_dir + 'corpus.tsv') as dfile:
            dfile.readline()  # skip column names
            for line in dfile:
                line = line.strip().split('\t')
                corpus.append(line[doc_idx])
        vect = TfidfVectorizer(ngram_range=(1, 3), max_features=15000)
        vect.fit(corpus)
        pickle.dump(vect, open(vec_path, 'wb'))

    print('Building classifier...')
    # load training data
    data = {'x': [], 'y': []}
    with open(split_dir + 'train.tsv') as dfile:
        dfile.readline()
        for line in dfile:
            line = line.strip().split('\t')
            data['x'].append(line[doc_idx])
            data['y'].append(int(line[-1]))

    # calculate the weight of labels
    weights = dict(zip(
        np.unique(data['y']),
        compute_class_weight(
            class_weight='balanced',
            classes=np.unique(data['y']),
            y=data['y'])
    ))

    # shuffle the data before training
    data['x'], data['y'] = shuffle(data['x'], data['y'])

    # build classifier
    clf = LogisticRegression(class_weight=weights, solver='liblinear')
    clf.fit(vect.transform(data['x']), data['y'])
    # save the classifier
    pickle.dump(clf, open(clf_path, 'wb'))

    # test the classifier
    data = []
    with open(split_dir + 'test.tsv') as dfile:
        dfile.readline()
        for line in dfile:
            line = line.strip().split('\t')
            data.append(line[doc_idx])
    data = vect.transform(data)
    y_preds = clf.predict(data)
    y_probs = clf.predict_proba(data)

    # save the test results
    with open(odir + lang + '.tsv', 'w') as wfile:
        with open(split_dir + 'test.tsv') as dfile:
            wfile.write(dfile.readline().strip() + '\tpred\tpred_prob\n')
            for idx, line in enumerate(dfile):
                # column 1 of predict_proba is the hate speech label
                wfile.write(line.strip() + '\t' + str(y_preds[idx]) +
                            '\t' + str(y_probs[idx][1]) + '\n')

    # evaluate the saved predictions and write the score file
    evaluator.eval(odir + lang + '.tsv', odir + lang + '.score')
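# Hedged usage sketch (assumption, not in the source): run the logistic
# regression pipeline per language; the language list and output directory
# below are illustrative only.
if __name__ == '__main__':
    odir = './results/lr/'
    for lang in ['English', 'Chinese']:
        build_lr(lang, odir)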
import preprocess_clueweb as p
import single_model_handler as mh
import evaluator as e
import params
import sys

if __name__ == "__main__":
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(params.data_set_file,
                                                       params.normalized)
    sys.stdout.flush()
    number_of_queries = len(set(queries))
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict()
    evaluator.remove_score_file_from_last_run()
    sys.stdout.flush()
    train, validation = preprocess.create_test_train_split_cluweb(queries)
    sys.stdout.flush()
    X_i, y_i = preprocess.create_data_set(X[train], y[train], queries[train])
    sys.stdout.flush()
    C_array = [0.1, 0.01, 0.001]
    single_model_handler = mh.single_model_handler(C_array)
    single_model_handler.fit_model_on_train_set_and_choose_best_for_competition(
        X, y, X_i, y_i, validation, queries, evaluator, preprocess)
    print("learning is finished")
def __call__(self, *args, **kwargs):
    # Local import, typically done to avoid a circular dependency with the
    # evaluator module.
    import evaluator
    # Evaluate the body in a new environment that extends the defining
    # environment with each parameter bound to its argument.
    new_env = Environment(self.env)
    for s, v in zip(self.args, args):
        new_env[s] = v
    return evaluator.eval(self.body, new_env)
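# A minimal, self-contained sketch (hypothetical names, not this project's
# API) of the same call protocol used by __call__ above: bind parameters to
# arguments in a child environment, then evaluate the body there.
class Env(dict):
    """Toy environment: a dict with a link to its parent scope."""
    def __init__(self, parent=None):
        super().__init__()
        self.parent = parent

def apply_user_function(params, body, args, defining_env):
    # Bind each parameter name to its argument in a child environment,
    # then evaluate the body in that environment.
    call_env = Env(defining_env)
    for name, value in zip(params, args):
        call_env[name] = value
    return body(call_env)

# e.g. a "(lambda (x y) (+ x y))" reduced to a Python closure over the env:
result = apply_user_function(["x", "y"],
                             lambda env: env["x"] + env["y"],
                             [1, 2], Env())
assert result == 3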