def visit_For(self, node):
    """Record a `for` statement: its targets, iterable, and body/else lines.

    Marks node.lineno as a loop definition, attributes the loop targets and
    the iterated expression to that line, then walks every statement in the
    body and the `else` clause, tagging each as a line belonging to this loop.
    """
    utils.add_loop_def(self.data, node.lineno)
    # Loop targets (e.g. `i` in `for i in ...`) are names being bound.
    target_walker = walk_ast_for_names(node.target, self.current_scope)
    utils.add_string_to_data(node.lineno, target_walker.data, target_walker.line)
    utils.add_targets_to_data(node.lineno, self.data, target_walker.data)
    utils.combine_variable_scopes(self.variable_scope, target_walker.variable_scope)
    # The iterable expression contributes names/expressions to the loop line.
    expr_walker = walk_ast_for_names(node.iter, self.current_scope)
    utils.add_string_to_data(node.lineno, expr_walker.data, expr_walker.line)
    utils.combine_data(node.lineno, self.data, expr_walker.data)
    utils.combine_variable_scopes(self.variable_scope, expr_walker.variable_scope)
    for stmts in [node.body, node.orelse]:
        for stmt in stmts:
            # Each statement line is registered as part of this loop.
            utils.add_loop_line(self.data, node.lineno, stmt.lineno)
            walker = walk_ast(stmt, self.current_scope)
            utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
            # Body statements keep their own line numbers, so merge all of
            # the walker's per-line data rather than folding into one line.
            # utils.combine_data(stmt.lineno, self.data, walker.data)
            utils.combine_all_data(self.data, walker.data)
            utils.combine_variable_scopes(self.variable_scope, walker.variable_scope)
    utils.set_type(self.data, node.lineno, 'loop')
def visit_Tuple(self, node):
    """Attribute every element of a tuple literal to the tuple's line.

    Each element is walked in name-collection mode; its expression string,
    per-line data and scope information are merged into this walker.
    """
    lineno = node.lineno
    for element in node.elts:
        element_walker = walk_ast_for_names(element, self.current_scope)
        utils.add_string_to_data(lineno, element_walker.data, element_walker.line)
        utils.combine_data(lineno, self.data, element_walker.data)
        utils.combine_variable_scopes(self.variable_scope,
                                      element_walker.variable_scope)
def visit_Index(self, node):
    """Record a subscript index expression.

    NOTE: `ast.Index` nodes carry no `lineno`, so this relies on
    `self.lineno` having been set by the caller (see `visit_Subscript`,
    which assigns `slice_walker.lineno` before visiting the slice).
    """
    walker = walk_ast_for_expr(node.value, self.current_scope)
    # Replace (not append) the accumulated line with the index expression.
    self.line = walker.line
    utils.add_string_to_data(self.lineno, walker.data, walker.line)
    utils.combine_data(self.lineno, self.data, walker.data)
    utils.combine_variable_scopes(self.variable_scope, walker.variable_scope)
def visit_Dict(self, node):
    """Attribute every *value* of a dict literal to the dict's line.

    Keys are not walked — only the value expressions contribute data and
    scope information. Each value is walked in expression mode and merged.
    """
    lineno = node.lineno
    for value_node in node.values:
        value_walker = walk_ast_for_expr(value_node, self.current_scope)
        utils.add_string_to_data(lineno, value_walker.data, value_walker.line)
        utils.combine_data(lineno, self.data, value_walker.data)
        utils.combine_variable_scopes(self.variable_scope,
                                      value_walker.variable_scope)
def visit_Attribute(self, node):
    """Record an attribute access, building `base.attr` into self.line.

    The dotted name is appended to the current scope's variable list if
    it is not already present.
    """
    walker = walk_ast_for_expr(node.value, self.current_scope)
    utils.combine_data(node.lineno, self.data, walker.data)
    utils.combine_variable_scopes(self.variable_scope, walker.variable_scope)
    self.line += walker.line + '.' + node.attr
    if self.line not in self.variable_scope[self.current_scope]:
        self.variable_scope[self.current_scope].append(self.line)
    # NOTE(review): this records the *base* expression (walker.line), not
    # the full dotted self.line — confirm this asymmetry is intentional.
    utils.add_string_to_data(node.lineno, self.data, walker.line)
def visit_Print(self, node):
    """Record a Python 2 `print` statement (ast.Print).

    Tags the line as type 'print' and attributes every printed value
    expression to it, then continues the generic traversal.
    """
    # print 'Has New Line: {0}'.format(node.nl)
    utils.set_type(self.data, node.lineno, 'print')
    for value in node.values:
        walker = walk_ast_for_expr(value, self.current_scope)
        utils.add_string_to_data(node.lineno, walker.data, walker.line)
        utils.combine_data(node.lineno, self.data, walker.data)
        utils.combine_variable_scopes(self.variable_scope, walker.variable_scope)
    self.generic_visit(node)
def visit_Return(self, node):
    """Record a `return` statement.

    Tags the line as type 'return'; a bare `return` (value is None)
    contributes no expression data.
    """
    utils.set_type(self.data, node.lineno, 'return')
    if node.value is not None:
        walker = walk_ast_for_expr(node.value, self.current_scope)
        utils.combine_data(node.lineno, self.data, walker.data)
        utils.add_string_to_data(node.lineno, self.data, walker.line)
        utils.combine_variable_scopes(self.variable_scope, walker.variable_scope)
def visit_BinOp(self, node):
    """Record a binary operation, building a parenthesised expression string.

    A child walker visits left operand, operator and right operand in order
    so its accumulated `line` reads `left op right`; the result is wrapped
    in parentheses and appended to this walker's line.
    """
    walker = WalkAST(self.current_scope)
    walker.is_bin_op = True
    walker.visit(node.left)
    walker.visit(node.op)
    walker.visit(node.right)
    walker.line = '(' + walker.line + ')'
    utils.add_string_to_data(node.lineno, walker.data, walker.line)
    utils.combine_data(node.lineno, self.data, walker.data)
    utils.combine_variable_scopes(self.variable_scope, walker.variable_scope)
    self.line += walker.line
    utils.add_string_to_data(node.lineno, self.data, self.line)
def visit_UnaryOp(self, node):
    """Record a unary operation as `op operand` (e.g. `not x`)."""
    op_walker = walk_ast(node.op, self.current_scope)
    utils.combine_variable_scopes(self.variable_scope, op_walker.variable_scope)
    expr_walker = walk_ast_for_expr(node.operand, self.current_scope)
    utils.combine_variable_scopes(self.variable_scope, expr_walker.variable_scope)
    utils.add_string_to_data(node.lineno, expr_walker.data, expr_walker.line)
    utils.combine_data(node.lineno, self.data, expr_walker.data)
    self.line += op_walker.line + ' ' + expr_walker.line
    utils.add_string_to_data(node.lineno, self.data, self.line)
def visit_arguments(self, node):
    """Record a function's formal arguments against `self.lineno`.

    NOTE: `ast.arguments` has no `lineno`; the caller (visit_FunctionDef)
    sets `self.lineno` to the def line before visiting.
    """
    if len(node.args) == 0:
        # No arguments: still initialise the expressions entry for the line.
        utils.setup_expressions(self.data, self.lineno)
    else:
        for arg in node.args:
            arg_walker = walk_ast_for_names(arg, self.current_scope)
            if arg_walker.line == 'self':
                # `self` is dropped from the recorded expressions.
                # TODO: Decide if this removal is right
                arg_walker.data[self.lineno]['expressions'].remove('self')
            else:
                utils.add_string_to_data(self.lineno, arg_walker.data,
                                         arg_walker.line)
                utils.combine_data(self.lineno, self.data, arg_walker.data)
                utils.combine_variable_scopes(self.variable_scope,
                                              arg_walker.variable_scope)
def visit_If(self, node):
    """Record an `if` statement: its test plus body and `else` statements.

    The test expression is attributed to the `if` line; each statement in
    the body and orelse is walked on its own line, then the `if` line is
    tagged as type 'conditional'.
    """
    self.line = ''
    test_walker = walk_ast_for_expr(node.test, self.current_scope)
    utils.add_string_to_data(node.lineno, test_walker.data, test_walker.line)
    utils.combine_data(node.lineno, self.data, test_walker.data)
    utils.combine_variable_scopes(self.variable_scope, test_walker.variable_scope)
    for stmts in [node.body, node.orelse]:
        for stmt in stmts:
            walker = walk_ast(stmt, self.current_scope)
            utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
            utils.combine_data(stmt.lineno, self.data, walker.data)
            utils.combine_variable_scopes(self.variable_scope,
                                          walker.variable_scope)
            utils.remove_empty_string(self.data, stmt.lineno)
    utils.set_type(self.data, node.lineno, 'conditional')
def visit_Subscript(self, node):
    """Record a subscript expression, building `value[slice]`."""
    value_walker = walk_ast_for_names(node.value, self.current_scope)
    utils.add_string_to_data(node.lineno, value_walker.data, value_walker.line)
    utils.combine_data(node.lineno, self.data, value_walker.data)
    utils.combine_variable_scopes(self.variable_scope,
                                  value_walker.variable_scope)
    slice_walker = WalkAST(self.current_scope)
    # Slice nodes (e.g. ast.Index) have no lineno of their own, so the
    # subscript's line number is injected before visiting (see visit_Index).
    slice_walker.lineno = node.lineno
    slice_walker.visit(node.slice)
    utils.add_string_to_data(node.lineno, slice_walker.data, slice_walker.line)
    utils.combine_data(node.lineno, self.data, slice_walker.data)
    utils.combine_variable_scopes(self.variable_scope,
                                  slice_walker.variable_scope)
    self.line = '{0}[{1}]'.format(value_walker.line, slice_walker.line)
    utils.add_string_to_data(node.lineno, self.data, self.line)
def visit_List(self, node):
    """Record a list literal; its line is typed 'list_assign'.

    An empty literal contributes the string '[]'; otherwise each element
    is walked in expression mode and merged into this line.
    """
    if len(node.elts) == 0:
        utils.setup_expressions(self.data, node.lineno)
        self.line += '[]'
        utils.add_string_to_data(node.lineno, self.data, '[]')
    else:
        for e in node.elts:
            walker = walk_ast_for_expr(e, self.current_scope)
            utils.add_string_to_data(node.lineno, walker.data, walker.line)
            utils.combine_data(node.lineno, self.data, walker.data)
            utils.combine_variable_scopes(self.variable_scope,
                                          walker.variable_scope)
    # utils.add_string_to_data(node.lineno, self.data, self.line)
    # Visit the Load/Store context node for any context-specific handling.
    self.generic_visit(node.ctx)
    self.data[node.lineno]['type'] = 'list_assign'
    utils.remove_empty_string(self.data, node.lineno)
def visit_BoolOp(self, node):
    """Record a boolean operation, nesting parentheses left-to-right.

    For `a and b and c` the accumulated line becomes
    `((a and b) and c)` — each extra value wraps the running expression.
    """
    op_walker = walk_ast_for_expr(node.op, self.current_scope)
    utils.combine_variable_scopes(self.variable_scope, op_walker.variable_scope)
    first_node = True
    for n in node.values:
        walker = walk_ast_for_expr(n, self.current_scope)
        utils.add_string_to_data(node.lineno, walker.data, walker.line)
        utils.combine_data(node.lineno, self.data, walker.data)
        utils.combine_variable_scopes(self.variable_scope,
                                      walker.variable_scope)
        if first_node:
            first_node = False
            self.line += walker.line
        else:
            self.line = '(' + self.line + ' ' + op_walker.line + ' ' + \
                walker.line + ')'
    utils.add_string_to_data(node.lineno, self.data, self.line)
def visit_Assign(self, node):
    """Record an assignment: targets, their expressions, and the value.

    Each target's name string and its constituent expressions are stored
    against the assign line; the right-hand side is walked as an expression
    and merged in.
    """
    self.line = ''
    utils.set_type(self.data, node.lineno, 'assign')
    for target in node.targets:
        target_walker = walk_ast_for_names(target, self.current_scope)
        utils.set_assign(self.data, node.lineno, target_walker.line)
        utils.set_assigned_expressions(
            self.data, node.lineno, target_walker.line,
            target_walker.data[node.lineno]['expressions'])
        utils.combine_variable_scopes(self.variable_scope,
                                      target_walker.variable_scope)
        # utils.combine_data(node.lineno, self.data, target_walker.data)
    value_walker = walk_ast_for_expr(node.value, self.current_scope)
    utils.add_string_to_data(node.lineno, value_walker.data, value_walker.line)
    utils.combine_data(node.lineno, self.data, value_walker.data)  # TODO
    utils.combine_variable_scopes(self.variable_scope,
                                  value_walker.variable_scope)
def visit_While(self, node):
    """Record a `while` statement: its test plus body/else lines.

    Mirrors visit_For: marks node.lineno as a loop definition, attributes
    the test expression to it, walks every statement in the body and the
    `else` clause as loop lines, then tags the line as type 'loop'.
    """
    utils.add_loop_def(self.data, node.lineno)
    # Consistency fix: use the walk_ast_for_names helper as visit_For,
    # visit_Compare and visit_Call do, instead of hand-building a WalkAST
    # with get_names = True (the helper encapsulates exactly that setup).
    test_walker = walk_ast_for_names(node.test, self.current_scope)
    utils.add_string_to_data(node.lineno, test_walker.data, test_walker.line)
    utils.combine_data(node.lineno, self.data, test_walker.data)
    utils.combine_variable_scopes(self.variable_scope,
                                  test_walker.variable_scope)
    for stmts in [node.body, node.orelse]:
        for stmt in stmts:
            # Each statement line is registered as part of this loop.
            utils.add_loop_line(self.data, node.lineno, stmt.lineno)
            walker = walk_ast(stmt, self.current_scope)
            utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
            # Body statements keep their own line numbers, so merge all of
            # the walker's per-line data rather than folding into one line.
            # utils.combine_data(stmt.lineno, self.data, walker.data)
            utils.combine_all_data(self.data, walker.data)
            utils.combine_variable_scopes(self.variable_scope,
                                          walker.variable_scope)
    utils.set_type(self.data, node.lineno, 'loop')
def visit_Compare(self, node):
    """Record a comparison, chaining `left op1 right1 op2 right2 ...`.

    node.ops and node.comparators are parallel lists (chained comparisons
    like `a < b < c`), so they are zipped and appended in order.
    """
    left_walker = walk_ast_for_names(node.left, self.current_scope)
    utils.add_string_to_data(node.lineno, left_walker.data, left_walker.line)
    utils.combine_data(node.lineno, self.data, left_walker.data)
    utils.combine_variable_scopes(self.variable_scope,
                                  left_walker.variable_scope)
    self.line = left_walker.line
    for op, comparator in zip(node.ops, node.comparators):
        op_walker = walk_ast_for_expr(op, self.current_scope)
        utils.combine_variable_scopes(self.variable_scope,
                                      op_walker.variable_scope)
        comparator_walker = walk_ast_for_expr(comparator, self.current_scope)
        utils.add_string_to_data(node.lineno, comparator_walker.data,
                                 comparator_walker.line)
        utils.combine_data(node.lineno, self.data, comparator_walker.data)
        utils.combine_variable_scopes(self.variable_scope,
                                      comparator_walker.variable_scope)
        self.line += op_walker.line + comparator_walker.line
def visit_FunctionDef(self, node):
    """Record a function definition and walk its arguments and body.

    Registers the def line as type 'func', creates a fresh variable-scope
    entry keyed by the function name, and walks arguments, body statements
    and decorators with that name as the current scope.
    """
    utils.set_type(self.data, node.lineno, 'func')
    utils.set_name(self.data, node.lineno, node.name)
    utils.add_function_def(self.data, node.name, node.lineno)
    # New scope for names defined inside this function.
    self.variable_scope[node.name] = []
    arg_walker = WalkAST(node.name)
    # ast.arguments has no lineno; inject the def line (see visit_arguments).
    arg_walker.lineno = node.lineno
    arg_walker.visit(node.args)
    utils.combine_data(node.lineno, self.data, arg_walker.data)
    utils.combine_variable_scopes(self.variable_scope,
                                  arg_walker.variable_scope)
    for stmts in [node.body, node.decorator_list]:
        for stmt in stmts:
            walker = walk_ast(stmt, node.name)
            utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
            utils.add_function_line(self.data, node.name, stmt.lineno)
            # Body statements keep their own line numbers, so merge all of
            # the walker's per-line data rather than folding into one line.
            # utils.combine_data(stmt.lineno, self.data, walker.data)
            utils.combine_all_data(self.data, walker.data)
            utils.combine_variable_scopes(self.variable_scope,
                                          walker.variable_scope)
def visit_AugAssign(self, node):
    """Record an augmented assignment (e.g. `x += y`) as type 'assign'.

    Builds the line string `target op value` and stores the target as the
    assigned name.
    """
    target_walker = walk_ast_for_names(node.target, self.current_scope)
    utils.add_string_to_data(node.lineno, target_walker.data,
                             target_walker.line)
    utils.combine_data(node.lineno, self.data, target_walker.data)
    utils.combine_variable_scopes(self.variable_scope,
                                  target_walker.variable_scope)
    op_walker = walk_ast_for_expr(node.op, self.current_scope)
    utils.combine_variable_scopes(self.variable_scope,
                                  op_walker.variable_scope)
    value_walker = walk_ast_for_expr(node.value, self.current_scope)
    utils.combine_variable_scopes(self.variable_scope,
                                  value_walker.variable_scope)
    utils.add_string_to_data(node.lineno, value_walker.data, value_walker.line)
    utils.combine_data(node.lineno, self.data, value_walker.data)
    self.line = '{0}{1}{2}'.format(target_walker.line, op_walker.line,
                                   value_walker.line)
    utils.add_string_to_data(node.lineno, self.data, self.line)
    utils.set_type(self.data, node.lineno, 'assign')
    utils.set_assign(self.data, node.lineno, target_walker.line)
def visit_With(self, node):
    """Record a `with` statement (Python 2 single-item form).

    The context expression is attributed to the `with` line; an `as` target
    (optional_vars) makes the line an assignment; body statements are walked
    on their own lines.
    """
    context_expr = walk_ast_for_expr(node.context_expr, self.current_scope)
    utils.add_string_to_data(node.lineno, context_expr.data,
                             context_expr.line)
    utils.combine_data(node.lineno, self.data, context_expr.data)
    utils.combine_variable_scopes(self.variable_scope,
                                  context_expr.variable_scope)
    if node.optional_vars is not None:
        # `with expr as name:` binds `name` like an assignment.
        optional_vars_walker = walk_ast_for_names(node.optional_vars,
                                                  self.current_scope)
        utils.set_type(self.data, node.lineno, 'assign')
        utils.set_assign(self.data, node.lineno, optional_vars_walker.line)
        utils.combine_variable_scopes(self.variable_scope,
                                      optional_vars_walker.variable_scope)
    for stmt in node.body:
        walker = walk_ast(stmt, self.current_scope)
        utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
        utils.combine_data(stmt.lineno, self.data, walker.data)
        utils.combine_variable_scopes(self.variable_scope,
                                      walker.variable_scope)
        utils.remove_empty_string(self.data, stmt.lineno)
def visit_Call(self, node):
    """Record a call, building `func(arg1,arg2,...)` into self.line.

    The callee name is also registered via add_additional_lines so the
    call site can later be linked to the called function's lines.
    """
    func_walker = walk_ast_for_names(node.func, self.current_scope)
    if not isinstance(node.func, ast.Name):
        # Compound callees (attributes, subscripts) contribute their own
        # data; a plain Name callee would only add the function name.
        utils.combine_data(node.lineno, self.data, func_walker.data)
        utils.combine_variable_scopes(self.variable_scope,
                                      func_walker.variable_scope)
    self.line += func_walker.line + '('
    first_arg = True
    for arg in node.args:
        arg_walker = walk_ast_for_expr(arg, self.current_scope)
        utils.add_string_to_data(node.lineno, arg_walker.data,
                                 arg_walker.line)
        utils.combine_data(node.lineno, self.data, arg_walker.data)
        utils.combine_variable_scopes(self.variable_scope,
                                      arg_walker.variable_scope)
        if not first_arg:
            self.line += ','
        first_arg = False
        self.line += arg_walker.line
    self.line += ')'
    utils.add_additional_lines(self.data, node.lineno, func_walker.line)
    utils.add_string_to_data(node.lineno, self.data, self.line)
def load_data(gold: bool) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load, pre-process and combine the required data.

    Args:
        gold: whether to use GOLD annotations or PRED annotations.

    Returns:
        all the features combined, verb senses, sense labels ground truth.
    """
    # Pick the GOLD or PRED variants of the pickled feature files.
    if gold:
        features_path = 'data/features/GOLD/images_features_nametrim.pkl'
        embeddings_path = 'data/features/GOLD/verse_embedding.pkl'
    else:
        features_path = 'data/features/PRED/images_features_new.pkl'
        embeddings_path = 'data/features/PRED/pred_verse_embedding.pkl'
    images_features = pd.read_pickle(features_path)
    embedded_annotations = pd.read_pickle(embeddings_path)

    # L2-normalise each image embedding vector.
    images_features['e_image'] = images_features['e_image'].apply(
        lambda vec: vec / np.linalg.norm(vec, ord=2))
    full_features = combine_data(embedded_annotations, images_features)

    senses = pd.read_csv('data/labels/verse_visualness_labels.tsv',
                         sep='\t',
                         dtype={'sense_num': str})
    sense_labels = pd.read_csv(
        'data/labels/3.5k_verse_gold_image_sense_annotations.csv',
        dtype={'sense_chosen': str})
    if gold:
        sense_labels['image'] = sense_labels['image'].apply(filter_image_name)

    # Drop unclassifiable elements
    sense_labels = sense_labels[sense_labels['sense_chosen'] != '-1']
    sense_labels.reset_index(inplace=True, drop=True)

    return full_features, senses, sense_labels
def load_data_and_predict(parameters):
    """
    Preprocess input data and make predictions.

    Converts the user input file to the model's PKL format, derives the
    configured feature encodings (BPE, tokenization, word/BPE embeddings),
    combines them, runs the model, and writes predictions in the requested
    output format ('iob' or source/target).

    :param parameters: dict of run configuration (input/output paths, model,
        feature flags, vocabularies); mutated in place ('input' is rebased
        onto the generated PKL file).
    :return: None — all results are written to disk via utils/postprocess.
    """
    # Convert user input to model input PKL format
    pkl_file = parameters['input'] + '.pkl'
    preprocess.write_user_input_to_model_input(parameters['input'], '',
                                               pkl_file)
    parameters['input'] = pkl_file
    max_chars_in_sample = parameters['max_chars_in_sample']
    # Half-sample stride gives overlapping windows at prediction time
    # (same setting used for dev/test data elsewhere in this project).
    utils.load_data_stride_x_chars_enc_dec(parameters['input'], parameters,
                                           stride=max_chars_in_sample / 2)
    file_ext = parameters['file_ext']
    char_to_num = parameters['char_to_num']
    if parameters['use_bpe']:
        if parameters['bpe_codes_file']:
            codes_file = parameters['bpe_codes_file']
        else:
            # Prediction cannot train new BPE codes; a codes file is required.
            print 'BPE codes file does not exist!'
            exit()
        utils.load_bpe_data(
            parameters['input'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, char_to_num, codes_file,
            parameters['input'] + '.bpe-coded-' +
            os.path.basename(parameters['bpe_codes_file']) + '.' +
            str(max_chars_in_sample) + file_ext, parameters)
    if parameters['use_tokenization']:
        tok_char_to_num = parameters['tok_char_to_num']
        utils.load_tok_data(
            parameters['input'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, tok_char_to_num,
            parameters['input'] + '.tok.' + str(max_chars_in_sample) +
            file_ext, parameters)
    if parameters['use_word_embeddings']:
        vocab_dict = parameters['word_vocab_dict']
        # run word embedding features on dev and test data
        utils.load_word_embeddings_data(
            parameters['input'] + '.bpe.' + str(max_chars_in_sample) +
            '.pkl', vocab_dict,
            parameters['input'] + '.word-coded.' +
            str(max_chars_in_sample) + '.pkl', parameters)
    if parameters['use_bpe_embeddings']:
        if parameters['bpe_codes_file']:
            codes_file = parameters['bpe_codes_file']
        else:
            print 'BPE codes file does not exist!'
            exit()
        vocab_dict = parameters['bpe_vocab_dict']
        # run word embedding features on dev and test data
        utils.load_bpe_embeddings_data(
            parameters['input'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, vocab_dict, codes_file,
            parameters['input'] + '.bpe-embed-coded-' +
            os.path.basename(codes_file) + '.' + str(max_chars_in_sample) +
            file_ext, parameters)
    # determine what will be the combined data file
    combined_data_ext = utils.generate_combined_feat_ext(parameters)
    # Combine data features as necessary
    utils.combine_data(parameters['input'], combined_data_ext, parameters)
    # Make predictions
    metrics = utils.MetricsCheckpoint(parameters['model'], parameters)
    metrics.make_and_format_predictions(parameters['input'],
                                        metrics.test_X_data, parameters)
    # Output in specified format
    if parameters['output_format'] == 'iob':
        postprocess.ord_byte_iob_to_byte_iob(parameters['output'])
    else:  # st
        postprocess.byte_iob_to_src_tgt(parameters['output'], ordinal=True)
def main(args): # User parameters parser = OptionParser() parser.add_option('--model_path', help='Output file to save model to') parser.add_option('--input_format', default='st', help='Input format [iob|st]') parser.add_option( '--space_token', default='<SPACE>', help= 'If input format is iob, then use space_token for spaces between bytes' ) parser.add_option( '--train_data_file', help= 'Training data: if input_format == "st", then there are two input files: train_data_file.src and train_data_file.tgt' ) parser.add_option( '--dev_data_file', help= 'Dev data: if input_format == "st", then there are two input files: dev_data_file.src and dev_data_file.tgt' ) parser.add_option( '--test_data_file', help= 'Test data: if input_format == "st", then there are two input files: test_data_file.src and test_data_file.tgt' ) parser.add_option('--max_chars_in_sample', default=150, help='Max number of characters in a data sample') parser.add_option('--embedding_input_dim', default=256 + 1, help='Dimension of input vectors') parser.add_option('--embedding_output_dim', default=100, help='Dimension of output embedding vectors') parser.add_option('--embedding_max_len', default=150, help='Set length of input data (in byte characters)') parser.add_option('--cnn_filters', default=250, help='Number of filters in CNN output') parser.add_option('--cnn_kernel_size', default=7, help='Kernel size') parser.add_option('--cnn_padding', default='same', help='Type of border for CNN') parser.add_option('--cnn_act', default='relu', help='Activation fn for CNN') parser.add_option('--dense_final_act', default='softmax', help='Final activation fn in network') parser.add_option('--optimizer', default='adam', help='Optimizer') parser.add_option('--tag_scheme', default='iobes', help='IOBES or IOB2 tag scheme') parser.add_option('--batch_size', default=256, help='Number of samples to process in one batch') parser.add_option('--dropout', default=0.5, help='Fraction of input units to dropout') 
parser.add_option('--lstm_units', default=100, help=0) parser.add_option('--lstm_act', default='tanh', help='Activation fn for LSTM') parser.add_option('--num_byte_layers', default=20, help='Number of CNN layers in architecture') parser.add_option('--nb_workers', default=1, help='Number of threads or processes to use') parser.add_option('--nb_epochs', default=300, help='Number of epochs to train model for') parser.add_option('--residual', default=1, help='Whether to use residual connections') parser.add_option( '--skip_residuals', default=0, help='Whether to add a residual connection every other conv layer') parser.add_option('--lr', default=0.00005, help='Learning rate of optimizer') parser.add_option('--use_bpe', default=0, help='Whether to use byte pair encodings') parser.add_option('--num_operations', default=50000, help="Number of merge operations for BPE algorithm") parser.add_option('--reload', default=0, help="Whether to reload a previously trained model") parser.add_option('--blstm_on_top', default=1, help="Whether to use a BLSTM layer on top of the CNNs") parser.add_option('--crf_on_top', default=1, help="Whether to use a CRF layer on top") parser.add_option( '--use_word_embeddings', default=0, help="Whether to also use pretrained word embeddings as input") parser.add_option( '--use_bytes', default=1, help="Whether to use byte embedding as input. Default is true.") parser.add_option( '--word_embeddings_file', default='', help= "Pretrained word embedding file. This needs to be set in order to use word embeddings." ) parser.add_option('--word_embedding_dim', default=200, help="Dimension of pretrained word embeddings") parser.add_option('--bpe_codes_file', default='pmc_codes_file_ALL_50000', help="Pretrained BPE file") parser.add_option( '--use_bpe_embeddings', default=1, help= "Whether to use pretrained BPE embeddings as input. 
Cannot be used with word embeddings" ) parser.add_option( '--bpe_embeddings_file', default='models/bpe2vec.csv', help= "Pretrained BPE embedding file. This needs to be set in order to use BPE embeddings" ) parser.add_option('--bpe_embedding_dim', default=100, help="Dimension of pretrained BPE embeddings") parser.add_option( '--byte_layer_for_embed', default=1, help= "Whether to use embedding inputs as inputs for the CNN layers (or only BLSTM layers)" ) parser.add_option('--layer_for_bytes', default='cnn', help="stack of cnns or a blstm layer for bytes") parser.add_option('--temp_dir', default='evaluation/temp', help="Directory to write evaluation scores") parser.add_option( '--drop_bytes', default=1, help="Whether to drop a fraction of bytes for each input") parser.add_option('--byte_drop_fraction', default=0.3, help="Fraction of byte input to drop") parser.add_option( '--train_data_stride', default=75, help= "Stride in number of bytes to shift window to get next training sample" ) parser.add_option('--trainable_bpe_embeddings', default=1, help="Whether BPE embeddings should be traininable") parser.add_option( '--override_parameters', default=0, help= "Whether to use specified parameters or parameters from saved model, if they exist" ) parser.add_option('--get_probs', default=0, help="Get normalized log likelihoods of each sample") parser.add_option( '--get_vectors', default=0, help= "Get output vectors of second-to-last layer in the network. 
Currently only tested with the CNN-BLSTM-CRF configuration" ) parser.add_option('--use_tokenization', default=0, help="Use tokenization features") parser.add_option( '--repickle_data', default=0, help= "Whether to re-process and pickle data even if the pickle file already exists" ) parser.add_option( '--make_samples_unique', default=0, help= "Clean training data so that the user-provided samples are all unique") opts = parser.parse_args(args)[0] # Parameters parameters = OrderedDict() parameters['override_parameters'] = int(opts.override_parameters) == 1 parameters['model_path'] = opts.model_path parameters['reload'] = int(opts.reload) == 1 parameters_path = parameters['model_path'] + '_parameters.pkl' if parameters['reload'] or parameters[ 'override_parameters']: # load parameters from file with open(parameters_path, 'rb') as f: parameters = pkl.load(f) else: # set parameters parameters['max_chars_in_sample'] = int(opts.max_chars_in_sample) parameters['embedding_input_dim'] = int(opts.embedding_input_dim) parameters['embedding_output_dim'] = int(opts.embedding_output_dim) parameters['embedding_max_len'] = int(opts.embedding_max_len) parameters['cnn_filters'] = int(opts.cnn_filters) parameters['cnn_kernel_size'] = int(opts.cnn_kernel_size) parameters['cnn_padding'] = opts.cnn_padding parameters['cnn_act'] = opts.cnn_act parameters['dense_final_act'] = opts.dense_final_act parameters['optimizer'] = opts.optimizer parameters['tag_scheme'] = opts.tag_scheme parameters['dropout'] = float(opts.dropout) parameters['lstm_units'] = int(opts.lstm_units) parameters['lstm_act'] = opts.lstm_act parameters['num_byte_layers'] = int(opts.num_byte_layers) parameters['residual'] = int(opts.residual) == 1 parameters['skip_residuals'] = int(opts.skip_residuals) == 1 parameters['lr'] = float(opts.lr) parameters['use_bpe'] = int(opts.use_bpe) == 1 parameters['num_operations'] = int( opts.num_operations) # will be overridden if bpe_codes_file is set parameters['blstm_on_top'] = 
int(opts.blstm_on_top) == 1 parameters['crf_on_top'] = int(opts.crf_on_top) == 1 parameters['use_bytes'] = int(opts.use_bytes) == 1 parameters['word_embeddings_file'] = opts.word_embeddings_file if parameters['word_embeddings_file']: parameters['use_word_embeddings'] = int( opts.use_word_embeddings) == 1 else: parameters['use_word_embeddings'] = False parameters['word_embeddings_dim'] = int(opts.word_embedding_dim) parameters['bpe_codes_file'] = opts.bpe_codes_file if parameters['bpe_codes_file']: with open(parameters['bpe_codes_file'], 'rU') as f: lines = f.readlines() num_operations = len(lines) - 2 parameters['num_operations'] = num_operations parameters['bpe_embeddings_file'] = opts.bpe_embeddings_file if parameters['bpe_embeddings_file']: parameters['use_bpe_embeddings'] = int( opts.use_bpe_embeddings) == 1 else: parameters['use_bpe_embeddings'] = False parameters['bpe_embeddings_dim'] = int(opts.bpe_embedding_dim) parameters['byte_layer_for_embed'] = int( opts.byte_layer_for_embed) == 1 parameters['layer_for_bytes'] = opts.layer_for_bytes parameters['trainable_bpe_embeddings'] = int( opts.trainable_bpe_embeddings) == 1 parameters['use_tokenization'] = int( opts.use_tokenization) == 1 # in IOBES format parameters['space_token'] = opts.space_token # set data files parameters['train_data_file'] = opts.train_data_file parameters['dev_data_file'] = opts.dev_data_file parameters['test_data_file'] = opts.test_data_file parameters['get_probs'] = int(opts.get_probs) == 1 parameters['get_vectors'] = int(opts.get_vectors) == 1 parameters['repickle_data'] = int(opts.repickle_data) == 1 parameters['input_format'] = opts.input_format parameters['make_samples_unique'] = int(opts.make_samples_unique) == 1 parameters['temp_dir'] = opts.temp_dir parameters['drop_bytes'] = int(opts.drop_bytes) == 1 parameters['byte_drop_fraction'] = float(opts.byte_drop_fraction) parameters['train_data_stride'] = int(opts.train_data_stride) parameters['override_parameters'] = 
int(opts.override_parameters) == 1 parameters['model_path'] = opts.model_path parameters['reload'] = int(opts.reload) == 1 parameters['batch_size'] = int(opts.batch_size) parameters['nb_workers'] = int(opts.nb_workers) parameters['nb_epochs'] = int(opts.nb_epochs) # Convert user input to model input PKL format if PKL file doesn't already exist train_pkl_file = parameters['train_data_file'] + '.pkl' dev_pkl_file = parameters['dev_data_file'] + '.pkl' test_pkl_file = parameters['test_data_file'] + '.pkl' if parameters['repickle_data'] or parameters[ 'make_samples_unique'] or not (os.path.isfile(train_pkl_file) and os.path.isfile(dev_pkl_file) and os.path.isfile(test_pkl_file)): train_src_file = parameters['train_data_file'] + '.src' train_tgt_file = parameters['train_data_file'] + '.tgt' dev_src_file = parameters['dev_data_file'] + '.src' dev_tgt_file = parameters['dev_data_file'] + '.tgt' test_src_file = parameters['test_data_file'] + '.src' test_tgt_file = parameters['test_data_file'] + '.tgt' if parameters['input_format'] == 'iob': # convert to st format preprocess.write_user_byte_iob_input_to_src_tgt_input( parameters['train_data_file'], train_src_file, train_tgt_file, space_token=parameters['space_token']) preprocess.write_user_byte_iob_input_to_src_tgt_input( parameters['dev_data_file'], dev_src_file, dev_tgt_file, space_token=parameters['space_token']) preprocess.write_user_byte_iob_input_to_src_tgt_input( parameters['test_data_file'], test_src_file, test_tgt_file, space_token=parameters['space_token']) # train_pkl_file = parameters['train_data_file'] + '.pkl' # dev_pkl_file = parameters['dev_data_file'] + '.pkl' # test_pkl_file = parameters['test_data_file'] + '.pkl' if parameters['make_samples_unique']: # unique training samples unique_train_file = utils.remove_duplicate_samples( parameters['train_data_file']) train_src_file = unique_train_file + '.src' train_tgt_file = unique_train_file + '.tgt' preprocess.write_user_input_to_model_input(train_src_file, 
train_tgt_file, train_pkl_file) preprocess.write_user_input_to_model_input(dev_src_file, dev_tgt_file, dev_pkl_file) preprocess.write_user_input_to_model_input(test_src_file, test_tgt_file, test_pkl_file) parameters['train_data_file'] = train_pkl_file parameters['dev_data_file'] = dev_pkl_file parameters['test_data_file'] = test_pkl_file # Load train, dev, and test data tags = utils.collect_tags(parameters['train_data_file']) parameters['tags'] = tags tag_to_num = utils.build_tag_to_num(tags) parameters['tag_to_num'] = tag_to_num char_to_num = utils.build_char_to_num() parameters['char_to_num'] = char_to_num if parameters['use_tokenization']: tok_char_to_num = utils.build_tok_char_to_num() parameters['tok_char_to_num'] = tok_char_to_num enc_dec_tag_to_num = utils.build_enc_dec_tag_to_num(parameters) parameters['enc_dec_tag_to_num'] = enc_dec_tag_to_num parameters['num_enc_dec_tags'] = len(enc_dec_tag_to_num) parameters['num_iobes_tags'] = len(tag_to_num) max_chars_in_sample = parameters['max_chars_in_sample'] parameters['enc_dec_output_length'] = max_chars_in_sample print 'Parameters' for param_key in parameters: if 'vocab_dict' not in param_key: print '\t', param_key, ': ', parameters[param_key] parameters['file_ext'] = '.pkl' utils.load_data_stride_x_chars_enc_dec( parameters['train_data_file'], parameters, stride=parameters['train_data_stride']) utils.load_data_stride_x_chars_enc_dec(parameters['dev_data_file'], parameters, stride=max_chars_in_sample / 2) utils.load_data_stride_x_chars_enc_dec(parameters['test_data_file'], parameters, stride=max_chars_in_sample / 2) file_ext = parameters['file_ext'] if parameters['drop_bytes']: char_to_num['<DROP>'] = len(char_to_num) utils.byte_dropout(parameters['train_data_file'], parameters) parameters['train_data_file'] += '.bytedrop' if parameters['use_bpe']: if parameters['bpe_codes_file']: codes_file = parameters['bpe_codes_file'] else: codes_file = 'codes_file.' 
+ str(parameters['num_operations']) if not os.path.isfile(codes_file): # run bpe on train data utils.gen_bpe_code_file( parameters['train_data_file'] + '.bpe.' + str(max_chars_in_sample) + '.pkl', parameters['num_operations'], codes_file) parameters['bpe_codes_file'] = codes_file # add code_file to char_to_num utils.add_bpe_to_vocab_dictionary(codes_file, char_to_num) # use codes to run bpe on data utils.load_bpe_data( parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' + str(max_chars_in_sample) + file_ext, char_to_num, codes_file, parameters['train_data_file'].replace('.bytedrop', '') + '.bpe-coded-' + os.path.basename(parameters['bpe_codes_file']) + '.' + str(max_chars_in_sample) + file_ext, parameters) utils.load_bpe_data( parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) + file_ext, char_to_num, codes_file, parameters['dev_data_file'] + '.bpe-coded-' + os.path.basename(parameters['bpe_codes_file']) + '.' + str(max_chars_in_sample) + file_ext, parameters) utils.load_bpe_data( parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) + file_ext, char_to_num, codes_file, parameters['test_data_file'] + '.bpe-coded-' + os.path.basename(parameters['bpe_codes_file']) + '.' + str(max_chars_in_sample) + file_ext, parameters) if parameters['use_tokenization']: utils.load_tok_data( parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' + str(max_chars_in_sample) + file_ext, tok_char_to_num, parameters['train_data_file'].replace('.bytedrop', '') + '.tok.' + str(max_chars_in_sample) + file_ext, parameters) utils.load_tok_data( parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) + file_ext, tok_char_to_num, parameters['dev_data_file'] + '.tok.' + str(max_chars_in_sample) + file_ext, parameters) utils.load_tok_data( parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) + file_ext, tok_char_to_num, parameters['test_data_file'] + '.tok.' 
+ str(max_chars_in_sample) + file_ext, parameters) if parameters['use_word_embeddings']: vocab_dict = OrderedDict() # add vocab to vocab_dict print 'Adding words...' utils.add_embeddings_to_vocab_dictionary( parameters['word_embeddings_file'], vocab_dict, parameters) print 'Done adding words' # add unknown, space, and pad embedding to vocab dictionary vocab_dict['<UNKNOWN>'] = (len(vocab_dict), np.random.rand( parameters['word_embeddings_dim'])) vocab_dict['<SPACE>'] = (len(vocab_dict), np.random.rand( parameters['word_embeddings_dim'])) # run word embedding features on dev and test data utils.load_word_embeddings_data( parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' + str(max_chars_in_sample) + '.pkl', vocab_dict, parameters['train_data_file'].replace('.bytedrop', '') + '.word-coded.' + str(max_chars_in_sample) + '.pkl', parameters) utils.load_word_embeddings_data( parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) + '.pkl', vocab_dict, parameters['dev_data_file'] + '.word-coded.' + str(max_chars_in_sample) + '.pkl', parameters) utils.load_word_embeddings_data( parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) + '.pkl', vocab_dict, parameters['test_data_file'] + '.word-coded.' + str(max_chars_in_sample) + '.pkl', parameters) parameters['word_vocab_dict'] = vocab_dict if parameters['use_bpe_embeddings']: if parameters['bpe_codes_file']: codes_file = parameters['bpe_codes_file'] else: codes_file = 'codes_file.' + str(parameters['num_operations']) parameters['bpe_codes_file'] = codes_file vocab_dict = OrderedDict() # add vocab to vocab_dict print 'Adding BPE embeddings...' 
utils.add_embeddings_to_vocab_dictionary( parameters['bpe_embeddings_file'], vocab_dict, parameters) print 'Done adding BPE embeddings' # add unknown, space, and pad embedding to vocab dictionary vocab_dict['<UNKNOWN>'] = (len(vocab_dict), np.random.rand( parameters['bpe_embeddings_dim'])) # run word embedding features on dev and test data utils.load_bpe_embeddings_data( parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' + str(max_chars_in_sample) + file_ext, vocab_dict, codes_file, parameters['train_data_file'].replace('.bytedrop', '') + '.bpe-embed-coded-' + os.path.basename(codes_file) + '.' + str(max_chars_in_sample) + file_ext, parameters) utils.load_bpe_embeddings_data( parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) + file_ext, vocab_dict, codes_file, parameters['dev_data_file'] + '.bpe-embed-coded-' + os.path.basename(codes_file) + '.' + str(max_chars_in_sample) + file_ext, parameters) utils.load_bpe_embeddings_data( parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) + file_ext, vocab_dict, codes_file, parameters['test_data_file'] + '.bpe-embed-coded-' + os.path.basename(codes_file) + '.' + str(max_chars_in_sample) + file_ext, parameters) parameters['bpe_vocab_dict'] = vocab_dict # update embedding params parameters['embedding_input_dim'] = len(char_to_num) print 'Total embedding dimensions:', parameters['embedding_input_dim'] # determine what will be the combined data file combined_data_ext = utils.generate_combined_feat_ext(parameters) # Combine data features as necessary utils.combine_data(parameters['train_data_file'], combined_data_ext, parameters) utils.combine_data(parameters['dev_data_file'], combined_data_ext, parameters) utils.combine_data(parameters['test_data_file'], combined_data_ext, parameters) # Save parameters if not parameters['reload'] and not parameters['override_parameters']: with open(parameters_path, 'wb') as p: pkl.dump(parameters, p) train_model(parameters)