Ejemplo n.º 1
0
 def visit_For(self, node):
     """Record a ``for`` loop: its targets, its iterable, and its body.

     Registers the loop definition on the header line, walks the loop
     target(s) and the iterated expression for names, then walks every
     statement in the body and ``else`` clause, tagging each statement
     line as belonging to this loop.
     """
     # print '{0}: For'.format(node.lineno)
     utils.add_loop_def(self.data, node.lineno)
     # Names bound by the loop target are recorded as assignment targets.
     target_walker = walk_ast_for_names(node.target, self.current_scope)
     utils.add_string_to_data(node.lineno, target_walker.data,
                              target_walker.line)
     utils.add_targets_to_data(node.lineno, self.data, target_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   target_walker.variable_scope)
     # Names read by the iterable are ordinary expressions on this line.
     expr_walker = walk_ast_for_names(node.iter, self.current_scope)
     utils.add_string_to_data(node.lineno, expr_walker.data,
                              expr_walker.line)
     utils.combine_data(node.lineno, self.data, expr_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   expr_walker.variable_scope)
     for stmts in [node.body, node.orelse]:
         for stmt in stmts:
             # Associate each statement's line with this loop header.
             utils.add_loop_line(self.data, node.lineno, stmt.lineno)
             walker = walk_ast(stmt, self.current_scope)
             utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
             # utils.combine_data(stmt.lineno, self.data, walker.data)
             utils.combine_all_data(self.data, walker.data)
             utils.combine_variable_scopes(self.variable_scope,
                                           walker.variable_scope)
     utils.set_type(self.data, node.lineno, 'loop')
Ejemplo n.º 2
0
 def visit_Tuple(self, node):
     """Record every element of a tuple literal as an expression
     attributed to the tuple's own line.
     """
     for element in node.elts:
         element_walker = walk_ast_for_names(element, self.current_scope)
         utils.add_string_to_data(node.lineno, element_walker.data,
                                  element_walker.line)
         utils.combine_data(node.lineno, self.data, element_walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       element_walker.variable_scope)
Ejemplo n.º 3
0
 def visit_Index(self, node):
     """Walk the expression inside an ``Index`` slice node and adopt
     its rendering as this walker's current line.

     ``Index`` nodes carry no line number, so ``self.lineno`` (set by
     the enclosing Subscript visit) is used instead.
     """
     value_walker = walk_ast_for_expr(node.value, self.current_scope)
     self.line = value_walker.line
     utils.add_string_to_data(self.lineno, value_walker.data,
                              value_walker.line)
     utils.combine_data(self.lineno, self.data, value_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   value_walker.variable_scope)
Ejemplo n.º 4
0
 def visit_Dict(self, node):
     """Collect expressions from each value of a dict literal.

     NOTE(review): the dict keys are not walked here — presumably
     intentional (keys are usually constants), but worth confirming.
     """
     for dict_value in node.values:
         value_walker = walk_ast_for_expr(dict_value, self.current_scope)
         utils.add_string_to_data(node.lineno, value_walker.data,
                                  value_walker.line)
         utils.combine_data(node.lineno, self.data, value_walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       value_walker.variable_scope)
Ejemplo n.º 5
0
 def visit_Attribute(self, node):
     """Render an attribute access as ``value.attr``.

     The dotted name is appended to ``self.line`` and registered as a
     variable in the current scope if not already present.
     """
     # print '{0}: Attribute attr: {1}'.format(node.lineno, node.attr)
     walker = walk_ast_for_expr(node.value, self.current_scope)
     utils.combine_data(node.lineno, self.data, walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   walker.variable_scope)
     self.line += walker.line + '.' + node.attr
     # Track the full dotted name as a variable of the current scope.
     if self.line not in self.variable_scope[self.current_scope]:
         self.variable_scope[self.current_scope].append(self.line)
     # NOTE(review): only the base value's rendering (walker.line) is
     # added to the data here, not the full dotted name — confirm this
     # asymmetry is intentional.
     utils.add_string_to_data(node.lineno, self.data, walker.line)
Ejemplo n.º 6
0
 def visit_Print(self, node):
     """Handle a Python 2 ``print`` statement (``ast.Print`` node).

     Tags the line as ``print`` and records every printed expression,
     then continues the generic traversal of child nodes.
     """
     # print '{0}: Print'.format(node.lineno)
     # print 'Has New Line: {0}'.format(node.nl)
     utils.set_type(self.data, node.lineno, 'print')
     for value in node.values:
         # walker = walk_ast_for_names(value)
         walker = walk_ast_for_expr(value, self.current_scope)
         utils.add_string_to_data(node.lineno, walker.data, walker.line)
         utils.combine_data(node.lineno, self.data, walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       walker.variable_scope)
     self.generic_visit(node)
Ejemplo n.º 7
0
 def visit_Return(self, node):
     """Mark this line as a ``return`` and, when a value is returned,
     fold the returned expression's data into this walker.
     """
     utils.set_type(self.data, node.lineno, 'return')
     if node.value is None:
         # Bare ``return`` — nothing further to record.
         return
     value_walker = walk_ast_for_expr(node.value, self.current_scope)
     utils.combine_data(node.lineno, self.data, value_walker.data)
     utils.add_string_to_data(node.lineno, self.data, value_walker.line)
     utils.combine_variable_scopes(self.variable_scope,
                                   value_walker.variable_scope)
Ejemplo n.º 8
0
 def visit_BinOp(self, node):
     """Render a binary operation as ``(left op right)``.

     A child walker visits the operands and operator so their combined
     rendering can be parenthesised as one unit before being appended
     to this walker's line.
     """
     walker = WalkAST(self.current_scope)
     walker.is_bin_op = True
     walker.visit(node.left)
     walker.visit(node.op)
     walker.visit(node.right)
     # Parenthesise to preserve grouping inside larger expressions.
     walker.line = '(' + walker.line + ')'
     utils.add_string_to_data(node.lineno, walker.data, walker.line)
     utils.combine_data(node.lineno, self.data, walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   walker.variable_scope)
     self.line += walker.line
     utils.add_string_to_data(node.lineno, self.data, self.line)
Ejemplo n.º 9
0
 def visit_UnaryOp(self, node):
     """Render a unary operation as ``op operand`` and record the
     operand's expressions on this line.
     """
     # print '{0}:'.format(node.lineno)
     op_walker = walk_ast(node.op, self.current_scope)
     utils.combine_variable_scopes(self.variable_scope,
                                   op_walker.variable_scope)
     expr_walker = walk_ast_for_expr(node.operand, self.current_scope)
     utils.combine_variable_scopes(self.variable_scope,
                                   expr_walker.variable_scope)
     # print '\t' + op_walker.line + ' ' + expr_walker.line
     utils.add_string_to_data(node.lineno, expr_walker.data,
                              expr_walker.line)
     utils.combine_data(node.lineno, self.data, expr_walker.data)
     # Append e.g. "not x" / "- y" to the accumulated line.
     self.line += op_walker.line + ' ' + expr_walker.line
     utils.add_string_to_data(node.lineno, self.data, self.line)
Ejemplo n.º 10
0
 def visit_arguments(self, node):
     """Record a function's formal parameters on the ``def`` line.

     Uses ``self.lineno`` (set by the caller) because ``arguments``
     nodes carry no line number of their own. A bare ``self``
     parameter is dropped instead of being recorded as an expression.
     """
     # print '{0}: Arguments'.format(self.lineno)
     if len(node.args) == 0:
         # No parameters: still ensure the expressions slot exists.
         utils.setup_expressions(self.data, self.lineno)
     else:
         for arg in node.args:
             arg_walker = walk_ast_for_names(arg, self.current_scope)
             if arg_walker.line == 'self':
                 # TODO: Decide if this removal is right
                 arg_walker.data[self.lineno]['expressions'].remove('self')
             else:
                 utils.add_string_to_data(self.lineno, arg_walker.data,
                                          arg_walker.line)
             utils.combine_data(self.lineno, self.data, arg_walker.data)
             utils.combine_variable_scopes(self.variable_scope,
                                           arg_walker.variable_scope)
Ejemplo n.º 11
0
 def visit_If(self, node):
     """Record an ``if`` statement: its test expression and the
     statements of both the body and the ``else``/``elif`` branch.

     The header line is tagged as ``conditional``.
     """
     # print '{0}: Stmt If'.format(node.lineno)
     self.line = ''
     test_walker = walk_ast_for_expr(node.test, self.current_scope)
     utils.add_string_to_data(node.lineno, test_walker.data,
                              test_walker.line)
     utils.combine_data(node.lineno, self.data, test_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   test_walker.variable_scope)
     for stmts in [node.body, node.orelse]:
         for stmt in stmts:
             walker = walk_ast(stmt, self.current_scope)
             utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
             utils.combine_data(stmt.lineno, self.data, walker.data)
             utils.combine_variable_scopes(self.variable_scope,
                                           walker.variable_scope)
             # Statements that produced no text leave an empty string.
             utils.remove_empty_string(self.data, stmt.lineno)
     utils.set_type(self.data, node.lineno, 'conditional')
Ejemplo n.º 12
0
 def visit_Subscript(self, node):
     """Render a subscript access as ``value[slice]``.

     The slice is visited with a child walker whose ``lineno`` is set
     manually because slice nodes carry no line number of their own.
     """
     # print '{0}: Subscript'.format(node.lineno)
     value_walker = walk_ast_for_names(node.value, self.current_scope)
     utils.add_string_to_data(node.lineno, value_walker.data,
                              value_walker.line)
     utils.combine_data(node.lineno, self.data, value_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   value_walker.variable_scope)
     slice_walker = WalkAST(self.current_scope)
     # Slice nodes have no lineno; propagate the subscript's line.
     slice_walker.lineno = node.lineno
     slice_walker.visit(node.slice)
     utils.add_string_to_data(node.lineno, slice_walker.data,
                              slice_walker.line)
     utils.combine_data(node.lineno, self.data, slice_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   slice_walker.variable_scope)
     self.line = '{0}[{1}]'.format(value_walker.line, slice_walker.line)
     utils.add_string_to_data(node.lineno, self.data, self.line)
Ejemplo n.º 13
0
 def visit_List(self, node):
     """Record a list literal.

     An empty literal is rendered as ``[]``; otherwise every element is
     walked as an expression. The line is tagged as ``list_assign``.
     """
     # print '{0}: List'.format(node.lineno)
     if len(node.elts) == 0:
         utils.setup_expressions(self.data, node.lineno)
         self.line += '[]'
         utils.add_string_to_data(node.lineno, self.data, '[]')
     else:
         for e in node.elts:
             # walker = walk_ast_for_names(e)
             walker = walk_ast_for_expr(e, self.current_scope)
             utils.add_string_to_data(node.lineno, walker.data, walker.line)
             utils.combine_data(node.lineno, self.data, walker.data)
             utils.combine_variable_scopes(self.variable_scope,
                                           walker.variable_scope)
             # utils.add_string_to_data(node.lineno, self.data, self.line)
     # Continue traversal into the load/store context node.
     self.generic_visit(node.ctx)
     # print self.data[node.lineno]['expressions']
     self.data[node.lineno]['type'] = 'list_assign'
     utils.remove_empty_string(self.data, node.lineno)
Ejemplo n.º 14
0
 def visit_BoolOp(self, node):
     """Render a boolean operation, e.g. ``((a and b) and c)``.

     The first value seeds ``self.line``; each subsequent value wraps
     the accumulated line with the operator and new parentheses.
     """
     # print '{0} BOOL_OP:'.format(node.lineno)
     op_walker = walk_ast_for_expr(node.op, self.current_scope)
     utils.combine_variable_scopes(self.variable_scope,
                                   op_walker.variable_scope)
     first_node = True
     for n in node.values:
         walker = walk_ast_for_expr(n, self.current_scope)
         utils.add_string_to_data(node.lineno, walker.data, walker.line)
         utils.combine_data(node.lineno, self.data, walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       walker.variable_scope)
         if first_node:
             first_node = False
             self.line += walker.line
         else:
             # Left-associative nesting: (prev op next).
             self.line = '(' + self.line + ' ' + op_walker.line + ' ' + \
                         walker.line + ')'
         utils.add_string_to_data(node.lineno, self.data, self.line)
Ejemplo n.º 15
0
 def visit_Assign(self, node):
     """Record an assignment: targets as assigned names, the right-hand
     side as expressions, all on the assignment's line.
     """
     # print '{0}: Assign'.format(node.lineno)
     self.line = ''
     utils.set_type(self.data, node.lineno, 'assign')
     for target in node.targets:
         target_walker = walk_ast_for_names(target, self.current_scope)
         utils.set_assign(self.data, node.lineno, target_walker.line)
         # Remember which sub-expressions make up each target.
         utils.set_assigned_expressions(
             self.data, node.lineno, target_walker.line,
             target_walker.data[node.lineno]['expressions'])
         utils.combine_variable_scopes(self.variable_scope,
                                       target_walker.variable_scope)
         # utils.combine_data(node.lineno, self.data, target_walker.data)
     value_walker = walk_ast_for_expr(node.value, self.current_scope)
     utils.add_string_to_data(node.lineno, value_walker.data,
                              value_walker.line)
     utils.combine_data(node.lineno, self.data, value_walker.data)
     # TODO
     utils.combine_variable_scopes(self.variable_scope,
                                   value_walker.variable_scope)
Ejemplo n.º 16
0
 def visit_While(self, node):
     """Record a ``while`` loop: its test and its body/else statements.

     Mirrors :meth:`visit_For` except that the header contributes a
     test expression (walked for names) instead of target/iterable.
     """
     # print '{0}: While'.format(node.lineno)
     utils.add_loop_def(self.data, node.lineno)
     test_walker = WalkAST(self.current_scope)
     # Collect bare names from the test expression.
     test_walker.get_names = True
     test_walker.visit(node.test)
     utils.add_string_to_data(node.lineno, test_walker.data,
                              test_walker.line)
     utils.combine_data(node.lineno, self.data, test_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   test_walker.variable_scope)
     for stmts in [node.body, node.orelse]:
         for stmt in stmts:
             # Associate each statement's line with this loop header.
             utils.add_loop_line(self.data, node.lineno, stmt.lineno)
             walker = walk_ast(stmt, self.current_scope)
             utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
             # utils.combine_data(stmt.lineno, self.data, walker.data)
             utils.combine_all_data(self.data, walker.data)
             utils.combine_variable_scopes(self.variable_scope,
                                           walker.variable_scope)
     utils.set_type(self.data, node.lineno, 'loop')
Ejemplo n.º 17
0
 def visit_Compare(self, node):
     """Render a (possibly chained) comparison such as ``a < b <= c``.

     The left operand seeds ``self.line``; each (op, comparator) pair
     is appended in order.
     """
     # print '{0}: Compare'.format(node.lineno)
     left_walker = walk_ast_for_names(node.left, self.current_scope)
     utils.add_string_to_data(node.lineno, left_walker.data,
                              left_walker.line)
     utils.combine_data(node.lineno, self.data, left_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   left_walker.variable_scope)
     self.line = left_walker.line
     # ops and comparators run in lockstep for chained comparisons.
     for op, comparator in zip(node.ops, node.comparators):
         op_walker = walk_ast_for_expr(op, self.current_scope)
         utils.combine_variable_scopes(self.variable_scope,
                                       op_walker.variable_scope)
         comparator_walker = walk_ast_for_expr(comparator,
                                               self.current_scope)
         utils.add_string_to_data(node.lineno, comparator_walker.data,
                                  comparator_walker.line)
         utils.combine_data(node.lineno, self.data, comparator_walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       comparator_walker.variable_scope)
         self.line += op_walker.line + comparator_walker.line
Ejemplo n.º 18
0
 def visit_FunctionDef(self, node):
     """Record a function definition and walk its body in a new scope.

     The function's name becomes a new key in ``self.variable_scope``,
     and every body/decorator statement is walked under that scope and
     registered as a line of the function.
     """
     # print '{0}: FunctionDef - def {1}():'.format(node.lineno, node.name)
     utils.set_type(self.data, node.lineno, 'func')
     utils.set_name(self.data, node.lineno, node.name)
     utils.add_function_def(self.data, node.name, node.lineno)
     # Open a fresh variable scope for the function body.
     self.variable_scope[node.name] = []
     arg_walker = WalkAST(node.name)
     # ``arguments`` nodes have no lineno; propagate the def's line.
     arg_walker.lineno = node.lineno
     arg_walker.visit(node.args)
     utils.combine_data(node.lineno, self.data, arg_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   arg_walker.variable_scope)
     for stmts in [node.body, node.decorator_list]:
         for stmt in stmts:
             # Body statements are walked under the function's scope.
             walker = walk_ast(stmt, node.name)
             utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
             utils.add_function_line(self.data, node.name, stmt.lineno)
             # utils.combine_data(stmt.lineno, self.data, walker.data)
             utils.combine_all_data(self.data, walker.data)
             utils.combine_variable_scopes(self.variable_scope,
                                           walker.variable_scope)
Ejemplo n.º 19
0
 def visit_AugAssign(self, node):
     """Record an augmented assignment (``x += y`` etc.).

     The target, operator, and value are each walked; the rendered line
     is ``target`` + ``op`` + ``value`` and the target is registered as
     an assigned name.
     """
     # print '{0}: AugAssign'.format(node.lineno)
     target_walker = walk_ast_for_names(node.target, self.current_scope)
     utils.add_string_to_data(node.lineno, target_walker.data,
                              target_walker.line)
     utils.combine_data(node.lineno, self.data, target_walker.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   target_walker.variable_scope)
     op_walker = walk_ast_for_expr(node.op, self.current_scope)
     utils.combine_variable_scopes(self.variable_scope,
                                   op_walker.variable_scope)
     value_walker = walk_ast_for_expr(node.value, self.current_scope)
     utils.combine_variable_scopes(self.variable_scope,
                                   value_walker.variable_scope)
     utils.add_string_to_data(node.lineno, value_walker.data,
                              value_walker.line)
     utils.combine_data(node.lineno, self.data, value_walker.data)
     self.line = '{0}{1}{2}'.format(target_walker.line, op_walker.line,
                                    value_walker.line)
     utils.add_string_to_data(node.lineno, self.data, self.line)
     utils.set_type(self.data, node.lineno, 'assign')
     utils.set_assign(self.data, node.lineno, target_walker.line)
Ejemplo n.º 20
0
 def visit_With(self, node):
     """Record a ``with`` statement (Python 2 single-item form).

     Walks the context expression, treats an ``as`` clause as an
     assignment of the optional variables, then walks every body
     statement.
     """
     # print '{0}: With'.format(node.lineno)
     context_expr = walk_ast_for_expr(node.context_expr, self.current_scope)
     utils.add_string_to_data(node.lineno, context_expr.data,
                              context_expr.line)
     utils.combine_data(node.lineno, self.data, context_expr.data)
     utils.combine_variable_scopes(self.variable_scope,
                                   context_expr.variable_scope)
     if node.optional_vars is not None:
         # ``with expr as name`` binds ``name`` like an assignment.
         optional_vars_walker = walk_ast_for_names(node.optional_vars,
                                                   self.current_scope)
         utils.set_type(self.data, node.lineno, 'assign')
         utils.set_assign(self.data, node.lineno, optional_vars_walker.line)
         utils.combine_variable_scopes(self.variable_scope,
                                       optional_vars_walker.variable_scope)
     for stmt in node.body:
         walker = walk_ast(stmt, self.current_scope)
         utils.add_string_to_data(stmt.lineno, walker.data, walker.line)
         utils.combine_data(stmt.lineno, self.data, walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       walker.variable_scope)
         # Statements that produced no text leave an empty string.
         utils.remove_empty_string(self.data, stmt.lineno)
Ejemplo n.º 21
0
 def visit_Call(self, node):
     """Render a function call as ``func(arg1,arg2,...)``.

     For attribute/subscript callees (anything that is not a plain
     ``ast.Name``) the callee's own data is merged too; a bare name
     callee contributes only its text.
     """
     # print '{0}: Call'.format(node.lineno)
     func_walker = walk_ast_for_names(node.func, self.current_scope)
     if not isinstance(node.func, ast.Name):
         # e.g. obj.method(...) — the callee itself references names.
         utils.combine_data(node.lineno, self.data, func_walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       func_walker.variable_scope)
     self.line += func_walker.line + '('
     first_arg = True
     for arg in node.args:
         arg_walker = walk_ast_for_expr(arg, self.current_scope)
         utils.add_string_to_data(node.lineno, arg_walker.data,
                                  arg_walker.line)
         utils.combine_data(node.lineno, self.data, arg_walker.data)
         utils.combine_variable_scopes(self.variable_scope,
                                       arg_walker.variable_scope)
         # Comma-separate every argument after the first.
         if not first_arg:
             self.line += ','
         first_arg = False
         self.line += arg_walker.line
     self.line += ')'
     utils.add_additional_lines(self.data, node.lineno, func_walker.line)
     utils.add_string_to_data(node.lineno, self.data, self.line)
Ejemplo n.º 22
0
def load_data(gold: bool) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load, pre-process and combine the required data.

    Args:
        gold: whether to use GOLD annotations or PRED annotations.

    Returns:
        all the features combined, verb senses, sense labels ground truth.
    """
    # Pick the feature/embedding pickles for the requested annotation set.
    if gold:
        features_path = 'data/features/GOLD/images_features_nametrim.pkl'
        embeddings_path = 'data/features/GOLD/verse_embedding.pkl'
    else:
        features_path = 'data/features/PRED/images_features_new.pkl'
        embeddings_path = 'data/features/PRED/pred_verse_embedding.pkl'
    images_features = pd.read_pickle(features_path)
    embedded_annotations = pd.read_pickle(embeddings_path)

    # L2-normalize every image embedding, then join with the annotations.
    images_features['e_image'] = images_features['e_image'].apply(
        lambda x: x / np.linalg.norm(x, ord=2))
    full_features = combine_data(embedded_annotations, images_features)

    senses = pd.read_csv('data/labels/verse_visualness_labels.tsv',
                         sep='\t',
                         dtype={'sense_num': str})
    sense_labels = pd.read_csv(
        'data/labels/3.5k_verse_gold_image_sense_annotations.csv',
        dtype={'sense_chosen': str})

    if gold:
        sense_labels['image'] = sense_labels['image'].apply(filter_image_name)
    # Drop unclassifiable elements and re-index.
    sense_labels = sense_labels[sense_labels['sense_chosen'] != '-1']
    sense_labels.reset_index(inplace=True, drop=True)

    return full_features, senses, sense_labels
Ejemplo n.º 23
0
def load_data_and_predict(parameters):
    """
    Preprocess input data and make predictions.

    Converts the user input file into the model's PKL format, optionally
    derives BPE / tokenization / embedding features, combines them, runs
    the model, and post-processes the output into the requested format.
    (Python 2 source: note the ``print`` statements and integer division
    in ``stride=max_chars_in_sample / 2``.)

    :param parameters: dict of pipeline options; mutated in place
        (``parameters['input']`` is replaced with the PKL path).
    :return: None — results are written to ``parameters['output']``.
    """
    # Convert user input to model input PKL format
    pkl_file = parameters['input'] + '.pkl'
    preprocess.write_user_input_to_model_input(parameters['input'], '', pkl_file)
    parameters['input'] = pkl_file

    # Slice the input into overlapping windows (stride = half a sample).
    max_chars_in_sample = parameters['max_chars_in_sample']
    utils.load_data_stride_x_chars_enc_dec(parameters['input'], parameters, stride=max_chars_in_sample / 2)
    file_ext = parameters['file_ext']
    char_to_num = parameters['char_to_num']

    if parameters['use_bpe']:
        if parameters['bpe_codes_file']:
            codes_file = parameters['bpe_codes_file']
        else:
            print 'BPE codes file does not exist!'
            exit()
        utils.load_bpe_data(parameters['input'] + '.bpe.' + str(max_chars_in_sample) + file_ext, char_to_num, codes_file, parameters['input'] + '.bpe-coded-' + os.path.basename(parameters['bpe_codes_file']) + '.' + str(max_chars_in_sample) + file_ext, parameters)

    if parameters['use_tokenization']:
        tok_char_to_num = parameters['tok_char_to_num']
        utils.load_tok_data(
            parameters['input'] + '.bpe.' + str(max_chars_in_sample) + file_ext,
            tok_char_to_num,
            parameters['input'] + '.tok.' + str(max_chars_in_sample) + file_ext, parameters)

    if parameters['use_word_embeddings']:
        vocab_dict = parameters['word_vocab_dict']
        # run word embedding features on dev and test data
        utils.load_word_embeddings_data(parameters['input'] + '.bpe.' + str(max_chars_in_sample) + '.pkl',
                                        vocab_dict,
                                        parameters['input'] + '.word-coded.' + str(max_chars_in_sample) + '.pkl',
                                        parameters)
    if parameters['use_bpe_embeddings']:
        if parameters['bpe_codes_file']:
            codes_file = parameters['bpe_codes_file']
        else:
            print 'BPE codes file does not exist!'
            exit()
        vocab_dict = parameters['bpe_vocab_dict']
        # run word embedding features on dev and test data
        utils.load_bpe_embeddings_data(parameters['input'] + '.bpe.' + str(max_chars_in_sample) + file_ext,
                                        vocab_dict,
                                        codes_file,
                                        parameters['input'] + '.bpe-embed-coded-' + os.path.basename(codes_file) + '.' + str(max_chars_in_sample) + file_ext,
                                        parameters)

    # determine what will be the combined data file
    combined_data_ext = utils.generate_combined_feat_ext(parameters)

    # Combine data features as necessary
    utils.combine_data(parameters['input'], combined_data_ext, parameters)

    # Make predictions
    metrics = utils.MetricsCheckpoint(parameters['model'], parameters)
    metrics.make_and_format_predictions(parameters['input'], metrics.test_X_data, parameters)

    # Output in specified format
    if parameters['output_format'] == 'iob':
        postprocess.ord_byte_iob_to_byte_iob(parameters['output'])
    else:  # st
        postprocess.byte_iob_to_src_tgt(parameters['output'], ordinal=True)
Ejemplo n.º 24
0
def main(args):

    # User parameters
    parser = OptionParser()
    parser.add_option('--model_path', help='Output file to save model to')
    parser.add_option('--input_format',
                      default='st',
                      help='Input format [iob|st]')
    parser.add_option(
        '--space_token',
        default='<SPACE>',
        help=
        'If input format is iob, then use space_token for spaces between bytes'
    )
    parser.add_option(
        '--train_data_file',
        help=
        'Training data: if input_format == "st", then there are two input files: train_data_file.src and train_data_file.tgt'
    )
    parser.add_option(
        '--dev_data_file',
        help=
        'Dev data: if input_format == "st", then there are two input files: dev_data_file.src and dev_data_file.tgt'
    )
    parser.add_option(
        '--test_data_file',
        help=
        'Test data: if input_format == "st", then there are two input files: test_data_file.src and test_data_file.tgt'
    )
    parser.add_option('--max_chars_in_sample',
                      default=150,
                      help='Max number of characters in a data sample')
    parser.add_option('--embedding_input_dim',
                      default=256 + 1,
                      help='Dimension of input vectors')
    parser.add_option('--embedding_output_dim',
                      default=100,
                      help='Dimension of output embedding vectors')
    parser.add_option('--embedding_max_len',
                      default=150,
                      help='Set length of input data (in byte characters)')
    parser.add_option('--cnn_filters',
                      default=250,
                      help='Number of filters in CNN output')
    parser.add_option('--cnn_kernel_size', default=7, help='Kernel size')
    parser.add_option('--cnn_padding',
                      default='same',
                      help='Type of border for CNN')
    parser.add_option('--cnn_act',
                      default='relu',
                      help='Activation fn for CNN')
    parser.add_option('--dense_final_act',
                      default='softmax',
                      help='Final activation fn in network')
    parser.add_option('--optimizer', default='adam', help='Optimizer')
    parser.add_option('--tag_scheme',
                      default='iobes',
                      help='IOBES or IOB2 tag scheme')
    parser.add_option('--batch_size',
                      default=256,
                      help='Number of samples to process in one batch')
    parser.add_option('--dropout',
                      default=0.5,
                      help='Fraction of input units to dropout')
    parser.add_option('--lstm_units', default=100, help=0)
    parser.add_option('--lstm_act',
                      default='tanh',
                      help='Activation fn for LSTM')
    parser.add_option('--num_byte_layers',
                      default=20,
                      help='Number of CNN layers in architecture')
    parser.add_option('--nb_workers',
                      default=1,
                      help='Number of threads or processes to use')
    parser.add_option('--nb_epochs',
                      default=300,
                      help='Number of epochs to train model for')
    parser.add_option('--residual',
                      default=1,
                      help='Whether to use residual connections')
    parser.add_option(
        '--skip_residuals',
        default=0,
        help='Whether to add a residual connection every other conv layer')
    parser.add_option('--lr',
                      default=0.00005,
                      help='Learning rate of optimizer')
    parser.add_option('--use_bpe',
                      default=0,
                      help='Whether to use byte pair encodings')
    parser.add_option('--num_operations',
                      default=50000,
                      help="Number of merge operations for BPE algorithm")
    parser.add_option('--reload',
                      default=0,
                      help="Whether to reload a previously trained model")
    parser.add_option('--blstm_on_top',
                      default=1,
                      help="Whether to use a BLSTM layer on top of the CNNs")
    parser.add_option('--crf_on_top',
                      default=1,
                      help="Whether to use a CRF layer on top")
    parser.add_option(
        '--use_word_embeddings',
        default=0,
        help="Whether to also use pretrained word embeddings as input")
    parser.add_option(
        '--use_bytes',
        default=1,
        help="Whether to use byte embedding as input. Default is true.")
    parser.add_option(
        '--word_embeddings_file',
        default='',
        help=
        "Pretrained word embedding file. This needs to be set in order to use word embeddings."
    )
    parser.add_option('--word_embedding_dim',
                      default=200,
                      help="Dimension of pretrained word embeddings")
    parser.add_option('--bpe_codes_file',
                      default='pmc_codes_file_ALL_50000',
                      help="Pretrained BPE file")
    parser.add_option(
        '--use_bpe_embeddings',
        default=1,
        help=
        "Whether to use pretrained BPE embeddings as input. Cannot be used with word embeddings"
    )
    parser.add_option(
        '--bpe_embeddings_file',
        default='models/bpe2vec.csv',
        help=
        "Pretrained BPE embedding file. This needs to be set in order to use BPE embeddings"
    )
    parser.add_option('--bpe_embedding_dim',
                      default=100,
                      help="Dimension of pretrained BPE embeddings")
    parser.add_option(
        '--byte_layer_for_embed',
        default=1,
        help=
        "Whether to use embedding inputs as inputs for the CNN layers (or only BLSTM layers)"
    )
    # Remaining command-line options: byte-layer architecture, evaluation
    # output, input corruption (byte dropout), training-window stride, and
    # various run-mode flags. Boolean-like options use int defaults (0/1)
    # and are converted with `int(...) == 1` after parsing.
    parser.add_option('--layer_for_bytes',
                      default='cnn',
                      help="stack of cnns or a blstm layer for bytes")
    parser.add_option('--temp_dir',
                      default='evaluation/temp',
                      help="Directory to write evaluation scores")
    parser.add_option(
        '--drop_bytes',
        default=1,
        help="Whether to drop a fraction of bytes for each input")
    parser.add_option('--byte_drop_fraction',
                      default=0.3,
                      help="Fraction of byte input to drop")
    parser.add_option(
        '--train_data_stride',
        default=75,
        help=
        "Stride in number of bytes to shift window to get next training sample"
    )
    # NOTE(review): "traininable" is a typo, but it is user-visible help
    # text (a runtime string), so it is left unchanged here.
    parser.add_option('--trainable_bpe_embeddings',
                      default=1,
                      help="Whether BPE embeddings should be traininable")
    parser.add_option(
        '--override_parameters',
        default=0,
        help=
        "Whether to use specified parameters or parameters from saved model, if they exist"
    )
    parser.add_option('--get_probs',
                      default=0,
                      help="Get normalized log likelihoods of each sample")
    parser.add_option(
        '--get_vectors',
        default=0,
        help=
        "Get output vectors of second-to-last layer in the network. Currently only tested with the CNN-BLSTM-CRF configuration"
    )
    parser.add_option('--use_tokenization',
                      default=0,
                      help="Use tokenization features")
    parser.add_option(
        '--repickle_data',
        default=0,
        help=
        "Whether to re-process and pickle data even if the pickle file already exists"
    )
    parser.add_option(
        '--make_samples_unique',
        default=0,
        help=
        "Clean training data so that the user-provided samples are all unique")
    # parse_args returns (options, positional_args); positionals are ignored.
    opts = parser.parse_args(args)[0]

    # Parameters: collected into an OrderedDict so the pickled parameter
    # file round-trips with a stable key order.
    parameters = OrderedDict()
    parameters['override_parameters'] = int(opts.override_parameters) == 1
    parameters['model_path'] = opts.model_path
    parameters['reload'] = int(opts.reload) == 1
    parameters_path = parameters['model_path'] + '_parameters.pkl'

    # NOTE(review): saved parameters are loaded when EITHER reloading a model
    # OR when --override_parameters is set; the data/runtime settings further
    # below then overwrite a subset with command-line values in both cases.
    # This reads backwards against the --override_parameters help text
    # ("use specified parameters OR parameters from saved model") — confirm
    # the intended semantics. Also note pkl.load replaces the dict wholesale,
    # discarding the three keys set just above (they are re-set later).
    if parameters['reload'] or parameters[
            'override_parameters']:  # load parameters from file
        with open(parameters_path, 'rb') as f:
            parameters = pkl.load(f)
    else:  # set parameters from the command line (fresh model)
        parameters['max_chars_in_sample'] = int(opts.max_chars_in_sample)
        parameters['embedding_input_dim'] = int(opts.embedding_input_dim)
        parameters['embedding_output_dim'] = int(opts.embedding_output_dim)
        parameters['embedding_max_len'] = int(opts.embedding_max_len)
        parameters['cnn_filters'] = int(opts.cnn_filters)
        parameters['cnn_kernel_size'] = int(opts.cnn_kernel_size)
        parameters['cnn_padding'] = opts.cnn_padding
        parameters['cnn_act'] = opts.cnn_act
        parameters['dense_final_act'] = opts.dense_final_act
        parameters['optimizer'] = opts.optimizer
        parameters['tag_scheme'] = opts.tag_scheme
        parameters['dropout'] = float(opts.dropout)
        parameters['lstm_units'] = int(opts.lstm_units)
        parameters['lstm_act'] = opts.lstm_act
        parameters['num_byte_layers'] = int(opts.num_byte_layers)
        parameters['residual'] = int(opts.residual) == 1
        parameters['skip_residuals'] = int(opts.skip_residuals) == 1
        parameters['lr'] = float(opts.lr)
        parameters['use_bpe'] = int(opts.use_bpe) == 1
        parameters['num_operations'] = int(
            opts.num_operations)  # will be overridden if bpe_codes_file is set
        parameters['blstm_on_top'] = int(opts.blstm_on_top) == 1
        parameters['crf_on_top'] = int(opts.crf_on_top) == 1
        parameters['use_bytes'] = int(opts.use_bytes) == 1
        parameters['word_embeddings_file'] = opts.word_embeddings_file
        # Word embeddings can only be enabled when an embeddings file exists.
        if parameters['word_embeddings_file']:
            parameters['use_word_embeddings'] = int(
                opts.use_word_embeddings) == 1
        else:
            parameters['use_word_embeddings'] = False
        parameters['word_embeddings_dim'] = int(opts.word_embedding_dim)
        parameters['bpe_codes_file'] = opts.bpe_codes_file
        if parameters['bpe_codes_file']:
            # The number of merge operations is implied by the codes file;
            # presumably the file carries 2 header lines — confirm against
            # the BPE tool that produced it.
            with open(parameters['bpe_codes_file'], 'rU') as f:
                lines = f.readlines()
            num_operations = len(lines) - 2
            parameters['num_operations'] = num_operations
        parameters['bpe_embeddings_file'] = opts.bpe_embeddings_file
        # BPE embeddings likewise require a file to be provided.
        if parameters['bpe_embeddings_file']:
            parameters['use_bpe_embeddings'] = int(
                opts.use_bpe_embeddings) == 1
        else:
            parameters['use_bpe_embeddings'] = False
        parameters['bpe_embeddings_dim'] = int(opts.bpe_embedding_dim)
        parameters['byte_layer_for_embed'] = int(
            opts.byte_layer_for_embed) == 1
        parameters['layer_for_bytes'] = opts.layer_for_bytes
        parameters['trainable_bpe_embeddings'] = int(
            opts.trainable_bpe_embeddings) == 1
        parameters['use_tokenization'] = int(
            opts.use_tokenization) == 1  # in IOBES format
        parameters['space_token'] = opts.space_token

    # Data files and runtime settings: always taken from the command line,
    # even when model parameters were loaded from the saved pickle above.
    parameters['train_data_file'] = opts.train_data_file
    parameters['dev_data_file'] = opts.dev_data_file
    parameters['test_data_file'] = opts.test_data_file
    parameters['get_probs'] = int(opts.get_probs) == 1
    parameters['get_vectors'] = int(opts.get_vectors) == 1
    parameters['repickle_data'] = int(opts.repickle_data) == 1
    parameters['input_format'] = opts.input_format
    parameters['make_samples_unique'] = int(opts.make_samples_unique) == 1
    parameters['temp_dir'] = opts.temp_dir
    parameters['drop_bytes'] = int(opts.drop_bytes) == 1
    parameters['byte_drop_fraction'] = float(opts.byte_drop_fraction)
    parameters['train_data_stride'] = int(opts.train_data_stride)
    parameters['override_parameters'] = int(opts.override_parameters) == 1
    parameters['model_path'] = opts.model_path
    parameters['reload'] = int(opts.reload) == 1
    parameters['batch_size'] = int(opts.batch_size)
    parameters['nb_workers'] = int(opts.nb_workers)
    parameters['nb_epochs'] = int(opts.nb_epochs)

    # Convert user input to the model's PKL input format. Conversion is
    # skipped when all three pickle files already exist, unless the user
    # forces reprocessing (--repickle_data) or deduplication
    # (--make_samples_unique), either of which rewrites the pickles.
    train_pkl_file = parameters['train_data_file'] + '.pkl'
    dev_pkl_file = parameters['dev_data_file'] + '.pkl'
    test_pkl_file = parameters['test_data_file'] + '.pkl'
    if parameters['repickle_data'] or parameters[
            'make_samples_unique'] or not (os.path.isfile(train_pkl_file)
                                           and os.path.isfile(dev_pkl_file)
                                           and os.path.isfile(test_pkl_file)):
        # Intermediate source/target file pairs derived from each data file.
        train_src_file = parameters['train_data_file'] + '.src'
        train_tgt_file = parameters['train_data_file'] + '.tgt'
        dev_src_file = parameters['dev_data_file'] + '.src'
        dev_tgt_file = parameters['dev_data_file'] + '.tgt'
        test_src_file = parameters['test_data_file'] + '.src'
        test_tgt_file = parameters['test_data_file'] + '.tgt'

        if parameters['input_format'] == 'iob':  # convert IOB to src/tgt format
            preprocess.write_user_byte_iob_input_to_src_tgt_input(
                parameters['train_data_file'],
                train_src_file,
                train_tgt_file,
                space_token=parameters['space_token'])
            preprocess.write_user_byte_iob_input_to_src_tgt_input(
                parameters['dev_data_file'],
                dev_src_file,
                dev_tgt_file,
                space_token=parameters['space_token'])
            preprocess.write_user_byte_iob_input_to_src_tgt_input(
                parameters['test_data_file'],
                test_src_file,
                test_tgt_file,
                space_token=parameters['space_token'])

        # Deduplicate training samples only (dev/test are left untouched).
        # NOTE(review): this reads the original train_data_file, not the
        # .src/.tgt pair written by the IOB conversion above — confirm
        # remove_duplicate_samples handles the raw input format when
        # input_format == 'iob'.
        if parameters['make_samples_unique']:  # unique training samples
            unique_train_file = utils.remove_duplicate_samples(
                parameters['train_data_file'])
            train_src_file = unique_train_file + '.src'
            train_tgt_file = unique_train_file + '.tgt'

        preprocess.write_user_input_to_model_input(train_src_file,
                                                   train_tgt_file,
                                                   train_pkl_file)
        preprocess.write_user_input_to_model_input(dev_src_file, dev_tgt_file,
                                                   dev_pkl_file)
        preprocess.write_user_input_to_model_input(test_src_file,
                                                   test_tgt_file,
                                                   test_pkl_file)

    # From here on, the data-file parameters point at the pickled versions.
    parameters['train_data_file'] = train_pkl_file
    parameters['dev_data_file'] = dev_pkl_file
    parameters['test_data_file'] = test_pkl_file

    # Build label and character vocabularies from the training data, then
    # load train/dev/test as strided fixed-length windows.
    tags = utils.collect_tags(parameters['train_data_file'])
    parameters['tags'] = tags
    tag_to_num = utils.build_tag_to_num(tags)
    parameters['tag_to_num'] = tag_to_num
    char_to_num = utils.build_char_to_num()
    parameters['char_to_num'] = char_to_num
    if parameters['use_tokenization']:
        tok_char_to_num = utils.build_tok_char_to_num()
        parameters['tok_char_to_num'] = tok_char_to_num
    enc_dec_tag_to_num = utils.build_enc_dec_tag_to_num(parameters)
    parameters['enc_dec_tag_to_num'] = enc_dec_tag_to_num
    parameters['num_enc_dec_tags'] = len(enc_dec_tag_to_num)
    parameters['num_iobes_tags'] = len(tag_to_num)
    max_chars_in_sample = parameters['max_chars_in_sample']
    # Encoder-decoder output length equals the input window length.
    parameters['enc_dec_output_length'] = max_chars_in_sample

    # Echo all parameters except the (large) vocabulary dictionaries.
    print 'Parameters'
    for param_key in parameters:
        if 'vocab_dict' not in param_key:
            print '\t', param_key, ': ', parameters[param_key]

    parameters['file_ext'] = '.pkl'
    # Training windows use the configured stride; dev/test use half-window
    # overlap. NOTE(review): `max_chars_in_sample / 2` is integer division
    # under Python 2; if ported to Python 3 this must become `// 2`.
    utils.load_data_stride_x_chars_enc_dec(
        parameters['train_data_file'],
        parameters,
        stride=parameters['train_data_stride'])
    utils.load_data_stride_x_chars_enc_dec(parameters['dev_data_file'],
                                           parameters,
                                           stride=max_chars_in_sample / 2)
    utils.load_data_stride_x_chars_enc_dec(parameters['test_data_file'],
                                           parameters,
                                           stride=max_chars_in_sample / 2)
    file_ext = parameters['file_ext']

    # Byte dropout regularization: reserve a <DROP> symbol in the byte
    # vocabulary and rewrite the training file with dropped bytes. Only the
    # training data is corrupted; dev/test stay clean.
    if parameters['drop_bytes']:
        char_to_num['<DROP>'] = len(char_to_num)
        utils.byte_dropout(parameters['train_data_file'], parameters)
        parameters['train_data_file'] += '.bytedrop'

    if parameters['use_bpe']:
        # Use a user-supplied BPE codes file when given; otherwise derive
        # one from the training data (cached on disk by operation count).
        if parameters['bpe_codes_file']:
            codes_file = parameters['bpe_codes_file']
        else:
            codes_file = 'codes_file.' + str(parameters['num_operations'])
            if not os.path.isfile(codes_file):
                # run bpe on train data
                utils.gen_bpe_code_file(
                    parameters['train_data_file'] + '.bpe.' +
                    str(max_chars_in_sample) + '.pkl',
                    parameters['num_operations'], codes_file)

        parameters['bpe_codes_file'] = codes_file

        # Extend the byte vocabulary with the BPE merge symbols.
        utils.add_bpe_to_vocab_dictionary(codes_file, char_to_num)

        # Apply the BPE codes to each split. The train file strips the
        # '.bytedrop' suffix so BPE features come from the clean
        # (pre-dropout) byte stream.
        utils.load_bpe_data(
            parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' +
            str(max_chars_in_sample) + file_ext, char_to_num, codes_file,
            parameters['train_data_file'].replace('.bytedrop', '') +
            '.bpe-coded-' + os.path.basename(parameters['bpe_codes_file']) +
            '.' + str(max_chars_in_sample) + file_ext, parameters)
        utils.load_bpe_data(
            parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, char_to_num, codes_file, parameters['dev_data_file'] +
            '.bpe-coded-' + os.path.basename(parameters['bpe_codes_file']) +
            '.' + str(max_chars_in_sample) + file_ext, parameters)
        utils.load_bpe_data(
            parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, char_to_num, codes_file, parameters['test_data_file'] +
            '.bpe-coded-' + os.path.basename(parameters['bpe_codes_file']) +
            '.' + str(max_chars_in_sample) + file_ext, parameters)

    # Tokenization features, likewise computed from the clean byte stream
    # for the training split.
    if parameters['use_tokenization']:
        utils.load_tok_data(
            parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' +
            str(max_chars_in_sample) + file_ext, tok_char_to_num,
            parameters['train_data_file'].replace('.bytedrop', '') + '.tok.' +
            str(max_chars_in_sample) + file_ext, parameters)
        utils.load_tok_data(
            parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, tok_char_to_num, parameters['dev_data_file'] + '.tok.' +
            str(max_chars_in_sample) + file_ext, parameters)
        utils.load_tok_data(
            parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, tok_char_to_num, parameters['test_data_file'] + '.tok.' +
            str(max_chars_in_sample) + file_ext, parameters)

    # Pretrained word embeddings: build a vocabulary dictionary mapping
    # word -> (index, vector) from the embeddings file, add special tokens
    # with random vectors, and write word-feature files for each split.
    if parameters['use_word_embeddings']:
        vocab_dict = OrderedDict()

        # add vocab to vocab_dict
        print 'Adding words...'
        utils.add_embeddings_to_vocab_dictionary(
            parameters['word_embeddings_file'], vocab_dict, parameters)
        print 'Done adding words'

        # add unknown, space, and pad embedding to vocab dictionary
        vocab_dict['<UNKNOWN>'] = (len(vocab_dict),
                                   np.random.rand(
                                       parameters['word_embeddings_dim']))
        vocab_dict['<SPACE>'] = (len(vocab_dict),
                                 np.random.rand(
                                     parameters['word_embeddings_dim']))

        # Word features for each split; train strips '.bytedrop' so the
        # features come from the clean (pre-dropout) data.
        utils.load_word_embeddings_data(
            parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' +
            str(max_chars_in_sample) + '.pkl', vocab_dict,
            parameters['train_data_file'].replace('.bytedrop', '') +
            '.word-coded.' + str(max_chars_in_sample) + '.pkl', parameters)
        utils.load_word_embeddings_data(
            parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            '.pkl', vocab_dict, parameters['dev_data_file'] + '.word-coded.' +
            str(max_chars_in_sample) + '.pkl', parameters)
        utils.load_word_embeddings_data(
            parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            '.pkl', vocab_dict, parameters['test_data_file'] + '.word-coded.' +
            str(max_chars_in_sample) + '.pkl', parameters)

        parameters['word_vocab_dict'] = vocab_dict

    # Pretrained BPE-unit embeddings: same pattern as word embeddings, keyed
    # by the BPE codes file.
    if parameters['use_bpe_embeddings']:
        if parameters['bpe_codes_file']:
            codes_file = parameters['bpe_codes_file']
        else:
            # NOTE(review): unlike the use_bpe branch above, this fallback
            # does not generate the codes file if it is missing — confirm
            # load_bpe_embeddings_data tolerates (or creates) it.
            codes_file = 'codes_file.' + str(parameters['num_operations'])
            parameters['bpe_codes_file'] = codes_file

        vocab_dict = OrderedDict()

        # add vocab to vocab_dict
        print 'Adding BPE embeddings...'
        utils.add_embeddings_to_vocab_dictionary(
            parameters['bpe_embeddings_file'], vocab_dict, parameters)
        print 'Done adding BPE embeddings'

        # add unknown embedding to vocab dictionary
        vocab_dict['<UNKNOWN>'] = (len(vocab_dict),
                                   np.random.rand(
                                       parameters['bpe_embeddings_dim']))

        # BPE-embedding features for each split (train from clean data).
        utils.load_bpe_embeddings_data(
            parameters['train_data_file'].replace('.bytedrop', '') + '.bpe.' +
            str(max_chars_in_sample) + file_ext, vocab_dict, codes_file,
            parameters['train_data_file'].replace('.bytedrop', '') +
            '.bpe-embed-coded-' + os.path.basename(codes_file) + '.' +
            str(max_chars_in_sample) + file_ext, parameters)
        utils.load_bpe_embeddings_data(
            parameters['dev_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, vocab_dict, codes_file, parameters['dev_data_file'] +
            '.bpe-embed-coded-' + os.path.basename(codes_file) + '.' +
            str(max_chars_in_sample) + file_ext, parameters)
        utils.load_bpe_embeddings_data(
            parameters['test_data_file'] + '.bpe.' + str(max_chars_in_sample) +
            file_ext, vocab_dict, codes_file, parameters['test_data_file'] +
            '.bpe-embed-coded-' + os.path.basename(codes_file) + '.' +
            str(max_chars_in_sample) + file_ext, parameters)

        parameters['bpe_vocab_dict'] = vocab_dict

    # Finalize the byte-embedding input dimension now that <DROP> and BPE
    # symbols may have been added to char_to_num above.
    parameters['embedding_input_dim'] = len(char_to_num)
    print 'Total embedding dimensions:', parameters['embedding_input_dim']

    # Determine the file extension for the combined-feature data files.
    combined_data_ext = utils.generate_combined_feat_ext(parameters)

    # Merge the per-feature files (bytes, BPE, tokenization, embeddings)
    # into one combined input file per split.
    utils.combine_data(parameters['train_data_file'], combined_data_ext,
                       parameters)
    utils.combine_data(parameters['dev_data_file'], combined_data_ext,
                       parameters)
    utils.combine_data(parameters['test_data_file'], combined_data_ext,
                       parameters)

    # Persist parameters only for a fresh run; mirrors the load condition
    # near the top (reload / override_parameters read this same path).
    if not parameters['reload'] and not parameters['override_parameters']:
        with open(parameters_path, 'wb') as p:
            pkl.dump(parameters, p)

    train_model(parameters)