Example #1
import logging
import sys
from time import time

import numpy as np

# util, log, and the dataset helpers load / load_ngrams are project-local


def main():
    """
    Performs dataset loading / caching and prints out some
    information about the dataset.

    Allowed cmd-line flags:
        -s TS_FILES: Uses the reduced trainset (TS_FILES trainset files)
        -o MIN_OCCUR: Only uses terms that occur MIN_OCCUR or more times
            in the trainset. Other terms are replaced with a special token.
        -f MIN_FILES: Only uses terms that occur in MIN_FILES or more files
            in the trainset. Other terms are replaced with a special token.
        -n: Also do n-gram extraction.
    """

    logging.basicConfig(level=logging.INFO)

    subset = util.argv('-s', None, int)
    min_occ = util.argv('-o', 1, int)
    min_files = util.argv('-f', 1, int)

    if '-n' in sys.argv:

        def extract_and_report(n, tree):
            """
            Extracts n-grams and prints out basic info,
            which caches them for subsequent usage.
            """
            t0 = time()
            res = load_ngrams(n, np.ones(6), tree, subset, min_occ, min_files)
            log.info("%s %d-grams, count=%d, extracted in %.2f seconds",
                     "Tree" if tree else "Linear", n, res[0].shape[0],
                     time() - t0)
            del res

        #   extract linear ngrams
        for n in range(1, 5):
            extract_and_report(n, False)
        extract_and_report(4, True)

    else:
        data = load(subset, min_occ, min_files)
        del data
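
# A minimal sketch of the flag-value helper assumed above (a hypothetical
# reconstruction; the real util.argv is project-local): it returns the value
# following `flag` on the command line, converted with `conv`, or `default`
# when the flag is absent or its value does not convert.
def argv(flag, default=None, conv=str):
    if flag in sys.argv:
        try:
            return conv(sys.argv[sys.argv.index(flag) + 1])
        except (IndexError, ValueError):
            return default
    return default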
Example #2
from typing import BinaryIO

from util import argv  # project-local helper, as imported in the other examples


def show_image(file: BinaryIO):
    'Show the image, or save it to a file if Pillow is not installed.'
    try:
        from PIL import Image
    except ImportError:
        img_path = argv(2) or 'captcha.jpg'
        with open(img_path, 'wb') as f:
            f.write(file.read())
        print('Open the image "%s" to solve the CAPTCHA.' % img_path)
    else:
        Image.open(file).show()
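
# Usage sketch (hypothetical file name; any binary image stream works):
# with open('captcha_raw.jpg', 'rb') as blob:
#     show_image(blob)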
Example #3
def main():

    #print ''

    log.setDebug(util.argv(_cmds['Debug']))
    actions.supress(util.argv(_cmds['Supress']))

    args = sys.argv

    if len(args) == 1:
        log.comment(
            "No arguments specified, listing your Gists. Try '%s help' if you need help."
            % util.fileName)
        print('')

    del args[0]  # Delete the filename
    cmd = None

    log.debug("Arguments " + str(args))

    #--------------------------------------------
    # If args[0] is a command, remove it from the list; args then holds only
    # that command's options. Otherwise keep args as-is and try to derive the
    # command from them.
    #--------------------------------------------
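    #   a plausible reading of the helpers below (both are project-local):
    #   _hasCmd checks whether args[0] matches a known command alias in
    #   _cmds, while _deriveCmd tries to guess the command from the shape
    #   of the remaining arguments.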
    if _hasCmd(args):
        cmd = args[0]
        del args[0]  # Delete the command. Arguments remaining are the options for each command
    else:
        cmd = _deriveCmd(args)

    log.debug("Adjusted cmd: " + str(cmd))
    log.debug("Adjusted arguments " + str(args))
    #--------------------------------------------
    # Handle commands
    #--------------------------------------------
    if cmd is None:
        _printNoMatch()
    elif cmd in (_cmds['Help']):
        actions.help()
    elif cmd in (_cmds['List']):
        actions.list()
    elif cmd in (_cmds['Token']):
        actions.updateCredentials()
    elif cmd in (_cmds['Open']):
        if len(args) == 1:
            actions.open(args[0])
        else:
            _printNoMatch()
    elif cmd in (_cmds['View']):
        if len(args) == 1:
            actions.view(args[0])
        elif len(args) == 2:
            actions.view(args[0], fileName=args[1])
    elif cmd in (_cmds['Download']):
        if len(args) == 2:
            actions.get(args[0], path=args[1])
        elif len(args) == 3:
            actions.get(args[0], fileName=args[1], path=args[2])
    elif cmd in (_cmds['New']):
        # Each option will prompt for public/pvt and description. In silent mode, assumes private and no description.
        if len(args) == 0:
            actions.new()
        elif len(args) == 1:
            # create File
            # create Content
            if util.isFileOrDir(args[0]):
                actions.new(filename=args[0])
            else:
                actions.new(content=args[0])
        elif len(args) == 2:
            # create Boolean and File
            # create Boolean and Content
            # create Description and File
            # create Description and Content
            if util.parseBool(args[0]) is not None:
                if util.isFileOrDir(args[1]):
                    actions.new(public=util.parseBool(args[0]),
                                filename=args[1])
                else:
                    actions.new(public=util.parseBool(args[0]),
                                content=args[1])
            else:
                if util.isFileOrDir(args[1]):
                    actions.new(description=args[0], filename=args[1])
                else:
                    actions.new(description=args[0], content=args[1])
        elif len(args) == 3 and util.parseBool(args[0]) is not None:
            # create Boolean, Description and File
            # create Boolean, Description and Content
            if util.isFileOrDir(args[2]):
                actions.new(public=util.parseBool(args[0]),
                            description=args[1],
                            filename=args[2])
            else:
                actions.new(public=util.parseBool(args[0]),
                            description=args[1],
                            content=args[2])
        else:
            _printNoMatch()
    elif cmd in (_cmds['Append']):
        # Each option will prompt for public/pvt and description.
        if len(args) == 0:
            _printNoMatch()
        elif len(args) == 2:
            # append: id File
            # append: id Content
            if util.isFileOrDir(args[1]):
                actions.append(args[0], filename=args[1])
            else:
                actions.append(args[0], content=args[1])
        elif len(args) == 3:
            # append: id Description File
            # append: id Description Content
            if util.isFileOrDir(args[2]):
                actions.append(args[0], description=args[1], filename=args[2])
            else:
                actions.append(args[0], description=args[1], content=args[2])
        else:
            actions.append(args[0])
    elif cmd in (_cmds['Update']):
        # Each option will prompt for public/pvt and description.
        if len(args) == 0:
            _printNoMatch()
        elif len(args) == 2:
            # update: id File
            # update: id Content
            if util.isFileOrDir(args[1]):
                actions.update(args[0], filename=args[1])
            else:
                actions.update(args[0], content=args[1])
        elif len(args) == 3:
            # update: id Description File
            # update: id Description Content
            if util.isFileOrDir(args[2]):
                actions.update(args[0], description=args[1], filename=args[2])
            else:
                actions.update(args[0], description=args[1], content=args[2])
        else:
            actions.update(args[0])
    elif cmd in (_cmds['Delete']):
        actions.delete(args[0])
    elif cmd in (_cmds['Backup']):
        _printNoImpl()
        actions.backup()
    elif cmd in (_cmds['Search']):
        _printNoImpl()
        actions.search()
    else:
        _printNoMatch()

    log.debug("Done.")
    print('')
Example #4
            assert len(models) == 1
        except:
            print(line)
        else:
            model = models[0]
        if model not in lst:
            lst[model] = []
        lst[model].append(code)
    return lst


def main(src, dest):
    'Append train models to the existing JSON file.'
    with open(src) as f:
        lst = group(f)
    print('\n'.join(sorted(lst.keys())))
    print(len(lst), 'models found.')

    if os.path.isfile(dest):
        with open(dest) as f:
            existing = json.load(f)
            pprint(existing)
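            # note: update() gives precedence to the entries already on disk,
            # overwriting freshly parsed models with the saved ones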
            lst.update(existing)

    with open(dest, 'w') as f:
        json.dump(lst, f)


if __name__ == '__main__':
    main(argv(1) or 'models.txt', argv(2) or 'models.json')
Example #5
#### Begin loop ####
while loop >= 0:
    loop = loop + 1
    print loop  # print the number of times the script has looped
    time.sleep(0.05)  # space out the loop so as not to run too fast
    print recv  # prints everything received from freenode. Remove this to clean up the debugging
    f.write(recv)  # log recv for later
    # iterate through plugins, executing all functions
    data = {
        's': s,
        'recv': recv,
        'nick': nick,
        'loop': loop,
        'numr': numr,
        'channel': channel,
        'maxspam': maxspam,
        'channelops': channelops,
        'plugclass': plugclass
    }  # format data to send to plugins
    thread.start_new_thread(runplugins, ())
    if '!update' in recv and util.argv('!update', recv)['user'] in data['admins']:
        status = 'Successful'
        try:
            execfile('./plugins.py')
            from plugins import *
            execfile('./util.py')
        except Exception, err:
            print sys.exc_info()[1]
            status = 'Failed'
        args = argv('!update', data['recv'])
        s.send(util.say(args['channel'], 'Dynamic update: ' + status))
    # get recv last. I thought this would be a good idea. I can't remember why, but there was a reason.
    recv = s.recv(recvbits)
Example #6
    data = load_trains(file.read())
    codes = set(emu_codes(data))
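    # sort key: letter prefix first (its code point, scaled), then the
    # numeric part, so e.g. 'D11' < 'G2' < 'G1234'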
    codes = sorted(codes, key=lambda code: ord(code[0]) * 1e4 + int(code[1:]))
    print('Ready, %d trains to be checked.' % len(codes))
    return codes


def batch_query(me: Automation, codes: Iterable, img_dir: str, models: TextIO):
    'Save screenshots and train models for all the given trains.'
    for code in codes:
        try:
            me.query(code)
        except LookupError:
            print(code, 'not found?')
        else:
            img_path = os.path.join(img_dir, '%s.png' % code)
            me.get_shot().save(img_path)
            print(code, me.get_text(), file=models)


if __name__ == '__main__':
    me = Automation()
    with open(path) as f:
        codes = unique_trains(f)
    img_dir = argv(3) or 'img'
    mkdir(img_dir)
    time.sleep(5)

    with open(argv(2) or 'models.txt', 'w') as f:
        batch_query(me, codes, img_dir, f)
Example #7
#!/usr/bin/env python3

import json
from typing import Iterable, Tuple

from sql import sql_shell
from util import shell, argv, open
path = argv(1) or 'train_list.js'


def decompose(s):
    'Split the information string by delimiters.'
    # G1234(Station A-Station B)
    table = str.maketrans({'(': '|', '-': '|', ')': ''})
    return s.translate(table).split('|')
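
# A quick check of the delimiter handling (input format taken from the
# comment in decompose):
assert decompose('G1234(Station A-Station B)') == [
    'G1234', 'Station A', 'Station B']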


def load_trains(script: str) -> dict:
    'Deserialize the jsonp script.'
    # https://kyfw.12306.cn/otn/resources/js/query/train_list.js
    json_text = script.partition('=')[2]
    return json.loads(json_text)
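
# A minimal self-check of the jsonp handling (illustrative payload; the real
# script comes from the 12306 URL above):
assert load_trains('var train_list = {"a": []}') == {'a': []}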


def parse_trains(data: dict) -> Iterable[Tuple[str, str, str, str]]:
    'Flatten the train list and return all items in it.'
Example #9
#!/usr/bin/env python3

from typing import Iterable, List

from sql import sql_shell
from util import shell, argv, open
path = argv(1) or 'station_name.js'


def load_stations(script: str) -> Iterable[List[str]]:
    'Split the dataset by delimiters.'
    # skip javascript stuff around single quotes
    # skip the first '@' character in the string
    packed_stations = script.split("'")[1][1:]
    for s in packed_stations.split('@'):
        yield s.split('|')


def dump_stations(stations: Iterable[List[str]]) -> str:
    'Serialize the stations to delimiter-separated strings.'
    serialized = '@'.join('|'.join(s) for s in stations)
    return "var station_names = '@%s';" % serialized


if __name__ == '__main__':
    with open(path) as f:
        s = list(load_stations(f.read()))

    for interpreter in shell, sql_shell:
        interpreter({'s': s}, 'len(s) == %d.' % len(s))
Example #10
        if not match:
            return print(page.name, 'X')

        for code, abbr, province in self.provinces:
            if province in match.group(1):
                station[-1] = abbr
                return print(station[1], '->', abbr)

    def convert(self, **kwargs) -> str:
        'Convert between language variants, such as zh-CN and zh-TW.'
        return self.site.get('parse', **kwargs)['parse']['displaytitle']


def load_provinces(file: TextIO) -> Iterable[List[str]]:
    'Load the province list from a text file.'
    for line in file:
        if line.strip():
            yield line.split()
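
# provinces.txt is assumed, judging by the (code, abbr, province) unpacking
# above, to hold one whitespace-separated row per province, e.g.:
#
#     11 京 北京市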


if __name__ == '__main__':
    with open(path) as f:
        stations = list(load_stations(f.read()))
    with open(argv(2) or 'provinces.txt') as f:
        provinces = list(load_provinces(f))

    stations = Wikipedia(stations, provinces).fill_missing_provinces()

    with open(path, 'w') as f:
        print(dump_stations(stations), file=f)
Example #11
def main():
    logging.basicConfig(level=logging.INFO)
    log.info("Performing final eval")

    #   get the data handling parameters
    ts_reduction = util.argv('-s', None, int)
    min_occ = util.argv('-o', 5, int)
    min_files = util.argv('-f', 2, int)
    n = util.argv('-n', 4, int)
    use_tree = '-t' in sys.argv
    bool_format = lambda s: s.lower() in ["1", "true", "yes", "t", "y"]
    ft_format = lambda s: map(bool_format, s)
    ftr_use = np.array(util.argv('-u', ft_format("001000"), ft_format))
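    #   e.g. '-u 001000' enables only the third of the six features:
    #   ftr_use == array([False, False, True, False, False, False])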
    use_lbl = '-l' in sys.argv

    #   nnet RBMs only support one-feature ngrams
    assert ftr_use.sum() == 1

    #   get nnet training parameters
    epochs = util.argv('-ep', 20, int)
    eps_llbl = util.argv('-eps_llbl', 0.0001, float)
    eps_lmlp = util.argv('-eps_lmlp', 0.0001, float)
    mnb_size = util.argv('-mnb', 500, int)
    d = util.argv('-d', 150, int)

    def path():
        """
        Returns a file-name base (without extension)
        for the model being evaluated.
        """
        #   the directory for this model
        dir = "%s_%s_%d-gram_features-%s_data-subset_%r-min_occ_%r-min_files_%r"\
            % ("llbl" if use_lbl else "lmlp",
                "tree" if use_tree else "linear", n,
                "".join([str(int(b)) for b in ftr_use]),
                ts_reduction, min_occ, min_files)
        dir = os.path.join('eval', dir)
        if not os.path.exists(dir):
            os.makedirs(dir)

        #   filename base for this model
        eps = eps_llbl if use_lbl else eps_lmlp
        file = "d-%d_train_mnb-%d_epochs-%d_eps-%.5f" % (d, mnb_size, epochs,
                                                         eps)
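
        #   with '-l' and all defaults, the returned base is e.g.:
        #   eval/llbl_linear_4-gram_features-001000_data-subset_None-min_occ_5-min_files_2/d-150_train_mnb-500_epochs-20_eps-0.00010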

        return os.path.join(dir, file)

    #   load data
    ngrams, q_groups, answers, feature_sizes = data.load_ngrams(
        n,
        ftr_use,
        use_tree,
        subset=ts_reduction,
        min_occ=min_occ,
        min_files=min_files)
    used_ftr_sizes = feature_sizes[ftr_use]
    #   remember, we only use one feature
    vocab_size = used_ftr_sizes[0]
    log.info("Data loaded, %d ngrams", ngrams.shape[0])

    #   split data into sets
    x_train, x_valid, x_test = util.dataset_split(
        ngrams, int(min(1e4, 0.05 * ngrams.shape[0])), 0.05, rng=456)

    def eval_msscc(prob_function):
        """
        Evaluates the given probability function on the
        Microsoft Research Sentence Completion Challenge.

        :param prob_function: A function that takes an array
            of ngrams of shape (N, n) and returns probabilities
            of each ngram.
        """
        sentence_prob = lambda sent: np.log(prob_function(sent)).sum()
        predictions = map(lambda qg: np.argmax(map(sentence_prob, qg)),
                          q_groups)
        return (answers == np.array(predictions)).mean()

    def eval_ngram(smoothing_param, use_kn=True):
        """
        Creates and evaluates ngram models
        with additive or Kneser-Ney smoothing.
        """
        #   get and eval Ngram model
        ngram_model = ngram.NgramModel(n, use_tree, ftr_use, feature_sizes,
                                       ts_reduction, min_occ, min_files, 0.0,
                                       0.0, x_train)

        def perplexity(smoothing, valid_set=True):
            dataset = x_valid if valid_set else x_test
            if use_kn:
                ngram_model.set_delta(smoothing)
                probability = ngram_model.probability_kn(dataset)
            else:
                ngram_model.set_lmbd(smoothing)
                probability = ngram_model.probability_additive(dataset)

            log_loss = -np.log(probability).mean()
            perplexity = np.exp(log_loss)
            log.info(
                "Ngrams %s, smoothing=%.e: log_loss: %.4f, perplexity: %.2f",
                "knesser_ney" if use_kn else "additive", smoothing, log_loss,
                perplexity)
            return perplexity

        best_lmbd = smoothing_param[np.argmin(map(perplexity,
                                                  smoothing_param))]
        log.info("Final eval of ngram model: %.2f",
                 perplexity(best_lmbd, False))

        log.info("Final eval on MRSCC: %.3f",
                 eval_msscc(ngram_model.probability_kn))

    def eval_net():
        """
        A function that creates, trains and evaluates an LMLP or LLBL.
        """
        #   we'll store the results in a dictionary
        #   and return that
        results = {}

        #   create the model: an LLBL or an LNNet
        if use_lbl:
            net = LLBL(n, vocab_size, d, 64353)
        else:
            net = LNNet(n, vocab_size, d, 64353)

        #   train models while the validation log-loss keeps falling
        #   by a significant margin per epoch of training,
        #   or until a maximum number of epochs is reached
        epoch_llos = []
        results["epoch_validation_logloss"] = epoch_llos

        def epoch_callback(net, epoch):

            epoch_llos.append(net.evaluate(x_valid, mnb_size))
            log.info("Epoch %d, validation cost: %.4f", epoch, epoch_llos[-1])
            if epoch < 4:
                return True
            else:
                return np.exp(epoch_llos[-4]) - np.exp(epoch_llos[-1]) > 1.

        training_time = time()
        net.epoch_callback = epoch_callback
        train_cost = net.train(x_train, mnb_size, epochs,
                               eps_llbl if use_lbl else eps_lmlp)
        training_time = time() - training_time
        results["train_time"] = training_time
        results["epoch_train_logloss"] = train_cost

        #   final evaluation on the test set
        evaluation = net.evaluate(x_test, mnb_size)
        results["final_eval"] = evaluation
        log.info("### Final evaluation score: %.4f", evaluation)

        #   final evaluation on the sentence completion
        prob_f = theano.function([net.input], net.probability)
        mrscc_eval = eval_msscc(prob_f)
        results["mrscc_eval"] = mrscc_eval
        log.info("### MRSCC evaluation score: %.3f", mrscc_eval)

        return results

    if '-eval_ngram' in sys.argv:
        #   evaluate ngram models, additive and Kneser-Ney
        # ngram_lmbd = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
        # eval_ngram(ngram_lmbd, False)
        ngram_delta = [0.4, 0.8, 0.9, 1.0]
        eval_ngram(ngram_delta, True)

    if '-eval_net' in sys.argv:
        results = eval_net()
        with open(path() + ".json", 'w+') as json_file:
            json.dump(results, json_file, indent=2)
        plot_log_loss(results["epoch_validation_logloss"],
                      results["epoch_train_logloss"],
                      path() + ".pdf")
Example #12
def main():
    """
    Trains and evaluates neural
    language models on the Microsoft Sentence Completion
    Challenge dataset.

    Allowed cmd-line flags:
        -s TS_FILES : Uses the reduced trainset (TS_FILES trainset files)
        -o MIN_OCCUR : Only uses terms that occur MIN_OCCUR or more times
            in the trainset. Other terms are replaced with a special token.
        -f MIN_FILES : Only uses terms that occur in MIN_FILES or more files
            in the trainset. Other terms are replaced with a special token.
        -n : n-gram length (default 4)
        -t : Use tree-grams (default does not use tree-grams)
        -u FTRS : Features to use. FTRS must be a string composed of zeros
            and ones, of length 5. Ones indicate usage of following features:
            (word, lemma, google_pos, penn_pos, dependency_type), respectively.

    Neural-net specific cmd-line flags:
        -l : Use the LLBL model instead of the LMLP.
        -ep EPOCHS : Number of training epochs, defaults to 20.
        -eps EPS : Learning rate, defaults to 0.002.
        -mnb MNB_SIZE : Size of the minibatch, defaults to 2000.
        -h N_HID : Number of hidden units, defaults to 1000.
        -d D : Representation dimensionality, defaults to 100.
        -v VAL_PER_EPOCH : Number of validations per epoch, defaults to 10.

    """
    logging.basicConfig(level=logging.INFO)
    log.info("Evaluating model")

    #   get the data handling parameters
    ts_reduction = util.argv('-s', None, int)
    min_occ = util.argv('-o', 5, int)
    min_files = util.argv('-f', 2, int)
    n = util.argv('-n', 4, int)
    use_tree = '-t' in sys.argv
    bool_format = lambda s: s.lower() in ["1", "true", "yes", "t", "y"]
    ft_format = lambda s: map(bool_format, s)
    ftr_use = np.array(util.argv('-u', ft_format("001000"), ft_format))
    val_per_epoch = util.argv('-v', 10, int)

    #   nnets only support one-feature ngrams
    assert ftr_use.sum() == 1

    #   get nnet training parameters
    use_lbl = '-l' in sys.argv
    epochs = util.argv('-ep', 20, int)
    eps = util.argv('-eps', 0.002, float)
    mnb_size = util.argv('-mnb', 2000, int)
    n_hid = util.argv('-h', 1000, int)
    d = util.argv('-d', 100, int)

    #   load data
    ngrams, q_groups, answers, feature_sizes = data.load_ngrams(
        n, ftr_use, use_tree, subset=ts_reduction,
        min_occ=min_occ, min_files=min_files)
    used_ftr_sizes = feature_sizes[ftr_use]
    #   remember, we only use one feature
    vocab_size = used_ftr_sizes[0]
    log.info("Data loaded, %d ngrams", ngrams.shape[0])

    #   split data into sets
    x_train, x_valid, x_test = util.dataset_split(ngrams, 0.05, 0.05, rng=456)

    #   generate a version of the validation set that has
    #   the first term (the conditioned one) randomized
    #   w.r.t. unigram distribution
    #   so first create the unigram distribution, no smoothing
    unigrams_data = data.load_ngrams(1, ftr_use, False, subset=ts_reduction,
                                     min_occ=min_occ, min_files=min_files)[0]
    unigrams_data = NgramModel(1, False, ftr_use, feature_sizes, ts_reduction,
                               min_occ, min_files, 0.0, 0.0, unigrams_data)
    unigrams_dist = unigrams_data.probability_additive(
        np.arange(vocab_size).reshape(vocab_size, 1))
    unigrams_dist /= unigrams_dist.sum()
    #   finally, generate validation sets with randomized term
    x_valid_r = random_ngrams(x_valid, vocab_size, False, unigrams_dist)

    #   the directory for this model
    dir = "%s_%s_%d-gram_features-%s_data-subset_%r-min_occ_%r-min_files_%r"\
        % ("llbl" if use_lbl else "lmlp",
            "tree" if use_tree else "linear", n,
            "".join([str(int(b)) for b in ftr_use]),
            ts_reduction, min_occ, min_files)
    dir = os.path.join(_DIR, dir)
    if not os.path.exists(dir):
        os.makedirs(dir)

    #   filename base for this model
    file = "nhid-%d_d-%d_train_mnb-%d_epochs-%d_eps-%.5f" % (
        n_hid, d, mnb_size, epochs, eps)

    #   optionally store the logs to a file (disabled here)
    if False:
        log_file_handler = logging.FileHandler(
            os.path.join(dir, file + ".log"))
        log_file_handler.setLevel(logging.INFO)
        logging.root.addHandler(log_file_handler)

    #   we will plot log-lik ratios for every _VALIDATE_MNB minibatches
    #   we will also plot true mean log-lik
    valid_on = {"x_valid": x_valid[:_LL_SIZE], "x_valid_r": x_valid_r[
        :_LL_SIZE], "x_train": x_train[:_LL_SIZE]}
    valid_ll = {k: [] for k in valid_on.keys()}
    valid_p_mean = {k: [] for k in valid_on.keys()}

    #   how often we validate
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    _VALIDATE_MNB = mnb_count / val_per_epoch
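    #   e.g. 1e6 training ngrams with mnb_size=2000 and val_per_epoch=10
    #   give mnb_count == 500, so we validate every 50 minibatches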

    def mnb_callback(net, epoch, mnb):
        """
        Callback function called after every minibatch.
        """
        if (mnb + 1) % _VALIDATE_MNB:
            return

        #   calculate log likelihood using the exact probability
        probability_f = theano.function([net.input], net.probability)
        for name, valid_set in valid_on.iteritems():
            p = probability_f(valid_set)
            valid_ll[name].append(np.log(p).mean())
            valid_p_mean[name].append(p.mean())

        log.info('Epoch %d, mnb: %d, x_valid mean-log-lik: %.5f'
                 ' , x_valid p-mean: %.5f'
                 ' , mean-log-lik difference (x_valid - x_valid_r): %.5f',
                 epoch, mnb, valid_ll["x_valid"][-1],
                 valid_p_mean["x_valid"][-1],
                 valid_ll["x_valid"][-1] - valid_ll["x_valid_r"][-1])

    #   track if the model progresses on the sentence completion challenge
    # sent_challenge = []

    def epoch_callback(net, epoch):

        #   log some info about the parameters, just so we know
        param_mean_std = [(k, v.mean(), v.std())
                          for k, v in net.params().iteritems()]
        log.info("Epoch %d: %s", epoch, "".join(
            ["\n\t%s: %.5f +- %.5f" % pms for pms in param_mean_std]))

        #   evaluate model on the sentence completion challenge
        # probability_f = theano.function([net.input], net.probability)
        # qg_log_lik = [[np.log(probability_f(q)).sum() for q in q_g]
        #               for q_g in q_groups]
        # predictions = map(lambda q_g: np.argmax(q_g), qg_log_lik)
        # sent_challenge.append((np.array(predictions) == answers).mean())
        # log.info('Epoch %d sentence completion eval score: %.4f',
        #          epoch, sent_challenge[-1])

    log.info("Creating model")
    if use_lbl:
        net = LLBL(n, vocab_size, d, 12345)
    else:
        net = LMLP(n, vocab_size, d, 12345)
    net.mnb_callback = mnb_callback
    net.epoch_callback = epoch_callback
    train_cost, valid_cost, _ = net.train(
        x_train, x_valid, mnb_size, epochs, eps)

    #   plot training progress info
    #   first we need values for the x-axis (minibatch count)
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    mnb_valid_ep = mnb_count / _VALIDATE_MNB
    x_axis_mnb = np.tile((np.arange(mnb_valid_ep) + 1) * _VALIDATE_MNB, epochs)
    x_axis_mnb += np.repeat(np.arange(epochs) * mnb_count, mnb_valid_ep)
    x_axis_mnb = np.hstack(([0], x_axis_mnb))
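    #   e.g. mnb_count=500, _VALIDATE_MNB=50 and epochs=2 give
    #   x_axis_mnb == [0, 50, 100, ..., 500, 550, ..., 1000]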

    plt.figure(figsize=(16, 12))
    plt.subplot(221)
    plt.plot(mnb_count * (np.arange(epochs) + 1), train_cost, 'b-',
             label='train')
    plt.plot(mnb_count * (np.arange(epochs) + 1), valid_cost, 'g-',
             label='valid')
    plt.axhline(min(valid_cost), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [min(valid_cost)])
    plt.title('cost')
    plt.grid()
    plt.legend(loc=1)

    plt.subplot(222)
    for name, valid_set in valid_ll.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.ylim((np.log(0.5 / vocab_size),
              max([max(v) for v in valid_ll.values()]) + 0.5))
    plt.axhline(max(valid_ll["x_valid"]), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [max(valid_ll["x_valid"])])
    plt.title('log-likelihood(x)')
    plt.grid()
    plt.legend(loc=4)

    plt.subplot(224)
    for name, valid_set in valid_p_mean.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.title('p(x).mean()')
    plt.grid()
    plt.legend(loc=4)

    # plt.subplot(224)
    # plt.plot(mnb_count * np.arange(epochs + 1), sent_challenge, 'g-')
    # plt.title('sent_challenge')
    # plt.grid()

    plt.savefig(os.path.join(dir, file + ".pdf"))