Beispiel #1
0
def main():
    """Parse the command line, build the crawl environments and run training.

    Steps, in order:
      1. Install a fresh ``Timer`` in the module-level ``TIMER`` global.
      2. Parse the command-line options.
      3. Make sure the save directory and ``pickled_domains`` exist.
      4. Build ``languages`` / ``params`` from the config file and options.
      5. Create environments for the hard-coded training and test hosts,
         truncated to ``--num-train-hosts`` / ``--num-test-hosts`` entries.
      6. Build the Q-network and call ``Train`` inside a TensorFlow session.
    """
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The 2 language we're interested in, separated by ,")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP are saved to. If existing model exists then load it")
    oparser.add_argument("--save-plots", dest="saveDirPlots", default="plot",
                         help="Directory ")
    oparser.add_argument("--num-train-hosts", dest="numTrainHosts", type=int,
                         default=1, help="Number of domains to train on")
    oparser.add_argument("--num-test-hosts", dest="numTestHosts", type=int,
                         default=3, help="Number of domains to test on")
    oparser.add_argument("--max-crawl", dest="maxCrawl", type=int,
                         default=sys.maxsize, help="Maximum number of pages to crawl")
    oparser.add_argument("--gamma", dest="gamma", type=float,
                         default=0.999, help="Reward discount")
    options = oparser.parse_args()

    # No fixed seed is supplied, so runs are not reproducible by design.
    np.random.seed()
    # Print floats with one decimal and use very wide lines for array dumps.
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    # exist_ok=True already tolerates an existing directory, so no separate
    # os.path.exists() check is needed (that check was race-prone and would
    # silently skip creation when the path existed as a regular file).
    os.makedirs(options.saveDir, exist_ok=True)
    os.makedirs("pickled_domains", exist_ok=True)

    languages = GetLanguages(options.configFile)
    params = LearningParams(languages, options, languages.maxLangId, languages.GetLang("None"))

    print("options.numTrainHosts", options.numTrainHosts)
    # Training hosts; only the first --num-train-hosts entries are used.
    #hosts = ["http://vade-retro.fr/"]
    hosts = ["http://telasmos.org/"]
    #hosts = ["http://www.buchmann.ch/", "http://telasmos.org/", "http://tagar.es/"]
    #hosts = ["http://www.visitbritain.com/"]

    # Test hosts; only the first --num-test-hosts entries are used.
    #hostsTest = ["http://vade-retro.fr/"]
    #hostsTest = ["http://www.visitbritain.com/"]
    hostsTest = ["http://www.visitbritain.com/", "http://chopescollection.be/", "http://www.bedandbreakfast.eu/"]

    envs = GetEnvs(options.configFile, languages, hosts[:options.numTrainHosts])
    envsTest = GetEnvs(options.configFile, languages, hostsTest[:options.numTestHosts])

    tf.reset_default_graph()
    qn = Qnetwork(params)
    init = tf.global_variables_initializer()

    # Checkpointing is currently disabled: Train receives None as the saver.
    saver = None #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)

        Train(params, sess, saver, qn, envs, envsTest)
Beispiel #2
0
def main():
    """Parse the command line, build the crawl environments and run training.

    Steps, in order:
      1. Install a fresh ``Timer`` in the module-level ``TIMER`` global.
      2. Parse the command-line options.
      3. Build ``languages`` / ``params`` from the config file and options.
      4. Make sure the plot directory exists.
      5. Create environments for the hard-coded training and test hosts,
         truncated to ``--num-train-hosts`` / ``--num-test-hosts`` entries.
      6. Build the Q-networks and call ``Train`` inside a TensorFlow session.
    """
    global TIMER
    TIMER = Timer()

    def _parse_bool(text):
        """Convert a command-line token such as "true" or "0" to a bool."""
        token = text.strip().lower()
        if token in ("1", "true", "t", "yes", "y", "on"):
            return True
        if token in ("0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got %r" % text)

    oparser = argparse.ArgumentParser(
        description="intelligent crawling with q-learning")
    oparser.add_argument(
        "--config-file",
        dest="configFile",
        required=True,
        help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument(
        "--language-pair",
        dest="langPair",
        required=True,
        help="The 2 language we're interested in, separated by ,")
    oparser.add_argument(
        "--save-dir",
        dest="saveDir",
        default=".",
        help=
        "Directory that model WIP are saved to. If existing model exists then load it"
    )
    oparser.add_argument("--save-plots",
                         dest="saveDirPlots",
                         default="plot",
                         help="Directory ")
    # Without a converter argparse stores the raw string, so "False" or "0"
    # would be truthy. Parse the value explicitly; the bare flag (no value)
    # is accepted as True.
    oparser.add_argument(
        "--delete-duplicate-transitions",
        dest="deleteDuplicateTransitions",
        type=_parse_bool,
        nargs="?",
        const=True,
        default=False,
        help="If True then only unique transition are used in each batch")
    oparser.add_argument("--num-train-hosts",
                         dest="numTrainHosts",
                         type=int,
                         default=1,
                         help="Number of domains to train on")
    oparser.add_argument("--num-test-hosts",
                         dest="numTestHosts",
                         type=int,
                         default=3,
                         help="Number of domains to test on")
    options = oparser.parse_args()

    # No fixed seed is supplied, so runs are not reproducible by design.
    np.random.seed()
    # Print floats with one decimal and use very wide lines for array dumps.
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)},
                        linewidth=666)

    languages = GetLanguages(options.configFile)
    params = LearningParams(languages, options.saveDir, options.saveDirPlots,
                            options.deleteDuplicateTransitions,
                            options.langPair, languages.maxLangId,
                            languages.GetLang("None"))

    # makedirs + exist_ok handles nested paths and avoids the check-then-create
    # race of os.path.exists() followed by os.mkdir().
    os.makedirs(options.saveDirPlots, exist_ok=True)

    # Training hosts; only the first --num-train-hosts entries are used.
    #hostName = "http://vade-retro.fr/"
    hosts = ["http://www.buchmann.ch/"
             ]  #, "http://telasmos.org/", "http://tagar.es/"]
    #hostName = "http://www.visitbritain.com/"

    # Test hosts; only the first --num-test-hosts entries are used.
    #hostNameTest = "http://vade-retro.fr/"
    #hostNameTest = "http://www.buchmann.ch/"
    hostsTest = [
        "http://www.visitbritain.com/", "http://chopescollection.be/",
        "http://www.bedandbreakfast.eu/"
    ]

    envs = GetEnvs(options.configFile, languages,
                   hosts[:options.numTrainHosts])
    envsTest = GetEnvs(options.configFile, languages,
                       hostsTest[:options.numTestHosts])

    tf.reset_default_graph()
    qns = Qnets(params)
    init = tf.global_variables_initializer()

    # Checkpointing is currently disabled: Train receives None as the saver.
    saver = None  #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)

        # Train's return value was never used here, so it is not bound.
        Train(params, sess, saver, qns, envs, envsTest)