Example #1
def main(config_file, restart):
    # Parse the INI configuration file and wrap it in a Config object.
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    # Register with the cache server, then build and start the crawler.
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()
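All eight examples share this signature and also assume "from configparser import ConfigParser" plus project-specific imports for Config, Crawler, and get_cache_server, which the excerpts omit. A typical command-line entry point that drives main() might look like the sketch below; the flag names and defaults are assumptions, not part of the examples.

from argparse import ArgumentParser

if __name__ == "__main__":
    parser = ArgumentParser()
    # Assumed flags: --restart wipes prior state, --config_file names the INI file.
    parser.add_argument("--restart", action="store_true", default=False)
    parser.add_argument("--config_file", type=str, default="config.ini")
    args = parser.parse_args()
    main(args.config_file, args.restart)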
Example #2
def main(config_file, restart):
    file_count = 1
    # If data.txt holds data (more than the 2 bytes of an empty "{}"
    # placeholder), copy its contents into a fresh numbered record file.
    if os.path.exists("data.txt") and os.path.getsize("data.txt") > 2:
        # Advance to the first unused record number.
        while os.path.exists(f"records/data_record{file_count}.txt"):
            file_count += 1

        with open("data.txt", "r") as infile, \
             open(f"records/data_record{file_count}.txt", "w") as outfile:
            outfile.write(infile.read())

    # Likewise copy subdomains.txt into a record file if it holds data;
    # note this reuses whatever number the loop above reached.
    if os.path.exists("subdomains.txt") and os.path.getsize("subdomains.txt") > 2:
        with open("subdomains.txt", "r") as infile, \
             open(f"records/subdomains_record{file_count}.txt", "w") as outfile:
            outfile.write(infile.read())

    # Create or overwrite subdomains.txt with an empty JSON object.
    # with open("subdomains.txt", "w") as file_contents:
    #     file_contents.write("{}")

    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()
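In this example file_count only advances while scanning data_record files, so the subdomains record reuses whatever number that loop reached and can overwrite an existing records/subdomains_record1.txt when data.txt is empty. A sketch that numbers each record family independently; the helper name next_record_path is hypothetical:

import os

def next_record_path(prefix):
    # Return the first unused numbered path for this record family,
    # mirroring the while-loop above (hypothetical helper).
    n = 1
    while os.path.exists(f"records/{prefix}{n}.txt"):
        n += 1
    return f"records/{prefix}{n}.txt"

# Usage: open(next_record_path("subdomains_record"), "w")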
Example #3
def main(config_file, restart):
    # Open the config file and read its settings.
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    # Build the crawler and start it.
    crawler = Crawler(config, restart)
    crawler.start()
Example #4
def main(config_file, restart):
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)

    # On a restart, remove any scraper output files left over from a
    # previous run so the crawl starts from a clean slate.
    if restart:
        for filename in config.get_scraper_files():
            if os.path.exists(filename):
                os.remove(filename)

    crawler = Crawler(config, restart)
    crawler.start()
Example #5
def main(config_file, restart):
    # DELETE_DATA_FILES and DELETE_LOG_FILES are module-level switches:
    # when set, wipe the listed data and log files before the crawl begins.
    if DELETE_DATA_FILES:
        for file in DATA_FILES:
            if os.path.exists(file):
                os.remove(file)
    if DELETE_LOG_FILES:
        for file in LOG_FILES:
            if os.path.exists(file):
                os.remove(file)
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()
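DELETE_DATA_FILES, DATA_FILES, DELETE_LOG_FILES, and LOG_FILES are not defined in the excerpt; they are presumably module-level constants along these lines (the switch values and file names here are illustrative guesses, not from the original):

# Assumed module-level configuration for the cleanup step above.
DELETE_DATA_FILES = True
DATA_FILES = ["data.txt", "subdomains.txt"]
DELETE_LOG_FILES = True
LOG_FILES = ["crawler.log"]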
Example #6
def main(config_file, restart):
    print("main begin")
    cparser = ConfigParser()
    print("parser created")
    cparser.read(config_file)
    print("config file successfully read")
    config = Config(cparser)
    print("Config object created")

    config.cache_server = get_cache_server(config, restart)
    print("cache server obtained")

    crawler = Crawler(config, restart)
    print("Crawler created")
    crawler.start()
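The prints above are checkpoint traces for debugging startup. The same trace could go through the standard logging module instead, which adds timestamps and can be silenced per level without deleting lines; a minimal sketch, not part of the original example:

import logging

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
log.info("config file successfully read")  # replaces the bare print call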
Example #7
def main(config_file, restart):
    crawler = None

    try:
        cparser = ConfigParser()
        cparser.read(config_file)
        # Expose the config at module level for other components.
        global config
        config = Config(cparser)
        config.cache_server = get_cache_server(config, restart)
        crawler = Crawler(config, restart)
        crawler.start()
    except KeyboardInterrupt:
        # On Ctrl+C, flush the frontier's persistent store before exiting.
        if crawler:
            crawler.frontier.save.sync()
        print('Keyboard Interrupt Detected !!')
    finally:
        # Always close the frontier, whether the crawl finished or was interrupted.
        if crawler:
            crawler.frontier.close()
        print('Goodbye !!')
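crawler.frontier.save is presumably a persistent dict-like store such as a shelve object: sync() flushes pending writes to disk and close() does a final flush and releases the file, which would be why the handler syncs on Ctrl+C and frontier.close() runs unconditionally in finally. A minimal sketch of that pattern, assuming a shelve-backed frontier (the file name and key are assumptions):

import shelve

save = shelve.open("frontier_save")   # persistent dict-like store on disk
save["http://example.com"] = False    # e.g. mark a URL as not yet downloaded
save.sync()                           # flush pending writes (the except block)
save.close()                          # final flush and release (the finally block)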
Example #8
def main(config_file, restart):
    # The list of text words is saved to a separate file in case the
    # program crashes and the in-memory list is lost. If textlist.txt
    # already exists, opening it in write mode truncates it, erasing the
    # previous contents; the list of URLs is handled the same way.
    # This fresh start happens only when the --restart parameter is given.
    if restart:
        open('textlist.txt', 'w').close()
        open('urllist.txt', 'w').close()

    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()