import json
import os
import pickle

from ingestion.engine import Crawler
# The i14y client wrapper (i14yClient) is assumed to be importable from
# elsewhere in this project.


def main(website, depth, production=True):
    # Stored i14y credentials; staging[0] and staging[1] are passed to the
    # client call below.
    staging = pickle.load(open("i14y_creds.pickle", "r"))
    website = "https://" + website
    if production:
        c = Crawler(website, int(depth))
    else:
        # Staging sites sit behind HTTP basic auth.
        c = Crawler(website, int(depth),
                    username=os.environ["staging_username"],
                    password=os.environ["staging_password"],
                    basic_auth_required=True)
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    # ToDo: Create a staging drawer and request a second search token
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['url'], elem['created'],
                          staging[0], staging[1],
                          title=elem['title'],
                          description=elem['description'],
                          promote=elem['promote'],
                          language=elem['language'])
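# A minimal sketch of a command-line entry point for main(); the use of
# sys.argv, the argument order, and the optional staging flag are
# assumptions for illustration, not part of the original module.
if __name__ == "__main__":
    import sys
    site = sys.argv[1]         # bare hostname, e.g. "www.vets.gov"; main() prepends "https://"
    crawl_depth = sys.argv[2]  # crawl depth; main() converts it with int()
    # Any third argument switches to the basic-auth (staging) crawler path.
    use_production = len(sys.argv) < 4
    main(site, crawl_depth, production=use_production)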
import json

from ingestion.engine import Crawler


def test_uniqueify():
    # Crawl a live site, de-duplicate the index, and check that every
    # indexed URL appears exactly once.
    c = Crawler("https://www.vets.gov", 2)
    c.crawl()
    c.uniqueify()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))

    urls = []
    for elem in index:
        urls.append(elem['url'])
    unique_urls = list(set(urls))

    urls = [str(url) for url in urls]
    unique_urls = [str(url) for url in unique_urls]
    urls.sort()
    unique_urls.sort()

    print
    print "unique urls", len(unique_urls)
    print unique_urls
    print
    print
    print "urls", len(urls)
    print urls

    assert urls == unique_urls
from ingestion.engine import Crawler

# Quick smoke test against a local development server on port 5000.
c = Crawler("http://127.0.0.1:5000", 2, testing=True, protocol="http")
c.crawl()
print c.data
print c.urls