import json
import os
import pickle

from ingestion.engine import Crawler
from api.clients import i14yClient


def main(website, depth, production=True):
    # i14y credentials saved as (drawer_handle, secret_token)
    staging = pickle.load(open("i14y_creds.pickle", "r"))
    website = "https://" + website
    if production:
        c = Crawler(website, int(depth))
    else:
        # non-production sites sit behind HTTP basic auth
        c = Crawler(website, int(depth),
                    username=os.environ["staging_username"],
                    password=os.environ["staging_password"],
                    basic_auth_required=True)
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    # ToDo: Create a staging drawer and request a second search token
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['url'], elem['created'],
                          staging[0], staging[1],
                          title=elem['title'], description=elem['description'],
                          promote=elem['promote'], language=elem['language'])
import json
import os

from ingestion.engine import Crawler
from api.clients import i14yClient


def main(website, depth):
    website = "https://" + website
    c = Crawler(website, int(depth))
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    # drawer handle and search token are read from the environment
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['path'], elem['created'],
                          os.environ["drawer_handle"], os.environ["search_secret_token"],
                          title=elem['title'], description=elem['description'],
                          promote=elem['promote'], language=elem['language'])
import json

from ingestion.engine import Crawler


def test_uniqueify():
    c = Crawler("https://www.vets.gov", 2)
    c.crawl()
    c.uniqueify()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    urls = []
    unique_urls = []
    for ind, elem in enumerate(index):
        urls.append(elem['url'])
    unique_urls = list(set(urls))
    urls = [str(url) for url in urls]
    unique_urls = [str(url) for url in unique_urls]
    urls.sort()
    unique_urls.sort()
    print
    print "unique urls", len(unique_urls)
    print unique_urls
    print
    print
    print "urls", len(urls)
    print urls
    # after uniqueify() the index should contain no duplicate urls
    assert urls == unique_urls
from ingestion.engine import Crawler
from api.clients import i14yClient
from sys import argv
import pickle
import json

website_creds = pickle.load(open("website_creds.pickle", "r"))
backend_creds = pickle.load(open("backend_creds.pickle", "r"))

# usage: pass the website and crawl depth as command-line arguments
c = Crawler(argv[1], int(argv[2]))
# c = Crawler(argv[1], int(argv[2]), username=website_creds["username"],
#             password=website_creds["password"], basic_auth_required=True)
c.crawl()
# print c.data
# print c.urls
c.save_to_json()
index = json.load(open('index.json', 'r'))
# push every indexed page into the i14y drawer
for ind, elem in enumerate(index):
    i14yClient.create(ind, elem['content'], elem['path'], elem['created'],
                      backend_creds["drawer_handle"], backend_creds["secret_token"],
                      title=elem['title'], description=elem['description'],
                      promote=elem['promote'], language=elem['language'])