Example #1
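This entry point crawls a site to the given depth and pushes the resulting index into an i14y drawer; when production is False, the crawler authenticates against a staging site using HTTP basic auth credentials from the environment.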
import json
import os
import pickle

from ingestion.engine import Crawler
from ingestion.i14y import i14yClient  # assumed import path for the i14y client

def main(website, depth, production=True):
    # i14y credentials loaded from a pickled sequence (presumably the drawer
    # handle and secret token); pickle files must be opened in binary mode
    staging = pickle.load(open("i14y_creds.pickle", "rb"))
    website = "https://" + website
    if production:
        c = Crawler(website, int(depth))
    else:
        # Staging sites sit behind HTTP basic auth, so pull the
        # credentials from the environment
        c = Crawler(website,
                    int(depth),
                    username=os.environ["staging_username"],
                    password=os.environ["staging_password"],
                    basic_auth_required=True)
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    # TODO: Create a staging drawer and request a second search token
    for ind, elem in enumerate(index):
        # Post each crawled document to i14y, using its list index as the ID
        i14yClient.create(ind,
                          elem['content'],
                          elem['url'],
                          elem['created'],
                          staging[0],
                          staging[1],
                          title=elem['title'],
                          description=elem['description'],
                          promote=elem['promote'],
                          language=elem['language'])
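A minimal sketch of how this entry point might be invoked; the hostname and environment values are illustrative, not taken from the examples above.

os.environ.setdefault("staging_username", "user")   # hypothetical staging login
os.environ.setdefault("staging_password", "secret")
main("staging.example.gov", 2, production=False)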
Example #2
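This test crawls a site, runs uniqueify(), and checks that the saved index contains no duplicate URLs.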
import json

from ingestion.engine import Crawler

def test_uniqueify():
    c = Crawler("https://www.vets.gov", 2)
    c.crawl()
    c.uniqueify()  # drop duplicate pages from the crawl results
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    urls = sorted(str(elem['url']) for elem in index)
    unique_urls = sorted(set(urls))
    print()
    print("unique urls", len(unique_urls))
    print(unique_urls)
    print()
    print()
    print("urls", len(urls))
    print(urls)

    # After uniqueify(), the saved index should contain no duplicate URLs,
    # so the full sorted list and its deduplicated copy must match
    assert urls == unique_urls
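Crawler.uniqueify() itself is not shown in these examples. A minimal standalone sketch of the same idea, assuming each crawled record is a dict with a 'url' key, might look like this; it is an illustration, not the library's actual implementation.

def uniqueify(records):
    # Hypothetical URL-based deduplication: keep the first record seen
    # for each URL, preserving crawl order
    seen = set()
    unique = []
    for record in records:
        if record['url'] not in seen:
            seen.add(record['url'])
            unique.append(record)
    return unique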
Example #3
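A quick smoke test that points the crawler at a local development server over plain HTTP and prints what it collected.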
from ingestion.engine import Crawler

# Crawl a local development server over plain HTTP, two levels deep
c = Crawler("http://127.0.0.1:5000", 2, testing=True, protocol="http")
c.crawl()
print(c.data)  # records collected during the crawl
print(c.urls)  # URLs the crawler visited