def test_single_page_crawl():
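    # Crawl a single local test page at depth 1 and verify that exactly
    # that URL and its "Hello World!" body were captured.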
    c = Crawler("http://localhost:5000/test_one",1)
    print "initialized crawler.."
    c.crawl()
    print "crawled website.."
    assert c.urls == ["http://localhost:5000/test_one"] 
    assert c.data == ["Hello World!"]
def test_single_page_with_complex_content():
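    # Crawl a small public GitHub Pages site at depth 1 and check both the
    # set of discovered URLs and the extracted page text.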
    c = Crawler("http://hackingagainstslavery.github.io",1)
    print "initalized crawler.."
    c.crawl()
    print "crawled website.."
    assert c.urls == ['https://github.com/hackingagainstslavery', 'http://hackingagainstslavery.slack.com', 'http://hackingagainstslavery.github.io']
    print c.data
    assert c.data == ["Hacking Against Slavery HomeAboutBlog Welcome to Hacking Against Slavery! An informal organization for ending slavery Sign up for our slack channel: http://hackingagainstslavery.slack.com by emailing us at [email protected] out our github: https://github.com/hackingagainstslaverySeperately our voices are weak. Together our voices are strong. It's up to all of us to fight the evil in the world. And it starts by ensuring freedom for all. email github.com/hackingagainstslavery"]
def main(website,depth):
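    # Crawl the site, save the index to index.json, then push each indexed
    # document to i14y using credentials taken from environment variables.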
    website = "https://" + website
    c = Crawler(website, int(depth))
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['path'],
                          elem['created'], os.environ["drawer_handle"], os.environ["search_secret_token"],
                          title=elem['title'], description=elem['description'],
                          promote=elem['promote'], language=elem['language'])
def main(website, depth, production=True):
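    # Same ingestion flow as above, but a non-production run crawls behind
    # HTTP basic auth using staging credentials from environment variables,
    # while the i14y handle and token come from a local pickle file.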
    staging = pickle.load(open("i14y_creds.pickle", "r"))
    website = "https://" + website
    if production:
        c = Crawler(website, int(depth))
    else:
        c = Crawler(website,
                    int(depth),
                    username=os.environ["staging_username"],
                    password=os.environ["staging_password"],
                    basic_auth_required=True)
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    # TODO: Create a staging drawer and request a second search token
    for ind, elem in enumerate(index):
        i14yClient.create(ind,
                          elem['content'],
                          elem['url'],
                          elem['created'],
                          staging[0],
                          staging[1],
                          title=elem['title'],
                          description=elem['description'],
                          promote=elem['promote'],
                          language=elem['language'])
def test_uniqueify():
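    # After uniqueify(), the saved index should contain no duplicate URLs,
    # so the sorted URL list must equal its de-duplicated counterpart.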
    c = Crawler("https://www.vets.gov", 2)
    c.crawl()
    c.uniqueify()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    urls = []
    for ind, elem in enumerate(index):
        urls.append(elem['url'])
    unique_urls = list(set(urls))
    urls = [str(url) for url in urls]
    unique_urls = [str(url) for url in unique_urls]
    urls.sort()
    unique_urls.sort()
    print
    print "unique urls", len(unique_urls)
    print unique_urls
    print
    print
    print "urls", len(urls)
    print urls

    assert urls == unique_urls
from ingestion.engine import Crawler

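# Smoke test: crawl a local development server over plain HTTP at depth 2
# and dump what was collected.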
c = Crawler("http://127.0.0.1:5000",2,testing=True,protocol="http")
c.crawl()
print c.data
print c.urls
from ingestion.engine import Crawler
from api.clients import i14yClient
from sys import argv
import pickle
import json

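# Command-line ingestion run: crawl the site named in argv, save the index,
# and push every document to i14y using credentials loaded from pickle files.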
website_creds = pickle.load(open("website_creds.pickle", "r"))
backend_creds = pickle.load(open("backend_creds.pickle", "r"))
c = Crawler(argv[1], int(argv[2]))  # ,username=website_creds["username"],password=website_creds["password"],basic_auth_required=True)
c.crawl()
#print c.data
#print c.urls
c.save_to_json()
index = json.load(open('index.json', 'r'))
for ind, elem in enumerate(index):
    i14yClient.create(ind, elem['content'], elem['path'],
                      elem['created'], backend_creds["drawer_handle"], backend_creds["secret_token"],
                      title=elem['title'], description=elem['description'],
                      promote=elem['promote'], language=elem['language'])