def test_single_page_crawl():
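    # Crawl a single local test page at depth 1 and verify that exactly
    # that URL and its "Hello World!" body were captured.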
    c = Crawler("http://localhost:5000/test_one",1)
    print "initialized crawler.."
    c.crawl()
    print "crawled website.."
    assert c.urls == ["http://localhost:5000/test_one"] 
    assert c.data == ["Hello World!"]
def test_single_page_with_complex_content():
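    # Crawl a small public GitHub Pages site at depth 1 and check both the
    # set of discovered URLs and the extracted page text.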
    c = Crawler("http://hackingagainstslavery.github.io",1)
    print "initalized crawler.."
    c.crawl()
    print "crawled website.."
    assert c.urls == ['https://github.com/hackingagainstslavery', 'http://hackingagainstslavery.slack.com', 'http://hackingagainstslavery.github.io']
    print c.data
    assert c.data == ["Hacking Against Slavery HomeAboutBlog Welcome to Hacking Against Slavery! An informal organization for ending slavery Sign up for our slack channel: http://hackingagainstslavery.slack.com by emailing us at [email protected] out our github: https://github.com/hackingagainstslaverySeperately our voices are weak. Together our voices are strong. It's up to all of us to fight the evil in the world. And it starts by ensuring freedom for all. email github.com/hackingagainstslavery"]
def main(website,depth):
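    # Crawl the site, save the index to index.json, then push each indexed
    # document to i14y using credentials taken from environment variables.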
    website = "https://" + website
    c = Crawler(website, int(depth))
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['path'],
                          elem['created'], os.environ["drawer_handle"], os.environ["search_secret_token"],
                          title=elem['title'], description=elem['description'],
                          promote=elem['promote'], language=elem['language'])
def main(website, depth, production=True):
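    # Same ingestion flow as above, but a non-production run crawls behind
    # HTTP basic auth using staging credentials from environment variables,
    # while the i14y handle and token come from a local pickle file.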
    staging = pickle.load(open("i14y_creds.pickle", "r"))
    website = "https://" + website
    if production:
        c = Crawler(website, int(depth))
    else:
        c = Crawler(website,
                    int(depth),
                    username=os.environ["staging_username"],
                    password=os.environ["staging_password"],
                    basic_auth_required=True)
    c.crawl()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    # TODO: Create a staging drawer and request a second search token
    for ind, elem in enumerate(index):
        i14yClient.create(ind,
                          elem['content'],
                          elem['url'],
                          elem['created'],
                          staging[0],
                          staging[1],
                          title=elem['title'],
                          description=elem['description'],
                          promote=elem['promote'],
                          language=elem['language'])
def test_uniqueify():
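    # After uniqueify(), the saved index should contain no duplicate URLs,
    # so the sorted URL list must equal its de-duplicated counterpart.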
    c = Crawler("https://www.vets.gov", 2)
    c.crawl()
    c.uniqueify()
    c.save_to_json()
    index = json.load(open('index.json', 'r'))
    urls = []
    for ind, elem in enumerate(index):
        urls.append(elem['url'])
    unique_urls = list(set(urls))
    urls = [str(url) for url in urls]
    unique_urls = [str(url) for url in unique_urls]
    urls.sort()
    unique_urls.sort()
    print
    print "unique urls", len(unique_urls)
    print unique_urls
    print
    print
    print "urls", len(urls)
    print urls

    assert urls == unique_urls
from ingestion.engine import Crawler

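# Smoke test: crawl a local development server over plain HTTP at depth 2
# and dump what was collected.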
c = Crawler("http://127.0.0.1:5000",2,testing=True,protocol="http")
c.crawl()
print c.data
print c.urls
from ingestion.engine import Crawler
from api.clients import i14yClient
from sys import argv
import pickle
import json

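# Command-line ingestion run: crawl the site named in argv, save the index,
# and push every document to i14y using credentials loaded from pickle files.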
website_creds = pickle.load(open("website_creds.pickle", "r"))
backend_creds = pickle.load(open("backend_creds.pickle", "r"))
c = Crawler(argv[1], int(argv[2]))  # ,username=website_creds["username"],password=website_creds["password"],basic_auth_required=True)
c.crawl()
#print c.data
#print c.urls
c.save_to_json()
index = json.load(open('index.json', 'r'))
for ind, elem in enumerate(index):
    i14yClient.create(ind, elem['content'], elem['path'],
                      elem['created'], backend_creds["drawer_handle"], backend_creds["secret_token"],
                      title=elem['title'], description=elem['description'],
                      promote=elem['promote'], language=elem['language'])