Example #1
from flask import request, jsonify

def run_crawlers():
    # Parse the JSON body once instead of twice.
    payload = request.get_json(force=True)
    portals = payload["portals"]
    options = payload["options"]
    factory = CrawlerFactory()
    data = []
    for portal in portals:
        try:
            data.append(factory.create_crawler(portal, options).fetch_data())
        except AttributeError as err:
            # jsonify() cannot serialize an exception object; send its message.
            print(err)
            return jsonify({"message": str(err)}), 400
        except Exception as err:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit still propagate.
            print(err)
            return jsonify({"message": "Something went wrong in crawler"}), 400
    if not data:
        return jsonify(data)
    return jsonify(flatten(data))
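
Example #1 leans on two helpers that are not shown on this page: CrawlerFactory.create_crawler and flatten. A minimal sketch of what they might look like, assuming a name-to-class registry; the registry contents and the NewsPortalCrawler class are made up for illustration, not the project's actual code:

class NewsPortalCrawler(object):
    def __init__(self, options):
        self.options = options

    def fetch_data(self):
        return []  # a real crawler would return a list of scraped records

class CrawlerFactory(object):
    # Registry mapping a portal name to its crawler class (contents assumed).
    crawlers = {"news_portal": NewsPortalCrawler}

    def create_crawler(self, portal, options):
        # A getattr-style lookup would explain the AttributeError handler in
        # Example #1; a plain dict lookup raises KeyError instead.
        return self.crawlers[portal](options)

def flatten(nested):
    # Collapse the per-portal result lists into a single flat list.
    return [item for sublist in nested for item in sublist]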
Example #2
'''
Created on 22 Jan 2012

@author: george
'''
import datetime
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import TwoGroupsTweet
from mongoengine import *

f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = "uk OR #uk OR #UK OR #usa OR #USA OR #US OR usa OR us"
t.search_for(search_hashtags)
t.search_between(from_date=datetime.datetime(2011, 1, 23, 0, 0, 0),
                 to_date=datetime.datetime(2011, 1, 25, 0, 0, 0),
                 granularity_days=1, 
                 granularity_hours=0, 
                 granularity_mins=0)
t.retrieve_items_of_type(TwoGroupsTweet)
t.crawl()

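The search_between call above suggests the Topsy crawler splits the requested range into fixed-size windows (one day here) and queries each window in turn. A standalone sketch of that windowing; the date_windows helper is a made-up name, not the project's API:

import datetime

def date_windows(from_date, to_date, days=1, hours=0, mins=0):
    # Yield consecutive (start, end) windows covering [from_date, to_date).
    step = datetime.timedelta(days=days, hours=hours, minutes=mins)
    start = from_date
    while start < to_date:
        end = min(start + step, to_date)
        yield start, end
        start = end

for start, end in date_windows(datetime.datetime(2011, 1, 23),
                               datetime.datetime(2011, 1, 25)):
    print(start, "->", end)  # two one-day windows
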
Example #3
import unittest
from crawlers.CrawlerFactory import CrawlerFactory

class CrawlerFactoryTest(unittest.TestCase):  # scaffolding assumed
    def test_construction_of_twitter_crawlers(self):
        factory = CrawlerFactory()
        t = factory.get_crawler("twitter")
        t.login()
        info = t.getUserInfoByScreenName("GeorgeEracleous")
        self.assertIsNotNone(info)  # assertion added; original stopped here
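
The test above hits the live Twitter API through t.login(). A sketch of the same check run offline by stubbing the network calls; it uses unittest.mock from Python 3 (on Python 2 the mock backport provides the same API), and the stubbed return value is an assumption:

import unittest
from unittest import mock
from crawlers.CrawlerFactory import CrawlerFactory

class CrawlerFactoryOfflineTest(unittest.TestCase):
    def test_twitter_crawler_without_network(self):
        factory = CrawlerFactory()
        t = factory.get_crawler("twitter")
        # Stub the calls that would otherwise hit Twitter.
        with mock.patch.object(t, "login"), \
             mock.patch.object(t, "getUserInfoByScreenName",
                               return_value={"screen_name": "GeorgeEracleous"}):
            t.login()
            info = t.getUserInfoByScreenName("GeorgeEracleous")
        self.assertEqual(info["screen_name"], "GeorgeEracleous")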
Example #5
'''
Created on 22 Jan 2012

@author: george
'''
import datetime
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import *
from database.model.agents import *
from mongoengine import *
import tools.utils
from urlparse import urlparse
from database.warehouse import WarehouseServer

f = CrawlerFactory()
twitter = f.get_crawler("twitter")
#twitter.login()
ws = WarehouseServer()

from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=100)
screen_names = []
for tweet in items:
    screen_names.append(tweet.author_screen_name)
screen_names = set(screen_names)
print(len(screen_names))
# A terrible hack: save the screen_names of users who are mentioned in tweets
# but are not yet in the database. They'll be processed after all the authors
# have been stored.
mentions_of_not_stored_users = []
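
The script ends just as it sets up mentions_of_not_stored_users, so the loop that fills it is not shown on this page. A standalone sketch of one way to pull @mentions out of tweet text; the regex, the sample data, and the field layout are all assumptions, not the project's code:

import re

MENTION_RE = re.compile(r"@(\w{1,15})")  # Twitter screen names are at most 15 chars

# Stand-ins for the fetched tweets and the already-stored authors above.
tweet_texts = ["@alice meet @bob at #uk", "@carol says hi"]
screen_names = set(["alice"])

mentions_of_not_stored_users = []
for text in tweet_texts:
    for mention in MENTION_RE.findall(text):
        if mention not in screen_names:
            mentions_of_not_stored_users.append(mention)

print(mentions_of_not_stored_users)  # ['bob', 'carol']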
Example #6
'''
Created on 22 Jan 2012

@author: george
'''
from database.model.agents import TrainingAuthor
from crawlers.CrawlerFactory import CrawlerFactory

f = CrawlerFactory()
crawler = f.get_crawler("scrapy")

crawler.setup(user_type=TrainingAuthor)
crawler.crawl(store=True)
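
The setup(user_type=TrainingAuthor) call suggests the scrapy-based crawler stores what it finds as TrainingAuthor documents via mongoengine. A hypothetical follow-up to inspect the stored results; the database name and the screen_name field are guesses, and a mongoengine connection is assumed:

from mongoengine import connect
from database.model.agents import TrainingAuthor

connect("crawler_db")  # database name assumed
print(TrainingAuthor.objects.count())
for author in TrainingAuthor.objects[:5]:
    print(author.screen_name)  # field name assumed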