Example #1
from flask import request, jsonify

def run_crawlers():
    # Parse the JSON body once instead of twice.
    payload = request.get_json(force=True)
    portals = payload["portals"]
    options = payload["options"]
    factory = CrawlerFactory()
    data = []
    for portal in portals:
        try:
            data.append(factory.create_crawler(portal, options).fetch_data())
        except AttributeError as err:
            # jsonify() cannot serialize an exception object; send its message.
            print(err)
            return jsonify({"message": str(err)}), 400
        except Exception as err:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit still propagate.
            print(err)
            return jsonify({"message": "Something went wrong in crawler"}), 400
    if not data:
        return jsonify(data)
    return jsonify(flatten(data))
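
Example #1 leans on two helpers that are not shown on this page: CrawlerFactory.create_crawler and flatten. A minimal sketch of what they might look like, assuming a name-to-class registry; the registry contents and the NewsPortalCrawler class are made up for illustration, not the project's actual code:

class NewsPortalCrawler(object):
    def __init__(self, options):
        self.options = options

    def fetch_data(self):
        return []  # a real crawler would return a list of scraped records

class CrawlerFactory(object):
    # Registry mapping a portal name to its crawler class (contents assumed).
    crawlers = {"news_portal": NewsPortalCrawler}

    def create_crawler(self, portal, options):
        # A getattr-style lookup would explain the AttributeError handler in
        # Example #1; a plain dict lookup raises KeyError instead.
        return self.crawlers[portal](options)

def flatten(nested):
    # Collapse the per-portal result lists into a single flat list.
    return [item for sublist in nested for item in sublist]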
Example #2
'''
Created on 22 Jan 2012

@author: george
'''
import datetime
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import TwoGroupsTweet
from mongoengine import *

f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = "uk OR #uk OR #UK OR #usa OR #USA OR #US OR usa OR us"
t.search_for(search_hashtags)
t.search_between(from_date=datetime.datetime(2011, 1, 23, 0, 0, 0),
                 to_date=datetime.datetime(2011, 1, 25, 0, 0, 0),
                 granularity_days=1, 
                 granularity_hours=0, 
                 granularity_mins=0)
t.retrieve_items_of_type(TwoGroupsTweet)
t.crawl()

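The search_between call above suggests the Topsy crawler splits the requested range into fixed-size windows (one day here) and queries each window in turn. A standalone sketch of that windowing; the date_windows helper is a made-up name, not the project's API:

import datetime

def date_windows(from_date, to_date, days=1, hours=0, mins=0):
    # Yield consecutive (start, end) windows covering [from_date, to_date).
    step = datetime.timedelta(days=days, hours=hours, minutes=mins)
    start = from_date
    while start < to_date:
        end = min(start + step, to_date)
        yield start, end
        start = end

for start, end in date_windows(datetime.datetime(2011, 1, 23),
                               datetime.datetime(2011, 1, 25)):
    print(start, "->", end)  # two one-day windows
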
Example #3
import unittest
from crawlers.CrawlerFactory import CrawlerFactory

class CrawlerFactoryTest(unittest.TestCase):  # scaffolding assumed
    def test_construction_of_twitter_crawlers(self):
        factory = CrawlerFactory()
        t = factory.get_crawler("twitter")
        t.login()
        info = t.getUserInfoByScreenName("GeorgeEracleous")
        self.assertIsNotNone(info)  # assertion added; original stopped here
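
The test above hits the live Twitter API through t.login(). A sketch of the same check run offline by stubbing the network calls; it uses unittest.mock from Python 3 (on Python 2 the mock backport provides the same API), and the stubbed return value is an assumption:

import unittest
from unittest import mock
from crawlers.CrawlerFactory import CrawlerFactory

class CrawlerFactoryOfflineTest(unittest.TestCase):
    def test_twitter_crawler_without_network(self):
        factory = CrawlerFactory()
        t = factory.get_crawler("twitter")
        # Stub the calls that would otherwise hit Twitter.
        with mock.patch.object(t, "login"), \
             mock.patch.object(t, "getUserInfoByScreenName",
                               return_value={"screen_name": "GeorgeEracleous"}):
            t.login()
            info = t.getUserInfoByScreenName("GeorgeEracleous")
        self.assertEqual(info["screen_name"], "GeorgeEracleous")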
Example #5
'''
Created on 22 Jan 2012

@author: george
'''
import datetime
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import *
from database.model.agents import *
from mongoengine import *
import tools.utils
from urlparse import urlparse
from database.warehouse import WarehouseServer

f = CrawlerFactory()
twitter = f.get_crawler("twitter")
#twitter.login()
ws = WarehouseServer()

from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=100)
screen_names = []
for tweet in items:
    screen_names.append(tweet.author_screen_name)
screen_names = set(screen_names)
print(len(screen_names))
# A terrible hack: save the screen_names of users who are mentioned in tweets
# but are not yet in the database. They'll be processed after all the authors
# have been stored.
mentions_of_not_stored_users = []
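
The script ends just as it sets up mentions_of_not_stored_users, so the loop that fills it is not shown on this page. A standalone sketch of one way to pull @mentions out of tweet text; the regex, the sample data, and the field layout are all assumptions, not the project's code:

import re

MENTION_RE = re.compile(r"@(\w{1,15})")  # Twitter screen names are at most 15 chars

# Stand-ins for the fetched tweets and the already-stored authors above.
tweet_texts = ["@alice meet @bob at #uk", "@carol says hi"]
screen_names = set(["alice"])

mentions_of_not_stored_users = []
for text in tweet_texts:
    for mention in MENTION_RE.findall(text):
        if mention not in screen_names:
            mentions_of_not_stored_users.append(mention)

print(mentions_of_not_stored_users)  # ['bob', 'carol']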
Example #6
'''
Created on 22 Jan 2012

@author: george
'''
from database.model.agents import TrainingAuthor
from crawlers.CrawlerFactory import CrawlerFactory

f = CrawlerFactory()
crawler = f.get_crawler("scrapy")

crawler.setup(user_type=TrainingAuthor)
crawler.crawl(store=True)
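
The setup(user_type=TrainingAuthor) call suggests the scrapy-based crawler stores what it finds as TrainingAuthor documents via mongoengine. A hypothetical follow-up to inspect the stored results; the database name and the screen_name field are guesses, and a mongoengine connection is assumed:

from mongoengine import connect
from database.model.agents import TrainingAuthor

connect("crawler_db")  # database name assumed
print(TrainingAuthor.objects.count())
for author in TrainingAuthor.objects[:5]:
    print(author.screen_name)  # field name assumed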