Example #1
    def test_logging(self):
        logger = get_logger(LoggingSection.TEST, __name__)

        logger.debug("Logging some debug info.")
        logger.info("Logging some info.")
        logger.warning("Logging a warning.")
        logger.critical("Logging a critical error.")

        print 'Logging tests passed'
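
Every example on this page funnels through the same helper. Its source is not among the excerpts, but judging from the call sites, get_logger appears to be a thin wrapper over the stdlib logging module that namespaces loggers by subsystem. A minimal sketch of logutil.py, with names and structure assumed rather than taken from the project:

# logutil.py -- hypothetical sketch, not the project's actual implementation
import logging

class LoggingSection(object):
    """Enum-like constants for the subsystems that request loggers."""
    TEST, CLIENT, CRAWLER, DATA, FRONTIER = range(5)

_SECTION_NAMES = ['TEST', 'CLIENT', 'CRAWLER', 'DATA', 'FRONTIER']

def get_logger(section, name):
    # namespace the logger by section so levels and handlers can be
    # configured per subsystem rather than per module
    return logging.getLogger('%s.%s' % (_SECTION_NAMES[section], name))
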
Example #2
# third-party imports
from django.shortcuts import render
from springpython.context import ApplicationContext
from django.http import HttpResponseRedirect

# project imports
from HTResearch.DataModel.enums import AccountType
from HTResearch.Utilities.context import DAOContext
from HTResearch.Utilities.logutil import LoggingSection, get_logger
from HTResearch.WebClient.WebClient.views.shared_views import not_found
from HTResearch.WebClient.WebClient.views.shared_views import unauthorized
from HTResearch.WebClient.WebClient.models import EditContactForm

#region Globals
logger = get_logger(LoggingSection.CLIENT, __name__)
ctx = ApplicationContext(DAOContext())
#endregion


def contact_profile(request, id):
    """
    Sends a request to the Contact Profile page and retrieves Contact information for the profile.

    Arguments:
        id (string): The id of the contact.

    Returns:
        A rendered page of the Contact Profile.
    """
    user_id = request.session.get('user_id')
    account_type = request.session.get('account_type')
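
The excerpt ends before the lookup itself. A plausible continuation, assuming the ContactDAO is pulled from the ApplicationContext the way Example #6 pulls the URLFrontier, and that it exposes a find-style method (the method and template names below are guesses):

    contact_dao = ctx.get_object('ContactDAO')  # registration name assumed
    try:
        contact = contact_dao.find(id=id)       # hypothetical lookup method
    except Exception:
        logger.error('Error occurred while looking up contact with id=%s' % id)
        return not_found(request)

    if contact is None:
        return not_found(request)

    return render(request, 'contact_profile.html', {'contact': contact})  # template name assumed
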
Example #3
from datetime import datetime
from mongoengine import Q
from mongoengine.fields import StringField
import re

# project imports
from HTResearch.DataAccess.dto import *
from HTResearch.DataAccess.connection import DBConnection
from HTResearch.DataModel.enums import OrgTypesEnum
from HTResearch.Utilities.geocoder import geocode
from HTResearch.Utilities.url_tools import UrlUtility
from HTResearch.Utilities.logutil import LoggingSection, get_logger
from HTResearch.Utilities import decorators

#region Globals
logger = get_logger(LoggingSection.DATA, __name__)
#endregion


class DAO(object):
    """
    A generic DAO class that may be subclassed by DAOs for operations on
    specific documents.

    Attributes:
        conn (DBConnection): The database connection class to use.
    """

    def __init__(self):
        self.conn = DBConnection
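
The excerpt stops at the constructor. Because conn holds the DBConnection class itself rather than an instance, subclass methods presumably instantiate it per operation as a context manager (Example #8 shows DBConnection defining __enter__). A hypothetical save method under that assumption:

    def create_update(self, dto):
        # open a fresh connection for the duration of the operation,
        # then persist the document through mongoengine's save()
        with self.conn():
            dto.save()
        return dto
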
Example #4
from scrapy.exceptions import DropItem

from HTResearch.DataAccess.dao import *
from HTResearch.Utilities.converter import *
from HTResearch.URLFrontier.urlfrontier import URLFrontier
from HTResearch.Utilities.logutil import LoggingSection, get_logger

logger = get_logger(LoggingSection.CRAWLER, __name__)


class ItemSwitch(object):
    """Redirect Items to Appropriate Pipeline Handler"""
    def __init__(self):
        self.frontier = URLFrontier()
        self.contact_dao = ContactDAO()
        self.org_dao = OrganizationDAO()
        self.pub_dao = PublicationDAO()
        self.url_dao = URLMetadataDAO()

    def process_item(self, item, spider):
        """Consumes item from spider and passes to correct handler asynchronously"""
        item_class = item.__class__.__name__

        # switch to handle item based on class type
        if item_class == "ScrapedUrl":
            # Create DAO for URL with empty fields
            # Pass it to URLFrontier, which will add it iff it is new
            self._store_url(item)
        elif item_class == "ScrapedContact":
            self._store_contact(item)
        elif item_class == "ScrapedOrganization":
Example #5

# stdlib imports
import hashlib
from multiprocessing import Queue, Process, Condition, RLock
from Queue import Empty, Full

# project imports
from HTResearch.DataAccess.dto import URLMetadataDTO
from HTResearch.DataModel.model import URLMetadata
from HTResearch.Utilities.converter import DTOConverter
from HTResearch.Utilities.types import Singleton
from HTResearch.Utilities.logutil import LoggingSection, get_logger

#region Globals
logger = get_logger(LoggingSection.FRONTIER, __name__)
#endregion


class CacheJobs(object):
    """An enum-like class for the different cache-related jobs."""
    Fill, Empty = range(2)


class URLFrontierRules(object):
    """A class for the different information associated with different caching rules."""

    def __init__(self, required_domains=None, blocked_domains=None, sort_list=None):
        # None defaults instead of mutable defaults, which are shared across calls
        self._required_domains = required_domains if required_domains is not None else []
        self._blocked_domains = blocked_domains if blocked_domains is not None else []
        self._sort_list = sort_list if sort_list is not None else ["last_visited"]
        self._checksum = self._generate_checksum()
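
_generate_checksum is not shown; given the hashlib import at the top of the module, it presumably digests the rule fields so that two URLFrontierRules instances with the same settings map to the same cache. A minimal sketch under that assumption:

    def _generate_checksum(self):
        # hash the rule fields so equal configurations compare equal
        m = hashlib.md5()
        for value in self._required_domains + self._blocked_domains + self._sort_list:
            m.update(value)
        return m.hexdigest()
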
Example #6
 #             _,----'  _______________________  `----._
 #          ,-'  __,---'  ___________________  `---.__  `-.
 #       ,-'  ,-'  __,---'  _______________  `---.__  `-.  `-.
 #     ,'  ,-'  ,-'  __,---'                `---.__  `-.  `-.  `.
 #    /  ,'  ,-'  ,-'                               `-.  `-.  `.  \
 #   / ,'  ,' ,--'     "'There exist limitless          `--. `.  `. \
 #  | /  ,' ,'    opportunities in every industry.         `. `.  \ |
 # ,--. ,--.         Where there is an open mind,             _______
 #( `  "   ')     there will always be a frontier.'          (_______)
 # >-  .  -<              -Charles Kettering"                /       \
 #( ,      .)                                               ( G O L D )
 # `--'^`--'                -Paul Poulsen                    \_______/
 #    /_\


    logger = get_logger(LoggingSection.CRAWLER, 'app.py')
    logger.info("Starting a web crawl")
    ctx = ApplicationContext(URLFrontierContext())
    frontier = ctx.get_object("URLFrontier")
    frontier.start_cache_process()

    spider = OrgSpider()
    query = 'human trafficking'
    #spider.start_urls = ['http://scholar.google.com/scholar?q=' + query + '&hl=en']
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
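
The imports sit above the banner and are outside the excerpt. For reference, the names used here would plausibly come from the pre-1.0 Scrapy API and the project's own context module (the URLFrontierContext location is a guess, and OrgSpider's module path is not recoverable from the excerpt):

from scrapy import log
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
from springpython.context import ApplicationContext
from twisted.internet import reactor

from HTResearch.Utilities.context import URLFrontierContext  # assumed to live beside DAOContext
from HTResearch.Utilities.logutil import LoggingSection, get_logger
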
Example #7
from sgmllib import SGMLParseError
from datetime import datetime

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from ..items import ScrapedUrl

from HTResearch.DataModel.model import PageRankInfo, PageRankVector, UrlCountPair
from HTResearch.Utilities.url_tools import UrlUtility
from HTResearch.Utilities.logutil import get_logger, LoggingSection

_linkscraper_logger = get_logger(LoggingSection.CRAWLER, __name__)


class LinkScraper:
    """A scraper to find all URLs in a page """
    def __init__(self):
        self._link_extractor = SgmlLinkExtractor()

    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links"""

        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError as e:
            # Page was poorly formatted, oh well
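
The excerpt ends inside the error handler. A plausible completion logs the failure, then wraps each extracted link in a ScrapedUrl item; the item's field names and UrlUtility's method name are assumptions:

        except SGMLParseError as e:
            # poorly formatted page: log it and return no links
            _linkscraper_logger.warning('Could not parse %s: %s' % (response.url, e))
            return []

        # convert the extracted links into ScrapedUrl items
        items = []
        for link in links:
            url_item = ScrapedUrl()
            url_item['url'] = link.url                            # field name assumed
            url_item['domain'] = UrlUtility.get_domain(link.url)  # method name assumed
            items.append(url_item)
        return items
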
Example #8
#
# connection.py
# A module for wrapping MongoDB connection details.
#

# stdlib imports
from mongoengine.connection import connect, disconnect, get_connection

# project imports
from HTResearch.Utilities.config import get_config_value
from HTResearch.Utilities.logutil import LoggingSection, get_logger

#region Globals
logger = get_logger(LoggingSection.DATA, __name__)
#endregion


class DBConnection(object):
    """A class that encapsulates the MongoDB connection."""
    def __init__(self):
        try:
            host = get_config_value("MONGO", "host")
            port = int(get_config_value("MONGO", "port"))
            name = get_config_value("MONGO", "name")
            disconnect()
            connect(db=name, host=host, port=port)
            self.conn = get_connection()
        except Exception as e:
            logger.error('Connection to MongoDB could not be established: %s' % e)

    def __enter__(self):
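
The excerpt cuts off at __enter__. Given that DAO (Example #3) stores this class and presumably opens it per operation with a with-block, the context-manager protocol would plausibly round out as follows, reusing the disconnect already imported above:

    def __enter__(self):
        # hand the connection wrapper to the with-block
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # assumed cleanup: tear down the mongoengine connection
        disconnect()
        return False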