def test_logging(self):
    logger = get_logger(LoggingSection.TEST, __name__)
    logger.debug("Logging some debug info.")
    logger.info("Logging some info.")
    logger.warning("Logging a warning.")
    logger.critical("Logging a critical error.")
    print 'Logging tests passed'
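# A minimal sketch (an assumption, not the project's logutil source) of the
# get_logger/LoggingSection API the test above exercises: LoggingSection tags
# the subsystem a logger belongs to, and get_logger hands back a stdlib logger
# namespaced by section and module.
#
#     import logging
#
#     class LoggingSection:
#         TEST, CLIENT, CRAWLER, DATA, FRONTIER = range(5)
#
#     def get_logger(section, name):
#         return logging.getLogger('%s.%s' % (section, name))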
# third-party imports
from django.http import HttpResponseRedirect
from django.shortcuts import render
from springpython.context import ApplicationContext

# project imports
from HTResearch.DataModel.enums import AccountType
from HTResearch.Utilities.context import DAOContext
from HTResearch.Utilities.logutil import LoggingSection, get_logger
from HTResearch.WebClient.WebClient.views.shared_views import not_found, unauthorized
from HTResearch.WebClient.WebClient.models import EditContactForm

#region Globals
logger = get_logger(LoggingSection.CLIENT, __name__)
ctx = ApplicationContext(DAOContext())
#endregion


def contact_profile(request, id):
    """
    Renders the Contact Profile page, retrieving the Contact's information for display.

    Arguments:
        id (string): The id of the contact.

    Returns:
        A rendered page of the Contact Profile.
    """
    user_id = request.session.get('user_id')
    account_type = request.session.get('account_type')
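# The view excerpt above truncates after the session lookups. A hedged sketch
# (assumed, not project source) of how its imports are typically put to use:
# fetch the contact through the Spring-configured DAO context and fall back to
# the shared error views. The DAO object name, find() signature, and template
# path are illustrative assumptions.
#
#     contact_dao = ctx.get_object('ContactDAO')
#     try:
#         contact = contact_dao.find(id=id)
#     except:
#         logger.error('Error occurred during contact lookup with id=%s' % id)
#         return not_found(request)
#     return render(request, 'contact/contact_profile.html', {'contact': contact})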
from datetime import datetime
from mongoengine import Q
from mongoengine.fields import StringField
import re

# project imports
from HTResearch.DataAccess.dto import *
from HTResearch.DataAccess.connection import DBConnection
from HTResearch.DataModel.enums import OrgTypesEnum
from HTResearch.Utilities.geocoder import geocode
from HTResearch.Utilities.url_tools import UrlUtility
from HTResearch.Utilities.logutil import LoggingSection, get_logger
from HTResearch.Utilities import decorators

#region Globals
logger = get_logger(LoggingSection.DATA, __name__)
#endregion


class DAO(object):
    """
    A generic DAO class that may be subclassed by DAOs for operations on specific documents.

    Attributes:
        conn (DBConnection): The database connection class to use.
    """

    def __init__(self):
        # Store the DBConnection class itself (not an instance) so that each
        # operation can open a fresh context-managed connection.
        self.conn = DBConnection
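# ContactDAO is one of the concrete DAOs instantiated elsewhere in the project
# (see the crawler pipeline below). This sketch of what one of its query
# methods might look like is an assumption, not project source; it only
# illustrates the pattern enabled by the base class storing the DBConnection
# class: "with self.conn():" opens a scoped connection per call.
#
#     class ContactDAO(DAO):
#         """A DAO for operations on Contact documents."""
#
#         def find(self, **constraints):
#             with self.conn():
#                 return ContactDTO.objects(**constraints).first()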
from scrapy.exceptions import DropItem

from HTResearch.DataAccess.dao import *
from HTResearch.Utilities.converter import *
from HTResearch.URLFrontier.urlfrontier import URLFrontier
from HTResearch.Utilities.logutil import LoggingSection, get_logger

logger = get_logger(LoggingSection.CRAWLER, __name__)


class ItemSwitch(object):
    """Redirect Items to Appropriate Pipeline Handler"""

    def __init__(self):
        self.frontier = URLFrontier()
        self.contact_dao = ContactDAO()
        self.org_dao = OrganizationDAO()
        self.pub_dao = PublicationDAO()
        self.url_dao = URLMetadataDAO()

    def process_item(self, item, spider):
        """Consumes item from spider and passes it to the correct handler asynchronously"""
        item_class = item.__class__.__name__

        # switch to handle item based on class type
        if item_class == "ScrapedUrl":
            # Create a DTO for the URL with empty fields
            # Pass it to URLFrontier, which will add it iff it is new
            self._store_url(item)
        elif item_class == "ScrapedContact":
            self._store_contact(item)
        elif item_class == "ScrapedOrganization":
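# The pipeline excerpt above truncates inside process_item. A hedged sketch
# (assumed, not project source) of the _store_url handler it dispatches to:
# wrap the scraped URL in the URLMetadata model, convert it to a DTO, and hand
# it to the frontier. The frontier method name (put_url) is an assumption.
#
#     def _store_url(self, item):
#         url = URLMetadata(url=item['url'])
#         url_dto = DTOConverter.to_dto(URLMetadataDTO, url)
#         self.frontier.put_url(url_dto)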
#
# stdlib imports
import hashlib
from multiprocessing import Queue, Process, Condition, RLock
from Queue import Empty, Full

# project imports
from HTResearch.DataAccess.dto import URLMetadataDTO
from HTResearch.DataModel.model import URLMetadata
from HTResearch.Utilities.converter import DTOConverter
from HTResearch.Utilities.types import Singleton
from HTResearch.Utilities.logutil import LoggingSection, get_logger

#region Globals
logger = get_logger(LoggingSection.FRONTIER, __name__)
#endregion


class CacheJobs():
    """An enum-like class for the different cache-related jobs."""
    Fill, Empty = range(2)


class URLFrontierRules:
    """A class for the different information associated with different caching rules."""

    def __init__(self, required_domains=[], blocked_domains=[], sort_list=["last_visited"]):
        self._required_domains = required_domains
        self._blocked_domains = blocked_domains
        self._sort_list = sort_list
        self._checksum = self._generate_checksum()
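    # A plausible sketch of _generate_checksum (an assumption, not project
    # source), suggested by the hashlib import above: fingerprint the rule
    # lists so equivalent rule sets can be recognized cheaply. The exact
    # fields and hash function are assumptions.
    def _generate_checksum(self):
        key = ''.join(sorted(self._required_domains) +
                      sorted(self._blocked_domains) +
                      self._sort_list)
        return hashlib.md5(key).hexdigest()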
#
# app.py
# Entry point for kicking off a web crawl.
#
# "'There exist limitless opportunities in every industry. Where there is an
#   open mind, there will always be a frontier.' -Charles Kettering"
#                                                -Paul Poulsen, team ( G O L D )
#

# stdlib / third-party imports
from scrapy import log
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
from springpython.context import ApplicationContext
from twisted.internet import reactor

# project imports (the URLFrontierContext and OrgSpider module paths are assumed)
from HTResearch.Utilities.context import URLFrontierContext
from HTResearch.Utilities.logutil import LoggingSection, get_logger
from HTResearch.WebCrawler.WebCrawler.spiders.org_spider import OrgSpider

logger = get_logger(LoggingSection.CRAWLER, 'app.py')
logger.info("Starting a web crawl")

ctx = ApplicationContext(URLFrontierContext())
frontier = ctx.get_object("URLFrontier")
frontier.start_cache_process()

spider = OrgSpider()
query = 'human trafficking'
#spider.start_urls = ['http://scholar.google.com/scholar?q=' + query + '&hl=en']

settings = get_project_settings()
crawler = Crawler(settings)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
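# Design note: Crawler(settings) + configure()/crawl()/start(), followed by
# scrapy's log.start() and running twisted's reactor directly, is the Scrapy
# 0.x recipe for embedding a crawl in a standalone script; later Scrapy
# releases replaced this API with CrawlerProcess/CrawlerRunner.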
from sgmllib import SGMLParseError
from datetime import datetime

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from ..items import ScrapedUrl
from HTResearch.DataModel.model import PageRankInfo, PageRankVector, UrlCountPair
from HTResearch.Utilities.url_tools import UrlUtility
from HTResearch.Utilities.logutil import get_logger, LoggingSection

_linkscraper_logger = get_logger(LoggingSection.CRAWLER, __name__)


class LinkScraper:
    """A scraper to find all URLs in a page."""

    def __init__(self):
        self._link_extractor = SgmlLinkExtractor()

    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links."""
        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy's SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError as e:
            # Page was poorly formatted, oh well
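# The parse() excerpt above truncates inside the except block. A hedged sketch
# (assumed, not project source) of the typical continuation, based on the
# imports at the top of the file: log the failure and bail out, then wrap each
# extracted link in a ScrapedUrl item. The item fields (url, last_visited) and
# the sentinel date are assumptions.
#
#         except SGMLParseError as e:
#             # Page was poorly formatted, oh well
#             _linkscraper_logger.error('Parse error on %s: %s' % (response.url, e))
#             return []
#
#         items = []
#         for link in links:
#             items.append(ScrapedUrl(url=link.url, last_visited=datetime(1, 1, 1)))
#         return items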
#
# connection.py
# A module for wrapping MongoDB connection details.
#

# third-party imports
from mongoengine.connection import connect, disconnect, get_connection

# project imports
from HTResearch.Utilities.config import get_config_value
from HTResearch.Utilities.logutil import LoggingSection, get_logger

#region Globals
logger = get_logger(LoggingSection.DATA, __name__)
#endregion


class DBConnection(object):
    """A class that encapsulates the MongoDB connection."""

    def __init__(self):
        try:
            host = get_config_value("MONGO", "host")
            port = int(get_config_value("MONGO", "port"))
            name = get_config_value("MONGO", "name")
            disconnect()
            connect(db=name, host=host, port=port)
            self.conn = get_connection()
        except:
            logger.error('Connection to MongoDB could not be established.')

    def __enter__(self):
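# The excerpt truncates inside __enter__. Assuming the usual context-manager
# protocol (__enter__ returning the live connection, __exit__ cleaning it up),
# callers such as the DAO base class can scope a connection to a block. This
# usage sketch is an assumption, not project source:
#
#     with DBConnection() as conn:
#         pass  # run queries while the scoped connection is open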