import Queue
import urlparse

from django.db.transaction import commit_on_success
import requests

import utilities.tselogging as logging

logger = logging.getLogger('tse.u.util')


# Decorators
def log_args_and_ret_values(func):
    """Log a function's arguments and return value at debug level."""
    def inner(*args, **kwargs):
        ret = func(*args, **kwargs)
        logger.debug(
            "ARGS_LOGGER: {f_name}(*{args}, **{kwargs}) ====> {ret}".format(
                f_name=func.__name__,
                args=args,
                kwargs=kwargs,
                ret=ret))
        return ret
    return inner


def constant(f):
    '''Simple read-only decorator.'''
    def fset(self, value):
        # Constants cannot be reassigned.
        raise TypeError("constant attribute is read-only")

    def fget(self):
        return f(self)

    return property(fget, fset)
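# Illustrative usage of the two decorators above (a sketch; `add` and
# `_Example` are hypothetical names, not part of this module):
#
#   @log_args_and_ret_values
#   def add(a, b):
#       return a + b
#
#   add(1, 2)
#   # logs: ARGS_LOGGER: add(*(1, 2), **{}) ====> 3
#
#   class _Example(object):
#       @constant
#       def MAX_RETRIES(self):
#           return 3
#
#   _Example().MAX_RETRIES        # -> 3
#   _Example().MAX_RETRIES = 5    # raises TypeError: read-only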
from collections import defaultdict

from searchEngine.models import WordFromIndexedPage, IndexedPage
from utilities.util import bulk_save
import utilities.tselogging as logging

logger = logging.getLogger("tse.se.example")

# Example execution.
# Say "google" is an already-saved WordFromIndexedPage instance, and these
# are the words we get back from nltk's clean_html():
list_of_words = ["new", "google", "googl3", "google4"]

# Database queries (save(), get(), filter(), etc.) do disk I/O and are
# therefore slower than memory access, so we cache the objects and
# batch-save them all at the end :) with a function like bulk_save:
cache = {}  # word <str> -> WordFromIndexedPage
cacheWordLocation = defaultdict(list)  # word <str> -> list of offsets

# It's OK, we do this just once(-ish).
url = IndexedPage.objects.get(pk="http://www.google.com")

for word in list_of_words:
    if word not in cache:
        cache[word] = WordFromIndexedPage(indexedPage=url, word=word)
    cacheWordLocation[word].append(32)

for key in cache.keys():
    cache[key].set_offsets(cacheWordLocation[key])

# Batch-save everything in one go, as planned above.
bulk_save(cache.values())
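# For reference, a minimal sketch of what bulk_save could look like (the real
# implementation lives in utilities.util and is not shown in this excerpt;
# wrapping the saves in a single transaction batches the disk I/O):
#
#   from django.db.transaction import commit_on_success
#
#   @commit_on_success
#   def bulk_save(objects):
#       for obj in objects:
#           obj.save()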
import threading

from indexer import Indexer
from searchEngine.models import IndexedPage, WordFromIndexedPage
from utilities.util import TQueue, RunOnMainThread, profile_main
import utilities.tselogging as logging

logger = logging.getLogger("tse.se.crawler")


class Crawler(threading.Thread):
    """Class representing a single instance of a crawler."""

    POP_TIMEOUT_IN_SECONDS = 1000

    def __init__(
            self,
            max_links_to_crawl=100,
            max_active_indexers=5,
            links_queue=None,             # passed to indexers to populate
            main_thread_cmd_queue=None):  # commands for indexers to run on the main thread
        threading.Thread.__init__(self)
        self.max_links_to_crawl = max(max_links_to_crawl, 1)
        self.max_active_indexers = max(max_active_indexers, 1)
        # Build the default queues per instance: a mutable default argument
        # such as links_queue=TQueue(["google.com"]) would be evaluated once
        # at definition time and shared by every Crawler.
        self.links_queue = (links_queue if links_queue is not None
                            else TQueue(["google.com"]))
        self.main_thread_cmd_queue = (main_thread_cmd_queue
                                      if main_thread_cmd_queue is not None
                                      else TQueue())
        self.finished_indexers_list = []
        self.number_of_non_trivial_indexes = 0
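# Illustrative driver for this class (a sketch, not part of the excerpt; it
# assumes TQueue exposes a Queue.Queue-like blocking get() that raises
# Queue.Empty on timeout, and that RunOnMainThread commands have a run()
# method, neither of which is shown here):
#
#   import Queue
#
#   crawler = Crawler(max_links_to_crawl=50, max_active_indexers=3)
#   crawler.start()
#   while crawler.is_alive():
#       try:
#           cmd = crawler.main_thread_cmd_queue.get(
#               timeout=Crawler.POP_TIMEOUT_IN_SECONDS)
#           cmd.run()  # e.g. a database write marshalled onto the main thread
#       except Queue.Empty:
#           pass
#   crawler.join()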
from collections import defaultdict
from urlparse import urljoin
import re
import threading

from django.db import transaction
import lxml.html
import nltk
import requests

from searchEngine.models import WordFromIndexedPage, IndexedPage
from utilities.util import bulk_save, final_url_after_redirects, profile_main, \
    constant, RunOnMainThread
import utilities.tselogging as logging

logger = logging.getLogger("tse.se.indexer")


class _CONST(object):
    @constant
    def INDEXER_URL_CONNECTION_TIMEOUT(self):
        return 5000

CONST = _CONST()


# @transaction.atomic
def page_already_indexed(indexed_page, text_content_hash):
    """Return True if a page with the same text content hash is already indexed."""
    original = IndexedPage.objects.filter(text_content_hash=text_content_hash)
    if original.exists():
        # A page with identical text content has already been indexed.
        return True
    return False
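# Example use of the timeout constant (a sketch; note that requests interprets
# `timeout` in seconds, so a value of 5000 is effectively "very patient"):
#
#   response = requests.get("http://www.google.com",
#                           timeout=CONST.INDEXER_URL_CONNECTION_TIMEOUT)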