Code example #1
import functools
import Queue
import urlparse

from django.db.transaction import commit_on_success
import requests

import utilities.tselogging as logging


logger = logging.getLogger('tse.u.util')

# Decorators


def log_args_and_ret_values(func):
    @functools.wraps(func)  # keep the wrapped function's name for the log line
    def inner(*args, **kwargs):
        ret = func(*args, **kwargs)
        logger.debug(
            "ARGS_LOGGER: {f_name}{args} ====> {ret}".format(
                f_name=func.__name__,
                args=(args, kwargs),
                ret=ret))
        return ret
    return inner
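
# A minimal usage sketch (the `add` function is hypothetical, not part of
# the original source): each call is logged with its args and result.
@log_args_and_ret_values
def add(a, b):
    return a + b

add(1, b=2)  # logs: ARGS_LOGGER: add((1,), {'b': 2}) ====> 3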


def constant(f):
    '''Simple read-only decorator'''

    def fset(self, value):
        # Writes are refused: the attribute is read-only.
        raise TypeError("cannot overwrite a constant")

    def fget(self):
        return f()

    # (The snippet was cut off after fset; fget/property is the standard
    # completion of this read-only pattern, matching its use in indexer.py.)
    return property(fget, fset)
Code example #2
File: example.py  Project: JohnMofor/toySearchEngine
from searchEngine.models import WordFromIndexedPage, IndexedPage
from collections import defaultdict
import utilities.tselogging as logging
from utilities.util import bulk_save

logger = logging.getLogger("tse.se.example")

# Example execution.
# Say "google" is an already-saved WordFromIndexedPage instance, and we
# got these words from nltk's clean_html().
list_of_words = ["new", "google", "googl3", "google4"]

# Database queries (e.g. save(), get(), filter()) do disk I/O and are
# therefore much slower than memory access, so we cache the objects and
# batch-save them all at the end with a helper like bulk_save():

cache = {}  # maps word (str) -> its WordFromIndexedPage instance
cacheWordLocation = defaultdict(list)

# it's OK to hit the DB here; we only do this once (per page)
url = IndexedPage.objects.get(pk="http://www.google.com")
for word in list_of_words:
    if word not in cache:
        cache[word] = WordFromIndexedPage(indexedPage=url, word=word)

    cacheWordLocation[word].append(32)  # 32 = example offset of the word in the page

for key in cache.keys():
    cache[key].set_offsets(cacheWordLocation[key])

# Finally, persist everything in one batch (assuming bulk_save accepts an
# iterable of unsaved model instances, as the comments above suggest).
bulk_save(cache.values())
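
The utilities.util.bulk_save helper itself is not shown in these snippets. A minimal sketch of what such a batch-save helper could look like, assuming Django's bulk_create is available (the name bulk_save_sketch is hypothetical):

from collections import defaultdict

def bulk_save_sketch(objects):
    # Group unsaved instances by model class, then issue one INSERT per
    # class via Django's bulk_create instead of one save() per object.
    by_class = defaultdict(list)
    for obj in objects:
        by_class[obj.__class__].append(obj)
    for model_class, instances in by_class.items():
        model_class.objects.bulk_create(instances)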
Code example #3
import threading

from indexer import Indexer
from utilities.util import TQueue, RunOnMainThread, profile_main
import utilities.tselogging as logging
from searchEngine.models import IndexedPage, WordFromIndexedPage

logger = logging.getLogger("tse.se.crawler")


class Crawler(threading.Thread):
    """Class representing a single instance of a crawler."""

    POP_TIMEOUT_IN_SECONDS = 1000

    def __init__(
        self,
        max_links_to_crawl=100,
        max_active_indexers=5,
        links_queue=None,  # passed to indexers to populate.
        main_thread_cmd_queue=None  # commands for indexers to run on the main thread.
    ):
        threading.Thread.__init__(self)

        self.max_links_to_crawl = max(max_links_to_crawl, 1)
        self.max_active_indexers = max(max_active_indexers, 1)
        # Default arguments are evaluated once at definition time, so a
        # TQueue(...) default would be shared across every Crawler
        # instance; create fresh queues here instead.
        self.links_queue = (links_queue if links_queue is not None
                            else TQueue(["google.com"]))
        self.main_thread_cmd_queue = (main_thread_cmd_queue
                                      if main_thread_cmd_queue is not None
                                      else TQueue())

        self.finished_indexers_list = []
        self.number_of_non_trivial_indexes = 0
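
TQueue comes from utilities.util and is not shown in these snippets. A plausible minimal sketch, assuming it is simply a thread-safe queue that can be seeded from an iterable (the exact TQueue API is an assumption; the class name TQueueSketch is hypothetical):

import Queue

class TQueueSketch(Queue.Queue):
    """A thread-safe FIFO queue, optionally seeded from an iterable."""

    def __init__(self, initial_items=None):
        # Queue.Queue is an old-style class in Python 2, so no super().
        Queue.Queue.__init__(self)
        for item in (initial_items or []):
            self.put(item)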
Code example #4
File: indexer.py  Project: JohnMofor/toySearchEngine
from urlparse import urljoin
import re
import threading

from django.db import transaction
import lxml.html
import nltk
import requests

from searchEngine.models import WordFromIndexedPage, IndexedPage
from utilities.util import bulk_save, final_url_after_redirects, profile_main, \
    constant, RunOnMainThread
import utilities.tselogging as logging


logger = logging.getLogger("tse.se.indexer")


class _CONST(object):

    @constant
    def INDEXER_URL_CONNECTION_TIMEOUT(self):
        return 5000

CONST = _CONST()
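
# Usage of the read-only constant (the `constant` decorator from code
# example #1 exposes the method as a property): reads return the value,
# assignment raises.
timeout = CONST.INDEXER_URL_CONNECTION_TIMEOUT  # == 5000
try:
    CONST.INDEXER_URL_CONNECTION_TIMEOUT = 1
except TypeError:
    logger.debug("constants are read-only, as expected")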


#@transaction.atomic
def page_already_indexed(indexed_page, text_content_hash):
    original = IndexedPage.objects.filter(text_content_hash=text_content_hash)
    if original.exists():
        # Another page with the same text-content hash is already indexed.
        # (The snippet is truncated here; a boolean return is an assumed
        # minimal completion.)
        return True
    return False
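
How text_content_hash is computed is not shown in these snippets. A minimal sketch of one way to derive it from a page's extracted text, using hashlib (the helper name compute_text_content_hash is hypothetical):

import hashlib

def compute_text_content_hash(text_content):
    # Hash the page's visible text so duplicate content is detected even
    # when it appears under different URLs (encode first: hashing needs bytes).
    return hashlib.sha1(text_content.encode("utf-8")).hexdigest()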