Example #1
    def visit_node(self, data, model_template, models, title=None):
        """ Recursively traverse the children and create new Contents from
        paragraphs. """
        accepted_tags = Config.get_value(["model", "accepted_tags"])

        for child in data:
            if "children" in child:
                title_text = "{} - {}".format(title, child["text"]) \
                             if title else child["text"]
                self.visit_node(child["children"], model_template,
                                models, title=title_text)

            elif child["tag"] in accepted_tags:
                # Hit a leaf node in recursion tree. We extract the text here
                # and continue.
                keywords = [KeyWord(*kw)
                            for kw in get_keywords(self.__vectorizer,
                                                   self.__feature_names,
                                                   "{} {}"
                                                   .format(title,
                                                           child["text"]))]

                content = Content(title, child["text"],
                                  child["links"], keywords)
                new_model = copy.deepcopy(model_template)
                new_model["id"] = child["id"]
                new_model["content"] = content.get_content()
                models.append(new_model)

        return models
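
# The method above assumes `data` is a nested list of dicts, where each node
# carries "tag", "text", "id" and "links" keys and branch nodes additionally
# hold a "children" list. Below is a minimal, self-contained sketch of the
# same recursion pattern with the keyword extraction and Content steps left
# out; the sample tree and the collect_leaves name are illustrative, not from
# the original code.
import copy

sample_tree = [
    {"tag": "h1", "text": "Admissions", "id": 1, "links": [], "children": [
        {"tag": "p", "text": "Apply before June.", "id": 2, "links": []},
    ]},
]


def collect_leaves(data, template, models, title=None):
    """Flatten leaf paragraphs into model dicts, like visit_node does."""
    for child in data:
        if "children" in child:
            title_text = "{} - {}".format(title, child["text"]) \
                         if title else child["text"]
            collect_leaves(child["children"], template, models,
                           title=title_text)
        else:
            new_model = copy.deepcopy(template)
            new_model["id"] = child["id"]
            new_model["content"] = {"title": title, "text": child["text"]}
            models.append(new_model)
    return models


print(collect_leaves(sample_tree, {"id": None, "content": None}, []))
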
Example #2

import os

from sklearn.metrics.pairwise import cosine_similarity

from spellchecker import SpellChecker

from nltk.corpus import wordnet as wn

from chatbot.model.model_factory import ModelFactory
from chatbot.nlp.keyword import get_tfidf_model, get_stopwords, lemmatize, nb
from chatbot.nlp.synset import SynsetWrapper
from chatbot.util.config_util import Config
from chatbot.util.logger_util import set_logger

if str(os.getenv("LOG")) == "TRUE":
    set_logger()

NOT_FOUND = Config.get_value(['query_system', 'not_found'])
MULTIPLE_ANSWERS = Config.get_value(['query_system', 'multiple_answers'])
CHAR_LIMIT = Config.get_value(['query_system', 'character_limit'])
MAX_ANSWERS = Config.get_value(['query_system', 'max_answers'])
URL_FROM_TEXT = Config.get_value(['query_system', 'url_from_text'])

factory = ModelFactory.get_instance()
factory.set_db()


def _handle_not_found(query_text):
    '''
    Inserts this specific query text into the unknown queries collection as
    well as returning a fallback string.
    '''
    try:
Example #3
import copy
import json

from chatbot.util.config_util import Config
from chatbot.nlp.keyword import lemmatize

SYNSET_FILE = Config.get_value(['query_system', 'custom_synset_file'])


class SynsetWrapper():
    ''' Wrapper for a custom synset list. Interfaces with a text file where
    each line consists of comma-separated synonyms. '''
    __instance = None

    @staticmethod
    def get_instance():
        ''' Static access method '''
        if SynsetWrapper.__instance is None:
            SynsetWrapper()
        return SynsetWrapper.__instance

    def __init__(self):
        ''' Virtually private constructor '''
        if SynsetWrapper.__instance is not None:
            raise Exception('This class is a singleton!')
        else:
            self.__read_synset_file()
            SynsetWrapper.__instance = self

    def get_synset(self, token):
        ''' Return a synset for a given token '''
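
# The body of get_synset is cut off in this listing. Below is a minimal,
# self-contained sketch of the behaviour the class docstring describes
# (one comma-separated synonym group per line in a text file); the file name
# and the lookup logic are assumptions, not the original implementation.
def load_synset_groups(path='synsets.txt'):
    """Read one synonym group per line, split on commas."""
    with open(path, encoding='utf-8') as synset_file:
        return [[word.strip().lower() for word in line.split(',')]
                for line in synset_file if line.strip()]


def lookup_synset(groups, token):
    """Return the first synonym group containing the token, or None."""
    token = token.lower()
    for group in groups:
        if token in group:
            return group
    return None
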
Example #4

class InfoGatheringSpider(scrapy.Spider):
    # Name of the spider. This is the name to use from the Scrapy CLI.
    name = 'info_gathering'

    config = Config.get_value(['scraper'])

    # The following few lines contain command line flags.
    # All flags default to false, so they do not need to be set explicitly.
    # See the GitHub Wiki for information about how these are used.

    # Enable to display additional debugging information to output when the
    # crawler is run.  In practice, this will pretty print the exported tree
    # when a page is scraped.

    debug = 'debug' if config['debug'] else None

    # Elements which sometimes are used to indicate a header.
    alternative_headers = config['alternative_headers']

    # Root url for all web pages
    root_url = config['url']['root_url']

    # The links to start the crawling process on.
    start_urls = [root_url]

    # Paths on the site which are allowed. Only paths which match
    # these will ever be visited.
    allowed_paths = list(map(re.compile, config['url']['allowed_paths']))

    # Pages in this list will be visited and links on them will
    # be visited, however the data will not be scraped.
    scrape_blacklist = list(map(re.compile, config['blacklist']['scrape']))

    # These links will never be visited, even if the path is allowed above.
    visit_blacklist = list(map(re.compile, config['blacklist']['visit']))

    # These selectors will be removed from all pages, as they contain very
    # little actual information, and are equal on all pages.
    garbage_elements = set(config['blacklist']['elements'])

    # Elements containing text equal to one of these sentences will be
    # removed from all pages.

    garbage_text = set(config['blacklist']['texts'])

    # Elements containing an url in href that starts with the following
    # will be removed
    garbage_start_urls = set(config['blacklist']['garbage_start_urls'])

    # Elements containing an url in href that ends with the following
    # will be removed.
    garbage_resources = set(config['blacklist']['resources'])

    # The text used for the title on 404 pages. Used to detect silent 404
    # errors.
    not_found_text = config['blacklist']['not_found_text']

    # Hierarchy for sorting categories.
    # Elements with level=None will follow normal html hierarchy
    hierarchy = config['hierarchy']

    # If a tag is listed here, sequences of tags of one of these types will
    # all be merged into one tag. For example, directly following paragraph
    # tags will be merged into one big paragraph, separated by newlines. The
    # value for each key is the word limit below which a following tag is
    # merged in.
    concatenation_tags_word_limit = config['concatenation']

    # Of the elements in the hierarchy, these tags will not be created as nodes
    # if their parent is in the set of parents.
    ignored_child_tags = config['blacklist']['ignored_child_tags_for_parents']

    def normalize(self, text):
        return unicodedata.normalize('NFKC', text)

    def extract_metadata(self, root, soup, page_id):
        ''' Extract keywords metadata from the header of the page and add them
        as children of the tree root element. '''

        # Attempt finding the keywords meta tag on the page.
        keywords = soup.find('meta', attrs={'name': 'keywords'})

        if keywords and 'content' in keywords.attrs:
            # Add the keywords beneath the title in the tree, if the meta tag
            # has the content attribute correctly specified.
            TreeElement('meta',
                        page_id,
                        keywords.attrs['content'],
                        parent=root)

    def locate_parent(self, elem_tag, current_parent, root):
        ''' Locate the parent element on which we should insert the next
        node in the tree, based on our hierarchy of tags. '''

        # Data about this elements position in the hierarchy.
        elem_level = None
        if elem_tag in self.hierarchy:
            elem_level = self.hierarchy[elem_tag]

        # The parent which will be used for the next node in the tree.
        parent = None

        # Search for the appropriate parent element.
        search_parent = current_parent

        while True:
            # If we reach the root node, use it.
            if search_parent == root:
                parent = root
                break

            # We reached a tag of the same type, so use it.
            if search_parent.tag == elem_tag:
                parent = search_parent.parent
                break

            # Whether the search parent is in the hierarchy or not.
            search_parent_level = None
            if search_parent.tag in self.hierarchy:
                search_parent_level = self.hierarchy[search_parent.tag]

            if search_parent_level:
                # If both tags are in the hierarchy, check their level.
                if elem_level:
                    if elem_level > search_parent_level:
                        parent = search_parent
                        break

                    if elem_level == search_parent_level:
                        # If elements are in same level in hierarchy.
                        parent = search_parent.parent
                        break
                else:
                    # Element where hierarchy is not defined.
                    parent = search_parent
                    break

            # Update the current parent while searching.
            search_parent = search_parent.parent

        # Return the parent element candidate.
        return parent

    def generate_tree(self, response):
        ''' Creates a tree structure describing the given page. This structure
        is based on headers, creating a hierarchy based on text pieces which
        are positioned in between different types of headers. '''

        # Reset id to 0 when on a new page.
        TreeElement.counter = 0

        # Hash the page URL, it will be used as an ID.
        page_id = sha1(response.url.encode()).hexdigest()

        # Parse the HTML using BeautifulSoup. Make sure we use LXML for
        # parsing.
        soup = BeautifulSoup(response.text, 'lxml')

        # We only care about elements on the page which are defined in the
        # hierarchy.
        elements = soup.find_all(self.hierarchy.keys())

        # We remove the header and footer tags from the page to reduce
        # bloat, as these contain little useful information.
        for garbage_selector in self.garbage_elements:
            for garbage_element in soup.select(garbage_selector):
                garbage_element.decompose()

        # Locate the title element. It might be used for the tree root.
        title = self.normalize(soup.find('title').text.strip())

        # Do not continue with this page if we detect it as a silent 404.
        if self.not_found_text in title:
            return

        # Use the title as the tree root.
        root = TreeElement('title', page_id, title)

        # Attempt extracting the keywords and adding them to the tree.
        self.extract_metadata(root, soup, page_id)

        # Current position in the hierarchy.
        current_parent = root

        for elem in elements:
            # Replace BR tags with newlines.
            for br in elem.find_all('br'):
                br.replace_with('\n')

            # Remove leading and trailing spaces from the node contents.
            elem_text = self.normalize(elem.text.strip())

            # Find the type of this element.
            elem_tag = elem.name

            # Do not allow tree nodes with empty text.
            if not elem_text:
                continue

            # Prefix list item elements with a dash.
            if elem_tag == 'li':
                elem_text = '- ' + elem_text

            # Do not include elements with element text containing
            # blacklisted sentences.
            if any(sentence in elem_text for sentence in self.garbage_text):
                continue

            if self.alternative_headers:
                # If a paragraph contains for example a strong tag, we can
                # treat that combination as a header. This check avoids adding
                # the strong tag in addition to the custom header.
                if elem_tag in self.alternative_headers and \
                               current_parent.tag == 'h6' and \
                               self.normalize(current_parent.text) \
                               == elem_text:
                    continue

                if elem_tag == 'p':
                    # Find all alternative header tags inside this paragraph.
                    headers = elem.find_all(self.alternative_headers)

                    # Check if there is only 1 alternative header tag, and
                    # check if it contains all of the text inside the
                    # paragraph.
                    if len(headers) == 1 and elem_text \
                            == self.normalize(headers[0].text.strip()):
                        # Locate the parent in which a H6 tag would be
                        # inserted.
                        parent = self.locate_parent('h6', current_parent, root)

                        # Add a custom H6 element.
                        current_parent = TreeElement(
                            'h6',
                            page_id,
                            elem_text,
                            parent,
                        )
                        continue

            # Locate the parent element to use based on the hierarchy.
            parent = self.locate_parent(elem_tag, current_parent, root)

            # Concatenate tags like paragraph tags which directly follow each
            # other.
            if elem_tag in self.concatenation_tags_word_limit and \
                    parent.children:
                last_child = parent.children[-1]

                # Start a new paragraph if the last child already has children
                if last_child and last_child.tag == elem_tag and \
                        not last_child.children:
                    # Concatenate the texts until limit reached
                    if len(elem_text.split()) \
                            <= self.concatenation_tags_word_limit[elem_tag]:
                        last_child.text += '\n' + elem_text
                        continue

            # For anchor tags, don't create a new element; instead add the
            # href url to the parent's links.
            if elem_tag == 'a':
                # Create a valid url from the href url if any
                url = self.create_valid_url(elem.get('href'))

                # If the url from href is invalid, ignore anchor tag
                if url is None:
                    continue

                # If the URL differs from the element text
                if url != elem_text:
                    # Add the element text to parent instead of creating a
                    # new element
                    if elem_text in self.normalize(parent.text):
                        current_parent.links.append([elem_text, url])
                        continue

                    current_parent.links.append([url, url])

            elif elem_tag in self.ignored_child_tags \
                    and current_parent.tag \
                    in self.ignored_child_tags[elem_tag]:
                # This tag is ignored when nested under this type of parent,
                # so don't create a node for it.
                continue
            else:
                # Create the new element.
                current_parent = TreeElement(
                    elem_tag,
                    page_id,
                    elem_text,
                    parent,
                )

        return root

    # Returns a valid url based on blacklisting and type
    def create_valid_url(self, url):
        ''' Takes in an url from an anchor tag's href.
        Returns None if the url is None, blacklisted or invalid.
        Returns an absolute url otherwise. '''

        # If the url isn't defined
        if url is None:
            return None

        # Check if the url starts with a blacklisted prefix
        for start_url in self.garbage_start_urls:
            if url.startswith(start_url):
                return None

        # Check if the url is a blacklisted resource or file type
        for end_url in self.garbage_resources:
            if url.endswith(end_url):
                # This url is blacklisted, ignore this element
                return None

        # If the url is relative or a valid resource link
        if not bool(urlparse(url).netloc):
            # Concatenate the root and relative url
            url = urljoin(self.root_url, url)

        return url

    def pretty_print_tree(self, root):
        ''' Print a scraped tree for debugging. '''

        for pre, fill, node in RenderTree(root):
            # We replace newlines in the text with spaces to preserve the
            # shape of the tree when printing in the terminal.
            print('{}{}: {}'.format(pre, node.tag,
                                    node.text.replace('\n', ' ')))

        # Also add a new line before the next tree.
        print()

    def parse(self, response):
        ''' Parses pages which have been requested from the server. '''

        # Only store HTML responses, not other attachments.
        if isinstance(response, HtmlResponse):
            if not any(
                    re.match(regex, response.url)
                    for regex in self.scrape_blacklist):
                # Generate a tree structure describing this page.
                root = self.generate_tree(response)

                # The parser might choose to ignore this page, for example when
                # we detect that the page is a 404 page. In that case, skip the
                # page.
                if root:
                    # Pretty print the node tree if the DEBUG flag is set.
                    if self.debug:
                        self.pretty_print_tree(root)

                    # Export the tree using the DictExporter. Scrapy will then
                    # convert this dictionary to a JSON structure for us,
                    # automatically.
                    exporter = DictExporter()
                    tree = exporter.export(root)

                    yield {
                        # Export the page URL and the tree structure.
                        'url': response.url,
                        'tree': tree,
                    }

            # Follow all links from allowed domains.
            for next_page in LinkExtractor().extract_links(response):
                for allowed_path in self.allowed_paths:
                    # Only follow links that are in the list of allowed paths.
                    if re.match(allowed_path, next_page.url) and not \
                            any(re.match(regex, next_page.url)
                                for regex in self.visit_blacklist):
                        yield response.follow(next_page, self.parse)
                        break
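
# The spider above registers itself under the name 'info_gathering', so in a
# standard Scrapy project it can be started from the CLI, for example with
#   scrapy crawl info_gathering -o scraped_trees.json
# or programmatically as sketched below; the output file name and settings
# are assumptions, not part of the original project.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'FEEDS': {'scraped_trees.json': {'format': 'json'}},
})
process.crawl(InfoGatheringSpider)
process.start()
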
Example #5

import os

from sklearn.metrics.pairwise import cosine_similarity

from spellchecker import SpellChecker

from nltk.corpus import wordnet as wn

from chatbot.model.model_factory import ModelFactory
from chatbot.nlp.keyword import get_tfidf_model, get_stopwords, lemmatize, nb
from chatbot.nlp.synset import SynsetWrapper
from chatbot.util.config_util import Config
from chatbot.util.logger_util import set_logger

if str(os.getenv("LOG")) == "TRUE":
    set_logger()

NOT_FOUND = Config.get_value(['query_system', 'not_found'])
MULTIPLE_ANSWERS = Config.get_value(['query_system', 'multiple_answers'])
CHAR_LIMIT = Config.get_value(['query_system', 'character_limit'])
MAX_ANSWERS = Config.get_value(['query_system', 'max_answers'])
URL_FROM_TEXT = Config.get_value(['query_system', 'url_from_text'])

ANSWER_THRESHOLD = Config.get_value(['query_system', 'answer_threshold'])
SIMILARITY_THRESHOLD = Config.get_value(
    ['query_system', 'similarity_threshold'])

factory = ModelFactory.get_instance()
factory.set_db()


def _handle_not_found(query_text):
    '''
    Inserts this specific query text into the unknown queries collection as
    well as returning a fallback string.
    '''
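
# The module-level constants above are read from a shared configuration via
# Config.get_value. A hypothetical config fragment matching those lookups
# might look like the dict below; the concrete values are assumptions, not
# taken from the original project.
query_system_config = {
    'query_system': {
        'not_found': 'Sorry, I could not find an answer to that.',
        'multiple_answers': 'I found several possible answers:',
        'character_limit': 2000,
        'max_answers': 3,
        'url_from_text': 'Read more: {}',
        'answer_threshold': 0.3,
        'similarity_threshold': 0.9,
    }
}
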