def query_to_vector(raw_query, word2col):
    # create empty query vector
    query_vector = np.zeros(len(word2col))

    # tokenize query
    query_tokens = text_processing.plain_text_to_tokens(raw_query)  # stopwords file argument omitted

    # update term frequencies of query vector
    for token in query_tokens:
        # skip query tokens that are not in the index vocabulary (avoids a KeyError)
        if token not in word2col:
            continue
        column_index = word2col[token]
        query_vector[column_index] += 1

    return query_vector
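A minimal usage sketch of the function above, assuming a tiny hand-built word2col mapping; the vocabulary, document-term matrix, and cosine-similarity scoring are illustrative and not part of the original module.

import numpy as np

# hypothetical vocabulary mapping (word -> column index) for illustration
word2col = {'apple': 0, 'banana': 1, 'cherry': 2}

query_vec = query_to_vector("apple banana apple", word2col)
# query_vec is array([2., 1., 0.]) if the tokenizer lower-cases and splits on whitespace

# score against a hypothetical document-term matrix with cosine similarity
doc_matrix = np.array([[3., 0., 1.],
                       [0., 2., 2.]])
scores = doc_matrix @ query_vec / (
    np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec) + 1e-12)
print(scores)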
Example 2
    def query_to_vector(self, raw_query):
        # create empty query vector
        query_vector = np.zeros(len(self.word2col))

        # tokenize query
        query_tokens = text_processing.plain_text_to_tokens(
            raw_query)  # stopwords file argument omitted

        # update term frequencies of query vector
        for token in query_tokens:
            try:
                column_index = self.word2col[token]
                query_vector[column_index] += 1
            except KeyError:
                logger.info("Query word not found in index: %s (stemmed)" %
                            token)

        return query_vector
def query_to_vector_slow(raw_query):
    # all that is needed is the word2col dictionary
    word2col_file_path = file_io.get_path('word2col_file_path', None)
    with open(word2col_file_path) as json_data:
        word2col = json.load(json_data)

    # create empty query vector
    query_vector = np.zeros(len(word2col))

    # tokenize query
    query_tokens = text_processing.plain_text_to_tokens(raw_query)  # stopwords file argument omitted

    # update term frequencies of query vector
    for token in query_tokens:
        # skip query tokens that are not in the index vocabulary (avoids a KeyError)
        if token not in word2col:
            continue
        column_index = word2col[token]
        query_vector[column_index] += 1

    return query_vector
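The "_slow" variant re-reads and parses the word2col JSON file on every call. Below is a small sketch of one way to avoid that, using a hypothetical cached helper (the class-based version above achieves the same effect by holding self.word2col in memory):

import json
from functools import lru_cache

@lru_cache(maxsize=1)
def _load_word2col():
    # hypothetical helper: load the word2col mapping once and reuse it across queries
    word2col_file_path = file_io.get_path('word2col_file_path', None)
    with open(word2col_file_path) as json_data:
        return json.load(json_data)

def query_to_vector_cached(raw_query):
    # same term-frequency construction as above, without re-reading the file
    return query_to_vector(raw_query, _load_word2col())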
def load_title_document_id_term_frequency_dictionaries(indexed_directory_name):
    logger.info("Loading Title Frequency Dictionaries")
    document_title_dictionary_file_template = \
        file_io.get_template('document_title_file_path') % (indexed_directory_name, '*')
    document_id_term_frequency_dictionary = {}
    for dtd_path in glob.glob(document_title_dictionary_file_template):
        with open(dtd_path) as json_data:
            dtd = json.load(json_data)
            doc_id = str(dtd['document_id'])
            doc_title = dtd['title']

            # substitute a placeholder when the document has no title
            if doc_title is None:
                doc_title = 'NO_TITLE'

            title_tokens = text_processing.plain_text_to_tokens(
                doc_title)  # stopwords file argument omitted
            doc_term_freq_dict = text_processing.word_frequency_dict(
                title_tokens)
            document_id_term_frequency_dictionary[doc_id] = doc_term_freq_dict
    return document_id_term_frequency_dictionary
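A minimal usage sketch; 'my_index' is an assumed indexed directory name, and the printed structure is the one built by the loop above (document id mapped to a term-frequency dictionary of its title).

# hypothetical usage: 'my_index' stands in for a real indexed directory name
title_tf_by_doc_id = load_title_document_id_term_frequency_dictionaries('my_index')
for doc_id, term_freqs in title_tf_by_doc_id.items():
    print(doc_id, term_freqs)  # e.g. '42' {'getting': 1, 'started': 1}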
Example 5
def pull_summary(requested_url,
                 included_attributes=("requested_url", "redirect_history", "status_code",
                                      "content_type", "content_hash",
                                      "normalized_a_hrefs", "normalized_img_srcs")):
    """Access a given URL and return a dictionary of page data."""

    response_summary = {
        'requested_url': requested_url,
        'status_code': 404  # fallback value, kept if the request fails
    }

    try:

        # make request
        response = requests.get(requested_url)

        # set "status_code" value
        response_summary['status_code'] = response.status_code

        # log status code
        logger.info("Response Status Code: %d" % response.status_code)

        # continue if status is 200
        if response.status_code == 200:

            # set 'content_hash' value
            response_summary['content_hash'] = str(hashlib.md5(str.encode(response.text)).hexdigest())

            # set 'redirect_history'  value
            response_summary['redirect_history'] = []
            for redirect in response.history:
                response_summary['redirect_history'].append((redirect.url, redirect.status_code))

            # set 'content_type' value
            if 'content-type' in response.headers:
                response_summary['content_type'] = response.headers['content-type']

            # set 'binary_response_content' value
            if 'binary_response_content' in included_attributes:
                response_summary['binary_response_content'] = response.content

            # set 'plain_text' value
            if 'plain_text' in included_attributes:
                response_summary['plain_text'] = None
                if response_summary['content_type'].split(';')[0] in file_parser.acepted_content_types():
                    response_summary['plain_text'] = file_parser.extract_plain_text(response.text, response_summary['content_type'])

            # set 'tokens' value
            if 'tokens' in included_attributes:
                if response_summary['content_type'].split(';')[0] in file_parser.acepted_content_types():
                    plain_text = file_parser.extract_plain_text(response.text, response_summary['content_type'])
                    response_summary['tokens'] = text_processing.plain_text_to_tokens(plain_text)

            # set 'term_frequency_dict' value
            if 'term_frequency_dict' in included_attributes:
                if response_summary['content_type'].split(';')[0] in file_parser.acepted_content_types():
                    plain_text = file_parser.extract_plain_text(response.text, response_summary['content_type'])
                    tokens = text_processing.plain_text_to_tokens(plain_text)
                    response_summary['term_frequency_dict'] = text_processing.word_frequency_dict(tokens)

            # if type "text/html" - read links
            if 'normalized_a_hrefs' in included_attributes or 'normalized_img_srcs' in included_attributes:
                if response.headers.get('content-type', '').startswith("text/html"):

                    # Note: base_url is requested_url

                    # set 'normalized_a_hrefs'
                    response_summary['normalized_a_hrefs'] = normalize_urls(requested_url, file_parser.extract_a_hrefs_list(response.text))

                    # set 'normalized_img_srcs'
                    response_summary['normalized_img_srcs'] = normalize_urls(requested_url, file_parser.extract_img_srcs_list(response.text))

    except Exception:
        logger.error("Requested Page: %s, Failed to read." % response_summary['requested_url'])
        logger.error(sys.exc_info())

    # filter attributes not in included_attributes tuple parameter
    response_summary = {k: v for k, v in response_summary.items() if k in included_attributes}

    return response_summary
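A minimal usage sketch; the URL is a placeholder and the attribute names come from the default included_attributes tuple above.

summary = pull_summary("https://example.com/")
if summary.get('status_code') == 200:
    print(summary.get('content_type'))
    print(len(summary.get('normalized_a_hrefs', [])))  # outgoing links found on the page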
Example 6
    def crawl_site(self,
                   seed_url,
                   output_directory_name,
                   max_urls_to_index=None,
                   stopwords_file=None):

        self.output_directory_name = output_directory_name
        self.max_urls_to_index = max_urls_to_index
        self.stopwords_file = stopwords_file

        # resolve seed url
        self.seed_url = self.url_resolver.resolve(seed_url)

        # set forbidden_urls
        self.read_robots()

        # add seed url to url frontier
        self.url_frontier.add(self.seed_url)

        # log info
        logger.info("Beginning Site Crawl: %s" % self.seed_url)
        if self.max_urls_to_index is not None:
            logger.info("Number of Sites to Index: %d" %
                        self.max_urls_to_index)
        else:
            logger.info("Index Forever")

        # begin crawl
        while self.continue_indexing():

            # retrieve url to index
            target_url = self.url_frontier.remove()

            # ensure it is resolved
            target_url = self.url_resolver.resolve(target_url)

            # add it to url_id_map (index)
            self.url_id_map.add(target_url)

            # log info
            logger.info("Crawling URL Number: %d" %
                        self.url_id_map[target_url])
            logger.info("Crawling URL: %s" % target_url)

            # access site and get response summary
            response_summary = url_accessor.get_response_summary(
                target_url, self.url_resolver)

            if not response_summary['broken']:

                # save response summary (add id , remove binary_response_content)
                written_response_summary = {
                    k: v
                    for k, v in response_summary.items()
                    if k != 'binary_response_content'
                }
                written_response_summary['url_id'] = self.url_id_map[
                    target_url]


                response_summary_directory = self.directory_structure_dict['path_templates'][
                                                 'response_summaries_directory_path_template'] \
                                             % (self.output_directory_name)

                # if response_summary_directory does not exist, create it
                if not os.path.exists(response_summary_directory):
                    os.makedirs(response_summary_directory)

                response_summary_file_path = self.directory_structure_dict['path_templates']['response_summaries_file_path_template'] \
                    % (self.output_directory_name, written_response_summary['url_id'])

                # write response summary file
                with open(response_summary_file_path, 'w') as file:
                    file.write(json.dumps(written_response_summary))

                if response_summary[
                        'document_hash'] not in self.indexed_document_hashes:

                    # extract and tokenize plain text, then save to document file
                    plain_text = self.file_parser.extract_plain_text(
                        response_summary['binary_response_content'],
                        response_summary['content_type'])
                    tokens = text_processing.plain_text_to_tokens(
                        plain_text, self.stopwords_file)

                    # write tokens to document file
                    document_directory = self.directory_structure_dict['path_templates'][
                                                     'document_directory_path_template'] \
                                                 % (self.output_directory_name, response_summary['document_hash'])

                    # if document_directory does not exist, create it
                    if not os.path.exists(document_directory):
                        os.makedirs(document_directory)

                    document_tokens_file_path = self.directory_structure_dict['path_templates'][
                                                     'document_tokens_file_path_template'] \
                                                 % (self.output_directory_name, response_summary['document_hash'])

                    with open(document_tokens_file_path, 'w') as file:
                        file.write(json.dumps(tokens))

                    # update document hash index if document has not been seen
                    self.indexed_document_hashes.add(
                        response_summary['document_hash'])

            # add new URLs to the frontier and continue crawling
            for filtered_url in self.filter_urls(
                    response_summary['resolved_normalized_a_hrefs']):
                self.url_frontier.add(filtered_url)

            print("Before Filter")
            print(response_summary['resolved_normalized_a_hrefs'])
            print("After Filter")
            print(
                self.filter_urls(
                    response_summary['resolved_normalized_a_hrefs']))
            print("Queue")
            print(self.url_frontier.to_list())