Example #1
    def add_web_page_summary(self, web_page_summary, output_directory_name):
        """ Resolves web page links, Indexes and writes web page summary only if resolved requested url is not in index."""

        # resolve the requested url
        requested_url = web_page_summary['requested_url']
        resolved_requested_url = self.url_resolver.resolve(requested_url)

        # check if resolved requested url is in index. if it is, return
        if resolved_requested_url in self.url_id_index:
            return

        # add new url to index
        self.url_id_index.add(resolved_requested_url)

        # if not in index, resolve all web page links and write to file
        resolved_normalized_a_hrefs = self.url_resolver.resolve_list(web_page_summary['normalized_a_hrefs'])
        resolved_normalized_img_srcs = self.url_resolver.resolve_list(web_page_summary['normalized_img_srcs'])

        # copy web_page_summary, add the resolved links,
        # and add the id key-value pair before writing to file
        written_web_page_summary = web_page_summary.copy()
        written_web_page_summary['id'] = self.url_id_index[resolved_requested_url]
        written_web_page_summary['resolved_requested_url'] = resolved_requested_url
        written_web_page_summary['resolved_normalized_a_hrefs'] = resolved_normalized_a_hrefs
        written_web_page_summary['resolved_normalized_img_srcs'] = resolved_normalized_img_srcs

        # write file
        file_io.save('web_page_summary_file_path', written_web_page_summary, [output_directory_name, written_web_page_summary['id']])
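
Examples #1 and #9 drive a `url_id_index` object through `in`, `add`, item access, and (in Example #7) `to_dict`. The project's class is not shown; below is a minimal sketch of an index satisfying that interface, assuming IDs are handed out in insertion order. The class name and internals are hypothetical.

class IdIndex:
    """Hypothetical index assigning sequential integer IDs to new keys."""

    def __init__(self):
        self._key_to_id = {}

    def add(self, key):
        # assign the next sequential ID only if the key is new
        if key not in self._key_to_id:
            self._key_to_id[key] = len(self._key_to_id)

    def __contains__(self, key):
        return key in self._key_to_id

    def __getitem__(self, key):
        return self._key_to_id[key]

    def to_dict(self):
        return dict(self._key_to_id)

The same interface would also cover the `hash_id_index` used in Examples #2, #5, and #6.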
Example #2
    def save_term_frequency_dictionary(self, term_frequency_dictionary, content_hash, output_directory_name):
        """Index the content hash and write the term frequency dictionary to file."""
        # add new document hash to index
        self.hash_id_index.add(content_hash)
        document_id = self.hash_id_index[content_hash]

        # write file
        file_io.save('document_frequency_dict_file_path', term_frequency_dictionary, [output_directory_name, document_id])
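
All of these examples funnel output through `file_io.save(path_key, data, path_components, output_type=...)`, whose module is not shown. A plausible sketch, assuming `path_key` selects a configured path template and `path_components` fills its placeholders; the template table, file layout, and JSON-only handling here are assumptions (Example #11 also passes output_type='numpy_array' and 'pickle_dict').

import json
import os

# hypothetical template table; the real module presumably loads these from config
PATH_TEMPLATES = {
    'document_frequency_dict_file_path': 'indexed/{0}/tf_dict_{1}.json',
    'log_file': 'indexed/{0}/log.json',
}

def save(path_key, data, path_components, output_type='json'):
    """Save data under the configured path for path_key (JSON-only sketch)."""
    components = path_components or []
    file_path = PATH_TEMPLATES[path_key].format(*components)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if output_type == 'json':
        with open(file_path, 'w') as f:
            json.dump(data, f)
    else:
        raise NotImplementedError(output_type)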
Example #3
    def write_log_file(self):
        self.log_info_dict = {}
        self.log_info_dict['seed_url'] = self.seed_url
        self.log_info_dict['max_urls_to_index'] = self.max_urls_to_index
        self.log_info_dict['robots_disallowed_paths'] = self.forbidden_urls
        # write file
        file_io.save('log_file', self.log_info_dict,
                     [self.output_directory_name])
Example #4
    def save_indexes(self):
        logger.info("Saving Index Files")
        # save the document and URL index files
        self.document_indexer.save_document_indexer()
        self.url_indexer.save_url_indexer()

        # write the url frontier to file
        file_io.save('url_frontier_file', self.url_frontier.to_dict(), None)
Example #5
    def save_document_title_dictionary(self, document_title, content_hash,
                                       output_directory_name):
        # add new document hash to index
        self.hash_id_index.add(content_hash)
        document_id = self.hash_id_index[content_hash]

        # build the document title record with metadata
        document_title_dictionary = {
            'title': document_title['title'],
            'document_id': document_id,
            'content_hash': content_hash
        }
        logger.info("Saving Document Title, ID: %d" % document_id)
        # write file
        file_io.save('document_title_file_path', document_title_dictionary,
                     [output_directory_name, document_id])
Example #6
    def save_term_frequency_dictionary(self, term_frequency_dictionary,
                                       content_hash, output_directory_name):
        # add new document hash to index
        self.hash_id_index.add(content_hash)
        document_id = self.hash_id_index[content_hash]

        # add doc id and hash to term frequency dictionary
        term_frequency_dictionary['document_id'] = document_id
        term_frequency_dictionary['content_hash'] = content_hash

        logger.info("Saving Document Term Frequency Dictonary ID: %d" %
                    document_id)
        # write file
        file_io.save('document_term_frequency_dictionary_file_path',
                     term_frequency_dictionary,
                     [output_directory_name, document_id])
Example #7
    def save_url_indexer(self):
        # write file
        file_io.save('url_id_map_file', self.url_id_index.to_dict(), None)
        file_io.save('resolved_url_map_file', self.url_resolver.to_dict(),
                     None)

        # map from content hash to the list of URLs that point at the same content
        file_io.save('hash_url_list_map_file', self.hash_url_list_map, None)
Example #8
def save_leader_follower_dictionary(doc_freq_matrix_dataFrame):
    logger.info("Saving Leader Follower File...")
    leader_follower_docID_json = cluster_pruning_leader_follower_dict(doc_freq_matrix_dataFrame, to_json=True)
    # write file
    file_io.save('leader_follower_file_path', leader_follower_docID_json, None)
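
Examples #8 and #11 rely on cluster pruning: roughly sqrt(N) documents are chosen as leaders and every document is attached to its most similar leader, so queries can be matched against leaders first. The project's helpers are not shown; here is a minimal sketch of the leader/follower assignment under that standard scheme. The function name, seeding, and the use of cosine similarity are assumptions.

import numpy as np

def leader_follower_assignment(doc_vectors, seed=0):
    """Map each leader row index to the rows that follow it (cosine similarity)."""
    n_docs = doc_vectors.shape[0]
    rng = np.random.default_rng(seed)
    leaders = rng.choice(n_docs, size=max(1, int(np.sqrt(n_docs))), replace=False)

    # L2-normalize rows so dot products become cosine similarities
    norms = np.linalg.norm(doc_vectors, axis=1, keepdims=True)
    normalized = doc_vectors / np.where(norms == 0.0, 1.0, norms)

    similarity = normalized @ normalized[leaders].T  # shape: (n_docs, n_leaders)
    followers = {int(leader): [] for leader in leaders}
    for row in range(n_docs):
        nearest_leader = int(leaders[np.argmax(similarity[row])])
        followers[nearest_leader].append(row)
    return followers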
Example #9
    def add_web_page_summary(self, web_page_summary, output_directory_name):
        """ Resolves web page links, Indexes and writes web page summary only if resolved requested url is not in index."""

        # resolve the requested url
        requested_url = web_page_summary['requested_url']
        resolved_requested_url = self.url_resolver.resolve(requested_url)

        # check if resolved requested url is in index. if it is, return
        if resolved_requested_url in self.url_id_index:
            return

        logger.info("Adding new URL to index: %s" % resolved_requested_url)

        # add new url to index
        self.url_id_index.add(resolved_requested_url)

        # resolve all web page links, if the summary carries those attributes
        resolved_normalized_a_hrefs = []
        if 'normalized_a_hrefs' in web_page_summary:
            resolved_normalized_a_hrefs = self.url_resolver.resolve_list(
                web_page_summary['normalized_a_hrefs'])
        resolved_normalized_img_srcs = []
        if 'normalized_img_srcs' in web_page_summary:
            resolved_normalized_img_srcs = self.url_resolver.resolve_list(
                web_page_summary['normalized_img_srcs'])

        # copy web_page_summary, add the resolved links,
        # and add the url_id key-value pair before writing to file
        written_web_page_summary = web_page_summary.copy()
        written_web_page_summary.update({
            'url_id': self.url_id_index[resolved_requested_url],
            'resolved_requested_url': resolved_requested_url,
            'resolved_normalized_a_hrefs': resolved_normalized_a_hrefs,
            'resolved_normalized_img_srcs': resolved_normalized_img_srcs,
        })

        # write file
        logger.info("Saving response summary")
        file_io.save(
            'web_page_access_log_and_metadata_file_path',
            written_web_page_summary,
            [output_directory_name, written_web_page_summary['url_id']])

        # keep track of all URLs that point at the same content
        # update hash_url_list_map
        if 'content_hash' in web_page_summary:
            content_hash = web_page_summary['content_hash']

            # requested_url and resolved_requested_url are already available as
            # locals computed above; the original code looked up
            # 'resolved_requested_url' in web_page_summary inside a bare
            # try/except, which always failed since only the copy carries it
            urls = [requested_url, resolved_requested_url]
            urls += web_page_summary.get('redirect_history', [])  # already a list

            if content_hash in self.hash_url_list_map:
                existing_urls = self.hash_url_list_map[content_hash]
                urls += existing_urls

            self.hash_url_list_map[content_hash] = list(set(urls))
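
A hedged usage sketch for Example #9, assuming `url_indexer` is an instance of the class these methods belong to and that the crawler hands it a summary dict shaped like this; all field values are illustrative.

web_page_summary = {
    'requested_url': 'http://example.com/a',
    'normalized_a_hrefs': ['http://example.com/b'],
    'normalized_img_srcs': [],
    'redirect_history': ['http://example.com/a'],
    'content_hash': 'd41d8cd98f00b204e9800998ecf8427e',
}
url_indexer.add_web_page_summary(web_page_summary, 'example_run')

# a second summary whose URL resolves to the same target is a no-op,
# because the resolved URL is already in url_id_index
url_indexer.add_web_page_summary(web_page_summary, 'example_run')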
Example #10
    def save_document_indexer(self):
        # write file
        file_io.save('hash_id_map_file', self.hash_id_index.to_dict(), None)
Example #11
def build_matrices_and_maps(indexed_directory_name_list):

    output_directory_name = '_'.join(indexed_directory_name_list)

    # create term_frequency_dictionaries and find unique_words
    combined_full_id_tf_dict = {}
    combined_title_id_tf_dict = {}
    for indexed_directory_name in indexed_directory_name_list:
        full_id_tf_dict = load_document_term_frequency_dictionaries(
            indexed_directory_name)
        title_id_tf_dict = load_title_document_id_term_frequency_dictionaries(
            indexed_directory_name)
        combined_full_id_tf_dict.update(full_id_tf_dict)
        combined_title_id_tf_dict.update(title_id_tf_dict)
    unique_words = find_all_unique_words(
        [combined_full_id_tf_dict, combined_title_id_tf_dict])

    # create full, title and leader dvms and maps
    full_document_vector_matrix, docID2row, word2col = matrix_and_maps(
        combined_full_id_tf_dict, unique_words)
    title_document_vector_matrix, _, _ = matrix_and_maps(
        combined_title_id_tf_dict, unique_words)
    leader_document_vector_matrix, leader_row_2_cluster_indices, leader_row_2_cluster_ids = \
        cluster_pruning_matrix_and_maps(full_document_vector_matrix, docID2row)
    tfidf_matrix = build_tfidf_matrix(full_document_vector_matrix)
    tfidf_leader_document_vector_matrix, tfidf_leader_row_2_cluster_indices, tfidf_leader_row_2_cluster_ids = \
        cluster_pruning_matrix_and_maps(tfidf_matrix, docID2row)

    # save matrices and maps
    file_io.save('full_document_vector_matrix_file_path',
                 full_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')
    file_io.save('title_document_vector_matrix_file_path',
                 title_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')
    file_io.save('leader_document_vector_matrix_file_path',
                 leader_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')
    file_io.save('tfidf_matrix_file_path',
                 tfidf_matrix, [output_directory_name],
                 output_type='numpy_array')
    file_io.save('tfidf_leader_document_vector_matrix_file_path',
                 tfidf_leader_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')

    # save all maps in one file
    matrix_maps = {
        'tfidf_leader_row_2_cluster_indices': tfidf_leader_row_2_cluster_indices,
        'tfidf_leader_row_2_cluster_ids': tfidf_leader_row_2_cluster_ids,
        'leader_row_2_cluster_indices': leader_row_2_cluster_indices,
        'leader_row_2_cluster_ids': leader_row_2_cluster_ids,
        'docID2url': get_docID2url_map(),
        'row2docID': {v: k for k, v in docID2row.items()},
        'docID2row': docID2row,
        'col2word': {v: k for k, v in word2col.items()},
        'word2col': word2col
    }
    file_io.save('matrix_maps_file_path',
                 matrix_maps, [output_directory_name],
                 output_type='pickle_dict')
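
`build_tfidf_matrix` in Example #11 is called but not defined here. A standard construction over a documents-by-terms count matrix, assuming rows are documents; the smoothing below is one common variant (scikit-learn's default), not necessarily the project's.

import numpy as np

def build_tfidf_matrix(count_matrix):
    """Weight a documents-by-terms count matrix by smoothed inverse document frequency."""
    n_docs = count_matrix.shape[0]
    # document frequency: how many documents contain each term
    df = np.count_nonzero(count_matrix, axis=0)
    idf = np.log((1.0 + n_docs) / (1.0 + df)) + 1.0  # smoothed, never divides by zero
    return count_matrix * idf  # idf broadcasts across document rows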