Example 1
    def load_url_indexer(self):

        # load url_id_index
        url_indexer_file_path = file_io.get_path('url_id_map_file', None)
        if url_indexer_file_path is not None:
            self.url_id_index.load(url_indexer_file_path)

        # load url_resolver
        url_resolver_file_path = file_io.get_path('resolved_url_map_file',
                                                  None)
        if url_resolver_file_path is not None:
            self.url_resolver.load(url_resolver_file_path)
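
The method assumes self.url_id_index and self.url_resolver each expose a load(path) method. A minimal illustrative sketch of such an index object (the real classes are not shown in these examples):

import json

class SimpleMapIndex:
    # Hypothetical stand-in for url_id_index / url_resolver:
    # a JSON-backed lookup table with a load(path) method.
    def __init__(self):
        self._map = {}

    def load(self, path):
        with open(path) as f:
            self._map = json.load(f)

    def __getitem__(self, key):
        return self._map[key]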
Example 2
def get_docID2url_map():
    hash_id_map_file = file_io.get_path("hash_id_map_file", None, force=True)
    with open(hash_id_map_file) as json_data:
        hash_id_map = json.load(json_data)

    # hash_id_df = pd.DataFrame(hash_id_tuple_list, columns=['hash', 'id'])

    hash_url_list_map_file = file_io.get_path("hash_url_list_map_file", None, force=True)
    with open(hash_url_list_map_file) as json_data:
        hash_url_list_map = json.load(json_data)

    # take all urls
    # docID_url_map = {hash_id_map[h]: hash_url_list_map[h] for h in hash_id_map}

    # take only the first url ('h' avoids shadowing the builtin hash)
    docID_url_map = {hash_id_map[h]: hash_url_list_map[h][0] for h in hash_id_map}
    return docID_url_map
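
A minimal usage sketch, assuming the two JSON map files already exist and that file_io is the project's own path helper; docID 0 is an arbitrary illustrative key:

docID2url = get_docID2url_map()
# every docID maps to the first URL recorded for its content hash
print(docID2url[0])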
Example 3
    def get_title(
            self,
            docID):  # TODO: Merge title databases so outputdir is not needed
        title_path = file_io.get_path('document_title_file_path',
                                      [self.output_directory_name, docID])
        with open(title_path) as json_data:
            dtd = json.load(json_data)
            doc_title = dtd['title']
        return doc_title
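
For context, a sketch of the per-document title file this method reads; the single 'title' key is inferred from dtd['title'], and any other fields are unknown:

{"title": "Some Crawled Page Title"}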
Example 4
    def load_indexes(self):
        logger.info("Loading Index Files")
        self.url_indexer.load_url_indexer()
        self.document_indexer.load_document_indexer()

        # load url frontier
        url_frontier_file_path = file_io.get_path('url_frontier_file', None)
        if url_frontier_file_path is not None:
            self.url_frontier.load(url_frontier_file_path)
Example 5
def load_leader_follower_dictionary():
    logger.info("Loading Leader Follower File...")
    leader_follower_docID_dict_file_path = file_io.get_path('leader_follower_file_path', None)
    if leader_follower_docID_dict_file_path is not None:
        with open(leader_follower_docID_dict_file_path) as json_data:
            leader_follower_dict_json = json.load(json_data)
            # cast docID key to int
            leader_follower_dict = {int(k): v for k, v in leader_follower_dict_json.items()}
        return leader_follower_dict
    else:
        logger.error("Leader Follower File Not Found")
        return None
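
A hedged usage sketch, assuming the file maps leader docIDs to lists of follower docIDs (the int cast suggests integer keys; the value shape is an assumption):

leader2followers = load_leader_follower_dictionary()
if leader2followers is not None:
    for leader_id, follower_ids in leader2followers.items():
        print(leader_id, len(follower_ids))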
Example 6
    def load_url_indexer(self):

        # load url_id_index
        url_indexer_file_path = file_io.get_path('url_id_map_file', None)
        if url_indexer_file_path is not None:
            self.url_id_index.load(url_indexer_file_path)

        # load url_resolver
        url_resolver_file_path = file_io.get_path('resolved_url_map_file',
                                                  None)
        if url_resolver_file_path is not None:
            self.url_resolver.load(url_resolver_file_path)

        # new
        # load hash_url_list_map
        hash_url_list_map_path = file_io.get_path('hash_url_list_map_file',
                                                  None)
        if hash_url_list_map_path is not None:
            with open(hash_url_list_map_path) as json_data:
                self.hash_url_list_map = json.load(json_data)
Example 7
def get_docID2url_map():

    hash_id_map_file = file_io.get_path("hash_id_map_file", None, force=True)
    with open(hash_id_map_file) as json_data:
        hash_id_map = json.load(json_data)

    hash_url_list_map_file = file_io.get_path("hash_url_list_map_file",
                                              None,
                                              force=True)
    with open(hash_url_list_map_file) as json_data:
        hash_url_list_map = json.load(json_data)

    # take all urls
    # docID_url_map = {hash_id_map[h]: hash_url_list_map[h] for h in hash_id_map}

    # take only the first url ('h' avoids shadowing the builtin hash)
    docID_url_map = {
        hash_id_map[h]: hash_url_list_map[h][0]
        for h in hash_id_map
    }
    return docID_url_map
Example 8
def get_document_term_frequency_matrix(indexed_directory_name, write=True):
    # check if document term frequency matrix has already been created
    matrix_file = file_io.get_path("document_term_frequency_matrix_file_path", None, force=True)

    # if the matrix file already exists, load it and return it
    if os.path.isfile(matrix_file):
        logger.info("Accessing document term frequency matrix already in existence at: %s" % matrix_file)
        return pd.read_csv(matrix_file, index_col="docID")  # , dtype={'docID': np.int64, 'document text': str})

    # if the document term frequency matrix does not exist, construct it from document frequency json files
    id_tf_dict = load_document_frequency_dicts(indexed_directory_name)

    unique_words = set()
    for doc_id, tf_dict in id_tf_dict.items():
        unique_words = unique_words | set(tf_dict.keys())

    doc_freq_matrix = pd.DataFrame(columns=sorted(unique_words))  # sorted for a stable column order
    doc_freq_matrix.index.name = "docID"
    for doc_id, tf_dict in id_tf_dict.items():
        terms, freqs = zip(*tf_dict.items())
        df = pd.DataFrame(data=[freqs], columns=terms, index=[int(doc_id)])
        # df.index.name = "docID"
        doc_freq_matrix = pd.concat([doc_freq_matrix, df], join='outer')
    doc_freq_matrix = doc_freq_matrix.fillna(value=0)

    # sort by docID
    doc_freq_matrix = doc_freq_matrix.sort_index()  # by='docID')

    # set index column name to docID
    # doc_freq_matrix.index.name = 'docID'

    # write to csv
    if write:
        logger.info("Writing Document Term Frequency Matrix")
        matrix_file = file_io.get_path("document_term_frequency_matrix_file_path", None, force=True)
        doc_freq_matrix.to_csv(matrix_file)

    return doc_freq_matrix
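
A usage sketch, assuming documents were already indexed into per-document term-frequency JSON files; 'my_crawl' is a hypothetical directory name:

dtf = get_document_term_frequency_matrix("my_crawl")
print(dtf.shape)  # (number of docIDs, number of unique terms)
print(dtf.loc[0].nlargest(5))  # five most frequent terms in docID 0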
Example 9
    def load_matrices(self, matrix_names):

        if "leader_document_vector_matrix" in matrix_names:
            # load leader document vector matrix
            ldvm_file_path = file_io.get_path(
                "leader_document_vector_matrix_file_path",
                [self.output_directory_name])
            self.leader_document_vector_matrix = np.load(ldvm_file_path)

        if "title_document_vector_matrix" in matrix_names:
            # load title document vector matrix
            tdvm_file_path = file_io.get_path(
                "title_document_vector_matrix_file_path",
                [self.output_directory_name])
            self.title_document_vector_matrix = np.load(tdvm_file_path)

        if "full_document_vector_matrix" in matrix_names:
            # load full document vector matrix
            fdvm_file_path = file_io.get_path(
                "full_document_vector_matrix_file_path",
                [self.output_directory_name])
            self.full_document_vector_matrix = np.load(fdvm_file_path)

        if "tfidf_matrix" in matrix_names:
            # load tfidf matrix
            tfidf_matrix_file_path = file_io.get_path(
                "tfidf_matrix_file_path", [self.output_directory_name])
            self.tfidf_matrix = np.load(tfidf_matrix_file_path)

        if "tfidf_leader_document_vector_matrix" in matrix_names:
            # load leader document vector matrix
            tfidf_ldvm_file_path = file_io.get_path(
                "tfidf_leader_document_vector_matrix_file_path",
                [self.output_directory_name])
            self.tfidf_leader_document_vector_matrix = np.load(
                tfidf_ldvm_file_path)
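
A usage sketch, assuming the matrices were written earlier with np.save (np.load implies .npy files) and that 'engine' is a hypothetical instance of the owning class:

engine.load_matrices(["tfidf_matrix", "leader_document_vector_matrix"])
print(engine.tfidf_matrix.shape)
print(engine.leader_document_vector_matrix.shape)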
Example 10
def query_to_vector_slow(raw_query):
    # all that is needed is the word2col dictionary
    word2col_file_path = file_io.get_path('word2col_file_path', None)
    with open(word2col_file_path) as json_data:
        word2col = json.load(json_data)

    # create empty query vector
    query_vector = np.zeros(len(word2col))

    # tokenize query
    query_tokens = text_processing.plain_text_to_tokens(raw_query)  # , stopwords file)

    # update term frequencies of the query vector, skipping
    # out-of-vocabulary tokens that would otherwise raise KeyError
    for token in query_tokens:
        if token in word2col:
            query_vector[word2col[token]] += 1

    return query_vector
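
A usage sketch, assuming word2col was written during indexing; the query string is arbitrary:

query_vector = query_to_vector_slow("open source search engine")
# a dense term-frequency vector aligned with the matrix columns
print(int(query_vector.sum()), "query tokens matched the vocabulary")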
Example 11
    def load_maps(self):

        # load matrix maps
        matrix_maps_file_path = file_io.get_path("matrix_maps_file_path",
                                                 [self.output_directory_name])
        with open(matrix_maps_file_path, 'rb') as pickle_file:
            self.matrix_maps = pickle.load(pickle_file)

        # loading without optimization loads all
        self.word2col = self.matrix_maps['word2col']  # query_to_vector
        self.col2word = self.matrix_maps['col2word']  # vector_to_tokens
        self.leader_row_2_cluster_indices = self.matrix_maps[
            'leader_row_2_cluster_indices']  # cluster
        self.leader_row_2_cluster_ids = self.matrix_maps[
            'leader_row_2_cluster_ids']  # cluster
        self.tfidf_leader_row_2_cluster_indices = self.matrix_maps[
            'tfidf_leader_row_2_cluster_indices']  # tfidf cluster
        self.tfidf_leader_row_2_cluster_ids = self.matrix_maps[
            'tfidf_leader_row_2_cluster_ids']  # tfidf cluster
        self.docID2url = self.matrix_maps['docID2url']  # cluster
        self.row2docID = self.matrix_maps['row2docID']
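
A hedged sketch of how the loaded maps chain together when returning results; 'searcher' is a hypothetical instance that has already called load_maps(), and the row number is illustrative:

best_row = 7  # e.g. the row of the best-scoring document vector
docID = searcher.row2docID[best_row]
print(searcher.docID2url[docID])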
Example 12
def get_document_term_frequency_matrix(indexed_directory_name, write=True):
    id_tf_dict = load_document_frequency_dicts(indexed_directory_name)

    unique_words = set()
    for doc_id, tf_dict in id_tf_dict.items():
        unique_words = unique_words | set(tf_dict.keys())

    doc_freq_matrix = pd.DataFrame(columns=sorted(unique_words))  # sorted for a stable column order
    for doc_id, tf_dict in id_tf_dict.items():
        terms, freqs = zip(*tf_dict.items())
        df = pd.DataFrame(data=[freqs], columns=terms, index=[doc_id])
        # pd.merge would join on the shared term columns and drop the row
        # index (the docIDs), so stack the per-document rows with concat instead
        doc_freq_matrix = pd.concat([doc_freq_matrix, df], join='outer')

    doc_freq_matrix = doc_freq_matrix.fillna(value=0)

    # write to csv
    if write:
        logger.info("Writing Document Term Frequency Matrix")
        matrix_file = file_io.get_path(
            "document_term_frequency_matrix_file_path", None, force=True)
        doc_freq_matrix.to_csv(matrix_file)

    return doc_freq_matrix
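
For orientation, the layout this returns: one row per docID, one column per term, zeros where a document never used a term (all values below are invented):

       apple  search  zebra
0        2.0     1.0    0.0
1        0.0     3.0    1.0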
Example 13
def load_log_file(indexed_directory_name):
    log_file_path = file_io.get_path('log_file', [indexed_directory_name])
    log_info = None  # avoid a NameError when no log file exists
    if log_file_path is not None:
        with open(log_file_path) as json_data:
            log_info = json.load(json_data)
    return log_info
Example 14
    def load_document_indexer(self):
        document_indexer_file_path = file_io.get_path('hash_id_map_file', None)
        if document_indexer_file_path is not None:
            self.hash_id_index.load(document_indexer_file_path)
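
Finally, a sketch of the hash_id_map_file this loads, as implied by get_docID2url_map above: content hashes keyed to integer docIDs (hashes shortened and invented):

{"d41d8cd9...": 0, "9e107d9d...": 1}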