Ejemplo n.º 1
0
def create_graph():
    graph = nx.DiGraph()
    dev_directory = "DEV/"
    graph.add_nodes_from(range(1, len(mapping)+1))
    for direct in os.listdir(dev_directory):

            if direct != '.DS_Store':
                
                for file in os.listdir(dev_directory + direct):
                    
                    # posting(file) returns url and stuff for DOC_ID_DICT and for tokenizer
                    temp = Url(dev_directory + direct + '/' + file)
                    if temp.get_url() in mapping:
                        list_of_links = get_all_sublinks(temp.get_html(), temp.get_url())
                        graph.add_edges_from([(mapping[temp.get_url()], final_edge)for final_edge in list_of_links])
                        if mapping[temp.get_url()] == 34480 or mapping[temp.get_url()] == 34479:
    return graph
Ejemplo n.º 2
0
def get_all_files(dev_directory):

    file_count_name = 'indexes2/inverted_index_'
    file_count = 0
    file_count_name_count = 1
    doc_id = 1
    inverted_index = defaultdict(list)
    reader = Html_Reader()
    for direct in os.listdir(dev_directory):

        if direct != '.DS_Store':

            for file in os.listdir(dev_directory + direct):

                # posting(file) returns url and stuff for DOC_ID_DICT and for tokenizer
                temp = Url(dev_directory + direct + '/' + file)
                try:
                    read = reader.read_file(temp.get_html(), doc_id,
                                            inverted_index)
                    if read:
                        # write to a file the current inverted index, if it is above a certain file count
                        file_count += 1
                        DOC_ID_DICT[doc_id] = temp.get_url()
                        doc_id += 1

                except Exception as e:
                    with open('error.txt', 'w+') as error_file:
                        error_file.write(str(e) + str(temp.get_url()) + "\n")
                if file_count == 1000:
                    write_to_index(inverted_index, file_count_name_count,
                                   file_count_name)
                    inverted_index = defaultdict(list)
                elif file_count > 1000:
                    file_count_name_count += 1
                    file_count = 0
                    # adds the current dict to a file for a partial index
                    # change file_count_name also to write to a different file

    write_to_index(inverted_index, file_count_name_count, file_count_name)
    write_doc_ids(DOC_ID_DICT)