Example #1
import csv
import os
from collections import defaultdict

# util, court, Dictionary and PostingsFile are project-local helpers; the
# import paths below are assumed and may differ in the actual repository.
import util
import court
from dictionary import Dictionary
from postings_file import PostingsFile


def build_index(in_dir, out_dict, out_postings):
    """
    Build the index from the documents stored in the input directory,
    then write out the dictionary file and postings file.
    """
    print('indexing...')

    # Document filenames are numeric IDs; sort them numerically so postings
    # are built in increasing document ID order
    indexing_doc_files = sorted(map(int, os.listdir(in_dir)))

    dictionary = Dictionary(out_dict)
    postings = PostingsFile(out_postings)

    temp_dictionary = defaultdict(lambda: defaultdict(int))

    # For each document, read its terms and accumulate them in the temporary in-memory postings
    for document in indexing_doc_files:
        terms = util.read_document(in_dir, document)
        tf_for_doc = defaultdict(int)

        for term in terms:
            tf_for_doc[term] += 1
            temp_dictionary[term][document] += 1

        # Record the document's normalised length and bump the document count in the dictionary
        dictionary.add_normalised_doc_length(document, tf_for_doc)
        dictionary.add_doc_count()

    # Format the in-memory postings for writing to the postings file
    postings.format_posting(temp_dictionary)

    # Write the postings file (recording term offsets in the dictionary), then save the dictionary
    postings.save(dictionary)
    dictionary.save()
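
The example relies on two project-local helpers, Dictionary and PostingsFile, whose code is not shown; in particular, add_normalised_doc_length is only called, never defined. A plausible sketch of what it computes, assuming the usual lnc document weighting for a vector space model (an assumption, not confirmed by the source), is the Euclidean length of the document's log-weighted term-frequency vector:

import math

def normalised_doc_length(tf_for_doc):
    """Length of the lnc-weighted tf vector of one document (assumed scheme)."""
    # Weight each distinct term as 1 + log10(tf), then take the Euclidean norm.
    # Dividing a document's score by this length later corrects for document size.
    weights = [1 + math.log10(tf) for tf in tf_for_doc.values()]
    return math.sqrt(sum(w * w for w in weights))

Under that reading, Dictionary.add_normalised_doc_length(document, tf_for_doc) would simply store this value keyed by document ID so the search phase can normalise scores.
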
def process_csv(dataset_file, out_dict):
    """
    Parses the CSV dataset file to build the index and postings lists.

    Params:
        - dataset_file: Path to dataset
        - out_dict: Path to save dictionary to

    Returns:
        - dictionary: Dictionary containing index and postings
    """
    dictionary = Dictionary(out_dict)

    with open(dataset_file, encoding="utf8") as dataset_csv:
        csv_reader = csv.reader(dataset_csv)
        next(csv_reader, None)  # Skip the CSV header row

        prev_docId = None
        for row in csv_reader:

            docId = row[0]

            # Skip rows that repeat the previous document ID
            if prev_docId == docId:
                continue

            # Preprocess the document's text fields into tokens and add them to the
            # postings lists (row[4] also holds the court, used for weighting below)
            tokens = util.preprocess_content(row[1] + " " + row[2] + " " +
                                             row[3] + " " + row[4])
            normalised_tf = dictionary.add_tokens_of_doc(tokens, docId)

            # Record the document's normalised length, court weight and count in the dictionary
            dictionary.add_normalised_doc_length(docId, normalised_tf)
            dictionary.add_court_weight(docId, court.get_court_weight(row[4]))
            dictionary.add_doc_count()

            prev_docId = docId

    return dictionary
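
Both functions delegate tokenisation to util.preprocess_content, which is also not shown. A minimal sketch, assuming NLTK word tokenisation with case folding and Porter stemming (the actual helper may filter stop words or punctuation differently):

import string

from nltk import word_tokenize
from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()

def preprocess_content(content):
    """Case-fold, tokenise and stem raw document text (assumed pipeline)."""
    tokens = word_tokenize(content.lower())
    # Drop tokens that are pure punctuation and stem the rest
    return [_stemmer.stem(tok) for tok in tokens if tok not in string.punctuation]

A driver would then presumably call dictionary = process_csv(dataset_file, out_dict) followed by dictionary.save(), mirroring how build_index saves its dictionary and postings.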