import csv
import os
from collections import defaultdict

# Project-local modules and classes (util, court, Dictionary, PostingsFile) are assumed
# to be importable from this package; their exact import paths are not shown here.


def build_index(in_dir, out_dict, out_postings):
    """
    Build the index from documents stored in the input directory, then output
    the dictionary file and postings file.
    """
    print('indexing...')
    indexing_doc_files = sorted(map(int, os.listdir(in_dir)))
    dictionary = Dictionary(out_dict)
    postings = PostingsFile(out_postings)
    temp_dictionary = defaultdict(lambda: defaultdict(int))

    # For each document, get its terms and add them to the temporary in-memory posting lists
    for document in indexing_doc_files:
        terms = util.read_document(in_dir, document)
        tf_for_doc = defaultdict(int)
        for term in terms:
            tf_for_doc[term] += 1
            temp_dictionary[term][document] += 1

        # Maintain the normalised document length and document count in dictionary.txt
        dictionary.add_normalised_doc_length(document, tf_for_doc)
        dictionary.add_doc_count()

    # Format the postings for storage in the postings list
    postings.format_posting(temp_dictionary)

    # Save the dictionary and postings list, tracking file offsets
    postings.save(dictionary)
    dictionary.save()
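
# Usage sketch for build_index (hedged: the paths below are hypothetical placeholders,
# assuming each document in in_dir is a file named by its numeric document ID,
# e.g. "1", "2", ...):
#
#   build_index('./training_documents', 'dictionary.txt', 'postings.txt')
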
def process_csv(dataset_file, out_dict):
    """
    Parses and processes the CSV data file to create the index and postings lists.

    Params:
        - dataset_file: Path to the dataset
        - out_dict: Path to save the dictionary to

    Returns:
        - dictionary: Dictionary containing the index and postings
    """
    dictionary = Dictionary(out_dict)
    with open(dataset_file, encoding="utf8") as dataset_csv:
        i = 0
        prev_docId = None
        csv_reader = csv.reader(dataset_csv)
        for row in csv_reader:
            i += 1
            # Skip the CSV header row
            if i == 1:
                continue
            docId = row[0]
            # Skip duplicate document IDs
            if prev_docId == docId:
                continue

            # For each document, get the content tokens and add them to the posting lists
            tokens = util.preprocess_content(row[1] + " " + row[2] + " " + row[3] + " " + row[4])
            normalised_tf = dictionary.add_tokens_of_doc(tokens, docId)

            # Maintain document lengths, court weights, and the document count in the dictionary
            dictionary.add_normalised_doc_length(docId, normalised_tf)
            dictionary.add_court_weight(docId, court.get_court_weight(row[4]))
            dictionary.add_doc_count()
            prev_docId = docId

    # The file is closed automatically when the with-block exits
    return dictionary
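
# Usage sketch for process_csv (hedged: the path is hypothetical, and the CSV is
# assumed to have columns [document_id, title, content, date, court] matching the
# row[0]..row[4] accesses above):
#
#   dictionary = process_csv('dataset.csv', 'dictionary.txt')
#   dictionary.save()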