Example #1
0
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')

    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1

    for docID in os.listdir(in_dir):
        # Read and tokenise the document, closing the file handle when done
        with open(f'{in_dir}/{docID}', 'r') as f:
            content_tokens = word_tokenize(f.read())
        for word in content_tokens:
            term = stemmer.stem(word=word).lower()

            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_to_offset(old_offset, docID)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_to_offset(offset, docID)
                offset += 1

            dictionaries.increment_frequency(term)

    postings.save_to_file(dictionaries)
    dictionaries.save_to_file()
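
The Dictionaries and Postings helper classes that Examples #1 and #3 call are not shown. A minimal in-memory sketch of what Example #1 appears to assume is given below; the attribute names and the pickle-based save format are assumptions (a real submission would more likely write byte offsets into the postings file), only the method names come from the example itself.

import pickle


class Dictionaries:
    """In-memory term dictionary mapping each term to a postings offset."""

    def __init__(self, out_dict):
        self.out_dict = out_dict
        self.terms = {}        # term -> offset handed out by the indexer
        self.frequencies = {}  # term -> collection frequency

    def has_term(self, term):
        return term in self.terms

    def get_offset(self, term):
        return self.terms[term]

    def add_term(self, term, offset):
        self.terms[term] = offset
        self.frequencies[term] = 0

    def increment_frequency(self, term):
        self.frequencies[term] += 1

    def save_to_file(self):
        with open(self.out_dict, 'wb') as f:
            pickle.dump((self.terms, self.frequencies), f)


class Postings:
    """In-memory postings lists keyed by the offsets stored in Dictionaries."""

    def __init__(self, out_postings):
        self.out_postings = out_postings
        self.lists = {}  # offset -> list of docIDs

    def add_doc_id(self, offset):
        # Create an empty postings list for a newly seen term
        self.lists[offset] = []

    def add_docId_to_offset(self, offset, doc_id):
        # Append the document only once per term
        if doc_id not in self.lists[offset]:
            self.lists[offset].append(doc_id)

    def save_to_file(self, dictionaries):
        with open(self.out_postings, 'wb') as f:
            pickle.dump(self.lists, f)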
Example #2
0
def build_index(directory, dictionary_file, postings_file):
	files = os.listdir(directory)
	dictionary = Dictionary(dictionary_file)
	postings = Postings(postings_file)
	stemmer = nltk.stem.porter.PorterStemmer()
	for doc_id in files:
		tf_list = {}
		line_number = 1
		offset = 0
		# Use linecache to get line
		line = linecache.getline(os.path.join(directory, doc_id), line_number)
		while line != '':
			# tokenize lines into sentences
			sentences = nltk.sent_tokenize(line)
			for sentence in sentences:
				# tokenize sentence
				tokens = nltk.word_tokenize(sentence)
				for token in tokens:
					# apply stemming and case folding
					stemmed_token = stemmer.stem(token).lower()
					# if the term already exists in the dictionary, look up its offset
					if dictionary.has_term(stemmed_token):
						offset = dictionary.get_offset(stemmed_token) 
						# If postings for that term already has doc id, 
						# then increment tf,
						# Else increment df and add the doc id
						if postings.has_doc_id(doc_id, offset):
							postings.increment_tf(doc_id, offset)	
						else:
							dictionary.increment_df(stemmed_token)
							postings.add_doc_id(doc_id, offset)
					# else, we add it to dictionary and postings
					else:
						offset = postings.add_new_term()
						postings.add_doc_id(doc_id, offset)
						dictionary.add_new_term(stemmed_token, offset)

					#Keep track of tf values of all terms in doc
					if stemmed_token in tf_list:
						tf_list[stemmed_token] += 1
					else:
						tf_list[stemmed_token] = 1
						
			line_number += 1
			line = linecache.getline(os.path.join(directory, doc_id), line_number)
		# Store doc length
		dictionary.add_doc_length(doc_id, tf_list.values())
	# save data
	postings.save(dictionary)
	dictionary.save()
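
Examples #2 and #4 call a different Dictionary/Postings interface, where add_new_term returns an offset and the postings track per-document term frequencies. A rough sketch matching the calls in Example #2 follows; the internal data layout and the pickle save format are assumptions, and Example #4's add_doc and increment_frequency would be analogous additions.

import math
import pickle


class Dictionary:
    """Maps each term to its postings offset and document frequency."""

    def __init__(self, dictionary_file):
        self.dictionary_file = dictionary_file
        self.terms = {}        # term -> (offset, df)
        self.doc_lengths = {}  # doc_id -> document vector length

    def has_term(self, term):
        return term in self.terms

    def get_offset(self, term):
        return self.terms[term][0]

    def add_new_term(self, term, offset):
        # A new term starts with df = 1 (the current document)
        self.terms[term] = (offset, 1)

    def increment_df(self, term):
        offset, df = self.terms[term]
        self.terms[term] = (offset, df + 1)

    def add_doc_length(self, doc_id, tfs):
        # Euclidean length of the raw tf vector, for cosine normalisation
        self.doc_lengths[doc_id] = math.sqrt(sum(tf * tf for tf in tfs))

    def save(self):
        with open(self.dictionary_file, 'wb') as f:
            pickle.dump((self.terms, self.doc_lengths), f)


class Postings:
    """One postings list per term; each posting maps doc_id -> term frequency."""

    def __init__(self, postings_file):
        self.postings_file = postings_file
        self.lists = []  # list index == offset returned by add_new_term

    def add_new_term(self):
        self.lists.append({})
        return len(self.lists) - 1

    def has_doc_id(self, doc_id, offset):
        return doc_id in self.lists[offset]

    def add_doc_id(self, doc_id, offset):
        # Returns True when the doc id was not yet in this postings list
        if doc_id in self.lists[offset]:
            return False
        self.lists[offset][doc_id] = 1
        return True

    def increment_tf(self, doc_id, offset):
        self.lists[offset][doc_id] += 1

    def save(self, dictionary):
        with open(self.postings_file, 'wb') as f:
            pickle.dump(self.lists, f)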
Example #3
0
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')

    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1
    count = len(os.listdir(in_dir))

    for docID in os.listdir(in_dir):
        # Read the whole document, then close the file handle before tokenising
        with open(f'{in_dir}/{docID}', 'r') as f:
            content = f.read()
        sentences = sent_tokenize(content)
        doc_terms = []
        for sentence in sentences:
            for word in word_tokenize(sentence):
                term = stemmer.stem(word=word.lower())
                doc_terms.append(term)

        # Calculate weighted term frequencies for each term
        weighted_term_freqs = [(x[0], get_term_frequency_weight(x[1]))
                               for x in Counter(doc_terms).most_common()]
        # Calculate document vector length
        doc_length = math.sqrt(
            sum(map(lambda x: x[1] * x[1], weighted_term_freqs)))

        for term, normalised_tf in weighted_term_freqs:
            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_tf_to_offset(old_offset, docID,
                                                normalised_tf / doc_length)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_tf_to_offset(offset, docID,
                                                normalised_tf / doc_length)
                offset += 1

    postings.save_to_file(dictionaries, count)
    dictionaries.save_to_file()
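
Example #3 also depends on a get_term_frequency_weight helper that is not shown. The usual choice for log-weighted term frequency is 1 + log10(tf); a minimal sketch assuming that scheme:

import math


def get_term_frequency_weight(tf):
    # Sub-linear tf scaling: 1 + log10(tf); absent terms get weight 0
    return 1 + math.log10(tf) if tf > 0 else 0

With this weighting, dividing each weight by doc_length in the loop stores the cosine-normalised log tf for every (term, document) pair.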
Example #4
0
def build_index(directory, dictionary_file, postings_file):
	files = os.listdir(directory)
	dictionary = Dictionary(dictionary_file)
	postings = Postings(postings_file)
	stemmer = nltk.stem.porter.PorterStemmer()
	for doc_id in files:
		postings.add_doc(doc_id)
		line_number = 1
		#Use linecache to get line
		line = linecache.getline(os.path.join(directory, doc_id), line_number)
		while line != '':
			#tokenize lines into sentences
			sentences = nltk.sent_tokenize(line)
			for sentence in sentences:
				#tokenize sentence
				tokens = nltk.word_tokenize(sentence)
				for token in tokens:
					#apply stemming and case folding
					stemmed_token = stemmer.stem(token).lower()
					# if the term already exists in the dictionary, look up its offset
					if dictionary.has_term(stemmed_token):
						offset = dictionary.get_offset(stemmed_token)
						result = postings.add_doc_id(doc_id, offset)
						# Result indicates if the doc id is new
						if result:
							dictionary.increment_frequency(stemmed_token)
					#else, we add it to dictionary and postings
					else:
						offset = postings.add_new_term()
						postings.add_doc_id(doc_id, offset)
						dictionary.add_new_term(stemmed_token, offset)
						
			line_number += 1
			line = linecache.getline(os.path.join(directory, doc_id), line_number)
	#save data
	postings.save(dictionary)
	dictionary.save()
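
All four examples expose the same build_index(input_directory, dictionary_file, postings_file) entry point. A typical command-line driver for this kind of assignment is sketched below; the flag names and getopt-based parsing are assumptions, not part of the examples.

import getopt
import sys


def usage():
    print(f'usage: {sys.argv[0]} -i directory-of-documents -d dictionary-file -p postings-file')


if __name__ == '__main__':
    in_dir = out_dict = out_postings = None
    opts, _ = getopt.getopt(sys.argv[1:], 'i:d:p:')
    for opt, val in opts:
        if opt == '-i':
            in_dir = val
        elif opt == '-d':
            out_dict = val
        elif opt == '-p':
            out_postings = val
    if None in (in_dir, out_dict, out_postings):
        usage()
        sys.exit(2)
    build_index(in_dir, out_dict, out_postings)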