def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')

    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1

    for docID in os.listdir(in_dir):
        # read and tokenise the whole document; the with-block closes the file handle
        with open(f'{in_dir}/{docID}', 'r') as f:
            content_tokens = word_tokenize(f.read())
        for word in content_tokens:
            term = stemmer.stem(word=word).lower()

            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_to_offset(old_offset, docID)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_to_offset(offset, docID)
                offset += 1

            dictionaries.increment_frequency(term)

    postings.save_to_file(dictionaries)
    dictionaries.save_to_file()
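Example #1 assumes Dictionaries and Postings helper classes whose interface is only implied by the calls above. The following is a minimal in-memory sketch that satisfies those calls; the attribute names and the on-disk formats are illustrative assumptions, not the original implementation.

# Hypothetical minimal helpers matching the calls in Example #1; the real
# classes presumably add frequency/offset bookkeeping, compression, etc.
class Dictionaries:
    def __init__(self, out_dict):
        self.out_dict = out_dict
        self.terms = {}  # term -> [offset, frequency count]

    def has_term(self, term):
        return term in self.terms

    def get_offset(self, term):
        return self.terms[term][0]

    def add_term(self, term, offset):
        self.terms[term] = [offset, 0]

    def increment_frequency(self, term):
        self.terms[term][1] += 1

    def save_to_file(self):
        with open(self.out_dict, 'w') as f:
            for term, (offset, freq) in sorted(self.terms.items()):
                f.write(f'{term} {freq} {offset}\n')


class Postings:
    def __init__(self, out_postings):
        self.out_postings = out_postings
        self.lists = {}  # offset -> list of doc ids

    def add_doc_id(self, offset):
        self.lists[offset] = []

    def add_docId_to_offset(self, offset, doc_id):
        if doc_id not in self.lists[offset]:
            self.lists[offset].append(doc_id)

    def save_to_file(self, dictionaries):
        # dictionaries is unused in this sketch; the real class may use it
        # to keep the two output files in a consistent order
        with open(self.out_postings, 'w') as f:
            for offset in sorted(self.lists):
                f.write(' '.join(str(d) for d in self.lists[offset]) + '\n')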
Example #2
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir),
                            key=lambda x: int(x))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # convert the line numbers recorded in the dictionary into byte offsets,
    # so that a term's postings line can later be read with a single seek
    with open(postings_file) as f:
        current_line = 1
        f.readline()  # skip postings list containing all doc ids
        while True:
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, f.tell())
            line = f.readline()
            if not line:
                break
            current_line += 1
    dictionary.save()
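The second pass in Example #2 rewrites each term's line number as a byte offset into the postings file. The payoff comes at query time: one seek retrieves a term's postings line without scanning the whole file. A small sketch of that lookup, assuming space-separated doc ids per line (the function name is illustrative):

def load_postings_line(postings_file, byte_offset):
    # jump straight to the stored byte offset and read a single postings line
    with open(postings_file) as f:
        f.seek(byte_offset)
        return [int(doc_id) for doc_id in f.readline().split()]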
Example #3
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = training_file
        doc_path = osp.join(training_data_dir, training_file)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # convert the line numbers recorded in the dictionary into byte offsets
    with open(postings_file) as f:
        current_line = 0
        while True:
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, f.tell(), update_freq=False)
            line = f.readline()
            if not line:
                break
            current_line += 1
    dictionary.generate_idf(len(training_files))
    dictionary.save()
Example #4
def build_index(directory, dictionary_file, postings_file):
	files = os.listdir(directory)
	dictionary = Dictionary(dictionary_file)
	postings = Postings(postings_file)
	stemmer = nltk.stem.porter.PorterStemmer()
	for doc_id in files:
		tf_list = {}
		line_number = 1
		offset = 0
		# Use linecache to get line
		line = linecache.getline(os.path.join(directory, doc_id), line_number)
		while line != '':
			# tokenize lines into sentences
			sentences = nltk.sent_tokenize(line)
			for sentence in sentences:
				# tokenize sentence
				tokens = nltk.word_tokenize(sentence)
				for token in tokens:
					# apply stemming and case folding
					stemmed_token = stemmer.stem(token).lower()
					# if term already exists in dictionary, we find row number
					if dictionary.has_term(stemmed_token):
						offset = dictionary.get_offset(stemmed_token) 
						# If postings for that term already has doc id, 
						# then increment tf,
						# Else increment df and add the doc id
						if postings.has_doc_id(doc_id, offset):
							postings.increment_tf(doc_id, offset)	
						else:
							dictionary.increment_df(stemmed_token)
							postings.add_doc_id(doc_id, offset)
					# else, we add it to dictionary and postings
					else:
						offset = postings.add_new_term()
						postings.add_doc_id(doc_id, offset)
						dictionary.add_new_term(stemmed_token, offset)

					#Keep track of tf values of all terms in doc
					if stemmed_token in tf_list:
						tf_list[stemmed_token] += 1
					else:
						tf_list[stemmed_token] = 1
						
			line_number += 1
			line = linecache.getline(os.path.join(directory, doc_id), line_number)
		# Store doc length
		dictionary.add_doc_length(doc_id, tf_list.values())
	# save data
	postings.save(dictionary)
	dictionary.save()
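Example #4 hands the raw term frequencies collected in tf_list to dictionary.add_doc_length. A common way to turn them into a document length for cosine normalisation is the Euclidean norm of the log-weighted frequencies; the sketch below assumes that scheme, which may differ from the original class.

import math

def document_length(term_frequencies):
    # assume lnc-style weighting: weight each tf as 1 + log10(tf),
    # then take the Euclidean norm of the resulting document vector
    return math.sqrt(sum((1 + math.log10(tf)) ** 2 for tf in term_frequencies))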
Example #5
 def __init__(self, company_id="demo"):
     self.graph_id = company_id
     self.orphans = set()
     self.orphan_list = list()
     self.node_map = dict()
     self.node_id_map = dict()
     self.db_info = dict()
     self.search_postings = Postings()
     # get utils for intents.
     with open(os.path.realpath("chatbot/intentUtils.json")) as data_file:
         self.graph_utils = json.load(data_file)
         # replace doubled backslashes with a single backslash.
         for class_string in self.graph_utils["class"]:
             if type(self.graph_utils["class"][class_string]) is str:
                 self.graph_utils["class"][class_string] = self.graph_utils[
                     "class"][class_string].replace('\\\\', '\\')
Example #6
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')

    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1
    count = len(os.listdir(in_dir))

    for docID in os.listdir(in_dir):
        # read the document, closing the file handle via the with-block
        with open(f'{in_dir}/{docID}', 'r') as f:
            content = f.read()
        sentences = sent_tokenize(content)
        doc_terms = []
        for sentence in sentences:
            for word in word_tokenize(sentence):
                term = stemmer.stem(word=word.lower())
                doc_terms.append(term)

        # Calculate weighted term frequencies for each term
        weighted_term_freqs = [(x[0], get_term_frequency_weight(x[1]))
                               for x in Counter(doc_terms).most_common()]
        # Calculate document vector length
        doc_length = math.sqrt(
            sum(map(lambda x: x[1] * x[1], weighted_term_freqs)))

        for term, normalised_tf in weighted_term_freqs:
            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_tf_to_offset(old_offset, docID,
                                                normalised_tf / doc_length)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_tf_to_offset(offset, docID,
                                                normalised_tf / doc_length)
                offset += 1

    postings.save_to_file(dictionaries, count)
    dictionaries.save_to_file()
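Because Example #6 stores tf weights already divided by the document vector length, ranking a query later reduces to accumulating dot products between query weights and the stored normalised weights. A generic sketch of that accumulation (the data shapes are assumptions, not the original Searcher):

from collections import defaultdict

def score_documents(query_weights, postings_by_term):
    # query_weights: {term: weight}
    # postings_by_term: {term: [(doc_id, normalised_tf), ...]}
    scores = defaultdict(float)
    for term, q_weight in query_weights.items():
        for doc_id, normalised_tf in postings_by_term.get(term, []):
            scores[doc_id] += q_weight * normalised_tf
    # highest cosine score first
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)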
Example #7
def run_search(dict_file, postings_file, queries_file, results_file):
    """
    Using the given dictionary file and postings file,
    perform searching on the given queries file and output the results to a file
    """
    print('Running search on the queries...')

    dictionaries = Dictionaries(dict_file)
    dictionaries.load()
    postings = Postings(postings_file)
    searcher = Searcher(dictionaries, postings)

    result_string = ''
    with open(queries_file, 'r') as f, open(results_file, 'w') as o:
        for query in f:
            searcher.set_query(query.strip())
            output = searcher.evaluate_query()
            result_string += output.strip() + '\n'
            searcher.clear_postings()
        # the with-block closes both files; explicit close() calls are not needed
        o.write(result_string.strip())
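The Searcher used in Example #7 is not shown here. Whatever query model evaluate_query implements, retrieval over per-term postings lists usually bottoms out in a merge-style intersection of two sorted doc-id lists; the sketch below shows that primitive in isolation, not the original class.

def intersect(p1, p2):
    # linear merge of two sorted doc-id lists (the classic AND operation)
    result, i, j = [], 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            result.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            i += 1
        else:
            j += 1
    return result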
Example #8
def build_index(directory, dictionary_file, postings_file):
	files = os.listdir(directory)
	dictionary = Dictionary(dictionary_file)
	postings = Postings(postings_file)
	stemmer = nltk.stem.porter.PorterStemmer()
	for doc_id in files:
		postings.add_doc(doc_id)
		line_number = 1
		#Use linecache to get line
		line = linecache.getline(os.path.join(directory, doc_id), line_number)
		while line != '':
			#tokenize lines into sentences
			sentences = nltk.sent_tokenize(line)
			for sentence in sentences:
				#tokenize sentence
				tokens = nltk.word_tokenize(sentence)
				for token in tokens:
					#apply stemming and case folding
					stemmed_token = stemmer.stem(token).lower()
					# if term already exists in dictionary, we find row number
					if dictionary.has_term(stemmed_token):
						offset = dictionary.get_offset(stemmed_token)
						result = postings.add_doc_id(doc_id, offset)
						# Result indicates if the doc id is new
						if result:
							dictionary.increment_frequency(stemmed_token)
					#else, we add it to dictionary and postings
					else:
						offset = postings.add_new_term()
						postings.add_doc_id(doc_id, offset)
						dictionary.add_new_term(stemmed_token, offset)
						
			line_number += 1
			line = linecache.getline(os.path.join(directory, doc_id), line_number)
	#save data
	postings.save(dictionary)
	dictionary.save()
Example #9
 def __init__(self, dictionary_file, postings_file):
     self.dictionary = Dictionary(dictionary_file)
     self.postings = Postings(postings_file)
     self.dictionary.load()
     self.all_docs = self.postings.load_list(0)
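Loading the full doc-id list (load_list(0)) up front is presumably there to support NOT-style queries: the complement of a term's postings is just the difference against all_docs. A minimal sketch of that step, assuming plain lists of doc ids:

def negate(term_postings, all_docs):
    # every document in the collection that does not contain the term
    present = set(term_postings)
    return [doc_id for doc_id in all_docs if doc_id not in present]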
Example #10
 def build_extraction_postings(self, db_object, redis_object,
                               extraction_indices):
     if self.db_info and self.db_info.get("mappings"):
         map_names = self.db_info.get("mappings", []) or []
         for map_name in map_names:
             # initializations
             mapping = dict()
             postings_object = Postings()
             # skip if entry for the map exists in redis
             map_value = redis_object.get(map_name)
             # in case the entry has not been populated before
             if not map_value:
                 # get mapping from DB
                 mapping = db_object["mappings"].find_one(
                     {"name": map_name})
                 mapping.pop("_id")
                 # store mapping in Redis
                 redis_object.set(map_name, json.dumps(mapping))
             else:
                 mapping = json.loads(map_value)
             entries = mapping.get("map")
             tokenized_entries = []
             fields_to_index = mapping.get("toIndex")
             # build postings
             for i, entry in enumerate(entries):
                 # use active entries
                 if entry.get("active"):
                     # merge all texts
                     stripped_text = utils.remove_non_alpha_num_chars(
                         " ".join(
                             filter(
                                 lambda x: bool(x),
                                 reduce(lambda x, y: x + y, [
                                     entry.get(field, []) or [] if type(
                                         entry.get(field, []) or []) == list
                                     else [str(entry[field])]
                                     for field in fields_to_index
                                 ], []))))[0]
                     # generate tokens and record this entry index for each one
                     # (explicit loop so the calls run even under Python 3's lazy map)
                     if stripped_text:
                         for token in set(utils.lemmatize_text(
                                 stripped_text.lower())):
                             postings_object.add_document_for_token(token, i)
                     if not map_value:
                         # construct tokens for all constituents of the entry and
                         # store them in redis if not already there; list() keeps
                         # the result JSON-serialisable under Python 3
                         tokenized_elements = list(map(
                             lambda x: sorted(
                                 utils.lemmatize_text(
                                     utils.remove_non_alpha_num_chars(x)[0])
                             ),
                             filter(
                                 lambda x: bool(x),
                                 reduce(lambda x, y: x + y, [
                                     entry.get(field, []) or [] if type(
                                         entry.get(field, []) or []) == list
                                     else [str(entry[field])]
                                     for field in fields_to_index
                                 ], []))))
                         tokenized_entries.append(tokenized_elements)
                 else:
                     if not map_value:
                         tokenized_entries.append(None)
             extraction_indices[map_name] = postings_object
             if not map_value:
                 # set tokenized mappings in redis if not already there
                 redis_object.set("tokenized" + map_name,
                                  json.dumps(tokenized_entries))
Example #11
 def __init__(self, fd, fp):
     self.dictionary = Dictionary(fd, load=True)
     self.postings = Postings(fp, mode='r')