def get_neg_sentence(token, text):
    """Return the first '.'-delimited sentence containing *token* whose
    sentiment is negative, or None if no such sentence exists.

    A sentence qualifies only when some word in it matches *token* after
    processing AND both the processed and the raw sentence score below 0
    via get_sentiment.
    """
    for candidate in text.split('.'):
        for raw_word in candidate.split():
            matches = process_text(raw_word) == token
            # Both the processed and the raw sentence must score negative.
            if (matches
                    and get_sentiment(process_text(candidate)) < 0
                    and get_sentiment(candidate) < 0):
                return candidate
    return None
def run_basic_search(query):
    """Linear-scan search: count occurrences of *query* words in every
    .html file under ../input, then print ranked results.

    query -- a collection of (processed) search words; membership is
             tested with `word in query`.

    Side effects: reads files from disk and prints via get_results.
    """
    # Fix: `results` was never initialized in this function, so it either
    # raised NameError or accumulated stale entries in a module global
    # across calls. A fresh local dict makes each search independent.
    results = {}
    file_index = 0
    directory = '../input'
    for file in os.listdir(directory):
        if not file.endswith("html"):
            continue
        with open(os.path.join(directory, file), "r",
                  encoding='utf8', errors='ignore') as f:
            html = f.read()
        words = process_text(html)
        frequency = 0
        word_index = 0
        indexes = []
        for word in words:
            word = process_word(word)
            if word != "":
                if word in query:
                    frequency += 1
                    indexes.append(word_index)
                # NOTE(review): index counts only non-empty processed words;
                # verify print_snippet indexes the same sequence.
                word_index += 1
        if frequency > 0:
            results[file_index] = [frequency, file, indexes]
            file_index += 1
    # Rank documents by match frequency, highest first.
    sorted_results = sorted(results.values(), key=lambda kv: kv[0], reverse=True)
    get_results(sorted_results)
def get_results(sorted_res):
    """Print one tab-separated line per result: frequency, file name, snippet.

    sorted_res -- iterable of [frequency, filename, indexes] entries,
                  typically pre-sorted by frequency.
    """
    for entry in sorted_res:
        frequency, filename, indexes = entry[0], entry[1], entry[2]
        path = '../input/' + filename
        with open(path, "r", encoding='utf8', errors='ignore') as handle:
            markup = handle.read()
        tokens = process_text(markup)
        snippet = print_snippet(tokens, indexes)
        print("%d\t\t\t %s\t\t\t\t\t%s" % (frequency, filename, snippet))
def refresh(self):
    """Render the preprocessed current image (self.files[self.i]) onto the
    canvas. No-op when no files are loaded."""
    if not self.files:
        return
    # preprocess.process_text is assumed to return a grayscale cv2 image
    # for the given file -- TODO confirm.
    cv_image = cv2.cvtColor(preprocess.process_text(self.files[self.i]),
                            cv2.COLOR_GRAY2RGB)
    pil_image = Image.fromarray(cv_image)
    pil_image = pil_image.resize((300, 300), PIL.Image.ANTIALIAS)
    pil_image.save("/tmp/i.gif")
    # Fix: the PhotoImage must be referenced beyond this call. Tkinter does
    # not keep its own reference, so a local-only binding is garbage-collected
    # when the method returns, leaving the canvas blank.
    self.photo = Tkinter.PhotoImage(file="/tmp/i.gif")
    self.canvas.create_image(50, 10, image=self.photo, anchor=Tkinter.NW)
    # NOTE(review): Tk root widgets expose update()/update_idletasks(), not
    # refresh() -- verify self.root is a custom wrapper providing refresh().
    self.root.refresh()
    return
def refresh(self):
    """Render the preprocessed current image (self.files[self.i]) onto the
    canvas. No-op when no files are loaded.

    NOTE(review): this is a byte-for-byte duplicate (modulo whitespace) of
    an earlier refresh() in this file; if both live in one class, this later
    definition silently shadows the earlier one -- delete one of them.
    """
    if not self.files:
        return
    cv_image = cv2.cvtColor(preprocess.process_text(self.files[self.i]),
                            cv2.COLOR_GRAY2RGB)
    pil_image = Image.fromarray(cv_image)
    pil_image = pil_image.resize((300, 300), PIL.Image.ANTIALIAS)
    pil_image.save("/tmp/i.gif")
    # Fix: keep the PhotoImage referenced on self; Tkinter holds no reference
    # of its own, so a local-only binding is garbage-collected on return and
    # the canvas goes blank.
    self.photo = Tkinter.PhotoImage(file="/tmp/i.gif")
    self.canvas.create_image(50, 10, image=self.photo, anchor=Tkinter.NW)
    self.root.refresh()
    return
def run_sqlite_search(query):
    """Search the on-disk inverted index for *query* words and print ranked
    results (frequency, document name, snippet).

    query -- sequence of search words; bound as SQL parameters, so the
             IN-list is safely parameterized (only placeholder count is
             interpolated, never user input).

    Side effects: opens inverted-index.db, reads matched documents from
    ../input, prints one line per document.
    """
    conn = sqlite3.connect('inverted-index.db')
    # Fix: the connection was never closed; ensure release even if a
    # document file is missing or the query fails.
    try:
        c = conn.cursor()
        sql = ''' SELECT p.documentName AS docName, SUM(frequency) AS freq, GROUP_CONCAT(indexes) AS idxs FROM Posting p WHERE p.word IN ({seq}) GROUP BY p.documentName ORDER BY freq DESC;'''.format(seq=','.join(['?'] * len(query)))
        cursor = c.execute(sql, query)
        for row in cursor:
            path = '../input/' + row[0]
            with open(path, "r", encoding='utf8', errors='ignore') as f:
                html = f.read()
            words = process_text(html)
            # GROUP_CONCAT yields comma-joined index strings; restore ints.
            indexes = [int(s) for s in row[2].split(',')]
            snippet = print_snippet(words, indexes)
            print("%d\t\t %s\t\t\t%s" % (row[1], row[0], snippet))
    finally:
        conn.close()
def indexer():
    """Build the inverted index: for every .html file in ../input, record
    each processed word once (write_to_index_word) and generate one posting
    per unique word per document (generate_posting).

    Side effects: reads files from disk, writes to the index store, prints
    a progress message.
    """
    directory = '../input'
    print('Indexing...')
    for file in os.listdir(directory):
        if not file.endswith("html"):
            continue
        with open(os.path.join(directory, file), "r",
                  encoding='utf8', errors='ignore') as f:
            html = f.read()
        words = process_text(html)
        # Fix: was a list, making the membership test O(n) per word and the
        # per-document loop quadratic; a set keeps first-seen order of
        # posting generation identical while testing in O(1).
        seen = set()
        for word in words:
            word = process_word(word)
            if word != "":
                try:
                    write_to_index_word(word)
                except Exception:
                    # Best-effort insert: failures (presumably duplicate-key
                    # errors -- verify) are deliberately ignored.
                    # NOTE(review): this also hides real storage errors;
                    # consider logging before swallowing.
                    pass
                if word not in seen:
                    generate_posting(word, words, file)
                    seen.add(word)