def handler(postingList):
    """
    @describe: Load an on-disk postings-list entry into memory.
    @param postingList: one on-disk postings-list entry, containing the term,
                        document frequency, and the high/low postings lists.
    """
    # Initialization
    term = postingList['term']
    df = postingList['df']
    highPostingList = postingList['highPostingList']
    lowPostingList = postingList['lowPostingList']
    # Create the dictionary entry for the term
    hash[term] = Term()
    hash[term].df = df
    # Build the high-end postings list
    if len(highPostingList) > 0:
        link = Link()
        for posting in highPostingList:
            link.append(Posting(posting['docID'], posting['tf'],
                                posting['static_grade'], posting['time']))
        hash[term].high_posting_list = link.head
    # Build the low-end postings list
    if len(lowPostingList) > 0:
        link = Link()
        for posting in lowPostingList:
            link.append(Posting(posting['docID'], posting['tf'],
                                posting['static_grade'], posting['time']))
        hash[term].low_posting_list = link.head
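# handler() above relies on Term, Link, and Posting classes that are not shown
# in this snippet. A minimal sketch of what they would need to provide for the
# function to run; the field names are assumptions inferred from the call
# sites, not the original implementation:
class Posting:
    def __init__(self, docID, tf, static_grade, time, next=None):
        self.docID = docID
        self.tf = tf
        self.static_grade = static_grade
        self.time = time
        self.next = next            # next node in the postings chain


class Link:
    """Singly linked list; handler() only needs append() and .head."""

    def __init__(self):
        self.head = None
        self.tail = None

    def append(self, posting):
        if self.head is None:
            self.head = self.tail = posting
        else:
            self.tail.next = posting
            self.tail = posting


class Term:
    def __init__(self):
        self.df = 0
        self.high_posting_list = None
        self.low_posting_list = None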
def indexer(inverted_index):
    doc_id_file = open("doc_ids_urls.txt", "w")
    n = 0
    #/home/fghiasi/M1_project/searchEngine/examples/aiclub_ics_uci_edu
    #/home/fghiasi/inf141Proj2_last_update/inf141Proj2/Assignment3/DEV
    #dev_path = "C:\\Users\\NoobMaster69\\Desktop\\School\\CS 121 - Info Retrieval\\Assignments\\3-Search-Engine\\M1\\developer\\DEV"
    dev_path = "doh"
    documents = searching_all_files(dev_path)
    #documents = searching_all_files('/home/fghiasi/inf141Proj2_last_update/inf141Proj2/Assignment3/DEV')
    # documents = ['/home/fghiasi/M1_project/searchEngine/examples/aiclub_ics_uci_edu/8ef6d99d9f9264fc84514cdd2e680d35843785310331e1db4bbd06dd2b8eda9b.json']
    for document in documents:
        n += 1
        content = extract_json_content(document, 'content')
        url = extract_json_content(document, 'url')
        doc_id_url_str = "{} {}\n".format(n, url)
        doc_id_file.write(doc_id_url_str)
        text = tokenize(content)
        word_freq = computeWordFrequencies(text)
        for i, token in enumerate(text):
            if token not in inverted_index:
                inverted_index[token] = []
            inverted_index[token].append(Posting(n, word_freq[token], i))
            # print(token, " ", Posting(n, word_freq[token], i))
    doc_id_file.close()
    return inverted_index
def build_index():
    """
    Builds an index from a given folder.
    Normalizes the documents, tokenizes them, and creates the index.
    This function is called only when the user has provided a wrong index file,
    or when no index file was provided at all.
    """
    processors.append(NormalizerProcessor())
    # Fetch every document from the input folder
    print('[FETCHING]\tReading text files from \'{0}\'...'.format(folder))
    documents = Document.fetch(folder, True)
    # Normalize every loaded document
    print('[PROCESSING]\tNormalizing words from every document...')
    tokenize_all(documents)
    # Create the index by mapping every word
    # to all the documents that reference it
    print('[INDEXING]\tBuilding index from words...\n')
    posting_list = Posting.index(tokenized_documents)
    index = Index.build(posting_list)
    return index
def finalizeIndex(threadNum, N, duplicates):
    # Load a partial index: it contains a dict in the format {token: list of tuples},
    # where each tuple is (docid, wordfreq).
    # e.g. to access the word freq of the first posting, use partialIndex[token][0][1].
    with open(tempIndexPath + str(threadNum) + ".tmp", "rb") as file:
        with open(indexPath + str(threadNum) + ".p", "wb") as dump:
            partialIndex = pickle.load(file)
            postings = defaultdict(list)
            positions = dict()  # contains the seek position for each token in the index
            # Construct the posting list for each token in partialIndex
            for token in partialIndex:
                positions[token] = dump.tell()  # record the seek position for easy access later
                # Calculate tf-idf; build the posting list with docid, termfreq, tf-idf
                for tokentuple in partialIndex[token]:
                    if tokentuple[0] in duplicates:
                        continue
                    tf = math.log10(tokentuple[1]) + 1
                    df = len(partialIndex[token])
                    posting = Posting(tokentuple[0], tokentuple[1], tf * math.log10(N / df))
                    postings[token].append(posting)
                pickle.dump(postings[token], dump, pickle.HIGHEST_PROTOCOL)
            del partialIndex
            gc.collect()
    # Dump the positions dict
    with open(indexPath + str(threadNum) + ".positions", "wb") as dump:
        pickle.dump(positions, dump)
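# For reference, finalizeIndex() above scores each posting with a standard
# log-tf * idf weight: (1 + log10(tf)) * log10(N / df). A small standalone
# check with made-up numbers:
import math

tf, df, N = 3, 10, 1000
score = (1 + math.log10(tf)) * math.log10(N / df)   # ≈ 1.477 * 2.0 ≈ 2.95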
def search(dict_file: str, post_file: str, query_in: str, query_out: str):
    """
    Open all the files, then load the dictionary and the list of all document ids.
    Then, parse each query and create the respective AST, which can evaluate itself.
    Finally, print the query results to the output file.

    *params*
        - dict_file   The filename of the dictionary file
        - post_file   The filename of the postings file
        - query_in    The filename of the query file
        - query_out   The filename of the output file
    """
    with open(dict_file, mode="rb") as dictionary_file,\
            open(post_file, mode="rb") as postings_file,\
            open(query_in, encoding="utf8") as q_in,\
            open(query_out, mode="w", encoding="utf8") as q_out:
        dictionary = pickle.load(dictionary_file)
        posting = Posting(dictionary, postings_file)
        file_list = posting['__all__']

        for query in q_in:
            print(" ".join(map(str,
                               shunting_yard_AST(tokenize(query)).eval(posting, file_list).list)),
                  end='\n', file=q_out)
def reduce(self):
    for tok_doc in self.tok_docs:
        for token, frequency in tok_doc.tokens_freq.items():
            posting = Posting(tok_doc.document.doc_id,
                              freq=frequency,
                              occ=tok_doc.tokens_occ[token])
            # print(posting.__dict__)
            self.reduced_terms[token].append(posting.__dict__)
    return self.reduced_terms
def deserializeIndexItem(line):
    line = line.split(" ")
    postings = []
    for i in range(2, len(line)):
        if i % 2 == 0:
            postings.append(Posting(line[i], int(line[i + 1])))
    return line[0], line[1], postings
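# deserializeIndexItem() expects a whitespace-separated line of the form
# "<term> <df> <docid> <tf> <docid> <tf> ...". The format is inferred from the
# parsing loop above; the sample line below is made up:
term, df, postings = deserializeIndexItem("apple 2 17 3 42 1")
# term == "apple", df == "2",
# postings == [Posting("17", 3), Posting("42", 1)]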
def reduce_tok_doc(tok_doc, reduced_terms=None, documents=None):
    reduced_terms = defaultdict(list) if reduced_terms is None else reduced_terms
    for token, frequency in tok_doc.tokens_freq.items():
        posting = Posting(tok_doc.document.doc_id,
                          freq=frequency,
                          occ=tok_doc.tokens_occ[token])
        # print(posting.__dict__)
        reduced_terms[token].append(posting.__dict__)
    return reduced_terms
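# A hypothetical use of reduce_tok_doc(): tok_doc is assumed to expose
# .document.doc_id, .tokens_freq ({token: count}) and .tokens_occ
# ({token: [positions]}), which are exactly the attributes the function reads.
# The returned mapping goes from each token to a list of Posting.__dict__
# snapshots, one per document processed so far, e.g.:
#
#   reduced = reduce_tok_doc(tok_doc)
#   reduced = reduce_tok_doc(another_tok_doc, reduced_terms=reduced)
#   reduced["apple"]   # list of posting dicts, one per document containing "apple"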
def _expand_inverted_index(self, quoted_phrases, expansion):
    inv_idx = self.inverted_index
    for phrase in quoted_phrases:
        phrase = word_tokenize(phrase)
        # Create a new term representing the phrase and add it to the expansion
        new_term = "_".join(phrase)
        expansion[new_term] = list()
        # Get the doc_ids in the lines for each term in the phrase
        term_lines = [[posting for posting in inv_idx[word]] for word in phrase]
        # Select the documents which contain all the phrase's words
        filtered_postings = []
        intersection = self._intersection(term_lines, filtered_postings)
        while len(intersection) > 0:
            # Occurrences is a list of lists (matrix), where each line i contains
            # the occurrences of the ith word of the phrase in the document
            # intersection[0].
            occurrences = [postings_list[0].positions
                           for postings_list in filtered_postings]
            # Subtract from all elements in each line of occurrences the line
            # index (line 0 is ignored).
            for i in range(1, len(occurrences)):
                occurrences[i] = [(occurrence - i) for occurrence in occurrences[i]]
            # The positions in which there are intersections are occurrences
            # of the whole phrase in the document.
            phrase_occurrences = self._intersection(occurrences)
            # If there are occurrences, create a Posting for the document in
            # expansion[new_term]
            if len(phrase_occurrences) > 0:
                new_posting = Posting(intersection[0])
                new_posting.positions = deepcopy(phrase_occurrences)
                expansion[new_term].append(new_posting)
            # Delete the heads of intersection and filtered_posting's lines
            for postings_list in filtered_postings:
                del postings_list[0]
            del intersection[0]
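# Standalone illustration of the positional-intersection idea used in
# _expand_inverted_index() (a sketch of the technique, not the class's own
# _intersection() helper): the phrase "new york" starts at position p whenever
# "new" occurs at p and "york" occurs at p + 1, so shifting the second
# position list left by its word index and intersecting yields the phrase's
# start positions.
new_positions = [3, 17, 40]               # made-up positions of "new"
york_positions = [4, 25, 41]              # made-up positions of "york"
shifted = [p - 1 for p in york_positions]
phrase_starts = sorted(set(new_positions) & set(shifted))   # [3, 40]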
def readQuery(city, queryEmail):
    '''
    Queries a city, and parses the result into a list of Posting objects.

    Args:
        city: A string representation of the city to query
        queryEmail: a boolean, whether to search for the reply_to e-mail
            associated with each post. This (unfortunately) is quite slow.

    Returns:
        postingsObject: A Postings object wrapping the list of postings.
    '''
    if city == 'test':
        # A stored xml file
        city = 'seattle'
        f = open('files/seattle.xml')
        minneapolisString = f.read()
        f.close()
        dom = parseString(minneapolisString)
    else:
        dom = __parseQuery(city)
    domItems = dom.getElementsByTagName('item')
    postings = []
    for i, item in enumerate(domItems):
        title = item.getElementsByTagName('title')[0].childNodes[0].wholeText
        postingDate = item.getElementsByTagName('dc:date')[0].childNodes[0].wholeText
        permalink = item.getElementsByTagName('link')[0].childNodes[0].wholeText
        # Get the reply-to field.
        # Currently commented out, due to speed issues.
        # TODO: Change this to work
        if queryEmail:
            print i
            permalinkPage = urllib.urlopen(permalink)
            permalinkHTML = permalinkPage.read()
            permalinkPage.close()
            match = re.search('(mailto:)([^?]+)', permalinkHTML)
            if match:
                mailToLink = match.group(2)
            else:
                mailToLink = "emailnotfound"
        else:
            mailToLink = "emailnotfound"
        shortDescription = item.getElementsByTagName('description')[0].childNodes[0].wholeText
        postings.append(Posting(city, title, permalink, shortDescription,
                                postingDate, mailToLink))
    postingsObject = Postings(postings)
    return postingsObject
def handler(weibo):
    """
    Process a single weibo (microblog post).
    """
    tokens = jieba.cut_for_search(weibo['mt'])
    # Static score of the weibo: a*log(repost count) + b*log(comment count)
    static_grade = math.log(weibo['rc'] + 1) + math.log(weibo['cc'] + 1)
    # Normalize the generated tokens
    self.__normalization(weibo['_id'], tokens, static_grade, weibo['ct'])
    # Add nicknames to the nickname postings list
    nicknames = weibo['nc']
    nicknames.append(weibo['sn'])
    for nickname in nicknames:
        self.__addNickNamePosting(
            nickname, Posting(weibo['_id'], 1, static_grade, weibo['ct']))
def get_postings(tokenized_word, file_ptr):
    # file_ptr should be the inverted_index file
    postings = []
    for line in file_ptr:                     # e.g. "workshop\n"
        line_txt = line.strip("\n")           # "workshop"
        if line_txt == tokenized_word:        # we found the token
            while line_txt != '$':
                line_txt = file_ptr.readline().strip("\n")
                if line_txt != "$":
                    p_values = line_txt.strip().split(',')  # "1,22,1035" -> ['1', '22', '1035']
                    assert 3 == len(p_values)
                    postings.append(Posting(p_values[0], p_values[1], p_values[2]))
            file_ptr.seek(0)
            return postings
    file_ptr.seek(0)
    return postings  # []
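# get_postings() above assumes a plain-text index file in which a token line
# is followed by one comma-separated posting line per document and a "$"
# terminator. The exact field meanings are not shown in the snippet; the
# layout below is inferred from the parsing loop (three values per posting):
#
#   workshop
#   1,22,1035
#   7,3,88
#   $
#   zebra
#   2,1,40
#   $
#
# with open("inverted_index.txt") as index_file:        # hypothetical filename
#     postings = get_postings("workshop", index_file)   # -> two Posting objects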
def add_tokens_of_doc(self, content, docId):
    """
    Processes content of this document and updates indices and postings lists.

    Params:
        - content: tokens in the document content
        - docId: document ID
    Returns:
        - normalised_tf: normalised length of document
    """
    tf_freq = {}   # Term frequencies of this document
    position = 0   # position of token in content
    for token in content:
        if token not in self.terms:
            self.terms[token] = {}
            self.terms[token]["offset"] = None
            self.terms[token]["size"] = None
            self.terms[token]["docFreq"] = 1
            self.terms[token]["posting"] = Posting()
        else:
            if token not in tf_freq:
                self.terms[token]["docFreq"] += 1
        self.terms[token]["posting"].add_doc_to_postings(docId)
        self.terms[token]["posting"].add_pos_to_doc(docId, position)
        position += 1
        if token in tf_freq:
            tf_freq[token] += 1
        else:
            tf_freq[token] = 1
    normalised_tf = 0
    for token in tf_freq.keys():
        freq = tf_freq[token]
        normalised_tf += pow((1 + util.log10(freq)), 2)
    return sqrt(normalised_tf)
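# add_tokens_of_doc() returns the document's vector length under logarithmic
# tf weighting: sqrt(sum over terms of (1 + log10(tf))^2). A quick standalone
# check with made-up term frequencies:
import math

tfs = [1, 2, 10]
doc_length = math.sqrt(sum((1 + math.log10(tf)) ** 2 for tf in tfs))
# sqrt(1.0 + 1.30103**2 + 2.0**2) ≈ 2.59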
def create_tokens(self, root, path, url):
    '''
    Tokenizes the given file, stems and counts its words, and adds a Posting
    for each kept word to self.tokens.
    '''
    # http://nltk.org
    # https://pythonspot.com/tokenizing-words-and-sentences-with-nltk/ - has info on stop words and stemming
    with open(root + path, 'r') as myfile:
        soup = BeautifulSoup(myfile, 'lxml')
        # Kill all script and style elements
        # Source: https://stackoverflow.com/questions/22799990/beatifulsoup4-get-text-still-has-javascript
        for script in soup(["script", "style"]):
            script.decompose()  # rip it out
        # Get the title
        try:
            title = self.cleanhtml(str(soup.find_all("title")[0]))
            print("PRINTING TITLE")
            print(title)
        except IndexError:
            title = ""
        raw_text = soup.get_text()
        raw_tokens = nltk.word_tokenize(raw_text)
        print("Compute RAW TOKENS")
        filtered_tokens = self.remove_stop_words(raw_tokens)
        print("Compute filtered_tokens")
        # Stemming of all the tokens gathered
        words_counter = self.create_stemmed_word_count_dictionary(filtered_tokens)
        print("Compute word_dict")
        for word, count in words_counter.items():
            if self.is_ascii(word) and (not self.is_number(word)):
                if len(word) < 182 and len(word) > 2:  # Can't have large strings for db keys
                    posting = Posting(path, url, title)
                    posting.set_frequency(count)
                    posting.set_length_of_doc(len(raw_tokens))
                    if word not in self.tokens.keys():
                        self.tokens[word] = [posting]
                    else:
                        self.tokens[word].append(posting)
        if path == "www.ics.uci.edu/faculty":
            print(words_counter)
def __normalization(self, docID, tokens, static_grade, time):
    """
    @describe: Linguistically preprocess and normalize the tokens, then add the
               normalized terms to the inverted index.
    @param docID: document ID
    @param tokens: all tokens in the document
    @param static_grade: static score of the document
    @param time: time the weibo was posted
    """
    terms = {}  # temporary dictionary holding every term in this document and its tf
    for token in tokens:
        if token not in '‘~!@#$%^&*()_+{}|:"<>?`-=[]\;\',./ !¥……()——:“”《》?·【】、;‘’,。丶~→......的了是转发回复赞谢谢thttpV':
            # filter out special characters and stop characters
            if token not in terms:  # first occurrence of this token in the document
                terms[token] = 1
            else:
                terms[token] += 1
    # Add the normalized terms to the postings lists
    for term in terms:
        # terms[term] holds the term frequency
        self.__addPosting(term, Posting(docID, terms[term], static_grade, time))
def indexDocument(url, content):
    indices = defaultdict(lambda: [])
    # Build the soup of the current html file
    soup = BeautifulSoup(content, "html.parser")
    # Tokenize the html and compute frequencies of the tokens
    words = tk.tokenize(soup.get_text())
    frequencies = tk.computeWordFrequencies(words)
    # Get the text that is bolded
    boldedText = getTextInTags(soup, "b")
    boldedText += getTextInTags(soup, "strong")
    # Get the text in the headers
    headerText = getTextInTags(soup, "header")
    for i in range(1, 7):
        headerText += " "
        headerText += getTextInTags(soup, f"h{i}")
    # Get the text in the title
    titleText = getTextInTags(soup, "title")
    # Loop through all token-frequencies
    for token, frequency in frequencies.items():
        score = frequency
        # Add to the score for its occurrences in important html tags
        score += boldedText.count(token) * 5
        score += headerText.count(token) * 10
        score += titleText.count(token) * 15
        # Add the posting to the current index
        indices[token].append(Posting(url, score))
    return indices
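# The scoring in indexDocument() weights a token's occurrences by where they
# appear: score = tf + 5 * bold_count + 10 * header_count + 15 * title_count.
# For example, a token appearing 3 times in the page text, once inside <b>
# and once inside <title>, would be stored as Posting(url, 3 + 5 + 15), i.e.
# a score of 23 (illustrative numbers only).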
def init_inverted_index():
    idx = 1
    doc_list = []
    inverted_index = InvertedIndex()
    while True:
        try:
            document = deserialize(str(idx) + ".dbf")
            doc_list.append(document)
            idx += 1
        except IOError:
            break
    total = len(doc_list)
    inverted_index.n = total
    for document in doc_list:
        lower_doc = str(document).lower()
        tokens = nltk.word_tokenize(lower_doc)
        for pos in range(0, len(tokens)):
            tk = tokens[pos]
            if tk not in inverted_index:
                inverted_index[tk] = list()
            term_data = inverted_index[tk]
            if document.id not in map(lambda p: p.doc_id, term_data):
                term_data.append(Posting(document.id))
            for posting in term_data:
                if posting.doc_id == document.id:
                    posting.positions.append(pos)
                    break
        print "{0:.2f}% completed...".format(float(document.id) / total * 100)
    serialize(inverted_index, "inverted_index.idx")
dialog_cnpj_padrao.txt_cnpj_padrao.textChanged.connect(on_cnpj_padrao_alterado)
dialog_cnpj_padrao.txt_cnpj_padrao.keyPressEvent = txt_cnpj_padrao_keyPressEvent

ui.btn_cnpj.clicked.connect(mostra_dialogCnpj)
dialog_cnpj_padrao.btn_ok.clicked.connect(confirma_cnpj_padrao)
dialog_cnpj_padrao.btn_cancel.clicked.connect(Dialog_Cnpj.reject)

ui.txt_num_notas.setValidator(QtGui.QIntValidator(constant.MIN_NOTAS, constant.MAX_NOTAS))
ui.txt_num_notas.setText(str(constant.DEFAULT_NUMERO_NOTAS))
ui.txt_cnpj_estab.setText(constant.EMPTY_STR)

servico_posting = Posting()
ui.btn_postar.clicked.connect(on_abre_postar)

m = Messages()
lista_notas = []
lista_cnpj = []

mes_tipo = 1  # current month
mes_sel = define_mes_padrao()
mes_sel_int = define_mes_padrao_int()

if constant.INICIA_DB_INICIO:
    init_db()  # create the lists (notas and empresas)
        while posting is not None:
            if i < self.topK:
                docIDs.append(posting.docID)
                i = i + 1
            posting = posting.next
        return i, docIDs

    def normalization(self, tfsList):
        # log and normalization
        length = 0
        for i in xrange(0, len(tfsList)):
            length = length + (1 + math.log10(tfsList[i])) * (1 + math.log10(tfsList[i]))
        length = math.sqrt(length)
        for i in xrange(0, len(tfsList)):
            tfsList[i] = (1 + math.log10(tfsList[i])) / length


if __name__ == "__main__":
    from term import Term
    from posting import Posting

    terms = [Term(100), Term(10), Term(1)]
    posting4 = Posting('doc4', [1, 1, 2], 100, 0.001, None)
    posting3 = Posting('doc3', [2, 1, 2], 200, 0.003, posting4)
    posting2 = Posting('doc2', [2, 2, 2], 110, 0.004, posting3)
    posting = Posting('doc1', [3, 3, 3], 900, 0.005, posting2)
    temp = Ranker()
    docid = temp.rankByRelevancy(terms, posting)
    for doc in docid:
        print doc  # just for test
def processDirectory(dirpath, filenames, curr_docid):
    # PorterStemmer for tokenizing
    p = PorterStemmer()
    # index will be the inverted index. It will be offloaded at several points in the program.
    index = defaultdict(list)
    # freq is a dictionary that stores the frequency (tf) of each term. Cleared every time a file is finished parsing.
    freq = defaultdict(int)
    # docid map to be returned
    did = {}
    for f in filenames:
        file = open(dirpath + '\\' + f, 'r')
        l = file.readline()
        # Tries to load json, with an except statement triggered if there is a ValueError. Should never be triggered.
        try:
            json_dict = json.loads(l)
        except ValueError:
            print('Loading file ' + str(dirpath) + str(f) + ' has failed')
        soup = BeautifulSoup(json_dict['content'], features='lxml')
        # Updates doc_id map with new URL
        did[curr_docid] = tuple([json_dict['url'], len(soup.get_text())])
        # Suppress BeautifulSoup warnings about URLs in text.
        warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
        # Parsing section. Essentially checks to make sure that a stem is greater than 2 characters, but not completely composed of numbers.
        # Temporarily stores the frequency of each word as the tfidf. Updated at the end of indexing.
        for w in nltk.tokenize.word_tokenize(soup.get_text()):
            freq[p.stem(w)] += 1
        for i, j in freq.items():
            if re.match("^[a-zA-Z0-9][a-zA-Z0-9]+$", i) and (not re.match("^[0-9][0-9]+$", i)):
                index[i].append(Posting(curr_docid, float(j), 1, 0))
        # Special weighing for bold words.
        for w in soup.find_all('b'):
            for t in nltk.tokenize.word_tokenize(w.text):
                if index.get(t) is not None:
                    index[t][-1].setpriority(2)
        # Special weighing for headers.
        for w in soup.find_all(re.compile('^h[1-6]$')):
            for t in nltk.tokenize.word_tokenize(w.text):
                if index.get(t) is not None:
                    index[t][-1].setpriority(3)
        # Special weighing for titles.
        for w in soup.find_all('title'):
            for t in nltk.tokenize.word_tokenize(w.text):
                if index.get(t) is not None:
                    index[t][-1].setpriority(4)
        curr_docid += 1
        file.close()
        freq.clear()
    dump(index, os.path.basename(dirpath))
    return did
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from posting import Posting

try:
    import cPickle as pickle
except ImportError:
    import pickle

stemmer = PorterStemmer()
stopwords = {stemmer.stem(i) for i in stopwords.words("english")}

print({stemmer.stem(w.lower())
       for w in {word
                 for sent in sent_tokenize("government's\nbusiness")  # (re.sub("[-']", " ", file.read()))
                 for word in word_tokenize(sent)}})

with open("dictionary.txt", mode="rb") as dictionary_file,\
        open("postings.txt", mode="rb") as postings_file:
    dictionary = pickle.load(dictionary_file)
    posting = Posting(dictionary, postings_file)

    counter_numbers = 0
    counter_number_disksize = 0
    counter_stopwords = 0
    counter_disksize = 0
    counter_possessive = 0
    counter_dash = 0
    for k in dictionary:
        if re.match(r'[\d.,/]+', k):
            # Numbers and dates
            counter_numbers += 1
            counter_number_disksize += dictionary[k].size
        elif re.match(r'.*\'.*', k):
            # Containing an apostrophe (possessive)
            # print(k, posting[k].list)
            counter_possessive += 1
        elif re.match(r'.*-.*', k):
            # Containing a dash
            # print(k, posting[k].list)
            counter_dash += 1
def scrape():
    '''
    Driving function to web scrape

    Parameters:
        None
    Returns:
        None
    '''
    # Loads urls
    print("Running Auto Job Application Tracker")
    print("Attempting to open insert.txt")
    try:
        new_applications = open("Config/Insert.txt", "r")
        urls = new_applications.read().splitlines()
        new_applications.close()
        total = len(urls)
        if total == 0:
            print("Insert.txt is empty, please make sure you save the file before running this script.")
            print("Exiting script...")
            sys.exit(1)
        else:
            print("Found {} Urls, proceeding to scrape data".format(total))
            for num, job_url in enumerate(urls, 1):
                print("--------------------------------")
                print("Parsing application {}/{}".format(num, total))
                try:
                    # Open connection to job url
                    uClient = uReq(job_url)
                    # Html parsing
                    page_soup = soup(uClient.read(), "html.parser")
                    uClient.close()
                except:
                    print("Incorrect url, please check: {}".format(job_url))
                    empty = Posting(url=job_url)
                    empty.set_empty()
                    empty.append()
                else:
                    # Creates the appropriate posting object
                    if "indeed" in job_url:
                        ad = Posting("INDEED", job_url, page_soup,
                                     constants.INDEED_TITLE_TAG, constants.INDEED_TITLE_CLASS,
                                     constants.INDEED_COMPANY_TAG, constants.INDEED_COMPANY_CLASS,
                                     constants.INDEED_LOC_TAG, constants.INDEED_LOC_CLASS)
                    elif "linkedin" in job_url:
                        ad = Posting("LINKEDIN", job_url, page_soup,
                                     constants.LINKEDIN_TITLE_TAG, constants.LINKEDIN_TITLE_CLASS,
                                     constants.LINKEDIN_COMPANY_TAG, constants.LINKEDIN_COMPANY_CLASS,
                                     constants.LINKEDIN_LOC_TAG, constants.LINKEDIN_LOC_CLASS)
                    elif "workopolis" in job_url:
                        ad = Posting("WORKOPOLIS", job_url, page_soup,
                                     constants.WORKOPOLIS_TITLE_TAG, constants.WORKOPOLIS_TITLE_CLASS,
                                     constants.WORKOPOLIS_COMPANY_TAG, constants.WORKOPOLIS_COMPANY_CLASS,
                                     constants.WORKOPOLIS_LOC_TAG, constants.WORKOPOLIS_LOC_CLASS)
                    elif "glassdoor" in job_url:
                        ad = Posting("GLASSDOOR", job_url, page_soup,
                                     constants.GLASSDOOR_TITLE_TAG, constants.GLASSDOOR_TITLE_CLASS,
                                     constants.GLASSDOOR_COMPANY_TAG, constants.GLASSDOOR_COMPANY_CLASS,
                                     constants.GLASSDOOR_LOC_TAG, constants.GLASSDOOR_LOC_CLASS)
                    else:
                        print("URL not supported")
                        continue
                    ad.find_data()
                    ad.find_skills()
                    ad.append()
    except FileNotFoundError:
        print("File does not exist, creating that file now...")
        print("Please copy your URLs into 'insert.txt'")
        new_applications = open("Config/Insert.txt", "w")
        print("Exiting script...")
        sys.exit(1)