def search_multiline_string_for_pattern(self, s, p, start_line_indices,
                                        end_line_indices):
    """Search a given string, possibly containing multiple newlines, for a
       pattern, and return a list of SearchResult instances (without filename)
    """
    lines_before = []
    lines_after = []
    search_results = []
    matches = p.finditer(s)
    for m in matches:
        m_line_start_index = 0
        m_line_end_index = len(s) - 1
        before_start_indices = [x for x in start_line_indices if x <= m.start()]
        before_line_count = 0
        if before_start_indices:
            m_line_start_index = before_start_indices.pop()
            before_line_count = len(before_start_indices)
            # keep only the last `linesbefore` line-start indices
            before_start_indices = \
                before_start_indices[self.settings.linesbefore * -1:]
        m_line_end_index = \
            end_line_indices[start_line_indices.index(m_line_start_index)]
        line = s[m_line_start_index:m_line_end_index]
        if self.settings.linesbefore and before_line_count:
            lines_before = self.get_lines_before(s,
                                                 before_start_indices,
                                                 start_line_indices,
                                                 end_line_indices)
        if self.do_lines_after():
            after_start_indices = [x for x in start_line_indices
                                   if x > m.start()]
            after_start_indices = after_start_indices[:self.settings.linesafter]
            lines_after = self.get_lines_after(s,
                                               after_start_indices,
                                               start_line_indices,
                                               end_line_indices)
            if after_start_indices and not lines_after:
                continue
        if (lines_before and not self.lines_before_match(lines_before)) or \
                (lines_after and not self.lines_after_match(lines_after)):
            continue
        match_start_index = m.start() - m_line_start_index + 1
        match_end_index = m.end() - m_line_start_index + 1
        search_result = SearchResult(pattern=p.pattern,
                                     linenum=before_line_count + 1,
                                     line=line,
                                     match_start_index=match_start_index,
                                     match_end_index=match_end_index,
                                     lines_before=list(lines_before),
                                     lines_after=list(lines_after))
        search_results.append(search_result)
        if self.settings.firstmatch:
            break
    return search_results

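# The start/end line index lists consumed above are assumed to be precomputed
# offsets into the searchable string. A minimal sketch of a helper that could
# produce them (hypothetical; the original helper is not shown), matching the
# function's convention that the last line ends at len(s) - 1:
def get_line_indices(s):
    """Return the start offset of every line in s and the offset of the
    character that ends it (its newline, or the last character of s)."""
    start_line_indices = [0]
    end_line_indices = []
    for i, c in enumerate(s):
        if c == '\n':
            end_line_indices.append(i)
            start_line_indices.append(i + 1)
    end_line_indices.append(len(s) - 1)
    return start_line_indices, end_line_indices
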
def search_binary_file_obj(self, sf, fo):
    """Search a binary file object"""
    contents = fo.read()
    for s in self.settings.searchpatterns:
        if s.search(contents):
            search_result = SearchResult(pattern=s.pattern,
                                         # TODO: switch to SearchFile instance
                                         filename=str(sf),
                                         linenum=0,
                                         line=None)
            self.add_search_result(search_result)

def rerank(self, queryKps, queryDescs, searchResult, numResults=10):
    # start the search timer and initialize the re-ranked results dictionary
    startTime = datetime.datetime.now()
    reranked = {}

    # grab the image indexes from the initial search results and sort them in
    # ascending order so the feature indexes can be grabbed from HDF5
    resultIdxs = np.array([r[-1] for r in searchResult.results])
    resultIdxs.sort()

    # loop over the starting and ending indexes into the features dataset for
    # each image
    for (i, (start, end)) in zip(resultIdxs,
                                 self.featuresDB["index"][resultIdxs, ...]):
        # grab the rows from the features dataset and break the rows into
        # keypoints and feature vectors
        rows = self.featuresDB["features"][start:end]
        (kps, descs) = (rows[:, :2], rows[:, 2:])

        # determine matched inlier keypoints and grab the indexes of the
        # matched keypoints into the bag-of-visual-words
        bovwIdxs = self.match(queryKps, queryDescs.astype("float32"), kps,
                              descs.astype("float32"))

        # provided that at least some keypoints were matched, the final score
        # for the spatial verification is the sum of the idf values for the
        # inlier words
        if bovwIdxs is not None:
            score = self.idf[bovwIdxs].sum()
            reranked[i] = score

    # if no spatially verified matches were found, return the initial search
    # result object
    if len(reranked) == 0:
        return searchResult

    # otherwise, sort the spatially verified results
    results = sorted([(v, self.featuresDB["image_ids"][k], k)
                      for (k, v) in reranked.items()], reverse=True)

    # loop over the initial search results
    for (score, imageID, imageIdx) in searchResult.results:
        # only add the initial result to the list of results if the image has
        # NOT been spatially verified
        if imageIdx not in reranked:
            results.append((score, imageID, imageIdx))

    # return the spatially verified and re-ranked results
    return SearchResult(results[:numResults],
                        (datetime.datetime.now() - startTime).total_seconds())

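# In the CBIR snippets above and below, SearchResult is used as a simple
# (results, search_time) pair. A namedtuple along these lines would satisfy
# both search() and rerank(); this is an assumption, since the actual
# definition is not shown in this section:
from collections import namedtuple

SearchResult = namedtuple("SearchResult", ["results", "search_time"])

# Hypothetical usage, chaining spatial re-ranking after the initial search:
#   initial = searcher.search(queryHist, numResults=20)
#   verified = searcher.rerank(queryKps, queryDescs, initial, numResults=10)
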
def execute(self, target=os.getcwd(), thread_count=multiprocessing.cpu_count()):
    """
    Executes the search query.

    :param target: If a string, it is the target directory to recurse. If a
        list, it is either a list of strings representing the files to
        process or TargetFile instances.
    :param thread_count: The number of threads to use when processing the
        query. This defaults to the number of available processing cores on
        the executing system. If None, the query is processed in this thread.
        This uses the multiprocessing module internally, so you actually get
        concurrent processing.
    :return: A list of SearchResult objects.
    :rtype: list
    """
    # Collect a file list first (so we can use multiprocessing later)
    file_list = target if type(target) is list else []

    if type(target) is not list:
        for directory, directory_names, filenames in os.walk(target):
            for filename in filenames:
                relative_path = os.path.join(directory, filename)
                if os.path.isfile(relative_path):
                    file_list.append(parse.TargetFile(relative_path))
    else:
        file_list = [
            parse.TargetFile(file_entry) if type(file_entry) is str else file_entry
            for file_entry in file_list
        ]

    matched_files = []

    if thread_count is None:
        # Process the file list in this thread
        for current_file in file_list:
            result = self._execute(current_file, self.compiled_data)
            if result is True:
                matched_files.append(SearchResult(path=current_file.path))
    else:
        thread_pool = multiprocessing.Pool(thread_count)
        passed_data = [(self, current_file) for current_file in file_list]
        result_files = thread_pool.imap(_threaded_execute, passed_data)
        for result in result_files:
            matched_files += result

    return matched_files

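# _threaded_execute is not shown in this section. multiprocessing.Pool can
# only pickle module-level functions, which is presumably why each
# (query, file) pair is packed into a single tuple for imap. A minimal sketch
# consistent with how its return value is consumed above
# (matched_files += result expects a list):
def _threaded_execute(data):
    query, current_file = data
    if query._execute(current_file, query.compiled_data) is True:
        return [SearchResult(path=current_file.path)]
    return []
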
def search(self, queryHist, numResults=10, maxCandidates=200):
    # start the timer to track how long the search took
    startTime = datetime.datetime.now()

    # determine the candidates and sort them in ascending order so they can
    # be read from the bag-of-visual-words database
    candidateIdxs = self.buildCandidates(queryHist, maxCandidates=maxCandidates)
    candidateIdxs.sort()

    # grab the histograms for the candidates from the bag-of-visual-words
    # database and initialize the results dictionary
    hists = self.bovwDB["bovw"][candidateIdxs]
    queryHist = queryHist.toarray()
    results = {}

    # if the inverse document frequency array has been supplied, multiply the
    # query by it
    if self.idf is not None:
        queryHist *= self.idf

    # loop over the histograms
    for (candidate, hist) in zip(candidateIdxs, hists):
        # if the inverse document frequency array has been supplied, multiply
        # the histogram by it
        if self.idf is not None:
            hist *= self.idf

        # compute the distance between the histograms and update the results
        # dictionary
        d = self.distanceMetric(hist, queryHist)
        results[candidate] = d

    # sort the results, this time replacing the image indexes with the image
    # IDs themselves
    results = sorted([(v, self.featuresDB["image_ids"][k], k)
                      for (k, v) in results.items()])
    results = results[:numResults]

    # return the search results
    return SearchResult(results,
                        (datetime.datetime.now() - startTime).total_seconds())

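# distanceMetric is left pluggable above. A common choice for comparing
# bag-of-visual-words histograms is the chi-squared distance; a sketch,
# not necessarily the metric the original code uses:
import numpy as np

def chi2_distance(histA, histB, eps=1e-10):
    # smaller distances indicate more similar histograms; eps guards
    # against division by zero for empty bins
    return 0.5 * np.sum(((histA - histB) ** 2) / (histA + histB + eps))
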
def rerank(self, queryKps, queryDescs, searchResult, numResults=10):
    startTime = datetime.datetime.now()
    reranked = {}

    resultIdxs = np.array([r[-1] for r in searchResult.results])
    resultIdxs.sort()

    for (i, (start, end)) in zip(resultIdxs,
                                 self.featuresDB["index"][resultIdxs, ...]):
        # grab the features from featuresDB; each row is
        # [keypointX, keypointY, description_1, description_2, ..., description_N]
        rows = self.featuresDB["features"][start:end]
        (kps, descs) = (rows[:, :2], rows[:, 2:])

        # determine matched inlier keypoints and grab the indexes of the
        # matched keypoints
        bovwIdxs = self.match(queryKps, queryDescs.astype("float32"), kps,
                              descs.astype("float32"))

        # score the image by summing the idf values of the matched words
        if bovwIdxs is not None:
            score = self.idf[bovwIdxs].sum()
            reranked[i] = score

    if len(reranked) == 0:
        return searchResult

    results = sorted([(v, self.featuresDB["image_ids"][k], k)
                      for (k, v) in reranked.items()], reverse=True)

    for (score, imageID, imageIdx) in searchResult.results:
        if imageIdx not in reranked:
            results.append((score, imageID, imageIdx))

    return SearchResult(results[:numResults],
                        (datetime.datetime.now() - startTime).total_seconds())

def iexecute(self, path=os.getcwd(), thread_count=8):
    """
    Executes the search query and returns an iterator to results from the
    query. This allows you to execute queries asynchronously and retrieve
    results as they become available.

    :param path: If a string, it is the target directory to recurse. If a
        list, it is either a list of strings representing the files to
        process or TargetFile instances.
    :param thread_count: The number of threads to use when processing the
        query (default 8). If None, the query is processed in this thread.
        This uses the multiprocessing module internally, so you actually get
        concurrent processing.
    :return: A generator to the results of the query.
    :rtype: generator
    """
    # Collect a file list first (so we can use multiprocessing later)
    file_list = path if type(path) is list else []

    if type(path) is not list:
        for directory, directory_names, filenames in os.walk(path):
            for filename in filenames:
                relative_path = os.path.join(directory, filename)
                if os.path.isfile(relative_path):
                    file_list.append(parse.TargetFile(relative_path))
    else:
        file_list = [
            parse.TargetFile(file_entry) if type(file_entry) is str else file_entry
            for file_entry in file_list
        ]

    if thread_count is None:
        for current_file in file_list:
            result = self._execute(current_file, self.compiled_data)
            if result is True:
                yield SearchResult(path=current_file.path)
    else:
        thread_pool = multiprocessing.Pool(thread_count)
        passed_data = [(self, current_file) for current_file in file_list]
        result_files = thread_pool.imap(_threaded_execute, passed_data)
        for result_list in result_files:
            for result in result_list:
                yield result

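# Hypothetical usage: because iexecute is a generator, results stream back
# as worker processes finish rather than only after the whole directory walk
# completes. (Query is an assumed name for the enclosing class.)
query = Query("...search expression...")  # assumed constructor
for result in query.iexecute("/var/log", thread_count=4):
    print(result.path)
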
def search(self, queryHist, numResults=10, maxCandidates=200):
    # get the start time
    startTime = datetime.datetime.now()

    # pick the candidate image IDs based on queryHist and maxCandidates
    candidateIdxs = self.buildCandidates(queryHist, maxCandidates)

    # sort in ascending order, since out-of-order reads from HDF5 are slow
    candidateIdxs.sort()

    # grab the feature histograms for the candidate IDs from the BOVW database
    hists = self.bovwDB["bovw"][candidateIdxs]
    queryHist = queryHist.toarray()
    results = {}

    if self.idf is not None:
        # weight queryHist by tf-idf
        queryHist *= self.idf

    # loop over the histograms
    for (candidate, hist) in zip(candidateIdxs, hists):
        # weight hist by tf-idf
        if self.idf is not None:
            hist *= self.idf

        # calculate the chi-squared distance
        d = self.distanceMetric(hist, queryHist)
        results[candidate] = d

    # sort by distance, replacing the image indexes with the image IDs
    results = sorted([(v, self.featuresDB["image_ids"][k], k)
                      for (k, v) in results.items()])
    results = results[:numResults]

    return SearchResult(results,
                        (datetime.datetime.now() - startTime).total_seconds())

headers = {"Content-type": "application/json", "Cookie": cookie}
conn = httplib.HTTPSConnection("www.notion.so")
conn.request("POST", "/api/v3/search", buildnotionsearchquerydata(), headers)
response = conn.getresponse()
data = response.read()
# strip the highlight markers wrapped around matched text in the response
data = data.replace("<gzkNfoUU>", "")
data = data.replace("</gzkNfoUU>", "")
conn.close()

# Extract search results from notion response
searchResultList = []
searchResults = Payload(data)
for x in searchResults.results:
    searchResultObject = SearchResult(x.get('id'))
    block_value = searchResults.recordMap.get('block').get(
        searchResultObject.id).get('value')
    if "properties" in block_value:
        searchResultObject.title = \
            block_value.get('properties').get('title')[0][0]
    else:
        searchResultObject.title = x.get('highlight').get('text')
    if "pathText" in x.get('highlight'):
        searchResultObject.subtitle = x.get('highlight').get('pathText')
    else:
        searchResultObject.subtitle = " "
    if "format" in block_value:
        if "page_icon" in block_value.get('format'):

# data = json.load(json_file)
# data = data['data']
ITEM_TYPE_ICONNAME_MAPPING = {
    2: 'doc.png',
    3: 'sheet.png',
}
TITLE_REPLACE_PATTERN = "<em>(.*)</em>"

# Extract search results from lark response
searchResultList = []
objs = data['entities']['objs']
for k in data['tokens']:
    obj = objs[k]
    searchResultObject = SearchResult(obj['token'])

    # strip the <em> highlight tags from the title
    title = obj['title']
    title = re.sub(TITLE_REPLACE_PATTERN, r"\1", title)
    searchResultObject.title = title

    viewed_time = datetime.fromtimestamp(obj['open_time']).strftime('%H:%M')
    updated_time = datetime.fromtimestamp(obj['edit_time']).isoformat()
    searchResultObject.subtitle = ("Author: " + obj['author'] +
                                   ", You viewed " + viewed_time +
                                   ", " + obj['edit_name'] +
                                   " updated " + updated_time)
    searchResultObject.link = obj['url']
    if obj['type'] in ITEM_TYPE_ICONNAME_MAPPING:
        searchResultObject.icon = ("itemicons/" +
                                   ITEM_TYPE_ICONNAME_MAPPING.get(obj['type']))
    searchResultList.append(searchResultObject)

itemList = []
for searchResultObject in searchResultList:
    item = {}
    item["uid"] = searchResultObject.id

def search_line_iterator(self, lines):
    """Consecutively search the lines of a line iterator and return results"""
    pattern_match_dict = {}
    linenum = 0
    lines_before = deque()
    lines_after = deque()
    results = []
    while True:
        if lines_after:
            line = lines_after.popleft()
        else:
            try:
                line = lines.next().rstrip('\r\n')
            except StopIteration:
                break
            except AttributeError:
                #common.log('AttributeError: %s' % e)
                break
        linenum += 1
        if self.settings.linesafter:
            while len(lines_after) < self.settings.linesafter:
                try:
                    lines_after.append(lines.next().rstrip('\r\n'))
                except StopIteration:
                    break
        for p in self.settings.searchpatterns:
            if self.settings.firstmatch and p in pattern_match_dict:
                continue
            # find all matches for the line
            matchiter = p.finditer(line)
            while True:
                try:
                    match = matchiter.next()
                except StopIteration:
                    break
                else:
                    # if there are lines_before or lines_after and they do
                    # not match the filter patterns, skip this match
                    if (lines_before and not self.lines_before_match(lines_before)) or \
                            (lines_after and not self.lines_after_match(lines_after)):
                        continue
                    # capture lines after until a linesaftertopattern or
                    # linesafteruntilpattern is matched, if any are defined
                    lines_after_to_match = False
                    lines_after_until_match = False
                    if self.do_lines_after_or_until():
                        # check to see if lines_after has a match
                        if self.settings.linesaftertopatterns and \
                                any_matches_any_pattern(lines_after,
                                        self.settings.linesaftertopatterns):
                            lines_after_to_match = True
                        if self.settings.linesafteruntilpatterns and \
                                any_matches_any_pattern(lines_after,
                                        self.settings.linesafteruntilpatterns):
                            lines_after_until_match = True
                        # if not, read in more lines until a match or EOF
                        while not lines_after_to_match and \
                                not lines_after_until_match:
                            try:
                                next_line = lines.next().rstrip('\r\n')
                                lines_after.append(next_line)
                                if self.settings.linesaftertopatterns and \
                                        matches_any_pattern(next_line,
                                                self.settings.linesaftertopatterns):
                                    lines_after_to_match = True
                                elif self.settings.linesafteruntilpatterns and \
                                        matches_any_pattern(next_line,
                                                self.settings.linesafteruntilpatterns):
                                    lines_after_until_match = True
                            except StopIteration:
                                break
                    sr_lines_after = []
                    if self.do_lines_after_or_until():
                        if lines_after_to_match:
                            sr_lines_after = list(lines_after)
                        elif lines_after_until_match:
                            sr_lines_after = list(lines_after)[:-1]
                    else:
                        sr_lines_after = list(lines_after)
                    search_result = \
                        SearchResult(pattern=p.pattern,
                                     linenum=linenum,
                                     line=line,
                                     match_start_index=match.start() + 1,
                                     match_end_index=match.end() + 1,
                                     lines_before=list(lines_before),
                                     lines_after=sr_lines_after,
                                     maxlinelength=self.settings.maxlinelength)
                    results.append(search_result)
                    pattern_match_dict[p] = 1
        if self.settings.linesbefore:
            if len(lines_before) == self.settings.linesbefore:
                lines_before.popleft()
            if len(lines_before) < self.settings.linesbefore:
                lines_before.append(line)
    return results
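
# The pattern-matching helpers referenced above are not shown in this
# section; minimal versions consistent with how they are called might look
# like this (an assumption, not the original implementations):
def matches_any_pattern(s, patterns):
    """Return True if s matches at least one compiled regex in patterns."""
    return any(p.search(s) for p in patterns)

def any_matches_any_pattern(slist, patterns):
    """Return True if any line in slist matches any pattern."""
    return any(matches_any_pattern(s, patterns) for s in slist)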