Example #1
0
 def search_multiline_string_for_pattern(self, s, p, start_line_indices,
                                         end_line_indices):
     """Search multiline string `s` for compiled pattern `p` and return a
        list of SearchResult instances (without filename).

        `start_line_indices` and `end_line_indices` are parallel lists of
        the start/end character offsets of each line within `s`.
     """
     # NOTE(review): these are initialized once and only conditionally
     # reassigned inside the loop, so context from a previous match can
     # carry over to a later one -- confirm this is intended.
     lines_before = []
     lines_after = []
     search_results = []
     matches = p.finditer(s)
     for m in matches:
         # offsets of the line containing the match; defaults cover the
         # whole string when no line boundary precedes the match
         m_line_start_index = 0
         m_line_end_index = len(s) - 1
         # line-start offsets at or before the match start
         before_start_indices = [
             x for x in start_line_indices if x <= m.start()
         ]
         before_line_count = 0
         if before_start_indices:
             # the last start offset <= match start is the matched line's start
             m_line_start_index = before_start_indices.pop()
             before_line_count = len(before_start_indices)
             # keep only the last `linesbefore` preceding line starts
             before_start_indices = before_start_indices[self.settings.
                                                         linesbefore * -1:]
         m_line_end_index = end_line_indices[start_line_indices.index(
             m_line_start_index)]
         line = s[m_line_start_index:m_line_end_index]
         if self.settings.linesbefore and before_line_count:
             lines_before = self.get_lines_before(s, before_start_indices,
                                                  start_line_indices,
                                                  end_line_indices)
         if self.do_lines_after():
             after_start_indices = [
                 x for x in start_line_indices if x > m.start()
             ]
             after_start_indices = after_start_indices[:self.settings.
                                                       linesafter]
             lines_after = self.get_lines_after(s, after_start_indices,
                                                start_line_indices,
                                                end_line_indices)
             if after_start_indices and not lines_after:
                 continue
         # skip this match if required before/after context does not match
         if (lines_before and
             not self.lines_before_match(lines_before)) or \
             (lines_after and
              not self.lines_after_match(lines_after)):
             continue
         # match indices are reported 1-based within the matched line
         match_start_index = m.start() - m_line_start_index + 1
         match_end_index = m.end() - m_line_start_index + 1
         search_result = SearchResult(pattern=p.pattern,
                                      linenum=before_line_count + 1,
                                      line=line,
                                      match_start_index=match_start_index,
                                      match_end_index=match_end_index,
                                      lines_before=list(lines_before),
                                      lines_after=list(lines_after))
         search_results.append(search_result)
         if self.settings.firstmatch:
             break
     return search_results
Example #2
0
 def search_binary_file_obj(self, sf, fo):
     """Search the full contents of a binary file object, recording one
        result (with linenum 0 and no line text) per matching pattern.
     """
     contents = fo.read()
     for pattern in self.settings.searchpatterns:
         if not pattern.search(contents):
             continue
         # TODO: switch to SearchFile instance
         self.add_search_result(
             SearchResult(pattern=pattern.pattern,
                          filename=str(sf),
                          linenum=0,
                          line=None))
    def rerank(self, queryKps, queryDescs, searchResult, numResults=10):
        """Spatially re-rank an initial SearchResult using keypoint matching.

        :param queryKps: keypoints of the query image
        :param queryDescs: feature descriptors of the query image
        :param searchResult: initial SearchResult whose entries are
            (score, imageID, imageIdx) tuples
        :param numResults: maximum number of results to return
        :return: a new SearchResult with spatially verified images ranked
            first, or the original `searchResult` when nothing verifies
        """
        # start the search timer and initialize the re-ranked results dictionary
        startTime = datetime.datetime.now()
        reranked = {}

        # grab the image indexes from the initial search results and sort them in
        # ascending order so the feature indexes can be grabbed from HDF5
        resultIdxs = np.array([r[-1] for r in searchResult.results])
        resultIdxs.sort()

        # loop over the starting and ending indexes into the features dataset for
        # each image
        for (i, (start, end)) in zip(resultIdxs,
                                     self.featuresDB["index"][resultIdxs,
                                                              ...]):
            # grab the rows from the features dataset and break the rows into
            # keypoints and feature vectors
            rows = self.featuresDB["features"][start:end]
            (kps, descs) = (rows[:, :2], rows[:, 2:])

            # determine matched inlier keypoints and grab the indexes of the
            # matched keypoints into the bag-of-visual-words
            bovwIdxs = self.match(queryKps, queryDescs.astype("float32"), kps,
                                  descs.astype("float32"))

            # provided that at least some keypoints were matched, the final score
            # for the spatial verification is the sum of the idf values for the
            # inlier words
            if bovwIdxs is not None:
                score = self.idf[bovwIdxs].sum()
                reranked[i] = score

        # if no spatially verified matches were found, return the initial search
        # result object
        if len(reranked) == 0:
            return searchResult

        # otherwise, sort the spatially verified results (best score first),
        # replacing image indexes with image IDs
        results = sorted([(v, self.featuresDB["image_ids"][k], k)
                          for (k, v) in reranked.items()],
                         reverse=True)

        # loop over the initial search results
        for (score, imageID, imageIdx) in searchResult.results:
            # only add the initial result to the list of results if the image has
            # NOT been spatially verified
            if imageIdx not in reranked:
                results.append((score, imageID, imageIdx))

        # return the spatially verified and re-ranked results
        return SearchResult(results[:numResults], (datetime.datetime.now() -
                                                   startTime).total_seconds())
Example #4
0
    def execute(self,
                target=None,
                thread_count=multiprocessing.cpu_count()):
        """
            Executes the search query.

            :param target: If a string, it is the target directory to recurse
                (defaults to the current working directory, resolved at call
                time). If a list, it is either a list of strings representing
                the files to process or TargetFile instances.
            :param thread_count: The number of workers to use when processing the query. This defaults to the number of available processing
                cores the executing system has. If None, then the query is processed in this thread. This uses the multiprocessing module internally,
                so you actually get concurrent processing.

            :return: A list of SearchResult objects.
            :rtype: list
        """
        # Resolve the directory default at call time; a signature default of
        # os.getcwd() would be frozen at module import time.
        if target is None:
            target = os.getcwd()

        # Collect a file list first (so we can use multiprocessing later)
        if isinstance(target, list):
            # Normalize plain string entries to TargetFile instances.
            file_list = [
                parse.TargetFile(file_entry)
                if isinstance(file_entry, str) else file_entry
                for file_entry in target
            ]
        else:
            file_list = []
            for directory, _directory_names, filenames in os.walk(target):
                for filename in filenames:
                    relative_path = os.path.join(directory, filename)
                    if os.path.isfile(relative_path):
                        file_list.append(parse.TargetFile(relative_path))

        matched_files = []
        if thread_count is None:
            # Process the file list serially in this thread.
            for current_file in file_list:
                result = self._execute(current_file, self.compiled_data)
                if result is True:
                    matched_files.append(SearchResult(path=current_file.path))
        else:
            # Context manager guarantees the worker pool is reclaimed
            # (the original never closed/joined the pool).
            with multiprocessing.Pool(thread_count) as thread_pool:
                passed_data = [(self, current_file) for current_file in file_list]
                for result in thread_pool.imap(_threaded_execute, passed_data):
                    matched_files += result

        return matched_files
    def search(self, queryHist, numResults=10, maxCandidates=200):
        """Rank candidate images against the query histogram and return the
        top `numResults` (with elapsed time) as a SearchResult.
        """
        # track wall-clock time for the whole search
        t0 = datetime.datetime.now()

        # candidate image indexes, sorted ascending so the bag-of-visual-words
        # database reads below are sequential
        candidateIdxs = self.buildCandidates(queryHist,
                                             maxCandidates=maxCandidates)
        candidateIdxs.sort()

        # pull the candidate histograms and densify the query histogram
        hists = self.bovwDB["bovw"][candidateIdxs]
        queryHist = queryHist.toarray()

        # apply inverse-document-frequency weighting to the query when supplied
        if self.idf is not None:
            queryHist *= self.idf

        # distance from each (idf-weighted) candidate histogram to the query
        scores = {}
        for (idx, hist) in zip(candidateIdxs, hists):
            if self.idf is not None:
                hist *= self.idf
            scores[idx] = self.distanceMetric(hist, queryHist)

        # order by distance, swapping image indexes for their image IDs
        ranked = sorted((dist, self.featuresDB["image_ids"][idx], idx)
                        for (idx, dist) in scores.items())
        ranked = ranked[:numResults]

        # package results with the elapsed search time
        elapsed = (datetime.datetime.now() - t0).total_seconds()
        return SearchResult(ranked, elapsed)
Example #6
0
    def rerank(self, queryKps, queryDescs, searchResult, numResults=10):
        """Spatially re-rank an initial SearchResult by matching the query's
        keypoints/descriptors against each result image's features; returns
        the original `searchResult` when nothing is spatially verified.
        """
        startTime = datetime.datetime.now()
        reranked = {}

        # image indexes of the initial results, sorted ascending so the
        # HDF5 feature reads below are sequential
        resultIdxs = np.array([r[-1] for r in searchResult.results])
        resultIdxs.sort()

        for (i, (start, end)) in zip(resultIdxs,
                                     self.featuresDB["index"][resultIdxs,
                                                              ...]):

            # grab the features from featuresDB; each row is
            # [keypointX, keypointY, description_1, description_2, ..., description_N]
            rows = self.featuresDB["features"][start:end]
            (kps, descs) = (rows[:, :2], rows[:, 2:])

            # determine matched inlier keypoints and grab the indexes of the matched keypoints
            bovwIdxs = self.match(queryKps, queryDescs.astype("float32"), kps,
                                  descs.astype("float32"))

            # the verification score is the sum of the idf values of the
            # matched inlier words
            if bovwIdxs is not None:
                score = self.idf[bovwIdxs].sum()
                reranked[i] = score

        # nothing spatially verified -- fall back to the initial results
        if len(reranked) == 0:
            return searchResult

        # sort the verified results best-score-first, replacing image
        # indexes with image IDs
        results = sorted([(v, self.featuresDB["image_ids"][k], k)
                          for (k, v) in reranked.items()],
                         reverse=True)

        # append unverified initial results after the verified ones
        for (score, imageID, imageIdx) in searchResult.results:
            if imageIdx not in reranked:
                results.append((score, imageID, imageIdx))

        return SearchResult(results[:numResults], (datetime.datetime.now() -
                                                   startTime).total_seconds())
Example #7
0
    def iexecute(self, path=None, thread_count=8):
        """
            Executes the search query and returns an iterator to results from the query. This allows you to execute queries asynchronously and retrieve results
            as they become available.

            :param path: If a string, it is the target directory to recurse
                (defaults to the current working directory, resolved at call
                time). If a list, it is either a list of strings representing
                the files to process or TargetFile instances.
            :param thread_count: The number of workers to use when processing the query (default 8). If None, then the query is
                processed in this thread. This uses the multiprocessing module internally,
                so you actually get concurrent processing.

            :return: A generator to the results of the query.
            :rtype: generator
        """
        # Resolve the directory default at call time; a signature default of
        # os.getcwd() would be frozen at module import time.
        if path is None:
            path = os.getcwd()

        # Collect a file list first (so we can use multiprocessing later)
        if isinstance(path, list):
            # Normalize plain string entries to TargetFile instances, as
            # execute() does (the docstring promises both are accepted).
            file_list = [
                parse.TargetFile(file_entry)
                if isinstance(file_entry, str) else file_entry
                for file_entry in path
            ]
        else:
            file_list = []
            for directory, _directory_names, filenames in os.walk(path):
                for filename in filenames:
                    relative_path = os.path.join(directory, filename)
                    if os.path.isfile(relative_path):
                        file_list.append(parse.TargetFile(relative_path))

        if thread_count is None:
            # Process serially in this thread.
            for current_file in file_list:
                result = self._execute(current_file, self.compiled_data)
                if result is True:
                    yield SearchResult(path=current_file.path)
        else:
            # Context manager guarantees the worker pool is reclaimed
            # (the original never closed/joined the pool).
            with multiprocessing.Pool(thread_count) as thread_pool:
                passed_data = [(self, current_file) for current_file in file_list]
                for result_list in thread_pool.imap(_threaded_execute, passed_data):
                    for result in result_list:
                        yield result
Example #8
0
    def search(self, queryHist, numResults=10, maxCandidates=200):
        """Search the bag-of-visual-words database for images similar to the
        query histogram and return the top `numResults` as a SearchResult.

        :param queryHist: sparse query histogram (must support .toarray())
        :param numResults: maximum number of results to return
        :param maxCandidates: maximum number of candidates to score
        """
        # get the start time
        startTime = datetime.datetime.now()

        # pick candidate image IDs for the query histogram
        candidateIdxs = self.buildCandidates(queryHist, maxCandidates)

        # sort ascending -- reading HDF5 out of order is slow
        candidateIdxs.sort()

        # pull the candidates' feature histograms from the BOVW database
        hists = self.bovwDB["bovw"][candidateIdxs]
        queryHist = queryHist.toarray()
        results = {}

        if self.idf is not None:
            # weighting queryHist by tf-idf
            queryHist *= self.idf

        # loop over the histograms
        for (candidate, hist) in zip(candidateIdxs, hists):
            # weighting hist by tf-idf
            if self.idf is not None:
                # BUG FIX: tf-idf weighting multiplies by the idf vector;
                # the original used `+=`, which skews the distances and
                # disagrees with the query weighting above.
                hist *= self.idf
            # calculate chi-squared distance
            d = self.distanceMetric(hist, queryHist)
            results[candidate] = d

        # sort by distance, replacing image indexes with image IDs
        results = sorted([(v, self.featuresDB["image_ids"][k], k)
                          for (k, v) in results.items()])

        results = results[:numResults]

        return SearchResult(results, (datetime.datetime.now() -
                                      startTime).total_seconds())
Example #9
0
# Query the notion.so search API over HTTPS.
# NOTE(review): httplib is Python 2 only (renamed http.client in Python 3).
headers = {"Content-type": "application/json", "Cookie": cookie}
conn = httplib.HTTPSConnection("www.notion.so")
conn.request("POST", "/api/v3/search", buildnotionsearchquerydata(), headers)
response = conn.getresponse()

# Strip marker tags from the raw response body
# (presumably search-highlight delimiters -- TODO confirm).
data = response.read()
data = data.replace("<gzkNfoUU>", "")
data = data.replace("</gzkNfoUU>", "")

# NOTE(review): not in a try/finally -- the connection leaks if an
# exception is raised above.
conn.close()

# Extract search results from notion response
searchResultList = []
searchResults = Payload(data)
for x in searchResults.results:
    searchResultObject = SearchResult(x.get('id'))
    # Prefer the block's own title property when present...
    if "properties" in searchResults.recordMap.get('block').get(
            searchResultObject.id).get('value'):
        searchResultObject.title = \
            searchResults.recordMap.get('block').get(searchResultObject.id).get('value').get('properties').get('title')[
                0][0]
    else:
        # ...otherwise fall back to the highlighted text of the hit.
        searchResultObject.title = x.get('highlight').get('text')
    # Use the page path as subtitle when the API supplies one.
    if "pathText" in x.get('highlight'):
        searchResultObject.subtitle = x.get('highlight').get('pathText')
    else:
        searchResultObject.subtitle = " "
    if "format" in searchResults.recordMap.get('block').get(
            searchResultObject.id).get('value'):
        if "page_icon" in searchResults.recordMap.get('block').get(
                searchResultObject.id).get('value').get('format'):
Example #10
0
#     data = json.load(json_file)
# data = data['data']

# Map lark item types to icon filenames; unknown types get no icon.
ITEM_TYPE_ICONNAME_MAPPING = {
    2: 'doc.png',
    3: 'sheet.png',
}

# Strips the <em>...</em> highlight markup around matched title text.
TITLE_REPLACE_PATTERN = "<em>(.*)</em>"

# Extract search results from lark response
searchResultList = []
objs = data['entities']['objs']
for k in data['tokens']:
    obj = objs[k]
    searchResultObject = SearchResult(obj['token'])
    title = obj['title']
    title = re.sub(TITLE_REPLACE_PATTERN, r"\1", title)
    searchResultObject.title = title
    viewed_time = datetime.fromtimestamp(obj['open_time']).strftime('%H:%M')
    updated_time = datetime.fromtimestamp(obj['edit_time']).isoformat()
    searchResultObject.subtitle = "Author: " + obj['author'] + ", You viewed " + viewed_time + ", " + obj['edit_name'] + " updated " + updated_time
    searchResultObject.link = obj['url']
    # BUG FIX: dict.has_key() was removed in Python 3; the `in` operator
    # works under both Python 2 and 3.
    if obj['type'] in ITEM_TYPE_ICONNAME_MAPPING:
        searchResultObject.icon = "itemicons/" + ITEM_TYPE_ICONNAME_MAPPING.get(obj['type'])
    searchResultList.append(searchResultObject)

# Convert the collected search results into plain item dicts
# (presumably for a launcher/workflow UI -- TODO confirm; the loop body
# continues beyond this chunk).
itemList = []
for searchResultObject in searchResultList:
    item = {}
    item["uid"] = searchResultObject.id
Example #11
0
 def search_line_iterator(self, lines):
     """Consecutively search the lines of a line iterator and return a list
        of SearchResult instances.

        NOTE(review): relies on iterator.next() (Python 2 protocol); under
        Python 3 this raises AttributeError, which the handler below
        silently treats as end of input -- confirm intended.
     """
     # patterns that have already matched (consulted in firstmatch mode)
     pattern_match_dict = {}
     linenum = 0
     # sliding context windows around the current line
     lines_before = deque()
     lines_after = deque()
     results = []
     while True:
         # take the next line from the read-ahead buffer when available,
         # otherwise pull from the iterator
         if lines_after:
             line = lines_after.popleft()
         else:
             try:
                 line = lines.next().rstrip('\r\n')
             except StopIteration:
                 break
             except AttributeError:
                 #common.log('AttributeError: %s' % e)
                 break
         linenum += 1
         # keep the read-ahead buffer filled to `linesafter` lines
         if self.settings.linesafter:
             while len(lines_after) < self.settings.linesafter:
                 try:
                     lines_after.append(lines.next().rstrip('\r\n'))
                 except StopIteration:
                     break
         for p in self.settings.searchpatterns:
             if self.settings.firstmatch and p in pattern_match_dict:
                 continue
             # find all matches for the line
             matchiter = p.finditer(line)
             while True:
                 try:
                     match = matchiter.next()
                 except StopIteration:
                     break
                 else:
                     # if there are lines_before or lines_after and none matches
                     # then continue
                     if (lines_before and
                         not self.lines_before_match(lines_before)) or \
                         (lines_after and
                          not self.lines_after_match(lines_after)):
                         continue
                     # capture lines after until a linesaftertopattern or
                     # linesafteruntilpattern is matched, if any are defined
                     lines_after_to_match = False
                     lines_after_until_match = False
                     if self.do_lines_after_or_until():
                         # check to see if lines_after has a match
                         if self.settings.linesaftertopatterns and \
                            any_matches_any_pattern(lines_after,
                                self.settings.linesaftertopatterns):
                             lines_after_to_match = True
                         if self.settings.linesafteruntilpatterns and \
                            any_matches_any_pattern(lines_after,
                            self.settings.linesafteruntilpatterns):
                             lines_after_until_match = True
                         # if not read in more lines until a match or EOF
                         while not lines_after_to_match and \
                               not lines_after_until_match:
                             try:
                                 next_line = lines.next().rstrip('\r\n')
                                 lines_after.append(next_line)
                                 if self.settings.linesaftertopatterns and \
                                    matches_any_pattern(next_line,
                                        self.settings.linesaftertopatterns):
                                     lines_after_to_match = True
                                 elif self.settings.linesafteruntilpatterns and \
                                      matches_any_pattern(next_line,
                                          self.settings.linesafteruntilpatterns):
                                     lines_after_until_match = True
                             except StopIteration:
                                 break
                     # "to" keeps the terminating line, "until" drops it
                     sr_lines_after = []
                     if self.do_lines_after_or_until():
                         if lines_after_to_match:
                             sr_lines_after = list(lines_after)
                         elif lines_after_until_match:
                             sr_lines_after = list(lines_after)[:-1]
                     else:
                         sr_lines_after = list(lines_after)
                     search_result = \
                         SearchResult(pattern=p.pattern,
                                      linenum=linenum,
                                      line=line,
                                      match_start_index=match.start() + 1,
                                      match_end_index=match.end() + 1,
                                      lines_before=list(lines_before),
                                      lines_after=sr_lines_after,
                                      maxlinelength=self.settings.maxlinelength)
                     results.append(search_result)
                     pattern_match_dict[p] = 1
         # maintain the trailing window of `linesbefore` context lines
         if self.settings.linesbefore:
             if len(lines_before) == self.settings.linesbefore:
                 lines_before.popleft()
             if len(lines_before) < self.settings.linesbefore:
                 lines_before.append(line)
     return results