Code example #1
    def _pass2_process_token(self, document, position, zone, token):
        # Vector space structure:
        #   vector_space[d][t][0] = zone-weighted, length-normalized frequency of term t in document d
        #   vector_space[d][t][1] = positions of term t in document d, one sub-list per zone
        #   vector_space[d][t][2] = raw frequency of term t in document d
        # Positions are in this format: ZoneNumber | position
        # Skip tokens that are too short or on the do-not-index list
        if token in constants.DO_NOT_INDEX or len(token) <= 1:
            return

        p = PorterStemmer()
        # Lowercase the token, then stem it with the Porter stemmer
        token = token.lower()
        token = p.stem(token, 0, len(token) - 1)

        # Find term's index in vector space
        if token not in self._ifile:
            return
        t = self._ifile[token][0]
        if t not in self._vector_space[document.document_id]:
            self._vector_space[document.document_id][t] = [
                0.0, [[], [], [], []], 0
            ]

        self._vector_space[document.document_id][t][0] += \
            Zones.WEIGHTS[zone] / document.weighted_length

        self._vector_space[document.document_id][t][1][zone].append(position)
        self._vector_space[document.document_id][t][2] += 1
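For reference, here is a minimal standalone sketch of the per-term entry layout the comments above describe. The zone weights, the fixed four-zone layout, and the helper name add_occurrence are illustrative assumptions, not values or names taken from the original Zones class or indexer.

from collections import defaultdict

# Assumed zone weights (e.g. title, headings, body, anchor text); the real values live in Zones.WEIGHTS
ZONE_WEIGHTS = [4.0, 2.0, 1.0, 1.0]

# vector_space[doc_id][term_id] = [normalized frequency, per-zone positions, raw frequency]
vector_space = defaultdict(dict)

def add_occurrence(doc_id, term_id, zone, position, weighted_length):
    entry = vector_space[doc_id].setdefault(term_id, [0.0, [[], [], [], []], 0])
    entry[0] += ZONE_WEIGHTS[zone] / weighted_length  # zone-weighted, length-normalized frequency
    entry[1][zone].append(position)                   # positions grouped by zone
    entry[2] += 1                                     # raw term frequency

add_occurrence("doc-1", 42, zone=0, position=3, weighted_length=120.0)
print(vector_space["doc-1"][42])  # [0.0333..., [[3], [], [], []], 1]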
Code example #2
    def _pass1_process_token(self, doc_id, token):
        # Inverted file structure:
        # self._ifile[token] = [id, df, postings_list]

        # Skip tokens that are on the do-not-index list or too short

        if token in constants.DO_NOT_INDEX or len(token) <= 1:
            return
        p = PorterStemmer()
        # Lowercase the token, then stem it with the Porter stemmer
        token = token.lower()
        token = p.stem(token, 0, len(token) - 1)

        with self._ifile_lock:
            if self._ifile is None:
                logging.error("INDEXER-P1-THREAD-%d: Attempting to index"
                              " a document while index file is closed" %
                              thread_no)
                raise Exception("Index file has been closed")

            if token not in self._ifile:
                self._ifile[token] = [0, 0, set()]
            self._ifile[token][2].add(doc_id)
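As a rough illustration of the [id, df, postings_list] layout used above, here is a standalone sketch. The finalisation step that assigns term ids and document frequencies after pass 1 is an assumption about how the index is likely completed, not code from the original indexer.

ifile = {}

def pass1_add(token, doc_id):
    # Same shape as self._ifile[token] = [id, df, postings]
    entry = ifile.setdefault(token, [0, 0, set()])
    entry[2].add(doc_id)  # postings: set of documents containing the token

for doc_id, token in [(1, "search"), (2, "search"), (2, "engine")]:
    pass1_add(token, doc_id)

# Assumed finalisation: assign term ids and derive df from the postings sets
for term_id, (token, entry) in enumerate(sorted(ifile.items())):
    entry[0] = term_id        # id later used in pass 2 to index the vector space
    entry[1] = len(entry[2])  # df = number of documents containing the token

print(ifile["search"])  # [1, 2, {1, 2}]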
Code example #3
    def from_string(query): 
        """Parse specified query and return query object"""
        queryObj = Query()
        queryObj.phrase_search = False        
        query = query.strip().lower()


        # Determine if it's a "command" query
        if (query.startswith("similar ") or query.startswith("df ") or \
            query.startswith("freq ") or query.startswith("doc ") or \
            query.startswith("tf ") or query.startswith("title ")) and \
            len(query.split(" ")) > 1:
                queryObj.cmd = query.split(" ")[0].strip()
                # Strip the command word from the query string
                query = query.replace(queryObj.cmd + " ", "", 1)

        # For "tf " queries, extract first parameter early on, so we
        # don't have to hack later when we process the query terms
        if queryObj.cmd == "tf":
            if len(query.split(" ")) < 2: 
                # This is not a valid "tf " query
                queryObj.cmd = None
            else:
                queryObj.raw_terms.append(query.split(" ")[0])
                query = " ".join(query.split(" ")[1:])

        # Clean up and determine if phrase search        
        if query.replace("!", "").startswith('"'):
            queryObj.phrase_search = True
        
        last_grp = None

        gid = 0
        _groups = []

        # Populate groups
        if not queryObj.phrase_search:            
            for group in query.split(" "):
                if group.strip().startswith("!"):
                    _groups.append(group.strip()[1:])
                    queryObj.negated_groups[gid] = True
                    gid = gid + 1
                else:
                    _groups.append(group.strip())
                    queryObj.negated_groups[gid] = False
                    gid = gid + 1
        else:
            for group in query.split('"'):
                if group.strip(' "') == '': 
                    continue
                if group.strip(' "') == '!': 
                    last_grp = group
                    continue

                if last_grp is not None and "!" in last_grp:
                    _groups.append(group)
                    queryObj.negated_groups[gid] = True
                    gid = gid + 1
                else:
                    _groups.append(group)
                    queryObj.negated_groups[gid] = False
                    gid = gid + 1
                last_grp = group

        # Stem tokens in groups (except for "similar" queries)
        # and remove ineligible tokens
        for group in _groups:

            if queryObj.cmd == "doc" or queryObj.cmd == "title":
                _query_terms = group.split(" ")
            else:
                _query_terms = re.compile(indexer.constants.DELIMITERS).split(group)

            query_terms = []
            for term in _query_terms:
                term = term.lower()

                if term not in indexer.constants.DO_NOT_INDEX:
                    queryObj.raw_terms.append(term)
                    # Stem
                    if queryObj.cmd != "similar":
                        p = PorterStemmer()
                        term = p.stem(term, 0, len(term) - 1)
                    query_terms.append(term)
            queryObj.groups.append(' '.join(query_terms))

        return queryObj
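Assuming from_string is exposed as a static method on Query, and that Query() initialises cmd to None, raw_terms and groups to empty lists, and negated_groups to an empty dict (as the code above implies), usage would look roughly like this; the exact stemmed output depends on the Porter stemmer and on the DO_NOT_INDEX list.

q = Query.from_string("df information retrieval")
print(q.cmd)             # "df"
print(q.groups)          # stemmed terms, e.g. ['inform', 'retriev']

q = Query.from_string('"free text" !"paid content"')
print(q.phrase_search)   # True
print(q.negated_groups)  # {0: False, 1: True} -- the second phrase is negated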