Example #1
def tokenizePrint():
    # One analyzer can serve every tokenization request.
    analyzer = StandardAnalyzer()
    while True:
        ts = analyzer.tokenStream("myfield", [input("Input the string to tokenize: ")])
        # The Java Lucene original would also register an offset attribute here:
        #     OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        ts.reset()  # Resets this stream to the beginning. (Required)
        while ts.incrementToken():
            # Use AttributeSource.reflectAsString(boolean)
            # for token stream debugging.
            print("token: " + ts.getTerm().text)
            print("pos: ", ts.getPosition())
            # print("token start offset: " + offsetAtt.startOffset())
            # print("  token end offset: " + offsetAtt.endOffset())

        ts.end()    # Perform end-of-stream operations, e.g. set the final offset.
        ts.close()  # Release resources associated with this stream.
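
None of the examples on this page show where StandardAnalyzer is imported from; it is defined elsewhere in the IR_ZJU project. To run the snippets stand-alone, a minimal stub honoring the same reset/incrementToken/end/close contract could look like the sketch below. All names here (_Token, _TokenStream) and the lowercase-and-split-on-whitespace behavior are assumptions for illustration, not the project's actual implementation.

class _Token:
    # Hypothetical token record: term text plus its position in the stream.
    def __init__(self, text, pos):
        self.text = text
        self.pos = pos

class _TokenStream:
    def __init__(self, texts):
        # Assumed analysis: join the input strings, lowercase, split on whitespace.
        words = " ".join(texts).lower().split()
        self._tokens = [_Token(w, i) for i, w in enumerate(words)]
        self._i = -1

    def reset(self):
        # Rewind to before the first token, as the contract requires.
        self._i = -1

    def incrementToken(self):
        # Advance to the next token; False signals end of stream.
        self._i += 1
        return self._i < len(self._tokens)

    def getTerm(self):
        return self._tokens[self._i]

    def getPosition(self):
        return self._tokens[self._i].pos

    def end(self):
        pass  # No end-of-stream bookkeeping in this stub.

    def close(self):
        pass  # No resources to release in this stub.

class StandardAnalyzer:
    def tokenStream(self, field, texts):
        return _TokenStream(texts)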
Example #2
File: Query.py  Project: wtl-zju/IR_ZJU
class Query(object):
    def __init__(self, string: str):
        self.q = StandardAnalyzer()
        self.s = string

    def tokenVector(self):
        # Tokenize the stored string and return the set of distinct terms.
        t = self.q.tokenStream("userfield", [self.s])
        t.reset()
        s = []
        while t.incrementToken():
            s.append(t.getTerm().text)
        t.end()    # End-of-stream bookkeeping, as in Example #1.
        t.close()  # Release the stream's resources.
        return set(s)
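
As a quick check under the assumed whitespace-and-lowercase stub above, tokenVector collapses repeated terms into a set:

q = Query("to be or not to be")
print(q.tokenVector())  # {'to', 'be', 'or', 'not'} (set order may vary)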
Example #3
class PhraseQuery(Query):
    def __init__(self, string: str):
        # Query.__init__ already sets self.q and self.s.
        Query.__init__(self, string)

    def tokenVector(self):
        # Unlike Query.tokenVector, keep duplicates and preserve term order,
        # since both matter for a phrase.
        t = self.q.tokenStream("userfield", [self.s])
        t.reset()
        l = []
        while t.incrementToken():
            l.append(t.getTerm().text)
        t.end()
        t.close()
        return l

    def booleanQuery(self):
        # Rewrite the phrase as a conjunction: the n terms are followed by
        # n-1 "AND" operators, so every term must match.
        l = self.tokenVector()
        num = len(l)
        for i in range(num - 1):
            l.append("AND")
        return l
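
Under the same assumed tokenization, the two methods differ as follows: tokenVector keeps order and duplicates, while booleanQuery appends one "AND" per additional term rather than interleaving operators with terms:

pq = PhraseQuery("new york city")
print(pq.tokenVector())   # ['new', 'york', 'city']
print(pq.booleanQuery())  # ['new', 'york', 'city', 'AND', 'AND']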