class LSI_TFIDF():
    lw = lg.LogWriter()
    # get files
    path = ''  # path name
    index_path = config['LSI_pickle_path']
    files = []
    documents = {}
    sortedDocuments = []
    contents = []
    X = None
    re = None
    word = None
    vectorizer = None
    tfidf = None
    s = None
    u = None
    d = None
    idf = None
    lineNo = {}
    expireTime = 30
    end_time = time.perf_counter()  # time.clock() was removed in Python 3.8

    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(self.path + "/" + file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    #store the line numbers of the term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        lineList = [line]
                        if len(line.strip()) > 0:  # skip empty lines
                            try:
                                self.tfidf.fit_transform(
                                    lineList
                                )  #get the unique standard term of this line
                            except ValueError:
                                j += 1
                                continue
                            for term in self.tfidf.vocabulary_:
                                if term in self.lineNo[file]:
                                    self.lineNo[file][term].append(j)
                                else:
                                    self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:
                self.files.remove(file)
                fs -= 1  # keep the loop bound in sync after removing a directory entry
        print('finish reading')
        # self.files = list(self.documents.keys())
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.stopwords = [
            'and', 'edition', 'for', 'in', 'little', 'of', 'the', 'to', 'print'
        ]
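        # Note: these stop words only take effect if they are passed to the
        # vectorizer, e.g. TfidfVectorizer(stop_words=self.stopwords); the
        # vectorizer created in this method does not receive them.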
        self.re = self.tfidf.fit_transform(
            self.contents).toarray().T  # tf-idf values
        self.idf = self.tfidf.idf_
        self.word = self.tfidf.get_feature_names()  # terms in feature-index order, aligned with idf_ and the rows of self.re

        # compress the dense tf-idf matrix into a sparse (DOK) representation
        self.re = dok_matrix(self.re)
        # self.X=dok_matrix(self.X)
        print("start SVD")
        # svd decomposition
        self.u, self.s, self.d = svds(
            self.re,
            k=500)  # keep both U and Vt: self.d (Vt) is needed at query time
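        # Truncated SVD for LSI: the term-document tf-idf matrix X (terms x docs)
        # is approximated as X ~= U * S * Vt with k = 500 latent dimensions, so
        # u has shape (n_terms, k), s holds the k singular values and d (Vt) has
        # shape (k, n_docs). Note that svds requires k < min(X.shape), i.e. more
        # than 500 terms and documents.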
        print('start dumping')
        # store the index into the pickle
        with open(
                self.index_path, 'wb'
        ) as f:  # use pickle module to save data into file 'CodexIndex.pik'
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
            print('finish')

    def getResult(self, query):
        self.vectorizer = CountVectorizer()
        # if the pickle file exists, read it
        if os.path.exists(self.index_path):
            print("in===1")
            rfile = open(self.index_path, 'rb')
            self.s = pickle.load(rfile)
            self.u = pickle.load(rfile)
            self.d = pickle.load(rfile)
            self.tfidf = pickle.load(rfile)
            self.lineNo = pickle.load(rfile)

            self.idf = self.tfidf.idf_
            self.word = self.tfidf.get_feature_names()  # feature-index order, aligned with idf_
            self.files = list(self.lineNo.keys())

        else:  # if there is no such pickle file, indexing
            self.indexing()

        l = self.MatrixSearching(query, self.s, self.u, self.d.T)
        if l is None:
            return Results.Results(0)
        print("in===2")
        results = Results.Results(numOfResults=l[3],
                                  matchingLines=l[2],
                                  hitDocs=l[1],
                                  fullHitLines=l[0])

        return results  # return results

    def MatrixSearching(self, query, s, u, d):

        qFreq = self.vectorizer.fit_transform(
            [query]).toarray().T  # make the vectorizer fit the query
        qWord = self.vectorizer.get_feature_names(
        )  # the unique terms after preprocessing
        qArr = np.zeros([1, len(self.word)])

        # fill the empty query vector Xq with tf-idf values
        ifEmpty = True
        for i, w in enumerate(qWord):
            if w in self.word:
                j = self.word.index(w)
                qArr[0][j] = qFreq[i] * self.idf[j]
                ifEmpty = False

        # give the warning and stop searching if no terms found
        if ifEmpty:
            self.lw.write_warning_log("Nothing found!")
            return None

        # project the query into LSI space: Dq = Xq^T * U * S^-1,
        # then score documents by cosine similarity against Dq
        sDiagno = np.diag(np.array(s))
        sInv = np.linalg.inv(sDiagno)
        Dq = np.dot(qArr, u)
        Dq = np.dot(Dq, sInv)
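        # Shapes involved in the fold-in (with k = 500 from indexing):
        #   qArr : (1, n_terms)  tf-idf vector of the query
        #   u    : (n_terms, k)  term-to-concept mapping
        #   sInv : (k, k)        inverse of the singular-value diagonal
        #   Dq   : (1, k)        the query in the reduced concept space
        # Each d[i] below is a length-k document vector (a row of Vt.T), so the
        # cosine similarity compares query and documents in concept space.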

        matchingLines = {}  # {similarity:[(docName, [hit lines])] }
        hitDocs = {}  # {lengthHits:[(docName,[hit lines])]}
        fullHitLines = {}  # {fullHitNum:[(docName,[hit lines])]}
        length = 0
        for i in range(len(d)):
            k = self.files[i]
            similarity = ((np.dot(Dq, d[i])) / ((np.linalg.norm(Dq)) *
                                                (np.linalg.norm(d[i]))))[0]
            length += 1
            hitLines = []
            hitWords = 0
            commonLines = []
            for t in qWord:
                if t in self.lineNo[k]:
                    hitWords += 1
                    hitLines = list(
                        set(hitLines).union(set(self.lineNo[k][t])))
                    if hitWords == 1:
                        commonLines = self.lineNo[k][t]
                    commonLines = list(
                        set(commonLines).intersection(set(self.lineNo[k][t])))
            lengthHit = len(hitLines) * hitWords
            if hitWords > 1:
                fullHit = len(commonLines)
            else:
                fullHit = 0
            if fullHit > 0:
                if fullHit in fullHitLines:
                    fullHitLines[fullHit].append((k, hitLines))
                else:
                    fullHitLines[fullHit] = [(k, hitLines)]
            elif lengthHit > 0:
                if lengthHit in hitDocs:
                    hitDocs[lengthHit].append((k, hitLines))
                else:
                    hitDocs[lengthHit] = [(k, hitLines)]
            else:
                if similarity > 0:
                    if similarity not in matchingLines:
                        matchingLines[similarity] = [(k, hitLines)]
                    else:
                        matchingLines[similarity].append((k, hitLines))
                else:
                    # don't store it
                    length -= 1

        return (fullHitLines, hitDocs, matchingLines, length)
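

# A minimal usage sketch for LSI_TFIDF (commented out; the corpus path below is
# hypothetical and config['LSI_pickle_path'] must point to a writable location):
#
#     lsi = LSI_TFIDF()
#     lsi.path = '/path/to/corpus'                 # folder of plain-text documents
#     lsi.indexing()                               # build and pickle the tf-idf/SVD index
#     res = lsi.getResult('binary search tree')    # returns a Results object
#
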
class JavaAST():
    r = redis.Redis(
        host='localhost', port=6379,
        decode_responses=True)  # 'host' is the Redis server; both the Redis server and client must be running (6379 is the default Redis port)
    lw = lg.LogWriter()
    path = ''  # path name
    index_path = configs['AST_java_pickle_path']

    weights = {}  # {weight:[fileNames] }
    fileIndex = {}  # {fileName: {weight: {nodeHash: (startLine, endLine)}}}
    files = []
    documents = {}
    lastLineNo = 0

    # these parameters should be tuned
    matchingThreshold = 0.6
    weightThreshold = 10  # only nodes whose weight exceeds weightThreshold are taken into consideration
    blockThreshold = 50  # a node whose weight exceeds blockThreshold is treated as a code block and included in the global (whole-program) matching
    pageNum = configs['page_num']
    wholeSimilarity = 0
    matchingBlock = {
    }  # {docID: (the startline and endline of the matching blocks)}.
    blockWeights = {
    }  # {docID: (startline, endline): weight of the biggest matching block}
    expireTime = 1

    def readFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(self.path + "/" + file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                # self.documents[file]=open(self.path+'/'+file,'r').read()
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        tree = javalang.parse.parse(
                            self.documents[file]['content'])
                    except (javalang.parser.JavaSyntaxError):
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.fileIndex[file] = {}
                    names = []  # user-defined identifier names (to be ignored)
                    self.lastLineNo = 0
                    self.index(tree, file, names, {}, {}, False)
                    # print(self.fileIndex[file])
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use pickle module to save data into file 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.fileIndex, f, True)

            # self.names=[]
            # tree=javalang.parse.parse(q3)
            # self.fileIndex['q3'] = {}
            # self.index(tree, 'q3')
            # print('#############################################')
            # for weight in self.fileIndex['q2']:
            #     if weight in self.fileIndex['q3']:
            #         print(weight)
            #         print(self.fileIndex['q2'][weight])
            #         print(self.fileIndex['q3'][weight])
            #     else:
            #         print(weight)
            #
            # print(self.fileIndex['q2'])
            # print(self.fileIndex['q3'])

    def index(self, root, fileName, names, nestHash, qLineNums, nestedDic):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0
        # print('-----------------------')
        # print(root)

        attriValues = ''  # "attr1 attr2 attr3"
        if isinstance(root, list) and len(root) == 0:
            return (weight, min, max, '')
        hashAttris = None
        if not isinstance(root, list):
            # if hasattr(root, "_position"):
            #     print(root._position)
            if isinstance(root, javalang.ast.Node):
                children = list(root.children)
            elif isinstance(root, tuple):
                children = root
            else:
                min = self.lastLineNo + 1
                max = self.lastLineNo + 1
                return (weight, min, max, attriValues)
            # get attributes information
            hasContent = False
            if hasattr(root, 'attrs'):
                attriValues += '( '
                for a in root.attrs:
                    v = root.__getattribute__(a)
                    if a != 'documentation':
                        # remove identifier names
                        # (except on javalang.tree.ReferenceType nodes)
                        if a == 'name' and (
                                not isinstance(root,
                                               javalang.tree.ReferenceType)
                                or v in names):
                            if v not in names:
                                names.append(v)
                            # print('================')
                            # print((type(root),a))
                            # print(v)
                            children.remove(v)
                            continue
                        elif a == 'member':
                            # PROBLEM: if the member is a method name not in self.names (defined below the current node), we will fail to ignore it

                            if v in names:
                                # print('~~~~~~~~~~~~~~~~~~~~~~~~~')
                                # print(v)
                                children.remove(v)
                                continue
                        elif a == 'qualifier':
                            # skip print statements (System.out)
                            if v == 'System.out':
                                return (0, min, max, None)
                            elif v in names:
                                children.remove(v)
                                continue
                        elif v == 'MethodInvocation':
                            if hasattr(v, 'qualifier') and v.__getattribute__(
                                    'qualifier') == 'System.out':
                                return (0, min, max, None)
                        # ignore values like strings, numbers, booleans, null
                        elif a == 'value' and type(v) is str:
                            children.remove(v)
                            continue
                        elif v != None and v != '' and not (isinstance(
                                v, list) and len(v) == 0):
                            if not isinstance(v, list):
                                if isinstance(
                                        v, javalang.tree.
                                        MethodInvocation) and hasattr(
                                            v, 'attrs') and v.__getattribute__(
                                                'qualifier') == 'System.out':
                                    return (0, min, max, None)

                                hasContent = True
                                # print(v)
                                if isinstance(v, set) and len(v) > 1:
                                    # print('//////////////////////////////////////')
                                    v1 = list(v)
                                    v1.sort()
                                    # print(v1)
                                    attriValues += str(v1) + ": "
                                else:
                                    # print('111111111111111111')
                                    # print(v)
                                    attriValues += str(v) + ": "
                                    # print(attriValues)

                            if isinstance(v, (javalang.ast.Node, tuple, list)):
                                children.remove(v)
                                t = self.index(v, fileName, names, nestHash,
                                               qLineNums, nestedDic)

                                weight += t[0]
                                if t[1] > 0:
                                    startLine = t[1]
                                    if i == 0:
                                        min = startLine
                                    elif startLine < min:
                                        min = startLine
                                    i += 1
                                if t[2] > 0:
                                    endLine = t[2]
                                    if endLine > max:
                                        max = endLine
                                    i += 1
                                if t[3] != '' and t[
                                        3] is not None and t[3] != '( ':
                                    hasContent = True
                                    attriValues += t[3] + ', '
                    else:
                        children.remove(v)
            if len(children) > 0:
                if not hasattr(root, 'attrs'):
                    attriValues += '( '

                for child in children:
                    # ignore some meaningless nodes
                    if child != None and child != '' and not isinstance(
                            child, list) and child not in names:
                        if isinstance(child, set) and len(child) > 1:
                            # print('//////////////////////////////////////')
                            child1 = list(child)
                            child1.sort()
                            # print(child1)
                            attriValues += str(child1) + ": "
                        else:
                            # print('22222222222222222')
                            # print(child)
                            attriValues += str(child) + ': '
                    if isinstance(child, (javalang.ast.Node, tuple, list)):
                        t = self.index(child, fileName, names, nestHash,
                                       qLineNums, nestedDic)

                        weight += t[0]
                        if t[1] > 0:
                            startLine = t[1]
                            if i == 0:
                                min = startLine
                            elif startLine < min:
                                min = startLine
                            i += 1
                        if t[2] > 0:
                            endLine = t[2]
                            if endLine > max:
                                max = endLine
                            i += 1
                        if t[3] != '' and t[3] is not None and t[3] != '( ':
                            hasContent = True
                            attriValues += t[3] + ', '

            if hasContent:
                attriValues += ' )'
            else:
                # no brackets
                attriValues = attriValues.lstrip('( ')
            # work out line number
            if hasattr(root, "_position"):
                lineNo = root._position[0]
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo

            # put the weight into weights
            if weight >= self.weightThreshold:
                if min == 0 and max == 0:
                    min = self.lastLineNo + 1
                    max = self.lastLineNo + 1
                self.lastLineNo = max
                if not nestedDic:
                    if weight in self.weights:
                        if fileName not in self.weights[weight]:
                            self.weights[weight].append(fileName)
                    else:
                        self.weights[weight] = [fileName]

                # hash the attribute values list
                m = hashlib.md5()
                m.update(attriValues.encode("utf8"))
                hashAttris = m.hexdigest()

                # print(',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,')
                # print(self.fileIndex[fileName])
                # put the node into fileIndex
                if weight not in self.fileIndex[fileName]:
                    self.fileIndex[fileName][weight] = {}
                self.fileIndex[fileName][weight][hashAttris] = (min, max)
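                # A hypothetical entry: fileIndex['Foo.java'][12][<md5 hex>]
                # == (40, 57) means a subtree of weight 12, whose canonical
                # attribute string hashes to <md5 hex>, spans lines 40-57 of
                # Foo.java.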

                # print(weight)
                # print(attriValues)
                # print((str(root),hashAttris,min,max))
                # put all its children in this file into the current node
                if nestedDic:
                    nestHash[(weight, hashAttris, min, max)] = {}
                    qLineNums[hashAttris] = (min, max)
                    for w in self.fileIndex[fileName]:
                        if w < weight:
                            keys = list(self.fileIndex[fileName][w].keys())
                            for k in keys:
                                t = self.fileIndex[fileName][w][k]
                                if t[0] >= min and t[1] <= max:
                                    # print('11111111111111111111111111111111')
                                    # print((w,k,t[0],t[1]))
                                    # print((weight,hashAttris,min,max))
                                    # the block is the sub node of the current node
                                    nestHash[(weight, hashAttris, min,
                                              max)][(w, k, t[0], t[1])] = {}
                                    self.fileIndex[fileName][w].pop(k)

                    # put the children in nestHash into the current node
                    keys2 = list(nestHash.keys())
                    for k in keys2:
                        if k in nestHash:
                            if k[0] < weight and k[2] >= min and k[3] <= max:
                                # print('!!!!!!!!!!!!!!!!!!!!!!!!!')
                                # print(k)
                                # print((weight,hashAttris,min,max))
                                nestHash[(weight, hashAttris, min,
                                          max)][k] = nestHash.pop(k)
            if max > 0:
                self.lastLineNo = max
            return (weight, min, max, attriValues)

        else:
            l = []
            length = len(root)
            j = 0
            while j < length:
                r = root[j]
                rStr = ''
                if r is not None and r != '':
                    t = self.index(r, fileName, names, nestHash, qLineNums,
                                   nestedDic)
                    weight += t[0]
                    if t[1] > 0:
                        startLine = t[1]
                        if i == 0:
                            min = startLine
                        elif startLine < min:
                            min = startLine
                        i += 1
                    if t[2] > 0:
                        endLine = t[2]
                        if endLine > max:
                            max = endLine
                        i += 1
                    if t[3] is not None:
                        rStr += str(r) + ':'
                        if t[3] != '':
                            rStr += t[3]
                        l.append(rStr)
                        j += 1
                    else:
                        root.pop(j)
                        length -= 1
                else:
                    root.pop(j)
                    length -= 1

            if len(l) > 0:
                # sort the list so that reordering statements does not change the result
                l.sort()
                attriValues += '[ ' + ''.join(l) + ' ]'
            if max > 0:
                self.lastLineNo = max
            return (weight, min, max, attriValues)

    # interface to front end. Input query, return a Result instance
    def getResults(self, query, page):
        globalSimilarity = None
        matchingBlocks = None
        componentDocuments = []
        if not self.r.exists(query):  # if the result is not in the redis
            # read pickle file
            if os.path.exists(self.index_path):
                rfile = open(self.index_path, 'rb')
                self.weights = pickle.load(rfile)
                self.fileIndex = pickle.load(rfile)
            else:
                self.readFiles()

            # store the result of the query into redis
            matchingLines = {}  # {fileName:[(qStart,qEnd, fStart,fEnd)]}
            similarities = self.search(query, matchingLines)
            if similarities is None:
                self.lw.write_error_log('Pickle files not found!')
                return None
            elif similarities == 0:
                return 0
            # get the normal relevant documents and the suspected plagiarized documents
            globalSimilarity = self.wholeSimilarity
            matchingBlocks = self.matchingBlock
            documentList = sorted(similarities,
                                  key=similarities.get,
                                  reverse=True)
            plagiarismList = []  # [sorted plagiarised files]
            i = 0
            for d in documentList:
                if similarities[d] > self.matchingThreshold:
                    plagiarismList.append(d)
                    # print(similarities[d])
                    # matchingLines[d].sort()
                    # print(matchingLines[d])
                    i += 1
                else:
                    break
            documentList = documentList[i:]
            componentDocuments = list(matchingBlocks.keys())
            # store data into the redis server
            self.lw.write_info_log(
                "storing results into redis in the form of a list")
            # redis stores bytes/strings, so push the Python reprs and read
            # them back with eval() below
            self.r.rpush(query, str(plagiarismList))
            self.r.rpush(query, str(documentList))
            self.r.rpush(query, str(matchingLines))
            if globalSimilarity >= self.matchingThreshold and len(
                    matchingBlocks) != 0 and len(componentDocuments) > 1:
                if len(plagiarismList) > 0:
                    if globalSimilarity >= similarities[plagiarismList[0]]:
                        self.r.rpush(query, globalSimilarity)
                        self.r.rpush(query, str(matchingBlocks))
                        self.r.rpush(query, str(componentDocuments))
                        self.r.rpush(query, str(self.blockWeights))
                    else:
                        componentDocuments = []
                        matchingBlocks = None
                        globalSimilarity = None
                else:
                    # if no plagiarised case is found, display the component programs
                    self.r.rpush(query, globalSimilarity)
                    self.r.rpush(query, str(matchingBlocks))
                    self.r.rpush(query, str(componentDocuments))
            else:
                componentDocuments = []
                matchingBlocks = None
                globalSimilarity = None

        # get the result list of this query from redis
        else:
            self.lw.write_info_log("geting results from redis")
            plagiarismList = eval(self.r.lindex(query, 0))
            documentList = eval(self.r.lindex(query, 1))
            matchingLines = eval(self.r.lindex(query, 2))
            if self.r.llen(query) >= 6:
                globalSimilarity = eval(self.r.lindex(query, 3))
                matchingBlocks = eval(self.r.lindex(query, 4))
                componentDocuments = eval(self.r.lindex(query, 5))
                if self.r.llen(query) >= 7:
                    # blockWeights is only stored in the plagiarism case
                    self.blockWeights = eval(self.r.lindex(query, 6))

        self.r.expire(query, self.expireTime)  # cached result expires after expireTime seconds

        # encapsulate results into a Results object
        documentListLength = len(documentList)
        plagiarismListLength = len(plagiarismList)
        matchingblocksLength = len(componentDocuments)
        length = documentListLength + plagiarismListLength + matchingblocksLength
        results = Results.Results(numOfResults=length,
                                  matchingLines=matchingLines,
                                  globalSimilarity=globalSimilarity,
                                  matchingBlocks=matchingBlocks,
                                  blockWeights=self.blockWeights)
        disMatchingBlocks = []
        disPlagiarismList = []
        disDocumentList = []
        if (
                page - 1
        ) * self.pageNum < matchingblocksLength:  # need to display the matching blocks
            disMatchingBlocks = componentDocuments[
                (page - 1) *
                self.pageNum:min(page * self.pageNum, matchingblocksLength)]
            results.setComponentDocuments(disMatchingBlocks)

        if (
                page - 1
        ) * self.pageNum < matchingblocksLength + plagiarismListLength and page * self.pageNum >= matchingblocksLength:
            # need to display the plagiarism documents
            if len(disMatchingBlocks) == 0 and page > 1:  # not start from 0
                disPlagiarismList = plagiarismList[
                    (page - 1) * self.pageNum - matchingblocksLength:min((
                        page * self.pageNum -
                        matchingblocksLength), plagiarismListLength)]
            else:  # start from 0
                disPlagiarismList = plagiarismList[
                    0:min(self.pageNum, plagiarismListLength)]
            results.setPlagiarismList(disPlagiarismList)

        if page * self.pageNum > matchingblocksLength + plagiarismListLength:  # need to display the relevant documents
            if len(disMatchingBlocks) == 0 and len(
                    disPlagiarismList
            ) == 0 and (page - 1) * self.pageNum <= length:  # not start from 0
                disDocumentList = documentList[
                    (page - 1) * self.pageNum - matchingblocksLength -
                    plagiarismListLength:min((
                        page * self.pageNum - matchingblocksLength -
                        plagiarismListLength), documentListLength)]
            elif (page - 1) * self.pageNum <= length:  # start from 0
                disDocumentList = documentList[0:min((
                    self.pageNum - matchingblocksLength -
                    plagiarismListLength), documentListLength)]
            else:
                self.lw.write_error_log("page number out of range")
                return None
            results.setDocumentList(disDocumentList)
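
        # Hypothetical paging example with pageNum = 10, 3 component documents,
        # 5 plagiarised documents and 20 relevant documents (length = 28):
        #   page 1 -> 3 component docs + 5 plagiarised docs + documentList[0:2]
        #   page 2 -> documentList[2:12]
        # i.e. the three result categories are concatenated and sliced per page.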

        # print('==============')
        # results.toString()
        return results

    def search(self, query, matchingLines
               ):  # matchingLines {fileName:[(qStart,qEnd, fStart,fEnd)]}
        # refresh the global variables
        self.wholeSimilarity = 0
        self.matchingBlock = {}
        self.blockWeights = {}
        qTree = {
        }  # {(weight,nodeHash,startLine, endLine):{nested dictionaries}}
        qLineNums = {}

        try:
            root = javalang.parse.parse(query)
        except javalang.parser.JavaSyntaxError:
            # getResults treats a return value of 0 as a syntax error in the query
            self.lw.write_error_log("syntax error in query!")
            return 0
        self.fileIndex['query'] = {}
        names = []
        self.lastLineNo = 0
        self.index(root, 'query', names, qTree, qLineNums, True)
        # print(qTree)
        # print(qLineNums)
        self.fileIndex.pop('query')
        similarities = {}  # {fileName:score}
        maxWeight = list(qTree.keys())[0][0]
        # print(maxWeight)
        self.similarities(qTree, self.weights, similarities, maxWeight,
                          qLineNums, matchingLines)

        # work out the global similarity
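        # blockWeights[doc] maps (qStartLine, qEndLine) in the query to the
        # weight of the biggest AST block matched against that document. For
        # each document whose biggest block exceeds blockThreshold, the loop
        # below keeps only blocks that are not contained in (or outweighed by)
        # blocks already stored in matchingBlock, and accumulates
        # wholeSimilarity += blockWeight / maxWeight for each block it keeps.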
        for dic in self.blockWeights:
            biggestKey = sorted(self.blockWeights[dic],
                                key=self.blockWeights[dic].get,
                                reverse=True)[0]
            if self.blockWeights[dic][biggestKey] > self.blockThreshold:
                ds = list(self.matchingBlock.keys())
                store = True
                for d in ds:
                    block = self.matchingBlock[d]
                    # do not store the new block if it is contained in an existing block in matchingBlock
                    if biggestKey[0] >= block[0] and biggestKey[1] <= block[1]:
                        store = False
                        break
                    # delete the older block included in the new block
                    elif biggestKey[0] <= block[0] and biggestKey[1] >= block[
                            1]:
                        self.matchingBlock.pop(d)
                        self.wholeSimilarity -= self.blockWeights[d][
                            block] / maxWeight

                    # handle blocks that partially overlap an existing block (keep the one with the bigger weight)
                    elif (biggestKey[0] <= block[1] and biggestKey[0] >=
                          block[0]) or (biggestKey[1] <= block[1]
                                        and biggestKey[1] >= block[0]):
                        if self.blockWeights[dic][
                                biggestKey] > self.blockWeights[d][block]:
                            self.matchingBlock.pop(d)
                            self.wholeSimilarity -= self.blockWeights[d][
                                block] / maxWeight
                        else:
                            store = False
                            break

                # store the new block
                if store:
                    self.matchingBlock[dic] = biggestKey
                    self.wholeSimilarity += self.blockWeights[dic][
                        biggestKey] / maxWeight

        return similarities

    # calculate the similarities between corpus and query
    def similarities(self, qTree, weights, similarities, maxWeight, qLineNums,
                     matchingLines):
        # matchingBlock: {docID: (the startline and endline of the matching blocks)}.
        # blockWeights: {docID: (qStartline, qEndline): weight of the biggest matching block}
        if maxWeight is None:
            maxWeight = 1
        for w in qTree:
            if isinstance(w, tuple):
                find = False
                if w[0] in weights:
                    for file in weights[w[0]]:
                        # check if the nodeHash is in this file
                        if w[1] in self.fileIndex[file][w[0]]:
                            find = True
                            qs = w[2]
                            qe = w[3]
                            fs = self.fileIndex[file][w[0]][w[1]][0]
                            fe = self.fileIndex[file][w[0]][w[1]][1]
                            if file in similarities:
                                matchingLines[file].append((qs, qe, fs, fe))
                                similarities[file] += w[0] / maxWeight
                            else:
                                matchingLines[file] = [(qs, qe, fs, fe)]
                                similarities[file] = w[0] / maxWeight

                            # merge lines in query program to construct the code blocks
                            forwMerge = False
                            BackMerge = False
                            if file not in self.blockWeights:
                                self.blockWeights[file] = {}
                            elif (qs, qe) in self.blockWeights[file]:
                                if w[0] > self.blockWeights[file][(qs, qe)]:
                                    self.blockWeights[file][(qs, qe)] = w[0]
                                continue
                            keys = list(self.blockWeights[file].keys())
                            for mLines in keys:
                                if mLines[1] < qs:
                                    insertion = False
                                    # check insertion
                                    for k in qLineNums:
                                        lines = qLineNums[k]
                                        if (lines[0] > mLines[1] and lines[0] <
                                                qs) or (lines[1] > mLines[1]
                                                        and lines[1] < qs):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            mLines[0],
                                            qe)] = w[0] + self.blockWeights[
                                                file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        forwMerge = True
                                elif mLines[0] > qe:
                                    insertion = False
                                    # check insertion
                                    for lines in qLineNums.values():
                                        if (lines[1] < mLines[0] and lines[1] >
                                                qe) or (lines[0] < mLines[0]
                                                        and lines[0] > qe):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            qs, mLines[1]
                                        )] = w[0] + self.blockWeights[file][
                                            mLines]
                                        self.blockWeights[file].pop(mLines)
                                        BackMerge = True
                                if forwMerge and BackMerge:
                                    break
                            if not forwMerge and not BackMerge:
                                self.blockWeights[file][(qs, qe)] = w[0]
                if not find and qTree[w] is not None:
                    if len(qTree[w]) > 0:
                        self.similarities(qTree[w], weights, similarities,
                                          maxWeight, qLineNums, matchingLines)

    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        print(dic['content'])
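

# A minimal usage sketch for JavaAST (commented out; the corpus path is
# hypothetical and a local redis server must be running for getResults):
#
#     ja = JavaAST()
#     ja.path = '/path/to/java/corpus'
#     ja.readFiles()                      # parse the corpus and pickle the index
#     res = ja.getResults(open('Query.java').read(), page=1)
#
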
class ASTSearching(Singleton):
    r = redis.Redis(
        host='localhost', port=6379,
        decode_responses=True)  # 'host' is the Redis server; both the Redis server and client must be running (6379 is the default Redis port)
    lw = lg.LogWriter()
    path = ""  # path name
    index_path = config['AST_python_pickle_path']
    files = []
    documents = {}
    # hashTrees={}#{fileName: {nodeHash: {nested dictionaries with hash values in stand of nodes} } }
    # -----compare with hashTrees and choose the efficient one-------
    hashDic = {}  # {fileName: {weight: [nodeHash]}}
    visitor = mv.MyVisitor()
    weights = {}  # {weight:[fileNames] }
    lineNums = {}  # {fileName: {nodeHash: (startLine, endLine)}}

    # these parameters should be tuned
    matchingThreshold = 0.6
    weightThreshold = 10  # only nodes whose weight exceeds weightThreshold are taken into consideration
    blockThreshold = 50  # a node whose weight exceeds blockThreshold is treated as a code block and included in the global (whole-program) matching
    pageNum = 10
    wholeSimilarity = 0
    matchingBlock = {
    }  # {docID: (the startline and endline of the matching blocks)}.
    blockWeights = {
    }  # {docID: (startline, endline): weight of the biggest matching block}
    expireTime = 1

    # parse the corpus
    def ReadFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(self.path + "/" + file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        root = ast.parse(str(self.documents[file]['content']))
                    except (SyntaxError):
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.visitor.visit(root)
                    self.lineNums[file] = {}
                    self.hashDic[file] = {}
                    self.Indexing(root, self.lineNums[file], self.weights,
                                  file)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use pickle module to save data into file 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.lineNums, f, True)
            pickle.dump(self.hashDic, f, True)

    # index every document's AST root
    def Indexing(self, node, lineNums, weights, fileName):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0
        if isinstance(node, ast.AST):
            m = hashlib.md5()
            m.update(ast.dump(node).encode("utf8"))
            nodeStr = m.hexdigest()
            # use a fresh loop variable so the md5 object 'm' above is not shadowed
            for _field, value in ast.iter_fields(node):
                tuple = self.Indexing(value, lineNums, weights, fileName)
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            if node._attributes:
                lineNo = getattr(node, 'lineno')
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo

            if weight >= self.weightThreshold:
                if weight in weights:
                    if fileName not in weights[weight]:
                        weights[weight].append(fileName)
                else:
                    weights[weight] = [fileName]
                # put the hash node into hash dictionary
                if weight in self.hashDic[fileName]:
                    self.hashDic[fileName][weight].append(nodeStr)
                else:
                    self.hashDic[fileName][weight] = [nodeStr]

                lineNums[nodeStr] = (min, max)

            return (weight, min, max)

        elif isinstance(node, list):
            for x in node:
                tuple = self.Indexing(x, lineNums, weights, fileName)
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1

            return (weight, min, max)
        return (weight, min, max)
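
    # What Indexing records, as a hypothetical example: a subtree of 'foo.py'
    # with weight 14, spanning lines 3-9, whose md5(ast.dump(node)) digest is
    # <digest>, ends up as
    #     weights[14]                        -> [..., 'foo.py']
    #     hashDic['foo.py'][14]              -> [..., <digest>]
    #     self.lineNums['foo.py'][<digest>]  -> (3, 9)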

    # interface to front end. Input query, return a Result instance
    def getResults(self, query, page):
        globalSimilarity = 0
        matchingBlocks = {}
        componentDocuments = []
        if not self.r.exists(query):  # if the result is not in the redis

            if os.path.exists(self.index_path):
                rfile = open(self.index_path, 'rb')
                self.weights = pickle.load(rfile)
                self.lineNums = pickle.load(rfile)
                self.hashDic = pickle.load(rfile)
            else:
                self.ReadFiles()

            # store the result of the query into redis
            matchingLines = {}  # {fileName:[(qStart,qEnd, fStart,fEnd)]}
            similarities = self.search(query, matchingLines)
            if similarities is None:
                self.lw.write_error_log('Pickle files not found!')
                return None
            elif similarities == 0:
                return 0
            # get the normal relevant documents and the suspected plagiarized documents
            globalSimilarity = self.wholeSimilarity
            matchingBlocks = self.matchingBlock
            documentList = sorted(similarities,
                                  key=similarities.get,
                                  reverse=True)
            plagiarismList = []  # [sorted plagiarised files]
            i = 0
            for d in documentList:
                if similarities[d] > self.matchingThreshold:
                    plagiarismList.append(d)
                    # print(similarities[d])
                    # matchingLines[d].sort()
                    # print(matchingLines[d])
                    i += 1
                else:
                    break
            documentList = documentList[i:]
            componentDocuments = list(matchingBlocks.keys())
            # store data into the redis server
            self.lw.write_info_log(
                "storing results into redis in the form of a list")
            # push the Python reprs; they are read back with eval() below
            self.r.rpush(query, str(plagiarismList))
            self.r.rpush(query, str(documentList))
            self.r.rpush(query, str(matchingLines))
            if globalSimilarity >= self.matchingThreshold and len(
                    matchingBlocks) != 0 and len(componentDocuments) > 1:
                if len(plagiarismList) > 0:
                    if globalSimilarity >= similarities[plagiarismList[0]]:
                        self.r.rpush(query, globalSimilarity)
                        self.r.rpush(query, str(matchingBlocks))
                        self.r.rpush(query, str(componentDocuments))
                    else:
                        componentDocuments = []
                        matchingBlocks = None
                        globalSimilarity = None
                else:
                    # if no plagiarised case is found, display the component programs
                    self.r.rpush(query, globalSimilarity)
                    self.r.rpush(query, str(matchingBlocks))
                    self.r.rpush(query, str(componentDocuments))
            else:
                componentDocuments = []
                matchingBlocks = None
                globalSimilarity = None

        # get the result list of this query from redis
        else:
            self.lw.write_info_log("geting results from redis")
            plagiarismList = eval(self.r.lindex(query, 0))
            documentList = eval(self.r.lindex(query, 1))
            matchingLines = eval(self.r.lindex(query, 2))
            if self.r.llen(query) >= 6:
                globalSimilarity = eval(self.r.lindex(query, 3))
                matchingBlocks = eval(self.r.lindex(query, 4))
                componentDocuments = eval(self.r.lindex(query, 5))

        self.r.expire(query, self.expireTime)  # cached result expires after expireTime seconds

        # encapsulate results into a Results object
        documentListLength = len(documentList)
        plagiarismListLength = len(plagiarismList)
        matchingblocksLength = len(componentDocuments)
        length = documentListLength + plagiarismListLength + matchingblocksLength
        results = Results.Results(numOfResults=length,
                                  matchingLines=matchingLines,
                                  globalSimilarity=globalSimilarity,
                                  matchingBlocks=matchingBlocks)
        disMatchingBlocks = []
        disPlagiarismList = []
        disDocumentList = []
        if (
                page - 1
        ) * self.pageNum < matchingblocksLength:  # need to display the matching blocks
            disMatchingBlocks = componentDocuments[
                (page - 1) *
                self.pageNum:min(page * self.pageNum, matchingblocksLength)]
            results.setComponentDocuments(disMatchingBlocks)

        if (
                page - 1
        ) * self.pageNum < matchingblocksLength + plagiarismListLength and page * self.pageNum >= matchingblocksLength:
            # need to display the plagiarism documents
            if len(disMatchingBlocks) == 0 and page > 1:  # not start from 0
                disPlagiarismList = plagiarismList[
                    (page - 1) * self.pageNum - matchingblocksLength:min((
                        page * self.pageNum -
                        matchingblocksLength), plagiarismListLength)]
            else:  # start from 0
                disPlagiarismList = plagiarismList[
                    0:min(self.pageNum, plagiarismListLength)]
            results.setPlagiarismList(disPlagiarismList)

        if page * self.pageNum > matchingblocksLength + plagiarismListLength:  # need to display the relevant documents
            if len(disMatchingBlocks) == 0 and len(
                    disPlagiarismList
            ) == 0 and (page - 1) * self.pageNum <= length:  # not start from 0
                disDocumentList = documentList[
                    (page - 1) * self.pageNum - matchingblocksLength -
                    plagiarismListLength:min((
                        page * self.pageNum - matchingblocksLength -
                        plagiarismListLength), documentListLength)]
            elif (page - 1) * self.pageNum <= length:  # start from 0
                disDocumentList = documentList[0:min((
                    self.pageNum - matchingblocksLength -
                    plagiarismListLength), documentListLength)]
            else:
                self.lw.write_error_log("page number out of range")
                return None
            results.setDocumentList(disDocumentList)

        # print('==============')
        # results.toString()
        return results

    # break the query tree into nodes and calculate their weights
    def queryWeight(self, node, lineNums, tree):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0

        if isinstance(node, ast.AST):
            m = hashlib.md5()
            m.update(ast.dump(node).encode("utf8"))
            nodeStr = m.hexdigest()
            tree[nodeStr] = {}
            # use a fresh loop variable so the md5 object 'm' above is not shadowed
            for _field, value in ast.iter_fields(node):
                tuple = self.queryWeight(value, lineNums, tree[nodeStr])
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            if node._attributes:
                lineNo = getattr(node, 'lineno')
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo
            if weight >= self.weightThreshold:
                lineNums[nodeStr] = (min, max)
                tree[(weight, nodeStr)] = tree.pop(nodeStr)
                if len(tree[(weight, nodeStr)]) == 0:
                    tree[(weight, nodeStr)] = None
            else:
                tree.pop(nodeStr)

            return (weight, min, max)

        elif isinstance(node, list):
            for x in node:
                tuple = self.queryWeight(x, lineNums, tree)
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1

            return (weight, min, max)
        return (weight, min, max)
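
    # queryWeight builds a nested tree keyed by (weight, nodeHash). As a
    # hypothetical example, a query whose root has weight 40 and two children of
    # weights 15 and 12 (all above weightThreshold, with no heavier descendants)
    # yields
    #     {(40, <rootHash>): {(15, <h1>): None, (12, <h2>): None}}
    # nodes below weightThreshold are dropped from the tree.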

    # search plagiarism code with query
    def search(self, query, matchingLines):
        # refresh the global variables
        self.wholeSimilarity = 0
        self.matchingBlock = {}
        self.blockWeights = {}
        qTree = {}  # {(weight,nodeHash):{nested dictionaries}}
        qLineNums = {}  # {nodeHash:(start,end)}
        try:
            qNode = ast.parse(query)
        except SyntaxError:
            self.lw.write_error_log("syntax error in query!")
            return 0
        self.visitor.visit(qNode)
        # print(ast.dump(qNode,include_attributes=True))
        self.queryWeight(qNode, qLineNums, qTree)
        # print(qTree)
        # print(qLineNums)
        maxWeight = list(qTree.keys())[0][0]
        similarities = {}  # {fileName:score}
        self.similarities(qTree, self.weights, similarities, maxWeight,
                          qLineNums, self.lineNums, matchingLines)

        # work out the global similarity
        for dic in self.blockWeights:
            biggestKey = sorted(self.blockWeights[dic],
                                key=self.blockWeights[dic].get,
                                reverse=True)[0]
            if self.blockWeights[dic][biggestKey] > self.blockThreshold:
                ds = list(self.matchingBlock.keys())
                store = True
                for d in ds:
                    block = self.matchingBlock[d]
                    # do not store the new block if it is contained in an existing block in matchingBlock
                    if biggestKey[0] >= block[0] and biggestKey[1] <= block[1]:
                        store = False
                        break
                    # delete the older block included in the new block
                    elif biggestKey[0] <= block[0] and biggestKey[1] >= block[
                            1]:
                        self.matchingBlock.pop(d)
                        self.wholeSimilarity -= self.blockWeights[d][
                            block] / maxWeight

                    # handle blocks that partially overlap an existing block (keep the one with the bigger weight)
                    elif (biggestKey[0] <= block[1] and biggestKey[0] >=
                          block[0]) or (biggestKey[1] <= block[1]
                                        and biggestKey[1] >= block[0]):
                        if self.blockWeights[dic][
                                biggestKey] > self.blockWeights[d][block]:
                            self.matchingBlock.pop(d)
                            self.wholeSimilarity -= self.blockWeights[d][
                                block] / maxWeight
                        else:
                            store = False
                            break

                # store the new block
                if store:
                    self.matchingBlock[dic] = biggestKey
                    self.wholeSimilarity += self.blockWeights[dic][
                        biggestKey] / maxWeight

        return similarities

    # calculate the similarities between corpus and query
    def similarities(self, qTree, weights, similarities, maxWeight, qLineNums,
                     lineNums, matchingLines):
        if maxWeight is None:
            maxWeight = 1
        for w in qTree:
            if isinstance(w, tuple):
                find = False
                if w[0] in weights:
                    for file in weights[w[0]]:
                        if w[1] in self.hashDic[file][w[0]]:
                            find = True
                            qs = qLineNums[w[1]][0]
                            qe = qLineNums[w[1]][1]
                            fs = lineNums[file][w[1]][0]
                            fe = lineNums[file][w[1]][1]
                            if file in similarities:
                                matchingLines[file].append((qs, qe, fs, fe))
                                similarities[file] += w[0] / maxWeight
                            else:
                                matchingLines[file] = [(qs, qe, fs, fe)]
                                similarities[file] = w[0] / maxWeight

                            # merge lines in query program to construct the code blocks
                            forwMerge = False
                            BackMerge = False
                            if file not in self.blockWeights:
                                self.blockWeights[file] = {}
                            elif (qs, qe) in self.blockWeights[file]:
                                if w[0] > self.blockWeights[file][(qs, qe)]:
                                    self.blockWeights[file][(qs, qe)] = w[0]
                                continue
                            keys = list(self.blockWeights[file].keys())
                            for mLines in keys:
                                if mLines[1] < qs:
                                    insertion = False
                                    # check insertion
                                    for k in qLineNums:
                                        lines = qLineNums[k]
                                        if (lines[0] > mLines[1] and lines[0] <
                                                qs) or (lines[1] > mLines[1]
                                                        and lines[1] < qs):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            mLines[0],
                                            qe)] = w[0] + self.blockWeights[
                                                file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        forwMerge = True
                                elif mLines[0] > qe:
                                    insertion = False
                                    # check insertion
                                    for lines in qLineNums.values():
                                        if (lines[1] < mLines[0] and lines[1] >
                                                qe) or (lines[0] < mLines[0]
                                                        and lines[0] > qe):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            qs, mLines[1]
                                        )] = w[0] + self.blockWeights[file][
                                            mLines]
                                        self.blockWeights[file].pop(mLines)
                                        backMerge = True
                                if forwMerge and backMerge:
                                    break
                            if not forwMerge and not backMerge:
                                self.blockWeights[file][(qs, qe)] = w[0]
                if not find and qTree[w] is not None:
                    if len(qTree[w]) > 0:
                        self.similarities(qTree[w], weights, similarities,
                                          maxWeight, qLineNums, lineNums,
                                          matchingLines)
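
    # Note on the merge logic above: every matched subtree contributes a query
    # line range (qs, qe) for a given file. If an already-stored range for that
    # file ends just before qs (or starts just after qe) and no other matched
    # range falls into the gap, the two are fused into one block and their
    # weights are summed; otherwise (qs, qe) is stored as a block of its own.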

    # find a key in a nested dictionary
    def dict_get(self, weight, d, objkey, default, weights, fileName):
        for k, v in d.items():
            # if the key is found, pop this node to avoid repeated searching
            if k == objkey:
                # weights[weight].remove(fileName)
                return d.pop(k)
            else:
                if isinstance(v, dict):
                    # if  k[0]>objkey[0]:
                    ret = self.dict_get(weight, v, objkey, default, weights,
                                        fileName)
                    if ret is not default:
                        return ret
        return default

    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)

        # return  self.compareQueries(dic['code'],q1)

    # compare whether two queries are the same using hash functions
    def compareQueries(self, query1, query2):
        h1 = self.nodeToHash(query1)
        h2 = self.nodeToHash(query2)
        return h1 == h2

    # parse a query into an AST, normalise it with the visitor and hash the dump
    def nodeToHash(self, node):
        qRoot = ast.parse(node)
        self.visitor.visit(qRoot)
        qt = ast.dump(qRoot)
        m = hashlib.md5()
        m.update(qt.encode("utf8"))
        h = m.hexdigest()
        return h
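
nodeToHash reduces a query to its (visitor-normalised) AST dump and hashes it with MD5, so compareQueries can check two snippets for structural equality without walking both trees. A minimal standalone sketch of the same idea, without the class's normalising visitor (the helper name below is illustrative, not part of the class):

import ast
import hashlib

def code_fingerprint(source):
    # whitespace and formatting vanish in the AST dump, so snippets that
    # differ only in layout produce the same fingerprint
    tree = ast.parse(source)
    return hashlib.md5(ast.dump(tree).encode("utf8")).hexdigest()

# same statements, different formatting -> identical hashes
print(code_fingerprint("x=1\ny=x+2") == code_fingerprint("x = 1\ny = x + 2"))  # True
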
Example #4
class LSI_TFIDF():
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    lw = lg.LogWriter()
    # get files
    path = ''  # path name
    index_path = configs['LSI_pickle_path']
    files = []
    documents = {}
    sortedDocuments = []
    contents = []
    X = None
    re = None
    word = None
    vectorizer = None
    tfidf = None
    s = None
    u = None
    d = None
    idf = None
    lineNo = {}
    expireTime = 600
    end_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    pageNum = configs['page_num']

    # def __init__(self):
    # self.vectorizer = CountVectorizer()
    # #if there exist the pickle file, read it
    # if os.path.exists(self.index_path):
    #     rfile=open(self.index_path, 'rb')
    #     self.s = pickle.load(rfile)
    #     self.u = pickle.load(rfile)
    #     self.d = pickle.load(rfile)
    #     self.tfidf = pickle.load(rfile)
    #     self.lineNo=pickle.load(rfile)
    #
    #     self.idf = self.tfidf.idf_
    #     self.word=list(self.tfidf.vocabulary_.keys())
    #     self.files=list(self.lineNo.keys())
    #
    # else:#if there is no such pickle file, indexing
    #     self.indexing()

    # indexing
    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    # store the line numbers of the term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        lineList = [line]
                        if len(lineList) > 0:
                            try:
                                self.tfidf.fit_transform(
                                    lineList
                                )  # get the unique standard term of this line
                            except ValueError:
                                j += 1
                                continue
                            for term in self.tfidf.vocabulary_:
                                if term in self.lineNo[file]:
                                    self.lineNo[file][term].append(j)
                                else:
                                    self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:  # it is a directory: drop it and shrink the bound
                self.files.remove(file)
                fs -= 1
        print('finish reading')
        # self.files = list(self.documents.keys())
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.re = self.tfidf.fit_transform(
            self.contents).toarray().T  # tf-idf values
        self.idf = self.tfidf.idf_
        self.word = list(self.tfidf.vocabulary_.keys())

        # compress the dense tf-idf matrix into a sparse representation
        self.re = dok_matrix(self.re)
        # self.X=dok_matrix(self.X)
        print("start SVD")
        # svd decomposition
        self.u, self.s, self.d = svds(self.re, k=1000)  # k must be smaller than min(self.re.shape)
        print('start dumping')
        # store the index into the pickle
        with open(
                self.index_path, 'wb'
        ) as f:  # use pickle module to save data into file 'CodexIndex.pik'
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
            print('finish')
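
    # Layout of the pickle index written above, in dump order:
    #   1. s      -- singular values from the truncated SVD
    #   2. u      -- term-to-latent-space matrix
    #   3. d      -- latent-space document representations (V^T)
    #   4. tfidf  -- the fitted TfidfVectorizer
    #   5. lineNo -- {file: {term: [line numbers]}}
    # getResult() loads these back in exactly the same order.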

    def getResult(self, query, page):
        if not self.r.exists(query):  # if the result is not in the redis
            self.vectorizer = CountVectorizer()
            # if the pickle file exists, read the index from it
            if os.path.exists(self.index_path):
                with open(self.index_path, 'rb') as rfile:
                    self.s = pickle.load(rfile)
                    self.u = pickle.load(rfile)
                    self.d = pickle.load(rfile)
                    self.tfidf = pickle.load(rfile)
                    self.lineNo = pickle.load(rfile)

                self.idf = self.tfidf.idf_
                self.word = list(self.tfidf.vocabulary_.keys())
                self.files = list(self.lineNo.keys())

            else:  # no pickle file yet, so build the index first
                self.indexing()

            result = self.MatrixSearching(query, self.s, self.u, self.d.T)
            if result is None:
                return (0, [])

            fullHitLines, hitDocs, matchingLines, numOfResults = result
            fullHitLineskeys = list(fullHitLines.keys())
            hitDocskeys = list(hitDocs.keys())
            matchingLineskeys = list(matchingLines.keys())
            fullHitLineskeys.sort(reverse=True)
            hitDocskeys.sort(reverse=True)
            matchingLineskeys.sort(reverse=True)
            displayList = []  # [(docName,[hit lines])]
            if len(fullHitLineskeys) > 0:
                for k in fullHitLineskeys:
                    for t in fullHitLines[k]:
                        displayList.append(t)
            if len(hitDocskeys) > 0:
                # print('================')
                for k in hitDocskeys:
                    for t in hitDocs[k]:
                        displayList.append(t)
            if len(matchingLines) > 0:
                for k in matchingLineskeys:
                    for t in matchingLines[k]:
                        displayList.append(t)

            self.lw.write_info_log(
                "storing results into redis as a list")
            self.r.rpush(query, numOfResults)
            self.r.rpush(query, str(displayList))  # redis list values must be str/bytes/numbers, not a Python list

        else:
            self.lw.write_info_log("geting results from redis")
            numOfResults = eval(self.r.lindex(query, 0))
            displayList = eval(self.r.lindex(query, 1))

        self.r.expire(query, self.expireTime)
        currentDisplay = displayList[(page - 1) * self.pageNum:page *
                                     self.pageNum]
        return (numOfResults, currentDisplay)
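
    # Cache layout in redis: each query key maps to a two-element list where
    # index 0 is the total number of results and index 1 is str(displayList);
    # getResult() parses them back with eval() and refreshes the key's TTL
    # (expireTime seconds) on every access.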

    def MatrixSearching(self, query, s, u, d):

        qFreq = self.vectorizer.fit_transform(
            [query]).toarray().T  # make the vectorizer fit the query
        # unique query terms after preprocessing (newer scikit-learn versions
        # rename this method to get_feature_names_out())
        qWord = self.vectorizer.get_feature_names()
        qArr = np.zeros([1, len(self.word)])

        # fill the query's tf-idf weights into the empty Xq vector
        ifEmpty = True
        for i, w in enumerate(qWord):
            if w in self.word:
                j = self.word.index(w)
                qArr[0][j] = qFreq[i][0] * self.idf[j]
                ifEmpty = False

        # log a warning and stop searching if none of the query terms are in the index vocabulary
        if ifEmpty:
            self.lw.write_warning_log("Nothing found!")
            return None

        # project the query into the latent space: Dq = Xq.T * U * S^-1
        sDiagno = np.diag(np.array(s))
        sInv = np.linalg.inv(sDiagno)
        Dq = np.dot(qArr, u)
        Dq = np.dot(Dq, sInv)

        matchingLines = {}  # {similarity:[(docName, [hit lines])] }
        hitDocs = {}  # {lengthHits:[(docName,[hit lines])]}
        fullHitLines = {}  # {fullHitNum:[(docName,[hit lines])]}
        length = 0
        for i in range(len(d)):
            k = self.files[i]
            similarity = ((np.dot(Dq, d[i])) / ((np.linalg.norm(Dq)) *
                                                (np.linalg.norm(d[i]))))[0]
            length += 1
            hitLines = []
            hitWords = 0
            ifMiss = False
            commonLines = []
            for t in qWord:
                if t in self.lineNo[k]:
                    hitWords += 1
                    hitLines = list(
                        set(hitLines).union(set(self.lineNo[k][t])))
                    if not ifMiss:
                        if hitWords == 1:
                            commonLines = self.lineNo[k][t]
                        else:
                            commonLines = list(
                                set(commonLines).intersection(
                                    set(self.lineNo[k][t])))
                else:
                    ifMiss = True
            lengthHit = len(hitLines) * hitWords
            if hitWords > 1 and not ifMiss:
                fullHit = len(commonLines)
            else:
                fullHit = 0
            if fullHit > 0:
                if fullHit in fullHitLines:
                    fullHitLines[fullHit].append((k, hitLines))
                else:
                    fullHitLines[fullHit] = [(k, hitLines)]
            elif lengthHit > 0 and len(qWord) == 1:
                # print('-----------')
                if lengthHit in hitDocs:
                    hitDocs[lengthHit].append((k, hitLines))
                else:
                    hitDocs[lengthHit] = [(k, hitLines)]
            else:
                if similarity > 0:
                    if similarity not in matchingLines:
                        matchingLines[similarity] = [(k, [])]
                    else:
                        matchingLines[similarity].append((k, []))
                else:
                    # don't store it
                    length -= 1
        # print(hitDocs)
        return (fullHitLines, hitDocs, matchingLines, length)
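
For reference, MatrixSearching follows the standard LSI fold-in step: the tf-idf query vector q is projected into the latent space as Dq = q * U * S^-1, and each document's latent vector d[i] is ranked by cosine similarity against Dq. A self-contained numpy/scipy sketch with toy dimensions; all data below is made up for illustration:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

rng = np.random.default_rng(0)
X = csr_matrix(rng.random((6, 5)))      # toy term-document matrix: 6 terms x 5 docs

k = 3                                   # latent dimensions (the class uses k=1000)
u, s, vt = svds(X, k=k)                 # X ~ u @ diag(s) @ vt

q = rng.random((1, 6))                  # tf-idf weighted query over the 6 terms
Dq = q @ u @ np.linalg.inv(np.diag(s))  # fold the query into the latent space

docs = vt.T                             # one k-dimensional vector per document
sims = (docs @ Dq.ravel()) / (np.linalg.norm(docs, axis=1) * np.linalg.norm(Dq))
print(sims.argsort()[::-1])             # document indices ranked by similarity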