class LSI_TFIDF():
    lw = lg.LogWriter()
    # get files
    path = ''  # path name
    index_path = config['LSI_pickle_path']
    files = []
    documents = {}
    sortedDocuments = []
    contents = []
    X = None
    re = None
    word = None
    vectorizer = None
    tfidf = None
    s = None
    u = None
    d = None
    idf = None
    lineNo = {}
    expireTime = 30
    end_time = time.perf_counter()  # time.clock() was removed in Python 3.8

    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(self.path + "/" + file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    #store the line numbers of the term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        lineList = [line]
                        if len(line.strip()) > 0:  # skip empty lines
                            try:
                                self.tfidf.fit_transform(
                                    lineList
                                )  #get the unique standard term of this line
                            except ValueError:
                                j += 1
                                continue
                            for term in self.tfidf.vocabulary_:
                                if term in self.lineNo[file]:
                                    self.lineNo[file][term].append(j)
                                else:
                                    self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:
                self.files.remove(file)
                fs -= 1  # keep the loop bound in sync after removing a directory entry
        print('finish reading')
        # self.files = list(self.documents.keys())
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.stopwords = [
            'and', 'edition', 'for', 'in', 'little', 'of', 'the', 'to', 'print'
        ]
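        # Note: these stop words only take effect if they are passed to the
        # vectorizer, e.g. TfidfVectorizer(stop_words=self.stopwords); the
        # vectorizer created in this method does not receive them.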
        self.re = self.tfidf.fit_transform(
            self.contents).toarray().T  # tf-idf values
        self.idf = self.tfidf.idf_
        self.word = self.tfidf.get_feature_names()  # terms in feature-index order, aligned with idf_ and the rows of self.re

        # compress the dense tf-idf matrix into a sparse (DOK) representation
        self.re = dok_matrix(self.re)
        # self.X=dok_matrix(self.X)
        print("start SVD")
        # svd decomposition
        self.u, self.s, self.d = svds(
            self.re,
            k=500)  # keep both U and Vt: self.d (Vt) is needed at query time
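        # Truncated SVD for LSI: the term-document tf-idf matrix X (terms x docs)
        # is approximated as X ~= U * S * Vt with k = 500 latent dimensions, so
        # u has shape (n_terms, k), s holds the k singular values and d (Vt) has
        # shape (k, n_docs). Note that svds requires k < min(X.shape), i.e. more
        # than 500 terms and documents.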
        print('start dumping')
        # store the index into the pickle
        with open(
                self.index_path, 'wb'
        ) as f:  # use pickle module to save data into file 'CodexIndex.pik'
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
            print('finish')

    def getResult(self, query):
        self.vectorizer = CountVectorizer()
        # if the pickle file exists, read it
        if os.path.exists(self.index_path):
            print("in===1")
            rfile = open(self.index_path, 'rb')
            self.s = pickle.load(rfile)
            self.u = pickle.load(rfile)
            self.d = pickle.load(rfile)
            self.tfidf = pickle.load(rfile)
            self.lineNo = pickle.load(rfile)

            self.idf = self.tfidf.idf_
            self.word = self.tfidf.get_feature_names()  # feature-index order, aligned with idf_
            self.files = list(self.lineNo.keys())

        else:  # if there is no such pickle file, indexing
            self.indexing()

        l = self.MatrixSearching(query, self.s, self.u, self.d.T)
        if l is None:
            return Results.Results(0)
        print("in===2")
        results = Results.Results(numOfResults=l[3],
                                  matchingLines=l[2],
                                  hitDocs=l[1],
                                  fullHitLines=l[0])

        return results  # return results

    def MatrixSearching(self, query, s, u, d):

        qFreq = self.vectorizer.fit_transform(
            [query]).toarray().T  # make the vectorizer fit the query
        qWord = self.vectorizer.get_feature_names(
        )  # the unique terms after preprocessing
        qArr = np.zeros([1, len(self.word)])

        # fill the empty query vector Xq with tf-idf values
        ifEmpty = True
        for i, w in enumerate(qWord):
            if w in self.word:
                j = self.word.index(w)
                qArr[0][j] = qFreq[i] * self.idf[j]
                ifEmpty = False

        # give the warning and stop searching if no terms found
        if ifEmpty:
            self.lw.write_warning_log("Nothing found!")
            return None

        # project the query into LSI space: Dq = Xq^T * U * S^-1,
        # then score documents by cosine similarity against Dq
        sDiagno = np.diag(np.array(s))
        sInv = np.linalg.inv(sDiagno)
        Dq = np.dot(qArr, u)
        Dq = np.dot(Dq, sInv)
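        # Shapes involved in the fold-in (with k = 500 from indexing):
        #   qArr : (1, n_terms)  tf-idf vector of the query
        #   u    : (n_terms, k)  term-to-concept mapping
        #   sInv : (k, k)        inverse of the singular-value diagonal
        #   Dq   : (1, k)        the query in the reduced concept space
        # Each d[i] below is a length-k document vector (a row of Vt.T), so the
        # cosine similarity compares query and documents in concept space.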

        matchingLines = {}  # {similarity:[(docName, [hit lines])] }
        hitDocs = {}  # {lengthHits:[(docName,[hit lines])]}
        fullHitLines = {}  # {fullHitNum:[(docName,[hit lines])]}
        length = 0
        for i in range(len(d)):
            k = self.files[i]
            similarity = ((np.dot(Dq, d[i])) / ((np.linalg.norm(Dq)) *
                                                (np.linalg.norm(d[i]))))[0]
            length += 1
            hitLines = []
            hitWords = 0
            commonLines = []
            for t in qWord:
                if t in self.lineNo[k]:
                    hitWords += 1
                    hitLines = list(
                        set(hitLines).union(set(self.lineNo[k][t])))
                    if hitWords == 1:
                        commonLines = self.lineNo[k][t]
                    commonLines = list(
                        set(commonLines).intersection(set(self.lineNo[k][t])))
            lengthHit = len(hitLines) * hitWords
            if hitWords > 1:
                fullHit = len(commonLines)
            else:
                fullHit = 0
            if fullHit > 0:
                if fullHit in fullHitLines:
                    fullHitLines[fullHit].append((k, hitLines))
                else:
                    fullHitLines[fullHit] = [(k, hitLines)]
            elif lengthHit > 0:
                if lengthHit in hitDocs:
                    hitDocs[lengthHit].append((k, hitLines))
                else:
                    hitDocs[lengthHit] = [(k, hitLines)]
            else:
                if similarity > 0:
                    if similarity not in matchingLines:
                        matchingLines[similarity] = [(k, hitLines)]
                    else:
                        matchingLines[similarity].append((k, hitLines))
                else:
                    # don't store it
                    length -= 1

        return (fullHitLines, hitDocs, matchingLines, length)
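

# A minimal usage sketch for LSI_TFIDF (commented out; the corpus path below is
# hypothetical and config['LSI_pickle_path'] must point to a writable location):
#
#     lsi = LSI_TFIDF()
#     lsi.path = '/path/to/corpus'                 # folder of plain-text documents
#     lsi.indexing()                               # build and pickle the tf-idf/SVD index
#     res = lsi.getResult('binary search tree')    # returns a Results object
#
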
class JavaAST():
    r = redis.Redis(
        host='localhost', port=6379,
        decode_responses=True)  # 'host' is the Redis server; both the Redis server and client must be running (6379 is the default Redis port)
    lw = lg.LogWriter()
    path = ''  # path name
    index_path = configs['AST_java_pickle_path']

    weights = {}  # {weight:[fileNames] }
    fileIndex = {}  # {fileName: {weight: {nodeHash: (startLine, endLine)}}}
    files = []
    documents = {}
    lastLineNo = 0

    # these parameters should be tuned
    matchingThreshold = 0.6
    weightThreshold = 10  # only nodes whose weight exceeds weightThreshold are taken into consideration
    blockThreshold = 50  # a node whose weight exceeds blockThreshold is treated as a code block and included in the global (whole-program) matching
    pageNum = configs['page_num']
    wholeSimilarity = 0
    matchingBlock = {
    }  # {docID: (the startline and endline of the matching blocks)}.
    blockWeights = {
    }  # {docID: (startline, endline): weight of the biggest matching block}
    expireTime = 1

    def readFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(self.path + "/" + file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                # self.documents[file]=open(self.path+'/'+file,'r').read()
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        tree = javalang.parse.parse(
                            self.documents[file]['content'])
                    except (javalang.parser.JavaSyntaxError):
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.fileIndex[file] = {}
                    names = []  # user-defined identifier names (to be ignored)
                    self.lastLineNo = 0
                    self.index(tree, file, names, {}, {}, False)
                    # print(self.fileIndex[file])
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use pickle module to save data into file 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.fileIndex, f, True)

            # self.names=[]
            # tree=javalang.parse.parse(q3)
            # self.fileIndex['q3'] = {}
            # self.index(tree, 'q3')
            # print('#############################################')
            # for weight in self.fileIndex['q2']:
            #     if weight in self.fileIndex['q3']:
            #         print(weight)
            #         print(self.fileIndex['q2'][weight])
            #         print(self.fileIndex['q3'][weight])
            #     else:
            #         print(weight)
            #
            # print(self.fileIndex['q2'])
            # print(self.fileIndex['q3'])

    def index(self, root, fileName, names, nestHash, qLineNums, nestedDic):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0
        # print('-----------------------')
        # print(root)

        attriValues = ''  # "attr1 attr2 attr3"
        if isinstance(root, list) and len(root) == 0:
            return (weight, min, max, '')
        hashAttris = None
        if not isinstance(root, list):
            # if hasattr(root, "_position"):
            #     print(root._position)
            if isinstance(root, javalang.ast.Node):
                children = list(root.children)
            elif isinstance(root, tuple):
                children = root
            else:
                min = self.lastLineNo + 1
                max = self.lastLineNo + 1
                return (weight, min, max, attriValues)
            # get attributes information
            hasContent = False
            if hasattr(root, 'attrs'):
                attriValues += '( '
                for a in root.attrs:
                    v = root.__getattribute__(a)
                    if a != 'documentation':
                        # remove identifier names
                        # (except on javalang.tree.ReferenceType nodes)
                        if a == 'name' and (
                                not isinstance(root,
                                               javalang.tree.ReferenceType)
                                or v in names):
                            if v not in names:
                                names.append(v)
                            # print('================')
                            # print((type(root),a))
                            # print(v)
                            children.remove(v)
                            continue
                        elif a == 'member':
                            # PROBLEM: if the member is a method name not in self.names (defined below the current node), we will fail to ignore it

                            if v in names:
                                # print('~~~~~~~~~~~~~~~~~~~~~~~~~')
                                # print(v)
                                children.remove(v)
                                continue
                        elif a == 'qualifier':
                            # skip print statements (System.out)
                            if v == 'System.out':
                                return (0, min, max, None)
                            elif v in names:
                                children.remove(v)
                                continue
                        elif v == 'MethodInvocation':
                            if hasattr(v, 'qualifier') and v.__getattribute__(
                                    'qualifier') == 'System.out':
                                return (0, min, max, None)
                        # ignore values like strings, numbers, booleans, null
                        elif a == 'value' and type(v) is str:
                            children.remove(v)
                            continue
                        elif v != None and v != '' and not (isinstance(
                                v, list) and len(v) == 0):
                            if not isinstance(v, list):
                                if isinstance(
                                        v, javalang.tree.
                                        MethodInvocation) and hasattr(
                                            v, 'attrs') and v.__getattribute__(
                                                'qualifier') == 'System.out':
                                    return (0, min, max, None)

                                hasContent = True
                                # print(v)
                                if isinstance(v, set) and len(v) > 1:
                                    # print('//////////////////////////////////////')
                                    v1 = list(v)
                                    v1.sort()
                                    # print(v1)
                                    attriValues += str(v1) + ": "
                                else:
                                    # print('111111111111111111')
                                    # print(v)
                                    attriValues += str(v) + ": "
                                    # print(attriValues)

                            if isinstance(v, (javalang.ast.Node, tuple, list)):
                                children.remove(v)
                                t = self.index(v, fileName, names, nestHash,
                                               qLineNums, nestedDic)

                                weight += t[0]
                                if t[1] > 0:
                                    startLine = t[1]
                                    if i == 0:
                                        min = startLine
                                    elif startLine < min:
                                        min = startLine
                                    i += 1
                                if t[2] > 0:
                                    endLine = t[2]
                                    if endLine > max:
                                        max = endLine
                                    i += 1
                                if t[3] != '' and t[
                                        3] is not None and t[3] != '( ':
                                    hasContent = True
                                    attriValues += t[3] + ', '
                    else:
                        children.remove(v)
            if len(children) > 0:
                if not hasattr(root, 'attrs'):
                    attriValues += '( '

                for child in children:
                    # ignore some meaningless nodes
                    if child != None and child != '' and not isinstance(
                            child, list) and child not in names:
                        if isinstance(child, set) and len(child) > 1:
                            # print('//////////////////////////////////////')
                            child1 = list(child)
                            child1.sort()
                            # print(child1)
                            attriValues += str(child1) + ": "
                        else:
                            # print('22222222222222222')
                            # print(child)
                            attriValues += str(child) + ': '
                    if isinstance(child, (javalang.ast.Node, tuple, list)):
                        t = self.index(child, fileName, names, nestHash,
                                       qLineNums, nestedDic)

                        weight += t[0]
                        if t[1] > 0:
                            startLine = t[1]
                            if i == 0:
                                min = startLine
                            elif startLine < min:
                                min = startLine
                            i += 1
                        if t[2] > 0:
                            endLine = t[2]
                            if endLine > max:
                                max = endLine
                            i += 1
                        if t[3] != '' and t[3] is not None and t[3] != '( ':
                            hasContent = True
                            attriValues += t[3] + ', '

            if hasContent:
                attriValues += ' )'
            else:
                # no brackets
                attriValues = attriValues.lstrip('( ')
            # work out line number
            if hasattr(root, "_position"):
                lineNo = root._position[0]
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo

            # put the weight into weights
            if weight >= self.weightThreshold:
                if min == 0 and max == 0:
                    min = self.lastLineNo + 1
                    max = self.lastLineNo + 1
                self.lastLineNo = max
                if not nestedDic:
                    if weight in self.weights:
                        if fileName not in self.weights[weight]:
                            self.weights[weight].append(fileName)
                    else:
                        self.weights[weight] = [fileName]

                # hash the attribute values list
                m = hashlib.md5()
                m.update(attriValues.encode("utf8"))
                hashAttris = m.hexdigest()

                # print(',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,')
                # print(self.fileIndex[fileName])
                # put the node into fileIndex
                if weight not in self.fileIndex[fileName]:
                    self.fileIndex[fileName][weight] = {}
                self.fileIndex[fileName][weight][hashAttris] = (min, max)
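                # A hypothetical entry: fileIndex['Foo.java'][12][<md5 hex>]
                # == (40, 57) means a subtree of weight 12, whose canonical
                # attribute string hashes to <md5 hex>, spans lines 40-57 of
                # Foo.java.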

                # print(weight)
                # print(attriValues)
                # print((str(root),hashAttris,min,max))
                # put all its children in this file into the current node
                if nestedDic:
                    nestHash[(weight, hashAttris, min, max)] = {}
                    qLineNums[hashAttris] = (min, max)
                    for w in self.fileIndex[fileName]:
                        if w < weight:
                            keys = list(self.fileIndex[fileName][w].keys())
                            for k in keys:
                                t = self.fileIndex[fileName][w][k]
                                if t[0] >= min and t[1] <= max:
                                    # print('11111111111111111111111111111111')
                                    # print((w,k,t[0],t[1]))
                                    # print((weight,hashAttris,min,max))
                                    # the block is the sub node of the current node
                                    nestHash[(weight, hashAttris, min,
                                              max)][(w, k, t[0], t[1])] = {}
                                    self.fileIndex[fileName][w].pop(k)

                    # put the children in nestHash into the current node
                    keys2 = list(nestHash.keys())
                    for k in keys2:
                        if k in nestHash:
                            if k[0] < weight and k[2] >= min and k[3] <= max:
                                # print('!!!!!!!!!!!!!!!!!!!!!!!!!')
                                # print(k)
                                # print((weight,hashAttris,min,max))
                                nestHash[(weight, hashAttris, min,
                                          max)][k] = nestHash.pop(k)
            if max > 0:
                self.lastLineNo = max
            return (weight, min, max, attriValues)

        else:
            l = []
            length = len(root)
            j = 0
            while j < length:
                r = root[j]
                rStr = ''
                if r is not None and r != '':
                    t = self.index(r, fileName, names, nestHash, qLineNums,
                                   nestedDic)
                    weight += t[0]
                    if t[1] > 0:
                        startLine = t[1]
                        if i == 0:
                            min = startLine
                        elif startLine < min:
                            min = startLine
                        i += 1
                    if t[2] > 0:
                        endLine = t[2]
                        if endLine > max:
                            max = endLine
                        i += 1
                    if t[3] is not None:
                        rStr += str(r) + ':'
                        if t[3] != '':
                            rStr += t[3]
                        l.append(rStr)
                        j += 1
                    else:
                        root.pop(j)
                        length -= 1
                else:
                    root.pop(j)
                    length -= 1

            if len(l) > 0:
                # sort the list so that reordering statements does not change the result
                l.sort()
                attriValues += '[ ' + ''.join(l) + ' ]'
            if max > 0:
                self.lastLineNo = max
            return (weight, min, max, attriValues)

    # interface to front end. Input query, return a Result instance
    def getResults(self, query, page):
        globalSimilarity = None
        matchingBlocks = None
        componentDocuments = []
        if not self.r.exists(query):  # if the result is not in the redis
            # read pickle file
            if os.path.exists(self.index_path):
                rfile = open(self.index_path, 'rb')
                self.weights = pickle.load(rfile)
                self.fileIndex = pickle.load(rfile)
            else:
                self.readFiles()

            # store the result of the query into redis
            matchingLines = {}  # {fileName:[(qStart,qEnd, fStart,fEnd)]}
            similarities = self.search(query, matchingLines)
            if similarities is None:
                self.lw.write_error_log('Pickle files not found!')
                return None
            elif similarities == 0:
                return 0
            # get the normal relevant documents and the suspected plagiarized documents
            globalSimilarity = self.wholeSimilarity
            matchingBlocks = self.matchingBlock
            documentList = sorted(similarities,
                                  key=similarities.get,
                                  reverse=True)
            plagiarismList = []  # [sorted plagiarised files]
            i = 0
            for d in documentList:
                if similarities[d] > self.matchingThreshold:
                    plagiarismList.append(d)
                    # print(similarities[d])
                    # matchingLines[d].sort()
                    # print(matchingLines[d])
                    i += 1
                else:
                    break
            documentList = documentList[i:]
            componentDocuments = list(matchingBlocks.keys())
            # store data into the redis server
            self.lw.write_info_log(
                "storing results into redis in the form of a list")
            # redis stores bytes/strings, so push the Python reprs and read
            # them back with eval() below
            self.r.rpush(query, str(plagiarismList))
            self.r.rpush(query, str(documentList))
            self.r.rpush(query, str(matchingLines))
            if globalSimilarity >= self.matchingThreshold and len(
                    matchingBlocks) != 0 and len(componentDocuments) > 1:
                if len(plagiarismList) > 0:
                    if globalSimilarity >= similarities[plagiarismList[0]]:
                        self.r.rpush(query, globalSimilarity)
                        self.r.rpush(query, str(matchingBlocks))
                        self.r.rpush(query, str(componentDocuments))
                        self.r.rpush(query, str(self.blockWeights))
                    else:
                        componentDocuments = []
                        matchingBlocks = None
                        globalSimilarity = None
                else:
                    # if no plagiarised case is found, display the component programs
                    self.r.rpush(query, globalSimilarity)
                    self.r.rpush(query, str(matchingBlocks))
                    self.r.rpush(query, str(componentDocuments))
            else:
                componentDocuments = []
                matchingBlocks = None
                globalSimilarity = None

        # get the result list of this query from redis
        else:
            self.lw.write_info_log("geting results from redis")
            plagiarismList = eval(self.r.lindex(query, 0))
            documentList = eval(self.r.lindex(query, 1))
            matchingLines = eval(self.r.lindex(query, 2))
            if self.r.llen(query) >= 6:
                globalSimilarity = eval(self.r.lindex(query, 3))
                matchingBlocks = eval(self.r.lindex(query, 4))
                componentDocuments = eval(self.r.lindex(query, 5))
                if self.r.llen(query) >= 7:
                    # blockWeights is only stored in the plagiarism case
                    self.blockWeights = eval(self.r.lindex(query, 6))

        self.r.expire(query, self.expireTime)  # cached result expires after expireTime seconds

        # encapsulate results into a Results object
        documentListLength = len(documentList)
        plagiarismListLength = len(plagiarismList)
        matchingblocksLength = len(componentDocuments)
        length = documentListLength + plagiarismListLength + matchingblocksLength
        results = Results.Results(numOfResults=length,
                                  matchingLines=matchingLines,
                                  globalSimilarity=globalSimilarity,
                                  matchingBlocks=matchingBlocks,
                                  blockWeights=self.blockWeights)
        disMatchingBlocks = []
        disPlagiarismList = []
        disDocumentList = []
        if (
                page - 1
        ) * self.pageNum < matchingblocksLength:  # need to display the matching blocks
            disMatchingBlocks = componentDocuments[
                (page - 1) *
                self.pageNum:min(page * self.pageNum, matchingblocksLength)]
            results.setComponentDocuments(disMatchingBlocks)

        if (
                page - 1
        ) * self.pageNum < matchingblocksLength + plagiarismListLength and page * self.pageNum >= matchingblocksLength:
            # need to display the plagiarism documents
            if len(disMatchingBlocks) == 0 and page > 1:  # not start from 0
                disPlagiarismList = plagiarismList[
                    (page - 1) * self.pageNum - matchingblocksLength:min((
                        page * self.pageNum -
                        matchingblocksLength), plagiarismListLength)]
            else:  # start from 0
                disPlagiarismList = plagiarismList[
                    0:min(self.pageNum, plagiarismListLength)]
            results.setPlagiarismList(disPlagiarismList)

        if page * self.pageNum > matchingblocksLength + plagiarismListLength:  # need to display the relevant documents
            if len(disMatchingBlocks) == 0 and len(
                    disPlagiarismList
            ) == 0 and (page - 1) * self.pageNum <= length:  # not start from 0
                disDocumentList = documentList[
                    (page - 1) * self.pageNum - matchingblocksLength -
                    plagiarismListLength:min((
                        page * self.pageNum - matchingblocksLength -
                        plagiarismListLength), documentListLength)]
            elif (page - 1) * self.pageNum <= length:  # start from 0
                disDocumentList = documentList[0:min((
                    self.pageNum - matchingblocksLength -
                    plagiarismListLength), documentListLength)]
            else:
                self.lw.write_error_log("page number out of range")
                return None
            results.setDocumentList(disDocumentList)
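
        # Hypothetical paging example with pageNum = 10, 3 component documents,
        # 5 plagiarised documents and 20 relevant documents (length = 28):
        #   page 1 -> 3 component docs + 5 plagiarised docs + documentList[0:2]
        #   page 2 -> documentList[2:12]
        # i.e. the three result categories are concatenated and sliced per page.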

        # print('==============')
        # results.toString()
        return results

    def search(self, query, matchingLines
               ):  # matchingLines {fileName:[(qStart,qEnd, fStart,fEnd)]}
        # refresh the global variables
        self.wholeSimilarity = 0
        self.matchingBlock = {}
        self.blockWeights = {}
        qTree = {
        }  # {(weight,nodeHash,startLine, endLine):{nested dictionaries}}
        qLineNums = {}

        try:
            root = javalang.parse.parse(query)
        except javalang.parser.JavaSyntaxError:
            # getResults treats a return value of 0 as a syntax error in the query
            self.lw.write_error_log("syntax error in query!")
            return 0
        self.fileIndex['query'] = {}
        names = []
        self.lastLineNo = 0
        self.index(root, 'query', names, qTree, qLineNums, True)
        # print(qTree)
        # print(qLineNums)
        self.fileIndex.pop('query')
        similarities = {}  # {fileName:score}
        maxWeight = list(qTree.keys())[0][0]
        # print(maxWeight)
        self.similarities(qTree, self.weights, similarities, maxWeight,
                          qLineNums, matchingLines)

        # work out the global similarity
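        # blockWeights[doc] maps (qStartLine, qEndLine) in the query to the
        # weight of the biggest AST block matched against that document. For
        # each document whose biggest block exceeds blockThreshold, the loop
        # below keeps only blocks that are not contained in (or outweighed by)
        # blocks already stored in matchingBlock, and accumulates
        # wholeSimilarity += blockWeight / maxWeight for each block it keeps.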
        for dic in self.blockWeights:
            biggestKey = sorted(self.blockWeights[dic],
                                key=self.blockWeights[dic].get,
                                reverse=True)[0]
            if self.blockWeights[dic][biggestKey] > self.blockThreshold:
                ds = list(self.matchingBlock.keys())
                store = True
                for d in ds:
                    block = self.matchingBlock[d]
                    # do not store the new block if it is contained in an existing block in matchingBlock
                    if biggestKey[0] >= block[0] and biggestKey[1] <= block[1]:
                        store = False
                        break
                    # delete the older block included in the new block
                    elif biggestKey[0] <= block[0] and biggestKey[1] >= block[
                            1]:
                        self.matchingBlock.pop(d)
                        self.wholeSimilarity -= self.blockWeights[d][
                            block] / maxWeight

                    # handle blocks that partially overlap an existing block (keep the one with the bigger weight)
                    elif (biggestKey[0] <= block[1] and biggestKey[0] >=
                          block[0]) or (biggestKey[1] <= block[1]
                                        and biggestKey[1] >= block[0]):
                        if self.blockWeights[dic][
                                biggestKey] > self.blockWeights[d][block]:
                            self.matchingBlock.pop(d)
                            self.wholeSimilarity -= self.blockWeights[d][
                                block] / maxWeight
                        else:
                            store = False
                            break

                # store the new block
                if store:
                    self.matchingBlock[dic] = biggestKey
                    self.wholeSimilarity += self.blockWeights[dic][
                        biggestKey] / maxWeight

        return similarities

    # calculate the similarities between corpus and query
    def similarities(self, qTree, weights, similarities, maxWeight, qLineNums,
                     matchingLines):
        # matchingBlock: {docID: (the startline and endline of the matching blocks)}.
        # blockWeights: {docID: (qStartline, qEndline): weight of the biggest matching block}
        if maxWeight is None:
            maxWeight = 1
        for w in qTree:
            if isinstance(w, tuple):
                find = False
                if w[0] in weights:
                    for file in weights[w[0]]:
                        # check if the nodeHash is in this file
                        if w[1] in self.fileIndex[file][w[0]]:
                            find = True
                            qs = w[2]
                            qe = w[3]
                            fs = self.fileIndex[file][w[0]][w[1]][0]
                            fe = self.fileIndex[file][w[0]][w[1]][1]
                            if file in similarities:
                                matchingLines[file].append((qs, qe, fs, fe))
                                similarities[file] += w[0] / maxWeight
                            else:
                                matchingLines[file] = [(qs, qe, fs, fe)]
                                similarities[file] = w[0] / maxWeight

                            # merge lines in query program to construct the code blocks
                            forwMerge = False
                            BackMerge = False
                            if file not in self.blockWeights:
                                self.blockWeights[file] = {}
                            elif (qs, qe) in self.blockWeights[file]:
                                if w[0] > self.blockWeights[file][(qs, qe)]:
                                    self.blockWeights[file][(qs, qe)] = w[0]
                                continue
                            keys = list(self.blockWeights[file].keys())
                            for mLines in keys:
                                if mLines[1] < qs:
                                    insertion = False
                                    # check insertion
                                    for k in qLineNums:
                                        lines = qLineNums[k]
                                        if (lines[0] > mLines[1] and lines[0] <
                                                qs) or (lines[1] > mLines[1]
                                                        and lines[1] < qs):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            mLines[0],
                                            qe)] = w[0] + self.blockWeights[
                                                file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        forwMerge = True
                                elif mLines[0] > qe:
                                    insertion = False
                                    # check insertion
                                    for lines in qLineNums.values():
                                        if (lines[1] < mLines[0] and lines[1] >
                                                qe) or (lines[0] < mLines[0]
                                                        and lines[0] > qe):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            qs, mLines[1]
                                        )] = w[0] + self.blockWeights[file][
                                            mLines]
                                        self.blockWeights[file].pop(mLines)
                                        BackMerge = True
                                if forwMerge and BackMerge:
                                    break
                            if not forwMerge and not BackMerge:
                                self.blockWeights[file][(qs, qe)] = w[0]
                if not find and qTree[w] is not None:
                    if len(qTree[w]) > 0:
                        self.similarities(qTree[w], weights, similarities,
                                          maxWeight, qLineNums, matchingLines)

    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        print(dic['content'])
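

# A minimal usage sketch for JavaAST (commented out; the corpus path is
# hypothetical and a local redis server must be running for getResults):
#
#     ja = JavaAST()
#     ja.path = '/path/to/java/corpus'
#     ja.readFiles()                      # parse the corpus and pickle the index
#     res = ja.getResults(open('Query.java').read(), page=1)
#
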
class ASTSearching(Singleton):
    r = redis.Redis(
        host='localhost', port=6379,
        decode_responses=True)  # 'host' is the Redis server; both the Redis server and client must be running (6379 is the default Redis port)
    lw = lg.LogWriter()
    path = ""  # path name
    index_path = config['AST_python_pickle_path']
    files = []
    documents = {}
    # hashTrees={}#{fileName: {nodeHash: {nested dictionaries with hash values in stand of nodes} } }
    # -----compare with hashTrees and choose the efficient one-------
    hashDic = {}  # {fileName: {weight: [nodeHash]}}
    visitor = mv.MyVisitor()
    weights = {}  # {weight:[fileNames] }
    lineNums = {}  # {fileName: {nodeHash: (startLine, endLine)}}

    # these parameters should be tuned
    matchingThreshold = 0.6
    weightThreshold = 10  # only nodes whose weight exceeds weightThreshold are taken into consideration
    blockThreshold = 50  # a node whose weight exceeds blockThreshold is treated as a code block and included in the global (whole-program) matching
    pageNum = 10
    wholeSimilarity = 0
    matchingBlock = {
    }  # {docID: (the startline and endline of the matching blocks)}.
    blockWeights = {
    }  # {docID: (startline, endline): weight of the biggest matching block}
    expireTime = 1

    # parse the corpus
    def ReadFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(self.path + "/" + file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        root = ast.parse(str(self.documents[file]['content']))
                    except (SyntaxError):
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.visitor.visit(root)
                    self.lineNums[file] = {}
                    self.hashDic[file] = {}
                    self.Indexing(root, self.lineNums[file], self.weights,
                                  file)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use pickle module to save data into file 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.lineNums, f, True)
            pickle.dump(self.hashDic, f, True)

    # index every document's AST root
    def Indexing(self, node, lineNums, weights, fileName):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0
        if isinstance(node, ast.AST):
            m = hashlib.md5()
            m.update(ast.dump(node).encode("utf8"))
            nodeStr = m.hexdigest()
            # use a fresh loop variable so the md5 object 'm' above is not shadowed
            for _field, value in ast.iter_fields(node):
                tuple = self.Indexing(value, lineNums, weights, fileName)
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            if node._attributes:
                lineNo = getattr(node, 'lineno')
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo

            if weight >= self.weightThreshold:
                if weight in weights:
                    if fileName not in weights[weight]:
                        weights[weight].append(fileName)
                else:
                    weights[weight] = [fileName]
                # put the hash node into hash dictionary
                if weight in self.hashDic[fileName]:
                    self.hashDic[fileName][weight].append(nodeStr)
                else:
                    self.hashDic[fileName][weight] = [nodeStr]

                lineNums[nodeStr] = (min, max)

            return (weight, min, max)

        elif isinstance(node, list):
            for x in node:
                tuple = self.Indexing(x, lineNums, weights, fileName)
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1

            return (weight, min, max)
        return (weight, min, max)
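
    # What Indexing records, as a hypothetical example: a subtree of 'foo.py'
    # with weight 14, spanning lines 3-9, whose md5(ast.dump(node)) digest is
    # <digest>, ends up as
    #     weights[14]                        -> [..., 'foo.py']
    #     hashDic['foo.py'][14]              -> [..., <digest>]
    #     self.lineNums['foo.py'][<digest>]  -> (3, 9)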

    # interface to front end. Input query, return a Result instance
    def getResults(self, query, page):
        globalSimilarity = 0
        matchingBlocks = {}
        componentDocuments = []
        if not self.r.exists(query):  # if the result is not in the redis

            if os.path.exists(self.index_path):
                rfile = open(self.index_path, 'rb')
                self.weights = pickle.load(rfile)
                self.lineNums = pickle.load(rfile)
                self.hashDic = pickle.load(rfile)
            else:
                self.ReadFiles()

            # store the result of the query into redis
            matchingLines = {}  # {fileName:[(qStart,qEnd, fStart,fEnd)]}
            similarities = self.search(query, matchingLines)
            if similarities is None:
                self.lw.write_error_log('Pickle files not found!')
                return None
            elif similarities == 0:
                return 0
            # get the normal relevant documents and the suspected plagiarized documents
            globalSimilarity = self.wholeSimilarity
            matchingBlocks = self.matchingBlock
            documentList = sorted(similarities,
                                  key=similarities.get,
                                  reverse=True)
            plagiarismList = []  # [sorted plagiarised files]
            i = 0
            for d in documentList:
                if similarities[d] > self.matchingThreshold:
                    plagiarismList.append(d)
                    # print(similarities[d])
                    # matchingLines[d].sort()
                    # print(matchingLines[d])
                    i += 1
                else:
                    break
            documentList = documentList[i:]
            componentDocuments = list(matchingBlocks.keys())
            # store data into the redis server
            self.lw.write_info_log(
                "storing results into redis in the form of a list")
            # push the Python reprs; they are read back with eval() below
            self.r.rpush(query, str(plagiarismList))
            self.r.rpush(query, str(documentList))
            self.r.rpush(query, str(matchingLines))
            if globalSimilarity >= self.matchingThreshold and len(
                    matchingBlocks) != 0 and len(componentDocuments) > 1:
                if len(plagiarismList) > 0:
                    if globalSimilarity >= similarities[plagiarismList[0]]:
                        self.r.rpush(query, globalSimilarity)
                        self.r.rpush(query, str(matchingBlocks))
                        self.r.rpush(query, str(componentDocuments))
                    else:
                        componentDocuments = []
                        matchingBlocks = None
                        globalSimilarity = None
                else:
                    # if no plagiarised case is found, display the component programs
                    self.r.rpush(query, globalSimilarity)
                    self.r.rpush(query, str(matchingBlocks))
                    self.r.rpush(query, str(componentDocuments))
            else:
                componentDocuments = []
                matchingBlocks = None
                globalSimilarity = None

        # get the result list of this query from redis
        else:
            self.lw.write_info_log("geting results from redis")
            plagiarismList = eval(self.r.lindex(query, 0))
            documentList = eval(self.r.lindex(query, 1))
            matchingLines = eval(self.r.lindex(query, 2))
            if self.r.llen(query) >= 6:
                globalSimilarity = eval(self.r.lindex(query, 3))
                matchingBlocks = eval(self.r.lindex(query, 4))
                componentDocuments = eval(self.r.lindex(query, 5))

        self.r.expire(query, self.expireTime)  # cached result expires after expireTime seconds

        # encapsulate results into a Results object
        documentListLength = len(documentList)
        plagiarismListLength = len(plagiarismList)
        matchingblocksLength = len(componentDocuments)
        length = documentListLength + plagiarismListLength + matchingblocksLength
        results = Results.Results(numOfResults=length,
                                  matchingLines=matchingLines,
                                  globalSimilarity=globalSimilarity,
                                  matchingBlocks=matchingBlocks)
        disMatchingBlocks = []
        disPlagiarismList = []
        disDocumentList = []
        if (
                page - 1
        ) * self.pageNum < matchingblocksLength:  # need to display the matching blocks
            disMatchingBlocks = componentDocuments[
                (page - 1) *
                self.pageNum:min(page * self.pageNum, matchingblocksLength)]
            results.setComponentDocuments(disMatchingBlocks)

        if (
                page - 1
        ) * self.pageNum < matchingblocksLength + plagiarismListLength and page * self.pageNum >= matchingblocksLength:
            # need to display the plagiarism documents
            if len(disMatchingBlocks) == 0 and page > 1:  # not start from 0
                disPlagiarismList = plagiarismList[
                    (page - 1) * self.pageNum - matchingblocksLength:min((
                        page * self.pageNum -
                        matchingblocksLength), plagiarismListLength)]
            else:  # start from 0
                disPlagiarismList = plagiarismList[
                    0:min(self.pageNum, plagiarismListLength)]
            results.setPlagiarismList(disPlagiarismList)

        if page * self.pageNum > matchingblocksLength + plagiarismListLength:  # need to display the relevant documents
            if len(disMatchingBlocks) == 0 and len(
                    disPlagiarismList
            ) == 0 and (page - 1) * self.pageNum <= length:  # not start from 0
                disDocumentList = documentList[
                    (page - 1) * self.pageNum - matchingblocksLength -
                    plagiarismListLength:min((
                        page * self.pageNum - matchingblocksLength -
                        plagiarismListLength), documentListLength)]
            elif (page - 1) * self.pageNum <= length:  # start from 0
                disDocumentList = documentList[0:min((
                    self.pageNum - matchingblocksLength -
                    plagiarismListLength), documentListLength)]
            else:
                self.lw.write_error_log("page number out of range")
                return None
            results.setDocumentList(disDocumentList)

        # print('==============')
        # results.toString()
        return results

    # break the query tree into nodes and calculate their weights
    def queryWeight(self, node, lineNums, tree):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0

        if isinstance(node, ast.AST):
            m = hashlib.md5()
            m.update(ast.dump(node).encode("utf8"))
            nodeStr = m.hexdigest()
            tree[nodeStr] = {}
            # use a fresh loop variable so the md5 object 'm' above is not shadowed
            for _field, value in ast.iter_fields(node):
                tuple = self.queryWeight(value, lineNums, tree[nodeStr])
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            if node._attributes:
                lineNo = getattr(node, 'lineno')
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo
            if weight >= self.weightThreshold:
                lineNums[nodeStr] = (min, max)
                tree[(weight, nodeStr)] = tree.pop(nodeStr)
                if len(tree[(weight, nodeStr)]) == 0:
                    tree[(weight, nodeStr)] = None
            else:
                tree.pop(nodeStr)

            return (weight, min, max)

        elif isinstance(node, list):
            for x in node:
                tuple = self.queryWeight(x, lineNums, tree)
                weight += tuple[0]
                if tuple[1] > 0:
                    startLine = tuple[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if tuple[2] > 0:
                    endLine = tuple[2]
                    if endLine > max:
                        max = endLine
                    i += 1

            return (weight, min, max)
        return (weight, min, max)
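
    # queryWeight builds a nested tree keyed by (weight, nodeHash). As a
    # hypothetical example, a query whose root has weight 40 and two children of
    # weights 15 and 12 (all above weightThreshold, with no heavier descendants)
    # yields
    #     {(40, <rootHash>): {(15, <h1>): None, (12, <h2>): None}}
    # nodes below weightThreshold are dropped from the tree.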

    # search plagiarism code with query
    def search(self, query, matchingLines):
        # refresh the global variables
        self.wholeSimilarity = 0
        self.matchingBlock = {}
        self.blockWeights = {}
        qTree = {}  # {(weight,nodeHash):{nested dictionaries}}
        qLineNums = {}  # {nodeHash:(start,end)}
        try:
            qNode = ast.parse(query)
        except SyntaxError:
            self.lw.write_error_log("syntax error in query!")
            return 0
        self.visitor.visit(qNode)
        # print(ast.dump(qNode,include_attributes=True))
        self.queryWeight(qNode, qLineNums, qTree)
        # print(qTree)
        # print(qLineNums)
        maxWeight = list(qTree.keys())[0][0]
        similarities = {}  # {fileName:score}
        self.similarities(qTree, self.weights, similarities, maxWeight,
                          qLineNums, self.lineNums, matchingLines)

        # work out the global similarity
        for dic in self.blockWeights:
            biggestKey = sorted(self.blockWeights[dic],
                                key=self.blockWeights[dic].get,
                                reverse=True)[0]
            if self.blockWeights[dic][biggestKey] > self.blockThreshold:
                ds = list(self.matchingBlock.keys())
                store = True
                for d in ds:
                    block = self.matchingBlock[d]
                    # do not store the new block if it is contained in an existing block in matchingBlock
                    if biggestKey[0] >= block[0] and biggestKey[1] <= block[1]:
                        store = False
                        break
                    # delete the older block included in the new block
                    elif biggestKey[0] <= block[0] and biggestKey[1] >= block[
                            1]:
                        self.matchingBlock.pop(d)
                        self.wholeSimilarity -= self.blockWeights[d][
                            block] / maxWeight

                    # handle blocks that partially overlap an existing block (keep the one with the bigger weight)
                    elif (biggestKey[0] <= block[1] and biggestKey[0] >=
                          block[0]) or (biggestKey[1] <= block[1]
                                        and biggestKey[1] >= block[0]):
                        if self.blockWeights[dic][
                                biggestKey] > self.blockWeights[d][block]:
                            self.matchingBlock.pop(d)
                            self.wholeSimilarity -= self.blockWeights[d][
                                block] / maxWeight
                        else:
                            store = False
                            break

                # store the new block
                if store:
                    self.matchingBlock[dic] = biggestKey
                    self.wholeSimilarity += self.blockWeights[dic][
                        biggestKey] / maxWeight

        return similarities

    # calculate the similarities between corpus and query
    def similarities(self, qTree, weights, similarities, maxWeight, qLineNums,
                     lineNums, matchingLines):
        if maxWeight is None:
            maxWeight = 1
        for w in qTree:
            if isinstance(w, tuple):
                find = False
                if w[0] in weights:
                    for file in weights[w[0]]:
                        if w[1] in self.hashDic[file][w[0]]:
                            find = True
                            qs = qLineNums[w[1]][0]
                            qe = qLineNums[w[1]][1]
                            fs = lineNums[file][w[1]][0]
                            fe = lineNums[file][w[1]][1]
                            if file in similarities:
                                matchingLines[file].append((qs, qe, fs, fe))
                                similarities[file] += w[0] / maxWeight
                            else:
                                matchingLines[file] = [(qs, qe, fs, fe)]
                                similarities[file] = w[0] / maxWeight

                            # merge lines in query program to construct the code blocks
                            forwMerge = False
                            BackMerge = False
                            if file not in self.blockWeights:
                                self.blockWeights[file] = {}
                            elif (qs, qe) in self.blockWeights[file]:
                                if w[0] > self.blockWeights[file][(qs, qe)]:
                                    self.blockWeights[file][(qs, qe)] = w[0]
                                continue
                            keys = list(self.blockWeights[file].keys())
                            for mLines in keys:
                                if mLines[1] < qs:
                                    insertion = False
                                    # check insertion
                                    for k in qLineNums:
                                        lines = qLineNums[k]
                                        if (lines[0] > mLines[1] and lines[0] <
                                                qs) or (lines[1] > mLines[1]
                                                        and lines[1] < qs):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            mLines[0],
                                            qe)] = w[0] + self.blockWeights[
                                                file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        forwMerge = True
                                elif mLines[0] > qe:
                                    insertion = False
                                    # check insertion
                                    for lines in qLineNums.values():
                                        if (lines[1] < mLines[0] and lines[1] >
                                                qe) or (lines[0] < mLines[0]
                                                        and lines[0] > qe):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(
                                            qs, mLines[1]
                                        )] = w[0] + self.blockWeights[file][
                                            mLines]
                                        self.blockWeights[file].pop(mLines)
                                        backMerge = True
                                if forwMerge and backMerge:
                                    break
                            if not forwMerge and not backMerge:
                                self.blockWeights[file][(qs, qe)] = w[0]
                if not find and qTree[w] is not None:
                    if len(qTree[w]) > 0:
                        self.similarities(qTree[w], weights, similarities,
                                          maxWeight, qLineNums, lineNums,
                                          matchingLines)
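
    # Note on the merge logic above: every matched subtree contributes a query
    # line range (qs, qe) for a given file. If an already-stored range for that
    # file ends just before qs (or starts just after qe) and no other matched
    # range falls into the gap, the two are fused into one block and their
    # weights are summed; otherwise (qs, qe) is stored as a block of its own.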

    # find a key in a nested dictionary
    def dict_get(self, weight, d, objkey, default, weights, fileName):
        for k, v in d.items():
            # if the key is found, pop this node to avoid repeated searching
            if k == objkey:
                # weights[weight].remove(fileName)
                return d.pop(k)
            else:
                if isinstance(v, dict):
                    # if  k[0]>objkey[0]:
                    ret = self.dict_get(weight, v, objkey, default, weights,
                                        fileName)
                    if ret is not default:
                        return ret
        return default

    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)

        # return  self.compareQueries(dic['code'],q1)

    # compare whether two queries are the same using hash functions
    def compareQueries(self, query1, query2):
        h1 = self.nodeToHash(query1)
        h2 = self.nodeToHash(query2)
        return h1 == h2

    # parse a query into an AST, normalise it with the visitor and hash the dump
    def nodeToHash(self, node):
        qRoot = ast.parse(node)
        self.visitor.visit(qRoot)
        qt = ast.dump(qRoot)
        m = hashlib.md5()
        m.update(qt.encode("utf8"))
        h = m.hexdigest()
        return h
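
nodeToHash reduces a query to its (visitor-normalised) AST dump and hashes it with MD5, so compareQueries can check two snippets for structural equality without walking both trees. A minimal standalone sketch of the same idea, without the class's normalising visitor (the helper name below is illustrative, not part of the class):

import ast
import hashlib

def code_fingerprint(source):
    # whitespace and formatting vanish in the AST dump, so snippets that
    # differ only in layout produce the same fingerprint
    tree = ast.parse(source)
    return hashlib.md5(ast.dump(tree).encode("utf8")).hexdigest()

# same statements, different formatting -> identical hashes
print(code_fingerprint("x=1\ny=x+2") == code_fingerprint("x = 1\ny = x + 2"))  # True
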
Example #4
class LSI_TFIDF():
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    lw = lg.LogWriter()
    # get files
    path = ''  # path name
    index_path = configs['LSI_pickle_path']
    files = []
    documents = {}
    sortedDocuments = []
    contents = []
    X = None
    re = None
    word = None
    vectorizer = None
    tfidf = None
    s = None
    u = None
    d = None
    idf = None
    lineNo = {}
    expireTime = 600
    end_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    pageNum = configs['page_num']

    # def __init__(self):
    # self.vectorizer = CountVectorizer()
    # #if there exist the pickle file, read it
    # if os.path.exists(self.index_path):
    #     rfile=open(self.index_path, 'rb')
    #     self.s = pickle.load(rfile)
    #     self.u = pickle.load(rfile)
    #     self.d = pickle.load(rfile)
    #     self.tfidf = pickle.load(rfile)
    #     self.lineNo=pickle.load(rfile)
    #
    #     self.idf = self.tfidf.idf_
    #     self.word=list(self.tfidf.vocabulary_.keys())
    #     self.files=list(self.lineNo.keys())
    #
    # else:#if there is no such pickle file, indexing
    #     self.indexing()

    # indexing
    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    # store the line numbers of the term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        lineList = [line]
                        if len(lineList) > 0:
                            try:
                                self.tfidf.fit_transform(
                                    lineList
                                )  # get the unique standard term of this line
                            except ValueError:
                                j += 1
                                continue
                            for term in self.tfidf.vocabulary_:
                                if term in self.lineNo[file]:
                                    self.lineNo[file][term].append(j)
                                else:
                                    self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:  # it is a directory: drop it and shrink the bound
                self.files.remove(file)
                fs -= 1
        print('finish reading')
        # self.files = list(self.documents.keys())
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.re = self.tfidf.fit_transform(
            self.contents).toarray().T  # tf-idf values
        self.idf = self.tfidf.idf_
        self.word = list(self.tfidf.vocabulary_.keys())

        # compress the dense tf-idf matrix into a sparse representation
        self.re = dok_matrix(self.re)
        # self.X=dok_matrix(self.X)
        print("start SVD")
        # svd decomposition
        self.u, self.s, self.d = svds(self.re, k=1000)  # k must be smaller than min(self.re.shape)
        print('start dumping')
        # store the index into the pickle
        with open(
                self.index_path, 'wb'
        ) as f:  # use pickle module to save data into file 'CodexIndex.pik'
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
            print('finish')
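
    # Layout of the pickle index written above, in dump order:
    #   1. s      -- singular values from the truncated SVD
    #   2. u      -- term-to-latent-space matrix
    #   3. d      -- latent-space document representations (V^T)
    #   4. tfidf  -- the fitted TfidfVectorizer
    #   5. lineNo -- {file: {term: [line numbers]}}
    # getResult() loads these back in exactly the same order.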

    def getResult(self, query, page):
        if not self.r.exists(query):  # if the result is not in the redis
            self.vectorizer = CountVectorizer()
            # if the pickle file exists, read the index from it
            if os.path.exists(self.index_path):
                with open(self.index_path, 'rb') as rfile:
                    self.s = pickle.load(rfile)
                    self.u = pickle.load(rfile)
                    self.d = pickle.load(rfile)
                    self.tfidf = pickle.load(rfile)
                    self.lineNo = pickle.load(rfile)

                self.idf = self.tfidf.idf_
                self.word = list(self.tfidf.vocabulary_.keys())
                self.files = list(self.lineNo.keys())

            else:  # no pickle file yet, so build the index first
                self.indexing()

            result = self.MatrixSearching(query, self.s, self.u, self.d.T)
            if result is None:
                return (0, [])

            fullHitLines, hitDocs, matchingLines, numOfResults = result
            fullHitLineskeys = list(fullHitLines.keys())
            hitDocskeys = list(hitDocs.keys())
            matchingLineskeys = list(matchingLines.keys())
            fullHitLineskeys.sort(reverse=True)
            hitDocskeys.sort(reverse=True)
            matchingLineskeys.sort(reverse=True)
            displayList = []  # [(docName,[hit lines])]
            if len(fullHitLineskeys) > 0:
                for k in fullHitLineskeys:
                    for t in fullHitLines[k]:
                        displayList.append(t)
            if len(hitDocskeys) > 0:
                # print('================')
                for k in hitDocskeys:
                    for t in hitDocs[k]:
                        displayList.append(t)
            if len(matchingLines) > 0:
                for k in matchingLineskeys:
                    for t in matchingLines[k]:
                        displayList.append(t)

            self.lw.write_info_log(
                "storing results into redis as a list")
            self.r.rpush(query, numOfResults)
            self.r.rpush(query, str(displayList))  # redis list values must be str/bytes/numbers, not a Python list

        else:
            self.lw.write_info_log("geting results from redis")
            numOfResults = eval(self.r.lindex(query, 0))
            displayList = eval(self.r.lindex(query, 1))

        self.r.expire(query, self.expireTime)
        currentDisplay = displayList[(page - 1) * self.pageNum:page *
                                     self.pageNum]
        return (numOfResults, currentDisplay)
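
    # Cache layout in redis: each query key maps to a two-element list where
    # index 0 is the total number of results and index 1 is str(displayList);
    # getResult() parses them back with eval() and refreshes the key's TTL
    # (expireTime seconds) on every access.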

    def MatrixSearching(self, query, s, u, d):

        qFreq = self.vectorizer.fit_transform(
            [query]).toarray().T  # make the vectorizer fit the query
        # unique query terms after preprocessing (newer scikit-learn versions
        # rename this method to get_feature_names_out())
        qWord = self.vectorizer.get_feature_names()
        qArr = np.zeros([1, len(self.word)])

        # fill the query's tf-idf weights into the empty Xq vector
        ifEmpty = True
        for i, w in enumerate(qWord):
            if w in self.word:
                j = self.word.index(w)
                qArr[0][j] = qFreq[i][0] * self.idf[j]
                ifEmpty = False

        # log a warning and stop searching if none of the query terms are in the index vocabulary
        if ifEmpty:
            self.lw.write_warning_log("Nothing found!")
            return None

        # project the query into the latent space: Dq = Xq.T * U * S^-1
        sDiagno = np.diag(np.array(s))
        sInv = np.linalg.inv(sDiagno)
        Dq = np.dot(qArr, u)
        Dq = np.dot(Dq, sInv)

        matchingLines = {}  # {similarity:[(docName, [hit lines])] }
        hitDocs = {}  # {lengthHits:[(docName,[hit lines])]}
        fullHitLines = {}  # {fullHitNum:[(docName,[hit lines])]}
        length = 0
        for i in range(len(d)):
            k = self.files[i]
            similarity = ((np.dot(Dq, d[i])) / ((np.linalg.norm(Dq)) *
                                                (np.linalg.norm(d[i]))))[0]
            length += 1
            hitLines = []
            hitWords = 0
            ifMiss = False
            commonLines = []
            for t in qWord:
                if t in self.lineNo[k]:
                    hitWords += 1
                    hitLines = list(
                        set(hitLines).union(set(self.lineNo[k][t])))
                    if not ifMiss:
                        if hitWords == 1:
                            commonLines = self.lineNo[k][t]
                        else:
                            commonLines = list(
                                set(commonLines).intersection(
                                    set(self.lineNo[k][t])))
                else:
                    ifMiss = True
            lengthHit = len(hitLines) * hitWords
            if hitWords > 1 and not ifMiss:
                fullHit = len(commonLines)
            else:
                fullHit = 0
            if fullHit > 0:
                if fullHit in fullHitLines:
                    fullHitLines[fullHit].append((k, hitLines))
                else:
                    fullHitLines[fullHit] = [(k, hitLines)]
            elif lengthHit > 0 and len(qWord) == 1:
                # print('-----------')
                if lengthHit in hitDocs:
                    hitDocs[lengthHit].append((k, hitLines))
                else:
                    hitDocs[lengthHit] = [(k, hitLines)]
            else:
                if similarity > 0:
                    if similarity not in matchingLines:
                        matchingLines[similarity] = [(k, [])]
                    else:
                        matchingLines[similarity].append((k, []))
                else:
                    # don't store it
                    length -= 1
        # print(hitDocs)
        return (fullHitLines, hitDocs, matchingLines, length)
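
For reference, MatrixSearching follows the standard LSI fold-in step: the tf-idf query vector q is projected into the latent space as Dq = q * U * S^-1, and each document's latent vector d[i] is ranked by cosine similarity against Dq. A self-contained numpy/scipy sketch with toy dimensions; all data below is made up for illustration:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

rng = np.random.default_rng(0)
X = csr_matrix(rng.random((6, 5)))      # toy term-document matrix: 6 terms x 5 docs

k = 3                                   # latent dimensions (the class uses k=1000)
u, s, vt = svds(X, k=k)                 # X ~ u @ diag(s) @ vt

q = rng.random((1, 6))                  # tf-idf weighted query over the 6 terms
Dq = q @ u @ np.linalg.inv(np.diag(s))  # fold the query into the latent space

docs = vt.T                             # one k-dimensional vector per document
sims = (docs @ Dq.ravel()) / (np.linalg.norm(docs, axis=1) * np.linalg.norm(Dq))
print(sims.argsort()[::-1])             # document indices ranked by similarity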