class LSI_TFIDF():
    lw = lg.LogWriter()
    # corpus location and index location
    path = ''  # path name
    index_path = config['LSI_pickle_path']
    files = []
    documents = {}
    sortedDocuments = []
    contents = []
    X = None
    re = None
    word = None
    vectorizer = None
    tfidf = None
    s = None
    u = None
    d = None
    idf = None
    lineNo = {}
    expireTime = 30
    end_time = time.perf_counter()  # time.clock() was removed in Python 3.8

    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    # store the line numbers of each term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        lineList = [line]
                        try:
                            # get the unique standard terms of this line
                            self.tfidf.fit_transform(lineList)
                        except ValueError:
                            j += 1
                            continue
                        for term in self.tfidf.vocabulary_:
                            if term in self.lineNo[file]:
                                self.lineNo[file][term].append(j)
                            else:
                                self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:
                self.files.remove(file)
                fs -= 1
        print('finish reading')
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.stopwords = [
            'and', 'edition', 'for', 'in', 'little', 'of', 'the', 'to', 'print'
        ]
        # tf-idf matrix, one column per document
        self.re = self.tfidf.fit_transform(self.contents).toarray().T
        self.idf = self.tfidf.idf_
        self.word = list(self.tfidf.vocabulary_.keys())
        # compress the matrix
        self.re = dok_matrix(self.re)
        print("start SVD")
        # SVD decomposition; d (V^T) is needed at query time, so keep all singular vectors
        self.u, self.s, self.d = svds(self.re, k=500)
        print('start dumping')
        # use the pickle module to save the index to disk
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
        print('finish')

    def getResult(self, query):
        self.vectorizer = CountVectorizer()
        # if the pickle file exists, read it
        if os.path.exists(self.index_path):
            with open(self.index_path, 'rb') as rfile:
                self.s = pickle.load(rfile)
                self.u = pickle.load(rfile)
                self.d = pickle.load(rfile)
                self.tfidf = pickle.load(rfile)
                self.lineNo = pickle.load(rfile)
            self.idf = self.tfidf.idf_
            self.word = list(self.tfidf.vocabulary_.keys())
            self.files = list(self.lineNo.keys())
        else:  # if there is no such pickle file, build the index first
            self.indexing()
        l = self.MatrixSearching(query, self.s, self.u, self.d.T)
        if l is None:
            return Results.Results(0)
        results = Results.Results(numOfResults=l[3],
                                  matchingLines=l[2],
                                  hitDocs=l[1],
                                  fullHitLines=l[0])
        return results

    def MatrixSearching(self, query, s, u, d):
        # make the vectorizer fit the query
        qFreq = self.vectorizer.fit_transform([query]).toarray().T
        # the unique terms after preprocessing
        qWord = self.vectorizer.get_feature_names()
        qArr = np.zeros([1, len(self.word)])
        # fill the tf-idf values into the empty Xq matrix
        ifEmpty = True
        for i, w in enumerate(qWord):
            if w in self.word:
                j = self.word.index(w)
                qArr[0][j] = qFreq[i][0] * self.idf[j]
                ifEmpty = False
        # give a warning and stop searching if no terms were found
        if ifEmpty:
            self.lw.write_warning_log("Nothing found!")
            return None
        # project the query into LSI space: Dq = Xq^T * U * S^-1
        sDiagno = np.diag(np.array(s))
        sInv = np.linalg.inv(sDiagno)
        Dq = np.dot(qArr, u)
        Dq = np.dot(Dq, sInv)
        matchingLines = {}  # {similarity: [(docName, [hit lines])]}
        hitDocs = {}  # {lengthHits: [(docName, [hit lines])]}
        fullHitLines = {}  # {fullHitNum: [(docName, [hit lines])]}
        length = 0
        for i in range(len(d)):
            k = self.files[i]
            similarity = (np.dot(Dq, d[i]) /
                          (np.linalg.norm(Dq) * np.linalg.norm(d[i])))[0]
            length += 1
            hitLines = []
            hitWords = 0
            commonLines = []
            for t in qWord:
                if t in self.lineNo[k]:
                    hitWords += 1
                    hitLines = list(set(hitLines).union(set(self.lineNo[k][t])))
                    if hitWords == 1:
                        commonLines = self.lineNo[k][t]
                    commonLines = list(
                        set(commonLines).intersection(set(self.lineNo[k][t])))
            lengthHit = len(hitLines) * hitWords
            if hitWords > 1:
                fullHit = len(commonLines)
            else:
                fullHit = 0
            if fullHit > 0:
                if fullHit in fullHitLines:
                    fullHitLines[fullHit].append((k, hitLines))
                else:
                    fullHitLines[fullHit] = [(k, hitLines)]
            elif lengthHit > 0:
                if lengthHit in hitDocs:
                    hitDocs[lengthHit].append((k, hitLines))
                else:
                    hitDocs[lengthHit] = [(k, hitLines)]
            else:
                if similarity > 0:
                    if similarity not in matchingLines:
                        matchingLines[similarity] = [(k, hitLines)]
                    else:
                        matchingLines[similarity].append((k, hitLines))
                else:
                    # do not store it
                    length -= 1
        return (fullHitLines, hitDocs, matchingLines, length)
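# MatrixSearching above ranks documents with the standard LSI "fold-in": the query's
# tf-idf vector is projected into the latent space via Dq = q^T * U * S^-1 and compared
# against the document vectors (rows of V, i.e. self.d.T) by cosine similarity. The
# helper below is a minimal, self-contained sketch of just that calculation; the names
# query_vec and doc_vecs are illustrative and are not used by the class above.
def lsi_fold_in_similarities(query_vec, u, s, doc_vecs):
    """Project a raw tf-idf query vector into LSI space and return its cosine
    similarity against each already-projected document vector (one per row)."""
    import numpy as np  # local import keeps the sketch self-contained
    dq = query_vec @ u @ np.linalg.inv(np.diag(s))  # fold-in: q^T * U * S^-1
    dq_norm = np.linalg.norm(dq)
    return [float(np.dot(dq, dv) / (dq_norm * np.linalg.norm(dv))) for dv in doc_vecs]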
class JavaAST():
    # host is the local Redis server; both the Redis server and client must be running
    # (the default Redis port is 6379)
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    lw = lg.LogWriter()
    path = ''  # path name
    index_path = configs['AST_java_pickle_path']
    weights = {}  # {weight: [fileNames]}
    fileIndex = {}  # {fileName: {weight: {nodeHash: (startLine, endLine)}}}
    files = []
    documents = {}
    lastLineNo = 0
    # these parameters should be tuned
    matchingThreshold = 0.6
    weightThreshold = 10  # only nodes whose weight reaches weightThreshold are indexed
    blockThreshold = 50  # a node whose weight exceeds blockThreshold is treated as a code block and joins the global (whole-program) similarity
    pageNum = configs['page_num']
    wholeSimilarity = 0
    matchingBlock = {}  # {docID: (start line, end line) of the matching block}
    blockWeights = {}  # {docID: {(startLine, endLine): weight of the biggest matching block}}
    expireTime = 1

    def readFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        tree = javalang.parse.parse(self.documents[file]['content'])
                    except javalang.parser.JavaSyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # index the file, removing strings and variable names
                    self.fileIndex[file] = {}
                    names = []  # self-defined names
                    self.lastLineNo = 0
                    self.index(tree, file, names, {}, {}, False)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())
        self.lw.write_info_log("get " + str(len(self.documents)) + " documents")
        # use the pickle module to save the index to disk
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.fileIndex, f, True)

    def index(self, root, fileName, names, nestHash, qLineNums, nestedDic):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0
        attriValues = ''  # "attr1 attr2 attr3"
        if isinstance(root, list) and len(root) == 0:
            return (weight, min, max, '')
        hashAttris = None
        if not isinstance(root, list):
            if isinstance(root, javalang.ast.Node):
                children = list(root.children)
            elif isinstance(root, tuple):
                children = root
            else:
                min = self.lastLineNo + 1
                max = self.lastLineNo + 1
                return (weight, min, max, attriValues)
            # collect attribute information
            hasContent = False
            if hasattr(root, 'attrs'):
                attriValues += '( '
                for a in root.attrs:
                    v = getattr(root, a)
                    if a != 'documentation':
                        # remove identifier names (reference types keep theirs)
                        if a == 'name' and (not isinstance(root, javalang.tree.ReferenceType) or v in names):
                            if v not in names:
                                names.append(v)
                            children.remove(v)
                            continue
                        elif a == 'member':
                            # PROBLEM: if the member is a method name that is not yet in names
                            # (defined below the current node), we fail to ignore it
                            if v in names:
                                children.remove(v)
                                continue
                        elif a == 'qualifier':
                            # ignore printing statements
                            if v == 'System.out':
                                return (0, min, max, None)
                            elif v in names:
                                children.remove(v)
                                continue
                        elif v == 'MethodInvocation':
                            if hasattr(v, 'qualifier') and getattr(v, 'qualifier') == 'System.out':
                                return (0, min, max, None)
                        # ignore literal values such as strings, numbers, booleans, null
                        elif a == 'value' and type(v) is str:
                            children.remove(v)
                            continue
                        elif v is not None and v != '' and not (isinstance(v, list) and len(v) == 0):
                            if not isinstance(v, list):
                                if isinstance(v, javalang.tree.MethodInvocation) and hasattr(v, 'attrs') \
                                        and getattr(v, 'qualifier') == 'System.out':
                                    return (0, min, max, None)
                                hasContent = True
                                if isinstance(v, set) and len(v) > 1:
                                    v1 = list(v)
                                    v1.sort()
                                    attriValues += str(v1) + ": "
                                else:
                                    attriValues += str(v) + ": "
                            if isinstance(v, (javalang.ast.Node, tuple, list)):
                                children.remove(v)
                                t = self.index(v, fileName, names, nestHash, qLineNums, nestedDic)
                                weight += t[0]
                                if t[1] > 0:
                                    startLine = t[1]
                                    if i == 0:
                                        min = startLine
                                    elif startLine < min:
                                        min = startLine
                                    i += 1
                                if t[2] > 0:
                                    endLine = t[2]
                                    if endLine > max:
                                        max = endLine
                                    i += 1
                                if t[3] != '' and t[3] is not None and t[3] != '( ':
                                    hasContent = True
                                    attriValues += t[3] + ', '
                            else:
                                children.remove(v)
            if len(children) > 0:
                if not hasattr(root, 'attrs'):
                    attriValues += '( '
                for child in children:
                    # ignore meaningless nodes
                    if child is not None and child != '' and not isinstance(child, list) and child not in names:
                        if isinstance(child, set) and len(child) > 1:
                            child1 = list(child)
                            child1.sort()
                            attriValues += str(child1) + ": "
                        else:
                            attriValues += str(child) + ': '
                    if isinstance(child, (javalang.ast.Node, tuple, list)):
                        t = self.index(child, fileName, names, nestHash, qLineNums, nestedDic)
                        weight += t[0]
                        if t[1] > 0:
                            startLine = t[1]
                            if i == 0:
                                min = startLine
                            elif startLine < min:
                                min = startLine
                            i += 1
                        if t[2] > 0:
                            endLine = t[2]
                            if endLine > max:
                                max = endLine
                            i += 1
                        if t[3] != '' and t[3] is not None and t[3] != '( ':
                            hasContent = True
                            attriValues += t[3] + ', '
            if hasContent:
                attriValues += ' )'
            else:
                # no brackets
                attriValues = attriValues.lstrip('( ')
            # work out the line numbers
            if hasattr(root, "_position"):
                lineNo = root._position[0]
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo
            # record the weight in weights
            if weight >= self.weightThreshold:
                if min == 0 and max == 0:
                    min = self.lastLineNo + 1
                    max = self.lastLineNo + 1
                    self.lastLineNo = max
                if not nestedDic:
                    if weight in self.weights:
                        if fileName not in self.weights[weight]:
                            self.weights[weight].append(fileName)
                    else:
                        self.weights[weight] = [fileName]
                # hash the attribute values
                m = hashlib.md5()
                m.update(attriValues.encode("utf8"))
                hashAttris = m.hexdigest()
                # put the node into fileIndex
                if weight not in self.fileIndex[fileName]:
                    self.fileIndex[fileName][weight] = {}
                self.fileIndex[fileName][weight][hashAttris] = (min, max)
                # put all its children in this file under the current node
                if nestedDic:
                    nestHash[(weight, hashAttris, min, max)] = {}
                    qLineNums[hashAttris] = (min, max)
                    for w in self.fileIndex[fileName]:
                        if w < weight:
                            keys = list(self.fileIndex[fileName][w].keys())
                            for k in keys:
                                t = self.fileIndex[fileName][w][k]
                                if t[0] >= min and t[1] <= max:
                                    # the block is a sub-node of the current node
                                    nestHash[(weight, hashAttris, min, max)][(w, k, t[0], t[1])] = {}
                                    self.fileIndex[fileName][w].pop(k)
                    # move the children already in nestHash under the current node
                    keys2 = list(nestHash.keys())
                    for k in keys2:
                        if k in nestHash:
                            if k[0] < weight and k[2] >= min and k[3] <= max:
                                nestHash[(weight, hashAttris, min, max)][k] = nestHash.pop(k)
            if max > 0:
                self.lastLineNo = max
            return (weight, min, max, attriValues)
        else:
            l = []
            length = len(root)
            j = 0
            while j < length:
                r = root[j]
                rStr = ''
                if r is not None and r != '':
                    t = self.index(r, fileName, names, nestHash, qLineNums, nestedDic)
                    weight += t[0]
                    if t[1] > 0:
                        startLine = t[1]
                        if i == 0:
                            min = startLine
                        elif startLine < min:
                            min = startLine
                        i += 1
                    if t[2] > 0:
                        endLine = t[2]
                        if endLine > max:
                            max = endLine
                        i += 1
                    if t[3] is not None:
                        rStr += str(r) + ':'
                        if t[3] != '':
                            rStr += t[3]
                        l.append(rStr)
                        j += 1
                    else:
                        root.pop(j)
                        length -= 1
                else:
                    root.pop(j)
                    length -= 1
            if len(l) > 0:
                # sort the list so that reordering the code does not change the hash
                l.sort()
                attriValues += '[ ' + ''.join(l) + ' ]'
            if max > 0:
                self.lastLineNo = max
            return (weight, min, max, attriValues)

    # interface to the front end: take a query, return a Results instance
    def getResults(self, query, page):
        globalSimilarity = None
        matchingBlocks = None
        componentDocuments = []
        if not self.r.exists(query):  # the result is not cached in Redis yet
            # read the pickle file
            if os.path.exists(self.index_path):
                with open(self.index_path, 'rb') as rfile:
                    self.weights = pickle.load(rfile)
                    self.fileIndex = pickle.load(rfile)
            else:
                self.readFiles()
            # compute the result of the query and store it in Redis
            matchingLines = {}  # {fileName: [(qStart, qEnd, fStart, fEnd)]}
            similarities = self.search(query, matchingLines)
            if similarities is None:
                self.lw.write_error_log('Pickle files not found!')
                return None
            elif similarities == 0:
                return 0
            # split the ranking into normal relevant documents and suspected plagiarised documents
            globalSimilarity = self.wholeSimilarity
            matchingBlocks = self.matchingBlock
            documentList = sorted(similarities, key=similarities.get, reverse=True)
            plagiarismList = []  # [sorted plagiarised files]
            i = 0
            for d in documentList:
                if similarities[d] > self.matchingThreshold:
                    plagiarismList.append(d)
                    i += 1
                else:
                    break
            documentList = documentList[i:]
            componentDocuments = list(matchingBlocks.keys())
            # store the data in the Redis server
            self.lw.write_info_log("storing results into redis in form of list")
            self.r.rpush(query, str(plagiarismList))
            self.r.rpush(query, str(documentList))
            self.r.rpush(query, str(matchingLines))
            if globalSimilarity >= self.matchingThreshold and len(matchingBlocks) != 0 and len(componentDocuments) > 1:
                if len(plagiarismList) > 0:
                    if globalSimilarity >= similarities[plagiarismList[0]]:
                        self.r.rpush(query, globalSimilarity)
                        self.r.rpush(query, str(matchingBlocks))
                        self.r.rpush(query, str(componentDocuments))
                        self.r.rpush(query, str(self.blockWeights))
                    else:
                        componentDocuments = []
                        matchingBlocks = None
                        globalSimilarity = None
                else:
                    # if no plagiarised case is found, still display the component programs
                    self.r.rpush(query, globalSimilarity)
                    self.r.rpush(query, str(matchingBlocks))
                    self.r.rpush(query, str(componentDocuments))
            else:
                componentDocuments = []
                matchingBlocks = None
                globalSimilarity = None
        else:
            # get the result list of this query from Redis
            self.lw.write_info_log("getting results from redis")
            plagiarismList = eval(self.r.lindex(query, 0))
            documentList = eval(self.r.lindex(query, 1))
            matchingLines = eval(self.r.lindex(query, 2))
            if self.r.llen(query) >= 6:
                globalSimilarity = eval(self.r.lindex(query, 3))
                matchingBlocks = eval(self.r.lindex(query, 4))
                componentDocuments = eval(self.r.lindex(query, 5))
                if self.r.llen(query) >= 7:
                    self.blockWeights = eval(self.r.lindex(query, 6))
        self.r.expire(query, self.expireTime)  # let the cached result expire after expireTime seconds
        # encapsulate the results into a Results object
        documentListLength = len(documentList)
        plagiarismListLength = len(plagiarismList)
        matchingblocksLength = len(componentDocuments)
        length = documentListLength + plagiarismListLength + matchingblocksLength
        results = Results.Results(numOfResults=length,
                                  matchingLines=matchingLines,
                                  globalSimilarity=globalSimilarity,
                                  matchingBlocks=matchingBlocks,
                                  blockWeights=self.blockWeights)
        disMatchingBlocks = []
        disPlagiarismList = []
        disDocumentList = []
        if (page - 1) * self.pageNum < matchingblocksLength:
            # this page needs to display the matching blocks
            disMatchingBlocks = componentDocuments[(page - 1) * self.pageNum:
                                                   min(page * self.pageNum, matchingblocksLength)]
            results.setComponentDocuments(disMatchingBlocks)
        if (page - 1) * self.pageNum < matchingblocksLength + plagiarismListLength \
                and page * self.pageNum >= matchingblocksLength:
            # this page needs to display the plagiarism documents
            if len(disMatchingBlocks) == 0 and page > 1:  # does not start from 0
                disPlagiarismList = plagiarismList[
                    (page - 1) * self.pageNum - matchingblocksLength:
                    min(page * self.pageNum - matchingblocksLength, plagiarismListLength)]
            else:  # starts from 0
                disPlagiarismList = plagiarismList[0:min(self.pageNum, plagiarismListLength)]
            results.setPlagiarismList(disPlagiarismList)
        if page * self.pageNum > matchingblocksLength + plagiarismListLength:
            # this page needs to display the relevant documents
            if len(disMatchingBlocks) == 0 and len(disPlagiarismList) == 0 \
                    and (page - 1) * self.pageNum <= length:  # does not start from 0
                disDocumentList = documentList[
                    (page - 1) * self.pageNum - matchingblocksLength - plagiarismListLength:
                    min(page * self.pageNum - matchingblocksLength - plagiarismListLength, documentListLength)]
            elif (page - 1) * self.pageNum <= length:  # starts from 0
                disDocumentList = documentList[
                    0:min(self.pageNum - matchingblocksLength - plagiarismListLength, documentListLength)]
            else:
                self.lw.write_error_log("page number out of range")
                return None
            results.setDocumentList(disDocumentList)
        return results

    def search(self, query, matchingLines):
        # matchingLines: {fileName: [(qStart, qEnd, fStart, fEnd)]}
        # reset the global variables
        self.wholeSimilarity = 0
        self.matchingBlock = {}
        self.blockWeights = {}
        qTree = {}  # {(weight, nodeHash, startLine, endLine): {nested dictionaries}}
        qLineNums = {}
        try:
            root = javalang.parse.parse(query)
        except javalang.parser.JavaSyntaxError:
            self.lw.write_error_log("syntax error in query!")
            return 0
        self.fileIndex['query'] = {}
        names = []
        self.lastLineNo = 0
        self.index(root, 'query', names, qTree, qLineNums, True)
        self.fileIndex.pop('query')
        similarities = {}  # {fileName: score}
        maxWeight = list(qTree.keys())[0][0]
        self.similarities(qTree, self.weights, similarities, maxWeight, qLineNums, matchingLines)
        # work out the global similarity
        for dic in self.blockWeights:
            biggestKey = sorted(self.blockWeights[dic],
                                key=self.blockWeights[dic].get,
                                reverse=True)[0]
            if self.blockWeights[dic][biggestKey] > self.blockThreshold:
                ds = list(self.matchingBlock.keys())
                store = True
                for d in ds:
                    block = self.matchingBlock[d]
                    # do not store the new block if it is contained in an existing block
                    if biggestKey[0] >= block[0] and biggestKey[1] <= block[1]:
                        store = False
                        break
                    # delete an older block contained in the new block
                    elif biggestKey[0] <= block[0] and biggestKey[1] >= block[1]:
                        self.matchingBlock.pop(d)
                        self.wholeSimilarity -= self.blockWeights[d][block] / maxWeight
                    # blocks that partially overlap an old block: keep the one with the bigger weight
                    elif (block[0] <= biggestKey[0] <= block[1]) or (block[0] <= biggestKey[1] <= block[1]):
                        if self.blockWeights[dic][biggestKey] > self.blockWeights[d][block]:
                            self.matchingBlock.pop(d)
                            self.wholeSimilarity -= self.blockWeights[d][block] / maxWeight
                        else:
                            store = False
                            break
                # store the new block
                if store:
                    self.matchingBlock[dic] = biggestKey
                    self.wholeSimilarity += self.blockWeights[dic][biggestKey] / maxWeight
        return similarities

    # calculate the similarities between the corpus and the query
    def similarities(self, qTree, weights, similarities, maxWeight, qLineNums, matchingLines):
        # matchingBlock: {docID: (start line, end line) of the matching block}
        # blockWeights: {docID: {(qStartLine, qEndLine): weight of the biggest matching block}}
        if maxWeight is None:
            maxWeight = 1
        for w in qTree:
            if isinstance(w, tuple):
                find = False
                if w[0] in weights:
                    for file in weights[w[0]]:
                        # check whether the node hash appears in this file
                        if w[1] in self.fileIndex[file][w[0]]:
                            find = True
                            qs = w[2]
                            qe = w[3]
                            fs = self.fileIndex[file][w[0]][w[1]][0]
                            fe = self.fileIndex[file][w[0]][w[1]][1]
                            if file in similarities:
                                matchingLines[file].append((qs, qe, fs, fe))
                                similarities[file] += w[0] / maxWeight
                            else:
                                matchingLines[file] = [(qs, qe, fs, fe)]
                                similarities[file] = w[0] / maxWeight
                            # merge lines in the query program to construct the code blocks
                            forwMerge = False
                            backMerge = False
                            if file not in self.blockWeights:
                                self.blockWeights[file] = {}
                            elif (qs, qe) in self.blockWeights[file]:
                                if w[0] > self.blockWeights[file][(qs, qe)]:
                                    self.blockWeights[file][(qs, qe)] = w[0]
                                continue
                            keys = list(self.blockWeights[file].keys())
                            for mLines in keys:
                                if mLines[1] < qs:
                                    # check whether another indexed node is inserted between the blocks
                                    insertion = False
                                    for k in qLineNums:
                                        lines = qLineNums[k]
                                        if (mLines[1] < lines[0] < qs) or (mLines[1] < lines[1] < qs):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(mLines[0], qe)] = \
                                            w[0] + self.blockWeights[file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        forwMerge = True
                                elif mLines[0] > qe:
                                    # check whether another indexed node is inserted between the blocks
                                    insertion = False
                                    for lines in qLineNums.values():
                                        if (qe < lines[1] < mLines[0]) or (qe < lines[0] < mLines[0]):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(qs, mLines[1])] = \
                                            w[0] + self.blockWeights[file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        backMerge = True
                                if forwMerge and backMerge:
                                    break
                            if not forwMerge and not backMerge:
                                self.blockWeights[file][(qs, qe)] = w[0]
                if not find and qTree[w] is not None:
                    if len(qTree[w]) > 0:
                        self.similarities(qTree[w], weights, similarities,
                                          maxWeight, qLineNums, matchingLines)

    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        print(dic['content'])
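# Minimal usage sketch for the JavaAST index above (illustrative only, never called by
# the class itself): the corpus folder name and the query string are assumptions, and a
# running local Redis server plus a writable configs['AST_java_pickle_path'] are required.
def _java_ast_demo():
    searcher = JavaAST()
    searcher.path = 'corpus/java'  # hypothetical folder of .java files
    searcher.readFiles()           # parse and index the corpus, then pickle the index
    query = 'class Demo { int sum(int[] a) { int s = 0; for (int x : a) s += x; return s; } }'
    results = searcher.getResults(query, 1)  # first result page
    # getResults returns None on errors, 0 for an unparsable query, or a Results object
    if results not in (None, 0):
        print(results)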
class ASTSearching(Singleton):
    # host is the local Redis server; both the Redis server and client must be running
    # (the default Redis port is 6379)
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    lw = lg.LogWriter()
    path = ""  # path name
    index_path = config['AST_python_pickle_path']
    files = []
    documents = {}
    # hashTrees = {}  # {fileName: {nodeHash: {nested dictionaries with hash values instead of nodes}}}
    # -----compare with hashTrees and choose the more efficient one-------
    hashDic = {}  # {fileName: {weight: [nodeHash]}}
    visitor = mv.MyVisitor()
    weights = {}  # {weight: [fileNames]}
    lineNums = {}  # {fileName: {nodeHash: (startLine, endLine)}}
    # these parameters should be tuned
    matchingThreshold = 0.6
    weightThreshold = 10  # only nodes whose weight reaches weightThreshold are indexed
    blockThreshold = 50  # a node whose weight exceeds blockThreshold is treated as a code block and joins the global (whole-program) similarity
    pageNum = 10
    wholeSimilarity = 0
    matchingBlock = {}  # {docID: (start line, end line) of the matching block}
    blockWeights = {}  # {docID: {(startLine, endLine): weight of the biggest matching block}}
    expireTime = 1

    # parse the corpus
    def ReadFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        root = ast.parse(str(self.documents[file]['content']))
                    except SyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.visitor.visit(root)
                    self.lineNums[file] = {}
                    self.hashDic[file] = {}
                    self.Indexing(root, self.lineNums[file], self.weights, file)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())
        self.lw.write_info_log("get " + str(len(self.documents)) + " documents")
        # use the pickle module to save the index to disk
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.lineNums, f, True)
            pickle.dump(self.hashDic, f, True)

    # turn a document's AST into index entries
    def Indexing(self, node, lineNums, weights, fileName):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0
        if isinstance(node, ast.AST):
            m = hashlib.md5()
            m.update(ast.dump(node).encode("utf8"))
            nodeStr = m.hexdigest()
            for name, field in ast.iter_fields(node):
                t = self.Indexing(field, lineNums, weights, fileName)
                weight += t[0]
                if t[1] > 0:
                    startLine = t[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if t[2] > 0:
                    endLine = t[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            if node._attributes:
                lineNo = getattr(node, 'lineno')
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo
            if weight >= self.weightThreshold:
                if weight in weights:
                    if fileName not in weights[weight]:
                        weights[weight].append(fileName)
                else:
                    weights[weight] = [fileName]
                # put the hashed node into the hash dictionary
                if weight in self.hashDic[fileName]:
                    self.hashDic[fileName][weight].append(nodeStr)
                else:
                    self.hashDic[fileName][weight] = [nodeStr]
                lineNums[nodeStr] = (min, max)
            return (weight, min, max)
        elif isinstance(node, list):
            for x in node:
                t = self.Indexing(x, lineNums, weights, fileName)
                weight += t[0]
                if t[1] > 0:
                    startLine = t[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if t[2] > 0:
                    endLine = t[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            return (weight, min, max)
        return (weight, min, max)

    # interface to the front end: take a query, return a Results instance
    def getResults(self, query, page):
        globalSimilarity = 0
        matchingBlocks = {}
        componentDocuments = []
        if not self.r.exists(query):  # the result is not cached in Redis yet
            if os.path.exists(self.index_path):
                with open(self.index_path, 'rb') as rfile:
                    self.weights = pickle.load(rfile)
                    self.lineNums = pickle.load(rfile)
                    self.hashDic = pickle.load(rfile)
            else:
                self.ReadFiles()
            # compute the result of the query and store it in Redis
            matchingLines = {}  # {fileName: [(qStart, qEnd, fStart, fEnd)]}
            similarities = self.search(query, matchingLines)
            if similarities is None:
                self.lw.write_error_log('Pickle files not found!')
                return None
            elif similarities == 0:
                return 0
            # split the ranking into normal relevant documents and suspected plagiarised documents
            globalSimilarity = self.wholeSimilarity
            matchingBlocks = self.matchingBlock
            documentList = sorted(similarities, key=similarities.get, reverse=True)
            plagiarismList = []  # [sorted plagiarised files]
            i = 0
            for d in documentList:
                if similarities[d] > self.matchingThreshold:
                    plagiarismList.append(d)
                    i += 1
                else:
                    break
            documentList = documentList[i:]
            componentDocuments = list(matchingBlocks.keys())
            # store the data in the Redis server
            self.lw.write_info_log("storing results into redis in form of list")
            self.r.rpush(query, str(plagiarismList))
            self.r.rpush(query, str(documentList))
            self.r.rpush(query, str(matchingLines))
            if globalSimilarity >= self.matchingThreshold and len(matchingBlocks) != 0 and len(componentDocuments) > 1:
                if len(plagiarismList) > 0:
                    if globalSimilarity >= similarities[plagiarismList[0]]:
                        self.r.rpush(query, globalSimilarity)
                        self.r.rpush(query, str(matchingBlocks))
                        self.r.rpush(query, str(componentDocuments))
                    else:
                        componentDocuments = []
                        matchingBlocks = None
                        globalSimilarity = None
                else:
                    # if no plagiarised case is found, still display the component programs
                    self.r.rpush(query, globalSimilarity)
                    self.r.rpush(query, str(matchingBlocks))
                    self.r.rpush(query, str(componentDocuments))
            else:
                componentDocuments = []
                matchingBlocks = None
                globalSimilarity = None
        else:
            # get the result list of this query from Redis
            self.lw.write_info_log("getting results from redis")
            plagiarismList = eval(self.r.lindex(query, 0))
            documentList = eval(self.r.lindex(query, 1))
            matchingLines = eval(self.r.lindex(query, 2))
            if self.r.llen(query) >= 6:
                globalSimilarity = eval(self.r.lindex(query, 3))
                matchingBlocks = eval(self.r.lindex(query, 4))
                componentDocuments = eval(self.r.lindex(query, 5))
        self.r.expire(query, self.expireTime)  # let the cached result expire after expireTime seconds
        # encapsulate the results into a Results object
        documentListLength = len(documentList)
        plagiarismListLength = len(plagiarismList)
        matchingblocksLength = len(componentDocuments)
        length = documentListLength + plagiarismListLength + matchingblocksLength
        results = Results.Results(numOfResults=length,
                                  matchingLines=matchingLines,
                                  globalSimilarity=globalSimilarity,
                                  matchingBlocks=matchingBlocks)
        disMatchingBlocks = []
        disPlagiarismList = []
        disDocumentList = []
        if (page - 1) * self.pageNum < matchingblocksLength:
            # this page needs to display the matching blocks
            disMatchingBlocks = componentDocuments[(page - 1) * self.pageNum:
                                                   min(page * self.pageNum, matchingblocksLength)]
            results.setComponentDocuments(disMatchingBlocks)
        if (page - 1) * self.pageNum < matchingblocksLength + plagiarismListLength \
                and page * self.pageNum >= matchingblocksLength:
            # this page needs to display the plagiarism documents
            if len(disMatchingBlocks) == 0 and page > 1:  # does not start from 0
                disPlagiarismList = plagiarismList[
                    (page - 1) * self.pageNum - matchingblocksLength:
                    min(page * self.pageNum - matchingblocksLength, plagiarismListLength)]
            else:  # starts from 0
                disPlagiarismList = plagiarismList[0:min(self.pageNum, plagiarismListLength)]
            results.setPlagiarismList(disPlagiarismList)
        if page * self.pageNum > matchingblocksLength + plagiarismListLength:
            # this page needs to display the relevant documents
            if len(disMatchingBlocks) == 0 and len(disPlagiarismList) == 0 \
                    and (page - 1) * self.pageNum <= length:  # does not start from 0
                disDocumentList = documentList[
                    (page - 1) * self.pageNum - matchingblocksLength - plagiarismListLength:
                    min(page * self.pageNum - matchingblocksLength - plagiarismListLength, documentListLength)]
            elif (page - 1) * self.pageNum <= length:  # starts from 0
                disDocumentList = documentList[
                    0:min(self.pageNum - matchingblocksLength - plagiarismListLength, documentListLength)]
            else:
                self.lw.write_error_log("page number out of range")
                return None
            results.setDocumentList(disDocumentList)
        return results

    # break the query tree into nodes and calculate their weights
    def queryWeight(self, node, lineNums, tree):
        weight = 1
        min = 0
        max = 0
        i = 0
        startLine = 0
        endLine = 0
        if isinstance(node, ast.AST):
            m = hashlib.md5()
            m.update(ast.dump(node).encode("utf8"))
            nodeStr = m.hexdigest()
            tree[nodeStr] = {}
            for name, field in ast.iter_fields(node):
                t = self.queryWeight(field, lineNums, tree[nodeStr])
                weight += t[0]
                if t[1] > 0:
                    startLine = t[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if t[2] > 0:
                    endLine = t[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            if node._attributes:
                lineNo = getattr(node, 'lineno')
                if min == 0 and max == 0:
                    min = lineNo
                    max = lineNo
            if weight >= self.weightThreshold:
                lineNums[nodeStr] = (min, max)
                tree[(weight, nodeStr)] = tree.pop(nodeStr)
                if len(tree[(weight, nodeStr)]) == 0:
                    tree[(weight, nodeStr)] = None
            else:
                tree.pop(nodeStr)
            return (weight, min, max)
        elif isinstance(node, list):
            for x in node:
                t = self.queryWeight(x, lineNums, tree)
                weight += t[0]
                if t[1] > 0:
                    startLine = t[1]
                    if i == 0:
                        min = startLine
                    elif startLine < min:
                        min = startLine
                    i += 1
                if t[2] > 0:
                    endLine = t[2]
                    if endLine > max:
                        max = endLine
                    i += 1
            return (weight, min, max)
        return (weight, min, max)

    # search the corpus for plagiarised code matching the query
    def search(self, query, matchingLines):
        # reset the global variables
        self.wholeSimilarity = 0
        self.matchingBlock = {}
        self.blockWeights = {}
        qTree = {}  # {(weight, nodeHash): {nested dictionaries}}
        qLineNums = {}  # {nodeHash: (start, end)}
        try:
            qNode = ast.parse(query)
        except SyntaxError:
            self.lw.write_error_log("syntax error in query!")
            return 0
        self.visitor.visit(qNode)
        self.queryWeight(qNode, qLineNums, qTree)
        maxWeight = list(qTree.keys())[0][0]
        similarities = {}  # {fileName: score}
        self.similarities(qTree, self.weights, similarities, maxWeight,
                          qLineNums, self.lineNums, matchingLines)
        # work out the global similarity
        for dic in self.blockWeights:
            biggestKey = sorted(self.blockWeights[dic],
                                key=self.blockWeights[dic].get,
                                reverse=True)[0]
            if self.blockWeights[dic][biggestKey] > self.blockThreshold:
                ds = list(self.matchingBlock.keys())
                store = True
                for d in ds:
                    block = self.matchingBlock[d]
                    # do not store the new block if it is contained in an existing block
                    if biggestKey[0] >= block[0] and biggestKey[1] <= block[1]:
                        store = False
                        break
                    # delete an older block contained in the new block
                    elif biggestKey[0] <= block[0] and biggestKey[1] >= block[1]:
                        self.matchingBlock.pop(d)
                        self.wholeSimilarity -= self.blockWeights[d][block] / maxWeight
                    # blocks that partially overlap an old block: keep the one with the bigger weight
                    elif (block[0] <= biggestKey[0] <= block[1]) or (block[0] <= biggestKey[1] <= block[1]):
                        if self.blockWeights[dic][biggestKey] > self.blockWeights[d][block]:
                            self.matchingBlock.pop(d)
                            self.wholeSimilarity -= self.blockWeights[d][block] / maxWeight
                        else:
                            store = False
                            break
                # store the new block
                if store:
                    self.matchingBlock[dic] = biggestKey
                    self.wholeSimilarity += self.blockWeights[dic][biggestKey] / maxWeight
        return similarities

    # calculate the similarities between the corpus and the query
    def similarities(self, qTree, weights, similarities, maxWeight, qLineNums,
                     lineNums, matchingLines):
        if maxWeight is None:
            maxWeight = 1
        for w in qTree:
            if isinstance(w, tuple):
                find = False
                if w[0] in weights:
                    for file in weights[w[0]]:
                        if w[1] in self.hashDic[file][w[0]]:
                            find = True
                            qs = qLineNums[w[1]][0]
                            qe = qLineNums[w[1]][1]
                            fs = lineNums[file][w[1]][0]
                            fe = lineNums[file][w[1]][1]
                            if file in similarities:
                                matchingLines[file].append((qs, qe, fs, fe))
                                similarities[file] += w[0] / maxWeight
                            else:
                                matchingLines[file] = [(qs, qe, fs, fe)]
                                similarities[file] = w[0] / maxWeight
                            # merge lines in the query program to construct the code blocks
                            forwMerge = False
                            backMerge = False
                            if file not in self.blockWeights:
                                self.blockWeights[file] = {}
                            elif (qs, qe) in self.blockWeights[file]:
                                if w[0] > self.blockWeights[file][(qs, qe)]:
                                    self.blockWeights[file][(qs, qe)] = w[0]
                                continue
                            keys = list(self.blockWeights[file].keys())
                            for mLines in keys:
                                if mLines[1] < qs:
                                    # check whether another indexed node is inserted between the blocks
                                    insertion = False
                                    for k in qLineNums:
                                        lines = qLineNums[k]
                                        if (mLines[1] < lines[0] < qs) or (mLines[1] < lines[1] < qs):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(mLines[0], qe)] = \
                                            w[0] + self.blockWeights[file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        forwMerge = True
                                elif mLines[0] > qe:
                                    # check whether another indexed node is inserted between the blocks
                                    insertion = False
                                    for lines in qLineNums.values():
                                        if (qe < lines[1] < mLines[0]) or (qe < lines[0] < mLines[0]):
                                            insertion = True
                                            break
                                    if not insertion:
                                        self.blockWeights[file][(qs, mLines[1])] = \
                                            w[0] + self.blockWeights[file][mLines]
                                        self.blockWeights[file].pop(mLines)
                                        backMerge = True
                                if forwMerge and backMerge:
                                    break
                            if not forwMerge and not backMerge:
                                self.blockWeights[file][(qs, qe)] = w[0]
                if not find and qTree[w] is not None:
                    if len(qTree[w]) > 0:
                        self.similarities(qTree[w], weights, similarities,
                                          maxWeight, qLineNums, lineNums,
                                          matchingLines)

    # find a key in a nested dictionary
    def dict_get(self, weight, d, objkey, default, weights, fileName):
        for k, v in d.items():
            # if the key is found, delete this node (avoid repeated searching)
            if k == objkey:
                return d.pop(k)
            else:
                if isinstance(v, dict):
                    ret = self.dict_get(weight, v, objkey, default, weights, fileName)
                    if ret is not default:
                        return ret
        return default

    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        # return self.compareQueries(dic['code'], q1)

    # check whether two queries are identical using hash functions
    def compareQueries(self, query1, query2):
        h1 = self.nodeToHash(query1)
        h2 = self.nodeToHash(query2)
        return h1 == h2

    # parse a query and hash its normalised AST
    def nodeToHash(self, node):
        qRoot = ast.parse(node)
        self.visitor.visit(qRoot)
        qt = ast.dump(qRoot)
        m = hashlib.md5()
        m.update(qt.encode("utf8"))
        h = m.hexdigest()
        return h
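# compareQueries above hashes the ast.dump of a MyVisitor-normalised tree, so two
# snippets that differ only in the details stripped by the visitor produce the same
# digest. The helper below is a minimal, standard-library-only sketch of the same idea;
# it skips the MyVisitor step, so it only recognises exact structural matches
# (identifiers and literals included).
def _structural_hash(source):
    import ast
    import hashlib
    return hashlib.md5(ast.dump(ast.parse(source)).encode('utf8')).hexdigest()
# e.g. _structural_hash('x = 1\nprint(x)') == _structural_hash('x = 1\nprint(x)')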
class LSI_TFIDF():
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    lw = lg.LogWriter()
    # corpus location and index location
    path = ''  # path name
    index_path = configs['LSI_pickle_path']
    files = []
    documents = {}
    sortedDocuments = []
    contents = []
    X = None
    re = None
    word = None
    vectorizer = None
    tfidf = None
    s = None
    u = None
    d = None
    idf = None
    lineNo = {}
    expireTime = 600
    end_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    pageNum = configs['page_num']

    # indexing
    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(file):  # skip sub-folders
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    # store the line numbers of each term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        lineList = [line]
                        try:
                            # get the unique standard terms of this line
                            self.tfidf.fit_transform(lineList)
                        except ValueError:
                            j += 1
                            continue
                        for term in self.tfidf.vocabulary_:
                            if term in self.lineNo[file]:
                                self.lineNo[file][term].append(j)
                            else:
                                self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:
                self.files.remove(file)
                fs -= 1
        print('finish reading')
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        # tf-idf matrix, one column per document
        self.re = self.tfidf.fit_transform(self.contents).toarray().T
        self.idf = self.tfidf.idf_
        self.word = list(self.tfidf.vocabulary_.keys())
        # compress the matrix
        self.re = dok_matrix(self.re)
        print("start SVD")
        # SVD decomposition
        self.u, self.s, self.d = svds(self.re, k=1000)
        print('start dumping')
        # use the pickle module to save the index to disk
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
        print('finish')

    def getResult(self, query, page):
        if not self.r.exists(query):  # the result is not cached in Redis yet
            self.vectorizer = CountVectorizer()
            # if the pickle file exists, read it
            if os.path.exists(self.index_path):
                with open(self.index_path, 'rb') as rfile:
                    self.s = pickle.load(rfile)
                    self.u = pickle.load(rfile)
                    self.d = pickle.load(rfile)
                    self.tfidf = pickle.load(rfile)
                    self.lineNo = pickle.load(rfile)
                self.idf = self.tfidf.idf_
                self.word = list(self.tfidf.vocabulary_.keys())
                self.files = list(self.lineNo.keys())
            else:  # if there is no such pickle file, build the index first
                self.indexing()
            l = self.MatrixSearching(query, self.s, self.u, self.d.T)
            if l is None:
                return (0, [])
            fullHitLines = l[0]
            hitDocs = l[1]
            matchingLines = l[2]
            numOfResults = l[3]
            fullHitLineskeys = list(fullHitLines.keys())
            hitDocskeys = list(hitDocs.keys())
            matchingLineskeys = list(matchingLines.keys())
            fullHitLineskeys.sort(reverse=True)
            hitDocskeys.sort(reverse=True)
            matchingLineskeys.sort(reverse=True)
            # rank full hits first, then partial hits, then LSI matches
            displayList = []  # [(docName, [hit lines])]
            for k in fullHitLineskeys:
                for t in fullHitLines[k]:
                    displayList.append(t)
            for k in hitDocskeys:
                for t in hitDocs[k]:
                    displayList.append(t)
            for k in matchingLineskeys:
                for t in matchingLines[k]:
                    displayList.append(t)
            self.lw.write_info_log("storing results into redis in form of list")
            self.r.rpush(query, numOfResults)
            self.r.rpush(query, str(displayList))
        else:
            self.lw.write_info_log("getting results from redis")
            numOfResults = eval(self.r.lindex(query, 0))
            displayList = eval(self.r.lindex(query, 1))
        self.r.expire(query, self.expireTime)  # let the cached result expire after expireTime seconds
        currentDisplay = displayList[(page - 1) * self.pageNum:page * self.pageNum]
        return (numOfResults, currentDisplay)

    def MatrixSearching(self, query, s, u, d):
        # make the vectorizer fit the query
        qFreq = self.vectorizer.fit_transform([query]).toarray().T
        # the unique terms after preprocessing
        qWord = self.vectorizer.get_feature_names()
        qArr = np.zeros([1, len(self.word)])
        # fill the tf-idf values into the empty Xq matrix
        ifEmpty = True
        for i, w in enumerate(qWord):
            if w in self.word:
                j = self.word.index(w)
                qArr[0][j] = qFreq[i][0] * self.idf[j]
                ifEmpty = False
        # give a warning and stop searching if no terms were found
        if ifEmpty:
            self.lw.write_warning_log("Nothing found!")
            return None
        # project the query into LSI space: Dq = Xq^T * U * S^-1
        sDiagno = np.diag(np.array(s))
        sInv = np.linalg.inv(sDiagno)
        Dq = np.dot(qArr, u)
        Dq = np.dot(Dq, sInv)
        matchingLines = {}  # {similarity: [(docName, [hit lines])]}
        hitDocs = {}  # {lengthHits: [(docName, [hit lines])]}
        fullHitLines = {}  # {fullHitNum: [(docName, [hit lines])]}
        length = 0
        for i in range(len(d)):
            k = self.files[i]
            similarity = (np.dot(Dq, d[i]) /
                          (np.linalg.norm(Dq) * np.linalg.norm(d[i])))[0]
            length += 1
            hitLines = []
            hitWords = 0
            ifMiss = False
            commonLines = []
            for t in qWord:
                if t in self.lineNo[k]:
                    hitWords += 1
                    hitLines = list(set(hitLines).union(set(self.lineNo[k][t])))
                    if not ifMiss:
                        if hitWords == 1:
                            commonLines = self.lineNo[k][t]
                        else:
                            commonLines = list(
                                set(commonLines).intersection(set(self.lineNo[k][t])))
                else:
                    ifMiss = True
            lengthHit = len(hitLines) * hitWords
            if hitWords > 1 and not ifMiss:
                fullHit = len(commonLines)
            else:
                fullHit = 0
            if fullHit > 0:
                if fullHit in fullHitLines:
                    fullHitLines[fullHit].append((k, hitLines))
                else:
                    fullHitLines[fullHit] = [(k, hitLines)]
            elif lengthHit > 0 and len(qWord) == 1:
                if lengthHit in hitDocs:
                    hitDocs[lengthHit].append((k, hitLines))
                else:
                    hitDocs[lengthHit] = [(k, hitLines)]
            else:
                if similarity > 0:
                    if similarity not in matchingLines:
                        matchingLines[similarity] = [(k, [])]
                    else:
                        matchingLines[similarity].append((k, []))
                else:
                    # do not store it
                    length -= 1
        return (fullHitLines, hitDocs, matchingLines, length)
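# Minimal usage sketch for the Redis-cached LSI_TFIDF class above (assumptions: the
# corpus folder name and the example query are placeholders, and a local Redis server
# is running on the default port; indexing() runs automatically the first time because
# no pickle file exists yet).
if __name__ == '__main__':
    engine = LSI_TFIDF()
    engine.path = 'corpus/text'  # hypothetical corpus folder
    total, first_page = engine.getResult('binary search tree', 1)
    print(total)  # total number of matching documents
    for doc_name, hit_lines in first_page:  # (docName, [hit line numbers]) pairs
        print(doc_name, hit_lines)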