def nls_result(request):
    q = request.GET['q']
    p = int(request.GET['p'])
    tfidf = NLP_LSI_TFIDF()
    # result[0]: number of hits, result[1]: (file name, matching line indices) pairs
    result = tfidf.getResult(query=q, page=p)
    pages = []
    f = result[1]
    total_p = (result[0] / 10) + 1
    t_p = int(total_p)
    p_p = max(p - 5, 1)
    n_p = min(p + 5, total_p)
    while total_p > 0:
        pages.append(0)
        total_p -= 1
    files = []
    for f_f in f:
        f_name = f_f[0]
        temp = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/so/" + f_name)
        # 1-based matching line numbers joined as a comma-separated string
        m_l = ','.join(str(t_f_f + 1) for t_f_f in f_f[1])
        fei = FrontEndInterface(temp, m_l)
        files.append(fei)
    return render(request, 'nlp-result.html', {
        'results': files, 'q': q, 'p': p, 'pages': pages, 'p_p': p_p,
        'n_p': n_p, 'pre': p - 1, 'next': p + 1, 't_p': t_p
    })
def readFiles(self):
    self.lw.write_info_log("reading files...")
    self.files = os.listdir(self.path)  # get all the file names
    if '.DS_Store' in self.files:
        self.files.remove('.DS_Store')
    for file in self.files:  # go through the folder
        if not os.path.isdir(self.path + "/" + file):  # judge if it is a folder
            self.documents[file] = conv.to_dic(self.path + "/" + file)
            # self.documents[file]=open(self.path+'/'+file,'r').read()
            if len(self.documents[file]['content'].strip()) > 0:
                try:
                    tree = javalang.parse.parse(self.documents[file]['content'])
                except javalang.parser.JavaSyntaxError:
                    self.lw.write_error_log("syntax error! " + file)
                    continue
                # remove strings and variable names
                self.fileIndex[file] = {}
                names = []  # self defined name
                self.lastLineNo = 0
                self.index(tree, file, names, {}, {}, False)
                # print(self.fileIndex[file])
            else:
                self.documents.pop(file)
    self.files = list(self.documents.keys())
    self.lw.write_info_log("get " + str(len(self.documents)) + " documents")
    # use pickle module to save data into file 'CodexIndexAST.pik'
    with open(self.index_path, 'wb') as f:
        pickle.dump(self.weights, f, True)
        pickle.dump(self.fileIndex, f, True)
def ReadFiles(self):
    self.lw.write_info_log("reading files...")
    self.files = os.listdir(self.path)  # get all the file names
    if '.DS_Store' in self.files:
        self.files.remove('.DS_Store')
    for file in self.files:  # go through the folder
        if not os.path.isdir(self.path + "/" + file):  # judge if it is a folder
            self.documents[file] = conv.to_dic(self.path + "/" + file)
            if len(self.documents[file]['content'].strip()) > 0:
                try:
                    root = ast.parse(str(self.documents[file]['content']))
                except SyntaxError:
                    self.lw.write_error_log("syntax error! " + file)
                    continue
                # remove strings and variable names
                self.visitor.visit(root)
                self.lineNums[file] = {}
                self.hashDic[file] = {}
                self.Indexing(root, self.lineNums[file], self.weights, file)
            else:
                self.documents.pop(file)
    self.files = list(self.documents.keys())
    self.lw.write_info_log("get " + str(len(self.documents)) + " documents")
    # use pickle module to save data into file 'CodexIndexAST.pik'
    with open(self.index_path, 'wb') as f:
        pickle.dump(self.weights, f, True)
        pickle.dump(self.lineNums, f, True)
        pickle.dump(self.hashDic, f, True)
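# For reference, a minimal sketch of reading the pickled index written above back in.
# The standalone loader and its name are assumptions, not project code; the firm fact
# is that pickle.load returns objects in the order they were dumped (weights, then
# lineNums, then hashDic, which presumably maps files to subtree hashes).
def load_ast_index(index_path):
    import pickle
    with open(index_path, 'rb') as f:
        weights = pickle.load(f)    # term/node weights
        lineNums = pickle.load(f)   # per-file line-number index
        hashDic = pickle.load(f)    # per-file hash dictionary
    return weights, lineNums, hashDic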
def detail(request):
    id = request.GET['id']
    ml = request.GET['ml']
    m_l = ml
    fci_obj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/lsi/" + id + '.json')
    return render(request, 'detail.html', {
        'detail': fci_obj,
        'match_lines': m_l
    })
def search(request):
    q = request.GET['q']
    p = int(request.GET['p'])
    timestamp = time.time()
    client = Client("as", "137.43.92.9", 9609, {
        'operate_type': 1,
        'query': q,
        'page': p,
        'timestamp': str(timestamp)
    })
    client.send_message()
    server = CommunicationServer()
    message = server.receive_message(socket_name=str(timestamp))
    result = message['result']
    # tfidf = LSI_TFIDF()
    # result = tfidf.getResult(query=q, page=p)
    pages = []
    f = result[1]
    total_p = (result[0] / configs['others']['page_num']) + 1
    t_p = int(total_p)
    p_p = max(p - 5, 1)
    n_p = min(p + 5, total_p)
    while total_p > 0:
        pages.append(0)
        total_p -= 1
    files = []
    for f_f in f:
        f_name = f_f[0]
        temp = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/lsi/" + f_name)
        # 1-based matching line numbers joined as a comma-separated string
        m_l = ','.join(str(t_f_f + 1) for t_f_f in f_f[1])
        fei = FrontEndInterface(temp, m_l)
        files.append(fei)
    return render(request, 'search-result.html', {
        'results': files, 'q': q, 'p': p, 'pages': pages, 'p_p': p_p,
        'n_p': n_p, 'pre': p - 1, 'next': p + 1, 't_p': t_p
    })
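# The views above derive the page count as (numOfResults / page_num) + 1 with float
# division, which yields one page too many when the hit count is an exact multiple of
# the page size. A small sketch of the equivalent ceiling division (the function and
# argument names are illustrative, not part of the project):
def total_pages(num_results, page_size):
    # ceil(num_results / page_size) without floats; at least one page for the template
    return max((num_results + page_size - 1) // page_size, 1)

# e.g. total_pages(20, 10) == 2, whereas int(20 / 10) + 1 == 3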
def snippet_detail(request):
    id = request.GET['id']
    ml = request.GET['ml']
    qml = request.GET['qml']
    timestamp = request.GET['ts']
    l = int(request.GET['l'])
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    q = r.get(timestamp)  # the query snippet cached by plagiarizeResult under this key
    if l == 3:
        language = 'python'
    else:
        language = 'java'
    m_l = ml
    fci_obj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/" + language + "/" + id + '.json')
    return render(request, 'snippet-detail.html', {
        'detail': fci_obj,
        'match_lines': m_l,
        'query_match_lines': qml,
        'query': q
    })
def import_in(self, filename):
    dic = conv.to_dic(file_name=filename)
    # return self.compareQueries(dic['code'], q1)

# compare if two queries are the same using hash functions
def compareQueries(self, query1, query2):
    h1 = self.nodeToHash(query1)
    h2 = self.nodeToHash(query2)
    return h1 == h2

# parse a query
def nodeToHash(self, node):
    qRoot = ast.parse(node)
    self.visitor.visit(qRoot)
    qt = ast.dump(qRoot)
    m = hashlib.md5()
    m.update(qt.encode("utf8"))
    h = m.hexdigest()
    return h
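# A self-contained sketch of the idea behind nodeToHash/compareQueries: two snippets
# are considered equal when the md5 digests of their dumped ASTs match. Without the
# project's visitor (self.visitor), which presumably normalises identifiers and
# literals, this version only ignores formatting and comments.
import ast
import hashlib

def snippet_hash(code):
    return hashlib.md5(ast.dump(ast.parse(code)).encode("utf8")).hexdigest()

# e.g. snippet_hash("x = 1 + 2") == snippet_hash("x = 1    +    2")  # True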
def import_in(self, filename):
    dic = conv.to_dic(file_name=filename)
    print(dic['content'])
def plagiarizeResult(request):
    snippet = request.POST['snippet']
    page = int(request.POST['p'])
    operate_type = request.POST['l']
    timestamp = time.time()
    # cache the query snippet in Redis under an md5 key so snippet_detail can fetch it later
    m = hashlib.md5()
    m.update((str(timestamp) + snippet).encode("utf8"))
    ts = m.hexdigest()
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    r.set(ts, snippet, ex=3000)
    operate_type = int(operate_type)
    # timestamp = time.time()
    # client = Client("yeats.ucd.ie", "10.141.131.14", 9609,
    #                 {'operate_type': operate_type, 'query': snippet, 'page': page, 'timestamp': timestamp})
    # client.send_message()
    # server = CommunicationServer()
    # message = server.receive_message(socket_name=str(timestamp))
    # result = message['result']
    ast = None
    if operate_type == 3:
        # ast = ASTSearching()
        ast = 3
        language = 'python'
    else:
        # ast = JavaAST()
        ast = 4
        language = 'java'
    timestamp = time.time()
    client = Client("as", "137.43.92.9", 9609, {
        'operate_type': ast,
        'query': snippet,
        'page': page,
        'timestamp': str(timestamp)
    })
    client.send_message()
    server = CommunicationServer()
    message = server.receive_message(socket_name=str(timestamp))
    result = message['result']
    # result = ast.getResults(snippet, page)
    if result == 0:
        return render(request, 'snippet-result.html', {'snippet': snippet})
    result = result.to_dict()
    is_global = False
    plagiarize_list = []
    document_list = []
    component_document = []
    global_similarity = 0
    if result is not None:
        total_num = result['numOfResults']
        total_page = (total_num / config.configs['others']['page_num']) + 1
        matching_blocks = result['matchingBlocks']
        global_similarity = result['globalSimilarity']
        matching_lines = result['matchingLines']
        blockWeights = result['blockWeights']
        if global_similarity is not None and global_similarity > 0:
            is_global = True
            cd = result['componentDocuments']
            component_document = []
            for c in cd:
                # query-side block range and document-side line ranges for this component
                qml = str(matching_blocks[c][0]) + '-' + str(matching_blocks[c][1])
                ml = ''
                for mls in matching_lines[c]:
                    ml += str(mls[2]) + '-' + str(mls[3]) + ','
                fobj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/" + language + "/" + c)
                fei = FrontEndInterface(fobj, ml)
                fei.set_query_match_lines(qml)
                component_document.append(fei)
        for t in result['plagiarismList']:
            ml = ''
            qml = ''
            for mls in matching_lines[t]:
                qml += str(mls[0]) + '-' + str(mls[1]) + ','
                ml += str(mls[2]) + '-' + str(mls[3]) + ','
            fobj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/" + language + "/" + t)
            fei = FrontEndInterface(fobj, ml)
            fei.set_query_match_lines(qml)
            plagiarize_list.append(fei)
        for t in result['documentList']:
            ml = ''
            qml = ''
            for mls in matching_lines[t]:
                qml += str(mls[0]) + '-' + str(mls[1]) + ','
                ml += str(mls[2]) + '-' + str(mls[3]) + ','
            fobj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/" + language + "/" + t)
            fei = FrontEndInterface(fobj, ml)
            fei.set_query_match_lines(qml)
            document_list.append(fei)
        if global_similarity is not None:
            global_similarity *= 100
            global_similarity = '%.2f' % global_similarity
    return render(request, 'snippet-result.html', {
        'snippet': snippet,
        'is_global': is_global,
        'component_documents': component_document,
        'global_similarity': global_similarity,
        'plagiarize_list': plagiarize_list,
        'document_list': document_list,
        'l': operate_type,
        'ts': ts
    })
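# The hand-off between plagiarizeResult and snippet_detail uses Redis as a short-lived
# cache: the snippet is stored under an md5 key with a 3000-second expiry, the key is
# passed to the template as 'ts', and the detail view reads it back. A minimal
# standalone sketch of that pattern (the sample snippet and names are illustrative):
import hashlib
import time
import redis

r = redis.Redis(host='localhost', port=6379, decode_responses=True)
snippet = "print('hello')"
ts = hashlib.md5((str(time.time()) + snippet).encode("utf8")).hexdigest()
r.set(ts, snippet, ex=3000)   # cache the snippet for 3000 seconds
assert r.get(ts) == snippet   # a later request recovers it by key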
def indexing(self):
    self.lw.write_info_log("reading files...")
    self.files = os.listdir(self.path)  # get all the file names
    if '.DS_Store' in self.files:
        self.files.remove('.DS_Store')
    fs = len(self.files)
    self.tfidf = TfidfVectorizer()
    i = 0
    while i < fs:  # go through the folder
        file = self.files[i]
        if not os.path.isdir(self.path + "/" + file):  # judge if it is a folder
            self.documents[file] = conv.to_dic(self.path + "/" + file)
            if len(self.documents[file]['content'].strip()) > 0:
                self.contents.append(self.documents[file]['content'])
                # store the line numbers of the term
                self.lineNo[file] = {}
                j = 0
                for line in self.documents[file]['content'].split('\n'):
                    lineList = [line]
                    if len(lineList) > 0:
                        try:
                            # get the unique standard term of this line
                            self.tfidf.fit_transform(lineList)
                        except ValueError:
                            j += 1
                            continue
                        for term in self.tfidf.vocabulary_:
                            if term in self.lineNo[file]:
                                self.lineNo[file][term].append(j)
                            else:
                                self.lineNo[file][term] = [j]
                    j += 1
                i += 1
            else:
                self.documents.pop(file)
                self.files.remove(file)
                fs -= 1
        else:
            self.files.remove(file)
            fs -= 1
    print('finish reading')
    # self.files = list(self.documents.keys())
    size = len(self.documents)
    self.lw.write_info_log("get " + str(size) + " documents")
    self.lw.write_info_log("indexing...")
    self.re = self.tfidf.fit_transform(self.contents).toarray().T  # tf-idf values
    self.idf = self.tfidf.idf_
    self.word = list(self.tfidf.vocabulary_.keys())
    # compression matrix
    self.re = dok_matrix(self.re)
    # self.X=dok_matrix(self.X)
    print("start SVD")
    # svd decomposition
    self.u, self.s, self.d = svds(self.re, k=1000)
    print('start dumping')
    # store the index into the pickle
    with open(self.index_path, 'wb') as f:
        # use pickle module to save data into file 'CodexIndex.pik'
        pickle.dump(self.s, f, True)
        pickle.dump(self.u, f, True)
        pickle.dump(self.d, f, True)
        pickle.dump(self.tfidf, f, True)
        pickle.dump(self.lineNo, f, True)
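# Once indexing() has pickled u, s, d (the truncated SVD of the term-document tf-idf
# matrix) and the fitted vectorizer, queries can be ranked by folding the query vector
# into the same latent space. This is standard LSI folding-in, not code from this
# project; the function name is illustrative and the variable names mirror the
# attributes pickled above.
import numpy as np

def rank_documents(query, tfidf, u, s, d):
    q = tfidf.transform([query]).toarray()   # 1 x terms tf-idf vector
    q_lsi = q @ u @ np.diag(1.0 / s)         # project into the k-dimensional LSI space
    docs = d.T                               # docs x k document coordinates
    sims = (docs @ q_lsi.ravel()) / (
        np.linalg.norm(docs, axis=1) * np.linalg.norm(q_lsi) + 1e-12)
    return np.argsort(-sims)                 # document indices, best match first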