Example #1
def nls_result(request):
    q = request.GET['q']
    p = int(request.GET['p'])
    tfidf = NLP_LSI_TFIDF()
    result = tfidf.getResult(query=q, page=p)
    f = result[1]
    t_p = result[0] // 10 + 1  # total number of pages, ten results per page
    p_p = max(p - 5, 1)  # first page link shown in the pagination bar
    n_p = min(p + 5, t_p)  # last page link shown in the pagination bar
    pages = [0] * t_p  # dummy list the template iterates over to render page links
    files = []
    for f_f in f:
        f_name = f_f[0]
        temp = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/so/" + f_name)
        # 1-based matching line numbers, comma-separated for the template
        m_l = ','.join(str(t_f_f + 1) for t_f_f in f_f[1])
        fei = FrontEndInterface(temp, m_l)
        files.append(fei)
    return render(request, 'nlp-result.html',
                  {'results': files, 'q': q, 'p': p, 'pages': pages, 'p_p': p_p, 'n_p': n_p,
                   'pre': p - 1, 'next': p + 1, 't_p': t_p})
Example #2
    def readFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        tree = javalang.parse.parse(
                            self.documents[file]['content'])
                    except javalang.parser.JavaSyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.fileIndex[file] = {}
                    names = []  # self-defined names
                    self.lastLineNo = 0
                    self.index(tree, file, names, {}, {}, False)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use the pickle module to save the index into 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.fileIndex, f, True)
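The two dumps above must be read back in the same order they were written. A minimal sketch of the matching loader (the function name is an assumption, not part of the original code):

import pickle

def load_ast_index(index_path):
    # pickle.load returns objects in the order they were dumped:
    # weights first, then the per-file index
    with open(index_path, 'rb') as f:
        weights = pickle.load(f)
        file_index = pickle.load(f)
    return weights, file_index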
Example #3
    def ReadFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # go through the folder
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        root = ast.parse(str(self.documents[file]['content']))
                    except SyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.visitor.visit(root)
                    self.lineNums[file] = {}
                    self.hashDic[file] = {}
                    self.Indexing(root, self.lineNums[file], self.weights,
                                  file)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use pickle module to save data into file 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.lineNums, f, True)
            pickle.dump(self.hashDic, f, True)
Example #4
def detail(request):
    doc_id = request.GET['id']  # 'id' renamed to avoid shadowing the built-in
    m_l = request.GET['ml']  # comma-separated matching line numbers
    fci_obj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/lsi/" +
                               doc_id + '.json')
    return render(request, 'detail.html', {
        'detail': fci_obj,
        'match_lines': m_l
    })
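These views are ordinary Django function views that read their parameters from the query string. A minimal sketch of how they might be wired into a URLconf (the module layout and route names are assumptions):

from django.urls import path

from . import views  # assumed module holding detail, search, snippet_detail, ...

urlpatterns = [
    # e.g. /detail?id=<file id>&ml=<matching lines>
    path('detail', views.detail),
    # e.g. /search?q=<query>&p=<page>
    path('search', views.search),
]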
Example #5
def search(request):
    q = request.GET['q']
    p = int(request.GET['p'])

    # forward the query to the search backend and wait for its reply,
    # using the timestamp as a one-off socket name
    timestamp = time.time()
    client = Client("as", "137.43.92.9", 9609, {
        'operate_type': 1,
        'query': q,
        'page': p,
        'timestamp': str(timestamp)
    })
    client.send_message()
    server = CommunicationServer()
    message = server.receive_message(socket_name=str(timestamp))
    result = message['result']
    # local alternative to the RPC round trip:
    # tfidf = LSI_TFIDF()
    # result = tfidf.getResult(query=q, page=p)
    f = result[1]
    t_p = result[0] // configs['others']['page_num'] + 1  # total number of pages
    p_p = max(p - 5, 1)  # first page link shown in the pagination bar
    n_p = min(p + 5, t_p)  # last page link shown in the pagination bar
    pages = [0] * t_p  # dummy list the template iterates over to render page links
    files = []
    for f_f in f:
        f_name = f_f[0]
        temp = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/lsi/" +
                                f_name)
        # 1-based matching line numbers, comma-separated for the template
        m_l = ','.join(str(t_f_f + 1) for t_f_f in f_f[1])
        fei = FrontEndInterface(temp, m_l)
        files.append(fei)
    return render(
        request, 'search-result.html', {
            'results': files,
            'q': q,
            'p': p,
            'pages': pages,
            'p_p': p_p,
            'n_p': n_p,
            'pre': p - 1,
            'next': p + 1,
            't_p': t_p
        })
Example #6
def snippet_detail(request):
    doc_id = request.GET['id']  # 'id' renamed to avoid shadowing the built-in
    m_l = request.GET['ml']  # matching lines in the stored document
    qml = request.GET['qml']  # matching lines in the query snippet
    timestamp = request.GET['ts']
    l = int(request.GET['l'])  # operate type: 3 = Python, anything else = Java
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    q = r.get(timestamp)  # the query snippet cached under its timestamp key
    language = 'python' if l == 3 else 'java'
    fci_obj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/" + language + "/" + doc_id + '.json')
    return render(request, 'snippet-detail.html',
                  {'detail': fci_obj, 'match_lines': m_l, 'query_match_lines': qml, 'query': q})
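The Redis lookup above is the read side of the cache that `plagiarizeResult` (below) fills with `r.set(ts, snippet, ex=3000)`. A minimal sketch of that round trip, assuming a local Redis instance (the key is illustrative):

import redis

r = redis.Redis(host='localhost', port=6379, decode_responses=True)
# plagiarizeResult stores the query snippet under a hashed timestamp key
# with a 3000-second expiry ...
r.set('example-key', 'def f(): pass', ex=3000)
# ... and snippet_detail reads it back with the same key; get() returns
# None once the key has expired
assert r.get('example-key') == 'def f(): pass'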
Example #7
    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)

        # return self.compareQueries(dic['code'], q1)

    # compare whether two queries are the same using hash functions
    def compareQueries(self, query1, query2):
        h1 = self.nodeToHash(query1)
        h2 = self.nodeToHash(query2)
        return h1 == h2

    # parse a query into an AST, normalize it, and hash its dump
    def nodeToHash(self, node):
        qRoot = ast.parse(node)
        self.visitor.visit(qRoot)  # removes strings and variable names
        qt = ast.dump(qRoot)
        m = hashlib.md5()
        m.update(qt.encode("utf8"))
        h = m.hexdigest()
        return h
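A quick sanity check of the hashing idea: because `ast.dump` discards formatting and comments, two snippets that differ only in layout hash identically even before the normalizing visitor runs. The helper below is illustrative, not part of the original class:

import ast
import hashlib

def node_to_hash(source):
    # parse to an AST and hash its dump (no identifier normalization here)
    tree = ast.parse(source)
    return hashlib.md5(ast.dump(tree).encode("utf8")).hexdigest()

a = "x = 1 + 2  # compute"
b = "x=1+2"
assert node_to_hash(a) == node_to_hash(b)  # layout and comments don't change the AST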
Example #8
    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        print(dic['content'])
Example #9
def plagiarizeResult(request):
    snippet = request.POST['snippet']
    page = int(request.POST['p'])
    operate_type = int(request.POST['l'])
    # cache the snippet in Redis under a hashed key so snippet_detail
    # can retrieve it later
    timestamp = time.time()
    m = hashlib.md5()
    m.update((str(timestamp) + snippet).encode("utf8"))
    ts = m.hexdigest()
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    r.set(ts, snippet, ex=3000)
    # operate type 3 selects the Python AST search, anything else the Java one
    if operate_type == 3:
        ast_type = 3  # stands in for ASTSearching()
        language = 'python'
    else:
        ast_type = 4  # stands in for JavaAST()
        language = 'java'
    timestamp = time.time()
    client = Client(
        "as", "137.43.92.9", 9609, {
            'operate_type': ast_type,
            'query': snippet,
            'page': page,
            'timestamp': str(timestamp)
        })
    client.send_message()
    server = CommunicationServer()
    message = server.receive_message(socket_name=str(timestamp))
    result = message['result']
    # result = ast.getResults(snippet, page)
    if result == 0:
        return render(request, 'snippet-result.html', {
            'snippet': snippet,
        })
    else:
        result = result.to_dict()
    is_global = False
    plagiarize_list = []
    document_list = []
    component_document = []
    global_similarity = 0
    if result is not None:
        total_num = result['numOfResults']
        total_page = total_num // config.configs['others']['page_num'] + 1
        matching_blocks = result['matchingBlocks']
        global_similarity = result['globalSimilarity']
        matching_lines = result['matchingLines']
        blockWeights = result['blockWeights']
        if global_similarity is not None and global_similarity > 0:
            is_global = True
            cd = result['componentDocuments']
            component_document = []
            for c in cd:
                qml = str(matching_blocks[c][0]) + '-' + str(
                    matching_blocks[c][1])
                ml = ''
                for mls in matching_lines[c]:
                    ml += str(mls[2]) + '-' + str(mls[3]) + ','
                fobj = fci.to_fciObject(config.configs['paths']['FCI_path'] +
                                        "/" + language + "/" + c)
                fei = FrontEndInterface(fobj, ml)
                fei.set_query_match_lines(qml)
                component_document.append(fei)
        for t in result['plagiarismList']:
            ml = ''
            qml = ''
            for mls in matching_lines[t]:
                qml += str(mls[0]) + '-' + str(mls[1]) + ','
                ml += str(mls[2]) + '-' + str(mls[3]) + ','
            fobj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/" +
                                    language + "/" + t)
            fei = FrontEndInterface(fobj, ml)
            fei.set_query_match_lines(qml)
            plagiarize_list.append(fei)
        for t in result['documentList']:
            ml = ''
            qml = ''
            for mls in matching_lines[t]:
                qml += str(mls[0]) + '-' + str(mls[1]) + ','
                ml += str(mls[2]) + '-' + str(mls[3]) + ','
            fobj = fci.to_fciObject(config.configs['paths']['FCI_path'] + "/" +
                                    language + "/" + t)
            fei = FrontEndInterface(fobj, ml)
            fei.set_query_match_lines(qml)
            document_list.append(fei)
        if global_similarity is not None:
            global_similarity *= 100
            global_similarity = '%.2f' % global_similarity
    return render(
        request, 'snippet-result.html', {
            'snippet': snippet,
            "is_global": is_global,
            'component_documents': component_document,
            "global_similarity": global_similarity,
            "plagiarize_list": plagiarize_list,
            "document_list": document_list,
            "l": operate_type,
            'ts': ts
        })
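For reference, the shape of the `result` dict this view consumes, reconstructed from the fields accessed above; the keys match the code, the values are purely illustrative:

result = {
    'numOfResults': 2,
    'globalSimilarity': 0.87,            # set when the query looks assembled from several sources
    'componentDocuments': ['a.json'],    # documents that jointly cover the query
    'plagiarismList': ['a.json'],        # strong matches
    'documentList': ['b.json'],          # weaker matches
    'matchingBlocks': {'a.json': (3, 10)},  # query line span covered per component document
    'matchingLines': {                   # (query_start, query_end, doc_start, doc_end) tuples
        'a.json': [(3, 10, 12, 19)],
        'b.json': [(1, 2, 5, 6)],
    },
    'blockWeights': {'a.json': 0.9},     # read but not rendered by this view
}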
Example #10
    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    # store the line numbers of the term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        lineList = [line]
                        if line.strip():  # skip blank lines
                            try:
                                self.tfidf.fit_transform(lineList)  # fit on this line to extract its unique terms
                            except ValueError:
                                j += 1
                                continue
                            for term in self.tfidf.vocabulary_:
                                if term in self.lineNo[file]:
                                    self.lineNo[file][term].append(j)
                                else:
                                    self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:
                self.files.remove(file)
                fs -= 1  # keep the loop bound in sync with the shrinking list
        print('finish reading')
        # self.files = list(self.documents.keys())
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.re = self.tfidf.fit_transform(self.contents).toarray().T  # tf-idf values
        self.idf = self.tfidf.idf_
        self.word = list(self.tfidf.vocabulary_.keys())

        # compress the dense tf-idf matrix into a sparse DOK matrix
        self.re = dok_matrix(self.re)
        print("start SVD")
        # truncated SVD; k must be smaller than min(self.re.shape)
        self.u, self.s, self.d = svds(self.re, k=1000)
        print('start dumping')
        # store the index into the pickle
        with open(self.index_path, 'wb') as f:  # save the index into 'CodexIndex.pik'
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
            print('finish')
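To make the SVD step concrete, here is a minimal sketch of how a query could be folded into the latent space built above and scored against the documents. The names `u`, `s`, `d`, and `tfidf` mirror the pickled objects; the helper itself is illustrative and not part of the original code:

import numpy as np

def lsi_rank(query, tfidf, u, s, d):
    # map the query into term space with the fitted vectorizer
    q_vec = tfidf.transform([query]).toarray()  # shape (1, n_terms)
    # fold the query into the k-dimensional concept space: q_k = q U S^-1
    q_k = (q_vec @ u) / s                       # shape (1, k)
    docs_k = d.T                                # shape (n_docs, k)
    # cosine similarity between the query and every document
    sims = (docs_k @ q_k.T).ravel()
    sims /= np.linalg.norm(docs_k, axis=1) * np.linalg.norm(q_k) + 1e-12
    return np.argsort(-sims)                    # indices of best matches first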