    def save_so_questions_to_json_files(self, fci_object):
        FCIConverter.to_local_json_file(self.json_files_path, fci_object)

        if self.connection is not None:
            FCIConverter.to_master_json_file(self.master_json_path, fci_object,
                                             self.connection)
            self.log_writer.write_info_log("Question saved to master server")

    def save_fci_objects_to_json_files(self, fci_object, file_name):
        FCIConverter.to_local_json_file(self.json_files_path, fci_object)

        if self.connection is not None:
            FCIConverter.to_master_json_file(self.master_json_path, fci_object,
                                             self.connection)
            self.log_writer.write_info_log(file_name +
                                           " saved to master server")
    def ReadFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # iterate over the files in the folder
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        root = ast.parse(str(self.documents[file]['content']))
                    except SyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # remove strings and variable names
                    self.visitor.visit(root)
                    self.lineNums[file] = {}
                    self.hashDic[file] = {}
                    self.Indexing(root, self.lineNums[file], self.weights,
                                  file)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use pickle module to save data into file 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.lineNums, f, True)
            pickle.dump(self.hashDic, f, True)
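
    # The index can be read back by unpickling in the same order it was
    # dumped (a minimal sketch; index_path is the same path used above):
    #
    #   with open(index_path, 'rb') as f:
    #       weights = pickle.load(f)
    #       lineNums = pickle.load(f)
    #       hashDic = pickle.load(f)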
    def readFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # iterate over the files in the folder
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        tree = javalang.parse.parse(
                            self.documents[file]['content'])
                    except javalang.parser.JavaSyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # index the tree, normalizing user-defined names
                    self.fileIndex[file] = {}
                    names = []  # user-defined names collected while indexing
                    self.lastLineNo = 0
                    self.index(tree, file, names, {}, {}, False)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("get " + str(len(self.documents)) +
                               " documents")
        # use pickle module to save data into file 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.fileIndex, f, True)
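
    # For reference, javalang trees can be traversed with tree.filter(),
    # which is presumably the kind of walk self.index() performs
    # (MethodDeclaration is just one node type of interest):
    #
    #   for path, node in tree.filter(javalang.tree.MethodDeclaration):
    #       print(node.name, node.position)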
    def get_info_from_so_files(self):
        # Read the tab-separated Stack Overflow dump line by line and save
        # each record as a local JSON file.
        with open('python_title_answer.txt', encoding='UTF-8') as f:
            for line in f:
                # Split on tabs: values[2] holds the question content and
                # values[3] the code snippet.
                values = line.split('\t')
                content = values[2]
                code = values[3]
                fci_object = FormattedCodeInterface()
                fci_object.set_content(content)
                fci_object.set_code(code)
                FCIConverter.to_local_json_file("json_files", fci_object)
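
    # The input layout is assumed to be tab-separated with at least four
    # columns, where values[2] is the question content and values[3] the
    # code; the leading columns (perhaps an id and a title) are ignored:
    #
    #   12345\tReverse a list\tHow do I reverse a list?\txs[::-1]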
    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        # return self.compareQueries(dic['code'], q1)

    # compare whether two queries are the same using hash functions
    def compareQueries(self, query1, query2):
        h1 = self.nodeToHash(query1)
        h2 = self.nodeToHash(query2)
        return h1 == h2

    # hash the normalized AST dump of a query
    def nodeToHash(self, node):
        qRoot = ast.parse(node)
        self.visitor.visit(qRoot)  # strip strings and variable names
        qt = ast.dump(qRoot)
        m = hashlib.md5()
        m.update(qt.encode("utf8"))
        return m.hexdigest()
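
    # Usage sketch (`indexer` is a hypothetical instance, the snippets are
    # illustrative; if the visitor rewrites identifiers uniformly, these two
    # should hash equal since they differ only in a variable name):
    #
    #   same = indexer.compareQueries("x = 1\nprint(x)", "y = 1\nprint(y)")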
    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # go through the folder
            file = self.files[i]
            if not os.path.isdir(os.path.join(self.path, file)):  # skip sub-directories
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    # store the line numbers of each term
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        try:
                            # get the unique standard terms of this line
                            self.tfidf.fit_transform([line])
                        except ValueError:
                            j += 1  # the line had no valid tokens
                            continue
                        for term in self.tfidf.vocabulary_:
                            if term in self.lineNo[file]:
                                self.lineNo[file][term].append(j)
                            else:
                                self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:
                self.files.remove(file)
                fs -= 1  # keep the loop bound in sync after the removal
        print('finish reading')
        # self.files = list(self.documents.keys())
        size = len(self.documents)
        self.lw.write_info_log("get " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.stopwords = [
            'and', 'edition', 'for', 'in', 'little', 'of', 'the', 'to', 'print'
        ]
        self.re = self.tfidf.fit_transform(
            self.contents).toarray().T  # tf-idf values
        self.idf = self.tfidf.idf_
        self.word = list(self.tfidf.vocabulary_.keys())

        # convert to a sparse matrix to save memory
        self.re = dok_matrix(self.re)
        # self.X=dok_matrix(self.X)
        print("start SVD")
        # SVD decomposition; compute both sets of singular vectors so that
        # self.d holds real values (with return_singular_vectors='u' the
        # right singular vectors come back as None, which would then be
        # pickled below)
        self.u, self.s, self.d = svds(self.re, k=500)
        print('start dumping')
        # use the pickle module to save the index into file 'CodexIndex.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
            print('finish')
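
    # Query-time sketch (an assumption about how this LSI index is used:
    # load the pickles in dump order, project a query into the latent space
    # via the left singular vectors, then score documents against it):
    #
    #   with open(index_path, 'rb') as f:
    #       s = pickle.load(f)
    #       u = pickle.load(f)
    #       d = pickle.load(f)
    #       tfidf = pickle.load(f)
    #       lineNo = pickle.load(f)
    #   q = tfidf.transform(["binary search tree"]).toarray()[0]
    #   q_latent = q @ u  # k-dimensional representation of the query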
    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        print(dic['content'])

# NOTE: only the tail of convert() survived; the head below is an assumed
# reconstruction. It collapses each run of four spaces into a literal '\\'.
def convert(code):
    after = ''
    buffer = 0
    for i in code:
        if i == ' ':
            buffer += 1
        else:
            if buffer != 0:
                for j in range(int(buffer / 4)):
                    after += r'\\'
            after += i
            buffer = 0
    return after


path = "/Users/quanyewu/Desktop/files/so_50k"

list_dirs = os.walk(path)
for root, dirs, files in list_dirs:
    i = 0
    for file in files:
        obj = fc.to_fciObject(path + "/" + file)
        to_write = obj.get_code()
        print(to_write)
        print('---------------------')
        print(convert(to_write))
        print('==============================')
        i += 1
        if i == 3:
            break  # preview only the first three files
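
# To persist the converted code instead of just previewing it (a minimal
# sketch; `out_dir` is a hypothetical output directory, used so the source
# files are not overwritten):
#
#   with open(out_dir + "/" + file, 'w') as f:
#       f.write(convert(to_write))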