def save_so_questions_to_json_files(self, fci_object):
    FCIConverter.to_local_json_file(self.json_files_path, fci_object)
    if self.connection is not None:
        FCIConverter.to_master_json_file(self.master_json_path, fci_object, self.connection)
        self.log_writer.write_info_log("Question saved to master server")
def save_fci_objects_to_json_files(self, fci_object, file_name):
    FCIConverter.to_local_json_file(self.json_files_path, fci_object)
    if self.connection is not None:
        FCIConverter.to_master_json_file(self.master_json_path, fci_object, self.connection)
        self.log_writer.write_info_log(file_name + " saved to master server")
def ReadFiles(self):
    self.lw.write_info_log("reading files...")
    self.files = os.listdir(self.path)  # get all the file names
    if '.DS_Store' in self.files:
        self.files.remove('.DS_Store')
    for file in self.files:  # go through the folder
        if not os.path.isdir(file):  # skip sub-directories
            self.documents[file] = conv.to_dic(self.path + "/" + file)
            if len(self.documents[file]['content'].strip()) > 0:
                try:
                    root = ast.parse(str(self.documents[file]['content']))
                except SyntaxError:
                    self.lw.write_error_log("syntax error! " + file)
                    continue
                # remove strings and variable names, then index the normalized tree
                self.visitor.visit(root)
                self.lineNums[file] = {}
                self.hashDic[file] = {}
                self.Indexing(root, self.lineNums[file], self.weights, file)
            else:
                self.documents.pop(file)
    self.files = list(self.documents.keys())
    self.lw.write_info_log("got " + str(len(self.documents)) + " documents")
    # use the pickle module to save the index into 'CodexIndexAST.pik'
    with open(self.index_path, 'wb') as f:
        pickle.dump(self.weights, f, True)
        pickle.dump(self.lineNums, f, True)
        pickle.dump(self.hashDic, f, True)
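# ReadFiles relies on self.visitor to strip identifiers and string literals from
# the parsed tree before it is indexed and hashed. The visitor itself is defined
# elsewhere; the class below is only a minimal sketch of such a normalizer,
# assuming it is an ast.NodeTransformer that rewrites the tree in place (the
# class name and placeholder values are illustrative, not the project's own).
import ast

class NormalizingVisitor(ast.NodeTransformer):
    """Collapse names and string literals so that structurally identical
    snippets produce identical AST dumps (and therefore identical hashes)."""

    def visit_Name(self, node):
        node.id = '_var'          # every variable gets the same placeholder
        return node

    def visit_arg(self, node):
        node.arg = '_arg'         # parameter names are collapsed as well
        return node

    def visit_Constant(self, node):
        if isinstance(node.value, str):
            node.value = '_str'   # string literals are collapsed
        return node

# usage: NormalizingVisitor().visit(ast.parse(source_code))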
def readFiles(self):
    self.lw.write_info_log("reading files...")
    self.files = os.listdir(self.path)  # get all the file names
    if '.DS_Store' in self.files:
        self.files.remove('.DS_Store')
    for file in self.files:  # go through the folder
        if not os.path.isdir(file):  # skip sub-directories
            self.documents[file] = conv.to_dic(self.path + "/" + file)
            if len(self.documents[file]['content'].strip()) > 0:
                try:
                    tree = javalang.parse.parse(self.documents[file]['content'])
                except javalang.parser.JavaSyntaxError:
                    self.lw.write_error_log("syntax error! " + file)
                    continue
                # remove strings and variable names, then index the parsed tree
                self.fileIndex[file] = {}
                names = []  # self-defined names
                self.lastLineNo = 0
                self.index(tree, file, names, {}, {}, False)
            else:
                self.documents.pop(file)
    self.files = list(self.documents.keys())
    self.lw.write_info_log("got " + str(len(self.documents)) + " documents")
    # use the pickle module to save the index into 'CodexIndexAST.pik'
    with open(self.index_path, 'wb') as f:
        pickle.dump(self.weights, f, True)
        pickle.dump(self.fileIndex, f, True)
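# readFiles hands the parsed javalang tree to self.index(), which is defined
# elsewhere. As a rough illustration (an assumption, not the project's actual
# index() implementation), a walk over the tree could record each node type
# against the source line it starts on, which is the kind of per-file mapping
# the pickle above would store.
import javalang

def walk_java_tree(tree, file_index):
    # javalang trees iterate as (path, node) pairs
    for path, node in tree:
        if getattr(node, 'position', None) is not None:
            file_index.setdefault(type(node).__name__, []).append(node.position.line)

tree = javalang.parse.parse("class A { void m() { int x = 1; } }")
index = {}
walk_java_tree(tree, index)
print(index)   # e.g. {'ClassDeclaration': [1], 'MethodDeclaration': [1], ...}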
def get_info_from_so_files(self):
    # Each line of the tab-separated file is one Stack Overflow record:
    # column 2 holds the question content, column 3 the code snippet.
    with open('python_title_answer.txt', 'r', encoding='UTF-8') as f:
        for line in f:
            # strip the trailing newline, then split on tabs
            values = line.rstrip('\n').split('\t')
            content = values[2]
            code = values[3]
            fci_object = FormattedCodeInterface()
            fci_object.set_content(content)
            fci_object.set_code(code)
            FCIConverter.to_local_json_file("json_files", fci_object)
def import_in(self, filename):
    dic = conv.to_dic(file_name=filename)
    # return self.compareQueries(dic['code'], q1)

# compare if two queries are the same using hash functions
def compareQueries(self, query1, query2):
    h1 = self.nodeToHash(query1)
    h2 = self.nodeToHash(query2)
    return h1 == h2

# parse a query, normalize its AST, and hash the dump
def nodeToHash(self, node):
    qRoot = ast.parse(node)
    self.visitor.visit(qRoot)
    qt = ast.dump(qRoot)
    m = hashlib.md5()
    m.update(qt.encode("utf8"))
    h = m.hexdigest()
    return h
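# Hypothetical usage of compareQueries: because nodeToHash normalizes the AST
# (via self.visitor) before hashing, two snippets that differ only in variable
# names and string literals are expected to compare equal. `matcher` stands in
# for whatever object owns these methods.
a = "x = 1\nprint('hello', x)"
b = "y = 1\nprint('world', y)"
print(matcher.compareQueries(a, b))             # expected: True
print(matcher.compareQueries(a, "x = 1 + 1"))   # expected: False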
def indexing(self):
    self.lw.write_info_log("reading files...")
    self.files = os.listdir(self.path)  # get all the file names
    if '.DS_Store' in self.files:
        self.files.remove('.DS_Store')
    fs = len(self.files)
    self.tfidf = TfidfVectorizer()
    i = 0
    while i < fs:  # go through the folder
        file = self.files[i]
        if not os.path.isdir(file):  # skip sub-directories
            self.documents[file] = conv.to_dic(self.path + "/" + file)
            if len(self.documents[file]['content'].strip()) > 0:
                self.contents.append(self.documents[file]['content'])
                # store the line numbers of each term
                self.lineNo[file] = {}
                j = 0
                for line in self.documents[file]['content'].split('\n'):
                    try:
                        # get the unique standard terms of this line
                        self.tfidf.fit_transform([line])
                    except ValueError:
                        j += 1
                        continue
                    for term in self.tfidf.vocabulary_:
                        if term in self.lineNo[file]:
                            self.lineNo[file][term].append(j)
                        else:
                            self.lineNo[file][term] = [j]
                    j += 1
                i += 1
            else:
                self.documents.pop(file)
                self.files.remove(file)
                fs -= 1
        else:
            self.files.remove(file)
            fs -= 1  # keep the loop bound in step with the shrinking list
    print('finish reading')
    size = len(self.documents)
    self.lw.write_info_log("got " + str(size) + " documents")
    self.lw.write_info_log("indexing...")
    self.stopwords = ['and', 'edition', 'for', 'in', 'little', 'of', 'the', 'to', 'print']
    # tf-idf matrix, transposed to terms x documents
    self.re = self.tfidf.fit_transform(self.contents).toarray().T
    self.idf = self.tfidf.idf_
    self.word = list(self.tfidf.vocabulary_.keys())
    # compress the matrix
    self.re = dok_matrix(self.re)
    print("start SVD")
    # SVD decomposition
    self.u, self.s, self.d = svds(self.re, k=500, return_singular_vectors='u')
    print('start dumping')
    # use the pickle module to save the index into 'CodexIndex.pik'
    with open(self.index_path, 'wb') as f:
        pickle.dump(self.s, f, True)
        pickle.dump(self.u, f, True)
        pickle.dump(self.d, f, True)
        pickle.dump(self.tfidf, f, True)
        pickle.dump(self.lineNo, f, True)
    print('finish')
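# Sketch of how the pickled index above could be consulted at query time (an
# assumption, not the project's actual search code): re-load the factors, fold
# the query's tf-idf vector into the k-dimensional latent space, and rank
# documents by cosine similarity. Note that with return_singular_vectors='u'
# the right factor `d` is not computed, so documents are folded in the same
# way as the query.
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

with open('CodexIndex.pik', 'rb') as f:
    s = pickle.load(f)       # singular values, shape (k,)
    u = pickle.load(f)       # term-space factors, shape (n_terms, k)
    d = pickle.load(f)       # None here, see note above
    tfidf = pickle.load(f)
    lineNo = pickle.load(f)

def fold_in(texts):
    """Project tf-idf vectors into the latent space: (n_texts, k)."""
    return tfidf.transform(texts).toarray() @ u / s

doc_vectors = fold_in(contents)                    # `contents` as built above
query_vector = fold_in(["read a csv file line by line"])
scores = cosine_similarity(query_vector, doc_vectors)[0]
top = np.argsort(scores)[::-1][:5]                 # indices of the best matches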
def import_in(self, filename):
    dic = conv.to_dic(file_name=filename)
    print(dic['content'])
    if buffer != 0:
        print(buffer)
        for j in range(int(buffer / 4)):
            after += r'\\'
        after += i
        buffer = 0
    return after


path = "/Users/quanyewu/Desktop/files/so_50k"
list_dirs = os.walk(path)
for root, dirs, files in list_dirs:
    i = 0
    for file in files:
        obj = fc.to_fciObject(path + "/" + file)
        to_write = obj.get_code()
        print(to_write)
        print('---------------------')
        print(convert(to_write))
        print('==============================')
        i += 1
        if i == 3:
            break