def CorpusCreatorDict(self, folderPath, file_extension):
    """Build and cache the word-level corpus dictionary for a folder.

    Reads every file under ``folderPath`` whose name matches
    ``file_extension`` word by word, stores the resulting dictionary on
    ``self.file_content_dict``, and returns it.

    :param folderPath: root directory of the corpus files
    :param file_extension: extension filter, e.g. ``'.java'``
    :return: the dict produced by ``readFilesWordByWordInDict``
    """
    reader = FileReadWrite(folderPath)
    self.file_content_dict = reader.readFilesWordByWordInDict(
        folderPath, file_extension)
    return self.file_content_dict
def readSourceCorpus(source_path_address):
    """Collect and normalise the paths of all '.java' files in a corpus.

    The word-by-word read is performed for its side effect of making the
    reader record every visited file path; the word content itself is
    not used here.

    :param source_path_address: root directory of the source corpus
    :return: list of processed path strings (see processSourceCorpusPaths)
    """
    reader = FileReadWrite(source_path_address)
    # Side effect: populates the reader's internal file-path list.
    reader.readFilesWordByWordInDict(source_path_address, '.java')
    raw_paths = reader.getFilePath()
    return processSourceCorpusPaths(raw_paths)
def readNoMethodInfo(path_address):
    """Parse the per-file method-count listing at *path_address*.

    Each line is expected to look like ``<sourceID>,<methodCount>``;
    only the first two comma-separated fields are used, any further
    fields are ignored.

    :param path_address: path of the listing file
    :return: dict mapping source id -> method count (both kept as raw
        strings; NOTE(review): the count may retain a trailing newline
        if the reader does not strip line endings — confirm)
    :raises IndexError: on a line without a comma
    """
    reader = FileReadWrite(path_address)
    lines = reader.fileReadSingleReturnListByLine(path_address)
    method_counts = {}
    for line in lines:
        # Fixed: removed the unused 'line_length' local and the
        # commented-out debug prints from the original.
        fields = line.split(',')
        method_counts[fields[0]] = fields[1]
    return method_counts
# --- Driver: build the vector space and TF-IDF model from the corpus ---

# Build the corpus: one dictionary entry per source file, read word by word.
# (Fixed: removed the dead 'source_content_all = {}' assignment that was
# immediately overwritten.)
source_content_all = creator.CorpusCreatorDict(sourcepath, '.java')
print('Total files in corpus ')
print(len(source_content_all))
print(source_content_all)

vector_space = VectorSpace(source_content_all)
file_path_all = vector_space.get_file_path_all()
print(file_path_all)
document_ID_file_info_mapping = vector_space.get_document_ID_file_info_mapping()
print(document_ID_file_info_mapping)

# Persist the keyword -> vector-index mapping for later inspection.
keywords_docs_string = str(vector_space.vector_index_to_keyword_mapping)
file_read_write = FileReadWrite(sourcepath)
file_read_write.writeFiles(keywordsfilepath, keywords_docs_string)
print(len(vector_space.vector_index_to_keyword_mapping))

print("Keyords-document vector/matrix")
print('length of vector_space.collection_of_document_term_vectors')
# Fixed: 'print len(...)' is Python-2-only statement syntax; the
# call form below is valid in both Python 2 and 3.
print(len(vector_space.collection_of_document_term_vectors))
document_term_matrix = vector_space.collection_of_document_term_vectors
document_ID_file_info_mapping = vector_space.get_document_ID_file_info_mapping()

# Create LSI model using TF-IDF model
tf_idf = TFIDF(document_term_matrix)
tf_idf_transformated_matrix = tf_idf.transform()
print('length of tf_idf_transformated_matrix')
print(len(tf_idf_transformated_matrix))
bsourceID = line_content[0] no_of_method = line_content[1] list_no_of_method_dict[bsourceID] = no_of_method return list_no_of_method_dict def processSourceCorpusPaths(file_path_info): list_of_files = [] for i in range(0, len(file_path_info)): #print (file_path_info[i]) processed_file_path = process_file_path(file_path_info[i], '\\') #print(processed_file_path) list_of_files.append(processed_file_path) return list_of_files corpus = 'SWT' file_read_write = FileReadWrite('E:\PhD\LSI\Repo\\' + corpus + '\\data\gitInfo' + corpus + '.txt') git_content = [] git_content = file_read_write.fileReadSingleReturnListByLine( 'E:\PhD\LSI\Repo\\' + corpus + '\\data\gitInfo' + corpus + '.txt') list_of_source_files = readSourceCorpus('E:\PhD\LSI\Repo\\' + corpus + '\processedSourceCodes3') list_no_of_method_dict = readNoMethodInfo("E:\PhD\LSI\Repo\\" + corpus + "\data\\listNoOfMethod.txt") #print (str(git_content)) content_to_write = goldset_creator(git_content, list_of_source_files, list_no_of_method_dict) #checkSourceExistance(content_to_write, list_of_source_files)
return processd_final_content def process_file_path(file_path): #print (file_path.rfind('/')) print(file_path) file_path_new = file_path.replace("\\", ",") print(file_path_new) list = [] for match in re.finditer(',', file_path_new): print(match.start(), match.end()) list.append(str(match.end())) print(str(list[6])) file_address = file_path_new[int(list[5]):] file_address_new = file_address.replace(",", ".") print(file_address_new) #print (file_address) return file_address_new corpus = 'SWT' topdir = 'E:\PhD\LSI\Repo\\' + corpus + '\Source\swt-3.659BLA\\' exten = '.java' pathToWrite = 'E:\PhD\LSI\Repo\\' + corpus + '\\ProcessedSourceCorpusDec19\\' file_read_write = FileReadWrite(topdir) os.path.walk(topdir, traverse_folder, (exten, pathToWrite))
def CorpusCreator(self, folderPath, file_extension):
    """Read the corpus under *folderPath* word by word and return it.

    As a side effect, records the list of discovered file paths on
    ``self.file_path_info``.

    :param folderPath: root directory of the corpus files
    :param file_extension: extension filter, e.g. ``'.java'``
    :return: the content produced by ``readFilesWordByWord``
    """
    reader = FileReadWrite(folderPath)
    corpus_content = reader.readFilesWordByWord(folderPath, file_extension)
    self.file_path_info = reader.getFilePath()
    return corpus_content
from xml.etree import ElementTree
from numpy.distutils.system_info import f2py_info
from util.FileReadWrite import FileReadWrite

# --- Driver: drop excluded bug entries from the bug-repository XML ---
corpus = 'AspectJ'
tree = ElementTree.parse("E:\\BugLocator\\data\\" + corpus + "BugRepository.xml")
root = tree.getroot()

# find the first 'item' object
# Fixed: Element.getchildren() was deprecated since Python 3.2 and removed
# in 3.9; iterating the element directly is the supported spelling.
for child in list(root):
    print(child.tag, child.attrib)

# The exclusion-list path was built twice in the original; computed once.
excluded_ids_path = "E:\\PhD\\Repo\\" + corpus + "\\data\\bugIdsNotCluded.txt"
file_read_write = FileReadWrite(excluded_ids_path)
bugIDs = file_read_write.fileReadSingleReturnListByLine(excluded_ids_path)
print(bugIDs)

for b in bugIDs:
    # NOTE(review): b[:-1] assumes every line ends with '\n'; if the last
    # line has no newline its final character is lost — consider .strip().
    BugID = b[:-1]
    print(BugID)
    # findall() returns a materialised list, so removing matched elements
    # from 'root' while looping is safe.
    for bid in root.findall("./bug/[@id='" + BugID + "']"):
        print(bid.attrib)
        root.remove(bid)

tree.write("E:\\BugLocator\\data\\" + corpus + "BugRepositoryPy.xml")