def CorpusCreatorDict(self, folderPath, file_extension):
    """Build and cache the corpus as a {file: words} dictionary.

    Reads every file under *folderPath* matching *file_extension* word by
    word, stores the mapping on ``self.file_content_dict`` and returns it.
    """
    corpus_reader = FileReadWrite(folderPath)
    self.file_content_dict = corpus_reader.readFilesWordByWordInDict(
        folderPath, file_extension)
    return self.file_content_dict
def readSourceCorpus(source_path_address):
    """Scan the ``.java`` source corpus and return its processed paths.

    The word-by-word read populates the reader's internal file-path
    bookkeeping; only the collected paths are used here.
    """
    corpus_reader = FileReadWrite(source_path_address)
    # Call kept for its side effects on the reader; the returned content
    # dict is not needed by this function.
    corpus_reader.readFilesWordByWordInDict(source_path_address, '.java')
    raw_paths = corpus_reader.getFilePath()
    return processSourceCorpusPaths(raw_paths)
def readNoMethodInfo(path_address):
    """Parse a ``sourceID,no_of_method`` CSV-style file into a dict.

    Each line of the file at *path_address* is expected to look like
    ``<bsourceID>,<no_of_method>[,...]``; extra fields are ignored.

    Args:
        path_address: path of the text file listing method counts.

    Returns:
        dict mapping bsourceID (str) -> no_of_method (str, unparsed).
    """
    reader = FileReadWrite(path_address)
    file_content = reader.fileReadSingleReturnListByLine(path_address)

    list_no_of_method_dict = {}
    for line in file_content:
        line_content = line.split(',')
        # Skip blank or malformed lines instead of raising IndexError
        # (the original also computed an unused `line_length` local).
        if len(line_content) < 2:
            continue
        list_no_of_method_dict[line_content[0]] = line_content[1]
    return list_no_of_method_dict
# Ejemplo n.º 4
# 0
#querypath="E:\PhD\LSI\Repo\\"+corpus+"\BugData\\"
# Build the corpus dictionary ({file: words}) from the source folder.
# (The original pre-initialized source_content_all to {} and immediately
# overwrote it; the dead assignment is dropped.)
source_content_all = creator.CorpusCreatorDict(sourcepath, '.java')


print ('Total files in corpus ')
print(len(source_content_all))
print (source_content_all)

# Index the corpus into a term/document vector space.
vector_space = VectorSpace(source_content_all)
file_path_all = vector_space.get_file_path_all()
print (file_path_all)
document_ID_file_info_mapping = vector_space.get_document_ID_file_info_mapping()
print (document_ID_file_info_mapping)
# Persist the index->keyword mapping for later inspection.
keywords_docs_string = str(vector_space.vector_index_to_keyword_mapping)
file_read_write = FileReadWrite(sourcepath)
file_read_write.writeFiles(keywordsfilepath, keywords_docs_string)
print (len(vector_space.vector_index_to_keyword_mapping))
print ("Keywords-document vector/matrix")
print ('length of vector_space.collection_of_document_term_vectors')
# Fixed: the original used Python 2 `print len(...)` statements, which
# are syntax errors under the Python 3 print() used everywhere else.
print(len(vector_space.collection_of_document_term_vectors))

document_term_matrix = vector_space.collection_of_document_term_vectors
document_ID_file_info_mapping = vector_space.get_document_ID_file_info_mapping()
# Create LSI model using TF-IDF model
tf_idf = TFIDF(document_term_matrix)
tf_idf_transformated_matrix = tf_idf.transform()
print ('length of tf_idf_transformated_matrix')
print(len(tf_idf_transformated_matrix))
        bsourceID = line_content[0]
        no_of_method = line_content[1]
        list_no_of_method_dict[bsourceID] = no_of_method
    return list_no_of_method_dict


def processSourceCorpusPaths(file_path_info):
    """Normalize every raw corpus path via ``process_file_path``.

    Args:
        file_path_info: iterable of raw file-path strings.

    Returns:
        list of processed path strings, in the same order.
    """
    # Direct iteration replaces the `for i in range(len(...))` anti-idiom.
    # NOTE(review): this calls process_file_path with two arguments, but
    # the process_file_path defined later in this file takes only one -
    # confirm which variant this snippet was written against.
    return [process_file_path(raw_path, '\\') for raw_path in file_path_info]


corpus = 'SWT'
# Read the per-corpus git info file (dead `git_content = []` init that
# was immediately overwritten has been removed).
file_read_write = FileReadWrite('E:\PhD\LSI\Repo\\' + corpus +
                                '\\data\gitInfo' + corpus + '.txt')
git_content = file_read_write.fileReadSingleReturnListByLine(
    'E:\PhD\LSI\Repo\\' + corpus + '\\data\gitInfo' + corpus + '.txt')
# Collect the processed source-file list and the per-file method counts.
list_of_source_files = readSourceCorpus('E:\PhD\LSI\Repo\\' + corpus +
                                        '\processedSourceCodes3')
list_no_of_method_dict = readNoMethodInfo("E:\PhD\LSI\Repo\\" + corpus +
                                          "\data\\listNoOfMethod.txt")
#print (str(git_content))
# Build the goldset linking bug reports to source files/method counts.
content_to_write = goldset_creator(git_content, list_of_source_files,
                                   list_no_of_method_dict)

#checkSourceExistance(content_to_write, list_of_source_files)
# Ejemplo n.º 6
# 0
    return processd_final_content


def process_file_path(file_path):
    """Convert a backslash-delimited path into a dotted suffix.

    Drops the first six path components (drive letter plus the fixed
    repository prefix) and joins the remainder with dots, e.g.
    ``E:\\a\\b\\c\\d\\e\\pkg\\Foo.java`` -> ``pkg.Foo.java``.

    Args:
        file_path: raw path string using ``\\`` separators.

    Returns:
        The dotted tail of the path (everything after the 6th backslash).
    """
    # Equivalent to the original comma-position bookkeeping (replace '\'
    # with ',', record match offsets, slice after the 6th comma), but:
    # - no shadowing of the builtin `list`,
    # - no debug prints,
    # - no IndexError from `list[6]` on paths with fewer components.
    parts = file_path.split('\\')
    return '.'.join(parts[6:])


corpus = 'SWT'
# Root of the raw source tree and destination for processed copies.
topdir = 'E:\PhD\LSI\Repo\\' + corpus + '\Source\swt-3.659BLA\\'
exten = '.java'
pathToWrite = 'E:\PhD\LSI\Repo\\' + corpus + '\\ProcessedSourceCorpusDec19\\'

file_read_write = FileReadWrite(topdir)
# os.path.walk is Python 2 only (removed in Python 3); os.walk is the
# replacement. os.path.walk invoked traverse_folder(arg, dirname, names)
# with names = os.listdir(dirname), i.e. subdirectories and files mixed,
# so both lists are passed through together.
for dirpath, dirnames, filenames in os.walk(topdir):
    traverse_folder((exten, pathToWrite), dirpath, dirnames + filenames)
 def CorpusCreator(self, folderPath, file_extension):
     """Read the corpus word by word, caching the discovered file paths.

     Returns the full word-by-word content read from *folderPath* for
     files matching *file_extension*; the corresponding file paths are
     stored on ``self.file_path_info`` as a side effect.
     """
     corpus_reader = FileReadWrite(folderPath)
     all_words = corpus_reader.readFilesWordByWord(folderPath, file_extension)
     self.file_path_info = corpus_reader.getFilePath()
     return all_words
# Ejemplo n.º 8
# 0
from xml.etree import ElementTree

from numpy.distutils.system_info import f2py_info

from util.FileReadWrite import FileReadWrite
corpus = 'AspectJ'
# Load the bug repository XML for the corpus.
tree = ElementTree.parse("E:\\BugLocator\\data\\" + corpus +
                         "BugRepository.xml")
root = tree.getroot()
# Show the top-level elements. Element.getchildren() was deprecated and
# removed in Python 3.9; list(root) is the supported equivalent.
for child in list(root):
    print(child.tag, child.attrib)

file_read_write = FileReadWrite("E:\\PhD\\Repo\\" + corpus +
                                "\\data\\bugIdsNotCluded.txt")

bugIDs = file_read_write.fileReadSingleReturnListByLine(
    "E:\\PhD\\Repo\\" + corpus + "\\data\\bugIdsNotCluded.txt")
print(bugIDs)
for b in bugIDs:
    # b[:-1] strips the trailing newline the line reader keeps.
    # NOTE(review): this chops the last digit if the final line has no
    # trailing newline - confirm the reader's behavior.
    BugID = b[:-1]
    print(BugID)
    # findall returns a fresh list, so removing while iterating is safe.
    for bid in root.findall("./bug/[@id='" + BugID + "']"):
        print(bid.attrib)
        root.remove(bid)

# Write the filtered repository back out under a new name.
tree.write("E:\\BugLocator\\data\\" + corpus + "BugRepositoryPy.xml")