def computeSimilarityScors(newReportSummary, newReportDescription, All_3partAPIinfo_list):  #,Src_info_file_dir):
    all_APIdescription = []
    for ele in All_3partAPIinfo_list:
        all_APIdescription.append(ele[5])
    # Version 2: the data has already been text-preprocessed, which saves time
    newRportSD = computeSimilarity.tokenize_stopwords_stemmer([newReportSummary, newReportDescription])
    scores = Half_computeSimilarity(newRportSD, all_APIdescription)
    
    """
    #用于将数据归一化处理
    Max_socre  = scores[0]
    for score in scores:
        if Max_socre < score and score < 1 :#里面有一个是其本身。最大值为一
            Max_socre = score
    if Max_socre != 0:
        for i in range(len(scores)):
            scores[i] = scores[i] / Max_socre
    """
    
    #print time(),44
    # map "package.Class" -> similarity score
    Scores = {}
    for i in range(len(All_3partAPIinfo_list)):
        API = All_3partAPIinfo_list[i][0] + '.' + All_3partAPIinfo_list[i][1]
        Scores[API] = scores[i]
        
    """
    Scores = sorted(Scores.iteritems(), key = lambda asd:asd[1], reverse = True)
    return Scores # [(key,value)]
    """
    return Scores  # return the dict directly; this makes later keyword lookup easier
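

# --- Usage sketch (added for illustration; not part of the original source) ---
# The two rows below are hypothetical. The function above only reads
# ele[0] (package), ele[1] (class name) and ele[5] (API description),
# so the remaining fields are left empty. The sketch shows how the returned
# {"package.Class": score} dict can be ranked afterwards.
if __name__ == '__main__':
    example_api_info = [
        ['org.example.client', 'TableClient', '', '', '',
         'client used to read and write rows of a table'],
        ['org.example.client', 'ScanHelper', '', '', '',
         'helper used to scan and filter table rows'],
    ]
    example_scores = computeSimilarityScors(
        'cannot scan table rows',
        'scanning a table fails when a row filter is applied',
        example_api_info)
    # rank candidate APIs by similarity to the new report, highest first
    for api, score in sorted(example_scores.items(),
                             key=lambda kv: kv[1], reverse=True):
        print('%s\t%s' % (api, score))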
            sheet2.write(i,column+2,comments[column*30000 :].decode('utf-8'))
    i = i + 1

f.save('/Users/apple/Documents/API推荐项目/hbase/SrcInfo.xls')  # save the workbook
"""

# Version 2: with text preprocessing
# project_dict = { "class path" : [ [class name], [method names], [(variable type, variable name), ..., (variable type, variable name)], [comments] ], ... }

i = 1
for key in project_dict:
    sheet1.write(i, 0, key.decode('utf-8'))
    #sheet2.write(i,0,key.decode('utf-8'))

    # class name
    Class_name0 = computeSimilarity.tokenize_stopwords_stemmer(project_dict[key][0])
    Class_name = ''
    for word in Class_name0:
        Class_name = Class_name + ' ' + word

    sheet1.write(i, 1, Class_name.strip(' '))
    #sheet2.write(i,1,Class_name.strip(' '))

    # method names
    method_name = ''
    for name in project_dict[key][1]:
        method_name = name + ' ' + method_name

    Method_name0 = computeSimilarity.tokenize_stopwords_stemmer([method_name])
    Method_name = ''
    for word in Method_name0:
        Method_name = Method_name + ' ' + word
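
# --- Illustrative sketch (added; not part of the original source) ---
# The real tokenize_stopwords_stemmer lives in the project's computeSimilarity
# module and is not shown in this file. The commented helper below is only a
# rough sketch of what such a preprocessing step typically does (lower-case,
# tokenize, drop English stopwords, stem), assuming NLTK is available; the
# name and details are assumptions, not the project's actual implementation.
#
#   from nltk.corpus import stopwords
#   from nltk.stem.porter import PorterStemmer
#   from nltk.tokenize import word_tokenize
#
#   def tokenize_stopwords_stemmer_sketch(texts):
#       stemmer = PorterStemmer()
#       stop = set(stopwords.words('english'))
#       tokens = []
#       for text in texts:
#           for word in word_tokenize(text.lower()):
#               if word.isalpha() and word not in stop:
#                   tokens.append(stemmer.stem(word))
#       return tokens
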
def computeSimilarityScors(newReportSummary, newReportDescription,
                           sheet1):  #,Src_info_file_dir):

    all_classdir = []

    all_className = []
    all_methodName = []
    all_variableName = []
    all_comments = []
    """
    workbook = xlrd.open_workbook(Src_info_file_dir,'r')
    sheet1 = workbook.sheet_by_name('sheet1')
    """
    #sheet2 = workbook.sheet_by_name('comments')
    #print 1
    #print sheet1.cell(6,28).value.encode('utf-8')
    for i in range(1, sheet1.nrows):

        all_classdir.append(sheet1.cell(i, 0).value.encode('utf-8'))

        className = []
        className = sheet1.cell(i, 1).value.encode('utf-8').split(' ')
        all_className.append(className)

        methodName = []
        methodName = sheet1.cell(i, 2).value.encode('utf-8').split(' ')
        all_methodName.append(methodName)

        variableName = []
        variableName = sheet1.cell(i, 3).value.encode('utf-8').split(' ')
        all_variableName.append(variableName)

        comments_str = ''
        j = 6
        while 1:
            try:  # the comments may span an unknown number of cells
                comments_str = comments_str + ' ' + sheet1.cell(
                    i, j).value.encode('utf-8')
                j = j + 1
            except IndexError:
                break
        comments_str = comments_str.strip(' ')
        comments = []
        comments = comments_str.split(' ')
        all_comments.append(comments)
        #print i

    # Version 2: the data has already been text-preprocessed, which saves time
    newRportS = computeSimilarity.tokenize_stopwords_stemmer(
        [newReportSummary])
    newRportD = computeSimilarity.tokenize_stopwords_stemmer(
        [newReportDescription])
    #print 11

    all_classNameS_result = Half_computeSimilarity(newRportS, all_className)
    all_methodNameS_result = Half_computeSimilarity(newRportS, all_methodName)
    #print 22
    all_variableNameS_result = Half_computeSimilarity(newRportS,
                                                      all_variableName)
    all_commentsS_result = Half_computeSimilarity(newRportS, all_comments)
    #print 33

    all_classNameD_result = Half_computeSimilarity(newRportD, all_className)
    all_methodNameD_result = Half_computeSimilarity(newRportD, all_methodName)
    #print 44
    all_variableNameD_result = Half_computeSimilarity(newRportD,
                                                      all_variableName)
    all_commentsD_result = Half_computeSimilarity(newRportD, all_comments)

    #print 55
    """
    # Version 1: data without text preprocessing
    #print sheet1.cell(6,28).value.encode('utf-8')    
    for i in range(1,sheet1.nrows):
        
        all_classdir.append(sheet1.cell(i,0).value.encode('utf-8'))
        
        className=[]
        className.append(sheet1.cell(i,1).value.encode('utf-8'))
        all_className.append(className)
        
        methodName=[]
        methodName.append(sheet1.cell(i,2).value.encode('utf-8'))
        all_methodName.append(methodName)

        variableName=[]
        variableName.append(sheet1.cell(i,3).value.encode('utf-8'))
        all_variableName.append(variableName)
        
        
        comments_str=''
        j=2
        while 1 :
            try:  # the comments may span an unknown number of cells
                comments_str= comments_str + sheet2.cell(i,j).value.encode('utf-8')
                j=j+1
            except IndexError:
                break
        comments=[]
        comments.append(comments_str)
        all_comments.append(comments)
    
    newRportS=[newReportSummary]
    newRportD=[newReportDescription]
    
    all_classNameS_result = computeSimilarity.all_computeSimilarity(newRportS,all_className)
    all_methodNameS_result = computeSimilarity.all_computeSimilarity(newRportS,all_methodName)
    all_variableNameS_result = computeSimilarity.all_computeSimilarity(newRportS,all_variableName)
    all_commentsS_result = computeSimilarity.all_computeSimilarity(newRportS,all_comments)
        
    all_classNameD_result = computeSimilarity.all_computeSimilarity(newRportD,all_className)
    all_methodNameD_result = computeSimilarity.all_computeSimilarity(newRportD,all_methodName)
    all_variableNameD_result = computeSimilarity.all_computeSimilarity(newRportD,all_variableName)
    all_commentsD_result = computeSimilarity.all_computeSimilarity(newRportD,all_comments)
    """
    """
        result =  all_classNameS_result +  all_classNameD_result
        + all_methodNameS_result +  all_methodNameD_result
        + all_variableNameS_result +  all_variableNameD_result
        + all_commentsS_result +  all_commentsD_result
    """

    result = []
    result.append(all_classdir)
    result.append(all_classNameS_result)
    result.append(all_classNameD_result)
    result.append(all_methodNameS_result)
    result.append(all_methodNameD_result)
    result.append(all_variableNameS_result)
    result.append(all_variableNameD_result)
    result.append(all_commentsS_result)
    result.append(all_commentsD_result)

    return result
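

# --- Usage sketch (added for illustration; not part of the original source) ---
# 'SrcInfo.xls' is the workbook written by the preprocessing script above; the
# path here is just an example. This mirrors the commented-out xlrd code inside
# the function and shows how each per-field score list lines up with result[0],
# the list of class paths.
if __name__ == '__main__':
    import xlrd
    workbook = xlrd.open_workbook('SrcInfo.xls')
    sheet1 = workbook.sheet_by_name('sheet1')
    result = computeSimilarityScors('report summary text',
                                    'report description text', sheet1)
    class_dirs = result[0]
    # result[1..8]: class/method/variable/comment scores, first against the
    # summary, then against the description
    for row in range(min(5, len(class_dirs))):
        print('%s %s' % (class_dirs[row], [scores[row] for scores in result[1:]]))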
def computeSimilarityScors(newReportSummary, newReportDescription, sheet1):  #,Src_info_file_dir):
   
    all_classdir = []

    all_program_language = []
    all_comments = []
    """
    workbook = xlrd.open_workbook(Src_info_file_dir,'r')
    sheet1 = workbook.sheet_by_name('sheet1')
    """
    #sheet2 = workbook.sheet_by_name('comments')
    #print 1
    #print sheet1.cell(6,28).value.encode('utf-8')    
    for i in range(1,sheet1.nrows):
        
        all_classdir.append(sheet1.cell(i,0).value.encode('utf-8'))
        className=[]
        className = sheet1.cell(i,1).value.encode('utf-8').split(' ')
        methodName=[]
        methodName = sheet1.cell(i,2).value.encode('utf-8').split(' ')
        variableName=[]
        variableName = sheet1.cell(i,3).value.encode('utf-8').split(' ')
        program_language = className + methodName + variableName
        all_program_language.append(program_language)
        
        comments_str=''
        j=6
        while 1 :
            try:  # the comments may span an unknown number of cells
                comments_str= comments_str + ' ' + sheet1.cell(i,j).value.encode('utf-8')
                j=j+1
            except IndexError:
                break
        comments_str = comments_str.strip(' ')
        comments=[]
        comments = comments_str.split(' ')
        all_comments.append(comments)
        #print i
        
   
    
    # Version 2: the data has already been text-preprocessed, which saves time
    newRportSD = computeSimilarity.tokenize_stopwords_stemmer([newReportSummary + newReportDescription])
    #print 11
    
    all_program_language_result = Half_computeSimilarity(newRportSD, all_program_language)
    all_commentsSD_result = Half_computeSimilarity(newRportSD, all_comments)
    # normalize each score list by its largest non-self score
    Max_score = all_program_language_result[0]
    for score in all_program_language_result:
        if Max_score < score and score < 1:  # one entry is the text itself; its score is 1, the maximum
            Max_score = score
    if Max_score != 0:
        for i in range(len(all_program_language_result)):
            all_program_language_result[i] = all_program_language_result[i] / Max_score

    Max_score = all_commentsSD_result[0]
    for score in all_commentsSD_result:
        if Max_score < score and score < 1:  # one entry is the text itself; its score is 1, the maximum
            Max_score = score
    if Max_score != 0:
        for i in range(len(all_commentsSD_result)):
            all_commentsSD_result[i] = all_commentsSD_result[i] / Max_score
            
    result=[]
    result.append(all_classdir)
    result.append(all_program_language_result)
    result.append(all_commentsSD_result)
    
    return result
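

# --- Ranking sketch (added; not part of the original source) ---
# The function above returns [class paths, normalized code-identifier scores,
# normalized comment scores] but does not show how they are combined. The
# helper below is one plausible way to merge them into a single ranking; the
# 0.5/0.5 weighting is an illustration, not the project's chosen weighting.
def rank_classes_sketch(result, code_weight=0.5, comment_weight=0.5):
    class_dirs, code_scores, comment_scores = result
    combined = []
    for path, cs, ms in zip(class_dirs, code_scores, comment_scores):
        combined.append((path, code_weight * cs + comment_weight * ms))
    # highest combined similarity first
    combined.sort(key=lambda item: item[1], reverse=True)
    return combined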