import pickle

import pandas as pd
from tqdm import tqdm

import filepathhelper  # project-local helper (see Example #7 for the import-path setup)

# `dataset` is assumed to be defined at module level elsewhere
# (e.g. read from config.json as in Example #10)


def generateContributionFile():
    changelog = pd.read_csv(filepathhelper.path(dataset, "changelog.csv"),
                            quotechar='"',
                            sep=';')
    with open(filepathhelper.path(dataset, 'rp'), 'rb') as f:
        rp = pickle.load(f)

    train = pd.read_csv(filepathhelper.path(dataset, "trainissuekey.csv"))
    # keep only changelog rows whose issuekey is in the training set
    changelog = changelog[changelog.issuekey.isin(
        train.issuekey)][['issuekey', 'username']]

    # collect every user appearing in the rp pickle (union over all keys)
    user = set()
    for i in rp:
        user.update(rp[i])

    # total changelog entries per issue (denominator of the ratio below)
    logkeygroup = changelog.groupby(['issuekey']).count()

    person = set(changelog['username'])
    user = list(user)

    username = []
    contribution = []

    for i in tqdm(range(len(user))):
        activity = []
        if user[i] in person:
            loglist = changelog[changelog['username'] == user[i]]
            logcount = loglist.groupby(['issuekey']).count().reset_index()

            # fraction of each issue's changelog entries made by this user
            for index, row in logcount.iterrows():
                activity.append(row['username'] /
                                logkeygroup.loc[row['issuekey']].values[0])

            username.append(user[i])
            # the user's contribution is the mean of the per-issue fractions
            contribution.append(sum(activity) / len(activity))

    d = {'username': username, 'contribution': contribution}
    df = pd.DataFrame(data=d)
    df.to_csv(filepathhelper.path(dataset, 'contribution.csv'))
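
# A hedged, self-contained sketch of the quantity generateContributionFile()
# averages: for one user, the fraction of each issue's changelog entries made
# by that user, averaged over the user's issues (toy values, not from the dataset):
toy = pd.DataFrame({'issuekey': ['A', 'A', 'A', 'B'],
                    'username': ['u1', 'u2', 'u1', 'u1']})
per_issue_total = toy.groupby('issuekey').size()                     # A -> 3, B -> 1
u1_per_issue = toy[toy.username == 'u1'].groupby('issuekey').size()  # A -> 2, B -> 1
print((u1_per_issue / per_issue_total).mean())                       # (2/3 + 1) / 2 = 0.8333...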
Example #2
def calculatePairScore(dataset, tagpair, commentscore, train, outfile):
    # 'train' is unused in this variant; calculatePairScoreFilter below
    # restricts commentscore to the training set first
    tagcomment = pd.read_csv(filepathhelper.path(dataset, tagpair),
                             encoding='iso-8859-1')

    # strip leading whitespace from usernames
    tagcomment['tagger'] = tagcomment.tagger.str.lstrip()
    tagcomment['taggee'] = tagcomment.taggee.str.lstrip()

    # drop empty usernames (after lstrip a blank name is '', not ' ')
    tagcomment = tagcomment[tagcomment['tagger'] != '']
    tagcomment = tagcomment[tagcomment['taggee'] != '']

    # remove malformed usernames containing '/' (artifacts of tagger/taggee
    # extraction)
    tagcomment = tagcomment[~tagcomment.tagger.str.contains('/', na=False)]
    tagcomment = tagcomment[~tagcomment.taggee.str.contains('/', na=False)]

    # remove taggers that are all digits
    tagcomment = tagcomment[~tagcomment.tagger.str.isdigit()]
    #check = ~tagcomment.taggee.str.isdigit()
    #tagcomment = tagcomment[check]

    # attach sentiment scores to each tag comment
    tagcomment = pd.merge(tagcomment, commentscore, on='commentid', how='inner')
    # average sentiment scores per (tagger, taggee) pair
    pair = tagcomment.groupby(['tagger', 'taggee']).agg({
        'positivescore': 'mean',
        'negativescore': 'mean'
    })
    pair.sort_index(level=0).to_csv(outfile)
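
# Hedged usage sketch: the file name 'comment_score.csv' and its columns are
# assumptions, not confirmed by the source. tagpair is expected to provide
# commentid/tagger/taggee; commentscore, commentid/positivescore/negativescore.
# scores = pd.read_csv(filepathhelper.path(dataset, 'comment_score.csv'))
# calculatePairScore(dataset, 'tags.csv', scores, None, 'pair_score.csv')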
Example #3
def calculatePairScoreFilter(dataset, tagpair, commentscore, train, outfile):

    #train = pd.read_csv(filepathhelper.path(dataset, train), encoding='iso-8859-1')
    tagcomment = pd.read_csv(filepathhelper.path(dataset, tagpair),
                             encoding='iso-8859-1')

    commentscore = filterTrainSet(commentscore, train)
    commentscore = commentscore.drop_duplicates()
    #    print(commentscore)

    # attach sentiment scores to each tag comment
    tagcomment = pd.merge(tagcomment, commentscore, on='commentid', how='inner')
    # average sentiment scores per (tagger, taggee) pair
    pair = tagcomment.groupby(['tagger', 'taggee']).agg({
        'positivescore': 'mean',
        'negativescore': 'mean'
    })
    pair.sort_index(level=0).to_csv(outfile)
Example #4
def IssueFamiliarity(team, issuekey='', descript='', fortest=False):
    # relies on module-level globals defined elsewhere in the original file:
    # dataset, topdists, PtoI, maxcosim, and cupy imported as cp
    member = set(team['developer'] + team['integrator'] + team['tester'] +
                 team['reviewer'] + team['assignee'])
    if descript != '':
        # unseen description: infer its topic distribution with topicSim.jar
        with open("temp.txt", "w") as f:
            f.write(descript)
        subprocess.call([
            'java', '-jar', 'topicSim.jar',
            filepathhelper.path(dataset, 'model_nonLabel_500_9_1'), 'temp.txt'
        ])
        with open("topicDist", "r") as f:
            s = f.read().split('\n')[:-1]
        tdin = cp.array(ast.literal_eval(s[0]))  # gpu
        # L2 norm of the input distribution, needed by the cosine step below
        tdin_squaresum = cp.sqrt((tdin ** 2).sum())

        os.remove("temp.txt")
        os.remove("topicDist")
    elif issuekey != '' and not fortest:
        # known issue: reuse its precomputed topic distribution and L2 norm
        inissuerow = topdists.loc[issuekey]
        tdin = cp.array(inissuerow['topdist'])  # gpu
        tdin_squaresum = cp.array(inissuerow['squaresum'])  # gpu

    issuefam = 0
    if fortest:
        issuefam = np.array([maxcosim(m, issuekey) for m in member]).sum()
    else:
        for m in member:
            if m not in PtoI:
                continue
            participated_issue = PtoI[m].copy()
            participated_issue.discard(issuekey)
            if len(participated_issue) > 0:
                participated_issue_topdist = topdists.loc[list(
                    participated_issue)]
                # stack this member's issue topic distributions into a matrix
                todistmatrix = cp.array([
                    cp.array(i)
                    for i in participated_issue_topdist['topdist'].values
                ])
                squaresum = cp.array(
                    participated_issue_topdist['squaresum'].values)
                # max cosine similarity between the input issue and any issue
                # this member has participated in
                maxfam = (todistmatrix.dot(tdin) /
                          (squaresum * tdin_squaresum)).max()

                issuefam = issuefam + maxfam
    return float(issuefam) / len(member)
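
# The maxfam line above is a batched cosine similarity; a NumPy-only sketch of
# the same arithmetic (toy values; 'squaresum' corresponds to each row's L2 norm):
# import numpy as np
# M = np.array([[0.2, 0.8], [0.9, 0.1]])   # one topic distribution per row
# v = np.array([0.5, 0.5])                 # the input issue's distribution
# row_norms = np.sqrt((M ** 2).sum(axis=1))
# best = (M.dot(v) / (row_norms * np.sqrt((v ** 2).sum()))).max()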
Example #5
import pickle
import subprocess
import ast
from tqdm import tqdm  # used by warshall() below
from neo4j import GraphDatabase
import neo4j
import multiprocessing as mp
from functools import lru_cache
import hashlib
import re
from networkx.algorithms.approximation import steiner_tree
import pandas as pd  # used by the read_csv calls below
# # Files needed


teams = pd.read_csv(filepathhelper.path(dataset, 'team.csv'))
closeresolve = pd.read_csv(filepathhelper.path(dataset, 'closeresolve.csv'),
                           sep=';')
winissues = pd.read_csv(filepathhelper.path(dataset, 'winissue.csv'))
assignees = pd.read_csv(filepathhelper.path(dataset, 'assignee.csv'), sep=';')
trainset = pd.read_csv(filepathhelper.path(dataset, 'trainissuekey.csv'))
issuecomponent = pd.read_csv(filepathhelper.path(dataset,
                                                 'component_title.csv'),
                             sep=';;;',
                             engine='python')
#######################################################################################
df = pd.read_csv(filepathhelper.path(dataset, 'global_pair_score.csv'),
                 encoding='iso-8859-1')

pos = df['positivescore']
neg = df['negativescore']

from numpy import newaxis, minimum


def warshall(mat):
    """Floyd-Warshall all-pairs shortest paths, vectorized with numpy.

    mat is an n x n distance matrix; missing edges should be np.inf.
    """
    n = len(mat)
    for k in tqdm(range(n)):
        # allow paths through intermediate node k:
        # mat[i, j] = min(mat[i, j], mat[i, k] + mat[k, j])
        mat = minimum(mat, mat[newaxis, k, :] + mat[:, k, newaxis])

    return mat
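
# A hedged sanity check for warshall() on a toy 3-node graph (illustrative
# values; np.inf marks a missing edge):
# import numpy as np
# d = np.array([[0., 1., np.inf],
#               [np.inf, 0., 2.],
#               [np.inf, np.inf, 0.]])
# warshall(d)[0, 2]  # -> 3.0, routed through node 1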



tags = pd.read_csv(filepathhelper.path(dataset, 'tags.csv'),
                   encoding='ISO-8859-1')
teams = pd.read_csv(filepathhelper.path(dataset, 'team.csv'))
closeresolve = pd.read_csv(filepathhelper.path(dataset, 'closeresolve.csv'),
                           sep=';')
teams = teams[(teams['issuekey'].isin(closeresolve['issuekey']))]
winissues = pd.read_csv(filepathhelper.path(dataset, 'winissue.csv'))
assignees = pd.read_csv(filepathhelper.path(dataset, 'assignee.csv'), sep=';')
assignees = assignees[(assignees['issuekey'].isin(teams['issuekey']))]


# # Train on only the train dataset
Example #7
import os
import sys

# walk up from the working directory to the project root (the directory
# that contains filepathhelper.py) so the helper can be imported
curdir = os.getcwd()
while 'filepathhelper.py' not in os.listdir(curdir):
    curdir = os.path.dirname(curdir)
sys.path.append(curdir)
import filepathhelper
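
# filepathhelper is a project-local module, not shown here; a hedged sketch of
# the path() helper these snippets rely on (the per-dataset directory layout
# is an assumption, not confirmed by the source):
# def path(dataset, filename):
#     return os.path.join(os.path.dirname(__file__), dataset, filename)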

import math
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
#from tqdm import tqdm_notebook as tqdm

teams = pd.read_csv(filepathhelper.path(dataset, 'team.csv'))
closeresolve = pd.read_csv(filepathhelper.path(dataset, 'closeresolve.csv'),
                           sep=';')
winissues = pd.read_csv(filepathhelper.path(dataset, 'winissue.csv'))
assignees = pd.read_csv(filepathhelper.path(dataset, 'assignee.csv'), sep=';')
trainset = pd.read_csv(filepathhelper.path(dataset, 'trainissuekey.csv'))

teams = teams[(teams['issuekey'].isin(closeresolve['issuekey']))]
assignees = assignees[(assignees['issuekey'].isin(teams['issuekey']))]
teams = teams[teams['issuekey'].isin(trainset['issuekey'])]
assignees = assignees[assignees['issuekey'].isin(trainset['issuekey'])]
winissues = winissues[winissues['issuekey'].isin(trainset['issuekey'])]
assignees.set_index('issuekey', inplace=True)

username = set()
team = {}
Example #8
import os
import sys

curdir = os.getcwd()
while 'filepathhelper.py' not in os.listdir(curdir):
    curdir = os.path.dirname(curdir)
sys.path.append(curdir)
import filepathhelper

import math
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
#from tqdm import tqdm_notebook as tqdm


teams = pd.read_csv(filepathhelper.path(dataset, 'team.csv'))
closeresolve = pd.read_csv(filepathhelper.path(dataset, 'closeresolve.csv'),
                           sep=';')
winissues = pd.read_csv(filepathhelper.path(dataset, 'winissue.csv'))
assignees = pd.read_csv(filepathhelper.path(dataset, 'assignee.csv'), sep=';')
trainset = pd.read_csv(filepathhelper.path(dataset, 'trainissuekey.csv'))
issuecomponent = pd.read_csv(filepathhelper.path(dataset,
                                                 'component_title.csv'),
                             sep=';;;',
                             engine='python')

teams = teams[(teams['issuekey'].isin(closeresolve['issuekey']))]
assignees = assignees[(assignees['issuekey'].isin(teams['issuekey']))]
teams = teams[teams['issuekey'].isin(trainset['issuekey'])]
assignees = assignees[assignees['issuekey'].isin(trainset['issuekey'])]
winissues = winissues[winissues['issuekey'].isin(trainset['issuekey'])]

dev = list(teams['dev'].unique())
tmp = []
for i in dev:
Example #9
import os
import sys

import pandas as pd

curdir = os.getcwd()
while 'filepathhelper.py' not in os.listdir(curdir):
    curdir = os.path.dirname(curdir)
sys.path.append(curdir)
import filepathhelper


def issueContribution(userlog, changelog):
    # placeholder stub; the body is not implemented in this snippet
    print()


if __name__ == "__main__":

    if len(sys.argv) == 2:
        dataset = sys.argv[1]
        if dataset == 'Moodle':
            changelog = pd.read_csv(filepathhelper.path(
                dataset, 'changelog.csv'),
                                    sep=';')
            developer = pd.read_csv(
                filepathhelper.path(dataset, 'developer.csv'))
            tester = pd.read_csv(filepathhelper.path(dataset, 'tester.csv'))
            reviewer = pd.read_csv(
                filepathhelper.path(dataset, 'peer_reviewer.csv'))
            integrator = pd.read_csv(
                filepathhelper.path(dataset, 'integrator.csv'))

            developer = set(developer['username'])
            tester = set(tester['username'])
            reviewer = set(reviewer['username'])
            integrator = set(integrator['username'])

            person = developer.union(tester, reviewer)
Example #10
import json
import os
import pickle
import sys

import pandas as pd
from tqdm import tqdm

# walk up to the directory that contains config.json (the search target of
# this loop is an assumption inferred from the open() call below)
curdir = os.getcwd()
while 'config.json' not in os.listdir(curdir):
    curdir = os.path.dirname(curdir)
with open(os.path.join(curdir, 'config.json'), 'r') as f:
    dataset = json.load(f)['dataset']

curdir = os.getcwd()
while 'filepathhelper.py' not in os.listdir(curdir):
    curdir = os.path.dirname(curdir)
sys.path.append(curdir)
import filepathhelper

if __name__ == "__main__":
    # dataset comes from config.json above; an argv-based override is
    # disabled here
    #    if len(sys.argv) == 2:
    #        dataset = sys.argv[1]
    #        print(dataset)
    if True:
        changelog = pd.read_csv(filepathhelper.path(dataset, 'changelog.csv'),
                                sep=';')
        with open(filepathhelper.path(dataset, 'rp'), 'rb') as f:
            rp = pickle.load(f)
        person = set()
        for i in rp:
            person.update(rp[i])

        loggroup = changelog.groupby(['username']).groups
        userlog = set(changelog['username'])
        print('finish reading files')
        activity = {}
        for p in tqdm(person):
            if p in userlog:
                acttime = set()
                for i in loggroup[p]:
Example #11
            elif r.startswith('tester'):
                rankdict['team']['tester'].append(team[r])
        rank.append(rankdict)
        rankno += 1
    return rank

def saveOutput(outname):
    outdata = result
#    outfile = 'out\\'+outname+'.json'
    with open(outname, 'w') as outfile:
        json.dump(outdata, outfile)


if __name__ == "__main__":
    inputname = filepathhelper.path(dataset, 'input_test.json')
    random.seed(123)
    outdata = {}
    RANK = 100
    result = []
    with open(filepathhelper.path(dataset, 'rp'), 'rb') as f:
        rp = pickle.load(f)
    if len(sys.argv) == 2:
        outputname = sys.argv[1]
        individual = False
    elif len(sys.argv) == 3:
        outputname = sys.argv[1]
        individual = True
        missingrole = sys.argv[2] + '1'
    else: