def getNTForFile(topic_file_dict_, count_of_files_in_corpus, datasetFileParam):
    """Map every file name in the corpus to the list of its matched topics.

    For each index in ``0 .. count_of_files_in_corpus - 1`` the file name is
    looked up in the list returned by ``file_mapper.getPuppetFileList`` and
    the topics returned by ``getMatchedTopics`` for that index are stored
    as a fresh list.

    Returns a dict: file name -> list of matched topics.
    """
    puppet_files = file_mapper.getPuppetFileList(datasetFileParam)
    topics_by_file = {}
    for idx in xrange(count_of_files_in_corpus):
        current_file = puppet_files[idx]
        # list(...) copies the matches, same as appending them one by one.
        topics_by_file[current_file] = list(
            getMatchedTopics(idx, topic_file_dict_))
    return topics_by_file
def getNDTForFile(defect_density_, topic_file_dict_, count_of_files_in_corpus,
                  datasetFileParam):
    """Map every file name to its matched topics with above-mean defect density.

    Parameters:
        defect_density_: dict mapping a topic to its defect density.
        topic_file_dict_: passed through to ``getMatchedTopics``.
        count_of_files_in_corpus: number of file indices to process,
            starting at 0.
        datasetFileParam: passed to ``file_mapper.getPuppetFileList`` to
            obtain the list of file names indexed by file index.

    Returns a dict: file name -> list of matched topics whose density is
    strictly greater than the mean reported by ``getStatofValueDict``.

    A matched topic that has no entry in ``defect_density_`` is treated as
    having density 0 (the same convention ``getDTMForFile`` uses) instead of
    raising ``KeyError``.
    """
    dict_file_NDT = {}
    # Only the mean is used as the threshold; the median is ignored.
    _, mean_ = getStatofValueDict(defect_density_)
    allPuppFiles = file_mapper.getPuppetFileList(datasetFileParam)
    for file_index in xrange(count_of_files_in_corpus):
        file_name = allPuppFiles[file_index]
        matchingTopics = getMatchedTopics(file_index, topic_file_dict_)
        dict_file_NDT[file_name] = [
            mTop for mTop in matchingTopics
            if defect_density_.get(mTop, float(0)) > mean_
        ]

    return dict_file_NDT
def getTMForFile(topic_file_dict_, count_of_files_in_corpus, topic_count_param,
                 topicProbParam, datasetFileParam):
    """Build, for every file, a vector of topic probabilities.

    The vector has ``topic_count_param`` entries. Entry ``t`` is the value
    returned by ``file_mapper.getTopicProbOfTheTopic`` when topic ``t`` is
    among the file's matched topics, and ``0.0`` otherwise.

    Returns a dict: file name -> list of probabilities.
    """
    puppet_files = file_mapper.getPuppetFileList(datasetFileParam)
    probs_by_file = {}
    for idx in xrange(count_of_files_in_corpus):
        current_file = puppet_files[idx]
        matched = getMatchedTopics(idx, topic_file_dict_)
        row = []
        for topic_id in xrange(topic_count_param):
            # Unmatched topics contribute a zero probability.
            if topic_id not in matched:
                row.append(float(0))
                continue
            row.append(
                file_mapper.getTopicProbOfTheTopic(topic_id, topicProbParam,
                                                   idx))
        probs_by_file[current_file] = row
    return probs_by_file
def getDensityOfDefectsForTopic(topicToDefectParam, datasetFileParam):
    """Compute the defect density of every topic.

    Parameters:
        topicToDefectParam: dict mapping a topic to an iterable of file
            indices (positions in the list returned by
            ``file_mapper.getPuppetFileList``).
        datasetFileParam: passed to ``file_mapper.getPuppetFileList``.

    Returns a dict: topic -> float density, where density is

        (number of defect-category entries other than 'N' over the
         topic's files) / (total number of lines in the topic's files)

    Each file is opened and its lines are counted, so the file names must
    be readable paths. A topic with no mapped files, or whose files contain
    no lines at all, gets a density of 0.0 rather than raising.
    """
    topic_to_defect_density_ = {}
    allPuppFiles = file_mapper.getPuppetFileList(datasetFileParam)
    # NOTE(review): presumably maps file name -> iterable of category codes,
    # with 'N' meaning "no defect" -- confirm against file_mapper.
    puppetFileDict = file_mapper.getPuppetFileDetails()
    for topic_, mappedFiles in topicToDefectParam.iteritems():
        loc_per_topic = 0
        defect_per_topic = 0
        for file_index in mappedFiles:
            file_ = allPuppFiles[file_index]
            # Every category entry except 'N' counts as one defect.
            defect_per_topic += sum(
                1 for categ in puppetFileDict[file_] if categ != 'N')
            # Count all lines of the file; 'with' guarantees the handle is
            # closed instead of being left to the garbage collector.
            with open(file_) as file_handle:
                loc_per_topic += sum(1 for _ in file_handle)
        if loc_per_topic > 0:
            density = float(defect_per_topic) / float(loc_per_topic)
        else:
            # No lines to normalise by: report zero instead of dividing by 0.
            density = float(0)
        topic_to_defect_density_[topic_] = density
    return topic_to_defect_density_
def getDTMForFile(defect_density_, topic_file_dict_, count_of_files_in_corpus,
                  topic_count_param, topicProbParam, datasetFileParam):
    """Build, for every file, a probability vector limited to dense topics.

    The vector has ``topic_count_param`` entries. Entry ``t`` is the value
    returned by ``file_mapper.getTopicProbOfTheTopic`` only when topic ``t``
    is matched for the file AND its defect density is strictly above the
    mean reported by ``getStatofValueDict``; otherwise it is ``0.0``.
    Topics absent from ``defect_density_`` count as density ``0.0``.

    Returns a dict: file name -> list of probabilities.
    """
    _unused_median, mean_density = getStatofValueDict(defect_density_)
    puppet_files = file_mapper.getPuppetFileList(datasetFileParam)
    vectors_by_file = {}
    for idx in xrange(count_of_files_in_corpus):
        current_file = puppet_files[idx]
        matched = getMatchedTopics(idx, topic_file_dict_)
        row = []
        for topic_id in xrange(topic_count_param):
            topic_density = defect_density_.get(topic_id, float(0))
            is_selected = (topic_id in matched) and (
                topic_density > mean_density)
            if is_selected:
                value = file_mapper.getTopicProbOfTheTopic(
                    topic_id, topicProbParam, idx)
            else:
                value = float(0)
            row.append(value)
        vectors_by_file[current_file] = row
    return vectors_by_file