예제 #1
0
def refine_cluster(license_cluster, verbose=0):
  '''
  Split each initial license cluster into groups of near-identical texts.

  Every pair of licenses inside an initial cluster is compared through the
  cosine similarity of their word-frequency tables. Pairs scoring strictly
  above MAX_ALLOWED_DISTANCE are collected per cluster key and then merged
  with union_and_find.

  :param license_cluster: Mapping of cluster key to an indexable collection
    of license records, each providing 'processed_text' and 'shortname'
  :param verbose: When greater than 0, print every compared pair and every
    pair that is accepted into a cluster
  :return: List of lists, one per group returned by union_and_find
  '''
  similar_pairs = {}
  for key, members in license_cluster.items():
    size = len(members)
    for first in range(size):
      # range() is empty when `first` is the last index, so no extra guard
      # is needed for the final element.
      for second in range(first + 1, size):
        left = members[first]
        right = members[second]
        score = cosine_similarity(
          wordFrequency(left['processed_text'].split(" ")),
          wordFrequency(right['processed_text'].split(" ")))
        if verbose > 0:
          print(key, left['shortname'], right['shortname'], score)
        if score > MAX_ALLOWED_DISTANCE:
          if verbose > 0:
            print("Pushed in cluster", key, left['shortname'],
                  right['shortname'], score)
          pair = [left['shortname'], right['shortname']]
          similar_pairs.setdefault(key, []).append(pair)

  result = []
  for pairs in similar_pairs.values():
    # union_and_find yields groups (sets in the original comment); callers
    # receive plain lists.
    result.extend(list(group) for group in union_and_find(pairs))

  return result
예제 #2
0
    def scan(self, filePath):
        '''
        Classify the license of a file with the histogram similarity
        algorithm.

        The loaded file is first passed to ``exactMatcher``; any result other
        than ``-1`` is returned as is. Otherwise the word-frequency table of
        the input is intersected with the table of every license in
        ``self.licenseList`` and the license with the largest intersection
        wins (the first row is returned when nothing overlaps).

        :param filePath: Input file path that needs to be scanned
        :return: Result of ``exactMatcher`` when it is not ``-1``, otherwise
            the short name of the license with maximum word-frequency
            intersection
        '''
        processedData = super().loadFile(filePath)
        if self.verbose > 0:
            print("PROCESSED DATA IS ", processedData)
            print("LICENSES[0]", str(self.licenseList.iloc[0]))

        exactResult = exactMatcher(processedData, self.licenseList)
        if exactResult != -1:
            return exactResult

        # Only lowercase words of 3 to 15 letters take part in the histograms.
        wordPattern = r'\b[a-z]{3,15}\b'
        licenseHistograms = [
            wordFrequency(
                re.findall(wordPattern,
                           self.licenseList.at[row, 'processed_text']))
            for row in range(len(self.licenseList))
        ]
        inputHistogram = wordFrequency(re.findall(wordPattern, processedData))

        if self.verbose > 0:
            print("Frequency array of licenses", licenseHistograms[0])
            print("Frequency table of input data", inputHistogram)

        # Histogram Similarity Algorithm: sum, over the input's words, the
        # smaller of the two frequencies and keep the best scoring license.
        bestScore = 0
        bestRow = 0
        for row, histogram in enumerate(licenseHistograms):
            overlap = 0
            for word, inputCount in inputHistogram.items():
                shared = min(histogram.get(word, 0), inputCount)
                if shared > 0:
                    overlap += shared
            if self.verbose > 0:
                print(row, self.licenseList.at[row, 'shortname'],
                      overlap)
            # Strict comparison: on ties the earliest license is kept.
            if overlap > bestScore:
                bestRow, bestScore = row, overlap

        if self.verbose > 0:
            print("Result is license with ID", bestRow)
        return str(self.licenseList.at[bestRow, 'shortname'])
예제 #3
0
    def scan(self, inputFile):
        '''
        Scan a file and score it against candidate licenses using the
        similarity measure selected by ``self.simType``.

        Candidates are the licenses named either by the n-gram guess or by
        the results of ``initial_match``. Each candidate scoring at or above
        the threshold (0.6 for cosine and dice similarity, 0.9 for bigram
        cosine similarity) is appended to the ``initial_match`` results.

        ``self.licenseList`` is not modified: filtering and sorting are done
        on a local frame so repeated scans on one instance always start from
        the full license list.

        :param inputFile: Input file path that needs to be scanned
        :return: List of match dictionaries sorted by ``sim_score`` in
            descending order. Entries added by this method have these keys:

        +-------------+----------------------------------------------------------+
        | shortname   | Short name of the license                                |
        +-------------+----------------------------------------------------------+
        | sim_type    | Type of similarity from which the result is generated    |
        +-------------+----------------------------------------------------------+
        | sim_score   | Similarity score for the algorithm used mentioned above  |
        +-------------+----------------------------------------------------------+
        | description | Description/ comments for the similarity measure         |
        +-------------+----------------------------------------------------------+
        '''
        processedData = super().loadFile(inputFile)
        matches = initial_match(self.commentFile, processedData,
                                self.licenseList)

        # Each n-gram guess carries an iterable of short names; flatten them.
        ngram_guesses = [
            shortname
            for guess in self.__Ngram_guess(processedData)
            for shortname in guess['shortname']
        ]
        all_guesses = unique([match['shortname'] for match in matches])

        # Keep the narrowed list local. Writing it back to self.licenseList
        # would shrink the license list for every later scan, and
        # sort_values()/reset_index() return new frames, so their result has
        # to be assigned to take effect.
        shortnames = self.licenseList.shortname
        candidates = self.licenseList[
            shortnames.isin(ngram_guesses) | shortnames.isin(all_guesses)]
        candidates = candidates.sort_values('shortname').reset_index(
            drop=True)

        algo = self.NgramAlgo
        sim_matches = []
        for idx in range(len(candidates)):
            row = candidates.iloc[idx]
            shortname = row['shortname']
            licenseText = row['processed_text']

            if self.simType == algo.cosineSim:
                # cosine similarity with unigram
                sim_type, threshold, label = 'CosineSim', 0.6, "Cosine Sim "
                score = cosine_similarity(
                    wordFrequency(licenseText.split(" ")),
                    wordFrequency(processedData.split(" ")))
            elif self.simType == algo.diceSim:
                # dice similarity
                sim_type, threshold, label = 'DiceSim', 0.6, "Dice Sim "
                score = textdistance.sorensen(
                    licenseText.split(" "), processedData.split(" "))
            elif self.simType == algo.bigramCosineSim:
                # full text bi-gram cosine similarity
                sim_type, threshold, label = ('BigramCosineSim', 0.9,
                                              "Bigram Cosine Sim ")
                score = cosine_similarity(
                    wordFrequency(self.__bigram_tokenize(licenseText)),
                    wordFrequency(self.__bigram_tokenize(processedData)))
            else:
                # No recognised similarity type: nothing to score.
                break

            if score >= threshold:
                sim_matches.append({
                    'shortname': shortname,
                    'sim_type': sim_type,
                    'sim_score': score,
                    'description': ''
                })
            # Every computed score is reported in verbose mode, whichever
            # similarity type is active.
            if self.verbose > 0:
                print(label, str(score), shortname)

        if sim_matches:
            matches = list(itertools.chain(matches, sim_matches))

        matches.sort(key=lambda match: match['sim_score'], reverse=True)
        return matches