def refine_cluster(license_cluster, verbose=0):
    '''
    Refine initial license clusters using pairwise cosine similarity.

    Two licenses in the same initial cluster are linked when the cosine
    similarity of their word-frequency tables is strictly greater than
    MAX_ALLOWED_DISTANCE; linked pairs are then merged transitively via
    union-find.

    :param license_cluster: dict mapping a root license name to a list of
                            license dicts (each with 'processed_text' and
                            'shortname' keys)
    :param verbose: print pairwise similarity details when > 0
    :return: Refined license cluster array (list of lists of shortnames)
    '''
    cluster = {}
    for key, initial_cluster in license_cluster.items():
        # Hoist: build each member's word-frequency table once, instead of
        # recomputing it for every pair (was O(n^2) wordFrequency calls).
        frequencies = [
            wordFrequency(member['processed_text'].split(" "))
            for member in initial_cluster
        ]
        for i in range(len(initial_cluster)):
            for j in range(i + 1, len(initial_cluster)):
                dist = cosine_similarity(frequencies[i], frequencies[j])
                if verbose > 0:
                    print(key, initial_cluster[i]['shortname'],
                          initial_cluster[j]['shortname'], dist)
                if dist > MAX_ALLOWED_DISTANCE:
                    if verbose > 0:
                        print("Pushed in cluster", key,
                              initial_cluster[i]['shortname'],
                              initial_cluster[j]['shortname'], dist)
                    cluster.setdefault(key, []).append(
                        [initial_cluster[i]['shortname'],
                         initial_cluster[j]['shortname']])
    result = []
    for key, arr in cluster.items():
        # union_and_find merges transitively connected pairs into sets;
        # convert each set to a list for the final output.
        for clustr in union_and_find(arr):
            result.append(list(clustr))
    return result
def scan(self, filePath):
    '''
    Classify the license of a file using the histogram-similarity algorithm.

    An exact match against the known licenses short-circuits the histogram
    comparison; otherwise the license whose word-frequency histogram has the
    maximum intersection with the input's histogram wins.

    :param filePath: Input file path that needs to be scanned
    :return: License short name with maximum intersection with word
             frequency of licenses (or the exactMatcher result when found)
    '''
    processedData = super().loadFile(filePath)
    if self.verbose > 0:
        print("PROCESSED DATA IS ", processedData)
        print("LICENSES[0]", str(self.licenseList.iloc[0]))
    temp = exactMatcher(processedData, self.licenseList)
    if temp != -1:
        # Exact match found; no need to run the histogram algorithm.
        return temp
    # Build a word-frequency table for every known license.
    licensesFrequency = []
    for idx in range(len(self.licenseList)):
        # renamed from `license` to avoid shadowing the site builtin
        licenseText = self.licenseList.at[idx, 'processed_text']
        licensesFrequency.append(
            wordFrequency(re.findall(r'\b[a-z]{3,15}\b', licenseText)))
    processedLicense = wordFrequency(
        re.findall(r'\b[a-z]{3,15}\b', processedData))
    if self.verbose > 0:
        print("Frequency array of licenses", licensesFrequency[0])
        print("Frequency table of input data", processedLicense)
    # Histogram Similarity Algorithm: score each license by the histogram
    # intersection, i.e. the sum over shared words of min(frequencies).
    globalCount = 0
    result = 0  # NOTE: with no overlap at all this defaults to license 0
    for idx, licenseFreq in enumerate(licensesFrequency):
        tempCount = sum(
            min(licenseFreq.get(word, 0), inputWordFreq)
            for word, inputWordFreq in processedLicense.items())
        if self.verbose > 0:
            print(idx, self.licenseList.at[idx, 'shortname'], tempCount)
        # strict comparison keeps the first license on ties
        if globalCount < tempCount:
            result = idx
            globalCount = tempCount
    if self.verbose > 0:
        print("Result is license with ID", result)
    return str(self.licenseList.at[result, 'shortname'])
def scan(self, inputFile):
    '''
    Scan a file for licenses using the configured N-gram similarity measure
    (unigram cosine, Dice/Sorensen, or bigram cosine, per ``self.simType``).

    :param inputFile: Input file path that needs to be scanned
    :return: Array of JSON with the output of scan of the file.

    +-------------+----------------------------------------------------------+
    | shortname   | Short name of the license                                |
    +-------------+----------------------------------------------------------+
    | sim_type    | Type of similarity from which the result is generated    |
    +-------------+----------------------------------------------------------+
    | sim_score   | Similarity score for the algorithm used mentioned above  |
    +-------------+----------------------------------------------------------+
    | description | Description/ comments for the similarity measure         |
    +-------------+----------------------------------------------------------+
    '''
    processedData = super().loadFile(inputFile)
    matches = initial_match(self.commentFile, processedData,
                            self.licenseList)

    Cosine_matches = []
    Dice_matches = []
    Bigram_cosine_matches = []

    # Narrow the candidate licenses to the union of the N-gram guesses and
    # the shortnames already found by initial_match.
    initial_guess = self.__Ngram_guess(processedData)
    ngram_guesses = []
    for guess in initial_guess:
        for shortname in guess['shortname']:
            ngram_guesses.append(shortname)
    all_guesses = unique([match['shortname'] for match in matches])
    self.licenseList = self.licenseList[
        (self.licenseList.shortname.isin(ngram_guesses)) |
        (self.licenseList.shortname.isin(all_guesses))]
    # BUG FIX: sort_values()/reset_index() return new DataFrames; the
    # original discarded the result, so the list was never actually sorted
    # or reindexed. Assign it back.
    self.licenseList = self.licenseList.sort_values(
        'shortname').reset_index(drop=True)

    # Hoist the loop-invariant tokenization/frequency table of the input
    # document out of the per-license loop (it was recomputed on every
    # iteration). simType is fixed for the whole scan.
    if self.simType == self.NgramAlgo.cosineSim:
        inputWordFreq = wordFrequency(processedData.split(" "))
    elif self.simType == self.NgramAlgo.diceSim:
        inputTokens = processedData.split(" ")
    elif self.simType == self.NgramAlgo.bigramCosineSim:
        inputBigramFreq = wordFrequency(self.__bigram_tokenize(processedData))

    for idx in range(len(self.licenseList)):
        if self.simType == self.NgramAlgo.cosineSim:
            # cosine similarity with unigram word frequencies
            cosineSim = cosine_similarity(
                wordFrequency(
                    self.licenseList.iloc[idx]['processed_text'].split(" ")),
                inputWordFreq)
            if cosineSim >= 0.6:
                Cosine_matches.append({
                    'shortname': self.licenseList.iloc[idx]['shortname'],
                    'sim_type': 'CosineSim',
                    'sim_score': cosineSim,
                    'description': ''
                })
            if self.verbose > 0:
                print("Cosine Sim ", str(cosineSim),
                      self.licenseList.iloc[idx]['shortname'])
        elif self.simType == self.NgramAlgo.diceSim:
            # Dice (Sorensen) similarity over unigram tokens
            diceSim = textdistance.sorensen(
                self.licenseList.iloc[idx]['processed_text'].split(" "),
                inputTokens)
            if diceSim >= 0.6:
                Dice_matches.append({
                    'shortname': self.licenseList.iloc[idx]['shortname'],
                    'sim_type': 'DiceSim',
                    'sim_score': diceSim,
                    'description': ''
                })
            if self.verbose > 0:
                print("Dice Sim ", str(diceSim),
                      self.licenseList.iloc[idx]['shortname'])
        elif self.simType == self.NgramAlgo.bigramCosineSim:
            # bigram cosine similarity (stricter 0.9 threshold)
            bigram_cosine_sim = cosine_similarity(
                wordFrequency(
                    self.__bigram_tokenize(
                        self.licenseList.iloc[idx]['processed_text'])),
                inputBigramFreq)
            if bigram_cosine_sim >= 0.9:
                Bigram_cosine_matches.append({
                    'shortname': self.licenseList.iloc[idx]['shortname'],
                    'sim_type': 'BigramCosineSim',
                    'sim_score': bigram_cosine_sim,
                    'description': ''
                })
            if self.verbose > 0:
                print("Bigram Cosine Sim ", str(bigram_cosine_sim),
                      self.licenseList.iloc[idx]['shortname'])

    # Append whichever similarity result set corresponds to the active
    # algorithm to the initial matches.
    if self.simType == self.NgramAlgo.cosineSim and len(
            Cosine_matches) > 0:
        matches = list(itertools.chain(matches, Cosine_matches))
    if self.simType == self.NgramAlgo.diceSim and len(Dice_matches) > 0:
        matches = list(itertools.chain(matches, Dice_matches))
    if self.simType == self.NgramAlgo.bigramCosineSim and len(
            Bigram_cosine_matches) > 0:
        matches = list(itertools.chain(matches, Bigram_cosine_matches))

    matches.sort(key=lambda x: x['sim_score'], reverse=True)
    return matches