def scan(self, filePath):
    '''
    Identify the license of a file using the Damerau-Levenshtein distance.

    The file is loaded through the parent class's ``loadFile`` and the
    resulting text is first handed to ``exactMatcher``. Only when that
    reports no match (returns ``-1``) is the text compared, word by word,
    against the ``processed_text`` of every entry in ``self.licenseList``.

    :param filePath: Path of the file to scan
    :return: First element of the ``exactMatcher`` result when it finds a
      match; otherwise the ``shortname`` (as ``str``) of the license with
      the smallest Damerau-Levenshtein distance. On ties the earliest
      license in ``self.licenseList`` wins.
    '''
    processedData = super().loadFile(filePath)

    exactMatch = exactMatcher(processedData, self.licenseList)
    if exactMatch != -1:
        return exactMatch[0]

    licenses = self.licenseList
    # The scanned file's word list is the same for every comparison, so it
    # is built once instead of once per license.
    fileWords = processedData.split(" ")
    verbose = self.verbose > 0

    closestDistance = sys.maxsize
    closestIdx = 0
    # Walk the two columns directly rather than materialising a row with
    # .iloc[idx] for every lookup.
    columns = zip(licenses['shortname'], licenses['processed_text'])
    for idx, (shortname, licenseText) in enumerate(columns):
        distance = damerau_levenshtein_distance(
            fileWords, licenseText.split(" "))
        if verbose:
            print(" ".join((str(idx), shortname, str(distance))))
        # Strict comparison keeps the first license seen on equal distances.
        if distance < closestDistance:
            closestDistance = distance
            closestIdx = idx

    return str(licenses.iloc[closestIdx]['shortname'])
def scan(self, filePath):
    '''
    Identify the license of a file using histogram (word frequency)
    similarity.

    The file is loaded through the parent class's ``loadFile`` and the
    resulting text is first handed to ``exactMatcher``. If that reports no
    match (returns ``-1``), a word-frequency table is built for the text and
    for the ``processed_text`` of every license; each license is scored by
    summing, over the words of the scanned text, the smaller of the two
    frequencies.

    :param filePath: Input file path that needs to be scanned
    :return: The ``exactMatcher`` result when it finds a match; otherwise
      the ``shortname`` (as ``str``) of the license with the highest score.
      On ties the earliest license wins, and the license labelled 0 is
      returned when no license scores above zero.
    '''
    processedData = super().loadFile(filePath)
    debug = self.verbose > 0
    if debug:
        print("PROCESSED DATA IS ", processedData)
        print("LICENSES[0]", str(self.licenseList.iloc[0]))

    exactMatch = exactMatcher(processedData, self.licenseList)
    if exactMatch != -1:
        return exactMatch

    # Only lowercase words of 3 to 15 letters take part in the histograms.
    wordPattern = r'\b[a-z]{3,15}\b'

    # One frequency table per license. ``.at`` looks rows up by label, so
    # this relies on the index labels being 0..len-1 (assumed default
    # index; TODO confirm against where licenseList is built).
    licenseHistograms = [
        wordFrequency(
            re.findall(wordPattern,
                       self.licenseList.at[row, 'processed_text']))
        for row in range(len(self.licenseList))
    ]
    fileHistogram = wordFrequency(re.findall(wordPattern, processedData))

    if debug:
        print("Frequency array of licenses", licenseHistograms[0])
        print("Frequency table of input data", fileHistogram)

    # Histogram Similarity Algorithm
    bestScore = 0
    bestIdx = 0
    for idx, histogram in enumerate(licenseHistograms):
        # Per-word intersection of the two histograms.
        overlaps = (
            min(histogram.get(word, 0), fileCount)
            for word, fileCount in fileHistogram.items()
        )
        score = sum(overlap for overlap in overlaps if overlap > 0)

        if debug:
            print(idx, self.licenseList.at[idx, 'shortname'], score)

        # Strictly greater, so the first license keeps the lead on ties.
        if score > bestScore:
            bestIdx = idx
            bestScore = score

    if debug:
        print("Result is license with ID", bestIdx)
    return str(self.licenseList.at[bestIdx, 'shortname'])