def __tfidfcosinesim(self, inputFile):
    '''
    TF-IDF Cosine Similarity Algorithm. Uses TfidfVectorizer to implement it.

    :param inputFile: Input file path
    :return: Sorted array of JSON of scanner results with sim_type as __tfidfcosinesim
    '''
    processedData1 = super().loadFile(inputFile)
    matches = initial_match(self.commentFile, processedData1, self.licenseList)

    startTime = time.time()

    # Vectorize the license corpus, then project the input file into the
    # same TF-IDF space and score it against every license.
    all_documents = self.licenseList['processed_text'].tolist()
    sklearn_tfidf = TfidfVectorizer(min_df=0, use_idf=True, smooth_idf=True,
                                    sublinear_tf=True, tokenizer=tokenize)
    all_documents_matrix = sklearn_tfidf.fit_transform(all_documents).toarray()
    search_matrix = sklearn_tfidf.transform([processedData1]).toarray()[0]

    for counter, value in enumerate(all_documents_matrix):
        sim_score = self.__cosine_similarity(value, search_matrix)
        if sim_score >= 0.3:
            matches.append({
                'shortname': self.licenseList.iloc[counter]['shortname'],
                'sim_type': "TF-IDF Cosine Sim",
                'sim_score': sim_score,
                'desc': ''
            })

    matches.sort(key=lambda x: x['sim_score'], reverse=True)
    if self.verbose > 0:
        print("time taken is " + str(time.time() - startTime) + " sec")
    return matches
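# A minimal standalone sketch (editor's illustration, not part of this class)
# of the TF-IDF cosine step above. It assumes scikit-learn is installed; the
# corpus and query strings are hypothetical:
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.metrics.pairwise import cosine_similarity
#
#     corpus = ["permission is hereby granted free of charge",
#               "licensed under the apache license version 2"]
#     vec = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True)
#     doc_matrix = vec.fit_transform(corpus)       # one row per known license
#     query = vec.transform(["permission is hereby granted"])
#     print(cosine_similarity(query, doc_matrix))  # one similarity per license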
def __tfidfsumscore(self, inputFile):
    '''
    TF-IDF Sum Score Algorithm. Uses TfidfVectorizer to implement it.

    :param inputFile: Input file path
    :return: Sorted array of JSON of scanner results with sim_type as __tfidfsumscore
    '''
    processedData1 = super().loadFile(inputFile)
    matches = initial_match(self.commentFile, processedData1, self.licenseList)

    startTime = time.time()

    # unique words from the tokenized input file
    processedData = unique(processedData1.split(" "))

    # Vectorize the license corpus plus the input file, restricting the
    # vocabulary to words that actually occur in the input file.
    all_documents = self.licenseList['processed_text'].tolist()
    all_documents.append(processedData1)
    sklearn_tfidf = TfidfVectorizer(min_df=0, use_idf=True, smooth_idf=True,
                                    sublinear_tf=True, tokenizer=tokenize,
                                    vocabulary=processedData)
    sklearn_representation = sklearn_tfidf.fit_transform(all_documents).toarray()

    # The last row is the input file itself; score every license row by the
    # sum of its TF-IDF weights.
    score_arr = []
    for counter, value in enumerate(sklearn_representation[:-1]):
        sim_score = sum(value)
        score_arr.append({
            'shortname': self.licenseList.iloc[counter]['shortname'],
            'sim_type': "Sum of TF-IDF score",
            'sim_score': sim_score,
            'desc': "Score can be greater than 1"
        })
    score_arr.sort(key=lambda x: x['sim_score'], reverse=True)
    matches = list(itertools.chain(matches, score_arr[:5]))
    matches.sort(key=lambda x: x['sim_score'], reverse=True)
    if self.verbose > 0:
        print("time taken is " + str(time.time() - startTime) + " sec")
    return matches
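# A toy sketch (editor's illustration, hypothetical strings) of the sum-score
# idea above: restrict the vocabulary to the input file's unique words, then
# rank each license by the sum of its TF-IDF weights over that vocabulary.
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#
#     query = "permission is hereby granted"
#     corpus = ["permission is hereby granted free of charge",
#               "licensed under the apache license version 2",
#               query]                             # query appended last, as above
#     vec = TfidfVectorizer(vocabulary=sorted(set(query.split(" "))))
#     rows = vec.fit_transform(corpus).toarray()
#     scores = [row.sum() for row in rows[:-1]]    # skip the query's own row
#     print(scores)                                # unnormalized: can exceed 1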
def scan(self, inputFile):
    '''
    :param inputFile: Input file path that needs to be scanned
    :return: Array of JSON with the output of scan of the file.

    +------------+-----------------------------------------------------------+
    | shortname  | Short name of the license                                 |
    +------------+-----------------------------------------------------------+
    | sim_type   | Type of similarity from which the result is generated     |
    +------------+-----------------------------------------------------------+
    | sim_score  | Similarity score for the algorithm mentioned above        |
    +------------+-----------------------------------------------------------+
    | desc       | Description / comments for the similarity measure         |
    +------------+-----------------------------------------------------------+
    '''
    processedData = super().loadFile(inputFile)
    matches = initial_match(self.commentFile, processedData, self.licenseList)

    # Result buckets for the three supported similarity types
    Cosine_matches = []
    Dice_matches = []
    Bigram_cosine_matches = []

    # Narrow the license list down to the N-gram guesses plus anything the
    # initial match already found.
    initial_guess = self.__Ngram_guess(processedData)
    ngram_guesses = []
    for guess in initial_guess:
        for x in guess['shortname']:
            ngram_guesses.append(x)

    all_guesses = unique([l['shortname'] for l in matches])
    self.licenseList = self.licenseList[
        (self.licenseList.shortname.isin(ngram_guesses)) |
        (self.licenseList.shortname.isin(all_guesses))]
    self.licenseList = self.licenseList.sort_values('shortname').reset_index(
        drop=True)

    for idx in range(len(self.licenseList)):
        if self.simType == self.NgramAlgo.cosineSim:
            # cosine similarity with unigrams
            cosineSim = cosine_similarity(
                wordFrequency(
                    self.licenseList.iloc[idx]['processed_text'].split(" ")),
                wordFrequency(processedData.split(" ")))
            if cosineSim >= 0.6:
                Cosine_matches.append({
                    'shortname': self.licenseList.iloc[idx]['shortname'],
                    'sim_type': 'CosineSim',
                    'sim_score': cosineSim,
                    'desc': ''
                })
            if self.verbose > 0:
                print("Cosine Sim ", str(cosineSim),
                      self.licenseList.iloc[idx]['shortname'])

        elif self.simType == self.NgramAlgo.diceSim:
            # dice (Sorensen) similarity
            diceSim = textdistance.sorensen(
                self.licenseList.iloc[idx]['processed_text'].split(" "),
                processedData.split(" "))
            if diceSim >= 0.6:
                Dice_matches.append({
                    'shortname': self.licenseList.iloc[idx]['shortname'],
                    'sim_type': 'DiceSim',
                    'sim_score': diceSim,
                    'desc': ''
                })
            if self.verbose > 0:
                print("Dice Sim ", str(diceSim),
                      self.licenseList.iloc[idx]['shortname'])

        elif self.simType == self.NgramAlgo.bigramCosineSim:
            # cosine similarity over bigrams
            bigram_cosine_sim = cosine_similarity(
                wordFrequency(self.__bigram_tokenize(
                    self.licenseList.iloc[idx]['processed_text'])),
                wordFrequency(self.__bigram_tokenize(processedData)))
            if bigram_cosine_sim >= 0.9:
                Bigram_cosine_matches.append({
                    'shortname': self.licenseList.iloc[idx]['shortname'],
                    'sim_type': 'BigramCosineSim',
                    'sim_score': bigram_cosine_sim,
                    'desc': ''
                })
            if self.verbose > 0:
                print("Bigram Cosine Sim ", str(bigram_cosine_sim),
                      self.licenseList.iloc[idx]['shortname'])

    if self.simType == self.NgramAlgo.cosineSim and len(Cosine_matches) > 0:
        matches = list(itertools.chain(matches, Cosine_matches))
    if self.simType == self.NgramAlgo.diceSim and len(Dice_matches) > 0:
        matches = list(itertools.chain(matches, Dice_matches))
    if self.simType == self.NgramAlgo.bigramCosineSim and len(
            Bigram_cosine_matches) > 0:
        matches = list(itertools.chain(matches, Bigram_cosine_matches))

    matches.sort(key=lambda x: x['sim_score'], reverse=True)
    return matches
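# A minimal sketch (editor's illustration) of the unigram word-frequency
# cosine used in scan() above; word_frequency and cosine below are local
# stand-ins for the wordFrequency and cosine_similarity helpers this module
# imports:
#
#     from collections import Counter
#     from math import sqrt
#
#     def word_frequency(tokens):
#         return Counter(tokens)
#
#     def cosine(a, b):
#         num = sum(a[t] * b[t] for t in set(a) & set(b))
#         den = sqrt(sum(v * v for v in a.values())) * \
#               sqrt(sum(v * v for v in b.values()))
#         return num / den if den else 0.0
#
#     doc = word_frequency("permission is hereby granted".split(" "))
#     lic = word_frequency("permission is hereby granted free".split(" "))
#     print(cosine(doc, lic))   # near 1 for near-identical texts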