Example #1
  def __tfidfcosinesim(self, inputFile):
    '''
    TF-IDF cosine similarity algorithm, implemented with TfidfVectorizer.

    :param inputFile: Input file path
    :return: Sorted array of scanner results (JSON-like dicts) with sim_type "TF-IDF Cosine Sim"
    '''
    processedData1 = super().loadFile(inputFile)
    matches = initial_match(self.commentFile, processedData1, self.licenseList)

    startTime = time.time()

    all_documents = self.licenseList['processed_text'].tolist()
    sklearn_tfidf = TfidfVectorizer(min_df=0, use_idf=True, smooth_idf=True,
                                    sublinear_tf=True, tokenizer=tokenize)

    all_documents_matrix = sklearn_tfidf.fit_transform(all_documents).toarray()
    search_matrix = sklearn_tfidf.transform([processedData1]).toarray()[0]

    for counter, value in enumerate(all_documents_matrix):
      sim_score = self.__cosine_similarity(value, search_matrix)
      if sim_score >= 0.3:
        matches.append({
          'shortname': self.licenseList.iloc[counter]['shortname'],
          'sim_type': "TF-IDF Cosine Sim",
          'sim_score': sim_score,
          'desc': ''
        })
    matches.sort(key=lambda x: x['sim_score'], reverse=True)
    if self.verbose > 0:
      print("time taken is " + str(time.time() - startTime) + " sec")
    return matches
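For readers who want to try the idea outside the class above, here is a minimal, self-contained sketch of TF-IDF cosine similarity. It is an assumption-laden illustration, not the project's code: two made-up license snippets stand in for atarashi's license list, sklearn's default tokenizer replaces the project's tokenize helper, and the names licenses and scanned_text are illustrative only. The 0.3 threshold mirrors the method above.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical license corpus and scanned file contents (illustrative only)
licenses = {
    'MIT': 'permission is hereby granted free of charge to any person',
    'Apache-2.0': 'licensed under the apache license you may not use this file except in compliance',
}
scanned_text = 'permission is hereby granted free of charge to any person obtaining this software'

vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True)
license_matrix = vectorizer.fit_transform(list(licenses.values())).toarray()
query_vector = vectorizer.transform([scanned_text]).toarray()[0]

def cosine(a, b):
    # Plain cosine similarity between two dense vectors
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0

matches = []
for name, vec in zip(licenses, license_matrix):
    sim_score = cosine(vec, query_vector)
    if sim_score >= 0.3:
        matches.append({'shortname': name, 'sim_type': 'TF-IDF Cosine Sim',
                        'sim_score': sim_score, 'desc': ''})
matches.sort(key=lambda m: m['sim_score'], reverse=True)
print(matches)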
Example #2
File: tfidf.py  Project: fossology/atarashi
    def __tfidfsumscore(self, inputFile):
        '''
        TF-IDF sum score algorithm, implemented with TfidfVectorizer.

        :param inputFile: Input file path
        :return: Sorted array of scanner results (JSON-like dicts) with sim_type "Sum of TF-IDF score"
        '''
        processedData1 = super().loadFile(inputFile)
        matches = initial_match(self.commentFile, processedData1,
                                self.licenseList)

        startTime = time.time()

        # unique words from tokenized input file
        processedData = unique(processedData1.split(" "))

        all_documents = self.licenseList['processed_text'].tolist()
        all_documents.append(processedData1)
        sklearn_tfidf = TfidfVectorizer(min_df=0,
                                        use_idf=True,
                                        smooth_idf=True,
                                        sublinear_tf=True,
                                        tokenizer=tokenize,
                                        vocabulary=processedData)

        sklearn_representation = sklearn_tfidf.fit_transform(
            all_documents).toarray()

        score_arr = []
        # The last row of the matrix is the input file itself; score only the licenses.
        for counter, value in enumerate(sklearn_representation[:-1]):
            sim_score = sum(value)
            score_arr.append({
                'shortname': self.licenseList.iloc[counter]['shortname'],
                'sim_type': "Sum of TF-IDF score",
                'sim_score': sim_score,
                'desc': "Score can be greater than 1 also"
            })
        score_arr.sort(key=lambda x: x['sim_score'], reverse=True)
        matches = list(itertools.chain(matches, score_arr[:5]))
        matches.sort(key=lambda x: x['sim_score'], reverse=True)
        if self.verbose > 0:
            print("time taken is " + str(time.time() - startTime) + " sec")
        return matches
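The sum-score variant can be sketched the same way. The sketch below works under the same assumptions as before (made-up texts, sklearn's default tokenizer): it fixes the vocabulary to the scanned file's unique tokens, fits TF-IDF on the licenses plus the file, drops the last row (the file itself), and ranks licenses by the sum of their weights. Because it is a sum rather than a normalized similarity, the score can exceed 1.

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical inputs (illustrative only)
licenses = {
    'MIT': 'permission is hereby granted free of charge to any person',
    'Apache-2.0': 'licensed under the apache license you may not use this file except in compliance',
}
scanned_text = 'permission is hereby granted free of charge to any person obtaining this software'

# Vocabulary restricted to the scanned file's unique tokens
vocabulary = sorted(set(scanned_text.split()))
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                             vocabulary=vocabulary)

# Fit on the licenses plus the scanned file, then drop the file's own row
matrix = vectorizer.fit_transform(list(licenses.values()) + [scanned_text]).toarray()
score_arr = [{'shortname': name, 'sim_type': 'Sum of TF-IDF score',
              'sim_score': float(row.sum()),
              'desc': 'Score can be greater than 1 also'}
             for name, row in zip(licenses, matrix[:-1])]
score_arr.sort(key=lambda s: s['sim_score'], reverse=True)
print(score_arr[:5])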
Example #3
    def scan(self, inputFile):
        '''
        :param inputFile: Input file path that needs to be scanned
        :return: Array of scanner results (JSON-like dicts) for the file

        +------------+-----------------------------------------------------------+
        | shortname  | Short name of the license                                 |
        +------------+-----------------------------------------------------------+
        | sim_type   | Type of similarity from which the result is generated     |
        +------------+-----------------------------------------------------------+
        | sim_score  | Similarity score for the algorithm mentioned above        |
        +------------+-----------------------------------------------------------+
        | desc       | Description/comments for the similarity measure           |
        +------------+-----------------------------------------------------------+
        '''
        processedData = super().loadFile(inputFile)
        matches = initial_match(self.commentFile, processedData,
                                self.licenseList)

        # Candidate match lists, one per n-gram similarity type
        Cosine_matches = []
        Dice_matches = []
        Bigram_cosine_matches = []

        initial_guess = self.__Ngram_guess(processedData)
        ngram_guesses = []
        for guess in initial_guess:
            for x in guess['shortname']:
                ngram_guesses.append(x)

        all_guesses = unique([l['shortname'] for l in matches])
        self.licenseList = self.licenseList[
            (self.licenseList.shortname.isin(ngram_guesses)) |
            (self.licenseList.shortname.isin(all_guesses))]
        self.licenseList = self.licenseList.sort_values(
            'shortname').reset_index(drop=True)

        for idx in range(len(self.licenseList)):

            if self.simType == self.NgramAlgo.cosineSim:
                # cosine similarity with unigram
                cosineSim = cosine_similarity(
                    wordFrequency(self.licenseList.iloc[idx]
                                  ['processed_text'].split(" ")),
                    wordFrequency(processedData.split(" ")))
                if cosineSim >= 0.6:
                    Cosine_matches.append({
                        'shortname': self.licenseList.iloc[idx]['shortname'],
                        'sim_type': 'CosineSim',
                        'sim_score': cosineSim,
                        'desc': ''
                    })
                if self.verbose > 0:
                    print("Cosine Sim ", str(cosineSim),
                          self.licenseList.iloc[idx]['shortname'])

            elif self.simType == self.NgramAlgo.diceSim:
                # dice similarity
                diceSim = textdistance.sorensen(
                    self.licenseList.iloc[idx]['processed_text'].split(" "),
                    processedData.split(" "))
                if diceSim >= 0.6:
                    Dice_matches.append({
                        'shortname': self.licenseList.iloc[idx]['shortname'],
                        'sim_type': 'DiceSim',
                        'sim_score': diceSim,
                        'desc': ''
                    })
                if self.verbose > 0:
                    print("Dice Sim ", str(diceSim),
                          self.licenseList.iloc[idx]['shortname'])

            elif self.simType == self.NgramAlgo.bigramCosineSim:
                bigram_cosine_sim = cosine_similarity(
                    wordFrequency(
                        self.__bigram_tokenize(
                            self.licenseList.iloc[idx]['processed_text'])),
                    wordFrequency(self.__bigram_tokenize(processedData)))
                if bigram_cosine_sim >= 0.9:
                    Bigram_cosine_matches.append({
                        'shortname': self.licenseList.iloc[idx]['shortname'],
                        'sim_type': 'BigramCosineSim',
                        'sim_score': bigram_cosine_sim,
                        'desc': ''
                    })
                if self.verbose > 0:
                    print("Bigram Cosine Sim ", str(bigram_cosine_sim),
                          self.licenseList.iloc[idx]['shortname'])

        if self.simType == self.NgramAlgo.cosineSim and Cosine_matches:
            matches = list(itertools.chain(matches, Cosine_matches))

        if self.simType == self.NgramAlgo.diceSim and Dice_matches:
            matches = list(itertools.chain(matches, Dice_matches))

        if self.simType == self.NgramAlgo.bigramCosineSim and Bigram_cosine_matches:
            matches = list(itertools.chain(matches, Bigram_cosine_matches))

        matches.sort(key=lambda x: x['sim_score'], reverse=True)
        return matches
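To make the three measures used by scan concrete without the project's helpers, here is a small self-contained sketch. It is an approximation under stated assumptions: collections.Counter stands in for atarashi's wordFrequency, str.split for its tokenizer, a set-based Dice coefficient for textdistance.sorensen, and the bigram_tokenize function plus both texts are made up for illustration.

import math
from collections import Counter

def cosine_similarity(freq_a, freq_b):
    # Cosine similarity over term-frequency dictionaries
    dot = sum(freq_a[t] * freq_b[t] for t in set(freq_a) & set(freq_b))
    norm = (math.sqrt(sum(v * v for v in freq_a.values())) *
            math.sqrt(sum(v * v for v in freq_b.values())))
    return dot / norm if norm else 0.0

def dice_similarity(tokens_a, tokens_b):
    # Set-based Sorensen-Dice coefficient (stand-in for textdistance.sorensen)
    a, b = set(tokens_a), set(tokens_b)
    return 2 * len(a & b) / (len(a) + len(b)) if (a or b) else 0.0

def bigram_tokenize(text):
    # Consecutive word pairs, joined so they can be counted like unigrams
    tokens = text.split()
    return [' '.join(pair) for pair in zip(tokens, tokens[1:])]

license_text = 'permission is hereby granted free of charge to any person'
file_text = 'permission is hereby granted free of charge to any person obtaining this software'

print('CosineSim       ', cosine_similarity(Counter(license_text.split()),
                                             Counter(file_text.split())))
print('DiceSim         ', dice_similarity(license_text.split(), file_text.split()))
print('BigramCosineSim ', cosine_similarity(Counter(bigram_tokenize(license_text)),
                                             Counter(bigram_tokenize(file_text))))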