コード例 #1
0
def filter_source_str(contents):
    """Lower-case the text, strip punctuation and drop repeated words.

    @param str contents: String from a `file.read()` call.
    @returns: `list` of the unique words, in order of first appearance.
    """
    # Downcase first so that words differing only in capitalisation
    # collapse into a single entry.
    lowered = contents.lower()
    # Delete every character listed in `punctuation` in a single pass.
    stripped = lowered.translate(str.maketrans('', '', punctuation))
    # Dict keys are unique and keep insertion order, so this de-duplicates
    # the whitespace-separated words while preserving first-seen order.
    return list(dict.fromkeys(stripped.split()))
コード例 #2
0
    def __get_centroid(self, cluster=None):
        """Build a centroid string from the unique words of a group of logs.

        The selected entries of `self.preprocessed_logs` are split on
        whitespace and each distinct word is kept once, in order of first
        appearance.

        @param cluster: Iterable of keys into `self.preprocessed_logs`. When
            falsy (`None` or empty), every entry of
            `self.preprocessed_logs` is used instead.
        @returns: `str` with the unique words separated by single spaces;
            an empty string when there are no words.
        """
        # Centroid for a particular cluster, or for the whole logs when no
        # (or an empty) cluster is given.
        log_ids = cluster if cluster else self.preprocessed_logs
        # Join everything once instead of growing the string per log; the
        # following split() collapses any extra whitespace, so empty
        # entries need no special handling.
        words = ' '.join(
            self.preprocessed_logs[log_id] for log_id in log_ids
        ).split()
        # Dict keys are unique and keep insertion order, which removes
        # repeated words while preserving first-seen order.
        return ' '.join(dict.fromkeys(words))