def filter_source_str(contents):
    """Lower-case the text, strip punctuation and drop duplicate words.

    Every character listed in `punctuation` (presumably `string.punctuation`
    -- confirm against the file's imports) is deleted rather than replaced
    by a space, so words joined only by such a character are fused into one
    word. The remaining text is split on whitespace and repeated words are
    dropped, keeping the first occurrence of each word in its original
    position.

    @param str contents: String from a `file.read()` call.
    @returns: `list` of the unique, lower-cased words in first-seen order
        (an empty list when `contents` holds no words).
    """
    # Downcase first so words differing only by capitalisation collapse
    # into a single entry.
    lowered = contents.lower()

    # A single translate() pass deletes every punctuation character.
    translator = str.maketrans('', '', punctuation)
    stripped = lowered.translate(translator)

    # dict keys are unique and keep insertion order (Python 3.7+), so this
    # de-duplicates while preserving the order words were first seen in.
    return list(dict.fromkeys(stripped.split()))
def __get_centroid(self, cluster=None):
    """Build a centroid string from the unique words of a group of logs.

    Each selected entry of ``self.preprocessed_logs`` is split on
    whitespace; the words are de-duplicated, keeping the first occurrence
    of each word in order, and joined back with single spaces.

    @param cluster: Iterable of log ids used to index
        ``self.preprocessed_logs``. When omitted (or falsy), the ids are
        taken by iterating ``self.preprocessed_logs`` itself, i.e. the
        centroid covers the whole log set.
    @returns: `str` with the unique words separated by single spaces; the
        empty string when the selected logs contain no words.
    """
    # NOTE(review): an empty cluster is falsy, so it falls back to the
    # whole log set exactly as the previous implementation did -- confirm
    # that this is intended and not an accidental fallback.
    log_ids = cluster if cluster else self.preprocessed_logs

    # Splitting each log on its own yields the same word sequence as
    # joining all logs with spaces and splitting the result, but avoids
    # repeatedly re-joining an ever-growing string.
    words = (
        word
        for log_id in log_ids
        for word in self.preprocessed_logs[log_id].split()
    )

    # dict keys are unique and keep insertion order (Python 3.7+), so this
    # drops repeated words while preserving first-seen order.
    return ' '.join(dict.fromkeys(words))