m = self.RE_LINE.search(input) if not m: return [] contents = m.group(5) tokens = [] for elem in self.analyzer(contents): if hasattr(elem, "pos"): tokens.append((elem.text, elem.pos)) else: tokens.append((elem.text, None)) unique_tokens = list(set([text for (text, pos) in tokens])) return sorted(unique_tokens) def __repr__(self): return "LogDatum: %s" % (self.get_dict_representation(), ) def __str__(self): return "%s" % (self.get_dict_representation(), ) if __name__ == "__main__": logger.debug("starting") fields_to_index = ["datetime", "keywords", "failure_type", "failure_id", "contents_hash"] try: base_parser.main(APP_NAME, NgmgMsMessagesParserLogDatum, fields_to_index) except KeyboardInterrupt: logger.debug("CTRL-C") finally: logger.debug("exiting")
I'm going to cheat and use Whoosh.""" m = self.RE_LINE.search(input) if not m: return [] contents = m.group(7) tokens = [] for elem in self.analyzer(contents): if hasattr(elem, "pos"): tokens.append((elem.text, elem.pos)) else: tokens.append((elem.text, None)) unique_tokens = list(set([text for (text, pos) in tokens])) return sorted(unique_tokens) def __repr__(self): return "LogDatum: %s" % (self.get_dict_representation(), ) def __str__(self): return "%s" % (self.get_dict_representation(), ) if __name__ == "__main__": logger.debug("starting") try: fields_to_index = ["datetime", "keywords", "error_level", "error_id", "contents_hash"] base_parser.main(APP_NAME, NgmgEpParserLogDatum, fields_to_index) except KeyboardInterrupt: logger.debug("CTRL-C") finally: logger.debug("exiting")
"""Given a blob of input prepare a list of strings that is suitable for full-text indexing by MongoDB. I'm going to cheat and use Whoosh.""" tokens = [] for elem in self.analyzer(input): if hasattr(elem, "pos"): tokens.append((elem.text, elem.pos)) else: tokens.append((elem.text, None)) unique_tokens = list(set([text for (text, pos) in tokens])) return sorted(unique_tokens) def __repr__(self): return "LogDatum: %s" % (self.get_dict_representation(), ) def __str__(self): return "%s" % (self.get_dict_representation(), ) if __name__ == "__main__": logger.debug("starting") try: fields_to_index = ["datetime", "keywords", "contents_hash", "source", "event_type", "component_path", "sensor_num", "sensor_type"] base_parser.main(APP_NAME, NgmgShmHpilistParserLogDatum, fields_to_index) except KeyboardInterrupt: logger.debug("CTRL-C") finally: logger.debug("exiting")