def from_utf8(text):
    """Decode TEXT from UTF-8 into Unicode (only relevant under Python 2)

    Under Python 3, or when TEXT is already a unicode object, the input is
    returned unchanged. Undecodable bytes are dropped ('ignore')."""
    # note: the version check comes first so that the Python 2 only
    # name unicode is never evaluated under Python 3
    needs_decoding = ((sys.version_info.major < 3)
                      and (not isinstance(text, unicode)))
    decoded = text.decode("UTF-8", 'ignore') if needs_decoding else text
    debug.trace_fmtd(8, "from_utf8({t}) => {r}", t=text, r=decoded)
    return decoded
def get_file_size(filename):
    """Returns size of FILENAME in bytes, or -1 if it cannot be determined

    The size is queried directly rather than after a separate existence
    check, so a file removed in between cannot raise an exception."""
    try:
        size = os.path.getsize(filename)
    except OSError:
        # Missing or inaccessible file
        size = -1
    debug.trace_fmtd(5, "get_file_size({f}) => {s}", f=filename, s=size)
    return size
def to_utf8(text):
    """Encode TEXT as UTF-8 (e.g., for I/O); only relevant under Python 2

    Under Python 3, or when TEXT is not a unicode object, the input is
    returned unchanged. Unencodable characters are dropped ('ignore')."""
    # note: the version check comes first so that the Python 2 only
    # name unicode is never evaluated under Python 3
    needs_encoding = ((sys.version_info.major < 3)
                      and isinstance(text, unicode))
    encoded = text.encode("UTF-8", 'ignore') if needs_encoding else text
    debug.trace_fmtd(8, "to_utf8({t}) => {r}", t=text, r=encoded)
    return encoded
def get_module_version(module_name):
    """Get version number for MODULE_NAME (string)

    Returns the module's __version__ attribute, "-1.-1.-1" if the module
    cannot be imported, or "?.?.?" if it has no __version__ attribute."""
    # note: used in bash function (alias):
    #     python-module-version() { python -c "print(get_module_version('$1'))"; }
    import importlib

    # Try to load the module with given name
    # note: eval() only accepts expressions, so eval("import ...") can never
    # succeed; import_module performs the import and returns the module.
    try:
        module = importlib.import_module(module_name)
    except Exception:
        # Importing can run arbitrary module code, so any failure counts
        debug.trace_fmtd(6, "Exception importing module '{m}': {exc}",
                         m=module_name, exc=sys.exc_info())
        return "-1.-1.-1"

    # Try to get the version number for the module
    # TODO: try other conventions besides module.__version__ member variable
    version = "?.?.?"
    try:
        version = module.__version__
    except AttributeError:
        debug.trace_fmtd(6, "Exception evaluating '{m}.__version__': {exc}",
                         m=module_name, exc=sys.exc_info())
        ## TODO: version = "0.0.0"
    return version
def main(args):
    """Command-line entry point: just warns that direct invocation is unintended"""
    debug.trace_fmtd(6, "main({a})", a=args)
    # The warning is addressed to the user named by the USER environment variable
    warning = "Warning, {u}: Not intended for direct invocation".format(
        u=getenv_text("USER"))
    print_stderr(warning)
    return
def __init__(self):
    """Class constructor: sets up the (untrained) categorization pipeline

    The final pipeline stage is an SVC when USE_SVM is set and an
    SGDClassifier when USE_SGD is set (the latter wins if both are set).
    With neither set, no cat_pipeline attribute is created."""
    debug.trace_fmtd(4, "tc.__init__(); self=={s}", s=self)
    self.keys = []
    self.classifier = None
    if USE_SVM:
        svm_steps = [('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(kernel=SVM_KERNEL,
                                 C=SVM_PENALTY,
                                 max_iter=SVM_MAX_ITER,
                                 verbose=SVM_VERBOSE))]
        self.cat_pipeline = Pipeline(svm_steps)
    if USE_SGD:
        ## TODO: use max_iter=SGD_MAX_ITER and tol=SGD_TOLERANCE (instead of n_iter)
        sgd_steps = [('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss=SGD_LOSS,
                                           penalty=SGD_PENALTY,
                                           alpha=SGD_ALPHA,
                                           random_state=SGD_SEED,
                                           n_iter=SGD_MAX_ITER,
                                           verbose=SGD_VERBOSE))]
        self.cat_pipeline = Pipeline(sgd_steps)
    return
def to_string(text):
    """Ensure TEXT is a string type

    String values are returned unchanged; anything else is converted via
    "%s" formatting."""
    # note: types.StringTypes only exists under Python 2
    try:
        string_types = types.StringTypes
    except AttributeError:
        string_types = (str,)
    result = text
    if (not isinstance(result, string_types)):
        # note: wrapped in a tuple so that tuple values are formatted as a
        # whole rather than being taken as the format arguments
        result = "%s" % (text,)
    debug.trace_fmtd(8, "to_string({t}) => {r}", t=text, r=result)
    return result
def print_full_stack(stream=sys.stderr):
    """Prints stack trace (for use in error messages, etc.)

    Writes to STREAM the frames of the call stack (outermost first,
    excluding this function and its caller), then a separator line, then
    the frames reported by inspect.trace() for the exception being handled.
    Problems producing the trace are traced rather than raised."""
    # Notes: Developed originally for Android stack tracing support.
    # Based on http://blog.dscpl.com.au/2015/03/generating-full-stack-traces-for.html.
    # TODO: Update based on author's code update (e.g., ???)
    # TODO: Fix off-by-one error in display of offending statement!
    debug.trace_fmtd(7, "print_full_stack(stream={s})", s=stream)

    def write_frames(frame_records):
        """Write location and any source context for each of FRAME_RECORDS"""
        # Note: Each tuple has the form (frame, filename, line_number, function, context, index)
        for record in frame_records:
            stream.write(' File "{1}", line {2}, in {3}\n'.format(*record))
            # note: the context is None when the source is unavailable
            for line in (record[4] or []):
                stream.write(' ' + line.lstrip())

    stream.write("Traceback (most recent call last):\n")
    try:
        # Show call stack excluding caller
        write_frames(reversed(inspect.stack()[2:]))
        # Show context of the exception from caller to offending line
        stream.write(" ----------\n")
        write_frames(inspect.trace())
    except Exception:
        debug.trace_fmtd(3, "Unable to produce stack trace: {exc}",
                         exc=sys.exc_info())
    stream.write("\n")
    return
def to_float(text, default_value=0):
    """Interpret TEXT as float, using DEFAULT_VALUE if it cannot be converted"""
    result = default_value
    try:
        result = float(text)
    except (TypeError, ValueError):
        debug.trace_fmtd(6, "Exception in to_float: {exc}",
                         exc=sys.exc_info())
    return result
def remove_extension(filename):
    """Return FILENAME without final extension

    Only the last path component is considered, so a dot in a directory
    name is left alone."""
    # EX: remove_extension("document.pdf") => "document"
    # EX: remove_extension("my.dir/document") => "my.dir/document"
    # note: path separators are excluded from the extension so that the
    # match cannot start in a directory name
    new_filename = re.sub(r"\.[^\./\\]*$", "", filename)
    debug.trace_fmtd(4, "remove_extension({f}) => {r}",
                     f=filename, r=new_filename)
    return new_filename
def main(args):
    """Command-line entry point: starts the web controller for the model
    given as sole argument (or shows usage)"""
    debug.trace_fmtd(6, "main({a})", a=args)
    if (len(args) == 2):
        start_web_controller(args[1])
    else:
        system.print_stderr("Usage: {p} model".format(p=args[0]))
    return
def to_bool(value):
    """Converts VALUE to boolean: False iff its string form is "0" or is
    "false" (ignoring case), and True otherwise"""
    # TODO: add "off" as well
    as_text = str(value)
    is_false = (as_text == "0") or (as_text.lower() == "false")
    bool_value = not is_false
    debug.trace_fmtd(7, "to_bool({v}) => {r}", v=value, r=bool_value)
    return bool_value
def train(self, filename):
    """Train classifier using tabular FILENAME with label and text

    The sorted unique labels are kept in self.keys and the pipeline is
    fit against each row's position in that list."""
    debug.trace_fmtd(4, "tc.train({f})", f=filename)
    (labels, values) = read_categorization_data(filename)
    unique_labels = numpy.unique(labels)
    self.keys = sorted(unique_labels)
    # Encode each label by its index among the keys
    label_indices = []
    for label in labels:
        label_indices.append(self.keys.index(label))
    self.classifier = self.cat_pipeline.fit(values, label_indices)
    debug.trace_object(7, self.classifier, "classifier")
    return
def to_int(text, default_value=0):
    """Interpret TEXT as integer, falling back to DEFAULT_VALUE when the
    conversion fails"""
    # TODO: use generic to_num with argument specifying type
    try:
        return int(text)
    except (TypeError, ValueError):
        debug.trace_fmtd(6, "Exception in to_int: {exc}",
                         exc=sys.exc_info())
    return default_value
def load(self, filename):
    """Load classifier (and its label keys) from FILENAME

    A problem unpacking the loaded data is reported on stderr."""
    debug.trace_fmtd(4, "tc.load({f})", f=filename)
    try:
        saved_state = system.load_object(filename)
        # The stored object is expected to be a (keys, classifier) pair
        (self.keys, self.classifier) = saved_state
    except (TypeError, ValueError):
        message = "Problem loading classifier from {f}: {exc}".format(
            f=filename, exc=sys.exc_info())
        system.print_stderr(message)
    return
def stop(self, **kwargs):
    """Stops the web server engine and returns a short farewell message

    The request is refused on the hosts listed below (as identified by the
    HOST_NICKNAME environment variable)."""
    debug.trace_fmtd(5, "wc.stop(s:{s}, kw:{kw})", s=self, kw=kwargs)
    protected_hosts = ["hostwinds", "ec2-micro"]
    host_nickname = os.environ.get("HOST_NICKNAME")
    if host_nickname in protected_hosts:
        return "Call security!"
    cherrypy.engine.stop()
    cherrypy.engine.exit()
    # TODO: use HTML so shutdown shown in title
    return "Adios"
def round_num(value, precision=PRECISION):
    """Round VALUE to PRECISION places (module-level PRECISION by default)"""
    # note: the docstring is a plain literal because a formatted string
    # expression is not stored as the docstring and would be re-evaluated
    # on every call
    rounded_value = round(value, precision)
    debug.trace_fmtd(8, "round_num({v}, {p}) => {r}",
                     v=value, p=precision, r=rounded_value)
    return rounded_value
def intersection(list1, list2):
    """Return the items common to LIST1 and LIST2 (as a set)"""
    # note: thin wrapper around set intersection, provided for tracing
    first_items = set(list1)
    second_items = set(list2)
    common = first_items & second_items
    debug.trace_fmtd(7, "intersection({l1}, {l2}) => {r}",
                     l1=list1, l2=list2, r=common)
    return common
def non_empty_file(filename):
    """Whether FILENAME exists and has a size above zero"""
    try:
        file_size = os.path.getsize(filename)
        non_empty = (file_size > 0)
    except OSError:
        # Missing or inaccessible file counts as empty
        non_empty = False
        debug.trace_fmtd(6, "Exception in non_empty_file: {exc}",
                         exc=sys.exc_info())
    debug.trace_fmtd(5, "non_empty_file({f}) => {r}",
                     f=filename, r=non_empty)
    return non_empty
def getenv_text(var, default="", _description=None):
    """Returns textual value for environment variable VAR, using DEFAULT
    when the variable is unset or empty"""
    env_value = os.getenv(var)
    if env_value:
        text_value = env_value
    else:
        debug.trace_fmtd(6, "getenv_text: no value for var {v}", v=var)
        text_value = default
    debug.trace_fmtd(5, "getenv_text('{v}', [{d}]) => {r}",
                     v=var, d=default, r=text_value)
    return text_value
def load_object(file_name, ignore_error=False):
    """Loads object from FILE_NAME in pickle format

    Returns the unpickled object, or None if the file cannot be read or
    decoded; the failure is reported on stderr unless IGNORE_ERROR."""
    # Note: The file is opened in binary mode because pickle data is binary
    # (text mode fails under Python 3).
    # Warning: unpickling can execute arbitrary code, so FILE_NAME should
    # only refer to trusted data.
    obj = None
    try:
        with open(file_name, 'rb') as f:
            obj = pickle.load(f)
    except (IOError, ValueError, EOFError, pickle.UnpicklingError):
        # note: EOFError and UnpicklingError cover truncated or corrupt files
        if (not ignore_error):
            print_stderr("Error: Unable to load object from {f}: {exc}".format(
                f=file_name, exc=sys.exc_info()))
    debug.trace_fmtd(7, "load_object({f}) => {o}", f=file_name, o=obj)
    return obj
def read_entire_file(filename):
    """Return the full contents of FILENAME as a string (empty if the file
    cannot be read)"""
    contents = ""
    try:
        with open(filename) as input_file:
            raw_contents = input_file.read()
            contents = from_utf8(raw_contents)
    except IOError:
        debug.trace_fmtd(1, "Error: Unable to read file '{f}': {exc}",
                         f=filename, exc=sys.exc_info())
    debug.trace_fmtd(7, "read_entire_file({f}) => {r}",
                     f=filename, r=contents)
    return contents
def getenv_number(var, default=-1.0, _description=None):
    """Returns floating-point value of environment variable VAR, or DEFAULT
    when the variable has no value"""
    # Note: use getenv_int or getenv_float for typed variants
    value_text = getenv_text(var)
    if (len(value_text) == 0):
        num_value = default
    else:
        num_value = float(value_text)
    debug.trace_fmtd(6, "getenv_number({v}, {d}) => {r}",
                     v=var, d=default, r=num_value)
    return num_value
def getenv_bool(var, default=False, _description=None):
    """Returns boolean flag for environment variable VAR, or DEFAULT when
    the variable has no value"""
    # Note: the text is interpreted via to_bool (e.g., "0" or "False" yields False).
    value_text = getenv_text(var)
    if (len(value_text) == 0):
        bool_value = default
    else:
        bool_value = to_bool(value_text)
    debug.trace_fmtd(6, "getenv_bool({v}, {d}) => {r}",
                     v=var, d=default, r=bool_value)
    return bool_value
def difference(list1, list2):
    """Return the items of LIST1 that are not in LIST2, in original order"""
    # TODO: optimize (e.g., via a hash table)
    # EX: difference([5, 4, 3, 2, 1], [1, 2, 3]) => [5, 4]
    remaining = [item for item in list1 if item not in list2]
    debug.trace_fmtd(7, "difference({l1}, {l2}) => {d}",
                     l1=list1, l2=list2, d=remaining)
    return remaining
def save_object(file_name, obj):
    """Pickle OBJ into FILE_NAME, tracing (rather than raising) I/O and
    type/value errors"""
    # Note: The data file is created in binary mode to avoid quirk under Windows.
    # See https://stackoverflow.com/questions/556269/importerror-no-module-named-copy-reg-pickle.
    debug.trace_fmtd(6, "save_object({f}, _)", f=file_name)
    try:
        with open(file_name, 'wb') as pickle_file:
            pickle.dump(obj, pickle_file)
    except (IOError, TypeError, ValueError):
        failure_info = sys.exc_info()
        debug.trace_fmtd(1, "Error: Unable to save object to {f}: {exc}",
                         f=file_name, exc=failure_info)
    return
def test(self, filename, report=False, stream=sys.stdout):
    """Test classifier over tabular data from FILENAME with label and text, returning accuracy.

    Rows whose label is not among the trained keys are ignored. Optionally,
    a detailed performance REPORT is output to STREAM. When OUTPUT_BAD is
    set, the misclassified rows are written to FILENAME.bad. The accuracy
    is 0.0 when no row has a known label."""
    debug.trace_fmtd(4, "tc.test({f})", f=filename)
    (all_labels, all_values) = read_categorization_data(filename)

    # Map each key to its (first) position, avoiding a linear scan per row
    key_positions = {}
    for (pos, key) in enumerate(self.keys):
        key_positions.setdefault(key, pos)

    # Keep just the rows with a label seen during training
    actual_indices = []
    values = []
    for (i, label) in enumerate(all_labels):
        if label in key_positions:
            values.append(all_values[i])
            actual_indices.append(key_positions[label])
        else:
            debug.trace_fmtd(4, "Ignoring test label {l} not in training data (line {n})",
                             l=label, n=(i + 1))

    # Classify and score
    # note: prediction and division are skipped when no rows remain
    predicted_indices = self.classifier.predict(values) if values else []
    ## TODO: predicted_labels = [self.keys[i] for i in predicted_indices]
    index_pairs = list(zip(actual_indices, predicted_indices))
    num_ok = sum(1 for (actual, predicted) in index_pairs
                 if (actual == predicted))
    accuracy = (float(num_ok) / len(values)) if values else 0.0

    # Output optional report
    if report:
        if VERBOSE:
            stream.write("\n")
            stream.write("Actual\tPredict\n")
            for (actual, predicted) in index_pairs:
                stream.write("{act}\t{pred}\n".
                             format(act=self.keys[actual],
                                    pred=self.keys[predicted]))
            stream.write("\n")
        sklearn_report(actual_indices, predicted_indices, self.keys, stream)

    # Output misclassified instances
    # NOTE(review): assumed to be independent of REPORT -- confirm
    if OUTPUT_BAD:
        bad_rows = ["Actual\tBad\tText\n"]
        for (i, (actual, predicted)) in enumerate(index_pairs):
            if (actual != predicted):
                text = values[i]
                context = (text[:CONTEXT_LEN] + "...\n") if (len(text) > CONTEXT_LEN) else text
                # note: each row must end in a newline, else rows for short
                # texts run together on one line
                if not context.endswith("\n"):
                    context += "\n"
                bad_rows.append(u"{g}\t{b}\t{t}".format(
                    g=self.keys[actual], b=self.keys[predicted], t=context))
        system.write_file(filename + ".bad", u"".join(bad_rows))
    return accuracy
def write_file(filename, text):
    """Create FILENAME with TEXT, adding a final newline if TEXT lacks one

    Errors during the write are traced rather than raised."""
    with open(filename, "w") as output_file:
        try:
            output_file.write(to_utf8(text))
            has_final_newline = text.endswith("\n")
            if not has_final_newline:
                output_file.write("\n")
        except (IOError, ValueError):
            failure_info = sys.exc_info()
            debug.trace_fmtd(1, "Error: Problem writing file '{f}': {exc}",
                             f=filename, exc=failure_info)
    return
def quote_url_text(text):
    """Quote TEXT to make suitable for use in URL.
    Note: The input is returned as is if it already contains an encoded
    character (i.e., %HH where H is an uppercase hex digit)."""
    # Note: This is a wrapper around quote_plus and thus escapes slashes,
    # along with spaces and other special characters (";?:@&=+$,\"'").
    # EX: quote_url_text("<2/") => "%3C2%2F"
    # EX: quote_url_text("Joe's hat") => "Joe%27s+hat"
    # EX: quote_url_text("Joe%27s+hat") => "Joe%27s+hat"
    debug.trace_fmtd(7, "in quote_url_text({t})", t=text)
    already_quoted = re.search("%[0-9A-F]{2}", text)
    if already_quoted:
        quoted = text
    elif sys.version_info.major > 2:
        quoted = urllib.parse.quote_plus(text)
    else:
        # Python 2 version works on UTF-8 encoded text
        quoted = urllib.quote_plus(to_utf8(text))
    debug.trace_fmtd(6, "out quote_url_text({t}) => {r}", t=text, r=quoted)
    return quoted
def __init__(self, model_filename, *args, **kwargs):
    """Class constructor: loads the text categorizer from MODEL_FILENAME
    and sets up the category-to-image table"""
    debug.trace_fmtd(5, "web_controller.__init__(s:{s}, a:{a}, kw:{k})__",
                     s=self, a=args, k=kwargs)
    self.text_cat = TextCategorizer()
    self.text_cat.load(model_filename)
    # Categories without an image of their own map to a placeholder
    # HACK: wikipedia categorization specific
    category_images = defaultdict(lambda: "/static/unknown-with-question-marks.png")
    category_images.update(CATEGORY_IMAGE_HASH)
    self.category_image = category_images
    # Note: To avoid cross-origin type errors, Access-Control-Allow-Origin
    # is meant to be left open. See following:
    # - http://cleanbugs.com/item/how-to-get-cross-origin-sharing-cors-post-request-working-a-resource-413656.html
    # - https://stackoverflow.com/questions/6054473/python-cherrypy-how-to-add-header
    # TODO: put cherrypy config in start_web_controller (or put its configuration here)
    ## BAD: cherrypy.response.headers["Access-Control-Allow-Origin"] = "*"
    return