Ejemplo n.º 1
0
def from_utf8(text):
    """Decode TEXT from UTF-8 into Unicode (only relevant under Python 2)

    Under Python 3, or when TEXT is already a unicode object, the input is
    returned as is. Undecodable bytes are dropped ('ignore' error handler)."""
    running_python2 = (sys.version_info.major < 3)
    # note: unicode is only defined under Python 2, so the version check comes first
    needs_decoding = running_python2 and (not isinstance(text, unicode))
    decoded = text.decode("UTF-8", 'ignore') if needs_decoding else text
    debug.trace_fmtd(8, "from_utf8({t}) => {r}", t=text, r=decoded)
    return decoded
Ejemplo n.º 2
0
def get_file_size(filename):
    """Returns size of FILENAME (in bytes) or -1 if not found

    Note: The size is queried directly instead of checking for existence
    first, so a file that disappears in between yields -1 rather than an
    unhandled OSError."""
    try:
        size = os.path.getsize(filename)
    except (OSError, ValueError):
        # note: ValueError covers malformed paths (e.g., embedded null bytes),
        # which os.path.exists treats as not existing
        size = -1
    debug.trace_fmtd(5, "get_file_size({f}) => {s}", f=filename, s=size)
    return size
Ejemplo n.º 3
0
def to_utf8(text):
    """Encode TEXT as UTF-8 (e.g., for I/O); only relevant under Python 2

    Under Python 3, or when TEXT is not a unicode object, the input is
    returned as is. Unencodable characters are dropped ('ignore' handler)."""
    running_python2 = (sys.version_info.major < 3)
    # note: unicode is only defined under Python 2, so the version check comes first
    needs_encoding = running_python2 and isinstance(text, unicode)
    encoded = text.encode("UTF-8", 'ignore') if needs_encoding else text
    debug.trace_fmtd(8, "to_utf8({t}) => {r}", t=text, r=encoded)
    return encoded
Ejemplo n.º 4
0
def get_module_version(module_name):
    """Get version number for MODULE_NAME (string)

    Returns the module's __version__ attribute, "-1.-1.-1" if the module
    cannot be imported, or "?.?.?" if it has no __version__ attribute."""
    # note: used by a bash helper function that runs the following:
    #     python -c "print(get_module_version('$1'))"
    import importlib

    # Try to load the module with given name
    # note: eval() only accepts expressions, so the former eval("import ...")
    # always failed with a SyntaxError; importlib also handles dotted names.
    try:
        module = importlib.import_module(module_name)
    except Exception:
        # note: kept broad because arbitrary module code runs during import
        debug.trace_fmtd(6,
                         "Exception importing module '{m}': {exc}",
                         m=module_name,
                         exc=sys.exc_info())
        return "-1.-1.-1"

    # Try to get the version number for the module
    # TODO: try other conventions besides module.__version__ member variable
    version = "?.?.?"
    try:
        version = module.__version__
    except AttributeError:
        debug.trace_fmtd(6,
                         "Exception evaluating '{m}.__version__': {exc}",
                         m=module_name,
                         exc=sys.exc_info())
        ## TODO: version = "0.0.0"
    return version
Ejemplo n.º 5
0
def main(args):
    """Command-line entry point: just warns that the module is not a script"""
    debug.trace_fmtd(6, "main({a})", a=args)
    warning = "Warning, {u}: Not intended for direct invocation".format(
        u=getenv_text("USER"))
    print_stderr(warning)
Ejemplo n.º 6
0
    def __init__(self):
        """Class constructor: starts with no label keys or trained classifier

        The categorization pipeline (CountVectorizer => TfidfTransformer =>
        classifier) is based on SVC if USE_SVM is set and on SGDClassifier if
        USE_SGD is set. Note: if both are set, the SGD pipeline replaces the
        SVM one; if neither is set, no pipeline is created."""
        debug.trace_fmtd(4, "tc.__init__(); self=={s}", s=self)
        self.keys = []
        self.classifier = None
        if USE_SVM:
            svm_classifier = SVC(kernel=SVM_KERNEL,
                                 C=SVM_PENALTY,
                                 max_iter=SVM_MAX_ITER,
                                 verbose=SVM_VERBOSE)
            svm_steps = [('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', svm_classifier)]
            self.cat_pipeline = Pipeline(svm_steps)
        if USE_SGD:
            ## TODO: use max_iter=SGD_MAX_ITER (and tol=SGD_TOLERANCE) in place of n_iter
            sgd_classifier = SGDClassifier(loss=SGD_LOSS,
                                           penalty=SGD_PENALTY,
                                           alpha=SGD_ALPHA,
                                           random_state=SGD_SEED,
                                           n_iter=SGD_MAX_ITER,
                                           verbose=SGD_VERBOSE)
            sgd_steps = [('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', sgd_classifier)]
            self.cat_pipeline = Pipeline(sgd_steps)
Ejemplo n.º 7
0
def to_string(text):
    """Ensure TEXT is a string type

    Strings are returned as is; any other value is converted via %s formatting."""
    # note: types.StringTypes only exists under Python 2, so fall back to str
    string_types = getattr(types, "StringTypes", (str,))
    result = text
    if (not isinstance(result, string_types)):
        # note: wrapped in a tuple so that a tuple argument is formatted as a
        # whole rather than being interpreted as the list of format arguments
        result = "%s" % (text,)
    debug.trace_fmtd(8, "to_string({t}) => {r}", t=text, r=result)
    return result
Ejemplo n.º 8
0
def print_full_stack(stream=sys.stderr):
    """Prints stack trace (for use in error messages, etc.)

    Writes a traceback-style listing to STREAM: first the entries from
    inspect.stack() (outermost first), then a separator line, and then the
    entries from inspect.trace(). Any exception raised while producing the
    listing is traced via debug (level 3) instead of being propagated.
    Note: the default STREAM is bound to sys.stderr when the function is defined."""
    # Notes: Developed originally for Android stack tracing support.
    # Based on http://blog.dscpl.com.au/2015/03/generating-full-stack-traces-for.html.
    # TODO: Update based on author's code update (e.g., ???)
    # TODO: Fix off-by-one error in display of offending statement!
    debug.trace_fmtd(7, "print_full_stack(stream={s})", s=stream)
    stream.write("Traceback (most recent call last):\n")
    try:
        # Note: Each tuple has the form (frame, filename, line_number, function, context, index)
        # Note: item is preset because it is used after the loop below; if that
        # loop has no entries, item[4] raises TypeError, which is caught below.
        item = None
        # Show call stack excluding caller
        # note: [2:] skips the first two stack entries (presumably this
        # function and its caller); reversed() lists the remaining ones outermost first
        for item in reversed(inspect.stack()[2:]):
            stream.write('  File "{1}", line {2}, in {3}\n'.format(*item))
        # note: this runs after the loop, so only the source context (item[4])
        # of the last entry listed above is shown
        for line in item[4]:
            stream.write('  ' + line.lstrip())
        # Show context of the exception from caller to offending line
        stream.write("  ----------\n")
        for item in inspect.trace():
            stream.write('  File "{1}", line {2}, in {3}\n'.format(*item))
        # note: likewise, only the context of the last trace entry is shown
        for line in item[4]:
            stream.write('  ' + line.lstrip())
    except:
        # note: deliberately broad so that stack printing never raises itself
        debug.trace_fmtd(3,
                         "Unable to produce stack trace: {exc}",
                         exc=sys.exc_info())
    stream.write("\n")
    return
Ejemplo n.º 9
0
def to_float(text, default_value=0):
    """Interpret TEXT as float, using DEFAULT_VALUE if the conversion fails"""
    result = default_value
    try:
        result = float(text)
    except (TypeError, ValueError):
        # note: the message formerly said to_int (copy/paste error)
        debug.trace_fmtd(6, "Exception in to_float: {exc}", exc=sys.exc_info())
    return result
Ejemplo n.º 10
0
def remove_extension(filename):
    """Strip the final extension (last period onward) from FILENAME"""
    # ex: remove_extension("document.pdf") => "document"
    extension_pattern = re.compile(r"\.[^\.]*$")
    base_name = extension_pattern.sub("", filename)
    debug.trace_fmtd(4,
                     "remove_extension({f}) => {r}",
                     f=filename,
                     r=base_name)
    return base_name
Ejemplo n.º 11
0
def main(args):
    """Command-line entry point: starts the web controller for the model given as sole argument

    ARGS is the full argument list including the program name; a usage
    message is printed to stderr unless exactly one model is given."""
    debug.trace_fmtd(6, "main({a})", a=args)
    if (len(args) == 2):
        start_web_controller(args[1])
    else:
        system.print_stderr("Usage: {p} model".format(p=args[0]))
Ejemplo n.º 12
0
def to_bool(value):
    """Converts VALUE to boolean: False iff its string form is "0" or "False" (ignoring case); otherwise True."""
    # TODO: add "off" as well
    text = str(value)
    is_false = (text == "0") or (text.lower() == "false")
    result = (not is_false)
    debug.trace_fmtd(7, "to_bool({v}) => {r}", v=value, r=result)
    return result
Ejemplo n.º 13
0
 def train(self, filename):
     """Train classifier using tabular FILENAME with label and text

     The sorted unique labels become self.keys, and the pipeline is fit
     against the position of each instance's label within those keys."""
     debug.trace_fmtd(4, "tc.train({f})", f=filename)
     (labels, values) = read_categorization_data(filename)
     unique_labels = numpy.unique(labels)
     self.keys = sorted(unique_labels)
     # Map each label to its position in the sorted key list
     label_indices = list(map(self.keys.index, labels))
     self.classifier = self.cat_pipeline.fit(values, label_indices)
     debug.trace_object(7, self.classifier, "classifier")
Ejemplo n.º 14
0
def to_int(text, default_value=0):
    """Convert TEXT to an integer, falling back to DEFAULT_VALUE if not possible"""
    # TODO: use generic to_num with argument specifying type
    try:
        return int(text)
    except (TypeError, ValueError):
        debug.trace_fmtd(6, "Exception in to_int: {exc}", exc=sys.exc_info())
    return default_value
Ejemplo n.º 15
0
 def load(self, filename):
     """Load classifier (along with its label keys) from FILENAME

     A problem unpacking the loaded data is reported on stderr rather than raised."""
     debug.trace_fmtd(4, "tc.load({f})", f=filename)
     try:
         loaded = system.load_object(filename)
         (self.keys, self.classifier) = loaded
     except (TypeError, ValueError):
         message = "Problem loading classifier from {f}: {exc}".format(
             f=filename, exc=sys.exc_info())
         system.print_stderr(message)
Ejemplo n.º 16
0
 def stop(self, **kwargs):
     """Stops the web search server and saves cached data to disk

     Note: the request is refused when HOST_NICKNAME names one of the protected hosts."""
     debug.trace_fmtd(5, "wc.stop(s:{s}, kw:{kw})", s=self, kw=kwargs)
     protected_hosts = ["hostwinds", "ec2-micro"]
     host_nickname = os.environ.get("HOST_NICKNAME")
     if host_nickname not in protected_hosts:
         cherrypy.engine.stop()
         cherrypy.engine.exit()
         # TODO: use HTML so shutdown shown in title
         response = "Adios"
     else:
         response = "Call security!"
     return response
Ejemplo n.º 17
0
def round_num(value, precision=PRECISION):
    """Round VALUE [to PRECISION places, defaulting to the module-level PRECISION constant]"""
    # note: the docstring must be a plain string literal: the former
    # """...""".format(...) expression left __doc__ empty and was re-evaluated
    # (and discarded) on every call.
    rounded_value = round(value, precision)
    debug.trace_fmtd(8,
                     "round_num({v}, {p}) => {r}",
                     v=value,
                     p=precision,
                     r=rounded_value)
    return rounded_value
Ejemplo n.º 18
0
def intersection(list1, list2):
    """Return the set of items common to LIST1 and LIST2"""
    # note: thin wrapper around set intersection, used for tracing
    common = set(list1) & set(list2)
    debug.trace_fmtd(7,
                     "intersection({l1}, {l2}) => {r}",
                     l1=list1,
                     l2=list2,
                     r=common)
    return common
Ejemplo n.º 19
0
def non_empty_file(filename):
    """Whether FILENAME exists and has a size greater than zero"""
    try:
        file_size = os.path.getsize(filename)
        has_content = (file_size > 0)
    except OSError:
        debug.trace_fmtd(6,
                         "Exception in non_empty_file: {exc}",
                         exc=sys.exc_info())
        has_content = False
    debug.trace_fmtd(5, "non_empty_file({f}) => {r}", f=filename, r=has_content)
    return has_content
Ejemplo n.º 20
0
def getenv_text(var, default="", _description=None):
    """Returns text of environment variable VAR, or DEFAULT if unset or empty

    Note: _DESCRIPTION is not used by this function."""
    env_value = os.getenv(var)
    if env_value:
        result = env_value
    else:
        debug.trace_fmtd(6, "getenv_text: no value for var {v}", v=var)
        result = default
    debug.trace_fmtd(5,
                     "getenv_text('{v}',  [{d}]) => {r}",
                     v=var,
                     d=default,
                     r=result)
    return result
Ejemplo n.º 21
0
def load_object(file_name, ignore_error=False):
    """Loads object from FILE_NAME in pickle format

    Returns the unpickled object, or None if the file could not be read, in
    which case an error is printed on stderr unless IGNORE_ERROR is set."""
    # Note: The file is opened in binary mode, as pickle data is binary (text
    # mode fails under Python 3 and can corrupt the data under Windows).
    # Warning: Unpickling can execute arbitrary code, so only load trusted files.
    obj = None
    try:
        with open(file_name, 'rb') as f:
            obj = pickle.load(f)
    except (IOError, ValueError):
        if (not ignore_error):
            print_stderr("Error: Unable to load object from {f}: {exc}".format(
                f=file_name, exc=sys.exc_info()))
    debug.trace_fmtd(7, "load_object({f}) => {o}", f=file_name, o=obj)
    return obj
Ejemplo n.º 22
0
def read_entire_file(filename):
    """Return the full contents of FILENAME as a string (empty if unreadable)"""
    contents = ""
    try:
        with open(filename) as in_file:
            raw_text = in_file.read()
            contents = from_utf8(raw_text)
    except IOError:
        debug.trace_fmtd(1,
                         "Error: Unable to read file '{f}': {exc}",
                         f=filename,
                         exc=sys.exc_info())
    debug.trace_fmtd(7, "read_entire_file({f}) => {r}", f=filename, r=contents)
    return contents
Ejemplo n.º 23
0
def getenv_number(var, default=-1.0, _description=None):
    """Returns environment variable VAR as a float, or DEFAULT if it has no value

    Note: _DESCRIPTION is not used by this function."""
    # Note: use getenv_int or getenv_float for typed variants
    env_text = getenv_text(var)
    has_value = (len(env_text) > 0)
    number = float(env_text) if has_value else default
    debug.trace_fmtd(6,
                     "getenv_number({v}, {d}) => {r}",
                     v=var,
                     d=default,
                     r=number)
    return number
Ejemplo n.º 24
0
def getenv_bool(var, default=False, _description=None):
    """Returns environment variable VAR as a boolean flag, or DEFAULT if it has no value

    Note: _DESCRIPTION is not used by this function."""
    # Note: The text is interpreted by to_bool ("0" or "False" is False, and any other value True).
    env_text = getenv_text(var)
    has_value = (len(env_text) > 0)
    flag = to_bool(env_text) if has_value else default
    debug.trace_fmtd(6,
                     "getenv_bool({v}, {d}) => {r}",
                     v=var,
                     d=default,
                     r=flag)
    return flag
Ejemplo n.º 25
0
def difference(list1, list2):
    """Return the items of LIST1 that are not in LIST2, keeping the order of LIST1"""
    # TODO: optimize (e.g., via a hash table)
    # EX: difference([5, 4, 3, 2, 1], [1, 2, 3]) => [5, 4]
    remaining = [item for item in list1 if item not in list2]
    debug.trace_fmtd(7,
                     "difference({l1}, {l2}) => {d}",
                     l1=list1,
                     l2=list2,
                     d=remaining)
    return remaining
Ejemplo n.º 26
0
def save_object(file_name, obj):
    """Pickle OBJ into FILE_NAME; failures are traced rather than raised"""
    # Note: The data file is created in binary mode to avoid quirk under Windows.
    # See https://stackoverflow.com/questions/556269/importerror-no-module-named-copy-reg-pickle.
    debug.trace_fmtd(6, "save_object({f}, _)", f=file_name)
    try:
        with open(file_name, 'wb') as pickle_file:
            pickle.dump(obj, pickle_file)
    except (IOError, TypeError, ValueError):
        error_info = sys.exc_info()
        debug.trace_fmtd(1,
                         "Error: Unable to save object to {f}: {exc}",
                         f=file_name,
                         exc=error_info)
Ejemplo n.º 27
0
    def test(self, filename, report=False, stream=sys.stdout):
        """Test classifier over tabular data from FILENAME with label and text, returning accuracy.
        Optionally, a detailed performance REPORT is output to STREAM.

        Notes:
        - Instances whose label is not in self.keys are ignored (and traced).
        - Returns 0.0 if no usable test instances remain, instead of failing
          on empty input (e.g., division by zero).
        - If OUTPUT_BAD is set, misclassified instances are written to FILENAME.bad."""
        debug.trace_fmtd(4, "tc.test({f})", f=filename)
        (all_labels, all_values) = read_categorization_data(filename)

        # Keep just the instances with labels known to the classifier
        # TODO: use hash of positions
        actual_indices = []
        values = []
        for (i, label) in enumerate(all_labels):
            if label in self.keys:
                values.append(all_values[i])
                actual_indices.append(self.keys.index(label))
            else:
                debug.trace_fmtd(4, "Ignoring test label {l} not in training data (line {n})",
                                 l=label, n=(i + 1))
        if not values:
            debug.trace_fmtd(2, "Warning: no usable test instances in {f}", f=filename)
            return 0.0

        # Classify the instances and determine overall accuracy
        predicted_indices = self.classifier.predict(values)
        ## TODO: predicted_labels = [self.keys[i] for i in predicted_indices]
        num_ok = sum(1 for (actual, predicted) in zip(actual_indices, predicted_indices)
                     if (actual == predicted))
        accuracy = float(num_ok) / len(values)

        # Output optional report, preceded by the individual predictions if verbose
        if report:
            if VERBOSE:
                stream.write("\n")
                stream.write("Actual\tPredict\n")
                for (actual, predicted) in zip(actual_indices, predicted_indices):
                    stream.write("{act}\t{pred}\n".
                                 format(act=self.keys[actual],
                                        pred=self.keys[predicted]))
                stream.write("\n")
            sklearn_report(actual_indices, predicted_indices, self.keys, stream)

        # Optionally save the misclassified instances along with (truncated) text
        if OUTPUT_BAD:
            bad_instances = ["Actual\tBad\tText\n"]
            for (actual, predicted, text) in zip(actual_indices, predicted_indices, values):
                if (actual != predicted):
                    context = (text[:CONTEXT_LEN] + "...\n") if (len(text) > CONTEXT_LEN) else text
                    bad_instances.append(u"{g}\t{b}\t{t}".format(
                        g=self.keys[actual],
                        b=self.keys[predicted],
                        t=context))
            system.write_file(filename + ".bad", "".join(bad_instances))
        return accuracy
Ejemplo n.º 28
0
def write_file(filename, text):
    """Create FILENAME with TEXT, adding a final newline if TEXT lacks one

    Note: problems during writing are traced rather than raised."""
    with open(filename, "w") as out_file:
        try:
            out_file.write(to_utf8(text))
            missing_newline = (not text.endswith("\n"))
            if missing_newline:
                out_file.write("\n")
        except (IOError, ValueError):
            debug.trace_fmtd(1,
                             "Error: Problem writing file '{f}': {exc}",
                             f=filename,
                             exc=sys.exc_info())
Ejemplo n.º 29
0
def quote_url_text(text):
    """Quote TEXT to make it suitable for use in a URL. Note: This returns the input as is if the text already has encoded characters (i.e., %HH where H is an uppercase hex digit)."""
    # Note: This is a wrapper around quote_plus and thus escapes slashes, along with spaces and other special characters (";?:@&=+$,\"'").
    # EX: quote_url_text("<2/") => "%3C2%2F"
    # EX: quote_url_text("Joe's hat") => "Joe%27s+hat"
    # EX: quote_url_text("Joe%27s+hat") => "Joe%27s+hat"
    debug.trace_fmtd(7, "in quote_url_text({t})", t=text)
    already_encoded = re.search("%[0-9A-F]{2}", text)
    if already_encoded:
        quoted = text
    elif sys.version_info.major > 2:
        quoted = urllib.parse.quote_plus(text)
    else:
        # note: Python 2 version of quote_plus, applied to the UTF-8 encoded text
        quoted = urllib.quote_plus(to_utf8(text))
    debug.trace_fmtd(6, "out quote_url_text({t}) => {r}", t=text, r=quoted)
    return quoted
Ejemplo n.º 30
0
 def __init__(self, model_filename, *args, **kwargs):
     """Class constructor: loads the text categorizer from MODEL_FILENAME and sets up the category images"""
     debug.trace_fmtd(5, "web_controller.__init__(s:{s}, a:{a}, kw:{k})__",
                      s=self, a=args, k=kwargs)
     self.text_cat = TextCategorizer()
     self.text_cat.load(model_filename)
     # Categories without an image of their own get the placeholder image
     fallback_image = "/static/unknown-with-question-marks.png"
     self.category_image = defaultdict(lambda: fallback_image)
     # HACK: wikipedia categorization specific
     self.category_image.update(CATEGORY_IMAGE_HASH)
     # Note: To avoid cross-origin type errors, Access-Control-Allow-Origin
     # is made open. See following:
     # - http://cleanbugs.com/item/how-to-get-cross-origin-sharing-cors-post-request-working-a-resource-413656.html
     # - https://stackoverflow.com/questions/6054473/python-cherrypy-how-to-add-header
     # TODO: put cherrypy config in start_web_controller (or put its configuration here)
     ## BAD: cherrypy.response.headers["Access-Control-Allow-Origin"] = "*"