def file_size(filename):
    """Gives the size of FILENAME in bytes, using -1 when the path does not exist"""
    # Note: the existence check comes first so that a missing file yields -1
    # rather than an exception from os.path.getsize.
    num_bytes = os.path.getsize(filename) if os.path.exists(filename) else -1
    tpo.debug_format("file_size({f}) => {s}", 5, s=num_bytes, f=filename)
    return num_bytes
def read_lines(filename=None, make_unicode=False):
    """Returns list of lines from FILENAME with the newline terminators removed
    @notes: Uses stdin if FILENAME is None (or empty). Lines are optionally
    converted via tpo.ensure_unicode (MAKE_UNICODE). Other whitespace is kept as is.
    An IOError during the open or read is traced as a warning, in which case
    the lines read so far (possibly none) are returned."""
    # TODO: use enumerate(f); refine exception in except;
    # TODO: force unicode if UTF8 encountered
    lines = []
    f = None
    use_stdin = not filename
    try:
        # Open the file
        if use_stdin:
            tpo.debug_format("Reading from stdin", 4)
            f = sys.stdin
        else:
            f = open(filename)
        # Read line by line
        for line in f:
            line = line.strip("\n")
            if make_unicode:
                line = tpo.ensure_unicode(line)
            lines.append(line)
    except IOError:
        debug_print("Warning: Exception reading file %s: %s" % (filename, str(sys.exc_info())), 2)
    finally:
        # Only close handles opened here: closing sys.stdin would break any
        # later read from standard input by the caller.
        if (f is not None) and (not use_stdin):
            f.close()
    debug_print("read_lines(%s) => %s" % (filename, lines), 6)
    return lines
def create_directory(path):
    """Makes directory PATH via os.mkdir unless something already exists there (with tracing)
    @notes: When PATH exists, a soft assertion checks that it is a directory."""
    if os.path.exists(path):
        # Nothing to create: just sanity check what is already there
        assertion(os.path.isdir(path))
        return
    os.mkdir(path)
    debug_format("os.mkdir({p})", 6, p=path)
def getenv_filename(var, default="", description=None):
    """Looks up filename for environment variable VAR via tpo.getenv_text, passing along DEFAULT and the optional DESCRIPTION.
    @notes: An error is printed on stderr when the resulting name is non-blank but fails the non_empty_file check; the name is returned regardless."""
    debug_format("getenv_filename({v}, {d}, {desc})", 6,
                 desc=description, d=default, v=var)
    filename = tpo.getenv_text(var, default, description)
    # Sanity check is skipped for blank filenames (short-circuit evaluation)
    is_suspect = bool(filename) and (not non_empty_file(filename))
    if is_suspect:
        message = "Error: filename %s empty or missing for environment option %s" % (filename, var)
        tpo.print_stderr(message)
    return filename
Example #5
0
def tokenize(text):
    r"""Tokenize TEXT by splitting at the token delimiters, returning list of non-empty tokens
    @notes: By default, the delimiter is any run of non-word characters (i.e., \W+),
    so tokens consist of word characters (e.g., [A-Za-z0-9_]+). If PRESERVE is set,
    only whitespace delimits tokens (i.e., \s+), so punctuation is kept within them.
    Tokens are lowercased if DOWNCASE is set."""
    # TODO: Allow for tokenization regex to be overwritten
    # Note: The regex is used for splitting, so it must match the delimiters
    # not the tokens: splitting on \S+ would discard every token, leaving
    # just whitespace (which then gets filtered out).
    delimiter_regex = r"\s+" if PRESERVE else r"\W+"
    stripped_pieces = (piece.strip() for piece in re.split(delimiter_regex, text))
    tokens = [t for t in stripped_pieces if t]
    if DOWNCASE:
        tokens = [t.lower() for t in tokens]

    tpo.debug_format("tokenize({txt}) => {t}", 7, txt=text, t=tokens)
    return tokens
def delete_file(filename):
    """Deletes FILENAME, returning True if it was removed and False otherwise
    @notes: A soft assertion checks that the file exists beforehand. An OSError
    from the removal is traced rather than propagated."""
    debug_print("delete_file(%s)" % tpo.normalize_unicode(filename), 5)
    assertion(os.path.exists(filename))
    ok = False
    try:
        # Note: os.remove returns None, so success is indicated by the
        # absence of an exception (not by the return value).
        os.remove(filename)
        ok = True
    except OSError:
        debug_print("Exception during deletion of %s: %s" % (filename, str(sys.exc_info())), 5)
    else:
        debug_format("remove({f}) => {r}", 6, f=filename, r=ok)
    return ok
def get_directory_listing(dirname, make_unicode=False):
    """Lists the entries of directory DIRNAME as given by os.listdir
    @notes: The result is empty if the listing raises OSError (which gets traced).
    With MAKE_UNICODE, each entry is passed through tpo.ensure_unicode."""
    try:
        entries = os.listdir(dirname)
    except OSError:
        entries = []
        tpo.debug_format("Exception during get_directory_listing: {exc}", 4,
                         exc=str(sys.exc_info()))
    if make_unicode:
        entries = list(map(tpo.ensure_unicode, entries))
    tpo.debug_format("get_directory_listing({dir}) => {files}", 5,
                     files=entries, dir=dirname)
    return entries
def resolve_path(filename, base_dir=None):
    """Gives path for FILENAME, which is joined onto BASE_DIR unless it exists as is.
    @notes: If BASE_DIR is not specified, the directory of the calling module's
    script is used (i.e., as if os.path.dirname(__file__) was passed by the caller).
    If that cannot be determined, FILENAME is returned unchanged."""
    resolved = filename
    needs_base = not os.path.exists(filename)
    if needs_base and (not base_dir):
        # Derive the base from the caller's module
        # Note: The frame lookup must stay in this function's body: moving it
        # into a helper would change which frame f_back refers to.
        caller = None
        try:
            caller = inspect.currentframe().f_back
            base_dir = os.path.dirname(caller.f_globals['__file__'])
        except (AttributeError, KeyError):
            base_dir = ""
            debug_print("Exception during resolve_path: " + str(sys.exc_info()), 5)
        finally:
            # Drop the frame reference explicitly
            if caller:
                del caller
    if needs_base:
        resolved = os.path.join(base_dir, filename)
    debug_format("resolve_path({f}) => {p}", 4, p=resolved, f=filename)
    return resolved
def extract_matches(pattern, lines, fields=1):
    """Checks for PATTERN matches in LINES of text, returning list with the matching group text for each line matched
    @notes: With FIELDS of 1, each result is the text for the first group, or
    the entire match if PATTERN has no capturing groups. Otherwise, each result
    is a list with the text for the first FIELDS groups. Lines without a match
    are skipped. An invalid PATTERN or a missing group yields a traced warning
    rather than an exception."""
    # ex: extract_matches(r"^(\S+) \S+", ["John D.", "Jane D.", "Plato"]) => ["John", "Jane"]
    assert isinstance(lines, list)
    matches = []

    # Compile the pattern once up front (rather than once per line)
    regex = None
    try:
        regex = re.compile(pattern)
    except re.error:
        debug_print("Warning: Exception in pattern matching: %s" % str(sys.exc_info()), 2)

    if regex is not None:
        # Note: The group count is taken from the compiled pattern, because
        # checking for "(" in the text is fooled by escaped parentheses and
        # by non-capturing constructs such as (?:...) or (?i).
        single_group = 1 if regex.groups else 0
        for line in lines:
            match = regex.search(line)
            if not match:
                continue
            try:
                if fields == 1:
                    result = match.group(single_group)
                else:
                    result = [match.group(i + 1) for i in range(fields)]
                matches.append(result)
            except IndexError:
                # FIELDS exceeds the number of groups in the pattern
                debug_print("Warning: Exception in pattern matching: %s" % str(sys.exc_info()), 2)
    debug_print("extract_matches(%s, _, [%s]) => %s" % (pattern, fields, matches), 7)
    double_indent = INDENT + INDENT
    debug_format("{ind}input lines: {{\n{res}\n{ind}}}", 8,
                 ind=INDENT, res=indent_lines("\n".join(lines), double_indent))
    return matches
Example #10
0
    def __iter__(self):
        """Returns iterator producing the list of tokens for one line at a time
        @notes: If self.file_name is a directory, each file directly within it
        is processed (in the order given by os.listdir), skipping subdirectories.
        Otherwise, just that file is processed."""
        # Derive the list of filenames to process
        # TODO: support recursive directory descent
        tpo.debug_print("in MySentences.__iter__()", 6)
        if os.path.isdir(self.file_name):
            dir_name = self.file_name
            file_names = [
                os.path.join(dir_name, f) for f in os.listdir(dir_name)
            ]
        else:
            file_names = [self.file_name]

        # Feed each sentence individually from each file
        # TODO: add preprocessing (e.g., tokenize, make lowercase, etc.)
        for file_name in file_names:
            if os.path.isdir(file_name):
                tpo.debug_format("Warning: skipping subdirectory {f}",
                                 tpo.WARNING,
                                 f=file_name)
                continue
            tpo.debug_format("Processing file {f}", tpo.DETAILED, f=file_name)
            # Note: The context manager ensures the file gets closed once it
            # is exhausted, as well as when the generator is closed early.
            with open(file_name) as input_file:
                for line in input_file:
                    ## OLD: tokens = line.split()
                    tokens = tokenize(line)
                    tpo.debug_format("MySentences.__iter__: yielding {t}",
                                     6,
                                     t=tokens)
                    yield tokens
        tpo.debug_print("out MySentences.__iter__()", 6)
        return
 def assertion(condition):
     """Issues warning if CONDITION doesn't hold (i.e., soft assertion: no exception is raised)
     @notes: The warning shows the text of the condition along with the file and
     line number for the invocation, which are taken from the caller's stack frame.
     The location is shown as ??? if the caller's file cannot be determined."""
     # EX: assertion(2 + 2 != 5)
     # TODO: rename as soft_assertion???; add to tpo_common.py (along with run???)
     if condition:
         return
     # Note: imported locally as only needed when an assertion fails
     import linecache

     # Try to get file and line number from stack frame
     # note: not available during interactive use
     filename = None
     line_num = -1
     frame = None
     try:
         frame = inspect.currentframe().f_back
         tpo.debug_trace("frame=%s", frame, level=8)
         tpo.trace_object(frame, 9, "frame")
         filename = frame.f_globals.get("__file__")
         if filename and filename.endswith(".pyc"):
             # Use the source file instead of the compiled version
             filename = filename[:-1]
         line_num = frame.f_lineno
     except AttributeError:
         # No frame available: fall back to the ??? placeholders below,
         # so that a soft assertion never raises an exception itself.
         filename = None
     finally:
         if frame:
             del frame

     # Get text for line and extract the condition from invocation,
     # ignoring comments and function name.
     # Note: The line is read directly rather than via a shell pipeline
     # (e.g., tail and head), which breaks on filenames with quotes and
     # requires those commands to be available.
     # TODO: define function for extracting line, so this can be put in tpo_common.py
     line = "???"
     if filename:
         # note: getline returns the empty string if the line can't be read
         line = linecache.getline(filename, line_num).rstrip("\n") or "???"
     condition = re.sub(r"^\s*\S*assertion\((.*)\)\s*(\#.*)?$",
                        "\\1", line)

     # Print the assertion warning
     line_spec = "???"
     if filename:
         line_spec = "{f}:{l}".format(f=filename, l=line_num)
     debug_format("*** Warning: assertion failed: ({c}) at {ls}",
                  tpo.WARNING, c=condition, ls=line_spec)
     return
def get_matching_files(pattern):
    """Returns the list of paths that PATTERN expands to via glob.glob (with tracing)"""
    matching = glob.glob(pattern)
    tpo.debug_format("get_matching_files({p}) => {l}", 5,
                     l=matching, p=pattern)
    return matching
def form_path(*filenames):
    """Joins the FILENAMES components into a single path via os.path.join (with tracing)"""
    joined = os.path.join(*filenames)
    # note: filenames is already a tuple, so it is traced as is
    debug_format("form_path{f} => {p}", 6, p=joined, f=filenames)
    return joined
Example #14
0
def main():
    """Entry point for script: parses the command line, then either trains a
    Word2Vec model over the input or loads an existing one, optionally saving
    the model, printing its vectors, and showing terms similar to those read
    from standard input (one query per line)."""
    tpo.debug_print("main(): sys.argv=%s" % sys.argv, 4)

    # Parse command-line arguments
    env_options = tpo.formatted_environment_option_descriptions(indent="  ")
    usage_description = tpo.format("""
Creates Google word2vec model (via gensim) of word distributions inferred from 
the occurrences in the input text file. Note: input should be a text file 
(or directory) when creating from scratch or the basename of model file 
if loading existing model.

Notes:
- The input file should have one document per line (multiple sentences allowed).
- The following environment options are available:
  {env}
    """,
                                   env=env_options)
    parser = argparse.ArgumentParser(
        description=usage_description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--save",
                        default=False,
                        action='store_true',
                        help="Save model to disk")
    parser.add_argument("--load",
                        default=False,
                        action='store_true',
                        help="Load model from disk")
    parser.add_argument("--print",
                        default=False,
                        action='store_true',
                        help="Print vectors on standard output")
    parser.add_argument(
        "filename",
        default=None,
        help=
        "Input data filename (or basename when loading previously saved model); if a directory all files within are processed"
    )
    parser.add_argument(
        "--output-basename",
        default=None,
        help=
        "Basename to use for output (by default input file without .txt extension)"
    )
    parser.add_argument(
        "--show-similarity",
        default=False,
        action='store_true',
        help="Show similar terms for those from input (one per line)")
    # TODO: parser.add_argument("--language-model", default=None, help="Language model to use for rating similar terms")
    args = vars(parser.parse_args())
    tpo.debug_print("args = %s" % args, 5)
    filename = args['filename']
    save = args['save']
    load = args['load']
    print_vectors = args['print']
    show_similarity = args['show_similarity']
    output_basename = args['output_basename']
    # TODO: put version of glue_helper's assertion into tpo_common.py already!
    gh.assertion(filename)

    # Derive the basename if not given (checking one of .txt/.list/.prep extensions if training or .word2vec if loading)
    # TODO: rework in terms of stripping whatever file extension is used (e.g., "it.fubar" => "it")
    if not output_basename:
        input_extensions = [".txt", ".list", ".prep"
                            ] if (not load) else [WORD2VEC_MODEL_EXT]
        output_basename = filename
        # Stop at the first extension that actually gets removed
        for extension in input_extensions:
            output_basename = gh.remove_extension(filename, extension)
            if (output_basename != filename):
                break
    tpo.debug_print("output_basename=%s" % output_basename, 5)

    # Enable logging if debugging
    if (tpo.debugging_level()):
        # TODO: use mapping from symbolic LEVEL user option (e.g., via getenv)
        level = logging.INFO if (tpo.debug_level < 4) else logging.DEBUG
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=level)

    # Optionally set random seed
    if RANDOM_SEED != -1:
        # Note: The value for the placeholder is passed explicitly, rather
        # than relying on the module global being resolved by the formatting.
        tpo.debug_format("Setting random seed to {RANDOM_SEED}",
                         RANDOM_SEED=RANDOM_SEED)
        numpy.random.seed(RANDOM_SEED)

    # Process the input file(s), either creating model from scratch or loading existing one
    if load:
        model = Word2Vec.load(filename)
    else:
        sentences = MySentences(filename)
        if tpo.verbose_debugging():
            # TODO: try to develop read-only function that makes copy of iterator
            sentences = list(sentences)
            gh.assertion(len(sentences) > 0)
            tpo.debug_format("sentences={s}", 6, s=sentences)
        # Notes: 1 is default for word2vec (todo, try None)
        seed = 1 if (RANDOM_SEED == -1) else RANDOM_SEED
        model = Word2Vec(sentences, workers=NUM_WORKERS, seed=seed)

        # Optionally save model to disk
        # Note: only applies to newly trained models (not to loaded ones)
        if (save):
            model.save(output_basename + WORD2VEC_MODEL_EXT)

    # Print the vector representations
    # TODO: add option to print word similarity matrix
    # NOTE(review): model.vocab and model[word] look like the older gensim
    # interface -- confirm against the installed gensim version.
    if print_vectors:
        all_words = sorted(model.vocab.keys())
        tpo.debug_format("model={m}", 6, m=model)
        print("Vocabulary terms: %s" % all_words)
        for word in all_words:
            vector = model[word]
            # Note: The values are passed as format arguments, so that braces
            # within a word are not interpreted as format placeholders.
            tpo.debug_format("model[{w}]={v}", 5, w=word, v=vector)
            print("%s\t%s" % (word, vector))

    # Show similarity info for terms from input
    # TODO: add better recovery for terms unknown
    if show_similarity:
        tpo.debug_print("Show similarity for terms from stdin", 4)
        print("term(s): similarity info")
        for line in sys.stdin:
            ## OLD: terms = [t.strip() for t in re.split(r"\W+", line.strip().lower())]
            terms = tokenize(line)
            try:
                # TODO: shows language model score for terms replaced by related terms
                if not terms:
                    pass
                elif len(terms) > 1 or SKIP_INDIVIDUAL:
                    # Treat the terms as a single combined query
                    print(
                        "[%s]: %s" %
                        (", ".join(terms), format_related_terms(model, terms)))
                else:
                    # Single term with individual lookups enabled
                    # note: SKIP_INDIVIDUAL is known to be false at this point
                    for term in terms:
                        print("[%s]: %s" %
                              (term, format_related_terms(model, [term])))
                print("")
            except KeyError:
                # note: a KeyError here is reported on stderr and the loop continues
                tpo.print_stderr("Error: %s" % str(sys.exc_info()))
    return
Example #15
0
 def __init__(self, file_name):
     """Constructor: records FILE_NAME, the text file or directory to be processed"""
     self.file_name = file_name
     tpo.debug_format("MySentences.__init__({f})", 6, f=file_name)