def file_size(filename):
    """Reports the byte count for FILENAME, using -1 when the path does not exist"""
    found = os.path.exists(filename)
    size = os.path.getsize(filename) if found else -1
    tpo.debug_format("file_size({f}) => {s}", 5, f=filename, s=size)
    return size
def read_lines(filename=None, make_unicode=False):
    """Returns list of lines from FILENAME with newlines stripped from both ends.

    @notes: Uses stdin if FILENAME is None (or empty). If MAKE_UNICODE, each
    line is passed through tpo.ensure_unicode. Other whitespace is preserved.
    On an IOError a warning is traced and the lines read so far are returned.
    """
    # TODO: refine exception in except
    # TODO: force unicode if UTF8 encountered
    lines = []
    use_stdin = not filename
    f = None
    try:
        # Open the file
        if use_stdin:
            tpo.debug_format("Reading from stdin", 4)
            f = sys.stdin
        else:
            f = open(filename)

        # Read line by line
        for line in f:
            line = line.strip("\n")
            if make_unicode:
                line = tpo.ensure_unicode(line)
            lines.append(line)
    except IOError:
        debug_print("Warning: Exception reading file %s: %s" % (filename, str(sys.exc_info())), 2)
    finally:
        # Only close handles opened here: closing sys.stdin would break
        # any later reader of standard input in the same process.
        if (f is not None) and (not use_stdin):
            f.close()
    debug_print("read_lines(%s) => %s" % (filename, lines), 6)
    return lines
def create_directory(path):
    """Makes directory PATH via os.mkdir unless something already exists there (with tracing)
    Note: when PATH already exists, assertion() is used to check that it is a directory"""
    if os.path.exists(path):
        assertion(os.path.isdir(path))
        return
    os.mkdir(path)
    debug_format("os.mkdir({p})", 6, p=path)
    return
def getenv_filename(var, default="", description=None):
    """Looks up filename from environment variable VAR via tpo.getenv_text, falling back
    to DEFAULT and passing along the optional DESCRIPTION.
    Note: an error is printed on stderr if the resulting file fails the non_empty_file check;
    the filename is returned regardless."""
    debug_format("getenv_filename({v}, {d}, {desc})", 6, v=var, d=default, desc=description)
    filename = tpo.getenv_text(var, default, description)
    if not filename:
        # Nothing specified, so nothing to check
        return filename
    if not non_empty_file(filename):
        tpo.print_stderr("Error: filename %s empty or missing for environment option %s" % (filename, var))
    return filename
def tokenize(text):
    r"""Returns list of tokens from TEXT.

    By default TEXT is split on runs of non-word characters (i.e., \W+), so
    each token is a run of word characters (\w+). If the module option
    PRESERVE is set, TEXT is instead split on whitespace (\s+) so that
    punctuation stays attached to the tokens. If DOWNCASE is set, the tokens
    are lowercased. Empty tokens are dropped.
    """
    # TODO: Allow for tokenization regex to be overwritten
    # Note: the separator for PRESERVE must be whitespace (\s+): splitting on
    # non-whitespace (\S+) leaves only the whitespace pieces, hence no tokens.
    token_regex = r"\s+" if PRESERVE else r"\W+"
    # Splitting leaves empty strings at the edges when TEXT starts or ends
    # with a separator, so these are filtered out.
    tokens = [t for t in re.split(token_regex, text) if t]
    if DOWNCASE:
        tokens = [t.lower() for t in tokens]
    tpo.debug_format("tokenize({txt}) => {t}", 7, txt=text, t=tokens)
    return tokens
def delete_file(filename):
    """Deletes FILENAME, returning True if it was removed and False otherwise.

    A soft assertion warns if FILENAME does not exist beforehand. An OSError
    during removal is traced rather than propagated.
    """
    debug_print("delete_file(%s)" % tpo.normalize_unicode(filename), 5)
    assertion(os.path.exists(filename))
    ok = False
    try:
        # Note: os.remove returns None, so success is recorded explicitly
        os.remove(filename)
        ok = True
    except OSError:
        debug_print("Exception during deletion of %s: %s" % (filename, str(sys.exc_info())), 5)
    else:
        debug_format("remove({f}) => {r}", 6, f=filename, r=ok)
    return ok
def get_directory_listing(dirname, make_unicode=False):
    """Lists the entries of directory DIRNAME (as given by os.listdir).
    Note: an empty list results if the listing fails with OSError; with MAKE_UNICODE,
    each name is converted via tpo.ensure_unicode."""
    try:
        entries = os.listdir(dirname)
    except OSError:
        entries = []
        tpo.debug_format("Exception during get_directory_listing: {exc}", 4, exc=str(sys.exc_info()))

    if make_unicode:
        entries = list(map(tpo.ensure_unicode, entries))
    tpo.debug_format("get_directory_listing({dir}) => {files}", 5, dir=dirname, files=entries)
    return entries
def resolve_path(filename, base_dir=None):
    """Returns FILENAME as is when it exists, otherwise FILENAME joined onto BASE_DIR.
    Note: when BASE_DIR is not given, the directory of the calling module's file is used
    (i.e., as if os.path.dirname(__file__) were passed), or "" if that cannot be determined."""
    path = filename
    needs_resolution = not os.path.exists(path)

    # Derive the base directory from the caller's module when not supplied
    # Note: the frame lookup stays inline so that f_back refers to the caller of this function
    if needs_resolution and not base_dir:
        caller_frame = None
        try:
            caller_frame = inspect.currentframe().f_back
            base_dir = os.path.dirname(caller_frame.f_globals['__file__'])
        except (AttributeError, KeyError):
            base_dir = ""
            debug_print("Exception during resolve_path: " + str(sys.exc_info()), 5)
        finally:
            # Drop the frame reference explicitly
            if caller_frame:
                del caller_frame

    if needs_resolution:
        path = os.path.join(base_dir, path)
    debug_format("resolve_path({f}) => {p}", 4, f=filename, p=path)
    return path
def extract_matches(pattern, lines, fields=1):
    """Checks for PATTERN matches in LINES of text, returning list of captured groups.

    Each matching line contributes the text of group 1 when FIELDS is 1, or
    else a list with the first FIELDS groups. PATTERN is wrapped in
    parentheses if it contains none. Lines that do not match are skipped.
    A warning is traced (and nothing added) if PATTERN is invalid or if a
    requested group does not exist.
    """
    # ex: extract_matches(r"^(\S+) \S+", ["John D.", "Jane D.", "Plato"]) => ["John", "Jane"]
    assert isinstance(lines, list)
    if "(" not in pattern:
        pattern = "(" + pattern + ")"

    # Compile once up front, so an invalid pattern is reported a single time
    # rather than once per input line.
    regex = None
    try:
        regex = re.compile(pattern)
    except re.error:
        debug_print("Warning: Exception in pattern matching: %s" % str(sys.exc_info()), 2)

    matches = []
    if regex is not None:
        for line in lines:
            match = regex.search(line)
            if not match:
                continue
            try:
                if fields == 1:
                    result = match.group(1)
                else:
                    result = [match.group(i + 1) for i in range(fields)]
            except IndexError:
                # More fields requested than groups in the pattern
                debug_print("Warning: Exception in pattern matching: %s" % str(sys.exc_info()), 2)
            else:
                matches.append(result)

    debug_print("extract_matches(%s, _, [%s]) => %s" % (pattern, fields, matches), 7)
    double_indent = INDENT + INDENT
    debug_format("{ind}input lines: {{\n{res}\n{ind}}}", 8,
                 ind=INDENT, res=indent_lines("\n".join(lines), double_indent))
    return matches
def __iter__(self):
    """Returns iterator producing the list of tokens for one input line at a time.

    If self.file_name is a directory, each file directly within it is
    processed (subdirectories are skipped with a warning); otherwise just
    the file itself is processed.
    """
    # Derive the list of filenames to process
    # TODO: support recursive directory descent
    tpo.debug_print("in MySentences.__iter__()", 6)
    if os.path.isdir(self.file_name):
        dir_name = self.file_name
        file_names = [os.path.join(dir_name, f) for f in os.listdir(dir_name)]
    else:
        file_names = [self.file_name]

    # Feed each sentence individually from each file
    # TODO: add preprocessing (e.g., tokenize, make lowercase, etc.)
    for file_name in file_names:
        if os.path.isdir(file_name):
            tpo.debug_format("Warning: skipping subdirectory {f}", tpo.WARNING, f=file_name)
            continue
        tpo.debug_format("Processing file {f}", tpo.DETAILED, f=file_name)
        # Note: context manager ensures each file gets closed, including when
        # the consumer stops iterating early.
        with open(file_name) as input_file:
            for line in input_file:
                tokens = tokenize(line)
                tpo.debug_format("MySentences.__iter__: yielding {t}", 6, t=tokens)
                yield tokens
    tpo.debug_print("out MySentences.__iter__()", 6)
    return
def assertion(condition):
    """Issues warning if CONDITION doesn't hold (i.e., a soft assertion).

    The warning shows the text of the condition, as extracted from the source
    line of the caller, along with the file name and line number. When these
    are not available (e.g., during interactive use), ??? is shown instead.
    Nothing is raised and nothing is returned.
    """
    # EX: assertion(2 + 2 != 5)
    # TODO: rename as soft_assertion???; add to tpo_common.py (along with run???)
    import linecache

    if condition:
        return

    # Try to get file and line number from stack frame
    # note: not available during interactive use
    filename = None
    line_num = -1
    frame = None
    try:
        frame = inspect.currentframe().f_back
        tpo.debug_trace("frame=%s", frame, level=8)
        tpo.trace_object(frame, 9, "frame")
        filename = frame.f_globals.get("__file__")
        if filename and filename.endswith(".pyc"):
            # Map compiled file name to the source file (e.g., it.pyc => it.py)
            filename = filename[:-1]
        line_num = frame.f_lineno
    finally:
        if frame:
            del frame

    # Get text for line and extract the condition from invocation,
    # ignoring comments and function name.
    # Note: linecache is used instead of running tail and head in a shell,
    # which is not portable and breaks on file names containing quotes.
    line = "???"
    if filename:
        line = linecache.getline(filename, line_num).rstrip("\n") or "???"
    condition = re.sub(r"^\s*\S*assertion\((.*)\)\s*(\#.*)?$", "\\1", line)

    # Print the assertion warning
    line_spec = "???"
    if filename:
        line_spec = "{f}:{l}".format(f=filename, l=line_num)
    debug_format("*** Warning: assertion failed: ({c}) at {ls}", tpo.WARNING,
                 c=condition, ls=line_spec)
    return
def get_matching_files(pattern):
    """Returns the list of paths that glob.glob produces for wildcard PATTERN (with tracing)"""
    matching = glob.glob(pattern)
    tpo.debug_format("get_matching_files({p}) => {l}", 5, p=pattern, l=matching)
    return matching
def form_path(*filenames):
    """Joins the FILENAMES components into a single path via os.path.join (with tracing)"""
    components = tuple(filenames)
    joined = os.path.join(*filenames)
    debug_format("form_path{f} => {p}", 6, f=components, p=joined)
    return joined
def main():
    """Entry point for script: parses the command line, then creates a word2vec
    model from the input text (or loads an existing one), optionally saving the
    model, printing its vectors, and showing similarity info for terms read
    from standard input."""
    tpo.debug_print("main(): sys.argv=%s" % sys.argv, 4)

    # Parse command-line arguments
    env_options = tpo.formatted_environment_option_descriptions(indent=" ")
    usage_description = tpo.format("""
Creates Google word2vec model (via gensim) of word distributions inferred from
the occurrences in the input text file. Note: input should be a text file
(or directory) when creating from scratch or the basename of model file
if loading existing model.

Notes:
- The input file should have one document per line (multiple sentences allowed).
- The following environment options are available:
{env}
""", env=env_options)
    parser = argparse.ArgumentParser(
        description=usage_description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--save", default=False, action='store_true',
                        help="Save model to disk")
    parser.add_argument("--load", default=False, action='store_true',
                        help="Load model from disk")
    parser.add_argument("--print", default=False, action='store_true',
                        help="Print vectors on standard output")
    parser.add_argument(
        "filename", default=None,
        help="Input data filename (or basename when loading previously saved model); if a directory all files within are processed")
    parser.add_argument(
        "--output-basename", default=None,
        help="Basename to use for output (by default input file without .txt extension)")
    parser.add_argument(
        "--show-similarity", default=False, action='store_true',
        help="Show similar terms for those from input (one per line)")
    # TODO: parser.add_argument("--language-model", default=None, help="Language model to use for rating similar terms")
    args = vars(parser.parse_args())
    tpo.debug_print("args = %s" % args, 5)
    filename = args['filename']
    save = args['save']
    load = args['load']
    print_vectors = args['print']
    show_similarity = args['show_similarity']
    output_basename = args['output_basename']
    # TODO: put version of glue_helper's assertion into tpo_common.py already!
    gh.assertion(filename)

    # Derive the basename if not given (checking one of .txt/.list/.prep extensions if training or .word2vec if loading)
    # TODO: rework in terms of stripping whatever file extension is used (e.g., "it.fubar" => "it")
    if not output_basename:
        input_extensions = [".txt", ".list", ".prep"] if (not load) else [WORD2VEC_MODEL_EXT]
        output_basename = filename
        for extension in input_extensions:
            output_basename = gh.remove_extension(filename, extension)
            # Stop at the first extension that was actually removed
            if (output_basename != filename):
                break
    tpo.debug_print("output_basename=%s" % output_basename, 5)

    # Enable logging if debugging
    if (tpo.debugging_level()):
        # TODO: use mapping from symbolic LEVEL user option (e.g., via getenv)
        level = logging.INFO if (tpo.debug_level < 4) else logging.DEBUG
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=level)

    # Optionally set random seed
    if RANDOM_SEED != -1:
        # Note: trace level and value are given explicitly, as with the other debug_format calls
        tpo.debug_format("Setting random seed to {seed}", 4, seed=RANDOM_SEED)
        numpy.random.seed(RANDOM_SEED)

    # Process the input file(s), either creating model from scratch or loading existing one
    if load:
        model = Word2Vec.load(filename)
    else:
        sentences = MySentences(filename)
        if tpo.verbose_debugging():
            # TODO: try to develop read-only function that makes copy of iterator
            sentences = list(sentences)
            gh.assertion(len(sentences) > 0)
        tpo.debug_format("sentences={s}", 6, s=sentences)
        # Notes: 1 is default for word2vec (todo, try None)
        seed = 1 if (RANDOM_SEED == -1) else RANDOM_SEED
        model = Word2Vec(sentences, workers=NUM_WORKERS, seed=seed)

    # Optionally save model to disk
    if (save):
        model.save(output_basename + WORD2VEC_MODEL_EXT)

    # Print the vector representations
    # TODO: add option to print word similarity matrix
    if print_vectors:
        all_words = sorted(model.vocab.keys())
        tpo.debug_format("model={m}", 6, m=model)
        print("Vocabulary terms: %s" % all_words)
        for word in all_words:
            vector = model[word]
            # Note: values are passed as keywords rather than pre-interpolated,
            # so that braces in a token cannot be mistaken for placeholders.
            tpo.debug_format("model[{w}]={v}", 5, w=word, v=vector)
            print("%s\t%s" % (word, vector))

    # Show similarity info for terms from input
    # TODO: add better recovery for terms unknown
    if show_similarity:
        tpo.debug_print("Show similarity for terms from stdin", 4)
        print("term(s): similarity info")
        for line in sys.stdin:
            terms = tokenize(line)
            try:
                # TODO: shows language model score for terms replaced by related terms
                if not terms:
                    pass
                elif len(terms) > 1 or SKIP_INDIVIDUAL:
                    print("[%s]: %s" % (", ".join(terms), format_related_terms(model, terms)))
                else:
                    # Note: only reached for a single term with SKIP_INDIVIDUAL off
                    for term in terms:
                        print("[%s]: %s" % (term, format_related_terms(model, [term])))
                print("")
            except KeyError:
                tpo.print_stderr("Error: %s" % str(sys.exc_info()))
    return
def __init__(self, file_name):
    """Constructor: records FILE_NAME (a text file or a directory) on the instance"""
    self.file_name = file_name
    tpo.debug_format("MySentences.__init__({f})", 6, f=file_name)