def format_related_terms(model, positive_terms, max_num=NUM_TOP):
    """Determine related terms from MODEL for POSITIVE_TERMS, returning at most MAX_NUM entries."""
    # Try to get most similar terms. If some words are not in the vocabulary,
    # retry with the remainder (if any).
    all_related_info = []
    try:
        all_related_info = model.most_similar(positive=positive_terms)
    except KeyError:
        missing = [w for w in positive_terms if w not in model]
        tpo.print_stderr("Warning: omitting words not in model: %s" % missing)
        ok_words = tpo.difference(positive_terms, missing)
        if ok_words:
            try:
                all_related_info = model.most_similar(positive=ok_words)
            except Exception:
                tpo.print_stderr("Unexpected error in format_related_terms: "
                                 + str(sys.exc_info()))

    # Add related terms unless filtered due to low frequency or embedded punctuation
    related_specs = []
    for (term, score) in all_related_info:
        if SKIP_LOW_FREQUENCY_TERMS and term.lower() not in TERM_FREQ_HASH:
            tpo.debug_print("Skipping low frequency related term '%s'" % term, 6)
            continue
        if SKIP_TERMS_WITH_PUNCTUATION and re.search(r"\W", term):
            tpo.debug_print("Skipping related term '%s' due to punctuation" % term, 6)
            continue
        related_specs.append(term + ": " + tpo.round_num(score))
        if len(related_specs) == max_num:
            break
    return ", ".join(related_specs)
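# Minimal usage sketch for format_related_terms (illustrative only): "text8" is a
# hypothetical model basename, and the query term and scores shown are made up.
# Assumes the module-level Word2Vec import and WORD2VEC_MODEL_EXT constant used
# elsewhere in this script.
def _example_format_related_terms():
    """Print comma-separated "term: score" entries for a single query term"""
    model = Word2Vec.load("text8" + WORD2VEC_MODEL_EXT)
    # Output looks something like "queen: 0.651, monarch: 0.608, ..." (at most 5 entries)
    print(format_related_terms(model, ["king"], max_num=5))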
def read_lines(filename=None, make_unicode=False):
    """Returns list of lines from FILENAME with trailing newlines removed
    @notes: Uses stdin if FILENAME is None. Optionally returned as unicode."""
    # TODO: use enumerate(f); refine the exception in the except clause
    # TODO: force unicode if UTF-8 encountered
    lines = []
    f = None
    try:
        # Open the file, falling back to standard input
        if not filename:
            tpo.debug_format("Reading from stdin", 4)
            f = sys.stdin
        else:
            f = open(filename)
        # Read line by line
        for line in f:
            line = line.strip("\n")
            if make_unicode:
                line = tpo.ensure_unicode(line)
            lines.append(line)
    except IOError:
        debug_print("Warning: Exception reading file %s: %s"
                    % (filename, str(sys.exc_info())), 2)
    finally:
        # Close the file, taking care not to close stdin
        if f and (f is not sys.stdin):
            f.close()
    debug_print("read_lines(%s) => %s" % (filename, lines), 6)
    return lines
def write_file(filename, text, append=False):
    """Writes TEXT to FILENAME, adding a trailing newline; optionally APPENDs rather than overwriting"""
    ## TEST: debug_print(u"write_file(%s, %s)" % (filename, text), 7)
    ## TEST: debug_print(u"write_file(%s, %s)" % (filename, tpo.normalize_unicode(text)), 7)
    debug_print("write_file(%s, %s)"
                % (tpo.normalize_unicode(filename), tpo.normalize_unicode(text)), 7)
    text_lines = text.rstrip("\n").split("\n")
    return write_lines(filename, text_lines, append)
def __iter__(self):
    """Returns iterator producing one tokenized line at a time"""
    # Derive the list of filenames to process
    # TODO: support recursive directory descent
    tpo.debug_print("in MySentences.__iter__()", 6)
    file_names = None
    if os.path.isdir(self.file_name):
        dir_name = self.file_name
        file_names = [os.path.join(dir_name, f) for f in os.listdir(dir_name)]
    else:
        file_names = [self.file_name]

    # Feed each sentence individually from each file
    # TODO: add preprocessing (e.g., tokenize, make lowercase, etc.)
    for file_name in file_names:
        if os.path.isdir(file_name):
            tpo.debug_format("Warning: skipping subdirectory {f}", tpo.WARNING, f=file_name)
            continue
        tpo.debug_format("Processing file {f}", tpo.DETAILED, f=file_name)
        with open(file_name) as f:
            for line in f:
                ## OLD: tokens = line.split()
                tokens = tokenize(line)
                tpo.debug_format("MySentences.__iter__: yielding {t}", 6, t=tokens)
                yield tokens
    tpo.debug_print("out MySentences.__iter__()", 6)
    return
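# Usage sketch for MySentences (illustrative): "corpus.txt" is a hypothetical
# one-document-per-line input file, and NUM_WORKERS is the module constant used
# in main() below.
def _example_my_sentences():
    """Train a Word2Vec model by streaming sentences from disk"""
    sentences = MySentences("corpus.txt")
    # gensim iterates the corpus more than once (vocabulary scan plus training),
    # which works here because __iter__ reopens the file on each pass.
    return Word2Vec(sentences, workers=NUM_WORKERS)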
def elide(text, max_len=MAX_ELIDED_TEXT_LEN):
    """Returns TEXT elided to at most MAX_LEN characters (with '...' used to indicate the remainder). Note: intended for tracing long strings."""
    # TODO: add support for eliding at word boundaries
    result = text
    if len(result) > max_len:
        result = result[:max_len] + "..."
    tpo.debug_print("elide(%s, [%s]) => %s" % (text, max_len, result), 7)
    return result
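# Minimal check for elide (illustrative literals, mirroring the # EX convention
# used elsewhere in this module):
def _example_elide():
    """Sanity-check elide truncation behavior"""
    assert elide("scrumptious", max_len=5) == "scrum..."
    assert elide("short", max_len=10) == "short"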
def copy_file(source, target):
    """Copy SOURCE file to TARGET file"""
    # Note: metadata is not copied (e.g., access control lists); see
    # https://docs.python.org/2/library/shutil.html
    debug_print("copy_file(%s, %s)"
                % (tpo.normalize_unicode(source), tpo.normalize_unicode(target)), 5)
    assertion(non_empty_file(source))
    shutil.copy(source, target)
    assertion(non_empty_file(target))
    return
def remove_extension(filename, extension):
    """Returns FILENAME without EXTENSION. Note: similar to basename() but retaining the directory portion."""
    # EX: remove_extension("/tmp/solr-4888.log", ".log") => "/tmp/solr-4888"
    # EX: remove_extension("/tmp/fubar.py", ".py") => "/tmp/fubar"
    # EX: remove_extension("/tmp/fubar.py", "py") => "/tmp/fubar."
    # NOTE: Unlike os.path.splitext, only the specific extension is removed (not whatever extension is used).
    # Note: rfind is used so that an earlier embedded occurrence of EXTENSION
    # doesn't truncate the name (e.g., "it.py.py" => "it.py").
    pos = filename.rfind(extension)
    base = filename[:pos] if (pos > -1) else filename
    debug_print("remove_extension(%s, %s) => %s" % (filename, extension, base), 5)
    return base
def delete_file(filename):
    """Deletes FILENAME, returning whether successful"""
    debug_print("delete_file(%s)" % tpo.normalize_unicode(filename), 5)
    assertion(os.path.exists(filename))
    ok = False
    try:
        # Note: os.remove returns None, so success is tracked explicitly
        os.remove(filename)
        ok = True
        debug_format("os.remove({f}) => {r}", 6, f=filename, r=ok)
    except OSError:
        debug_print("Exception during deletion of %s: %s"
                    % (filename, str(sys.exc_info())), 5)
    return ok
def basename(filename, extension=None):
    """Removes directory from FILENAME along with optional EXTENSION, as with the Unix basename command. Note: the period in the extension must be explicitly supplied (e.g., '.data' not 'data')"""
    # EX: basename("fubar.py", ".py") => "fubar"
    # EX: basename("fubar.py", "py") => "fubar."
    # EX: basename("/tmp/solr-4888.log", ".log") => "solr-4888"
    base = os.path.basename(filename)
    if extension is not None:
        # Note: rfind, as in remove_extension, so embedded occurrences are ignored
        pos = base.rfind(extension)
        if pos > -1:
            base = base[:pos]
    debug_print("basename(%s, %s) => %s" % (filename, extension, base), 5)
    return base
def resolve_path(filename, base_dir=None):
    """Resolves path for FILENAME relative to BASE_DIR if not in the current directory. Note: this uses the script directory of the calling module if BASE_DIR is not specified (i.e., as if os.path.dirname(__file__) were passed)."""
    path = filename
    if not os.path.exists(path):
        if not base_dir:
            # Infer the caller's script directory from its stack frame
            frame = None
            try:
                frame = inspect.currentframe().f_back
                base_dir = os.path.dirname(frame.f_globals['__file__'])
            except (AttributeError, KeyError):
                base_dir = ""
                debug_print("Exception during resolve_path: " + str(sys.exc_info()), 5)
            finally:
                # Break the reference cycle introduced by holding the frame
                if frame:
                    del frame
        path = os.path.join(base_dir, path)
    debug_format("resolve_path({f}) => {p}", 4, f=filename, p=path)
    return path
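# Usage sketch for resolve_path (illustrative): "config.ini" is a hypothetical
# support file shipped alongside the calling script.
def _example_resolve_path():
    """Locate a support file next to the calling module when it's not in the current directory"""
    # If ./config.ini exists it is returned unchanged; otherwise the path is
    # resolved against the caller's script directory.
    return resolve_path("config.ini")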
def extract_matches(pattern, lines, fields=1):
    """Checks for PATTERN matches in LINES of text, returning a list of the matched groups (a single string per match when FIELDS is 1, else a list of FIELDS strings)"""
    # EX: extract_matches(r"^(\S+) \S+", ["John D.", "Jane D.", "Plato"]) => ["John", "Jane"]
    assert isinstance(lines, list)
    if pattern.find("(") == -1:
        pattern = "(" + pattern + ")"
    matches = []
    for line in lines:
        try:
            match = re.search(pattern, line)
            if match:
                result = (match.group(1) if (fields == 1)
                          else [match.group(i + 1) for i in range(fields)])
                matches.append(result)
        except (re.error, IndexError):
            debug_print("Warning: Exception in pattern matching: %s"
                        % str(sys.exc_info()), 2)
    debug_print("extract_matches(%s, _, [%s]) => %s" % (pattern, fields, matches), 7)
    double_indent = INDENT + INDENT
    debug_format("{ind}input lines: {{\n{res}\n{ind}}}", 8,
                 ind=INDENT, res=indent_lines("\n".join(lines), double_indent))
    return matches
def run(command, trace_level=4, subtrace_level=None, just_issue=False, **namespace):
    """Invokes COMMAND via the system shell, using TRACE_LEVEL for debugging output, returning the result. The command can use format-style templates, resolved from the caller's namespace. The optional SUBTRACE_LEVEL sets tracing for invoked commands (default is same as TRACE_LEVEL); this works around the problem of stderr not being separated, which can be an issue when tracing unit tests.
    Notes: This function doesn't work under Win32. Tabs are not preserved, so redirect stdout to a file if needed."""
    # TODO: make sure no template markers are left in the command text (e.g., "tar cvfz {tar_file}")
    # EX: "root" in run("ls /")
    # Note: script tracing is controlled via the DEBUG_LEVEL environment variable.
    assertion(isinstance(trace_level, int))
    debug_print("run(%s, [trace_level=%s], [subtrace_level=%s])"
                % (command, trace_level, subtrace_level), (trace_level + 2))
    global default_subtrace_level

    # Keep track of current debug level setting
    debug_level_env = None
    if subtrace_level is None:
        subtrace_level = default_subtrace_level
    if subtrace_level != trace_level:
        debug_level_env = os.getenv("DEBUG_LEVEL")
        setenv("DEBUG_LEVEL", str(subtrace_level))

    # Expand the command template
    # TODO: make this optional
    command_line = command
    if re.search("{.*}", command):
        command_line = tpo.format(command_line, indirect_caller=True,
                                  ignore_exception=False, **namespace)
    debug_print("issuing: %s" % command_line, trace_level)

    # Run the command
    # TODO: check for errors (e.g., "sh: wordnet.py: not found"); make wait explicit
    wait = not command.endswith("&")
    assertion(wait or not just_issue)
    result = getoutput(command_line) if wait else str(os.system(command_line))

    # Restore debug level setting in environment
    if debug_level_env:
        setenv("DEBUG_LEVEL", debug_level_env)
    debug_print("run(_) => {\n%s\n}" % indent_lines(result), (trace_level + 1))
    return result
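# Sketch of run()'s template expansion (illustrative; the directory value is
# arbitrary and assumes a Unix-like shell):
def _example_run():
    """Invoke a shell command with a format-style placeholder resolved from keyword arguments"""
    tmp_dir = "/tmp"
    # The {d} marker is expanded via tpo.format, so this issues "ls /tmp"
    # and returns its captured stdout.
    return run("ls {d}", d=tmp_dir)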
def issue(command, trace_level=4, subtrace_level=None, **namespace):
    """Wrapper around run() for when output is not being saved (i.e., just issues COMMAND). Note: this captures stderr unless redirection is already present, displaying it when debugging"""
    # EX: issue("ls /") => ""
    # EX: issue("xeyes &")
    debug_print("issue(%s, [trace_level=%s], [subtrace_level=%s])"
                % (command, trace_level, subtrace_level), (trace_level + 1))

    # Add stderr redirect to a temporary log file, unless redirection is already present
    log_file = None
    if tpo.debugging() and ("2>" not in command) and ("2>&1" not in command):
        log_file = TEMP_LOG_FILE
        command += " 2>| " + log_file

    # Run the command and trace output
    command_line = command
    if re.search("{.*}", command_line):
        command_line = tpo.format(command_line, indirect_caller=True,
                                  ignore_exception=False, **namespace)
    output = run(command_line, trace_level, subtrace_level, just_issue=True)
    tpo.debug_print("stdout from command: {\n%s\n}\n" % indent(output), (2 + trace_level))

    # Trace any standard error output and remove the temporary log file (unless debugging in detail)
    if log_file:
        if tpo.debugging() and non_empty_file(log_file):
            stderr_output = read_file(log_file)
            tpo.debug_print("stderr output from command: {\n%s\n}\n" % indent(stderr_output))
        if not tpo.detailed_debugging():
            delete_file(log_file)
    return
def write_lines(filename, text_lines, append=False):
    """Creates FILENAME using TEXT_LINES with newlines added; optionally APPENDs rather than overwriting"""
    debug_print("write_lines(%s, _)" % (filename), 5)
    debug_print("    text_lines=%s" % text_lines, 6)
    f = None
    try:
        mode = 'a' if append else 'w'
        f = open(filename, mode)
        for line in text_lines:
            line = tpo.normalize_unicode(line)
            f.write(line + "\n")
    except IOError:
        debug_print("Warning: Exception writing file %s: %s"
                    % (filename, str(sys.exc_info())), 2)
    finally:
        if f:
            f.close()
    return
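# Round-trip sketch for write_lines/read_lines (illustrative; the path is a
# hypothetical temporary file):
def _example_write_and_read_lines():
    """Write a few lines and read them back without trailing newlines"""
    path = "/tmp/_write_lines_demo.txt"
    write_lines(path, ["first", "second"])
    assert read_lines(path) == ["first", "second"]
    delete_file(path)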
def non_empty_file(filename):
    """Whether FILENAME exists and is non-empty"""
    size = (os.path.getsize(filename) if os.path.exists(filename) else -1)
    non_empty = (size > 0)
    debug_print("non_empty_file(%s) => %s (filesize=%s)" % (filename, non_empty, size), 5)
    return non_empty
def extract_match(pattern, lines, fields=1):
    """Extracts first match of PATTERN in LINES for FIELDS"""
    matches = extract_matches(pattern, lines, fields)
    result = (matches[0] if (len(matches) > 0) else None)
    debug_print("match: %s" % result, 5)
    return result
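# Contrast of extract_match vs. extract_matches (illustrative data, reusing the
# # EX example from extract_matches above):
def _example_extract_match():
    """Pull first names out of "First Last" lines"""
    lines = ["John D.", "Jane D.", "Plato"]
    # extract_matches returns every match; extract_match just the first (or None)
    assert extract_matches(r"^(\S+) \S+", lines) == ["John", "Jane"]
    assert extract_match(r"^(\S+) \S+", lines) == "John"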
def main():
    """Entry point for script"""
    tpo.debug_print("main(): sys.argv=%s" % sys.argv, 4)

    # Parse command-line arguments
    env_options = tpo.formatted_environment_option_descriptions(indent=" ")
    usage_description = tpo.format("""
Creates Google word2vec model (via gensim) of word distributions inferred from
the occurrences in the input text file. Note: the input should be a text file
(or directory) when creating from scratch, or the basename of the model file
if loading an existing model.

Notes:
- The input file should have one document per line (multiple sentences allowed).
- The following environment options are available:
  {env}
""", env=env_options)
    parser = argparse.ArgumentParser(description=usage_description,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--save", default=False, action='store_true',
                        help="Save model to disk")
    parser.add_argument("--load", default=False, action='store_true',
                        help="Load model from disk")
    parser.add_argument("--print", default=False, action='store_true',
                        help="Print vectors on standard output")
    parser.add_argument("filename", default=None,
                        help="Input data filename (or basename when loading previously saved model); if a directory, all files within are processed")
    parser.add_argument("--output-basename", default=None,
                        help="Basename to use for output (by default input file without .txt extension)")
    parser.add_argument("--show-similarity", default=False, action='store_true',
                        help="Show similar terms for those from input (one per line)")
    # TODO: parser.add_argument("--language-model", default=None, help="Language model to use for rating similar terms")
    args = vars(parser.parse_args())
    tpo.debug_print("args = %s" % args, 5)
    filename = args['filename']
    save = args['save']
    load = args['load']
    print_vectors = args['print']
    show_similarity = args['show_similarity']
    output_basename = args['output_basename']
    # TODO: put version of glue_helper's assertion into tpo_common.py already!
    gh.assertion(filename)

    # Derive the basename if not given (checking the .txt/.list/.prep extensions
    # if training, or .word2vec if loading)
    # TODO: rework in terms of stripping whatever file extension is used (e.g., "it.fubar" => "it")
    if not output_basename:
        input_extensions = ([".txt", ".list", ".prep"] if (not load)
                            else [WORD2VEC_MODEL_EXT])
        output_basename = filename
        for extension in input_extensions:
            output_basename = gh.remove_extension(filename, extension)
            if (output_basename != filename):
                break
    tpo.debug_print("output_basename=%s" % output_basename, 5)

    # Enable logging if debugging
    if (tpo.debugging_level()):
        # TODO: use mapping from symbolic LEVEL user option (e.g., via getenv)
        level = logging.INFO if (tpo.debug_level < 4) else logging.DEBUG
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=level)

    # Optionally set random seed
    if RANDOM_SEED != -1:
        tpo.debug_format("Setting random seed to {RANDOM_SEED}")
        numpy.random.seed(RANDOM_SEED)

    # Process the input file(s), either creating the model from scratch or loading an existing one
    if load:
        model = Word2Vec.load(filename)
    else:
        sentences = MySentences(filename)
        if tpo.verbose_debugging():
            # TODO: try to develop a read-only function that makes a copy of the iterator
            sentences = list(sentences)
            gh.assertion(len(sentences) > 0)
            tpo.debug_format("sentences={s}", 6, s=sentences)
        # Note: 1 is the default seed for word2vec (TODO: try None)
        seed = 1 if (RANDOM_SEED == -1) else RANDOM_SEED
        model = Word2Vec(sentences, workers=NUM_WORKERS, seed=seed)

    # Optionally save model to disk
    if (save):
        model.save(output_basename + WORD2VEC_MODEL_EXT)

    # Print the vector representations
    # TODO: add option to print word similarity matrix
    if print_vectors:
        all_words = sorted(model.vocab.keys())
        tpo.debug_format("model={m}", 6, m=model)
        print("Vocabulary terms: %s" % all_words)
        for word in all_words:
            tpo.debug_format("model[%s]=%s" % (word, model[word]), 5)
            print("%s\t%s" % (word, model[word]))

    # Show similarity info for terms from input
    # TODO: add better recovery for unknown terms
    if show_similarity:
        tpo.debug_print("Show similarity for terms from stdin", 4)
        print("term(s): similarity info")
        for line in sys.stdin:
            ## OLD: terms = [t.strip() for t in re.split(r"\W+", line.strip().lower())]
            terms = tokenize(line)
            try:
                # TODO: show language model score for terms replaced by related terms
                if not terms:
                    pass
                elif len(terms) > 1 or SKIP_INDIVIDUAL:
                    print("[%s]: %s" % (", ".join(terms),
                                        format_related_terms(model, terms)))
                else:
                    # Note: reaching here implies a single term and not SKIP_INDIVIDUAL
                    for term in terms:
                        print("[%s]: %s" % (term, format_related_terms(model, [term])))
                print("")
            except KeyError:
                tpo.print_stderr("Error: %s" % str(sys.exc_info()))
    return
def read_file(filename, make_unicode=False):
    """Returns text from FILENAME (as a single string), including newline(s). Note: optionally returned as unicode."""
    debug_print("read_file(%s)" % filename, 7)
    text = "\n".join(read_lines(filename, make_unicode=make_unicode))
    return (text + "\n") if text else ""
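# Round-trip sketch for write_file/read_file (illustrative; the path is a
# hypothetical temporary file):
def _example_read_file():
    """Write text with write_file and read it back, including the trailing newline it adds"""
    path = "/tmp/_read_file_demo.txt"
    write_file(path, "hello\nworld")
    assert read_file(path) == "hello\nworld\n"
    delete_file(path)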