def format_ds(input_file):
    """Read the file at input_file and return the Docuscope-formatted results
    from the Ity DocuscopeTagger as a string."""
    with open(input_file, 'r') as f:
        text_contents = f.read()
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)
    tagger = DocuscopeTagger(return_included_tags=True)
    tags = tagger.tag(tokens)
    # Workaround for LAT names: keep only the last component of each dotted
    # LAT name in the first rule of every tag. Rules are tuples, so rebuild them.
    for t in tags[1]:
        new_tag = list(t['rules'][0])
        new_tag[0] = new_tag[0].rsplit('.')[-1]
        new_rules = list(t['rules'])
        new_rules.pop(0)
        new_rules.insert(0, new_tag)
        t['rules'] = tuple(new_rules)
    formatter = LATFormatter.LATFormatter()
    return formatter.format(tags=tags, tokens=tokens, s=text_contents, input_file=input_file)
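# Usage sketch (not part of the original module): a minimal, hedged example of
# calling format_ds() on a single plain-text file. The path "sample.txt" is a
# hypothetical placeholder; RegexTokenizer, DocuscopeTagger, and LATFormatter
# are assumed to be importable exactly as they are used above.
def _example_format_ds():
    # Write a tiny input file so the example is self-contained.
    with open("sample.txt", "w") as f:
        f.write("We certainly hope this example clarifies the tagging pipeline.")
    # format_ds() reads the file, tokenizes it, tags it with Docuscope, and
    # returns the formatted result as a string.
    return format_ds("sample.txt")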
def tag_text(text_path, corpus_info, corpus_data_files, tags, formats=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    text_file = codecs.open(text_path, encoding="UTF-8")
    text_contents = text_file.read()
    text_file.close()
    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)
    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tag_args in tags.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(__import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # Optionally use the rules file that was uploaded with the corpus.
        if tag_name == "SimpleRule" and (
            "SimpleRule" in corpus_data_files and
            "saved" in corpus_data_files["SimpleRule"] and
            len(corpus_data_files["SimpleRule"]["saved"]) > 0
        ):
            if "rules_filename" not in tagger_init_args:
                if len(corpus_data_files["SimpleRule"]["saved"]) > 1:
                    raise NotImplementedError("Multiple rules files for SimpleRuleTagger is not yet supported.")
                tagger_init_args.update(
                    rules_filename=corpus_data_files["SimpleRule"]["saved"][0]
                )
        # Otherwise, SimpleRuleTagger will use the default rules file it knows the path to internally.
        elif tag_name == "TopicModel":
            tagger_init_args.update(
                corpus_name=corpus_info["name"]
            )
        # Instantiate this tagger.
        tagger_instance = tagger_module(**tagger_init_args)
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger_instance.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps
    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formats is not None:
        format_outputs = format_text(tag_maps, tokens, output_dict, corpus_info, formats, write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
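# Usage sketch (an assumption-laden example, not from the original source):
# tag_text() expects `tags` to map tagger base names (without the "Tagger"
# suffix) to init-argument dicts, and `corpus_info` to carry at least a
# "name" key. The corpus layout and paths below are hypothetical.
def _example_tag_text():
    corpus_info = {"name": "demo_corpus"}
    corpus_data_files = {}  # no uploaded SimpleRule rules file in this sketch
    tags = {"Docuscope": None}  # instantiate DocuscopeTagger with default args
    result = tag_text(
        text_path="/tmp/demo_corpus/example.txt",  # hypothetical text file
        corpus_info=corpus_info,
        corpus_data_files=corpus_data_files,
        tags=tags,
        formats=None,          # skip format_text() so nothing is written to disk
        write_to_disk=False
    )
    # The returned dict bundles the raw text, per-tagger tag dicts, and token counts.
    return result["text_key"], result["num_word_tokens"], result["tag_dicts"].keys()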
def _tag_text_with_existing_instances(text_path, corpus_info, corpus_data_files, taggers, formatters=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    # Try to decode the file with multiple encodings.
    text_file = None
    text_contents = None
    for encoding in ["UTF-8", "ISO-8859-1", "CP1252"]:
        try:
            text_file = codecs.open(text_path, encoding=encoding)
            text_contents = text_file.read()
            break
        except UnicodeDecodeError:
            pass
        finally:
            if text_file is not None:
                text_file.close()
    if text_contents is None:
        raise NotImplementedError("Could not find a valid encoding for input file %s" % text_path)
    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)
    # Tag with the already-instantiated taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tagger in taggers.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps
    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formatters is not None:
        format_outputs = _format_text_with_existing_instances(tag_maps, tokens, output_dict, corpus_info, formatters, write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
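# Usage sketch (hedged, not part of the original module): unlike tag_text(),
# this helper reuses tagger instances across many texts, which avoids paying
# the tagger start-up cost once per file. DocuscopeTagger and the corpus
# details below are assumptions for illustration only.
def _example_tag_many_texts(text_paths):
    corpus_info = {"name": "demo_corpus"}
    taggers = {"Docuscope": DocuscopeTagger(return_included_tags=True)}  # built once, reused
    results = []
    for text_path in text_paths:
        results.append(_tag_text_with_existing_instances(
            text_path,
            corpus_info,
            corpus_data_files={},
            taggers=taggers,
            formatters=None,     # skip _format_text_with_existing_instances()
            write_to_disk=False
        ))
    return results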
def csv_formatter_app(args):
    # Get the input files with the appropriate file extension.
    patterns = None
    if args.file_extension is not None:
        patterns = ("\." + args.file_extension + "$",)
    # Figure out which tagger we need.
    imported_tagger = getattr(__import__("Ity.Taggers", fromlist=[args.tagger_module_name]), args.tagger_module_name)
    # Make sure the corpus folder at corpus_path exists.
    # If args.corpus_path is an absolute path, os.path.join() will do the right thing.
    corpus_path = os.path.join(
        corpus_root,
        args.corpus_path
    )
    if not os.path.exists(corpus_path):
        raise ValueError("Corpus at path '%s' does not exist." % corpus_path)
    # TopicModelTagger and a few other things may need this.
    corpus_name = os.path.basename(corpus_path)
    # Filter by file names in the corpus.
    if args.filenames is not None and len(args.filenames) > 0:
        for index, filename in enumerate(args.filenames):
            args.filenames[index] = os.path.join(corpus_path, filename)
        input_paths = FilePaths.valid_paths(args.filenames, patterns=patterns, recursion_levels=3, debug=args.debug)
    else:
        input_paths = FilePaths.valid_paths((corpus_path,), patterns=patterns, recursion_levels=3, debug=args.debug)
    ################################
    #### Initialize Ity Modules ####
    ################################
    tokenizer = RegexTokenizer()
    # Instantiate *one* tagger. Note that TopicModelTagger needs a model_path given to it.
    # TODO: Support for multiple taggers.
    # TODO: Run the TopicModel generator for a brand new corpus for which we have no metadata.
    # TODO: It seems like TopicModelTagger implies some kind of CorpusTagger with corpus-specific data. It'd be good to make that a real subclass.
    if args.tagger_module_name == "TopicModelTagger":
        tagger = imported_tagger(corpus_name=corpus_name)
    # Use the rules filename for SimpleRuleTagger if we got one. Otherwise, SimpleRuleTagger will use the rules in "default.csv".
    elif args.tagger_module_name == "SimpleRuleTagger" and args.rules_file is not None:
        tagger = imported_tagger(rules_filename=args.rules_file)
    else:
        tagger = imported_tagger()
    formatter = CSVFormatter()
    # Keep calm and DO THINGS
    tags_list = []
    tokens_list = []
    str_list = []
    text_name_list = []
    # Process each text in the corpus.
    for path_index, path in enumerate(input_paths):
        # Get the name of the text. That appears as output in the CSV.
        text_name = os.path.splitext(os.path.basename(path))[0]
        text_name_list.append(text_name)
        start_time = time()
        # Open the file and get its contents.
        the_file = codecs.open(path, encoding="utf-8")
        the_str = the_file.read()
        the_file.close()
        str_list.append(the_str)
        # Tokenize.
        tokens = tokenizer.tokenize(the_str)
        tokens_list.append(tokens)
        # Tag.
        tag_data, tag_maps = tagger.tag(tokens)
        tags_list.append([tag_data, tag_maps])
        end_time = time()
        # Debug output
        if args.debug:
            message = "\t** Processed '%s' (%u / %u) in %f seconds. **" % (
                os.path.basename(path),
                path_index + 1,
                len(input_paths),
                end_time - start_time
            )
            print message
    # Output the CSV.
    csv_str = formatter.batch_format(
        tags_list=tags_list,
        tokens_list=tokens_list,
        corpus_name=corpus_name,
        s_list=str_list,
        text_name_list=text_name_list
    )
    # Write the csv_str out to a file.
    if args.output_filename is None:
        csv_filename = corpus_name + "_" + args.tagger_module_name + ".csv"
    else:
        csv_filename = args.output_filename
    # Do we have a specified output directory in the args object?
    if args.output_dir is not None:
        csv_dir = os.path.abspath(
            os.path.expanduser(args.output_dir)
        )
    else:
        # Output the CSV alongside this script by default.
        csv_dir = os.path.abspath(os.path.dirname(__file__))
    # Create the output directory if it doesn't exist.
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    # Get the full file path to the output CSV.
    csv_path = os.path.join(
        csv_dir,
        csv_filename
    )
    # Write the CSV to disk.
    try:
        csv_file = codecs.open(csv_path, encoding="utf-8", mode="w")
        csv_file.write(csv_str)
        csv_file.close()
        # Debug output
        if args.debug:
            message = "** Wrote CSV containing tagged data for corpus '%s' to '%s'. **" % (corpus_name, csv_path)
            print message
        return csv_path
    except IOError:
        if args.debug:
            message = "**** Error writing out CSV containing tagged data for corpus '%s' to '%s'. ****" % (corpus_name, csv_path)
            print message
        return None
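# Driver sketch (hypothetical, not from the original source): csv_formatter_app()
# only reads attributes off an argparse-style namespace, so a minimal
# command-line entry point might look like the following. The attribute names
# mirror the fields the function uses above; the defaults are assumptions.
def _example_build_arg_parser():
    import argparse
    parser = argparse.ArgumentParser(description="Tag a corpus and write a CSV of the results.")
    parser.add_argument("corpus_path", help="Corpus folder, absolute or relative to corpus_root.")
    parser.add_argument("--tagger_module_name", default="DocuscopeTagger")
    parser.add_argument("--file_extension", default="txt")
    parser.add_argument("--filenames", nargs="*", default=None)
    parser.add_argument("--rules_file", default=None)
    parser.add_argument("--output_dir", default=None)
    parser.add_argument("--output_filename", default=None)
    parser.add_argument("--debug", action="store_true")
    return parser

if __name__ == "__main__":
    # Parse arguments and run the CSV formatter end to end.
    csv_formatter_app(_example_build_arg_parser().parse_args())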