import os
import sys

import FileImporter


def import_path(pathname, verbal=False):
    sys.stdout.write("\n")
    path_results = []
    mogu_files = []
    directories = []
    single_file = pathname.endswith(".mogu")
    if not single_file:
        mogu_files = [entry for entry in os.listdir(pathname)
                      if entry.endswith(".mogu")]
        directories = [entry for entry in os.listdir(pathname)
                       if os.path.isdir(os.path.join(pathname, entry))]
        shortname = os.path.basename(pathname.rstrip(os.sep))
    else:
        mogu_files = [pathname]
        head, tail = os.path.split(pathname)
        shortname = tail
    for directory in directories:
        path_results.extend(
            import_path(os.path.join(pathname, directory), verbal))
    for i, mogufile in enumerate(mogu_files):
        # Display nice progress information so the user doesn't think
        # something is wrong
        sys.stdout.write("\r%s " % (" " * 80))
        sys.stdout.flush()
        sys.stdout.write("\r%s Progress: %d%s (%s)" % (
            shortname,
            ((i + 1.0) / float(len(mogu_files))) * 100,
            "%",
            mogufile))
        sys.stdout.flush()
        # Append lexed results to results
        if single_file:
            sys.stdout.write("Importing single file: %s" % mogufile)
            sys.stdout.flush()
            path_results.extend(FileImporter.import_file(mogufile, verbal))
        else:
            path_results.extend(FileImporter.import_file(
                os.path.join(pathname, mogufile), verbal))
    sys.stdout.write("\n")
    # The results will be a list of tuples, each of which will contain two
    # entries:
    #   index 0 will contain the OrderedDict of token names: tokens.
    #   index 1 will contain the actual map used to parse the tokens
    return path_results
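
# A minimal usage sketch (hypothetical; "examples/" is a placeholder path).
# It assumes each result is a (token OrderedDict, parse map) tuple, as the
# comment above documents, and simply reports what was gathered.
if __name__ == "__main__":
    results = import_path("examples/", verbal=True)
    for tokens, parse_map in results:
        sys.stdout.write("%d tokens parsed with map %r\n"
                         % (len(tokens), parse_map))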
import sys

import FileImporter
import PathImporter
import PythonObjectConverter
import RedisWriter
import SymbolRegistry


def mogu_import(args):
    results = []        # Holds the final result of all consumption operations
    redis_objects = []  # Holds the objects that can be written to Redis
    write = not args.testing  # If testing, don't actually write to Redis
    for path in args.command[1:]:
        if path.endswith(".mogu"):
            # The path points to a specific file
            results.extend(FileImporter.import_file(path, args.v))
        else:
            # The path is a directory
            results.extend(PathImporter.import_path(path, args.v))
    converter = PythonObjectConverter.PythonObjectConverter()
    for result in results:
        redis_objects.extend(converter.convert(result))

    # SANITY CHECKS
    #
    # First, make sure that all symbols referenced are defined.
    for registry in [
            SymbolRegistry.widgetRegistry,
            SymbolRegistry.templateRegistry,
            SymbolRegistry.dataRegistry,
            SymbolRegistry.validatorRegistry,
            SymbolRegistry.policyRegistry]:
        if not registry:
            # A registry evaluates as false if one of its symbols is
            # referenced but never defined
            sys.stderr.write(display_undefined_symbols(registry))
            sys.stderr.write("\n== REFUSING TO CONTINUE ==\n")
            sys.exit()
        if registry.nonreferenced():
            # Warn the user if something was defined but never used
            sys.stderr.write(
                "\n== WARNING: %s contains the following symbols that are "
                "defined but never referenced ==\n" % registry.label)
            for symbol in registry.nonreferenced():
                sys.stderr.write("\t- %s\n" % symbol)
            # Give the user a chance to halt the import and fix the problem
            if not args.yes:
                i = raw_input("Continue anyway? [y to continue, anything else to cancel]: ")
                if i != 'y':
                    sys.stderr.write("Exiting...\n")
                    sys.exit()

    # Sanity checks complete. Now the process of actually writing to Redis.
    # TODO Don't forget to make RedisWriter deal with purging/merging
    if write:
        writer = RedisWriter.RedisWriter(args)  # TODO RedisWriter should read dbconfig.conf instead
        if args.v:
            sys.stderr.write("Writing imported files to database!\n")
        writer.write(redis_objects)
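
# A sketch of the argument object mogu_import expects, assuming an argparse
# front end. The flag names below are assumptions inferred from the
# attributes used above (command, v, testing, yes), not a confirmed CLI.
import argparse


def build_parser():
    parser = argparse.ArgumentParser(description="Import .mogu files into Redis")
    parser.add_argument("command", nargs="+",
                        help="subcommand followed by the files/directories to import")
    parser.add_argument("-v", action="store_true", help="verbose output")
    parser.add_argument("--testing", action="store_true",
                        help="run the sanity checks without writing to Redis")
    parser.add_argument("--yes", action="store_true",
                        help="skip the continue-anyway prompt")
    return parser

# e.g. mogu_import(build_parser().parse_args(["import", "app.mogu"]))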
import csv

import nltk

import chunker as c
import xml_writer as x
# import chunks
from CentroidClusterModule import CentroidClusterModule
import Minute_chunker
import FileImporter
import TagHandler

the_dir = ('/Users/hah661/Documents/Northwestern/MyPHD/'
           'social_policy_course/SocPol_Video/transcript_txts/')
files = [
    'F2014_1.txtout.csv',
    'F2014_2.txtout.csv',
    'F2014_3.txtout.csv',
    'F2014_4.txtout.csv',
    'F2014_5.txtout.csv',
    'W2014_1.txtout.csv',
    'W2014_2.txtout.csv',
    'W2014_3.txtout.csv',
    'W2014_4.txtout.csv',
]

# Read every transcript and concatenate the spoken lines in course order
all_spoken = []
for filename in files:
    all_spoken = all_spoken + FileImporter.import_file(the_dir + filename)

# Tag each utterance with the one-minute chunk it belongs to, then strip
# the tag again
chunk_tags = Minute_chunker.chunk(all_spoken)
all_spoken = TagHandler.apply_tags(all_spoken, chunk_tags, "minute_chunk")
all_spoken = TagHandler.remove_tag(all_spoken, "minute_chunk")

x = CentroidClusterModule()
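
# An alternative to hard-coding the transcript list: discover the quarter
# files with glob. This is a sketch; it assumes the directory contains
# exactly the *.txtout.csv files listed above and nothing else.
import glob
import os

discovered = sorted(glob.glob(os.path.join(the_dir, '*.txtout.csv')))
# Sorted glob yields F2014_1..5 then W2014_1..4, matching the manual list
assert [os.path.basename(p) for p in discovered] == files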