def build_index(corpus_dir, corpus_type, stop_file, index_file, tag_file,
                word_count_file, synch_freq):
    """Add every document of a corpus to an index jar, then synchronize it.

    Args:
        corpus_dir: Location of the corpus. For a phpBB corpus the post
            database path is derived from it by appending ".db".
        corpus_type: Either "phpBB" or "xml".
        stop_file: Forwarded to ``jar.Jar``.
        index_file: Forwarded to ``jar.Jar``.
        tag_file: Forwarded to ``read_corpora.iter_xml``; required when
            ``corpus_type`` is "xml" and ignored for phpBB corpora.
        word_count_file: Forwarded to ``jar.Jar``.
        synch_freq: Forwarded to ``jar.Jar``.

    Raises:
        AttributeError: If ``corpus_type`` is not "phpBB" or "xml", or if an
            xml corpus is requested without a tag file.
    """
    # Validate the arguments before the jar is constructed so that a bad
    # call is rejected immediately instead of after the jar has been set up.
    if corpus_type not in ("phpBB", "xml"):
        raise AttributeError(
            "invalid corpus type %s\nmust be one of phpBB or xml"
            % corpus_type)
    if corpus_type == "xml" and not tag_file:
        raise AttributeError(
            "a tag file must be supplied when parsing an xml corpus")

    index_jar = jar.Jar(index_file, word_count_file, synch_freq, stop_file)

    if corpus_type == "phpBB":
        post_db = corpus_dir + ".db"
        read_corpora.get_phpBB_posts(corpus_dir, post_db)
        posts = iter_indexed_posts(
            post_db, read_corpora.POST_DELIMITER, index_jar)
        for title, post in posts:
            index_jar.add_doc(source=corpus_dir, title=title, text=post)
    else:  # corpus_type == "xml", guaranteed by the validation above
        documents = read_corpora.iter_xml(corpus_dir, tag_file)
        for file_name, title, heading, text in documents:
            # Each field is replaced by whatever index_and_count_text
            # returns for it before the document is added.
            title = index_jar.index_and_count_text(title)
            heading = index_jar.index_and_count_text(heading)
            text = index_jar.index_and_count_text(text)
            index_jar.add_doc(file_name, title, heading, text)

    # One final synch and then we are done.
    index_jar.synchronize()
def main(argv):
    """Compile Java sources and optionally jar, copy and stamp the output.

    The command line is expanded with ``build_utils.ExpandFileArgs`` and then
    parsed with ``optparse``. Positional arguments are treated as the .java
    files to compile; further sources come from ``--src-gendirs`` and from
    the .java files extracted out of ``--java-srcjars``.

    Args:
        argv: Command-line arguments to parse.

    Side effects, depending on the options given: runs ``DoJavac`` into a
    temporary classes directory, writes ``--jar-path`` and
    ``--jar-source-path``, replaces ``--classes-dir`` with the fresh class
    files, writes the depfile and touches ``--stamp``.
    """
    colorama.init()

    argv = build_utils.ExpandFileArgs(argv)

    parser = optparse.OptionParser()
    build_utils.AddDepfileOption(parser)

    parser.add_option(
        '--src-gendirs',
        help='Directories containing generated java files.')
    parser.add_option(
        '--java-srcjars',
        action='append',
        default=[],
        help='List of srcjars to include in compilation.')
    parser.add_option(
        '--bootclasspath',
        action='append',
        default=[],
        help='Boot classpath for javac. If this is specified multiple times, '
        'they will all be appended to construct the classpath.')
    # default=[] matters: with action='append' and no default, optparse
    # leaves the value as None when the flag is absent, and iterating over
    # it below would raise TypeError.
    parser.add_option(
        '--classpath',
        action='append',
        default=[],
        help='Classpath for javac. If this is specified multiple times, they '
        'will all be appended to construct the classpath.')
    parser.add_option(
        '--javac-includes',
        help='A list of file patterns. If provided, only java files that '
        'match one of the patterns will be compiled.')
    parser.add_option(
        '--jar-excluded-classes',
        default='',
        help='List of .class file patterns to exclude from the jar.')
    parser.add_option(
        '--chromium-code',
        type='int',
        help='Whether code being compiled should be built with stricter '
        'warnings for chromium code.')
    parser.add_option(
        '--use-errorprone-path',
        help='Use the Errorprone compiler at this path.')
    parser.add_option(
        '--classes-dir',
        help='Directory for compiled .class files.')
    parser.add_option('--jar-path', help='Jar output path.')
    parser.add_option('--jar-source-path', help='Source jar output path.')
    parser.add_option(
        '--jar-source-base-dir',
        help='Base directory for the source files included in the output '
        'source jar.')
    parser.add_option(
        '--main-class',
        help='The class containing the main method.')
    parser.add_option(
        '--manifest-entry',
        action='append',
        help='Key:value pairs to add to the .jar manifest.')
    parser.add_option('--stamp', help='Path to touch on success.')

    options, args = parser.parse_args(argv)

    if options.main_class and not options.jar_path:
        parser.error('--main-class requires --jar-path')

    # Each repeatable option holds one GYP list per occurrence; flatten them.
    bootclasspath = [
        entry
        for arg in options.bootclasspath
        for entry in build_utils.ParseGypList(arg)
    ]
    classpath = [
        entry
        for arg in options.classpath
        for entry in build_utils.ParseGypList(arg)
    ]
    java_srcjars = [
        entry
        for arg in options.java_srcjars
        for entry in build_utils.ParseGypList(arg)
    ]

    # Copy so that extending java_files never mutates the parser's own list.
    java_files = list(args)
    if options.src_gendirs:
        src_gendirs = build_utils.ParseGypList(options.src_gendirs)
        java_files += build_utils.FindInDirectories(src_gendirs, '*.java')

    # Captured before srcjar extraction on purpose: the extracted files live
    # in a temporary directory, so the srcjars themselves are the inputs.
    input_files = bootclasspath + classpath + java_srcjars + java_files

    with build_utils.TempDir() as temp_dir:
        classes_dir = os.path.join(temp_dir, 'classes')
        os.makedirs(classes_dir)

        if java_srcjars:
            java_dir = os.path.join(temp_dir, 'java')
            os.makedirs(java_dir)
            for srcjar in java_srcjars:
                build_utils.ExtractAll(
                    srcjar, path=java_dir, pattern='*.java')
            java_files += build_utils.FindInDirectory(java_dir, '*.java')

        if options.javac_includes:
            javac_includes = build_utils.ParseGypList(options.javac_includes)
            # Keep a file if it matches at least one include pattern.
            java_files = [
                path for path in java_files
                if any(fnmatch.fnmatch(path, pattern)
                       for pattern in javac_includes)
            ]

        if java_files:
            DoJavac(
                bootclasspath,
                classpath,
                classes_dir,
                options.chromium_code,
                options.use_errorprone_path,
                java_files)

        if options.jar_path:
            if options.main_class or options.manifest_entry:
                # Split on the first colon only so that a value which itself
                # contains ':' stays intact as a single Key:value pair.
                entries = [
                    entry.split(':', 1)
                    for entry in (options.manifest_entry or [])
                ]
                manifest_file = os.path.join(temp_dir, 'manifest')
                CreateManifest(
                    manifest_file, classpath, options.main_class, entries)
            else:
                manifest_file = None
            jar.JarDirectory(
                classes_dir,
                build_utils.ParseGypList(options.jar_excluded_classes),
                options.jar_path,
                manifest_file=manifest_file)

        if options.jar_source_path:
            jar.Jar(
                java_files,
                options.jar_source_base_dir,
                options.jar_source_path)

        if options.classes_dir:
            # Delete the old classes directory. This ensures that all .class
            # files in the output are actually from the input .java files.
            # For example, if a .java file is deleted or an inner class is
            # removed, the classes directory should not contain the
            # corresponding old .class file after running this action.
            build_utils.DeleteDirectory(options.classes_dir)
            shutil.copytree(classes_dir, options.classes_dir)

    if options.depfile:
        build_utils.WriteDepfile(
            options.depfile,
            input_files + build_utils.GetPythonDependencies())

    if options.stamp:
        build_utils.Touch(options.stamp)