def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True, sentences=False, simple=True,
                 tokenizer='default'):
    from vsm.corpus import Corpus

    # ensure that nltk packages are downloaded
    ensure_nltk_data_downloaded()

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'simple':
        from topicexplorer.tokenizer import simple_tokenizer
        tokenizer = simple_tokenizer
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    elif tokenizer == 'inpho':
        from topicexplorer.extensions.inpho import inpho_tokenizer
        tokenizer = inpho_tokenizer
    elif tokenizer == 'brain':
        from hyperbrain.parse import brain_tokenizer
        tokenizer = brain_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print("Building corpus from", corpus_path, end=' ')
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print("with {} function".format(corpusbuilder.__name__))

    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop, stop_freq=stop_freq,
                      ignore=ignore, decode=decode, simple=simple,
                      tokenizer=tokenizer)

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label)
                      for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
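A minimal usage sketch of this revision, assuming build_corpus is imported from the module that defines it along with its helpers (ensure_nltk_data_downloaded, get_corpusbuilder_fn, process_pdfs, get_corpus_filename); the paths and parameter choices below are illustrative, not from the source.

# Hypothetical invocation: build a corpus from a directory of .txt files.
# 'data/my-corpus/' and 'models/' are placeholder paths.
corpus_file = build_corpus(
    'data/my-corpus/',      # raw corpus: a single file, a directory, or PDFs
    'models/',              # directory where the corpus .npz file is saved
    nltk_stop=True,         # apply the NLTK stopword list
    stop_freq=2,            # frequency-based stopword threshold
    tokenizer='default')    # word_tokenize from vsm; 'zh', 'ltc', 'och', etc. also accepted
print("Corpus written to", corpus_file)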
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=1,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    # pre-process PDF files
    if corpus_path[-4:] == '.pdf' or util.contains_pattern(corpus_path, '*.pdf'):
        if os.path.isdir(corpus_path):
            print "PDF files detected, extracting plaintext to", corpus_path + '-txt'
            if corpus_path.endswith('/'):
                corpus_path = corpus_path[:-1]
            pdf.main(corpus_path, corpus_path + '-txt')
            corpus_path += '-txt'
        else:
            print "PDF files detected, extracting plaintext to",\
                corpus_path.replace('.pdf', '.txt')
            pdf.main(corpus_path)
            corpus_path = corpus_path.replace('.pdf', '.txt')

    print "Building corpus from", corpus_path

    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        c = toy_corpus(corpus_path, is_filename=True,
                       nltk_stop=nltk_stop, stop_freq=stop_freq, autolabel=True)
    elif os.path.isdir(corpus_path):
        contents = os.listdir(corpus_path)
        contents = [os.path.join(corpus_path, obj) for obj in contents
                    if not any([obj.endswith(suffix) for suffix in ignore])]

        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, chunk_name=context_type,
                           ignore=ignore)
        elif count_dirs > 0 and count_files == 0:
            print "Constructing collection corpus, each folder is a document"
            context_type = 'book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore)
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
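The newer revision above delegates PDF extraction to a process_pdfs helper; in this older revision the same logic is inlined. A sketch of such a helper, reconstructed from that inline handling, might look like the following (the helper's exact implementation and the import location of the pdf module are assumptions, not the shipped code).

import os
from topicexplorer.lib import pdf   # assumed location of the PDF extractor

def process_pdfs(corpus_path):
    # Hypothetical reconstruction: extract plaintext alongside the original
    # PDFs and return the path to the new plaintext corpus.
    if os.path.isdir(corpus_path):
        print("PDF files detected, extracting plaintext to", corpus_path + '-txt')
        if corpus_path.endswith('/'):
            corpus_path = corpus_path[:-1]
        pdf.main(corpus_path, corpus_path + '-txt')
        return corpus_path + '-txt'
    else:
        print("PDF files detected, extracting plaintext to",
              corpus_path.replace('.pdf', '.txt'))
        pdf.main(corpus_path)
        return corpus_path.replace('.pdf', '.txt')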
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True, sentences=False, simple=True,
                 tokenizer='default'):
    from vsm.corpus import Corpus
    from vsm.extensions.corpusbuilders import coll_corpus, dir_corpus, toy_corpus
    if sentences:
        print "Importing sentence constructors"
        from vsm.extensions.ldasentences import dir_corpus, toy_corpus

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print "Building corpus from", corpus_path,
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print "with {} function".format(corpusbuilder.__name__)

    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop, stop_freq=stop_freq,
                      ignore=ignore, decode=decode, simple=simple,
                      tokenizer=tokenizer)

    '''
    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        if sentences:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, autolabel=True, decode=decode)
        else:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, autolabel=True, decode=decode,
                           simple=simple, tokenizer=tokenizer)
    elif os.path.isdir(corpus_path):
        contents = listdir_nohidden(corpus_path)
        contents = [os.path.join(corpus_path, obj) for obj in contents
                    if not any([obj.endswith(suffix) for suffix in ignore])]

        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            if sentences:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode)
            else:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode,
                               simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and not sentences:
            print "Constructing collection corpus, each folder is a document"
            context_type = 'book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore, decode=decode,
                            simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and sentences:
            raise NotImplementedError("""Collection corpuses are too large for
            sentence parsing. Reduce your corpus to a single folder or file.""")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")
    '''

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label)
                      for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
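get_corpusbuilder_fn itself is not defined in this excerpt. A plausible sketch of its selection logic, reconstructed from the commented-out branch above, is given below; the function body is an assumption, and the real helper presumably also wraps the vsm builders so that they accept the uniform keyword arguments used in the call above.

import os

def get_corpusbuilder_fn(corpus_path, sentences, ignore=[]):
    # Hypothetical reconstruction: pick a vsm corpus builder by inspecting
    # the corpus path (file vs. flat directory vs. directory of folders).
    from vsm.extensions.corpusbuilders import coll_corpus, dir_corpus, toy_corpus
    if sentences:
        from vsm.extensions.ldasentences import dir_corpus, toy_corpus

    if os.path.isfile(corpus_path):
        # a single file: each line is a document
        return toy_corpus
    elif os.path.isdir(corpus_path):
        contents = [os.path.join(corpus_path, obj)
                    for obj in os.listdir(corpus_path)
                    if not any(obj.endswith(suffix) for suffix in ignore)]
        count_dirs = len([c for c in contents if os.path.isdir(c)])
        count_files = len([c for c in contents if os.path.isfile(c)])

        if count_files > 0 and count_dirs == 0:
            # a flat directory: each file is a document
            return dir_corpus
        elif count_dirs > 0 and count_files == 0:
            if sentences:
                raise NotImplementedError(
                    "Collection corpora are too large for sentence parsing. "
                    "Reduce your corpus to a single folder or file.")
            # nested directories: each folder is a document
            return coll_corpus
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")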
def write_config(args, config_file=None):
    """
    If config_file is None, then a name is automatically generated.
    """
    config = ConfigParser()
    config.add_section("main")
    config.set("main", "path", os.path.abspath(args.model_path))
    config.set("main", "corpus_file", os.path.abspath(args.corpus_filename))
    config.set("main", "raw_corpus", os.path.abspath(args.corpus_path))
    config.set("main", "sentences", str(args.sentences))

    if args.bibtex:
        config.set("main", "label_module", "topicexplorer.extensions.bibtex")
        config.add_section("bibtex")
        config.set("bibtex", "path", args.bibtex)

    config.add_section("www")
    config.set("www", "corpus_name", args.corpus_print_name)
    config.set("www", "icons", "fingerprint,link")
    config.set("www", "fulltext", "false")
    config.set("www", "tokenizer", args.tokenizer)

    # add a pdf flag to the config file and set it to true if
    # PDF documents were used to build the corpus
    if args.corpus_path[-4:] == '.pdf' or contains_pattern(args.corpus_path, '*.pdf'):
        config.set("www", "pdf", "true")

    config.add_section("logging")
    config.set("logging", "path", "logs/%s/{0}.log" % args.corpus_name)

    if args.htrc:
        config = add_htrc_metadata(
            config, corpus_filename=os.path.abspath(args.corpus_filename))
        if not args.corpus_print_name:
            config.set("www", "corpus_name", "HTRC Data Capsule")

    if args.tokenizer in ['zh', 'ltc', 'och']:
        config.set("main", "lang", "cn")

    if config_file is None:
        config_file = args.corpus_name + ".ini"

        if os.path.basename(args.corpus_path) == args.corpus_name:
            config_file = os.path.join(args.corpus_path, '..', config_file)
            config_file = os.path.normpath(config_file)

    overwrite = None if os.path.exists(config_file) and not args.quiet else True
    while not overwrite:
        overwrite = input("\nConfig file {0} exists. Overwrite? [Y/n] ".format(config_file))
        overwrite = overwrite.lower().strip()
        if overwrite == 'n':
            config_i = 0
            while os.path.exists(config_file):
                config_file = args.corpus_name + ".%d.ini" % config_i
                config_i += 1
            config_file = input("Enter new filename [default: {0}]: ".format(config_file))\
                or config_file
        elif overwrite == '' or overwrite == 'y':
            overwrite = True

    config.set("main", "corpus_desc", config_file + '.md')

    print("Writing configuration file", config_file)
    with open(config_file, "w") as configfh:
        config.write(configfh)

    return config_file
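A hedged usage sketch: write_config expects an argparse-style namespace; the field names below are the attributes the function reads, while every value is a placeholder, not from the source.

from argparse import Namespace

args = Namespace(
    model_path='models/my-corpus/',
    corpus_filename='models/my-corpus/my-corpus.npz',
    corpus_path='data/my-corpus/',
    corpus_name='my-corpus',
    corpus_print_name='My Corpus',
    sentences=False,
    bibtex=None,
    htrc=False,
    tokenizer='default',
    quiet=True)

config_path = write_config(args)        # e.g. writes "my-corpus.ini"
print("Config written to", config_path)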