def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=1,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Build a corpus from ``corpus_path``, save it and return its filename.

    If ``corpus_path`` ends in ``.pdf`` or ``util.contains_pattern`` reports
    ``*.pdf`` entries, ``pdf.main`` is run first and the corpus is built from
    the resulting text location instead (``<dir>-txt`` for a directory, the
    path with ``.pdf`` replaced by ``.txt`` for a single file).

    The corpus type is chosen from the shape of the (possibly rewritten) path:

    * a regular file      -> ``toy_corpus`` (each line is a document)
    * a folder of files   -> ``dir_corpus`` (each file is a document)
    * a folder of folders -> ``coll_corpus`` (each folder is a document);
      ``context_type`` is forced to ``'book'`` in this case.

    :param corpus_path: file or directory holding the raw corpus.
    :param model_path: forwarded to ``get_corpus_filename``.
    :param nltk_stop: forwarded to the corpus constructor.
    :param stop_freq: forwarded to the corpus constructor.
    :param context_type: chunk name for directory corpora; also part of the
        arguments given to ``get_corpus_filename``.
    :param ignore: filename suffixes skipped when inspecting a directory.
    :returns: the filename the corpus was saved to.
    :raises IOError: if the path is neither a file nor a directory, or the
        directory is empty or mixes files and folders.
    """
    # Hand downstream builders a private copy in case any of them mutates the
    # list; the default is a single object shared across all calls.
    ignore = list(ignore)

    # pre-process PDF files
    if (corpus_path.endswith('.pdf')
            or util.contains_pattern(corpus_path, '*.pdf')):
        if os.path.isdir(corpus_path):
            # Strip the trailing slash *before* deriving the output name so
            # the message shows the directory that is actually used.
            if corpus_path.endswith('/'):
                corpus_path = corpus_path[:-1]
            txt_path = corpus_path + '-txt'
            print("PDF files detected, extracting plaintext to %s" % txt_path)
            pdf.main(corpus_path, txt_path)
        else:
            # NOTE(review): assumes pdf.main writes next to the input using
            # the same '.pdf' -> '.txt' substitution -- confirm in pdf.main.
            txt_path = corpus_path.replace('.pdf', '.txt')
            print("PDF files detected, extracting plaintext to %s" % txt_path)
            pdf.main(corpus_path)
        corpus_path = txt_path

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Building corpus from %s" % corpus_path)

    if os.path.isfile(corpus_path):
        print("Constructing toy corpus, each line is a document")
        c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop,
                       stop_freq=stop_freq, autolabel=True)
    elif os.path.isdir(corpus_path):
        contents = [os.path.join(corpus_path, obj)
                    for obj in os.listdir(corpus_path)
                    if not any(obj.endswith(suffix) for suffix in ignore)]
        # filter() returns an iterator on Python 3, so materialise it before
        # taking its length.
        count_dirs = len(list(filter(os.path.isdir, contents)))
        count_files = len(list(filter(os.path.isfile, contents)))

        print("Detected %d folders and %d files in %s" %
              (count_dirs, count_files, corpus_path))

        if count_files > 0 and count_dirs == 0:
            print("Constructing directory corpus, each file is a document")
            c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, chunk_name=context_type,
                           ignore=ignore)
        elif count_dirs > 0 and count_files == 0:
            print("Constructing collection corpus, each folder is a document")
            context_type = 'book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore)
        elif count_dirs > 0 and count_files > 0:
            # Previously reported as "empty directory", which was misleading.
            raise IOError(
                "Invalid Path: directory contains both files and folders")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
def process_pdfs(corpus_path,
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Run ``pdf.main`` over ``corpus_path`` and return the text location.

    * For a single file, ``pdf.main(corpus_path)`` is called and the returned
      path is ``corpus_path`` with ``.pdf`` replaced by ``.txt``.
    * For a directory containing only files, ``pdf.main`` is called once with
      ``<corpus_path>-txt`` as the output directory.
    * For a directory containing only folders, ``pdf.main`` is called once per
      subfolder, writing to the same-named folder under ``<corpus_path>-txt``.
    * A path that is neither a file nor a directory is returned unchanged.

    :param corpus_path: PDF file, or directory of PDFs / of PDF folders.
    :param ignore: filename suffixes skipped when inspecting a directory.
    :returns: the path where the extracted plaintext is expected.
    :raises IOError: if the directory is empty or mixes files and folders.
    """
    from topicexplorer.lib import pdf

    if os.path.isfile(corpus_path):
        # NOTE(review): assumes pdf.main writes next to the input using the
        # same '.pdf' -> '.txt' substitution -- confirm in pdf.main.
        txt_path = corpus_path.replace('.pdf', '.txt')
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("PDF file detected, extracting plaintext to %s" % txt_path)
        pdf.main(corpus_path)
        corpus_path = txt_path
    elif os.path.isdir(corpus_path):
        # Strip the trailing slash *before* deriving the output name so the
        # message shows the directory that is actually used.
        if corpus_path.endswith('/'):
            corpus_path = corpus_path[:-1]
        txt_path = corpus_path + '-txt'
        print("PDF files detected, extracting plaintext to %s" % txt_path)

        # TODO: Add processing of collections
        contents = [os.path.join(corpus_path, obj)
                    for obj in listdir_nohidden(corpus_path)
                    if not any(obj.endswith(suffix) for suffix in ignore)]
        count_dirs = len(list(filter(os.path.isdir, contents)))
        count_files = len(list(filter(os.path.isfile, contents)))

        if count_files > 0 and count_dirs == 0:
            # process all files
            pdf.main(corpus_path, txt_path)
        elif count_dirs > 0 and count_files == 0:
            # process each subdirectory
            for directory in contents:
                # Build the target from the entry's basename. str.replace
                # would rewrite *every* occurrence of corpus_path, corrupting
                # subfolder names that happen to contain it.
                pdf.main(directory,
                         os.path.join(txt_path, os.path.basename(directory)))
        elif count_dirs > 0 and count_files > 0:
            # Previously reported as "empty directory", which was misleading.
            raise IOError(
                "Invalid Path: directory contains both files and folders")
        else:
            raise IOError("Invalid Path: empty directory")

        corpus_path = txt_path

    return corpus_path
def process_pdfs(corpus_path,
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Extract plaintext from PDFs via ``pdf.main``; return the text path.

    Behaviour depends on what ``corpus_path`` is:

    * a file: ``pdf.main(corpus_path)`` is called; the result is the path
      with ``.pdf`` replaced by ``.txt``.
    * a directory of files: ``pdf.main`` is called once, with
      ``<corpus_path>-txt`` as the output directory.
    * a directory of folders: ``pdf.main`` is called for each subfolder,
      targeting the same-named folder under ``<corpus_path>-txt``.
    * anything else: the path is returned untouched.

    :param corpus_path: PDF file, or directory of PDFs / of PDF folders.
    :param ignore: filename suffixes skipped when inspecting a directory.
    :returns: the path where the extracted plaintext is expected.
    :raises IOError: if the directory is empty or mixes files and folders.
    """
    from topicexplorer.lib import pdf

    if os.path.isfile(corpus_path):
        # NOTE(review): assumes pdf.main writes next to the input using the
        # same '.pdf' -> '.txt' substitution -- confirm in pdf.main.
        txt_path = corpus_path.replace('.pdf', '.txt')
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("PDF file detected, extracting plaintext to %s" % txt_path)
        pdf.main(corpus_path)
        return txt_path

    if not os.path.isdir(corpus_path):
        # Neither a file nor a directory: nothing to convert.
        return corpus_path

    # Normalise first so both the message and the output directory are
    # derived from the slash-free path.
    if corpus_path.endswith('/'):
        corpus_path = corpus_path[:-1]
    txt_path = corpus_path + '-txt'
    print("PDF files detected, extracting plaintext to %s" % txt_path)

    # TODO: Add processing of collections
    contents = [
        os.path.join(corpus_path, obj)
        for obj in listdir_nohidden(corpus_path)
        if not any(obj.endswith(suffix) for suffix in ignore)
    ]
    count_dirs = len(list(filter(os.path.isdir, contents)))
    count_files = len(list(filter(os.path.isfile, contents)))

    if count_files > 0 and count_dirs == 0:
        # process all files
        pdf.main(corpus_path, txt_path)
    elif count_dirs > 0 and count_files == 0:
        # process each subdirectory
        for directory in contents:
            # Join on the basename rather than str.replace: replace would
            # substitute every occurrence of corpus_path, including ones
            # inside the subfolder's own name.
            target = os.path.join(txt_path, os.path.basename(directory))
            pdf.main(directory, target)
    elif count_dirs > 0 and count_files > 0:
        # Previously reported as "empty directory", which was misleading.
        raise IOError(
            "Invalid Path: directory contains both files and folders")
    else:
        raise IOError("Invalid Path: empty directory")

    return txt_path
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=1,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Construct, save and name a corpus built from ``corpus_path``.

    PDF input (a path ending in ``.pdf``, or one for which
    ``util.contains_pattern(corpus_path, '*.pdf')`` is true) is first passed
    through ``pdf.main``; the corpus is then built from ``<dir>-txt`` for a
    directory, or from the path with ``.pdf`` replaced by ``.txt`` for a file.

    Corpus constructor used, by shape of the resulting path:

    * regular file      -> ``toy_corpus`` (each line is a document)
    * folder of files   -> ``dir_corpus`` (each file is a document)
    * folder of folders -> ``coll_corpus`` (each folder is a document), with
      ``context_type`` overridden to ``'book'``.

    :param corpus_path: file or directory holding the raw corpus.
    :param model_path: forwarded to ``get_corpus_filename``.
    :param nltk_stop: forwarded to the corpus constructor.
    :param stop_freq: forwarded to the corpus constructor.
    :param context_type: chunk name for directory corpora; also part of the
        arguments given to ``get_corpus_filename``.
    :param ignore: filename suffixes skipped when inspecting a directory.
    :returns: the filename the corpus was saved to.
    :raises IOError: if the path is neither a file nor a directory, or the
        directory is empty or mixes files and folders.
    """
    # The default list is one object shared by every call; pass a copy on so
    # a callee that mutates it cannot affect later calls.
    ignore = list(ignore)

    # pre-process PDF files
    has_pdfs = (corpus_path.endswith('.pdf')
                or util.contains_pattern(corpus_path, '*.pdf'))
    if has_pdfs and os.path.isdir(corpus_path):
        # Drop the trailing slash before computing the output directory so
        # the printed target matches the one handed to pdf.main.
        if corpus_path.endswith('/'):
            corpus_path = corpus_path[:-1]
        txt_path = corpus_path + '-txt'
        print("PDF files detected, extracting plaintext to %s" % txt_path)
        pdf.main(corpus_path, txt_path)
        corpus_path = txt_path
    elif has_pdfs:
        # NOTE(review): assumes pdf.main writes next to the input using the
        # same '.pdf' -> '.txt' substitution -- confirm in pdf.main.
        txt_path = corpus_path.replace('.pdf', '.txt')
        print("PDF files detected, extracting plaintext to %s" % txt_path)
        pdf.main(corpus_path)
        corpus_path = txt_path

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Building corpus from %s" % corpus_path)

    if os.path.isfile(corpus_path):
        print("Constructing toy corpus, each line is a document")
        c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop,
                       stop_freq=stop_freq, autolabel=True)
    elif os.path.isdir(corpus_path):
        contents = [
            os.path.join(corpus_path, obj)
            for obj in os.listdir(corpus_path)
            if not any(obj.endswith(suffix) for suffix in ignore)
        ]
        # Python 3's filter() is lazy and has no len(); wrap it in list().
        count_dirs = len(list(filter(os.path.isdir, contents)))
        count_files = len(list(filter(os.path.isfile, contents)))

        print("Detected %d folders and %d files in %s" %
              (count_dirs, count_files, corpus_path))

        if count_files > 0 and count_dirs == 0:
            print("Constructing directory corpus, each file is a document")
            c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, chunk_name=context_type,
                           ignore=ignore)
        elif count_dirs > 0 and count_files == 0:
            print("Constructing collection corpus, each folder is a document")
            context_type = 'book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore)
        elif count_dirs > 0 and count_files > 0:
            # Previously reported as "empty directory", which was misleading.
            raise IOError(
                "Invalid Path: directory contains both files and folders")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")

    filename = get_corpus_filename(corpus_path, model_path, nltk_stop,
                                   stop_freq, context_type)
    c.save(filename)
    return filename