Beispiel #1
0
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=1,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Build a corpus from ``corpus_path``, save it and return its filename.

    PDF input is first converted to plaintext via ``pdf.main``; the corpus is
    then built from the extracted text.  A single file is handed to
    ``toy_corpus``, a directory holding only files to ``dir_corpus`` and a
    directory holding only folders to ``coll_corpus``.

    Parameters
    ----------
    corpus_path : str
        File or directory to build the corpus from.
    model_path : str
        Forwarded to ``get_corpus_filename`` to derive the save location.
    nltk_stop : bool
        Forwarded to the corpus constructors and ``get_corpus_filename``.
    stop_freq : int
        Forwarded to the corpus constructors and ``get_corpus_filename``.
    context_type : str
        Passed to ``dir_corpus`` as ``chunk_name``.  Overridden with
        ``'book'`` when a collection corpus is built.
    ignore : list of str
        Filename suffixes skipped when inspecting a directory; also forwarded
        to ``dir_corpus`` and ``coll_corpus``.  The default list is never
        mutated here.

    Returns
    -------
    str
        The filename the corpus was saved to.

    Raises
    ------
    IOError
        If ``corpus_path`` is neither a file nor a directory, or the
        directory is empty (after applying ``ignore``) or mixes files and
        folders.
    """
    # Pre-process PDF input: extract plaintext and continue with the text.
    if corpus_path.endswith('.pdf') or util.contains_pattern(corpus_path, '*.pdf'):
        if os.path.isdir(corpus_path):
            # Strip the trailing slash *before* deriving the output name so
            # the message reports the directory that is actually written.
            if corpus_path.endswith('/'):
                corpus_path = corpus_path[:-1]
            text_path = corpus_path + '-txt'
            print("PDF files detected, extracting plaintext to %s" % text_path)
            pdf.main(corpus_path, text_path)
        else:
            # NOTE(review): str.replace rewrites every '.pdf' occurrence in
            # the path, not only the extension -- confirm this matches where
            # pdf.main writes its output before changing it.
            text_path = corpus_path.replace('.pdf', '.txt')
            print("PDF files detected, extracting plaintext to %s" % text_path)
            pdf.main(corpus_path)
        corpus_path = text_path

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Building corpus from %s" % corpus_path)

    if os.path.isfile(corpus_path):
        print("Constructing toy corpus, each line is a document")
        c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop,
                       stop_freq=stop_freq, autolabel=True)
    elif os.path.isdir(corpus_path):
        contents = [os.path.join(corpus_path, obj)
                    for obj in os.listdir(corpus_path)
                    if not any(obj.endswith(suffix) for suffix in ignore)]
        # Count with generators: len(filter(...)) fails on Python 3, where
        # filter() returns an iterator.
        count_dirs = sum(1 for path in contents if os.path.isdir(path))
        count_files = sum(1 for path in contents if os.path.isfile(path))

        print("Detected %d folders and %d files in %s" %
              (count_dirs, count_files, corpus_path))

        if count_files > 0 and count_dirs == 0:
            print("Constructing directory corpus, each file is a document")
            c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, chunk_name=context_type,
                           ignore=ignore)
        elif count_dirs > 0 and count_files == 0:
            print("Constructing collection corpus, each folder is a document")
            # The collection context type also feeds the saved filename.
            context_type = 'book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore)
        elif count_dirs == 0 and count_files == 0:
            raise IOError("Invalid Path: empty directory")
        else:
            # Previously reported as "empty directory", which was misleading.
            raise IOError("Invalid Path: directory mixes files and folders")
    else:
        raise IOError("Invalid path")

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
Beispiel #2
0
def process_pdfs(corpus_path, ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Extract plaintext from PDF input and return the path of the text.

    A single file is passed to ``pdf.main`` and the returned path has
    ``'.pdf'`` replaced by ``'.txt'``.  A directory is converted into a
    sibling ``<corpus_path>-txt`` directory: either all files at once, or,
    when it holds only folders, one ``pdf.main`` call per subfolder.  A path
    that is neither a file nor a directory is returned unchanged.

    Parameters
    ----------
    corpus_path : str
        PDF file or directory to convert.
    ignore : list of str
        Filename suffixes skipped when inspecting a directory.  The default
        list is never mutated here.

    Returns
    -------
    str
        Path of the extracted text file or directory.

    Raises
    ------
    IOError
        If the directory is empty (after hidden/ignored entries are
        removed) or mixes files and folders.
    """
    from topicexplorer.lib import pdf

    if os.path.isfile(corpus_path):
        # NOTE(review): str.replace rewrites every '.pdf' occurrence in the
        # path, not only the extension -- confirm this matches where
        # pdf.main writes its output before changing it.
        text_path = corpus_path.replace('.pdf', '.txt')
        print("PDF file detected, extracting plaintext to", text_path)
        pdf.main(corpus_path)
        corpus_path = text_path
    elif os.path.isdir(corpus_path):
        # Strip the trailing slash *before* deriving the output name so the
        # message reports the directory that is actually written.
        if corpus_path.endswith('/'):
            corpus_path = corpus_path[:-1]
        output_path = corpus_path + '-txt'
        print("PDF files detected, extracting plaintext to", output_path)

        # TODO: Add processing of collections
        names = [name for name in listdir_nohidden(corpus_path)
                 if not any(name.endswith(suffix) for suffix in ignore)]
        contents = [os.path.join(corpus_path, name) for name in names]
        count_dirs = sum(1 for path in contents if os.path.isdir(path))
        count_files = sum(1 for path in contents if os.path.isfile(path))

        if count_files > 0 and count_dirs == 0:
            # process all files
            pdf.main(corpus_path, output_path)
        elif count_dirs > 0 and count_files == 0:
            # process each subdirectory; build the target with os.path.join
            # rather than str.replace, which would also rewrite any later
            # occurrence of corpus_path inside the entry name.
            for name in names:
                pdf.main(os.path.join(corpus_path, name),
                         os.path.join(output_path, name))
        elif count_dirs == 0 and count_files == 0:
            raise IOError("Invalid Path: empty directory")
        else:
            # Previously reported as "empty directory", which was misleading.
            raise IOError("Invalid Path: directory mixes files and folders")

        corpus_path = output_path
    return corpus_path
Beispiel #3
0
def process_pdfs(corpus_path,
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Convert PDF input to plaintext and return where the text lives.

    For a file, ``pdf.main`` is called on it and the result path is the input
    path with ``'.pdf'`` replaced by ``'.txt'``.  For a directory, output goes
    to ``<corpus_path>-txt``: a directory of files is converted in one
    ``pdf.main`` call, a directory of folders with one call per subfolder.
    Any other path is returned unchanged.

    Parameters
    ----------
    corpus_path : str
        PDF file or directory to convert.
    ignore : list of str
        Filename suffixes skipped when inspecting a directory.  The default
        list is never mutated here.

    Returns
    -------
    str
        Path of the extracted text file or directory.

    Raises
    ------
    IOError
        If the directory is empty (after hidden/ignored entries are
        removed) or mixes files and folders.
    """
    from topicexplorer.lib import pdf

    if os.path.isfile(corpus_path):
        # NOTE(review): str.replace rewrites every '.pdf' occurrence in the
        # path, not only the extension -- confirm this matches where
        # pdf.main writes its output before changing it.
        text_path = corpus_path.replace('.pdf', '.txt')
        print("PDF file detected, extracting plaintext to", text_path)
        pdf.main(corpus_path)
        corpus_path = text_path
    elif os.path.isdir(corpus_path):
        # Normalise the path first so the announced output directory is the
        # one that is really used (no "dir/-txt" in the message).
        if corpus_path.endswith('/'):
            corpus_path = corpus_path[:-1]
        output_path = corpus_path + '-txt'
        print("PDF files detected, extracting plaintext to", output_path)

        # TODO: Add processing of collections
        names = [
            name for name in listdir_nohidden(corpus_path)
            if not any(name.endswith(suffix) for suffix in ignore)
        ]
        contents = [os.path.join(corpus_path, name) for name in names]
        count_dirs = sum(1 for path in contents if os.path.isdir(path))
        count_files = sum(1 for path in contents if os.path.isfile(path))

        if count_files > 0 and count_dirs == 0:
            # process all files
            pdf.main(corpus_path, output_path)
        elif count_dirs > 0 and count_files == 0:
            # process each subdirectory; the target is joined explicitly
            # because str.replace(corpus_path, ...) would also rewrite any
            # later occurrence of corpus_path inside the entry name.
            for name in names:
                pdf.main(os.path.join(corpus_path, name),
                         os.path.join(output_path, name))
        elif count_dirs == 0 and count_files == 0:
            raise IOError("Invalid Path: empty directory")
        else:
            # Previously reported as "empty directory", which was misleading.
            raise IOError("Invalid Path: directory mixes files and folders")

        corpus_path = output_path
    return corpus_path
Beispiel #4
0
def build_corpus(corpus_path,
                 model_path,
                 nltk_stop=False,
                 stop_freq=1,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):
    """Create a corpus from ``corpus_path``, save it and return the filename.

    PDF input is converted to plaintext with ``pdf.main`` first and the
    corpus is built from the extracted text.  The constructor depends on the
    path: ``toy_corpus`` for a single file, ``dir_corpus`` for a directory of
    files and ``coll_corpus`` for a directory of folders.

    Parameters
    ----------
    corpus_path : str
        File or directory to build the corpus from.
    model_path : str
        Forwarded to ``get_corpus_filename`` to derive the save location.
    nltk_stop : bool
        Forwarded to the corpus constructors and ``get_corpus_filename``.
    stop_freq : int
        Forwarded to the corpus constructors and ``get_corpus_filename``.
    context_type : str
        Passed to ``dir_corpus`` as ``chunk_name``.  Overridden with
        ``'book'`` when a collection corpus is built.
    ignore : list of str
        Filename suffixes skipped when inspecting a directory; also forwarded
        to ``dir_corpus`` and ``coll_corpus``.  The default list is never
        mutated here.

    Returns
    -------
    str
        The filename the corpus was saved to.

    Raises
    ------
    IOError
        If ``corpus_path`` is neither a file nor a directory, or the
        directory is empty (after applying ``ignore``) or mixes files and
        folders.
    """
    # pre-process PDF files
    if corpus_path.endswith('.pdf') or util.contains_pattern(
            corpus_path, '*.pdf'):
        if os.path.isdir(corpus_path):
            # Drop the trailing slash before building the output name so the
            # message matches the directory that is actually written.
            if corpus_path.endswith('/'):
                corpus_path = corpus_path[:-1]
            text_path = corpus_path + '-txt'
            print("PDF files detected, extracting plaintext to %s" % text_path)
            pdf.main(corpus_path, text_path)
        else:
            # NOTE(review): str.replace rewrites every '.pdf' occurrence in
            # the path, not only the extension -- confirm this matches where
            # pdf.main writes its output before changing it.
            text_path = corpus_path.replace('.pdf', '.txt')
            print("PDF files detected, extracting plaintext to %s" % text_path)
            pdf.main(corpus_path)
        corpus_path = text_path

    # print() with one pre-formatted argument is valid and identical on
    # Python 2 and Python 3.
    print("Building corpus from %s" % corpus_path)

    if os.path.isfile(corpus_path):
        print("Constructing toy corpus, each line is a document")
        c = toy_corpus(corpus_path,
                       is_filename=True,
                       nltk_stop=nltk_stop,
                       stop_freq=stop_freq,
                       autolabel=True)
    elif os.path.isdir(corpus_path):
        contents = [
            os.path.join(corpus_path, obj) for obj in os.listdir(corpus_path)
            if not any(obj.endswith(suffix) for suffix in ignore)
        ]
        # filter() is lazy on Python 3, so len(filter(...)) would raise;
        # counting with a generator works on both major versions.
        count_dirs = sum(1 for path in contents if os.path.isdir(path))
        count_files = sum(1 for path in contents if os.path.isfile(path))

        print("Detected %d folders and %d files in %s" %
              (count_dirs, count_files, corpus_path))

        if count_files > 0 and count_dirs == 0:
            print("Constructing directory corpus, each file is a document")
            c = dir_corpus(corpus_path,
                           nltk_stop=nltk_stop,
                           stop_freq=stop_freq,
                           chunk_name=context_type,
                           ignore=ignore)
        elif count_dirs > 0 and count_files == 0:
            print("Constructing collection corpus, each folder is a document")
            # The collection context type also feeds the saved filename.
            context_type = 'book'
            c = coll_corpus(corpus_path,
                            nltk_stop=nltk_stop,
                            stop_freq=stop_freq,
                            ignore=ignore)
        elif count_dirs == 0 and count_files == 0:
            raise IOError("Invalid Path: empty directory")
        else:
            # Previously reported as "empty directory", which was misleading.
            raise IOError("Invalid Path: directory mixes files and folders")
    else:
        raise IOError("Invalid path")

    filename = get_corpus_filename(corpus_path, model_path, nltk_stop,
                                   stop_freq, context_type)
    c.save(filename)
    return filename