Example No. 1
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
                 context_type='document', ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True, sentences=False, simple=True, tokenizer='default'):

    from vsm.corpus import Corpus

    # ensure that nltk packages are downloaded
    ensure_nltk_data_downloaded()

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'simple':
        from topicexplorer.tokenizer import simple_tokenizer
        tokenizer = simple_tokenizer
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    elif tokenizer == 'inpho':
        from topicexplorer.extensions.inpho import inpho_tokenizer
        tokenizer = inpho_tokenizer
    elif tokenizer == 'brain':
        from hyperbrain.parse import brain_tokenizer
        tokenizer = brain_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print("Building corpus from", corpus_path, end=' ')
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print("with {} function".format(corpusbuilder.__name__))
    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop,
                      stop_freq=stop_freq, ignore=ignore, decode=decode,
                      simple=simple, tokenizer=tokenizer)

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label) for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
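
A minimal usage sketch for the function above. The import path and both directory arguments are hypothetical (adjust them to wherever build_corpus is defined in your checkout and to your own data), and the keyword values are illustrative only.

from topicexplorer.init import build_corpus   # hypothetical import path

corpus_file = build_corpus(
    'data/my_corpus',        # a directory of text files, or a single .txt/.pdf file
    'models/my_corpus',      # directory where the serialized corpus will be written
    nltk_stop=True,          # passed through to the vsm corpus builders (NLTK stoplist)
    stop_freq=2,             # passed through as the frequency-based stoplist threshold
    tokenizer='default')     # one of: default, simple, zh, ltc, och, inpho, brain
print(corpus_file)           # path produced by get_corpus_filename and written by c.save()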
Example No. 2
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=1,
    context_type='document', ignore=['.json','.log','.err','.pickle','.npz']):
   
    # pre-process PDF files
    if corpus_path[-4:] == '.pdf' or util.contains_pattern(corpus_path, '*.pdf'):
        if os.path.isdir(corpus_path):
            print "PDF files detected, extracting plaintext to", corpus_path + '-txt'
            if corpus_path.endswith('/'):
                corpus_path = corpus_path[:-1]
            pdf.main(corpus_path, corpus_path + '-txt')
            corpus_path += '-txt'
        else:
            print "PDF files detected, extracting plaintext to",\
                corpus_path.replace('.pdf','.txt')
            pdf.main(corpus_path)
            corpus_path = corpus_path.replace('.pdf','.txt')

    print "Building corpus from", corpus_path

    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                       stop_freq=stop_freq, autolabel=True)
    elif os.path.isdir(corpus_path):
        contents = os.listdir(corpus_path)
        contents = [os.path.join(corpus_path,obj) for obj in contents 
            if not any([obj.endswith(suffix) for suffix in ignore])]
        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, chunk_name=context_type,
                           ignore=ignore)
        elif count_dirs > 0 and count_files == 0:
            print "Constructing collection corpus, each folder is a document"
            context_type='book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore)
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename 
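
Example No. 2 is Python 2 code: the bare print statements and the len(filter(...)) counting idiom only work there, since Python 3's filter returns a lazy iterator with no len(). A rough Python 3 sketch of the same folder/file counting, assuming the same contents list and that os is already imported:

count_dirs = sum(1 for path in contents if os.path.isdir(path))
count_files = sum(1 for path in contents if os.path.isfile(path))
print("Detected %d folders and %d files in %s" % (count_dirs, count_files, corpus_path))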
Example No. 3
def build_corpus(corpus_path,
                 model_path,
                 nltk_stop=False,
                 stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True,
                 sentences=False,
                 simple=True,
                 tokenizer='default'):

    from vsm.corpus import Corpus
    from vsm.extensions.corpusbuilders import coll_corpus, dir_corpus, toy_corpus
    if sentences:
        print "Importing sentence constructors"
        from vsm.extensions.ldasentences import dir_corpus, toy_corpus

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(
                tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(
        corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print "Building corpus from", corpus_path,
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print "with {} function".format(corpusbuilder.__name__)

    c = corpusbuilder(corpus_path,
                      nltk_stop=nltk_stop,
                      stop_freq=stop_freq,
                      ignore=ignore,
                      decode=decode,
                      simple=simple,
                      tokenizer=tokenizer)
    '''
    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        if sentences:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode)
        else:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode,
                           simple=simple, tokenizer=tokenizer)
    elif os.path.isdir(corpus_path):
        contents = listdir_nohidden(corpus_path)
        contents = [os.path.join(corpus_path,obj) for obj in contents 
            if not any([obj.endswith(suffix) for suffix in ignore])]
        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            if sentences:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode)
            else:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode, simple=simple, 
                               tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and not sentences:
            print "Constructing collection corpus, each folder is a document"
            context_type='book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore, decode=decode,
                            simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and sentences:
            raise NotImplementedError("""Collection corpuses are too large for
            sentence parsing. Reduce your corpus to a single folder or
            file.""")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")
    '''

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [
            re.sub('txt$', 'pdf', label)
            for label in c.context_data[0][label_name]
        ]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(corpus_path, model_path, nltk_stop,
                                   stop_freq, context_type)
    c.save(filename)
    return filename
Example No. 4
def write_config(args, config_file=None):
    """
    If config_file is None, then a name is automatically generated
    """
    config = ConfigParser()
    config.add_section("main")
    config.set("main", "path", os.path.abspath(args.model_path))
    config.set("main", "corpus_file", os.path.abspath(args.corpus_filename))
    config.set("main", "raw_corpus", os.path.abspath(args.corpus_path))
    config.set("main", "sentences", args.sentences)

    if args.bibtex:
        config.set("main", "label_module", "topicexplorer.extensions.bibtex")
        config.add_section("bibtex")
        config.set("bibtex", "path", args.bibtex)

    config.add_section("www")
    config.set("www", "corpus_name", args.corpus_print_name)
    config.set("www", "icons", "fingerprint,link")
    config.set("www", "fulltext", "false")
    config.set("www", "tokenizer", args.tokenizer);
    # adds a pdf element to the config file and set it to true if
    # pdf documents were being used in the corpus
    if args.corpus_path[-4:] == '.pdf' or contains_pattern(args.corpus_path, '*.pdf'):
        config.set("www", "pdf", "true")

    config.add_section("logging")
    config.set("logging", "path", "logs/%s/{0}.log" % args.corpus_name)

    if args.htrc:
        config = add_htrc_metadata(config, corpus_filename=os.path.abspath(args.corpus_filename))
        if not args.corpus_print_name:
            config.set("www", "corpus_name", "HTRC Data Capsule")

    if args.tokenizer in ['zh','ltc','och']:
        config.set("main", "lang", "cn")

    if config_file is None:
        config_file = args.corpus_name + ".ini"

        if os.path.basename(args.corpus_path) == args.corpus_name:
            config_file = os.path.join(args.corpus_path, '..', config_file)
            config_file = os.path.normpath(config_file)

        overwrite = None if os.path.exists(config_file) and not args.quiet else True
        while not overwrite:
            overwrite = input("\nConfig file {0} exists. Overwrite? [Y/n] ".format(config_file))
            overwrite = overwrite.lower().strip()
            if overwrite == 'n':
                config_i = 0
                while os.path.exists(config_file):
                    config_file = args.corpus_name + ".%d.ini" % config_i
                    config_i += 1
                config_file = input("Enter new filename [default: {0}]: ".format(config_file))\
                    or config_file
            elif overwrite == '' or overwrite == 'y':
                overwrite = True

    config.set("main", "corpus_desc", config_file+'.md')

    print("Writing configuration file", config_file)
    with open(config_file, "w") as configfh:
        config.write(configfh)
    return config_file
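
write_config produces a standard INI file, so it can be read back with the same ConfigParser machinery. A minimal round-trip sketch with a hypothetical filename; the section and option names come straight from the code above.

from configparser import ConfigParser   # Python 3 stdlib; adjust if the project aliases its own

config = ConfigParser()
config.read("my_corpus.ini")                  # filename returned by write_config
print(config.get("main", "corpus_file"))      # absolute path of the serialized corpus
print(config.get("www", "tokenizer"))         # tokenizer name recorded for the viewer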
Example No. 5
def write_config(args, config_file=None):
    """
    If config_file is None, then a name is automatically generated
    """
    config = ConfigParser()
    config.add_section("main")
    config.set("main", "path", os.path.abspath(args.model_path))
    config.set("main", "corpus_file", os.path.abspath(args.corpus_filename))
    config.set("main", "raw_corpus", os.path.abspath(args.corpus_path))
    config.set("main", "sentences", args.sentences)

    if args.bibtex:
        config.set("main", "label_module", "topicexplorer.extensions.bibtex")
        config.add_section("bibtex")
        config.set("bibtex", "path", args.bibtex)

    config.add_section("www")
    config.set("www", "corpus_name", args.corpus_print_name)
    config.set("www", "icons", "fingerprint,link")
    config.set("www", "fulltext", "false")
    config.set("www", "tokenizer", args.tokenizer)
    # add a pdf entry to the config file and set it to true if
    # pdf documents are used in the corpus
    if args.corpus_path[-4:] == '.pdf' or contains_pattern(
            args.corpus_path, '*.pdf'):
        config.set("www", "pdf", "true")

    config.add_section("logging")
    config.set("logging", "path", "logs/%s/{0}.log" % args.corpus_name)

    if args.htrc:
        config = add_htrc_metadata(config,
                                   corpus_filename=os.path.abspath(
                                       args.corpus_filename))
        if not args.corpus_print_name:
            config.set("www", "corpus_name", "HTRC Data Capsule")

    if args.tokenizer in ['zh', 'ltc', 'och']:
        config.set("main", "lang", "cn")

    if config_file is None:
        config_file = args.corpus_name + ".ini"

        if os.path.basename(args.corpus_path) == args.corpus_name:
            config_file = os.path.join(args.corpus_path, '..', config_file)
            config_file = os.path.normpath(config_file)

        overwrite = None if os.path.exists(
            config_file) and not args.quiet else True
        while not overwrite:
            overwrite = input(
                "\nConfig file {0} exists. Overwrite? [Y/n] ".format(
                    config_file))
            overwrite = overwrite.lower().strip()
            if overwrite == 'n':
                config_i = 0
                while os.path.exists(config_file):
                    config_file = args.corpus_name + ".%d.ini" % config_i
                    config_i += 1
                config_file = input("Enter new filename [default: {0}]: ".format(config_file))\
                    or config_file
            elif overwrite == '' or overwrite == 'y':
                overwrite = True

    config.set("main", "corpus_desc", config_file + '.md')

    print("Writing configuration file", config_file)
    with open(config_file, "w") as configfh:
        config.write(configfh)
    return config_file
Example No. 6
def build_corpus(corpus_path,
                 model_path,
                 nltk_stop=False,
                 stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True,
                 sentences=False,
                 simple=True,
                 tokenizer='default'):

    from vsm.corpus import Corpus

    # ensure that nltk packages are downloaded
    ensure_nltk_data_downloaded()

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'simple':
        from topicexplorer.tokenizer import simple_tokenizer
        tokenizer = simple_tokenizer
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    elif tokenizer == 'inpho':
        from topicexplorer.extensions.inpho import inpho_tokenizer
        tokenizer = inpho_tokenizer
    elif tokenizer == 'brain':
        from hyperbrain.parse import brain_tokenizer
        tokenizer = brain_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(
                tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(
        corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print("Building corpus from", corpus_path, end=' ')
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print("with {} function".format(corpusbuilder.__name__))
    c = corpusbuilder(corpus_path,
                      nltk_stop=nltk_stop,
                      stop_freq=stop_freq,
                      ignore=ignore,
                      decode=decode,
                      simple=simple,
                      tokenizer=tokenizer)

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [
            re.sub('txt$', 'pdf', label)
            for label in c.context_data[0][label_name]
        ]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(corpus_path, model_path, nltk_stop,
                                   stop_freq, context_type)
    c.save(filename)
    return filename
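
The chained elif dispatch used to pick a tokenizer in the examples above can also be written as a lookup table of (module, attribute) pairs that keeps the imports lazy. This is only a sketch of an alternative layout, not part of topicexplorer; the module paths are the ones that appear in the examples, and get_tokenizer is a hypothetical helper name.

from importlib import import_module

_TOKENIZERS = {
    'default': ('vsm.extensions.corpusbuilders.util', 'word_tokenize'),
    'simple':  ('topicexplorer.tokenizer', 'simple_tokenizer'),
    'zh':      ('topicexplorer.lib.chinese', 'modern_chinese_tokenizer'),
    'ltc':     ('topicexplorer.lib.chinese', 'ancient_chinese_tokenizer'),
    'och':     ('topicexplorer.lib.chinese', 'ancient_chinese_tokenizer'),
    'inpho':   ('topicexplorer.extensions.inpho', 'inpho_tokenizer'),
    'brain':   ('hyperbrain.parse', 'brain_tokenizer'),
}

def get_tokenizer(name):
    # resolve the module lazily, mirroring the local imports inside build_corpus
    try:
        module, attr = _TOKENIZERS[name]
    except KeyError:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(name))
    return getattr(import_module(module), attr)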
Example No. 7
def build_corpus(corpus_path,
                 model_path,
                 nltk_stop=False,
                 stop_freq=1,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz']):

    # pre-process PDF files
    if corpus_path[-4:] == '.pdf' or util.contains_pattern(
            corpus_path, '*.pdf'):
        if os.path.isdir(corpus_path):
            print "PDF files detected, extracting plaintext to", corpus_path + '-txt'
            if corpus_path.endswith('/'):
                corpus_path = corpus_path[:-1]
            pdf.main(corpus_path, corpus_path + '-txt')
            corpus_path += '-txt'
        else:
            print "PDF files detected, extracting plaintext to",\
                corpus_path.replace('.pdf','.txt')
            pdf.main(corpus_path)
            corpus_path = corpus_path.replace('.pdf', '.txt')

    print "Building corpus from", corpus_path

    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        c = toy_corpus(corpus_path,
                       is_filename=True,
                       nltk_stop=nltk_stop,
                       stop_freq=stop_freq,
                       autolabel=True)
    elif os.path.isdir(corpus_path):
        contents = os.listdir(corpus_path)
        contents = [
            os.path.join(corpus_path, obj) for obj in contents
            if not any([obj.endswith(suffix) for suffix in ignore])
        ]
        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            c = dir_corpus(corpus_path,
                           nltk_stop=nltk_stop,
                           stop_freq=stop_freq,
                           chunk_name=context_type,
                           ignore=ignore)
        elif count_dirs > 0 and count_files == 0:
            print "Constructing collection corpus, each folder is a document"
            context_type = 'book'
            c = coll_corpus(corpus_path,
                            nltk_stop=nltk_stop,
                            stop_freq=stop_freq,
                            ignore=ignore)
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")

    filename = get_corpus_filename(corpus_path, model_path, nltk_stop,
                                   stop_freq, context_type)
    c.save(filename)
    return filename
Example No. 8
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
    context_type='document', ignore=['.json','.log','.err','.pickle','.npz'],
    decode=True, sentences=False, simple=True, tokenizer='default'):
   
    from vsm.corpus import Corpus
    from vsm.extensions.corpusbuilders import coll_corpus, dir_corpus, toy_corpus
    if sentences:
        print "Importing sentence constructors"
        from vsm.extensions.ldasentences import dir_corpus, toy_corpus


    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    else:
        raise NotImplementedError("Tokenizer '{}' is not included in topicexplorer".format(tokenizer))


    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print "Building corpus from", corpus_path,
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print "with {} function".format(corpusbuilder.__name__)

    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop,
                      stop_freq=stop_freq, ignore=ignore, decode=decode,
                      simple=simple, tokenizer=tokenizer)

    '''
    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        if sentences:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode)
        else:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode,
                           simple=simple, tokenizer=tokenizer)
    elif os.path.isdir(corpus_path):
        contents = listdir_nohidden(corpus_path)
        contents = [os.path.join(corpus_path,obj) for obj in contents 
            if not any([obj.endswith(suffix) for suffix in ignore])]
        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            if sentences:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode)
            else:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode, simple=simple, 
                               tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and not sentences:
            print "Constructing collection corpus, each folder is a document"
            context_type='book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore, decode=decode,
                            simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and sentences:
            raise NotImplementedError("""Collection corpuses are too large for
            sentence parsing. Reduce your corpus to a single folder or
            file.""")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")
    '''

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label) for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
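
When PDFs are detected, the corpus is actually built from the extracted plaintext, so every stored document label ends in .txt; the re.sub('txt$', 'pdf', ...) step in the examples restores the original extension. A tiny standalone illustration of that substitution:

import re

# the $ anchor limits the replacement to a trailing 'txt', leaving the rest of the label alone
print(re.sub('txt$', 'pdf', 'article-01.txt'))   # -> article-01.pdf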