コード例 #1
0
ファイル: init.py プロジェクト: inpho/topic-explorer
def process_pdfs(corpus_path, ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Extract plaintext from the PDF(s) at ``corpus_path``.

    A single ``.pdf`` file is converted to a sibling ``.txt`` file. A
    directory is converted into a parallel ``<corpus_path>-txt`` tree: either
    all files at once (flat directory) or one call per subdirectory.

    :param corpus_path: path to a PDF file or a directory of PDFs.
    :param ignore: filename suffixes to skip when scanning a directory
        (tuple default avoids the mutable-default-argument pitfall).
    :returns: path of the extracted plaintext file or directory.
    :raises IOError: if the directory is empty or mixes files and
        subdirectories.
    """
    from topicexplorer.lib import pdf
    if os.path.isfile(corpus_path):
        print("PDF file detected, extracting plaintext to",
            corpus_path.replace('.pdf', '.txt'))
        pdf.main(corpus_path)
        corpus_path = corpus_path.replace('.pdf', '.txt')
    elif os.path.isdir(corpus_path):
        print("PDF files detected, extracting plaintext to", corpus_path + '-txt')

        # Normalize away a trailing slash so the '-txt' suffix attaches to
        # the directory name, not to an empty component.
        if corpus_path.endswith('/'):
            corpus_path = corpus_path[:-1]

        # TODO: Add processing of collections
        contents = listdir_nohidden(corpus_path)
        # str.endswith accepts a tuple of suffixes, so a single call replaces
        # the any()-over-list scan.
        contents = [os.path.join(corpus_path, obj) for obj in contents
                    if not obj.endswith(tuple(ignore))]
        count_dirs = len(list(filter(os.path.isdir, contents)))
        count_files = len(list(filter(os.path.isfile, contents)))

        if count_files > 0 and count_dirs == 0:
            # flat directory of files: process them all in one call
            pdf.main(corpus_path, corpus_path + '-txt')
        elif count_dirs > 0 and count_files == 0:
            # one level of subdirectories: mirror each into the -txt tree
            for directory in contents:
                pdf.main(directory,
                         directory.replace(corpus_path, corpus_path + '-txt'))
        else:
            raise IOError("Invalid Path: empty directory")

        corpus_path += '-txt'
    return corpus_path
コード例 #2
0
ファイル: init.py プロジェクト: inpho/topic-explorer
def get_corpusbuilder_fn(corpus_path, sentences=False,
                         ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Return the corpus-builder callable for ``corpus_path``.

    :param corpus_path: root of the corpus; kept for interface compatibility.
    :param sentences: sentence-level parsing, unsupported for streamed
        collection corpora.
    :param ignore: filename suffixes to skip (tuple default replaces the
        shared mutable list default); currently unused — see note below.
    :returns: ``corpus_from_files`` from the vsm streaming corpus builders.
    :raises NotImplementedError: when ``sentences`` is True.

    NOTE(review): the original computed a recursive ``relpaths`` listing here
    and never used it; that dead directory walk (and its I/O) has been
    removed. Confirm that ``corpus_from_files`` performs its own discovery.
    """
    if sentences:
        raise NotImplementedError("""Collection corpuses are too large for
        sentence parsing. Reduce your corpus to a single folder or
        file.""")
    else:
        from vsm.extensions.corpusbuilders.corpusstreamers import corpus_from_files
        return corpus_from_files
コード例 #3
0
def get_corpusbuilder_fn(corpus_path,
                         sentences=False,
                         ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Pick the vsm corpus builder matching the layout under ``corpus_path``.

    Walks the tree, counts files per directory, and dispatches:
      * exactly one file                 -> ``toy_corpus`` partial
      * a single populated directory     -> ``dir_corpus``
      * a flat set of populated dirs     -> ``coll_corpus``
      * anything deeper or uneven        -> ``walk_corpus``

    :param corpus_path: corpus root directory.
    :param sentences: prefer sentence-level builders where available.
    :param ignore: filename suffixes to skip (tuple default avoids the
        mutable-default pitfall).
    :raises IOError: when no non-ignored files are found.
    :raises NotImplementedError: for ``sentences`` with a collection corpus.
    """
    relpaths = [
        os.path.relpath(path, start=corpus_path)
        for path in listdir_nohidden(corpus_path, recursive=True)
        # endswith accepts a tuple of suffixes, replacing the any() scan
        if os.path.isfile(path) and not path.endswith(tuple(ignore))
    ]
    if not relpaths:
        # Previously max() over an empty list raised a bare ValueError;
        # raise the IOError type the callers already handle.
        raise IOError("Invalid path: no corpus files found")

    # Number of files directly inside each directory ('' is the root).
    dir_counts = defaultdict(int)
    for path in relpaths:
        dir_counts[os.path.dirname(path)] += 1

    dirs = dir_counts.keys()
    # 1-based depth of every directory that actually contains files.
    # Iterate keys only (the counts were unused) and avoid shadowing the
    # builtin ``dir``.
    populated_levels = [1 + dirname.count(os.path.sep) for dirname in dir_counts]

    levels = max(populated_levels) - min(populated_levels)
    print("{} files, {} dirs, {} levels".format(len(relpaths), len(dirs),
                                                levels))

    if len(relpaths) == 1:
        if sentences:
            from vsm.extensions.ldasentences import toy_corpus
        else:
            from vsm.extensions.corpusbuilders import toy_corpus
        import functools
        toy_partial = functools.partial(toy_corpus,
                                        is_filename=True,
                                        autolabel=True)
        # partial objects have no __name__ of their own; give it one.
        # NOTE(review): presumably something downstream reads __name__ —
        # confirm against callers.
        toy_partial.__name__ = 'toy_corpus'
        return toy_partial
    elif len(dirs) <= 1:
        if sentences:
            from vsm.extensions.ldasentences import dir_corpus
        else:
            from vsm.extensions.corpusbuilders import dir_corpus
        return dir_corpus
    elif sentences:
        raise NotImplementedError("""Collection corpuses are too large for
        sentence parsing. Reduce your corpus to a single folder or
        file.""")
    elif levels == 0 and max(populated_levels) == 1:
        from vsm.extensions.corpusbuilders import coll_corpus
        return coll_corpus
    else:
        from vsm.extensions.corpusbuilders import walk_corpus
        return walk_corpus
コード例 #4
0
ファイル: init.py プロジェクト: juneLLL/topic-explorer
def get_corpusbuilder_fn(corpus_path,
                         sentences=False,
                         ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Return the streaming corpus-builder for ``corpus_path``.

    :param corpus_path: corpus root; retained for interface compatibility.
    :param sentences: sentence parsing is not supported by the streaming
        builder.
    :param ignore: suffixes to exclude (immutable tuple default instead of a
        shared mutable list); currently unused — see note.
    :returns: the ``corpus_from_files`` builder.
    :raises NotImplementedError: if ``sentences`` is requested.

    NOTE(review): a recursive ``relpaths`` listing was computed here and then
    discarded; the dead walk has been dropped. Verify ``corpus_from_files``
    does its own file discovery.
    """
    if sentences:
        raise NotImplementedError("""Collection corpuses are too large for
        sentence parsing. Reduce your corpus to a single folder or
        file.""")
    else:
        from vsm.extensions.corpusbuilders.corpusstreamers import corpus_from_files
        return corpus_from_files
コード例 #5
0
ファイル: init.py プロジェクト: gitter-badger/topic-explorer
def get_corpusbuilder_fn(corpus_path, sentences=False, ignore=()):
    """Pick the vsm corpus builder that matches the layout of ``corpus_path``.

    Dispatch: a single file -> ``toy_corpus`` partial; a single populated
    directory -> ``dir_corpus``; a flat collection -> ``coll_corpus``;
    anything deeper -> ``walk_corpus``.

    :param ignore: filename suffixes to skip (immutable default replaces the
        mutable ``[]`` default).
    :raises NotImplementedError: sentence parsing of collection corpora.

    NOTE(review): this was Python 2 (`print` statement, ``iteritems``);
    ported to Python 3 for consistency with the sibling definitions.
    """
    relpaths = [os.path.relpath(path, start=corpus_path)
                for path in listdir_nohidden(corpus_path, recursive=True)
                if os.path.isfile(path)
                and not path.endswith(tuple(ignore))]

    # Files directly contained in each directory.
    dir_counts = defaultdict(int)
    for path in relpaths:
        dir_counts[os.path.dirname(path)] += 1

    dirs = dir_counts.keys()
    # Depth of each populated directory; iterate keys only — the counts were
    # unused, and dict.iteritems() no longer exists in Python 3.
    populated_levels = [dirname.count(os.path.sep) for dirname in dir_counts]
    levels = max(populated_levels) - min(populated_levels)
    print("{} files, {} dirs, {} levels".format(len(relpaths), len(dirs), levels))

    if len(relpaths) == 1:
        if sentences:
            from vsm.extensions.ldasentences import toy_corpus
        else:
            from vsm.extensions.corpusbuilders import toy_corpus
        import functools
        return functools.partial(toy_corpus, is_filename=True, autolabel=True)
    elif len(dirs) <= 1:
        if sentences:
            from vsm.extensions.ldasentences import dir_corpus
        else:
            from vsm.extensions.corpusbuilders import dir_corpus
        return dir_corpus
    elif sentences:
        raise NotImplementedError("""Collection corpuses are too large for
        sentence parsing. Reduce your corpus to a single folder or
        file.""")
    elif levels == 0:
        from vsm.extensions.corpusbuilders import coll_corpus
        return coll_corpus
    else:
        from vsm.extensions.corpusbuilders import walk_corpus
        return walk_corpus
コード例 #6
0
ファイル: init.py プロジェクト: juneLLL/topic-explorer
def process_pdfs(corpus_path,
                 ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Convert PDF input at ``corpus_path`` into plaintext.

    A single ``.pdf`` file becomes a sibling ``.txt`` file; a directory of
    PDFs (or of subdirectories of PDFs) is mirrored into ``<path>-txt``.

    :param corpus_path: PDF file or directory of PDFs.
    :param ignore: suffixes to exclude from the directory scan (immutable
        tuple default instead of a shared mutable list).
    :returns: path of the plaintext file/directory produced.
    :raises IOError: for an empty directory or a mix of files and
        subdirectories.
    """
    from topicexplorer.lib import pdf
    if os.path.isfile(corpus_path):
        print("PDF file detected, extracting plaintext to",
              corpus_path.replace('.pdf', '.txt'))
        pdf.main(corpus_path)
        corpus_path = corpus_path.replace('.pdf', '.txt')
    elif os.path.isdir(corpus_path):
        print("PDF files detected, extracting plaintext to",
              corpus_path + '-txt')

        # Strip a trailing slash so '-txt' appends to the directory name.
        if corpus_path.endswith('/'):
            corpus_path = corpus_path[:-1]

        # TODO: Add processing of collections
        contents = listdir_nohidden(corpus_path)
        # One endswith call with a tuple of suffixes replaces the any() list.
        contents = [
            os.path.join(corpus_path, obj) for obj in contents
            if not obj.endswith(tuple(ignore))
        ]
        count_dirs = len(list(filter(os.path.isdir, contents)))
        count_files = len(list(filter(os.path.isfile, contents)))

        if count_files > 0 and count_dirs == 0:
            # flat directory: convert every file at once
            pdf.main(corpus_path, corpus_path + '-txt')
        elif count_dirs > 0 and count_files == 0:
            # subdirectories only: mirror each one into the -txt tree
            for directory in contents:
                pdf.main(directory,
                         directory.replace(corpus_path, corpus_path + '-txt'))
        else:
            raise IOError("Invalid Path: empty directory")

        corpus_path += '-txt'
    return corpus_path
コード例 #7
0
def main(args):
    """Build a corpus from ``args.corpus_path`` and write its config file.

    Mutates ``args`` in place: resolves corpus/model paths, optionally
    processes bibtex and HTRC inputs, (re)builds the corpus file, and
    returns the result of ``write_config``.

    NOTE(review): Python 2 code (``unicode``, ``raw_input``, ``print``
    statements).
    """
    # convert to unicode to avoid windows errors
    args.corpus_path = unicode(args.corpus_path, 'utf-8')

    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # Remember the original .bib path; corpus_path becomes the corpus
        # extracted by process_bibtex.
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)

    # set corpus_name
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        # Path ended in a separator: basename was empty, use parent dir name.
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        # Collect per-volume metadata for every subdirectory and store it
        # one level above the corpus as metadata.json.
        import json
        data = [(id, htrc.metadata(id))
                for id in listdir_nohidden(args.corpus_path)
                if os.path.isdir(id)]
        data = dict(data)
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        with open(md_filename, 'wb') as outfile:
            json.dump(data, outfile)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(args.corpus_path,
                                               args.model_path,
                                               stop_freq=args.stop_freq)
    # If a prebuilt corpus exists and --rebuild was not given, prompt; the
    # loop keeps reading until args.rebuild is 'y', 'n' or True.
    if not args.rebuild and os.path.exists(args.corpus_filename):
        while args.rebuild not in ['y', 'n', True]:
            args.rebuild = raw_input("\nCorpus file found. Rebuild? [y/N] ")
            args.rebuild = args.rebuild.lower().strip()
            if args.rebuild == 'y':
                args.rebuild = True
            elif args.rebuild == '':
                args.rebuild = 'n'  # empty answer defaults to No
    else:
        args.rebuild = True
    if args.rebuild == True:
        try:
            args.corpus_filename = build_corpus(args.corpus_path,
                                                args.model_path,
                                                stop_freq=args.stop_freq,
                                                decode=args.decode,
                                                sentences=args.sentences,
                                                simple=args.simple,
                                                tokenizer=args.tokenizer)
        except IOError:
            print "ERROR: invalid path, please specify either:"
            print "  * a single plain-text or PDF file,"
            print "  * a single bibtex (.bib) file with 'file' fields,"
            print "  * a folder of plain-text or PDF files, or"
            print "  * a folder of folders of plain-text or PDF files."
            print "\nExiting..."
            sys.exit(74)  # EX_IOERR
        # NOTE(review): the triple-quoted block below is disabled code kept
        # verbatim; it printed NLTK download hints on LookupError.
        """
        except LookupError as e:
            if 'punkt' in e.message:
                print "\nERROR: sentence tokenizer not available, download by running:"
                print "    python -m nltk.downloader punkt"

            elif 'stopwords' in e.message:
                print "\nERROR: stopwords not available, download by running:"
                print "    python -m nltk.downloader stopwords"
            else:
                raise e
            print "\nExiting..."
            sys.exit(74)        
        """

    return write_config(args, args.config_file)
コード例 #8
0
ファイル: init.py プロジェクト: inpho/topic-explorer
def main(args):
    """Configure ``args`` in place and build the corpus it describes.

    Resolves corpus and model paths, handles bibtex and HTRC inputs,
    (re)builds the corpus file when requested, writes the configuration
    file, and optionally seeds a Markdown corpus description next to it.

    :param args: argparse-style namespace, mutated in place.
    :returns: path of the written config file.
    """
    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # keep the original .bib path; corpus_path now points at the
        # corpus extracted by process_bibtex
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name; fall back to the parent directory name when the path
    # ends with a separator (basename would be empty)
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(os.path.abspath(args.corpus_filename)):
        if args.quiet:
            # typo fix: the message previously read "Path exits"
            print("Path exists: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                default=False)
    else:
        args.rebuild = True

    if args.htrc:
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            #htrc.proc_htrc_coll(args.corpus_path)
            # NOTE(review): ``ids`` is unused in this branch — confirm whether
            # the disabled proc_htrc_coll call above was meant to consume it.
            ids = [id.replace('.txt','') for id in listdir_nohidden(args.corpus_path)]

            args.htrc_metapath = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(args.htrc_metapath,
                os.path.basename(args.corpus_path) + '.metadata.json')
        else:
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids, nltk_stop=args.nltk,freq=args.stop_freq)
            c.save(args.corpus_filename)

    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=args.stop_freq,
                decode=args.decode, nltk_stop=args.nltk, simple=args.simple,
                sentences=args.sentences, tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # Exit with EX_IOERR as the message promises; a leftover
            # debugging ``raise e`` previously made this exit unreachable.
            sys.exit(74)

    args.config_file = write_config(args, args.config_file)

    # Offer to overwrite an existing Markdown corpus description.
    args.corpus_desc = args.config_file + '.md'
    if not args.quiet and os.path.exists(args.corpus_desc):
        desc_path = args.corpus_desc
        args.corpus_desc = None
        while args.corpus_desc not in ['y', 'n']:
            args.corpus_desc = input("\nExisting corpus description found. Remove? [y/N] ")
            args.corpus_desc = args.corpus_desc.lower().strip()
            if args.corpus_desc == '':
                args.corpus_desc = 'n'  # default answer is No
        # 'y' rewrites the description below; anything else keeps the file.
        # (Previously an answer of 'n' fell through as truthy and a file
        # literally named "n" was created.)
        args.corpus_desc = desc_path if args.corpus_desc == 'y' else False

    if args.corpus_desc:
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(
"""This is an instance of the [InPhO Topic Explorer](http://inphodata.cogs.indiana.edu/). If you would like
to add a custom corpus description, either:
- Modify the contents of the file `{}`
- Change the main:corpus_desc path in `{}` to an existing Markdown file.
""".format(os.path.abspath(args.corpus_desc),
           os.path.abspath(args.config_file)))

    return args.config_file
コード例 #9
0
ファイル: init.py プロジェクト: juneLLL/topic-explorer
def main(args):
    """Configure ``args`` and build the corpus for ``args.corpus_path``.

    Normalizes corpus/model paths on ``args``, handles bibtex and HTRC
    inputs, (re)builds the corpus file when requested, writes the config
    file, and optionally seeds a Markdown corpus description.

    :param args: argparse-style namespace; mutated in place.
    :returns: the path of the written config file.
    """
    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # remember the original .bib path; corpus_path becomes the corpus
        # produced by process_bibtex
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name; use the parent directory when basename is empty
    # (i.e. the path ended with a separator)
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(
            args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(args.corpus_path,
                                               args.model_path,
                                               stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(
            os.path.abspath(args.corpus_filename)):
        if args.quiet:
            # typo fix: message previously read "Path exits"
            print("Path exists: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                                       default=False)
    else:
        args.rebuild = True

    if args.htrc:
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            #htrc.proc_htrc_coll(args.corpus_path)
            # NOTE(review): ``ids`` is unused in this branch — confirm the
            # disabled proc_htrc_coll call was meant to consume it.
            ids = [
                id.replace('.txt', '')
                for id in listdir_nohidden(args.corpus_path)
            ]

            args.htrc_metapath = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                args.htrc_metapath,
                os.path.basename(args.corpus_path) + '.metadata.json')
        else:
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids,
                                            nltk_stop=args.nltk,
                                            freq=args.stop_freq)
            c.save(args.corpus_filename)

    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(args.corpus_path,
                                                args.model_path,
                                                stop_freq=args.stop_freq,
                                                decode=args.decode,
                                                nltk_stop=args.nltk,
                                                simple=args.simple,
                                                sentences=args.sentences,
                                                tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # exit with EX_IOERR as the message promises; a leftover
            # ``raise e`` previously made this exit unreachable
            sys.exit(74)

    args.config_file = write_config(args, args.config_file)

    # Offer to (re)write the Markdown corpus description next to the config.
    args.corpus_desc = args.config_file + '.md'
    if not args.quiet and os.path.exists(args.corpus_desc):
        desc_path = args.corpus_desc
        args.corpus_desc = None
        while args.corpus_desc not in ['y', 'n']:
            args.corpus_desc = input(
                "\nExisting corpus description found. Remove? [y/N] ")
            args.corpus_desc = args.corpus_desc.lower().strip()
            if args.corpus_desc == '':
                args.corpus_desc = 'n'  # default answer is No
        # 'y' -> overwrite the description below; 'n' -> keep the existing
        # file. (Previously 'n' leaked through as a truthy string and a file
        # literally named "n" was written.)
        args.corpus_desc = desc_path if args.corpus_desc == 'y' else False

    if args.corpus_desc:
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(
                """This is an instance of the [InPhO Topic Explorer](http://inphodata.cogs.indiana.edu/). If you would like
to add a custom corpus description, either:
- Modify the contents of the file `{}`
- Change the main:corpus_desc path in `{}` to an existing Markdown file.
""".format(os.path.abspath(args.corpus_desc),
            os.path.abspath(args.config_file)))

    return args.config_file
コード例 #10
0
ファイル: init.py プロジェクト: gitter-badger/topic-explorer
def main(args):
    """Build a corpus from ``args.corpus_path`` and write its configuration.

    Earlier variant of the entry point: resolves paths on ``args``,
    optionally extracts HTRC metadata, (re)builds the corpus, and returns
    the result of ``write_config``.

    NOTE(review): Python 2 code (``unicode``, ``raw_input``, ``print``
    statements).
    """
    # convert to unicode to avoid windows errors
    args.corpus_path = unicode(args.corpus_path, 'utf-8')

    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # keep the original .bib path; corpus_path becomes the extracted text
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)


    # set corpus_name
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        # path ended with a separator; use the parent directory's name
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        # write per-volume metadata one level above the corpus directory
        import json
        data = [(id, htrc.metadata(id)) for id in listdir_nohidden(args.corpus_path)
                    if os.path.isdir(id)]
        data = dict(data)
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        with open(md_filename, 'wb') as outfile:
            json.dump(data, outfile)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    # ask before rebuilding an existing corpus file unless --rebuild given;
    # loop until args.rebuild is 'y', 'n' or True
    if not args.rebuild and os.path.exists(args.corpus_filename): 
        while args.rebuild not in ['y', 'n', True]:
            args.rebuild = raw_input("\nCorpus file found. Rebuild? [y/N] ")
            args.rebuild = args.rebuild.lower().strip()
            if args.rebuild == 'y':
                args.rebuild = True
            elif args.rebuild == '':
                args.rebuild = 'n'  # empty answer defaults to No
    else:
        args.rebuild = True
    if args.rebuild == True:
        try:
            args.corpus_filename = build_corpus(args.corpus_path, args.model_path, 
                                                stop_freq=args.stop_freq, decode=args.decode,
                                                sentences=args.sentences,
                                                simple=args.simple,tokenizer=args.tokenizer)
        except IOError:
            print "ERROR: invalid path, please specify either:"
            print "  * a single plain-text or PDF file,"
            print "  * a single bibtex (.bib) file with 'file' fields,"
            print "  * a folder of plain-text or PDF files, or"
            print "  * a folder of folders of plain-text or PDF files."
            print "\nExiting..."
            sys.exit(74)  # EX_IOERR
        # NOTE(review): disabled code kept verbatim below (NLTK download hints)
        """
        except LookupError as e:
            if 'punkt' in e.message:
                print "\nERROR: sentence tokenizer not available, download by running:"
                print "    python -m nltk.downloader punkt"

            elif 'stopwords' in e.message:
                print "\nERROR: stopwords not available, download by running:"
                print "    python -m nltk.downloader stopwords"
            else:
                raise e
            print "\nExiting..."
            sys.exit(74)        
        """

    return write_config(args, args.config_file)