Beispiel #1
0
def main(args):
    """Set up a corpus from ``args.corpus_path`` and write its config file.

    Fills in ``args.model_path``, ``args.corpus_name``,
    ``args.corpus_print_name`` and ``args.corpus_filename``; when
    ``args.htrc`` is set, also dumps HTRC metadata to ``../metadata.json``
    relative to the corpus path. The corpus file is (re)built unless one
    already exists and the user declines the rebuild prompt.

    Returns the result of ``write_config(args, args.config_file)``.
    Calls ``sys.exit(74)`` if ``build_corpus`` raises ``IOError``.
    """
    # Default model directory: "../models/" relative to a corpus directory,
    # otherwise the directory containing the corpus file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # basename() is empty when the path ends with a separator; fall back to
    # the name of the directory itself.
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        import json
        # os.listdir() yields bare entry names, so each one has to be joined
        # with corpus_path before the directory test; testing the bare name
        # would resolve it against the current working directory instead.
        data = dict(
            (vol_id, htrc.metadata(vol_id))
            for vol_id in os.listdir(args.corpus_path)
            if os.path.isdir(os.path.join(args.corpus_path, vol_id)))
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        with open(md_filename, 'wb') as outfile:
            json.dump(data, outfile)

    args.corpus_filename = get_corpus_filename(args.corpus_path,
                                               args.model_path,
                                               stop_freq=5)
    if not args.rebuild and os.path.exists(args.corpus_filename):
        # Keep asking until the answer is recognised. An empty answer means
        # "no"; a declined rebuild leaves args.rebuild as the string 'n'.
        while args.rebuild not in ['y', 'n', True]:
            answer = raw_input("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip()
            if answer == 'y':
                args.rebuild = True
            elif answer == '':
                args.rebuild = 'n'
            else:
                args.rebuild = answer
    else:
        # Rebuild was requested explicitly, or there is nothing to reuse.
        args.rebuild = True

    # At this point args.rebuild is either True or the string 'n'.
    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(args.corpus_path,
                                                args.model_path,
                                                stop_freq=5)
        except IOError:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text file,")
            print("  * a folder of plain-text files, or")
            print("  * a folder of folders of plain-text files.")
            print("\nExiting...")
            sys.exit(74)

    return write_config(args, args.config_file)
Beispiel #2
0
def main(args):
    """Set up a corpus from ``args.corpus_path`` and write its config file.

    Fills in ``args.model_path``, ``args.corpus_name``,
    ``args.corpus_print_name`` and ``args.corpus_filename``; when
    ``args.htrc`` is set, also dumps HTRC metadata to ``../metadata.json``
    relative to the corpus path. The corpus file is (re)built unless one
    already exists and the user declines the rebuild prompt.

    Returns the result of ``write_config(args, args.config_file)``.
    Calls ``sys.exit(74)`` if ``build_corpus`` raises ``IOError``.
    """
    # Default model directory: "../models/" relative to a corpus directory,
    # otherwise the directory containing the corpus file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # basename() is empty when the path ends with a separator; fall back to
    # the name of the directory itself.
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        import json
        # os.listdir() yields bare entry names, so each one has to be joined
        # with corpus_path before the directory test; testing the bare name
        # would resolve it against the current working directory instead.
        data = dict(
            (vol_id, htrc.metadata(vol_id))
            for vol_id in os.listdir(args.corpus_path)
            if os.path.isdir(os.path.join(args.corpus_path, vol_id)))
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        with open(md_filename, 'wb') as outfile:
            json.dump(data, outfile)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=5)
    if not args.rebuild and os.path.exists(args.corpus_filename):
        # Keep asking until the answer is recognised. An empty answer means
        # "no"; a declined rebuild leaves args.rebuild as the string 'n'.
        # (Indentation here is spaces only: the original mixed a tab into
        # this loop, which Python 3 and `python -tt` reject.)
        while args.rebuild not in ['y', 'n', True]:
            answer = raw_input("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip()
            if answer == 'y':
                args.rebuild = True
            elif answer == '':
                args.rebuild = 'n'
            else:
                args.rebuild = answer
    else:
        # Rebuild was requested explicitly, or there is nothing to reuse.
        args.rebuild = True

    # At this point args.rebuild is either True or the string 'n'.
    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=5)
        except IOError:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text file,")
            print("  * a folder of plain-text files, or")
            print("  * a folder of folders of plain-text files.")
            print("\nExiting...")
            sys.exit(74)

    return write_config(args, args.config_file)
Beispiel #3
0
def main(args):
    """Set up a corpus from ``args.corpus_path`` and write its config file.

    Decodes the corpus path, expands a ``.bib`` input through
    ``process_bibtex``, fills in ``args.corpus_name``,
    ``args.corpus_print_name``, ``args.model_path`` and
    ``args.corpus_filename``, optionally dumps HTRC metadata to
    ``../metadata.json`` relative to the corpus path, and (re)builds the
    corpus file unless one already exists and the user declines the prompt.

    Returns the result of ``write_config(args, args.config_file)``.
    Calls ``sys.exit(74)`` if ``build_corpus`` raises ``IOError``.
    """
    # convert to unicode to avoid windows errors; only byte strings are
    # decoded, because decoding an already-unicode path raises TypeError.
    if isinstance(args.corpus_path, bytes):
        args.corpus_path = args.corpus_path.decode('utf-8')

    # config corpus_path
    # process bibtex files: args.bibtex keeps the original .bib path (or
    # False) while corpus_path is replaced by process_bibtex's result.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)

    # set corpus_name; basename() is empty when the path ends with a
    # separator, so fall back to the name of the directory itself.
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        import json
        # The directory test must be made relative to corpus_path, not the
        # current working directory.
        # NOTE(review): assumes listdir_nohidden yields bare entry names
        # (like os.listdir) -- confirm against its definition.
        data = dict(
            (vol_id, htrc.metadata(vol_id))
            for vol_id in listdir_nohidden(args.corpus_path)
            if os.path.isdir(os.path.join(args.corpus_path, vol_id)))
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        with open(md_filename, 'wb') as outfile:
            json.dump(data, outfile)

    # configure model-path: "../models/" relative to a corpus directory,
    # otherwise the directory containing the corpus file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(args.corpus_path,
                                               args.model_path,
                                               stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(args.corpus_filename):
        # Keep asking until the answer is recognised. An empty answer means
        # "no"; a declined rebuild leaves args.rebuild as the string 'n'.
        while args.rebuild not in ['y', 'n', True]:
            answer = raw_input("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip()
            if answer == 'y':
                args.rebuild = True
            elif answer == '':
                args.rebuild = 'n'
            else:
                args.rebuild = answer
    else:
        # Rebuild was requested explicitly, or there is nothing to reuse.
        args.rebuild = True

    # At this point args.rebuild is either True or the string 'n'.
    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(args.corpus_path,
                                                args.model_path,
                                                stop_freq=args.stop_freq,
                                                decode=args.decode,
                                                sentences=args.sentences,
                                                simple=args.simple,
                                                tokenizer=args.tokenizer)
        except IOError:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            sys.exit(74)
        # NOTE: a LookupError handler (missing NLTK "punkt" / "stopwords"
        # data, installable with `python -m nltk.downloader <name>`) was
        # previously kept here inside an inert string literal; it was never
        # active and LookupError is still not caught.

    return write_config(args, args.config_file)
Beispiel #4
0
def main(args):
    """Set up a corpus from ``args.corpus_path`` and write its config file.

    Expands a ``.bib`` input through ``process_bibtex``, fills in
    ``args.corpus_name``, ``args.corpus_print_name`` (interactive runs only),
    ``args.model_path`` and ``args.corpus_filename``, builds the corpus
    (through ``htrc_features`` for an HTRC id file, otherwise through
    ``build_corpus``), writes the config file and, unless the user keeps an
    existing one, a default Markdown corpus description next to it.

    Returns the config file path returned by ``write_config``.
    Calls ``sys.exit(1)`` in quiet mode when the corpus file already exists
    and no rebuild was requested. An ``IOError`` from ``build_corpus`` is
    reported on stdout and re-raised.
    """
    # TODO: remove this code, check if there is an issue and unit test
    # convert to unicode to avoid windows errors
    # args.corpus_path = args.corpus_path

    # config corpus_path
    # process bibtex files: args.bibtex keeps the original .bib path (or
    # False) while corpus_path is replaced by process_bibtex's result.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name; basename() is empty when the path ends with a
    # separator, so fall back to the name of the directory itself.
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    # configure model-path: "../models/" relative to a corpus directory,
    # otherwise the directory containing the corpus file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(os.path.abspath(args.corpus_filename)):
        if args.quiet:
            # Cannot prompt in quiet mode, so refuse to overwrite.
            print("Path exists: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                default=False)
    else:
        # Rebuild was requested explicitly, or there is nothing to reuse.
        args.rebuild = True

    if args.htrc:
        # NOTE(review): `htrc` is not referenced below (the proc_htrc_coll
        # call is disabled); the import is kept so behaviour when the module
        # is unavailable is unchanged -- confirm whether it can be dropped.
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            #htrc.proc_htrc_coll(args.corpus_path)
            # Metadata path: "<corpus folder name>.metadata.json" in the
            # parent of the corpus folder.
            parent_dir = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                parent_dir,
                os.path.basename(args.corpus_path) + '.metadata.json')
        else:
            # corpus_path is a text file with one id per non-blank line.
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids, nltk_stop=args.nltk,
                                            freq=args.stop_freq)
            c.save(args.corpus_filename)

    # The HTRC id-file case was already built and saved above.
    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=args.stop_freq,
                decode=args.decode, nltk_stop=args.nltk, simple=args.simple,
                sentences=args.sentences, tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # Propagate the original error. The `sys.exit(74)` that used to
            # follow the raise could never run and has been dropped.
            raise
        # NOTE: a LookupError handler (missing NLTK "punkt" / "stopwords"
        # data, installable with `python -m nltk.downloader <name>`) was
        # previously kept here inside an inert string literal; it was never
        # active and LookupError is still not caught.

    args.config_file = write_config(args, args.config_file)

    # Default description file sits next to the config file.
    desc_path = args.config_file + '.md'
    args.corpus_desc = desc_path
    if not args.quiet and os.path.exists(desc_path):
        answer = None
        while answer not in ('y', 'n', ''):
            answer = input("\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip()
        # Only an explicit "y" overwrites the existing description. "n" and
        # an empty answer both keep it; previously "n" was left as a truthy
        # string and a file literally named "n" was written.
        args.corpus_desc = desc_path if answer == 'y' else False

    if args.corpus_desc:
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(
"""This is an instance of the [InPhO Topic Explorer](http://inphodata.cogs.indiana.edu/). If you would like
to add a custom corpus description, either:
- Modify the contents of the file `{}`
- Change the main:corpus_desc path in `{}` to an existing Markdown file.
""".format(os.path.abspath(args.corpus_desc),
           os.path.abspath(args.config_file)))

    return args.config_file
Beispiel #5
0
def main(args):
    """Set up a corpus from ``args.corpus_path`` and write its config file.

    Expands a ``.bib`` input through ``process_bibtex``, fills in
    ``args.corpus_name``, ``args.corpus_print_name`` (interactive runs only),
    ``args.model_path`` and ``args.corpus_filename``, builds the corpus
    (through ``htrc_features`` for an HTRC id file, otherwise through
    ``build_corpus``), writes the config file and, unless the user keeps an
    existing one, a default Markdown corpus description next to it.

    Returns the config file path returned by ``write_config``.
    Calls ``sys.exit(1)`` in quiet mode when the corpus file already exists
    and no rebuild was requested. An ``IOError`` from ``build_corpus`` is
    reported on stdout and re-raised.
    """
    # TODO: remove this code, check if there is an issue and unit test
    # convert to unicode to avoid windows errors
    # args.corpus_path = args.corpus_path

    # config corpus_path
    # process bibtex files: args.bibtex keeps the original .bib path (or
    # False) while corpus_path is replaced by process_bibtex's result.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name; basename() is empty when the path ends with a
    # separator, so fall back to the name of the directory itself.
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    # configure model-path: "../models/" relative to a corpus directory,
    # otherwise the directory containing the corpus file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(
            args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(args.corpus_path,
                                               args.model_path,
                                               stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(
            os.path.abspath(args.corpus_filename)):
        if args.quiet:
            # Cannot prompt in quiet mode, so refuse to overwrite.
            print("Path exists: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                                       default=False)
    else:
        # Rebuild was requested explicitly, or there is nothing to reuse.
        args.rebuild = True

    if args.htrc:
        # NOTE(review): `htrc` is not referenced below (the proc_htrc_coll
        # call is disabled); the import is kept so behaviour when the module
        # is unavailable is unchanged -- confirm whether it can be dropped.
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            #htrc.proc_htrc_coll(args.corpus_path)
            # Metadata path: "<corpus folder name>.metadata.json" in the
            # parent of the corpus folder.
            parent_dir = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                parent_dir,
                os.path.basename(args.corpus_path) + '.metadata.json')
        else:
            # corpus_path is a text file with one id per non-blank line.
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids,
                                            nltk_stop=args.nltk,
                                            freq=args.stop_freq)
            c.save(args.corpus_filename)

    # The HTRC id-file case was already built and saved above.
    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(args.corpus_path,
                                                args.model_path,
                                                stop_freq=args.stop_freq,
                                                decode=args.decode,
                                                nltk_stop=args.nltk,
                                                simple=args.simple,
                                                sentences=args.sentences,
                                                tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # Propagate the original error. The `sys.exit(74)` that used to
            # follow the raise could never run and has been dropped.
            raise
        # NOTE: a LookupError handler (missing NLTK "punkt" / "stopwords"
        # data, installable with `python -m nltk.downloader <name>`) was
        # previously kept here inside an inert string literal; it was never
        # active and LookupError is still not caught.

    args.config_file = write_config(args, args.config_file)

    # Default description file sits next to the config file.
    desc_path = args.config_file + '.md'
    args.corpus_desc = desc_path
    if not args.quiet and os.path.exists(desc_path):
        answer = None
        while answer not in ('y', 'n', ''):
            answer = input(
                "\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip()
        # Only an explicit "y" overwrites the existing description. "n" and
        # an empty answer both keep it; previously "n" was left as a truthy
        # string and a file literally named "n" was written.
        args.corpus_desc = desc_path if answer == 'y' else False

    if args.corpus_desc:
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(
                """This is an instance of the [InPhO Topic Explorer](http://inphodata.cogs.indiana.edu/). If you would like
to add a custom corpus description, either:
- Modify the contents of the file `{}`
- Change the main:corpus_desc path in `{}` to an existing Markdown file.
""".format(os.path.abspath(args.corpus_desc),
            os.path.abspath(args.config_file)))

    return args.config_file
Beispiel #6
0
def main(args):
    """Set up a corpus from ``args.corpus_path`` and write its config file.

    Decodes the corpus path, expands a ``.bib`` input through
    ``process_bibtex``, fills in ``args.corpus_name``,
    ``args.corpus_print_name``, ``args.model_path`` and
    ``args.corpus_filename``, optionally dumps HTRC metadata to
    ``../metadata.json`` relative to the corpus path, and (re)builds the
    corpus file unless one already exists and the user declines the prompt.

    Returns the result of ``write_config(args, args.config_file)``.
    Calls ``sys.exit(74)`` if ``build_corpus`` raises ``IOError``.
    """
    # convert to unicode to avoid windows errors; only byte strings are
    # decoded, because decoding an already-unicode path raises TypeError.
    if isinstance(args.corpus_path, bytes):
        args.corpus_path = args.corpus_path.decode('utf-8')

    # config corpus_path
    # process bibtex files: args.bibtex keeps the original .bib path (or
    # False) while corpus_path is replaced by process_bibtex's result.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)

    # set corpus_name; basename() is empty when the path ends with a
    # separator, so fall back to the name of the directory itself.
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        import json
        # The directory test must be made relative to corpus_path, not the
        # current working directory.
        # NOTE(review): assumes listdir_nohidden yields bare entry names
        # (like os.listdir) -- confirm against its definition.
        data = dict(
            (vol_id, htrc.metadata(vol_id))
            for vol_id in listdir_nohidden(args.corpus_path)
            if os.path.isdir(os.path.join(args.corpus_path, vol_id)))
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        with open(md_filename, 'wb') as outfile:
            json.dump(data, outfile)

    # configure model-path: "../models/" relative to a corpus directory,
    # otherwise the directory containing the corpus file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(args.corpus_filename):
        # Keep asking until the answer is recognised. An empty answer means
        # "no"; a declined rebuild leaves args.rebuild as the string 'n'.
        while args.rebuild not in ['y', 'n', True]:
            answer = raw_input("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip()
            if answer == 'y':
                args.rebuild = True
            elif answer == '':
                args.rebuild = 'n'
            else:
                args.rebuild = answer
    else:
        # Rebuild was requested explicitly, or there is nothing to reuse.
        args.rebuild = True

    # At this point args.rebuild is either True or the string 'n'.
    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path,
                stop_freq=args.stop_freq, decode=args.decode,
                sentences=args.sentences, simple=args.simple,
                tokenizer=args.tokenizer)
        except IOError:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            sys.exit(74)
        # NOTE: a LookupError handler (missing NLTK "punkt" / "stopwords"
        # data, installable with `python -m nltk.downloader <name>`) was
        # previously kept here inside an inert string literal; it was never
        # active and LookupError is still not caught.

    return write_config(args, args.config_file)