Ejemplo n.º 1
0
def get_host_port(args):
    """
    Returns the hostname and port number

    The port is ``args.port`` when given, otherwise the ``www:port`` value of
    the configuration read from ``args.config``. The candidate port is probed
    with a TCP connection; if something already answers on it the user is
    prompted for another port (or ``IOError`` is raised when ``args.quiet``
    is set). When the final port differs from the configured one and the user
    agrees, the new value is written back to ``args.config``.

    :param args: parsed command-line namespace; the attributes ``config``,
        ``host``, ``port`` and ``quiet`` are read.
    :returns: ``(host, port)`` tuple.
    :raises IOError: the port is in use and ``args.quiet`` forbids prompting.
    """
    import topicexplorer.config
    config = topicexplorer.config.read(args.config)

    # automatic port assignment
    def test_port(port):
        """Return the first candidate port that nothing is listening on."""
        host = args.host or config.get("www", "host")
        if host == '0.0.0.0':
            host = 'localhost'

        while True:
            try:
                s = socket.create_connection((host, port), 2)
            except socket.error:
                # the connection failed, so nothing is bound to this port
                return port

            # The conflict is handled outside the ``try`` on purpose: on
            # Python 3 both ``IOError`` and ``socket.error`` are ``OSError``,
            # so raising the conflict inside the ``try`` would be swallowed
            # by the handler above and the busy port would be returned.
            s.close()
            if args.quiet:
                raise IOError(
                    "Conflict on port {0}. Try running with -p to manually set new port.".format(port))
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port))

    port = args.port or int(config.get('www', 'port').format(0))
    port = test_port(port)

    # prompt to save
    if (int(config.get("www", "port").format(0))) != port:
        if not args.quiet and bool_prompt(
            "Change default baseport to {0}?".format(port), default=True):
            config.set("www", "port", text(port))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy -- it must be loaded into the *new* parser,
            # otherwise an empty configuration is written to disk below
            new_config = ConfigParser()
            new_config.read_file(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'w') as configfh:
                new_config.write(configfh)

    # hostname assignment
    host = args.host or config.get('www', 'host')
    return host, port
Ejemplo n.º 2
0
def lang_prompt(languages):
    """
    Ask, one language at a time, which languages should be stoplisted.

    Only entries of ``languages`` that are keys of the module-level ``langs``
    mapping are offered. Each question shows the capitalized ``langs`` value
    and defaults to yes.

    :param languages: iterable of language keys to consider.
    :returns: set of the keys the user accepted.
    """
    print("Stoplist the following languages?", end=' ')
    return {
        code for code in languages
        if code in langs
        and bool_prompt("{}?".format(langs[code].capitalize()), default=True)
    }
Ejemplo n.º 3
0
def lang_prompt(languages):
    """
    Interactively choose which of ``languages`` to stoplist.

    Entries that are not keys of the module-level ``langs`` mapping are
    skipped without a question. For the others, the user is asked a yes/no
    question (default yes) labelled with the capitalized ``langs`` value.

    :param languages: iterable of language keys to consider.
    :returns: set of the keys the user confirmed.
    """
    global langs
    print("Stoplist the following languages?", end=' ')

    selected = set()
    for candidate in languages:
        if candidate not in langs:
            continue
        question = "{}?".format(langs[candidate].capitalize())
        if bool_prompt(question, default=True):
            selected.add(candidate)
    return selected
Ejemplo n.º 4
0
def main(args):
    """
    Train topic models for the corpus described by ``args.config_file``.

    Steps, in order:

    1. If ``args.cluster`` is set, delegate to ``cluster`` and return.
    2. Read the configuration and decide the list of topic counts
       (``args.k``), prompting the user when it was not given.
    3. Load the corpus, then either continue training the existing models
       (``model_pattern`` present in the configuration) or build new ones.
    4. Store ``model_pattern``, ``context_type`` and ``topics`` back into the
       configuration and, unless ``args.dry_run`` is set, write it to disk.

    :param args: parsed command-line namespace. Attributes read here:
        ``cluster``, ``config_file``, ``k``, ``quiet``, ``processes``,
        ``rebuild``, ``cont``, ``iter``, ``seed``, ``dry_run`` and
        ``context_type``; ``k``, ``processes``, ``iter`` and ``context_type``
        may be updated in place.
    """
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.k is None:
        try:
            if config.get("main", "topics"):
                # NOTE(review): eval() executes whatever the config file
                # contains; consider ast.literal_eval if the value is always
                # a plain list literal -- confirm against older config files.
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
                if args.quiet:
                    args.k = [int(n) for n in default.split()]
            else:
                raise NoOptionError('main', 'topics')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                # A blank *or whitespace-only* answer selects the default.
                # Previously "   " was treated as an answer and produced an
                # empty topic list that crashed later on args.k[0].
                args.k = [int(n) for n in (ks.strip() or default).split()]
            except ValueError:
                print("Enter valid integers, separated by spaces!")
                continue

            if args.k:
                print("\nTIP: number of topics can be specified with argument '-k N N N ...':")
                print("         topicexplorer train %s -k %s\n" %\
                    (args.config_file, ' '.join(map(str, args.k))))

    if args.processes < 0:
        # negative values mean "all CPUs minus that many"
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if (model_pattern is not None and not args.rebuild and (args.quiet or args.cont or
            bool_prompt("""Existing topic models found. You can continue training or start a new model. 
Do you want to continue training your existing models? """, default=True))):

        from vsm.model.lda import LDA
        # the first model is loaded only to learn how many iterations the
        # existing models have already been trained for
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet:    # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5), min=m.iteration)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet:      # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones

        # NOTE(review): same eval() caveat as above.
        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus, corpus_filename, model_path,
                         config.get("main", "context_type"),
                         new_models, n_iterations=args.iter,
                         n_proc=args.processes, seed=args.seed,
                         dry_run=args.dry_run)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes,
                                              dry_run=args.dry_run)

        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet:    # pragma: no cover
            args.iter = int_prompt("Number of training iterations:", default=200)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet:      # pragma: no cover
            args.iter = 200

        # TODO: if only one context_type, make it just the one context type.
        ctxs = corpus.context_types
        if len(ctxs) == 1:
            args.context_type = ctxs[0]
        else:
            # fewest contexts first; the first entry is offered as default
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " % contexts)
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]

                print("\nTIP: context type can be specified with argument '--context-type TYPE':")
                print("         topicexplorer train --context-type %s %s\n" % (args.context_type, args.config_file))


        print("\nTIP: This configuration can be automated as:")
        print("         topicexplorer train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type,
                ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus, corpus_filename, model_path,
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        if config.has_option("main", "cluster"):
            # the stored cluster file is dropped together with its option
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass


        with open(args.config_file, "w") as configfh:
            config.write(configfh)
Ejemplo n.º 5
0
def main(args):
    global context_type, lda_c, lda_m, lda_v, label, id_fn
    
    # load in the configuration file
    config = ConfigParser({
        'certfile' : None,
        'keyfile' : None,
        'ca_certs' : None,
        'ssl' : False,
        'port' : '8000',
        'host' : '0.0.0.0',
        'topic_range' : '{0},{1},1'.format(args.k, args.k+1),
        'icons': 'link',
        'corpus_link' : None,
        'doc_title_format' : None,
        'doc_url_format' : None,
        'topics': None})
    config.read(args.config)

    # path variables
    path = config.get('main', 'path')
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern') 

    # automatic port assignment

    def test_port(port):
        try:
            host = args.host or config.get("www","host")
            if host == '0.0.0.0':
                host = 'localhost'
            try:
                s = socket.create_connection((host,port), 2)
                s.close()
                raise IOError("Socket connectable on port {0}".format(port))
            except socket.error:
                pass
            return port
        except IOError:
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port)) 
            return test_port(port)

    port = args.port or int(config.get('www','port').format(0)) + args.k
    port = test_port(port)
    
    # prompt to save
    if (int(config.get("www","port").format(0)) + args.k) != port:
        if bool_prompt("Change default baseport to {0}?".format(port - args.k),
                       default=True):
            config.set("www","port", str(port - args.k))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'wb') as configfh:
                new_config.write(configfh)


    # hostname assignment
    host = args.host or config.get('www','host')

    # LDA objects
    lda_c = Corpus.load(corpus_file)
    lda_m = None
    lda_v = None
    def load_model(k):
        global lda_m, lda_v
        lda_m = LDA.load(model_pattern.format(k))
        lda_v = LDAViewer(lda_c, lda_m)

    load_model(args.k)

    # label function imports
    try:
        label_module = config.get('main', 'label_module')
        label_module = import_module(label_module)
        print "imported label module"
        label_module.init(config.get('main','path'), lda_v, context_type)
    except (ImportError, NoOptionError, AttributeError):
        pass

    try:
        label = label_module.label
        print "imported label function"
    except (AttributeError, UnboundLocalError):
        label = lambda x: x
        print "using default label function"
        
    try:
        id_fn = label_module.id_fn
        print "imported id function"
    except (AttributeError, UnboundLocalError):
        id_fn = def_label_fn
        print "using default id function"

    config_icons = config.get('www','icons').split(",")

    @route('/icons.js')
    def icons():
        with open(resource_filename(__name__, '../www/icons.js')) as icons:
            text = '{0}\n var icons = {1};'\
                .format(icons.read(), json.dumps(config_icons))
        return text


    # index page parameterization
    corpus_name = config.get('www','corpus_name')
    corpus_link = config.get('www','corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')

    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        topic_range = eval(config.get('main', 'topics'))
    topic_range = [{'k' : k, 'port' : int(config.get('www','port').format(0)) + k} 
                        for k in topic_range] 

    renderer = pystache.Renderer(escape=lambda u: u)

    @route('/')
    def index():
        response.set_header('Expires', _cache_date())

        with open(resource_filename(__name__, '../www/index.mustache.html'),
                  encoding='utf-8') as tmpl_file:
            template = tmpl_file.read()
        return renderer.render(template, 
            {'corpus_name' : corpus_name,
             'corpus_link' : corpus_link,
             'context_type' : context_type,
             'topic_range' : topic_range,
             'doc_title_format' : doc_title_format,
             'doc_url_format' : doc_url_format})


    @route('/<filename:path>')
    @_set_acao_headers
    def send_static(filename):
        return static_file(filename, root=resource_filename(__name__, '../www/'))

    if args.ssl or config.get('main', 'ssl'):
        certfile = args.certfile or config.get('ssl', 'certfile')
        keyfile = args.keyfile or config.get('ssl', 'keyfile')
        ca_certs = args.ca_certs or config.get('ssl', 'ca_certs')

        run(host=host, port=port, server=SSLWSGIRefServer,
            certfile=certfile, keyfile=keyfile, ca_certs=ca_certs)
    else:
        run(host=host, port=port)
Ejemplo n.º 6
0
def main(args):
    # CONFIGURATION PARSING
    # load in the configuration file
    config = ConfigParser({
        'certfile' : None,
        'keyfile' : None,
        'ca_certs' : None,
        'ssl' : False,
        'port' : '8000',
        'host' : '0.0.0.0',
        'icons': 'link',
        'corpus_link' : None,
        'doc_title_format' : None,
        'doc_url_format' : None,
        'topic_range': None,
        'fulltext' : 'false',
        'raw_corpus': None,
        'topics': None})
    config.read(args.config_file)

    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        topic_range = eval(config.get('main', 'topics'))
    print topic_range

    # LAUNCHING SERVERS
    # Cross-platform compatability
    def get_log_file(k):
        if config.has_section('logging'):
            path = config.get('logging','path')
            path = path.format(k)
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))

            return open(path, 'a')
        else:
            return subprocess.PIPE


    def test_baseport(host, baseport, topic_range):
        try:
            for k in topic_range:
                port = baseport + k
                try:
                    s = socket.create_connection((host,port), 2)
                    s.close()
                    raise IOError("Socket connectable on port {0}".format(port))
                except socket.error:
                    pass
            return baseport
        except IOError:
            baseport = int_prompt(
                "Conflict on port {0}. Enter new base port: [CURRENT: {1}]"\
                    .format(port, baseport)) 
            return test_baseport(host, baseport, topic_range)

    host = config.get("www","host")
    if host == '0.0.0.0':
        host = socket.gethostname()

    baseport = int(config.get("www","port").format(0))
    baseport = test_baseport(host, baseport, topic_range)

    # prompt to save
    if int(config.get("www","port").format(0)) != baseport:
        if bool_prompt("Change default baseport to {0}?".format(baseport),
                       default=True):
            config.set("www","port", baseport)

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config_file,'wb') as configfh:
                new_config.write(configfh)


    try:
        grp_fn = os.setsid
    except AttributeError:
        grp_fn = None
    fulltext = '--fulltext' if args.fulltext else ''
    procs = [subprocess.Popen("vsm serve -k {k} -p {port} {config_file} {fulltext}".format(
        k=k, port=(baseport+k), config_file=args.config_file,fulltext=fulltext),
        shell=True, stdout=get_log_file(k), stderr=subprocess.STDOUT,
        preexec_fn=grp_fn) for k in topic_range]

    print "pid","port"
    for proc,k in zip(procs, topic_range):
        port = baseport + k
        print proc.pid, "http://{host}:{port}/".format(host=host,port=port)


    # CLEAN EXIT AND SHUTDOWN OF SERVERS
    def signal_handler(signal,frame):
        print "\n"
        for p, k in zip(procs, topic_range):
            print "Stopping {}-topic model (Process ID: {})".format(k, p.pid)
            # Cross-Platform Compatability
            if platform.system() == 'Windows':
                subprocess.call(['taskkill', '/F', '/T', '/PID', str(p.pid)],
                        stdout=open(os.devnull), stderr=open(os.devnull))
            else:
                os.killpg(p.pid, signal)

        sys.exit()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    port = baseport + topic_range[0]
    url = "http://{host}:{port}/".format(host=host,port=port)

    # TODO: Add enhanced port checking
    while True:
        wait_count = 0
        try:
            urllib.urlopen(url)
            print "Server successfully started"
            break
        except:
            time.sleep(1)
            wait_count += 1

        if wait_count == 60:
            print "\nLaunching the server seems to be taking a long time."
            print "This may be due to loading in a large corpus."

            print "\nTo test launching a single model, press Ctrl+C to abort launch,"
            print "then use the `serve` command to find the error message:"
            print "\tvsm serve {config} -k {k}".format(
                config=args.config_file, k=topic_range[0])
    
        for proc,k in zip(procs, topic_range):
            if proc.poll() is not None:
                print "\nAn error has occurred launching the {}-topic model.".format(k)
                try:
                    with get_log_file(k) as logfile:
                        print "A log has been written to: {}\n".format(logfile.name)
                except AttributeError:
                    # No log file, things are a-ok.
                    pass

                print "Use the `serve` command to debug errors:"
                print "\tvsm serve {config} -k {k}".format(config=args.config_file, k=k)
                for p in procs:
                    if p.poll() is None:
                        try:
                            os.killpg(p.pid, signal.SIGTERM)
                        except AttributeError:
                            # Cross-Platform Compatability
                            subprocess.call(['taskkill', '/F', '/T', '/PID', str(p.pid)])    
    
                sys.exit(1)

    if args.browser:
        webbrowser.open(url)
        print "TIP: Browser launch can be disabled with the '--no-browser' argument:"
        print "vsm launch --no-browser", args.config_file, "\n"

    print "Press Ctrl+C to shutdown the Topic Explorer server"
    # Cross-platform Compatability
    try:
        signal.pause()
    except AttributeError:
        # Windows hack
        while True:
            time.sleep(1)
Ejemplo n.º 7
0
def main(args):
    """
    Build (or reuse) the corpus file for ``args.corpus_path`` and write the
    project configuration.

    Resolves bibtex input, the corpus name and the model directory, decides
    whether the corpus has to be rebuilt, builds it, writes the configuration
    through ``write_config`` and finally creates the default corpus
    description (``<config_file>.md``).

    :param args: parsed command-line namespace; many attributes are read and
        several (``bibtex``, ``corpus_path``, ``corpus_name``,
        ``corpus_print_name``, ``model_path``, ``corpus_filename``,
        ``rebuild``, ``htrc_metapath``, ``config_file``, ``corpus_desc``) are
        set in place.
    :returns: path of the written configuration file.
    :raises IOError: re-raised from ``build_corpus`` after printing a hint
        about valid corpus paths.
    """
    # TODO: remove this code, check if there is an issue and unit test
    # convert to unicode to avoid windows errors
    # args.corpus_path = args.corpus_path

    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        # corpus_path ended with a separator, use the directory name
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(os.path.abspath(args.corpus_filename)):
        if args.quiet:
            print("Path exits: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                default=False)
    else:
        args.rebuild = True

    if args.htrc:
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            #htrc.proc_htrc_coll(args.corpus_path)
            ids = [id.replace('.txt','') for id in listdir_nohidden(args.corpus_path)]

            args.htrc_metapath = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(args.htrc_metapath,
                os.path.basename(args.corpus_path) + '.metadata.json')
        else:
            # corpus_path is a file with one volume id per line
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids, nltk_stop=args.nltk,freq=args.stop_freq)
            c.save(args.corpus_filename)


    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=args.stop_freq,
                decode=args.decode, nltk_stop=args.nltk, simple=args.simple,
                sentences=args.sentences, tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # the error is propagated to the caller (the sys.exit(74) that
            # used to follow this statement was unreachable)
            raise

    args.config_file = write_config(args, args.config_file)

    desc_path = args.config_file + '.md'
    args.corpus_desc = desc_path
    if not args.quiet and os.path.exists(desc_path):
        answer = None
        while answer not in ('y', 'n', ''):
            answer = input("\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip()

        # Only an explicit "y" replaces the existing description. "n" used
        # to leave corpus_desc == 'n', which then created a file named "n".
        args.corpus_desc = desc_path if answer == 'y' else False

    if args.corpus_desc:
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(
"""This is an instance of the [InPhO Topic Explorer](http://inphodata.cogs.indiana.edu/). If you would like
to add a custom corpus description, either:
- Modify the contents of the file `{}`
- Change the main:corpus_desc path in `{}` to an existing Markdown file.
""".format(os.path.abspath(args.corpus_desc),
           os.path.abspath(args.config_file)))

    return args.config_file
Ejemplo n.º 8
0
def main(args):
    """
    Build (or reuse) the corpus file for ``args.corpus_path`` and write the
    project configuration.

    Resolves bibtex input, the corpus name and the model directory, decides
    whether the corpus has to be rebuilt, builds it, writes the configuration
    through ``write_config`` and finally creates the default corpus
    description (``<config_file>.md``).

    :param args: parsed command-line namespace; many attributes are read and
        several (``bibtex``, ``corpus_path``, ``corpus_name``,
        ``corpus_print_name``, ``model_path``, ``corpus_filename``,
        ``rebuild``, ``htrc_metapath``, ``config_file``, ``corpus_desc``) are
        set in place.
    :returns: path of the written configuration file.
    :raises IOError: re-raised from ``build_corpus`` after printing a hint
        about valid corpus paths.
    """
    # TODO: remove this code, check if there is an issue and unit test
    # convert to unicode to avoid windows errors
    # args.corpus_path = args.corpus_path

    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        # corpus_path ended with a separator, use the directory name
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(
            args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(args.corpus_path,
                                               args.model_path,
                                               stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(
            os.path.abspath(args.corpus_filename)):
        if args.quiet:
            print("Path exits: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                                       default=False)
    else:
        args.rebuild = True

    if args.htrc:
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            #htrc.proc_htrc_coll(args.corpus_path)
            ids = [
                id.replace('.txt', '')
                for id in listdir_nohidden(args.corpus_path)
            ]

            args.htrc_metapath = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                args.htrc_metapath,
                os.path.basename(args.corpus_path) + '.metadata.json')
        else:
            # corpus_path is a file with one volume id per line
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids,
                                            nltk_stop=args.nltk,
                                            freq=args.stop_freq)
            c.save(args.corpus_filename)

    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(args.corpus_path,
                                                args.model_path,
                                                stop_freq=args.stop_freq,
                                                decode=args.decode,
                                                nltk_stop=args.nltk,
                                                simple=args.simple,
                                                sentences=args.sentences,
                                                tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print("  * a single plain-text or PDF file,")
            print("  * a single bibtex (.bib) file with 'file' fields,")
            print("  * a folder of plain-text or PDF files, or")
            print("  * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # the error is propagated to the caller (the sys.exit(74) that
            # used to follow this statement was unreachable)
            raise

    args.config_file = write_config(args, args.config_file)

    desc_path = args.config_file + '.md'
    args.corpus_desc = desc_path
    if not args.quiet and os.path.exists(desc_path):
        answer = None
        while answer not in ('y', 'n', ''):
            answer = input(
                "\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip()

        # Only an explicit "y" replaces the existing description. "n" used
        # to leave corpus_desc == 'n', which then created a file named "n".
        args.corpus_desc = desc_path if answer == 'y' else False

    if args.corpus_desc:
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(
                """This is an instance of the [InPhO Topic Explorer](http://inphodata.cogs.indiana.edu/). If you would like
to add a custom corpus description, either:
- Modify the contents of the file `{}`
- Change the main:corpus_desc path in `{}` to an existing Markdown file.
""".format(os.path.abspath(args.corpus_desc),
            os.path.abspath(args.config_file)))

    return args.config_file
Ejemplo n.º 9
0
def main(args):
    """
    Train new LDA topic models, or continue training existing ones.

    Reads ``main:corpus_file`` and ``main:path`` from ``args.config_file``,
    prompts for whichever of ``args.k``, ``args.iter`` and
    ``args.context_type`` were not supplied, then either continues training
    the models named by ``main:model_pattern`` or builds new ones. Finally
    ``model_pattern``, ``context_type`` (when set) and ``topics`` are
    written back to the configuration file.

    :param args: parsed command-line namespace. ``config_file``, ``k``,
        ``iter``, ``processes``, ``seed`` and ``context_type`` are read;
        ``k``, ``iter``, ``processes`` and ``context_type`` may be filled
        in or normalized in place.
    """
    config = ConfigParser()
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if args.k is None:
        # Offer the topic counts stored in the config as the default and
        # fall back to "20 40 60 80" when the option is missing or empty.
        # (Raising a bare NoOptionError here would fail with TypeError,
        # because its constructor requires the option and section names.)
        try:
            configured_topics = config.get("main", "topics")
        except NoOptionError:
            configured_topics = None

        if configured_topics:
            # NOTE(review): eval() executes whatever the config file
            # contains; this is only safe while the file is trusted.
            default = ' '.join(map(str, eval(configured_topics)))
        else:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = raw_input(
                "Number of Topics [Default '{0}']: ".format(default))
            try:
                # blank or whitespace-only input selects the default
                chosen = [int(n) for n in (ks.strip() or default).split()]
            except ValueError:
                chosen = []

            if chosen:
                args.k = chosen
                print("\nTIP: number of topics can be specified with argument '-k N N N ...':")
                print("         vsm train %s -k %s\n" %
                      (args.config_file, ' '.join(map(str, args.k))))
            else:
                # never leave the loop with an empty list of topic counts
                print("Enter valid integers, separated by spaces!")

    if args.processes < 0:
        # a negative count is taken relative to the number of CPUs
        args.processes = multiprocessing.cpu_count() + args.processes

    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):

        # one model is loaded only to read its current iteration count
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file))

        del m

        # if the set changes, build some new models and continue some old ones

        # NOTE(review): eval() on config data, see above.
        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed)

            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file))

        # context types ordered by how many contexts each one has; the
        # first entry is offered (upper-cased) as the default choice
        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " %
                                              contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]

            print("\nTIP: context type can be specified with argument '--context-type TYPE':")
            print("         vsm train --context-type %s %s\n" % (
                args.context_type, args.config_file))

        print("\nTIP: This configuration can be automated as:")
        print("         vsm train %s --iter %d --context-type %s -k %s\n" %
              (args.config_file, args.iter, args.context_type,
               ' '.join(map(str, args.k))))

        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed)

    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    with open(args.config_file, "wb") as configfh:
        config.write(configfh)
Ejemplo n.º 10
0
def main(args):
    """
    Launch one ``vsm serve`` process per topic model and wait for shutdown.

    The topic counts come from ``main:topics`` or, failing that, from
    ``main:topic_range`` in ``args.config_file``. The server for model ``k``
    is started on port ``baseport + k``, where the base port is ``www:port``
    unless a port conflict makes the user enter another one. After the
    first server answers, the function blocks until SIGINT/SIGTERM, when
    the signal handler kills every child process and exits.

    :param args: parsed command-line namespace; ``config_file`` and
        ``browser`` are read.
    :raises ValueError: if the configuration names no topic models.
    """
    # CONFIGURATION PARSING
    # load in the configuration file
    config = ConfigParser({
        'certfile': None,
        'keyfile': None,
        'ca_certs': None,
        'ssl': False,
        'port': '8000',
        'host': '0.0.0.0',
        'icons': 'link',
        'corpus_link': None,
        'doc_title_format': None,
        'doc_url_format': None,
        'topic_range': None,
        'topics': None
    })
    config.read(args.config_file)

    # main:topics wins over main:topic_range when both are set
    topic_range = None
    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        # NOTE(review): eval() executes whatever the config file contains;
        # this is only safe while the file is trusted.
        topic_range = eval(config.get('main', 'topics'))
    if not topic_range:
        # fail with a clear message instead of an unbound-variable error
        raise ValueError(
            "No topic models configured: set main:topics or "
            "main:topic_range in {0}".format(args.config_file))
    print(topic_range)

    # LAUNCHING SERVERS
    # Cross-platform compatability
    def get_log_file(k):
        """Return the stdout target for the server of model ``k``."""
        if config.has_section('logging'):
            path = config.get('logging', 'path')
            path = path.format(k)
            log_dir = os.path.dirname(path)
            # dirname is '' for a bare filename: nothing to create then
            if log_dir and not os.path.exists(log_dir):
                os.makedirs(log_dir)

            return open(path, 'a')
        else:
            # NOTE(review): nothing in this function reads from the pipe;
            # a chatty child may block once the pipe fills up -- confirm.
            return subprocess.PIPE

    def port_in_use(host, port):
        """Return True if something accepts connections on host:port."""
        try:
            s = socket.create_connection((host, port), 2)
        except socket.error:
            return False
        s.close()
        return True

    def test_baseport(baseport, topic_range):
        """Prompt until every ``baseport + k`` is free; return the base."""
        host = config.get("www", "host")
        if host == '0.0.0.0':
            host = 'localhost'
        while True:
            # stop probing at the first conflicting port
            conflict = next((baseport + k for k in topic_range
                             if port_in_use(host, baseport + k)), None)
            if conflict is None:
                return baseport
            baseport = int_prompt(
                "Conflict on port {0}. Enter new base port: [CURRENT: {1}]"
                .format(conflict, baseport))

    configured_baseport = int(config.get("www", "port").format(0))
    baseport = test_baseport(configured_baseport, topic_range)

    # prompt to save
    if configured_baseport != baseport:
        if bool_prompt("Change default baseport to {0}?".format(baseport),
                       default=True):
            # option values are stored as strings
            config.set("www", "port", str(baseport))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config_file, 'wb') as configfh:
                new_config.write(configfh)

    # os.setsid is missing on Windows; without it no process group is made
    try:
        grp_fn = os.setsid
    except AttributeError:
        grp_fn = None
    procs = [
        subprocess.Popen("vsm serve -k {k} -p {port} {config_file}".format(
            k=k, port=(baseport + k), config_file=args.config_file),
                         shell=True,
                         stdout=get_log_file(k),
                         stderr=subprocess.STDOUT,
                         preexec_fn=grp_fn) for k in topic_range
    ]

    print("pid port")
    host = config.get("www", "host")
    for proc, k in zip(procs, topic_range):
        print("%s %s" % (proc.pid, "http://{host}:{port}/".format(
            host=host, port=baseport + k)))

    # CLEAN EXIT AND SHUTDOWN OF SERVERS
    def signal_handler(signum, frame):
        """Kill every launched server, then exit the launcher."""
        print("\n")
        for p in procs:
            print("killing %s" % p.pid)
            # Cross-Platform Compatability
            try:
                os.killpg(p.pid, signum)
            except AttributeError:
                subprocess.call(['taskkill', '/F', '/T', '/PID', str(p.pid)])
            except OSError:
                # that process group is already gone; keep killing the rest
                pass

        sys.exit()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    port = baseport + topic_range[0]
    if host == '0.0.0.0':
        host = 'localhost'
    url = "http://{host}:{port}/".format(host=host, port=port)

    # TODO: Add enhanced port checking
    while True:
        try:
            urllib.urlopen(url).close()
            print("Server successfully started")
            break
        except Exception:
            # Not a bare except: the SystemExit raised by signal_handler
            # must propagate so Ctrl+C works while waiting for start-up.
            time.sleep(1)
    if args.browser:
        webbrowser.open(url)
        print("TIP: Browser launch can be disabled with the '--no-browser' argument:")
        print("vsm launch --no-browser %s \n" % args.config_file)

    print("Press Ctrl+C to shutdown the Topic Explorer server")
    # Cross-platform Compatability
    try:
        signal.pause()
    except AttributeError:
        # Windows hack
        while True:
            time.sleep(1)
Ejemplo n.º 11
0
def main(args):
    """
    Train new LDA topic models, or continue training existing ones.

    With ``args.cluster`` set, only ``cluster()`` is run. Otherwise the
    corpus named by ``main:corpus_file`` is loaded, any of ``args.k``,
    ``args.iter`` and ``args.context_type`` that were not supplied are
    prompted for (or defaulted when ``args.quiet`` is set), and models are
    either continued from ``main:model_pattern`` or built from scratch.
    Unless ``args.dry_run`` is set, the updated ``model_pattern``,
    ``context_type`` and ``topics`` are written back to the config file and
    a stale ``main:cluster`` file is removed.

    :param args: parsed command-line namespace. ``cluster``,
        ``config_file``, ``k``, ``quiet``, ``processes``, ``rebuild``,
        ``cont``, ``iter``, ``seed``, ``dry_run`` and ``context_type`` are
        read; ``k``, ``iter``, ``processes`` and ``context_type`` may be
        filled in or normalized in place.
    """
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.k is None:
        # Offer the topic counts stored in the config as the default and
        # fall back to "20 40 60 80" when the option is missing or empty.
        try:
            configured_topics = config.get("main", "topics")
        except NoOptionError:
            configured_topics = None

        if configured_topics:
            # NOTE(review): eval() executes whatever the config file
            # contains; this is only safe while the file is trusted.
            default = ' '.join(map(str, eval(configured_topics)))
        else:
            default = ' '.join(map(str, range(20, 100, 20)))

        if args.quiet:
            # quiet runs must never block on a prompt
            args.k = [int(n) for n in default.split()]

        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                # blank or whitespace-only input selects the default
                chosen = [int(n) for n in (ks.strip() or default).split()]
            except ValueError:
                chosen = []

            if chosen:
                args.k = chosen
                print(
                    "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                )
                print("         topicexplorer train %s -k %s\n" %
                      (args.config_file, ' '.join(map(str, args.k))))
            else:
                # never leave the loop with an empty list of topic counts
                print("Enter valid integers, separated by spaces!")

    if args.processes < 0:
        import multiprocessing
        # a negative count is taken relative to the number of CPUs
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if (model_pattern is not None and not args.rebuild and
        (args.quiet or args.cont or bool_prompt(
            "Existing topic models found. You can continue training or start a new model. \n"
            "Do you want to continue training your existing models? ",
            default=True))):

        from vsm.model.lda import LDA
        # one model is loaded only to read its current iteration count
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet:  # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print("         topicexplorer train --iter %d %s\n" %
                  (args.iter, args.config_file))
        elif args.iter is None and args.quiet:  # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones

        # NOTE(review): eval() on config data, see above.
        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed,
                         dry_run=args.dry_run)

            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)

        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet:  # pragma: no cover
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print("         topicexplorer train --iter %d %s\n" %
                  (args.iter, args.config_file))
        elif args.iter is None and args.quiet:  # pragma: no cover
            args.iter = 200

        ctxs = corpus.context_types
        if len(ctxs) == 1:
            # nothing to choose between
            args.context_type = ctxs[0]
        else:
            # ordered by how many contexts each type has; the first entry
            # is offered (upper-cased) as the default choice
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " %
                                              contexts)
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]

                print(
                    "\nTIP: context type can be specified with argument '--context-type TYPE':"
                )
                print("         topicexplorer train --context-type %s %s\n" %
                      (args.context_type, args.config_file))

        print("\nTIP: This configuration can be automated as:")
        print("         topicexplorer train %s --iter %d --context-type %s -k %s\n" %
              (args.config_file, args.iter, args.context_type,
               ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        if config.has_option("main", "cluster"):
            # drop the recorded cluster file along with its config entry
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass

        with open(args.config_file, "w") as configfh:
            config.write(configfh)
Ejemplo n.º 12
0
def main(args):
    """
    Train new LDA topic models, or continue training existing ones.

    Reads ``main:corpus_file`` and ``main:path`` from ``args.config_file``,
    prompts for whichever of ``args.k``, ``args.iter`` and
    ``args.context_type`` were not supplied, then either continues training
    the models named by ``main:model_pattern`` or builds new ones. Unless
    ``args.dry_run`` is set, ``model_pattern``, ``context_type`` (when set)
    and ``topics`` are written back to the configuration file.

    :param args: parsed command-line namespace. ``config_file``, ``k``,
        ``iter``, ``processes``, ``seed``, ``dry_run`` and ``context_type``
        are read; ``k``, ``iter``, ``processes`` and ``context_type`` may
        be filled in or normalized in place.
    """
    from vsm.corpus import Corpus
    from vsm.model.lda import LDA

    config = ConfigParser()
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if args.k is None:
        # Offer the topic counts stored in the config as the default and
        # fall back to "20 40 60 80" when the option is missing or empty.
        # (Raising a bare NoOptionError here would fail with TypeError,
        # because its constructor requires the option and section names.)
        try:
            configured_topics = config.get("main", "topics")
        except NoOptionError:
            configured_topics = None

        if configured_topics:
            # NOTE(review): eval() executes whatever the config file
            # contains; this is only safe while the file is trusted.
            default = ' '.join(map(str, eval(configured_topics)))
        else:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = raw_input("Number of Topics [Default '{0}']: ".format(default))
            try:
                # blank or whitespace-only input selects the default
                chosen = [int(n) for n in (ks.strip() or default).split()]
            except ValueError:
                chosen = []

            if chosen:
                args.k = chosen
                print("\nTIP: number of topics can be specified with argument '-k N N N ...':")
                print("         vsm train %s -k %s\n" %
                      (args.config_file, ' '.join(map(str, args.k))))
            else:
                # never leave the loop with an empty list of topic counts
                print("Enter valid integers, separated by spaces!")

    if args.processes < 0:
        # a negative count is taken relative to the number of CPUs
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):

        # one model is loaded only to read its current iteration count
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration*1.5), min=m.iteration)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         vsm train --iter %d %s\n" % (args.iter, args.config_file))

        del m

        # if the set changes, build some new models and continue some old ones

        # NOTE(review): this branch does not pass args.dry_run on, unlike
        # the build_models call in the new-model branch below -- confirm
        # whether a dry run is meant to skip continued training too.
        # NOTE(review): eval() on config data, see above.
        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus, corpus_filename, model_path,
                         config.get("main", "context_type"),
                         new_models, n_iterations=args.iter,
                         n_proc=args.processes, seed=args.seed)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:", default=200)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         vsm train --iter %d %s\n" % (args.iter, args.config_file))

        # context types ordered by how many contexts each one has; the
        # first entry is offered (upper-cased) as the default choice
        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " % contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]

            print("\nTIP: context type can be specified with argument '--context-type TYPE':")
            print("         vsm train --context-type %s %s\n" % (args.context_type, args.config_file))

        print("\nTIP: This configuration can be automated as:")
        print("         vsm train %s --iter %d --context-type %s -k %s\n" %
              (args.config_file, args.iter, args.context_type,
               ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus, corpus_filename, model_path,
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        with open(args.config_file, "wb") as configfh:
            config.write(configfh)
Ejemplo n.º 13
0
def main(args):
    """
    Serve the Topic Explorer web interface for a single topic model.

    Loads the corpus and the ``args.k``-topic model named in ``args.config``,
    picks a free port (``args.port``, or ``www:port + args.k``), optionally
    saves a changed base port back to the configuration file, registers the
    ``/icons.js``, ``/`` and static-file routes, and starts the web server,
    using ``SSLWSGIRefServer`` when SSL is enabled.

    The module-level globals ``context_type``, ``lda_c``, ``lda_m``,
    ``lda_v``, ``label`` and ``id_fn`` are (re)assigned.

    :param args: parsed command-line namespace; ``config``, ``k``, ``host``,
        ``port``, ``ssl``, ``certfile``, ``keyfile`` and ``ca_certs`` are
        read.
    """
    global context_type, lda_c, lda_m, lda_v, label, id_fn

    # load in the configuration file
    config = ConfigParser({
        'certfile': None,
        'keyfile': None,
        'ca_certs': None,
        'ssl': False,
        'port': '8000',
        'host': '0.0.0.0',
        'topic_range': '{0},{1},1'.format(args.k, args.k + 1),
        'icons': 'link',
        'corpus_link': None,
        'doc_title_format': None,
        'doc_url_format': None,
        'topics': None
    })
    config.read(args.config)

    # path variables
    path = config.get('main', 'path')
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')

    # automatic port assignment

    def test_port(port):
        """Prompt until ``port`` refuses connections; return that port."""
        host = args.host or config.get("www", "host")
        if host == '0.0.0.0':
            host = 'localhost'
        while True:
            try:
                s = socket.create_connection((host, port), 2)
            except socket.error:
                # nothing is listening there, so the port is usable
                return port
            s.close()
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port))

    configured_port = int(config.get('www', 'port').format(0)) + args.k
    port = test_port(args.port or configured_port)

    # prompt to save
    if configured_port != port:
        if bool_prompt("Change default baseport to {0}?".format(port - args.k),
                       default=True):
            config.set("www", "port", str(port - args.k))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'wb') as configfh:
                new_config.write(configfh)

    # hostname assignment
    host = args.host or config.get('www', 'host')

    # LDA objects
    lda_c = Corpus.load(corpus_file)
    lda_m = None
    lda_v = None

    def load_model(k):
        """Load the ``k``-topic model and its viewer into the globals."""
        global lda_m, lda_v
        lda_m = LDA.load(model_pattern.format(k))
        lda_v = LDAViewer(lda_c, lda_m)

    load_model(args.k)

    # label function imports
    # label_module stays None (or the bare module name) when the option is
    # missing or the import fails; the attribute lookups below then fall
    # back to the defaults.
    label_module = None
    try:
        label_module = config.get('main', 'label_module')
        label_module = import_module(label_module)
        print("imported label module")
        label_module.init(path, lda_v, context_type)
    except (ImportError, NoOptionError, AttributeError):
        pass

    try:
        label = label_module.label
        print("imported label function")
    except AttributeError:
        label = lambda x: x
        print("using default label function")

    try:
        id_fn = label_module.id_fn
        print("imported id function")
    except AttributeError:
        id_fn = def_label_fn
        print("using default id function")

    config_icons = config.get('www', 'icons').split(",")

    @route('/icons.js')
    def icons():
        # serve icons.js with the configured icon list appended
        with open(resource_filename(__name__, '../www/icons.js')) as icons_file:
            script = '{0}\n var icons = {1};'\
                .format(icons_file.read(), json.dumps(config_icons))
        return script

    # index page parameterization
    corpus_name = config.get('www', 'corpus_name')
    corpus_link = config.get('www', 'corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')

    # main:topics wins over main:topic_range when both are set
    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        # NOTE(review): eval() executes whatever the config file contains;
        # this is only safe while the file is trusted.
        topic_range = eval(config.get('main', 'topics'))
    # re-read www:port here: it may have been changed by the prompt above
    baseport = int(config.get('www', 'port').format(0))
    topic_range = [{'k': k, 'port': baseport + k} for k in topic_range]

    renderer = pystache.Renderer(escape=lambda u: u)

    @route('/')
    def index():
        response.set_header('Expires', _cache_date())

        # NOTE(review): the encoding keyword needs io.open / Python 3 open;
        # confirm which open() is in scope for this file.
        with open(resource_filename(__name__, '../www/index.mustache.html'),
                  encoding='utf-8') as tmpl_file:
            template = tmpl_file.read()
        return renderer.render(
            template, {
                'corpus_name': corpus_name,
                'corpus_link': corpus_link,
                'context_type': context_type,
                'topic_range': topic_range,
                'doc_title_format': doc_title_format,
                'doc_url_format': doc_url_format
            })

    @route('/<filename:path>')
    @_set_acao_headers
    def send_static(filename):
        return static_file(filename,
                           root=resource_filename(__name__, '../www/'))

    # config.get returns the raw option text, so "ssl = False" in the file
    # would be a truthy string; interpret it as a boolean explicitly.
    ssl_setting = str(config.get('main', 'ssl')).strip().lower()
    use_ssl = args.ssl or ssl_setting in ('1', 'yes', 'true', 'on')

    if use_ssl:
        certfile = args.certfile or config.get('ssl', 'certfile')
        keyfile = args.keyfile or config.get('ssl', 'keyfile')
        ca_certs = args.ca_certs or config.get('ssl', 'ca_certs')

        run(host=host,
            port=port,
            server=SSLWSGIRefServer,
            certfile=certfile,
            keyfile=keyfile,
            ca_certs=ca_certs)
    else:
        run(host=host, port=port)
Ejemplo n.º 14
0
def main(args):
    """
    Launch one ``vsm serve`` process per topic model and supervise them.

    The configuration at ``args.config_file`` supplies the topic models
    (``[main]`` ``topic_range`` or ``topics``) plus the host and base port
    (``[www]``).  Model ``k`` is served on ``baseport + k``.  If any of
    those ports already accepts connections, the user is prompted for a new
    base port and may save it back to the configuration file.  The function
    then waits for the first server to answer HTTP, optionally opens a
    browser, and blocks until SIGINT/SIGTERM stops the children.  If a
    child exits while starting up, the remaining children are terminated
    and the program exits with status 1.

    Every ``print`` call receives a single pre-formatted string so the
    output is the same under the print statement and the print function.

    :param args: parsed command-line arguments; ``config_file``,
        ``fulltext`` and ``browser`` are read here.
    :raises ValueError: if the configuration yields no topic models.
    """
    # CONFIGURATION PARSING
    # load in the configuration file
    config = ConfigParser({
        'certfile': None,
        'keyfile': None,
        'ca_certs': None,
        'ssl': False,
        'port': '8000',
        'host': '0.0.0.0',
        'icons': 'link',
        'corpus_link': None,
        'doc_title_format': None,
        'doc_url_format': None,
        'topic_range': None,
        'fulltext': 'false',
        'raw_corpus': None,
        'topics': None
    })
    config.read(args.config_file)

    # Start from None so a configuration that sets neither option is
    # reported clearly instead of surfacing as a NameError further down.
    topic_range = None
    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        # NOTE(review): eval() runs whatever expression the config file
        # contains; only launch with configuration files you trust.
        topic_range = eval(config.get('main', 'topics'))
    if not topic_range:
        raise ValueError(
            "No topics to serve: set 'topic_range' or 'topics' in the "
            "[main] section of {0}".format(args.config_file))
    print(topic_range)

    # LAUNCHING SERVERS
    def get_log_path(k):
        """Return the log path for model ``k``; None without [logging]."""
        if not config.has_section('logging'):
            return None
        return config.get('logging', 'path').format(k)

    def get_log_file(k):
        """Return the stdout target for the server process of model ``k``."""
        path = get_log_path(k)
        if path is None:
            # NOTE(review): nothing in this function reads from the pipe;
            # a very chatty child could block once the pipe buffer fills.
            return subprocess.PIPE
        log_dir = os.path.dirname(path)
        # A bare file name has no directory part; os.makedirs('') fails.
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)
        return open(path, 'a')

    def port_in_use(host, port):
        """Return True if something accepts TCP connections on the port."""
        try:
            s = socket.create_connection((host, port), 2)
        except socket.error:
            # Refused or timed out: nothing is listening there.
            return False
        s.close()
        return True

    def test_baseport(host, baseport, topic_range):
        """Prompt until ``baseport + k`` is free for every ``k``."""
        while True:
            conflict = None
            for k in topic_range:
                if port_in_use(host, baseport + k):
                    conflict = baseport + k
                    break
            if conflict is None:
                return baseport
            baseport = int_prompt(
                "Conflict on port {0}. Enter new base port: [CURRENT: {1}]"
                .format(conflict, baseport))

    host = config.get("www", "host")
    if host == '0.0.0.0':
        host = socket.gethostname()

    configured_port = int(config.get("www", "port").format(0))
    baseport = test_baseport(host, configured_port, topic_range)

    # prompt to save
    if configured_port != baseport:
        if bool_prompt("Change default baseport to {0}?".format(baseport),
                       default=True):
            # ConfigParser only fully supports string values.
            config.set("www", "port", str(baseport))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config_file, 'wb') as configfh:
                new_config.write(configfh)

    # Where os.setsid exists, each child starts its own session so that
    # os.killpg(p.pid, ...) below can target the child's process group.
    try:
        grp_fn = os.setsid
    except AttributeError:
        grp_fn = None
    fulltext = '--fulltext' if args.fulltext else ''
    # NOTE(review): the command is a shell string, so a config path with
    # spaces or shell metacharacters is not passed through intact.
    procs = [
        subprocess.Popen(
            "vsm serve -k {k} -p {port} {config_file} {fulltext}".format(
                k=k,
                port=(baseport + k),
                config_file=args.config_file,
                fulltext=fulltext),
            shell=True,
            stdout=get_log_file(k),
            stderr=subprocess.STDOUT,
            preexec_fn=grp_fn) for k in topic_range
    ]

    print("pid port")
    for proc, k in zip(procs, topic_range):
        print("{0} http://{1}:{2}/".format(proc.pid, host, baseport + k))

    # CLEAN EXIT AND SHUTDOWN OF SERVERS
    def signal_handler(signum, frame):
        """Forward the received signal to every child, then exit."""
        print("\n")
        for p, k in zip(procs, topic_range):
            print("Stopping {}-topic model (Process ID: {})".format(k, p.pid))
            # Cross-Platform Compatability
            if platform.system() == 'Windows':
                # devnull must be writable to absorb taskkill's output.
                with open(os.devnull, 'w') as devnull:
                    subprocess.call(['taskkill', '/F', '/T', '/PID',
                                     str(p.pid)],
                                    stdout=devnull,
                                    stderr=devnull)
            else:
                try:
                    os.killpg(p.pid, signum)
                except OSError:
                    # Process group already gone; keep stopping the rest.
                    pass

        sys.exit()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    port = baseport + topic_range[0]
    url = "http://{host}:{port}/".format(host=host, port=port)

    # TODO: Add enhanced port checking
    # The counter lives outside the loop so it accumulates across retries.
    wait_count = 0
    while True:
        try:
            urllib.urlopen(url).close()
        except Exception:
            # Not a bare except: the SystemExit raised by signal_handler
            # must propagate so Ctrl+C works while waiting.
            time.sleep(1)
            wait_count += 1
        else:
            print("Server successfully started")
            break

        if wait_count == 60:
            print("\nLaunching the server seems to be taking a long time.")
            print("This may be due to loading in a large corpus.")

            print("\nTo test launching a single model, press Ctrl+C to abort launch,")
            print("then use the `serve` command to find the error message:")
            print("\tvsm serve {config} -k {k}".format(config=args.config_file,
                                                       k=topic_range[0]))

        for proc, k in zip(procs, topic_range):
            if proc.poll() is not None:
                print("\nAn error has occurred launching the {}-topic model.".format(
                    k))
                log_path = get_log_path(k)
                if log_path is not None:
                    print("A log has been written to: {}\n".format(log_path))

                print("Use the `serve` command to debug errors:")
                print("\tvsm serve {config} -k {k}".format(
                    config=args.config_file, k=k))
                for p in procs:
                    if p.poll() is None:
                        try:
                            os.killpg(p.pid, signal.SIGTERM)
                        except AttributeError:
                            # Cross-Platform Compatability
                            subprocess.call(
                                ['taskkill', '/F', '/T', '/PID',
                                 str(p.pid)])
                        except OSError:
                            # Child exited between poll() and killpg().
                            pass

                sys.exit(1)

    if args.browser:
        webbrowser.open(url)
        print("TIP: Browser launch can be disabled with the '--no-browser' argument:")
        print("vsm launch --no-browser {0} \n".format(args.config_file))

    print("Press Ctrl+C to shutdown the Topic Explorer server")
    # Cross-platform Compatability
    try:
        signal.pause()
    except AttributeError:
        # Windows hack
        while True:
            time.sleep(1)
Ejemplo n.º 15
0
def main(args):
    """
    Launch one ``vsm serve`` process per topic model and wait for shutdown.

    The configuration at ``args.config_file`` supplies the topic models
    (``[main]`` ``topic_range`` or ``topics``) plus the host and base port
    (``[www]``).  Model ``k`` is served on ``baseport + k``.  If any of
    those ports already accepts connections, the user is prompted for a new
    base port and may save it back to the configuration file.  The function
    then waits for the first server to answer HTTP, optionally opens a
    browser, and blocks until SIGINT/SIGTERM stops the children.

    Every ``print`` call receives a single pre-formatted string so the
    output is the same under the print statement and the print function.

    :param args: parsed command-line arguments; ``config_file`` and
        ``browser`` are read here.
    :raises ValueError: if the configuration yields no topic models.
    """
    # CONFIGURATION PARSING
    # load in the configuration file
    config = ConfigParser({
        'certfile': None,
        'keyfile': None,
        'ca_certs': None,
        'ssl': False,
        'port': '8000',
        'host': '0.0.0.0',
        'icons': 'link',
        'corpus_link': None,
        'doc_title_format': None,
        'doc_url_format': None,
        'topic_range': None,
        'topics': None})
    config.read(args.config_file)

    # Start from None so a configuration that sets neither option is
    # reported clearly instead of surfacing as a NameError further down.
    topic_range = None
    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        # NOTE(review): eval() runs whatever expression the config file
        # contains; only launch with configuration files you trust.
        topic_range = eval(config.get('main', 'topics'))
    if not topic_range:
        raise ValueError(
            "No topics to serve: set 'topic_range' or 'topics' in the "
            "[main] section of {0}".format(args.config_file))
    print(topic_range)

    # LAUNCHING SERVERS
    def get_log_file(k):
        """Return the stdout target for the server process of model ``k``."""
        if not config.has_section('logging'):
            # NOTE(review): nothing in this function reads from the pipe;
            # a very chatty child could block once the pipe buffer fills.
            return subprocess.PIPE
        path = config.get('logging', 'path').format(k)
        log_dir = os.path.dirname(path)
        # A bare file name has no directory part; os.makedirs('') fails.
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)
        return open(path, 'a')

    def port_in_use(host, port):
        """Return True if something accepts TCP connections on the port."""
        try:
            s = socket.create_connection((host, port), 2)
        except socket.error:
            # Refused or timed out: nothing is listening there.
            return False
        s.close()
        return True

    def test_baseport(baseport, topic_range):
        """Prompt until ``baseport + k`` is free for every ``k``.

        Written as a loop: the earlier recursive retry dropped the
        ``topic_range`` argument and failed with a TypeError on the first
        port conflict.
        """
        host = config.get("www", "host")
        if host == '0.0.0.0':
            host = 'localhost'
        while True:
            conflict = None
            for k in topic_range:
                if port_in_use(host, baseport + k):
                    conflict = baseport + k
                    break
            if conflict is None:
                return baseport
            baseport = int_prompt(
                "Conflict on port {0}. Enter new base port: [CURRENT: {1}]"
                .format(conflict, baseport))

    configured_port = int(config.get("www", "port").format(0))
    baseport = test_baseport(configured_port, topic_range)

    # prompt to save
    if configured_port != baseport:
        if bool_prompt("Change default baseport to {0}?".format(baseport),
                       default=True):
            # ConfigParser only fully supports string values.
            config.set("www", "port", str(baseport))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config_file, 'wb') as configfh:
                new_config.write(configfh)

    # Where os.setsid exists, each child starts its own session so that
    # os.killpg(p.pid, ...) below can target the child's process group.
    try:
        grp_fn = os.setsid
    except AttributeError:
        grp_fn = None
    # NOTE(review): the command is a shell string, so a config path with
    # spaces or shell metacharacters is not passed through intact.
    procs = [subprocess.Popen("vsm serve -k {k} -p {port} {config_file}".format(
        k=k, port=(baseport + k), config_file=args.config_file),
        shell=True, stdout=get_log_file(k), stderr=subprocess.STDOUT,
        preexec_fn=grp_fn) for k in topic_range]

    print("pid port")
    # The listing shows the host exactly as configured (e.g. 0.0.0.0).
    configured_host = config.get("www", "host")
    for proc, k in zip(procs, topic_range):
        print("{0} http://{1}:{2}/".format(
            proc.pid, configured_host, baseport + k))

    # CLEAN EXIT AND SHUTDOWN OF SERVERS
    def signal_handler(signum, frame):
        """Forward the received signal to every child, then exit."""
        print("\n")
        for p in procs:
            print("killing {0}".format(p.pid))
            # Cross-Platform Compatability
            try:
                os.killpg(p.pid, signum)
            except AttributeError:
                subprocess.call(['taskkill', '/F', '/T', '/PID', str(p.pid)])
            except OSError:
                # Process group already gone; keep stopping the rest.
                pass

        sys.exit()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    port = baseport + topic_range[0]
    host = configured_host
    if host == '0.0.0.0':
        host = 'localhost'
    url = "http://{host}:{port}/".format(host=host, port=port)

    # TODO: Add enhanced port checking
    # NOTE(review): this loop retries forever if the server never answers;
    # the only other way out is the signal handler.
    while True:
        try:
            urllib.urlopen(url).close()
        except Exception:
            # Not a bare except: the SystemExit raised by signal_handler
            # must propagate so Ctrl+C works while waiting.
            time.sleep(1)
        else:
            print("Server successfully started")
            break
    if args.browser:
        webbrowser.open(url)
        print("TIP: Browser launch can be disabled with the '--no-browser' argument:")
        print("vsm launch --no-browser {0} \n".format(args.config_file))

    print("Press Ctrl+C to shutdown the Topic Explorer server")
    # Cross-platform Compatability
    try:
        signal.pause()
    except AttributeError:
        # Windows hack
        while True:
            time.sleep(1)