Beispiel #1
0
def main(args):
    """Entry point for the metadata subcommand: add, list, extract, or
    HTRC-enrich the metadata of the corpus named in the config file."""
    from vsm.corpus import Corpus

    config = topicexplorer.config.read(args.config_file)
    args.corpus_path = config.get("main", "corpus_file")
    corpus = Corpus.load(args.corpus_path)
    context_type = config.get('main', 'context_type')

    if args.add:
        # merge rows from the CSV into the corpus metadata, then persist
        new_metadata = parse_metadata_from_csvfile(args.add, context_type)
        corpus = add_metadata(corpus, context_type, new_metadata,
                              force=args.force, rename=args.rename)
        corpus.save(args.corpus_path)
    if args.list:
        extract_labels(corpus, context_type, args.list)
    if args.extract:
        extract_metadata(corpus, context_type, args.extract)
    if args.htrc:
        # enriching with HTRC data also rewrites the config file
        config = add_htrc_metadata(config, corpus=corpus)
        with open(args.config_file, "w") as configfh:
            config.write(configfh)
Beispiel #2
0
def get_host_port(args):
    """
    Returns the hostname and port number.

    Resolves the port from args/config, probes it for conflicts (prompting
    for a replacement unless args.quiet), and offers to persist a changed
    port back to the config file.
    """
    import topicexplorer.config
    config = topicexplorer.config.read(args.config)

    # automatic port assignment
    def test_port(port):
        try:
            host = args.host or config.get("www", "host")
            if host == '0.0.0.0':
                host = 'localhost'
            try:
                # if a connection succeeds, something already listens there
                s = socket.create_connection((host, port), 2)
                s.close()
                raise IOError("Socket connectable on port {0}".format(port))
            except socket.error:
                pass  # connection refused -> port is free
            return port
        except IOError:
            if not args.quiet:
                port = int_prompt(
                    "Conflict on port {0}. Enter new port:".format(port))
                return test_port(port)
            else:
                raise IOError(
                    "Conflict on port {0}. Try running with -p to manually set new port.".format(port))

    port = args.port or int(config.get('www', 'port').format(0))
    port = test_port(port)

    # prompt to save
    if (int(config.get("www", "port").format(0))) != port:
        if not args.quiet and bool_prompt(
            "Change default baseport to {0}?".format(port), default=True):
            config.set("www", "port", text(port))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            # BUG FIX: previously `config.read_file(config_string)` was
            # called, so new_config stayed empty and an empty config file
            # was written to disk.
            new_config = ConfigParser()
            new_config.read_file(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'w') as configfh:
                new_config.write(configfh)

    # hostname assignment
    host = args.host or config.get('www', 'host')
    return host, port
Beispiel #3
0
def init(app, config_file):
    """Build the module-level ``metadata`` map from a BibTeX library.

    Keys are document-file basenames extracted from each entry's 'file'
    field; values are plain-text citations (falling back to the .bib
    filename when formatting fails).
    """
    global metadata
    config = topicexplorer.config.read(config_file)

    try:
        filename = config.get('bibtex', 'path')
    except ConfigParserError:
        # no explicit [bibtex] path: default to library.bib next to the model
        model_path = config.get('main', 'path')
        filename = os.path.join(model_path, 'library.bib')

    print("Loading Bibtex metadata from", filename)
    bib = parse_file(filename)

    metadata = dict()
    for entry in bib.entries:
        # 'file' field looks like ':path:pdf' (Mendeley-style) — strip the
        # ':pdf' tag and the leading ':' — TODO confirm against real data
        key = '/' + bib.entries[entry].fields.get('file', '').replace(
            ':pdf', '')[1:]
        if 'C$\\backslash$:' in key:
            # Windows path with a LaTeX-escaped drive letter: remove the
            # escape, drop the leading separator, normalize separators
            key = key.replace('C$\\backslash$:', '')
            key = key[1:]
            key = os.path.normpath(key)
        key = os.path.basename(key)
        try:
            # [3:] strips the leading "[1]"-style citation label — TODO confirm
            citation = pybtex.format_from_file(filename,
                                               style='plain',
                                               output_backend='text',
                                               citations=[entry])[3:]
            metadata[key] = citation
        except PybtexError:
            # unformattable entry: fall back to the library path itself
            metadata[key] = filename
Beispiel #4
0
def init(app, config_file):
    """Populate the module-level ``metadata`` dict with plain-text citations
    keyed by document-file basename, parsed from a BibTeX library."""
    global metadata
    config = topicexplorer.config.read(config_file)

    try:
        filename = config.get('bibtex', 'path')
    except ConfigParserError:
        # no [bibtex] section/option: use library.bib beside the model files
        filename = os.path.join(config.get('main', 'path'), 'library.bib')

    print("Loading Bibtex metadata from", filename)
    bib = parse_file(filename)

    metadata = dict()
    for entry in bib.entries:
        file_field = bib.entries[entry].fields.get('file', '')
        key = '/' + file_field.replace(':pdf', '')[1:]
        if 'C$\\backslash$:' in key:
            # LaTeX-escaped Windows drive prefix: strip it and normalize
            key = key.replace('C$\\backslash$:', '')
            key = os.path.normpath(key[1:])
        key = os.path.basename(key)
        try:
            citation = pybtex.format_from_file(
                filename, style='plain', output_backend='text',
                citations=[entry])[3:]
            metadata[key] = citation
        except PybtexError:
            # could not format this entry — store the library path instead
            metadata[key] = filename
Beispiel #5
0
def add_htrc_metadata(config, corpus=None, corpus_filename=None):
    """Configure ``config`` for HathiTrust display and, given a corpus,
    download its volume metadata next to the corpus file.

    Returns the (mutated) config object.
    """
    import htrc.metadata

    config.set("main", "label_module", "topicexplorer.extensions.htrc")
    config.set("www", "doc_title_format", '<a href="{1}">{0}</a>')
    config.set("www", "doc_url_format", 'http://hdl.handle.net/2027/{0}')
    config.set("www", "icons", "htrcbook,link")
    # BUG FIX: ConfigParser.set requires a string value; a bool True raised
    # TypeError here (the duplicate implementation already used "True").
    config.set("main", "htrc", "True")

    if corpus_filename:
        corpus = Corpus.load(corpus_filename)
        config.set("main", "context_type", corpus.context_types[0])

    if corpus:
        ctx_type = config.get("main", "context_type")
        label_name = doc_label_name(ctx_type)
        # volume ids to fetch metadata for
        ids = corpus.view_metadata(ctx_type)[label_name]

        # write metadata JSON next to the corpus file
        htrc_metapath = os.path.abspath(config.get("main", "corpus_file"))
        htrc_metapath = os.path.join(
            os.path.dirname(htrc_metapath),
            os.path.basename(htrc_metapath) + '.metadata.json')

        print("Downloading metadata to ", htrc_metapath)
        htrc.metadata.get_metadata(ids, output_file=htrc_metapath)

        config.set("www", "htrc_metadata", htrc_metapath)

    return config
Beispiel #6
0
def add_htrc_metadata(config, corpus=None, corpus_filename=None):
    """Switch ``config`` to HathiTrust display settings; optionally download
    volume metadata for the given corpus and record its path. Returns config."""
    import htrc.metadata

    config.set("main", "label_module", "topicexplorer.extensions.htrc")
    # display settings for HTRC volumes
    display_settings = {
        "doc_title_format": '<a href="{1}">{0}</a>',
        "doc_url_format": 'http://hdl.handle.net/2027/{0}',
        "icons": "htrcbook,link",
    }
    for option, value in display_settings.items():
        config.set("www", option, value)
    config.set("main", "htrc", "True")

    if corpus_filename:
        corpus = Corpus.load(corpus_filename)
        config.set("main", "context_type", corpus.context_types[0])

    if corpus:
        ctx_type = config.get("main", "context_type")
        ids = corpus.view_metadata(ctx_type)[doc_label_name(ctx_type)]

        # metadata JSON lives next to the corpus file
        corpus_path = os.path.abspath(config.get("main", "corpus_file"))
        htrc_metapath = os.path.join(
            os.path.dirname(corpus_path),
            os.path.basename(corpus_path) + '.metadata.json')

        print("Downloading metadata to ", htrc_metapath)
        htrc.metadata.get_metadata(ids, output_file=htrc_metapath)

        config.set("www", "htrc_metadata", htrc_metapath)

    return config
Beispiel #7
0
def init(_app, config_file):
    """Extension init hook: stash the web app in a module global and read
    the model path from the config.

    NOTE(review): ``model_path`` is assigned but unused in this view — the
    function looks truncated or vestigial; confirm against the full module.
    """
    # (legacy signature was: viewer, config, args)
    global app, metadata
    app = _app

    config = topicexplorer.config.read(config_file)

    model_path = config.get('main', 'path')
Beispiel #8
0
def init(_app, config_file):
    """Extension init hook: record the web app in a module global and read
    the model path from the configuration file.

    NOTE(review): ``model_path`` is computed but never used here; this may
    be a truncated copy — verify against the original module.
    """
    # (legacy signature was: viewer, config, args)
    global app, metadata
    app = _app

    config = topicexplorer.config.read(config_file)

    model_path = config.get('main', 'path')
Beispiel #9
0
def cluster(n_clusters, config_file):
    """Fit Isomap then k-means on the corpus, persist the cluster CSV path
    into the config file, write the CSV, and return its filename."""
    from .cluster import dimensionReduce

    reducer = dimensionReduce(config_file)
    reducer.fit_isomap()
    reducer.fit_kmeans(int(n_clusters))

    print("writing model files for Isomap and kmeans\n")
    config = topicexplorer.config.read(config_file)
    corpus_filename = config.get("main", "corpus_file")
    # strip the extension from the corpus filename, append '-cluster.csv'
    stem = '.'.join(corpus_filename.split('.')[:-1])
    filename = stem + '-cluster.csv'

    # record the path in the config before writing the CSV itself
    config.set("main", "cluster", filename)
    with open(config_file, "w") as configfh:
        config.write(configfh)
    reducer.write(config.get("main", "cluster"))

    return filename
Beispiel #10
0
def init(app, config_file):
    """Serve raw corpus documents at /fulltext/<doc_id>."""
    global raw_corpus_path

    config = topicexplorer.config.read(config_file)
    raw_corpus_path = config.get('main', 'raw_corpus')

    def get_doc(doc_id):
        # stream the document straight from the raw corpus directory
        return static_file(doc_id, root=raw_corpus_path)

    app.route('/fulltext/<doc_id>')(get_doc)
Beispiel #11
0
def cluster(n_clusters, config_file):
    """Run dimensionality reduction (Isomap) and k-means clustering, then
    store the resulting CSV path in the config and write the CSV."""
    from .cluster import dimensionReduce

    model = dimensionReduce(config_file)
    model.fit_isomap()
    model.fit_kmeans(int(n_clusters))

    print("writing model files for Isomap and kmeans\n")
    config = topicexplorer.config.read(config_file)
    corpus_file = config.get("main", "corpus_file")
    # '<corpus-stem>-cluster.csv' next to the corpus file
    filename = '.'.join(corpus_file.split('.')[:-1]) + '-cluster.csv'

    config.set("main", "cluster", filename)
    with open(config_file, "w") as configfh:
        config.write(configfh)
    # write the CSV to the path just recorded in the config
    model.write(config.get("main", "cluster"))

    return filename
Beispiel #12
0
def init(app, config_file):
    """Register /fulltext/<doc_id> to serve files from the raw corpus
    directory (defaulting to 'ap/' when the option is unset)."""
    global raw_corpus_path

    config = topicexplorer.config.read(config_file)
    raw_corpus_path = config.get('main', 'raw_corpus', fallback='ap/')

    def get_doc(doc_id):
        return static_file(doc_id, root=raw_corpus_path)

    app.route('/fulltext/<doc_id>')(get_doc)
Beispiel #13
0
def init(_app, config_file):
    """Load HTRC volume metadata (JSON) into the module-level ``metadata``.

    The path comes from [www] htrc_metadata; an empty value or a missing
    section falls back to metadata.json one level above the model directory.
    """
    # (legacy signature was: viewer, config, args)
    global app, metadata
    app = _app

    config = topicexplorer.config.read(config_file)

    model_path = config.get('main', 'path')

    try:
        filename = config.get('www', 'htrc_metadata')
        if not filename:
            # empty option: route through the same fallback as NoSectionError
            raise ValueError("Not a valid htrc metadata path.")
    except (NoSectionError, ValueError):
        filename = os.path.join(model_path, '../metadata.json')

    print("Loading HTRC metadata from", filename)
    with open(filename) as f:
        metadata = json.load(f)
Beispiel #14
0
def init(_app, config_file):
    """Populate the module-level ``metadata`` from an HTRC metadata JSON file.

    Uses [www] htrc_metadata when present and non-empty; otherwise defaults
    to ../metadata.json relative to the model path.
    """
    # (legacy signature was: viewer, config, args)
    global app, metadata
    app = _app

    config = topicexplorer.config.read(config_file)

    model_path = config.get('main', 'path')

    try:
        filename = config.get('www', 'htrc_metadata')
        if not filename:
            # empty string: treat the same as a missing section
            raise ValueError("Not a valid htrc metadata path.")
    except (NoSectionError, ValueError): 
        filename = os.path.join(model_path, '../metadata.json')

    print("Loading HTRC metadata from", filename)
    with open(filename) as f:
        metadata = json.load(f)
Beispiel #15
0
def main(args):
    """Metadata subcommand entry point: add (from CSV), list, extract, or
    HTRC-enrich metadata for the corpus named in the config file."""
    from vsm.corpus import Corpus

    config = topicexplorer.config.read(args.config_file)

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    context_type = config.get('main', 'context_type')

    if args.add:
        # merge CSV rows into the corpus metadata and persist the corpus
        metadata = parse_metadata_from_csvfile(args.add, context_type)
        c = add_metadata(c, context_type, metadata, force=args.force,
            rename=args.rename)
        c.save(args.corpus_path)
    if args.list:
        extract_labels(c, context_type, args.list)
    if args.extract:
        extract_metadata(c, context_type, args.extract)
    if args.htrc:
        # HTRC enrichment mutates the config, so it is rewritten to disk
        config = add_htrc_metadata(config, corpus=c)
        with open(args.config_file, "w") as configfh:
            config.write(configfh)
Beispiel #16
0
def init(_app, config_file):
    """Load ../metadata.json (relative to the model path) into the
    module-level ``metadata`` and store the web app globally."""
    global metadata, app
    app = _app

    config = topicexplorer.config.read(config_file)
    model_path = config.get('main', 'path')

    # fixed location: metadata.json one level above the model directory
    filename = os.path.join(model_path, '../metadata.json')
    print("Loading HTRC metadata from", filename)

    with open(filename) as f:
        metadata = json.load(f)
Beispiel #17
0
def main(args=None):
    """Export the corpus/models described by a config file into a .tez archive.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``config``, ``output`` and ``include_corpus``. The
        ``None`` default is kept for signature compatibility only.
    """
    # Fail fast with a clear message instead of an AttributeError below.
    if args is None:
        raise ValueError("args namespace with 'config'/'output' is required")

    # open config for reading
    config = topicexplorer.config.read(args.config)

    # clean up output file path
    if args.output is None:
        args.output = args.config.replace('.ini', '.tez')
    if not args.output.endswith('.tez'):
        args.output += '.tez'

    # path variables
    # NOTE(review): context_type is read but unused; kept in case the
    # config accessor's validation side effect is relied upon — confirm.
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')
    cluster_path = config.get('main', 'cluster')
    corpus_desc = config.get('main', 'corpus_desc')

    # topic variables
    # BUG FIX: topic_range was unbound (NameError at build_manifest) when
    # the 'topics' option was empty; default it explicitly.
    topic_range = None
    if config.get('main', 'topics'):
        # NOTE(review): eval() of a config value — acceptable only because
        # the .ini is a trusted local file; never point this at user input.
        topic_range = eval(config.get('main', 'topics'))
    if args.include_corpus:
        raw_corpus = config.get('main', 'raw_corpus')
    else:
        raw_corpus = None

    try:
        if config.getboolean('main', 'htrc'):
            htrc_metapath = config.get('www', 'htrc_metadata')
        else:
            htrc_metapath = None
    except Exception:  # narrowed from bare except: missing section/option
        htrc_metapath = None

    # get manifest for zip file
    filenames = build_manifest(
        args.config, corpus_file, model_pattern, topic_range, cluster_path,
        raw_corpus=raw_corpus, corpus_desc=corpus_desc,
        htrc_metapath=htrc_metapath)

    zip_files(args.output, filenames, args.include_corpus)
Beispiel #18
0
def absolutize_config_file(config_file, output_dir):
    """Rewrite every path option in ``config_file`` (itself relative to
    ``output_dir``) as an absolute path, then save the config in place.

    Replaces seven copy-pasted join/abspath/set stanzas with one helper.
    """
    config_file = os.path.join(output_dir, config_file)

    config = topicexplorer.config.read(config_file)

    def _absolutize(section, option, required=False):
        # Join the option's value onto output_dir and store it absolute.
        # Optional options are skipped when unset or the string 'None'.
        value = config.get(section, option)
        if required or (value is not None and value != 'None'):
            value = os.path.abspath(os.path.join(output_dir, value))
            config.set(section, option, value)

    _absolutize('main', 'corpus_file', required=True)
    _absolutize('main', 'model_pattern', required=True)
    _absolutize('main', 'cluster')
    _absolutize('main', 'path')
    _absolutize('main', 'raw_corpus')
    _absolutize('main', 'corpus_desc')
    _absolutize('www', 'htrc_metadata')

    with open(config_file, 'w', encoding='utf8') as configfile:
        config.write(configfile)
Beispiel #19
0
def absolutize_config_file(config_file, output_dir):
    """Anchor every path option of ``config_file`` (relative to
    ``output_dir``) as an absolute path and rewrite the file in place."""
    config_file = os.path.join(output_dir, config_file)

    config = topicexplorer.config.read(config_file)

    # mandatory path options: always rewritten
    for section, option in (('main', 'corpus_file'),
                            ('main', 'model_pattern')):
        value = config.get(section, option)
        value = os.path.abspath(os.path.join(output_dir, value))
        config.set(section, option, value)

    # optional path options: skipped when unset or the literal 'None'
    for section, option in (('main', 'cluster'),
                            ('main', 'path'),
                            ('main', 'raw_corpus'),
                            ('main', 'corpus_desc'),
                            ('www', 'htrc_metadata')):
        value = config.get(section, option)
        if value is not None and value != 'None':
            value = os.path.abspath(os.path.join(output_dir, value))
            config.set(section, option, value)

    with open(config_file, 'w', encoding='utf8') as configfile:
        config.write(configfile)
Beispiel #20
0
 def test_port(port):
     """Return ``port`` if nothing is listening on it; otherwise resolve
     the conflict interactively (or raise IOError in quiet mode).

     NOTE(review): relies on ``args`` and ``config`` from an enclosing
     scope not visible here — presumably get_host_port's closure; confirm.
     """
     try:
         host = args.host or config.get("www", "host")
         if host == '0.0.0.0':
             host = 'localhost'
         try:
             # a successful connection means the port is already in use
             s = socket.create_connection((host, port), 2)
             s.close()
             raise IOError("Socket connectable on port {0}".format(port))
         except socket.error:
             pass  # connection refused -> port is free
         return port
     except IOError:
         if not args.quiet:
             # recurse with the user-supplied replacement port
             port = int_prompt(
                 "Conflict on port {0}. Enter new port:".format(port))
             return test_port(port)
         else:
             raise IOError(
                 "Conflict on port {0}. Try running with -p to manually set new port.".format(port))
Beispiel #21
0
def create_relative_config_file(config_file, manifest, include_corpus=False):
    """Write a copy of ``config_file`` whose paths are relative to the
    common root of ``manifest`` and return the temp file's path.

    The original config file is left untouched; the caller is responsible
    for deleting the returned temporary file.
    """
    # Common root of all bundled files. On py3, append '/' so replace()
    # strips the separator as well; py2's commonprefix is character-based.
    if sys.version_info[0] == 3:
        root = os.path.commonpath(map(os.path.abspath, manifest)) + '/'
    else:
        root = os.path.commonprefix(map(os.path.abspath, manifest))

    config = topicexplorer.config.read(config_file)

    # path variables
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')
    cluster_path = config.get('main', 'cluster')
    path = config.get('main', 'path')
    raw_corpus = config.get('main', 'raw_corpus')
    corpus_desc = config.get('main', 'corpus_desc')

    config.set('main', 'corpus_file', corpus_file.replace(root, ''))
    config.set('main', 'model_pattern', model_pattern.replace(root, ''))
    if cluster_path is not None:
        config.set('main', 'cluster', cluster_path.replace(root, ''))
    if path is not None:
        config.set('main', 'path', path.replace(root, ''))
    if raw_corpus is not None and include_corpus:
        config.set('main', 'raw_corpus', raw_corpus.replace(root, ''))
    else:
        # NOTE(review): stdlib ConfigParser.set rejects non-str values;
        # this relies on topicexplorer.config's parser accepting None.
        config.set('main', 'raw_corpus', None)
    if corpus_desc is not None:
        config.set('main', 'corpus_desc', corpus_desc.replace(root, ''))
    try:
        if config.getboolean('main', 'htrc'):
            htrc_metapath = config.get('www', 'htrc_metadata')
            if htrc_metapath is not None:
                config.set('www', 'htrc_metadata', htrc_metapath.replace(root, ''))
    except Exception:
        # Narrowed from a bare `except:` — best-effort: a missing htrc
        # section/option just leaves the metadata path untouched.
        pass

    # NOTE(review): prefix= fails if config_file contains a path separator;
    # callers appear to pass a bare filename — confirm.
    tempfh = NamedTemporaryFile(prefix='tez.' + config_file, delete=False)
    temp_config_file = tempfh.name
    tempfh.close()
    with open(temp_config_file, 'w', encoding='utf-8') as tempfile:
        config.write(tempfile)

    return temp_config_file
Beispiel #22
0
def main(args):
    """Prepare (stoplist) a corpus: apply language, custom-file, length and
    frequency-based stopword filters, save the filtered corpus under a
    derived name, and point the config file at it.

    Exits with status 1 if the corpus was already prepared, and status 0
    when no stopwords end up being applied.
    """
    config = topicexplorer.config.read(args.config_file)

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    # a shrunken corpus means stoplisting already happened
    if c.original_length != len(c.corpus):
        print("Corpus has already been prepared. Proceed to training or")
        print("re-init the corpus to apply a different set of stopwords.")
        print("\nTIP: Train the LDA models with:")
        print("         topicexplorer train", args.config_file)
        sys.exit(1)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]

            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".
                      format(len(candidates),
                             's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)

    # filter words shorter than min_word_len characters
    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".
                  format(len(candidates), 's' if len(candidates) > 1 else '',
                         args.min_word_len))
            stoplist.update(candidates)

    # cache item counts
    items, counts = get_corpus_counts(c)
    # high-frequency filter: interactive prompt unless a threshold/percent
    # was given or quiet mode suppresses everything
    if args.high_filter is None and args.high_percent is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(c,
                                                       words=stoplist,
                                                       items=items,
                                                       counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter is None and args.high_percent is None and args.quiet:
        pass
    elif args.high_filter:
        candidates = get_candidate_words(c,
                                         args.high_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_percent:
        args.high_filter = get_closest_bin(c,
                                           1 - (args.high_percent / 100.),
                                           counts=counts)
        print(args.high_filter)
        candidates = get_candidate_words(c,
                                         args.high_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    # low-frequency filter: mirrors the high-frequency logic above
    if args.low_filter is None and args.low_percent is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(c,
                                                     words=stoplist,
                                                     items=items,
                                                     counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter is None and args.low_percent is None and args.quiet:
        pass
    elif args.low_filter:
        candidates = get_candidate_words(c,
                                         -1 * args.low_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)

    elif args.low_percent:
        args.low_filter = get_closest_bin(c,
                                          1 - (args.low_percent / 100.),
                                          reverse=True,
                                          counts=counts)
        print(args.low_filter)
        candidates = get_candidate_words(c,
                                         -1 * args.low_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if not stoplist:
        print("No stopwords applied.\n\n")

        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(
            len(stoplist), 's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        # Build '<dirname>[-nltk-<langs>][-freqN][-NM].npz'.
        # NOTE(review): the `languages` parameter is ignored — the closure
        # reads args.lang instead, although the caller passes ['en'];
        # confirm which is intended.
        corpus_name = [dirname]

        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))

        if lowfreq is not None and lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        if highfreq is not None and highfreq > 0:
            corpus_name.append('N%s' % highfreq)

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    # derive the new corpus filename from the original one
    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace(
        '.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter,
                              args.high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    # point the config at the prepared corpus; model_pattern must be
    # regenerated by training, so drop it
    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)
Beispiel #23
0
def create_app(args):
    """Build and return the topic-explorer web Application from a config file.

    NOTE(review): if the 'topics' option is empty, ``topic_range`` is never
    bound and the Application(...) call below raises NameError — confirm
    whether 'topics' is guaranteed non-empty by the init step.
    """
    config = topicexplorer.config.read(args.config)

    # path variables
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')
    cluster_path = config.get('main', 'cluster')

    # language customization
    lang = config.get('main', 'lang')

    # set topic_range
    # NOTE(review): eval() of a config value — trusted local file only
    if config.get('main', 'topics'):
        topic_range = eval(config.get('main', 'topics'))

    # get icons_list
    config_icons = config.get('www', 'icons').split(",")
    if args.fulltext or config.getboolean('www', 'fulltext'):
        if not any('fulltext' in icon
                   for icon in config_icons) and 'ap' not in config_icons:
            # determines what fulltext function to use depending on the pdf tag that
            # was added in the init.py file
            if (config.getboolean('www', 'pdf')):
                config_icons.insert(0, 'fulltext-pdf')
            else:
                config_icons.insert(0, 'fulltext-inline')

    # Create application object
    corpus_name = config.get('www', 'corpus_name')
    corpus_link = config.get('www', 'corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')
    home_link = config.get('www', 'home_link')
    label_module = config.get('main', 'label_module')
    corpus_path = config.get('main', 'raw_corpus')
    corpus_desc = config.get('main', 'corpus_desc')
    fulltext = args.fulltext or config.getboolean('www', 'fulltext')
    tokenizer = config.get('www', 'tokenizer')
    label_file = config.get('main', 'label_file')

    app = Application(corpus_file=corpus_file,
                      model_pattern=model_pattern,
                      topic_range=topic_range,
                      context_type=context_type,
                      label_module=label_module,
                      config_file=args.config,
                      corpus_path=corpus_path,
                      fulltext=fulltext,
                      lang=lang,
                      icons=config_icons,
                      corpus_name=corpus_name,
                      corpus_link=corpus_link if corpus_link != 'None' else '',
                      doc_title_format=doc_title_format,
                      doc_url_format=doc_url_format,
                      cluster_path=cluster_path,
                      corpus_desc=corpus_desc,
                      home_link=home_link,
                      tokenizer=tokenizer,
                      label_file=label_file)
    # host/port resolution is currently disabled; the caller runs the app
    """
    host, port = get_host_port(args) 
    """
    # app.run(host='0.0.0.0', port=8081)
    return app
Beispiel #24
0
# Script template: loads the corpus and topic-model settings for a
# topic-explorer session. ``$config_file`` below is a string.Template
# placeholder substituted before execution — TODO confirm the substitution
# mechanism against the caller.
from builtins import range
from vsm import *
from vsm.viewer.wrappers import doc_label_name

import os.path
from collections import defaultdict

# load in the configuration file
import topicexplorer.config
config_file = r"$config_file"
config = topicexplorer.config.read(config_file)

# load the corpus
if config.getboolean('main', 'sentences'):
    from vsm.extensions.ldasentences import CorpusSent
    c = CorpusSent.load(config.get('main', 'corpus_file'))
else:
    c = Corpus.load(config.get('main', 'corpus_file'))
context_type = config.get('main', 'context_type')
ctx_metadata = c.view_metadata(context_type)
all_ids = ctx_metadata[doc_label_name(context_type)]

# create topic model patterns
pattern = config.get('main', 'model_pattern')
# legacy option: comma-separated range() arguments
if config.get('main', 'topic_range'):
    topic_range = list(map(int, config.get('main', 'topic_range').split(',')))
    topic_range = list(range(*topic_range))
# newer 'topics' option overrides topic_range when both are set;
# eval() is safe here only because the config file is local and trusted
if config.get('main', 'topics'):
    topic_range = eval(config.get('main', 'topics'))
Beispiel #25
0
def main(args):
    """Prepare a corpus for training by building and applying a stoplist.

    Loads the corpus named in the config file, accumulates stopwords from
    (1) per-language stopword lists, (2) an optional custom stopword file,
    (3) a minimum-word-length filter, and (4) high-/low-frequency filters,
    applies the stoplist in place, saves the filtered corpus under a
    derived filename, and rewrites the config to point at the new corpus.

    Exits with status 1 if the corpus was already prepared, and with
    status 0 if the resulting stoplist is empty.
    """
    config = topicexplorer.config.read(args.config_file)

    # sentence-level corpora use the CorpusSent extension class
    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    # a length mismatch means a stoplist was already applied in place
    if c.original_length != len(c.corpus):
        print("Corpus has already been prepared. Proceed to training or")
        print("re-init the corpus to apply a different set of stopwords.")
        print("\nTIP: Train the LDA models with:")
        print("         topicexplorer train", args.config_file)
        sys.exit(1)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        # NOTE: rebinds the name `locale` from module to string ("en" etc.)
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    # keep only languages that actually yield stopword candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]

            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".format(
                    len(candidates), 's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)

    # filter words shorter than the requested minimum length
    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".format(
                len(candidates), 's' if len(candidates) > 1 else '', args.min_word_len))
            stoplist.update(candidates)


    # cache item counts
    items, counts = get_corpus_counts(c)
    # high-frequency filter: interactive prompt unless an explicit threshold
    # (--high-filter) or percentage (--high-percent) was given, or quiet mode
    # suppresses prompting entirely
    if args.high_filter is None and args.high_percent is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter is None and args.high_percent is None and args.quiet:
        pass
    elif args.high_filter:
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_percent:
        # translate a percentage into the closest concrete count threshold
        args.high_filter = get_closest_bin(c, 1 - (args.high_percent / 100.), counts=counts)
        print(args.high_filter)
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    # low-frequency filter: same prompt/threshold/percent structure as above
    if args.low_filter is None and args.low_percent is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(len(candidates),
                                                              's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter is None and args.low_percent is None and args.quiet:
        pass
    elif args.low_filter:
        # negative threshold selects words at or below the count
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            # NOTE(review): unlike the sibling branches, this message has no
            # singular/plural handling
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)

    elif args.low_percent:
        args.low_filter = get_closest_bin(c, 1 - (args.low_percent / 100.), reverse=True, counts=counts)
        print(args.low_filter)
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)



    if not stoplist:
        print("No stopwords applied.\n\n")

        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(len(stoplist),
                                                  's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        # Build the output filename encoding the applied filters, e.g.
        # "<dirname>-nltk-en-freq5-N2000.npz".
        # NOTE(review): the `languages` parameter is unused; the closure
        # reads args.lang instead -- confirm whether this is intentional.
        corpus_name = [dirname]

        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))

        if lowfreq is not None and lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        if highfreq is not None and highfreq > 0:
            corpus_name.append('N%s' % highfreq)

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    # strip any previous filter suffixes to recover the base corpus name
    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace('.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter, args.high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    # point the config at the filtered corpus and force model regeneration
    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)
Beispiel #26
0
def create_app(args):
    """Build and return the topic-explorer web ``Application``.

    Reads the INI configuration named by ``args.config`` and wires the
    corpus, model, and www display settings into an ``Application``
    instance. The caller is responsible for actually serving the app.

    :param args: parsed CLI namespace; uses ``args.config`` (path to the
        config file) and ``args.fulltext`` (boolean override for the
        ``www.fulltext`` setting).
    :return: a configured ``Application`` object.
    """
    config = topicexplorer.config.read(args.config)

    # path variables
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')
    cluster_path = config.get('main', 'cluster')

    # language customization
    lang = config.get('main', 'lang')

    # set topic_range
    # SECURITY: eval() executes arbitrary code from the config file;
    # acceptable only because the config is trusted local input.
    # NOTE(review): if 'topics' is empty, topic_range is never bound and
    # the Application(...) call below raises NameError -- confirm that
    # 'topics' is always written (e.g. by the train command) before serving.
    if config.get('main', 'topics'):
        topic_range = eval(config.get('main', 'topics'))

    # get icons_list
    config_icons = config.get('www', 'icons').split(",")
    if args.fulltext or config.getboolean('www', 'fulltext'):
        if not any('fulltext' in icon for icon in config_icons) and 'ap' not in config_icons:
            # determines what fulltext function to use depending on the pdf tag
            # that was added in the init.py file
            if (config.getboolean('www', 'pdf')):
                config_icons.insert(0, 'fulltext-pdf')
            else:
                config_icons.insert(0, 'fulltext-inline')

    # Create application object
    corpus_name = config.get('www', 'corpus_name')
    corpus_link = config.get('www', 'corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')
    home_link = config.get('www', 'home_link')
    label_module = config.get('main', 'label_module')
    corpus_path = config.get('main', 'raw_corpus')
    corpus_desc = config.get('main', 'corpus_desc')
    fulltext = args.fulltext or config.getboolean('www', 'fulltext')
    tokenizer = config.get('www', 'tokenizer')

    app = Application(corpus_file=corpus_file,
                      model_pattern=model_pattern,
                      topic_range=topic_range,
                      context_type=context_type,
                      label_module=label_module,
                      config_file=args.config,
                      corpus_path=corpus_path,
                      fulltext=fulltext,
                      lang=lang,
                      icons=config_icons,
                      corpus_name=corpus_name,
                      corpus_link=corpus_link if corpus_link != 'None' else '',
                      doc_title_format=doc_title_format,
                      doc_url_format=doc_url_format,
                      cluster_path=cluster_path,
                      corpus_desc=corpus_desc,
                      home_link=home_link,
                      tokenizer=tokenizer)

    return app
Beispiel #27
0
def main(args):
    """Train or continue training LDA topic models.

    Dispatches to cluster() when --cluster is given; otherwise loads the
    corpus, resolves the topic counts (``args.k``) and iteration count
    (prompting interactively unless quiet), then either continues training
    existing models or builds new ones, and finally records
    model_pattern/context_type/topics back into the config file.
    """
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    # sentence-level corpora use the CorpusSent extension class
    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    # resolve the topic numbers to train: config default, then prompt
    if args.k is None:
        try:
            # SECURITY: eval() on a config value -- trusted local input only
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
                if args.quiet:
                    args.k = [int(n) for n in default.split()]
            else:
                # treat an empty value the same as a missing option
                raise NoOptionError('main', 'topics')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print("\nTIP: number of topics can be specified with argument '-k N N N ...':")
                    print("         topicexplorer train %s -k %s\n" %\
                        (args.config_file, ' '.join(map(str, args.k))))
            except ValueError:
                print("Enter valid integers, separated by spaces!")

    # negative --processes means "all cores minus N"
    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    # continue training existing models when present, unless --rebuild;
    # quiet/--cont skip the confirmation prompt
    if (model_pattern is not None and not args.rebuild and (args.quiet or args.cont or
            bool_prompt("""Existing topic models found. You can continue training or start a new model. 
Do you want to continue training your existing models? """, default=True))):

        from vsm.model.lda import LDA
        # load one existing model just to read its iteration count
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet:    # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5), min=m.iteration)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet:      # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus, corpus_filename, model_path,
                         config.get("main", "context_type"),
                         new_models, n_iterations=args.iter,
                         n_proc=args.processes, seed=args.seed,
                         dry_run=args.dry_run)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes,
                                              dry_run=args.dry_run)

        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes, 
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet:    # pragma: no cover
            args.iter = int_prompt("Number of training iterations:", default=200)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet:      # pragma: no cover
            args.iter = 200

        # TODO: if only one context_type, make it just the one context type.
        ctxs = corpus.context_types
        if len(ctxs) == 1:
            args.context_type = ctxs[0]
        else:
            # prompt for a context type; the smallest context is the default
            # and is displayed upper-cased in the choice list
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " % contexts)
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]

                print("\nTIP: context type can be specified with argument '--context-type TYPE':")
                print("         topicexplorer train --context-type %s %s\n" % (args.context_type, args.config_file))


        print("\nTIP: This configuration can be automated as:")
        print("         topicexplorer train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type, 
                ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus, corpus_filename, model_path,
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        # drop any stale cluster file so it is regenerated for the new models
        if config.has_option("main", "cluster"):
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass


        with open(args.config_file, "w") as configfh:
            config.write(configfh)
Beispiel #28
0
def main(args):
    """Train or continue training LDA topic models.

    NOTE(review): this is a reformatted duplicate of the preceding train
    ``main`` -- the two should be reconciled.

    Dispatches to cluster() when --cluster is given; otherwise loads the
    corpus, resolves the topic counts (``args.k``) and iteration count
    (prompting interactively unless quiet), then either continues training
    existing models or builds new ones, and finally records
    model_pattern/context_type/topics back into the config file.
    """
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    # sentence-level corpora use the CorpusSent extension class
    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    # resolve the topic numbers to train: config default, then prompt
    if args.k is None:
        try:
            # SECURITY: eval() on a config value -- trusted local input only
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main",
                                                            "topics"))))
                if args.quiet:
                    args.k = [int(n) for n in default.split()]
            else:
                # treat an empty value the same as a missing option
                raise NoOptionError('main', 'topics')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print(
                        "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    )
                    print("         topicexplorer train %s -k %s\n" %\
                        (args.config_file, ' '.join(map(str, args.k))))
            except ValueError:
                print("Enter valid integers, separated by spaces!")

    # negative --processes means "all cores minus N"
    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    # continue training existing models when present, unless --rebuild;
    # quiet/--cont skip the confirmation prompt
    if (model_pattern is not None and not args.rebuild and
        (args.quiet or args.cont or bool_prompt(
            """Existing topic models found. You can continue training or start a new model. 
Do you want to continue training your existing models? """,
            default=True))):

        from vsm.model.lda import LDA
        # load one existing model just to read its iteration count
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet:  # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print("         topicexplorer train --iter %d %s\n" %
                  (args.iter, args.config_file))
        elif args.iter is None and args.quiet:  # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed,
                         dry_run=args.dry_run)

            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)

        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet:  # pragma: no cover
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print("         topicexplorer train --iter %d %s\n" %
                  (args.iter, args.config_file))
        elif args.iter is None and args.quiet:  # pragma: no cover
            args.iter = 200

        # TODO: if only one context_type, make it just the one context type.
        ctxs = corpus.context_types
        if len(ctxs) == 1:
            args.context_type = ctxs[0]
        else:
            # prompt for a context type; the smallest context is the default
            # and is displayed upper-cased in the choice list
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " %
                                              contexts)
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]

                print(
                    "\nTIP: context type can be specified with argument '--context-type TYPE':"
                )
                print("         topicexplorer train --context-type %s %s\n" %
                      (args.context_type, args.config_file))

        print("\nTIP: This configuration can be automated as:")
        print("         topicexplorer train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type,
                ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        # drop any stale cluster file so it is regenerated for the new models
        if config.has_option("main", "cluster"):
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass

        with open(args.config_file, "w") as configfh:
            config.write(configfh)