Esempio n. 1
0
def main():
    """Parse the command line, load the corpus, and start the web server.

    Command-line options: ``-p/--port`` (int, default 8000) and a positional
    ``corpus`` path.  The loaded corpus is stored in the module-level
    ``corpus`` global, then ``root`` is served with the ``paste`` server on
    host ``0.0.0.0``.
    """
    global corpus

    from argparse import ArgumentParser
    # The two imports below are only needed by the currently disabled
    # config-file and bibtex code paths; they are kept so import behavior
    # stays the same.
    from topicexplorer.lib.util import is_valid_configfile
    from argparse import Namespace

    # Build the command-line interface.  Config-file based loading is
    # disabled; the corpus path is taken directly from the command line.
    cli = ArgumentParser()
    cli.add_argument('-p', '--port', type=int, default=8000)
    cli.add_argument('corpus')
    options = cli.parse_args()

    # Load text model objects
    corpus = Corpus.load(options.corpus)

    # Launch server
    root.run(server='paste', host='0.0.0.0', port=options.port)
Esempio n. 2
0
def main(args):
    """Run the metadata operations selected in ``args`` on the configured corpus.

    The corpus path and context type are read from the ``main`` section of
    ``args.config_file``; the corpus path is also stored on
    ``args.corpus_path``.  Depending on which of ``args.add``, ``args.list``,
    ``args.extract`` and ``args.htrc`` are set, metadata is added (and the
    corpus saved), labels are listed, metadata is extracted, and the
    configuration is updated and rewritten.
    """
    from vsm.corpus import Corpus

    defaults = {"htrc": False, "sentences": "False"}
    config = ConfigParser(defaults)
    config.read(args.config_file)

    args.corpus_path = config.get("main", "corpus_file")
    corpus = Corpus.load(args.corpus_path)

    context_type = config.get('main', 'context_type')

    if args.add:
        # Merge metadata from the CSV file and persist the updated corpus.
        new_metadata = parse_metadata_from_csvfile(args.add, context_type)
        corpus = add_metadata(corpus, context_type, new_metadata,
                              force=args.force, rename=args.rename)
        corpus.save(args.corpus_path)

    if args.list:
        extract_labels(corpus, context_type, args.list)

    if args.extract:
        extract_metadata(corpus, context_type, args.extract)

    if args.htrc:
        # add_htrc_metadata returns the configuration to write back.
        config = add_htrc_metadata(config, corpus=corpus)
        with open(args.config_file, "w") as config_out:
            config.write(config_out)
def main(args):
    """Run the metadata operations selected in ``args`` on the configured corpus.

    The corpus path and context type are read from the ``main`` section of
    ``args.config_file``; the corpus path is also stored on
    ``args.corpus_path``.  Depending on which of ``args.add``, ``args.list``
    and ``args.extract`` are set, metadata is added (and the corpus saved),
    labels are listed, and metadata is extracted.
    """
    from vsm.corpus import Corpus

    defaults = {"htrc": False, "sentences": "False"}
    config = ConfigParser(defaults)
    config.read(args.config_file)

    args.corpus_path = config.get("main", "corpus_file")
    corpus = Corpus.load(args.corpus_path)

    context_type = config.get('main', 'context_type')

    if args.add:
        # Merge metadata from the CSV file and persist the updated corpus.
        new_metadata = parse_metadata_from_csvfile(args.add, context_type)
        corpus = add_metadata(corpus, context_type, new_metadata,
                              force=args.force, rename=args.rename)
        corpus.save(args.corpus_path)

    if args.list:
        extract_labels(corpus, context_type, args.list)

    if args.extract:
        extract_metadata(corpus, context_type, args.extract)
Esempio n. 4
0

def nested_arr_to_np(arr, arrarr=False):
    """Copy a nested iterable into a numpy array.

    Every row of ``arr`` is first copied element by element into a fresh
    list.  When ``arrarr`` is true each row is additionally converted to a
    numpy array before the outer ``np.array`` call.
    """
    rows = [list(row) for row in arr]
    if arrarr:
        rows = [np.array(row) for row in rows]
    return np.array(rows)


if __name__ == '__main__':
    # Ad-hoc driver: load the church corpus, run lda_run on it and save the
    # resulting model.
    from vsm.corpus import Corpus

    ctx = 'document'
    path = '../org/knowceans/gibbstest/'
    # java can't process '..' in the path.
    writepath = '/home/doori/inpho/org/knowceans/gibbstest/'
    meta_file = writepath + 'church-meta.txt'

    c = Corpus.load(path + 'church_corp.npz')

    gw, m = lda_run(c, path + 'churchcorp.txt', ctx, 10000, 2,
                    meta_file, 0.01, 0.01)

    save(m, ctx, writepath + 'church_lda.npz', meta_file)
Esempio n. 5
0
def main(args):
    """Configure and launch the web server for a single topic model.

    Reads the configuration file ``args.config``, picks a port (prompting
    when the candidate port is already connectable), loads the corpus and
    the LDA model for ``args.k`` topics into module-level globals, sets up
    the label/id functions, registers the routes and finally calls ``run``,
    with SSL parameters when SSL is requested.
    """
    global context_type, lda_c, lda_m, lda_v, label, id_fn
    
    # load in the configuration file
    # The dict supplies fallback values for options missing from the file.
    config = ConfigParser({
        'certfile' : None,
        'keyfile' : None,
        'ca_certs' : None,
        'ssl' : False,
        'port' : '8000',
        'host' : '0.0.0.0',
        'topic_range' : '{0},{1},1'.format(args.k, args.k+1),
        'icons': 'link',
        'corpus_link' : None,
        'doc_title_format' : None,
        'doc_url_format' : None,
        'topics': None})
    config.read(args.config)

    # path variables
    path = config.get('main', 'path')
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern') 

    # automatic port assignment

    def test_port(port):
        # Returns `port` if nothing accepts a connection on it; otherwise
        # prompts for another port and re-checks that one recursively.
        try:
            host = args.host or config.get("www","host")
            # probe localhost when the server binds to all interfaces
            if host == '0.0.0.0':
                host = 'localhost'
            try:
                s = socket.create_connection((host,port), 2)
                s.close()
                # a successful connection means something already listens here
                raise IOError("Socket connectable on port {0}".format(port))
            except socket.error:
                # connection failed: the port is treated as free
                pass
            return port
        except IOError:
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port)) 
            return test_port(port)

    # args.port wins when set; otherwise base port from the config plus k.
    # NOTE(review): .format(0) presumably fills a '{0}' placeholder in the
    # configured port string — confirm.
    port = args.port or int(config.get('www','port').format(0)) + args.k
    port = test_port(port)
    
    # prompt to save
    # Only asked when the final port differs from the configured base + k.
    if (int(config.get("www","port").format(0)) + args.k) != port:
        if bool_prompt("Change default baseport to {0}?".format(port - args.k),
                       default=True):
            config.set("www","port", str(port - args.k))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            # (everything before the "[main]" header is dropped)
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'wb') as configfh:
                new_config.write(configfh)


    # hostname assignment
    host = args.host or config.get('www','host')

    # LDA objects
    lda_c = Corpus.load(corpus_file)
    lda_m = None
    lda_v = None
    def load_model(k):
        # Loads the model file for k topics and rebuilds the viewer; both are
        # stored in the module-level globals.
        global lda_m, lda_v
        lda_m = LDA.load(model_pattern.format(k))
        lda_v = LDAViewer(lda_c, lda_m)

    load_model(args.k)

    # label function imports
    # Best effort: a missing option, failed import or missing init() is
    # silently ignored and the defaults below are used instead.
    try:
        label_module = config.get('main', 'label_module')
        label_module = import_module(label_module)
        print "imported label module"
        label_module.init(config.get('main','path'), lda_v, context_type)
    except (ImportError, NoOptionError, AttributeError):
        pass

    # label_module is unbound here if config.get raised above (hence
    # UnboundLocalError), or still a plain string if the import failed
    # (hence AttributeError).
    try:
        label = label_module.label
        print "imported label function"
    except (AttributeError, UnboundLocalError):
        label = lambda x: x
        print "using default label function"
        
    try:
        id_fn = label_module.id_fn
        print "imported id function"
    except (AttributeError, UnboundLocalError):
        id_fn = def_label_fn
        print "using default id function"

    config_icons = config.get('www','icons').split(",")

    @route('/icons.js')
    def icons():
        # Serves the static icons.js followed by a JS variable holding the
        # configured icon list.
        with open(resource_filename(__name__, '../www/icons.js')) as icons:
            text = '{0}\n var icons = {1};'\
                .format(icons.read(), json.dumps(config_icons))
        return text


    # index page parameterization
    corpus_name = config.get('www','corpus_name')
    corpus_link = config.get('www','corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')

    # 'topic_range' is parsed as "start,stop,step"; an explicit 'topics'
    # option, when set, overrides it.
    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        # NOTE(review): eval() executes whatever the config file contains;
        # consider ast.literal_eval if the file is not fully trusted.
        topic_range = eval(config.get('main', 'topics'))
    # one entry per model: its k and the port derived from the base port
    topic_range = [{'k' : k, 'port' : int(config.get('www','port').format(0)) + k} 
                        for k in topic_range] 

    # escape is the identity function, so rendered values are not escaped
    renderer = pystache.Renderer(escape=lambda u: u)

    @route('/')
    def index():
        # Renders the index template with the corpus/page parameters above.
        response.set_header('Expires', _cache_date())

        with open(resource_filename(__name__, '../www/index.mustache.html'),
                  encoding='utf-8') as tmpl_file:
            template = tmpl_file.read()
        return renderer.render(template, 
            {'corpus_name' : corpus_name,
             'corpus_link' : corpus_link,
             'context_type' : context_type,
             'topic_range' : topic_range,
             'doc_title_format' : doc_title_format,
             'doc_url_format' : doc_url_format})


    @route('/<filename:path>')
    @_set_acao_headers
    def send_static(filename):
        # Serves any other path as a static file from the www directory.
        return static_file(filename, root=resource_filename(__name__, '../www/'))

    # NOTE(review): config.get returns a string when the option comes from
    # the file, so a literal "False" would be truthy here — confirm intended.
    if args.ssl or config.get('main', 'ssl'):
        certfile = args.certfile or config.get('ssl', 'certfile')
        keyfile = args.keyfile or config.get('ssl', 'keyfile')
        ca_certs = args.ca_certs or config.get('ssl', 'ca_certs')

        run(host=host, port=port, server=SSLWSGIRefServer,
            certfile=certfile, keyfile=keyfile, ca_certs=ca_certs)
    else:
        run(host=host, port=port)
Esempio n. 6
0
 def _load_corpus(self, corpus_file):
     """Load the corpus and cache the labels of the current context type.

     ``Corpus.load`` is called with ``load_corpus=False``.  The result is
     stored on ``self.c``; ``self.labels`` is the ``self.label_name`` field
     of the metadata view for ``self.context_type``.
     """
     self.c = Corpus.load(corpus_file, load_corpus=False)
     context_metadata = self.c.view_metadata(self.context_type)
     self.labels = context_metadata[self.label_name]
Esempio n. 7
0
def main(args):
    config = ConfigParser()
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main",
                                                            "topics"))))
            else:
                raise NoOptionError
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = raw_input(
                "Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    print "         vsm train %s -k %s\n" %\
                             (args.config_file, ' '.join(map(str, args.k)))
            except ValueError:
                print "Enter valid integers, separated by spaces!"

    if args.processes < 0:
        args.processes = multiprocessing.cpu_count() + args.processes

    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):

        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed)

            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file)

        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " %
                                              contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]

            print "\nTIP: context type can be specified with argument '--context-type TYPE':"
            print "         vsm train --context-type %s %s\n" % (
                args.context_type, args.config_file)

        print "\nTIP: This configuration can be automated as:"
        print "         vsm train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type,
                ' '.join(map(str, args.k)))

        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed)

    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    with open(args.config_file, "wb") as configfh:
        config.write(configfh)
Esempio n. 8
0
def main(args):
    from vsm.corpus import Corpus
    from vsm.model.lda import LDA

    config = ConfigParser()
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
            else:
                raise NoOptionError
        except NoOptionError:
            default = ' '.join(map(str, range(20,100,20)))

        while args.k is None:
            ks = raw_input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    print "         vsm train %s -k %s\n" %\
                             (args.config_file, ' '.join(map(str, args.k)))
            except ValueError:
                print "Enter valid integers, separated by spaces!"
        
    if args.processes < 0:
        args.processes = multiprocessing.cpu_count() + args.processes

    print "Loading corpus... "
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):
    
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration*1.5), min=m.iteration)
    
            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter, args.config_file)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main","topics"))
        if args.k != config_topics :
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)
        
            build_models(corpus, corpus_filename, model_path, 
                                         config.get("main", "context_type"),
                                         new_models, n_iterations=args.iter,
                                         n_proc=args.processes, seed=args.seed)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:", default=200)
    
            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter, args.config_file)
    
        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " % contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]
    
            print "\nTIP: context type can be specified with argument '--context-type TYPE':"
            print "         vsm train --context-type %s %s\n" % (args.context_type, args.config_file)
    
    
        print "\nTIP: This configuration can be automated as:"
        print "         vsm train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type, 
                ' '.join(map(str, args.k)))
        model_pattern = build_models(corpus, corpus_filename, model_path, 
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))
    
    if not args.dry_run:
        with open(args.config_file, "wb") as configfh:
            config.write(configfh)
Esempio n. 9
0
# Group tokens into propositions: a 'start' tag in sttest opens a new
# proposition and an 'end' tag in endtest closes it.
# NOTE(review): sttest, endtest, props and the initial value of inprop are
# not defined in the visible source — presumably set up earlier; confirm.
prop = []
for i in range(0, len(sttest)):
    if sttest[i][1] == 'start':
        prop = []
        inprop = True
    if endtest[i][1] == 'end':
        if prop != []:
            # the closing token is appended before the proposition is stored
            prop.append(sttest[i][0])
            props.append(prop)
            prop = []
        inprop = False
    if inprop:
        prop.append(sttest[i][0])

#Get topics for each prop
# Corpus and model paths come from the 4th and 5th command-line arguments,
# relative to exepath.
c = Corpus.load(exepath+sys.argv[4])
m = LDA.load(exepath+sys.argv[5])

v = LDAViewer(c,m)
# NOTE(review): this rebinds the name `stopwords` from the object providing
# .words() to the word list itself.
stopwords = stopwords.words('english')

# With an identity table and ASCII letters as the characters to delete
# (Python 2 str.translate), translate() returns only the non-letter
# characters of a word.
allowed_chars=string.ascii_letters
trans_table = string.maketrans('','')

print "Applying topic model"
#Remove props with only words in stoplist
vsmprops = []
# NOTE(review): `np` is used as a plain list here; if numpy is imported as
# np elsewhere in this file, this assignment shadows it — confirm.
np = []
for p in props:
    # keep lower-cased words that are not stopwords and consist solely of
    # ASCII letters
    np = [w.lower() for w in p if w.lower() not in stopwords and not w.translate(trans_table,allowed_chars)]
    # NOTE(review): the body of this `if` is missing from the visible source.
    if len(np) > 0:
Esempio n. 10
0
from collections import defaultdict
import numpy as np

from hyperbrain.parse import *
from vsm.corpus import Corpus

import sys
# The corpus file is given as the last command-line argument.
c = Corpus.load(sys.argv[-1])

# get all terms in corpus
abi_vocab = [word for word in c.words if word.startswith('abi:')]

# get all counts
# Maps the integer id parsed from each 'abi:<id>' term to the number of
# occurrences of that term in the corpus; unseen ids count as 0.
abi_counts = defaultdict(int)
for word in abi_vocab:
    # `abi_id` rather than `id`, so the builtin id() is not shadowed at
    # module level.
    abi_id = int(word.replace('abi:', ''))
    abi_counts[abi_id] = (c.corpus == c.words_int[word]).sum()


# calculate how many children there are of each node
def get_child_counts(key):
    """Return the count for ``key`` plus the counts of all its descendants.

    Descendants are found through ``children``; an entry that lists ``key``
    as its own child is skipped so the recursion does not loop on it.
    """
    if not children[key]:
        return abi_counts[key]

    total = abi_counts[key]
    for child_key in children[key]:
        if child_key != key:
            total += get_child_counts(child_key)
    return total

Esempio n. 11
0
 def _load_corpus(self, corpus_file):
     """Load the corpus and cache the labels of the current context type.

     ``Corpus.load`` is called with ``load_corpus=False``.  The result is
     stored on ``self.c``; ``self.labels`` is the ``self.label_name`` field
     of the metadata view for ``self.context_type``.
     """
     self.c = Corpus.load(corpus_file, load_corpus=False)
     context_metadata = self.c.view_metadata(self.context_type)
     self.labels = context_metadata[self.label_name]
Esempio n. 12
0
   

def nested_arr_to_np(arr, arrarr=False):
    """Copy a nested iterable into a numpy array.

    Every row of ``arr`` is first copied element by element into a fresh
    list.  When ``arrarr`` is true each row is additionally converted to a
    numpy array before the outer ``np.array`` call.
    """
    rows = [list(row) for row in arr]
    if arrarr:
        rows = [np.array(row) for row in rows]
    return np.array(rows)

    
if __name__ == '__main__':
    # Ad-hoc driver: load the church corpus, run lda_run on it and save the
    # resulting model.
    from vsm.corpus import Corpus

    ctx = 'document'
    path = '../org/knowceans/gibbstest/'
    # java can't process '..' in the path.
    writepath = '/home/doori/inpho/org/knowceans/gibbstest/'
    meta_file = writepath + 'church-meta.txt'

    c = Corpus.load(path + 'church_corp.npz')

    gw, m = lda_run(c, path + 'churchcorp.txt', ctx, 10000, 2,
                    meta_file, 0.01, 0.01)

    save(m, ctx, writepath + 'church_lda.npz', meta_file)
Esempio n. 13
0
def main(args):
    """Configure and launch the web server for a single topic model.

    Reads the configuration file ``args.config``, picks a port (prompting
    when the candidate port is already connectable), loads the corpus and
    the LDA model for ``args.k`` topics into module-level globals, sets up
    the label/id functions, registers the routes and finally calls ``run``,
    with SSL parameters when SSL is requested.
    """
    global context_type, lda_c, lda_m, lda_v, label, id_fn

    # load in the configuration file
    # The dict supplies fallback values for options missing from the file.
    config = ConfigParser({
        'certfile': None,
        'keyfile': None,
        'ca_certs': None,
        'ssl': False,
        'port': '8000',
        'host': '0.0.0.0',
        'topic_range': '{0},{1},1'.format(args.k, args.k + 1),
        'icons': 'link',
        'corpus_link': None,
        'doc_title_format': None,
        'doc_url_format': None,
        'topics': None
    })
    config.read(args.config)

    # path variables
    path = config.get('main', 'path')
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')

    # automatic port assignment

    def test_port(port):
        # Returns `port` if nothing accepts a connection on it; otherwise
        # prompts for another port and re-checks that one recursively.
        try:
            host = args.host or config.get("www", "host")
            # probe localhost when the server binds to all interfaces
            if host == '0.0.0.0':
                host = 'localhost'
            try:
                s = socket.create_connection((host, port), 2)
                s.close()
                # a successful connection means something already listens here
                raise IOError("Socket connectable on port {0}".format(port))
            except socket.error:
                # connection failed: the port is treated as free
                pass
            return port
        except IOError:
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port))
            return test_port(port)

    # args.port wins when set; otherwise base port from the config plus k.
    # NOTE(review): .format(0) presumably fills a '{0}' placeholder in the
    # configured port string — confirm.
    port = args.port or int(config.get('www', 'port').format(0)) + args.k
    port = test_port(port)

    # prompt to save
    # Only asked when the final port differs from the configured base + k.
    if (int(config.get("www", "port").format(0)) + args.k) != port:
        if bool_prompt("Change default baseport to {0}?".format(port - args.k),
                       default=True):
            config.set("www", "port", str(port - args.k))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            # (everything before the "[main]" header is dropped)
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'wb') as configfh:
                new_config.write(configfh)

    # hostname assignment
    host = args.host or config.get('www', 'host')

    # LDA objects
    lda_c = Corpus.load(corpus_file)
    lda_m = None
    lda_v = None

    def load_model(k):
        # Loads the model file for k topics and rebuilds the viewer; both are
        # stored in the module-level globals.
        global lda_m, lda_v
        lda_m = LDA.load(model_pattern.format(k))
        lda_v = LDAViewer(lda_c, lda_m)

    load_model(args.k)

    # label function imports
    # Best effort: a missing option, failed import or missing init() is
    # silently ignored and the defaults below are used instead.
    try:
        label_module = config.get('main', 'label_module')
        label_module = import_module(label_module)
        print "imported label module"
        label_module.init(config.get('main', 'path'), lda_v, context_type)
    except (ImportError, NoOptionError, AttributeError):
        pass

    # label_module is unbound here if config.get raised above (hence
    # UnboundLocalError), or still a plain string if the import failed
    # (hence AttributeError).
    try:
        label = label_module.label
        print "imported label function"
    except (AttributeError, UnboundLocalError):
        label = lambda x: x
        print "using default label function"

    try:
        id_fn = label_module.id_fn
        print "imported id function"
    except (AttributeError, UnboundLocalError):
        id_fn = def_label_fn
        print "using default id function"

    config_icons = config.get('www', 'icons').split(",")

    @route('/icons.js')
    def icons():
        # Serves the static icons.js followed by a JS variable holding the
        # configured icon list.
        with open(resource_filename(__name__, '../www/icons.js')) as icons:
            text = '{0}\n var icons = {1};'\
                .format(icons.read(), json.dumps(config_icons))
        return text

    # index page parameterization
    corpus_name = config.get('www', 'corpus_name')
    corpus_link = config.get('www', 'corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')

    # 'topic_range' is parsed as "start,stop,step"; an explicit 'topics'
    # option, when set, overrides it.
    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        # NOTE(review): eval() executes whatever the config file contains;
        # consider ast.literal_eval if the file is not fully trusted.
        topic_range = eval(config.get('main', 'topics'))
    # one entry per model: its k and the port derived from the base port
    topic_range = [{
        'k': k,
        'port': int(config.get('www', 'port').format(0)) + k
    } for k in topic_range]

    # escape is the identity function, so rendered values are not escaped
    renderer = pystache.Renderer(escape=lambda u: u)

    @route('/')
    def index():
        # Renders the index template with the corpus/page parameters above.
        response.set_header('Expires', _cache_date())

        with open(resource_filename(__name__, '../www/index.mustache.html'),
                  encoding='utf-8') as tmpl_file:
            template = tmpl_file.read()
        return renderer.render(
            template, {
                'corpus_name': corpus_name,
                'corpus_link': corpus_link,
                'context_type': context_type,
                'topic_range': topic_range,
                'doc_title_format': doc_title_format,
                'doc_url_format': doc_url_format
            })

    @route('/<filename:path>')
    @_set_acao_headers
    def send_static(filename):
        # Serves any other path as a static file from the www directory.
        return static_file(filename,
                           root=resource_filename(__name__, '../www/'))

    # NOTE(review): config.get returns a string when the option comes from
    # the file, so a literal "False" would be truthy here — confirm intended.
    if args.ssl or config.get('main', 'ssl'):
        certfile = args.certfile or config.get('ssl', 'certfile')
        keyfile = args.keyfile or config.get('ssl', 'keyfile')
        ca_certs = args.ca_certs or config.get('ssl', 'ca_certs')

        run(host=host,
            port=port,
            server=SSLWSGIRefServer,
            certfile=certfile,
            keyfile=keyfile,
            ca_certs=ca_certs)
    else:
        run(host=host, port=port)
Esempio n. 14
0
def main(args):
    from vsm.corpus import Corpus
    config = ConfigParser({"htrc": False})
    config.read(args.config_file)
    
    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)
    
    # check for htrc metadata
    if args.htrc or config.get("main","htrc"):
        htrc_langs = get_htrc_langs(args)
        if htrc_langs:
            args.lang.extend(new_langs)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # check for any new candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang) 

    stoplist = set() 
    # Apply stop words
    print " "
    for lang in args.lang:
        print "Applying", langs[lang], "stopwords"
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            candidates = [unidecode(word.strip()) for word in swf]
            if len(candidates):
                print "Applying custom stopword file to remove {} word{}.".format(len(candidates),
                's' if len(candidates) > 1 else '')
                stoplist.update(candidates)

    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print "Filtering {} small word{} with less than {} characters.".format(len(candidates),
                's' if len(candidates) > 1 else '', args.min_word_len)
            stoplist.update(candidates)
    
    if not args.special_chars:
        candidates = get_special_chars(c)
        if len(candidates):
            print "Filtering {} word{} with special characters.".format(len(candidates),
                's' if len(candidates) > 1 else '')
            stoplist.update(candidates)
   
    if not args.high_filter:
        high_filter, candidates = get_high_filter(args, c, words=stoplist)
        if len(candidates):
            print "Filtering {} high frequency word{}.".format(len(candidates),
                's' if len(candidates) > 1 else '')
            stoplist.update(candidates)
    else:
        high_filter = args.high_filter
        candidates = get_candidate_words(c,args.high_filter, sort=False)
        if len(candidates):
            print "Filtering {} high frequency word{}.".format(len(candidates),
                's' if len(candidates) > 1 else '')
            stoplist.update(candidates)

    if not args.low_filter:
        low_filter, candidates = get_low_filter(args, c, words=stoplist)
        if len(candidates):
            print "Filtering {} low frequency word{}.".format(len(candidates),
                's' if len(candidates) > 1 else '')
            stoplist.update(candidates)
    else:
        low_filter = args.low_filter
        candidates  = get_candidate_words(c, -1*args.low_filter, sort=False)
        if len(candidates):
            print "Filtering {} low frequency words.".format(len(candidates))
            stoplist.update(candidates)

    if stoplist:
        print "\n\nApplying {} stopword{}".format(len(stoplist),
                's' if len(stoplist) > 1 else '')
        c.in_place_stoplist(stoplist)
        print "\n"

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        items, counts = get_items_counts(c.corpus)

        corpus_name = [dirname]
        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))
        if lowfreq > 0:
            corpus_name.append('freq%s'%lowfreq)
        else:
            corpus_name.append('freq%s'%min(counts))

        if highfreq > 0:
            corpus_name.append('N%s'%highfreq)
        else:
            corpus_name.append('freq%s'%max(counts))

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name
   
    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace('.npz','')
    corpus_name = name_corpus(dirname, ['en'], low_filter, high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name) 
    c.save(args.corpus_path)

    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'wb') as configfh:
        config.write(configfh)