Example #1
def main():
    from argparse import ArgumentParser
    from topicexplorer.lib.util import is_valid_configfile

    # Construct argument parser
    parser = ArgumentParser()
    parser.add_argument('-p', '--port', type=int, default=8000)
    # parser.add_argument('config', help="Configuration File",
    #     type=is_valid_configfile)
    parser.add_argument('corpus')
    args = parser.parse_args()
    """
    # load in the configuration file
    config = ConfigParser({
        'raw_corpus' : None,
        'fulltext' : 'false'})
    config.read(args.config)
    
    # path variables
    corpus_file = config.get('main', 'corpus_file')
    
    # Load text model objects
    corpus = Corpus.load(corpus_file)
    """
    global corpus
    corpus = Corpus.load(args.corpus)
    from argparse import Namespace

    # bibtex.init(None, None, Namespace(bibtex='library.bib'))

    # Launch server
    port = args.port
    host = '0.0.0.0'
    root.run(server='paste', host=host, port=port)
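
As a quick, hypothetical illustration of the argument handling above (the corpus path is a placeholder, not a real file):

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('-p', '--port', type=int, default=8000)
parser.add_argument('corpus')

# Parse a sample command line instead of sys.argv; 'my_corpus.npz' is a
# hypothetical path to a Corpus saved with Corpus.save().
args = parser.parse_args(['-p', '8080', 'my_corpus.npz'])
print(args.port, args.corpus)   # 8080 my_corpus.npz
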
Example #2
def main(args):
    from vsm.corpus import Corpus

    config = ConfigParser({"htrc": False, "sentences": "False"})
    config.read(args.config_file)

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    context_type = config.get('main', 'context_type')

    if args.add:
        metadata = parse_metadata_from_csvfile(args.add, context_type)
        c = add_metadata(c,
                         context_type,
                         metadata,
                         force=args.force,
                         rename=args.rename)
        c.save(args.corpus_path)
    if args.list:
        extract_labels(c, context_type, args.list)
    if args.extract:
        extract_metadata(c, context_type, args.extract)
    if args.htrc:
        config = add_htrc_metadata(config, corpus=c)
        with open(args.config_file, "w") as configfh:
            config.write(configfh)
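
The config.get calls above assume an INI file with at least a [main] section; a minimal, hypothetical configuration might look like this (paths are placeholders):

[main]
corpus_file = /path/to/corpus.npz
context_type = document
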
Example #3
def walk_corpus(walk_dir,
                chunk_name='document',
                encoding='utf8',
                ignore=IGNORE,
                nltk_stop=True,
                stop_freq=1,
                add_stop=None,
                decode=False,
                verbose=1,
                simple=False,
                tokenizer=word_tokenize):

    filenames = []
    for root, dirs, files in os.walk(walk_dir):
        for file in files:
            filenames.append(os.path.join(root, file))

    # filter the blacklist (typically .json, .log, etc.)
    filenames = filter_by_suffix(filenames, ignore)
    files = []
    for filename in filenames:
        if encoding == 'detect':
            encoding = detect_encoding(filename)

        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())

    words, tok = dir_tokenize(files,
                              filenames,
                              chunk_name=chunk_name,
                              paragraphs=False,
                              verbose=verbose,
                              simple=simple,
                              tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
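
A minimal usage sketch for walk_corpus, assuming the function above is in scope and 'plain/' is a hypothetical directory tree of plain-text files:

# Build a Corpus from every file under 'plain/', recursing into
# subdirectories; NLTK stop words and words occurring only once are masked.
c = walk_corpus('plain/', chunk_name='document', nltk_stop=True, stop_freq=1)

print(c.context_types)   # context types produced by dir_tokenize
print(len(c.words))      # vocabulary size after the stoplist is applied
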
Example #4
def empty_corpus(context_type='document'):
    """
    Creates an empty Corpus with defined context_type.

    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string

    :returns: An empty Corpus with no words or context_data.

    :See Also: :class:`vsm.corpus.Corpus`
    """
    return Corpus([],
                  context_data=[np.array([], dtype=[('idx', int)])],
                  context_types=[context_type])
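
For example, an empty sentence-level corpus (a sketch using the function above):

c = empty_corpus(context_type='sentence')
print(c.context_types)   # ['sentence']
print(len(c.corpus))     # 0 -- no words at all
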
Example #5
def corpus_fromlist(ls, context_type='context', remove_empty=True):
    """
    Takes a list of lists or arrays containing strings or integers and
    returns a Corpus object. The label associated to a given context
    is `context_type` prepended to the context index.
    
    :param ls: List of lists or List of arrays containing strings or integers.
    :type ls: list

    :param context_type: A type of tokenization.
    :type context_type: string, optional

    :returns: A Corpus object built from `ls`.

    :See Also: :class:`vsm.corpus.Corpus`

    **Examples**

    >>> ls = [['a', 'b'], ['c'], ['d', 'e']]
    >>> c = corpus_fromlist(ls, context_type='sentence')
    >>> c.view_contexts('sentence', as_strings=True)
    [array(['a', 'b'], dtype='|S1'),
     array(['c'], dtype='|S1'),
     array(['d', 'e'], dtype='|S1')]
    >>> c.context_data
    [array([(2, 'sentence_0'), (3, 'sentence_1'), (5, 'sentence_2')], 
          dtype=[('idx', '<i8'), ('sentence_label', '|S10')])]
    """
    corpus = chain.from_iterable(ls)  #[w for ctx in ls for w in ctx]
    indices = np.cumsum([len(sbls) for sbls in ls])

    metadata = ['{0}_{1}'.format(context_type, i) for i in range(len(indices))]
    md_type = np.object_
    dtype = [('idx', int), (context_type + '_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    return Corpus(corpus,
                  context_data=context_data,
                  context_types=[context_type],
                  words_corpus=chain.from_iterable(copy(ctx) for ctx in ls),
                  remove_empty=remove_empty)
Example #6
def main(args):
    from vsm.corpus import Corpus

    config = ConfigParser({"htrc": False,
        "sentences": "False"})
    config.read(args.config_file)
    
    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)
    
    context_type = config.get('main', 'context_type')

    if args.add:
        metadata = parse_metadata_from_csvfile(args.add, context_type)
        c = add_metadata(c, context_type, metadata, force=args.force,
            rename=args.rename)
        c.save(args.corpus_path)
    if args.list:
        extract_labels(c, context_type, args.list)
    if args.extract:
        extract_metadata(c, context_type, args.extract)
Example #7
def corpus_from_strings(strings,
                        metadata=[],
                        decode=False,
                        nltk_stop=True,
                        stop_freq=0,
                        add_stop=None,
                        tokenizer=word_tokenize):
    """
    Takes a list of strings and returns a Corpus object whose document
    tokens are the strings.
    :param tokenizer: word tokenization function. Defaults to `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    """
    if decode:
        for i in range(len(strings)):
            if isinstance(strings[i], str):
                strings[i] = unidecode(strings[i])

    documents = [tokenizer(s) for s in strings]
    corpus = sum(documents, [])
    indices = np.cumsum([len(d) for d in documents])
    del documents

    if len(metadata) == 0:
        metadata = ['document_{0}'.format(i) for i in range(len(strings))]
    md_type = np.object_
    dtype = [('idx', int), ('document_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    c = Corpus(corpus, context_data=context_data, context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
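
A small usage sketch, assuming corpus_from_strings above is in scope; the input strings are arbitrary:

docs = ['the cat sat on the mat', 'the dog chased the cat']
c = corpus_from_strings(docs, nltk_stop=False)

# Each input string becomes one 'document' context, labelled document_0,
# document_1, ... unless explicit metadata is supplied.
print(c.view_contexts('document', as_strings=True))
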
Example #8
def main(args):
    global context_type, lda_c, lda_m, lda_v, label, id_fn

    # load in the configuration file
    config = ConfigParser({
        'certfile': None,
        'keyfile': None,
        'ca_certs': None,
        'ssl': False,
        'port': '8000',
        'host': '0.0.0.0',
        'topic_range': '{0},{1},1'.format(args.k, args.k + 1),
        'icons': 'link',
        'corpus_link': None,
        'doc_title_format': None,
        'doc_url_format': None,
        'topics': None
    })
    config.read(args.config)

    # path variables
    path = config.get('main', 'path')
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')

    # automatic port assignment

    def test_port(port):
        try:
            host = args.host or config.get("www", "host")
            if host == '0.0.0.0':
                host = 'localhost'
            try:
                s = socket.create_connection((host, port), 2)
                s.close()
                raise IOError("Socket connectable on port {0}".format(port))
            except socket.error:
                pass
            return port
        except IOError:
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port))
            return test_port(port)

    port = args.port or int(config.get('www', 'port').format(0)) + args.k
    port = test_port(port)

    # prompt to save
    if (int(config.get("www", "port").format(0)) + args.k) != port:
        if bool_prompt("Change default baseport to {0}?".format(port - args.k),
                       default=True):
            config.set("www", "port", str(port - args.k))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'wb') as configfh:
                new_config.write(configfh)

    # hostname assignment
    host = args.host or config.get('www', 'host')

    # LDA objects
    lda_c = Corpus.load(corpus_file)
    lda_m = None
    lda_v = None

    def load_model(k):
        global lda_m, lda_v
        lda_m = LDA.load(model_pattern.format(k))
        lda_v = LDAViewer(lda_c, lda_m)

    load_model(args.k)

    # label function imports
    try:
        label_module = config.get('main', 'label_module')
        label_module = import_module(label_module)
        print "imported label module"
        label_module.init(config.get('main', 'path'), lda_v, context_type)
    except (ImportError, NoOptionError, AttributeError):
        pass

    try:
        label = label_module.label
        print "imported label function"
    except (AttributeError, UnboundLocalError):
        label = lambda x: x
        print "using default label function"

    try:
        id_fn = label_module.id_fn
        print "imported id function"
    except (AttributeError, UnboundLocalError):
        id_fn = def_label_fn
        print "using default id function"

    config_icons = config.get('www', 'icons').split(",")

    @route('/icons.js')
    def icons():
        with open(resource_filename(__name__, '../www/icons.js')) as icons:
            text = '{0}\n var icons = {1};'\
                .format(icons.read(), json.dumps(config_icons))
        return text

    # index page parameterization
    corpus_name = config.get('www', 'corpus_name')
    corpus_link = config.get('www', 'corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')

    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        topic_range = eval(config.get('main', 'topics'))
    topic_range = [{
        'k': k,
        'port': int(config.get('www', 'port').format(0)) + k
    } for k in topic_range]

    renderer = pystache.Renderer(escape=lambda u: u)

    @route('/')
    def index():
        response.set_header('Expires', _cache_date())

        with open(resource_filename(__name__, '../www/index.mustache.html'),
                  encoding='utf-8') as tmpl_file:
            template = tmpl_file.read()
        return renderer.render(
            template, {
                'corpus_name': corpus_name,
                'corpus_link': corpus_link,
                'context_type': context_type,
                'topic_range': topic_range,
                'doc_title_format': doc_title_format,
                'doc_url_format': doc_url_format
            })

    @route('/<filename:path>')
    @_set_acao_headers
    def send_static(filename):
        return static_file(filename,
                           root=resource_filename(__name__, '../www/'))

    if args.ssl or config.get('main', 'ssl'):
        certfile = args.certfile or config.get('ssl', 'certfile')
        keyfile = args.keyfile or config.get('ssl', 'keyfile')
        ca_certs = args.ca_certs or config.get('ssl', 'ca_certs')

        run(host=host,
            port=port,
            server=SSLWSGIRefServer,
            certfile=certfile,
            keyfile=keyfile,
            ca_certs=ca_certs)
    else:
        run(host=host, port=port)
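
The config.get calls above read from the [main] and [www] sections (plus [ssl] when SSL is enabled); a hypothetical minimal configuration for this launcher could look like the following, with every value a placeholder:

[main]
path = /path/to/models
context_type = document
corpus_file = /path/to/corpus.npz
model_pattern = /path/to/models/lda_k{0}.npz
topic_range = 20,100,20

[www]
host = 0.0.0.0
port = 8000
icons = link
corpus_name = My Corpus
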
Example #9
def coll_corpus(coll_dir,
                encoding='utf8',
                ignore=IGNORE,
                nltk_stop=True,
                stop_freq=1,
                add_stop=None,
                decode=False,
                verbose=1,
                simple=False,
                tokenizer=word_tokenize):
    """
    `coll_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param coll_dir: Directory containing a collections of books
        which contain pages as plain-text files.
    :type coll_dir: string-like
    
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is ['.json',
        '.log','.pickle', '.DS_Store'].
    :type ignore: list of strings, optional

    :param nltk_stop: If `True` then the corpus object is masked 
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1 

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the plain-text files
        in `coll_dir`. Document tokens are named `documents`.
    """
    books = []
    book_names = os.listdir(coll_dir)
    book_names = filter_by_suffix(book_names, ignore)
    book_names.sort()

    for book_name in book_names:
        pages = []
        book_path = os.path.join(coll_dir, book_name)
        page_names = os.listdir(book_path)
        page_names = filter_by_suffix(page_names, ignore)
        page_names.sort()

        for page_name in page_names:
            page_file = book_name + '/' + page_name
            page_name = os.path.join(book_path, page_name)
            if encoding == 'detect':
                encoding = detect_encoding(page_name)
            try:
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))
            except UnicodeDecodeError:
                encoding = detect_encoding(page_name)
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))

        books.append(pages)

    words, tok = coll_tokenize(books,
                               book_names,
                               simple=simple,
                               tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    in_place_stoplist(c,
                      nltk_stop=nltk_stop,
                      freq=stop_freq,
                      add_stop=add_stop)

    return c
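
coll_corpus expects one subdirectory per book, each containing page files; a hedged usage sketch, assuming the function above is in scope and the layout below is hypothetical:

# my_collection/
#     book_one/page_001.txt, page_002.txt, ...
#     book_two/page_001.txt, ...
c = coll_corpus('my_collection', encoding='utf8', nltk_stop=True, stop_freq=1)
print(c.context_types)   # context types produced by coll_tokenize
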
Example #10
def json_corpus(json_file,
                doc_key,
                label_key,
                encoding='utf8',
                nltk_stop=False,
                stop_freq=0,
                add_stop=None,
                tokenizer=word_tokenize):
    """
    `json_corpus` is a convenience function for generating Corpus
    objects from a json file. It constructs a corpus, document labels
    and metadata, respectively, from the specified fields in the json file.

    `json_corpus` will perform word-level tokenization. 
    It will also strip punctuation and arabic numerals
    outside the range 1-29. All letters are made lowercase.

    :param json_file: Json file name containing documents and metadata.
    :type json_file: string-like
    
    :param doc_key: Name of the key for documents.
    :type doc_key: string-like

    :param label_key: Name of the key used for document labels. Labels are
        used when a viewer function outputs a list of documents. Any field
        other than `doc_key` and `label_key` is stored as metadata.
    :type label_key: string-like

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the basis of its
        collection frequency. Default is 0.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param tokenizer: word tokenization function. Defaults to `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, 
        :meth:`vsm.corpus.util.paragraph_tokenize`, 
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    import json

    if encoding == 'detect':
        encoding = detect_encoding(json_file)
    with open(json_file, 'r', encoding=encoding) as f:
        json_data = json.load(f)

    docs = []
    label = []
    metadata = []
    for i in json_data:
        # strip non-ASCII characters but keep a str for the tokenizer
        doc = i.pop(doc_key, None).encode('ascii', 'ignore').decode('ascii')
        docs.append(doc)
        label.append(i.pop(label_key, None))
        metadata.append(i)  # metadata are all the rest

    docs = [tokenizer(d) for d in docs]

    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    # add document label and metadata
    dtype = [('idx', np.array(tok).dtype), ('document_label', np.object_),
             ('metadata', np.array(metadata).dtype)
             ]  # todo: create separate dtype for each key?
    tok = np.array(list(zip(tok, label, metadata)), dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
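
A sketch of the JSON layout json_corpus expects: a top-level array of objects, one per document, where doc_key holds the text and label_key the label. The field names below ('text', 'title', 'year') are hypothetical:

# docs.json (hypothetical):
# [
#   {"text": "First document ...",  "title": "Doc 1", "year": 1900},
#   {"text": "Second document ...", "title": "Doc 2", "year": 1950}
# ]
c = json_corpus('docs.json', doc_key='text', label_key='title')

# Remaining fields ('year' here) are stored in the 'metadata' column of the
# document context_data.
print(c.view_metadata('document')['document_label'])
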
Example #11
from collections import defaultdict
import numpy as np

from hyperbrain.parse import *
from vsm.corpus import Corpus

import sys
c = Corpus.load(sys.argv[-1])

# get all terms in corpus
abi_vocab = [word for word in c.words if word.startswith('abi:')]

# get all counts
abi_counts = defaultdict(int)
for word in abi_vocab:
    id = int(word.replace('abi:', ''))
    count = (c.corpus == c.words_int[word]).sum()
    abi_counts[id] = count


# calculate how many children there are of each node
def get_child_counts(key):
    if children[key]:
        return abi_counts[key] + sum([
            get_child_counts(child_key)
            for child_key in children[key] if child_key != key
        ])
    else:
        return abi_counts[key]
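
get_child_counts relies on a children mapping (imported from hyperbrain.parse via the star import above) from a node id to the ids of its children. A self-contained illustration with made-up data:

# Made-up hierarchy and counts purely for illustration; in the real script
# `children` comes from hyperbrain.parse and `abi_counts` from the loop above.
children = {1: [2, 3], 2: [], 3: []}
abi_counts = defaultdict(int, {1: 5, 2: 2, 3: 1})
print(get_child_counts(1))   # 5 + 2 + 1 = 8
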

Example #12
def dir_corpus(plain_dir, chunk_name='article', encoding='utf8', 
               paragraphs=True, word_len=2, nltk_stop=True, stop_freq=1, 
               add_stop=None, corpus_sent=True, 
               ignore=['.log', '.pickle', '.xml'], decode=False, simple=False):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: String containing directory containing a 
        plain-text corpus.
    :type plain_dir: string-like
    
    :param chunk_name: The name of the tokenization corresponding 
        to individual files. For example, if the files are pages 
        of a book, one might set `chunk_name` to `pages`. Default 
        is `article`.
    :type chunk_name: string-like, optional
    
    :param paragraphs: If `True`, a paragraph-level tokenization 
        is included. Defaults to `True`.
    :type paragraphs: boolean, optional
    
    :param word_len: Filters words whose lengths are <= word_len.
        Default is 2.
    :type word_len: int, optional

    :param nltk_stop: If `True` then the corpus object is masked 
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param corpus_sent: If `True` a CorpusSent object is returned.
        Otherwise Corpus object is returned. Default is `True`. 
    :type corpus_sent: boolean, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is ['.log',
        '.pickle', '.xml'].
    :type ignore: list of strings, optional

    :returns: c : Corpus or CorpusSent
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.
    
    :See Also: :class: Corpus, :class: CorpusSent, :meth: dir_tokenize,
        :meth: apply_stoplist
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok, sent = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                              paragraphs=paragraphs)
    names, data = zip(*tok.items())
    
    if corpus_sent:
        c = CorpusSent(words, sent, context_data=data, context_types=names,
			remove_empty=False)
    else:
        c = Corpus(words, context_data=data, context_types=names)
    
    in_place_stoplist(c, nltk_stop=nltk_stop, add_stop=add_stop, freq=stop_freq)

    return c
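
A usage sketch for this dir_corpus variant, assuming it is in scope and 'articles/' is a hypothetical directory of plain-text files:

# With corpus_sent=True (the default) a CorpusSent object is returned, which
# keeps the original sentences alongside the word-level corpus.
c = dir_corpus('articles/', chunk_name='article', paragraphs=True)
print(c.context_types)   # e.g. file-, paragraph- and sentence-level contexts
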
Example #13
    def _load_corpus(self, corpus_file):
        self.c = Corpus.load(corpus_file, load_corpus=False)
        self.labels = self.c.view_metadata(self.context_type)[self.label_name]
Example #14
def nested_arr_to_np(arr, arrarr=False):
    # Convert a nested Python list into a numpy array. If `arrarr` is True,
    # each inner list is converted to its own numpy array before the rows
    # are stacked.
    outli = []
    for r in arr:
        inli = []
        for c in r:
            inli.append(c)

        if arrarr:
            inli = np.array(inli)
        outli.append(inli)

    return np.array(outli)

    
if __name__=='__main__':
    from vsm.corpus import Corpus
   
    path = '../org/knowceans/gibbstest/'
    c = Corpus.load(path+'church_corp.npz')

    writepath = '/home/doori/inpho/org/knowceans/gibbstest/'
    ctx = 'document'
    # java can't process '..' in the path.
    gw, m = lda_run(c, path+'churchcorp.txt', ctx, 10000, 2, 
                    writepath+'church-meta.txt', 0.01, 0.01)

    save(m, ctx, writepath+'church_lda.npz', writepath+'church-meta.txt')
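
A quick, self-contained illustration of the nested_arr_to_np helper defined above:

import numpy as np

rect = [[1, 2], [3, 4]]
print(nested_arr_to_np(rect))               # array([[1, 2], [3, 4]])
print(nested_arr_to_np(rect, arrarr=True))  # same values; each row is turned
                                            # into its own array before stacking
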
Example #15
prop = []
for i in range(0, len(sttest)):
    if sttest[i][1] == 'start':
        prop = []
        inprop = True
    if endtest[i][1] == 'end':
        if prop != []:
            prop.append(sttest[i][0])
            props.append(prop)
            prop = []
        inprop = False
    if inprop:
        prop.append(sttest[i][0])

#Get topics for each prop
c = Corpus.load(exepath+sys.argv[4])
m = LDA.load(exepath+sys.argv[5])

v = LDAViewer(c,m)
stopwords = stopwords.words('english')

allowed_chars=string.ascii_letters
trans_table = string.maketrans('','')

print "Applying topic model"
#Remove props with only words in stoplist
vsmprops = []
np = []
for p in props:
    np = [w.lower() for w in p if w.lower() not in stopwords and not w.translate(trans_table,allowed_chars)]
    if len(np) > 0:
Example #16
def toy_corpus(plain_corpus,
               is_filename=False,
               encoding='utf8',
               nltk_stop=False,
               stop_freq=0,
               add_stop=None,
               decode=False,
               metadata=None,
               autolabel=False,
               tokenizer=word_tokenize,
               simple=False):
    """
    `toy_corpus` is a convenience function for generating Corpus
    objects from a given string or a single file.

    `toy_corpus` will perform both word and document-level
    tokenization. It will also strip punctuation and arabic numerals
    outside the range 1-29. All letters are made lowercase.

    Document tokens are delimited by two or more line breaks. E.g.,

        <document 0>

        <document 1>

        ...

        <document n>

    where <document i> is any chunk of text to be tokenized by word.

    :param plain_corpus: String containing a plain-text corpus or a 
        filename of a file containing one.
    :type plain_corpus: string-like
    
    :param is_filename: If `True` then `plain_corpus` is treated like
        a filename. Otherwise, `plain_corpus` is presumed to contain 
        the corpus. Default is `False`.
    :type is_filename: boolean, optional

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the basis of its
        collection frequency. Default is 0.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param metadata: A list of strings providing metadata about the documents. If
        provided, must have length equal to the number of documents.
        Default is `None`.
    :type metadata: array-like, optional
    
    :param autolabel: A boolean specifying whether to automatically label
        documents by position in file. Default is `False`.
    :type autolabel: boolean, optional
    
    :param tokenizer: word tokenization function. Defaults to `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens
    
    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, 
        :meth:`vsm.corpus.util.paragraph_tokenize`, 
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if is_filename:
        if encoding == 'detect':
            encoding = detect_encoding(plain_corpus)

        with open(plain_corpus, 'r', encoding=encoding) as f:
            plain_corpus = f.read()

    if decode:
        plain_corpus = unidecode(plain_corpus)

    docs = paragraph_tokenize(plain_corpus)
    docs = [tokenizer(d) for d in docs]

    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    if not metadata and autolabel:
        metadata = ['Document {0}'.format(i) for i in range(len(tok))]

    if metadata:
        if len(metadata) != len(tok):
            msg = ('Metadata mismatch: metadata length is {0} and number '
                   'of documents is {1}'.format(len(metadata), len(tok)))
            raise Exception(msg)
        else:
            md_type = np.object_
            dtype = [('idx', np.array(tok).dtype), ('document_label', md_type)]
            tok = np.array(list(zip(tok, metadata)), dtype=dtype)
    else:
        dtype = [('idx', np.array(tok).dtype)]
        tok = np.array([(i, ) for i in tok], dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
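
A usage sketch for toy_corpus: documents are separated by blank lines, exactly as described in the docstring above:

text = """A first little document, with a handful of words.

A second little document, with a few more words."""

c = toy_corpus(text, autolabel=True)

# autolabel=True generates 'Document 0', 'Document 1', ... labels.
print(c.view_metadata('document')['document_label'])
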
Example #17
def main(args):
    from vsm.corpus import Corpus
    from vsm.model.lda import LDA

    config = ConfigParser()
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
            else:
                raise NoOptionError('topics', 'main')
        except NoOptionError:
            default = ' '.join(map(str, range(20,100,20)))

        while args.k is None:
            ks = raw_input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    print "         vsm train %s -k %s\n" %\
                             (args.config_file, ' '.join(map(str, args.k)))
            except ValueError:
                print "Enter valid integers, separated by spaces!"
        
    if args.processes < 0:
        args.processes = multiprocessing.cpu_count() + args.processes

    print "Loading corpus... "
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):
    
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration*1.5), min=m.iteration)
    
            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter, args.config_file)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main","topics"))
        if args.k != config_topics :
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)
        
            build_models(corpus, corpus_filename, model_path, 
                                         config.get("main", "context_type"),
                                         new_models, n_iterations=args.iter,
                                         n_proc=args.processes, seed=args.seed)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:", default=200)
    
            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter, args.config_file)
    
        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " % contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]
    
            print "\nTIP: context type can be specified with argument '--context-type TYPE':"
            print "         vsm train --context-type %s %s\n" % (args.context_type, args.config_file)
    
    
        print "\nTIP: This configuration can be automated as:"
        print "         vsm train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type, 
                ' '.join(map(str, args.k)))
        model_pattern = build_models(corpus, corpus_filename, model_path, 
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))
    
    if not args.dry_run:
        with open(args.config_file, "wb") as configfh:
            config.write(configfh)
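
The TIP messages above spell out the non-interactive form of this command; a hypothetical fully-specified run (config file name and values are placeholders) would be:

vsm train config.ini -k 20 40 60 --iter 200 --context-type document
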
Example #18
def file_corpus(filename,
                encoding='utf8',
                nltk_stop=True,
                stop_freq=1,
                add_stop=None,
                decode=False,
                simple=False,
                tokenizer=word_tokenize):
    """
    `file_corpus` is a convenience function for generating Corpus
    objects from a plain-text corpus contained in a single file.

    `file_corpus` will strip punctuation and arabic numerals outside
    the range 1-29. All letters are made lowercase.

    :param filename: File name of the plain text file.
    :type filename: string-like

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param nltk_stop: If `True` then the corpus object is masked 
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.
    
    :See Also: :class:`vsm.corpus.Corpus`, 
        :meth:`file_tokenize`, 
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if encoding == 'detect':
        encoding = detect_encoding(filename)
    try:
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()
    except UnicodeDecodeError:
        # fall back to a detected encoding and re-read the file
        encoding = detect_encoding(filename)
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()

    if decode:
        text = unidecode(text)

    words, tok = file_tokenize(text, simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
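
A minimal usage sketch for file_corpus, assuming it is in scope; the path is a placeholder:

c = file_corpus('corpus.txt', encoding='utf8', nltk_stop=True, stop_freq=1)
print(c.context_types)
print(len(c.words))   # vocabulary size after stoplisting
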
Example #19
def main(args):
    config = ConfigParser()
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main",
                                                            "topics"))))
            else:
                raise NoOptionError('topics', 'main')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = raw_input(
                "Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    print "         vsm train %s -k %s\n" %\
                             (args.config_file, ' '.join(map(str, args.k)))
            except ValueError:
                print "Enter valid integers, separated by spaces!"

    if args.processes < 0:
        args.processes = multiprocessing.cpu_count() + args.processes

    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):

        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed)

            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file)

        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " %
                                              contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]

            print "\nTIP: context type can be specified with argument '--context-type TYPE':"
            print "         vsm train --context-type %s %s\n" % (
                args.context_type, args.config_file)

        print "\nTIP: This configuration can be automated as:"
        print "         vsm train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type,
                ' '.join(map(str, args.k)))

        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed)

    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    with open(args.config_file, "wb") as configfh:
        config.write(configfh)
Example #20
def dir_corpus(plain_dir,
               chunk_name='article',
               encoding='utf8',
               paragraphs=True,
               ignore=IGNORE,
               nltk_stop=True,
               stop_freq=1,
               add_stop=None,
               decode=False,
               verbose=1,
               simple=False,
               tokenizer=word_tokenize):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: String containing directory containing a 
        plain-text corpus.
    :type plain_dir: string-like
    
    :param chunk_name: The name of the tokenization corresponding 
        to individual files. For example, if the files are pages 
        of a book, one might set `chunk_name` to `pages`. Default 
        is `article`.
    :type chunk_name: string-like, optional
    
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param paragraphs: If `True`, a paragraph-level tokenization 
        is included. Defaults to `True`.
    :type paragraphs: boolean, optional
    
    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is ['.json',
        '.log','.pickle', '.DS_Store'].
    :type ignore: list of strings, optional

    :param nltk_stop: If `True` then the corpus object is masked 
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1 

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.
    
    :See Also: :class:`vsm.corpus.Corpus`, 
            :meth:`dir_tokenize`, 
            :meth:`vsm.corpus.util.apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok = dir_tokenize(chunks,
                              filenames,
                              chunk_name=chunk_name,
                              paragraphs=paragraphs,
                              verbose=verbose,
                              simple=simple,
                              tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
Example #21
    def test_LdaCgsQuerySampler_init(self):

        old_corp = Corpus([], remove_empty=False)
        old_corp.corpus = np.array([ 0, 1, 1, 0, 0, 1 ], dtype='i')
        old_corp.context_data = [ np.array([(3, ), (3, )], dtype=[('idx', 'i')]) ]
        old_corp.context_types = [ 'document' ]
        old_corp.words = np.array([ '0', '1' ], dtype='i')
        old_corp.words_int = { '0': 0, '1': 1 }

        new_corp = Corpus([], remove_empty=False)
        new_corp.corpus = np.array([ 0, 0 ], dtype='i')
        new_corp.context_data = [ np.array([(2, )], dtype=[('idx', 'i')]) ]
        new_corp.context_types = [ 'document' ]
        new_corp.words = np.array([ '0', '1' ], dtype='i')
        new_corp.words_int = { '0': 0, '1': 1 }

        m = LdaCgsSeq(corpus=old_corp, context_type='document', K=2, V=2)
        m.Z[:] = np.array([0, 0, 0, 1, 1, 1], dtype='i')
        m.word_top[:] = np.array([[ 1.01, 2.01 ],
                                  [ 2.01, 1.01 ]], dtype='d')
        m.top_doc[:] = np.array([[ 3.01, 0.01 ], 
                                 [ 0.01, 3.01 ]], dtype='d')
        m.inv_top_sums[:] = 1. / m.word_top.sum(0)

        q = LdaCgsQuerySampler(m, new_corpus=new_corp, old_corpus=old_corp)
        self.assertTrue(q.V==2)
        self.assertTrue(q.K==2)
        self.assertTrue(len(q.corpus)==2)
        self.assertTrue((q.corpus==new_corp.corpus).all())
        self.assertTrue(len(q.indices)==1)
        self.assertTrue((q.indices==
                         new_corp.view_metadata('document')['idx']).all())
        self.assertTrue(q.word_top.shape==(2, 2))
        self.assertTrue((q.word_top==m.word_top).all())
        self.assertTrue(q.top_doc.shape==(2, 1))
        self.assertTrue((q.top_doc==[[ 0.01 ],
                                     [ 0.01 ]]).all())
        self.assertTrue(q.inv_top_sums.shape==(2, ))
        self.assertTrue((q.inv_top_sums==m.inv_top_sums).all())
        self.assertTrue(q.alpha.shape==(2, 1))
        self.assertTrue((q.alpha==m.alpha).all())
        self.assertTrue(q.beta.shape==(2, 1))
        self.assertTrue((q.beta==m.beta).all())
Example #22
def random_corpus(corpus_len,
                  n_words,
                  min_token_len,
                  max_token_len,
                  context_type='document',
                  metadata=False,
                  seed=None):
    """
    Generates a random integer corpus.

    :param corpus_len: Size of the Corpus.
    :type corpus_len: int

    :param n_words: Number of words to draw random integers from.
    :type n_words: int

    :param min_token_len: minimum token length used to create indices
        for corpus.
    :type min_token_len: int

    :param max_token_len: maximum token length used to create indices
        for corpus.
    :type max_token_len: int

    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string, optional

    :param metadata: If `True` generates metadata. If `False` the only
        metadata for the corpus is the index information.
    :type metadata: boolean, optional
    
    :param seed: Seed for the random number generator, so that the same
        corpus can be regenerated. Default is `None`.
    :type seed: int, optional

    :returns: Corpus object with random integers as its entries. 

    :See Also: :class:`vsm.corpus.Corpus`
    """
    random_state = np.random.RandomState(seed)
    corpus = random_state.randint(n_words, size=corpus_len)
    corpus = [str(word) for word in corpus]

    indices = []
    # use the seeded random_state throughout so `seed` makes the whole
    # corpus reproducible
    i = random_state.randint(min_token_len, max_token_len)
    while i < corpus_len:
        indices.append(i)
        i += random_state.randint(min_token_len, max_token_len)
    indices.append(corpus_len)

    if metadata:
        metadata_ = [
            '{0}_{1}'.format(context_type, i) for i in range(len(indices))
        ]
        dtype = [('idx', np.array(indices).dtype),
                 (context_type + '_label', np.object_)]
        rand_tok = np.array(list(zip(indices, metadata_)), dtype=dtype)
    else:
        rand_tok = np.array([(i, ) for i in indices],
                            dtype=[('idx', np.array(indices).dtype)])

    return Corpus(corpus,
                  context_types=[context_type],
                  context_data=[rand_tok])
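
For instance, a small seeded random corpus (a sketch using the function above):

c = random_corpus(corpus_len=100, n_words=20, min_token_len=5,
                  max_token_len=10, metadata=True, seed=42)

print(c.context_types)                          # ['document']
print(c.view_metadata('document')['idx'][:3])   # cumulative context boundaries
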
Example #23
    def test_LdaCgsQuerySampler_init(self):

        old_corp = Corpus([], remove_empty=False)
        old_corp.corpus = np.array([0, 1, 1, 0, 0, 1], dtype='i')
        old_corp.context_data = [
            np.array([(3, ), (3, )], dtype=[('idx', 'i')])
        ]
        old_corp.context_types = ['document']
        old_corp.words = np.array(['0', '1'], dtype='i')
        old_corp.words_int = {'0': 0, '1': 1}

        new_corp = Corpus([], remove_empty=False)
        new_corp.corpus = np.array([0, 0], dtype='i')
        new_corp.context_data = [np.array([(2, )], dtype=[('idx', 'i')])]
        new_corp.context_types = ['document']
        new_corp.words = np.array(['0', '1'], dtype='i')
        new_corp.words_int = {'0': 0, '1': 1}

        m = LdaCgsSeq(corpus=old_corp, context_type='document', K=2, V=2)
        m.Z[:] = np.array([0, 0, 0, 1, 1, 1], dtype='i')
        m.word_top[:] = np.array([[1.01, 2.01], [2.01, 1.01]], dtype='d')
        m.top_doc[:] = np.array([[3.01, 0.01], [0.01, 3.01]], dtype='d')
        m.inv_top_sums[:] = 1. / m.word_top.sum(0)

        q = LdaCgsQuerySampler(m, new_corpus=new_corp, old_corpus=old_corp)
        self.assertTrue(q.V == 2)
        self.assertTrue(q.K == 2)
        self.assertTrue(len(q.corpus) == 2)
        self.assertTrue((q.corpus == new_corp.corpus).all())
        self.assertTrue(len(q.indices) == 1)
        self.assertTrue(
            (q.indices == new_corp.view_metadata('document')['idx']).all())
        self.assertTrue(q.word_top.shape == (2, 2))
        self.assertTrue((q.word_top == m.word_top).all())
        self.assertTrue(q.top_doc.shape == (2, 1))
        self.assertTrue((q.top_doc == np.array([[0.01], [0.01]],
                                               dtype=q.top_doc.dtype)).all())
        self.assertTrue(q.inv_top_sums.shape == (2, ))
        self.assertTrue((q.inv_top_sums == m.inv_top_sums).all())
        self.assertTrue(q.alpha.shape == (2, 1))
        self.assertTrue((q.alpha == m.alpha).all())
        self.assertTrue(q.beta.shape == (2, 1))
        self.assertTrue((q.beta == m.beta).all())
Example #24
def main(args):
    global context_type, lda_c, lda_m, lda_v, label, id_fn
    
    # load in the configuration file
    config = ConfigParser({
        'certfile' : None,
        'keyfile' : None,
        'ca_certs' : None,
        'ssl' : False,
        'port' : '8000',
        'host' : '0.0.0.0',
        'topic_range' : '{0},{1},1'.format(args.k, args.k+1),
        'icons': 'link',
        'corpus_link' : None,
        'doc_title_format' : None,
        'doc_url_format' : None,
        'topics': None})
    config.read(args.config)

    # path variables
    path = config.get('main', 'path')
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern') 

    # automatic port assignment

    def test_port(port):
        try:
            host = args.host or config.get("www","host")
            if host == '0.0.0.0':
                host = 'localhost'
            try:
                s = socket.create_connection((host,port), 2)
                s.close()
                raise IOError("Socket connectable on port {0}".format(port))
            except socket.error:
                pass
            return port
        except IOError:
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port)) 
            return test_port(port)

    port = args.port or int(config.get('www','port').format(0)) + args.k
    port = test_port(port)
    
    # prompt to save
    if (int(config.get("www","port").format(0)) + args.k) != port:
        if bool_prompt("Change default baseport to {0}?".format(port - args.k),
                       default=True):
            config.set("www","port", str(port - args.k))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.read_file(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'w') as configfh:
                new_config.write(configfh)


    # hostname assignment
    host = args.host or config.get('www','host')

    # LDA objects
    lda_c = Corpus.load(corpus_file)
    lda_m = None
    lda_v = None
    def load_model(k):
        global lda_m, lda_v
        lda_m = LDA.load(model_pattern.format(k))
        lda_v = LDAViewer(lda_c, lda_m)

    load_model(args.k)

    # label function imports
    try:
        label_module = config.get('main', 'label_module')
        label_module = import_module(label_module)
        print("imported label module")
        label_module.init(config.get('main', 'path'), lda_v, context_type)
    except (ImportError, NoOptionError, AttributeError):
        pass

    try:
        label = label_module.label
        print("imported label function")
    except (AttributeError, UnboundLocalError):
        label = lambda x: x
        print("using default label function")

    try:
        id_fn = label_module.id_fn
        print("imported id function")
    except (AttributeError, UnboundLocalError):
        id_fn = def_label_fn
        print("using default id function")

    config_icons = config.get('www','icons').split(",")

    @route('/icons.js')
    def icons():
        with open(resource_filename(__name__, '../www/icons.js')) as icons:
            text = '{0}\n var icons = {1};'\
                .format(icons.read(), json.dumps(config_icons))
        return text


    # index page parameterization
    corpus_name = config.get('www','corpus_name')
    corpus_link = config.get('www','corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')

    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        topic_range = eval(config.get('main', 'topics'))
    topic_range = [{'k' : k, 'port' : int(config.get('www','port').format(0)) + k} 
                        for k in topic_range] 

    renderer = pystache.Renderer(escape=lambda u: u)

    @route('/')
    def index():
        response.set_header('Expires', _cache_date())

        with open(resource_filename(__name__, '../www/index.mustache.html'),
                  encoding='utf-8') as tmpl_file:
            template = tmpl_file.read()
        return renderer.render(template, 
            {'corpus_name' : corpus_name,
             'corpus_link' : corpus_link,
             'context_type' : context_type,
             'topic_range' : topic_range,
             'doc_title_format' : doc_title_format,
             'doc_url_format' : doc_url_format})


    @route('/<filename:path>')
    @_set_acao_headers
    def send_static(filename):
        return static_file(filename, root=resource_filename(__name__, '../www/'))

    if args.ssl or config.get('main', 'ssl'):
        certfile = args.certfile or config.get('ssl', 'certfile')
        keyfile = args.keyfile or config.get('ssl', 'keyfile')
        ca_certs = args.ca_certs or config.get('ssl', 'ca_certs')

        run(host=host, port=port, server=SSLWSGIRefServer,
            certfile=certfile, keyfile=keyfile, ca_certs=ca_certs)
    else:
        run(host=host, port=port)
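
# Hypothetical configuration sketch for the main() above (section and option
# names are taken from its config.get calls; all paths and values are made up).
# The www.port value carries a "{0:02d}" placeholder because the script calls
# config.get('www', 'port').format(0) before adding the topic number k.
from configparser import ConfigParser
from io import StringIO

_sample_ini = """
[main]
path = /var/models/mycorpus
context_type = document
corpus_file = /var/models/mycorpus/mycorpus.npz
model_pattern = /var/models/mycorpus/mycorpus-LDA-K{0}.npz
topic_range = 20,80,20

[www]
host = 0.0.0.0
port = 80{0:02d}
corpus_name = My Corpus
icons = link
"""

_cfg = ConfigParser()
_cfg.read_file(StringIO(_sample_ini))
print(_cfg.get('main', 'model_pattern').format(40))  # ...mycorpus-LDA-K40.npz
print(int(_cfg.get('www', 'port').format(0)) + 40)   # base port + k = 8040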
Example #25
0
    def _load_corpus(self, corpus_file):
        self.c = Corpus.load(corpus_file, load_corpus=False)
        self.labels = self.c.view_metadata(self.context_type)[self.label_name]
Example #26
0

def nested_arr_to_np(arr, arrarr=False):
    """Convert a nested iterable into a NumPy array.

    If `arrarr` is True, each inner row is itself converted to an np.array
    before the rows are collected and stacked.
    """
    outli = []
    for r in arr:
        inli = []
        for c in r:
            inli.append(c)

        if arrarr:
            inli = np.array(inli)
        outli.append(inli)

    return np.array(outli)
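
# Usage sketch with illustrative input (not from the original source): a
# regular nested Python list comes back as a 2-D integer array; arrarr=True
# converts each row to its own np.array before the final np.array call
# stacks them, so the result here has the same (2, 3) shape.
_rows = [[1, 2, 3], [4, 5, 6]]
print(nested_arr_to_np(_rows))                     # a (2, 3) integer array
print(nested_arr_to_np(_rows, arrarr=True).shape)  # (2, 3)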


if __name__ == '__main__':
    from vsm.corpus import Corpus

    path = '../org/knowceans/gibbstest/'
    c = Corpus.load(path + 'church_corp.npz')

    writepath = '/home/doori/inpho/org/knowceans/gibbstest/'
    ctx = 'document'
    # java can't process '..' in the path.
    gw, m = lda_run(c, path + 'churchcorp.txt', ctx, 10000, 2,
                    writepath + 'church-meta.txt', 0.01, 0.01)

    save(m, ctx, writepath + 'church_lda.npz', writepath + 'church-meta.txt')
Example #27
0
def main(args):
    from vsm.corpus import Corpus
    config = ConfigParser({"htrc": False})
    config.read(args.config_file)
    
    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)
    
    # check for htrc metadata
    if args.htrc or config.get("main","htrc"):
        htrc_langs = get_htrc_langs(args)
        if htrc_langs:
            args.lang.extend(htrc_langs)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # check for any new candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang) 

    stoplist = set() 
    # Apply stop words
    print " "
    for lang in args.lang:
        print "Applying", langs[lang], "stopwords"
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            candidates = [unidecode(word.strip()) for word in swf]
            if len(candidates):
                print "Applying custom stopword file to remove {} word{}.".format(len(candidates),
                's' if len(candidates) > 1 else '')
                stoplist.update(candidates)

    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print "Filtering {} small word{} with less than {} characters.".format(len(candidates),
                's' if len(candidates) > 1 else '', args.min_word_len)
            stoplist.update(candidates)
    
    if not args.special_chars:
        candidates = get_special_chars(c)
        if len(candidates):
            print "Filtering {} word{} with special characters.".format(len(candidates),
                's' if len(candidates) > 1 else '')
            stoplist.update(candidates)
   
    if not args.high_filter:
        high_filter, candidates = get_high_filter(args, c, words=stoplist)
        if len(candidates):
            print "Filtering {} high frequency word{}.".format(len(candidates),
                's' if len(candidates) > 1 else '')
            stoplist.update(candidates)
    else:
        high_filter = args.high_filter
        candidates = get_candidate_words(c, args.high_filter, sort=False)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if not args.low_filter:
        low_filter, candidates = get_low_filter(args, c, words=stoplist)
        if len(candidates):
            print "Filtering {} low frequency word{}.".format(len(candidates),
                's' if len(candidates) > 1 else '')
            stoplist.update(candidates)
    else:
        low_filter = args.low_filter
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)

    if stoplist:
        print "\n\nApplying {} stopword{}".format(len(stoplist),
                's' if len(stoplist) > 1 else '')
        c.in_place_stoplist(stoplist)
        print "\n"

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        items, counts = get_items_counts(c.corpus)

        corpus_name = [dirname]
        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))
        if lowfreq > 0:
            corpus_name.append('freq%s'%lowfreq)
        else:
            corpus_name.append('freq%s'%min(counts))

        if highfreq > 0:
            corpus_name.append('N%s'%highfreq)
        else:
            corpus_name.append('freq%s'%max(counts))

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name
   
    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace('.npz','')
    corpus_name = name_corpus(dirname, ['en'], low_filter, high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name) 
    c.save(args.corpus_path)

    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)