Esempio n. 1
0
def page_url(corpus, ctx_type, book_path, book_id, jsonfile):
    """
    Return URLs for an HTRC volume or for each of its pages.

    Modified htrc_*_label_fn. The individual volumes don't have 'book'
    as a context type.

    :param corpus: Corpus whose 'page' metadata supplies page labels.
    :param ctx_type: Tokenization context type; 'book' yields one volume
        URL, anything else yields one URL per page.
    :param book_path: Directory containing `jsonfile`.
    :param book_id: Volume identifier. Unused here; kept so the signature
        matches the htrc_*_label_fn family.
    :param jsonfile: File name of the volume's JSON metadata.
    :returns: List of unidecoded URL strings.
    """
    import json
    from vsm.viewer import doc_label_name
    import re

    urls = []

    jsonpath = os.path.join(book_path, jsonfile)
    with open(jsonpath, 'r') as f:
        md = json.load(f)
        # The most recently updated item carries the canonical volume URL.
        li = sorted(md['items'], key=lambda k: int(k['lastUpdate']))
        url = li[-1]['itemURL']

        if ctx_type == 'book':
            urls.append(unidecode(url))
        else:  # urls for pages
            page_md = corpus.view_metadata('page')
            files = page_md[doc_label_name('page')]

            # The page's sequence number is the last positive integer
            # embedded in each page label.
            nums = [re.findall('[1-9][0-9]*', a)[-1] for a in files]
            for i in nums:
                s = url + '?urlappend=%3Bseq={0}'.format(i)
                urls.append(unidecode(s))
    return urls
Esempio n. 2
0
File: htrc.py Progetto: inpho/vsm
def page_url(corpus, ctx_type, book_path, book_id, jsonfile):
    """
    Return URLs for an HTRC volume or for each of its pages.

    Modified htrc_*_label_fn. The individual volumes don't have 'book'
    as a context type.

    :param corpus: Corpus whose 'page' metadata supplies page labels.
    :param ctx_type: Tokenization context type; 'book' yields one volume
        URL, anything else yields one URL per page.
    :param book_path: Directory containing `jsonfile`.
    :param book_id: Volume identifier. Unused here; kept so the signature
        matches the htrc_*_label_fn family.
    :param jsonfile: File name of the volume's JSON metadata.
    :returns: List of unidecoded URL strings.
    """
    import json
    from vsm.viewer import doc_label_name
    import re

    urls = []

    jsonpath = os.path.join(book_path, jsonfile)
    with open(jsonpath, "r") as f:
        md = json.load(f)
        # The most recently updated item carries the canonical volume URL.
        li = sorted(md["items"], key=lambda k: int(k["lastUpdate"]))
        url = li[-1]["itemURL"]

        if ctx_type == "book":
            urls.append(unidecode(url))
        else:  # urls for pages
            page_md = corpus.view_metadata("page")
            files = page_md[doc_label_name("page")]

            # The page's sequence number is the last positive integer
            # embedded in each page label.
            nums = [re.findall("[1-9][0-9]*", a)[-1] for a in files]
            for i in nums:
                s = url + "?urlappend=%3Bseq={0}".format(i)
                urls.append(unidecode(s))
    return urls
Esempio n. 3
0
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches with the existing metadata.
    It creates url metadata that can be added to a Corpus object with
    add_metadata function in vsm.corpus.util.

    :param corpus: Corpus whose 'book' metadata supplies volume labels.
    :param ctx_type: Tokenization context type; 'book' yields one URL per
        volume, anything else yields one URL per page (sequence number).
    :param coll_dir: Path to the collection directory; one subdirectory
        per book label, each containing the volume's JSON metadata file.
    :returns: List of unidecoded URL strings.
    """

    import json
    from vsm.viewer import doc_label_name

    md = []
    corp_md = corpus.view_metadata('book')
    book_labels = corp_md[doc_label_name('book')]

    for book_label in book_labels:
        coll_path = os.path.join(coll_dir, book_label)
        booklist = os.listdir(coll_path)
        # The volume's JSON metadata is whatever is left after dropping
        # page text and pickle files.
        book = filter_by_suffix(booklist, ignore=['.txt', '.pickle'])

        book_path = os.path.join(coll_path, book[0])
        with open(book_path, 'r') as f:
            d = json.load(f)
            # The most recently updated item carries the canonical
            # volume URL.
            li = sorted(d['items'], key=lambda k: int(k['lastUpdate']))
            url = li[-1]['itemURL']

            if ctx_type == 'book':
                md.append(unidecode(url))
            else:
                # One URL per page; `range` replaces py2-only `xrange`
                # (identical behavior here on either interpreter).
                for i in range(1, len(booklist)):
                    s = url + '?urlappend=%3Bseq={0}'.format(i)
                    md.append(unidecode(s))
    return md
Esempio n. 4
0
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches with the existing metadata.
    It creates url metadata that can be added to a Corpus object with
    add_metadata function in vsm.ext.corpusbuilders.util.

    :param corpus: Corpus to add url metadata to. Urls match with the existing
        metadata of `corpus`.
    :type corpus: Corpus

    :param ctx_type: A type of tokenization.
    :type ctx_type: string

    :param coll_dir: Path for the collection directory. Either htrc 86 plain
        or htrc 1315 plain directory.
    :type coll_dir: string

    :returns: md : List of urls to be added to corpus

    :See Also: :meth: add_metadata
    """

    import json
    from vsm.viewer import doc_label_name
    import re

    urls = []
    corp_md = corpus.view_metadata('book')
    book_labels = corp_md[doc_label_name('book')]

    for book_label in book_labels:
        coll_path = os.path.join(coll_dir, book_label)
        booklist = os.listdir(coll_path)
        # The volume's JSON metadata is whatever is left after dropping
        # page text and pickle files.
        book = filter_by_suffix(booklist, ignore=['.txt', '.pickle'])

        book_path = os.path.join(coll_path, book[0])
        with open(book_path, 'r') as f:
            d = json.load(f)
            # The most recently updated item carries the canonical
            # volume URL.
            li = sorted(d['items'], key=lambda k: int(k['lastUpdate']))
            url = li[-1]['itemURL']

            if ctx_type == 'book':
                urls.append(unidecode(url))
            else:  # urls for pages
                page_md = corpus.view_metadata('page')
                files = [
                    a for a in page_md['file'] if a.startswith(book_label)
                ]
                # Page sequence number is the last positive integer in
                # each page file name.
                nums = [re.findall('[1-9][0-9]*', a)[-1] for a in files]
                for i in nums:
                    s = url + '?urlappend=%3Bseq={0}'.format(i)
                    urls.append(unidecode(s))
    return urls
Esempio n. 5
0
File: htrc.py Progetto: inpho/vsm
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches with the existing metadata.
    It creates url metadata that can be added to a Corpus object with
    add_metadata function in vsm.ext.corpusbuilders.util.

    :param corpus: Corpus to add url metadata to. Urls match with the existing
        metadata of `corpus`.
    :type corpus: Corpus

    :param ctx_type: A type of tokenization.
    :type ctx_type: string

    :param coll_dir: Path for the collection directory. Either htrc 86 plain
        or htrc 1315 plain directory.
    :type coll_dir: string

    :returns: md : List of urls to be added to corpus

    :See Also: :meth: add_metadata
    """

    import json
    from vsm.viewer import doc_label_name
    import re

    urls = []
    corp_md = corpus.view_metadata("book")
    book_labels = corp_md[doc_label_name("book")]

    for book_label in book_labels:
        coll_path = os.path.join(coll_dir, book_label)
        booklist = os.listdir(coll_path)
        # The volume's JSON metadata is whatever is left after dropping
        # page text and pickle files.
        book = filter_by_suffix(booklist, ignore=[".txt", ".pickle"])

        book_path = os.path.join(coll_path, book[0])
        with open(book_path, "r") as f:
            d = json.load(f)
            # The most recently updated item carries the canonical
            # volume URL.
            li = sorted(d["items"], key=lambda k: int(k["lastUpdate"]))
            url = li[-1]["itemURL"]

            if ctx_type == "book":
                urls.append(unidecode(url))
            else:  # urls for pages
                page_md = corpus.view_metadata("page")
                files = [a for a in page_md["file"] if a.startswith(book_label)]
                # Page sequence number is the last positive integer in
                # each page file name.
                nums = [re.findall("[1-9][0-9]*", a)[-1] for a in files]
                for i in nums:
                    s = url + "?urlappend=%3Bseq={0}".format(i)
                    urls.append(unidecode(s))
    return urls
Esempio n. 6
0
def sim_doc_doc(corp, mat, context_type, doc_or_docs, weights=None,
                norms=None, filter_nan=True, print_len=10,
                label_fn=def_label_fn, as_strings=True,
                sim_fn=row_cosines, order='d'):
    """
    Compute similarities between a (pseudo-)document and all documents.

    `doc_or_docs` may be a single document (string label, integer index,
    or metadata dict) or an iterable of them; multiple documents are
    averaged (optionally by `weights`) into one pseudo-document before
    comparison.

    :param order: 'd' for descending, 'i' for ascending similarity.
    :raises Exception: If `order` is neither 'd' nor 'i'.
    :returns: A LabeledColumn of (document, similarity) pairs.
    """
    # Resolve `doc_or_docs` to a list of document indices.
    label_name = doc_label_name(context_type)
    if (isstr(doc_or_docs) or isint(doc_or_docs)
        or isinstance(doc_or_docs, dict)):
        docs = [res_doc_type(corp, context_type, label_name, doc_or_docs)[0]]
    else:
        docs = [res_doc_type(corp, context_type, label_name, d)[0]
                for d in doc_or_docs]

    # Assume documents are columns, so transpose
    mat = mat.T

    # Generate pseudo-document as the (weighted) average of the rows.
    if issparse(mat):
        rows = mat.tocsr()[docs].toarray()
    else:
        rows = mat[docs]
    doc = np.average(rows, weights=weights, axis=0)[np.newaxis, :]

    # Compute cosines
    d_arr = sim_fn(doc, mat, norms=norms)

    # Label data. `filter_nan` is honored in both branches.
    if as_strings:
        md = corp.view_metadata(context_type)
        docs = label_fn(md)
        d_arr = enum_sort(d_arr, indices=docs, field_name='doc',
                          filter_nan=filter_nan)
    else:
        d_arr = enum_sort(d_arr, filter_nan=filter_nan)

    if order == 'd':
        pass
    elif order == 'i':
        # Bug fix: the original reversed an undefined `w_arr`, raising
        # NameError whenever order='i' was requested.
        d_arr = d_arr[::-1]
    else:
        raise Exception('Invalid order parameter.')

    d_arr = d_arr.view(LabeledColumn)
    # TODO: Finish this header
    d_arr.col_header = 'Documents: '
    d_arr.subcol_headers = ['Document', 'Value']
    d_arr.col_len = print_len

    return d_arr
Esempio n. 7
0
def simmat_documents(corp, matrix, context_type, doc_list,
                     norms=None, sim_fn=row_cos_mat):
    """
    Build a labeled pairwise-similarity matrix over `doc_list`.

    Each entry of `doc_list` is resolved to a document index and label;
    `sim_fn` is applied to the selected columns of `matrix` (documents
    are assumed to be columns, hence the transpose), and the result is
    returned as an IndexedSymmArray carrying the document labels.
    """
    label_name = doc_label_name(context_type)

    resolved = [res_doc_type(corp, context_type, label_name, doc)
                for doc in doc_list]
    indices, labels = zip(*resolved)
    indices = np.array(indices)
    labels = np.array(labels)

    sm = sim_fn(indices, matrix.T, norms=norms, fill_tril=True)
    sm = sm.view(IndexedSymmArray)
    sm.labels = labels

    return sm