def page_url(corpus, ctx_type, book_path, book_id, jsonfile):
    """
    Modified htrc_*_label_fn. The individual volumes don't have 'book'
    as a context type.

    :param corpus: Corpus whose 'page' metadata supplies page filenames.
    :param ctx_type: Tokenization type; 'book' yields a single volume
        url, anything else yields one url per page.
    :param book_path: Directory containing `jsonfile`.
    :param book_id: Unused here; kept for signature compatibility.
    :param jsonfile: Name of the volume's json metadata file.
    :returns: List of unidecoded url strings.
    """
    import json
    from vsm.viewer import doc_label_name
    import re

    # Fixed: dropped an unused `corp_md = corpus.view_metadata('page')`
    # lookup and a dead `url = ''` store present in the original.
    jsonpath = os.path.join(book_path, jsonfile)
    with open(jsonpath, 'r') as f:
        md = json.load(f)

    # The most recently updated item carries the current volume url.
    li = sorted(md['items'], key=lambda k: int(k['lastUpdate']))
    url = li[-1]['itemURL']

    urls = []
    if ctx_type == 'book':
        urls.append(unidecode(url))
    else: # urls for pages
        # The trailing number in each page filename is the sequence
        # number appended to the volume url.
        page_md = corpus.view_metadata('page')
        files = page_md[doc_label_name('page')]
        nums = [re.findall(r'[1-9][0-9]*', a)[-1] for a in files]
        for i in nums:
            s = url + '?urlappend=%3Bseq={0}'.format(i)
            urls.append(unidecode(s))

    return urls
def page_url(corpus, ctx_type, book_path, book_id, jsonfile):
    """
    Modified htrc_*_label_fn. The individual volumes don't have 'book'
    as a context type, so non-book tokenizations get one url per page.
    """
    import json
    from vsm.viewer import doc_label_name
    import re

    page_meta = corpus.view_metadata("page")  # mirrors original call order; unused below

    with open(os.path.join(book_path, jsonfile), "r") as fp:
        volume = json.load(fp)

    # Newest item (largest lastUpdate) holds the canonical volume url.
    by_update = sorted(volume["items"], key=lambda item: int(item["lastUpdate"]))
    base_url = by_update[-1]["itemURL"]

    if ctx_type == "book":
        return [unidecode(base_url)]

    # Page urls: the trailing number in each page filename is the seq
    # number appended to the volume url.
    labels = corpus.view_metadata("page")[doc_label_name("page")]
    seqs = [re.findall("[1-9][0-9]*", label)[-1] for label in labels]
    return [unidecode(base_url + "?urlappend=%3Bseq={0}".format(n)) for n in seqs]
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches with the existing metadata.
    It creates url metadata that can be added to a Corpus object with
    add_metadata function in vsm.corpus.util.

    :param corpus: Corpus whose 'book' metadata supplies volume labels.
    :param ctx_type: Tokenization type; 'book' yields one url per volume,
        anything else yields one url per guessed seq number.
    :param coll_dir: Collection directory with one subdirectory per book.
    :returns: List of unidecoded url strings.
    """
    import json
    from vsm.viewer import doc_label_name

    md = []
    corp_md = corpus.view_metadata('book')
    book_labels = corp_md[doc_label_name('book')]

    for book_label in book_labels:
        coll_path = os.path.join(coll_dir, book_label)
        booklist = os.listdir(coll_path)
        # The volume's json metadata is the file that is neither a page
        # .txt nor a .pickle.
        book = filter_by_suffix(booklist, ignore=['.txt', '.pickle'])
        book_path = os.path.join(coll_path, book[0])

        with open(book_path, 'r') as f:
            d = json.load(f)

        # Most recently updated item carries the current volume url.
        # (Fixed: removed a dead `url = ''` store.)
        li = sorted(d['items'], key=lambda k: int(k['lastUpdate']))
        url = li[-1]['itemURL']

        if ctx_type == 'book':
            md.append(unidecode(url))
        else:
            # NOTE(review): seq numbers are guessed from the directory
            # listing length, which also counts the json file itself;
            # the page-metadata-based variant of this function is more
            # precise. Behavior kept as-is.
            for i in xrange(1, len(booklist)):
                s = url + '?urlappend=%3Bseq={0}'.format(i)
                md.append(unidecode(s))

    return md
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches with the existing metadata.
    It creates url metadata that can be added to a Corpus object with
    add_metadata function in vsm.ext.corpusbuilders.util.

    :param corpus: Corpus to add url metadata to. Urls match with
        the existing metadata of `corpus`.
    :type corpus: Corpus

    :param ctx_type: A type of tokenization.
    :type ctx_type: string

    :param coll_dir: Path for the collection directory. Either htrc 86
        plain or htrc 1315 plain directory.
    :type coll_dir: string

    :returns: md : List of urls to be added to corpus

    :See Also: :meth: add_metadata
    """
    import json
    from vsm.viewer import doc_label_name
    import re

    urls = []
    corp_md = corpus.view_metadata('book')
    book_labels = corp_md[doc_label_name('book')]

    # Hoisted out of the per-book loop: the page metadata view is
    # loop-invariant, so fetch it once (only needed for page urls).
    if ctx_type != 'book':
        page_files = corpus.view_metadata('page')['file']

    for book_label in book_labels:
        coll_path = os.path.join(coll_dir, book_label)
        booklist = os.listdir(coll_path)
        # The volume's json metadata is the file that is neither a page
        # .txt nor a .pickle.
        book = filter_by_suffix(booklist, ignore=['.txt', '.pickle'])
        book_path = os.path.join(coll_path, book[0])

        with open(book_path, 'r') as f:
            d = json.load(f)

        # Most recently updated item carries the current volume url.
        # (Fixed: removed a dead `url = ''` store.)
        li = sorted(d['items'], key=lambda k: int(k['lastUpdate']))
        url = li[-1]['itemURL']

        if ctx_type == 'book':
            urls.append(unidecode(url))
        else: # urls for pages
            # Pages of this book: the trailing number in each page
            # filename is the seq number appended to the volume url.
            files = [a for a in page_files if a.startswith(book_label)]
            nums = [re.findall(r'[1-9][0-9]*', a)[-1] for a in files]
            for i in nums:
                s = url + '?urlappend=%3Bseq={0}'.format(i)
                urls.append(unidecode(s))

    return urls
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches with the existing metadata.
    It creates url metadata that can be added to a Corpus object with
    add_metadata function in vsm.ext.corpusbuilders.util.

    :param corpus: Corpus to add url metadata to. Urls match with
        the existing metadata of `corpus`.
    :type corpus: Corpus

    :param ctx_type: A type of tokenization.
    :type ctx_type: string

    :param coll_dir: Path for the collection directory. Either htrc 86
        plain or htrc 1315 plain directory.
    :type coll_dir: string

    :returns: List of urls to be added to corpus

    :See Also: :meth: add_metadata
    """
    import json
    from vsm.viewer import doc_label_name
    import re

    out = []
    volume_labels = corpus.view_metadata("book")[doc_label_name("book")]

    for label in volume_labels:
        book_dir = os.path.join(coll_dir, label)
        entries = os.listdir(book_dir)
        # The json metadata file is whatever is left after ignoring the
        # page .txt files and any .pickle files.
        json_names = filter_by_suffix(entries, ignore=[".txt", ".pickle"])

        with open(os.path.join(book_dir, json_names[0]), "r") as fp:
            record = json.load(fp)

        # Newest item (largest lastUpdate) carries the volume url.
        newest = sorted(record["items"], key=lambda it: int(it["lastUpdate"]))[-1]
        base = newest["itemURL"]

        if ctx_type == "book":
            out.append(unidecode(base))
            continue

        # Page urls: trailing number in each page filename is the seq
        # number appended to the volume url.
        page_meta = corpus.view_metadata("page")
        page_files = [name for name in page_meta["file"] if name.startswith(label)]
        for name in page_files:
            seq = re.findall("[1-9][0-9]*", name)[-1]
            out.append(unidecode(base + "?urlappend=%3Bseq={0}".format(seq)))

    return out
def sim_doc_doc(corp, mat, context_type, doc_or_docs, weights=None,
                norms=None, filter_nan=True, print_len=10,
                label_fn=def_label_fn, as_strings=True,
                sim_fn=row_cosines, order='d'):
    """
    Rank every document in `mat` by similarity to `doc_or_docs`.

    :param corp: Corpus providing document metadata.
    :param mat: Matrix whose columns are documents.
    :param context_type: Tokenization whose labels identify documents.
    :param doc_or_docs: A single document (label string, int index, or
        metadata dict) or an iterable of them; multiple docs are averaged
        into a pseudo-document using `weights`.
    :param weights: Optional per-document weights for the average.
    :param norms: Passed through to `sim_fn`.
    :param filter_nan: Drop NaN similarities (non-string branch only).
    :param print_len: Number of rows displayed when the result prints.
    :param label_fn: Maps metadata to display labels when `as_strings`.
    :param as_strings: If True, label results with document strings.
    :param sim_fn: Similarity function of (pseudo-doc row, matrix).
    :param order: 'd' for descending similarity, 'i' for ascending.
    :returns: LabeledColumn of (document, similarity) pairs.
    :raises Exception: If `order` is neither 'd' nor 'i'.
    """
    # Resolve `doc_or_docs` into a list of document indices.
    label_name = doc_label_name(context_type)
    if (isstr(doc_or_docs) or isint(doc_or_docs)
            or isinstance(doc_or_docs, dict)):
        docs = [res_doc_type(corp, context_type, label_name, doc_or_docs)[0]]
    else:
        docs = [res_doc_type(corp, context_type, label_name, d)[0]
                for d in doc_or_docs]

    # Assume documents are columns, so transpose
    mat = mat.T

    # Generate pseudo-document as the (weighted) mean of the query docs.
    if issparse(mat):
        rows = mat.tocsr()[docs].toarray()
    else:
        rows = mat[docs]
    doc = np.average(rows, weights=weights, axis=0)[np.newaxis, :]

    # Compute cosines
    d_arr = sim_fn(doc, mat, norms=norms)

    # Label data
    if as_strings:
        md = corp.view_metadata(context_type)
        docs = label_fn(md)
        d_arr = enum_sort(d_arr, indices=docs, field_name='doc')
    else:
        d_arr = enum_sort(d_arr, filter_nan=filter_nan)

    if order == 'd':
        pass
    elif order == 'i':
        # Fixed: the original reversed an undefined name `w_arr` here,
        # so requesting order='i' always raised NameError.
        d_arr = d_arr[::-1]
    else:
        raise Exception('Invalid order parameter.')

    d_arr = d_arr.view(LabeledColumn)
    # TODO: Finish this header
    d_arr.col_header = 'Documents: '
    d_arr.subcol_headers = ['Document', 'Value']
    d_arr.col_len = print_len

    return d_arr
def simmat_documents(corp, matrix, context_type, doc_list, norms=None,
                     sim_fn=row_cos_mat):
    """
    Build a labeled document-by-document similarity matrix for the
    documents in `doc_list`, using `sim_fn` over the columns of `matrix`.
    The result is an IndexedSymmArray carrying the resolved labels.
    """
    field = doc_label_name(context_type)

    # Resolve each requested document to an (index, label) pair.
    pairs = [res_doc_type(corp, context_type, field, d) for d in doc_list]
    idx_seq, lbl_seq = zip(*pairs)
    idx_arr, lbl_arr = np.array(idx_seq), np.array(lbl_seq)

    # Documents are columns of `matrix`, hence the transpose.
    simmat = sim_fn(idx_arr, matrix.T, norms=norms, fill_tril=True)
    simmat = simmat.view(IndexedSymmArray)
    simmat.labels = lbl_arr

    return simmat