def new(cls, collection_id, doc_types):
    """Create a new collection.

    Until registered, collection assets are in-memory/overwritable.
    """
    if cls._register.exists(collection_id):
        raise KeyError(f"collection name {collection_id} already taken.")

    # create records db from the merged schemas of all doc types
    collection_schema = {}
    for doc_type in doc_types:
        collection_schema = update_dict(collection_schema, doc_type.schema)
    records_db = Data.new(collection_schema)

    # create/overwrite temporary search indexes
    _collection_id = f"tmp_{collection_id}"
    search_indices = []
    for doc_type in doc_types:
        index_name = return_index_name(_collection_id, doc_type.__name__)
        search_indices.append(index_name)
        # overwrite temp indices of same name if they exist
        if cls.client.index_exists(index_name):
            cls.client.drop_index(index_name)
        cls.client.new_index(index_name=index_name, mapping=doc_type.index_mapping)

    configd = {
        "doc_types": [doc_type.__name__ for doc_type in doc_types],
    }

    coll = cls(
        collection_id=collection_id,
        doc_types=doc_types,
        records_db=records_db,
        configd=configd,
    )
    coll.search_indices = search_indices
    return coll
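# A minimal usage sketch for `new`, assuming it is exposed as a classmethod on a
# `Collection` class (name assumed here, not taken from this section) and that
# `MarkdownDoc` is in scope. Until the collection is registered, its search
# indices live under the "tmp_"-prefixed collection id built above.
def _example_new_collection():
    coll = Collection.new(collection_id="notes", doc_types=[MarkdownDoc])
    print(coll.search_indices)  # index names derived from "tmp_notes"
    return coll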
class ArxivDoc(DocType):
    """Arxiv publication document type."""

    schema = {
        "authors": object,
        "publish_date": float,
    }
    schema = update_dict(schema, BASE_SCHEMA)

    index_mapping = {
        "mappings": {
            "properties": {
                "publish_date": {
                    "type": "date",
                    "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_second",
                },
                "authors": {
                    "type": "text",
                    "analyzer": "standard",
                },
            }
        }
    }
    index_mapping = update_dict(index_mapping, BASE_MAPPING)

    @staticmethod
    def gen_record(document_id, primary_doc, gen_links):
        """Generate record from arxiv url.

        example document_id: https://arxiv.org/abs/1810.04805
        arxiv api reference: https://arxiv.org/help/api/user-manual#_calling_the_api
        api url example: http://export.arxiv.org/api/query?id_list=1311.5600
        """
        paper_id = document_id.split("abs/")[-1]
        search = arxiv.Search(id_list=[paper_id])
        # the `arxiv` package exposes results as a generator
        result = next(search.results())
        record = gen_arxiv_record_from_result(result, primary_doc=primary_doc)
        return record

    @staticmethod
    def gen_search_index(record, link_content=None):
        """Generate a search index entry from a record."""
        document_id = record["document_id"]
        record_index = {
            "document_name": record["document_name"],
            "document_type": record["document_type"].__name__,
            "content": record["content"],
            "authors": record["authors"],
            "publish_date": record["publish_date"],
            "link_content": link_content,
        }
        return (document_id, record_index)

    @staticmethod
    def gen_links(text):
        """Return citations found in text."""
        return []

    @staticmethod
    def gen_from_source(source_id, *source_args, **source_kwargs):
        """Return document ids from a document source (e.g. folder or query)."""
        pass

    @staticmethod
    def resolve_id(document_id):
        arxiv_dict = ArxivDoc.gen_record(document_id, primary_doc=None, gen_links=False)
        return arxiv_dict["document_id"]

    @staticmethod
    def resolve_source_id(source_id):
        return source_id

    @staticmethod
    def is_valid(document_id):
        # optional "vN" version suffix, e.g. https://arxiv.org/abs/1810.04805v2
        match_str = re.match(r"https?://arxiv\.org/abs/\d+\.\d+(v\d+)?\Z", document_id)
        return bool(match_str)

    @staticmethod
    def preview(record):
        time_str = (
            pd.to_datetime(record["publish_date"], unit="s").isoformat().split("T")[0]
        )
        preview = (
            f"{record['document_name']}\n"
            f"{', '.join(record['authors'])}\n\n"
            f"{time_str}\n"
            f"Abstract: {record['content']}"
        )
        return preview
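# Sketch of the full ArxivDoc flow, assuming network access and the `arxiv`
# package; the document id is the example from `gen_record`'s docstring.
def _example_arxiv_doc():
    doc_id = "https://arxiv.org/abs/1810.04805"
    if ArxivDoc.is_valid(doc_id):
        record = ArxivDoc.gen_record(doc_id, primary_doc=None, gen_links=False)
        doc_key, index_entry = ArxivDoc.gen_search_index(record)
        print(ArxivDoc.preview(record))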
class MarkdownDoc(DocType):
    """Markdown document type."""

    schema = BASE_SCHEMA

    index_mapping = {
        "mappings": {
            "properties": {
                "document_name": {
                    "type": "text",
                },
            }
        }
    }
    index_mapping = update_dict(index_mapping, BASE_MAPPING)

    @staticmethod
    def gen_record(document_id, primary_doc, gen_links):
        """Generate a record from a markdown file."""
        document_id = MarkdownDoc.resolve_id(document_id)
        raw_doc = parse_lib.return_file_contents(document_id)
        raw_links = MarkdownDoc.gen_links(raw_doc) if gen_links else []
        links = []
        for link in raw_links:
            try:
                res_link = parse_lib.resolve_path(link, document_id)
            except AttributeError:
                logger.exception(f"unable to resolve link: {link}")
                continue  # skip unresolvable links rather than reuse a stale res_link
            if os.path.exists(res_link):
                links.append(res_link)
            else:
                links.append(link)
        record = {
            "document_id": document_id,
            "document_name": document_id,
            "primary_doc": primary_doc,
            "document_type": MarkdownDoc,
            "content": raw_doc,
            "links": links,
        }
        return record

    @staticmethod
    def gen_search_index(record, link_content=None):
        """Generate a search index entry from a record."""
        document_id = record["document_id"]
        record_index = {
            "document_name": record["document_id"],
            "document_type": record["document_type"].__name__,
            "content": record["content"],
            "link_content": link_content,
        }
        return (document_id, record_index)

    @staticmethod
    def gen_links(text):
        """Return links from markdown text."""
        html = mistune.html(text)
        soup = BeautifulSoup(html, features="html.parser")
        links = list({link.get("href") for link in soup.find_all("a")})
        return links

    @staticmethod
    def gen_from_source(source_id, extensions=(".md",)):
        """Return markdown docs from folder source_id, recursively."""
        doc_ids = parse_lib.return_files(source_id, extensions=extensions)
        doc_ids = [MarkdownDoc.resolve_id(doc_id) for doc_id in doc_ids]
        return doc_ids

    @staticmethod
    def resolve_id(document_id):
        return parse_lib.resolve_path(document_id)

    @staticmethod
    def resolve_source_id(source_id):
        return parse_lib.resolve_path(source_id)

    @staticmethod
    def is_valid(document_id, extensions=(".md",)):
        cond1 = parse_lib.exists_on_fs(parse_lib.host_path_to_url(document_id))
        cond2 = os.path.splitext(document_id)[-1] in extensions
        return bool(cond1 and cond2)

    @staticmethod
    def preview(record):
        preview_text = f"(file contents):\n{record['content']}"
        return preview_text
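# Sketch of indexing a folder of markdown files; "~/notes" is a placeholder
# source folder, and `primary_doc=None` mirrors how ArxivDoc.resolve_id calls
# gen_record (the exact primary_doc convention is assumed, not documented here).
def _example_markdown_docs():
    source = MarkdownDoc.resolve_source_id("~/notes")
    for doc_id in MarkdownDoc.gen_from_source(source):
        record = MarkdownDoc.gen_record(doc_id, primary_doc=None, gen_links=True)
        doc_key, index_entry = MarkdownDoc.gen_search_index(record)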
class YoutubeDoc(DocType):
    """Doc type for youtube videos."""

    schema = {
        "publish_date": float,
    }
    schema = update_dict(schema, BASE_SCHEMA)

    index_mapping = {
        "mappings": {
            "properties": {
                "publish_date": {
                    "type": "date",
                    "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_second",
                },
            }
        }
    }
    index_mapping = update_dict(index_mapping, BASE_MAPPING)

    @staticmethod
    def gen_record(document_id, primary_doc, gen_links):
        """Generate record from youtube url.

        example document_id: https://www.youtube.com/watch?v=3LtQWxhqjqI
        """
        video_id = document_id.split("v=")[-1]
        try:
            transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
            text = " ".join([d["text"] for d in transcript_data])
        except (NoTranscriptAvailable, NoTranscriptFound, TranscriptsDisabled):
            text = "(no transcript available)"

        # get title from the video page
        res = requests.get(document_id)
        soup = BeautifulSoup(markup=res.text, features="html.parser")
        title = soup.find("title").text

        links = []
        record = {
            "document_id": document_id,
            "document_name": title,
            "primary_doc": primary_doc,
            "document_type": YoutubeDoc,
            "content": text,
            "links": links,
        }
        return record

    @staticmethod
    def gen_search_index(record, link_content=None):
        """Generate a search index entry from a record."""
        document_id = record["document_id"]
        record_index = {
            "document_name": record["document_name"],
            "document_type": record["document_type"].__name__,
            "content": record["content"],
        }
        return (document_id, record_index)

    @staticmethod
    def gen_links(text):
        """Return citations found in text."""
        return []

    @staticmethod
    def gen_from_source(source_id, *source_args, **source_kwargs):
        """Return document ids from a document source (e.g. folder or query)."""
        pass

    @staticmethod
    def resolve_id(document_id):
        return document_id

    @staticmethod
    def resolve_source_id(source_id):
        return source_id

    @staticmethod
    def is_valid(document_id):
        url = requests.urllib3.util.parse_url(document_id)
        return url.host == "www.youtube.com"

    @staticmethod
    def preview(record):
        preview = (
            f"{record['document_name']}\n"
            f"Preview: {record['content'][0:1200]}"
        )
        return preview
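# Sketch of the YoutubeDoc flow, assuming network access for both the transcript
# fetch and the title scrape; the video url is the docstring example.
def _example_youtube_doc():
    doc_id = "https://www.youtube.com/watch?v=3LtQWxhqjqI"
    if YoutubeDoc.is_valid(doc_id):
        record = YoutubeDoc.gen_record(doc_id, primary_doc=None, gen_links=False)
        print(YoutubeDoc.preview(record))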