Example #1
    @classmethod
    def new(cls, collection_id, doc_types):
        """Create a new collection.

        Until registered, collection assets are in-memory/overwritable.
        """
        if cls._register.exists(collection_id):
            raise KeyError(f"collection name {collection_id} already taken.")

        # create records db
        collection_schema = {}
        for doc_type in doc_types:
            collection_schema = update_dict(collection_schema, doc_type.schema)
        records_db = Data.new(collection_schema)

        # create/overwrite temporary search indexes
        _collection_id = f"tmp_{collection_id}"
        search_indices = []
        for doc_type in doc_types:
            index_name = return_index_name(_collection_id, doc_type.__name__)
            search_indices.append(index_name)
            # overwrite temp indices of same name if they exist
            if cls.client.index_exists(index_name):
                cls.client.drop_index(index_name)
            cls.client.new_index(index_name=index_name,
                                 mapping=doc_type.index_mapping)

        configd = {
            "doc_types": [doc_type.__name__ for doc_type in doc_types],
        }
        coll = cls(
            collection_id=collection_id,
            doc_types=doc_types,
            records_db=records_db,
            configd=configd,
        )
        coll.search_indices = search_indices
        return coll
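
Usage sketch (hypothetical: `Collection` as the owning class and the import paths are assumptions, not confirmed by the snippet; only new()'s behavior comes from the code above):

# Hypothetical usage; `Collection` and the import paths are assumed.
from collection import Collection
from doc_types import ArxivDoc, MarkdownDoc

# Builds an in-memory records db whose schema merges each doc type's
# schema, plus one temporary "tmp_"-prefixed search index per doc type.
coll = Collection.new("ml_notes", doc_types=[ArxivDoc, MarkdownDoc])
print(coll.search_indices)  # index names produced by return_index_name(...)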
Example #2
class ArxivDoc(DocType):
    """Arxiv publication document type."""

    schema = {
        "authors": object,
        "publish_date": float,
    }
    schema = update_dict(schema, BASE_SCHEMA)

    index_mapping = {
        "mappings": {
            "properties": {
                "publish_date": {
                    "type": "date",
                    "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_second",
                },
                "authors": {
                    "type": "text",
                    "analyzer": "standard"
                },
            }
        }
    }
    index_mapping = update_dict(index_mapping, BASE_MAPPING)

    @staticmethod
    def gen_record(document_id, primary_doc, gen_links):
        """Generate record from arxiv url.
        # example document_id: https://arxiv.org/abs/1810.04805
        arxiv reference: https://arxiv.org/help/api/user-manual#_calling_the_api
        # api url = 'http://export.arxiv.org/api/query?id_list=1311.5600'
        """
        paper_id = document_id.split("abs/")[-1]
        search = arxiv.Search(id_list=[paper_id])
        result = next(search.results())
        record = gen_arxiv_record_from_result(result, primary_doc=primary_doc)
        return record

    @staticmethod
    def gen_search_index(record, link_content=None):
        """Generate a search index from a record."""
        document_id = record["document_id"]
        record_index = {
            "document_name": record["document_name"],
            "document_type": record["document_type"].__name__,
            "content": record["content"],
            "authors": record["authors"],
            "publish_date": record["publish_date"],
            "link_content": link_content,
        }
        return (document_id, record_index)

    @staticmethod
    def gen_links(text):
        """Return citations found in text."""
        return []

    @staticmethod
    def gen_from_source(source_id, *source_args, **source_kwargs):
        """Return document ids from a document source (e.g. folder or query)."""
        return []

    @staticmethod
    def resolve_id(document_id):
        """Resolve document_id to its canonical form via the arxiv API."""
        arxiv_dict = ArxivDoc.gen_record(document_id,
                                         primary_doc=None,
                                         gen_links=False)
        return arxiv_dict["document_id"]

    @staticmethod
    def resolve_source_id(source_id):
        return source_id

    @staticmethod
    def is_valid(document_id):
        """Return whether document_id is an arxiv abs url."""
        match = re.match(r"https?://arxiv\.org/abs/\d+\.\d+(v\d+)?\Z",
                         document_id)
        return bool(match)

    @staticmethod
    def preview(record):
        """Return a text preview of a record."""
        time_str = (pd.to_datetime(record["publish_date"],
                                   unit="s").isoformat().split("T")[0])
        preview = (f"{record['document_name']}\n"
                   f"{', '.join(record['authors'])}\n\n"
                   f"{time_str}\n"
                   f"Abstract: {record['content']}")
        return preview
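
Usage sketch (the url is the gen_record docstring example; fetching the record needs network access to the arxiv API):

# Walks the full ArxivDoc flow: validate url, build record, index, preview.
url = "https://arxiv.org/abs/1810.04805"
assert ArxivDoc.is_valid(url)
record = ArxivDoc.gen_record(url, primary_doc=None, gen_links=False)
doc_id, index_entry = ArxivDoc.gen_search_index(record)
print(ArxivDoc.preview(record))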
Example #3
class MarkdownDoc(DocType):
    """Markdown document type."""

    schema = BASE_SCHEMA
    index_mapping = {
        "mappings": {
            "properties": {
                "document_name": {
                    "type": "text",
                },
            }
        }
    }
    index_mapping = update_dict(index_mapping, BASE_MAPPING)

    @staticmethod
    def gen_record(document_id, primary_doc, gen_links):
        """Generate a record from a markdown file."""
        document_id = MarkdownDoc.resolve_id(document_id)
        raw_doc = parse_lib.return_file_contents(document_id)
        raw_links = MarkdownDoc.gen_links(raw_doc) if gen_links else []
        links = []
        for link in raw_links:
            try:
                res_link = parse_lib.resolve_path(link, document_id)
            except AttributeError:
                # keep the raw link rather than reusing a stale res_link
                # from a previous iteration
                logger.exception(f"unable to resolve link: {link}")
                links.append(link)
                continue
            if os.path.exists(res_link):
                links.append(res_link)
            else:
                links.append(link)
        record = {
            "document_id": document_id,
            "document_name": document_id,
            "primary_doc": primary_doc,
            "document_type": MarkdownDoc,
            "content": raw_doc,
            "links": links,
        }
        return record

    @staticmethod
    def gen_search_index(record, link_content=None):
        """Generate a search index entry from a record."""
        document_id = record["document_id"]
        record_index = {
            "document_name": record["document_id"],
            "document_type": record["document_type"].__name__,
            "content": record["content"],
            "link_content": link_content,
        }
        return (document_id, record_index)

    @staticmethod
    def gen_links(text):
        """Return links from markdown text."""
        html = mistune.html(text)
        soup = BeautifulSoup(html, features="html.parser")
        links = list(set([link.get("href") for link in soup.findAll("a")]))
        return links

    @staticmethod
    def gen_from_source(source_id, extensions=(".md",)):
        """Return markdown docs from folder source_id, recursively."""
        doc_ids = parse_lib.return_files(source_id, extensions=extensions)
        doc_ids = [MarkdownDoc.resolve_id(doc_id) for doc_id in doc_ids]
        return doc_ids

    @staticmethod
    def resolve_id(document_id):
        return parse_lib.resolve_path(document_id)

    @staticmethod
    def resolve_source_id(source_id):
        return parse_lib.resolve_path(source_id)

    @staticmethod
    def is_valid(document_id, extensions=(".md",)):
        """Return whether document_id is an existing markdown file."""
        cond1 = parse_lib.exists_on_fs(parse_lib.host_path_to_url(document_id))
        cond2 = os.path.splitext(document_id)[-1] in extensions
        return bool(cond1 and cond2)

    @staticmethod
    def preview(record):
        """Return a text preview of a record."""
        preview_text = f"(file contents):\n{record['content']}"
        return preview_text
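
Usage sketch (the folder path is illustrative; resolve_id normalizes each path via parse_lib.resolve_path):

# Crawl a folder of markdown files into (doc_id, index_entry) pairs.
for doc_id in MarkdownDoc.gen_from_source("notes/", extensions=(".md",)):
    record = MarkdownDoc.gen_record(doc_id, primary_doc=None, gen_links=True)
    key, index_entry = MarkdownDoc.gen_search_index(record)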
Example #4
class YoutubeDoc(DocType):
    """Doc type for youtube videos."""

    schema = {
        "publish_date": float,
    }
    schema = update_dict(schema, BASE_SCHEMA)

    index_mapping = {
        "mappings": {
            "properties": {
                "publish_date": {
                    "type": "date",
                    "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_second",
                },
            }
        }
    }
    index_mapping = update_dict(index_mapping, BASE_MAPPING)

    @staticmethod
    def gen_record(document_id, primary_doc, gen_links):
        """Generate record from youtube url.

        # example document_id: https://www.youtube.com/watch?v=3LtQWxhqjqI
        """
        video_id = document_id.split("v=")[-1]
        try:
            transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
            text = " ".join([d["text"] for d in transcript_data])
        except (NoTranscriptAvailable, NoTranscriptFound,
                TranscriptsDisabled):
            text = "(no transcript available)"

        # get title:
        res = requests.get(document_id)
        soup = BeautifulSoup(markup=res.text, features="html.parser")
        title = soup.find("title").text
        links = []
        record = {
            "document_id": document_id,
            "document_name": title,
            "primary_doc": primary_doc,
            "document_type": YoutubeDoc,
            "content": text,
            "links": links,
        }
        return record

    @staticmethod
    def gen_search_index(record, link_content=None):
        """Generate a search index from a record."""
        document_id = record["document_id"]
        record_index = {
            "document_name": record["document_name"],
            "document_type": record["document_type"].__name__,
            "content": record["content"],
        }
        return (document_id, record_index)

    @staticmethod
    def gen_links(text):
        """Return citations found in text."""
        return []

    @staticmethod
    def gen_from_source(source_id, *source_args, **source_kwargs):
        """Return document ids from a document source (e.g. folder or query)."""
        return []

    @staticmethod
    def resolve_id(document_id):
        return document_id

    @staticmethod
    def resolve_source_id(source_id):
        return source_id

    @staticmethod
    def is_valid(document_id):
        """Return whether document_id is a youtube url."""
        url = requests.urllib3.util.parse_url(document_id)
        return url.host == "www.youtube.com"

    @staticmethod
    def preview(record):
        """Return a text preview of a record."""
        preview = (f"{record['document_name']}\n"
                   f"Preview: {record['content'][0:1200]}")
        return preview
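
Usage sketch (the url is the gen_record docstring example; the transcript and title fetches need network access):

# Validate the url, then build and preview a record for the video.
url = "https://www.youtube.com/watch?v=3LtQWxhqjqI"
if YoutubeDoc.is_valid(url):
    record = YoutubeDoc.gen_record(url, primary_doc=None, gen_links=False)
    print(YoutubeDoc.preview(record))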