Example 1
    def es_stream_docs(cls):

        """
        Index institutions with cited syllabi.

        Yields:
            dict: The next document.
        """

        # TODO: De-circularize.
        from osp.locations.models.doc_inst import Document_Institution
        from osp.citations.hlom.models.citation import HLOM_Citation

        count = fn.Count(Document_Institution.id)

        cited = (
            cls.select(cls, count)
            .join(Document_Institution)
            .group_by(cls.id)
            .order_by(count.desc())
        )

        for inst in query_bar(cited):

            name = inst.metadata["Campus_Name"] or inst.metadata["Institution_Name"]

            city = inst.metadata["Campus_City"] or inst.metadata["Institution_City"]

            yield {
                "_id": inst.id,
                "count": inst.count,
                "name": name,
                "city": city,
                "state": inst.metadata["Institution_State"],
                "url": inst.metadata["Institution_Web_Address"],
                "lon": inst.metadata.get("Longitude"),
                "lat": inst.metadata.get("Latitude"),
            }
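
A minimal sketch of how a generator like this might be consumed for indexing, using the elasticsearch bulk helper. The model and index names are guesses, not shown in the snippet:

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    def index_stream(model, index_name):
        # `model` is any class exposing es_stream_docs(), e.g. the one above.
        es = Elasticsearch()

        def actions():
            # Wrap each yielded document in a bulk "index" action; the '_id'
            # key yielded by es_stream_docs() is picked up by the bulk helper.
            for doc in model.es_stream_docs():
                doc['_index'] = index_name
                yield doc

        return bulk(es, actions())

    # e.g. index_stream(Institution, 'institution')  # hypothetical names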
Example 2
    def deduplicate(cls):

        """
        Deduplicate cited texts.
        """

        for text in query_bar(cls.select_cited()):

            # Has the hash been seen?
            seen = config.redis.sismember(
                redis_keys.OSP_DEDUP,
                text.hash,
            )

            # If so, don't display this text.
            if seen:
                text.display = False

            else:

                # If not, display this text.
                text.display = True

                # And reserve the hash.
                config.redis.sadd(
                    redis_keys.OSP_DEDUP,
                    text.hash,
                )

            text.save()
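
The Redis calls implement a first-one-wins rule: `sismember` asks whether the hash has already been claimed, and `sadd` reserves it for the first text that carries it, so only one text per hash keeps `display = True`. A standalone sketch of the same pattern with redis-py, using a throwaway key rather than the real `redis_keys.OSP_DEDUP`:

    import redis

    r = redis.Redis()
    DEDUP_KEY = 'osp:dedup:sketch'  # hypothetical key

    def should_display(text_hash):
        # First text with this hash wins; later duplicates are hidden.
        if r.sismember(DEDUP_KEY, text_hash):
            return False
        r.sadd(DEDUP_KEY, text_hash)
        return True

    print(should_display('abc123'))  # True  (first occurrence)
    print(should_display('abc123'))  # False (duplicate)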
Example 3
    def validate(cls, *args, **kwargs):

        """
        Validate all cited texts.
        """

        config = Validate_Config(*args, **kwargs)

        for text in query_bar(cls.select_cited()):

            text.valid = not (

                # Title
                text.title_contains_surname or
                text.title_blacklisted(config.blacklisted_titles) or
                text.title_is_toponym or

                # Surname
                text.surname_blacklisted(config.blacklisted_surnames) or
                text.surname_is_toponym or

                # Focus
                text.unfocused(config.max_fuzz)

            )

            text.save()
Example 4
    def deduplicate(cls):
        """
        Deduplicate cited texts.
        """

        for text in query_bar(cls.select_cited()):

            # Has the hash been seen?
            seen = config.redis.sismember(
                redis_keys.OSP_DEDUP,
                text.hash,
            )

            # If so, don't display this text.
            if seen:
                text.display = False

            else:

                # If not, display this text.
                text.display = True

                # And reserve the hash.
                config.redis.sadd(
                    redis_keys.OSP_DEDUP,
                    text.hash,
                )

            text.save()
Example 5
    def rank(cls):

        """
        Write citation counts and ranks.
        """

        count = fn.COUNT(HLOM_Citation.id)

        query = (
            cls.select(cls, count)
            .join(HLOM_Citation, on=(HLOM_Citation.record==cls.id))
            .order_by(count.desc())
            .group_by(cls.id)
        )

        # Gather the citation counts.
        counts = [r.count for r in query]

        # Rank in descending order (most-cited record gets rank 1).
        ranks = rankdata(counts, 'max')
        ranks = ranks.max()+1 - ranks

        for i, r in enumerate(query_bar(query)):
            r.metadata['citation_count'] = counts[i]
            r.metadata['rank'] = int(ranks[i])
            r.save()
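
The two `ranks` lines turn the ascending ranks returned by `rankdata` (presumably `scipy.stats.rankdata`) into descending, competition-style ranks, so the most-cited record ends up with rank 1. A small worked example:

    from scipy.stats import rankdata

    counts = [10, 7, 7, 2]           # citation counts, sorted descending

    ranks = rankdata(counts, 'max')  # ascending ranks with ties -> [4, 3, 3, 1]
    ranks = ranks.max() + 1 - ranks  # invert                    -> [1, 2, 2, 4]

    # The most-cited record gets rank 1, the two ties share rank 2,
    # and the least-cited record drops to rank 4.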
Example 6
    def add_edges(self):

        """
        For each syllabus, register citation pairs as edges.
        """

        text_ids = (
            fn.array_agg(Text.id)
            .coerce(False)
            .alias('text_ids')
        )

        docs = (
            Citation
            .select(Citation.document, text_ids)
            .join(Text)
            .where(Text.display==True)
            .where(Text.valid==True)
            .group_by(Citation.document)
        )

        for row in query_bar(docs):
            for tid1, tid2 in combinations(row.text_ids, 2):

                # If the edge exists, increment the weight.

                if self.graph.has_edge(tid1, tid2):
                    self.graph[tid1][tid2]['weight'] += 1

                # Otherwise, initialize the edge.

                else:
                    self.graph.add_edge(tid1, tid2, weight=1)
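
The nested loop builds a weighted co-citation graph (`self.graph` is presumably a networkx graph): every pair of texts cited on the same syllabus gets an edge, and pairs that recur across syllabi accumulate weight. A self-contained sketch of the same idea with a few toy documents:

    from itertools import combinations
    import networkx as nx

    graph = nx.Graph()

    # Toy data: each row is the list of text ids cited on one syllabus.
    docs = [[1, 2, 3], [2, 3], [2, 3]]

    for text_ids in docs:
        for tid1, tid2 in combinations(text_ids, 2):
            if graph.has_edge(tid1, tid2):
                graph[tid1][tid2]['weight'] += 1
            else:
                graph.add_edge(tid1, tid2, weight=1)

    print(graph[2][3]['weight'])  # 3 -- texts 2 and 3 co-occur on all three syllabi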
Example 7
    def add_edges(self, max_citations=50):

        """
        For each syllabus, register citation pairs as edges.

        Args:
            max_citations (int): Discard documents with > N citations.
        """

        # Aggregate the CNs.
        texts = (
            fn.array_agg(HLOM_Record.control_number)
            .coerce(False)
            .alias('texts')
        )

        # Select syllabi and cited CNs.
        documents = (
            HLOM_Citation
            .select(HLOM_Citation.document, texts)
            .join(HLOM_Record)
            .having(fn.count(HLOM_Record.id) <= max_citations)
            .distinct(HLOM_Citation.document)
            .group_by(HLOM_Citation.document)
        )

        for row in query_bar(documents):
            for cn1, cn2 in combinations(row.texts, 2):

                # If the edge exists, +1 the weight.
                if self.graph.has_edge(cn1, cn2):
                    self.graph[cn1][cn2]['weight'] += 1

                # Otherwise, initialize the edge.
                else:
                    self.graph.add_edge(cn1, cn2, weight=1)
Example 8
    def validate(cls, *args, **kwargs):

        """
        Validate all cited texts.
        """

        config = Validate_Config(*args, **kwargs)

        for text in query_bar(cls.select_cited()):

            text.valid = not (

                # Title
                text.title_and_author_overlap or
                text.title_blacklisted(config.blacklisted_titles) or
                text.title_is_toponym or

                # Surname
                text.surname_blacklisted(config.blacklisted_surnames) or
                text.surname_is_toponym or

                # Focus
                text.unfocused(config.max_fuzz, config.whitelist)

            )

            text.save()
Example 9
def truncated(out_file, frag_len):

    """
    Write a CSV with truncated document texts.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
    )

    for row in query_bar(query):

        # Truncate the text.
        fragment = row.text[:frag_len]

        writer.writerow({
            'id': row.path,
            'title': row.path,
            'text': fragment
        })
Example 10
def queue_text():
    """
    Queue text extraction tasks in the worker.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_text, doc.id)
Example 11
def random(out_file, n):

    """
    Write a CSV with plaintext for N random docs.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
        .order_by(fn.random())
        .limit(n)
    )

    for row in query_bar(query):

        writer.writerow({
            'id': row.path,
            'title': row.path,
            'text': row.text
        })
Example 12
    def add_edges(self, max_texts=20):
        """
        For each syllabus, register citation pairs as edges.

        Args:
            max_texts (int): Ignore docs with more than N citations.
        """

        text_ids = (
            fn.array_agg(Text.id)
            .coerce(False)
            .alias('text_ids')
        )

        docs = (
            Citation
            .select(Citation.document, text_ids)
            .join(Text)
            .having(fn.count(Text.id) <= max_texts)
            .where(Text.display == True)
            .where(Text.valid == True)
            .group_by(Citation.document)
        )

        for row in query_bar(docs):
            for tid1, tid2 in combinations(row.text_ids, 2):

                # If the edge exists, increment the weight.

                if self.graph.has_edge(tid1, tid2):
                    self.graph[tid1][tid2]['weight'] += 1

                # Otherwise, initialize the edge.

                else:
                    self.graph.add_edge(tid1, tid2, weight=1)
Example 13
File: text.py Project: overview/osp
    def es_stream_docs(cls, just_cited=False):

        """
        Index all texts.

        Args:
            just_cited (bool): If true, just index texts with citations.

        Yields:
            dict: The next document.
        """

        # By default, index everything.
        if not just_cited:
            query = cls.select()

        # Omit texts without citations.
        else:
            query = (
                cls.select()
                .group_by(cls.id)
                .join(HLOM_Citation, on=(
                    cls.document==HLOM_Citation.document
                ))
            )

        for row in query_bar(query):
            yield row.es_doc
Example 14
def queue_text():

    """
    Queue text extraction tasks in the worker.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_text, doc.id)
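
`config.rq` here is presumably an rq Queue; `enqueue` serializes the call, and a separate worker process executes it later. A minimal sketch of the producer/worker pairing, assuming a local Redis and a hypothetical `ext_text` task:

    from redis import Redis
    from rq import Queue

    q = Queue(connection=Redis())

    def ext_text(doc_id):
        # Hypothetical task body: load the document and extract its plain text.
        print('extracting text for document', doc_id)

    # Producer side, mirroring config.rq.enqueue(ext_text, doc.id).
    q.enqueue(ext_text, 42)

    # Consumer side: run `rq worker` in another process. The worker imports the
    # task by dotted path, so ext_text has to live in an importable module.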
Example 15
def queue_file_metadata():

    """
    Queue file metadata extraction tasks.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_file_metadata, doc.id)
Example 16
def queue_semester():

    """
    Queue semester regex extraction tasks.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_semester, doc.id)
Example 17
def queue_archive_url():

    """
    Queue Internet Archive timestamp extraction tasks.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_archive_url, doc.id)
Example 18
def run_doc_to_inst():

    """
    Match documents -> institutions.
    """

    for doc in query_bar(Document.select()):
        try: doc_to_inst(doc.id)
        except: pass
Example 19
def run_doc_to_fields():

    """
    Match documents -> fields.
    """

    for doc in query_bar(Document.select()):
        try: doc_to_fields(doc.id)
        except: pass
Example 20
def run_doc_to_inst():
    """
    Match documents -> institutions.
    """

    for doc in query_bar(Document.select()):
        try:
            doc_to_inst(doc.id)
        except:
            pass
Example 21
def run_doc_to_fields():
    """
    Match documents -> fields.
    """

    for doc in query_bar(Document.select()):
        try:
            doc_to_fields(doc.id)
        except:
            pass
Example 22
    def es_stream_docs(cls):

        """
        Index all rows.

        Yields:
            dict: The next document.
        """

        for row in query_bar(cls.select()):
            yield row.es_doc
Example 23
    def link(cls):

        """
        Link documents -> institutions.
        """

        domain_to_inst = defaultdict(list)

        # Map domain -> [(regex, inst), ...]
        for inst in ServerSide(Institution.select()):

            domain = parse_domain(inst.url)

            regex = seed_to_regex(inst.url)

            domain_to_inst[domain].append((regex, inst))

        for doc in query_bar(Document.select()):

            try:

                # TODO: Get rid of @property.
                url = doc.syllabus.url

                domain = parse_domain(url)

                # Find institutions with matching URLs.
                matches = []
                for pattern, inst in domain_to_inst[domain]:

                    match = pattern.search(url)

                    if match:
                        matches.append((match.group(), inst))

                if matches:

                    # Sort by length of match, descending.
                    matches = sorted(
                        matches,
                        key=lambda x: len(x[0]),
                        reverse=True,
                    )

                    # Link to the institution with the longest match.
                    cls.create(
                        institution=matches[0][1],
                        document=doc,
                    )

            except Exception as e:
                print(e)
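
When several institutions share a domain, the longest matched URL fragment wins, so a more specific seed (e.g. a departmental path) beats the bare domain. A toy illustration of that tie-break, with hypothetical patterns standing in for `seed_to_regex` output:

    import re

    # Hypothetical seed patterns for two institutions on the same domain.
    candidates = [
        (re.compile(r'example\.edu'),          'Example University'),
        (re.compile(r'example\.edu/class/cs'), 'Example CS Department'),
    ]

    url = 'http://example.edu/class/cs101/syllabus.html'

    matches = []
    for pattern, inst in candidates:
        match = pattern.search(url)
        if match:
            matches.append((match.group(), inst))

    # Sort by length of match, descending -- the more specific seed wins.
    matches = sorted(matches, key=lambda x: len(x[0]), reverse=True)
    print(matches[0][1])  # Example CS Department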
Example 24
    def es_stream_docs(cls):
        """
        Index document texts.

        Yields:
            dict: The next document.
        """

        for row in query_bar(Document_Text.select()):

            yield dict(
                _id=row.document_id,
                body=row.text,
            )
Example 25
    def es_stream_docs(cls):
        """
        Index institutions.

        Yields:
            dict: The next document.
        """

        for row in query_bar(Institution.select()):

            yield dict(
                _id=row.id,
                name=row.name,
            )
Example 26
    def es_stream_docs(cls):

        """
        Index subfields.

        Yields:
            dict: The next document.
        """

        for row in query_bar(Subfield.select()):

            yield dict(
                _id = row.id,
                name = row.name,
            )
Example 27
    def es_stream_docs(cls):

        """
        Index institutions.

        Yields:
            dict: The next document.
        """

        for row in query_bar(Institution.select()):

            yield dict(
                _id = row.id,
                name = row.name,
            )
Example 28
    def es_stream_docs(cls):

        """
        Index subfields.

        Yields:
            dict: The next document.
        """

        for row in query_bar(Subfield.select()):

            yield dict(
                _id = row.id,
                name = row.name,
            )
Example 29
    def es_stream_docs(cls):

        """
        Index document texts.

        Yields:
            dict: The next document.
        """

        for row in query_bar(Document_Text.select()):

            yield dict(
                _id = row.document_id,
                body = row.text,
            )
Example 30
def random(out_file, n):
    """
    Write a CSV with plaintext for N random docs.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
        .order_by(fn.random())
        .limit(n)
    )

    for row in query_bar(query):

        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
Example 31
    def dedupe(cls):

        """
        Write deduping hashes.
        """

        from .citation import HLOM_Citation

        cited = (
            cls.select()
            .join(HLOM_Citation)
            .group_by(cls.id)
        )

        for record in query_bar(cited):
            record.metadata['deduping_hash'] = record.hash
            record.save()
Example 32
    def es_stream_docs(cls):

        """
        Stream Elasticsearch docs.

        Yields:
            dict: The next document.
        """

        query = (
            Citation.select()
            .join(Text)
            .where(Text.display==True)
            .where(Text.valid==True)
        )

        for row in query_bar(query):

            doc = {}

            # Local fields:

            doc['_id'] = row.id
            doc['text_id'] = row.text_id
            doc['document_id'] = row.document_id
            doc['corpus'] = row.text.corpus

            # Field references:

            subfield = row.subfield

            if subfield:
                doc['subfield_id'] = subfield.id
                doc['field_id'] = subfield.field_id

            # Institution reference:

            inst = row.institution

            if inst:
                doc['institution_id'] = inst.id
                doc['state'] = inst.state
                doc['country'] = inst.country

            yield doc
Example 33
File: text.py Project: overview/osp
    def term_counts(cls, limit=None):

        """
        Get frequency counts for all unique word types.

        Returns:
            dict: type -> count
        """

        texts = cls.select().limit(limit)
        counts = Counter()

        for row in query_bar(texts):
            row.tokenize()
            for term, offsets in row.terms.items():
                counts[term] += len(offsets)

        return counts
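
The counting pattern is just a Counter keyed by term, incremented by the number of offsets each term has in a text. A toy version with hard-coded term/offset mappings in place of `row.terms`:

    from collections import Counter

    counts = Counter()

    # Toy data: each dict maps term -> character offsets within one text.
    rows = [
        {'the': [0, 10, 20], 'citation': [4]},
        {'the': [0], 'syllabus': [7, 30]},
    ]

    for terms in rows:
        for term, offsets in terms.items():
            counts[term] += len(offsets)

    print(counts)  # Counter({'the': 4, 'syllabus': 2, 'citation': 1})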
Example 34
    def es_stream_docs(cls):

        """
        Index institutions with cited syllabi.

        Yields:
            dict: The next document.
        """

        # TODO: De-circularize.
        from osp.locations.models.doc_inst import Document_Institution
        from osp.citations.hlom.models.citation import HLOM_Citation

        count = fn.Count(Document_Institution.id)

        cited = (
            cls.select(cls, count)
            .join(Document_Institution)
            .group_by(cls.id)
            .order_by(count.desc())
        )

        for inst in query_bar(cited):

            name = (
                inst.metadata['Campus_Name'] or
                inst.metadata['Institution_Name']
            )

            city = (
                inst.metadata['Campus_City'] or
                inst.metadata['Institution_City']
            )

            yield {
                '_id':      inst.id,
                'count':    inst.count,
                'name':     name,
                'city':     city,
                'state':    inst.metadata['Institution_State'],
                'url':      inst.metadata['Institution_Web_Address'],
                'lon':      inst.metadata.get('Longitude'),
                'lat':      inst.metadata.get('Latitude'),
            }
Example 35
def truncated(out_file, frag_len):
    """
    Write a CSV with truncated document texts.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
    )

    for row in query_bar(query):

        # Truncate the text.
        fragment = row.text[:frag_len]

        writer.writerow({'id': row.path, 'title': row.path, 'text': fragment})
Example 36
    def es_stream_docs(cls):
        """
        Stream Elasticsearch docs.

        Yields:
            dict: The next document.
        """

        query = (
            Citation.select()
            .join(Text)
            .where(Text.display == True)
            .where(Text.valid == True)
        )

        for row in query_bar(query):

            doc = {}

            # Local fields:

            doc['_id'] = row.id
            doc['text_id'] = row.text_id
            doc['document_id'] = row.document_id
            doc['corpus'] = row.text.corpus

            # Field references:

            subfield = row.subfield

            if subfield:
                doc['subfield_id'] = subfield.id
                doc['field_id'] = subfield.field_id

            # Institution reference:

            inst = row.institution

            if inst:
                doc['institution_id'] = inst.id
                doc['state'] = inst.state
                doc['country'] = inst.country

            yield doc