def es_stream_docs(cls): """ Index institutions with cited syllabi. Yields: dict: The next document. """ # TODO: De-circularize. from osp.locations.models.doc_inst import Document_Institution from osp.citations.hlom.models.citation import HLOM_Citation count = fn.Count(Document_Institution.id) cited = cls.select(cls, count).join(Document_Institution).group_by(cls.id).order_by(count.desc()) for inst in query_bar(cited): name = inst.metadata["Campus_Name"] or inst.metadata["Institution_Name"] city = inst.metadata["Campus_City"] or inst.metadata["Institution_City"] yield { "_id": inst.id, "count": inst.count, "name": name, "city": city, "state": inst.metadata["Institution_State"], "url": inst.metadata["Institution_Web_Address"], "lon": inst.metadata.get("Longitude"), "lat": inst.metadata.get("Latitude"), }
def deduplicate(cls):

    """
    Deduplicate cited texts.
    """

    for text in query_bar(cls.select_cited()):

        # Has the hash been seen?
        seen = config.redis.sismember(
            redis_keys.OSP_DEDUP,
            text.hash,
        )

        # If so, don't display this text.
        if seen:
            text.display = False

        else:

            # If not, display this text.
            text.display = True

            # And reserve the hash.
            config.redis.sadd(
                redis_keys.OSP_DEDUP,
                text.hash,
            )

        text.save()

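# A minimal sketch of an alternative to the check-then-add pattern above,
# assuming the standard redis-py client: SADD returns the number of members
# actually added, so the membership test and the hash reservation collapse
# into one round trip. Names here are illustrative, not the project's API.
def deduplicate_atomic(texts, redis_client, dedup_key):
    for text in texts:
        # sadd() returns 1 if the hash was new, 0 if it was already reserved.
        text.display = bool(redis_client.sadd(dedup_key, text.hash))
        text.save()
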
def validate(cls, *args, **kwargs):

    """
    Validate all cited texts.
    """

    config = Validate_Config(*args, **kwargs)

    for text in query_bar(cls.select_cited()):

        text.valid = not (

            # Title
            text.title_contains_surname or
            text.title_blacklisted(config.blacklisted_titles) or
            text.title_is_toponym or

            # Surname
            text.surname_blacklisted(config.blacklisted_surnames) or
            text.surname_is_toponym or

            # Focus
            text.unfocused(config.max_fuzz)

        )

        text.save()

def rank(cls):

    """
    Write citation counts and ranks.
    """

    count = fn.COUNT(HLOM_Citation.id)

    query = (
        cls.select(cls, count)
        .join(HLOM_Citation, on=(HLOM_Citation.record==cls.id))
        .order_by(count.desc())
        .group_by(cls.id)
    )

    # Get citation counts.
    counts = [r.count for r in query]

    # Rank counts in ascending order, then flip so the most-cited
    # record gets rank 1.
    ranks = rankdata(counts, 'max')
    ranks = ranks.max()+1 - ranks

    for i, r in enumerate(query_bar(query)):
        r.metadata['citation_count'] = counts[i]
        r.metadata['rank'] = int(ranks[i])
        r.save()

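# A small worked example of the rank inversion above, assuming rankdata is
# scipy.stats.rankdata (the import is not shown in the function itself).
from scipy.stats import rankdata

counts = [40, 10, 25]
ranks = rankdata(counts, 'max')     # array([3., 1., 2.]) - ascending ranks
ranks = ranks.max() + 1 - ranks     # array([1., 3., 2.]) - most-cited gets rank 1
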
def add_edges(self):

    """
    For each syllabus, register citation pairs as edges.
    """

    text_ids = (
        fn.array_agg(Text.id)
        .coerce(False)
        .alias('text_ids')
    )

    docs = (
        Citation
        .select(Citation.document, text_ids)
        .join(Text)
        .where(Text.display==True)
        .where(Text.valid==True)
        .group_by(Citation.document)
    )

    for row in query_bar(docs):
        for tid1, tid2 in combinations(row.text_ids, 2):

            # If the edge exists, increment the weight.
            if self.graph.has_edge(tid1, tid2):
                self.graph[tid1][tid2]['weight'] += 1

            # Otherwise, initialize the edge.
            else:
                self.graph.add_edge(tid1, tid2, weight=1)

def add_edges(self, max_citations=50):

    """
    For each syllabus, register citation pairs as edges.

    Args:
        max_citations (int): Discard documents with > N citations.
    """

    # Aggregate the CNs.
    texts = (
        fn.array_agg(HLOM_Record.control_number)
        .coerce(False)
        .alias('texts')
    )

    # Select syllabi and cited CNs.
    documents = (
        HLOM_Citation
        .select(HLOM_Citation.document, texts)
        .join(HLOM_Record)
        .having(fn.count(HLOM_Record.id) <= max_citations)
        .distinct(HLOM_Citation.document)
        .group_by(HLOM_Citation.document)
    )

    for row in query_bar(documents):
        for cn1, cn2 in combinations(row.texts, 2):

            # If the edge exists, +1 the weight.
            if self.graph.has_edge(cn1, cn2):
                self.graph[cn1][cn2]['weight'] += 1

            # Otherwise, initialize the edge.
            else:
                self.graph.add_edge(cn1, cn2, weight=1)

def validate(cls, *args, **kwargs):

    """
    Validate all cited texts.
    """

    config = Validate_Config(*args, **kwargs)

    for text in query_bar(cls.select_cited()):

        text.valid = not (

            # Title
            text.title_and_author_overlap or
            text.title_blacklisted(config.blacklisted_titles) or
            text.title_is_toponym or

            # Surname
            text.surname_blacklisted(config.blacklisted_surnames) or
            text.surname_is_toponym or

            # Focus
            text.unfocused(config.max_fuzz, config.whitelist)

        )

        text.save()

def truncated(out_file, frag_len):

    """
    Write a CSV with truncated document texts.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
    )

    for row in query_bar(query):

        # Truncate the text.
        fragment = row.text[:frag_len]

        writer.writerow({
            'id': row.path,
            'title': row.path,
            'text': fragment
        })

def queue_text():

    """
    Queue text extraction tasks in the worker.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_text, doc.id)

def random(out_file, n):

    """
    Write a CSV with plaintext for N random docs.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
        .order_by(fn.random())
        .limit(n)
    )

    for row in query_bar(query):
        writer.writerow({
            'id': row.path,
            'title': row.path,
            'text': row.text
        })

def add_edges(self, max_texts=20):

    """
    For each syllabus, register citation pairs as edges.

    Args:
        max_texts (int): Ignore docs with more than N citations.
    """

    text_ids = (
        fn.array_agg(Text.id)
        .coerce(False)
        .alias('text_ids')
    )

    docs = (
        Citation
        .select(Citation.document, text_ids)
        .join(Text)
        .having(fn.count(Text.id) <= max_texts)
        .where(Text.display==True)
        .where(Text.valid==True)
        .group_by(Citation.document)
    )

    for row in query_bar(docs):
        for tid1, tid2 in combinations(row.text_ids, 2):

            # If the edge exists, increment the weight.
            if self.graph.has_edge(tid1, tid2):
                self.graph[tid1][tid2]['weight'] += 1

            # Otherwise, initialize the edge.
            else:
                self.graph.add_edge(tid1, tid2, weight=1)

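# A small self-contained illustration of the pairing step above, assuming the
# graph is a networkx.Graph (consistent with the has_edge/add_edge calls).
# The text ids are made up; each unordered pair is counted once per syllabus.
from itertools import combinations

import networkx as nx

graph = nx.Graph()

for text_ids in [[1, 2, 3], [2, 3]]:  # two hypothetical syllabi
    for tid1, tid2 in combinations(text_ids, 2):
        if graph.has_edge(tid1, tid2):
            graph[tid1][tid2]['weight'] += 1
        else:
            graph.add_edge(tid1, tid2, weight=1)

# Texts 2 and 3 are assigned together on both syllabi:
assert graph[2][3]['weight'] == 2
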
def es_stream_docs(cls, just_cited=False):

    """
    Index all texts.

    Args:
        just_cited (bool): If true, just index texts with citations.

    Yields:
        dict: The next document.
    """

    # By default, index everything.
    if not just_cited:
        query = cls.select()

    # Omit texts without citations.
    else:
        query = (
            cls.select()
            .group_by(cls.id)
            .join(HLOM_Citation, on=(
                cls.document==HLOM_Citation.document
            ))
        )

    for row in query_bar(query):
        yield row.es_doc

def queue_file_metadata():

    """
    Queue file metadata extraction tasks.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_file_metadata, doc.id)

def queue_semester():

    """
    Queue semester regex extraction tasks.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_semester, doc.id)

def queue_archive_url():

    """
    Queue Internet Archive timestamp extraction tasks.
    """

    for doc in query_bar(Document.select()):
        config.rq.enqueue(ext_archive_url, doc.id)

def run_doc_to_inst():

    """
    Match documents -> institutions.
    """

    for doc in query_bar(Document.select()):

        try:
            doc_to_inst(doc.id)

        # Skip documents that fail to match.
        except:
            pass

def run_doc_to_fields():

    """
    Match documents -> fields.
    """

    for doc in query_bar(Document.select()):

        try:
            doc_to_fields(doc.id)

        # Skip documents that fail to match.
        except:
            pass

def es_stream_docs(cls):

    """
    Index all rows.

    Yields:
        dict: The next document.
    """

    for row in query_bar(cls.select()):
        yield row.es_doc

def link(cls):

    """
    Link documents -> institutions.
    """

    domain_to_inst = defaultdict(list)

    # Map domain -> [(regex, inst), ...]
    for inst in ServerSide(Institution.select()):

        domain = parse_domain(inst.url)
        regex = seed_to_regex(inst.url)

        domain_to_inst[domain].append((regex, inst))

    for doc in query_bar(Document.select()):

        try:

            # TODO: Get rid of @property.
            url = doc.syllabus.url
            domain = parse_domain(url)

            # Find institutions with matching URLs.
            matches = []
            for pattern, inst in domain_to_inst[domain]:

                match = pattern.search(url)

                if match:
                    matches.append((match.group(), inst))

            if matches:

                # Sort by length of match, descending.
                matches = sorted(
                    matches,
                    key=lambda x: len(x[0]),
                    reverse=True,
                )

                # Link to the institution with the longest match.
                cls.create(
                    institution=matches[0][1],
                    document=doc,
                )

        except Exception as e:
            print(e)

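# An illustrative sketch of the longest-match tie-break above, with hand-written
# patterns standing in for seed_to_regex() output (not shown here). When two
# institutions share a domain, the longer, more specific match wins.
import re

candidates = [
    (re.compile(r'example\.edu'), 'Example University'),
    (re.compile(r'law\.example\.edu'), 'Example University Law School'),
]

url = 'http://law.example.edu/courses/101/syllabus.pdf'

matches = []
for pattern, inst in candidates:
    match = pattern.search(url)
    if match:
        matches.append((match.group(), inst))

matches = sorted(matches, key=lambda x: len(x[0]), reverse=True)
assert matches[0][1] == 'Example University Law School'
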
def es_stream_docs(cls):

    """
    Index document texts.

    Yields:
        dict: The next document.
    """

    for row in query_bar(Document_Text.select()):

        yield dict(
            _id=row.document_id,
            body=row.text,
        )

def es_stream_docs(cls):

    """
    Index institutions.

    Yields:
        dict: The next document.
    """

    for row in query_bar(Institution.select()):

        yield dict(
            _id=row.id,
            name=row.name,
        )

def es_stream_docs(cls):

    """
    Index subfields.

    Yields:
        dict: The next document.
    """

    for row in query_bar(Subfield.select()):

        yield dict(
            _id=row.id,
            name=row.name,
        )

def dedupe(cls):

    """
    Write deduping hashes.
    """

    from .citation import HLOM_Citation

    cited = (
        cls.select()
        .join(HLOM_Citation)
        .group_by(cls.id)
    )

    for record in query_bar(cited):
        record.metadata['deduping_hash'] = record.hash
        record.save()

def es_stream_docs(cls):

    """
    Stream Elasticsearch docs.

    Yields:
        dict: The next document.
    """

    query = (
        Citation.select()
        .join(Text)
        .where(Text.display==True)
        .where(Text.valid==True)
    )

    for row in query_bar(query):

        doc = {}

        # Local fields:
        doc['_id'] = row.id
        doc['text_id'] = row.text_id
        doc['document_id'] = row.document_id
        doc['corpus'] = row.text.corpus

        # Field references:
        subfield = row.subfield
        if subfield:
            doc['subfield_id'] = subfield.id
            doc['field_id'] = subfield.field_id

        # Institution reference:
        inst = row.institution
        if inst:
            doc['institution_id'] = inst.id
            doc['state'] = inst.state
            doc['country'] = inst.country

        yield doc

def term_counts(cls, limit=None):

    """
    Get frequency counts for all unique word types.

    Returns:
        dict: type -> count
    """

    texts = cls.select().limit(limit)

    counts = Counter()
    for row in query_bar(texts):

        row.tokenize()

        for term, offsets in row.terms.items():
            counts[term] += len(offsets)

    return counts

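# A minimal usage sketch, assuming term_counts() is exposed as a classmethod on
# the text model queried above; the model name and limit below are illustrative.
counts = Document_Text.term_counts(limit=1000)

# Counter.most_common() surfaces the highest-frequency word types.
for term, count in counts.most_common(20):
    print(term, count)
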