def ingest_single_parse(docket, deletions, insertions, parser):
    """Apply document deletions and insertions to a docket's corpus for one parser.

    The corpora tagged with ``docket.id`` are looked up and narrowed to those
    whose ``parser`` metadata equals ``parser``. If none exists a new
    ``Corpus`` is created; if exactly one exists, the documents listed in
    ``deletions`` plus those about to be re-inserted are removed from it
    first. The insertions are then ingested and the corpus' hierarchy cache
    is dropped.

    Args:
        docket: object exposing ``id`` and ``agency`` attributes.
        deletions: list of document ids to remove from an existing corpus
            (it is concatenated with a list, so it must itself be a list).
        insertions: documents to ingest; each must support
            ``d['metadata']['document_id']``.
        parser: either ``'sentence'`` or ``'4-gram'``.

    Raises:
        ValueError: if ``parser`` is not one of the supported names.
        RuntimeError: if more than one corpus already exists for this docket
            and parser; the extras have to be removed manually.
    """
    # String exceptions (``raise "message"``) are not valid exceptions, so
    # real exception classes are raised to keep the messages visible.
    if parser not in ('sentence', '4-gram'):
        raise ValueError("Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser)

    corpora = get_corpora_by_metadata('docket_id', docket.id)
    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) > 1:
        raise RuntimeError(
            "More than one %s parse for docket %s found. Shouldn't happen--"
            "will need to manually remove extra corpora." % (parser, docket.id))

    if parsed_corpora:
        c = parsed_corpora[0]
        print("Updating existing corpus #%s for %s parse." % (c.id, parser))
        print("Deleting documents at %s..." % datetime.now())
        # Documents being re-inserted are deleted too, so that ingesting them
        # again does not leave stale copies behind.
        reinserted_ids = [d['metadata']['document_id'] for d in insertions]
        c.delete_by_metadata('document_id', deletions + reinserted_ids)
    else:
        c = Corpus(metadata=dict(docket_id=docket.id, agency_id=docket.agency, parser=parser))
        print("Created new corpus #%s for %s parse." % (c.id, parser))

    print("Inserting documents at %s..." % datetime.now())
    # Only the 4-gram parse computes similarities; the sentence parse skips them.
    if parser == 'sentence':
        i = DocumentIngester(c, parser=sentence_parse, compute_similarities=False)
    else:
        i = DocumentIngester(c, parser=ngram_parser(4), compute_similarities=True)
    i.ingest(insertions)

    print("Removing hierarchy, if cached, at %s..." % datetime.now())
    c.delete_hierarchy_cache()
def ingest_docket(agency, docket, docs, ngrams=None):
    """Create a fresh corpus for a docket and ingest ``docs`` into it.

    Args:
        agency: value stored under the corpus' ``agency`` metadata key.
        docket: value stored under the corpus' ``docket`` metadata key.
        docs: sized collection of documents handed to the ingester.
        ngrams: when truthy, the n-gram size (anything ``int()`` accepts);
            otherwise the ingester is built without an explicit parser and
            the corpus is tagged ``'sentence'``.
    """
    print("Beginning processing %s at %s" % (docket, datetime.now()))

    if ngrams:
        parser_tag = "%s-grams" % ngrams
    else:
        parser_tag = 'sentence'
    corpus_metadata = dict(docket=docket, agency=agency, parser=parser_tag)
    corpus = Corpus(metadata=corpus_metadata)

    # The parser keyword is only passed for n-gram runs; without it the
    # ingester is constructed exactly as ``DocumentIngester(corpus)``.
    ingester_options = {}
    if ngrams:
        ingester_options['parser'] = ngram_parser(int(ngrams))
    ingester = DocumentIngester(corpus, **ingester_options)
    ingester.ingest(docs)

    print("Finished processing at %s" % datetime.now())
    print("Added %d documents in corpus %d" % (len(docs), corpus.id))
def ingest_single_parse(docket_id, deletions, insertions, parser):
    """Apply document deletions and insertions to a docket's corpus for one parser.

    The corpora tagged with ``docket_id`` are looked up and narrowed to those
    whose ``parser`` metadata equals ``parser``. If none exists a new
    ``Corpus`` is created; if exactly one exists, the documents listed in
    ``deletions`` plus those about to be re-inserted are removed from it
    first. The insertions are then ingested and the corpus' hierarchy cache
    is dropped.

    Args:
        docket_id: docket identifier string; when no agency is stored for the
            docket, the part before the first ``"-"`` is used as the agency id.
        deletions: list of document ids to remove from an existing corpus
            (it is concatenated with a list, so it must itself be a list).
        insertions: documents to ingest; each must support
            ``d['metadata']['document_id']``.
        parser: either ``'sentence'`` or ``'4-gram'``.

    Raises:
        ValueError: if ``parser`` is not one of the supported names.
        RuntimeError: if more than one corpus already exists for this docket
            and parser; the extras have to be removed manually.
    """
    # String exceptions (``raise "message"``) are not valid exceptions, so
    # real exception classes are raised to keep the messages visible.
    if parser not in ('sentence', '4-gram'):
        raise ValueError("Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser)

    corpora = get_corpora_by_metadata('docket_id', docket_id)
    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) > 1:
        raise RuntimeError(
            "More than one %s parse for docket %s found. Shouldn't happen--"
            "will need to manually remove extra corpora." % (parser, docket_id))

    if parsed_corpora:
        c = parsed_corpora[0]
        print("Updating existing corpus #%s for %s parse." % (c.id, parser))
        print("Deleting documents at %s..." % datetime.now())
        # Documents being re-inserted are deleted too, so that ingesting them
        # again does not leave stale copies behind.
        reinserted_ids = [d['metadata']['document_id'] for d in insertions]
        c.delete_by_metadata('document_id', deletions + reinserted_ids)
    else:
        # Only the agency field is requested; an empty Docket stands in when
        # no record is found for this id.
        dockets = list(Docket.objects(id=docket_id).only('agency'))
        docket = dockets[0] if dockets else Docket()
        agency_id = docket.agency if docket.agency else docket_id.split("-")[0]
        c = Corpus(metadata=dict(docket_id=docket_id, agency_id=agency_id, parser=parser))
        print("Created new corpus #%s for %s parse." % (c.id, parser))

    print("Inserting documents at %s..." % datetime.now())
    # Only the 4-gram parse computes similarities; the sentence parse skips them.
    if parser == 'sentence':
        i = DocumentIngester(c, parser=sentence_parse, compute_similarities=False)
    else:
        i = DocumentIngester(c, parser=ngram_parser(4), compute_similarities=True)
    i.ingest(insertions)

    print("Removing hierarchy, if cached, at %s..." % datetime.now())
    c.delete_hierarchy_cache()
def handle(self, ls_docs_path, **options):
    """Load document texts from a JSON listing and ingest them into a new corpus.

    The JSON file is expected to hold an iterable of entries, each with a
    ``'documents'`` list whose items may carry a ``'text'`` value; items
    without a truthy ``'text'`` are skipped.

    Args:
        ls_docs_path: path of the JSON file to read.
        **options: command options; a truthy ``'ngrams'`` entry selects an
            n-gram parser of that size (converted with ``int()``), otherwise
            the ingester is built without an explicit parser.
    """
    # Read inside a ``with`` block so the file handle is closed promptly
    # rather than left for the garbage collector.
    with open(ls_docs_path, 'r') as docs_file:
        listing = json.load(docs_file)

    docs = []
    for doc in listing:
        docs.extend(d['text'] for d in doc['documents'] if d.get('text'))

    # Characters that cannot be represented in ASCII are replaced with '?'.
    cleaned_docs = [d.encode('ascii', 'replace') for d in docs]

    print("Beginning processing at %s" % datetime.now())

    # Corpus creation and ingestion share one transaction.
    with transaction.commit_on_success():
        c = Corpus()
        if options.get('ngrams'):
            i = DocumentIngester(c, parser=ngram_parser(int(options['ngrams'])))
        else:
            i = DocumentIngester(c)
        i.ingest(cleaned_docs)

    print("Finished processing at %s" % datetime.now())
    print("Added %d documents in corpus %d" % (len(cleaned_docs), c.id))
def handle(self, ls_docs_path, **options):
    """Load document texts from a JSON listing and ingest them into a new corpus.

    The JSON file is expected to hold an iterable of entries, each with a
    ``'documents'`` list whose items may carry a ``'text'`` value; items
    without a truthy ``'text'`` are skipped.

    Args:
        ls_docs_path: path of the JSON file to read.
        **options: command options; a truthy ``'ngrams'`` entry selects an
            n-gram parser of that size (converted with ``int()``), otherwise
            the ingester is built without an explicit parser.
    """
    # Read inside a ``with`` block so the file handle is closed promptly
    # rather than left for the garbage collector.
    with open(ls_docs_path, 'r') as docs_file:
        listing = json.load(docs_file)

    docs = []
    for doc in listing:
        docs.extend(d['text'] for d in doc['documents'] if d.get('text'))

    # Characters that cannot be represented in ASCII are replaced with '?'.
    cleaned_docs = [d.encode('ascii', 'replace') for d in docs]

    print("Beginning processing at %s" % datetime.now())

    # Corpus creation and ingestion share one transaction.
    with transaction.commit_on_success():
        c = Corpus()
        if options.get('ngrams'):
            i = DocumentIngester(c, parser=ngram_parser(int(options['ngrams'])))
        else:
            i = DocumentIngester(c)
        i.ingest(cleaned_docs)

    print("Finished processing at %s" % datetime.now())
    print("Added %d documents in corpus %d" % (len(cleaned_docs), c.id))