Esempio n. 1
0
def ingest_single_parse(docket, deletions, insertions, parser):
    """Ingest documents into the corpus that holds one parse of a docket.

    The corpora tagged with ``docket.id`` are looked up and filtered down to
    those whose ``parser`` metadata equals ``parser``. When none exists a new
    corpus is created. When exactly one exists, the documents listed in
    ``deletions`` plus the documents about to be (re-)inserted are removed
    from it first. The insertions are then ingested and the corpus's
    hierarchy cache is deleted.

    Args:
        docket: object exposing ``id`` and ``agency`` attributes.
        deletions: list of document ids to remove from an existing corpus.
        insertions: list of dicts, each providing
            ``['metadata']['document_id']``; handed to the ingester as-is.
        parser: either ``'sentence'`` or ``'4-gram'``.

    Raises:
        ValueError: if ``parser`` is not one of the supported names.
        RuntimeError: if more than one corpus matches the docket and parser.
    """
    # NOTE: single-argument print(...) behaves identically as a Python 2
    # print statement and as a Python 3 function call.
    if parser not in ('sentence', '4-gram'):
        # Raising a bare string is not a valid exception; use a real type.
        raise ValueError(
            "Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser)

    corpora = get_corpora_by_metadata('docket_id', docket.id)
    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) > 1:
        raise RuntimeError(
            "More than one %s parse for docket %s found. Shouldn't happen--"
            "will need to manually remove extra corpora." % (parser, docket.id))

    if not parsed_corpora:
        c = Corpus(metadata=dict(docket_id=docket.id, agency_id=docket.agency, parser=parser))
        print("Created new corpus #%s for %s parse." % (c.id, parser))
    else:
        c = parsed_corpora[0]
        print("Updating existing corpus #%s for %s parse." % (c.id, parser))

        print("Deleting documents at %s..." % datetime.now())
        # Documents being re-inserted are deleted too, so the ingest below
        # replaces them rather than adding to them.
        c.delete_by_metadata(
            'document_id',
            deletions + [d['metadata']['document_id'] for d in insertions])

    print("Inserting documents at %s..." % datetime.now())
    if parser == 'sentence':
        i = DocumentIngester(c, parser=sentence_parse, compute_similarities=False)
    else:
        # parser == '4-gram' (guaranteed by the validation above)
        i = DocumentIngester(c, parser=ngram_parser(4), compute_similarities=True)
    i.ingest(insertions)

    print("Removing hierarchy, if cached, at %s..." % datetime.now())
    c.delete_hierarchy_cache()
Esempio n. 2
0
def ingest_docket(agency, docket, docs, ngrams=None):
    """Create a fresh corpus for a docket and ingest ``docs`` into it.

    The corpus metadata records ``docket``, ``agency`` and a parser tag:
    ``"<ngrams>-grams"`` when ``ngrams`` is truthy, ``'sentence'`` otherwise.
    With ``ngrams`` set, an n-gram parser of size ``int(ngrams)`` is passed to
    the ingester; without it, the ingester is built with its default parser.
    Progress messages are printed before and after ingestion.
    """
    print("Beginning processing %s at %s" % (docket, datetime.now()))

    if ngrams:
        parser_tag = "%s-grams" % ngrams
    else:
        parser_tag = 'sentence'

    corpus = Corpus(metadata={'docket': docket, 'agency': agency, 'parser': parser_tag})

    # The corpus is created before the parser is built, as in the original
    # ordering of side effects.
    ingester = (DocumentIngester(corpus, parser=ngram_parser(int(ngrams)))
                if ngrams else DocumentIngester(corpus))
    ingester.ingest(docs)

    print("Finished processing at %s" % datetime.now())
    print("Added %d documents in corpus %d" % (len(docs), corpus.id))
Esempio n. 3
0
def ingest_single_parse(docket_id, deletions, insertions, parser):
    """Ingest documents into the corpus that holds one parse of a docket.

    The corpora tagged with ``docket_id`` are looked up and filtered down to
    those whose ``parser`` metadata equals ``parser``. When none exists a new
    corpus is created; its ``agency_id`` comes from the stored Docket's
    ``agency`` if that is truthy, otherwise from the part of ``docket_id``
    before the first ``"-"``. When exactly one corpus exists, the documents
    listed in ``deletions`` plus the documents about to be (re-)inserted are
    removed from it first. The insertions are then ingested and the corpus's
    hierarchy cache is deleted.

    Args:
        docket_id: string id of the docket.
        deletions: list of document ids to remove from an existing corpus.
        insertions: list of dicts, each providing
            ``['metadata']['document_id']``; handed to the ingester as-is.
        parser: either ``'sentence'`` or ``'4-gram'``.

    Raises:
        ValueError: if ``parser`` is not one of the supported names.
        RuntimeError: if more than one corpus matches the docket and parser.
    """
    # NOTE: single-argument print(...) behaves identically as a Python 2
    # print statement and as a Python 3 function call.
    if parser not in ('sentence', '4-gram'):
        # Raising a bare string is not a valid exception; use a real type.
        raise ValueError(
            "Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser)

    corpora = get_corpora_by_metadata('docket_id', docket_id)
    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) > 1:
        raise RuntimeError(
            "More than one %s parse for docket %s found. Shouldn't happen--"
            "will need to manually remove extra corpora." % (parser, docket_id))

    if not parsed_corpora:
        dockets = list(Docket.objects(id=docket_id).only('agency'))
        # Fall back to an empty Docket when no stored record is found.
        docket = dockets[0] if dockets else Docket()
        agency_id = docket.agency if docket.agency else docket_id.split("-")[0]
        c = Corpus(metadata=dict(docket_id=docket_id,
                                 agency_id=agency_id,
                                 parser=parser))
        print("Created new corpus #%s for %s parse." % (c.id, parser))
    else:
        c = parsed_corpora[0]
        print("Updating existing corpus #%s for %s parse." % (c.id, parser))

        print("Deleting documents at %s..." % datetime.now())
        # Documents being re-inserted are deleted too, so the ingest below
        # replaces them rather than adding to them.
        c.delete_by_metadata(
            'document_id',
            deletions + [d['metadata']['document_id'] for d in insertions])

    print("Inserting documents at %s..." % datetime.now())
    if parser == 'sentence':
        i = DocumentIngester(c,
                             parser=sentence_parse,
                             compute_similarities=False)
    else:
        # parser == '4-gram' (guaranteed by the validation above)
        i = DocumentIngester(c,
                             parser=ngram_parser(4),
                             compute_similarities=True)
    i.ingest(insertions)

    print("Removing hierarchy, if cached, at %s..." % datetime.now())
    c.delete_hierarchy_cache()
    def handle(self, ls_docs_path, **options):
        """Load document texts from a JSON file and ingest them into a new corpus.

        Args:
            ls_docs_path: path to a JSON file containing a sequence of
                objects, each with a ``documents`` list; entries of that list
                with a truthy ``text`` value contribute that text.
            **options: when ``options['ngrams']`` is truthy it is converted
                to ``int`` and used as the n-gram size for the parser;
                otherwise the ingester is built with its default parser.
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(ls_docs_path, 'r') as docs_file:
            entries = json.load(docs_file)

        docs = []
        for doc in entries:
            docs.extend(d['text'] for d in doc['documents'] if d.get('text'))

        # Non-ASCII characters are replaced rather than raising.
        cleaned_docs = [d.encode('ascii', 'replace') for d in docs]

        # NOTE: single-argument print(...) behaves identically as a Python 2
        # print statement and as a Python 3 function call.
        print("Beginning processing at %s" % datetime.now())

        with transaction.commit_on_success():
            c = Corpus()
            if options.get('ngrams'):
                i = DocumentIngester(c, parser=ngram_parser(int(options['ngrams'])))
            else:
                i = DocumentIngester(c)
            i.ingest(cleaned_docs)

        print("Finished processing at %s" % datetime.now())

        print("Added %d documents in corpus %d" % (len(cleaned_docs), c.id))
Esempio n. 5
0
    def handle(self, ls_docs_path, **options):
        """Load document texts from a JSON file and ingest them into a new corpus.

        Args:
            ls_docs_path: path to a JSON file containing a sequence of
                objects, each with a ``documents`` list; entries of that list
                with a truthy ``text`` value contribute that text.
            **options: when ``options['ngrams']`` is truthy it is converted
                to ``int`` and used as the n-gram size for the parser;
                otherwise the ingester is built with its default parser.
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(ls_docs_path, 'r') as docs_file:
            entries = json.load(docs_file)

        docs = []
        for doc in entries:
            docs.extend(d['text'] for d in doc['documents'] if d.get('text'))

        # Non-ASCII characters are replaced rather than raising.
        cleaned_docs = [d.encode('ascii', 'replace') for d in docs]

        # NOTE: single-argument print(...) behaves identically as a Python 2
        # print statement and as a Python 3 function call.
        print("Beginning processing at %s" % datetime.now())

        with transaction.commit_on_success():
            c = Corpus()
            if options.get('ngrams'):
                i = DocumentIngester(c,
                                     parser=ngram_parser(int(
                                         options['ngrams'])))
            else:
                i = DocumentIngester(c)
            i.ingest(cleaned_docs)

        print("Finished processing at %s" % datetime.now())

        print("Added %d documents in corpus %d" % (len(cleaned_docs), c.id))