Ejemplo n.º 1
0
    def handle(self, *args, **options):
        """Print a summary of the task queue of the 'default' server.

        Shows the combined number of queued and active tasks, a per-action
        count of the queued tasks, and one line per active task. Nothing is
        printed when the queue response does not report success.
        """
        client = from_django_conf('default')
        response = client.queue()

        if not (response['success'] == True):
            return

        rows = response['rows']
        queued = [row for row in rows if row['status'] == 'Queued']
        active = [row for row in rows if row['status'] == 'Active']

        # A '+' is appended to the length unless the 'next' cursor is an
        # empty string.
        next_cursor = response.get('cursors', {}).get('next')
        suffix = '' if next_cursor == '' else '+'
        # Single-argument print(...) parses as an ordinary Python 2 print
        # statement, so the output is unchanged.
        print('Length: {0}{1}'.format(len(queued) + len(active), suffix))

        queued_actions = [row['action'] for row in queued]
        for (action, cnt) in freq(queued_actions).iteritems():
            print('  {action!s:<20}  {cnt!s:>10}'.format(action=action,
                                                         cnt=cnt))

        if not active:
            print('No active tasks')
            return

        print('Active task(s):')
        # Extra detail appended to the task line for specific actions.
        detail_by_action = {
            'Add Association': ' for document ({doctype}, {docid})',
            'Add Associations': ' from {source} => {target}',
        }
        for task in active:
            line = '  Task #{id}: {action} (priority {priority})'
            line += detail_by_action.get(task['action'], '')
            print(line.format(**task))
Ejemplo n.º 2
0
    def handle(self, *args, **options):
        """Report the state of the 'default' server's task queue.

        Prints the total of queued plus active tasks, the number of queued
        tasks per action, and a description of every active task. Produces
        no output when the queue response does not report success.
        """
        server = from_django_conf('default')
        response = server.queue()

        if not (response['success'] == True):
            return

        by_status = {'Queued': [], 'Active': []}
        for entry in response['rows']:
            if entry['status'] in by_status:
                by_status[entry['status']].append(entry)
        queued = by_status['Queued']
        active = by_status['Active']

        # Only an empty-string 'next' cursor suppresses the trailing '+'.
        if response.get('cursors', {}).get('next') == '':
            marker = ''
        else:
            marker = '+'
        # Single-argument print(...) parses as an ordinary Python 2 print
        # statement, so the output is unchanged.
        print('Length: {0}{1}'.format(len(queued) + len(active), marker))

        counts = freq([entry['action'] for entry in queued])
        for (action, cnt) in counts.iteritems():
            print('  {action!s:<20}  {cnt!s:>10}'.format(action=action,
                                                         cnt=cnt))

        if len(active) > 0:
            print('Active task(s):')
            for entry in active:
                template = '  Task #{id}: {action} (priority {priority})'
                kind = entry['action']
                if kind == 'Add Association':
                    template += ' for document ({doctype}, {docid})'
                elif kind == 'Add Associations':
                    template += ' from {source} => {target}'
                print(template.format(**entry))
        else:
            print('No active tasks')
Ejemplo n.º 3
0
def execute_search(doc, doctype=None):
    """Search Superfastmatch for ``doc.text`` and post-process the result.

    A string result from the client is treated as an error body and is
    returned wrapped in an HTML ``HttpResponse``. Otherwise the response is
    filtered in place by the helpers below, using thresholds read from
    ``settings.APIPROXY``, and returned.
    """
    client = from_django_conf()
    response = client.search(doc.text, doctype)

    if isinstance(response, str):
        # Pass the SFM error back to the client
        return HttpResponse(response, content_type='text/html')

    proxy_conf = settings.APIPROXY

    commonality = proxy_conf.get('commonality_threshold', 0.4)
    drop_common_fragments(commonality, response)

    proper_nouns = proxy_conf.get('proper_noun_threshold', 0.8)
    ignore_proper_nouns(proper_nouns, doc.text, response)

    min_unique = proxy_conf.get('minimum_unique_characters', 3)
    ignore_repetitious_characters(min_unique, doc.text, response)

    if doc.url:
        # Drop rows that point back at the searched document. The slice
        # assignment keeps the same list object.
        rows = response['documents']['rows']
        rows[:] = [row for row in rows if row.get('url') != doc.url]

    embellishments = proxy_conf.get('embellishments', {})
    embellish(doc.text, response, **embellishments)
    return response
Ejemplo n.º 4
0
def document(request, doctype, docid):
    """
    Proxies requests for specific documents to Superfastmatch.

    GET fetches the document (status 200). POST and PUT add it
    (status 202); PUT requests are added with ``defer=True``. Any other
    method, or an add request without a ``text`` field, gets a
    400 Bad Request response.

    Does not implement the DELETE method so as to avoid access
    control issues.
    """

    sfm = from_django_conf()
    method = request.method
    if method in ('POST', 'PUT'):
        # PUT bodies are decoded by hand from the raw request body.
        # NOTE(review): raw_post_data was replaced by request.body in later
        # Django releases -- confirm against the project's Django version.
        if method == 'PUT':
            params = QueryDict(request.raw_post_data)
        else:
            params = request.POST

        # Reject the request explicitly instead of failing with an
        # unhandled key error when the document text is missing.
        if 'text' not in params:
            return HttpResponseBadRequest('The text parameter is required.')

        text = params['text']
        extra = dict((k, v) for (k, v) in params.items()
                     if k not in ('put', 'text'))
        response = sfm.add(doctype, docid, text=text,
                           defer=(method == 'PUT'), **extra)
        http_status = 202

    elif method == 'GET':
        response = sfm.document(doctype, docid)
        http_status = 200

    else:
        # The message lists PUT as well, since PUT is handled above.
        return HttpResponseBadRequest(
            'Only the GET, POST and PUT methods are supported.')

    # A string response is passed through as HTML with the default status.
    if isinstance(response, str):
        return HttpResponse(response, content_type='text/html')
    return HttpResponse(json.dumps(response),
                        status=http_status,
                        content_type='application/json')
Ejemplo n.º 5
0
def execute_search(doc, doctype=None):
    """Run a Superfastmatch search for ``doc.text`` and tidy the result.

    If the client returns a string it is passed back as an HTML
    ``HttpResponse``. Otherwise the response is filtered in place by the
    helpers below, configured from ``settings.APIPROXY``, and returned.
    """
    response = from_django_conf().search(doc.text, doctype)

    if isinstance(response, str):
        # Pass the SFM error back to the client
        return HttpResponse(response, content_type='text/html')

    options = settings.APIPROXY

    drop_common_fragments(
        options.get('commonality_threshold', 0.4), response)
    ignore_proper_nouns(
        options.get('proper_noun_threshold', 0.8), doc.text, response)
    ignore_repetitious_characters(
        options.get('minimum_unique_characters', 3), doc.text, response)

    if doc.url:
        # Keep only rows whose URL differs from the searched document's,
        # replacing the list contents in place.
        matches = response['documents']['rows']
        kept = []
        for match in matches:
            if match.get('url') != doc.url:
                kept.append(match)
        matches[:] = kept

    embellish(doc.text, response, **options.get('embellishments', {}))
    return response
Ejemplo n.º 6
0
    def handle(self, *args, **options):
        """Check every Superfastmatch document against the Release table.

        Documents are iterated in docid order for ``options['doctype']``.
        A doctype mismatch between a document and its release is logged.
        A document without a release is deleted from Superfastmatch, or
        only reported when ``options['dry_run']`` is not False.
        A ValueError raised during the iteration stops it and is logged.
        """
        self.errors = set()

        sfm = from_django_conf()

        docs = DocumentIterator(sfm, order_by='docid',
                                doctype=options['doctype'],
                                chunksize=1000,
                                fetch_text=False)

        # Bound before the loop so the ValueError handler can tell whether
        # any document was fetched. Without this, a failure before the first
        # iteration made the handler itself raise NameError.
        doc = None
        try:
            for doc in docs:
                try:
                    release = Release.objects.get(id=doc['docid'])
                    doctype = release.source.doc_type or settings.DEFAULT_DOCTYPE
                    if doctype != doc['doctype']:
                        logging.warning("Doctype mismatch for document ({0[doctype]},{0[docid]}) and release #{1.id} (source: {1.source}, doctype: {2}).".format(doc, release, doctype))

                except Release.DoesNotExist:
                    # Explicit comparison kept on purpose: a dry_run of None
                    # must not trigger a deletion.
                    if options['dry_run'] == False:
                        sfm.delete(doc['doctype'], doc['docid'])
                        logging.warning("Deleting document ({0[doctype]},{0[docid]}) because there is no corresponding press release.".format(doc))
                    else:
                        logging.warning("Document ({0[doctype]},{0[docid]}) does not have a corresponding press release.".format(doc))

        except ValueError:
            if doc is None:
                logging.error("Failed before any document could be retrieved")
            else:
                logging.error("Failed on document {0},{1}".format(doc['doctype'], doc['docid']))
Ejemplo n.º 7
0
    def handle(self, *args, **kwargs):
        """Scrape every stale source of type 2 and record failures.

        Raises CommandError when SUPERFASTMATCH or DEFAULT_DOCTYPE is not
        configured. After a successful scrape the source's retrieval time
        is updated and its unresolved failures are marked resolved. Any
        exception raised for a source is stored as a SourceScrapeFailure
        and processing continues with the next source.
        """
        required_settings = (
            ('SUPERFASTMATCH',
             'You must configure SUPERFASTMATCH in your project settings.'),
            ('DEFAULT_DOCTYPE',
             'You must specify a DEFAULT_DOCTYPE in your project settings.'),
        )
        for (setting_name, message) in required_settings:
            if not hasattr(settings, setting_name):
                raise CommandError(message)

        self.sfm = from_django_conf()

        for source in Source.objects.filter(source_type=2):
            try:
                if not source.is_stale():
                    continue

                self.scrape_releases(source)
                source.last_retrieved = now()
                source.save()

                # A successful scrape resolves the outstanding failures.
                unresolved = SourceScrapeFailure.objects.filter(
                    resolved__isnull=True, source=source)
                for old_failure in unresolved:
                    old_failure.resolved = now()
                    old_failure.save()

            except SourceScrapeFailure as failure:
                failure.save()

            except Exception as e:
                SourceScrapeFailure.objects.create(source=source,
                                                   description=unicode(e))
Ejemplo n.º 8
0
def document(request, doctype, docid):
    """
    Proxies requests for specific documents to Superfastmatch.

    POST adds the document (status 202); every other method fetches it
    (status 200). A string response from the client is returned as HTML,
    anything else is serialized as JSON.
    """

    sfm = from_django_conf()
    if request.method != 'POST':
        response = sfm.document(doctype, docid)
        http_status = 200
    else:
        form = request.POST
        text = form['text']
        # The add is deferred unless a 'put' field is present with a value
        # other than the string 'False'.
        defer = not ('put' in form and form['put'] != 'False')
        extra = dict((k, v) for (k, v) in form.items()
                     if k not in ['put', 'text'])
        response = sfm.add(doctype, docid, text=text, defer=defer, **extra)
        http_status = 202

    if not isinstance(response, str):
        return HttpResponse(json.dumps(response),
                            status=http_status,
                            content_type='application/json')
    return HttpResponse(response, content_type='text/html')
Ejemplo n.º 9
0
def document(request, doctype, docid):
    """
    Proxies requests for specific documents to Superfastmatch.

    GET fetches the document (status 200). POST and PUT add it
    (status 202); PUT requests are added with ``defer=True``. Any other
    method, or an add request without a ``text`` field, gets a
    400 Bad Request response.

    Does not implement the DELETE method so as to avoid access
    control issues.
    """

    sfm = from_django_conf()
    method = request.method
    if method in ('POST', 'PUT'):
        # PUT bodies are decoded by hand from the raw request body.
        # NOTE(review): raw_post_data was replaced by request.body in later
        # Django releases -- confirm against the project's Django version.
        if method == 'PUT':
            params = QueryDict(request.raw_post_data)
        else:
            params = request.POST

        # Reject the request explicitly instead of failing with an
        # unhandled key error when the document text is missing.
        if 'text' not in params:
            return HttpResponseBadRequest('The text parameter is required.')

        text = params['text']
        extra = dict((k, v) for (k, v) in params.items()
                     if k not in ('put', 'text'))
        response = sfm.add(doctype, docid, text=text,
                           defer=(method == 'PUT'), **extra)
        http_status = 202

    elif method == 'GET':
        response = sfm.document(doctype, docid)
        http_status = 200

    else:
        # The message lists PUT as well, since PUT is handled above.
        return HttpResponseBadRequest(
            'Only the GET, POST and PUT methods are supported.')

    # A string response is passed through as HTML with the default status.
    if isinstance(response, str):
        return HttpResponse(response, content_type='text/html')
    return HttpResponse(json.dumps(response),
                        status=http_status,
                        content_type='application/json')
Ejemplo n.º 10
0
    def handle(self, *args, **options):
        """Print the URL and document count of each configured server.

        Iterates over ``settings.SUPERFASTMATCH``. For a federated client
        the details of every member client are printed. Client and socket
        errors are reported on stdout and do not abort the listing.
        """
        for (key, _cfg) in settings.SUPERFASTMATCH.iteritems():
            sfm = from_django_conf(key)
            try:
                # Single-argument print(...) parses as an ordinary Python 2
                # print statement, so the output is unchanged.
                if not isinstance(sfm, superfastmatch.federated.FederatedClient):
                    print("{0}".format(key))
                    print('  url: {0}'.format(sfm.url))

                    listing = sfm.documents()
                    if listing['success'] == True:
                        print('  documents: {0}'.format(listing['total']))
                    else:
                        print('  Unable to query for documents.')
                    continue

                print("{0} (federated)".format(key))
                for (doctypes, client) in sfm.clients().iteritems():
                    print("  doctypes: {0}".format(
                        doctypes.replace(':', ', ')))
                    print("    url: {0}".format(client.url))
                    # limit=1 keeps the request small; only 'total' is read.
                    listing = client.documents(doctype=doctypes,
                                               order_by='docid',
                                               limit=1)
                    print("    documents: {0}".format(listing['total']))
            except (superfastmatch.SuperFastMatchError, socket.error) as e:
                print('  Unable to query for documents: {0}'.format(str(e)))
Ejemplo n.º 11
0
    def handle(self, server, inpath, *args, **options):
        """Restore documents from ``inpath`` into the named server.

        Raises CommandError when ``inpath`` does not exist. The 'doctypes'
        and 'dryrun' options are forwarded to ``restore``.
        """
        if os.path.exists(inpath):
            client = from_django_conf(server)
            restore_kwargs = {
                'doctype_mappingstr': options.get('doctypes'),
                'dryrun': options.get('dryrun'),
            }
            restore(client, inpath, **restore_kwargs)
        else:
            raise CommandError("No such file: {0}".format(inpath))
Ejemplo n.º 12
0
def recall(request, uuid, doctype, docid):
    """Look up the stored search ``uuid`` on the 'sidebyside' server.

    A NOT_FOUND client error becomes an Http404; any other client error
    propagates unchanged.

    NOTE(review): the visible code does nothing with the search result and
    returns no response -- the function may continue beyond this excerpt.
    """
    client = from_django_conf('sidebyside')
    try:
        sfm_results = client.search(text=None, uuid=uuid)
    except superfastmatch.SuperFastMatchError as error:
        if error.status != httplib.NOT_FOUND:
            raise
        raise Http404('Article {uuid} not found'.format(uuid=uuid))
Ejemplo n.º 13
0
def recall(request, uuid, doctype, docid):
    """Fetch the search identified by ``uuid`` from the 'sidebyside' server.

    Raises Http404 when the client reports NOT_FOUND and re-raises every
    other client error.

    NOTE(review): nothing is returned in the code shown here; the body may
    be truncated in this excerpt.
    """
    server = from_django_conf('sidebyside')
    try:
        sfm_results = server.search(text=None, uuid=uuid)
    except superfastmatch.SuperFastMatchError as err:
        missing = (err.status == httplib.NOT_FOUND)
        if not missing:
            raise
        raise Http404('Article {uuid} not found'.format(uuid=uuid))
Ejemplo n.º 14
0
    def handle(self, server, outpath, doctype_rangestr=None, *args, **options):
        """Back up the named server's documents to ``outpath``.

        Raises CommandError rather than overwrite an existing file.
        """
        client = from_django_conf(server)

        already_there = os.path.exists(outpath)
        if already_there:
            message = "I have nothing against {0}, why would I overwrite it?"
            raise CommandError(message.format(outpath))

        # The parsed value is discarded; presumably this call only validates
        # the range string before the backup starts -- TODO confirm.
        if doctype_rangestr is not None:
            parse_doctype_range(doctype_rangestr)

        backup(client, outpath, doctype_rangestr)
Ejemplo n.º 15
0
    def handle(self, server, outpath, doctype_rangestr=None, *args, **options):
        """Write a backup of the named server to ``outpath``.

        An existing ``outpath`` is never overwritten; CommandError is
        raised instead.
        """
        sfm = from_django_conf(server)

        if not os.path.exists(outpath):
            # The return value is unused; presumably the call is here to
            # reject a malformed range early -- TODO confirm.
            if doctype_rangestr is not None:
                parse_doctype_range(doctype_rangestr)
            backup(sfm, outpath, doctype_rangestr)
            return

        template = "I have nothing against {0}, why would I overwrite it?"
        raise CommandError(template.format(outpath))
Ejemplo n.º 16
0
def attach_document_text(results, maxdocs=None):
    """Add a 'text' entry to result rows by fetching each document.

    When ``maxdocs`` is truthy the rows are first sorted in place by their
    'characters' value and only the first ``maxdocs`` rows are processed.
    Rows whose document lookup does not report success are left unchanged.
    """
    client = from_django_conf('sidebyside')
    rows = results['documents']['rows']
    if maxdocs:
        rows.sort(key=itemgetter('characters'))

    for (position, row) in enumerate(rows):
        # Stop once the requested number of rows has been handled.
        if maxdocs and position >= maxdocs:
            break

        fetched = client.document(row['doctype'], row['docid'])
        if fetched['success'] == True:
            row['text'] = fetched['text']
Ejemplo n.º 17
0
def attach_document_text(results, maxdocs=None):
    """Fetch document text for result rows and store it under 'text'.

    With a truthy ``maxdocs`` the rows are sorted in place by 'characters'
    and processing stops after ``maxdocs`` rows. A row is only updated when
    its document lookup reports success.
    """
    server = from_django_conf('sidebyside')
    limited = bool(maxdocs)
    documents = results['documents']['rows']
    if limited:
        documents.sort(key=itemgetter('characters'))

    count = 0
    for entry in documents:
        if limited and count >= maxdocs:
            return
        count += 1

        lookup = server.document(entry['doctype'], entry['docid'])
        if lookup['success'] == True:
            entry['text'] = lookup['text']
Ejemplo n.º 18
0
def association(request, doctype=None):
    """
    Proxies requests for lists of associations to Superfastmatch.

    The ``cursor`` query parameter selects the page. A string response is
    returned as HTML, anything else as JSON.
    """

    client = from_django_conf()
    cursor = request.GET.get('cursor')
    payload = client.associations(doctype, cursor)
    if not isinstance(payload, str):
        return HttpResponse(json.dumps(payload),
                            content_type='application/json')
    return HttpResponse(payload, content_type='text/html')
Ejemplo n.º 19
0
def association(request, doctype=None):
    """
    Proxies requests for lists of associations to Superfastmatch.

    Paging is driven by the ``cursor`` query parameter. String responses
    are served as HTML; all other responses are serialized as JSON.
    """

    sfm = from_django_conf()
    result = sfm.associations(doctype, request.GET.get('cursor'))

    if isinstance(result, str):
        body, content_type = result, 'text/html'
    else:
        body, content_type = json.dumps(result), 'application/json'
    return HttpResponse(body, content_type=content_type)
Ejemplo n.º 20
0
def document_list(request, doctype=None):
    """
    Proxies requests for lists of documents to Superfastmatch.

    Reads ``cursor``, ``order_by`` (default 'docid') and ``limit``
    (default '100') from the query string. A string response is returned
    as HTML, anything else as JSON.
    """

    sfm = from_django_conf()
    query = request.GET
    listing = sfm.documents(doctype,
                            page=query.get('cursor'),
                            order_by=query.get('order_by', 'docid'),
                            limit=query.get('limit', '100'))
    if not isinstance(listing, str):
        return HttpResponse(json.dumps(listing),
                            content_type='application/json')
    return HttpResponse(listing, content_type='text/html')
Ejemplo n.º 21
0
def search_against_url(request, url):
    """
    Accepts a URL as either a suffix of the URI or a POST request
    parameter. Downloads the content, feeds it through the
    readability article grabber, then submits the article text
    to superfastmatch for comparison.

    Non-http(s) URLs and download timeouts render the search page with an
    error message. A NOT_FOUND error from superfastmatch yields a 404
    response; other client errors are shown when DEBUG is on and re-raised
    otherwise.
    """

    scheme = urlparse(url)[0]
    if scheme not in ('http', 'https'):
        return search_page(
            request, error='The URL must begin with either http or https.')

    sfm = from_django_conf('sidebyside')
    try:
        (title, text) = fetch_and_clean(url)
    except requests.exceptions.Timeout:
        return search_page(
            request, error="Sorry, that news article couldn't be retrieved.")

    try:
        sfm_results = sfm.search(text=text, title=title, url=url)
        drop_silly_results(sfm_results)
        sort_by_coverage(sfm_results)

        # If they submit a url, don't return the exact same url in the
        # results. The list is rebuilt in place: removing items while
        # iterating over it skipped the row after each removed one.
        rows = sfm_results['documents']['rows']
        rows[:] = [r for r in rows if r.get('url') != url]

        text = sfm_results.get('text', '')
        title = sfm_results.get('title', 'No Title')

        return search_result_page(request,
                                  sfm_results,
                                  text,
                                  source_title=title,
                                  source_url=url)
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            # An HttpResponse is not an exception and cannot be raised;
            # return it with a 404 status instead.
            return HttpResponse('No such article {0}'.format(url),
                                status=httplib.NOT_FOUND)
        elif settings.DEBUG == True:
            return HttpResponse(e.response[1], status=e.response[0])
        else:
            raise
Ejemplo n.º 22
0
def search_against_uuid(request, uuid):
    """Render the results of the stored search identified by ``uuid``.

    A NOT_FOUND client error is logged and turned into an Http404. Other
    client errors are shown as a response when DEBUG is on and re-raised
    otherwise.
    """
    client = from_django_conf('sidebyside')
    try:
        results = client.search(text=None, uuid=uuid)
        drop_silly_results(results)
        sort_by_coverage(results)
        source_text = results.get('text')
        source_title = results.get('title')
        return search_result_page(request, results,
                                  source_text=source_text,
                                  source_title=source_title)
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            message = u'Error communicating with the superfastmatch server: {}'
            logging.critical(message.format(unicode(e)))
            raise Http404('No such article {0}'.format(uuid))
        if settings.DEBUG == True:
            return HttpResponse(e.response[1], status=e.response[0])
        raise
Ejemplo n.º 23
0
def document_list(request, doctype=None):
    """
    Proxies requests for lists of documents to Superfastmatch.

    Query parameters: ``cursor`` (page), ``order_by`` (default 'docid')
    and ``limit`` (default '100'). String responses are served as HTML,
    everything else as JSON.
    """

    sfm = from_django_conf()
    listing_args = {
        'page': request.GET.get('cursor'),
        'order_by': request.GET.get('order_by', 'docid'),
        'limit': request.GET.get('limit', '100'),
    }
    result = sfm.documents(doctype, **listing_args)

    if isinstance(result, str):
        body, content_type = result, 'text/html'
    else:
        body, content_type = json.dumps(result), 'application/json'
    return HttpResponse(body, content_type=content_type)
Ejemplo n.º 24
0
def search_against_url(request, url):
    """
    Accepts a URL as either a suffix of the URI or a POST request
    parameter. Downloads the content, feeds it through the
    readability article grabber, then submits the article text
    to superfastmatch for comparison.

    Non-http(s) URLs and download timeouts render the search page with an
    error message. A NOT_FOUND error from superfastmatch yields a 404
    response; other client errors are shown when DEBUG is on and re-raised
    otherwise.
    """

    scheme = urlparse(url)[0]
    if scheme not in ('http', 'https'):
        return search_page(request, error='The URL must begin with either http or https.')

    sfm = from_django_conf('sidebyside')
    try:
        (title, text) = fetch_and_clean(url)
    except requests.exceptions.Timeout:
        return search_page(request, error="Sorry, that news article couldn't be retrieved.")

    try:
        sfm_results = sfm.search(text=text, title=title, url=url)
        drop_silly_results(sfm_results)
        sort_by_coverage(sfm_results)

        # If they submit a url, don't return the exact same url in the
        # results. The list is rebuilt in place: removing items while
        # iterating over it skipped the row after each removed one.
        rows = sfm_results['documents']['rows']
        rows[:] = [r for r in rows if r.get('url') != url]

        text = sfm_results.get('text', '')
        title = sfm_results.get('title', 'No Title')

        return search_result_page(request, sfm_results, text,
                                  source_title=title, source_url=url)
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            # An HttpResponse is not an exception and cannot be raised;
            # return it with a 404 status instead.
            return HttpResponse('No such article {0}'.format(url),
                                status=httplib.NOT_FOUND)
        elif settings.DEBUG == True:
            return HttpResponse(e.response[1], status=e.response[0])
        else:
            raise
Ejemplo n.º 25
0
    def handle(self, *args, **options):
        """Re-download the body of each release whose URL is given in args.

        Raises CommandError when SUPERFASTMATCH is not configured.
        Arguments that are not http(s) URLs are skipped with a warning.
        Any failure for one URL is logged and the remaining URLs are still
        processed.
        """
        if not hasattr(settings, "SUPERFASTMATCH"):
            raise CommandError("You must configure SUPERFASTMATCH in your project settings.")

        self.sfm = from_django_conf()

        for url in args:
            try:
                if not url.startswith(("http://", "https://")):
                    # Report the argument itself: no release has been looked
                    # up on this path, so `release` is unbound or left over
                    # from a previous iteration.
                    logging.warning("Skipping non-HTTP link {0}".format(url))
                    continue

                release = Release.objects.get(url=url)
                body = get_link_content(release.url)
                release.title = kill_control_characters(release.title)
                release.body = body
                release.updated = now()
                release.save()
                logging.info("Updated release {0}: {1}".format(release.id, release.url))
            except Exception as e:
                # Deliberately broad: one bad URL must not abort the batch.
                logging.error("Failed to rescrape {0}: {1}".format(url, str(e)))
Ejemplo n.º 26
0
def search_against_uuid(request, uuid):
    """Show the results page for the stored search ``uuid``.

    NOT_FOUND from the client is logged and raised as Http404. For other
    client errors the server's response is returned when DEBUG is on;
    otherwise the error is re-raised.
    """
    server = from_django_conf('sidebyside')
    try:
        found = server.search(text=None, uuid=uuid)
        for refine in (drop_silly_results, sort_by_coverage):
            refine(found)
        return search_result_page(request,
                                  found,
                                  source_text=found.get('text'),
                                  source_title=found.get('title'))
    except superfastmatch.SuperFastMatchError as e:
        not_found = (e.status == httplib.NOT_FOUND)
        if not_found:
            detail = unicode(e)
            logging.critical(
                u'Error communicating with the superfastmatch server: {}'.format(detail))
            raise Http404('No such article {0}'.format(uuid))
        if not (settings.DEBUG == True):
            raise
        return HttpResponse(e.response[1], status=e.response[0])
Ejemplo n.º 27
0
    def handle(self, *args, **options):
        """List each configured server with its URL and document count.

        Every entry of ``settings.SUPERFASTMATCH`` is visited. Federated
        clients are expanded into their member clients. Client and socket
        errors are printed and the listing continues.
        """
        for (key, _cfg) in settings.SUPERFASTMATCH.iteritems():
            sfm = from_django_conf(key)
            try:
                is_federated = isinstance(
                    sfm, superfastmatch.federated.FederatedClient)
                # Single-argument print(...) parses as an ordinary Python 2
                # print statement, so the output is unchanged.
                if is_federated:
                    print("{0} (federated)".format(key))
                    members = sfm.clients()
                    for (doctypes, client) in members.iteritems():
                        readable = doctypes.replace(':', ', ')
                        print("  doctypes: {0}".format(readable))
                        print("    url: {0}".format(client.url))
                        # Only 'total' is read, so one row is requested.
                        documents = client.documents(doctype=doctypes,
                                                     order_by='docid',
                                                     limit=1)
                        print("    documents: {0}".format(documents['total']))
                else:
                    print("{0}".format(key))
                    print('  url: {0}'.format(sfm.url))

                    documents = sfm.documents()
                    if not (documents['success'] == True):
                        print('  Unable to query for documents.')
                    else:
                        print('  documents: {0}'.format(documents['total']))
            except (superfastmatch.SuperFastMatchError, socket.error) as e:
                print('  Unable to query for documents: {0}'.format(str(e)))
Ejemplo n.º 28
0
    def handle(self, sample_size, *args, **options):
        """Check a random sample of releases and log the error rate.

        ``sample_size`` must be convertible to an integer, otherwise
        CommandError is raised. The summary is logged at error level when
        any errors were collected in ``self.errors`` and at info level
        otherwise.
        """
        logging.basicConfig(level=getattr(logging, options['loglevel'].upper()))
        self.errors = set()

        try:
            sample_size = int(sample_size)
        except ValueError:
            raise CommandError("sample_size must be an integer.")

        self.sfm = from_django_conf()

        sample = random_sample(sample_size)
        for release in sample:
            self.check_release(release)

        sample_count = len(sample)
        error_count = len(self.errors)
        # Force float division: on Python 2, int / int truncates, which
        # reported a rate of 0 for anything below 100%. An empty sample is
        # reported as 0.0 instead of raising ZeroDivisionError.
        if sample_count:
            error_rate = round(float(error_count) / sample_count, 2)
        else:
            error_rate = 0.0

        log_fn = logging.error if error_count > 0 else logging.info
        log_fn(repr({
            'Sample size': sample_count,
            'Errors': error_count,
            'Error rate': error_rate
        }))
Ejemplo n.º 29
0
def permalink(request, uuid, doctype, docid):
    """Render a stored search focused on one matched document.

    Raises Http404 when the search has no result rows or the client
    reports NOT_FOUND. Redirects to the uuid search page when no row
    matches ``doctype`` and ``docid``. If the matching row has no text it
    is filled from the MatchedDocument table, falling back to a document
    lookup on the server.
    """
    sfm = from_django_conf('sidebyside')
    try:
        sfm_results = sfm.search(text=None, uuid=uuid)
        drop_silly_results(sfm_results)
        if len(sfm_results['documents']['rows']) == 0:
            raise Http404('No such article {0}'.format(uuid))

        sort_by_coverage(sfm_results)

        try:
            matching_row = [
                r for r in sfm_results['documents']['rows']
                if r['doctype'] == int(doctype) and r['docid'] == int(docid)
            ][0]
        except IndexError:
            return redirect('sidebyside-uuid-search', uuid=uuid)

        if not matching_row.get('text'):
            try:
                md = MatchedDocument.objects.get(doc_type=doctype,
                                                 doc_id=docid)
                matching_row['text'] = md.text
            except MatchedDocument.DoesNotExist:
                doc = sfm.document(doctype, docid)
                if doc:
                    # A truthy result is not necessarily a document: a
                    # failed lookup may lack 'text' or not be a mapping at
                    # all. Leave the row without text in that case rather
                    # than failing the whole page.
                    try:
                        matching_row['text'] = doc['text']
                    except (KeyError, TypeError):
                        pass

        return search_result_page(request,
                                  sfm_results,
                                  source_text=sfm_results['text'],
                                  source_title=sfm_results.get('title'),
                                  source_url=sfm_results.get('url'))
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            raise Http404('No such article {0}'.format(uuid))
        else:
            raise
Ejemplo n.º 30
0
def record_matches(doc, response, update_matches=False):
    """Store a Match record for every usable row of a search response.

    Rows without a URL, or whose URL equals ``doc.url``, are skipped. Each
    remaining row gets a MatchedDocument and a Match, created on demand and
    refreshed when newly created or when ``update_matches`` is true. The
    id of the Match is written back to the row as 'match_id'.
    """
    client = from_django_conf()
    for row in response['documents']['rows']:
        # We don't want to record a match for a document we don't have a
        # URL for because we cannot provide a link back to the original.
        # Rows pointing at the searched document itself are skipped too.
        if 'url' not in row or row['url'] == doc.url:
            continue

        (md, md_created) = MatchedDocument.objects.get_or_create(
            doc_id=row['docid'], doc_type=row['doctype'])
        if md_created or update_matches:
            fetched = client.document(row['doctype'], row['docid'])
            if fetched['success'] == False:
                # If we can't fetch the text, the site probably won't be
                # able to either, so just ignore this result row.
                continue
            md.text = fetched['text']
            md.source_url = row['url']
            md.source_name = row.get('source')
            md.source_headline = row['title']
            md.save()

        (match, match_created) = Match.objects.get_or_create(
            search_document=doc, matched_document=md)
        if match_created or update_matches:
            stats = calculate_coverage(doc.text, row)
            match.percent_churned = str(stats[1])
            match.overlapping_characters = stats[0]
            density = row.get('density')
            if density:
                match.fragment_density = Decimal(str(density))
            else:
                match.fragment_density = None
            # Serialized inside the loop: earlier rows have already been
            # given a 'match_id', so the output differs per iteration.
            match.response = json.dumps(response)
        match.number_matches += 1
        match.save()

        row['match_id'] = match.id
Ejemplo n.º 31
0
def record_matches(doc, response, update_matches=False):
    """Persist the matches found in a search response for ``doc``.

    For each row that has a URL different from ``doc.url`` a
    MatchedDocument and a Match are fetched or created. Both are refreshed
    when just created or when ``update_matches`` is set. The Match counter
    is incremented and the Match id is stored on the row as 'match_id'.
    """
    sfm = from_django_conf()
    for result in response['documents']['rows']:
        if 'url' not in result:
            # We don't want to record a match for a document we don't have
            # a URL for because we cannot provide a link back to the
            # original.
            continue
        if result['url'] == doc.url:
            continue

        doctype = result['doctype']
        docid = result['docid']

        (md, created) = MatchedDocument.objects.get_or_create(doc_id=docid,
                                                              doc_type=doctype)
        refresh_document = created or update_matches
        if refresh_document:
            sfm_doc = sfm.document(doctype, docid)
            if sfm_doc['success'] == False:
                # If we can't fetch the text, the site probably won't be
                # able to either, so just ignore this result row.
                continue
            md.text = sfm_doc['text']
            md.source_url = result['url']
            md.source_name = result.get('source')
            md.source_headline = result['title']
            md.save()

        (match, created) = Match.objects.get_or_create(search_document=doc,
                                                       matched_document=md)
        refresh_match = created or update_matches
        if refresh_match:
            coverage = calculate_coverage(doc.text, result)
            match.percent_churned = str(coverage[1])
            match.overlapping_characters = coverage[0]
            density = result.get('density')
            match.fragment_density = None if not density else Decimal(str(density))
            # The response is re-serialized per row because rows handled
            # earlier in this loop have gained a 'match_id'.
            match.response = json.dumps(response)
        match.number_matches += 1
        match.save()

        result['match_id'] = match.id
Ejemplo n.º 32
0
    def handle(self, *args, **options):
        """Scrape sources of type 2, optionally limited to one source.

        Raises CommandError when SUPERFASTMATCH or DEFAULT_DOCTYPE is not
        configured. A single positional argument selects one source by feed
        URL (http/https) or by integer id. A source is scraped when it is
        stale or when ``options['including_stale']`` is set. Failures are
        stored as SourceScrapeFailure records, with a traceback for
        unexpected exceptions.
        """
        required_settings = (
            ('SUPERFASTMATCH',
             'You must configure SUPERFASTMATCH in your project settings.'),
            ('DEFAULT_DOCTYPE',
             'You must specify a DEFAULT_DOCTYPE in your project settings.'),
        )
        for (setting_name, message) in required_settings:
            if not hasattr(settings, setting_name):
                raise CommandError(message)

        self.sfm = from_django_conf()

        sources = Source.objects.filter(source_type=2)
        if len(args) == 1:
            selector = args[0]
            if selector.startswith(('http://', 'https://')):
                sources = sources.filter(url=selector)
            else:
                try:
                    sources = sources.filter(id=int(selector))
                except ValueError:
                    raise CommandError("Arguments must be source IDs or feed URLs")

        for source in sources:
            try:
                if not (source.is_stale() or options['including_stale']):
                    continue

                self.scrape_releases(source)
                source.last_retrieved = now()
                source.last_failure = None
                source.save()

            except SourceScrapeFailure as failure:
                failure.save()

            except Exception as e:
                # Capture the traceback text so it is stored with the record.
                trace = StringIO()
                print_exc(1000, trace)
                SourceScrapeFailure.objects.create(source=source,
                                                   traceback=trace.getvalue(),
                                                   description=unicode(e))
Ejemplo n.º 33
0
def permalink(request, uuid, doctype, docid):
    """Render a stored search focused on one matched document.

    Raises Http404 when the search has no result rows or the client
    reports NOT_FOUND. Redirects to the uuid search page when no row
    matches ``doctype`` and ``docid``. If the matching row has no text it
    is filled from the MatchedDocument table, falling back to a document
    lookup on the server.
    """
    sfm = from_django_conf('sidebyside')
    try:
        sfm_results = sfm.search(text=None, uuid=uuid)
        drop_silly_results(sfm_results)
        if len(sfm_results['documents']['rows']) == 0:
            raise Http404('No such article {0}'.format(uuid))

        sort_by_coverage(sfm_results)

        try:
            matching_row = [r
                            for r in sfm_results['documents']['rows']
                            if r['doctype'] == int(doctype)
                            and r['docid'] == int(docid)][0]
        except IndexError:
            return redirect('sidebyside-uuid-search', uuid=uuid)

        if not matching_row.get('text'):
            try:
                md = MatchedDocument.objects.get(doc_type=doctype, doc_id=docid)
                matching_row['text'] = md.text
            except MatchedDocument.DoesNotExist:
                doc = sfm.document(doctype, docid)
                if doc:
                    # A truthy result is not necessarily a document: a
                    # failed lookup may lack 'text' or not be a mapping at
                    # all. Leave the row without text in that case rather
                    # than failing the whole page.
                    try:
                        matching_row['text'] = doc['text']
                    except (KeyError, TypeError):
                        pass

        return search_result_page(request, sfm_results,
                                  source_text=sfm_results['text'],
                                  source_title=sfm_results.get('title'),
                                  source_url=sfm_results.get('url'))
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            raise Http404('No such article {0}'.format(uuid))
        else:
            raise
Ejemplo n.º 34
0
def search_against_text(request, text):
    """Search the 'sidebyside' server for ``text`` and render the results."""
    client = from_django_conf('sidebyside')
    results = client.search(text)
    # Both helpers are called for their effect on ``results``; their
    # return values are not used.
    for refine in (drop_silly_results, sort_by_coverage):
        refine(results)
    return search_result_page(request, results, text)
Ejemplo n.º 35
0
def search_against_text(request, text):
    """Render the result page for a free-text search on 'sidebyside'."""
    matches = from_django_conf('sidebyside').search(text)
    # Filter first, then order; neither helper's return value is used.
    drop_silly_results(matches)
    sort_by_coverage(matches)
    page = search_result_page(request, matches, text)
    return page
Ejemplo n.º 36
0
    def handle(self, server, inpath, *args, **options):
        """Restore the contents of ``inpath`` into the named server.

        Raises CommandError when ``inpath`` does not exist. The 'doctypes'
        and 'dryrun' options are passed through to ``restore``.
        """
        source_exists = os.path.exists(inpath)
        if not source_exists:
            raise CommandError("No such file: {0}".format(inpath))

        client = from_django_conf(server)
        mapping = options.get('doctypes')
        dry_run = options.get('dryrun')
        restore(client, inpath, doctype_mappingstr=mapping, dryrun=dry_run)