Example #1
0
def query(request):
    '''Get the global results count (results_nb) for a query.'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            # scan_results() is expected to set crawlerbot.results_nb
            # (old raw way to get results_nb)
            crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
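
Both examples instantiate a crawler with load_crawler(source)(). The real crawler classes live elsewhere in the project; the sketch below only illustrates the interface these views appear to rely on (scan_results()/results_nb here, download()/path in Example #2). The class name and the values it assigns are hypothetical.

class SketchScoapCrawler:
    '''Illustrative stand-in for whatever load_crawler(source) returns.'''

    def __init__(self):
        self.results_nb = 0   # read by query() right after scan_results()
        self.path = None      # read by save() right after download()

    def scan_results(self, query):
        # a real crawler would hit the remote source and count the matching records
        self.results_nb = 42
        return self.results_nb

    def download(self, query):
        # a real crawler would fetch the records, write them to disk
        # and remember where it put them
        self.path = "/tmp/scoap_results.json"
        return self.path
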
Example #2
0
def save(request, project_id):
    '''Create a corpus for the query, download up to N matching records and schedule the indexing workflow.'''
    if request.method == "POST":

        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
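        # make sure the requesting user actually owns this project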
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()
        # corpus node instantiation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": "en"
                      })

        # download the records matching the query
        crawler_bot = load_crawler(source)()
        # for now there is no way to force downloading exactly N records

        # the long-running download command
        filename = crawler_bot.download(query)
        corpus.add_resource(
            type=source["type"],
            # name=source["name"],
            path=crawler_bot.path,
        )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

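        # run the parsing / extraction / hyperdata-indexing workflow on the new corpus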
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize the session after the interrupted transaction
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    # non-POST requests end up here: nothing was saved
    data = {"error": "request.method must be POST"}
    return JsonHttpResponse(data)
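
A hypothetical client-side flow against these two views could first ask query() how many results match, then post the chosen query and N to save(). The host and URL paths below are placeholders; only the POST parameters ("query", "N") and the "results_nb" key come from the views above.

import requests

BASE = "http://localhost:8000"  # placeholder host

# 1. ask the query view for the number of matching results
resp = requests.post(BASE + "/scrapers/scoap/query",      # placeholder path
                     data={"query": "higgs boson"})
results_nb = resp.json()["results_nb"]

# 2. ask the save view to build a corpus from up to N of those results
requests.post(BASE + "/projects/1/scrapers/scoap/save",   # placeholder path
              data={"query": "higgs boson", "N": min(results_nb, 100)})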