Example #1
    def _run_recipe(self, source_type, expected_results):
        """
        Each of the resources input test can follow this common recipe base

        @param source_type:          int   (cf. constants.py RESOURCETYPES)
        @param expected_results:    int[]   (number of docs for each sample corpora of this source)
        """
        source = get_resource(source_type)
        source_name = source["name"].split("[")[0].lower().strip().replace(" ", "_")

        self.test_name = ">>  "+ sys._getframe().f_code.co_name +"_"+str(source_name)+":"
        self.log.debug("\n" + self.test_name)

        for i, sample_file in enumerate(self.sample_files[source_name]):
            print("... sample_file:", sample_file)
            expected_ndocs = expected_results[i]
            name = "test_" + source_name + str(i)
            self.log.debug("\t- Checking creation of corpus %s" % name)
            self.corpus = self._create_corpus(name, source_type, sample_file)
            db_corpus = self._get_corpus(name)
            # corpus check
            self.assertEqual(self.corpus.name, db_corpus.name)
            self.log.debug("\t- Checking creation of resource type '%s'" % get_resource(source_type)["name"])
            self.assertEqual(self.corpus.resources()[0]["type"], db_corpus.resources()[0]["type"])
            self.log.debug("\t- Parsing and indexing corpus")
            parse(self.corpus)
            real_ndocs = self.__count_node_children__(self.corpus, "DOCUMENT")
            # print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
            self.assertEqual(real_ndocs, expected_ndocs)
            status = self.corpus.status()
            self.log.debug("\t- Extracting ngrams")
            extract_ngrams(self.corpus)
            # ngrams = self.__count_node_children__(self.corpus, "NGRAMS")
            status = self.corpus.status()
            self.assertTrue(status["complete"])
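A minimal sketch of a concrete test delegating to this recipe; RESOURCE_TYPE_CSV and the expected counts are illustrative assumptions, not values taken from the project's constants.py:

    def test_csv_corpus(self):
        # hypothetical: one expected doc count per sample file of this source
        self._run_recipe(RESOURCE_TYPE_CSV, expected_results=[3, 10])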
Example #2
def newCorpus(project, source, name=None, query=None):
    error = False

    if name is None:
        name = query

    if not isinstance(project, ProjectNode):
        error = "a valid project"
    if not isinstance(source, int) and not isinstance(source, str):
        error = "a valid source identifier: id or name"
    elif not isinstance(query, str):
        error = "a valid query"
    elif not isinstance(name, str):
        error = "a valid name"

    if error:
        raise NotebookError("Please provide %s." % error)

    resource = (get_resource(source) if isinstance(source, int)
                else get_resource_by_name(source))

    moissonneur_name = (get_moissonneur_name(resource) if resource
                        else source.lower())

    try:
        moissonneur = get_moissonneur(moissonneur_name)
    except ImportError:
        raise NotebookError("Invalid source identifier: %r" % source)

    return run_moissonneur(moissonneur, project, name, query)
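A minimal usage sketch; the project variable, the source name, and the query below are illustrative assumptions:

# hypothetical call: "pubmed" must match a registered resource name
corpus = newCorpus(my_project, source="pubmed",
                   query="organic food", name="organic-food-pubmed")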
Example #3
def query(request):
    '''get GlobalResults(): the number of results found for a query on the SCOAP source'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            # old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            # ids = crawlerbot.get_ids(query)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
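A minimal sketch of exercising this view with Django's test client; the URL path is an assumption to be adjusted to the project's urls.py:

from django.test import Client

client = Client()
# hypothetical route for the query view above
response = client.post("/scrapers/scoap/query/", {"query": "gravitational waves"})
print(response.content)  # e.g. {"results_nb": 42}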
Example #4
def ngramtable(request, project_id, corpus_id):
    '''
    Browse and modify all lists together.
       => maplist and mainlist terms in a table
          with groupings as '+' nodes
       => uses API GET batch of lists
       => uses API PUT/DEL for list modifications
       => uses frontend AJAX through Ngrams_dyna_charts_and_table.js
    # TODO refactor Ngrams_dyna_charts_and_table.js
    '''
    # corpus still necessary to find all lists
    corpus = cache.Node[corpus_id]

    # and the project just for project.id in corpusBannerTop
    project = cache.Node[project_id]

    # retrieve all corpora of this user for list import option
    # POSSIBILITY: could do same task in ajax "only if needed"
    #              (use api for that when merged)
    corpora_infos_q = (session.query(Node.id, Node.name).filter(
        Node.typename == "CORPUS").filter(Node.user_id == project.user_id))
    # .filter(Node.id != corpus_id)
    corpora_infos = corpora_infos_q.all()

    source_type = corpus.resources()[0]['type']
    # rendered page : terms.html
    return render(
        template_name='pages/corpora/terms.html',
        request=request,
        context={
            'debug': settings.DEBUG,
            'user': request.user,
            'date': datetime.now(),
            'project': project,
            'corpus': corpus,
            'resourcename': get_resource(source_type)['name'],
            'view': 'terms',

            # for the CSV import modal
            'importroute': "/api/ngramlists/import?onto_corpus=%i" % corpus.id,
            'corporainfos': corpora_infos,
            # user params
            'user_parameters': get_user_params(request.user),
            'languages': USER_LANG
        },
    )
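A sketch of the kind of URL pattern this view needs; the exact route is an assumption (the project's actual urls.py may differ):

from django.conf.urls import url

urlpatterns = [
    # hypothetical route: positional groups map to project_id, corpus_id
    url(r'^projects/(\d+)/corpora/(\d+)/terms/?$', ngramtable),
]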
Example #5
    def __init__(self, record):

        # the name of the corpus that will be built
        # in case of internal file parsing
        self.record       = record
        self.name         = record["corpus_name"]
        self.project_id   = record["project_id"]
        self.user_id      = record["user_id"]
        self.resource     = record["source"]
        self.type         = get_resource(self.resource)
        self.query        = record["query"]
        # format the sampling window
        self.n_last_years = 5
        self.YEAR         = date.today().year
        # not pretty, but the easy version: zero-padded month as a string
        self.MONTH        = str(date.today().month).zfill(2)

        self.MAX_RESULTS = QUERY_SIZE_N_MAX
        
        try:
            self.results_nb = int(record["count"])
        except KeyError:
            # doesn't exist yet
            self.results_nb = 0
        try:
            self.webEnv   = record["webEnv"]
            self.queryKey = record["queryKey"]
            self.retMax   = record["retMax"]
        except KeyError:
            # doesn't exist yet
            self.queryKey = None
            self.webEnv = None
            self.retMax = 1
        self.status = [None]
        self.path = "/tmp/results.txt"
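A minimal construction sketch; the class name is not shown in this snippet, so Harvester below is a placeholder, and every record value is an illustrative assumption:

# hypothetical record: all field values are assumptions for illustration
record = {
    "corpus_name": "my_pubmed_corpus",
    "project_id":  1,
    "user_id":     1,
    "source":      3,            # a RESOURCETYPES id, cf. constants.py
    "query":       "microbiome",
}
harvester = Harvester(record)    # placeholder name for the class defined above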
Example #6
def save(request, project_id):
    '''save'''
    # defaults so the JSON fallthrough at the end is always well-defined
    query = query_string = ""
    N = 0
    if request.method == "POST":

        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)
        # for next time
        # ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()
        # corpus node instantiation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scraping data",
                          "language_id": "en"
                      })

        # download_file
        crawler_bot = load_crawler(source)()
        # for now no way to force downloading X records

        # the long-running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"],
                            # name=source["name"],
                            path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
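A minimal sketch of driving this endpoint with Django's test client; the route path and project id are assumptions:

from django.test import Client

client = Client()
# hypothetical route for the save view above
response = client.post("/scrapers/scoap/save/1", {"query": "dark matter", "N": "100"})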
Example #7
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node).filter(Node.id == project_id).filter(
        Node.typename == 'PROJECT')).first()

    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"

        #N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))

        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string +
                "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scraping data",
                          "language_id": None
                      })

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # worker consuming the queue
            t.daemon = True  # thread dies when the main (only non-daemon) thread exits
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename is not False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
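A minimal sketch of fetching one page of the same ISTEX query directly with requests; the query value is an assumption, and the response fields reflect the ISTEX API's usual shape rather than anything shown in this snippet:

import requests

# hypothetical query; mirrors one paginated URL built by the loop above
url = ("http://api.istex.fr/document/?q=deep+learning"
       "&output=id,corpusName,title,genre,language,doi,host,"
       "publicationDate,abstract,author&from=0&size=50")
page = requests.get(url).json()
print(page.get("total"), "documents matched")  # 'total' is assumed present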