Example #1
    def create_corpus(self):
        # create a corpus node under the current project
        corpus = Node(name=self.query,
                      user_id=self.user_id,
                      parent_id=self.project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": self.type["default_language"],
                      })
        # attach one resource per downloaded file (fall back to self.path)
        paths = self.paths if len(self.paths) > 0 else [self.path]
        for path in paths:
            corpus.add_resource(type=self.type["type"],
                                name=self.type["name"],
                                path=path)
        session.add(corpus)
        session.commit()
        # the id is only available after the commit
        self.corpus_id = corpus.id
        # NB: `scheduled` wraps the task, so it takes the function first,
        #     then the arguments (cf. the other examples below)
        scheduled(parse_extract_indexhyperdata)(corpus.id)
        return corpus
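
All of these examples enqueue background work through a `scheduled` helper. A minimal sketch of the shape it is assumed to have (hypothetical reconstruction, not the project's actual code): `scheduled(f)` returns a wrapper that enqueues `f`, which is why the calls read `scheduled(f)(args)` rather than `scheduled(f(args))`.

def scheduled(func):
    # Hypothetical sketch: wrap `func` so that calling the wrapper enqueues
    # the work instead of running it inline.
    def delayed(*args, **kwargs):
        try:
            return func.apply_async(args=args, kwargs=kwargs)  # celery-style task
        except AttributeError:
            return func(*args, **kwargs)  # plain-function fallback: run inline
    return delayed
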
Example #2
    def post(self, request):
        """
        Merge the lists of a corpus with other lists from a CSV source
                                                 or from another corpus

        params in request.GET:
            onto_corpus:  the corpus whose lists are getting patched

        params in request.data:
            csvfile:      the csv file

        /!\ We assume we checked the file size client-side before upload
        """
        if not request.user.is_authenticated():
            return HttpResponse("Unauthorized", status=401)

        # the corpus with the target lists to be patched
        params = get_parameters(request)
        corpus_id = int(params.pop("onto_corpus"))
        corpus_node = cache.Node[corpus_id]

        if request.user.id != corpus_node.user_id:
            return HttpResponse("Unauthorized", status=401)

        # request also contains the file
        # csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
        #                                                 ----------------------
        csv_file = request.data['csvfile']

        csv_contents = csv_file.read().decode("UTF-8").split("\n")
        csv_file.close()
        del csv_file

        # import the csv
        log_msg = "Async generation"

        corpus_node_id = corpus_node.id
        scheduled(import_and_merge_ngramlists)(csv_contents,
                                               corpus_node_id,
                                               overwrite=bool(params.get('overwrite')))

        return JsonHttpResponse({
            'log': log_msg,
        }, 200)
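
A client-side sketch of calling this endpoint (the URL path is an assumption; the view reads `onto_corpus` from the query string and `csvfile` from the multipart body):

import requests

# Hypothetical endpoint path; adapt to the project's actual URLconf.
with open("lists.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/ngramlists/import?onto_corpus=14072",
        files={"csvfile": f},
        cookies={"sessionid": "<session cookie of an authenticated user>"},
    )
print(resp.status_code, resp.json())  # expected: 200 {'log': 'Async generation'}
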
Example #3
    def patch(self, request, corpusnode_id):
        """
        PATCH triggers recount of metrics for the specified corpus.

        ex PATCH http://localhost:8000/api/metrics/14072
                                                   -----
                                                 corpus_id
        """
        print("==> update metrics request on ", corpusnode_id)

        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        try:
            corpus = cache.Node[int(corpusnode_id)]
        except Exception:
            # invalid id or unknown node
            corpus = None

        if corpus is None:
            raise ValidationException("%s is not a valid corpus node id." %
                                      corpusnode_id)

        t_before = datetime.now()
        # =============
        scheduled(recount)(corpus.id)
        # =============
        t_after = datetime.now()

        # NB: with an async scheduler this measures only the time needed
        #     to enqueue the recount, not the recount itself
        return JsonHttpResponse({
            'corpus_id': corpusnode_id,
            'took': "%f s." % (t_after - t_before).total_seconds()
        })
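
The comment above notes that `@requires_auth` cannot be used because of the positional `self`. A sketch of a method-compatible variant that would remove the repeated boilerplate (hypothetical, not part of the original codebase):

from functools import wraps

from django.http import HttpResponse

def requires_auth_method(view):
    # Pass `self` through untouched so the decorator also works on
    # class-based view methods.
    @wraps(view)
    def wrapper(self, request, *args, **kwargs):
        if not request.user.is_authenticated():
            return HttpResponse('Unauthorized', status=401)
        return view(self, request, *args, **kwargs)
    return wrapper
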
Example #4
def save(request, project_id):
    '''save a query as a new corpus (SCOAP source)'''
    # defaults, also used by the JSON fallback response at the end
    query = query_string = ""
    N = 0

    if request.method == "POST":

        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()
        # corpus node instantiation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": "en"
                      })

        # download the file
        crawler_bot = load_crawler(source)()
        # for now there is no way to force downloading exactly N records

        # the long-running command
        # (the returned filename is unused; the resource path comes from crawler_bot.path)
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"],
                            # name=source["name"],
                            path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
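
`load_crawler(source)` returns a crawler class that the view instantiates and drives; only two members are relied on. A sketch of the minimal interface assumed (hypothetical):

class ExampleCrawler:
    """Minimal crawler interface assumed by the view above (hypothetical)."""
    path = None  # set by download(): local file holding the fetched records

    def download(self, query):
        # Fetch the records matching `query`, write them to a local file,
        # set self.path accordingly, and return the filename.
        raise NotImplementedError
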
Example #5
def project(request, project_id):

    # security check
    project = session.query(Node).filter(Node.id == project_id).first()
    user = cache.User[request.user.id]

    if project is None:
        raise Http404()
    if not user.owns(project):
        return HttpResponseForbidden()
    # end of security check

    # new corpus
    if request.method == 'POST':
        corpus = project.add_child(
            name=request.POST['name'],
            typename='CORPUS',
        )
        corpus.add_resource(
            type=int(request.POST['type']),
            path=upload(request.FILES['file']),
        )
        session.add(corpus)
        session.commit()

        # parse_extract: fileparsing -> ngram extraction -> lists
        scheduled(parse_extract_indexhyperdata)(corpus.id)
        wait = True
    else:
        wait = False

    # corpora within this project
    corpora = project.children('CORPUS', order=True).all()
    sourcename2corpora = defaultdict(list)
    for corpus in corpora:
        # we only consider the first resource of the corpus to determine its type
        resources = corpus.resources()
        if len(resources):
            resource = resources[0]
            #resource_type_name = RESOURCETYPES[resource['type']]['name']
            resource_type_name = get_resource(resource["type"])["name"]
        else:
            # fallback label so the corpus still shows up in the view
            resource_type_name = "(no resource)"
            print("(WARNING) PROJECT view: no listed resource")
        # add some data for the viewer
        corpus.count = corpus.children('DOCUMENT').count()
        status = corpus.status()
        if status is not None and not status['complete']:
            if not status['error']:
                corpus.status_message = '(in progress: %s, %d complete)' % (
                    status['action'].replace('_', ' '),
                    status['progress'],
                )
            else:
                corpus.status_message = '(aborted: "%s" after %i docs)' % (
                    status['error'][-1], status['progress'])
        else:
            corpus.status_message = ''
        # add
        sourcename2corpora[resource_type_name].append(corpus)
    # source & their respective counts
    total_documentscount = 0
    sourcename2documentscount = defaultdict(int)
    for sourcename, corpora in sourcename2corpora.items():
        sourcename = re.sub(r' \(.*$', '', sourcename)
        for corpus in corpora:
            count = corpus.children('DOCUMENT').count()
            sourcename2documentscount[sourcename] += count
            total_documentscount += count
    donut = [{'source': sourcename,
              'count': count,
              'part': round(count * 100.0 / total_documentscount, 1)
                      if total_documentscount else 0}
             for sourcename, count in sourcename2documentscount.items()]
    # response! (wait page while a workflow is running, else the project page)
    template = ('pages/projects/wait.html' if wait
                else 'pages/projects/project.html')
    return render(
        template_name=template,
        request=request,
        context={
            'form': NewCorpusForm,
            'user': request.user,
            'date': datetime.now(),
            'project': project,
            'donut': donut,
            'list_corpora': dict(sourcename2corpora),
            'whitelists': [],
            'blacklists': [],
            'cooclists': [],
            'number': len(corpora),
            'query_size': QUERY_SIZE_N_DEFAULT,
            # status refreshing params (when active workflows)
            'status_refresh_initial_interval': PROJECT_VIEW_REFRESH_INTERVAL,
            'status_refresh_max_attempts': PROJECT_VIEW_MAX_REFRESH_ATTEMPTS,
        },
    )
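
To make the donut computation above concrete, a toy run of the same formula (`part` is each source's share of all documents, in percent):

sourcename2documentscount = {"Pubmed": 30, "ISTEX": 70}
total_documentscount = sum(sourcename2documentscount.values())

donut = [{'source': sourcename,
          'count': count,
          'part': round(count * 100.0 / total_documentscount, 1)
                  if total_documentscount else 0}
         for sourcename, count in sourcename2documentscount.items()]
# -> [{'source': 'Pubmed', 'count': 30, 'part': 30.0},
#     {'source': 'ISTEX', 'count': 70, 'part': 70.0}]
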
Example #6
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node).filter(Node.id == project_id).filter(
        Node.typename == 'PROJECT')).first()

    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"

        #N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))

        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string +
                "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": None
                      })

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # worker consuming the url queue
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename is not False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
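
`tasks.chunks(range(N), pagesize)` above is assumed to slice the range into fixed-size pieces, each chunk starting at the next offset. A sketch of such a helper (hypothetical):

def chunks(seq, size):
    # Yield successive slices of `seq` holding at most `size` items each.
    seq = list(seq)
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# e.g. list(chunks(range(120), 50)) -> [0..49], [50..99], [100..119]
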
Example #7
def save(request, project_id):
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?

    project = session.query(Node).filter(Node.id == project_id).first()

    if project is None:
        raise Http404()


    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()


    if request.method == "POST":
        queries = request.POST["query"]
        name    = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        #    ===> no need to repeat N parameter like in testISTEX <===

        instancia  = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scraping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))


        # corpus node instantiation as a Django model
        corpus = project.add_child(name=name, typename="CORPUS")

        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # worker consuming the url queue
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0

        for filename in tasks.firstResults:
            print(filename)
            if filename is not False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=get_resource_by_name('Pubmed [XML]')["type"],
                                    path=filename,
                                    url=None)
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
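
Both download views drive the same queue/worker pattern inside `Scraper`: eight daemon threads consume a queue of urls and append each result (a filename, or False on failure) to `firstResults`. A minimal sketch of the assumed shape (hypothetical reconstruction):

import queue

class Scraper:
    def __init__(self):
        self.q = queue.Queue()   # urls still to fetch
        self.firstResults = []   # downloaded filenames, or False on failure

    def worker2(self):
        # Consume urls forever; the calling views daemonize these threads.
        while True:
            url = self.q.get()
            try:
                self.firstResults.append(self.fetch(url))
            finally:
                self.q.task_done()

    def fetch(self, url):
        # Download `url` to a local file and return its path (False on error).
        raise NotImplementedError
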
Example #8
def get_graph(request=None,
              corpus=None,
              field1='ngrams',
              field2='ngrams',
              mapList_id=None,
              groupList_id=None,
              cooc_id=None,
              type='node_link',
              start=None,
              end=None,
              distance='conditional',
              bridgeness=5,
              threshold=1,
              isMonopartite=True,
              saveOnly=True):
    '''
    get_graph: main steps:
    0) check the parameters

    get_graph :: GraphParameters -> Either (Dict Nodes Links) (Dict State Length)
        where type Length = Int

    get_graph first checks the parameters and returns either the graph data
    or a dict with a "state" key naming the failed constraint and an integer
    giving the size of the offending parameter
    (a message string could be added at that step to factor the error text here).

    1) compute_graph (see the function above)
    2) return the graph
    '''
    overwrite_node_contents = False
    # defaults for the saveOnly return payload (set when a new node is created)
    cooc_name = cooc_date = None

    # Case 1: the graph has already been computed
    if cooc_id is not None:
        print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
        node = session.query(Node).filter(Node.id == cooc_id).first()

        # Structure of the node: Node.hyperdata[distance][bridgeness]
        # All parameters (except distance and bridgeness)
        # are in Node.hyperdata["parameters"]

        # Check the distance of the graph
        if node.hyperdata.get(distance, None) is not None:
            graph = node.hyperdata[distance]

            # Check the bridgeness of the graph
            if graph.get(str(bridgeness), None) is not None:
                return graph[str(bridgeness)]

    # new graph: we give it an empty node with new id and status
    elif saveOnly:
        # NB: we do creation already here (instead of same in countCooccurrences)
        #     to guarantee a unique ref id to the saveOnly graph (async generation)
        new_node = corpus.add_child(typename="COOCCURRENCES",
                                    name="GRAPH (in corpus %s)" % corpus.id)

        session.add(new_node)
        session.commit()
        cooc_id = new_node.id
        cooc_name = new_node.name
        cooc_date = new_node.date
        # and the empty content will need redoing by countCooccurrences
        overwrite_node_contents = True
        print("GRAPH #%d ... Created new empty data node for saveOnly" %
              int(cooc_id))

    # Case 2: the graph has not been computed yet
    # First, check the parameters

    # Case of mapList not big enough
    # ==============================

    # if we do not have any mapList_id yet
    if mapList_id is None:
        mapList_id = session.query(
            Node.id).filter(Node.typename == "MAPLIST").first()[0]

    mapList_size = session.query(NodeNgram).filter(
        NodeNgram.node_id == mapList_id).count()

    if mapList_size < graph_constraints['mapList']:
        # Do not compute the graph if mapList is not big enough
        return {'state': "mapListError", "length": mapList_size}

    # Instantiate query for case of corpus not big enough
    # ===================================================
    corpus_size_query = (session.query(Node).filter(
        Node.typename == "DOCUMENT").filter(Node.parent_id == corpus.id))

    # Filter corpus by date if any start date
    # ---------------------------------------
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        corpus_size_query = (corpus_size_query.join(
            Start, Start.node_id == Node.id).filter(
                Start.key == 'publication_date').filter(
                    Start.value_utc >= date_start_utc))

    # Filter corpus by date if any end date
    # -------------------------------------
    if end is not None:
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)

        corpus_size_query = (corpus_size_query.join(
            End, End.node_id == Node.id).filter(
                End.key == 'publication_date').filter(
                    End.value_utc <= date_end_utc))

    # Finally test if the size of the corpora is big enough
    # --------------------------------
    corpus_size = corpus_size_query.count()

    # NB: when called from a view, saveOnly is presumably the raw GET
    #     parameter, hence the string comparison
    if saveOnly is not None and saveOnly == "True":
        scheduled(compute_graph)(corpus_id=corpus.id,
                                 cooc_id=cooc_id,
                                 # field1="ngrams", field2="ngrams",
                                 start=start,
                                 end=end,
                                 mapList_id=mapList_id,
                                 groupList_id=groupList_id,
                                 isMonopartite=True,
                                 threshold=threshold,
                                 distance=distance,
                                 bridgeness=bridgeness,
                                 save_on_db=True,
                                 reset=overwrite_node_contents
                                 # limit=size
                                 )

        return {
            "state": "saveOnly",
            "target_id": cooc_id,
            "target_name": cooc_name,
            "target_date": cooc_date
        }

    elif corpus_size > graph_constraints['corpusMax']:
        # Then compute the cooccurrences asynchronously with celery
        scheduled(compute_graph)(corpus_id=corpus.id,
                                 cooc_id=cooc_id,
                                 # field1="ngrams", field2="ngrams",
                                 start=start,
                                 end=end,
                                 mapList_id=mapList_id,
                                 groupList_id=groupList_id,
                                 isMonopartite=True,
                                 threshold=threshold,
                                 distance=distance,
                                 bridgeness=bridgeness,
                                 save_on_db=True,
                                 reset=overwrite_node_contents
                                 # limit=size
                                 )
        # Dict informing the user that the corpus maximum has been reached,
        # so the graph is computed asynchronously
        return {"state": "corpusMax", "length": corpus_size}

    elif corpus_size <= graph_constraints['corpusMin']:
        # Do not compute the graph if corpus is not big enough
        return {"state": "corpusMin", "length": corpus_size}

    else:
        # If the graph_constraints are ok, compute the graph live
        data = compute_graph(corpus_id=corpus.id,
                             cooc_id=cooc_id,
                             # field1="ngrams", field2="ngrams",
                             start=start,
                             end=end,
                             mapList_id=mapList_id,
                             groupList_id=groupList_id,
                             isMonopartite=True,
                             threshold=threshold,
                             distance=distance,
                             bridgeness=bridgeness,
                             save_on_db=True,
                             reset=overwrite_node_contents
                             # limit=size
                             )

    # case when 0 coocs are observed (usually because the maplist has too few ngrams)

    if len(data) == 0:
        print("GRAPH #   ... GET_GRAPH: 0 coocs in matrix")
        data = {'nodes': [], 'links': []}  # empty data

    return data
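
A usage sketch (parameter values are illustrative): ask for a conditional-distance graph computed live, and fall back on the state dict when a constraint is not met:

# Hypothetical call; `corpus` is a CORPUS Node as in the views above.
result = get_graph(corpus=corpus,
                   distance='conditional',
                   bridgeness=5,
                   threshold=1,
                   saveOnly=False)

if isinstance(result, dict) and 'state' in result:
    # a constraint failed (mapListError / corpusMin / ...) or the async path was taken
    print("not computed live:", result['state'])
else:
    print(len(result['nodes']), "nodes,", len(result['links']), "links")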