Example #1
0
def insert_citation(review_id, ref_id, citation_d):
    """Create and persist a Citation row for the given review.

    Args:
        review_id: id of the project/review the citation belongs to.
        ref_id: RefMan identifier; stored as-is on the citation.
        citation_d: dict with keys 'pmid', 'title', 'abstract',
            'authors' (iterable of strings), 'keywords' (iterable of
            strings) and 'journal'.

    Returns:
        The database id of the newly committed citation.
    """
    # Schema limit for the abstract column; keep in sync with the DB.
    MAX_ABSTRACT_LENGTH = 9980

    citation = model.Citation()
    citation.project_id = review_id
    citation.refman = ref_id

    # Missing or empty PMIDs are stored as 0 rather than NULL.
    pmid = citation_d['pmid']
    citation.pmid = pmid if (pmid is not None and pmid != '') else 0

    # Truncate over-long text fields so they fit their columns.
    title = citation_d['title']
    citation.title = title[:MAX_TITLE_LENGTH] if title is not None \
        else "(no title found)"
    abstract = citation_d['abstract']
    citation.abstract = abstract[:MAX_ABSTRACT_LENGTH] if abstract is not None \
        else None

    citation.authors = " and ".join(citation_d['authors'])
    citation.keywords = ','.join(citation_d['keywords'])
    citation.journal = citation_d['journal']

    model.Session.add(citation)
    model.Session.commit()

    return citation.id
Example #2
0
    def test_delete_citation(self):
        """ Deleting Citation entry cascades

        Verify that entries in CitationTask table are destroyed when
        corresponding citation and/or task is deleted

        """

        # Create citation and task objects
        c1 = model.Citation()
        c2 = model.Citation()
        t1 = model.Task()
        t2 = model.Task()

        # Append tasks to citation, incidentally this verifies that
        # the relationship are properly set.
        c1.tasks.append(t1)
        c1.tasks.append(t2)
        c2.tasks.append(t2)

        # Persist the changes.
        Session.add(c1)
        Session.add(c2)
        Session.commit()

        # Verify first that the entries in table CitationTask actually exist
        assert len(Session.query(model.citations_tasks_table).all()) == 3

        # Finally remove one of the citations and check cascade
        Session.delete(c1)
        Session.commit()
        assert len(Session.query(model.citations_tasks_table).all()) == 1

        # Do the same for when the task is removed
        Session.delete(t2)
        # Fix: the delete above was never committed, so the final assertion
        # relied on autoflush at best. Commit explicitly, mirroring the
        # citation-delete case above, for a deterministic check.
        Session.commit()
        assert len(Session.query(model.citations_tasks_table).all()) == 0
def _create_reviews(p_id, iter_size, which_iter):
    """Run a batch of labeling-simulation experiments against project `p_id`.

    For each experiment index in [iter_size * which_iter,
    iter_size * which_iter + iter_size), this clones every citation of
    project `p_id` into a brand-new Project, seeds it with a random
    initial labeled sample of k_init citations (copying the originals'
    labels), then loops: run make_predictions(), export the ranked
    predictions to a CSV under "_exports", and "label" the next k_inc
    top-ranked unlabeled citations by copying labels from the source
    project — until every citation is labeled.

    WARNING: heavy, partly destructive DB side effects. On the very
    first call (guarded by a sentinel lock file) it DELETES every
    Citation and Label that does NOT belong to `p_id`. Python 2 code
    (print statements).
    """
    # One-shot destructive cleanup: guarded by a lock file created next
    # to this module so it only ever runs once.
    lock_file_path = join(dirname(abspath(__file__)), '_delete_lock.lck')

    if not isfile(lock_file_path):
        Session.query(
            model.Citation).filter(model.Citation.project_id != p_id).delete()
        Session.query(
            model.Label).filter(model.Label.project_id != p_id).delete()
        Session.commit()
        open(lock_file_path, 'w+').close()

    # Experiment constants: hard-coded user id to own the cloned reviews,
    # size of the initial random labeled sample, and per-round increment.
    u_id = 2629
    k_init = 400
    c_count = len(
        Session.query(model.Citation).filter_by(project_id=p_id).all())
    k_inc = 100

    for itercount in range(iter_size * which_iter,
                           iter_size * which_iter + iter_size):
        ### THIS is the code for one run of the experiment

        ## labeled citation counter
        labeled_citation_counter = 0

        # `labels` is fetched but the per-citation queries below are what
        # actually populate r_sample.
        labels = Session.query(model.Label).filter_by(project_id=p_id).all()
        user = Session.query(model.User).filter_by(id=u_id).first()
        citations = Session.query(
            model.Citation).filter_by(project_id=p_id).all()
        print len(citations)
        c_count = len(citations)
        # r_sample: original study_id -> list of Label rows for the
        # initially sampled citations.
        r_sample = defaultdict(list)

        # Draw the initial random sample of k_init source citations and
        # collect their labels.
        sample_indexes = sample(range(c_count), k_init)
        C_r = []
        for ii in sample_indexes:
            C_r.append(citations[ii])
        for cc in C_r:
            for ll in Session.query(model.Label).filter_by(
                    project_id=p_id).filter_by(study_id=cc.id).all():
                r_sample[ll.study_id].append(ll)

        # Fresh project to host this experiment's cloned citations.
        new_review = model.Project()
        new_review.leaders.append(user)
        new_review.initial_round_size = 0
        new_review.tag_privacy = True

        Session.add(new_review)
        # flush so new_review.id is assigned before cloning citations.
        Session.flush()

        # state_dict: new citation id -> 0 unlabeled, 1 labeled via the
        # initial/fallback random sample, 2 labeled via prediction ranking.
        state_dict = defaultdict(int)
        # citation_dict: new (cloned) citation id -> original citation id.
        citation_dict = {}

        # Clone every source citation into the new project; copy labels
        # for those in the initial sample.
        for c in citations:
            citation = model.Citation()
            citation.project_id = new_review.id
            citation.title = c.title
            citation.abstract = c.abstract
            citation.keywords = c.keywords
            citation.refman = c.refman
            model.Session.add(citation)
            # flush so citation.id is assigned for the mappings below.
            Session.flush()

            citation_dict[citation.id] = c.id

            if c.id in r_sample:
                labeled_citation_counter += 1
                state_dict[citation.id] = 1
                for t in r_sample[c.id]:
                    label = model.Label()
                    label.project_id = new_review.id
                    label.study_id = citation.id
                    label.label = t.label
                    model.Session.add(label)

        print new_review.id
        Session.commit()

        ## i is a counter for the current increment
        i = 0

        # Active-learning loop: predict, export, label the next batch,
        # until every cloned citation is labeled.
        while True:

            ## we want to change the increment size if there are a certain number of citations is labeled
            #if labeled_citation_counter > 15000:
            #    k_inc = 2000
            #elif labeled_citation_counter > 5000:
            #    k_inc = 1000
            #else:
            #    k_inc = 500

            r_sample = defaultdict(list)
            print "EXPERIMENT NO: " + str(itercount)
            make_predictions(new_review.id)

            ######################## here's where I record the results
            preds_for_review = Session.query(model.Prediction).filter(
                model.Prediction.project_id == new_review.id).all()
            # NOTE(review): filename pattern is (p_id, round i, experiment
            # itercount) — successive rounds of the same experiment get
            # distinct files.
            path_to_preds_out = os.path.join(
                "_exports",
                "predictions_%d_%d_of_%d.csv" % (p_id, i, itercount))
            with open(path_to_preds_out, 'w+') as fout:
                csv_out = csv.writer(fout)
                preds_file_headers = [
                    "citation_id", "refman", "title",
                    "predicted p of being relevant",
                    "'hard' screening prediction*", "state"
                ]
                csv_out.writerow(preds_file_headers)
                # Rank by predicted probability, most relevant first.
                sorted_preds = sorted(preds_for_review,
                                      key=lambda x: x.predicted_probability,
                                      reverse=True)

                for pred in sorted_preds:
                    citation = Session.query(model.Citation).filter(
                        model.Citation.id == pred.study_id).first()
                    #citation = self._get_citation_from_id(pred.study_id)
                    # ASCII-fold titles so csv.writer (Py2) can't choke.
                    citation_title = citation.title.encode('ascii', 'ignore')
                    row_str = [
                        citation.id, citation.refman, citation_title,
                        pred.predicted_probability, pred.prediction,
                        state_dict[citation.id]
                    ]
                    csv_out.writerow(row_str)
            ######################### ---------------------------

            i += 1
            if labeled_citation_counter >= c_count:
                break

            # P_a: the k_inc highest-ranked predictions whose citations
            # are still unlabeled (state 0).
            P_a = []
            for pa in Session.query(model.Prediction).filter_by(
                    project_id=new_review.id).order_by(
                        model.Prediction.predicted_probability.desc()).all():
                if state_dict[pa.study_id] == 0:
                    P_a.append(pa)
                    if len(P_a) == k_inc:
                        break

            if len(P_a) == 0:
                # Fallback: no usable predictions — take up to k_inc
                # still-unlabeled citations directly from the new project.
                print "~~~NO PREDS!!!"
                ccc = [
                    label
                    for label in Session.query(model.Citation.id).filter_by(
                        project_id=new_review.id).filter(
                            ~model.Citation.labels.any()).limit(k_inc)
                ]
                print len(ccc)
                for cc in ccc:
                    labeled_citation_counter += 1
                    state_dict[cc.id] = 1
                    # NOTE(review): unlike the branch below, this query has
                    # no project_id=p_id filter; if study ids collide across
                    # projects it could copy foreign labels — confirm.
                    for ll in Session.query(model.Label).filter_by(
                            study_id=citation_dict[cc.id]).all():
                        label = model.Label()
                        label.project_id = new_review.id
                        label.study_id = cc.id
                        label.label = ll.label
                        model.Session.add(label)
            else:
                # Normal path: copy the source project's labels for each
                # picked prediction's citation.
                for pp in P_a:
                    labeled_citation_counter += 1
                    state_dict[pp.study_id] = 2
                    for ll in Session.query(
                            model.Label).filter_by(project_id=p_id).filter_by(
                                study_id=citation_dict[pp.study_id]).all():
                        label = model.Label()
                        label.project_id = new_review.id
                        label.study_id = pp.study_id
                        label.label = ll.label
                        model.Session.add(label)
            Session.commit()

            print len(
                Session.query(
                    model.Label).filter_by(project_id=new_review.id).all())
    return