Ejemplo n.º 1
0
    def generate_abstracts(self, list_of_pmids):
        """
        Generates a dataset of abstracts with mutation annotations fetched from the tmTools REST API.
        Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"

        Responses are cached in the file 'cache.json' (relative to the current working directory),
        keyed by pmid, so a pmid that is already cached is not downloaded again. A pmid whose
        download does not succeed (non-success HTTP status, or a body that is not valid JSON) is
        skipped with a logged warning and is absent from the returned dataset. Network errors
        raised by ``requests`` (including timeouts) propagate to the caller; entries fetched
        before the error are still written to the cache.

        :param list_of_pmids: list of pmids given as strings (it is iterated twice)
        :return nalaf.structures.Dataset: dataset with one document per retrieved pmid; every
            document has a single part named 'abstract' whose annotations are built from the
            'denotations' of the tmTools response
        """
        # Local import so this method does not rely on a module-level logger being defined.
        import logging

        log = logging.getLogger(__name__)

        url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'
        cache_path = 'cache.json'
        request_timeout = 30  # seconds; without a timeout a stalled server blocks forever

        # Load previously downloaded annotations, if any. Keys coming from the JSON cache are
        # always strings, which is why pmids have to be passed as strings to hit the cache.
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                tm_var = json.load(f)
        except FileNotFoundError:
            tm_var = {}

        cache_dirty = False
        try:
            for pmid in list_of_pmids:
                if pmid in tm_var:  # already downloaded from tmTools
                    continue

                req = requests.get(url_tmvar.format(pmid), timeout=request_timeout)
                if not req.ok:
                    # Do not cache error payloads: they would be mistaken for annotations later.
                    log.warning('tmTools returned HTTP %s for pmid %s; skipping', req.status_code, pmid)
                    continue

                try:
                    tm_var[pmid] = req.json()
                except ValueError:
                    log.warning('tmTools response for pmid %s is not valid JSON; skipping', pmid)
                    continue

                cache_dirty = True
        finally:
            # Cache the tmVar annotations so we don't pull them every time. Done in `finally` so
            # that entries fetched before a network error are not lost, and only when something
            # new was actually downloaded.
            if cache_dirty:
                with open(cache_path, 'w', encoding='utf-8') as f:
                    json.dump(tm_var, f, indent=4)

        dataset = Dataset()
        for doc_id in list_of_pmids:
            if doc_id not in tm_var:
                continue

            doc = Document()
            text = tm_var[doc_id]['text']
            part = Part(text)

            annotations = []
            for deno in tm_var[doc_id]['denotations']:
                # Normalise both span ends to int once, so the offset and the slice agree even
                # if the service delivers the positions as strings.
                begin = int(deno['span']['begin'])
                end = int(deno['span']['end'])
                annotations.append(Entity(
                    class_id=self.mut_class_id,
                    offset=begin,
                    text=text[begin:end]))

            # NOTE: open question from the original author - should the annotations from tmVar
            # go to predicted_annotations or annotations?
            part.annotations = annotations
            doc.parts['abstract'] = part
            dataset.documents[doc_id] = doc

        return dataset
Ejemplo n.º 2
0
    def setUpClass(cls):
        """
        Populate ``cls.dataset1`` with a single document ('doc_1') that holds a single part
        ('part_1'), and fill that part's ``annotations`` and ``predicted_annotations`` lists.

        The entities are laid out as follows:
          * exact_*:   the very same Entity objects are placed in both lists
          * overlap_*: the ``_1`` entity goes to ``annotations``, the ``_2`` entity to
                       ``predicted_annotations``
          * missing_*: only in ``annotations``
          * spurious:  only in ``predicted_annotations``
        """
        def build_entity(class_id, offset, surface, subclass):
            # The subclass is not a constructor argument here; it is assigned after creation.
            entity = Entity(class_id, offset, surface)
            entity.subclass = subclass
            return entity

        # create a sample dataset1 (1) to test
        cls.dataset1 = Dataset()
        document = Document()
        part = Part('.... aaaa .... bbbb .... cccc .... dddd .... eeee .... ffff .... gggg .... hhhh .... jjjj')

        cls.dataset1.documents['doc_1'] = document
        document.parts['part_1'] = part

        # (name, class id, offset, text, subclass) - entities are created in this order
        specs = (
            ('exact_1', STUB_E_ID_1, 5, 'aaaa', 1),
            ('exact_2', STUB_E_ID_1, 55, 'ffff', 2),
            ('exact_3', STUB_E_ID_1, 75, 'hhhh', 2),
            ('overlap_1_1', STUB_E_ID_1, 25, 'cccc', 1),
            ('overlap_1_2', STUB_E_ID_1, 26, 'cc', 1),
            ('overlap_2_1', STUB_E_ID_1, 32, '.. ddd', 2),
            ('overlap_2_2', STUB_E_ID_1, 36, 'ddd ...', 2),
            ('overlap_3_1', STUB_E_ID_1, 65, 'gggg', 1),
            ('overlap_3_2', STUB_E_ID_1, 62, '.. gggg ..', 2),
            ('missing_1', 'e2', 45, 'eeee', 1),
            # NOTE(review): 'jjjj' starts at index 85 of the part text, not 84 - offset kept
            # as in the original fixture; confirm whether this is intentional.
            ('missing_2', 'e2', 84, 'jjjj', 1),
            ('spurious', 'e2', 15, 'bbbb', 1),
        )
        entities = {}
        for name, class_id, offset, surface, subclass in specs:
            entities[name] = build_entity(class_id, offset, surface, subclass)

        annotation_names = (
            'exact_1', 'exact_2', 'exact_3', 'overlap_1_1', 'overlap_2_1', 'overlap_3_1',
            'missing_1', 'missing_2',
        )
        predicted_names = (
            'exact_1', 'exact_2', 'exact_3', 'overlap_1_2', 'overlap_2_2', 'overlap_3_2',
            'spurious',
        )
        part.annotations = [entities[name] for name in annotation_names]
        part.predicted_annotations = [entities[name] for name in predicted_names]