Example 1
    def stitchAllBlobs(slidelist, quiet=True, debug=False):
        t_start_stitching = time.time()
        printl('')
        for slide_num, slide in enumerate(slidelist[:-1]):
            # Skip the last slide, because pairings go from a lower slide to the upper slide, so the last slide is already processed with the second-to-last
            # i.e. blob2ds in the last slide are partners to the previous slide's blob2ds, and have no direct possible partners of their own
            t_start_stitching_this_slide = time.time()
            printl('Stitching %s blob2ds from slide #%s/%s to %s blob2ds from slide #%s/%s' % (len(slide.blob2dlist), slide_num + 1,
                len(slidelist), len(slidelist[slide_num+1].blob2dlist), str(slide_num + 2), len(slidelist)), end=' ')

            progress = ProgressBar(max_val=len(slide.blob2dlist), increments=20,
                                   symbol='.')  # Note actually more responsive to do based on blob than # of pixels, due to using only a subset to stitch
            for b_num, blob1 in enumerate(slide.blob2dlist):
                blob1 = Blob2d.get(blob1)
                if len(blob1.possible_partners) > 0:
                    if debug:
                        printl('  Starting on a new blob from bloblist:' + str(blob1) + ' which has:' + str(
                            len(blob1.possible_partners)) + ' possible partners')
                for b2_num, blob2 in enumerate(blob1.possible_partners):
                    blob2 = Blob2d.get(blob2)
                    if debug:
                        printl('   Comparing to blob2:' + str(blob2))
                    new_stitch = Pairing(blob1.id, blob2.id, 1.1, 36, quiet=quiet) # TODO use this to assign ids to pairings
                progress.update(b_num, set_val=True)

            if quiet and not debug:
                progress.finish()
                print_elapsed_time(t_start_stitching_this_slide, time.time(), prefix='took')
        print_elapsed_time(t_start_stitching, time.time(), prefix='Stitching all slides took', endline=False)
        printl(' total')
Example 2
 def followstitches(cursorblob, blob2dlist):
     """
     Recursive support function for get_stitched_partners
     :param cursorblob:
     :param blob2dlist:
     :param: cursorblob: The blob whose stitching is examined for connected blob2ds
     :param: blob2dlist: The accumulated list of a blob2ds which are connected directly or indirectly to the inital seed blob
     """
     if type(cursorblob) is int:
         cursorblob = Blob2d.get(cursorblob)
     if hasattr(cursorblob,
                'pairings') and len(cursorblob.pairings) != 0:
         if cursorblob not in blob2dlist:
             if hasattr(cursorblob,
                        'assignedto3d') and cursorblob.assignedto3d:
                 printl(
                     '====> DB Warning, adding a blob to list that has already been assigned: '
                     + str(cursorblob))
             cursorblob.assignedto3d = True
             blob2dlist.append(cursorblob)
             for pairing in cursorblob.pairings:
                 for blob in (pairing.lowerblob, pairing.upperblob):
                     followstitches(blob, blob2dlist)
     else:
         Blob2d.blobswithoutstitches += 1
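
A note on the example above: followstitches recurses once per connected blob2d, so a long chain of stitched blobs can hit Python's recursion limit (the save() example further below even prints a sys.setrecursionlimit hint when pickling fails for the same reason). A minimal iterative sketch of the same traversal, replacing recursion with an explicit stack:

def followstitches_iterative(seedblob, blob2dlist):
    # Hypothetical non-recursive variant of followstitches above
    stack = [seedblob]
    while stack:
        cursorblob = stack.pop()
        if type(cursorblob) is int:
            cursorblob = Blob2d.get(cursorblob)
        if getattr(cursorblob, 'pairings', None):
            if cursorblob not in blob2dlist:
                cursorblob.assignedto3d = True
                blob2dlist.append(cursorblob)
                for pairing in cursorblob.pairings:
                    # Push both endpoints of each pairing, as the recursive version does
                    stack.extend((pairing.lowerblob, pairing.upperblob))
        else:
            Blob2d.blobswithoutstitches += 1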
Example 3
    def extract_blob3ds(all_slides, stitched=True):
        printl('Extracting 3D blobs by combining 2D blobs into 3D', flush=True)
        blob3dlist = []
        if not stitched:
            warn(
                'Extracting blob3ds, and have been told that they haven\'t been stitched. This will be inaccurate'
            )
            printl(
                'Extracting blob3ds, and have been told that they haven\'t been stitched. This will be inaccurate'
            )  # DEBUG

        for slide_num, slide in enumerate(all_slides):
            for blob in slide.blob2dlist:
                if Blob2d.get(blob).b3did == -1:
                    if stitched:  # The much better option! ESPECIALLY for recursive_depth = 0
                        buf = list(Blob2d.get(blob).get_stitched_partners())  # old method
                        # buf = [Blob2d.get(b2d) for b2d in Blob2d.get(blob).getpartnerschain()] # IDEALLY could use this for both... for now, it doesn't work well
                    else:
                        buf = [
                            Blob2d.get(b2d)
                            for b2d in Blob2d.get(blob).getpartnerschain()
                        ]  # TODO setting partners needs filtering like stitching
                    if len(buf) != 0:
                        blob3dlist.append(Blob3d([b2d.id for b2d in buf]))
        return blob3dlist
Example 4
 def save_image(self, filename):
     from scipy import misc as scipy_misc
     array_rep = self.edge_to_array(buffer=0)
     img = scipy_misc.toimage(array_rep, cmin=0.0, cmax=255.0)
     savename = Config.FIGURES_DIR + filename
     printl('Saving Image of Blob2d as: ' + str(savename))
     img.save(savename)
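
A note on the example above: scipy.misc.toimage was deprecated in SciPy 1.0 and removed in SciPy 1.2, so this snippet only runs against an old SciPy. A minimal sketch of an equivalent conversion using Pillow directly, assuming array_rep is a 2-D numpy array (as edge_to_array suggests):

import numpy as np
from PIL import Image

def array_to_image(array_rep, cmin=0.0, cmax=255.0):
    # Clip to [cmin, cmax], rescale to 8-bit grayscale, and wrap as a PIL image,
    # mimicking the old scipy_misc.toimage(array, cmin=..., cmax=...) call
    scaled = (np.clip(array_rep, cmin, cmax) - cmin) * (255.0 / (cmax - cmin))
    return Image.fromarray(scaled.astype(np.uint8))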
Example 5
 def printdescendants(self, rdepth=0):
     pad = '-' * rdepth
     printl(pad + str(self))
     for child in self.children:
         b2d = Blob2d.all[child]
         b2d.printdescendants(rdepth=rdepth + 1)
Example 6
def main(conf, current_chunk, total_chunks):
    # doc_id         text,
    # sentence_index int,
    # sentence_text  text,
    # tokens         text[],
    # lemmas         text[],
    # pos_tags       text[],
    # ner_tags       text[],
    # doc_offsets    int[],
    # dep_types      text[],
    # dep_tokens     int[]
    printl('Loading sentence data, Chunk {} of {}'.format(current_chunk,
                                                          total_chunks))

    if conf['data_tgz']:
        article_list = glob.glob(os.path.join(conf['data_directory'], '*.tgz'))
    else:
        raise NotImplementedError("Can't handle nongzipped files yet")
    article_chunk = [a for a in article_list if a.endswith('_{}_combined.tgz'.format(current_chunk))]
    if len(article_chunk) < 1:
        printl('Sentence loader - Chunk {} - no file found'.format(current_chunk))
        sys.exit(1)
    elif len(article_chunk) > 1:
        printl('Sentence loader - Chunk {} - multiple files found: {} (importing anyway)'.format(current_chunk,
                                                                                                 str(article_chunk)))

    for article_archive in article_chunk:
        printl(article_archive)
        with tarfile.open(article_archive, "r:gz") as tar, tempfile.TemporaryDirectory() as td:
            
            # corenlp output:
            output_files = filter_files_from_tar(tar, 'output_files')

            if conf['parse_pubtator']:
                # pubtator output:
                pubtator_files = filter_files_from_tar(tar, 'pubtator')
            else:
                pubtator_files = []

            # extract pubtator and corenlp output files into tempdir
            tar.extractall(path=td, members=itertools.chain(output_files, pubtator_files))

            # glob/read through output_files and print file data
            output_filepaths = sorted(glob.glob(os.path.join(td,
                                                             '*',
                                                             'output_files',
                                                             '*')))
            if conf['parse_pubtator']:
                pubtator_filepaths = sorted(glob.glob(os.path.join(td,
                                                                   '*',
                                                                   'pubtator',
                                                                   '*')))
            else:
                # Pair each CoreNLP file with None so the zip below still works
                pubtator_filepaths = [None for _ in output_filepaths]

            for i, (fp, pubtator_fp) in enumerate(zip(output_filepaths, pubtator_filepaths)):
                parse_corenlp_output(conf, fp, pubtator_fp)
                if i % 1000 == 0:
                    printl('Processed file {} of chunk'.format(i))
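
filter_files_from_tar is a project helper whose body is not shown in these examples. A plausible stand-in, assuming it simply selects regular archive members whose path contains the given directory name:

def filter_files_from_tar(tar, subdir):
    # Hypothetical reconstruction: keep regular files under e.g. 'output_files/'
    return [member for member in tar.getmembers()
            if member.isfile() and subdir in member.name.split('/')]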
Example 7
    def set_shape_contexts(self, num_bins):
        """
        Uses the methods described here: https://www.cs.berkeley.edu/~malik/papers/BMP-shape.pdf
        to set a shape context histogram (with num_bins), for each edge pixel in the blob.
        Note that only the edge_pixels are used to determine the shape context,
        and that only edge points derive a context.

        num_bins is the number of bins in the histogram for each point
        :param num_bins:
        """
        # Note: the reference point for each pixel is the pixel itself
        # Note: angles are normally measured counter-clockwise from the +x axis;
        #  however the += 180 below, used to remove negative values,
        #  makes angles counter-clockwise from the NEGATIVE x-axis
        assert type(num_bins) is int and num_bins > 1
        ledgep = len(self.lowerpixels)
        uedgep = len(self.upperpixels)

        self.lower_context_bins = np.zeros((ledgep, num_bins))  # Each edge pixel has rows of num_bins each
        self.upper_context_bins = np.zeros((uedgep, num_bins))  # Each edge pixel has rows of num_bins each
        # First bin is 0 - (360 / num_bins) degrees
        for (pix_num, pixel) in enumerate(self.lowerpixels):
            pixel = Pixel.get(pixel)
            for (pix_num2, pixel2) in enumerate(self.lowerpixels):
                pixel2 = Pixel.get(pixel2)
                if pix_num != pix_num2:  # Only check against other pixels.
                    distance = math.sqrt(math.pow(pixel.x - pixel2.x, 2) + math.pow(pixel.y - pixel2.y, 2))
                    angle = math.degrees(
                        math.atan2(pixel2.y - pixel.y, pixel2.x - pixel.x))  # Note using atan2 handles the dy = 0 case
                    angle += 180
                    if not 0 <= angle <= 360:
                        printl('\n\n\n--ERROR: Angle=' + str(angle))
                    # Now need bin # and magnitude for histogram
                    bin_num = math.floor((angle / 360.) * (num_bins - 1))  # HACK PSOE from -1
                    value = math.log(distance, 10)
                    self.lower_context_bins[pix_num][bin_num] += value
        for (pix_num, pixel) in enumerate(self.upperpixels):
            pixel = Pixel.get(pixel)
            for (pix_num2, pixel2) in enumerate(self.upperpixels):
                pixel2 = Pixel.get(pixel2)
                if pix_num != pix_num2:  # Only check against other pixels.
                    distance = math.sqrt(math.pow(pixel.x - pixel2.x, 2) + math.pow(pixel.y - pixel2.y, 2))
                    angle = math.degrees(
                        math.atan2(pixel2.y - pixel.y, pixel2.x - pixel.x))  # Note using atan2 handles the dy = 0 case
                    angle += 180
                    if not 0 <= angle <= 360:
                        printl('\n\n\n--ERROR: Angle=' + str(angle))
                    # Now need bin # and magnitude for histogram
                    bin_num = math.floor((angle / 360.) * (num_bins - 1))  # HACK PSOE from -1
                    value = math.log(distance, 10)
                    self.upper_context_bins[pix_num][bin_num] += value
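
To make the binning above concrete, here is a standalone sketch of the contribution a single neighbor makes to a pixel's histogram, using the same formulas as the loops above, with one worked pair of points:

import math

def shape_context_contribution(p1, p2, num_bins=36):
    # Log-distance magnitude and angle bin for the vector from p1 to p2
    dx, dy = p2[0] - p1[0], p2[1] - p1[1]
    distance = math.hypot(dx, dy)
    angle = math.degrees(math.atan2(dy, dx)) + 180  # shift into (0, 360]
    bin_num = math.floor((angle / 360.) * (num_bins - 1))
    return bin_num, math.log(distance, 10)

# For p1=(0, 0), p2=(3, 4): distance = 5, angle = 53.13 + 180 = 233.13 degrees,
# bin_num = floor((233.13 / 360) * 35) = 22, value = log10(5) ~= 0.699
print(shape_context_contribution((0, 0), (3, 4)))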
Example 8
def parse_corenlp_output(conf, filepath, pubtator_file_path):

    if conf['fuzzy_ner_match']:
        fuzzy_ratio = conf['fuzzy_ratio']
    else:
        fuzzy_ratio = False

    nlp_parser = NLPParser(filepath,
                           pubtator_file_path,
                           fuzzy_ner_match=fuzzy_ratio)
    if pubtator_file_path:
        if not nlp_parser.update_ner_pubtator():
            printl('Unable to update generic NER with PubTator matches')
    for row in nlp_parser.get_biothing_tokens():
        print(row, flush=True)

    return None
Example 9
def main(conf, current_chunk, total_chunks):

    printl('Loading article data, Chunk {} of {}'.format(current_chunk,
                                                          total_chunks))

    if conf['data_tgz']:
        article_list = glob.glob(os.path.join(conf['data_directory'], '*.tgz'))
    else:
        raise NotImplementedError("Can't handle nongzipped files yet")
    article_chunk = [a for a in article_list if a.endswith('_{}_combined.tgz'.format(current_chunk))]
    if len(article_chunk) < 1:
        printl('Article loader - Chunk {} - no file found'.format(current_chunk))
        sys.exit(1)
    elif len(article_chunk) > 1:
        printl('Article loader - Chunk {} - multiple files found: {} (importing anyway)'.format(current_chunk,
                                                                                                str(article_chunk)))

    # with open('/home/ubuntu/sandip/bioshovel_biocreative_update/src/deepdive/test_set_pmids.txt') as f:
    #     test_set_pmids = set([line.rstrip('\n') for line in f.readlines()])

    with open(conf['train_dev_test_ids_json']) as f:
        train_dev_test_dict = json.load(f)

    for article_archive in article_chunk:
        printl(article_archive)
        with tarfile.open(article_archive, "r:gz") as tar, tempfile.TemporaryDirectory() as td:
            input_files = filter_files_from_tar(tar, 'input_files')

            # extract input_files into tempdir
            tar.extractall(path=td, members=input_files)

            # glob/read through input_files and print file data
            input_filepaths = glob.glob(os.path.join(td,
                                                     '*',
                                                     'input_files',
                                                     '*'))

            for i, filepath in enumerate(input_filepaths):
                print_article_info(Path(filepath), article_archive, train_dev_test_dict)
                if i % 500 == 0:
                    printl('Processed file {} of chunk'.format(i))
Example 10
    def validate_id(self, quiet=True):
        """
        Checks that a blob2d's id has not been used, and updates it's id if it has been used
        It then adds the blob to the Blob2d master dictionary 'all'
        :param quiet:
        :return:
        """
        def get_next_id():
            index = Blob2d.min_free_id
            while index < len(Blob2d.used_ids) and Blob2d.used_ids[index] == 1:
                index += 1
            if index == len(Blob2d.used_ids):
                Blob2d.used_ids.append(0)  # Expand by a single free slot (the resize branch below grows by 50 at a time)
            Blob2d.min_free_id = len(Blob2d.used_ids)
            return index

        if self.id >= len(Blob2d.used_ids):
            Blob2d.used_ids.resize([self.id + 50])  # NOTE can alter this value, for now expanding by 50, which will be filled with zeros
            Blob2d.used_ids[self.id] = 1  # 1 for used; no need to check whether the value has been used, as we are in a new range
            Blob2d.all[self.id] = self
        elif self.id < 0 or Blob2d.used_ids[self.id] == 1:  # This id has already been used
            oldid = self.id
            self.id = get_next_id()
            if not quiet:
                printl('Updated id from ' + str(oldid) + ' to ' +
                       str(self.id) + '  ' + str(self))
            Blob2d.all[self.id] = self
            Blob2d.used_ids[self.id] = 1
        else:  # Fill this id entry for the first time
            if not quiet:
                printl('Updated entry for ' + str(self.id))
            Blob2d.used_ids[self.id] = 1
            Blob2d.all[self.id] = self
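
The id-reuse scheme above amounts to a bitmap allocator: scan from the lowest known free index, claim the first free slot, and grow the array on demand. A condensed, list-based sketch of the same idea (here min_free advances just past the claimed id, a slightly tighter bookkeeping than validate_id's):

class IdPool:
    # Hypothetical standalone version of the Blob2d.used_ids bookkeeping
    def __init__(self):
        self.used = []  # 1 = used, 0 = free
        self.min_free = 0

    def claim(self):
        index = self.min_free
        while index < len(self.used) and self.used[index] == 1:
            index += 1
        if index == len(self.used):
            self.used.append(0)  # Grow by a single free slot
        self.used[index] = 1
        self.min_free = index + 1
        return index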
Example 11
 def save2d(self, filename):
     """
     This saves the 2d area around a blob3d for all slides, so that it can be used for testing later
     :param filename: The base filename to save, will have numerical suffix
     :return:
     """
     from scipy import misc as scipy_misc
     slice_arrays = []
     for i in range(self.highslideheight - self.lowslideheight + 1):
         slice_arrays.append(
             np.zeros(
                 (self.maxx - self.minx + 1, self.maxy - self.miny + 1)))
     savename = Config.FIGURES_DIR + filename
     for b2d in self.blob2ds:
         for pixel in b2d.pixels:
             slice_arrays[pixel.z - self.lowslideheight][
                 pixel.x - self.minx][pixel.y - self.miny] = pixel.val
     for slice_num, slice_arr in enumerate(slice_arrays):
         img = scipy_misc.toimage(slice_arr, cmin=0.0, cmax=255.0)
          printl('Saving image of Blob3d slice as: ' + str(savename) +
                 str(slice_num) + '.png')
         img.save(savename + str(slice_num) + '.png')
Example 12
 def tag_blobs_singular(blob3dlist, quiet=False):
     singular_count = 0
     non_singular_count = 0
     for blob3d in blob3dlist:
         singular = True
         for blob2d_num, blob2d in enumerate(blob3d.blob2ds):
              if blob2d_num != 0 and blob2d_num != len(blob3d.blob2ds) - 1:  # Endcap exceptions due to texture
                  if len(blob3d.pairings) > 3:  # Note ideally if > 2 # FIXME strange..
                     singular = False
                     break
         blob3d.isSingular = singular
         # Temp:
         if singular:
             singular_count += 1
         else:
             non_singular_count += 1
     if not quiet:
         printl('There are ' + str(singular_count) +
                ' singular 3d-blobs and ' + str(non_singular_count) +
                ' non-singular 3d-blobs')
Example 13
def filter_sparse_pixels(listin, local_dim_tuple, quiet=False):
    # TODO convert to ids
    local_xdim, local_ydim = local_dim_tuple
    max_float_array = np.zeros([local_xdim, local_ydim])
    for pixel in listin:
        max_float_array[pixel.x][pixel.y] = pixel.val  # Note: remember that these are pointers!
    filtered_pixels = []
    removed_pixel_ids = []
    for (pixn, pixel) in enumerate(listin):  # pixel_number and the actual pixel (value, x-coordinate, y-coordinate)
        xpos = pixel.x  # Note: The naming scheme has been repaired
        ypos = pixel.y
        # Keep track of nz-neighbors, maximal-neighbors, neighbor sum
        buf_nzn = 0
        for horizontal_offset in range(-1, 2, 1):  # NOTE CURRENTLY 1x1 # TODO replace with getneighbors
            for vertical_offset in range(-1, 2, 1):  # NOTE CURRENTLY 1x1
                if vertical_offset != 0 or horizontal_offset != 0:  # Don't measure the current pixel
                    if (local_xdim > xpos + horizontal_offset >= 0
                            and local_ydim > ypos + vertical_offset >= 0):  # Boundary check.
                        # neighbors_checked += 1
                        cur_neighbor_val = max_float_array[xpos + horizontal_offset][ypos + vertical_offset]
                        if cur_neighbor_val > 0:
                            buf_nzn += 1
        if buf_nzn >= Config.minimal_nonzero_neighbors:
            filtered_pixels.append(pixel)
        else:
            removed_pixel_ids.append(pixel.id)
    if not quiet:
        printl('There are ' + str(len(listin) - len(filtered_pixels)) +
               ' dead pixels & ' + str(len(filtered_pixels)) + ' still alive')
    return filtered_pixels
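
The per-pixel 3x3 scan above can also be done in one vectorized pass. A sketch assuming scipy is available: convolving the nonzero mask with a 3x3 kernel whose center is zero gives every cell its 8-connected nonzero-neighbor count at once.

import numpy as np
from scipy import ndimage

def count_nonzero_neighbors(max_float_array):
    # Zero-padded borders match the explicit boundary check above
    kernel = np.ones((3, 3))
    kernel[1, 1] = 0
    mask = (max_float_array > 0).astype(int)
    return ndimage.convolve(mask, kernel, mode='constant', cval=0)

# A pixel at (x, y) would then be kept when
# counts[x, y] >= Config.minimal_nonzero_neighbors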
Example 14
    def clean_b3ds():
        """
        This is a dev method, used to clean up errors in b3ds. Use sparingly!
        :return:
        """
        printl('<< CLEANING B3DS >>')
        # printl("These are the b3ds that will need fixing!")
        set_isBead_after = False
        adjusted_b3d_minmax = 0
        for b3d in Blob3d.all.values():
            if not hasattr(b3d, 'isBead'):
                b3d.isBead = None
                set_isBead_after = True
            remove_children = []
            for child in b3d.children:
                if child not in Blob3d.all:
                    remove_children.append(child)
            if len(remove_children):
                for child in remove_children:
                    b3d.children.remove(child)
                printl(' While cleaning b3d:' + str(b3d) +
                       ' had to remove children that no longer existed ' +
                       str(remove_children))
            if b3d.parent_id is None and b3d.recursive_depth != 0:
                printd(' Found b3d with None parent_id: ' + str(b3d),
                       Config.debug_b3d_merge)
            elif b3d.parent_id is not None and b3d.parent_id not in Blob3d.all:
                printl(' While cleaning b3d:' + str(b3d) +
                       ' had to set parent_id to None, because parent_id: ' +
                       str(b3d.parent_id) + ' is not a valid blob3d-id')
                b3d.parent_id = None

        if set_isBead_after:
            printl(
                ' While cleaning b3ds, found b3ds without isBead attr, so setting isBead for all b3ds'
            )
            Blob3d.tag_all_beads()
        if adjusted_b3d_minmax:
            warn("Had to adjust the ranges for a total of " +
                 str(adjusted_b3d_minmax) +
                 ' blob3ds because their b2ds were out of range')  # FIXME
Example 15
 def assign_alive_pixels_to_blob2dlist(self, quiet=False):
     self.assignPixelsToIds(
         self.alive_pixels)  # Note only printing when primary slide
     id_lists = pixels_to_id_lists(self.alive_pixels)
      self.blob2dlist = []  # Note that blobs in the blob list are ordered by number of pixels, not id; this makes merging faster
     for (blobnum, blobslist) in enumerate(id_lists):
         newb2d = Blob2d(blobslist, self.height)
         self.blob2dlist.append(newb2d.id)
     if not quiet:
         printl('There were ' + str(len(self.alive_pixels)) +
                ' alive pixels assigned to ' + str(len(self.blob2dlist)) +
                ' blobs.')
         self.tf = time.time()
         printl('Creating this slide took', end='')
         print_elapsed_time(self.t0, self.tf, prefix='')
         printl('')
Example 16
 def stitch_blob2ds(b2ds, debug=False):
     pairlist = []
     for b_num, blob1 in enumerate(b2ds):
         blob1 = Blob2d.get(blob1)
         if len(blob1.possible_partners) > 0:
             if debug:
                 printl('  Starting on a new blob from bloblist:' + str(blob1) + ' which has:' + str(
                     len(blob1.possible_partners)) + ' possible partners')
         for b2_num, blob2 in enumerate(blob1.possible_partners):
             blob2 = Blob2d.get(blob2)
             if debug:
                 printl('   Comparing to blob2:' + str(blob2))
             pair = Pairing(blob1.id, blob2.id, 1.1, 36, quiet=True)
             if pair.isConnected:
                 pairlist.append(pair)
             elif debug:
                 printl('    -Blobs not connected')
     return pairlist
Example 17
    def tag_all_beads():
        printd('Tagging bead blob3ds', Config.debug_bead_tagging)
        base_b3ds = Blob3d.at_depth(0, ids=False)
        printl(
            str(len(base_b3ds)) + ' / ' + str(len(Blob3d.all)) +
            ' blob3ds are at recursive_depth=0')

        # DEBUG
        num_base_with_children = len(
            list(b3d for b3d in base_b3ds if len(b3d.children)))
        printl(
            str(num_base_with_children) + ' / ' + str(len(base_b3ds)) +
            ' base b3ds have children!')

        for b3d in base_b3ds:
            b3d.check_bead()
        printd(
            ' ' + str(sum(1 for b3d in base_b3ds if b3d.isBead)) + ' of the ' +
            str(len(base_b3ds)) + ' base b3ds were tagged as beads',
            Config.debug_bead_tagging)

        # clean up
        unset = sorted(
            list(b3d for b3d in Blob3d.all.values() if b3d.isBead is None),
            key=lambda b3d: b3d.recursive_depth)  # Do by recursive depth
        if len(unset):
            printd(
                'When tagging all beads, there were ' + str(len(unset)) +
                ' b3ds which could not be reached from base b3ds',
                Config.debug_bead_tagging)
            printd(
                ' They are: ' + str(unset), Config.debug_bead_tagging
            )  # Want this to always be zero, otherwise there's a tree problem
        for b3d in unset:
            b3d.check_bead()
        printl("Total number of beads = " +
               str(sum(b3d.isBead for b3d in Blob3d.all.values())) + ' / ' +
               str(len(Blob3d.all)))
Example 18
def bloom_b3ds(blob3dlist, stitch=False):
    allb2ds = [Blob2d.get(b2d) for b3d in blob3dlist for b2d in b3d.blob2ds]
    printl('\nProcessing internals of ' + str(len(allb2ds)) +
           ' 2d blobs via \'blooming\' ',
           end='')
    t_start_bloom = time.time()
    num_unbloomed = len(allb2ds)
    pb = ProgressBar(max_val=sum(len(b2d.pixels) for b2d in allb2ds),
                     increments=50)
    for bnum, blob2d in enumerate(allb2ds):
        blob2d.gen_internal_blob2ds()  # NOTE will have len 0 if no blooming can be done
        pb.update(len(blob2d.pixels), set_val=False)  # set is false so that we add to an internal counter
    pb.finish()

    print_elapsed_time(t_start_bloom, time.time(), prefix='took')
    printl('Before blooming there were: ' + str(num_unbloomed) +
           ' b2ds contained within b3ds, there are now ' +
           str(len(Blob2d.all)))

    # Setting possible_partners
    printl(
        'Pairing all new blob2ds with their potential partners in adjacent slides'
    )
    max_avail_depth = max(b2d.recursive_depth for b2d in Blob2d.all.values())
    for cur_depth in range(1, max_avail_depth):  # Skip those at depth 0
        depth = [
            b2d.id for b2d in Blob2d.all.values()
            if b2d.recursive_depth == cur_depth
        ]
        max_h_d = max(Blob2d.all[b2d].height for b2d in depth)
        min_h_d = min(Blob2d.all[b2d].height for b2d in depth)
        ids_by_height = [[] for _ in range(max_h_d - min_h_d + 1)]
        for b2d in depth:
            ids_by_height[Blob2d.get(b2d).height - min_h_d].append(b2d)
        for height_val, h in enumerate(
                ids_by_height[:-1]):  # All but the last one
            for b2d in h:
                b2d = Blob2d.all[b2d]
                b2d.set_possible_partners(ids_by_height[height_val + 1])

    # Creating b3ds
    printl('Creating 3d blobs from the generated 2d blobs')
    all_new_b3ds = []
    for depth_offset in range(1, max_avail_depth + 1):  # Skip offset of zero, which refers to the b3ds which have already been stitched
        printd('Depth_offset: ' + str(depth_offset), Config.debug_blooming)
        new_b3ds = []
        for b3d in blob3dlist:
            all_d1_with_pp_in_this_b3d = []
            for b2d in b3d.blob2ds:
                # Note this is the alternative to storing b3dID with b2ds
                b2d = Blob2d.get(b2d)
                d_1 = [
                    blob for blob in b2d.getdescendants()
                    if blob.recursive_depth == b2d.recursive_depth +
                    depth_offset
                ]
                if len(d_1):
                    for desc in d_1:
                        if len(desc.possible_partners):
                            all_d1_with_pp_in_this_b3d.append(desc.id)
            all_d1_with_pp_in_this_b3d = set(all_d1_with_pp_in_this_b3d)
            if len(all_d1_with_pp_in_this_b3d) != 0:
                printd(' Working on b3d: ' + str(b3d), Config.debug_blooming)
                printd(
                    '  Len of all_d1_with_pp: ' +
                    str(len(all_d1_with_pp_in_this_b3d)),
                    Config.debug_blooming)
                printd('  They are: ' + str(all_d1_with_pp_in_this_b3d),
                       Config.debug_blooming)
                printd(
                    '   = ' + str(
                        list(
                            Blob2d.get(b2d)
                            for b2d in all_d1_with_pp_in_this_b3d)),
                    Config.debug_blooming)
            for b2d in all_d1_with_pp_in_this_b3d:
                b2d = Blob2d.get(b2d)
                printd(
                    '    Working on b2d: ' + str(b2d) + ' with pp: ' +
                    str(b2d.possible_partners), Config.debug_blooming)
                if b2d.b3did == -1:  # unset
                    cur_matches = [
                        b2d
                    ]  # NOTE THIS WAS CHANGED BY REMOVED .getdescendants() #HACK
                    for pp in b2d.possible_partners:
                        printd(
                            "     *Checking if pp:" + str(pp) +
                            ' is in all_d1: ' +
                            str(all_d1_with_pp_in_this_b3d),
                            Config.debug_blooming)
                        if pp in all_d1_with_pp_in_this_b3d:  # HACK REMOVED
                            printd("     Added partner: " + str(pp),
                                   Config.debug_blooming)
                            cur_matches += [
                                Blob2d.get(b)
                                for b in Blob2d.get(pp).getpartnerschain()
                            ]
                    if len(cur_matches) > 1:
                        printd("**LEN OF CUR_MATCHES MORE THAN 1",
                               Config.debug_blooming)
                        new_b3d_list = [
                            blob.id for blob in set(cur_matches)
                            if blob.recursive_depth == b2d.recursive_depth
                            and blob.b3did == -1
                        ]
                        if len(new_b3d_list):
                            new_b3ds.append(
                                Blob3d(new_b3d_list,
                                       r_depth=b2d.recursive_depth))
        all_new_b3ds += new_b3ds
    printl(' Made a total of ' + str(len(all_new_b3ds)) + ' new b3ds')

    if stitch:
        # Set up shape contexts
        printl('Setting shape contexts for stitching')
        for b2d in [
                Blob2d.all[b2d] for b3d in all_new_b3ds for b2d in b3d.blob2ds
        ]:
            b2d.set_shape_contexts(36)

        # Stitching
        printl('Stitching the newly generated 2d blobs')
        for b3d_num, b3d in enumerate(all_new_b3ds):
            printl(' Working on b3d: ' + str(b3d_num) + ' / ' +
                   str(len(all_new_b3ds)))
            Pairing.stitch_blob2ds(b3d.blob2ds, debug=False)
    return all_new_b3ds
Example 19
    exit()
    '''
    for blob3d in largest_base_b3ds:
        printl(blob3d)
        plot_b3ds([blob3d])
        blob3d.gen_skeleton()
        # plot_b3ds([blob3d], color='simple')

    # printl('Plotting b3ds with plotly')
    # plot_plotly(blob3dlist)
    # printl('Plotting b2ds with plotly')
    # plot_plotly(list(Blob2d.all.values()), b2ds=True)
    printl('Plotting all simple:')
    plot_b3ds(blob3dlist, color='simple')
    '''


if __name__ == '__main__':
    try:
        if Config.mayPlot:
            from serodraw import *
            filter_available_colors()
        main()  # Loads or generates blobs, displays in 3d, then displays visual stats
        log.close()
    except Exception as exc:
        printl("\nEXECUTION FAILED!\n")
        printl(traceback.format_exc())
        printl('Writing object to log')
        log.close()
Example 20
    def __init__(self, filename=None, matrix=None, height=None, quiet=False):
        # Note: Must include either filename or matrix
        # When given a matrix instead of a filename of an image, the assumption is that
        # We are computing over blob2ds from within a blob3d,ie experimenting with a subslide
        assert not (matrix is None and filename is None)
        slices = []
        self.t0 = time.time()
        self.debugFlag = False
        if matrix is None:  # Only done if this is a primary slide # FIXME
            self.id_num = Slide.total_slides
            self.height = Slide.total_slides
            Slide.total_slides += 1
            self.filename = filename
            self.primary_slide = True
            imagein = Image.open(filename)
            if not quiet:
                printl('Starting on image: ' + filename)
            imarray = np.array(imagein)
            (self.local_xdim, self.local_ydim,
             self.local_zdim) = imarray.shape[0], imarray.shape[1], self.height
            if not quiet:
                if len(imarray.shape) > 2:
                    printl('There are ' + str(imarray.shape[2]) + ' channels',
                           end='')
                else:
                    printl('There is one channel')
            image_channels = imagein.split()
            for s in range(len(image_channels)):  # Better to split image and use splits for arrays than to split an array
                buf = np.array(image_channels[s])
                slices.append(buf)
                if np.amax(slices[s]) == 0:
                    if not quiet:
                        printl(', Channel #' + str(s) + ' is empty', end='')
            print('', end='\n')
        else:
            slices = [matrix]
            self.local_xdim, self.local_ydim = matrix.shape
            self.id_num = Slide.sub_slides
            self.height = height
            Slide.sub_slides += 1
            self.primary_slide = False

        pixels = []
        for curx in range(self.local_xdim):
            for cury in range(self.local_ydim):
                pixel_value = slices[Config.image_channel_to_use][curx][cury]
                if pixel_value >= Config.min_val_threshold:
                    pixels.append(
                        Pixel(pixel_value,
                              curx,
                              cury,
                              self.id_num,
                              validate=False)
                    )  # No need to validate at this point
        if not quiet:
            printl('There are ' + str(len(pixels)) +
                   ' pixels from the original ' +
                   str(self.local_xdim * self.local_ydim) +
                   ' pixels that are above the minimal pixel threshold')
        self.alive_pixels = filter_sparse_pixels(
            pixels, (self.local_xdim, self.local_ydim), quiet=quiet)

        if len(self.alive_pixels) == 0:
            warn('Didn\'t get any alive pixels from a slide!')
        else:
            self.assign_alive_pixels_to_blob2dlist(quiet=quiet)
Example 21
    def dataToSlides(stitch=True):
        t_gen_slides_0 = time.time()
        all_images = get_images()
        all_slides = []
        for imagefile in all_images:
            all_slides.append(
                Slide(imagefile)
            )  # Pixel computations are done here, as the slide is created.
        printl('Total # of non-zero pixels: ' + str(Pixel.total_pixels) +
               ', total number of pixels after filtering: ' +
               str(len(Pixel.all)))
        printl('Total # of blob2ds: ' + str(len(Blob2d.all)))
        printl('Generating ' + str(len(all_slides)) + ' slides took', end='')
        print_elapsed_time(t_gen_slides_0, time.time(), prefix='')
        printl(
            "Pairing all blob2ds with their potential partners in adjacent slides",
            flush=True)
        Slide.set_possible_partners(all_slides)

        if stitch:
            printl('Setting shape contexts for all blob2ds ',
                   flush=True,
                   end="")
            Slide.set_all_shape_contexts(all_slides)
            stitchlist = Pairing.stitchAllBlobs(
                all_slides, debug=False
            )  # TODO change this to work with a list of ids or blob2ds
        else:
            printl(
                '\n-> Skipping stitching the slides, this will result in less accurate blob3ds for the time being'
            )
        blob3dlist = Slide.extract_blob3ds(all_slides, stitched=stitch)
        printl('There are a total of ' + str(len(blob3dlist)) + ' blob3ds')
        return all_slides, blob3dlist  # Returns slides and all their blob3ds in a list
Example 22
    def __init__(self, blob2dlist, r_depth=0):
        self.id = Blob3d.next_id
        Blob3d.next_id += 1
        self.blob2ds = blob2dlist  # List of the blob 2ds used to create this blob3d
        # Now find my pairings
        self.pairings = []
        self.lowslideheight = min(
            Blob2d.get(blob).height for blob in self.blob2ds)
        self.highslideheight = max(
            Blob2d.get(blob).height for blob in self.blob2ds)
        self.recursive_depth = r_depth
        self.children = []
        self.parent_id = None
        self.isBead = None

        ids_that_are_removed_due_to_reusal = set()
        for blobid in self.blob2ds:
            blob = Blob2d.get(blobid)
            if Blob2d.all[blob.id].b3did != -1:  # DEBUG #FIXME THE ISSUES COME BACK TO THIS, find the source
                # warn('NOT assigning a new b3did (' + str(self.id) + ') to blob2d: ' + str(Blob2d.all[blob.id]))
                printl('---NOT assigning a new b3did (' + str(self.id) +
                       ') to blob2d: ' + str(Blob2d.all[blob.id]))
                Blob3d.possible_merges.append(
                    (Blob2d.all[blob.id].b3did, self.id, blob.id))
                ids_that_are_removed_due_to_reusal.add(blobid)  # HACK
            else:  # Note not adding to the new b3d
                Blob2d.all[blob.id].b3did = self.id
                for stitch in blob.pairings:
                    if stitch not in self.pairings:  # TODO set will be faster
                        self.pairings.append(stitch)
        # self.blob2ds = list(set(self.blob2ds) - ids_that_are_removed_due_to_reusal) # TODO fixed typo 10/10, check doesn't impact elsewhere before uncommenting
        self.maxx = max(Blob2d.get(blob).maxx for blob in self.blob2ds)
        self.maxy = max(Blob2d.get(blob).maxy for blob in self.blob2ds)
        self.miny = min(Blob2d.get(blob).miny for blob in self.blob2ds)
        self.minx = min(Blob2d.get(blob).minx for blob in self.blob2ds)
        self.avgx = sum(Blob2d.get(blob).avgx
                        for blob in self.blob2ds) / len(self.blob2ds)
        self.avgy = sum(Blob2d.get(blob).avgy
                        for blob in self.blob2ds) / len(self.blob2ds)
        self.avgz = (self.lowslideheight + self.highslideheight) / 2
        self.isSingular = False
        self.note = ''  # This is a note that can be manually added for identifying certain characteristics..
        if r_depth != 0:
            """
            This is one of the most convoluted and complicated parts of the project
            This occurs only when a blob3d is being created as a result of blooming
            The idea is that a blob3d is being creating from some blob2ds, which ideally were bloomed from a single blob2d
            However, sometimes bloomed blob2ds from multiple blob3ds end up being stitched together. The idea here is to combine those blob3ds together
            This is complicated because it may need to be recursively applied, to keep the condition that each blob2d and each blob3d are dervied from a single blob3d
            In the event that a blob3d would have multiple parent blob3ds, it's parents are combined
            """

            all_b2d_parents = [
                Blob2d.get(Blob2d.get(b2d).parent_id) for b2d in blob2dlist
            ]
            # printl('All b2d_parents of our b2ds that are going into a new b3d: ' + str(all_b2d_parents))
            parent_b3dids = set(
                [b2d.b3did for b2d in all_b2d_parents if b2d.b3did != -1])
            # printl('Their b3dids: ' + str(parent_b3dids))
            if len(parent_b3dids) > 0:
                printd(
                    'Attempting to create a new b3d with id: ' + str(self.id) +
                    '\nAll b2d_parents of our b2ds that are going into a new b3d: '
                    + str(all_b2d_parents) +
                    '\nAll of the b2ds\'_parents\' b3dids: ' +
                    str(parent_b3dids), Config.debug_b3d_merge)

                if len(parent_b3dids) > 1:
                    printd(
                        '*Found more than one b3d parent for b3d: ' +
                        str(self) + ', attempting to merge parents: ' +
                        str(list(Blob3d.get(b3d) for b3d in parent_b3dids)),
                        Config.debug_b3d_merge)
                    Blob3d.merge(list(parent_b3dids))
                    new_parent_b3dids = list(
                        set([
                            b2d.b3did for b2d in all_b2d_parents
                            if b2d.b3did != -1
                        ]))  # TODO can remove this, just for safety for now
                    printd(
                        '  Post merging b3d parents, updated available-parent b3dids: '
                        + str(new_parent_b3dids), Config.debug_b3d_merge)
                else:
                    new_parent_b3dids = list(parent_b3dids)
                self.parent_id = new_parent_b3dids[0]  # HACK HACK HACK
                if len(new_parent_b3dids) != 0 or self.parent_id == -1:
                    printd(
                        " Updating b3d " + str(self.id) +
                        '\'s parent_id to: ' + str(self.parent_id) +
                        ' from new_parent_ids(after regen after merge): ' +
                        str(list(Blob3d.get(b3d)
                                 for b3d in new_parent_b3dids)),
                        Config.debug_b3d_merge)
                Blob3d.all[self.parent_id].children.append(self.id)
                printd(
                    ' Added b3d ' + str(self.id) +
                    ' to parent\'s list of children, updated parent: ' +
                    str(Blob3d.all[self.parent_id]), Config.debug_b3d_merge)
                if len(new_parent_b3dids) != 1:
                    warn('New b3d (' + str(self.id) +
                         ') should have ended up with exactly one parent!')
            else:
                warn('Creating a b3d at depth ' + str(r_depth) + ' with id ' +
                     str(self.id) + ' which could not find a b3d parent')
        self.validate()
        printd("Done creating new b3d:" + str(self), Config.debug_b3d_merge)
Example 23
def load(filename, directory=Config.PICKLEDIR):
    if directory[-1] not in ['/', '\\']:
        slash = '/'
    else:
        slash = ''
    filename = directory + slash + filename
    t_start = time.time()
    printl('Loading from file \'' + str(filename) + str('\''))

    printl('Loading b3ds ', end='', flush=True)
    t = time.time()

    buff = pickle.load(open(filename + '_b3ds', "rb"))
    Blob3d.all = buff['b3ds']

    found_merged_b3ds = False
    if "merged_b3ds" in buff:
        Blob3d.lists_of_merged_blob3ds = buff["merged_b3ds"]
        found_merged_b3ds = True
    found_merged_parents = False
    if "merged_parent_b3ds" in buff:
        Blob3d.list_of_merged_blob3d_parents = buff["merged_parent_b3ds"]
        found_merged_parents = True

    Blob3d.next_id = max(b3d.id for b3d in Blob3d.all.values()) + 1
    print_elapsed_time(t,
                       time.time(),
                       prefix='(' + str(len(Blob3d.all)) + ') took',
                       flush=True)
    if not found_merged_b3ds:
        print(
            " No lists of merged b3ds found, likely a legacy pickle file or small dataset"
        )
    if not found_merged_parents:
        print(
            " No lists of merged b3 parents found, likely a legacy pickle file or small dataset"
        )

    printl('Loading b2ds ', end='', flush=True)
    t = time.time()
    buff = pickle.load(open(filename + '_b2ds', "rb"))
    Blob2d.all = buff['b2ds']
    Blob2d.used_ids = buff['used_ids']
    Blob2d.total_blobs = len(Blob2d.all)
    print_elapsed_time(t,
                       time.time(),
                       prefix='(' + str(len(Blob2d.all)) + ') took',
                       flush=True)

    printl('Loading pixels ', end='', flush=True)
    t = time.time()
    buff = pickle.load(open(filename + '_pixels', "rb"))
    Pixel.all = buff['pixels']
    Pixel.total_pixels = len(Pixel.all)
    print_elapsed_time(t,
                       time.time(),
                       prefix='(' + str(len(Pixel.all)) + ') took',
                       flush=True)

    printl('Total time to load:', end='')
    print_elapsed_time(t_start, time.time(), prefix='')
Example 24
#!/usr/bin/env python3

import os
import json
import sys

from pubtator_parse import PubtatorParser
from util import printl
try:
    from fuzzywuzzy import fuzz
except ImportError:
    printl('fuzzywuzzy module not found -- fuzzy string matching disabled')
    fuzz = None

class NLPParser(object):

    def __init__(self, file_path, pubtator_file_path=None, fuzzy_ner_match=False):

        ''' fuzzy_ner_match is either False or an integer ratio threshold to use
            for Levenshtein distance between tokens
        '''

        with open(file_path) as f:
            self._json = json.load(f, strict=False)

        # for now, exclude files with strange XML contents, e.g.,
        #
        # ip-172-30-0-123 output_files> cat ../input_files/10179206
        # The wake-effect--emergency vehicle-related collisions.

        # INTRODUCTION: Emergency medical vehicle collisions (EMVCs) occurring 
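
The docstring above describes fuzzy_ner_match as "either False or an integer ratio threshold". A minimal sketch of how such a token comparison might look with fuzzywuzzy (a hypothetical helper; the real matching code is not shown in this excerpt):

def tokens_match(token_a, token_b, fuzzy_ner_match=False):
    # Exact match by default; Levenshtein-ratio match when a threshold is given
    if fuzzy_ner_match and fuzz is not None:
        return fuzz.ratio(token_a, token_b) >= fuzzy_ner_match
    return token_a == token_b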
Example 25
def save(blob3dlist, filename, directory=Config.PICKLEDIR):
    slash = ''
    if directory != '':
        if directory[-1] not in ['/', '\\']:
            slash = '/'
    filename = directory + slash + filename
    printl('\nSaving to file \'' + str(filename) + str('\''))
    done = False
    while not done:
        try:
            printl('Pickling ' + str(len(blob3dlist)) + ' b3ds ', end='')
            t0 = t = time.time()
            pickle.dump(
                {
                    'b3ds': Blob3d.all,
                    'possible_merges': Blob3d.possible_merges,
                    "merged_b3ds": Blob3d.lists_of_merged_blob3ds,
                    "merged_parent_b3ds": Blob3d.list_of_merged_blob3d_parents
                },
                open(filename + '_b3ds', "wb"),
                protocol=0)
            print_elapsed_time(t, time.time(), prefix='took')

            printl('Pickling ' + str(len(Blob2d.all)) + ' b2ds ', end='')
            t = time.time()
            pickle.dump({
                'b2ds': Blob2d.all,
                'used_ids': Blob2d.used_ids
            },
                        open(filename + '_b2ds', "wb"),
                        protocol=0)
            print_elapsed_time(t, time.time(), prefix='took')

            printl('Pickling ' + str(len(Pixel.all)) + ' pixels ', end='')
            t = time.time()
            pickle.dump(
                {
                    'pixels': Pixel.all,
                    'total_pixels': Pixel.total_pixels
                },
                open(filename + '_pixels', "wb"),
                protocol=0)
            print_elapsed_time(t, time.time(), prefix='took')
            done = True

            printl('Saving took:', end='')
            print_elapsed_time(t0, time.time(), prefix='')
        except RuntimeError:
            printl(
                '\nIf recursion depth has been exceeded, '
                'you may increase the maximal depth with: sys.setrecursionlimit(<newdepth>)'
            )
            printl('The current max recursion depth is: ' +
                   str(sys.getrecursionlimit()))
            printl(
                'Opening up an interactive console, press \'n\' then \'enter\' to load variables before interacting,'
                ' and enter \'exit\' to resume execution')
            debug()
    log.flush()
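
The RuntimeError handler above points at the underlying issue: deeply nested Blob3d/Blob2d reference chains can exceed the interpreter's recursion limit during pickling. A small sketch of a wrapper that raises the limit temporarily, and uses a binary protocol (smaller and faster than the protocol=0 ASCII format used above):

import pickle
import sys

def dump_with_deep_recursion(obj, path, depth=10000):
    # Hypothetical helper; restores the original limit even on failure
    old_limit = sys.getrecursionlimit()
    try:
        sys.setrecursionlimit(depth)
        with open(path, "wb") as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        sys.setrecursionlimit(old_limit)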
Example 26
    def pixels_to_blob2ds(pixellist, parent_id=-1, recursive_depth=0):
        alonepixels = []
        alive = set(pixellist)
        blob2dlists = []
        while len(alive):
            alivedict = Pixel.pixel_ids_to_dict(alive)
            pixel = next(iter(alive))  # Basically alive[0]
            neighbors = set(
                Pixel.get(pixel).get_neighbors_from_dict(alivedict))
            index = 1
            done = False
            while (len(neighbors) == 0 or not done) and len(alive) > 0:
                if index < len(alive):
                    try:
                        pixel = list(alive)[index]  # Basically alive[index] # TODO fix this to get the index set to the next iteration
                        index += 1
                    except Exception as exc:
                        printl('Error encountered: ' + str(exc))
                        printl('Index:' + str(index))
                        printl('Length of alive:' + str(len(alive)))
                        import pdb
                        pdb.set_trace()
                else:
                    done = True
                    # Assuming that all the remaining pixels are their own blob2ds essentially, and so are removed
                neighbors = set(
                    Pixel.get(pixel).get_neighbors_from_dict(alivedict))
                if len(neighbors) == 0:
                    alive = alive - {pixel}
                    alonepixels.append(pixel)
                    index -= 1  # In case we damaged the index
                    if index < 0:  # HACK
                        index = 0
            oldneighbors = set()  # TODO can make this more efficient
            while len(oldneighbors) != len(neighbors):
                oldneighbors = set(neighbors)
                newneighbors = set(neighbors)
                for pixel in neighbors:
                    newneighbors = newneighbors | set(
                        pixel.get_neighbors_from_dict(alivedict))
                neighbors = newneighbors
            blob2dlists.append(list(neighbors))
            alive = alive - set(n.id for n in neighbors)

        b2ds = [
            Blob2d(blob2dlist,
                   blob2dlist[0].z,
                   parent_id=parent_id,
                   recursive_depth=recursive_depth)
            for blob2dlist in blob2dlists if len(blob2dlist) > 0
        ]

        # TODO this update is very expensive, need to separate this lists of children from the blob2ds (into another dict), therefore no need for a deep copy of a blob2d

        Blob2d.all[parent_id].children = Blob2d.all[parent_id].children + [
            b2d.id for b2d in b2ds
        ]

        if Blob2d.get(parent_id).recursive_depth > 0:
            Blob2d.all[parent_id].pixels += [
                pixel for b2d in b2ds for pixel in b2d.pixels
            ]
        b2ds = [b2d.id for b2d in b2ds]
        return b2ds
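
Stripped of the index juggling, pixels_to_blob2ds is a connected-components pass: seed from any unvisited pixel and expand through neighbors until the component is exhausted. A compact breadth-first sketch of that core loop, where neighbors_of is a hypothetical stand-in for get_neighbors_from_dict:

from collections import deque

def connected_components(pixel_ids, neighbors_of):
    unvisited = set(pixel_ids)
    components = []
    while unvisited:
        seed = unvisited.pop()
        queue, component = deque([seed]), [seed]
        while queue:
            pid = queue.popleft()
            for neighbor in neighbors_of(pid):
                if neighbor in unvisited:
                    unvisited.discard(neighbor)
                    component.append(neighbor)
                    queue.append(neighbor)
        components.append(component)  # single-pixel components are the 'alone pixels'
    return components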
Example 27
def do_stat_analysis():
    printl("Now performing statistical analysis...")
    b3d_count = len(Blob3d.all)
    base_b3ds = list(b3d for b3d in Blob3d.all.values()
                     if b3d.recursive_depth == 0)
    beads = list(b3d for b3d in Blob3d.all.values()
                 if b3d.isBead)  # TODO optimize

    beads_per_strand = []
    loose_beads = []  # These are beads that are solitary (not part of a strand)
    beads_in_strands = []
    strands = []

    for b3d in base_b3ds:  # TODO see if this conflicts with the current 'isBead' labeling
        buf = b3d.get_first_child_beads()
        num_children = len(buf)
        if num_children != 0:
            # Has bead children
            if b3d.isBead:
                # Tagged as a bead despite having bead children; count it as loose
                loose_beads.append(b3d)
            else:
                # Not a bead, so it is a strand (since these are base b3ds)
                strands.append(b3d)
                beads_per_strand.append(num_children)
                beads_in_strands += buf
        else:
            # No children, therefore implicitly a loose bead?
            if not b3d.isBead:
                print("WARNING adding b3d to loose beads, when not isBead: " +
                      str(b3d))
            loose_beads.append(b3d)
    number_of_strands = len(beads_per_strand)
    printl('Total number of beads: ' + str(len(beads)) + ' out of ' +
           str(b3d_count) + ' total b3ds')
    printl('Total number of base b3ds: ' + str(len(base_b3ds)) + ' out of ' +
           str(b3d_count) + ' total b3ds')
    printl('Total number of loose beads: ' + str(len(loose_beads)) +
           ' out of ' + str(b3d_count) + ' total b3ds')
    printl('Total number of strands: ' + str(len(strands)) + ' out of ' +
           str(b3d_count) + ' total b3ds')

    plot_hist_xyz(base_b3ds)
    plot_hist_xyz(beads, type='All_Bead_b3ds')
    plot_hist_xyz(loose_beads, type='Loose_bead_b3ds')
    plot_hist_xyz(strands, type='Strand_b3ds')
    plot_hist_xyz(beads_in_strands, type='Beads_in_strand_b3ds')
    #
    plot_corr(base_b3ds)
    plot_corr(beads, type='All_Bead_b3ds')
    plot_corr(loose_beads, type='Loose_bead_b3ds')
    plot_corr(strands, type='Strand_b3ds')
    plot_corr(beads_in_strands, type='Beads_in_strand_b3ds')

    n1, bins1, patches1 = plt.hist(beads_per_strand,
                                   bins=max(beads_per_strand))
    plt.xlabel("Number of beads per strand")
    plt.ylabel("Number of b3ds")
    plt.title("Strand b3ds by number of beads")
    plt.tight_layout()
    plt.show()
Example 28
 def mergeblobs(bloblist):
     """
     Returns a NEW list of blobs, which have been merged after having their ids updated (externally, beforehand)
     Use the global variable 'debug_set_merge' to control output
     :param bloblist:
     """
     newlist = []
      copylist = list(bloblist)  # http://stackoverflow.com/questions/2612802/how-to-clone-or-copy-a-list-in-python
     printd('Blobs to merge:' + str(copylist), Config.debug_set_merge)
     while len(copylist) > 0:
         printd('Len of copylist:' + str(len(copylist)),
                Config.debug_set_merge)
         blob1 = copylist[0]
         newpixels = []
         merged = False
         printd('**Curblob:' + str(blob1), Config.debug_set_merge)
         for (index2, blob2) in enumerate(copylist[1:]):
             if blob2 == blob1:
                 printd(
                     '   Found blobs to merge: ' + str(blob1) + ' & ' +
                     str(blob2), Config.debug_set_merge)
                 if Blob2d.get(blob1).recursive_depth != Blob2d.get(
                         blob2).recursive_depth:
                     printl(
                         'WARNING merging two blobs of different recursive depths:'
                         + str(blob1) + ' & ' + str(blob2))
                 merged = True
                 newpixels = newpixels + Blob2d.get(blob2).pixels
         if not merged:
             printd('--Never merged on blob:' + str(blob1),
                    Config.debug_set_merge)
             newlist.append(blob1)
             del copylist[0]
         else:
             printd(' Merging, newlist-pre:', Config.debug_set_merge)
             printd(' Merging, copylist-pre:', Config.debug_set_merge)
             index = 0
             while index < len(copylist):
                 printd(' Checking to delete:' + str(copylist[index]),
                        Config.debug_set_merge)
                 if copylist[index] == blob1:
                     printd('  Deleting:' + str(copylist[index]),
                            Config.debug_set_merge)
                     del copylist[index]
                     index -= 1
                 index += 1
             newlist.append(
                 Blob2d(Blob2d.get(blob1).pixels + newpixels,
                        Blob2d.get(blob1).height,
                        recursive_depth=Blob2d.get(blob1).recursive_depth,
                        parent_id=min(
                            Blob2d.get(blob1).parentID,
                            Blob2d.get(blob2).parentID)))
             printd(' Merging, newlist-post:' + str(newlist),
                    Config.debug_set_merge)
             printd(' Merging, copylist-post:' + str(copylist),
                    Config.debug_set_merge)
     printd('Merge result' + str(newlist), Config.debug_set_merge)
     return newlist
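
A hypothetical usage sketch for mergeblobs, under the docstring's precondition that ids have already been unified externally; slide.blob2dlist holding blob2d ids is an assumption carried over from the other examples.

# Sketch only: collapse duplicate blob2ds on one slide after id unification
before = len(slide.blob2dlist)
slide.blob2dlist = mergeblobs(slide.blob2dlist)
printd('Merged ' + str(before) + ' blob2ds down to ' +
       str(len(slide.blob2dlist)), Config.debug_set_merge)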
Example no. 30
0
def main():
    # printl('Current recursion limit: ' + str(sys.getrecursionlimit()) + ' updating to: ' + str(Config.recursion_limit))
    # sys.setrecursionlimit(Config.recursion_limit)  # HACK
    if Config.test_instead_of_data:
        picklefile = 'All_test_pre_b3d_tree.pickle'
    else:
        picklefile = Config.PICKLE_FILE_PREFIX + ".pickle"
    if not Config.dePickle:
        all_slides, blob3dlist = Slide.dataToSlides(
            stitch=Config.base_b3ds_with_stitching)
        # Reads in images and converts them to slides.
        # This process involves generating Pixels & Blob2ds & Blob3ds & Pairings
        printl("Saving a 'recursive depth 0' (rd0) copy")
        save(blob3dlist, picklefile + '_rd0_only')
        log.flush()
        if Config.process_internals:
            bloomed_b3ds = bloom_b3ds(
                blob3dlist, stitch=Config.stitch_bloomed_b2ds
            )  # Also sets partners and, optionally, stitches
            printl('Blooming resulted in ' + str(len(bloomed_b3ds)) +
                   ' new b3ds:')
            blob3dlist = blob3dlist + bloomed_b3ds

        save(blob3dlist, picklefile)
        log.flush()

        # print("\n\nThere were a total of %s lists of merged b3d lists" % (len(Blob3d.lists_of_merged_blob3ds)))
        # for index, sublist in enumerate(Blob3d.lists_of_merged_blob3ds):
        #     print("Index:%s, sublist_len:%s, sublist_ids:%s, sublist:%s" % (index, len(sublist), [b3d.id for b3d in sublist], sublist))
        #     plot(sublist)

    else:
        if Config.load_base_only:
            load(picklefile + '_rd0_only')
            blob3dlist = list(Blob3d.all.values())
            if Config.process_internals:
                bloomed_b3ds = bloom_b3ds(blob3dlist,
                                          stitch=Config.stitch_bloomed_b2ds)
                # Includes setting partners, and optionally stitching
                printl('Blooming resulted in ' + str(len(bloomed_b3ds)) +
                       ' new b3ds:')
                for b3d in bloomed_b3ds:
                    printl(b3d)
                blob3dlist = blob3dlist + bloomed_b3ds
        else:
            load(picklefile)
            blob3dlist = list(Blob3d.all.values())

    Blob3d.clean_b3ds()  # For safety: prioritize successful execution over perfectly correct data
    printl('Setting beads!')
    Blob3d.tag_all_beads()

    # largest_base_b3ds = sorted(list(blob3d for blob3d in Blob3d.all.values() if blob3d.recursive_depth == 0),
    #                       key=lambda b3d: b3d.get_edge_pixel_count(), reverse=True)  # Do by recursive depth

    # for b3d in blob3dlist:
    #     print(b3d)
    # plot([b3d], ids=False)
    # gen_skeleton_new(b3d)

    # print("P:" + str(b3d.get_pixels()))
    # print("EP:" + str(b3d.get_edge_pixels()))
    # for child in b3d.children:
    #     print(' ' + str(Blob3d.get(child)))

    plot(blob3dlist,
         ids=False,
         stitches=True,
         buffering=True,
         parentlines=True,
         explode=True,
         show_debug_colors=True)

    # TODO Calculate and plot further statistics about the data.
    # Good examples are:
    #   Total number of b3ds, and the distribution of pixel counts across blob3ds
    #   Density over the 3d volume of the scans, as a density map and as 3 histograms, for:
    #         total beads, singular beads
    #   Average number of beads per strand
    #
    # do_stat_analysis()  # a hedged sketch of this helper follows this example
    exit()
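
A hedged sketch of the do_stat_analysis() referenced in the TODO above; everything here is an assumption pieced together from attributes seen in these examples (get_pixels, isBead, recursive_depth, children) and the plt/printl globals they already rely on, not the project's actual implementation.

def do_stat_analysis():
    # Sketch only: the statistics named in the TODO
    b3ds = list(Blob3d.all.values())
    printl('Total number of b3ds: ' + str(len(b3ds)))
    plt.hist([len(b3d.get_pixels()) for b3d in b3ds], bins=50)
    plt.xlabel('Pixels per blob3d')
    plt.ylabel('Number of b3ds')
    plt.title('Distribution of pixel counts over all b3ds')
    plt.tight_layout()
    plt.show()
    strands = [b3d for b3d in b3ds
               if b3d.recursive_depth == 0 and not b3d.isBead]
    beads_per_strand = [sum(1 for c in s.children if Blob3d.get(c).isBead)
                        for s in strands]
    if strands:
        printl('Average number of beads per strand: ' +
               str(sum(beads_per_strand) / len(strands)))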
Example no. 31
0
def main(conf, current_chunk, total_chunks):
    # doc_id         text,
    # sentence_index int,
    # sentence_text  text,
    # tokens         text[],
    # lemmas         text[],
    # pos_tags       text[],
    # ner_tags       text[],
    # doc_offsets    int[],
    # dep_types      text[],
    # dep_tokens     int[]
    printl('Loading sentence data, Chunk {} of {}'.format(
        current_chunk, total_chunks))

    if conf['data_tgz']:
        article_list = glob.glob(os.path.join(conf['data_directory'], '*.tgz'))
    else:
        article_list = []  # avoid a NameError below when no tgz input is configured
    article_chunk = [
        a for a in article_list
        if a.endswith('_{}_combined.tgz'.format(current_chunk))
    ]
    if len(article_chunk) < 1:
        printl(
            'Sentence loader - Chunk {} - no file found'.format(current_chunk))
        sys.exit(1)
    elif len(article_chunk) > 1:
        printl(
            'Sentence loader - Chunk {} - multiple files found: {} (importing anyway)'
            .format(current_chunk, str(article_chunk)))

    for article_archive in article_chunk:
        printl(article_archive)
        with tarfile.open(article_archive,
                          "r:gz") as tar, tempfile.TemporaryDirectory() as td:

            # corenlp output:
            output_files = filter_files_from_tar(tar, 'output_files')

            if conf['parse_pubtator']:
                # pubtator output:
                pubtator_files = filter_files_from_tar(tar, 'pubtator')
            else:
                pubtator_files = []

            # extract pubtator and corenlp output files into tempdir
            tar.extractall(path=td,
                           members=itertools.chain(output_files,
                                                   pubtator_files))

            # glob/read through output_files and print file data
            output_filepaths = sorted(
                glob.glob(os.path.join(td, '*', 'output_files', '*')))
            if conf['parse_pubtator']:
                # use the extracted pubtator files alongside the corenlp output
                pubtator_filepaths = sorted(
                    glob.glob(os.path.join(td, '*', 'pubtator', '*')))
            else:
                pubtator_filepaths = [None for _ in output_filepaths]

            for i, (fp, pubtator_fp) in enumerate(
                    zip(output_filepaths, pubtator_filepaths)):
                parse_corenlp_output(conf, fp, pubtator_fp)
                if i % 1000 == 0:
                    printl('Processed file {} of chunk'.format(i))
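
The commented-out column list at the top of this example describes the target sentences table. As a hedged illustration (column names and order come from that comment; the serialization itself, including the format_sentence_row and pg_array helpers, is an assumption), one row could be rendered for a tab-separated loader like this:

import json

def format_sentence_row(doc_id, sentence_index, sentence):
    # Sketch only: emit one tab-separated line whose columns follow the
    # commented schema above; a real loader may need stricter escaping
    def pg_array(items):
        # minimal Postgres-style array literal, e.g. {"a","b"}
        return '{' + ','.join(json.dumps(str(i)) for i in items) + '}'
    return '\t'.join([
        doc_id,
        str(sentence_index),
        sentence['text'],
        pg_array(sentence['tokens']),
        pg_array(sentence['lemmas']),
        pg_array(sentence['pos_tags']),
        pg_array(sentence['ner_tags']),
        pg_array(sentence['doc_offsets']),
        pg_array(sentence['dep_types']),
        pg_array(sentence['dep_tokens']),
    ])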
Example no. 32
0
#!/usr/bin/env python3

import os
import json
import sys

from pubtator_parse import PubtatorParser
from util import printl
try:
    from fuzzywuzzy import fuzz
except ImportError:
    printl('fuzzywuzzy module not found -- fuzzy string matching disabled')
    fuzz = None
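
The __init__ docstring below describes fuzzy_ner_match as either False or an integer ratio threshold. As a hedged sketch (an assumption, not the class's actual logic), such a threshold could gate token matching like this:

def tokens_match(a, b, fuzzy_ner_match=False):
    # Sketch only: fuzzy comparison when a threshold is given and fuzzywuzzy
    # imported successfully; otherwise fall back to exact equality
    if fuzz is not None and fuzzy_ner_match:
        return fuzz.ratio(a, b) >= fuzzy_ner_match
    return a == b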


class NLPParser(object):
    def __init__(self,
                 file_path,
                 pubtator_file_path=None,
                 fuzzy_ner_match=False):
        ''' fuzzy_ner_match is either False or an integer ratio threshold to use
            for Levenshtein distance between tokens
        '''

        with open(file_path) as f:
            self._json = json.load(f, strict=False)

        # for now, exclude files with strange XML contents, e.g.,
        #
        # ip-172-30-0-123 output_files> cat ../input_files/10179206
        # The wake-effect--emergency vehicle-related collisions.