Esempio n. 1
0
 def test_to_list(self):
     """Check that utils.to_list yields an equal list for several input kinds.

     The same two bounding-box dicts are supplied as a list, a tuple, a
     generator and a pandas DataFrame; every conversion must compare equal
     to the original list.
     """
     outer_box = {"x0": 0, "top": 0, "x1": 20, "bottom": 20}
     inner_box = {"x0": 10, "top": 10, "x1": 15, "bottom": 15}
     objs = [outer_box, inner_box]

     variants = (
         objs,
         tuple(objs),
         (o for o in objs),
         pd.DataFrame(objs),
     )
     for variant in variants:
         assert utils.to_list(variant) == objs
def extract_text(chars, x_tolerance=utils.DEFAULT_X_TOLERANCE,
                 y_tolerance=utils.DEFAULT_Y_TOLERANCE):
    """Collate ``chars`` into a single string, one segment per text line.

    The characters are grouped into lines by clustering on their ``doctop``
    value (within ``y_tolerance``); each line is rendered with
    ``collate_line`` and the rendered lines are joined with the literal
    separator ``"|&|"``.

    Args:
        chars: Collection of character objects; anything ``utils.to_list``
            accepts.
        x_tolerance: Passed through to ``collate_line`` for every line.
        y_tolerance: Passed to ``utils.cluster_objects`` when grouping
            characters by ``doctop``.

    Returns:
        The joined text, or ``None`` when there are no characters.
    """
    # Normalise before testing for emptiness: inputs such as generators
    # have no len(), so calling len() on the raw argument would raise.
    chars = utils.to_list(chars)
    if not chars:
        return None

    doctop_clusters = utils.cluster_objects(chars, "doctop", y_tolerance)

    return "|&|".join(
        collate_line(line_chars, x_tolerance)
        for line_chars in doctop_clusters
    )
Esempio n. 3
0
def segment_to_lines(segments, x_tolerance=X_TOLERANCE):
    """Split every segment that contains chars into lines.

    Segments that are not lists, and empty lists, are copied to the result
    unchanged. Any other segment is converted with ``to_list``, clustered
    with ``cluster_chars``, and replaced by a list holding the result of
    ``collate_line_chars(cluster, x_tolerance)`` for each cluster.

    Returns a new list with one entry per input segment, in input order.
    """
    result = []

    for segment in segments:
        # Nothing to split: pass the segment through untouched.
        if not isinstance(segment, list) or len(segment) == 0:
            result.append(segment)
            continue

        char_groups = cluster_chars(to_list(segment))
        result.append([
            collate_line_chars(group, x_tolerance)
            for group in char_groups
        ])

    return result
def extract_words(page,
                  x_tolerance=DEFAULT_X_TOLERANCE,
                  y_tolerance=DEFAULT_Y_TOLERANCE,
                  keep_blank_chars=False):
    """Group the characters of ``page`` into words.

    The characters in ``page.chars`` are clustered into lines on their
    ``doctop`` value, exact duplicate characters inside each line are
    removed, and every line is then split into words from left to right.

    Args:
        page: Object exposing a ``chars`` collection of character dicts.
        x_tolerance: Largest gap between one character's ``x1`` and the next
            character's ``x0`` that still keeps both in the same word.
        y_tolerance: Tolerance handed to ``cluster_objects`` when grouping
            characters into lines by ``doctop``.
        keep_blank_chars: When false, a whitespace character ends the
            current word and is dropped; when true it is treated like any
            other character.

    Returns:
        A flat list of word dicts (keys ``x0``, ``x1``, ``top``, ``bottom``,
        ``text``, ``chars``), line after line. The character dicts stored
        under ``chars`` are copies rebuilt from JSON, with float values
        converted to ``Decimal``.
    """
    x_tolerance = decimalize(x_tolerance)
    y_tolerance = decimalize(y_tolerance)

    def process_word_chars(chars):
        """Build the word dict for one run of characters."""
        x0, top, x1, bottom = objects_to_bbox(chars)
        return {
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
            "text": "".join(map(itemgetter("text"), chars)),
            "chars": chars
        }

    def make_set_clusters(doctop_cluster):
        """Drop exact duplicate characters from every line cluster.

        Characters are compared by their JSON serialisation. The survivors
        are rebuilt from that serialisation, so float values are turned back
        into ``Decimal`` afterwards.
        """
        new_clusters = []
        for cluster in doctop_cluster:
            # Track serialisations already seen instead of going through a
            # set: a set would shuffle the characters in a hash-dependent
            # order, which made the later tie-breaking between characters
            # with equal x0 vary from run to run.
            seen = set()
            unique_chars = []
            for char in cluster:
                dumped = simplejson.dumps(char)
                if dumped in seen:
                    continue
                seen.add(dumped)
                restored = simplejson.loads(dumped)
                for key, value in restored.items():
                    if isinstance(value, float):
                        restored[key] = Decimal(str(value))
                unique_chars.append(restored)
            new_clusters.append(unique_chars)
        return new_clusters

    def check_two_chars(char1, char2):
        """Return False when the two characters start less than 1 unit apart."""
        return not (abs(char1['x0'] - char2['x0']) < 1)

    def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
        """Split the characters of one line into a list of word dicts."""
        get_text = itemgetter("text")
        chars_sorted = sorted(chars, key=itemgetter("x0"))
        # Discard a character that starts within 1 unit of its predecessor
        # in the sorted line (near-overlapping glyphs). The comparison is
        # against the previous sorted character, kept or not.
        chars_sorted = [
            char
            for i, char in enumerate(chars_sorted)
            if i == 0 or check_two_chars(char, chars_sorted[i - 1])
        ]

        words = []
        current_word = []

        for char in chars_sorted:
            if not keep_blank_chars and get_text(char).isspace():
                # Whitespace closes the word in progress and is not kept.
                if current_word:
                    words.append(current_word)
                    current_word = []
                continue

            # A horizontal gap wider than the tolerance starts a new word.
            if current_word and char["x0"] > (current_word[-1]["x1"] + tolerance):
                words.append(current_word)
                current_word = []
            current_word.append(char)

        if current_word:
            words.append(current_word)
        return [process_word_chars(word) for word in words]

    chars = to_list(page.chars)
    doctop_clusters = cluster_objects(chars, "doctop", y_tolerance)
    doctop_clusters = make_set_clusters(doctop_clusters)
    nested = [
        get_line_words(line_chars, tolerance=x_tolerance)
        for line_chars in doctop_clusters
    ]

    return list(itertools.chain.from_iterable(nested))
 def draw_circles(self, list_of_circles, **kwargs):
     """Call ``draw_circle`` for each item of ``list_of_circles``.

     The input is first converted with ``utils.to_list``; ``kwargs`` are
     forwarded unchanged to every ``draw_circle`` call. Returns ``self``.
     """
     circles = utils.to_list(list_of_circles)
     for circle in circles:
         self.draw_circle(circle, **kwargs)
     return self
 def draw_rects(self, list_of_rects, **kwargs):
     """Call ``draw_rect`` for each item of ``list_of_rects``.

     The input is first converted with ``utils.to_list``; ``kwargs`` are
     forwarded unchanged to every ``draw_rect`` call. Returns ``self``.
     """
     rects = utils.to_list(list_of_rects)
     for rect in rects:
         self.draw_rect(rect, **kwargs)
     return self
 def draw_hlines(self, locations, **kwargs):
     """Call ``draw_hline`` for each item of ``locations``.

     The input is first converted with ``utils.to_list``; ``kwargs`` are
     forwarded unchanged to every ``draw_hline`` call. Returns ``self``.
     """
     positions = utils.to_list(locations)
     for position in positions:
         self.draw_hline(position, **kwargs)
     return self