Example #1
0
def __extract_textboxes(document):
    """Group the features of a document into textboxes.

    A textbox is not a concept ArthurDocument itself knows about: from its
    point of view there are only features, some of which happen to share the
    same page and textbox_id. The grouping therefore lives here, so other
    document types remain free to define textboxes differently.

    Args:
        document(ArthurDocument): Document whose features will be grouped.

    Returns:
        list: One array of feature rows per distinct (page, textbox_id) pair,
            in the order the pairs are returned by ``unique_rows``.
    """
    features = document.get_features()
    page_col = ArthurDocument.get_feature_id('page')
    textbox_col = ArthurDocument.get_feature_id('textbox_id')

    # Every distinct (page, textbox_id) combination identifies one textbox.
    textbox_keys = unique_rows(features[:, [page_col, textbox_col]])

    pages = features[:, page_col]
    textbox_ids = features[:, textbox_col]
    return [
        features[np.where((pages == page) & (textbox_ids == textbox_id))]
        for page, textbox_id in textbox_keys
    ]
Example #2
0
    def process_batch(zipfile, corpus_dir, batch, total, counter=0):
        """Write the de-duplicated text of every document in ``batch`` to disk.

        For each document name a ``<docname>.txt`` file is written into
        ``corpus_dir``, one line per textbox. Documents whose output file
        already exists are skipped unless ``overwrite`` is set, and documents
        that yield no text produce no file.

        ``overwrite`` and ``stdout`` are not parameters; they are read from
        the enclosing scope. Progress messages are only written when
        ``stdout`` is not None.

        Args:
            zipfile: Archive object whose ``read(docname)`` returns the raw
                content of a document (presumably a ``zipfile.ZipFile``).
            corpus_dir(str): Directory the text files are written to. It is
                created when missing.
            batch(iterable): Names of the documents to process.
            total(int): Total number of documents, used in progress messages.
            counter(int): Number of documents processed before this batch,
                used in progress messages.
        """
        for docname in batch:
            counter += 1
            if not os.path.exists(corpus_dir):
                os.makedirs(corpus_dir)
            filename = os.path.join(corpus_dir, docname + '.txt')

            if os.path.isfile(filename) and not overwrite:
                if stdout is not None:
                    stdout.write("%s already exists (%i/%i)\n" % (docname, counter, total))
                continue

            content = zipfile.read(docname)
            if stdout is not None:
                stdout.write("processing %s (%i/%i)\n" % (docname, counter, total))
            document = ArthurDocument(content, name=docname)

            texts = []
            for textbox in __extract_textboxes(document):
                # Drop duplicated / image features before reading the text.
                remove = __find_duplicates(textbox)
                ctextbox = np.delete(textbox, remove, axis=0)
                texts.append(document.get_text(ctextbox))

            if not texts:
                if stdout is not None:
                    stdout.write("    empty text! moving on...\n")
                continue

            # corpus_dir already exists at this point (ensured at the top of
            # the iteration). file.write is used rather than the Python 2
            # only ``print >>`` statement so this runs on Python 2 and 3.
            with open(filename, 'w') as fout:
                for text in texts:
                    fout.write("%s\n" % (text,))
Example #3
0
def __find_duplicates(features, radius=0.4):
    """Finds the indexes of duplicated features and of image features.

    Features whose (x, y) positions lie within ``radius`` of each other are
    considered duplicates of one another; of every such group only the
    feature with the lowest row index is kept. Features whose ``img_width``
    is not -1 are reported for removal as well.

    Example of usage
    >>> pdf_path = os.path.join(base_path, 'test', 'test.pdf')
    >>> f = open(pdf_path, 'rb')
    >>> document = ArthurDocument(f.read(), doctype='pdf')
    >>> textboxes = __extract_textboxes(document)
    >>> print(document.get_text(textboxes[11]))
    Property TypeProperty Type Property TypeProperty Type Single Family

    >>> remove_indexes = __find_duplicates(textboxes[11])
    >>> cfeatures = np.delete(textboxes[11], remove_indexes, axis=0)
    >>> print(document.get_text(cfeatures))
    Property Type Single Family

    Args:
        features(np.array): 2-D array of features, one feature per row.
        radius(float): Maximum distance between two (x, y) positions for the
            features to be treated as duplicates. Defaults to 0.4.

    Returns:
        list: Row indexes into ``features`` that should be removed, e.g. with
            ``np.delete(features, indexes, axis=0)``. The same index may
            appear more than once.
    """
    x_col = ArthurDocument.get_feature_id('x')
    y_col = ArthurDocument.get_feature_id('y')
    positions = features[:, [x_col, y_col]]
    tree = cKDTree(positions)

    # For every feature, collect the indexes of all features (itself
    # included) located within `radius` of it, then collapse the groups that
    # were reported identically for several features.
    neighbors = tree.query_ball_point(positions, radius)
    neighbors = np.unique(neighbors)
    # This returns numpy array like:
    # [[0, 13, 26, 39] [1, 14, 27, 40] [5, 31, 44, 18] [11, 24, 37, 50]
    # [16, 29, 42, 3] [17, 30, 43, 4] [21, 8, 34, 47] [22, 35, 48, 9]
    # [32, 45, 19, 6] [36, 23, 10, 49] [38, 12, 25, 51] [41, 28, 2, 15]
    # [46, 33, 7, 20] [52] [53] [54] [55] [56] [57] [58] [59] [60] [61] [62]
    # [63] [64]]
    #
    # From each group everything but the lowest index is removed,
    # e.g. remove index 13, 26, 39, 14, 27, etc.
    removed = []
    for group in neighbors:
        removed.extend(np.sort(group)[1:])

    # Removes image elements
    img_width_col = ArthurDocument.get_feature_id('img_width')
    removed.extend(np.where(features[:, img_width_col] != -1)[0].tolist())

    return removed
    def extract_expressions(self, document, features=None):
        """Returns the expressions found in the text of the given features.

        The text of the features is split into word tokens. The Multi-Word
        Expressions (MWEs) configured in ``self.mwes`` are merged back so that
        each of them is treated as one expression. Every entry of
        ``self.mwes`` is either a string such as ``'property type'`` or an
        iterable of words; with ``['property type', 'single family']`` both
        "property type" and "single family" are each treated as a single
        expression. ``self.mwes`` itself is not modified.

        >>> from document import ArthurDocument
        >>> pdf_path = base_path + '/test/test.pdf'
        >>> with open(pdf_path, 'rb') as f:
        ...     document = ArthurDocument(f.read())
        >>> features = document.get_features()[730:816,:]
        >>> print(document.get_text(features)) # doctest:+ELLIPSIS
        VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

        Multi-word expression should be detected:
        >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
        >>> expressions = clusterer.extract_expressions(document, features)
        >>> print(expressions[2]['text'])
        CROWN JEWEL

        x position should equal x of "C" from "CROWN JEWEL" :
        >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
        True

        and width should equal to width of "CROWN JEWEL":
        >>> expr_width = expressions[2]['x1']-expressions[2]['x']
        >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
        >>> expr_width == ftr_width
        True

        Args:
            document(ArthurDocument): Document to extract expressions from.
            features(np.array): 2-D array of features to extract expressions
                from. Character offsets in the text of the features are used
                directly as row indexes into this array. If not given, all
                document features are used.

        Returns:
            list: One dict per expression with the keys 'text', 'x', 'x1',
                'y', 'y1' and 'page'. The coordinates and page are 0 when no
                feature rows cover the expression.

        Raises:
            ValueError: If a token produced by the tokenizer cannot be found
                in the lowercased text.
        """
        if features is None:
            features = document.get_features()
        text = document.get_text(features)
        lowered = text.lower()

        # Normalize the MWEs into lists of lowercase words. A new list is
        # built so the configuration in self.mwes is left untouched.
        mwes = []
        for mwe in self.mwes:
            if isinstance(mwe, str):
                mwes.append(word_tokenize(mwe.lower()))
            elif hasattr(mwe, '__iter__'):
                mwes.append([word.lower() for word in mwe])
            else:
                mwes.append(mwe)
        tokenizer = MWETokenizer(mwes, separator=' ')
        tokenized = tokenizer.tokenize(word_tokenize(lowered))

        x_col = ArthurDocument.get_feature_id('x')
        x1_col = ArthurDocument.get_feature_id('x1')
        y_col = ArthurDocument.get_feature_id('y')
        y1_col = ArthurDocument.get_feature_id('y1')
        page_col = ArthurDocument.get_feature_id('page')

        expressions = []
        pos = 0
        for token in tokenized:
            # A token such as "crown jewel" may be separated by several spaces
            # in the text, e.g. "crown  jewel", so the token is split and the
            # positions of its first and last characters are searched for.
            words = token.split()
            start_pos = lowered.index(words[0], pos)
            end_pos = start_pos + len(words[0])
            # Each following word is searched after the end of the previous
            # one, so expressions repeating a word get the correct span.
            for word in words[1:]:
                end_pos = lowered.index(word, end_pos) + len(word)
            pos = end_pos

            min_x = 0
            max_x = 0
            min_y = 0
            max_y = 0
            page = 0
            span = features[start_pos:end_pos, :]
            if len(span) > 0:
                min_x = np.amin(span[:, x_col])
                max_x = np.amax(span[:, x1_col])
                min_y = np.amin(span[:, y_col])
                max_y = np.amax(span[:, y1_col])
                page = features[start_pos, page_col]

            expressions.append({
                'text': text[start_pos:end_pos],
                'x': min_x,
                'x1': max_x,
                'y': min_y,
                'y1': max_y,
                'page': page
            })
        return expressions