Esempio n. 1
0
 def __init__(self, PDF_Info, p_supporting_data, p_attributes):
     """Store the shared inputs and create one helper object per concern.

     Args:
         PDF_Info: PDF information object; passed unchanged to every helper.
         p_supporting_data: supporting data; passed unchanged to every helper.
         p_attributes: attributes; passed unchanged to every helper.
     """
     self.PDF_Info, self.supporting_data, self.attributes = (
         PDF_Info, p_supporting_data, p_attributes)
     # Every helper below is constructed from the same three arguments.
     shared_args = (self.PDF_Info, self.supporting_data, self.attributes)
     self.keyword = Keywords(*shared_args)
     self.Expression_Currency = Expression_Currency(*shared_args)
     self.Expression_Dates = Expression_Dates(*shared_args)
     self.Expression_List = Expression_List(*shared_args)
     self.Expression_Numbers = Expression_Numbers(*shared_args)
     self.Expression_Qualify = Expression_Qualify(*shared_args)
     self.seperators = Separator()
Esempio n. 2
0
class Expression():
    """Resolve an expression given in the form of a keyword/value pair.

    Searches for terms that occur together, where the pattern of the
    expression matches terms in the PDF. The search itself is delegated to
    the per-value-type helper objects created in ``__init__``.
    """

    def __init__(self, PDF_Info, p_supporting_data, p_attributes):
        """Store the shared inputs and create one helper per value type.

        Args:
            PDF_Info: PDF information object; passed unchanged to every helper.
            p_supporting_data: supporting data; passed unchanged to every helper.
            p_attributes: attributes; passed unchanged to every helper.
        """
        self.PDF_Info = PDF_Info
        self.supporting_data = p_supporting_data
        self.attributes = p_attributes
        self.keyword = Keywords(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Currency = Expression_Currency(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Dates = Expression_Dates(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_List = Expression_List(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Numbers = Expression_Numbers(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Qualify = Expression_Qualify(self.PDF_Info, self.supporting_data, self.attributes)
        self.seperators = Separator()

    def findAll(self, p_expression, p_attribute_name='', p_list_name=''):
        """
        Return the matches for the pattern described by ``p_expression``.

        Args:
            p_expression (dictionary): must contain the keys 'Keyword' and
                'Value'. 'Value' selects the value type by prefix
                ('Currency', 'Date', 'Number' or 'List'). A 'Separator'
                entry may be present but is not read by this method.
            p_attribute_name (str): forwarded to the qualify step.
            p_list_name (str): list name used when the value type is 'List'.

        Returns:
            list: when a keyword is given, one
            ``[match[0], match[1], match[2][3], match[2][4]]`` entry per
            match; without a keyword, the helper result unchanged. An
            unrecognised value type yields an empty list.
        """
        l_Keyword = p_expression['Keyword']
        l_Value = p_expression['Value']
        expression_match = self.get_expression_match(
            l_Value,
            p_attribute_name=p_attribute_name,
            p_keyword=l_Keyword,
            p_list_name=p_list_name,
        )
        if l_Keyword:
            return self.extract_expression_match(expression_match)
        return expression_match

    def extract_expression_match(self, p_expression_match):
        """Flatten each match to ``[m[0], m[1], m[2][3], m[2][4]]``.

        Args:
            p_expression_match (list): matches whose third element is itself
                a sequence with at least five items.

        Returns:
            list: one four-item list per input match.
        """
        return [
            [match[0], match[1], match[2][3], match[2][4]]
            for match in p_expression_match
        ]

    def initialize_objects(self, PDF_Info, p_supporting_data, p_attributes):
        """
        Based on the value type, initialize the correct class object:
        date, currency, list, decimal. Returns the class object.

        Not implemented yet: currently does nothing and returns None.
        """
        pass

    def get_expression_match(self, p_value, p_attribute_name='', p_keyword='', p_list_name=''):
        """Dispatch to the helper selected by the prefix of ``p_value``.

        Args:
            p_value (str): value type; matched by prefix against 'Currency',
                'Date', 'Number' and 'List', in that order.
            p_attribute_name (str): forwarded to the qualify step.
            p_keyword (str): keyword to look up first; when empty, an empty
                keyword list is handed to the helper.
            p_list_name (str): list name, used only for the 'List' type.

        Returns:
            list: the selected helper's result, or an empty list when
            ``p_value`` matches none of the known prefixes.
        """
        l_keyword_list = self.keyword_pdf_info(p_keyword) if p_keyword else []
        if p_value.startswith('Currency'):
            return self.get_expression_currency(l_keyword_list, p_attribute_name)
        if p_value.startswith('Date'):
            return self.get_expression_dates(l_keyword_list, p_attribute_name)
        if p_value.startswith('Number'):
            return self.get_expression_numbers(l_keyword_list, p_attribute_name)
        if p_value.startswith('List'):
            return self.get_expression_lists(l_keyword_list, p_list_name, p_attribute_name)
        # Unknown value type: return an empty result instead of None so that
        # findAll can always iterate over what it gets back.
        return []

    def keyword_pdf_info(self, p_keyword):
        """Return ``[p_keyword, self.keyword.get(p_keyword)]``."""
        return [p_keyword, self.keyword.get(p_keyword)]

    def get_expression_currency(self, p_keyword_list, p_attribute_name):
        """Get currency matches for the keyword list and qualify them."""
        currency_match = self.Expression_Currency.get(p_keyword_list)
        return self.Expression_Qualify.qualify(currency_match, p_attribute_name)

    def get_expression_dates(self, p_keyword_list, p_attribute_name):
        """Get date matches for the keyword list and qualify them."""
        date_match = self.Expression_Dates.get(p_keyword_list)
        return self.Expression_Qualify.qualify(date_match, p_attribute_name)

    def get_expression_numbers(self, p_keyword_list, p_attribute_name):
        """Get number matches for the keyword list and qualify them."""
        numbers_match = self.Expression_Numbers.get(p_keyword_list)
        return self.Expression_Qualify.qualify(numbers_match, p_attribute_name)

    def get_expression_lists(self, p_keyword_list, p_list_name, p_attribute_name=''):
        """Get list matches; qualify them only when a keyword list is given.

        Without a keyword list the raw result of ``Expression_List.get`` is
        returned and the qualify step is skipped.
        """
        list_match = self.Expression_List.get(p_list_name, p_keyword_list)
        if p_keyword_list:
            return self.Expression_Qualify.qualify(list_match, p_attribute_name)
        return list_match
Esempio n. 3
0
 def setUp(self):
     """Create the Keywords object under test.

     Uses z_test_data.PDF_Info together with empty supporting data and
     empty attributes.
     """
     pdf_info, supporting_data, attributes = z_test_data.PDF_Info, [], []
     self.PDF_Info = pdf_info
     self.supporting_data = supporting_data
     self.attributes = attributes
     self.x = Keywords(pdf_info, supporting_data, attributes)
Esempio n. 4
0
class testKeywords(unittest.TestCase):
    """Smoke tests for Keywords.

    Each test calls one Keywords method and prints what it returns. Nothing
    is asserted, so the output has to be checked by eye.
    """

    def setUp(self):
        """Create the Keywords object under test.

        Uses z_test_data.PDF_Info together with empty supporting data and
        empty attributes.
        """
        pdf_info, supporting_data, attributes = z_test_data.PDF_Info, [], []
        self.PDF_Info = pdf_info
        self.supporting_data = supporting_data
        self.attributes = attributes
        self.x = Keywords(pdf_info, supporting_data, attributes)

    def test_get(self):
        """Print the result of Keywords.get for two phrases."""
        divider = '------------------out_put-----------------'
        # First phrase: print every returned item on its own line.
        for found in self.x.get('February 20, 2020'):
            print(found)
        print(divider)
        # Second phrase: print the returned object as a whole.
        print(self.x.get('February 29'))
        print(divider)

    def test_synonyms_keyword(self):
        """Placeholder; no test implemented yet."""
        pass

    def test_search_keyword_pdf_info(self):
        """Print the search result for one phrase and run a second search."""
        for found in self.x.search_keyword_pdf_info('February 20, 2020'):
            print(found)

        # The second result is not inspected; only the call itself is made.
        self.x.search_keyword_pdf_info('February 20, 2022')

    def test_qualify(self):
        """Print the result of Keywords.qualify for one key/occurrence pair."""
        key = ('February 20, 2020', ('february', '20', ',', '2020'))
        first_term = (5, 1, 4, 1, 1, 4, 1134, 973, 286, 67, 95, 'February', 0)
        day_term = [
            [5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, '20', 0],
            ('right', 22),
        ]
        comma_term = [
            [5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, ',', 1],
            ('down', 21),
        ]
        year_term = [
            [5, 1, 4, 1, 2, 6, 1609, 1056, 156, 53, 95, '2020', 0],
            ('right', 29),
        ]
        pdf_keyword = [first_term, day_term, comma_term, year_term]
        print(self.x.qualify(key, pdf_keyword))

    def test_strip_details(self):
        """Print the result of Keywords.strip_details for two occurrences."""
        first_occurrence = [
            (5, 1, 4, 1, 1, 4, 1134, 973, 286, 67, 95, 'February', 0),
            [[5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, '20', 0],
             ('right', 22)],
            [[5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, ',', 1],
             ('down', 21)],
            [[5, 1, 4, 1, 2, 6, 1609, 1056, 156, 53, 95, '2020', 0],
             ('right', 29)],
        ]
        second_occurrence = [
            (5, 1, 4, 1, 2, 4, 1179, 1056, 285, 67, 95, 'February', 0),
            [[5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, '20', 0],
             ('right', 23)],
            [[5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, ',', 1],
             ('right', 23)],
            [[5, 1, 4, 1, 2, 6, 1609, 1056, 156, 53, 95, '2020', 0],
             ('right', 29)],
        ]
        print(self.x.strip_details([first_occurrence, second_occurrence]))
Esempio n. 5
0
 def __init__(self, PDF_Info, p_supporting_data, p_attributes):
     """Store the shared inputs and create the Keywords helper from them.

     Args:
         PDF_Info: PDF information object, kept as ``self.pdf_info``.
         p_supporting_data: supporting data, kept as ``self.supporting_data``.
         p_attributes: attributes, kept as ``self.attributes``.
     """
     self.pdf_info, self.supporting_data, self.attributes = (
         PDF_Info, p_supporting_data, p_attributes)
     # The Keywords helper receives the same three objects that were stored.
     self.KW = Keywords(PDF_Info, p_supporting_data, p_attributes)
Esempio n. 6
0
class Expression_List():
    """Find the terms of a named list in the PDF, optionally paired with a keyword."""

    def __init__(self, PDF_Info, p_supporting_data, p_attributes):
        """Store the shared inputs and create the Keywords helper from them.

        Args:
            PDF_Info: PDF information object, kept as ``self.pdf_info``.
            p_supporting_data: supporting data; must offer ``get(name)`` and
                ``get_term(item)`` (both are called by this class).
            p_attributes: attributes, kept as ``self.attributes``.
        """
        self.pdf_info = PDF_Info
        self.supporting_data = p_supporting_data
        self.attributes = p_attributes
        self.KW = Keywords(self.pdf_info, self.supporting_data, self.attributes)

    def get(self, p_list_name, p_keyword=None):
        """Return the occurrences of the list's terms, alone or keyword-aligned.

        Args:
            p_list_name: name under which the list is stored in the
                supporting data.
            p_keyword: optional ``[keyword_str, keyword_occurrences]`` pair.
                ``None`` or an empty value means "no keyword".

        Returns:
            list: with a keyword, one
            ``[keyword_str, term, [alignment, direction, distance,
            term_occurrence, keyword_occurrence]]`` entry per aligned pair;
            without a keyword, one ``[p_list_name, term, term_occurrence]``
            entry per occurrence found.
        """
        l_list_data = self.get_list_data_pdf_info(p_list_name)
        l_list_data = self.strip_details(l_list_data)
        l_list_data = self.qualify(l_list_data)
        if p_keyword:
            l_list_expression_dtl = self.list_expression(p_keyword, l_list_data)
            return self.extract_list_expression(l_list_expression_dtl)
        # No keyword: tag every occurrence with the list name it came from.
        return [[p_list_name, *l_data] for l_data in l_list_data]

    def extract_list_expression(self, p_exps_dtls):
        """Regroup each seven-item expression into ``[keyword, term, details]``.

        ``details`` holds items 2 to 6 of the input entry, in order
        (alignment, direction, distance, term occurrence, keyword occurrence).
        """
        return [
            [
                expression[0],
                expression[1],
                [expression[2], expression[3], expression[4], expression[5], expression[6]],
            ]
            for expression in p_exps_dtls
        ]

    def qualify(self, p_dates):
        """Return the input unchanged; no qualification is applied."""
        return p_dates

    def list_expression(self, p_keywords, p_list_data):
        """Pair every keyword occurrence with every list-term occurrence it aligns with.

        Args:
            p_keywords: ``[keyword_str, keyword_occurrences]``.
            p_list_data: list of ``[term_str, term_occurrence]`` entries.

        Returns:
            list: one ``[keyword_str, term_str, alignment_list, direction,
            distance, term_occurrence, keyword_occurrence]`` entry for each
            pair whose alignment list is non-empty after ``align``.
        """
        l_keyword_str = p_keywords[0]
        l_occurrences = p_keywords[1]
        if not l_occurrences:
            # Nothing to align against (also covers a None lookup result).
            return []
        out_put = []
        NN = Network_Navigtor()
        for p_keyword in l_occurrences:
            for l_list_data in p_list_data:
                l_list_data_str = l_list_data[0]
                NN.align(self.pdf_info, p_keyword, l_list_data[1])
                # The alignment result is read back from the navigator.
                if NN.alignment_list:
                    out_put.append([l_keyword_str, l_list_data_str, NN.alignment_list, NN.direction, NN.distance, l_list_data[1], p_keyword])
        return out_put

    def strip_details(self, p_dates):
        """Convert ``{term: [occurrence, ...]}`` into ``[[term, occurrence], ...]``.

        Terms whose value is ``None`` or empty contribute no entries.
        """
        out_put = []
        for key in p_dates:
            # NOTE(review): assumes a term that is absent from the PDF may be
            # stored as None by the keyword lookup - confirm against Keywords.get.
            for l_date in p_dates[key] or ():
                out_put.append([key, l_date])
        return out_put

    def get_list_data_pdf_info(self, p_name):
        """Look up the named list and search each of its terms in the PDF.

        Returns:
            dict: ``{term: Keywords.get(term)}`` for every term of the list;
            empty when the supporting data has no (or an empty) list for
            ``p_name``.
        """
        # Get the list from the supporting data.
        l_list_data = self.supporting_data.get(p_name)
        # Search each term of the list in the keywords; a match will be found
        # only if the term exists in the PDF.
        list_terms = []
        if l_list_data:
            list_terms = [self.supporting_data.get_term(l_data) for l_data in l_list_data]
        return self.search_list_data_pdf_info(list_terms)

    def search_list_data_pdf_info(self, p_list_data):
        """Return ``{term: self.KW.get(term)}`` for every term in ``p_list_data``."""
        # List data is added to the keywords before the keywords are searched
        # in the PDF term network, so it has to be fetched from the keywords.
        return {l_list_data: self.KW.get(l_list_data) for l_list_data in p_list_data}
Esempio n. 7
0
class Expression_Dates():
    """Find the dates of the PDF that align with a given keyword."""

    def __init__(self, PDF_Info, p_supporting_data, p_attributes):
        """Store the shared inputs and create the Keywords helper from them.

        Args:
            PDF_Info: PDF information object, kept as ``self.pdf_info``; its
                ``dates`` mapping is read by ``search_dates_pdf_info``.
            p_supporting_data: supporting data, kept as ``self.supporting_data``.
            p_attributes: attributes, kept as ``self.attributes``.
        """
        self.pdf_info = PDF_Info
        self.supporting_data = p_supporting_data
        self.attributes = p_attributes
        self.KW = Keywords(self.pdf_info, self.supporting_data,
                           self.attributes)

    def get(self, p_keyword):
        """Return the dates aligned with the keyword.

        Args:
            p_keyword: ``[keyword_str, keyword_occurrences]`` pair; an empty
                value yields an empty result.

        Returns:
            list: one ``[keyword_str, date_str, [alignment, direction,
            distance, date_occurrence, keyword_occurrence]]`` entry per
            aligned pair.
        """
        l_dates = self.search_dates_pdf_info()
        l_dates = self.strip_details(l_dates)
        l_dates = self.qualify(l_dates)
        l_date_expression_dtl = self.date_expression(p_keyword, l_dates)
        return self.extract_date_expression(l_date_expression_dtl)

    def extract_date_expression(self, p_exps_dtls):
        """Regroup each seven-item expression into ``[keyword, date, details]``.

        ``details`` holds items 2 to 6 of the input entry, in order
        (alignment, direction, distance, date occurrence, keyword occurrence).
        """
        return [
            [
                expression[0],
                expression[1],
                [
                    expression[2], expression[3], expression[4],
                    expression[5], expression[6],
                ],
            ]
            for expression in p_exps_dtls
        ]

    def qualify(self, p_dates):
        """Return the input unchanged; no qualification is applied."""
        return p_dates

    def search_date_pdf_info(self, p_date):
        """Return ``{date: self.KW.get(date)}`` for every date in ``p_date``."""
        # Each date is added to the keyword list before the keyword
        # information is extracted, so the date details are fetched through
        # the Keywords helper.
        return {l_date: self.KW.get(l_date) for l_date in p_date}

    def search_dates_pdf_info(self):
        """Collect the lookup results for every entry of ``pdf_info.dates``.

        Returns:
            dict: the merged results of ``search_date_pdf_info`` over all
            values of ``self.pdf_info.dates``.
        """
        out_put = {}
        for key in self.pdf_info.dates:
            pdf_date = self.search_date_pdf_info(self.pdf_info.dates[key])
            if pdf_date:
                out_put.update(pdf_date)
        return out_put

    def date_expression(self, p_keywords, p_dates):
        """Pair every keyword occurrence with every date occurrence it aligns with.

        Args:
            p_keywords: ``[keyword_str, keyword_occurrences]``; may be empty.
            p_dates: list of ``[date_str, date_occurrence]`` entries.

        Returns:
            list: one ``[keyword_str, date_str, alignment_list, direction,
            distance, date_occurrence, keyword_occurrence]`` entry for each
            pair whose alignment list is non-empty after ``align``.
        """
        if not p_keywords or not p_keywords[1]:
            # No keyword, or no occurrences of it (also covers a None lookup
            # result): nothing can align, so skip creating the navigator.
            return []
        out_put = []
        NN = Network_Navigtor()
        l_keyword_str = p_keywords[0]
        # Traverse all occurrences of the keyword in the PDF.
        for p_keyword in p_keywords[1]:
            # Traverse all occurrences of dates in the PDF.
            for l_date in p_dates:
                l_date_str = l_date[0]
                # Check whether this keyword occurrence aligns with the date;
                # the result is read back from the navigator.
                NN.align(self.pdf_info, p_keyword, l_date[1])
                if NN.alignment_list:
                    out_put.append([
                        l_keyword_str, l_date_str, NN.alignment_list,
                        NN.direction, NN.distance, l_date[1], p_keyword
                    ])
        return out_put

    def get_full_date_from_network(self):
        """Placeholder; not implemented and returns None."""
        pass

    def strip_details(self, p_dates):
        """Convert ``{date_str: [occurrence, ...]}`` into a flat list.

        Output format:
        ``[["date_str", occurrence], ["date_str", occurrence], ...]``.
        Dates whose value is ``None`` or empty contribute no entries.
        """
        out_put = []
        for key in p_dates:
            # NOTE(review): assumes the keyword lookup may store None for a
            # date it cannot find - confirm against Keywords.get.
            for l_date in p_dates[key] or ():
                out_put.append([key, l_date])
        return out_put