def __init__(self, PDF_Info, p_supporting_data, p_attributes):
    """Store the shared inputs and build one helper object per value type.

    Args:
        PDF_Info: PDF information object, handed unchanged to every helper.
        p_supporting_data: supporting data, handed unchanged to every helper.
        p_attributes: attributes, handed unchanged to every helper.
    """
    self.PDF_Info = PDF_Info
    self.supporting_data = p_supporting_data
    self.attributes = p_attributes

    # Every collaborator below is constructed from the same three inputs.
    shared_args = (self.PDF_Info, self.supporting_data, self.attributes)
    self.keyword = Keywords(*shared_args)
    self.Expression_Currency = Expression_Currency(*shared_args)
    self.Expression_Dates = Expression_Dates(*shared_args)
    self.Expression_List = Expression_List(*shared_args)
    self.Expression_Numbers = Expression_Numbers(*shared_args)
    self.Expression_Qualify = Expression_Qualify(*shared_args)
    self.seperators = Separator()
class Expression():
    """Resolve an expression given in the form of a key-value pair.

    Searches for terms that occur together in the PDF where the pattern of the
    expression (keyword plus value type) matches terms found in the PDF.
    """

    def __init__(self, PDF_Info, p_supporting_data, p_attributes):
        """Store the shared inputs and build one helper object per value type.

        Args:
            PDF_Info: PDF information object, passed on to every helper.
            p_supporting_data: supporting data, passed on to every helper.
            p_attributes: attributes, passed on to every helper.
        """
        self.PDF_Info = PDF_Info
        self.supporting_data = p_supporting_data
        self.attributes = p_attributes
        self.keyword = Keywords(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Currency = Expression_Currency(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Dates = Expression_Dates(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_List = Expression_List(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Numbers = Expression_Numbers(self.PDF_Info, self.supporting_data, self.attributes)
        self.Expression_Qualify = Expression_Qualify(self.PDF_Info, self.supporting_data, self.attributes)
        self.seperators = Separator()

    def findAll(self, p_expression, p_attribute_name='', p_list_name=''):
        """Return the key-value pairs that match the pattern of the expression.

        Args:
            p_expression (dict): must contain the keys 'Keyword', 'Separator'
                and 'Value'. 'Value' is a string naming the value type; it is
                matched by prefix against 'Currency', 'Date', 'Number' and
                'List'.
            p_attribute_name (str): forwarded to the qualifier.
            p_list_name (str): list name used when the value type is 'List'.

        Returns:
            list: when a keyword is given, one
            ``[keyword, value, value_details, key_details]`` entry per match;
            without a keyword, the raw result of ``get_expression_match``.
            Empty when the value type is not supported.
        """
        l_Keyword = p_expression['Keyword']
        # Not used for matching, but still read so that a missing 'Separator'
        # key keeps raising KeyError exactly as before.
        l_Separator = p_expression['Separator']
        l_Value = p_expression['Value']

        expression_match = self.get_expression_match(
            l_Value,
            p_attribute_name=p_attribute_name,
            p_keyword=l_Keyword,
            p_list_name=p_list_name,
        )
        if l_Keyword:
            return self.extract_expression_match(expression_match)
        return expression_match

    def extract_expression_match(self, p_expression_match):
        """Flatten matches to ``[keyword, value, value_details, key_details]``.

        Each match is indexed as ``match[0]``, ``match[1]``, ``match[2][3]``
        and ``match[2][4]``.
        NOTE(review): presumably ``match[2]`` is the
        ``[alignment, direction, distance, value_details, key_details]`` list
        built by the per-type helpers; confirm it survives the qualifier.
        """
        return [
            [match[0], match[1], match[2][3], match[2][4]]
            for match in p_expression_match
        ]

    def initialize_objects(self, PDF_Info, p_supporting_data, p_attributes):
        """Placeholder: pick the helper class for a value type.

        Intended to initialise the correct class object (date, currency, list,
        decimal) and return it. Not implemented yet; returns None.
        """
        pass

    def get_expression_match(self, p_value, p_attribute_name='', p_keyword='', p_list_name=''):
        """Dispatch to the matcher for the value type named by ``p_value``.

        Args:
            p_value (str): value type; matched by prefix against 'Currency',
                'Date', 'Number' and 'List'.
            p_attribute_name (str): forwarded to the qualifier.
            p_keyword (str): keyword to pair with the value; when empty, an
                empty keyword list is handed to the matcher.
            p_list_name (str): list name, used only for the 'List' type.

        Returns:
            list: result of the selected matcher, or an empty list when
            ``p_value`` names no supported type.
        """
        l_keyword_list = []
        if p_keyword:
            l_keyword_list = self.keyword_pdf_info(p_keyword)

        if p_value.startswith('Currency'):
            return self.get_expression_currency(l_keyword_list, p_attribute_name)
        if p_value.startswith('Date'):
            return self.get_expression_dates(l_keyword_list, p_attribute_name)
        if p_value.startswith('Number'):
            return self.get_expression_numbers(l_keyword_list, p_attribute_name)
        if p_value.startswith('List'):
            # The list name comes from the caller (p_list_name); it is not
            # parsed out of p_value.
            return self.get_expression_lists(l_keyword_list, p_list_name, p_attribute_name)

        # Unsupported value type: return "no matches" instead of None so that
        # findAll can safely iterate over the result.
        return []

    def keyword_pdf_info(self, p_keyword):
        """Return ``[keyword, occurrences]`` using ``self.keyword.get``."""
        return [p_keyword, self.keyword.get(p_keyword)]

    def get_expression_currency(self, p_keyword_list, p_attribute_name):
        """Match currency values for the keyword list, then qualify them."""
        currency_match = self.Expression_Currency.get(p_keyword_list)
        return self.Expression_Qualify.qualify(currency_match, p_attribute_name)

    def get_expression_dates(self, p_keyword_list, p_attribute_name):
        """Match date values for the keyword list, then qualify them."""
        date_match = self.Expression_Dates.get(p_keyword_list)
        return self.Expression_Qualify.qualify(date_match, p_attribute_name)

    def get_expression_numbers(self, p_keyword_list, p_attribute_name):
        """Match number values for the keyword list, then qualify them."""
        numbers_match = self.Expression_Numbers.get(p_keyword_list)
        return self.Expression_Qualify.qualify(numbers_match, p_attribute_name)

    def get_expression_lists(self, p_keyword_list, p_list_name, p_attribute_name=''):
        """Match terms of the named list; qualify only when a keyword is given.

        Without a keyword list the raw list matches are returned unqualified.
        """
        list_match = self.Expression_List.get(p_list_name, p_keyword_list)
        if p_keyword_list:
            return self.Expression_Qualify.qualify(list_match, p_attribute_name)
        return list_match
def setUp(self):
    """Build the Keywords instance under test from the shared sample PDF data."""
    self.PDF_Info = z_test_data.PDF_Info
    # Two separate empty lists: no supporting data and no attributes.
    self.supporting_data, self.attributes = [], []
    self.x = Keywords(self.PDF_Info, self.supporting_data, self.attributes)
class testKeywords(unittest.TestCase):
    """Smoke tests for Keywords: each test prints results, nothing is asserted."""

    def setUp(self):
        """Build the Keywords instance under test from the shared sample PDF data."""
        self.PDF_Info = z_test_data.PDF_Info
        self.supporting_data, self.attributes = [], []
        self.x = Keywords(self.PDF_Info, self.supporting_data, self.attributes)

    def test_get(self):
        """Print what ``get`` returns for a full date and for a partial date."""
        full_date_hits = self.x.get('February 20, 2020')
        for hit in full_date_hits:
            print(hit)
        print('------------------out_put-----------------')

        partial_date_hits = self.x.get('February 29')
        print(partial_date_hits)
        print('------------------out_put-----------------')

    def test_synonyms_keyword(self):
        """Placeholder: not implemented."""
        pass

    def test_search_keyword_pdf_info(self):
        """Print the search result for one date, then search for a second date."""
        first_hits = self.x.search_keyword_pdf_info('February 20, 2020')
        for hit in first_hits:
            print(hit)

        # The second lookup only has to run; its result is not printed.
        self.x.search_keyword_pdf_info('February 20, 2022')

    def test_qualify(self):
        """Print the result of ``qualify`` for one keyword and one term chain."""
        key = ('February 20, 2020', ('february', '20', ',', '2020'))
        pdf_keyword = [
            (5, 1, 4, 1, 1, 4, 1134, 973, 286, 67, 95, 'February', 0),
            [[5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, '20', 0], ('right', 22)],
            [[5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, ',', 1], ('down', 21)],
            [[5, 1, 4, 1, 2, 6, 1609, 1056, 156, 53, 95, '2020', 0], ('right', 29)],
        ]
        print(self.x.qualify(key, pdf_keyword))

    def test_strip_details(self):
        """Print the result of ``strip_details`` for two term chains."""
        first_chain = [
            (5, 1, 4, 1, 1, 4, 1134, 973, 286, 67, 95, 'February', 0),
            [[5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, '20', 0], ('right', 22)],
            [[5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, ',', 1], ('down', 21)],
            [[5, 1, 4, 1, 2, 6, 1609, 1056, 156, 53, 95, '2020', 0], ('right', 29)],
        ]
        second_chain = [
            (5, 1, 4, 1, 2, 4, 1179, 1056, 285, 67, 95, 'February', 0),
            [[5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, '20', 0], ('right', 23)],
            [[5, 1, 4, 1, 2, 5, 1487, 1056, 93, 62, 94, ',', 1], ('right', 23)],
            [[5, 1, 4, 1, 2, 6, 1609, 1056, 156, 53, 95, '2020', 0], ('right', 29)],
        ]
        print(self.x.strip_details([first_chain, second_chain]))
def __init__(self, PDF_Info, p_supporting_data, p_attributes):
    """Keep the shared inputs and create the keyword lookup helper.

    Args:
        PDF_Info: PDF information object, stored as ``self.pdf_info``.
        p_supporting_data: supporting data, stored as ``self.supporting_data``.
        p_attributes: attributes, stored as ``self.attributes``.
    """
    self.pdf_info, self.supporting_data, self.attributes = (
        PDF_Info,
        p_supporting_data,
        p_attributes,
    )
    # The keyword helper is built from the same three inputs.
    self.KW = Keywords(self.pdf_info, self.supporting_data, self.attributes)
class Expression_List():
    """Find terms of a named list in the PDF, optionally paired with a keyword."""

    def __init__(self, PDF_Info, p_supporting_data, p_attributes):
        """Keep the shared inputs and create the keyword lookup helper."""
        self.pdf_info = PDF_Info
        self.supporting_data = p_supporting_data
        self.attributes = p_attributes
        self.KW = Keywords(self.pdf_info, self.supporting_data, self.attributes)

    def get(self, p_list_name, p_keyword=None):
        """Return list-term matches, aligned with a keyword when one is given.

        Args:
            p_list_name: name of the list to look up in the supporting data.
            p_keyword: optional ``[keyword_str, occurrences]`` pair. When empty
                or None, no alignment is attempted.

        Returns:
            list: with a keyword, entries of the form
            ``[keyword, term, [alignment, direction, distance, value_details,
            key_details]]``; without one, ``[list_name, term, term_details]``
            for every occurrence found.
        """
        l_list_data = self.get_list_data_pdf_info(p_list_name)
        l_list_data = self.strip_details(l_list_data)
        l_list_data = self.qualify(l_list_data)

        if p_keyword:
            l_list_expression_dtl = self.list_expression(p_keyword, l_list_data)
            return self.extract_list_expression(l_list_expression_dtl)

        # No keyword: tag every occurrence with the list it came from.
        for entry in l_list_data:
            entry.insert(0, p_list_name)
        return l_list_data

    def extract_list_expression(self, p_exps_dtls):
        """Regroup each flat 7-item expression as ``[keyword, term, [details]]``.

        The five trailing items (alignment, direction, distance, value details,
        key details) are nested into one list.
        """
        out_put = []
        for expression in p_exps_dtls:
            l_KeyWord, l_list_data = expression[0], expression[1]
            l_details = [expression[2], expression[3], expression[4], expression[5], expression[6]]
            out_put.append([l_KeyWord, l_list_data, l_details])
        return out_put

    def qualify(self, p_dates):
        """Pass-through hook: returns its argument unchanged."""
        return p_dates

    def list_expression(self, p_keywords, p_list_data):
        """Pair every keyword occurrence with every list-term occurrence that aligns.

        Args:
            p_keywords: ``[keyword_str, occurrences]``.
            p_list_data: ``[term_str, term_details]`` entries.

        Returns:
            list: one ``[keyword_str, term_str, alignment_list, direction,
            distance, term_details, keyword_occurrence]`` entry per aligned pair.
        """
        out_put = []
        NN = Network_Navigtor()
        l_keyword_str = p_keywords[0]
        for p_keyword in p_keywords[1]:
            for l_list_data in p_list_data:
                l_list_data_str = l_list_data[0]
                # NOTE(review): align() presumably refreshes alignment_list,
                # direction and distance on NN for this pair; confirm.
                NN.align(self.pdf_info, p_keyword, l_list_data[1])
                if NN.alignment_list:
                    out_put.append([
                        l_keyword_str,
                        l_list_data_str,
                        NN.alignment_list,
                        NN.direction,
                        NN.distance,
                        l_list_data[1],
                        p_keyword,
                    ])
        return out_put

    def strip_details(self, p_dates):
        """Flatten ``{term: [occurrence, ...]}`` into ``[[term, occurrence], ...]``."""
        return [[key, l_date] for key in p_dates for l_date in p_dates[key]]

    def get_list_data_pdf_info(self, p_name):
        """Look up the named list and search each of its terms in the PDF.

        Returns:
            dict: term -> occurrences as returned by the keyword helper; empty
            when the supporting data has no (or an empty) list under ``p_name``.
        """
        # Get the list from the supporting data.
        l_list_data = self.supporting_data.get(p_name)

        # Search each term of the list in the keywords; a match is found only
        # if the term exists in the PDF.
        list_terms = []
        if l_list_data:
            list_terms = [self.supporting_data.get_term(l_data) for l_data in l_list_data]

        # Always go through the search step so the result is a dict even when
        # the list is unknown.
        return self.search_list_data_pdf_info(list_terms)

    def search_list_data_pdf_info(self, p_list_data):
        """Return ``{term: self.KW.get(term)}`` for every term given."""
        out_put = {}
        # List data is added to the keywords before keywords are searched in
        # the PDF term network, so it is fetched back through the keyword helper.
        for l_list_data in p_list_data:
            out_put[l_list_data] = self.KW.get(l_list_data)
        return out_put
class Expression_Dates():
    """Find dates in the PDF and pair them with occurrences of a keyword."""

    def __init__(self, PDF_Info, p_supporting_data, p_attributes):
        """Keep the shared inputs and create the keyword lookup helper."""
        self.pdf_info = PDF_Info
        self.supporting_data = p_supporting_data
        self.attributes = p_attributes
        self.KW = Keywords(self.pdf_info, self.supporting_data, self.attributes)

    def get(self, p_keyword):
        """Return ``[keyword, date, [details]]`` entries for dates aligned with the keyword.

        Args:
            p_keyword: ``[keyword_str, occurrences]`` pair; when empty no
                pairing is attempted and the result is empty.
        """
        candidates = self.strip_details(self.search_dates_pdf_info())
        qualified = self.qualify(candidates)
        aligned = self.date_expression(p_keyword, qualified)
        return self.extract_date_expression(aligned)

    def extract_date_expression(self, p_exps_dtls):
        """Regroup each flat 7-item expression as ``[keyword, date, [details]]``.

        Items 2..6 (alignment, direction, distance, value details, key details)
        are nested into one list.
        """
        return [
            [exp[0], exp[1], [exp[2], exp[3], exp[4], exp[5], exp[6]]]
            for exp in p_exps_dtls
        ]

    def qualify(self, p_dates):
        """Pass-through hook: returns its argument unchanged."""
        return p_dates

    def search_date_pdf_info(self, p_date):
        """Return ``{date: self.KW.get(date)}`` for every date given.

        Each date is added to the keyword list before keyword information is
        extracted, so its details are fetched through the keyword helper.
        """
        return {l_date: self.KW.get(l_date) for l_date in p_date}

    def search_dates_pdf_info(self):
        """Merge the per-key lookups over ``self.pdf_info.dates`` into one dict."""
        merged = {}
        for key in self.pdf_info.dates:
            found = self.search_date_pdf_info(self.pdf_info.dates[key])
            if not found:
                continue
            merged.update(found)
        return merged

    def date_expression(self, p_keywords, p_dates):
        """Pair every keyword occurrence with every date occurrence that aligns.

        Args:
            p_keywords: ``[keyword_str, occurrences]``; falsy means no pairing.
            p_dates: ``[date_str, date_details]`` entries.

        Returns:
            list: one ``[keyword_str, date_str, alignment_list, direction,
            distance, date_details, keyword_occurrence]`` entry per aligned pair.
        """
        pairs = []
        NN = Network_Navigtor()
        if not p_keywords:
            return pairs

        l_keyword_str = p_keywords[0]
        # Use the network navigator to check whether the keyword aligns with
        # the date. Per the original author: two terms count as aligned when at
        # least one term of the keyword aligns with one term of the date.
        for keyword_occurrence in p_keywords[1]:      # all occurrences of the keyword
            for date_entry in p_dates:                # all occurrences of dates
                date_str, date_details = date_entry[0], date_entry[1]
                NN.align(self.pdf_info, keyword_occurrence, date_details)
                if not NN.alignment_list:
                    continue
                pairs.append([
                    l_keyword_str,
                    date_str,
                    NN.alignment_list,
                    NN.direction,
                    NN.distance,
                    date_entry[1],
                    keyword_occurrence,
                ])
        return pairs

    def get_full_date_from_network(self):
        """Placeholder: not implemented."""
        pass

    def strip_details(self, p_dates):
        """Convert the dictionary into a list.

        Output format:
        ``[["date_str", [[term1], [term2]]], ["date_str", [[term1], [term2]]]]``
        """
        return [[key, l_date] for key in p_dates for l_date in p_dates[key]]