Exemple #1
0
def get_names(contents):
    """Extract person names (and employee IDs when captured) from the
    declaration text using a list of candidate regex patterns.

    Args:
        contents: Generalized text content dict; only the 'declaration'
            key is read (names only occur in the declaration).

    Returns:
        List of dicts with keys 'name' and 'employee_id' ('employee_id'
        is None when the pattern captured no ID). Currently returns []
        because every candidate pattern is commented out pending
        individual testing.
    """
    text = contents['declaration']  # Names only occur in the declaration
    names = []
    # These regexes have been successful sometimes but not always;
    # uncomment and test individually.
    regex = [
        #':\s(.*)\s([\u06F0-\u06F90-9]{10})',
        #': (.*) \,',
        #'[\u06F0-\u06F90-9]{10} (.*) :',
        #namehelper.SALUTATIONS[0] + ' (.*)' + namehelper.NATIONAL_BANK_NUMBER + ' ([\u06F0-\u06F90-9]{10})',
        #'([\u06F0-\u06F90-9]{10}) ' + namehelper.NATIONAL_BANK_NUMBER + ' (.*) ' + namehelper.SALUTATIONS[0],
        #'([\u06F0-\u06F90-9]{10}) به شماره ملی'
    ]  # NOTE: this regex set is currently inaccurate; it serves as a test extraction
    for pattern in regex:  # Loop through every regex pattern
        for match in re.findall(pattern, text):
            # re.findall yields plain strings when the pattern has at most
            # one capture group and tuples when it has several. The previous
            # code assumed tuples only, so a single-group match would have
            # been sliced to its first character by match[0].
            if isinstance(match, str):
                names.append({
                    'name': translator.convert(match),
                    'employee_id': None
                })
            else:
                # Tuple match: first group is the name; a second group, if
                # the regex supported grabbing an ID, is the employee ID.
                names.append({
                    'name': translator.convert(match[0]),
                    'employee_id': translator.convert(match[1])
                })
    return names
Exemple #2
0
def parse_dates(html, contents):
    """Extract the newspaper, registration and best-guess meeting dates.

    Args:
        html: Raw HTML of the document page.
        contents: Generalized text content dict; reads 'declaration'.

    Returns:
        dict with keys 'newspaper_date', 'registration_date',
        'meeting_date' (None when no date appears in the body) and
        'random_dates' (every date-shaped string found in the body).
    """
    NEWSPAPER_DATE_ID = 'cphMain_lblNewsPaperDate'
    REGISTRATION_DATE_ID = 'cphMain_lblNewsDate'

    soup = BeautifulSoup(html, 'html.parser')
    newspaper_date = translator.convert(
        soup.find(id=NEWSPAPER_DATE_ID).contents[0])
    registration_date = translator.convert(
        soup.find(id=REGISTRATION_DATE_ID).contents[0])

    # Dates written with Persian or ASCII digits, in either the
    # yyyy/mm/dd or dd/mm/yyyy shape.
    date_patterns = (
        '[\u06F0-\u06F90-9]{4}/[\u06F0-\u06F90-9]{2}/[\u06F0-\u06F90-9]{2}',
        '[\u06F0-\u06F90-9]{2}/[\u06F0-\u06F90-9]{2}/[\u06F0-\u06F90-9]{4}',
    )
    declaration = contents['declaration']
    random_dates = []
    for pattern in date_patterns:
        random_dates.extend(re.findall(pattern, declaration))

    # Treat the first date found in the body as the meeting date, if any.
    meeting_date = translator.convert(random_dates[0]) if random_dates else None

    return {
        'newspaper_date': newspaper_date,
        'registration_date': registration_date,
        'meeting_date': meeting_date,
        'random_dates': random_dates
    }
def parse_name_sandwhich(contents):
    '''This function operates on the principle that a name
    always occurs between two words/phrases and therefore
    if we compile a list of every word that precedes a name
    and every word that follows it, we can therefore ensure we
    retrieve every name.

    Args:
        contents - Generalized text content object as in parser.py

    Returns:
        list of dictionaries with keys `name` and `employee_id`
    '''
    to_return = []
    text = contents['declaration']  # Parse out the main body
    # Loop-invariant values hoisted out of the nested loops.
    re_id = '[\u06F0-\u06F90-9]{10}'  # Ten Persian/ASCII digits
    partition = 30  # Number of characters to search through
    for substring in PRECEDING_SET:
        for i in substring_indexes(substring, text):
            bottom = i + 1  # Move to the next character
            search_span = text[bottom:bottom + partition]
            # IDs may extend slightly past the name window, so scan wider.
            id_search_span = text[bottom:bottom + partition + 10]
            ids = re.findall(re_id, id_search_span)
            # Renamed from `id`, which shadowed the builtin.
            employee_id = None
            if len(ids) > 1:
                print('ERROR: multiple ids found in sandwhich method')
                print(ids)
                # The first ID in the string is probably the right one
                employee_id = ids[0]
            elif len(ids) == 1:
                employee_id = ids[0]
            name_set = []
            for proceed in PROCEEDING_SET:
                # Find first instance of the proceeding text
                top = search_span.find(proceed)
                if top != -1:
                    # By definition idx=0 is the start of the name span
                    name_set.append({
                        'name': translator.convert(search_span[:top]),
                        'employee_id': translator.convert(employee_id)
                    })
            if len(name_set) > 1:
                print(
                    'Multiple names where found for the same beggining. Ensure clean function works.'
                )
            to_return += name_set
    return to_return
Exemple #4
0
def double_tap_names(contents):
    """Find names by locating candidate IDs, then confirming nearby markers.

    Built on the principle of finding any ID at all and then confirming or
    denying the existence of other words, etc. nearby until you know
    exactly where to cut the name from.

    Args:
        contents: Generalized text content dict; reads 'declaration'.

    Returns:
        list of dicts with keys 'name' and 'employee_id' (the ID is
        converted to English numerals).
    """
    text = contents['declaration']
    # Grabs an ID plus 30 characters before it and 20 after it
    re_id_chunk = '.{30}[\u06F0-\u06F90-9]{10}.{20}'
    re_id = '[\u06F0-\u06F90-9]{10}'
    # Find every possible ID (might include business IDs, etc.)
    all_id_chunks = re.findall(re_id_chunk, text)
    people = []
    for chunk in all_id_chunks:
        # Hoisted: the original scanned the chunk twice with the same regex.
        ids = re.findall(re_id, chunk)
        if len(ids) > 1:
            print('Unfortunately we found multiple IDs in the string.')
        # Double tap
        confirmed_name = False
        parsed_name = None
        # This national bank number has only ever appeared connected with
        # a person's ID.
        if namehelper.NATIONAL_BANK_NUMBER in chunk:
            confirmed_name = True
            idx_signifier = chunk.find(namehelper.NATIONAL_BANK_NUMBER)
            # Liberal cutting but won't miss a portion of the name
            parsed_name = chunk[:idx_signifier]
        elif namehelper.SALUTATIONS[0] in chunk:
            # If we see Mr. then we know it's a name
            confirmed_name = True
            idx_signifier = chunk.find(namehelper.SALUTATIONS[0])
            # Too liberal as well but can be refined
            parsed_name = chunk[idx_signifier:]
        if not parsed_name:
            # In the future we will do named entity recognition here
            print('No name grabbed')
        if confirmed_name:
            # Clean it up as much as we can
            parsed_name = translator.convert(parsed_name)
            people += [{
                'name': parsed_name,
                'employee_id': translator.convert(ids[0])
            }]  # ID is changed to English numerals
    return people
Exemple #5
0
def parse(fileName):
    """Parse one saved HTML registry document into a structured record.

    Args:
        fileName: Path to the .html file; the basename (minus '.html')
            is used as the document ID.

    Returns:
        dict with document metadata, extracted names and dates, or None
        when the file is malformed.
    """
    certaintyScore = 100  # This will be decreased by percentages if signs of uncertainty show
    # Get document ID: the path's last component without the .html suffix
    last_chunk = fileName.split('/')[-1]
    document_id = last_chunk[:-5]  # Remove the .html at the end
    print('Document ID:', document_id)
    # Grab file contents (context manager guarantees the handle is closed)
    with open(fileName) as htmlFile:
        html = htmlFile.read()
    # Extract all pertinent sections
    soup = BeautifulSoup(html, 'html.parser')
    # This extracts the declaration which doesn't include the document
    # information section. This is where the names occur.
    try:
        declaration = soup.find(class_='Jus').contents[0]
    except Exception:  # was a bare except; Exception keeps Ctrl-C working
        print('Malformed file')
        return
    # This extracts where the national ID is sometimes stored
    try:
        title = soup.find(id='cphMain_lblNewsTitle').contents[0]
    except Exception:
        title = ''
    # These are all the extracted text chunks functions will have available to parse
    contents = {'title': title, 'declaration': declaration}
    # Company ID retrieval
    companyId = nationalid.parse_id(contents['title'])
    if DEBUG:
        print('National ID (title):', companyId)
    if companyId is None:
        certaintyScore *= 0.8  # Proof of concept not good
        # This finds a company ID in the text but it might be referencing
        # another corporation.
        companyId = nationalid.parse_id(contents['declaration'])
    if DEBUG:
        print('National ID (declaration):',
              nationalid.parse_id(contents['declaration']))
    # Various date retrieval
    try:
        dates = dateextract.parse_dates(html, contents)
    except Exception:
        print('Malformed file. Returning early')
        return
    # Name retrieval: merge the results of every extraction strategy
    names = get_names(contents)
    names += double_tap_names(contents)
    names += namehelper.parse_name_sandwhich(contents)
    cleaned_names = clean(names)
    # Now we get the document date in English dates
    persian_date = dates['registration_date']
    date_data = persian_date.split('/')
    document_date = jdatetime.datetime(int(date_data[0]), int(date_data[1]),
                                       int(date_data[2])).togregorian()
    document_timestamp = time.mktime(document_date.timetuple())
    print(document_timestamp)
    return {
        'document_id': document_id,
        'document_date': document_timestamp,
        'company_id': translator.convert(companyId),
        # assumes a fixed 6-character path prefix (e.g. 'files/') -- TODO confirm
        's3_path': fileName[6:],
        'names': cleaned_names,
        'dates': dates,
        'raw_title': contents['title'],
        'raw_body': contents['declaration'],
        'certainty_score': certaintyScore,
        'parser_version': 1
    }
 def processRequest(self):
     """Interpret a request, relay to further processing and prepare response headers.

     Picks the Content-Type for the requested target format, auto-detects
     the source format when asked, runs the translation, and stores the
     result (or an HTML error message with status 500) in
     self.response_string.
     """
     global debug
     # The dev deployment always reports errors verbosely.
     if "rdf-translator-dev" in self.request.url:
         debug = True

     # Target serialization -> MIME type; unknown formats fall back to
     # plain text (replaces the long if/elif chain).
     content_types = {
         "pretty-xml": "application/rdf+xml",
         "xml": "application/rdf+xml",
         "n3": "text/n3",
         "turtle": "text/turtle",
         "nquads": "text/x-nquads",
         "nt": "text/plain",
         "trix": "application/xml",
         "rdf-json": "application/json",
         "rdf-json-pretty": "application/json",
         "json-ld": "application/ld+json",
         "rdfa": "text/html",
         "microdata": "text/html",
     }
     if self.html == True:
         self.do_pygmentize = True
         self.response.headers['Content-Type'] = "text/html"
     else:
         self.response.headers['Content-Type'] = content_types.get(
             self.target_format, "text/plain")

     # Auto-detect the source format from the provided data or URL.
     if not self.source_format or self.source_format == "detect":
         if self.content:
             source = create_input_source(data=self.content, format=self.source_format)
             self.source_format = source.content_type
         elif self.page:
             source = create_input_source(location=self.page, format=self.source_format)
             self.source_format = source.content_type

         if self.source_format == "text/html":
             self.source_format = "rdfa" # microdata is fallback

     try:
         self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Translation failed</p>"
         if self.content:
             self.response_string = translator.convert(self.content, do_pygmentize=self.do_pygmentize, file_format="string", source_format=self.source_format, target_format=self.target_format)
             if self.response_string.strip() == "" and self.source_format == "rdfa": # fix microdata test
                 self.response_string = translator.convert(self.content, do_pygmentize=self.do_pygmentize, file_format="string", source_format="microdata", target_format=self.target_format)
         elif self.page:
             self.response_string = translator.convert(self.page, do_pygmentize=self.do_pygmentize, file_format="file", source_format=self.source_format, target_format=self.target_format)
             if self.response_string.strip() == "" and self.source_format == "rdfa": # fix microdata test
                 self.response_string = translator.convert(self.page, do_pygmentize=self.do_pygmentize, file_format="file", source_format="microdata", target_format=self.target_format)
         if self.response_string.strip() == "":
             raise Exception("empty result returned")
     except Exception as e:  # 'except Exception, e' was Python-2-only syntax
         self.response.set_status(500)
         if debug:
             tb = traceback.format_exc()
             e = "<pre style=\"color: red\">"+tb+"</pre>"
         else:
             e = "<pre style=\"color: red\">"+str(e)+"</pre>"
         error_message = "No error message available"
         if str(e).strip() != "":
             error_message = "Error message:<br>%s" % str(e)
         self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Could not convert from %s to %s for provided resource...<br><br>%s</p>" % (self.source_format, self.target_format, error_message)
Exemple #7
0
    def processRequest(self):
        """Interpret a request, relay to further processing and prepare response headers.

        Picks the Content-Type for the requested target format,
        auto-detects the source format when asked, runs the translation,
        and stores the result (or an HTML error message with status 500)
        in self.response_string.
        """
        global debug
        # The dev deployment always reports errors verbosely.
        if "rdf-translator-dev" in self.request.url:
            debug = True

        # Target serialization -> MIME type; unknown formats fall back to
        # plain text (replaces the long if/elif chain).
        content_types = {
            "pretty-xml": "application/rdf+xml",
            "xml": "application/rdf+xml",
            "n3": "text/n3",
            "turtle": "text/turtle",
            "nquads": "text/x-nquads",
            "nt": "text/plain",
            "trix": "application/xml",
            "rdf-json": "application/json",
            "rdf-json-pretty": "application/json",
            "json-ld": "application/ld+json",
            "rdfa": "text/html",
            "microdata": "text/html",
        }
        if self.html == True:
            self.do_pygmentize = True
            self.response.headers['Content-Type'] = "text/html"
        else:
            self.response.headers['Content-Type'] = content_types.get(
                self.target_format, "text/plain")

        # Auto-detect the source format from the provided data or URL.
        if not self.source_format or self.source_format == "detect":
            if self.content:
                source = create_input_source(data=self.content,
                                             format=self.source_format)
                self.source_format = source.content_type
            elif self.page:
                source = create_input_source(location=self.page,
                                             format=self.source_format)
                self.source_format = source.content_type

            if self.source_format == "text/html":
                self.source_format = "rdfa"  # microdata is fallback

        try:
            self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Translation failed</p>"
            if self.content:
                self.response_string = translator.convert(
                    self.content,
                    do_pygmentize=self.do_pygmentize,
                    file_format="string",
                    source_format=self.source_format,
                    target_format=self.target_format)
                if self.response_string.strip(
                ) == "" and self.source_format == "rdfa":  # fix microdata test
                    self.response_string = translator.convert(
                        self.content,
                        do_pygmentize=self.do_pygmentize,
                        file_format="string",
                        source_format="microdata",
                        target_format=self.target_format)
            elif self.page:
                self.response_string = translator.convert(
                    self.page,
                    do_pygmentize=self.do_pygmentize,
                    file_format="file",
                    source_format=self.source_format,
                    target_format=self.target_format)
                if self.response_string.strip(
                ) == "" and self.source_format == "rdfa":  # fix microdata test
                    self.response_string = translator.convert(
                        self.page,
                        do_pygmentize=self.do_pygmentize,
                        file_format="file",
                        source_format="microdata",
                        target_format=self.target_format)
            if self.response_string.strip() == "":
                raise Exception("empty result returned")
        except Exception as e:  # 'except Exception, e' was Python-2-only syntax
            self.response.set_status(500)
            if debug:
                tb = traceback.format_exc()
                e = "<pre style=\"color: red\">" + tb + "</pre>"
            else:
                e = "<pre style=\"color: red\">" + str(e) + "</pre>"
            error_message = "No error message available"
            if str(e).strip() != "":
                error_message = "Error message:<br>%s" % str(e)
            self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Could not convert from %s to %s for provided resource...<br><br>%s</p>" % (
                self.source_format, self.target_format, error_message)