def __str__(self):

    '''
    Builds a human-readable label for this instance, in the form:

        "Domain <id> - <domain_name> - <description> ( from: <source> )"

    Each piece is only included when the corresponding attribute is set
    (truthy); the id is additionally only included when it is greater
    than 0.

    If self.str_convert_to_ascii is True, every non-ASCII character in the
    description is replaced by an XML character reference (for example
    "&#233;"), so the description segment is pure ASCII.

    Returns:
        the assembled label as text; an empty string if nothing is set.
    '''

    # return reference
    string_OUT = ""

    # declare variables
    temp_string = ""

    # id?  The truthiness test also covers None and 0.
    if self.id and (self.id > 0):

        string_OUT += "Domain " + str(self.id) + " - "

    # -- END check to see if id --#

    # name
    if self.domain_name:

        string_OUT += self.domain_name

    # -- END check to see if domain_name --#

    # description
    if self.description:

        temp_string = self.description

        if self.str_convert_to_ascii == True:

            # convert to ASCII.  encode() hands back bytes on Python 3, and
            #     bytes can not be concatenated to a str, so decode the
            #     (now pure ASCII) bytes straight back into text.  For a
            #     description that is already ASCII this round trip changes
            #     nothing, so no separate non-ASCII pre-check is needed.
            temp_string = temp_string.encode(
                encoding="ascii", errors="xmlcharrefreplace")
            temp_string = temp_string.decode("ascii")

        # -- END check to see if we are to convert to ASCII. --#

        string_OUT += " - " + temp_string

    # -- END check to see if description --#

    # source
    if self.source:

        string_OUT += " ( from: " + self.source + " )"

    # -- END check to see if source --#

    return string_OUT
def encode_data( self, value_IN, encoding_IN = None ):

    '''
    Prepares a value to be sent along with a request.

    When StringHelper.is_unicode() reports that value_IN is a unicode
    string, the value is run through StringHelper.encode_string() using
    encoding_IN, or, if encoding_IN is None or the empty string, the
    encoding returned by self.get_default_encoding().  Any other value is
    handed back exactly as it was passed in.

    Returns:
        the encoded value for unicode input, otherwise value_IN unchanged.
    '''

    # anything that is not a unicode string passes through untouched.
    if not ( StringHelper.is_unicode( value_IN ) == True ):

        return value_IN

    #-- END check to see if data is unicode object --#

    # no usable encoding passed in?  Fall back to the instance default.
    if ( ( encoding_IN is None ) or ( encoding_IN == "" ) ):

        target_encoding = self.get_default_encoding()

    else:

        target_encoding = encoding_IN

    #-- END check for encoding. --#

    return StringHelper.encode_string( value_IN, target_encoding )
def test_replace_white_space( self ):

    '''
    Checks StringHelper.replace_white_space() when called with
    replace_with_IN = " ": a string containing runs of several spaces is
    expected to come back with each run reduced to a single space.
    '''

    # declare variables
    start_string = ""
    test_string = ""
    expected_string = ""

    # initialize - the input holds runs of 2, 3 and 4 spaces, so the assert
    #     below can only pass if the call actually changes the string (an
    #     input identical to the expected output would also pass against a
    #     method that does nothing).
    start_string = "one  bird   two shoes    and a cat"
    expected_string = "one bird two shoes and a cat"

    # do work
    test_string = StringHelper.replace_white_space( start_string, replace_with_IN = " " )

    # and the assert
    self.assertEqual( test_string, expected_string )
# it is text - convert it to string. current_paragraph_text = unicode( paragraph_element ) else: # not text - just grab all the text out of it. #current_paragraph_text = ' '.join( paragraph_element.findAll( text = True ) ) current_paragraph_text = HTMLHelper.remove_html( str( paragraph_element ) ) #-- END check to see if current element is text. --# # clean up - convert HTML entities current_paragraph_text = bs_helper.convert_html_entities( current_paragraph_text ) # strip out extra white space current_paragraph_text = StringHelper.replace_white_space( current_paragraph_text ) # got any paragraph text? current_paragraph_text = current_paragraph_text.strip() if ( ( current_paragraph_text != None ) and ( current_paragraph_text != "" ) ): # yes. Add to paragraph text. paragraph_text_list.append( current_paragraph_text ) #-- END check to see if any text. --# #-- END loop over paragraph elements. --# # convert paragraph list to string paragraph_text = ' '.join( paragraph_text_list )
def get_unique_mention_string_list(self, replace_white_space_IN=False, *args, **kwargs):

    '''
    Gathers the distinct mention strings used to refer to this DataSet.

    Looks up every DataSetCitationData whose data_set_citation points at
    this instance, walks each one's datasetmention_set, and collects the
    "value" of every mention, ignoring values already collected.

    Parameters:
        replace_white_space_IN: when True, each newly seen value is passed
            through StringHelper.replace_white_space() before it is stored.
        *args, **kwargs: accepted but not used here.

    Returns:
        list of the distinct mention strings, sorted in ascending order.
    '''

    unique_mentions = set()

    # all citation data rows whose citation belongs to this data set.
    related_citation_data = DataSetCitationData.objects.filter(
        data_set_citation__data_set=self)

    for citation_row in related_citation_data:

        for mention_row in citation_row.datasetmention_set.all():

            mention_text = mention_row.value

            # already collected?  Nothing more to do for this mention.
            if mention_text in unique_mentions:

                continue

            #-- END check to see if in set. --#

            # normalize white space before storing, if requested.
            if replace_white_space_IN == True:

                mention_text = StringHelper.replace_white_space(mention_text)

            #-- END check to see if we replace white space --#

            unique_mentions.add(mention_text)

        #-- END loop over mentions. --#

    #-- END loop over citation data related to current data set --#

    return sorted(unique_mentions)
status_last = Person.get_person_lookup_status( person_last ) print( "status from both in last: " + status_last ) else: # Name parsed as two words, so go with parsed name? pass #-- END check to see if both first and last name. --# #-- END check to see if two-part name. --# # look for people with same full-string name. # get full name from parsed. full_name_test = StringHelper.object_to_unicode_string( parsed ) print( "FULL NAME - looking for \"" + full_name_test + "\"" ) full_name_qs = Person.objects.filter( full_name_string__iexact = full_name_test ) full_name_count = full_name_qs.count() if ( full_name_count > 0 ): for full_name_match in full_name_qs: print( "- FULL NAME - full name match: " + str( full_name_match ) ) #-- END loop over full name matches --# else: print( "- FULL NAME - no full name match for \"" + full_name_test + "\"" )
# see if name is br. if ( current_name == "br" ): # yes - paragraph break! output a message, and the string contents of the tag (just in case). print( "=======> paragraph break! - End of paragraph " + str( paragraph_counter ) + ". HTML element Contents: \"" + str( current_content ) + "\"" ) # add previous paragraph to paragraph list. paragraph_text_list = [] for paragraph_element in current_element_list: # convert current element to just text. Is it NavigableString? if ( isinstance( paragraph_element, NavigableString) ): # it is text - convert it to string. current_paragraph_text = StringHelper.object_to_unicode_string( paragraph_element ) else: # not text - just grab all the text out of it. #current_paragraph_text = ' '.join( paragraph_element.findAll( text = True ) ) current_paragraph_text = HTMLHelper.remove_html( str( paragraph_element ) ) #-- END check to see if current element is text. --# # clean up - convert HTML entities current_paragraph_text = bs_helper.convert_html_entities( current_paragraph_text ) # strip out extra white space current_paragraph_text = StringHelper.replace_white_space( current_paragraph_text )
def print_calais_json( cls, json_IN, logger_IN = None ):

    '''
    Renders selected parts of an OpenCalais API JSON object as text.

    For every key in json_IN a numbered header line is written, followed
    by one line for each of the properties "_type", "_typeGroup",
    "commonname", "name" and "person" that is present in that key's
    value.  Whenever one of those property values equals "Quotation" or
    "Person", the string form of the whole item is written as well.

    Parameters:
        json_IN: mapping of item key to item (itself a mapping of
            property name to value); None produces an empty string.
        logger_IN: optional logger; when given, the header line and each
            property line is also sent to its debug() method.

    Returns:
        the assembled text.
    '''

    me = "OpenCalaisV2ApiResponse.print_calais_json()"
    log_prefix = "In " + me + ": "
    wanted_properties = ( "_type", "_typeGroup", "commonname", "name", "person" )
    types_to_dump = ( "Quotation", "Person" )

    # pieces of the output, joined together once at the end.
    output_parts = []

    def emit( line_IN ):

        # add a line to the output and mirror it to the logger, if any.
        output_parts.append( line_IN )
        if ( logger_IN is not None ):

            logger_IN.debug( log_prefix + line_IN )

        #-- END check to see if logger --#

    #-- END nested function emit() --#

    # nothing passed in, nothing to print.
    if ( json_IN is None ):

        return ""

    #-- END check to see if JSON passed in. --#

    for position, item_key in enumerate( json_IN, 1 ):

        emit( "==> " + str( position ) + ": " + item_key + "\n" )
        item_value = json_IN[ item_key ]

        for property_name in wanted_properties:

            # only output properties the current item actually has.
            if ( property_name not in item_value ):

                continue

            #-- END check to see if property present --#

            property_value = item_value[ property_name ]

            # str() first; if that blows up (unicode in OpenCalais API
            #     version 2 responses), fall back to the StringHelper.
            try:

                value_text = str( property_value )

            except Exception:

                value_text = StringHelper.encode_string( property_value )

            #-- END try/except --#

            emit( "----> " + property_name + ": " + value_text + "\n" )

            # Quotation or Person?  Dump the whole item (not logged).
            if ( property_value in types_to_dump ):

                output_parts.append( str( item_value ) + "\n" )

            #-- END check to see if type is "Quotation" or "Person" --#

        #-- END loop over list of properties we want to output. --#

    #-- END loop over items --#

    return "".join( output_parts )
end = span[ 1 ] print( str( match_count ) + " - " + character_entity + " - span: ( " + str( start ) + ", " + str( end ) + " )" ) replace_string = replace_string[:start] + character_entity + replace_string[end:] print( "replaced?: " + replace_string ) match_count = 0 replace_string = test_string re_match = re_test.search( replace_string ) while ( ( re_match ) and ( re_match != None ) ): # output the match and the span. match_count += 1 character_entity = re_match.group( 0 ).encode( 'ascii', 'xmlcharrefreplace' ) span = re_match.span() start = span[ 0 ] end = span[ 1 ] print( str( match_count ) + " - " + character_entity + " - span: ( " + str( start ) + ", " + str( end ) + " )" ) replace_string = replace_string[:start] + character_entity + replace_string[end:] re_match = re_test.search( replace_string ) print( "replaced?: " + replace_string ) from python_utilities.strings.string_helper import StringHelper helper_replaced = StringHelper.entitize_4_byte_unicode( test_string ) print( "helper replaced?: " + helper_replaced )
address_line_text = station_text_list[ address_line_index ] address_line_text = address_line_text.strip() station_address += address_line_text #-- END loop over remaining address lines (except the first one). --# print( "station_address: " + station_address ) # Now, we actually put the information together and store to # database. # make station description station_description += " - " + station_call_sign + " - " + station_address + "; " + station_city + ", " + station_state + " " + station_zip_code # clean out new lines station_description = StringHelper.clean_string( station_description ) # do we have a URL? if ( ( station_url ) and ( station_url != None ) and ( station_url != "" ) ): # adding to database. print( " ===> adding station: " + station_description ) domain_counter += 1 # domain name cleaned_url = station_url # handle redirect URLS? - first, see if this URL contains "goto=" #redirect_index = cleaned_url.find( URL_FORWARD_STRING ) #if ( redirect_index >= 0 ): # # yes. strip off everything before "goto="
# None - no next element. stop walking. keep_strolling = False #-- END check to see if there is a next element. --# #-- END loop over station elements. --# #print( a_list_bs ) #print( text_list_bs ) # make a string out of the text. station_text = " ".join( text_list_bs ) # clean string - strip out newlines, tabs, and more than one # contiguous space. station_text = StringHelper.clean_string( station_text ) # is this a news station? # Either: # - contains the word "news" # - contains "talk", but not "sport", "christian", or "religious" (I checked the sites that match these by hand, they don't have news). if ( ( NEWS_MATCH.lower() in station_text.lower() ) or ( ( TALK_MATCH.lower() in station_text.lower() ) and ( ( "sport" not in station_text.lower() ) and ( "christian" not in station_text.lower() ) and ( "religious" not in station_text.lower() ) ) ) ): # news! news_station_counter += 1 print( "- NEWS: " + bold_text + " - " + station_text ) # get URL. Got anything in <a> list? if ( len( a_list_bs ) > 0 ):