def __str__(self):

        '''
        Builds a human-readable label for this instance, assembled from
            whichever of the following pieces are populated, in this order:

            "Domain <id> - " + <domain_name> + " - <description>"
                + " ( from: <source> )"

        When self.str_convert_to_ascii is True and the description contains
            non-ASCII characters (per StringHelper.has_non_ascii_characters()),
            those characters are replaced by XML numeric character references
            (e.g. "&#233;") so the result is pure ASCII text.

        Returns the assembled string ( "" when no piece is populated ).
        '''

        # return reference
        string_OUT = ""

        # declare variables
        temp_string = ""

        # id?  Truthiness already rules out None and 0.
        if self.id and self.id > 0:

            string_OUT += "Domain " + str(self.id) + " - "

        # -- END check to see if id --#

        # name
        if self.domain_name:

            string_OUT += self.domain_name

        # -- END check to see if domain_name --#

        # description
        if self.description:

            temp_string = self.description

            if self.str_convert_to_ascii == True:

                # only convert when there is something to convert.
                if StringHelper.has_non_ascii_characters(temp_string) == True:

                    # convert to ASCII.  encode() returns bytes on Python 3,
                    #     which can not be concatenated with str below, so
                    #     decode the (now pure-ASCII) bytes back into text.
                    temp_string = temp_string.encode(encoding="ascii", errors="xmlcharrefreplace").decode("ascii")

                # -- END check for non-ASCII characters. --#

            # -- END check to see if we are to convert to ASCII. --#

            string_OUT += " - " + temp_string

        # -- END check to see if description --#

        # source
        if self.source:

            string_OUT += " ( from: " + self.source + " )"

        # -- END check to see if source --#

        return string_OUT
    def encode_data( self, value_IN, encoding_IN = None ):
        
        '''
        Prepares a value to be passed along with a request.

        - value_IN: the data.  When StringHelper.is_unicode() reports it as
            unicode, it is encoded; anything else is returned unchanged.
        - encoding_IN: name of the encoding to use.  When None or empty, the
            result of self.get_default_encoding() is used instead.

        Returns whatever StringHelper.encode_string() produces for unicode
            input, or value_IN itself for everything else.
        '''
        
        # not unicode?  Nothing to encode - hand it back as-is.
        if not ( StringHelper.is_unicode( value_IN ) == True ):
        
            return value_IN
        
        #-- END check to see if data is unicode object --#

        # a caller-supplied encoding wins; otherwise fall back to the default.
        if ( ( encoding_IN is None ) or ( encoding_IN == "" ) ):
        
            target_encoding = self.get_default_encoding()
            
        else:
        
            target_encoding = encoding_IN
            
        #-- END check for encoding. --#

        return StringHelper.encode_string( value_IN, target_encoding )
Example #3
0
    def test_replace_white_space( self ):

        '''
        Checks that StringHelper.replace_white_space(), asked to replace with
            a single space, turns a string containing runs of multiple spaces
            into the same words separated by exactly one space each.
        '''

        # input with runs of spaces, and what we expect to get back.
        messy_text = "one bird   two  shoes     and a cat"
        tidy_text = "one bird two shoes and a cat"

        # call the helper and compare in one step.
        self.assertEqual(
            StringHelper.replace_white_space( messy_text, replace_with_IN = " " ),
            tidy_text
        )
Example #4
0
                # it is text - convert it to string.
                current_paragraph_text = unicode( paragraph_element )
            
            else:
            
                # not text - just grab all the text out of it.
                #current_paragraph_text = ' '.join( paragraph_element.findAll( text = True ) )
                current_paragraph_text = HTMLHelper.remove_html( str( paragraph_element ) )
                
            #-- END check to see if current element is text. --#

            # clean up - convert HTML entities
            current_paragraph_text = bs_helper.convert_html_entities( current_paragraph_text )
            
            # strip out extra white space
            current_paragraph_text = StringHelper.replace_white_space( current_paragraph_text )
            
            # got any paragraph text?
            current_paragraph_text = current_paragraph_text.strip()
            if ( ( current_paragraph_text != None ) and ( current_paragraph_text != "" ) ):
            
                # yes.  Add to paragraph text.
                paragraph_text_list.append( current_paragraph_text )
                
            #-- END check to see if any text. --#
        
        #-- END loop over paragraph elements. --#
        
        # convert paragraph list to string
        paragraph_text = ' '.join( paragraph_text_list )
        
Example #5
0
    def get_unique_mention_string_list(self,
                                       replace_white_space_IN=False,
                                       *args,
                                       **kwargs):
        '''
        Collects the distinct mention values found on every
            DataSetCitationData whose citation points at this data set.

        - replace_white_space_IN: when True, each new mention value is passed
            through StringHelper.replace_white_space() before being stored.

        Note: the "already seen?" check is made against the raw mention
            value, before any white space replacement.

        Returns a sorted list of the distinct strings.
        '''

        # distinct mention strings seen so far.
        unique_mentions = set()

        # all citation data rows tied to this data set.
        related_citation_data = DataSetCitationData.objects.filter(
            data_set_citation__data_set=self)

        for current_citation_data in related_citation_data:

            for current_mention in current_citation_data.datasetmention_set.all():

                candidate = current_mention.value

                # already have this one?  Move on.
                if candidate in unique_mentions:
                    continue

                #-- END check to see if in set. --#

                # optionally normalize white space before storing.
                if (replace_white_space_IN == True):
                    candidate = StringHelper.replace_white_space(candidate)

                #-- END check to see if we replace white space. --#

                unique_mentions.add(candidate)

            #-- END loop over mentions. --#

        #-- END loop over citation data related to current data set --#

        # hand back as a sorted list.
        return sorted(unique_mentions)
Example #6
0
        status_last = Person.get_person_lookup_status( person_last )
        print( "status from both in last: " + status_last )
    
    else:
    
        # Name parsed as two words, so go with parsed name?
        pass
    
    #-- END check to see if both first and last name. --#

#-- END check to see if two-part name. --#

# Look for Person records whose stored full name string matches the parsed
#     name, and print what was found.

# get full name from parsed (converted to a string via StringHelper).
full_name_test = StringHelper.object_to_unicode_string( parsed )
print( "FULL NAME - looking for \"" + full_name_test + "\"" )

# filter on full_name_string using Django's "iexact" lookup (case-insensitive
#     exact match), then count the matches to decide which message to print.
full_name_qs = Person.objects.filter( full_name_string__iexact = full_name_test )
full_name_count = full_name_qs.count()
if ( full_name_count > 0 ):

    # at least one match - print each matching Person.
    for full_name_match in full_name_qs:
    
        print( "- FULL NAME - full name match: " + str( full_name_match ) )
        
    #-- END loop over full name matches --#
    
else:

    # no matches - say so.
    print( "- FULL NAME - no full name match for \"" + full_name_test + "\"" )
    
    # see if name is br.
    if ( current_name == "br" ):
    
        # yes - paragraph break!  output a message, and the string contents of the tag (just in case).
        print( "=======> paragraph break! - End of paragraph " + str( paragraph_counter ) + ".  HTML element Contents: \"" + str( current_content ) + "\"" )
        
        # add previous paragraph to paragraph list.
        paragraph_text_list = []
        for paragraph_element in current_element_list:
        
            # convert current element to just text.  Is it NavigableString?
            if ( isinstance( paragraph_element, NavigableString) ):
            
                # it is text - convert it to string.
                current_paragraph_text = StringHelper.object_to_unicode_string( paragraph_element )
            
            else:
            
                # not text - just grab all the text out of it.
                #current_paragraph_text = ' '.join( paragraph_element.findAll( text = True ) )
                current_paragraph_text = HTMLHelper.remove_html( str( paragraph_element ) )
                
            #-- END check to see if current element is text. --#

            # clean up - convert HTML entities
            current_paragraph_text = bs_helper.convert_html_entities( current_paragraph_text )
            
            # strip out extra white space
            current_paragraph_text = StringHelper.replace_white_space( current_paragraph_text )
            
    def print_calais_json( cls, json_IN, logger_IN = None ):
    
        '''
        Renders selected parts of an OpenCalais API JSON object as text.

        - json_IN: parsed JSON, a mapping of item key to item.  None yields
            an empty string.
        - logger_IN: optional logger.  When present, each header line and
            each property line is also passed to its debug() method.

        For every top-level key a numbered "==> " header line is written,
            followed by one "----> " line for each of the properties "_type",
            "_typeGroup", "commonname", "name" and "person" that is present
            in that item.  When a property's value equals "Quotation" or
            "Person", the str() of the entire item is appended as well (that
            dump is not sent to the logger).

        Returns the accumulated text.
        '''
    
        # declare variables
        me = "OpenCalaisV2ApiResponse.print_calais_json()"
        wanted_property_list = [ "_type", "_typeGroup", "commonname", "name", "person" ]
        line_list = []
        
        # nothing passed in?  Nothing to print.
        if ( json_IN is None ):
        
            return ""
            
        #-- END check to see if JSON passed in. --#
        
        # walk the top-level keys, numbering them from 1.
        for item_number, item_key in enumerate( json_IN.keys(), 1 ):
        
            header_line = "==> " + str( item_number ) + ": " + item_key + "\n"
            line_list.append( header_line )

            if ( logger_IN is not None ):
                logger_IN.debug( "In " + me + ": " + header_line )
            #-- END check to see if logger --#
            
            item_value = json_IN[ item_key ]
                        
            # look for each property we care about.
            for property_name in wanted_property_list:
            
                # not in this item?  Skip it.
                if ( property_name not in item_value ):
                    continue
                #-- END check to see if property present --#

                property_value = item_value[ property_name ]
                
                # str() first; if that blows up (unicode added in OpenCalais
                #    API version 2), fall back to StringHelper.encode_string().
                try:
                
                    value_text = str( property_value )
                    
                except Exception:
                
                    value_text = StringHelper.encode_string( property_value )
                    
                #-- END try/except --#

                property_line = "----> " + property_name + ": " + value_text + "\n"
                line_list.append( property_line )

                if ( logger_IN is not None ):
                    logger_IN.debug( "In " + me + ": " + property_line )
                #-- END check to see if logger --#

                # Quotation or Person?  Dump the whole item, too.
                if ( ( property_value == "Quotation" ) or ( property_value == "Person" ) ):

                    line_list.append( str( item_value ) + "\n" )

                #-- END check to see if value is "Quotation" or "Person" --#

            #-- END loop over list of properties we want to output. --#
            
        #-- END loop over items --#
        
        return "".join( line_list )
    end = span[ 1 ]
    print( str( match_count ) + " - " + character_entity + " - span: ( " + str( start ) + ", " + str( end ) + " )" )

    replace_string = replace_string[:start] + character_entity + replace_string[end:]
    
print( "replaced?: " + replace_string )

# Start over from test_string and keep replacing the first remaining match of
#     re_test with its XML numeric character reference until nothing matches.
# NOTE(review): this only terminates if re_test never matches the ASCII text
#     of an entity - presumably it targets non-ASCII characters; confirm.
match_count = 0
replace_string = test_string
re_match = re_test.search( replace_string )
while ( re_match is not None ):

    # output the match and the span.
    match_count += 1

    # encode() returns bytes on Python 3, which can not be concatenated with
    #     str; decode the pure-ASCII result back into text before using it.
    character_entity = re_match.group( 0 ).encode( 'ascii', 'xmlcharrefreplace' ).decode( 'ascii' )
    span = re_match.span()
    start = span[ 0 ]
    end = span[ 1 ]
    print( str( match_count ) + " - " + character_entity + " - span: ( " + str( start ) + ", " + str( end ) + " )" )

    # splice the entity in where the matched text was.
    replace_string = replace_string[:start] + character_entity + replace_string[end:]

    # search again from the start of the updated string.
    re_match = re_test.search( replace_string )

print( "replaced?: " + replace_string )

from python_utilities.strings.string_helper import StringHelper

# Run the same test string through the library helper and print its result,
#     for comparison with the output of the manual loop earlier in this script.
helper_replaced = StringHelper.entitize_4_byte_unicode( test_string )

print( "helper replaced?: " + helper_replaced )
                address_line_text = station_text_list[ address_line_index ]
                address_line_text = address_line_text.strip()
                station_address += address_line_text
                
            #-- END loop over remaining address lines (except the first one). --#

            print( "station_address: " + station_address )
            
            # Now, we actually put the information together and store to
            #    database.

            # make station description
            station_description += " - " + station_call_sign + " - " + station_address + "; " + station_city + ", " + station_state + " " + station_zip_code
            
            # clean out new lines
            station_description = StringHelper.clean_string( station_description )
            
            # do we have a URL?
            if ( ( station_url ) and ( station_url != None ) and ( station_url != "" ) ):
            
                # adding to database.
                print( "    ===> adding station: " + station_description )
                domain_counter += 1

                # domain name
                cleaned_url = station_url
                
                # handle redirect URLS? - first, see if this URL contains "goto="
                #redirect_index = cleaned_url.find( URL_FORWARD_STRING )
                #if ( redirect_index >= 0 ):
                #    # yes.  strip off everything before "goto="
                        # None - no next element.  stop walking.
                        keep_strolling = False
                    
                    #-- END check to see if there is a next element. --#
                
                #-- END loop over station elements. --#

                #print( a_list_bs )
                #print( text_list_bs )
                
                # make a string out of the text.
                station_text = " ".join( text_list_bs )
                
                # clean string - strip out newlines, tabs, and more than one
                #    contiguous space.
                station_text = StringHelper.clean_string( station_text )
                
                # is this a news station?
                # Either:
                # - contains the word "news"
                # - contains "talk", but not "sport", "christian", or "religious" (I checked the sites that match these by hand, they don't have news).
                if ( ( NEWS_MATCH.lower() in station_text.lower() ) or ( ( TALK_MATCH.lower() in station_text.lower() ) and ( ( "sport" not in station_text.lower() ) and ( "christian" not in station_text.lower() ) and ( "religious" not in station_text.lower() ) ) ) ):
                
                    # news!
                    news_station_counter += 1
                    
                    print( "- NEWS: " + bold_text + " - " + station_text )
                    
                    # get URL.  Got anything in <a> list?
                    if ( len( a_list_bs ) > 0 ):