import re


def get_hashtags_and_user_mentions(special_characters,
                                   text,
                                   wanted_characters=['#', '@']):
    # Identify hashtags, user mentions and remove urls
    results = {}
    for character in special_characters:
        # collapse repeated markers (e.g. "##tag" -> "#tag") and make sure a
        # space precedes each marker
        text = re.sub('(' + re.escape(character) + ')+', ' ' + character, text)
        count_character = text.count(character)
        while count_character > 0:
            start = text.find(character)
            # the token ends at the first space or newline after `start`;
            # find() returns -1 when a delimiter is missing, so map -1 to
            # end-of-text before taking the minimum
            space_end = text.find(" ", start)
            newline_end = text.find("\n", start)
            if space_end == -1:
                space_end = len(text)
            if newline_end == -1:
                newline_end = len(text)
            end = min(space_end, newline_end)
            text_to_remove = text[start:end]
            # keep the token only if it has content beyond the marker itself
            if len(text_to_remove) > 2 and character in wanted_characters:
                results.setdefault(character, []).append(text_to_remove)
            text = text.replace(text_to_remove, "")
            text = ' '.join(text.split())
            count_character = text.count(character)
    for wanted_character in wanted_characters:
        if wanted_character not in results:
            results[wanted_character] = []
    text = text.strip(' ')
    text = ' '.join(text.split())
    results['clean_text'] = text
    return results
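
# A minimal usage sketch for the function above; the sample tweet is invented
# for illustration. Each wanted marker comes back as a dict key holding the
# extracted tokens, and 'clean_text' holds the text with all tagged tokens
# stripped.
sample = "Lunch at @joes_diner was great #foodie #yum"
out = get_hashtags_and_user_mentions(['#', '@'], sample)
print(out['#'])           # ['#foodie', '#yum']
print(out['@'])           # ['@joes_diner']
print(out['clean_text'])  # 'Lunch at was great'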
Example #2

    def clean_text(self, text):
        """
        # Arguments
            text: text body to be preprocessed and cleaned

        # Return
            cleaned text
        """
        # handle non-ascii/special characters: round-trip through the bytes
        # repr so non-ascii chars become literal "\uXXXX"/"\xXX" sequences,
        # strip those, then drop the b'...' wrapper of the repr
        text = str(text.encode("utf-8"))
        text = re.sub(r"\\[ux][a-z0-9]+", " ", text)
        if text.startswith("b'"):
            text = text[2:-1]
        text = text.strip("'").lower()
        text = re.sub(r'[\:\-\(\)\%\d\.\\\/\_\[\]\+\,\#\"]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        word_list = text.split(' ')  # tokenization w.r.t space characters
        rel_words = [
            word for word in word_list
            if word not in self.stop and len(word) >= self.min_word_len
        ]  # relevant words
        rel_words_lemm = [
            self.lemmatizer.lemmatize(word, pos='v') for word in rel_words
        ]
        return " ".join(rel_words_lemm)
Example #3

def clean_string(text):
    def pad_str(s):
        return ' ' + s + ' '

    # Empty question

    if type(text) != str or text == '':
        return ''

    # preventing first and last word being ignored by regex
    # and convert first word in question to lower case

    text = ' ' + text[0].lower() + text[1:] + ' '

    # replace all first char after either [.!?)"'] with lowercase
    # don't mind if we lowered a proper noun, it won't be a big problem

    def lower_first_char(pattern):
        matched_string = pattern.group(0)
        return matched_string[:-1] + matched_string[-1].lower()

    text = re.sub("(?<=[\.\?\)\!\'\"])[\s]*.", lower_first_char, text)

    # Replace weird chars in text

    text = re.sub("’", "'", text)  # special single quote
    text = re.sub("`", "'", text)  # special single quote
    text = re.sub("“", '"', text)  # special double quote
    text = re.sub("?", "?", text)
    text = re.sub("…", " ", text)
    text = re.sub("é", "e", text)

    # Clean shorthands

    text = re.sub(
        "\'s", " ", text
    )  # covers both "Sam is" and the possessive "Sam's"; the two cases aren't separable, so we compromise and drop "'s" entirely
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub(r"(\W|^)([0-9]+)[kK](\W|$)", r"\1\g<2>000\3",
                  text)  # better regex provided by @armamut
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.",
                  " America ",
                  text,
                  flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?",
                  " America ",
                  text,
                  flags=re.IGNORECASE)
    text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)

    # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word "number"

    text = re.sub('[0-9]+\.[0-9]+', " 87 ", text)

    # remove comma between numbers, i.e. 15,000 -> 15000

    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

    #     # all numbers should separate from words, this is too aggressive

    #     def pad_number(pattern):
    #         matched_string = pattern.group(0)
    #         return pad_str(matched_string)
    #     text = re.sub('[0-9]+', pad_number, text)

    # add padding to punctuations and special chars, we still need them later

    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)

    def pad_pattern(pattern):
        matched_string = pattern.group(0)
        return pad_str(matched_string)

    text = re.sub(r'[!?@^+*/,~|`=:;.#\\]', pad_pattern, text)

    text = re.sub('[^\x00-\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']),
                  text)  # replace non-ascii word with special word

    # indian dollar

    text = re.sub("(?<=[0-9])rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(" rs(?=[0-9])", " rs ", text, flags=re.IGNORECASE)

    # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text

    text = re.sub(r" (the[\s]+|The[\s]+)?US(A)? ", " America ", text)
    text = re.sub(r" UK ", " England ", text, flags=re.IGNORECASE)
    text = re.sub(r" india ", " India ", text)
    text = re.sub(r" switzerland ", " Switzerland ", text)
    text = re.sub(r" china ", " China ", text)
    text = re.sub(r" chinese ", " Chinese ", text)
    text = re.sub(r" imrovement ", " improvement ", text, flags=re.IGNORECASE)
    text = re.sub(r" intially ", " initially ", text, flags=re.IGNORECASE)
    text = re.sub(r" quora ", " Quora ", text, flags=re.IGNORECASE)
    text = re.sub(r" dms ", " direct messages ", text, flags=re.IGNORECASE)
    text = re.sub(r" demonitization ",
                  " demonetization ",
                  text,
                  flags=re.IGNORECASE)
    text = re.sub(r" actived ", " active ", text, flags=re.IGNORECASE)
    text = re.sub(r" kms ", " kilometers ", text, flags=re.IGNORECASE)
    text = re.sub(r" cs ", " computer science ", text, flags=re.IGNORECASE)
    text = re.sub(r" upvote", " up vote", text, flags=re.IGNORECASE)
    text = re.sub(r" iPhone ", " phone ", text, flags=re.IGNORECASE)
    text = re.sub(r" \0rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(r" calender ", " calendar ", text, flags=re.IGNORECASE)
    text = re.sub(r" ios ", " operating system ", text, flags=re.IGNORECASE)
    text = re.sub(r" gps ", " GPS ", text, flags=re.IGNORECASE)
    text = re.sub(r" gst ", " GST ", text, flags=re.IGNORECASE)
    text = re.sub(r" programing ", " programming ", text, flags=re.IGNORECASE)
    text = re.sub(r" bestfriend ", " best friend ", text, flags=re.IGNORECASE)
    text = re.sub(r" dna ", " DNA ", text, flags=re.IGNORECASE)
    text = re.sub(r" III ", " 3 ", text)
    text = re.sub(r" banglore ", " Banglore ", text, flags=re.IGNORECASE)
    text = re.sub(r" J K ", " JK ", text, flags=re.IGNORECASE)
    text = re.sub(r" J\.K\. ", " JK ", text, flags=re.IGNORECASE)

    # typos identified with my eyes

    text = re.sub(r" quikly ", " quickly ", text)
    text = re.sub(r" unseccessful ", " unsuccessful ", text)
    text = re.sub(r" demoniti[\S]+ ",
                  " demonetization ",
                  text,
                  flags=re.IGNORECASE)
    text = re.sub(r" demoneti[\S]+ ",
                  " demonetization ",
                  text,
                  flags=re.IGNORECASE)
    text = re.sub(r" addmision ", " admission ", text)
    text = re.sub(r" insititute ", " institute ", text)
    text = re.sub(r" connectionn ", " connection ", text)
    text = re.sub(r" permantley ", " permanently ", text)
    text = re.sub(r" sylabus ", " syllabus ", text)
    text = re.sub(r" sequrity ", " security ", text)
    text = re.sub(r" undergraduation ", " undergraduate ",
                  text)  # not typo, but GloVe can't find it
    text = re.sub(r"(?=[a-zA-Z])ig ", "ing ", text)
    text = re.sub(r" latop", " laptop", text)
    text = re.sub(r" programmning ", " programming ", text)
    text = re.sub(r" begineer ", " beginner ", text)
    text = re.sub(r" qoura ", " Quora ", text)
    text = re.sub(r" wtiter ", " writer ", text)
    text = re.sub(r" litrate ", " literate ", text)

    # for words like A-B-C-D or "A B C D",
    # if A, B, C, D individually have vectors in GloVe:
    #     they can be treated as separate words
    # else:
    #     replace the whole thing with a single special word; A_B_C_D is enough, we'll deal with that word later
    #
    # Testcase: 'a 3-year-old 4 -tier car'

    def dash_dealer(pattern):
        matched_string = pattern.group(0)
        parts = [p.strip() for p in matched_string.split('-') if p.strip()]
        joined = ' '.join(parts)
        parsed = NLP(joined)  # NLP: loaded spaCy model, see the sketch below
        for token in parsed:
            # if any part is not a common word, fuse the parts into one token
            if not token.has_vector or token.text in SPECIAL_TOKENS.values():
                return '_'.join(parts)
        # if all parts are common words, keep them as separate words
        return joined

    text = re.sub("[a-zA-Z0-9\-]*-[a-zA-Z0-9\-]*", dash_dealer, text)

    # try to see if sentence between quotes is meaningful
    # rule:
    #     if exist at least one word is "not number" and "length longer than 2" and "it can be identified by SpaCy":
    #         then consider the string is meaningful
    #     else:
    #         replace the string with a special word, i.e. quoted_item
    # Testcase:
    # i am a good (programmer)      -> i am a good programmer
    # i am a good (programmererer)  -> i am a good quoted_item
    # i am "i am a"                 -> i am quoted_item
    # i am "i am a programmer"      -> i am i am a programmer
    # i am "i am a programmererer"  -> i am quoted_item

    def quoted_string_parser(pattern):
        string = pattern.group(0)
        parsed = NLP(string[1:-1])
        is_meaningful = False
        for token in parsed:
            # if one of the token is meaningful, we'll consider the full string is meaningful
            if (len(token.text) > 2 and not token.text.isdigit()
                    and token.has_vector):
                is_meaningful = True
            elif token.text in SPECIAL_TOKENS.values():
                is_meaningful = True

        if is_meaningful:
            return string
        else:
            return pad_str(string[0]) + SPECIAL_TOKENS['quoted'] + pad_str(
                string[-1])

    text = re.sub('\".*\"', quoted_string_parser, text)
    text = re.sub("\'.*\'", quoted_string_parser, text)
    text = re.sub("\(.*\)", quoted_string_parser, text)
    text = re.sub("\[.*\]", quoted_string_parser, text)
    text = re.sub("\{.*\}", quoted_string_parser, text)
    text = re.sub("\<.*\>", quoted_string_parser, text)

    text = re.sub('[\(\)\[\]\{\}\<\>\'\"]', pad_pattern, text)

    # a lone 's' at this stage is almost always cleaning residue, just kill it
    text = re.sub(' s ', " ", text)

    # reduce extra spaces into single spaces
    text = re.sub('[\s]+', " ", text)
    text = text.strip()

    return text
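
# clean_string relies on two globals the snippet never defines: SPECIAL_TOKENS
# and a loaded spaCy model (written both as `nlp` and `NLP` in the original,
# normalized to NLP above). A hedged setup sketch; only the dict keys are
# confirmed by the code, the token values are assumptions:
import spacy

SPECIAL_TOKENS = {
    'quoted': 'quoted_item',        # value suggested by the comment above
    'non-ascii': 'non_ascii_word',  # assumed placeholder
}
NLP = spacy.load('en_core_web_md')  # needs word vectors for token.has_vector

print(clean_string('a 3-year-old 4 -tier car'))  # the dash_dealer testcase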
Example #4

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9_\s]+', '', text)
    text = text.strip(' ')
    return text
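
# One-line check of the simple cleaner above:
print(clean_text("Hello, World!!  "))  # -> 'hello world'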
Example #5

import numpy as np


def cloak_textfooler(text,
                     classifier_func,
                     importance_func=importance_scores,
                     select_func=lambda x, y: select_non_stopword(x),
                     synonym_func=None,
                     candidate_word_filter=None,
                     candidate_sentence_filter=None,
                     sim_func=None,
                     sim_threshold=0.8):
  text = text.strip() # Note that we're going to be splitting and rejoining this a lot, so all whitespace is equivalent
  # Determine importance scores for each word in the text.
  importance = importance_func(text, classifier_func)
  words = text.split()

  # Cull wordlist.
  selected_words_indexes = list()
  selected_words_importance = list()
  for word_id, word in enumerate(words):
    if select_func(word, importance[word_id]):
      selected_words_indexes.append(word_id)
      selected_words_importance.append(importance[word_id])

  # Sort wordlist by importance.
  sorted_word_indexes = [
      x for _, x in sorted(
          zip(selected_words_importance, selected_words_indexes),
          reverse=True)
  ]
  
  # First, find our baseline prediction.
  current_text = ' '.join(words)
  orig_probs = classifier_func(current_text)
  orig_class = np.argmax(orig_probs)
  # MAIN LOOP:
  # For each word, in sorted order, identify synonyms, find the best candidate, and replace that word with it.
  # If we're able to change the predicted class, break. If not, keep our replacement, and keep going.
  for word_index in sorted_word_indexes:
    # Expand word into a set of candidates.
    word = words[word_index]
    candidates = synonym_func(word)

    # If we were provided a candidate filter at the word level, apply that now.
    if candidate_word_filter:
      filtered_candidates = list()
      for candidate in candidates:
        if candidate_word_filter(word,candidate):
          filtered_candidates.append(candidate)
      candidates = filtered_candidates

    # For each candidate, try replacing the word with that candidate. Remove candidate if it fails similarity test.
    final_candidates = list()
    final_class = list()
    final_probs = list()
    for candidate in candidates:
      new_text = ' '.join(words[:word_index] + [candidate] + words[word_index+1:])
      # If we were provided a sentence level candidate filter, apply that now.
      if candidate_sentence_filter:
        if not candidate_sentence_filter(text,new_text):
          continue
      # Assuming we passed the filter, check the similarity threshold.
      if sim_func(text, new_text) > sim_threshold:
        # If we pass, add this candidate to the final listing, along with its class probabilities and assignment.
        final_candidates.append(candidate)
        cand_probs = classifier_func(new_text)
        final_probs.append(cand_probs)
        final_class.append(np.argmax(cand_probs))

    # If no candidates remain, skip this word and keep looping.
    if len(final_candidates) == 0:
      continue

    # Find all candidates that break the class. Keep the full candidate
    # sentence, not just the word, so we can score and return it directly.
    class_breakers = list()
    for candidate, cand_class in zip(final_candidates, final_class):
      if cand_class != orig_class:
        class_breakers.append(' '.join(
            words[:word_index] + [candidate] + words[word_index+1:]))
    # If any candidate changes the class, return the candidate sentence
    # most similar to the original text.
    if len(class_breakers) > 0:
      sims = list()
      for cand_text in class_breakers:
        sims.append(sim_func(text, cand_text))
      return class_breakers[np.argmax(sims)]
    

    # Otherwise, select the remaining candidate that maximizes probability damage.
    damage = list()
    for probs in final_probs:
      damage.append(orig_probs[orig_class] - probs[orig_class])
    # If no candidate improves our odds, leave the word as is. Otherwise,
    # commit the replacement into `words` so later substitutions build on it.
    if max(damage) > 0:
      words[word_index] = final_candidates[np.argmax(damage)]
      current_text = ' '.join(words)

    # Keep looping.

  # If we get to the end of the loop without breaking, return what we've got as a failure.
  print("Unable to find successful attack against sentence:")
  print(text)
  print("Best candidate:")
  print(current_text)
  return current_text
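
# A runnable smoke test with toy stand-ins. Everything below is invented for
# illustration: a "classifier" that dislikes the word "good", leave-one-out
# importance scores, a tiny synonym table, and Jaccard word overlap as the
# similarity function. The real module supplies importance_scores and
# select_non_stopword.
def toy_classifier(text):
    p = 0.9 if 'good' in text.split() else 0.1
    return np.array([1 - p, p])


def toy_importance(text, classifier_func):
    # importance of a word = drop in original-class probability when removed
    base = classifier_func(text)
    cls = np.argmax(base)
    words = text.split()
    return [
        base[cls] - classifier_func(' '.join(words[:i] + words[i + 1:]))[cls]
        for i in range(len(words))
    ]


def toy_sim(a, b):
    sa, sb = set(a.split()), set(b.split())
    return len(sa & sb) / max(len(sa | sb), 1)


SYNONYMS = {'good': ['fine', 'nice'], 'movie': ['film']}

print(cloak_textfooler('a good movie',
                       toy_classifier,
                       importance_func=toy_importance,
                       select_func=lambda w, s: True,
                       synonym_func=lambda w: SYNONYMS.get(w, []),
                       sim_func=toy_sim,
                       sim_threshold=0.3))  # -> 'a fine movie'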
Example #6

import re
import string

from bs4 import BeautifulSoup


def clean_html_and_extract_text(raw_html):
    '''
       Clean an html string that comes from the "cleaned_value" column
    '''

    ## use regular expressions to remove roman numerals inside brackets
    ## e.g. (iv), (ix) etc. (note: [vix], not [v|i|x]; '|' is literal
    ## inside a character class)
    raw_html = re.sub(r'\([vix]+\)', '', raw_html)

    ## clear off the non ascii characters, remove the html tags
    ## and get just the text from the document
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    _cleantext = BeautifulSoup(raw_html, 'lxml')
    for e in _cleantext.findAll('br'):
        e.replace_with(" ")
    _cleantext = _cleantext.getText(separator=u' ')
    cleantext = _cleantext

    cleantext = " ".join(cleantext.split())
    cleantext = ''.join(x for x in cleantext if x in string.printable)

    ## clear off punctuations in the text by mapping every punctuation
    ## character to a space
    ## Ref: https://stackoverflow.com/questions/42614458/how-to-replace-punctuation-with-whitespace
    punc_list = list(string.punctuation)
    translator = cleantext.maketrans(dict.fromkeys(punc_list, " "))
    cleantext = cleantext.lower().translate(translator)

    ## normalize spaces between words and lowercase everything
    ## (digits are stripped further below)
    cleantext = " ".join(
        [tok for tok in cleantext.split(" ") if tok.strip() != ""]).lower()

    ## remove any non-printable (non-ascii) characters in the text
    printable = set(string.printable)
    cleantext = list(filter(lambda x: x in printable, cleantext))
    cleantext = "".join(cleantext)

    ## remove roman numerals which are not in brackets, plus stray
    ## punctuation tokens; compare against bare tokens since split()
    ## already removes the surrounding whitespace
    toremove = {
        'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
        '!', '@', '#', '$', '%', '^', '&', '*', '$.'
    }
    cleantext = " ".join(
        word for word in cleantext.split() if word not in toremove)

    ## clear off all arabic numerals / digits in the text, including ones
    ## attached to words
    cleantext = re.sub(r'\d+', ' ', cleantext)

    cleantext = re.sub(' +', ' ', cleantext)

    return cleantext.strip()

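
# A quick check of the cleaner above; the sample markup is invented, and lxml
# must be installed for the BeautifulSoup parser used here.
sample = "<p>Revenue grew 12% in 2019 (iv)<br>see Note 7</p>"
print(clean_html_and_extract_text(sample))  # -> 'revenue grew in see note'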
# Trailing fragment of the script behind the first example: the same
# hashtag/mention extraction, inlined at module level. Its opening lines were
# cut off in the listing, so the scaffolding below (sample text, list setup,
# loop headers) is a minimal reconstruction; the extraction body is original.
text = "RT @someone: craving #pizza and #sushi tonight"  # invented sample
hashtags = []
user_mentions = []
for character in ['#', '@']:
    count_character = text.count(character)
    while count_character > 0:
        start = text.find(character)
        end = text.find(" ", start)
        if end == -1:
            end = len(text)
        text_to_remove = text[start:end]
        if character == "#":
            hashtags.append(text_to_remove)
        elif character == "@":
            user_mentions.append(text_to_remove)
        text = text.replace(text_to_remove, "")
        text = ' '.join(text.split())
        count_character = text.count(character)
text = text.strip(' ')
text = ' '.join(text.split())
print(text)
print(user_mentions)
print(hashtags)

import codecs
import os

import food_detection_root  # project-local module exposing ROOT_DIR

path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt",
                                  encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
what_food_list_file.close()
hashtags_with_what_words = []
for hashtag in hashtags:
    for word in what_food_list:
        if word in hashtag:
            print(word)
            hashtags_with_what_words.append(hashtag)