Example #1
 def remove_punctuations_and_expressions(text_non_char):
     normalize_punctuations = ' '.join([
         word.lstrip(punctuation.replace(".",
                                         "")).rstrip(punctuation).strip()
         for word in text_non_char.split() if word not in punctuation
     ])
     normalize_punctuations = ' '.join([
         re.sub("[{" + punctuation.replace("-", "").replace(".", "") + "}]",
                " ", word) for word in normalize_punctuations.split()
     ])
     return normalize_punctuations
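A quick illustrative trace (input chosen here, not from the original source; treating the def above as a plain module-level function): edge punctuation is stripped first, then remaining inner punctuation other than hyphens and periods is blanked out.

text = "Hello, world!! It's (great)."
print(remove_punctuations_and_expressions(text))
# -> Hello world It s great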
Example #2
def load_dataset(path_xml):
    """Loads dataset into memory from xml file"""
    def get_label(c, length):
        s = ""
        for _ in range(length):
            s += c
        return s

    DOMTree = xml.dom.minidom.parse(path_xml)
    collection = DOMTree.documentElement
    sents = collection.getElementsByTagName("sentence")
    LABEL = ['B', 'I']

    dataset = []
    for sent in sents:
        text = sent.getElementsByTagName(
            'text')[0].childNodes[0].data  # type: str
        text = text.lower()
        tags_text = text

        aspects = sent.getElementsByTagName('aspectTerm')

        for aspect in aspects:
            term = aspect.getAttribute('term')  # type: str
            start = int(aspect.getAttribute('from'))
            end = int(aspect.getAttribute('to'))

            tokens = term.split()
            ttags = [
                get_label('B', len(tokens[i])) if i == 0 else get_label(
                    'I', len(tokens[i])) for i in range(len(tokens))
            ]
            tags_str = " ".join(ttags)

            tags_text = tags_text[:start] + tags_str + tags_text[end:]

        tags_text = "".join(c for c in tags_text
                            if c not in punctuation.replace('\'', ''))
        tags = tags_text.split()
        tags = [tag[0] if tag[0] in LABEL else 'O' for tag in tags]
        text = "".join(c for c in text
                       if c not in punctuation.replace('\'', ''))
        words = text.split()

        assert len(tags) == len(words)

        dataset.append((words, tags))

    return dataset
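For reference, a minimal sketch of the input this loader expects (SemEval ABSA-style XML is assumed from the tag names; the sentence and file name are illustrative) and the pair it would yield:

# <sentences>
#   <sentence id="1">
#     <text>The battery life is great</text>
#     <aspectTerms>
#       <aspectTerm term="battery life" from="4" to="16"/>
#     </aspectTerms>
#   </sentence>
# </sentences>
#
# load_dataset("reviews.xml")
# -> [(['the', 'battery', 'life', 'is', 'great'],
#      ['O', 'B', 'I', 'O', 'O'])]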
Example #3
def slugify(string):
    '''
    Slugify a string. For example "Hello world!" becomes "hello-world"

    - Punctuation is removed.
    - White spaces are replaced by a hyphen.
    - Letters are lowercased.

    Args:
        string (str): The string to slugify.

    Returns:
        str: The slugified string.
    '''
    # Lowercase
    string = string.lower()
    # Convert to ASCII characters
    string = unidecode(string)
    # Remove punctuation except for hyphens
    puncs = punctuation.replace('-', '')
    for punc in puncs:
        string = string.replace(punc, '')
    # Replace spaces with a single hyphen
    string = re.sub(r'\s+', '-', string)
    # Replace multiple hyphens with a single hyphen
    string = re.sub(r'\-+', '-', string)
    # Remove trailing hyphens
    string = string.strip('-')
    return string
Example #4
    def myAtoi(self, str):
        """
        :type str: str
        :rtype: int
        """
        if not str:
            return None

        start = 0
        for i, char in enumerate(str):
            if char != ' ':
                start = i
                break

        if start == len(str)-1:
            return None
        else:
            str = str[start:]


        # every punctuation char except '+' and '-' is invalid as a first character
        not_valid = punctuation.replace('+', '').replace('-', '')
        if str[0] in not_valid:
            return None

        # Parse an optional sign followed by consecutive digits.
        sign = -1 if str[0] == '-' else 1
        if str[0] in '+-':
            str = str[1:]

        digits = ''
        for char in str:
            if not char.isdigit():
                break
            digits += char

        if not digits:
            return None

        return sign * int(digits)
Example #5
def get_ipa(word):
    
    transcription = search_ipa(word)
    # If not found, remove punctuation (except hyphens) and try again.
    if (not transcription) and (any(char in word for char in punctuation)):
        no_punc = re.sub("[%s]" % punctuation.replace("-", ""), "", word)
        transcription = search_ipa(no_punc)
        # If not found, collapse a hyphenated word into one and try again
        if (not transcription) and ("-" in no_punc):
            no_punc_one_word = no_punc.replace("-", "")
            transcription = search_ipa(no_punc_one_word)
            # If not found, try it with a space instead of hyphen.
            if not transcription:
                no_punc_with_space = no_punc.replace("-", " ")
                transcription = search_ipa(no_punc_with_space)
                # If not found, split where the hyphens are and look up the words separately.
                if not transcription:
                    word_split = no_punc.split("-")
                    transcription = []
                    for item in word_split:
                        ipa = search_ipa(item)
                        # Add each part to the transcription. First chop each off at the comma if any.
                        if ipa[0].find(",") != -1:
                            transcription.append(ipa[0][0:ipa[0].index(",")] + " /")
                        else:
                            transcription.append(ipa[0])
                    transcription = ["".join(transcription)]

    return transcription
Example #6
def Categories(category=None):
    categories = User.getCategories()
    if category is None:
        # print(categories)
        for cat in categories:
            image = cat.get('image')
            cat.update({'image': compressImage(image)})
            print(categories[0].get('category'))
        return render_template('category.html',
                               categories=categories,
                               enumerate=enumerate)

    if any(char in punctuation.replace('-', '') for char in category):
        flash('Invalid Category!')
        print(category)
        return redirect('/categories', 302)

    cats = User.getCategories(category)
    if cats:
        imgs = [i for i in cats]
    else:
        flash('Category not found!')
        return redirect('/')

    if imgs:
        return render_template('imgByCat.html', category=category)
    else:
        flash('Category not Found!')
        return redirect('/categories', 302)
Example #7
def row_clean(row):
    # Remove HTML special entities (e.g. &)
    row_no_special_entities = re.sub(r'\&\w*;', '', row)
    # Remove tickers (Clickable stock market symbols that work like hashtags and start with dollar signs instead)
    row_no_tickers = re.sub(r'\$\w*', '',
                            row_no_special_entities)  # Substitute. $ needs to be escaped because it means something in regex. \w means alphanumeric char or underscore.
    # Remove hyperlinks
    row_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', row_no_tickers)
    # Remove hashtags
    row_no_hashtags = re.sub(r'#\w*', '', row_no_hyperlinks)
    # Remove Punctuation and split 's, 't, 've with a space for filter
    row_no_punctuation = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', row_no_hashtags)
    # Remove words with 2 or fewer letters (Also takes care of RT)
    row_no_small_words = re.sub(r'\b\w{1,2}\b', '', row_no_punctuation)  # \b represents a word boundary
    # Remove whitespace (including new line characters)
    row_no_whitespace = re.sub(r'\s\s+', ' ', row_no_small_words)

    row_no_whitespace = row_no_whitespace.lstrip(' ')  # Remove single space left on the left
    # Remove •
    row_no_ball = re.sub(r'•', ' ', row_no_whitespace)
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    row_no_emojis = ''.join(c for c in row_no_ball if
                            c <= '\uFFFF')  # Apart from emojis (plane 1), this also removes historic scripts and mathematical alphanumerics (also plane 1), ideographs (plane 2) and more.
    # Tokenize: Reduce length and remove handles
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=True,
                           strip_handles=True)  # reduce_len changes, for example, waaaaaayyyy to waaayyy.
    tw_list = tknzr.tokenize(row_no_emojis)
    # Remove stopwords
    list_no_stopwords = [i for i in tw_list if i not in cache_english_stopwords]
    # Final filtered row
    row_filtered = ' '.join(list_no_stopwords)  # ''.join() would join without spaces between words.
    return row_filtered
Example #8
def processTweet(tweet):
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # Remove @usernames
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove Arabic letters
    tweet = re.sub(r'[أبتثجحخدذرزسشصضطظعغفقكلمنهـوي]', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ') 
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uFFFF') 
    return tweet
Example #9
 def convert_to_file_name(self, text):
     sentence = str(text).replace(" ", "_")
     my_punctuation = punctuation.replace("_", "")
     sentence = (sentence.translate(str.maketrans("", "",
                                                  my_punctuation))).lower()
     file_name = sentence + self.sound_file_extension
     return file_name
Example #10
def clean_lyrics(lyrics, song_per_line):
    # punctuation string: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    punctuation_regex = re.compile('[{}]'.format(punctuation.replace("'", "")))
    lyrics = punctuation_regex.sub(' ', lyrics)
    lyrics = re.sub(r"''+", " ", lyrics)
    lyrics = re.sub(r"'", "", lyrics)
    lyrics = re.sub(
        r"\\uFFFD", '_',
        lyrics)  #replaces unicode unknown characters with underscores
    lyrics = re.sub('\x00', "", lyrics)  #remove NUL bytes
    lyrics = lyrics.lower()
    lyrics = re.sub(r"\\r", ' ', lyrics)  #remove line ending symbols
    lyrics = re.sub(r"\\n", ' ', lyrics)  #remove line ending symbols
    lyrics = re.sub(r"[^a-z]_+[^a-z]", ' ',
                    lyrics)  #remove all underscores not preceded by a letter
    lyrics = re.sub(r"\\", ' ', lyrics)  #remove backslashes
    lyrics = re.sub('  +', ' ', lyrics)  #remove surplus spaces

    lyrics = replace_numbers(lyrics)

    lyrics = re.sub('\r', '\n', lyrics)  #to be on the safe side of things.
    lyrics = re.sub('\n +', '\n', lyrics)
    lyrics = re.sub("\n\n+", '\n', lyrics)  #remove empty lines
    if song_per_line:
        # lyrics = re.sub('\n', ' ', lyrics) #remove all brakes between lines within one song
        lyrics = re.sub('\n', '.\n',
                        lyrics)  #replace all breaks with breaks with dots
        lyrics = re.sub('  +', ' ', lyrics)  #remove surplus spaces
        lyrics = re.sub(' \.', '.', lyrics)  #remove surplus spaces

    return lyrics.strip()  #remove spare start end spaces
Example #11
 def __init__(self):
     # read stopwords with a context manager so the file handle is closed
     with open("./data/ro_stopwords.txt", "r", encoding="utf-8") as f:
         self.stopwords = [line.strip() for line in f]
     self.punctuation = punctuation.replace("$", "") + "0123456789"
Example #12
 def add_sample(self, sample):
     if not isinstance(sample, str):
         raise TypeError
     # Calling add_sample should replace existing sample.
     # To avoid appending new values onto existing lists:
     self.sample = sample
     self.misspelled_words = []
     self.tokenized_sample = []
     self.tagged_sample = {}
     sample = sample.replace('\n', " ")
     sample = sample.rstrip(" ")
     for char in punctuation.replace("'", ""):
         sample = sample.replace(char, "")
     tokens = word_tokenize(sample)
     for word in tokens:
         if word.lower() in words.words():
             self.tokenized_sample.append(word)
         elif word.capitalize() in names.words():
             continue
         elif "'" in word:
             self.tokenized_sample.append(word)
         elif LEMMATIZER.lemmatize(word.lower()) not in words.words():
             if STEMMER.stem(word.lower()) not in words.words():
                 self.misspelled_words.append(word)
         else:
             self.tokenized_sample.append(word)
     self.tagged_sample = pos_tag(tokens)
Example #13
 def render(self, context):
     from string import punctuation
     from django.template.defaultfilters import dictsortreversed
     try:
       removelist = ["a", "an", "as", "at", "but", "by", "for", "from",
                     "is", "in", "into", "of", "off", "on", "onto", "per",
                     "since", "than", "the", "this", "that", "to", "up", "via",
                     "with", "and", "it", "be", "was", "i","you","me","my","is","so",
                     "some","it's","its","are","if","some","there", "what","just", ""]
     
       cat_statuses = ""
       for status in Status.objects.all():
         cat_statuses += status.body + " " 
       wordlist = cat_statuses.split()
       punctuation = punctuation.replace('@', '')
       wordlist = [word.strip(punctuation).lower() for word in wordlist]
       wordfreq = [wordlist.count(p) for p in wordlist]
       dictionary = dict(zip(wordlist,wordfreq))
       word_dict_list = []
       for key in dictionary:
         if key not in removelist and not key.startswith('@'):
           word_dict_list.append({ 'name': key, 'count': dictionary[key] })
       context[self.varname] = dictsortreversed(word_dict_list, 'count')[:int(self.num)]
     except:
       pass
     return ''
Example #14
def cleanTextFiles (data_folder, text_files = [], stop_words = {}):

    dictionary_list = []

    #loop through each text file, read and clean
    for i in range(0, len(text_files)):

        file_path = os.path.join(data_folder, text_files[i])
        file_open = open(file_path, encoding="utf-8-sig").read()

        #remove punctuation leave apostrophes, convert to lowercase and split
        my_punctuation = punctuation.replace("'", "")
        text = file_open.translate(str.maketrans("", "", my_punctuation))
        file_strings = text.lower().split()

        #clean file_strings from stop words
        cleaner = TextCleaner()
        updated_file_strings = cleaner.compareRemove(stop_words, file_strings)

        #lemmatize updated_file_strings
        lemmtzr = nltk.stem.wordnet.WordNetLemmatizer()
        lemmas = [lemmtzr.lemmatize(token) for token in updated_file_strings]

        #produce a dictionary with word occurrence counts
        counter = TextCounter()
        string_dictionary = counter.countElements(lemmas)

        dictionary_list.append(string_dictionary)

    return dictionary_list
Example #15
def processTweet(tweet):
    from string import punctuation
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    #Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # remove numbers
    tweet = re.sub(r'\d+', ' ', tweet)
    tweet = re.sub(r'([a-z])([A-Z])', '\\1 \\2', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https:\/\/t.co\/.{9}', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#', ' ', tweet)
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet)
    tweet = re.sub(r'[^\w\s]', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    return tweet
Example #16
def config_entries(dir=os.getcwd()):
    """
        retrieve information from config file
        if there's no config.ini in provided directory: copy config.template.ini
        generate secret key if none is found in config.ini
    """
    config = configparser.ConfigParser()
    if 'config.ini' not in os.listdir(dir):
        # copy template file if config.ini doesn't exist
        print('creating config file')
        copyfile(dir + '/config.template.ini', dir + '/config.ini')
    config.read(dir + '/config.ini')

    host = config['server']['host']
    port = config['server']['port']

    try:
        # run exception if secret-key is not found or empty
        s_key = config['server']['secret-key']
        if len(s_key) == 0:
            raise Exception('invalid secret key')
    except:
        print('generating secret key')
        # generate secret key (16 characters)
        chars = ascii_letters + digits + punctuation.replace('%', '')
        s_key = ''.join(choices(chars, k=16))
        # write secret key to config file
        config['server']['secret-key'] = s_key
        with open('config.ini', 'w') as configfile:
            config.write(configfile)

    return {'secret-key': s_key, 'host': host, 'port': port}
Example #17
    def __init__(self,
                 index: 'Index',
                 or_operator: str = 'OR',
                 and_operator: str = 'AND',
                 difference_operator: str = 'NOT',
                 punctuation: str = None):
        '''
        Parameters:
        ------------
        index: an Index whose docmap maps query tokens to sets
               of CordDoc objects
        '''
        self.index = index
        self.token_to_set = self.index.docmap

        self.or_operator = or_operator
        self.and_operator = and_operator
        self.difference_operator = difference_operator

        self.operators = {
            self.or_operator, self.and_operator, self.difference_operator
        }

        self.punctuation = punctuation
        if self.punctuation is None:
            self.punctuation = PUNCTUATION.replace('(', '').replace(')', '')
Example #18
    def from_dict(self, d):
        """
        Create a Pipeline from a dictionary. The change is in inplace.

        :argument: python dictionary
        :return: None
        """

        if 'uid' in d:
            if d['uid']:
                self._uid = d['uid']

        if 'name' in d:
            if d['name']:
                invalid_symbols = punctuation.replace('.','')
                if not isinstance(d['name'], str):
                    raise ree.TypeError(expected_type=str,
                                        actual_type=type(d['name']))

                if any(symbol in d['name'] for symbol in invalid_symbols):
                    raise ree.ValueError(obj=self._uid,
                                        attribute='name',
                                        actual_value=d['name'],
                                        expected_value="Valid object names can " +
                                        "contain letters, numbers and '.'. Any "
                                        "other character is not allowed")

                self._name = d['name']

        if 'state' in d:
            if isinstance(d['state'], str):
                if d['state'] in list(states._pipeline_state_values.keys()):
                    self._state = d['state']
                else:
                    raise ree.ValueError(obj=self._uid,
                                     attribute='state',
                                     expected_value=list(states._pipeline_state_values.keys()),
                                     actual_value=d['state'])
            else:
                raise ree.TypeError(entity='state', expected_type=str,
                                actual_type=type(d['state']))

        else:
            self._state = states.INITIAL

        if 'state_history' in d:
            if isinstance(d['state_history'], list):
                self._state_history = d['state_history']
            else:
                raise ree.TypeError(entity='state_history', expected_type=list, actual_type=type(
                    d['state_history']))

        if 'completed' in d:
            if isinstance(d['completed'], bool):
                if d['completed']:
                    self._completed_flag.set()
            else:
                raise ree.TypeError(entity='completed', expected_type=bool,
                                actual_type=type(d['completed']))
Example #19
    def repl(x):
        # Replace every punctuation char except '-', '_', '.' and '@'
        # with a space.
        to_remove = punctuation.replace('-', '').replace('_', '').replace(
            '.', '').replace('@', '')

        for p in to_remove:
            x = x.replace(p, ' ')
        return x.strip()
Example #20
def count_words(sentence):
    # exclude ' from the punctuation set, as it's the one char we want to keep inside a word
    for char in punctuation.replace("'", ""):
        sentence = sentence.lower().replace(char, " ")

    split_words = [item.strip("'") for item in sentence.split()]

    return {item: split_words.count(item) for item in split_words}
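A hypothetical call (not from the original source), showing that apostrophes survive inside words while all other punctuation becomes a word boundary:

print(count_words("Don't stop, don't stop!"))
# -> {"don't": 2, 'stop': 2}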
Example #21
    def __init__(self):
        # List of symbols we don't care about
        self.SYMBOLS = " ".join(punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

        # tools for splitting text
        # TODO(krzum) add support for this in py3
        self.punctuation = punctuation.replace("'",'').replace('"','')
        self.trans = maketrans(punctuation, ' '*len(punctuation))
Example #22
def tokenize(doc, keep_internal_punct=False):
    """

    Tokenize a string.

    The string should be converted to lowercase.

    If keep_internal_punct is False, then return only the alphanumerics (letters, numbers and underscore).

    If keep_internal_punct is True, then also retain punctuation that

    is inside of a word. E.g., in the example below, the token "isn't"

    is maintained when keep_internal_punct=True; otherwise, it is

    split into "isn" and "t" tokens.



    Params:

      doc....a string.

      keep_internal_punct...see above

    Returns:

      a numpy array containing the resulting tokens.



    >>> tokenize(" Hi there! Isn't this fun?", keep_internal_punct=False)

    array(['hi', 'there', 'isn', 't', 'this', 'fun'], 

          dtype='<U5')

    >>> tokenize("Hi there! Isn't this fun? ", keep_internal_punct=True)

    array(['hi', 'there', "isn't", 'this', 'fun'], 

          dtype='<U5')

    """

    doc = doc.lower()
    replaceunderscore = punctuation.replace("_", "")
    if keep_internal_punct:
        token = ' '.join(
            filter(None, (divide.strip(replaceunderscore)
                          for divide in doc.split())))
        token = re.sub(r'\s+', " ", token).split()
    else:
        token = re.sub(r"[^\w]", " ", doc).split()
    return np.array(token, dtype="unicode")
Example #23
def text_reg(word):
    remove = punctuation.replace('-', '').replace('<', '').replace('>', '')
    word = word.lower().replace("'s", "").replace("n't", "").replace('/n', '').replace('/v', '')
    word = word.translate(str.maketrans('', '', remove))
    if word in ('don','hasn','hadn','shouldn','couldn','wouldn','shan','weren',
                'didn','haven','isn','needn','aren','mustn','doesn','mightn','wasn','ain'):
        word = word.rstrip('n')

    return word
Example #24
 def text_reg(word):
     remove = punctuation.replace('-', '').replace('<', '').replace('>', '')
     #remove = punctuation.replace('<', '').replace('>', '')
     word = word.lower().replace("'s", "").replace("n't", "").replace(
         '/n', '').replace('/v', '')
     word = word.translate(str.maketrans('', '', remove))
     word = word.replace('-', ' ')
     word = word.replace('—', ' ')
     return word
Example #25
def tokenise(review):
    # punctuation string: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    punctuation_regex = re.compile('[{}]'.format(punctuation.replace("'", "")))
    review = review.strip()
    review = punctuation_regex.sub(' ', review)
    review = re.sub(r"'+", "", review)
    tokens = review.lower().split()
    tokens = ['6'*len(token) if token.isdigit() else token for token in tokens if token not in stopwords]
    return tokens
Example #26
def valid_selection(start, end):
	from string import punctuation, whitespace
	punctuation = punctuation.replace("_", "")
	valid_characters = punctuation + whitespace
	if not (end.get_char() in valid_characters): return False
	if start.starts_line(): return True
	start.backward_char()
	if not (start.get_char() in valid_characters): return False
	return True
Example #27
def strong_password(size=10):
  nbig = size // 5
  nsmall = size // 3
  v = ([choice(ascii_uppercase) for x in range(nbig)] +
        [choice(ascii_lowercase) for x in range(nsmall)] +
        [choice(digits)          for x in range(size // 2)] +
        [choice(punctuation.replace('|', '-')) for x in range(size // 5)])
  shuffle(v)
  return ''.join(v[:size])
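A quick sanity check (illustrative; the password itself is random):

pw = strong_password(12)
print(len(pw), pw)  # e.g. 12 T-kq49LcA0m7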
Example #28
 def __init__(self):
     self.stop_words = stopwords.words('english')
     self.stop_words.extend([
         "rt", "n't", "'re", "gon", "na", "covid", "coronavirus", "covid-19"
     ])
     self.punctuation_to_remove = punctuation.replace('#', '').replace(
         '@', '').replace('%', '').replace('$', '')
     self.symbols = "<>:\"/\\|!?*~.'`-_()^,+=;"
     self.token_stemmer = stemmer.Stemmer()
Example #29
def snake_case(string):
    'Convert a string to snake_case.'
    string = re.sub('(.)([A-Z][a-z])', r'\1_\2', string)  # *Aa -> *_Aa
    string = re.sub('(.)([0-9]+)', r'\1_\2', string)  # *00 -> *_00
    string = string.replace('-', '_')  # A-B -> A_B
    string = ''.join(c for c in string if c not in punctuation.replace('_', ''))
    string = string.replace(' ', '_').strip('_').lower()  # _A B -> a_b
    string = re.sub('_+', '_', string)  # a__b -> a_b
    return string
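A sample conversion (input chosen here for illustration):

print(snake_case('HelloWorld-Example 42'))  # -> hello_world_example_42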
Example #30
def build_vocab(frame):
    p = punct.replace('.', '')
    d = dict.fromkeys(p, ' ')
    tr = str.maketrans(d)
    text = frame.body.explode()
    text = str_app(text, 'translate', tr)
    text = str_app(text, 'replace', '\s+', ' ', regex=True)
    text = str_app(text, 'lower')
    text = str_app(text, 'split').explode()
    return text
Example #31
def tweet_clean(tweet):
    #print('Original tweet:', tweet, '\n')
    # Remove HTML special entities (e.g. &amp;)
    tweet_no_special_entities = re.sub(r'\&\w*;', '', tweet)
    tweet_no_special_entities = re.sub(r'\@\w*;', '', tweet)
    #print('No special entitites:', tweet_no_special_entities, '\n')
    # Remove tickers
    tweet_no_tickers = re.sub(r'\$\w*', '', tweet_no_special_entities)
    #print('No tickers:', tweet_no_tickers, '\n')
    # Remove hyperlinks
    tweet_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', tweet_no_tickers)
    #print('No hyperlinks:', tweet_no_hyperlinks, '\n')
    # Remove hashtags
    tweet_no_hashtags = re.sub(r'#\w*', '', tweet_no_hyperlinks)
    #print('No hashtags:', tweet_no_hashtags, '\n')
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet_no_punctuation = re.sub(r'[' + punctuation.replace('@', '') + ']+',
                                  ' ', tweet_no_hashtags)
    #print('No punctuation:', tweet_no_punctuation, '\n')
    # Remove https
    tweet_no_https = re.sub(r'https', '', tweet_no_punctuation)
    tweet_no_https = re.sub(r'http', '', tweet_no_https)
    #print('No https:', tweet_no_https, '\n')
    # Remove words with 2 or fewer letters
    tweet_no_small_words = re.sub(r'\b\w{1,2}\b', '', tweet_no_https)
    #print('No small words:', tweet_no_small_words, '\n')
    # Remove whitespace (including new line characters)
    tweet_no_whitespace = re.sub(r'\s\s+', ' ', tweet_no_small_words)
    tweet_no_whitespace = tweet_no_whitespace.lstrip(
        ' ')  # Remove single space remaining at the front of the tweet.
    #print('No whitespace:', tweet_no_whitespace, '\n')
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    # tweet_no_emojis = ''.join(c for c in tweet_no_whitespace if c <= '\uFFFF') # Apart from emojis (plane 1), this also removes historic scripts and mathematical alphanumerics (also plane 1), ideographs (plane 2) and more.
    # #print('No emojis:', tweet_no_whitespace, '\n')
    # Tokenize: Change to lowercase, reduce length and remove handles
    tknzr = TweetTokenizer(
        preserve_case=False, reduce_len=True, strip_handles=True
    )  # reduce_len changes, for example, waaaaaayyyy to waaayyy.
    tw_list = tknzr.tokenize(tweet_no_whitespace)
    #print('Tweet tokenize:', tw_list, '\n')
    # Remove stopwords
    list_no_stopwords = [i for i in tw_list if i not in english_stopwords]
    #print('No stop words:', list_no_stopwords, '\n')
    #

    # Final filtered tweet
    tweet_filtered = ' '.join(list_no_stopwords)

    tweet_filtered = tweet_filtered.replace(')', '')

    tweet_filtered = tweet_filtered.encode('ascii', 'ignore')

    # print 'Final tweet: '+tweet_filtered
    return (tweet_filtered)
Example #32
 def __init__(self):
     # prep chars to remove, keeping single quote, comma, hyphen and period
     self.charsToRemove = punctuation.replace("'",
                                              "").replace(",", "").replace(
                                                  "-", "").replace(".", "")
     #  and add some other chars to remove
     self.charsToRemove += "®“”"
     self.charsToRemoveMap = np.full((65536), False)
     for i in range(len(self.charsToRemove)):
         c = self.charsToRemove[i]
         self.charsToRemoveMap[ord(c)] = True
Example #33
def remove_punctuations(content):
    content = ''.join(characters for characters in content
                      if characters not in '!}{][)(\><=#"$%&,/*`\'')
    punc = punctuation.replace('-', "")
    content = ' '.join(
        token.strip(punc) for token in content.split() if token.strip(punc))
    content = ' '.join(
        token.replace("'", "") for token in content.split()
        if token.replace("'", ""))
    content = ' '.join(content.split())
    return content
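An illustrative call (input not from the original source): hyphens survive, apostrophes and the rest of the punctuation do not.

print(remove_punctuations("Hello, (world)! It's co-op time."))
# -> Hello world Its co-op time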
Example #34
 def __init__(self, coordinates):
     self.coordinates = coordinates
     self.map = []
     self.largest_x = 0
     self.largest_y = 0
     self.chars = ascii_lowercase + punctuation.replace('.', '').replace(
         ',', '')
     self.char_dict = {}
     self.distance_tracker = {}
     self.before_after_dict = {}
     self.nearest_char_per_coordinate = {}
Example #35
 def _format_name(self, name):
     punc = punctuation.replace('_', '')
     name = str(name).translate(None, punc)
     name = name.replace(' ', '_')
     if name[0].isdigit():
         name = '_' + name
     if len(name) < 30:
         return name
     self.num_lg_names += 1
     sas_name = name[:20]
     sas_name += '_lg_{0}'.format(self.num_lg_names)
     return sas_name
Example #36
 def __init__(self, string_in, filter_plus=None):
     from string import punctuation
     for item in list('._-@'):
         punctuation = punctuation.replace(item,'')
     for item in punctuation:
         string_in = string_in.replace(item, ' ')
     string_in = list(set(string_in.strip().split()))
     for items in string_in:
         for l in list('._-@'):
             items = items.strip(l)
         if self.isvalid(items, filter_plus):
             self.append(items)
Example #37
    def _format_name(self, name):
        punc = punctuation.replace('_', '')
        name = str(name).translate(None, punc)
        name = name.replace(' ', '_')
        words = name.split('_')
        for i, w in enumerate(words):
            if i == 0:
                name = w.lower()
                continue
            name += w.capitalize()
        if name[0].isdigit():
            name = '_' + name

        return name
Example #38
def extract_words(input_string):
    """
      Returns a list of lowercase words in a string.
      Punctuation (except @) and digits are stripped out.
    """

    for c in punctuation.replace('@', "") + digits:
        input_string = input_string.replace(c, "")

    print input_string
    splitted_string = input_string.lower().split()

    return [x for x in splitted_string if not (x.startswith("http") or x.startswith("@"))]
Example #39
	def parse_hook_from_text(text):

		hook = text.strip().replace(" ", "_").lower()
		hook = hook.replace("/:", ":").replace("/", ":")
		hook = hook.replace(":", "_")

		for token in punctuation.replace("_", ""):
			hook = hook.replace(token, "")

		if ( hook.startswith("_") ):
			hook = hook[1:]
		if ( hook.endswith("_") ):
			hook = hook[:-1]

		return hook
Example #40
def admin_keywords_submit(request):
    """
    Adds any new given keywords from the custom keywords field in the
    admin, and returns their IDs for use when saving a model with a
    keywords field.
    """
    keyword_ids, titles = [], []
    remove = punctuation.replace("-", "")  # Strip punctuation, allow dashes.
    for title in request.POST.get("text_keywords", "").split(","):
        title = "".join([c for c in title if c not in remove]).strip()
        if title:
            kw, created = Keyword.objects.get_or_create_iexact(title=title)
            keyword_id = str(kw.id)
            if keyword_id not in keyword_ids:
                keyword_ids.append(keyword_id)
                titles.append(title)
    return HttpResponse("%s|%s" % (",".join(keyword_ids), ", ".join(titles)))
Example #41
def convert_lat_lon_strings(string):
    #cleans a latitude or longitude text string and converts into decimal degrees
    from string import punctuation
    for symbol in punctuation.replace('-','').replace('.',''):
        string = string.replace(symbol,' ') #replace punctuation (other than - and .) with space
    coord_list = string.split()
    if coord_list[-1] == 'N' or coord_list[-1] == 'S' or coord_list[-1] == 'E' or coord_list[-1] == 'W':
        if coord_list[-1] == "S" or coord_list[-1] == "W":
            #if the coordinate is in the southern or western hemisphere, the lat/lon is negative.
            if coord_list[0].find('-') == -1:
                coord_list[0] = '-' + coord_list[0] #change the hemisphere indicator to +/-
        coord_list.pop()#remove the hemisphere indicator
    coord_list = [float(coord) for coord in coord_list]#convert each element to float
    coordinate = convert_DMS_to_degrees(coord_list)
    if abs(coordinate) > 180:
        return 0
    return coordinate
Example #42
 def render(self, context):
   from string import punctuation
   from django.template.defaultfilters import dictsortreversed
   try:
     cat_statuses = ""
     for status in Status.objects.all():
       cat_statuses += status.body + " " 
     wordlist = cat_statuses.split()
     punctuation = punctuation.replace('@', '')
     wordlist = [word.strip(punctuation).lower().replace("'s", '') for word in wordlist]
     wordfreq = [wordlist.count(p) for p in wordlist]
     dictionary = dict(zip(wordlist,wordfreq))
     word_dict_list = []
     for key in dictionary:
       if key.startswith('@'):
         word_dict_list.append({ 'name': key[1:], 'count': dictionary[key] })
     context[self.varname] = dictsortreversed(word_dict_list, 'count')[:int(self.num)]
   except:
     pass
   return ''
Example #43
def convert_lat_lon_strings(string):
    #cleans a latitude or longitude text string into decimal degrees
    from string import punctuation
    for symbol in punctuation.replace('-','').replace('.',''):
        string = string.replace(symbol," ") #replace punctuation (other than - and .) with space
    coord_list = string.split()
    if coord_list[-1] == 'N' or coord_list[-1] == 'S' or coord_list[-1] == 'E' or coord_list[-1] == 'W':
        if coord_list[-1] == "S" or coord_list[-1] == "W":
            #if the coordinate is in the southern or western hemisphere, the lat/lon is negative.
            if coord_list[0].find('-') == -1: coord_list[0] = '-' + coord_list[0]
        coord_list.pop()#remove the hemisphere indicator
    coordinate = 0
    denominator = 1
    for i in range(len(coord_list)):
        #DMS to decimal formula: deg = D + M/60 + S/3600
        coordinate+=float(coord_list[i])/denominator
        denominator*=60
    if abs(coordinate) > 180:
        return 0
    return coordinate
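A worked example (coordinate format assumed from the parsing logic): 45:30:30 N is 45 + 30/60 + 30/3600 = 45.508333... degrees.

print(convert_lat_lon_strings("45:30:30 N"))  # -> 45.508333...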
Example #44
import os
import nltk
import codecs
import collections
import hashlib
from nltk import sent_tokenize
from string import punctuation
import multiprocessing as mp
from multiprocessing import pool
from itertools import izip_longest

global del_words, punct, default_stopwords, cores, block, swear_words

del_words = '@'
punct = punctuation.replace('#', '').replace('\\', '')
default_stopwords = nltk.corpus.stopwords.words('german')
cores = 48
block = 10000
swear_words = []

def grouper(n, iterable, padvalue = None):
    return izip_longest(fillvalue = padvalue, *[
        iter(iterable)] * n)


def removeSpecialCharsWorker(line):
    if line:
        line = line.rstrip()
        if 'http' not in line:
            translated_phrase = line.encode('utf-8').translate(None, punct)
            words_list = [word for word in translated_phrase.split() if (not word.startswith(del_words)) and (not word.isdigit()) and (word not in default_stopwords)]
Example #45
from string import punctuation
from sys import argv

# Create a list of all punctuation minus the hyphen
punctuationList = list(punctuation.replace("-", ""))

def main():
	# Get the name of the script and the name of the file from the command line
	script, filename = argv

	text = open(filename)
	contents = text.read()
	text.close()

	# Remove newline characters and separator hyphens from the file contents, and split the
	# contents into a list of all the words in it
	wordList = contents.replace("\n", " ").replace(" - ", " ").replace("--", " ").split(" ")

	# Create a dictionary with each of the words and the number of times that word occurs
	wordDict = {}
	for word in wordList:
		# Remove any punctuation from the file contents that doesn't act as a separator for
		# two different words
		for char in punctuationList:
			word = word.replace(char, "")

		# Don't count words that are blank strings, such as empty lines in a file
		if word != "":
			word = word.lower()

			if word in wordDict:
				wordDict[word] += 1
			else:
				wordDict[word] = 1
Example #46
#!/usr/bin/python

import nltk
import json
import re
from string import punctuation
from collections import Counter

#TODO: Pick best name

#Globals
award_specific_stopwords = ['rt', 'golden', 'globes', 'goldenglobes', 'best', 'director', 'actor', 'actress', 'movie', 'motion', 'picture', 'film', 'tv', 'series', 'performance', 'television', 'snub', 'wins', 'win', 'congrats', 'congratulations', 'season', 'animated', 'animation', 'feature', 'comedy', 'drama', 'musical', 'screenplay', 'award', 'awards']
stopwords = nltk.corpus.stopwords.words() + award_specific_stopwords
names = nltk.corpus.names.words()
# reassign: str.replace returns a new string, it does not modify in place
punctuation = punctuation.replace('#', '').replace('@', '')

def query_name(tweets, pattern, g=None, n=None):
    """
Querys the tweets of a specific year to get all sufficient ngrams.

tweets is a dictionary object with the loaded json data from the tweets.

pattern is a regex pattern you want to query. Pass in a compiled regex pattern
for improved performance.

g specifies which type of n-gram you want, i.e.  g=1 for unigrams, 2 for
bigrams, and 3 for trigrams. If g is not provided, the query returns all types
of grams.

n is the number of results you want to see. If not provided, it shows all.
Example #47
def clean_filename(filename):
	remove_punct_map = dict.fromkeys(map(ord, punctuation.replace('_', '') + '’'))
	return str(filename).translate(remove_punct_map).strip().replace(' ', '_')[:100]
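An illustrative call (file name chosen here, not from the original source):

print(clean_filename("My Song: Live!"))  # -> My_Song_Live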
Example #48
import db_info
import MySQLdb
import collections
from string import punctuation

db = MySQLdb.connect(
    host=db_info.host,  # your host, usually localhost
    user=db_info.user,  # your username
    passwd=db_info.passwd,  # your password
    db=db_info.db,
)  # name of the data base

cur = db.cursor()  # cursor object for mysql query
twitter_punct = punctuation.replace("#", "")

# calculate the number of tweets for each hour and write to a .csv
def tweets_vs_time(cur):
    f = open("data/data.csv", "w")  # create new csv
    f.write("time,count\n")

    for i in range(15, 23):
        for j in range(0, 24):
            dateStart = "2013-04-%02d %02d" % (i, j)
            print "query: select count(*) from tweets where time like '" + dateStart + "%'"
            cur.execute("select count(*) from tweets where time like '" + dateStart + "%'")
            for row in cur.fetchall():
                dataset.append({"time": dateStart, "count": row})
                result = (str(dateStart) + "," + "%s\n") % (row[0],)
                f.write(result)

    print "done"
Example #49
"""
config.py

Useful definitions and regular expressions.
"""

from string import punctuation

CORPUS_FOLDER = "data/corpus"
THESAURI_FOLDER = "data/thesaurus"
MAPPING_FOLDER = "data/mapping"
CORP_TAG = ".txt"
THES_TAG = ".thes"
MAP_TAG = ".map"

WORD = "\w+[\'-]?\w*"
PRICE = "\$[\d.]+"
PUNCTUATION_EXCEPT_HYPHEN = '[' + punctuation.replace('-', '') + ']'
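Illustrative usage of the pattern (not part of the original module): the hyphen in a compound word survives while other punctuation is blanked.

import re
print(re.sub(PUNCTUATION_EXCEPT_HYPHEN, ' ', "well-known, right?"))
# -> 'well-known  right ' (note the doubled space where ', ' was)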
Example #50
from nltk.stem.porter import *
from nltk.corpus import stopwords
from string import punctuation


stopwords_list = stopwords.words('english')
stemmer = PorterStemmer()
twitter_punctuation = punctuation.replace('#', '') # we want to keep hashtags

def process_tweet(tweet):
    tweet = tweet.translate(tweet.maketrans('', '', twitter_punctuation)) # remove all punctuation except pound symbol
    word_list = tweet.split(' ')
    clean_words = []
    for word in word_list:
        if word not in stopwords_list:
            clean_words.append(stemmer.stem(word))
    return clean_words

def process_tweets(tweets):
    dataset = []
    for tweet in tweets:
        dataset.append(process_tweet(tweet))
    return dataset
Example #51
# Author: Christopher Hench
# ==============================================================================

'''Working product to automatically generate SRT file from script. Intervals are given as user inputs while watching video. Line breaks are determined by a tree parsing algorithm.'''

import time
from nltk.parse import stanford
from nltk import sent_tokenize, Tree
from string import punctuation

punctuation = punctuation.replace("'", "")

line_limit = 34

with open("script_eng.txt") as f:
    raw_script = f.read()

# prepare stanford parser
parser = stanford.StanfordParser(
    path_to_jar="/Users/chench/Documents/stanford-parser-full-2015-12-09/stanford-parser.jar",
    path_to_models_jar="/Users/chench/Documents/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar")

sentences = parser.raw_parse_sents(sent_tokenize(raw_script))


def get_all_nodes(parent):
    '''
    extracts all chunk and word relations
    '''

    for node in parent:
        if isinstance(node, Tree):
            yield node
            yield from get_all_nodes(node)
Example #52
ap_mac_pattern      = re.compile('.*Address: (.*?)\n', re.I | re.M  | re.S)
channel_pattern     = re.compile('.*Channel:? ?(\d\d?)', re.I | re.M  | re.S)
strength_pattern    = re.compile('.*Quality:?=? ?(\d+)\s*/?\s*(\d*)', re.I | re.M  | re.S)
altstrength_pattern = re.compile('.*Signal level:?=? ?(\d\d*)', re.I | re.M | re.S)
signaldbm_pattern   = re.compile('.*Signal level:?=? ?(-\d\d*)', re.I | re.M | re.S)
mode_pattern        = re.compile('.*Mode:(.*?)\n', re.I | re.M  | re.S)
freq_pattern        = re.compile('.*Frequency:(.*?)\n', re.I | re.M  | re.S)
ip_pattern          = re.compile(r'inet [Aa]d?dr[^.]*:([^.]*\.[^.]*\.[^.]*\.[0-9]*)', re.S)
bssid_pattern       = re.compile('.*Access Point: (([0-9A-Z]{2}:){5}[0-9A-Z]{2})', re.I | re.M | re.S)
wep_pattern         = re.compile('.*Encryption key:(.*?)\n', re.I | re.M  | re.S)
altwpa_pattern      = re.compile('(wpa_ie)', re.I | re.M | re.S)
wpa1_pattern        = re.compile('(WPA Version 1)', re.I | re.M  | re.S)
wpa2_pattern        = re.compile('(WPA2)', re.I | re.M  | re.S)
auth_pattern        = re.compile('.*wpa_state=(.*?)\n', re.I | re.M  | re.S)
RALINK_DRIVER = 'ralink legacy'
blacklist_strict = punctuation.replace("-", "") + " "
blacklist_norm = ";`$!*|><&\\"
blank_trans = maketrans("", "")
def _sanitize_string(string):
    if string:
        return translate(str(string), blank_trans, blacklist_norm)
    else:
        return string
def _sanitize_string_strict(string):
    if string:
        return translate(str(string), blank_trans, blacklist_strict)
    else:
        return string
def SetDNS(dns1=None, dns2=None, dns3=None):
    """ Set the DNS of the system to the specified DNS servers.
    Opens up resolv.conf and writes in the nameservers.
Example #53
'''
EncryptLite( sTest ) = 'Y=REjqRp=KurB=cVRbgIelZhuS{bOMhDG=t==nW==ABqX'

XOREncrypt( sTest )  = '3c0d094c1e1d0c0f074f0a17031b01480303144f0210' +
                       '011c0a0c45031a0a1a4518040a48090d16164801030b41'

getRot13( sTest )    = 'Gur dhvpx oebja sbk whzcrq bire gur ynml qbt.'

'''

from string             import punctuation, digits

from Collect.Cards      import ShuffleAndCut
from String.Replace     import getTextReversed
from String.Transform   import TranslatorFactory

sSafe   =  punctuation.replace( '\\', ' ' ) + digits

changePunct = TranslatorFactory( sSafe, getTextReversed( sSafe ) )



def DescendChars( sOrig,
        iOffset         = False,
        bStepIncrement  = False,
        bBackStep       = False,
        bBackwards      = False ):
    #
    from Iter.AllVers import lMap, iRange
    #
    def getIncrement( i ): return 0
    #