Example #1
File: app.py Project: TaaviE/OS
def murdesonastik_task(self, word):
    """Task that fetches MS"""
    try:
        html = sessions["murdesõnastik"].get(
            "http://www.eki.ee/dict/ems/index.cgi?F=K&Q=" + word).content
    except exceptions.ConnectionError:
        self.update_state(state=states.FAILURE, meta="Connection failure")
        raise Ignore()
    soup = BeautifulSoup(html, "html.parser")

    amount = soup.find_all("p", {"class": "inf"})[0].get_text()
    if "Päring ei andnud tulemusi!" in amount:
        amount = 0
        return {"progress": 100, "count": amount, "result": []}
    else:
        amount = amount.split(" ")[1]
    results = soup.find_all("div", {"class": "tervikart"})
    clean_results = []

    for result in results:
        if deaccent(str(word)) in deaccent(str(result)):
            clean_results.append(
                highlight_word_in_html(remove_tags_and_beautify(result), word))
    clean_results = clean_results[:6]

    if len(clean_results) == 0:
        amount = 0
        return {"progress": 100, "count": amount, "result": []}

    return {"progress": 100, "count": amount, "result": clean_results}
Example #2
 def process_sentence(self, s, exclude_punct=False):
     st = []
     split_indices = []
     for i, tok in enumerate(s):
         if exclude_punct and tok in self.PUNCT:
             continue
         elif self.is_number(tok):
             try:
                 # keep a number that stands alone inside (...) or 〈...〉 brackets
                 if (s[i-1] == "(" and s[i+1] == ")") or (s[i-1] == "〈" and s[i+1] == "〉"):
                     pass
                 else:
                     tok = "<nUm>"
             except IndexError:
                 tok = "<nUm>"  # replace all numbers with a string <nUm>
         else:
             elem_with_valence = self.ELEMENT_VALENCE_IN_PAR.match(tok)
             if elem_with_valence is not None:
                 # change element name to symbol
                 elem_mention = elem_with_valence.group(1)
                 try:
                     formula = self.elem_name_dict[elem_mention.lower()]
                     matmention = elem_mention.lower()
                 except KeyError:
                     formula = elem_mention  # this was already the symbol
                     matmention = elem_mention
                 self.mat_list.append((matmention, formula))  # exclude the valence state from name
                 # split this for word2vec
                 st.append(matmention)
                 split_indices.append(i)
                 tok = elem_with_valence.group(2)
             elif tok in self.ELEMENTS_AND_NAMES:  # add element names to formulae
                 try:
                     formula = self.elem_name_dict[tok.lower()]
                     matmention = tok.lower()
                     tok = matmention
                 except KeyError:
                     formula = tok  # this was already the symbol
                     matmention = tok
                 self.mat_list.append((matmention, formula))
             elif self.is_simple_formula(tok):
                 formula = self.get_norm_formula(tok)
                 self.mat_list.append((tok, formula))
                 tok = formula
             elif (len(tok) == 1 or (len(tok) > 1 and tok[0].isupper() and tok[1:].islower())) \
                     and tok not in self.ELEMENTS and tok not in self.UNITS \
                     and self.ELEMENT_DIRECTION_IN_PAR.match(tok) is None:
                 # to lowercase if only first letter is uppercase (chemical elements already covered above)
                 tok = deaccent(tok.lower())
             else:
                 # splitting units from numbers (e.g. you can get 2mol., 3V, etc..)
                 nr_unit = self.NR_AND_UNIT.match(tok)
                 if nr_unit is None or nr_unit.group(2) not in self.UNITS:
                     tok = deaccent(tok)  # matches the pattern but not in the list of units
                 else:
                     # splitting the unit from number
                     st.append("<nUm>")
                     split_indices.append(i)
                     tok = nr_unit.group(2)  # the unit
         st.append(tok)
     return st, split_indices
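
A standalone, simplified sketch of the number-masking idea used above (only the "(...)" bracket case, not the class method itself): free-standing numeric tokens become the placeholder "<nUm>".

def mask_numbers(tokens):
    """Replace free-standing numbers with the placeholder token <nUm>."""
    def is_number(tok):
        try:
            float(tok.replace(",", ""))
            return True
        except ValueError:
            return False
    out = []
    for i, tok in enumerate(tokens):
        bracketed = 0 < i < len(tokens) - 1 and tokens[i - 1] == "(" and tokens[i + 1] == ")"
        out.append("<nUm>" if is_number(tok) and not bracketed else tok)
    return out

print(mask_numbers(["annealed", "at", "750", "C", "(", "2", ")"]))
# -> ['annealed', 'at', '<nUm>', 'C', '(', '2', ')']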
Example #3
def get_hyperonyms(main_word):
    HYPONYM = eq(utils.deaccent(main_word))
    RULE = or_(rule(HYPONYM, ATAKJE, START, MID, END), rule(HYPONYM, MID, END),
               rule(START_S, END, KAK, HYPONYM), rule(END, INCLUDING, HYPONYM))
    parser = Parser(RULE)
    text = utils.deaccent(wikipedia.summary(main_word))
    print(text)
    text = re.sub(r'\(.+?\)', '', text)
    text = text.lower().replace('* сергии радонежскии* ', '')
    for idx, match in enumerate(parser.findall(text.lower())):
        k = [_.value for _ in match.tokens]
        print(k)
Example #4
 def pre_process(s):
     s = str(s)
     s = strip_tags(s)
     s = deaccent(s)
     s = strip_multiple_whitespaces(s)
     s = s.lower()
     return s
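
A possible call, assuming the gensim helpers the snippet relies on (deaccent from gensim.utils; strip_tags and strip_multiple_whitespaces from gensim.parsing.preprocessing) are imported:

print(pre_process("<b>Šéf   chomutovských</b> komunistů"))
# -> 'sef chomutovskych komunistu'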
Example #5
def tokenize(text,
             lowercase=False,
             deacc=False,
             encoding='utf8',
             errors="strict",
             to_lower=False,
             lower=False):
    """
    Iteratively yield tokens as unicode strings, removing accent marks
    and optionally lowercasing the unicode string by assigning True
    to one of the parameters, lowercase, to_lower, or lower.

    Input text may be either unicode or utf8-encoded byte string.

    The tokens on output are maximal contiguous sequences of alphabetic
    characters (no digits!).

    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']

    """
    lowercase = lowercase or to_lower or lower
    text = to_unicode(text, encoding, errors=errors)
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    return simple_tokenize(text)
Example #6
def preprocess_text(tweet):
    """
    Function to process an aggregated user profile. This does the following:
    1. Decode html entities. eg. "AT&amp;T" will become "AT&T"
    2. Deaccent
    3. Remove links.
    4. Remove any user mentions (@name).
    5. Lemmatize and remove stopwords.
    
    Parameters:
    ----------
    tweet : String. If train_texts is a list of tweets, ' '.join them and pass.

    Returns:
    -------
    tweet : preprocessed (tokenized) tweet, as a list of tokens.
    """
    tweet = decode_htmlentities(tweet)
    tweet = deaccent(tweet)
    tweet = tweet.encode('ascii',
                         'ignore')  # To prevent UnicodeDecodeErrors later on
    tweet = re.sub(r'http\S+', '', str(tweet))  # Step 3
    tweet = re.sub(r'@\w+', '', str(tweet))  # Step 4
    tweet = tweet.split()
    tweet = lemmatize(' '.join(tweet),
                      re.compile('(NN)'),
                      stopwords=stopwords.words('english'),
                      min_length=3,
                      max_length=15)
    tweet = [word.split('/')[0] for word in tweet]
    return tweet
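
A small self-contained sketch of steps 3 and 4 above (link and @mention removal); the sample tweet is made up:

import re

t = "AT&T news via @reporter http://t.co/abc"
t = re.sub(r'http\S+', '', t)   # step 3: remove links
t = re.sub(r'@\w+', '', t)      # step 4: remove user mentions
print(t.split())  # -> ['AT&T', 'news', 'via']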
Example #7
    def token_gen(self, lines_str, lower=False, remove_accents=True):

        lines = deaccent(lines_str.strip()) if remove_accents else lines_str
        lines = lines.lower() if lower else lines

        match = self.pattern.search(lines)

        if match is None:
            return iter(self.empty)  # return empty iterator if no tokens

        last_token = None
        ends_at = 0

        while match is not None:
            starts_at = match.start()
            tailspace = ends_at != starts_at  # leading space for the current word
            ends_at = match.end()
            if last_token:
                # yield last_token, self.tailing_space if leading_space else self.no_tailing_space
                # yield last_token, leading_space
                # yield (last_token, leading_space) if mark_tailing_spaces else last_token
                yield Token(text=last_token, tailspace=tailspace)
            last_token = lines[starts_at:ends_at]
            match = self.pattern.search(lines, ends_at)

        # yield last_token, self.no_tailing_space
        # yield last_token, False
        # yield (last_token, False) if mark_tailing_spaces else last_token
        yield Token(text=last_token, tailspace=False)
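
The generator above depends on the class's compiled self.pattern and its Token type; here is a self-contained sketch of the same idea (a regex token stream that records whether whitespace follows each token), with an assumed pattern and a namedtuple standing in for Token:

import re
from collections import namedtuple

from gensim.utils import deaccent

Token = namedtuple("Token", ["text", "tailspace"])
PATTERN = re.compile(r"\w+|[^\w\s]", re.UNICODE)  # assumption: words or single punctuation marks

def toy_token_gen(line, lower=False, remove_accents=True):
    line = deaccent(line.strip()) if remove_accents else line
    line = line.lower() if lower else line
    matches = list(PATTERN.finditer(line))
    for cur, nxt in zip(matches, matches[1:] + [None]):
        # tailspace: is there a gap between this token and the next one?
        yield Token(text=cur.group(), tailspace=nxt is not None and cur.end() != nxt.start())

print(list(toy_token_gen("Šéf dorazil!")))
# -> [Token(text='Sef', tailspace=True), Token(text='dorazil', tailspace=False),
#     Token(text='!', tailspace=False)]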
Example #8
def extract_names(txt, nlp, n_sentences=2):
    """
    Use the spacy entity engine to extract person names from a text
    args:
        - txt: raw text
        - nlp: a spacy engine
    return:
        - list of names as strings
    """
    # to unicode & get rid of accent
    txt = deaccent(any2unicode(txt))
    # split according to reply/forward markers (get rid of the header, "entête")
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter ?
    bow = []
    for sent in sentences[:n_sentences]:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False)
        for tok in doc:
            lemma = drop_digits(replace_punct(tok.lemma_))
            if (lemma and (tok.ent_type_ != 'PERSON') and not tok.is_punct
                    and not tok.is_stop and lemma not in extendedstopwords
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(lemma) > 1 and not any(
                        (x in tok.orth_ for x in not_in_list))):
                bow.append(lemma)
    return bow
Example #9
def tokenize(text, deacc=False, encoding='utf8', lowercase=False, to_lower=False):
    """ Iteratively yield re-based tokens as unicode strings, removing accent marks and optionally lowercasing
    :param text: Input text
    :type text: str
    :param deacc: Remove accentuation
    :type deacc: bool
    :param encoding: Encoding of text
    :type encoding: str
    :param lowercase: To lowercase
    :type lowercase: bool
    :param to_lower: To lowercase
    :type to_lower: bool
    :return: Contiguous sequences of alphabetic characters (no digits!)
    :rtype: str
    # Example:
    # list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
    # [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
    """
    lowercase = lowercase or to_lower
    from gensim.utils import to_unicode
    text = to_unicode(text, encoding, errors='ignore')
    if lowercase:
        text = text.lower()
    if deacc:
        from gensim.utils import deaccent
        # Example
        # --------
        # >>> from gensim.utils import deaccent
        # >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
        # u'Sef chomutovskych komunistu dostal postou bily prasek'
        text = deaccent(text)
    return re_tokenize(text)
Example #10
 def review_to_wordlist(review_text, remove_stopwords=False):
     # Function to convert a document to a sequence of words,
     # optionally removing stop words.  Returns a list of words.
     #
     # Removes any accents  
     review_text = utils.deaccent(review_text)  
     # Replace hypens with spaces  
     review_text = re.sub(r"-", " ", review_text)
     # Remove non-letters
     review_text = re.sub("[^a-zA-Z!?0-9]"," ", review_text)
     review_text = re.sub("[!]", " !", review_text)
     review_text = re.sub("[?]", " ?", review_text)
     # Removes email addresses  
     review_text = re.sub(r"[\w]+@[\.\w]+", "", review_text)  
     # Removes web addresses  
     review_text = re.sub(r"/[a-zA-Z]*[:\/\/]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", review_text) 
     # Convert words to lower case and split them
     words = review_text.lower().split()
     # Optionally remove stop words (false by default)
     if remove_stopwords:
         stops = set(stopwords.words("english"))
         words = [w for w in words if not w in stops]
     #Implement porter stemmer
     stemmer = PorterStemmer()
     words = [stemmer.stem(w) for w in words]
     # Return a list of words
     return(words)
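
A possible call, assuming the imports the snippet implies (from gensim import utils, from nltk.corpus import stopwords, from nltk.stem.porter import PorterStemmer):

print(review_to_wordlist("Cafés are great!", remove_stopwords=True))
# -> ['cafe', 'great', '!']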
Example #11
def custom_tokenize(text,
                    lowercase=False,
                    deacc=False,
                    encoding='utf8',
                    errors="strict",
                    to_lower=False,
                    lower=False,
                    cde=True):
    text = to_unicode(text, encoding, errors=errors)
    lowercase = lowercase or to_lower or lower
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    if cde:
        text = " ".join(text.split())
        cde_p = Paragraph(text)
        tokens = cde_p.tokens
        toks = []
        for sentence in tokens:
            toks.append([])
            for tok in sentence:
                if tok.text not in string.punctuation:
                    yield tok.text
    else:
        for match in PAT_ALPHABETIC.finditer(text):
            yield match.group()
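
A possible call exercising the cde=False branch, assuming to_unicode, deaccent and PAT_ALPHABETIC are imported from gensim.utils:

print(list(custom_tokenize("Fünf Wörter, 3 Zahlen", deacc=True, lower=True, cde=False)))
# -> ['funf', 'worter', 'zahlen']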
Example #12
def bow_mail_body(txt, nlp):
    """
    args:
        - txt: raw text
        - nlp: a spacy engine
    """
    # to unicode & get rid of accent
    txt = deaccent(any2unicode(txt))
    # split according to reply/forward markers (get rid of the header, "entête")
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter ?
    bow = []
    for sent in sentences:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False, entity=False)
        for tok in doc:
            if (tok.lemma_ and not tok.is_punct and not tok.is_stop
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(tok) > 1 and not any(
                        (x in tok.orth_ for x in not_in_list))):
                if tok.orth_.startswith("-") or tok.orth_.endswith("-"):
                    bow.append(tok.lemma_.replace("-", ""))
                else:
                    bow.append(tok.lemma_)
    return bow
Example #13
def obter_link_name(nome):
    '''
    Converts the given name to lowercase, removes accents and joins the words with '_'
    Parameters:
        nome (String) --> Name to be transformed
    Returns: the name in lowercase, without accents and with '_' between the words (String)
    '''
    return g_utils.deaccent(RE_ESPACO.sub('_', nome.lower()))
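
A possible call, assuming from gensim import utils as g_utils and that RE_ESPACO is a whitespace regex such as re.compile(r'\s+') (both defined elsewhere in the project):

print(obter_link_name("São Paulo"))  # -> 'sao_paulo'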
Example #14
    def token_gen(self, lines_str, lower=False, remove_accents=True):
        lines = deaccent(lines_str.strip()) if remove_accents else lines_str
        lines = lines.lower() if lower else lines

        tokens = self.bpe_tokenizer.EncodeAsPieces(re.sub(r"\d", "0", lines))

        for token in tokens:
            yield Token(text=token, tailspace=False)
Example #15
def tokenize_(string,_additional_stop_words=[]):
    text = "".join([w for w in string if w not in non_words])
    text = deaccent(text).split(' ')
    text = map(lambda x: x.lower().strip(), text)
    text = filter(lambda x: x not in stopwords and x not in _additional_stop_words, text)
    text = [*filter(lambda x: len(x) > 1, text)]
    return text
Example #16
    def tokenize(text):
        text = to_unicode(text, encoding='utf8', errors='ignore')
        text = text.lower()

        # normalize unicode (i.e., remove accentuation)
        text = deaccent(text)

        for token, pos in pos_tag(word_tokenize(text)):
            # only nouns are acceptable: https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
            if pos in ('NN', 'NNS', 'NNP', 'NNPS'):
                yield token
Example #17
def clean_medical_documents(docs):
    r"""
    Clean medical reports.

    This function makes a simple pre-processing of medical texts. The
    steps are:
    1. String is deaccented.
    2. Sequences of at least 2 letters are extracted (numbers and other
       characters are ignored).
    3. Tokens with all uppercase letters are kept. Other tokens are
       converted to lowercase.
    4. The tokens 'pt' and 'pts' are replaced by 'patient'.

    Parameters
    ----------
    docs : list
        `list` of documents. Each document is a `str`.

    Returns
    -------
    docs : list
        `list` of documents. Each document is a `list` of lines. Each
        line is a `list` of tokens.

    Example
    -------
    >>> from support import clean_medical_documents
    >>> docs = [
    ...     'The pt. appears awake.',
    ...     'Cardiologist:  Dr. C. Núttèr',
    ... ]
    >>> clean_docs = clean_medical_documents(docs)
    >>> print(clean_docs)
    ['the patient appears awake', 'cardiologist dr nutter']
    >>>

    """
    # to not modify the input parameter
    docs = list(docs)
    regex = re.compile(pattern=r'[a-zA-Z]{2,}')
    for i, doc in enumerate(docs):
        doc = deaccent(doc)
        # keep uppercase words
        tokens = [
            token.lower() if not token.isupper() else token
            for token in regex.findall(doc)
        ]
        # replace tokens
        for j, token in enumerate(tokens):
            if token in ('pt', 'pts'):
                tokens[j] = 'patient'
        # clean document
        docs[i] = ' '.join(tokens)
    return docs
Example #18
def extractSalaryFromFile(number, file):
    groups = []
    with open(file, 'r') as f:
        for line in f:
            groups.append(remove_stopwords(deaccent(line)))
    texts = [group.split() for group in groups]
    words = []
    for g in texts:
        for w in g:
            words.append(w)
    s = dumbFindSalary(words)
    print("From text number " + str(number) + ", " + s)
Example #19
def preprocess(s, stem=True):
    '''
    given a document or query string, returns a list of preprocessed words.
    we can decide whether to stem each word or not.
    '''
    if not stem:
        preprocess_filters = DEFAULT_FILTERS.copy()
        preprocess_filters.pop()  # remove stemming from list of filters
        wordList = preprocess_string(s, filters=preprocess_filters)
    else:
        wordList = preprocess_string(s)
    for i in range(len(wordList)):
        wordList[i] = deaccent(wordList[i])
    return wordList
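
A possible call, assuming from gensim.parsing.preprocessing import preprocess_string, DEFAULT_FILTERS and from gensim.utils import deaccent:

print(preprocess("Café owners running fast!", stem=False))
# -> ['cafe', 'owners', 'running', 'fast']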
Example #20
def preprocess_txt(raw_txt):
    """
    Preprocessing of raw txt before parsing with Spacy
    - deaccent, to unicode
    - split forward, redirect
    - replace the > of email reply
    - split lowerUpper
    - split letterNumber
    """
    txt = deaccent(any2unicode(raw_txt))
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    txt = " ".join(lower_upper_pat.split(txt))
    txt = " ".join(number_letter_pat.split(txt))
    return txt
Example #21
def remove_non_plain(document):
    """
    Replaces urls, @usernames, #tags, emojis and numbers
    with a ' ' (space). Also removes accents and punctuation
    to finally remove redundant whitespace and lowercase all
    characters
    :param document: string
    :return: processed unicode string
    """
    document = to_unicode(document)
    document = non_plain_re.sub(' ', document)
    document = proc.strip_non_alphanum(document)
    document = proc.strip_numeric(document)
    document = proc.strip_multiple_whitespaces(document)
    document = deaccent(document)
    return document.lower()
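
A possible call; non_plain_re and the proc alias (gensim.parsing.preprocessing) are defined elsewhere in the project, so the expected output is only indicative (assuming non_plain_re matches URLs, @usernames, #tags, emojis and numbers):

remove_non_plain("Visit https://example.com @user #nlp Café!")
# -> roughly 'visit cafe'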
Example #22
def to_ascii(string):
    """
    Replace all non-ascii chars with ascii equivalents, remove
    all non-printing characters, and replace all tabs with 4 spaces.
    
    Returns:
        A transformed string
    """
    tabs = re.compile('\t')
    newstring, _ = tabs.subn(' ' * 4, string)
    car_return_etc = re.compile('\r|\x0b|\x0c')
    newstring, _ = car_return_etc.subn('\n', newstring)
    newstring = deaccent(newstring)
    #FIXME removes newlines, not intended behavior
    nonprintable = re.compile('[^ -~\n]')
    newstring, _ = nonprintable.subn('', newstring)
    return newstring.encode('ascii')
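
A possible call (note that the trailing .encode('ascii') means the function returns bytes):

print(to_ascii("Café\tmenu\r"))  # -> b'Cafe    menu\n'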
Example #23
    def get_tokens(text):
        text = to_unicode(text, encoding='utf8', errors='ignore')
        text = text.lower()

        # normalize unicode (i.e., remove accentuation)
        text = deaccent(text)

        bi = []
        for match in PAT_ALPHABETIC.finditer(text):
            uni = match.group()
            yield uni

            bi.append(uni)
            if len(bi) == 1:
                continue
            yield ' '.join(bi)
            del bi[0]
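
A possible call, assuming to_unicode, deaccent and PAT_ALPHABETIC are imported from gensim.utils; the generator yields each unigram and, from the second word on, the bigram it completes:

print(list(get_tokens("Šéf dostal prášek")))
# -> ['sef', 'dostal', 'sef dostal', 'prasek', 'dostal prasek']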
Example #24
 def regularize(text):
     '''return the regularized text'''
     r1 = r'^RT @.*?: |@+[^\s]*|^RT\s'  # exclude RT @... prefixes and @mentions
     # http(s)
     r2 = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
     r3 = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'  # e-mail
     r4 = r'\s+'  # multiple whitespace chars
     r5 = 'http[s]?:.*? '
     # r6 = "[^A-Za-z0-9_]"  # not alphabet number and _
     r7 = r'\*.+?\*'
     sub_rule = r1 + '|' + r2 + '|' + r3 + '|' + r5 + '|' + r7
     text = html.unescape(text)
     text = deaccent(text)
     text = re.sub(sub_rule, " ", text)
     text = emoji.demojize(text, delimiters=('emo_', ' '))
     # text = re.sub(r6, ' ', text)
     text = re.sub(r4, ' ', text)
     return text.lower()
Example #25
def get_soups(links, name):
       '''
       This function iterates over all search pages, converts each into a BeautifulSoup object and stores them in a JSON file
       outside of this script. The keys of the dictionary distinguish between the different objects/HTML pages.
       '''
       count = 0
       dict_ = {}
       soups = []
       for link in tqdm(links):
           sleep(random.uniform(0.5, 2))
           request = requests.get(link)
           request.encoding='UTF-8'
           soups.append(BeautifulSoup(request.text,'lxml'))
       for soup in soups:
           dict_[count] = str(deaccent(soup).encode("utf-8"))
           count += 1
       with open(name, 'w') as write_file:
           json.dump(dict_, write_file, indent = 4)
Example #26
def cleaning(string):
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()])
    string = re.sub(r'\(dot\)', '.', string)
    string = deaccent(string)

    # remove href
    string = (re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string) if
              (len(re.findall(r'\<a (.*?)\>', string)) > 0) and
              ('href' in re.findall(r'\<a (.*?)\>', string)[0]) else string)

    string = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',
                    string)
    string = re.sub(r'http\S+|www.\S+|bit.\S+', '', string)
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']

    return ' '.join(string)
Example #27
 def pre_process_name(self, name):
     """
     Takes a string as input, removes accents and 
     converts to lowercase
     """
     if type(name) == str and len(name) > 0:
         name = deaccent(name)
         name = name.lower()
         first_name = name[0]
         if '+' in name:
             last_name = name[name.rfind('+') + 1:]
         else:
             last_name = name[1:]
         first_name = first_name.replace('.', '').replace('-', '').replace(
             '\'', '').replace(' ', '')
         first_init = first_name[0] if len(first_name) > 0 else ''
         last_name = last_name.replace('.', '').replace('-', '').replace(
             '\'', '').replace(' ', '')
         name = (first_init, last_name)
         return name
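
A possible call, assuming matcher is an instance of the class this method belongs to:

print(matcher.pre_process_name("J. Gárcia-Smith"))  # -> ('j', 'garciasmith')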
Example #28
def cleanEmailText(text):
    # Removes any accents
    text = utils.deaccent(text)
    # Replace hypens with spaces
    text = re.sub(r"-", " ", text)
    # Removes dates
    text = re.sub(r"\d+/\d+/\d+", "", text)
    # Removes times
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    # Removes email addresses
    text = re.sub(r"[\w]+@[\.\w]+", "", text)
    # Removes web addresses
    text = re.sub(r"/[a-zA-Z]*[:\/\/]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
    # Remove any bad characters
    clndoc = ''
    for eachLetter in text:
        if eachLetter.isalpha() or eachLetter == ' ':
            clndoc += eachLetter
    text = ' '.join(clndoc.split())
    return text
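
A possible call, assuming import re and from gensim import utils as in the snippet:

print(cleanEmailText("Meet François on 12/05/2021 at 14:30, e-mail joe@x.com"))
# -> 'Meet Francois on at e mail'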
Example #29
def preprocessing_tweet_text(tweet_text) -> List[str]:
    """
    A neural language model like ELMo does not need much normalisation; a pre-trained ELMo model only needs pre-tokenised text.

    :param tweet_text:
    :return:
    """
    if not isinstance(tweet_text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    norm_tweet = tweet_text.lower()
    # remove retweets
    # norm_tweet = re.sub('rt @?[a-zA-Z0-9_]+:?', '', norm_tweet)
    norm_tweet = re.sub(r'^(rt)( @\w*)?[: ]', '', norm_tweet)
    # remove URL
    norm_tweet = re.sub(r"http\S+", "", norm_tweet)
    # remove pic URL
    norm_tweet = re.sub(r"pic.twitter.com\S+", "", norm_tweet)
    # remove user mentions
    norm_tweet = re.sub(r"(?:\@|https?\://)\S+", "", norm_tweet)
    # remove punctuations:
    # norm_tweet = re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl='', string=norm_tweet).strip()
    # deaccent
    norm_tweet = deaccent(norm_tweet)

    tknzr = TweetTokenizer()
    tokenised_norm_tweet = tknzr.tokenize(norm_tweet)

    # https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/

    # Set the minimum number of tokens to be considered
    if len(tokenised_norm_tweet) < 4:
        return []

    num_unique_terms = len(set(tokenised_norm_tweet))

    # Set the minimum unique number of tokens to be considered (optional)
    if num_unique_terms < 2:
        return []

    return tokenised_norm_tweet
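
A possible call, assuming from typing import List, from nltk.tokenize import TweetTokenizer and from gensim.utils import deaccent:

print(preprocessing_tweet_text("RT @user: Café prices rising fast this année http://t.co/x"))
# -> ['cafe', 'prices', 'rising', 'fast', 'this', 'annee']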
Example #30
def to_normalize(data):
    if verbose: print('#' * 10, 'Step - Normalize chars and dots:')

    normalized_chars = {}

    chars = '‒–―‐—━—-▬'
    for char in chars:
        normalized_chars[ord(char)] = '-'

    chars = '«»“”¨"'
    for char in chars:
        normalized_chars[ord(char)] = '"'

    chars = "’'ʻˈ´`′‘’\x92"
    for char in chars:
        normalized_chars[ord(char)] = "'"

    chars = '̲_'
    for char in chars:
        normalized_chars[ord(char)] = '_'

    chars = '\xad\x7f'
    for char in chars:
        normalized_chars[ord(char)] = ''

    chars = '\n\r\t\u200b\x96'
    for char in chars:
        normalized_chars[ord(char)] = ' '

    # Normalize chars and dots - SEE HELPER FOR DETAILS
    # Global
    data = list(
        map(
            lambda x: ' '.join(
                [_make_cleaning(i, normalized_chars) for i in x.split()]),
            data))
    data = list(map(lambda x: re.sub(r'\(dot\)', '.', x), data))
    data = list(map(lambda x: deaccent(x), data))

    return data
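
A possible call, assuming verbose = False and that _make_cleaning(word, table), defined elsewhere in the original script, simply applies word.translate(table):

print(to_normalize(['“Süper” – dot(dot)com']))
# -> ['"Super" - dot.com']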
Example #31
def clean_string(string):
    # Empty strings
    if not string or string == 'N':
        return None

    string = deaccent(string).lower()

    # Remove quote text
    string = re.sub(re_reply_to, '', string)
    string = re.sub(re_quote_line, '', string)

    string = re.sub(re_youtube_link, ' YOUTUBELINK ', string)
    string = re.sub(re_link, ' WEBLINK ', string)
    string = re.sub(re_pol_board, ' pol ', string)
    string = re.sub(re_b_board, ' RANDOMBOARD ', string)
    string = re.sub(re_chan_board, ' CHANBOARD ', string)

    string = strip_punctuation(string)

    # Punctuation to remove completely
    # string = re.sub(re_punc_to_none, '', string)

    # Substitute in this order
    # string = re.sub(re_ellipsis, ' <ELLIPSIS> ', string)
    # string = re.sub(re_echoes, ' <ECHOES> ', string)
    # string = re.sub(re_pol_board, ' <POLBOARD> ', string)
    # string = re.sub(re_numbers, ' <NUMBER> ', string)
    # string = re.sub(re_period, ' <PERIOD> ', string)
    # string = re.sub(re_question, ' <QUESTION> ', string)

    # Replace all other punc to spaces and remove whitespace in between
    # string = re.sub(re_punc_to_space, ' ', string)

    string = ' '.join(string.split())

    return string if string else None
Example #32
def article_to_bow(article):
    tokens = [deaccent(tok).lower() for tok in list(itertools.chain(*article))
              if tok not in stopwords.words('norwegian') and tok.isalpha()]

    return tokens
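
A possible call, assuming import itertools, from nltk.corpus import stopwords and from gensim.utils import deaccent; an article here is a list of sentences, each a list of tokens:

article_to_bow([["Norske", "aviser", "!"], ["de", "skriver", "mye"]])
# -> deaccented, lowercased content words, e.g. ['norske', 'aviser', 'skriver', 'mye']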