import html

from normalizr import Normalizr


def xstr(s):
    # Assumed helper (the original snippet relied on an external xstr):
    # map None to an empty string so normalize() never receives None.
    return '' if s is None else str(s)


def normalize(text):
    normalizr = Normalizr(language='en')
    normalizations = [
        'remove_extra_whitespaces',
        ('replace_punctuation', {'replacement': ' '}),
        'lower_case',
        ('remove_stop_words', {'ignore_case': False}),  # boolean, not the string 'False'
    ]
    text = normalizr.normalize(xstr(text), normalizations)
    # html.unescape replaces the deprecated HTMLParser().unescape()
    return html.unescape(text)
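A quick usage sketch for the function above (output is illustrative and may vary with the normalizr version and its stop-word list):

print(normalize('  The QUICK, brown fox!  '))
# roughly: 'quick brown fox' -- punctuation becomes spaces, the stop word
# "the" is dropped after lowercasing, and extra whitespace is collapsed
print(repr(normalize(None)))  # '' -- xstr guards against None input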
Example #2
def text_processor(language='en', num=False, lower=False, level='token'):
    import warnings

    try:
        from normalizr import Normalizr
    except ImportError:
        try:
            # cucco is the renamed successor of normalizr; same API
            from cucco import Cucco as Normalizr
        except ImportError:
            warnings.warn("Try installing normalizr or cucco")
            return lambda sent: sent

    normalizations = [
        ('replace_emails', {'replacement': '<email>'}),
        ('replace_emojis', {'replacement': '<emoji>'}),
        ('replace_urls', {'replacement': '<url>'})]
    normalizr = Normalizr()

    import re
    NUM = re.compile('[0-9]+')

    def processor(sent):
        sent = normalizr.normalize(sent, normalizations)
        if num:
            sent = NUM.sub('<num>', sent)  # number substitution
        if lower:
            sent = sent.lower()  # downcase
        # segmenter is an external helper assumed to be in scope; it splits
        # the normalized sentence at the requested level (e.g. tokens)
        return segmenter(sent, level=level)

    return processor
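A minimal sketch of using the factory, with a hypothetical whitespace segmenter standing in for the real one (both the helper and the output are illustrative):

def segmenter(sent, level='token'):
    # Hypothetical stand-in: token level splits on whitespace,
    # anything else falls back to a character list
    return sent.split() if level == 'token' else list(sent)


process = text_processor(num=True, lower=True)
print(process('Mail doe@example.com, room 42'))
# roughly: ['mail', '<email>,', 'room', '<num>']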
def normalisation(tweet):
    # Strip @mentions, inline HTML tags and #hashtags, then collapse any
    # run of a repeated character down to two (e.g. "soooo" -> "soo")
    mention_removed = re.sub(r'(?:@[\w_]+)', '', tweet.lower())
    html_removed = re.sub(r'<[^>]+>', '', mention_removed)
    hashtag_removed = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", '',
                             html_removed)
    removed_repeated_chars = re.sub(r'(.)\1+', r'\1\1', hashtag_removed)
    normalised_text1 = re.sub(' +', ' ', removed_repeated_chars)

    normalizr = Normalizr(language='en')

    normalizations = [
        ('replace_urls', {'replacement': ' '}),
        ('replace_punctuation', {'replacement': ' '}),
        ('replace_emojis', {'replacement': ' '}),
        ('replace_hyphens', {'replacement': ' '}),
        ('replace_symbols', {'replacement': ' '}),
        'remove_accent_marks',
        'remove_stop_words',
        'remove_extra_whitespaces',
    ]

    normalised_text2 = normalizr.normalize(normalised_text1, normalizations)
    array_words = normalised_text2.split()

    # correction() is an external spell-corrector assumed to be in scope
    # (e.g. a Norvig-style corrector)
    normalised_text3 = [correction(word) for word in array_words]
    normalised_tweet = " ".join(normalised_text3)

    return normalised_tweet
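A usage sketch with an identity stand-in for the external corrector (the real one would fix spelling; the output is approximate):

import re
from normalizr import Normalizr


def correction(word):
    # Hypothetical stand-in for the external spell-corrector
    return word


print(normalisation('@user Sooo cooool!!! #hype http://t.co/x <b>hey</b>'))
# roughly: 'soo cool hey' -- mention, hashtag, tag and URL are stripped,
# repeated characters collapse to two, punctuation becomes spaces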
Example #4
def text_processor(
    language='en',
    num=False,
    lower=False,
    level='token',
    normalize=True,
    max_len=None,
    min_len=0,
):
    normalizations = [
        ('replace_emails', {'replacement': '<email>'}),
        ('replace_emojis', {'replacement': '<emoji>'}),
        ('replace_urls', {'replacement': '<url>'}),
    ]

    normalizer = None
    try:
        from normalizr import Normalizr
        normalizer = Normalizr().normalize
    except ImportError:
        try:
            from cucco import Cucco
            normalizer = Cucco().normalize
        except ImportError:
            import warnings
            warnings.warn(
                "Try installing normalizr or cucco for better normalization")

    import re
    NUM = re.compile('[0-9]+')

    def processor(sent):
        if normalize and normalizer is not None:
            sent = normalizer(sent, normalizations)
        if num:
            sent = NUM.sub('<num>', sent)  # number substitution
        if lower:
            sent = sent.lower()  # downcase

        # segmenter is an external helper assumed to be in scope
        sent = segmenter(sent, level=level)

        # Drop sentences that are too short; truncate those that are too long
        if len(sent) <= min_len:
            return None
        if max_len is not None and len(sent) > max_len:
            return sent[:max_len]
        return sent

    return processor
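Reusing a whitespace segmenter stand-in like the one sketched earlier, the length guards can be exercised as follows (illustrative):

process = text_processor(num=True, lower=True, min_len=1, max_len=4)
tokens = process('See https://example.com for the 2024 release notes today')
print(tokens)  # at most 4 tokens, e.g. ['see', '<url>', 'for', 'the']
assert process('') is None  # empty input falls below min_len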
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Usage: %s output_dir corpus_name

Extracts and normalizes all texts saved by the RSS feed crawler.
The resulting file is a corpus usable for word2vec training.
"""

from pymongo import MongoClient
from normalizr import Normalizr
import codecs
import re
import sys

normalizr = Normalizr(language='de')

normalizations = [
    'remove_extra_whitespaces',
    'replace_hyphens',
    'remove_accent_marks',
    'replace_symbols',
    ('replace_punctuation', {'replacement': ' '}),
]

category_extractors = {
    u'Spiegel': re.compile(r"http://www\.spiegel\.de/(\w+)/"),
    u'Tagesschau': re.compile(r"http://www\.tagesschau\.de/(\w+)/"),
    u'N24': re.compile(
Example #6
import sys
import nltk
import networkx as nx
import matplotlib.pyplot as plotter
from normalizr import Normalizr
from collections import Counter

normalizr = Normalizr(language='en')

#Corpus variables
#corpus = nx.Graph()
#crpFileName = "us history timeline v2.txt"
#crpFileName = "history.txt"
#crpFile = open(crpFileName, 'r')
#crpLines = crpFile.readline()
#crpLines = crpLines.split(" ")
#crpLines = [str for str in crpLines if str != ""]

#crpLines = set()  # []
#with open(crpFileName, 'r') as f:
#    for line in f:
#        for word in line.split():
#            if word != "" and len(word) > 2 and word[0].isupper() and word[1].islower():
#                #crpLines.append(word)
#                crpLines.add(word)

#crpEdgeWeight = 1
#capNodeWeight = 5
#crpMaxX = len(crpLines)

#neighborNum = int(input("Min num neighbors for each node (2 or more):\r\n"))
Example #7
import tweepy
import time
from normalizr import Normalizr

#The current date, formatted as (month-day)
now = time.localtime()
date = "({}-{})".format(now.tm_mon, now.tm_mday)

numTweetsToSave = int(input("Tweets to download:\r\n"))
phraseToTrack = input("Phrase to track:\r\n")
fileName = phraseToTrack + " " + date + ".txt"
tweetList = []  #Stores each tweet
stringList = []  #Stores each word from each tweet (2D)

normalizr = Normalizr(language='en')

#Consumer key & secret, access token & secret (redacted; use your own)
ckey = "<consumer_key>"
csecret = "<consumer_secret>"
atoken = "<access_token>"
asecret = "<access_token_secret>"
'''
The following code is based closely on code written by:

Harrison Kinsley
PythonProgramming.net

It is used with his permission and can be found in the following YouTube
playlist:
Example #8
# Assumes module-level imports/setup: os, codecs, OrderedDict (collections),
# Normalizr, and `path` pointing at the package directory.
class Parser:
    """
    This class offers a simple way to parse Spanish names (from Spain),
    classifying names and surnames and detecting the given name's gender.

    Attributes:
        force_combinations (boolean): Force combinations during classification.
        force_split (boolean): Force name split if no surnames detected.
        normalize (boolean): Enable or disable normalization.
        require_surnames (boolean): Force the parser to guess gender on full names (those including names and surnames)
    """
    __names, __ratios = {}, {}
    __surnames = set()

    __normalizr = Normalizr('es')
    __normalizations = [
        'replace_hyphens',
        ('replace_symbols', {
            'format': 'NFKC',
            'excluded': set(['ñ', 'Ñ', 'ç', 'Ç'])
        }),
        ('replace_punctuation', {'excluded': set('\'')}),
        ('remove_accent_marks', {
            'excluded': set([u'\N{COMBINING TILDE}', u'\N{COMBINING CEDILLA}'])
        }),
        'remove_extra_whitespaces',
    ]

    def __init__(self,
                 force_combinations=True,
                 force_split=True,
                 normalize=True,
                 require_surnames=False):
        self.__force_combinations = force_combinations
        self.__force_split = force_split
        self.__normalize = normalize
        self.__require_surnames = require_surnames

        self._load_data()

    def _load_data(self):
        """
        Load all data files into memory.
        """
        self._load_names()
        self._load_name_surname_ratios()
        self._load_surnames()

    def _load_names(self):
        """
        Load names data file.

        This file contains a list of Spanish given names with the probability for
        each one to be a male or female name.
        """
        for line in self.remove_file_comments('names_ine.tsv'):
            (name, frequency, prob_male) = line.split('\t')
            self.__names[name] = float(prob_male)

    def _load_name_surname_ratios(self):
        """
        Load name/surnames ratios data file.

        The file contains a list of names and surnames with the probability for each
        one to be a name (lower values) or a surname (higher values).
        """
        for line in self.remove_file_comments('name_surname_ratio.tsv'):
            (key, val) = line.split('\t')
            self.__ratios[key] = float(val)

    def _load_surnames(self):
        """
        Load surnames data file.

        This file contains a list of Spanish surnames.
        """
        for line in self.remove_file_comments('surnames_ine.tsv'):
            self.__surnames.add(line.split('\t')[0])

    def remove_file_comments(self, relative_path):
        """
        Generator to remove comments from a file.

        Params:
            relative_path: Path of the data file, relative to the data directory.
        """
        with codecs.open(os.path.join(path, 'data', relative_path), 'r',
                         'UTF-8') as file:
            for line in file:
                line = line.strip()
                if not line.startswith('#'):
                    yield line

    def guess_gender(self, fullname):
        """
        Guess the gender of the given full name.

        Params:
            fullname: Full name from where we want to guess the gender.

        Returns:
            An OrderedDict with the computed information, or None if no name
            was identified.
        """
        if isinstance(fullname, str):
            if self.__normalize:
                fullname = self.__normalizr.normalize(
                    fullname, self.__normalizations).lower()

            names, surnames = self._classify(fullname)

            if names and (not self.__require_surnames
                          or surnames
                          or (self.__force_split and self._is_splittable(names))):
                real_name, ratio = self._get_gender_ratio(list(names))
                return self._create_answer(real_name, ratio, names, surnames)

    def _is_splittable(self, names):
        """
        Check if a list of names can be split into names and surnames.

        Params:
            names: List of names to be checked.

        Returns:
            True if it can be split, False otherwise.
        """
        if not self.__require_surnames or len(names) > 1:
            last_name = names[-1]
            return last_name in self.__ratios and self.__ratios[last_name] < 1
        else:
            return False

    def _classify(self, fullname):
        """
        Split fullname into tokens and classify them into names and surnames based on datasets.

        Params:
            fullname: Full name to be classified.

        Returns:
            Two lists, one with names and the other with surnames.
        """
        names, surnames = [], []
        unclassified = []
        processed = []

        for word in fullname.split():
            combination_found = False

            if self.__force_combinations:
                combination_found = self._combine_words(
                    processed, word, names, surnames)

            if not combination_found:
                keep_going = True
                if unclassified:
                    if self._classify_word(unclassified[-1] + ' ' + word,
                                           names, surnames, unclassified):
                        keep_going = False
                if keep_going:
                    if unclassified:
                        self._classify_word(word, names, surnames)
                    else:
                        self._classify_word(word, names, surnames,
                                            unclassified)

                processed.append(word)

        return names, surnames

    def _combine_words(self, processed, word, names, surnames):
        """
        Try to combine the last processed word with the word received as a parameter.

        If the combination of both words is a name, it is added to the list,
        replacing the name added previously.

        Params:
            processed: List of words already processed.
            word: Current word.
            names: List of classified names.
            surnames: List of classified surnames.

        Returns:
            True if a valid combination was found, False otherwise.
        """
        found_combination = False

        if processed:
            last_word = processed[-1]
            combination = last_word + ' ' + word

            if combination in self.__names:
                names.append(combination)
                found_combination = True
            elif combination in self.__surnames:
                surnames.append(combination)
                found_combination = True

        if found_combination:
            processed[-1] = combination
            if last_word in names:
                names.remove(last_word)
            if last_word in surnames:
                surnames.remove(last_word)

        return found_combination

    def _classify_word(self, word, names, surnames, unclassified=None):
        """
        Try to classify a word in name or surname based on datasets.

        Params:
            word: Word to be classified.
            names: List of classified names.
            surnames: List of classified surnames.
            unclassified: List of words without match.

        Returns:
            True if the word was classified. False otherwise.
        """
        classified = True

        if word in self.__ratios:
            if (not names or self.__ratios[word] > 0.5) and not surnames:
                names.append(word)
            else:
                surnames.append(word)
        else:
            if word in self.__surnames and names:
                surnames.append(word)
            elif word in self.__names and not surnames:
                names.append(word)
            else:
                if unclassified is not None:
                    unclassified.append(word)
                classified = False

        if classified and unclassified is not None:
            unclassified.clear()

        return classified

    def _get_gender_ratio(self, names):
        """
        Returns the male/female ratio for the given names.

        To do this, the function computes possible names by combining items
        in the list, trying to form the longest name possible.

        The returned ratio goes from 0 to 1; values near 1 indicate that the
        evaluated name is more likely a male name.

        Params:
            names: List of names.

        Returns:
            The longest name computed by combining items in the list and the
            male/female ratio, or None if no combination matches the dataset.
        """
        for i in range(len(names), 0, -1):
            real_name = ' '.join(names[:i])
            if real_name in self.__names:
                return real_name, self.__names[real_name]

    def _create_answer(self, real_name, ratio, names, surnames):
        """
        Process computed data and generate the answer.

        Params:
            real_name: Real name (computed name) extracted from the original text.
            ratio: Male/female ratio.
            names: Names identified on the original text.
            surnames: Surnames identified on the original text.

        Returns:
            An OrderedDict with all the computed information.
        """
        answer = OrderedDict()
        answer['names'] = names
        answer['surnames'] = surnames
        answer['real_name'] = real_name
        male = ratio > 0.5
        answer['gender'] = 'Male' if male else 'Female'
        answer['confidence'] = ratio if male else 1 - ratio
        return answer
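A short usage sketch for the Parser class; the exact ratio depends on the bundled INE datasets, so the numbers here are illustrative:

parser = Parser()
answer = parser.guess_gender('María García López')
if answer is not None:
    print(answer['real_name'], answer['gender'], answer['confidence'])
    # e.g. maria Female 0.99 -- 'garcia' and 'lopez' land in surnames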
Example #9
# -*- coding: utf-8 -*-

from normalizr import Normalizr

arq = open("negocios.txt", 'r')

arq_2 = open("FINAL_Negocios.txt", 'w')


normalizr = Normalizr(language='en')

texto = ""

# Replace punctuation and control characters with spaces, line by line
for s in arq:
    s = s.replace("\n", " ")
    s = s.replace("?", " ")
    s = s.replace('“', "")
    s = s.replace(":", " ")
    #s = s.replace("'", " ")
    s = s.replace("+", " ")
    s = s.replace(";", " ")
    s = s.replace(",", " ")
    s = s.replace('"', " ")
    s = s.replace('(', " ")
    s = s.replace(')', " ")
    s = s.replace("\t", "")
    s = s.replace('\\', "")
    s = s.replace('‘', " ")
    s = s.replace('!', " ")
    s = s.replace('.', " ")
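The long replace() chain above can be collapsed with str.translate; a sketch under the assumption that every listed character should simply become a space or be dropped:

# One translation table instead of a chain of replace() calls:
# these characters become spaces ...
TO_SPACE = str.maketrans({c: ' ' for c in '?:+;,"()‘!.'})
# ... and these are dropped entirely
TO_DROP = str.maketrans('', '', '\t\\“')

def clean_line(s):
    return s.replace('\n', ' ').translate(TO_DROP).translate(TO_SPACE)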
Example #10
import html
import re
import fjlc.classifier.classifier_options as classifier_options
from fjlc.preprocessing.filters.regex_filters import RegexFilters
from normalizr import Normalizr

normalizr = Normalizr(language="en")


class Filters:
    USERNAME_PLACEHOLDER = " ||username|| "
    HASHTAG_PLACEHOLDER = " ||hashtag|| "
    RTTAG_PLACEHOLDER = " ||rt|| "
    URL_PLACEHOLDER = " ||url|| "

    def __init__(self, string_filters, token_filters):
        self.string_filters = string_filters
        self.token_filters = token_filters

    def apply(self, text):
        text = self.string_chain(text, self.string_filters)
        return self.token_chain(text, self.token_filters).strip()

    @staticmethod
    def string_chain(text, filters):
        """
        Chain several filters after each other, applies the filter on the entire string
        :param text: String to format
        :param filters: Sequence of filters to apply on String
        :return: The formatted String
        """