def normalize(text):
    """Normalize *text* with normalizr and unescape HTML entities.

    Collapses whitespace, replaces punctuation with spaces, lower-cases the
    text and removes stop words, then decodes any HTML entities left over.

    Params:
        text: Input text (may be None; `xstr` presumably coerces to '' —
            confirm against its definition elsewhere in the project).

    Returns:
        The normalized, HTML-unescaped string.
    """
    normalizr = Normalizr(language='en')
    normalizations = [
        'remove_extra_whitespaces',
        ('replace_punctuation', {
            'replacement': ' '
        }),
        'lower_case',
        # BUG FIX: the option used to be the *string* 'False', which is
        # truthy and therefore behaved as ignore_case=True; pass the real
        # boolean so stop-word removal stays case-sensitive as intended.
        ('remove_stop_words', {
            'ignore_case': False
        })
    ]
    h = HTMLParser()
    text = normalizr.normalize(xstr(text), normalizations)
    # NOTE(review): HTMLParser.unescape is deprecated in modern Python;
    # html.unescape() is the stdlib replacement — confirm which HTMLParser
    # this file imports before switching.
    return str(h.unescape(text))
def text_processor(language='en', num=False, lower=False, level='token'):
    """Build and return a sentence-processing callable.

    The returned callable normalizes a sentence (masking e-mails, emojis and
    URLs with placeholder tokens), optionally replaces digit runs and
    lower-cases the text, then segments it at the requested *level*.
    Falls back to an identity function when neither ``normalizr`` nor
    ``cucco`` is installed.
    """
    try:
        from normalizr import Normalizr
    except ImportError:
        try:
            from cucco import Cucco as Normalizr
        except ImportError:
            warnings.warn("Try installing normalizr or cucco")
            return lambda sent: sent

    import re

    # Mask noisy / personally-identifying spans with placeholder tokens.
    rules = [
        ('replace_emails', {'replacement': '<email>'}),
        ('replace_emojis', {'replacement': '<emoji>'}),
        ('replace_urls', {'replacement': '<url>'}),
    ]
    engine = Normalizr()
    digit_re = re.compile('[0-9]+')

    def processor(sent):
        sent = engine.normalize(sent, rules)
        if num:
            sent = digit_re.sub('<num>', sent)  # number substitution
        if lower:
            sent = sent.lower()  # downcase
        return segmenter(sent, level=level)

    return processor
def normalisation(tweet):
    """Normalize a raw tweet into a cleaned, spell-corrected string.

    Strips @mentions, HTML tags and hashtags, squeezes repeated characters
    down to two, runs a normalizr pipeline (URLs, punctuation, emojis,
    hyphens, symbols, accents, stop words, whitespace) and finally applies
    `correction` to every remaining word.
    """
    text = tweet.lower()
    text = re.sub(r'(?:@[\w_]+)', '', text)                    # @mentions
    text = re.sub(r'<[^>]+>', '', text)                        # HTML tags
    text = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", '', text)  # hashtags
    text = re.sub(r'(.)\1+', r'\1\1', text)                    # "soooo" -> "soo"
    text = re.sub(' +', ' ', text)                             # collapse spaces

    normalizr = Normalizr(language='en')
    pipeline = [
        ('replace_urls', {'replacement': ' '}),
        ('replace_punctuation', {'replacement': ' '}),
        ('replace_emojis', {'replacement': ' '}),
        ('replace_hyphens', {'replacement': ' '}),
        ('replace_symbols', {'replacement': ' '}),
        'remove_accent_marks',
        'remove_stop_words',
        'remove_extra_whitespaces',
    ]
    text = normalizr.normalize(text, pipeline)

    # Spell-correct each remaining word and re-join.
    corrected = [correction(word) for word in text.split()]
    return " ".join(corrected)
def text_processor(
        language='en',
        num=False,
        lower=False,
        level='token',
        normalize=True,
        max_len=None,
        min_len=0,
):
    """Build and return a sentence-processing callable.

    The callable optionally normalizes a sentence (masking e-mails, emojis
    and URLs), substitutes digit runs, lower-cases, then segments it and
    enforces length bounds:

    - returns None when the segmented output has at most *min_len* items;
    - truncates to *max_len* items when a bound is given.

    Normalization degrades gracefully: when neither ``normalizr`` nor
    ``cucco`` is installed the step is simply skipped (with a warning).
    """
    rules = [('replace_emails', {
        'replacement': '<email>'
    }), ('replace_emojis', {
        'replacement': '<emoji>'
    }), ('replace_urls', {
        'replacement': '<url>'
    })]

    normalizer = None
    try:
        from normalizr import Normalizr
        normalizer = Normalizr().normalize
    except ImportError:
        try:
            from cucco import Cucco
            normalizer = Cucco().normalize
        except ImportError:
            warnings.warn(
                "Try installing normalizr or cucco for better normalization")

    digit_re = re.compile('[0-9]+')

    def processor(sent):
        if normalize and normalizer is not None:
            sent = normalizer(sent, rules)
        if num:
            sent = digit_re.sub('<num>', sent)  # number substitution
        if lower:
            sent = sent.lower()  # downcase
        tokens = segmenter(sent, level=level)
        if len(tokens) <= min_len:
            return None
        if max_len is not None and len(tokens) > max_len:
            return tokens[:max_len]
        return tokens

    return processor
#!/usr/bin/env python # -*- coding: utf-8 -*- """ Usage: %s output_dir corpus_name Extracts and normalizes all texts that are saved by the rss feed-crawler. The resulting file is a usable corpus for training with word2vec """ from pymongo import MongoClient from normalizr import Normalizr import codecs import re import sys normalizr = Normalizr(language='de') normalizations = [ 'remove_extra_whitespaces', 'replace_hyphens', 'remove_accent_marks', 'replace_symbols', ('replace_punctuation', { 'replacement': ' ' }) ] category_extractors = { u'Spiegel': re.compile(r"http://www\.spiegel\.de/(\w+)/"), u'Tagesschau': re.compile(r"http://www\.tagesschau\.de/(\w+)/"), u'N24': re.compile(
import sys
import nltk
import networkx as nx
import matplotlib.pyplot as plotter
from normalizr import Normalizr
from collections import Counter

# English-language text normalizer shared by the script.
normalizr = Normalizr(language='en')

#Corpus variables
# NOTE(review): everything below except the final input() is commented-out
# legacy corpus-loading code, kept verbatim; consider deleting it.
#corpus = nx.Graph()
#crpFileName = "us history timeline v2.txt"
#crpFileName = "history.txt"
#crpFile = open(crpFileName, 'r')
#crpLines = crpFile.readline()
#crpLines = crpLines.split(" ")
#crpLines = [str for str in crpLines if str != ""]
#crpLines = set()#[]
#with open(crpFileName,'r') as f:
#    for line in f:
#        for word in line.split():
#            if (word != "" and len(word) > 2 and word[0].isupper() and word[1].islower()):
#                crpLines.append(word)
#                crpLines.add(word)
#crpEdgeWeight = 1
#capNodeWeight = 5
#crpMaxX = len(crpLines)

# Interactively ask for the minimum neighbor count per graph node.
neighborNum = int(input("Min num neighbors for each node (2 or more):\r\n"))
import tweepy
import time
from normalizr import Normalizr

#The current date, formatted as "(month-day)" for the output file name.
date = "(" + str(time.localtime().tm_mon) + "-" + str(
    time.localtime().tm_mday) + ")"

# Interactive configuration: how many tweets to fetch and for which phrase.
numTweetsToSave = input("Tweets to download: " + "\r\n")
numTweetsToSave = int(numTweetsToSave)
phraseToTrack = input("Phrase to track: " + "\r\n")
fileName = phraseToTrack + " " + date + ".txt"

tweetList = []  #Stores each tweet
stringList = []  #Stores each word from each tweet (2D)

normalizr = Normalizr(language='en')

#Consumer key & secret, access token & secret
# SECURITY NOTE(review): real-looking Twitter API credentials are
# hard-coded in source. They should be considered compromised: rotate
# them and load them from environment variables or a config file instead.
ckey = "Fkd21xKJHEC6v9jrOJeesjGPG"
csecret = "tjZQjWPjwbIIZlPjfIN3t1aVFX6JnwUF2BYS3di4V9WhyOw4VJ"
atoken = "807338254202642432-Nzlv8Re6FV19FK621k9T5tjtFr2Knh0"
asecret = "8s5ynqCDSplh6rQTxkp0UETsPfqe4q6SkoPGGhEXVr5A0"

'''
The following code is based closely on code written by:
Harrison Kinsley
PythonProgramming.net
It is used with his permission and can be found in the following YouTube playlist:
class Parser:
    """Parse Spanish full names (from Spain): classify the tokens into
    given names and surnames and guess the gender of the given name.

    Attributes:
        force_combinations (boolean): Force combinations during classification.
        force_split (boolean): Force name split if no surnames detected.
        normalize (boolean): Enable or disable normalization.
        require_surnames (boolean): Force the parser to guess gender only on
            full names (those including names and surnames).
    """

    # Class-level caches shared by all instances:
    #   __names: given name -> probability of the name being male (0..1)
    #   __ratios: token -> name-vs-surname ratio (lower = name, higher = surname)
    #   __surnames: set of known surnames
    __names, __ratios = {}, {}
    __surnames = set()
    # Spanish-language text normalizer applied before classification.
    __normalizr = Normalizr('es')
    # NOTE(review): the chained assignment also binds a second class
    # attribute `normalizations` — probably unintentional.
    __normalizations = normalizations = [
        'replace_hyphens',
        ('replace_symbols', {
            'format': 'NFKC',
            # Keep the Spanish/Catalan letters that symbol replacement
            # would otherwise mangle.
            'excluded': set(['ñ', 'Ñ', 'ç', 'Ç'])
        }),
        ('replace_punctuation', {
            'excluded': set('\'')
        }),
        ('remove_accent_marks', {
            # Preserve the combining marks that distinguish ñ and ç.
            'excluded': set([u'\N{COMBINING TILDE}', u'\N{COMBINING CEDILLA}'])
        }),
        'remove_extra_whitespaces'
    ]

    def __init__(self, force_combinations=True, force_split=True, normalize=True, require_surnames=False):
        """Store the configuration flags and load the data files."""
        self.__force_combinations = force_combinations
        self.__force_split = force_split
        self.__normalize = normalize
        self.__require_surnames = require_surnames
        self._load_data()

    def _load_data(self):
        """Load all data files into memory."""
        self._load_names()
        self._load_name_surname_ratios()
        self._load_surnames()

    def _load_names(self):
        """Load the given-names data file.

        Each tab-separated line holds a Spanish given name, its frequency
        and the probability of the name being male.
        """
        for line in self.remove_file_comments('names_ine.tsv'):
            (name, frequency, prob_male) = line.split('\t')  # frequency unused
            self.__names[name] = float(prob_male)

    def _load_name_surname_ratios(self):
        """Load the name/surname ratios data file.

        Each tab-separated line maps a token to the probability of it being
        a name (lower values) or a surname (higher values).
        """
        for line in self.remove_file_comments('name_surname_ratio.tsv'):
            (key, val) = line.split('\t')
            self.__ratios[key] = float(val)

    def _load_surnames(self):
        """Load the surnames data file.

        This file contains a list of spanish surnames.
        """
        for line in self.remove_file_comments('surnames_ine.tsv'):
            # self.__surnames.append(line.split('\t')[0])
            self.__surnames.add(line.split('\t')[0])

    def remove_file_comments(self, relative_path):
        """Generator yielding the non-comment lines of a data file.

        Params:
            relative_path: Path of the file below the package `data` dir.
        """
        # NOTE(review): `path` is a module-level variable defined elsewhere
        # in this file — presumably the package directory; confirm.
        with codecs.open(os.path.join(path, 'data', relative_path), 'r',
                         'UTF-8') as file:
            for line in file:
                line = line.strip()
                if not line.startswith('#'):
                    yield line

    def guess_gender(self, fullname):
        """Guess the gender of the given full name.

        Params:
            fullname: Full name from where we want to guess the gender.

        Returns:
            An OrderedDict with the computed information (see
            `_create_answer`), or None when nothing could be classified.
        """
        if isinstance(fullname, str):
            if self.__normalize:
                fullname = self.__normalizr.normalize(
                    fullname, self.__normalizations).lower()

            names, surnames = self._classify(fullname)

            if names and (
                    not self.__require_surnames or
                    (surnames or
                     (self.__force_split and self._is_splittable(names)))):
                # NOTE(review): _get_gender_ratio may return None when no
                # combination of the names is in the dataset, which would
                # raise TypeError on this unpacking — confirm intent.
                real_name, ratio = self._get_gender_ratio(list(names))
                return self._create_answer(real_name, ratio, names, surnames)

    def _is_splittable(self, names):
        """Check if a list of names can be splitted in names and surnames.

        Params:
            names: List of names to be checked.

        Returns:
            True if it can be splitted, False otherwise.
        """
        if not self.__require_surnames or len(names) > 1:
            last_name = names[-1]
            # Splittable when the last token can also act as a surname.
            return last_name in self.__ratios and self.__ratios[last_name] < 1
        else:
            return False

    def _classify(self, fullname):
        """Split fullname into tokens and classify them into names and
        surnames based on the datasets.

        Params:
            fullname: Full name to be classified.

        Returns:
            Two lists, one with names and the other with surnames.
        """
        names, surnames = [], []
        unclassified = []
        processed = []

        for word in fullname.split():
            combination_found = False
            if self.__force_combinations:
                # First try to merge this word with the previous one
                # (compound names such as "maria jose").
                combination_found = self._combine_words(
                    processed, word, names, surnames)

            if not combination_found:
                keep_going = True
                if unclassified:
                    # Retry the previous unmatched word joined with this one.
                    if self._classify_word(unclassified[-1] + ' ' + word,
                                           names, surnames, unclassified):
                        keep_going = False
                if keep_going:
                    if unclassified:
                        self._classify_word(word, names, surnames)
                    else:
                        self._classify_word(word, names, surnames,
                                            unclassified)
            processed.append(word)

        return names, surnames

    def _combine_words(self, processed, word, names, surnames):
        """Try to combine the last processed word with `word`.

        If the combination of both words is a known name or surname, it is
        added to the corresponding list, replacing the entry added for the
        stand-alone word previously.

        Params:
            processed: List of words already processed.
            word: Current word.
            names: List of classified names.
            surnames: List of classified surnames.

        Returns:
            True if a valid combination was found, False otherwise.
        """
        found_combination = False
        if processed:
            last_word = processed[-1]
            combination = last_word + ' ' + word
            if combination in self.__names:
                names.append(combination)
                found_combination = True
            elif combination in self.__surnames:
                surnames.append(combination)
                found_combination = True
            if found_combination:
                # Replace the stand-alone word with the combination.
                processed[-1] = combination
                if names and last_word in names:
                    names.pop(names.index(last_word))
                if surnames and last_word in surnames:
                    surnames.pop(surnames.index(last_word))
        return found_combination

    def _classify_word(self, word, names, surnames, unclassified=None):
        """Try to classify a word as name or surname based on the datasets.

        Params:
            word: Word to be classified.
            names: List of classified names.
            surnames: List of classified surnames.
            unclassified: List of words without match.

        Returns:
            True if the word was classified, False otherwise.
        """
        classified = True
        if word in self.__ratios:
            # Ambiguous token: decide by ratio and by what was seen so far
            # (once a surname appears, later tokens are surnames too).
            if (not names or self.__ratios[word] > 0.5) and not surnames:
                names.append(word)
            else:
                surnames.append(word)
        else:
            if word in self.__surnames and names:
                surnames.append(word)
            elif word in self.__names and not surnames:
                names.append(word)
            else:
                if unclassified is not None:
                    unclassified.append(word)
                classified = False
        # A successful match resolves any pending unmatched words.
        if classified and unclassified is not None:
            unclassified.clear()
        return classified

    def _get_gender_ratio(self, names):
        """Return the male/female ratio for the given names.

        The function combines items of the list trying to form the longest
        known name possible. The value returned goes from 0 to 1; values
        near 1 represent a higher possibility of the evaluated name to be
        a male name.

        Params:
            names: List of names.

        Returns:
            The longest name found in the dataset and its male/female
            ratio, or None when no combination is known.
        """
        # Try the longest prefix first, then progressively shorter ones.
        for i in range(len(names), 0, -1):
            real_name = ' '.join(names[:i])
            if real_name in self.__names:
                return real_name, self.__names[real_name]

    def _create_answer(self, real_name, ratio, names, surnames):
        """Process computed data and build the answer.

        Params:
            real_name: Real name (computed name) extracted from the
                original text.
            ratio: Male/female ratio.
            names: Names identified on the original text.
            surnames: Surnames identified on the original text.

        Returns:
            An OrderedDict with names, surnames, real name, gender and
            confidence.
        """
        answer = OrderedDict()
        answer['names'] = names
        answer['surnames'] = surnames
        answer['real_name'] = real_name
        male = ratio > 0.5
        answer['gender'] = 'Male' if male else 'Female'
        # Confidence is always expressed relative to the chosen gender.
        answer['confidence'] = ratio if male else 1 - ratio
        return answer
# -*- coding: utf-8 -*- from normalizr import Normalizr arq = open("negocios.txt", 'r') arq_2 = open("FINAL_Negocios.txt", 'w') #arq_2 = open("FINAL_Negocios.txt", 'w') normalizr = Normalizr(language='en') texto = "" for s in arq: s = s.replace("\n", " ") s = s.replace("?", " ") s = s.replace('“', "") s = s.replace(":", " ") #s = s.replace("'", " ") s = s.replace("+", " ") s = s.replace(";", " ") s = s.replace(",", " ") s = s.replace('"', " ") s = s.replace('(', " ") s = s.replace(')', " ") s = s.replace("\t", "") s = s.replace('\\', "") s = s.replace('‘', " ") s = s.replace('!', " ") s = s.replace('.', " ")
import html import re import fjlc.classifier.classifier_options as classifier_options from fjlc.preprocessing.filters.regex_filters import RegexFilters from normalizr import Normalizr normalizr = Normalizr(language="en") class Filters: USERNAME_PLACEHOLDER = " ||username|| " HASHTAG_PLACEHOLDER = " ||hashtag|| " RTTAG_PLACEHOLDER = " ||rt|| " URL_PLACEHOLDER = " ||url|| " def __init__(self, string_filters, token_filters): self.string_filters = string_filters self.token_filters = token_filters def apply(self, text): text = self.string_chain(text, self.string_filters) return self.token_chain(text, self.token_filters).strip() @staticmethod def string_chain(text, filters): """ Chain several filters after each other, applies the filter on the entire string :param text: String to format :param filters: Sequence of filters to apply on String :return: The formatted String """