Code Example #1
def main(components=None):
    initials, vowels, finals, repeat_cnt, total_cnt = components or gibberish_components()
    pf = ProfanityFilter()
    cnt = 0
    profane_cnt = 0
    with alive_bar(total_cnt) as bar:
        for i in initials:
            for v in vowels:
                for f in finals:
                    prefix = ''.join([i, v, f])
                    if pf.is_profane(prefix):
                        print(cnt, 'All %s words beginning with "%s..."' % (repeat_cnt, prefix))
                        cnt += repeat_cnt
                        profane_cnt += repeat_cnt
                        bar(incr=repeat_cnt)
                        continue
                    for v2 in vowels:
                        for f2 in finals:
                            cnt += 1
                            word = ''.join([prefix, v2, f2])
                            if pf.is_profane(word):
                                profane_cnt += 1
                                print(cnt, word)
                            bar()
    print('Done! Found %s profane words in %s total' % (profane_cnt, cnt))
Code Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Profanity filter console utility')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-t',
                       '--text',
                       dest='text',
                       help='Test the given text for profanity')
    group.add_argument('-f',
                       '--file',
                       dest='path',
                       help='Test the given file for profanity')
    parser.add_argument(
        '-l',
        '--languages',
        dest='languages',
        default='en',
        help='Test for profanity using specified languages (comma separated)')
    parser.add_argument('-o',
                        '--output',
                        dest='output_file',
                        help='Write the censored output to a file')
    parser.add_argument('--show',
                        action='store_true',
                        help='Print the censored text')

    args = parser.parse_args()

    # argparse already rejects --text together with --file via the mutually
    # exclusive group; this is a belt-and-braces check
    if args.text and args.path:
        parser.print_help()
        exit()

    if args.text:
        text = args.text
    elif args.path:
        with open(args.path) as f:
            text = f.read()
    else:
        text = ''

    pf = ProfanityFilter(languages=args.languages.split(','))
    censored_text = pf.censor(text)

    if args.output_file:
        with open(args.output_file, 'w') as f:
            f.write(censored_text)
        print("Censored text written to output file at: " + args.output_file)

    if args.show:
        print("Censored text:\n")
        print(censored_text)

    if args.show or args.output_file:
        return

    if pf.is_clean(text):
        print("This text is clean.")
    else:
        print("This text is not clean!")
Code Example #3
def main():
    start_time = time()

    print("Running Basic Setup Steps....")
    config_loader = ConfigLoader()
    output_directory_path = config_loader.get_base_path() + config_loader.get_output_directory_name()
    if not os.path.exists(output_directory_path):
        os.makedirs(output_directory_path)
    parser = Parser(config_loader)
    profanity_filter = ProfanityFilter(config_loader, parser)
    de_duplicator = DeDuplicator(parser)
    keyword_dictionary_builder = KeywordDictionaryBuilder(parser)
    sym_spell_checker = SymSpellChecker(config_loader, parser)

    print("Running Parser....")
    parser.parse(config_loader.get_query_logs_file_path(),
                 config_loader.get_frequency_file_path(),
                 config_loader.get_max_total_queries())

    print("Running De-duplicator....")
    de_duplicator.remove_duplicates(
        config_loader.get_frequency_file_path(),
        config_loader.get_frequency_file_path(),
        config_loader.get_de_duplicated_keyword_ordered_1_file_path(),
        config_loader.get_de_duplicated_missing_space_1_file_path(),
        config_loader.get_de_duplicated_synonyms_1_file_path())

    print("Running Profanity Filter....")
    profanity_filter.remove_profane_queries(
        config_loader.get_frequency_file_path(),
        config_loader.get_frequency_file_path(),
        config_loader.get_filtered_profane_queries_file_path())

    print("Running Keyword Dictionary Builder....")
    keyword_dictionary_builder.build_dictionary_file_from_frequency_file(
        config_loader.get_frequency_file_path(),
        config_loader.get_dictionary_file_path())

    print("Running SymSpell Checker....")
    sym_spell_checker.run_sym_spell(config_loader.get_sym_spell_iterations(),
                                    config_loader.get_frequency_file_path(),
                                    config_loader.get_dictionary_file_path(),
                                    config_loader.get_dictionary_file_path())

    print("Running De-duplicator....")
    de_duplicator.remove_duplicates(
        config_loader.get_dictionary_file_path(),
        config_loader.get_dictionary_file_path(),
        config_loader.get_de_duplicated_keyword_ordered_2_file_path(),
        config_loader.get_de_duplicated_missing_space_2_file_path(),
        config_loader.get_de_duplicated_synonyms_2_file_path())

    print("Completed!!!")

    print("Total time taken: ", (time() - start_time) / 60, " minutes")
Code Example #4
def get_profanities(words, custom_profanities=None):
    pf = ProfanityFilter()
    if custom_profanities is not None:
        pf.custom_profane_word_dictionaries = {'en': custom_profanities}
    swears = []
    for w in words:
        cw = pf.censor_word(w)
        if cw.is_profane:
            swears.append(cw.original_profane_word)
    return swears
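
A minimal hedged call of the helper above (the word list and custom dictionary are invented for illustration):

    # only the custom word should be flagged
    print(get_profanities(['hello', 'frob'], custom_profanities={'frob'}))
    # expected: ['frob']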
Code Example #5
def __init__(self, profane_words_filepath: str):
    words = []
    with open(profane_words_filepath, encoding='utf8') as f:
        for line in f:
            word = line.strip()
            words.append(word)
            # also register the spelling variant with 'ё' replaced by 'е';
            # the original appended every word twice, duplicating non-'ё' words
            if 'ё' in word:
                words.append(word.replace('ё', 'е'))
    self._ru_words = words
    self._ru_pf = ProfanityFilter()
    # the Russian word list is registered under the 'en' key, presumably so
    # the default (English) pipeline matches these words
    self._ru_pf.custom_profane_word_dictionaries = {'en': words}
    self._r = sr.Recognizer()
Code Example #6
def applyProfanityFilter():
    pf = ProfanityFilter()
    pf.censor_char = '@'

    # The original opened the transcript with mode 'w+' and then tried to read
    # it back, but 'w+' truncates the file before reading; read first instead.
    with open('media/recording1/transcript.csv', newline='') as csv_file:
        rows = list(csv.DictReader(csv_file))

    # Writing to a separate output file is an assumption; the original wrote
    # back into the same file it was reading.
    with open('media/recording1/transcript_censored.csv', mode='w', newline='') as out_file:
        csv_writer = csv.writer(out_file, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for row in rows:
            if not pf.is_clean(row['sentence']):
                csv_writer.writerow(['***', '****', '****', '*****', '*****'])

Code Example #7
def Predict(texts):
    pf = ProfanityFilter()
    sid = SentimentIntensityAnalyzer()
    labels = []
    for text in texts:
        if pf.is_profane(text):
            labels.append(0)
        else:
            ss = sid.polarity_scores(text)
            if ss['compound'] <= -0.05:
                labels.append(0)
            else:
                labels.append(1)
    return labels
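
A hedged call of Predict (expected labels assume VADER's default scoring and the filter's defaults):

    print(Predict(["I love this!", "This is shit."]))
    # expected: [1, 0] -- the second text is flagged as profane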
Code Example #8
class CommentForm(forms.Form):

    name = forms.CharField(label='Your Name', max_length=100)
    email = forms.EmailField(label='Your Email')
    content = forms.CharField(label='Comment', widget=forms.Textarea)

    def __init__(self, *args, **kwargs):
        self.pf = ProfanityFilter()
        super(CommentForm, self).__init__(*args, **kwargs)

    def clean_name(self):
        name = self.cleaned_data['name']

        # The name can only have a certain size
        if len(name) > 80:
            raise ValidationError('The name cannot be longer than 80 characters')

        return name

    def clean_content(self):
        content = self.cleaned_data['content']

        # Profanity is not allowed
        if not self.pf.is_clean(content):
            raise ValidationError('Profanity is not allowed in the comments!')

        # TODO: an alternative is to use an HTML sanitizer
        # No HTML markup is allowed
        soup = BeautifulSoup(content, 'html.parser')
        if bool(soup.find()):
            raise ValidationError('No html markup allowed in the content of a comment! Please understand that '
                                  'permitting html markup in comments is risky and vulnerable to attacks.')

        return content
Code Example #9
def is_profane(url):

    if len(url) < 3:
        return False

    if getattr(settings, "ENABLE_FAST_PROFANITY_CHECKING", True):
        parts = urlparse(get_decodedurl(url))
        partslist = []
        if not (parts.path or parts.netloc):
            raise InvalidURLError(
                "Badly formatted URL passed to is_url_profane")
        splitters = r"\.|\/|\_|\-|\~|\$|\+|\!|\*|\(|\)|\,"  # all the URL-safe characters, escaped
        if parts.netloc:
            partslist = partslist + re.split(splitters, parts.netloc)
        if parts.path:
            partslist = partslist + re.split(splitters, parts.path)
        if parts.query:
            partslist = partslist + re.split(splitters, parts.query)

        # speed optimization
        check4btlw = True
        stringlist = []
        for item in partslist:
            if len(item) > 0:
                if len(item) > 5:
                    check4btlw = False
                for substring in get_all_substrings(item, 2):
                    if len(substring) > 0:
                        stringlist.append(substring)
        partslist = list(dict.fromkeys(stringlist))  # removes dupes

        if check4btlw:
            for part in partslist:
                if part in BAD_THREE_LETTER_WORDS:
                    return True

        score = PredictProfanity(partslist)
        if score.any() == 1:
            return True

        if getattr(settings, "ENABLE_DEEP_PROFANITY_CHECKING", True):
            pf = ProfanityFilter()
            for part in partslist:
                if pf.is_profane(part):
                    return True

    return False
Code Example #10
File: views.py Project: vdarakjian/tcc-deep-learning
    def list(self, request):
        if all(k in request.query_params
               for k in ('comment', 'deep_flag', 'lang')):
            comment = request.query_params['comment']
            deep_flag = util.strtobool(request.query_params['deep_flag'])
            lang = request.query_params['lang']

            pf = ProfanityFilter(censor_whole_words=False,
                                 deep_analysis=deep_flag,
                                 languages=[lang])
            return Response({
                'comment': pf.censor(comment),
                'approved': pf.is_clean(comment)
            })
        else:
            return Response({'error_message': 'All params are required'},
                            status=status.HTTP_400_BAD_REQUEST)
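
A hedged request/response sketch for the view above (the route is an assumption):

    GET /filter/?comment=this+is+shit&deep_flag=false&lang=en
    # expected response body: {"comment": "this is ****", "approved": false}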
Code Example #11
File: chatbot.py Project: sclie001/shellhacks2020
def chat():
    incoming_msg = request.values.get('Body', '')
    resp = MessagingResponse()
    msg = resp.message()
    msg.body("")
    if "!start" in incoming_msg:
        msg.body(
            "Greetings! I am ModBot, here to watch over this chat. \n\nNow that you're all here, feel free to introduce yourselves. To break the ice, answer the following question: "
            + choose_icebreaker())
    elif "!icebreaker" in incoming_msg:
        msg.body("Answer the question: " + choose_icebreaker())
    else:
        pf = ProfanityFilter()
        if not pf.is_clean(incoming_msg):
            msg.body(
                "Please refrain from using inappropriate language. This is meant to be a safe space."
            )
    return str(resp)
Code Example #12
    def check_profanity_filter_text():

        pf = ProfanityFilter()

        # Open the text file at the given location; use a bare filename such as
        # 'profanity.txt' if the file sits in the same directory as the program.
        with open('/yourfilelocation/filename.txt') as file_location:
            # read() returns the whole file as one string.
            content_of_file = file_location.read()

        # censor() replaces offensive words with the censor character.
        text = pf.censor(content_of_file)

        # Print the contents of the file with offensive words masked by '*'.
        print(text)
Code Example #13
    def process(self, message, **kwargs):
        # define whatever custom behaviour we want here
        pf = ProfanityFilter()
        text = message.text
        value = "na"
        confidence = 0
        # e.g. text = "This is shit."
        # if a token is profane, assign it a confidence score of 100
        if pf.is_profane(text):
            tokens = text.split(" ")
            for token in tokens:
                if pf.is_profane(token):
                    value = token
                    confidence = 100

        if value != 'na':
            entity = self.convert_to_rasa(value, confidence)
            message.set("entities", [entity], add_to_output=True)
Code Example #14
    def process(self, message, **kwargs):

        pf = ProfanityFilter()

        text = message.text
        #text = "This is shit"  == True  | False if True:
        value = 'na'
        confidence = 0
        if pf.is_profane(text):
            tokens = text.split(" ")
            for token in tokens:
                if pf.is_profane(token):
                    value = token
                    confidence = 100
        if value != 'na':
            entity = self.convert_to_rasa(value, confidence)

            message.set("entities", [entity], add_to_output=True)
Code Example #15
def test2(channel):
    import time
    start = time.time()
    from profanity_filter import ProfanityFilter
    pf = ProfanityFilter()
    end = time.time()
    #import json
    #j = json.load(open("/srv/CARL/channels/"+channel+".json",'r'))
    #s = ""
    #n = 0
    #for phrase in j["phrases"]:
    #clean = pf.is_clean(phrase)
    #s += str(clean) + " " + phrase + "<br/>\n"
    #if not clean: n += 1
    #return s + str(n) + "<br/>\n" + str(end-start)
    return str(end - start)
Code Example #16
class Administration(commands.Cog):
    """Commands for server admins."""

    def __init__(self, bot: commands.Bot):
        self.bot = bot
        self.check_actions.start()

        self.pf = ProfanityFilter()

    @property
    def db(self):
        return self.bot.get_cog("Database")

    async def check_message(self, ctx: commands.Context):
        ignore = False
        delete = False

        if not self.pf.is_clean(ctx.message.content) and not (
            ctx.message.channel.id == 728830756071276665
        ):
            ignore = True
            delete = True

        return ignore, delete

    async def mute_member(self, member: discord.Member, duration: timedelta = None):
        role = next(filter(lambda x: x.name == "Muted", member.guild.roles))

        if role in member.roles:
            raise AlreadyDoneError()

        if duration is None:
            self.db.update_member(member, muted=True)
            await member.add_roles(role)
        else:
            self.db.create_temp_action(member, "mute", duration)
            self.db.update_member(member, muted=True)
            await member.add_roles(role)

    async def unmute_member(self, member: discord.Member):
        if (
            role := next(filter(lambda x: x.name == "Muted", member.guild.roles))
        ) in member.roles:
            models.TempAction.objects(member=self.db.fetch_member(member)).delete()
            self.db.update_member(member, muted=False)
            await member.remove_roles(role)
        else:
            # the snippet was cut off here; by symmetry with mute_member it
            # presumably raises AlreadyDoneError
            raise AlreadyDoneError()
Code Example #17
def profanity_filter():
    return ProfanityFilter()
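
This reads like a pytest fixture; a minimal hedged test sketch (the test itself is invented):

    def test_profanity_filter(profanity_filter):
        assert profanity_filter.is_clean("Hello world")
        assert not profanity_filter.is_clean("That's bullshit!")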
Code Example #18
import re
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import Word, TextBlob
from profanity_check import predict, predict_prob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pickle
from profanity_filter import ProfanityFilter
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.sequence import pad_sequences
import time
#----------------------------------------------------------
# Config ....
sid = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_sm")
pf = ProfanityFilter(nlps={'en': nlp})
nlp.add_pipe(pf.spacy_component, last=True)
stop = stopwords.words('english')
special_char = [
    '~', '@', '$', '#', '%', '^', '&', '*', '(', ')', '-', '_', ',', ';', '/',
    '\\', '>', '<', '|', '[', ']', '}', '{', '"', '\'', '`', '?', '!', '...'
]
path_dir = '/home/bassem/DataDriven_HatfulMemes/data/'
print('----------------------------------------------------------------------')
# Get features from text :


def getsentiment(text):
    # strip punctuation (str.replace does not take a regex, so use re.sub)
    text = re.sub(r'[^\w\s]', '', text)
    # remove stop words
    text = " ".join(x for x in text.split() if x not in stop)
Code Example #19
def __init__(self, *args, **kwargs):
    self.pf = ProfanityFilter()
    super(CommentForm, self).__init__(*args, **kwargs)
Code Example #20
import discord
from profanity_filter import ProfanityFilter

client = discord.Client()

pf = ProfanityFilter()

pf.extra_profane_word_dictionaries = {
    'en': {
        'dumbass', 'MOTHERFUCKERS', 'motherfuckers', 'benchod', 'madrachod',
        'BENCHOD', 'MADRACHOD'
    }
}  # matching of custom words is case-sensitive, so both capitalisations are listed


@client.event
async def on_ready():
    print('We have logged in as {0.user}'.format(client))
    await client.change_presence(activity=discord.Activity(
        type=discord.ActivityType.listening,
        name="52 Stories by Omar Waseem on Spotify and iTunes",
        title="52 Stories",
        color=discord.Color.green()))  # selmshots left the podcast :(
    #await client.change_presence(activity=discord.Spotify(type=discord.ActivityType.listening, name="Spotify", title="52 Stories"))
    #await client.change_presence(activity=discord.Spotify(title="52 Stories"))


@client.event
async def on_message(message):
    if message.author == client.user:
        return
Code Example #21
import argparse
import json
import pickle
import os
import random
import subprocess
import torch
import time
import tqdm
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from style_paraphrase.inference_utils import GPT2Generator
from profanity_filter import ProfanityFilter

pf = ProfanityFilter()

parser = argparse.ArgumentParser()

parser.add_argument('--seed',
                    type=int,
                    default=34,
                    help='Random seed to use for selecting inputs.')
args = parser.parse_args()

with open("config.json", "r") as f:
    configuration = json.loads(f.read())
    OUTPUT_DIR = configuration["output_dir"]

with torch.cuda.device(0):
    print("Loading paraphraser....")
    paraphraser = GPT2Generator(OUTPUT_DIR + "/models/paraphraser_gpt2_large")
Code Example #22
import discord
from profanity_filter import ProfanityFilter

client = discord.Client()

pf = ProfanityFilter()


@client.event
async def on_ready():
    print('We have logged in as {0.user}'.format(client))
    await client.change_presence(activity=discord.Activity(
        type=discord.ActivityType.listening,
        name="52 Stories by Omar Waseem on Spotify and iTunes",
        title="52 Stories",
        color=discord.Color.green()))  # selmshots left the podcast :(


@client.event
async def on_message(message):
    if message.author == client.user:
        return

    if message.content.startswith('sultanim spam'):
        i = 1
        while i <= 3:
            # the original inlined one very long literal repeating "SPAM !!! ";
            # the repetition count here is approximate
            await message.channel.send("SPAM !!! " * 200)
            i = i + 1
    if message.content.startswith('sultanim anti-fbi'):
Code Example #23
from profanity_filter import ProfanityFilter

pf = ProfanityFilter()

with open(input("Enter the name of your file: "), "r") as myFile:
    j = myFile.read()

filtered = pf.censor(j)
print(filtered)
Code Example #24
def test():
    from profanity_filter import ProfanityFilter
    pf = ProfanityFilter()
    return pf.censor("That's bullshit!")
Code Example #25
import logging
import configparser
import sys
import json
import os.path

from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext
from profanity_filter import ProfanityFilter
from forex_python.converter import CurrencyRates
from forex_python.bitcoin import BtcConverter

config = configparser.ConfigParser()
config.read('bot_config.ini')

pf = ProfanityFilter(languages=['ru', 'en'])

token = config['DEFAULT']['BotToken']
updater = Updater(token=token, use_context=True)

dispatcher = updater.dispatcher
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO)


class ChatMemberCensorRepository:
    repository_file = 'cencored_users'
    censored_users = set()

    def __init__(self):
Code Example #26
import os
import praw  # used below for praw.Reddit(); missing from the original imports
from pycorenlp.corenlp import StanfordCoreNLP
from profanity_filter import ProfanityFilter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

## INIT GLOBAL VARS
chunk_size = 5000
num_chunks = 0

## INIT REDDIT INSTANCE
reddit = praw.Reddit()

## INIT PROFANITY FILTER
pf = ProfanityFilter()
pf.censor_whole_words = False

## INIT VADER SENTIMENT
vader = SentimentIntensityAnalyzer()

## INIT TEXTCLEAN R
textclean = importr('textclean',
                    lib_loc="C:/Users/Ben/Documents/R/win-library/3.6")
importr('stringi', lib_loc="C:/Users/Ben/Documents/R/win-library/3.6")


## coreNLP sent. analysis
def getSentiment(text):
    ## connect to CoreNLP server
    host = "http://localhost"
Code Example #27
class ProfanityDetector:
    def __init__(self, profane_words_filepath: str):
        words = []
        with open(profane_words_filepath, encoding='utf8') as f:
            for line in f:
                word = line.strip()
                words.append(word)
                # also register the spelling variant with 'ё' replaced by 'е';
                # the original appended every word twice, duplicating non-'ё' words
                if 'ё' in word:
                    words.append(word.replace('ё', 'е'))
        self._ru_words = words
        self._ru_pf = ProfanityFilter()
        # the Russian word list is registered under the 'en' key, presumably so
        # the default (English) pipeline matches these words
        self._ru_pf.custom_profane_word_dictionaries = {'en': words}
        self._r = sr.Recognizer()

    def get_profanity(self, voice_path: str) -> list:
        data = []
        phrases = 0
        profane_phrases = 0
        for root, dirs, files in os.walk(voice_path):
            for file in files:
                phrases += 1
                with sr.AudioFile(os.path.join(root, file)) as source:
                    audio = self._r.record(source)
                    try:
                        res = self._r.recognize_google(audio, show_all=True)
                        res_ru = self._r.recognize_google(audio,
                                                          language="ru",
                                                          show_all=True)
                        if res_ru:
                            is_profane = False
                            text = ""
                            for text_alt in res_ru['alternative']:
                                text = text_alt['transcript']
                                if '*' in text:
                                    is_profane = True
                                    break
                                text = " ".join(word.lower()
                                                for word in text.split())
                                text = re.sub(r'-\s\r\n\s+|-\s\r\n|\r\n', '',
                                              text)
                                text = re.sub(
                                    r'[.,:;%©?*!@#$^&()\d]|[+=]|[\[]|[\]]|[/]|"|\s{2,}|-',
                                    ' ', text)
                                text = " ".join(
                                    pymorphy2.MorphAnalyzer().parse(str(
                                        word))[0].normal_form
                                    for word in text.split())
                                is_profane = self._ru_pf.is_profane(text)
                                if is_profane:
                                    break
                            if is_profane:
                                profane_phrases += 1
                                print(file, "RU PROFANE", text, "best:",
                                      res_ru['alternative'][0]['transcript'])
                            else:
                                print(file, "RU NOT PROFANE",
                                      res_ru['alternative'][0]['transcript'])
                            row = {
                                "filename": file,
                                "lang": "ru",
                                "is_profane": is_profane,
                                "text_best_recogn": res_ru['alternative'][0]['transcript'],
                                "text_profane": text if is_profane else "",
                                "prob": 1 if is_profane else 0,
                            }
                            data.append(row)
                        if res:
                            is_profane = False
                            prob = 0
                            text = ""
                            for text_alt in res['alternative']:
                                text = text_alt['transcript']
                                if '*' in text:
                                    is_profane = True
                                    break
                                text = " ".join(word.lower()
                                                for word in text.split())
                                prob = predict_prob([text])[0]
                                is_profane = prob > 0.5
                                if is_profane:
                                    break
                            if is_profane:
                                profane_phrases += 1
                                print(file, "ENG PROFANE", text, "best:",
                                      res['alternative'][0]['transcript'])
                            else:
                                print(file, "ENG NOT PROFANE",
                                      res['alternative'][0]['transcript'])
                            row = {
                                "filename": file,
                                "lang": "eng",
                                "is_profane": is_profane,
                                "text_best_recogn": res['alternative'][0]['transcript'],
                                "text_profane": text if is_profane else "",
                                "prob": prob,
                            }
                            data.append(row)
                    except sr.UnknownValueError:
                        print(
                            "Google Speech Recognition could not understand audio"
                        )
                    except sr.RequestError as e:
                        print(
                            "Could not request results from Google Speech Recognition service; {0}"
                            .format(e))

        return data
Code Example #28
File: views.py Project: chensam93/inkclusive
    def post(self, request):
        nlp = en_core_web_sm.load()
        pf = ProfanityFilter(nlps={'en': nlp})
        # pf.custom_profane_word_dictionaries = {'en': {'sold down the river', 'dog'}}
        # pf.extra_profane_word_dictionaries = {'en': {'sold', 'orange'}}
        wordlist = []
        context = {}

        # FILE UPLOADED
        if 'doc' in request.FILES:

            doc = request.FILES['doc']

            if doc.name.endswith(".docx"):
                docx = docx2python(doc, extract_image=False)
                context['doc'] = docx.text

            elif doc.name.endswith(".txt"):
                print("This is a test")

                mytext = str(doc.read())
                context['doc'] = mytext

            return render(request, 'index.html', context=context)

        # RETRIEVE WORDS AND SPLIT
        document = request.POST['document']
        word_lines = document.splitlines()

        # CHECK EACH WORD IF PROFANITY
        for line in word_lines:
            if line == '':
                wordlist.append(r'\n')

            # NO LINE BREAK CONTINUE HERE
            else:
                words = line.split()
                temp_list = []
                original_list = []

                # LOOP THROUGH EACH WORD.
                for word in words:

                    clean_word = clear_punctuation(word).lower()

                    in_db = Words.objects.all().filter(
                        word__icontains=clean_word)

                    # WORD IS IN DATABASE
                    if in_db:
                        temp_list.append(clean_word)

                        temp_word = " ".join(temp_list)

                        starting_phrase = Words.objects.all().filter(
                            word__istartswith=temp_word)

                        # CURRENT WORD IS THE START OF THE PHRASE
                        if starting_phrase:

                            original_list.append(word)

                            completed = Words.objects.all().filter(
                                word__iexact=temp_word)

                            # CURRENT PHRASE IS COMPLETED
                            if completed:
                                original = " ".join(original_list)
                                original_list.clear()

                                new_word = format_word(original)
                                wordlist.append(new_word)

                                temp_list.clear()

                            # # TEMP WORD DID NOT COMPLETE THE PHRASE
                            # else:
                            #     print('now we here bish')
                            #     original = " ".join(original_list)
                            #     original_list.clear()

                            #     wordlist.append(original)

                            #     temp_list.clear()

                        # NOT START OF PHRASE KEEP GOING
                        else:
                            wordlist.append(word)
                            temp_list.clear()
                            original_list.clear()

                    # WORD IS A PROFANITY
                    elif pf._is_profane_word('en', clean_word):

                        temp_word = " ".join(temp_list)
                        wordlist.append(temp_word)

                        new_word = format_word(word)
                        wordlist.append(new_word)
                        temp_list.clear()

                    # JUST A REGULAR WORD
                    else:
                        temp_word = " ".join(temp_list)
                        wordlist.append(temp_word)

                        wordlist.append(word)

                        temp_list.clear()

        context["results"] = " ".join(wordlist)
        context['document'] = document

        return render(request, 'index.html', context=context)
Code Example #29
def answer(carlAsked, userAnswered, allowProfanity):
    if allowProfanity:
        channel = "E2"
    else:
        channel = "default"
        from profanity_filter import ProfanityFilter
        pf = ProfanityFilter()

    storageFile = ROOT_DIR + "/channels/" + channel + ".json"

    if os.path.isfile(storageFile):
        storage = json.load(open(storageFile, 'r'))
    else:
        storage = {
            'phrases': [],
            'links': [],
        }

    illegalChars = ('{', '}', '[', ']', '(', ')', '|', '\\', '<', '>', '/')

    for illegalChar in illegalChars:
        carlAsked = carlAsked.replace(illegalChar, "")
        userAnswered = userAnswered.replace(illegalChar, "")

    phrases = storage['phrases']  # a list of phrases
    links = storage['links']  # a list of links from each phrase to other phrases

    if len(userAnswered) == 0 or userAnswered[-1] not in ('.', '!', '?', '"',
                                                          "'"):
        userAnswered += '.'

    if len(userAnswered) > 250: userAnswered = userAnswered[:250]

    if carlAsked in phrases:
        askIdx = phrases.index(carlAsked)
    else:
        askIdx = -1

    futureAskIdx = -1

    if userAnswered in phrases:
        answerIdx = phrases.index(userAnswered)
        if len(links[answerIdx]) > 0:
            futureAskIdx = random.choice(links[answerIdx])
        else:
            futureAskIdx = getLeastUsed(links, answerIdx)  #exclude answerIdx
        if askIdx != -1:
            links[askIdx].append(answerIdx)
    else:
        bestIdx, best = spellcheckPhrase(userAnswered, phrases)
        if best > 0.6:
            if len(links[bestIdx]) > 0:
                futureAskIdx = random.choice(links[bestIdx])
            else:
                futureAskIdx = getLeastUsed(links, bestIdx)  #exclude answerIdx
            if askIdx != -1:
                links[askIdx].append(bestIdx)
        else:
            futureAskIdx = getLeastUsed(links, bestIdx)  #exclude answerIdx
        if allowProfanity or pf.is_clean(userAnswered):
            if askIdx != -1:
                links[askIdx].append(len(phrases))
            links.append([])
            phrases.append(userAnswered)
    json.dump(storage, open(storageFile, 'w'))
    return phrases[futureAskIdx]
Code Example #30
def profanity_filter_ru_en():
    return ProfanityFilter(languages=['ru', 'en'])
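
Like Code Example #17, this looks like a pytest fixture for a bilingual filter; a hedged test sketch (invented):

    def test_profanity_filter_ru_en(profanity_filter_ru_en):
        assert profanity_filter_ru_en.is_clean("привет, мир")
        assert not profanity_filter_ru_en.is_clean("That's bullshit!")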