Example #1
import enchant
import pickle

# D[w][x] counts how often word x co-occurs with word w on the same line.
# 'Word' (the iterable of tokenized lines) is not defined in this fragment.
D = {}

for line in Word:
    for w in line:
        for x in line:
            if x == w:
                continue
            #print(x,w)
            if w not in D:
                D[w] = {}
            if x not in D[w]:
                D[w][x] = 0
            D[w][x] += 1

dct = enchant.Dict("en_US")

#off = ["pcfg_dict_correlation_scores.txt", "pcfg_dict_num_correlation_scores.txt", "pcfg_ipv4_correlation_scores.txt" , "pcfg_ipv4_num_correlation_scores.txt", "srizbi_correlation_scores.txt", "torpig_correlation_scores.txt", "zeus_correlation_scores.txt", "kraken_correlation_scores.txt", "DNL1_correlation_scores.txt", "DNL2_correlation_scores.txt", "DNL3_correlation_scores.txt", "DNL4_correlation_scores.txt", "500KL1_correlation_scores.txt", "500KL2_correlation_scores.txt", "500KL3_correlation_scores.txt", "9ML1_correlation_scores.txt"];
#iff = ["pcfg_dict.txt", "pcfg_dict_num.txt", "pcfg_ipv4.txt" , "pcfg_ipv4_num.txt", "srizbi.txt", "torpig.txt", "zeus.txt", "kraken.txt", "DNL1.txt", "DNL2.txt", "DNL3.txt", "DNL4.txt", "500KL1.txt", "500KL2.txt", "500KL3.txt", "9ML1.txt"];
off = ["benign_correlation_scores.txt"]
iff = ["benign.txt"]

for it in range(0, 1):
    bad_repo = {}
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    dct = enchant.Dict("en_US")
    outF = open(off[it], "w")

    with open(iff[it]) as f:  # change file name
        lines = f.readlines()
        count = 0
Example #2
import enchant

dictio = enchant.Dict("en_US")

alph = "abcdefghijklmnopqrstuvwxyz"

file = open(
    "C:/Users/cdobb/AppData/Local/Programs/Python/Python38-32/Lib/site-packages/enchant/data/mingw32/share/enchant/hunspell/en_US.dic"
)

for line in file:
    if "/" in line:
        line = line[0:line.index("/")]
    line = line.strip().lower()  # strip the trailing newline so the length test is accurate
    if len(line) == 5 and "rn" in line:
        temp = ""
        skip = line.index("rn")
        for i in range(0, len(line)):
            if i != skip and i != skip + 1:
                temp += line[i]
            if i == skip:
                temp += "m"
        if dictio.check(temp):
            print(line + ", " + temp)

file.close()
Example #3
#!/usr/bin/python
import enchant
import itertools

dictionary = enchant.Dict("en_US")


def spellcheck(string):
    return dictionary.check(string)


def generator(letters, length):
    permutation = list(itertools.permutations(letters, length))
    for l in permutation:
        word = ''.join(l)
        if spellcheck(word):
            print(word)


def main():
    string = input("Enter all the letters\n")
    length = int(input("Enter length of the word\n"))
    generator(string, length)


if __name__ == "__main__":
    main()
Example #4
#!/usr/bin/env python
import enchant

import sys

englishDict = enchant.Dict('en_US')


def checkWord(word):
    return englishDict.check(word)


def spellCheck(sentence, tolerance):
    words = [word for word in sentence.split(' ') if len(word) > 1]
    faults = [not checkWord(word) for word in words]

    return sum(faults) <= tolerance
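
# A quick usage sketch (made-up sentences): spellCheck accepts a sentence
# as long as the misspelling count stays within the tolerance.
print(spellCheck('Thiss sentence has one error', 1))  # True: one fault, tolerance 1
print(spellCheck('Thiss sentnce has more erors', 1))  # False: three faults exceed the tolerance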


#####################################
# Very strict sentence cleaning
# currently removes ALL sentences with spelling errors(!)
Example #5
def clean_the_text(text, remove_numbers=False):
    print('\n', '@' * 75, '\n', 'CLEANING THE TEXT', '\n\n')

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(text, 'lxml')

    #     print('PRETTYING UP THE TEXT IN THE CLEANING:  ', '\n\t', soup.prettify())
    #     text = soup.text

    from pattern.web import URL, plaintext
    text = plaintext(text, keep=[], linebreaks=2, indentation=False)

    import unicodedata
    text = unicodedata.normalize('NFKD', text).encode('ascii',
                                                      'ignore').decode(
                                                          'utf-8', 'ignore')

    import re
    clean = re.compile('<.*?>')  # strip any remaining HTML tags
    text = re.sub(clean, '', text)
    text = text.replace('b"', '')
    text = text.replace("b'", '')
    text = text.replace("\'", "'")
    text = text.replace('\\n', ' ')
    text = text.replace('\\xc2\\xae', '')
    text = text.replace('\n', ' ')
    text = text.replace('\t', '')
    text = text.replace('\s+', '')
    text = text.replace('\r\r\r', '')
    text = text.replace('\\xc2\\xa9 ', '')
    text = text.replace('\\xe2\\x80\\x9c', '')
    text = text.replace('xe2x80x93', ',')
    text = text.replace('\\x0c', '')
    text = text.replace('\\xe2\\x80\\x9d', '')
    text = text.replace('\\xe2\\x80\\x90', '')
    text = text.replace('\\xe2\\x80\\x9331', '')
    text = text.replace('xe2x80x94', '')
    text = text.replace('\x0c', ' ')
    text = text.replace(']', '] ')
    text = text.replace('\\xe2\\x80\\x99', "'")
    text = text.replace('xe2x80x99', "'")
    text = text.replace('\\xe2\\x80\\x933', '')
    text = text.replace('\\xe2\\x80\\x935', '')
    text = text.replace('\\xef\\x82\\xb7', '')
    text = text.replace('\\', '')
    text = text.replace('xe2x80x99', "")
    text = text.replace('xe2x80x9cwexe2x80x9d', '')
    text = text.replace('xe2x80x93', ', ')
    text = text.replace('xe2x80x9cEUxe2x80x9d', '')
    text = text.replace('xe2x80x9cxe2x80x9d', '')
    text = text.replace('xe2x80x9cAvastxe2x80x9d', '')
    text = text.replace('xc2xa0', '')
    text = text.replace('xe2x80x9cxe2x80x9d', '')
    text = text.replace('xe2x80x9c', '')
    text = text.replace('xe2x80x9d', '')
    text = text.replace('tttttt', ' ')
    text = text.replace('activetttt.', '')
    text = text.replace('.sdeUptttt..sdeTogglettttreturn', '')
    text = text.replace('ttif', '')
    text = text.replace('.ttt.', ' ')
    text = text.replace(" t t ", ' ')
    text = text.replace('tttt ', '')
    text = text.replace(' tt ', ' ')
    text = text.replace(' t ', ' ')
    text = text.replace('ttt', '')
    text = text.replace('ttr', '')
    text = text.replace(' >t ', '')
    text = text.replace('.display', '')
    text = text.replace('div class', '')
    text = text.replace('div id', ' ')
    text = text.replace('Pocy', 'Policy')
    text = text.replace('xc2xa0a', ' ')
    text = text.replace(' b ', '')
    text = text.replace('rrrr', '')
    text = text.replace('r r r r r ', '')
    text = text.replace('rtttr', '')
    text = text.replace('    ', ' ')
    text = text.replace('   ', ' ')
    text = text.replace('  ', ' ')
    text = text.replace(' r ', ' ')
    text = text.replace(' tr ', ' ')
    text = text.replace(' rr  r  ', ' ')
    text = text.replace('r r r', '')
    text = text.replace('* t', '* ')
    text = text.replace('r *', ' *')
    text = text.replace('   tt t t rt ', ' ')
    text = text.replace('r rrr r trr ', ' ')
    text = text.replace(' r t', '')
    text = text.replace(' r tt', '')
    text = text.replace(' xe2x80x93 ', ' ')
    text = text.replace(' xe6xa8x82xe9xbdxa1xe6x9cx83  ', ' ')
    text = text.replace(' rrr ', ' ')
    text = text.replace(' rr ', ' ')
    text = text.replace(' r r ', '')
    text = text.replace('tr ', '')
    text = text.replace('* xe7xaex80xe4xbdx93xe4xb8xadxe6x96x87', '')
    text = text.replace('tt*', '')

    print('*' * 10, 'DROPPING NON-ENGLISH WORDS FROM THE TEXT', '*' * 10)
    from nltk.tokenize import word_tokenize
    token_text_w = word_tokenize(text)

    import enchant
    d = enchant.Dict('en_US')
    bad_words = []

    for word in token_text_w:
        if d.check(word) is not True:
            bad_words.append(word)

    bad_words = set(bad_words)

    for word in token_text_w:
        if word in bad_words:
            text = text.replace(word, '')

    # Trial of a new way of cleaning the text
    # (tokenize_by_sentences is assumed to be defined elsewhere in the source)
    index = 0
    print('\n\n', '*' * 10, len(tokenize_by_sentences(text)), '*' * 10, '\n\n')
    for sent in tokenize_by_sentences(text):
        if 'js' in sent or 'css' in sent or 'png' in sent or 'woff2' in sent or ' div ' in sent or ' meta "" ' in sent or 'span' in sent:
            text = text.replace(sent, '')
            print('\n', '*' * 25, '\n',
                  'CLEANING TOKENIZED SENTENCES OF CODE IN INDEX', index,
                  '*' * 25)
            index += 1

    return (text)
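
# Many of the literal replacements above target the same family of leftover
# \xNN byte-escape sequences. A hedged consolidation, assuming the input shows
# those mojibake patterns, a minimal sketch:
import re

def strip_byte_escapes(text):
    # Runs like '\\xe2\\x80\\x9c' (escaped bytes left over from str(bytes))
    text = re.sub(r'(\\x[0-9a-fA-F]{2})+', '', text)
    # The same runs once the backslashes have already been stripped, e.g. 'xe2x80x9c'
    text = re.sub(r'(x[0-9a-f]{2}){2,}', '', text)
    return text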
Example #6
    "17", "67"
]


def convert_plate(plate):
    for idx, used in enumerate(nums_used):
        real_plate = plate[:2] + plate[2:4].replace(used,
                                                    nums_conv[idx]) + plate[4:]
        if real_plate != plate:
            break
    return real_plate


print "Welcome to plate finder (running with " + str(num_cores) + " cores)"

dict = enchant.Dict("en_GB")
count = 0
found = 0
output = "plates"
ext = ".txt"


def process_plate(c1):
    out = open(output + "-" + str(uuid.uuid4()) + ext, "w")
    for c2 in chars_pre:
        for c34 in nums_used:
            for c5 in chars_rnd:
                for c6 in chars_rnd:
                    for c7 in chars_rnd:
                        plate = c1 + c2 + c34 + c5 + c6 + c7
                        if dictionary.check(plate):
Example #7
from .features import Dictionary, RegexMatches, Stopwords

name = "vietnamese"

try:
    import enchant
    dictionary = enchant.Dict("vi")
except enchant.errors.DictNotFoundError:
    raise ImportError("No enchant-compatible dictionary found for 'vi'.  " +
                      "Consider installing 'hunspell-vi'.")

dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
`enchant.Dict <https://github.com/rfk/pyenchant>`_ "vi". Provided by `hunspell-vi`.
"""

# https://vi.wiktionary.org/wiki/Th%C3%A0nh_vi%C3%AAn:Laurent_Bouvier/
# Free_Vietnamese_Dictionary_Project_Vietnamese-Vietnamese#Allwiki_.28closed.29
stopwords = set([
    "ai", "bằng", "bị", "bộ", "cho", "chưa", "chỉ", "cuối", "cuộc", "các",
    "cách", "cái", "có", "cùng", "cũng", "cạnh", "cả", "cục", "của", "dùng",
    "dưới", "dừng", "giữa", "gì", "hay", "hoặc", "khi", "khác", "không",
    "luôn", "là", "làm", "lại", "mà", "mọi", "mỗi", "một", "nhiều", "như",
    "nhưng", "nào", "này", "nữa", "phải", "qua", "quanh", "quá", "ra", "rất",
    "sau", "sẽ", "sự", "theo", "thành", "thêm", "thì", "thứ", "trong", "trên",
    "trước", "trừ", "tuy", "tìm", "từng", "và", "vài", "vào", "vì", "vẫn",
    "về", "với", "xuống", "đang", "đã", "được", "đấy", "đầu", "đủ"
])

stopwords = Stopwords(name + ".stopwords", stopwords)
Example #8
import nltk
import pandas as pd

# to remove stopwords
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer 
from sklearn import linear_model, svm, neighbors, naive_bayes
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import confusion_matrix

import enchant

SPELLING_DICT = enchant.Dict("en_US")
# from nltk.tokenize import sent_tokenize
# from nltk import word_tokenize, pos_tag, ne_chunk

def load_data():
    data = pd.read_csv('deceptive-opinion.csv')
    data = data.drop(columns="hotel")
    data = data.drop(columns="source")
    data = data.drop(columns="polarity")
    data.rename(columns={'deceptive':'real'}, inplace=True)

    stop = stopwords.words('english')
    data['text'] = data['text'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))

    data.loc[data['real'] == 'truthful', 'real']  = 1
    data.loc[data['real'] == 'deceptive', 'real']  = 0
Example #9
from .features import Dictionary, RegexMatches, Stopwords

name = "serbian"

try:
    import enchant
    dictionary = enchant.Dict("sr")
except enchant.errors.DictNotFoundError:
    raise ImportError("No enchant-compatible dictionary found for 'sr'.  " +
                      "Consider installing 'hunspell-sr'.")

dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
`enchant.Dict <https://github.com/rfk/pyenchant>`_ "sr". Provided by `hunspell-sr`.
"""

# https://meta.wikimedia.org/w/index.php?oldid=17213519
stopwords = [
    r"административна",
    r"административног",
    r"али",
    r"америчке",
    r"америчких",
    r"астрономија",
    r"база",
    r"без",
    r"београд",
    r"била",
    r"били",
    r"било",
Example #10
 def __init__(self):
     super(Spellcheck, self).__init__()
     lang = os.environ.get("LC_CTYPE", "en_US.utf-8").split('.')[0]
     self.dictionary = enchant.Dict(lang)
     self.print_err("loaded dictionary for {}".format(lang))
     self.in_word = False
Example #11
# import pymorphy2
import enchant, re, nltk, pymorphy2
from nltk.corpus import stopwords

dictionary = enchant.Dict("ru_RU")
nltk.download('stopwords')
morph = pymorphy2.MorphAnalyzer(lang='ru')

# print(morph.parse('стали'))
# print(morph.parse('стали')[0].normal_form)

# import pandas as pd
# df = pd.read_csv('data.csv', sep=r'((?:(^\d+));)|(^(?:(ID));)', skiprows=0, index_col=0)
# print(df)

# print(df.to_string())

# from numpy import genfromtxt
# my_data = genfromtxt('data.csv', delimiter=';')
stopwords_ru = stopwords.words("russian")
data = ['ID\tQuestion\n']
missplells_log = []
words_errors = []

for idx, line in enumerate(open('onlyquestions').readlines()[:10]):
    # for idx, line in enumerate(open('onlyquestions').readlines()):
    t = line.strip()
    split_pattern = r'[«]?[а-яА-Я]+[»]?'
    splitted = re.findall(split_pattern, t)
    # splitted = t.split()
Example #12
         if w in ['no', 'not']:
             print(label_class[i] + ': ' + 'no')
     match = re.search('(\d+%)', s)
     if match:
         pct = match.group(1)
         print(label_class[i] + ': ' + pct)
 if label_class[i] == 'sale restriction':
     for w in s.split():
         if w in gazetteers.words('countries.txt'):
             print(label_class[i] + ': ' + w)
             break
 if label_class[i] == 'tour code':
     for j in range(len(s.split())):
         if s.lower().split()[j] == 'code':
             w = s.split()[j+1]
             if not enchant.Dict("en_US").check(w):
                 print(label_class[i] + ': ' + w)
 if label_class[i] in ['ticketing period', 'travelling period']:
     w = s.split()
     nw = []
     for j in range(len(w)):
         # Process case like "RELEASED: DEC 29, 201514-"
         if w[j].lower() == 'released':
             if w[j+1].lower() in months or w[j+2].lower() in months:
                 w[j+3] = w[j+3][:4]
                 if w[j+3].isdecimal():
                     released_date = ' '.join(w[j+1:j+4])
         # Process case like "Ticket must be issued on/before31JAN, 2016"
         if w[j].isalnum() and not w[j].isalpha() and not w[j].isdecimal():
             for k, g in groupby(w[j], str.isalpha):
                 nw.append(''.join(list(g)))
Example #13
def remove_from_dict(words):
    d = enchant.Dict("ro_RO")
    for element in list(words):
        d.remove(element)  # puts the word on the user's personal exclude list
Example #14
def add_to_dict(words):
    d = enchant.Dict("ro_RO")
    total = len(list(words))
    for i, element in enumerate(list(words)):
        if not d.check(element):
            d.add(element)  # adds the word to the user's personal word list
        print('Element', i, 'out of', total)
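
# A quick sanity check of the two helpers above, a minimal sketch assuming
# pyenchant's documented behavior that add/remove write to the per-user
# word and exclude lists:
d = enchant.Dict("ro_RO")
d.add("exemplu")           # now on the personal word list
print(d.check("exemplu"))  # True
d.remove("exemplu")        # now on the personal exclude list
print(d.check("exemplu"))  # False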
Example #15
def all_wordification(number):
    """Converts a number to a word"""

    import enchant  # English Dictionary
    # from nltk.corpus import words (Alternate word searcher)
    from itertools import product  # Cartesian Product
    from itertools import combinations  # Combination

    # Create a dictionary for number to letter conversion
    alph_num_dict = {
        '2': ('a', 'b', 'c'),
        '3': ('d', 'e', 'f'),
        '4': ('g', 'h', 'i'),
        '5': ('j', 'k', 'l'),
        '6': ('m', 'n', 'o'),
        '7': ('p', 'q', 'r', 's'),
        '8': ('t', 'u', 'v'),
        '9': ('w', 'x', 'y', 'z')
    }

    number = number[6:]  # drop a "1-800-" style prefix (assumes input like "1-800-724-6837")
    number = number[:3] + number[4:]  # remove the remaining hyphen, leaving seven digits

    word = ""
    all_words = []
    d = enchant.Dict("en_US")  # check if word is a real english word

    # Find all combination of numbers via cartesian product
    # prod = list(product('012', repeat=7)) # Find all combinations of numbers # old cartesian product
    temp = []
    for index in range(len(number)):
        if number[index] in ['9', '7']:
            temp.append([0, 1, 2, 3])  # '7' and '9' map to four letters each
        else:
            temp.append([0, 1, 2])

    prod = list(product(*temp))

    comb = list(combinations([0, 1, 2, 3, 4, 5, 6, 7],
                             2))  # Find all inner combination of numbers

    # print(perm[0][1])
    i = 0

    for i in range(len(prod)):  # iterate through all number combos of word

        for index in range(
                len(number)
        ):  # iterate through number and convert each number to a letter
            p = number[index]
            word += alph_num_dict[p][int(prod[i][index])]  # add letter to word

        # if d.check(word): # add new words to list
        # all_words.append(word)

        for j in comb:  # iterate through all words/subwords and check if they are real words in dictionary
            temp = word[j[0]:j[1]]

            if d.check(temp):  # check if word is an english word
                # if temp in words.words(): # alternate word searcher
                temp = number[0:j[0]] + temp + number[j[1]:]
                all_words.append(temp)

        word = ""  # clear word

    return (all_words)
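
# A hedged usage sketch, assuming the input format implied by the slicing
# above (a full number such as "1-800-724-6837", leaving the digits 7246837).
# The output should include 'painter' if it is in the dictionary, plus hybrids
# where only a substring spells a word and the surrounding digits are kept.
print(all_wordification("1-800-724-6837"))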
Example #16
import datetime

def timerun():
    a = datetime.datetime.now()
    print(datetime.datetime.now() - a)  # later minus earlier, so the elapsed time is positive

timerun()

#Exercise 3: Print a Word Provided by the User

b=input("Type something here:")
print(b)

#Exercise 4: Validate User Input
#For this I installed the Enchant library using 'pip install pyenchant'
import enchant
#It looked like only one dictionary at a time was easily available, so I used the US dictionary.
d = enchant.Dict('en_US')
i=1

#I had to initialize i as 1 before the loop. So i=1 would be a global setting needed for this program. Did I need to define it that way?
def runit():
    global i
    while i==1:
        e=input("Type a word here:")
        print(e)
        while(d.check(e) is False):
            e=input("Please type an English word:")
            print(e)
#I had to use an extra if statement here to get the same result that I did in R using recursion. I don't know if that's because I'm doing something wrong here.            
        if(d.check(e) is False):
            runit()
        i+=1
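
#For comparison, a minimal sketch without the global counter or the extra
#recursion: the inner while loop alone keeps re-prompting until the word validates.
def runit_simple():
    e = input("Type a word here:")
    while not d.check(e):
        e = input("Please type an English word:")
    print(e)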
Example #17
import sys
import enchant

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

from xml.dom.minidom import parseString

#AUTO-load classifiers
#a trick to get the current module
_modname = globals()['__name__']
_this_mod = sys.modules[_modname]

_ppath = "/".join(_this_mod.__file__.split("/")[:-1])

d = enchant.Dict("en_US")
import json
# #CAN drop this if this is an app!
# DEPLOY_DIR="/home/lentaing/envs/newdc1.4/src"
# sys.path.insert(0, DEPLOY_DIR)
# from django.core.management import setup_environ
# from django.utils.encoding import smart_str
# import settings
# setup_environ(settings)
from django.utils.encoding import smart_str

from datacollection import models

#dynamically load classifiers
#import classifiers
import sra
Example #18
    if request.method == "POST":
        user_string = request.json['text']
        # flag stores whether previous translated word was hindi or not
        flag = request.json['flag']

        if user_string:
            # Call to translate function to process the string
            predictions = generate_predictions(transliterator_obj, user_string,
                                               eng_dict, hin_dict, classifier,
                                               flag)
            return json.dumps({"lists": predictions})

        else:
            # return empty list if user sends empty string
            return json.dumps({"lists": []})


if __name__ == "__main__":
    # initializing english and hindi dictionaries
    eng_dict = enchant.Dict('en_US')
    hin_dict = enchant.Dict('hi_IN')

    # initializing object of Transliterator class
    transliterator_obj = Transliterator(source='eng', target='hin')

    # initializing object for language classifier
    classifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

    # run flask app
    app.run('0.0.0.0', debug=True)
Example #19
def get_all_tweets(screen_name):

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    alltweets = []

    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    alltweets.extend(new_tweets)

    oldest = alltweets[-1].id - 1

    while len(new_tweets) > 0:
        print("getting tweets before %s" % oldest)

        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=200,
                                       max_id=oldest)

        alltweets.extend(new_tweets)

        oldest = alltweets[-1].id - 1

        print("...%s tweets downloaded so far" % len(alltweets))

    # collect formatted rows outside the paging loop so the list is always defined
    output = list()

    en_dict = enchant.Dict("en_US")  # build the dictionary once, not per word

    for i in range(len(alltweets)):
        analysis = TextBlob(alltweets[i].text)
        words = alltweets[i].text.split()
        misspelled = ""
        vulgar = ""
        for word in words:
            if word in vulgar_terms.bad_words:
                vulgar += word + ", "

            if (not word.startswith("@") and not word.endswith(",")
                    and not word.startswith("https://")
                    and not word.startswith("http://")
                    and not en_dict.check(word)):
                misspelled += word + ", "

        misspelled = misspelled[:len(misspelled) - 2]
        vulgar = vulgar[:len(vulgar) - 2]

        if analysis.sentiment.polarity <= -0.2 and analysis.sentiment.subjectivity >= 0.5:
            if len(vulgar) != 0 and len(misspelled) != 0:
                output.append(alltweets[i].id_str + "*" + alltweets[i].text +
                              "*" + str(vulgar) + "*" + str(misspelled) + "*" +
                              str(analysis.sentiment.polarity) + "*" +
                              str(analysis.sentiment.subjectivity))
            elif len(vulgar) != 0:
                output.append(alltweets[i].id_str + "*" + alltweets[i].text +
                              "*" + str(vulgar) + "*" + "N/A" + "*" +
                              str(analysis.sentiment.polarity) + "*" +
                              str(analysis.sentiment.subjectivity))
            elif len(misspelled) != 0:
                output.append(alltweets[i].id_str + "*" + alltweets[i].text +
                              "*" + "N/A" + "*" + str(misspelled) + "*" +
                              str(analysis.sentiment.polarity) + "*" +
                              str(analysis.sentiment.subjectivity))
            else:
                output.append(alltweets[i].id_str + "*" + alltweets[i].text +
                              "*" + "N/A" + "*" + "N/A" + "*" +
                              str(analysis.sentiment.polarity) + "*" +
                              str(analysis.sentiment.subjectivity))
        elif len(vulgar) != 0 and len(misspelled) != 0:
            output.append(alltweets[i].id_str + "*" + alltweets[i].text + "*" +
                          str(vulgar) + "*" + str(misspelled) + "*" + "N/A" +
                          "*" + "N/A")
        elif len(vulgar) != 0:
            output.append(alltweets[i].id_str + "*" + alltweets[i].text + "*" +
                          str(vulgar) + "*" + "N/A" + "*" + "N/A" + "*" +
                          "N/A")
        elif len(misspelled) != 0:
            output.append(alltweets[i].id_str + "*" + alltweets[i].text + "*" +
                          "N/A" + "*" + str(misspelled) + "*" + "N/A" + "*" +
                          "N/A")

    for i in range(len(output)):
        print(output[i])

    return output
Example #20
 def __init__(self, dict_name='en', max_dist=2):
     self.spell_dict = enchant.Dict(dict_name)
     self.max_dist = max_dist
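
 # This fragment is the start of a typical enchant-based spell corrector; a
 # hedged sketch of the companion replace method (assuming this class layout
 # and pyenchant's enchant.utils.levenshtein helper) picks the closest
 # suggestion within max_dist:
 def replace(self, word):
     # Return the word unchanged if it is already spelled correctly
     if self.spell_dict.check(word):
         return word
     # Otherwise take the top suggestion, but only if it is close enough
     from enchant.utils import levenshtein
     suggestions = self.spell_dict.suggest(word)
     if suggestions and levenshtein(word, suggestions[0]) <= self.max_dist:
         return suggestions[0]
     return word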
Example #21
from .features import Dictionary, RegexMatches, Stemmed, Stopwords

name = "french"

try:
    import enchant
    dictionary = enchant.Dict("fr")
except enchant.errors.DictNotFoundError:
    raise ImportError("No enchant-compatible dictionary found for 'fr'.  " +
                      "Consider installing 'myspell-fr'.")

dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
`enchant.Dict <https://github.com/rfk/pyenchant>`_ "fr".  Provided by `myspell-fr`
"""

try:
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words('french') + ["a"])
except LookupError:
    raise ImportError("Could not load stopwords for {0}. ".format(__name__) +
                      "You may need to install the nltk 'stopwords' " +
                      "corpora.  See http://www.nltk.org/data.html")

stopwords = Stopwords(name + ".stopwords", stopwords)
"""
:class:`~revscoring.languages.features.Stopwords` features provided by
`nltk.corpus.stopwords <https://www.nltk.org/api/nltk.corpus.html>`_ "french"
"""
Example #22
from tqdm import tqdm_notebook
from scipy.stats import rankdata
from itertools import product
import enchant
from pymystem3 import Mystem
import numpy as np
import pandas as pd
import re

import gensim

from sklearn.decomposition import PCA

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

enchant_d = enchant.Dict("ru")
mystem = SnowballStemmer("russian")  # a Snowball stemmer, despite the 'mystem' name
mystem_e = SnowballStemmer("english")
russian_stopwords = stopwords.words("russian")
english_stopwords = stopwords.words("english")
tokenizer = RegexpTokenizer(r'\w+')


def preprocess_text(text, stemmer_I):
    text = text.lower()
    #text = ' '.join(tokenizer.tokenize(text))
    if stemmer_I:
        tokens = [mystem.stem(mystem_e.stem(token)) for token in re.sub('[^a-zа-я0-9]', ' ', text).split()
                     if not token in russian_stopwords \
                     and not token in english_stopwords \
                     #and not token.isdigit() \
Example #23
import enchant
from pypinyin import lazy_pinyin  # assumed source of lazy_pinyin
from utils.text_utils import tokenize, get_homophones_by_char, get_homophones_by_pinyin
from utils.text_utils import traditional2simplified, simplified2traditional  # assumed to live in the same utils module

traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # without tone marks

print(tokenize('小姑娘蹦蹦跳跳的去了她外公家'))

# distinguish pinyin from English
en_dict = enchant.Dict("en_US")
print(en_dict.check("hello"))
print(en_dict.check("hello boy what is your name"))
strs = "hello boy what is your name"
flag = False
for word in strs:
    if en_dict.check(word):
        flag = True
    else:
        flag = False
        break
print(flag)
print(en_dict.check("zhangsan"))
print(en_dict.check("zhangsan ni zai zhe li ma ?"))

pron = get_homophones_by_char('长')
Example #24
def Classification(filename, srcTest, labelDict, TotalSpamHam, k):
    matrixConfussion = defaultdict(int)
    predictClass = dict()

    kamusKataSpam = defaultdict(int)
    kamusKataHam = defaultdict(int)

    kamusKata = list()

    # open the per-word spam/ham count CSV
    with open(filename, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            key = row['word']
            kamusKata.append(key)
            if key not in kamusKataSpam:
                if (row['spam'] != '0'):
                    kamusKataSpam[key] = row['spam']
            if key not in kamusKataHam:
                if (row['ham'] != '0'):
                    kamusKataHam[key] = row['ham']

    totalSpam = TotalSpamHam[0]
    totalHam = TotalSpamHam[1]

    probSpam = float(totalSpam + k) / (totalSpam + totalHam +
                                       len(TotalSpamHam))
    probHam = float(totalHam + k) / (totalSpam + totalHam + len(TotalSpamHam))

    tempprobSpam = copy.copy(probSpam)
    tempprobHam = copy.copy(probHam)

    kataDataUji = defaultdict(int)

    stopWords = set(stopwords.words('english'))
    d = enchant.Dict("en_US")
    lemma = nltk.wordnet.WordNetLemmatizer()

    files = os.listdir(srcTest)
    for file in files:
        fp = open(srcTest + file, 'r').read()
        listKata = fp.split(" ")
        for kata in listKata:
            kata = kata.lower()
            if kata not in stopWords:  # drop stopwords
                kata = lemma.lemmatize(kata)  # lemmatize
                if kata != '':
                    if not d.check(kata):
                        suggest = d.suggest(kata)
                        if len(suggest) != 0:
                            kata = suggest[0]
                            kata = kata.lower()
                            kataDataUji[kata] += 1
                    else:
                        kataDataUji[kata] += 1

        for key in kataDataUji.keys():
            tempprobSpam *= (kataDataUji[key] *
                             ((float(kamusKataSpam[key]) + k) /
                              (len(kamusKataSpam) + len(kamusKata))))
            tempprobHam *= (kataDataUji[key] *
                            ((float(kamusKataHam[key]) + k) /
                             (len(kamusKataHam) + len(kamusKata))))

        if tempprobSpam < tempprobHam:
            predictClass[file] = '1'
        elif tempprobSpam >= tempprobHam:
            predictClass[file] = '0'

        tempprobSpam = copy.copy(probSpam)
        tempprobHam = copy.copy(probHam)

        kataDataUji.clear()

        if predictClass[file] == labelDict[file] and predictClass[file] == '0':
            matrixConfussion['TP'] += 1
        elif predictClass[file] == '1' and labelDict[file] == '0':
            matrixConfussion['FN'] += 1
        elif predictClass[file] == '0' and labelDict[file] == '1':
            matrixConfussion['FP'] += 1
        elif predictClass[file] == labelDict[file] and predictClass[
                file] == '1':
            matrixConfussion['TN'] += 1

    return (predictClass, matrixConfussion)
Example #25
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as et
import re
import enchant
from enchant.checker import SpellChecker
d = enchant.Dict("de_CH")

filename = "/Users/Simon/UNI VII/bigdata/nzz/NZZ_1910_1920-with-uuid/1910-01/JM20121222000301997.xml"
# filename = "/Users/tabris/Downloads/NZZ_1910_1920-with-uuid/1910-08/JM20121222000281742.xml"

tree = et.parse(filename)
root = tree.getroot()
content = root.find('TX')
clist = []
craw = []
letter_ratio = []
lengths = []

#loop through xml file's p tags
for child in content.findall('P'):

    craw.append(unicode(child.text))

    # try to put the content into a string
    # NICO: use the unicode class, which offers pretty much all the functions of the
    # string class, but does not save them as 8-bit strings
    try:
        text = unicode(child.text)
    except Exception:
        clist.append('shit_encoding')
        continue
Example #26
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor

import enchant
import string

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter, defaultdict

hidden_dim = 128

all_stopwords = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()
english_dict = enchant.Dict("en_US")


def remove_digits(s):
    return ''.join([i for i in s if not i.isdigit()])


def lemmatize(s):
    return wordnet_lemmatizer.lemmatize(s.lower().encode(
        "ascii", "ignore").decode("utf-8"))


def clean(s):
    tmp_words = s.lower().split(" ")
    tmp_words = [
        word.translate(str.maketrans('', '', string.punctuation))
Example #27
    def setTrainingVars(self,
                        P,
                        corp,
                        num_topics,
                        NTest,
                        NTrain,
                        lapp="",
                        includeLabels=False):
        self.includeLabels = includeLabels
        self.T = NTest
        self.TRAIN = NTrain
        self.corpus = corp
        self.dfs = self.corpus.dfs()
        self.K = num_topics

        loc = lapp + "exports/" + P + "/lda_states/ldapy" + str(self.K)
        self.lda = models.ldamodel.LdaModel.load(loc)

        for z in range(0, self.K):
            topic = self.lda.state.get_lambda()[z]
            topic = topic / topic.sum()
            bestn = matutils.argsort(topic, 100, reverse=True)
            terms = [(id, topic[id]) for id in bestn]

            #terms = lda.get_topic_terms(z,100)
            for term in terms:
                word = corp.dictionary[term[0]].lower()
                weight = term[1]
                occurences = self.dfs[term[0]]
                #idf = log(corpus.documentCount/(1+occurences))
                if word in self.wordweights:
                    if weight > self.wordweights[word]:
                        self.wordweights[word] = weight  #* idf
                else:
                    self.wordweights[word] = weight  #* idf
        #print('\n\n')

        with open(lapp + "exports/" + P + "/good_ADJ.txt", "r") as f:
            for line in f:
                self.good_adjs.append(line.strip())

        with open(lapp + "exports/" + P + "/bad_ADJ.txt", "r") as f:
            for line in f:
                self.bad_adjs.append(line.strip())

        with open(lapp + "exports/" + P + "/good_NOUN.txt", "r") as f:
            for line in f:
                self.good_verbs.append(line.strip())

        with open(lapp + "exports/" + P + "/bad_NOUN.txt", "r") as f:
            for line in f:
                self.bad_verbs.append(line.strip())

        with open(lapp + "exports/" + P + "/featuresAprioriLexicalPruned.txt",
                  "r") as f:
            for line in f:
                self.product_features.append(line.strip())

        with open(lapp + "inputs/badwords.txt", "r") as f:
            for line in f:
                self.bad_words.append(line.decode('utf-8').strip())
        self.currentGenerator = NTrain * 2

        self.nnn = NTrain * 2

        self.nlp = English()
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.d = enchant.Dict("en_US")
Example #28
###########################################################################################
# Maps a given list of company names to their website domain names
# Add downweighting for companies with non-www starting
########################################################################################### 
import urllib
import json as m_json
from urlparse import urlparse
import enchant
import testData
import sys

URL_COUNT_WEIGHT = .25 
URL_ORDER_WEIGHT = -.25
URL_LEN_WEIGHT = -.1

ENGLISH_DICT = enchant.Dict("en_US")
TRIVIAL_WORDS = ["company", "inc", "group", "corporation", "co", "corp", "university", "college", "&", "llc", "the", "of", "a", "an"]

# Code adapted from http://stackoverflow.com/questions/3898574/google-search-using-python-script #
# Assume Q is a list of unique strings
def getURLForQuery(q, query2URLS):
    query = urllib.urlencode ( { 'q' : q } )
    response = urllib.urlopen ( 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query ).read()
    json = m_json.loads ( response )
    results = json [ 'responseData' ] [ 'results' ]
    URLS = []
    for result in results:
        title = result['title']
        url = result['url']   # was URL in the original and that threw a name error exception
        URLS.append(url)
    query2URLS[q] = URLS
Example #29
import sys
import enchant

print(80 * '-')
print('PYTHONPATH: %s' % sys.path)

# At least one backend should be available
backends = [x.name for x in enchant.Broker().describe()]
if len(backends) < 1:
    raise SystemExit('Error: No dictionary backend available')
print(80 * '-')
print('Backends: ' + ', '.join(backends))

# Usually en_US dictionary should be bundled.
langs = enchant.list_languages()
dicts = [x[0] for x in enchant.list_dicts()]
if len(dicts) < 1:
    raise SystemExit('No dictionary available')
print(80 * '-')
print('Languages: %s' % ', '.join(langs))
print('Dictionaries: %s' % dicts)
print(80 * '-')

# Try spell checking if English is available
lang = 'en_US'
if lang in langs:
    d = enchant.Dict(lang)
    print('d.check("hallo") %s' % d.check('hallo'))
    print('d.check("halllo") %s' % d.check('halllo'))
    print('d.suggest("halllo") %s' % d.suggest('halllo'))
Example #30
    def default_dict(self, language):

        return enchant.Dict(language)