Example #1
    def apply_settings(self, x):

        if x == 'word':
            self.tokenizer_name = x
        elif x == 'sent':
            self.tokenizer_name = x

        if x == 'nltk':
            #    self.stop_dict  = stopwords.words('english')
            self.stop_dict_name = x
        elif x == 'Extend':
            # self.stop_dict = stopwords.words('english')
            self.stop_dict_name = x

        if x == 'Count':
            self.vec = CountVectorizer()
            self.vec_name = x
        elif x == 'TfiDf':
            # print(self.vec)
            self.vec = TfidfVectorizer()
            self.vec_name = x
            # print(self.vec)

        if x == 'Porter':
            self.stemmer = PorterStemmer()
            self.stemmer_name = x
        elif x == 'SnowBall':
            self.stemmer = SnowballStemmer(language='english',
                                           ignore_stopwords=True)
            self.stemmer_name = x
        elif x == 'ISR':
            self.stemmer = ISRIStemmer()
            self.stemmer_name = x
Example #2
    def stemming(self, text):
        self.ps = ISRIStemmer()
        text = text.split()
        self.stemmed_words = []
        for word in text:
            self.stemmed_words.append(self.ps.stem(word))
        return " ".join(self.stemmed_words)
Example #3
def ArabicStemming(text):
    # Split on whitespace so we stem whole words, not individual characters.
    st = ISRIStemmer()
    stemmedwords = []
    for w in text.split():
        stemmedwords.append(st.stem(w))
    return stemmedwords
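A minimal standalone sketch of the stemmer that both helpers above wrap (the sample Arabic sentence and variable names are illustrative only):

from nltk.stem.isri import ISRIStemmer

# ISRI strips Arabic prefixes and suffixes and returns the unvowelled root of each word.
stemmer = ISRIStemmer()
sentence = "يذهب الطلاب إلى المدرسة"
print(" ".join(stemmer.stem(word) for word in sentence.split()))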
Example #4
    def stem(self, txt):
        st = ISRIStemmer()
        stem_words = []
        words = self.tokenize(txt)
        for w in words:
            stem_words.append(st.stem(w))

        return stem_words
Example #5
def request_tokenizing(req_text_path, save_path='../data/user_requests'):
    """
    :param req_text_path: the path to the request .txt file
    :param save_path: the path to save the structured .xml output
    :return: read a request, tokenize & stem it and save the info in structured xml file
    """
    f = open(req_text_path, encoding='utf8')
    line = f.readline()
    f.close()
    # reading stop words
    stop_words = stopwords.words('arabic')

    # deleting punctuation THIS IS A MATTER OF DISCUSSION
    line = re.sub(r'[^\w\s]', '', line)
    # NLP
    tokenizer = WordPunctTokenizer()
    stemmer = ISRIStemmer()

    tokens = tokenizer.tokenize(line)
    root = ET.Element('root')
    tok_elem = ET.SubElement(root, 'tokenization')

    i = 1
    sw = False
    for t in tokens:
        stem = stemmer.stem(t)
        if str(t) in stop_words:
            sw = True
        ET.SubElement(tok_elem,
                      'word',
                      id=str(i),
                      value=str(t),
                      stop_word=str(sw),
                      stem=stem)
        i += 1
        sw = False

    file_str = save_path + '/' + req_text_path.split('/')[-1].split(
        '.')[0] + '.xml'
    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent="    ")
    with open(file_str, 'w', encoding='utf-8') as f:
        f.write(xmlstr)

    return file_str
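A minimal call sketch for request_tokenizing (the request path is a placeholder; it assumes the NLTK Arabic stopword corpus is already downloaded and that the save_path directory exists):

import nltk

nltk.download('stopwords')  # needed once for stopwords.words('arabic')

xml_path = request_tokenizing('../data/requests/request_01.txt')
print('structured request written to', xml_path)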
Example #6
    def __init__(self):
        # initialise default words
        self.stop_dict = set(stopwords.words('english')).union(punctuation)
        self.vec = CountVectorizer()
        self.stemmer = PorterStemmer()
        self.lemmat = WordNetLemmatizer()
        self.stop_dict_name = 'nltk'
        self.vec_name = 'Count Vectorizer'
        self.stemmer_name = 'PorterStemmer'
        self.tokenizer_name = 'word'
        self.web_stop_words = [
            "a", "a's", "able", "about", "above", "according", "accordingly",
            "across", "actually", "after", "afterwards", "again", "against",
            "ain't", "all", "allow", "allows", "almost", "alone", "along",
            "already", "also", "although", "always", "am", "among", "amongst",
            "an", "and", "another", "any", "anybody", "anyhow", "anyone",
            "anything", "anyway", "anyways", "anywhere", "apart", "appear",
            "appreciate", "appropriate", "are", "aren't", "around", "as",
            "aside", "ask", "asking", "associated", "at", "available", "away",
            "awfully", "b", "be", "became", "because", "become", "becomes",
            "becoming", "been", "before", "beforehand", "behind", "being",
            "believe", "below", "beside", "besides", "best", "better",
            "between", "beyond", "both", "brief", "but", "by", "c", "c'mon",
            "c's", "came", "can", "can't", "cannot", "cant", "cause", "causes",
            "certain", "certainly", "changes", "clearly", "co", "com", "come",
            "comes", "concerning", "consequently", "consider", "considering",
            "contain", "containing", "contains", "corresponding", "could",
            "couldn't", "course", "currently", "d", "definitely", "described",
            "despite", "did", "didn't", "different", "do", "does", "doesn't",
            "doing", "don't", "done", "down", "downwards", "during", "e",
            "each", "edu", "eg", "eight", "either", "else", "elsewhere",
            "enough", "entirely", "especially", "et", "etc", "even", "ever",
            "every", "everybody", "everyone", "everything", "everywhere", "ex",
            "exactly", "example", "except", "f", "far", "few", "fifth",
            "first", "five", "followed", "following", "follows", "for",
            "former", "formerly", "forth", "four", "from", "further",
            "furthermore", "g", "get", "gets", "getting", "given", "gives",
            "go", "goes", "going", "gone", "got", "gotten", "greetings", "h",
            "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
            "haven't", "having", "he", "he's", "hello", "help", "hence", "her",
            "here", "here's", "hereafter", "hereby", "herein", "hereupon",
            "hers", "herself", "hi", "him", "himself", "his", "hither",
            "hopefully", "how", "howbeit", "however", "i", "i'd", "i'll",
            "i'm", "i've", "ie", "if", "ignored", "immediate", "in",
            "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates",
            "inner", "insofar", "instead", "into", "inward", "is", "isn't",
            "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k",
            "keep", "keeps", "kept", "know", "known", "knows", "l", "last",
            "lately", "later", "latter", "latterly", "least", "less", "lest",
            "let", "let's", "like", "liked", "likely", "little", "look",
            "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe",
            "me", "mean", "meanwhile", "merely", "might", "more", "moreover",
            "most", "mostly", "much", "must", "my", "myself", "n", "name",
            "namely", "nd", "near", "nearly", "necessary", "need", "needs",
            "neither", "never", "nevertheless", "new", "next", "nine", "no",
            "nobody", "non", "none", "noone", "nor", "normally", "not",
            "nothing", "novel", "now", "nowhere", "o", "obviously", "of",
            "off", "often", "oh", "ok", "okay", "old", "on", "once", "one",
            "ones", "only", "onto", "or", "other", "others", "otherwise",
            "ought", "our", "ours", "ourselves", "out", "outside", "over",
            "overall", "own", "p", "particular", "particularly", "per",
            "perhaps", "placed", "please", "plus", "possible", "presumably",
            "probably", "provides", "q", "que", "quite", "qv", "r", "rather",
            "rd", "re", "really", "reasonably", "regarding", "regardless",
            "regards", "relatively", "respectively", "right", "'s", "said",
            "same", "saw", "say", "saying", "says", "second", "secondly",
            "see", "seeing", "seem", "seemed", "seeming", "seems", "seen",
            "self", "selves", "sensible", "sent", "serious", "seriously",
            "seven", "several", "shall", "she", "should", "shouldn't", "since",
            "six", "so", "some", "somebody", "somehow", "someone", "something",
            "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
            "specified", "specify", "specifying", "still", "sub", "such",
            "sup", "sure", "t", "t's", "take", "taken", "tell", "tends", "th",
            "than", "thank", "thanks", "thanx", "that", "that's", "thats",
            "the", "their", "theirs", "them", "themselves", "then", "thence",
            "there", "there's", "thereafter", "thereby", "therefore",
            "therein", "theres", "thereupon", "these", "they", "they'd",
            "they'll", "they're", "they've", "think", "third", "this",
            "thorough", "thoroughly", "those", "though", "three", "through",
            "throughout", "thru", "thus", "to", "together", "too", "took",
            "toward", "towards", "tried", "tries", "truly", "try", "trying",
            "twice", "two", "u", "un", "under", "unfortunately", "unless",
            "unlikely", "until", "unto", "up", "upon", "us", "use", "used",
            "useful", "uses", "using", "usually", "uucp", "v", "value",
            "various", "very", "via", "viz", "vs", "w", "want", "wants", "was",
            "wasn't", "way", "we", "we'd", "we'll", "we're", "we've",
            "welcome", "well", "went", "were", "weren't", "what", "what's",
            "whatever", "when", "whence", "whenever", "where", "where's",
            "whereafter", "whereas", "whereby", "wherein", "whereupon",
            "wherever", "whether", "which", "while", "whither", "who", "who's",
            "whoever", "whole", "whom", "whose", "why", "will", "willing",
            "wish", "with", "within", "without", "won't", "wonder", "would",
            "wouldn't", "x", "y", "yes", "yet", "you", "you'd", "you'll",
            "you're", "you've", "your", "yours", "yourself", "yourselves", "z",
            "zero", "html", "ol"
        ]

        self.stop_web_punct = self.stop_dict.union(self.web_stop_words)
Example #7
class nlp():
    def __init__(self):
        # initialise default words
        self.stop_dict = set(stopwords.words('english')).union(punctuation)
        self.vec = CountVectorizer()
        self.stemmer = PorterStemmer()
        self.lemmat = WordNetLemmatizer()
        self.stop_dict_name = 'nltk'
        self.vec_name = 'Count Vectorizer'
        self.stemmer_name = 'PorterStemmer'
        self.tokenizer_name = 'word'
        self.web_stop_words = [
            "a", "a's", "able", "about", "above", "according", "accordingly",
            "across", "actually", "after", "afterwards", "again", "against",
            "ain't", "all", "allow", "allows", "almost", "alone", "along",
            "already", "also", "although", "always", "am", "among", "amongst",
            "an", "and", "another", "any", "anybody", "anyhow", "anyone",
            "anything", "anyway", "anyways", "anywhere", "apart", "appear",
            "appreciate", "appropriate", "are", "aren't", "around", "as",
            "aside", "ask", "asking", "associated", "at", "available", "away",
            "awfully", "b", "be", "became", "because", "become", "becomes",
            "becoming", "been", "before", "beforehand", "behind", "being",
            "believe", "below", "beside", "besides", "best", "better",
            "between", "beyond", "both", "brief", "but", "by", "c", "c'mon",
            "c's", "came", "can", "can't", "cannot", "cant", "cause", "causes",
            "certain", "certainly", "changes", "clearly", "co", "com", "come",
            "comes", "concerning", "consequently", "consider", "considering",
            "contain", "containing", "contains", "corresponding", "could",
            "couldn't", "course", "currently", "d", "definitely", "described",
            "despite", "did", "didn't", "different", "do", "does", "doesn't",
            "doing", "don't", "done", "down", "downwards", "during", "e",
            "each", "edu", "eg", "eight", "either", "else", "elsewhere",
            "enough", "entirely", "especially", "et", "etc", "even", "ever",
            "every", "everybody", "everyone", "everything", "everywhere", "ex",
            "exactly", "example", "except", "f", "far", "few", "fifth",
            "first", "five", "followed", "following", "follows", "for",
            "former", "formerly", "forth", "four", "from", "further",
            "furthermore", "g", "get", "gets", "getting", "given", "gives",
            "go", "goes", "going", "gone", "got", "gotten", "greetings", "h",
            "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
            "haven't", "having", "he", "he's", "hello", "help", "hence", "her",
            "here", "here's", "hereafter", "hereby", "herein", "hereupon",
            "hers", "herself", "hi", "him", "himself", "his", "hither",
            "hopefully", "how", "howbeit", "however", "i", "i'd", "i'll",
            "i'm", "i've", "ie", "if", "ignored", "immediate", "in",
            "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates",
            "inner", "insofar", "instead", "into", "inward", "is", "isn't",
            "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k",
            "keep", "keeps", "kept", "know", "known", "knows", "l", "last",
            "lately", "later", "latter", "latterly", "least", "less", "lest",
            "let", "let's", "like", "liked", "likely", "little", "look",
            "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe",
            "me", "mean", "meanwhile", "merely", "might", "more", "moreover",
            "most", "mostly", "much", "must", "my", "myself", "n", "name",
            "namely", "nd", "near", "nearly", "necessary", "need", "needs",
            "neither", "never", "nevertheless", "new", "next", "nine", "no",
            "nobody", "non", "none", "noone", "nor", "normally", "not",
            "nothing", "novel", "now", "nowhere", "o", "obviously", "of",
            "off", "often", "oh", "ok", "okay", "old", "on", "once", "one",
            "ones", "only", "onto", "or", "other", "others", "otherwise",
            "ought", "our", "ours", "ourselves", "out", "outside", "over",
            "overall", "own", "p", "particular", "particularly", "per",
            "perhaps", "placed", "please", "plus", "possible", "presumably",
            "probably", "provides", "q", "que", "quite", "qv", "r", "rather",
            "rd", "re", "really", "reasonably", "regarding", "regardless",
            "regards", "relatively", "respectively", "right", "'s", "said",
            "same", "saw", "say", "saying", "says", "second", "secondly",
            "see", "seeing", "seem", "seemed", "seeming", "seems", "seen",
            "self", "selves", "sensible", "sent", "serious", "seriously",
            "seven", "several", "shall", "she", "should", "shouldn't", "since",
            "six", "so", "some", "somebody", "somehow", "someone", "something",
            "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
            "specified", "specify", "specifying", "still", "sub", "such",
            "sup", "sure", "t", "t's", "take", "taken", "tell", "tends", "th",
            "than", "thank", "thanks", "thanx", "that", "that's", "thats",
            "the", "their", "theirs", "them", "themselves", "then", "thence",
            "there", "there's", "thereafter", "thereby", "therefore",
            "therein", "theres", "thereupon", "these", "they", "they'd",
            "they'll", "they're", "they've", "think", "third", "this",
            "thorough", "thoroughly", "those", "though", "three", "through",
            "throughout", "thru", "thus", "to", "together", "too", "took",
            "toward", "towards", "tried", "tries", "truly", "try", "trying",
            "twice", "two", "u", "un", "under", "unfortunately", "unless",
            "unlikely", "until", "unto", "up", "upon", "us", "use", "used",
            "useful", "uses", "using", "usually", "uucp", "v", "value",
            "various", "very", "via", "viz", "vs", "w", "want", "wants", "was",
            "wasn't", "way", "we", "we'd", "we'll", "we're", "we've",
            "welcome", "well", "went", "were", "weren't", "what", "what's",
            "whatever", "when", "whence", "whenever", "where", "where's",
            "whereafter", "whereas", "whereby", "wherein", "whereupon",
            "wherever", "whether", "which", "while", "whither", "who", "who's",
            "whoever", "whole", "whom", "whose", "why", "will", "willing",
            "wish", "with", "within", "without", "won't", "wonder", "would",
            "wouldn't", "x", "y", "yes", "yet", "you", "you'd", "you'll",
            "you're", "you've", "your", "yours", "yourself", "yourselves", "z",
            "zero", "html", "ol"
        ]

        self.stop_web_punct = self.stop_dict.union(self.web_stop_words)

    def get_settings(self):
        return [
            self.tokenizer_name, self.stop_dict_name, self.vec_name,
            self.stemmer_name
        ]

    def apply_settings(self, x):

        if x == 'word':
            self.tokenizer_name = x
        elif x == 'sent':
            self.tokenizer_name = x

        if x == 'nltk':
            #    self.stop_dict  = stopwords.words('english')
            self.stop_dict_name = x
        elif x == 'Extend':
            # self.stop_dict = stopwords.words('english')
            self.stop_dict_name = x

        if x == 'Count':
            self.vec = CountVectorizer()
            self.vec_name = x
        elif x == 'TfiDf':
            # print(self.vec)
            self.vec = TfidfVectorizer()
            self.vec_name = x
            # print(self.vec)

        if x == 'Porter':
            self.stemmer = PorterStemmer()
            self.stemmer_name = x
        elif x == 'SnowBall':
            self.stemmer = SnowballStemmer(language='english',
                                           ignore_stopwords=True)
            self.stemmer_name = x
        elif x == 'ISR':
            self.stemmer = ISRIStemmer()
            self.stemmer_name = x

    def get_tokens(self, input):
        if self.tokenizer_name == 'word':
            return str(word_tokenize(input))
        elif self.tokenizer_name == 'sent':
            return str(sent_tokenize(input))
        else:
            print('error: invalid tokenizer type')

    def get_stemmer(self, input):
        #need to add different stemmers here
        text = ''
        if self.tokenizer_name == 'word':
            for i in word_tokenize(input):
                text += self.stemmer.stem(i) + ' '
        elif self.tokenizer_name == 'sent':
            for i in sent_tokenize(input):
                text += self.stemmer.stem(i) + ' '

        return text

    def valid_char(self, input):

        for j in input:
            if ord(j) <= 126 and ord(j) >= 33:
                continue

            else:
                return False

        return True

    def get_stopwords(self, input):

        input = re.sub(
            '[!@#$%^&*()\n_:><?\-.{}|+-,;""``~`—]|[0-9]|/|=|\[\]|\[\[\]\]',
            ' ', input)
        input = re.sub('[“’\']', '', input)

        # print('input after regex',input)

        if self.stop_dict_name == 'nltk':
            return str(
                list(i for i in word_tokenize(input)
                     if i not in self.stop_dict and not i.split(
                         '.')[-1].isdigit() and not i.split(',')[-1].isdigit()
                     and len(i) > 1 and self.valid_char(i)))
        elif self.stop_dict_name == 'Extend':
            # stopwords_en_punct = set(stopwords.words('english')).union(punctuation)

            return str(
                list(i for i in word_tokenize(input)
                     if i not in self.stop_web_punct and not i.split(
                         '.')[-1].isdigit() and not i.split(',')[-1].isdigit()
                     and len(i) > 1 and self.valid_char(i)))

    def get_vec(self, input):

        # if type == 'Count':
        # print(self.vec)
        return str(self.vec.fit_transform([input]).toarray())

        # elif type == 'TfiDf':
        #     return str(TfidfVectorizer().fit_transform([input]).toarray())

    def penn2morphy(self, penntag):
        """ Converts Penn Treebank tags to WordNet. """
        morphy_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
        try:
            return morphy_tag[penntag[:2]]
        except KeyError:
            # fall back to noun when the tag has no WordNet equivalent
            return 'n'

    def lemmatize_sent(self, text):
        wnl = WordNetLemmatizer()
        # Text input is string, returns lowercased strings.
        return str([
            wnl.lemmatize(word.lower(), pos=self.penn2morphy(tag))
            for word, tag in pos_tag(word_tokenize(text))
        ])

    def nlp_cleaner(self, x, inf=0):

        if type(x) != str:
            return 'invalid input , input must be string'

        #1 word tokenization , lowercasing
        x = re.sub(
            '[!@#$%^&*()\n_:><?\-.{}|+-,;""``~`—]|[0-9]|/|=|\[\]|\[\[\]\]',
            ' ', x)
        x = re.sub('[“’\']', '', x)

        if self.tokenizer_name == 'word':
            x = list(map(str.lower, word_tokenize(x)))
        elif self.tokenizer_name == 'sent':
            x = list(map(str.lower, sent_tokenize(x)))
            print(self.tokenizer_name)

        if inf:
            print('tokenizer')
            print(x[0:10])

        #2 stop words , removing punctuations
        if self.stop_dict_name == 'nltk':

            x = list(
                i for i in x
                if i not in self.stop_dict and not i.split('.')[-1].isdigit()
                and not i.split(',')[-1].isdigit() and len(i) > 1
                and self.valid_char(i))
        elif self.stop_dict_name == 'Extend':

            x = list(i for i in x
                     if i not in self.stop_web_punct and not i.split(
                         '.')[-1].isdigit() and not i.split(',')[-1].isdigit()
                     and len(i) > 1 and self.valid_char(i))
            # print(self.stop_dict_name)

        if inf:
            print('StopWords')
            print(x[0:10])

        #3 Stemming and Lemmatization

        if self.stemmer_name == 'Porter':
            x = list(self.stemmer.stem(i) for i in x)
        elif self.stemmer_name == 'SnowBall':
            x = list(self.stemmer.stem(i) for i in x)
        elif self.stemmer_name == 'ISR':
            x = list(self.stemmer.stem(i) for i in x)

        if inf:
            print('after stemming')
            print(x[0:10])

        return x

    def create_vec(self, name, col='STORY', encoding='UTF-8'):
        ext = name.split('.')[-1]

        if ext == 'csv':
            data = pd.read_csv(name, encoding=encoding)
        elif ext == 'xlsx':
            data = pd.read_excel(io=name, encoding=encoding)

        data = self.vec.fit_transform(data[col])
        pickle.dump(data, open('vec_metrix_.txt', 'wb'))
        pickle.dump(self.vec, open(self.vec_name + '.txt', 'wb'))
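A brief usage sketch for the nlp class above (the sample sentence is illustrative; it assumes the NLTK data the class relies on, e.g. 'punkt', 'stopwords', 'wordnet' and the POS tagger, is installed):

pipeline = nlp()
pipeline.apply_settings('Extend')    # use the extended web stop-word list
pipeline.apply_settings('SnowBall')  # switch from Porter to the Snowball stemmer
print(pipeline.get_settings())
print(pipeline.nlp_cleaner('The quick brown foxes were running over 2 fences.', inf=1))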
Example #8
import pickle as cPickle  # cPickle is Python 2-only; alias the stdlib pickle instead
import pandas as pd
from keras.utils import np_utils
import tensorflow as tf
from keras import backend as K

BATCH_SIZE = 16  # Batch size for GPU
NUM_WORDS = 10000  # Vocab length
MAX_LEN = 20  # Padding length (# of words)
LSTM_EMBED = 8  # Number of LSTM nodes

K.set_learning_phase(False)

data = pd.read_csv('../dataset/ASKFM-master/full_dataset.csv')
tokenizer = cPickle.load(
    open("../models/lstm-autoencoder-tokenizer.pickle", "rb"))

stemmer = ISRIStemmer()

# Read the encoder model
model = tf.keras.models.load_model('../models/lstm25/lstm-encoder.h5',
                                   compile=False)
model.load_weights('../models/lstm_encoder_weights.h5')
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Create the encoding function
encode = K.function([model.input, K.learning_phase()],
                    [model.layers[1].output])

Questions = tokenizer.texts_to_sequences(data.Question)
# We pad sequences that are shorter than MAX_LEN
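The snippet cuts off here; a plausible continuation, assuming Keras' standard pad_sequences helper and the MAX_LEN constant defined above, is:

from keras.preprocessing.sequence import pad_sequences

# Pad (or truncate) every tokenized question to a fixed length of MAX_LEN tokens.
Questions = pad_sequences(Questions, maxlen=MAX_LEN)
# The padded array can then be fed through the `encode` backend function built above.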
Example #9
from pyparsing import StringEnd, oneOf, FollowedBy, Optional, ZeroOrMore, SkipTo

file = open(
    "C:\\Users\\Administrator\\Desktop\\myfolder\\corpora\\stats\\ielts-7to11-some.txt"
)
raw = file.read()

try:
    wordlist = nltk.word_tokenize(raw)

    lemmatizer = WordNetLemmatizer()
    print(lemmatizer.lemmatize("ran"))
    lanster = LancasterStemmer()
    porter = PorterStemmer()
    snowball = SnowballStemmer("english")
    isri = ISRIStemmer()
    rslp = RSLPStemmer()
    porter2 = Stemmer('english')

    endOfString = StringEnd()
    prefix = oneOf(
        "uni inter intro de con com anti pre pro per an ab ad af ac at as re in im ex en em un dis over sub syn out thermo philo geo for fore back"
    )
    suffix = oneOf("ish")
    #suffix = oneOf("or er ed ish ian ary ation tion al ing ible able ate ly ment ism ous ness ent ic ive "
    #               "ative tude ence ance ise ant age cide ium ion")

    word = (Optional(prefix)("prefixes") +
            SkipTo(suffix | suffix + FollowedBy(endOfString)
                   | endOfString)("root") +
            ZeroOrMore(suffix | suffix + FollowedBy(endOfString))("suffix"))
Example #10
def batches_generator(train_data, batch_size=32):
    # For OHE inputs
    num_words = np.max(train_data) + 1
    timesteps = train_data.shape[1]
    while True:
        indices = np.random.choice(len(train_data), size=batch_size)
        X = train_data[indices]
        X = np_utils.to_categorical(X, num_words)
        X = X.reshape((batch_size, timesteps, num_words))
        yield (X, X)


train_data = pd.read_csv(
    "/home/omar/DataScience/DataSets/askfm/full_dataset.csv")

stemmer = ISRIStemmer()

# We don't need the answers, so let's drop them
train_data.drop('Answer', inplace=True, axis=1)

train_data = train_data[
    train_data.Question.apply(lambda x: len(x.split())) < MAX_LEN]

train_data.Question = train_data.Question.apply(
    lambda x: (re.sub('[^\u0620-\uFEF0\s]', '', x)).strip())

train_data = train_data[train_data.Question.apply(len) > 0]

# Stem the words
train_data.Question = train_data.Question.apply(
    lambda x: " ".join([stemmer.stem(i) for i in x.split()]))
Example #11
class Model:
    """docstring for Moddel"""
    def __init__(self):
        self.sc = StandardScaler()
        self.sex_enc = LabelEncoder()
        self.imputer = Imputer()
        self.classifier = LogisticRegression()

    def cleaner(self, text):
        text = text.lower()
        text = re.sub("@[^\s]+", "", text)
        text = text.replace(":)", "")
        text = text.replace("@", "")
        text = text.replace("#", "")
        text = text.replace(":(", "")
        return text

    def remove_stop_words(self, text):
        self.sw = stopwords.words("arabic")
        self.clean_words = []
        text = text.split()
        for word in text:
            if word not in self.sw:
                self.clean_words.append(word)
        return " ".join(self.clean_words)

    def stemming(self, text):
        self.ps = ISRIStemmer()
        text = text.split()
        self.stemmed_words = []
        for word in text:
            self.stemmed_words.append(self.ps.stem(word))
        return " ".join(self.stemmed_words)

    def run(self, text):
        text = self.cleaner(text)
        text = self.remove_stop_words(text)
        text = self.stemming(text)
        return text

    def read_df(self, path):
        self.df = pd.read_csv(path)

    def preprocessing(self):
        self.df['txt'] = self.df['txt'].apply(self.run)

    def split_df(self):
        self.tfidf = TfidfVectorizer()
        self.x = self.tfidf.fit_transform(self.df["txt"]).toarray()
        self.y = self.df['sentiment'].values

    def train_test(self, test_size):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=test_size, random_state=0)

    def train(self, classy):
        self.read_df("ASTD.csv")
        self.preprocessing()
        self.split_df()
        self.train_test(0.25)
        if (classy == "logistic"):

            self.classifier.fit(self.x_train, self.y_train)
        if (classy == "SVC"):
            self.classifier = SVC()
            self.classifier.fit(self.x_train, self.y_train)

        if (classy == "KNN"):
            self.classifier = KNeighborsClassifier()
            self.classifier.fit(self.x_train, self.y_train)
        self.y_pred = self.classifier.predict(self.x_test)
        return classification_report(self.y_test, self.y_pred)

    def evaluate(self):
        return self.classifier.score(self.x_test, self.y_test)

    def predict(self, test):
        test = self.run(test)
        test = self.tfidf.transform([test]).toarray()
        #test = self.sc.transform([test])
        return self.classifier.predict(test)
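Usage sketch for the Model class above (it assumes an ASTD.csv file with 'txt' and 'sentiment' columns in the working directory, which is what train() reads; the Arabic test sentence is illustrative):

model = Model()
report = model.train("SVC")      # preprocess ASTD.csv, fit an SVC, return a classification report
print(report)
print("accuracy:", model.evaluate())
print(model.predict("الخدمة كانت ممتازة"))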
Example #12
class Preprocess(object):

    _valid_lang = ['en', 'cn', 'ar']
    _stemmer = ISRIStemmer()

    def __init__(self,
                 word_seg_config={},
                 doc_filter_config={},
                 word_stem_config={},
                 word_lower_config={},
                 word_filter_config={},
                 word_index_config={}):
        # set default configuration
        self._word_seg_config = {'enable': True, 'lang': 'ar'}
        self._doc_filter_config = {
            'enable': True,
            'min_len': 0,
            'max_len': six.MAXSIZE
        }
        self._word_stem_config = {'enable': True}
        self._word_lower_config = {'enable': False}  # lower-casing off by default
        self._word_filter_config = {
            'enable': False,
            'stop_words': stopwords.words('arabic'),
            'min_freq': 1,
            'max_freq': six.MAXSIZE,
            'words_useless': None
        }
        self._word_index_config = {'word_dict': None}

        self._word_seg_config.update(word_seg_config)
        self._doc_filter_config.update(doc_filter_config)
        self._word_stem_config.update(word_stem_config)
        self._word_lower_config.update(word_lower_config)
        self._word_filter_config.update(word_filter_config)
        self._word_index_config.update(word_index_config)

        self._word_dict = self._word_index_config['word_dict']
        self._words_stats = dict()

    def run(self, file_path):
        print('load...')
        dids, docs = Preprocess.load(file_path)

        if self._word_seg_config['enable']:
            print('word_seg...')
            docs = Preprocess.word_seg(docs, self._word_seg_config)

        if self._doc_filter_config['enable']:
            print('doc_filter...')
            dids, docs = Preprocess.doc_filter(dids, docs,
                                               self._doc_filter_config)

        if self._word_stem_config['enable']:
            print('word_stem...')
            docs = Preprocess.word_stem(docs)

        if self._word_lower_config['enable']:
            print('word_lower...')
            docs = Preprocess.word_lower(docs)

        self._words_stats = Preprocess.cal_words_stat(docs)

        if self._word_filter_config['enable']:
            print('word_filter...')
            docs, self._words_useless = Preprocess.word_filter(
                docs, self._word_filter_config, self._words_stats)

        print('word_index...')
        docs, self._word_dict = Preprocess.word_index(docs,
                                                      self._word_index_config)

        return dids, docs

    @staticmethod
    def parse(line):
        subs = line.split(' ', 1)
        if 1 == len(subs):
            return subs[0], ''
        else:
            return subs[0], subs[1]

    @staticmethod
    def load(file_path):
        dids = list()
        docs = list()
        f = codecs.open(file_path, 'r', encoding='utf8')
        for line in tqdm(f):
            line = line.strip()
            if '' != line:
                did, doc = Preprocess.parse(line)
                dids.append(did)
                docs.append(doc)
        f.close()
        return dids, docs

    @staticmethod
    def word_seg_ar(docs):
        docs = [wordpunct_tokenize(sent) for sent in tqdm(docs)]
        # show the progress of word segmentation with tqdm
        '''docs_seg = []
        print('docs size', len(docs))
        for i in tqdm(range(len(docs))):
            docs_seg.append(word_tokenize(docs[i]))'''
        return docs

    @staticmethod
    def word_seg_en(docs):
        docs = [word_tokenize(sent) for sent in tqdm(docs)]
        # show the progress of word segmentation with tqdm
        '''docs_seg = []
        print('docs size', len(docs))
        for i in tqdm(range(len(docs))):
            docs_seg.append(word_tokenize(docs[i]))'''
        return docs

    @staticmethod
    def word_seg_cn(docs):
        docs = [list(jieba.cut(sent)) for sent in docs]
        return docs

    @staticmethod
    def word_seg(docs, config):
        assert config['lang'].lower(
        ) in Preprocess._valid_lang, 'Wrong language type: %s' % config['lang']
        docs = getattr(
            Preprocess,
            '%s_%s' % (sys._getframe().f_code.co_name, config['lang']))(docs)
        return docs

    @staticmethod
    def cal_words_stat(docs):
        words_stats = {}
        docs_num = len(docs)
        for ws in docs:
            for w in ws:
                if w not in words_stats:
                    words_stats[w] = {}
                    words_stats[w]['cf'] = 0
                    words_stats[w]['df'] = 0
                    words_stats[w]['idf'] = 0
                words_stats[w]['cf'] += 1
            for w in set(ws):
                words_stats[w]['df'] += 1
        for w, winfo in words_stats.items():
            words_stats[w]['idf'] = np.log(
                (1. + docs_num) / (1. + winfo['df']))
        return words_stats

    @staticmethod
    def word_filter(docs, config, words_stats):
        if config['words_useless'] is None:
            config['words_useless'] = set()
            # filter with stop_words
            config['words_useless'].update(config['stop_words'])
            # filter with min_freq and max_freq
            for w, winfo in words_stats.items():
                # filter too frequent words or rare words
                if config['min_freq'] > winfo['df'] or config[
                        'max_freq'] < winfo['df']:
                    config['words_useless'].add(w)
        # filter with useless words
        docs = [[w for w in ws if w not in config['words_useless']]
                for ws in tqdm(docs)]
        return docs, config['words_useless']

    @staticmethod
    def doc_filter(dids, docs, config):
        new_docs = list()
        new_dids = list()
        for i in tqdm(range(len(docs))):
            if config['min_len'] <= len(docs[i]) <= config['max_len']:
                new_docs.append(docs[i])
                new_dids.append(dids[i])
        return new_dids, new_docs

    @staticmethod
    def word_stem(docs):
        docs = [[Preprocess._stemmer.stem(w) for w in ws] for ws in tqdm(docs)]
        return docs

    @staticmethod
    def word_lower(docs):
        docs = [[w.lower() for w in ws] for ws in tqdm(docs)]
        return docs

    @staticmethod
    def build_word_dict(docs):
        word_dict = dict()
        for ws in docs:
            for w in ws:
                word_dict.setdefault(w, len(word_dict))
        return word_dict

    @staticmethod
    def word_index(docs, config):
        if config['word_dict'] is None:
            config['word_dict'] = Preprocess.build_word_dict(docs)
        docs = [[
            config['word_dict'][w] for w in ws if w in config['word_dict']
        ] for ws in tqdm(docs)]
        return docs, config['word_dict']

    @staticmethod
    def save_lines(file_path, lines):
        f = codecs.open(file_path, 'w', encoding='utf8')
        for line in lines:
            f.write(line + "\n")
        f.close()

    @staticmethod
    def load_lines(file_path):
        f = codecs.open(file_path, 'r', encoding='utf8')
        lines = f.readlines()
        f.close()
        return lines

    @staticmethod
    def save_dict(file_path, dic, sort=False):
        if sort:
            dic = sorted(dic.items(), key=lambda d: d[1], reverse=False)
            lines = ['%s %s' % (k, v) for k, v in dic]
        else:
            lines = ['%s %s' % (k, v) for k, v in dic.items()]
        Preprocess.save_lines(file_path, lines)

    @staticmethod
    def load_dict(file_path):
        lines = Preprocess.load_lines(file_path)
        dic = dict()
        for line in lines:
            k, v = line.split()
            dic[k] = v
        return dic

    def save_words_useless(self, words_useless_fp):
        Preprocess.save_lines(words_useless_fp, self._words_useless)

    def load_words_useless(self, words_useless_fp):
        self._words_useless = set(Preprocess.load_lines(words_useless_fp))

    def save_word_dict(self, word_dict_fp, sort=False):
        Preprocess.save_dict(word_dict_fp, self._word_dict, sort)

    def load_word_dict(self, word_dict_fp):
        self._word_dict = Preprocess.load_dict(word_dict_fp)

    def save_words_stats(self, words_stats_fp, sort=False):
        if sort:
            word_dic = sorted(self._word_dict.items(),
                              key=lambda d: d[1],
                              reverse=False)
            lines = [
                '%s %d %d %f' %
                (wid, self._words_stats[w]['cf'], self._words_stats[w]['df'],
                 self._words_stats[w]['idf']) for w, wid in word_dic
            ]
        else:
            lines = [
                '%s %d %d %f' %
                (wid, self._words_stats[w]['cf'], self._words_stats[w]['df'],
                 self._words_stats[w]['idf'])
                for w, wid in self._word_dict.items()
            ]
        Preprocess.save_lines(words_stats_fp, lines)

    def load_words_stats(self, words_stats_fp):
        lines = Preprocess.load_lines(words_stats_fp)
        for line in lines:
            wid, cf, df, idf = line.split()
            self._words_stats[wid] = {}
            self._words_stats[wid]['cf'] = int(cf)
            self._words_stats[wid]['df'] = int(df)
            self._words_stats[wid]['idf'] = float(idf)
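A minimal run sketch for the Preprocess class above (corpus.txt is a placeholder; each line is expected to hold a document id, a space, then the raw text, as parse() assumes):

pre = Preprocess(word_filter_config={'enable': True, 'min_freq': 2})
dids, docs = pre.run('corpus.txt')       # segment, filter, stem and index the documents
pre.save_word_dict('word_dict.txt', sort=True)
print(dids[0], docs[0])                  # first document as a list of word-id integers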