Example No. 1
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision'
]

pt_stemmer = pt.PorterStemmer()  # Porter stemmer: relatively lenient
lc_stemmer = lc.LancasterStemmer()  # Lancaster stemmer: relatively aggressive
sb_stemmer = sb.SnowballStemmer('english')  # Snowball stemmer: middle ground
for word in words:
    pt_stem = pt_stemmer.stem(word)
    lc_stem = lc_stemmer.stem(word)
    sb_stem = sb_stemmer.stem(word)
    print('%8s %8s %8s %8s' % (word, pt_stem, lc_stem, sb_stem))
Example No. 2
import re

from nltk.corpus import words
from nltk.stem import snowball


class Word:
    nonalpha_regex = r'[^a-zA-Z]'

    numeric_regex = r'[0-9]'

    html_regex = r'<.*?>'

    stemmer = snowball.SnowballStemmer("english")

    word_corpus = set(words.words())

    stop_words = {
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
        'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been',
        'before', 'being', 'below', 'between', 'both', 'but', 'by',
        'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t', 'do', 'does',
        'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for',
        'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have',
        'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here',
        'here\'s', 'hers', 'herself', 'him', 'himself', 'his',
        'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in',
        'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me',
        'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of',
        'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours',
        'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d',
        'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such',
        'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them',
        'themselves', 'then', 'there', 'there\'s', 'these', 'they', 'they\'d',
        'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to',
        'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d',
        'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s',
        'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who',
        'who\'s', 'whom', 'why', 'why\'s', 'with',
        'won\'t', 'would', 'wouldn\'t', 'you',
        'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
        'yourselves'
    }

    @staticmethod
    def strip(word):
        return re.sub(Word.html_regex, '', word)

    @staticmethod
    def sanitize(word):
        return re.sub(Word.nonalpha_regex, '', word)

    @staticmethod
    def normalize(word):
        return word.lower()

    @staticmethod
    def stem(word):
        return Word.stemmer.stem(word)

    @staticmethod
    def process(word):
        word = Word.normalize(word)
        word = Word.strip(word)
        word = Word.sanitize(word)
        word = Word.stem(word)
        return word

    @staticmethod
    def is_stop(word):
        return word in Word.stop_words

    @staticmethod
    def is_numeric(word):
        return re.match(Word.numeric_regex, word) is not None

    @staticmethod
    def is_html(word):
        return re.match(Word.html_regex, word) is not None

    @staticmethod
    def is_word(word):
        return word in Word.word_corpus
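
A minimal usage sketch of the Word helpers above; the sample string is hypothetical, and the stems shown in the comments assume NLTK's English Snowball stemmer and that the required NLTK corpora are available:

raw_tokens = '<b>Dogs</b> playing on the beaches'.split()
cleaned = [Word.process(t) for t in raw_tokens]
# typically ['dog', 'play', 'on', 'the', 'beach']
content_words = [t for t in cleaned if not Word.is_stop(t) and Word.is_word(t)]
# stop words such as 'on' and 'the' are dropped, leaving ['dog', 'play', 'beach']
print(content_words)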
Example No. 3
import nltk.stem.snowball as sb
from urllib import request
from bs4 import BeautifulSoup as bs


def main(word, keepNone=False):
    '''
    input:
        word (str): the word to look up
    return:
        meanExamList (list): [(mean1, [mean1exm1, mean1exm2, ...]), (mean2, [mean2exam1, ...])]
    '''
    print("connecting to the website ...")
    myUrl = "https://www.macmillandictionary.com/dictionary/british/" + word
    res = request.urlopen(myUrl)
    res.encoding = 'utf-8'

    print("finding needed information ...")
    soupStr = bs(res.read(), features="lxml")
    olBsL = soupStr.find_all("ol", class_="senses")
    if len(olBsL) > 0:
        olBs = olBsL[0]
    else:
        print("problems in '", word, "':no enough meanings")
        tempWord = word
        sb_stemmer = sb.SnowballStemmer("english")
        word = sb_stemmer.stem(word)
        if tempWord != word:
            return main(word)
        else:
            return []
    liBsList = olBs.find_all("li")

    print("finding meanings and examples ...")
    meaningList = []
    exampleList = []
    for liItem in liBsList:
        if len(liItem.find_all("div", class_="SENSE")) > 0:
            if len(liItem.\
                    find_all("div",class_="SENSE")[0].\
                    find_all("span",class_="DEFINITION"))>0:
                liDivItem=liItem.\
                    find_all("div",class_="SENSE")[0].\
                    find_all("span",class_="DEFINITION")[0].\
                    text
            elif len(liItem.\
                    find_all("div",class_="SENSE")[0].\
                    find_all("span",class_="GREF-ENTRY"))>0:
                liDivItem=liItem.\
                    find_all("div",class_="SENSE")[0].\
                    find_all("span",class_="GREF-ENTRY")[0].\
                    find_all("a")[0].text
                if liDivItem == None:
                    print("problems in", word, ": None")
                    return [("no means", "no examples")]
                else:
                    return main(liDivItem, keepNone=False)
            elif len(liItem.\
                    find_all("div",class_="SENSE")[0].\
                    find_all("div",class_="sideboxbody"))>0:
                liDivItem=liItem.\
                    find_all("div",class_="SENSE")[0].\
                    find_all("div",class_="sideboxbody")[0].\
                    find_all("a")[0].text
                if liDivItem == None:
                    print("problems in", word, ": None")
                    return [("no means", "no examples")]
                else:
                    return main(liDivItem, keepNone=False)
            else:
                return [("no means", "no examples")]
            meaningList.append(liDivItem)
        if len(liItem.find_all("div", class_="SENSE")) > 0:
            try:
                liDivItem=liItem.\
                    find_all("div",class_="SENSE")[0].\
                    find_all("p",class_="EXAMPLE")
                exampleList.append(
                    [liDivItemItem.text for liDivItemItem in liDivItem])
            except IndexError:
                pass
    meanExamList = list(zip(meaningList, exampleList))
    if len(meanExamList[0][1]) == 0:
        print("problems in '", word,
              "':no enough examples. replacing examples with meanings")
        meanExamList = [(meanExamItem[0], [meanExamItem[0]])
                        for meanExamItem in meanExamList]
    tempMeanExamList = []
    if keepNone == False:
        for row in meanExamList:
            if len(row[1]) != 0:
                tempMeanExamList.append(row)
        meanExamList = tempMeanExamList
    return meanExamList
Example No. 4
 def __init__(self):
     self.stemmer = snowball.SnowballStemmer("english")
     self.Porter_stemmer = PorterStemmer()
Example No. 5
    def run(self):
        global _independent_transformers

        self.tokenzier = treebank.TreebankWordTokenizer()
        self.stemmer = snowball.SnowballStemmer('english')

        train_data = rf_dataset.Dataset().load_all(
            'train', as_df=True)[['question1_clean', 'question2_clean']]
        test_data = rf_dataset.Dataset().load_all(
            'test', as_df=True)[['question1_clean', 'question2_clean']]

        all_data = pandas.concat([train_data, test_data], axis=0)
        all_q1 = list(all_data['question1_clean'])
        all_t1 = list(
            tqdm(multiprocessing.Pool().imap(self.tokenize,
                                             all_q1,
                                             chunksize=5000),
                 total=len(all_q1),
                 desc='Tokenizing: 1'))

        all_q2 = list(all_data['question2_clean'])
        all_t2 = list(
            tqdm(multiprocessing.Pool().imap(self.tokenize,
                                             all_q2,
                                             chunksize=5000),
                 total=len(all_q2),
                 desc='Tokenizing: 2'))

        all_indep_dists = list(
            tqdm(multiprocessing.Pool().imap(transform,
                                             zip(all_q1, all_q2, all_t1,
                                                 all_t2),
                                             chunksize=5000),
                 total=len(all_q1),
                 desc='Computing distances'))
        all_df = pandas.DataFrame(all_indep_dists)

        print('Loading dependent transforms')
        dependent_transformers = {
            'word_mover': WordMoverDistance(),
            'sentiment': SentimentDifference()
        }
        print('Finished loading!')

        for name, fn in dependent_transformers.items():
            dist = [
                fn(q1, q2, t1, t2)
                for q1, q2, t1, t2 in tqdm(zip(all_q1, all_q2, all_t1, all_t2),
                                           total=len(all_q1),
                                           desc=name)
            ]
            if isinstance(dist[0], dict):
                frame = pandas.DataFrame.from_dict(dist, orient='columns')
                for col in frame:
                    all_df[name + '_' + col] = frame[col]
            else:
                all_df[name] = dist

        self.output().makedirs()
        train_dists = all_df.iloc[:train_data.shape[0]]
        test_dists = all_df.iloc[train_data.shape[0]:]
        train_dists.to_msgpack(_train_loc)
        test_dists.to_msgpack(_test_loc)

        little_cls = ensemble.ExtraTreesClassifier(n_estimators=200, n_jobs=-1)
        little_cls.fit(
            train_dists.clip(-10000, 10000).values,
            rf_dataset.Dataset().load_all('train',
                                          as_df=True).is_duplicate.values)
        print(
            pandas.Series(little_cls.feature_importances_,
                          train_dists.columns).sort_values())

        with self.output().open('w') as f:
            f.write(
                str(
                    pandas.Series(little_cls.feature_importances_,
                                  train_dists.columns).sort_values()))
            f.write("\n")
Example No. 6
import json
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from nltk.stem import snowball


def BuildInvertedIndex(document_paths):
    # In-memory indexer for creating an inverted index

    stemmer = snowball.SnowballStemmer('english')
    DocumentIndex = {}  # {key = doc_id: value = (url, doc_path)}
    # Inverted list storage (dictionary of tokens/words/n-grams + posting lists)
    InvertedIndex = {}
    n = 0  # Document numbering

    for document_path in document_paths:
        # Read json file which contains ['url', 'content', 'encoding'] for a document
        with open(document_path, 'r') as fh:
            json_object = json.load(fh)

        url = json_object["url"]
        pageContent = json_object["content"]
        encoding = json_object["encoding"]

        # Ignore urls with fragments
        if urlparse(url).fragment != "": continue

        soup = BeautifulSoup(pageContent, 'lxml')
        text = soup.get_text()

        # check if the page contains any text
        if text == '': continue
        n += 1
        print(f"Indexing document #{n}")

        DocumentIndex[n] = (url, document_path)
        tokens = tokenize(text)  # tokenize text in html document
        tokenFrequency = get_token_frequency(tokens)
        tokens = set(tokens)  # remove duplicate tokens

        HTML_tag_fields = get_HTML_tag_fields(soup)

        for term_position, token in enumerate(tokens):
            # Check if a PostingList is present in the inverted index,
            # Add the new {token : PostingList} to inverted index otherwise.
            try:
                posting_list = InvertedIndex[stemmer.stem(token)]
                # Check if a Posting is present in the posting_list,
                # Add the new Posting to posting_list otherwise.
                try:
                    # If Posting 'n' is present in the posting_list,
                    # Append term position to posting.term_postitions
                    posting = posting_list[n]
                    posting.append_term_position(term_position)

                except IndexError:
                    # if Posting 'n' is not present in the posting_list
                    posting_list.append(
                        Posting(docid=n,
                                tf=tokenFrequency[token],
                                fields=get_posting_fields(
                                    HTML_tag_fields, token),
                                termPosition=term_position))

            except KeyError:
                InvertedIndex[stemmer.stem(token)] = PostingList(
                    Posting(docid=n,
                            tf=tokenFrequency[token],
                            fields=get_posting_fields(HTML_tag_fields, token),
                            termPosition=term_position))

    return DocumentIndex, InvertedIndex
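
Posting and PostingList are not defined in this snippet. Below is a minimal sketch of the interface BuildInvertedIndex appears to assume; the class names come from the code above, but the fields and the IndexError-on-miss behaviour are inferred, so treat it as a hypothetical reconstruction rather than the original implementation.

class Posting:
    # One document's entry in a term's posting list.
    def __init__(self, docid, tf, fields, termPosition):
        self.docid = docid
        self.tf = tf                      # term frequency in the document
        self.fields = fields              # HTML fields (title, headings, ...) containing the term
        self.term_positions = [termPosition]

    def append_term_position(self, term_position):
        self.term_positions.append(term_position)


class PostingList:
    # Container of Postings looked up by docid; a miss raises IndexError,
    # matching the try/except in BuildInvertedIndex above.
    def __init__(self, first_posting):
        self._postings = [first_posting]

    def __getitem__(self, docid):
        for posting in self._postings:
            if posting.docid == docid:
                return posting
        raise IndexError("no posting for document %d" % docid)

    def append(self, posting):
        self._postings.append(posting)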
Example No. 7
    vectorizer = CountVectorizer(min_df=0)
    vectorizer.fit(corpusList)
    vocabList = vectorizer.get_feature_names()

    #     wordFqArr=np.array([word[1] for word in vectorizer.vocabulary_.items()])
    #     wordFqArr=(wordFqArr-np.mean(wordFqArr)*np.ones(wordFqArr.shape))/np.std(wordFqArr)
    #     print(list(vectorizer.vocabulary_.items()))
    #     print("mean of word frequence:",np.mean(wordFqArr))
    #     print("standard variance of word frequence:",np.std(wordFqArr))
    #     plt.hist(wordFqArr)
    #     plt.show()

    print("getting mean-example list ...")
    meanExamList = []
    wordMeanExamDict = {}
    sb_stemmer = sb.SnowballStemmer("english")
    for word in tqdm.tqdm(vocabList):
        if len(re.findall("[0-9]+", word)) > 0:
            continue
        if word not in wordMeanExamDict.keys():
            wordMeanExamDict[word] = TC.main(word.strip())
        stemedWord = sb_stemmer.stem(word)
        if stemedWord not in wordMeanExamDict.keys():
            wordMeanExamDict[stemedWord] = TC.main(stemedWord.strip())

    print("saving data ...")
    with open("data/GANDict.pkl", "wb+") as GANDictFile:
        pkl.dump(wordMeanExamDict, GANDictFile)

    print("loading data ...")
    with open("data/GANDict.pkl", "rb") as GANDictFile:
Example No. 8
import xgboost as xgb
import cPickle as pickle

from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.style.use('ggplot')

stemmer = snowball.SnowballStemmer("english")


def load_data(filename='../labeledhate_5cats.p'):
    '''
    Load data into a data frame for use in running model
    '''
    return pickle.load(open(filename, 'rb'))


def stem_tokens(tokens, stemmer):
    '''Stem the tokens.'''
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
    print("Loading Data")
    df = load_data()
    print("Splitting Data")
    X_train, X_test, y_train, y_test = splitdata(df, classes)

    #relabel the output for multiclass roc plot, score
    ylabel_bin = label_binarize(y_test.astype(int),
                                classes=[0, 1, 2, 3, 4],
                                sparse_output=False)

    ### Loop through # max_features? --> Use 5000 as a starting point, at least for now. ###
    ### Use english stop words
    ### Loop Through Vectorizer, stemmer/tokenizing ###
    vect_options = ['Count', 'Hash', 'Tfidf']

    stemmer_options = [snowball.SnowballStemmer("english")]
    #Note - wordnet.WordNetLemmatizer() has no .stem option & doesn't fit the format of this code.

    token_options = [None, tokenize]
    # token_options = [tokenize]

    for token in token_options:
        for stemmer in stemmer_options:
            for vect in vect_options:
                print('For vect {0}, stemmer {1} & token {2}'.format(
                    vect, stemmer, token))
                print('Vectorizing')
                vectfit_X_train, vectfit_X_test = vectorizer(
                    vectchoice=vect, stopwords='english', tokenize_me=token)
                print('Classifying')
                xg_train = xgb.DMatrix(vectfit_X_train, label=y_train)
Example No. 10
def train(X_train, s_train, y_train):
    # count_vect = CountVectorizer()
    # X_train_counts = count_vect.fit_transform(twenty_train.data)
    # tfidf_transformer = TfidfTransformer()
    # X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # TODO: do n-grams and add NLTK features! 

    # using default SVM params
    # penalty=’l2’, loss=’squared_hinge’, dual=True, tol=0.0001, C=1.0, 
    # multi_class=’ovr’, fit_intercept=True, intercept_scaling=1, 
    # class_weight=None, verbose=0, random_state=None, max_iter=1000
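    # Command-line flags consumed below (descriptions inferred from this function's
    # branches, so treat them as assumptions):
    #   sys.argv[1] - classifier: 'rf', 'svm', or 'knn'
    #   sys.argv[4] - 'true' to tokenize with StemTokenizer
    #   sys.argv[5] - 'true' to apply the stop-word list built below
    #   sys.argv[7] - feature set: 'word', 'topic', 'style', or 'all'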
    

    stemmer = sb.SnowballStemmer('english')
    
    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += ["'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe', 'dure', 'might',
               'must', "n't", 'need', 'onc', 'onli', 'ourselv', 'sha', 'themselv', 'veri', 'whi',
               'wo', 'would', 'yourselv'] #complained about not having these as stop words
    pubs = ['buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart', 'fox', 'guardian','review', 'theatlant']
    punct = []#[':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?','—', '‘', '$'] #gonna leave these in for now
    
    swlist += pubs
    swlist += punct
    if sys.argv[4].lower()=='true':
        tkzr = StemTokenizer()
    else:
        tkzr = None
    
    if sys.argv[5].lower()!='true':
        swlist = []
    
    if sys.argv[1].lower()=='rf':
        classTuple = ('rf', RandomForestClassifier(n_estimators=100, class_weight='balanced'))
    elif sys.argv[1].lower()=='svm':
        classTuple = ('svm', LinearSVC(class_weight='balanced'))
    elif sys.argv[1].lower()=='knn':
        classTuple = ('knn', KNeighborsClassifier(n_neighbors=5, metric='cosine'))
    else:
        sys.exit('unknown classifier')

    #what features are we using?
    if sys.argv[7].lower()=='word':
        text_clf = Pipeline([('vect', AugmentedCountVectorizer(stop_words=swlist,
                                                               tokenizer=tkzr)),
                             ('tfidf', TfidfTransformer()),
                             classTuple])
    elif sys.argv[7].lower()=='topic':
        text_clf = Pipeline([('vect', AugmentedCountVectorizer(stop_words=swlist,
                                                               tokenizer=tkzr)),
                             ('tfidf', LatentDirichletAllocation(n_components=50)),
                             classTuple])
    elif sys.argv[7].lower()=='style':
        text_clf = Pipeline([('vect', RemoveWords()), classTuple])
    elif sys.argv[7].lower()=='all':
        text_clf = Pipeline([('vect', AugmentedCountVectorizer(stop_words=swlist,
                                                               tokenizer=tkzr,
                                                               useStyleFeatures=True)),
                             ('tfidf', AllFeatureTransformer(n_components=50)),
                             classTuple])
    else:
        sys.exit('unknown features')
    

    text_clf = text_clf.fit((X_train, s_train), y_train)
    # TODO: save model
    return text_clf
Example No. 11
from __future__ import print_function
import numpy as np
import nltk
import nltk.corpus as corpus
import nltk.stem.snowball as snowball
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import re
import random

ptbr_stem = snowball.SnowballStemmer('portuguese').stem

def create_lexicon(dataset):
    global max_doc_length
    lexicon = []
    with open(dataset, 'r') as f:
        contents = f.readlines()[7:]
        for line in contents:
            document = line.split(',')[-1]
            document = re.sub("'", '', document, flags = re.M)
            document = word_tokenize(document)

            for word in document:
                if word.lower() not in corpus.stopwords.words('portuguese'):
                    lexicon.append(word.lower())
Example No. 12
def stem_tokenizer(doc):
    tokens = word_tokenize(doc)
    stemmer = snowball.SnowballStemmer("english", ignore_stopwords=True)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return ([tok.lower() for tok in stemmed_tokens if tok.isalpha()])
Example No. 13
# @author: Administrator

import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = [
    'table', 'probably', 'wolves', 'dreamt', 'palying', 'is', 'beaches',
    'envision', 'grounded'
]
'''
Stemming (extract word stems)
'''
stemmer_porter = pt.PorterStemmer()  # relatively lenient
stemmer_lancaster = lc.LancasterStemmer()  # relatively aggressive
stemmer_snowball = sb.SnowballStemmer('english')  # middle ground
for word in words:
    pstem = stemmer_porter.stem(word)
    lstem = stemmer_lancaster.stem(word)
    sstem = stemmer_snowball.stem(word)
    print('{:10} {:10} {:10} {:10}'.format(word, pstem, lstem, sstem))
    #table      tabl       tabl       tabl
    #probably   probabl    prob       probabl
    #wolves     wolv       wolv       wolv
    #dreamt     dreamt     dreamt     dreamt
    #palying    pali       paly       pali
    #is         is         is         is
    #beaches    beach      beach      beach
    #envision   envis      envid      envis
    #grounded   ground     ground     ground
Example No. 14
import collections
import csv
import json

from nltk.stem import snowball

from extractors.common.extraction_keys import KEYS

"""
Post processor function which tries to find given place name from list of manually fixed place names so that
Finnish difficult conjugations or typos can be resolved to a correct Place name. As a bonus fills in region when
it is recorded in the Place name list. List itself was generated from csv with karelian-db repository's 
fix_place_names script. 

This processor should be run for Place names which are in conjugated format, for example birth places, which in karelian
books are usually written in form of "Ahlaisissa". Some conjugations are difficult to deal with naive Snowball stemmer 
and many OCR typos also seem to trip stemmer. Therefore a list of around 2500 place names were corrected by hand and rest
should be possible to merge with stemmer and string distance metric such as Jaro-Winkler.  
"""
manually_fixed_place_names_file = open('support_datasheets/place_names_with_alternative_forms.json', encoding='utf8')
manually_fixed_place_names = json.load(manually_fixed_place_names_file)
stemmer = snowball.SnowballStemmer('finnish')
manual_place_name_index = {}

"""
Every place name should be found from existing list of place names when searched by conservative Jaro-Winkler distance
of stemmed form of the name. This should minimize the problem of creating useless unique place names to the result set just
because same place name has slight difference in the end of the word such as conjugation.
"""
list_of_known_places_file = open('support_datasheets/place_name_list.csv', encoding='utf8')
list_of_known_places = list(csv.DictReader(list_of_known_places_file))
place_list_index = collections.OrderedDict()

# Create a hash map which has as a key different writing styles of place names
# which refer to correct data entry for those place names
for key, item in manually_fixed_place_names.items():
    manual_place_name_index[key] = item
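
A sketch of the stemmed fuzzy lookup described in the comments above. difflib.SequenceMatcher from the standard library stands in for the Jaro-Winkler metric mentioned in the docstring, and the function name and threshold are illustrative assumptions, not part of the original extractor.

import difflib

from nltk.stem import snowball

_place_stemmer = snowball.SnowballStemmer('finnish')


def match_place_name(conjugated_name, known_names, threshold=0.85):
    # Stem the (possibly conjugated or OCR-damaged) candidate and every known
    # name, then keep the best-scoring known name above a conservative threshold.
    candidate = _place_stemmer.stem(conjugated_name.lower())
    best_name, best_score = None, 0.0
    for known in known_names:
        score = difflib.SequenceMatcher(
            None, candidate, _place_stemmer.stem(known.lower())).ratio()
        if score > best_score:
            best_name, best_score = known, score
    return best_name if best_score >= threshold else None


# Intended use: match_place_name('Ahlaisissa', list_of_known_names)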
Example No. 15
 def __init__(self):
     self.stemmer = sb.SnowballStemmer('english')
Example No. 16
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc
doc = []
with open('../data/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
tokenizer = tk.RegexpTokenizer(r'\w+')
stopwords = nc.stopwords.words('english')
stemmer = sb.SnowballStemmer('english')
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)
dic = gc.Dictionary(lines_tokens)
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)
    bow.append(row)
n_topics = 2
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)
topics = model.print_topics(num_topics=n_topics, num_words=4)
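
print_topics returns a list of (topic_id, formatted_terms) pairs, where formatted_terms is a string such as '0.072*"word" + ...'. The snippet stops before displaying them; a minimal follow-up using the variables defined above:

for topic_id, terms in topics:
    print('Topic %d: %s' % (topic_id, terms))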
Example No. 17
def run_classifer(X_train, y_train, X_test, y_test):
    # s_train = np.array(s_train) # samples x features
    # s_test = np.array(s_test)

    num_labels = 15
    batch_size = 100
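
    # Command-line flags consumed below (descriptions inferred from this function's
    # branches, so treat them as assumptions):
    #   sys.argv[3] - 'true' to tokenize with StemTokenizer
    #   sys.argv[4] - 'true' to apply the stop-word list built below
    #   sys.argv[5] - base filename used to save and reload the Keras model
    #   sys.argv[6] - feature set: 'word', 'topic', 'style', or 'all'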

    stemmer = sb.SnowballStemmer('english')
    
    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += ["'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe', 'dure', 'might',
               'must', "n't", 'need', 'onc', 'onli', 'ourselv', 'sha', 'themselv', 'veri', 'whi',
               'wo', 'would', 'yourselv'] #complained about not having these as stop words
    pubs = ['buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart', 'fox', 'guardian','review', 'theatlant']
    punct = []#[':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?','—', '‘', '$'] #gonna leave these in for now
    
    swlist += pubs
    swlist += punct
    if sys.argv[3].lower()=='true':
        tkzr = StemTokenizer()
    else:
        tkzr = None
    
    if sys.argv[4].lower()!='true':
        swlist = []


    #what features are we using?
    if sys.argv[6].lower()=='word':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train = tfidf_transformer.transform(X_train)
        X_test = tfidf_transformer.transform(X_test)

    elif sys.argv[6].lower()=='topic':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train = lda_model.transform(X_train)
        X_test = lda_model.transform(X_test)

    elif sys.argv[6].lower()=='style':
        X_train = csr_matrix(s_train)
        X_test = csr_matrix(s_test)

    elif sys.argv[6].lower()=='all':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)

        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train_tf = tfidf_transformer.transform(X_train)
        X_test_tf = tfidf_transformer.transform(X_test)
        print(type(X_train_tf))

        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train_lda = lda_model.transform(X_train)
        X_test_lda = lda_model.transform(X_test)
        print(type(X_train_lda))

        X_train = csr_matrix(sparse.hstack([X_train_tf, csr_matrix(X_train_lda), csr_matrix(s_train)]))
        X_test = csr_matrix(sparse.hstack([X_test_tf, csr_matrix(X_test_lda), csr_matrix(s_test)]))

        print(type(X_train))

        # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train)
        # sparse.save_npz("X_test" + sys.argv[6] + ".npz", X_test)

    else:
        sys.exit('unknown features')

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)

    # sparse.save_npz("y_train" + sys.argv[6] + ".npz", y_train)
    # sparse.save_npz("y_test" + sys.argv[6] + ".npz", y_test)

    # load everything back
    # X_train = sparse.load_npz("X_train.npz")

    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)

    model.model.save(sys.argv[5] + '.h5')

    model = keras.models.load_model(sys.argv[5] + '.h5')
    score = model.evaluate(X_test, y_test,
                           batch_size=batch_size, verbose=1)

    print('Test accuracy:', score[1])

    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    # convert one-hot test labels and predicted probabilities to class indices before scoring
    predicted = np.argmax(y_pred, axis=1)
    p, r, fs, s = precision_recall_fscore_support(np.argmax(y_test, axis=1), predicted)
    print(p, r, fs, s)
Example No. 18
 def __init__(self):
     self.stemmer = snowball.SnowballStemmer("english")
Example No. 19
# @FileName: myNLTKStem.py
# @Software: PyCharm Community Edition
# @introduction: stemming (word stem extraction)

import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb


words = ['table', 'probably', 'wolves', 'playing', 'is', 'dog',
         'the', 'beaches', 'grounded', 'dreamt', 'envision'
         ]

# Stemming
print('=======PorterStemmer=========')
stemmer = pt.PorterStemmer()
for word in words:
    stem = stemmer.stem(word)
    print(stem)

print('=======LancasterStemmer=========')
stemmer = lc.LancasterStemmer()
for word in words:
    stem = stemmer.stem(word)
    print(stem)

print('=======SnowballStemmer=========')
stemmer = sb.SnowballStemmer('english')
for word in words:
    stem = stemmer.stem(word)
    print(stem)