Example No. 1
    def algorithms(self):
        """Returns a list of stemming algorithms provided by the py-stemmer
        library.
        """

        import Stemmer  # @UnresolvedImport

        return Stemmer.algorithms()
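A minimal usage sketch of what this helper exposes, assuming the PyStemmer package is installed (the word 'running' is just an illustration):

import Stemmer

print(Stemmer.algorithms())           # e.g. ['danish', 'dutch', 'english', ...]
stemmer = Stemmer.Stemmer('english')  # any name from that list is a valid argument
print(stemmer.stemWord('running'))    # 'run'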
Example No. 2
    def add_subparser(cls, parser):
        subparser = parser.add_parser("set-stemmer",
                                      help="Configure a stemmer")

        subparser.set_defaults(run=cls.run)

        subparser.add_argument("language", choices=Stemmer.algorithms(),
                               help="Stemmer language")
Example No. 3
def porterStemmer(string):

    """
    Accepts a string and, optionally, a stemmer function working on
    single words; it defaults to the nltk PorterStemmer algorithm.

    Returns a stemmed string.
    """

    return Stemmer.stem(string)
Example No. 4
 def _create_stemmers():
     """Create stemmers dictionary for all possible languages."""
     stemmers_initialized = {}
     for src_lang in Stemmer.algorithms():
         try:
             dst_lang = _lang_map.get(src_lang)
             if dst_lang:
                 stemmers_initialized[dst_lang] = Stemmer.Stemmer(src_lang, 40000)
         except (TypeError, KeyError):
             pass
     return stemmers_initialized
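For illustration, a hedged sketch of how the dictionary built above might be used; `_lang_map` and the import are assumptions, since neither appears in the excerpt:

import Stemmer  # the function above relies on this module-level import

_lang_map = {'english': 'en', 'french': 'fr'}  # illustrative mapping, not the original one

stemmers = _create_stemmers()
print(stemmers['en'].stemWord('stemming'))     # e.g. 'stem'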
Example No. 5
 def stemming(lang, stemming, words):
     """Lemmatize text.
     :param lang: lang text to lemmatize
     :param stemming: number loops of lemmatizing
     """
     import Stemmer as stemmer
     try:
         stemmer = stemmer.Stemmer(lang)
         for i in range(stemming):
             words = stemmer.stemWords(words)
         return words
     except KeyError:
         return words
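A short usage sketch of the helper above; the word list is illustrative:

print(stemming('english', 1, ['running', 'jumps', 'easily']))
# one stemming pass over the list, e.g. ['run', 'jump', 'easili']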
Example No. 6
import sys
import Stemmer

# search_string = "Sachin Ramesh Tendulkar"

if len(sys.argv) < 3:
    print("Invalid arguments")
    sys.exit(1)

#search_string = ""
#for i in range(2, len(sys.argv)):
#  search_string += sys.argv[i] + " "

#search_string = search_string.strip()
index_file = sys.argv[1]
search_string = sys.argv[2]
index_file = index_file + "inverted_index.txt"

stemmer = Stemmer.Stemmer('english')

field_flag = 0
try:
    field_flag = search_string.index(":")
except ValueError:
    pass

if field_flag == 0:
    search_string = search_string.strip()
    words = search_string.split()
    for word in words:
        stemmed_word = word.lower()
        stemmed_word = stemmer.stemWord(stemmed_word)
        search(index_file, word, stemmed_word, field_flag)
Example No. 7
def NMF_2():

	english_stemmer = Stemmer.Stemmer('en')
	class StemmedTfidfVectorizer(TfidfVectorizer):

		def build_analyzer(self):
			analyzer = super(TfidfVectorizer, self).build_analyzer()
			return lambda doc: english_stemmer.stemWords(analyzer(doc))

	cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

	print("Loading 20 newsgroups dataset for categories:")
	pprint(list(cats))

	newsgroups = fetch_20newsgroups(subset='all', categories = cats)
	
	print("%d documents" % len(newsgroups.data))
	print("%d categories" % len(newsgroups.target_names))

	print("Creating stemmed TFxIDF representation...")
	t0 = time()

	vect = StemmedTfidfVectorizer(stop_words='english')
	vectors = vect.fit_transform(newsgroups.data) # TFxIDF representation

	print("Done in %fs" % (time() - t0))
	print("n_samples: %d, n_features: %d" % vectors.shape)

	workbook = xlsxwriter.Workbook('partC_NMF.xlsx')

	print("Implementing NMF of dimension 2 on data...")

	nmf_ = NMF(n_components=2) # alpha value? l1 value?
	nmf_data = nmf_.fit_transform(vectors)

	print("Done.")

	print("Implementing non-linear transform on data...")

	offset = 0.001
	nmf_data_off=np.add(nmf_data,offset)
	log_nmfdata=np.log(nmf_data_off)

	print("Done.")

	labels = newsgroups.target
	labels_2 = []

	# Changing the labels from 0-7 to 0-1 
	for mark in labels:
		if mark <= 3:
			labels_2.append(0)
		else:
			labels_2.append(1)

	k = 2

	km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)

	print("Clustering sparse data with %s" % km)
	t0 = time()
	km.fit(nmf_data)
	labels_nmf = km.labels_.copy()  # keep the labels from the fit on the raw NMF data before refitting
	km.fit(log_nmfdata)
	print("done in %0.3fs" % (time() - t0))

	# Transforming data back
	data2D = km.transform(nmf_data)
	data2D_logarithm =  km.transform(log_nmfdata)

	plt.figure(1)

	plt.subplot(221)
	print("Plotting labels of Kmeans algorithm using NMF")
	plt.title('NMF Dim 2 Kmeans Algorithm with NMF')
	plt.scatter(nmf_data[:,0], nmf_data[:,1], c=labels_nmf)
	
	plt.subplot(222)
	print("Plotting ground truth")
	plt.title('True labels of data')
	plt.scatter(nmf_data[:,0], nmf_data[:,1], c=labels_2)

	plt.subplot(223)
	print("Plotting labels of Kmeans algorithm with nonlinear transform NMF")
	plt.title('NMF Dim 2 Kmeans Algorithm Nonlinear transform')
	plt.scatter(log_nmfdata[:,0], log_nmfdata[:,1], c=km.labels_)
	
	plt.subplot(224)
	print("Plotting ground truth with nonlinear transform")
	plt.title('Ground truth, nonlinear transform')
	plt.scatter(log_nmfdata[:,0], log_nmfdata[:,1], c=labels_2)


	plt.show()

	print ("Done.")
Example No. 8
def algorithms():
    if cext_available:
        return Stemmer.algorithms()
    else:
        return list(_languages.keys())
Example No. 9
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
       
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    
    '''
    ix = (merge['brand_name']==merge['brand_name']) & \
            (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' +merge['name'][ix]
    '''
    
    
    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]), random_state=233, train_size=0.90)
    
    del train
    del test
    gc.collect()
    
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))
    
    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:    
        wb = CountVectorizer()
        if 'X_orig' not in locals():
            X_orig = wb.fit_transform(merge[col])
        else:
            X_orig = hstack((X_orig, wb.fit_transform(merge[col])))
        print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocsr()
    
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 3, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 100, 1, 0), dtype=bool)]    
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    
    '''
    Stemmer
    '''
    
    # https://github.com/skbly7/usefulness/blob/ed11cd55080d553cf62873999a5e00b154057fbc/textpreprocess.py
    from nltk.tokenize import WordPunctTokenizer    # This is better for sentences containing unicode, like: u"N\u00faria Espert"
    word_tokenize = WordPunctTokenizer().tokenize
    import Stemmer
    import string
    ps = Stemmer.Stemmer("english")
    _wsre = re.compile(r"\s+")
    _alphanumre = re.compile(r"[\w\-\' ]", re.UNICODE)
    def _removestopwords(txtwords):
        global stoplist
    #    stoplist = stopwords.words("english")
        if stoplist is None:
            stoplist = frozenset([l.strip() for l in open(STOPFILE).readlines()])
        return [[w for w in t if w not in stoplist] for t in txtwords]
    
    def _stem(txtwords):
        return [stemmer.stemWords(t) for t in txtwords]
    
    def _removenonalphanumericchars(txtwords):
        return [["".join([c for c in w if _alphanumre.search(c) is not None]) for w in t] for t in txtwords]
    
    
    def _stripallwhitespace(txts):
        return [_wsre.sub("", txt) for txt in txts]
    stemmer = Stemmer.Stemmer("english")

    def textpreprocess(txt, 
                       sentencetokenize=False, 
                       replacehyphenbyspace=True, 
                       wordtokenize=False,
                       lowercase=True,
                       stem=True, 
                       removenonalphanumericchars=True, 
                       stripallwhitespace=True):
        """
        Note: For html2text, one could also use NCleaner (common.html2text.batch_nclean)
        Note: One could improve the sentence tokenization, by using the
        original HTML formatting in the tokenization.
        Note: We use the Porter stemmer. (Optimization: Shouldn't rebuild
        the PorterStemmer object each time this function is called.)
        """
    
        if sentencetokenize:
            txts = nltk.word_tokenize(txt)
            #txts = tokenizer.tokenize(txt.split())
        else:
            txts = txt.split()
        txt = None
        
        if replacehyphenbyspace:
            txts = [t.replace("-", " ") for t in txts]
    
        if wordtokenize:
            txtwords = [word_tokenize(t) for t in txts]
        else:
            txtwords = [t.split() for t in txts]
        txts = None
    
        if lowercase:
            txtwords = [[w.lower() for w in t] for t in txtwords]
    
        if stem:
            txtwords = _stem(txtwords)
    
        # TODO: Maybe remove Unicode accents? http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    
        if removenonalphanumericchars:
            txtwords = _removenonalphanumericchars(txtwords)
    
        txtwords = [[w for w in t if w != ""] for t in txtwords]
    
        txts = [" ".join(words) for words in txtwords]
    
        if stripallwhitespace:
            for _ in range(2):
                txts = _stripallwhitespace(txts)

        return " ".join(txts)

    print('[{}] Start stemming'.format(time.time() - start_time))
    merge['stem_name'] =  [textpreprocess(s) for s in merge["name"].values]
    print('[{}] Stemming completed'.format(time.time() - start_time))
    
    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns
    
    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
              ['brand_name',  'item_condition_id_str'],
              ['brand_name',  'subcat_1'],
              ['brand_name',  'subcat_2'],
              ['brand_name',  'general_cat'],
              #['brand_name',  'subcat_1',  'item_condition_id_str'],
              #['brand_name',  'subcat_2',  'item_condition_id_str'],
              #['brand_name',  'general_cat',  'item_condition_id_str'],
              ['brand_name',  'shipping_str'],
              ['shipping_str',  'item_condition_id_str'],
              ['shipping_str',  'subcat_2'],
              ['item_condition_id_str',  'subcat_2']          
              )
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(
        merge.select_dtypes(include=['object']).columns)
    
    D = 2**30
    for k, v in crossed_columns_d.items():
        print ('Crossed column ', k)
        outls_ = []
        indicator = 0 
        for col in v:
            outls_.append((np.array(merge[col].apply(hash)))%D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del(lb)
    
    
    '''
    Hash name
    '''
    
    
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze= True
    X_name = wb.fit_transform(merge['name'])
    del(wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))    
    
    '''
    Hash category
    '''
    
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 20, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze= True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del(wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))
    
    '''
    Count category
    '''
    
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
    
    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                                  "idf": None})
                             , procs=8)
    wb.dictionary_freeze= True
    X_description = wb.fit_transform(merge['item_description'])
    del(wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))
    
    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
    
    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
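    # NOTE: X_stem_name below is assumed to be a vectorized form of merge['stem_name']; its construction is not shown in this excerpt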
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_stem_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat,
                           x_col, X_stem_name)).tocsr()
    
    
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    
    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[trnidx], y.values[validx]
        
    model = FM_FTRL(alpha=0.005, beta=0.005, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.005, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=1, inv_link="identity", threads=threads) #iters=15
    
    baseline = 1.
    for i in range(15):
        model.fit(train_X , train_y , verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
        
    
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532 
        # Full data 0.424681
    
    
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
Example No. 10
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
import heapq, operator

# Using Stemmer from the PyStemmer package because it's faster than the nltk stemmer
# Package can be downloaded from here: https://pypi.python.org/pypi/PyStemmer
import Stemmer
english_stemmer = Stemmer.Stemmer('en')


# Extension of the normal Tfidf vectorizer so that it stems words before analyzing
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))


# Download the 20 newsgroups dataset
documents = fetch_20newsgroups()

# Sample queries representing the interests of our users
queries = {
    "soccer":
    "soccer goal league championship striker player score coach football",
    "music":
    "music album cd lp song singer play listen genre album band",
    "cars":
    "car motor fuel petrol cylinder steering drive hybrid chassis engine mph",
    "films":
    "film movie actor director role genre scene camera",
Example No. 11
	estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, bins=1e5)

words = []
directory = parsed.src_texts
n = parsed.n
output = parsed.o

for filename in os.listdir(directory):
	with open (directory+"/"+filename, "r") as file:
		inp = file.read()
		if parsed.text_encoding:
			inp = inp.decode(parsed.text_encoding)

		if filename != ".DS_Store":
			if parsed.word_type == "stem":
				stemmer = Stemmer.Stemmer('russian')
				words += stemmer.stemWords([inp])
			elif parsed.word_type == "surface_all":
				words += nltk.word_tokenize(inp)
			elif parsed.word_type == "surface_no_pm" or parsed.word_type[:7] == "suffix_":
				inp = inp.translate(None, string.punctuation)
				words += nltk.word_tokenize(inp)
			else:
				words += nltk.word_tokenize(inp)
			

if parsed.word_type[:7] == "suffix_":
	l = int(parsed.word_type.split("_")[1])
	words = [x[-l:] for x in words]

if parsed.unknown_word_freq:
Example No. 12
import Stemmer as ps

# create an English stemmer instance
stemmer = ps.Stemmer('english')


def stemText(words):
    return [stemmer.stemWord(word) for word in words]


def stemArticles(articles):
    return {i: stemText(words) for i, words in articles.items()}
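A brief usage sketch of the two helpers above (the articles dict is illustrative):

articles = {1: ['cats', 'running', 'quickly'], 2: ['stemmers', 'reduce', 'words']}
print(stemArticles(articles))  # e.g. {1: ['cat', 'run', 'quick'], 2: ['stemmer', 'reduc', 'word']}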
Example No. 13
 def build_analyzer(self):
     analyzer = super(CountVectorizer, self).build_analyzer()
     if self.is_lemma:
         return lambda doc: [self.wnl.lemmatize(t) for t in analyzer(doc)]
     else:
         return lambda doc: Stemmer.Stemmer('en').stemWords(analyzer(doc))
Example No. 14
def graph_data_from_links(links,
                          filter_largest_subgraph=False,
                          ignore_self_loop=True,
                          directed=False):
    print('start graph data')
    import csv, random, io, sys, os
    import collections
    import time
    import string
    import Stemmer

    csv.field_size_limit(
        sys.maxsize
    )  # http://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072

    import nltk.tokenize
    from nltk.corpus import stopwords

    stemmer = Stemmer.Stemmer('english')
    stopwords = set(stopwords.words('english')).union(
        set(stopwords.words('french')))
    punc_table = dict(
        (ord(char), ' ') for char in string.punctuation if char not in '_-')

    def tokenize(string):
        string = string.translate(punc_table)  # remove punctuation
        for begin, end in nltk.tokenize.WhitespaceTokenizer().span_tokenize(
                string):
            word = string[begin:end]
            if not word.isdigit() and word not in stopwords:
                yield word

    X = io.StringIO()
    X_writer = csv.writer(X, delimiter=' ')
    DTM = io.StringIO()
    DTM_writer = csv.writer(DTM, delimiter=' ')

    links = list(csv.reader(io.StringIO(links)))

    print('links loaded')

    if filter_largest_subgraph:
        # amazingly complex algorithm to find the subgraph and filter the links :)
        groups = {}
        groups_sizes = []
        g = 0
        for link in links:
            if len(link) > 1:
                source, target = link[0], link[1]
                if source not in groups and target in groups:
                    groups[source] = groups[target]
                    groups_sizes[groups[target]] += 1
                elif source in groups and target not in groups:
                    groups[target] = groups[source]
                    groups_sizes[groups[source]] += 1
                elif source not in groups and target not in groups:
                    groups[target], groups[source] = g, g
                    groups_sizes.append(0)
                    groups_sizes[g] += 2
                    g += 1
                elif groups[target] != groups[source]:
                    if groups_sizes[groups[target]] > groups_sizes[
                            groups[source]]:
                        for node, group in groups.items():
                            if group == groups[source]:
                                groups[node] = groups[target]
                    else:
                        for node, group in groups.items():
                            if group == groups[target]:
                                groups[node] = groups[source]
        best_group = groups_sizes.index(max(groups_sizes))

        links = [
            link for link in links
            if len(link) > 1 and groups[link[0]] == best_group
        ]

        print('filtered largest subgraph')

    if not directed:
        # another amazing algo to symmetrize the links in case of undirected graphs
        new_links = []
        for link in links:
            new_links.append(link)
            new_links.append([link[1], link[0]] + link[2:])
        links = new_links

        print('symmetry forced')

    nodes_i = {}  # fast lookup of index
    terms_i = {}  # fast lookup of index
    nodes = []  # labels
    terms = []  # dictionnary
    stemm_to_lemm = {}

    def node_to_i(node):
        if node in nodes_i:
            return nodes_i[node]
        nodes.append(node)
        i = len(nodes) - 1
        nodes_i[node] = i
        return i

    def term_to_i(term):
        if term in terms_i:
            return terms_i[term]
        terms.append(term)
        i = len(terms) - 1
        terms_i[term] = i
        return i

    print('start making edges', len(links))

    edges = collections.OrderedDict()
    for link in links:
        if len(link) > 1:

            # tokenization
            tokens = []
            text = link[2] if len(link) > 2 else ''
            tokens = list(tokenize(text))

            # stemming
            if len(tokens) > 0:  # filter empty links
                start = node_to_i(link[0])
                end = node_to_i(link[1])
                if not ignore_self_loop or start != end:
                    edge_name = '%d,%d' % (start, end)
                    if edge_name not in edges:
                        edges[edge_name] = collections.Counter()
                    doc_terms = edges[edge_name]
                    for token, stemm in zip(tokens, stemmer.stemWords(tokens)):
                        token = token.lower()
                        if stemm in stemm_to_lemm:
                            lemm = stemm_to_lemm[stemm]
                        else:
                            stemm_to_lemm[stemm] = token.lower()
                            lemm = token
                        doc_terms[lemm] += 1

    print('edges made')

    def key_to_order_for_tdm(edge_name):
        start, end = [int(x) for x in edge_name.split(',')]
        return start + end * len(nodes)

    for curr_edge, edge_name in enumerate(
            sorted(edges.keys(), key=key_to_order_for_tdm)):
        start, end = edge_name.split(',')
        X_writer.writerow([start, end, 1])
        for token, count in edges[edge_name].items():
            DTM_writer.writerow([term_to_i(token), curr_edge, count])

    # add empty link to make the matrix square if it's not already a square
    start = end = len(nodes) - 1
    edge_name = '%d,%d' % (start, end)
    if edge_name not in edges:
        X_writer.writerow([start, end, 0])

    labels = io.StringIO()
    labels_writer = csv.writer(labels, delimiter=' ')
    labels_writer.writerow(nodes)

    dictionnary = io.StringIO()
    dictionnary_writer = csv.writer(dictionnary, delimiter=' ')
    dictionnary_writer.writerow(terms)

    print('data done')

    return {
        'edges': X.getvalue(),
        'tdm': DTM.getvalue(),
        'labels': labels.getvalue(),
        'dictionnary': dictionnary.getvalue()
    }
Example No. 15
        adjusted_score = ''
    else:
        adjusted_score = float(row[2]) * (1 - similarity_score)
    out_row = row[:3] + row[5:7] + [adjusted_score]
    return out_row


if __name__ == "__main__":

    # Config
    DATA_DIR = '../data/aligner_output/'
    ALIGNMENT_OUTPUT = os.path.join(DATA_DIR, 'ncsl_alignments.csv')
    SCORES = os.path.join(DATA_DIR, 'ncsl_alignments_notext.csv')

    n = 1000  # size of comparison samples
    stemmer = Stemmer.Stemmer('english').stemWord

    logging.basicConfig(level=logging.INFO)

    with open(ALIGNMENT_OUTPUT, 'r', encoding='utf-8') as infile,\
         open(SCORES, 'w') as scorefile:

        reader = csv.reader(infile, delimiter=',', quotechar='"')
        score_writer = csv.writer(scorefile,
                                  delimiter=',',
                                  quotechar='"',
                                  quoting=csv.QUOTE_MINIMAL)

        # Count total number of rows in data
        m = sum([1 for row in reader])
        infile.seek(0)
Example No. 16
import BasesFrases.Base_treinamento
import BasesFrases.Base_teste
import BasesFrases.Stop_words
import Stemmer
import Erros_classificador

# download updates
#nltk.download()

# variables
baseTreinamento = BasesFrases.Base_treinamento.vet_baseTreinamento
baseTeste = BasesFrases.Base_teste.vet_baseTeste
stopWords = BasesFrases.Stop_words.stopWordsNLTK

# applying stemming
frasesComStemmingTreinamento = Stemmer.aplicaStemmer(baseTreinamento)
frasesComStemmingTeste = Stemmer.aplicaStemmer(baseTeste)

# look up each of the words after breaking them all into stems
palavrasTreinamento = Stemmer.buscaPalavras(frasesComStemmingTreinamento)
palavrasTeste = Stemmer.buscaPalavras(frasesComStemmingTeste)

# number of times each word is repeated
frequenciaTreinamento = Stemmer.buscaFrequencia(palavrasTreinamento)
frequenciaTeste = Stemmer.buscaFrequencia(palavrasTeste)

# words that do not repeat
palavrasUnicasTreinamento = Stemmer.buscaPalavrasUnicas(frequenciaTreinamento)
palavrasUnicasTeste = Stemmer.buscaPalavrasUnicas(frequenciaTeste)

Example No. 17
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import Stemmer
import spacy
"""
Programa que se encarga de buscar las palabras emocionales dentro de una frase y obtener sus lexemas.
"""
"""
Para obtener los lexemas utilizamos el paquete en español de Stemmer.
"""
stemmer = Stemmer.Stemmer('spanish')
nlp = spacy.load('es')
"""
Algunas de las palabras en nuestro diccionario comparten lexema y hay que tratarlas de manera específica
para que no haya conflicto al buscarlas.
"""
iguales = [
    "amig", "espos", "enfad", "guap", "habit", "her", "jubil", "novi", "odi",
    "pioj"
]
buscar_iguales = [
    "amigo", "esposo", "enfado", "guapo", "habitante", "herido", "jubiloso",
    "novio", "odio", "piojo"
]

derivables = ["afect", "asesin", "com", "inspir", "libr", "salud", "verd"]
derivadas = [["afecto", "afectivo", "afectiva", "afectuso", "afectividad"],
             ["asesino", "asesinato"], ["comida", "comedor"],
             ["inspirado", "inspiración"], ["libre", "librar"],
             ["saludar", "saludo"], ["verde", "verdoso", "verdear"]]
Example No. 18
]:
    FORMAT = F_ZLEGACY

# scanData.py <hgw_file> [--stopcats=<stop category file>]

hgwpath = args[0]  # hgw/gum.xml

TITLE_WEIGHT = 4
STOP_CATEGORY_FILTER = bool(options.stopcats)

# reToken = re.compile('[a-zA-Z\-]+')
reToken = re.compile("[^ \t\n\r`~!@#$%^&*()_=+|\[;\]\{\},./?<>:’'\\\\\"]+")
reAlpha = re.compile("^[a-zA-Z\-_]+$")
NONSTOP_THRES = 100

STEMMER = Stemmer.Stemmer('porter')

# read stop word list from 'lewis_smart_sorted_uniq.txt'
wordList = []
try:
    f = open('lewis_smart_sorted_uniq.txt', 'r')
    for word in f.readlines():
        wordList.append(word.strip())
    f.close()
except IOError:
    print('Stop words cannot be read! Please put the "lewis_smart_sorted_uniq.txt" file containing stop words in this folder.')
    sys.exit(1)

STOP_WORDS = frozenset(wordList)

if STOP_CATEGORY_FILTER:
Example No. 19
def algorithms():
    if cext_available:
        return Stemmer.algorithms()
    else:
        return list(_languages.keys())
Example No. 20
from Stemmer import *
s = Stemmer('russian')
while True:
    print(s.stemWord(input()))
Example No. 21
def PystemStemming(text):
    stemmer_rus = Stemmer.Stemmer('russian')
    stemmer_en = Stemmer.Stemmer('english')
    words = text.split(" ")
    words_out = stemmer_en.stemWords(stemmer_rus.stemWords(words))
    return " ".join(words_out)
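A usage sketch for the function above; the mixed Russian/English sentence is illustrative, and a module-level `import Stemmer` is assumed since it is not shown in the excerpt:

print(PystemStemming("кошки бегают running fast"))
# each whitespace-separated token is passed through the Russian and then the English stemmer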
Example No. 22
 def __init__(self):
     self.nepali_stemmer = Stemmer.Stemmer(
         'nepali')  #initializing nepali stemmer
Example No. 23
import Stemmer

def main():
    stemmer = Stemmer.Stemmer("english")
    print(stemmer.stemWord("cardsing"))
Example No. 24
class EnglishTfidfVectorizer(TfidfVectorizer):
    english_stemmer = Stemmer.Stemmer('en')

    def build_analyzer(self):
        analyzer = super(EnglishTfidfVectorizer, self).build_analyzer()
        return lambda doc: self.english_stemmer.stemWords(analyzer(doc))
Example No. 25
import re
import string
import Stemmer

# top 25 most common words in English and "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
STOPWORDS = set([
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it',
    'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but',
    'his', 'by', 'from', 'developer', 'engineer', 'quận', 'thành', 'huyện',
    'phố', 'city', 'district', 'street'
])
PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
STEMMER = Stemmer.Stemmer('english')


def tokenize(text):
    return text.split()


def lowercase_filter(tokens):
    return [token.lower() for token in tokens]


def punctuation_filter(tokens):
    return [PUNCTUATION.sub('', token) for token in tokens]


def stopword_filter(tokens):
    return [token for token in tokens if token not in STOPWORDS]
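The excerpt stops at the stopword filter; below is a minimal sketch of how these pieces are typically chained (the `stem_filter` and `analyze` names are assumptions, not part of the original):

def stem_filter(tokens):
    return STEMMER.stemWords(tokens)


def analyze(text):
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    return stem_filter(tokens)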
Example No. 26
 def __init__(self):
     self.alphabet = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
     self.allwords = AllWordsDB()
     self.stemmer = Stemmer.Stemmer('russian')
Example No. 27
    def _get_stemmer_fn(self):
        import Stemmer  # @UnresolvedImport

        stemmer = Stemmer.Stemmer(self.lang)
        stemmer.maxCacheSize = self.cachesize
        return stemmer.stemWord
Example No. 28
    def __init__(self, topic_model, api=None):

        self.resource_id = None
        self.stemmer = None
        self.seed = None
        self.case_sensitive = False
        self.bigrams = False
        self.ntopics = None
        self.temp = None
        self.phi = None
        self.term_to_index = None
        self.topics = []

        if not (isinstance(topic_model, dict) and 'resource' in topic_model
                and topic_model['resource'] is not None):
            if api is None:
                api = BigML(storage=STORAGE)
            self.resource_id = get_topic_model_id(topic_model)
            if self.resource_id is None:
                raise Exception(
                    api.error_message(topic_model,
                                      resource_type='topicmodel',
                                      method='get'))
            query_string = ONLY_MODEL
            topic_model = retrieve_resource(api,
                                            self.resource_id,
                                            query_string=query_string)
        else:
            self.resource_id = get_topic_model_id(topic_model)

        if 'object' in topic_model and isinstance(topic_model['object'], dict):
            topic_model = topic_model['object']

        if 'topic_model' in topic_model \
                and isinstance(topic_model['topic_model'], dict):
            status = get_status(topic_model)
            if 'code' in status and status['code'] == FINISHED:

                model = topic_model['topic_model']
                self.topics = model['topics']

                if 'language' in model and model['language'] is not None:
                    lang = model['language']
                    if lang in CODE_TO_NAME:
                        self.stemmer = Stemmer.Stemmer(CODE_TO_NAME[lang])

                self.term_to_index = {
                    self.stem(term): index
                    for index, term in enumerate(model['termset'])
                }

                self.seed = abs(model['hashed_seed'])
                self.case_sensitive = model['case_sensitive']
                self.bigrams = model['bigrams']

                self.ntopics = len(model['term_topic_assignments'][0])

                self.alpha = model['alpha']
                self.ktimesalpha = self.ntopics * self.alpha

                self.temp = [0] * self.ntopics

                assignments = model['term_topic_assignments']
                beta = model['beta']
                nterms = len(self.term_to_index)

                sums = [
                    sum(n[index] for n in assignments)
                    for index in range(self.ntopics)
                ]

                self.phi = [[0 for _ in range(nterms)]
                            for _ in range(self.ntopics)]

                for k in range(self.ntopics):
                    norm = sums[k] + nterms * beta
                    for w in range(nterms):
                        self.phi[k][w] = (assignments[w][k] + beta) / norm

                ModelFields.__init__(self, model['fields'])
            else:
                raise Exception("The topic model isn't finished yet")
        else:
            raise Exception("Cannot create the topic model instance. Could not"
                            " find the 'topic_model' key in the"
                            " resource:\n\n%s" % topic_model)
Example No. 29
#ddir = 'E:/workspace/data/cdiscount/'
#wdir = 'C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/'
ddir = '/home/ngaude/workspace/data/cdiscount/'
wdir = '/home/ngaude/workspace/github/kaggle/cdiscount/'

stopwords = []
with open(wdir + 'stop-words_french_1_fr.txt', "r") as f:
    stopwords += f.read().split('\n')

with open(wdir + 'stop-words_french_2_fr.txt', "r") as f:
    stopwords += f.read().split('\n')

stopwords += nltk.corpus.stopwords.words('french')
stopwords += ['voir', 'presentation']
stopwords = set(stopwords)
stemmer = Stemmer.Stemmer('french')

rayon = pd.read_csv(ddir + 'rayon.csv', sep=';')
itocat1 = list(np.unique(rayon.Categorie1))
cat1toi = {cat1: i for i, cat1 in enumerate(itocat1)}
itocat2 = list(np.unique(rayon.Categorie2))
cat2toi = {cat2: i for i, cat2 in enumerate(itocat2)}
itocat3 = list(np.unique(rayon.Categorie3))
cat3toi = {cat3: i for i, cat3 in enumerate(itocat3)}

f_itocat = ddir + 'joblib/itocat'
itocat = (itocat1, cat1toi, itocat2, cat2toi, itocat3, cat3toi)
joblib.dump(itocat, f_itocat)


def normalize_txt(txt):
Example No. 30
import json
import sys
import re
import Stemmer
import bisect
import math
from collections import defaultdict
import time

stemmer = Stemmer.Stemmer("english")

STOP_WORDS = set(['whence', 'here', 'show', 'were', 'why', 'n’t', 'the', 'whereupon', 'not', 'more', 'how', 'eight', 'indeed', 'i', 'only', 'via', 'nine', 're', 'themselves', 'almost', 'to', 'already', 'front', 'least', 'becomes', 'thereby', 'doing', 'her', 'together', 'be', 'often', 'then', 'quite', 'less', 'many', 'they', 'ourselves', 'take', 'its', 'yours', 'each', 'would', 'may', 'namely', 'do', 'whose', 'whether', 'side', 'both', 'what', 'between', 'toward', 'our', 'whereby', "'m", 'formerly', 'myself', 'had', 'really', 'call', 'keep', "'re", 'hereupon', 'can', 'their', 'eleven', '’m', 'even', 'around', 'twenty', 'mostly', 'did', 'at', 'an', 'seems', 'serious', 'against', "n't", 'except', 'has', 'five', 'he', 'last', '‘ve', 'because', 'we', 'himself', 'yet', 'something', 'somehow', '‘m', 'towards', 'his', 'six', 'anywhere', 'us', '‘d', 'thru', 'thus', 'which', 'everything', 'become', 'herein', 'one', 'in', 'although', 'sometime', 'give', 'cannot', 'besides', 'across', 'noone', 'ever', 'that', 'over', 'among', 'during', 'however', 'when', 'sometimes', 'still', 'seemed', 'get', "'ve", 'him', 'with', 'part', 'beyond', 'everyone', 'same', 'this', 'latterly', 'no', 'regarding', 'elsewhere', 'others', 'moreover', 'else', 'back', 'alone', 'somewhere', 'are', 'will', 'beforehand', 'ten', 'very', 'most', 'three', 'former', '’re', 'otherwise', 'several', 'also', 'whatever', 'am', 'becoming', 'beside', '’s', 'nothing', 'some', 'since', 'thence', 'anyway', 'out', 'up', 'well', 'it', 'various', 'four', 'top', '‘s', 'than', 'under', 'might', 'could', 'by', 'too', 'and', 'whom', '‘ll', 'say', 'therefore', "'s", 'other', 'throughout', 'became', 'your', 'put', 'per', "'ll", 'fifteen', 'must', 'before', 'whenever', 'anyone', 'without', 'does', 'was', 'where', 'thereafter', "'d", 'another', 'yourselves', 'n‘t', 'see', 'go', 'wherever', 'just', 'seeming', 'hence', 'full', 'whereafter', 'bottom', 'whole', 'own', 'empty', 'due', 'behind', 'while', 'onto', 'wherein', 'off', 'again', 'a', 'two', 'above', 'therein', 'sixty', 'those', 'whereas', 'using', 'latter', 'used', 'my', 'herself', 'hers', 'or', 'neither', 'forty', 'thereupon', 'now', 'after', 'yourself', 'whither', 'rather', 'once', 'from', 'until', 'anything', 'few', 'into', 'such', 'being', 'make', 'mine', 'please', 'along', 'hundred', 'should', 'below', 'third', 'unless', 'upon', 'perhaps', 'ours', 'but', 'never', 'whoever', 'fifty', 'any', 'all', 'nobody', 'there', 'have', 'anyhow', 'of', 'seem', 'down', 'is', 'every', '’ll', 'much', 'none', 'further', 'me', 'who', 'nevertheless', 'about', 'everywhere', 'name', 'enough', '’d', 'next', 'meanwhile', 'though', 'through', 'on', 'first', 'been', 'hereby', 'if', 'move', 'so', 'either', 'amongst', 'for', 'twelve', 'nor', 'she', 'always', 'these', 'as', '’ve', 'amount', '‘re', 'someone', 'afterwards', 'you', 'nowhere', 'itself', 'done', 'hereafter', 'within', 'made', 'ca', 'them'])
# print(type(STOP_WORDS))
STOP_WORDS.add("cite")

query = ""
# store = False
k = 10

weight = {
    "t": 100,
    "i": 20,
    "b": 1,
    "c": 20,
    "l": 0.05,
    "r": 0.05
}

first_words = ""    
# SORT_SIZE = 10000
TITLE_SIZE = 2000
Example No. 31
 def __setstate__(self, state):
     self.__dict__ = state
     self._stemmer = Stemmer.Stemmer("english")
Example No. 32
def run():
    parser = argparse.ArgumentParser(description="A chat bot")

    # database options
    db_parser = argparse.ArgumentParser(add_help=False)
    db_parser.add_argument(
        '--dbname', default='chains',
        help="Specifies the brain database.")

    # simulation options
    note = ("Note that this option is overridden by database settings and "
            "so is only used at database initialisation time.")
    modelling_parser = argparse.ArgumentParser(add_help=False)
    modelling_parser.add_argument(
        '--chain-order', type=int, default=DEF_CHAIN_ORDER,
        help="Set the simulation chain size parameter. " + note)
    modelling_parser.add_argument(
        '--language', choices=Stemmer.algorithms(), default='english',
        help="Set the simulation language for the stemmer. " + note)

    # learning options
    learning_parser = argparse.ArgumentParser(add_help=False)
    learning_parser.add_argument(
        'infile', metavar='INFILE', nargs='?', type=argparse.FileType('r'),
        default=sys.stdin,
        help="An input file from which to learn")

    # reply options
    reply_parser = argparse.ArgumentParser(add_help=False)
    reply_parser.add_argument(
        'message', metavar='MSG', nargs='+', action='append',
        help="Specify a message to respond to.")

    subparsers = parser.add_subparsers(title='Subcommands', dest='subcommand')
    subparsers.required = True

    ### learn command ###
    learn_subparser = subparsers.add_parser(
        'learn', help="add source data to the corpus",
        parents=[learning_parser, db_parser, modelling_parser])
    learn_subparser.set_defaults(func=do_learn)

    ### response command
    reply_subparser = subparsers.add_parser(
        'reply', help="send a message to get a reply back",
        parents=[reply_parser, db_parser, modelling_parser])
    reply_subparser.set_defaults(func=do_response)

    ### shell command
    shell_subparser = subparsers.add_parser(
        'shell', help="enter an interactive shell",
        parents=[db_parser, modelling_parser])
    shell_subparser.set_defaults(func=do_shell)

    dargs = vars(parser.parse_args())

    for option in ('file', 'message'):
        if dargs.get(option):
            dargs[option] = [x for xs in dargs[option] for x in xs]

    dargs['func'](dargs)
Example No. 33
def NMF_NLT_TFIDF():

	english_stemmer = Stemmer.Stemmer('en')
	class StemmedTfidfVectorizer(TfidfVectorizer):

		def build_analyzer(self):
			analyzer = super(TfidfVectorizer, self).build_analyzer()
			return lambda doc: english_stemmer.stemWords(analyzer(doc))

	cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

	print("Loading 20 newsgroups dataset for categories:")
	pprint(list(cats))

	newsgroups = fetch_20newsgroups(subset='all', categories = cats)
	
	print("%d documents" % len(newsgroups.data))
	print("%d categories" % len(newsgroups.target_names))

	print("Creating stemmed TFxIDF representation...")
	t0 = time()

	vect = StemmedTfidfVectorizer(stop_words='english')
	vectors = vect.fit_transform(newsgroups.data) # TFxIDF representation

	print("Done in %fs" % (time() - t0))
	print("n_samples: %d, n_features: %d" % vectors.shape)

	workbook = xlsxwriter.Workbook('part3_NMF_NLT.xlsx')

	purityMetricsNames = ['Homogeneity', 'Completeness', 'V-measure', 'Adjusted Rand-Index', 'Adjusted Mutual Information Score']

	metric_list = {}

	for i in range(1,21):

		print("Implementing NMF on data...")
		nmf_ = NMF(n_components=i) # 
		nmf_data = nmf_.fit_transform(vectors)
		print("Done.")

		# Applying non-linear transform
		print("Implementing non-linear transform on data...")
		offset = 0.001
		nmf_data_off=np.add(nmf_data,offset)
		log_nmf_data=np.log(nmf_data_off)
		print("Done.")

		labels = newsgroups.target
		labels_2 = []

		# Changing the labels from 0-7 to 0-1 
		for mark in labels:
			if mark <= 3:
				labels_2.append(0)
			else:
				labels_2.append(1)

		k = 2

		km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)

		print("Clustering sparse data with %s" % km)
		t0 = time()
		km.fit(log_nmf_data)
		print("done in %0.3fs" % (time() - t0))

		print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_2, km.labels_))
		print("Completeness: %0.3f" % metrics.completeness_score(labels_2, km.labels_))
		print("V-measure: %0.3f" % metrics.v_measure_score(labels_2, km.labels_))
		print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels_2, km.labels_))
		print("Adjusted Mutual Information Score: %.3f" % metrics.adjusted_mutual_info_score(labels_2, km.labels_))
		print(metrics.confusion_matrix(labels_2, km.labels_))

		purityMetrics = [metrics.homogeneity_score(labels_2, km.labels_), metrics.completeness_score(labels_2, km.labels_),metrics.v_measure_score(labels_2, km.labels_),metrics.adjusted_rand_score(labels_2, km.labels_),metrics.adjusted_mutual_info_score(labels_2, km.labels_)]

		# Writing to .xlsx file (For Confusion Matrix)
		worksheet = workbook.add_worksheet()
		obs = zip(km.labels_,labels_2)

		row = 0
		col = 0

		worksheet.write(row,col,'Predictions')
		worksheet.write(row,col+1,'Actuals')
		worksheet.write(row,col+6,'Dimension')
		worksheet.write(row+1,col+6,i)

		metric_list = dict(zip(purityMetricsNames,purityMetrics))
		pprint(dict(metric_list))

		for key in metric_list.keys():
			row += 1
			worksheet.write(row,col+11,key)
			worksheet.write(row,col+12,metric_list[key])

		row = 0
		col = 0

		for pred, actual in (obs):
			row += 1
			worksheet.write(row,col, pred)
			worksheet.write(row,col+1,actual)

		row = 1

		for things in labels:
			worksheet.write(row,col+2,things)
			row += 1

	workbook.close()
Example No. 34
def tokenizer_snowballer(text):
    stemmer = Stemmer.Stemmer('spanish')
    return [
        stemmer.stemWord(t) for t in token_extract.findall(text.lower())
        if t not in vacias
    ]
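A hedged sketch of the module-level names the tokenizer relies on; `token_extract` and `vacias` are not shown in the excerpt, so the definitions here are assumptions for illustration only:

import re
import Stemmer

token_extract = re.compile(r'[a-záéíóúüñ]+')  # assumed token pattern
vacias = {'de', 'la', 'el', 'y', 'en'}        # assumed Spanish stopword set

print(tokenizer_snowballer('Los gatos corren rápidamente por el jardín'))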
Example No. 35
    def twitter_sentiment_analysis(self, custom_user_dictionary_dict,
                                   clean_tweet):
        '''
		- Determine the sentiment (polarity) of a tweet.
		Note: this simple model isn't trained to handle sarcasm or ironic
		sentences yet, e.g.:

		"Hoy es un maravilloso e impresionante día de mierda"

		This is a typically ironic tweet, but this model will categorize it
		as "Positive" by the majority of positive words. However, some words
		that aren't in positive_dictionary.json or negative_dictionary.json
		could be added by the user and marked as "Positive" or "Negative",
		so a sarcastic tweet could in consequence be categorized correctly,
		but only by the majority of positive and negative words.
		'''
        import json
        from nltk.tokenize import word_tokenize

        import Stemmer
        from collections import Counter

        try:
            response_data = {}
            tokens = word_tokenize(clean_tweet)

            # Define Stemmer for Spanish Language
            stemmer = Stemmer.Stemmer('spanish')

            # Compare received tweet with the positive and negative
            # custom user dictionary
            positive = map(
                lambda x: x in custom_user_dictionary_dict['positive'],
                stemmer.stemWords(tokens))
            negative = map(
                lambda x: x in custom_user_dictionary_dict['negative'],
                stemmer.stemWords(tokens))
            pos = Counter(positive)[True]
            neg = Counter(negative)[True]
            total = pos + neg

            if total > 0:
                if pos == neg:
                    response_data["polarity"] = "NU"  # Neutral
                else:
                    if pos > neg:
                        response_data["polarity"] = "P"
                        # response_data["sentiment"] = (pos*100)/total
                    else:
                        response_data["polarity"] = "N"
                        # response_data["sentiment"] = (neg*100)/total

                response_data["positive_sentiment_score"] = pos / len(tokens)
                response_data["negative_sentiment_score"] = neg / len(tokens)
                response_data["neutral_sentiment_score"] = (
                    len(tokens) - (pos + neg)) / len(tokens)

            else:
                # None token match with any positive or negative word,
                # so it's weak to determine a sentiment score
                response_data["polarity"] = "NU"
                # Note: sentiment score could be 1, if all the words
                # are neutral, it means that it's necessary also a
                # neutral dictionary
                response_data["positive_sentiment_score"] = 0
                response_data["negative_sentiment_score"] = 0
                response_data["neutral_sentiment_score"] = 1

            # We still don't have a way to determine confidence level
            # with a sentiment analysis model from a dictionary with
            # positive and negative words
            response_data["confidence"] = "Undefined"

        except Exception as e:
            response_data = {
                "polarity": None,
                "positive_sentiment_score": None,
                "negative_sentiment_score": None,
                "neutral_sentiment_score": None,
                "confidence": "Undefined"
            }

        return json.dumps(response_data)
Example No. 36
    def __init__(self, topic_model, api=None):

        self.resource_id = None
        self.stemmer = None
        self.seed = None
        self.case_sensitive = False
        self.bigrams = False
        self.ntopics = None
        self.temp = None
        self.phi = None
        self.term_to_index = None
        self.topics = []
        self.api = get_api_connection(api)

        self.resource_id, topic_model = get_resource_dict( \
            topic_model, "topicmodel", api=self.api)

        if 'object' in topic_model and isinstance(topic_model['object'], dict):
            topic_model = topic_model['object']

        if 'topic_model' in topic_model \
                and isinstance(topic_model['topic_model'], dict):
            status = get_status(topic_model)
            if 'code' in status and status['code'] == FINISHED:
                self.input_fields = topic_model['input_fields']
                model = topic_model['topic_model']
                self.topics = model['topics']

                if 'language' in model and  model['language'] is not None:
                    lang = model['language']
                    if lang in CODE_TO_NAME:
                        self.stemmer = Stemmer.Stemmer(CODE_TO_NAME[lang])

                self.term_to_index = {self.stem(term): index for index, term
                                      in enumerate(model['termset'])}

                self.seed = abs(model['hashed_seed'])
                self.case_sensitive = model['case_sensitive']
                self.bigrams = model['bigrams']

                self.ntopics = len(model['term_topic_assignments'][0])

                self.alpha = model['alpha']
                self.ktimesalpha = self.ntopics * self.alpha

                self.temp = [0] * self.ntopics

                assignments = model['term_topic_assignments']
                beta = model['beta']
                nterms = len(self.term_to_index)

                sums = [sum(n[index] for n in assignments) for index
                        in range(self.ntopics)]

                self.phi = [[0 for _ in range(nterms)]
                            for _ in range(self.ntopics)]

                for k in range(self.ntopics):
                    norm = sums[k] + nterms * beta
                    for w in range(nterms):
                        self.phi[k][w] = (assignments[w][k] + beta) / norm

                missing_tokens = model.get("missing_tokens")
                ModelFields.__init__(self, model['fields'],
                                     missing_tokens=missing_tokens)
            else:
                raise Exception("The topic model isn't finished yet")
        else:
            raise Exception("Cannot create the topic model instance. Could not"
                            " find the 'topic_model' key in the"
                            " resource:\n\n%s" % topic_model)
Example No. 37
    def getTerms(self, withPositions=False):
        #start = timeit.default_timer()

        # split by whitespace
        terms = re.split('[\s]', self.text)

        stemmer = Stemmer.Stemmer('english')
        # get the english stopwords
        stopwords = []
        with open('snowball_stopwords_EN.txt', 'r') as document:
            stopwords += list(filter(None, re.split("[ \n]", document.read())))
        document.close()

        if withPositions:
            termsPositions = [
            ]  # [[term0pos0, term0pos1,...], [term1pos0, term1pos1, term1pos2]

        for pos in range(len(terms)):

            # in case there is more than one term in this split by whitespace list position
            tempTermList = []

            # maintain websites
            url_match = re.findall(
                r'(https?://(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9'
                r'][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?://(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|ww'
                r'w\.[a-zA-Z0-9]+\.[^\s]{2,})', terms[pos])
            # maintain emails
            email_match = re.findall(r'[\w.-]+@[\w.-]+', terms[pos])
            # maintain words with hyphens
            hyphen_match = re.findall(r"([A-Za-z]+-[A-Za-z]+)", terms[pos])
            # maintain apostrophes
            apostrophe_match = re.findall(r"([A-Za-z]+'[A-Za-z]*)", terms[pos])
            # maintain acronyms
            acronyms_match = re.findall(r'\b(?:[a-zA-Z]\.){2,}', terms[pos])
            # maintain initialisms (all-caps abbreviations)
            siglas_match = re.findall(r'\b(?:[A-Z]){2,}', terms[pos])

            if url_match:
                if url_match[0].endswith(').') or url_match[0].endswith('),'):
                    url_match = [
                        url_match[0][:-2]
                    ]  # ex: https://www.genomedetective.com/app/typingtool/cov).
                elif url_match[0].endswith(',') or url_match[0].endswith('.') or url_match[0].endswith(')') or \
                        url_match[0].endswith('}'):
                    url_match = [url_match[0][:-1]]
                tempTermList = url_match
            elif email_match:
                tempTermList = email_match
            elif hyphen_match:
                tempTermList = hyphen_match
            elif apostrophe_match:
                if apostrophe_match[0].endswith('\''):
                    apostrophe_match = [apostrophe_match[0][:-1]]
                tempTermList = apostrophe_match
            elif acronyms_match:
                tempTermList = acronyms_match
            elif siglas_match:
                tempTermList = siglas_match
            else:
                # remove html character entities, ex: &nbsp;
                term = re.sub(r'(&.+;)', '', terms[pos])

                # replaces all non-alphabetic characters by a space, splits on whitespace
                tempTermList = re.split('[\s]', re.sub(r'[^A-Za-z]', ' ',
                                                       term))

            while ('' in tempTermList):
                tempTermList.remove('')

            # lowercases all letters
            tempTermList = [term.lower() for term in tempTermList]

            # Removes stopwords from the list of the terms of the document.
            tempTermList = list(
                filter(lambda term: term not in stopwords, tempTermList))
            # Stemmes
            tempTermList = [stemmer.stemWord(term) for term in tempTermList]
            # ignores all tokens with less than 3 characters
            tempTermList = list(filter(lambda t: len(t) >= 3, tempTermList))

            if tempTermList != []:
                if withPositions:  # [[term0pos0, term0pos1,...], [term1pos0, term1pos1, term1pos2]
                    for termInd in range(len(tempTermList)):
                        if tempTermList[termInd] in self.terms:
                            termsPositions[self.terms.index(
                                tempTermList[termInd])] += [pos]
                        else:
                            self.terms += [tempTermList[termInd]]
                            termsPositions += [[pos]]
                else:
                    self.terms += [
                        term for term in tempTermList if term not in self.terms
                    ]

        if withPositions:
            #stop = timeit.default_timer()
            #print('getTerms: {} seconds'.format(stop - start))
            return self.terms, termsPositions

        #stop = timeit.default_timer()
        #print('getTerms: {} seconds'.format(stop - start))
        return self.terms
Example No. 38
    u'Полиция Великобритании нашла основателя WikiLeaks, но, не арестовала',
    u'В Стокгольме и Осло сегодня состоится вручение Нобелевских премий'
]

# words = [u'В ДНР жалуются: Россия не дает денег на пенсию',
#          u'Крушение поезда в Индии: число погибших превысило 100 человек',
#          u'ДНР решила создать компьютерные игры о боях за Дебальцево и аэропорт',
#          u'Турция может не пойти в ЕС, а вступить в ШОС',
#          u'На Донбассе задержан минометчик ДНР, который обстреливал Майорск',
#          u'Политолог рассказал, как ускорить деоккупацию Крыма и Донбасса',
#          u'Столтенбкрг обсудил с Трампом будущее НАТО',
#          u'Обама призвал дать Трампу время и не ждать худшего',
#          u'Под Киевом столкнулись грузовик и автобус, есть погибший']

stop_words_list = stop_words.split(" ")
_stemmer = Stemmer()

words = [w.lower() for w in words]  # convert all the strings to lower case

unsymboled = []
for word in words:
    s = re.sub(r'[.,!?;:{}\[\]()\-_]', '',
               word)  # remove punctuation marks using a regular expression
    unsymboled.append(s)

listed = [s.split(" ")
          for s in unsymboled]  # split the sentences into individual words

new = []
for sentence in listed:
    s = [i for i in sentence