Example #1
def load_stoplist(topic_words=False, lang="en"):
    try:
        if lang == "en":
            if topic_words:
                return set(get_stop_words("en") + STOP_LIST + get_topic_stoplist())
            return set(get_stop_words("en") + STOP_LIST + stopwords.words('english'))
        elif lang == "nl":
            return set(get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL)
    except Exception:
        print("warning: no stopwords were downloaded. check nltk corpora")
        print(format_exc())
        return set()
Example #2
    def test_filters(self):
        language = 'en'
        before = get_stop_words(language, False)
        letter = random.choice(random.choice(before))

        def remove_letter(stopwords, language):
            return [word for word in stopwords if letter not in word]
        stop_words.add_filter(remove_letter)
        after = get_stop_words(language, False)
        for stopword in after:
            self.assertFalse(letter in stopword)
        self.assertTrue(stop_words.remove_filter(remove_letter))
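A quick usage sketch of the same filter hooks outside the test harness (assuming the PyPI stop-words package used throughout these examples):

from stop_words import get_stop_words
import stop_words

# hypothetical filter: drop every stop word shorter than three characters
def drop_short(stopwords, language):
    return [w for w in stopwords if len(w) >= 3]

stop_words.add_filter(drop_short)
filtered = get_stop_words('en', cache=False)   # filters run when the list is fetched
assert all(len(w) >= 3 for w in filtered)
stop_words.remove_filter(drop_short)           # returns True when the filter was registered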
def get_most_freq(all_comments):
    APP_ROOT = os.path.dirname(os.path.abspath(__file__))
    APP_STATIC = os.path.join(APP_ROOT, 'static')
    file_name = os.path.join(APP_STATIC, 'freq_portugues.p')
    dict_freq = pickle.load(open(file_name, "rb" ) )

    web_stopWords = ["q","vc","vcs","tipo","ta","pra","pq","ne","sobre","ser","cara","la"]

    all_comments = remove_accents(all_comments)
    tokens = all_comments.split()

    #build token dictionary
    dict_tokens = {}
    for token in tokens:
        if token in dict_tokens:
            dict_tokens[token] += 1
        else:
            dict_tokens[token] = 1

    #build stop word list
    stopWords = get_stop_words('portuguese', cache=True)
    stopWords += get_stop_words('english', cache=True)
    stopWords += web_stopWords

    #remove stop words
    for word in stopWords:
        dict_tokens.pop(remove_accents(word), None)

    #for word in dict_tokens:
    #    print(dict_tokens[token])
    #    dict_tokens[token] = 1+math.log(dict_tokens[token])

    #sorted by frequency
    sorted_tokens = sorted(dict_tokens.items(), key=operator.itemgetter(1),reverse=True)
    num_tokens = int(min(len(sorted_tokens)/2, 1000))

    sorted_tokens = sorted_tokens[0:num_tokens]

    #normalize by frequency
    standard_frequency = dict_freq["acelga"]
    for i in range(len(sorted_tokens)):
        (token, value) = sorted_tokens[i]
        if token in dict_freq:
            sorted_tokens[i] = (token, math.log(value/dict_freq[token]))
        else:
            sorted_tokens[i] = (token, math.log(value/standard_frequency))

    sorted_tokens_after = sorted(sorted_tokens,key=operator.itemgetter(1), reverse=True)
    max_num_words = 100
    sorted_tokens_after = sorted_tokens_after[0:max_num_words]

    return sorted_tokens_after
Example #4
    def test_get_stop_words_cache(self):
        self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
        sw = get_stop_words('fr')
        self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
        original_stop_words_dir = stop_words.STOP_WORDS_DIR
        stop_words.STOP_WORDS_DIR = 'not-existing-directory'
        self.assertEqual(sw, get_stop_words('french'))
        stop_words.STOP_WORDS_DIR = original_stop_words_dir
        try:
            get_stop_words('klingon')
        except Exception:
            pass
        self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
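What the cache test above relies on, in plain form: ISO codes and full language names resolve to the same list, and fetched lists are memoised in STOP_WORDS_CACHE. A minimal sketch with the same package:

from stop_words import get_stop_words
import stop_words

fr_by_code = get_stop_words('fr')        # ISO 639-1 code
fr_by_name = get_stop_words('french')    # full language name
assert fr_by_code == fr_by_name
assert 'french' in stop_words.STOP_WORDS_CACHE   # later calls are served from the cache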
def word_list(text):

    features = {}                               # renamed from 'list' to avoid shadowing the builtin
    words = text.split()
    stop_words = get_stop_words('en')           # stop words is a list of common words used in English

    words = [word for word in words if word not in stop_words]    #removing stop words

    for i in words:
        if all(j.isdigit() for j in i):         # classifying token as number feature
            features["NUMBER"] = features.get("NUMBER", 0) + 1

        elif i.startswith('http'):              # classifying token as link feature
            features["LINKS"] = features.get("LINKS", 0) + 1

        elif all(j in string.punctuation for j in i):   # classifying token as punctuation feature
            features["PUNCTUATION"] = features.get("PUNCTUATION", 0) + 1

        elif len(i.translate(str.maketrans('', '', string.punctuation))) < 3:
            continue

        elif i.upper() == i:                    # classifying token as capital word feature
            features["CAPSLOCK"] = features.get("CAPSLOCK", 0) + 1

        else:
            j = i.translate(str.maketrans('', '', string.punctuation)).lower()
            features[j] = features.get(j, 0) + 1

    return features
Example #6
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword =[
        'дом',
        'город',
        "дорог",
        "час",
        "ноч",
        "слов",
        "утр",
        "стран",
        "пут",
        "путешеств",
        "мест",
        'нов',
        "друз",
        "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
def keywords_search(reviews):
    key_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_general.txt", 'r'):
    for k in open(keyword_general_path, 'r'):
        a = k.strip().split(", ")
        key_map[a[0]] = a[1]

    special_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_special.txt", 'r'):
    for k in open(keyword_special_path, 'r'):
        a = k.strip().split(", ")
        special_map[a[0]] = a[1]

    raw = reviews.lower()
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(raw)

    # remove punctuations
    no_punc_tokens = [i for i in tokens if (not i in string.punctuation+string.digits) and (not "." in i)]

    # remove stop words from tokens
    en_stop = get_stop_words('en')
    stopped_tokens = [i for i in no_punc_tokens if not i in en_stop]

    # stem tokens
    # wordnet_lemmatizer = WordNetLemmatizer()
    # stemmed_tokens = [wordnet_lemmatizer.lemmatize(i) for i in stopped_tokens ] 

    chosen_key_words = []

    # Search in general key word
    key_words_dict = dict.fromkeys(key_map.values(), 0)

    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in key_map.keys():
        if t in s:
            key_words_dict[key_map[t]] += 1

    for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    # Search in special keyword
    special_words_dict = dict.fromkeys(special_map.values(), 0)
    #  Select keyword using wordnet

    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in special_map.keys():
        if t in s:
            special_words_dict[special_map[t]] += 1

    for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    return ' '.join(chosen_key_words)
def load_dataset(dataset_file):
    """
    It is more efficient (O(n) vs. O(1)) to search a dictionary or a set
    compared to a list as they are implemented with a hash.
    Therefore, the dataset is kept with 2 dictionaries where
    the values are sets.
    """
    items_original_form = defaultdict(set)
    items_by_keyword_start = defaultdict(set)
    items_by_id = defaultdict(set)

    stop_words = get_stop_words('english')

    with open(dataset_file) as f:
        lines = csv.reader(f, delimiter=',')
        for line in lines:

            item_id, *descriptors = line

            # save original form (3 separate fields:
            # id, description, company name) for output
            items_original_form[item_id] = descriptors

            # create 2 dictionaries for searching:
            # 1. Key: 3 lower-case first letters of each
            # word of item descriptors. Value: item ids.
            # 2. Key: item id. Value: item descriptors in lower-case.
            descriptors_set = set(" ".join(descriptors).lower().split())
            for d in descriptors_set:
                if d not in stop_words:
                    items_by_keyword_start[d[:3]].add(item_id)
            items_by_id[item_id] = descriptors_set

    return (items_by_keyword_start, items_by_id, items_original_form)
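A rough illustration of the docstring's point about hash-based lookup (a sketch only; absolute timings will vary):

import timeit
from stop_words import get_stop_words

stop_list = get_stop_words('english')   # plain list: membership test scans, O(n)
stop_set = set(stop_list)               # hashed set: membership test is O(1) on average

print(timeit.timeit("'zebra' in stop_list", globals=globals(), number=100000))
print(timeit.timeit("'zebra' in stop_set", globals=globals(), number=100000))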
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in range(0, length):
        print(i)
        print(only_tweet[i])

    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()

    total_texts = []
    for i in range(0, length):
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if k not in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)

    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print(i)
Example #10
def lemmatization_intern(lang, rss, result, doc):
    # Build and configure the TreeTagger wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang, TAGDIR=treetagger_path,
                                          TAGINENC='utf-8', TAGOUTENC='utf-8')

    # Usage
    tags = tagger.TagText(rss)
    data = formatTTG(tags, tagger, stop_words.get_stop_words(language=lang))

    for k in [1, 2, 3]:
        i = 0
        liste = []
        while i <= len(data) - k:
            lemma = getLemma(data[i])

            for j in range(k - 1):
                lemma += " " + getLemma(data[i + j + 1])
            if lemma not in result:
                result[k-1][lemma] = 0
                doc[k-1][lemma] = 1
                liste += [lemma]
            elif lemma not in liste:
                doc[k-1][lemma] += 1
                liste += [lemma]

            result[k-1][lemma] += 1
            i += 1
    return result, doc
def bag_of_words_vectorizer(datafile, k_features):
    """
    Computes sparse term-document matrix of datafile documents, selects k best features by chi2 test.
    Yields batches of BATCH_SIZE of dense tdm vectors and vector of labels, transformed for keras nn.
    """
    data = []
    labels = []

    for jsoned_entity in open(datafile, errors="ignore").readlines():
        entity = json.loads(jsoned_entity)
        if entity["lang"] == "en":
            data.append(entity["text"])
            labels.append(entity["label"])

    vectorizer = TfidfVectorizer(stop_words=get_stop_words("english"))
    data = vectorizer.fit_transform(data)
    data = SelectKBest(chi2, k=k_features).fit_transform(data, labels)

    for vector_label_batch in batch(zip(data, labels), config.BATCH_SIZE):
        vectors = []
        labels = []
        for vec_label in vector_label_batch:
            vectors.append(vec_label[0].toarray())
            labels.append(vec_label[1])

        X = np.vstack(vectors)
        Y = np_utils.to_categorical(labels, 2)
        yield X, Y
Example #12
def process_line_mymodel(line):
    """
        @params
        line: list of all tokens contained in a line
        format: id_img nb_pairs(word, points) w1 p1 w2 p2 .... wn pn
        return: key, value for the dictionary
        key: id_img
        value: list of pairs w-p
        remove stop words?
    """
    en_stop = get_stop_words('en')
    #print en_stop
    key = line[0]
    nb_pairs = int(line[1])
    i = 0
    value = []
    weights = {}
    while i<nb_pairs*2:
        #print line[2+i]
        #if line[2+i] not in en_stop:
        value.append(re.sub(r'[^\x00-\x7f]',r'',line[2+i]))
        weights[re.sub(r'[^\x00-\x7f]',r'',line[2+i])]=int(line[3+i])
        i+=2

    #assert nb_pairs == len(value), "length of data diferent (nb_pairs =/= len(pairs))"
    return key, value, weights
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List=[]
    for i in range(0,50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_values(by='count', ascending=False).iloc[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    fig1 = fig.add_subplot(1,1,1)
    fig1.set_title("Top issued words", fontdict={'fontsize':25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText =  ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove one- and two-character tokens
    tokens = [i for i in tokens if len(i)>2]

    return (tokens, text)
Example #15
    def sentence_to_tokens(self, sentence, option='lemmatize'):
        """
        Given a sentence in english,
        return list of tokens with stop-word filtered, or lemmatized, tokenized
        :param sentence: English sentence
        :param option: lemmatize, stemming or none
        :return: list of non-stop word english tokens
        """

        log.debug("Tokenizing sentence")
        tokens = self.tokenizer.tokenize(sentence.lower())
        log.debug(tokens)

        # filter stop words (materialise the list so it can be logged and reused)
        log.debug("Filtering stop words")
        en_stop = set(get_stop_words('en'))
        tokens = [word for word in tokens if word not in en_stop]
        log.debug(tokens)

        if option == 'lemmatize':
            # lemmatize
            log.debug("Lemmatizing")
            tokens = [self.lemmatizer.lemmatize(w) for w in tokens]
            log.debug(tokens)
        elif option == 'stem':
            # stemming
            log.debug("Stemming")
            tokens = [self.stemmer.stem(w) for w in tokens]
            log.debug(tokens)
        else:
            pass

        return tokens
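A standalone variant of the same pipeline, with NLTK's RegexpTokenizer and WordNetLemmatizer standing in for the instance attributes used above (a sketch; requires the NLTK 'wordnet' data):

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words

tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
en_stop = set(get_stop_words('en'))

def sentence_to_tokens(sentence, option='lemmatize'):
    # tokenize, drop stop words, then optionally lemmatize
    tokens = [t for t in tokenizer.tokenize(sentence.lower()) if t not in en_stop]
    if option == 'lemmatize':
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

print(sentence_to_tokens("The cats are sitting on the mats"))
# e.g. ['cat', 'sitting', 'mat']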
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Example #17
def modeling_tags(liked, numTopics):
	print('- my likes tag modeling:')

	# get documents
	documents = []
	for like in liked:
		documents.append(' '.join(liked[like][0]))

	# remove common and repeated words, and tokenize
	#stoplist = set('for a of the and to in'.split())
	stoplist = get_stop_words('en')
	texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
	frequency = defaultdict(int)
	for text in texts:
		for token in text: frequency[token] += 1
	texts = [[token for token in text if frequency[token] > 0] for text in texts]
	dictionary = corpora.Dictionary(texts)
	corpus = [dictionary.doc2bow(text) for text in texts]

	# transformations
	tfidf = models.TfidfModel(corpus)
	corpus_tfidf = tfidf[corpus]
	lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=numTopics)
	corpus_lsi = lsi[corpus_tfidf]
	index = similarities.MatrixSimilarity(corpus_lsi)
	
	# save transformation
	dictionary.save('/tmp/tags.dict')
	corpora.MmCorpus.serialize('/tmp/tags.mm', corpus)
	index.save('/tmp/tags.index')
	lsi.save('/tmp/tags.lsi')

	print('    ok')
	print('')
    def get_frequency(self):

        # Selecting all the text in the database
        cursor = self.select_content('Content')

        # Initialising variables
        words = []
        count_handle = Counter()

        # Generate the list of common words to be removed from the generated keyword list
        sw = stop_words.get_stop_words("english")

        # Extract all words from the given database
        for row in cursor:
            words += re.compile(r'\w+').findall(row[1])

        #Remove stop words from 'words' list
        words = [w.lower() for w in words if w.lower() not in sw]

        # Calculating the frequency of all words in the given database
        for w in words:
            count_handle[w] += 1

        # Writing the keywords returned into the file = category+ "_keyword.txt"
        with open(self.out, 'w') as file_name:
            for word in count_handle.most_common(self.limit):
                file_name.write(word[0]+"\t"+str(word[1])+"\n")
def get_stopset():
    """
    Gets a set of stopwords
    """
    stopset = set(get_stop_words('en'))

    # get those contractions
    add_stops = nltk.word_tokenize(' '.join(stopset))
    stopset.update(add_stops)

    # make sure to get contractions without punctuation, so that
    # order of operations doesn't matter later
    add_stops = [stopword.strip(string.punctuation)
                 for stopword in stopset]
    stopset.update(add_stops)

    # custom stop words
    add_stops = [u'lp', u'ep',
                 u'record', u'records', u'recorded',
                 u'label', u'labels',
                 u'release', u'releases', u'released',
                 u'listen', u'listens', u'listened', u'listener',
                 u'version', u'versions',
                 u'album', u'albums',
                 u'song', u'songs',
                 u'track', u'tracks',
                 u'sound', u'sounds',
                 u'thing', u'things', u'something',
                 u'music']
    stopset.update(add_stops)
    return stopset
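The contraction handling above matters because NLTK splits contractions into pieces like "n't", and the later punctuation stripping produces the apostrophe-less spellings, so both forms end up in the stop set. A small sketch (requires the NLTK 'punkt' data):

import string
import nltk

pieces = nltk.word_tokenize("don't isn't")
# e.g. ['do', "n't", 'is', "n't"]
stripped = [p.strip(string.punctuation) for p in pieces]
# e.g. ['do', 'nt', 'is', 'nt']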
Example #20
	def createLDAModel(texts, n_topics, n_passes):
		"""Generates a LDA model from an array of texts
		"""
		tokenizer = RegexpTokenizer(r'\w+')
		#Create EN stop words list
		en_stop = get_stop_words('en')
		#Create p_stemmer of class PorterStemmer
		p_stemmer = PorterStemmer()

		texts_ = []

		# loop through document list
		for i in texts:
		    
		    # clean and tokenize document string
		    raw = i.lower()
		    tokens = tokenizer.tokenize(raw)
		    
		    # remove stop words from tokens
		    stopped_tokens = [i for i in tokens if not i in en_stop]
		    # stem tokens
		    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
		    # add tokens to list
		    texts_.append(stemmed_tokens)

		# turn our tokenized documents into a id <-> term dictionary
		dictionary = corpora.Dictionary(texts_)

		# convert tokenized documents into a document-term matrix
		corpus = [dictionary.doc2bow(text) for text in texts_]

		# generate LDA model
		ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word = dictionary, passes=n_passes)

		return(ldamodel)
Example #21
def get_corpus():
    db_conn = MySQLdb.connect(host="localhost", port=8889, db="linked_reverb", user="******", passwd="root")
    cursor = db_conn.cursor()
    cursor.execute("select argument1, argument2 from linked_entity80_a")

    ls_result = []
    ls_corpus = []

    row_count = int(cursor.rowcount)
    for i in range(0, row_count):
        row = cursor.fetchone()
        ls_result.append(row)

    stop_words = get_stop_words('en')

    for i in range(len(ls_result)):
        for item in ls_result[i][0].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)
        for item in ls_result[i][1].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)

                #
                # ls_corpus.append(ls_result[i][0].split(" "))
                # ls_corpus.append(ls_result[i][1].split(" "))

    db_conn.close()
    return ls_corpus
Example #22
def convert_amazon_to_dict(dict_field, is_text, in_fname, out_fname):
	id = 0
	num_entries = 0
	field_dict = {'':0}
	stop_words = get_stop_words('en')

	for entry in parse_amazon(in_fname):
		if dict_field in entry:
			num_entries += 1
			# if text field, parse and populate.
			if is_text:
				words = entry[dict_field].split()
				for word in words:
					stemmed_word = stem(word)
					if stemmed_word not in stop_words and stemmed_word not in field_dict:
						id += 1
						field_dict[stemmed_word] = id
			else:
				if entry[dict_field] not in field_dict:
					id += 1
					field_dict[entry[dict_field]] = id
				#printf('%s -> %d\n', entry[dict_field], id)
				#if id > 100:
				#	break
	print "num_entries:", num_entries
	print "length of field_dict:", len(field_dict)
	with open(out_fname, 'wb') as outf:
		pickle.dump(field_dict, outf)
def lda_approach_one():
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    # doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    # doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    # doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    # doc_e = "Health professionals say that brocolli is good for your health."
    # doc_set = [doc_a, doc_b, doc_c, doc_e]
    print(db.find().count())
    doc_set = [i['abstract'] for i in db.find()]
    texts = []
    for i in doc_set:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=4,
        id2word=dictionary,
        passes=20
    )
    print(ldamodel.print_topics(10))
def getWordVector(inputString):
    tokenizer = RegexpTokenizer(r'\w+\'?\w+')

    # default English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer,
    # a widely used stemmer for finding word roots
    p_stemmer = PorterStemmer()

    raw = inputString.lower() 
    tokens = tokenizer.tokenize(raw)    

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # POS-tag the stopped tokens (only nouns are kept below)
    pos_tagged = nltk.pos_tag(stopped_tokens)
        
    # stem tokens
    # p_stemmer.stem(i[0]) and other additions in if condition - or i[1][0] == 'R' or i[1][0] == 'V' 

    stemmed_tokens = [i[0]
                        for i in pos_tagged
                        if i[1][0] == 'N'] # or i[1][0] == 'J']

    return stemmed_tokens
def cal_idf_overlap():
    list_subj = utils.list_subject

    ls_distance_final = []
    ls_distance_row = []
    #print len(list_att)
    stop_words = get_stop_words('en')
    tmp_corpus = []
    for i in range(len(list_subj)):
        item = str(list_subj[i]).split(" ")
        for token in item:
            if token in stop_words:
                pass
            else:
                tmp_corpus.append(token)
    #print "corpus", corpus

    length = len(list_subj)
    for i in range(0, length):
        if i == 500 or i == 1000 or i == 1500:
            print(i)
        for j in range(0, length):
            print(i, j)
            idf_instance = IDF.IDF(str(list_subj[i]),str(list_subj[j]), tmp_corpus)
            distance = idf_instance.cal_overlap()
            ls_distance_row.append(distance)
        ls_distance_final.append(ls_distance_row)
        ls_distance_row = []

    myarray = np.asarray(ls_distance_final)
    print(myarray)
    Z = linkage(myarray, "ward")
    thefile = open('/Users/Aaron/test.txt', 'w')
    for item in Z:
        thefile.write("%s\n" % item)

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
         Z,
         leaf_rotation=90.,  # rotates the x axis labels
         leaf_font_size=8.,  # font size for the x axis labels
     )
    plt.show()

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=30,  # show only the last p merged clusters
        show_leaf_counts=True,  # otherwise numbers in brackets are counts
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,  # to get a distribution impression in truncated branches
    )
    plt.show()
def remove_stopwords_from_individual_text(tokens):
    '''
    Given a list of tokens, returns a list of strings without any stopwords.
    '''

    en_stop_words = set(stop_words.get_stop_words('en'))

    return [w for w in tokens if w not in en_stop_words]
Example #27
    def __init__(self):
        self.__stemmer = stem.PorterStemmer()
        s = stop_words.get_stop_words("en")
        self.__stopwords = []
        for word in s:
            if word.isalpha():
                self.__stopwords.append(self.__stemmer.stem(word.lower(), 0, len(word) - 1))
        self.__stopwords = set(self.__stopwords)
Example #28
    def __init__(self, dictionary, is_dev=False, model=''):
        assert model != '', "model can't be empty"
        self.en_stop = get_stop_words('en')
        self.dictionary_words = dictionary
        self.is_dev = is_dev
        filename_output = 'output_classify_' + model + '.txt'
        print('opening file for output', filename_output)
        self.file_output = open(filename_output, 'w')
        self.filename_output = filename_output
Example #29
def stopWord():
    '''These words do not indicate any sentiment and can be removed
    Repeating letter e.g hungrryyy for hungry
    Punctuation
    '''
    stopWords = get_stop_words('en')
    stopWords.append('at_user')
    stopWords.append('url') 
    return stopWords
def remove_stop_words(frequency_list):
    stop_words = get_stop_words('en')

    temp_list = []
    for key,value in frequency_list:
        if key not in stop_words:
            temp_list.append([key, value])

    return temp_list
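For instance, applied to a small (word, count) list (a hypothetical input, just to show the expected shapes, reusing remove_stop_words defined above):

freq = [('the', 12), ('cat', 5), ('and', 9), ('mat', 3)]
print(remove_stop_words(freq))
# e.g. [['cat', 5], ['mat', 3]] -- 'the' and 'and' are in the English stop list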
Example #31
def load_stoplist():
    try:
        return set(get_stop_words("en") + STOP_LIST)
    except Exception:
        print(format_exc())
        return set()
def runFitting(params, objects):

    TASK = 'binary'
    #TASK = 'multi'
    '''
    Preparing data
    '''

    featureList = []

    if params["sentenceComplexityCheck"]:
        featureList.append("posTag")
    if params["embeddingsTermFreqFiltering"]:
        objects["freqFilter"].embeddingsEnabled = True
    if params["oneHotTermFreqFiltering"]:
        objects["freqFilter"].oneHotEnabled = True

    objects["liguisticFeatureExtractor"].setFeatureList(featureList)

    #print('Reading in offenseval training data...')
    if TASK == 'binary':
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(
            offenseval_train)
    else:
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(
            offenseval_train, binary=False)

    Xtrain = helperFunctions.clean_samples(Xtrain)
    print("train data read")
    '''
    Preparing vectorizer and classifier
    '''

    # Vectorizing data / Extracting features
    #print('Preparing tools (vectorizer, classifier) ...')
    if params["tweetTokenization"]:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2),
            stop_words=stop_words.get_stop_words('en'),
            tokenizer=TweetTokenizer().tokenize)
    else:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2), stop_words=stop_words.get_stop_words('en'))
    count_char = transformers.CountVectorizer(analyzer='char',
                                              ngram_range=(3, 7))

    embedder = features.Embeddings(objects["embeddings"], pool='max')

    vectorizer = FeatureUnion([('word', count_word), ('char', count_char),
                               ('word_embeds', embedder)])

    if len(featureList) > 0:
        vectorizer.transformer_list.append(
            ('lingFeats', objects["liguisticFeatureExtractor"]))

    if params["oneHotTermFreqFiltering"] or params[
            "embeddingsTermFreqFiltering"]:
        vectorizer.transformer_list.append(
            ('freqFilter', objects["freqFilter"]))

    if params["charNgramFreqFiltering"]:
        objects["charFreqFilter"].oneHotEnabled = True
        objects["charFreqFilter"].embeddingsEnabled = False
        vectorizer.transformer_list.append(
            ('charfreqFilter', objects["charFreqFilter"]))

    if params["POStagCheck"]:
        vectorizer.transformer_list.append(
            ('posTagger', transformers.posTagExtractor(Xtrain, Ytrain)))

    # Set up SVM classifier with unbalanced class weights
    """     if TASK == 'binary':
        # cl_weights_binary = None
        cl_weights_binary = {'OTHER':1, 'OFFENSE':10}
        clf = LinearSVC(class_weight=cl_weights_binary)
    else:
        # cl_weights_multi = None
        cl_weights_multi = {'OTHER':0.5,
                            'ABUSE':3,
                            'INSULT':3,
                            'PROFANITY':4}
        clf = LinearSVC(class_weight=cl_weights_multi) """
    clf = LinearSVC()
    #scaler = StandardScaler(with_mean=False)

    classifier = Pipeline([
        ('vectorize', vectorizer),
        #('scale', scaler),
        ('classify', clf)
    ])
    '''
    Actual training and predicting:
    '''

    ### predicting on set aside training data
    #print('Predicting on set aside data...')
    #Yguess = classifier.predict(XcustomTest)
    #result = cross_validate(classifier, Xtrain, Ytrain,cv=3)
    #print(result)
    ########

    print('Fitting on training data...')
    classifier.fit(Xtrain, Ytrain)
    #print('accuracy on set aside')
    #print(classifier.score(Xtest_raw, Y_test))
    #exit()

    #print('Predicting...')
    #Yguess = classifier.predict(Xtest)
    """     '''
    Outputting in format required
    '''

    print('Outputting predictions...')

    outdir = '/Users/balinthompot/RUG/Honours/HateSpeech/offenseval-rug-master/Submission'
    fname = 'rug_fine_2.txt'

    with open(outdir + '/' + fname, 'w', encoding='utf-8') as fo:
        assert len(Yguess) == len(Xtest_raw), 'Unequal length between samples and predictions!'
        for idx in range(len(Yguess)):
            # print(Xtest_raw[idx] + '\t' + Yguess[idx] + '\t' + 'XXX', file=fo) # binary task (coarse)
            print(Xtest_raw[idx] + '\t' + 'XXX' + '\t' + Yguess[idx], file=fo) # multi task (fine)

    print('Done.')
    """
    return classifier
Example #33
    def tokenize(self,
                 text: Text,
                 attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:

        stop_words = get_stop_words('en')
        '''if not self.case_sensitive:
            text = text.lower()

        if attribute != INTENT_ATTRIBUTE:
            # remove 'not a word character' if
            words = re.sub(
                # there is a space or an end of a string after it
                r"[^\w#@&]+(?=\s|$)|"
                # there is a space or beginning of a string before it
                # not followed by a number
                r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
                # not in between numbers and not . or @ or & or - or #
                # e.g. 10'000.00 or [email protected]
                # and not url characters
                r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
                " ",
                text,
            ).split()
            # if we removed everything like smiles `:)`, use the whole text as 1 token
            if not words:
                words = [text]
        else:
            words = (
                text.split(self.intent_split_symbol)
                if self.intent_tokenization_flag
                else [text]
            )

        running_offset = 0
        tokens = []

        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))

        self.add_cls_token(tokens, attribute)

        return tokens  '''
        # text = message.get(attribute)

        if not self.case_sensitive:
            text = text.lower()

        s = re.sub(r'[\W]', ' ', text)  # remove punctuations
        words = s.split()  # split into tokens
        for x in list(words):
            if x in stop_words:
                words.remove(x)  # remove stop words

        # if we removed everything like smiles `:)`, use the whole text as 1 token
        if not words:
            words = [text]
        else:
            words = (text.split(self.intent_split_symbol)
                     if self.intent_tokenization_flag else [text])

        running_offset = 0
        tokens = []

        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))

        self.add_cls_token(tokens, attribute)

        return tokens
from sktools.matrix_denser import MatrixDenser
import random
from sklearn.preprocessing import MinMaxScaler
from utils.rank import GaussRankScaler

import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch import nn

random.seed(42)
np.random.seed(42)

pd.set_option('display.max_rows', 500)
stop_words = get_stop_words('catalan')
submit = False

# %%
full_train = pd.read_csv("data/train.csv").rename(columns={
    "TÍTOL": "title",
    "Codi QdC": "code"
})
full_test = pd.read_csv("data/test.csv").rename(columns={
    "TÍTOL": "title",
    "Codi QdC": "code"
})
categories = pd.read_csv("data/categories.csv").drop(
    columns="Unnamed: 0").rename(columns={
        "Codi QdC": "code",
        "Títol de entrada del catàleg": "target_title"
Example #35
def load_stop_words():
    # x = stopwords.words("english")
    x = get_stop_words("en")
    return [s.encode('ascii') for s in x] + list(string.printable)
Example #36
#from sqlalchemy import create_engine
#import psycopg2
#import pandas as pd
#from scipy import spatial

import numpy as np
import re
import nltk

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

from stop_words import get_stop_words
english_stop_words = get_stop_words('en')

from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

import gensim
from gensim import corpora, models


def clean(comment):

    comment = comment.lower()

    # Strip all HTML
    comment = re.sub(r'<[^<>]+>', ' ', comment)

    # Handle Numbers
    comment = re.sub(r'[0-9]+', '', comment)
Example #37
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from stop_words import get_stop_words
from nltk.corpus import stopwords
import time

# Twitter API keys
consumer_key = 'wAiOgsZu8811j2Ac3mnyquwiT'
consumer_secret = 'kOT4i7K8OoQnNZoDuYEHrsg5DmAW0TnpVyRWPVWWgr4NiYQmm0'
access_token = '26922451-bGQYatrkw4zQZgl5qwIwO8nQXtIln0ZbScSmp1Rqv'
access_secret = '1ZvfJFdBNSOBqmDmriOXqURGsO5Yudj4s8597LCqe9Wo5'

# Look up the stop words in Spanish and English
# (stop words include prepositions and articles)
stop_words = list(get_stop_words('es'))         # around 900 stop words
nltk_words = list(stopwords.words('english'))   # around 150 stop words
stop_words.extend(nltk_words)

# method to clean tweets by removing special characters
# such as accents, ñ and punctuation
def clean(tweet):
    output = []
    tw = tweet.split(' ')
    for palabra in tw:
        if not palabra in stop_words:
            if "http" not in palabra:
                output.append(palabra)
    pal = ' '.join(output)
    mapping = {'á':'a','é':'e','í':'i','ó':'o','ú':'u',
            'ñ':'n','ñ':'n','#':'' ,'Á':'A','É':'E','Í':'I',
from porter_stemmer import PorterStemmer
import re
import string
from stop_words import get_stop_words

stop = get_stop_words('english')


class Tokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()

    # only admit non-numeric tokens with length > 2
    def qualify(self, word):
        return len(word) > 2 and not word.isdigit()

    def process_desc(self, desc):

        ndesc = []

        for word in desc.split():

            # lowercase all characters
            word = word.lower()
            # replace words with hashtags with just the words
            if word[0] == "#":
                word = word[1:]
            # replace words with @ with "AT_USER"
            elif word[0] == "@":
                word = "AT_USER"
            # replace words with url beginnings with "URL"
Example #39
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import os
import datetime
import time

import enchant


tokenizer = RegexpTokenizer(r'\w+')  # r'[a-zA-Z]+'
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
# create English stop words list
en_stop = get_stop_words('en')
stop_word = stopwords.words('english')
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
d = enchant.Dict("en_US")


# set1 = ['sexi', 'ass', 'bitch', 'oral', 'anal', 'gay', 'hot', 'fantasi', 'stalk', 'nasti', 'f**k', 'boob', 'dick',
#         'nake', 'suck', 'lip', 'size', 'lick', 'tongu', 'turn', 'booti', 'thigh', 'bra', 'bed', 'horni', 'seduct',
#         'ball', 'hoe', 'virgin', 'lesbian', 'bite', 'butt', 'straight', 'leg', 'beast', 'fluid', 'chocolati',
#         'syrup', 'v****a', 'threesom', 'belli', 'homosexu']

# set2 = ['crush', 'date', 'dreams', 'friend', 'miss', 'babe', 'sweeti', 'candi', 'look', 'pie', 'appeal',
#         'crave', 'propos', 'hit', 'cheek', 'feel', 'romanc', 'poetri', 'hang', 'desir', 'pleasur', 'bomb',
#         'cute', 'eye', 'hug', 'chick', 'marri', 'love', 'babi', 'exchang', 'coffe', 'video']
import argparse

import numpy as np
import pandas as pd
from tqdm import tqdm

# Custom library
from lib.textometry import *
from lib.helpers import *
from lib.constant import *
from lib.cooc import *


from lib.utils import partofspeech
from stop_words import get_stop_words
fr_stop = get_stop_words("french")


# Pandas emits this warning non-stop ... so hush it!
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

code_questions = {
    1:"transition_eco",
    2:"democratie_et_citoy",
    3:"fiscalite_et_depense_publique",
    4:"organisation_de_etat_et_service_pub"
}

parser = argparse.ArgumentParser()
parser.add_argument("data_fn")
Example #41
    def set_stopwords(self, stopwords):
        self.stopwords = get_stop_words(self.DEFAULT_LANGUAGE)
        if self.language:
            self.stopwords.extend(get_stop_words(self.language))
        if stopwords:
            self.stopwords.extend([word.lower() for word in stopwords])
Example #42
def without_stop_words(raw):
    if not raw:
        return ''
    stop_words = get_stop_words('en')
    return ' '.join(w for w in raw.split() if w not in stop_words)
Example #43
time      :   2018-7-20
"""

import os
import sys
import jieba

reload(sys)
sys.setdefaultencoding('utf8')

train_data_url = './data/FudanTrainData/'
segment_data_url = './data/WordSegment/'

from stop_words import get_stop_words

STOP_WORDS_SET = get_stop_words()


def get_all_file_by_path(path=train_data_url):
    """获取某个目录下的所有训练文件"""
    file_path = []
    dir_list = os.listdir(train_data_url)
    for d in dir_list:
        file_path.extend(
            map(lambda x: train_data_url + d + '/' + x,
                os.listdir(train_data_url + d)))
    return file_path


def read_file_sentence(
        file_path='./data/FudanTrainData/C3-Art/C3-Art0002.txt'):
Example #44
import requests
import textwrap
import gensim
import config
import emoji

from pandas.io.json import json_normalize
from string import Template
from gensim import corpora
from pprint import pprint
from tqdm import tqdm

extra_chars = ['-', ',', '.', '!', '?', '(', ')', '[', ']', '\n']

morph = pymorphy2.MorphAnalyzer()
stop_words = get_stop_words('ru')


def get_wall(owner_id: str = '',
             domain: str = '',
             offset: int = 0,
             count: int = 10,
             filter: str = 'owner',
             extended: int = 0,
             fields: str = '',
             v: str = '5.103') -> pd.DataFrame:
    """
    Возвращает список записей со стены пользователя или сообщества.
    @see: https://vk.com/dev/wall.get
    :param owner_id: Идентификатор пользователя или сообщества, со стены которого необходимо получить записи.
    :param domain: Короткий адрес пользователя или сообщества.
Example #45
import pandas as pd
import numpy as np
from stop_words import get_stop_words

import matplotlib.pyplot as plt
import matplotlib as mpl

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

stop_words = get_stop_words('en')

STOPWORDS = [
    're', 've', 'people', 'said', 'president', 'thing', 'united states', 'way'
] + stop_words

df_pre = pd.read_csv('data/debate.csv', encoding="cp1252")

df = df_pre[['speaker', 'speech']]

text = df[df['speaker'] == 'Joe Biden']['speech'].tolist()
#print(text.head())

text = ' '.join(text).lower()

#print(text)

wordcloud = WordCloud(stopwords=STOPWORDS, collocations=True).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
Example #46
def load_stopwords(language):
    return [t for w in get_stop_words(language) for t in slugify(w).split("-")]
Example #47
# coding=utf-8
import config
import gensim
import pymorphy2
import pyLDAvis
import pyLDAvis.gensim
import requests
import re



from stop_words import get_stop_words
from string import punctuation


stop_words = get_stop_words('russian')


def get_wall(
    owner_id: str = '',
    domain: str='',
    offset: int=0,
    count: int=10,
    filter: str='owner',
    extended: int=0,
    fields: str='',
    v: str='5.103'
):
    """
    Возвращает список записей со стены пользователя или сообщества.
    @see: https://vk.com/dev/wall.get
Example #48
for i in mach_files:
	with open(i) as f:
		mach.append(f.readlines())


def process_string(s, process_stopwords, stopwords_list = []):
	s = re.sub(r'[^\w\s]','',s)
	s = s.lower() 
	s = word_tokenize(s) 
	s = [pt.stem(i) for i in s]
	if not process_stopwords:
		s = [i for i in s if i not in stopwords_list]
	return s

## processing stopwords
stopwords = " ".join(get_stop_words('en'))
stopwords = process_string(stopwords, True)

## a list of lists (each element a processed section of the text)
processed_mach = [process_string("".join(i), False, stopwords) for i in mach]

## Making corpus level list: unlisting the lists of lists
corpus_words = sum(processed_mach, [])

## Making list of 500 most used terms
common_words = dict(Counter(corpus_words).most_common(500))

with open("doc_term_mat.csv", 'ab') as f:

	## extracting only words (not counts)
	words = common_words.keys()
Example #49
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from stop_words import get_stop_words
from scipy.stats import zscore
import math

## Apostrophe indices
ap_idx = [
    264, 418, 635, 667, 690, 851, 947, 963, 968, 980, 1053, 1118, 1304, 1358,
    1406, 1546, 1558, 1596, 1667, 1776, 1784, 1813
]
ap_idx = list(np.asarray(ap_idx) - 1)

# STOPWORDS
itstops = get_stop_words('it')
itstops.append("c'è")
itstops.append("c'era")
itstops.append("c'erano")
itstops.append("l'")
itstops.append("'")
itstops.append("dell'")
itstops.append("nell'")
itstops.append("un'")
itstops.append("quell'")
itstops.append("po'")
print('STOPWORDS LOADED!')

##INPUT DIRECTORY
input_dir = 'output'
Example #50
def parse_group(group):
    group_id = '-' + group
    offset = 0
    all_posts = []

    r = requests.get(
        'https://api.vk.com/method/wall.get',
        params={
            'owner_id': group_id,
            'offset': offset,
            'count': 10,
            'access_token':
            'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe',
            'v': '5.95'
        })
    posts = r.json()['response']['items']
    all_posts.extend(posts)

    data_posts = []
    likes_response = []
    all_likes = []

    for p in all_posts:
        data_posts.append(get_data(p))
        r = requests.get(
            'https://api.vk.com/method/likes.getList',
            params={
                'owner_id': group_id,
                'offset': offset,
                'type': 'post',
                'item_id': p['id'],
                'filter': 'likes',
                'friends_only': 0,
                'extended': 1,
                'count': p['likes']['count'],
                'access_token':
                'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe',
                'v': '5.95'
            })
        likes_response.extend(r.json()['response']['items'])

    for like_response in likes_response:
        like = Like(group_id, like_response['id'], like_response['type'],
                    like_response['first_name'], like_response['last_name'])
        all_likes.append(like)
    write_likes_json(all_likes, group_id)

    write_posts_json(data_posts, group_id)
    my_stop_words = get_stop_words('ru')

    vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=my_stop_words)
    X = vectorizer.fit_transform([data_post.text for data_post in data_posts])
    idf = vectorizer.idf_

    #***************

    cv = CountVectorizer(max_df=0.85,
                         stop_words=my_stop_words,
                         max_features=10000)
    word_count_vector = cv.fit_transform(
        [data_post.text for data_post in data_posts])
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    feature_names = cv.get_feature_names()
    #all keywords
    keywords = []
    morph = pymorphy2.MorphAnalyzer()

    #generate tf-idf for the given document
    for data_post in data_posts:
        tf_idf_vector = tfidf_transformer.transform(
            cv.transform([data_post.text]))

        #sort the tf-idf vectors by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())

        #extract only the top n; n here is 1
        results = extract_topn_from_vector(feature_names, sorted_items, 1)
        result = ''
        if results:
            result = next(iter(results))
        if result != '' and not result.isdigit():
            result = morph.parse(result)[0].normal_form
        if len(result) > 2:
            keyword = KeyWord(data_post.id, result, 1)
            keywords.append(keyword)
    return data_posts, keywords
Example #51
from stop_words import get_stop_words
import nltk
import pprint

client = MongoClient()
client = MongoClient('localhost', 27017)

db = client['test']
customer = db.customer

qndict = customer.find_one()
qndictOld = qndict.copy()

### Parser Function ###

stop_words = set(get_stop_words('english'))


def myKeyWordParser(myString):
    myStringLowered = myString.lower()
    word_tokens = nltk.tokenize.RegexpTokenizer(
        '\\b\\w*[a-zA-Z]\\w+\\b').tokenize(myStringLowered)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence


### End of Parser Function ####
    def __init__(self, czech=None):
        """
        Constructor initialise the range of parameters of each tested model for pipeline.
        :param czech: czech stop words
        """
        if not czech:
            czech = nltk.word_tokenize(' '.join(get_stop_words('cz')))
        self._classifiers = [SVC(), NuSVC(), RandomForestClassifier(), LogisticRegression(),
                             # MLPClassifier(),
                             MultinomialNB(), ]

        self.parameters = [
            # SVC
            {
                'vect__max_df': (0.5, 0.75, 1.0),
                'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
                'vect__norm': ('l1', 'l2', None),
                'vect__stop_words': (czech, None),
                'cls__C': (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000),
                'cls__gamma': (0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001),
                'cls__kernel': ('linear', 'rbf', 'poly', 'sigmoid')
            },
            # NuSVC
            {
                'vect__max_df': (0.5, 0.75, 1.0),
                'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
                'vect__norm': ('l1', 'l2', None),
                'vect__stop_words': (czech, None),
                'cls__nu': (0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65),
                'cls__kernel': ('linear', 'rbf', 'poly', 'sigmoid')
            },
            # Random Forrest
            {
                'vect__max_df': (0.5, 0.75, 1.0),
                'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
                'vect__norm': ('l1', 'l2', None),
                'vect__stop_words': (czech, None),
                'cls__max_depth': (None, 10, 20, 30, 40, 50, 60, 70, 80, 90),
                'cls__max_features': (10, 20, 30, 40, 50, 'sqrt', None),
            },
            # Logistic regression
            {
                'vect__max_df': (0.5, 0.75, 1.0),
                'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
                'vect__norm': ('l1', 'l2', None),
                'vect__stop_words': (czech, None),
                'cls__C': (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000),
                'cls__class_weight': ('balanced', None),
                'cls__penalty': ('l1', 'l2')
            },
            # Naive Bayes
            {
                'vect__max_df': (0.5, 0.75, 1.0),
                'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
                'vect__norm': ('l1', 'l2', None),
                'vect__stop_words': (czech, None),
                'cls__alpha': (0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5),
                'cls__fit_prior': (True, False)
            },
            # Maximum Entropy
            {
                'vect__max_df': (0.5, 0.75, 1.0),
                'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
                'vect__norm': ('l1', 'l2', None),
                'vect__stop_words': (czech, None),
                'cls__method': ('gis', 'iis', 'megam', 'tadm')
            }

        ]
        self.pipeline_data = zip(self._classifiers, self.parameters)
Example #53
def stop_words():
    """Retrieve the stop words for vectorization -Feel free to modify this function
    """
    return get_stop_words('es') + get_stop_words('ca') + get_stop_words('en')
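The combined list is meant to be handed straight to a vectorizer, as several other examples here do; a minimal sketch assuming scikit-learn's TfidfVectorizer:

from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words

multilingual_stops = get_stop_words('es') + get_stop_words('ca') + get_stop_words('en')

vectorizer = TfidfVectorizer(stop_words=multilingual_stops)
X = vectorizer.fit_transform(["una frase de ejemplo", "an example sentence"])
print(X.shape)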
Example #54
from PIL import Image
from wordcloud import WordCloud
#  import wordcloud
#  from nltk.corpus import stopwords
from stop_words import get_stop_words
import random

import numpy as np
import matplotlib.pyplot as plt

with open("Goethe_Sammler.txt", "r") as f:
    text = f.read()

goethe_mask = np.array(Image.open('Goethe_Schattenriss.jpg'))

blacklist = get_stop_words('german')
blacklist = set(blacklist)
blacklist = blacklist.union(
    {'wäre', 'konnte', 'lassen', 'sagte', 'muß', 'Oheim', 'Julie', 'sei'})


def grey_color(word,
               font_size,
               position,
               orientation,
               random_state=None,
               **kwargs):
    return ("hsl(0, 0%%, %d%%)" % np.random.randint(10, 20))


wc = WordCloud(background_color='white',
off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, 
few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, 
just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, 
didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, 
mightn't, mustn, mustn't, needn, needn't, shan, shan't, shouldn, shouldn't, wasn, wasn't, weren, 
weren't, won, won't, wouldn, wouldn't,

"""

# http://pypi.python.org/pypi/stop-words

# pip install stop-words

from stop_words import get_stop_words

pypi_stopwords = get_stop_words('en')

print(len(pypi_stopwords))  # 174

# self defined stopwords

stop_words = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
Example #56
import string
from datetime import datetime

from nltk.stem.lancaster import LancasterStemmer
# from nltk.corpus import stopwords
from stop_words import get_stop_words

punctuation = list(string.punctuation) + ['rt', 'via']
arabic_stopwords = get_stop_words('arabic')
english_stopwords = get_stop_words('english')
german_stopwords = get_stop_words('german')

# Gets the tweet time.
def get_time(tweet):
    return datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")


# Gets all hashtags.
def get_hashtags(tweet):
    return [tag['text'] for tag in tweet['entities']['hashtags']]


# Gets the screen names of any user mentions.
def get_user_mentions(tweet):
    return [m['screen_name'] for m in tweet['entities']['user_mentions']]


# Gets the text, sans links, hashtags, mentions, media, and symbols.
def get_text_cleaned(tweet):
    text = tweet['text']
    print()
    open('rmsle_%s.log' % name, 'a').write(str(lst[-1]) + '\n')


def filter_word(d, w, c='item_description'):
    return d[d[c].map(lambda x: w in x)]


# %% read data
df: pd.DataFrame = pd.read_csv('data/cleaned_train.csv')
df_test: pd.DataFrame = pd.read_csv('data/cleaned_test.csv')

# %% pre-processing
tokenizer = RegexpTokenizer(r'\w+')
slash_tokenizer = RegexpTokenizer(r'[^/]+')
en_stop: List[str] = get_stop_words('en')
stemmer = PorterStemmer()


def clean_text(t: str, tk=tokenizer) -> str:
    tokens = tk.tokenize(t.lower())
    tokens = [stemmer.stem(token) for token in tokens
              if token not in en_stop or len(token) >= 2 and not token.isnumeric()]
    return ' '.join(tokens)


def preprocess(d: pd.DataFrame, clean=False) -> pd.DataFrame:
    d: pd.DataFrame = d.copy()
    d.fillna('__nan__', inplace=True)
    if clean:
        print("Cleaning 'name'")
Example #58
from stop_words import get_stop_words

with open('stop_words.txt', 'w') as f:
    stop_words = get_stop_words('uk')
    f.writelines(word + '\n' for word in stop_words)
Example #59
    def __init__(self, max_workers):
        logger.info("Number of workers: %s", max_workers)
        self.max_workers = max_workers
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.en_stopwords = set(get_stop_words('en'))
        self.p_stemmer = PorterStemmer()
# input_folder = data_folder+"data_input/"
# output_folder = data_folder+"data_output/" 


results_folder=data_folder+dataset+"_results/"    
    
lang = 'en'#'de';#'fr'
translator = Translator()

num_context_word = 5
senti_folder = "../senti_lexicon/"
senti_trans_folder = "../senti_lexicon_trans/"
senti_file_path = "senti_dict.json"

# nlp = spacy.load(lang+'_core_news_md')
stop_words = set(get_stop_words(lang))
xml_bilexicon_path = "../XML_translation/"
puncts = "—"

#************* List of functions
def convert_file_dicts(file_name):
    dict_toks = {}
    dict_NEs = {}
    dict_global_local_index = {}
    dict_lines_words = {}
    
    if file_name not in dict_toks:
        dict_toks[file_name] = []
        dict_NEs[file_name] = []
        dict_lines_words[file_name] = []
        dict_global_local_index[file_name] = {}