def save(self, *args, **kwargs):
    # Build the searchable descriptions for both languages.
    self._en_description = stopwords.save_description(
        self.en_principio_activo, self.en_accion_terapeutica,
        self.en_presentacion, self.en_concentracion, self.form, 'en')
    self._es_description = stopwords.save_description(
        self.principio_activo, self.accion_terapeutica,
        self.presentacion, self.concentracion, self.form, 'es')
    # Tokenize the product names and strip stopwords in each language.
    tmp = stopwords.word_tokenize(self.en_name)
    self._en_name = stopwords.remove_stopwords(tmp, 'en')
    tmp = stopwords.word_tokenize(self.es_name)
    self._es_name = stopwords.remove_stopwords(tmp)
    super(Products, self).save(*args, **kwargs)
    # Refresh the cached collections after saving.
    import views
    views.collection_es = views.get_collection('es')
    views.collection_en = views.get_collection('en')
def process_text():
    # text = "the phone camera is awesome"
    collection = read_raw_data()
    for record in collection.find({}):
        text = record['text']
        text = text.lower()
        tmp_text_array = text.split(" ")
        tmp_text_array = remove_unwanted_characters(tmp_text_array)
        # Change "caring" to "care"
        tmp_text_array = lemmatize(tmp_text_array)
        # Remove stopwords such as "and", "be", "mine"
        tmp_text_array = remove_stopwords(tmp_text_array)
        tmp_text_array = token_postag(tmp_text_array)
        text_array.append(tmp_text_array)
        token_frequency = freq_words(tmp_text_array)
        write_data_text_array(text_array)
        text_array.clear()
        write_data_token_frequency(token_frequency)
def clustering(dic):
    df = pd.read_csv('./data/hateb.csv')
    td = []
    with open("./data/stop.txt", "r") as f:
        stop_list = [v.rstrip() for v in f.readlines() if v != '\n']

    # Split each document into words and collect them as
    # [([word1, word2, word3], doc_id), ...]
    # words: the list of words in the document (duplicates allowed)
    # tags: document identifiers (given as a list; one document can carry multiple tags)
    for i in range(len(df)):
        wordlist = parseText(text=str(df['content'][i]), sysdic=dic)
        # Unify character types and absorb spelling / orthographic variation
        normalizedlist = [normalize(word) for word in wordlist]
        # Remove stopwords
        stopremovedlist = remove_stopwords(normalizedlist, stop_list)
        td.append(TaggedDocument(words=stopremovedlist, tags=[i]))

    # Build the model
    model = Doc2Vec(documents=td, dm=1, vector_size=300, window=8,
                    min_count=10, workers=4)

    # Store the document vectors in a list
    vectors_list = [model.docvecs[n] for n in range(len(model.docvecs))]
    # List of document numbers
    doc_nums = range(len(model.docvecs))

    # Clustering settings
    # Change n_clusters if you want a different number of clusters
    n_clusters = 8
    kmeans_model = KMeans(n_clusters=n_clusters, verbose=1, random_state=1, n_jobs=-1)
    # Run the clustering
    kmeans_model.fit(vectors_list)

    # Attach labels to the clustered data
    labels = kmeans_model.labels_

    # Build a dict mapping cluster labels to document numbers
    cluster_to_docs = defaultdict(list)
    for cluster_id, doc_num in zip(labels, doc_nums):
        cluster_to_docs[cluster_id].append(doc_num)

    # Print the clusters
    for docs in cluster_to_docs.values():
        print(docs)

    # Add a cluster_id column to the DataFrame
    df['cluster_id'] = labels
    df.to_csv('data/hateb_cluster.csv')
def datainit(data):
    stopwords.remove_stopwords(data)
    global unigrams
    unigrams = unigramvocabulary.unigramvocabulary("data_without_stopwords.txt",
                                                   "vocabulary.txt")
    #print(unigrams)
    addstartstop.add_start_stop("data_without_stopwords.txt")
    global bigrams
    bigrams = bigramvocabulary.bigramvocabulary("data_with_startstop.txt",
                                                "vocabulary.txt")
    #print(bigrams)
    createposneg.create_posneg("data_with_startstop.txt")
    # Count the vocabulary size (one entry per line).
    global vocab_count
    vocab_count = 0
    with open("vocabulary.txt", "r") as vocabulary_file:
        for line in vocabulary_file:
            vocab_count += 1
    #print(vocab_count)
    return
def create_posneg(data):
    stopwords.remove_stopwords(data)
    training_without_stopwords = open("data_without_stopwords.txt", "r")
    positive_superdoc = open("positivedata.txt", "w")
    negative_superdoc = open("negativedata.txt", "w")
    # Split each line into a positive or negative "superdocument": words that
    # follow a '+' token go to positivedata.txt, words that follow a '-' token
    # (or precede any marker) go to negativedata.txt.
    for line in training_without_stopwords:
        words_in_line = line.split()
        FLAG = 0
        for word in words_in_line:
            if word == '+':
                positive_superdoc.write(word)
                FLAG = 1
                continue
            if word == '-':
                negative_superdoc.write(word)
                FLAG = 0
                continue
            if FLAG == 1:
                positive_superdoc.write(" " + word)
            else:
                negative_superdoc.write(" " + word)
        if FLAG == 1:
            positive_superdoc.write("\n")
        else:
            negative_superdoc.write("\n")
    training_without_stopwords.close()
    positive_superdoc.close()
    negative_superdoc.close()
    #print("Positive and Negative files created")
    return
def clean_data(data):
    # Remove all the URLs first
    p = re.compile(r'((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
    data = p.sub('', data)
    # Let's remove everything that doesn't match characters we allow.
    p = re.compile(r'[\-\.\,\!\?\+\=\[\]\/\'\"\:\)\(\;\']')
    data = p.sub(' ', data)
    # Remove all the stopwords and lowercase the data.
    data = remove_stopwords(data)
    data = data.lower()
    return data
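# Several of these snippets rely on an external remove_stopwords() helper that
# is not shown. A minimal sketch of the string-in/string-out variant used by
# clean_data() might look like the following; it assumes NLTK's English
# stopword list, which is not necessarily what the original code used.
import nltk
from nltk.corpus import stopwords as nltk_stopwords

nltk.download('stopwords', quiet=True)
_STOPWORDS = set(nltk_stopwords.words('english'))


def remove_stopwords(data):
    # Keep only the tokens that are not in the stopword set.
    return ' '.join(word for word in data.split()
                    if word.lower() not in _STOPWORDS)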
def document_terms():
    for filepath, content, date in documents():
        print(filepath)
        extension = path.splitext(filepath)[1]
        words = None
        title = filename(filepath)
        if extension in ['.html', '.htm', '.jspy']:
            html_title, words, links = tokenize_html(content)
            html_title = html_title.strip()
            if html_title:
                title = html_title
        else:
            words = tokenize_text(content)
        words = remove_stopwords(words, stopword_list)
        words = (stem(word) for word in words)
        yield title, filepath, words, date
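# A hedged usage sketch (not from the original source): one way to consume
# document_terms() is to build a simple inverted index mapping each stemmed
# term to the files it occurs in. The index structure below is an assumption.
inverted_index = {}
for title, filepath, words, date in document_terms():
    for word in words:
        inverted_index.setdefault(word, set()).add(filepath)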
# The data string will hold all our text concatenated. Perhaps this is not the
# fastest way, as strings are immutable. Might convert this to a list in the
# future and then join if needed.
data = ""
for entry in timeline:
    data += " %s" % entry['text']

# Remove all the URLs first
p = re.compile(r'((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
data = p.sub('', data)

# Let's remove everything that doesn't match characters we allow.
p = re.compile(r'[\-\.\,\!\?\+\=\[\]\/\'\"]')  # Add foreign languages here
data = p.sub(' ', data)

# Remove all the stopwords and lowercase the data.
data = remove_stopwords(data)
data = data.lower()

# The three dicts will hold our words and the number of times they've
# been used for later tag cloud generation.
topics = {}
mentions = {}
hashtags = {}

# Loop through all the words, separate them into topics, hashtags and mentions.
for word in data.split():
    if word.startswith('@'):
        d = mentions
    elif word.startswith('#'):
        d = hashtags
    else:
        d = topics
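    # Assumed continuation (not part of the original snippet): tally the word
    # in whichever dict was selected above, for later tag cloud generation.
    d[word] = d.get(word, 0) + 1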