Example #1
    def save(self, *args, **kwargs):
        # Build the English and Spanish description fields before saving.
        self._en_description = stopwords.save_description(
            self.en_principio_activo, self.en_accion_terapeutica,
            self.en_presentacion, self.en_concentracion, self.form, 'en')
        self._es_description = stopwords.save_description(
            self.principio_activo, self.accion_terapeutica,
            self.presentacion, self.concentracion, self.form, 'es')
        # Tokenize the product names and strip stopwords for each language.
        tmp = stopwords.word_tokenize(self.en_name)
        self._en_name = stopwords.remove_stopwords(tmp, 'en')
        tmp = stopwords.word_tokenize(self.es_name)
        self._es_name = stopwords.remove_stopwords(tmp)
        super(Products, self).save(*args, **kwargs)

        # Refresh the module-level collections in views after saving.
        import views
        views.collection_es = views.get_collection('es')
        views.collection_en = views.get_collection('en')
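Every snippet in this listing leans on a remove_stopwords helper that is never shown. As a point of reference only, here is a minimal sketch of the word_tokenize / remove_stopwords pair this first example appears to assume, backed by NLTK; the module layout, the 'en'/'es' language codes and the Spanish default are assumptions, and later examples call remove_stopwords with different signatures (a raw string, or a token list plus an explicit stop list).

# Hypothetical stopwords helper module (a sketch, not the original implementation).
# Requires the NLTK "stopwords" and "punkt" data packages.
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize as nltk_word_tokenize

_LANGS = {'en': 'english', 'es': 'spanish'}

def word_tokenize(text):
    # Split raw text into word tokens.
    return nltk_word_tokenize(text)

def remove_stopwords(tokens, lang='es'):
    # Drop common function words for the requested language.
    stop_set = set(nltk_stopwords.words(_LANGS[lang]))
    return [t for t in tokens if t.lower() not in stop_set]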
Example #2
def process_text():
    # text = "the phone camera is awesome"

    collection = read_raw_data()

    text_array = []

    for record in collection.find({}):

        text = record['text']
        text = text.lower()
        tmp_text_array = text.split(" ")

        tmp_text_array = remove_unwanted_characters(tmp_text_array)

        # Lemmatize, e.g. change "caring" to "care"
        tmp_text_array = lemmatize(tmp_text_array)

        # Remove stopwords such as "and", "be", "mine"
        tmp_text_array = remove_stopwords(tmp_text_array)

        tmp_text_array = token_postag(tmp_text_array)

        text_array.append(tmp_text_array)

        word_freq = freq_words(tmp_text_array)

        # Write this record's processed tokens, then reset the buffer.
        write_data_text_array(text_array)

        text_array.clear()

        write_data_token_frequency(word_freq)
def clustering(dic):

    df = pd.read_csv('./data/hateb.csv')
    td = []

    with open("./data/stop.txt", "r") as f:
        stop_list = [v.rstrip() for v in f.readlines() if v != '\n']

    # Split each document into words and append it to the list; roughly: [([word1, word2, word3], doc_id), ...]
    # words: the list of words contained in the document (duplicates allowed)
    # tags: identifiers for the document (given as a list; one document can carry several tags)
    for i in range(len(df)):
        wordlist = parseText(text=str(df['content'][i]), sysdic=dic)
        # Normalize character types and absorb spelling / orthographic variation
        normalizedlist = [normalize(word) for word in wordlist]
        # Remove stopwords
        stopremovedlist = remove_stopwords(normalizedlist, stop_list)
        td.append(TaggedDocument(words=stopremovedlist, tags=[i]))

    # Build the Doc2Vec model
    model = Doc2Vec(documents=td, dm=1, vector_size=300, window=8, min_count=10, workers=4)

    # Store the document vectors in a list
    vectors_list = [model.docvecs[n] for n in range(len(model.docvecs))]

    # List of document numbers
    doc_nums = range(len(model.docvecs))

    # Clustering settings
    # Change n_clusters if you want a different number of clusters
    n_clusters = 8
    kmeans_model = KMeans(n_clusters=n_clusters, verbose=1, random_state=1, n_jobs=-1)

    # Run the clustering
    kmeans_model.fit(vectors_list)

    # Labels assigned to the clustered documents
    labels = kmeans_model.labels_

    # Build a dict mapping cluster labels to document numbers
    cluster_to_docs = defaultdict(list)
    for cluster_id, doc_num in zip(labels, doc_nums):
        cluster_to_docs[cluster_id].append(doc_num)

    # Print each cluster
    for docs in cluster_to_docs.values():
        print(docs)


    # Add a cluster_id column to the DataFrame
    df['cluster_id'] = labels

    df.to_csv('data/hateb_cluster.csv')
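This example feeds normalized tokens and an explicit stop list into remove_stopwords rather than a language code. The helpers it assumes (parseText, which appears to wrap a MeCab-style tokenizer driven by a system dictionary, plus normalize and the list-based remove_stopwords) are not shown in the source; a minimal sketch of the latter two, under those assumptions:

import unicodedata

def normalize(word):
    # Unify character width and case so spelling variants collapse together (assumed behaviour).
    return unicodedata.normalize('NFKC', word).lower()

def remove_stopwords(words, stop_list):
    # Keep only the tokens that are not in the explicit stop list.
    return [w for w in words if w not in stop_list]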
def create_posneg(data):

    stopwords.remove_stopwords(data)
    training_without_stopwords = open("data_without_stopwords.txt", "r")
    positive_superdoc = open("positivedata.txt", "w")
    negative_superdoc = open("negativedata.txt", "w")

    # Route each word to the positive or the negative superdocument based on
    # the most recent '+' / '-' marker seen on the line.
    for line in training_without_stopwords:

        words_in_line = line.split()

        FLAG = 0

        for word in words_in_line:
            if word == '+':
                positive_superdoc.write(word)
                FLAG = 1
                continue
            if word == '-':
                negative_superdoc.write(word)
                FLAG = 0
                continue
            if FLAG == 1:
                positive_superdoc.write(" " + word)
            else:
                negative_superdoc.write(" " + word)

        if FLAG == 1:
            positive_superdoc.write("\n")
        else:
            negative_superdoc.write("\n")

    positive_superdoc.close()
    negative_superdoc.close()
    training_without_stopwords.close()

    #print("Positive and Negative files created")

    return
Example #7
def datainit(data):

    stopwords.remove_stopwords(data)
    global unigrams
    unigrams = unigramvocabulary.unigramvocabulary(
        "data_without_stopwords.txt", "vocabulary.txt")
    #print(unigrams)
    addstartstop.add_start_stop("data_without_stopwords.txt")
    global bigrams
    bigrams = bigramvocabulary.bigramvocabulary("data_with_startstop.txt",
                                                "vocabulary.txt")
    #print(bigrams)
    createposneg.create_posneg("data_with_startstop.txt")

    # Count the number of entries in the vocabulary file.
    global vocab_count
    vocab_count = 0
    with open("vocabulary.txt", "r") as vocab_file:
        for line in vocab_file:
            vocab_count += 1
    #print(vocab_count)

    return
Example #8
def clean_data(data):
	# Remove all the URLs first
	p = re.compile(r'((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
	data = p.sub('', data)
	
	# Strip the punctuation characters we don't want.
	p = re.compile(r'[\-\.\,\!\?\+\=\[\]\/\'\"\:\)\(\;\']')
	data = p.sub(' ', data)
	
	# Remove all the stopwords and lowercase the data.
	data = remove_stopwords(data)
	data = data.lower()
	
	return data
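A quick usage sketch for clean_data; it assumes re is imported and a string-in/string-out remove_stopwords is in scope, and the sample text below is made up:

sample = "Check out https://example.com - the phone camera is awesome!"
cleaned = clean_data(sample)
# Roughly "check phone camera awesome", depending on the stopword list in use.
print(cleaned)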
    def document_terms():
        for filepath, content, date in documents():
            print(filepath)

            extension = path.splitext(filepath)[1]

            words = None
            title = filename(filepath)

            if extension in ['.html', '.htm', '.jspy']:
                html_title, words, links = tokenize_html(content)
                html_title = html_title.strip()
                if html_title:
                    title = html_title
            else:
                words = tokenize_text(content)

            words = remove_stopwords(words, stopword_list)

            words = (stem(word) for word in words)

            yield title, filepath, words, date
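document_terms yields one (title, filepath, words, date) tuple per document, with words left as a lazy generator of stems; a consumer might drive it like this (add_to_index is a hypothetical name, not part of the original code):

for title, filepath, words, date in document_terms():
    # Materialize the stemmed terms and hand them to whatever indexer is in use.
    add_to_index(title, filepath, list(words), date)  # hypothetical indexer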
Example #13
		# The data string will hold all our text concatenated. Perhaps this is not the fastest way,
		# since strings are immutable. Might convert this to a list in the future and then join if needed.
		data = ""
		for entry in timeline:
			data += " %s" % entry['text']
		
		# Remove all the URLs first
		p = re.compile(r'((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
		data = p.sub('', data)
		
		# Strip the punctuation characters we don't want.
		p = re.compile(r'[\-\.\,\!\?\+\=\[\]\/\'\"]') # Add foreign languages here
		data = p.sub(' ', data)
		
		# Remove all the stopwords and lowercase the data.
		data = remove_stopwords(data)
		data = data.lower()
		
		# The three dicts will hold our words and the number of times they've
		# been used for later tag cloud generation.
		topics = {}
		mentions = {}
		hashtags = {}
		
		# Loop through all the words, separate them into topics, hashtags and mentions.
		for word in data.split():
			if word.startswith('@'):
				d = mentions
			elif word.startswith('#'):
				d = hashtags
			else: