def save(self, *args, **kwargs):
    # Build the searchable descriptions for both languages.
    self._en_description = stopwords.save_description(
        self.en_principio_activo, self.en_accion_terapeutica,
        self.en_presentacion, self.en_concentracion, self.form, 'en')
    self._es_description = stopwords.save_description(
        self.principio_activo, self.accion_terapeutica,
        self.presentacion, self.concentracion, self.form, 'es')
    # Tokenize the product names and strip stopwords in each language.
    tmp = stopwords.word_tokenize(self.en_name)
    self._en_name = stopwords.remove_stopwords(tmp, 'en')
    tmp = stopwords.word_tokenize(self.es_name)
    self._es_name = stopwords.remove_stopwords(tmp)
    super(Products, self).save(*args, **kwargs)
    # Refresh the cached collections after saving.
    import views
    views.collection_es = views.get_collection('es')
    views.collection_en = views.get_collection('en')
def process_text():
    # text = "the phone camera is awesome"
    collection = read_raw_data()
    for record in collection.find({}):
        text = record['text']
        text = text.lower()
        tmp_text_array = text.split(" ")
        tmp_text_array = remove_unwanted_characters(tmp_text_array)
        # Change "caring" to "care"
        tmp_text_array = lemmatize(tmp_text_array)
        # Remove stopwords such as "and", "be", "mine"
        tmp_text_array = remove_stopwords(tmp_text_array)
        tmp_text_array = token_postag(tmp_text_array)
        text_array.append(tmp_text_array)
        token_frequency = freq_words(tmp_text_array)
        write_data_text_array(text_array)
        text_array.clear()
        write_data_token_frequency(token_frequency)
def clustering(dic):
    df = pd.read_csv('./data/hateb.csv')
    td = []
    with open("./data/stop.txt", "r") as f:
        stop_list = [v.rstrip() for v in f.readlines() if v != '\n']

    # Split each document into words and collect them as
    # [([word1, word2, word3], doc_id), ...]
    # words: the list of words in the document (duplicates allowed)
    # tags: document identifiers (given as a list; one document can carry multiple tags)
    for i in range(len(df)):
        wordlist = parseText(text=str(df['content'][i]), sysdic=dic)
        # Unify character types and absorb spelling / orthographic variation
        normalizedlist = [normalize(word) for word in wordlist]
        # Remove stopwords
        stopremovedlist = remove_stopwords(normalizedlist, stop_list)
        td.append(TaggedDocument(words=stopremovedlist, tags=[i]))

    # Build the model
    model = Doc2Vec(documents=td, dm=1, vector_size=300, window=8,
                    min_count=10, workers=4)

    # Store the document vectors in a list
    vectors_list = [model.docvecs[n] for n in range(len(model.docvecs))]
    # List of document numbers
    doc_nums = range(len(model.docvecs))

    # Clustering settings
    # Change n_clusters if you want a different number of clusters
    n_clusters = 8
    kmeans_model = KMeans(n_clusters=n_clusters, verbose=1, random_state=1, n_jobs=-1)
    # Run the clustering
    kmeans_model.fit(vectors_list)

    # Attach labels to the clustered data
    labels = kmeans_model.labels_

    # Build a dict mapping cluster labels to document numbers
    cluster_to_docs = defaultdict(list)
    for cluster_id, doc_num in zip(labels, doc_nums):
        cluster_to_docs[cluster_id].append(doc_num)

    # Print the clusters
    for docs in cluster_to_docs.values():
        print(docs)

    # Add a cluster_id column to the DataFrame
    df['cluster_id'] = labels
    df.to_csv('data/hateb_cluster.csv')
def datainit(data):
    stopwords.remove_stopwords(data)
    global unigrams
    unigrams = unigramvocabulary.unigramvocabulary("data_without_stopwords.txt",
                                                   "vocabulary.txt")
    #print(unigrams)
    addstartstop.add_start_stop("data_without_stopwords.txt")
    global bigrams
    bigrams = bigramvocabulary.bigramvocabulary("data_with_startstop.txt",
                                                "vocabulary.txt")
    #print(bigrams)
    createposneg.create_posneg("data_with_startstop.txt")
    # Count the vocabulary size (one entry per line).
    global vocab_count
    vocab_count = 0
    with open("vocabulary.txt", "r") as vocabulary_file:
        for line in vocabulary_file:
            vocab_count += 1
    #print(vocab_count)
    return
def create_posneg(data):
    stopwords.remove_stopwords(data)
    training_without_stopwords = open("data_without_stopwords.txt", "r")
    positive_superdoc = open("positivedata.txt", "w")
    negative_superdoc = open("negativedata.txt", "w")
    # Split each line into a positive or negative "superdocument": words that
    # follow a '+' token go to positivedata.txt, words that follow a '-' token
    # (or precede any marker) go to negativedata.txt.
    for line in training_without_stopwords:
        words_in_line = line.split()
        FLAG = 0
        for word in words_in_line:
            if word == '+':
                positive_superdoc.write(word)
                FLAG = 1
                continue
            if word == '-':
                negative_superdoc.write(word)
                FLAG = 0
                continue
            if FLAG == 1:
                positive_superdoc.write(" " + word)
            else:
                negative_superdoc.write(" " + word)
        if FLAG == 1:
            positive_superdoc.write("\n")
        else:
            negative_superdoc.write("\n")
    training_without_stopwords.close()
    positive_superdoc.close()
    negative_superdoc.close()
    #print("Positive and Negative files created")
    return
def clean_data(data):
    # Remove all the URLs first
    p = re.compile(r'((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
    data = p.sub('', data)
    # Let's remove everything that doesn't match characters we allow.
    p = re.compile(r'[\-\.\,\!\?\+\=\[\]\/\'\"\:\)\(\;\']')
    data = p.sub(' ', data)
    # Remove all the stopwords and lowercase the data.
    data = remove_stopwords(data)
    data = data.lower()
    return data
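# Several of these snippets rely on an external remove_stopwords() helper that
# is not shown. A minimal sketch of the string-in/string-out variant used by
# clean_data() might look like the following; it assumes NLTK's English
# stopword list, which is not necessarily what the original code used.
import nltk
from nltk.corpus import stopwords as nltk_stopwords

nltk.download('stopwords', quiet=True)
_STOPWORDS = set(nltk_stopwords.words('english'))


def remove_stopwords(data):
    # Keep only the tokens that are not in the stopword set.
    return ' '.join(word for word in data.split()
                    if word.lower() not in _STOPWORDS)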
def document_terms():
    for filepath, content, date in documents():
        print(filepath)
        extension = path.splitext(filepath)[1]
        words = None
        title = filename(filepath)
        if extension in ['.html', '.htm', '.jspy']:
            html_title, words, links = tokenize_html(content)
            html_title = html_title.strip()
            if html_title:
                title = html_title
        else:
            words = tokenize_text(content)
        words = remove_stopwords(words, stopword_list)
        words = (stem(word) for word in words)
        yield title, filepath, words, date
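# A hedged usage sketch (not from the original source): one way to consume
# document_terms() is to build a simple inverted index mapping each stemmed
# term to the files it occurs in. The index structure below is an assumption.
inverted_index = {}
for title, filepath, words, date in document_terms():
    for word in words:
        inverted_index.setdefault(word, set()).add(filepath)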
# The data string will hold all our text concatenated. Perhaps this is not the
# fastest way, as strings are immutable. Might convert this to a list in the
# future and then join if needed.
data = ""
for entry in timeline:
    data += " %s" % entry['text']

# Remove all the URLs first
p = re.compile(r'((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
data = p.sub('', data)

# Let's remove everything that doesn't match characters we allow.
p = re.compile(r'[\-\.\,\!\?\+\=\[\]\/\'\"]')  # Add foreign languages here
data = p.sub(' ', data)

# Remove all the stopwords and lowercase the data.
data = remove_stopwords(data)
data = data.lower()

# The three dicts will hold our words and the number of times they've
# been used for later tag cloud generation.
topics = {}
mentions = {}
hashtags = {}

# Loop through all the words, separate them into topics, hashtags and mentions.
for word in data.split():
    if word.startswith('@'):
        d = mentions
    elif word.startswith('#'):
        d = hashtags
    else:
        d = topics
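    # Assumed continuation (not part of the original snippet): tally the word
    # in whichever dict was selected above, for later tag cloud generation.
    d[word] = d.get(word, 0) + 1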