def findSimilarities(self, texts):
    """Report, for every text, the indexed documents similar to it.

    Each text is tokenised into a ``doc_<n>`` corpus entry, the session
    server stored under ``<cwd>/gensim_server/`` is queried with the id
    ``doc_<n>``, and one line per hit (plus the tokens both documents have
    in common) is appended to ``self.output``.

    Nothing is trained or indexed here, so the server must already hold an
    index whose ids follow the ``doc_<n>`` numbering of ``texts``.

    :param texts: sequence of raw text strings; position ``n`` maps to the
        document id ``doc_<n>``.
    :return: ``self.output`` with the report lines appended.
    """
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)

    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)

    server = SessionServer(gss)

    corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]

    for n, entry in enumerate(corpus):
        doc = u"doc_%d" % n
        tokens_n = entry[u"tokens"]

        header = u"Find similar doc_%d to %s" % (n, tokens_n)
        self.output += header + os.linesep
        # Log only the line just added instead of the whole report so far.
        logger.info(header)

        for sim in server.find_similar(doc):
            # Parse the full numeric suffix: taking only the last character
            # mapped "doc_12" to document 2.
            m = int(sim[0].rsplit(u"_", 1)[-1])
            if m == n:
                continue

            tokens_m = corpus[m][u"tokens"]
            hit = u"\t%s \t %3.2f : %s" % (sim[0], float(sim[1]), tokens_m)
            self.output += hit + os.linesep
            logger.info(hit)

            common = set(unicode(y) for y in tokens_m) & set(unicode(x) for x in tokens_n)
            shared = u"\tCommon Topics : %s" % (list(common),)
            self.output += shared + os.linesep
            logger.info(shared)

    return self.output
def _clean_body(self):
    """Run the post body through the cleaning pipeline and store the result.

    Code blocks are stripped first, then special characters; the remaining
    text is tokenised and ``self.body`` is replaced by whatever
    ``remove_short_words`` returns for those tokens.
    """
    without_code = self.strip_code_blocks(self.body)
    plain_text = self.remove_special_characters(without_code)
    self.body = self.remove_short_words(utils.simple_preprocess(plain_text))
def iter_rows(_host, _user, _passwd, _db, _table, _group, _value):
    """Yield the tokenised ``tweet_text`` of every matching row of a table.

    :param _host: MySQL host name.
    :param _user: MySQL user name.
    :param _passwd: MySQL password.
    :param _db: database name.
    :param _table: table to read. It is interpolated into the SQL text
        (identifiers cannot be bound), so it must come from trusted code.
    :param _group: optional column name for a ``WHERE`` filter; ``None``
        reads the whole table. Interpolated like ``_table``.
    :param _value: value ``_group`` is compared with; sent as a bound
        parameter so it cannot alter the statement.
    :return: generator of token lists, one per row.

    Errors are reported with ``print`` rather than raised (best effort); the
    cursor and connection are closed in every case.
    """
    try:
        mydb = MySQLdb.connect(host=_host, user=_user, passwd=_passwd, db=_db)
        try:
            cursor = mydb.cursor()
            try:
                sql = "SELECT tweet_text FROM %s" % (_table)
                params = None
                if _group is not None:
                    # "%%s" leaves a literal %s placeholder for the driver.
                    sql += " WHERE %s = %%s" % (_group,)
                    params = (_value,)

                cursor.execute(sql, params)
                for row in cursor.fetchall():
                    # parse document into a list of utf8 tokens
                    yield utils.simple_preprocess(row[0])
            finally:
                cursor.close()
        finally:
            mydb.close()
    except MySQLdb.Error as e:
        print(e)
    except Exception:
        # Not a bare except: closing the generator early must not be
        # reported as an error.
        print("Unknown error occurred")
def iter_rows(table_name):
    """Yield the tokenised ``tweet_text`` of every row of ``table_name``.

    Connects to the local ``text_mining`` database with the credentials
    hard-coded below.

    :param table_name: table to read. It is interpolated into the SQL text
        (identifiers cannot be bound), so it must come from trusted code.
    :return: generator of token lists, one per row.

    Errors are reported with ``print`` rather than raised (best effort); the
    cursor and connection are closed in every case.
    """
    try:
        mydb = MySQLdb.connect(host='localhost', user='******', passwd='root', db='text_mining')
        try:
            cursor = mydb.cursor()
            try:
                sql = "SELECT tweet_text FROM %s" % (table_name)
                cursor.execute(sql)
                for row in cursor.fetchall():
                    # parse document into a list of utf8 tokens
                    yield utils.simple_preprocess(row[0])
            finally:
                cursor.close()
        finally:
            mydb.close()
    except MySQLdb.Error as e:
        print(e)
    except Exception:
        # Not a bare except: closing the generator early must not be
        # reported as an error.
        print("Unknown error occurred")
def iter_documents(reuters_dir):
    """Iterate over Reuters documents, yielding one document at a time.

    :param reuters_dir: directory whose entries are each read as one document.
    :return: generator of token lists, one per directory entry, in
        ``os.listdir`` order.
    """
    for fname in os.listdir(reuters_dir):
        # Read each document as one big string; the handle is closed before
        # the tokens are handed to the consumer.
        with open(os.path.join(reuters_dir, fname)) as handle:
            document = handle.read()
        # parse document into a list of utf8 tokens
        yield utils.simple_preprocess(document)
def test_index(self): # a stupid first training set texts = ["Human machine interface for lab abc computer applications", "A survey of user opinion of computer system response time", "The EPS user interface management system", "System and human system engineering testing of EPS", "Relation of user perceived response time to error measurement", "The generation of random binary unordered trees", "The intersection graph of paths in trees", "Graph minors IV Widths of trees and well quasi ordering", "Graph minors A survey"] corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)} for num, text in enumerate(texts)] service_id = '1234' #from gensim.similarities.simserver import SessionServer # service = similarities.SessionServer('bla') # create a local server # service = SessionServer(self.rootlocation, autosession=True) import Pyro4 service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver')) service.train(corpus, method='lsi') ## TODO we don't have a corpus yet, but we definatly need one big ''' texts = ["Human machine interface for lab abc computer applications",
def preprocessText(self, text):
    """Tokenise *text*, drop stop words and stem the remaining words.

    :param text: raw text to process.
    :return: list of stems produced by ``self.stemmer`` for every token that
        is not in ``self.stopWords``.
    """
    kept = []
    for token in simple_preprocess(text):
        if token in self.stopWords:
            continue
        kept.append(token.strip())
    return list(map(self.stemmer.stem, kept))
def main():
    """Train and index the similarity server from ``./items.json``.

    For every item an ``identifier``, ``title`` and ``summary`` are
    extracted; their concatenation is tokenised and the whole collection is
    used to train (LSI), index and commit the session server stored in
    ``./thesite/simdatabase``. Items that cannot be parsed are reported with
    ``error`` and contribute whatever fields were read before the failure.
    """
    with open('./items.json') as json_data:
        data = json.load(json_data)
    print('starting')

    # NOTE(review): the range stops one short of the last item, exactly as
    # before - confirm whether skipping the final entry is intended.
    for i in range(0, len(data) - 1):
        print(i)
        identifier = ""
        title = ""
        # Reset per item: previously a failing item reused the summary of the
        # item before it (or hit a NameError on the very first item).
        summary = ""
        totalText = ""
        try:
            s = data[i]['identifier']
            identifier = s[0][18:].replace("%3A", "")
            summary = data[i]['desc'][0].strip()
            title = data[i]['title'][0].strip()
            totalText = " ".join((summary, title, identifier))
        except Exception:
            print("error")
        documentPayload = {'identifier': identifier, 'title': title, 'summary': summary}
        documents.append({'text': totalText, 'payload': documentPayload})

    corpus = [{'id': text['payload']['identifier'],
               'tokens': utils.simple_preprocess(text['text']),
               'payload': text['payload']}
              for text in documents]

    service = SessionServer('./thesite/simdatabase')
    service.train(corpus, method='lsi')
    service.index(corpus)
    service.commit()
def prune(doc, stoplist = None, stem = True, english_dictionary_words = False):
    """Tokenise one document and strip everything that should not be indexed.

    :param doc: the document text.
    :param stoplist: optional container of extra words to drop.
    :param stem: when true, reduce the remaining words with the English
        Snowball stemmer.
    :param english_dictionary_words: when true, keep only words accepted by
        the ``en_US`` enchant dictionary (checked after stemming).
    :return: list of cleaned tokens, ready to be loaded into a dictionary.
    """
    # Tokenize the document and make it lowercase
    temp = utils.simple_preprocess(doc.lower())

    # Remove freestanding punctuation and punctuation in words
    temp = [rmPunct(w) for w in temp if w not in string.punctuation]

    # Remove words in passed stoplist
    if stoplist:
        temp = [w for w in temp if w not in stoplist]

    # Remove specific tokens and stopwords. Both collections are built once
    # here rather than once per word.
    unwanted = {'[', ']', "'", '\n', 'com'}
    unwanted.update(stopwords.words('english'))
    temp = [w for w in temp if w not in unwanted]

    # Stem the remaining words
    if stem:
        stemmer = SnowballStemmer('english')
        temp = [stemmer.stem(w) for w in temp]

    if english_dictionary_words:
        d = enchant.Dict("en_US")
        temp = [w for w in temp if d.check(w)]

    return temp
def commit_indexing_set(self):
    """Index every document currently queued in the "new items" Redis set.

    The queued item ids are read from ``self.s_redis_new_item_list`` and
    removed from it, their texts are fetched from the
    ``self.h_redis_itemid_document_text`` hash, tokenised, and sent to the
    Pyro-registered ``gensim.testserver`` for indexing.

    :return: the string ``'indexing done'``.
    """
    service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))

    # NOTE(review): the result was never used by the original code either;
    # the call is kept in case get_indexing_set() has side effects - confirm.
    self.get_indexing_set()

    keys = list(self.redis_con.smembers(self.s_redis_new_item_list))
    if not keys:
        # Nothing queued: SREM and HMGET both reject an empty argument list.
        return 'indexing done'

    # SREM takes the members as separate arguments; handing it the whole set
    # as a single value did not remove the individual ids.
    self.redis_con.srem(self.s_redis_new_item_list, *keys)

    # One HMGET round trip instead of one HGET per key; results come back in
    # the order of ``keys``.
    texts = self.redis_con.hmget(self.h_redis_itemid_document_text, keys)
    corpus = [{"id": item_id, "tokens": utils.simple_preprocess(text)}
              for item_id, text in zip(keys, texts)]

    service.index(corpus)
    # TODO: we don't have a corpus yet, but we definitely need a big one
    return 'indexing done'
def _clean_text(self, item_description):
    """Turn an item description into a list of non-stop-word tokens.

    Characters outside ``string.printable`` are dropped before the text is
    tokenised; tokens found in ``self._stopwords`` are then removed.

    :param item_description: raw description text.
    :return: list of remaining tokens, in order.
    """
    from string import printable
    from gensim.utils import simple_preprocess

    printable_text = "".join([char for char in item_description if char in printable])
    words = []
    for token in simple_preprocess(printable_text):
        if token not in self._stopwords:
            words.append(token)
    return words
def _create_corpus(self, texts):
    """Build corpus entries from ``(id, text)`` pairs.

    :param texts: iterable of 2-item ``(id, text)`` pairs.
    :return: list of ``{'id': ..., 'tokens': ...}`` dicts, one per pair.
    """
    return [
        {'id': doc_id, 'tokens': utils.simple_preprocess(body)}
        for doc_id, body in texts
    ]
def read_corpus(self, fname, stop_words=False):
    """Yield one ``TaggedDocument`` per line of *fname*, tagged by line index.

    :param fname: path of the text file to read.
    :param stop_words: selects the tokeniser only - when true the line goes
        through ``simple_preprocess``, otherwise it is split on whitespace.
        No stop-word list is applied in either case.
    """
    with open(fname) as handle:
        for line_no, raw_line in enumerate(handle):
            words = utils.simple_preprocess(raw_line) if stop_words else raw_line.split()
            yield TaggedDocument(words, [line_no])
def topicos(frase: str):
    """Return the topics of an arbitrary sentence.

    The sentence is tokenised, a vector is inferred for it with the
    module-level ``model``, and the ten most similar document vectors are
    looked up.

    :param frase: the sentence to analyse.
    :return: dict with the single key ``'topicos'`` holding the
        ``most_similar`` result.
    """
    sentence_vector = model.infer_vector(simple_preprocess(frase))
    return {'topicos': model.docvecs.most_similar([sentence_vector], topn=10)}
def generate_ngrams(train_set, n=1):
    """Tokenise the training sentences and build their dictionary and matrix.

    The sentence text is taken from index 2 of every training item and
    tokenised with ``max_len=20`` (passed to ``simple_preprocess``).

    :param train_set: iterable of items whose element ``[2]`` is the text.
    :param n: accepted but not used by this function.
    :return: ``(matrix_train, token_dict)`` as produced by
        ``generate_matrix`` and ``generate_dictionary``.
    """
    tokenised = []
    for sample in train_set:
        tokenised.append(utils.simple_preprocess(sample[2], max_len=20))

    vocabulary = generate_dictionary(tokenised)
    train_matrix = generate_matrix(
        tokenised, vocabulary, maxlen=len(vocabulary), is_train=True)
    return train_matrix, vocabulary
def test_doc2vec_inference():
    """Querying a Doc2Vec-backed retrieval for "scientists" ranks doc 1 first."""
    tagged_docs = []
    for tag, text in enumerate(documents):
        tagged_docs.append(TaggedDocument(simple_preprocess(text), [tag]))

    # One epoch and min_count=1 are the settings this test trains with.
    d2v = Doc2VecInference(Doc2Vec(tagged_docs, epochs=1, min_count=1), DEFAULT_ANALYZER)
    retrieval = Retrieval(d2v, matching=Matching()).fit(documents)

    top_hits = retrieval.query("scientists")
    assert top_hits[0] == 1
def iter_wiki(self):
    """Yield each article of the Wikipedia dump as a ``(title, tokens)`` pair.

    Articles with fewer than 50 non-stop-word tokens are skipped, as are
    pages whose title starts with one of the ignored namespace prefixes.
    """
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(self.dump_file)):
        plain = filter_wiki(text)
        tokens = [word for word in simple_preprocess(plain) if word not in STOPWORDS]
        if len(tokens) < 50:
            continue  # too short to be a real article
        if any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # meta-article
        yield title, tokens
def tokenize(text, rm_ascii=False):
    """Tokenize *text* and remove stop words.

    The Gensim `simple_preprocess` works fine here because the Greek text
    has already been aggressively cleaned up.
    https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess

    :param text: the text to tokenize.
    :param rm_ascii: when true, characters found in ``ascii_str`` are removed
        before tokenizing.
    :return: list of tokens that are not in ``STOPS_LIST``.
    """
    if rm_ascii:
        text = ''.join([char for char in text if char not in ascii_str])
    candidates = simple_preprocess(text,
                                   deacc=PREPROCESS_DEACCENT,
                                   min_len=TOK_MIN,
                                   max_len=TOK_MAX)
    return [token for token in candidates if token not in STOPS_LIST]
def tokenize(text):
    """Tokenize raw text and filter out stop words.

    INPUT:
        [1] text (str): some raw text
    OUTPUT:
        [1] vector_tokens (list): tokens of ``text`` that are not in
            ``STOPWORDS``
    """
    vector_tokens = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS:
            vector_tokens.append(token)
    return vector_tokens
def __iter__(self):
    """Yield the token list of every post stored in ``self.files``.

    Each file is rewound and read one line at a time (memory friendly);
    every line is parsed as a JSON object whose ``"content"`` field is
    stripped of tags and tokenised. A post whose content cannot be processed
    yields an empty list instead of aborting the iteration.
    """
    for f in self.files:
        f.seek(0)  # Reset the file pointer before a new iteration
        for line in f:
            post = json.loads(line)
            try:
                # parse and split the content up into a list of lower-case words
                content = strip_tags(post["content"])
                doc_words = utils.simple_preprocess(content)
            except Exception:
                # Fails on some nasty unicode. ``Exception`` rather than a
                # bare except so Ctrl-C / SystemExit still stop the loop.
                doc_words = []
            yield doc_words
def testMallet2ModelOn20NewsGroups(self):
    """A Mallet LDA model and its gensim conversion show identical topics."""
    tokenised = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
    dictionary = Dictionary(tokenised)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenised]

    lda_mallet_model = ldamallet.LdaMallet(
        self.mallet_path,
        corpus=bow_corpus,
        num_topics=20,
        id2word=dictionary,
        iterations=500,
    )
    lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)

    expected_topics = lda_mallet_model.show_topics(20, 50)
    converted_topics = lda_gensim_model.show_topics(20, 50)
    self.assertEqual(expected_topics, converted_topics)
def add_document(corpus_name, document):
    """Tokenise *document* and register it under the named corpus.

    :param corpus_name: key into the module-level ``corpuses`` mapping; the
        corpus is created on first use.
    :param document: raw document text, stored as ``'body'``.
    :return: the new document id, ``doc_<position in the corpus>``.

    NOTE(review): ``documents`` is keyed by that id alone, so documents at
    the same position in different corpora share a key - confirm intended.
    """
    get_service()  # result unused here; call kept as in the original flow
    corpus = corpuses.setdefault(corpus_name, list())

    doc_id = 'doc_%s' % len(corpus)
    entry = {
        'id': doc_id,
        'tokens': utils.simple_preprocess(document),
        'body': document,
    }
    documents[doc_id] = entry
    corpus.append(entry)
    return doc_id
def generate_index(self):
    """Train and index ``self.service`` on every page stored in ``CrawlData``.

    Each file in the ``CrawlData`` directory is one page; its id is the file
    name with every whitespace character replaced by ``/``. The tokenised
    pages are used to train an LSI model and are then indexed.
    """
    def page_text():
        """Yield ``(page_url, page_content)`` for every crawled page file."""
        for page_file in os.listdir('CrawlData'):
            # ``with`` closes the file even if reading fails.
            with open('CrawlData/' + page_file, 'r') as content:
                page_content = content.read()
            # Raw string: '\s' in a plain literal is an invalid escape.
            page_url = re.sub(r'\s', '/', page_file)
            yield page_url, page_content

    corpus = [{'id': '%s' % url, 'tokens': utils.simple_preprocess(text)}
              for url, text in page_text()]
    self.service.train(corpus, method='lsi')
    self.service.index(corpus)
def create_clean_document(self, document):
    """Clean ``document.content`` in place and return the same document.

    Special characters are removed, the text is tokenised, stop words are
    dropped and the remaining tokens are joined back with single spaces.

    :param document: object with a ``content`` attribute; it is modified,
        not copied.
    :return: the document that was passed in.
    """
    without_specials = self.remove_special_characters(document.content)
    document.content = without_specials

    # Convert document into tokens and clean it
    kept_tokens = self.remove_stop_words(utils.simple_preprocess(document.content))
    document.content = ' '.join(kept_tokens)
    return document
def tokenize(doc, *, stopwords=None):
    """Tokenize a document, optionally removing stopwords.

    :param doc: A file-like object supporting .read()
    :param stopwords: The set of stopwords to remove, or None to keep all
    :return: The list of tokens of the document
    """
    tokens = utils.simple_preprocess(doc.read(), deacc=True)
    if stopwords is None:
        return tokens
    return [token for token in tokens if token not in stopwords]
def create_clean_corpus(self, raw_data):
    """Tokenise *raw_data* into ``self.corpus`` and build the vocabulary.

    The tokenised documents are appended to ``self.corpus``; every entry of
    ``self.corpus`` is then passed through ``remove_stop_words`` and the
    result is used to build ``self.vocabulary``.

    :param raw_data: iterable of raw document strings.
    """
    logging.info("\nCleaning data ...")
    # Convert each document into tokens
    self.corpus.extend(utils.simple_preprocess(document) for document in raw_data)

    logging.info("\nRemoving short stop words ...")
    for position in range(len(self.corpus)):
        self.corpus[position] = self.remove_stop_words(self.corpus[position])

    logging.info("\nRetrieving vocabulary ...")
    self.vocabulary = corpora.Dictionary(self.corpus)
def getvector(self,text,shelvedb):
    """Average the stored 100-dimensional vectors of the tokens of *text*.

    :param text: raw text to tokenise.
    :param shelvedb: mapping from token string to vector; tokens whose
        lookup or addition fails are ignored.
    :return: sum of the found vectors divided by the total number of tokens
        (found or not); the zero vector when there are no tokens.
    """
    tokens = utils.simple_preprocess(text)
    total = np.zeros(100)
    for token in tokens:
        try:
            total += shelvedb[str(token)]
        except Exception:
            # Best effort: skip tokens the store cannot serve.
            continue
    if not tokens:
        return total
    return total / float(len(tokens))
def buildCorpus():
    """Build the corpus from the files under ``data/0``.

    Only the sub-directory named ``0`` of ``data`` is read. Every file
    becomes one entry whose id is the file name up to its first dot.

    :return: list of ``{'id': ..., 'tokens': ...}`` dicts.
    """
    corpus = []
    for d in os.listdir('data'):
        if not d == '0':
            continue
        for f in os.listdir('data/' + d):
            # ``with`` closes each file instead of leaking the handle.
            with open('data/' + d + '/' + f) as handle:
                document = handle.read()
            pmcid = f.split('.')[0]
            corpus.append({'id': pmcid, 'tokens': utils.simple_preprocess(document)})
    return corpus
def run_evaluation(classifiers, models, eval_samples):
    """Count ``(actual, predicted)`` label pairs for every classifier.

    :param classifiers: mapping of model name to a classifier with
        ``predict``.
    :param models: mapping of model name to the model that turns a
        bag-of-words into features.
    :param eval_samples: iterable of ``(text, actual_label)`` pairs; it is
        walked once per classifier.
    :return: dict of model name to a ``defaultdict(int)`` keyed by
        ``(actual_label, predicted_label)``.
    """
    ln.info("Beginning evaluation")
    classifications = dict()

    for modelname, classifier in classifiers.items():
        model = models[modelname]
        pair_counts = defaultdict(int)

        for sample_no, sample in enumerate(eval_samples):
            eval_sample_text, actual_label = sample
            bow = dictionary.doc2bow(simple_preprocess(eval_sample_text))
            model_features = sparse2full(model[bow], model.__out_size)
            predicted_label = classifier.predict(model_features)[0]
            pair_counts[(actual_label, predicted_label)] += 1

            if sample_no % 500 == 0:
                ln.debug("Classifier for %s evaluated %s samples so far." % (modelname, sample_no))

        classifications[modelname] = pair_counts

    ln.info("Finished evaluation")
    return classifications
def findSimilar(texts, server, corpus):
    """Collect, for every text, the indexed documents similar enough to it.

    For each position ``n`` the server is queried with the id ``doc_<n>``.
    A hit is kept when it is a different document and its score exceeds the
    module-level ``similarity`` threshold.

    :param texts: sequence of texts; only its length is used.
    :param server: similarity server offering ``find_similar(doc_id)``.
    :param corpus: list of ``{"tokens": ...}`` entries indexed like the ids.
    :return: list of ``[doc_n, doc_m, score, common_tokens]`` entries.

    Errors while processing one document are logged and the remaining hits
    of that document are skipped; the other documents are still processed.
    """
    similarities = list()

    for n in range(0, len(texts)):
        doc_n = u"doc_%d" % n
        logger.info(u"%s" % doc_n)

        try:
            for sim in server.find_similar(doc_n):
                doc_m = sim[0]
                doc_similarity = float(sim[1])

                # Skip the document itself and hits at or below the threshold.
                if doc_m == doc_n or not doc_similarity > similarity:
                    continue

                # The numeric part of the id follows the first underscore.
                mi = int(doc_m.index(u"_") + 1)
                nm = int(doc_m[mi:])

                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[nm][u"tokens"]]
                lc = list(set(e) & set(d))

                if len(lc) == 0:
                    # Similar documents without a single shared token are
                    # treated as an error; the handler below logs it.
                    logger.error(u"Something is wrong here!")
                    raise Exception

                similarities.append([doc_n, doc_m, float(sim[1]), lc])
                logger.info(u"\t%s\t%s\t%3.2f\tCommon : %s" % (doc_n, doc_m, doc_similarity, lc))

        except Exception as msg:
            logger.error(u"%s", msg)

    # The collected results used to be dropped; hand them to the caller.
    return similarities
def __iter__(self):
    """Yield the token list of every line of every file in ``self.dirname``.

    Files are decoded as Latin-1 and each one is closed as soon as its last
    line has been consumed.
    """
    for fname in os.listdir(self.dirname):
        with open(os.path.join(self.dirname, fname), encoding='latin') as handle:
            for line in handle:
                yield simple_preprocess(line)
def make_texts_corpus(sentences):
    """Lazily yield the de-accented token list of every sentence.

    :param sentences: iterable of sentence strings.
    """
    for raw_sentence in sentences:
        tokens = simple_preprocess(raw_sentence, deacc=True)
        yield tokens
def __iter__(self):
    """Yield one ``TaggedDocument`` per line of ``lee_background.cor``.

    The tag of each document is ``self._tag(<line index>)``.
    """
    with open(datapath('lee_background.cor')) as corpus_file:
        for line_no, text in enumerate(corpus_file):
            words = utils.simple_preprocess(text)
            yield doc2vec.TaggedDocument(words, [self._tag(line_no)])
# Interactive classifier: load the trained Keras model and the Doc2Vec
# model, then classify every text typed at the prompt until interrupted.
model_name = "models/teached_d2w_v{}".format(version)

# Enable mixed-precision computation before the model is loaded.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)
print('Compute dtype: %s' % policy.compute_dtype)
print('Variable dtype: %s' % policy.variable_dtype)
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

model = load_model(
    'models/model_{}/teached_model_part_{}'.format(model_num, part))
d2v_model = Doc2Vec.load(model_name)

while True:
    # Text input
    predict_text = input("Введіть текст для розпізнавання: ")

    # Text preprocess
    tokenized_text = simple_preprocess(predict_text)

    # Vector presentation: one sample with a trailing axis of length 1
    vector = d2v_model.infer_vector(tokenized_text).tolist()
    test_text = np.expand_dims(np.asarray([vector]), -1)

    # Text class prediction: index of the largest score per sample
    y_pred = model.predict(test_text)
    y_pred = [np.argmax(scores) for scores in y_pred]

    if y_pred[0] != 0:
        print("Політичний напрям тексту Лібералізм")
    else:
        print("Політичний напрям тексту Консерватизм")
def __iter__(self):
    """Yield one ``TaggedDocument`` per row of the module-level ``dfR`` frame.

    The words come from the last column of the row and the single tag is the
    string form of its first column.
    """
    for i in range(dfR.shape[0]):
        # ``str(...)`` rather than ``str.format(...)``: the latter only
        # accepts str values and treats braces in them as format fields.
        yield TaggedDocument(words=simple_preprocess(dfR.iloc[i, -1]),
                             tags=[str(dfR.iloc[i, 0])])
# Pre-processing: tokenise, stem and filter every document of ``data``;
# documents left empty are dropped together with their ``target`` rows.
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use

additional_stop_words = [
    'does', 'don', 'did', 'think', 'help', 'need', 'just', 'know', 'hi',
    'want', 'really', 'thanks', 'way', 'good', 'say', 'like', 'use', 'www',
    'com', 'http', 'nhttp'
]
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
# Stem the stop words so they match the stemmed tokens below
stop_words = {stemmer.stem(word) for word in stop_words}

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    stems = (stemmer.stem(token)
             for token in simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = [stem for stem in stems if stem not in stop_words]
    if stemmed_doc:
        processed_data.append(stemmed_doc)
    else:
        # Empty document after pre-processing: to be removed
        id_to_delete.append(i)

data = processed_data
target = np.delete(target, id_to_delete, axis=0)

window = 10  # 50
model_path = "models/yahoo_w2v_window" + str(window) + ".model"
from util.load_data import normalize_text
from gensim.utils import simple_preprocess
from time import time

# Command-line classifier: ``--text`` is normalised, turned into a vector by
# the pickled transformer and labelled by the pickled estimator.
parser = argparse.ArgumentParser("classification.py")
text = parser.add_argument_group("The following arguments are mandatory for text option")
text.add_argument("--text", metavar="TEXT", help="text to predict", nargs="?")
args = parser.parse_args()

path = join(dirname(__file__), "models")
# NOTE: pickle executes code on load; these files must be trusted artefacts.
# ``with`` closes each model file once it has been read.
with open(join(path, "transformer.pkl"), 'rb') as model_file:
    transformer = pickle.load(model_file)
with open(join(path, "classifier.pkl"), 'rb') as model_file:
    estimator = pickle.load(model_file)

if not args.text:
    parser.print_help()
else:
    t0 = time()
    text = args.text
    text = normalize_text(text)
    print(text)
    X = transformer.infer_vector(simple_preprocess(text), steps=20)
    # The estimator gets a single-row 2-D input
    y = estimator.predict(X.reshape(1, -1))[0]
    classify_time = time() - t0
    print(y)
    print("process time: %0.3fs" % classify_time)
def remove_stopwords(texts):
    """Tokenise every document and drop the words found in ``stop_words``.

    Removing stop words makes the later extraction more effective.

    :param texts: iterable of documents; each is converted with ``str``.
    :return: list of token lists, one per document.
    """
    def keep_content_words(doc):
        """Return the tokens of one document that are not stop words."""
        return [word for word in simple_preprocess(str(doc)) if word not in stop_words]

    return [keep_content_words(doc) for doc in texts]
import gensim
import pickle
from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

print('[INFO] Necessary Libraries Imported')

# Fetch the training data
training_data = fetch_20newsgroups(subset='train')

print('[INFO] Reading training data')
wordlist = [simple_preprocess(news) for news in training_data.data]

print('[INFO] Creating Vocabulary')
# NOTE(review): gensim's Word2Vec already trains when sentences are passed to
# the constructor, so the train() call below adds further epochs - confirm
# that this is intended.
model = Word2Vec(wordlist, size=10, window=4, workers=10)

print('[INFO] Training model')
model.train(wordlist, total_examples=len(wordlist), epochs=10)

# ``with`` flushes and closes the file once the model has been written.
with open('word2vec_model', 'wb') as word2vec_file:
    pickle.dump(model, word2vec_file)
print('[INFO] Model Serialized and written to file')
import gensim.utils
import pandas as pd
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.test.utils import get_tmpfile
from gensim.similarities import Similarity

# Find near-duplicate articles: every pair (i, j) with i < j whose TF-IDF
# similarity reaches the threshold given on the command line is recorded.
if len(sys.argv)!=2:
    print('You must enter a similarity threshold as an argument during the execution command i.e. python duplicates.py 0.7')
    # ``sys.exit`` instead of the interactive-only ``exit`` helper.
    sys.exit()

similarity = float(sys.argv[1])
df = pd.read_csv('./train_set.csv', sep='\t')

# Tokenise every article once and reuse the tokens for the dictionary and
# the bag-of-words corpus.
tokenized_articles = [simple_preprocess(article) for article in df.Content]
dictionary = Dictionary(tokenized_articles)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_articles]

tfidf = TfidfModel(corpus)
index_temp = get_tmpfile("index")
index = Similarity(index_temp, tfidf[corpus], num_features=len(dictionary))

duplicate_count = 0
duplicates = []
for i, s in enumerate(index):
    # Only look at j > i so that every pair is reported once.
    for j, similarity_value in enumerate(s[i + 1:], start=i + 1):
        if similarity_value >= similarity:
            duplicate_count += 1
            duplicates.append([i, j, similarity_value])
def sent_to_words(sentences):
    """Lazily yield the de-accented token list of every sentence.

    :param sentences: iterable of sentences; each is converted with ``str``.
    """
    for item in sentences:
        as_text = str(item)
        yield simple_preprocess(as_text, deacc=True)
def remove_stopwords(texts, passed_stop_words):
    """Tokenise every document and drop English and caller-supplied stop words.

    :param texts: iterable of documents; each is converted with ``str``.
    :param passed_stop_words: iterable of additional words to remove.
    :return: list of token lists, one per document.
    """
    # A set gives constant-time membership tests; the list used before was
    # scanned once per token.
    stop_words = set(stopwords.words('english'))
    stop_words.update(passed_stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]
def preprocess(text):
    """Tokenise *text* and lemmatise/stem its meaningful tokens.

    Tokens in ``STOPWORDS`` and tokens of two characters or fewer are
    dropped; the rest are passed through ``lemmatize_stemming``.

    :return: list of processed tokens, in order.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) > 2
    ]
# Train a toxic / non-toxic classifier on the stored document vectors and
# optionally try it out interactively.
train_arrays = numpy.zeros((32398, 300))
train_labels = numpy.zeros(32398)

# Rows 0..16198 hold the toxic vectors (label 1); rows 16199..32397 hold the
# non-toxic ones (label 0).
for i in range(16199):
    train_arrays[i] = model['TRAIN_TOXIC_' + str(i)]
    train_arrays[16199 + i] = model['TRAIN_NONTOXIC_' + str(i)]
    train_labels[i], train_labels[16199 + i] = 1, 0

classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

if input('Test classifier? (y/n): ') == 'y':
    while True:
        tmp = input('Enter a phrase (0 to quit): ')
        if tmp == '0':
            break
        vec = model.infer_vector(utils.simple_preprocess(doc=tmp, deacc=True))
        prob = classifier.predict_proba(vec.reshape(1, -1))
        # Column 0 is the probability of label 0 (non-toxic), column 1 of
        # label 1 (toxic).
        non_toxic, toxic = prob[0][0], prob[0][1]
        if non_toxic > toxic:
            print('Non-toxic')
            print('Confidence: ', non_toxic)
        else:
            print('Toxic')
            print('Confidence: ', toxic)
from keras.layers import Flatten, merge from keras.layers.embeddings import Embedding from keras.utils import to_categorical from gensim.models import Word2Vec from gensim.utils import simple_preprocess from keras.engine import Input from keras.layers import Dense, Input, Flatten from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, TimeDistributed from keras.models import Model #================================ #============word2vec============ #================================ # tokenizer: can change this as needed tokenize = lambda x: simple_preprocess(x) def create_embeddings(data_dir, embeddings_path='embeddings.npz', vocab_path='map.json', **params): """ Generate embeddings from a batch of text :param embeddings_path: where to save the embeddings :param vocab_path: where to save the word-index map """ class SentenceGenerator(object): def __init__(self, dirname): self.dirname = dirname
# Load the training reviews, remove stop words, and keep only the tokens
# that occur more than once in the whole collection.
fileName = 'MonAmiGabiTraining'
training = extract_reviews('../data/' + fileName + '.pkl')
documents = training

more_stopwords = [
    'great', 'good', 'like', 'le', 'la', 'time', 'think', 'wasnt', 'est',
    've', 'et', 'les', 'restaurant', 'nice', 'service', 'yelp', 'www',
    'http', 'com', 'select'
]
more_stopwords.extend(STOPWORDS)

texts = []
for document in documents:
    kept_words = [word for word in simple_preprocess(document)
                  if word not in more_stopwords]
    texts.append(kept_words)
print("Texts after STOPWORDS: ", texts)

from collections import defaultdict

# Count how often every token occurs across all texts
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] = frequency[token] + 1

# Drop the tokens that were seen only once
texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
def remove_stopwords(texts):
    """Tokenise every document and drop the words found in ``stop_words``.

    :param texts: iterable of documents; each is converted with ``str``.
    :return: list of token lists, one per document.
    """
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in stop_words])
    return cleaned
#%% PCA version
for y in range(2007, 2017):
    s = tic()
    y1, y2 = y + 2, y + 3

    # Training window: years y..y1; test window: year y2
    by_date = onxy.set_index('date')
    tr = by_date.loc[str(y):str(y1)].reset_index()
    te = by_date.loc[str(y2)].reset_index()

    # Text summary of the training texts (takes about 2 minutes)
    tr['smry'] = list(map(txtsum, tr.text))
    print(toc(s))

    # Text summary of the test texts
    te['smry'] = list(map(txtsum, te.text))
    tr = tr.loc[tr.smry.notna()]
    te = te.loc[te.smry.notna()]

    # doc2vec
    def tokf(t):
        """Return the de-accented tokens of ``t`` that are not stop words."""
        return [word for word in simple_preprocess(t, deacc=True)
                if word not in stop_words]

    tagged = []
    for i, t in enumerate(tr.smry):
        tagged.append(TaggedDocument(words=tokf(t), tags=[i]))

    dvmod = Doc2Vec(vector_size=100, epochs=10, workers=4)
    dvmod.build_vocab(tagged)
    dvmod.train(tagged,
                total_examples=dvmod.corpus_count,
                epochs=dvmod.epochs)
    trdv = pd.DataFrame([dvmod.infer_vector(tokf(t)) for t in tr.smry])
    tedv = pd.DataFrame([dvmod.infer_vector(tokf(t)) for t in te.smry])

    # doc2vec pca: fit 20 components, keep the first 11 as named columns
    pca = PCA(n_components=20)
    pca.fit(trdv)
    pca_columns = ['pca' + str(i + 1) for i in range(11)]
    trpca = pd.DataFrame(pca.transform(trdv)[:, :11], columns=pca_columns)
def __init__(self, documents, speed="fast-learn", document_ids=None, keep_documents=True, workers=None):
    """Train the document embedding and derive topics from it.

    :param documents: list of strings to train on.
    :param speed: training preset, one of ``fast-learn``, ``learn``,
        ``deep-learn`` or ``test-learn``; it selects the ``hs``,
        ``negative`` and ``epochs`` values handed to Doc2Vec.
    :param document_ids: optional unique ids (all str or all int), one per
        document.
    :param keep_documents: when true the documents are stored on the
        instance as an array, otherwise ``self.documents`` is ``None``.
    :param workers: optional int passed to Doc2Vec as ``workers``.
    :raises ValueError: for an unknown speed, a non-int ``workers``,
        non-string documents, or ids that are mismatched in number,
        duplicated or of the wrong type.
    """
    # validate training inputs: (speed name, (hs, negative, epochs))
    speed_presets = (
        ("fast-learn", (0, 5, 40)),
        ("learn", (1, 0, 40)),
        ("deep-learn", (1, 0, 400)),
        ("test-learn", (0, 5, 1)),
    )
    for preset_name, preset_values in speed_presets:
        if speed == preset_name:
            hs, negative, epochs = preset_values
            break
    else:
        raise ValueError(
            "speed parameter needs to be one of: fast-learn, learn or deep-learn"
        )

    if workers is not None and not isinstance(workers, int):
        raise ValueError("workers needs to be an int")

    # validate documents
    if not all(isinstance(doc, (str, np.str_)) for doc in documents):
        raise ValueError("Documents need to be a list of strings")
    self.documents = np.array(documents) if keep_documents else None

    # validate document ids
    if document_ids is None:
        self.document_ids = None
        self.doc_id2index = None
        self.doc_id_type = np.int_
    else:
        if len(documents) != len(document_ids):
            raise ValueError(
                "Document ids need to match number of documents")
        if len(document_ids) != len(set(document_ids)):
            raise ValueError("Document ids need to be unique")

        if all(isinstance(doc_id, (str, np.str_)) for doc_id in document_ids):
            self.doc_id_type = np.str_
        elif all(isinstance(doc_id, (int, np.int_)) for doc_id in document_ids):
            self.doc_id_type = np.int_
        else:
            raise ValueError("Document ids need to be str or int")

        self.document_ids = np.array(document_ids)
        self.doc_id2index = {
            doc_id: position for position, doc_id in enumerate(document_ids)
        }

    # preprocess documents for training - tokenize and remove too long/short words
    train_corpus = []
    for position, doc in enumerate(documents):
        words = simple_preprocess(strip_tags(doc), deacc=True)
        train_corpus.append(TaggedDocument(words, [position]))

    # create documents and word embeddings with doc2vec; ``workers`` is only
    # passed on when the caller supplied it
    doc2vec_args = dict(documents=train_corpus,
                        vector_size=300,
                        min_count=50,
                        window=15,
                        sample=1e-5,
                        negative=negative,
                        hs=hs,
                        epochs=epochs,
                        dm=0,
                        dbow_words=1)
    if workers is not None:
        doc2vec_args["workers"] = workers
    self.model = Doc2Vec(**doc2vec_args)

    # create 5D embeddings of documents
    reducer = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine')
    umap_model = reducer.fit(self.model.docvecs.vectors_docs)

    # find dense areas of document vectors
    clusterer = hdbscan.HDBSCAN(min_cluster_size=15,
                                metric='euclidean',
                                cluster_selection_method='eom')
    cluster = clusterer.fit(umap_model.embedding_)

    # calculate topic vectors from dense areas of documents
    self._create_topic_vectors(cluster.labels_)

    # deduplicate topics
    self._deduplicate_topics()

    # calculate topic sizes and index nearest topic for each document
    self._calculate_topic_sizes()

    # find topic words and scores
    self._find_topic_words_scores()
def tokenize(text):
    """Return the tokens ``simple_preprocess`` produces for *text* as a new list."""
    tokens = []
    for token in simple_preprocess(text):
        tokens.append(token)
    return tokens
#nltk.download() conn = MongoClient('mongodb://localhost:27017') print(conn) db = conn.tcc twitter_clean = db.twitter_clean to_pandas = twitter_clean.find({}) # Get data from MongoDB df = pd.DataFrame(list(to_pandas)) # Convert data to Pandas DataFrame del df['_id'] # Delete column _id # Tokenize words in sentences and keep in a new column df['tokenized_text'] = [ simple_preprocess(line, deacc=True) for line in df['tweet_text'] ] # print(df['tokenized_text'].head(10)) # Stemm sentences for idx, sentence in enumerate(df['tokenized_text']): df['tokenized_text'][idx] = Stemming(sentence) # print(df['tokenized_text'].head(10)) def Stemming(sentence): # Function to Stemm words in sentences stemmer = RSLPStemmer() # to their root form phrase = [] for word in sentence: phrase.append(stemmer.stem(word.lower()))
# @Author: Yiheng # @Email: [email protected] # @Time: 7/5/2019 11:50 from gensim.models import TfidfModel from gensim.corpora import Dictionary from gensim.utils import simple_preprocess import numpy as np if __name__ == '__main__': docs = [ '一种 大头菜 自然风 脱水 设备 其 特征 在于 所述 的 大头菜 自然风 脱水 设备 主要 包括 大头菜', '风 脱水 架 和 大头菜 风 脱水 网袋 所述 的 大头菜 风 脱水 架 主要 包括 底座 支柱 横架 横向', '连接 承重杆 所述 的 底座 通过 中间 的 多边形 孔 与 支柱 的 下端 的 多边形 柱 配合 而 固定' ] tokenized_docs = [simple_preprocess(doc, min_len=2) for doc in docs] my_dct = Dictionary(tokenized_docs) print('dictionary is {}'.format(my_dct.token2id)) corpus = [my_dct.doc2bow(doc) for doc in tokenized_docs] for index, bow in enumerate(corpus): bow = [[my_dct[index], count] for index, count in bow] print('bow of doc {} is'.format(index)) print(bow) tf_idf_model = TfidfModel(corpus, id2word=my_dct, dictionary=my_dct, smartirs='ntc') for index, doc in enumerate(tf_idf_model[corpus]):
def __iter__(self):
    """Yield the token list of every line of ``lee_background.cor``."""
    with open(datapath('lee_background.cor')) as corpus_file:
        for raw_line in corpus_file:
            tokens = utils.simple_preprocess(raw_line)
            yield tokens
for tokens in stream_from_file(filename): yield self.dictionary.doc2bow(tokens) filename = '/home/ashwath/Programs/ACLAAn/acl_training_data.txt' # memory-hungry print("Starting") try: with open('processedtext.pickle', 'rb') as ipick: data_stemmed = pickle.load(ipick) except FileNotFoundError: with open(filename, 'r') as file: # list of lists # Remove punctuation at this stage data = [ simple_preprocess(line, deacc=True, min_len=2) for line in tqdm(file) ] # REMOVE THIS PICKLE LATER!!!!!! #with open('inputtext.pickle', 'wb') as dpick: # pickle.dump(data, dpick) # Create a Phrases model bigram = Phrases(data, min_count=10, threshold=100) # higher threshold fewer phrases. bigram_mod = Phraser(bigram) data_nostops = remove_stopwords(data) # Form Bigrams data_bigrams = make_bigrams(data_nostops) #data_lemmatized = lemmatization(data_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']) # Use a Snowball stemmer, lemmatization takes too much time and CPU
def remove_stopwords(texts):
    """Tokenise every document and drop the stop words of the ``nlp`` model.

    Progress is shown with ``tqdm`` while the documents are processed.

    :param texts: iterable of documents; each is converted with ``str``.
    :return: list of token lists, one per document.
    """
    print("Removing stop words.")
    cleaned = []
    for doc in tqdm(texts):
        cleaned.append([
            word for word in simple_preprocess(str(doc))
            if word not in nlp.Defaults.stop_words
        ])
    return cleaned
]].rename(columns={ 'Segment': 'questions', 'category': 'category' }) newDataSetVersion['category'] = newDataSetVersion['category'].str.replace( ' ', '_') train, test = sm.train_test_split(newDataSetVersion, test_size=0.4, shuffle=True, random_state=42) train.iloc[:, 0] = train.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x))) test.iloc[:, 0] = test.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x))) train.iloc[:, 1] = train.iloc[:, 1].apply(lambda x: '__label__' + x) test.iloc[:, 1] = test.iloc[:, 1].apply(lambda x: '__label__' + x) train[['category', 'questions']].to_csv('train.txt', index=False, sep=' ', header=None, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ") test[['category', 'questions']].to_csv('test.txt',
def __iter__(self):
    """Yield one ``TaggedDocument`` per row of the module-level ``df_combo``.

    The words come from the last column of the row and the single tag is the
    ``%s``-formatted value of its first column.
    """
    row_count = df_combo.shape[0]
    for row in range(row_count):
        words = simple_preprocess(df_combo.iloc[row, -1])
        tag = '%s' % df_combo.iloc[row, 0]
        yield TaggedDocument(words=words, tags=[tag])
def Tokenize(doc):
    """Tokenize a document for training and remove too long/short words.

    Tags are stripped with ``strip_tags`` first; the text is then tokenised
    with accents removed.
    """
    without_tags = strip_tags(doc)
    return simple_preprocess(without_tags, deacc=True)
def iter_documents(reuters_dir):
    """Iterate over Reuters documents, yielding one document at a time.

    :param reuters_dir: directory whose entries are each read as one document.
    :return: generator of token lists, one per directory entry, in
        ``os.listdir`` order.
    """
    for fname in os.listdir(reuters_dir):
        # ``with`` closes each file instead of leaking the handle.
        with open(os.path.join(reuters_dir, fname)) as handle:
            document = handle.read()
        yield utils.simple_preprocess(document)
def _process_file(self, file):
    """Infer a Doc2Vec vector for the text passed as *file*.

    :param file: the text to tokenise (it is handed straight to
        ``simple_preprocess``).
    :return: the result of ``self.doc2vec.infer_vector`` for its tokens.
    """
    return self.doc2vec.infer_vector(simple_preprocess(file))