Example #1
    def findSimilarities(self, texts):
        gsDir = os.getcwd()
        logger.debug(u"GSDir %s" % gsDir)

        gss = gsDir + os.sep + u"gensim_server" + os.sep
        logger.debug(u"%s" % gss)

        server = SessionServer(gss)

        corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)} for num, text in enumerate(texts)]

        # send 1k docs at a time
        # utils.upload_chunked(server, corpus, chunksize=1000)

        # server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        # server.index(corpus)

        # overall index size unchanged (just 3 docs overwritten)
        # server.index(corpus[:3])

        # Option One
        if True:
            for n in range(0, len(texts)):
                doc = u"doc_%d" % n
                self.output += u"Find similar doc_%d to %s%s" % (n, corpus[n][u"tokens"], os.linesep)
                logger.info(self.output[:-1])

                for sim in server.find_similar(doc):
                    m = int(sim[0][-1:])
                    if m != n:
                        self.output += u"\t%s \t %3.2f : %s%s" % (sim[0], float(sim[1]), corpus[m][u"tokens"], os.linesep)
                        logger.info(self.output[:-1])

                        d = [unicode(x) for x in corpus[n][u"tokens"]]
                        e = [unicode(y) for y in corpus[m][u"tokens"]]

                        s1 = set(e)
                        s2 = set(d)
                        common = s1 & s2
                        lc = [x for x in common]
                        self.output += u"\tCommon Topics : %s%s" % (lc, os.linesep)
                        logger.info(self.output[:-1])

        else:
            # Option two
            doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
            logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))

        return self.output
	def _clean_body(self):
		""" Preprocess the body of the post """

		tmp_text  = self.strip_code_blocks(self.body)
		tmp_text  = self.remove_special_characters(tmp_text)
		tokens    = utils.simple_preprocess(tmp_text)
		self.body = self.remove_short_words(tokens)
Example #3
def iter_rows(_host, _user, _passwd, _db, _table, _group, _value):
    """Iterate over all the table rows, yielding one row at a time."""
    try:
        mydb = MySQLdb.connect(host=_host,
                               user=_user,
                               passwd=_passwd,
                               db=_db)

        cursor = mydb.cursor()
        # Prepare SQL query
        sql = "SELECT tweet_text FROM %s" % (_table)
        if _group is not None:
            sql += " WHERE %s = '%s'" % (_group, _value)

        # Execute the SQL command
        cursor.execute(sql)
        # Fetch all the rows in a list of lists.
        results = cursor.fetchall()
        for row in results:
            document = row[0]
            # parse document into a list of utf8 tokens
            yield utils.simple_preprocess(document)

        cursor.close()
        mydb.close()

    except MySQLdb.Error as e:
        print(e)

    except:
        print("Unknown error occurred")
Example #4
def iter_rows(table_name):
    """Iterate over all the table rows, yielding one row at a time."""
    try:
        mydb = MySQLdb.connect(host='localhost',
                               user='******',
                               passwd='root',
                               db='text_mining')

        cursor = mydb.cursor()
        # Prepare SQL query
        sql = "SELECT tweet_text FROM %s" % (table_name)

        # Execute the SQL command
        cursor.execute(sql)
        # Fetch all the rows in a list of lists.
        results = cursor.fetchall()
        for row in results:
            document = row[0]
            # parse document into a list of utf8 tokens
            yield utils.simple_preprocess(document)

        cursor.close()
        mydb.close()

    except MySQLdb.Error as e:
        print(e)

    except:
        print("Unknown error occurred")
Example #5
def iter_documents(reuters_dir):
    """Iterate over Reuters documents, yielding one document at a time."""
    for fname in os.listdir(reuters_dir):
        # read each document as one big string
        document = open(os.path.join(reuters_dir, fname)).read()
        # parse document into a list of utf8 tokens
        yield utils.simple_preprocess(document)
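A generator like iter_documents can be streamed straight into gensim's Dictionary without holding the corpus in memory. A minimal usage sketch; the "reuters" directory name is illustrative, not taken from the original:

from gensim import corpora

# Build the vocabulary by streaming token lists from disk.
dictionary = corpora.Dictionary(iter_documents("reuters"))

# Bag-of-words vectors can then be produced lazily from a second pass.
bow_stream = (dictionary.doc2bow(tokens) for tokens in iter_documents("reuters"))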
Example #6
 def test_index(self):
         
         # a stupid first training set               
         texts = ["Human machine interface for lab abc computer applications",
         "A survey of user opinion of computer system response time",
         "The EPS user interface management system",
         "System and human system engineering testing of EPS",
         "Relation of user perceived response time to error measurement",
         "The generation of random binary unordered trees",
         "The intersection graph of paths in trees",
         "Graph minors IV Widths of trees and well quasi ordering",
         "Graph minors A survey"]
         corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
                    for num, text in enumerate(texts)]
         
         service_id = '1234'
         #from gensim.similarities.simserver import SessionServer
         # service = similarities.SessionServer('bla') # create a local server
         # service = SessionServer(self.rootlocation, autosession=True)
         import Pyro4
         service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
         
         
         service.train(corpus, method='lsi') ## TODO we don't have a corpus yet, but we definitely need one big
         
         ''' texts = ["Human machine interface for lab abc computer applications",
Example #7
    def preprocessText(self, text):
        preprocessedData = simple_preprocess(text)
        dataNoStop = [word.strip() for word in preprocessedData
                      if word not in self.stopWords]

        dataStem = [self.stemmer.stem(word) for word in dataNoStop]
        return dataStem
def main():
    json_data = open('./items.json')
    data = json.load(json_data)
    print 'starting'
    for i in range(0, len(data)-1):
        print i
        s = ""
        identifier = ""
        title = ""
        summary = ""
        totalText = ""
        try:
            s = data[i]['identifier']
            identifier = s[0][18:].replace("%3A", "")
            summary = data[i]['desc'][0].strip()
            title = data[i]['title'][0].strip()
            totalText += summary
            totalText += " "
            totalText += title
            totalText += " "
            totalText += identifier
        except:
            print "error"
        documentPayload = ({'identifier':identifier, 'title': title, 'summary' : summary})
        documents.append({'text' : totalText, 'payload' : documentPayload})
    corpus =[{'id': text['payload']['identifier'], 'tokens' : utils.simple_preprocess(text['text']), 'payload' : text['payload']} for num, text in enumerate(documents)]
    service = SessionServer('./thesite/simdatabase')
    service.train(corpus, method='lsi')
    service.index(corpus)
    service.commit()
Example #9
def prune(doc, stoplist = None, stem = True, english_dictionary_words = False):
    """This takes a single document and tokenizes the words, removes
    undesirable elements, and prepares it to be loaded into a dictionary.
    """
    # Tokenize the document and make it lowercase
    temp = utils.simple_preprocess(doc.lower())

    # Remove freestanding punctuation and punctuation in words
    temp = [w for w in temp if w not in string.punctuation]
    temp = [rmPunct(w) for w in temp]

    # Remove words in passed stoplist
    if stoplist:
        temp = [w for w in temp if w not in stoplist]

    # Remove specific tokens
    temp = [w for w in temp if w not in set(['[', ']', "'", '\n', 'com'])]

    # Remove stopwords
    temp = [w for w in temp if w not in stopwords.words('english')]

    # Stem the remaining words
    if stem:
        stemmer = SnowballStemmer('english')
        temp = [stemmer.stem(w) for w in temp]

    if english_dictionary_words:
        d = enchant.Dict("en_US")
        temp = [w for w in temp if d.check(w)]
    return temp
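prune depends on an rmPunct helper that is not shown above, plus the NLTK stopwords corpus, SnowballStemmer, and optionally pyenchant. A minimal stand-in for the helper and an example call; the helper below is an assumption about its behaviour, not the original code:

import string

def rmPunct(word):
    # Assumed behaviour: strip punctuation characters embedded inside a token.
    return word.translate(str.maketrans('', '', string.punctuation))

tokens = prune("Dogs were running, and the dogs barked!", stoplist={'com'})
# e.g. ['dog', 'run', 'dog', 'bark'] after stopword removal and stemming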
Example #10
 def commit_indexing_set(self):
     ''' After filling an indexing set, the actual indexing needs to be done.
     '''
     #indexing_data = c.fetchmany(500);
     
     #service = similarities.SessionServer(self.rootlocation, autosession=True)
     #service = SessionServer(self.rootlocation + 'gensimTraining'+str(self.training_id), autosession=True) # create a local server
     
     service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
     indexing_data = self.get_indexing_set()
     
     keys = self.redis_con.smembers(self.s_redis_new_item_list)
     self.redis_con.srem(self.s_redis_new_item_list, keys)
     indexing_data = self.redis_con.hmget(self.h_redis_itemid_document_text, keys)
                    
     corpus = []
     #for (id, text) in indexing_data.iteritems():
     #    corpus.append({ 'id' : str(id), "tokens" : utils.simple_preprocess(text) })
     for id in keys:
         corpus.append({ "id" : id, "tokens" : utils.simple_preprocess( self.redis_con.hget(self.h_redis_itemid_document_text, id) ) } )    
     
     # print corpus
     service.index(corpus) ## TODO we don't have a corpus yet, but we definitely need one big
             
     return 'indexing done'
Example #11
    def _clean_text(self, item_description):
        from gensim.utils import simple_preprocess
        from string import printable

        # Filter the string for non printable char and process it to an array
        # of words.
        s = simple_preprocess("".join((e for e in item_description if e in printable)))
        return [i for i in s if i not in self._stopwords]
 def _create_corpus(self, texts):
     corpus = []
     for id, text in texts:
         corpus.append({
             'id': id,
             'tokens': utils.simple_preprocess(text)
         })
     return corpus
	def read_corpus(self, fname, stop_words=False):
		with open(fname) as f:
			for i, line in enumerate(f):
				if stop_words:
					yield TaggedDocument(utils.simple_preprocess(line), [i])
				else:
					 # For training data, add tags
					 yield TaggedDocument(line.split(), [i])
Example #14
def topicos(frase: str):
    """Report the topics of an arbitrary sentence."""
    tokens = simple_preprocess(frase)
    inferred_vector = model.infer_vector(tokens)
    similars = model.docvecs.most_similar([inferred_vector], topn=10)

    return {
        'topicos': similars
    }
Example #15
def generate_ngrams(train_set, n=1):
    """
    Split each training sentence into tokens (simple_preprocess, max_len=20),
    build the token dictionary, and return the training matrix and the dictionary.
    """
    sentences_tokens = [utils.simple_preprocess(s[2], max_len=20) for s in train_set]
    token_dict = generate_dictionary(sentences_tokens)
    matrix_train = generate_matrix(sentences_tokens, token_dict, maxlen=len(token_dict), is_train=True)
    return matrix_train, token_dict
Example #16
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
 def iter_wiki(self):
     """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
     ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
     for title, text, pageid in _extract_pages(smart_open(self.dump_file)):
         text = filter_wiki(text)
         tokens = [token for token in simple_preprocess(text) if token not in STOPWORDS]
         if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
             continue  # ignore short articles and various meta-articles
         yield title, tokens
Example #18
def tokenize(text, rm_ascii=False):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
    here because the Greek text has already been aggressively cleaned up.
    https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
    """
    if rm_ascii:
        text = [char for char in text if char not in ascii_str]
        text = ''.join(text)
    tokens = [token for token in simple_preprocess(text, deacc=PREPROCESS_DEACCENT, min_len=TOK_MIN, max_len=TOK_MAX)]
    return [token for token in tokens if token not in STOPS_LIST]
Example #19
def tokenize(text):
    """
    Simple tokenizer. Also filters stopwords.
    INPUT:
    [1] text (str): some raw text
    OUTPUT:
    [1] vector_tokens (list): tokenized + filtered version of text
    """
    vector_tokens = [token for token in simple_preprocess(text) if token not in STOPWORDS]
    return vector_tokens
Example #20
 def __iter__(self): # Read only one line at a time from the text files, to be memory friendly
     for f in self.files:
         f.seek(0) # Reset the file pointer before a new iteration
         for line in f:
             post = json.loads(line)
             try: # parse and split the content up into a list of lower-case words
                 content = strip_tags(post["content"])
                 doc_words = utils.simple_preprocess(content)
             except: # Fails on some nasty unicode
                 doc_words = []
             yield doc_words
    def testMallet2ModelOn20NewsGroups(self):
        corpus = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
        dictionary = Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]

        lda_mallet_model = ldamallet.LdaMallet(
            self.mallet_path, corpus=corpus,
            num_topics=20, id2word=dictionary, iterations=500)

        lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)
        self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50))
Example #22
def add_document(corpus_name, document):
  service = get_service()
  if corpus_name not in corpuses:
    corpuses[corpus_name] = list()
  corpus = corpuses[corpus_name]
  doc_id = 'doc_%s' % len(corpus)
  doc = dict()
  tokens = utils.simple_preprocess(document)
  doc = {'id': doc_id, 'tokens': tokens , 'body':document}
  documents[doc_id] = doc
  corpus.append(doc)
  return doc_id
Example #23
 def generate_index(self):
     def page_text():
         for page_file in os.listdir('CrawlData'):
             content = open('CrawlData/'+page_file, 'r')
             page_content = content.read()
             content.close()
             page_url = re.sub(r'\s', '/', page_file)
             yield page_url, page_content
     corpus = [{'id': '%s' % url, 'tokens': utils.simple_preprocess(text)}
             for url, text in page_text()]
     self.service.train(corpus, method='lsi')
     self.service.index(corpus)
	def create_clean_document(self, document):

		#logging.info("\nCleaning document ...")
		clean_document = document
		clean_document.content = self.remove_special_characters(clean_document.content)

		# Convert document into tokens and clean it
		tokens = utils.simple_preprocess(clean_document.content)
		clean_content = self.remove_stop_words(tokens)
		clean_document.content = ' '.join(clean_content)

		return clean_document
Example #25
def tokenize(doc, *, stopwords=None):
    """Tokenizes a document (optionally removing stopwords)
    :param doc: A file-like object supporting .read()
    :param stopwords: The set of stopwords to remove
    :return: The list of tokens of the document
    """
    text = doc.read()

    tokens = utils.simple_preprocess(text, deacc=True)
    if stopwords is not None:
        tokens = [t for t in tokens if t not in stopwords]

    return tokens
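Because this tokenize expects a file-like object, an in-memory stream is enough for a quick check. A small usage sketch with an illustrative stopword set:

import io

doc = io.StringIO("The graph minors survey, and the EPS user interface.")
print(tokenize(doc, stopwords={"the", "and"}))
# -> ['graph', 'minors', 'survey', 'eps', 'user', 'interface']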
	def create_clean_corpus(self, raw_data):

		logging.info("\nCleaning data ...")
		for document in raw_data:
			# Convert document into tokens
			self.corpus.append( utils.simple_preprocess(document) )

		logging.info("\nRemoving short stop words ...")
		for (index, document) in enumerate(self.corpus):
			self.corpus[index] = self.remove_stop_words(document)

		logging.info("\nRetrieving vocabulary ...")
		self.vocabulary = corpora.Dictionary(self.corpus)
	def getvector(self, text, shelvedb):
		doc_vector = np.array([0.0] * 100)
		final_vector = np.array([0.0] * 100)
		tokens = utils.simple_preprocess(text)
		for token in tokens:
			try:
				existing = shelvedb[str(token)]
				doc_vector += existing
			except Exception:
				pass
		if len(tokens) != 0:
			final_vector = doc_vector / float(len(tokens))
		else:
			final_vector = doc_vector
		return final_vector
Example #28
def buildCorpus():
    
    corpus = []
    for d in os.listdir('data'):
        if not d == '0':
            continue
        cnt = os.listdir('data/'+d)
        i = 0
        for f in os.listdir('data/'+d):
            document = open('data/'+d+'/'+f).read()
            pmcid = f.split('.')[0]
            docin = {'id' : pmcid,
                     'tokens' : utils.simple_preprocess(document)
            }
            corpus.append(docin)
    return corpus
Example #29
def run_evaluation(classifiers, models, eval_samples):
    ln.info("Beginning evaluation")
    classifications = dict()
    for modelname, classifier in classifiers.items():
        model = models[modelname]
        model_classifications = defaultdict(int)
        for sample_no, (eval_sample_text, actual_label) in enumerate(eval_samples):
            bow = dictionary.doc2bow(simple_preprocess(eval_sample_text))
            model_features = sparse2full(model[bow], model.__out_size)
            predicted_label = classifier.predict(model_features)[0]

            model_classifications[(actual_label, predicted_label)] += 1
            if sample_no % 500 == 0:
                ln.debug("Classifier for %s evaluated %s samples so far." % (modelname, sample_no))
        classifications[modelname] = model_classifications
    ln.info("Finished evaluation")
    return classifications
Example #30
def findSimilar(texts, server, corpus):

    similarities = list()

    # Option One
    for n in range(0, len(texts)):
        doc_n = u"doc_%d" % n
        logger.info(u"%s" % doc_n)
        try:
            for sim in server.find_similar(doc_n):
                doc_m = sim[0]
                doc_similarity = float(sim[1])
                # Compares 'doc_m' to 'doc_n'
                if doc_m != doc_n and doc_similarity > similarity:
                    mi = int(doc_m.index(u"_") + 1)
                    nm = int(doc_m[mi:])

                    d = [unicode(x) for x in corpus[n][u"tokens"]]
                    e = [unicode(y) for y in corpus[nm][u"tokens"]]

                    s1 = set(e)
                    s2 = set(d)
                    common = s1 & s2
                    lc = [x for x in common]

                    if len(lc) == 0:
                        logger.error(u"Something is wrong here!")
                        raise Exception
                    else:
                        similar = list()
                        similar.append(doc_n)
                        similar.append(doc_m)
                        similar.append(float(sim[1]))
                        similar.append(lc)
                        similarities.append(similar)

                        logger.info(u"\t%s\t%s\t%3.2f\tCommon : %s" % (doc_n, doc_m, doc_similarity, lc))

        except Exception, msg:
            logger.error(u"%s", msg)

        if False:
            # Option two
            doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
            logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
            logger.error(u"%s - %d : %d" % (msg, nm, n))
 def __iter__(self):
     for fname in os.listdir(self.dirname):
         for line in open(os.path.join(self.dirname, fname), encoding='latin'):
             yield simple_preprocess(line)
Example #32
def make_texts_corpus(sentences):
    for sentence in sentences:
        yield simple_preprocess(sentence, deacc=True)
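The generator pairs naturally with gensim's Dictionary and doc2bow for streaming bag-of-words construction. A minimal sketch with made-up sentences:

from gensim.corpora import Dictionary

sentences = ["Human machine interface", "Graph minors: a survey"]
tokenized = list(make_texts_corpus(sentences))

dictionary = Dictionary(tokenized)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]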
 def __iter__(self):
     with open(datapath('lee_background.cor')) as f:
         for i, line in enumerate(f):
             yield doc2vec.TaggedDocument(utils.simple_preprocess(line), [self._tag(i)])
Example #34
model_name = "models/teached_d2w_v{}".format(version)

policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)
print('Compute dtype: %s' % policy.compute_dtype)
print('Variable dtype: %s' % policy.variable_dtype)
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

model = load_model('models/model_{}/teached_model_part_{}'.format(
    model_num, part))
d2v_model = Doc2Vec.load(model_name)

while True:
    #Text input
    predict_text = input("Enter text for recognition: ")

    #Text preprocess
    tokenized_text = simple_preprocess(predict_text)

    #Vector presentation
    vector = d2v_model.infer_vector(tokenized_text).tolist()
    test_text = np.asarray([vector])
    test_text = np.expand_dims(test_text, -1)

    #Text class prediction
    y_pred = model.predict(test_text)
    y_pred = [np.argmax(_) for _ in y_pred]
    if y_pred[0] == 0:
        print("The text's political leaning is Conservatism")
    else:
        print("The text's political leaning is Liberalism")
Example #35
 def __iter__(self):
     for i in range(dfR.shape[0]):
         yield TaggedDocument(words=simple_preprocess(dfR.iloc[i, -1]), tags=[str(dfR.iloc[i, 0])])
Example #36
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = [
    'does', 'don', 'did', 'think', 'help', 'need', 'just', 'know', 'hi',
    'want', 'really', 'thanks', 'way', 'good', 'say', 'like', 'use', 'www',
    'com', 'http', 'nhttp'
]

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words])
# Stem the stop words so they also match stemmed tokens in the documents

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)

    if stemmed_doc == []:  # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
data = processed_data
target = np.delete(target, id_to_delete, axis=0)

window = 10  #50
model_path = "models/yahoo_w2v_window" + str(window) + ".model"
from util.load_data import normalize_text
from gensim.utils import simple_preprocess
from time import time

parser = argparse.ArgumentParser("classification.py")
text = parser.add_argument_group("The following arguments are mandatory for text option")
text.add_argument("--text", metavar="TEXT", help="text to predict", nargs="?")
args = parser.parse_args()

path = join(dirname(__file__), "models")

transformer = pickle.load(open(join(path, "transformer.pkl"), 'rb'))
estimator = pickle.load(open(join(path, "classifier.pkl"), 'rb'))

if not args.text:
    parser.print_help()


if args.text:
    t0 = time()
    text = args.text
    text = normalize_text(text)
    print(text)
    X = transformer.infer_vector(simple_preprocess(text), steps=20)
    y = estimator.predict(X.reshape(1, -1))[0]
    classify_time = time() - t0
    print(y)
    print("process time: %0.3fs" % classify_time)
    
    
def remove_stopwords(texts):  #remove stopwords to do more effective extraction
    return [[
        word for word in simple_preprocess(str(doc)) if word not in stop_words
    ] for doc in texts]
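This variant assumes a module-level stop_words collection that is not shown in the snippet. One common way to supply it (an assumption, not taken from the original) is the NLTK English list:

from nltk.corpus import stopwords

# Assumed definition of the stop_words name referenced above.
stop_words = set(stopwords.words('english'))

print(remove_stopwords(["The quick brown fox jumps over the lazy dog"]))
# -> [['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']]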
Example #39
import gensim
import pickle
from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

print('[INFO] Necessary Libraries Imported')

# Fetch the training data
training_data = fetch_20newsgroups(subset='train')
print('[INFO] Reading training data')
wordlist = []
for news in training_data.data:
    wordlist.append(simple_preprocess(news))

print('[INFO] Creating Vocabulary')
model = Word2Vec(wordlist, size=10, window=4, workers=10)
print('[INFO] Training model')
model.train(wordlist, total_examples=len(wordlist), epochs=10)
word2vec_file = open('word2vec_model', 'wb')
pickle.dump(model, word2vec_file)
print('[INFO] Model Serialized and written to file')
import gensim.utils
import pandas as pd
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.test.utils import get_tmpfile
from gensim.similarities import Similarity

if len(sys.argv)!=2:
	print('You must enter a similarity threshold as an argument during the execution command i.e. python duplicates.py 0.7')
	exit()
similarity = float(sys.argv[1])

df = pd.read_csv('./train_set.csv', sep='\t')

dictionary = Dictionary([simple_preprocess(article) for article in df.Content])
corpus = [dictionary.doc2bow(simple_preprocess(article)) for article in df.Content]

tfidf = TfidfModel(corpus)

index_temp = get_tmpfile("index")
index = Similarity(index_temp, tfidf[corpus], num_features=len(dictionary))

duplicate_count = 0
duplicates = []

for i, s in enumerate(index):
    for j, similarity_value in enumerate(s):
        if similarity_value >= similarity and i < j:
            duplicate_count += 1
            duplicates.append([i, j, similarity_value])
Example #41
def sent_to_words(sentences):
    for sentence in sentences:
        yield simple_preprocess(str(sentence), deacc=True)
Example #42
def remove_stopwords(texts, passed_stop_words):
    stop_words = stopwords.words('english')
    stop_words.extend(passed_stop_words)
    return [[
        word for word in simple_preprocess(str(doc)) if word not in stop_words
    ] for doc in texts]
Example #43
def preprocess(text):
    result=[]
    for token in simple_preprocess(text) :
        if token not in STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result
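preprocess relies on a lemmatize_stemming helper and a STOPWORDS set defined elsewhere in that project. A commonly seen stand-in for both (an assumption, not the original code) combines gensim's stop-word list with WordNet lemmatization followed by Snowball stemming:

from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import SnowballStemmer, WordNetLemmatizer

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(token):
    # Assumed behaviour: lemmatize as a verb, then stem the result.
    return stemmer.stem(lemmatizer.lemmatize(token, pos='v'))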
Example #44
train_arrays = numpy.zeros((32398, 300))
train_labels = numpy.zeros(32398)

for i in range(16199):
	prefix_train_tox = 'TRAIN_TOXIC_' + str(i)
	prefix_train_non = 'TRAIN_NONTOXIC_' + str(i)
	
	train_arrays[i] = model[prefix_train_tox]
	train_arrays[16199 + i] = model[prefix_train_non]

	train_labels[i] = 1
	train_labels[16199 + i] = 0

classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

if ('y' == input('Test classifier? (y/n): ')):
	tmp = input('Enter a phrase (0 to quit): ')
	while(tmp != '0'):
		vec = model.infer_vector(utils.simple_preprocess(doc = tmp, deacc = True))
		prob = classifier.predict_proba(vec.reshape(1, -1))
		
		if (prob[0][0] > prob[0][1]):
			print('Non-toxic')
			print('Confidence: ', prob[0][0])
		else:
			print('Toxic')
			print('Confidence: ', prob[0][1])
		
		tmp = input('Enter a phrase (0 to quit): ')
Example #45
from keras.layers import Flatten, merge
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input

from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, TimeDistributed
from keras.models import Model
#================================
#============word2vec============
#================================

# tokenizer: can change this as needed
tokenize = lambda x: simple_preprocess(x)


def create_embeddings(data_dir,
                      embeddings_path='embeddings.npz',
                      vocab_path='map.json',
                      **params):
    """
    Generate embeddings from a batch of text
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """
    class SentenceGenerator(object):
        def __init__(self, dirname):
            self.dirname = dirname
fileName = 'MonAmiGabiTraining'
training = extract_reviews('../data/' + fileName + '.pkl')

documents = training
more_stopwords = [
    'great', 'good', 'like', 'le', 'la', 'time', 'think', 'wasnt', 'est', 've',
    'et', 'les', 'restaurant', 'nice', 'service', 'yelp', 'www', 'http', 'com',
    'select'
]
more_stopwords.extend(STOPWORDS)
#print("Texts before STOPWORDS: ",documents)

texts = []

texts = [[
    word for word in simple_preprocess(document) if word not in more_stopwords
] for document in documents]
#texts = [[word for word in documents.lower().split() if word not in STOPWORDS]]

print("Texts after STOPWORDS: ", texts)
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
#pprint(texts)
Example #47
def remove_stopwords(texts):
    return [[
        word for word in simple_preprocess(str(doc)) if word not in stop_words
    ] for doc in texts]
Example #48
#%% PCA version
for y in range(2007, 2017):
    s = tic()
    y1, y2 = y + 2, y + 3
    tr = onxy.set_index('date').loc[str(y):str(y1)].reset_index()
    te = onxy.set_index('date').loc[str(y2)].reset_index()

    tr['smry'] = [txtsum(t) for t in tr.text]
    print(toc(s))  # takes about 2 minutes
    # text summary
    te['smry'] = [txtsum(t) for t in te.text]
    tr = tr.loc[tr.smry.notna()]
    te = te.loc[te.smry.notna()]
    # doc2vec
    tokf = lambda t: [
        word for word in simple_preprocess(t, deacc=True)
        if word not in stop_words
    ]
    tagged = [
        TaggedDocument(words=tokf(t), tags=[i]) for i, t in enumerate(tr.smry)
    ]
    dvmod = Doc2Vec(vector_size=100, epochs=10, workers=4)
    dvmod.build_vocab(tagged)
    dvmod.train(tagged, total_examples=dvmod.corpus_count, epochs=dvmod.epochs)
    trdv = pd.DataFrame([dvmod.infer_vector(tokf(t)) for t in tr.smry])
    tedv = pd.DataFrame([dvmod.infer_vector(tokf(t)) for t in te.smry])
    # doc2vec pca
    pca = PCA(n_components=20)
    pca.fit(trdv)
    trpca = pd.DataFrame(pca.transform(trdv)[:, :11],
                         columns=['pca' + str(i + 1) for i in range(11)])
Example #49
    def __init__(self,
                 documents,
                 speed="fast-learn",
                 document_ids=None,
                 keep_documents=True,
                 workers=None):

        # validate training inputs
        if speed == "fast-learn":
            hs = 0
            negative = 5
            epochs = 40
        elif speed == "learn":
            hs = 1
            negative = 0
            epochs = 40
        elif speed == "deep-learn":
            hs = 1
            negative = 0
            epochs = 400
        elif speed == "test-learn":
            hs = 0
            negative = 5
            epochs = 1
        else:
            raise ValueError(
                "speed parameter needs to be one of: fast-learn, learn or deep-learn"
            )

        if workers is None:
            pass
        elif isinstance(workers, int):
            pass
        else:
            raise ValueError("workers needs to be an int")

        # validate documents
        if not all((isinstance(doc, str) or isinstance(doc, np.str_))
                   for doc in documents):
            raise ValueError("Documents need to be a list of strings")
        if keep_documents:
            self.documents = np.array(documents)
        else:
            self.documents = None

        # validate document ids
        if document_ids is not None:

            if len(documents) != len(document_ids):
                raise ValueError(
                    "Document ids need to match number of documents")
            elif len(document_ids) != len(set(document_ids)):
                raise ValueError("Document ids need to be unique")

            if all((isinstance(doc_id, str) or isinstance(doc_id, np.str_))
                   for doc_id in document_ids):
                self.doc_id_type = np.str_
            elif all((isinstance(doc_id, int) or isinstance(doc_id, np.int_))
                     for doc_id in document_ids):
                self.doc_id_type = np.int_
            else:
                raise ValueError("Document ids need to be str or int")

            self.document_ids = np.array(document_ids)
            self.doc_id2index = dict(
                zip(document_ids, list(range(0, len(document_ids)))))
        else:
            self.document_ids = None
            self.doc_id2index = None
            self.doc_id_type = np.int_

        # preprocess documents for training - tokenize and remove too long/short words
        train_corpus = [
            TaggedDocument(simple_preprocess(strip_tags(doc), deacc=True), [i])
            for i, doc in enumerate(documents)
        ]

        # create documents and word embeddings with doc2vec
        if workers is None:
            self.model = Doc2Vec(documents=train_corpus,
                                 vector_size=300,
                                 min_count=50,
                                 window=15,
                                 sample=1e-5,
                                 negative=negative,
                                 hs=hs,
                                 epochs=epochs,
                                 dm=0,
                                 dbow_words=1)
        else:
            self.model = Doc2Vec(documents=train_corpus,
                                 vector_size=300,
                                 min_count=50,
                                 window=15,
                                 sample=1e-5,
                                 negative=negative,
                                 hs=hs,
                                 workers=workers,
                                 epochs=epochs,
                                 dm=0,
                                 dbow_words=1)

        # create 5D embeddings of documents
        umap_model = umap.UMAP(n_neighbors=15, n_components=5,
                               metric='cosine').fit(
                                   self.model.docvecs.vectors_docs)

        # find dense areas of document vectors
        cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                                  metric='euclidean',
                                  cluster_selection_method='eom').fit(
                                      umap_model.embedding_)

        # calculate topic vectors from dense areas of documents
        self._create_topic_vectors(cluster.labels_)

        # deduplicate topics
        self._deduplicate_topics()

        # calculate topic sizes and index nearest topic for each document
        self._calculate_topic_sizes()

        # find topic words and scores
        self._find_topic_words_scores()
def tokenize(text):
    return [token for token in simple_preprocess(text)]
#nltk.download()

conn = MongoClient('mongodb://localhost:27017')
print(conn)
db = conn.tcc
twitter_clean = db.twitter_clean

to_pandas = twitter_clean.find({})  # Get data from MongoDB

df = pd.DataFrame(list(to_pandas))  # Convert data to Pandas DataFrame

del df['_id']  # Delete column _id

# Tokenize words in sentences and keep in a new column
df['tokenized_text'] = [
    simple_preprocess(line, deacc=True) for line in df['tweet_text']
]
# print(df['tokenized_text'].head(10))

# Stemm sentences
for idx, sentence in enumerate(df['tokenized_text']):
    df['tokenized_text'][idx] = Stemming(sentence)

# print(df['tokenized_text'].head(10))


def Stemming(sentence):  # Function to Stemm words in sentences
    stemmer = RSLPStemmer()  # to their root form
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
Example #52
# @Author: Yiheng
# @Email: [email protected]
# @Time: 7/5/2019 11:50
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
import numpy as np

if __name__ == '__main__':
    docs = [
        '一种 大头菜 自然风 脱水 设备 其 特征 在于 所述 的 大头菜 自然风 脱水 设备 主要 包括 大头菜',
        '风 脱水 架 和 大头菜 风 脱水 网袋 所述 的 大头菜 风 脱水 架 主要 包括 底座 支柱 横架 横向',
        '连接 承重杆 所述 的 底座 通过 中间 的 多边形 孔 与 支柱 的 下端 的 多边形 柱 配合 而 固定'
    ]

    tokenized_docs = [simple_preprocess(doc, min_len=2) for doc in docs]

    my_dct = Dictionary(tokenized_docs)
    print('dictionary is {}'.format(my_dct.token2id))

    corpus = [my_dct.doc2bow(doc) for doc in tokenized_docs]
    for index, bow in enumerate(corpus):
        bow = [[my_dct[index], count] for index, count in bow]
        print('bow of doc {} is'.format(index))
        print(bow)

    tf_idf_model = TfidfModel(corpus,
                              id2word=my_dct,
                              dictionary=my_dct,
                              smartirs='ntc')
    for index, doc in enumerate(tf_idf_model[corpus]):
Example #53
 def __iter__(self):
     with open(datapath('lee_background.cor')) as f:
         for line in f:
             yield utils.simple_preprocess(line)
        for tokens in stream_from_file(filename):
            yield self.dictionary.doc2bow(tokens)


filename = '/home/ashwath/Programs/ACLAAn/acl_training_data.txt'
# memory-hungry
print("Starting")
try:
    with open('processedtext.pickle', 'rb') as ipick:
        data_stemmed = pickle.load(ipick)
except FileNotFoundError:
    with open(filename, 'r') as file:
        # list of lists
        # Remove punctuation at this stage
        data = [
            simple_preprocess(line, deacc=True, min_len=2)
            for line in tqdm(file)
        ]
        # REMOVE THIS PICKLE LATER!!!!!!
        #with open('inputtext.pickle', 'wb') as dpick:
        #    pickle.dump(data, dpick)
        # Create a Phrases model
        bigram = Phrases(data, min_count=10,
                         threshold=100)  # higher threshold fewer phrases.
        bigram_mod = Phraser(bigram)

        data_nostops = remove_stopwords(data)
        # Form Bigrams
        data_bigrams = make_bigrams(data_nostops)
        #data_lemmatized = lemmatization(data_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'])
        # Use a Snowball stemmer, lemmatization takes too much time and CPU
def remove_stopwords(texts):
    print("Removing stop words.")
    return [[
        word for word in simple_preprocess(str(doc))
        if word not in nlp.Defaults.stop_words
    ] for doc in tqdm(texts)]
Example #56
                                       ]].rename(columns={
                                           'Segment': 'questions',
                                           'category': 'category'
                                       })

newDataSetVersion['category'] = newDataSetVersion['category'].str.replace(
    ' ', '_')

train, test = sm.train_test_split(newDataSetVersion,
                                  test_size=0.4,
                                  shuffle=True,
                                  random_state=42)

train.iloc[:,
           0] = train.iloc[:,
                           0].apply(lambda x: ' '.join(simple_preprocess(x)))
test.iloc[:, 0] = test.iloc[:,
                            0].apply(lambda x: ' '.join(simple_preprocess(x)))

train.iloc[:, 1] = train.iloc[:, 1].apply(lambda x: '__label__' + x)
test.iloc[:, 1] = test.iloc[:, 1].apply(lambda x: '__label__' + x)

train[['category', 'questions']].to_csv('train.txt',
                                        index=False,
                                        sep=' ',
                                        header=None,
                                        quoting=csv.QUOTE_NONE,
                                        quotechar="",
                                        escapechar=" ")

test[['category', 'questions']].to_csv('test.txt',
Example #57
 def __iter__(self):
     for i in range(df_combo.shape[0]):
         yield TaggedDocument(words=simple_preprocess(df_combo.iloc[i,-1]), tags=['%s' % df_combo.iloc[i,0]])
Example #58
 def Tokenize(doc):
     """Tokenize documents for training and remove too long/short words"""
     return simple_preprocess(strip_tags(doc), deacc=True)
Example #59
def iter_documents(reuters_dir):
    """Iterate over Reuters documents, yielding one document at a time."""
    for fname in os.listdir(reuters_dir):
        document = open(os.path.join(reuters_dir, fname)).read()
        yield utils.simple_preprocess(document)
Example #60
 def _process_file(self, file):
     tokens = simple_preprocess(file)
     return self.doc2vec.infer_vector(tokens)
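The same tokenize-then-infer pattern works outside a class as well. A closing sketch, assuming a previously trained Doc2Vec model saved at an illustrative path:

from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess

# Path is illustrative; point it at your own trained model.
d2v = Doc2Vec.load("models/my_doc2vec.model")
vector = d2v.infer_vector(simple_preprocess("human computer interaction survey"))
print(vector.shape)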