def read_config():
    # Load the configured bots and register each one with the API client.
    bots = getTokens()
    if not bots:
        exit(-1)
    for b, t in bots.items():
        bot = Bot(b.lower(), t)
        apiai.bots.addBot(bot)
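# read_config() expects getTokens() to return a mapping of bot names to API tokens.
# A minimal sketch of such a helper, assuming the tokens live in a JSON file whose
# name and layout are illustrative assumptions, not part of the original code.
import json

def getTokens(path='bots.json'):
    # Hypothetical config reader: returns a dict of {bot_name: api_token},
    # or an empty dict if the file is missing or unreadable.
    try:
        with open(path) as f:
            return json.load(f)
    except (IOError, ValueError):
        return {}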
def getTokensTFDF(texts):
    # For every text: collect its sentences and its per-token term frequencies (TF),
    # and accumulate the distinct tokens seen so document frequencies (DF) can be counted.
    tokensTF = []
    allTokens = []
    allSents = []
    for t in texts:
        sents = utils.getSentences(t)
        toks = utils.getTokens(sents)
        toksFreqs = utils.getFreq(toks)
        allTokens.extend(toksFreqs.keys())
        allSents.append(sents)
        sortedToksFreqs = utils.getSorted(toksFreqs.items(), 1)
        tokensTF.append(sortedToksFreqs)

    # DF: in how many texts each token appears (each text contributes a token once).
    tokensDF = utils.getFreq(allTokens).items()

    # Sum each token's term frequency across all texts...
    tokensTFDF = {}
    for t in tokensTF:
        for tok in t:
            if tok[0] in tokensTFDF:
                tokensTFDF[tok[0]] += tok[1]
            else:
                tokensTFDF[tok[0]] = tok[1]

    # ...then pair the summed TF with the DF: token -> (total TF, DF).
    for t in tokensDF:
        tokensTFDF[t[0]] = (tokensTFDF[t[0]], t[1])
    return tokensTFDF, allSents
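# getTokensTFDF() leans on several utils helpers that are not shown in this excerpt.
# A minimal sketch of what they are assumed to do (NLTK-based tokenization, frequency
# counting, sorting by a tuple field); the real utils module may differ.
import nltk
from collections import Counter

def getSentences(text):
    # Split raw text into sentences.
    return nltk.sent_tokenize(text)

def getTokens(textOrSents):
    # Flatten a string or a list of strings into lowercase word tokens.
    if isinstance(textOrSents, str):
        textOrSents = [textOrSents]
    return [tok.lower() for chunk in textOrSents for tok in nltk.word_tokenize(chunk)]

def getFreq(tokens):
    # Map each token to its frequency.
    return Counter(tokens)

def getSorted(pairs, idx):
    # Sort (token, value) pairs by the value at position idx, descending.
    return sorted(pairs, key=lambda p: p[idx], reverse=True)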
def getEntities(self, text):
    """Fetch candidate entities and their classes from a dictionary stored in the database."""
    result = []
    prev_pos = 0
    for token in getTokens(text):
        # Look the token up (upper-cased) in the entity dictionary collection.
        candidate = self.getEntityCollection().find_one({'entity': token.upper()})
        pos = text.find(token, prev_pos)
        prev_pos = pos
        length = len(token)
        if candidate and pos > -1 and length > 0:
            result.append({'entity': token, 'class': candidate['class'], 'pos': pos, 'len': length})
        elif pos > -1 and length > 0:
            # No dictionary entry: keep the token with the placeholder class 'Ninguna'.
            result.append({'entity': token, 'class': 'Ninguna', 'pos': pos, 'len': length})
    return result
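# A small usage sketch of getEntities(), assuming the surrounding class exposes
# getEntityCollection() over a pymongo-style collection; the in-memory stand-in
# below and the sample output are illustrative assumptions, not the original setup.
class InMemoryEntityCollection(object):
    def __init__(self, rows):
        # rows: e.g. {'EBOLA': {'entity': 'EBOLA', 'class': 'Disease'}}
        self.rows = rows

    def find_one(self, query):
        # Mimics pymongo's find_one({'entity': ...}) lookup.
        return self.rows.get(query['entity'])

# Each returned item carries the surface form, its candidate class (or 'Ninguna'
# when the dictionary has no entry), its character offset, and its length, e.g.:
# [{'entity': 'Ebola', 'class': 'Disease', 'pos': 0, 'len': 5},
#  {'entity': 'spreads', 'class': 'Ninguna', 'pos': 6, 'len': 7}]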
print len(texts)
toksTFDF, allSents = getTokensTFDF(texts)

# Rank tokens by TF*DF and keep the top 10 as topic-indicative tokens.
sortedToksTFDF = sorted(toksTFDF.items(), key=lambda x: x[1][0]*x[1][1], reverse=True)
writeToFileSystem(sortedToksTFDF, '../output/toksTFDF_NY.txt', "TFDF")
topToksTuples = sortedToksTFDF[:10]
topToks = [k for k, _ in topToksTuples]

# Keep the sentences that share more than one token with the indicative set,
# and build an event-model instance (topic tokens + named entities) for each.
allImptSents = []
eventModelInstances = []
for sents in allSents:
    impSents = []
    for sent in sents:
        sentToks = utils.getTokens(sent)
        intersect = utils.getIntersection(topToks, sentToks)
        if len(intersect) > 1:
            impSents.append(sent)
            evtModelInstance = {}
            sentEnts = utils.getEntities(sent)[0]
            evtModelInstance["Topic"] = list(intersect)
            for ent in sentEnts:
                evtModelInstance[ent] = sentEnts[ent]
            eventModelInstances.append(evtModelInstance)
    allImptSents.append(impSents)

for impSents in allImptSents:
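# Small worked example of the TF*DF ranking used above: each token maps to
# (total term frequency, document frequency), and tokens are sorted by their product,
# so terms that are both frequent and spread across documents float to the top.
# The sample values below are made up for illustration.
sample = {'ebola': (42, 2), 'nurse': (10, 2), 'maine': (7, 1)}
ranked = sorted(sample.items(), key=lambda x: x[1][0] * x[1][1], reverse=True)
# ranked == [('ebola', (42, 2)), ('nurse', (10, 2)), ('maine', (7, 1))]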
# Get data from fields
urls = form.getvalue('urls')
if not urls:
    urls = 'http://www.nbcnews.com/storyline/ebola-virus-outbreak/why-its-not-enough-just-eradicate-ebola-n243891\nhttp://www.npr.org/blogs/thetwo-way/2014/11/09/362770821/maine-nurse-to-move-out-of-state-following-ebola-quarantine-row'
topK = 10
intersectionTh = 2

webpagesURLs = urls.split('\n')
webpagesText = utils.getWebpageText(webpagesURLs)
texts = [t['text'] for t in webpagesText if t.has_key('text') and len(t['text']) > 0]

#Get Frequent Tokens
tokens = utils.getTokens(texts)
f = utils.getFreq(tokens)
tokensFreqs = f.items()
sortedTokensFreqs = utils.getSorted(tokensFreqs, 1)

#Get Indicative tokens
toksTFDF, allSents = getTokensTFDF(texts)
#sortedToksTFDF = sorted(filteredToksTFDF, key=lambda x: x[1][0]*x[1][1], reverse=True)
sortedToksTFDF = sorted(toksTFDF.items(), key=lambda x: x[1][0] * x[1][1], reverse=True)
'''
filteredToksTFDF = []
toks = " ".join([])
#print toks
form = cgi.FieldStorage()

# Get data from fields
urls = form.getvalue('urls')
if not urls:
    urls = 'http://www.nbcnews.com/storyline/ebola-virus-outbreak/why-its-not-enough-just-eradicate-ebola-n243891\nhttp://www.npr.org/blogs/thetwo-way/2014/11/09/362770821/maine-nurse-to-move-out-of-state-following-ebola-quarantine-row'
topK = 10
intersectionTh = 2

webpagesURLs = urls.split('\n')
webpagesText = utils.getWebpageText(webpagesURLs)
texts = [t['text'] for t in webpagesText if t.has_key('text') and len(t['text']) > 0]

#Get Frequent Tokens
tokens = utils.getTokens(texts)
f = utils.getFreq(tokens)
tokensFreqs = f.items()
sortedTokensFreqs = utils.getSorted(tokensFreqs, 1)

#Get Indicative tokens
toksTFDF, allSents = getTokensTFDF(texts)
#sortedToksTFDF = sorted(filteredToksTFDF, key=lambda x: x[1][0]*x[1][1], reverse=True)
sortedToksTFDF = sorted(toksTFDF.items(), key=lambda x: x[1][0]*x[1][1], reverse=True)
'''
filteredToksTFDF = []
toks = " ".join([])
#print toks
tokEntsDict = utils.getEntities(toks)[0]
tokEntsList = []
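# utils.getWebpageText() is not shown in these excerpts; a minimal sketch of what it
# is assumed to return (one dict per URL, with a 'text' key only when the fetch
# succeeds, matching the has_key() filter above), built on requests and BeautifulSoup.
# The real helper may differ.
import requests
from bs4 import BeautifulSoup

def getWebpageText(urls):
    # Fetch each URL and collect the paragraph text of the page.
    pages = []
    for url in urls:
        page = {'url': url}
        try:
            resp = requests.get(url, timeout=10)
            soup = BeautifulSoup(resp.text, 'html.parser')
            page['text'] = ' '.join(p.get_text() for p in soup.find_all('p'))
        except requests.RequestException:
            # Failed fetches simply omit the 'text' key.
            pass
        pages.append(page)
    return pages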
if __name__ == "__main__":
    # 1. Load your training data
    # 2. Train your network
    #    Make sure to print your training loss and accuracy within training to show progress
    #    Make sure you print the final training accuracy
    # 3. Save your model
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    # Reading the training data.
    train_raw, labels = getInput('train')

    # Tokenizing the training data.
    tokens = getTokens(train_raw)
    # tokens = removeStopWords(tokens)

    opt_dim = 50

    # Finding the 80th percentile sentence length.
    # percentile = int(np.percentile([len(seq) for seq in tokens], 80))
    # print('80th Percentile Sentence Length:', percentile)
    percentile = 295  # 80th percentile sentence length, found using the two lines above.

    # Truncate the data at the 80th percentile sentence length.
    truncatedData = [' '.join(seq[:percentile]) for seq in tokens]

    # Vectorize the data.
    final_data, tok = prepareData(truncatedData, percentile)
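# prepareData() is referenced above but not defined in this excerpt. A plausible
# sketch, assuming it fits a Keras Tokenizer on the truncated texts, converts them to
# integer sequences, and pads them to maxLen (which would also explain the pickled
# tokenizer loaded in the test script below); the real implementation may differ.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def prepareData(texts, maxLen):
    # Fit a word-index tokenizer, vectorize the texts, and pad/truncate every
    # sequence to maxLen so the network sees fixed-size inputs.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=maxLen, padding='post')
    return padded, tokenizer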
# Thoroughly comment your code to make it easy to follow
if __name__ == "__main__":
    # 1. Load your saved model
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    maxSentLen = 295  # 80th percentile sentence length in the training dataset.
    model = load_model("models/20829490_NLP_model.model")

    # 2. Load your testing data
    test_raw, labels = getInput('test')

    # Tokenizing, removing stop words and lemmatizing.
    tokens1 = getTokens(test_raw)

    # Truncating longer sentences to the 80th percentile sentence length.
    truncatedData = [' '.join(seq[:maxSentLen]) for seq in tokens1]

    # Processing test data with the tokenizer fitted on the training set.
    tokenizer = pickle.load(open("data/token.p", "rb"))
    final_data = tokenizer.texts_to_sequences(truncatedData)

    # Padding the data.
    final_data = pad_sequences(final_data, maxlen=maxSentLen, padding='post')

    # 3. Run prediction on the test data and print the test accuracy
    evalu = model.evaluate(final_data, labels)
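# The excerpt above stops at model.evaluate(). One way to surface the metrics and
# per-example predictions, continuing the script and assuming the model was compiled
# with metrics=['accuracy'] and has a softmax multi-class output; for a sigmoid
# binary output, threshold the probabilities instead.
import numpy as np

# model.evaluate() returns [loss, accuracy] under the assumption above.
print("Test loss: {:.4f}, test accuracy: {:.4f}".format(evalu[0], evalu[1]))

# Per-example class predictions.
probs = model.predict(final_data)
predicted_classes = np.argmax(probs, axis=1)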
from gensim import corpora, models
import utils

stoplist = utils.stopwordsList
documents = []

#texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
texts = []
for doc in documents:
    docToks = utils.getTokens(doc)
    texts.append(docToks)

#Build the dictionary and the corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

#Define the LDA model and the number of topics.
notopics = 3
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=notopics)
#print lda.show_topics(notopics)

#Printing the topics with their probabilities
print "\n\n", notopics, "Topics with their corresponding probabilities\n"
for i in range(0, lda.num_topics):
    print "Topic", i+1, ":", lda.print_topic(i)
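# The documents list above is left empty in this excerpt. A small illustrative sketch
# (the three sample documents are made up) of how a trained model can also report the
# topic mixture of an individual document, not just the global topics printed above.
sample_docs = [
    "ebola outbreak response in west africa",
    "nurse quarantine policy debate",
    "vaccine trial results announced",
]
sample_texts = [doc.lower().split() for doc in sample_docs]
sample_dictionary = corpora.Dictionary(sample_texts)
sample_corpus = [sample_dictionary.doc2bow(text) for text in sample_texts]
sample_lda = models.ldamodel.LdaModel(corpus=sample_corpus, id2word=sample_dictionary, num_topics=2)
# List of (topic_id, probability) pairs for the first document.
print(sample_lda[sample_corpus[0]])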