Ejemplo n.º 1
def read_config():
    """Build a ``Bot`` for every entry of the mapping returned by ``getTokens()``.

    NOTE(review): this excerpt looks truncated. The ``if not bots:`` guard
    has no body, and the ``bot`` objects created in the loop are neither
    stored nor returned in the lines shown.
    """
    bots = getTokens()
    if not bots:
    # NOTE(review): the body of the guard above is missing from this excerpt.
    for b, t in bots.items():
        # The key is lower-cased before being passed to Bot with its value;
        # presumably key = bot name and value = its token -- TODO confirm.
        bot = Bot(b.lower(), t)
Ejemplo n.º 2
def getTokensTFDF(texts):
    """Aggregate per-token term and document frequencies over ``texts``.

    Every text is split into sentences with ``utils.getSentences`` and then
    tokenized with ``utils.getTokens``; ``utils.getFreq`` supplies the
    per-text token counts (it must return a mapping of token -> count, which
    is how the result is consumed below).

    Args:
        texts: iterable of raw text documents.

    Returns:
        A ``(tokensTFDF, allSents)`` tuple. ``tokensTFDF`` maps each token to
        ``(total_count, document_count)``: the sum of its counts over all
        texts and the number of texts it occurs in. ``allSents`` holds the
        ``utils.getSentences`` result of each text, in input order.
    """
    tokensTF = {}   # token -> count summed over every text
    tokensDF = {}   # token -> number of texts containing the token
    allSents = []
    for t in texts:
        sents = utils.getSentences(t)
        allSents.append(sents)
        toksFreqs = utils.getFreq(utils.getTokens(sents))
        for tok, freq in toksFreqs.items():
            # .get() covers the first sighting of a token, so no separate
            # "already seen?" branch is needed.
            tokensTF[tok] = tokensTF.get(tok, 0) + freq
            # Each text contributes at most once to the document frequency.
            tokensDF[tok] = tokensDF.get(tok, 0) + 1
    # Both dicts share exactly the same keys, so the pairing cannot miss.
    tokensTFDF = {}
    for tok in tokensTF:
        tokensTFDF[tok] = (tokensTF[tok], tokensDF[tok])
    return tokensTFDF, allSents
Ejemplo n.º 3
 def getEntities(self,text):
     """Locate each token of ``text`` and tag it with a candidate class.

     Candidate entities and their classes are looked up in a dictionary
     collection stored in the database (``self.getEntityCollection()``),
     keyed by the upper-cased token.

     Args:
         text: the string to scan; it is tokenized with ``getTokens``.

     Returns:
         A list with one dict per token that could be located in ``text``,
         each holding ``'entity'`` (the token), ``'class'`` (the stored class,
         or ``'Ninguna'`` when the dictionary has no entry), ``'pos'`` (start
         offset in ``text``) and ``'len'`` (token length).
     """
     result = []
     # Search cursor: tokens are assumed to come back in text order, so each
     # search resumes right after the previous match.
     prev_pos = 0
     for token in getTokens(text):
         length = len(token)
         pos = text.find(token, prev_pos)
         if pos < 0 or length == 0:
             # Token not found (or empty). Leave the cursor untouched:
             # storing -1 would make every later search start at the last
             # character of the text.
             continue
         # Step past this match so a repeated token is reported at its own
         # offset instead of the first occurrence again.
         prev_pos = pos + length
         candidate = self.getEntityCollection().find_one({'entity': token.upper()})
         entity_class = candidate['class'] if candidate else 'Ninguna'
         result.append({'entity':token, 'class':entity_class, 'pos':pos, 'len':length})
     return result
Ejemplo n.º 4
 # NOTE(review): this is an excerpt from the interior of a larger routine;
 # `texts`, `utils`, `getTokensTFDF` and `writeToFileSystem` come from outside.
 # Report how many texts are about to be processed (Python 2 print statement).
 print len(texts)
 toksTFDF,allSents = getTokensTFDF(texts)
 # Rank tokens by the product of the two numbers stored in each token's value
 # tuple, highest first.
 sortedToksTFDF = sorted(toksTFDF.items(), key=lambda x: x[1][0]*x[1][1], reverse=True)
 writeToFileSystem(sortedToksTFDF, '../output/toksTFDF_NY.txt',"TFDF")
 # Keep only the token strings of the ten best-ranked entries.
 topToksTuples = sortedToksTFDF[:10]
 topToks = [k for k,_ in topToksTuples]
 allImptSents = []
 eventModelInstances = []
 # allSents is iterated as one collection of sentences per entry.
 for sents in allSents:
     impSents =[]
     #print len(sents)
     for sent in sents:
         sentToks = utils.getTokens(sent)
         intersect = utils.getIntersection(topToks, sentToks)
         # Only sentences sharing more than one token with the top tokens
         # produce an event-model instance.
         if len(intersect) > 1:
         #if not utils.isListsDisjoint(topToks, sentToks):
             evtModelInstance = {}
             # Only the first element of the getEntities result is used; it is
             # iterated and indexed like a mapping below.
             sentEnts = utils.getEntities(sent)[0]
             evtModelInstance["Topic"] = list(intersect)
             # Copy every entry of sentEnts into the instance.
             for ent in sentEnts:
                 evtModelInstance[ent] = sentEnts[ent]
             # NOTE(review): nothing in the visible lines appends to impSents,
             # allImptSents or eventModelInstances -- the excerpt appears to
             # have dropped those statements.
     #print len(impSents)
 # NOTE(review): the body of the loop below is cut off in this excerpt.
 for impSents in allImptSents:
Ejemplo n.º 5
# Get data from fields
urls = form.getvalue('urls')
if not urls:
    # No URLs submitted: fall back to two sample news articles.
    urls = 'http://www.nbcnews.com/storyline/ebola-virus-outbreak/why-its-not-enough-just-eradicate-ebola-n243891\nhttp://www.npr.org/blogs/thetwo-way/2014/11/09/362770821/maine-nurse-to-move-out-of-state-following-ebola-quarantine-row'
topK = 10
intersectionTh = 2

# One URL per line. splitlines() also copes with the "\r\n" line endings
# that form submissions commonly use, and blank lines are dropped so no
# empty URL is fetched.
webpagesURLs = [u.strip() for u in urls.splitlines() if u.strip()]
webpagesText = utils.getWebpageText(webpagesURLs)
# Keep only the pages for which non-empty text was extracted.
texts = [
    t['text'] for t in webpagesText if 'text' in t and len(t['text']) > 0
]

# Get frequent tokens: token counts over all texts, handed to getSorted
# together with the index of the count field.
tokens = utils.getTokens(texts)
f = utils.getFreq(tokens)
tokensFreqs = f.items()
sortedTokensFreqs = utils.getSorted(tokensFreqs, 1)

# Get indicative tokens: each value of toksTFDF is a pair of numbers.
toksTFDF, allSents = getTokensTFDF(texts)

# Rank tokens by the product of their two numbers, highest first.
sortedToksTFDF = sorted(toksTFDF.items(),
                        key=lambda x: x[1][0] * x[1][1],
                        reverse=True)
# NOTE(review): the two placeholders below are left exactly as found; the
# statements that fill them are not part of this excerpt.
filteredToksTFDF = []
toks = " ".join([])
if __name__ == "__main__":
    # Training-script entry point. The statements below read the raw
    # training split, tokenize it, cap every sequence at a fixed length and
    # pass the result to prepareData for vectorization.

    # Fixed settings.
    opt_dim = 50
    # 80th-percentile sentence length of the training data. It was measured
    # once with np.percentile over the token-sequence lengths and is
    # hard-coded here so the measurement need not be repeated.
    percentile = 295

    # Raw training examples and their labels.
    train_raw, labels = getInput('train')

    # One token sequence per training example (stop-word removal is
    # deliberately not applied).
    tokens = getTokens(train_raw)

    # Keep at most `percentile` tokens of every sequence and join them back
    # into a single space-separated string.
    truncatedData = [' '.join(words[:percentile]) for words in tokens]

    # Vectorize the truncated strings; prepareData returns the vectorized
    # data together with a second object kept in `tok`.
    final_data, tok = prepareData(truncatedData, percentile)
Ejemplo n.º 8
# Thoroughly comment your code to make it easy to follow

if __name__ == "__main__":
    # 1. Load the saved model

    # 80th-percentile sentence length of the training dataset; test
    # sequences are cut and padded to this same length.
    maxSentLen = 295

    model = load_model("models/20829490_NLP_model.model")

    # 2. Load the testing data

    test_raw, labels = getInput('test')

    # Tokenize the raw test examples.
    # NOTE(review): the original comment also mentioned stop-word removal and
    # lemmatizing; whether getTokens does that is not visible here -- confirm.
    tokens1 = getTokens(test_raw)

    # Truncate longer sequences to maxSentLen tokens and re-join them into
    # space-separated strings.
    truncatedData = [' '.join(seq[:maxSentLen]) for seq in tokens1]

    # Restore the pickled tokenizer and convert the test strings to integer
    # sequences. The `with` block closes the file handle, which the bare
    # open() call used to leave open.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load "data/token.p" when it comes from a trusted source.
    with open("data/token.p", "rb") as token_file:
        tokenizer = pickle.load(token_file)
    final_data = tokenizer.texts_to_sequences(truncatedData)

    # Pad every sequence at the end ('post') up to maxSentLen.
    final_data = pad_sequences(final_data, maxlen=maxSentLen, padding='post')

    # 3. Run the evaluation on the test data

    evalu = model.evaluate(final_data, labels)
Ejemplo n.º 9
from gensim import corpora, models
import utils
stoplist = utils.stopwordsList
# NOTE(review): `documents` is empty in this excerpt and must be filled with
# raw document strings before running; LdaModel is expected to reject an
# empty corpus -- confirm against the gensim documentation.
documents = []

# Tokenize every document. Each token list is collected in `texts`; the
# earlier loop computed the tokens but never stored them, so the dictionary
# and the corpus were always empty.
texts = [utils.getTokens(doc) for doc in documents]
# Build the dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Define the LDA model and the number of topics.
notopics = 3
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=notopics)
# Print the topics with their probabilities. A single pre-formatted string is
# passed to print so the output is the same on Python 2 and Python 3.
print("\n\n%s Topics with their corresponding probabilities\n" % notopics)
for i in range(lda.num_topics):
    print("Topic %s : %s" % (i + 1, lda.print_topic(i)))
