Example #1
def NER():
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)
    for text in trainCorpus:
        # Note: this hard-coded demo sentence is tagged on every iteration;
        # `text` and the test corpus above are never actually used.
        nerText = "Chelsea began their Premier League title defence with a thrilling 2-2 draw against Swansea, as Thibaut Courtois was sent off and will now miss the crunch clash with Manchester City"
        for sent in nltk.sent_tokenize(nerText):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):  # named-entity subtrees carry a label
                    print(chunk.label(), ' '.join(c[0] for c in chunk.leaves()))
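These listings are method bodies scraped without their imports; they assume module-level imports such as nltk, gensim's corpora/models/similarities, numpy, and the project's own util and Constants helpers. The chunker above also needs NLTK's data packages; a one-time setup sketch (the package names are NLTK's own):

import nltk

# Tokenizer, POS tagger, NE chunker, and the word list the chunker depends on.
for pkg in ("punkt", "averaged_perceptron_tagger",
            "maxent_ne_chunker", "words"):
    nltk.download(pkg)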
Example #2
def ConnectionClarity():
    todayDate = util.getYesterdayDateFolder()  # note: despite the name, this is yesterday's date folder
    lastClarityDate = util.loadSettings(Constants.LAST_CLARITY_DIR)
    lastSuggClarityDate = util.loadSettings(Constants.LAST_SUGG_CLARITY_DIR)
    if lastClarityDate:
        util.logger.info("Google Clarity done last for =" + lastClarityDate)
    else:
        util.logger.info("Google Clarity done last for none")
    if lastSuggClarityDate:
        util.logger.info("Sugg Clarity done last for =" + lastClarityDate)
    else:
        util.logger.info("Sugg Clarity done last for none")

    if todayDate == lastClarityDate and todayDate == lastSuggClarityDate:
        util.logger.info("Clarity signal done for today =" + todayDate)
        return True

    trainFiles = util.findTrainingFiles()
    trainFiles = util.random_select(trainFiles)
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)

    normalClarity = True
    if todayDate != lastClarityDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        normalClarity = printNormalRankedDocs(clarityScore, usedTestFiles)
        if normalClarity:
            util.saveSettings(Constants.LAST_CLARITY_DIR, todayDate)
            util.logger.info("Google Clarity info just completed for =" +
                             todayDate)

    suggClarity = True
    if todayDate != lastSuggClarityDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        suggClarity = printSuggRankedDocs(clarityScore, usedTestFiles)
        if suggClarity:
            util.saveSettings(Constants.LAST_SUGG_CLARITY_DIR, todayDate)
            util.logger.info("SuggGoogle Clarity info just completed for =" +
                             todayDate)

    return normalClarity or suggClarity
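The Clarity class used here is not shown in these examples. A minimal sketch of the classic clarity score (Cronen-Townsend et al.) it presumably computes: the KL divergence between a document language model and the collection language model, where a higher score means a less ambiguous document. All names below are illustrative assumptions, not the project's API.

import math
from collections import Counter

def clarity_score(doc_tokens, collection_tokens):
    # KL(P(w|doc) || P(w|collection)); both models are plain ML estimates.
    doc_counts = Counter(doc_tokens)
    coll_counts = Counter(collection_tokens)
    doc_len = float(len(doc_tokens))
    coll_len = float(len(collection_tokens))
    score = 0.0
    for word, count in doc_counts.items():
        p_doc = count / doc_len
        p_coll = coll_counts.get(word, 0) / coll_len
        if p_coll > 0:  # skip words absent from the collection model
            score += p_doc * math.log(p_doc / p_coll, 2)
    return score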
Example #3
def ConnectionClarity():
    todayDate = util.getTodayDateFolder()
    lastClarityDate = util.loadSettings(Constants.LAST_CLARITY_DIR)
    if todayDate == lastClarityDate:
        util.logger.info("Clarity signal done for today =" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)   
    clarityobj = Clarity(trainCorpus, testCorpus)
    clarityScore = clarityobj.ClarityScore()
    ret = printRankedDocs(clarityScore, usedTestFiles)
    if ret:
        util.saveSettings(Constants.LAST_CLARITY_DIR, todayDate)
        util.logger.info("Clarity info just completed for ="+todayDate)
    return ret
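util.loadSettings and util.saveSettings are the author's helpers; judging by their use, they persist one date string per signal so that each signal runs at most once per day. A hypothetical file-backed equivalent of that guard, for illustration only:

import os

def load_settings(settings_dir):
    # Return the stored date string, or None if nothing was saved yet.
    try:
        with open(os.path.join(settings_dir, "last_run.txt")) as f:
            return f.read().strip()
    except IOError:
        return None

def save_settings(settings_dir, date_str):
    with open(os.path.join(settings_dir, "last_run.txt"), "w") as f:
        f.write(date_str)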
Example #4
def Relevance():
    todayDate = util.getTodayDateFolder()
    lastRelevanceDate = util.loadSettings(Constants.LAST_RELEVANCE_DIR)
    if todayDate == lastRelevanceDate:
        util.logger.info("Relevance signal already done for today: " + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)   
    # Drop hapax legomena (words that occur exactly once in the training set).
    # Note: all_tokens.count(word) makes this scan quadratic; a Counter-based
    # alternative is sketched after Example #5.
    all_tokens = sum(trainCorpus, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    testJson = {}
    for count, text in enumerate(testCorpus):
        vec = dictionary.doc2bow(text)
        sims = index[tfidf[vec]]  # similarity of this test doc to every training doc
        testJson[usedTestFiles[count]] = sum(sims)
    ret = printRankedDocs(testJson)
    if ret:
        util.saveSettings(Constants.LAST_RELEVANCE_DIR, todayDate)
        util.logger.info("Relevance info just completed for ="+todayDate)
    return ret
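The gensim pipeline above (Dictionary -> doc2bow -> TfidfModel -> SparseMatrixSimilarity) can be tried in isolation; a self-contained toy run with made-up documents:

from gensim import corpora, models, similarities

train = [["chelsea", "premier", "league", "draw"],
         ["manchester", "city", "premier", "league"],
         ["cooking", "pasta", "recipe"]]
dictionary = corpora.Dictionary(train)
bow = [dictionary.doc2bow(doc) for doc in train]
tfidf = models.TfidfModel(corpus=bow, id2word=dictionary, normalize=True)
index = similarities.SparseMatrixSimilarity(tfidf[bow],
                                            num_features=len(dictionary))

query = dictionary.doc2bow(["premier", "league", "title"])
print(index[tfidf[query]])  # one cosine similarity per training document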
Example #5
def Relevance():
    todayDate = util.getYesterdayDateFolder()  # note: despite the name, this is yesterday's date folder
    lastRelevanceDate = util.loadSettings(Constants.LAST_RELEVANCE_DIR)
    lastSuggRelevanceDate = util.loadSettings(
        Constants.LAST_SUGG_RELEVANCE_DIR)

    if lastRelevanceDate:
        util.logger.info("Google Relevance done last for =" +
                         lastRelevanceDate)
    else:
        util.logger.info("Google Relevance done last for None")

    if lastSuggRelevanceDate:
        util.logger.info("Sugg Relevance done last for =" + lastRelevanceDate)
    else:
        util.logger.info("Sugg Relevance done last for None")

    if todayDate == lastRelevanceDate and todayDate == lastSuggRelevanceDate:
        util.logger.info("Relevance signal already done for today :" +
                         todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    # Drop hapax legomena, as in Example #4; the quadratic count() scan is the
    # same (see the Counter-based sketch after this example).
    all_tokens = sum(trainCorpus, [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus,
                              id2word=dictionary,
                              normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))

    normalRelevance = True
    if todayDate != lastRelevanceDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testJson = {}
        for count, text in enumerate(testCorpus):
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            testJson[usedTestFiles[count]] = sum(sims)
        normalRelevance = printNormalRankedDocs(testJson)
        if normalRelevance:
            util.saveSettings(Constants.LAST_RELEVANCE_DIR, todayDate)
            util.logger.info("Google Relevance info just completed for =" +
                             todayDate)

    suggRelevance = True
    if todayDate != lastSuggRelevanceDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testJson = {}
        for count, text in enumerate(testCorpus):
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            testJson[usedTestFiles[count]] = sum(sims)
        suggRelevance = printSuggRankedDocs(testJson)
        if suggRelevance:
            util.saveSettings(Constants.LAST_SUGG_RELEVANCE_DIR, todayDate)
            util.logger.info("Google Relevance info just completed for =" +
                             todayDate)

    return normalRelevance or suggRelevance
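The hapax filter in both Relevance variants rescans all_tokens once per unique word, which is quadratic in corpus size. A linear-time sketch of the same filtering with collections.Counter (identical output, just faster):

from collections import Counter

def drop_hapax(corpus):
    # Remove words that occur exactly once across the whole corpus.
    freq = Counter(word for doc in corpus for word in doc)
    return [[word for word in doc if freq[word] > 1] for doc in corpus]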
Example #6
def Smoothness():
    todayDate = util.getYesterdayDateFolder()  # note: despite the name, this is yesterday's date folder
    lastSmoothnessDate = util.loadSettings(Constants.LAST_SMOOTHNESS_DIR)
    lastSuggSmoothnessDate = util.loadSettings(
        Constants.LAST_SUGG_SMOOTHNESS_DIR)

    if lastSmoothnessDate:
        util.logger.info("Google Smoothness done last for =" +
                         lastSmoothnessDate)
    else:
        util.logger.info("Google Smoothness done last for none")

    if lastSuggSmoothnessDate:
        util.logger.info("Sugg Google Smoothness done last for =" +
                         lastSuggSmoothnessDate)
    else:
        util.logger.info("Sugg Google Smoothness done last for none")

    if todayDate == lastSmoothnessDate and todayDate == lastSuggSmoothnessDate:
        util.logger.info("Smoothness signal done for today" + todayDate)
        return True

    trainFiles = util.findTrainingFiles()
    trainFiles = util.random_select(trainFiles)
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    bm25obj = Bm25(trainCorpus)
    trainUniqueWords = []
    for trainText in trainCorpus:
        trainUniqueWords.append(set(trainText))

    normalSmoothness = True
    if todayDate != lastSmoothnessDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testUniqueWords = []
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
        for testDoc in range(len(testCorpus)):
            uniqueTest = testUniqueWords[testDoc]
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                # Words of the train doc missing from the test doc, and vice versa
                SminusD = [
                    word for word in trainCorpus[trainDoc]
                    if word not in uniqueTest
                ]
                DminusS = [
                    word for word in testCorpus[testDoc]
                    if word not in uniqueTrain
                ]
                # Each BM25Score call yields a length-bm25obj.N score vector
                # (one entry per training doc); their dot product is the
                # smoothness of this test/train pair.
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext,
                                                       DminusScontext)
        normalSmoothness = printNormalRankedDocs(smoothness, usedTestFiles)
        if normalSmoothness:
            util.saveSettings(Constants.LAST_SMOOTHNESS_DIR, todayDate)
            util.logger.info("Google Smoothness info just completed for =" +
                             todayDate)

    suggSmoothness = True
    if todayDate != lastSuggSmoothnessDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testUniqueWords = []
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
        for testDoc in range(len(testCorpus)):
            uniqueTest = testUniqueWords[testDoc]
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [
                    word for word in trainCorpus[trainDoc]
                    if word not in uniqueTest
                ]
                DminusS = [
                    word for word in testCorpus[testDoc]
                    if word not in uniqueTrain
                ]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext,
                                                       DminusScontext)
        suggSmoothness = printSuggRankedDocs(smoothness, usedTestFiles)
        if suggSmoothness:
            util.saveSettings(Constants.LAST_SUGG_SMOOTHNESS_DIR, todayDate)
            util.logger.info("Sugg Smoothness info just completed for =" +
                             todayDate)

    return normalSmoothness or suggSmoothness
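Bm25 and its BM25Score method are the author's own class and are not shown here. As a stand-in for experimentation, the rank_bm25 package exposes the same operation; this toy sketch scores a token list against a small corpus and returns one score per corpus document, which is the vector shape the dot product above relies on.

from rank_bm25 import BM25Okapi

corpus = [["premier", "league", "title"],
          ["transfer", "window", "news"],
          ["league", "fixtures", "today"]]
bm25 = BM25Okapi(corpus)
scores = bm25.get_scores(["premier", "league"])  # numpy array, len(corpus) entries
print(scores)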