Ejemplo n.º 1
0
class DocumentsSimilarity(object):
    concepts = None
    conceptsSimilarity = None
    tm = None
    documentsList = None
    wordcount = None
    threads = None
    topics = None
    topicConcepts = None
    mapDocumentList = None
    df = None
    mapDocuments = None
    
    def __init__(self):
        self.threads = list()
    
    def createTopics(self, conceptsFile, concepts=None):

        if concepts is None:
            logger.info(u"Load Concepts from " + conceptsFile)
            self.concepts = Concepts.loadConcepts(conceptsFile)
            logger.info(u"Loaded Concepts")
        else:
            self.concepts = concepts

        self.tm = TopicsModel()

        logger.info(u"Load Documents from Concepts")

        self.documentsList, self.wordcount = self.tm.loadConceptsWords(self.concepts)

        self.mapDocuments = { self.documentsList.index(x) : x for x in self.documentsList}

        self.df = self.concepts.getConcepts().keys()

        logger.info(u"Read " + str(len(self.documentsList)) + u" Documents, with " + str(self.wordcount) + u" words.")

        if self.wordcount == 0:
            logger.error(u"No topics to use!")
            return None

        logger.info(u"Compute Topics")
        self.topics = self.tm.computeTopics(self.documentsList, nt=num_topics, nw=num_words)

        logger.info(u"Log Topics")
        self.tm.logTopics(self.topics)

        self.listTopics = [unicode(x[0]).strip().replace(u"\"", u"") for x in self.topics]

        logger.info(u"Saving Topics")
        self.topicConcepts = self.tm.saveTopics(self.topics)

        logger.info(u"Complete createTopics")

        return self.concepts

    def findSimilarties(self, conceptsSimilarityFile):

        logger.info(u"Compute Similarity")

        self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

        # Compute similarity between documents / concepts
        similarityThreshold = similarity

        if THREAD:
            # startup the threads
            for threadID in range(0,ThreadDepth):
                thread = myThread(threadID, self)
                thread.start()
                self.threads.append(thread)

        for document in self.documentsList:
            indexNum = self.documentsList.index(document)

            logger.info(u"Document %s" % (self.df[indexNum]))

            pj = self.conceptsSimilarity.addConceptKeyType(self.df[indexNum], u"Document")

            logger.debug(u"  documentsList[%d] = %s" % (indexNum, str(document)))

            # Show common topics

            d = [unicode(x).strip().replace(u"'", u"") for x in document]
            e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]

            s1 = set(e)
            s2 = set(d)
            common = s1 & s2
            lc = [x for x in common]
            logger.debug(u"  Common Topics : %s" % (lc))

            if THREAD is False:

                self.doComputation(document, similarityThreshold, pj, Topics=True)

            else:
                logger.debug(u"npbtAquire  Queue Lock")
                queueLock.acquire()
                logger.debug(u"npbtPut     Queue     ")
                rl = [document, similarityThreshold]

                while workQueue.qsize() == QueueDepth:
                    time.sleep(1)

                workQueue.put(rl)
                queueLock.release()
                logger.debug(u"npbtRelease Queue Lock")

                qs = workQueue.qsize()
                if qs % QueueDelta == 0:
                    logger.info(u"rQueue Size = %s" % qs)

                # Wait for queue to empty
                qs = workQueue.qsize()
                while qs != 0:
                    time.sleep(1)
                    qs = workQueue.qsize()
                    if qs % QueueDelta == 0:
                        logger.info(u"wQueue Size = %s" % qs)

                # Notify threads it's time to exit
                exitFlag = 1

                # Wait for all threads to complete
                for t in self.threads:
                    logger.info(u"Waiting for thread %s to end..." % t)
                    t.join(0.5)

        Concepts.saveConcepts(self.conceptsSimilarity, conceptsSimilarityFile)

        # Concepts.outputConceptsToCSV(self.conceptsSimilarity, fileExport=u"BusinessRequirements.csv")

        logger.info(u"Complete - findSimilarties")

        return self.conceptsSimilarity

    def doComputation(self, j, similarityThreshold, pj, Topics=False):

        pt = None

        pl = self.tm.computeSimilar(self.documentsList.index(j), self.documentsList, similarityThreshold)

        if len(pl) != 0:
            logger.debug(u"   similarity above threshold")
            logger.debug(u"   pl:" + str(pl))

            for l in pl:
                if l[1] != l[2]:
                    logger.debug(u"   l:" + str(l))

                    ni = self.documentsList.index(l[2])
                    mdl = u",".join([ q for q in self.mapDocuments[ni]])

                    dfni = unicode(self.df[ni])

                    logger.info(u"    Similar Document : %s" % (dfni))

                    ps = pj.addConceptKeyType(dfni, u"SimilarDocument")
                    # ps.count = TopicsModel.convertMetric(l[0])

                    common = set(l[1]) & set(l[2])
                    lc = [x for x in common]

                    logger.debug(u"  l[1] : %s" % (l[1]))
                    logger.debug(u"  l[2] : %s" % (l[2]))
                    logger.debug(u"  Common : %s" % (lc))

                    pt = ps.addConceptKeyType(mdl, u"DocumentTopics")

                    for x in common:
                        pc = pt.addConceptKeyType(x, u"CommonTopic")
                        pc.count = len(lc)
                
        else:
            logger.debug(u"   similarity below threshold")
Ejemplo n.º 2
0
class DocumentsSimilarity(object):
    conceptsDoc = None
    conceptsSimilarity = None
    conceptsSimilarityFile = None
    tm = None
    documentsList = None
    wordcount = None
    threads = None
    topics = None
    topicConcepts = None
    lemmatizer = None
    df = None

    def __init__(self, al):
        self.num_topics = 100
        self.num_words = 50
        self.similarity = 0.95
        self.al = al

        self.conceptsSimilarityFile = u"GapsSimilarity.p"

    def createTopics(self, concepts):

        self.concepts = concepts

        self.tm = TopicsModel(directory=os.getcwd() + os.sep)

        logger.debug(u"--Load Documents from Concepts")
        self.documentsList, self.wordcount = self.tm.loadConceptsWords(self.concepts)

        logger.info(u"--Read %s Documents, with %s words." % (str(len(self.documentsList)), str(self.wordcount)))

        logger.info(u"--Compute Topics--")
        self.topics = self.tm.computeTopics(self.documentsList, nt=self.num_topics, nw=self.num_words)

        if False:
            logger.info(u"--Log Topics--")
            self.tm.logTopics(self.topics)

        # self.listTopics = [x[0].encode('ascii', errors="ignore").strip() for x in self.topics]
        self.listTopics = [x[0] for x in self.topics]

        logger.info(u"--Saving Topics--")

        self.topicConcepts = self.tm.saveTopics(self.topics)

    def findSimilarties(self):

        logger.info(u"Compute Similarity")

        self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

        # Compute similarity between documents / concepts
        similarityThreshold = self.similarity

        for document in self.documentsList:
            indexNum = self.documentsList.index(document)

            self.df = self.concepts.getConcepts().keys()

            logger.info(u"++conceptsDoc %s" % (self.df[indexNum]))
            logger.info(u"  documentsList[" + str(indexNum) + u"]=" + u"".join(x + u" " for x in document))

            # Show common topics
            d = [unicode(x).strip().replace(u"'", u"") for x in document]
            e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]

            s1 = set(e)
            s2 = set(d)
            common = s1 & s2
            lc = [x for x in common]
            logger.info(u"  Common Topics : %s{%s}" % (lc, self.al.dictName[document][ARCHI_TYPE]))

            self.doComputation(indexNum, similarityThreshold, tfAddWords=True)

        Concepts.saveConcepts(self.conceptsSimilarity, conceptsSimilarityFile)

        logger.info(u"Saved Concepts : %s" % conceptsSimilarityFile)

        return self.conceptsSimilarity

    def doComputation(self, j, similarityThreshold, tfAddWords=True):
        logger.debug(u"--doComputation--")
        pl = self.tm.computeSimilar(j, self.documentsList, similarityThreshold)

        if len(pl) != 0:
            logger.debug(u"   similarity above threshold - %2.3f" % (100.0 * float(pl[0][0])))
            logger.debug(u"   pl:" + str(pl))

            for l in pl:
                if l[1] != l[2]:
                    logger.debug(u"  l:" + str(l))
                    l1 = u"".join(x + u" " for x in l[1])
                    ps = self.conceptsSimilarity.addConceptKeyType(l1, u"Similar")
                    ps.count = TopicsModel.convertMetric(l[0])

                    l2 = u"".join(x + " " for x in l[2])
                    pt = ps.addConceptKeyType(l2, u"Concept")

                    common = set(l[1]) & set(l[2])
                    lc = [x for x in common]

                    logger.debug(u"  l    : %s" % l)
                    logger.debug(u"  l[1] : %s" % (l1))
                    logger.debug(u"  l[2] : %s" % (l2))
                    logger.debug(u"  Common : %s" % (lc))

                    if tfAddWords is True:
                        for x in common:
                            if not x in stop:
                                logger.debug(u"word : %s" % x)
                                pc = pt.addConceptKeyType(x, u"CommonTopic")
                                pc.count = len(lc)

        else:
            logger.debug(u"   similarity below threshold")