class Collocations(object):
    """Find bigram and trigram collocations in the names of loaded concepts.

    The constructor loads a ``Concepts`` tree from ``conceptFile`` and creates
    three empty result trees:

    * ``conceptsNGram``        -- the best-ranked bigrams/trigrams, each with
      its POS-tagged words as children;
    * ``conceptsNGramScore``   -- every scored bigram/trigram, with the score
      stored on the entry's ``count`` attribute;
    * ``conceptsNGramSubject`` -- noun-tagged words, each linking to the
      n-grams that contain it.

    ``find_collocations`` fills the three trees and saves each one through
    ``Concepts.saveConcepts``.
    """

    concepts = None
    conceptsNGram = None
    conceptsNGramScore = None
    # Historical misspelling of ``conceptsNGramScore``; kept only so that
    # existing references to the old class attribute still resolve.
    conceptNGramScore = None
    conceptsNGramSubject = None

    # Default file names handed to Concepts.loadConcepts / saveConcepts.
    conceptFile = u"documents.p"
    ngramFile = u"ngrams.p"
    ngramScoreFile = u"ngramscore.p"
    ngramSubjectFile = u"ngramsubject.p"

    # Tuning knobs (values unchanged from the previously hard-coded ones).
    nbestCount = 100      # how many top-ranked n-grams go into conceptsNGram
    trigramMinFreq = 3    # frequency filter applied to the trigram finder only

    def __init__(self, conceptFile=None):
        """Load the source concepts and create the empty result trees.

        :param conceptFile: file passed to ``Concepts.loadConcepts``; when
            ``None`` the class-level default ``conceptFile`` is used.
        """
        if conceptFile is None:
            conceptFile = self.conceptFile
        # Remember which file was actually loaded.
        self.conceptFile = conceptFile

        logger.info(u"Load Concepts from %s ", conceptFile)
        self.concepts = Concepts.loadConcepts(conceptFile)
        logger.info(u"Loaded Concepts")

        self.conceptsNGram = Concepts(u"n-gram", u"NGRAM")
        self.conceptsNGramScore = Concepts(u"NGram_Score", u"Score")
        self.conceptsNGramSubject = Concepts(u"Subject", u"Subjects")

    def getCollocationConcepts(self):
        """Return ``(conceptsNGram, conceptsNGramScore, conceptsNGramSubject)``."""
        return self.conceptsNGram, self.conceptsNGramScore, self.conceptsNGramSubject

    def find_collocations(self):
        """Compute collocations over all concept names and save the results.

        Steps, in order: lemmatise every token of every concept name, build
        bigram and trigram finders over that word stream, record the
        ``nbestCount`` best n-grams by likelihood ratio, record the score of
        every n-gram, save the score and n-gram trees, then build and save
        the noun -> n-gram subject tree.
        """
        stopset = set(stop)

        def filter_stops(w):
            # True means "drop this word": very short tokens and stop words.
            return len(w) < 3 or w in stopset

        words = self._collect_words()

        bcf = BigramCollocationFinder.from_words(words)
        tcf = TrigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(self.trigramMinFreq)

        self._add_nbest(bcf, BigramAssocMeasures.likelihood_ratio,
                        u"BiGram", u"Bigram : %s")
        self._add_nbest(tcf, TrigramAssocMeasures.likelihood_ratio,
                        u"TriGram", u"Trigram : %s")

        self._add_scores(bcf, BigramAssocMeasures.likelihood_ratio,
                         u"BiGram", u"BWord", u"bcfscored: %s=%s")
        self._add_scores(tcf, TrigramAssocMeasures.likelihood_ratio,
                         u"TriGram", u"TWord", u"tcfscored: %s = %s")

        Concepts.saveConcepts(self.conceptsNGramScore, self.ngramScoreFile)
        Concepts.saveConcepts(self.conceptsNGram, self.ngramFile)

        self._index_subjects()
        Concepts.saveConcepts(self.conceptsNGramSubject, self.ngramSubjectFile)

    def _collect_words(self):
        """Return the lemmatised, lower-cased tokens of every concept name."""
        lemmatizer = WordNetLemmatizer()
        words = []
        nouns = {}  # lemma -> surface form, for noun-tagged tokens (logging only)

        for document in self.concepts.getConcepts().values():
            logger.debug(document.name)
            for concept in document.getConcepts().values():
                logger.debug(concept.name)
                tagged = nltk.pos_tag(nltk.wordpunct_tokenize(concept.name))
                for word, pos in tagged:
                    logger.debug(u"Word: %s POS: %s", word, pos)
                    lemmaWord = lemmatizer.lemmatize(word.lower())
                    logger.debug(u"Word: %s Lemma: %s", word, lemmaWord)
                    words.append(lemmaWord)
                    # startswith() tolerates an empty tag, unlike pos[0].
                    if pos.startswith(u"N"):
                        nouns[lemmaWord] = word

        for lemma in nouns:
            logger.info(u"noun : %s", lemma)

        return words

    def _add_nbest(self, finder, measure, ngramType, logFormat):
        """Add the ``nbestCount`` best n-grams of *finder* to ``conceptsNGram``."""
        for ngram in finder.nbest(measure, self.nbestCount):
            name = u" ".join(ngram)
            entry = self.conceptsNGram.addConceptKeyType(name, ngramType)
            logger.info(logFormat, name)
            # Each word of the n-gram becomes a child typed by its POS tag.
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(name)):
                entry.addConceptKeyType(word, pos)

    def _add_scores(self, finder, measure, ngramType, wordType, logFormat):
        """Add every scored n-gram of *finder* to ``conceptsNGramScore``."""
        scored = sorted(finder.score_ngrams(measure),
                        key=lambda c: c[1], reverse=True)
        for ngram, score in scored:
            name = u" ".join(ngram)
            count = float(score)
            entry = self.conceptsNGramScore.addConceptKeyType(name, ngramType)
            for word in ngram:
                entry.addConceptKeyType(word, wordType)
            entry.count = count
            logger.debug(logFormat, name, count)

    def _index_subjects(self):
        """Link each noun-tagged word in ``conceptsNGram`` to its n-grams."""
        for concept in self.conceptsNGram.getConcepts().values():
            tagged = nltk.pos_tag(nltk.wordpunct_tokenize(concept.name))
            for word, pos in tagged:
                if pos.startswith(u"N"):
                    subject = self.conceptsNGramSubject.addConceptKeyType(word, pos)
                    subject.addConceptKeyType(concept.name, u"NGRAM")