def calculate_score2(self, doc=""):
    entities = self.webpageEntities(doc)
    if len(entities) > 1:
        # Collect the named entities of interest from every sentence that
        # mentions a disaster entity.
        uentities = {"Disaster": [], "LOCATION": [], "DATE": []}
        for sent in entities:
            dictval = sent[1]
            if "Disaster" in dictval:
                for k in dictval:
                    if k in ("LOCATION", "Disaster", "DATE"):
                        uentities.setdefault(k, []).extend(dictval[k])
        # Tokenize the collected entities and de-duplicate them per type.
        webpageEntities = []
        for k in uentities:
            temp = uentities[k]
            ltext = " ".join(temp)
            if k != "Disaster":
                tokens = getTokenizedDoc(ltext)
            else:
                tokens = temp
            webpageEntities.extend(tokens)
            uentities[k] = list(set(tokens))
        # Score the page by the overlap between its entities and the
        # reference entity set.
        webpageEntitiesSet = set(webpageEntities)
        intersect = len(webpageEntitiesSet & self.entity_set)
        score = intersect * 1.0 / len(self.entity_set)
        if score < 0:
            score = 0.0
    else:
        # Too few entities extracted; fall back to the similarity-based score.
        score = self.calculate_similarity(doc)
    return score
def calculate_score2(self, doc=""):
    entities = self.webpageEntities(doc)
    if len(entities) > 1:
        # Collect the named entities of interest from every sentence that
        # mentions a topic entity.
        uentities = {"Topic": [], "LOCATION": [], "DATE": []}
        for sent in entities:
            dictval = sent[1]
            if "Topic" in dictval:
                for k in dictval:
                    if k in ("LOCATION", "Topic", "DATE"):
                        uentities.setdefault(k, []).extend(dictval[k])
        # Tokenize the collected entities and de-duplicate them per type.
        webpageEntities = []
        for k in uentities:
            temp = uentities[k]
            ltext = " ".join(temp)
            if k != "Topic":
                tokens = getTokenizedDoc(ltext)
            else:
                tokens = temp
            webpageEntities.extend(tokens)
            uentities[k] = list(set(tokens))
        # Score the page by the overlap between its entities and the
        # reference entity set.
        webpageEntitiesSet = set(webpageEntities)
        intersect = len(webpageEntitiesSet & self.entity_set)
        score = intersect * 1.0 / len(self.entity_set)
        if score < 0:
            score = 0.0
    else:
        # Too few entities extracted; fall back to the similarity-based score.
        score = self.calculate_similarity(doc)
    return score
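# --- Illustration (not part of the original class) ---------------------------
# A minimal, self-contained sketch of the entity-overlap idea used by the two
# calculate_score2 variants above: the fraction of the reference entity set
# that also appears among the entities extracted from the page. The names
# entity_overlap_score, page_entities, and reference_entities are hypothetical
# and exist only for this example.
def entity_overlap_score(page_entities, reference_entities):
    """Return |intersection(page, reference)| / |reference|, or 0.0 if reference is empty."""
    page_set = set(page_entities)
    reference_set = set(reference_entities)
    if not reference_set:
        return 0.0
    return len(page_set & reference_set) / float(len(reference_set))

# Example usage with made-up entities:
# entity_overlap_score(["texas", "hurricane", "2017"], ["hurricane", "texas", "flood"])
# -> 0.666...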
def convertDoctoTFIDF(self, doc):
    '''
    Superseded inline preprocessing, kept for reference (preprocessing is now
    delegated to getTokenizedDoc):
        stemmer = PorterStemmer()
        tokenizer = WordPunctTokenizer()
        stopwords_e = stopwords.words('english')
        stopwords_e.extend(["last", "time", "week", "favorite", "home", "search", "follow",
                            "year", "account", "update", "com", "video", "close", "http",
                            "retweet", "tweet", "twitter", "news", "people", "said",
                            "comment", "comments", "share", "email", "new", "would",
                            "one", "world"])
        tokens = tokenizer.tokenize(doc)
        clean = [token for token in tokens if token.isalnum()]
        clean = [token.lower() for token in clean if token.lower() not in stopwords_e]
        clean = [token for token in clean if len(token) > 2]
        final_doc = [stemmer.stem(word) for word in clean]
    '''
    # Build a log-scaled (tf, word) vector over the model vocabulary.
    final_doc = getTokenizedDoc(doc)
    doc_tfidf = []
    words = self.model[1]  # self.model[1] holds the vocabulary word list
    for word in words:
        tf = final_doc.count(word)
        if tf > 0:
            tf = 1 + math.log(tf)  # log-scaled term frequency; requires `import math`
        doc_tfidf.append((tf, word))
    return doc_tfidf
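# --- Illustration (not part of the original class) ---------------------------
# A minimal sketch of how a (tf, word) list like the one produced by
# convertDoctoTFIDF could be scored against a reference weight vector with
# cosine similarity. The model layout assumed here (parallel lists of weights
# and words) is an assumption for the example, not a confirmed detail of the
# original self.model structure, and cosine_similarity is a hypothetical helper.
import math

def cosine_similarity(doc_tfidf, model_weights, model_words):
    """doc_tfidf: list of (tf, word); model_weights/model_words: parallel lists."""
    doc_map = {word: tf for tf, word in doc_tfidf}
    dot = sum(w * doc_map.get(word, 0.0)
              for w, word in zip(model_weights, model_words))
    doc_norm = math.sqrt(sum(tf * tf for tf in doc_map.values()))
    model_norm = math.sqrt(sum(w * w for w in model_weights))
    if doc_norm == 0.0 or model_norm == 0.0:
        return 0.0
    return dot / (doc_norm * model_norm)

# Example usage with made-up weights:
# cosine_similarity([(1.0, "flood"), (0.0, "storm")], [0.7, 0.3], ["flood", "storm"])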