def save(self, *args, **kwargs):
    # If this event was detected as a duplicate, merge it into the
    # existing document instead of creating a new one.
    if self.duplication:
        self.merge()
    else:
        document = Document()
        document.description = self.description
        document.save()

        # Tokenize each field and attach the tokens to the document
        # through the corresponding order (through) model.
        titleTokens = getTokensFromText(self.title)
        for title in titleTokens:
            titleToken, _ = Token.objects.get_or_create(name=title)
            TitleOrder.objects.create(token=titleToken, document=document)

        dateTokens = getTokensFromText(self.date)
        for date in dateTokens:
            dateToken, _ = Token.objects.get_or_create(name=date)
            DateOrder.objects.create(token=dateToken, document=document)

        locationTokens = getTokensFromText(self.location)
        for location in locationTokens:
            locationToken, _ = Token.objects.get_or_create(name=location)
            LocationOrder.objects.create(token=locationToken, document=document)

        genreTokens = getTokensFromList(self.genres)
        for genre in genreTokens:
            genreToken, _ = Token.objects.get_or_create(name=genre)
            GenresOrder.objects.create(token=genreToken, document=document)

        artistTokens = getTokensFromList(self.artists)
        for artist in artistTokens:
            artistToken, _ = Token.objects.get_or_create(name=artist)
            ArtistOrder.objects.create(token=artistToken, document=document)

        tagTokens = getTokensFromList(self.tags)
        for tag in tagTokens:
            tagToken, _ = Token.objects.get_or_create(name=tag)
            TagOrder.objects.create(token=tagToken, document=document)

        for url in self.urls:
            urlObject, _ = Url.objects.get_or_create(name=url)
            UrlOrder.objects.create(url=urlObject, document=document)

        for imageUrl in self.imageUrls:
            imageUrlObject, _ = Url.objects.get_or_create(name=imageUrl)
            ImageOrder.objects.create(url=imageUrlObject, document=document)

        document.save()
def merge(self):
    if self.duplication:
        # Keep the existing description unless it is missing.
        if not self.duplication.description:
            self.duplication.description = self.description
            self.duplication.save()

        # For every field, reuse an existing token with the same name
        # (get_or_create) so the duplicate-order checks below can match,
        # and only create an order entry if one does not exist yet.
        dateTokens = getTokensFromText(self.date)
        for date in dateTokens:
            dateToken, _ = Token.objects.get_or_create(name=date)
            if not DateOrder.objects.filter(token=dateToken, document=self.duplication).exists():
                DateOrder.objects.create(token=dateToken, document=self.duplication)

        locationTokens = getTokensFromText(self.location)
        for location in locationTokens:
            locationToken, _ = Token.objects.get_or_create(name=location)
            if not LocationOrder.objects.filter(token=locationToken, document=self.duplication).exists():
                LocationOrder.objects.create(token=locationToken, document=self.duplication)

        genreTokens = getTokensFromList(self.genres)
        for genre in genreTokens:
            genreToken, _ = Token.objects.get_or_create(name=genre)
            if not GenresOrder.objects.filter(token=genreToken, document=self.duplication).exists():
                GenresOrder.objects.create(token=genreToken, document=self.duplication)

        artistTokens = getTokensFromList(self.artists)
        for artist in artistTokens:
            artistToken, _ = Token.objects.get_or_create(name=artist)
            if not ArtistOrder.objects.filter(token=artistToken, document=self.duplication).exists():
                ArtistOrder.objects.create(token=artistToken, document=self.duplication)

        tagTokens = getTokensFromList(self.tags)
        for tag in tagTokens:
            tagToken, _ = Token.objects.get_or_create(name=tag)
            if not TagOrder.objects.filter(token=tagToken, document=self.duplication).exists():
                TagOrder.objects.create(token=tagToken, document=self.duplication)

        for url in self.urls:
            urlObject, _ = Url.objects.get_or_create(name=url)
            if not UrlOrder.objects.filter(url=urlObject, document=self.duplication).exists():
                UrlOrder.objects.create(url=urlObject, document=self.duplication)

        for imageUrl in self.imageUrls:
            imageUrlObject, _ = Url.objects.get_or_create(name=imageUrl)
            if not ImageOrder.objects.filter(url=imageUrlObject, document=self.duplication).exists():
                ImageOrder.objects.create(url=imageUrlObject, document=self.duplication)

        self.duplication.save()
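# The models used above (Document, Token, Url and the *Order through
# models such as TitleOrder and UrlOrder) are assumed to live in this
# app's models module, with Token and Url keyed by a `name` field; that
# assumption is taken from the name= lookups in findDuplicate and
# findDuplicateInResults below.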
def multiLabelClassification(document):
    # Requires WordNetLemmatizer (nltk.stem) and the RAKE implementation
    # shipped with the project (rake.Rake).
    if document.description:
        text = document.description
        lentext = len(text)
        ## Lemmatize the words in the text.
        lem = WordNetLemmatizer()
        words = text.split()
        text = ' '.join([lem.lemmatize(i) for i in words])
        ## Extract keywords, output with scores.
        if lentext < 10:
            ## When the description is too short, we don't need keywords.
            tags = None
        else:
            ## RAKE parameters: minimum word length 3, phrases of at most
            ## 3 words; for longer descriptions a keyword must occur at
            ## least twice.
            minFrequency = 1 if lentext < 150 else 2
            rake = Rake(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt", 3, 3, minFrequency)
            tags = rake.run(text)
        if tags:
            for tag in tags:
                tokens = getTokensFromText(tag[0])
                for token in tokens:
                    document.tags.append(token)
    return document
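# For reference, rake.run(text) in this RAKE implementation returns
# (phrase, score) pairs sorted by descending score, so tag[0] above is
# the phrase text. A hypothetical example:
#
#   [("open air jazz festival", 8.5), ("city park", 4.0)]
#
# Each phrase is re-tokenized and every resulting token becomes a tag
# on the document.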
def findDuplicate(document):
    # A candidate duplicate must share every title token with this
    # document, so intersect the matching documents per title token.
    results = []
    titleTokens = getTokensFromText(document.title)
    for index, titleToken in enumerate(titleTokens):
        # Union of all documents that use this title token.
        documentResults = set()
        for token in Token.objects.filter(name=titleToken):
            documentResults.update(token.title_tokens.all())
        if index == 0:
            results = list(documentResults)
        else:
            results = list(set(results) & documentResults)
    if len(results) > 0:
        document.duplication = findDuplicateInResults(document, results)
    return document
def findDuplicateInResults(document, results):
    for result in results:
        # Sharing a URL means it is definitely a duplicate.
        for url in result.urls.all():
            if url.name in document.urls:
                return result
        # Otherwise require overlap in date, location, genres and artists.
        if (hasOverlap(getTokensFromText(document.date), result.date.all())
                and hasOverlap(getTokensFromText(document.location), result.location.all())
                and hasOverlap(getTokensFromList(document.genres), result.genres.all())
                and hasOverlap(getTokensFromList(document.artists), result.artists.all())):
            return result
    return None
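# hasOverlap is used above but not defined in this section. A minimal
# sketch, assuming it tests whether any extracted token string matches
# the name of a stored Token object (the field name is taken from the
# Token.objects.filter(name=...) lookups above):
def hasOverlap(tokenStrings, tokenObjects):
    storedNames = set(token.name for token in tokenObjects)
    return any(name in storedNames for name in tokenStrings)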
def vcspace(docTexts):
    # Vector-space model over the original texts (a list/set): build a
    # term-count vector per text and return all pairwise cosine
    # similarities as a flat, row-major list.
    doclists = []
    tokenlist = []  # vocabulary, in order of first occurrence
    for text in docTexts:
        text = decompose(text, False)
        # Every text becomes a list of tokens.
        tokens = getTokensFromText(text)
        doclists.append(tokens)
        for token in tokens:
            if token not in tokenlist:
                tokenlist.append(token)

    tokenNum = len(tokenlist)  # number of columns
    # One count vector (row) per text.
    rowlists = [[0] * tokenNum for _ in doclists]
    for n, tokens in enumerate(doclists):
        for token in tokens:
            rowlists[n][tokenlist.index(token)] += 1

    similarity = []
    for list1 in rowlists:
        for list2 in rowlists:
            # Euclidean norms of both count vectors.
            normText1 = sum(count * count for count in list1) ** 0.5
            normText2 = sum(count * count for count in list2) ** 0.5
            dot = sum(a * b for a, b in zip(list1, list2))
            similarity.append(dot / normText1 / normText2)
    return similarity
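# A hypothetical usage sketch: for n input texts, vcspace returns n * n
# cosine similarities in row-major order, so entry i * n + j compares
# text i with text j (the diagonal entries i * n + i are 1.0 for
# non-empty texts).
#
#   texts = ["jazz concert in the park", "open air jazz festival"]
#   sims = vcspace(texts)
#   n = len(texts)
#   crossSimilarity = sims[0 * n + 1]  # text 0 vs. text 1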
def createSuggestions(query, documents):
    ## Input: the original query and the relevant documents (the top 5
    ## retrieved documents in our case).
    doclists = []
    rowlists = []
    tokenlist = []
    for document in documents:
        tokens = document[0].getAllTokensAsText()
        doclists.append(tokens)
        for token in tokens:
            if token not in tokenlist:
                tokenlist.append(token)
        rowlists.append([])

    ## Treat the query itself as row 0.
    queryTokens = getTokensFromText(query)
    doclists.insert(0, queryTokens)
    rowlists.insert(0, [])

    # Build a term-count vector per row; query tokens that never occur
    # in the documents are simply skipped.
    for row in rowlists:
        row.extend([0] * len(tokenlist))
    for n, tokens in enumerate(doclists):
        for token in tokens:
            if token in tokenlist:
                rowlists[n][tokenlist.index(token)] += 1

    ## Rocchio-style scoring: full weight for the query row, 0.75 for
    ## each relevant-document row.
    score = {}
    for i in range(len(tokenlist)):
        score[tokenlist[i]] = rowlists[0][i]
        for j in range(1, len(rowlists)):
            score[tokenlist[i]] += 0.75 * rowlists[j][i]
    rankwords = sorted(score.items(), key=lambda item: item[1], reverse=True)

    ## Load the stopwords (lines starting with "#" are comments).
    stop_words = []
    with open(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt") as stopwords:
        for line in stopwords:
            if line.strip()[0:1] != "#":
                for word in line.split():  # in case there is more than one per line
                    stop_words.append(word)

    ## Generate the new query: keep words that score at least 1.5 and do
    ## not appear in the original query or in the stopwords.
    newrank = []
    for word, wordScore in rankwords:
        if wordScore >= 1.5 and word not in query and word not in stop_words:
            newrank.append(word)

    k = len(newrank)
    ## If there are no words in newrank, we have no expanded query.
    if k == 0:
        suggestions = None
    else:
        suggestions = []
        queryTokens = getTokensFromText(query)
        # We only generate at most 5 new suggestions.
        for i in range(0, min(k, 5)):
            suggestion = decompose(newrank[i], False)
            # Only add the suggested word if it is not in the query yet.
            if suggestion and not any(suggestion in s for s in queryTokens):
                suggestions.append(suggestion)
    return suggestions
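# A hypothetical usage sketch. `documents` is assumed to be a list of
# (document, score) pairs, matching the document[0] indexing above;
# rankedResults below is a made-up name for the retrieval output.
#
#   top5 = rankedResults[:5]
#   suggestions = createSuggestions("jazz festival amsterdam", top5)
#   # e.g. ["openair", "concert"], or None when no word scores >= 1.5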