Example #1
    def save(self, *args, **kwargs):
        # If a duplicate document was already detected, merge into it
        # instead of creating a new Document with all its token links.
        if self.duplication:
            self.merge()
        else:
            document = Document()

            document.description = self.description

            document.save()

            titleTokens = getTokensFromText(self.title)
            for title in titleTokens:
                titleToken = Token(title)
                titleToken.save()
                TitleOrder.objects.create(token=titleToken, document=document)

            dateTokens = getTokensFromText(self.date)
            for date in dateTokens:
                dateToken = Token(date)
                dateToken.save()
                DateOrder.objects.create(token=dateToken, document=document)

            locationTokens = getTokensFromText(self.location)
            for location in locationTokens:
                locationToken = Token(location)
                locationToken.save()
                LocationOrder.objects.create(token=locationToken, document=document)

            genreTokens = getTokensFromList(self.genres)
            for genre in genreTokens:
                genreToken = Token(genre)
                genreToken.save()
                GenresOrder.objects.create(token=genreToken, document=document)

            artistTokens = getTokensFromList(self.artists)
            for artist in artistTokens:
                artistToken = Token(artist)
                artistToken.save()
                ArtistOrder.objects.create(token=artistToken, document=document)

            tagTokens = getTokensFromList(self.tags)
            for tag in tagTokens:
                tagToken = Token(tag)
                tagToken.save()
                TagOrder.objects.create(token=tagToken, document=document)

            for url in self.urls:
                urlUrl = Url(url)
                urlUrl.save()
                UrlOrder.objects.create(url=urlUrl, document=document)

            for imageUrl in self.imageUrls:
                imageUrlUrl = Url(imageUrl)
                imageUrlUrl.save()
                ImageOrder.objects.create(url=imageUrlUrl, document=document)

            document.save()
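Note that every call above creates a fresh Token row even when a token with the same text already exists. A minimal sketch of how the repeated tokenize-save-link pattern could be collapsed, assuming Token has a name field (as the Token.objects.filter(name=...) lookup in Example #4 suggests); the helper name is hypothetical, not from the source:

def saveTokenOrders(text, orderModel, document):
    # Hypothetical helper: reuse an existing Token row where possible
    # and link each token to the document through the given order model.
    for tokenText in getTokensFromText(text):
        token, _ = Token.objects.get_or_create(name=tokenText)
        orderModel.objects.create(token=token, document=document)

# e.g. saveTokenOrders(self.title, TitleOrder, document)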
Example #2
    def merge(self):
        # Merge this document's fields into the previously detected duplicate.
        if self.duplication:

            if not self.duplication.description:
                self.duplication.description = self.description

            self.duplication.save()

            dateTokens = getTokensFromText(self.date)
            for date in dateTokens:
                dateToken = Token(date)
                dateToken.save()
                if not DateOrder.objects.filter(token=dateToken, document=self.duplication).exists():
                    DateOrder.objects.create(token=dateToken, document=self.duplication)

            locationTokens = getTokensFromText(self.location)
            for location in locationTokens:
                locationToken = Token(location)
                locationToken.save()
                if not LocationOrder.objects.filter(token=locationToken, document=self.duplication).exists():
                    LocationOrder.objects.create(token=locationToken, document=self.duplication)

            genreTokens = getTokensFromList(self.genres)
            for genre in genreTokens:
                genreToken = Token(genre)
                genreToken.save()
                if not GenresOrder.objects.filter(token=genreToken, document=self.duplication).exists():
                    GenresOrder.objects.create(token=genreToken, document=self.duplication)

            artistTokens = getTokensFromList(self.artists)
            for artist in artistTokens:
                artistToken = Token(artist)
                artistToken.save()
                if not ArtistOrder.objects.filter(token=artistToken, document=self.duplication).exists():
                    ArtistOrder.objects.create(token=artistToken, document=self.duplication)

            tagTokens = getTokensFromList(self.tags)
            for tag in tagTokens:
                tagToken = Token(tag)
                tagToken.save()
                if not TagOrder.objects.filter(token=tagToken, document=self.duplication).exists():
                    TagOrder.objects.create(token=tagToken, document=self.duplication)

            for url in self.urls:
                urlUrl = Url(url)
                urlUrl.save()
                if not UrlOrder.objects.filter(url=urlUrl, document=self.duplication).exists():
                    UrlOrder.objects.create(url=urlUrl, document=self.duplication)

            for imageUrl in self.imageUrls:
                imageUrlUrl = Url(imageUrl)
                imageUrlUrl.save()
                if not ImageOrder.objects.filter(url=imageUrlUrl, document=self.duplication).exists():
                    ImageOrder.objects.create(url=imageUrlUrl, document=self.duplication)

            self.duplication.save()
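The merge path repeats the same save-then-filter(...).exists()-then-create sequence for every field. Django's get_or_create performs the existence check and the insert in one call; a hedged sketch of that variant (the helper name is an assumption, not from the source):

def mergeTokenOrders(tokenTexts, orderModel, target):
    # Hypothetical helper: link each token to the duplicate exactly once.
    for tokenText in tokenTexts:
        token = Token(tokenText)
        token.save()
        orderModel.objects.get_or_create(token=token, document=target)

# e.g. mergeTokenOrders(getTokensFromText(self.date), DateOrder, self.duplication)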
Example #3
from nltk.stem import WordNetLemmatizer

def multiLabelClassification(document):
    #print("Start classification")
    if document.description:
        text = document.description
        
        lentext = len(text)

        ## lemmatize the words in the text
        lem = WordNetLemmatizer()
        words = text.split()
        text = ' '.join([lem.lemmatize(i) for i in words])

        ## extract keywords, output with scores
        if lentext < 10:  ## when the description is too short we don't need keywords
            tags = None
        else:
            ## the two branches differ only in the last Rake parameter
            if lentext < 150:
                rake = Rake(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt", 3, 3, 1)
            else:
                rake = Rake(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt", 3, 3, 2)
            tags = rake.run(text)
                
        if tags:
            for tag in tags:
                tokens = getTokensFromText(tag[0])
                for token in tokens:
                    document.tags.append(token)
                    #print("Found token: " + token)

    #print("End classification")
    return document
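For context, rake.run(text) returns candidate phrases paired with relevance scores, highest score first, which is why the loop reads the phrase from tag[0]. A small illustration with made-up values, assuming the three numeric constructor arguments are minimum characters per word, maximum words per phrase, and minimum keyword frequency, as in common RAKE ports:

rake = Rake(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt", 3, 3, 1)
tags = rake.run("live jazz concert in the old town hall")
for phrase, score in tags:
    print(phrase, score)  # e.g. "live jazz concert" 8.5 -- illustrative values only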
Example #4
def findDuplicate(document):
    
    #print("START DUP");
    #print(document.title)
    
    results = []
 
    # Candidate duplicates must contain exactly the same title tokens
    titleTokens = getTokensFromText(document.title)

    for index, titleToken in enumerate(titleTokens):
        tokens = Token.objects.filter(name=titleToken)

        # Union the documents of every Token row that shares this name...
        documentResults = set()
        for token in tokens:
            documentResults.update(token.title_tokens.all())

        # ...then keep only documents that also matched all previous title tokens
        if index == 0:
            results = list(documentResults)
        else:
            results = list(set(results) & documentResults)
    
    if len(results) > 0:
        document.duplication = findDuplicateInResults(document, results)
        #print(str(document.duplication))
    
    #print("END DUP");
    return document
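The loop implements a progressive intersection: the first title token seeds the candidate set and every further token narrows it, so only documents sharing all title tokens survive. The same idea on plain data, as a quick self-contained illustration:

# Document ids containing each title token (illustrative data).
candidatesByToken = {
    "jazz": {1, 2, 3},
    "festival": {2, 3},
    "2016": {3},
}

results = None
for documents in candidatesByToken.values():
    results = set(documents) if results is None else results & documents

print(results)  # {3} -- the only document matching every title token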
Example #5
def findDuplicateInResults(document, results):
    for result in results:
        for url in result.urls.all():
            # If the same url appears -> definitely a duplicate
            if url.name in document.urls:
                #print("FOUND DUPLICATE")
                return result

        # Check for overlap in Artist, Genre, Location and Date
        if (hasOverlap(getTokensFromText(document.date), result.date.all()) and 
            hasOverlap(getTokensFromText(document.location), result.location.all()) and 
            hasOverlap(getTokensFromList(document.genres), result.genres.all()) and 
            hasOverlap(getTokensFromList(document.artists), result.artists.all())):
            
            #print("FOUND DUPLICATE");
            return result
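hasOverlap itself is not shown in these examples. A minimal reconstruction of what it plausibly does, assuming it receives token strings on one side and stored Token instances (with a name field) on the other; this is an assumption, not the project's actual implementation:

def hasOverlap(tokenTexts, tokenObjects):
    # Assumed behavior: true if at least one token text also appears
    # among the stored tokens' names.
    names = {token.name for token in tokenObjects}
    return any(text in names for text in tokenTexts)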
Example #6
def vcspace(docTexts):  # original texts, a set/list

    doclists = []
    rowlists = []
    tokenlist = []
    for text in docTexts:
        text = decompose(text, False)  # every text becomes a list
        tokens = getTokensFromText(text)
        doclists.append(tokens)
        for token in tokens:
            if token not in tokenlist:
                tokenlist.append(token)
        rowlists.append([])

    # initialize every row with one zero per vocabulary token
    for row in rowlists:
        for _ in range(len(tokenlist)):
            row.append(0)

    tokenNum = len(tokenlist)  # number of columns

    # count token occurrences per document
    for n, tokens in enumerate(doclists):
        for token in tokens:
            index = tokenlist.index(token)
            rowlists[n][index] += 1

    # pairwise cosine similarity between all document vectors,
    # flattened row by row
    m = 0
    Similarity = []
    for list1 in rowlists:
        for list2 in rowlists:
            normText1 = 0
            normText2 = 0
            for k in range(tokenNum):
                normText1 += list1[k] ** 2  # sum of squares for the Euclidean norm
                normText2 += list2[k] ** 2
            normText1 = normText1 ** (1. / 2)
            normText2 = normText2 ** (1. / 2)
            Similarity.append(0)
            for k in range(tokenNum):
                Similarity[m] += list1[k] * list2[k]
            Similarity[m] = Similarity[m] / normText1 / normText2
            m += 1

    return Similarity
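The nested loops compute pairwise cosine similarity between the term-count vectors: each entry of Similarity is the dot product of two rows divided by the product of their Euclidean norms, flattened row by row. A compact numpy equivalent, shown only as a cross-check sketch:

import numpy as np

def vcspace_np(rows):
    # rows: the token-count matrix built above, one row per document
    M = np.array(rows, dtype=float)
    norms = np.linalg.norm(M, axis=1)
    sim = (M @ M.T) / np.outer(norms, norms)  # cosine similarity matrix
    return sim.flatten().tolist()             # same flattened order as vcspace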
Example #7
def createSuggestions(query, documents):
    ## input: the original query and the relevant documents (the top 5 documents retrieved in our case)
    doclists = []
    rowlists = []
    tokenlist = []

    for document in documents:
        tokens = document[0].getAllTokensAsText()
        doclists.append(tokens)
        for token in tokens:
            if token not in tokenlist:
                tokenlist.append(token)
        rowlists.append([])

    ## deal with query
    queryTokens = getTokensFromText(query)
    doclists.insert(0, queryTokens)

    rowlists.insert(0, [])

    for row in rowlists:  # initialize every row with one zero per vocabulary token
        for _ in range(len(tokenlist)):
            row.append(0)

    # count token occurrences per row
    # print(tokenlist)
    for n, tokens in enumerate(doclists):
        for token in tokens:
            if token in tokenlist:  # query tokens may be absent from the document vocabulary
                index = tokenlist.index(token)
                rowlists[n][index] += 1

    # score each token: its count in the query row plus 0.75 times its count
    # in every relevant-document row
    score = {}
    for i in range(len(tokenlist)):
        score[tokenlist[i]] = rowlists[0][i]
        for j in range(1, len(rowlists)):
            score[tokenlist[i]] += 0.75 * rowlists[j][i]
    rankwords = sorted(score.items(), key=lambda item: item[1], reverse=True)

    # print(rankwords)

    ## load stopwords
    stop_words = []
    with open(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt") as stoplist:
        for line in stoplist:
            if line.strip()[0:1] != "#":
                for word in line.split():  # in case there is more than one word per line
                    stop_words.append(word)
    # print(stop_words)

    ## generate the new query:
    ## keep words that score at least 1.5 and appear neither in the original query nor in the stop words
    newrank = []
    # print(len(query))
    for i in range(len(rankwords)):
        if rankwords[i][1] >= 1.5 and rankwords[i][0] not in query and rankwords[i][0] not in stop_words:
            newrank.append(rankwords[i][0])
    k = len(newrank)

    ## if there are no words in newrank, we have no expanded query
    if k == 0:
        suggestions = None
    else:
        suggestions = []
        queryTokens = getTokensFromText(query)

        ## we only generate at most 5 new suggestions
        for i in range(min(k, 5)):
            suggestion = decompose(newrank[i], False)
            # Only add the suggested word if it is not in the query yet
            if suggestion and not any(suggestion in s for s in queryTokens):
                suggestions.append(suggestion)

    # print(suggestions)

    return suggestions
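The scoring step is a pseudo-relevance-feedback heuristic in the spirit of Rocchio expansion: each candidate term receives its count in the query plus 0.75 times its counts across the top retrieved documents, and high-scoring terms not already in the query become suggestions. A tiny worked example of the arithmetic, with made-up counts:

vocabulary = ["jazz", "festival", "tickets"]
rowlists = [
    [1, 0, 0],  # query row: "jazz"
    [2, 1, 0],  # relevant document 1
    [1, 1, 1],  # relevant document 2
]

score = {}
for i, word in enumerate(vocabulary):
    score[word] = rowlists[0][i] + 0.75 * sum(row[i] for row in rowlists[1:])

print(score)  # {'jazz': 3.25, 'festival': 1.5, 'tickets': 0.75}
# "festival" scores >= 1.5 and is not in the query, so it would be suggested.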