Example #1
def vector_controller(query, corpus):
    '''
    Returns a DataFrame containing the results of the VSM for the given query on the given corpus
    '''
    query = query.lower()  #lower-case the query; the original call discarded the result
    rel_dict = np.load('relevant_dict.npy', allow_pickle=True).item()
    if corpus == 1:
        newdict = np.load('models/complete_dict.npy',
                          allow_pickle=True).item()
        desc = pd.read_pickle(
            "save_files/UO/descriptions_index_with_weight.obj")
        title = pd.read_pickle("save_files/UO/title_index_with_weight.obj")
    else:
        newdict = np.load('models/complete_dict_reuters.npy',
                          allow_pickle=True).item()
        desc = pd.read_pickle(
            "save_files/Reuters/descriptions_index_with_weight.obj")
        title = pd.read_pickle(
            "save_files/Reuters/title_index_with_weight.obj")
    r_query = rocchio(get_formatted_tokens(query), query.split(), rel_dict,
                      newdict)
    query, expanded_values = expand_query(query, 'vsm')
    r_query = Counter(r_query)
    expanded_values = Counter(expanded_values)
    final_expanded = dict(r_query + expanded_values)
    results = vsm(corpus, get_formatted_tokens(query), title, desc,
                  final_expanded)
    if corpus == 1:
        corp = pd.read_csv("save_files/UO/corpus.csv", sep="|")
    else:
        corp = pd.read_csv("save_files/Reuters/corpus.csv", sep="|")
    result = corp.loc[results[0], ["title", "description"]]  #results[0]: ranked document ids
    result['score'] = results[1]  #results[1]: matching similarity scores
    return result
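
The Counter addition above is what merges the Rocchio feedback weights with the WordNet expansion weights. A minimal, self-contained sketch of that merge, using made-up term weights:
from collections import Counter

#Hypothetical term weights: one dict from relevance feedback, one from query expansion
rocchio_weights = {"health": 1.5, "plan": 0.5}
expansion_weights = {"plan": 0.25, "wellness": 0.25}

#Counter addition sums the weights of terms present in both dicts and keeps the rest unchanged
final_expanded = dict(Counter(rocchio_weights) + Counter(expansion_weights))
print(final_expanded)  #{'health': 1.5, 'plan': 0.75, 'wellness': 0.25}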
Example #2
def next_word(string, corpus, model):
    '''
    Returns a list of suggestions for the next word of the given query string, based on the bigram language model for the chosen corpus
    '''

    if corpus == 1:
        path = "save_files/UO/"
    else:
        path = "save_files/Reuters/"

    last_word = string.split()
    if model == 2:
        last_word = last_word[-1]
    elif last_word[-1] in ["AND", "OR", "AND_NOT"]:
        last_word = last_word[-2]
    else:
        return ["AND", "OR", "AND_NOT"]

    try:
        formatted = get_formatted_tokens(last_word)[0]
    except IndexError:  #no formatted token produced; fall back to bigram tokens
        formatted = get_bigram_tokens(last_word)[0]

    with open(path + "blm_dic.pkl", 'rb') as f:
        blm_dict = pickle.load(f)
        try:
            return blm_dict[formatted]
        except KeyError:  #word not in the bigram language model
            return []
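
For context, blm_dic.pkl is assumed here to be a pickled dict that maps each formatted word to a short list of suggested follow-up words, as built by blm_generator further below. A toy round-trip with made-up contents of that shape:
import os
import pickle
import tempfile

#Toy stand-in for blm_dic.pkl: each key maps to its most frequent followers in the corpus
toy_blm = {"ottawa": ["university", "senators", "river"],
           "comput": ["science", "engineering"]}

path = os.path.join(tempfile.mkdtemp(), "blm_dic.pkl")
with open(path, "wb") as f:
    pickle.dump(toy_blm, f)

with open(path, "rb") as f:
    blm_dict = pickle.load(f)

print(blm_dict.get("ottawa", []))   #['university', 'senators', 'river']
print(blm_dict.get("toronto", []))  #[] -- unseen words give no suggestion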
Example #3
def expand_query(query, model):
    if model == "boolean":
        #pad parentheses with spaces so they split into separate tokens
        query = query.replace("(", " ( ").replace(")", " ) ")

        lst = query.split()
        synonyms = []
        new_syn = []
        # based off of code from https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/
        for i in range(len(lst)):
            if lst[i] not in ['AND', 'OR', 'AND_NOT', ')', '(']:
                for syn in wordnet.synsets(lst[i]):
                    for l in syn.lemmas():
                        synonyms.append(l.name())
                synonyms = list(dict.fromkeys(synonyms))
                for j in range(len(synonyms)):
                    if "_" not in synonyms[j]:
                        new_syn.append(synonyms[j])
                new_syn = new_syn[:5]
                if new_syn:  #guard against terms with no WordNet synonyms
                    lst[i] = create_term(new_syn[1:], new_syn[0])
                new_syn = []
                synonyms = []
        print(" ".join(lst))
        return " ".join(lst)
    if model == "vsm":
        lst = query.split()
        values = {}
        for i in range(len(lst)):
            values[lst[i]] = 1
        synonyms = []
        new_syn = []
        # based off of code from https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/
        for i in range(len(lst)):
            for syn in wordnet.synsets(lst[i]):
                for l in syn.lemmas():
                    if l.name() != lst[i]:
                        synonyms.append(l.name())
            synonyms = list(dict.fromkeys(synonyms))
            for j in range(len(synonyms)):
                if "_" not in synonyms[j]:
                    new_syn.append(synonyms[j])
            new_syn = new_syn[:5]
            if new_syn:  #guard against terms with no usable WordNet synonyms
                weight = 1 / len(new_syn)
                if weight == 1:
                    weight = 0.8
                for j in range(len(new_syn)):
                    values["".join(get_formatted_tokens(new_syn[j]))] = weight
                    lst.append(new_syn[j])
            new_syn = []
            synonyms = []
        return " ".join(lst), values
Example #4
def complete(first_word, model_path):
    '''
    Returns a list of up to 5 words to recommend as the following word based on the given first_word
    '''
    with open(model_path, 'rb') as f:  #open file
        blm_dict = pickle.load(f)

    word = get_formatted_tokens(first_word)
    print(word)
    if not word:  #formatting produced nothing; fall back to bigram tokens
        word = get_bigram_tokens(first_word)

    word = word[0]

    try:
        return blm_dict[word]
    except KeyError:  #word not in the model
        return []
Example #5
    def build_primary_index(self, df, name):
        '''
        Generates a primary index from the given dataframe and column name
        '''

        for x in self.dic_list:
            self.index[x] = set()  #generates the dict entry and its empty set

        count = 0
        for _, row in df.iterrows():
            print("\r\tprimary {}: ".format(name) +
                  str(round(count / len(df), 2) * 100).split(".")[0] + "%",
                  end="")
            count += 1

            tokens = string_formatting.get_formatted_tokens(row[name])
            for token in tokens:
                self.index[str(token)].add(row["id"])  #adds the document id to the term's posting set
        print()
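
A stand-alone sketch of the same inverted-index idea on a toy DataFrame, with a naive lower-case/split tokenizer standing in for string_formatting.get_formatted_tokens:
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3],
                   "title": ["Intro to Databases",
                             "Databases and Search",
                             "Search Engines"]})

#Inverted index: each token maps to the set of document ids that contain it
index = {}
for _, row in df.iterrows():
    for token in row["title"].lower().split():
        index.setdefault(token, set()).add(row["id"])

print(index["databases"])  #{1, 2}
print(index["search"])     #{2, 3}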
Example #6
def build_dic(df, field):
    '''
    Generates and returns a set of unique, formatted terms from the given dataframe column
    '''
    #Make a list of the values in df[field]
    lst = list(df[field])

    count = 0
    dic = set()  #uses a set to avoid duplicates
    for elem in lst:
        count += 1
        print("\r\t{}s: ".format(field) +
              str(round(count / len(df), 2) * 100).split(".")[0] + "%",
              end="")
        tokens = get_formatted_tokens(elem)
        dic.update(tokens)
    print()
    return dic
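
The same set-based deduplication in isolation, again with a naive tokenizer standing in for get_formatted_tokens and made-up rows:
rows = ["Databases and Search", "Search Engines", "Intro to Databases"]

dic = set()  #a set automatically drops duplicate tokens
for elem in rows:
    dic.update(elem.lower().split())

print(sorted(dic))  #['and', 'databases', 'engines', 'intro', 'search', 'to']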
Example #7
    def lookup(self, string):
        '''
        Returns the ids that match the given string. Handles both strings with wildcards and regular strings, first tokenizing the string and then using the index to get the ids
        '''

        #Formatting
        terms = []
        if "*" in string:
            bigrams = get_bigrams(string)
            #print(bigrams)
            for bigram in bigrams:

                try:  #the bigram may not be in the secondary index
                    terms += self.secondary_index[bigram]
                except KeyError:
                    continue
            terms = [t for t in terms if len(t) > 2]  #filter out very short terms
            if not terms:  #no vocabulary term matched the wildcard's bigrams
                return []
            counts = pd.Series(terms).value_counts()  #terms sorted by how many bigrams they matched
            x = counts.tolist()
            y = counts.index.tolist()

            #keep every term tied for the highest number of matching bigrams
            ind = len(x) - x[::-1].index(x[0])

            terms = y[:ind]
        else:
            terms = [string]
        #ID retrievals
        ids = []

        for term in terms:
            try:
                formatted = get_formatted_tokens(term)[0]
                ids += self.primary_index[formatted]
            except (IndexError, KeyError):  #no formatted token, or term not in the index
                continue
        return ids
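
The value_counts step above ranks wildcard candidates by how many of the query's bigrams they matched and keeps only the terms tied for first place. A small isolated example of that ranking, with made-up candidate terms:
import pandas as pd

#Candidate vocabulary terms returned by the bigram lookups for a wildcard query;
#repeats mean a term matched more than one bigram
terms = ["house", "house", "house", "horse", "horse", "hose"]

counts = pd.Series(terms).value_counts()
x = counts.tolist()        #[3, 2, 1]
y = counts.index.tolist()  #['house', 'horse', 'hose']

#Keep every term tied for the highest number of matching bigrams
ind = len(x) - x[::-1].index(x[0])
print(y[:ind])  #['house']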
Example #8
def blm_generator(df):
    '''
    Builds a bigram language model: a dict mapping each word to the 5 most frequent words that follow it in the corpus sentences
    '''

    #Create a list of lists of tokens(strings), each representing the tokens of a sentence, to be used to create the BLM
    clean_sentences = []
    count = 0
    for _, row in df.iterrows():
        print("\r\tGathering sentences: " +
              str(round(count / len(df), 2) * 100).split(".")[0] + "%",
              end="")
        count += 1
        #Since you can't indicate whether you're searching on titles or descriptions, they're treated equally.
        title = get_bigram_tokens(row["title"])
        description = get_bigram_tokens(row["description"])
        if title != []:
            clean_sentences.append(title)
        if description != []:
            clean_sentences.append(description)
    print()

    #Create a set of all unique words, stemmed/lemmatized where possible and kept as-is otherwise (so that all words can have a prediction)
    unique_words = set()
    count = 0
    for x in clean_sentences:
        print("\r\tGathering unique words: " +
              str(round(count / len(clean_sentences), 2) * 100).split(".")[0] +
              "%",
              end="")
        count += 1
        for y in x:
            stemmed = get_formatted_tokens(y)
            if len(stemmed) != 0:
                stemmed = stemmed[0]
            else:
                stemmed = y
            unique_words.add(stemmed)
    print()

    #Time to generate the BLM
    blm = dict()
    for x in unique_words:  #Fill it with empty lists
        blm[x] = []

    #Add every second word to the associated list of the first word
    count = 0
    for sentence in clean_sentences:
        print("\r\tBuilding lists of bigram for each word: " +
              str(round(count / len(clean_sentences), 2) * 100).split(".")[0] +
              "%",
              end="")
        count += 1
        for i in range(len(sentence) - 1):
            first = get_formatted_tokens(sentence[i])
            if len(first) != 0:
                first = first[0]
            else:
                first = sentence[i]

            second = sentence[i + 1]

            blm[first].append(second)
    print()

    #Using those lists of words, sort, count, and order so that the list of each dict entry only contains the 5 most popular distinct words from that list
    count = 0
    for x in unique_words:
        print("\r\tCleaning up lists: " +
              str(round(count / len(unique_words), 2) * 100).split(".")[0] +
              "%",
              end="")
        count += 1

        blm[x].sort()

        unique_words_in_x = set(blm[x])

        tuples = []  #list of tuples
        for word in unique_words_in_x:
            tuples.append((word, len([i for i in blm[x] if i == word])))

        #Sort words in descending order of counts
        tuples.sort(key=lambda count: count[1], reverse=True)
        #Assign the list
        blm[x] = [i[0] for i in tuples][:5]
    print()

    return blm
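
The final sort/count/trim step can also be expressed with collections.Counter. Here is a compact sketch of the whole model on three made-up sentences; ties keep first-seen order here, unlike the alphabetical tie-break above:
from collections import Counter

sentences = [["the", "quick", "brown", "fox"],
             ["the", "quick", "red", "fox"],
             ["the", "lazy", "dog"]]

#Collect, for each word, every word that immediately follows it
blm = {}
for sentence in sentences:
    for first, second in zip(sentence, sentence[1:]):
        blm.setdefault(first, []).append(second)

#Keep only the five most frequent distinct followers per word
blm = {word: [w for w, _ in Counter(followers).most_common(5)]
       for word, followers in blm.items()}

print(blm["the"])    #['quick', 'lazy']
print(blm["quick"])  #['brown', 'red']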
Example #9
def generate_td_idf():
    '''
    Computes tf-idf weights (log(1 + tf) * log(N / df)) for the UO title and description indexes and saves them
    '''
    with open("./save_files/UO/descriptions_index.obj", "rb") as infile:
        desc = pickle.load(infile)
    with open("./save_files/UO/title_index.obj", "rb") as infile:
        title = pickle.load(infile)
    corpus = pd.read_csv("save_files/UO/corpus.csv", sep="|")

    corpus['title'] = corpus['title'].apply(lambda x: get_formatted_tokens(x))
    corpus['description'] = corpus['description'].apply(
        lambda x: get_formatted_tokens(x))

    # Generate tf
    new = {}
    for k, v in title.items():
        v = list(v)
        for i in range(0, len(v)):
            lst = corpus.loc[v[i], 'title']
            v[i] = (v[i], lst.count(k))
        v = set(v)
        new.update({k: v})
    title = new

    new = {}
    for k, v in desc.items():
        v = list(v)
        for i in range(len(v)):
            lst = corpus.loc[v[i], 'description']
            v[i] = (v[i], lst.count(k))
        v = set(v)
        new.update({k: v})
    desc = new

    # Normalize using log(1+tf) where tf is the raw frequency
    new = {}
    for k, v in desc.items():
        v = list(v)
        for i in range(len(v)):
            v[i] = list(v[i])
            v[i][1] = math.log(1 + v[i][1])
            v[i] = tuple(v[i])
        v = set(v)
        new.update({k: v})
    desc = new

    new = {}
    for k, v in title.items():
        v = list(v)
        for i in range(len(v)):
            v[i] = list(v[i])
            v[i][1] = math.log(1 + v[i][1])
            v[i] = tuple(v[i])
        v = set(v)
        new.update({k: v})
    title = new

    #tf-idf: multiply the normalized tf by idf = log(N / df); 624 is presumably the number of documents in the corpus
    new = {}
    for k, v in title.items():
        v = list(v)
        for i in range(len(v)):
            v[i] = list(v[i])
            v[i][1] = v[i][1] * math.log(624 / len(v))
            v[i] = tuple(v[i])
        v = set(v)
        new.update({k: v})
    title = new

    new = {}
    for k, v in desc.items():
        v = list(v)
        for i in range(len(v)):
            v[i] = list(v[i])
            v[i][1] = v[i][1] * math.log(624 / len(v))
            v[i] = tuple(v[i])
        v = set(v)
        new.update({k: v})
    desc = new

    with open("./save_files/UO/title_index_with_weight.obj", "wb") as outfile:
        pickle.dump(title, outfile)
    with open("./save_files/UO/descriptions_index_with_weight.obj", "wb") as outfile:
        pickle.dump(desc, outfile)
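
As a quick sanity check of the weighting used above (w = log(1 + tf) * log(N / df), with N = 624 as in the code); the tf and df values below are made up:
import math

N = 624        #number of documents, as used in generate_td_idf
tf = 3         #raw count of the term in one document
doc_freq = 12  #number of documents containing the term (len(v) above)

weight = math.log(1 + tf) * math.log(N / doc_freq)
print(round(weight, 2))  #5.48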