Ejemplo n.º 1
0
    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a Series of terms ranked by their similarity to the query.
        The query can be:

        - A DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.

        `filter`, when given and non-empty, restricts the candidate terms by
        their label in `self.small_frame.index`:

        - If it contains three or more '/' characters, or ends with '/.'
          (which is stripped), only the term equal to the filter is kept;
          when that term is absent the candidate set is empty.
        - Otherwise, the candidates are the labels in the half-open range
          [filter, filter + '0'). This lookup assumes the index is sorted.

        `limit` is the number of results requested from the final ranking;
        the first, approximate pass over the small frame asks for
        `limit * 50` candidates.

        Returns whatever `similar_to_vec` returns for the final ranking.

        TODO: is this sometimes returning a DataFrame? Should it accept a
        Series as well as a DataFrame?
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[:self.small_k]
        search_frame = self.small_frame
        if filter:
            exact_only = filter.count('/') >= 3
            # TODO: Is this duplicating something that field_match was supposed
            # to do?
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    # Positional one-row slice that keeps the result a frame.
                    search_frame = search_frame.iloc[idx:idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                index = search_frame.index
                # '0' is the character after '/', so end_key is the first possible
                # key that's not a descendant of the given filter key
                end_key = filter + '0'
                # Index.get_loc(key, method='bfill') was removed in pandas 2.0.
                # On a sorted index, searchsorted(side='left') yields the same
                # position (first label >= key) and returns len(index) when no
                # such label exists, which is what the KeyError fallback did.
                start_idx = index.searchsorted(filter, side='left')
                end_idx = index.searchsorted(end_key, side='left')
                search_frame = search_frame.iloc[start_idx:end_idx]
        # First pass: cheap search using only the first small_k dimensions.
        similar_sloppy = similar_to_vec(search_frame,
                                        small_vec,
                                        limit=limit * 50)
        # Second pass: re-rank those candidates with their full vectors.
        similar_choices = l2_normalize_rows(
            self.frame.loc[similar_sloppy.index].astype('f'))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar
Ejemplo n.º 2
0
def eval_analogies(frame):
    """
    Score `frame` on the Google analogy questions.

    Each question is a quad; its first three entries form the prompt and
    the fourth is the expected answer. The prompt is turned into a vector
    with `analogy_func`, and the best-ranked vocabulary term that is not
    itself part of the prompt is taken as the prediction. The first time a
    given wrong prediction occurs, the question is printed.

    Returns a pandas Series indexed by 'acc', 'low' and 'high': the
    fraction of correct answers and the two values returned by
    `proportion_confint(correct, total)`.
    """
    quads = read_google_analogies(
        get_support_data_filename('google-analogies/questions-words.txt')
    )
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    # Build a frame holding one vector per vocabulary term.
    wrap = VectorSpaceWrapper(frame=frame)
    stacked = np.vstack([wrap.get_vector(term) for term in vocab])
    tframe = pd.DataFrame(stacked, index=vocab)

    n_total = 0
    n_correct = 0
    reported = set()
    for quad in quads:
        prompt, answer = quad[:3], quad[3]
        target = analogy_func(frame, *prompt)
        ranked = similar_to_vec(tframe, target)
        # Best-ranked term that is not one of the prompt terms, if any.
        result = next(
            (candidate for candidate in ranked.index if candidate not in prompt),
            None,
        )
        if result == answer:
            n_correct += 1
        elif result not in reported:
            # Report each distinct wrong prediction only once.
            print(
                "%s : %s :: %s : [%s] (should be %s)"
                % (quad[0], quad[1], quad[2], result, answer)
            )
            reported.add(result)
        n_total += 1

    low, high = proportion_confint(n_correct, n_total)
    return pd.Series(
        [n_correct / n_total, low, high], index=['acc', 'low', 'high']
    )
Ejemplo n.º 3
0
def eval_analogies(frame):
    """
    Evaluate `frame` against the Google analogy question set.

    For every quad read from the questions file, the first three items are
    the prompt and the last one is the expected answer. A vector is built
    from the prompt by `analogy_func`; the prediction is the first term in
    the similarity ranking that does not appear in the prompt. Each
    distinct wrong prediction is printed once, along with its question.

    Returns a pandas Series with entries 'acc' (correct / total), 'low'
    and 'high' (the pair returned by `proportion_confint`).
    """
    questions_path = get_support_data_filename(
        'google-analogies/questions-words.txt'
    )
    quads = read_google_analogies(questions_path)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    # One row per vocabulary term, labelled by that term.
    tframe = pd.DataFrame(
        np.vstack([wrap.get_vector(uri) for uri in vocab]), index=vocab
    )

    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        neighbors = similar_to_vec(tframe, vector)
        # Skip over prompt terms; stays None if every match is in the prompt.
        outside_prompt = (term for term in neighbors.index if term not in prompt)
        result = next(outside_prompt, None)
        is_new_mistake = result != answer and result not in seen_mistakes
        if result == answer:
            correct += 1
        if is_new_mistake:
            print(
                "%s : %s :: %s : [%s] (should be %s)"
                % (quad[0], quad[1], quad[2], result, answer)
            )
            seen_mistakes.add(result)
        total += 1

    accuracy = correct / total
    low, high = proportion_confint(correct, total)
    return pd.Series([accuracy, low, high], index=['acc', 'low', 'high'])
Ejemplo n.º 4
0
    def similar_terms(self, query, filter=None, limit=20):
        """
        Get the terms ranked by their similarity to the query.
        The query can be:

        - A DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.

        `filter`, when given and non-empty, narrows the candidates by their
        label in `self.small_frame.index`. A filter with three or more "/"
        characters, or one ending in "/." (which is stripped), keeps only
        the term equal to the filter, or nothing if that term is absent.
        Any other filter keeps the labels in the half-open range
        [filter, filter + "0"); this lookup assumes the index is sorted.

        `limit` is the number of results requested from the final ranking;
        the preliminary search over the small frame requests `limit * 50`.

        The return value is whatever `similar_to_vec` produces for the
        final ranking.
        NOTE(review): earlier wording called this a DataFrame; confirm
        against `similar_to_vec`.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[: self.small_k]
        search_frame = self.small_frame
        if filter:
            exact_only = filter.count("/") >= 3
            if filter.endswith("/."):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    # Positional one-row slice that keeps the result a frame.
                    search_frame = search_frame.iloc[idx : idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_key = filter
                # '0' is the character after '/', so end_key is the first possible
                # key that's not a descendant of the given filter key
                end_key = filter + "0"
                # get_loc(..., method="bfill") no longer exists in pandas 2.x.
                # searchsorted(side="left") finds the same "first label >= key"
                # position on a sorted index, and gives len(index) when there
                # is no such label (previously the KeyError fallback).
                labels = search_frame.index
                start_idx = labels.searchsorted(start_key, side="left")
                end_idx = labels.searchsorted(end_key, side="left")
                search_frame = search_frame.iloc[start_idx:end_idx]
        # Coarse pass on the truncated vectors, then re-rank with full vectors.
        similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
        similar_choices = l2_normalize_rows(self.frame.loc[similar_sloppy.index].astype("f"))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar
Ejemplo n.º 5
0
    def similar_terms(self, query, filter=None, limit=20):
        """
        Rank terms by how similar they are to the query and return the
        resulting Series.

        Accepted forms of query:

        - A pandas Series of weighted terms
        - A pandas DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term
        - An existing vector

        A query of 5 or fewer terms is expanded using the out-of-vocab
        strategy.
        """
        self.load()
        vec = self.get_vector(query)
        truncated_vec = vec[:self.small_k]
        candidates = self.small_frame
        # TODO: document filter
        if filter:
            # Both a trailing '/.' and three or more slashes ask for one
            # exact term; the slash count is taken before '/.' is stripped.
            dotted = filter.endswith('/.')
            exact_only = dotted or filter.count('/') >= 3
            if dotted:
                filter = filter[:-2]

            if not exact_only:
                lo, hi = self._index_prefix_range(filter + '/')
                candidates = candidates.iloc[lo:hi]
            elif filter in candidates.index:
                position = candidates.index.get_loc(filter)
                candidates = candidates[position:position + 1]
            else:
                candidates = candidates.iloc[0:0]

        # Coarse search on the truncated vectors, over-fetching by 50x.
        rough_matches = similar_to_vec(candidates, truncated_vec,
                                       limit=limit * 50)
        # Re-rank the coarse matches using their normalized full vectors.
        full_rows = self.frame.loc[rough_matches.index].astype('f')
        return similar_to_vec(l2_normalize_rows(full_rows), vec, limit=limit)
Ejemplo n.º 6
0
    def similar_terms(self, query, filter=None, limit=20):
        """
        Return a Series of terms, ranked by similarity to the query.

        The query may be any of:

        - A pandas Series of weighted terms
        - A pandas DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term
        - An existing vector

        Queries containing 5 or fewer terms are expanded to include
        neighboring terms in ConceptNet.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[: self.small_k]
        pool = self.small_frame
        # TODO: document filter
        if filter:
            # The slash count is evaluated on the filter as passed in,
            # before any trailing '/.' is removed.
            match_exactly = filter.count('/') >= 3
            if filter.endswith('/.'):
                match_exactly = True
                filter = filter[:-2]

            if match_exactly and filter in pool.index:
                row = pool.index.get_loc(filter)
                pool = pool[row : row + 1]
            elif match_exactly:
                # Exact term requested but not present: nothing to search.
                pool = pool.iloc[0:0]
            else:
                first, last = self._index_prefix_range(filter + '/')
                pool = pool.iloc[first:last]

        # Stage one: approximate ranking on the truncated vectors.
        shortlist = similar_to_vec(pool, small_vec, limit=limit * 50)
        # Stage two: exact ranking of the shortlist on full vectors.
        shortlist_vectors = self.frame.loc[shortlist.index].astype('f')
        normalized = l2_normalize_rows(shortlist_vectors)
        return similar_to_vec(normalized, vec, limit=limit)
Ejemplo n.º 7
0
def make_replacements(small_frame, big_frame):
    """
    Create a replacements dictionary to map terms only present in a big frame to the closest term
    in a small_frame. This method uses a brute-force solution.

    Candidate replacements are the rows of `big_frame` whose labels also
    appear in `small_frame.index` and that contain no missing values. Each
    term of `big_frame` that is not in `small_frame.index` is compared with
    those candidates via `similar_to_vec(..., limit=1)`.

    Returns a dict from each such term to the label of its best match.
    Terms for which `similar_to_vec` returns nothing are left out.
    """
    small_index = small_frame.index
    # `.loc` with a list of labels raises KeyError on pandas >= 1.0 as soon as
    # one label is missing. `reindex` instead produces all-NaN rows for
    # small_frame terms absent from big_frame, which dropna() then discards.
    intersected = big_frame.reindex(small_index).dropna()
    replacements = {}
    for term in big_frame.index:
        if term in small_index:
            # Already available in the small frame; no replacement needed.
            continue
        most_similar = similar_to_vec(intersected, big_frame.loc[term], limit=1)
        got = list(most_similar.index)
        if got:
            replacements[term] = got[0]
    return replacements