Example #1
class FeaturesW2VSim(object):
    """Implements our simCosW2V feature, i.e., the cosine similarity between the profession and person vectors, \
    where the profession (resp. person) vector is the centroid of TFIDF-weighted word2vec vectors of top-K profession \
    (resp. person) terms.
    """

    # Formula for feature computation:
    #
    # $cos(\vec{t}^{w2v}_{pe, k}, \vec{t}^{w2v}_{pr, k})$, where for item $\in \{pe, pr\}$:
    # $$\vec{t}^{w2v}_{item, k} = \sum_{t \in T_k(item)} w(t, item) w2v(t)$$
    # (note that using these unnormalized sums in the computation of $cos()$ is equivalent to using the actual centroids, since cosine similarity is scale-invariant).

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        """Loads term statistics from a TSV file (profession, term, TF, DF, TF-IDF)."""
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1  # NB: rank counts all terms in the file, incl. the filtered ones below
                if term in STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }
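
    # Expected input format for load_termstats(): one TSV line per
    # profession-term pair, ordered by decreasing TF-IDF (this is what
    # ProfStats.gen_stats() in Example #3 writes), e.g. (values illustrative only):
    #
    #   engineer<TAB>design<TAB>42<TAB>1200<TAB>0.0153
    #
    # which ends up in self.__stats as:
    #
    #   self.__stats["engineer"]["design"] == {"tf": 42, "df": 1200,
    #                                          "tfidf": 0.0153, "rank": 1}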

    def get_person_tf(self, person_id):
        """Get aggregated TF for a person.

        :param person_id: dict with TFs.
        :return:
        """
        doc_ids = self.__elastic.search(person_id,
                                        self.CONTENT_FIELD,
                                        num=10000).keys()

        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(
                doc_id, self.CONTENT_FIELD)  # , term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)
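
    # Illustrative example (hypothetical numbers): if a person's two sentences
    # have term vectors {"guitar": 2, "band": 1} and {"guitar": 1}, then
    # get_person_tf() returns ({"guitar": 3, "band": 1}, 2).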

    def generate_features(self, kb_file, output_file):
        """Core function for generating into output_file the features, with person-item data from kb_file.

        :param kb_file: path to the file with person items (a '.kb'-extension file).
        :param output_file:
        :return:
        """
        feat_w2v_approx = FeaturesW2VSimApprox()

        with open(output_file, "w") as f_out:
            # write tsv header
            header = ["person_id", "prof_id"]
            for k in self.K_VALUES:
                header.append("simCos_w2v_" + str(k))
            f_out.write("\t".join(header) + "\n")

            for line in FileUtils.read_file_as_list(kb_file):
                person_id, prof_id = line.split(
                    "\t")  # strip() done in read_file_as_list()
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # compute simCos_k using the top-K profession terms
                    term_weights_pr = {}  # top-K profession terms -> TF-IDF weights
                    term_weights_pe = {}  # top-K person terms -> TF-IDF weights

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                term_weights_pr[term] = float(s["tfidf"])
                                # we back-generate IDF from the profession's TF-IDF
                                idf = s["tfidf"] / s["tf"]
                                term_weights_pe[term] = person_tf.get(term, 0) * idf

                        vec_pr = feat_w2v_approx.get_vector(term_weights_pr)
                        vec_pe = feat_w2v_approx.get_vector(term_weights_pe)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                f_out.write("\t".join(values) + "\n")
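
The helpers cos_sim() and FeaturesW2VSimApprox.get_vector() used above are assumed to live elsewhere in the repository; a minimal sketch of what they could look like, assuming the word2vec vectors are available as a plain dict from term to numpy array (the dimensionality and lookup mechanism are assumptions), is:

import numpy as np


def cos_sim(vec_a, vec_b):
    """Cosine similarity; returns 0 if either vector has zero norm."""
    norm_a, norm_b = np.linalg.norm(vec_a), np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))


class FeaturesW2VSimApprox(object):
    """Builds the (unnormalized) centroid of TFIDF-weighted word2vec vectors."""

    DIM = 300  # assumed dimensionality of the word2vec vectors

    def __init__(self, w2v=None):
        self.__w2v = w2v if w2v is not None else {}  # term -> np.ndarray lookup

    def get_vector(self, term_weights):
        """Sums w(t, item) * w2v(t) over the given {term: weight} dict;
        terms without a word2vec vector are skipped."""
        vec = np.zeros(self.DIM)
        for term, weight in term_weights.items():
            if term in self.__w2v:
                vec += weight * self.__w2v[term]
        return vec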
Example #2
class FeaturesTermStats(object):
    """Generates term-statistics features (sumProfTerms and simCos) for person-profession pairs."""

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    STOPWORDS = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
        "t", "can", "will", "just", "don", "should", "now"
    ]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        """load term statistics from file"""
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1  # NB: rank counts all terms in the file, incl. the filtered ones below
                if term in self.STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """
        Get aggregated TF for a person
        :param person_id: dict with TFs
        :return:
        """
        doc_ids = self.__elastic.search(person_id,
                                        self.CONTENT_FIELD,
                                        num=10000).keys()
        print(person_id, "with", len(doc_ids), "sentences")
        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(
                doc_id, self.CONTENT_FIELD)  # , term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Generating features related to term statistics"""

        with open(output_file, "w") as f_out:
            # write tsv header
            header = ["person", "profession"]
            for k in self.K_VALUES:
                header.append("sumProfTerms_" + str(k))
                header.append("simCos_" + str(k))
            f_out.write("\t".join(header) + "\n")

            with FileUtils.open_file_by_type(kb_file) as kb_f:
                for line in kb_f:
                    person_id, prof_id = line.strip().split("\t")
                    values = [person_id, prof_id]

                    person_tf, num_sent = self.get_person_tf(person_id)

                    for k in self.K_VALUES:
                        # Compute sumProfTerms:
                        # $\sum_{t \in T_k(pr)} \sum_{s \in S(pe)} tf(t,s) \, w(t,pr)$,
                        # where $w(t,pr) = TFIDF(t,pr) = \frac{\sum_{s \in S(pr)} tf(t,s)}{len(pr)} \cdot \log\frac{N}{df(t)}$
                        # (cf. ProfStats.compute_tf_idf() in Example #3).
                        sum_prof_terms = 0
                        for term, tf in person_tf.items():
                            pt = self.__stats.get(prof_id, {}).get(term, {})
                            if pt.get("rank", float("inf")) > k:
                                continue  # skip term if not in the top-K profession terms
                            sum_prof_terms += tf * pt.get("tfidf", 0)
                        values.append(str(sum_prof_terms))

                        # compute simCos_k over the top-K profession terms
                        vec_pr = []  # construct profession vector
                        vec_pe = []  # construct person vector

                        if prof_id in self.__stats:
                            for term, s in self.__stats[prof_id].items():
                                if s["rank"] <= k:
                                    vec_pr.append(s["tfidf"])
                                    # we back-generate IDF from the profession's TF-IDF
                                    idf = s["tfidf"] / s["tf"]
                                    vec_pe.append(person_tf.get(term, 0) * idf)
                            cos = cos_sim(vec_pr, vec_pe)
                        else:
                            cos = 0  # in some exceptional cases the profession does not have any sentences
                        values.append(str(cos))

                    f_out.write("\t".join(values) + "\n")
                    print(values)
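
As a quick sanity check of the sumProfTerms computation above, a toy example (all numbers hypothetical):

# Toy illustration of sumProfTerms (all numbers hypothetical).
person_tf = {"guitar": 3, "band": 1, "the": 7}
prof_stats = {"guitar": {"tfidf": 0.02, "rank": 1},
              "band": {"tfidf": 0.01, "rank": 2}}

k = 10
sum_prof_terms = sum(tf * prof_stats[t]["tfidf"]
                     for t, tf in person_tf.items()
                     if prof_stats.get(t, {}).get("rank", float("inf")) <= k)
assert abs(sum_prof_terms - (3 * 0.02 + 1 * 0.01)) < 1e-9  # == 0.07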
Example #3
class ProfStats(object):
    """Collects per-profession term statistics (TF, DF, TF-IDF) from the sentence index."""

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K = 30000  # keep top-K profession terms

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)

    def gen_stats(self, prof, output_file):
        """Writes the stats into the file."""
        print("\tgetting term frequencies ...")
        tf, df = self.get_tf_agg(prof)
        # print("\tgetting document frequencies ... (", len(tf.keys()), "terms)")
        # df2 = self.get_df(tf.keys())
        print("\tcomputing tf-idf ...")
        tf_idf = self.compute_tf_idf(tf, df)

        out_str = ""
        i = 0
        for t, tfidf in sorted(tf_idf.items(),
                               key=lambda x: x[1],
                               reverse=True):
            out_str += prof + "\t" + t + "\t" + str(tf[t]) + "\t" + str(
                df[t]) + "\t" + str(tfidf) + "\n"
            i += 1
            if i == self.K:  # Only print top-k terms
                break
        open(output_file, "a").write(out_str)
        return

    def compute_tf_idf(self, tf, df):
        """Computes tf.idf = (tf/doc_len) * (log n(docs)/df)

        :param tf: dictionary of tf for all terms
        :param df: dictionary of df for all terms
        :return: dictionary of tf.idf scores
        """
        tf_idf = {}
        prof_doc_len = sum(tf.values())
        for t in tf.keys():
            normalized_tf = tf[t] / prof_doc_len
            n_docs = self.__elastic.num_docs()
            idf = math.log(n_docs / df[t])
            tf_idf[t] = normalized_tf * idf
        return tf_idf
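
    # Worked example (hypothetical numbers): with tf[t] = 5, doc_len = 100,
    # n_docs = 1000, and df[t] = 10, we get
    # tf.idf = (5 / 100) * log(1000 / 10) = 0.05 * 4.605... ~= 0.23
    # (natural log, as used by math.log above).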

    def get_df(self, terms):
        """Returns document frequency for all terms."""
        df = {}
        for t in terms:
            df[t] = self.__elastic.doc_freq(t, field=self.CONTENT_FIELD)
        return df

    def get_tf_agg(self, prof):
        """Given a list of ids to get all their tf_idf in a dictionary."""
        size = 1000
        tf_agg = {}
        df = {}
        # doc_ids = self.__elastic.search(prof, self.PROF_FIELD, num=size).keys()
        doc_ids = self.__elastic.search_scroll(prof,
                                               field=self.PROF_FIELD,
                                               num=size).keys()
        print(len(doc_ids), "sentences")
        for i, doc_id in enumerate(doc_ids):
            tv = self.__elastic.get_termvector(doc_id,
                                               self.CONTENT_FIELD,
                                               term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
                if t not in df:
                    df[t] = val["doc_freq"]
        return tf_agg, df
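
The three classes above fit together as a small pipeline: ProfStats produces the per-profession term-statistics file that both feature generators load. A minimal driver sketch (the file names and profession IDs are hypothetical, and WP_ST_INDEX_ID is assumed to be configured elsewhere in the repository):

# Minimal pipeline sketch; file names and profession IDs are hypothetical.
if __name__ == "__main__":
    professions = ["engineer", "actor"]  # hypothetical profession IDs

    # 1) Collect per-profession term statistics into a TSV file.
    prof_stats = ProfStats()
    for prof in professions:
        prof_stats.gen_stats(prof, "data/prof_termstats.tsv")

    # 2) Generate the term-statistics features for person-profession pairs.
    feats = FeaturesTermStats()
    feats.load_termstats("data/prof_termstats.tsv")
    feats.generate_features("data/pairs.kb", "data/features_termstats.tsv")

    # 3) Generate the word2vec-based cosine features for the same pairs.
    feats_w2v = FeaturesW2VSim()
    feats_w2v.load_termstats("data/prof_termstats.tsv")
    feats_w2v.generate_features("data/pairs.kb", "data/features_w2v.tsv")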