コード例 #1
    def __init__(self):
        """Create the metadata / citation-network helpers and empty feature stores."""
        self.aan = aanmeta()
        self.cnw = CitationNetwork()

        # One store per feature family; each starts out as its own empty dict.
        self.prestige_features, self.position_features = dict(), dict()
        self.content_features, self.style_features = dict(), dict()

コード例 #2
    def main(self):
        """Build the train / model-test / test experiment files for one cut-off year.

        Command line:
            sys.argv[1] -- cut-off year ``n``; papers with ``year <= n`` are
                           split at random, 80% training and the rest model-test.
            sys.argv[2] -- optional gap ``diff`` (default 3); papers published
                           in year ``n + diff`` form the test set.

        Writes the feature files (through ``self.write_data``), the response
        files (through ``self.write_response``) and a ``pid<TAB>year`` time-step
        file, all under ``experiment_files/``.
        """
        n = int(sys.argv[1])
        # Command-line values are strings; without int() the `n + diff`
        # arithmetic below raises TypeError whenever the gap is supplied.
        diff = int(sys.argv[2]) if len(sys.argv) > 2 else 3

        init = 1980
        last = 2006

        aan = aanmeta()
        all_papers = aan.get_restricted_papers(init, last)

        till_n = [p for p in all_papers if p.year <= n]

        training = random.sample(till_n, int(math.ceil(0.8 * len(till_n))))
        model_test = [i for i in till_n if i not in training]
        test = [i for i in all_papers if i.year == (n + diff)]

        print("Total files: %d" % (len(training) + len(model_test) + len(test)))

        # pid -> list of feature names, read from "pid<TAB>feat<>feat<>..." lines.
        self.feats = {}
        with open("1980_2006.pruned_feats", "r") as featfile:
            for line in featfile:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines instead of failing to unpack
                pid, featstr = line.split("\t")
                self.feats[pid] = featstr.split("<>")

        training_fname = "experiment_files/1980_%s.train.txt" % n
        model_test_fname = "experiment_files/1980_%s.modeltest.txt" % n
        test_fname = "experiment_files/%s.test.txt" % (n + diff)

        self.write_data(training, training_fname)
        self.write_data(model_test, model_test_fname)
        self.write_data(test, test_fname)

        # creating the response files
        self.cnw = CitationNetwork()
        response_targets = [
            (training, "experiment_files/1980_%s.train.resp.txt" % n),
            (model_test, "experiment_files/1980_%s.modeltest.resp.txt" % n),
            (test, "experiment_files/%s.test.resp.txt" % (n + diff)),
        ]
        for papers, resp_fname in response_targets:
            # `with` guarantees the data is flushed and the handle released.
            with open(resp_fname, "w") as resp_file:
                self.write_response(papers, resp_file, n)

        # write the time step files
        with open("experiment_files/%s_%s_timesteps.txt" % (init, n), "w") as ts_file:
            for paper in all_papers:
                if init <= paper.year <= n:
                    ts_file.write("%s\t%d\n" % (paper.pid, get_year_from_id(paper.pid)))
コード例 #3
class BuildExperiment:
    """Collect per-paper features and write train/test experiment files.

    Features are kept in four families (prestige, position, content, style).
    Each family is a dict mapping a paper id to a ``{feature name: value}``
    dict.  The ``load_*`` methods fill the families and ``write_features``
    dumps selected combinations of them together with the labels.
    """

    def __init__(self):
        self.aan = aanmeta()
        self.cnw = CitationNetwork()

        self.prestige_features = {}
        self.position_features = {}
        self.content_features = {}
        self.style_features = {}

    def write_features(self, tyear):
        """Write feature and label files for the experiment targeting ``tyear``.

        Papers from 1980 up to ``tyear - 3`` are the "normal" training papers,
        papers from ``tyear - 1`` and ``tyear - 2`` are the "extended" training
        papers, and papers from ``tyear`` are the test set.  One train and one
        test feature file is written per feature-family combination; the label
        files do not depend on the combination and are written once.
        """
        all_papers = list(self.aan.get_restricted_papers(1980, tyear))
        train_normal = [p.pid for p in all_papers if p.year <= (tyear - 3)]
        train_ext_1 = [p.pid for p in all_papers if p.year == (tyear - 1)]
        train_ext_2 = [p.pid for p in all_papers if p.year == (tyear - 2)]
        test = [p.pid for p in all_papers if p.year == tyear]
        # join these since there is no difference in terms of features
        train_pids = train_normal + train_ext_1 + train_ext_2

        # write feature files
        ft_types = {
            "prestige": self.prestige_features,
            "position": self.position_features,
            "content": self.content_features,
            "style": self.style_features,
        }

        experiment_combs = [
            ["prestige", "position"],
            ["prestige", "content"],
            ["prestige", "position", "content"],
            ["prestige", "style"],
            ["prestige", "content", "style"],
            ["prestige", "content", "style", "position"],
        ]

        for comb in experiment_combs:
            print("Writing %s features" % str(comb))
            # File names carry the first two letters of every family in the combination.
            suffix = str(tyear) + "." + "_".join([i[0:2] for i in comb]) + ".txt"

            self._write_feature_file("experiment_files/train_feat." + suffix, train_pids, comb, ft_types)
            print("Finished training features")

            self._write_feature_file("experiment_files/test_feat." + suffix, test, comb, ft_types)
            print("Finished test features")

        # Labels are independent of the feature combination, so they are
        # computed and written a single time instead of once per combination.
        training_labels = {pid: self.cnw.if_cited_first_n_years(pid, 3) for pid in train_normal}
        training_labels.update(self.get_ext_labels(train_ext_1, train_normal, 1))
        training_labels.update(self.get_ext_labels(train_ext_2, train_normal, 2))
        test_labels = {pid: self.cnw.if_cited_first_n_years(pid, 3) for pid in test}

        self._write_label_file("experiment_files/train_resp." + str(tyear) + ".txt", training_labels)
        self._write_label_file("experiment_files/test_resp." + str(tyear) + ".txt", test_labels)

    def _write_feature_file(self, fname, pids, comb, ft_types):
        """Write one ``pid<TAB>json`` line per paper, merging the families in ``comb``."""
        with open(fname, "w") as out:
            for pid in pids:
                # Later families win on duplicate names.  A paper that has no
                # entry in any family gets an empty feature dict rather than
                # making the merge fail.
                features = {}
                for family in comb:
                    features.update(ft_types[family].get(pid, {}))
                out.write(pid + "\t" + json.dumps(features) + "\n")

    def _write_label_file(self, fname, labels):
        """Write one ``pid<TAB>label`` line per entry of ``labels``."""
        with open(fname, "w") as out:
            for (pid, label) in labels.items():
                out.write("%s\t%s\n" % (pid, str(label)))

    def get_ext_labels(self, train_ext, train_normal, n):
        """Label papers that only have ``n`` years of citation history.

        The mean ratio (citations in first ``n`` years) / (citations in first
        3 years) is taken over the papers in ``train_normal`` that have a
        non-zero 3-year count.  Each paper in ``train_ext`` has its ``n``-year
        count divided by that ratio and is labelled True when the rounded-up
        result is greater than 1.

        Returns a dict mapping pid to bool.
        """
        gold = 3  # citation window, in years, that the labels refer to
        ratios = []
        for pid in train_normal:
            cits_gold = self.cnw.citations_first_n_years(pid, gold)
            cits_n = self.cnw.citations_first_n_years(pid, n)
            if cits_gold > 0:
                ratios.append(cits_n / float(cits_gold))

        mult = sum(ratios) / len(ratios) if ratios else 0.0
        if mult <= 0:
            # No usable scaling information (no cited reference paper, or none
            # cited within the first n years): fall back to the raw n-year
            # counts instead of dividing by zero.
            mult = 1.0

        ret_labels = {}
        for pid in train_ext:
            # NOTE(review): the threshold is "> 1", not "> 0" -- confirm it is intended.
            ret_labels[pid] = math.ceil(self.cnw.citations_first_n_years(pid, n) / mult) > 1
        return ret_labels

    def load_features(self):
        """Run every feature loader, printing a progress message before each.

        NOTE(review): this copy of the method contained only the progress
        messages; the loader paired with each message was chosen by name.
        Confirm the pairing and order against the original source.
        """
        print("Loading base features ...")
        self.load_base_features()
        print("Loading ft ngram features ...")
        self.load_ft_features()
        print("Loading cited sentiment/purpose features ...")
        self.load_amjad_features()
        print("Loading liwc features ...")
        self.load_liwc_features()
        print("Loading cited network features ...")
        self.load_network_features()
        print("Loading lexical network features ...")
        self.load_lexical_features()

    def _parse_cache_line(self, line):
        """Split a ``pid;name:value;name:value`` line into ``(pid, {name: float})``."""
        items = line.strip().split(";")
        values = {}
        for item in items[1:]:
            if item == "":
                continue  # e.g. a trailing ';'
            (feat, value) = item.split(":")
            values[feat] = float(value)
        return items[0], values

    def load_network_features(self):
        """Add the cached cited-network features to the position family."""
        path = "/data0/projects/fuse/entity_prediction/features_cache/citednw_features_cache.txt"
        with open(path, "r") as nw_file:
            for line in nw_file:
                pid, values = self._parse_cache_line(line)
                self.position_features.setdefault(pid, {}).update(values)

    def load_liwc_features(self):
        """Add the cached word-class (LIWC) features to the style family."""
        path = "/data0/projects/fuse/entity_prediction/features_cache/wordclass_features_cache.txt"
        with open(path, "r") as liwc_file:
            for line in liwc_file:
                pid, values = self._parse_cache_line(line)
                self.style_features.setdefault(pid, {}).update(values)

    def load_base_features(self):
        """Add author, institution and venue indicators (prestige) and term indicators (content)."""
        # get author, venue, institution and term features
        for p in self.aan.metadata:
            pid = p.pid

            prestige = self.prestige_features.setdefault(pid, {})
            for author in p.authors:
                prestige["author_" + str(author.authorid)] = 1
            for inst in p.institutions:
                # add inst so that features won't be mixed up
                prestige["inst_" + str(inst.instid)] = 1

            # The venue feature is named after the first character of the paper id.
            prestige["Venue_" + p.pid[0]] = 1

            content = self.content_features.setdefault(pid, {})
            for term in p.terms:
                content[term.termid] = 1

    def load_ft_features(self):
        """Add the pruned n-gram features to the content family.

        Each input line is ``pid<TAB>feat<>feat<>...``.
        """
        # features from Yogatama's paper
        with open("features_cache/1980_2006.pruned_feats.ft", "r") as ngrams_file:
            for line in ngrams_file:
                (pid, featstr) = line.strip().split("\t")
                content = self.content_features.setdefault(pid, {})
                for f in featstr.split("<>"):
                    # we have better venue and author features now, so ignore these
                    if f.startswith(("Venue_", "author_")):
                        continue
                    content[f] = 1

    def load_amjad_features(self):
        """Add selected cached features, divided by the outgoing citation count, to the position family."""
        # features from Amjad's code
        # NOTE(review): the feature names of this list are missing from this
        # copy of the file, so no value is stored until they are restored.
        position_cit_features = []

        with open("features_cache/amjad_features_cache.txt", "r") as amjad_file:
            for line in amjad_file:
                pid, feat_hash = self._parse_cache_line(line)
                position = self.position_features.setdefault(pid, {})

                total_out = feat_hash.get("outgoing_citation_cnt", 0)
                for name in position_cit_features:
                    if name in feat_hash:
                        position[name] = feat_hash[name] / total_out if total_out > 0 else 0

    def load_lexical_features(self):
        """Read the four tab-separated lexical feature files into the position family."""
        # features from lexical code
        # NOTE(review): the first three name lists are empty in this copy of
        # the file, so their files only create empty per-paper entries; the
        # column names need to be restored before these features are usable.
        lexrank_features = []  # from lexrank file
        sim_features = []  # from sim file
        title_density_features = []  # from title term
        abs_density_features = ["min_abs_term_weight", "max_abs_term_weight", "avg_abs_term_weight", "abs_term_density"]

        lex_dir = "/data0/projects/fuse/citation_prediction/lexical_features/"
        sources = [
            (lexrank_features, "lexrank_features_allyears.txt"),
            (sim_features, "similarity_features_allyears.txt"),
            (title_density_features, "title_term_densification_allyears.txt"),
            (abs_density_features, "abstract_term_densification_allyears.txt"),
        ]
        for feat_list, fname in sources:
            with open(lex_dir + fname, "r") as in_file:
                self.read_lex_features(feat_list, in_file)

    def read_lex_features(self, feat_list, in_file):
        """Store the columns of a ``pid<TAB>v1<TAB>v2...`` file under the names in ``feat_list``.

        ``in_file`` is any iterable of lines.  The i-th name in ``feat_list``
        receives the i-th value column; a missing or non-numeric column is
        stored as 0.
        """
        for line in in_file:
            items = line.strip().split("\t")
            position = self.position_features.setdefault(items[0], {})
            feats = items[1:]
            for idx, feat in enumerate(feat_list):
                try:
                    position[feat] = float(feats[idx])
                except (ValueError, IndexError):
                    position[feat] = 0
コード例 #4
class create_data:
    """Write feature and response files for a citation-prediction experiment."""

    def __init__(self):
        self.cnw_feat = CitationNetworkFeatures()

    def main(self):
        """Build the train / model-test / test experiment files for one cut-off year.

        Command line:
            sys.argv[1] -- cut-off year ``n``; papers with ``year <= n`` are
                           split at random, 80% training and the rest model-test.
            sys.argv[2] -- optional gap ``diff`` (default 3); papers published
                           in year ``n + diff`` form the test set.

        All output goes under ``experiment_files/``.
        """
        n = int(sys.argv[1])
        # Command-line values are strings; without int() the `n + diff`
        # arithmetic below raises TypeError whenever the gap is supplied.
        diff = int(sys.argv[2]) if len(sys.argv) > 2 else 3

        init = 1980
        last = 2006

        aan = aanmeta()
        all_papers = aan.get_restricted_papers(init, last)

        till_n = [p for p in all_papers if p.year <= n]

        training = random.sample(till_n, int(math.ceil(0.8 * len(till_n))))
        model_test = [i for i in till_n if i not in training]
        test = [i for i in all_papers if i.year == (n + diff)]

        print("Total files: %d" % (len(training) + len(model_test) + len(test)))

        # pid -> list of feature names, read from "pid<TAB>feat<>feat<>..." lines.
        self.feats = {}
        with open("1980_2006.pruned_feats", "r") as featfile:
            for line in featfile:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines instead of failing to unpack
                pid, featstr = line.split("\t")
                self.feats[pid] = featstr.split("<>")

        training_fname = "experiment_files/1980_%s.train.txt" % n
        model_test_fname = "experiment_files/1980_%s.modeltest.txt" % n
        test_fname = "experiment_files/%s.test.txt" % (n + diff)

        self.write_data(training, training_fname)
        self.write_data(model_test, model_test_fname)
        self.write_data(test, test_fname)

        # creating the response files
        self.cnw = CitationNetwork()
        response_targets = [
            (training, "experiment_files/1980_%s.train.resp.txt" % n),
            (model_test, "experiment_files/1980_%s.modeltest.resp.txt" % n),
            (test, "experiment_files/%s.test.resp.txt" % (n + diff)),
        ]
        for papers, resp_fname in response_targets:
            # `with` guarantees the data is flushed and the handle released.
            with open(resp_fname, "w") as resp_file:
                self.write_response(papers, resp_file, n)

        # write the time step files
        with open("experiment_files/%s_%s_timesteps.txt" % (init, n), "w") as ts_file:
            for paper in all_papers:
                if init <= paper.year <= n:
                    ts_file.write("%s\t%d\n" % (paper.pid, get_year_from_id(paper.pid)))

    def get_lex_features(self, pid):
        """Run the external Perl extractor for ``pid`` and return its features.

        Returns a dict ``{"lf_<index>": float}`` built from the script's
        tab-separated standard output.
        """
        # The second script argument is fixed to "0"; an earlier version
        # passed the paper's year (get_year_from_id(pid)) instead.
        features_str = Popen(
            ["perl", "../citation_prediction/get_lexical_features.pl", pid, "0"], stdout=PIPE
        ).communicate()[0]
        return self._parse_lex_output(features_str)

    def _parse_lex_output(self, features_str):
        """Turn ``first<TAB>v0<TAB>v1...`` into ``{"lf_0": v0, "lf_1": v1, ...}``.

        The first field is skipped; a field that is not a number becomes 0.0.
        """
        features = {}
        for idx, feat in enumerate(features_str.split("\t")[1:]):
            try:
                features["lf_" + str(idx)] = float(feat)
            except ValueError:
                features["lf_" + str(idx)] = 0.0
        return features

    def write_data(self, pobjs, data_fname):
        """Write ``pid<TAB>json`` feature lines for every paper in ``pobjs``.

        Two files are produced: ``data_fname`` with the indicator features
        from ``self.feats`` and ``data_fname + ".lex"`` with those features
        plus the lexical ones.  The network (".nw") and network+lexical
        (".nwlex") variants are currently disabled.
        """
        pids = [p.pid for p in pobjs]
        with open(data_fname, "w") as dataout, open(data_fname + ".lex", "w") as dataout_lex:
            for pid in pids:
                print(pid)
                out = {feat: 1 for feat in self.feats[pid]}
                out_lex = dict(out)
                out_lex.update(self.get_lex_features(pid))
                try:
                    jsonout = json.dumps(out)
                    jsonout_lex = json.dumps(out_lex)
                except UnicodeDecodeError:
                    print("error with " + str(out))
                    # Skip this paper: writing here would either fail on an
                    # undefined name or repeat the previous paper's JSON.
                    continue

                dataout.write(pid + "\t" + jsonout + "\n")
                dataout_lex.write(pid + "\t" + jsonout_lex + "\n")

    def write_response(self, pobjs, outfile, n=0):
        """Write one ``pid<TAB>label`` line per paper to the open file ``outfile``.

        The label is ``self.cnw.if_cited_first_n_years(pid, 3)``.  ``n`` is
        accepted for the callers' sake but is not used; the window is fixed
        at 3 years.
        """
        for paper in pobjs:
            pid = paper.pid
            outfile.write("%s\t%s\n" % (pid, self.cnw.if_cited_first_n_years(pid, 3)))